"""Lexer for arithmetic expressions."""
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Union
|
|
|
|
|
|
class LexError(Exception):
    """Error raised when the lexer meets a character it cannot tokenize."""
|
|
|
|
|
|
@dataclass
class Token:
    """A single lexical token: a kind tag plus an optional payload."""

    # One of: NUMBER PLUS MINUS STAR SLASH LPAREN RPAREN EOF
    kind: str
    # Payload for the token; may be a number, a string, or None.
    value: Union[int, float, str, None]
|
|
|
|
|
|
# Token kind for each one-character operator or parenthesis.
_SINGLE = {
    '+': 'PLUS', '-': 'MINUS',      # additive operators
    '*': 'STAR', '/': 'SLASH',      # multiplicative operators
    '(': 'LPAREN', ')': 'RPAREN',   # grouping
}
|
|
|
|
|
|
def tokenize(src: str) -> list:
    """Return a list of Token for *src*, ending with EOF.

    Scanning rules, applied left to right:

    * Space, tab, carriage return and newline are skipped.
    * A run of decimal digits containing at most one '.' becomes a NUMBER
      token.  Its value is a float when the run contains a dot and an int
      otherwise.  A leading or trailing dot is accepted ('.5', '5.').  A
      second dot ends the current number and starts a new one, so '1.2.3'
      yields NUMBER 1.2 followed by NUMBER 0.3.
    * Each key of _SINGLE becomes a token of the mapped kind whose value is
      the character itself.

    The returned list always ends with Token('EOF', None).

    Raises:
        LexError: on a lone '.' or on any character that is not whitespace,
            part of a number, or a key of _SINGLE.  Digit-like characters
            that int()/float() cannot parse (for example '²') are reported
            this way as well.
    """
    tokens = []
    i = 0
    n = len(src)

    while i < n:
        ch = src[i]

        # Skip whitespace
        if ch in ' \t\r\n':
            i += 1
            continue

        # Number: integer or float (leading dot allowed, trailing dot allowed).
        # isdecimal() is used instead of isdigit() because isdigit() is also
        # true for characters such as '²', which int()/float() reject with a
        # ValueError; those must surface as LexError below instead.
        if ch.isdecimal() or ch == '.':
            j = i
            has_dot = False
            while j < n:
                c = src[j]
                if c == '.':
                    if has_dot:
                        break  # a second dot belongs to the next token
                    has_dot = True
                elif not c.isdecimal():
                    break
                j += 1
            raw = src[i:j]
            if raw == '.':
                # A dot with no digits on either side is not a number.
                raise LexError(f"Unexpected character '.' at position {i}")
            value = float(raw) if has_dot else int(raw)
            tokens.append(Token('NUMBER', value))
            i = j
            continue

        # Single-character operators and parentheses
        if ch in _SINGLE:
            tokens.append(Token(_SINGLE[ch], ch))
            i += 1
            continue

        # Anything else is an error
        raise LexError(f"Unexpected character {ch!r} at position {i}")

    tokens.append(Token('EOF', None))
    return tokens
|