Lexer
Source
Marc-André Cournoyer, Create Your Own Programming Language
Chapter 4, “Lexer”
Notes
The book uses Ruby. This is a fairly direct translation into Python.
My version
import pytest
import re
import textwrap


class Lexer:
    KEYWORDS = ['def', 'class', 'if', 'true', 'false', 'nil']

    def tokenize(self, code):
        code = code.rstrip('\n')  # remove trailing newlines
        tokens = []
        current_indent = 0  # number of spaces in the current indentation
        indent_stack = []
        i = 0  # current character position
        while i < len(code):
            if code[i] == ' ':  # skip spaces between tokens
                i += 1
                continue
            chunk = code[i:]

            m = re.match(r'[a-z]\w*', chunk)  # identifier or keyword
            if m is not None:
                identifier = m.group()
                if identifier in self.KEYWORDS:
                    tokens.append((identifier.upper(), identifier))
                else:
                    tokens.append(('IDENTIFIER', identifier))
                i += len(identifier)
                continue

            m = re.match(r'[A-Z]\w*', chunk)  # constant
            if m is not None:
                constant = m.group()
                tokens.append(('CONSTANT', constant))
                i += len(constant)
                continue

            m = re.match(r'[0-9]+', chunk)  # number (int)
            if m is not None:
                number = m.group()
                tokens.append(('NUMBER', number))
                i += len(number)
                continue

            m = re.match(r'"([^"]*)"', chunk)  # string, quotes included
            if m is not None:
                string = m.group()
                tokens.append(('STRING', string))
                i += len(string)
                continue

            # Indentation. There are three cases:
            # - Case 1: creating a block with ':' and new indentation
            # - Case 2: newline inside an indented block, keeping the
            #   current indentation
            # - Case 3: ending the current block and dedenting
            m = re.match(r':\n( +)', chunk)  # case 1
            if m is not None:
                indent = m.group(1)  # we're interested in the spaces only
                ilen = len(indent)
                if ilen <= current_indent:
                    raise IndentationError(
                        f'Bad indent level: {ilen} spaces '
                        f'(expected > {current_indent})')
                current_indent = ilen
                indent_stack.append(current_indent)
                tokens.append(('INDENT', ilen))
                i += ilen + 2  # skip the ':', the '\n' and the indentation
                continue

            m = re.match(r'\n( *)', chunk)  # cases 2 and 3
            if m is not None:
                indent = m.group(1)
                ilen = len(indent)
                if ilen == current_indent:  # case 2
                    tokens.append(('NEWLINE', '\n'))
                elif ilen < current_indent:  # case 3, one DEDENT per level
                    while ilen < current_indent:
                        indent_stack.pop()
                        current_indent = (indent_stack[-1]
                                          if indent_stack else 0)
                        tokens.append(('DEDENT', ilen))
                    tokens.append(('NEWLINE', '\n'))
                else:  # deeper indentation without a ':' block opener
                    raise SyntaxError("Missing ':'")
                i += ilen + 1
                continue

            # Long operators: ||, &&, ==, !=, <=, >=.
            m = re.match(r'\|\||&&|==|!=|<=|>=', chunk)
            if m is not None:
                longop = m.group()
                tokens.append(('operator', longop))
                i += len(longop)
                continue

            # Any other single character, mostly operators; the token
            # type is the character itself.
            value = chunk[0]
            tokens.append((value, value))
            i += 1

        # Close any blocks still open at the end of the input.
        while indent_stack:
            indent_stack.pop()
            tokens.append(('DEDENT',
                           indent_stack[-1] if indent_stack else 0))
        return tokens
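
A quick sanity check, exercising all three indentation cases (the snippet and its input are mine, not from the book):

lexer = Lexer()
for token in lexer.tokenize('if a:\n  b\n  c\nd'):
    print(token)
# ('IF', 'if')
# ('IDENTIFIER', 'a')
# ('INDENT', 2)       case 1: ':' plus deeper indentation opens a block
# ('IDENTIFIER', 'b')
# ('NEWLINE', '\n')   case 2: same indentation, stay inside the block
# ('IDENTIFIER', 'c')
# ('DEDENT', 0)       case 3: shallower indentation closes the block
# ('NEWLINE', '\n')
# ('IDENTIFIER', 'd')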
Tests
mylexer = Lexer()


def test_instance():
    assert mylexer.KEYWORDS[0] == 'def'


def test_tokenize_identifier():
    tokens = mylexer.tokenize('if foo then bar is true')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
                      ('IDENTIFIER', 'then'), ('IDENTIFIER', 'bar'),
                      ('IDENTIFIER', 'is'), ('TRUE', 'true')]


def test_tokenize_identifier_constant():
    tokens = mylexer.tokenize('if foo then BAR is true')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
                      ('IDENTIFIER', 'then'), ('CONSTANT', 'BAR'),
                      ('IDENTIFIER', 'is'), ('TRUE', 'true')]


def test_tokenize_identifier_number():
    tokens = mylexer.tokenize('if foo then BAR is 99')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
                      ('IDENTIFIER', 'then'), ('CONSTANT', 'BAR'),
                      ('IDENTIFIER', 'is'), ('NUMBER', '99')]


def test_tokenize_identifier_string():
    tokens = mylexer.tokenize('if foo then BAR is "baz"')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
                      ('IDENTIFIER', 'then'), ('CONSTANT', 'BAR'),
                      ('IDENTIFIER', 'is'), ('STRING', '"baz"')]


def test_tokenize_identifier_indent_spaces():
    tokens = mylexer.tokenize('if foo:\n ')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'), ('INDENT', 1),
                      ('DEDENT', 0)]  # close the open block
    tokens = mylexer.tokenize('if foo:\n  ')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'), ('INDENT', 2),
                      ('DEDENT', 0)]  # close the open block
    with pytest.raises(IndentationError) as e:
        mylexer.tokenize('if foo:\n  if bar:\n  baz')
    assert str(e.value) == 'Bad indent level: 2 spaces (expected > 2)'


def test_tokenize_identifier_indent_block():
    tokens = mylexer.tokenize('if foo:\n  bar\n  baz\nqux')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
                      ('INDENT', 2), ('IDENTIFIER', 'bar'), ('NEWLINE', '\n'),
                      ('IDENTIFIER', 'baz'), ('DEDENT', 0), ('NEWLINE', '\n'),
                      ('IDENTIFIER', 'qux')]
    with pytest.raises(SyntaxError) as e:
        mylexer.tokenize('if foo\n  bar\n  baz\nqux')
    assert str(e.value) == "Missing ':'"


def test_tokenize_identifier_long_operator():
    tokens = mylexer.tokenize('if foo == bar || baz == qux')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
                      ('operator', '=='), ('IDENTIFIER', 'bar'),
                      ('operator', '||'), ('IDENTIFIER', 'baz'),
                      ('operator', '=='), ('IDENTIFIER', 'qux')]


def test_tokenize_identifier_long_operator_value():
    tokens = mylexer.tokenize('if foo + bar - baz == qux')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
                      ('+', '+'), ('IDENTIFIER', 'bar'), ('-', '-'),
                      ('IDENTIFIER', 'baz'), ('operator', '=='),
                      ('IDENTIFIER', 'qux')]


def test_tokenize_identifier_close_blocks():
    tokens = mylexer.tokenize('if foo:\n  bar\n  baz')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
                      ('INDENT', 2), ('IDENTIFIER', 'bar'), ('NEWLINE', '\n'),
                      ('IDENTIFIER', 'baz'), ('DEDENT', 0)]
    with pytest.raises(SyntaxError) as e:
        mylexer.tokenize('if foo\n  bar\n  baz')
    assert str(e.value) == "Missing ':'"


def test_short_program():
    program = textwrap.dedent("""\
        if 1:
          if 2:
            print("...")
            if false:
              pass
            print("done!")
          2

        print "The End\"""")
    tokens = mylexer.tokenize(program)
    assert tokens == [
        ('IF', 'if'), ('NUMBER', '1'),
        ('INDENT', 2),
        ('IF', 'if'), ('NUMBER', '2'),
        ('INDENT', 4),
        ('IDENTIFIER', 'print'),
        ('(', '('),
        ('STRING', '"..."'),
        (')', ')'), ('NEWLINE', '\n'),
        ('IF', 'if'), ('FALSE', 'false'),
        ('INDENT', 6),
        ('IDENTIFIER', 'pass'),
        ('DEDENT', 4), ('NEWLINE', '\n'),
        ('IDENTIFIER', 'print'),
        ('(', '('),
        ('STRING', '"done!"'),
        (')', ')'),
        ('DEDENT', 2), ('NEWLINE', '\n'),
        ('NUMBER', '2'),
        ('DEDENT', 0), ('NEWLINE', '\n'),
        ('NEWLINE', '\n'),  # the blank line before the last statement
        ('IDENTIFIER', 'print'), ('STRING', '"The End"')
    ]
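
To run the suite, put both blocks in a single file (the filename below is my own choice) and point pytest at it:

pytest lexer_notes.py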