Source

Marc-André Cournoyer, Create Your Own Programming Language

Chapter 4, “Lexer”

Notes

The book uses Ruby. This is a fairly direct translation into Python.

My version

import pytest
import re
import textwrap


class Lexer:
    KEYWORDS = ['def', 'class', 'if', 'true', 'false', 'nil']

    def tokenize(self, code):
        code = code.rstrip('\n')   # Remove trailing newlines (str.rstrip returns a new string)
        tokens = []
        current_indent = 0   # Number of spaces in current indentation
        indent_stack = []
        i, codelen = 0, len(code)   # `i` is the current character position

        while i < codelen:
            try:   # advance over inter-token whitespace first,
                while code[i] == ' ':   # so `chunk` starts at a token
                    i += 1
            except IndexError:
                break

            chunk = code[i:]   # unscanned remainder of the source

            m = re.compile(r'[a-z]\w*').match(chunk)   # identifier
            if m is not None:
                identifier = m.group()
                if identifier in self.KEYWORDS:
                    tokens.append((identifier.upper(), identifier))
                else:
                    tokens.append(('IDENTIFIER', identifier))
                i += len(identifier)
                continue

            m = re.compile(r'[A-Z]\w*').match(chunk)   # constant
            if m is not None:
                constant = m.group()
                tokens.append(('CONSTANT', constant))
                i += len(constant)
                continue

            m = re.compile('[0-9]+').match(chunk)   # number (int)
            if m is not None:
                number = m.group()
                tokens.append(('NUMBER', number))
                i += len(number)
                continue

            m = re.compile('"([^"]*)"').match(chunk)   # string
            if m is not None:
                string = m.group()
                tokens.append(('STRING', string))
                i += len(string)
                continue

            # Indentation. There are three cases:
            # - Case 1: creating a block with `:` and new indentation
            # - Case 2: newline inside indented block, maintaining current indentation
            # - Case 3: ending the current block and dedenting
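            # For example, tokenizing 'if foo:\n  bar\n  baz\nqux':
            # the ':\n  ' after foo is case 1 (INDENT 2), the '\n  '
            # before baz is case 2 (NEWLINE), and the '\nqux' is
            # case 3 (DEDENT back to 0, then NEWLINE).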
            m = re.compile(r':\n( +)').match(chunk)   # case 1
            if m is not None:
                indent = m.group(1)   # we're interested in spaces only
                ilen = len(indent)
                if ilen <= current_indent:
                    raise IndentationError(
                        f'Bad indent level: {ilen} spaces '
                        f'(expected > {current_indent})')
                current_indent = ilen
                indent_stack.append(current_indent)
                tokens.append(('INDENT', ilen))
                i += ilen + 2
                continue

            m = re.compile('\n( *)').match(chunk)   # cases 2, 3
            if m is not None:
                indent = m.group(1)
                ilen = len(indent)
                if ilen == current_indent:   # case 2
                    tokens.append(('NEWLINE', '\n'))
                elif ilen < current_indent:   # case 3
                    while ilen < current_indent:   # pop levels until we match
                        indent_stack.pop()
                        current_indent = indent_stack[-1] if indent_stack else 0
                        tokens.append(('DEDENT', current_indent))
                    tokens.append(('NEWLINE', '\n'))
                else:
                    raise SyntaxError("Missing ':'")
                i += ilen + 1
                continue

            # Long operators: ||, &&, ==, etc.
            m = re.compile(r'\|\||&&|==|!=|<=|>=').match(chunk)
            if m is not None:
                longop = m.group()
                tokens.append(('operator', longop))
                i += len(longop)
                continue

            # All single characters. Mostly operators.
            m = re.compile(r'\S').match(chunk)
            if m is not None:
                value = m.group()
                tokens.append((value, value))
                i += 1
                continue

        # Close all blocks still open at end of input. Each DEDENT carries
        # the indent level it returns to (0 for the outermost level).
        while indent_stack:
            indent_stack.pop()
            dedent_value = indent_stack[-1] if indent_stack else 0
            tokens.append(('DEDENT', dedent_value))

        return tokens
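
A minimal usage sketch (my own snippet, not from the book; the tokens in
the comment are what the code above produces):

lexer = Lexer()
tokens = lexer.tokenize('if true:\n  print "hi"')
print(tokens)
# [('IF', 'if'), ('TRUE', 'true'), ('INDENT', 2),
#  ('IDENTIFIER', 'print'), ('STRING', '"hi"'), ('DEDENT', 0)]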

Tests

mylexer = Lexer()

def test_instance():
    assert mylexer.KEYWORDS[0] == 'def'


def test_tokenize_identifier():
    tokens = mylexer.tokenize('if foo then bar is true')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
        ('IDENTIFIER', 'then'), ('IDENTIFIER', 'bar'),
        ('IDENTIFIER', 'is'), ('TRUE', 'true')]


def test_tokenize_identifier_constant():
    tokens = mylexer.tokenize('if foo then BAR is true')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
        ('IDENTIFIER', 'then'), ('CONSTANT', 'BAR'),
        ('IDENTIFIER', 'is'), ('TRUE', 'true')]


def test_tokenize_identifier_number():
    tokens = mylexer.tokenize('if foo then BAR is 99')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
        ('IDENTIFIER', 'then'), ('CONSTANT', 'BAR'),
        ('IDENTIFIER', 'is'), ('NUMBER', '99')]


def test_tokenize_identifier_string():
    tokens = mylexer.tokenize('if foo then BAR is "baz"')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
        ('IDENTIFIER', 'then'), ('CONSTANT', 'BAR'),
        ('IDENTIFIER', 'is'), ('STRING', '"baz"')]


def test_tokenize_identifier_indent_spaces():
    tokens = mylexer.tokenize('if foo:\n ')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'), ('INDENT', 1),
                      ('DEDENT', 0)] # close open block
    tokens = mylexer.tokenize('if foo:\n  ')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'), ('INDENT', 2),
                      ('DEDENT', 0)] # close open block
    with pytest.raises(IndentationError) as e:
        mylexer.tokenize('if foo:\n  if bar:\n  baz')
    assert str(e.value) == 'Bad indent level: 2 spaces (expected > 2)'


def test_tokenize_identifier_indent_block():
    tokens = mylexer.tokenize('if foo:\n  bar\n  baz\nqux')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
        ('INDENT', 2), ('IDENTIFIER', 'bar'), ('NEWLINE', '\n'),
        ('IDENTIFIER', 'baz'), ('DEDENT', 0), ('NEWLINE', '\n'),
        ('IDENTIFIER', 'qux')]
    with pytest.raises(SyntaxError) as e:
        mylexer.tokenize('if foo\n  bar\n  baz\nqux')
    assert str(e.value) == "Missing ':'"


def test_tokenize_identifier_long_operator():
    tokens = mylexer.tokenize('if foo == bar || baz == qux')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
        ('operator', '=='), ('IDENTIFIER', 'bar'), ('operator', '||'),
        ('IDENTIFIER', 'baz'), ('operator', '=='), ('IDENTIFIER', 'qux')]


def test_tokenize_identifier_long_operator_value():
    tokens = mylexer.tokenize('if foo + bar - baz == qux')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
        ('+', '+'), ('IDENTIFIER', 'bar'), ('-', '-'),
        ('IDENTIFIER', 'baz'), ('operator', '=='), ('IDENTIFIER', 'qux')]


def test_tokenize_identifier_close_blocks():
    tokens = mylexer.tokenize('if foo:\n  bar\n  baz')
    assert tokens == [('IF', 'if'), ('IDENTIFIER', 'foo'),
        ('INDENT', 2), ('IDENTIFIER', 'bar'), ('NEWLINE', '\n'),
        ('IDENTIFIER', 'baz'), ('DEDENT', 0)]
    with pytest.raises(SyntaxError) as e:
        mylexer.tokenize('if foo\n  bar\n  baz')
    assert str(e.value) == "Missing ':'"


def test_short_program():
    program = textwrap.dedent("""\
        if 1:
          if 2:
            print(\"...\")
            if false:
              pass
            print(\"done!\")
          2

        print \"The End\"""")

    tokens = mylexer.tokenize(program)

    assert tokens == [
        ('IF', 'if'), ('NUMBER', '1'),
            ('INDENT', 2),
            ('IF', 'if'), ('NUMBER', '2'),
                ('INDENT', 4),
                ('IDENTIFIER', 'print'),
                    ('(', '('), 
                        ('STRING', '"..."'), 
                    (')', ')'), ('NEWLINE', '\n'),
                ('IF', 'if'), ('FALSE', 'false'),
                    ('INDENT', 6),
                    ('IDENTIFIER', 'pass'),
                ('DEDENT', 4), ('NEWLINE', '\n'),
                ('IDENTIFIER', 'print'),
                    ('(', '('), 
                        ('STRING', '"done!"'), 
                    (')', ')'),
            ('DEDENT', 2), ('NEWLINE', '\n'),
            ('NUMBER', '2'),
        ('DEDENT', 0), ('NEWLINE', '\n'),
        ('NEWLINE', '\n'),
        ('IDENTIFIER', 'print'), ('STRING', '"The End"')
    ]
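
To run the tests, save the lexer and the tests together in one file (I'll
call it lexer_test.py; any name matching pytest's default test-file
patterns works) and run:

pytest lexer_test.py -v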