Source: Python Morsels

Description

Take a file object representing a file containing diary entries in this format:

1
2
3
4
5
6
7
8
9
2018-01-01

Coded.

Did laundry.

2018-01-02

Slept all day.

Return entries as follows:

1
2
3
>>> diary_file = open('my_diary.txt')
>>> entries_by_date(diary_file)
[("2018-01-01", "Coded.\n\nDid laundry."), ("2018-01-02", "Slept all day.")]

Bonus 1: Unescape the HTML entities `&nbsp;`, `&quot;`, and `&amp;`.

Bonus 2: Take a command-line argument representing a diary filename and create a new file for each diary entry.

Notes

None.

My solution

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from re import fullmatch, sub


def clean(line):
    '''Unescape the HTML entities `&nbsp;`, `&quot;`, and `&amp;`.

    Args:
        line: The text to unescape.

    Returns:
        `line` with every `&nbsp;` replaced by a plain space, every
        `&quot;` by a double quote, and every `&amp;` by an ampersand.
    '''
    # `&amp;` must stay last: unescaping it first would turn text such as
    # `&amp;quot;` into `&quot;` and then, wrongly, into a double quote.
    entities = {
        "&nbsp;": ' ',
        "&quot;": '"',
        "&amp;": '&',
    }
    for entity, char in entities.items():
        # The entities are literal strings, not patterns, so a plain
        # substring replacement is all that is needed.
        line = line.replace(entity, char)
    return line


def get_entry_date(line):
    '''Return `line` as a date string, or `''` if it is not a date.

    Trailing whitespace is ignored; what is left must consist entirely of
    four digits, a dash, two digits, a dash, and two digits.
    '''
    date_pattern = '[0-9]{4}-[0-9]{2}-[0-9]{2}'
    try:
        # `fullmatch` returns None when the line is not a date, in which
        # case `.group()` raises the AttributeError handled below.
        return fullmatch(date_pattern, line.rstrip()).group()
    except AttributeError:
        return ''


def entries_by_date_str(lines):
    '''Parse diary `lines` and return a list of `(date, body)` tuples.

    A line that consists only of a date (as recognised by
    `get_entry_date()`) starts a new entry; every following line up to
    the next date line belongs to that entry's body. Each line is passed
    through `clean()` before it is examined or stored.

    Args:
        lines: An iterable of strings, one per line of the diary.

    Returns:
        A list of tuples of two strings: the entry date and the entry
        body. The body has no whitespace or newlines at its beginning or
        end, but preserves whitespace and newlines otherwise.

        If there are no lines, or the first line is not a date, returns
        an empty list.
    '''
    lines = iter(lines)
    entry_date = get_entry_date(clean(next(lines, '')))
    if not entry_date:
        return []

    entries = []
    body = []
    for raw_line in lines:
        line = clean(raw_line)
        next_date = get_entry_date(line)
        if next_date:
            # A new date closes the entry collected so far.
            entries.append((entry_date, ''.join(body).strip()))
            entry_date = next_date
            body = []
        else:
            body.append(line)
    # Stripping the joined body (rather than single lines) removes every
    # blank line between the date and the text, however many there are.
    entries.append((entry_date, ''.join(body).strip()))
    return entries


def entries_by_date(file_obj):
    '''Read every line of `file_obj` and parse them with `entries_by_date_str()`.'''
    diary_lines = list(file_obj)
    return entries_by_date_str(diary_lines)


def main(filename):
    '''Split the diary stored in `filename` into one file per entry.

    Each entry returned by `entries_by_date()` is written to a file named
    `<date>.txt`, opened in write mode, whose content is the entry body.

    Args:
        filename: Path of the diary file to read.
    '''
    # Parse everything first so the diary is closed before writing begins.
    with open(filename) as diary:
        entries = entries_by_date(diary)
    for date, body in entries:
        # The `with` block closes the file; no explicit close() is needed.
        with open(f'{date}.txt', 'w') as entry_file:
            entry_file.write(body)



## Interaction

if __name__ == '__main__':
    # These imports are only needed when the file is run as a script.
    import subprocess
    import sys

    import pytest

    print("My tests:")
    pytest.main([__file__])

    print("Pymorsel tests:")
    # Use the interpreter running this script rather than whatever `python`
    # resolves to on PATH, and pass the arguments as a list (no shell).
    subprocess.run([sys.executable, 'test_reformat_diary.py'])



## Tests

def test_clean():
    '''`clean()` unescapes `&quot;`, `&nbsp;`, and `&amp;` in one string.'''
    # The input must contain the escaped entities; otherwise the test only
    # checks that plain text is returned unchanged.
    s = 'I said &quot;rabbit,&nbsp;rabbit&quot; today &amp; burped.'
    assert clean(s) == 'I said "rabbit, rabbit" today & burped.'

def test_get_entry_date_empty():
    '''A line that is not a date gives the empty string.'''
    found = get_entry_date('foo bar')
    assert found == ''

def test_get_entry_date_dwim():
    '''A bare date line is returned unchanged.'''
    date = '2018-12-09'
    assert get_entry_date(date) == date

def test_get_entry_date_whtspc():
    '''Trailing whitespace after the date is ignored.'''
    for padded in ('2018-12-09 ', '2018-12-09\n'):
        assert get_entry_date(padded) == '2018-12-09'

def test_entries_by_date_str_0():
    '''No lines in, no entries out.'''
    assert entries_by_date_str([]) == []

def test_entries_by_date_str_1():
    '''A single entry whose body spans two paragraphs.'''
    expected = [('2018-12-09', 'Foo.\n\nBar.')]
    txt = ['2018-12-09\n', '\n', 'Foo.\n', '\n', 'Bar.\n']
    assert entries_by_date_str(txt) == expected

def test_entries_by_date_str_2():
    '''Two consecutive entries are split at the second date line.'''
    expected = [
        ('2018-12-09', 'Foo.\n\nBar.'),
        ('2018-12-10', 'Baz.\n\nQux.'),
    ]
    txt = [
        '2018-12-09\n', '\n', 'Foo.\n', '\n', 'Bar.\n', '\n',
        '2018-12-10\n', '\n', 'Baz.\n', '\n', 'Qux.\n',
    ]
    assert entries_by_date_str(txt) == expected
def test_entries_by_date_str_3():
    '''Three consecutive entries are each returned with their own body.'''
    expected = [
        ('2018-12-09', 'Foo.\n\nBar.'),
        ('2018-12-10', 'Baz.\n\nQux.'),
        ('2018-12-11', 'Corge.\n\nGrault.'),
    ]
    txt = [
        '2018-12-09\n', '\n', 'Foo.\n', '\n', 'Bar.\n', '\n',
        '2018-12-10\n', '\n', 'Baz.\n', '\n', 'Qux.\n', '\n',
        '2018-12-11\n', '\n', 'Corge.\n', '\n', 'Grault.\n',
    ]
    assert entries_by_date_str(txt) == expected

def test_entries_by_date():
    '''Parse the sample diary `my_diary.txt` opened from the working directory.'''
    # The `with` block closes the file; no explicit close() is needed.
    with open('my_diary.txt') as f:
        assert entries_by_date(f) == [
            ("2018-01-01", "Coded.\n\nDid laundry."),
            ("2018-01-02", "Slept all day."),
        ]

def test_main():
    '''`main()` writes one `<date>.txt` file per entry of `my_diary.txt`.'''
    main('my_diary.txt')
    # Each `with` block closes its file; no explicit close() is needed.
    with open('2018-01-01.txt') as f1:
        assert list(f1) == ['Coded.\n', '\n', 'Did laundry.']
    with open('2018-01-02.txt') as f2:
        assert list(f2) == ['Slept all day.']

Provided solution

In the provided solution, entries_by_date() is a generator function — which makes sense, though I’m not sure why it might be better.

The provided solution suggests various ways to strip out HTML entities, including this one, which appeals to me but feels hard to read. It passes a lambda function as argument to re.sub().

1
2
3
4
5
6
7
8
9
10
# Maps each supported HTML entity to the character it stands for.
REPLACEMENTS = {
    "&nbsp;": " ",
    "&quot;": '"',
    "&amp;": '&',
}

# One alternation over all entities, so text is unescaped in a single pass
# and an already-unescaped `&` is never unescaped a second time. The keys
# are escaped so they are always matched literally.
REPLACEMENTS_RE = re.compile('|'.join(map(re.escape, REPLACEMENTS)))

def clean_entry(text):
    '''Replace every `REPLACEMENTS_RE` match in `text` and strip the result.

    Each match is swapped for the value stored under the matched text in
    `REPLACEMENTS`; leading and trailing whitespace is then removed.
    '''
    def unescape(match):
        return REPLACEMENTS[match.group()]

    unescaped = REPLACEMENTS_RE.sub(unescape, text)
    return unescaped.strip()