forked from obsgolem/LOGOSTools
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathorig_md_to_json.py
More file actions
39 lines (35 loc) · 1.41 KB
/
orig_md_to_json.py
File metadata and controls
39 lines (35 loc) · 1.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from pathlib import Path
import re
import unicodedata
import json
"""
Converts the original markdown files into json files so we can add metadata to them
"""
# File stem pattern: "Ch <number> <chapter name>".
chap_extractor = re.compile(r"Ch (\d*) (.*)")
# First line of a chapter file: "Pg <page>" or "Pg. <page>".
page_extractor = re.compile(r"Pg\.? (.*)")
# Second line of a chapter file: "Ex. <first>-<last>".
# NOTE: the "." after "Ex" is an unescaped wildcard, so any single character
# is accepted in that position; kept as-is so existing files keep matching.
ex_extractor = re.compile(r"Ex. (.*)-(.*)")
# Section header line: "# <section name>".
header_extractor = re.compile(r"# (.*)")


def _match_or_fail(pattern, text, what, source):
    """Return ``pattern.match(text)``, raising ValueError that names *source* on failure."""
    found = pattern.match(text)
    if found is None:
        raise ValueError(
            f"{source}: expected {what} matching {pattern.pattern!r}, got {text!r}"
        )
    return found


def parse_chapter(name, lines, source="<input>"):
    """Parse the lines of one chapter file into a JSON-serialisable dict.

    Args:
        name: Chapter name; stored NFKD-normalised under "chapter_name".
        lines: Iterable of text lines. The first must match
            ``page_extractor`` and the second ``ex_extractor``; every later
            line is either a "# <section>" header or a word entry belonging
            to the most recent header.
        source: Label used in error messages (typically the file path).

    Returns:
        A dict with the keys "chapter_name", "word_list_page_number",
        "exercise_page_range" and "sections".

    Raises:
        ValueError: if the page or exercise line is missing or malformed, a
            page number is not an integer, or a word entry appears before
            the first section header.
    """
    remaining = iter(lines)
    try:
        page_line = next(remaining)
        ex_line = next(remaining)
    except StopIteration:
        raise ValueError(
            f"{source}: expected a page line followed by an exercise line"
        ) from None

    page_text = _match_or_fail(
        page_extractor, page_line, "a word-list page line", source
    ).group(1)
    ex_first, ex_last = _match_or_fail(
        ex_extractor, ex_line, "an exercise range line", source
    ).group(1, 2)
    try:
        word_list_page = int(page_text)
        exercise_range = [int(ex_first), int(ex_last)]
    except ValueError as err:
        raise ValueError(f"{source}: page numbers must be integers ({err})") from err

    sections = []
    for raw_line in remaining:
        line = unicodedata.normalize("NFKD", raw_line).strip()
        if not line:
            # A blank line is not a word; skipping it avoids empty entries.
            continue
        header = header_extractor.match(line)
        if header:
            sections.append({"section": header.group(1), "words": []})
        elif sections:
            sections[-1]["words"].append({"book_entry": line})
        else:
            raise ValueError(
                f"{source}: entry {line!r} appears before any '# ' section header"
            )

    return {
        "chapter_name": unicodedata.normalize("NFKD", name),
        "word_list_page_number": word_list_page,
        "exercise_page_range": exercise_range,
        "sections": sections,
    }


def convert_file(md_path, out_dir):
    """Convert one "Ch <number> <name>.md" file to "<out_dir>/Ch <number>.json".

    Args:
        md_path: ``Path`` of the markdown file; its stem must match
            ``chap_extractor``.
        out_dir: ``Path`` of the output directory; created if missing.

    Returns:
        The ``Path`` of the JSON file that was written.

    Raises:
        ValueError: if the file name or the file content is malformed.
    """
    chapter = _match_or_fail(
        chap_extractor, md_path.stem, "a chapter file name", md_path
    )
    number, name = chapter.group(1, 2)

    # Explicit encoding: do not depend on the platform default. "utf-8-sig"
    # reads plain UTF-8 and also drops a leading BOM, which would otherwise
    # stop the first line from matching.
    with md_path.open(encoding="utf-8-sig") as f:
        final_object = parse_chapter(name, f, source=md_path)

    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"Ch {number}.json"
    # ensure_ascii=False emits raw non-ASCII characters, so the output file
    # must be opened as UTF-8 explicitly.
    with out_path.open("w", encoding="utf-8") as out:
        json.dump(final_object, out, indent=2, ensure_ascii=False)
    return out_path


def convert_all(root=None):
    """Convert every "*.md" file in "<root>/original_markdown" into "<root>/json".

    Args:
        root: Base directory; defaults to the current working directory.

    Returns:
        List of the JSON paths written, in sorted order of the input paths.
    """
    base = Path.cwd() if root is None else Path(root)
    sources = sorted((base / "original_markdown").glob("*.md"))
    return [convert_file(md_path, base / "json") for md_path in sources]


# Script body: convert everything found under the current working directory.
convert_all()