Skip to content

Commit 1c6bde6

Browse files
authored
Port recent Machine updates (#264)
* Port unported updates since Machine df7d6e9c0bf1de8cba9462fba89208e6546db8fe * Additional changes for consistency with Machine * Address reviewer comments
1 parent f1dc4f1 commit 1c6bde6

20 files changed

Lines changed: 555 additions & 66 deletions

machine/corpora/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,12 @@
2828
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
2929
from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
3030
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
31-
from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector
31+
from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase
3232
from .paratext_text_corpus import ParatextTextCorpus
3333
from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
3434
from .scripture_element import ScriptureElement
3535
from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
36-
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
36+
from .scripture_ref_usfm_parser_handler_base import ScriptureRefUsfmParserHandlerBase, ScriptureTextType
3737
from .scripture_text_corpus import (
3838
ScriptureTextCorpus,
3939
create_versification_ref_corpus,
@@ -139,15 +139,15 @@
139139
"ParatextProjectSettingsParserBase",
140140
"ParatextProjectTermsParserBase",
141141
"ParatextProjectTextUpdaterBase",
142-
"ParatextProjectVersificationErrorDetector",
142+
"ParatextProjectVersificationErrorDetectorBase",
143143
"ParatextTextCorpus",
144144
"parse_usfm",
145145
"PlaceMarkersAlignmentInfo",
146146
"PlaceMarkersUsfmUpdateBlockHandler",
147147
"RtlReferenceOrder",
148148
"ScriptureElement",
149149
"ScriptureRef",
150-
"ScriptureRefUsfmParserHandler",
150+
"ScriptureRefUsfmParserHandlerBase",
151151
"ScriptureTextCorpus",
152152
"ScriptureTextType",
153153
"StandardParallelTextCorpus",

machine/corpora/file_paratext_project_file_handler.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from pathlib import Path
23
from typing import BinaryIO, Optional
34

@@ -11,17 +12,29 @@ def __init__(self, project_dir: StrPath) -> None:
1112
self._project_dir = Path(project_dir)
1213

1314
def exists(self, file_name: str) -> bool:
14-
return (self._project_dir / file_name).is_file()
15+
return self._get_file_name(file_name) is not None
1516

1617
def open(self, file_name: str) -> BinaryIO:
18+
actual_file_name = self._get_file_name(file_name)
19+
if actual_file_name is not None:
20+
file_name = actual_file_name
1721
return open(self._project_dir / file_name, "rb")
1822

1923
def find(self, extension: str) -> Optional[Path]:
2024
return next(self._project_dir.glob(f"*{extension}"), None)
2125

2226
def create_stylesheet(self, file_name: str) -> UsfmStylesheet:
23-
custom_stylesheet_filename = self._project_dir / "custom.sty"
27+
custom_stylesheet_file_name = self._get_file_name("custom.sty")
28+
if custom_stylesheet_file_name is None:
29+
custom_stylesheet_file_name = "custom.sty"
30+
custom_stylesheet_path = self._project_dir / custom_stylesheet_file_name
2431
return UsfmStylesheet(
2532
file_name,
26-
custom_stylesheet_filename if custom_stylesheet_filename.is_file() else None,
33+
custom_stylesheet_path if custom_stylesheet_path.is_file() else None,
2734
)
35+
36+
def _get_file_name(self, case_insensitive_file_name: str) -> Optional[str]:
    """Return the on-disk name of a project entry, matched case-insensitively.

    The entries of ``self._project_dir`` are compared to
    ``case_insensitive_file_name`` after lower-casing both sides, and the
    first match is returned with its actual casing.

    Returns:
        The matching directory entry name, or ``None`` when nothing matches
        or the project directory cannot be listed because it does not exist
        or is not a directory.
    """
    # Lower-case the requested name once rather than on every comparison.
    wanted = case_insensitive_file_name.lower()
    try:
        entries = os.listdir(self._project_dir)
    except (FileNotFoundError, NotADirectoryError):
        # A missing project directory contains no files; exists() relies on
        # getting None back here instead of an exception.
        return None
    for actual_file_name in entries:
        if actual_file_name.lower() == wanted:
            return actual_file_name
    return None

machine/corpora/file_paratext_project_versification_error_detector.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
from ..utils.typeshed import StrPath
22
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
33
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
4-
from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector
4+
from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase
55

66

7-
class FileParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetector):
7+
class FileParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetectorBase):
88
def __init__(self, project_dir: StrPath) -> None:
99
super().__init__(
1010
FileParatextProjectFileHandler(project_dir), FileParatextProjectSettingsParser(project_dir).parse()

machine/corpora/paratext_backup_terms_corpus.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,9 @@ def __init__(self, filename: StrPath, term_categories: Sequence[str], use_term_g
2929
text = MemoryText(
3030
text_id,
3131
[
32-
TextRow(text_id, key_term.id, key_term.renderings, content_type=TextRowContentType.WORD)
32+
TextRow(text_id, key_term.id, [rendering], content_type=TextRowContentType.WORD)
3333
for key_term in key_terms
34+
for rendering in key_term.renderings
3435
],
3536
)
3637
self._add_text(text)

machine/corpora/paratext_project_settings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,9 @@ def get_book_file_name(self, book_id: str) -> str:
5353
book_part = _get_book_file_name_digits(book_id) + book_id
5454
return self.file_name_prefix + book_part + self.file_name_suffix
5555

56-
def get_all_scripture_book_file_names(self) -> Iterable[str]:
56+
def get_all_scripture_book_ids(self) -> Iterable[str]:
5757
for book_id in get_scripture_books():
58-
yield self.get_book_file_name(book_id)
58+
yield book_id
5959

6060

6161
def _get_book_file_name_digits(book_id: str) -> str:

machine/corpora/paratext_project_versification_error_detector.py renamed to machine/corpora/paratext_project_versification_error_detector_base.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
1-
from typing import List, Optional, Union
1+
from typing import List, Optional, Set, Union
22

3+
from ..scripture.canon import book_id_to_number
34
from .paratext_project_file_handler import ParatextProjectFileHandler
45
from .paratext_project_settings import ParatextProjectSettings
56
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
67
from .usfm_parser import parse_usfm
78
from .usfm_versification_error_detector import UsfmVersificationError, UsfmVersificationErrorDetector
89

910

10-
class ParatextProjectVersificationErrorDetector:
11+
class ParatextProjectVersificationErrorDetectorBase:
1112
def __init__(
1213
self,
1314
paratext_project_file_handler: ParatextProjectFileHandler,
@@ -20,14 +21,19 @@ def __init__(
2021
self._settings = settings
2122

2223
def get_usfm_versification_errors(
23-
self,
24-
handler: Optional[UsfmVersificationErrorDetector] = None,
24+
self, handler: Optional[UsfmVersificationErrorDetector] = None, books: Optional[Set[int]] = None
2525
) -> List[UsfmVersificationError]:
2626
handler = handler or UsfmVersificationErrorDetector(self._settings)
27-
for file_name in self._settings.get_all_scripture_book_file_names():
27+
for book_id in self._settings.get_all_scripture_book_ids():
28+
29+
file_name = self._settings.get_book_file_name(book_id)
30+
2831
if not self._paratext_project_file_handler.exists(file_name):
2932
continue
3033

34+
if books is not None and not book_id_to_number(book_id) in books:
35+
continue
36+
3137
with self._paratext_project_file_handler.open(file_name) as sfm_file:
3238
usfm: str = sfm_file.read().decode(self._settings.encoding)
3339
try:

machine/corpora/scripture_ref_usfm_parser_handler.py renamed to machine/corpora/scripture_ref_usfm_parser_handler_base.py

Lines changed: 54 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,14 @@ class ScriptureTextType(Enum):
2222

2323

2424
def _is_embed_style(marker: Optional[str]) -> bool:
25-
return marker is not None and (marker.strip("*") in _EMBED_STYLES or marker.startswith("z"))
25+
return marker is not None and marker.strip("*") in _EMBED_STYLES
2626

2727

28-
class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC):
28+
def _is_private_use_marker(marker: Optional[str]) -> bool:
    """Return True when *marker* is a private-use marker, i.e. starts with "z".

    ``None`` is accepted and yields False, mirroring ``_is_embed_style``.
    """
    return marker is not None and marker.startswith("z")
30+
31+
32+
class ScriptureRefUsfmParserHandlerBase(UsfmParserHandler, ABC):
2933
def __init__(self) -> None:
3034
self._cur_verse_ref: VerseRef = VerseRef()
3135
self._cur_elements_stack: List[ScriptureElement] = []
@@ -46,22 +50,29 @@ def chapter(self, state: UsfmParserState, number: str, marker: str, alt_number:
4650
def verse(
4751
self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str]
4852
) -> None:
49-
if state.verse_ref == self._cur_verse_ref and not self._duplicate_verse:
50-
self._end_verse_text(state, self._create_verse_refs())
51-
# ignore duplicate verses
52-
self._duplicate_verse = True
53+
# Non-latin numbers are implicitly handled
54+
55+
if state.chapter_has_verse_zero and state.verse_ref.verse_num == 0:
56+
# Fall through for the special case of verse 0 being specified in the USFM
57+
pass
58+
elif state.verse_ref == self._cur_verse_ref and not self._duplicate_verse:
59+
if state.verse_ref.verse_num > 0:
60+
self._end_verse_text(state, self._create_verse_refs())
61+
# ignore duplicate verses
62+
self._duplicate_verse = True
63+
return
5364
elif are_overlapping_verse_ranges(verse1=number, verse2=self._cur_verse_ref.verse):
5465
# merge overlapping verse ranges into one range
5566
verse_ref: VerseRef = self._cur_verse_ref.copy()
5667
verse_ref.verse = merge_verse_ranges(number, self._cur_verse_ref.verse)
5768
self._update_verse_ref(verse_ref, marker)
69+
return
70+
if self._current_text_type == ScriptureTextType.NONVERSE:
71+
self._end_non_verse_text_wrapper(state)
5872
else:
59-
if self._current_text_type == ScriptureTextType.NONVERSE:
60-
self._end_non_verse_text_wrapper(state)
61-
elif self._current_text_type == ScriptureTextType.VERSE:
62-
self._end_verse_text_wrapper(state)
63-
self._update_verse_ref(state.verse_ref, marker)
64-
self._start_verse_text_wrapper(state)
73+
self._end_verse_text_wrapper(state)
74+
self._update_verse_ref(state.verse_ref, marker)
75+
self._start_verse_text_wrapper(state)
6576

6677
def start_para(
6778
self,
@@ -70,13 +81,21 @@ def start_para(
7081
unknown: Optional[bool],
7182
attributes: Optional[Sequence[UsfmAttribute]],
7283
) -> None:
84+
# ignore private-use markers
85+
if _is_private_use_marker(marker):
86+
return
87+
7388
if self._cur_verse_ref.is_default:
7489
self._update_verse_ref(state.verse_ref, marker)
7590
if not state.is_verse_text:
7691
self._start_parent_element(marker)
7792
self._start_non_verse_text_wrapper(state)
7893

7994
def end_para(self, state: UsfmParserState, marker: str) -> None:
95+
# ignore private-use markers
96+
if _is_private_use_marker(marker):
97+
return
98+
8099
if self._current_text_type == ScriptureTextType.NONVERSE:
81100
self._end_parent_element()
82101
self._end_non_verse_text_wrapper(state)
@@ -126,6 +145,10 @@ def opt_break(self, state: UsfmParserState) -> None:
126145
def start_char(
127146
self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]]
128147
) -> None:
148+
# ignore private-use markers
149+
if _is_private_use_marker(marker):
150+
return
151+
129152
# if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment
130153
self._check_convert_verse_para_to_non_verse(state)
131154

@@ -135,6 +158,10 @@ def start_char(
135158
def end_char(
136159
self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
137160
) -> None:
161+
# ignore private-use markers
162+
if _is_private_use_marker(marker):
163+
return
164+
138165
if _is_embed_style(marker):
139166
self._end_embed_text_wrapper(state)
140167

@@ -162,9 +189,9 @@ def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
162189
self._start_verse_text(state, self._create_verse_refs())
163190

164191
def _end_verse_text_wrapper(self, state: UsfmParserState) -> None:
165-
if not self._duplicate_verse and self._cur_verse_ref.verse_num > 0:
192+
if not self._duplicate_verse and (self._cur_verse_ref.verse_num > 0 or state.chapter_has_verse_zero):
166193
self._end_verse_text(state, self._create_verse_refs())
167-
if self._cur_verse_ref.verse_num > 0:
194+
if self._cur_verse_ref.verse_num > 0 or state.chapter_has_verse_zero:
168195
self._cur_text_type_stack.pop()
169196

170197
def _start_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
@@ -177,7 +204,17 @@ def _end_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
177204
self._cur_text_type_stack.pop()
178205

179206
def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None:
180-
if not are_overlapping_verse_ranges(verse_ref, self._cur_verse_ref):
207+
if (
208+
self._cur_verse_ref.verse_num == 0
209+
and verse_ref.verse_num == 0
210+
and not verse_ref.has_multiple
211+
and marker == "v"
212+
):
213+
# As the verse 0 marker appears within the middle of verse 0,
214+
# we should not break the position of current element stack by clearing it.
215+
# Instead, we just need to pop the current element off the stack.
216+
self._cur_elements_stack.pop()
217+
elif not are_overlapping_verse_ranges(verse_ref, self._cur_verse_ref):
181218
self._cur_elements_stack.clear()
182219
self._cur_elements_stack.append(ScriptureElement(0, marker))
183220
self._cur_verse_ref = verse_ref.copy()
@@ -239,6 +276,8 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None
239276
and para_tag.marker != "tr"
240277
and state.is_verse_para
241278
and self._cur_verse_ref.verse_num == 0
279+
and not state.chapter_has_verse_zero
280+
and not _is_private_use_marker(para_tag.marker)
242281
):
243282
self._start_parent_element(para_tag.marker)
244283
self._start_non_verse_text_wrapper(state)

machine/corpora/update_usfm_parser_handler.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification
55
from .scripture_ref import ScriptureRef
6-
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
6+
from .scripture_ref_usfm_parser_handler_base import ScriptureRefUsfmParserHandlerBase, ScriptureTextType
77
from .usfm_parser_state import UsfmParserState
88
from .usfm_stylesheet import UsfmStylesheet
99
from .usfm_tag import UsfmTextType
@@ -38,7 +38,11 @@ def __init__(self, refs: Sequence[ScriptureRef], text: str, metadata: Optional[d
3838
self.metadata = metadata
3939

4040

41-
class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler):
41+
def _sanitize_verse_data(verse_data: str) -> str:
    """Return *verse_data* with every U+200F (right-to-left mark) removed."""
    rtl_mark = "\u200F"
    # Splitting on the mark and re-joining drops every occurrence of it.
    pieces = verse_data.split(rtl_mark)
    return "".join(pieces)
43+
44+
45+
class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandlerBase):
4246
def __init__(
4347
self,
4448
rows: Optional[Sequence[UpdateUsfmRow]] = None,
@@ -319,10 +323,16 @@ def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRe
319323
self._end_update_block(state, [scripture_ref])
320324

321325
def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
326+
# If this embed is outside an update block, create an update block just for this embed
327+
embed_outside_of_block = len(self._update_block_stack) == 0
328+
if embed_outside_of_block:
329+
self._start_update_block([scripture_ref])
322330
self._update_block_stack[-1].add_embed(
323331
self._embed_tokens, marked_for_removal=self._embed_behavior == UpdateUsfmMarkerBehavior.STRIP
324332
)
325333
self._embed_tokens.clear()
334+
if embed_outside_of_block:
335+
self._end_update_block(state, [scripture_ref])
326336

327337
def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
328338
if isinstance(stylesheet, str):
@@ -349,6 +359,12 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str]
349359
row_texts: List[str] = []
350360
row_metadata = None
351361
source_index: int = 0
362+
363+
# Handle the special case of verse 0: although its row comes first in the rows,
364+
# it may be requested after other segments in the verse, so restart the search from the first row.
365+
if len(seg_scr_refs) > 0 and seg_scr_refs[0].verse_num == 0 and len(seg_scr_refs[0].path) == 0:
366+
self._verse_row_index = 0
367+
352368
while self._verse_row_index < len(self._verse_rows) and source_index < len(seg_scr_refs):
353369
compare: int = 0
354370
row = self._rows[self._verse_rows[self._verse_row_index]]
@@ -378,6 +394,8 @@ def _collect_updatable_tokens(self, state: UsfmParserState) -> None:
378394
self._use_updated_text()
379395
while self._token_index <= state.index + state.special_token_count:
380396
token = state.tokens[self._token_index]
397+
if token.type == UsfmTokenType.VERSE and token.data is not None:
398+
token.data = _sanitize_verse_data(token.data)
381399
if self._current_text_type == ScriptureTextType.EMBED:
382400
self._embed_tokens.append(token)
383401
elif (

machine/corpora/usfm_parser.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,8 @@ def process_token(self) -> bool:
223223
verse_ref = self.state.verse_ref
224224
verse_ref.chapter = token.data
225225
verse_ref.verse_num = 0
226+
self.state.chapter_has_verse_zero = False
227+
226228
# Verse offset is not zeroed for chapter 1, as it is part of intro
227229
if verse_ref.chapter_num != 1:
228230
self.state.verse_offset = 0
@@ -261,7 +263,12 @@ def process_token(self) -> bool:
261263

262264
assert token.data is not None
263265
verse_ref = self.state.verse_ref
266+
prev_verse_num = verse_ref.verse_num
264267
verse_ref.verse = token.data
268+
if verse_ref.verse_num == 0: # This token is \v 0
269+
self.state.chapter_has_verse_zero = True
270+
elif verse_ref.verse_num == -1: # Ignore invalid verse numbers
271+
verse_ref.verse_num = prev_verse_num
265272
self.state.verse_offset = 0
266273

267274
if self.handler is not None:

machine/corpora/usfm_parser_state.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def __init__(self, stylesheet: UsfmStylesheet, versification: Versification, tok
3737
self._tokens = tokens
3838
self.index = -1
3939
self.special_token = False
40+
self.chapter_has_verse_zero = False
4041
self._special_token_count: int = 0
4142

4243
@property
@@ -108,8 +109,8 @@ def is_verse_para(self) -> bool:
108109

109110
@property
110111
def is_verse_text(self) -> bool:
111-
# anything before verse 1 is not verse text
112-
if self.verse_ref.verse_num == 0:
112+
# anything before verse 1 is not verse text, unless the USFM specified verse 0
113+
if self.verse_ref.verse_num == 0 and not self.chapter_has_verse_zero:
113114
return False
114115

115116
# Sidebars and notes are not verse text

0 commit comments

Comments
 (0)