sillsdev
diff --git a/‎machine/corpora/__init__.py‎
Lines changed: 4 additions & 4 deletions b/‎machine/corpora/__init__.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎machine/corpora/file_paratext_project_file_handler.py‎
Lines changed: 16 additions & 3 deletions b/‎machine/corpora/file_paratext_project_file_handler.py‎
Lines changed: 16 additions & 3 deletions
diff --git a/‎machine/corpora/file_paratext_project_versification_error_detector.py‎
Lines changed: 2 additions & 2 deletions b/‎machine/corpora/file_paratext_project_versification_error_detector.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎machine/corpora/paratext_backup_terms_corpus.py‎
Lines changed: 2 additions & 1 deletion b/‎machine/corpora/paratext_backup_terms_corpus.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎machine/corpora/paratext_project_settings.py‎
Lines changed: 2 additions & 2 deletions b/‎machine/corpora/paratext_project_settings.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎…_project_versification_error_detector.py‎ ‎…ect_versification_error_detector_base.py‎machine/corpora/paratext_project_versification_error_detector.py renamed to machine/corpora/paratext_project_versification_error_detector_base.py
Lines changed: 11 additions & 5 deletions b/‎…_project_versification_error_detector.py‎ ‎…ect_versification_error_detector_base.py‎machine/corpora/paratext_project_versification_error_detector.py renamed to machine/corpora/paratext_project_versification_error_detector_base.py
Lines changed: 11 additions & 5 deletions
diff --git a/‎…ora/scripture_ref_usfm_parser_handler.py‎ ‎…cripture_ref_usfm_parser_handler_base.py‎machine/corpora/scripture_ref_usfm_parser_handler.py renamed to machine/corpora/scripture_ref_usfm_parser_handler_base.py
Lines changed: 54 additions & 15 deletions b/‎…ora/scripture_ref_usfm_parser_handler.py‎ ‎…cripture_ref_usfm_parser_handler_base.py‎machine/corpora/scripture_ref_usfm_parser_handler.py renamed to machine/corpora/scripture_ref_usfm_parser_handler_base.py
Lines changed: 54 additions & 15 deletions
diff --git a/‎machine/corpora/update_usfm_parser_handler.py‎
Lines changed: 20 additions & 2 deletions b/‎machine/corpora/update_usfm_parser_handler.py‎
Lines changed: 20 additions & 2 deletions
diff --git a/‎machine/corpora/usfm_parser.py‎
Lines changed: 7 additions & 0 deletions b/‎machine/corpora/usfm_parser.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎machine/corpora/usfm_parser_state.py‎
Lines changed: 3 additions & 2 deletions b/‎machine/corpora/usfm_parser_state.py‎
Lines changed: 3 additions & 2 deletions
@@ -28,12 +28,12 @@
 from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
 from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
 from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
-from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector
+from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase
 from .paratext_text_corpus import ParatextTextCorpus
 from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
 from .scripture_element import ScriptureElement
 from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
-from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
+from .scripture_ref_usfm_parser_handler_base import ScriptureRefUsfmParserHandlerBase, ScriptureTextType
 from .scripture_text_corpus import (
     ScriptureTextCorpus,
     create_versification_ref_corpus,
@@ -139,15 +139,15 @@
     "ParatextProjectSettingsParserBase",
     "ParatextProjectTermsParserBase",
     "ParatextProjectTextUpdaterBase",
-    "ParatextProjectVersificationErrorDetector",
+    "ParatextProjectVersificationErrorDetectorBase",
     "ParatextTextCorpus",
     "parse_usfm",
     "PlaceMarkersAlignmentInfo",
     "PlaceMarkersUsfmUpdateBlockHandler",
     "RtlReferenceOrder",
     "ScriptureElement",
     "ScriptureRef",
-    "ScriptureRefUsfmParserHandler",
+    "ScriptureRefUsfmParserHandlerBase",
     "ScriptureTextCorpus",
     "ScriptureTextType",
     "StandardParallelTextCorpus",
 
@@ -1,3 +1,4 @@
+import os
 from pathlib import Path
 from typing import BinaryIO, Optional
 
@@ -11,17 +12,29 @@ def __init__(self, project_dir: StrPath) -> None:
         self._project_dir = Path(project_dir)
 
     def exists(self, file_name: str) -> bool:
-        return (self._project_dir / file_name).is_file()
+        return self._get_file_name(file_name) is not None
 
     def open(self, file_name: str) -> BinaryIO:
+        actual_file_name = self._get_file_name(file_name)
+        if actual_file_name is not None:
+            file_name = actual_file_name
         return open(self._project_dir / file_name, "rb")
 
     def find(self, extension: str) -> Optional[Path]:
         return next(self._project_dir.glob(f"*{extension}"), None)
 
     def create_stylesheet(self, file_name: str) -> UsfmStylesheet:
-        custom_stylesheet_filename = self._project_dir / "custom.sty"
+        custom_stylesheet_file_name = self._get_file_name("custom.sty")
+        if custom_stylesheet_file_name is None:
+            custom_stylesheet_file_name = "custom.sty"
+        custom_stylesheet_path = self._project_dir / custom_stylesheet_file_name
         return UsfmStylesheet(
             file_name,
-            custom_stylesheet_filename if custom_stylesheet_filename.is_file() else None,
+            custom_stylesheet_path if custom_stylesheet_path.is_file() else None,
         )
+
+    def _get_file_name(self, case_insensitive_file_name: str) -> Optional[str]:
+        for actual_file_name in os.listdir(self._project_dir):
+            if actual_file_name.lower() == case_insensitive_file_name.lower():
+                return actual_file_name
+        return None
@@ -1,10 +1,10 @@
 from ..utils.typeshed import StrPath
 from .file_paratext_project_file_handler import FileParatextProjectFileHandler
 from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
-from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector
+from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase
 
 
-class FileParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetector):
+class FileParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetectorBase):
     def __init__(self, project_dir: StrPath) -> None:
         super().__init__(
             FileParatextProjectFileHandler(project_dir), FileParatextProjectSettingsParser(project_dir).parse()
 
@@ -29,8 +29,9 @@ def __init__(self, filename: StrPath, term_categories: Sequence[str], use_term_g
             text = MemoryText(
                 text_id,
                 [
-                    TextRow(text_id, key_term.id, key_term.renderings, content_type=TextRowContentType.WORD)
+                    TextRow(text_id, key_term.id, [rendering], content_type=TextRowContentType.WORD)
                     for key_term in key_terms
+                    for rendering in key_term.renderings
                 ],
             )
             self._add_text(text)
@@ -53,9 +53,9 @@ def get_book_file_name(self, book_id: str) -> str:
             book_part = _get_book_file_name_digits(book_id) + book_id
         return self.file_name_prefix + book_part + self.file_name_suffix
 
-    def get_all_scripture_book_file_names(self) -> Iterable[str]:
+    def get_all_scripture_book_ids(self) -> Iterable[str]:
         for book_id in get_scripture_books():
-            yield self.get_book_file_name(book_id)
+            yield book_id
 
 
 def _get_book_file_name_digits(book_id: str) -> str:
 
@@ -1,13 +1,14 @@
-from typing import List, Optional, Union
+from typing import List, Optional, Set, Union
 
+from ..scripture.canon import book_id_to_number
 from .paratext_project_file_handler import ParatextProjectFileHandler
 from .paratext_project_settings import ParatextProjectSettings
 from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
 from .usfm_parser import parse_usfm
 from .usfm_versification_error_detector import UsfmVersificationError, UsfmVersificationErrorDetector
 
 
-class ParatextProjectVersificationErrorDetector:
+class ParatextProjectVersificationErrorDetectorBase:
     def __init__(
         self,
         paratext_project_file_handler: ParatextProjectFileHandler,
@@ -20,14 +21,19 @@ def __init__(
             self._settings = settings
 
     def get_usfm_versification_errors(
-        self,
-        handler: Optional[UsfmVersificationErrorDetector] = None,
+        self, handler: Optional[UsfmVersificationErrorDetector] = None, books: Optional[Set[int]] = None
     ) -> List[UsfmVersificationError]:
         handler = handler or UsfmVersificationErrorDetector(self._settings)
-        for file_name in self._settings.get_all_scripture_book_file_names():
+        for book_id in self._settings.get_all_scripture_book_ids():
+
+            file_name = self._settings.get_book_file_name(book_id)
+
             if not self._paratext_project_file_handler.exists(file_name):
                 continue
 
+            if books is not None and not book_id_to_number(book_id) in books:
+                continue
+
             with self._paratext_project_file_handler.open(file_name) as sfm_file:
                 usfm: str = sfm_file.read().decode(self._settings.encoding)
             try:
 
@@ -22,10 +22,14 @@ class ScriptureTextType(Enum):
 
 
 def _is_embed_style(marker: Optional[str]) -> bool:
-    return marker is not None and (marker.strip("*") in _EMBED_STYLES or marker.startswith("z"))
+    return marker is not None and marker.strip("*") in _EMBED_STYLES
 
 
-class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC):
+def _is_private_use_marker(marker: str) -> bool:
+    return marker is not None and marker.startswith("z")
+
+
+class ScriptureRefUsfmParserHandlerBase(UsfmParserHandler, ABC):
     def __init__(self) -> None:
         self._cur_verse_ref: VerseRef = VerseRef()
         self._cur_elements_stack: List[ScriptureElement] = []
@@ -46,22 +50,29 @@ def chapter(self, state: UsfmParserState, number: str, marker: str, alt_number:
     def verse(
         self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str]
     ) -> None:
-        if state.verse_ref == self._cur_verse_ref and not self._duplicate_verse:
-            self._end_verse_text(state, self._create_verse_refs())
-            # ignore duplicate verses
-            self._duplicate_verse = True
+        # Non-Latin numbers are implicitly handled
+
+        if state.chapter_has_verse_zero and state.verse_ref.verse_num == 0:
+            # Fall through for the special case of verse 0 being specified in the USFM
+            pass
+        elif state.verse_ref == self._cur_verse_ref and not self._duplicate_verse:
+            if state.verse_ref.verse_num > 0:
+                self._end_verse_text(state, self._create_verse_refs())
+                # ignore duplicate verses
+                self._duplicate_verse = True
+            return
         elif are_overlapping_verse_ranges(verse1=number, verse2=self._cur_verse_ref.verse):
             # merge overlapping verse ranges in to one range
             verse_ref: VerseRef = self._cur_verse_ref.copy()
             verse_ref.verse = merge_verse_ranges(number, self._cur_verse_ref.verse)
             self._update_verse_ref(verse_ref, marker)
+            return
+        if self._current_text_type == ScriptureTextType.NONVERSE:
+            self._end_non_verse_text_wrapper(state)
         else:
-            if self._current_text_type == ScriptureTextType.NONVERSE:
-                self._end_non_verse_text_wrapper(state)
-            elif self._current_text_type == ScriptureTextType.VERSE:
-                self._end_verse_text_wrapper(state)
-            self._update_verse_ref(state.verse_ref, marker)
-            self._start_verse_text_wrapper(state)
+            self._end_verse_text_wrapper(state)
+        self._update_verse_ref(state.verse_ref, marker)
+        self._start_verse_text_wrapper(state)
 
     def start_para(
         self,
@@ -70,13 +81,21 @@ def start_para(
         unknown: Optional[bool],
         attributes: Optional[Sequence[UsfmAttribute]],
     ) -> None:
+        # ignore private-use markers
+        if _is_private_use_marker(marker):
+            return
+
         if self._cur_verse_ref.is_default:
             self._update_verse_ref(state.verse_ref, marker)
         if not state.is_verse_text:
             self._start_parent_element(marker)
             self._start_non_verse_text_wrapper(state)
 
     def end_para(self, state: UsfmParserState, marker: str) -> None:
+        # ignore private-use markers
+        if _is_private_use_marker(marker):
+            return
+
         if self._current_text_type == ScriptureTextType.NONVERSE:
             self._end_parent_element()
             self._end_non_verse_text_wrapper(state)
@@ -126,6 +145,10 @@ def opt_break(self, state: UsfmParserState) -> None:
     def start_char(
         self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]]
     ) -> None:
+        # ignore private-use markers
+        if _is_private_use_marker(marker):
+            return
+
         # if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment
         self._check_convert_verse_para_to_non_verse(state)
 
@@ -135,6 +158,10 @@ def start_char(
     def end_char(
         self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
     ) -> None:
+        # ignore private-use markers
+        if _is_private_use_marker(marker):
+            return
+
         if _is_embed_style(marker):
             self._end_embed_text_wrapper(state)
 
@@ -162,9 +189,9 @@ def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
         self._start_verse_text(state, self._create_verse_refs())
 
     def _end_verse_text_wrapper(self, state: UsfmParserState) -> None:
-        if not self._duplicate_verse and self._cur_verse_ref.verse_num > 0:
+        if not self._duplicate_verse and (self._cur_verse_ref.verse_num > 0 or state.chapter_has_verse_zero):
             self._end_verse_text(state, self._create_verse_refs())
-        if self._cur_verse_ref.verse_num > 0:
+        if self._cur_verse_ref.verse_num > 0 or state.chapter_has_verse_zero:
             self._cur_text_type_stack.pop()
 
     def _start_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
@@ -177,7 +204,17 @@ def _end_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
         self._cur_text_type_stack.pop()
 
     def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None:
-        if not are_overlapping_verse_ranges(verse_ref, self._cur_verse_ref):
+        if (
+            self._cur_verse_ref.verse_num == 0
+            and verse_ref.verse_num == 0
+            and not verse_ref.has_multiple
+            and marker == "v"
+        ):
+            # The verse 0 marker appears in the middle of verse 0, so we must not
+            # break the position of the current element stack by clearing it.
+            # Instead, we just pop the current element off the stack.
+            self._cur_elements_stack.pop()
+        elif not are_overlapping_verse_ranges(verse_ref, self._cur_verse_ref):
             self._cur_elements_stack.clear()
             self._cur_elements_stack.append(ScriptureElement(0, marker))
         self._cur_verse_ref = verse_ref.copy()
@@ -239,6 +276,8 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None
             and para_tag.marker != "tr"
             and state.is_verse_para
             and self._cur_verse_ref.verse_num == 0
+            and not state.chapter_has_verse_zero
+            and not _is_private_use_marker(para_tag.marker)
         ):
             self._start_parent_element(para_tag.marker)
             self._start_non_verse_text_wrapper(state)
@@ -3,7 +3,7 @@
 
 from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification
 from .scripture_ref import ScriptureRef
-from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
+from .scripture_ref_usfm_parser_handler_base import ScriptureRefUsfmParserHandlerBase, ScriptureTextType
 from .usfm_parser_state import UsfmParserState
 from .usfm_stylesheet import UsfmStylesheet
 from .usfm_tag import UsfmTextType
@@ -38,7 +38,11 @@ def __init__(self, refs: Sequence[ScriptureRef], text: str, metadata: Optional[d
         self.metadata = metadata
 
 
-class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler):
+def _sanitize_verse_data(verse_data: str) -> str:
+    return verse_data.replace("\u200F", "")
+
+
+class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandlerBase):
     def __init__(
         self,
         rows: Optional[Sequence[UpdateUsfmRow]] = None,
@@ -319,10 +323,16 @@ def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRe
         self._end_update_block(state, [scripture_ref])
 
     def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
+        # If this embed is outside an update block, create an update block just for this embed
+        embed_outside_of_block = len(self._update_block_stack) == 0
+        if embed_outside_of_block:
+            self._start_update_block([scripture_ref])
         self._update_block_stack[-1].add_embed(
             self._embed_tokens, marked_for_removal=self._embed_behavior == UpdateUsfmMarkerBehavior.STRIP
         )
         self._embed_tokens.clear()
+        if embed_outside_of_block:
+            self._end_update_block(state, [scripture_ref])
 
     def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
         if isinstance(stylesheet, str):
@@ -349,6 +359,12 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str]
         row_texts: List[str] = []
         row_metadata = None
         source_index: int = 0
+
+        # Handle the special case of verse 0: although it comes first in the rows, it is
+        # retrieved after some of the other segments in the verse, so rewind to the first row.
+        if len(seg_scr_refs) > 0 and seg_scr_refs[0].verse_num == 0 and len(seg_scr_refs[0].path) == 0:
+            self._verse_row_index = 0
+
         while self._verse_row_index < len(self._verse_rows) and source_index < len(seg_scr_refs):
             compare: int = 0
             row = self._rows[self._verse_rows[self._verse_row_index]]
@@ -378,6 +394,8 @@ def _collect_updatable_tokens(self, state: UsfmParserState) -> None:
         self._use_updated_text()
         while self._token_index <= state.index + state.special_token_count:
             token = state.tokens[self._token_index]
+            if token.type == UsfmTokenType.VERSE and token.data is not None:
+                token.data = _sanitize_verse_data(token.data)
             if self._current_text_type == ScriptureTextType.EMBED:
                 self._embed_tokens.append(token)
             elif (
 
@@ -223,6 +223,8 @@ def process_token(self) -> bool:
             verse_ref = self.state.verse_ref
             verse_ref.chapter = token.data
             verse_ref.verse_num = 0
+            self.state.chapter_has_verse_zero = False
+
             # Verse offset is not zeroed for chapter 1, as it is part of intro
             if verse_ref.chapter_num != 1:
                 self.state.verse_offset = 0
@@ -261,7 +263,12 @@ def process_token(self) -> bool:
 
             assert token.data is not None
             verse_ref = self.state.verse_ref
+            prev_verse_num = verse_ref.verse_num
             verse_ref.verse = token.data
+            if verse_ref.verse_num == 0:  # This token is \v 0
+                self.state.chapter_has_verse_zero = True
+            elif verse_ref.verse_num == -1:  # Ignore invalid verse numbers
+                verse_ref.verse_num = prev_verse_num
             self.state.verse_offset = 0
 
             if self.handler is not None:
 
@@ -37,6 +37,7 @@ def __init__(self, stylesheet: UsfmStylesheet, versification: Versification, tok
         self._tokens = tokens
         self.index = -1
         self.special_token = False
+        self.chapter_has_verse_zero = False
         self._special_token_count: int = 0
 
     @property
@@ -108,8 +109,8 @@ def is_verse_para(self) -> bool:
 
     @property
     def is_verse_text(self) -> bool:
-        # anything before verse 1 is not verse text
-        if self.verse_ref.verse_num == 0:
+        # anything before verse 1 is not verse text, unless the USFM specified verse 0
+        if self.verse_ref.verse_num == 0 and not self.chapter_has_verse_zero:
             return False
 
         # Sidebars and notes are not verse text
Original file line number	Diff line number	Diff line change
`@@ -29,8 +29,9 @@ def __init__(self, filename: StrPath, term_categories: Sequence[str], use_term_g`
`29`	`29`	`text = MemoryText(`
`30`	`30`	`text_id,`
`31`	`31`	`[`
`32`		`- TextRow(text_id, key_term.id, key_term.renderings, content_type=TextRowContentType.WORD)`
	`32`	`+ TextRow(text_id, key_term.id, [rendering], content_type=TextRowContentType.WORD)`
`33`	`33`	`for key_term in key_terms`
	`34`	`+ for rendering in key_term.renderings`
`34`	`35`	`],`
`35`	`36`	`)`
`36`	`37`	`self._add_text(text)`