From 19e1714a4ef2c4bd1545fb45db25597587b6d76d Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Mon, 9 Feb 2026 19:43:54 -0500 Subject: [PATCH 1/4] Port key terms updates https://github.com/sillsdev/machine.py/pull/257 --- src/SIL.Machine/Corpora/AlignmentRow.cs | 2 + src/SIL.Machine/Corpora/CorporaUtils.cs | 10 +- .../Corpora/FileParatextProjectFileHandler.cs | 30 +-- src/SIL.Machine/Corpora/IRow.cs | 2 + src/SIL.Machine/Corpora/KeyTerm.cs | 29 +++ .../Corpora/NParallelTextCorpus.cs | 13 +- src/SIL.Machine/Corpora/NParallelTextRow.cs | 16 +- src/SIL.Machine/Corpora/ParallelTextCorpus.cs | 3 +- src/SIL.Machine/Corpora/ParallelTextRow.cs | 14 +- .../Corpora/ParatextBackupTermsCorpus.cs | 11 +- .../Corpora/ParatextProjectTermsParserBase.cs | 175 +++++++++++------- src/SIL.Machine/Corpora/ScriptureText.cs | 2 +- src/SIL.Machine/Corpora/TextBase.cs | 11 +- .../Corpora/TextFileAlignmentCorpus.cs | 2 +- src/SIL.Machine/Corpora/TextFileText.cs | 4 +- src/SIL.Machine/Corpora/TextFileTextCorpus.cs | 34 +++- src/SIL.Machine/Corpora/TextRow.cs | 7 +- src/SIL.Machine/Corpora/TextRowContentType.cs | 5 + src/SIL.Machine/Corpora/UsfmTextBase.cs | 14 ++ .../ParatextProjectTermsParserTests.cs | 35 ++-- .../Corpora/UsfmMemoryTextTests.cs | 4 +- 21 files changed, 298 insertions(+), 125 deletions(-) create mode 100644 src/SIL.Machine/Corpora/KeyTerm.cs create mode 100644 src/SIL.Machine/Corpora/TextRowContentType.cs diff --git a/src/SIL.Machine/Corpora/AlignmentRow.cs b/src/SIL.Machine/Corpora/AlignmentRow.cs index d6d3a4e82..35c010a94 100644 --- a/src/SIL.Machine/Corpora/AlignmentRow.cs +++ b/src/SIL.Machine/Corpora/AlignmentRow.cs @@ -20,6 +20,8 @@ public AlignmentRow(string textId, object segRef) public bool IsEmpty => AlignedWordPairs.Count == 0; + public TextRowContentType ContentType => throw new NotImplementedException(); + public AlignmentRow Invert() { return new AlignmentRow(TextId, Ref) diff --git a/src/SIL.Machine/Corpora/CorporaUtils.cs b/src/SIL.Machine/Corpora/CorporaUtils.cs index 928b3e2b2..a76b1403f 100644 --- a/src/SIL.Machine/Corpora/CorporaUtils.cs +++ b/src/SIL.Machine/Corpora/CorporaUtils.cs @@ -75,12 +75,14 @@ public static string MergeVerseRanges(string verse1, string verse2) return sb.ToString(); } - internal static IEnumerable<(string Id, string FileName)> GetFiles(IEnumerable filePatterns) + internal static IEnumerable<(string Id, string FileName, int PatternIndex)> GetFiles( + IEnumerable filePatterns + ) { string[] filePatternArray = filePatterns.ToArray(); if (filePatternArray.Length == 1 && File.Exists(filePatternArray[0])) { - yield return ("*all*", filePatternArray[0]); + yield return ("*all*", filePatternArray[0], 0); } else { @@ -89,7 +91,7 @@ public static string MergeVerseRanges(string verse1, string verse2) string filePattern = filePatternArray[i]; if (File.Exists(filePattern)) { - yield return (i.ToString(CultureInfo.InvariantCulture), filePattern); + yield return (i.ToString(CultureInfo.InvariantCulture), filePattern, i); continue; } @@ -145,7 +147,7 @@ public static string MergeVerseRanges(string verse1, string verse2) if (sb.Length > 0) id = sb.ToString(); } - yield return (id, fileName); + yield return (id, fileName, i); } } } diff --git a/src/SIL.Machine/Corpora/FileParatextProjectFileHandler.cs b/src/SIL.Machine/Corpora/FileParatextProjectFileHandler.cs index df6dd0453..b192c968b 100644 --- a/src/SIL.Machine/Corpora/FileParatextProjectFileHandler.cs +++ b/src/SIL.Machine/Corpora/FileParatextProjectFileHandler.cs @@ -14,31 +14,21 @@ public FileParatextProjectFileHandler(string projectDir) public bool Exists(string fileName) { - return Directory - .EnumerateFiles(_projectDir) - .Any(f => Path.GetFileName(f).Equals(fileName, System.StringComparison.InvariantCultureIgnoreCase)); + return GetFileName(fileName) != null; } public Stream Open(string fileName) { - return File.OpenRead( - Path.Combine( - _projectDir, - Directory - .EnumerateFiles(_projectDir) - .FirstOrDefault(f => - Path.GetFileName(f).Equals(fileName, System.StringComparison.InvariantCultureIgnoreCase) - ) - ) - ); + fileName = GetFileName(fileName) ?? fileName; + return File.OpenRead(Path.Combine(_projectDir, fileName)); } public UsfmStylesheet CreateStylesheet(string fileName) { - string customStylesheetFileName = Path.Combine(_projectDir, "custom.sty"); + string customStylesheetFileName = GetFileName("custom.sty"); return new UsfmStylesheet( fileName, - File.Exists(customStylesheetFileName) ? customStylesheetFileName : null + customStylesheetFileName != null ? Path.Combine(_projectDir, customStylesheetFileName) : null ); } @@ -46,5 +36,15 @@ public string Find(string extension) { return Directory.EnumerateFiles(_projectDir, "*" + extension).FirstOrDefault(); } + + private string GetFileName(string caseInsensitiveFileName) + { + return Directory + .EnumerateFiles(_projectDir) + .Select(p => Path.GetFileName(p)) + .FirstOrDefault(f => + f.Equals(caseInsensitiveFileName, System.StringComparison.InvariantCultureIgnoreCase) + ); + } } } diff --git a/src/SIL.Machine/Corpora/IRow.cs b/src/SIL.Machine/Corpora/IRow.cs index beaf4c691..7d9c4487a 100644 --- a/src/SIL.Machine/Corpora/IRow.cs +++ b/src/SIL.Machine/Corpora/IRow.cs @@ -5,5 +5,7 @@ public interface IRow object Ref { get; } bool IsEmpty { get; } + + TextRowContentType ContentType { get; } } } diff --git a/src/SIL.Machine/Corpora/KeyTerm.cs b/src/SIL.Machine/Corpora/KeyTerm.cs new file mode 100644 index 000000000..21a315ca5 --- /dev/null +++ b/src/SIL.Machine/Corpora/KeyTerm.cs @@ -0,0 +1,29 @@ +using System.Collections.Generic; +using SIL.Scripture; + +public class KeyTerm +{ + public string Id { get; } + public string Category { get; } + public string Domain { get; } + public IReadOnlyList Renderings { get; } + public IReadOnlyList References { get; } + public IReadOnlyList RenderingsPatterns { get; } + + public KeyTerm( + string id, + string category, + string domain, + IReadOnlyList renderings, + IReadOnlyList references, + IReadOnlyList renderingsPatterns + ) + { + Id = id; + Category = category; + Domain = domain; + Renderings = renderings; + References = references; + RenderingsPatterns = renderingsPatterns; + } +} diff --git a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs index 1001119a6..94673413d 100644 --- a/src/SIL.Machine/Corpora/NParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/NParallelTextCorpus.cs @@ -294,6 +294,8 @@ private IEnumerable CreateRows( throw new ArgumentNullException("A corpus row must be specified."); object[] defaultRefs = new object[] { rows.Where(r => r != null).Select(r => r.Ref).First() }; + TextRowContentType contentType = TextRowContentType.Segment; + string textId = null; object[][] refs = new object[N][]; TextRowFlags[] flags = new TextRowFlags[N]; @@ -327,7 +329,7 @@ private IEnumerable CreateRows( } refs = refs.Select(r => r ?? defaultRefs).ToArray(); - yield return new NParallelTextRow(textId, refs) + yield return new NParallelTextRow(textId, refs, contentType) { NSegments = rows.Select(r => r?.Segment ?? Array.Empty()).ToArray(), NFlags = flags.ToReadOnlyList() @@ -441,6 +443,7 @@ private class RangeRow public bool IsSentenceStart { get; set; } = false; public bool IsInRange => Refs.Count > 0; public bool IsEmpty => Segment.Count == 0; + public TextRowContentType ContentType { get; set; } = TextRowContentType.Segment; } private class NRangeInfo @@ -451,6 +454,7 @@ private class NRangeInfo public IComparer RowRefComparer { get; set; } = null; public List Rows { get; } public bool IsInRange => Rows.Any(r => r.IsInRange); + public TextRowContentType ContentType { get; set; } = TextRowContentType.Segment; public NRangeInfo(int n) { @@ -472,6 +476,7 @@ public void AddTextRow(TextRow row, int index) } TextId = row.TextId; Rows[index].Refs.Add(row.Ref); + Rows[index].ContentType = row.ContentType; if (Rows[index].IsEmpty) Rows[index].IsSentenceStart = row.IsSentenceStart; Rows[index].Segment.AddRange(row.Segment); @@ -486,8 +491,8 @@ public NParallelTextRow CreateRow() .ToList(); foreach (int i in Enumerable.Range(0, Rows.Count)) { - var row = Rows[i]; - + RangeRow row = Rows[i]; + ContentType = row.ContentType; if (Versifications.All(v => v != null) && row.Refs.Count() == 0) { refs[i] = referenceRefs @@ -502,7 +507,7 @@ public NParallelTextRow CreateRow() refs[i] = row.Refs.ToArray(); } } - var nParRow = new NParallelTextRow(TextId, refs) + var nParRow = new NParallelTextRow(TextId, refs, ContentType) { NSegments = Rows.Select(r => r.Segment.ToArray()).ToArray(), NFlags = Rows.Select(r => r.IsSentenceStart ? TextRowFlags.SentenceStart : TextRowFlags.None) diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs index 4d58e9079..115c862a4 100644 --- a/src/SIL.Machine/Corpora/NParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -8,7 +8,11 @@ namespace SIL.Machine.Corpora { public class NParallelTextRow : IRow { - public NParallelTextRow(string textId, IEnumerable> nRefs) + public NParallelTextRow( + string textId, + IEnumerable> nRefs, + TextRowContentType contentType = TextRowContentType.Segment + ) { if (string.IsNullOrEmpty(textId)) throw new ArgumentNullException(nameof(textId)); @@ -21,6 +25,7 @@ public NParallelTextRow(string textId, IEnumerable> nRefs) N = NRefs.Count; NSegments = Enumerable.Range(0, N).Select(_ => Array.Empty()).ToImmutableArray(); NFlags = Enumerable.Range(0, N).Select(_ => TextRowFlags.SentenceStart).ToImmutableArray(); + _contentType = contentType; } public string TextId { get; } @@ -33,6 +38,10 @@ public NParallelTextRow(string textId, IEnumerable> nRefs) public IReadOnlyList> NSegments { get; set; } public IReadOnlyList NFlags { get; set; } + private readonly TextRowContentType _contentType; + + public TextRowContentType ContentType => _contentType; + public bool IsSentenceStart(int i) => NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.SentenceStart) : throw new ArgumentOutOfRangeException(); @@ -48,7 +57,10 @@ public bool IsRangeStart(int i) => public NParallelTextRow Invert() { - return new NParallelTextRow(TextId, NRefs.Reverse()) { NFlags = NFlags.Reverse().ToImmutableArray(), }; + return new NParallelTextRow(TextId, NRefs.Reverse(), _contentType) + { + NFlags = NFlags.Reverse().ToImmutableArray(), + }; } } } diff --git a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs index 53d07257c..f7d8f6700 100644 --- a/src/SIL.Machine/Corpora/ParallelTextCorpus.cs +++ b/src/SIL.Machine/Corpora/ParallelTextCorpus.cs @@ -61,7 +61,8 @@ public override IEnumerable GetRows(IEnumerable textIds yield return new ParallelTextRow( nRow.TextId, nRow.NRefs[0].Count > 0 || !isScripture ? nRow.NRefs[0] : new object[] { nRow.Ref }, - nRow.NRefs[1].Count > 0 || !isScripture ? nRow.NRefs[1] : new object[] { nRow.Ref } + nRow.NRefs[1].Count > 0 || !isScripture ? nRow.NRefs[1] : new object[] { nRow.Ref }, + nRow.ContentType ) { SourceFlags = nRow.NFlags[0], diff --git a/src/SIL.Machine/Corpora/ParallelTextRow.cs b/src/SIL.Machine/Corpora/ParallelTextRow.cs index 9b8618e42..710c6740d 100644 --- a/src/SIL.Machine/Corpora/ParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/ParallelTextRow.cs @@ -6,7 +6,12 @@ namespace SIL.Machine.Corpora { public class ParallelTextRow : IRow { - public ParallelTextRow(string textId, IReadOnlyList sourceRefs, IReadOnlyList targetRefs) + public ParallelTextRow( + string textId, + IReadOnlyList sourceRefs, + IReadOnlyList targetRefs, + TextRowContentType contentType = TextRowContentType.Segment + ) { if (string.IsNullOrEmpty(textId)) throw new ArgumentNullException(nameof(textId)); @@ -17,6 +22,7 @@ public ParallelTextRow(string textId, IReadOnlyList sourceRefs, IReadOnl TextId = textId; SourceRefs = sourceRefs; TargetRefs = targetRefs; + _contentType = contentType; } public string TextId { get; } @@ -37,6 +43,10 @@ public ParallelTextRow(string textId, IReadOnlyList sourceRefs, IReadOnl public TextRowFlags SourceFlags { get; set; } = TextRowFlags.SentenceStart; public TextRowFlags TargetFlags { get; set; } = TextRowFlags.SentenceStart; + private readonly TextRowContentType _contentType; + + public TextRowContentType ContentType => _contentType; + public bool IsSourceSentenceStart => SourceFlags.HasFlag(TextRowFlags.SentenceStart); public bool IsSourceInRange => SourceFlags.HasFlag(TextRowFlags.InRange); public bool IsSourceRangeStart => SourceFlags.HasFlag(TextRowFlags.RangeStart); @@ -51,7 +61,7 @@ public ParallelTextRow(string textId, IReadOnlyList sourceRefs, IReadOnl public ParallelTextRow Invert() { - return new ParallelTextRow(TextId, TargetRefs, SourceRefs) + return new ParallelTextRow(TextId, TargetRefs, SourceRefs, _contentType) { SourceSegment = TargetSegment, TargetSegment = SourceSegment, diff --git a/src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs b/src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs index 60ce88002..e8db5d93c 100644 --- a/src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs +++ b/src/SIL.Machine/Corpora/ParatextBackupTermsCorpus.cs @@ -15,9 +15,9 @@ public ParatextBackupTermsCorpus( { using (var archive = ZipFile.OpenRead(fileName)) { - IEnumerable<(string, IReadOnlyList)> glosses = new ZipParatextProjectTermsParser(archive) + IEnumerable keyTerms = new ZipParatextProjectTermsParser(archive) .Parse(termCategories, useTermGlosses, chapters) - .OrderBy(g => g.TermId); + .OrderBy(g => g.Id); ParatextProjectSettings settings = ZipParatextProjectSettingsParser.Parse(archive); @@ -26,8 +26,11 @@ public ParatextBackupTermsCorpus( IText text = new MemoryText( textId, - glosses.SelectMany(kvp => - kvp.Item2.Select(gloss => new TextRow(textId, kvp.Item1) { Segment = new string[] { gloss } }) + keyTerms.SelectMany(keyTerm => + keyTerm.Renderings.Select(gloss => new TextRow(textId, keyTerm.Id, TextRowContentType.Word) + { + Segment = new string[] { gloss } + }) ) ); AddText(text); diff --git a/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs index 15e761756..35aaaeec1 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs @@ -48,38 +48,25 @@ ParatextProjectSettings settings _paratextProjectFileHandler = paratextProjectFileHandler; } - public IEnumerable<(string TermId, IReadOnlyList Glosses)> Parse( + public IEnumerable Parse( IEnumerable termCategories, bool useTermGlosses = true, IDictionary> chapters = null ) { XDocument biblicalTermsDoc; - IDictionary termIdToCategoryDictionary; + IDictionary termIdToCategory; + IDictionary termIdToDomain; IDictionary> termIdToReferences; - if (_settings.BiblicalTermsListType == "Project") + if ( + _settings.BiblicalTermsListType == "Project" + && _paratextProjectFileHandler.Exists(_settings.BiblicalTermsFileName) + ) { - if (_paratextProjectFileHandler.Exists(_settings.BiblicalTermsFileName)) - { - using (Stream keyTermsFile = _paratextProjectFileHandler.Open(_settings.BiblicalTermsFileName)) - { - biblicalTermsDoc = XDocument.Load(keyTermsFile); - termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc); - termIdToReferences = GetReferences(biblicalTermsDoc); - } - } - else + using (Stream keyTermsFile = _paratextProjectFileHandler.Open(_settings.BiblicalTermsFileName)) { - using ( - Stream keyTermsFile = Assembly - .GetExecutingAssembly() - .GetManifestResourceStream("SIL.Machine.Corpora.BiblicalTerms.xml") - ) - { - biblicalTermsDoc = XDocument.Load(keyTermsFile); - termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc); - termIdToReferences = GetReferences(biblicalTermsDoc); - } + biblicalTermsDoc = XDocument.Load(keyTermsFile); + (termIdToCategory, termIdToDomain, termIdToReferences) = GetTermData(biblicalTermsDoc); } } else if (PredefinedTermsListTypes.Contains(_settings.BiblicalTermsListType)) @@ -91,13 +78,13 @@ ParatextProjectSettings settings ) { biblicalTermsDoc = XDocument.Load(keyTermsFile); - termIdToCategoryDictionary = GetCategoryPerId(biblicalTermsDoc); - termIdToReferences = GetReferences(biblicalTermsDoc); + (termIdToCategory, termIdToDomain, termIdToReferences) = GetTermData(biblicalTermsDoc); } } else { - termIdToCategoryDictionary = new Dictionary(); + termIdToCategory = new Dictionary(); + termIdToDomain = new Dictionary(); termIdToReferences = new Dictionary>(); } @@ -129,14 +116,15 @@ ParatextProjectSettings settings termsRenderings = termRenderingsDoc .Descendants() .Where(n => n.Name.LocalName == "TermRendering") + .Where(ele => ele.Attribute("Guess") == null || ele.Attribute("Guess").Value == "false") .Select(ele => (ele.Attribute("Id").Value, ele)) - .Where(kvp => IsInCategory(kvp.Item1, termCategories, termIdToCategoryDictionary)) + .Where(kvp => IsInCategory(kvp.Item1, termCategories, termIdToCategory)) .Where(kvp => IsInChapters(kvp.Item1, chapters, termIdToReferences)) .Select(kvp => { string id = kvp.Item1.Replace("\n", " "); string rendering = kvp.Item2.Element("Renderings").Value; - IReadOnlyList renderings = GetRenderings(rendering); + IReadOnlyList renderings = GetRenderingsWithPattern(rendering); return (id, renderings); }) .GroupBy(kvp => kvp.Item1, kvp => kvp.Item2) //Handle duplicate term ids (which do exist) e.g. שִׁלֵּמִי @@ -151,7 +139,7 @@ ParatextProjectSettings settings .Descendants() .Where(n => n.Name.LocalName == "Localization") .Select(ele => (ele.Attribute("Id").Value, ele)) - .Where(kvp => IsInCategory(kvp.Item1, termCategories, termIdToCategoryDictionary)) + .Where(kvp => IsInCategory(kvp.Item1, termCategories, termIdToCategory)) .Where(kvp => IsInChapters(kvp.Item1, chapters, termIdToReferences)) .Select(kvp => { @@ -166,11 +154,53 @@ ParatextProjectSettings settings } if (termsGlosses.Count > 0 || termsRenderings.Count > 0) { - return termsRenderings - .Concat(termsGlosses.Where(kvp => !termsRenderings.ContainsKey(kvp.Key))) - .Select(kvp => (kvp.Key, (IReadOnlyList)kvp.Value.ToList())); + List terms = new List(); + foreach ( + string id in termsRenderings.Keys.Distinct().Union(termsGlosses.Keys.Distinct()).OrderBy(k => k) + ) + { + if (!termsRenderings.TryGetValue(id, out IEnumerable renderingsPatterns)) + { + renderingsPatterns = new List(); + } + if (!termIdToCategory.TryGetValue(id, out string category)) + { + category = "?"; + } + if (!termIdToDomain.TryGetValue(id, out string domain)) + { + domain = "?"; + } + if (!termsGlosses.TryGetValue(id, out IEnumerable glosses)) + { + glosses = new List(); + } + if (!termIdToReferences.TryGetValue(id, out ImmutableHashSet references)) + { + references = ImmutableHashSet.Create(); + } + IEnumerable renderings = renderingsPatterns.Select(r => r.Replace("*", "")); + if (!renderings.Any()) + { + if (!glosses.Any()) + { + continue; + } + renderings = glosses; + } + KeyTerm term = new KeyTerm( + id, + category, + domain, + renderings.ToArray(), + references.ToArray(), + renderingsPatterns.ToArray() + ); + terms.Add(term); + } + return terms; } - return new List<(string, IReadOnlyList)>(); + return new List(); } private static bool IsInCategory( @@ -227,14 +257,9 @@ public static IReadOnlyList GetGlosses(string gloss) return Regex.Split(gloss, @"[;,/]").Select(g => g.Trim()).Where(s => s != "").Distinct().ToList(); } - public static IReadOnlyList GetRenderings(string rendering) + public static IReadOnlyList GetRenderingsWithPattern(string rendering) { - return Regex - .Split(rendering.Trim(), @"\|\|") - .Select(r => CleanTerm(r).Trim()) - .Select(r => r.Replace("*", "")) - .Where(r => r != "") - .ToList(); + return Regex.Split(rendering.Trim(), @"\|\|").Select(r => CleanTerm(r).Trim()).Where(r => r != "").ToList(); } /// @@ -273,30 +298,56 @@ public static string StripParens(string termString, char left = '(', char right return termString; } - private static IDictionary GetCategoryPerId(XDocument biblicalTermsDocument) + private static ( + IDictionary TermCategories, + IDictionary TermDomains, + IDictionary> TermReferences + ) GetTermData(XDocument biblicalTermsDocument) { - return biblicalTermsDocument - .Descendants() - .Where(n => n.Name.LocalName == "Term") - .DistinctBy(e => e.Attribute("Id").Value) - .ToDictionary(e => e.Attribute("Id").Value, e => e.Element("Category")?.Value ?? ""); - } + var termIdToCategory = new Dictionary(); + var termIdToDomain = new Dictionary(); + var termIdToReferences = new Dictionary>(); + foreach (XElement term in biblicalTermsDocument.Descendants().Where(n => n.Name.LocalName == "Term")) + { + string termId = term.Attribute("Id").Value; + if (termId == null) + continue; - private static IDictionary> GetReferences(XDocument biblicalTermsDocument) - { - return biblicalTermsDocument - .Descendants() - .Where(n => n.Name.LocalName == "Term") - .DistinctBy(e => e.Attribute("Id").Value) - .ToDictionary( - e => e.Attribute("Id").Value, - e => - e.Element("References") - ?.Descendants() - .Where(reference => int.TryParse(reference.Value.Substring(0, 9), out int _)) - .Select(reference => new VerseRef(int.Parse(reference.Value.Substring(0, 9)))) - .ToImmutableHashSet() - ); + if (!termIdToCategory.ContainsKey(termId)) + { + XElement category = term.Element("Category"); + termIdToCategory[termId] = category != null && category.Value != "" ? category.Value : ""; + } + if (!termIdToDomain.ContainsKey(termId)) + { + XElement domain = term.Element("Domain"); + termIdToDomain[termId] = domain != null && domain.Value != "" ? domain.Value : ""; + } + if (!termIdToReferences.ContainsKey(termId)) + { + XElement referencesElement = term.Element("References"); + List references = new List(); + if (referencesElement != null) + { + foreach (XElement verseElement in referencesElement.Elements("Verse")) + { + if ( + verseElement == null + || !int.TryParse(verseElement.Value.Substring(0, 9), out int bbbcccvvv) + ) + { + continue; + } + + var verseRef = new VerseRef(bbbcccvvv); + verseRef.ChangeVersification(ScrVers.Original); + references.Add(verseRef); + } + termIdToReferences[termId] = ImmutableHashSet.Create(references.ToArray()); + } + } + } + return (termIdToCategory, termIdToDomain, termIdToReferences); } } } diff --git a/src/SIL.Machine/Corpora/ScriptureText.cs b/src/SIL.Machine/Corpora/ScriptureText.cs index b55b328d6..7693d0343 100644 --- a/src/SIL.Machine/Corpora/ScriptureText.cs +++ b/src/SIL.Machine/Corpora/ScriptureText.cs @@ -6,7 +6,7 @@ namespace SIL.Machine.Corpora public abstract class ScriptureText : TextBase { protected ScriptureText(string id, ScrVers versification) - : base(id, CorporaUtils.GetScriptureTextSortKey(id)) + : base(id, CorporaUtils.GetScriptureTextSortKey(id), TextRowContentType.Segment) { Versification = versification ?? ScrVers.English; } diff --git a/src/SIL.Machine/Corpora/TextBase.cs b/src/SIL.Machine/Corpora/TextBase.cs index f72fef40c..3db21d542 100644 --- a/src/SIL.Machine/Corpora/TextBase.cs +++ b/src/SIL.Machine/Corpora/TextBase.cs @@ -7,16 +7,23 @@ namespace SIL.Machine.Corpora { public abstract class TextBase : IText { - protected TextBase(string id, string sortKey) + protected TextBase( + string id, + string sortKey, + TextRowContentType defaultContentType = TextRowContentType.Segment + ) { Id = id; SortKey = sortKey; + DefaultContentType = defaultContentType; } public string Id { get; } public string SortKey { get; } + public TextRowContentType DefaultContentType { get; } + public virtual int Count(bool includeEmpty = true) { return includeEmpty ? GetRows().Count() : GetRows().Count(r => !r.IsEmpty); @@ -27,7 +34,7 @@ public virtual int Count(bool includeEmpty = true) protected TextRow CreateRow(string text, object segRef, TextRowFlags flags = TextRowFlags.SentenceStart) { text = text.Trim(); - return new TextRow(Id, segRef) + return new TextRow(Id, segRef, DefaultContentType) { Segment = text.Length == 0 ? Array.Empty() : new[] { text }, Flags = flags diff --git a/src/SIL.Machine/Corpora/TextFileAlignmentCorpus.cs b/src/SIL.Machine/Corpora/TextFileAlignmentCorpus.cs index 84b87f72b..39002ffd2 100644 --- a/src/SIL.Machine/Corpora/TextFileAlignmentCorpus.cs +++ b/src/SIL.Machine/Corpora/TextFileAlignmentCorpus.cs @@ -12,7 +12,7 @@ public TextFileAlignmentCorpus(IEnumerable filePatterns) private static IEnumerable GetAlignmentCollections(IEnumerable filePatterns) { - foreach ((string id, string fileName) in CorporaUtils.GetFiles(filePatterns)) + foreach ((string id, string fileName, int _) in CorporaUtils.GetFiles(filePatterns)) yield return new TextFileAlignmentCollection(id, fileName); } } diff --git a/src/SIL.Machine/Corpora/TextFileText.cs b/src/SIL.Machine/Corpora/TextFileText.cs index 405a0fd01..078de36b3 100644 --- a/src/SIL.Machine/Corpora/TextFileText.cs +++ b/src/SIL.Machine/Corpora/TextFileText.cs @@ -7,8 +7,8 @@ namespace SIL.Machine.Corpora { public class TextFileText : TextBase { - public TextFileText(string id, string fileName) - : base(id, id) + public TextFileText(string id, string fileName, TextRowContentType contentType = TextRowContentType.Segment) + : base(id, id, contentType) { FileName = fileName; } diff --git a/src/SIL.Machine/Corpora/TextFileTextCorpus.cs b/src/SIL.Machine/Corpora/TextFileTextCorpus.cs index c69ebc5af..493723135 100644 --- a/src/SIL.Machine/Corpora/TextFileTextCorpus.cs +++ b/src/SIL.Machine/Corpora/TextFileTextCorpus.cs @@ -1,19 +1,41 @@ using System.Collections.Generic; +using System.Linq; namespace SIL.Machine.Corpora { public class TextFileTextCorpus : DictionaryTextCorpus { public TextFileTextCorpus(params string[] filePatterns) - : this((IEnumerable)filePatterns) { } + : this(filePatterns, new List()) { } - public TextFileTextCorpus(IEnumerable filePatterns) - : base(GetTexts(filePatterns)) { } + public TextFileTextCorpus(IEnumerable contentTypes = null, params string[] filePatterns) + : this(filePatterns, contentTypes) { } - private static IEnumerable GetTexts(IEnumerable filePatterns) + public TextFileTextCorpus(IEnumerable filePatterns, IEnumerable contentTypes = null) + : base(GetTexts(filePatterns, contentTypes)) { } + + private static IEnumerable GetTexts( + IEnumerable filePatterns, + IEnumerable contentTypes + ) { - foreach ((string id, string fileName) in CorporaUtils.GetFiles(filePatterns)) - yield return new TextFileText(id, fileName); + List contentTypesList; + if (contentTypes == null) + { + contentTypesList = new List(); + } + else + { + contentTypesList = contentTypes.ToList(); + } + foreach ((string id, string fileName, int patternIndex) in CorporaUtils.GetFiles(filePatterns)) + { + yield return new TextFileText( + id, + fileName, + patternIndex < contentTypesList.Count ? contentTypesList[patternIndex] : TextRowContentType.Segment + ); + } } } } diff --git a/src/SIL.Machine/Corpora/TextRow.cs b/src/SIL.Machine/Corpora/TextRow.cs index 76494a038..f08874ee0 100644 --- a/src/SIL.Machine/Corpora/TextRow.cs +++ b/src/SIL.Machine/Corpora/TextRow.cs @@ -14,16 +14,21 @@ public enum TextRowFlags public class TextRow : IRow { - public TextRow(string textId, object rowRef) + public TextRow(string textId, object rowRef, TextRowContentType contentType = TextRowContentType.Segment) { TextId = textId; Ref = rowRef; + _contentType = contentType; } public string TextId { get; } public object Ref { get; } + private readonly TextRowContentType _contentType; + + public TextRowContentType ContentType => _contentType; + public bool IsEmpty => Segment.Count == 0; public TextRowFlags Flags { get; set; } = TextRowFlags.SentenceStart; diff --git a/src/SIL.Machine/Corpora/TextRowContentType.cs b/src/SIL.Machine/Corpora/TextRowContentType.cs new file mode 100644 index 000000000..ed4816b70 --- /dev/null +++ b/src/SIL.Machine/Corpora/TextRowContentType.cs @@ -0,0 +1,5 @@ +public enum TextRowContentType +{ + Word, + Segment, +} diff --git a/src/SIL.Machine/Corpora/UsfmTextBase.cs b/src/SIL.Machine/Corpora/UsfmTextBase.cs index 61cddf446..40061f3f0 100644 --- a/src/SIL.Machine/Corpora/UsfmTextBase.cs +++ b/src/SIL.Machine/Corpora/UsfmTextBase.cs @@ -2,6 +2,7 @@ using System.Collections.Generic; using System.IO; using System.Text; +using SIL.Extensions; using SIL.Machine.Utils; using SIL.Scripture; @@ -104,6 +105,19 @@ public TextRowCollector(UsfmTextBase text) public IEnumerable Rows => _rows; + public override void StartBook(UsfmParserState state, string marker, string code) + { + base.StartBook(state, marker, code); + if (!Canon.AllBookIds.Contains(code, StringComparison.InvariantCulture)) + { + throw new ArgumentException($"The book {code} is not a valid book id."); + } + if (code != _text.Id) + { + throw new ArgumentException($"The \\id marker {code} does not match the text id {_text.Id}."); + } + } + public override void StartPara( UsfmParserState state, string marker, diff --git a/tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs b/tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs index 00329912a..8de0cad95 100644 --- a/tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/ParatextProjectTermsParserTests.cs @@ -36,9 +36,9 @@ public void TestGetKeyTermsFromTermsRenderings() } } ); - IEnumerable<(string TermId, IReadOnlyList Glosses)> terms = env.GetGlosses(); + IEnumerable terms = env.GetGlosses(); Assert.That(terms.Count, Is.EqualTo(1)); - Assert.That(string.Join(" ", terms.First().Glosses), Is.EqualTo("Xerxes")); + Assert.That(string.Join(" ", terms.First().Renderings), Is.EqualTo("Xerxes")); } [Test] @@ -51,9 +51,9 @@ public void TestGetKeyTermsFromTermsLocalizations_NoTermRenderings() ), useTermGlosses: true ); - IEnumerable<(string TermId, IReadOnlyList Glosses)> terms = env.GetGlosses(); + IEnumerable terms = env.GetGlosses(); Assert.That(terms.Count, Is.EqualTo(5726)); - Assert.That(string.Join(" ", terms.First().Glosses), Is.EqualTo("Abagtha")); + Assert.That(string.Join(" ", terms.First().Renderings), Is.EqualTo("Aaron")); } [Test] @@ -66,7 +66,7 @@ public void TestGetKeyTermsFromTermsLocalizations_NoTermRenderings_DoNotUseTermG ), useTermGlosses: false ); - IEnumerable<(string TermId, IReadOnlyList Glosses)> terms = env.GetGlosses(); + IEnumerable terms = env.GetGlosses(); Assert.That(terms.Count, Is.EqualTo(0)); } @@ -81,9 +81,9 @@ public void TestGetKeyTermsFromTermsLocalizations() ), useTermGlosses: true ); - IEnumerable<(string TermId, IReadOnlyList Glosses)> terms = env.GetGlosses(); + IEnumerable terms = env.GetGlosses(); Assert.That(terms.Count, Is.EqualTo(5715)); - Assert.That(string.Join(" ", terms.First().Glosses), Is.EqualTo("Aaron")); + Assert.That(string.Join(" ", terms.First().Renderings), Is.EqualTo("Aaron")); } [Test] @@ -104,9 +104,9 @@ public void TestGetKeyTermsFromTermsLocalizations_FilterByChapters() } } ); - IEnumerable<(string TermId, IReadOnlyList Glosses)> terms = env.GetGlosses(); + IEnumerable terms = env.GetGlosses(); Assert.That(terms.Count, Is.EqualTo(3)); //Habakkuk, YHWH, Kashdi/Chaldean are the only PN terms in HAB 1 - Assert.That(string.Join(" ", terms.First().Glosses), Is.EqualTo("Habaquq")); + Assert.That(string.Join(" ", terms.First().Renderings), Is.EqualTo("Habaquq")); } [Test] @@ -135,10 +135,10 @@ public void TestGetKeyTermsFromTermsLocalizations_TermRenderingsExists_PreferLoc }, useTermGlosses: true ); - IReadOnlyList<(string TermId, IReadOnlyList Glosses)> terms = env.GetGlosses().ToList(); + IReadOnlyList terms = env.GetGlosses().ToList(); Assert.That(terms.Count, Is.EqualTo(5726)); - Assert.That(string.Join(" ", terms[1].Glosses), Is.EqualTo("Abagtha")); - Assert.That(string.Join(" ", terms[2].Glosses), Is.EqualTo("Abi")); + Assert.That(string.Join(" ", terms[1].Renderings), Is.EqualTo("Aaron")); + Assert.That(string.Join(" ", terms[2].Renderings), Is.EqualTo("Abaddon Destroyer")); } [Test] @@ -166,13 +166,16 @@ public void TestGetGlosses(string glossString, IReadOnlyList expectedOut [Test] [TestCase("", new string[] { })] - [TestCase("*Abba*", new string[] { "Abba" })] + [TestCase("*Abba*", new string[] { "*Abba*" })] [TestCase("Abba|| ", new string[] { "Abba" })] [TestCase("Abba||Abbah", new string[] { "Abba", "Abbah" })] [TestCase("Abba (note)", new string[] { "Abba" })] public void TestGetRenderings(string renderingString, IReadOnlyList expectedOutput) { - Assert.That(ParatextProjectTermsParserBase.GetRenderings(renderingString), Is.EqualTo(expectedOutput)); + Assert.That( + ParatextProjectTermsParserBase.GetRenderingsWithPattern(renderingString), + Is.EqualTo(expectedOutput) + ); } private class TestEnvironment( @@ -187,9 +190,9 @@ private class TestEnvironment( public ParatextProjectTermsParserBase Parser { get; } = new MemoryParatextProjectTermsParser(files, settings); - public IEnumerable<(string TermId, IReadOnlyList Glosses)> GetGlosses() + public IEnumerable GetGlosses() { - return Parser.Parse(new string[] { "PN" }, _useTermGlosses, _chapters); + return Parser.Parse(["PN"], _useTermGlosses, _chapters); } } } diff --git a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs index 243eb6541..d2dd5dbdf 100644 --- a/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs +++ b/tests/SIL.Machine.Tests/Corpora/UsfmMemoryTextTests.cs @@ -399,7 +399,7 @@ public void GetRows_VerseZeroWithText() public void GetRows_PrivateUseMarker() { TextRow[] rows = GetRows( - @"\id FRT - Test English Apocrypha + @"\id MAT - Test English Apocrypha \zmt Ignore this paragraph \mt1 Test English Apocrypha \pc Copyright Statement \zimagecopyrights @@ -414,7 +414,7 @@ public void GetRows_PrivateUseMarker() Assert.That( rows[1].Ref, - Is.EqualTo(ScriptureRef.Parse("FRT 1:0/2:pc")), + Is.EqualTo(ScriptureRef.Parse("MAT 1:0/2:pc")), string.Join(",", rows.ToList().Select(tr => tr.Ref.ToString())) ); Assert.That( From 95aa3ccd4239b21d4e47498eac8e99dfedda099d Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 10 Feb 2026 13:38:48 -0500 Subject: [PATCH 2/4] Address reviewer comments --- src/SIL.Machine/Corpora/AlignmentRow.cs | 2 -- src/SIL.Machine/Corpora/IRow.cs | 2 -- src/SIL.Machine/Corpora/KeyTerm.cs | 13 +++++++------ src/SIL.Machine/Corpora/NParallelTextRow.cs | 4 ++-- src/SIL.Machine/Corpora/ParallelTextRow.cs | 9 +++------ .../Corpora/ParatextProjectTermsParserBase.cs | 2 +- src/SIL.Machine/Corpora/TextRow.cs | 6 ++---- 7 files changed, 15 insertions(+), 23 deletions(-) diff --git a/src/SIL.Machine/Corpora/AlignmentRow.cs b/src/SIL.Machine/Corpora/AlignmentRow.cs index 35c010a94..d6d3a4e82 100644 --- a/src/SIL.Machine/Corpora/AlignmentRow.cs +++ b/src/SIL.Machine/Corpora/AlignmentRow.cs @@ -20,8 +20,6 @@ public AlignmentRow(string textId, object segRef) public bool IsEmpty => AlignedWordPairs.Count == 0; - public TextRowContentType ContentType => throw new NotImplementedException(); - public AlignmentRow Invert() { return new AlignmentRow(TextId, Ref) diff --git a/src/SIL.Machine/Corpora/IRow.cs b/src/SIL.Machine/Corpora/IRow.cs index 7d9c4487a..beaf4c691 100644 --- a/src/SIL.Machine/Corpora/IRow.cs +++ b/src/SIL.Machine/Corpora/IRow.cs @@ -5,7 +5,5 @@ public interface IRow object Ref { get; } bool IsEmpty { get; } - - TextRowContentType ContentType { get; } } } diff --git a/src/SIL.Machine/Corpora/KeyTerm.cs b/src/SIL.Machine/Corpora/KeyTerm.cs index 21a315ca5..6316129a5 100644 --- a/src/SIL.Machine/Corpora/KeyTerm.cs +++ b/src/SIL.Machine/Corpora/KeyTerm.cs @@ -1,4 +1,5 @@ using System.Collections.Generic; +using System.Linq; using SIL.Scripture; public class KeyTerm @@ -14,16 +15,16 @@ public KeyTerm( string id, string category, string domain, - IReadOnlyList renderings, - IReadOnlyList references, - IReadOnlyList renderingsPatterns + IEnumerable renderings, + IEnumerable references, + IEnumerable renderingsPatterns ) { Id = id; Category = category; Domain = domain; - Renderings = renderings; - References = references; - RenderingsPatterns = renderingsPatterns; + Renderings = renderings.ToArray(); + References = references.ToArray(); + RenderingsPatterns = renderingsPatterns.ToArray(); } } diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs index 115c862a4..c48eb2faa 100644 --- a/src/SIL.Machine/Corpora/NParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -25,7 +25,7 @@ public NParallelTextRow( N = NRefs.Count; NSegments = Enumerable.Range(0, N).Select(_ => Array.Empty()).ToImmutableArray(); NFlags = Enumerable.Range(0, N).Select(_ => TextRowFlags.SentenceStart).ToImmutableArray(); - _contentType = contentType; + ContentType = contentType; } public string TextId { get; } @@ -40,7 +40,7 @@ public NParallelTextRow( private readonly TextRowContentType _contentType; - public TextRowContentType ContentType => _contentType; + public TextRowContentType ContentType { get; } public bool IsSentenceStart(int i) => NFlags.Count > i ? NFlags[i].HasFlag(TextRowFlags.SentenceStart) : throw new ArgumentOutOfRangeException(); diff --git a/src/SIL.Machine/Corpora/ParallelTextRow.cs b/src/SIL.Machine/Corpora/ParallelTextRow.cs index 710c6740d..7cc981300 100644 --- a/src/SIL.Machine/Corpora/ParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/ParallelTextRow.cs @@ -22,7 +22,7 @@ public ParallelTextRow( TextId = textId; SourceRefs = sourceRefs; TargetRefs = targetRefs; - _contentType = contentType; + ContentType = contentType; } public string TextId { get; } @@ -42,10 +42,7 @@ public ParallelTextRow( public TextRowFlags SourceFlags { get; set; } = TextRowFlags.SentenceStart; public TextRowFlags TargetFlags { get; set; } = TextRowFlags.SentenceStart; - - private readonly TextRowContentType _contentType; - - public TextRowContentType ContentType => _contentType; + public TextRowContentType ContentType { get; } public bool IsSourceSentenceStart => SourceFlags.HasFlag(TextRowFlags.SentenceStart); public bool IsSourceInRange => SourceFlags.HasFlag(TextRowFlags.InRange); @@ -61,7 +58,7 @@ public ParallelTextRow( public ParallelTextRow Invert() { - return new ParallelTextRow(TextId, TargetRefs, SourceRefs, _contentType) + return new ParallelTextRow(TextId, TargetRefs, SourceRefs, ContentType) { SourceSegment = TargetSegment, TargetSegment = SourceSegment, diff --git a/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs index 35aaaeec1..b7ece42f7 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs @@ -116,7 +116,7 @@ public IEnumerable Parse( termsRenderings = termRenderingsDoc .Descendants() .Where(n => n.Name.LocalName == "TermRendering") - .Where(ele => ele.Attribute("Guess") == null || ele.Attribute("Guess").Value == "false") + .Where(ele => ((string)ele.Attribute("Guess") ?? "false") == "false") .Select(ele => (ele.Attribute("Id").Value, ele)) .Where(kvp => IsInCategory(kvp.Item1, termCategories, termIdToCategory)) .Where(kvp => IsInChapters(kvp.Item1, chapters, termIdToReferences)) diff --git a/src/SIL.Machine/Corpora/TextRow.cs b/src/SIL.Machine/Corpora/TextRow.cs index f08874ee0..3b4298f2a 100644 --- a/src/SIL.Machine/Corpora/TextRow.cs +++ b/src/SIL.Machine/Corpora/TextRow.cs @@ -18,16 +18,14 @@ public TextRow(string textId, object rowRef, TextRowContentType contentType = Te { TextId = textId; Ref = rowRef; - _contentType = contentType; + ContentType = contentType; } public string TextId { get; } public object Ref { get; } - private readonly TextRowContentType _contentType; - - public TextRowContentType ContentType => _contentType; + public TextRowContentType ContentType { get; } public bool IsEmpty => Segment.Count == 0; From 3ca60c8444ec1b5dd7627fb45c69692122886660 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Tue, 10 Feb 2026 13:40:26 -0500 Subject: [PATCH 3/4] Fix typo --- src/SIL.Machine/Corpora/NParallelTextRow.cs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/SIL.Machine/Corpora/NParallelTextRow.cs b/src/SIL.Machine/Corpora/NParallelTextRow.cs index c48eb2faa..d939534c8 100644 --- a/src/SIL.Machine/Corpora/NParallelTextRow.cs +++ b/src/SIL.Machine/Corpora/NParallelTextRow.cs @@ -37,9 +37,6 @@ public NParallelTextRow( public IReadOnlyList> NSegments { get; set; } public IReadOnlyList NFlags { get; set; } - - private readonly TextRowContentType _contentType; - public TextRowContentType ContentType { get; } public bool IsSentenceStart(int i) => @@ -57,7 +54,7 @@ public bool IsRangeStart(int i) => public NParallelTextRow Invert() { - return new NParallelTextRow(TextId, NRefs.Reverse(), _contentType) + return new NParallelTextRow(TextId, NRefs.Reverse(), ContentType) { NFlags = NFlags.Reverse().ToImmutableArray(), }; From 7a3c968f9ca1536baedb3298c84a9411e422ec34 Mon Sep 17 00:00:00 2001 From: Enkidu93 Date: Wed, 11 Feb 2026 12:23:40 -0500 Subject: [PATCH 4/4] Cast rather than using .Value --- .../Corpora/ParatextProjectTermsParserBase.cs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs b/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs index b7ece42f7..fdc3f2ac3 100644 --- a/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs +++ b/src/SIL.Machine/Corpora/ParatextProjectTermsParserBase.cs @@ -117,13 +117,14 @@ public IEnumerable Parse( .Descendants() .Where(n => n.Name.LocalName == "TermRendering") .Where(ele => ((string)ele.Attribute("Guess") ?? "false") == "false") - .Select(ele => (ele.Attribute("Id").Value, ele)) + .Select(ele => ((string)ele.Attribute("Id"), ele)) + .Where(kvp => kvp.Item1 != null) .Where(kvp => IsInCategory(kvp.Item1, termCategories, termIdToCategory)) .Where(kvp => IsInChapters(kvp.Item1, chapters, termIdToReferences)) .Select(kvp => { string id = kvp.Item1.Replace("\n", " "); - string rendering = kvp.Item2.Element("Renderings").Value; + string rendering = (string)kvp.Item2.Element("Renderings") ?? ""; IReadOnlyList renderings = GetRenderingsWithPattern(rendering); return (id, renderings); }) @@ -138,7 +139,8 @@ public IEnumerable Parse( termsGlosses = termsGlossesDoc .Descendants() .Where(n => n.Name.LocalName == "Localization") - .Select(ele => (ele.Attribute("Id").Value, ele)) + .Select(ele => ((string)ele.Attribute("Id"), ele)) + .Where(kvp => kvp.Item1 != null) .Where(kvp => IsInCategory(kvp.Item1, termCategories, termIdToCategory)) .Where(kvp => IsInChapters(kvp.Item1, chapters, termIdToReferences)) .Select(kvp => @@ -309,7 +311,7 @@ IDictionary> TermReferences var termIdToReferences = new Dictionary>(); foreach (XElement term in biblicalTermsDocument.Descendants().Where(n => n.Name.LocalName == "Term")) { - string termId = term.Attribute("Id").Value; + string termId = (string)term.Attribute("Id"); if (termId == null) continue;