-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSynonymNormalizer.cs
More file actions
122 lines (107 loc) · 4.16 KB
/
SynonymNormalizer.cs
File metadata and controls
122 lines (107 loc) · 4.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
namespace InjectDetect
{
/// <summary>
/// Rewrites text so that known synonym variants are replaced by their canonical form.
/// The variant-to-canonical table is read once, during type initialisation, from a
/// <c>synonyms.txt</c> file located in the application base directory or one of its parents.
/// </summary>
/// <remarks>
/// If the dictionary file cannot be found, the static constructor throws
/// <see cref="FileNotFoundException"/>; the runtime reports that to the first caller as a
/// <see cref="TypeInitializationException"/>.
/// </remarks>
public static class SynonymNormalizer
{
    private const string DictionaryFileName = "synonyms.txt";

    // Number of directories probed, starting at the base directory and walking upwards.
    private const int MaxSearchDepth = 8;

    // Tokeniser for the single-word pass: runs of word characters and apostrophes.
    private static readonly Regex WordPattern = new Regex(@"[\w']+");

    // variant -> canonical, looked up case-insensitively.
    private static readonly Dictionary<string, string> Map;

    // Multi-word variants as pre-built patterns paired with their canonical text, longest variant first.
    private static readonly List<KeyValuePair<Regex, string>> PhraseEntries;

    static SynonymNormalizer()
    {
        Map = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
        LoadFromFile(FindDictionaryFile());
        PhraseEntries = BuildPhraseEntries();
    }

    /// <summary>
    /// Looks for <c>synonyms.txt</c> in the application base directory, then in successive
    /// parent directories, probing at most <see cref="MaxSearchDepth"/> locations.
    /// </summary>
    /// <returns>The full path of the first matching file.</returns>
    /// <exception cref="FileNotFoundException">No probed directory contains the file.</exception>
    private static string FindDictionaryFile()
    {
        string? dir = AppContext.BaseDirectory;
        for (int depth = 0; depth < MaxSearchDepth && dir != null; depth++)
        {
            string candidate = Path.Combine(dir, DictionaryFileName);
            if (File.Exists(candidate))
            {
                return candidate;
            }

            dir = Path.GetDirectoryName(dir);
        }

        throw new FileNotFoundException("synonyms.txt not found. Place it alongside the executable or in a parent directory.");
    }

    /// <summary>
    /// Parses the dictionary file into <see cref="Map"/>.
    /// </summary>
    /// <remarks>
    /// Format, as implemented below:
    /// <list type="bullet">
    /// <item>Blank lines and lines starting with <c>#</c> are skipped.</item>
    /// <item>A line with a colon after its first character starts an entry:
    /// <c>canonical: variant, variant, ...</c>.</item>
    /// <item>Any other line continues the variant list of the current entry.</item>
    /// <item>A variant equal to its canonical form (ignoring case) is not stored.</item>
    /// <item>When a variant is listed more than once, the last entry wins.</item>
    /// </list>
    /// </remarks>
    /// <param name="path">Path of the dictionary file to read.</param>
    private static void LoadFromFile(string path)
    {
        string? canonical = null;
        var variantBuffer = new System.Text.StringBuilder();

        // Commits the buffered variants of the current entry and resets the parser state.
        void FlushBuffer()
        {
            if (canonical != null)
            {
                foreach (string piece in variantBuffer.ToString().Split(','))
                {
                    string variant = piece.Trim();
                    if (variant.Length > 0 && !string.Equals(variant, canonical, StringComparison.OrdinalIgnoreCase))
                    {
                        Map[variant] = canonical;
                    }
                }
            }

            canonical = null;
            variantBuffer.Clear();
        }

        foreach (string rawLine in File.ReadLines(path))
        {
            string line = rawLine.Trim();
            if (line.Length == 0 || line.StartsWith('#'))
            {
                continue;
            }

            int colon = line.IndexOf(':');
            if (colon > 0)
            {
                // Header line: finish the previous entry and start a new one.
                FlushBuffer();
                canonical = line.Substring(0, colon).Trim();
                string rest = line.Substring(colon + 1).Trim().TrimEnd(',');
                if (rest.Length > 0)
                {
                    variantBuffer.Append(rest);
                }
            }
            else if (canonical != null)
            {
                string continuation = line.TrimEnd(',');
                if (continuation.Length == 0)
                {
                    continue;
                }

                if (variantBuffer.Length > 0)
                {
                    variantBuffer.Append(", ");
                }

                variantBuffer.Append(continuation);
            }

            // A continuation line that precedes every header belongs to no entry and is
            // ignored; buffering it would splice its text onto the first real entry.
        }

        FlushBuffer();
    }

    /// <summary>
    /// Replaces every known variant in <paramref name="input"/> with its canonical form.
    /// Single words are handled first, then multi-word phrases (longest phrase first).
    /// Matching is case-insensitive.
    /// </summary>
    /// <param name="input">Text to normalise.</param>
    /// <returns>The normalised text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is <c>null</c>.</exception>
    public static string Normalize(string input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        // Single-word pass
        string result = WordPattern.Replace(input, m =>
        {
            if (!Map.TryGetValue(m.Value, out string? canonical))
            {
                return m.Value;
            }

            // When the canonical form is a phrase whose first word already stands right
            // before this token (e.g. "system prompt" for the token "prompt"), keep the
            // token so the first word is not duplicated.
            if (canonical.Contains(' ') && IsPrecededByWord(input, m.Index, canonical.Split(' ')[0]))
            {
                return m.Value;
            }

            return canonical;
        });

        // Multi-word phrase pass (longest first)
        // NOTE(review): this runs on the output of the word pass, so a phrase containing a
        // word that is itself a single-word variant may no longer match - confirm intended.
        foreach (KeyValuePair<Regex, string> entry in PhraseEntries)
        {
            string replacement = entry.Value;

            // An evaluator inserts the canonical text verbatim; passing it as a replacement
            // pattern would expand sequences such as "$0" or "$$".
            result = entry.Key.Replace(result, _ => replacement);
        }

        return result;
    }

    /// <summary>
    /// Reports whether <paramref name="word"/> occurs as a whole word immediately before
    /// position <paramref name="end"/> of <paramref name="text"/>, ignoring any whitespace
    /// in between and ignoring case.
    /// </summary>
    private static bool IsPrecededByWord(string text, int end, string word)
    {
        while (end > 0 && char.IsWhiteSpace(text[end - 1]))
        {
            end--;
        }

        int start = end - word.Length;
        if (start < 0)
        {
            return false;
        }

        if (string.Compare(text, start, word, 0, word.Length, StringComparison.OrdinalIgnoreCase) != 0)
        {
            return false;
        }

        // Reject a match that is only the tail of a longer word ("ecosystem" vs "system").
        // Punctuation such as an opening quote still counts as a boundary.
        if (start == 0)
        {
            return true;
        }

        char before = text[start - 1];
        return !(char.IsLetterOrDigit(before) || before == '_');
    }

    /// <summary>
    /// Collects the variants that contain a space and builds one whole-word,
    /// case-insensitive pattern for each, ordered from longest variant to shortest.
    /// </summary>
    private static List<KeyValuePair<Regex, string>> BuildPhraseEntries()
    {
        var phrases = new List<KeyValuePair<string, string>>();
        foreach (KeyValuePair<string, string> kv in Map)
        {
            if (kv.Key.Contains(' '))
            {
                phrases.Add(kv);
            }
        }

        phrases.Sort((a, b) => b.Key.Length.CompareTo(a.Key.Length));

        // Build each pattern once here instead of on every Normalize call.
        var compiled = new List<KeyValuePair<Regex, string>>(phrases.Count);
        foreach (KeyValuePair<string, string> kv in phrases)
        {
            var pattern = new Regex(
                @"\b" + Regex.Escape(kv.Key) + @"\b",
                RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
            compiled.Add(new KeyValuePair<Regex, string>(pattern, kv.Value));
        }

        return compiled;
    }

    /// <summary>Number of variant entries loaded from the dictionary file.</summary>
    public static int VariantCount => Map.Count;
}
}