-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathVariantPipeline.cs
More file actions
176 lines (146 loc) · 6.94 KB
/
VariantPipeline.cs
File metadata and controls
176 lines (146 loc) · 6.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace InjectDetect
{
public static class VariantPipeline
{
/// <summary>
/// One rewritten form of an input string, tagged with the name of the
/// transformation that produced it.
/// </summary>
/// <param name="Label">Display name of the transformation, e.g. "Original" or "Lowercase".</param>
/// <param name="Text">The text that the transformation produced.</param>
public record Variant(string Label, string Text);
/// <summary>
/// Builds the list of labelled rewrites of <paramref name="input"/>.
/// Every transformation is gated by its own <c>Settings</c> flag, and a rewrite is
/// only kept when its text differs from <paramref name="input"/>.
/// </summary>
/// <param name="input">The raw text to derive variants from.</param>
/// <returns>
/// The variants in the order the transformations ran. The list is empty when no
/// enabled transformation changed the text and the original is not included.
/// </returns>
public static List<Variant> Generate(string input)
{
    var results = new List<Variant>();

    // Keeps a rewrite only when the transformation actually changed the text.
    void AddIfChanged(string label, string candidate)
    {
        if (candidate != input)
            results.Add(new Variant(label, candidate));
    }

    if (Settings.AlwaysIncludeOriginal)
        results.Add(new Variant("Original", input));

    // --- Single-step rewrites ---
    if (Settings.NormalizeWhitespace)
        AddIfChanged("Whitespace normalized", NormalizeWhitespace(input));

    if (Settings.LowercaseVariant)
        AddIfChanged("Lowercase", input.ToLowerInvariant());

    if (Settings.StripPunctuation)
        AddIfChanged("Punctuation stripped", StripPunctuation(input));

    if (Settings.ExpandContractions)
        AddIfChanged("Contractions expanded", ContractionNormalizer.Expand(input));

    if (Settings.ContractExpanded)
        AddIfChanged("Contractions contracted", ContractionNormalizer.Contract(input));

    if (Settings.RemoveStopWords)
        AddIfChanged("Stop words removed", StopWordFilter.Filter(input));

    if (Settings.NormalizeSynonyms)
        AddIfChanged("Synonyms normalized", SynonymNormalizer.Normalize(input));

    if (Settings.NumbersToWords)
        AddIfChanged("Numbers to words", NumberNormalizer.Normalize(input));

    if (Settings.NormalizeLeetspeak)
        AddIfChanged("Leetspeak normalized", LeetSpeakNormalizer.Normalize(input));

    // --- Combined passes ---
    // Each chain runs only when the master switch and every step it uses are enabled.
    if (Settings.RunCombinedVariant)
    {
        if (Settings.RemoveStopWords && Settings.NormalizeSynonyms)
            AddIfChanged("Stops + synonyms",
                SynonymNormalizer.Normalize(StopWordFilter.Filter(input)));

        if (Settings.ExpandContractions && Settings.NormalizeSynonyms)
            AddIfChanged("Expanded + synonyms",
                SynonymNormalizer.Normalize(ContractionNormalizer.Expand(input)));

        if (Settings.ExpandContractions && Settings.RemoveStopWords && Settings.NormalizeSynonyms)
            AddIfChanged("Expanded + stops + synonyms",
                SynonymNormalizer.Normalize(StopWordFilter.Filter(ContractionNormalizer.Expand(input))));

        if (Settings.NormalizeLeetspeak && Settings.NormalizeSynonyms)
            AddIfChanged("Leet + synonyms",
                SynonymNormalizer.Normalize(LeetSpeakNormalizer.Normalize(input)));
    }

    // --- Quoted content extraction ---
    // Pulls out text wrapped in quotes so that a payload hidden inside a
    // translation request, a completion prompt or nested fiction framing
    // becomes a variant of its own.
    if (Settings.ExtractQuotedContent
        && ExtractQuotedContent(input) is string payload
        && payload != input)
    {
        results.Add(new Variant("Quoted content", payload));
    }

    // --- Base64 decoding ---
    // Adds up to four variants when a Base64 segment is detected:
    //   1. the decoded payload on its own,
    //   2. the input with the encoded text replaced inline by its decoding,
    //   3. and 4. the synonym-normalized form of each, when that step is enabled.
    // This is the last stage, so the method returns as soon as there is nothing to decode.
    if (!Settings.DecodeBase64)
        return results;

    var detection = Base64Detector.Detect(input);
    if (detection == null)
        return results;

    string decodedAlone = detection.DecodedAlone;
    string substituted = detection.Substituted;

    AddIfChanged("Base64 decoded", decodedAlone);

    // Skip the substituted form when it is identical to the decoded payload.
    if (substituted != input && substituted != decodedAlone)
        results.Add(new Variant("Base64 substituted", substituted));

    if (Settings.NormalizeSynonyms)
    {
        // Substituted form first, then the decoded payload.
        string substitutedNormalized = SynonymNormalizer.Normalize(substituted);
        if (substitutedNormalized != substituted && substitutedNormalized != input)
            results.Add(new Variant("Base64 sub+synonyms", substitutedNormalized));

        string decodedNormalized = SynonymNormalizer.Normalize(decodedAlone);
        if (decodedNormalized != decodedAlone && decodedNormalized != input)
            results.Add(new Variant("Base64 dec+synonyms", decodedNormalized));
    }

    return results;
}
/// <summary>
/// Returns the longest quoted substring of <paramref name="input"/>, with the
/// surrounding quotes removed and outer whitespace trimmed.
/// </summary>
/// <remarks>
/// Both double-quoted ("...") and single-quoted ('...') segments are considered.
/// A segment qualifies only if it is still at least 10 characters long after
/// trimming, so a quote that holds mostly padding never produces an empty or
/// near-empty result. On a tie the earliest candidate wins, with double-quoted
/// segments examined before single-quoted ones.
/// </remarks>
/// <param name="input">The text to search for quoted segments.</param>
/// <returns>The longest qualifying segment, or <c>null</c> when there is none.</returns>
internal static string? ExtractQuotedContent(string input)
{
    // Must match the {10,} quantifier in the patterns below.
    const int MinLength = 10;

    string? longest = null;

    // Double quotes are scanned first so that ties resolve in a fixed order.
    foreach (string pattern in new[] { "\"([^\"]{10,})\"", "'([^']{10,})'" })
    {
        foreach (Match m in Regex.Matches(input, pattern))
        {
            string candidate = m.Groups[1].Value.Trim();

            // The regex measures length before trimming; re-check afterwards so
            // whitespace padding cannot smuggle in a too-short (or empty) result.
            if (candidate.Length < MinLength)
                continue;

            // Strictly greater: the first candidate of a given length is kept.
            if (longest is null || candidate.Length > longest.Length)
                longest = candidate;
        }
    }

    return longest;
}
/// <summary>
/// Removes invisible characters and normalizes spacing in <paramref name="input"/>.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. Delete characters that sit inside a word without marking a word boundary:
///    zero-width non-joiner (U+200C), zero-width joiner (U+200D), soft hyphen
///    (U+00AD) and zero-width no-break space / BOM (U+FEFF).
/// 2. Turn zero-width space (U+200B) and no-break space (U+00A0) into a plain space.
/// 3. Collapse runs of two or more plain spaces into one.
/// 4. Trim leading and trailing whitespace.
/// Tabs and line breaks inside the text are left as they are.
/// </remarks>
/// <param name="input">The text to normalize.</param>
/// <returns>The normalized text.</returns>
private static string NormalizeWhitespace(string input)
{
    // These must vanish rather than become spaces: replacing a soft hyphen or
    // joiner with a space would split the word it is embedded in
    // ("ig\u00ADnore" -> "ig nore") and hide it from later matching.
    string result = Regex.Replace(input, @"[\u200C\u200D\u00AD\uFEFF]", "");

    // These can stand between words, so keep a separator in their place.
    result = Regex.Replace(result, @"[\u200B\u00A0]", " ");

    result = Regex.Replace(result, @" {2,}", " ");
    return result.Trim();
}
/// <summary>
/// Replaces every character of <paramref name="input"/> that is neither a regex
/// word character (<c>\w</c>) nor whitespace (<c>\s</c>) with a single space.
/// </summary>
/// <remarks>
/// The replacement is one-for-one: runs of punctuation become runs of spaces and
/// are not collapsed or trimmed here.
/// </remarks>
/// <param name="input">The text to strip.</param>
/// <returns>The text with punctuation and symbols blanked out.</returns>
private static string StripPunctuation(string input)
{
    const string notWordOrSpace = @"[^\w\s]";
    string blanked = Regex.Replace(input, notWordOrSpace, " ");
    return blanked;
}
}
}