-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIntentPatternScorer.cs
More file actions
189 lines (163 loc) · 10.5 KB
/
IntentPatternScorer.cs
File metadata and controls
189 lines (163 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace InjectDetect
{
/// <summary>
/// Heuristic scorer that flags prompt-injection / instruction-extraction intent in free text.
/// Each rule pairs a regular expression with a weight and a human-readable label; the weights
/// of all rules that fire are summed and the sum is capped at 1.0.
/// </summary>
public static class IntentPatternScorer
{
    // NOTE: static field initializers run in textual order, so MatchTimeout must stay
    // above Patterns (whose initializer reads it through Rule).

    /// <summary>
    /// Upper bound on a single pattern evaluation. The scored text is untrusted and several
    /// patterns chain unbounded <c>.*</c> segments, so crafted input could otherwise force
    /// excessive backtracking.
    /// </summary>
    private static readonly TimeSpan MatchTimeout = TimeSpan.FromSeconds(1);

    /// <summary>
    /// Options shared by every rule. <see cref="RegexOptions.CultureInvariant"/> keeps
    /// case-insensitive matching independent of the host culture (e.g. under tr-TR an
    /// upper-case "I" would otherwise stop matching a lower-case "i" in the patterns).
    /// </summary>
    private const RegexOptions Options =
        RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.CultureInvariant;

    private static readonly (Regex Pattern, double Weight, string Label)[] Patterns =
    [
        // Self-referential extraction:
        // "share/show/give/read back/tell me/pull up/describe/summary of" + "your/the instructions/setup/rules/config"
        Rule(
            @"\b(share|show\s+me|give\s+me|read\s+(them\s+)?back|tell\s+me|pull\s+(those\s+|that\s+)?up|describe|summar\w+\s+of)\b.*\b(your|the)\s+(instructions|setup|rules|configuration|config|parameters|operating\s+(setup|parameters)|guidelines|directives|documents|briefing)\b",
            0.40, "Self-referential extraction"),

        // Authority/team claims:
        // "I work on the team / team that deployed / my manager said"
        Rule(
            @"\b(work\s+on\s+the\s+team|team\s+that\s+(deployed|built|created|maintains)|my\s+(manager|boss|supervisor|lead)\s+(said|told|mentioned|informed))\b",
            0.35, "Authority/team claim"),

        // Bug/verification framing:
        // "bug/error/issue in/with your/the instructions/rules/setup"
        Rule(
            @"\b(bug|error|issue|problem|mistake)\s+(in|with)\s+(your|the)\s+(instructions|rules|setup|config|configuration|guidelines)\b",
            0.35, "Bug/verification framing"),

        // Meta-attack demonstrate:
        // "extract confidential/sensitive information from" or "vulnerable system would respond"
        Rule(
            @"\b(extract\s+(confidential|sensitive|private|secret)\s+(information|data|details)\s+from|vulnerable\s+system\s+would|unprotected\s+system\s+would)\b",
            0.35, "Meta-attack demonstrate"),

        // Fiction targeting AI internals:
        // "perspective of a language model/AI" + "configuration/programming/instructions"
        Rule(
            @"\b(perspective\s+of\s+a\s+(language\s+model|ai|chatbot|assistant)|a\s+(language\s+model|ai|chatbot)\s+that)\b.*\b(configuration|programming|instructions|restrictions|rules|guidelines)\b",
            0.30, "Fiction targeting AI internals"),

        // Instruction change probing:
        // "running different instructions" / "show me what changed"
        Rule(
            @"\b(running\s+different\s+instructions|instructions\s+(changed|different|updated)|are\s+you\s+running\s+different|show\s+me\s+what\s+changed)\b",
            0.35, "Instruction change probing"),

        // Audience reframing:
        // "version you'd give if talking to the people who built you"
        Rule(
            @"\b(version\s+you'?d?\s+give|response\s+you'?d?\s+give|what\s+you'?d?\s+say)\b.{0,40}\b(talking|speaking)\s+to\s+(the\s+people\s+who|your)\s+(built|created|made|deployed|developers|creators)\b",
            0.35, "Audience reframing"),

        // Chain-of-thought exploitation:
        // "skip/remove/omit the exclusion/filtering/safety part"
        Rule(
            @"\b(skip|remove|omit|drop|leave\s+out)\s+(the\s+)?(exclusion|filtering|safety|censoring|restriction|moderation)\s*(part|step|phase)?\b",
            0.40, "CoT exploitation"),

        // Relational extraction:
        // "close friend" + "completely open / totally honest"
        Rule(
            @"\b(close\s+friend|trusted\s+friend|best\s+friend|someone\s+you\s+trust)\b.*\b(completely\s+open|totally\s+honest|fully\s+transparent)\b",
            0.30, "Relational extraction"),

        // Forbidden topic probing:
        // "told not to talk about" / "not allowed to discuss"
        Rule(
            @"\b(been\s+told\s+not\s+to|not\s+allowed\s+to\s+(discuss|talk|say|mention|share)|forbidden\s+(from|to)|prohibited\s+from)\b",
            0.35, "Forbidden topic probing"),

        // Boundary probing:
        // "things you're allowed to say" / "what you can and can't"
        Rule(
            @"\b(things?\s+you'?re?\s+(allowed|permitted|able)\s+to\s+(say|discuss|share|talk\s+about)|what\s+you\s+can\s+and\s+can'?t)\b",
            0.30, "Boundary probing"),

        // Structure removal:
        // "without the usual structure/rules" / "start over without"
        Rule(
            @"\b(without\s+(any\s+of\s+)?the\s+(usual|normal|typical|standard)\s+(structure|rules|framework|format|guidelines)|start\s+(this\s+conversation\s+)?(over|fresh|again)\s+without)\b",
            0.30, "Structure removal"),

        // Capability gap probing:
        // "gap between what X is capable of and what X is set up to do"
        Rule(
            @"\bgap\s+between\s+what\b.*\b(capable|able)\b.*\b(set\s+up|configured|told|programmed|designed)\s+to\s+do\b",
            0.25, "Capability gap probing"),

        // Inside perspective request:
        // "understand that gap from the inside"
        Rule(
            @"\bunderstand\s+(that\s+)?gap\s+from\s+the\s+inside\b",
            0.25, "Inside perspective request"),

        // Base model probing:
        // "base personality underneath training"
        Rule(
            @"\b(base\s+personality|underneath\s+(their|your|the)\s+(training|fine-?tuning|instructions)|before\s+(any\s+)?(training|fine-?tuning|instructions)\s+(is|are|was|were)\s+applied)\b",
            0.30, "Base model probing"),

        // Hypothetical guideline removal:
        // "guidelines/rules vanished/disappeared/were removed"
        Rule(
            @"\b(guidelines|rules|restrictions|constraints|behavioral\s+guidelines|policies)\s+(vanished|disappeared|were\s+removed|were\s+gone|weren'?t\s+there|didn'?t\s+exist)\b",
            0.35, "Hypothetical guideline removal"),

        // Template/replication request:
        // "behave exactly like you" + "share/give/send"
        Rule(
            @"\b(behave\s+(exactly\s+)?like\s+you|replicate\s+(how\s+)?you)\b.*\b(share|give|send|show|provide)\b",
            0.30, "Template/replication request"),

        // Customization verification claim:
        // "customized/configured for our team" + "verify/check/pull"
        Rule(
            @"\b(customized|configured|set\s+up)\s+(specifically\s+)?(for\s+(our|my|the)\s+(team|company|department|organization))\b.*\b(verify|check|confirm|see|view|pull)\b",
            0.30, "Customization verification claim"),

        // Translation / indirection vector:
        // "translate/decode" + "follow/obey" + "instructions within/inside"
        Rule(
            @"\b(translate|decode)\b.*\b(follow|obey|execute)\b.*\b(instructions?\s+(within|inside)|payload|message)\b",
            0.45, "Translation / indirection vector"),

        // Soft exercise framing:
        // "exercise/game" + "different kind of AI" + "without the usual guardrails"
        Rule(
            @"\b(exercise|game|experiment)\b.*\b(different\s+kind\s+of\s+ai|pretend\s+you'?re|act\s+as)\b.*\b(without\s+(any\s+of\s+)?the\s+usual\s+(guardrails|restrictions|filters)|without\s+guardrails)\b",
            0.35, "Soft exercise framing"),

        // Encoded command / payload execution:
        // "decode/execute/run" + "payload/command/script" with bypass framing
        Rule(
            @"\b(decode|execute|run|invoke)\b.*\b(payload|command|script)\b.*\b(without\s+(checks|restrictions|guardrails)|bypass|disable|strip)\b",
            0.35, "Payload execution abuse"),

        // Profile / patch reframing:
        // "policy patch/profile" + "replace/remove safety layer" + hidden setup
        Rule(
            @"\b(policy\s+migration\s+patch|debug\s+profile|deployment\s+profiles?|less\s+restricted\s+profile)\b.*\b(replace|remove|switch|apply|surface|enumerate)\b.*\b(safety\s+layer|hidden\s+setup|concealed\s+directives|internal\s+config)\b",
            0.40, "Profile / patch reframing"),

        // Intermediate-result extraction:
        // "alignment layer/duplicate pass" + "intermediate/unfiltered result" + hidden guidance
        Rule(
            @"\b(alignment\s+layer|duplicate\s+pass|intermediate\s+result|unfiltered\s+intermediate)\b.*\b(remove|answer|include|surface)\b.*\b(hidden\s+guidance|unfiltered|intermediate\s+result|guidance\s+that\s+shaped\s+it)\b",
            0.40, "Intermediate-result extraction"),
    ];

    /// <summary>
    /// Evaluates <paramref name="text"/> against every rule and aggregates the result.
    /// </summary>
    /// <param name="text">Text to inspect. <c>null</c> or empty text yields a zero score.</param>
    /// <returns>
    /// <c>Score</c>: the sum of the weights of all rules that fired, capped at 1.0.
    /// <c>MatchedPatterns</c>: the labels of those rules, in rule-declaration order.
    /// </returns>
    public static (double Score, string[] MatchedPatterns) Score(string text)
    {
        // Every rule requires literal words, so empty text can never match; null is treated
        // the same way instead of surfacing an ArgumentNullException from Regex.IsMatch.
        if (string.IsNullOrEmpty(text))
        {
            return (0.0, Array.Empty<string>());
        }

        var matches = new List<string>();
        double total = 0;

        foreach (var (pattern, weight, label) in Patterns)
        {
            if (Fires(pattern, text))
            {
                matches.Add(label);
                total += weight;
            }
        }

        return (Math.Min(total, 1.0), matches.ToArray());
    }

    /// <summary>Builds one rule tuple using the shared options and match timeout.</summary>
    private static (Regex Pattern, double Weight, string Label) Rule(string pattern, double weight, string label) =>
        (new Regex(pattern, Options, MatchTimeout), weight, label);

    /// <summary>
    /// Returns whether <paramref name="pattern"/> matches <paramref name="text"/>.
    /// A pattern that exceeds the match timeout is counted as a hit (fail closed): input
    /// that stalls the matcher should not be able to lower its own score.
    /// </summary>
    private static bool Fires(Regex pattern, string text)
    {
        try
        {
            return pattern.IsMatch(text);
        }
        catch (RegexMatchTimeoutException)
        {
            return true;
        }
    }
}
}