Skip to content

Commit 28879a9

Browse files
authored
Merge pull request #14 from SamhammerAG/KIT-4467
KIT-4467 added signature removal as step
2 parents 77aaf8a + 653862e commit 28879a9

7 files changed

Lines changed: 249 additions & 6 deletions

File tree

.devcontainer/devcontainer.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
]
4040
}
4141
},
42-
"initializeCommand": "powershell.exe .\\.devcontainer\\initialize.ps1",
42+
"initializeCommand": "powershell.exe ./.devcontainer/initialize.ps1",
4343
"postCreateCommand": "pip3 install -r ${containerWorkspaceFolder}/requirements-dev.txt",
4444
"remoteUser": "vscode",
4545
"mounts": [

README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ Note: Pipeline has to be instantiated only once and can be reused.
3737
## Existing preprocessors
3838

3939
### To Lower Case
40-
Name: to_lower
40+
Name: to_lower
4141

4242
Required additional data: -
4343

@@ -91,6 +91,12 @@ Required additional data: CSV data in string form with the following line format
9191

9292
With this preprocessor you can replace specific words and abbreviations within the text with specified tokens. It is also possible to replace abbreviations ending with a dot. Other special characters are not supported, though.
9393

94+
### Remove signature
95+
Name: remove_signature
96+
97+
Removes closing salutations (e.g. "Best regards", "Mit freundlichen Grüßen") and everything following them, as well as thank-you expressions.
98+
Should be used before the other processing steps.
99+
94100
## How to start developing
95101

96102
### With VS Code
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import re
2+
from typing import Any
3+
4+
5+
def remove_newline(text: str) -> str:
    """Collapse every run of whitespace into a single space and trim the ends.

    Any sequence of whitespace characters (spaces, tabs, ``\\n``, ``\\r``, ...),
    including a single newline, is replaced by exactly one space.

    Args:
        text: The text to normalise.

    Returns:
        The text with whitespace runs collapsed and no leading or trailing
        whitespace.
    """
    # One pass is enough: a separate "two or more whitespace characters"
    # substitution would be fully covered by this ``\s+`` replacement.
    return re.sub(r"\s+", " ", text).strip()
12+
13+
14+
# Closing salutations that mark the start of a signature (English and German,
# the latter in both "ß" and "ss" spellings).
GreetingExpressions = ["sincerely", "best regards", "happy holidays", "kind regards", "warm regards", "cheers",
                       "regards", "mit freundlichen grüßen", "freundliche grüße", "beste grüße", "viele grüße",
                       "herzliche grüße", "liebe grüße", "mit freundlichen grüssen", "freundliche grüsse",
                       "beste grüsse", "viele grüsse", "herzliche grüsse", "liebe grüsse"]
# The leading \b keeps an expression from matching in the middle of a longer
# word (e.g. "regards" inside "disregards"), which would otherwise delete the
# rest of the message. No trailing \b is added, so inflected endings such as
# "Grüßen" still match.
greetings_regex = r"\b(" + "|".join(GreetingExpressions) + r")\s*,?\s*"
# DOTALL lets ".*" run across newlines so the whole signature block is removed.
_greetings_and_rest_pattern = re.compile(greetings_regex + ".*", flags=re.IGNORECASE | re.UNICODE | re.DOTALL)


def remove_greetings_and_following_text(text: str) -> str:
    """Cut the text at the first closing salutation.

    The salutation itself and everything after it (including following lines)
    is removed; matching is case-insensitive.

    Args:
        text: The message text.

    Returns:
        The text before the first salutation, stripped of surrounding
        whitespace. Text without a salutation is returned stripped but
        otherwise unchanged.
    """
    return _greetings_and_rest_pattern.sub("", text).strip()
24+
25+
26+
# Thank-you expressions should be removed after the greetings and the signature
# text following them, as they often appear at the beginning of a message.
THANK_EXPRESSIONS = [
    r"thank you(?: very much)?",  # thank you, thank you very much
    r"thankyou(?: very much)?",  # thankyou, thankyou very much
    r"thanks(?: a lot| again)?",  # thanks, thanks a lot, thanks again
    r"many thanks",  # many thanks
    r"a thousand thanks",  # a thousand thanks
    r"danke(?: schön)?",  # danke, danke schön
    r"vielen dank",  # vielen dank
    r"dankeschön",  # dankeschön
    r"besten dank"  # besten dank
]

# Suffixes which could follow thank you expressions
THANK_SUFFIXES = [
    r"(?:in advance(?: for (?:your|the) (?:help|support|understanding|assistance))?)",
    r"(?:for (?:your|the) (?:help|support|understanding|assistance))",
    r"(?:schon mal\s+)?(?:im voraus\s+)?für\s+(?:ihre|ihr|eure|die|den)\s+(?:hilfe|support|verständnis)",
    r"vorab",
    r"kindly?"
]

# Combine them into a final regex pattern and compile.
thank_expressions = r"|".join(THANK_EXPRESSIONS)
# The \b after the suffix alternatives prevents a suffix from ending in the
# middle of a word (e.g. "kind" inside "kindness").
suffixes = r"(?:\s+(?:" + r"|".join(THANK_SUFFIXES) + r")\b)?"
final_pattern = (
    # The \b after the expression group forces whole-word matches. Without it
    # "danke" would win over the later "dankeschön" alternative and leave
    # "schön" behind, and "thanks" would match inside "Thanksgiving".
    r"\b(?:" + thank_expressions + r")\b" + suffixes + r"\s*(?:,|\.|!|;)?\s*"
)
thanking_regex = re.compile(final_pattern, flags=re.IGNORECASE | re.UNICODE)


def remove_thanking_expressions(text: str) -> str:
    """Delete thank-you phrases from the text.

    A phrase is removed together with an optional suffix (for example
    "for your support" or "im voraus für Ihre Hilfe"), one optional trailing
    punctuation mark (``,``, ``.``, ``!`` or ``;``) and the surrounding
    whitespace. Matching is case-insensitive and restricted to whole words.

    Args:
        text: The message text.

    Returns:
        The text with all thank-you phrases removed.
    """
    return thanking_regex.sub("", text)
60+
61+
62+
# Final pass: bare greeting words that the preceding expressions could not
# reliably remove are stripped on their own.
single_greeting_words = [
    "liebe grüße",
    "liebe grüsse",
    "grüße",
    "grüsse",
    "gruß",
    "gruss",
]
single_greetings_pattern = r"\b(?:" + "|".join(single_greeting_words) + r")\b"


def remove_single_greeting_words(text: str, pattern: str) -> str:
    """Replace every match of ``pattern`` in ``text`` with a single space.

    Args:
        text: The message text.
        pattern: Regular expression describing the greeting words to remove;
            it is applied case-insensitively.

    Returns:
        The text with each match substituted by one space.
    """
    greeting_re = re.compile(pattern, re.IGNORECASE | re.UNICODE)
    return greeting_re.sub(" ", text)
70+
71+
72+
def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any:
    """Strip signature, thank-you phrases and stray greeting words from ``item``.

    The text passes through, in order: removal of the closing salutation and
    everything after it, removal of thank-you expressions, removal of single
    greeting words, and finally whitespace normalisation.

    Args:
        item: The text to process; a falsy value is returned unchanged.
        item_state: Not used by this step.
        global_state: Not used by this step.
        preprocessor_data: Not used by this step.

    Returns:
        The cleaned text, or ``item`` itself when it is falsy.

    Raises:
        ValueError: If any of the processing stages raises an exception; the
            original exception is attached as the cause.
    """
    if item:
        try:
            cleaned = remove_single_greeting_words(
                remove_thanking_expressions(remove_greetings_and_following_text(item)),
                single_greetings_pattern,
            )
            return remove_newline(cleaned)
        except Exception as err:
            raise ValueError(f"An error occurred while removing signature: {err}") from err
    return item

pyproject.toml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "ai-data-preprocessing-queue"
3-
version = "1.6.0"
3+
version = "1.7.0"
44
description = "A collection of different text processing steps that can be enabled or disabled dynamically."
55
authors = ["KI-Team"]
66
license = "MIT"
@@ -10,15 +10,16 @@ readme = "README.md"
1010
python = "^3.12"
1111
langdetect = "*"
1212
nltk = "*"
13-
pandas = "*"
1413
numpy = "*"
14+
pandas = "*"
1515

1616
[tool.poetry.group.dev.dependencies]
17+
build = "*"
1718
coverage-lcov = "*"
1819
flake8-bandit = "*"
1920
flake8-pydocstyle = "*"
2021
mypy = "*"
21-
build = "*"
22+
parameterized = "*"
2223
pytest = "*"
2324
pytest-cov = "*"
2425
types-mock = "*"

requirements-dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ build
44
flake8-bandit
55
flake8-pydocstyle
66
mypy
7+
parameterized
78
pytest
89
pytest-cov
910

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
setuptools.setup(
1313
name="ai-data-preprocessing-queue",
14-
version="1.6.0",
14+
version="1.7.0",
1515
description="Can be used to pre process data before ai processing",
1616
long_description=LONG_DESCRIPTION,
1717
long_description_content_type="text/markdown",

tests/test_remove_signature.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
import unittest
2+
3+
from parameterized import parameterized
4+
from unittest.mock import MagicMock, patch
5+
from ai_data_preprocessing_queue.Pipeline import Pipeline
6+
from ai_data_preprocessing_queue.Steps.remove_signature import (
7+
step, remove_greetings_and_following_text, remove_newline)
8+
9+
10+
class TestRemoveSignature(unittest.TestCase):
    """Tests for the ``remove_signature`` step and its helper functions."""

    @parameterized.expand([  # type: ignore[misc]
        (
            "multiple_newlines",
            "Could you please review the attached document?\n\n\nI need your feedback by Friday.",
            "Could you please review the attached document? I need your feedback by Friday.",
        ),
        (
            "multiple_spaces",
            "The meeting is scheduled for 3PM tomorrow.",
            "The meeting is scheduled for 3PM tomorrow.",
        ),
        (
            "mixed_whitespace",
            "Please find the report attached. \n\n The numbers look good \r\n\r\n for Q3!",
            "Please find the report attached. The numbers look good for Q3!",
        ),
        (
            "empty_string",
            "",
            ""
        ),
        (
            "trailing_whitespace",
            "I'll send the updated version tomorrow. \n\n ",
            "I'll send the updated version tomorrow."
        )
    ])
    def test_remove_newline(self, name: str, input_text: str, expected: str) -> None:
        """Whitespace runs are collapsed to single spaces and the ends trimmed."""
        self.assertEqual(remove_newline(input_text), expected)

    @parameterized.expand([  # type: ignore[misc]
        (
            "english_signature_basic",
            "Here's the project update. Sincerely, John Smith\nProject Manager",
            "Here's the project update."
        ),
        (
            "english_signature_with_content",
            "Please review the attached documents. Best regards, Jane Doe\nSenior Developer\nTech Department",
            "Please review the attached documents."
        ),
        (
            "english_signature_with_content_and_several_newlines",
            "Please review the attached documents. Best regards,\nJane Doe\n\nSenior Developer\n\nTech Department",
            "Please review the attached documents."
        ),
        (
            "german_signature",
            "Die Unterlagen wurden aktualisiert. Mit freundlichen Grüßen, Hans Schmidt\nPhone: +49 123 456789",
            "Die Unterlagen wurden aktualisiert."
        ),
        (
            "greeting_with_comma",
            "Meeting is scheduled for tomorrow. Kind regards, Sarah",
            "Meeting is scheduled for tomorrow."
        ),
        (
            "mixed_case_greeting",
            "Report is ready. BEST REGARDS, Tom Wilson",
            "Report is ready."
        ),
        (
            "multiple_greetings",
            "Hello team, here's the update. Best regards, Jim\nRegards, HR Team",
            "Hello team, here's the update."
        ),
        (
            "empty_string",
            "",
            ""
        ),
        (
            "no_greetings",
            "This is a plain text without any greetings or signatures.",
            "This is a plain text without any greetings or signatures."
        ),
    ])
    def test_remove_greetings_and_following_text(self, name: str, input_text: str, expected: str) -> None:
        """The closing salutation and everything after it is removed."""
        self.assertEqual(remove_greetings_and_following_text(input_text), expected)

    @parameterized.expand([  # type: ignore[misc]
        (
            "remove_signature_basic",
            "We're sending the final draft for review. Best regards, Alice Johnson\nProject Lead",
            "We're sending the final draft for review.",
        ),
        (
            "remove_signature_extended",
            "Order Mice/keyboard\nGoodmorning, Can you please order the following: 10 x Dell Laser Mouse IL3220 "
            "10 x Dell Business Keyboard AB322 (UK layout) Thx Best regards Jimmy B. "
            "| C Facilities & Reception Klaus+Andreas Nederland | Anonymstraat 47 | 1234 AJ Amsterdam | Netherlands "
            "Phone: +01 23 695 4567 | Mobile: +97 65 445 1234 | Fax: +31 35 695 8825 jim.anonymus@company.com "
            "| www.nl.somecompany.com",
            "Order Mice/keyboard Goodmorning, Can you please order the following: 10 x Dell Laser Mouse IL3220 "
            "10 x Dell Business Keyboard AB322 (UK layout) Thx",
        ),
        (
            "thanking_at_start",
            "Thank you very much for your support. "
            "I will prepare the contract and send it tomorrow.\n\nBest regards, Bob Brown",
            "I will prepare the contract and send it tomorrow.",
        ),
        (
            "thanking_in_middle",
            "Thank you very much for your support. "
            "I appreciate your support on this migration. Thanks a lot, I will share the logs shortly.",
            "I appreciate your support on this migration. I will share the logs shortly.",
        ),
        (
            "single_greeting_word_german",
            "The deliverables are ready. Grüße",
            "The deliverables are ready.",
        ),
        (
            "german_empty_result",
            "Vielen Dank für Ihre Hilfe. Mit freundlichen Grüßen, Lena Meyer "
            "Und hier kommt noch mehr Text.",
            "",
        ),
        (
            "no_change",
            "Please schedule the kickoff meeting for next Tuesday morning at 10:00.",
            "Please schedule the kickoff meeting for next Tuesday morning at 10:00.",
        ),
    ])
    def test_remove_signature(self, name: str, input_text: str, expected: str) -> None:
        """The step produces the expected text when run through the pipeline."""
        pipeline = Pipeline({"remove_signature": None})
        value = pipeline.consume(input_text)
        self.assertEqual(expected, value)

    def test_remove_signature_step_empty_item(self) -> None:
        """An empty item is returned unchanged."""
        result = step("", {}, None, "")
        self.assertEqual(result, "")

    @patch("ai_data_preprocessing_queue.Steps.remove_signature.remove_greetings_and_following_text",
           side_effect=Exception("Test error"))
    def test_remove_signature_step_error(self, _: MagicMock) -> None:
        """A failure inside a helper surfaces from ``step`` as ``ValueError``."""
        # Asserting the concrete type: assertRaises(Exception) would also pass
        # if the step stopped wrapping errors or failed for an unrelated reason.
        with self.assertRaises(ValueError):
            step("Please schedule the kickoff meeting for next Tuesday morning at 10:00.", {}, None, "")


if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)