From b0e946ca99ac5e3bc5e8886b4b86c72bd65ee3c6 Mon Sep 17 00:00:00 2001 From: cwehmeier Date: Thu, 23 Oct 2025 16:03:49 +0200 Subject: [PATCH 1/5] KIT-4467 added signature removal as step --- .devcontainer/devcontainer.json | 2 +- README.md | 8 +- .../Steps/remove_signature.py | 81 +++++++++++++++++++ 3 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 ai_data_preprocessing_queue/Steps/remove_signature.py diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index e10b18d..9f84f0c 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -39,7 +39,7 @@ ] } }, - "initializeCommand": "powershell.exe .\\.devcontainer\\initialize.ps1", + "initializeCommand": "powershell.exe ./.devcontainer/initialize.ps1", "postCreateCommand": "pip3 install -r ${containerWorkspaceFolder}/requirements-dev.txt", "remoteUser": "vscode", "mounts": [ diff --git a/README.md b/README.md index 26f715b..b3229d9 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Note: Pipeline has to be instantiated only once and can be reused. ## Existing preprocessors ### To Lower Case -Name: to_lower +Name: to_lower Required additional data: - @@ -91,6 +91,12 @@ Required additional data: CSV data in string form with the following line format With this preprocessor you can replace specific words and abbreviations within the text with specified tokens. It is also possible to replace abbreviations ending with a dot. Other special characters are not supported, though. +### Remove signature +Name: remove_signature + +Removes closing salutations (e.g. "Best regards", "Mit freundlichen Grüßen") and everything following them, as well as thank-you expressions. +Should be run before the other preprocessing steps. 
+ ## How to start developing ### With VS Code diff --git a/ai_data_preprocessing_queue/Steps/remove_signature.py b/ai_data_preprocessing_queue/Steps/remove_signature.py new file mode 100644 index 0000000..3538b9b --- /dev/null +++ b/ai_data_preprocessing_queue/Steps/remove_signature.py @@ -0,0 +1,81 @@ +import re + + +def remove_newline(text: str) -> str: + """Remove excessive newlines or spaces from the text.""" + pattern = re.compile(r"\s{2,}|[\n\r]{3,}") + result = pattern.sub(" ", text) + result = re.sub(r"\s+", " ", result).strip() + + return result + + +GreetingExpressions = ["sincerely", "best regards", "happy holidays", "kind regards", "warm regards", "cheers", + "regards", "mit freundlichen grüßen", "freundliche grüße", "beste grüße", "viele grüße", + "herzliche grüße", "liebe grüße", "mit freundlichen grüssen", "freundliche grüsse", + "beste grüsse", "viele grüsse", "herzliche grüsse", "liebe grüsse"] +greetings_regex = r"(" + "|".join(GreetingExpressions) + r")\s*,?\s*" + + +def remove_greetings_and_following_text(text: str) -> str: + pattern = greetings_regex + ".*" + return re.sub(pattern, "", text, flags=re.IGNORECASE | re.UNICODE) + + +# thank you expressions should be removed after greetings and following signature text, +# as they often appear at the beginning of a message +THANK_EXPRESSIONS = [ + r"thank you(?: very much)?", # thank you, thank you very much + r"thankyou(?: very much)?", # thankyou, thankyou very much + r"thanks(?: a lot| again)?", # thanks, thanks a lot, thanks again + r"many thanks", # many thanks + r"a thousand thanks", # a thousand thanks + r"danke(?: schön)?", # danke, danke schön, danke und + r"vielen dank", # vielen dank + r"dankeschön", # dankeschön + r"besten dank" # besten dank +] + +# Suffixes which could follow thank you expressions +THANK_SUFFIXES = [ + r"(?:in advance(?: for (?:your|the) (?:help|support|understanding|assistance))?)", + r"(?:for (?:your|the) (?:help|support|understanding|assistance))", + r"(?:schon mal 
)?(?:im voraus)?(?: für (?:ihre|ihr|eure|die|den) (?:hilfe|support|verständnis))?", + r"vorab", + r"kindly?" +] + +# Combine them into a final regex pattern and compile +thank_expressions = r"|".join(THANK_EXPRESSIONS) +suffixes = r"(?:\s+(?:" + r"|".join(THANK_SUFFIXES) + r"))?" +final_pattern = ( + r"\b(?:" + thank_expressions + r")" + suffixes + r"\s*(?:,|\.|!|;)?\s*" +) +thanking_regex = re.compile(final_pattern, flags=re.IGNORECASE | re.UNICODE) + + +def remove_thanking_expressions(text: str) -> str: + return thanking_regex.sub("", text) + + +# In the end, single greetings are removed again, which could not +# be reliably removed by the preceding expressions +single_greeting_words = ["liebe grüße", "liebe grüsse", "grüße", "grüsse", "gruß", "gruss"] +single_greetings_pattern = r"\b(?:{})\b".format("|".join(single_greeting_words)) + + +def remove_single_greeting_words(text: str, pattern: str) -> str: + return re.sub(pattern, " ", text, flags=re.IGNORECASE | re.UNICODE) + + +def step(text: str) -> str: + if not text: + return text + try: + text_greetings_removed = remove_greetings_and_following_text(text) + thankyou_removed = remove_thanking_expressions(text_greetings_removed) + single_greetings_removed = remove_single_greeting_words(thankyou_removed, single_greetings_pattern) + + return remove_newline(single_greetings_removed) + except Exception as e: + raise ValueError(f"An error occurred while removing signature: {e}") from e From 366ec59bf9918a152dc3d3574ac4bc3fc8453f86 Mon Sep 17 00:00:00 2001 From: cwehmeier Date: Fri, 24 Oct 2025 12:31:10 +0200 Subject: [PATCH 2/5] KIT-4469 added unit tests --- .../Steps/remove_signature.py | 2 +- requirements.txt | 1 + tests/test_remove_signature.py | 92 +++++++++++++++++++ 3 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 tests/test_remove_signature.py diff --git a/ai_data_preprocessing_queue/Steps/remove_signature.py b/ai_data_preprocessing_queue/Steps/remove_signature.py index 3538b9b..05f849a 100644 
--- a/ai_data_preprocessing_queue/Steps/remove_signature.py +++ b/ai_data_preprocessing_queue/Steps/remove_signature.py @@ -19,7 +19,7 @@ def remove_newline(text: str) -> str: def remove_greetings_and_following_text(text: str) -> str: pattern = greetings_regex + ".*" - return re.sub(pattern, "", text, flags=re.IGNORECASE | re.UNICODE) + return re.sub(pattern, "", text, flags=re.IGNORECASE | re.UNICODE | re.DOTALL).strip() # thank you expressions should be removed after greetings and following signature text, diff --git a/requirements.txt b/requirements.txt index d67e519..aa9fa8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ langdetect~=1.0.9 nltk>=3.9.0, <4.0 pandas>=2.0.0, <3.0 numpy>=2.0.0, <3.0 +parameterized==0.9.0 diff --git a/tests/test_remove_signature.py b/tests/test_remove_signature.py new file mode 100644 index 0000000..8d5ac0f --- /dev/null +++ b/tests/test_remove_signature.py @@ -0,0 +1,92 @@ +import unittest + +from parameterized import parameterized + +from ai_data_preprocessing_queue.Steps.remove_signature import ( + remove_greetings_and_following_text, remove_newline) + + +class TestRemoveSignature(unittest.TestCase): + @parameterized.expand([ # type: ignore[misc] + ("multiple_newlines", + "Could you please review the attached document?\n\n\nI need your feedback by Friday.", + "Could you please review the attached document? I need your feedback by Friday." + ), + ( + "multiple_spaces", + "The meeting is scheduled for 3PM tomorrow.", + "The meeting is scheduled for 3PM tomorrow." + ), + ( + "mixed_whitespace", + "Please find the report attached. \n\n The numbers look good \r\n\r\n for Q3!", + "Please find the report attached. The numbers look good for Q3!" + ), + ( + "empty_string", + "", + "" + ), + ( + "trailing_whitespace", + "I'll send the updated version tomorrow. \n\n ", + "I'll send the updated version tomorrow." 
+ ), + ]) + def test_remove_newline(self, name: str, input_text: str, expected: str) -> None: + self.assertEqual(remove_newline(input_text), expected) + + @parameterized.expand([ # type: ignore[misc] + ( + "english_signature_basic", + "Here's the project update. Sincerely, John Smith\nProject Manager", + "Here's the project update." + ), + ( + "english_signature_with_content", + "Please review the attached documents. Best regards, Jane Doe\nSenior Developer\nTech Department", + "Please review the attached documents." + ), + ( + "english_signature_with_content_and_several_newlines", + "Please review the attached documents. Best regards,\nJane Doe\n\nSenior Developer\n\nTech Department", + "Please review the attached documents." + ), + ( + "german_signature", + "Die Unterlagen wurden aktualisiert. Mit freundlichen Grüßen, Hans Schmidt\nPhone: +49 123 456789", + "Die Unterlagen wurden aktualisiert." + ), + ( + "greeting_with_comma", + "Meeting is scheduled for tomorrow. Kind regards, Sarah", + "Meeting is scheduled for tomorrow." + ), + ( + "mixed_case_greeting", + "Report is ready. BEST REGARDS, Tom Wilson", + "Report is ready." + ), + ( + "multiple_greetings", + "Hello team, here's the update. Best regards, Jim\nRegards, HR Team", + "Hello team, here's the update." + ), + ( + "empty_string", + "", + "" + ), + ( + "no_greetings", + "This is a plain text without any greetings or signatures.", + "This is a plain text without any greetings or signatures." 
+ ) + ]) + def test_remove_greetings_and_following_text(self, name: str, input_text: str, expected: str) -> None: + self.assertEqual(remove_greetings_and_following_text(input_text), expected) + + +if __name__ == '__main__': + unittest.main() + unittest.main() From 9db279c8f25d5d96baca0567baa724c7da75717d Mon Sep 17 00:00:00 2001 From: cwehmeier Date: Fri, 24 Oct 2025 13:48:51 +0200 Subject: [PATCH 3/5] KIT-4469 fixed linting issue --- tests/test_remove_signature.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_remove_signature.py b/tests/test_remove_signature.py index 8d5ac0f..cd8c7b9 100644 --- a/tests/test_remove_signature.py +++ b/tests/test_remove_signature.py @@ -8,19 +8,20 @@ class TestRemoveSignature(unittest.TestCase): @parameterized.expand([ # type: ignore[misc] - ("multiple_newlines", + ( + "multiple_newlines", "Could you please review the attached document?\n\n\nI need your feedback by Friday.", - "Could you please review the attached document? I need your feedback by Friday." + "Could you please review the attached document? I need your feedback by Friday.", ), ( "multiple_spaces", "The meeting is scheduled for 3PM tomorrow.", - "The meeting is scheduled for 3PM tomorrow." + "The meeting is scheduled for 3PM tomorrow.", ), ( "mixed_whitespace", "Please find the report attached. \n\n The numbers look good \r\n\r\n for Q3!", - "Please find the report attached. The numbers look good for Q3!" + "Please find the report attached. The numbers look good for Q3!", ), ( "empty_string", @@ -31,7 +32,7 @@ class TestRemoveSignature(unittest.TestCase): "trailing_whitespace", "I'll send the updated version tomorrow. \n\n ", "I'll send the updated version tomorrow." 
- ), + ) ]) def test_remove_newline(self, name: str, input_text: str, expected: str) -> None: self.assertEqual(remove_newline(input_text), expected) @@ -81,12 +82,11 @@ def test_remove_newline(self, name: str, input_text: str, expected: str) -> None "no_greetings", "This is a plain text without any greetings or signatures.", "This is a plain text without any greetings or signatures." - ) + ), ]) def test_remove_greetings_and_following_text(self, name: str, input_text: str, expected: str) -> None: self.assertEqual(remove_greetings_and_following_text(input_text), expected) -if __name__ == '__main__': - unittest.main() +if __name__ == "__main__": unittest.main() From 643e5aa3e584e47dfcca761d93e80368786dbeff Mon Sep 17 00:00:00 2001 From: cwehmeier Date: Fri, 24 Oct 2025 17:35:01 +0200 Subject: [PATCH 4/5] KIT-4469 fixed added more unit tests --- .../Steps/remove_signature.py | 11 ++--- tests/test_remove_signature.py | 41 +++++++++++++++++++ 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/ai_data_preprocessing_queue/Steps/remove_signature.py b/ai_data_preprocessing_queue/Steps/remove_signature.py index 05f849a..9b73b8e 100644 --- a/ai_data_preprocessing_queue/Steps/remove_signature.py +++ b/ai_data_preprocessing_queue/Steps/remove_signature.py @@ -1,4 +1,5 @@ import re +from typing import Any def remove_newline(text: str) -> str: @@ -40,7 +41,7 @@ def remove_greetings_and_following_text(text: str) -> str: THANK_SUFFIXES = [ r"(?:in advance(?: for (?:your|the) (?:help|support|understanding|assistance))?)", r"(?:for (?:your|the) (?:help|support|understanding|assistance))", - r"(?:schon mal )?(?:im voraus)?(?: für (?:ihre|ihr|eure|die|den) (?:hilfe|support|verständnis))?", + r"(?:schon mal\s+)?(?:im voraus\s+)?für\s+(?:ihre|ihr|eure|die|den)\s+(?:hilfe|support|verständnis)", r"vorab", r"kindly?" 
] @@ -68,11 +69,11 @@ def remove_single_greeting_words(text: str, pattern: str) -> str: return re.sub(pattern, " ", text, flags=re.IGNORECASE | re.UNICODE) -def step(text: str) -> str: - if not text: - return text +def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any: + if not item: + return item try: - text_greetings_removed = remove_greetings_and_following_text(text) + text_greetings_removed = remove_greetings_and_following_text(item) thankyou_removed = remove_thanking_expressions(text_greetings_removed) single_greetings_removed = remove_single_greeting_words(thankyou_removed, single_greetings_pattern) diff --git a/tests/test_remove_signature.py b/tests/test_remove_signature.py index cd8c7b9..7302044 100644 --- a/tests/test_remove_signature.py +++ b/tests/test_remove_signature.py @@ -2,6 +2,7 @@ from parameterized import parameterized +from ai_data_preprocessing_queue.Pipeline import Pipeline from ai_data_preprocessing_queue.Steps.remove_signature import ( remove_greetings_and_following_text, remove_newline) @@ -87,6 +88,46 @@ def test_remove_newline(self, name: str, input_text: str, expected: str) -> None def test_remove_greetings_and_following_text(self, name: str, input_text: str, expected: str) -> None: self.assertEqual(remove_greetings_and_following_text(input_text), expected) + @parameterized.expand([ # type: ignore[misc] + ( + "remove_signature_basic", + "We're sending the final draft for review. Best regards, Alice Johnson\nProject Lead", + "We're sending the final draft for review.", + ), + ( + "thanking_at_start", + "Thank you very much for your support. " + "I will prepare the contract and send it tomorrow.\n\nBest regards, Bob Brown", + "I will prepare the contract and send it tomorrow.", + ), + ( + "thanking_in_middle", + "Thank you very much for your support. " + "I appreciate your support on this migration. 
Thanks a lot, I will share the logs shortly.", + "I appreciate your support on this migration. I will share the logs shortly.", + ), + ( + "single_greeting_word_german", + "The deliverables are ready. Grüße", + "The deliverables are ready.", + ), + ( + "german_empty_result", + "Vielen Dank für Ihre Hilfe. Mit freundlichen Grüßen, Lena Meyer " + "Und hier kommt noch mehr Text.", + "", + ), + ( + "no_change", + "Please schedule the kickoff meeting for next Tuesday morning at 10:00.", + "Please schedule the kickoff meeting for next Tuesday morning at 10:00.", + ), + ]) + def test_remove_signature_parameterized(self, name: str, input_text: str, expected: str) -> None: + pipeline = Pipeline({"remove_signature": None}) + value = pipeline.consume(input_text) + self.assertEqual(expected, value) + if __name__ == "__main__": unittest.main() From 653862e9298c5cb3dd37c2f96fa1c9e09593473e Mon Sep 17 00:00:00 2001 From: cwehmeier Date: Tue, 28 Oct 2025 17:19:42 +0100 Subject: [PATCH 5/5] KIT-4467 add more unittests, organized reqs. --- pyproject.toml | 7 ++++--- requirements-dev.txt | 1 + requirements.txt | 1 - setup.py | 2 +- tests/test_remove_signature.py | 26 +++++++++++++++++++++++--- 5 files changed, 29 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b648416..e04c1b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ai-data-preprocessing-queue" -version = "1.6.0" +version = "1.7.0" description = "A collection of different text processing steps that can be enabled or disabled dynamically." 
authors = ["KI-Team"] license = "MIT" @@ -10,15 +10,16 @@ readme = "README.md" python = "^3.12" langdetect = "*" nltk = "*" -pandas = "*" numpy = "*" +pandas = "*" [tool.poetry.group.dev.dependencies] +build = "*" coverage-lcov = "*" flake8-bandit = "*" flake8-pydocstyle = "*" mypy = "*" -build = "*" +parameterized = "*" pytest = "*" pytest-cov = "*" types-mock = "*" diff --git a/requirements-dev.txt b/requirements-dev.txt index 4d9ec2e..0e1d798 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,6 +4,7 @@ build flake8-bandit flake8-pydocstyle mypy +parameterized pytest pytest-cov diff --git a/requirements.txt b/requirements.txt index aa9fa8d..d67e519 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,3 @@ langdetect~=1.0.9 nltk>=3.9.0, <4.0 pandas>=2.0.0, <3.0 numpy>=2.0.0, <3.0 -parameterized==0.9.0 diff --git a/setup.py b/setup.py index d57fece..a9cdc94 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setuptools.setup( name="ai-data-preprocessing-queue", - version="1.6.0", + version="1.7.0", description="Can be used to pre process data before ai processing", long_description=LONG_DESCRIPTION, long_description_content_type="text/markdown", diff --git a/tests/test_remove_signature.py b/tests/test_remove_signature.py index 7302044..9e33a9d 100644 --- a/tests/test_remove_signature.py +++ b/tests/test_remove_signature.py @@ -1,10 +1,10 @@ import unittest from parameterized import parameterized - +from unittest.mock import MagicMock, patch from ai_data_preprocessing_queue.Pipeline import Pipeline from ai_data_preprocessing_queue.Steps.remove_signature import ( - remove_greetings_and_following_text, remove_newline) + step, remove_greetings_and_following_text, remove_newline) class TestRemoveSignature(unittest.TestCase): @@ -94,6 +94,16 @@ def test_remove_greetings_and_following_text(self, name: str, input_text: str, e "We're sending the final draft for review. 
Best regards, Alice Johnson\nProject Lead", "We're sending the final draft for review.", ), + ( + "remove_signature_extended", + "Order Mice/keyboard\nGoodmorning, Can you please order the following: 10 x Dell Laser Mouse IL3220 " + "10 x Dell Business Keyboard AB322 (UK layout) Thx Best regards Jimmy B. " + "| C Facilities & Reception Klaus+Andreas Nederland | Anonymstraat 47 | 1234 AJ Amsterdam | Netherlands " + "Phone: +01 23 695 4567 | Mobile: +97 65 445 1234 | Fax: +31 35 695 8825 jim.anonymus@company.com " + "| www.nl.somecompany.com", + "Order Mice/keyboard Goodmorning, Can you please order the following: 10 x Dell Laser Mouse IL3220 " + "10 x Dell Business Keyboard AB322 (UK layout) Thx", + ), ( "thanking_at_start", "Thank you very much for your support. " @@ -123,11 +133,21 @@ def test_remove_greetings_and_following_text(self, name: str, input_text: str, e "Please schedule the kickoff meeting for next Tuesday morning at 10:00.", ), ]) - def test_remove_signature_parameterized(self, name: str, input_text: str, expected: str) -> None: + def test_remove_signature(self, name: str, input_text: str, expected: str) -> None: pipeline = Pipeline({"remove_signature": None}) value = pipeline.consume(input_text) self.assertEqual(expected, value) + def test_remove_signature_step_empty_item(self) -> None: + result = step("", {}, None, "") + self.assertEqual(result, "") + + @patch("ai_data_preprocessing_queue.Steps.remove_signature.remove_greetings_and_following_text", + side_effect=Exception("Test error")) + def test_remove_signature_step_error(self, _: MagicMock) -> None: + with self.assertRaises(Exception): + step("Please schedule the kickoff meeting for next Tuesday morning at 10:00.", {}, None, "") + if __name__ == "__main__": unittest.main()