diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index e10b18d..9f84f0c 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -39,7 +39,7 @@ ] } }, - "initializeCommand": "powershell.exe .\\.devcontainer\\initialize.ps1", + "initializeCommand": "powershell.exe ./.devcontainer/initialize.ps1", "postCreateCommand": "pip3 install -r ${containerWorkspaceFolder}/requirements-dev.txt", "remoteUser": "vscode", "mounts": [ diff --git a/README.md b/README.md index 26f715b..b3229d9 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Note: Pipeline has to be instantiated only once and can be reused. ## Existing preprocessors ### To Lower Case -Name: to_lower +Name: to_lower Required additional data: - @@ -91,6 +91,12 @@ Required additional data: CSV data in string form with the following line format With this preprocessor you can replace specific words and abbreviations within the text with specified tokens. It is also possible to replace abbreviations ending with a dot. Other special characters are not supported, though. +### Remove signature +Name: remove_signature + +Removes greeting expressions and everything following them, as well as thank you expressions. +Should be used before the other processing steps. 
+ ## How to start developing ### With VS Code diff --git a/ai_data_preprocessing_queue/Steps/remove_signature.py b/ai_data_preprocessing_queue/Steps/remove_signature.py new file mode 100644 index 0000000..9b73b8e --- /dev/null +++ b/ai_data_preprocessing_queue/Steps/remove_signature.py @@ -0,0 +1,82 @@ +import re +from typing import Any + + +def remove_newline(text: str) -> str: + """Remove excessive newlines or spaces from the text.""" + pattern = re.compile(r"\s{2,}|[\n\r]{3,}") + result = pattern.sub(" ", text) + result = re.sub(r"\s+", " ", result).strip() + + return result + + +GreetingExpressions = ["sincerely", "best regards", "happy holidays", "kind regards", "warm regards", "cheers", + "regards", "mit freundlichen grüßen", "freundliche grüße", "beste grüße", "viele grüße", + "herzliche grüße", "liebe grüße", "mit freundlichen grüssen", "freundliche grüsse", + "beste grüsse", "viele grüsse", "herzliche grüsse", "liebe grüsse"] +greetings_regex = r"(" + "|".join(GreetingExpressions) + r")\s*,?\s*" + + +def remove_greetings_and_following_text(text: str) -> str: + pattern = greetings_regex + ".*" + return re.sub(pattern, "", text, flags=re.IGNORECASE | re.UNICODE | re.DOTALL).strip() + + +# thank you expressions should be removed after greetings and following signature text, +# as they often appear at the beginning of a message +THANK_EXPRESSIONS = [ + r"thank you(?: very much)?", # thank you, thank you very much + r"thankyou(?: very much)?", # thankyou, thankyou very much + r"thanks(?: a lot| again)?", # thanks, thanks a lot, thanks again + r"many thanks", # many thanks + r"a thousand thanks", # a thousand thanks + r"danke(?: schön)?", # danke, danke schön, danke und + r"vielen dank", # vielen dank + r"dankeschön", # dankeschön + r"besten dank" # besten dank +] + +# Suffixes which could follow thank you expressions +THANK_SUFFIXES = [ + r"(?:in advance(?: for (?:your|the) (?:help|support|understanding|assistance))?)", + r"(?:for (?:your|the) 
(?:help|support|understanding|assistance))", + r"(?:schon mal\s+)?(?:im voraus\s+)?für\s+(?:ihre|ihr|eure|die|den)\s+(?:hilfe|support|verständnis)", + r"vorab", + r"kindly?" +] + +# Combine them into a final regex pattern and compile +thank_expressions = r"|".join(THANK_EXPRESSIONS) +suffixes = r"(?:\s+(?:" + r"|".join(THANK_SUFFIXES) + r"))?" +final_pattern = ( + r"\b(?:" + thank_expressions + r")" + suffixes + r"\s*(?:,|\.|!|;)?\s*" +) +thanking_regex = re.compile(final_pattern, flags=re.IGNORECASE | re.UNICODE) + + +def remove_thanking_expressions(text: str) -> str: + return thanking_regex.sub("", text) + + +# In the end, single greetings are removed again, which could not +# be reliably removed by the preceding expressions +single_greeting_words = ["liebe grüße", "liebe grüsse", "grüße", "grüsse", "gruß", "gruss"] +single_greetings_pattern = r"\b(?:{})\b".format("|".join(single_greeting_words)) + + +def remove_single_greeting_words(text: str, pattern: str) -> str: + return re.sub(pattern, " ", text, flags=re.IGNORECASE | re.UNICODE) + + +def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any: + if not item: + return item + try: + text_greetings_removed = remove_greetings_and_following_text(item) + thankyou_removed = remove_thanking_expressions(text_greetings_removed) + single_greetings_removed = remove_single_greeting_words(thankyou_removed, single_greetings_pattern) + + return remove_newline(single_greetings_removed) + except Exception as e: + raise ValueError(f"An error occurred while removing signature: {e}") from e diff --git a/pyproject.toml b/pyproject.toml index b648416..e04c1b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ai-data-preprocessing-queue" -version = "1.6.0" +version = "1.7.0" description = "A collection of different text processing steps that can be enabled or disabled dynamically." 
"""Tests for the ``remove_signature`` preprocessing step."""
import unittest
from unittest.mock import MagicMock, patch

from parameterized import parameterized

from ai_data_preprocessing_queue.Pipeline import Pipeline
from ai_data_preprocessing_queue.Steps.remove_signature import (
    step, remove_greetings_and_following_text, remove_newline)


class TestRemoveSignature(unittest.TestCase):
    """Covers the helper functions, the step function and the pipeline integration."""

    @parameterized.expand([  # type: ignore[misc]
        (
            "multiple_newlines",
            "Could you please review the attached document?\n\n\nI need your feedback by Friday.",
            "Could you please review the attached document? I need your feedback by Friday.",
        ),
        (
            # The input must actually contain repeated spaces, otherwise the case asserts nothing.
            "multiple_spaces",
            "The meeting   is scheduled    for 3PM  tomorrow.",
            "The meeting is scheduled for 3PM tomorrow.",
        ),
        (
            "mixed_whitespace",
            "Please find the report attached. \n\n The numbers look good \r\n\r\n for Q3!",
            "Please find the report attached. The numbers look good for Q3!",
        ),
        (
            "empty_string",
            "",
            ""
        ),
        (
            "trailing_whitespace",
            "I'll send the updated version tomorrow. \n\n ",
            "I'll send the updated version tomorrow."
        )
    ])
    def test_remove_newline(self, name: str, input_text: str, expected: str) -> None:
        """Whitespace runs collapse to one space and the result is trimmed."""
        self.assertEqual(remove_newline(input_text), expected)

    @parameterized.expand([  # type: ignore[misc]
        (
            "english_signature_basic",
            "Here's the project update. Sincerely, John Smith\nProject Manager",
            "Here's the project update."
        ),
        (
            "english_signature_with_content",
            "Please review the attached documents. Best regards, Jane Doe\nSenior Developer\nTech Department",
            "Please review the attached documents."
        ),
        (
            "english_signature_with_content_and_several_newlines",
            "Please review the attached documents. Best regards,\nJane Doe\n\nSenior Developer\n\nTech Department",
            "Please review the attached documents."
        ),
        (
            "german_signature",
            "Die Unterlagen wurden aktualisiert. Mit freundlichen Grüßen, Hans Schmidt\nPhone: +49 123 456789",
            "Die Unterlagen wurden aktualisiert."
        ),
        (
            "greeting_with_comma",
            "Meeting is scheduled for tomorrow. Kind regards, Sarah",
            "Meeting is scheduled for tomorrow."
        ),
        (
            "mixed_case_greeting",
            "Report is ready. BEST REGARDS, Tom Wilson",
            "Report is ready."
        ),
        (
            "multiple_greetings",
            "Hello team, here's the update. Best regards, Jim\nRegards, HR Team",
            "Hello team, here's the update."
        ),
        (
            "empty_string",
            "",
            ""
        ),
        (
            "no_greetings",
            "This is a plain text without any greetings or signatures.",
            "This is a plain text without any greetings or signatures."
        ),
    ])
    def test_remove_greetings_and_following_text(self, name: str, input_text: str, expected: str) -> None:
        """The text is cut at the first closing salutation."""
        self.assertEqual(remove_greetings_and_following_text(input_text), expected)

    @parameterized.expand([  # type: ignore[misc]
        (
            "remove_signature_basic",
            "We're sending the final draft for review. Best regards, Alice Johnson\nProject Lead",
            "We're sending the final draft for review.",
        ),
        (
            "remove_signature_extended",
            "Order Mice/keyboard\nGoodmorning, Can you please order the following: 10 x Dell Laser Mouse IL3220 "
            "10 x Dell Business Keyboard AB322 (UK layout) Thx Best regards Jimmy B. "
            "| C Facilities & Reception Klaus+Andreas Nederland | Anonymstraat 47 | 1234 AJ Amsterdam | Netherlands "
            "Phone: +01 23 695 4567 | Mobile: +97 65 445 1234 | Fax: +31 35 695 8825 jim.anonymus@company.com "
            "| www.nl.somecompany.com",
            "Order Mice/keyboard Goodmorning, Can you please order the following: 10 x Dell Laser Mouse IL3220 "
            "10 x Dell Business Keyboard AB322 (UK layout) Thx",
        ),
        (
            "thanking_at_start",
            "Thank you very much for your support. "
            "I will prepare the contract and send it tomorrow.\n\nBest regards, Bob Brown",
            "I will prepare the contract and send it tomorrow.",
        ),
        (
            "thanking_in_middle",
            "Thank you very much for your support. "
            "I appreciate your support on this migration. Thanks a lot, I will share the logs shortly.",
            "I appreciate your support on this migration. I will share the logs shortly.",
        ),
        (
            "single_greeting_word_german",
            "The deliverables are ready. Grüße",
            "The deliverables are ready.",
        ),
        (
            "german_empty_result",
            "Vielen Dank für Ihre Hilfe. Mit freundlichen Grüßen, Lena Meyer "
            "Und hier kommt noch mehr Text.",
            "",
        ),
        (
            "no_change",
            "Please schedule the kickoff meeting for next Tuesday morning at 10:00.",
            "Please schedule the kickoff meeting for next Tuesday morning at 10:00.",
        ),
    ])
    def test_remove_signature(self, name: str, input_text: str, expected: str) -> None:
        """The step produces the expected text when run through a pipeline."""
        pipeline = Pipeline({"remove_signature": None})
        value = pipeline.consume(input_text)
        self.assertEqual(expected, value)

    def test_remove_signature_step_empty_item(self) -> None:
        """An empty item is returned unchanged."""
        result = step("", {}, None, "")
        self.assertEqual(result, "")

    @patch("ai_data_preprocessing_queue.Steps.remove_signature.remove_greetings_and_following_text",
           side_effect=Exception("Test error"))
    def test_remove_signature_step_error(self, _: MagicMock) -> None:
        """Failures inside the step surface as ValueError, not as the raw exception."""
        with self.assertRaises(ValueError):
            step("Please schedule the kickoff meeting for next Tuesday morning at 10:00.", {}, None, "")


if __name__ == "__main__":
    unittest.main()