KIT-4745 fixed linting issues

cwehmeier · cwehmeier · commit 731277adb439 · 2026-03-11T16:40:30.000+01:00
diff --git a/ai_data_preprocessing_queue/Steps/language_detect.py b/ai_data_preprocessing_queue/Steps/language_detect.py
@@ -1,11 +1,11 @@
-"""
-Detects one of the following languages and writes the language to local state.
+"""Detects one of the following languages and writes the language to local state.
 
 af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he,
 hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl,
 pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi,
 zh-cn, zh-tw
 """
+
 from typing import Any
 
 from langdetect import detect
diff --git a/ai_data_preprocessing_queue/Steps/remove_numbers.py b/ai_data_preprocessing_queue/Steps/remove_numbers.py
@@ -3,5 +3,4 @@
 
 
 def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any:
-    item = re.sub(r"""\d""", " ", item)
-    return item
+    return re.sub(r"""\d""", " ", item)
diff --git a/ai_data_preprocessing_queue/Steps/remove_punctuation.py b/ai_data_preprocessing_queue/Steps/remove_punctuation.py
@@ -3,5 +3,4 @@
 
 
 def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any:
-    item = re.sub(r"[^\w\s]", " ", item)
-    return item
+    return re.sub(r"[^\w\s]", " ", item)
diff --git a/ai_data_preprocessing_queue/Steps/remove_signature.py b/ai_data_preprocessing_queue/Steps/remove_signature.py
@@ -6,15 +6,30 @@ def remove_newline(text: str) -> str:
     """Remove excessive newlines or spaces from the text."""
     pattern = re.compile(r"\s{2,}|[\n\r]{3,}")
     result = pattern.sub(" ", text)
-    result = re.sub(r"\s+", " ", result).strip()
-
-    return result
-
-
-GreetingExpressions = ["sincerely", "best regards", "happy holidays", "kind regards", "warm regards", "cheers",
-                       "regards", "mit freundlichen grüßen", "freundliche grüße", "beste grüße", "viele grüße",
-                       "herzliche grüße", "liebe grüße", "mit freundlichen grüssen", "freundliche grüsse",
-                       "beste grüsse", "viele grüsse", "herzliche grüsse", "liebe grüsse"]
+    return re.sub(r"\s+", " ", result).strip()
+
+
+GreetingExpressions = [
+    "sincerely",
+    "best regards",
+    "happy holidays",
+    "kind regards",
+    "warm regards",
+    "cheers",
+    "regards",
+    "mit freundlichen grüßen",
+    "freundliche grüße",
+    "beste grüße",
+    "viele grüße",
+    "herzliche grüße",
+    "liebe grüße",
+    "mit freundlichen grüssen",
+    "freundliche grüsse",
+    "beste grüsse",
+    "viele grüsse",
+    "herzliche grüsse",
+    "liebe grüsse",
+]
 greetings_regex = r"(" + "|".join(GreetingExpressions) + r")\s*,?\s*"
 
 
@@ -26,15 +41,15 @@ def remove_greetings_and_following_text(text: str) -> str:
 # thank you expressions should be removed after greetings and following signature text,
 # as they often appear at the beginning of a message
 THANK_EXPRESSIONS = [
-    r"thank you(?: very much)?",   # thank you, thank you very much
-    r"thankyou(?: very much)?",   # thankyou, thankyou very much
-    r"thanks(?: a lot| again)?",   # thanks, thanks a lot, thanks again
-    r"many thanks",                # many thanks
-    r"a thousand thanks",          # a thousand thanks
-    r"danke(?: schön)?",           # danke, danke schön, danke und
-    r"vielen dank",                # vielen dank
-    r"dankeschön",                 # dankeschön
-    r"besten dank"                 # besten dank
+    r"thank you(?: very much)?",  # thank you, thank you very much
+    r"thankyou(?: very much)?",  # thankyou, thankyou very much
+    r"thanks(?: a lot| again)?",  # thanks, thanks a lot, thanks again
+    r"many thanks",  # many thanks
+    r"a thousand thanks",  # a thousand thanks
+    r"danke(?: schön)?",  # danke, danke schön, danke und
+    r"vielen dank",  # vielen dank
+    r"dankeschön",  # dankeschön
+    r"besten dank",  # besten dank
 ]
 
 # Suffixes which could follow thank you expressions
@@ -43,15 +58,13 @@ def remove_greetings_and_following_text(text: str) -> str:
     r"(?:for (?:your|the) (?:help|support|understanding|assistance))",
     r"(?:schon mal\s+)?(?:im voraus\s+)?für\s+(?:ihre|ihr|eure|die|den)\s+(?:hilfe|support|verständnis)",
     r"vorab",
-    r"kindly?"
+    r"kindly?",
 ]
 
 # Combine them into a final regex pattern and compile
 thank_expressions = r"|".join(THANK_EXPRESSIONS)
 suffixes = r"(?:\s+(?:" + r"|".join(THANK_SUFFIXES) + r"))?"
-final_pattern = (
-    r"\b(?:" + thank_expressions + r")" + suffixes + r"\s*(?:,|\.|!|;)?\s*"
-)
+final_pattern = r"\b(?:" + thank_expressions + r")" + suffixes + r"\s*(?:,|\.|!|;)?\s*"
 thanking_regex = re.compile(final_pattern, flags=re.IGNORECASE | re.UNICODE)
 
 
diff --git a/ai_data_preprocessing_queue/Steps/spellcheck.py b/ai_data_preprocessing_queue/Steps/spellcheck.py
@@ -30,9 +30,12 @@ def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | N
         all_words_to_check: Any = reduce(lambda x, y: cast(str, x) + cast(str, y), items)
 
         for w in all_words_to_check:
-            if len(item_word) < 4 and _levenshtein(item_word, w) == 1:
-                item = item.replace(item_word, w)
-            elif len(item_word) >= 4 and 1 <= _levenshtein(item_word, w) <= 2:
+            if (
+                len(item_word) < 4  # noqa: PLR2004
+                and _levenshtein(item_word, w) == 1
+                or len(item_word) >= 4  # noqa: PLR2004
+                and 1 <= _levenshtein(item_word, w) <= 2  # noqa: PLR2004
+            ):
                 item = item.replace(item_word, w)
 
     return item
diff --git a/ai_data_preprocessing_queue/Steps/text_only.py b/ai_data_preprocessing_queue/Steps/text_only.py
@@ -4,5 +4,4 @@
 
 def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any:
     item = re.sub(r"[^\w\s]", " ", item)
-    item = re.sub(r"""\d""", " ", item)
-    return item
+    return re.sub(r"""\d""", " ", item)
diff --git a/ai_data_preprocessing_queue/Steps/token_replacement.py b/ai_data_preprocessing_queue/Steps/token_replacement.py
@@ -15,7 +15,7 @@ def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | N
 
         # also replace dots at end of word
         if not line[0].endswith("."):
-            regex = regex + "\\b"
+            regex += "\\b"
 
         pattern = re.compile(regex)
         item = pattern.sub(line[1], item)
@@ -38,18 +38,16 @@ def _get_data_from_store_or_reload(global_state: dict[str, Any] | None, preproce
 
 def _prepare_pre_processor_data(preprocessor_data: str) -> list[list[str]]:
     lines: list[list[str]] = [
-        [s.strip() for i, s in enumerate(line.split(",")) if (i == 2 and re.compile(r"^[0-9\s]+$").match(s)) or i < 2]
+        [s.strip() for i, s in enumerate(line.split(",")) if (i == 2 and re.compile(r"^[0-9\s]+$").match(s)) or i < 2]  # noqa: PLR2004
         for line in preprocessor_data.splitlines()
-        if line.count(",") == 2
+        if line.count(",") == 2  # noqa: PLR2004
     ]
-    lines = [line for line in lines if len(line) == 3]
+    lines = [line for line in lines if len(line) == 3]  # noqa: PLR2004
 
     i: int = 0
     while i < len(lines):
         lines[i][2] = int(lines[i][2])  # type: ignore
         i += 1
 
-    # sort
-    lines = sorted(lines, key=lambda f: 0 - f[2])  # type: ignore
-
-    return lines
+    # sort and return
+    return sorted(lines, key=lambda f: 0 - f[2])  # type: ignore
diff --git a/ai_data_preprocessing_queue/__init__.py b/ai_data_preprocessing_queue/__init__.py
@@ -1 +1 @@
-from .Pipeline import Pipeline
+from .Pipeline import Pipeline as Pipeline
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ai-data-preprocessing-queue"
-version = "1.7.1"
+version = "1.7.2"
 description = "A collection of different text processing steps that can be enabled or disabled dynamically."
 authors = ["KI-Team"]
 license = "MIT"
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -5,4 +5,5 @@ mypy
 parameterized
 pytest
 pytest-cov
+ruff
 
diff --git a/setup.py b/setup.py
@@ -1,17 +1,17 @@
 import setuptools
 
-with open("README.md", "r") as fh:
+with open("README.md", encoding="utf-8") as fh:
     LONG_DESCRIPTION = fh.read()
 
-with open("requirements.txt", "r") as fin:
+with open("requirements.txt", encoding="utf-8") as fin:
     REQS = fin.read().splitlines()
 
-with open("requirements-dev.txt", "r") as fin:
+with open("requirements-dev.txt", encoding="utf-8") as fin:
     REQS_DEV = [item for item in fin.read().splitlines() if not item.endswith(".txt")]
 
 setuptools.setup(
     name="ai-data-preprocessing-queue",
-    version="1.7.1",
+    version="1.7.2",
     description="Can be used to pre process data before ai processing",
     long_description=LONG_DESCRIPTION,
     long_description_content_type="text/markdown",
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
@@ -48,7 +48,7 @@ def test_regex_replacement_do_not_crash_for_no_data(self) -> None:
         self.assertEqual("test text", value)
 
     def test_regex_replacement(self) -> None:
-        with open(path.join(ABS_PATH_TEST_DATA, "regex_replacement_testdata.csv"), "r", encoding="utf-8") as handler:
+        with open(path.join(ABS_PATH_TEST_DATA, "regex_replacement_testdata.csv"), encoding="utf-8") as handler:
             pipeline = Pipeline({"regex_replacement": handler.read()})
         # date
         value = pipeline.consume("test 1.1.2019 20.2.2003 1.1.20 01.01.20 1.1.1900 1.1. 01.01. test")
@@ -88,19 +88,19 @@ def test_token_replacement_do_not_crash_for_no_data(self) -> None:
         self.assertEqual("test text", value)
 
     def test_token_replacement(self) -> None:
-        with open(path.join(ABS_PATH_TEST_DATA, "token_replacement_testdata.csv"), "r", encoding="utf-8") as handler:
+        with open(path.join(ABS_PATH_TEST_DATA, "token_replacement_testdata.csv"), encoding="utf-8") as handler:
             pipeline = Pipeline({"token_replacement": handler.read()})
         value = pipeline.consume("test asd bla 1212")
         self.assertEqual("test www blub 1212", value)
 
     def test_token_replacement_do_not_replace_parts_of_word(self) -> None:
-        with open(path.join(ABS_PATH_TEST_DATA, "token_replacement_testdata.csv"), "r", encoding="utf-8") as handler:
+        with open(path.join(ABS_PATH_TEST_DATA, "token_replacement_testdata.csv"), encoding="utf-8") as handler:
             pipeline = Pipeline({"token_replacement": handler.read()})
         value = pipeline.consume("test abg. abgabgeschlossen 1212")
         self.assertEqual("test abgeschlossen abgabgeschlossen 1212", value)
 
     def test_token_replacement_also_replace_dots_at_end_of_phrase(self) -> None:
-        with open(path.join(ABS_PATH_TEST_DATA, "token_replacement_testdata.csv"), "r", encoding="utf-8") as handler:
+        with open(path.join(ABS_PATH_TEST_DATA, "token_replacement_testdata.csv"), encoding="utf-8") as handler:
             pipeline = Pipeline({"token_replacement": handler.read()})
         value = pipeline.consume("abg. 1212")
         self.assertEqual("abgeschlossen 1212", value)
diff --git a/tests/test_remove_signature.py b/tests/test_remove_signature.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | `-from .Pipeline import Pipeline` |
| | `1` | `+from .Pipeline import Pipeline as Pipeline` |