Skip to content

Commit 731277a

Browse files
committed
KIT-4745 fixed linting issues
1 parent d084cb7 commit 731277a

13 files changed

Lines changed: 181 additions & 174 deletions

ai_data_preprocessing_queue/Steps/language_detect.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,11 @@
1-
"""
2-
Detects one of the following languages and writes the language to local state.
1+
"""Detects one of the following languages and writes the language to local state.
32
43
af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he,
54
hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl,
65
pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi,
76
zh-cn, zh-tw
87
"""
8+
99
from typing import Any
1010

1111
from langdetect import detect

ai_data_preprocessing_queue/Steps/remove_numbers.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,5 +3,4 @@
33

44

55
def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any:
6-
item = re.sub(r"""\d""", " ", item)
7-
return item
6+
return re.sub(r"""\d""", " ", item)

ai_data_preprocessing_queue/Steps/remove_punctuation.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,5 +3,4 @@
33

44

55
def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any:
6-
item = re.sub(r"[^\w\s]", " ", item)
7-
return item
6+
return re.sub(r"[^\w\s]", " ", item)

ai_data_preprocessing_queue/Steps/remove_signature.py

Lines changed: 35 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -6,15 +6,30 @@ def remove_newline(text: str) -> str:
66
"""Remove excessive newlines or spaces from the text."""
77
pattern = re.compile(r"\s{2,}|[\n\r]{3,}")
88
result = pattern.sub(" ", text)
9-
result = re.sub(r"\s+", " ", result).strip()
10-
11-
return result
12-
13-
14-
GreetingExpressions = ["sincerely", "best regards", "happy holidays", "kind regards", "warm regards", "cheers",
15-
"regards", "mit freundlichen grüßen", "freundliche grüße", "beste grüße", "viele grüße",
16-
"herzliche grüße", "liebe grüße", "mit freundlichen grüssen", "freundliche grüsse",
17-
"beste grüsse", "viele grüsse", "herzliche grüsse", "liebe grüsse"]
9+
return re.sub(r"\s+", " ", result).strip()
10+
11+
12+
GreetingExpressions = [
13+
"sincerely",
14+
"best regards",
15+
"happy holidays",
16+
"kind regards",
17+
"warm regards",
18+
"cheers",
19+
"regards",
20+
"mit freundlichen grüßen",
21+
"freundliche grüße",
22+
"beste grüße",
23+
"viele grüße",
24+
"herzliche grüße",
25+
"liebe grüße",
26+
"mit freundlichen grüssen",
27+
"freundliche grüsse",
28+
"beste grüsse",
29+
"viele grüsse",
30+
"herzliche grüsse",
31+
"liebe grüsse",
32+
]
1833
greetings_regex = r"(" + "|".join(GreetingExpressions) + r")\s*,?\s*"
1934

2035

@@ -26,15 +41,15 @@ def remove_greetings_and_following_text(text: str) -> str:
2641
# thank you expressions should be removed after greetings and following signature text,
2742
# as they often appear at the beginning of a message
2843
THANK_EXPRESSIONS = [
29-
r"thank you(?: very much)?", # thank you, thank you very much
30-
r"thankyou(?: very much)?", # thankyou, thankyou very much
31-
r"thanks(?: a lot| again)?", # thanks, thanks a lot, thanks again
32-
r"many thanks", # many thanks
33-
r"a thousand thanks", # a thousand thanks
34-
r"danke(?: schön)?", # danke, danke schön, danke und
35-
r"vielen dank", # vielen dank
36-
r"dankeschön", # dankeschön
37-
r"besten dank" # besten dank
44+
r"thank you(?: very much)?", # thank you, thank you very much
45+
r"thankyou(?: very much)?", # thankyou, thankyou very much
46+
r"thanks(?: a lot| again)?", # thanks, thanks a lot, thanks again
47+
r"many thanks", # many thanks
48+
r"a thousand thanks", # a thousand thanks
49+
r"danke(?: schön)?", # danke, danke schön, danke und
50+
r"vielen dank", # vielen dank
51+
r"dankeschön", # dankeschön
52+
r"besten dank", # besten dank
3853
]
3954

4055
# Suffixes which could follow thank you expressions
@@ -43,15 +58,13 @@ def remove_greetings_and_following_text(text: str) -> str:
4358
r"(?:for (?:your|the) (?:help|support|understanding|assistance))",
4459
r"(?:schon mal\s+)?(?:im voraus\s+)?für\s+(?:ihre|ihr|eure|die|den)\s+(?:hilfe|support|verständnis)",
4560
r"vorab",
46-
r"kindly?"
61+
r"kindly?",
4762
]
4863

4964
# Combine them into a final regex pattern and compile
5065
thank_expressions = r"|".join(THANK_EXPRESSIONS)
5166
suffixes = r"(?:\s+(?:" + r"|".join(THANK_SUFFIXES) + r"))?"
52-
final_pattern = (
53-
r"\b(?:" + thank_expressions + r")" + suffixes + r"\s*(?:,|\.|!|;)?\s*"
54-
)
67+
final_pattern = r"\b(?:" + thank_expressions + r")" + suffixes + r"\s*(?:,|\.|!|;)?\s*"
5568
thanking_regex = re.compile(final_pattern, flags=re.IGNORECASE | re.UNICODE)
5669

5770

ai_data_preprocessing_queue/Steps/spellcheck.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -30,9 +30,12 @@ def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | N
3030
all_words_to_check: Any = reduce(lambda x, y: cast(str, x) + cast(str, y), items)
3131

3232
for w in all_words_to_check:
33-
if len(item_word) < 4 and _levenshtein(item_word, w) == 1:
34-
item = item.replace(item_word, w)
35-
elif len(item_word) >= 4 and 1 <= _levenshtein(item_word, w) <= 2:
33+
if (
34+
len(item_word) < 4 # noqa: PLR2004
35+
and _levenshtein(item_word, w) == 1
36+
or len(item_word) >= 4 # noqa: PLR2004
37+
and 1 <= _levenshtein(item_word, w) <= 2 # noqa: PLR2004
38+
):
3639
item = item.replace(item_word, w)
3740

3841
return item

ai_data_preprocessing_queue/Steps/text_only.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,5 +4,4 @@
44

55
def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any:
66
item = re.sub(r"[^\w\s]", " ", item)
7-
item = re.sub(r"""\d""", " ", item)
8-
return item
7+
return re.sub(r"""\d""", " ", item)

ai_data_preprocessing_queue/Steps/token_replacement.py

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | N
1515

1616
# also replace dots at end of word
1717
if not line[0].endswith("."):
18-
regex = regex + "\\b"
18+
regex += "\\b"
1919

2020
pattern = re.compile(regex)
2121
item = pattern.sub(line[1], item)
@@ -38,18 +38,16 @@ def _get_data_from_store_or_reload(global_state: dict[str, Any] | None, preproce
3838

3939
def _prepare_pre_processor_data(preprocessor_data: str) -> list[list[str]]:
4040
lines: list[list[str]] = [
41-
[s.strip() for i, s in enumerate(line.split(",")) if (i == 2 and re.compile(r"^[0-9\s]+$").match(s)) or i < 2]
41+
[s.strip() for i, s in enumerate(line.split(",")) if (i == 2 and re.compile(r"^[0-9\s]+$").match(s)) or i < 2] # noqa: PLR2004
4242
for line in preprocessor_data.splitlines()
43-
if line.count(",") == 2
43+
if line.count(",") == 2 # noqa: PLR2004
4444
]
45-
lines = [line for line in lines if len(line) == 3]
45+
lines = [line for line in lines if len(line) == 3] # noqa: PLR2004
4646

4747
i: int = 0
4848
while i < len(lines):
4949
lines[i][2] = int(lines[i][2]) # type: ignore
5050
i += 1
5151

52-
# sort
53-
lines = sorted(lines, key=lambda f: 0 - f[2]) # type: ignore
54-
55-
return lines
52+
# sort and return
53+
return sorted(lines, key=lambda f: 0 - f[2]) # type: ignore
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
from .Pipeline import Pipeline
1+
from .Pipeline import Pipeline as Pipeline

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "ai-data-preprocessing-queue"
3-
version = "1.7.1"
3+
version = "1.7.2"
44
description = "A collection of different text processing steps that can be enabled or disabled dynamically."
55
authors = ["KI-Team"]
66
license = "MIT"

requirements-dev.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -5,4 +5,5 @@ mypy
55
parameterized
66
pytest
77
pytest-cov
8+
ruff
89

0 commit comments

Comments
 (0)