def remove_newline(text: str) -> str:
    """Collapse every run of whitespace in *text* into a single space.

    Newlines, carriage returns, tabs and repeated spaces all count as
    whitespace. Leading and trailing whitespace is dropped.

    Args:
        text: The raw text to normalise.

    Returns:
        The text with each whitespace run replaced by exactly one space and
        nothing but non-whitespace characters at either end.
    """
    # One `\s+` pass is sufficient. The former pre-pass over
    # `\s{2,}|[\n\r]{3,}` only shortened runs that this substitution
    # collapses anyway, and its second alternative could never match
    # because `\s{2,}` already covers any run of newlines.
    return re.sub(r"\s+", " ", text).strip()
# Closing phrases in English and German. The entries are joined verbatim
# into `greetings_regex`, so each one must be a valid regex fragment, and
# their order is the order in which the alternation tries them.
GreetingExpressions = [
    # English
    "sincerely", "best regards", "happy holidays", "kind regards",
    "warm regards", "cheers", "regards",
    # German, spelled with "ß"
    "mit freundlichen grüßen", "freundliche grüße", "beste grüße",
    "viele grüße", "herzliche grüße", "liebe grüße",
    # German, spelled with "ss"
    "mit freundlichen grüssen", "freundliche grüsse", "beste grüsse",
    "viele grüsse", "herzliche grüsse", "liebe grüsse",
]

# One closing phrase (capture group 1), then an optional comma surrounded by
# optional whitespace.
greetings_regex = rf"({'|'.join(GreetingExpressions)})\s*,?\s*"
1934
2035
@@ -26,15 +41,15 @@ def remove_greetings_and_following_text(text: str) -> str:
# Thank-you expressions are removed after the greetings and the signature
# text that follows them, because they often appear at the beginning of a
# message.
#
# Each entry is a regex fragment; the fragments are joined into a single
# alternation, which tries them from first to last. A longer phrase must
# therefore come before any shorter phrase that is a prefix of it.
THANK_EXPRESSIONS = [
    r"thank you(?: very much)?",  # thank you, thank you very much
    r"thankyou(?: very much)?",  # thankyou, thankyou very much
    r"thanks(?: a lot| again)?",  # thanks, thanks a lot, thanks again
    r"many thanks",  # many thanks
    r"a thousand thanks",  # a thousand thanks
    # "dankeschön" has to precede the "danke" entry: otherwise "danke" wins
    # on the shared prefix and "schön" is left behind in the text.
    r"dankeschön",  # dankeschön
    r"danke(?: schön)?",  # danke, danke schön
    r"vielen dank",  # vielen dank
    r"besten dank",  # besten dank
]
3954
4055# Suffixes which could follow thank you expressions
@@ -43,15 +58,13 @@ def remove_greetings_and_following_text(text: str) -> str:
4358 r"(?:for (?:your|the) (?:help|support|understanding|assistance))" ,
4459 r"(?:schon mal\s+)?(?:im voraus\s+)?für\s+(?:ihre|ihr|eure|die|den)\s+(?:hilfe|support|verständnis)" ,
4560 r"vorab" ,
46- r"kindly?"
61+ r"kindly?" ,
4762]
4863
# Assemble the thank-you pattern in three steps and compile it once.
# 1) Alternation over every thank-you expression.
thank_expressions = "|".join(THANK_EXPRESSIONS)
# 2) Optional suffix: whitespace followed by one of the known suffixes.
suffixes = rf"(?:\s+(?:{'|'.join(THANK_SUFFIXES)}))?"
# 3) Word boundary, expression, optional suffix, then at most one of
#    `,` `.` `!` `;` with optional whitespace on both sides.
final_pattern = rf"\b(?:{thank_expressions}){suffixes}\s*(?:,|\.|!|;)?\s*"
thanking_regex = re.compile(final_pattern, re.IGNORECASE | re.UNICODE)
5669
5770
0 commit comments