Skip to content

Commit 6420d0d

Browse files
committed
fix: Data_pipeline: removing special characters
1 parent ba372b6 commit 6420d0d

1 file changed

Lines changed: 109 additions & 51 deletions

File tree

src/processing/cleaner.py

Lines changed: 109 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,106 +1,164 @@
11
import re
2+
import unicodedata
23

34
import pandas as pd
45

56
from src.utils.logger import get_logger
67

78
logger = get_logger(__name__)
89

10+
# Move imports to top level for performance
11+
# --------------------------------------------------
12+
# Helpers
13+
# --------------------------------------------------
914

10-
def clean_generic_name(name: str) -> str:
11-
"""Normalize drug names"""
1215

13-
if pd.isna(name):
16+
def remove_garbage(text: str) -> str:
    """
    Remove encoding artifacts and strange unicode characters.

    Steps:
      1. NFKD-normalize so compatibility forms decompose (e.g. the 'ﬁ'
         ligature becomes plain 'f' + 'i').
      2. Drop the combining marks NFKD leaves behind, so accented letters
         collapse to their base letter ('naïve' -> 'naive') instead of
         being split by the whitelist pass below ('nai ve').
      3. Replace any remaining disallowed character with a space and
         collapse runs of whitespace.

    Returns "" for NaN/None or blank input.
    """
    if pd.isna(text) or str(text).strip() == "":
        return ""

    text = str(text)

    # 1. Normalize Unicode (fixes ligatures like 'ﬁ' -> 'fi')
    text = unicodedata.normalize("NFKD", text)

    # 2. Strip combining marks; letting the whitelist below turn them
    #    into spaces would split words around every diacritic.
    text = "".join(ch for ch in text if not unicodedata.combining(ch))

    # 3. Comprehensive cleaning.
    #    : ; % are in the allowed list because they appear in dosages.
    text = re.sub(r"[^a-zA-Z0-9\s.,/()+\-|:;%]", " ", text)

    # 4. Collapse multiple spaces/newlines
    text = re.sub(r"\s+", " ", text).strip()

    return text
2536

26-
if pd.isna(dosage):
27-
return ""
2837

29-
dosage = str(dosage)
38+
def contains_garbage(text: str) -> bool:
    """
    Report whether *text* contains suspicious (non printable-ASCII) characters.

    Note: Run this BEFORE remove_garbage if you want to log the original
    artifacts, since cleaning erases them.
    """
    if pd.isna(text):
        return False

    # Anything outside the printable ASCII range 0x20-0x7E is suspect.
    suspicious = re.search(r"[^\x20-\x7E]", str(text))
    return suspicious is not None
3047

31-
# normalize separators
32-
dosage = dosage.replace("|", ";")
3348

34-
# normalize whitespace
35-
dosage = re.sub(r"\s+", " ", dosage)
49+
# --------------------------------------------------
50+
# Cleaning Functions
51+
# --------------------------------------------------
3652

37-
# normalize units
38-
dosage = dosage.replace(" mg", "mg")
39-
dosage = dosage.replace(" ml", "ml")
4053

41-
# remove duplicate separators
42-
dosage = re.sub(r";\s*;", ";", dosage)
54+
def clean_generic_name(name: str) -> str:
    """Normalize a drug generic name to cleaned, lower-case form."""
    # remove_garbage already handles NaN, artifact removal and whitespace
    # collapsing; lower-casing is the only extra step needed here.
    return remove_garbage(name).lower()
4359

44-
return dosage.strip()
4560

61+
def clean_dosage(dosage: str) -> str:
    """
    Normalize dosage formatting.

    - strips encoding artifacts via remove_garbage
    - unifies the separator character to ';'
    - glues numbers to their units ('500 mg' -> '500mg')
    - removes duplicated and dangling separators
    """
    dosage = remove_garbage(dosage)

    # Normalize separators: ensure consistency for downstream splitting
    dosage = dosage.replace("|", ";")

    # Normalize units: ensure no space between number and unit (e.g. 500mg).
    # The trailing \b keeps whole words intact: '10 grams' is NOT collapsed
    # to '10grams' by a spurious 'g' match.
    dosage = re.sub(
        r"(\d+)\s+(mg|ml|mcg|g|l)\b", r"\1\2", dosage, flags=re.IGNORECASE
    )

    # Remove duplicate separators, then trim stray whitespace/semicolons.
    dosage = re.sub(r";\s*;", ";", dosage)
    return dosage.strip().strip(";")
5275

53-
category = str(category)
5476

55-
match = re.match(r"([A-Z]{2}\.\d{3})\s*(.*)", category)
77+
# --------------------------------------------------
78+
# Category Processing
79+
# --------------------------------------------------
5680

57-
if match:
58-
return match.group(1), match.group(2).lower().strip()
5981

60-
return category, ""
82+
def split_category(category: str):
    """
    Split a raw category string into (code, name).

    Example: AI.102 .Cephalosporins -> AI.102, cephalosporins
    Unparseable or empty input falls back to the 'UNK.000' sentinel.
    """
    if pd.isna(category) or str(category).strip() == "":
        return "UNK.000", "unknown"

    category = str(category).strip()

    # Accepts multi-level codes and ghost spaces, e.g. 'AI . 102 . 3'.
    m = re.match(r"^([A-Z]{2}(?:\s*\.\s*\d+)+)\s*\.?\s*(.*)", category)
    if m is None:
        return "UNK.000", category.lower()

    # Squeeze ghost spaces out of the code: 'AI . 102' -> 'AI.102'.
    code = re.sub(r"\s+", "", m.group(1))
    name = remove_garbage(m.group(2)).lower()
    return code, name
65101

66-
s_no = str(s_no).replace(".", "").strip()
67102

68-
return f"{category_code}-{s_no.zfill(3)}"
103+
# --------------------------------------------------
104+
# Main DataFrame Cleaning
105+
# --------------------------------------------------
69106

70107

71108
def _build_drug_id(category_code: str, s_no) -> str:
    """Build a stable drug id like 'AI.102-007' from a code and serial number."""
    # Keep digits only so a serial like '1.' does not become '001.'.
    digits = re.sub(r"\D", "", str(s_no))
    return f"{category_code}-{digits.zfill(3) if digits else '000'}"


def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and normalize an extracted drug dataset.

    Expects columns: category, generic_name, dosage, s_no (names must be
    valid identifiers for itertuples attribute access).
    Returns a deduplicated frame with columns: drug_id, generic_name,
    dosage, category_code, category_name. Rows whose generic name is
    empty after cleaning are dropped.
    """
    logger.info("[CLEANING] Starting dataframe cleaning")

    columns = ["drug_id", "generic_name", "dosage", "category_code", "category_name"]
    cleaned_rows = []
    garbage_count = 0

    # itertuples is faster than iterrows, but we need to handle column access safely
    for row in df.itertuples(index=False):
        # Check for garbage before we clean it away
        if contains_garbage(row.generic_name) or contains_garbage(row.dosage):
            garbage_count += 1

        cat_code, cat_name = split_category(row.category)
        gen_name = clean_generic_name(row.generic_name)
        dos_val = clean_dosage(row.dosage)

        # Skip rows with no generic name (invalid data)
        if not gen_name:
            continue

        cleaned_rows.append(
            {
                "drug_id": _build_drug_id(cat_code, row.s_no),
                "generic_name": gen_name,
                "dosage": dos_val,
                "category_code": cat_code,
                "category_name": cat_name,
            }
        )

    if not cleaned_rows:
        logger.error("[CLEANING] No rows remained after cleaning!")
        # Preserve the schema so downstream column access still works.
        return pd.DataFrame(columns=columns)

    clean_df = pd.DataFrame(cleaned_rows)

    # Deduplication (keeps the first occurrence of each name/dosage pair).
    before = len(clean_df)
    clean_df = clean_df.drop_duplicates(subset=["generic_name", "dosage"])
    after = len(clean_df)

    # Lazy %-style args skip string formatting when INFO is disabled.
    logger.info(
        "[CLEANING] Success. Total: %d | Removed: %d | Garbage items fixed: %d",
        after,
        before - after,
        garbage_count,
    )

    return clean_df.reset_index(drop=True)

0 commit comments

Comments
 (0)