Skip to content

Commit 6420d0d

Browse files
committed
fix: Data_pipeline: removing special characters
1 parent ba372b6 commit 6420d0d

1 file changed

Lines changed: 109 additions & 51 deletions

File tree

src/processing/cleaner.py

Lines changed: 109 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,106 +1,164 @@
11
import re
2+
import unicodedata
23

34
import pandas as pd
45

56
from src.utils.logger import get_logger
67

78
logger = get_logger(__name__)
89

10+
# Move imports to top level for performance
11+
# --------------------------------------------------
12+
# Helpers
13+
# --------------------------------------------------
914

10-
def clean_generic_name(name: str) -> str:
11-
"""Normalize drug names"""
1215

13-
if pd.isna(name):
16+
def remove_garbage(text: str) -> str:
    """
    Remove encoding artifacts and strange unicode characters.

    Steps:
      1. NFKD-normalize so compatibility forms decompose (e.g. the 'ﬁ'
         ligature becomes plain 'f' + 'i').
      2. Drop the combining marks NFKD leaves behind, so accented letters
         collapse to their base letter ('naïve' -> 'naive') instead of
         being split by the whitelist pass below ('nai ve').
      3. Replace any remaining disallowed character with a space and
         collapse runs of whitespace.

    Returns "" for NaN/None or blank input.
    """
    if pd.isna(text) or str(text).strip() == "":
        return ""

    text = str(text)

    # 1. Normalize Unicode (fixes ligatures like 'ﬁ' -> 'fi')
    text = unicodedata.normalize("NFKD", text)

    # 2. Strip combining marks; letting the whitelist below turn them
    #    into spaces would split words around every diacritic.
    text = "".join(ch for ch in text if not unicodedata.combining(ch))

    # 3. Comprehensive cleaning.
    #    : ; % are in the allowed list because they appear in dosages.
    text = re.sub(r"[^a-zA-Z0-9\s.,/()+\-|:;%]", " ", text)

    # 4. Collapse multiple spaces/newlines
    text = re.sub(r"\s+", " ", text).strip()

    return text
2536

26-
if pd.isna(dosage):
27-
return ""
2837

29-
dosage = str(dosage)
38+
def contains_garbage(text: str) -> bool:
    """
    Report whether *text* contains suspicious (non printable-ASCII) characters.

    Note: Run this BEFORE remove_garbage if you want to log the original
    artifacts, since cleaning erases them.
    """
    if pd.isna(text):
        return False

    # Anything outside the printable ASCII range 0x20-0x7E is suspect.
    suspicious = re.search(r"[^\x20-\x7E]", str(text))
    return suspicious is not None
3047

31-
# normalize separators
32-
dosage = dosage.replace("|", ";")
3348

34-
# normalize whitespace
35-
dosage = re.sub(r"\s+", " ", dosage)
49+
# --------------------------------------------------
50+
# Cleaning Functions
51+
# --------------------------------------------------
3652

37-
# normalize units
38-
dosage = dosage.replace(" mg", "mg")
39-
dosage = dosage.replace(" ml", "ml")
4053

41-
# remove duplicate separators
42-
dosage = re.sub(r";\s*;", ";", dosage)
54+
def clean_generic_name(name: str) -> str:
    """Normalize a drug generic name to cleaned, lower-case form."""
    # remove_garbage already handles NaN, artifact removal and whitespace
    # collapsing; lower-casing is the only extra step needed here.
    return remove_garbage(name).lower()
4359

44-
return dosage.strip()
4560

61+
def clean_dosage(dosage: str) -> str:
    """
    Normalize dosage formatting.

    - strips encoding artifacts via remove_garbage
    - unifies the separator character to ';'
    - glues numbers to their units ('500 mg' -> '500mg')
    - removes duplicated and dangling separators
    """
    dosage = remove_garbage(dosage)

    # Normalize separators: ensure consistency for downstream splitting
    dosage = dosage.replace("|", ";")

    # Normalize units: ensure no space between number and unit (e.g. 500mg).
    # The trailing \b keeps whole words intact: '10 grams' is NOT collapsed
    # to '10grams' by a spurious 'g' match.
    dosage = re.sub(
        r"(\d+)\s+(mg|ml|mcg|g|l)\b", r"\1\2", dosage, flags=re.IGNORECASE
    )

    # Remove duplicate separators, then trim stray whitespace/semicolons.
    dosage = re.sub(r";\s*;", ";", dosage)
    return dosage.strip().strip(";")
5275

53-
category = str(category)
5476

55-
match = re.match(r"([A-Z]{2}\.\d{3})\s*(.*)", category)
77+
# --------------------------------------------------
78+
# Category Processing
79+
# --------------------------------------------------
5680

57-
if match:
58-
return match.group(1), match.group(2).lower().strip()
5981

60-
return category, ""
82+
def split_category(category: str):
    """
    Split a raw category string into (code, name).

    Example: AI.102 .Cephalosporins -> AI.102, cephalosporins
    Unparseable or empty input falls back to the 'UNK.000' sentinel.
    """
    if pd.isna(category) or str(category).strip() == "":
        return "UNK.000", "unknown"

    category = str(category).strip()

    # Accepts multi-level codes and ghost spaces, e.g. 'AI . 102 . 3'.
    m = re.match(r"^([A-Z]{2}(?:\s*\.\s*\d+)+)\s*\.?\s*(.*)", category)
    if m is None:
        return "UNK.000", category.lower()

    # Squeeze ghost spaces out of the code: 'AI . 102' -> 'AI.102'.
    code = re.sub(r"\s+", "", m.group(1))
    name = remove_garbage(m.group(2)).lower()
    return code, name
65101

66-
s_no = str(s_no).replace(".", "").strip()
67102

68-
return f"{category_code}-{s_no.zfill(3)}"
103+
# --------------------------------------------------
104+
# Main DataFrame Cleaning
105+
# --------------------------------------------------
69106

70107

71108
def _build_drug_id(category_code: str, s_no) -> str:
    """Build a stable drug id like 'AI.102-007' from a code and serial number."""
    # Keep digits only so a serial like '1.' does not become '001.'.
    digits = re.sub(r"\D", "", str(s_no))
    return f"{category_code}-{digits.zfill(3) if digits else '000'}"


def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and normalize an extracted drug dataset.

    Expects columns: category, generic_name, dosage, s_no (names must be
    valid identifiers for itertuples attribute access).
    Returns a deduplicated frame with columns: drug_id, generic_name,
    dosage, category_code, category_name. Rows whose generic name is
    empty after cleaning are dropped.
    """
    logger.info("[CLEANING] Starting dataframe cleaning")

    columns = ["drug_id", "generic_name", "dosage", "category_code", "category_name"]
    cleaned_rows = []
    garbage_count = 0

    # itertuples is faster than iterrows, but we need to handle column access safely
    for row in df.itertuples(index=False):
        # Check for garbage before we clean it away
        if contains_garbage(row.generic_name) or contains_garbage(row.dosage):
            garbage_count += 1

        cat_code, cat_name = split_category(row.category)
        gen_name = clean_generic_name(row.generic_name)
        dos_val = clean_dosage(row.dosage)

        # Skip rows with no generic name (invalid data)
        if not gen_name:
            continue

        cleaned_rows.append(
            {
                "drug_id": _build_drug_id(cat_code, row.s_no),
                "generic_name": gen_name,
                "dosage": dos_val,
                "category_code": cat_code,
                "category_name": cat_name,
            }
        )

    if not cleaned_rows:
        logger.error("[CLEANING] No rows remained after cleaning!")
        # Preserve the schema so downstream column access still works.
        return pd.DataFrame(columns=columns)

    clean_df = pd.DataFrame(cleaned_rows)

    # Deduplication (keeps the first occurrence of each name/dosage pair).
    before = len(clean_df)
    clean_df = clean_df.drop_duplicates(subset=["generic_name", "dosage"])
    after = len(clean_df)

    # Lazy %-style args skip string formatting when INFO is disabled.
    logger.info(
        "[CLEANING] Success. Total: %d | Removed: %d | Garbage items fixed: %d",
        after,
        before - after,
        garbage_count,
    )

    return clean_df.reset_index(drop=True)

0 commit comments

Comments
 (0)