|
1 | 1 | import re |
| 2 | +import unicodedata |
2 | 3 |
|
3 | 4 | import pandas as pd |
4 | 5 |
|
5 | 6 | from src.utils.logger import get_logger |
6 | 7 |
|
7 | 8 | logger = get_logger(__name__) |
8 | 9 |
|
# NOTE: all imports are kept at module top level (avoids per-call import cost).
# --------------------------------------------------
# Helpers
# --------------------------------------------------
9 | 14 |
|
10 | | -def clean_generic_name(name: str) -> str: |
11 | | - """Normalize drug names""" |
12 | 15 |
|
13 | | - if pd.isna(name): |
def remove_garbage(text: str) -> str:
    """
    Remove encoding artifacts and strange unicode characters.

    Steps: NFKD-normalize (expands ligatures like 'ﬁ' -> 'fi'), drop
    combining marks so accented letters collapse to their base letter,
    replace any remaining disallowed character with a space, and collapse
    runs of whitespace.

    Returns "" for NaN/None or blank input.
    """
    if pd.isna(text) or str(text).strip() == "":
        return ""

    text = str(text)

    # 1. Normalize Unicode. NFKD also splits accented letters into
    #    base letter + combining mark (e.g. 'é' -> 'e' + U+0301).
    text = unicodedata.normalize("NFKD", text)

    # 2. Drop combining marks entirely. Previously the catch-all regex
    #    below replaced them with a SPACE, splitting words in half:
    #    'naïve' became 'nai ve'. Removing the marks yields 'naive'.
    text = "".join(ch for ch in text if not unicodedata.combining(ch))

    # 3. Comprehensive cleaning.
    # Added : ; % to the allowed list as they appear in dosages
    text = re.sub(r"[^a-zA-Z0-9\s.,/()+\-|:;%]", " ", text)

    # 4. Collapse multiple spaces/newlines
    text = re.sub(r"\s+", " ", text).strip()

    return text
25 | 36 |
|
26 | | - if pd.isna(dosage): |
27 | | - return "" |
28 | 37 |
|
29 | | - dosage = str(dosage) |
def contains_garbage(text: str) -> bool:
    """
    Detect suspicious characters.
    Note: Run this BEFORE remove_garbage if you want to log original artifacts.
    """
    # Missing values can never contain artifacts.
    if pd.isna(text):
        return False
    # "Garbage" == any character outside the basic printable ASCII
    # range 0x20-0x7E (includes tabs, newlines, accents, mojibake).
    return re.search(r"[^\x20-\x7E]", str(text)) is not None
30 | 47 |
|
31 | | - # normalize separators |
32 | | - dosage = dosage.replace("|", ";") |
33 | 48 |
|
34 | | - # normalize whitespace |
35 | | - dosage = re.sub(r"\s+", " ", dosage) |
| 49 | +# -------------------------------------------------- |
| 50 | +# Cleaning Functions |
| 51 | +# -------------------------------------------------- |
36 | 52 |
|
37 | | - # normalize units |
38 | | - dosage = dosage.replace(" mg", "mg") |
39 | | - dosage = dosage.replace(" ml", "ml") |
40 | 53 |
|
41 | | - # remove duplicate separators |
42 | | - dosage = re.sub(r";\s*;", ";", dosage) |
def clean_generic_name(name: str) -> str:
    """Normalize drug generic names to a cleaned, lower-case form."""
    # remove_garbage already handles NaN, artifacts and whitespace
    # collapsing; only the case-fold is added here.
    return remove_garbage(name).lower()
43 | 59 |
|
44 | | - return dosage.strip() |
45 | 60 |
|
def clean_dosage(dosage: str) -> str:
    """
    Normalize dosage formatting.

    - Unifies '|' separators to ';'
    - Glues numbers to their units ('500 mg' -> '500mg')
    - Collapses runs of duplicate separators and trims stray ones

    Returns "" for NaN/None or blank input (via remove_garbage).
    """
    dosage = remove_garbage(dosage)

    # Normalize separators: ensure consistency for downstream splitting
    dosage = dosage.replace("|", ";")

    # Normalize units: ensure no space between number and unit (e.g., 500mg).
    # The trailing \b prevents gluing a unit prefix of a full word:
    # without it, '500 grams' collapsed to '500grams'.
    dosage = re.sub(
        r"(\d+)\s+(mg|ml|mcg|g|l)\b", r"\1\2", dosage, flags=re.IGNORECASE
    )

    # Collapse RUNS of separators: 'a;;;b' -> 'a;b'. The previous
    # single-pass pattern ';\s*;' left 'a;;b' behind for triples.
    dosage = re.sub(r";(?:\s*;)+", ";", dosage)

    return dosage.strip().strip(";")
52 | 75 |
|
53 | | - category = str(category) |
54 | 76 |
|
55 | | - match = re.match(r"([A-Z]{2}\.\d{3})\s*(.*)", category) |
| 77 | +# -------------------------------------------------- |
| 78 | +# Category Processing |
| 79 | +# -------------------------------------------------- |
56 | 80 |
|
57 | | - if match: |
58 | | - return match.group(1), match.group(2).lower().strip() |
59 | 81 |
|
60 | | - return category, "" |
def split_category(category: str):
    """
    Split a category cell into (code, name).
    Example: AI.102 .Cephalosporins -> AI.102, cephalosporins

    Missing/blank input maps to ("UNK.000", "unknown"); unparseable
    input maps to ("UNK.000", <cleaned lower-cased text>).
    """
    if pd.isna(category) or str(category).strip() == "":
        return "UNK.000", "unknown"

    category = str(category).strip()

    # Multi-level codes with optional 'ghost' spaces, e.g. 'AI . 102 . 3'
    match = re.match(r"^([A-Z]{2}(?:\s*\.\s*\d+)+)\s*\.?\s*(.*)", category)

    if match:
        code = re.sub(r"\s+", "", match.group(1))  # Clean 'AI . 102' -> 'AI.102'
        name = remove_garbage(match.group(2)).lower()
        return code, name

    # Consistency fix: run the fallback name through remove_garbage too,
    # matching the treatment of names in the matched branch above.
    return "UNK.000", remove_garbage(category).lower()
65 | 101 |
|
66 | | - s_no = str(s_no).replace(".", "").strip() |
67 | 102 |
|
68 | | - return f"{category_code}-{s_no.zfill(3)}" |
| 103 | +# -------------------------------------------------- |
| 104 | +# Main DataFrame Cleaning |
| 105 | +# -------------------------------------------------- |
69 | 106 |
|
70 | 107 |
|
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and normalize extracted drug dataset.

    Expects input columns: category, generic_name, dosage, s_no
    (assumed from attribute access below — TODO confirm against the
    extraction step).

    Returns a deduplicated frame with columns:
    drug_id, generic_name, dosage, category_code, category_name.
    """
    logger.info("[CLEANING] Starting dataframe cleaning")

    out_columns = [
        "drug_id",
        "generic_name",
        "dosage",
        "category_code",
        "category_name",
    ]
    cleaned_rows = []
    garbage_count = 0

    # itertuples is faster than iterrows; attribute access requires the
    # expected column names to be valid Python identifiers.
    for row in df.itertuples(index=False):

        # Check for garbage before we clean it away
        if contains_garbage(row.generic_name) or contains_garbage(row.dosage):
            garbage_count += 1

        cat_code, cat_name = split_category(row.category)

        # Pre-clean generic and dosage
        gen_name = clean_generic_name(row.generic_name)
        dos_val = clean_dosage(row.dosage)

        # Skip rows with no generic name (invalid data)
        if not gen_name:
            continue

        # Generate unique ID: keep only digits of s_no ('1.' -> '1'),
        # zero-pad to 3 ('001'); rows without any digits get '000'.
        digits = re.sub(r"\D", "", str(row.s_no))
        sno_str = digits.zfill(3) if digits else "000"
        drug_id = f"{cat_code}-{sno_str}"

        cleaned_rows.append(
            {
                "drug_id": drug_id,
                "generic_name": gen_name,
                "dosage": dos_val,
                "category_code": cat_code,
                "category_name": cat_name,
            }
        )

    if not cleaned_rows:
        logger.error("[CLEANING] No rows remained after cleaning!")
        # Return an empty frame WITH the expected schema so downstream
        # column selection does not raise KeyError on all-bad input
        # (a bare pd.DataFrame() has no columns at all).
        return pd.DataFrame(columns=out_columns)

    clean_df = pd.DataFrame(cleaned_rows)

    # Deduplication on (generic_name, dosage), keeping first occurrence.
    before = len(clean_df)
    clean_df = clean_df.drop_duplicates(subset=["generic_name", "dosage"])
    after = len(clean_df)

    logger.info(
        f"[CLEANING] Success. Total: {after} | Removed: {before - after} | Garbage items fixed: {garbage_count}"
    )

    return clean_df.reset_index(drop=True)
0 commit comments