-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_dataset.py
More file actions
244 lines (203 loc) · 15.5 KB
/
generate_dataset.py
File metadata and controls
244 lines (203 loc) · 15.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import pandas as pd
def generate_variations(base_name: str) -> set:
    """Return the set of naming-convention variants of ``base_name``.

    ``base_name`` is split on underscores and re-joined in several common
    column-naming styles. Variants that happen to coincide (for example the
    snake, kebab and concatenated forms of a single-word name) collapse
    because the result is a set.

    Args:
        base_name: Field name whose words are separated by ``_``.

    Returns:
        A set containing the original spelling plus its lower/upper snake
        case, PascalCase, camelCase, lower kebab-case and lower/upper
        concatenated (no separator) forms.
    """
    words = base_name.split("_")
    # str.capitalize() upper-cases the first character and lower-cases the
    # rest, so "SSN" becomes "Ssn" in the Pascal/camel forms.
    capitalized = [word.capitalize() for word in words]
    concatenated = "".join(words)

    return {
        # Original spelling, exactly as given
        base_name,
        # Snake case (lower & upper); re-joining the split words on "_"
        # reproduces base_name, so it can be cased directly
        base_name.lower(),
        base_name.upper(),
        # Pascal case
        "".join(capitalized),
        # Camel case: first word fully lower-cased, the rest capitalized
        words[0].lower() + "".join(capitalized[1:]),
        # Kebab case
        "-".join(words).lower(),
        # No separators (concatenated)
        concatenated.lower(),
        concatenated.upper(),
    }
# Base field names labelled as HIPAA-sensitive (label 1 in the generated dataset).
# Every entry must be followed by a comma: Python silently concatenates adjacent
# string literals, so a missing comma fuses two names into one bogus entry.
hipaa_field_names = [
    # Name Variations
    "first_name", "last_name", "middle_name", "full_name", "given_name", "surname", "alias_name",
    "nick_name", "initials", "maiden_name", "patient_name", "legal_name", "preferred_name",
    "registered_name", "official_name",
    # Date of Birth Variations
    "dob", "date_of_birth", "birthdate", "birth_date", "DATEOFBIRTH", "Birth_Age", "birth_year",
    "birth_month", "date_of_birth_year", "DOB_YMD", "dob_day", "date_of_birth_full", "year_of_birth",
    "day_of_birth", "birth_details",
    # SSN Variations
    "ssn", "social_security_number", "SSN_HASHED", "encrypted_ssn", "ssn_number", "SSN_ENCRYPTED",
    "ssn_last4", "ssn_partial", "masked_ssn", "national_ssn",
    # Address Variations
    "home_address", "residential_address", "mailing_address", "current_address", "permanent_address",
    "street_address", "house_address", "physical_address", "registered_address", "patient_address",
    "billing_address", "delivery_address", "home_location", "residence_location",
    # Phone Variations
    "phone_number", "home_phone", "work_phone", "mobile_number", "cell_phone", "contact_number",
    "patient_phone", "emergency_phone", "alternate_phone", "fax_number", "guardian_phone",
    # Email Variations
    "email_address", "personal_email", "work_email", "user_email", "primary_email", "email_id",
    "contact_email", "patient_email", "guardian_email", "alternate_email", "notification_email",
    # Patient ID Variations
    "patient_id", "patient_identifier", "patient_number", "patient_uid", "patient_record_id",
    "medical_record_id", "health_record_id", "patient_reference", "individual_patient_id",
    # Medical Record Number Variations
    "medical_record_number", "MRN", "record_number", "health_record_id", "patient_record_number",
    "hospital_record_number", "case_file_number", "admission_record_id", "clinic_record_id",
    # Insurance ID Variations
    "insurance_id", "policy_number", "health_insurance_number", "medical_policy_id", "insurance_plan_id",
    "payer_policy_number", "coverage_id", "insurance_card_number", "group_policy_id", "subscriber_id",
    # Credit Card Variations
    "credit_card_number", "debit_card_number", "card_number", "masked_card_number", "encrypted_card_number",
    "hashed_card_number", "tokenized_card_number", "card_expiration_date", "card_expiry", "card_expiry_month",
    "card_expiry_year", "card_cvv", "cvv_code", "cvv2", "card_security_code", "card_verification_value",
    "cardholder_name", "cardholder_address", "billing_name", "billing_address", "billing_street", "billing_city",
    "billing_state", "billing_zip", "billing_postal_code", "billing_country", "billing_phone", "billing_email",
    "payer_name", "payer_email", "payer_id", "payment_amount", "payment_currency", "payment_date", "payment_status",
    "payment_reference", "payment_token", "payment_authorization", "payment_processor", "payment_gateway",
    "payment_method", "transaction_id", "transaction_reference", "transaction_status", "transaction_amount",
    "transaction_currency", "transaction_date", "transaction_auth_code", "transaction_settlement_date",
    "transaction_fees", "merchant_id", "merchant_account_id", "merchant_reference", "charge_id", "refund_id",
    "settlement_id", "reversal_id", "dispute_id", "invoice_id", "receipt_id", "order_id", "subscription_id",
    "recurring_payment_id", "statement_descriptor", "authorization_code", "authorization_expiry", "tokenization_status",
    "fraud_detection_status", "risk_assessment_score", "stripe_card_number", "stripe_card_token",
    "stripe_card_fingerprint", "stripe_card_last4", "stripe_card_expiration", "stripe_card_expiry_month",
    "stripe_card_expiry_year", "stripe_card_brand", "stripe_card_country", "stripe_payment_method",
    "stripe_payment_intent", "stripe_payment_status", "stripe_payment_currency", "stripe_payment_amount",
    "stripe_transaction_id", "stripe_charge_id", "stripe_customer_id", "stripe_session_id", "stripe_billing_details",
    "stripe_invoice_id", "stripe_refund_id", "stripe_balance_transaction", "stripe_dispute_id", "stripe_setup_intent",
    "stripe_subscription_id", "stripe_subscription_status", "stripe_subscription_start_date",
    "stripe_subscription_end_date", "stripe_subscription_interval", "stripe_subscription_plan",
    "stripe_subscription_trial_end", "stripe_payout_id", "paypal_email", "paypal_payer_id", "paypal_payer_status",
    "paypal_transaction_id", "paypal_transaction_status", "paypal_payment_id", "paypal_payment_token",
    "paypal_payment_status", "paypal_payment_currency", "paypal_payment_amount", "paypal_payment_date",
    "paypal_payment_reference", "paypal_billing_agreement", "paypal_billing_plan_id", "paypal_order_id",
    "paypal_invoice_id", "paypal_subscription_id", "paypal_subscription_plan", "paypal_subscription_status",
    "paypal_subscription_start_date", "paypal_subscription_end_date", "paypal_subscription_interval",
    "paypal_refund_id", "paypal_merchant_id", "paypal_settlement_id", "paypal_reversal_id", "paypal_dispute_id",
    "paypal_payout_batch_id", "paypal_payout_id", "braintree_customer_id", "braintree_payment_method",
    "braintree_transaction_id", "braintree_transaction_status", "braintree_transaction_reference",
    "braintree_card_number", "braintree_card_last4", "braintree_card_expiration", "braintree_card_expiry_month",
    "braintree_card_expiry_year", "braintree_card_token", "braintree_payment_nonce", "braintree_payment_token",
    "braintree_payment_status", "braintree_payment_currency", "braintree_payment_amount", "braintree_subscription_id",
    "braintree_subscription_status", "braintree_subscription_start_date", "braintree_subscription_end_date",
    "braintree_subscription_plan", "braintree_subscription_interval", "braintree_merchant_account_id",
    "braintree_settlement_batch_id", "braintree_refund_id", "braintree_dispute_id", "square_card_nonce",
    "square_payment_id", "square_transaction_id", "square_transaction_status", "square_transaction_reference",
    "square_payment_status", "square_payment_currency", "square_payment_amount", "square_payment_date",
    "square_customer_id", "square_order_id", "square_invoice_id", "square_subscription_id", "square_subscription_plan",
    "square_subscription_status", "square_payout_id", "square_refund_id", "square_dispute_id",
    "adyen_payment_reference", "adyen_transaction_id", "adyen_transaction_status", "adyen_payment_status",
    "adyen_payment_currency", "adyen_payment_amount", "adyen_payment_date", "adyen_card_alias", "adyen_card_expiry",
    "adyen_card_last4", "adyen_card_brand", "adyen_psp_reference", "adyen_merchant_account", "adyen_settlement_id",
    "adyen_refund_id", "adyen_dispute_id", "authorize_net_transaction_id", "authorize_net_transaction_status",
    "authorize_net_payment_status", "authorize_net_payment_currency", "authorize_net_payment_amount",
    "authorize_net_payment_date", "authorize_net_card_number", "authorize_net_card_last4", "authorize_net_card_expiry",
    "authorize_net_refund_id", "authorize_net_settlement_id", "authorize_net_dispute_id", "worldpay_transaction_id",
    "worldpay_transaction_status", "worldpay_payment_status", "worldpay_payment_currency", "worldpay_payment_amount",
    "worldpay_payment_date", "worldpay_card_number", "worldpay_card_last4", "worldpay_card_expiry",
    "worldpay_payment_token", "worldpay_settlement_id", "worldpay_refund_id", "worldpay_dispute_id",
    # Banking Variations
    # NOTE(review): the next two names had been merged into the section comment
    # and were therefore never part of the list; restored on the assumption
    # that this was accidental.
    "bank_account_number", "account_number",
    "routing_number", "bank_details", "iban_number",
    "account_holder_name", "financial_account_number", "masked_account_number", "bank_identifier",
    # Diagnosis Variations
    "diagnosis_code", "diagnosis_description", "medical_condition", "patient_diagnosis", "health_diagnosis",
    "clinical_diagnosis", "diagnostic_code", "disease_code", "diagnosis_notes", "medical_findings",
    # Treatment Plan Variations
    "treatment_plan", "medical_treatment", "therapy_plan", "treatment_procedure", "treatment_guideline",
    "prescribed_treatment", "patient_treatment", "doctor_recommendation", "care_plan", "healthcare_plan",
    "allergy",
    # Blood Type Variations
    "blood_type", "blood_group", "patient_blood_type", "blood_classification", "ABO_group", "RH_factor",
    # Security & Access Control Variations
    "user_id", "login_id", "token", "security", "access_token", "session_token", "audit_log_id", "access_timestamp",
    "access_reason", "security_event_id", "breach_notification_status", "multi_factor_auth_enabled",
    "account_locked", "password", "password_hash", "encryption_status", "HIPAA_compliance_status",
    "authentication_token",
    "system_user_id", "access_logs", "authorization_token", "user_role", "security_question", "jwt",
    "username", "email", "password_salt", "encrypted_password", "hashed_password", "refresh_token", "session_token",
    "api_key", "secret_key",
    "oauth_token", "jwt_token", "auth_token", "mfa_token", "otp_code", "otp_secret", "two_factor_code", "multi_factor",
    "multi_factor_auth_enabled", "account_status", "failed_login_attempts", "login_attempts", "last_login",
    "login_timestamp", "security_answer",
    "recovery_email", "recovery_phone", "password_reset_token", "password_reset_expiration", "biometric_hash",
    "fingerprint_hash", "face_id", "iris_scan", "voice_auth",
]
# Base field names labelled as not HIPAA-sensitive (label 0 in the generated dataset).
non_hipaa_field_names = [
    # Generic record bookkeeping
    "record_id", "created_at", "updated_at", "modified_at", "deleted_at",
    "status", "active_status", "category", "tag", "notes",
    "comments", "description", "reference_code", "version",
    "approval_status", "approval_date", "document_type", "file_name",
    "file_path", "file_size", "file_extension", "attachment", "metadata",
    "tags", "priority", "rank", "rating", "feedback", "flag",
    "is_active", "is_deleted", "is_verified", "workflow_status",
    "pipeline_stage", "job_status", "error_code", "error_message",
    "response_status", "response_code", "operation_mode",
    "settings", "preferences", "timestamp",
    "created_by", "modified_by", "approved_by", "verified_by",

    # Products and inventory
    "product_id", "product_name", "product_category",
    "product_description", "product_sku", "product_barcode",
    "product_price", "product_quantity", "stock_level",
    "warehouse_location", "supplier_name", "supplier_id",
    "supply_chain_status", "shipment_status", "order_status",
    "inventory_count", "purchase_order_id",
    "reorder_level", "restock_date", "delivery_date",
    "shipment_tracking", "batch_number",
    "manufacturing_date", "expiry_date", "unit_price", "discount",
    "tax_rate", "tax_amount", "profit_margin",
    "cost_of_goods_sold", "sales_count", "return_rate",
    "refund_status", "refund_reason",

    # System and logging
    "log_id", "event_type", "event_details", "log_level",
    "log_timestamp", "log_source", "log_category",
    "log_message", "event_status", "action_performed",
    "error_logs", "warning_logs", "debug_message",
    "performance_metrics", "resource_usage", "cpu_usage",
    "memory_usage", "disk_space", "execution_time",
    "thread_id", "process_id", "request_id",
    "session_duration", "system_uptime", "server_name",
    "server_region", "api_endpoint", "request_method",
    "response_time", "status_code",

    # Geography and location
    "city_name", "state_name", "region", "country",
    "continent", "time_zone", "latitude", "longitude",
    "altitude", "geolocation_code", "weather_conditions",
    "climate_zone", "temperature", "humidity",
    "wind_speed", "air_quality", "location_id", "map_reference",
    "landmark", "route_name", "road_condition",

    # Organization and HR
    "department_id", "department_name", "team_name", "team_lead",
    "employee_id", "position", "role",
    "designation", "work_shift", "office_location",
    "branch_name", "branch_code", "payroll_status",
    "payroll_cycle", "leave_status", "attendance_status",
    "work_hours", "project_name", "project_id",
    "task_id", "task_status", "milestone",
    "project_budget", "operating_expense", "revenue_stream",
    "business_unit", "performance_rating", "bonus_eligibility",
    "promotion_status", "training_completion",

    # Finance and accounting
    "account_type", "financial_year", "budget",
    "expense_category", "expense_amount", "fund_source",
    "ledger_code", "revenue_growth", "operating_income",
    "profit_loss", "investment_type",
    "stock_market_trend", "portfolio_value", "dividend_yield",
    "asset_id", "asset_category",
    "asset_lifetime", "depreciation", "return_on_investment",
    "cost_savings", "payment_mode",
    "billing_cycle", "pricing_strategy", "profit_forecast",
    "economic_indicator",

    # Marketing and user engagement
    "campaign_id", "campaign_name", "ad_group",
    "advertisement_id", "conversion_rate",
    "click_through_rate", "impressions", "page_views",
    "email_open_rate", "bounce_rate",
    "user_feedback", "customer_satisfaction_score",
    "net_promoter_score", "brand_awareness",
    "market_trend", "consumer_preference",
    "survey_responses", "focus_group_feedback",
    "competitor_analysis", "advertising_cost", "customer_lifetime_value",

    # Technology and software development
    "feature_flag", "bug_id", "bug_status",
    "release_version", "sprint_number", "repository_name",
    "commit_id", "branch_name", "build_number",
    "ci_cd_status", "deployment_status",
    "integration_test_results", "unit_test_coverage",
    "code_review_status", "pull_request_id",
    "api_version", "webhook_event", "cron_job_status",
    "job_execution_time", "server_load",
    "api_call_count", "database_query_time",
    "cache_hit_ratio", "error_stack_trace",

    # Miscellaneous general-purpose fields
    "reference_number", "support_ticket_id", "customer_id",
    "user_experience_score", "support_case_status",
    "workflow_step", "escalation_status", "notification_status",
    "usage_statistics", "service_quality_score",
    "event_category", "configuration_settings", "feature_usage",
    "time_spent", "page_load_time",
    "interaction_count", "process_status", "feedback_status",
    "survey_completion_status", "resource_allocation",
]
# Expand every base field name into its naming-convention variants.
hipaa_variations = set()
for field in hipaa_field_names:
    hipaa_variations.update(generate_variations(field))

non_hipaa_variations = set()
for field in non_hipaa_field_names:
    non_hipaa_variations.update(generate_variations(field))

# A column name must not appear with both labels; if the two lists ever
# produce the same variant, keep only the sensitive (1) label.
non_hipaa_variations -= hipaa_variations

# Set iteration order for strings differs between interpreter runs (hash
# randomization), so sort before building the frames to get a stable row
# order going into the shuffle.
hipaa_columns = sorted(hipaa_variations)
non_hipaa_columns = sorted(non_hipaa_variations)

df_hipaa = pd.DataFrame({
    "column_name": hipaa_columns,
    "is_hipaa_sensitive": [1] * len(hipaa_columns),
})
df_non_hipaa = pd.DataFrame({
    "column_name": non_hipaa_columns,
    "is_hipaa_sensitive": [0] * len(non_hipaa_columns),
})

df_combined = pd.concat([df_hipaa, df_non_hipaa], ignore_index=True)

# Shuffle the rows so the two labels are interleaved; the fixed seed makes
# repeated runs write an identical CSV.
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# 1-based sequential id as the first column, assigned after shuffling.
df_combined.insert(0, "id", range(1, len(df_combined) + 1))

csv_filename_variations = "dataset.csv"
df_combined.to_csv(csv_filename_variations, index=False)
print(f"CSV file '{csv_filename_variations}' has been created successfully!")