-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_handler.py
More file actions
282 lines (232 loc) · 10.5 KB
/
data_handler.py
File metadata and controls
282 lines (232 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
import json
import os
import hashlib
import uuid
from datetime import datetime
import streamlit as st # type: ignore
from cryptography.fernet import Fernet # type: ignore
class DataHandler:
    """
    Handle sensitive candidate data with GDPR-oriented safeguards.

    Provides functionality for:
    - Data anonymization (hashed IDs, masked contact details)
    - Storage of candidate records as JSON files
    - Encryption/decryption of sensitive fields with Fernet
    - GDPR notice and consent texts
    """

    # Fields that are encrypted at rest and masked when anonymizing.
    SENSITIVE_FIELDS = ("name", "email", "phone")

    def __init__(self, encryption_key=None):
        """
        Initialize the DataHandler with an encryption key.

        Creates the ``candidate_data`` directory (relative to the current
        working directory) if it does not exist yet.

        Args:
            encryption_key (str, optional): The Fernet key to use. If None,
                the key cached in ``st.session_state`` is reused, or a new one
                is generated and cached there.
        """
        self.data_dir = "candidate_data"
        os.makedirs(self.data_dir, exist_ok=True)

        if encryption_key:
            self.key = encryption_key
        elif 'encryption_key' in st.session_state:
            self.key = st.session_state.encryption_key
        else:
            # NOTE(review): a generated key is kept only in the Streamlit
            # session state; records written with it look unreadable once
            # that state is gone unless the key is persisted elsewhere.
            self.key = Fernet.generate_key()
            st.session_state.encryption_key = self.key
        self.cipher = Fernet(self.key)

    @staticmethod
    def _hash_email(email):
        """Return the first 10 hex characters of the SHA-256 digest of *email*."""
        return hashlib.sha256(email.encode()).hexdigest()[:10]

    @staticmethod
    def _mask_email(email):
        """Mask the local part of *email*, keeping only its first and last character."""
        local, at, domain = email.rpartition("@")
        if not at:
            # Not shaped like an address, so no part of it is safe to keep.
            return "*" * len(email)
        if len(local) <= 2:
            # Keeping "first and last" here would reveal the whole local part.
            masked_local = "*" * len(local)
        else:
            masked_local = f"{local[0]}{'*' * (len(local) - 2)}{local[-1]}"
        return f"{masked_local}@{domain}"

    @staticmethod
    def _mask_phone(phone):
        """Reduce *phone* to its digits and star out all but the last two."""
        digits = "".join(filter(str.isdigit, phone))
        if len(digits) <= 2:
            # With at most two digits the "last two" are the whole value.
            return digits
        return "*" * (len(digits) - 2) + digits[-2:]

    def _record_path(self, file_id):
        """Return the JSON file path used for the record with *file_id*."""
        return f"{self.data_dir}/candidate_{file_id}.json"

    def anonymize_data(self, candidate_info):
        """
        Create an anonymized version of candidate data for analysis purposes.

        The input dict is not modified. The copy gets an ``anonymous_id``
        (derived from the email hash, so it is stable per email, or random
        when no email is present); name, email and phone are replaced by
        masked values. All other fields are carried over unchanged.

        Args:
            candidate_info (dict): The candidate information to anonymize

        Returns:
            dict: Anonymized candidate data
        """
        anonymous_data = candidate_info.copy()
        email = anonymous_data.get("email")

        if email:
            anonymous_data["anonymous_id"] = self._hash_email(email)
        else:
            anonymous_data["anonymous_id"] = uuid.uuid4().hex[:10]

        if anonymous_data.get("name"):
            anonymous_data["name"] = f"Candidate-{anonymous_data['anonymous_id']}"

        if email:
            anonymous_data["email"] = self._mask_email(email)

        phone = anonymous_data.get("phone")
        if phone:
            anonymous_data["phone"] = self._mask_phone(phone)

        return anonymous_data

    def encrypt_sensitive_data(self, candidate_info):
        """
        Encrypt sensitive fields in candidate data.

        Args:
            candidate_info (dict): The candidate information to encrypt

        Returns:
            dict: New dict in which every non-empty sensitive field holds its
                Fernet token (as text); all other values are passed through.
        """
        encrypted_data = {}
        for key, value in candidate_info.items():
            if key in self.SENSITIVE_FIELDS and value:
                encrypted_data[key] = self.cipher.encrypt(value.encode()).decode()
            else:
                encrypted_data[key] = value
        return encrypted_data

    def decrypt_sensitive_data(self, encrypted_data):
        """
        Decrypt sensitive fields in candidate data.

        Decryption is best-effort: a sensitive value that is not a valid
        token for this handler's key is returned unchanged.

        Args:
            encrypted_data (dict): The candidate information with encrypted fields

        Returns:
            dict: Data with sensitive fields decrypted
        """
        # Local import: only this method needs the exception type.
        from cryptography.fernet import InvalidToken  # type: ignore

        decrypted_data = {}
        for key, value in encrypted_data.items():
            encrypted_candidate = (
                key in self.SENSITIVE_FIELDS and value and isinstance(value, str)
            )
            if not encrypted_candidate:
                decrypted_data[key] = value
                continue
            try:
                decrypted_data[key] = self.cipher.decrypt(value.encode()).decode()
            except (InvalidToken, UnicodeDecodeError):
                # The value was stored in plain text or belongs to another key.
                decrypted_data[key] = value
        return decrypted_data

    def save_candidate_data(self, candidate_info, conversation_history=None):
        """
        Save candidate data to a JSON file, encrypting the sensitive fields.

        The record contains a timestamp, a data version, the encrypted
        candidate info and an anonymized copy. The file name is derived from
        the email hash, so saving again for the same email overwrites the
        previous record; without an email a random ID is used.

        Args:
            candidate_info (dict): The candidate information to save
            conversation_history (list, optional): Messages (dicts) to store
                with the record

        Returns:
            str: The filename where data was saved
        """
        email = candidate_info.get("email")
        file_id = self._hash_email(email) if email else uuid.uuid4().hex

        record = {
            "timestamp": datetime.now().isoformat(),
            "data_version": "1.0",
            "encrypted_candidate_info": self.encrypt_sensitive_data(candidate_info),
            "anonymized_candidate_info": self.anonymize_data(candidate_info),
        }

        if conversation_history:
            # Keep only role and content; any other message keys are dropped.
            # NOTE(review): message content is written unencrypted and may
            # itself contain personal data typed by the candidate.
            record["conversation_history"] = [
                {
                    "role": msg.get("role", "unknown"),
                    "content": msg.get("content", ""),
                }
                for msg in conversation_history
            ]

        filename = self._record_path(file_id)
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(record, f, indent=2)
        return filename

    def load_candidate_data(self, file_id):
        """
        Load and decrypt candidate data from storage.

        Args:
            file_id (str): The file ID to load

        Returns:
            dict: The stored record with an added ``candidate_info`` entry
                holding the decrypted fields, or None if the record is
                missing, is not valid JSON, or the ID contains a path
                separator.
        """
        file_id = str(file_id)
        # An ID is a bare name; anything with a directory part could point
        # outside the data directory.
        if os.path.basename(file_id) != file_id:
            return None

        try:
            with open(self._record_path(file_id), "r", encoding="utf-8") as f:
                record = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            return None

        if "encrypted_candidate_info" in record:
            record["candidate_info"] = self.decrypt_sensitive_data(
                record["encrypted_candidate_info"]
            )
        return record

    def get_data_deletion_info(self):
        """
        Return the data retention and deletion notice shown to candidates.

        Returns:
            str: Markdown text about data retention and deletion
        """
        # Built from explicit lines so the text does not depend on how this
        # method is indented in the source file.
        return (
            "\n"
            "## Data Privacy Information\n"
            "Your data is stored securely and encrypted in accordance with GDPR regulations.\n"
            "- We retain your information for recruitment purposes for up to 6 months.\n"
            "- You have the right to request access to, correction, or deletion of your data.\n"
            "- To exercise these rights, please contact privacy@talentscout.example.com.\n"
            "- For full details on how we handle your data, please see our Privacy Policy.\n"
        )

    def get_gdpr_consent_text(self):
        """
        Return the GDPR consent text shown to candidates.

        Returns:
            str: GDPR consent text
        """
        return (
            "\n"
            "By continuing this conversation, you consent to TalentScout collecting and processing\n"
            "your personal information for recruitment purposes. Your data will be stored securely\n"
            "and in accordance with our Privacy Policy. You may request access to, correction,\n"
            "or deletion of your data at any time.\n"
        )

    def generate_simulated_candidates(self, count=5):
        """
        Generate simulated candidate data for testing purposes.

        Candidates are deterministic: the i-th entry cycles through the fixed
        lists of tech stacks, locations and positions below.

        Args:
            count (int): Number of simulated candidates to generate

        Returns:
            list: List of simulated candidate dictionaries
        """
        tech_stacks = [
            "Python, Django, PostgreSQL, Docker",
            "JavaScript, React, Node.js, MongoDB",
            "Java, Spring Boot, MySQL, AWS",
            "C#, .NET Core, SQL Server, Azure",
            "Ruby, Rails, Redis, Heroku",
            "PHP, Laravel, MySQL, Linux",
            "Go, PostgreSQL, Docker, Kubernetes",
            "Swift, iOS, Firebase, Git",
            "Kotlin, Android, SQLite, Jenkins",
        ]
        locations = [
            "New York", "San Francisco", "London", "Berlin", "Toronto",
            "Mumbai", "Pune", "Delhi", "Gurgaon", "Sydney", "Singapore",
            "Bangalore", "Tokyo", "Remote",
        ]
        positions = [
            "Software Engineer", "Frontend Developer", "Backend Developer",
            "MlOps Engineer", "Full Stack Developer", "DevOps Engineer",
            "Data Scientist", "Machine Learning Engineer", "Mobile Developer",
            "QA Engineer",
        ]

        return [
            {
                "name": f"Test Candidate {i+1}",
                "email": f"candidate{i+1}@example.com",
                "phone": f"+1555{i:04d}1234",
                "experience": str(i % 10 + 1),
                "desired_position": positions[i % len(positions)],
                "location": locations[i % len(locations)],
                "tech_stack": tech_stacks[i % len(tech_stacks)],
                "questions_asked": bool(i % 2),
                "conversation_complete": bool(i % 3),
            }
            for i in range(count)
        ]