-
Notifications
You must be signed in to change notification settings - Fork 359
Expand file tree
/
Copy pathhar_processing.py
More file actions
231 lines (193 loc) · 6.58 KB
/
har_processing.py
File metadata and controls
231 lines (193 loc) · 6.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
import json
import os
from urllib.parse import urlparse
from integuru.models.request import Request
from typing import Tuple, Dict, Optional, Any, List
# Substrings used by get_har_urls to discard requests: a request is dropped
# when any of these appears (case-insensitively) anywhere in its URL, its
# header names/values, or its POST body text.
# NOTE(review): presumably these name third-party ad/analytics/monitoring
# services that are irrelevant to the captured flow - confirm.
excluded_keywords = (
    "google",
    "taboola",
    "datadog",
    "sentry",
    # "relic"
)
# Substrings used by format_request to filter headers: a header is left out of
# the resulting Request when its lowercased name contains any of these.
# Matching is by substring, so e.g. "accept" also removes "Accept-Language"
# and "sec-" removes every "Sec-*" header.
excluded_header_keywords = (
    "cookie",
    "sec-",
    "accept",
    "user-agent",
    "referer",
    "relic",
    "sentry",
    "datadog",
    "amplitude",
    "mixpanel",
    "segment",
    "heap",
    "hotjar",
    "fullstory",
    "pendo",
    "optimizely",
    "adobe",
    "analytics",
    "tracking",
    "telemetry",
    "clarity", # Microsoft Clarity
    "matomo",
    "plausible",
)
def format_request(har_request: Dict[str, Any]) -> Request:
    """
    Formats a HAR request into a Request object.

    Headers whose lowercased name contains any entry of
    ``excluded_header_keywords`` are dropped. Query-string entries are
    collapsed into a ``{name: value}`` dict (``None`` when there are none; a
    repeated name keeps its last value). The body is ``postData.text``; when a
    retained Content-Type header mentions ``application/json`` the text is
    decoded with ``json.loads``, and left as the raw string if decoding fails.

    Args:
        har_request: The ``request`` object of a HAR entry.

    Returns:
        A Request built from the method (default "GET"), the URL (default ""),
        the filtered headers, the query parameters and the body.

    Raises:
        KeyError: If a ``queryString`` entry lacks "name" or "value".
    """
    method = har_request.get("method", "GET")
    url = har_request.get("url", "")

    headers = {}
    for header in har_request.get("headers", []):
        header_name = header.get("name", "")
        lowered_name = header_name.lower()
        if not any(keyword in lowered_name for keyword in excluded_header_keywords):
            headers[header_name] = header.get("value", "")

    query_params_list = har_request.get("queryString", [])
    query_params = (
        {param["name"]: param["value"] for param in query_params_list}
        if query_params_list
        else None
    )

    post_data = har_request.get("postData", {})
    body = post_data.get("text") if post_data else None

    if body:
        # HTTP header names are case-insensitive, and recorders emit spellings
        # such as "content-type", "Content-Type" or "Content-type"; compare on
        # the lowercased name so JSON bodies are decoded for all of them.
        content_type = next(
            (
                value
                for name, value in headers.items()
                if name.lower() == "content-type" and value
            ),
            None,
        )
        if content_type and 'application/json' in content_type.lower():
            try:
                body = json.loads(body)
            except json.JSONDecodeError:
                # Deliberate best effort: keep the raw text when the body is
                # labelled JSON but does not parse.
                pass

    return Request(
        method=method,
        url=url,
        headers=headers,
        query_params=query_params,
        body=body
    )
def format_response(har_response: Dict[str, Any]) -> Dict[str, str]:
    """
    Pulls the body text and MIME type out of a HAR response.

    Looks at the response's "content" object and returns its "text" and
    "mimeType" values under the keys "text" and "type". A missing "content"
    object or a missing field yields an empty string for that key.
    """
    payload = har_response.get("content", {})
    body_text = payload.get("text", "")
    mime_type = payload.get("mimeType", "")
    return {"text": body_text, "type": mime_type}
def parse_har_file(har_file_path: str) -> Dict[Request, Dict[str, str]]:
    """
    Loads a HAR file and pairs each entry's request with its response.

    The file is read as UTF-8 JSON. Every item under ``log.entries`` has its
    "request" passed through format_request and its "response" passed through
    format_response; the resulting Request is used as the dictionary key and
    the response dictionary as the value. A missing "log" or "entries" section
    produces an empty dictionary.
    NOTE(review): entries whose Request objects hash/compare equal overwrite
    one another - this depends on how Request defines equality; confirm.
    """
    with open(har_file_path, 'r', encoding='utf-8') as har_handle:
        log_section = json.load(har_handle).get("log", {})

    pairs = {}
    for har_entry in log_section.get("entries", []):
        request_key = format_request(har_entry.get("request", {}))
        pairs[request_key] = format_response(har_entry.get("response", {}))
    return pairs
def build_url_to_req_res_map(req_res_dict: Dict[Request, Dict[str, str]]) -> Dict[str, Dict[str, Any]]:
    """
    Re-indexes a request -> response mapping by request URL.

    Each value of the result is {'request': request, 'response': response_dict}.
    When several requests share a URL, the one iterated last wins.
    """
    url_index = {}
    for req, res in req_res_dict.items():
        url_index[req.url] = {'request': req, 'response': res}
    return url_index
def get_har_urls(har_file_path: str) -> List[Tuple[str, str, str, str]]:
    """
    Extracts and returns a list of tuples containing method, URL, response format, and response preview
    from a HAR file, excluding certain file types and keywords.

    An entry is skipped when it has no URL, when the extension of its URL path
    is in the local ``excluded_extensions`` tuple, or when any entry of
    ``excluded_keywords`` occurs (case-insensitively) in the concatenation of
    its URL, header names/values and POST body text. Fields that a HAR
    exporter wrote as JSON ``null`` are treated as absent instead of raising.

    Args:
        har_file_path: Path to a UTF-8 encoded HAR (JSON) file.

    Returns:
        One ``(method, url, response_mime_type, response_preview)`` tuple per
        retained entry, in file order. The method defaults to "GET" and the
        preview is the first 30 characters of the response text ("" if none).
    """
    # List to store tuples of URLs, request methods, response file formats, and response preview
    urls_with_details = []

    # Define a tuple of file extensions to exclude
    excluded_extensions = (
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".webp",
        ".svg",
        ".ico",  # Image files
        ".css",  # Stylesheets
        # ".js",
        # ".map",  # JavaScript files
        ".woff",
        ".woff2",
        ".ttf",
        ".otf",
        ".eot",  # Font files
        ".mp3",
        ".mp4",
        ".wav",
        ".avi",
        ".mov",
        ".flv",
        ".wmv",
        ".webm",  # Media files
        # ".pdf",
        # ".zip",
        ".rar",
        ".7z",
        ".tar",
        ".gz",
        ".exe",
        ".dmg",  # Other non-text files
    )

    # Read the HAR file
    with open(har_file_path, "r", encoding="utf-8") as file:
        har_data = json.load(file)

    # Extract entries from the HAR data
    entries = har_data.get("log", {}).get("entries", [])

    for entry in entries:
        # "or {}" / "or []" / "or ''" below: a key that is present but null
        # must behave like a missing key rather than raise AttributeError.
        request = entry.get("request") or {}
        url = request.get("url")
        if not url:
            continue

        # Cheap check first: static assets are rejected on the URL path's
        # extension before any request text is assembled.
        path = urlparse(url).path.lower()
        _, extension = os.path.splitext(path)
        if extension in excluded_extensions:
            continue

        # Exclude requests whose URL, headers or body mention an excluded keyword;
        # this is done to reduce the number of requests we send to the LLM
        request_parts = [url.lower()]
        for header in request.get("headers") or []:
            request_parts.append((header.get("name") or "").lower())
            request_parts.append((header.get("value") or "").lower())
        post_data = request.get("postData") or {}
        request_parts.append((post_data.get("text") or "").lower())
        request_text = "".join(request_parts)

        if any(keyword.lower() in request_text for keyword in excluded_keywords):
            continue

        response = entry.get("response") or {}
        content = response.get("content") or {}
        method = request.get("method", "GET")  # Default to 'GET' if method is missing
        response_format = content.get("mimeType", "")
        response_text = content.get("text", "")
        response_preview = response_text[:30] if response_text else ""

        urls_with_details.append((method, url, response_format, response_preview))

    return urls_with_details
def parse_cookie_file_to_dict(cookie_file_path: str) -> Dict[str, Dict[str, Any]]:
    """
    Parses a JSON cookie file and returns a dictionary of cookie data.

    The file must contain a JSON array of cookie objects. Each cookie with a
    non-empty "name" becomes an entry keyed by that name, holding its "value",
    "domain", "path", "expires", "httpOnly", "secure" and "sameSite" fields
    (``None`` for any field the cookie does not have). Cookies without a name
    are skipped; when a name repeats, the later cookie replaces the earlier.

    Args:
        cookie_file_path: Path to a UTF-8 encoded JSON cookie file.

    Returns:
        A mapping from cookie name to a dictionary of that cookie's fields.
    """
    # Read explicitly as UTF-8, like the HAR readers in this module, so that
    # non-ASCII cookie values do not depend on the platform's default encoding.
    with open(cookie_file_path, "r", encoding="utf-8") as file:
        cookies = json.load(file)

    parsed_data = {}
    for cookie in cookies:
        name = cookie.get("name")
        if not name:
            continue
        parsed_data[name] = {
            "value": cookie.get("value"),
            "domain": cookie.get("domain"),
            "path": cookie.get("path"),
            "expires": cookie.get("expires"),
            "httpOnly": cookie.get("httpOnly"),
            "secure": cookie.get("secure"),
            "sameSite": cookie.get("sameSite"),
        }

    return parsed_data