-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
335 lines (274 loc) · 13.4 KB
/
main.py
File metadata and controls
335 lines (274 loc) · 13.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
def fetch_page(url):
"""Fetch the HTML content and load time of the given URL."""
try:
start = time.time()
response = requests.get(url, timeout=10)
response.raise_for_status()
end = time.time()
load_time = end - start
return response.text, load_time
except requests.exceptions.RequestException as e:
print(f"Error fetching {url}: {e}")
return None, None
def analyze_seo(html, url):
"""Analyze the SEO aspects of the HTML content."""
soup = BeautifulSoup(html, "html.parser")
seo_data = {'url': url}
# Title Tag
title_tag = soup.title.string if soup.title else None
seo_data['title'] = title_tag
seo_data['title_length'] = len(title_tag) if title_tag else 0
seo_data['title_quality'] = "Good" if title_tag and 50 <= len(title_tag) <= 60 else "Needs Improvement"
# Meta Description
meta_description = soup.find("meta", attrs={"name": "description"})
description_content = meta_description["content"] if meta_description and meta_description.get("content") else None
seo_data['meta_description'] = description_content
seo_data['meta_description_length'] = len(description_content) if description_content else 0
seo_data['meta_description_quality'] = "Good" if description_content and 120 <= len(description_content) <= 160 else "Needs Improvement"
# Keywords Count in Title and Description
keywords = ['SEO', 'optimization', 'content'] # Example keywords to check
seo_data['keywords_in_title'] = sum(keyword.lower() in (title_tag.lower() if title_tag else '') for keyword in keywords)
seo_data['keywords_in_description'] = sum(keyword.lower() in (description_content.lower() if description_content else '') for keyword in keywords)
# Headings (H1, H2)
seo_data['h1_count'] = len(soup.find_all("h1"))
seo_data['h2_count'] = len(soup.find_all("h2"))
# Image Alt Tags
images = soup.find_all("img")
seo_data['image_count'] = len(images)
seo_data['images_with_alt'] = sum(1 for img in images if img.get("alt"))
seo_data['images_without_alt'] = seo_data['image_count'] - seo_data['images_with_alt']
# Links
links = soup.find_all("a", href=True)
seo_data['link_count'] = len(links)
seo_data['internal_links'] = sum(1 for link in links if url in link['href'])
seo_data['external_links'] = seo_data['link_count'] - seo_data['internal_links']
# Structured Data
structured_data = soup.find_all("script", type="application/ld+json")
seo_data['structured_data_count'] = len(structured_data)
# Robots Meta Tag
robots_meta = soup.find("meta", attrs={"name": "robots"})
seo_data['robots_meta'] = robots_meta["content"] if robots_meta else "Not Found"
# Canonical Tag
canonical_link = soup.find("link", rel="canonical")
seo_data['canonical_link'] = canonical_link["href"] if canonical_link else "Not Found"
# Page Load Time
_, load_time = fetch_page(url)
seo_data['page_load_time'] = load_time
# Mobile-Friendliness
viewport = soup.find("meta", attrs={"name": "viewport"})
seo_data['mobile_friendly'] = "Yes" if viewport else "No"
# Favicon
favicon = soup.find("link", rel="icon")
seo_data['favicon'] = favicon["href"] if favicon else "Not Found"
# Open Graph Tags
og_tags = soup.find_all("meta", attrs={"property": lambda x: x and x.startswith("og:")})
seo_data['og_tags_count'] = len(og_tags)
# Twitter Card Tags
twitter_tags = soup.find_all("meta", attrs={"name": lambda x: x and x.startswith("twitter:")})
seo_data['twitter_tags_count'] = len(twitter_tags)
# HTTPS Check
seo_data['https'] = "Yes" if url.startswith("https") else "No"
# Check for Broken Links
broken_links = 0
for link in links:
try:
response = requests.head(link['href'], timeout=5)
if response.status_code >= 400:
broken_links += 1
except:
broken_links += 1
seo_data['broken_links'] = broken_links
# Content Analysis
body_text = soup.get_text()
word_count = len(body_text.split())
seo_data['word_count'] = word_count
# Keyword Density
for keyword in keywords:
seo_data[f'keyword_density_{keyword}'] = (body_text.lower().count(keyword.lower()) / word_count) * 100 if word_count else 0
# Language and Charset
language = soup.find("html").get("lang") if soup.find("html") else None
charset = soup.find("meta", attrs={"charset": True})
seo_data['language'] = language if language else "Not Found"
seo_data['charset'] = charset["charset"] if charset else "Not Found"
# Sitemap Check
sitemap_url = f"{url}/sitemap.xml"
try:
response = requests.head(sitemap_url, timeout=5)
seo_data['sitemap'] = "Found" if response.status_code == 200 else "Not Found"
except:
seo_data['sitemap'] = "Not Found"
# Robots.txt Check
robots_url = f"{url}/robots.txt"
try:
response = requests.head(robots_url, timeout=5)
seo_data['robots_txt'] = "Found" if response.status_code == 200 else "Not Found"
except:
seo_data['robots_txt'] = "Not Found"
# Header Tag Hierarchy
header_tags = [tag.name for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
seo_data['header_tag_hierarchy'] = "Valid" if header_tags == sorted(header_tags, key=lambda x: int(x[1:])) else "Invalid"
# Duplicate Title Check (Placeholder for cross-page analysis)
existing_titles = set() # Needs to be populated externally
seo_data['duplicate_title'] = "Yes" if title_tag in existing_titles else "No"
existing_titles.add(title_tag)
# Text-to-HTML Ratio
html_length = len(html)
text_length = len(soup.get_text())
seo_data['text_to_html_ratio'] = (text_length / html_length) * 100 if html_length else 0
# Breadcrumb Navigation
breadcrumbs = soup.find_all("nav", attrs={"aria-label": "breadcrumb"})
seo_data['breadcrumbs'] = "Found" if breadcrumbs else "Not Found"
# Pagination Tags
next_link = soup.find("link", rel="next")
prev_link = soup.find("link", rel="prev")
seo_data['pagination_next'] = next_link["href"] if next_link else "Not Found"
seo_data['pagination_prev'] = prev_link["href"] if prev_link else "Not Found"
# Hreflang Tags
hreflang_tags = soup.find_all("link", rel="alternate", hreflang=True)
seo_data['hreflang_count'] = len(hreflang_tags)
# AMP Validation
amp_tag = soup.find("html", attrs={"amp": True})
seo_data['amp'] = "Yes" if amp_tag else "No"
# Content Freshness
publish_date = soup.find("meta", attrs={"name": "article:published_time"})
last_modified_date = soup.find("meta", attrs={"name": "article:modified_time"})
seo_data['publish_date'] = publish_date["content"] if publish_date else "Not Found"
seo_data['last_modified_date'] = last_modified_date["content"] if last_modified_date else "Not Found"
# Schema Markup Types
schema_types = []
for script in structured_data:
try:
data = json.loads(script.string)
if isinstance(data, dict) and "@type" in data:
schema_types.append(data["@type"])
except:
pass
seo_data['schema_types'] = ', '.join(schema_types) if schema_types else "None"
# Specific Open Graph Tags
og_image = soup.find("meta", attrs={"property": "og:image"})
seo_data['og_image'] = og_image["content"] if og_image else "Not Found"
# Twitter Card Image
twitter_image = soup.find("meta", attrs={"name": "twitter:image"})
seo_data['twitter_image'] = twitter_image["content"] if twitter_image else "Not Found"
# Anchor Text Analysis
anchor_texts = [link.get_text().strip() for link in links if link.get_text()]
seo_data['anchor_texts'] = ', '.join(anchor_texts[:10]) # Example: Limit to first 10
# JavaScript Errors
inline_scripts = soup.find_all("script", type="text/javascript")
seo_data['inline_script_count'] = len(inline_scripts)
# External JavaScript Files
external_scripts = soup.find_all("script", src=True)
seo_data['external_script_count'] = len(external_scripts)
# HTTP Status Code
try:
response = requests.head(url, timeout=5)
seo_data['http_status_code'] = response.status_code
except:
seo_data['http_status_code'] = "Unknown"
# Check for Duplicate Phrases
phrases = body_text.split('.')
duplicate_phrases = [phrase for phrase in phrases if phrases.count(phrase) > 1]
seo_data['duplicate_phrases_count'] = len(duplicate_phrases)
# XML Sitemap URL Count
try:
sitemap_response = requests.get(sitemap_url, timeout=5)
if sitemap_response.status_code == 200:
sitemap_soup = BeautifulSoup(sitemap_response.content, "xml")
seo_data['sitemap_url_count'] = len(sitemap_soup.find_all("url"))
else:
seo_data['sitemap_url_count'] = 0
except:
seo_data['sitemap_url_count'] = "Error"
# Accessibility Checks
aria_roles = soup.find_all(attrs={"role": True})
seo_data['aria_role_count'] = len(aria_roles)
# Social Media Integration
og_tags = ["og:title", "og:description", "og:image", "og:url"]
twitter_tags = ["twitter:card", "twitter:title", "twitter:description", "twitter:image"]
seo_data['og_tags_present'] = all(soup.find("meta", attrs={"property": tag}) for tag in og_tags)
seo_data['twitter_tags_present'] = all(soup.find("meta", attrs={"name": tag}) for tag in twitter_tags)
# Language Specification
html_tag = soup.find("html")
seo_data['lang_attribute'] = html_tag.get("lang", "Not Specified") if html_tag else "Not Found"
# Duplicate Meta Description Check
existing_descriptions = set()
seo_data['duplicate_meta_description'] = "Yes" if description_content in existing_descriptions else "No"
existing_descriptions.add(description_content)
# Image Optimization
large_images = [img for img in images if img.get("src") and "large" in img.get("src").lower()]
seo_data['large_images_count'] = len(large_images)
# HTTP Headers
try:
headers_response = requests.head(url, timeout=5)
seo_data['cache_control'] = headers_response.headers.get("Cache-Control", "Not Found")
seo_data['content_security_policy'] = headers_response.headers.get("Content-Security-Policy", "Not Found")
except:
seo_data['http_headers'] = "Error"
# URL Structure Quality
import re
seo_data['url_length'] = len(url)
seo_data['url_structure_quality'] = "Good" if re.match(r'^[a-zA-Z0-9\-\/]+$', url) and len(url) <= 100 else "Needs Improvement"
# Content Hierarchy
headings = [tag.name for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
seo_data['heading_hierarchy'] = "Good" if headings == sorted(headings, key=lambda h: int(h[1:])) else "Needs Improvement"
# Image Lazy Loading
lazy_loaded_images = [img for img in images if img.get("loading") == "lazy"]
seo_data['lazy_loaded_images'] = len(lazy_loaded_images)
# Gzip Compression
try:
headers_response = requests.get(url, timeout=5)
seo_data['gzip_compression'] = "Yes" if "gzip" in headers_response.headers.get("Content-Encoding", "") else "No"
except:
seo_data['gzip_compression'] = "Error"
# Cookie Banner
cookie_banner = soup.find("div", {"id": "cookie-banner"}) or soup.find(string="cookie")
seo_data['cookie_banner'] = "Yes" if cookie_banner else "No"
# Social Proof
ratings = soup.find_all("div", {"class": "rating"}) or soup.find_all("span", {"class": "stars"})
seo_data['social_proof_count'] = len(ratings)
# Video and Audio Content
videos = soup.find_all("video")
audios = soup.find_all("audio")
seo_data['video_count'] = len(videos)
seo_data['audio_count'] = len(audios)
# Affiliate Links
affiliate_links = [link for link in links if "ref=" in link['href'] or "/affiliate/" in link['href']]
seo_data['affiliate_links_count'] = len(affiliate_links)
# Custom Fonts
custom_fonts = soup.find_all("link", href=lambda href: href and "fonts" in href)
seo_data['custom_fonts_count'] = len(custom_fonts)
return seo_data
def generate_csv_report(urls, output_file):
"""Generate a CSV report for a list of URLs."""
data = []
for url in urls:
print(f"Analyzing {url}...")
html, _ = fetch_page(url)
if html:
seo_data = analyze_seo(html, url)
data.append(seo_data)
# Save to CSV
df = pd.DataFrame(data)
df.to_csv(output_file, index=False)
print(f"SEO report saved to {output_file}")
if __name__ == "__main__":
# Input file containing URLs
input_file = "urls.csv" # Replace with your CSV file containing URLs
output_file = "seo_report.csv"
# Read URLs from CSV
try:
urls_df = pd.read_csv(input_file)
urls = urls_df['url'].tolist()
except Exception as e:
print(f"Error reading input file {input_file}: {e}")
urls = []
# Generate SEO report
if urls:
generate_csv_report(urls, output_file)
else:
print("No URLs to analyze.")