web_crawler.py
"""Breadth-first web crawler: discover every page within a single domain,
starting from a given URL, and save all discovered links to a CSV file."""

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import csv
from collections import deque


def is_same_domain(base_url, target_url):
    """Check if target_url belongs to the same domain as base_url."""
    base_domain = urlparse(base_url).netloc
    target_domain = urlparse(target_url).netloc
    return base_domain == target_domain
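
# Note: urlparse(...).netloc compares the full host, so a subdomain does not
# match the bare domain. A quick illustration (hypothetical URLs):
#   is_same_domain("https://example.com", "https://example.com/about")  # True
#   is_same_domain("https://example.com", "https://blog.example.com/")  # False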


def crawl_site(start_url, max_pages=200):
    """Breadth-first crawl of all pages within the same domain as start_url."""
    visited = set()
    queue = deque([start_url])
    all_links = []

    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        print(f"🔍 Crawling: {url}")

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            full_url = urljoin(url, href)
            # Strip the fragment so the same page is not queued more than once
            parsed = urlparse(full_url)
            full_url = parsed._replace(fragment="").geturl()
            if is_same_domain(start_url, full_url) and full_url not in visited:
                queue.append(full_url)
                all_links.append(full_url)

    return sorted(set(all_links))
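
# Note: all_links records every same-domain link encountered, so when the
# max_pages cap stops the crawl early, the returned list may include URLs
# that were queued but never actually fetched.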


def save_to_csv(links, filename="links.csv"):
    """Save list of links to CSV."""
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["URL"])
        for link in links:
            writer.writerow([link])
    print(f"\n✅ Saved {len(links)} links to {filename}")


if __name__ == "__main__":
    start_url = input("Enter a starting URL: ").strip()
    print(f"\n🚀 Starting crawl from: {start_url}\n")

    links = crawl_site(start_url)

    print("\n📋 All discovered URLs:\n")
    for link in links:
        print(link)

    save_to_csv(links)
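
# A minimal sketch of using the crawler programmatically instead of via input()
# (assumptions: this file is importable as web_crawler, and the URL below is a
# hypothetical placeholder):
#
#   from web_crawler import crawl_site, save_to_csv
#   links = crawl_site("https://example.com", max_pages=50)
#   save_to_csv(links, filename="example_links.csv")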