web_crawler.py
"""Breadth-first web crawler: discover every page within a single domain,
starting from a given URL, and save all discovered links to a CSV file."""

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import csv
from collections import deque


def is_same_domain(base_url, target_url):
    """Check if target_url belongs to the same domain as base_url."""
    base_domain = urlparse(base_url).netloc
    target_domain = urlparse(target_url).netloc
    return base_domain == target_domain
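
# Note: urlparse(...).netloc compares the full host, so a subdomain does not
# match the bare domain. A quick illustration (hypothetical URLs):
#   is_same_domain("https://example.com", "https://example.com/about")  # True
#   is_same_domain("https://example.com", "https://blog.example.com/")  # False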


def crawl_site(start_url, max_pages=200):
    """Breadth-first crawl of all pages within the same domain as start_url."""
    visited = set()
    queue = deque([start_url])
    all_links = []

    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        print(f"🔍 Crawling: {url}")

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            full_url = urljoin(url, href)
            # Strip the fragment so the same page is not queued more than once
            parsed = urlparse(full_url)
            full_url = parsed._replace(fragment="").geturl()
            if is_same_domain(start_url, full_url) and full_url not in visited:
                queue.append(full_url)
                all_links.append(full_url)

    return sorted(set(all_links))
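
# Note: all_links records every same-domain link encountered, so when the
# max_pages cap stops the crawl early, the returned list may include URLs
# that were queued but never actually fetched.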


def save_to_csv(links, filename="links.csv"):
    """Save list of links to CSV."""
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["URL"])
        for link in links:
            writer.writerow([link])
    print(f"\n✅ Saved {len(links)} links to {filename}")


if __name__ == "__main__":
    start_url = input("Enter a starting URL: ").strip()
    print(f"\n🚀 Starting crawl from: {start_url}\n")

    links = crawl_site(start_url)

    print("\n📋 All discovered URLs:\n")
    for link in links:
        print(link)

    save_to_csv(links)
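
# A minimal sketch of using the crawler programmatically instead of via input()
# (assumptions: this file is importable as web_crawler, and the URL below is a
# hypothetical placeholder):
#
#   from web_crawler import crawl_site, save_to_csv
#   links = crawl_site("https://example.com", max_pages=50)
#   save_to_csv(links, filename="example_links.csv")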