crawler.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import deque


def get_refs(url):
    """Fetch a page and return the href values of all its anchor tags."""
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.content, 'html.parser')
        return [a.get('href') for a in soup.find_all('a', href=True)]
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return []


def load_skip_extensions():
    """Read extensions to skip from config.txt (one per line, '#' starts a comment).

    Falls back to a built-in list if the file is missing.
    """
    try:
        with open('config.txt', 'r', encoding='utf-8') as file:
            extensions = []
            for line in file:
                line = line.strip()
                if line and not line.startswith('#'):
                    extensions.append(line)
            return extensions
    except FileNotFoundError:
        return [
            '.pdf', '.doc', '.docx', '.zip', '.rar', '.7z', '.tar', '.gz',
            '.exe', '.msi', '.dmg', '.pkg', '.jpg', '.jpeg', '.png', '.gif',
            '.mp4', '.avi', '.mov', '.mp3', '.wav', '.txt', '.csv', '.xls',
            '.xlsx', '.ppt', '.pptx'
        ]


def should_skip_url(url, skip_extensions):
    """Return True if the URL path (query string and fragment stripped) ends with a skipped extension."""
    clean_url = url.split('?')[0].split('#')[0].lower()
    return any(clean_url.endswith(ext) for ext in skip_extensions)


if __name__ == "__main__":
    url = input("Enter URL: ")
    try:
        max_links = int(input("Enter maximum number of links to collect (0 for unlimited): "))
    except ValueError:
        print("Invalid input. Using unlimited links.")
        max_links = 0

    visited = set()
    to_process = deque([url])
    result = []
    skip_extensions = load_skip_extensions()

    # Breadth-first crawl: process URLs in FIFO order until the queue is empty
    # or the requested limit is reached.
    while to_process:
        if max_links > 0 and len(result) >= max_links:
            print(f"Reached maximum limit of {max_links} links. Stopping...")
            break

        current_url = to_process.popleft()
        if current_url in visited:
            continue

        print(f"Processing: {current_url}")
        visited.add(current_url)
        result.append(current_url)

        for ref in get_refs(current_url):
            # Resolve relative references against the page they were found on;
            # absolute URLs pass through urljoin unchanged.
            full_url = urljoin(current_url, ref)
            if should_skip_url(full_url, skip_extensions):
                continue
            # Stay on the same site and avoid re-queueing URLs we already know about.
            if url in full_url and full_url not in visited and full_url not in to_process:
                to_process.append(full_url)

    with open("links.txt", "w", encoding="utf-8") as file:
        for link in result:
            file.write(link + "\n")

    print(f"\nFound {len(result)} links. Result saved to links.txt")
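
A minimal config.txt sketch, assuming the format read by load_skip_extensions above (one extension per line, blank lines ignored, lines starting with '#' treated as comments); the specific entries are illustrative, not part of the script:

# extensions the crawler should not follow
.pdf
.zip
.jpg
.mp4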