-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebscraperv07.py
More file actions
117 lines (98 loc) · 3.92 KB
/
webscraperv07.py
File metadata and controls
117 lines (98 loc) · 3.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import requests
from bs4 import BeautifulSoup
import pandas as pd
def get_user_input(prompt, valid_options=None, allow_quit=True):
    """Prompt repeatedly until the user supplies a valid response.

    Args:
        prompt: Text displayed to the user.
        valid_options: Iterable of accepted lower-case answers, or None to
            accept any non-quit input.
        allow_quit: When True, entering 'q' terminates the program.

    Returns:
        The stripped, lower-cased user input.

    Raises:
        SystemExit: When the user enters 'q' and allow_quit is True.
    """
    while True:
        user_input = input(prompt).strip().lower()
        if allow_quit and user_input == 'q':
            print("Exiting script by user request.")
            # raise SystemExit rather than calling the site-injected exit()
            # builtin, which is not guaranteed to exist (e.g. under -S).
            raise SystemExit
        if valid_options is None or user_input in valid_options:
            return user_input
        # Only advertise 'q' when quitting is actually enabled; the original
        # message mentioned 'q' even with allow_quit=False.
        hint = " or 'q' to quit" if allow_quit else ""
        print(f"Invalid input. Please enter one of: {', '.join(valid_options)}{hint}.")
def preview_page(url):
    """Fetch *url* and summarise the HTML tags found on the page.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(tags, soup)`` tuple, where ``tags`` maps lower-cased tag names
        to occurrence counts and ``soup`` is the parsed document.  Returns
        ``(None, None)`` when the page cannot be retrieved -- the original
        returned a bare ``None`` here, which made the caller's
        ``tags, soup = preview_page(url)`` raise TypeError on any failure.
    """
    try:
        # A timeout keeps the script from hanging forever on a dead host.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve page: {exc}")
        return None, None
    if response.status_code != 200:
        print(f"Failed to retrieve page. Status code: {response.status_code}")
        return None, None
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect all unique tag names with counts.
    tags = {}
    for tag in soup.find_all(True):
        tag_lower = tag.name.lower()
        tags[tag_lower] = tags.get(tag_lower, 0) + 1
    return tags, soup
def scrape_tags(soup, selected_tags):
    """Collect the content of every selected tag into per-tag DataFrames.

    For each element the stripped text is captured; elements with no text
    fall back to a string rendering of their attribute dict.

    Args:
        soup: Parsed document exposing ``find_all(tag_name)``.
        selected_tags: Tag names to extract.

    Returns:
        Dict mapping each tag name to a one-column ('Content') DataFrame.
    """
    frames = {}
    for tag_name in selected_tags:
        # Empty text is falsy, so `or` supplies the attribute fallback.
        rows = [
            el.get_text(strip=True) or str(el.attrs)
            for el in soup.find_all(tag_name)
        ]
        frames[tag_name] = pd.DataFrame(rows, columns=['Content'])
    return frames
def preview_samples(scraped_data, sample_size=5):
    """Print the first *sample_size* rows of every scraped DataFrame.

    Args:
        scraped_data: Dict mapping tag names to 'Content' DataFrames.
        sample_size: Maximum number of rows shown per tag.
    """
    print("\nPreview of scraped data (first few entries):")
    for tag_name in scraped_data:
        frame_head = scraped_data[tag_name].head(sample_size)
        print(f"\n<{tag_name}> (showing up to {sample_size} entries):")
        print(frame_head.to_string(index=False))
def main():
    """Interactive driver: fetch a page, let the user pick tags, preview and save.

    Flow: prompt for a URL, list the tags found, let the user choose tags,
    confirm, scrape, preview, then optionally save each tag's data as CSV
    or Excel.
    """
    url = input("Enter the URL to scrape: ").strip()
    print("\nPreviewing page...")
    # preview_page signals failure with a falsy result; guard before
    # unpacking so a failed request does not raise TypeError here.
    result = preview_page(url)
    if not result:
        print("No tags found on page.")
        return
    tags, soup = result
    if not tags:
        print("No tags found on page.")
        return
    print("\nTags found on the page (with counts):")
    for tag, count in tags.items():
        print(f"{tag}: {count}")
    # Let user pick tags to scrape; re-prompt until at least one valid tag.
    while True:
        user_input = input("\nEnter the tags you want to scrape, separated by commas (or 'q' to quit): ").strip()
        if user_input.lower() == 'q':
            print("Exiting script.")
            return
        selections = [s.strip().lower() for s in user_input.split(',')]
        # Silently drop names that do not appear on the page.
        selections = [s for s in selections if s in tags]
        if selections:
            break
        else:
            print("No valid tags entered. Please try again.")
    # Confirm selection before doing any scraping work.
    confirm = get_user_input(f"\nYou have selected to scrape: {', '.join(selections)}. Proceed? (y/n): ", valid_options=['y','n','yes','no'])
    if confirm in ['n','no']:
        print("Exiting script.")
        return
    # Scrape selected tags
    scraped_data = scrape_tags(soup, selections)
    # Show sample preview
    preview_samples(scraped_data)
    # Confirm saving
    confirm_save = get_user_input("\nDo you want to save this data? (y/n): ", valid_options=['y','n','yes','no'])
    if confirm_save in ['n','no']:
        print("Exiting script without saving.")
        return
    # Save each selected tag's data, asking for a format per tag.
    for tag, df in scraped_data.items():
        while True:
            file_type = input(f"Choose file format for <{tag}> data (csv/excel or 'q' to quit): ").strip().lower()
            if file_type == 'q':
                print("Exiting script.")
                return
            elif file_type == 'csv':
                df.to_csv(f"{tag}_data.csv", index=False)
                print(f"<{tag}> data saved to {tag}_data.csv")
                break
            elif file_type == 'excel':
                df.to_excel(f"{tag}_data.xlsx", index=False)
                print(f"<{tag}> data saved to {tag}_data.xlsx")
                break
            else:
                print("Invalid input. Please enter 'csv', 'excel', or 'q' to quit.")
# Run the interactive scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()