-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebscraperv07.py
More file actions
117 lines (98 loc) · 3.92 KB
/
webscraperv07.py
File metadata and controls
117 lines (98 loc) · 3.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import requests
from bs4 import BeautifulSoup
import pandas as pd
def get_user_input(prompt, valid_options=None, allow_quit=True):
    """Prompt repeatedly until the user supplies a valid response.

    Args:
        prompt: Text displayed to the user.
        valid_options: Iterable of accepted lower-case answers, or None to
            accept any non-quit input.
        allow_quit: When True, entering 'q' terminates the program.

    Returns:
        The stripped, lower-cased user input.

    Raises:
        SystemExit: When the user enters 'q' and allow_quit is True.
    """
    while True:
        user_input = input(prompt).strip().lower()
        if allow_quit and user_input == 'q':
            print("Exiting script by user request.")
            # raise SystemExit rather than calling the site-injected exit()
            # builtin, which is not guaranteed to exist (e.g. under -S).
            raise SystemExit
        if valid_options is None or user_input in valid_options:
            return user_input
        # Only advertise 'q' when quitting is actually enabled; the original
        # message mentioned 'q' even with allow_quit=False.
        hint = " or 'q' to quit" if allow_quit else ""
        print(f"Invalid input. Please enter one of: {', '.join(valid_options)}{hint}.")
def preview_page(url):
    """Fetch *url* and summarise the HTML tags found on the page.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(tags, soup)`` tuple, where ``tags`` maps lower-cased tag names
        to occurrence counts and ``soup`` is the parsed document.  Returns
        ``(None, None)`` when the page cannot be retrieved -- the original
        returned a bare ``None`` here, which made the caller's
        ``tags, soup = preview_page(url)`` raise TypeError on any failure.
    """
    try:
        # A timeout keeps the script from hanging forever on a dead host.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve page: {exc}")
        return None, None
    if response.status_code != 200:
        print(f"Failed to retrieve page. Status code: {response.status_code}")
        return None, None
    soup = BeautifulSoup(response.text, 'html.parser')
    # Collect all unique tag names with counts.
    tags = {}
    for tag in soup.find_all(True):
        tag_lower = tag.name.lower()
        tags[tag_lower] = tags.get(tag_lower, 0) + 1
    return tags, soup
def scrape_tags(soup, selected_tags):
    """Collect the content of every selected tag into per-tag DataFrames.

    For each element the stripped text is captured; elements with no text
    fall back to a string rendering of their attribute dict.

    Args:
        soup: Parsed document exposing ``find_all(tag_name)``.
        selected_tags: Tag names to extract.

    Returns:
        Dict mapping each tag name to a one-column ('Content') DataFrame.
    """
    frames = {}
    for tag_name in selected_tags:
        # Empty text is falsy, so `or` supplies the attribute fallback.
        rows = [
            el.get_text(strip=True) or str(el.attrs)
            for el in soup.find_all(tag_name)
        ]
        frames[tag_name] = pd.DataFrame(rows, columns=['Content'])
    return frames
def preview_samples(scraped_data, sample_size=5):
    """Print the first *sample_size* rows of every scraped DataFrame.

    Args:
        scraped_data: Dict mapping tag names to 'Content' DataFrames.
        sample_size: Maximum number of rows shown per tag.
    """
    print("\nPreview of scraped data (first few entries):")
    for tag_name in scraped_data:
        frame_head = scraped_data[tag_name].head(sample_size)
        print(f"\n<{tag_name}> (showing up to {sample_size} entries):")
        print(frame_head.to_string(index=False))
def main():
    """Interactive driver: fetch a page, let the user pick tags, preview and save.

    Flow: prompt for a URL, list the tags found, let the user choose tags,
    confirm, scrape, preview, then optionally save each tag's data as CSV
    or Excel.
    """
    url = input("Enter the URL to scrape: ").strip()
    print("\nPreviewing page...")
    # preview_page signals failure with a falsy result; guard before
    # unpacking so a failed request does not raise TypeError here.
    result = preview_page(url)
    if not result:
        print("No tags found on page.")
        return
    tags, soup = result
    if not tags:
        print("No tags found on page.")
        return
    print("\nTags found on the page (with counts):")
    for tag, count in tags.items():
        print(f"{tag}: {count}")
    # Let user pick tags to scrape; re-prompt until at least one valid tag.
    while True:
        user_input = input("\nEnter the tags you want to scrape, separated by commas (or 'q' to quit): ").strip()
        if user_input.lower() == 'q':
            print("Exiting script.")
            return
        selections = [s.strip().lower() for s in user_input.split(',')]
        # Silently drop names that do not appear on the page.
        selections = [s for s in selections if s in tags]
        if selections:
            break
        else:
            print("No valid tags entered. Please try again.")
    # Confirm selection before doing any scraping work.
    confirm = get_user_input(f"\nYou have selected to scrape: {', '.join(selections)}. Proceed? (y/n): ", valid_options=['y','n','yes','no'])
    if confirm in ['n','no']:
        print("Exiting script.")
        return
    # Scrape selected tags
    scraped_data = scrape_tags(soup, selections)
    # Show sample preview
    preview_samples(scraped_data)
    # Confirm saving
    confirm_save = get_user_input("\nDo you want to save this data? (y/n): ", valid_options=['y','n','yes','no'])
    if confirm_save in ['n','no']:
        print("Exiting script without saving.")
        return
    # Save each selected tag's data, asking for a format per tag.
    for tag, df in scraped_data.items():
        while True:
            file_type = input(f"Choose file format for <{tag}> data (csv/excel or 'q' to quit): ").strip().lower()
            if file_type == 'q':
                print("Exiting script.")
                return
            elif file_type == 'csv':
                df.to_csv(f"{tag}_data.csv", index=False)
                print(f"<{tag}> data saved to {tag}_data.csv")
                break
            elif file_type == 'excel':
                df.to_excel(f"{tag}_data.xlsx", index=False)
                print(f"<{tag}> data saved to {tag}_data.xlsx")
                break
            else:
                print("Invalid input. Please enter 'csv', 'excel', or 'q' to quit.")
# Run the interactive scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()