# web_scrapper.py
# Scrapes a fixed list of mental-health resource pages and saves each one as a
# Markdown file with YAML front matter.
import asyncio
import time
import yaml # type: ignore
import re
import urllib3 # type: ignore
import requests # type: ignore
from pathlib import Path
from bs4 import BeautifulSoup, NavigableString # type: ignore
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError # type: ignore
# ─── Suppress insecure-request warnings ───
# The requests.get() calls in this file pass verify=False, which makes urllib3
# emit an InsecureRequestWarning for each request; silence that category once
# at import time.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# ─── Configuration ───
# Pages to scrape. "name" becomes the output file stem and front-matter title,
# "url" is the page address, and "selector" is the CSS selector of the content
# container (None when no selector is configured for the site).
SITES = [
    {
        "name": "MHF_UK_Best_Tips",
        "url": "https://www.mentalhealth.org.uk/explore-mental-health/publications/our-best-mental-health-tips",
        "selector": "main",
    },
    {
        "name": "NIMH_Caring",
        "url": "https://www.nimh.nih.gov/health/topics/caring-for-your-mental-health",
        "selector": "main",
    },
    {
        "name": "Karpagam_Stress_Tips",
        "url": "https://karpagamhospital.in/10-mental-health-tips-for-stress-relief/",
        "selector": "article",
    },
    {
        "name": "Telemanas_Helplines",
        "url": "https://telemanas.mohfw.gov.in/",
        "selector": None,
    },
    {
        "name": "FindaHelpline_Suicide",
        "url": "https://findahelpline.com/countries/in/topics/suicidal-thoughts",
        "selector": "div.views-row",
    },
    {
        "name": "WHO_Mental_Health",
        "url": "https://www.who.int/health-topics/mental-health",
        "selector": "main",
    },
    {
        "name": "Cleveland_Depression",
        "url": "https://my.clevelandclinic.org/health/diseases/9290-depression",
        "selector": "main",
    },
    {
        "name": "MayoClinic_Stress",
        "url": "https://www.mayoclinic.org/healthy-lifestyle/stress-management/in-depth/stress-relievers/art-20047257",
        "selector": "main",
    },
    {
        "name": "Healthline_Stress",
        "url": "https://www.healthline.com/nutrition/16-ways-relieve-stress-anxiety",
        "selector": "main",
    },
]

# Directory that receives one Markdown file per site; created at import time
# if it does not exist yet.
OUTPUT_DIR = Path("mental_health_Knowledge-Base")
OUTPUT_DIR.mkdir(exist_ok=True)

# Phone-like run: optional "+", a digit, at least seven digits / hyphens /
# whitespace characters, then a closing digit.
PHONE_REGEX = r"\+?\d[\d\-\s]{7,}\d"
# ─── Cleanup & Markdown Conversion ───
def clean_boilerplate(soup):
    """Strip non-content elements out of *soup*, modifying it in place.

    Every element matched by the selector below (scripts, styles, navigation,
    page header/footer, sidebars, and noscript blocks) is decomposed.
    Returns nothing.
    """
    boilerplate_selector = "script, style, nav, header, footer, aside, noscript"
    unwanted_nodes = soup.select(boilerplate_selector)
    for node in unwanted_nodes:
        node.decompose()
# Tag names rendered as Markdown headings; the digit is the heading level.
_HEADING_TAGS = frozenset(f"h{level}" for level in range(1, 7))


def _inline_text(node) -> str:
    """Return *node*'s text with each run of whitespace collapsed to one space."""
    # get_text(strip=True) strips every text fragment separately and then
    # concatenates them, which glues words together across inline tags
    # ("Visit <a>our site</a>" -> "Visitour site"). Joining the whole text on
    # whitespace keeps the original word boundaries.
    return " ".join(node.get_text().split())


def element_to_markdown(elem) -> str:
    """Convert the element children of *elem* to Markdown.

    Direct children are handled by tag name:

    * ``h1``-``h6`` become ``#``-prefixed headings of the matching level.
    * ``p`` becomes a plain paragraph.
    * ``ul`` / ``ol`` emit one ``-`` or numbered line per direct ``li`` child;
      items without text are skipped and do not consume a number.
    * ``a`` becomes ``[text](href)``, or just the text when there is no href.
    * Any other tag is converted recursively; if that yields nothing (the tag
      only wraps bare text), the tag's own text is emitted as a paragraph so
      the content is not lost.

    Bare text nodes that are direct children of *elem* and children without
    any text are skipped.

    Args:
        elem: A parsed element exposing ``.children`` (a BeautifulSoup tag).

    Returns:
        The generated blocks joined by blank lines; ``""`` if nothing was
        produced.
    """
    blocks = []
    for child in elem.children:
        if isinstance(child, NavigableString):
            continue
        tag = child.name.lower()
        text = _inline_text(child)
        if not text:
            continue
        if tag in _HEADING_TAGS:
            blocks.append(f"{'#' * int(tag[1])} {text}")
        elif tag == "p":
            blocks.append(text)
        elif tag in ("ul", "ol"):
            ordered = tag == "ol"
            number = 0
            for li in child.find_all("li", recursive=False):
                item = _inline_text(li)
                if not item:
                    # An empty <li> would otherwise emit a bare "-" / "1." line.
                    continue
                number += 1
                prefix = f"{number}." if ordered else "-"
                blocks.append(f"{prefix} {item}")
        elif tag == "a":
            href = child.get("href", "").strip()
            blocks.append(f"[{text}]({href})" if href else text)
        else:
            nested = element_to_markdown(child)
            # `text` is known to be non-empty here, so a wrapper whose
            # recursion produced nothing still contributes its own text.
            blocks.append(nested or text)
    return "\n\n".join(blocks)
# ─── Fallback scraper using requests ───
def scrape_with_requests(url, selector=None):
    """Download *url* with ``requests`` and convert its content to Markdown.

    The content container is chosen in order of preference: the first match
    for *selector* (when one is given), the first ``<main>`` element, then
    ``<body>``.

    Args:
        url: Address of the page to download.
        selector: Optional CSS selector of the content container.

    Returns:
        The Markdown produced by ``element_to_markdown`` for the chosen
        container, or ``""`` when the document has none of the candidates
        (for example a fragment without ``<body>``).

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            error status reported by ``raise_for_status()``.
    """
    # NOTE(security): verify=False turns off TLS certificate verification for
    # this request. It is kept to preserve existing behavior; review whether
    # the target sites really need it.
    resp = requests.get(url, timeout=15, verify=False)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    clean_boilerplate(soup)

    container = soup.select_one(selector) if selector else None
    if container is None:
        container = soup.select_one("main")
    if container is None:
        container = soup.body
    if container is None:
        # Nothing to convert; avoid passing None to element_to_markdown.
        return ""
    return element_to_markdown(container)
# ─── Main async scraper ───
def _scrape_phone_numbers(url) -> str:
    """Fetch *url* with requests and return its phone numbers as a Markdown list.

    Matches of ``PHONE_REGEX`` in the raw HTML are whitespace-normalized,
    de-duplicated, sorted, and rendered one ``- `` bullet per line.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            error status reported by ``raise_for_status()``.
    """
    # NOTE(security): verify=False turns off TLS certificate verification;
    # kept to preserve existing behavior.
    resp = requests.get(url, timeout=10, verify=False)
    resp.raise_for_status()
    # The pattern allows whitespace inside a number, so a match can contain
    # newlines; collapse them so each number stays on a single bullet line.
    phones = {" ".join(ph.split()) for ph in re.findall(PHONE_REGEX, resp.text)}
    return "\n".join(f"- {ph}" for ph in sorted(phones))


async def _render_with_playwright(page, url, sel) -> str:
    """Load *url* in *page* and return the Markdown of its content container.

    Raises:
        ValueError: If neither *sel* nor ``main`` matches an element.
    """
    await page.goto(url, timeout=15000)
    # Fixed pause after navigation before the DOM is read.
    await page.wait_for_timeout(3000)
    soup = BeautifulSoup(await page.content(), "html.parser")
    clean_boilerplate(soup)
    container = soup.select_one(sel) if sel else None
    if container is None:
        container = soup.select_one("main")
    if container is None:
        raise ValueError(f"selector `{sel}` not found")
    return element_to_markdown(container)


async def _scrape_page(page, url, sel):
    """Return ``(markdown, used_fallback)`` for *url*, trying Playwright first."""
    try:
        return await _render_with_playwright(page, url, sel), False
    except Exception as e:
        # Deliberately broad: any Playwright or parsing failure (timeouts
        # included) should trigger the requests fallback, not end the run.
        print(f" [Info] Playwright failed ({e}); falling back to requests")
    try:
        return scrape_with_requests(url, selector=sel), True
    except Exception as re_err:
        print(f" [Error] Requests fallback failed: {re_err}")
        return "", True


def _save_markdown(name, url, md_body, used_fallback) -> None:
    """Write *md_body* with YAML front matter to ``OUTPUT_DIR/<name>.md``.

    Prints a warning and writes nothing when *md_body* is blank.
    """
    if not md_body.strip():
        print(f" [Warning] No content extracted for {name}")
        return
    fm = {"source": url, "scraped_at": time.strftime("%Y-%m-%d"), "title": name}
    full_md = f"---\n{yaml.safe_dump(fm)}---\n\n{md_body.strip()}"
    out_file = OUTPUT_DIR / f"{name}.md"
    out_file.write_text(full_md, encoding="utf-8")
    branch = "requests" if used_fallback else "playwright"
    print(f" ✔ Saved via {branch} → {out_file.name}")


async def scrape_and_clean():
    """Scrape every entry in ``SITES`` and save each result as Markdown.

    Sites are processed one after another in a single headless Chromium page.
    ``Telemanas_Helplines`` is special-cased: only phone numbers are extracted
    from its HTML via ``requests``. Every other site is rendered with
    Playwright, falling back to ``scrape_with_requests`` on any failure. A
    site that fails or yields no content is reported and skipped; it does not
    stop the remaining sites. The browser is closed even if an unexpected
    error escapes the loop.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            for site in SITES:
                name, url, sel = site["name"], site["url"], site["selector"]
                print(f"→ Scraping {name} …")
                if name == "Telemanas_Helplines":
                    # Telemanas: only phone numbers via regex
                    used_fallback = True
                    try:
                        md_body = _scrape_phone_numbers(url)
                    except requests.RequestException as err:
                        print(f" [Error] Requests failed: {err}")
                        md_body = ""
                else:
                    md_body, used_fallback = await _scrape_page(page, url, sel)
                _save_markdown(name, url, md_body, used_fallback)
        finally:
            await browser.close()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(scrape_and_clean())