forked from adgsenpai/ExtractingMedPages
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextractLinks.py
More file actions
72 lines (65 loc) · 2.64 KB
/
extractLinks.py
File metadata and controls
72 lines (65 loc) · 2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from bs4 import BeautifulSoup
import csv
# Embedded sample of the page markup to scrape. Paste your HTML content here
# (or load it from a file instead). Each table row holds one <a> link whose
# href carries a `servicecode` query parameter, followed by a badge <span>
# containing a count.
html = """
<table class="table-condensed table-dropdown">
<tbody>
<tr>
<td><a href="index.php?page=servicecountry&servicecode=143" class="larger-body-16">Psychologist</a></td>
<td><span class="badge">231</span></td>
</tr>
<tr>
<td><a href="index.php?page=servicecountry&servicecode=496" class="larger-body-16">Psychologist - Clinical</a></td>
<td><span class="badge">3,147</span></td>
</tr>
<tr>
<td><a href="index.php?page=servicecountry&servicecode=497" class="larger-body-16">Psychologist - Counselling</a></td>
<td><span class="badge">1,603</span></td>
</tr>
<tr>
<td><a href="index.php?page=servicecountry&servicecode=498" class="larger-body-16">Psychologist - Educational</a></td>
<td><span class="badge">1,488</span></td>
</tr>
<tr>
<td><a href="index.php?page=servicecountry&servicecode=499" class="larger-body-16">Psychologist - Industrial</a></td>
<td><span class="badge">421</span></td>
</tr>
<tr>
<td><a href="index.php?page=servicecountry&servicecode=956" class="larger-body-16">Psychologist - Neuro</a></td>
<td><span class="badge">70</span></td>
</tr>
<tr>
<td><a href="index.php?page=servicecountry&servicecode=500" class="larger-body-16">Psychologist - Research</a></td>
<td><span class="badge">34</span></td>
</tr>
<tr>
<td><a href="index.php?page=servicecountry&servicecode=455" class="larger-body-16">Psychometrist</a></td>
<td><span class="badge">459</span></td>
</tr>
<tr>
<td><a href="index.php?page=servicecountry&servicecode=554" class="larger-body-16">Registered Counsellor</a></td>
<td><span class="badge">1,493</span></td>
</tr>
</tbody>
</table>
"""
# Parse the embedded HTML snippet using Python's built-in parser backend.
soup = BeautifulSoup(html, "html.parser")

# Prefix that is prepended verbatim to every extracted href.
# The separator before "regioncode" must be a literal "&": writing it so that
# "&reg" gets decoded as an HTML entity turns it into "®" and corrupts the
# query string.
base_url = "https://www.medpages.info/sf/index.php?page=listing&servicecode=857&countryid=1&regioncode=&subregioncode="

# Single source of truth for the output path, so the file that is written and
# the file named in the success message cannot drift apart.
output_path = "medpages_mental_health.csv"

# Collect (link text, full URL) pairs for every anchor inside the dropdown
# table; anchors with a missing or empty href are skipped.
# NOTE(review): the hrefs in the snippet are relative ("index.php?page=..."),
# so plain concatenation yields "...subregioncode=index.php?page=...".
# Confirm this is the intended URL shape; urllib.parse.urljoin would resolve
# the href against the site root instead.
links = [
    (a.get_text(strip=True), base_url + a["href"])
    for a in soup.select("table.table-dropdown a")
    if a.get("href")
]

# Save to CSV. newline="" lets the csv module control line endings itself.
with open(output_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Name", "Full URL"])
    writer.writerows(links)

print(f"✅ Extracted {len(links)} links and saved to {output_path}")

# Show the first 5 entries as a quick preview.
for name, url in links[:5]:
    print(name, "→", url)