-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape-kvb-qrcodes.py
More file actions
55 lines (46 loc) · 1.6 KB
/
scrape-kvb-qrcodes.py
File metadata and controls
55 lines (46 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# encoding: utf-8
"""
Reads KVB (Cologne public transport) stop names and their QR-code IDs
and stores the result in the CSV file
kvb-qr-codes.csv
"""
import requests
from bs4 import BeautifulSoup
import time
import re
import csv
def scrape():
    """Yield (qr_id, stop_name) tuples scraped from the KVB QR-code index.

    Fetches one index page per initial letter (A-Z plus "Ae"/"Ue"),
    extracts every link of the form /qr/<id>/ inside the page's <center>
    element, and yields the numeric id together with the link text (the
    stop name). Sleeps one second between requests to be polite.
    """
    headers = {
        "referer": "http://www.kvb-koeln.de/qr/haltestellen",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.3"
    }
    urlmask = "http://www.kvb-koeln.de/qr/haltestellen/%s/"
    tokens = [
        "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K",
        "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V",
        "W", "X", "Y", "Z", "Ae", "Ue"]
    # Compile once instead of re-matching the pattern string per link.
    href_pattern = re.compile(r"\/qr\/([0-9]+)\/")
    for token in tokens:
        url = urlmask % token
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            # Explicit parser avoids bs4's "no parser specified" warning
            # and keeps parsing consistent across environments.
            soup = BeautifulSoup(r.text, "html.parser")
            content = soup.find("center")
            # Guard: a page without the expected <center> container would
            # otherwise raise AttributeError on find_all.
            if content is not None:
                for a in content.find_all("a"):
                    match = href_pattern.match(str(a.get("href")))
                    if match is not None:
                        yield (int(match.group(1)), a.text)
        time.sleep(1)
def export(stations, path):
    """Write station (id, name) pairs to a CSV file with an "id,name" header.

    Parameters:
        stations: iterable of (id, name) tuples; id is an int,
            name is a text string.
        path: destination file path for the CSV output.
    """
    # Text mode with newline="" is the documented way to use the csv module
    # on Python 3; the original binary mode ("wb") raises TypeError there,
    # and name.encode("utf8") would write b'...' reprs into the file.
    # utf-8 keeps umlauts in stop names intact.
    with open(path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["id", "name"])
        for station_id, name in stations:
            writer.writerow([str(station_id), name])
if __name__ == "__main__":
    # Drain the scraper generator fully, then write everything in one pass.
    export(list(scrape()), "kvb-qr-codes.csv")