-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathTextMining.py
More file actions
56 lines (50 loc) · 1.62 KB
/
TextMining.py
File metadata and controls
56 lines (50 loc) · 1.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#Victor Bianchi - Software Design - Fall 2017
#Mini-Project #3
#Wikipedia trivia: if you take any article, click on the first link in the article text
#not in parentheses or italics, and then repeat, you will eventually end up at ‘Philosophy’.
import sys
import urllib
import urllib.error
import urllib.parse
import urllib.request
from urllib.request import urlopen

from bs4 import BeautifulSoup
WIKI_ROOT = "http://en.wikipedia.org/wiki/"
RANDOM_PAGE = "Special:Random"
GOAL = "Philosophy"
MAX_STEPS = 50


def article_url(title):
    """Return the Wikipedia URL for the article called *title*.

    Runs of whitespace become single underscores and the result is
    percent-encoded, so titles containing spaces, '?', '#' or non-ASCII
    characters still produce a URL that urlopen accepts.
    """
    slug = "_".join(title.split())
    # ':' and parentheses are left readable ("Special:Random",
    # "Mercury_(planet)"); everything else unsafe is escaped.
    return WIKI_ROOT + urllib.parse.quote(slug, safe="/:(),'!*")


def first_link_title(html):
    """Return the title of the first titled link in the article text.

    Paragraphs inside the ``mw-content-text`` div are scanned in document
    order; the first ``<a>`` carrying a non-empty ``title`` attribute wins.
    Returns None when the page has no such link, so the caller can stop
    instead of crashing on a missing element.
    """
    soup = BeautifulSoup(html, "lxml")
    content = soup.find("div", {"id": "mw-content-text"})
    if content is None:
        return None
    for paragraph in content.find_all("p"):
        for anchor in paragraph.find_all("a"):
            title = anchor.get("title")
            if title:
                return title
    return None


def fetch(url):
    """Download *url* and return ``(final_url, body)``.

    The connection is closed before returning. On any URL/HTTP error a
    message is printed and the script exits with status 1.
    """
    try:
        with urlopen(url) as response:
            return response.geturl(), response.read()
    except urllib.error.URLError as err:
        print("Could not open " + url + ": " + str(err))
        sys.exit(1)


def main():
    """Follow first links from a starting article until 'Philosophy'.

    The starting article comes from the command-line arguments; when none
    are given the user is prompted, and a blank answer selects a random
    article. Exits with status 1 if a page has no link to follow, if the
    walk revisits an article, or after MAX_STEPS clicks.
    """
    # Command-line arguments take priority; otherwise ask the user.
    start = " ".join(sys.argv[1:])
    if not start:
        start = input("Please enter the name of the first article (leave blank for random) : ").strip()

    final_url, html = fetch(article_url(start or RANDOM_PAGE))
    if not start:
        # Special:Random redirects to a real article; recover its name from
        # the final URL so the progress output can name the starting point.
        slug = final_url.split("/wiki/", 1)[-1]
        start = urllib.parse.unquote(slug).replace("_", " ")

    print('\n' + start + '\n -->')

    visited = set()
    steps = 0
    while True:
        current = first_link_title(html)
        if current is None:
            print("No link to follow on this page. I quit")
            sys.exit(1)

        print(current)
        steps += 1
        # Check the goal before any limit so that arriving on the last
        # allowed click still counts as success.
        if current == GOAL:
            break
        print(" --> ")

        if current in visited:
            print("Stuck in a loop at " + current + ". I quit")
            sys.exit(1)
        if steps >= MAX_STEPS:
            print("This is taking too long. I quit")
            sys.exit(1)

        visited.add(current)
        _, html = fetch(article_url(current))

    print("\nTo get from {} to {}, it took {} clicks.".format(start, GOAL, steps))


if __name__ == "__main__":
    main()