-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathTDG.py
More file actions
58 lines (45 loc) · 1.39 KB
/
TDG.py
File metadata and controls
58 lines (45 loc) · 1.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import urllib.request, urllib.error, urllib.parse
import os
import bleach
from bs4 import BeautifulSoup
import tagrem
from tfidf import TfIdf
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
def index(urls):
    """Download each URL and save its raw content as ./webpages/fileN.html.

    Args:
        urls: list of URL strings to fetch.

    Side effects: creates the ./webpages directory if missing and writes
    one file per URL. Network errors from urlopen propagate to the caller.
    """
    web_dir = os.path.join(os.getcwd(), "webpages")
    # Create the output directory once, before the loop — the original
    # re-checked it on every iteration, after already building the path.
    if not os.path.exists(web_dir):
        os.mkdir(web_dir)
    for i, url in enumerate(urls):
        response = urllib.request.urlopen(url)
        web_content = response.read()
        file_path = os.path.join(web_dir, "file" + str(i) + ".html")
        # BUG FIX: the original wrote `f.close` without parentheses, so the
        # file handle was never closed; `with` guarantees closure.
        with open(file_path, 'wb') as f:
            f.write(web_content)
def word_preprocessing(words):
    """Lowercase and WordNet-lemmatize every token.

    Args:
        words: list of token strings (e.g. a whitespace-split query).

    Returns:
        A new list with each token lowercased and lemmatized.

    BUG FIX: the original discarded the results of `i.lower()` and
    `lemmatizer.lemmatize(i)` — strings are immutable, so it returned the
    input list completely unchanged. This version applies both transforms.
    (Also renamed the parameter, which shadowed the builtin `list`.)
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word.lower()) for word in words]
if __name__ =="__main__":
print("Please enter the URLs saperated by a space" )
urls=input().strip().split()
index(urls)
l1=[]
l2=tagrem.text_parser(urls,l1)
print("Please enter the query saperated by space")
query=input().strip().split()
print("The entered urls are")
for i in range(len(urls)):
print(i +":"+urls[i])
query=word_preprocessing(query)
table = TfIdf()
for i in range(len(urls)):
table.add_document(str(i),l2[i])
x=table.similarities(query)
print("URLs Indexed by cosine rankings are")
for i in x:
a=int(x[i][0])
print(i+":"+urls[a])