-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsoup.py
More file actions
77 lines (68 loc) · 2.4 KB
/
soup.py
File metadata and controls
77 lines (68 loc) · 2.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import errno
import os
import subprocess
import urllib.request
from socket import error as SocketError

import requests
from bs4 import BeautifulSoup
def search(inputFile, match):
    """Locate every occurrence of ``match`` in the lines of ``inputFile``.

    Args:
        inputFile: Iterable yielding lines of text, e.g. an open text file
            or a list of strings.
        match: Substring to look for. The comparison is case-sensitive.

    Returns:
        A list containing the 1-based line number of every occurrence (a
        line is listed once per occurrence, overlapping occurrences
        included), followed by the total number of occurrences as the last
        element. When nothing is found the result is ``[0]``.
    """
    lineNo = []
    width = len(match)
    for line, text in enumerate(inputFile, start=1):
        # Every start offset is tried so overlapping occurrences are counted.
        # Slicing never reads past the end of the line, so a partial match at
        # the end of a line that lacks a trailing newline cannot raise
        # IndexError (the index-by-index comparison used previously did).
        for start in range(len(text)):
            if text[start:start + width] == match:
                lineNo.append(line)
    # At this point the list holds exactly one entry per occurrence, so its
    # length is the total that callers expect as the final element.
    lineNo.append(len(lineNo))
    return lineNo
# Download stage. Disabled by the constant-false guard; switch it to 1 to run.
# Fetches the index page, follows every <a class="name"> link found on it,
# downloads each PDF attachment linked from those pages and appends the local
# file name to pdfFileNames.txt, which the search stage reads.
if 0:
    url = "https://icann562016.sched.org/"
    html1 = requests.get(url)
    soup = BeautifulSoup(html1.content)
    primaryLinks = soup.find_all('a', class_="name")
    for link in primaryLinks:
        # Drop the last character of the base URL (its trailing "/") before
        # appending the href, then terminate the page URL with "/".
        pageUrl = url[:-1] + link.get('href') + '/'
        print("Searching PDF files in: " + pageUrl)
        html = requests.get(pageUrl)
        newSoup = BeautifulSoup(html.content)
        pdfLinks = newSoup.find_all('a', class_="file-uploaded file-uploaded-pdf")
        for pdf in pdfLinks:
            # The local name is the link text minus its last character.
            # NOTE(review): presumably that character is trailing whitespace
            # in the anchor text - confirm against the live markup.
            localName = pdf.string[:-1]
            print("Downloading " + pdf.get('href') + ":")
            urllib.request.urlretrieve(pdf.get('href'), "/home/buridi/Desktop/SOUP/" + localName)
            print(pdf.string + "Downloaded Succesfully!!")
            # Record the name right after each download; the ``with`` block
            # flushes and closes the list file instead of leaking one open
            # handle per PDF.
            with open("/home/buridi/Desktop/SOUP/" + "pdfFileNames.txt", "a") as filepdfs:
                filepdfs.write(localName + "\n")
# Search stage. For every PDF name listed in pdfFileNames.txt: convert the PDF
# to text with pdftotext, count the occurrences of "jurisdiction" with
# search(), and append the total plus the matching line numbers to
# "<name>-Log.txt". Names of files with at least one match are printed.
if 1:
    baseDir = "/home/buridi/Desktop/SOUP/"
    with open(baseDir + "pdfFileNames.txt", "r") as openFiles:
        for file in openFiles:
            # Strip only the line terminator, so a final line without a
            # newline keeps its last character.
            fileName = file.rstrip("\n")
            if not fileName:
                # Blank line in the list: there is no PDF to convert.
                continue
            txtPath = baseDir + fileName + ".txt"
            # Argument-list form: the name goes to pdftotext verbatim, with
            # no shell involved, so no character escaping is needed and a
            # hostile file name cannot inject commands. The output path is
            # passed explicitly so the file read below is the one written.
            # check=False keeps the conversion best-effort, as before; a
            # failed conversion surfaces when the text file is opened.
            subprocess.run(["pdftotext", baseDir + fileName, txtPath], check=False)
            with open(txtPath) as fileRead:
                # Searching
                lineNo = search(fileRead, "jurisdiction")
            with open(baseDir + fileName + "-Log.txt", "a") as fileLog:
                # search() puts the total count in the last element.
                fileLog.write("Total No of Matches found : " + str(lineNo[-1]) + "\n")
                if len(lineNo) > 1:
                    print(fileName)
                    for no in lineNo[:-1]:
                        fileLog.write("Match found in Line No :" + str(no) + "\n")
print("All Downloads Complete!!")