-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patharticleCleanUp.py
More file actions
66 lines (61 loc) · 2 KB
/
articleCleanUp.py
File metadata and controls
66 lines (61 loc) · 2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from os import getenv
import pymssql
import csv
import spacy
import re
import sys
from spacy.attrs import ORTH
import numpy
import os
from spacy import en
from operator import itemgetter
from collections import Counter
import textCleanUp
import agent_sort
import spacyStopWords
# Module-level state shared with articleCleanup() below. All of these lists
# are created once at import time and are never reset in this file, so their
# contents persist across calls.

# Header row ("Id", "Text", "Count"); not referenced again in the visible code.
wordCount = [("Id","Text", "Count")]
# Passed to textCleanUp.textCleanup() and textCleanUp.frequencyCount().
allWords = []
# spaCy pipeline used to parse every input line.
nlp = spacy.load("en")
# Passed to textCleanUp.frequencyCount(); presumably filled in by it
# (NOTE(review): confirm against textCleanUp).
repeatedWords = []
# Passed to textCleanUp.frequencyCount(); afterwards articleCleanup() drops
# every word that appears in this list.
uniqueWords = []
# Passed to textCleanUp.frequencyCount(); presumably filled in by it
# (NOTE(review): confirm against textCleanUp).
allWordsFrequency = []
# Must run after `nlp` is loaded; presumably registers the project's stop
# words on the pipeline (NOTE(review): confirm against spacyStopWords).
spacyStopWords.stopWordsList(nlp)
# One list of kept words per input line; appended to by articleCleanup() and
# passed to agent_sort.splitFile().
allTexts = []
def articleCleanup(path):
    """Clean every file found under *path* and pass the result to agent_sort.

    Steps, in order:

    1. Walk *path* recursively and collect every line of every regular file
       (decoded as latin-1) into one list.
    2. Lower-case each line, parse it with the module-level ``nlp`` pipeline
       and run ``textCleanUp.textCleanup`` on it. According to the original
       comment this strips stop words, URLs, punctuation, numbers and other
       non-alphabetical symbols; the cleaned words are collected per line.
    3. Call ``textCleanUp.frequencyCount`` with the module-level word lists.
    4. For each cleaned line, keep only the words that are not in the
       module-level ``uniqueWords`` list and append the result to the
       module-level ``allTexts`` list.
    5. Call ``agent_sort.splitFile(allTexts, folderName)``, where
       ``folderName`` is the last component of *path*.

    Args:
        path: Directory whose files should be processed.

    Returns:
        None. The function works through side effects on the module-level
        lists (``allWords``, ``allTexts``, ...) and through the helper calls.

    NOTE(review): the module-level lists are never cleared here, so calling
    this function more than once accumulates data from earlier calls.
    """
    folderName = os.path.basename(os.path.normpath(path))

    # os.walk already descends into every sub-directory, so no explicit
    # recursion is needed. The earlier per-directory call to read_files()
    # referenced a name that is not defined or imported anywhere in this
    # file and raised NameError as soon as a sub-directory was present.
    doc_set = []
    for root, _dir_names, file_names in os.walk(path):
        for file_name in file_names:
            file_path = os.path.join(root, file_name)
            if not os.path.isfile(file_path):
                continue
            # The context manager closes the handle even if reading fails.
            with open(file_path, encoding="latin-1") as f:
                doc_set.extend(f)

    # Clean each line; textCleanup() fills cleanText (and allWords) in place.
    lineText = []
    for line in doc_set:
        sentence = nlp(line.lower())
        cleanText = []
        textCleanUp.textCleanup(allWords, sentence, cleanText)
        lineText.append(cleanText)

    textCleanUp.frequencyCount(
        nlp,
        allWords,
        repeatedWords,
        uniqueWords,
        allWordsFrequency,
        folderName,
    )

    # Keep only the words that are not listed in uniqueWords.
    for line in lineText:
        allTexts.append([word for word in line if word not in uniqueWords])

    agent_sort.splitFile(allTexts, folderName)