pyMSSQL/articleCleanUp.py at master · FoodSentimentObservatory/pyMSSQL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from os import getenv
import pymssql
import csv
import spacy
import re
import sys
from spacy.attrs import ORTH
import numpy
import os
from spacy import en
from operator import itemgetter
from collections import Counter
import textCleanUp
import agent_sort
import spacyStopWords


# Shared spaCy pipeline, loaded once at import time and reused for every line.
nlp = spacy.load("en")
# Presumably registers the project's custom stop words on the pipeline --
# confirm in spacyStopWords.
spacyStopWords.stopWordsList(nlp)

# Looks like the header row of an (id, text, count) table; it is not read
# anywhere in this section of the file.
wordCount = [("Id", "Text", "Count")]

# Module-level accumulators. articleCleanup() passes these to the textCleanUp
# helpers and appends to allTexts itself; none of them is reset between calls.
allWords = []
repeatedWords = []
uniqueWords = []
allWordsFrequency = []
allTexts = []

def articleCleanup(path):
    """Clean every file found under *path* and pass the result to agent_sort.

    The folder is walked recursively, every line of every file is lower-cased,
    parsed with the shared spaCy pipeline and cleaned by
    ``textCleanUp.textCleanup``. After ``textCleanUp.frequencyCount`` has run,
    each cleaned line is filtered against the module-level ``uniqueWords`` list
    and appended to the module-level ``allTexts``, which is finally handed to
    ``agent_sort.splitFile`` together with the folder name.

    Args:
        path: Directory to process. Its last path component is used as the
            folder name passed to the helper modules.

    Returns:
        None. Results are delivered through the module-level lists and the
        ``agent_sort.splitFile`` call.

    NOTE(review): ``allWords`` and ``allTexts`` are module-level and are not
    cleared here, so a second call in the same process builds on the data of
    the first one -- confirm this is intended.
    """
    folderName = os.path.basename(os.path.normpath(path))

    # Read every line of every file in the tree. os.walk already descends into
    # subdirectories, so no separate per-directory call is needed (the former
    # call to the undefined name `read_files` raised NameError as soon as a
    # subdirectory existed).
    doc_set = []
    for root, _dir_names, file_names in os.walk(path):
        for file_name in file_names:
            file_path = os.path.join(root, file_name)
            if not os.path.isfile(file_path):
                continue
            # The context manager closes the file even if reading fails.
            with open(file_path, encoding="latin-1") as f:
                doc_set.extend(f)

    # Clean each line; per the original comment, textCleanup removes stop
    # words, URLs, punctuation, numbers and other non-alphabetical symbols.
    # It receives allWords and a fresh list per line to fill in.
    lineText = []
    for line in doc_set:
        sentence = nlp(str(line.lower()))
        cleanText = []
        textCleanUp.textCleanup(allWords, sentence, cleanText)
        lineText.append(cleanText)

    # Presumably fills repeatedWords / uniqueWords / allWordsFrequency from
    # allWords -- confirm in textCleanUp.frequencyCount.
    textCleanUp.frequencyCount(nlp, allWords, repeatedWords, uniqueWords,
                               allWordsFrequency, folderName)

    # Keep only the words that are not listed in uniqueWords, line by line.
    for line in lineText:
        allTexts.append([word for word in line if word not in uniqueWords])

    agent_sort.splitFile(allTexts, folderName)