SemEval2019-Task4/clean_shuffle.py at master · michaelfaerber/SemEval2019-Task4 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#------------------------------------------------------------------------------------
# Name:        Clean and shuffle data from input file and
# Purpose:     This module is used create a pandas dataframe from the input
#              file and to clean it in various ways.
#
# Execution:   Not executable as a standalone program (collection of functions)
#
# Author:      Ashwath Sampath
#
# Created:     22-11-2018 (V1.0): Moved from common program to a separate module
# Revisions:   2-12-2018 (V1.1): Added id column to df (pd.read_csv).
#                                Added sem_eval_dir, writing/reading df from pickle
#              13-12-2018 (V1.2): New function to create a df without hyperpartisan.
#------------------------------------------------------------------------------------


import pandas as pd
import re
#from nltk.corpus import stopwords
import csv
from gensim.parsing import preprocessing
import contractions
# import inflect
from sklearn.utils import shuffle
from tqdm import tqdm
import pickle
import os
from nltk import tokenize
import math

def isfloat(value):
  try:
    float(value)
    return True
  except ValueError:
    return False

def clean_text(text):
    """ Cleans the text in the only argument in various steps
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    if isfloat(text):
        try:
            if math.isnan(text):
                return ''
        except TypeError:
            print('text: {}'.format(text))
            return ''

    # Replace newlines by space. We want only one doc vector.
    text = text.replace('\n', ' ').lower()
    # Expand contractions: you're to you are and so on.
    text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove html tags and numbers: can numbers possible be useful?
    text = preprocessing.strip_tags(preprocessing.strip_numeric(text))
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(preprocessing.strip_punctuation(text))
    #text = re.sub(r'[^\w\s]', '', text.lower())
    # STEMMING (Porter) automatically lower-cases as well
    # To stem or not to stem, that is the question
    #text = preprocessing.stem_text(text)
    return text

def read_prepare_df(filename, file_path='', should_clean_text=True):
    """ Read a file, put it in a dataframe. Drop unnecessary columns, clean the content.
    Please provide an absolute path.
    ARGUMENTS: filename: path to the input file, string
    RETURNS: df: a 'cleaned' Pandas dataframe with 3 columns (content, title and hyperpartisan) in
                 which nulls in content/title have been dropped"""
    if os.path.isfile(file_path):
      df = pd.read_pickle(file_path)
    else:
        df = pd.read_csv(filename, sep='\t', encoding='utf-8', names=['id', 'title','content','hyperpartisan'])
        print("Original DF shape = {}".format(df.shape))
        # url is useless, remove it. Remove bias too. I no longer have them.
        #df = df.drop(['url', 'bias'], axis=1)
        # Drop NaNs!!
        # df = df[pd.notnull(df['content'])]
        # df = df[pd.notnull(df['title'])]
        # Question: should I combine the title and content in one field?
        if should_clean_text:
            print('Cleaning content...')
            df.content = df['content'].apply(clean_text)
            df.title = df['title'].apply(clean_text)
        # Shuffle it
        df = shuffle(df, random_state=13)
        print("Dataframe shape after cleaning = {}".format(df.shape))
        if file_path:
            print('Writing dataframe to disk...')
            df.to_pickle(file_path)

    # Shuffle randomly
    df = df.sample(frac=1).reset_index(drop=True)
    return df

def read_prepare_sentence_df(filename, file_path):
  if False: #os.path.isfile(file_path):
    df = pd.read_pickle(file_path)
  else:
    df = pd.read_csv(filename, sep='\t', encoding='utf-8', names=['id', 'title','content','hyperpartisan'])
    print("Original DF shape = {}".format(df.shape))
    # url is useless, remove it. Remove bias too. I no longer have them.
    #df = df.drop(['url', 'bias'], axis=1)
    # Drop NaNs!!
    # df = df[pd.notnull(df['content'])]
    # df = df[pd.notnull(df['title'])]

    df['text'] = df['title'] + '. ' + df['content']

    print(df.columns)

    sentences = []
    for index, article in df.iterrows():
        article_id = article['id']
        article_hyperpartisan = article['hyperpartisan']
        sent_text = tokenize.sent_tokenize(article['text'])
        for sentence in sent_text:
            sentences.append({ 'id': article_id, 'hyperpartisan': article_hyperpartisan, 'text': sentence })
    df = pd.DataFrame(sentences)

    print('Cleaning content...')
    df['text'] = df['text'].apply(clean_text)

    # Shuffle it
    df = shuffle(df, random_state=13)
    print("Dataframe shape after cleaning = {}".format(df.shape))
    if file_path:
        print('Writing dataframe to disk...')
        df.to_pickle(file_path)
  return df

def read_prepare_test_df(filename, file_path=''):
    """ Read a file, put it in a dataframe. Drop unnecessary columns, clean the content.
    Please provide an absolute path.
    ARGUMENTS: filename: path to the input file, string
    RETURNS: df: a 'cleaned' Pandas dataframe with 2 columns (id, content, title) in
                 which nulls in content/title have been dropped"""
    if os.path.isfile(file_path):
      df = pd.read_pickle(file_path)
    else:
        df = pd.read_csv(filename, sep='\t', encoding='utf-8', names=['id', 'title','content'])
        print("Original DF shape = {}".format(df.shape))
        # Drop NaNs!!
        # df = df[pd.notnull(df['content'])]
        # df = df[pd.notnull(df['title'])]
        # Question: should I combine the title and content in one field?
        print('Cleaning content...')
        df.content = df['content'].apply(clean_text)
        df.title = df['title'].apply(clean_text)
        # Shuffle it
        df = shuffle(df, random_state=13)
        print("Dataframe shape after cleaning = {}".format(df.shape))
        if file_path:
            print('Writing dataframe to disk...')
            df.to_pickle(file_path)
    return df