-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpp_tut.py
More file actions
82 lines (57 loc) · 2.58 KB
/
pp_tut.py
File metadata and controls
82 lines (57 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import matplotlib.pyplot as plt
import pandas as pd
# Input files. data_file is the table loaded below; data_file2 is not used in
# the part of the script visible here.
data_file = "chocolate_data.csv"
data_file2 = "test.csv"

# The file is parsed with a tab delimiter even though its name ends in .csv.
df = pd.read_csv(data_file, sep='\t')
print("The DataFrame shape: {}".format(df.shape))

# NOTE: the bare describe()/tail() calls below only display something in an
# interactive session (REPL / notebook); run as a script their result is
# discarded.
df.describe()

# 'Cocoa Percent' is read as text such as "70%". Strip the percent sign and
# convert the column to a numeric dtype in one step.
# (Original note: this casts to float; consider making them all integers.)
df['Cocoa Percent'] = pd.to_numeric(df['Cocoa Percent'].str.replace('%', ''))

df.describe()  # Now includes stats for Cocoa Percent
df.tail()
# Check for missing values in each column.
# The original notes record 1937 rows for this dataset; the row count is now
# read from the DataFrame instead of being hard-coded, so the report and the
# threshold below stay correct if the input file changes.
pd.isna(df['Bean Type']).head(7)  # Peek at the mask; only displayed interactively

keys = df.keys()
n_rows = len(df)
kp_dict = {}  # column name -> percentage of rows in which the column is missing
for key in keys:  # Alternatively, for col in df.columns:
    missing_vals = pd.isna(df[key]).sum()
    # Guard the division so an empty DataFrame reports 0% instead of NaN.
    missing_vals_pcent = missing_vals * 100 / n_rows if n_rows else 0.0
    print("{}=> {}/{} missing: {}%".format(key, missing_vals, n_rows, missing_vals_pcent))
    kp_dict[key] = missing_vals_pcent
# The per-column missing counts are also available in one call as
# pd.isna(df).sum(); df.count() gives the complementary non-missing counts.

# Drop every column that is missing in more than half of the rows. The
# percentages computed above are reused rather than recounted per column.
sparse_columns = [key for key in keys if kp_dict[key] > 50]
df = df.drop(sparse_columns, axis=1)  # axis=1 drops columns; axis=0 (default) drops rows
# Drop rows that are missing too many of their values.
# A row is discarded when more than 49% of its columns are missing. The
# fraction is taken over the DataFrame's current column count rather than a
# hard-coded 8, and is computed for all rows at once from the boolean
# missing-value mask. Rows are selected by index label, which is also what
# df.drop() expects, so this does not rely on the index being 0..n-1.
row_missing_fraction = pd.isna(df).mean(axis=1)
todrop = row_missing_fraction[row_missing_fraction > 0.49].index.tolist()
total = len(todrop)
df = df.drop(todrop)
print("There are {} rows with too little data".format(total))
# Per the original notes, Cocoa Percent and Rating are the columns to watch
# for missing values here.
df.shape[0] - df.count(axis=0)  # Missing values per column; only displayed interactively

# Box-and-whisker plots for both columns (see also df.describe()).
# Plotting requires matplotlib. Problems galore getting a GUI backend working;
# these links helped:
# https://stackoverflow.com/questions/15884075/tkinter-in-a-virtualenv > For Ubuntu (probably unix-like OSes)
# https://stackoverflow.com/questions/45279754/python-install-tkinter-on-virtualenv-on-linux?noredirect=1&lq=1
# https://stackoverflow.com/questions/40457025/box-plot-of-a-pandas-data-frame
box_axes = []
for column_name in ('Cocoa Percent', 'Rating'):
    box_axes.append(df.boxplot(column=column_name))
    plt.show()  # Actually display the boxplot. This step may cause tremendous headaches!
bp, bp2 = box_axes  # Keep the original names for the two plot handles
# Decide, per column, whether to fill missing values with the mean or the
# median:
#   - Rating has few outliers, so its mean is used.
#   - Cocoa Percent has many outliers, so its median is used instead.
fill_values = (
    ('Rating', df['Rating'].mean()),
    ('Cocoa Percent', df['Cocoa Percent'].median()),
)
for column_name, m in fill_values:
    df[column_name] = df[column_name].fillna(m)

# TODO: plot a Rating histogram to explain why the average is used.
# TODO: comment on how missing values can be data in themselves.