77from dataset import AuxTables
88from dataset .table import Table , Source
99
# Per-attribute query counting the cells whose raw value differs from the
# clean (ground truth) value.
# Placeholders:
#   $raw_table   - wide table holding the raw data, one column per attribute
#   $clean_table - long table of (_tid_, _attribute_, _value_) clean cells
#   $attr        - attribute (column) name being compared
errors_template = Template("""
    SELECT
        count(*)
    FROM
        $raw_table as t1
    INNER JOIN
        $clean_table as t2
    ON
        t1._tid_ = t2._tid_
        AND t2._attribute_ = '$attr'
        AND t1."$attr" != t2._value_
""")
1522
"""
The 'errors' aliased subquery returns the (_tid_, _attribute_, _value_)
of each clean-data cell whose value differs from the raw data.

We then count the number of cells that we repaired to the correct ground
truth value.
"""
# Per-attribute query counting the erroneous cells whose inferred value
# equals the clean (ground truth) value.
# The inner 'errors' subquery selects the clean cells that differ from the
# raw data; it is then joined against the inferred values on cell identity
# and on value equality.
# Placeholders:
#   $raw_table   - wide table holding the raw data, one column per attribute
#   $clean_table - long table of (_tid_, _attribute_, _value_) clean cells
#   $inf_dom     - table of inferred values (_tid_, attribute, rv_value)
#   $attr        - attribute (column) name being compared
correct_repairs_template = Template("""
    SELECT
        count(*)
    FROM (
        SELECT
            t2._tid_,
            t2._attribute_,
            t2._value_
        FROM
            $raw_table AS t1
        INNER JOIN
            $clean_table AS t2
        ON
            t1._tid_ = t2._tid_
            AND t2._attribute_ = '$attr'
            AND t1."$attr" != t2._value_
    ) AS errors
    INNER JOIN
        $inf_dom AS repairs
    ON
        errors._tid_ = repairs._tid_
        AND errors._attribute_ = repairs.attribute
        AND errors._value_ = repairs.rv_value
""")
3557
3658
3759class EvalEngine :
@@ -64,7 +86,7 @@ def load_data(self, name, fpath, tid_col, attr_col, val_col, na_values=None):
6486
6587 def evaluate_repairs (self ):
6688 self .compute_total_repairs ()
67- self .compute_total_repairs_grdt ()
89+ self .compute_total_repairs_clean ()
6890 self .compute_total_errors ()
6991 self .compute_detected_errors ()
7092 self .compute_correct_repairs ()
@@ -79,10 +101,10 @@ def eval_report(self):
79101 tic = time .clock ()
80102 try :
81103 prec , rec , rep_recall , f1 , rep_f1 = self .evaluate_repairs ()
82- report = "Precision = %.2f, Recall = %.2f, Repairing Recall = %.2f, F1 = %.2f, Repairing F1 = %.2f, Detected Errors = %d, Total Errors = %d, Correct Repairs = %d, Total Repairs = %d, Total Repairs (Grdth present ) = %d" % (
83- prec , rec , rep_recall , f1 , rep_f1 , self .detected_errors , self .total_errors , self .correct_repairs , self .total_repairs , self .total_repairs_grdt )
104+ report = "Precision = %.2f, Recall = %.2f, Repairing Recall = %.2f, F1 = %.2f, Repairing F1 = %.2f, Detected Errors = %d, Total Errors = %d, Correct Repairs = %d, Total Repairs = %d, Total Repairs (clean data ) = %d" % (
105+ prec , rec , rep_recall , f1 , rep_f1 , self .detected_errors , self .total_errors , self .correct_repairs , self .total_repairs , self .total_repairs_clean )
84106 report_list = [prec , rec , rep_recall , f1 , rep_f1 , self .detected_errors , self .total_errors ,
85- self .correct_repairs , self .total_repairs , self .total_repairs_grdt ]
107+ self .correct_repairs , self .total_repairs , self .total_repairs_clean ]
86108 except Exception as e :
87109 logging .error ("ERROR generating evaluation report %s" % e )
88110 raise
@@ -91,70 +113,78 @@ def eval_report(self):
91113 return report , report_time , report_list
92114
def compute_total_repairs(self):
    """
    Count the repairs made and store the count in self.total_repairs.

    A repair is a cell whose initial value (cell domain table) differs from
    its inferred value (inferred values table). Per the original note, this
    count covers the entities in the TRAINING data.

    The result is stored as a float; nothing is returned.
    """
    # TODO(richardwu): how do we define a "repair" if we have multiple
    # init values?
    query = """
        SELECT
            count(*)
        FROM
            {cell_domain} AS t1
        INNER JOIN
            {inf_values_dom} as t2
        ON
            t1._tid_ = t2._tid_
            AND t1.attribute = t2.attribute
        WHERE
            t1.init_values != t2.rv_value
    """.format(cell_domain=AuxTables.cell_domain.name,
               inf_values_dom=AuxTables.inf_values_dom.name)
    res = self.ds.engine.execute_query(query)
    # Single-row, single-column result: the count itself.
    self.total_repairs = float(res[0][0])
114139
def compute_total_repairs_clean(self):
    """
    Count the repairs on cells that have clean data and store the count in
    self.total_repairs_clean.

    A repair is a cell whose initial value differs from its inferred value.
    The extra join against the clean data table keeps only cells for which
    a clean (TEST) value exists.

    The result is stored as a float; nothing is returned.
    """
    # TODO(richardwu): how do we define a "repair" if we have multiple
    # init values?
    query = """
        SELECT
            count(*)
        FROM
            {cell_domain} AS t1
        INNER JOIN
            {inf_values_dom} AS t2
        ON
            t1._tid_ = t2._tid_
            AND t1.attribute = t2.attribute
        INNER JOIN
            {clean_data} AS t3
        ON
            t1._tid_ = t3._tid_
            AND t1.attribute = t3._attribute_
        WHERE
            t1.init_values != t2.rv_value
    """.format(cell_domain=AuxTables.cell_domain.name,
               inf_values_dom=AuxTables.inf_values_dom.name,
               clean_data=self.clean_data.name)
    res = self.ds.engine.execute_query(query)
    # Single-row, single-column result: the count itself.
    self.total_repairs_clean = float(res[0][0])
140170
def compute_total_errors(self):
    """
    Count the cells whose raw value differs from the clean value and store
    the count in self.total_errors.

    One query is built per attribute from errors_template; the per-attribute
    counts are summed. The result is stored as a float; nothing is returned.
    """
    queries = [
        errors_template.substitute(raw_table=self.ds.raw_data.name,
                                   clean_table=self.clean_data.name,
                                   attr=attr)
        for attr in self.ds.get_attributes()
    ]
    results = self.ds.engine.execute_queries(queries)
    # Each result is a single-row, single-column count. Start from 0.0 so
    # the total is a float even when there are no attributes.
    self.total_errors = sum((float(res[0][0]) for res in results), 0.0)
152182
153- def compute_total_errors_grdt (self ):
183+ def compute_total_errors_clean (self ):
154184 queries = []
155185 total_errors = 0.0
156186 for attr in self .ds .get_attributes ():
157- query = errors_template .substitute (init_table = self .ds .raw_data .name , grdt_table = self .clean_data .name ,
187+ query = errors_template .substitute (raw_table = self .ds .raw_data .name , clean_table = self .clean_data .name ,
158188 attr = attr )
159189 queries .append (query )
160190 results = self .ds .engine .execute_queries (queries )
@@ -163,6 +193,11 @@ def compute_total_errors_grdt(self):
163193 self .total_errors = total_errors
164194
165195 def compute_detected_errors (self ):
196+ """
197+ compute_detected_errors
198+ """
199+ # TODO(richardwu): how do we define a "repair" if we have multiple
200+ # init values?
166201 query = """
167202 SELECT
168203 count(*)
@@ -177,7 +212,7 @@ def compute_detected_errors(self):
177212 t1._tid_ = t2._tid_
178213 AND t1._cid_ = t3._cid_
179214 AND t1.attribute = t2._attribute_
180- AND t1.current_value != t2._value_
215+ AND t1.init_values != t2._value_
181216 ) AS t
182217 """ .format (cell_domain = AuxTables .cell_domain .name ,
183218 clean_data = self .clean_data .name ,
@@ -189,7 +224,7 @@ def compute_correct_repairs(self):
189224 queries = []
190225 correct_repairs = 0.0
191226 for attr in self .ds .get_attributes ():
192- query = correct_repairs_template .substitute (init_table = self .ds .raw_data .name , grdt_table = self .clean_data .name ,
227+ query = correct_repairs_template .substitute (raw_table = self .ds .raw_data .name , clean_table = self .clean_data .name ,
193228 attr = attr , inf_dom = AuxTables .inf_values_dom .name )
194229 queries .append (query )
195230 results = self .ds .engine .execute_queries (queries )
@@ -208,9 +243,9 @@ def compute_repairing_recall(self):
208243 return self .correct_repairs / self .detected_errors
209244
def compute_precision(self):
    """
    Return the precision: correct repairs divided by the total repairs made
    on cells that have clean data.

    Reads self.correct_repairs and self.total_repairs_clean. Returns 0 when
    no such repairs were made, avoiding a division by zero.
    """
    if self.total_repairs_clean == 0:
        return 0
    return self.correct_repairs / self.total_repairs_clean
214249
215250 def compute_f1 (self ):
216251 prec = self .compute_precision ()
0 commit comments