HoloClean
diff --git a/‎dataset/dataset.py‎
Lines changed: 49 additions & 10 deletions b/‎dataset/dataset.py‎
Lines changed: 49 additions & 10 deletions
diff --git a/‎evaluate/eval.py‎
Lines changed: 85 additions & 50 deletions b/‎evaluate/eval.py‎
Lines changed: 85 additions & 50 deletions
diff --git a/‎holoclean.py‎
Lines changed: 2 additions & 0 deletions b/‎holoclean.py‎
Lines changed: 2 additions & 0 deletions
@@ -135,13 +135,21 @@ def load_data(self, name, fpath, na_values=None, entity_col=None, src_col=None):
     def set_constraints(self, constraints):
         self.constraints = constraints
 
+    def aux_table_exists(self, aux_table):
+        """
+        aux_table_exists returns True if :param aux_table: has been generated.
+
+        :param aux_table: (AuxTables(Enum)) auxiliary table to check
+        """
+        return aux_table in self.aux_tables
+
     def get_aux_table(self, aux_table):
         """
         get_aux_table returns the Table associated with :param aux_table:.
 
-        :param aux_table: (AuxTables(Enum)) auxiliary table to check
+        :param aux_table: (AuxTables(Enum)) auxiliary table to retrieve
         """
-        if aux_table not in self.aux_tables:
+        if not self.aux_table_exists(aux_table):
             raise Exception("{} auxiliary table has not been generated".format(aux_table))
         return self.aux_tables[aux_table]
 
@@ -218,14 +226,14 @@ def get_cell_id(self, tuple_id, attr_name):
 
     def get_statistics(self):
         if not self.stats_ready:
-            self._collect_stats()
+            self.collect_stats()
         stats = (self.total_tuples, self.single_attr_stats, self.pair_attr_stats)
         self.stats_ready = True
         return stats
 
-    def _collect_stats(self):
+    def collect_stats(self):
         """
-        _collect_stats memoizes:
+        collect_stats calculates and memoizes (based on current values):
           1. self.single_attr_stats ({ attribute -> Series (value -> count) })
             the frequency (# of entities) of a given attribute-value
           2. self.pair_attr_stats ({ attr1 -> { attr2 -> DataFrame } } where
@@ -236,7 +244,7 @@ def _collect_stats(self):
             Also known as co-occurrence count.
         """
 
-        self.total_tuples = self.get_raw_data().shape[0]
+        self.total_tuples = self.get_raw_data()['_tid_'].nunique()
         # Single attribute-value frequency
         for attr in self.get_attributes():
             self.single_attr_stats[attr] = self._get_stats_single(attr)
@@ -251,8 +259,21 @@ def _get_stats_single(self, attr):
         """
         Returns a Series indexed on possible values for 'attr' and contains the frequency.
         """
-        tmp_df = self.get_raw_data()[[attr]].groupby([attr]).size()
-        return tmp_df
+
+        # If cell_domain has not been initialized yet, retrieve statistics
+        # from raw data (this happens when the domain is just being setup)
+        if not self.aux_table_exists(AuxTables.cell_domain):
+            return self.get_raw_data()[[attr]].groupby([attr]).size()
+
+        # Retrieve statistics on current value from cell_domain
+
+        df_domain = self.get_aux_table(AuxTables.cell_domain).df
+        df_count = df_domain.loc[df_domain['attribute'] == attr, 'current_value'].value_counts()
+        # cell_domain stores no cells for attributes that have only NULL values,
+        # but our single stats still require a _nan_ entry for such attributes
+        if df_count.empty:
+            return pd.Series(self.total_tuples, index=['_nan_'])
+        return df_count
 
     def _get_stats_pair(self, cond_attr, trg_attr):
         """
@@ -261,8 +282,26 @@ def _get_stats_pair(self, cond_attr, trg_attr):
             <trg_attr>: all values for trg_attr that appeared at least once with <val1> ('val2')
             <count>: frequency (# of entities) where cond_attr: val1 AND trg_attr: val2
         """
-        tmp_df = self.get_raw_data()[[cond_attr,trg_attr]].groupby([cond_attr,trg_attr]).size().reset_index(name="count")
-        return tmp_df
+        # If cell_domain has not been initialized yet, retrieve statistics
+        # from raw data (this happens when the domain is just being setup)
+        if not self.aux_table_exists(AuxTables.cell_domain):
+            return self.get_raw_data()[[cond_attr,trg_attr]].groupby([cond_attr,trg_attr]).size().reset_index(name="count")
+
+        # Retrieve pairwise statistics on current value from cell_domain
+
+        df_domain = self.get_aux_table(AuxTables.cell_domain).df
+        # Filter cell_domain for only the attributes we care about
+        df_domain = df_domain[df_domain['attribute'].isin([cond_attr, trg_attr])]
+        # Convert to wide form so we have our :param cond_attr:
+        # and :param trg_attr: as columns, indexed by _tid_
+        df_domain = df_domain[['_tid_', 'attribute', 'current_value']].pivot(index='_tid_', columns='attribute', values='current_value')
+        # We do not store cells for attributes consisting of only NULL values in cell_domain.
+        # We require this for pair stats though.
+        if cond_attr not in df_domain.columns:
+            df_domain[cond_attr] = '_nan_'
+        if trg_attr not in df_domain.columns:
+            df_domain[trg_attr] = '_nan_'
+        return df_domain.groupby([cond_attr, trg_attr]).size().reset_index(name="count")
 
     def get_domain_info(self):
         """
 
@@ -7,11 +7,18 @@
 from dataset import AuxTables
 from dataset.table import Table, Source
 
-errors_template = Template('SELECT count(*) '\
-                            'FROM $init_table as t1, $grdt_table as t2 '\
-                            'WHERE t1._tid_ = t2._tid_ '\
-                              'AND t2._attribute_ = \'$attr\' '\
-                              'AND t1."$attr" != t2._value_')
+errors_template = Template("""
+SELECT
+    count(*)
+FROM
+    $raw_table as t1
+INNER JOIN
+    $clean_table as t2
+ON
+    t1._tid_ = t2._tid_
+    AND t2._attribute_ = '$attr'
+    AND t1."$attr" != t2._value_
+""")
 
 """
 The 'errors' aliased subquery returns the (_tid_, _attribute_, _value_)
@@ -23,15 +30,30 @@
 We then count the number of cells that we repaired to the correct ground
 truth value.
 """
-correct_repairs_template = Template('SELECT COUNT(*) FROM'\
-                            '(SELECT t2._tid_, t2._attribute_, t2._value_ '\
-                             'FROM $init_table as t1, $grdt_table as t2 '\
-                             'WHERE t1._tid_ = t2._tid_ '\
-                               'AND t2._attribute_ = \'$attr\' '\
-                               'AND t1."$attr" != t2._value_ ) as errors, $inf_dom as repairs '\
-                              'WHERE errors._tid_ = repairs._tid_ '\
-                                'AND errors._attribute_ = repairs.attribute '\
-                                'AND errors._value_ = repairs.rv_value')
+correct_repairs_template = Template("""
+SELECT
+    count(*)
+FROM (
+    SELECT
+        t2._tid_,
+        t2._attribute_,
+        t2._value_
+    FROM
+        $raw_table AS t1
+    INNER JOIN
+        $clean_table AS t2
+    ON
+        t1._tid_ = t2._tid_
+        AND t2._attribute_ = '$attr'
+        AND t1."$attr" != t2._value_
+) AS errors
+INNER JOIN
+    $inf_dom AS repairs
+ON
+    errors._tid_ = repairs._tid_
+    AND errors._attribute_ = repairs.attribute
+    AND errors._value_ = repairs.rv_value
+""")
 
 
 class EvalEngine:
@@ -64,7 +86,7 @@ def load_data(self, name, fpath, tid_col, attr_col, val_col, na_values=None):
 
     def evaluate_repairs(self):
         self.compute_total_repairs()
-        self.compute_total_repairs_grdt()
+        self.compute_total_repairs_clean()
         self.compute_total_errors()
         self.compute_detected_errors()
         self.compute_correct_repairs()
@@ -79,10 +101,10 @@ def eval_report(self):
         tic = time.clock()
         try:
             prec, rec, rep_recall, f1, rep_f1 = self.evaluate_repairs()
-            report = "Precision = %.2f, Recall = %.2f, Repairing Recall = %.2f, F1 = %.2f, Repairing F1 = %.2f, Detected Errors = %d, Total Errors = %d, Correct Repairs = %d, Total Repairs = %d, Total Repairs (Grdth present) = %d" % (
-                      prec, rec, rep_recall, f1, rep_f1, self.detected_errors, self.total_errors, self.correct_repairs, self.total_repairs, self.total_repairs_grdt)
+            report = "Precision = %.2f, Recall = %.2f, Repairing Recall = %.2f, F1 = %.2f, Repairing F1 = %.2f, Detected Errors = %d, Total Errors = %d, Correct Repairs = %d, Total Repairs = %d, Total Repairs (clean data) = %d" % (
+                      prec, rec, rep_recall, f1, rep_f1, self.detected_errors, self.total_errors, self.correct_repairs, self.total_repairs, self.total_repairs_clean)
             report_list = [prec, rec, rep_recall, f1, rep_f1, self.detected_errors, self.total_errors,
-                           self.correct_repairs, self.total_repairs, self.total_repairs_grdt]
+                           self.correct_repairs, self.total_repairs, self.total_repairs_clean]
         except Exception as e:
             logging.error("ERROR generating evaluation report %s" % e)
             raise
@@ -91,70 +113,78 @@ def eval_report(self):
         return report, report_time, report_list
 
     def compute_total_repairs(self):
+        """
+        compute_total_repairs memoizes into self.total_repairs
+        the number of cells where the initial value differs from the inferred
+        value (i.e. the number of repairs) for the entities in the TRAINING data.
+        """
         # TODO(richardwu): how do we define a "repair" if we have multiple
         # init values?
         query = """
         SELECT
             count(*)
         FROM
-            (SELECT
-                _vid_
-            FROM
-                {cell_domain} AS t1,
-                {inf_values_dom} as t2
-            WHERE
-                t1._tid_ = t2._tid_
-                AND t1.attribute = t2.attribute
-                AND t1.init_values != t2.rv_value
-            ) AS t
+            {cell_domain} AS t1
+        INNER JOIN
+            {inf_values_dom} as t2
+        ON
+            t1._tid_ = t2._tid_
+            AND t1.attribute = t2.attribute
+        WHERE
+            t1.init_values != t2.rv_value
         """.format(cell_domain=AuxTables.cell_domain.name,
                 inf_values_dom=AuxTables.inf_values_dom.name)
         res = self.ds.engine.execute_query(query)
         self.total_repairs = float(res[0][0])
 
-    def compute_total_repairs_grdt(self):
+    def compute_total_repairs_clean(self):
+        """
+        compute_total_repairs_clean memoizes into self.total_repairs_clean
+        the number of cells where the initial value differs from the inferred
+        value (i.e. the number of repairs) for the entities in the TEST (clean) data.
+        """
         # TODO(richardwu): how do we define a "repair" if we have multiple
         # init values?
         query = """
         SELECT
             count(*)
         FROM
-            (SELECT
-                _vid_
-            FROM
-                {cell_domain} AS t1,
-                {inf_values_dom} AS t2,
-                {clean_data} AS t3
-            WHERE
-                t1._tid_ = t2._tid_
-                AND t1.attribute = t2.attribute
-                AND t1.init_values != t2.rv_value
-                AND t1._tid_ = t3._tid_
-                AND t1.attribute = t3._attribute_
-            ) AS t
+            {cell_domain} AS t1
+        INNER JOIN
+            {inf_values_dom} AS t2
+        ON
+            t1._tid_ = t2._tid_
+            AND t1.attribute = t2.attribute
+        INNER JOIN
+            {clean_data} AS t3
+        ON
+            t1._tid_ = t3._tid_
+            AND t1.attribute = t3._attribute_
+        WHERE
+            t1.init_values != t2.rv_value
         """.format(cell_domain=AuxTables.cell_domain.name,
                 inf_values_dom=AuxTables.inf_values_dom.name,
                 clean_data=self.clean_data.name)
         res = self.ds.engine.execute_query(query)
-        self.total_repairs_grdt = float(res[0][0])
+        self.total_repairs_clean = float(res[0][0])
 
     def compute_total_errors(self):
         queries = []
         total_errors = 0.0
         for attr in self.ds.get_attributes():
-            query = errors_template.substitute(init_table=self.ds.raw_data.name, grdt_table=self.clean_data.name,
+            query = errors_template.substitute(raw_table=self.ds.raw_data.name, clean_table=self.clean_data.name,
                         attr=attr)
             queries.append(query)
         results = self.ds.engine.execute_queries(queries)
         for res in results:
             total_errors += float(res[0][0])
         self.total_errors = total_errors
 
-    def compute_total_errors_grdt(self):
+    def compute_total_errors_clean(self):
         queries = []
         total_errors = 0.0
         for attr in self.ds.get_attributes():
-            query = errors_template.substitute(init_table=self.ds.raw_data.name, grdt_table=self.clean_data.name,
+            query = errors_template.substitute(raw_table=self.ds.raw_data.name, clean_table=self.clean_data.name,
                         attr=attr)
             queries.append(query)
         results = self.ds.engine.execute_queries(queries)
@@ -163,6 +193,11 @@ def compute_total_errors_grdt(self):
         self.total_errors = total_errors
 
     def compute_detected_errors(self):
+        """
+        compute_detected_errors
+        """
+        # TODO(richardwu): how do we define a "repair" if we have multiple
+        # init values?
         query = """
         SELECT
             count(*)
@@ -177,7 +212,7 @@ def compute_detected_errors(self):
                 t1._tid_ = t2._tid_
                 AND t1._cid_ = t3._cid_
                 AND t1.attribute = t2._attribute_
-                AND t1.current_value != t2._value_
+                AND t1.init_values != t2._value_
             ) AS t
         """.format(cell_domain=AuxTables.cell_domain.name,
                 clean_data=self.clean_data.name,
@@ -189,7 +224,7 @@ def compute_correct_repairs(self):
         queries = []
         correct_repairs = 0.0
         for attr in self.ds.get_attributes():
-            query = correct_repairs_template.substitute(init_table=self.ds.raw_data.name, grdt_table=self.clean_data.name,
+            query = correct_repairs_template.substitute(raw_table=self.ds.raw_data.name, clean_table=self.clean_data.name,
                         attr=attr, inf_dom=AuxTables.inf_values_dom.name)
             queries.append(query)
         results = self.ds.engine.execute_queries(queries)
@@ -208,9 +243,9 @@ def compute_repairing_recall(self):
         return self.correct_repairs / self.detected_errors
 
     def compute_precision(self):
-        if self.total_repairs_grdt == 0:
+        if self.total_repairs_clean == 0:
             return 0
-        return self.correct_repairs / self.total_repairs_grdt
+        return self.correct_repairs / self.total_repairs_clean
 
     def compute_f1(self):
         prec = self.compute_precision()
 
@@ -272,6 +272,8 @@ def repair_errors(self, featurizers, em_iterations=1, em_iter_func=None):
                 logging.debug('Time to retrieve featurizer weights: %.2f secs' % time)
             # Update current values with inferred values
             self.ds.update_current_values()
+            # Re-compute statistics with new current values
+            self.ds.collect_stats()
 
             # Call em_iter_func if provided at the end of every EM iteration
             if em_iter_func is not None: