diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py
index f1364518e6..9f9ddfaebc 100644
--- a/tools/import_validation/runner.py
+++ b/tools/import_validation/runner.py
@@ -41,6 +41,9 @@ class ValidationRunner:
 
     def __init__(self, validation_config_path: str, differ_output: str,
                  stats_summary: str, lint_report: str, validation_output: str):
+        # Kept as search roots for locating 'golden_data' in run_validations().
+        self.validation_config_path = validation_config_path
+        self.stats_summary = stats_summary
         self.config = ValidationConfig(validation_config_path)
         self.validation_output = validation_output
         self.validator = Validator()
@@ -212,6 +215,65 @@ def run_validations(self) -> tuple[bool, list[ValidationResult]]:
             if output_dir:
                 rule_params.setdefault('output_path', output_dir)
 
+            # Anchor relative paths of summary_report rules at the nearest
+            # ancestor directory that contains 'golden_data'.
+            if 'summary_report' in rule.get('rule_id', ''):
+
+                def find_base_dir(start_path: str, target_sub_path: str):
+                    """Returns the nearest directory holding target_sub_path.
+
+                    Checks start_path and then its ancestors, 10 candidates
+                    at most. Returns None when start_path is empty or no
+                    candidate contains target_sub_path.
+                    """
+                    if not start_path:
+                        return None
+                    curr = os.path.abspath(start_path)
+                    for _ in range(10):
+                        if os.path.exists(os.path.join(curr, target_sub_path)):
+                            return curr
+                        parent = os.path.dirname(curr)
+                        if parent == curr:
+                            # Reached the filesystem root.
+                            break
+                        curr = parent
+                    return None
+
+                def resolve_path(base_dir: str, path):
+                    """Joins a relative local path onto base_dir.
+
+                    Non-strings, empty strings, absolute paths and gs:// or
+                    http(s):// URLs are returned unchanged.
+                    """
+                    if not isinstance(path, str) or not path:
+                        return path
+                    if os.path.isabs(path) or path.startswith(
+                            ('gs://', 'http://', 'https://')):
+                        return path
+                    return os.path.join(base_dir, path)
+
+                # Look for 'golden_data' above the config file, then above the
+                # stats summary, then above the current working directory.
+                config_dir = None
+                for start in (self.validation_config_path, self.stats_summary,
+                              os.getcwd()):
+                    config_dir = find_base_dir(start, 'golden_data')
+                    if config_dir:
+                        break
+                if not config_dir:
+                    # Nothing found: fall back to the config file's directory.
+                    config_dir = os.path.dirname(
+                        os.path.abspath(self.validation_config_path))
+
+                for path_key in ('golden_files', 'input_files'):
+                    val = rule_params.get(path_key)
+                    if isinstance(val, str):
+                        rule_params[path_key] = resolve_path(config_dir, val)
+                    elif isinstance(val, list):
+                        rule_params[path_key] = [
+                            resolve_path(config_dir, item) for item in val
+                        ]
+
             if validator_name == 'SQL_VALIDATOR':
                 result = validation_func(self.data_sources['stats'],
                                          self.data_sources['differ'],
diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py
index 7b19b783fe..916cd2f43c 100644
--- a/tools/import_validation/validator_goldens.py
+++ b/tools/import_validation/validator_goldens.py
@@ -71,6 +71,7 @@
 import os
 import sys
 import tempfile
+import csv
 
 from absl import app
 from absl import flags
@@ -298,7 +299,13 @@ def load_nodes_from_file(files: str) -> dict:
             file_nodes = file_util.file_load_csv_dict(input_file,
                                                       key_index=True)
             for node in file_nodes.values():
-                nodes[len(nodes)] = node
+                # Drop columns whose header is blank or not a string, and trim
+                # whitespace around the remaining column names.
+                nodes[len(nodes)] = {
+                    k.strip(): v
+                    for k, v in node.items()
+                    if isinstance(k, str) and k.strip()
+                }
         else:
             # For MCF or JSON, we assume nodes are already keyed by DCID.
             file_nodes = mcf_file_util.load_mcf_nodes(input_file)
@@ -379,7 +386,9 @@ def generate_goldens(input_files: str,
             for k, node in input_nodes.items():
                 match = False
                 for col, vals in must_include_values.items():
-                    if node.get(col) in vals:
+                    val = node.get(col)
+                    if val in vals or mcf_file_util.strip_namespace(
+                            val) in vals:
                         match = True
                         break
                 if match:
@@ -440,9 +449,16 @@ def generate_goldens(input_files: str,
     if golden_nodes and output_file:
         logging.info(f'Writing {len(golden_nodes)} goldens to {output_file}')
         if file_util.file_is_csv(output_file):
-            file_util.file_write_csv_dict(golden_nodes,
-                                          output_file,
-                                          key_column_name=None)
+            # Use the sorted union of all node keys as the header, computed
+            # before the file is opened so a failure cannot truncate it.
+            columns = sorted(
+                {key for node in golden_nodes.values() for key in node})
+            with file_util.FileIO(output_file, mode='w') as csvfile:
+                writer = csv.DictWriter(csvfile,
+                                        fieldnames=columns,
+                                        quoting=csv.QUOTE_NONNUMERIC)
+                writer.writeheader()
+                writer.writerows(golden_nodes.values())
         else:
             mcf_file_util.write_mcf_nodes([golden_nodes], output_file)
 