Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions tools/import_validation/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ class ValidationRunner:

def __init__(self, validation_config_path: str, differ_output: str,
stats_summary: str, lint_report: str, validation_output: str):
self.validation_config_path = validation_config_path
self.stats_summary = stats_summary
self.config = ValidationConfig(validation_config_path)
self.validation_output = validation_output
self.validator = Validator()
Expand Down Expand Up @@ -212,6 +214,48 @@ def run_validations(self) -> tuple[bool, list[ValidationResult]]:
if output_dir:
rule_params.setdefault('output_path', output_dir)

# Resolve relative paths against the nearest ancestor directory that contains 'golden_data', falling back to the directory of the validation config.
if 'summary_report' in rule.get('rule_id', ''):
# Helper to find a base directory containing target_sub_path by walking up
def find_base_dir(start_path: str, target_sub_path: str, max_levels: int = 10):
    """Return the nearest directory at or above start_path holding target_sub_path.

    The absolute form of start_path is the first candidate; each later
    candidate is the parent of the previous one. A candidate matches when
    joining it with target_sub_path yields an existing filesystem entry.

    Args:
        start_path: File or directory path to start from. An empty or None
            value yields None without touching the filesystem.
        target_sub_path: Relative path expected to exist under the base
            directory.
        max_levels: Maximum number of candidates to test, counting the
            starting path itself. Defaults to 10.

    Returns:
        The absolute path of the first matching candidate, or None if no
        candidate matched within max_levels or before the filesystem root.
    """
    if not start_path:
        return None
    curr = os.path.abspath(start_path)
    for _ in range(max_levels):
        if os.path.exists(os.path.join(curr, target_sub_path)):
            return curr
        parent = os.path.dirname(curr)
        if parent == curr:
            # dirname() of the root is the root itself: nothing left to climb.
            break
        curr = parent
    return None

config_dir = None
# Walk up from validation_config_path, self.stats_summary, or CWD to find where 'golden_data' lives
for start in [self.validation_config_path, self.stats_summary, os.getcwd()]:
config_dir = find_base_dir(start, 'golden_data')
if config_dir:
break

if not config_dir:
config_dir = os.path.dirname(os.path.abspath(self.validation_config_path))

print(f"DEBUG: Found summary_report rule: '{rule.get('rule_id')}'")
print(f"DEBUG: Config directory resolved to: '{config_dir}'")
for path_key in ['golden_files', 'input_files']:
if path_key in rule_params:
val = rule_params[path_key]
print(f"DEBUG: Before resolve '{path_key}': '{val}'")
if isinstance(val, str):
if val and not os.path.isabs(val) and not val.startswith('gs://') and not val.startswith('http://') and not val.startswith('https://'):
rule_params[path_key] = os.path.join(config_dir, val)
elif isinstance(val, list):
rule_params[path_key] = [
os.path.join(config_dir, item) if isinstance(item, str) and item and not os.path.isabs(item) and not item.startswith('gs://') and not item.startswith('http://') and not item.startswith('https://') else item
for item in val
]
print(f"DEBUG: After resolve '{path_key}': '{rule_params[path_key]}'")
Comment thread
niveditasing marked this conversation as resolved.

if validator_name == 'SQL_VALIDATOR':
result = validation_func(self.data_sources['stats'],
self.data_sources['differ'],
Expand Down
21 changes: 16 additions & 5 deletions tools/import_validation/validator_goldens.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
import os
import sys
import tempfile
import csv

from absl import app
from absl import flags
Expand Down Expand Up @@ -298,7 +299,8 @@ def load_nodes_from_file(files: str) -> dict:
file_nodes = file_util.file_load_csv_dict(input_file,
key_index=True)
for node in file_nodes.values():
nodes[len(nodes)] = node
cleaned_node = {k.strip(): v for k, v in node.items() if k is not None and isinstance(k, str) and k.strip() != ''}
Comment thread
niveditasing marked this conversation as resolved.
nodes[len(nodes)] = cleaned_node
else:
# For MCF or JSON, we assume nodes are already keyed by DCID.
file_nodes = mcf_file_util.load_mcf_nodes(input_file)
Expand Down Expand Up @@ -379,7 +381,9 @@ def generate_goldens(input_files: str,
for k, node in input_nodes.items():
match = False
for col, vals in must_include_values.items():
if node.get(col) in vals:
val = node.get(col)
if val in vals or mcf_file_util.strip_namespace(
val) in vals:
match = True
break
if match:
Expand Down Expand Up @@ -440,9 +444,16 @@ def generate_goldens(input_files: str,
if golden_nodes and output_file:
logging.info(f'Writing {len(golden_nodes)} goldens to {output_file}')
if file_util.file_is_csv(output_file):
file_util.file_write_csv_dict(golden_nodes,
output_file,
key_column_name=None)
with file_util.FileIO(output_file, mode='w') as csvfile:
Comment thread
niveditasing marked this conversation as resolved.
columns = sorted(
list(set().union(
*(node.keys() for node in golden_nodes.values()))))
writer = csv.DictWriter(csvfile,
fieldnames=columns,
quoting=csv.QUOTE_NONNUMERIC)
writer.writeheader()
for node in golden_nodes.values():
writer.writerow(node)
Comment thread
niveditasing marked this conversation as resolved.
else:
mcf_file_util.write_mcf_nodes([golden_nodes], output_file)

Expand Down
Loading