""" Validate processed datasets for quality, missing values, and data consistency. """ import yaml import numpy as np import pandas as pd from pathlib import Path import json from datetime import datetime def load_config(): config_path = Path(__file__).parent.parent / "config" / "data_config.yaml" with open(config_path) as f: return yaml.safe_load(f) def load_schema(): schema_path = Path(__file__).parent.parent / "config" / "schema.yaml" with open(schema_path) as f: return yaml.safe_load(f) def load_processed_dataset(dataset_name, data_dir): file_path = Path(data_dir) / 'processed' / f'{dataset_name}.parquet' if file_path.exists(): return pd.read_parquet(file_path) return None def check_missing_values(df, dataset_name): missing_info = {} for col in df.columns: missing_count = df[col].isna().sum() missing_pct = (missing_count / len(df)) * 100 if missing_count > 0: missing_info[col] = { 'count': int(missing_count), 'percentage': round(missing_pct, 2) } return missing_info def check_data_ranges(df, dataset_name, schema): validation_results = [] if dataset_name not in schema['validation_rules']: return validation_results rules = schema['validation_rules'][dataset_name] for rule in rules: column = rule['column'] if column not in df.columns: continue col_data = df[column].dropna() if 'min' in rule: violations = (col_data < rule['min']).sum() if violations > 0: validation_results.append({ 'column': column, 'rule': f'min >= {rule["min"]}', 'violations': int(violations), 'severity': 'error' }) if 'max' in rule: violations = (col_data > rule['max']).sum() if violations > 0: validation_results.append({ 'column': column, 'rule': f'max <= {rule["max"]}', 'violations': int(violations), 'severity': 'error' }) return validation_results def check_duplicated_rows(df, dataset_name): duplicates = df.duplicated().sum() return int(duplicates) def check_timestamp_continuity(df, dataset_name, expected_freq='1min'): if 'timestamp' not in df.columns: return {'status': 'skipped', 'reason': 'no timestamp column'} df_sorted = df.sort_values('timestamp') time_diffs = df_sorted['timestamp'].diff().dropna() expected_diff = pd.Timedelta(expected_freq) missing_gaps = time_diffs[time_diffs > expected_diff * 1.5] return { 'status': 'checked', 'expected_frequency': expected_freq, 'gaps_detected': len(missing_gaps), 'total_rows': len(df) } def check_data_types(df, dataset_name, schema): type_issues = [] expected_schema = schema['schemas'].get(dataset_name, {}) expected_columns = {col['name']: col['type'] for col in expected_schema.get('columns', [])} for col, expected_type in expected_columns.items(): if col not in df.columns: type_issues.append({ 'column': col, 'issue': 'missing', 'expected': expected_type }) elif expected_type == 'datetime64[ns]': if not pd.api.types.is_datetime64_any_dtype(df[col]): type_issues.append({ 'column': col, 'issue': 'wrong_type', 'expected': 'datetime', 'actual': str(df[col].dtype) }) elif expected_type == 'category': if not pd.api.types.is_categorical_dtype(df[col]): type_issues.append({ 'column': col, 'issue': 'wrong_type', 'expected': 'category', 'actual': str(df[col].dtype) }) elif expected_type == 'float32': if df[col].dtype not in ['float32', 'float64']: type_issues.append({ 'column': col, 'issue': 'wrong_type', 'expected': 'float32', 'actual': str(df[col].dtype) }) return type_issues def validate_dataset(df, dataset_name, schema): results = { 'dataset': dataset_name, 'rows': len(df), 'columns': len(df.columns), 'memory_mb': round(df.memory_usage(deep=True).sum() / 1024 / 1024, 2), 'missing_values': check_missing_values(df, dataset_name), 'duplicated_rows': check_duplicated_rows(df, dataset_name), 'timestamp_continuity': check_timestamp_continuity(df, dataset_name), 'data_ranges': check_data_ranges(df, dataset_name, schema), 'data_types': check_data_types(df, dataset_name, schema) } error_count = ( sum(1 for v in results['data_ranges'] if v.get('severity') == 'error') + len(results['data_types']) ) results['status'] = 'pass' if error_count == 0 else 'warning' if error_count < 10 else 'fail' return results def generate_validation_report(all_results, output_dir): total_errors = sum(1 for r in all_results if r['status'] == 'fail') total_warnings = sum(1 for r in all_results if r['status'] == 'warning') total_pass = sum(1 for r in all_results if r['status'] == 'pass') total_size_mb = sum(r['memory_mb'] for r in all_results) total_rows = sum(r['rows'] for r in all_results) report = { 'generated_at': datetime.utcnow().isoformat(), 'summary': { 'total_datasets': len(all_results), 'passed': total_pass, 'warnings': total_warnings, 'failed': total_errors, 'total_size_mb': round(total_size_mb, 2), 'total_rows': total_rows }, 'datasets': all_results } output_path = Path(output_dir) / 'metadata' / 'validation_report.json' with open(output_path, 'w') as f: json.dump(report, f, indent=2, default=str) return report def print_summary(report): print(f"\n{'='*60}") print("VALIDATION SUMMARY") print(f"{'='*60}") print(f"Datasets processed: {report['summary']['total_datasets']}") print(f" ✓ Passed: {report['summary']['passed']}") print(f" ⚠ Warnings: {report['summary']['warnings']}") print(f" ✗ Failed: {report['summary']['failed']}") print(f"\nTotal size: {report['summary']['total_size_mb']:.2f}MB") print(f"Total rows: {report['summary']['total_rows']:,}") print(f"\n{'='*60}") print("PER-DATASET DETAILS") print(f"{'='*60}") for result in report['datasets']: status_icon = '✓' if result['status'] == 'pass' else '⚠' if result['status'] == 'warning' else '✗' print(f"\n{status_icon} {result['dataset']}") print(f" Rows: {result['rows']:,} | Columns: {result['columns']} | Size: {result['memory_mb']:.2f}MB") if result['missing_values']: print(f" Missing values: {len(result['missing_values'])} columns") if result['data_ranges']: print(f" Range violations: {len(result['data_ranges'])}") if result['data_types']: print(f" Type issues: {len(result['data_types'])}") if result['timestamp_continuity']['status'] == 'checked': if result['timestamp_continuity']['gaps_detected'] > 0: print(f" Time gaps: {result['timestamp_continuity']['gaps_detected']}") def main(): config = load_config() schema = load_schema() data_dir = Path(__file__).parent.parent / 'data' datasets = [ 'electricity_prices', 'battery_capacity', 'renewable_generation', 'conventional_generation', 'load_profiles', 'data_centers', 'bitcoin_mining' ] print("Validating processed datasets...\n") all_results = [] for dataset_name in datasets: print(f"Validating {dataset_name}...") df = load_processed_dataset(dataset_name, data_dir) if df is None: print(f" ✗ Dataset not found, skipping") all_results.append({ 'dataset': dataset_name, 'status': 'error', 'error': 'Dataset file not found' }) continue result = validate_dataset(df, dataset_name, schema) all_results.append(result) status_icon = '✓' if result['status'] == 'pass' else '⚠' if result['status'] == 'warning' else '✗' print(f" {status_icon} {result['rows']:,} rows, {result['columns']} cols, {result['memory_mb']:.2f}MB") report = generate_validation_report(all_results, data_dir) print_summary(report) print(f"\n{'='*60}") print(f"Validation report saved to: {data_dir / 'metadata' / 'validation_report.json'}") return report if __name__ == '__main__': main()