energy-test-data/scripts/04_validate.py
kbt-devops faaadc1297 Add transmission datasets and update mining data
Add two new static datasets for cross-region arbitrage calculations:
- transmission_capacity: region-to-region capacity limits (20 rows)
- transmission_cost: transmission costs per path (20 rows)
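
A minimal sketch of how the two tables could feed an arbitrage check (the
helper and the cost_eur_per_mwh column name are illustrative assumptions, not
the code added in this commit):

    def arbitrage_margin_eur_per_mwh(price_from_eur, price_to_eur, cost_eur_per_mwh):
        # Price spread between the two regions minus the cost of moving one MWh
        # along the path; the capacity limit caps the volume, not the margin.
        return (price_to_eur - price_from_eur) - cost_eur_per_mwh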

Update mining dataset with EUR pricing and power metrics:
- Change btc_price_usd to btc_price_eur
- Add power_efficiency_th_per_mw, power_demand_mw
- Add revenue_eur_per_mwh, profit_eur_per_mwh
- Remove mining_profitability column
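
A rough sketch of how the new mining columns might relate (an assumption for
illustration, not necessarily the formula in fetch_bitcoin_mining_data();
network_hashrate_th, btc_per_block and blocks_per_hour are assumed inputs that
are not part of the dataset):

    def revenue_eur_per_mwh(btc_price_eur, power_efficiency_th_per_mw,
                            network_hashrate_th, btc_per_block=3.125,
                            blocks_per_hour=6.0):
        # One MWh runs power_efficiency_th_per_mw TH/s for one hour; the fleet
        # earns its share of that hour's block rewards, converted to EUR.
        hashrate_share = power_efficiency_th_per_mw / network_hashrate_th
        return hashrate_share * btc_per_block * blocks_per_hour * btc_price_eur

profit_eur_per_mwh would presumably be this revenue minus the electricity cost per MWh.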

Changes include:
- scripts/02_fetch_historical.py: rewrite fetch_bitcoin_mining_data()
- scripts/01_generate_synthetic.py: add transmission data generators
- config/data_config.yaml: add transmission config, update bitcoin config
- config/schema.yaml: add 2 new schemas, update bitcoin_mining schema
- scripts/03_process_merge.py: add 2 new datasets
- scripts/04_validate.py: add 2 new datasets
- test/test_data.py: update for new datasets and bitcoin price reference

Total datasets: 9 (734,491 rows, 17.89 MB)
2026-02-11 01:09:33 +07:00

275 lines
9.2 KiB
Python

"""
Validate processed datasets for quality, missing values, and data consistency.
"""
import json
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import yaml

def load_config():
    config_path = Path(__file__).parent.parent / "config" / "data_config.yaml"
    with open(config_path) as f:
        return yaml.safe_load(f)


def load_schema():
    schema_path = Path(__file__).parent.parent / "config" / "schema.yaml"
    with open(schema_path) as f:
        return yaml.safe_load(f)


def load_processed_dataset(dataset_name, data_dir):
    file_path = Path(data_dir) / 'processed' / f'{dataset_name}.parquet'
    if file_path.exists():
        return pd.read_parquet(file_path)
    return None


def check_missing_values(df, dataset_name):
    missing_info = {}
    for col in df.columns:
        missing_count = df[col].isna().sum()
        missing_pct = (missing_count / len(df)) * 100
        if missing_count > 0:
            missing_info[col] = {
                'count': int(missing_count),
                'percentage': round(missing_pct, 2)
            }
    return missing_info

def check_data_ranges(df, dataset_name, schema):
    """Apply the min/max rules defined for this dataset in schema['validation_rules']."""
    validation_results = []
    if dataset_name not in schema['validation_rules']:
        return validation_results
    rules = schema['validation_rules'][dataset_name]
    for rule in rules:
        column = rule['column']
        if column not in df.columns:
            continue
        col_data = df[column].dropna()
        if 'min' in rule:
            violations = (col_data < rule['min']).sum()
            if violations > 0:
                validation_results.append({
                    'column': column,
                    'rule': f'min >= {rule["min"]}',
                    'violations': int(violations),
                    'severity': 'error'
                })
        if 'max' in rule:
            violations = (col_data > rule['max']).sum()
            if violations > 0:
                validation_results.append({
                    'column': column,
                    'rule': f'max <= {rule["max"]}',
                    'violations': int(violations),
                    'severity': 'error'
                })
    return validation_results


def check_duplicated_rows(df, dataset_name):
    duplicates = df.duplicated().sum()
    return int(duplicates)


def check_timestamp_continuity(df, dataset_name, expected_freq='1min'):
    """Count gaps larger than 1.5x the expected sampling interval in the timestamp column."""
    if 'timestamp' not in df.columns:
        return {'status': 'skipped', 'reason': 'no timestamp column'}
    df_sorted = df.sort_values('timestamp')
    time_diffs = df_sorted['timestamp'].diff().dropna()
    expected_diff = pd.Timedelta(expected_freq)
    missing_gaps = time_diffs[time_diffs > expected_diff * 1.5]
    return {
        'status': 'checked',
        'expected_frequency': expected_freq,
        'gaps_detected': len(missing_gaps),
        'total_rows': len(df)
    }

def check_data_types(df, dataset_name, schema):
    """Compare the DataFrame's columns and dtypes against the expected schema for this dataset."""
    type_issues = []
    expected_schema = schema['schemas'].get(dataset_name, {})
    expected_columns = {col['name']: col['type'] for col in expected_schema.get('columns', [])}
    for col, expected_type in expected_columns.items():
        if col not in df.columns:
            type_issues.append({
                'column': col,
                'issue': 'missing',
                'expected': expected_type
            })
        elif expected_type == 'datetime64[ns]':
            if not pd.api.types.is_datetime64_any_dtype(df[col]):
                type_issues.append({
                    'column': col,
                    'issue': 'wrong_type',
                    'expected': 'datetime',
                    'actual': str(df[col].dtype)
                })
        elif expected_type == 'category':
            if not pd.api.types.is_categorical_dtype(df[col]):
                type_issues.append({
                    'column': col,
                    'issue': 'wrong_type',
                    'expected': 'category',
                    'actual': str(df[col].dtype)
                })
        elif expected_type == 'float32':
            if df[col].dtype not in ['float32', 'float64']:
                type_issues.append({
                    'column': col,
                    'issue': 'wrong_type',
                    'expected': 'float32',
                    'actual': str(df[col].dtype)
                })
    return type_issues

def validate_dataset(df, dataset_name, schema):
    """Run all checks on one dataset and roll them up into a pass/warning/fail status."""
    results = {
        'dataset': dataset_name,
        'rows': len(df),
        'columns': len(df.columns),
        'memory_mb': round(df.memory_usage(deep=True).sum() / 1024 / 1024, 2),
        'missing_values': check_missing_values(df, dataset_name),
        'duplicated_rows': check_duplicated_rows(df, dataset_name),
        'timestamp_continuity': check_timestamp_continuity(df, dataset_name),
        'data_ranges': check_data_ranges(df, dataset_name, schema),
        'data_types': check_data_types(df, dataset_name, schema)
    }
    error_count = (
        sum(1 for v in results['data_ranges'] if v.get('severity') == 'error') +
        len(results['data_types'])
    )
    results['status'] = 'pass' if error_count == 0 else 'warning' if error_count < 10 else 'fail'
    return results

def generate_validation_report(all_results, output_dir):
    """Aggregate per-dataset results and write metadata/validation_report.json."""
    total_errors = sum(1 for r in all_results if r['status'] == 'fail')
    total_warnings = sum(1 for r in all_results if r['status'] == 'warning')
    total_pass = sum(1 for r in all_results if r['status'] == 'pass')
    # Datasets that could not be loaded carry no size/row info, so default to 0.
    total_size_mb = sum(r.get('memory_mb', 0) for r in all_results)
    total_rows = sum(r.get('rows', 0) for r in all_results)
    report = {
        'generated_at': datetime.utcnow().isoformat(),
        'summary': {
            'total_datasets': len(all_results),
            'passed': total_pass,
            'warnings': total_warnings,
            'failed': total_errors,
            'total_size_mb': round(total_size_mb, 2),
            'total_rows': total_rows
        },
        'datasets': all_results
    }
    output_path = Path(output_dir) / 'metadata' / 'validation_report.json'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(report, f, indent=2, default=str)
    return report

def print_summary(report):
    """Print a human-readable summary of the validation report."""
    print(f"\n{'='*60}")
    print("VALIDATION SUMMARY")
    print(f"{'='*60}")
    print(f"Datasets processed: {report['summary']['total_datasets']}")
    print(f" ✓ Passed: {report['summary']['passed']}")
    print(f" ⚠ Warnings: {report['summary']['warnings']}")
    print(f" ✗ Failed: {report['summary']['failed']}")
    print(f"\nTotal size: {report['summary']['total_size_mb']:.2f}MB")
    print(f"Total rows: {report['summary']['total_rows']:,}")
    print(f"\n{'='*60}")
    print("PER-DATASET DETAILS")
    print(f"{'='*60}")
    for result in report['datasets']:
        if result['status'] == 'error':
            # Datasets that were never loaded only carry an error message.
            print(f"\n✗ {result['dataset']}: {result['error']}")
            continue
        status_icon = '✓' if result['status'] == 'pass' else '⚠' if result['status'] == 'warning' else '✗'
        print(f"\n{status_icon} {result['dataset']}")
        print(f" Rows: {result['rows']:,} | Columns: {result['columns']} | Size: {result['memory_mb']:.2f}MB")
        if result['missing_values']:
            print(f" Missing values: {len(result['missing_values'])} columns")
        if result['data_ranges']:
            print(f" Range violations: {len(result['data_ranges'])}")
        if result['data_types']:
            print(f" Type issues: {len(result['data_types'])}")
        if result['timestamp_continuity']['status'] == 'checked':
            if result['timestamp_continuity']['gaps_detected'] > 0:
                print(f" Time gaps: {result['timestamp_continuity']['gaps_detected']}")

def main():
    """Validate every processed dataset and write the validation report."""
    config = load_config()
    schema = load_schema()
    data_dir = Path(__file__).parent.parent / 'data'
    datasets = [
        'electricity_prices',
        'battery_capacity',
        'renewable_generation',
        'conventional_generation',
        'load_profiles',
        'data_centers',
        'bitcoin_mining',
        'transmission_capacity',
        'transmission_cost'
    ]
    print("Validating processed datasets...\n")
    all_results = []
    for dataset_name in datasets:
        print(f"Validating {dataset_name}...")
        df = load_processed_dataset(dataset_name, data_dir)
        if df is None:
            print(f" ✗ Dataset not found, skipping")
            all_results.append({
                'dataset': dataset_name,
                'status': 'error',
                'error': 'Dataset file not found'
            })
            continue
        result = validate_dataset(df, dataset_name, schema)
        all_results.append(result)
        status_icon = '✓' if result['status'] == 'pass' else '⚠' if result['status'] == 'warning' else '✗'
        print(f" {status_icon} {result['rows']:,} rows, {result['columns']} cols, {result['memory_mb']:.2f}MB")
    report = generate_validation_report(all_results, data_dir)
    print_summary(report)
    print(f"\n{'='*60}")
    print(f"Validation report saved to: {data_dir / 'metadata' / 'validation_report.json'}")
    return report


if __name__ == '__main__':
    main()