Files
energy-test-data/scripts/04_validate.py
kbt-devops a643767359 Initial commit: Energy test data generation pipeline
Add complete test data preparation system for energy trading strategy
demo. Includes configuration, data generation scripts, and validation
tools for 7 datasets covering electricity prices, battery capacity,
renewable/conventional generation, load profiles, data centers, and
mining data.

Excluded from git: Actual parquet data files (data/raw/, data/processed/)
can be regenerated using the provided scripts.

Datasets:
- electricity_prices: Day-ahead and real-time prices (5 regions)
- battery_capacity: Storage system charge/discharge cycles
- renewable_generation: Solar, wind, hydro with forecast errors
- conventional_generation: Gas, coal, nuclear plant outputs
- load_profiles: Regional demand with weather correlations
- data_centers: Power demand profiles including mining operations
- mining_data: Hashrate, price, profitability (mempool.space API)
2026-02-10 23:28:23 +07:00

273 lines
9.1 KiB
Python

"""
Validate processed datasets for quality, missing values, and data consistency.
"""
import json
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
def load_config():
    """Load the pipeline configuration from config/data_config.yaml."""
    cfg_path = Path(__file__).parent.parent / "config" / "data_config.yaml"
    with cfg_path.open() as fh:
        return yaml.safe_load(fh)
def load_schema():
    """Load the dataset schema definitions from config/schema.yaml."""
    schema_file = Path(__file__).parent.parent / "config" / "schema.yaml"
    with schema_file.open() as fh:
        return yaml.safe_load(fh)
def load_processed_dataset(dataset_name, data_dir):
    """Return the processed parquet frame for *dataset_name*, or None if absent."""
    parquet_path = Path(data_dir) / 'processed' / f'{dataset_name}.parquet'
    if not parquet_path.exists():
        return None
    return pd.read_parquet(parquet_path)
def check_missing_values(df, dataset_name):
    """Report columns of *df* that contain missing (NaN) values.

    Returns a dict mapping column name -> {'count': int, 'percentage': float}
    for columns with at least one NaN; columns without NaNs are omitted.
    *dataset_name* is accepted for signature symmetry with the other checks
    but is not used here.
    """
    missing_info = {}
    row_count = len(df)
    # Guard: an empty frame would otherwise raise ZeroDivisionError in the
    # percentage computation below.
    if row_count == 0:
        return missing_info
    for col in df.columns:
        missing_count = int(df[col].isna().sum())
        if missing_count > 0:
            missing_info[col] = {
                'count': missing_count,
                'percentage': round(missing_count / row_count * 100, 2),
            }
    return missing_info
def check_data_ranges(df, dataset_name, schema):
    """Validate configured min/max bounds for *dataset_name* against *df*.

    Reads schema['validation_rules'][dataset_name]; each rule names a column
    and optional 'min'/'max' limits. Returns a list of violation records
    (column, rule text, violation count, severity 'error'). Datasets without
    rules, and rule columns absent from the frame, are skipped silently.
    """
    findings = []
    rule_table = schema['validation_rules']
    if dataset_name not in rule_table:
        return findings
    for rule in rule_table[dataset_name]:
        column = rule['column']
        if column not in df.columns:
            continue
        values = df[column].dropna()
        # Check both bounds with one table-driven loop instead of two
        # copy-pasted branches.
        for bound_key, template in (('min', 'min >= {}'), ('max', 'max <= {}')):
            if bound_key not in rule:
                continue
            limit = rule[bound_key]
            if bound_key == 'min':
                bad = (values < limit).sum()
            else:
                bad = (values > limit).sum()
            if bad > 0:
                findings.append({
                    'column': column,
                    'rule': template.format(limit),
                    'violations': int(bad),
                    'severity': 'error',
                })
    return findings
def check_duplicated_rows(df, dataset_name):
    """Return the number of fully duplicated rows in *df* as a plain int.

    *dataset_name* is unused; kept for signature symmetry with other checks.
    """
    return int(df.duplicated().sum())
def check_timestamp_continuity(df, dataset_name, expected_freq='1min'):
    """Detect gaps in the 'timestamp' column larger than 1.5x *expected_freq*.

    Returns a status dict; if *df* has no 'timestamp' column the check is
    reported as skipped rather than raising.
    """
    if 'timestamp' not in df.columns:
        return {'status': 'skipped', 'reason': 'no timestamp column'}
    ordered = df.sort_values('timestamp')['timestamp']
    deltas = ordered.diff().dropna()
    # A delta beyond 1.5x the expected spacing counts as a gap; the slack
    # tolerates minor jitter without flagging every sample.
    threshold = pd.Timedelta(expected_freq) * 1.5
    gap_count = int((deltas > threshold).sum())
    return {
        'status': 'checked',
        'expected_frequency': expected_freq,
        'gaps_detected': gap_count,
        'total_rows': len(df),
    }
def check_data_types(df, dataset_name, schema):
    """Compare *df* column dtypes against the expected schema.

    Reads schema['schemas'][dataset_name]['columns'] (list of {'name','type'}).
    Returns a list of issue records: {'issue': 'missing'} for absent columns,
    or {'issue': 'wrong_type', 'expected', 'actual'} for dtype mismatches.
    Only 'datetime64[ns]', 'category' and 'float32' expectations are checked;
    other declared types are accepted as-is.
    """
    type_issues = []
    expected_schema = schema['schemas'].get(dataset_name, {})
    expected_columns = {c['name']: c['type'] for c in expected_schema.get('columns', [])}
    for col, expected_type in expected_columns.items():
        if col not in df.columns:
            type_issues.append({
                'column': col,
                'issue': 'missing',
                'expected': expected_type,
            })
            continue
        actual = df[col].dtype
        mismatch = None  # (expected label, ) when the dtype is wrong
        if expected_type == 'datetime64[ns]':
            if not pd.api.types.is_datetime64_any_dtype(actual):
                mismatch = 'datetime'
        elif expected_type == 'category':
            # isinstance on the dtype object replaces the deprecated
            # pd.api.types.is_categorical_dtype (removed warning in pandas 2.x).
            if not isinstance(actual, pd.CategoricalDtype):
                mismatch = 'category'
        elif expected_type == 'float32':
            # float64 is tolerated since parquet round-trips may widen floats.
            if str(actual) not in ('float32', 'float64'):
                mismatch = 'float32'
        if mismatch is not None:
            type_issues.append({
                'column': col,
                'issue': 'wrong_type',
                'expected': mismatch,
                'actual': str(actual),
            })
    return type_issues
def validate_dataset(df, dataset_name, schema):
    """Run every quality check against one dataset and attach a status.

    Status is 'pass' with zero errors, 'warning' below 10 errors, 'fail'
    otherwise; errors are range violations marked severity 'error' plus
    any data-type issue.
    """
    results = {
        'dataset': dataset_name,
        'rows': len(df),
        'columns': len(df.columns),
        'memory_mb': round(df.memory_usage(deep=True).sum() / 1024 / 1024, 2),
        'missing_values': check_missing_values(df, dataset_name),
        'duplicated_rows': check_duplicated_rows(df, dataset_name),
        'timestamp_continuity': check_timestamp_continuity(df, dataset_name),
        'data_ranges': check_data_ranges(df, dataset_name, schema),
        'data_types': check_data_types(df, dataset_name, schema),
    }
    range_errors = sum(1 for v in results['data_ranges']
                       if v.get('severity') == 'error')
    error_count = range_errors + len(results['data_types'])
    if error_count == 0:
        results['status'] = 'pass'
    elif error_count < 10:
        results['status'] = 'warning'
    else:
        results['status'] = 'fail'
    return results
def generate_validation_report(all_results, output_dir):
    """Aggregate per-dataset results and write metadata/validation_report.json.

    *all_results* may contain 'error' entries (dataset file not found) that
    lack 'memory_mb'/'rows'; those are summed as 0 via .get() — the original
    code raised KeyError on them. The output directory is created if missing,
    and the timestamp is timezone-aware UTC (datetime.utcnow() is deprecated).
    Returns the report dict.
    """
    total_errors = sum(1 for r in all_results if r['status'] == 'fail')
    total_warnings = sum(1 for r in all_results if r['status'] == 'warning')
    total_pass = sum(1 for r in all_results if r['status'] == 'pass')
    # 'error' entries appended by main() have no size/row fields; count them
    # as zero instead of crashing.
    total_size_mb = sum(r.get('memory_mb', 0) for r in all_results)
    total_rows = sum(r.get('rows', 0) for r in all_results)
    report = {
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'summary': {
            'total_datasets': len(all_results),
            'passed': total_pass,
            'warnings': total_warnings,
            'failed': total_errors,
            'total_size_mb': round(total_size_mb, 2),
            'total_rows': total_rows,
        },
        'datasets': all_results,
    }
    output_path = Path(output_dir) / 'metadata' / 'validation_report.json'
    # Ensure metadata/ exists on a fresh checkout; open() alone would fail.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(report, f, indent=2, default=str)
    return report
def print_summary(report):
    """Pretty-print the validation report to stdout.

    Fixes two defects: (1) 'error' entries from main() (dataset file missing)
    lack 'rows'/'missing_values'/etc. and previously raised KeyError; they are
    now printed as a one-line failure. (2) The per-dataset status icon ternary
    chose between three empty strings; it now uses the same ✓/⚠/✗ icons as
    the summary section.
    """
    print(f"\n{'='*60}")
    print("VALIDATION SUMMARY")
    print(f"{'='*60}")
    print(f"Datasets processed: {report['summary']['total_datasets']}")
    print(f" ✓ Passed: {report['summary']['passed']}")
    print(f" ⚠ Warnings: {report['summary']['warnings']}")
    print(f" ✗ Failed: {report['summary']['failed']}")
    print(f"\nTotal size: {report['summary']['total_size_mb']:.2f}MB")
    print(f"Total rows: {report['summary']['total_rows']:,}")
    print(f"\n{'='*60}")
    print("PER-DATASET DETAILS")
    print(f"{'='*60}")
    for result in report['datasets']:
        # Missing-dataset entries carry only 'dataset'/'status'/'error'.
        if result['status'] == 'error':
            print(f"\n✗ {result['dataset']}: {result.get('error', 'unknown error')}")
            continue
        status_icon = '✓' if result['status'] == 'pass' else '⚠' if result['status'] == 'warning' else '✗'
        print(f"\n{status_icon} {result['dataset']}")
        print(f" Rows: {result['rows']:,} | Columns: {result['columns']} | Size: {result['memory_mb']:.2f}MB")
        if result['missing_values']:
            print(f" Missing values: {len(result['missing_values'])} columns")
        if result['data_ranges']:
            print(f" Range violations: {len(result['data_ranges'])}")
        if result['data_types']:
            print(f" Type issues: {len(result['data_types'])}")
        if result['timestamp_continuity']['status'] == 'checked':
            if result['timestamp_continuity']['gaps_detected'] > 0:
                print(f" Time gaps: {result['timestamp_continuity']['gaps_detected']}")
def main():
    """Validate all processed datasets, write the JSON report, print a summary.

    Fixes the per-dataset status icon ternary that chose between three empty
    strings; it now uses ✓/⚠/✗ consistently with the summary output.
    Returns the report dict.
    """
    config = load_config()  # loaded for parity with other pipeline scripts
    schema = load_schema()
    data_dir = Path(__file__).parent.parent / 'data'
    datasets = [
        'electricity_prices',
        'battery_capacity',
        'renewable_generation',
        'conventional_generation',
        'load_profiles',
        'data_centers',
        'bitcoin_mining',
    ]
    print("Validating processed datasets...\n")
    all_results = []
    for dataset_name in datasets:
        print(f"Validating {dataset_name}...")
        df = load_processed_dataset(dataset_name, data_dir)
        if df is None:
            # Record the failure so the report still covers every dataset.
            print(" ✗ Dataset not found, skipping")
            all_results.append({
                'dataset': dataset_name,
                'status': 'error',
                'error': 'Dataset file not found',
            })
            continue
        result = validate_dataset(df, dataset_name, schema)
        all_results.append(result)
        status_icon = '✓' if result['status'] == 'pass' else '⚠' if result['status'] == 'warning' else '✗'
        print(f" {status_icon} {result['rows']:,} rows, {result['columns']} cols, {result['memory_mb']:.2f}MB")
    report = generate_validation_report(all_results, data_dir)
    print_summary(report)
    print(f"\n{'='*60}")
    print(f"Validation report saved to: {data_dir / 'metadata' / 'validation_report.json'}")
    return report
# Entry-point guard: run validation only when executed as a script, so the
# module can be imported (e.g. for its check functions) without side effects.
if __name__ == '__main__':
    main()