# Changelog note (from commit message):
# Added two static datasets for cross-region arbitrage calculations:
#   - transmission_capacity: region-to-region capacity limits (20 rows)
#   - transmission_cost: transmission costs per path (20 rows)
# Updated the mining dataset with EUR pricing and power metrics:
#   - btc_price_usd -> btc_price_eur
#   - added power_efficiency_th_per_mw, power_demand_mw
#   - added revenue_eur_per_mwh, profit_eur_per_mwh
#   - removed mining_profitability
# Touched: scripts/01_generate_synthetic.py, scripts/02_fetch_historical.py,
#   scripts/03_process_merge.py, scripts/04_validate.py,
#   config/data_config.yaml, config/schema.yaml, test/test_data.py
# Total datasets: 9 (734,491 rows, 17.89 MB)
"""Validate processed datasets for quality, missing values, and data consistency."""
|
|
|
|
import json
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
|
|
|
|
def load_config():
    """Parse config/data_config.yaml (relative to the repo root) into a dict."""
    config_file = Path(__file__).parent.parent / "config" / "data_config.yaml"
    with open(config_file) as handle:
        return yaml.safe_load(handle)
|
|
|
|
def load_schema():
    """Parse config/schema.yaml (relative to the repo root) into a dict."""
    schema_file = Path(__file__).parent.parent / "config" / "schema.yaml"
    with open(schema_file) as handle:
        return yaml.safe_load(handle)
|
|
|
|
def load_processed_dataset(dataset_name, data_dir):
    """Read data_dir/processed/<dataset_name>.parquet, or return None if absent."""
    parquet_path = Path(data_dir) / 'processed' / f'{dataset_name}.parquet'
    if not parquet_path.exists():
        return None
    return pd.read_parquet(parquet_path)
|
|
|
|
def check_missing_values(df, dataset_name):
    """Report per-column missing-value counts and percentages.

    Returns a dict keyed by column name; columns with no missing values
    are omitted entirely.
    """
    report = {}
    total_rows = len(df)

    # isna().sum() gives one NaN count per column in a single pass.
    for column, na_count in df.isna().sum().items():
        if na_count > 0:
            report[column] = {
                'count': int(na_count),
                'percentage': round((na_count / total_rows) * 100, 2),
            }

    return report
|
|
|
|
def check_data_ranges(df, dataset_name, schema):
    """Check column values against the min/max validation rules in the schema.

    Parameters:
        df: dataset to validate.
        dataset_name: key into schema['validation_rules'].
        schema: parsed schema.yaml contents.

    Returns a list of violation dicts ('column', 'rule', 'violations',
    'severity'); empty when no rules apply or nothing is violated.
    """
    validation_results = []

    # .get() tolerates a schema with no 'validation_rules' section at all
    # (the original indexed it directly and raised KeyError).
    rules = schema.get('validation_rules', {}).get(dataset_name)
    if not rules:
        return validation_results

    for rule in rules:
        column = rule['column']
        if column not in df.columns:
            continue  # rule refers to a column this dataset doesn't have

        col_data = df[column].dropna()

        # A rule may declare 'min', 'max', or both; check each bound present.
        for bound, op, violates in (('min', '>=', col_data.lt),
                                    ('max', '<=', col_data.gt)):
            if bound not in rule:
                continue
            limit = rule[bound]
            count = int(violates(limit).sum())
            if count > 0:
                validation_results.append({
                    'column': column,
                    'rule': f'{bound} {op} {limit}',
                    'violations': count,
                    'severity': 'error',
                })

    return validation_results
|
|
|
|
def check_duplicated_rows(df, dataset_name):
    """Return the number of fully duplicated rows in the dataset."""
    return int(df.duplicated().sum())
|
|
|
|
def check_timestamp_continuity(df, dataset_name, expected_freq='1min'):
    """Detect time gaps larger than 1.5x the expected sampling interval.

    Returns a status dict; datasets without a 'timestamp' column are skipped.
    """
    if 'timestamp' not in df.columns:
        return {'status': 'skipped', 'reason': 'no timestamp column'}

    ordered = df.sort_values('timestamp')['timestamp']
    deltas = ordered.diff().dropna()

    # Anything beyond 1.5x the nominal spacing counts as a gap.
    gap_threshold = pd.Timedelta(expected_freq) * 1.5
    gap_count = int((deltas > gap_threshold).sum())

    return {
        'status': 'checked',
        'expected_frequency': expected_freq,
        'gaps_detected': gap_count,
        'total_rows': len(df),
    }
|
|
|
|
def check_data_types(df, dataset_name, schema):
    """Verify that the dataset's columns exist and match the schema's dtypes.

    Parameters:
        df: dataset to validate.
        dataset_name: key into schema['schemas'].
        schema: parsed schema.yaml contents.

    Returns a list of issue dicts: {'column', 'issue', 'expected'[, 'actual']}.
    Datasets without a schema entry produce no issues.
    """
    type_issues = []

    declared = schema['schemas'].get(dataset_name, {})
    for col_spec in declared.get('columns', []):
        col, expected_type = col_spec['name'], col_spec['type']

        if col not in df.columns:
            type_issues.append({
                'column': col,
                'issue': 'missing',
                'expected': expected_type,
            })
            continue

        actual = df[col].dtype
        mismatch = None  # set to the expected-type label when the dtype is wrong

        if expected_type == 'datetime64[ns]':
            if not pd.api.types.is_datetime64_any_dtype(actual):
                mismatch = 'datetime'
        elif expected_type == 'category':
            # isinstance replaces pd.api.types.is_categorical_dtype, which is
            # deprecated since pandas 2.1.
            if not isinstance(actual, pd.CategoricalDtype):
                mismatch = 'category'
        elif expected_type == 'float32':
            # float64 is accepted: a wider float is harmless for validation.
            if actual not in ['float32', 'float64']:
                mismatch = 'float32'

        if mismatch is not None:
            type_issues.append({
                'column': col,
                'issue': 'wrong_type',
                'expected': mismatch,
                'actual': str(actual),
            })

    return type_issues
|
|
|
|
def validate_dataset(df, dataset_name, schema):
    """Run every quality check against one dataset and summarize the outcome.

    Status is 'pass' with zero errors, 'warning' below 10, 'fail' otherwise.
    """
    range_issues = check_data_ranges(df, dataset_name, schema)
    type_issues = check_data_types(df, dataset_name, schema)

    results = {
        'dataset': dataset_name,
        'rows': len(df),
        'columns': len(df.columns),
        'memory_mb': round(df.memory_usage(deep=True).sum() / 1024 / 1024, 2),
        'missing_values': check_missing_values(df, dataset_name),
        'duplicated_rows': check_duplicated_rows(df, dataset_name),
        'timestamp_continuity': check_timestamp_continuity(df, dataset_name),
        'data_ranges': range_issues,
        'data_types': type_issues,
    }

    # Range violations flagged 'error' plus every type issue count as errors.
    error_count = sum(1 for issue in range_issues
                      if issue.get('severity') == 'error')
    error_count += len(type_issues)

    if error_count == 0:
        results['status'] = 'pass'
    elif error_count < 10:
        results['status'] = 'warning'
    else:
        results['status'] = 'fail'

    return results
|
|
|
|
def generate_validation_report(all_results, output_dir):
    """Aggregate per-dataset results into a JSON report and return it.

    Parameters:
        all_results: list of per-dataset result dicts from validate_dataset(),
            possibly including error stubs ({'dataset', 'status': 'error',
            'error'}) for datasets whose file was missing.
        output_dir: base data directory; the report is written to
            <output_dir>/metadata/validation_report.json.

    Error stubs carry no 'memory_mb'/'rows' keys, so the aggregation uses
    .get() defaults (the original indexed them directly and raised KeyError
    whenever a dataset file was missing).
    """
    report = {
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated.
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'summary': {
            'total_datasets': len(all_results),
            'passed': sum(1 for r in all_results if r['status'] == 'pass'),
            'warnings': sum(1 for r in all_results if r['status'] == 'warning'),
            'failed': sum(1 for r in all_results if r['status'] == 'fail'),
            'total_size_mb': round(
                sum(r.get('memory_mb', 0) for r in all_results), 2),
            'total_rows': sum(r.get('rows', 0) for r in all_results),
        },
        'datasets': all_results,
    }

    output_path = Path(output_dir) / 'metadata' / 'validation_report.json'
    # metadata/ may not exist on a fresh checkout; create it before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(report, f, indent=2, default=str)

    return report
|
|
|
|
def print_summary(report):
    """Print a human-readable validation summary and per-dataset details.

    Handles error stubs ({'dataset', 'status': 'error', 'error'}) appended by
    main() for missing files: they lack 'rows', 'timestamp_continuity', etc.,
    so the original per-dataset loop raised KeyError on them.
    """
    summary = report['summary']

    print(f"\n{'='*60}")
    print("VALIDATION SUMMARY")
    print(f"{'='*60}")
    print(f"Datasets processed: {summary['total_datasets']}")
    print(f" ✓ Passed: {summary['passed']}")
    print(f" ⚠ Warnings: {summary['warnings']}")
    print(f" ✗ Failed: {summary['failed']}")
    print(f"\nTotal size: {summary['total_size_mb']:.2f}MB")
    print(f"Total rows: {summary['total_rows']:,}")

    print(f"\n{'='*60}")
    print("PER-DATASET DETAILS")
    print(f"{'='*60}")

    for result in report['datasets']:
        # Datasets that could not be loaded: report the error and move on.
        if 'error' in result:
            print(f"\n✗ {result['dataset']}: {result['error']}")
            continue

        status_icon = '✓' if result['status'] == 'pass' else '⚠' if result['status'] == 'warning' else '✗'
        print(f"\n{status_icon} {result['dataset']}")
        print(f" Rows: {result['rows']:,} | Columns: {result['columns']} | Size: {result['memory_mb']:.2f}MB")

        if result['missing_values']:
            print(f" Missing values: {len(result['missing_values'])} columns")

        if result['data_ranges']:
            print(f" Range violations: {len(result['data_ranges'])}")

        if result['data_types']:
            print(f" Type issues: {len(result['data_types'])}")

        continuity = result['timestamp_continuity']
        if continuity['status'] == 'checked' and continuity['gaps_detected'] > 0:
            print(f" Time gaps: {continuity['gaps_detected']}")
|
|
|
|
def main():
    """Validate every processed dataset, then write and print the report.

    Returns the full report dict so callers (e.g. tests) can inspect it.
    """
    config = load_config()  # loaded for parity with the other pipeline scripts
    schema = load_schema()

    data_dir = Path(__file__).parent.parent / 'data'

    dataset_names = [
        'electricity_prices',
        'battery_capacity',
        'renewable_generation',
        'conventional_generation',
        'load_profiles',
        'data_centers',
        'bitcoin_mining',
        'transmission_capacity',
        'transmission_cost',
    ]

    print("Validating processed datasets...\n")

    all_results = []
    for dataset_name in dataset_names:
        print(f"Validating {dataset_name}...")

        df = load_processed_dataset(dataset_name, data_dir)
        if df is None:
            # Record a stub so the report still accounts for the dataset.
            print(" ✗ Dataset not found, skipping")
            all_results.append({
                'dataset': dataset_name,
                'status': 'error',
                'error': 'Dataset file not found',
            })
            continue

        result = validate_dataset(df, dataset_name, schema)
        all_results.append(result)

        icon = {'pass': '✓', 'warning': '⚠'}.get(result['status'], '✗')
        print(f" {icon} {result['rows']:,} rows, {result['columns']} cols, {result['memory_mb']:.2f}MB")

    report = generate_validation_report(all_results, data_dir)
    print_summary(report)

    print(f"\n{'='*60}")
    print(f"Validation report saved to: {data_dir / 'metadata' / 'validation_report.json'}")

    return report
|
|
|
|
# Run the validation pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|