Add test_data.py for data verification

Add utility script to quickly verify and explore generated test data.
Provides data loading, sample previews, time range checks, and key
statistics for all datasets.
This commit is contained in:
2026-02-10 23:28:27 +07:00
parent a643767359
commit 30719abc27

115
test/test_data.py Normal file
View File

@@ -0,0 +1,115 @@
#!/usr/bin/env python3
"""
Quick test script to verify and explore the generated test data.
"""
import pandas as pd
from pathlib import Path
import json
def main():
    """Load, preview, and summarize the generated energy test datasets.

    Reads every expected parquet file from ``data/processed`` (relative to
    the repository root), prints sample rows, time ranges, and key
    statistics, then summarizes ``data/metadata/final_metadata.json``.

    Returns:
        dict[str, pandas.DataFrame]: the datasets found on disk, keyed by
        dataset name. Empty if nothing has been generated yet.
    """
    base_dir = Path(__file__).parent.parent / 'data'
    data_dir = base_dir / 'processed'
    metadata_dir = base_dir / 'metadata'

    print("=" * 60)
    print("ENERGY TEST DATA - QUICK VERIFICATION")
    print("=" * 60)

    datasets = [
        'electricity_prices',
        'battery_capacity',
        'renewable_generation',
        'conventional_generation',
        'load_profiles',
        'data_centers',
        'bitcoin_mining',
    ]

    loaded = _load_datasets(data_dir, datasets)
    _preview_samples(loaded)
    _verify_time_ranges(loaded)
    _print_statistics(loaded)
    _print_metadata_summary(metadata_dir)

    print("\n" + "=" * 60)
    print("TEST COMPLETE - ALL DATA LOADED SUCCESSFULLY")
    print("=" * 60)
    return loaded


def _load_datasets(data_dir, names):
    """Section 1: read each parquet file that exists; report missing ones."""
    print("\n1. LOADING DATASETS")
    print("-" * 60)
    loaded = {}
    for name in names:
        file_path = data_dir / f'{name}.parquet'
        if file_path.exists():
            loaded[name] = pd.read_parquet(file_path)
            print(f"{name:25} {len(loaded[name]):>10,} rows")
        else:
            print(f"{name:25} NOT FOUND")
    # Derive the expected count from the list itself rather than
    # hard-coding 7, so adding a dataset keeps this line correct.
    print(f"\nTotal datasets loaded: {len(loaded)}/{len(names)}")
    return loaded


def _preview_samples(loaded):
    """Section 2: show the first rows and column names of each dataset."""
    print("\n2. SAMPLE DATA PREVIEWS")
    print("-" * 60)
    for name, df in loaded.items():
        print(f"\n{name}:")
        print(df.head(3).to_string(index=False))
        print(f"  Columns: {list(df.columns)}")


def _verify_time_ranges(loaded):
    """Section 3: print min/max timestamp and span for time-indexed data."""
    print("\n3. TIME RANGE VERIFICATION")
    print("-" * 60)
    for name, df in loaded.items():
        if 'timestamp' in df.columns:
            # Normalize in place so callers of main() also get parsed
            # datetimes (matches the original script's behavior).
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            min_time = df['timestamp'].min()
            max_time = df['timestamp'].max()
            duration = (max_time - min_time).total_seconds() / 3600
            print(f"  {name:25} {min_time} to {max_time}")
            print(f"    Duration: {duration:.2f} hours")


def _print_statistics(loaded):
    """Section 4: per-dataset headline statistics for the key datasets."""
    print("\n4. KEY STATISTICS")
    print("-" * 60)
    if 'electricity_prices' in loaded:
        df = loaded['electricity_prices']
        print("\nElectricity Prices (by region):")
        for region in df['region'].unique():
            region_data = df[df['region'] == region]
            print(f"  {region:3s} Day-Ahead: €{region_data['day_ahead_price'].mean():.2f} avg, €{region_data['day_ahead_price'].max():.2f} max")
            print(f"      Real-Time: €{region_data['real_time_price'].mean():.2f} avg, €{region_data['real_time_price'].max():.2f} max")
    if 'battery_capacity' in loaded:
        df = loaded['battery_capacity']
        print("\nBattery Capacity:")
        # Only the first three batteries to keep the report short.
        for battery_id in df['battery_id'].unique()[:3]:
            bat_data = df[df['battery_id'] == battery_id]
            print(f"  {battery_id} Cap: {bat_data['capacity_mwh'].iloc[0]:.1f} MWh, Avg Charge: {bat_data['charge_level_mwh'].mean():.1f} MWh")
    if 'bitcoin_mining' in loaded:
        df = loaded['bitcoin_mining']
        print("\nBitcoin Mining:")
        print(f"  BTC Price: ${df['btc_price_usd'].mean():.2f} avg, ${df['btc_price_usd'].max():.2f} max")
        # NOTE(review): column name says TH/s (`hashrate_ths`) but the label
        # prints EH/s — confirm the intended unit with the data generator.
        print(f"  Hashrate: {df['hashrate_ths'].mean():.2f} EH/s avg")
        print(f"  Profitability: ${df['mining_profitability'].mean():.4f} /TH/day avg")
    if 'data_centers' in loaded:
        df = loaded['data_centers']
        print("\nData Centers:")
        for dc_id in df['data_center_id'].unique():
            dc_data = df[df['data_center_id'] == dc_id]
            client = dc_data['client_type'].iloc[0]
            print(f"  {dc_id} Client: {client:8s}, Avg Demand: {dc_data['power_demand_mw'].mean():.1f} MW")


def _print_metadata_summary(metadata_dir):
    """Section 5: summarize final_metadata.json if it has been written."""
    print("\n5. METADATA SUMMARY")
    print("-" * 60)
    meta_file = metadata_dir / 'final_metadata.json'
    if meta_file.exists():
        with open(meta_file) as f:
            meta = json.load(f)
        print(f"  Total datasets: {meta['total_datasets']}")
        print(f"  Total size: {meta['total_size_mb']:.2f} MB")
        total_rows = sum(d['rows'] for d in meta['datasets'].values())
        print(f"  Total rows: {total_rows:,}")
        print(f"  Generated at: {meta['processed_at']}")
# Entry point: run the verification only when executed as a script, so the
# module can be imported (e.g. to reuse main()) without side effects.
if __name__ == '__main__':
    main()