Files
energy-test-data/test/test_data.py
kbt-devops 30719abc27 Add test_data.py for data verification
Add utility script to quickly verify and explore generated test data.
Provides data loading, sample previews, time range checks, and key
statistics for all datasets.
2026-02-10 23:28:27 +07:00

116 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""
Quick test script to verify and explore the generated test data.
"""
import pandas as pd
from pathlib import Path
import json
def main(data_dir=None, metadata_dir=None):
    """Load the generated test datasets and print a quick verification report.

    The report has five sections: dataset loading, sample previews, time
    range checks for datasets that have a ``timestamp`` column, key
    statistics for selected datasets, and a summary of the metadata file.

    Args:
        data_dir: Directory holding the ``<dataset>.parquet`` files. Defaults
            to ``data/processed`` two directory levels above this file.
        metadata_dir: Directory holding ``final_metadata.json``. Defaults to
            ``data/metadata`` two directory levels above this file.

    Returns:
        dict mapping dataset name to its loaded DataFrame. Datasets whose
        parquet file does not exist are reported as NOT FOUND and omitted.
    """
    base_dir = Path(__file__).parent.parent / 'data'
    data_dir = base_dir / 'processed' if data_dir is None else Path(data_dir)
    metadata_dir = base_dir / 'metadata' if metadata_dir is None else Path(metadata_dir)

    print("="*60)
    print("ENERGY TEST DATA - QUICK VERIFICATION")
    print("="*60)

    datasets = [
        'electricity_prices',
        'battery_capacity',
        'renewable_generation',
        'conventional_generation',
        'load_profiles',
        'data_centers',
        'bitcoin_mining',
    ]

    print("\n1. LOADING DATASETS")
    print("-" * 60)
    loaded = {}
    for name in datasets:
        file_path = data_dir / f'{name}.parquet'
        if file_path.exists():
            loaded[name] = pd.read_parquet(file_path)
            print(f"{name:25} {len(loaded[name]):>10,} rows")
        else:
            print(f"{name:25} NOT FOUND")
    # Derive the denominator from the list so it cannot drift out of sync.
    print(f"\nTotal datasets loaded: {len(loaded)}/{len(datasets)}")

    print("\n2. SAMPLE DATA PREVIEWS")
    print("-" * 60)
    for name, df in loaded.items():
        print(f"\n{name}:")
        print(df.head(3).to_string(index=False))
        print(f" Columns: {list(df.columns)}")

    print("\n3. TIME RANGE VERIFICATION")
    print("-" * 60)
    for name, df in loaded.items():
        if 'timestamp' in df.columns:
            # The conversion is stored back on the frame, so the DataFrames
            # returned to the caller carry a datetime ``timestamp`` column.
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            min_time = df['timestamp'].min()
            max_time = df['timestamp'].max()
            duration = (max_time - min_time).total_seconds() / 3600
            print(f" {name:25} {min_time} to {max_time}")
            print(f" Duration: {duration:.2f} hours")

    print("\n4. KEY STATISTICS")
    print("-" * 60)
    if 'electricity_prices' in loaded:
        df = loaded['electricity_prices']
        print("\nElectricity Prices (by region):")
        for region in df['region'].unique():
            region_data = df[df['region'] == region]
            print(f" {region:3s} Day-Ahead: €{region_data['day_ahead_price'].mean():.2f} avg, €{region_data['day_ahead_price'].max():.2f} max")
            print(f" Real-Time: €{region_data['real_time_price'].mean():.2f} avg, €{region_data['real_time_price'].max():.2f} max")

    if 'battery_capacity' in loaded:
        df = loaded['battery_capacity']
        print("\nBattery Capacity:")
        # Only the first three battery ids are shown to keep the report short.
        for battery_id in df['battery_id'].unique()[:3]:
            bat_data = df[df['battery_id'] == battery_id]
            print(f" {battery_id} Cap: {bat_data['capacity_mwh'].iloc[0]:.1f} MWh, Avg Charge: {bat_data['charge_level_mwh'].mean():.1f} MWh")

    if 'bitcoin_mining' in loaded:
        df = loaded['bitcoin_mining']
        print("\nBitcoin Mining:")
        print(f" BTC Price: ${df['btc_price_usd'].mean():.2f} avg, ${df['btc_price_usd'].max():.2f} max")
        # NOTE(review): the column is named ``hashrate_ths`` but the label
        # says EH/s -- confirm the intended unit against the data generator.
        print(f" Hashrate: {df['hashrate_ths'].mean():.2f} EH/s avg")
        print(f" Profitability: ${df['mining_profitability'].mean():.4f} /TH/day avg")

    if 'data_centers' in loaded:
        df = loaded['data_centers']
        print("\nData Centers:")
        for dc_id in df['data_center_id'].unique():
            dc_data = df[df['data_center_id'] == dc_id]
            client = dc_data['client_type'].iloc[0]
            print(f" {dc_id} Client: {client:8s}, Avg Demand: {dc_data['power_demand_mw'].mean():.1f} MW")

    print("\n5. METADATA SUMMARY")
    print("-" * 60)
    meta_file = metadata_dir / 'final_metadata.json'
    if meta_file.exists():
        # Explicit encoding: do not depend on the platform default for JSON.
        with open(meta_file, encoding='utf-8') as f:
            meta = json.load(f)
        print(f" Total datasets: {meta['total_datasets']}")
        print(f" Total size: {meta['total_size_mb']:.2f} MB")
        total_rows = sum(d['rows'] for d in meta['datasets'].values())
        print(f" Total rows: {total_rows:,}")
        print(f" Generated at: {meta['processed_at']}")

    print("\n" + "="*60)
    # Only claim success when every expected dataset was actually loaded.
    missing = [name for name in datasets if name not in loaded]
    if missing:
        print(f"TEST COMPLETE - {len(missing)} DATASET(S) MISSING: {', '.join(missing)}")
    else:
        print("TEST COMPLETE - ALL DATA LOADED SUCCESSFULLY")
    print("="*60)
    return loaded
# Script entry point: run the verification report only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()