Add test_data.py for data verification

Add utility script to quickly verify and explore generated test data.
Provides data loading, sample previews, time range checks, and key
statistics for all datasets.
This commit is contained in:
2026-02-10 23:28:27 +07:00
parent a643767359
commit 30719abc27

115
test/test_data.py Normal file
View File

@@ -0,0 +1,115 @@
#!/usr/bin/env python3
"""
Quick test script to verify and explore the generated test data.
"""
import pandas as pd
from pathlib import Path
import json
def main():
    """Load, preview, and summarize the generated energy test datasets.

    Reads every expected parquet file from ``data/processed`` (relative to
    the repository root), prints sample rows, time ranges, and key
    statistics, then summarizes ``data/metadata/final_metadata.json``.

    Returns:
        dict[str, pandas.DataFrame]: the datasets found on disk, keyed by
        dataset name. Empty if nothing has been generated yet.
    """
    base_dir = Path(__file__).parent.parent / 'data'
    data_dir = base_dir / 'processed'
    metadata_dir = base_dir / 'metadata'

    print("=" * 60)
    print("ENERGY TEST DATA - QUICK VERIFICATION")
    print("=" * 60)

    datasets = [
        'electricity_prices',
        'battery_capacity',
        'renewable_generation',
        'conventional_generation',
        'load_profiles',
        'data_centers',
        'bitcoin_mining',
    ]

    loaded = _load_datasets(data_dir, datasets)
    _preview_samples(loaded)
    _verify_time_ranges(loaded)
    _print_statistics(loaded)
    _print_metadata_summary(metadata_dir)

    print("\n" + "=" * 60)
    print("TEST COMPLETE - ALL DATA LOADED SUCCESSFULLY")
    print("=" * 60)
    return loaded


def _load_datasets(data_dir, names):
    """Section 1: read each parquet file that exists; report missing ones."""
    print("\n1. LOADING DATASETS")
    print("-" * 60)
    loaded = {}
    for name in names:
        file_path = data_dir / f'{name}.parquet'
        if file_path.exists():
            loaded[name] = pd.read_parquet(file_path)
            print(f"{name:25} {len(loaded[name]):>10,} rows")
        else:
            print(f"{name:25} NOT FOUND")
    # Derive the expected count from the list itself rather than
    # hard-coding 7, so adding a dataset keeps this line correct.
    print(f"\nTotal datasets loaded: {len(loaded)}/{len(names)}")
    return loaded


def _preview_samples(loaded):
    """Section 2: show the first rows and column names of each dataset."""
    print("\n2. SAMPLE DATA PREVIEWS")
    print("-" * 60)
    for name, df in loaded.items():
        print(f"\n{name}:")
        print(df.head(3).to_string(index=False))
        print(f"  Columns: {list(df.columns)}")


def _verify_time_ranges(loaded):
    """Section 3: print min/max timestamp and span for time-indexed data."""
    print("\n3. TIME RANGE VERIFICATION")
    print("-" * 60)
    for name, df in loaded.items():
        if 'timestamp' in df.columns:
            # Normalize in place so callers of main() also get parsed
            # datetimes (matches the original script's behavior).
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            min_time = df['timestamp'].min()
            max_time = df['timestamp'].max()
            duration = (max_time - min_time).total_seconds() / 3600
            print(f"  {name:25} {min_time} to {max_time}")
            print(f"    Duration: {duration:.2f} hours")


def _print_statistics(loaded):
    """Section 4: per-dataset headline statistics for the key datasets."""
    print("\n4. KEY STATISTICS")
    print("-" * 60)
    if 'electricity_prices' in loaded:
        df = loaded['electricity_prices']
        print("\nElectricity Prices (by region):")
        for region in df['region'].unique():
            region_data = df[df['region'] == region]
            print(f"  {region:3s} Day-Ahead: €{region_data['day_ahead_price'].mean():.2f} avg, €{region_data['day_ahead_price'].max():.2f} max")
            print(f"      Real-Time: €{region_data['real_time_price'].mean():.2f} avg, €{region_data['real_time_price'].max():.2f} max")
    if 'battery_capacity' in loaded:
        df = loaded['battery_capacity']
        print("\nBattery Capacity:")
        # Only the first three batteries to keep the report short.
        for battery_id in df['battery_id'].unique()[:3]:
            bat_data = df[df['battery_id'] == battery_id]
            print(f"  {battery_id} Cap: {bat_data['capacity_mwh'].iloc[0]:.1f} MWh, Avg Charge: {bat_data['charge_level_mwh'].mean():.1f} MWh")
    if 'bitcoin_mining' in loaded:
        df = loaded['bitcoin_mining']
        print("\nBitcoin Mining:")
        print(f"  BTC Price: ${df['btc_price_usd'].mean():.2f} avg, ${df['btc_price_usd'].max():.2f} max")
        # NOTE(review): column name says TH/s (`hashrate_ths`) but the label
        # prints EH/s — confirm the intended unit with the data generator.
        print(f"  Hashrate: {df['hashrate_ths'].mean():.2f} EH/s avg")
        print(f"  Profitability: ${df['mining_profitability'].mean():.4f} /TH/day avg")
    if 'data_centers' in loaded:
        df = loaded['data_centers']
        print("\nData Centers:")
        for dc_id in df['data_center_id'].unique():
            dc_data = df[df['data_center_id'] == dc_id]
            client = dc_data['client_type'].iloc[0]
            print(f"  {dc_id} Client: {client:8s}, Avg Demand: {dc_data['power_demand_mw'].mean():.1f} MW")


def _print_metadata_summary(metadata_dir):
    """Section 5: summarize final_metadata.json if it has been written."""
    print("\n5. METADATA SUMMARY")
    print("-" * 60)
    meta_file = metadata_dir / 'final_metadata.json'
    if meta_file.exists():
        with open(meta_file) as f:
            meta = json.load(f)
        print(f"  Total datasets: {meta['total_datasets']}")
        print(f"  Total size: {meta['total_size_mb']:.2f} MB")
        total_rows = sum(d['rows'] for d in meta['datasets'].values())
        print(f"  Total rows: {total_rows:,}")
        print(f"  Generated at: {meta['processed_at']}")
# Entry point: run the verification only when executed as a script, so the
# module can be imported (e.g. to reuse main()) without side effects.
if __name__ == '__main__':
    main()