#!/usr/bin/env python3
"""
Quick test script to verify and explore the generated test data.

Provides data loading, sample previews, time range checks, and key
statistics for all datasets.
"""
# Provenance: added as test/test_data.py by commit
# 30719abc27d958de19ac8d911e123305b543867c (kbt-devops,
# Tue, 10 Feb 2026 23:28:27 +0700), "Add test_data.py for data verification".

import json
from pathlib import Path

import pandas as pd

# Dataset base names; each is looked up as ``<name>.parquet`` in the data dir.
_DATASETS = (
    'electricity_prices',
    'battery_capacity',
    'renewable_generation',
    'conventional_generation',
    'load_profiles',
    'data_centers',
    'bitcoin_mining',
)

_RULE_WIDTH = 60


def _print_section(title):
    """Print a numbered section title followed by a dashed rule."""
    print(f"\n{title}")
    print("-" * _RULE_WIDTH)


def _load_datasets(data_dir):
    """Read each expected parquet file under *data_dir*, reporting hits and misses."""
    loaded = {}
    for name in _DATASETS:
        file_path = data_dir / f'{name}.parquet'
        if file_path.exists():
            loaded[name] = pd.read_parquet(file_path)
            print(f" ✓ {name:25} {len(loaded[name]):>10,} rows")
        else:
            print(f" ✗ {name:25} NOT FOUND")
    return loaded


def _print_previews(loaded):
    """Print the first three rows and the column list of every loaded dataset."""
    for name, df in loaded.items():
        print(f"\n{name}:")
        print(df.head(3).to_string(index=False))
        print(f" Columns: {list(df.columns)}")


def _print_time_ranges(loaded):
    """Print min/max timestamp and the covered duration for datasets with a 'timestamp' column."""
    for name, df in loaded.items():
        if 'timestamp' not in df.columns:
            continue
        # The conversion is written back into the frame on purpose: the frames
        # returned by main() have always carried the converted column.
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        min_time = df['timestamp'].min()
        max_time = df['timestamp'].max()
        duration = (max_time - min_time).total_seconds() / 3600
        print(f" {name:25} {min_time} to {max_time}")
        print(f" Duration: {duration:.2f} hours")


def _print_key_statistics(loaded):
    """Print summary figures for the price, battery, mining and data-center datasets."""
    if 'electricity_prices' in loaded:
        df = loaded['electricity_prices']
        print("\nElectricity Prices (by region):")
        for region in df['region'].unique():
            region_data = df[df['region'] == region]
            day_ahead = region_data['day_ahead_price']
            real_time = region_data['real_time_price']
            print(f" {region:3s} Day-Ahead: €{day_ahead.mean():.2f} avg, €{day_ahead.max():.2f} max")
            print(f" Real-Time: €{real_time.mean():.2f} avg, €{real_time.max():.2f} max")

    if 'battery_capacity' in loaded:
        df = loaded['battery_capacity']
        print("\nBattery Capacity:")
        # Only the first three battery ids are shown to keep the report short.
        for battery_id in df['battery_id'].unique()[:3]:
            bat_data = df[df['battery_id'] == battery_id]
            print(f" {battery_id} Cap: {bat_data['capacity_mwh'].iloc[0]:.1f} MWh, Avg Charge: {bat_data['charge_level_mwh'].mean():.1f} MWh")

    if 'bitcoin_mining' in loaded:
        df = loaded['bitcoin_mining']
        print("\nBitcoin Mining:")
        print(f" BTC Price: ${df['btc_price_usd'].mean():.2f} avg, ${df['btc_price_usd'].max():.2f} max")
        # NOTE(review): the column is named 'hashrate_ths' but the label says
        # EH/s -- confirm the unit against the data generator.
        print(f" Hashrate: {df['hashrate_ths'].mean():.2f} EH/s avg")
        print(f" Profitability: ${df['mining_profitability'].mean():.4f} /TH/day avg")

    if 'data_centers' in loaded:
        df = loaded['data_centers']
        print("\nData Centers:")
        for dc_id in df['data_center_id'].unique():
            dc_data = df[df['data_center_id'] == dc_id]
            client = dc_data['client_type'].iloc[0]
            print(f" {dc_id} Client: {client:8s}, Avg Demand: {dc_data['power_demand_mw'].mean():.1f} MW")


def _print_metadata_summary(metadata_dir):
    """Print the totals recorded in ``final_metadata.json``, or report that it is missing."""
    meta_file = metadata_dir / 'final_metadata.json'
    if not meta_file.exists():
        # Report the miss the same way missing datasets are reported instead
        # of leaving the section silently empty.
        print(f" ✗ {meta_file.name} NOT FOUND")
        return
    with open(meta_file, encoding='utf-8') as f:
        meta = json.load(f)
    total_rows = sum(d['rows'] for d in meta['datasets'].values())
    print(f" Total datasets: {meta['total_datasets']}")
    print(f" Total size: {meta['total_size_mb']:.2f} MB")
    print(f" Total rows: {total_rows:,}")
    print(f" Generated at: {meta['processed_at']}")


def main(data_dir=None, metadata_dir=None):
    """Load the generated datasets and print a verification report.

    Args:
        data_dir: Directory holding the ``<name>.parquet`` files. Defaults to
            ``data/processed`` two levels above this file.
        metadata_dir: Directory holding ``final_metadata.json``. Defaults to
            ``data/metadata`` two levels above this file.

    Returns:
        Dict mapping dataset name to its loaded DataFrame; datasets whose
        file was not found are absent.

    Raises:
        KeyError: If the metadata file or a loaded dataset lacks a field the
            report reads.
    """
    project_root = Path(__file__).parent.parent
    data_dir = project_root / 'data' / 'processed' if data_dir is None else Path(data_dir)
    metadata_dir = project_root / 'data' / 'metadata' if metadata_dir is None else Path(metadata_dir)

    print("=" * _RULE_WIDTH)
    print("ENERGY TEST DATA - QUICK VERIFICATION")
    print("=" * _RULE_WIDTH)

    _print_section("1. LOADING DATASETS")
    loaded = _load_datasets(data_dir)
    expected = len(_DATASETS)
    print(f"\nTotal datasets loaded: {len(loaded)}/{expected}")

    _print_section("2. SAMPLE DATA PREVIEWS")
    _print_previews(loaded)

    _print_section("3. TIME RANGE VERIFICATION")
    _print_time_ranges(loaded)

    _print_section("4. KEY STATISTICS")
    _print_key_statistics(loaded)

    _print_section("5. METADATA SUMMARY")
    _print_metadata_summary(metadata_dir)

    print("\n" + "=" * _RULE_WIDTH)
    missing = expected - len(loaded)
    if missing == 0:
        print("TEST COMPLETE - ALL DATA LOADED SUCCESSFULLY")
    else:
        # Do not claim success when files were reported NOT FOUND above.
        print(f"TEST COMPLETE - {missing} OF {expected} DATASETS MISSING")
    print("=" * _RULE_WIDTH)

    return loaded


if __name__ == '__main__':
    main()