Initial commit: Energy test data generation pipeline
Add a complete test data preparation system for the energy trading strategy demo. Includes configuration, data generation scripts, and validation tools for 7 datasets covering electricity prices, battery capacity, renewable/conventional generation, load profiles, data centers, and mining data. Excluded from git: the actual parquet data files (data/raw/, data/processed/), which can be regenerated using the provided scripts. Datasets: - electricity_prices: Day-ahead and real-time prices (5 regions) - battery_capacity: Storage system charge/discharge cycles - renewable_generation: Solar, wind, hydro with forecast errors - conventional_generation: Gas, coal, nuclear plant outputs - load_profiles: Regional demand with weather correlations - data_centers: Power demand profiles including mining operations - mining_data: Hashrate, price, profitability (mempool.space API)
This commit is contained in:
172
scripts/03_process_merge.py
Normal file
172
scripts/03_process_merge.py
Normal file
@@ -0,0 +1,172 @@
|
||||
"""
|
||||
Process and merge all datasets, apply compression, and save to Parquet format.
|
||||
"""
|
||||
|
||||
import yaml
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
import json
|
||||
import sys
|
||||
|
||||
def load_config():
    """Read and parse config/data_config.yaml from the project root."""
    project_root = Path(__file__).parent.parent
    with open(project_root / "config" / "data_config.yaml") as fh:
        return yaml.safe_load(fh)
|
||||
|
||||
def load_dataset(dataset_name, data_base):
    """Load the raw parquet file for *dataset_name*, if it exists.

    Parameters
    ----------
    dataset_name : str
        Base name of the dataset (e.g. 'electricity_prices').
    data_base : Path
        Root data directory containing the 'raw' subdirectory.

    Returns
    -------
    pandas.DataFrame or None
        Concatenated raw data, or None when no raw file was found.
    """
    # Fix: removed the unused `synthetic_path` local (it pointed at
    # metadata/generation_metadata.json but was never read).
    df_list = []

    raw_path = data_base / 'raw' / f'{dataset_name}_raw.parquet'
    if raw_path.exists():
        print(f"  Loading {dataset_name} from raw data...")
        df_list.append(pd.read_parquet(raw_path))

    # Fix: concatenate once and reuse the result — the original rebuilt the
    # concatenated frame twice (once for the row-count print, once to return).
    merged = pd.concat(df_list, ignore_index=True) if df_list else None
    print(f"  Total rows for {dataset_name}: {len(merged) if merged is not None else 0}")

    return merged
|
||||
|
||||
def downgrade_precision(df, config):
    """Shrink float64 columns to the configured precision and int64 to int32.

    The 'timestamp' column is skipped in both passes. Modifies *df* in
    place and returns it.
    """
    target_float = config['output'].get('precision', 'float32')

    float_cols = [c for c in df.select_dtypes(include=['float64']).columns if c != 'timestamp']
    for name in float_cols:
        df[name] = df[name].astype(target_float)

    int_cols = [c for c in df.select_dtypes(include=['int64']).columns if c != 'timestamp']
    for name in int_cols:
        df[name] = df[name].astype('int32')

    return df
|
||||
|
||||
def convert_categoricals(df):
|
||||
for col in df.select_dtypes(include=['object']).columns:
|
||||
if col == 'timestamp':
|
||||
continue
|
||||
if df[col].nunique() < df.shape[0] * 0.5:
|
||||
df[col] = df[col].astype('category')
|
||||
|
||||
return df
|
||||
|
||||
def optimize_memory(df, config=None):
    """Reduce *df*'s memory footprint via dtype downcasting and categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize; modified in place and also returned.
    config : dict, optional
        Pipeline config with an 'output' section (as produced by
        load_config). Defaults to float32 precision, matching the
        previously hard-coded behavior, so existing callers are unchanged.

    Returns
    -------
    pandas.DataFrame
    """
    # Generalization: accept the real pipeline config instead of always
    # fabricating a literal one; the default preserves the old behavior.
    if config is None:
        config = {'output': {'precision': 'float32'}}

    start_mem = df.memory_usage(deep=True).sum() / 1024 / 1024

    df = downgrade_precision(df, config)
    df = convert_categoricals(df)

    end_mem = df.memory_usage(deep=True).sum() / 1024 / 1024

    reduction = (1 - end_mem / start_mem) * 100
    print(f"  Memory: {start_mem:.2f}MB -> {end_mem:.2f}MB ({reduction:.1f}% reduction)")

    return df
|
||||
|
||||
def save_processed_dataset(df, dataset_name, output_dir, config):
    """Write *df* to <output_dir>/<dataset_name>.parquet and return stats.

    Compression codec comes from config['output']['compression']
    (default 'snappy'). Returns a dict with path, size_mb, rows, columns.
    """
    destination = Path(output_dir) / f'{dataset_name}.parquet'
    codec = config['output'].get('compression', 'snappy')

    df.to_parquet(destination, compression=codec, index=False)

    size_mb = destination.stat().st_size / 1024 / 1024
    print(f"  Saved: {destination} ({size_mb:.2f}MB)")

    stats = {
        'path': str(destination),
        'size_mb': size_mb,
        'rows': len(df),
        'columns': len(df.columns)
    }
    return stats
|
||||
|
||||
def validate_timestamps(df, dataset_name):
    """Check that *df* has a 'timestamp' column; normalize it, warn on dupes.

    Converts the column to datetime in place (visible to the caller).
    Returns False when the column is missing, True otherwise — duplicates
    only produce a warning, not a failure.
    """
    if 'timestamp' not in df.columns:
        print(f"  Warning: {dataset_name} has no timestamp column")
        return False

    df['timestamp'] = pd.to_datetime(df['timestamp'])

    dup_count = df['timestamp'].duplicated().sum()
    if dup_count:
        print(f"  Warning: {dataset_name} has {dup_count} duplicate timestamps")

    return True
|
||||
|
||||
def generate_final_metadata(processed_info, output_dir):
    """Write a summary JSON describing all processed datasets.

    Parameters
    ----------
    processed_info : dict
        Mapping of dataset name -> stats dict; each value must contain
        'size_mb' (as returned by save_processed_dataset).
    output_dir : Path
        Data root; the file lands in <output_dir>/metadata/final_metadata.json.

    Returns
    -------
    dict
        The metadata that was written.
    """
    metadata = {
        # Fix: pd.Timestamp.utcnow() is deprecated in pandas 2.x;
        # now(tz='UTC') is the documented replacement.
        'processed_at': pd.Timestamp.now(tz='UTC').isoformat(),
        'total_datasets': len(processed_info),
        'total_size_mb': sum(info['size_mb'] for info in processed_info.values()),
        'datasets': processed_info
    }

    output_path = Path(output_dir) / 'metadata' / 'final_metadata.json'
    # Fix: main() only creates data/processed, so data/metadata may not
    # exist yet and open() would raise FileNotFoundError on a fresh checkout.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(metadata, f, indent=2, default=str)

    return metadata
|
||||
|
||||
def main():
    """Run the full pipeline: load, validate, optimize, and save every dataset.

    Returns the per-dataset stats dict produced by save_processed_dataset.
    """
    config = load_config()

    data_base = Path(__file__).parent.parent / 'data'
    processed_dir = data_base / 'processed'
    processed_dir.mkdir(parents=True, exist_ok=True)

    print("Processing and merging datasets...")

    dataset_names = (
        'electricity_prices',
        'battery_capacity',
        'renewable_generation',
        'conventional_generation',
        'load_profiles',
        'data_centers',
        'bitcoin_mining',
    )

    processed_info = {}
    for name in dataset_names:
        print(f"\nProcessing {name}...")

        frame = load_dataset(name, data_base)
        if frame is None:
            print(f"  Warning: {name} has no data, skipping")
            continue

        # Note: also normalizes the timestamp column in place.
        validate_timestamps(frame, name)

        print("  Optimizing memory...")
        frame = optimize_memory(frame)

        processed_info[name] = save_processed_dataset(frame, name, processed_dir, config)

    banner = '=' * 60
    print("\n" + banner)
    print("Processing complete!")
    print(banner)

    metadata = generate_final_metadata(processed_info, data_base)

    target_mb = config['output']['target_size_mb']
    total_mb = metadata['total_size_mb']

    print(f"\nTotal datasets processed: {len(processed_info)}")
    print(f"Total size: {total_mb:.2f}MB")
    print(f"Target size: {target_mb}MB")

    if total_mb > target_mb:
        print(f"Warning: Total size exceeds target by {total_mb - target_mb:.2f}MB")
    else:
        print("✓ Total size within target")

    print(f"\nProcessed data saved to: {processed_dir}")
    print(f"Metadata saved to: {data_base / 'metadata' / 'final_metadata.json'}")

    return processed_info
|
||||
|
||||
# Script entry point: run the full processing pipeline when executed directly.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user