Add a complete test-data preparation system for the energy trading strategy demo. It includes configuration, data-generation scripts, and validation tools for 7 datasets covering electricity prices, battery capacity, renewable/conventional generation, load profiles, data centers, and mining data. Excluded from git: the actual parquet data files (data/raw/, data/processed/), which can be regenerated with the provided scripts. Datasets: - electricity_prices: day-ahead and real-time prices (5 regions) - battery_capacity: storage-system charge/discharge cycles - renewable_generation: solar, wind, hydro with forecast errors - conventional_generation: gas, coal, nuclear plant outputs - load_profiles: regional demand with weather correlations - data_centers: power-demand profiles including mining operations - mining_data: hashrate, price, profitability (mempool.space API)
223 lines · 7.2 KiB · Python
"""
|
|
Fetch historical data for energy trading strategy test data.
|
|
Handles: electricity prices, bitcoin mining data, load profiles.
|
|
"""
|
|
|
|
import yaml
|
|
import numpy as np
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
import requests
|
|
import json
|
|
import time
|
|
|
|
def load_config():
    """Load data-generation settings from config/data_config.yaml.

    The config file is resolved relative to this script's parent directory
    (i.e. the project root), so the script works from any working directory.

    Returns:
        The parsed YAML document (typically a dict).
    """
    cfg_file = Path(__file__).parent.parent / "config" / "data_config.yaml"
    with cfg_file.open() as stream:
        return yaml.safe_load(stream)
|
|
|
|
def generate_timestamps(start_date, end_date, granularity):
    """Build an inclusive DatetimeIndex covering [start_date, end_date].

    Args:
        start_date: start of the range (anything pandas can parse).
        end_date: end of the range, inclusive.
        granularity: pandas frequency string (e.g. '15min', 'h', 'D').

    Returns:
        pd.DatetimeIndex at the requested frequency.
    """
    return pd.date_range(
        start=pd.to_datetime(start_date),
        end=pd.to_datetime(end_date),
        freq=granularity,
    )
|
|
|
|
def fetch_electricity_prices(config, timestamps):
    """Generate synthetic day-ahead / real-time electricity prices per region.

    Args:
        config: dict with config['generation']['seed'] (int) and
            config['regions'] (list of region codes such as 'FR', 'DE').
        timestamps: pandas DatetimeIndex of observation times.

    Returns:
        pd.DataFrame with columns timestamp, region, day_ahead_price,
        real_time_price, capacity_price, regulation_price, volume_mw
        (one row per region per timestamp).
    """
    # Offset the seed so each fetch_* function draws an independent stream.
    np.random.seed(config['generation']['seed'] + 10)

    regions = config['regions']
    print(f"Fetching electricity prices for {len(regions)} regions...")

    # (base price, diurnal volatility) per region; unknown regions fall back
    # to a high-price, high-volatility profile.
    region_params = {
        'FR': (80, 30),
        'DE': (90, 40),
        'NL': (85, 35),
        'BE': (82, 32),
    }

    # Loop-invariant quantities, computed once. Materialize `hours` as an
    # ndarray so downstream arrays are plain, writable numpy arrays.
    n = len(timestamps)
    hours = np.asarray(timestamps.hour + timestamps.minute / 60)

    df_list = []

    for region in regions:
        base_price, volatility = region_params.get(region, (100, 50))

        # Day-ahead: diurnal sine around the regional base, plus noise.
        day_ahead = base_price + volatility * np.sin(2 * np.pi * hours / 24) + np.random.normal(0, 10, n)
        # Real-time deviates from day-ahead with larger noise.
        real_time = day_ahead + np.random.normal(0, 20, n)

        # ~2% of intervals get a scarcity spike of 100-500 on top.
        price_spikes = np.random.random(n) < 0.02
        real_time[price_spikes] += np.random.uniform(100, 500, int(np.sum(price_spikes)))

        # Ancillary-market prices: strictly non-negative.
        capacity_price = np.abs(np.random.normal(5, 2, n))
        regulation_price = np.abs(np.random.normal(3, 1, n))

        volume = np.random.uniform(1000, 5000, n)

        df_list.append(pd.DataFrame({
            'timestamp': timestamps,
            'region': region,
            'day_ahead_price': day_ahead,
            'real_time_price': real_time,
            'capacity_price': capacity_price,
            'regulation_price': regulation_price,
            'volume_mw': volume,
        }))

    return pd.concat(df_list, ignore_index=True)
|
|
|
|
def fetch_bitcoin_mining_data(config, timestamps):
    """Generate synthetic bitcoin mining data (hashrate, price, profitability).

    Args:
        config: dict with config['generation']['seed'] (int) and
            config['bitcoin'] containing 'hashrate_range' and
            'mining_efficiency_range' as (low, high) pairs.
        timestamps: pandas DatetimeIndex of observation times.

    Returns:
        pd.DataFrame with columns timestamp, pool_id, hashrate_ths,
        btc_price_usd, mining_profitability, electricity_cost.
    """
    # Offset the seed so each fetch_* function draws an independent stream.
    np.random.seed(config['generation']['seed'] + 11)

    print("Fetching bitcoin mining data from mempool.space (simulated)...")

    n = len(timestamps)

    # Best-effort probe of the mempool.space API. The payload is not yet used
    # to calibrate anything, so the base price is a fixed placeholder
    # regardless of the outcome. Catch Exception (not bare except) so
    # KeyboardInterrupt/SystemExit still propagate.
    base_btc_price = 45000
    try:
        requests.get("https://mempool.space/api/v1/fees/recommended", timeout=10)
    except Exception:
        # Offline / DNS failure / requests unavailable -- keep the placeholder.
        pass

    btc_params = config['bitcoin']

    # Gentle upward drift plus a small random-walk volatility factor,
    # normalized to 1 at the first timestamp.
    btc_trend = np.linspace(0.95, 1.05, n)
    btc_daily_volatility = np.cumsum(np.random.normal(0, 0.01, n)) + 1
    btc_daily_volatility = btc_daily_volatility / btc_daily_volatility[0]

    btc_price = base_btc_price * btc_trend * btc_daily_volatility * (1 + 0.03 * np.random.randn(n))

    # Pool hashrate: slow sinusoidal cycle with multiplicative noise around a
    # random base drawn from the configured range.
    hashrate_base = np.random.uniform(*btc_params['hashrate_range'])
    hashrate = hashrate_base * (1 + 0.05 * np.sin(2 * np.pi * np.arange(n) / (n / 10))) * (1 + 0.02 * np.random.randn(n))

    electricity_efficiency = np.random.uniform(*btc_params['mining_efficiency_range'])

    # USD -> EUR at a fixed 0.92 rate, then derive profitability and the
    # break-even electricity cost from the configured mining efficiency.
    btc_price_eur = btc_price * 0.92
    mining_profitability = (btc_price_eur * 0.0001 / 3.6) / (electricity_efficiency / 1000)

    electricity_breakeven = (btc_price_eur * 0.0001 / 3.6) / (mining_profitability / 24 * electricity_efficiency / 1000) * 24

    return pd.DataFrame({
        'timestamp': timestamps,
        'pool_id': 'POOL_001',
        'hashrate_ths': hashrate,
        'btc_price_usd': btc_price,
        'mining_profitability': mining_profitability,
        'electricity_cost': electricity_breakeven,
    })
|
|
|
|
def fetch_load_profiles(config, timestamps):
    """Generate synthetic regional load profiles with weather correlates.

    Args:
        config: dict with config['generation']['seed'] (int) and
            config['regions'] (list of region codes such as 'FR', 'DE').
        timestamps: pandas DatetimeIndex of observation times.

    Returns:
        pd.DataFrame with columns timestamp, region, load_mw, forecast_mw,
        weather_temp, humidity (one row per region per timestamp).
    """
    # Offset the seed so each fetch_* function draws an independent stream.
    np.random.seed(config['generation']['seed'] + 12)

    regions = config['regions']
    print(f"Fetching load profiles for {len(regions)} regions...")

    # Approximate base loads in MW; unknown regions get a mid-size default.
    # (The original code also defined per-region peak hours, but they were
    # never used -- the daily pattern below hard-codes an 18:00 peak.)
    region_base_load = {'FR': 60000, 'DE': 70000, 'NL': 15000, 'BE': 12000}

    # Loop-invariant deterministic quantities, computed once for all regions.
    n = len(timestamps)
    hours = timestamps.hour + timestamps.minute / 60
    day_of_year = timestamps.dayofyear

    # Daily demand: Gaussian bump peaking at 18:00 on top of a 0.7 base.
    daily_pattern = 0.7 + 0.3 * np.exp(-0.5 * ((hours - 18) / 4) ** 2)
    # Annual sinusoidal cycle with a 15-day phase offset, shared by demand,
    # temperature and humidity.
    annual_cycle = np.sin(2 * np.pi * (day_of_year - 15) / 365)
    seasonal_pattern = 0.8 + 0.2 * annual_cycle

    df_list = []

    for region in regions:
        base_load = region_base_load.get(region, 45000)

        load = base_load * daily_pattern * seasonal_pattern * (1 + 0.05 * np.random.randn(n))

        # Forecast tracks actual load with ~3% relative error.
        forecast = load * (1 + np.random.normal(0, 0.03, n))

        # Weather follows the same annual cycle as demand, plus noise.
        temp = 15 + 15 * annual_cycle + np.random.normal(0, 3, n)
        humidity = 60 + 20 * annual_cycle + np.random.normal(0, 10, n)

        df_list.append(pd.DataFrame({
            'timestamp': timestamps,
            'region': region,
            'load_mw': load,
            'forecast_mw': forecast,
            'weather_temp': temp,
            'humidity': humidity,
        }))

    return pd.concat(df_list, ignore_index=True)
|
|
|
|
def save_raw_data(datasets, output_dir):
    """Write each dataset to a snappy-compressed parquet file under <output_dir>/raw.

    Args:
        datasets: mapping of dataset name -> DataFrame.
        output_dir: base data directory (str or Path).

    Returns:
        dict mapping dataset name -> path of the written file (str).
    """
    raw_dir = Path(output_dir) / 'raw'
    raw_dir.mkdir(parents=True, exist_ok=True)

    saved = {}

    for name, frame in datasets.items():
        target = raw_dir / f'{name}_raw.parquet'
        frame.to_parquet(target, compression='snappy')
        saved[name] = str(target)
        print(f" Saved: {target}")

    return saved
|
|
|
|
def main():
    """Generate every historical dataset defined in the config and save to data/raw.

    Returns:
        dict mapping dataset name -> generated DataFrame.
    """
    config = load_config()

    time_config = config['time_range']
    timestamps = generate_timestamps(
        time_config['start_date'],
        time_config['end_date'],
        time_config['granularity'],
    )

    print(f"Fetching historical data for {len(timestamps)} timestamps...")

    datasets = {}

    datasets['electricity_prices'] = fetch_electricity_prices(config, timestamps)
    print(f" - Electricity prices: {len(datasets['electricity_prices'])} rows")

    datasets['bitcoin_mining'] = fetch_bitcoin_mining_data(config, timestamps)
    print(f" - Bitcoin mining: {len(datasets['bitcoin_mining'])} rows")

    datasets['load_profiles'] = fetch_load_profiles(config, timestamps)
    print(f" - Load profiles: {len(datasets['load_profiles'])} rows")

    # save_raw_data returns the saved file paths, but they are not needed here
    # (the original bound them to an unused local).
    output_base = Path(__file__).parent.parent / 'data'
    save_raw_data(datasets, output_base)

    print(f"\nSaved {len(datasets)} historical datasets to data/raw/")

    return datasets


if __name__ == '__main__':
    main()