Files
energy-test-data/config/schema.yaml
kbt-devops a643767359 Initial commit: Energy test data generation pipeline
Add complete test data preparation system for energy trading strategy
demo. Includes configuration, data generation scripts, and validation
tools for 7 datasets covering electricity prices, battery capacity,
renewable/conventional generation, load profiles, data centers, and
mining data.

Excluded from git: Actual parquet data files (data/raw/, data/processed/)
can be regenerated using the provided scripts.

Datasets:
- electricity_prices: Day-ahead and real-time prices (5 regions)
- battery_capacity: Storage system charge/discharge cycles
- renewable_generation: Solar, wind, hydro with forecast errors
- conventional_generation: Gas, coal, nuclear plant outputs
- load_profiles: Regional demand with weather correlations
- data_centers: Power demand profiles including mining operations
- mining_data: Hashrate, price, profitability (mempool.space API)
2026-02-10 23:28:23 +07:00

234 lines
6.3 KiB
YAML

# Schema definitions for energy test data datasets
# Column layout for every dataset. Each dataset holds a `columns` list whose
# entries carry `name`, `type`, `description` and, for measured quantities,
# a `unit`.
# NOTE(review): the type strings look like pandas/NumPy dtype names
# (datetime64[ns], category, float32) - confirm against the loader.
schemas:
  # Market prices keyed by timestamp and region; prices in EUR/MWh, volume in MW.
  electricity_prices:
    columns:
      - name: 'timestamp'
        type: 'datetime64[ns]'
        description: 'Timestamp of price observation'
      - name: 'region'
        type: 'category'
        description: 'Market region code'
      - name: 'day_ahead_price'
        type: 'float32'
        unit: 'EUR/MWh'
        description: 'Day-ahead market clearing price'
      - name: 'real_time_price'
        type: 'float32'
        unit: 'EUR/MWh'
        description: 'Real-time market price'
      - name: 'capacity_price'
        type: 'float32'
        unit: 'EUR/MWh'
        description: 'Capacity market price'
      - name: 'regulation_price'
        type: 'float32'
        unit: 'EUR/MWh'
        description: 'Frequency regulation price'
      - name: 'volume_mw'
        type: 'float32'
        unit: 'MW'
        description: 'Traded volume'

  # Battery state keyed by timestamp and battery_id. `efficiency` carries no
  # unit; its description states the 0-1 range.
  battery_capacity:
    columns:
      - name: 'timestamp'
        type: 'datetime64[ns]'
        description: 'Timestamp of battery state'
      - name: 'battery_id'
        type: 'category'
        description: 'Unique battery identifier'
      - name: 'capacity_mwh'
        type: 'float32'
        unit: 'MWh'
        description: 'Total storage capacity'
      - name: 'charge_level_mwh'
        type: 'float32'
        unit: 'MWh'
        description: 'Current energy stored'
      - name: 'charge_rate_mw'
        type: 'float32'
        unit: 'MW'
        description: 'Current charging rate (positive) or discharging (negative)'
      - name: 'discharge_rate_mw'
        type: 'float32'
        unit: 'MW'
        description: 'Maximum discharge rate'
      - name: 'efficiency'
        type: 'float32'
        description: 'Round-trip efficiency (0-1)'

  # Renewable output keyed by timestamp, source and plant_id.
  # NOTE(review): `generation_mw` and `actual_mw` are both described as
  # "actual" generation - confirm how the two columns differ.
  renewable_generation:
    columns:
      - name: 'timestamp'
        type: 'datetime64[ns]'
        description: 'Timestamp of generation measurement'
      - name: 'source'
        type: 'category'
        description: 'Renewable source type (solar, wind, hydro)'
      - name: 'plant_id'
        type: 'category'
        description: 'Unique plant identifier'
      - name: 'generation_mw'
        type: 'float32'
        unit: 'MW'
        description: 'Actual generation output'
      - name: 'forecast_mw'
        type: 'float32'
        unit: 'MW'
        description: 'Forecasted generation'
      - name: 'actual_mw'
        type: 'float32'
        unit: 'MW'
        description: 'Actual measured generation (after correction)'
      - name: 'capacity_factor'
        type: 'float32'
        description: 'Capacity utilization factor (0-1)'

  # Conventional plant output keyed by timestamp and plant_id.
  conventional_generation:
    columns:
      - name: 'timestamp'
        type: 'datetime64[ns]'
        description: 'Timestamp of generation measurement'
      - name: 'plant_id'
        type: 'category'
        description: 'Unique plant identifier'
      - name: 'fuel_type'
        type: 'category'
        description: 'Primary fuel type (gas, coal, nuclear)'
      - name: 'generation_mw'
        type: 'float32'
        unit: 'MW'
        description: 'Current generation output'
      - name: 'marginal_cost'
        type: 'float32'
        unit: 'EUR/MWh'
        description: 'Short-run marginal cost'
      - name: 'heat_rate'
        type: 'float32'
        unit: 'MMBtu/MWh'
        description: 'Thermal efficiency metric'

  # Regional demand keyed by timestamp and region, with weather columns.
  load_profiles:
    columns:
      - name: 'timestamp'
        type: 'datetime64[ns]'
        description: 'Timestamp of load measurement'
      - name: 'region'
        type: 'category'
        description: 'Region code'
      - name: 'load_mw'
        type: 'float32'
        unit: 'MW'
        description: 'Actual system load'
      - name: 'forecast_mw'
        type: 'float32'
        unit: 'MW'
        description: 'Load forecast'
      - name: 'weather_temp'
        type: 'float32'
        unit: 'Celsius'
        description: 'Average temperature'
      - name: 'humidity'
        type: 'float32'
        unit: '%'
        description: 'Relative humidity'

  # Data center demand keyed by timestamp and data_center_id.
  data_centers:
    columns:
      - name: 'timestamp'
        type: 'datetime64[ns]'
        description: 'Timestamp of demand measurement'
      - name: 'data_center_id'
        type: 'category'
        description: 'Data center identifier'
      - name: 'location'
        type: 'category'
        description: 'Geographic location'
      - name: 'power_demand_mw'
        type: 'float32'
        unit: 'MW'
        description: 'Current power demand'
      - name: 'max_bid_price'
        type: 'float32'
        unit: 'EUR/MWh'
        description: 'Maximum price willing to pay'
      - name: 'client_type'
        type: 'category'
        description: 'Client type (bitcoin, enterprise, etc.)'

  # Mining pool metrics keyed by timestamp and pool_id.
  # NOTE(review): price and profitability are in USD while `electricity_cost`
  # is in EUR/MWh - confirm the currency mix is intended.
  bitcoin_mining:
    columns:
      - name: 'timestamp'
        type: 'datetime64[ns]'
        description: 'Timestamp of mining measurement'
      - name: 'pool_id'
        type: 'category'
        description: 'Mining pool identifier'
      - name: 'hashrate_ths'
        type: 'float32'
        unit: 'TH/s'
        description: 'Mining pool hashrate'
      - name: 'btc_price_usd'
        type: 'float32'
        unit: 'USD'
        description: 'Bitcoin price'
      - name: 'mining_profitability'
        type: 'float32'
        unit: 'USD/TH/day'
        description: 'Mining profitability per terahash per day'
      - name: 'electricity_cost'
        type: 'float32'
        unit: 'EUR/MWh'
        description: 'Electricity cost breakeven point'
# Range checks per dataset. Each rule names a `column` plus an optional
# numeric `min` and/or `max`; a bound that is absent is simply not listed.
# NOTE(review): assumed to be a top-level sibling of `schemas` - confirm
# against the code that loads this file.
validation_rules:
  electricity_prices:
    - column: 'day_ahead_price'
      min: -500
      max: 3000
    - column: 'real_time_price'
      min: -500
      max: 5000

  battery_capacity:
    # `check_max` names another column rather than a constant - presumably the
    # per-row upper bound is read from that column; verify in the validator.
    - column: 'charge_level_mwh'
      min: 0
      check_max: 'capacity_mwh'
    - column: 'efficiency'
      min: 0.5
      max: 1.0

  renewable_generation:
    - column: 'generation_mw'
      min: 0
    - column: 'capacity_factor'
      min: 0
      max: 1.0

  conventional_generation:
    - column: 'generation_mw'
      min: 0
    - column: 'heat_rate'
      min: 5
      max: 15

  # NOTE(review): `humidity` (unit "%") has no range rule here - consider
  # whether a 0-100 bound is wanted.
  load_profiles:
    - column: 'load_mw'
      min: 0
    - column: 'weather_temp'
      min: -30
      max: 50

  data_centers:
    - column: 'power_demand_mw'
      min: 0
    - column: 'max_bid_price'
      min: 0

  bitcoin_mining:
    - column: 'hashrate_ths'
      min: 0
    - column: 'btc_price_usd'
      min: 1000