Add transmission datasets and update mining data

Add two new static datasets for cross-region arbitrage calculations:
- transmission_capacity: region-to-region capacity limits (20 rows)
- transmission_cost: transmission costs per path (20 rows)

Update mining dataset with EUR pricing and power metrics:
- Change btc_price_usd to btc_price_eur
- Add power_efficiency_th_per_mw, power_demand_mw
- Add revenue_eur_per_mwh, profit_eur_per_mwh
- Remove mining_profitability column

Changes include:
- scripts/02_fetch_historical.py: rewrite fetch_bitcoin_mining_data()
- scripts/01_generate_synthetic.py: add transmission data generators
- config/data_config.yaml: add transmission config, update bitcoin config
- config/schema.yaml: add 2 new schemas, update bitcoin_mining schema
- scripts/03_process_merge.py: add 2 new datasets
- scripts/04_validate.py: add 2 new datasets
- test/test_data.py: update for new datasets and bitcoin price reference

Total datasets: 9 (734,491 rows, 17.89 MB)
This commit is contained in:
2026-02-11 01:09:33 +07:00
parent d981f7c56c
commit faaadc1297
10 changed files with 361 additions and 70 deletions

View File

@@ -210,6 +210,72 @@ def generate_data_center_data(config, timestamps):
return pd.concat(df_list, ignore_index=True)
def generate_transmission_capacity_data(config):
    """Build the static region-to-region transmission capacity table.

    One row is produced for every ordered pair of distinct positions in
    ``config['regions']``, so N regions yield N * (N - 1) rows.

    Args:
        config: Mapping providing ``generation.seed``, ``regions`` and a
            ``transmission`` section with ``capacity_base_range``,
            ``capacity_uk_multiplier`` and ``efficiency_range``.

    Returns:
        pandas.DataFrame with columns ``source_region``, ``target_region``,
        ``capacity_mw``, ``direction`` and ``efficiency``. The columns are
        present even when fewer than two regions are configured and the
        table is therefore empty.
    """
    # Declared up front so an empty result still carries the full schema;
    # pd.DataFrame([]) on its own would produce a frame with no columns.
    columns = [
        'source_region',
        'target_region',
        'capacity_mw',
        'direction',
        'efficiency',
    ]

    # Seeds NumPy's global RNG; the +13 offset presumably keeps this stream
    # distinct from the other generators in the file -- TODO confirm.
    np.random.seed(config['generation']['seed'] + 13)
    regions = config['regions']
    params = config['transmission']

    rows = []
    for src_idx, src in enumerate(regions):
        for tgt_idx, tgt in enumerate(regions):
            # Skip the diagonal by position, not by name.
            if src_idx == tgt_idx:
                continue

            # The order of the three draws below determines the generated
            # values for a given seed; do not reorder them.
            base_capacity = np.random.uniform(*params['capacity_base_range'])
            if 'UK' in (src, tgt):
                base_capacity *= params['capacity_uk_multiplier']
            capacity = base_capacity * np.random.uniform(0.8, 1.2)
            efficiency = np.random.uniform(*params['efficiency_range'])

            # NOTE(review): every row is labelled 'bidirectional', yet A->B
            # and B->A are emitted as separate rows with independently drawn
            # capacities -- confirm this is the intended semantics.
            rows.append({
                'source_region': src,
                'target_region': tgt,
                'capacity_mw': capacity,
                'direction': 'bidirectional',
                'efficiency': efficiency,
            })

    return pd.DataFrame(rows, columns=columns)
def generate_transmission_cost_data(config):
    """Build the static per-path transmission cost table.

    One row is produced for every ordered pair of distinct positions in
    ``config['regions']``. The total cost of a path is the sum of a loss
    cost (fractional loss times a fixed reference price), a congestion
    surcharge and a fee.

    Args:
        config: Mapping providing ``generation.seed``, ``regions`` and a
            ``transmission`` section with ``efficiency_range``,
            ``congestion_surcharge_range`` and ``fee_range``.

    Returns:
        pandas.DataFrame with columns ``source_region``, ``target_region``,
        ``cost_eur_mwh``, ``loss_percent``, ``congestion_surcharge_eur_mwh``
        and ``fee_eur_mwh``. The columns are present even when fewer than
        two regions are configured and the table is therefore empty.
    """
    # Declared up front so an empty result still carries the full schema;
    # pd.DataFrame([]) on its own would produce a frame with no columns.
    columns = [
        'source_region',
        'target_region',
        'cost_eur_mwh',
        'loss_percent',
        'congestion_surcharge_eur_mwh',
        'fee_eur_mwh',
    ]

    # Seeds NumPy's global RNG; the +14 offset presumably keeps this stream
    # distinct from the other generators in the file -- TODO confirm.
    np.random.seed(config['generation']['seed'] + 14)
    regions = config['regions']
    params = config['transmission']

    # Flat reference price used to turn a fractional loss into a cost;
    # presumably EUR/MWh given the output column names -- TODO confirm.
    avg_electricity_price = 80

    rows = []
    for src_idx, src in enumerate(regions):
        for tgt_idx, tgt in enumerate(regions):
            # Skip the diagonal by position, not by name.
            if src_idx == tgt_idx:
                continue

            # NOTE(review): this efficiency is drawn independently of the one
            # in generate_transmission_capacity_data (different seed offset),
            # so loss_percent here will not match that table's `efficiency`
            # for the same path -- confirm this is acceptable.
            # The order of the three draws below determines the generated
            # values for a given seed; do not reorder them.
            efficiency = np.random.uniform(*params['efficiency_range'])
            loss_percent = (1 - efficiency) * 100
            congestion_surcharge = np.random.uniform(
                *params['congestion_surcharge_range']
            )
            fee = np.random.uniform(*params['fee_range'])

            loss_cost = (loss_percent / 100) * avg_electricity_price
            cost_eur_mwh = loss_cost + congestion_surcharge + fee

            rows.append({
                'source_region': src,
                'target_region': tgt,
                'cost_eur_mwh': cost_eur_mwh,
                'loss_percent': loss_percent,
                'congestion_surcharge_eur_mwh': congestion_surcharge,
                'fee_eur_mwh': fee,
            })

    return pd.DataFrame(rows, columns=columns)
def apply_noise_and_outliers(df, config):
if not config['generation']['add_noise']:
return df
@@ -283,20 +349,27 @@ def main():
datasets['battery_capacity'] = generate_battery_data(config, timestamps)
print(f" - Battery capacity: {len(datasets['battery_capacity'])} rows")
datasets['renewable_generation'] = generate_renewable_data(config, timestamps)
print(f" - Renewable generation: {len(datasets['renewable_generation'])} rows")
datasets['conventional_generation'] = generate_conventional_data(config, timestamps)
print(f" - Conventional generation: {len(datasets['conventional_generation'])} rows")
datasets['data_centers'] = generate_data_center_data(config, timestamps)
print(f" - Data centers: {len(datasets['data_centers'])} rows")
datasets['transmission_capacity'] = generate_transmission_capacity_data(config)
print(f" - Transmission capacity: {len(datasets['transmission_capacity'])} rows")
datasets['transmission_cost'] = generate_transmission_cost_data(config)
print(f" - Transmission cost: {len(datasets['transmission_cost'])} rows")
for name, df in datasets.items():
df = apply_noise_and_outliers(df, config)
df = add_missing_values(df, config)
datasets[name] = df
if name not in ['transmission_capacity', 'transmission_cost']:
df = apply_noise_and_outliers(df, config)
df = add_missing_values(df, config)
datasets[name] = df
output_base = Path(__file__).parent.parent / 'data'
output_base.mkdir(parents=True, exist_ok=True)