Add transmission datasets and update mining data
Add two new static datasets for cross-region arbitrage calculations:
- transmission_capacity: region-to-region capacity limits (20 rows)
- transmission_cost: transmission costs per path (20 rows)

Update the mining dataset with EUR pricing and power metrics:
- Change btc_price_usd to btc_price_eur
- Add power_efficiency_th_per_mw, power_demand_mw
- Add revenue_eur_per_mwh, profit_eur_per_mwh
- Remove mining_profitability column

Changes include:
- scripts/02_fetch_historical.py: rewrite fetch_bitcoin_mining_data()
- scripts/01_generate_synthetic.py: add transmission data generators
- config/data_config.yaml: add transmission config, update bitcoin config
- config/schema.yaml: add 2 new schemas, update bitcoin_mining schema
- scripts/03_process_merge.py: add 2 new datasets
- scripts/04_validate.py: add 2 new datasets
- test/test_data.py: update for new datasets and bitcoin price reference

Total datasets: 9 (734,491 rows, 17.89 MB)
This commit is contained in:
@@ -210,6 +210,72 @@ def generate_data_center_data(config, timestamps):
|
||||
|
||||
return pd.concat(df_list, ignore_index=True)
|
||||
|
||||
def generate_transmission_capacity_data(config):
    """Build the static region-to-region transmission capacity table.

    One row is produced for every ordered pair of distinct positions in
    ``config['regions']``, so both A -> B and B -> A appear and each row gets
    its own independently drawn values.

    Args:
        config: Mapping providing ``generation.seed``, ``regions`` and a
            ``transmission`` section with ``capacity_base_range``,
            ``capacity_uk_multiplier`` and ``efficiency_range``.

    Returns:
        pandas.DataFrame with the columns ``source_region``,
        ``target_region``, ``capacity_mw``, ``direction`` and ``efficiency``.
    """
    # Reseeds NumPy's global RNG from the configured seed plus a fixed
    # offset of 13, which makes the table reproducible for a given config.
    np.random.seed(config['generation']['seed'] + 13)

    region_names = config['regions']
    settings = config['transmission']

    # Ordered pairs of distinct positions, in nested-loop order.
    links = [
        (origin, destination)
        for origin_idx, origin in enumerate(region_names)
        for dest_idx, destination in enumerate(region_names)
        if origin_idx != dest_idx
    ]

    rows = []
    for origin, destination in links:
        # Draw order (base, jitter, efficiency) fixes the output for a seed;
        # do not reorder these calls.
        mw = np.random.uniform(*settings['capacity_base_range'])
        if 'UK' in (origin, destination):
            # Any link that touches the UK is scaled by the configured factor.
            mw *= settings['capacity_uk_multiplier']
        mw = mw * np.random.uniform(0.8, 1.2)

        rows.append(dict(
            source_region=origin,
            target_region=destination,
            capacity_mw=mw,
            direction='bidirectional',
            efficiency=np.random.uniform(*settings['efficiency_range']),
        ))

    return pd.DataFrame(rows)
|
||||
|
||||
def generate_transmission_cost_data(config):
    """Build the static per-path transmission cost table.

    One row is produced for every ordered pair of distinct positions in
    ``config['regions']``. The total cost is the sum of a loss cost (the
    fractional loss multiplied by a fixed reference price of 80), a congestion
    surcharge and a fee. The efficiency drawn for each path is only used to
    derive ``loss_percent`` and is not stored in the result.

    Args:
        config: Mapping providing ``generation.seed``, ``regions`` and a
            ``transmission`` section with ``efficiency_range``,
            ``congestion_surcharge_range`` and ``fee_range``.

    Returns:
        pandas.DataFrame with the columns ``source_region``,
        ``target_region``, ``cost_eur_mwh``, ``loss_percent``,
        ``congestion_surcharge_eur_mwh`` and ``fee_eur_mwh``.
    """
    # Reseeds NumPy's global RNG from the configured seed plus a fixed
    # offset of 14, which makes the table reproducible for a given config.
    np.random.seed(config['generation']['seed'] + 14)

    region_names = config['regions']
    settings = config['transmission']

    # Flat price used to turn a fractional loss into a cost
    # (presumably EUR/MWh, matching the output column names).
    reference_price = 80

    # Ordered pairs of distinct positions, in nested-loop order.
    links = [
        (origin, destination)
        for origin_idx, origin in enumerate(region_names)
        for dest_idx, destination in enumerate(region_names)
        if origin_idx != dest_idx
    ]

    records = []
    for origin, destination in links:
        # Draw order (efficiency, surcharge, fee) fixes the output for a
        # seed; do not reorder these calls.
        line_efficiency = np.random.uniform(*settings['efficiency_range'])
        surcharge = np.random.uniform(*settings['congestion_surcharge_range'])
        path_fee = np.random.uniform(*settings['fee_range'])

        loss_pct = (1 - line_efficiency) * 100
        cost_of_losses = (loss_pct / 100) * reference_price

        records.append(dict(
            source_region=origin,
            target_region=destination,
            cost_eur_mwh=cost_of_losses + surcharge + path_fee,
            loss_percent=loss_pct,
            congestion_surcharge_eur_mwh=surcharge,
            fee_eur_mwh=path_fee,
        ))

    return pd.DataFrame(records)
|
||||
|
||||
def apply_noise_and_outliers(df, config):
|
||||
if not config['generation']['add_noise']:
|
||||
return df
|
||||
@@ -283,20 +349,27 @@ def main():
|
||||
|
||||
datasets['battery_capacity'] = generate_battery_data(config, timestamps)
|
||||
print(f" - Battery capacity: {len(datasets['battery_capacity'])} rows")
|
||||
|
||||
|
||||
datasets['renewable_generation'] = generate_renewable_data(config, timestamps)
|
||||
print(f" - Renewable generation: {len(datasets['renewable_generation'])} rows")
|
||||
|
||||
|
||||
datasets['conventional_generation'] = generate_conventional_data(config, timestamps)
|
||||
print(f" - Conventional generation: {len(datasets['conventional_generation'])} rows")
|
||||
|
||||
|
||||
datasets['data_centers'] = generate_data_center_data(config, timestamps)
|
||||
print(f" - Data centers: {len(datasets['data_centers'])} rows")
|
||||
|
||||
|
||||
datasets['transmission_capacity'] = generate_transmission_capacity_data(config)
|
||||
print(f" - Transmission capacity: {len(datasets['transmission_capacity'])} rows")
|
||||
|
||||
datasets['transmission_cost'] = generate_transmission_cost_data(config)
|
||||
print(f" - Transmission cost: {len(datasets['transmission_cost'])} rows")
|
||||
|
||||
for name, df in datasets.items():
|
||||
df = apply_noise_and_outliers(df, config)
|
||||
df = add_missing_values(df, config)
|
||||
datasets[name] = df
|
||||
if name not in ['transmission_capacity', 'transmission_cost']:
|
||||
df = apply_noise_and_outliers(df, config)
|
||||
df = add_missing_values(df, config)
|
||||
datasets[name] = df
|
||||
|
||||
output_base = Path(__file__).parent.parent / 'data'
|
||||
output_base.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
Reference in New Issue
Block a user