Add transmission datasets and update mining data

Add two new static datasets for cross-region arbitrage calculations:
- transmission_capacity: region-to-region capacity limits (20 rows)
- transmission_cost: transmission costs per path (20 rows)

Update mining dataset with EUR pricing and power metrics:
- Change btc_price_usd to btc_price_eur
- Add power_efficiency_th_per_mw, power_demand_mw
- Add revenue_eur_per_mwh, profit_eur_per_mwh
- Remove mining_profitability column

Changes include:
- scripts/02_fetch_historical.py: rewrite fetch_bitcoin_mining_data()
- scripts/01_generate_synthetic.py: add transmission data generators
- config/data_config.yaml: add transmission config, update bitcoin config
- config/schema.yaml: add 2 new schemas, update bitcoin_mining schema
- scripts/03_process_merge.py: add 2 new datasets
- scripts/04_validate.py: add 2 new datasets
- test/test_data.py: update for new datasets and bitcoin price reference

Total datasets: 9 (734,491 rows, 17.89 MB)
This commit is contained in:
2026-02-11 01:09:33 +07:00
parent d981f7c56c
commit faaadc1297
10 changed files with 361 additions and 70 deletions

View File

@@ -210,6 +210,72 @@ def generate_data_center_data(config, timestamps):
return pd.concat(df_list, ignore_index=True)
def generate_transmission_capacity_data(config):
    """Generate one synthetic transmission-capacity row per ordered region pair.

    Every ordered pair ``(source, target)`` of distinct positions in
    ``config['regions']`` yields one row, so ``k`` regions give
    ``k * (k - 1)`` rows. The table is static (no timestamp column).

    Args:
        config: Mapping that provides
            ``config['generation']['seed']`` (the global NumPy RNG is
            seeded with ``seed + 13``), ``config['regions']`` (sequence of
            region identifiers) and ``config['transmission']`` with the keys
            ``capacity_base_range``, ``capacity_uk_multiplier`` and
            ``efficiency_range`` (each range is unpacked as ``(low, high)``).

    Returns:
        pandas.DataFrame with the columns ``source_region``,
        ``target_region``, ``capacity_mw``, ``direction`` and ``efficiency``.
        The columns are present even when no region pair exists (fewer than
        two regions), so downstream code always sees the same schema.
    """
    # Seeds the process-wide NumPy RNG; the draw order below must stay stable
    # for the output to remain reproducible for a given seed.
    np.random.seed(config['generation']['seed'] + 13)

    regions = config['regions']
    params = config['transmission']
    columns = [
        'source_region',
        'target_region',
        'capacity_mw',
        'direction',
        'efficiency',
    ]

    rows = []
    for i, src in enumerate(regions):
        for j, tgt in enumerate(regions):
            # Compare positions, not values: a region never links to itself.
            if i == j:
                continue

            base_capacity = np.random.uniform(*params['capacity_base_range'])
            # Any link touching the UK is scaled by a dedicated multiplier.
            if src == 'UK' or tgt == 'UK':
                base_capacity *= params['capacity_uk_multiplier']

            # Extra +/-20% jitter on top of the base draw.
            capacity = base_capacity * np.random.uniform(0.8, 1.2)
            efficiency = np.random.uniform(*params['efficiency_range'])

            rows.append({
                'source_region': src,
                'target_region': tgt,
                'capacity_mw': capacity,
                # NOTE(review): every row is labelled 'bidirectional', yet
                # A->B and B->A are separate rows with independent draws.
                'direction': 'bidirectional',
                'efficiency': efficiency,
            })

    # Explicit columns keep the schema intact when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)
def generate_transmission_cost_data(config):
    """Generate one synthetic transmission-cost row per ordered region pair.

    For each ordered pair of distinct positions in ``config['regions']`` the
    cost is the sum of three parts: the value of the energy lost in transit
    (loss fraction times a reference electricity price), a congestion
    surcharge, and a fixed fee.

    Args:
        config: Mapping that provides
            ``config['generation']['seed']`` (the global NumPy RNG is
            seeded with ``seed + 14``), ``config['regions']`` and
            ``config['transmission']`` with the keys ``efficiency_range``,
            ``congestion_surcharge_range`` and ``fee_range`` (each unpacked
            as ``(low, high)``). An optional
            ``config['transmission']['avg_electricity_price']`` overrides the
            reference price used to value losses; it defaults to 80.

    Returns:
        pandas.DataFrame with the columns ``source_region``,
        ``target_region``, ``cost_eur_mwh``, ``loss_percent``,
        ``congestion_surcharge_eur_mwh`` and ``fee_eur_mwh``. The columns are
        present even when there are fewer than two regions.
    """
    # Seeds the process-wide NumPy RNG; keep the draw order below stable so
    # results stay reproducible for a given seed.
    np.random.seed(config['generation']['seed'] + 14)

    regions = config['regions']
    params = config['transmission']
    # Reference price used to turn a loss fraction into a cost. Previously a
    # hard-coded 80; still 80 unless the config supplies a value.
    avg_electricity_price = params.get('avg_electricity_price', 80)
    columns = [
        'source_region',
        'target_region',
        'cost_eur_mwh',
        'loss_percent',
        'congestion_surcharge_eur_mwh',
        'fee_eur_mwh',
    ]

    rows = []
    for i, src in enumerate(regions):
        for j, tgt in enumerate(regions):
            # Compare positions, not values: no cost row for a self-link.
            if i == j:
                continue

            # NOTE(review): efficiency is drawn here independently of any
            # efficiency value generated for the same path elsewhere.
            efficiency = np.random.uniform(*params['efficiency_range'])
            loss_percent = (1 - efficiency) * 100
            congestion_surcharge = np.random.uniform(
                *params['congestion_surcharge_range']
            )
            fee = np.random.uniform(*params['fee_range'])

            loss_cost = (loss_percent / 100) * avg_electricity_price
            cost_eur_mwh = loss_cost + congestion_surcharge + fee

            rows.append({
                'source_region': src,
                'target_region': tgt,
                'cost_eur_mwh': cost_eur_mwh,
                'loss_percent': loss_percent,
                'congestion_surcharge_eur_mwh': congestion_surcharge,
                'fee_eur_mwh': fee,
            })

    # Explicit columns keep the schema intact when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)
def apply_noise_and_outliers(df, config):
if not config['generation']['add_noise']:
return df
@@ -283,20 +349,27 @@ def main():
datasets['battery_capacity'] = generate_battery_data(config, timestamps)
print(f" - Battery capacity: {len(datasets['battery_capacity'])} rows")
datasets['renewable_generation'] = generate_renewable_data(config, timestamps)
print(f" - Renewable generation: {len(datasets['renewable_generation'])} rows")
datasets['conventional_generation'] = generate_conventional_data(config, timestamps)
print(f" - Conventional generation: {len(datasets['conventional_generation'])} rows")
datasets['data_centers'] = generate_data_center_data(config, timestamps)
print(f" - Data centers: {len(datasets['data_centers'])} rows")
datasets['transmission_capacity'] = generate_transmission_capacity_data(config)
print(f" - Transmission capacity: {len(datasets['transmission_capacity'])} rows")
datasets['transmission_cost'] = generate_transmission_cost_data(config)
print(f" - Transmission cost: {len(datasets['transmission_cost'])} rows")
for name, df in datasets.items():
df = apply_noise_and_outliers(df, config)
df = add_missing_values(df, config)
datasets[name] = df
if name not in ['transmission_capacity', 'transmission_cost']:
df = apply_noise_and_outliers(df, config)
df = add_missing_values(df, config)
datasets[name] = df
output_base = Path(__file__).parent.parent / 'data'
output_base.mkdir(parents=True, exist_ok=True)

View File

@@ -78,50 +78,57 @@ def fetch_electricity_prices(config, timestamps):
# Build a DataFrame of simulated bitcoin-mining metrics with one row per
# entry in `timestamps` (the generated series have length n = len(timestamps)).
# `config` is indexed as config['generation']['seed'] and config['bitcoin'].
# NOTE(review): this looks like a rendered diff hunk with removed and added
# lines interleaved and indentation stripped -- `btc_price_eur` and
# `mining_profitability` are each assigned twice, and both old and new column
# keys appear in the DataFrame literal. Confirm against the actual file
# before relying on the statement order shown here.
def fetch_bitcoin_mining_data(config, timestamps):
# Seeds the process-wide NumPy RNG with the configured seed offset by 11.
np.random.seed(config['generation']['seed'] + 11)
# The f-string below has no placeholders; the message itself says "simulated".
print(f"Fetching bitcoin mining data from mempool.space (simulated)...")
n = len(timestamps)
# A real HTTP request is attempted, but its payload (`fees`) is never read
# afterwards and `base_btc_price` is 45000 on every path.
try:
btc_api = "https://mempool.space/api/v1/fees/recommended"
response = requests.get(btc_api, timeout=10)
if response.status_code == 200:
fees = response.json()
base_btc_price = 45000
else:
base_btc_price = 45000
pass
# NOTE(review): bare `except:` also swallows KeyboardInterrupt/SystemExit;
# consider narrowing it to the request/JSON errors that are expected here.
except:
base_btc_price = 45000
pass
btc_params = config['bitcoin']
# Linear drift factor from 0.95 to 1.05 across the series.
btc_trend = np.linspace(0.95, 1.05, n)
btc_eur_trend = np.linspace(0.95, 1.05, n)
# Cumulative sum of normal steps (sigma 0.01) offset by 1, then rescaled so
# the first element equals 1.
btc_daily_volatility = np.cumsum(np.random.normal(0, 0.01, n)) + 1
btc_daily_volatility = btc_daily_volatility / btc_daily_volatility[0]
# Price = base * trend * volatility path * (1 + 3% Gaussian noise).
btc_price = base_btc_price * btc_trend * btc_daily_volatility * (1 + 0.03 * np.random.randn(n))
base_btc_price_eur = 41400
btc_price_eur = base_btc_price_eur * btc_eur_trend * btc_daily_volatility * (1 + 0.03 * np.random.randn(n))
# Hashrate: one base draw, modulated by a sinusoid with period n/10 samples
# (amplitude 5%) and 2% Gaussian noise.
hashrate_base = np.random.uniform(*btc_params['hashrate_range'])
hashrate = hashrate_base * (1 + 0.05 * np.sin(2 * np.pi * np.arange(n) / (n / 10))) * (1 + 0.02 * np.random.randn(n))
electricity_efficiency = np.random.uniform(*btc_params['mining_efficiency_range'])
# As shown, this overwrites the btc_price_eur computed a few lines above.
btc_price_eur = btc_price * 0.92
# `power_cost_eur` is not referenced again in this block.
power_cost_eur = 50
mining_profitability = (btc_price_eur * 0.0001 / 3.6) / (electricity_efficiency / 1000)
electricity_breakeven = (btc_price_eur * 0.0001 / 3.6) / (mining_profitability / 24 * electricity_efficiency / 1000) * 24
# A single scalar efficiency draw; power demand is hashrate divided by it.
power_efficiency = np.random.uniform(*btc_params['power_efficiency_range'])
power_demand = hashrate / power_efficiency
mining_profitability = (btc_price_eur * 0.0001 / 3.6) / (power_efficiency / 1000)
revenue_eur_per_mwh = mining_profitability * power_efficiency * 24
# Despite its name, this is Normal(40, 5) noise per row; it is stored in the
# 'electricity_cost' column and subtracted from revenue to get profit.
electricity_breakeven = 40 + np.random.normal(0, 5, n)
profit_eur_per_mwh = revenue_eur_per_mwh - electricity_breakeven
data = pd.DataFrame({
'timestamp': timestamps,
# The same constant pool id is used for every row.
'pool_id': 'POOL_001',
'hashrate_ths': hashrate,
'btc_price_usd': btc_price,
'mining_profitability': mining_profitability,
'btc_price_eur': btc_price_eur,
'power_efficiency_th_per_mw': power_efficiency,
'power_demand_mw': power_demand,
'revenue_eur_per_mwh': revenue_eur_per_mwh,
'profit_eur_per_mwh': profit_eur_per_mwh,
'electricity_cost': electricity_breakeven
})
return data
def fetch_load_profiles(config, timestamps):

View File

@@ -126,7 +126,9 @@ def main():
'conventional_generation',
'load_profiles',
'data_centers',
'bitcoin_mining'
'bitcoin_mining',
'transmission_capacity',
'transmission_cost'
]
processed_info = {}

View File

@@ -233,7 +233,9 @@ def main():
'conventional_generation',
'load_profiles',
'data_centers',
'bitcoin_mining'
'bitcoin_mining',
'transmission_capacity',
'transmission_cost'
]
print("Validating processed datasets...\n")