diff --git a/config/data_config.yaml b/config/data_config.yaml index 7446c98..31797ef 100644 --- a/config/data_config.yaml +++ b/config/data_config.yaml @@ -93,4 +93,12 @@ data_center: bitcoin: hashrate_range: [150, 250] # EH/s - mining_efficiency_range: [25, 35] # J/TH + power_efficiency_range: [80, 120] # TH/s per MW + eur_usd_rate: 0.92 # For converting to EUR base price + +transmission: + capacity_base_range: [1000, 4000] # MW + capacity_uk_multiplier: 0.6 # UK connections typically lower + efficiency_range: [0.95, 0.99] + congestion_surcharge_range: [0.5, 5.0] # EUR/MWh + fee_range: [0, 2.0] # EUR/MWh diff --git a/config/schema.yaml b/config/schema.yaml index 77ae922..d0cc263 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -169,18 +169,74 @@ schemas: type: "float32" unit: "TH/s" description: "Mining pool hashrate" - - name: "btc_price_usd" + - name: "btc_price_eur" type: "float32" - unit: "USD" - description: "Bitcoin price" - - name: "mining_profitability" + unit: "EUR" + description: "Bitcoin price in EUR" + - name: "power_efficiency_th_per_mw" type: "float32" - unit: "USD/TH/day" - description: "Mining profitability per terahash per day" + unit: "TH/s per MW" + description: "Mining efficiency" + - name: "power_demand_mw" + type: "float32" + unit: "MW" + description: "Power consumption for mining" + - name: "revenue_eur_per_mwh" + type: "float32" + unit: "EUR/MWh" + description: "Mining revenue per MWh of electricity" + - name: "profit_eur_per_mwh" + type: "float32" + unit: "EUR/MWh" + description: "Mining profit after electricity cost" - name: "electricity_cost" type: "float32" unit: "EUR/MWh" - description: "Electricity cost breakeven point" + description: "Electricity cost for mining" + + transmission_capacity: + columns: + - name: "source_region" + type: "category" + description: "Source region code" + - name: "target_region" + type: "category" + description: "Target region code" + - name: "capacity_mw" + type: "float32" + unit: "MW" + description: "Maximum transmission capacity" + - name: "direction" + type: "category" + description: "Transmission direction" + - name: "efficiency" + type: "float32" + description: "Transmission efficiency (0-1)" + + transmission_cost: + columns: + - name: "source_region" + type: "category" + description: "Source region code" + - name: "target_region" + type: "category" + description: "Target region code" + - name: "cost_eur_mwh" + type: "float32" + unit: "EUR/MWh" + description: "Total transmission cost per MWh" + - name: "loss_percent" + type: "float32" + unit: "%" + description: "Transmission loss percentage" + - name: "congestion_surcharge_eur_mwh" + type: "float32" + unit: "EUR/MWh" + description: "Additional congestion charge" + - name: "fee_eur_mwh" + type: "float32" + unit: "EUR/MWh" + description: "Transmission fee" validation_rules: electricity_prices: @@ -229,5 +285,32 @@ validation_rules: bitcoin_mining: - column: "hashrate_ths" min: 0 - - column: "btc_price_usd" + max: 1000000 + - column: "btc_price_eur" min: 1000 + max: 200000 + - column: "power_efficiency_th_per_mw" + min: 50 + max: 150 + - column: "power_demand_mw" + min: 10 + max: 1000 + - column: "revenue_eur_per_mwh" + min: 0 + max: 500 + + transmission_capacity: + - column: "capacity_mw" + min: 100 + max: 10000 + - column: "efficiency" + min: 0.9 + max: 1.0 + + transmission_cost: + - column: "cost_eur_mwh" + min: 0 + max: 50 + - column: "loss_percent" + min: 0 + max: 15 diff --git a/data/metadata/final_metadata.json b/data/metadata/final_metadata.json index ff29c2c..f69ceca 100644 --- a/data/metadata/final_metadata.json +++ b/data/metadata/final_metadata.json @@ -1,7 +1,7 @@ { - "processed_at": "2026-02-10T16:10:49.295018+00:00", - "total_datasets": 7, - "total_size_mb": 16.977967262268066, + "processed_at": "2026-02-10T17:49:27.237574+00:00", + "total_datasets": 9, + "total_size_mb": 17.2280216217041, "datasets": { "electricity_prices": { "path": "/home/user/energy-test-data/data/processed/electricity_prices.parquet", @@ -11,19 +11,19 @@ }, "battery_capacity": { "path": "/home/user/energy-test-data/data/processed/battery_capacity.parquet", - "size_mb": 4.204527854919434, + "size_mb": 4.204350471496582, "rows": 144010, "columns": 7 }, "renewable_generation": { "path": "/home/user/energy-test-data/data/processed/renewable_generation.parquet", - "size_mb": 4.482715606689453, + "size_mb": 4.483729362487793, "rows": 216015, "columns": 7 }, "conventional_generation": { "path": "/home/user/energy-test-data/data/processed/conventional_generation.parquet", - "size_mb": 2.749570846557617, + "size_mb": 2.7516822814941406, "rows": 144010, "columns": 6 }, @@ -35,14 +35,26 @@ }, "data_centers": { "path": "/home/user/energy-test-data/data/processed/data_centers.parquet", - "size_mb": 1.0422554016113281, + "size_mb": 1.0423173904418945, "rows": 72005, "columns": 6 }, "bitcoin_mining": { "path": "/home/user/energy-test-data/data/processed/bitcoin_mining.parquet", - "size_mb": 0.3613767623901367, + "size_mb": 0.5998897552490234, "rows": 14401, + "columns": 9 + }, + "transmission_capacity": { + "path": "/home/user/energy-test-data/data/processed/transmission_capacity.parquet", + "size_mb": 0.0039043426513671875, + "rows": 20, + "columns": 5 + }, + "transmission_cost": { + "path": "/home/user/energy-test-data/data/processed/transmission_cost.parquet", + "size_mb": 0.004627227783203125, + "rows": 20, "columns": 6 } } diff --git a/data/metadata/generation_metadata.json b/data/metadata/generation_metadata.json index 10baa59..c25183b 100644 --- a/data/metadata/generation_metadata.json +++ b/data/metadata/generation_metadata.json @@ -1,5 +1,5 @@ { - "generated_at": "2026-02-10T16:10:43.522420", + "generated_at": "2026-02-10T17:49:15.839052", "datasets": { "battery_capacity": { "rows": 144010, @@ -84,6 +84,44 @@ "max_bid_price": "float64", "client_type": "object" } + }, + "transmission_capacity": { + "rows": 20, + "columns": [ + "source_region", + "target_region", + "capacity_mw", + "direction", + "efficiency" + ], + "memory_usage_mb": 0.004016876220703125, + "dtypes": { + "source_region": "object", + "target_region": "object", + "capacity_mw": "float64", + "direction": "object", + "efficiency": "float64" + } + }, + "transmission_cost": { + "rows": 20, + "columns": [ + "source_region", + "target_region", + "cost_eur_mwh", + "loss_percent", + "congestion_surcharge_eur_mwh", + "fee_eur_mwh" + ], + "memory_usage_mb": 0.002986907958984375, + "dtypes": { + "source_region": "object", + "target_region": "object", + "cost_eur_mwh": "float64", + "loss_percent": "float64", + "congestion_surcharge_eur_mwh": "float64", + "fee_eur_mwh": "float64" + } } } } \ No newline at end of file diff --git a/data/metadata/validation_report.json b/data/metadata/validation_report.json index 2998ffb..d4bd99e 100644 --- a/data/metadata/validation_report.json +++ b/data/metadata/validation_report.json @@ -1,12 +1,12 @@ { - "generated_at": "2026-02-10T16:10:53.614368", + "generated_at": "2026-02-10T17:49:31.592598", "summary": { - "total_datasets": 7, - "passed": 2, + "total_datasets": 9, + "passed": 4, "warnings": 5, "failed": 0, - "total_size_mb": 17.72, - "total_rows": 734451 + "total_size_mb": 17.89, + "total_rows": 734491 }, "datasets": [ { @@ -64,13 +64,13 @@ { "column": "efficiency", "rule": "min >= 0.5", - "violations": 36, + "violations": 56, "severity": "error" }, { "column": "efficiency", "rule": "max <= 1.0", - "violations": 4371, + "violations": 4460, "severity": "error" } ], @@ -111,7 +111,7 @@ { "column": "capacity_factor", "rule": "max <= 1.0", - "violations": 6382, + "violations": 6284, "severity": "error" } ], @@ -148,13 +148,13 @@ { "column": "heat_rate", "rule": "min >= 5", - "violations": 29, + "violations": 27, "severity": "error" }, { "column": "heat_rate", "rule": "max <= 15", - "violations": 867, + "violations": 845, "severity": "error" } ], @@ -204,7 +204,7 @@ { "column": "power_demand_mw", "rule": "min >= 0", - "violations": 137, + "violations": 135, "severity": "error" } ], @@ -214,8 +214,8 @@ { "dataset": "bitcoin_mining", "rows": 14401, - "columns": 6, - "memory_mb": 0.34, + "columns": 9, + "memory_mb": 0.51, "missing_values": {}, "duplicated_rows": 0, "timestamp_continuity": { @@ -226,14 +226,62 @@ }, "data_ranges": [ { - "column": "btc_price_usd", + "column": "btc_price_eur", "rule": "min >= 1000", - "violations": 456, + "violations": 466, + "severity": "error" + }, + { + "column": "power_demand_mw", + "rule": "min >= 10", + "violations": 14401, + "severity": "error" + }, + { + "column": "revenue_eur_per_mwh", + "rule": "min >= 0", + "violations": 359, + "severity": "error" + }, + { + "column": "revenue_eur_per_mwh", + "rule": "max <= 500", + "violations": 13959, "severity": "error" } ], "data_types": [], "status": "warning" + }, + { + "dataset": "transmission_capacity", + "rows": 20, + "columns": 5, + "memory_mb": 0.0, + "missing_values": {}, + "duplicated_rows": 0, + "timestamp_continuity": { + "status": "skipped", + "reason": "no timestamp column" + }, + "data_ranges": [], + "data_types": [], + "status": "pass" + }, + { + "dataset": "transmission_cost", + "rows": 20, + "columns": 6, + "memory_mb": 0.0, + "missing_values": {}, + "duplicated_rows": 0, + "timestamp_continuity": { + "status": "skipped", + "reason": "no timestamp column" + }, + "data_ranges": [], + "data_types": [], + "status": "pass" } ] } \ No newline at end of file diff --git a/scripts/01_generate_synthetic.py b/scripts/01_generate_synthetic.py index 9e3fc5a..492f9c4 100644 --- a/scripts/01_generate_synthetic.py +++ b/scripts/01_generate_synthetic.py @@ -210,6 +210,72 @@ def generate_data_center_data(config, timestamps): return pd.concat(df_list, ignore_index=True) +def generate_transmission_capacity_data(config): + np.random.seed(config['generation']['seed'] + 13) + + regions = config['regions'] + params = config['transmission'] + + data = [] + + for i, src in enumerate(regions): + for j, tgt in enumerate(regions): + if i == j: + continue + + base_capacity = np.random.uniform(*params['capacity_base_range']) + + if src == 'UK' or tgt == 'UK': + base_capacity *= params['capacity_uk_multiplier'] + + capacity = base_capacity * np.random.uniform(0.8, 1.2) + efficiency = np.random.uniform(*params['efficiency_range']) + direction = 'bidirectional' + + data.append({ + 'source_region': src, + 'target_region': tgt, + 'capacity_mw': capacity, + 'direction': direction, + 'efficiency': efficiency + }) + + return pd.DataFrame(data) + +def generate_transmission_cost_data(config): + np.random.seed(config['generation']['seed'] + 14) + + regions = config['regions'] + params = config['transmission'] + + avg_electricity_price = 80 + + data = [] + + for i, src in enumerate(regions): + for j, tgt in enumerate(regions): + if i == j: + continue + + efficiency = np.random.uniform(*params['efficiency_range']) + loss_percent = (1 - efficiency) * 100 + congestion_surcharge = np.random.uniform(*params['congestion_surcharge_range']) + fee = np.random.uniform(*params['fee_range']) + + loss_cost = (loss_percent / 100) * avg_electricity_price + cost_eur_mwh = loss_cost + congestion_surcharge + fee + + data.append({ + 'source_region': src, + 'target_region': tgt, + 'cost_eur_mwh': cost_eur_mwh, + 'loss_percent': loss_percent, + 'congestion_surcharge_eur_mwh': congestion_surcharge, + 'fee_eur_mwh': fee + }) + + return pd.DataFrame(data) + def apply_noise_and_outliers(df, config): if not config['generation']['add_noise']: return df @@ -283,20 +349,27 @@ def main(): datasets['battery_capacity'] = generate_battery_data(config, timestamps) print(f" - Battery capacity: {len(datasets['battery_capacity'])} rows") - + datasets['renewable_generation'] = generate_renewable_data(config, timestamps) print(f" - Renewable generation: {len(datasets['renewable_generation'])} rows") - + datasets['conventional_generation'] = generate_conventional_data(config, timestamps) print(f" - Conventional generation: {len(datasets['conventional_generation'])} rows") - + datasets['data_centers'] = generate_data_center_data(config, timestamps) print(f" - Data centers: {len(datasets['data_centers'])} rows") - + + datasets['transmission_capacity'] = generate_transmission_capacity_data(config) + print(f" - Transmission capacity: {len(datasets['transmission_capacity'])} rows") + + datasets['transmission_cost'] = generate_transmission_cost_data(config) + print(f" - Transmission cost: {len(datasets['transmission_cost'])} rows") + for name, df in datasets.items(): - df = apply_noise_and_outliers(df, config) - df = add_missing_values(df, config) - datasets[name] = df + if name not in ['transmission_capacity', 'transmission_cost']: + df = apply_noise_and_outliers(df, config) + df = add_missing_values(df, config) + datasets[name] = df output_base = Path(__file__).parent.parent / 'data' output_base.mkdir(parents=True, exist_ok=True) diff --git a/scripts/02_fetch_historical.py b/scripts/02_fetch_historical.py index eaad3c9..cabcc52 100644 --- a/scripts/02_fetch_historical.py +++ b/scripts/02_fetch_historical.py @@ -78,50 +78,57 @@ def fetch_electricity_prices(config, timestamps): def fetch_bitcoin_mining_data(config, timestamps): np.random.seed(config['generation']['seed'] + 11) - + print(f"Fetching bitcoin mining data from mempool.space (simulated)...") - + n = len(timestamps) - + try: btc_api = "https://mempool.space/api/v1/fees/recommended" response = requests.get(btc_api, timeout=10) if response.status_code == 200: fees = response.json() - base_btc_price = 45000 else: - base_btc_price = 45000 + pass except: - base_btc_price = 45000 - + pass + btc_params = config['bitcoin'] - - btc_trend = np.linspace(0.95, 1.05, n) + + btc_eur_trend = np.linspace(0.95, 1.05, n) btc_daily_volatility = np.cumsum(np.random.normal(0, 0.01, n)) + 1 btc_daily_volatility = btc_daily_volatility / btc_daily_volatility[0] - - btc_price = base_btc_price * btc_trend * btc_daily_volatility * (1 + 0.03 * np.random.randn(n)) - + + base_btc_price_eur = 41400 + btc_price_eur = base_btc_price_eur * btc_eur_trend * btc_daily_volatility * (1 + 0.03 * np.random.randn(n)) + hashrate_base = np.random.uniform(*btc_params['hashrate_range']) hashrate = hashrate_base * (1 + 0.05 * np.sin(2 * np.pi * np.arange(n) / (n / 10))) * (1 + 0.02 * np.random.randn(n)) - - electricity_efficiency = np.random.uniform(*btc_params['mining_efficiency_range']) - - btc_price_eur = btc_price * 0.92 - power_cost_eur = 50 - mining_profitability = (btc_price_eur * 0.0001 / 3.6) / (electricity_efficiency / 1000) - - electricity_breakeven = (btc_price_eur * 0.0001 / 3.6) / (mining_profitability / 24 * electricity_efficiency / 1000) * 24 - + + power_efficiency = np.random.uniform(*btc_params['power_efficiency_range']) + + power_demand = hashrate / power_efficiency + + mining_profitability = (btc_price_eur * 0.0001 / 3.6) / (power_efficiency / 1000) + + revenue_eur_per_mwh = mining_profitability * power_efficiency * 24 + + electricity_breakeven = 40 + np.random.normal(0, 5, n) + + profit_eur_per_mwh = revenue_eur_per_mwh - electricity_breakeven + data = pd.DataFrame({ 'timestamp': timestamps, 'pool_id': 'POOL_001', 'hashrate_ths': hashrate, - 'btc_price_usd': btc_price, - 'mining_profitability': mining_profitability, + 'btc_price_eur': btc_price_eur, + 'power_efficiency_th_per_mw': power_efficiency, + 'power_demand_mw': power_demand, + 'revenue_eur_per_mwh': revenue_eur_per_mwh, + 'profit_eur_per_mwh': profit_eur_per_mwh, 'electricity_cost': electricity_breakeven }) - + return data def fetch_load_profiles(config, timestamps): diff --git a/scripts/03_process_merge.py b/scripts/03_process_merge.py index 8371efe..15f4680 100644 --- a/scripts/03_process_merge.py +++ b/scripts/03_process_merge.py @@ -126,7 +126,9 @@ def main(): 'conventional_generation', 'load_profiles', 'data_centers', - 'bitcoin_mining' + 'bitcoin_mining', + 'transmission_capacity', + 'transmission_cost' ] processed_info = {} diff --git a/scripts/04_validate.py b/scripts/04_validate.py index 8f3a5a8..c123df7 100644 --- a/scripts/04_validate.py +++ b/scripts/04_validate.py @@ -233,7 +233,9 @@ def main(): 'conventional_generation', 'load_profiles', 'data_centers', - 'bitcoin_mining' + 'bitcoin_mining', + 'transmission_capacity', + 'transmission_cost' ] print("Validating processed datasets...\n") diff --git a/test/test_data.py b/test/test_data.py index bd6633c..9776e57 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -22,7 +22,9 @@ def main(): 'conventional_generation', 'load_profiles', 'data_centers', - 'bitcoin_mining' + 'bitcoin_mining', + 'transmission_capacity', + 'transmission_cost' ] print("\n1. LOADING DATASETS") @@ -37,7 +39,7 @@ def main(): else: print(f" ✗ {name:25} NOT FOUND") - print(f"\nTotal datasets loaded: {len(loaded)}/7") + print(f"\nTotal datasets loaded: {len(loaded)}/9") print("\n2. SAMPLE DATA PREVIEWS") print("-" * 60) @@ -80,9 +82,25 @@ def main(): if 'bitcoin_mining' in loaded: df = loaded['bitcoin_mining'] print(f"\nBitcoin Mining:") - print(f" BTC Price: ${df['btc_price_usd'].mean():.2f} avg, ${df['btc_price_usd'].max():.2f} max") + print(f" BTC Price: €{df['btc_price_eur'].mean():.2f} avg, €{df['btc_price_eur'].max():.2f} max") print(f" Hashrate: {df['hashrate_ths'].mean():.2f} EH/s avg") - print(f" Profitability: ${df['mining_profitability'].mean():.4f} /TH/day avg") + print(f" Power Demand: {df['power_demand_mw'].mean():.1f} MW avg") + print(f" Revenue: €{df['revenue_eur_per_mwh'].mean():.2f} /MWh avg") + print(f" Profit: €{df['profit_eur_per_mwh'].mean():.2f} /MWh avg") + + if 'transmission_capacity' in loaded: + df = loaded['transmission_capacity'] + print(f"\nTransmission Capacity:") + print(f" Total interconnectors: {len(df)}") + print(f" Avg capacity: {df['capacity_mw'].mean():.0f} MW") + print(f" Avg efficiency: {df['efficiency'].mean():.2%}") + + if 'transmission_cost' in loaded: + df = loaded['transmission_cost'] + print(f"\nTransmission Cost:") + print(f" Total paths: {len(df)}") + print(f" Avg cost: €{df['cost_eur_mwh'].mean():.2f} /MWh") + print(f" Avg loss: {df['loss_percent'].mean():.2f}%") if 'data_centers' in loaded: df = loaded['data_centers']