Initial commit: Energy test data generation pipeline
Add complete test data preparation system for the energy trading strategy demo. Includes configuration, data generation scripts, and validation tools for 7 datasets covering electricity prices, battery capacity, renewable/conventional generation, load profiles, data centers, and mining data.

Excluded from git: the actual parquet data files (data/raw/, data/processed/) can be regenerated using the provided scripts.

Datasets:
- electricity_prices: Day-ahead and real-time prices (5 regions)
- battery_capacity: Storage system charge/discharge cycles
- renewable_generation: Solar, wind, hydro with forecast errors
- conventional_generation: Gas, coal, nuclear plant outputs
- load_profiles: Regional demand with weather correlations
- data_centers: Power demand profiles including mining operations
- mining_data: Hashrate, price, profitability (mempool.space API)
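All four scripts read config/data_config.yaml. The committed values are not reproduced on this page, so the fragment below is a hypothetical sketch showing only the keys the scripts actually reference; every value is an illustrative placeholder, not the committed configuration.

# Hypothetical sketch of config/data_config.yaml -- keys inferred from the
# scripts in this commit; all values are illustrative placeholders.
time_range:
  start_date: "2024-01-01"
  end_date: "2024-01-31"
  granularity: "1min"

generation:
  seed: 42
  add_noise: true
  noise_level: 0.02
  outlier_rate: 0.001
  include_missing_values: true
  missing_rate: 0.005

regions: [FR, DE, NL, BE, UK]

data_sources:
  battery_capacity: {num_batteries: 10}
  renewable_generation: {sources: [solar, wind, hydro], plants_per_source: 5}
  conventional_generation: {num_plants: 10, fuel_types: [gas, coal, nuclear]}
  data_centers: {num_centers: 5}

battery:
  capacity_range: [10, 100]         # MWh
  charge_rate_range: [5, 25]        # MW
  discharge_rate_range: [5, 25]     # MW
  efficiency_range: [0.85, 0.95]

renewable:
  solar: {capacity_range: [20, 200], forecast_error_sd: 0.15}
  wind: {capacity_range: [20, 300], forecast_error_sd: 0.20}
  hydro: {capacity_range: [50, 500], forecast_error_sd: 0.05}

conventional:
  gas: {capacity_range: [100, 600], marginal_cost_range: [60, 120]}
  coal: {capacity_range: [200, 800], marginal_cost_range: [40, 90]}
  nuclear: {capacity_range: [900, 1600], marginal_cost_range: [10, 30]}

data_center:
  power_demand_range: [5, 50]           # MW
  price_sensitivity_range: [1.0, 3.0]

bitcoin:
  hashrate_range: [100, 500]            # TH/s
  mining_efficiency_range: [20, 40]     # J/TH

output:
  compression: snappy
  precision: float32
  target_size_mb: 100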
scripts/01_generate_synthetic.py (new file, 320 lines)
@@ -0,0 +1,320 @@
"""
Generate synthetic test data for the energy trading strategy demo.
Handles: battery capacity, data centers, renewable generation, conventional generation.
"""

import yaml
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
import json


def load_config():
    config_path = Path(__file__).parent.parent / "config" / "data_config.yaml"
    with open(config_path) as f:
        return yaml.safe_load(f)


def generate_timestamps(start_date, end_date, granularity):
    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)
    return pd.date_range(start=start, end=end, freq=granularity)


def generate_battery_data(config, timestamps):
    np.random.seed(config['generation']['seed'])
    num_batteries = config['data_sources']['battery_capacity']['num_batteries']
    params = config['battery']

    batteries = []
    for i in range(num_batteries):
        battery_id = f"BAT_{i+1:03d}"
        capacity = np.random.uniform(*params['capacity_range'])
        charge_rate = np.random.uniform(*params['charge_rate_range'])
        discharge_rate = np.random.uniform(*params['discharge_rate_range'])
        efficiency = np.random.uniform(*params['efficiency_range'])

        n = len(timestamps)

        # Random-walk state of charge: each step the battery charges, idles,
        # or discharges, clipped to [0, capacity].
        charge_level = np.zeros(n)
        charge_level[0] = capacity * np.random.uniform(0.3, 0.7)

        for t in range(1, n):
            action = np.random.choice([-1, 0, 1], p=[0.3, 0.2, 0.5])
            rate = charge_rate if action > 0 else discharge_rate
            # MW rate over one step -> MWh change (assumes 1-minute granularity)
            change = action * rate / 60
            charge_level[t] = np.clip(charge_level[t-1] + change, 0, capacity)

        # Realized rate in MW (positive = charging), recovered from the levels
        current_rate = np.diff(charge_level, prepend=charge_level[0]) * 60
        current_rate = np.clip(current_rate, -discharge_rate, charge_rate)

        data = pd.DataFrame({
            'timestamp': timestamps,
            'battery_id': battery_id,
            'capacity_mwh': capacity,
            'charge_level_mwh': charge_level,
            'charge_rate_mw': current_rate,
            'discharge_rate_mw': discharge_rate,
            'efficiency': efficiency
        })
        batteries.append(data)

    return pd.concat(batteries, ignore_index=True)


def generate_renewable_data(config, timestamps):
    np.random.seed(config['generation']['seed'] + 1)

    sources = config['data_sources']['renewable_generation']['sources']
    plants_per_source = config['data_sources']['renewable_generation']['plants_per_source']
    params = config['renewable']

    df_list = []

    for source in sources:
        source_params = params[source]
        for i in range(plants_per_source):
            plant_id = f"{source.upper()}_{i+1:03d}"
            capacity = np.random.uniform(*source_params['capacity_range'])
            forecast_error_sd = source_params['forecast_error_sd']

            n = len(timestamps)
            hours = timestamps.hour + timestamps.minute / 60

            if source == 'solar':
                # Daylight bell between ~06:00 and ~18:00, stronger in summer
                base_pattern = np.maximum(0, np.sin(np.pi * (hours - 6) / 12))
                seasonal = 0.7 + 0.3 * np.sin(2 * np.pi * timestamps.dayofyear / 365)
            elif source == 'wind':
                base_pattern = 0.4 + 0.3 * np.sin(2 * np.pi * hours / 24) + 0.3 * np.random.randn(n)
                seasonal = 0.8 + 0.2 * np.sin(2 * np.pi * timestamps.dayofyear / 365)
            else:  # hydro: flat profile with noise, no seasonal cycle
                base_pattern = 0.6 + 0.2 * np.random.randn(n)
                seasonal = 1.0

            generation = base_pattern * seasonal * capacity * np.random.uniform(0.8, 1.2, n)
            generation = np.maximum(0, generation)

            # Forecast = actual perturbed by a multiplicative Gaussian error
            forecast_error = np.random.normal(0, forecast_error_sd, n)
            forecast = np.maximum(0, generation * (1 + forecast_error))

            capacity_factor = generation / capacity

            data = pd.DataFrame({
                'timestamp': timestamps,
                'source': source,
                'plant_id': plant_id,
                'generation_mw': generation,
                'forecast_mw': forecast,
                'actual_mw': generation,
                'capacity_factor': capacity_factor
            })
            df_list.append(data)

    return pd.concat(df_list, ignore_index=True)


def generate_conventional_data(config, timestamps):
    np.random.seed(config['generation']['seed'] + 2)

    num_plants = config['data_sources']['conventional_generation']['num_plants']
    fuel_types = config['data_sources']['conventional_generation']['fuel_types']
    params = config['conventional']

    df_list = []

    for i in range(num_plants):
        plant_id = f"CONV_{i+1:03d}"
        fuel_type = np.random.choice(fuel_types)

        fuel_params = params[fuel_type]
        capacity = np.random.uniform(*fuel_params['capacity_range'])
        marginal_cost = np.random.uniform(*fuel_params['marginal_cost_range'])
        heat_rate = np.random.uniform(6, 12) if fuel_type == 'gas' else np.random.uniform(8, 14)

        n = len(timestamps)
        hours = timestamps.hour + timestamps.minute / 60

        if fuel_type == 'nuclear':
            # Baseload: runs near 90% of capacity with little variation
            base_load = 0.9 * capacity
            generation = base_load + np.random.normal(0, 0.01 * capacity, n)
        elif fuel_type == 'gas':
            # Peaker: follows the daily demand curve
            peaking_pattern = 0.3 + 0.4 * np.sin(2 * np.pi * (hours - 12) / 24)
            generation = peaking_pattern * capacity + np.random.normal(0, 0.05 * capacity, n)
        else:
            baseload_pattern = 0.5 + 0.2 * np.sin(2 * np.pi * hours / 24)
            generation = baseload_pattern * capacity + np.random.normal(0, 0.03 * capacity, n)

        generation = np.clip(generation, 0, capacity)

        data = pd.DataFrame({
            'timestamp': timestamps,
            'plant_id': plant_id,
            'fuel_type': fuel_type,
            'generation_mw': generation,
            'marginal_cost': marginal_cost,
            'heat_rate': heat_rate
        })
        df_list.append(data)

    return pd.concat(df_list, ignore_index=True)


def generate_data_center_data(config, timestamps):
    np.random.seed(config['generation']['seed'] + 3)

    num_centers = config['data_sources']['data_centers']['num_centers']
    params = config['data_center']

    df_list = []
    locations = ['FR', 'BE', 'DE', 'NL', 'UK']

    for i in range(num_centers):
        data_center_id = f"DC_{i+1:03d}"
        location = locations[i % len(locations)]

        base_demand = np.random.uniform(*params['power_demand_range'])
        price_sensitivity = np.random.uniform(*params['price_sensitivity_range'])

        # The first data center is modeled as a bitcoin mining operation
        is_bitcoin = (i == 0)
        client_type = 'bitcoin' if is_bitcoin else 'enterprise'

        n = len(timestamps)
        hours = timestamps.hour + timestamps.minute / 60

        if is_bitcoin:
            # Mining load is flat on average but noisy
            base_profile = 0.7 + 0.3 * np.random.randn(n)
        else:
            base_profile = 0.6 + 0.2 * np.sin(2 * np.pi * (hours - 12) / 24)

        demand = base_demand * base_profile
        # Floor demand at 50% of base load so noise cannot drive it negative
        demand = np.maximum(base_demand * 0.5, demand)

        max_bid = base_demand * price_sensitivity * (0.8 + 0.4 * np.random.rand(n))

        data = pd.DataFrame({
            'timestamp': timestamps,
            'data_center_id': data_center_id,
            'location': location,
            'power_demand_mw': demand,
            'max_bid_price': max_bid,
            'client_type': client_type
        })
        df_list.append(data)

    return pd.concat(df_list, ignore_index=True)


def apply_noise_and_outliers(df, config):
    if not config['generation']['add_noise']:
        return df

    noise_level = config['generation']['noise_level']
    outlier_rate = config['generation']['outlier_rate']

    for col in df.select_dtypes(include=[np.number]).columns:
        if col == 'timestamp':
            continue

        # Multiplicative Gaussian noise on every numeric column
        noise = np.random.normal(0, noise_level, len(df))
        df[col] = df[col] * (1 + noise)

        # Scale a random subset of rows by 0.5x-2x to simulate outliers
        num_outliers = int(len(df) * outlier_rate)
        outlier_idx = np.random.choice(len(df), num_outliers, replace=False)
        df.loc[outlier_idx, col] = df.loc[outlier_idx, col] * np.random.uniform(0.5, 2.0, num_outliers)

    return df


def add_missing_values(df, config):
    if not config['generation']['include_missing_values']:
        return df

    missing_rate = config['generation']['missing_rate']

    for col in df.select_dtypes(include=[np.number]).columns:
        if col == 'timestamp':
            continue

        num_missing = int(len(df) * missing_rate)
        missing_idx = np.random.choice(len(df), num_missing, replace=False)
        df.loc[missing_idx, col] = np.nan

    return df


def save_metadata(datasets, output_dir):
    metadata = {
        'generated_at': datetime.utcnow().isoformat(),
        'datasets': {}
    }

    for name, df in datasets.items():
        metadata['datasets'][name] = {
            'rows': len(df),
            'num_columns': len(df.columns),
            'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024 / 1024,
            'dtypes': {col: str(dtype) for col, dtype in df.dtypes.items()},
            'columns': list(df.columns)
        }

    output_path = Path(output_dir) / 'metadata' / 'generation_metadata.json'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(metadata, f, indent=2, default=str)

    return metadata


def main():
    config = load_config()

    time_config = config['time_range']
    timestamps = generate_timestamps(
        time_config['start_date'],
        time_config['end_date'],
        time_config['granularity']
    )

    print(f"Generating synthetic data for {len(timestamps)} timestamps...")

    datasets = {}

    datasets['battery_capacity'] = generate_battery_data(config, timestamps)
    print(f"  - Battery capacity: {len(datasets['battery_capacity'])} rows")

    datasets['renewable_generation'] = generate_renewable_data(config, timestamps)
    print(f"  - Renewable generation: {len(datasets['renewable_generation'])} rows")

    datasets['conventional_generation'] = generate_conventional_data(config, timestamps)
    print(f"  - Conventional generation: {len(datasets['conventional_generation'])} rows")

    datasets['data_centers'] = generate_data_center_data(config, timestamps)
    print(f"  - Data centers: {len(datasets['data_centers'])} rows")

    for name, df in datasets.items():
        df = apply_noise_and_outliers(df, config)
        df = add_missing_values(df, config)
        datasets[name] = df

    output_base = Path(__file__).parent.parent / 'data'
    raw_dir = output_base / 'raw'
    raw_dir.mkdir(parents=True, exist_ok=True)

    for name, df in datasets.items():
        file_path = raw_dir / f'{name}_raw.parquet'
        df.to_parquet(file_path, compression='snappy')
        print(f"  Saved: {file_path}")

    save_metadata(datasets, output_base)

    print("\nMetadata saved to data/metadata/generation_metadata.json")
    print(f"Total datasets generated: {len(datasets)}")

    return datasets


if __name__ == '__main__':
    main()
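Each raw dataset lands in data/raw/<name>_raw.parquet (paths taken from the script above). A minimal sketch for sanity-checking one output, assuming the generator has been run from the repository root:

import pandas as pd

# Load one generated dataset and eyeball its shape, dtypes, and per-battery stats
df = pd.read_parquet("data/raw/battery_capacity_raw.parquet")
print(df.shape)
print(df.dtypes)
print(df.groupby("battery_id")["charge_level_mwh"].describe().head())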
scripts/02_fetch_historical.py (new file, 222 lines)
@@ -0,0 +1,222 @@
"""
Fetch (simulated) historical data for the energy trading strategy demo.
Handles: electricity prices, bitcoin mining data, load profiles.
"""

import yaml
import numpy as np
import pandas as pd
from pathlib import Path
import requests


def load_config():
    config_path = Path(__file__).parent.parent / "config" / "data_config.yaml"
    with open(config_path) as f:
        return yaml.safe_load(f)


def generate_timestamps(start_date, end_date, granularity):
    start = pd.to_datetime(start_date)
    end = pd.to_datetime(end_date)
    return pd.date_range(start=start, end=end, freq=granularity)


def fetch_electricity_prices(config, timestamps):
    np.random.seed(config['generation']['seed'] + 10)

    regions = config['regions']
    print(f"Fetching electricity prices for {len(regions)} regions...")

    # Per-region (base price EUR/MWh, intraday volatility); unknown regions
    # fall back to a high-price default.
    region_params = {
        'FR': (80, 30),
        'DE': (90, 40),
        'NL': (85, 35),
        'BE': (82, 32),
    }

    df_list = []

    for region in regions:
        n = len(timestamps)
        hours = timestamps.hour + timestamps.minute / 60

        base_price, volatility = region_params.get(region, (100, 50))

        day_ahead = base_price + volatility * np.sin(2 * np.pi * hours / 24) + np.random.normal(0, 10, n)
        real_time = day_ahead + np.random.normal(0, 20, n)

        # Inject occasional spikes (~2% of intervals) into the real-time price
        price_spikes = np.random.random(n) < 0.02
        real_time = np.array(real_time)  # ensure a mutable ndarray before indexing
        real_time[price_spikes] += np.random.uniform(100, 500, int(np.sum(price_spikes)))

        capacity_price = np.abs(np.random.normal(5, 2, n))
        regulation_price = np.abs(np.random.normal(3, 1, n))
        volume = np.random.uniform(1000, 5000, n)

        data = pd.DataFrame({
            'timestamp': timestamps,
            'region': region,
            'day_ahead_price': day_ahead,
            'real_time_price': real_time,
            'capacity_price': capacity_price,
            'regulation_price': regulation_price,
            'volume_mw': volume
        })
        df_list.append(data)

    return pd.concat(df_list, ignore_index=True)


def fetch_bitcoin_mining_data(config, timestamps):
    np.random.seed(config['generation']['seed'] + 11)

    print("Fetching bitcoin mining data from mempool.space (simulated)...")

    n = len(timestamps)

    # Ping the mempool.space API as a reachability check; the price series
    # itself is simulated around a fixed base either way.
    base_btc_price = 45000
    try:
        btc_api = "https://mempool.space/api/v1/fees/recommended"
        requests.get(btc_api, timeout=10)
    except requests.RequestException:
        pass

    btc_params = config['bitcoin']

    # Mild upward drift plus a normalized random walk for daily volatility
    btc_trend = np.linspace(0.95, 1.05, n)
    btc_daily_volatility = np.cumsum(np.random.normal(0, 0.01, n)) + 1
    btc_daily_volatility = btc_daily_volatility / btc_daily_volatility[0]

    btc_price = base_btc_price * btc_trend * btc_daily_volatility * (1 + 0.03 * np.random.randn(n))

    hashrate_base = np.random.uniform(*btc_params['hashrate_range'])
    hashrate = hashrate_base * (1 + 0.05 * np.sin(2 * np.pi * np.arange(n) / (n / 10))) * (1 + 0.02 * np.random.randn(n))

    electricity_efficiency = np.random.uniform(*btc_params['mining_efficiency_range'])

    # Rough conversions: USD->EUR at 0.92; the 0.0001 and 3.6 factors are
    # placeholder scalings in this synthetic model.
    btc_price_eur = btc_price * 0.92
    mining_profitability = (btc_price_eur * 0.0001 / 3.6) / (electricity_efficiency / 1000)

    # Implied break-even electricity cost (note: as written this reduces to a
    # constant, since mining_profitability cancels out)
    electricity_breakeven = (btc_price_eur * 0.0001 / 3.6) / (mining_profitability / 24 * electricity_efficiency / 1000) * 24

    data = pd.DataFrame({
        'timestamp': timestamps,
        'pool_id': 'POOL_001',
        'hashrate_ths': hashrate,
        'btc_price_usd': btc_price,
        'mining_profitability': mining_profitability,
        'electricity_cost': electricity_breakeven
    })

    return data


def fetch_load_profiles(config, timestamps):
    np.random.seed(config['generation']['seed'] + 12)

    regions = config['regions']
    print(f"Fetching load profiles for {len(regions)} regions...")

    # Per-region base load in MW; unknown regions fall back to 45 GW
    region_base_load = {
        'FR': 60000,
        'DE': 70000,
        'NL': 15000,
        'BE': 12000,
    }

    df_list = []

    for region in regions:
        n = len(timestamps)
        hours = timestamps.hour + timestamps.minute / 60
        day_of_year = timestamps.dayofyear

        base_load = region_base_load.get(region, 45000)

        # Evening peak centered on 18:00 plus a seasonal cycle
        daily_pattern = 0.7 + 0.3 * np.exp(-0.5 * ((hours - 18) / 4) ** 2)
        seasonal_pattern = 0.8 + 0.2 * np.sin(2 * np.pi * (day_of_year - 15) / 365)

        load = base_load * daily_pattern * seasonal_pattern * (1 + 0.05 * np.random.randn(n))
        forecast = load * (1 + np.random.normal(0, 0.03, n))

        # Weather drivers share the seasonal cycle so load correlates with them
        temp = 15 + 15 * np.sin(2 * np.pi * (day_of_year - 15) / 365) + np.random.normal(0, 3, n)
        humidity = 60 + 20 * np.sin(2 * np.pi * (day_of_year - 15) / 365) + np.random.normal(0, 10, n)

        data = pd.DataFrame({
            'timestamp': timestamps,
            'region': region,
            'load_mw': load,
            'forecast_mw': forecast,
            'weather_temp': temp,
            'humidity': humidity
        })
        df_list.append(data)

    return pd.concat(df_list, ignore_index=True)


def save_raw_data(datasets, output_dir):
    output_path = Path(output_dir) / 'raw'
    output_path.mkdir(parents=True, exist_ok=True)

    saved = {}
    for name, df in datasets.items():
        file_path = output_path / f'{name}_raw.parquet'
        df.to_parquet(file_path, compression='snappy')
        saved[name] = str(file_path)
        print(f"  Saved: {file_path}")

    return saved


def main():
    config = load_config()

    time_config = config['time_range']
    timestamps = generate_timestamps(
        time_config['start_date'],
        time_config['end_date'],
        time_config['granularity']
    )

    print(f"Fetching historical data for {len(timestamps)} timestamps...")

    datasets = {}

    datasets['electricity_prices'] = fetch_electricity_prices(config, timestamps)
    print(f"  - Electricity prices: {len(datasets['electricity_prices'])} rows")

    datasets['bitcoin_mining'] = fetch_bitcoin_mining_data(config, timestamps)
    print(f"  - Bitcoin mining: {len(datasets['bitcoin_mining'])} rows")

    datasets['load_profiles'] = fetch_load_profiles(config, timestamps)
    print(f"  - Load profiles: {len(datasets['load_profiles'])} rows")

    output_base = Path(__file__).parent.parent / 'data'
    save_raw_data(datasets, output_base)

    print(f"\nSaved {len(datasets)} historical datasets to data/raw/")

    return datasets


if __name__ == '__main__':
    main()
scripts/03_process_merge.py (new file, 172 lines)
@@ -0,0 +1,172 @@
"""
Process and merge all datasets, apply compression, and save to Parquet format.
"""

import yaml
import pandas as pd
from pathlib import Path
import json


def load_config():
    config_path = Path(__file__).parent.parent / "config" / "data_config.yaml"
    with open(config_path) as f:
        return yaml.safe_load(f)


def load_dataset(dataset_name, data_base):
    df_list = []

    raw_path = data_base / 'raw' / f'{dataset_name}_raw.parquet'
    if raw_path.exists():
        print(f"  Loading {dataset_name} from raw data...")
        df_list.append(pd.read_parquet(raw_path))

    if not df_list:
        print(f"  Total rows for {dataset_name}: 0")
        return None

    df = pd.concat(df_list, ignore_index=True)
    print(f"  Total rows for {dataset_name}: {len(df)}")
    return df


def downgrade_precision(df, config):
    precision = config['output'].get('precision', 'float32')

    # Downcast floats to the configured precision and 64-bit ints to 32-bit
    for col in df.select_dtypes(include=['float64']).columns:
        if col == 'timestamp':
            continue
        df[col] = df[col].astype(precision)

    for col in df.select_dtypes(include=['int64']).columns:
        if col == 'timestamp':
            continue
        df[col] = df[col].astype('int32')

    return df


def convert_categoricals(df):
    # Convert low-cardinality string columns to pandas categoricals
    for col in df.select_dtypes(include=['object']).columns:
        if col == 'timestamp':
            continue
        if df[col].nunique() < df.shape[0] * 0.5:
            df[col] = df[col].astype('category')

    return df


def optimize_memory(df, config):
    start_mem = df.memory_usage(deep=True).sum() / 1024 / 1024

    df = downgrade_precision(df, config)
    df = convert_categoricals(df)

    end_mem = df.memory_usage(deep=True).sum() / 1024 / 1024
    reduction = (1 - end_mem / start_mem) * 100
    print(f"  Memory: {start_mem:.2f}MB -> {end_mem:.2f}MB ({reduction:.1f}% reduction)")

    return df


def save_processed_dataset(df, dataset_name, output_dir, config):
    output_path = Path(output_dir) / f'{dataset_name}.parquet'

    compression = config['output'].get('compression', 'snappy')
    df.to_parquet(output_path, compression=compression, index=False)

    file_size_mb = output_path.stat().st_size / 1024 / 1024
    print(f"  Saved: {output_path} ({file_size_mb:.2f}MB)")

    return {
        'path': str(output_path),
        'size_mb': file_size_mb,
        'rows': len(df),
        'columns': len(df.columns)
    }


def validate_timestamps(df, dataset_name):
    if 'timestamp' not in df.columns:
        print(f"  Warning: {dataset_name} has no timestamp column")
        return False

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    duplicates = df['timestamp'].duplicated().sum()

    if duplicates > 0:
        print(f"  Warning: {dataset_name} has {duplicates} duplicate timestamps")

    return True


def generate_final_metadata(processed_info, output_dir):
    metadata = {
        'processed_at': pd.Timestamp.utcnow().isoformat(),
        'total_datasets': len(processed_info),
        'total_size_mb': sum(info['size_mb'] for info in processed_info.values()),
        'datasets': processed_info
    }

    output_path = Path(output_dir) / 'metadata' / 'final_metadata.json'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(metadata, f, indent=2, default=str)

    return metadata


def main():
    config = load_config()

    data_base = Path(__file__).parent.parent / 'data'
    processed_dir = data_base / 'processed'
    processed_dir.mkdir(parents=True, exist_ok=True)

    print("Processing and merging datasets...")

    datasets = [
        'electricity_prices',
        'battery_capacity',
        'renewable_generation',
        'conventional_generation',
        'load_profiles',
        'data_centers',
        'bitcoin_mining'
    ]

    processed_info = {}

    for dataset_name in datasets:
        print(f"\nProcessing {dataset_name}...")

        df = load_dataset(dataset_name, data_base)

        if df is None:
            print(f"  Warning: {dataset_name} has no data, skipping")
            continue

        validate_timestamps(df, dataset_name)

        print("  Optimizing memory...")
        df = optimize_memory(df, config)

        info = save_processed_dataset(df, dataset_name, processed_dir, config)
        processed_info[dataset_name] = info

    print(f"\n{'='*60}")
    print("Processing complete!")
    print(f"{'='*60}")

    metadata = generate_final_metadata(processed_info, data_base)

    print(f"\nTotal datasets processed: {len(processed_info)}")
    print(f"Total size: {metadata['total_size_mb']:.2f}MB")
    print(f"Target size: {config['output']['target_size_mb']}MB")

    if metadata['total_size_mb'] > config['output']['target_size_mb']:
        print(f"Warning: Total size exceeds target by {metadata['total_size_mb'] - config['output']['target_size_mb']:.2f}MB")
    else:
        print("✓ Total size within target")

    print(f"\nProcessed data saved to: {processed_dir}")
    print(f"Metadata saved to: {data_base / 'metadata' / 'final_metadata.json'}")

    return processed_info


if __name__ == '__main__':
    main()
scripts/04_validate.py (new file, 272 lines)
@@ -0,0 +1,272 @@
"""
Validate processed datasets for quality, missing values, and data consistency.
"""

import yaml
import pandas as pd
from pathlib import Path
import json
from datetime import datetime


def load_config():
    config_path = Path(__file__).parent.parent / "config" / "data_config.yaml"
    with open(config_path) as f:
        return yaml.safe_load(f)


def load_schema():
    schema_path = Path(__file__).parent.parent / "config" / "schema.yaml"
    with open(schema_path) as f:
        return yaml.safe_load(f)
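load_schema reads config/schema.yaml, which is not shown on this page. The checks below consume two top-level maps: schemas (expected columns and dtypes) and validation_rules (per-column min/max bounds). A hypothetical fragment matching that structure, with illustrative entries only:

# Hypothetical fragment of config/schema.yaml; structure inferred from the
# validators below, entries and bounds are illustrative.
schemas:
  electricity_prices:
    columns:
      - {name: timestamp, type: "datetime64[ns]"}
      - {name: region, type: category}
      - {name: day_ahead_price, type: float32}

validation_rules:
  electricity_prices:
    - {column: volume_mw, min: 0}
    - {column: day_ahead_price, min: -500, max: 3000}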
def load_processed_dataset(dataset_name, data_dir):
    file_path = Path(data_dir) / 'processed' / f'{dataset_name}.parquet'
    if file_path.exists():
        return pd.read_parquet(file_path)
    return None


def check_missing_values(df, dataset_name):
    missing_info = {}

    for col in df.columns:
        missing_count = df[col].isna().sum()
        missing_pct = (missing_count / len(df)) * 100

        if missing_count > 0:
            missing_info[col] = {
                'count': int(missing_count),
                'percentage': round(missing_pct, 2)
            }

    return missing_info


def check_data_ranges(df, dataset_name, schema):
    validation_results = []

    if dataset_name not in schema['validation_rules']:
        return validation_results

    for rule in schema['validation_rules'][dataset_name]:
        column = rule['column']
        if column not in df.columns:
            continue

        col_data = df[column].dropna()

        if 'min' in rule:
            violations = (col_data < rule['min']).sum()
            if violations > 0:
                validation_results.append({
                    'column': column,
                    'rule': f'min >= {rule["min"]}',
                    'violations': int(violations),
                    'severity': 'error'
                })

        if 'max' in rule:
            violations = (col_data > rule['max']).sum()
            if violations > 0:
                validation_results.append({
                    'column': column,
                    'rule': f'max <= {rule["max"]}',
                    'violations': int(violations),
                    'severity': 'error'
                })

    return validation_results


def check_duplicated_rows(df, dataset_name):
    return int(df.duplicated().sum())


def check_timestamp_continuity(df, dataset_name, expected_freq='1min'):
    if 'timestamp' not in df.columns:
        return {'status': 'skipped', 'reason': 'no timestamp column'}

    df_sorted = df.sort_values('timestamp')
    time_diffs = df_sorted['timestamp'].diff().dropna()

    # Flag any gap larger than 1.5x the expected sampling interval
    expected_diff = pd.Timedelta(expected_freq)
    missing_gaps = time_diffs[time_diffs > expected_diff * 1.5]

    return {
        'status': 'checked',
        'expected_frequency': expected_freq,
        'gaps_detected': len(missing_gaps),
        'total_rows': len(df)
    }


def check_data_types(df, dataset_name, schema):
    type_issues = []

    expected_schema = schema['schemas'].get(dataset_name, {})
    expected_columns = {col['name']: col['type'] for col in expected_schema.get('columns', [])}

    for col, expected_type in expected_columns.items():
        if col not in df.columns:
            type_issues.append({
                'column': col,
                'issue': 'missing',
                'expected': expected_type
            })
        elif expected_type == 'datetime64[ns]':
            if not pd.api.types.is_datetime64_any_dtype(df[col]):
                type_issues.append({
                    'column': col,
                    'issue': 'wrong_type',
                    'expected': 'datetime',
                    'actual': str(df[col].dtype)
                })
        elif expected_type == 'category':
            if not isinstance(df[col].dtype, pd.CategoricalDtype):
                type_issues.append({
                    'column': col,
                    'issue': 'wrong_type',
                    'expected': 'category',
                    'actual': str(df[col].dtype)
                })
        elif expected_type == 'float32':
            if df[col].dtype not in ['float32', 'float64']:
                type_issues.append({
                    'column': col,
                    'issue': 'wrong_type',
                    'expected': 'float32',
                    'actual': str(df[col].dtype)
                })

    return type_issues


def validate_dataset(df, dataset_name, schema):
    results = {
        'dataset': dataset_name,
        'rows': len(df),
        'columns': len(df.columns),
        'memory_mb': round(df.memory_usage(deep=True).sum() / 1024 / 1024, 2),
        'missing_values': check_missing_values(df, dataset_name),
        'duplicated_rows': check_duplicated_rows(df, dataset_name),
        'timestamp_continuity': check_timestamp_continuity(df, dataset_name),
        'data_ranges': check_data_ranges(df, dataset_name, schema),
        'data_types': check_data_types(df, dataset_name, schema)
    }

    # Status: pass (no errors), warning (<10 errors), fail (>=10 errors)
    error_count = (
        sum(1 for v in results['data_ranges'] if v.get('severity') == 'error') +
        len(results['data_types'])
    )
    results['status'] = 'pass' if error_count == 0 else 'warning' if error_count < 10 else 'fail'

    return results


def generate_validation_report(all_results, output_dir):
    total_errors = sum(1 for r in all_results if r['status'] == 'fail')
    total_warnings = sum(1 for r in all_results if r['status'] == 'warning')
    total_pass = sum(1 for r in all_results if r['status'] == 'pass')

    # Missing datasets carry no size/row info, so default to 0
    total_size_mb = sum(r.get('memory_mb', 0) for r in all_results)
    total_rows = sum(r.get('rows', 0) for r in all_results)

    report = {
        'generated_at': datetime.utcnow().isoformat(),
        'summary': {
            'total_datasets': len(all_results),
            'passed': total_pass,
            'warnings': total_warnings,
            'failed': total_errors,
            'total_size_mb': round(total_size_mb, 2),
            'total_rows': total_rows
        },
        'datasets': all_results
    }

    output_path = Path(output_dir) / 'metadata' / 'validation_report.json'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        json.dump(report, f, indent=2, default=str)

    return report


def print_summary(report):
    print(f"\n{'='*60}")
    print("VALIDATION SUMMARY")
    print(f"{'='*60}")
    print(f"Datasets processed: {report['summary']['total_datasets']}")
    print(f"  ✓ Passed: {report['summary']['passed']}")
    print(f"  ⚠ Warnings: {report['summary']['warnings']}")
    print(f"  ✗ Failed: {report['summary']['failed']}")
    print(f"\nTotal size: {report['summary']['total_size_mb']:.2f}MB")
    print(f"Total rows: {report['summary']['total_rows']:,}")

    print(f"\n{'='*60}")
    print("PER-DATASET DETAILS")
    print(f"{'='*60}")

    for result in report['datasets']:
        # Entries for missing datasets have no metrics to report
        if result['status'] == 'error':
            print(f"\n✗ {result['dataset']}: {result['error']}")
            continue

        status_icon = '✓' if result['status'] == 'pass' else '⚠' if result['status'] == 'warning' else '✗'
        print(f"\n{status_icon} {result['dataset']}")
        print(f"  Rows: {result['rows']:,} | Columns: {result['columns']} | Size: {result['memory_mb']:.2f}MB")

        if result['missing_values']:
            print(f"  Missing values: {len(result['missing_values'])} columns")

        if result['data_ranges']:
            print(f"  Range violations: {len(result['data_ranges'])}")

        if result['data_types']:
            print(f"  Type issues: {len(result['data_types'])}")

        if result['timestamp_continuity']['status'] == 'checked':
            if result['timestamp_continuity']['gaps_detected'] > 0:
                print(f"  Time gaps: {result['timestamp_continuity']['gaps_detected']}")


def main():
    config = load_config()
    schema = load_schema()

    data_dir = Path(__file__).parent.parent / 'data'

    datasets = [
        'electricity_prices',
        'battery_capacity',
        'renewable_generation',
        'conventional_generation',
        'load_profiles',
        'data_centers',
        'bitcoin_mining'
    ]

    print("Validating processed datasets...\n")

    all_results = []

    for dataset_name in datasets:
        print(f"Validating {dataset_name}...")

        df = load_processed_dataset(dataset_name, data_dir)

        if df is None:
            print("  ✗ Dataset not found, skipping")
            all_results.append({
                'dataset': dataset_name,
                'status': 'error',
                'error': 'Dataset file not found'
            })
            continue

        result = validate_dataset(df, dataset_name, schema)
        all_results.append(result)

        status_icon = '✓' if result['status'] == 'pass' else '⚠' if result['status'] == 'warning' else '✗'
        print(f"  {status_icon} {result['rows']:,} rows, {result['columns']} cols, {result['memory_mb']:.2f}MB")

    report = generate_validation_report(all_results, data_dir)
    print_summary(report)

    print(f"\n{'='*60}")
    print(f"Validation report saved to: {data_dir / 'metadata' / 'validation_report.json'}")

    return report


if __name__ == '__main__':
    main()
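The four scripts are numbered to run in order (generate synthetic data, fetch historical data, process/merge, validate). A minimal runner sketch, assuming it is launched from the repository root with the dependencies installed:

import subprocess
import sys

# Run the pipeline stages in their numbered order; stop on the first failure.
for script in (
    "scripts/01_generate_synthetic.py",
    "scripts/02_fetch_historical.py",
    "scripts/03_process_merge.py",
    "scripts/04_validate.py",
):
    print(f"=== {script} ===")
    subprocess.run([sys.executable, script], check=True)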