# zpark-ems/core/scripts/backfill_data.py

"""回填历史模拟能耗数据 - 过去30天逐小时数据，含碳排放记录

Uses the shared weather_model for physics-based solar, temperature, and load
generation.  Deterministic seed (42) ensures reproducible output across runs.
"""
import asyncio
import math
import os
import random
import sys
from datetime import datetime, timedelta, timezone

# Put <this directory>/../backend at the front of the import path; the
# "app.*" imports further down are expected to resolve from there.
_BACKEND_DIR = os.path.join(os.path.dirname(__file__), "..", "backend")
sys.path.insert(0, _BACKEND_DIR)

# Connection string: the DATABASE_URL environment variable wins, otherwise the
# local default below is used.
# NOTE(review): the fallback embeds a username and password in source.
_DEFAULT_DATABASE_URL = "postgresql+asyncpg://tianpu:tianpu2026@localhost:5432/tianpu_ems"
DATABASE_URL = os.getenv("DATABASE_URL", _DEFAULT_DATABASE_URL)

from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncSession
from sqlalchemy import text, select

from app.services.weather_model import (
    set_seed, reset_cloud_model,
    pv_power, pv_electrical_at, get_pv_orientation,
    heat_pump_data, building_load, indoor_sensor,
    heat_meter_data, outdoor_temperature, outdoor_humidity,
    get_hvac_mode,
)
from app.models.device import Device

# ---------------------------------------------------------------------------
# Device definitions
#
# The *_CODES lists name the devices this script generates data for.  The
# *_IDS lists and HEAT_METER_ID start out empty and are filled in from the
# database at runtime by _load_device_ids().
# ---------------------------------------------------------------------------
PV_CODES = [
    "INV-01",
    "INV-02",
    "INV-03",
]
HP_CODES = [
    "HP-01",
    "HP-02",
    "HP-03",
    "HP-04",
]
METER_CODES = [
    "METER-GRID",
    "METER-PV",
    "METER-HP",
    "METER-PUMP",
]
SENSOR_CODES = [
    "TH-01",
    "TH-02",
    "TH-03",
    "TH-04",
    "TH-05",
]

# Runtime placeholders (one independent list per device family).
PV_IDS = []
HP_IDS = []
METER_IDS = []
SENSOR_IDS = []
HEAT_METER_ID = None


async def _load_device_ids(session: AsyncSession):
    """Resolve the configured device codes to database IDs.

    Runs a single query for the (id, code) pair of every ``Device`` row and
    fills the module-level ``PV_IDS``, ``HP_IDS``, ``METER_IDS``,
    ``SENSOR_IDS`` lists and ``HEAT_METER_ID`` (``None`` when code ``HM-01``
    is absent).

    Codes that are not present in the database are skipped with a warning.
    The matching ``*_CODES`` list is pruned in place at the same time: the
    generation loop pairs ``*_IDS[i]`` with ``*_CODES[i]`` by position, so
    dropping only the ID would shift every later ID onto the wrong code.

    Args:
        session: Open async session used to query the ``Device`` table.
    """
    global PV_IDS, HP_IDS, METER_IDS, HEAT_METER_ID, SENSOR_IDS
    result = await session.execute(select(Device.id, Device.code).order_by(Device.id))
    code_to_id = {code: device_id for device_id, code in result.all()}

    def _resolve(codes, label):
        """Return IDs for *codes*, removing unknown codes from the list in place."""
        missing = [c for c in codes if c not in code_to_id]
        if missing:
            print(f"  Warning: {label} device codes not found in DB, skipped: {missing}")
            # Slice assignment keeps the module-level list object (and thus
            # every reference to it) while restoring index alignment.
            codes[:] = [c for c in codes if c in code_to_id]
        return [code_to_id[c] for c in codes]

    PV_IDS = _resolve(PV_CODES, "PV")
    HP_IDS = _resolve(HP_CODES, "HP")
    METER_IDS = _resolve(METER_CODES, "meter")
    HEAT_METER_ID = code_to_id.get("HM-01")
    SENSOR_IDS = _resolve(SENSOR_CODES, "sensor")
    print(f"  Loaded device IDs: PV={PV_IDS}, HP={HP_IDS}, Meters={METER_IDS}, HeatMeter={HEAT_METER_ID}, Sensors={SENSOR_IDS}")

# Grid emission factor in kgCO2/kWh (North China grid).
EMISSION_FACTOR = 0.8843

# Backfill window: DAYS whole days sampled once per hour.
HOURS_PER_DAY = 24
DAYS = 30
TOTAL_HOURS = HOURS_PER_DAY * DAYS


# ---------------------------------------------------------------------------
# Main backfill
# ---------------------------------------------------------------------------

async def backfill():
    """Generate and insert the simulated history for the last ``DAYS`` days.

    Steps:
      1. Seed the weather model and resolve device IDs from the database.
      2. Simulate ``TOTAL_HOURS`` hourly samples ending at the current UTC hour.
      3. Derive per-device daily summaries and daily carbon emission rows.
      4. Insert everything into ``energy_data``, ``energy_daily_summary`` and
         ``carbon_emissions`` in batches, committing once per table.

    The engine is disposed even when a step raises, so a failed run does not
    leave pooled connections open.
    """
    # Set deterministic seed for reproducibility
    set_seed(42)

    engine = create_async_engine(DATABASE_URL, echo=False, pool_size=5)
    try:
        session_factory = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

        # Load actual device IDs from DB
        async with session_factory() as session:
            await _load_device_ids(session)

        # The window ends at the start of the current UTC hour.
        now = datetime.now(timezone.utc).replace(minute=0, second=0, microsecond=0)
        start = now - timedelta(days=DAYS)

        print(f"Backfill range: {start.isoformat()} -> {now.isoformat()}")
        print(f"Total hours: {TOTAL_HOURS}")

        print("Generating hourly energy_data rows (realistic models) ...")
        energy_rows, daily_buckets = _generate_energy_rows(start)
        print(f"  Generated {len(energy_rows)} energy_data rows")

        print("Computing daily summaries ...")
        summary_rows = _build_summary_rows(daily_buckets)
        print(f"  Generated {len(summary_rows)} daily summary rows")

        print("Computing daily carbon emissions ...")
        carbon_rows = _build_carbon_rows(daily_buckets)
        print(f"  Generated {len(carbon_rows)} carbon emission rows")

        # ---- Bulk insert ----
        BATCH = 2000

        async with session_factory() as session:
            print("Inserting energy_data ...")
            insert_energy = text("""
            INSERT INTO energy_data (device_id, timestamp, data_type, value, unit, quality)
            VALUES (:device_id, :timestamp, :data_type, :value, :unit, :quality)
        """)
            await _insert_batches(session, insert_energy, energy_rows, BATCH,
                                  progress_label="energy_data")
            await session.commit()
            print("  energy_data done.")

            print("Inserting energy_daily_summary ...")
            insert_summary = text("""
            INSERT INTO energy_daily_summary
                (device_id, date, energy_type, total_consumption, total_generation,
                 peak_power, min_power, avg_power, operating_hours, avg_cop, cost, carbon_emission)
            VALUES
                (:device_id, :date, :energy_type, :total_consumption, :total_generation,
                 :peak_power, :min_power, :avg_power, :operating_hours, :avg_cop, :cost, :carbon_emission)
        """)
            await _insert_batches(session, insert_summary, summary_rows, BATCH)
            await session.commit()
            print(f"  daily_summary done. ({len(summary_rows)} rows)")

            print("Inserting carbon_emissions ...")
            insert_carbon = text("""
            INSERT INTO carbon_emissions
                (date, scope, category, emission, reduction,
                 energy_consumption, energy_unit, note)
            VALUES
                (:date, :scope, :category, :emission, :reduction,
                 :energy_consumption, :energy_unit, :note)
        """)
            await _insert_batches(session, insert_carbon, carbon_rows, BATCH)
            await session.commit()
            print(f"  carbon_emissions done. ({len(carbon_rows)} rows)")
    finally:
        # Always release pooled connections, including on failure.
        await engine.dispose()

    print("=" * 60)
    print("Backfill complete!")
    print("=" * 60)


def _energy_row(device_id, ts, data_type, value, unit):
    """Return one ``energy_data`` insert-parameter dict; ``quality`` is always 0."""
    return {
        "device_id": device_id, "timestamp": ts,
        "data_type": data_type, "value": value, "unit": unit, "quality": 0,
    }


def _generate_energy_rows(start):
    """Simulate ``TOTAL_HOURS`` hourly samples for every loaded device.

    Devices are processed in a fixed order each hour (PV, heat pumps, meters,
    heat meter, sensors) so the order of calls into the weather model is the
    same on every run.

    Args:
        start: Timestamp of the first simulated hour.

    Returns:
        ``(energy_rows, daily_buckets)``.  ``energy_rows`` is the flat list of
        ``energy_data`` parameter dicts.  ``daily_buckets`` maps
        ``device_id -> date string -> {"values": [...], "cops": [...]}`` and
        holds the hourly power readings (plus COPs for heat pumps) of the PV,
        heat-pump and meter devices, keyed by the UTC+8 calendar date.
    """
    energy_rows = []
    daily_buckets: dict[int, dict[str, dict]] = {
        did: {} for did in PV_IDS + HP_IDS + METER_IDS
    }

    def _bucket(did, date_str):
        """Return (creating if needed) the accumulator for one device/day."""
        return daily_buckets[did].setdefault(date_str, {"values": [], "cops": []})

    if HEAT_METER_ID is None:
        # Without an ID the heat meter rows would carry device_id=None.
        print("  Warning: heat meter HM-01 not found in DB; skipping heat meter rows")

    for h_offset in range(TOTAL_HOURS):
        ts = start + timedelta(hours=h_offset)
        # Daily buckets are keyed by the UTC+8 calendar date of the sample.
        date_str = (ts + timedelta(hours=8)).strftime("%Y-%m-%d")

        # Every 24 simulated hours (counted from `start`): reset the cloud
        # model and re-seed, giving day-to-day variation that is still
        # reproducible.
        if h_offset % HOURS_PER_DAY == 0:
            reset_cloud_model()
            set_seed(42 + h_offset // HOURS_PER_DAY)

        # --- PV inverters ---
        for did, code in zip(PV_IDS, PV_CODES):
            orientation = get_pv_orientation(code)
            val = round(
                pv_power(ts, rated_power=110.0, orientation=orientation, device_code=code),
                2,
            )
            # Electrical details are derived from the rounded power value.
            elec = pv_electrical_at(val, ts, rated_power=110.0)

            energy_rows.append(_energy_row(did, ts, "power", val, "kW"))
            energy_rows.append(_energy_row(did, ts, "dc_voltage", elec["dc_voltage"], "V"))
            energy_rows.append(_energy_row(did, ts, "ac_voltage", elec["ac_voltage"], "V"))
            energy_rows.append(_energy_row(did, ts, "temperature", elec["temperature"], "℃"))

            _bucket(did, date_str)["values"].append(val)

        # --- Heat pumps ---
        hp_total_power = 0.0
        hp_cop_sum = 0.0
        hp_count = 0
        for did, code in zip(HP_IDS, HP_CODES):
            data = heat_pump_data(ts, rated_power=35.0, device_code=code)
            val = data["power"]
            cop = data["cop"]

            hp_total_power += val
            # Only units reporting a positive COP count towards the average.
            if cop > 0:
                hp_cop_sum += cop
                hp_count += 1

            energy_rows.append(_energy_row(did, ts, "power", val, "kW"))
            energy_rows.append(_energy_row(did, ts, "cop", cop, ""))
            energy_rows.append(_energy_row(did, ts, "inlet_temp", data["inlet_temp"], "℃"))
            energy_rows.append(_energy_row(did, ts, "outlet_temp", data["outlet_temp"], "℃"))
            energy_rows.append(_energy_row(did, ts, "flow_rate", data["flow_rate"], "m³/h"))
            energy_rows.append(_energy_row(did, ts, "outdoor_temp", data["outdoor_temp"], "℃"))

            bucket = _bucket(did, date_str)
            bucket["values"].append(val)
            bucket["cops"].append(cop)

        # --- Meters ---
        for did, code in zip(METER_IDS, METER_CODES):
            data = building_load(ts, base_power=50.0, meter_code=code)
            val = data["power"]

            energy_rows.append(_energy_row(did, ts, "power", val, "kW"))
            energy_rows.append(_energy_row(did, ts, "voltage", data["voltage"], "V"))
            energy_rows.append(_energy_row(did, ts, "current", data["current"], "A"))
            energy_rows.append(_energy_row(did, ts, "power_factor", data["power_factor"], ""))

            _bucket(did, date_str)["values"].append(val)

        # --- Heat meter (correlated with heat pump totals) ---
        if HEAT_METER_ID is not None:
            # Falls back to a COP of 3.0 when no heat pump reported a positive COP.
            avg_cop = hp_cop_sum / hp_count if hp_count > 0 else 3.0
            hm_data = heat_meter_data(ts, hp_power=hp_total_power, hp_cop=avg_cop)
            energy_rows.append(_energy_row(HEAT_METER_ID, ts, "heat_power", hm_data["heat_power"], "kW"))
            energy_rows.append(_energy_row(HEAT_METER_ID, ts, "flow_rate", hm_data["flow_rate"], "m³/h"))
            energy_rows.append(_energy_row(HEAT_METER_ID, ts, "supply_temp", hm_data["supply_temp"], "℃"))
            energy_rows.append(_energy_row(HEAT_METER_ID, ts, "return_temp", hm_data["return_temp"], "℃"))

        # --- Temperature/humidity sensors ---
        for sid, code in zip(SENSOR_IDS, SENSOR_CODES):
            # TH-05 is the only sensor simulated as outdoor.
            data = indoor_sensor(ts, is_outdoor=(code == "TH-05"), device_code=code)
            energy_rows.append(_energy_row(sid, ts, "temperature", data["temperature"], "℃"))
            energy_rows.append(_energy_row(sid, ts, "humidity", data["humidity"], "%"))

    return energy_rows, daily_buckets


def _build_summary_rows(daily_buckets):
    """Turn the per-device daily buckets into ``energy_daily_summary`` rows.

    The hourly power values of a day are summed directly as the day's energy
    (one sample per hour).  PV devices report the total as generation, all
    other devices as consumption.
    """
    pv_ids = set(PV_IDS)
    summary_rows = []
    for did, dates in daily_buckets.items():
        is_pv = did in pv_ids
        for date_str, bucket in dates.items():
            values = bucket["values"]
            cops = bucket["cops"]
            total = round(sum(values), 2)
            peak = round(max(values), 2) if values else 0
            min_p = round(min(values), 2) if values else 0
            avg_p = round(sum(values) / len(values), 2) if values else 0
            # An hour counts as operating when power exceeds 0.5 kW.
            op_hours = sum(1 for v in values if v > 0.5)
            # 0.85 is the flat per-kWh cost factor used by this script.
            cost = round(total * 0.85, 2)
            carbon = round(total * EMISSION_FACTOR, 2)
            avg_cop = round(sum(cops) / len(cops), 2) if cops else None

            summary_rows.append({
                "device_id": did,
                # NOTE(review): the bucket key is a UTC+8 calendar date but is
                # stamped here as UTC midnight; confirm this is intended.
                "date": datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc),
                "energy_type": "electricity",
                "total_consumption": 0.0 if is_pv else total,
                "total_generation": total if is_pv else 0.0,
                "peak_power": peak,
                "min_power": min_p,
                "avg_power": avg_p,
                "operating_hours": float(op_hours),
                "avg_cop": avg_cop,
                "cost": cost,
                "carbon_emission": carbon,
            })
    return summary_rows


def _build_carbon_rows(daily_buckets):
    """Aggregate the daily buckets into ``carbon_emissions`` rows.

    For each date this emits one electricity emission row, plus a PV
    reduction row and a heat-pump saving row when the respective energy
    is positive.
    """
    pv_ids = set(PV_IDS)
    hp_ids = set(HP_IDS)
    daily_consumption: dict[str, float] = {}
    daily_pv_gen: dict[str, float] = {}
    daily_hp_consumption: dict[str, float] = {}

    # NOTE(review): consumption adds up every heat pump AND every meter
    # (including METER-GRID / METER-HP); confirm this is not double counting.
    for did, dates in daily_buckets.items():
        for date_str, bucket in dates.items():
            total = sum(bucket["values"])
            if did in pv_ids:
                daily_pv_gen[date_str] = daily_pv_gen.get(date_str, 0) + total
                continue
            if did in hp_ids:
                daily_hp_consumption[date_str] = daily_hp_consumption.get(date_str, 0) + total
            daily_consumption[date_str] = daily_consumption.get(date_str, 0) + total

    carbon_rows = []
    for date_str in sorted(set(daily_consumption) | set(daily_pv_gen)):
        dt = datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)

        # Grid electricity emission (Scope 2)
        grid_kwh = daily_consumption.get(date_str, 0)
        carbon_rows.append({
            "date": dt, "scope": 2, "category": "electricity",
            "emission": round(grid_kwh * EMISSION_FACTOR, 2),
            "reduction": 0.0,
            "energy_consumption": round(grid_kwh, 2),
            "energy_unit": "kWh",
            "note": "园区用电碳排放",
        })

        # PV generation reduction (Scope 2 avoided)
        pv_kwh = daily_pv_gen.get(date_str, 0)
        if pv_kwh > 0:
            carbon_rows.append({
                "date": dt, "scope": 2, "category": "pv_generation",
                "emission": 0.0,
                "reduction": round(pv_kwh * EMISSION_FACTOR, 2),
                "energy_consumption": round(pv_kwh, 2),
                "energy_unit": "kWh",
                "note": "光伏发电碳减排",
            })

        # Heat pump saving (COP-based reduction vs electric heating)
        hp_kwh = daily_hp_consumption.get(date_str, 0)
        if hp_kwh > 0:
            # Fixed COP of 3.2 is assumed for the day; the simulated hourly
            # COPs are not used here.
            avg_cop_day = 3.2
            heat_delivered = hp_kwh * avg_cop_day
            # Electric heating is modelled with COP=1, so it would need as
            # much electricity as the heat delivered.
            saved_kwh = heat_delivered - hp_kwh
            carbon_rows.append({
                "date": dt, "scope": 2, "category": "heat_pump_saving",
                "emission": 0.0,
                "reduction": round(saved_kwh * EMISSION_FACTOR, 2),
                "energy_consumption": round(saved_kwh, 2),
                "energy_unit": "kWh",
                "note": "热泵节能碳减排(相比电加热)",
            })
    return carbon_rows


async def _insert_batches(session, statement, rows, batch_size, progress_label=None):
    """Execute *statement* once per slice of at most *batch_size* rows.

    Does not commit.  When *progress_label* is given, a progress line is
    printed roughly every 10000 rows.
    """
    total = len(rows)
    for offset in range(0, total, batch_size):
        await session.execute(statement, rows[offset : offset + batch_size])
        done = min(offset + batch_size, total)
        if progress_label and done % 10000 < batch_size:
            print(f"  {progress_label}: {done}/{total}")


# Script entry point: when executed directly, run the async backfill to
# completion via asyncio.run().
if __name__ == "__main__":
    asyncio.run(backfill())