New plugin architecture for customer-specific business logic:
- hooks/base.py: CustomerHooks base class with 12 hook points
(on_alarm_created, on_alarm_resolved, on_energy_data_received,
on_device_status_changed, on_quota_exceeded, on_work_order_created,
on_work_order_completed, on_inspection_completed, on_report_generated,
calculate_custom_kpis, on_charging_order_created, on_charging_order_completed)
- hooks/loader.py: Dynamic loader that imports from customers/{CUSTOMER}/hooks/
- alarm_checker.py: calls on_alarm_created and on_alarm_resolved hooks
- quota_checker.py: calls on_quota_exceeded hook
Customers override hooks by creating customers/{name}/hooks/__init__.py
without modifying core code. Scales to 10-20+ customers.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
273 lines
10 KiB
Python
273 lines
10 KiB
Python
"""告警检测服务 - 根据告警规则检查最新数据,生成/自动恢复告警事件"""
|
|
import asyncio
import html
import logging
from datetime import datetime, timezone, timedelta
from pathlib import Path

from sqlalchemy import select, and_
from sqlalchemy.ext.asyncio import AsyncSession

from app.models.alarm import AlarmRule, AlarmEvent
from app.models.energy import EnergyData
from app.models.device import Device
from app.hooks import get_hooks
|
|
|
|
logger = logging.getLogger("alarm_checker")

# HTML template for alarm notification emails; it lives in the "templates"
# directory two levels above this file.
_ALARM_TEMPLATE_PATH = (
    Path(__file__).resolve().parent.parent.joinpath("templates", "alarm_email.html")
)

# Per-severity presentation settings for the alarm email: the label shown to
# the recipient plus the badge, background and text colours.
_SEVERITY_CONFIG = {
    "critical": {
        "label": "紧急告警",
        "badge_color": "#d32f2f",
        "bg_color": "#ffebee",
        "text_color": "#c62828",
    },
    "major": {
        "label": "重要告警",
        "badge_color": "#e65100",
        "bg_color": "#fff3e0",
        "text_color": "#e65100",
    },
    "warning": {
        "label": "一般告警",
        "badge_color": "#f9a825",
        "bg_color": "#fffde7",
        "text_color": "#f57f17",
    },
}
|
|
|
|
|
|
# Strong references to in-flight email tasks. The event loop keeps only weak
# references to tasks, so a fire-and-forget task that nothing else references
# may be garbage-collected before it finishes.
_EMAIL_TASKS: set = set()


async def _send_alarm_email(
    rule: AlarmRule, event: AlarmEvent, device_id: int, session: AsyncSession
):
    """Send an alarm notification email if the rule is configured for it.

    Returns without sending when "email" is not among the rule's notify
    channels, when no recipient can be extracted from ``rule.notify_targets``,
    or when the HTML template file is missing. Otherwise the template is
    rendered and the send is scheduled as a background task, so this
    coroutine does not wait for delivery.

    Args:
        rule: Alarm rule that fired; supplies channels, targets, severity,
            condition and thresholds.
        event: Alarm event being reported; supplies title, value,
            description and trigger time.
        device_id: Id of the device the event refers to.
        session: Session used to look up the device's name and code.
    """
    from app.services.email_service import send_email
    from app.core.config import get_settings

    if "email" not in (rule.notify_channels or []):
        return

    # notify_targets is either a dict carrying an "emails" entry, or a plain
    # list whose address-like string entries are used directly.
    targets = rule.notify_targets or {}
    if isinstance(targets, dict):
        emails = targets.get("emails", [])
    elif isinstance(targets, list):
        emails = [t for t in targets if isinstance(t, str) and "@" in t]
    else:
        emails = []

    if not emails:
        logger.debug(f"No email recipients for alarm rule '{rule.name}', skipping.")
        return

    # Fetch device info; fall back to placeholders when the device is gone.
    dev_result = await session.execute(select(Device).where(Device.id == device_id))
    device = dev_result.scalar_one_or_none()
    device_name = device.name if device else f"设备#{device_id}"
    device_code = device.code if device else "N/A"

    settings = get_settings()
    # Unknown severities are rendered with the "warning" look.
    severity_cfg = _SEVERITY_CONFIG.get(rule.severity, _SEVERITY_CONFIG["warning"])

    if rule.condition == "range_out":
        threshold_str = f"[{rule.threshold_low}, {rule.threshold_high}]"
    else:
        threshold_str = str(rule.threshold)

    # Shift the trigger time by +8h for display as Beijing time.
    # NOTE(review): this assumes triggered_at is stored in UTC — confirm.
    triggered_time = event.triggered_at or datetime.now(timezone.utc)
    triggered_str = (triggered_time + timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S")

    try:
        template_html = _ALARM_TEMPLATE_PATH.read_text(encoding="utf-8")
    except FileNotFoundError:
        logger.error("Alarm email template not found, skipping email.")
        return

    def _esc(value) -> str:
        # These values come from rule/device/event records; escape them so
        # they cannot inject markup into the HTML body.
        return html.escape(str(value))

    body_html = template_html.format(
        severity_label=severity_cfg["label"],
        severity_badge_color=severity_cfg["badge_color"],
        severity_bg_color=severity_cfg["bg_color"],
        severity_text_color=severity_cfg["text_color"],
        title=_esc(event.title),
        device_name=_esc(device_name),
        device_code=_esc(device_code),
        data_type=_esc(rule.data_type),
        current_value=_esc(event.value),
        threshold_str=_esc(threshold_str),
        triggered_at=triggered_str,
        description=_esc(event.description or ""),
        platform_url=_esc(settings.PLATFORM_URL),
    )

    subject = f"[{severity_cfg['label']}] {event.title} - 天普EMS告警通知"

    # Fire-and-forget, but keep a reference until the task completes so it is
    # not garbage-collected mid-send.
    task = asyncio.create_task(
        send_email(to=emails, subject=subject, body_html=body_html)
    )
    _EMAIL_TASKS.add(task)
    task.add_done_callback(_EMAIL_TASKS.discard)
|
|
|
|
# Rate-limit window in minutes: while an event for the same rule + device was
# triggered inside this window, no further event is created for that pair.
RATE_LIMIT_MINUTES: int = 5
|
|
|
|
|
|
def _in_silence_window(rule: AlarmRule, now_beijing: datetime) -> bool:
    """Tell whether ``now_beijing`` lies inside the rule's silence window.

    The window is inclusive at both ends and may wrap past midnight
    (e.g. 22:00 - 06:00). A rule missing either bound has no window.

    NOTE(review): the bounds are compared as text against an "HH:MM"
    rendering of the current time, so they are presumably zero-padded
    "HH:MM" strings — confirm against the AlarmRule model.
    """
    window_start, window_end = rule.silence_start, rule.silence_end
    if not (window_start and window_end):
        return False

    hhmm = now_beijing.strftime("%H:%M")

    if window_start > window_end:
        # Window wraps past midnight: inside if after the start or before the end.
        return hhmm >= window_start or hhmm <= window_end
    return window_start <= hhmm <= window_end
|
|
|
|
|
|
def _evaluate_condition(rule: AlarmRule, value: float) -> bool:
    """Evaluate whether a data value triggers the alarm rule condition.

    Supported conditions:
        gt / lt     value strictly above / below ``rule.threshold``
        eq / neq    value within / outside 0.001 of ``rule.threshold``
        range_out   value outside [threshold_low, threshold_high]; a missing
                    bound is treated as unbounded on that side

    Returns False for an unknown condition, for a missing reading
    (``value is None``), and for a threshold-based condition whose threshold
    is not set. Without these guards the comparison raises TypeError, and the
    caller does not catch it, so one bad rule or reading would stop the
    evaluation of every other rule.
    """
    if value is None:
        return False

    condition = rule.condition

    if condition == "range_out":
        low = rule.threshold_low if rule.threshold_low is not None else float("-inf")
        high = rule.threshold_high if rule.threshold_high is not None else float("inf")
        return value < low or value > high

    if condition not in ("gt", "lt", "eq", "neq"):
        return False

    threshold = rule.threshold
    if threshold is None:
        return False

    if condition == "gt":
        return value > threshold
    if condition == "lt":
        return value < threshold

    # eq / neq use an absolute tolerance rather than exact float equality.
    within_tolerance = abs(value - threshold) < 0.001
    return within_tolerance if condition == "eq" else not within_tolerance
|
|
|
|
|
|
def _format_threshold(rule: AlarmRule) -> str:
    """Render the rule's threshold (or [low, high] range) as display text."""
    if rule.condition == "range_out":
        return f"[{rule.threshold_low}, {rule.threshold_high}]"
    return str(rule.threshold)


async def _raise_alarm(
    session: AsyncSession,
    rule: AlarmRule,
    device_id: int,
    dp: EnergyData,
    now: datetime,
) -> None:
    """Create an active alarm event for rule + device unless rate-limited."""
    # Rate limiting: skip when any event for this rule + device (whatever its
    # status) was triggered inside the window. Only existence matters, so the
    # first row is enough; scalar_one_or_none() would raise
    # MultipleResultsFound if several events fell inside the window.
    window_start = now - timedelta(minutes=RATE_LIMIT_MINUTES)
    recent_result = await session.execute(
        select(AlarmEvent)
        .where(
            and_(
                AlarmEvent.rule_id == rule.id,
                AlarmEvent.device_id == device_id,
                AlarmEvent.triggered_at >= window_start,
            )
        )
        .limit(1)
    )
    if recent_result.scalars().first() is not None:
        return  # Recently triggered; skip.

    threshold_str = _format_threshold(rule)
    event = AlarmEvent(
        rule_id=rule.id,
        device_id=device_id,
        severity=rule.severity,
        title=rule.name,
        description=f"当前值 {dp.value},阈值 {threshold_str}",
        value=dp.value,
        threshold=rule.threshold,
        status="active",
        triggered_at=now,
    )
    session.add(event)
    await session.flush()  # Ensure the event has an id before it is handed on.

    # Customer hook: on_alarm_created. A failing hook must not prevent the
    # alarm from being recorded, so errors are logged (with traceback) and
    # swallowed.
    try:
        dev_result = await session.execute(select(Device).where(Device.id == device_id))
        device = dev_result.scalar_one_or_none()
        await get_hooks().on_alarm_created(event, device, rule, session)
    except Exception as exc:
        logger.exception("Hook on_alarm_created error: %s", exc)

    logger.info(
        f"Alarm triggered: {rule.name} | device={device_id} | "
        f"value={dp.value} threshold={threshold_str}"
    )

    # Email notification; the helper schedules the actual send as a
    # background task rather than waiting for delivery.
    await _send_alarm_email(rule, event, device_id, session)


async def _resolve_alarm(
    session: AsyncSession,
    rule: AlarmRule,
    device_id: int,
    active_event: AlarmEvent,
    now: datetime,
) -> None:
    """Mark an open alarm event as automatically resolved."""
    active_event.status = "resolved"
    active_event.resolved_at = now
    active_event.resolve_note = "自动恢复"

    # Customer hook: on_alarm_resolved. Errors are logged and swallowed so a
    # failing hook does not undo the auto-resolve.
    try:
        dev_result = await session.execute(select(Device).where(Device.id == device_id))
        device = dev_result.scalar_one_or_none()
        await get_hooks().on_alarm_resolved(active_event, device, session)
    except Exception as exc:
        logger.exception("Hook on_alarm_resolved error: %s", exc)

    logger.info(f"Alarm auto-resolved: {rule.name} | device={device_id}")


async def check_alarms(session: AsyncSession):
    """Main alarm check routine. Call after each simulator data cycle.

    For every active rule outside its silence window, takes the newest data
    point per device from the last 30 seconds and then either:

    * creates an "active" AlarmEvent when the rule condition holds and no
      active/acknowledged event exists for that rule + device (subject to
      the RATE_LIMIT_MINUTES window), or
    * auto-resolves the existing event when the condition no longer holds.

    Changes are flushed to the session but not committed here.

    Args:
        session: Async SQLAlchemy session used for all reads and writes.
    """
    now = datetime.now(timezone.utc)
    now_beijing = now + timedelta(hours=8)
    # Only data from the last 30 seconds (one cycle) is considered.
    cutoff = now - timedelta(seconds=30)

    # 1. Load all active alarm rules.
    result = await session.execute(
        select(AlarmRule).where(AlarmRule.is_active == True)  # noqa: E712
    )
    rules = result.scalars().all()

    for rule in rules:
        if _in_silence_window(rule, now_beijing):
            continue

        # 2. Fetch recent data points for the rule's data type, newest first.
        # A rule with device_id is limited to that device; otherwise every
        # device reporting this data_type is matched.
        # NOTE(review): no device_type filter is applied here — confirm
        # whether rules without device_id are meant to be narrowed by type.
        data_query = (
            select(EnergyData)
            .where(EnergyData.data_type == rule.data_type)
            .order_by(EnergyData.timestamp.desc())
        )
        if rule.device_id:
            data_query = data_query.where(EnergyData.device_id == rule.device_id)
        # NOTE(review): with more than 50 matching rows in the window, the
        # limit can drop devices entirely — confirm this cap is intended.
        data_query = data_query.where(EnergyData.timestamp >= cutoff).limit(50)

        data_points = (await session.execute(data_query)).scalars().all()
        if not data_points:
            continue

        # Rows arrive newest first, so the first row seen per device is its
        # latest reading.
        latest_by_device: dict[int, EnergyData] = {}
        for dp in data_points:
            latest_by_device.setdefault(dp.device_id, dp)

        for device_id, dp in latest_by_device.items():
            triggered = _evaluate_condition(rule, dp.value)

            # Existing open event for this rule + device. first() is used so
            # that duplicate open events cannot abort the whole cycle with
            # MultipleResultsFound.
            active_event_result = await session.execute(
                select(AlarmEvent)
                .where(
                    and_(
                        AlarmEvent.rule_id == rule.id,
                        AlarmEvent.device_id == device_id,
                        AlarmEvent.status.in_(["active", "acknowledged"]),
                    )
                )
                .limit(1)
            )
            active_event = active_event_result.scalars().first()

            if triggered and active_event is None:
                await _raise_alarm(session, rule, device_id, dp, now)
            elif not triggered and active_event is not None:
                await _resolve_alarm(session, rule, device_id, active_event, now)

    await session.flush()
|