Files
tp-ems/backend/app/services/alarm_checker.py
Du Wenbo 2822486270 Squashed 'core/' changes from 92ec910..2b9797d
2b9797d feat: add customer hooks plugin system (v1.1.0)
26d2731 chore: add VERSION file (1.0.0)

git-subtree-dir: core
git-subtree-split: 2b9797d61b501ecbaa73253f6f4001769917a24f
2026-04-04 18:32:56 +08:00

273 lines
10 KiB
Python

"""告警检测服务 - 根据告警规则检查最新数据,生成/自动恢复告警事件"""
import asyncio
import logging
from datetime import datetime, timezone, timedelta
from pathlib import Path
from sqlalchemy import select, and_
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.alarm import AlarmRule, AlarmEvent
from app.models.energy import EnergyData
from app.models.device import Device
from app.hooks import get_hooks
logger = logging.getLogger("alarm_checker")

# HTML template used for alarm notification emails. It lives in the
# "templates" directory that sits two levels above this file.
_ALARM_TEMPLATE_PATH = Path(__file__).resolve().parents[1] / "templates" / "alarm_email.html"

# Presentation settings per alarm severity: the label shown in the email
# subject/body and the colours substituted into the email template.
_SEVERITY_CONFIG = {
    "critical": {
        "label": "紧急告警",
        "badge_color": "#d32f2f",
        "bg_color": "#ffebee",
        "text_color": "#c62828",
    },
    "major": {
        "label": "重要告警",
        "badge_color": "#e65100",
        "bg_color": "#fff3e0",
        "text_color": "#e65100",
    },
    "warning": {
        "label": "一般告警",
        "badge_color": "#f9a825",
        "bg_color": "#fffde7",
        "text_color": "#f57f17",
    },
}
# Strong references to in-flight email tasks. The event loop only keeps weak
# references to tasks, so a fire-and-forget task that nobody else references
# may be garbage-collected before it finishes.
_pending_email_tasks: set[asyncio.Task] = set()


def _on_email_task_done(task: asyncio.Task) -> None:
    """Release a finished email task and log its failure, if any."""
    _pending_email_tasks.discard(task)
    if task.cancelled():
        return
    exc = task.exception()
    if exc is not None:
        logger.error("Failed to send alarm email: %s", exc)


async def _send_alarm_email(
    rule: AlarmRule, event: AlarmEvent, device_id: int, session: AsyncSession
):
    """Send alarm notification email if configured.

    Nothing is sent unless ``"email"`` is listed in ``rule.notify_channels``
    and at least one recipient can be found in ``rule.notify_targets``
    (either a dict with an ``"emails"`` entry, or a plain list of address
    strings). The HTML body is rendered from the template at
    ``_ALARM_TEMPLATE_PATH`` with ``str.format``; delivery is scheduled as a
    background task and is not awaited.

    Args:
        rule: Alarm rule that fired; supplies channels, targets, severity,
            condition and thresholds.
        event: Alarm event being reported; supplies title, value,
            description and trigger time.
        device_id: Id of the device the event belongs to.
        session: Database session used to look up the device.
    """
    import html

    from app.services.email_service import send_email
    from app.core.config import get_settings

    # Check if email is in notify_channels
    channels = rule.notify_channels or []
    if "email" not in channels:
        return

    # Resolve recipients from notify_targets
    targets = rule.notify_targets or {}
    if isinstance(targets, dict):
        emails = targets.get("emails") or []
    elif isinstance(targets, list):
        # notify_targets given directly as a list of address strings
        emails = [t for t in targets if isinstance(t, str) and "@" in t]
    else:
        emails = []
    if isinstance(emails, str):
        # A single address stored as a bare string; send_email is called
        # with a list everywhere else.
        emails = [emails]
    if not emails:
        logger.debug(f"No email recipients for alarm rule '{rule.name}', skipping.")
        return

    # Fetch device info
    dev_result = await session.execute(select(Device).where(Device.id == device_id))
    device = dev_result.scalar_one_or_none()
    device_name = device.name if device else f"设备#{device_id}"
    device_code = device.code if device else "N/A"

    settings = get_settings()
    severity_cfg = _SEVERITY_CONFIG.get(rule.severity, _SEVERITY_CONFIG["warning"])

    # Build threshold string
    if rule.condition == "range_out":
        threshold_str = f"[{rule.threshold_low}, {rule.threshold_high}]"
    else:
        threshold_str = str(rule.threshold)

    # Format triggered time as Beijing wall-clock time by shifting +8 hours.
    # NOTE(review): assumes event.triggered_at is expressed in UTC — confirm.
    triggered_time = event.triggered_at or datetime.now(timezone.utc)
    triggered_beijing = triggered_time + timedelta(hours=8)
    triggered_str = triggered_beijing.strftime("%Y-%m-%d %H:%M:%S")

    # Load and render template
    try:
        template_html = _ALARM_TEMPLATE_PATH.read_text(encoding="utf-8")
    except FileNotFoundError:
        logger.error("Alarm email template not found, skipping email.")
        return

    def _esc(value) -> str:
        # Values are interpolated into HTML, so markup characters in rule
        # names, device names, descriptions etc. must be neutralised.
        return html.escape(str(value))

    try:
        body_html = template_html.format(
            severity_label=severity_cfg["label"],
            severity_badge_color=severity_cfg["badge_color"],
            severity_bg_color=severity_cfg["bg_color"],
            severity_text_color=severity_cfg["text_color"],
            title=_esc(event.title),
            device_name=_esc(device_name),
            device_code=_esc(device_code),
            data_type=_esc(rule.data_type),
            current_value=_esc(event.value),
            threshold_str=_esc(threshold_str),
            triggered_at=_esc(triggered_str),
            description=_esc(event.description or ""),
            platform_url=_esc(settings.PLATFORM_URL),
        )
    except (KeyError, IndexError, ValueError) as exc:
        # Unknown placeholder or stray brace in the template: the email is
        # best-effort, so report the problem instead of raising.
        logger.error("Alarm email template could not be rendered (%r), skipping email.", exc)
        return

    # The subject is plain text, so the title is used unescaped here.
    subject = f"[{severity_cfg['label']}] {event.title} - 天普EMS告警通知"

    # Fire-and-forget delivery; keep a reference until the task completes.
    task = asyncio.create_task(send_email(to=emails, subject=subject, body_html=body_html))
    _pending_email_tasks.add(task)
    task.add_done_callback(_on_email_task_done)
# Rate limit, in minutes: check_alarms does not create a new event for a
# rule + device pair when an event for that pair was already triggered
# within this window.
RATE_LIMIT_MINUTES = 5
def _in_silence_window(rule: AlarmRule, now_beijing: datetime) -> bool:
    """Check if current time falls within the rule's silence window.

    The window bounds are compared as minutes of the day rather than as raw
    strings, so bounds that are not zero-padded (``"8:00"``), that carry
    seconds (``"08:00:00"``) or that are ``datetime.time`` objects are all
    handled. Both bounds are inclusive, and a window whose start is later
    than its end is treated as crossing midnight.

    Args:
        rule: Alarm rule providing ``silence_start`` and ``silence_end``.
        now_beijing: Current time; only its hour and minute are used.

    Returns:
        True when the current time is inside the window. False when either
        bound is missing/empty or cannot be parsed.
    """
    if not rule.silence_start or not rule.silence_end:
        return False

    def _minute_of_day(value) -> int:
        # datetime.time-like objects expose hour/minute directly.
        if hasattr(value, "hour") and hasattr(value, "minute"):
            return value.hour * 60 + value.minute
        hour_text, minute_text = str(value).strip().split(":")[:2]
        return int(hour_text) * 60 + int(minute_text)

    try:
        start = _minute_of_day(rule.silence_start)
        end = _minute_of_day(rule.silence_end)
    except ValueError:
        # An unparseable bound is treated as "no silence window" so that a
        # malformed setting cannot suppress alarms.
        return False

    current = now_beijing.hour * 60 + now_beijing.minute
    if start <= end:
        return start <= current <= end
    # Crosses midnight, e.g. 22:00 - 06:00
    return current >= start or current <= end
def _evaluate_condition(rule: AlarmRule, value: float) -> bool:
    """Evaluate whether a data value triggers the alarm rule condition.

    Supported ``rule.condition`` values:

    * ``"gt"`` / ``"lt"``: value strictly above / below ``rule.threshold``.
    * ``"eq"`` / ``"neq"``: value within / outside a 0.001 tolerance of
      ``rule.threshold``.
    * ``"range_out"``: value below ``rule.threshold_low`` or above
      ``rule.threshold_high``; a missing bound is treated as unbounded.

    Args:
        rule: Alarm rule providing the condition and its threshold(s).
        value: Data value to test.

    Returns:
        True when the condition is met. False for an unknown condition, for
        a missing value, and for a threshold-based condition whose
        threshold is not set.
    """
    if value is None:
        # No reading to compare; comparing None with a number would raise.
        return False

    condition = rule.condition

    if condition == "range_out":
        low = rule.threshold_low if rule.threshold_low is not None else float("-inf")
        high = rule.threshold_high if rule.threshold_high is not None else float("inf")
        return value < low or value > high

    threshold = rule.threshold
    if threshold is None:
        # An incompletely configured rule must not raise TypeError here.
        return False

    if condition == "gt":
        return value > threshold
    if condition == "lt":
        return value < threshold
    if condition == "eq":
        return abs(value - threshold) < 0.001
    if condition == "neq":
        return abs(value - threshold) >= 0.001
    return False
async def check_alarms(session: AsyncSession):
    """Main alarm check routine. Call after each simulator data cycle.

    For every active alarm rule that is not inside its silence window, the
    latest data point of each matching device from the last 30 seconds is
    evaluated:

    * condition met and no open event -> a new ``active`` event is created
      (unless one was already triggered within ``RATE_LIMIT_MINUTES``), the
      ``on_alarm_created`` hook is called and an email notification is
      requested;
    * condition not met and open event(s) exist -> they are auto-resolved
      and the ``on_alarm_resolved`` hook is called.

    Failures in hooks or in email notification are logged and do not stop
    the check. This function only flushes the session; it does not commit.

    Args:
        session: Database session used for all queries and writes.
    """
    now = datetime.now(timezone.utc)
    # Beijing wall-clock time (UTC+8) drives the silence-window check.
    now_beijing = now + timedelta(hours=8)

    # 1. Load all active alarm rules
    result = await session.execute(
        select(AlarmRule).where(AlarmRule.is_active == True)  # noqa: E712 (SQL expression)
    )
    rules = result.scalars().all()

    for rule in rules:
        # Skip if in silence window
        # NOTE(review): this also postpones auto-resolution of open events
        # until the window ends — confirm that is intended.
        if _in_silence_window(rule, now_beijing):
            continue

        # 2. Find matching devices' latest data points, newest first.
        # NOTE(review): only rule.device_id is used as a filter; a rule
        # without device_id is evaluated against every device reporting
        # this data_type — confirm whether a device-type filter is intended.
        data_query = (
            select(EnergyData)
            .where(EnergyData.data_type == rule.data_type)
            .order_by(EnergyData.timestamp.desc())
        )
        if rule.device_id:
            data_query = data_query.where(EnergyData.device_id == rule.device_id)

        # Only data from the last 30 seconds (one cycle) is considered.
        # NOTE(review): the 50-row cap can drop devices when more than 50
        # points arrive in the window — confirm the expected volume.
        cutoff = now - timedelta(seconds=30)
        data_query = data_query.where(EnergyData.timestamp >= cutoff).limit(50)

        data_result = await session.execute(data_query)
        data_points = data_result.scalars().all()
        if not data_points:
            continue

        # Rows are ordered newest first, so the first row seen for a device
        # is its latest reading.
        latest_by_device: dict[int, EnergyData] = {}
        for dp in data_points:
            latest_by_device.setdefault(dp.device_id, dp)

        for device_id, dp in latest_by_device.items():
            triggered = _evaluate_condition(rule, dp.value)

            # Open events for this rule + device. All rows are fetched so
            # that unexpected duplicates cannot raise MultipleResultsFound.
            active_result = await session.execute(
                select(AlarmEvent).where(
                    and_(
                        AlarmEvent.rule_id == rule.id,
                        AlarmEvent.device_id == device_id,
                        AlarmEvent.status.in_(["active", "acknowledged"]),
                    )
                )
            )
            active_events = active_result.scalars().all()

            if triggered and not active_events:
                # Rate limiting: skip when any event for this rule + device
                # was triggered recently. Only existence matters, so one row
                # is enough.
                recent_result = await session.execute(
                    select(AlarmEvent)
                    .where(
                        and_(
                            AlarmEvent.rule_id == rule.id,
                            AlarmEvent.device_id == device_id,
                            AlarmEvent.triggered_at >= now - timedelta(minutes=RATE_LIMIT_MINUTES),
                        )
                    )
                    .limit(1)
                )
                if recent_result.scalars().first() is not None:
                    continue  # Skip, recently triggered

                # Build description
                if rule.condition == "range_out":
                    threshold_str = f"[{rule.threshold_low}, {rule.threshold_high}]"
                else:
                    threshold_str = str(rule.threshold)

                event = AlarmEvent(
                    rule_id=rule.id,
                    device_id=device_id,
                    severity=rule.severity,
                    title=rule.name,
                    description=f"当前值 {dp.value},阈值 {threshold_str}",
                    value=dp.value,
                    threshold=rule.threshold,
                    status="active",
                    triggered_at=now,
                )
                session.add(event)
                await session.flush()  # Ensure event has id

                # Customer hook: on_alarm_created (best-effort)
                try:
                    dev_result = await session.execute(
                        select(Device).where(Device.id == device_id)
                    )
                    device = dev_result.scalar_one_or_none()
                    await get_hooks().on_alarm_created(event, device, rule, session)
                except Exception as hook_exc:
                    logger.exception("Hook on_alarm_created error: %s", hook_exc)

                logger.info(
                    f"Alarm triggered: {rule.name} | device={device_id} | "
                    f"value={dp.value} threshold={threshold_str}"
                )

                # Email notification is best-effort: a failure here must not
                # abort the checks for the remaining devices and rules.
                try:
                    await _send_alarm_email(rule, event, device_id, session)
                except Exception as mail_exc:
                    logger.exception(
                        "Alarm email notification failed for rule '%s': %s",
                        rule.name,
                        mail_exc,
                    )

            elif not triggered and active_events:
                # Auto-resolve every open event for this rule + device.
                for active_event in active_events:
                    active_event.status = "resolved"
                    active_event.resolved_at = now
                    active_event.resolve_note = "自动恢复"

                    # Customer hook: on_alarm_resolved (best-effort)
                    try:
                        dev_result = await session.execute(
                            select(Device).where(Device.id == device_id)
                        )
                        device = dev_result.scalar_one_or_none()
                        await get_hooks().on_alarm_resolved(active_event, device, session)
                    except Exception as hook_exc:
                        logger.exception("Hook on_alarm_resolved error: %s", hook_exc)

                    logger.info(
                        f"Alarm auto-resolved: {rule.name} | device={device_id}"
                    )

    await session.flush()