feature: grouping

This commit is contained in:
2026-02-02 12:55:32 +01:00
parent bdac561e86
commit 5c777483fc
3 changed files with 101 additions and 90 deletions

View File

@@ -56,6 +56,7 @@ class HostConfig:
username: str
password: str
chassis: list[str] | None = None
group: str = "none"
max_retries: int = 3 # 3 retires
backoff: int = 2 # wait 2 seconds
cool_down: int = 120 # seconds to wait after too many failures
@@ -92,24 +93,28 @@ REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing requ
REQUEST_LATENCY = Histogram(
"redfish_request_latency_seconds", "Time for Redfish request", ["host"]
)
UP_GAUGE = Gauge("redfish_up", "Host up/down", ["host"])
UP_GAUGE = Gauge("redfish_up", "Host up/down", ["host", "group"])
ERROR_COUNTER = Counter(
"redfish_errors_total", "Total Redfish errors", ["host", "error"]
)
VOLTAGE_GAUGE = Gauge(
"redfish_psu_line_input_voltage_volts",
"Line Input Voltage per PSU",
["host", "psu_serial"],
["host", "psu_serial", "group"],
)
WATTS_GAUGE = Gauge(
"redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"]
"redfish_psu_power_input_watts",
"Power Input Watts per PSU",
["host", "psu_serial", "group"],
)
AMPS_GAUGE = Gauge(
"redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"]
"redfish_psu_input_amps",
"Current draw in Amps per PSU",
["host", "psu_serial", "group"],
)
# set info metric
SYSTEM_INFO = Info(
"redfish_system_info", "System information (model, serial, etc.)", ["host"]
"redfish_system_info", "System information (model, serial, etc.)", ["host", "group"]
)
@@ -170,7 +175,7 @@ async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None:
logging.warning(
"Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
)
UP_GAUGE.labels(host=host.fqdn).set(0)
UP_GAUGE.labels(host=host.fqdn, group=host.group).set(0)
return None
# Probe vendor if not already known
@@ -391,7 +396,7 @@ async def get_power_data(session, host: HostConfig):
logging.warning(
"Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
)
UP_GAUGE.labels(host=host.fqdn).set(0)
UP_GAUGE.labels(host=host.fqdn, group=host.group).set(0)
return
# Start time measurement
@@ -402,17 +407,17 @@ async def get_power_data(session, host: HostConfig):
if not resources or not resources.chassis:
logging.error("Could not discover any resources for %s", host.fqdn)
host.mark_failure()
UP_GAUGE.labels(host=host.fqdn).set(0)
UP_GAUGE.labels(host=host.fqdn, group=host.group).set(0)
return
host.mark_success()
UP_GAUGE.labels(host=host.fqdn).set(1)
UP_GAUGE.labels(host=host.fqdn, group=host.group).set(1)
chassis_url = f"https://{host.fqdn}{resources.chassis}"
chassis_data = await fetch_with_retry(session, host, chassis_url)
if not chassis_data:
host.mark_failure()
UP_GAUGE.labels(host=host.fqdn).set(0)
UP_GAUGE.labels(host=host.fqdn, group=host.group).set(0)
return
for chassis_member in chassis_data.get("Members", []):
@@ -498,13 +503,17 @@ async def get_power_data(session, host: HostConfig):
def update_prometheus_metrics(host: HostConfig, metrics: PowerMetrics):
"""Update Prometheus metrics with PowerMetrics data."""
if metrics.voltage is not None and metrics.serial:
VOLTAGE_GAUGE.labels(host=host.fqdn, psu_serial=metrics.serial).set(
metrics.voltage
)
VOLTAGE_GAUGE.labels(
host=host.fqdn, psu_serial=metrics.serial, group=host.group
).set(metrics.voltage)
if metrics.watts is not None and metrics.serial:
WATTS_GAUGE.labels(host=host.fqdn, psu_serial=metrics.serial).set(metrics.watts)
WATTS_GAUGE.labels(
host=host.fqdn, psu_serial=metrics.serial, group=host.group
).set(metrics.watts)
if metrics.amps is not None and metrics.serial:
AMPS_GAUGE.labels(host=host.fqdn, psu_serial=metrics.serial).set(metrics.amps)
AMPS_GAUGE.labels(
host=host.fqdn, psu_serial=metrics.serial, group=host.group
).set(metrics.amps)
async def get_system_info(session, host: HostConfig):
@@ -547,7 +556,7 @@ async def get_system_info(session, host: HostConfig):
serial_number = system_data.get("SerialNumber")
# Hier könnte ihre Werbung stehen
SYSTEM_INFO.labels(host=host.fqdn).info(
SYSTEM_INFO.labels(host=host.fqdn, group=host.group).info(
{
"manufacturer": manufacturer,
"model": model,
@@ -587,7 +596,8 @@ async def run_exporter(config, stop_event):
port = config.get("port", 8000)
default_username = config.get("username")
default_password = config.get("password")
default_chassis = config.get("chassis")
default_chassis = config.get("chassis", "1")
default_group = config.get("group", "none")
hosts = config["hosts"]
interval = config.get("interval", 10)
@@ -604,10 +614,15 @@ async def run_exporter(config, stop_event):
username=host_entry.get("username", default_username),
password=host_entry.get("password", default_password),
chassis=host_entry.get("chassis", default_chassis),
group=host_entry.get("group", default_group),
)
else:
hc = HostConfig(
fqdn=host_entry, username=default_username, password=default_password
fqdn=host_entry,
username=default_username,
password=default_password,
chassis=default_chassis,
group=default_group,
)
host_objs.append(hc)