diff --git a/python/redfish-api/redfish_exporter_v9000.py b/python/redfish-api/redfish_exporter_v9000.py
new file mode 100644
index 0000000..3b50efb
--- /dev/null
+++ b/python/redfish-api/redfish_exporter_v9000.py
@@ -0,0 +1,462 @@
+"""Simple Redfish exporter to collect power data from bare-metal servers"""
+
+import argparse
+import signal
+import time
+import logging
+from dataclasses import dataclass, field
+import asyncio
+import aiohttp
+import urllib3
+import yaml
+import json
+from prometheus_client import Gauge, start_http_server, Summary, Counter, Histogram
+
+
+@dataclass
+class HostConfig:
+    """Per-host configuration and session state (avoids passing too many arguments around)"""
+
+    fqdn: str
+    username: str
+    password: str
+    max_retries: int = 1
+    backoff: int = 2
+    cool_down: int = 120  # seconds to wait after too many failures
+    failures: int = 0
+    next_retry_time: float = field(default=0.0, init=False)
+
+    # New attributes for the Redfish session handling
+    vendor: str | None = None
+    session_token: str | None = None
+    session_logout: str | None = (
+        None  # session Location like /redfish/v1/SessionService/Sessions/marco.lucarelli%40abacus.ch00000000xxx/
+    )
+
+    def should_skip(self) -> bool:
+        """Check if host is still in cool-down window"""
+        return time.monotonic() < self.next_retry_time
+
+    def mark_failure(self):
+        """Increase failure counter and maybe trigger cool-down"""
+        self.failures += 1
+        if self.failures >= self.max_retries:
+            self.next_retry_time = time.monotonic() + self.cool_down
+            self.failures = 0  # reset after triggering cool-down
+
+    def mark_success(self):
+        """Reset failure counter after a successful request"""
+        self.failures = 0
+        self.next_retry_time = 0.0
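+
+
+# A quick illustration of the cool-down behaviour above (fqdn and credentials
+# are placeholders): with the default max_retries=1, a single mark_failure()
+# arms the 120-second cool-down window.
+#
+#   hc = HostConfig(fqdn="bmc01.example.com", username="admin", password="secret")
+#   hc.should_skip()   # False - no failures yet
+#   hc.mark_failure()  # failures >= max_retries -> cool-down armed, counter reset
+#   hc.should_skip()   # True for roughly the next 120 s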
+
+
+# Disable certificate warnings
+urllib3.disable_warnings()
+# Set log config
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
+)
+
+# Prometheus metrics
+REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing request")
+REQUEST_LATENCY = Histogram(
+    "redfish_request_latency_seconds", "Time for Redfish request", ["host"]
+)
+up_gauge = Gauge("redfish_up", "Host up/down", ["host"])
+error_counter = Counter(
+    "redfish_errors_total", "Total Redfish errors", ["host", "error"]
+)
+voltage_gauge = Gauge(
+    "redfish_psu_line_input_voltage_volts",
+    "Line Input Voltage per PSU",
+    ["host", "psu_serial"],
+)
+watts_gauge = Gauge(
+    "redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"]
+)
+amps_gauge = Gauge(
+    "redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"]
+)
+
+
+@REQUEST_TIME.time()
+async def process_request(t):
+    """Sleep between scrape cycles (timed by the REQUEST_TIME summary)"""
+    await asyncio.sleep(t)
+
+
+async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None:
+    """Fetch JSON from Redfish with retry/backoff"""
+    if host.should_skip():
+        logging.warning(
+            "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
+        )
+        up_gauge.labels(host=host.fqdn).set(0)
+        return None
+
+    if not host.vendor:
+        try:
+            async with session.get(
+                f"https://{host.fqdn}/redfish/v1/", ssl=False, timeout=10
+            ) as resp:
+                if resp.status == 200:
+                    data = await resp.json()
+                    host.vendor = data.get("Vendor", "")
+                    logging.debug("Detected vendor for %s: %s", host.fqdn, host.vendor)
+                else:
+                    logging.warning(
+                        "Vendor probe failed on %s: HTTP %s", host.fqdn, resp.status
+                    )
+        except Exception as e:
+            logging.warning("Vendor probe failed for %s: %s", host.fqdn, e)
+
+    is_hpe = host.vendor and host.vendor.strip().upper().startswith("HPE")
+
+    for attempt in range(1, host.max_retries + 1):
+        try:
+            headers = {}
+
+            if is_hpe:
+                # Try to reuse an existing session token
+                if host.session_token:
+                    headers["X-Auth-Token"] = host.session_token
+                    logging.debug("Reusing cached session token for %s", host.fqdn)
+                else:
+                    # Need to log in and store a new session token
+                    # (HPE Redfish login)
+                    login_url = (
+                        f"https://{host.fqdn}/redfish/v1/SessionService/Sessions"
+                    )
+                    payload = {"UserName": host.username, "Password": host.password}
+                    async with session.post(
+                        login_url, json=payload, ssl=False, timeout=10
+                    ) as login_resp:
+                        if login_resp.status == 201:
+                            host.session_token = login_resp.headers.get(
+                                "X-Auth-Token"
+                            )  # returned in the response headers
+                            if not host.session_token:
+                                raise RuntimeError("No X-Auth-Token in login response")
+                            host.session_logout = login_resp.headers.get(
+                                "Location"
+                            )  # returned in the response headers
+                            if not host.session_logout:
+                                raise RuntimeError("No Location in login response")
+                            headers["X-Auth-Token"] = host.session_token
+                            logging.info("New session token obtained for %s", host.fqdn)
+                        else:
+                            logging.warning(
+                                "Login failed for %s: HTTP %s",
+                                host.fqdn,
+                                login_resp.status,
+                            )
+                            continue  # retry the login on the next attempt
+
+                async with session.get(
+                    url, headers=headers, ssl=False, timeout=10
+                ) as resp:
+                    if resp.status == 200:
+                        host.mark_success()
+                        return await resp.json()
+                    elif resp.status in (401, 403):
+                        # Token expired or invalid, clear it and retry
+                        logging.warning(
+                            "Invalid token for %s, reauthenticating...", host.fqdn
+                        )
+                        host.session_token = None
+                        continue
+                    logging.warning(
+                        "HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
+                    )
+
+            else:
+                # Default: HTTP basic auth, e.g. for Supermicro
+                async with session.get(
+                    url,
+                    auth=aiohttp.BasicAuth(host.username, host.password),
+                    ssl=False,
+                    timeout=10,
+                ) as resp:
+                    if resp.status == 200:
+                        host.mark_success()
+                        return await resp.json()
+                    logging.warning(
+                        "HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
+                    )
+
+        except asyncio.TimeoutError:
+            logging.warning("Timeout on %s (attempt %d)", host.fqdn, attempt)
+        except aiohttp.ClientError as e:
+            logging.warning(
+                "Client error on %s (attempt %d): %s", host.fqdn, attempt, e
+            )
+        except Exception as e:
+            logging.exception(
+                "Unexpected error on %s (attempt %d): %s", host.fqdn, attempt, e
+            )
+
+        if attempt < host.max_retries:
+            await asyncio.sleep(host.backoff * attempt)
+        else:
+            host.mark_failure()
+            logging.error("All retries failed for %s", host.fqdn)
+
+    return None
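+
+
+# A minimal sketch of how the PowerSupplyMetrics payload fetched in
+# get_power_data() below could be wired into the existing gauges. It assumes
+# the DMTF PowerSupplyMetrics schema, where InputVoltage, InputPowerWatts and
+# InputCurrentAmps are sensor excerpts carrying a "Reading" value; the helper
+# name and the amps fallback are suggestions, not an existing code path.
+def update_gauges_from_psu_metrics(host: HostConfig, psu_serial: str, metrics: dict):
+    """Map PowerSupplyMetrics sensor excerpts onto the existing PSU gauges."""
+    volts = (metrics.get("InputVoltage") or {}).get("Reading")
+    watts = (metrics.get("InputPowerWatts") or {}).get("Reading")
+    amps = (metrics.get("InputCurrentAmps") or {}).get("Reading")
+    if volts is not None:
+        voltage_gauge.labels(host=host.fqdn, psu_serial=psu_serial).set(volts)
+    if watts is not None:
+        watts_gauge.labels(host=host.fqdn, psu_serial=psu_serial).set(watts)
+    # Derive amps when the BMC exposes no current sensor but V and W are known
+    if amps is None and volts and watts:
+        amps = round(watts / volts, 2)
+    if amps is not None:
+        amps_gauge.labels(host=host.fqdn, psu_serial=psu_serial).set(amps)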
+
+
+async def get_power_data(session, host: HostConfig):
+    """Query Redfish and update Prometheus metrics"""
+    if host.should_skip():
+        logging.warning(
+            "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
+        )
+        up_gauge.labels(host=host.fqdn).set(0)
+        return
+
+    # 1. Query the Redfish service root
+    resources = await discover_redfish_resources(session, host)
+    if not resources or "Chassis" not in resources:
+        logging.error("Could not discover Chassis resource for %s", host.fqdn)
+        host.mark_failure()
+        up_gauge.labels(host=host.fqdn).set(0)
+        return
+
+    # 2. Query the Chassis collection
+    chassis_url = f"https://{host.fqdn}{resources['Chassis']}"
+    chassis_data = await fetch_with_retry(session, host, chassis_url)
+    if not chassis_data:
+        host.mark_failure()
+        up_gauge.labels(host=host.fqdn).set(0)
+        return
+
+    # 3. Extract the power data from the chassis members
+    for chassis_member in chassis_data.get("Members", []):
+        chassis_member_url = chassis_member.get("@odata.id")
+        if not chassis_member_url:
+            continue
+
+        member_url = f"https://{host.fqdn}{chassis_member_url}"
+        member_data = await fetch_with_retry(session, host, member_url)
+        if not member_data:
+            continue
+
+        # Extract the PowerSubsystem link
+        power_subsystem_url = member_data.get("PowerSubsystem", {}).get("@odata.id")
+        if not power_subsystem_url:
+            logging.warning("No PowerSubsystem found for %s", host.fqdn)
+            continue
+
+        # Query the PowerSubsystem resource
+        power_subsystem_url = f"https://{host.fqdn}{power_subsystem_url}"
+        power_subsystem_data = await fetch_with_retry(session, host, power_subsystem_url)
+        if not power_subsystem_data:
+            logging.warning("No PowerSubsystem data found for %s", host.fqdn)
+            continue
+
+        # Resolve the PowerSupplies collection link
+        power_supplies_url = power_subsystem_data.get("PowerSupplies", {}).get("@odata.id")
+        if not power_supplies_url:
+            logging.warning("No PowerSupplies found for %s", host.fqdn)
+            continue
+
+        # List the PowerSupplies members
+        power_supplies_url = f"https://{host.fqdn}{power_supplies_url}"
+        power_supplies_data = await fetch_with_retry(session, host, power_supplies_url)
+        if not power_supplies_data:
+            continue
+
+        # Loop over the PowerSupply members
+        for psu_member in power_supplies_data.get("Members", []):
+            psu_url = psu_member.get("@odata.id")
+            if not psu_url:
+                continue
+
+            psu_url = f"https://{host.fqdn}{psu_url}"
+            psu_data = await fetch_with_retry(session, host, psu_url)
+            if not psu_data:
+                continue
+
+            # Get the Metrics URL
+            metrics_url = psu_data.get("Metrics", {}).get("@odata.id")
+            if not metrics_url:
+                logging.warning("No Metrics found for PowerSupply %s", psu_data.get("Id"))
+                continue
+
+            metrics_url = f"https://{host.fqdn}{metrics_url}"
+            metrics_data = await fetch_with_retry(session, host, metrics_url)
+            if not metrics_data:
+                continue
+            # Dump the raw metrics payload at debug level while this code
+            # path is still under construction
+            logging.debug(
+                "PowerSupplyMetrics for %s: %s",
+                host.fqdn,
+                json.dumps(metrics_data, indent=4, sort_keys=True),
+            )
+
+    # TODO: feed metrics_data into the gauges from here, roughly:
+    # for psu in power_data.get("PowerSupplies", []):
+    #     ... (the existing metrics logic; see the
+    #     update_gauges_from_psu_metrics() sketch above)
+    # Until then, fall back to the legacy Power resource:
+    url = f"https://{host.fqdn}/redfish/v1/Chassis/1/Power"
+    start = time.monotonic()
+
+    data = await fetch_with_retry(session, host, url)
+    if not data:
+        host.mark_failure()
+        up_gauge.labels(host=host.fqdn).set(0)
+        return
+
+    host.mark_success()
+    up_gauge.labels(host=host.fqdn).set(1)
+
+    for psu in data.get("PowerSupplies", []):
+        line_input_v = psu.get("LineInputVoltage")
+        # HPE Redfish uses LastPowerOutputWatts for the wattage
+        if host.vendor and host.vendor.strip().upper().startswith("HPE"):
+            watts_input = psu.get("LastPowerOutputWatts")
+        else:
+            # Supermicro uses PowerInputWatts
+            watts_input = psu.get("PowerInputWatts")
+        serial = psu.get("SerialNumber")
+
+        amps = (
+            round(watts_input / line_input_v, 2)
+            if line_input_v and watts_input
+            else None
+        )
+
+        if line_input_v is not None:
+            voltage_gauge.labels(host=host.fqdn, psu_serial=serial).set(line_input_v)
+        if watts_input is not None:
+            watts_gauge.labels(host=host.fqdn, psu_serial=serial).set(watts_input)
+        if amps is not None:
+            amps_gauge.labels(host=host.fqdn, psu_serial=serial).set(amps)
+
+    REQUEST_LATENCY.labels(host=host.fqdn).observe(time.monotonic() - start)
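+
+
+# For reference, an abbreviated PowerSupplies entry from the legacy Power
+# resource parsed above (values are illustrative). The vendor branch exists
+# because HPE iLO reports wattage as LastPowerOutputWatts while Supermicro
+# uses PowerInputWatts:
+#
+#   {
+#       "SerialNumber": "5WBXK0123456",
+#       "LineInputVoltage": 230,
+#       "PowerInputWatts": 142.5,
+#       "LastPowerOutputWatts": 138
+#   }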
+
+
+async def logout_host(session, host):
+    """Clean logout for Redfish hosts with session tokens"""
+    if not host.session_token:
+        return
+    if not host.session_logout:
+        return
+    try:
+        logout_url = host.session_logout  # the Location header already holds the full URL
+        async with session.delete(
+            logout_url,
+            headers={"X-Auth-Token": host.session_token},
+            ssl=False,
+            timeout=5,
+        ) as resp:
+            if resp.status in (200, 204):
+                logging.info("Logged out from %s", host.fqdn)
+            else:
+                logging.warning(
+                    "Logout failed for %s (HTTP %s)", host.fqdn, resp.status
+                )
+    except Exception as e:
+        logging.warning("Error during logout for %s: %s", host.fqdn, e)
+    finally:
+        host.session_token = None
+
+
+async def run_exporter(config, stop_event):
+    """Main loop"""
+    port = config.get("port", 8000)
+    default_username = config.get("username")
+    default_password = config.get("password")
+    hosts = config["hosts"]
+    interval = config.get("interval", 10)
+
+    # Start the Prometheus metrics server
+    start_http_server(port)
+    logging.info("Prometheus metrics server running on port %s", port)
+
+    # Create persistent HostConfig objects
+    host_objs = []
+    for host_entry in hosts:
+        if isinstance(host_entry, dict):
+            hc = HostConfig(
+                fqdn=host_entry["fqdn"],
+                username=host_entry.get("username", default_username),
+                password=host_entry.get("password", default_password),
+            )
+        else:
+            hc = HostConfig(
+                fqdn=host_entry, username=default_username, password=default_password
+            )
+        host_objs.append(hc)
+
+    # Connection pooling with aiohttp
+    connector = aiohttp.TCPConnector(limit_per_host=5, limit=50, ttl_dns_cache=300)
+    async with aiohttp.ClientSession(connector=connector) as session:
+        try:
+            while not stop_event.is_set():
+                tasks = [get_power_data(session, hc) for hc in host_objs]
+                await asyncio.gather(*tasks)
+                await process_request(interval)
+        finally:
+            # Graceful shutdown: log out of all Redfish sessions
+            logging.info("Exporter stopping, logging out from Redfish sessions...")
+            await asyncio.gather(
+                *(logout_host(session, h) for h in host_objs if h.session_token)
+            )
+            logging.info("All sessions logged out.")
+            logging.info("Exporter stopped cleanly.")
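+
+
+# An example config.yaml matching the keys read in run_exporter() above
+# (hostnames and credentials are placeholders):
+#
+#   port: 8000
+#   interval: 10
+#   username: admin
+#   password: secret
+#   hosts:
+#     - bmc01.example.com           # plain string -> default credentials
+#     - fqdn: bmc02.example.com     # mapping -> per-host credentials
+#       username: monitor
+#       password: other-secret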
f"https://{host.fqdn}/redfish/v1/" + data = await fetch_with_retry(session, host, root_url) + if not data: + return {} + + # Extrahiere Links aus der Root-Antwort + links = { + "Chassis": data.get("Chassis", {}).get("@odata.id"), + "Systems": data.get("Systems", {}).get("@odata.id"), + "SessionService": data.get("SessionService", {}).get("@odata.id"), + } + return links + + +async def main(): + """Modern asyncio entry point""" + parser = argparse.ArgumentParser(description="Redfish Prometheus Exporter") + parser.add_argument("--config", default="config.yaml", help="Path to config file") + parser.add_argument("--port", type=int, help="Override port from config file") + parser.add_argument( + "--interval", type=int, help="Override interval from config file" + ) + args = parser.parse_args() + + # Load YAML config + with open(args.config, "r", encoding="utf-8") as file: + config = yaml.safe_load(file) + + # Override port if argument is provided + if args.port is not None: + config["port"] = args.port + if args.interval is not None: + config["interval"] = args.interval + + stop_event = asyncio.Event() + loop = asyncio.get_running_loop() + # Handle SIGINT (Ctrl+C) and SIGTERM + for sig in (signal.SIGINT, signal.SIGTERM): + loop.add_signal_handler(sig, stop_event.set) + + await run_exporter(config, stop_event) + + +if __name__ == "__main__": + asyncio.run(main())