my_projects/python/redfish-api/redfish_exporter_v9000.py

"""Simple Redfish exporter to collect Power data from bare matel server"""

import argparse
import signal
import time
import logging
from dataclasses import dataclass, field
import asyncio
import aiohttp
import urllib3
import yaml
import json
from prometheus_client import Gauge, start_http_server, Summary, Counter, Histogram


@dataclass
class HostConfig:
    """Solve too many arguments"""

    fqdn: str
    username: str
    password: str
    systemid: list[str] | None = None
    max_retries: int = 1
    backoff: int = 2
    cool_down: int = 120  # seconds to wait after too many failures
    failures: int = 0
    next_retry_time: float = field(default=0.0, init=False)

    # New attributes for Redfish stuff
    vendor: str | None = None
    session_token: str | None = None
    session_logout: str | None = (
        None  # SessionLocation like /redfish/v1/SessionService/Sessions/marco.lucarelli%40abacus.ch00000000xxx/
    )

    def should_skip(self) -> bool:
        """Check if host is still in cool-down window"""
        return time.monotonic() < self.next_retry_time

    def mark_failure(self):
        """Increase failure counter and maybe trigger cool-down"""
        self.failures += 1
        if self.failures >= self.max_retries:
            self.next_retry_time = time.monotonic() + self.cool_down
            self.failures = 0  # reset after triggering cool-down

    def mark_success(self):
        """Reset failure counter after a successful request"""
        self.failures = 0
        self.next_retry_time = 0.0


# Disable certificate warnings
urllib3.disable_warnings()
# set log config
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
)

# Prometheus metrics
REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing request")
REQUEST_LATENCY = Histogram(
    "redfish_request_latency_seconds", "Time for Redfish request", ["host"]
)
up_gauge = Gauge("redfish_up", "Host up/down", ["host"])
error_counter = Counter(
    "redfish_errors_total", "Total Redfish errors", ["host", "error"]
)
voltage_gauge = Gauge(
    "redfish_psu_line_input_voltage_volts",
    "Line Input Voltage per PSU",
    ["host", "psu_serial"],
)
watts_gauge = Gauge(
    "redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"]
)
amps_gauge = Gauge(
    "redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"]
)


@REQUEST_TIME.time()
async def process_request(t):
    """Simulate request time"""
    await asyncio.sleep(t)


async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None:
    """Fetch JSON from Redfish with retry/backoff"""
    if host.should_skip():
        logging.warning(
            "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
        )
        up_gauge.labels(host=host.fqdn).set(0)
        return None

    if not host.vendor:
        try:
            async with session.get(
                f"https://{host.fqdn}/redfish/v1/", ssl=False, timeout=10
            ) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    host.vendor = data.get("Vendor", "")
                    logging.debug("Detected vendor for %s: %s", host.fqdn, host.vendor)
                else:
                    logging.warning(
                        "Vendor probe failed on %s: HTTP %s", host.fqdn, resp.status
                    )
        except Exception as e:
            logging.warning("Vendor probe failed for %s: %s", host.fqdn, e)

    is_hpe = host.vendor and host.vendor.strip().upper().startswith("HPE")

    for attempt in range(1, host.max_retries + 1):
        try:
            headers = {}

            if is_hpe:
                # Try to reuse existing session token
                if host.session_token:
                    headers["X-Auth-Token"] = host.session_token
                    logging.debug("Reusing cached session token for %s", host.fqdn)
                else:
                    # Need to login and store new session token
                    # HPE Redfish login
                    login_url = (
                        f"https://{host.fqdn}/redfish/v1/SessionService/Sessions"
                    )
                    payload = {"UserName": host.username, "Password": host.password}
                    async with session.post(
                        login_url, json=payload, ssl=False, timeout=10
                    ) as login_resp:
                        if login_resp.status == 201:
                            host.session_token = login_resp.headers.get(
                                "X-Auth-Token"
                            )  # as response in header
                            if not host.session_token:
                                raise RuntimeError("No X-Auth-Token in login response")
                            host.session_logout = login_resp.headers.get(
                                "Location"
                            )  # as response in header
                            if not host.session_logout:
                                raise RuntimeError("No Location in login response")
                            headers["X-Auth-Token"] = host.session_token
                            logging.info("New session token obtained for %s", host.fqdn)
                        else:
                            logging.warning(
                                "Login failed for %s: HTTP %s",
                                host.fqdn,
                                login_resp.status,
                            )
                            continue  # retry login next attempt

                async with session.get(
                    url, headers=headers, ssl=False, timeout=10
                ) as resp:
                    if resp.status == 200:
                        host.mark_success()
                        return await resp.json()
                    elif resp.status in (401, 403):
                        # Token expired or invalid, clear it and retry
                        logging.warning(
                            "Invalid token for %s, reauthenticating...", host.fqdn
                        )
                        host.session_token = None
                        continue
                    logging.warning(
                        "HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
                    )

            else:
                # Default: BasicAuth, like Supermicro and so
                async with session.get(
                    url,
                    auth=aiohttp.BasicAuth(host.username, host.password),
                    ssl=False,
                    timeout=10,
                ) as resp:
                    if resp.status == 200:
                        host.mark_success()
                        return await resp.json()
                    logging.warning(
                        "HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
                    )

        except asyncio.TimeoutError:
            logging.warning("Timeout on %s (attempt %d)", host.fqdn, attempt)
        except aiohttp.ClientError as e:
            logging.warning(
                "Client error on %s (attempt %d): %s", host.fqdn, attempt, e
            )
        except Exception as e:
            logging.exception(
                "Unexpected error on %s (attempt %d): %s", host.fqdn, attempt, e
            )

        if attempt < host.max_retries:
            await asyncio.sleep(host.backoff * attempt)
        else:
            host.mark_failure()
            logging.error("All retries failed for %s", host.fqdn)

    return None


async def get_power_data(session, host: HostConfig):
    """Query Redfish and update Prometheus metrics"""
    if host.should_skip():
        logging.warning(
            "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
        )
        up_gauge.labels(host=host.fqdn).set(0)
        return

    # start time measurement
    start = time.monotonic()
    # Root ressource abfragen
    resources = await discover_redfish_resources(session, host)
    if not resources or "Chassis" not in resources:
        logging.error("Could not discover Chassis resource for %s", host.fqdn)
        host.mark_failure()
        up_gauge.labels(host=host.fqdn).set(0)
        return

    # Mark host as up
    host.mark_success()
    up_gauge.labels(host=host.fqdn).set(1)

    # Chassis-Ressource abfragen
    chassis_url = f"https://{host.fqdn}{resources['Chassis']}"
    chassis_data = await fetch_with_retry(session, host, chassis_url)
    if not chassis_data:
        host.mark_failure()
        up_gauge.labels(host=host.fqdn).set(0)
        return

    # 3. Power-Daten aus den Chassis-Mitgliedern extrahieren
    for chassis_member in chassis_data.get("Members", []):
        chassis_member_url = chassis_member.get("@odata.id")
        if not chassis_member_url:
            continue

        # Get Chassis ID from url ("/redfish/v1/Chassis/1" -> 1)
        chassis_id = chassis_member_url.split("/")[-1]
        # Check if the chassis id is in config (had problem with chassis "NVMe")
        if hasattr(host, 'systemid') and host.systemid:
            if chassis_id not in host.systemid:
                continue

        member_url = f"https://{host.fqdn}{chassis_member_url}"
        member_data = await fetch_with_retry(session, host, member_url)
        if not member_data:
            continue

        # PowerSubsystem url
        power_subsystem_url = member_data.get("PowerSubsystem", {}).get("@odata.id")
        if not power_subsystem_url:
            logging.warning("No PowerSubsystem found for %s", host.fqdn)
            continue

        # PowerSubsystem collection abfragen
        power_subsystem_url = f"https://{host.fqdn}{power_subsystem_url}"
        power_subsystem_data = await fetch_with_retry(
            session, host, power_subsystem_url
        )
        if not power_subsystem_data:
            logging.warning("No PowerSubsystem data found for %s", host.fqdn)
            continue

        # PowerSupplies auflisten
        power_supplies_url = power_subsystem_data.get("PowerSupplies", {}).get(
            "@odata.id"
        )
        if not power_supplies_url:
            logging.warning("No PowerSupplies found for %s", host.fqdn)
            continue

        # PowerSupplies Members auflisten
        power_supplies_url = f"https://{host.fqdn}{power_supplies_url}"
        power_supplies_data = await fetch_with_retry(session, host, power_supplies_url)
        if not power_supplies_data:
            continue

        # Loop over PowerSupply Members
        for psu_member in power_supplies_data.get("Members", []):
            psu_url = psu_member.get("@odata.id")
            if not psu_url:
                continue

            psu_url = f"https://{host.fqdn}{psu_url}"
            psu_data = await fetch_with_retry(session, host, psu_url)
            if not psu_data:
                continue

            # Get Metrics URL
            metrics_url = psu_data.get("Metrics", {}).get("@odata.id")
            if not metrics_url:
                logging.warning(
                    "No Metrics found for PowerSupply %s", psu_data.get("Id")
                )
                continue

            metrics_url = f"https://{host.fqdn}{metrics_url}"
            metrics_data = await fetch_with_retry(session, host, metrics_url)
            if not metrics_data:
                continue

            # Get Metrics from data
            line_input_v = metrics_data.get("InputVoltage", {}).get("Reading")
            watts_input = metrics_data.get("InputPowerWatts", {}).get("Reading")
            amps_input = metrics_data.get("InputCurrentAmps", {}).get("Reading")
            serial = psu_data.get("SerialNumber")
            # Calculate Amps
            if line_input_v is not None:
                voltage_gauge.labels(host=host.fqdn, psu_serial=serial).set(
                    line_input_v
                )
            if watts_input is not None:
                watts_gauge.labels(host=host.fqdn, psu_serial=serial).set(watts_input)
            if amps_input is not None:
                amps_gauge.labels(host=host.fqdn, psu_serial=serial).set(amps_input)

    REQUEST_LATENCY.labels(host=host.fqdn).observe(time.monotonic() - start)


async def logout_host(session, host):
    """Clean logout for Redfish with session tokens"""
    if not host.session_token:
        return
    if not host.session_logout:
        return
    try:
        logout_url = f"{host.session_logout}"  # the full URL is here!
        async with session.delete(
            logout_url,
            headers={"X-Auth-Token": host.session_token},
            ssl=False,
            timeout=5,
        ) as resp:
            if resp.status in (200, 204):
                logging.info("Logged out from %s", host.fqdn)
            else:
                logging.warning(
                    "Logout failed for %s (HTTP %s)", host.fqdn, resp.status
                )
    except Exception as e:
        logging.warning("Error during logout for %s: %s", host.fqdn, e)
    finally:
        host.session_token = None


async def run_exporter(config, stop_event):
    """Main loop"""
    port = config.get("port", 8000)
    default_username = config.get("username")
    default_password = config.get("password")
    default_systemid = config.get("systemid")
    hosts = config["hosts"]
    interval = config.get("interval", 10)

    # Start Prometheus metrics server
    start_http_server(port)
    logging.info("Prometheus metrics server running on port %s", port)

    # create persistent HostConfig objects
    host_objs = []
    for host_entry in hosts:
        if isinstance(host_entry, dict):
            hc = HostConfig(
                fqdn=host_entry["fqdn"],
                username=host_entry.get("username", default_username),
                password=host_entry.get("password", default_password),
                systemid=host_entry.get("systemid", default_systemid),
            )
        else:
            hc = HostConfig(
                fqdn=host_entry, username=default_username, password=default_password
            )
        host_objs.append(hc)

    # Connection pooling with aiohttp
    connector = aiohttp.TCPConnector(limit_per_host=5, limit=50, ttl_dns_cache=300)
    async with aiohttp.ClientSession(connector=connector) as session:
        try:
            while not stop_event.is_set():
                tasks = [get_power_data(session, hc) for hc in host_objs]
                await asyncio.gather(*tasks)
                await process_request(interval)
        finally:
            # Graceful shutdown: logout from Redfish sessions
            logging.info("Exporter stopping, logging out from Redfish sessions...")
            await asyncio.gather(
                *(logout_host(session, h) for h in host_objs if h.session_token)
            )
            logging.info("All sessions logged out.")
    logging.info("Exporter stopped cleanly.")


# ab hier neu
# Marco Lucarelli 2026-01-29
async def discover_redfish_resources(session, host: HostConfig) -> dict:
    """Discover available Redfish resources and return relevant URLs"""
    root_url = f"https://{host.fqdn}/redfish/v1/"
    data = await fetch_with_retry(session, host, root_url)
    if not data:
        return {}

    # Extrahiere Links aus der Root-Antwort
    links = {
        "Chassis": data.get("Chassis", {}).get("@odata.id"),
        "Systems": data.get("Systems", {}).get("@odata.id"),
        "SessionService": data.get("SessionService", {}).get("@odata.id"),
    }
    return links


async def main():
    """Modern asyncio entry point"""
    parser = argparse.ArgumentParser(description="Redfish Prometheus Exporter")
    parser.add_argument("--config", default="config.yaml", help="Path to config file")
    parser.add_argument("--port", type=int, help="Override port from config file")
    parser.add_argument(
        "--interval", type=int, help="Override interval from config file"
    )
    args = parser.parse_args()

    # Load YAML config
    with open(args.config, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    # Override port if argument is provided
    if args.port is not None:
        config["port"] = args.port
    if args.interval is not None:
        config["interval"] = args.interval

    stop_event = asyncio.Event()
    loop = asyncio.get_running_loop()
    # Handle SIGINT (Ctrl+C) and SIGTERM
    for sig in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(sig, stop_event.set)

    await run_exporter(config, stop_event)


if __name__ == "__main__":
    asyncio.run(main())