"""Simple Redfish exporter to collect Power data from bare matel server""" import argparse import signal import time import logging from dataclasses import dataclass, field import asyncio import aiohttp import urllib3 import yaml from prometheus_client import Gauge, start_http_server, Summary, Counter, Histogram @dataclass class HostConfig: """Solve too many arguments""" fqdn: str username: str password: str max_retries: int = 1 backoff: int = 2 cool_down: int = 120 # seconds to wait after too many failures failures: int = 0 next_retry_time: float = field(default=0.0, init=False) def should_skip(self) -> bool: """Check if host is still in cool-down window""" return time.monotonic() < self.next_retry_time def mark_failure(self): """Increase failure counter and maybe trigger cool-down""" self.failures += 1 if self.failures >= self.max_retries: self.next_retry_time = time.monotonic() + self.cool_down self.failures = 0 # reset after triggering cool-down def mark_success(self): """Reset failure counter after a successful request""" self.failures = 0 self.next_retry_time = 0.0 # Disable certificate warnings urllib3.disable_warnings() # set log config logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s" ) # Prometheus metrics REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing request") REQUEST_LATENCY = Histogram( "redfish_request_latency_seconds", "Time for Redfish request", ["host"] ) up_gauge = Gauge("redfish_up", "Host up/down", ["host"]) error_counter = Counter( "redfish_errors_total", "Total Redfish errors", ["host", "error"] ) voltage_gauge = Gauge( "redfish_psu_line_input_voltage_volts", "Line Input Voltage per PSU", ["host", "psu_serial"], ) watts_gauge = Gauge( "redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"] ) amps_gauge = Gauge( "redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"] ) @REQUEST_TIME.time() async def process_request(t): """Simulate request time""" await asyncio.sleep(t) async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None: """Fetch JSON from Redfish with retry/backoff""" if host.should_skip(): logging.warning( "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time ) up_gauge.labels(host=host.fqdn).set(0) return None for attempt in range(1, host.max_retries + 1): try: async with session.get( url, auth=aiohttp.BasicAuth(host.username, host.password), ssl=False, timeout=10, ) as resp: if resp.status == 200: host.mark_success() return await resp.json() logging.warning( "HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt ) except asyncio.TimeoutError: logging.warning("Timeout on %s (attempt %d)", host.fqdn, attempt) except aiohttp.ClientError as e: logging.warning( "Client error on %s (attempt %d): %s", host.fqdn, attempt, e ) except Exception as e: logging.exception( "Unexpected error on %s (attempt %d): %s", host.fqdn, attempt, e ) if attempt < host.max_retries: await asyncio.sleep(host.backoff * attempt) else: host.mark_failure() logging.error("All retries failed for %s", host.fqdn) return None async def get_power_data(session, host: HostConfig): """Query Redfish and update Prometheus metrics""" if host.should_skip(): logging.warning( "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time ) up_gauge.labels(host=host.fqdn).set(0) return url = f"https://{host.fqdn}/redfish/v1/Chassis/1/Power" start = time.monotonic() data = await fetch_with_retry(session, host, url) if not data: 


async def get_power_data(session, host: HostConfig):
    """Query Redfish and update Prometheus metrics."""
    if host.should_skip():
        logging.warning(
            "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
        )
        up_gauge.labels(host=host.fqdn).set(0)
        return

    url = f"https://{host.fqdn}/redfish/v1/Chassis/1/Power"
    start = time.monotonic()
    data = await fetch_with_retry(session, host, url)
    if not data:
        # fetch_with_retry has already recorded the failure / cool-down state
        up_gauge.labels(host=host.fqdn).set(0)
        return

    host.mark_success()
    up_gauge.labels(host=host.fqdn).set(1)

    for psu in data.get("PowerSupplies", []):
        line_input_v = psu.get("LineInputVoltage")
        watts_input = psu.get("PowerInputWatts")
        serial = psu.get("SerialNumber")
        # Derive amps from watts / volts when both readings are present
        amps = (
            round(watts_input / line_input_v, 2)
            if line_input_v and watts_input
            else None
        )
        if line_input_v is not None:
            voltage_gauge.labels(host=host.fqdn, psu_serial=serial).set(line_input_v)
        if watts_input is not None:
            watts_gauge.labels(host=host.fqdn, psu_serial=serial).set(watts_input)
        if amps is not None:
            amps_gauge.labels(host=host.fqdn, psu_serial=serial).set(amps)

    REQUEST_LATENCY.labels(host=host.fqdn).observe(time.monotonic() - start)


async def run_exporter(config, stop_event):
    """Main scrape loop."""
    port = config.get("port", 8000)
    default_username = config.get("username")
    default_password = config.get("password")
    hosts = config["hosts"]
    interval = config.get("interval", 10)

    # Start the Prometheus metrics server
    start_http_server(port)
    logging.info("Prometheus metrics server running on port %s", port)

    # Create persistent HostConfig objects so retry/cool-down state survives cycles
    host_objs = []
    for host_entry in hosts:
        if isinstance(host_entry, dict):
            hc = HostConfig(
                fqdn=host_entry["fqdn"],
                username=host_entry.get("username", default_username),
                password=host_entry.get("password", default_password),
            )
        else:
            hc = HostConfig(
                fqdn=host_entry, username=default_username, password=default_password
            )
        host_objs.append(hc)

    # Connection pooling with aiohttp
    connector = aiohttp.TCPConnector(limit_per_host=5, limit=50, ttl_dns_cache=300)
    async with aiohttp.ClientSession(connector=connector) as session:
        while not stop_event.is_set():
            tasks = [get_power_data(session, hc) for hc in host_objs]
            await asyncio.gather(*tasks)
            await process_request(interval)

    logging.info("Exporter stopped cleanly.")


async def main():
    """Modern asyncio entry point."""
    parser = argparse.ArgumentParser(description="Redfish Prometheus Exporter")
    parser.add_argument("--config", default="config.yaml", help="Path to config file")
    parser.add_argument("--port", type=int, help="Override port from config file")
    parser.add_argument(
        "--interval", type=int, help="Override interval from config file"
    )
    args = parser.parse_args()

    # Load YAML config
    with open(args.config, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    # Override port/interval if the arguments are provided
    if args.port is not None:
        config["port"] = args.port
    if args.interval is not None:
        config["interval"] = args.interval

    stop_event = asyncio.Event()
    loop = asyncio.get_running_loop()

    # Handle SIGINT (Ctrl+C) and SIGTERM for a clean shutdown
    for sig in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(sig, stop_event.set)

    await run_exporter(config, stop_event)


if __name__ == "__main__":
    asyncio.run(main())
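

# Example config.yaml (a sketch inferred from the keys read in run_exporter();
# hostnames and credentials below are placeholders, adjust for your environment):
#
#   port: 8000
#   interval: 10
#   username: admin            # default BMC credentials
#   password: changeme
#   hosts:
#     - bmc-1.example.com      # bare string: uses the default credentials
#     - fqdn: bmc-2.example.com
#       username: other-admin  # per-host override
#       password: other-secret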