Files
my_projects/python/redfish-api/redfish_exporter.py

316 lines
11 KiB
Python

"""Simple Redfish exporter to collect Power data from bare matel server"""
import argparse
import signal
import time
import logging
from dataclasses import dataclass, field
import asyncio
import aiohttp
import urllib3
import yaml
from prometheus_client import Gauge, start_http_server, Summary, Counter, Histogram
@dataclass
class HostConfig:
"""Solve too many arguments"""
fqdn: str
username: str
password: str
max_retries: int = 1
backoff: int = 2
cool_down: int = 120 # seconds to wait after too many failures
failures: int = 0
next_retry_time: float = field(default=0.0, init=False)
# New attributes for Redfish stuff
vendor: str | None = None
session_token: str | None = None
def should_skip(self) -> bool:
"""Check if host is still in cool-down window"""
return time.monotonic() < self.next_retry_time
def mark_failure(self):
"""Increase failure counter and maybe trigger cool-down"""
self.failures += 1
if self.failures >= self.max_retries:
self.next_retry_time = time.monotonic() + self.cool_down
self.failures = 0 # reset after triggering cool-down
def mark_success(self):
"""Reset failure counter after a successful request"""
self.failures = 0
self.next_retry_time = 0.0
# Silence urllib3 warnings (intended for the certificate warnings).
urllib3.disable_warnings()

# Root logger: timestamped INFO-level output.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Label set shared by every per-PSU gauge.
_PSU_LABELS = ["host", "psu_serial"]

# --- Prometheus metrics -----------------------------------------------------
REQUEST_TIME = Summary(
    name="request_processing_seconds",
    documentation="Time spent processing request",
)
REQUEST_LATENCY = Histogram(
    name="redfish_request_latency_seconds",
    documentation="Time for Redfish request",
    labelnames=["host"],
)
up_gauge = Gauge(
    name="redfish_up",
    documentation="Host up/down",
    labelnames=["host"],
)
error_counter = Counter(
    name="redfish_errors_total",
    documentation="Total Redfish errors",
    labelnames=["host", "error"],
)
voltage_gauge = Gauge(
    name="redfish_psu_line_input_voltage_volts",
    documentation="Line Input Voltage per PSU",
    labelnames=_PSU_LABELS,
)
watts_gauge = Gauge(
    name="redfish_psu_power_input_watts",
    documentation="Power Input Watts per PSU",
    labelnames=_PSU_LABELS,
)
amps_gauge = Gauge(
    name="redfish_psu_input_amps",
    documentation="Current draw in Amps per PSU",
    labelnames=_PSU_LABELS,
)
async def process_request(t):
    """Sleep for ``t`` seconds and record the elapsed time in ``REQUEST_TIME``.

    Args:
        t: Number of seconds to sleep (passed to ``asyncio.sleep``).
    """
    # Time the awaited sleep from inside the coroutine. Decorating an
    # ``async def`` with ``@REQUEST_TIME.time()`` only wraps the call that
    # creates the coroutine object when the decorator is not coroutine-aware,
    # which would record ~0 s. NOTE(review): confirm against the installed
    # prometheus_client release; the context-manager form works either way.
    with REQUEST_TIME.time():
        await asyncio.sleep(t)
async def _probe_vendor(session, host: HostConfig, timeout) -> None:
    """Best-effort read of the ``Vendor`` field from the Redfish service root.

    On HTTP 200 ``host.vendor`` is set to the reported vendor, or to ``""``
    when the field is missing or null, so the probe is not repeated on every
    fetch. On any failure ``host.vendor`` is left untouched and a later call
    probes again.
    """
    try:
        async with session.get(
            f"https://{host.fqdn}/redfish/v1/", ssl=False, timeout=timeout
        ) as resp:
            if resp.status != 200:
                logging.warning(
                    "Vendor probe failed on %s: HTTP %s", host.fqdn, resp.status
                )
                return
            data = await resp.json()
            host.vendor = data.get("Vendor") or ""
            logging.debug("Detected vendor for %s: %s", host.fqdn, host.vendor)
    except Exception as e:
        # Deliberately broad: a failed probe must never abort the real fetch.
        logging.warning("Vendor probe failed for %s: %s", host.fqdn, e)


async def _hpe_auth_headers(session, host: HostConfig, timeout) -> dict | None:
    """Return ``X-Auth-Token`` headers for ``host``, logging in when needed.

    A cached ``host.session_token`` is reused. Otherwise a session is created
    via ``SessionService/Sessions`` and its token cached on ``host``.

    Returns:
        The header dict, or ``None`` when the login was rejected (non-201).

    Raises:
        RuntimeError: The login answered 201 but carried no ``X-Auth-Token``.
    """
    if host.session_token:
        logging.debug("Reusing cached session token for %s", host.fqdn)
        return {"X-Auth-Token": host.session_token}

    login_url = f"https://{host.fqdn}/redfish/v1/SessionService/Sessions"
    payload = {"UserName": host.username, "Password": host.password}
    async with session.post(
        login_url, json=payload, ssl=False, timeout=timeout
    ) as login_resp:
        if login_resp.status != 201:
            logging.warning(
                "Login failed for %s: HTTP %s", host.fqdn, login_resp.status
            )
            return None
        token = login_resp.headers.get("X-Auth-Token")
    if not token:
        raise RuntimeError("No X-Auth-Token in login response")
    host.session_token = token
    logging.info("New session token obtained for %s", host.fqdn)
    return {"X-Auth-Token": token}


async def _fetch_once(
    session, host: HostConfig, url: str, is_hpe: bool, timeout, attempt: int
) -> dict | None:
    """Perform one authenticated GET of ``url``.

    Returns the parsed JSON body on HTTP 200 and ``None`` on any other
    outcome (already logged). Network errors propagate to the caller.
    """
    if not is_hpe:
        # Default: HTTP Basic auth (e.g. Supermicro).
        async with session.get(
            url,
            auth=aiohttp.BasicAuth(host.username, host.password),
            ssl=False,
            timeout=timeout,
        ) as resp:
            if resp.status == 200:
                return await resp.json()
            logging.warning(
                "HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
            )
            return None

    # Token-based auth. If a *cached* token is rejected it has probably just
    # expired, so log in again once within the same attempt instead of
    # spending a retry (and possibly a cool-down) on it.
    for _ in range(2):
        had_cached_token = bool(host.session_token)
        headers = await _hpe_auth_headers(session, host, timeout)
        if headers is None:
            return None
        async with session.get(
            url, headers=headers, ssl=False, timeout=timeout
        ) as resp:
            if resp.status == 200:
                return await resp.json()
            if resp.status not in (401, 403):
                logging.warning(
                    "HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
                )
                return None
            logging.warning("Invalid token for %s, reauthenticating...", host.fqdn)
            host.session_token = None
        if not had_cached_token:
            # Even a freshly issued token was rejected; give up this attempt.
            return None
    return None


async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None:
    """Fetch JSON from a Redfish endpoint with retry and linear backoff.

    Hosts whose vendor string starts with "HPE" are accessed with a Redfish
    session token; every other host uses HTTP Basic auth. The vendor is
    detected once per host via the service root.

    Args:
        session: HTTP client session used for all requests (an
            ``aiohttp.ClientSession`` in this file).
        host: Target host; its retry state, vendor and session token are
            updated in place.
        url: Absolute URL of the resource to fetch.

    Returns:
        The decoded JSON document, or ``None`` when the host is in cool-down
        or every attempt failed. Every failed attempt is followed by the
        backoff sleep (except the last), and exhausting all attempts always
        records a failure on ``host``.
    """
    if host.should_skip():
        logging.warning(
            "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
        )
        up_gauge.labels(host=host.fqdn).set(0)
        return None

    # 10 s total budget per request, shared by every call below.
    timeout = aiohttp.ClientTimeout(total=10)

    # ``None`` means "not probed yet"; ``""`` means "probed, vendor unknown".
    if host.vendor is None:
        await _probe_vendor(session, host, timeout)
    is_hpe = bool(host.vendor) and host.vendor.strip().upper().startswith("HPE")

    for attempt in range(1, host.max_retries + 1):
        try:
            data = await _fetch_once(session, host, url, is_hpe, timeout, attempt)
        except asyncio.TimeoutError:
            logging.warning("Timeout on %s (attempt %d)", host.fqdn, attempt)
        except aiohttp.ClientError as e:
            logging.warning(
                "Client error on %s (attempt %d): %s", host.fqdn, attempt, e
            )
        except Exception as e:
            logging.exception(
                "Unexpected error on %s (attempt %d): %s", host.fqdn, attempt, e
            )
        else:
            if data is not None:
                # Only count a success once the body has actually been parsed.
                host.mark_success()
                return data
        if attempt < host.max_retries:
            await asyncio.sleep(host.backoff * attempt)

    host.mark_failure()
    logging.error("All retries failed for %s", host.fqdn)
    return None
def _psu_amps(volts, watts):
    """Derive the input current in amps from power and voltage, or ``None``.

    Zero watts always yields ``0.0`` so an idle PSU does not keep a stale
    reading; otherwise a non-zero voltage is required.
    """
    if watts == 0:
        return 0.0
    if volts and watts is not None:
        return round(watts / volts, 2)
    return None


async def get_power_data(session, host: HostConfig):
    """Query the host's Redfish Power resource and update Prometheus metrics.

    Sets ``redfish_up`` for the host, and for every power supply in the
    ``PowerSupplies`` array publishes input voltage, input power and the
    derived current. The request latency is observed on success only.

    Args:
        session: HTTP client session passed through to ``fetch_with_retry``.
        host: Host to poll; its failure / success state is updated.
    """
    if host.should_skip():
        logging.warning(
            "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
        )
        up_gauge.labels(host=host.fqdn).set(0)
        return

    url = f"https://{host.fqdn}/redfish/v1/Chassis/1/Power"
    start = time.monotonic()
    data = await fetch_with_retry(session, host, url)
    if not data:
        # NOTE(review): fetch_with_retry may already have recorded a failure
        # for this outage, so it can be counted twice -- confirm intended.
        host.mark_failure()
        up_gauge.labels(host=host.fqdn).set(0)
        return

    host.mark_success()
    up_gauge.labels(host=host.fqdn).set(1)

    # ``or []`` also covers an explicit JSON null for the array.
    for index, psu in enumerate(data.get("PowerSupplies") or []):
        if not isinstance(psu, dict):
            continue
        line_input_v = psu.get("LineInputVoltage")
        watts_input = psu.get("PowerInputWatts")
        # Without a serial number every PSU would share the label "None" and
        # overwrite the others; fall back to MemberId, then array position.
        serial = psu.get("SerialNumber") or psu.get("MemberId") or str(index)
        labels = {"host": host.fqdn, "psu_serial": serial}

        if line_input_v is not None:
            voltage_gauge.labels(**labels).set(line_input_v)
        if watts_input is not None:
            watts_gauge.labels(**labels).set(watts_input)
        amps = _psu_amps(line_input_v, watts_input)
        if amps is not None:
            amps_gauge.labels(**labels).set(amps)

    REQUEST_LATENCY.labels(host=host.fqdn).observe(time.monotonic() - start)
async def logout_host(session, host):
    """Best-effort Redfish logout for a host that holds a session token.

    Does nothing when ``host.session_token`` is empty. Otherwise sends a
    DELETE carrying the token, logs the outcome, and always clears the cached
    token afterwards -- even when the request fails or raises.

    Args:
        session: HTTP client session used to send the DELETE.
        host: Host object exposing ``fqdn`` and ``session_token``.
    """
    if not host.session_token:
        return
    try:
        # NOTE(review): this targets the Sessions collection rather than the
        # individual session resource -- confirm the BMC accepts that.
        sessions_url = f"https://{host.fqdn}/redfish/v1/SessionService/Sessions"
        auth_headers = {"X-Auth-Token": host.session_token}
        async with session.delete(
            sessions_url, headers=auth_headers, ssl=False, timeout=5
        ) as resp:
            if resp.status not in (200, 204):
                logging.warning(
                    "Logout failed for %s (HTTP %s)", host.fqdn, resp.status
                )
            else:
                logging.info("Logged out from %s", host.fqdn)
    except Exception as e:
        # Shutdown path: never let a failed logout propagate.
        logging.warning("Error during logout for %s: %s", host.fqdn, e)
    finally:
        host.session_token = None
async def run_exporter(config, stop_event):
    """Run the polling loop until ``stop_event`` is set, then log out.

    Starts the Prometheus HTTP endpoint, builds one persistent ``HostConfig``
    per configured host, and polls all hosts concurrently every ``interval``
    seconds. On shutdown, hosts that still hold a session token are logged
    out before the HTTP session is closed.

    Args:
        config: Parsed configuration mapping. Reads ``hosts`` (required; each
            entry is either an FQDN string or a mapping with ``fqdn`` and
            optional ``username`` / ``password``), plus optional ``port``
            (default 8000), ``interval`` (default 10), ``username`` and
            ``password`` (defaults for hosts that do not set their own).
        stop_event: ``asyncio.Event`` that ends the loop once set.

    Raises:
        KeyError: ``config`` has no ``hosts`` entry, or a host mapping has no
            ``fqdn``.
    """
    port = config.get("port", 8000)
    default_username = config.get("username")
    default_password = config.get("password")
    interval = config.get("interval", 10)

    # Start Prometheus metrics server
    start_http_server(port)
    logging.info("Prometheus metrics server running on port %s", port)

    # One persistent HostConfig per host so failure counters, cool-downs and
    # session tokens survive between polling cycles.
    host_objs = []
    for host_entry in config["hosts"]:
        if isinstance(host_entry, dict):
            hc = HostConfig(
                fqdn=host_entry["fqdn"],
                username=host_entry.get("username", default_username),
                password=host_entry.get("password", default_password),
            )
        else:
            hc = HostConfig(
                fqdn=host_entry, username=default_username, password=default_password
            )
        host_objs.append(hc)

    # Connection pooling with aiohttp
    connector = aiohttp.TCPConnector(limit_per_host=5, limit=50, ttl_dns_cache=300)
    async with aiohttp.ClientSession(connector=connector) as session:
        while not stop_event.is_set():
            # return_exceptions=True: one misbehaving host must not take the
            # whole exporter down; its error is logged and polling continues.
            results = await asyncio.gather(
                *(get_power_data(session, hc) for hc in host_objs),
                return_exceptions=True,
            )
            for hc, result in zip(host_objs, results):
                if isinstance(result, BaseException):
                    logging.error(
                        "Unhandled error while polling %s", hc.fqdn, exc_info=result
                    )
            await process_request(interval)

        # Shutdown: log out using the SAME objects that were polled -- they
        # are the only ones that can hold a session token.
        logging.info("Exporter stopping, logging out from Redfish sessions...")
        await asyncio.gather(
            *(logout_host(session, hc) for hc in host_objs if hc.session_token)
        )
    logging.info("Exporter stopped cleanly.")
async def main():
    """Parse the command line, load the YAML config and run the exporter.

    ``--port`` and ``--interval`` override the corresponding config values.
    SIGINT and SIGTERM set the stop event so ``run_exporter`` can shut down
    in an orderly way. Exits with a usage error when the config file is not
    a mapping containing a ``hosts`` list.
    """
    parser = argparse.ArgumentParser(description="Redfish Prometheus Exporter")
    parser.add_argument("--config", default="config.yaml", help="Path to config file")
    parser.add_argument("--port", type=int, help="Override port from config file")
    parser.add_argument(
        "--interval", type=int, help="Override interval from config file"
    )
    args = parser.parse_args()

    # Load YAML config
    with open(args.config, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    # An empty file loads as None; fail with a clear message instead of an
    # opaque TypeError / KeyError further down.
    if not isinstance(config, dict) or not isinstance(config.get("hosts"), list):
        parser.error(f"{args.config}: expected a YAML mapping with a 'hosts' list")

    # Command-line values take precedence over the config file.
    if args.port is not None:
        config["port"] = args.port
    if args.interval is not None:
        config["interval"] = args.interval

    stop_event = asyncio.Event()
    loop = asyncio.get_running_loop()

    # Handle SIGINT (Ctrl+C) and SIGTERM
    for sig in (signal.SIGINT, signal.SIGTERM):
        try:
            loop.add_signal_handler(sig, stop_event.set)
        except NotImplementedError:
            # Event loops without signal support (e.g. on Windows): use a
            # plain handler that hands the event over to the loop thread.
            signal.signal(
                sig, lambda *_: loop.call_soon_threadsafe(stop_event.set)
            )

    await run_exporter(config, stop_event)
# Script entry point: run the async main() only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())