"""Simple Redfish exporter to collect Power data from bare matel server"""
|
|
|
|
import argparse
|
|
import signal
|
|
import time
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
import asyncio
|
|
import aiohttp
|
|
import urllib3
|
|
import yaml
|
|
from prometheus_client import Gauge, start_http_server, Summary, Counter, Histogram
|
|
|
|
|
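# Example config.yaml. This is a sketch inferred from the keys the exporter
# reads below ("port", "interval", "username", "password", "hosts"); the
# hostnames and credentials are placeholders:
#
#   port: 8000
#   interval: 10
#   username: monitor
#   password: secret
#   hosts:
#     - bmc01.example.net            # bare FQDN, uses the default credentials
#     - fqdn: bmc02.example.net      # dict form with per-host credentials
#       username: other-user
#       password: other-secret
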
@dataclass
class HostConfig:
    """Per-host configuration and retry/cool-down state (avoids passing many
    separate arguments around)."""

    fqdn: str
    username: str
    password: str
    max_retries: int = 1
    backoff: int = 2
    cool_down: int = 120  # seconds to wait after too many failures
    failures: int = 0
    next_retry_time: float = field(default=0.0, init=False)

    # Redfish session attributes
    vendor: str | None = None
    session_token: str | None = None
    session_location: str | None = None  # session URI from login, used for logout
    token_expiry: float | None = None

    def should_skip(self) -> bool:
        """Check if host is still in cool-down window"""
        return time.monotonic() < self.next_retry_time

    def mark_failure(self):
        """Increase failure counter and maybe trigger cool-down"""
        self.failures += 1
        if self.failures >= self.max_retries:
            self.next_retry_time = time.monotonic() + self.cool_down
            self.failures = 0  # reset after triggering cool-down

    def mark_success(self):
        """Reset failure counter after a successful request"""
        self.failures = 0
        self.next_retry_time = 0.0
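
# Illustrative behaviour of the cool-down logic above (using the defaults):
# with max_retries=1, a single exhausted fetch calls mark_failure(), which
# schedules next_retry_time = now + 120 s; should_skip() then returns True
# for the next 120 seconds, so polling cycles skip the host instead of
# hammering an unreachable BMC.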

# Disable certificate warnings
urllib3.disable_warnings()

# Set log config
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
)

# Prometheus metrics
REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing request")
REQUEST_LATENCY = Histogram(
    "redfish_request_latency_seconds", "Time for Redfish request", ["host"]
)
up_gauge = Gauge("redfish_up", "Host up/down", ["host"])
error_counter = Counter(
    "redfish_errors_total", "Total Redfish errors", ["host", "error"]
)
voltage_gauge = Gauge(
    "redfish_psu_line_input_voltage_volts",
    "Line Input Voltage per PSU",
    ["host", "psu_serial"],
)
watts_gauge = Gauge(
    "redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"]
)
amps_gauge = Gauge(
    "redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"]
)
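
# What the gauges above look like on http://localhost:<port>/metrics
# (illustrative sample; label values are made up):
#
#   redfish_up{host="bmc01.example.net"} 1.0
#   redfish_psu_line_input_voltage_volts{host="bmc01.example.net",psu_serial="ABC123"} 230.0
#   redfish_psu_power_input_watts{host="bmc01.example.net",psu_serial="ABC123"} 245.0
#   redfish_psu_input_amps{host="bmc01.example.net",psu_serial="ABC123"} 1.07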


@REQUEST_TIME.time()
async def process_request(t):
    """Sleep for t seconds between polling cycles (also timed by REQUEST_TIME)"""
    await asyncio.sleep(t)


async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None:
    """Fetch JSON from Redfish with retry/backoff"""
    if host.should_skip():
        logging.warning(
            "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
        )
        up_gauge.labels(host=host.fqdn).set(0)
        return None

    timeout = aiohttp.ClientTimeout(total=10)

    # Probe the service root once to detect the vendor
    if not host.vendor:
        try:
            async with session.get(
                f"https://{host.fqdn}/redfish/v1/", ssl=False, timeout=timeout
            ) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    host.vendor = data.get("Vendor", "")
                    logging.debug("Detected vendor for %s: %s", host.fqdn, host.vendor)
                else:
                    logging.warning(
                        "Vendor probe failed on %s: HTTP %s", host.fqdn, resp.status
                    )
        except Exception as e:
            logging.warning("Vendor probe failed for %s: %s", host.fqdn, e)

    is_hpe = host.vendor and host.vendor.strip().upper().startswith("HPE")

    for attempt in range(1, host.max_retries + 1):
        # Back off before each retry so that `continue` paths also back off
        if attempt > 1:
            await asyncio.sleep(host.backoff * (attempt - 1))

        try:
            headers = {}

            if is_hpe:
                # Try to reuse existing session token
                if host.session_token:
                    headers["X-Auth-Token"] = host.session_token
                    logging.debug("Reusing cached session token for %s", host.fqdn)
                else:
                    # HPE Redfish login: create a session and store the new token
                    login_url = f"https://{host.fqdn}/redfish/v1/SessionService/Sessions"
                    payload = {"UserName": host.username, "Password": host.password}
                    async with session.post(
                        login_url, json=payload, ssl=False, timeout=timeout
                    ) as login_resp:
                        if login_resp.status == 201:
                            host.session_token = login_resp.headers.get("X-Auth-Token")
                            if not host.session_token:
                                raise RuntimeError("No X-Auth-Token in login response")
                            # Remember the session URI so logout can DELETE it
                            host.session_location = login_resp.headers.get("Location")
                            headers["X-Auth-Token"] = host.session_token
                            logging.info("New session token obtained for %s", host.fqdn)
                        else:
                            logging.warning(
                                "Login failed for %s: HTTP %s",
                                host.fqdn,
                                login_resp.status,
                            )
                            continue  # retry login on the next attempt

                async with session.get(
                    url, headers=headers, ssl=False, timeout=timeout
                ) as resp:
                    if resp.status == 200:
                        host.mark_success()
                        return await resp.json()
                    if resp.status in (401, 403):
                        # Token expired or invalid, clear it and retry
                        logging.warning(
                            "Invalid token for %s, reauthenticating...", host.fqdn
                        )
                        host.session_token = None
                        continue
                    logging.warning(
                        "HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
                    )

            else:
                # Default: Basic auth (Supermicro and most other vendors)
                async with session.get(
                    url,
                    auth=aiohttp.BasicAuth(host.username, host.password),
                    ssl=False,
                    timeout=timeout,
                ) as resp:
                    if resp.status == 200:
                        host.mark_success()
                        return await resp.json()
                    logging.warning(
                        "HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
                    )

        except asyncio.TimeoutError:
            logging.warning("Timeout on %s (attempt %d)", host.fqdn, attempt)
            error_counter.labels(host=host.fqdn, error="timeout").inc()
        except aiohttp.ClientError as e:
            logging.warning(
                "Client error on %s (attempt %d): %s", host.fqdn, attempt, e
            )
            error_counter.labels(host=host.fqdn, error="client_error").inc()
        except Exception as e:
            logging.exception(
                "Unexpected error on %s (attempt %d): %s", host.fqdn, attempt, e
            )
            error_counter.labels(host=host.fqdn, error="unexpected").inc()

    # All attempts exhausted
    host.mark_failure()
    logging.error("All retries failed for %s", host.fqdn)
    return None
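
# For reference, the HPE session handshake above corresponds to this raw
# Redfish exchange (standard DMTF SessionService; values illustrative):
#
#   POST /redfish/v1/SessionService/Sessions HTTP/1.1
#   Content-Type: application/json
#
#   {"UserName": "monitor", "Password": "secret"}
#
#   HTTP/1.1 201 Created
#   X-Auth-Token: 1a2b3c...
#   Location: /redfish/v1/SessionService/Sessions/42
#
# Subsequent requests send the token in an X-Auth-Token header, and logout is
# a DELETE on the Location URI (see logout_host below).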


async def get_power_data(session, host: HostConfig):
    """Query Redfish and update Prometheus metrics"""
    if host.should_skip():
        logging.warning(
            "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
        )
        up_gauge.labels(host=host.fqdn).set(0)
        return

    url = f"https://{host.fqdn}/redfish/v1/Chassis/1/Power"
    start = time.monotonic()

    data = await fetch_with_retry(session, host, url)
    if not data:
        # fetch_with_retry has already recorded the failure; just reflect it
        up_gauge.labels(host=host.fqdn).set(0)
        return

    host.mark_success()
    up_gauge.labels(host=host.fqdn).set(1)

    for psu in data.get("PowerSupplies", []):
        line_input_v = psu.get("LineInputVoltage")
        watts_input = psu.get("PowerInputWatts")
        serial = psu.get("SerialNumber")

        # Derive current draw from power and voltage (I = P / V)
        amps = (
            round(watts_input / line_input_v, 2)
            if line_input_v and watts_input
            else None
        )

        if line_input_v is not None:
            voltage_gauge.labels(host=host.fqdn, psu_serial=serial).set(line_input_v)
        if watts_input is not None:
            watts_gauge.labels(host=host.fqdn, psu_serial=serial).set(watts_input)
        if amps is not None:
            amps_gauge.labels(host=host.fqdn, psu_serial=serial).set(amps)

    REQUEST_LATENCY.labels(host=host.fqdn).observe(time.monotonic() - start)
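
# Example PromQL over the series exported above (illustrative):
#
#   total input power across all PSUs of one host:
#     sum by (host) (redfish_psu_power_input_watts{host="bmc01.example.net"})
#
#   hosts currently down or in cool-down:
#     redfish_up == 0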


async def logout_host(session, host: HostConfig):
    """Clean logout for Redfish hosts using session tokens"""
    if not host.session_token:
        return

    try:
        # Per the Redfish spec, a session is ended by DELETE on the session
        # resource itself (the Location header from login), not the collection.
        logout_url = host.session_location or (
            f"https://{host.fqdn}/redfish/v1/SessionService/Sessions"
        )
        if logout_url.startswith("/"):
            logout_url = f"https://{host.fqdn}{logout_url}"
        async with session.delete(
            logout_url,
            headers={"X-Auth-Token": host.session_token},
            ssl=False,
            timeout=aiohttp.ClientTimeout(total=5),
        ) as resp:
            if resp.status in (200, 204):
                logging.info("Logged out from %s", host.fqdn)
            else:
                logging.warning("Logout failed for %s (HTTP %s)", host.fqdn, resp.status)
    except Exception as e:
        logging.warning("Error during logout for %s: %s", host.fqdn, e)
    finally:
        host.session_token = None
        host.session_location = None


async def run_exporter(config, stop_event):
    """Main loop"""
    port = config.get("port", 8000)
    default_username = config.get("username")
    default_password = config.get("password")
    hosts = config["hosts"]
    interval = config.get("interval", 10)

    # Start Prometheus metrics server
    start_http_server(port)
    logging.info("Prometheus metrics server running on port %s", port)

    # Create persistent HostConfig objects (entries may be dicts or bare FQDNs)
    host_objs = []
    for host_entry in hosts:
        if isinstance(host_entry, dict):
            hc = HostConfig(
                fqdn=host_entry["fqdn"],
                username=host_entry.get("username", default_username),
                password=host_entry.get("password", default_password),
            )
        else:
            hc = HostConfig(
                fqdn=host_entry, username=default_username, password=default_password
            )
        host_objs.append(hc)

    # Connection pooling with aiohttp
    connector = aiohttp.TCPConnector(limit_per_host=5, limit=50, ttl_dns_cache=300)
    async with aiohttp.ClientSession(connector=connector) as session:
        while not stop_event.is_set():
            tasks = [get_power_data(session, hc) for hc in host_objs]
            await asyncio.gather(*tasks)
            await process_request(interval)

        # Shutdown: log out of any open Redfish sessions
        logging.info("Exporter stopping, logging out from Redfish sessions...")
        await asyncio.gather(
            *(logout_host(session, h) for h in host_objs if h.session_token)
        )

    logging.info("Exporter stopped cleanly.")


async def main():
    """Asyncio entry point"""
    parser = argparse.ArgumentParser(description="Redfish Prometheus Exporter")
    parser.add_argument("--config", default="config.yaml", help="Path to config file")
    parser.add_argument("--port", type=int, help="Override port from config file")
    parser.add_argument(
        "--interval", type=int, help="Override interval from config file"
    )
    args = parser.parse_args()

    # Load YAML config
    with open(args.config, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    # Command-line arguments override the config file
    if args.port is not None:
        config["port"] = args.port
    if args.interval is not None:
        config["interval"] = args.interval

    stop_event = asyncio.Event()
    loop = asyncio.get_running_loop()
    # Handle SIGINT (Ctrl+C) and SIGTERM (not available on Windows event loops)
    for sig in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(sig, stop_event.set)

    await run_exporter(config, stop_event)


if __name__ == "__main__":
    asyncio.run(main())
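
# Run (assuming this file is saved as redfish_exporter.py; the filename is
# hypothetical):
#
#   python redfish_exporter.py --config config.yaml --port 9100 --interval 30
#
# Metrics are then available at http://localhost:9100/metrics.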