publish my redfish_exporter

This commit is contained in:
2025-11-13 10:29:29 +01:00
parent 581df6617b
commit 5e68842356
8 changed files with 525 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
FROM python:3
EXPOSE 8000
WORKDIR /usr/src/app
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt
COPY redfish_exporter.py .
COPY config.yaml .
CMD [ "python", "./redfish_exporter.py" ]

View File

@@ -0,0 +1,117 @@
# Description
I've created this Python script to collect power data and analyse watts, volts and amperes. If there is a better solution, feel free to replace it.
Usage:
```
usage: redfish_exporter.py [-h] [--config CONFIG] [--port PORT]
                           [--interval INTERVAL]

Redfish Prometheus Exporter

options:
  -h, --help           show this help message and exit
  --config CONFIG      Path to config file
  --port PORT          Override port from config file
  --interval INTERVAL  Override interval from config file
```
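For example, to read a non-default config file and expose metrics on port 9100 (the path and port here are illustrative):
```bash
python redfish_exporter.py --config /etc/redfish-exporter/config.yaml --port 9100
```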
# Install
## Requirements
Dependencies:
* see requirements.txt
## Configuration
Create `config.yaml`:
```yaml
---
interval: 5
port: 8000
username: user1
password: secret
hosts:
- srv1-112.mgmt.wtb1.ch.abainfra.net
- srv2-112.mgmt.wtb1.ch.abainfra.net
- srv3-112.mgmt.wtb1.ch.abainfra.net
- srv4-112.mgmt.wtb1.ch.abainfra.net
```
or, with per-host credentials that override the global ones:
```yaml
---
interval: 5
port: 8000
username: user1
password: secret1
hosts:
  - fqdn: srv1-112.mgmt.wtb1.ch.abainfra.net
    username: user2
    password: secret2
  - fqdn: srv2-112.mgmt.wtb1.ch.abainfra.net
    username: user3
    password: secret3
  - fqdn: srv3-112.mgmt.wtb1.ch.abainfra.net
    username: user4
    password: secret4
  - fqdn: srv4-112.mgmt.wtb1.ch.abainfra.net
    username: user5
    password: secret5
```
`port` and `interval` are optional and can be overridden via command-line arguments; safe default values are hardcoded. Per-host `username`/`password` entries take precedence over the global credentials.
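A minimal Prometheus scrape job for this exporter could look like this (a sketch; it assumes the exporter is reachable at `localhost:8000`):
```yaml
scrape_configs:
  - job_name: redfish
    scrape_interval: 30s
    static_configs:
      - targets: ["localhost:8000"]
```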
# Use as Container
```bash
docker build -t redfish_exporter .
docker run -it --rm --name redfish_exporter_app -p 8000:8000 redfish_exporter:latest
```
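The image bakes `config.yaml` into the build (see the Dockerfile). To change the configuration without rebuilding, the file can instead be bind-mounted over the baked-in copy, for example:
```bash
docker run -it --rm --name redfish_exporter_app -p 8000:8000 \
  -v "$(pwd)/config.yaml:/usr/src/app/config.yaml:ro" \
  redfish_exporter:latest
```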
# Legacy way
Create the installation directory and place the project files there:
```bash
mkdir /srv/redfish-exporter
```
## Python dependencies
```bash
cd /srv/redfish-exporter
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
## Create user
```bash
sudo useradd -r -s /bin/false redfish
```
## Install systemd unit file
```bash
sudo cp redfish-exporter.service /etc/systemd/system/redfish-exporter.service
sudo systemctl daemon-reload
sudo systemctl enable --now redfish-exporter.service
```
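To check that the service is up and serving metrics:
```bash
systemctl status redfish-exporter.service
curl -s localhost:8000/metrics | grep '^redfish_'
```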
# Useful one-liners
## Public IP with curl
```bash
curl icanhazip.com
curl -4 icanhazip.com
curl -6 icanhazip.com
curl 'https://api.ipify.org?format=json'
curl 'https://api64.ipify.org?format=json'
```

View File

@@ -0,0 +1,14 @@
---
interval: 10
port: 8000
username: global-user
password: global-password
hosts:
  - fqdn: host1.example.com
    username: user1
    password: secret1
  - fqdn: host2.example.com
    username: user2
    password: secret2
  - fqdn: host3.example.com
  - fqdn: host4.example.com

View File

@@ -0,0 +1,35 @@
import requests
import urllib3

# import sys

urllib3.disable_warnings()

username = "<user>"
password = "<password>"
# host = sys.argv[1]


def get_power_data(host):
    """Redfish API Chassis Power"""
    url = f"https://{host}.mgmt.wtb1.ch.abainfra.net/redfish/v1/Chassis/1/Power"
    response = requests.get(url, auth=(username, password), verify=False, timeout=10)
    if response.status_code == 200:
        data = response.json()
        for idx, psu in enumerate(data.get("PowerSupplies", [])):
            line_input_v = psu.get("LineInputVoltage")
            watts_input = psu.get("PowerInputWatts")
            serial = psu.get("SerialNumber")
            # guard against missing readings before dividing
            amps = (
                round(watts_input / line_input_v, 2)
                if line_input_v and watts_input
                else None
            )
            print(f"PSU {idx}, {serial}: {host}, {line_input_v} V, {watts_input} W, {amps} A")
    else:
        print(f"Error {response.status_code}: {response.text}")


# loop over each host
hosts = [
    "srv1-112",
]

for host in hosts:
    get_power_data(host)
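# Example output (illustrative values; the format follows the print above):
#   PSU 0, ABC123XYZ: srv1-112, 230 V, 184 W, 0.8 A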

View File

@@ -0,0 +1,14 @@
[Unit]
Description=Redfish Prometheus Exporter
After=network.target

[Service]
Type=simple
WorkingDirectory=/srv/redfish-exporter
ExecStart=/srv/redfish-exporter/venv/bin/python redfish_exporter.py
Restart=on-failure
User=redfish
Group=redfish

[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,235 @@
"""Simple Redfish exporter to collect Power data from bare matel server"""
import argparse
import signal
import time
import logging
from dataclasses import dataclass, field
import asyncio
import aiohttp
import urllib3
import yaml
from prometheus_client import Gauge, start_http_server, Summary, Counter, Histogram
@dataclass
class HostConfig:
"""Solve too many arguments"""
fqdn: str
username: str
password: str
max_retries: int = 1
backoff: int = 2
cool_down: int = 120 # seconds to wait after too many failures
failures: int = 0
next_retry_time: float = field(default=0.0, init=False)
def should_skip(self) -> bool:
"""Check if host is still in cool-down window"""
return time.monotonic() < self.next_retry_time
def mark_failure(self):
"""Increase failure counter and maybe trigger cool-down"""
self.failures += 1
if self.failures >= self.max_retries:
self.next_retry_time = time.monotonic() + self.cool_down
self.failures = 0 # reset after triggering cool-down
def mark_success(self):
"""Reset failure counter after a successful request"""
self.failures = 0
self.next_retry_time = 0.0
# Disable certificate warnings
urllib3.disable_warnings()
# set log config
logging.basicConfig(
level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
)
# Prometheus metrics
REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing request")
REQUEST_LATENCY = Histogram(
"redfish_request_latency_seconds", "Time for Redfish request", ["host"]
)
up_gauge = Gauge("redfish_up", "Host up/down", ["host"])
error_counter = Counter(
"redfish_errors_total", "Total Redfish errors", ["host", "error"]
)
voltage_gauge = Gauge(
"redfish_psu_line_input_voltage_volts",
"Line Input Voltage per PSU",
["host", "psu_serial"],
)
watts_gauge = Gauge(
"redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"]
)
amps_gauge = Gauge(
"redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"]
)
@REQUEST_TIME.time()
async def process_request(t):
"""Simulate request time"""
await asyncio.sleep(t)
async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None:
"""Fetch JSON from Redfish with retry/backoff"""
if host.should_skip():
logging.warning(
"Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
)
up_gauge.labels(host=host.fqdn).set(0)
return None
for attempt in range(1, host.max_retries + 1):
try:
async with session.get(
url,
auth=aiohttp.BasicAuth(host.username, host.password),
ssl=False,
timeout=10,
) as resp:
if resp.status == 200:
host.mark_success()
return await resp.json()
logging.warning(
"HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
)
except asyncio.TimeoutError:
logging.warning("Timeout on %s (attempt %d)", host.fqdn, attempt)
except aiohttp.ClientError as e:
logging.warning(
"Client error on %s (attempt %d): %s", host.fqdn, attempt, e
)
except Exception as e:
logging.exception(
"Unexpected error on %s (attempt %d): %s", host.fqdn, attempt, e
)
if attempt < host.max_retries:
await asyncio.sleep(host.backoff * attempt)
else:
host.mark_failure()
logging.error("All retries failed for %s", host.fqdn)
return None
async def get_power_data(session, host: HostConfig):
"""Query Redfish and update Prometheus metrics"""
if host.should_skip():
logging.warning(
"Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
)
up_gauge.labels(host=host.fqdn).set(0)
return
url = f"https://{host.fqdn}/redfish/v1/Chassis/1/Power"
start = time.monotonic()
data = await fetch_with_retry(session, host, url)
if not data:
host.mark_failure()
up_gauge.labels(host=host.fqdn).set(0)
return
host.mark_success()
up_gauge.labels(host=host.fqdn).set(1)
for psu in data.get("PowerSupplies", []):
line_input_v = psu.get("LineInputVoltage")
watts_input = psu.get("PowerInputWatts")
serial = psu.get("SerialNumber")
amps = (
round(watts_input / line_input_v, 2)
if line_input_v and watts_input
else None
)
if line_input_v is not None:
voltage_gauge.labels(host=host.fqdn, psu_serial=serial).set(line_input_v)
if watts_input is not None:
watts_gauge.labels(host=host.fqdn, psu_serial=serial).set(watts_input)
if amps is not None:
amps_gauge.labels(host=host.fqdn, psu_serial=serial).set(amps)
REQUEST_LATENCY.labels(host=host.fqdn).observe(time.monotonic() - start)
async def run_exporter(config, stop_event):
"""Main loop"""
port = config.get("port", 8000)
default_username = config.get("username")
default_password = config.get("password")
hosts = config["hosts"]
interval = config.get("interval", 10)
# Start Prometheus metrics server
start_http_server(port)
logging.info("Prometheus metrics server running on port %s", port)
# create persistent HostConfig objects
host_objs = []
for host_entry in hosts:
if isinstance(host_entry, dict):
hc = HostConfig(
fqdn=host_entry["fqdn"],
username=host_entry.get("username", default_username),
password=host_entry.get("password", default_password),
)
else:
hc = HostConfig(
fqdn=host_entry, username=default_username, password=default_password
)
host_objs.append(hc)
# Connection pooling with aiohttp
connector = aiohttp.TCPConnector(limit_per_host=5, limit=50, ttl_dns_cache=300)
async with aiohttp.ClientSession(connector=connector) as session:
while not stop_event.is_set():
tasks = [get_power_data(session, hc) for hc in host_objs]
await asyncio.gather(*tasks)
await process_request(interval)
logging.info("Exporter stopped cleanly.")
async def main():
"""Modern asyncio entry point"""
parser = argparse.ArgumentParser(description="Redfish Prometheus Exporter")
parser.add_argument("--config", default="config.yaml", help="Path to config file")
parser.add_argument("--port", type=int, help="Override port from config file")
parser.add_argument(
"--interval", type=int, help="Override interval from config file"
)
args = parser.parse_args()
# Load YAML config
with open(args.config, "r", encoding="utf-8") as file:
config = yaml.safe_load(file)
# Override port if argument is provided
if args.port is not None:
config["port"] = args.port
if args.interval is not None:
config["interval"] = args.interval
stop_event = asyncio.Event()
loop = asyncio.get_running_loop()
# Handle SIGINT (Ctrl+C) and SIGTERM
for sig in (signal.SIGINT, signal.SIGTERM):
loop.add_signal_handler(sig, stop_event.set)
await run_exporter(config, stop_event)
if __name__ == "__main__":
asyncio.run(main())
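# Example exposition on :8000/metrics (metric names as defined above; host and
# values are illustrative):
#   redfish_up{host="srv1-112.mgmt.wtb1.ch.abainfra.net"} 1.0
#   redfish_psu_power_input_watts{host="srv1-112.mgmt.wtb1.ch.abainfra.net",psu_serial="ABC123XYZ"} 184.0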

View File

@@ -0,0 +1,91 @@
from prometheus_client import Gauge, start_http_server, Summary
import requests
import urllib3
import random
import time
from concurrent.futures import ThreadPoolExecutor

urllib3.disable_warnings()

username = "<user>"
password = "<password>"

# Create a metric to track time spent and requests made.
REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing request")


# Decorate function with metric.
@REQUEST_TIME.time()
def process_request(t):
    """A dummy function that takes some time."""
    time.sleep(t)


# Define Prometheus metrics
voltage_gauge = Gauge(
    "redfish_psu_line_input_voltage_volts",
    "Line Input Voltage per PSU",
    ["host", "psu_serial"],
)
watts_gauge = Gauge(
    "redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"]
)
amps_gauge = Gauge(
    "redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"]
)


def get_power_data(fqdn):
    """Redfish API Chassis Power"""
    url = f"https://{fqdn}/redfish/v1/Chassis/1/Power"
    try:
        response = requests.get(
            url, auth=(username, password), verify=False, timeout=10
        )
        response.raise_for_status()
        data = response.json()
        for psu in data.get("PowerSupplies", []):
            line_input_v = psu.get("LineInputVoltage")
            watts_input = psu.get("PowerInputWatts")
            serial = psu.get("SerialNumber")
            if line_input_v and watts_input:
                amps = round(watts_input / line_input_v, 2)
            else:
                amps = None
            # Push metrics
            if line_input_v is not None:
                voltage_gauge.labels(host=fqdn, psu_serial=serial).set(line_input_v)
            if watts_input is not None:
                watts_gauge.labels(host=fqdn, psu_serial=serial).set(watts_input)
            if amps is not None:
                amps_gauge.labels(host=fqdn, psu_serial=serial).set(amps)
    except Exception as e:
        print(f"Error querying {url}: {e}")


if __name__ == "__main__":
    # Start metrics server on port 8000
    start_http_server(8000)
    hosts = [
        "srv1-119.mgmt.sgg1.ch.abainfra.net",
    ]
    # Thread pool for parallel requests
    executor = ThreadPoolExecutor(max_workers=len(hosts))
    while True:
        futures = [executor.submit(get_power_data, fqdn) for fqdn in hosts]
        # wait for all to finish
        for future in futures:
            future.result()
        # for fqdn in hosts:
        #     get_power_data(fqdn)
        # Fixed scrape interval:
        # time.sleep(30)  # scrape interval
        # Random sleep between 0 and 5 seconds (inclusive):
        # sleep_time = random.uniform(0, 5)
        # process_request(sleep_time)
        # Random sleep between 0 and 1 seconds:
        process_request(random.random())

View File

@@ -0,0 +1,6 @@
prometheus-client==0.23.1
requests==2.32.5
urllib3==2.5.0
aiohttp==3.12.15
PyYAML==6.0.2