publish my redfish_exporter
This commit is contained in:
13
python/redfish-api/Dockerfile
Normal file
13
python/redfish-api/Dockerfile
Normal file
@@ -0,0 +1,13 @@
|
||||
# Official Python 3 base image (Debian-based).
FROM python:3

# The exporter serves Prometheus metrics on this port.
EXPOSE 8000

WORKDIR /usr/src/app

# Install dependencies first so this layer is cached across code-only changes.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Application code and its configuration.
COPY redfish_exporter.py .
COPY config.yaml .

CMD [ "python", "./redfish_exporter.py" ]
|
||||
117
python/redfish-api/README.md
Normal file
117
python/redfish-api/README.md
Normal file
@@ -0,0 +1,117 @@
|
||||
# Description
|
||||
|
||||
I've created this Python script to collect Power data to analyse Watts, Volts and Amperes. If there is a better solution, feel free to replace me.
|
||||
|
||||
Usage:
|
||||
|
||||
```
|
||||
usage: redfish_exporter.py [-h] [--config CONFIG] [--port PORT]
|
||||
|
||||
Redfish Prometheus Exporter
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG Path to config file
|
||||
--port PORT Override port from config file
|
||||
```
|
||||
|
||||
|
||||
# Install
|
||||
|
||||
## Requirements
|
||||
|
||||
Dependencies:
|
||||
|
||||
* see requirements.txt
|
||||
|
||||
## Configuration
|
||||
|
||||
Create `config.yaml`:
|
||||
|
||||
```yaml
|
||||
---
|
||||
interval: 5
|
||||
port: 8000
|
||||
username: user1
|
||||
password: secret
|
||||
hosts:
|
||||
- srv1-112.mgmt.wtb1.ch.abainfra.net
|
||||
- srv2-112.mgmt.wtb1.ch.abainfra.net
|
||||
- srv3-112.mgmt.wtb1.ch.abainfra.net
|
||||
- srv4-112.mgmt.wtb1.ch.abainfra.net
|
||||
```
|
||||
|
||||
or:
|
||||
|
||||
```yaml
|
||||
---
|
||||
interval: 5
|
||||
port: 8000
|
||||
username: user1
|
||||
password: secret1
|
||||
hosts:
|
||||
- fqdn: srv1-112.mgmt.wtb1.ch.abainfra.net
|
||||
username: user2
|
||||
password: secret2
|
||||
- fqdn: srv2-112.mgmt.wtb1.ch.abainfra.net
|
||||
username: user3
|
||||
password: secret3
|
||||
- fqdn: srv3-112.mgmt.wtb1.ch.abainfra.net
|
||||
username: user4
|
||||
password: secret4
|
||||
- fqdn: srv4-112.mgmt.wtb1.ch.abainfra.net
|
||||
username: user5
|
||||
password: secret5
|
||||
```
|
||||
|
||||
The `port` and `interval` settings are optional and can be overridden by command-line arguments. Safe default values are hardcoded.
|
||||
|
||||
|
||||
# Use as Container
|
||||
|
||||
```
|
||||
docker build -t redfish_exporter .
|
||||
docker run -it --rm --name redfish_exporter_app -p 8000:8000 redfish_exporter:latest
|
||||
```
|
||||
|
||||
# Legacy way
|
||||
|
||||
```bash
|
||||
mkdir /srv/redfish-exporter
|
||||
```
|
||||
|
||||
## Python dependencies
|
||||
|
||||
```bash
|
||||
cd /srv/redfish-exporter
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Create user
|
||||
|
||||
```bash
|
||||
sudo useradd -r -s /bin/false redfish
|
||||
```
|
||||
|
||||
## Install systemd unit file
|
||||
|
||||
```bash
|
||||
sudo cp redfish-exporter.service /etc/systemd/system/redfish-exporter.service
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now redfish-exporter.service
|
||||
```
|
||||
|
||||
# Useful one-liners
|
||||
|
||||
## public IP with curl
|
||||
|
||||
```bash
|
||||
curl icanhazip.com
|
||||
curl -4 icanhazip.com
|
||||
curl -6 icanhazip.com
|
||||
|
||||
curl 'https://api.ipify.org?format=json'
|
||||
curl 'https://api64.ipify.org?format=json'
|
||||
```
|
||||
14
python/redfish-api/config.yaml.example
Normal file
14
python/redfish-api/config.yaml.example
Normal file
@@ -0,0 +1,14 @@
|
||||
---
|
||||
interval: 10
|
||||
port: 8000
|
||||
username: global-user
|
||||
password: global-password
|
||||
hosts:
|
||||
- fqdn: host1.example.com
|
||||
username: user1
|
||||
password: secret1
|
||||
- fqdn: host2.example.com
|
||||
username: user2
|
||||
password: secret2
|
||||
- fqdn: host3.example.com
|
||||
- fqdn: host4.example.com
|
||||
35
python/redfish-api/get_power_redfishapi.py
Normal file
35
python/redfish-api/get_power_redfishapi.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import requests
|
||||
import urllib3
|
||||
# import sys
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
username = "<user>"
|
||||
password = "<password>"
|
||||
# host = sys.argv[1]
|
||||
|
||||
|
||||
def get_power_data(host):
    """Print per-PSU line voltage, input watts and derived amps for *host*.

    Args:
        host: Short hostname; the management-network domain suffix is
            appended to build the Redfish URL.
    """
    url = f"https://{host}.mgmt.wtb1.ch.abainfra.net/redfish/v1/Chassis/1/Power"
    # timeout prevents the script from hanging forever on an unresponsive BMC.
    response = requests.get(url, auth=(username, password), verify=False, timeout=10)

    if response.status_code == 200:
        data = response.json()
        for idx, psu in enumerate(data.get("PowerSupplies", [])):
            line_input_v = psu.get("LineInputVoltage")
            watts_input = psu.get("PowerInputWatts")
            serial = psu.get("SerialNumber")
            # Guard against missing readings (None) and 0 V, which would
            # raise TypeError / ZeroDivisionError in the division below.
            amps = (
                round(watts_input / line_input_v, 2)
                if line_input_v and watts_input
                else "n/a"
            )
            print(
                f"PSU {idx}, {serial}: {host}, {line_input_v} V, {watts_input} W, {amps} A"
            )
    else:
        print(f"Error {response.status_code}: {response.text}")
|
||||
|
||||
|
||||
# Query every configured host once and print its PSU power readings.
hosts = [
    "srv1-112",
]
for host in hosts:
    get_power_data(host)
|
||||
14
python/redfish-api/redfish-exporter.service
Normal file
14
python/redfish-api/redfish-exporter.service
Normal file
@@ -0,0 +1,14 @@
|
||||
[Unit]
Description=Redfish Prometheus Exporter
# Start only once the network stack is up.
After=network.target

[Service]
Type=simple
WorkingDirectory=/srv/redfish-exporter
# Run the exporter from its project virtualenv.
ExecStart=/srv/redfish-exporter/venv/bin/python redfish_exporter.py
Restart=on-failure
# Unprivileged service account (created during install, see README).
User=redfish
Group=redfish

[Install]
WantedBy=multi-user.target
|
||||
235
python/redfish-api/redfish_exporter.py
Normal file
235
python/redfish-api/redfish_exporter.py
Normal file
@@ -0,0 +1,235 @@
|
||||
"""Simple Redfish exporter to collect Power data from bare matel server"""
|
||||
|
||||
import argparse
|
||||
import signal
|
||||
import time
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import urllib3
|
||||
import yaml
|
||||
from prometheus_client import Gauge, start_http_server, Summary, Counter, Histogram
|
||||
|
||||
|
||||
@dataclass
class HostConfig:
    """Per-host connection settings plus simple failure/cool-down tracking."""

    fqdn: str
    username: str
    password: str
    max_retries: int = 1
    backoff: int = 2
    cool_down: int = 120  # seconds to wait after too many failures
    failures: int = 0
    next_retry_time: float = field(default=0.0, init=False)

    def should_skip(self) -> bool:
        """Return True while this host is inside its cool-down window."""
        now = time.monotonic()
        return now < self.next_retry_time

    def mark_failure(self):
        """Record one failure; open a cool-down once the retry budget is spent."""
        self.failures += 1
        if self.failures < self.max_retries:
            return
        # Budget exhausted: start the cool-down window and restart the count.
        self.next_retry_time = time.monotonic() + self.cool_down
        self.failures = 0

    def mark_success(self):
        """Clear all failure state after a successful request."""
        self.failures = 0
        self.next_retry_time = 0.0
|
||||
|
||||
|
||||
# BMC endpoints typically use self-signed certs; silence urllib3's warnings.
urllib3.disable_warnings()
# Root logger: timestamped, level-tagged lines to stderr.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
)

# Prometheus metrics (registered once at import time).
# Summary of the inter-scrape sleep, recorded via the process_request decorator.
REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing request")
# End-to-end time of one host scrape, labelled by host.
REQUEST_LATENCY = Histogram(
    "redfish_request_latency_seconds", "Time for Redfish request", ["host"]
)
# 1 when the last scrape of the host succeeded, 0 otherwise.
up_gauge = Gauge("redfish_up", "Host up/down", ["host"])
# Error counts partitioned by host and error kind.
error_counter = Counter(
    "redfish_errors_total", "Total Redfish errors", ["host", "error"]
)
# Per-PSU electrical readings, labelled by host and PSU serial number.
voltage_gauge = Gauge(
    "redfish_psu_line_input_voltage_volts",
    "Line Input Voltage per PSU",
    ["host", "psu_serial"],
)
watts_gauge = Gauge(
    "redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"]
)
amps_gauge = Gauge(
    "redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"]
)
|
||||
|
||||
|
||||
@REQUEST_TIME.time()
async def process_request(t):
    """Sleep for *t* seconds, recording the duration in REQUEST_TIME.

    Used as the inter-scrape pause so the wait shows up in the exporter's
    own request-processing metric.
    """
    await asyncio.sleep(t)
|
||||
|
||||
|
||||
async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None:
    """Fetch JSON from a Redfish endpoint with retry, backoff and cool-down.

    Args:
        session: Shared aiohttp.ClientSession (connection-pooled).
        host: Per-host credentials and failure-tracking state; mutated via
            mark_success()/mark_failure().
        url: Full Redfish URL to GET.

    Returns:
        Parsed JSON dict on HTTP 200, otherwise None (after marking the
        host as failed once all retries are exhausted).
    """
    if host.should_skip():
        logging.warning(
            "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
        )
        up_gauge.labels(host=host.fqdn).set(0)
        return None

    # aiohttp expects a ClientTimeout object; passing a bare number to
    # session.get(timeout=...) is deprecated in aiohttp 3.x.
    timeout = aiohttp.ClientTimeout(total=10)

    for attempt in range(1, host.max_retries + 1):
        try:
            async with session.get(
                url,
                auth=aiohttp.BasicAuth(host.username, host.password),
                ssl=False,  # BMC certs are usually self-signed
                timeout=timeout,
            ) as resp:
                if resp.status == 200:
                    host.mark_success()
                    return await resp.json()
                logging.warning(
                    "HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
                )
                error_counter.labels(host=host.fqdn, error=f"http_{resp.status}").inc()

        except asyncio.TimeoutError:
            logging.warning("Timeout on %s (attempt %d)", host.fqdn, attempt)
            error_counter.labels(host=host.fqdn, error="timeout").inc()
        except aiohttp.ClientError as e:
            logging.warning(
                "Client error on %s (attempt %d): %s", host.fqdn, attempt, e
            )
            error_counter.labels(host=host.fqdn, error="client_error").inc()
        except Exception as e:
            logging.exception(
                "Unexpected error on %s (attempt %d): %s", host.fqdn, attempt, e
            )
            error_counter.labels(host=host.fqdn, error="unexpected").inc()

        if attempt < host.max_retries:
            await asyncio.sleep(host.backoff * attempt)
        else:
            host.mark_failure()
            logging.error("All retries failed for %s", host.fqdn)

    return None
|
||||
|
||||
|
||||
async def get_power_data(session, host: HostConfig):
    """Query the Redfish Power endpoint for one host and update the gauges.

    fetch_with_retry() owns the cool-down skip, the retries and the
    mark_success()/mark_failure() bookkeeping on *host*; this function only
    translates the payload into Prometheus metrics. (The original code
    repeated those calls here, double-counting failures and extending the
    cool-down window.)
    """
    url = f"https://{host.fqdn}/redfish/v1/Chassis/1/Power"
    start = time.monotonic()

    data = await fetch_with_retry(session, host, url)
    # None means skipped (cool-down) or failed after retries. Compare with
    # `is None` so an empty-but-valid JSON payload is not treated as failure.
    if data is None:
        up_gauge.labels(host=host.fqdn).set(0)
        return

    up_gauge.labels(host=host.fqdn).set(1)

    for psu in data.get("PowerSupplies", []):
        line_input_v = psu.get("LineInputVoltage")
        watts_input = psu.get("PowerInputWatts")
        serial = psu.get("SerialNumber")

        # Amps derived from W / V; guard against missing readings and 0 V.
        amps = (
            round(watts_input / line_input_v, 2)
            if line_input_v and watts_input
            else None
        )

        if line_input_v is not None:
            voltage_gauge.labels(host=host.fqdn, psu_serial=serial).set(line_input_v)
        if watts_input is not None:
            watts_gauge.labels(host=host.fqdn, psu_serial=serial).set(watts_input)
        if amps is not None:
            amps_gauge.labels(host=host.fqdn, psu_serial=serial).set(amps)

    REQUEST_LATENCY.labels(host=host.fqdn).observe(time.monotonic() - start)
|
||||
|
||||
|
||||
async def run_exporter(config, stop_event):
    """Scrape every configured host in a loop until *stop_event* is set."""
    port = config.get("port", 8000)
    default_username = config.get("username")
    default_password = config.get("password")
    hosts = config["hosts"]
    interval = config.get("interval", 10)

    # Expose /metrics before the first scrape round.
    start_http_server(port)
    logging.info("Prometheus metrics server running on port %s", port)

    def build_host(entry):
        # Entries are either a bare FQDN string or a mapping with optional
        # per-host credentials that override the global ones.
        if isinstance(entry, dict):
            return HostConfig(
                fqdn=entry["fqdn"],
                username=entry.get("username", default_username),
                password=entry.get("password", default_password),
            )
        return HostConfig(
            fqdn=entry, username=default_username, password=default_password
        )

    host_objs = [build_host(entry) for entry in hosts]

    # One pooled session for all hosts; caps per-host and total connections
    # and caches DNS answers for five minutes.
    connector = aiohttp.TCPConnector(limit_per_host=5, limit=50, ttl_dns_cache=300)
    async with aiohttp.ClientSession(connector=connector) as session:
        while not stop_event.is_set():
            await asyncio.gather(*(get_power_data(session, hc) for hc in host_objs))
            await process_request(interval)

    logging.info("Exporter stopped cleanly.")
|
||||
|
||||
|
||||
async def main():
    """Parse CLI arguments, load the config, wire signals, run the exporter."""
    parser = argparse.ArgumentParser(description="Redfish Prometheus Exporter")
    parser.add_argument("--config", default="config.yaml", help="Path to config file")
    parser.add_argument("--port", type=int, help="Override port from config file")
    parser.add_argument(
        "--interval", type=int, help="Override interval from config file"
    )
    args = parser.parse_args()

    # Load YAML config
    with open(args.config, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    # CLI flags win over values from the config file.
    for key in ("port", "interval"):
        override = getattr(args, key)
        if override is not None:
            config[key] = override

    stop_event = asyncio.Event()
    loop = asyncio.get_running_loop()
    # Stop gracefully on Ctrl+C (SIGINT) or systemd stop (SIGTERM).
    loop.add_signal_handler(signal.SIGINT, stop_event.set)
    loop.add_signal_handler(signal.SIGTERM, stop_event.set)

    await run_exporter(config, stop_event)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: asyncio.run() creates the event loop and drives main().
    asyncio.run(main())
|
||||
91
python/redfish-api/redfish_exporter_simple.py
Normal file
91
python/redfish-api/redfish_exporter_simple.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from prometheus_client import Gauge, start_http_server, Summary
|
||||
import requests
|
||||
import urllib3
|
||||
import random
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
username = "<user>"
|
||||
password = "<password>"
|
||||
|
||||
# Summary metric tracking how long each simulated request takes.
REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing request")


# The decorator times every call and feeds the duration into REQUEST_TIME.
@REQUEST_TIME.time()
def process_request(t):
    """Sleep for *t* seconds, recording the duration in REQUEST_TIME."""
    time.sleep(t)
|
||||
|
||||
|
||||
# Prometheus gauges for per-PSU electrical readings, labelled by host FQDN
# and PSU serial number.
voltage_gauge = Gauge(
    "redfish_psu_line_input_voltage_volts",
    "Line Input Voltage per PSU",
    ["host", "psu_serial"],
)
watts_gauge = Gauge(
    "redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"]
)
amps_gauge = Gauge(
    "redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"]
)
|
||||
|
||||
|
||||
def get_power_data(fqdn):
    """Scrape the Redfish Chassis Power endpoint of *fqdn* into the gauges.

    Args:
        fqdn: Fully-qualified hostname of the BMC to query.

    Network/HTTP failures and malformed JSON are logged and swallowed so a
    single bad host does not stop the scrape loop; programming errors are
    no longer hidden by a blanket `except Exception`.
    """
    url = f"https://{fqdn}/redfish/v1/Chassis/1/Power"
    try:
        response = requests.get(
            url, auth=(username, password), verify=False, timeout=10
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Error querying {url}: {e}")
        return

    for psu in data.get("PowerSupplies", []):
        line_input_v = psu.get("LineInputVoltage")
        watts_input = psu.get("PowerInputWatts")
        serial = psu.get("SerialNumber")

        # Amps = W / V, only when both readings are present and non-zero.
        if line_input_v and watts_input:
            amps = round(watts_input / line_input_v, 2)
        else:
            amps = None

        # Push metrics; skip any reading the BMC did not report.
        if line_input_v is not None:
            voltage_gauge.labels(host=fqdn, psu_serial=serial).set(line_input_v)
        if watts_input is not None:
            watts_gauge.labels(host=fqdn, psu_serial=serial).set(watts_input)
        if amps is not None:
            amps_gauge.labels(host=fqdn, psu_serial=serial).set(amps)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Serve Prometheus metrics on port 8000.
    start_http_server(8000)

    hosts = [
        "srv1-119.mgmt.sgg1.ch.abainfra.net",
    ]

    # One worker per host so every BMC is polled in parallel.
    executor = ThreadPoolExecutor(max_workers=len(hosts))

    while True:
        # Fan out one scrape per host, then block until all have finished.
        pending = [executor.submit(get_power_data, fqdn) for fqdn in hosts]
        for future in pending:
            future.result()
        # Pause 0-1 s between rounds, timed by the REQUEST_TIME summary.
        process_request(random.random())
|
||||
6
python/redfish-api/requirements.txt
Normal file
6
python/redfish-api/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
prometheus-client==0.23.1
|
||||
requests==2.32.5
|
||||
urllib3==2.5.0
|
||||
aiohttp==3.12.15
|
||||
# NOTE: asyncio is part of the Python standard library; the PyPI "asyncio" package is an obsolete mirror and must not be installed.
|
||||
PyYAML==6.0.2
|
||||
Reference in New Issue
Block a user