From 5c777483fcb641599ba391bc54a5bfab4f387a5d Mon Sep 17 00:00:00 2001 From: Marco Lucarelli Date: Mon, 2 Feb 2026 12:55:32 +0100 Subject: [PATCH] feature: grouping --- python/redfish-api/README.md | 139 +++++++++---------- python/redfish-api/config.yaml.example | 1 + python/redfish-api/redfish_exporter_v9000.py | 51 ++++--- 3 files changed, 101 insertions(+), 90 deletions(-) diff --git a/python/redfish-api/README.md b/python/redfish-api/README.md index c2478fb..af88c1a 100644 --- a/python/redfish-api/README.md +++ b/python/redfish-api/README.md @@ -1,76 +1,58 @@ -# Redfish Exporter -A Python-based Prometheus exporter for collecting power data (Watts, Volts, Amperes) from bare metal servers using the Redfish API. This tool supports multiple vendors (e.g., HPE, Supermicro). +# Redfish-Exporter +A Python-based Prometheus exporter for collecting power data (Watts, Volts, Amperes) from bare metal servers using the Redfish API. This tool supports multiple vendors (e.g., HPE, Supermicro) and is designed to run cross-platform on Linux and Windows. -I've createtd this python script to collect Power data to analyse Watts, Volts and Amperes. If there is a better solution, feel free to replace me. 
- ---- - -## Table of Contents -- [Redfish Exporter](#redfish-exporter) - - [Table of Contents](#table-of-contents) - - [Description](#description) - - [Features](#features) - - [Usage](#usage) -- [Installation](#installation) - - [Requirements](#requirements) - - [Configuration](#configuration) - - [Basic Configuration](#basic-configuration) - - [Basic Configuration](#basic-configuration-1) -- [Container](#container) -- [Legacy Installation](#legacy-installation) - - [Python Dependencies](#python-dependencies) - - [Create user](#create-user) - - [Systemd Service](#systemd-service) -- [Testet on Hardware](#testet-on-hardware) -- [License](#license) - ---- - -## Description -This tool collects power metrics from servers using the Redfish API and exposes them in a format compatible with Prometheus. It supports both modern and legacy Redfish API versions and handles authentication for different vendors. - ---- +I've created this python script to collect Power data to analyse Watts, Volts and Amperes. If there is a better solution or you want more features, feel free to replace me or expand my prometheus exporter. ## Features - Collects power metrics: Watts, Volts, and Amperes. - Supports multiple vendors (HPE, Supermicro, etc.). +- Supports grouping. - Cross-platform compatibility (Linux and Windows). - Graceful error handling and retry logic. - Configurable via YAML. - Docker support. +## Metrics Overview +| Metric | Type | Description | +| ------------------------------------ | --------- | -------------------------------------------------------------- | +| redfish_up | Gauge | Status from host (1 = reachable, 0 = not reachable; labels: host, group). | +| redfish_psu_line_input_voltage_volts | Gauge | Voltage per power supply (labels: host, psu_serial, group). | +| redfish_psu_power_input_watts | Gauge | Watts per power supply (labels: host, psu_serial, group). | +| redfish_psu_input_amps | Gauge | Amperes per power supply (labels: host, psu_serial, group). 
| +| redfish_system_info | Info | System information (Vendor, Model, Serial, Redfish Version; labels: host, group). | +| redfish_request_latency_seconds | Histogram | Latency (label: host). | +| redfish_errors_total | Counter | Number of errors per host and error type (label: host, error). | + ## Usage -```bash -usage: redfish_exporter.py [-h] [--config CONFIG] [--port PORT] [--interval INTERVAL] +``` +usage: python main.py [-h] [--config CONFIG] [--port PORT] Redfish Prometheus Exporter options: - -h, --help show this help message and exit - --config CONFIG Path to config file - --port PORT Override port from config file - --interval INTERVAL Override interval from config file + -h, --help show this help message and exit + --config CONFIG Path to config file + --port PORT Override port from config file ``` -# Installation +# Install ## Requirements -Requirements: +* just (optional) +* python 3.8+ +* uv +* see `pyproject.toml` -* Python 3.8+ -* see `pyproject.toml` - -Install the dependencies using: +Install the dependencies using `uv`: ```bash -cd /srv/redfish-exporter uv sync source .venv/bin/activate uv lock --upgrade --refresh ``` ## Configuration -Create a `config.yaml` file with the following structure: +Create `config.yaml` with the following structure: ### Basic Configuration ```yaml @@ -87,7 +69,7 @@ hosts: - host4.example.net ``` -### Basic Configuration +### Advanced Configuration ```yaml --- interval: 5 @@ -95,15 +77,18 @@ port: 8000 username: user1 password: secret1 chassis: ["1"] +group: development # set default group for all hosts hosts: - fqdn: host1.example.net username: user2 password: secret2 chassis: ["0"] + group: production # use group for specific host - fqdn: host2.example.net username: user3 password: secret3 chassis: ["1"] + group: stage - fqdn: host3.example.net username: user4 password: secret4 @@ -113,30 +98,43 @@ hosts: password: secret5 ``` -The `port`, `interval` are optional and can be overwritten by argument. Save default values are hardcoded. 
+The `port`, `interval` and `group` settings are optional; `port` can also be overridden by a command-line argument. Default values are hardcoded. +### Prometheus Configuration +``` +global: + scrape_interval: 15s + evaluation_interval: 15s -# Container +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + + - job_name: "redfish_exporter" + static_configs: + - targets: ["localhost:8000"] # Adjust to your config + metrics_path: /metrics + scrape_interval: 15s +``` + +# Docker / Container To run the Redfish Exporter in a Docker container: ``` -docker buildx build -t your-tag . -docker run -it --rm --name redfish_exporter_app -p 8000:8000 your-tag:latest +docker buildx build -t redfish_exporter . +docker run -it --rm --name redfish_exporter_app -p 8000:8000 redfish_exporter:latest ``` # Legacy Installation + +## Python Dependencies ```bash mkdir /srv/redfish-exporter # or git clone https://github.com/dasbaum-ch/redfish-exporter.git /srv/redfish-exporter -``` - -## Python Dependencies -```bash cd /srv/redfish-exporter -uv sync -source .venv/bin/activate -uv lock --upgrade --refresh +uv sync --locked ``` ## Create user @@ -145,28 +143,25 @@ sudo useradd -r -s /bin/false redfish ``` ## Systemd Service - 1. Copy the systemd unit file: ```bash sudo cp redfish-exporter.service /etc/systemd/system/redfish-exporter.service ``` -1. Reload and start the service: + +2. Reload and start the service: ```bash sudo systemctl daemon-reload sudo systemctl enable --now redfish-exporter.service ``` -# Testet on Hardware - -Here some Server's that I have successfully testet: -* Supermicro - * AS -5126GS-TNRT2 - * Redfish 1.21.0 - * AS -1124US-TNRP - * Redfish 1.8.0 -* HPE - * ProLiant DL380 Gen10 - * Redfish 1.6.0 - # License -This project is licensed under the MIT License. See the LICENSE file for details. 
+ +# Tested on Hardware +Here are some servers that I have successfully tested: + +| Vendor | Model | Redfish Version | +| ---------- | -------------------- | --------------- | +| Supermicro | AS-5126GS-TNRT2 | 1.21.0 | +| | AS-1124US-TNRP | 1.8.0 | +| HPE | ProLiant DL380 Gen10 | 1.6.0 | diff --git a/python/redfish-api/config.yaml.example b/python/redfish-api/config.yaml.example index f3a4f05..07acdf8 100644 --- a/python/redfish-api/config.yaml.example +++ b/python/redfish-api/config.yaml.example @@ -4,6 +4,7 @@ port: 8000 username: gloabl-user password: global-password chassis: ["1"] # Strings, not integers! +group: production hosts: - fqdn: host1.example.com username: user1 diff --git a/python/redfish-api/redfish_exporter_v9000.py b/python/redfish-api/redfish_exporter_v9000.py index 8a93c68..c9b274d 100644 --- a/python/redfish-api/redfish_exporter_v9000.py +++ b/python/redfish-api/redfish_exporter_v9000.py @@ -56,6 +56,7 @@ class HostConfig: username: str password: str chassis: list[str] | None = None + group: str = "none" max_retries: int = 3 # 3 retires backoff: int = 2 # wait 2 seconds cool_down: int = 120 # seconds to wait after too many failures @@ -92,24 +93,28 @@ REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing requ REQUEST_LATENCY = Histogram( "redfish_request_latency_seconds", "Time for Redfish request", ["host"] ) -UP_GAUGE = Gauge("redfish_up", "Host up/down", ["host"]) +UP_GAUGE = Gauge("redfish_up", "Host up/down", ["host", "group"]) ERROR_COUNTER = Counter( "redfish_errors_total", "Total Redfish errors", ["host", "error"] ) VOLTAGE_GAUGE = Gauge( "redfish_psu_line_input_voltage_volts", "Line Input Voltage per PSU", - ["host", "psu_serial"], + ["host", "psu_serial", "group"], ) WATTS_GAUGE = Gauge( - "redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"] + "redfish_psu_power_input_watts", + "Power Input Watts per PSU", + ["host", "psu_serial", "group"], ) AMPS_GAUGE = Gauge( - 
"redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"] + "redfish_psu_input_amps", + "Current draw in Amps per PSU", + ["host", "psu_serial", "group"], ) # set info metric SYSTEM_INFO = Info( - "redfish_system_info", "System information (model, serial, etc.)", ["host"] + "redfish_system_info", "System information (model, serial, etc.)", ["host", "group"] ) @@ -170,7 +175,7 @@ async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None: logging.warning( "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time ) - UP_GAUGE.labels(host=host.fqdn).set(0) + UP_GAUGE.labels(host=host.fqdn, group=host.group).set(0) return None # Probe vendor if not already known @@ -391,7 +396,7 @@ async def get_power_data(session, host: HostConfig): logging.warning( "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time ) - UP_GAUGE.labels(host=host.fqdn).set(0) + UP_GAUGE.labels(host=host.fqdn, group=host.group).set(0) return # Start time measurement @@ -402,17 +407,17 @@ async def get_power_data(session, host: HostConfig): if not resources or not resources.chassis: logging.error("Could not discover any resources for %s", host.fqdn) host.mark_failure() - UP_GAUGE.labels(host=host.fqdn).set(0) + UP_GAUGE.labels(host=host.fqdn, group=host.group).set(0) return host.mark_success() - UP_GAUGE.labels(host=host.fqdn).set(1) + UP_GAUGE.labels(host=host.fqdn, group=host.group).set(1) chassis_url = f"https://{host.fqdn}{resources.chassis}" chassis_data = await fetch_with_retry(session, host, chassis_url) if not chassis_data: host.mark_failure() - UP_GAUGE.labels(host=host.fqdn).set(0) + UP_GAUGE.labels(host=host.fqdn, group=host.group).set(0) return for chassis_member in chassis_data.get("Members", []): @@ -498,13 +503,17 @@ async def get_power_data(session, host: HostConfig): def update_prometheus_metrics(host: HostConfig, metrics: PowerMetrics): """Update Prometheus metrics with PowerMetrics data.""" if 
metrics.voltage is not None and metrics.serial: - VOLTAGE_GAUGE.labels(host=host.fqdn, psu_serial=metrics.serial).set( - metrics.voltage - ) + VOLTAGE_GAUGE.labels( + host=host.fqdn, psu_serial=metrics.serial, group=host.group + ).set(metrics.voltage) if metrics.watts is not None and metrics.serial: - WATTS_GAUGE.labels(host=host.fqdn, psu_serial=metrics.serial).set(metrics.watts) + WATTS_GAUGE.labels( + host=host.fqdn, psu_serial=metrics.serial, group=host.group + ).set(metrics.watts) if metrics.amps is not None and metrics.serial: - AMPS_GAUGE.labels(host=host.fqdn, psu_serial=metrics.serial).set(metrics.amps) + AMPS_GAUGE.labels( + host=host.fqdn, psu_serial=metrics.serial, group=host.group + ).set(metrics.amps) async def get_system_info(session, host: HostConfig): @@ -547,7 +556,7 @@ async def get_system_info(session, host: HostConfig): serial_number = system_data.get("SerialNumber") # Hier könnte ihre Werbung stehen - SYSTEM_INFO.labels(host=host.fqdn).info( + SYSTEM_INFO.labels(host=host.fqdn, group=host.group).info( { "manufacturer": manufacturer, "model": model, @@ -587,7 +596,8 @@ async def run_exporter(config, stop_event): port = config.get("port", 8000) default_username = config.get("username") default_password = config.get("password") - default_chassis = config.get("chassis") + default_chassis = config.get("chassis", "1") + default_group = config.get("group", "none") hosts = config["hosts"] interval = config.get("interval", 10) @@ -604,10 +614,15 @@ async def run_exporter(config, stop_event): username=host_entry.get("username", default_username), password=host_entry.get("password", default_password), chassis=host_entry.get("chassis", default_chassis), + group=host_entry.get("group", default_group), ) else: hc = HostConfig( - fqdn=host_entry, username=default_username, password=default_password + fqdn=host_entry, + username=default_username, + password=default_password, + chassis=default_chassis, + group=default_group, ) host_objs.append(hc)