Compare commits
9 Commits
9bedf0c799
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
991f031ce5
|
|||
|
8b19633a84
|
|||
|
b68889e869
|
|||
|
5c777483fc
|
|||
|
bdac561e86
|
|||
|
50e8376937
|
|||
|
361e75e4f3
|
|||
|
b1db6212a0
|
|||
|
d8fc5cd8b8
|
21
python/redfish-api/LICENSE
Normal file
21
python/redfish-api/LICENSE
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2026 dasBaum
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
@@ -1,117 +1,169 @@
|
|||||||
# Description
|
# Redfish-Exporter
|
||||||
|
A Python-based Prometheus exporter for collecting power data (Watts, Volts, Amperes) from bare metal servers using the Redfish API. This tool supports multiple vendors (e.g., HPE, Supermicro) and is designed to run cross-platform on Linux and Windows.
|
||||||
|
|
||||||
I've createtd this python script to collect Power data to analyse Watts, Volts and Amperes. If there is a better solution, feel free to replace me.
|
I've created this Python script to collect power data to analyse Watts, Volts and Amperes. If there is a better solution or you want more features, feel free to replace it or expand my Prometheus exporter.
|
||||||
|
|
||||||
Usage:
|
## Features
|
||||||
|
- Collects power metrics: Watts, Volts, and Amperes.
|
||||||
|
- Supports multiple vendors (HPE, Supermicro, etc.).
|
||||||
|
- Supports grouping.
|
||||||
|
- Cross-platform compatibility (Linux and Windows).
|
||||||
|
- Graceful error handling and retry logic.
|
||||||
|
- Configurable via YAML.
|
||||||
|
- Docker support.
|
||||||
|
|
||||||
|
## Metrics Overview
|
||||||
|
| Metric | Typ | Description |
|
||||||
|
|---------------------------------|-----------|----------------------------------------------------------------|
|
||||||
|
| redfish_up | Gauge | Status from host (1 = reachable, 0 = not reachable). |
|
||||||
|
| redfish_psu_input_voltage | Gauge | Voltages per powersupply (label: host, psu_serial). |
|
||||||
|
| redfish_psu_input_watts | Gauge | Watts per powersupply (label: host, psu_serial). |
|
||||||
|
| redfish_psu_input_amps | Gauge | Amperes per powersupply (label: host, psu_serial). |
|
||||||
|
| redfish_system_info | Info | Systeminformation (Vendor, Model, Serial, Redfish Version). |
|
||||||
|
| redfish_request_latency_seconds | Histogram | Latency (label: host). |
|
||||||
|
| redfish_errors_total | Counter | Number of errors per host and error type (label: host, error). |
|
||||||
|
|
||||||
|
## Usage
|
||||||
```
|
```
|
||||||
usage: redfish_exporter.py [-h] [--config CONFIG] [--port PORT]
|
usage: python main.py [-h] [--config CONFIG] [--port PORT]
|
||||||
|
|
||||||
Redfish Prometheus Exporter
|
Redfish Prometheus Exporter
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
--config CONFIG Path to config file
|
--config CONFIG Path to config file
|
||||||
--port PORT Override port from config file
|
--port PORT Override port from config file
|
||||||
|
--interval INTERVAL Override interval from config file
|
||||||
|
--show-deprecated Enable deprecated warnings in log
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
# Install
|
# Install
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
* just (optional)
|
||||||
|
* python 3.8+
|
||||||
|
* uv
|
||||||
|
* see `pyproject.toml`
|
||||||
|
|
||||||
Dependencies:
|
Install the dependencies using `uv`:
|
||||||
|
|
||||||
* see requirements.txt
|
```bash
|
||||||
|
uv sync
|
||||||
|
source .venv/bin/activate
|
||||||
|
uv lock --upgrade --refresh
|
||||||
|
```
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
Create `config.yaml` with following structure:
|
||||||
|
|
||||||
Create `config.yaml`:
|
### Basic Configuration
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
---
|
---
|
||||||
interval: 5
|
interval: 5
|
||||||
port: 8000
|
port: 8000
|
||||||
username: user1
|
username: user
|
||||||
password: secret
|
password: secret
|
||||||
|
chassis: ["1"]
|
||||||
hosts:
|
hosts:
|
||||||
- srv1-112.mgmt.wtb1.ch.abainfra.net
|
- host1.example.net
|
||||||
- srv2-112.mgmt.wtb1.ch.abainfra.net
|
- host2.example.net
|
||||||
- srv3-112.mgmt.wtb1.ch.abainfra.net
|
- host3.example.net
|
||||||
- srv4-112.mgmt.wtb1.ch.abainfra.net
|
- host4.example.net
|
||||||
```
|
```
|
||||||
|
|
||||||
or:
|
### Advanced Configuration
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
---
|
---
|
||||||
interval: 5
|
interval: 5
|
||||||
port: 8000
|
port: 8000
|
||||||
username: user1
|
username: user1
|
||||||
password: secret1
|
password: secret1
|
||||||
|
chassis: ["1"]
|
||||||
|
group: development # set default group for all hosts
|
||||||
hosts:
|
hosts:
|
||||||
- fqdn: srv1-112.mgmt.wtb1.ch.abainfra.net
|
- fqdn: host1.example.net
|
||||||
username: user2
|
username: user2
|
||||||
password: secret2
|
password: secret2
|
||||||
- fqdn: srv2-112.mgmt.wtb1.ch.abainfra.net
|
chassis: ["0"]
|
||||||
|
group: production # use group for specific host
|
||||||
|
- fqdn: host2.example.net
|
||||||
username: user3
|
username: user3
|
||||||
password: secret3
|
password: secret3
|
||||||
- fqdn: srv3-112.mgmt.wtb1.ch.abainfra.net
|
chassis: ["1"]
|
||||||
|
group: stage
|
||||||
|
- fqdn: host3.example.net
|
||||||
username: user4
|
username: user4
|
||||||
password: secret4
|
password: secret4
|
||||||
- fqdn: srv4-112.mgmt.wtb1.ch.abainfra.net
|
chassis: ["example"]
|
||||||
|
- fqdn: host4.example.net
|
||||||
username: user5
|
username: user5
|
||||||
password: secret5
|
password: secret5
|
||||||
```
|
```
|
||||||
|
|
||||||
The `port`, `interval` are optional and can be overwritten by argument. Save default values are hardcoded.
|
The `port` and `interval` settings are optional and can be overridden by command-line arguments. Safe default values are hardcoded.
|
||||||
|
|
||||||
|
### Prometheus Configuration
|
||||||
|
```
|
||||||
|
global:
|
||||||
|
scrape_interval: 15s
|
||||||
|
evaluation_interval: 15s
|
||||||
|
|
||||||
# Use as Container
|
scrape_configs:
|
||||||
|
- job_name: "prometheus"
|
||||||
|
static_configs:
|
||||||
|
- targets: ["localhost:9090"]
|
||||||
|
|
||||||
|
- job_name: "redfish_exporter"
|
||||||
|
static_configs:
|
||||||
|
- targets: ["localhost:8000"] # Adjust to your config
|
||||||
|
metrics_path: /metrics
|
||||||
|
scrape_interval: 15s
|
||||||
|
```
|
||||||
|
|
||||||
|
# Docker / Container
|
||||||
|
To run the Redfish Exporter in a Docker container:
|
||||||
|
|
||||||
```
|
```
|
||||||
docker build -t redfish_exporter .
|
docker buildx build -t redfish_exporter .
|
||||||
docker run -it --rm --name redfish_exporter_app -p 8000:8000 redfish_exporter:latest
|
docker run -it --rm --name redfish_exporter_app -p 8000:8000 redfish_exporter:latest
|
||||||
```
|
```
|
||||||
|
|
||||||
# Legacy way
|
# Legacy Installation
|
||||||
|
|
||||||
|
## Python Dependencies
|
||||||
```bash
|
```bash
|
||||||
mkdir /srv/redfish-exporter
|
mkdir /srv/redfish-exporter
|
||||||
```
|
# or
|
||||||
|
git clone https://github.com/dasbaum-ch/redfish-exporter.git /srv/redfish-exporter
|
||||||
## Python dependencies
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd /srv/redfish-exporter
|
cd /srv/redfish-exporter
|
||||||
python3 -m venv venv
|
uv sync --locked
|
||||||
source venv/bin/activate
|
|
||||||
pip install -r requirements.txt
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Create user
|
## Create user
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sudo useradd -r -s /bin/false redfish
|
sudo useradd -r -s /bin/false redfish
|
||||||
```
|
```
|
||||||
|
|
||||||
## Install systemd unit file
|
## Systemd Service
|
||||||
|
1. Copy the systemd unit file:
|
||||||
```bash
|
```bash
|
||||||
sudo cp redfish-exporter.service /etc/systemd/system/redfish-exporter.service
|
sudo cp redfish-exporter.service /etc/systemd/system/redfish-exporter.service
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Reload and start the service:
|
||||||
|
```bash
|
||||||
sudo systemctl daemon-reload
|
sudo systemctl daemon-reload
|
||||||
sudo systemctl enable --now redfish-exporter.service
|
sudo systemctl enable --now redfish-exporter.service
|
||||||
```
|
```
|
||||||
|
|
||||||
# Usefull oneliners
|
# License
|
||||||
|
This project is licensed under the **MIT License**. See the [LICENSE](LICENSE) file for details.
|
||||||
|
|
||||||
## public IP with curl
|
# Tested on Hardware
|
||||||
|
Here are some servers that I have successfully tested:
|
||||||
|
|
||||||
```bash
|
| Vendor | Model | Redfish Version |
|
||||||
curl icanhazip.com
|
|------------|----------------------|-----------------|
|
||||||
curl -4 icanhazip.com
|
| Supermicro | AS-5126GS-TNRT2 | 1.21.0 |
|
||||||
curl -6 icanhazip.com
|
| | AS-1124US-TNRP | 1.8.0 |
|
||||||
|
| HPE | ProLiant DL380 Gen10 | 1.6.0 |
|
||||||
curl 'https://api.ipify.org?format=json'
|
|
||||||
curl 'https://api64.ipify.org?format=json'
|
|
||||||
```
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ port: 8000
|
|||||||
username: gloabl-user
|
username: gloabl-user
|
||||||
password: global-password
|
password: global-password
|
||||||
chassis: ["1"] # Strings, not integers!
|
chassis: ["1"] # Strings, not integers!
|
||||||
|
group: production
|
||||||
hosts:
|
hosts:
|
||||||
- fqdn: host1.example.com
|
- fqdn: host1.example.com
|
||||||
username: user1
|
username: user1
|
||||||
|
|||||||
@@ -9,7 +9,43 @@ import asyncio
|
|||||||
import aiohttp
|
import aiohttp
|
||||||
import urllib3
|
import urllib3
|
||||||
import yaml
|
import yaml
|
||||||
from prometheus_client import Gauge, start_http_server, Summary, Counter, Histogram
|
from prometheus_client import (
|
||||||
|
Gauge,
|
||||||
|
start_http_server,
|
||||||
|
Summary,
|
||||||
|
Counter,
|
||||||
|
Histogram,
|
||||||
|
Info,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class RedfishResource:
|
||||||
|
"""Container for Redfish resource URLs."""
|
||||||
|
|
||||||
|
chassis: str | None = None
|
||||||
|
systems: str | None = None
|
||||||
|
power: str | None = None
|
||||||
|
session_service: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PowerMetrics:
|
||||||
|
"""Container for power metrics."""
|
||||||
|
|
||||||
|
voltage: float | None = None
|
||||||
|
watts: float | None = None
|
||||||
|
amps: float | None = None
|
||||||
|
serial: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class RedfishSession:
|
||||||
|
"""Container for Redfish session data."""
|
||||||
|
|
||||||
|
token: str | None = None
|
||||||
|
logout_url: str | None = None
|
||||||
|
vendor: str | None = None
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -19,18 +55,14 @@ class HostConfig:
|
|||||||
fqdn: str
|
fqdn: str
|
||||||
username: str
|
username: str
|
||||||
password: str
|
password: str
|
||||||
max_retries: int = 1
|
chassis: list[str] | None = None
|
||||||
backoff: int = 2
|
group: str = "none"
|
||||||
|
max_retries: int = 3 # 3 retires
|
||||||
|
backoff: int = 2 # wait 2 seconds
|
||||||
cool_down: int = 120 # seconds to wait after too many failures
|
cool_down: int = 120 # seconds to wait after too many failures
|
||||||
failures: int = 0
|
failures: int = 0
|
||||||
next_retry_time: float = field(default=0.0, init=False)
|
next_retry_time: float = field(default=0.0, init=False)
|
||||||
|
session: RedfishSession = field(default_factory=RedfishSession)
|
||||||
# New attributes for Redfish stuff
|
|
||||||
vendor: str | None = None
|
|
||||||
session_token: str | None = None
|
|
||||||
session_logout: str | None = (
|
|
||||||
None # SessionLocation like /redfish/v1/SessionService/Sessions/marco.lucarelli%40abacus.ch00000000xxx/
|
|
||||||
)
|
|
||||||
|
|
||||||
def should_skip(self) -> bool:
|
def should_skip(self) -> bool:
|
||||||
"""Check if host is still in cool-down window"""
|
"""Check if host is still in cool-down window"""
|
||||||
@@ -61,20 +93,28 @@ REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing requ
|
|||||||
REQUEST_LATENCY = Histogram(
|
REQUEST_LATENCY = Histogram(
|
||||||
"redfish_request_latency_seconds", "Time for Redfish request", ["host"]
|
"redfish_request_latency_seconds", "Time for Redfish request", ["host"]
|
||||||
)
|
)
|
||||||
up_gauge = Gauge("redfish_up", "Host up/down", ["host"])
|
UP_GAUGE = Gauge("redfish_up", "Host up/down", ["host", "group"])
|
||||||
error_counter = Counter(
|
ERROR_COUNTER = Counter(
|
||||||
"redfish_errors_total", "Total Redfish errors", ["host", "error"]
|
"redfish_errors_total", "Total Redfish errors", ["host", "error"]
|
||||||
)
|
)
|
||||||
voltage_gauge = Gauge(
|
VOLTAGE_GAUGE = Gauge(
|
||||||
"redfish_psu_line_input_voltage_volts",
|
"redfish_psu_input_voltage_volts",
|
||||||
"Line Input Voltage per PSU",
|
"Line Input Voltage per PSU",
|
||||||
["host", "psu_serial"],
|
["host", "psu_serial", "group"],
|
||||||
)
|
)
|
||||||
watts_gauge = Gauge(
|
WATTS_GAUGE = Gauge(
|
||||||
"redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"]
|
"redfish_psu_input_watts",
|
||||||
|
"Power Input Watts per PSU",
|
||||||
|
["host", "psu_serial", "group"],
|
||||||
)
|
)
|
||||||
amps_gauge = Gauge(
|
AMPS_GAUGE = Gauge(
|
||||||
"redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"]
|
"redfish_psu_input_amps",
|
||||||
|
"Current draw in Amps per PSU",
|
||||||
|
["host", "psu_serial", "group"],
|
||||||
|
)
|
||||||
|
# set info metric
|
||||||
|
SYSTEM_INFO = Info(
|
||||||
|
"redfish_system", "System information (model, serial, etc.)", ["host", "group"]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -84,72 +124,80 @@ async def process_request(t):
|
|||||||
await asyncio.sleep(t)
|
await asyncio.sleep(t)
|
||||||
|
|
||||||
|
|
||||||
|
async def probe_vendor(session, host: HostConfig) -> str | None:
|
||||||
|
"""Probe the vendor of the Redfish host."""
|
||||||
|
try:
|
||||||
|
async with session.get(
|
||||||
|
f"https://{host.fqdn}/redfish/v1/", ssl=False, timeout=10
|
||||||
|
) as resp:
|
||||||
|
if resp.status == 200:
|
||||||
|
data = await resp.json()
|
||||||
|
vendor = data.get("Vendor", "")
|
||||||
|
logging.debug("Detected vendor for %s: %s", host.fqdn, vendor)
|
||||||
|
return vendor
|
||||||
|
logging.warning(
|
||||||
|
"Vendor probe failed on %s: HTTP %s", host.fqdn, resp.status
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning("Vendor probe failed for %s: %s", host.fqdn, e)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def login_hpe(session, host: HostConfig) -> bool:
|
||||||
|
"""Login to HPE Redfish API and set session token."""
|
||||||
|
login_url = f"https://{host.fqdn}/redfish/v1/SessionService/Sessions"
|
||||||
|
payload = {"UserName": host.username, "Password": host.password}
|
||||||
|
|
||||||
|
try:
|
||||||
|
async with session.post(
|
||||||
|
login_url, json=payload, ssl=False, timeout=10
|
||||||
|
) as login_resp:
|
||||||
|
if login_resp.status == 201:
|
||||||
|
host.session.token = login_resp.headers.get("X-Auth-Token")
|
||||||
|
host.session.logout_url = login_resp.headers.get("Location")
|
||||||
|
|
||||||
|
if not host.session.token or not host.session.logout_url:
|
||||||
|
raise RuntimeError("Invalid login response")
|
||||||
|
|
||||||
|
logging.info("New session token obtained for %s", host.fqdn)
|
||||||
|
return True
|
||||||
|
logging.warning(
|
||||||
|
"Login failed for %s: HTTP %s", host.fqdn, login_resp.status
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning("Login failed for %s: %s", host.fqdn, e)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None:
|
async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None:
|
||||||
"""Fetch JSON from Redfish with retry/backoff"""
|
"""Fetch JSON from Redfish with retry/backoff."""
|
||||||
if host.should_skip():
|
if host.should_skip():
|
||||||
logging.warning(
|
logging.warning(
|
||||||
"Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
|
"Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
|
||||||
)
|
)
|
||||||
up_gauge.labels(host=host.fqdn).set(0)
|
UP_GAUGE.labels(host=host.fqdn, group=host.group).set(0)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if not host.vendor:
|
# Probe vendor if not already known
|
||||||
try:
|
if not host.session.vendor:
|
||||||
async with session.get(
|
host.session.vendor = await probe_vendor(session, host)
|
||||||
f"https://{host.fqdn}/redfish/v1/", ssl=False, timeout=10
|
|
||||||
) as resp:
|
|
||||||
if resp.status == 200:
|
|
||||||
data = await resp.json()
|
|
||||||
host.vendor = data.get("Vendor", "")
|
|
||||||
logging.debug("Detected vendor for %s: %s", host.fqdn, host.vendor)
|
|
||||||
else:
|
|
||||||
logging.warning(
|
|
||||||
"Vendor probe failed on %s: HTTP %s", host.fqdn, resp.status
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logging.warning("Vendor probe failed for %s: %s", host.fqdn, e)
|
|
||||||
|
|
||||||
is_hpe = host.vendor and host.vendor.strip().upper().startswith("HPE")
|
is_hpe = host.session.vendor and host.session.vendor.strip().upper().startswith(
|
||||||
|
"HPE"
|
||||||
|
)
|
||||||
|
|
||||||
for attempt in range(1, host.max_retries + 1):
|
for attempt in range(1, host.max_retries + 1):
|
||||||
try:
|
try:
|
||||||
headers = {}
|
headers = {}
|
||||||
|
|
||||||
if is_hpe:
|
if is_hpe:
|
||||||
# Try to reuse existing session token
|
# Handle HPE session token
|
||||||
if host.session_token:
|
if not host.session.token:
|
||||||
headers["X-Auth-Token"] = host.session_token
|
if not await login_hpe(session, host):
|
||||||
logging.debug("Reusing cached session token for %s", host.fqdn)
|
# Retry login next attempt
|
||||||
else:
|
continue
|
||||||
# Need to login and store new session token
|
|
||||||
# HPE Redfish login
|
headers["X-Auth-Token"] = host.session.token
|
||||||
login_url = (
|
|
||||||
f"https://{host.fqdn}/redfish/v1/SessionService/Sessions"
|
|
||||||
)
|
|
||||||
payload = {"UserName": host.username, "Password": host.password}
|
|
||||||
async with session.post(
|
|
||||||
login_url, json=payload, ssl=False, timeout=10
|
|
||||||
) as login_resp:
|
|
||||||
if login_resp.status == 201:
|
|
||||||
host.session_token = login_resp.headers.get(
|
|
||||||
"X-Auth-Token"
|
|
||||||
) # as response in header
|
|
||||||
if not host.session_token:
|
|
||||||
raise RuntimeError("No X-Auth-Token in login response")
|
|
||||||
host.session_logout = login_resp.headers.get(
|
|
||||||
"Location"
|
|
||||||
) # as response in header
|
|
||||||
if not host.session_logout:
|
|
||||||
raise RuntimeError("No Location in login response")
|
|
||||||
headers["X-Auth-Token"] = host.session_token
|
|
||||||
logging.info("New session token obtained for %s", host.fqdn)
|
|
||||||
else:
|
|
||||||
logging.warning(
|
|
||||||
"Login failed for %s: HTTP %s",
|
|
||||||
host.fqdn,
|
|
||||||
login_resp.status,
|
|
||||||
)
|
|
||||||
continue # retry login next attempt
|
|
||||||
|
|
||||||
async with session.get(
|
async with session.get(
|
||||||
url, headers=headers, ssl=False, timeout=10
|
url, headers=headers, ssl=False, timeout=10
|
||||||
@@ -162,14 +210,14 @@ async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None:
|
|||||||
logging.warning(
|
logging.warning(
|
||||||
"Invalid token for %s, reauthenticating...", host.fqdn
|
"Invalid token for %s, reauthenticating...", host.fqdn
|
||||||
)
|
)
|
||||||
host.session_token = None
|
host.session.token = None
|
||||||
continue
|
continue
|
||||||
logging.warning(
|
logging.warning(
|
||||||
"HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
|
"HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Default: BasicAuth, like Supermicro and so
|
# Default: BasicAuth
|
||||||
async with session.get(
|
async with session.get(
|
||||||
url,
|
url,
|
||||||
auth=aiohttp.BasicAuth(host.username, host.password),
|
auth=aiohttp.BasicAuth(host.username, host.password),
|
||||||
@@ -203,64 +251,337 @@ async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
async def get_power_data(session, host: HostConfig):
|
async def discover_redfish_resources(
|
||||||
"""Query Redfish and update Prometheus metrics"""
|
session, host: HostConfig
|
||||||
|
) -> RedfishResource | None:
|
||||||
|
"""Discover available Redfish resources and return relevant URLs"""
|
||||||
|
root_url = f"https://{host.fqdn}/redfish/v1/"
|
||||||
|
data = await fetch_with_retry(session, host, root_url)
|
||||||
|
if not data:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
# Create RedfishRessource object
|
||||||
|
resources = RedfishResource(
|
||||||
|
chassis=data.get("Chassis", {}).get("@odata.id"),
|
||||||
|
systems=data.get("Systems", {}).get("@odata.id"),
|
||||||
|
session_service=data.get("SessionService", {}).get("@odata.id"),
|
||||||
|
)
|
||||||
|
|
||||||
|
if not resources.chassis:
|
||||||
|
logging.error("No valid Chassis URL found for host %s", host.fqdn)
|
||||||
|
return None
|
||||||
|
|
||||||
|
return resources
|
||||||
|
|
||||||
|
|
||||||
|
def get_power_resource_info(
|
||||||
|
member_data: dict, host_fqdn: str, show_deprecated_warnings
|
||||||
|
) -> tuple[str | None, str | None]:
|
||||||
|
"""Get the URL and type of Power resource (PowerSubsystem or Power)."""
|
||||||
|
# Try PowerSubsystem (new Redfish versions)
|
||||||
|
power_url = member_data.get("PowerSubsystem", {}).get("@odata.id")
|
||||||
|
if power_url:
|
||||||
|
return f"https://{host_fqdn}{power_url}", "PowerSubsystem"
|
||||||
|
|
||||||
|
# Try Power for older Redfish versions
|
||||||
|
power_url = member_data.get("Power", {}).get("@odata.id")
|
||||||
|
if power_url:
|
||||||
|
if show_deprecated_warnings:
|
||||||
|
logging.warning(
|
||||||
|
"DEPRECATED: Host %s uses old Redfish API (Power instead of PowerSubsystem). "
|
||||||
|
"Consider updating the firmware for full compatibility.",
|
||||||
|
host_fqdn,
|
||||||
|
)
|
||||||
|
return f"https://{host_fqdn}{power_url}", "Power"
|
||||||
|
|
||||||
|
# Nothing found -> Error
|
||||||
|
logging.error("No Power or PowerSubsystem found for host %s", host_fqdn)
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
def process_power_supplies_url(
|
||||||
|
power_data: dict, power_resource_type: str, host_fqdn: str
|
||||||
|
) -> str | None:
|
||||||
|
"""Get the URL for PowerSupplies based on the Power resource type."""
|
||||||
|
if power_resource_type == "PowerSubsystem":
|
||||||
|
# Bei PowerSubsystem: PowerSupplies ist ein separates Objekt
|
||||||
|
power_supplies_url = power_data.get("PowerSupplies", {}).get("@odata.id")
|
||||||
|
if power_supplies_url:
|
||||||
|
return f"https://{host_fqdn}{power_supplies_url}"
|
||||||
|
|
||||||
|
elif power_resource_type == "Power":
|
||||||
|
# Bei Power: PowerSupplies ist direkt im Power-Objekt enthalten
|
||||||
|
if "PowerSupplies" in power_data:
|
||||||
|
return f"https://{host_fqdn}/redfish/v1/Chassis/1/Power"
|
||||||
|
|
||||||
|
logging.error("No PowerSupplies found in Power resource for host %s", host_fqdn)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def process_power_supplies(
|
||||||
|
power_data: dict,
|
||||||
|
power_resource_type: str,
|
||||||
|
) -> list[dict] | None:
|
||||||
|
"""Get PowerSupplies data based on the Power resource type."""
|
||||||
|
if power_resource_type == "PowerSubsystem":
|
||||||
|
# PowerSubsystem: PowerSupplies is a ressource with Members
|
||||||
|
power_supplies_url = power_data.get("PowerSupplies", {}).get("@odata.id")
|
||||||
|
if not power_supplies_url:
|
||||||
|
logging.error("No PowerSupplies URL found for PowerSubsystem")
|
||||||
|
return None
|
||||||
|
return None # If none, then use the PowerSubsystem member url
|
||||||
|
|
||||||
|
elif power_resource_type == "Power":
|
||||||
|
# Power: PowerSupplies is an array!
|
||||||
|
return power_data.get("PowerSupplies", [])
|
||||||
|
|
||||||
|
logging.error("Unknown power resource type")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def process_power_supply(
|
||||||
|
session, host: HostConfig, psu_data: dict, power_resource_type: str
|
||||||
|
) -> PowerMetrics | None:
|
||||||
|
"""Extract metrics from PowerSupply"""
|
||||||
|
serial = psu_data.get("SerialNumber")
|
||||||
|
metrics = PowerMetrics(serial=serial)
|
||||||
|
|
||||||
|
if power_resource_type == "PowerSubsystem":
|
||||||
|
# New Redfish API: Metrics are an own "Metrics" ressource
|
||||||
|
metrics_url = psu_data.get("Metrics", {}).get("@odata.id")
|
||||||
|
if not metrics_url:
|
||||||
|
logging.warning("No Metrics found for PowerSupply %s", psu_data.get("Id"))
|
||||||
|
return None
|
||||||
|
|
||||||
|
metrics_url = f"https://{host.fqdn}{metrics_url}"
|
||||||
|
metrics_data = await fetch_with_retry(session, host, metrics_url)
|
||||||
|
if not metrics_data:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Get metrics from Metrics ressource
|
||||||
|
metrics.voltage = metrics_data.get("InputVoltage", {}).get("Reading")
|
||||||
|
metrics.watts = metrics_data.get("InputPowerWatts", {}).get("Reading")
|
||||||
|
metrics.amps = metrics_data.get("InputCurrentAmps", {}).get("Reading")
|
||||||
|
|
||||||
|
elif power_resource_type == "Power":
|
||||||
|
# Older Redfish API: Metrics are direct in PowerSupply as an array
|
||||||
|
metrics.voltage = psu_data.get("LineInputVoltage")
|
||||||
|
metrics.watts = psu_data.get("PowerInputWatts")
|
||||||
|
if metrics.watts is None:
|
||||||
|
metrics.watts = psu_data.get("LastPowerOutputWatts")
|
||||||
|
metrics.amps = psu_data.get("InputCurrentAmps")
|
||||||
|
if metrics.amps is None and metrics.voltage and metrics.watts:
|
||||||
|
metrics.amps = round(metrics.watts / metrics.voltage, 2)
|
||||||
|
|
||||||
|
else:
|
||||||
|
logging.error(
|
||||||
|
"Unknown power resource type for PowerSupply %s", psu_data.get("Id")
|
||||||
|
)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
return metrics
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_url(url: str) -> str:
|
||||||
|
"""Ensure URL does not end with a trailing slash."""
|
||||||
|
# I needed this for realy old Redfish versions :S (<1.6.0)
|
||||||
|
if url.endswith("/"):
|
||||||
|
return url[:-1] # Remove trailing slash
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
async def get_power_data(session, host: HostConfig, show_deprecated_warnings):
|
||||||
|
"""Query Redfish for power data and update Prometheus metrics"""
|
||||||
if host.should_skip():
|
if host.should_skip():
|
||||||
logging.warning(
|
logging.warning(
|
||||||
"Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
|
"Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
|
||||||
)
|
)
|
||||||
up_gauge.labels(host=host.fqdn).set(0)
|
UP_GAUGE.labels(host=host.fqdn, group=host.group).set(0)
|
||||||
return
|
return
|
||||||
|
|
||||||
url = f"https://{host.fqdn}/redfish/v1/Chassis/1/Power"
|
# Start time measurement
|
||||||
start = time.monotonic()
|
start = time.monotonic()
|
||||||
|
|
||||||
data = await fetch_with_retry(session, host, url)
|
# Get root ressources
|
||||||
if not data:
|
resources = await discover_redfish_resources(session, host)
|
||||||
|
if not resources or not resources.chassis:
|
||||||
|
logging.error("Could not discover any resources for %s", host.fqdn)
|
||||||
host.mark_failure()
|
host.mark_failure()
|
||||||
up_gauge.labels(host=host.fqdn).set(0)
|
UP_GAUGE.labels(host=host.fqdn, group=host.group).set(0)
|
||||||
return
|
return
|
||||||
|
|
||||||
host.mark_success()
|
host.mark_success()
|
||||||
up_gauge.labels(host=host.fqdn).set(1)
|
UP_GAUGE.labels(host=host.fqdn, group=host.group).set(1)
|
||||||
|
|
||||||
for psu in data.get("PowerSupplies", []):
|
chassis_url = f"https://{host.fqdn}{resources.chassis}"
|
||||||
line_input_v = psu.get("LineInputVoltage")
|
chassis_data = await fetch_with_retry(session, host, chassis_url)
|
||||||
# HPE Redfish uses LastPowerOutputWatts for Watts
|
if not chassis_data:
|
||||||
if host.vendor.strip().upper().startswith("HPE"):
|
host.mark_failure()
|
||||||
watts_input = psu.get("LastPowerOutputWatts")
|
UP_GAUGE.labels(host=host.fqdn, group=host.group).set(0)
|
||||||
else:
|
return
|
||||||
# Supermicro uses PowerInputWatts
|
|
||||||
watts_input = psu.get("PowerInputWatts")
|
|
||||||
serial = psu.get("SerialNumber")
|
|
||||||
|
|
||||||
amps = (
|
for chassis_member in chassis_data.get("Members", []):
|
||||||
round(watts_input / line_input_v, 2)
|
chassis_member_url = chassis_member.get("@odata.id")
|
||||||
if line_input_v and watts_input
|
if not chassis_member_url:
|
||||||
else None
|
continue
|
||||||
|
|
||||||
|
# Normalize URL... I needed this for realy old Redfish versions :S (<1.6.0)
|
||||||
|
chassis_member_url = normalize_url(chassis_member_url)
|
||||||
|
# Get chassis id from url ("/redfish/v1/Chassis/1" -> 1)
|
||||||
|
chassis_member_id = chassis_member_url.split("/")[-1]
|
||||||
|
# Check if the chassis id is in config (had problem with chassis "NVMe")
|
||||||
|
if hasattr(host, "chassis") and host.chassis:
|
||||||
|
if chassis_member_id not in host.chassis:
|
||||||
|
continue
|
||||||
|
|
||||||
|
member_url = f"https://{host.fqdn}{chassis_member_url}"
|
||||||
|
member_data = await fetch_with_retry(session, host, member_url)
|
||||||
|
if not member_data:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Get Power ressource (fallback to "Power")
|
||||||
|
power_resource_url, power_resource_type = get_power_resource_info(
|
||||||
|
member_data, host.fqdn, show_deprecated_warnings
|
||||||
)
|
)
|
||||||
|
if not power_resource_url:
|
||||||
|
continue
|
||||||
|
|
||||||
if line_input_v is not None:
|
# Get Power Data
|
||||||
voltage_gauge.labels(host=host.fqdn, psu_serial=serial).set(line_input_v)
|
power_data = await fetch_with_retry(session, host, power_resource_url)
|
||||||
if watts_input is not None:
|
if not power_data:
|
||||||
watts_gauge.labels(host=host.fqdn, psu_serial=serial).set(watts_input)
|
continue
|
||||||
if amps is not None:
|
|
||||||
amps_gauge.labels(host=host.fqdn, psu_serial=serial).set(amps)
|
|
||||||
|
|
||||||
|
# Get PowerSupplies, depend on ressource type ("Power" or "PowerSubsystem")
|
||||||
|
if power_resource_type == "PowerSubsystem":
|
||||||
|
# Request PowerSupplies url (for PowerSubsystem)
|
||||||
|
power_supplies_url = power_data.get("PowerSupplies", {}).get("@odata.id")
|
||||||
|
if not power_supplies_url:
|
||||||
|
logging.warning("No PowerSupplies found for %s", host.fqdn)
|
||||||
|
continue
|
||||||
|
|
||||||
|
power_supplies_url = f"https://{host.fqdn}{power_supplies_url}"
|
||||||
|
power_supplies_data = await fetch_with_retry(
|
||||||
|
session, host, power_supplies_url
|
||||||
|
)
|
||||||
|
if not power_supplies_data:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# loop over Members for "PowerSubsystem"
|
||||||
|
for psu_member in power_supplies_data.get("Members", []):
|
||||||
|
psu_url = psu_member.get("@odata.id")
|
||||||
|
if not psu_url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
psu_url = f"https://{host.fqdn}{psu_url}"
|
||||||
|
psu_data = await fetch_with_retry(session, host, psu_url)
|
||||||
|
if not psu_data:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Process PowerSupplies object
|
||||||
|
metrics = await process_power_supply(
|
||||||
|
session, host, psu_data, "PowerSubsystem"
|
||||||
|
)
|
||||||
|
if metrics:
|
||||||
|
update_prometheus_metrics(host, metrics)
|
||||||
|
|
||||||
|
elif power_resource_type == "Power":
|
||||||
|
# Loop over PowerSupplies for older Redfish versions
|
||||||
|
for psu in power_data.get("PowerSupplies", []):
|
||||||
|
# Process PowerSupplies object
|
||||||
|
metrics = await process_power_supply(session, host, psu, "Power")
|
||||||
|
if metrics:
|
||||||
|
update_prometheus_metrics(host, metrics)
|
||||||
|
|
||||||
|
else:
|
||||||
|
logging.error("Unknown power resource type for host %s", host.fqdn)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Measure request and process latency
|
||||||
REQUEST_LATENCY.labels(host=host.fqdn).observe(time.monotonic() - start)
|
REQUEST_LATENCY.labels(host=host.fqdn).observe(time.monotonic() - start)
|
||||||
|
|
||||||
|
|
||||||
|
def update_prometheus_metrics(host: HostConfig, metrics: PowerMetrics):
|
||||||
|
"""Update Prometheus metrics with PowerMetrics data."""
|
||||||
|
if metrics.voltage is not None and metrics.serial:
|
||||||
|
VOLTAGE_GAUGE.labels(
|
||||||
|
host=host.fqdn, psu_serial=metrics.serial, group=host.group
|
||||||
|
).set(metrics.voltage)
|
||||||
|
if metrics.watts is not None and metrics.serial:
|
||||||
|
WATTS_GAUGE.labels(
|
||||||
|
host=host.fqdn, psu_serial=metrics.serial, group=host.group
|
||||||
|
).set(metrics.watts)
|
||||||
|
if metrics.amps is not None and metrics.serial:
|
||||||
|
AMPS_GAUGE.labels(
|
||||||
|
host=host.fqdn, psu_serial=metrics.serial, group=host.group
|
||||||
|
).set(metrics.amps)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_system_info(session, host: HostConfig):
|
||||||
|
"""Query Redfish for system data and update Prometheus metrics"""
|
||||||
|
if host.should_skip():
|
||||||
|
logging.warning(
|
||||||
|
"Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Get Redfish Version
|
||||||
|
root_url = f"https://{host.fqdn}/redfish/v1/"
|
||||||
|
root_data = await fetch_with_retry(session, host, root_url)
|
||||||
|
if not root_data:
|
||||||
|
host.mark_failure()
|
||||||
|
return
|
||||||
|
|
||||||
|
redfish_version = root_data.get("RedfishVersion")
|
||||||
|
# Get Manufacturer, Serial and Model
|
||||||
|
systems_url = f"https://{host.fqdn}/redfish/v1/Systems/"
|
||||||
|
systems_data = await fetch_with_retry(session, host, systems_url)
|
||||||
|
if not systems_data:
|
||||||
|
host.mark_failure()
|
||||||
|
return
|
||||||
|
|
||||||
|
# loop for each system members
|
||||||
|
for system_member in systems_data.get("Members", []):
|
||||||
|
system_url = system_member.get("@odata.id")
|
||||||
|
if not system_url:
|
||||||
|
continue
|
||||||
|
|
||||||
|
system_data = await fetch_with_retry(
|
||||||
|
session, host, f"https://{host.fqdn}{system_url}"
|
||||||
|
)
|
||||||
|
if not system_data:
|
||||||
|
continue
|
||||||
|
|
||||||
|
manufacturer = system_data.get("Manufacturer")
|
||||||
|
if manufacturer is None:
|
||||||
|
manufacturer = "<no data>"
|
||||||
|
model = system_data.get("Model")
|
||||||
|
if model is None:
|
||||||
|
model = "<no data>"
|
||||||
|
serial_number = system_data.get("SerialNumber")
|
||||||
|
if serial_number is None:
|
||||||
|
serial_number = "<no data>"
|
||||||
|
|
||||||
|
# Hier könnte ihre Werbung stehen
|
||||||
|
SYSTEM_INFO.labels(host=host.fqdn, group=host.group).info(
|
||||||
|
{
|
||||||
|
"manufacturer": manufacturer,
|
||||||
|
"model": model,
|
||||||
|
"serial_number": serial_number,
|
||||||
|
"redfish_version": redfish_version,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def logout_host(session, host):
|
async def logout_host(session, host):
|
||||||
"""Clean logout for Redfish with session tokens"""
|
"""Clean logout for Redfish with session tokens"""
|
||||||
if not host.session_token:
|
if not host.session.token or not host.session.logout_url:
|
||||||
return
|
|
||||||
if not host.session_logout:
|
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
logout_url = f"{host.session_logout}" # the full URL is here!
|
logout_url = host.session.logout_url
|
||||||
async with session.delete(
|
async with session.delete(
|
||||||
logout_url,
|
logout_url,
|
||||||
headers={"X-Auth-Token": host.session_token},
|
headers={"X-Auth-Token": host.session.token},
|
||||||
ssl=False,
|
ssl=False,
|
||||||
timeout=5,
|
timeout=5,
|
||||||
) as resp:
|
) as resp:
|
||||||
@@ -273,14 +594,17 @@ async def logout_host(session, host):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning("Error during logout for %s: %s", host.fqdn, e)
|
logging.warning("Error during logout for %s: %s", host.fqdn, e)
|
||||||
finally:
|
finally:
|
||||||
host.session_token = None
|
host.session.token = None
|
||||||
|
host.session.logout_url = None
|
||||||
|
|
||||||
|
|
||||||
async def run_exporter(config, stop_event):
|
async def run_exporter(config, stop_event, show_deprecated_warnings):
|
||||||
"""Main loop"""
|
"""Main loop"""
|
||||||
port = config.get("port", 8000)
|
port = config.get("port", 8000)
|
||||||
default_username = config.get("username")
|
default_username = config.get("username")
|
||||||
default_password = config.get("password")
|
default_password = config.get("password")
|
||||||
|
default_chassis = config.get("chassis", "1")
|
||||||
|
default_group = config.get("group", "none")
|
||||||
hosts = config["hosts"]
|
hosts = config["hosts"]
|
||||||
interval = config.get("interval", 10)
|
interval = config.get("interval", 10)
|
||||||
|
|
||||||
@@ -296,10 +620,16 @@ async def run_exporter(config, stop_event):
|
|||||||
fqdn=host_entry["fqdn"],
|
fqdn=host_entry["fqdn"],
|
||||||
username=host_entry.get("username", default_username),
|
username=host_entry.get("username", default_username),
|
||||||
password=host_entry.get("password", default_password),
|
password=host_entry.get("password", default_password),
|
||||||
|
chassis=host_entry.get("chassis", default_chassis),
|
||||||
|
group=host_entry.get("group", default_group),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
hc = HostConfig(
|
hc = HostConfig(
|
||||||
fqdn=host_entry, username=default_username, password=default_password
|
fqdn=host_entry,
|
||||||
|
username=default_username,
|
||||||
|
password=default_password,
|
||||||
|
chassis=default_chassis,
|
||||||
|
group=default_group,
|
||||||
)
|
)
|
||||||
host_objs.append(hc)
|
host_objs.append(hc)
|
||||||
|
|
||||||
@@ -308,14 +638,17 @@ async def run_exporter(config, stop_event):
|
|||||||
async with aiohttp.ClientSession(connector=connector) as session:
|
async with aiohttp.ClientSession(connector=connector) as session:
|
||||||
try:
|
try:
|
||||||
while not stop_event.is_set():
|
while not stop_event.is_set():
|
||||||
tasks = [get_power_data(session, hc) for hc in host_objs]
|
tasks = []
|
||||||
|
for hc in host_objs:
|
||||||
|
tasks.append(get_power_data(session, hc, show_deprecated_warnings))
|
||||||
|
tasks.append(get_system_info(session, hc))
|
||||||
await asyncio.gather(*tasks)
|
await asyncio.gather(*tasks)
|
||||||
await process_request(interval)
|
await process_request(interval)
|
||||||
finally:
|
finally:
|
||||||
# Graceful shutdown: logout from Redfish sessions
|
# Graceful shutdown: logout from Redfish sessions
|
||||||
logging.info("Exporter stopping, logging out from Redfish sessions...")
|
logging.info("Exporter stopping, logging out from Redfish sessions...")
|
||||||
await asyncio.gather(
|
await asyncio.gather(
|
||||||
*(logout_host(session, h) for h in host_objs if h.session_token)
|
*(logout_host(session, h) for h in host_objs if h.session.token)
|
||||||
)
|
)
|
||||||
logging.info("All sessions logged out.")
|
logging.info("All sessions logged out.")
|
||||||
logging.info("Exporter stopped cleanly.")
|
logging.info("Exporter stopped cleanly.")
|
||||||
@@ -323,14 +656,19 @@ async def run_exporter(config, stop_event):
|
|||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
"""Modern asyncio entry point"""
|
"""Modern asyncio entry point"""
|
||||||
parser = argparse.ArgumentParser(description="Redfish Prometheus Exporter")
|
parser = argparse.ArgumentParser(description="Redfish Prometheus Exporter.")
|
||||||
parser.add_argument("--config", default="config.yaml", help="Path to config file")
|
parser.add_argument("--config", default="config.yaml", help="Path to config file.")
|
||||||
parser.add_argument("--port", type=int, help="Override port from config file")
|
parser.add_argument("--port", type=int, help="Override port from config file.")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--interval", type=int, help="Override interval from config file"
|
"--interval", type=int, help="Override interval from config file."
|
||||||
)
|
)
|
||||||
|
parser.add_argument("--show-deprecated", action="store_true", help="Enable deprecated warnings in log.")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
show_deprecated_warnings = args.show_deprecated
|
||||||
|
if show_deprecated_warnings:
|
||||||
|
logging.warning("Deprecated warnings are enabled.")
|
||||||
|
|
||||||
# Load YAML config
|
# Load YAML config
|
||||||
with open(args.config, "r", encoding="utf-8") as file:
|
with open(args.config, "r", encoding="utf-8") as file:
|
||||||
config = yaml.safe_load(file)
|
config = yaml.safe_load(file)
|
||||||
@@ -341,13 +679,14 @@ async def main():
|
|||||||
if args.interval is not None:
|
if args.interval is not None:
|
||||||
config["interval"] = args.interval
|
config["interval"] = args.interval
|
||||||
|
|
||||||
|
|
||||||
stop_event = asyncio.Event()
|
stop_event = asyncio.Event()
|
||||||
loop = asyncio.get_running_loop()
|
loop = asyncio.get_running_loop()
|
||||||
# Handle SIGINT (Ctrl+C) and SIGTERM
|
# Handle SIGINT (Ctrl+C) and SIGTERM
|
||||||
for sig in (signal.SIGINT, signal.SIGTERM):
|
for sig in (signal.SIGINT, signal.SIGTERM):
|
||||||
loop.add_signal_handler(sig, stop_event.set)
|
loop.add_signal_handler(sig, stop_event.set)
|
||||||
|
|
||||||
await run_exporter(config, stop_event)
|
await run_exporter(config, stop_event, show_deprecated_warnings)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -1,652 +0,0 @@
|
|||||||
"""Simple Redfish exporter to collect Power data from bare matel server"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import signal
|
|
||||||
import time
|
|
||||||
import logging
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
import asyncio
|
|
||||||
import aiohttp
|
|
||||||
import urllib3
|
|
||||||
import yaml
|
|
||||||
from prometheus_client import (
|
|
||||||
Gauge,
|
|
||||||
start_http_server,
|
|
||||||
Summary,
|
|
||||||
Counter,
|
|
||||||
Histogram,
|
|
||||||
Info,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class RedfishResource:
|
|
||||||
"""Container for Redfish resource URLs."""
|
|
||||||
chassis: str | None = None
|
|
||||||
systems: str | None = None
|
|
||||||
power: str | None = None
|
|
||||||
session_service: str | None = None
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class PowerMetrics:
|
|
||||||
"""Container for power metrics."""
|
|
||||||
voltage: float | None = None
|
|
||||||
watts: float | None = None
|
|
||||||
amps: float | None = None
|
|
||||||
serial: str | None = None
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class RedfishSession:
|
|
||||||
"""Container for Redfish session data."""
|
|
||||||
token: str | None = None
|
|
||||||
loggout_url: str | None = None
|
|
||||||
vendor: str | None = None
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class HostConfig:
|
|
||||||
"""Solve too many arguments"""
|
|
||||||
|
|
||||||
fqdn: str
|
|
||||||
username: str
|
|
||||||
password: str
|
|
||||||
chassis: list[str] | None = None
|
|
||||||
max_retries: int = 3 # 3 retires
|
|
||||||
backoff: int = 2 # wait 2 seconds
|
|
||||||
cool_down: int = 120 # seconds to wait after too many failures
|
|
||||||
failures: int = 0
|
|
||||||
next_retry_time: float = field(default=0.0, init=False)
|
|
||||||
session: RedfishSession = field(default_factory=RedfishSession)
|
|
||||||
|
|
||||||
# New attributes for Redfish stuff
|
|
||||||
vendor: str | None = None
|
|
||||||
session_token: str | None = None
|
|
||||||
session_logout: str | None = (
|
|
||||||
None # SessionLocation like /redfish/v1/SessionService/Sessions/marco.lucarelli%40abacus.ch00000000xxx/
|
|
||||||
)
|
|
||||||
|
|
||||||
def should_skip(self) -> bool:
|
|
||||||
"""Check if host is still in cool-down window"""
|
|
||||||
return time.monotonic() < self.next_retry_time
|
|
||||||
|
|
||||||
def mark_failure(self):
|
|
||||||
"""Increase failure counter and maybe trigger cool-down"""
|
|
||||||
self.failures += 1
|
|
||||||
if self.failures >= self.max_retries:
|
|
||||||
self.next_retry_time = time.monotonic() + self.cool_down
|
|
||||||
self.failures = 0 # reset after triggering cool-down
|
|
||||||
|
|
||||||
def mark_success(self):
|
|
||||||
"""Reset failure counter after a successful request"""
|
|
||||||
self.failures = 0
|
|
||||||
self.next_retry_time = 0.0
|
|
||||||
|
|
||||||
|
|
||||||
# Disable certificate warnings
|
|
||||||
urllib3.disable_warnings()
|
|
||||||
# set log config
|
|
||||||
logging.basicConfig(
|
|
||||||
level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Prometheus metrics
|
|
||||||
REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing request")
|
|
||||||
REQUEST_LATENCY = Histogram(
|
|
||||||
"redfish_request_latency_seconds", "Time for Redfish request", ["host"]
|
|
||||||
)
|
|
||||||
UP_GAUGE = Gauge("redfish_up", "Host up/down", ["host"])
|
|
||||||
ERROR_COUNTER = Counter(
|
|
||||||
"redfish_errors_total", "Total Redfish errors", ["host", "error"]
|
|
||||||
)
|
|
||||||
VOLTAGE_GAUGE = Gauge(
|
|
||||||
"redfish_psu_line_input_voltage_volts",
|
|
||||||
"Line Input Voltage per PSU",
|
|
||||||
["host", "psu_serial"],
|
|
||||||
)
|
|
||||||
WATTS_GAUGE = Gauge(
|
|
||||||
"redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"]
|
|
||||||
)
|
|
||||||
AMPS_GAUGE = Gauge(
|
|
||||||
"redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"]
|
|
||||||
)
|
|
||||||
# set info metric
|
|
||||||
SYSTEM_INFO = Info(
|
|
||||||
"redfish_system_info", "System information (model, serial, etc.)", ["host"]
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@REQUEST_TIME.time()
|
|
||||||
async def process_request(t):
|
|
||||||
"""Simulate request time"""
|
|
||||||
await asyncio.sleep(t)
|
|
||||||
|
|
||||||
|
|
||||||
async def probe_vendor(session, host: HostConfig) -> str | None:
|
|
||||||
"""Probe the vendor of the Redfish host."""
|
|
||||||
try:
|
|
||||||
async with session.get(
|
|
||||||
f"https://{host.fqdn}/redfish/v1/", ssl=False, timeout=10
|
|
||||||
) as resp:
|
|
||||||
if resp.status == 200:
|
|
||||||
data = await resp.json()
|
|
||||||
vendor = data.get("Vendor", "")
|
|
||||||
logging.debug("Detected vendor for %s: %s", host.fqdn, vendor)
|
|
||||||
return vendor
|
|
||||||
logging.warning(
|
|
||||||
"Vendor probe failed on %s: HTTP %s", host.fqdn, resp.status
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logging.warning("Vendor probe failed for %s: %s", host.fqdn, e)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
async def login_hpe(session, host: HostConfig) -> bool:
|
|
||||||
"""Login to HPE Redfish API and set session token."""
|
|
||||||
login_url = f"https://{host.fqdn}/redfish/v1/SessionService/Sessions"
|
|
||||||
payload = {"UserName": host.username, "Password": host.password}
|
|
||||||
|
|
||||||
try:
|
|
||||||
async with session.post(login_url, json=payload, ssl=False, timeout=10) as login_resp:
|
|
||||||
if login_resp.status == 201:
|
|
||||||
host.session.token = login_resp.headers.get("X-Auth-Token")
|
|
||||||
host.session.logout_url = login_resp.headers.get("Location")
|
|
||||||
|
|
||||||
if not host.session.token or not host.session.logout_url:
|
|
||||||
raise RuntimeError("Invalid login response")
|
|
||||||
|
|
||||||
logging.info("New session token obtained for %s", host.fqdn)
|
|
||||||
return True
|
|
||||||
logging.warning(
|
|
||||||
"Login failed for %s: HTTP %s", host.fqdn, login_resp.status
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logging.warning("Login failed for %s: %s", host.fqdn, e)
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None:
|
|
||||||
"""Fetch JSON from Redfish with retry/backoff."""
|
|
||||||
if host.should_skip():
|
|
||||||
logging.warning(
|
|
||||||
"Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
|
|
||||||
)
|
|
||||||
UP_GAUGE.labels(host=host.fqdn).set(0)
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Probe vendor if not already known
|
|
||||||
if not host.session.vendor:
|
|
||||||
host.session.vendor = await probe_vendor(session, host)
|
|
||||||
|
|
||||||
is_hpe = host.session.vendor and host.session.vendor.strip().upper().startswith("HPE")
|
|
||||||
|
|
||||||
for attempt in range(1, host.max_retries + 1):
|
|
||||||
try:
|
|
||||||
headers = {}
|
|
||||||
|
|
||||||
if is_hpe:
|
|
||||||
# Handle HPE session token
|
|
||||||
if not host.session.token:
|
|
||||||
if not await login_hpe(session, host):
|
|
||||||
# Retry login next attempt
|
|
||||||
continue
|
|
||||||
|
|
||||||
headers["X-Auth-Token"] = host.session.token
|
|
||||||
|
|
||||||
async with session.get(
|
|
||||||
url, headers=headers, ssl=False, timeout=10
|
|
||||||
) as resp:
|
|
||||||
if resp.status == 200:
|
|
||||||
host.mark_success()
|
|
||||||
return await resp.json()
|
|
||||||
elif resp.status in (401, 403):
|
|
||||||
# Token expired or invalid, clear it and retry
|
|
||||||
logging.warning(
|
|
||||||
"Invalid token for %s, reauthenticating...", host.fqdn
|
|
||||||
)
|
|
||||||
host.session.token = None
|
|
||||||
continue
|
|
||||||
logging.warning(
|
|
||||||
"HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
|
|
||||||
)
|
|
||||||
|
|
||||||
else:
|
|
||||||
# Default: BasicAuth
|
|
||||||
async with session.get(
|
|
||||||
url,
|
|
||||||
auth=aiohttp.BasicAuth(host.username, host.password),
|
|
||||||
ssl=False,
|
|
||||||
timeout=10,
|
|
||||||
) as resp:
|
|
||||||
if resp.status == 200:
|
|
||||||
host.mark_success()
|
|
||||||
return await resp.json()
|
|
||||||
logging.warning(
|
|
||||||
"HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
|
|
||||||
)
|
|
||||||
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
logging.warning("Timeout on %s (attempt %d)", host.fqdn, attempt)
|
|
||||||
except aiohttp.ClientError as e:
|
|
||||||
logging.warning(
|
|
||||||
"Client error on %s (attempt %d): %s", host.fqdn, attempt, e
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
logging.exception(
|
|
||||||
"Unexpected error on %s (attempt %d): %s", host.fqdn, attempt, e
|
|
||||||
)
|
|
||||||
|
|
||||||
if attempt < host.max_retries:
|
|
||||||
await asyncio.sleep(host.backoff * attempt)
|
|
||||||
else:
|
|
||||||
host.mark_failure()
|
|
||||||
logging.error("All retries failed for %s", host.fqdn)
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
async def discover_redfish_resources(session, host: HostConfig) -> RedfishResource | None:
|
|
||||||
"""Discover available Redfish resources and return relevant URLs"""
|
|
||||||
root_url = f"https://{host.fqdn}/redfish/v1/"
|
|
||||||
data = await fetch_with_retry(session, host, root_url)
|
|
||||||
if not data:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
# Create RedfishRessource object
|
|
||||||
resources = RedfishResource(
|
|
||||||
chassis=data.get("Chassis", {}).get("@odata.id"),
|
|
||||||
systems=data.get("Systems", {}).get("@odata.id"),
|
|
||||||
session_service=data.get("SessionService", {}).get("@odata.id"),
|
|
||||||
)
|
|
||||||
|
|
||||||
if not resources.chassis:
|
|
||||||
logging.error("No valid Chassis URL found for host %s", host.fqdn)
|
|
||||||
return None
|
|
||||||
|
|
||||||
return resources
|
|
||||||
|
|
||||||
|
|
||||||
def get_power_resource_info(
|
|
||||||
member_data: dict, host_fqdn: str
|
|
||||||
) -> tuple[str | None, str | None]:
|
|
||||||
"""Get the URL and type of Power resource (PowerSubsystem or Power)."""
|
|
||||||
# Try PowerSubsystem (new Redfish versions)
|
|
||||||
power_url = member_data.get("PowerSubsystem", {}).get("@odata.id")
|
|
||||||
if power_url:
|
|
||||||
return f"https://{host_fqdn}{power_url}", "PowerSubsystem"
|
|
||||||
|
|
||||||
# Try Power for older Redfish versions
|
|
||||||
power_url = member_data.get("Power", {}).get("@odata.id")
|
|
||||||
if power_url:
|
|
||||||
logging.warning(
|
|
||||||
"DEPRECATED: Host %s uses old Redfish API (Power instead of PowerSubsystem). "
|
|
||||||
"Consider updating the firmware for full compatibility.",
|
|
||||||
host_fqdn,
|
|
||||||
)
|
|
||||||
return f"https://{host_fqdn}{power_url}", "Power"
|
|
||||||
|
|
||||||
# Nothing found -> Error
|
|
||||||
logging.error("No Power or PowerSubsystem found for host %s", host_fqdn)
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
|
|
||||||
def process_power_supplies_url(
|
|
||||||
power_data: dict, power_resource_type: str, host_fqdn: str
|
|
||||||
) -> str | None:
|
|
||||||
"""Get the URL for PowerSupplies based on the Power resource type."""
|
|
||||||
if power_resource_type == "PowerSubsystem":
|
|
||||||
# Bei PowerSubsystem: PowerSupplies ist ein separates Objekt
|
|
||||||
power_supplies_url = power_data.get("PowerSupplies", {}).get("@odata.id")
|
|
||||||
if power_supplies_url:
|
|
||||||
return f"https://{host_fqdn}{power_supplies_url}"
|
|
||||||
|
|
||||||
elif power_resource_type == "Power":
|
|
||||||
# Bei Power: PowerSupplies ist direkt im Power-Objekt enthalten
|
|
||||||
if "PowerSupplies" in power_data:
|
|
||||||
return f"https://{host_fqdn}/redfish/v1/Chassis/1/Power"
|
|
||||||
|
|
||||||
logging.error("No PowerSupplies found in Power resource for host %s", host_fqdn)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def process_power_supplies(
|
|
||||||
power_data: dict,
|
|
||||||
power_resource_type: str,
|
|
||||||
) -> list[dict] | None:
|
|
||||||
"""Get PowerSupplies data based on the Power resource type."""
|
|
||||||
if power_resource_type == "PowerSubsystem":
|
|
||||||
# PowerSubsystem: PowerSupplies is a ressource with Members
|
|
||||||
power_supplies_url = power_data.get("PowerSupplies", {}).get("@odata.id")
|
|
||||||
if not power_supplies_url:
|
|
||||||
logging.error("No PowerSupplies URL found for PowerSubsystem")
|
|
||||||
return None
|
|
||||||
return None # If none, then use the PowerSubsystem member url
|
|
||||||
|
|
||||||
elif power_resource_type == "Power":
|
|
||||||
# Power: PowerSupplies is an array!
|
|
||||||
return power_data.get("PowerSupplies", [])
|
|
||||||
|
|
||||||
logging.error("Unknown power resource type")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
async def process_power_supply(
|
|
||||||
session, host: HostConfig, psu_data: dict, power_resource_type: str
|
|
||||||
) -> PowerMetrics | None:
|
|
||||||
"""Extract metrics from PowerSupply"""
|
|
||||||
serial = psu_data.get("SerialNumber")
|
|
||||||
metrics = PowerMetrics(serial=serial)
|
|
||||||
|
|
||||||
if power_resource_type == "PowerSubsystem":
|
|
||||||
# New Redfish API: Metrics are an own "Metrics" ressource
|
|
||||||
metrics_url = psu_data.get("Metrics", {}).get("@odata.id")
|
|
||||||
if not metrics_url:
|
|
||||||
logging.warning("No Metrics found for PowerSupply %s", psu_data.get("Id"))
|
|
||||||
return None
|
|
||||||
|
|
||||||
metrics_url = f"https://{host.fqdn}{metrics_url}"
|
|
||||||
metrics_data = await fetch_with_retry(session, host, metrics_url)
|
|
||||||
if not metrics_data:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Get metrics from Metrics ressource
|
|
||||||
metrics.voltage = metrics_data.get("InputVoltage", {}).get("Reading")
|
|
||||||
metrics.watts = metrics_data.get("InputPowerWatts", {}).get("Reading")
|
|
||||||
metrics.amps = metrics_data.get("InputCurrentAmps", {}).get("Reading")
|
|
||||||
|
|
||||||
elif power_resource_type == "Power":
|
|
||||||
# Older Redfish API: Metrics are direct in PowerSupply as an array
|
|
||||||
metrics.voltage = psu_data.get("LineInputVoltage")
|
|
||||||
metrics.watts = psu_data.get("PowerInputWatts")
|
|
||||||
if metrics.watts is None:
|
|
||||||
metrics.watts = psu_data.get("LastPowerOutputWatts")
|
|
||||||
metrics.amps = psu_data.get("InputCurrentAmps")
|
|
||||||
if metrics.amps is None and metrics.voltage and metrics.watts:
|
|
||||||
metrics.amps = round(metrics.watts / metrics.voltage, 2)
|
|
||||||
|
|
||||||
else:
|
|
||||||
logging.error(
|
|
||||||
"Unknown power resource type for PowerSupply %s", psu_data.get("Id")
|
|
||||||
)
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
return metrics
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_url(url: str) -> str:
|
|
||||||
"""Ensure URL does not end with a trailing slash."""
|
|
||||||
# I needed this for realy old Redfish versions :S (<1.6.0)
|
|
||||||
if url.endswith("/"):
|
|
||||||
return url[:-1] # Remove trailing slash
|
|
||||||
return url
|
|
||||||
|
|
||||||
|
|
||||||
async def get_power_data(session, host: HostConfig):
|
|
||||||
"""Query Redfish for power data and update Prometheus metrics"""
|
|
||||||
if host.should_skip():
|
|
||||||
logging.warning(
|
|
||||||
"Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
|
|
||||||
)
|
|
||||||
UP_GAUGE.labels(host=host.fqdn).set(0)
|
|
||||||
return
|
|
||||||
|
|
||||||
# Start time measurement
|
|
||||||
start = time.monotonic()
|
|
||||||
# Get Root ressources
|
|
||||||
resources = await discover_redfish_resources(session, host)
|
|
||||||
if not resources or not resources.chassis:
|
|
||||||
logging.error("Could not discover any resources for %s", host.fqdn)
|
|
||||||
host.mark_failure()
|
|
||||||
UP_GAUGE.labels(host=host.fqdn).set(0)
|
|
||||||
return
|
|
||||||
|
|
||||||
host.mark_success()
|
|
||||||
UP_GAUGE.labels(host=host.fqdn).set(1)
|
|
||||||
|
|
||||||
chassis_url = resources.get("Chassis")
|
|
||||||
chassis_data = await fetch_with_retry(session, host, chassis_url)
|
|
||||||
if not chassis_data:
|
|
||||||
host.mark_failure()
|
|
||||||
UP_GAUGE.labels(host=host.fqdn).set(0)
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
for chassis_member in chassis_data.get("Members", []):
|
|
||||||
chassis_member_url = chassis_member.get("@odata.id")
|
|
||||||
if not chassis_member_url:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Normalize URL... I needed this for realy old Redfish versions :S (<1.6.0)
|
|
||||||
chassis_member_url = normalize_url(chassis_member_url)
|
|
||||||
# Get chassis id from url ("/redfish/v1/Chassis/1" -> 1)
|
|
||||||
chassis_member_id = chassis_member_url.split("/")[-1]
|
|
||||||
# Check if the chassis id is in config (had problem with chassis "NVMe")
|
|
||||||
if hasattr(host, "chassis") and host.chassis:
|
|
||||||
if chassis_member_id not in host.chassis:
|
|
||||||
continue
|
|
||||||
|
|
||||||
member_url = f"https://{host.fqdn}{chassis_member_url}"
|
|
||||||
member_data = await fetch_with_retry(session, host, member_url)
|
|
||||||
if not member_data:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get Power ressource (fallback to "Power")
|
|
||||||
power_resource_url, power_resource_type = get_power_resource_info(member_data, host.fqdn)
|
|
||||||
if not power_resource_url:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get Power Data
|
|
||||||
power_data = await fetch_with_retry(session, host, power_resource_url)
|
|
||||||
if not power_data:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get PowerSupplies, depend on ressource type ("Power" or "PowerSubsystem")
|
|
||||||
if power_resource_type == "PowerSubsystem":
|
|
||||||
# Request PowerSupplies url (for PowerSubsystem)
|
|
||||||
power_supplies_url = power_data.get("PowerSupplies", {}).get("@odata.id")
|
|
||||||
if not power_supplies_url:
|
|
||||||
logging.warning("No PowerSupplies found for %s", host.fqdn)
|
|
||||||
continue
|
|
||||||
|
|
||||||
power_supplies_url = f"https://{host.fqdn}{power_supplies_url}"
|
|
||||||
power_supplies_data = await fetch_with_retry(session, host, power_supplies_url)
|
|
||||||
if not power_supplies_data:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# loop over Members for "PowerSubsystem"
|
|
||||||
for psu_member in power_supplies_data.get("Members", []):
|
|
||||||
psu_url = psu_member.get("@odata.id")
|
|
||||||
if not psu_url:
|
|
||||||
continue
|
|
||||||
|
|
||||||
psu_url = f"https://{host.fqdn}{psu_url}"
|
|
||||||
psu_data = await fetch_with_retry(session, host, psu_url)
|
|
||||||
if not psu_data:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Process PowerSupplies object
|
|
||||||
metrics = await process_power_supply(session, host, psu_data, "PowerSubsystem")
|
|
||||||
if metrics:
|
|
||||||
update_prometheus_metrics(host, metrics)
|
|
||||||
|
|
||||||
elif power_resource_type == "Power":
|
|
||||||
# Loop over PowerSupplies for older Redfish versions
|
|
||||||
for psu in power_data.get("PowerSupplies", []):
|
|
||||||
# Process PowerSupplies object
|
|
||||||
metrics = await process_power_supply(session, host, psu, "Power")
|
|
||||||
if metrics:
|
|
||||||
update_prometheus_metrics(host, metrics)
|
|
||||||
|
|
||||||
else:
|
|
||||||
logging.error("Unknown power resource type for host %s", host.fqdn)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Measure request and process latency
|
|
||||||
REQUEST_LATENCY.labels(host=host.fqdn).observe(time.monotonic() - start)
|
|
||||||
|
|
||||||
def update_prometheus_metrics(host: HostConfig, metrics: PowerMetrics):
|
|
||||||
"""Update Prometheus metrics with PowerMetrics data."""
|
|
||||||
if metrics.voltage is not None and metrics.serial:
|
|
||||||
VOLTAGE_GAUGE.labels(host=host.fqdn, psu_serial=metrics.serial).set(metrics.voltage)
|
|
||||||
if metrics.watts is not None and metrics.serial:
|
|
||||||
WATTS_GAUGE.labels(host=host.fqdn, psu_serial=metrics.serial).set(metrics.watts)
|
|
||||||
if metrics.amps is not None and metrics.serial:
|
|
||||||
AMPS_GAUGE.labels(host=host.fqdn, psu_serial=metrics.serial).set(metrics.amps)
|
|
||||||
|
|
||||||
|
|
||||||
async def get_system_info(session, host: HostConfig):
|
|
||||||
"""Query Redfish for system data and update Prometheus metrics"""
|
|
||||||
if host.should_skip():
|
|
||||||
logging.warning(
|
|
||||||
"Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
# Get Redfish Version
|
|
||||||
root_url = f"https://{host.fqdn}/redfish/v1/"
|
|
||||||
root_data = await fetch_with_retry(session, host, root_url)
|
|
||||||
if not root_data:
|
|
||||||
host.mark_failure()
|
|
||||||
return
|
|
||||||
|
|
||||||
redfish_version = root_data.get("RedfishVersion")
|
|
||||||
# Get Manufacturer, Serial and Model
|
|
||||||
systems_url = f"https://{host.fqdn}/redfish/v1/Systems/"
|
|
||||||
systems_data = await fetch_with_retry(session, host, systems_url)
|
|
||||||
if not systems_data:
|
|
||||||
host.mark_failure()
|
|
||||||
return
|
|
||||||
|
|
||||||
# loop for each system members
|
|
||||||
for system_member in systems_data.get("Members", []):
|
|
||||||
system_url = system_member.get("@odata.id")
|
|
||||||
if not system_url:
|
|
||||||
continue
|
|
||||||
|
|
||||||
system_data = await fetch_with_retry(
|
|
||||||
session, host, f"https://{host.fqdn}{system_url}"
|
|
||||||
)
|
|
||||||
if not system_data:
|
|
||||||
continue
|
|
||||||
|
|
||||||
manufacturer = system_data.get("Manufacturer")
|
|
||||||
model = system_data.get("Model")
|
|
||||||
serial_number = system_data.get("SerialNumber")
|
|
||||||
|
|
||||||
# Hier könnte ihre Werbung stehen
|
|
||||||
SYSTEM_INFO.labels(host=host.fqdn).info(
|
|
||||||
{
|
|
||||||
"manufacturer": manufacturer,
|
|
||||||
"model": model,
|
|
||||||
"serial_number": serial_number,
|
|
||||||
"redfish_version": redfish_version,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
async def logout_host(session, host):
    """Terminate the Redfish session for *host* and clear its cached token.

    Sends a DELETE to the session's logout URL with the ``X-Auth-Token``
    header. This is best-effort cleanup: any network error is logged, never
    raised, and the cached token/logout URL are always cleared so the next
    polling cycle re-authenticates from scratch.

    Args:
        session: Shared ``aiohttp.ClientSession`` used for the request.
        host: Host object whose ``session.token`` / ``session.logout_url``
            hold the Redfish session state.
    """
    # Nothing to do if we never logged in (or already logged out).
    if not host.session.token or not host.session.logout_url:
        return
    try:
        async with session.delete(
            host.session.logout_url,
            headers={"X-Auth-Token": host.session.token},
            # BMCs commonly serve self-signed certs; verification is disabled.
            ssl=False,
            # FIX: aiohttp deprecated passing a bare number as ``timeout``
            # (removed in newer releases); use an explicit ClientTimeout.
            timeout=aiohttp.ClientTimeout(total=5),
        ) as resp:
            if resp.status in (200, 204):
                logging.info("Logged out from %s", host.fqdn)
            else:
                logging.warning(
                    "Logout failed for %s (HTTP %s)", host.fqdn, resp.status
                )
    except Exception as e:  # best-effort: never let logout abort shutdown
        logging.warning("Error during logout for %s: %s", host.fqdn, e)
    finally:
        # Always drop local session state so a later poll re-authenticates.
        host.session.token = None
        host.session.logout_url = None
|
|
||||||
|
|
||||||
|
|
||||||
async def run_exporter(config, stop_event):
    """Main polling loop: serve Prometheus metrics and query every host.

    Runs until *stop_event* is set, then logs out of all open Redfish
    sessions before returning.

    Args:
        config: Parsed YAML configuration. Keys used: ``port`` (default
            8000), ``username``, ``password``, ``chassis`` (global
            defaults), ``hosts`` (required list), ``interval`` (default 10).
        stop_event: ``asyncio.Event`` signalling shutdown.
    """
    port = config.get("port", 8000)
    default_username = config.get("username")
    default_password = config.get("password")
    default_chassis = config.get("chassis")
    hosts = config["hosts"]
    interval = config.get("interval", 10)

    # Start Prometheus metrics server
    start_http_server(port)
    logging.info("Prometheus metrics server running on port %s", port)

    # Create persistent HostConfig objects. Dict entries may override the
    # global credentials/chassis per host; plain strings inherit defaults.
    host_objs = []
    for host_entry in hosts:
        if isinstance(host_entry, dict):
            hc = HostConfig(
                fqdn=host_entry["fqdn"],
                username=host_entry.get("username", default_username),
                password=host_entry.get("password", default_password),
                chassis=host_entry.get("chassis", default_chassis),
            )
        else:
            hc = HostConfig(
                fqdn=host_entry, username=default_username, password=default_password
            )
        host_objs.append(hc)

    # Connection pooling with aiohttp
    connector = aiohttp.TCPConnector(limit_per_host=5, limit=50, ttl_dns_cache=300)
    async with aiohttp.ClientSession(connector=connector) as session:
        try:
            while not stop_event.is_set():
                # Fan out power + system-info queries for all hosts at once.
                tasks = []
                for hc in host_objs:
                    tasks.append(get_power_data(session, hc))
                    tasks.append(get_system_info(session, hc))
                await asyncio.gather(*tasks)
                await process_request(interval)
        finally:
            # Graceful shutdown: logout from Redfish sessions
            logging.info("Exporter stopping, logging out from Redfish sessions...")
            # BUGFIX: the session token lives on h.session.token (see
            # logout_host, which reads and clears host.session.token);
            # the flat h.session_token attribute used here before was
            # inconsistent with that and would never select any host.
            await asyncio.gather(
                *(logout_host(session, h) for h in host_objs if h.session.token)
            )
            logging.info("All sessions logged out.")
            logging.info("Exporter stopped cleanly.")
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
    """Asyncio entry point: parse CLI args, load config, run the exporter.

    Command-line ``--port`` and ``--interval`` override the values from the
    YAML config file given by ``--config`` (default ``config.yaml``).
    """
    parser = argparse.ArgumentParser(description="Redfish Prometheus Exporter")
    parser.add_argument("--config", default="config.yaml", help="Path to config file")
    parser.add_argument("--port", type=int, help="Override port from config file")
    parser.add_argument(
        "--interval", type=int, help="Override interval from config file"
    )
    args = parser.parse_args()

    # Load YAML config
    with open(args.config, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    # CLI flags take precedence over the config file.
    if args.port is not None:
        config["port"] = args.port
    if args.interval is not None:
        config["interval"] = args.interval

    stop_event = asyncio.Event()
    loop = asyncio.get_running_loop()
    # Handle SIGINT (Ctrl+C) and SIGTERM for a graceful shutdown.
    for sig in (signal.SIGINT, signal.SIGTERM):
        try:
            loop.add_signal_handler(sig, stop_event.set)
        except NotImplementedError:
            # BUGFIX: loop.add_signal_handler is not implemented on Windows
            # event loops; fall back to the classic signal module so the
            # exporter still shuts down cleanly there (the README advertises
            # Windows support).
            signal.signal(sig, lambda *_: stop_event.set())

    await run_exporter(config, stop_event)
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: run the async main() under a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
|
||||||
Reference in New Issue
Block a user