diff --git a/python/redfish-api/Dockerfile b/python/redfish-api/Dockerfile
new file mode 100644
index 0000000..2b30ed6
--- /dev/null
+++ b/python/redfish-api/Dockerfile
@@ -0,0 +1,13 @@
+FROM python:3
+
+EXPOSE 8000
+
+WORKDIR /usr/src/app
+
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY redfish_exporter.py .
+COPY config.yaml .
+
+CMD [ "python", "./redfish_exporter.py" ]
diff --git a/python/redfish-api/README.md b/python/redfish-api/README.md
new file mode 100644
index 0000000..69de841
--- /dev/null
+++ b/python/redfish-api/README.md
@@ -0,0 +1,117 @@
+# Description
+
+I've created this Python script to collect power data and analyse Watts, Volts and Amperes. If there is a better solution, feel free to replace it.
+
+Usage:
+
+```
+usage: redfish_exporter.py [-h] [--config CONFIG] [--port PORT]
+
+Redfish Prometheus Exporter
+
+options:
+  -h, --help       show this help message and exit
+  --config CONFIG  Path to config file
+  --port PORT      Override port from config file
+```
+
+
+# Install
+
+## Requirements
+
+Dependencies:
+
+* see requirements.txt
+
+## Configuration
+
+Create `config.yaml`:
+
+```yaml
+---
+interval: 5
+port: 8000
+username: user1
+password: secret
+hosts:
+  - srv1-112.mgmt.wtb1.ch.abainfra.net
+  - srv2-112.mgmt.wtb1.ch.abainfra.net
+  - srv3-112.mgmt.wtb1.ch.abainfra.net
+  - srv4-112.mgmt.wtb1.ch.abainfra.net
+```
+
+or:
+
+```yaml
+---
+interval: 5
+port: 8000
+username: user1
+password: secret1
+hosts:
+  - fqdn: srv1-112.mgmt.wtb1.ch.abainfra.net
+    username: user2
+    password: secret2
+  - fqdn: srv2-112.mgmt.wtb1.ch.abainfra.net
+    username: user3
+    password: secret3
+  - fqdn: srv3-112.mgmt.wtb1.ch.abainfra.net
+    username: user4
+    password: secret4
+  - fqdn: srv4-112.mgmt.wtb1.ch.abainfra.net
+    username: user5
+    password: secret5
+```
+
+The `port` and `interval` settings are optional and can be overridden by command-line arguments. Sane default values are hardcoded.
+
+
+# Use as Container
+
+```
+docker build -t redfish_exporter .
+docker run -it --rm --name redfish_exporter_app -p 8000:8000 redfish_exporter:latest
+```
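+
+To let Prometheus scrape the exporter, a minimal scrape config could look like the snippet below; the job name and target are placeholders, adjust them to your environment:
+
+```yaml
+scrape_configs:
+  - job_name: redfish
+    static_configs:
+      - targets:
+          - "localhost:8000"
+```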
+
+# Legacy way
+
+```bash
+mkdir /srv/redfish-exporter
+```
+
+## Python dependencies
+
+```bash
+cd /srv/redfish-exporter
+python3 -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
+
+## Create user
+
+```bash
+sudo useradd -r -s /bin/false redfish
+```
+
+## Install systemd unit file
+
+```bash
+sudo cp redfish-exporter.service /etc/systemd/system/redfish-exporter.service
+sudo systemctl daemon-reload
+sudo systemctl enable --now redfish-exporter.service
+```
+
+# Useful one-liners
+
+## Public IP with curl
+
+```bash
+curl icanhazip.com
+curl -4 icanhazip.com
+curl -6 icanhazip.com
+
+curl 'https://api.ipify.org?format=json'
+curl 'https://api64.ipify.org?format=json'
+```
diff --git a/python/redfish-api/config.yaml.example b/python/redfish-api/config.yaml.example
new file mode 100644
index 0000000..f323556
--- /dev/null
+++ b/python/redfish-api/config.yaml.example
@@ -0,0 +1,14 @@
+---
+interval: 10
+port: 8000
+username: global-user
+password: global-password
+hosts:
+  - fqdn: host1.example.com
+    username: user1
+    password: secret1
+  - fqdn: host2.example.com
+    username: user2
+    password: secret2
+  - fqdn: host3.example.com
+  - fqdn: host4.example.com
diff --git a/python/redfish-api/get_power_redfishapi.py b/python/redfish-api/get_power_redfishapi.py
new file mode 100644
index 0000000..563700c
--- /dev/null
+++ b/python/redfish-api/get_power_redfishapi.py
@@ -0,0 +1,35 @@
+import requests
+import urllib3
+# import sys
+
+urllib3.disable_warnings()
+
+username = ""
+password = ""
+# host = sys.argv[1]
+
+
+def get_power_data(host):
+    """Redfish API Chassis Power"""
+    url = f"https://{host}.mgmt.wtb1.ch.abainfra.net/redfish/v1/Chassis/1/Power"
+    response = requests.get(url, auth=(username, password), verify=False, timeout=10)
+
+    if response.status_code == 200:
+        data = response.json()
+        for idx, psu in enumerate(data.get("PowerSupplies", [])):
+            line_input_v = psu.get("LineInputVoltage")
+            watts_input = psu.get("PowerInputWatts")
+            serial = psu.get("SerialNumber")
+            # guard against missing readings before deriving amps
+            amps = (
+                round(watts_input / line_input_v, 2)
+                if line_input_v and watts_input
+                else None
+            )
+            print(
+                f"PSU {idx}, {serial}: {host}, {line_input_v} V, {watts_input} W, {amps} A"
+            )
+    else:
+        print(f"Error {response.status_code}: {response.text}")
+
+
+# loop over each host
+hosts = [
+    "srv1-112",
+]
+for host in hosts:
+    get_power_data(host)
diff --git a/python/redfish-api/redfish-exporter.service b/python/redfish-api/redfish-exporter.service
new file mode 100644
index 0000000..0879114
--- /dev/null
+++ b/python/redfish-api/redfish-exporter.service
@@ -0,0 +1,14 @@
+[Unit]
+Description=Redfish Prometheus Exporter
+After=network.target
+
+[Service]
+Type=simple
+WorkingDirectory=/srv/redfish-exporter
+ExecStart=/srv/redfish-exporter/venv/bin/python redfish_exporter.py
+Restart=on-failure
+User=redfish
+Group=redfish
+
+[Install]
+WantedBy=multi-user.target
diff --git a/python/redfish-api/redfish_exporter.py b/python/redfish-api/redfish_exporter.py
new file mode 100644
index 0000000..89a6430
--- /dev/null
+++ b/python/redfish-api/redfish_exporter.py
@@ -0,0 +1,235 @@
+"""Simple Redfish exporter to collect power data from bare metal servers"""
+
+import argparse
+import signal
+import time
+import logging
+from dataclasses import dataclass, field
+import asyncio
+import aiohttp
+import urllib3
+import yaml
+from prometheus_client import Gauge, start_http_server, Summary, Counter, Histogram
+
+
+@dataclass
+class HostConfig:
+    """Per-host settings, grouped to avoid passing too many arguments around"""
+
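+    # fqdn plus credentials; username/password fall back to the global values
+    # from config.yaml when they are not set per host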
+    fqdn: str
+    username: str
+    password: str
+    max_retries: int = 1
+    backoff: int = 2
+    cool_down: int = 120  # seconds to wait after too many failures
+    failures: int = 0
+    next_retry_time: float = field(default=0.0, init=False)
+
+    def should_skip(self) -> bool:
+        """Check if host is still in cool-down window"""
+        return time.monotonic() < self.next_retry_time
+
+    def mark_failure(self):
+        """Increase failure counter and maybe trigger cool-down"""
+        self.failures += 1
+        if self.failures >= self.max_retries:
+            self.next_retry_time = time.monotonic() + self.cool_down
+            self.failures = 0  # reset after triggering cool-down
+
+    def mark_success(self):
+        """Reset failure counter after a successful request"""
+        self.failures = 0
+        self.next_retry_time = 0.0
+
+
+# Disable certificate warnings
+urllib3.disable_warnings()
+# set log config
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
+)
+
+# Prometheus metrics
+REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing request")
+REQUEST_LATENCY = Histogram(
+    "redfish_request_latency_seconds", "Time for Redfish request", ["host"]
+)
+up_gauge = Gauge("redfish_up", "Host up/down", ["host"])
+error_counter = Counter(
+    "redfish_errors_total", "Total Redfish errors", ["host", "error"]
+)
+voltage_gauge = Gauge(
+    "redfish_psu_line_input_voltage_volts",
+    "Line Input Voltage per PSU",
+    ["host", "psu_serial"],
+)
+watts_gauge = Gauge(
+    "redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"]
+)
+amps_gauge = Gauge(
+    "redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"]
+)
+
+
+@REQUEST_TIME.time()
+async def process_request(t):
+    """Sleep for t seconds (used as the scrape interval) while recording the duration"""
+    await asyncio.sleep(t)
+
+
+async def fetch_with_retry(session, host: HostConfig, url: str) -> dict | None:
+    """Fetch JSON from Redfish with retry/backoff"""
+    if host.should_skip():
+        logging.warning(
+            "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
+        )
+        up_gauge.labels(host=host.fqdn).set(0)
+        return None
+
+    for attempt in range(1, host.max_retries + 1):
+        try:
+            async with session.get(
+                url,
+                auth=aiohttp.BasicAuth(host.username, host.password),
+                ssl=False,
+                timeout=10,
+            ) as resp:
+                if resp.status == 200:
+                    host.mark_success()
+                    return await resp.json()
+                logging.warning(
+                    "HTTP %s from %s (attempt %d)", resp.status, host.fqdn, attempt
+                )
+
+        except asyncio.TimeoutError:
+            logging.warning("Timeout on %s (attempt %d)", host.fqdn, attempt)
+        except aiohttp.ClientError as e:
+            logging.warning(
+                "Client error on %s (attempt %d): %s", host.fqdn, attempt, e
+            )
+        except Exception as e:
+            logging.exception(
+                "Unexpected error on %s (attempt %d): %s", host.fqdn, attempt, e
+            )
+
+        if attempt < host.max_retries:
+            await asyncio.sleep(host.backoff * attempt)
+        else:
+            host.mark_failure()
+            logging.error("All retries failed for %s", host.fqdn)
+
+    return None
+
+
+async def get_power_data(session, host: HostConfig):
+    """Query Redfish and update Prometheus metrics"""
+    if host.should_skip():
+        logging.warning(
+            "Skipping %s (in cool-down until %.1f)", host.fqdn, host.next_retry_time
+        )
+        up_gauge.labels(host=host.fqdn).set(0)
+        return
+
+    url = f"https://{host.fqdn}/redfish/v1/Chassis/1/Power"
+    start = time.monotonic()
+
+    data = await fetch_with_retry(session, host, url)
+    if not data:
+        host.mark_failure()
+        up_gauge.labels(host=host.fqdn).set(0)
+        return
+
+    host.mark_success()
+    up_gauge.labels(host=host.fqdn).set(1)
+
+    for psu in data.get("PowerSupplies", []):
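+        # Readings may be missing on some PSUs, so amps are only derived when
+        # both voltage and wattage are present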
+        line_input_v = psu.get("LineInputVoltage")
+        watts_input = psu.get("PowerInputWatts")
+        serial = psu.get("SerialNumber")
+
+        amps = (
+            round(watts_input / line_input_v, 2)
+            if line_input_v and watts_input
+            else None
+        )
+
+        if line_input_v is not None:
+            voltage_gauge.labels(host=host.fqdn, psu_serial=serial).set(line_input_v)
+        if watts_input is not None:
+            watts_gauge.labels(host=host.fqdn, psu_serial=serial).set(watts_input)
+        if amps is not None:
+            amps_gauge.labels(host=host.fqdn, psu_serial=serial).set(amps)
+
+    REQUEST_LATENCY.labels(host=host.fqdn).observe(time.monotonic() - start)
+
+
+async def run_exporter(config, stop_event):
+    """Main loop"""
+    port = config.get("port", 8000)
+    default_username = config.get("username")
+    default_password = config.get("password")
+    hosts = config["hosts"]
+    interval = config.get("interval", 10)
+
+    # Start Prometheus metrics server
+    start_http_server(port)
+    logging.info("Prometheus metrics server running on port %s", port)
+
+    # create persistent HostConfig objects
+    host_objs = []
+    for host_entry in hosts:
+        if isinstance(host_entry, dict):
+            hc = HostConfig(
+                fqdn=host_entry["fqdn"],
+                username=host_entry.get("username", default_username),
+                password=host_entry.get("password", default_password),
+            )
+        else:
+            hc = HostConfig(
+                fqdn=host_entry, username=default_username, password=default_password
+            )
+        host_objs.append(hc)
+
+    # Connection pooling with aiohttp
+    connector = aiohttp.TCPConnector(limit_per_host=5, limit=50, ttl_dns_cache=300)
+    async with aiohttp.ClientSession(connector=connector) as session:
+        while not stop_event.is_set():
+            tasks = [get_power_data(session, hc) for hc in host_objs]
+
+            await asyncio.gather(*tasks)
+            await process_request(interval)
+
+    logging.info("Exporter stopped cleanly.")
+
+
+async def main():
+    """Modern asyncio entry point"""
+    parser = argparse.ArgumentParser(description="Redfish Prometheus Exporter")
+    parser.add_argument("--config", default="config.yaml", help="Path to config file")
+    parser.add_argument("--port", type=int, help="Override port from config file")
+    parser.add_argument(
+        "--interval", type=int, help="Override interval from config file"
+    )
+    args = parser.parse_args()
+
+    # Load YAML config
+    with open(args.config, "r", encoding="utf-8") as file:
+        config = yaml.safe_load(file)
+
+    # Override port/interval if arguments are provided
+    if args.port is not None:
+        config["port"] = args.port
+    if args.interval is not None:
+        config["interval"] = args.interval
+
+    stop_event = asyncio.Event()
+    loop = asyncio.get_running_loop()
+    # Handle SIGINT (Ctrl+C) and SIGTERM
+    for sig in (signal.SIGINT, signal.SIGTERM):
+        loop.add_signal_handler(sig, stop_event.set)
+
+    await run_exporter(config, stop_event)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/python/redfish-api/redfish_exporter_simple.py b/python/redfish-api/redfish_exporter_simple.py
new file mode 100644
index 0000000..ede4d61
--- /dev/null
+++ b/python/redfish-api/redfish_exporter_simple.py
@@ -0,0 +1,91 @@
+from prometheus_client import Gauge, start_http_server, Summary
+import requests
+import urllib3
+import random
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+urllib3.disable_warnings()
+
+username = ""
+password = ""
+
+# Create a metric to track time spent and requests made.
+REQUEST_TIME = Summary("request_processing_seconds", "Time spent processing request")
+
+
+# Decorate function with metric.
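+# In this simple exporter process_request() also serves as the pause between
+# scrape cycles; its duration is exported as request_processing_seconds.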
+@REQUEST_TIME.time()
+def process_request(t):
+    """A dummy function that takes some time."""
+    time.sleep(t)
+
+
+# Define Prometheus metrics
+voltage_gauge = Gauge(
+    "redfish_psu_line_input_voltage_volts",
+    "Line Input Voltage per PSU",
+    ["host", "psu_serial"],
+)
+watts_gauge = Gauge(
+    "redfish_psu_power_input_watts", "Power Input Watts per PSU", ["host", "psu_serial"]
+)
+amps_gauge = Gauge(
+    "redfish_psu_input_amps", "Current draw in Amps per PSU", ["host", "psu_serial"]
+)
+
+
+def get_power_data(fqdn):
+    """Redfish API Chassis Power"""
+    url = f"https://{fqdn}/redfish/v1/Chassis/1/Power"
+    try:
+        response = requests.get(
+            url, auth=(username, password), verify=False, timeout=10
+        )
+        response.raise_for_status()
+        data = response.json()
+        for psu in data.get("PowerSupplies", []):
+            line_input_v = psu.get("LineInputVoltage")
+            watts_input = psu.get("PowerInputWatts")
+            serial = psu.get("SerialNumber")
+
+            if line_input_v and watts_input:
+                amps = round(watts_input / line_input_v, 2)
+            else:
+                amps = None
+
+            # Push metrics
+            if line_input_v is not None:
+                voltage_gauge.labels(host=fqdn, psu_serial=serial).set(line_input_v)
+            if watts_input is not None:
+                watts_gauge.labels(host=fqdn, psu_serial=serial).set(watts_input)
+            if amps is not None:
+                amps_gauge.labels(host=fqdn, psu_serial=serial).set(amps)
+    except Exception as e:
+        print(f"Error querying {url}: {e}")
+
+
+if __name__ == "__main__":
+    # Start metrics server on port 8000
+    start_http_server(8000)
+    hosts = [
+        "srv1-119.mgmt.sgg1.ch.abainfra.net",
+    ]
+
+    # Thread pool for parallel requests
+    executor = ThreadPoolExecutor(max_workers=len(hosts))
+
+    while True:
+        futures = [executor.submit(get_power_data, fqdn) for fqdn in hosts]
+        # wait for all to finish
+        for future in futures:
+            future.result()
+        # for fqdn in hosts:
+        #     get_power_data(fqdn)
+        # Fixed scrape interval:
+        # time.sleep(30)  # scrape interval
+        # Random sleep between 0 and 5 seconds (inclusive):
+        # sleep_time = random.uniform(0, 5)
+        # process_request(sleep_time)
+        # Random sleep between 0 and 1 second:
+        process_request(random.random())
diff --git a/python/redfish-api/requirements.txt b/python/redfish-api/requirements.txt
new file mode 100644
index 0000000..6f5d0d5
--- /dev/null
+++ b/python/redfish-api/requirements.txt
@@ -0,0 +1,6 @@
+prometheus-client==0.23.1
+requests==2.32.5
+urllib3==2.5.0
+aiohttp==3.12.15
+PyYAML==6.0.2