From cc0ad5b28624c865efc84a52c357aa7fa1525b72 Mon Sep 17 00:00:00 2001 From: Chris Farhood Date: Wed, 18 Feb 2026 22:07:19 -0500 Subject: [PATCH] docs: document metric availability and requirements in MetricsPage Add a file-level comment and in-page requirements section explaining exactly what is and isn't available for each metric type: Power (W) -- available on discrete GPU nodes via node-exporter hwmon collector + i915 driver (no extra config) Frequency (MHz) -- NOT available; node-exporter --collector.drm is AMD-only and does not read i915 gt_freq sysfs Utilization (%) -- NOT available; no standard Prometheus collector supports i915 engine busy metrics iGPU nodes -- no metrics at all (iGPU driver has no hwmon) The in-page MetricRequirements component surfaces this information directly in the UI so operators know what to expect and why. Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude Co-Authored-By: Happy --- src/components/MetricsPage.tsx | 141 +++++++++++++++++++++++++-------- 1 file changed, 107 insertions(+), 34 deletions(-) diff --git a/src/components/MetricsPage.tsx b/src/components/MetricsPage.tsx index f1c301e..fc47ccc 100644 --- a/src/components/MetricsPage.tsx +++ b/src/components/MetricsPage.tsx @@ -1,9 +1,29 @@ /** - * MetricsPage — Intel GPU power metrics from Prometheus (node-exporter hwmon). + * MetricsPage — Intel GPU metrics from Prometheus (node-exporter hwmon). * - * The Intel i915/Xe GPU driver exposes hwmon sensors which node-exporter scrapes. - * This page queries kube-prometheus-stack for real-time GPU power draw - * (derived from node_hwmon_energy_joule_total rate) and TDP per GPU node. 
+ * METRIC AVAILABILITY + * ------------------- + * Power (current W, TDP) + * Source: node_hwmon_energy_joule_total, node_hwmon_power_max_watt + * Driver: i915 hwmon sysfs (/sys/class/drm/card{N}/device/hwmon/) + * Scraped: node-exporter hwmon collector (enabled by default) + * Nodes: Discrete GPU nodes only (i915 registers hwmon for discrete GPUs only; integrated GPUs get no hwmon device) + * No extra config required — works out of the box with kube-prometheus-stack. + * + * GPU Frequency (current, boost, min, max MHz) + * Source: DRM sysfs (/sys/class/drm/card{N}/gt_{x}_freq_mhz) + * Driver: i915 kernel driver + * Scraped: NOT available -- node-exporter --collector.drm is AMD-only and does not + * read i915 gt_freq sysfs files. Would require a custom exporter or + * node-exporter textfile collector sidecar writing these values. + * + * GPU Utilization (engine busy %) + * Source: Not exposed via hwmon or any standard Prometheus collector for i915. + * Would require intel-gpu-top, XPU Manager, or a custom DRM-based exporter. + * + * Integrated GPU (iGPU) nodes + * i915 does not expose hwmon sensors for integrated GPUs. No Prometheus metrics are + * available for iGPU nodes regardless of configuration. */ import { @@ -69,20 +89,14 @@ function GpuChipCard({ chip }: { chip: GpuChipMetrics }) { const rows: Array<{ name: string; value: React.ReactNode }> = [ { name: 'Node', value: chip.nodeName }, { name: 'GPU (PCI)', value: chip.chip }, + { + name: 'Current Power', + value: chip.powerWatts !== null + ? 
+ : No data — needs ≥5m of scrape history, + }, ]; - if (chip.powerWatts !== null) { - rows.push({ - name: 'Current Power', - value: , - }); - } else { - rows.push({ - name: 'Current Power', - value: No data (needs ≥5m of scrape history), - }); - } - if (chip.powerMaxWatts !== null && chip.powerMaxWatts > 0) { rows.push({ name: 'TDP', value: formatWatts(chip.powerMaxWatts) }); } @@ -94,6 +108,70 @@ function GpuChipCard({ chip }: { chip: GpuChipMetrics }) { ); } +// --------------------------------------------------------------------------- +// Requirements info box +// --------------------------------------------------------------------------- + +function MetricRequirements() { + return ( + + + Available — discrete GPU nodes +
+ Source: node_hwmon_energy_joule_total via node-exporter hwmon collector (enabled by default). + Requires the i915 kernel driver on the node. iGPU nodes do not expose hwmon sensors. +
+ + ), + }, + { + name: 'Frequency (MHz)', + value: ( + <> + Not available +
+ i915 exposes gt_*_freq_mhz via DRM sysfs but node-exporter's{' '} + --collector.drm flag is AMD-only and does not read these files. + A custom exporter or textfile-collector sidecar writing these values would be required. +
+ + ), + }, + { + name: 'Utilization (%)', + value: ( + <> + Not available +
+ No standard Prometheus collector exposes i915 engine busy percentage. + Would require intel-gpu-top, XPU Manager, or a custom DRM-based exporter. +
+ + ), + }, + { + name: 'iGPU nodes', + value: ( + <> + No metrics available +
+ The integrated GPU driver does not expose hwmon sensors. No Prometheus metrics + are available for iGPU nodes regardless of configuration. +
+ + ), + }, + ]} + /> +
+ ); +} + // --------------------------------------------------------------------------- // Main page // --------------------------------------------------------------------------- @@ -154,10 +232,12 @@ export default function MetricsPage() { + + {fetching && !metrics && } {fetchError && ( - + {fetchError}, }, { - name: 'Data Source', - value: 'node_hwmon_energy_joule_total (chip_name="i915") via kube-prometheus-stack', - }, - { - name: 'Requirements', - value: 'kube-prometheus-stack installed in monitoring namespace with node-exporter enabled', + name: 'Checked services', + value: 'kube-prometheus-stack-prometheus:9090, prometheus-operated:9090, prometheus:9090 (monitoring namespace)', }, ]} /> @@ -178,27 +254,24 @@ export default function MetricsPage() { )} {metrics && metrics.chips.length === 0 && ( - + - Prometheus is reachable but no i915 hwmon chips found + Prometheus reachable — no node_hwmon_chip_names{chip_name="i915"} found ), }, { - name: 'Note', - value: 'The i915 driver exposes hwmon sensors on discrete Intel GPU nodes. ' + - 'Ensure node-exporter is running on GPU nodes with hwmon collector enabled.', + name: 'GPU Nodes', + value: gpuNodes.length > 0 ? gpuNodes.map(n => n.metadata.name).join(', ') : 'None detected', }, { - name: 'GPU Nodes', - value: gpuNodes.length > 0 - ? gpuNodes.map(n => n.metadata.name).join(', ') - : 'None detected', + name: 'Likely cause', + value: 'node-exporter is not running on the GPU nodes, or the hwmon collector is disabled.', }, ]} /> @@ -227,8 +300,8 @@ export default function MetricsPage() { value: new Date(metrics.fetchedAt).toLocaleTimeString(), }, { - name: 'Data Source', - value: 'node-exporter hwmon · i915 driver · rate(node_hwmon_energy_joule_total[5m])', + name: 'Query', + value: 'rate(node_hwmon_energy_joule_total[5m]) joined with node_hwmon_chip_names{chip_name="i915"}', }, ]} />