chore: bump to v0.3.0

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Happy <yesreply@happy.engineering>
docs: document metric availability and requirements in MetricsPage
2026-02-19 05:57:13 -05:00 · 2026-02-18 22:07:19 -05:00 · 2026-02-18 21:37:16 -05:00
4 changed files with 346 additions and 426 deletions
@@ -1,4 +1,4 @@
-version: "0.2.0"
+version: "0.3.0"
 name: headlamp-intel-gpu-plugin
 displayName: Intel GPU
 description: >-
@@ -15,7 +15,7 @@ license: Apache-2.0
 category: monitoring-logging
 homeURL: https://github.com/privilegedescalation/headlamp-intel-gpu-plugin
-appVersion: "0.2.0"
+appVersion: "0.3.0"
 keywords:
  - headlamp
@@ -46,7 +46,9 @@ links:
 changes:
  - kind: added
-    description: "Metrics page: real-time engine utilization, boost frequency, VRAM usage, and energy from Intel GPU device plugin Prometheus endpoint (port 9090)"
+    description: "Metrics page: document which metrics require what infrastructure (power via hwmon works out of the box; frequency and utilization need custom exporters)"
  - kind: added
    description: "Metrics page: real-time GPU power draw (W) and TDP via node-exporter i915 hwmon metrics in kube-prometheus-stack"
  - kind: changed
    description: "Sidebar label changed to intel-gpu"
  - kind: removed
@@ -69,7 +71,7 @@ changes:
    description: "App bar health badge: hidden when no Intel GPU plugin detected"
 annotations:
-  headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.2.0/headlamp-intel-gpu-plugin-0.2.0.tar.gz"
+  headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.3.0/headlamp-intel-gpu-plugin-0.3.0.tar.gz"
-  headlamp/plugin/archive-checksum: "sha256:404be582bd13c167f61785028eb6eb91dd621106cbe76038f2c071a576a1a442"
+  headlamp/plugin/archive-checksum: "sha256:fdc53099ee3123680f24fe4a319b753ca3d030aac31abd4e3f383221085c9c2d"
  headlamp/plugin/version-compat: ">=0.20.0"
  headlamp/plugin/distro-compat: "in-cluster,web,app"
@@ -1,6 +1,6 @@
 {
  "name": "headlamp-intel-gpu-plugin",
-  "version": "0.2.0",
+  "version": "0.3.0",
  "description": "Headlamp plugin for Intel GPU device plugin visibility and monitoring",
  "repository": {
    "type": "git",
@@ -1,16 +1,15 @@
 /**
- * Prometheus text format parser for Intel GPU device plugin metrics.
+ * Intel GPU metrics via Prometheus (kube-prometheus-stack).
 *
- * Fetches raw metrics from the Intel GPU device plugin pod (port 9090)
+ * The Intel i915/Xe GPU driver exposes hwmon sensors that node-exporter
- * via the Kubernetes API proxy and parses key metric families.
+ * scrapes automatically. We query Prometheus for:
 *   - node_hwmon_energy_joule_total  (chip_name="i915") → rate = power in W
 *   - node_hwmon_power_max_watt      (same chip)        → TDP
 *   - node_hwmon_chip_names          (chip_name="i915") → identify GPU chips
 *   - node_uname_info                                   → instance → nodename
 *
- * Metrics exposed by intel-gpu-plugin when enableMonitoring: true:
+ * Queries go through the Kubernetes API proxy to the in-cluster Prometheus
- *   gpu_i915_engine_active_ticks  — engine busy ticks (per card, engine)
+ * service: /api/v1/namespaces/monitoring/services/{svc}:{port}/proxy/...
 *   gpu_i915_engine_total_ticks   — engine total ticks (for utilization %)
 *   gpu_i915_energy_microjoules   — cumulative energy (µJ → power = delta/dt)
 *   gpu_i915_gt_boost_freq_mhz    — current GT boost frequency (MHz)
 *   gpu_i915_memory_local         — local (VRAM) memory usage (bytes)
 *   gpu_i915_memory_system        — system memory usage (bytes)
 */
 import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
@@ -19,239 +18,152 @@ import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
 // Types
 // ---------------------------------------------------------------------------
-export interface MetricSample {
+export interface GpuChipMetrics {
-  labels: Record<string, string>;
+  /** Kubernetes node name (e.g. "buttons") */
  value: number;
 }
 /**
 * One Prometheus metric family: every sample that shares a metric name,
 * together with the metadata taken from its `# HELP` / `# TYPE` lines.
 */
 export interface MetricFamily {
  /** Metric name; also the key under which the family is stored. */
  name: string;
  /** Text of the matching `# HELP` line, or '' when none matched this name. */
  help: string;
  /** Text of the matching `# TYPE` line, or '' when none matched this name. */
  type: string;
  /** Samples in the order they appeared in the scraped text. */
  samples: MetricSample[];
 }
 /** Parsed scrape output: metric families keyed by metric name. */
 export type ParsedMetrics = Map<string, MetricFamily>;
 export interface GpuNodeMetrics {
  /** Node name this metric set was fetched from (via plugin pod) */
  nodeName: string;
-  /** Pod name of the intel-gpu-plugin daemonset pod */
+  /** PCI chip address (e.g. "0000:09:01_0_0000:0a:00_0") */
-  podName: string;
+  chip: string;
-  /** Engine utilization per (card, engine): 0–100 */
+  /** node-exporter instance (IP:port) */
-  engineUtilization: Array<{ card: string; engine: string; pct: number }>;
+  instance: string;
-  /** Current GT boost frequency in MHz per card */
+  /** Current power draw in watts (rate of energy counter, null if unavailable) */
-  boostFreqMhz: Array<{ card: string; value: number }>;
+  powerWatts: number | null;
-  /** Local VRAM usage in bytes per card */
+  /** Maximum / TDP power in watts */
-  memoryLocalBytes: Array<{ card: string; value: number }>;
+  powerMaxWatts: number | null;
-  /** System memory usage in bytes per card */
+}
-  memorySystemBytes: Array<{ card: string; value: number }>;
+
-  /** Cumulative energy in µJ per card (raw counter; compute delta for power) */
+export interface GpuMetrics {
-  energyMicrojoules: Array<{ card: string; value: number }>;
+  chips: GpuChipMetrics[];
-  /** Raw parsed metric families for advanced use */
+  /** ISO timestamp of when metrics were fetched */
-  raw: ParsedMetrics;
+  fetchedAt: string;
 }
 // ---------------------------------------------------------------------------
-// Prometheus text format parser
+// Prometheus query helper
 // ---------------------------------------------------------------------------
-const LABEL_PAIR_RE = /(\w+)="([^"]*)"/g;
+interface PrometheusResult {
-
+  metric: Record<string, string>;
-function parseLabels(labelStr: string): Record<string, string> {
+  value: [number, string];
  const labels: Record<string, string> = {};
  let match: RegExpExecArray | null;
  const re = new RegExp(LABEL_PAIR_RE.source, 'g');
  while ((match = re.exec(labelStr)) !== null) {
    const key = match[1];
    const val = match[2];
    if (key && val !== undefined) {
      labels[key] = val;
    }
  }
  return labels;
 }
-export function parsePrometheusText(text: string): ParsedMetrics {
+interface PrometheusResponse {
-  const families = new Map<string, MetricFamily>();
+  status: string;
-  let currentName = '';
+  data: {
-  let currentHelp = '';
+    resultType: string;
-  let currentType = '';
+    result: PrometheusResult[];
  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim();
    if (!line) continue;
    if (line.startsWith('# HELP ')) {
      const rest = line.slice(7);
      const spaceIdx = rest.indexOf(' ');
      currentName = spaceIdx >= 0 ? rest.slice(0, spaceIdx) : rest;
      currentHelp = spaceIdx >= 0 ? rest.slice(spaceIdx + 1) : '';
      continue;
    }
    if (line.startsWith('# TYPE ')) {
      const rest = line.slice(7);
      const spaceIdx = rest.indexOf(' ');
      currentType = spaceIdx >= 0 ? rest.slice(spaceIdx + 1) : '';
      continue;
    }
    if (line.startsWith('#')) continue;
    const openBrace = line.indexOf('{');
    const closeBrace = line.lastIndexOf('}');
    let metricName: string;
    let labels: Record<string, string>;
    let valuePart: string;
    if (openBrace >= 0 && closeBrace > openBrace) {
      metricName = line.slice(0, openBrace);
      labels = parseLabels(line.slice(openBrace + 1, closeBrace));
      valuePart = line.slice(closeBrace + 1).trim();
    } else {
      const spaceIdx = line.lastIndexOf(' ');
      if (spaceIdx < 0) continue;
      metricName = line.slice(0, spaceIdx);
      labels = {};
      valuePart = line.slice(spaceIdx + 1).trim();
    }
    const valueTokens = valuePart.split(' ');
    const valueStr = valueTokens[0] ?? '';
    const value = parseFloat(valueStr);
    if (!Number.isFinite(value)) continue;
    const familyKey = metricName;
    let family = families.get(familyKey);
    if (!family) {
      family = {
        name: familyKey,
        help: metricName === currentName ? currentHelp : '',
        type: metricName === currentName ? currentType : '',
        samples: [],
      };
      families.set(familyKey, family);
    }
    family.samples.push({ labels, value });
  }
  return families;
 }
 // ---------------------------------------------------------------------------
 // Extract Intel GPU metrics from the parsed map
 // ---------------------------------------------------------------------------
 /**
 * Looks up the samples recorded for the metric family `name`.
 *
 * @param families parsed metric families keyed by metric name
 * @param name metric family to look up
 * @returns the family's sample array, or an empty array when the family is absent
 */
 function samplesFor(families: ParsedMetrics, name: string): MetricSample[] {
  const family = families.get(name);
  if (family == null) return [];
  return family.samples ?? [];
 }
 /**
 * Builds a per-node GPU metrics summary from parsed Prometheus metric families.
 *
 * Engine utilization is `active ticks / total ticks` per (card, engine),
 * rounded to a whole percent and capped at 100; it is 0 when no positive
 * total is found for the pair. The card is read from the `card` label, then
 * `gpu`, defaulting to "gpu0"; the engine defaults to "render/0". The same
 * defaults are applied to active AND total samples so that samples lacking
 * these labels still pair up (previously only the active side was defaulted,
 * so unlabelled samples never matched and always reported 0%).
 *
 * @param families parsed metric families keyed by metric name
 * @param nodeName node name copied into the result
 * @param podName plugin pod name copied into the result
 * @returns utilization, frequency, memory and energy readings plus the raw families
 */
 export function extractGpuNodeMetrics(
  families: ParsedMetrics,
  nodeName: string,
  podName: string
 ): GpuNodeMetrics {
  type Sample = { labels: Record<string, string>; value: number };
  // Single definition of the label fallbacks so every metric resolves cards identically.
  const cardOf = (s: Sample): string => s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0';
  const engineOf = (s: Sample): string => s.labels['engine'] ?? 'render/0';
  const pairKey = (s: Sample): string => `${cardOf(s)}\u0000${engineOf(s)}`;
  const perCard = (metric: string): Array<{ card: string; value: number }> => {
    const samples: Sample[] = samplesFor(families, metric);
    return samples.map((s: Sample) => ({ card: cardOf(s), value: s.value }));
  };
  // Index totals once by (card, engine) instead of scanning them for every active sample.
  const totals: Sample[] = samplesFor(families, 'gpu_i915_engine_total_ticks');
  const totalByPair = new Map<string, number>();
  for (const total of totals) {
    const key = pairKey(total);
    // Keep the first sample per pair, matching the earlier first-match lookup.
    if (!totalByPair.has(key)) totalByPair.set(key, total.value);
  }
  // Build utilization: active/total per (card, engine)
  const actives: Sample[] = samplesFor(families, 'gpu_i915_engine_active_ticks');
  const engineUtilization: GpuNodeMetrics['engineUtilization'] = actives.map((active: Sample) => {
    const total = totalByPair.get(pairKey(active)) ?? 0;
    const pct = total > 0 ? Math.min(100, Math.round((active.value / total) * 100)) : 0;
    return { card: cardOf(active), engine: engineOf(active), pct };
  });
  return {
    nodeName,
    podName,
    engineUtilization,
    boostFreqMhz: perCard('gpu_i915_gt_boost_freq_mhz'),
    memoryLocalBytes: perCard('gpu_i915_memory_local'),
    memorySystemBytes: perCard('gpu_i915_memory_system'),
    energyMicrojoules: perCard('gpu_i915_energy_microjoules'),
    raw: families,
  };
 }
 // ---------------------------------------------------------------------------
 // Fetch metrics from an Intel GPU device plugin pod
 // ---------------------------------------------------------------------------
 /**
- * Fetches and parses Prometheus metrics from an Intel GPU device plugin pod.
+ * Service discovery: find the Prometheus service.
- *
+ * Tries the kube-prometheus-stack default name; falls back to prometheus-operated, then prometheus.
 * The proxy path is:
 *   /api/v1/namespaces/{namespace}/pods/{podName}:9090/proxy/metrics
 *
 * Returns null if the pod is not exposing metrics (enableMonitoring: false)
 * or if the proxy request fails.
 */
-export async function fetchGpuPluginMetrics(
+const PROMETHEUS_SERVICES = [
-  podName: string,
+  { namespace: 'monitoring', service: 'kube-prometheus-stack-prometheus', port: '9090' },
-  namespace: string,
+  { namespace: 'monitoring', service: 'prometheus-operated', port: '9090' },
-  nodeName: string
+  { namespace: 'monitoring', service: 'prometheus', port: '9090' },
-): Promise<GpuNodeMetrics | null> {
+];
  const path = `/api/v1/namespaces/${namespace}/pods/${podName}:9090/proxy/metrics`;
-  try {
+async function queryPrometheus(
-    const raw: unknown = await ApiProxy.request(path, {
+  query: string,
-      method: 'GET',
+  prometheusPath: string
-      isJSON: false,
+): Promise<PrometheusResult[]> {
-    });
+  const encoded = encodeURIComponent(query);
  const path = `${prometheusPath}/api/v1/query?query=${encoded}`;
-    if (typeof raw !== 'string') return null;
+  const raw = await ApiProxy.request(path, { method: 'GET' }) as PrometheusResponse;
-    const families = parsePrometheusText(raw);
+  if (raw?.status !== 'success') return [];
-    return extractGpuNodeMetrics(families, nodeName, podName);
+  return raw.data?.result ?? [];
-  } catch {
+}
-    return null;
+
 /**
 * Probes each entry of PROMETHEUS_SERVICES in order through the Kubernetes
 * API service proxy and returns the proxy base path of the first one that
 * answers a trivial query (`query=1`) with status "success".
 *
 * A failed probe request is ignored so the next candidate can be tried.
 *
 * @returns the proxy base path of a reachable Prometheus, or null when none answered
 */
 async function findPrometheusPath(): Promise<string | null> {
  for (const candidate of PROMETHEUS_SERVICES) {
    const proxyRoot = `/api/v1/namespaces/${candidate.namespace}/services/${candidate.service}:${candidate.port}/proxy`;
    const probeUrl = `${proxyRoot}/api/v1/query?query=1`;
    let status: string | undefined;
    try {
      const reply = (await ApiProxy.request(probeUrl, { method: 'GET' })) as PrometheusResponse;
      status = reply?.status;
    } catch {
      // Probe failed for this candidate; move on to the next one.
      continue;
    }
    if (status === 'success') return proxyRoot;
  }
  return null;
 }
 // ---------------------------------------------------------------------------
 // Metrics fetch
 // ---------------------------------------------------------------------------
 /**
 * Fetches Intel GPU power metrics from the in-cluster Prometheus.
 *
 * Locates Prometheus via findPrometheusPath(), then runs four instant queries
 * concurrently: i915 chip identification, the 5-minute rate of the energy
 * counter (current power in W), the maximum power, and node_uname_info
 * (instance to node name). One entry is produced per identified i915 chip.
 *
 * Power readings are matched to chips by (instance, chip), the same label
 * pair the PromQL joins use; keying by chip alone lets identical chip labels
 * on different nodes overwrite one another. Values that do not parse to a
 * finite number are reported as null rather than NaN/Infinity.
 *
 * @returns the collected metrics, or null when no Prometheus service is reachable
 * @throws rejects if any of the four queries rejects
 */
 export async function fetchGpuMetrics(): Promise<GpuMetrics | null> {
  const prometheusPath = await findPrometheusPath();
  if (!prometheusPath) return null;
  // Run queries in parallel
  const [chipResults, energyRateResults, powerMaxResults, unameResults] = await Promise.all([
    // i915 chip identification
    queryPrometheus('node_hwmon_chip_names{chip_name="i915"}', prometheusPath),
    // Current power (rate of cumulative energy counter)
    queryPrometheus(
      'rate(node_hwmon_energy_joule_total[5m]) * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}',
      prometheusPath
    ),
    // TDP / max power
    queryPrometheus(
      'node_hwmon_power_max_watt * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}',
      prometheusPath
    ),
    // instance → nodename mapping
    queryPrometheus('node_uname_info', prometheusPath),
  ]);
  // A chip label is only unique within one node-exporter instance, so the
  // lookup key must carry both labels.
  const chipKey = (instance: string, chip: string): string => `${instance}\u0000${chip}`;
  // Index a query result by (instance, chip), keeping only finite readings.
  const finiteByChip = (results: typeof chipResults): Map<string, number> => {
    const byChip = new Map<string, number>();
    for (const r of results) {
      const chip = r.metric['chip'];
      if (!chip) continue;
      const reading = Number.parseFloat(r.value[1]);
      if (Number.isFinite(reading)) {
        byChip.set(chipKey(r.metric['instance'] ?? '', chip), reading);
      }
    }
    return byChip;
  };
  const powerByChip = finiteByChip(energyRateResults);
  const maxPowerByChip = finiteByChip(powerMaxResults);
  // Build instance → nodename map
  const instanceToNode = new Map<string, string>();
  for (const r of unameResults) {
    const inst = r.metric['instance'];
    if (!inst) continue;
    instanceToNode.set(inst, r.metric['nodename'] ?? r.metric['node'] ?? inst);
  }
  // Assemble per-chip metrics from the chip identification results
  const chips: GpuChipMetrics[] = chipResults.map(r => {
    const chip = r.metric['chip'] ?? '';
    const instance = r.metric['instance'] ?? '';
    const key = chipKey(instance, chip);
    return {
      // Fall back to the raw instance when node_uname_info has no entry for it.
      nodeName: instanceToNode.get(instance) ?? instance,
      chip,
      instance,
      powerWatts: powerByChip.get(key) ?? null,
      powerMaxWatts: maxPowerByChip.get(key) ?? null,
    };
  });
  return {
    chips,
    fetchedAt: new Date().toISOString(),
  };
 }
 // ---------------------------------------------------------------------------
 // Formatting helpers
 // ---------------------------------------------------------------------------
-export function formatBytes(bytes: number): string {
+export function formatWatts(w: number): string {
-  if (bytes >= 1e9) return `${(bytes / 1e9).toFixed(1)} GB`;
+  return `${w.toFixed(1)} W`;
  if (bytes >= 1e6) return `${(bytes / 1e6).toFixed(1)} MB`;
  if (bytes >= 1e3) return `${(bytes / 1e3).toFixed(1)} KB`;
  return `${bytes} B`;
 }
-export function formatFreq(mhz: number): string {
+export function formatPercent(used: number, max: number): string {
-  return `${Math.round(mhz)} MHz`;
+  if (max <= 0) return '—';
  return `${Math.round((used / max) * 100)}%`;
 }
@@ -1,9 +1,29 @@
 /**
- * MetricsPage — real-time Intel GPU metrics from the device plugin pods.
+ * MetricsPage — Intel GPU metrics from Prometheus (node-exporter hwmon).
 *
- * Fetches Prometheus metrics from each Intel GPU device plugin pod (port 9090)
+ * METRIC AVAILABILITY
- * and displays per-card engine utilization, GPU frequency, memory usage,
+ * -------------------
- * and cumulative energy. Requires `enableMonitoring: true` in GpuDevicePlugin.
+ * Power (current W, TDP)
 *   Source:   node_hwmon_energy_joule_total, node_hwmon_power_max_watt
 *   Driver:   i915 hwmon sysfs (/sys/class/drm/card{N}/device/hwmon/)
 *   Scraped:  node-exporter hwmon collector (enabled by default)
 *   Nodes:    Discrete GPU nodes only (i915 driver exposes hwmon; iGPU driver does not)
 *   No extra config required — works out of the box with kube-prometheus-stack.
 *
 * GPU Frequency (current, boost, min, max MHz)
 *   Source:   DRM sysfs (/sys/class/drm/card{N}/gt_{x}_freq_mhz)
 *   Driver:   i915 kernel driver
 *   Scraped:  NOT available -- node-exporter --collector.drm is AMD-only and does not
 *             read i915 gt_freq sysfs files. Would require a custom exporter or
 *             node-exporter textfile collector sidecar writing these values.
 *
 * GPU Utilization (engine busy %)
 *   Source:   Not exposed via hwmon or any standard Prometheus collector for i915.
 *             Would require intel-gpu-top, XPU Manager, or a custom DRM-based exporter.
 *
 * Integrated GPU (iGPU) nodes
 *   The iGPU driver does not expose hwmon sensors. No Prometheus metrics are
 *   available for iGPU nodes regardless of configuration.
 */
 import {
@@ -15,141 +35,140 @@ import {
 } from '@kinvolk/headlamp-plugin/lib/CommonComponents';
 import React, { useCallback, useEffect, useState } from 'react';
 import { useIntelGpuContext } from '../api/IntelGpuDataContext';
-import {
+import { fetchGpuMetrics, formatPercent, formatWatts, GpuChipMetrics, GpuMetrics } from '../api/metrics';
  fetchGpuPluginMetrics,
  formatBytes,
  formatFreq,
  GpuNodeMetrics,
 } from '../api/metrics';
 import { IntelGpuPod } from '../api/k8s';
 // ---------------------------------------------------------------------------
-// Utilization bar
+// Power bar
 // ---------------------------------------------------------------------------
-function UtilizationBar({ pct }: { pct: number }) {
+function PowerBar({ watts, maxWatts }: { watts: number; maxWatts: number | null }) {
-  const color = pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
+  const pct = maxWatts && maxWatts > 0 ? Math.min(100, Math.round((watts / maxWatts) * 100)) : null;
  const color = pct === null ? '#0071c5' : pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
  return (
    <div style={{ display: 'flex', alignItems: 'center', gap: '8px' }}>
-      <div
+      {pct !== null && (
        style={{
          width: '100px',
          height: '8px',
          backgroundColor: '#e0e0e0',
          borderRadius: '4px',
          overflow: 'hidden',
          flexShrink: 0,
        }}
      >
        <div
          style={{
-            width: `${pct}%`,
+            width: '100px',
-            height: '100%',
+            height: '8px',
-            backgroundColor: color,
+            backgroundColor: '#e0e0e0',
            borderRadius: '4px',
-            transition: 'width 0.3s ease',
+            overflow: 'hidden',
            flexShrink: 0,
          }}
-        />
+        >
-      </div>
+          <div
-      <span style={{ fontSize: '12px', fontVariantNumeric: 'tabular-nums' }}>{pct}%</span>
+            style={{
              width: `${pct}%`,
              height: '100%',
              backgroundColor: color,
              borderRadius: '4px',
              transition: 'width 0.4s ease',
            }}
          />
        </div>
      )}
      <span style={{ fontSize: '13px', fontVariantNumeric: 'tabular-nums' }}>
        {formatWatts(watts)}
        {maxWatts !== null && maxWatts > 0 && (
          <span style={{ color: '#888', marginLeft: '4px' }}>
            / {formatWatts(maxWatts)} ({formatPercent(watts, maxWatts)})
          </span>
        )}
      </span>
    </div>
  );
 }
 // ---------------------------------------------------------------------------
-// Per-node metrics card
+// Per-chip card
 // ---------------------------------------------------------------------------
-function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) {
+function GpuChipCard({ chip }: { chip: GpuChipMetrics }) {
-  const { nodeName, podName, engineUtilization, boostFreqMhz, memoryLocalBytes, memorySystemBytes, energyMicrojoules } = metrics;
+  const rows: Array<{ name: string; value: React.ReactNode }> = [
    { name: 'Node', value: chip.nodeName },
    { name: 'GPU (PCI)', value: chip.chip },
    {
      name: 'Current Power',
      value: chip.powerWatts !== null
        ? <PowerBar watts={chip.powerWatts} maxWatts={chip.powerMaxWatts} />
        : <StatusLabel status="warning">No data — needs ≥5m of scrape history</StatusLabel>,
    },
  ];
-  // Group engines by card
+  if (chip.powerMaxWatts !== null && chip.powerMaxWatts > 0) {
-  const byCard = new Map<string, typeof engineUtilization>();
+    rows.push({ name: 'TDP', value: formatWatts(chip.powerMaxWatts) });
  for (const e of engineUtilization) {
    if (!byCard.has(e.card)) byCard.set(e.card, []);
    byCard.get(e.card)!.push(e);
  }
  const freqByCard = new Map(boostFreqMhz.map(f => [f.card, f.value]));
  const memLocalByCard = new Map(memoryLocalBytes.map(m => [m.card, m.value]));
  const memSysByCard = new Map(memorySystemBytes.map(m => [m.card, m.value]));
  const energyByCard = new Map(energyMicrojoules.map(e => [e.card, e.value]));
  const cards = Array.from(
    new Set([
      ...byCard.keys(),
      ...freqByCard.keys(),
      ...memLocalByCard.keys(),
    ])
  ).sort();
  if (cards.length === 0) {
    return (
      <SectionBox title={`${nodeName} — No Metric Data`}>
        <NameValueTable
          rows={[
            {
              name: 'Pod',
              value: podName,
            },
            {
              name: 'Note',
              value: 'No GPU metrics found. Ensure enableMonitoring: true is set in GpuDevicePlugin.',
            },
          ]}
        />
      </SectionBox>
    );
  }
  return (
-    <>
+    <SectionBox title={`${chip.nodeName} — ${chip.chip}`}>
-      {cards.map(card => {
+      <NameValueTable rows={rows} />
-        const engines = byCard.get(card) ?? [];
+    </SectionBox>
-        const freq = freqByCard.get(card);
+  );
-        const memLocal = memLocalByCard.get(card);
+}
        const memSys = memSysByCard.get(card);
        const energy = energyByCard.get(card);
-        const rows: Array<{ name: string; value: React.ReactNode }> = [
+// ---------------------------------------------------------------------------
-          { name: 'Node', value: nodeName },
+// Requirements info box
-          { name: 'Plugin Pod', value: podName },
+// ---------------------------------------------------------------------------
          { name: 'GPU Card', value: card },
        ];
-        if (freq !== undefined) {
+function MetricRequirements() {
-          rows.push({ name: 'Boost Frequency', value: formatFreq(freq) });
+  return (
-        }
+    <SectionBox title="Metric Availability">
-
+      <NameValueTable
-        if (memLocal !== undefined) {
+        rows={[
-          rows.push({ name: 'VRAM (local)', value: formatBytes(memLocal) });
+          {
-        }
+            name: 'Power (W)',
-        if (memSys !== undefined && memSys > 0) {
+            value: (
-          rows.push({ name: 'System Memory', value: formatBytes(memSys) });
+              <>
-        }
+                <StatusLabel status="success">Available — discrete GPU nodes</StatusLabel>
-
+                <div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
-        if (energy !== undefined) {
+                  Source: <code>node_hwmon_energy_joule_total</code> via node-exporter hwmon collector (enabled by default).
-          rows.push({
+                  Requires the i915 kernel driver on the node. iGPU nodes do not expose hwmon sensors.
-            name: 'Energy (cumulative)',
+                </div>
-            value: `${(energy / 1e6).toFixed(2)} J`,
+              </>
-          });
+            ),
-        }
+          },
-
+          {
-        // Engine utilization rows
+            name: 'Frequency (MHz)',
-        for (const e of engines) {
+            value: (
-          rows.push({
+              <>
-            name: `Engine: ${e.engine}`,
+                <StatusLabel status="error">Not available</StatusLabel>
-            value: <UtilizationBar pct={e.pct} />,
+                <div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
-          });
+                  i915 exposes <code>gt_*_freq_mhz</code> via DRM sysfs but node-exporter&apos;s{' '}
-        }
+                  <code>--collector.drm</code> flag is AMD-only and does not read these files.
-
+                  A custom exporter or textfile-collector sidecar writing these values would be required.
-        return (
+                </div>
-          <SectionBox key={`${nodeName}-${card}`} title={`${nodeName} — ${card}`}>
+              </>
-            <NameValueTable rows={rows} />
+            ),
-          </SectionBox>
+          },
-        );
+          {
-      })}
+            name: 'Utilization (%)',
-    </>
+            value: (
              <>
                <StatusLabel status="error">Not available</StatusLabel>
                <div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
                  No standard Prometheus collector exposes i915 engine busy percentage.
                  Would require intel-gpu-top, XPU Manager, or a custom DRM-based exporter.
                </div>
              </>
            ),
          },
          {
            name: 'iGPU nodes',
            value: (
              <>
                <StatusLabel status="error">No metrics available</StatusLabel>
                <div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
                  The integrated GPU driver does not expose hwmon sensors. No Prometheus metrics
                  are available for iGPU nodes regardless of configuration.
                </div>
              </>
            ),
          },
        ]}
      />
    </SectionBox>
  );
 }
@@ -158,38 +177,33 @@ function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) {
 // ---------------------------------------------------------------------------
 export default function MetricsPage() {
-  const { pluginPods, pluginInstalled, loading: ctxLoading } = useIntelGpuContext();
+  const { gpuNodes, loading: ctxLoading } = useIntelGpuContext();
-  const [metricsMap, setMetricsMap] = useState<Map<string, GpuNodeMetrics | 'error'>>(new Map());
+  const [metrics, setMetrics] = useState<GpuMetrics | null>(null);
  const [fetchError, setFetchError] = useState<string | null>(null);
  const [fetching, setFetching] = useState(false);
-  const fetchAll = useCallback(async (pods: IntelGpuPod[]) => {
+  const doFetch = useCallback(async () => {
    if (pods.length === 0) return;
    setFetching(true);
-
+    setFetchError(null);
-    const results = await Promise.all(
+    try {
-      pods.map(async pod => {
+      const result = await fetchGpuMetrics();
-        const name = pod.metadata.name;
+      setMetrics(result);
-        const namespace = pod.metadata.namespace ?? 'kube-system';
+      if (!result) {
-        const nodeName = pod.spec?.nodeName ?? name;
+        setFetchError('Could not reach Prometheus. Ensure kube-prometheus-stack is installed in the monitoring namespace.');
-        const result = await fetchGpuPluginMetrics(name, namespace, nodeName);
+      }
-        return { name, result };
+    } catch (e: unknown) {
-      })
+      setFetchError(e instanceof Error ? e.message : String(e));
-    );
+    } finally {
-
+      setFetching(false);
    const map = new Map<string, GpuNodeMetrics | 'error'>();
    for (const { name, result } of results) {
      map.set(name, result ?? 'error');
    }
    setMetricsMap(map);
    setFetching(false);
  }, []);
  useEffect(() => {
-    if (!ctxLoading && pluginPods.length > 0) {
+    if (!ctxLoading) {
-      void fetchAll(pluginPods);
+      void doFetch();
    }
-  }, [ctxLoading, pluginPods, fetchAll]);
+  }, [ctxLoading, doFetch]);
  if (ctxLoading) {
    return <Loader title="Loading Intel GPU data..." />;
@@ -200,8 +214,8 @@ export default function MetricsPage() {
      <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
        <SectionHeader title="Intel GPU — Metrics" />
        <button
-          onClick={() => void fetchAll(pluginPods)}
+          onClick={() => void doFetch()}
-          disabled={fetching || pluginPods.length === 0}
+          disabled={fetching}
          aria-label="Refresh metrics"
          style={{
            padding: '6px 16px',
@@ -218,94 +232,86 @@ export default function MetricsPage() {
        </button>
      </div>
-      {!pluginInstalled && (
+      <MetricRequirements />
-        <SectionBox title="Intel GPU Plugin Not Detected">
+
      {fetching && !metrics && <Loader title="Querying Prometheus for GPU metrics..." />}
      {fetchError && (
        <SectionBox title="Prometheus Unreachable">
          <NameValueTable
            rows={[
              {
-                name: 'Status',
+                name: 'Error',
-                value: (
+                value: <StatusLabel status="error">{fetchError}</StatusLabel>,
                  <StatusLabel status="warning">No Intel GPU device plugin pods found</StatusLabel>
                ),
              },
              {
-                name: 'Note',
+                name: 'Checked services',
-                value: 'Install the Intel GPU device plugin and set enableMonitoring: true to expose Prometheus metrics.',
+                value: 'kube-prometheus-stack-prometheus:9090, prometheus-operated:9090, prometheus:9090 (monitoring namespace)',
              },
            ]}
          />
        </SectionBox>
      )}
-      {pluginInstalled && pluginPods.length === 0 && (
+      {metrics && metrics.chips.length === 0 && (
-        <SectionBox title="No Plugin Pods Found">
+        <SectionBox title="No i915 Metrics in Prometheus">
          <NameValueTable
            rows={[
              {
                name: 'Status',
                value: (
                  <StatusLabel status="warning">Plugin detected via CRD but no pods found</StatusLabel>
                ),
              },
            ]}
          />
        </SectionBox>
      )}
      {pluginPods.length > 0 && metricsMap.size === 0 && fetching && (
        <Loader title="Fetching GPU metrics..." />
      )}
      {pluginPods.length > 0 && metricsMap.size === 0 && !fetching && (
        <SectionBox title="Metrics Unavailable">
          <NameValueTable
            rows={[
              {
                name: 'Status',
                value: (
                  <StatusLabel status="warning">
-                    Could not fetch metrics from any plugin pod
+                    Prometheus reachable — no node_hwmon_chip_names&#123;chip_name=&quot;i915&quot;&#125; found
                  </StatusLabel>
                ),
              },
              {
-                name: 'Requirements',
+                name: 'GPU Nodes',
-                value: 'Set enableMonitoring: true in GpuDevicePlugin spec and ensure port 9090 is accessible via kube-apiserver proxy.',
+                value: gpuNodes.length > 0 ? gpuNodes.map(n => n.metadata.name).join(', ') : 'None detected',
              },
              {
-                name: 'Plugin Pods Found',
+                name: 'Likely cause',
-                value: pluginPods.map(p => p.metadata.name).join(', '),
+                value: 'node-exporter is not running on the GPU nodes, or the hwmon collector is disabled.',
              },
            ]}
          />
        </SectionBox>
      )}
-      {Array.from(metricsMap.entries()).map(([podName, metrics]) => {
+      {metrics && metrics.chips.length > 0 && (
-        if (metrics === 'error') {
+        <>
-          return (
+          <SectionBox title="GPU Power Summary">
-            <SectionBox key={podName} title={`${podName} — Metrics Unavailable`}>
+            <NameValueTable
-              <NameValueTable
+              rows={[
-                rows={[
+                {
-                  {
+                  name: 'GPUs Monitored',
-                    name: 'Status',
+                  value: String(metrics.chips.length),
-                    value: (
+                },
-                      <StatusLabel status="error">
+                {
-                        Failed to fetch metrics from pod
+                  name: 'Total Power',
-                      </StatusLabel>
+                  value: (() => {
-                    ),
+                    const total = metrics.chips.reduce((s, c) => s + (c.powerWatts ?? 0), 0);
-                  },
+                    const maxTotal = metrics.chips.reduce((s, c) => s + (c.powerMaxWatts ?? 0), 0);
-                  {
+                    return <PowerBar watts={total} maxWatts={maxTotal > 0 ? maxTotal : null} />;
-                    name: 'Hint',
+                  })(),
-                    value: 'Ensure enableMonitoring: true is set in the GpuDevicePlugin CR and the pod is running.',
+                },
-                  },
+                {
-                ]}
+                  name: 'Last Fetched',
-              />
+                  value: new Date(metrics.fetchedAt).toLocaleTimeString(),
-            </SectionBox>
+                },
-          );
+                {
-        }
+                  name: 'Query',
-        return <NodeMetricsCard key={podName} metrics={metrics} />;
+                  value: 'rate(node_hwmon_energy_joule_total[5m]) joined with node_hwmon_chip_names{chip_name="i915"}',
-      })}
+                },
              ]}
            />
          </SectionBox>
          {metrics.chips.map(chip => (
            <GpuChipCard key={`${chip.instance}-${chip.chip}`} chip={chip} />
          ))}
        </>
      )}
    </>
  );
 }