chore: bump to v0.3.0

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Happy <yesreply@happy.engineering>
docs: document metric availability and requirements in MetricsPage
2026-02-19 05:57:13 -05:00 · 2026-02-18 22:07:19 -05:00 · 2026-02-18 21:37:16 -05:00
4 changed files with 346 additions and 426 deletions
@@ -1,4 +1,4 @@
-version: "0.2.0"
+version: "0.3.0"
 name: headlamp-intel-gpu-plugin
 displayName: Intel GPU
 description: >-
@@ -15,7 +15,7 @@ license: Apache-2.0
 category: monitoring-logging

 homeURL: https://github.com/privilegedescalation/headlamp-intel-gpu-plugin
-appVersion: "0.2.0"
+appVersion: "0.3.0"

 keywords:
  - headlamp
@@ -46,7 +46,9 @@ links:

 changes:
  - kind: added
-    description: "Metrics page: real-time engine utilization, boost frequency, VRAM usage, and energy from Intel GPU device plugin Prometheus endpoint (port 9090)"
+    description: "Metrics page: document which metrics require what infrastructure (power via hwmon works out of the box; frequency and utilization need custom exporters)"
+  - kind: added
+    description: "Metrics page: real-time GPU power draw (W) and TDP via node-exporter i915 hwmon metrics in kube-prometheus-stack"
  - kind: changed
    description: "Sidebar label changed to intel-gpu"
  - kind: removed
@@ -69,7 +71,7 @@ changes:
    description: "App bar health badge: hidden when no Intel GPU plugin detected"

 annotations:
-  headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.2.0/headlamp-intel-gpu-plugin-0.2.0.tar.gz"
-  headlamp/plugin/archive-checksum: "sha256:404be582bd13c167f61785028eb6eb91dd621106cbe76038f2c071a576a1a442"
+  headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.3.0/headlamp-intel-gpu-plugin-0.3.0.tar.gz"
+  headlamp/plugin/archive-checksum: "sha256:fdc53099ee3123680f24fe4a319b753ca3d030aac31abd4e3f383221085c9c2d"
  headlamp/plugin/version-compat: ">=0.20.0"
  headlamp/plugin/distro-compat: "in-cluster,web,app"
@@ -1,6 +1,6 @@
 {
  "name": "headlamp-intel-gpu-plugin",
-  "version": "0.2.0",
+  "version": "0.3.0",
  "description": "Headlamp plugin for Intel GPU device plugin visibility and monitoring",
  "repository": {
    "type": "git",
@@ -1,16 +1,15 @@
 /**
- * Prometheus text format parser for Intel GPU device plugin metrics.
+ * Intel GPU metrics via Prometheus (kube-prometheus-stack).
 *
- * Fetches raw metrics from the Intel GPU device plugin pod (port 9090)
- * via the Kubernetes API proxy and parses key metric families.
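+ * The Intel i915 GPU driver exposes hwmon sensors that node-exporter
+ * scrapes automatically. Only chips reporting chip_name="i915" are matched
+ * below, so a GPU whose hwmon chip reports another name (for example under
+ * the Xe driver) is not picked up. We query Prometheus for: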
+ *   - node_hwmon_energy_joule_total  (chip_name="i915") → rate = power in W
+ *   - node_hwmon_power_max_watt      (same chip)        → TDP
+ *   - node_hwmon_chip_names          (chip_name="i915") → identify GPU chips
+ *   - node_uname_info                                   → instance → nodename
 *
- * Metrics exposed by intel-gpu-plugin when enableMonitoring: true:
- *   gpu_i915_engine_active_ticks  — engine busy ticks (per card, engine)
- *   gpu_i915_engine_total_ticks   — engine total ticks (for utilization %)
- *   gpu_i915_energy_microjoules   — cumulative energy (µJ → power = delta/dt)
- *   gpu_i915_gt_boost_freq_mhz    — current GT boost frequency (MHz)
- *   gpu_i915_memory_local         — local (VRAM) memory usage (bytes)
- *   gpu_i915_memory_system        — system memory usage (bytes)
+ * Queries go through the Kubernetes API proxy to the in-cluster Prometheus
+ * service: /api/v1/namespaces/monitoring/services/{svc}:{port}/proxy/...
 */

 import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
@@ -19,239 +18,152 @@ import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
 // Types
 // ---------------------------------------------------------------------------

-export interface MetricSample {
-  labels: Record<string, string>;
-  value: number;
-}
-
-export interface MetricFamily {
-  name: string;
-  help: string;
-  type: string;
-  samples: MetricSample[];
-}
-
-export type ParsedMetrics = Map<string, MetricFamily>;
-
-export interface GpuNodeMetrics {
-  /** Node name this metric set was fetched from (via plugin pod) */
+export interface GpuChipMetrics {
+  /** Kubernetes node name (e.g. "buttons") */
  nodeName: string;
-  /** Pod name of the intel-gpu-plugin daemonset pod */
-  podName: string;
-  /** Engine utilization per (card, engine): 0–100 */
-  engineUtilization: Array<{ card: string; engine: string; pct: number }>;
-  /** Current GT boost frequency in MHz per card */
-  boostFreqMhz: Array<{ card: string; value: number }>;
-  /** Local VRAM usage in bytes per card */
-  memoryLocalBytes: Array<{ card: string; value: number }>;
-  /** System memory usage in bytes per card */
-  memorySystemBytes: Array<{ card: string; value: number }>;
-  /** Cumulative energy in µJ per card (raw counter; compute delta for power) */
-  energyMicrojoules: Array<{ card: string; value: number }>;
-  /** Raw parsed metric families for advanced use */
-  raw: ParsedMetrics;
+  /** PCI chip address (e.g. "0000:09:01_0_0000:0a:00_0") */
+  chip: string;
+  /** node-exporter instance (IP:port) */
+  instance: string;
+  /** Current power draw in watts (rate of energy counter, null if unavailable) */
+  powerWatts: number | null;
+  /** Maximum / TDP power in watts */
+  powerMaxWatts: number | null;
+}
+
+export interface GpuMetrics {
+  chips: GpuChipMetrics[];
+  /** ISO timestamp of when metrics were fetched */
+  fetchedAt: string;
 }

 // ---------------------------------------------------------------------------
-// Prometheus text format parser
+// Prometheus query helper
 // ---------------------------------------------------------------------------

-const LABEL_PAIR_RE = /(\w+)="([^"]*)"/g;
-
-function parseLabels(labelStr: string): Record<string, string> {
-  const labels: Record<string, string> = {};
-  let match: RegExpExecArray | null;
-  const re = new RegExp(LABEL_PAIR_RE.source, 'g');
-  while ((match = re.exec(labelStr)) !== null) {
-    const key = match[1];
-    const val = match[2];
-    if (key && val !== undefined) {
-      labels[key] = val;
-    }
-  }
-  return labels;
+interface PrometheusResult {
+  metric: Record<string, string>;
+  value: [number, string];
 }

-export function parsePrometheusText(text: string): ParsedMetrics {
-  const families = new Map<string, MetricFamily>();
-  let currentName = '';
-  let currentHelp = '';
-  let currentType = '';
-
-  for (const rawLine of text.split('\n')) {
-    const line = rawLine.trim();
-    if (!line) continue;
-
-    if (line.startsWith('# HELP ')) {
-      const rest = line.slice(7);
-      const spaceIdx = rest.indexOf(' ');
-      currentName = spaceIdx >= 0 ? rest.slice(0, spaceIdx) : rest;
-      currentHelp = spaceIdx >= 0 ? rest.slice(spaceIdx + 1) : '';
-      continue;
-    }
-
-    if (line.startsWith('# TYPE ')) {
-      const rest = line.slice(7);
-      const spaceIdx = rest.indexOf(' ');
-      currentType = spaceIdx >= 0 ? rest.slice(spaceIdx + 1) : '';
-      continue;
-    }
-
-    if (line.startsWith('#')) continue;
-
-    const openBrace = line.indexOf('{');
-    const closeBrace = line.lastIndexOf('}');
-
-    let metricName: string;
-    let labels: Record<string, string>;
-    let valuePart: string;
-
-    if (openBrace >= 0 && closeBrace > openBrace) {
-      metricName = line.slice(0, openBrace);
-      labels = parseLabels(line.slice(openBrace + 1, closeBrace));
-      valuePart = line.slice(closeBrace + 1).trim();
-    } else {
-      const spaceIdx = line.lastIndexOf(' ');
-      if (spaceIdx < 0) continue;
-      metricName = line.slice(0, spaceIdx);
-      labels = {};
-      valuePart = line.slice(spaceIdx + 1).trim();
-    }
-
-    const valueTokens = valuePart.split(' ');
-    const valueStr = valueTokens[0] ?? '';
-    const value = parseFloat(valueStr);
-    if (!Number.isFinite(value)) continue;
-
-    const familyKey = metricName;
-    let family = families.get(familyKey);
-    if (!family) {
-      family = {
-        name: familyKey,
-        help: metricName === currentName ? currentHelp : '',
-        type: metricName === currentName ? currentType : '',
-        samples: [],
-      };
-      families.set(familyKey, family);
-    }
-
-    family.samples.push({ labels, value });
-  }
-
-  return families;
-}
-
-// ---------------------------------------------------------------------------
-// Extract Intel GPU metrics from the parsed map
-// ---------------------------------------------------------------------------
-
-function samplesFor(families: ParsedMetrics, name: string): MetricSample[] {
-  return families.get(name)?.samples ?? [];
-}
-
-export function extractGpuNodeMetrics(
-  families: ParsedMetrics,
-  nodeName: string,
-  podName: string
-): GpuNodeMetrics {
-  const activeSamples = samplesFor(families, 'gpu_i915_engine_active_ticks');
-  const totalSamples = samplesFor(families, 'gpu_i915_engine_total_ticks');
-
-  // Build utilization: active/total per (card, engine)
-  const engineUtilization: GpuNodeMetrics['engineUtilization'] = [];
-  for (const active of activeSamples) {
-    const card = active.labels['card'] ?? active.labels['gpu'] ?? 'gpu0';
-    const engine = active.labels['engine'] ?? 'render/0';
-    const totalSample = totalSamples.find(
-      s =>
-        (s.labels['card'] ?? s.labels['gpu']) === card &&
-        s.labels['engine'] === engine
-    );
-    const total = totalSample?.value ?? 0;
-    const pct = total > 0 ? Math.min(100, Math.round((active.value / total) * 100)) : 0;
-    engineUtilization.push({ card, engine, pct });
-  }
-
-  // Boost frequency
-  const boostFreqMhz = samplesFor(families, 'gpu_i915_gt_boost_freq_mhz').map(s => ({
-    card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
-    value: s.value,
-  }));
-
-  // Memory
-  const memoryLocalBytes = samplesFor(families, 'gpu_i915_memory_local').map(s => ({
-    card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
-    value: s.value,
-  }));
-  const memorySystemBytes = samplesFor(families, 'gpu_i915_memory_system').map(s => ({
-    card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
-    value: s.value,
-  }));
-
-  // Energy
-  const energyMicrojoules = samplesFor(families, 'gpu_i915_energy_microjoules').map(s => ({
-    card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
-    value: s.value,
-  }));
-
-  return {
-    nodeName,
-    podName,
-    engineUtilization,
-    boostFreqMhz,
-    memoryLocalBytes,
-    memorySystemBytes,
-    energyMicrojoules,
-    raw: families,
+interface PrometheusResponse {
+  status: string;
+  data: {
+    resultType: string;
+    result: PrometheusResult[];
  };
 }

-// ---------------------------------------------------------------------------
-// Fetch metrics from an Intel GPU device plugin pod
-// ---------------------------------------------------------------------------
-
 /**
- * Fetches and parses Prometheus metrics from an Intel GPU device plugin pod.
- *
- * The proxy path is:
- *   /api/v1/namespaces/{namespace}/pods/{podName}:9090/proxy/metrics
- *
- * Returns null if the pod is not exposing metrics (enableMonitoring: false)
- * or if the proxy request fails.
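+ * Service discovery: candidate Prometheus services, probed in order.
+ * Tries the kube-prometheus-stack default name first, then falls back to
+ * prometheus-operated and finally prometheus (all in the monitoring
+ * namespace on port 9090).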
 */
-export async function fetchGpuPluginMetrics(
-  podName: string,
-  namespace: string,
-  nodeName: string
-): Promise<GpuNodeMetrics | null> {
-  const path = `/api/v1/namespaces/${namespace}/pods/${podName}:9090/proxy/metrics`;
+const PROMETHEUS_SERVICES = [
+  { namespace: 'monitoring', service: 'kube-prometheus-stack-prometheus', port: '9090' },
+  { namespace: 'monitoring', service: 'prometheus-operated', port: '9090' },
+  { namespace: 'monitoring', service: 'prometheus', port: '9090' },
+];

-  try {
-    const raw: unknown = await ApiProxy.request(path, {
-      method: 'GET',
-      isJSON: false,
-    });
+/**
+ * Runs an instant PromQL query against Prometheus through the Kubernetes API proxy.
+ *
+ * @param query - PromQL expression; it is URL-encoded before being sent.
+ * @param prometheusPath - Proxy base path of the Prometheus service.
+ * @returns The result samples, or an empty array when the response is not a
+ *   successful instant-vector result. Rejects if the proxy request itself fails.
+ */
+async function queryPrometheus(
+  query: string,
+  prometheusPath: string
+): Promise<PrometheusResult[]> {
+  const path = `${prometheusPath}/api/v1/query?query=${encodeURIComponent(query)}`;
+
+  // The proxy hands back whatever JSON the service returned, so check the
+  // shape at runtime instead of trusting a type assertion.
+  const raw: unknown = await ApiProxy.request(path, { method: 'GET' });
+  if (typeof raw !== 'object' || raw === null) return [];
+
+  const { status, data } = raw as Partial<PrometheusResponse>;
+  if (status !== 'success' || data?.resultType !== 'vector') return [];
+
+  // Scalar and string results are arrays too, but not arrays of samples;
+  // only "vector" results match PrometheusResult.
+  return Array.isArray(data.result) ? data.result : [];
+}
+
+/**
+ * Probes each candidate in PROMETHEUS_SERVICES, in order, through the
+ * Kubernetes API proxy and returns the proxy base path of the first one that
+ * answers a trivial query with status "success".
+ *
+ * @returns The proxy base path, or null when no candidate responded successfully.
+ */
+async function findPrometheusPath(): Promise<string | null> {
+  for (const candidate of PROMETHEUS_SERVICES) {
+    const basePath = `/api/v1/namespaces/${candidate.namespace}/services/${candidate.service}:${candidate.port}/proxy`;
+    let probe: PrometheusResponse | undefined;
+    try {
+      probe = await ApiProxy.request(`${basePath}/api/v1/query?query=1`, { method: 'GET' }) as PrometheusResponse;
+    } catch {
+      // A failed probe just means this candidate is unusable; try the next one.
+      continue;
+    }
+    if (probe?.status === 'success') return basePath;
+  }
+  return null;
+}
+
+// ---------------------------------------------------------------------------
+// Metrics fetch
+// ---------------------------------------------------------------------------
+
+/**
+ * Fetches power metrics for every i915 hwmon chip known to Prometheus.
+ *
+ * Locates a reachable Prometheus service, runs four instant queries in
+ * parallel, and joins the results into one entry per (instance, chip).
+ *
+ * @returns The assembled metrics, or null when no Prometheus service answered.
+ *   Rejects if any query fails after discovery succeeded.
+ */
+export async function fetchGpuMetrics(): Promise<GpuMetrics | null> {
+  const prometheusPath = await findPrometheusPath();
+  if (!prometheusPath) return null;
+
+  // Run queries in parallel
+  const [chipResults, energyRateResults, powerMaxResults, unameResults] = await Promise.all([
+    // i915 chip identification
+    queryPrometheus('node_hwmon_chip_names{chip_name="i915"}', prometheusPath),
+    // Current power (rate of cumulative energy counter)
+    queryPrometheus(
+      'rate(node_hwmon_energy_joule_total[5m]) * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}',
+      prometheusPath
+    ),
+    // TDP / max power
+    queryPrometheus(
+      'node_hwmon_power_max_watt * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}',
+      prometheusPath
+    ),
+    // instance → nodename mapping
+    queryPrometheus('node_uname_info', prometheusPath),
+  ]);
+
+  // Build instance → nodename map
+  const instanceToNode = new Map<string, string>();
+  for (const r of unameResults) {
+    const inst = r.metric['instance'];
+    const nodename = r.metric['nodename'] ?? r.metric['node'] ?? inst;
+    if (inst) instanceToNode.set(inst, nodename);
+  }
+
+  // The `chip` label alone is not unique: the queries above join on
+  // (chip, instance), and nodes with identical hardware report the same chip
+  // value. Key every lookup by both labels so nodes cannot overwrite each other.
+  const chipKey = (instance: string, chip: string): string => JSON.stringify([instance, chip]);
+
+  // Index a query result by (instance, chip). Samples without a chip label or
+  // with a non-finite value (Prometheus encodes NaN/Inf as strings) are
+  // skipped so they surface as "no data" instead of "NaN W".
+  const indexByChip = (results: PrometheusResult[]): Map<string, number> => {
+    const byChip = new Map<string, number>();
+    for (const r of results) {
+      const chip = r.metric['chip'];
+      const instance = r.metric['instance'] ?? '';
+      const value = parseFloat(r.value[1]);
+      if (chip && Number.isFinite(value)) {
+        byChip.set(chipKey(instance, chip), value);
+      }
+    }
+    return byChip;
+  };
+
+  const chipToPower = indexByChip(energyRateResults);
+  const chipToMaxPower = indexByChip(powerMaxResults);
+
+  // Assemble per-chip metrics from the chip identification results
+  const chips: GpuChipMetrics[] = chipResults.map(r => {
+    const chip = r.metric['chip'] ?? '';
+    const instance = r.metric['instance'] ?? '';
+    const key = chipKey(instance, chip);
+
+    return {
+      // Fall back to the raw instance when node_uname_info has no entry for it.
+      nodeName: instanceToNode.get(instance) ?? instance,
+      chip,
+      instance,
+      powerWatts: chipToPower.get(key) ?? null,
+      powerMaxWatts: chipToMaxPower.get(key) ?? null,
+    };
+  });
+
+  return {
+    chips,
+    fetchedAt: new Date().toISOString(),
+  };
 }

 // ---------------------------------------------------------------------------
 // Formatting helpers
 // ---------------------------------------------------------------------------

-export function formatBytes(bytes: number): string {
-  if (bytes >= 1e9) return `${(bytes / 1e9).toFixed(1)} GB`;
-  if (bytes >= 1e6) return `${(bytes / 1e6).toFixed(1)} MB`;
-  if (bytes >= 1e3) return `${(bytes / 1e3).toFixed(1)} KB`;
-  return `${bytes} B`;
+/**
+ * Formats a power value in watts with one decimal place, e.g. "12.3 W".
+ *
+ * @param w - Power in watts.
+ * @returns The formatted value, or "—" when `w` is NaN or infinite.
+ */
+export function formatWatts(w: number): string {
+  // Avoid rendering "NaN W" / "Infinity W" for unusable samples.
+  if (!Number.isFinite(w)) return '—';
+  return `${w.toFixed(1)} W`;
 }

-export function formatFreq(mhz: number): string {
-  return `${Math.round(mhz)} MHz`;
+/**
+ * Formats `used` as a whole-number percentage of `max`, e.g. "42%".
+ *
+ * @param used - The consumed amount.
+ * @param max - The maximum amount; must be a positive finite number.
+ * @returns The rounded percentage, or "—" when it cannot be computed
+ *   (non-positive, NaN or infinite `max`, or non-finite `used`).
+ */
+export function formatPercent(used: number, max: number): string {
+  // `!(max > 0)` also rejects NaN, which the plain `max <= 0` test lets through.
+  if (!(max > 0) || !Number.isFinite(max) || !Number.isFinite(used)) return '—';
+  return `${Math.round((used / max) * 100)}%`;
 }
@@ -1,9 +1,29 @@
 /**
- * MetricsPage — real-time Intel GPU metrics from the device plugin pods.
+ * MetricsPage — Intel GPU metrics from Prometheus (node-exporter hwmon).
 *
- * Fetches Prometheus metrics from each Intel GPU device plugin pod (port 9090)
- * and displays per-card engine utilization, GPU frequency, memory usage,
- * and cumulative energy. Requires `enableMonitoring: true` in GpuDevicePlugin.
+ * METRIC AVAILABILITY
+ * -------------------
+ * Power (current W, TDP)
+ *   Source:   node_hwmon_energy_joule_total, node_hwmon_power_max_watt
+ *   Driver:   i915 hwmon sysfs (/sys/class/drm/card{N}/device/hwmon/)
+ *   Scraped:  node-exporter hwmon collector (enabled by default)
+ *   Nodes:    Discrete GPU nodes only (i915 registers hwmon sensors for discrete GPUs, not for integrated ones)
+ *   No extra config required — works out of the box with kube-prometheus-stack.
+ *
+ * GPU Frequency (current, boost, min, max MHz)
+ *   Source:   DRM sysfs (/sys/class/drm/card{N}/gt_{x}_freq_mhz)
+ *   Driver:   i915 kernel driver
+ *   Scraped:  NOT available -- node-exporter --collector.drm is AMD-only and does not
+ *             read i915 gt_freq sysfs files. Would require a custom exporter or
+ *             node-exporter textfile collector sidecar writing these values.
+ *
+ * GPU Utilization (engine busy %)
+ *   Source:   Not exposed via hwmon or any standard Prometheus collector for i915.
+ *             Would require intel-gpu-top, XPU Manager, or a custom DRM-based exporter.
+ *
+ * Integrated GPU (iGPU) nodes
+ *   No hwmon sensors are exposed for integrated GPUs, so node-exporter has
+ *   nothing to scrape. No Prometheus metrics are available for iGPU nodes
+ *   regardless of configuration.
 */

 import {
@@ -15,141 +35,140 @@ import {
 } from '@kinvolk/headlamp-plugin/lib/CommonComponents';
 import React, { useCallback, useEffect, useState } from 'react';
 import { useIntelGpuContext } from '../api/IntelGpuDataContext';
-import {
-  fetchGpuPluginMetrics,
-  formatBytes,
-  formatFreq,
-  GpuNodeMetrics,
-} from '../api/metrics';
-import { IntelGpuPod } from '../api/k8s';
+import { fetchGpuMetrics, formatPercent, formatWatts, GpuChipMetrics, GpuMetrics } from '../api/metrics';

 // ---------------------------------------------------------------------------
-// Utilization bar
+// Power bar
 // ---------------------------------------------------------------------------

-function UtilizationBar({ pct }: { pct: number }) {
-  const color = pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
+function PowerBar({ watts, maxWatts }: { watts: number; maxWatts: number | null }) {
+  const pct = maxWatts && maxWatts > 0 ? Math.min(100, Math.round((watts / maxWatts) * 100)) : null;
+  const color = pct === null ? '#0071c5' : pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
+
  return (
    <div style={{ display: 'flex', alignItems: 'center', gap: '8px' }}>
-      <div
-        style={{
-          width: '100px',
-          height: '8px',
-          backgroundColor: '#e0e0e0',
-          borderRadius: '4px',
-          overflow: 'hidden',
-          flexShrink: 0,
-        }}
-      >
+      {pct !== null && (
        <div
          style={{
-            width: `${pct}%`,
-            height: '100%',
-            backgroundColor: color,
+            width: '100px',
+            height: '8px',
+            backgroundColor: '#e0e0e0',
            borderRadius: '4px',
-            transition: 'width 0.3s ease',
+            overflow: 'hidden',
+            flexShrink: 0,
          }}
-        />
-      </div>
-      <span style={{ fontSize: '12px', fontVariantNumeric: 'tabular-nums' }}>{pct}%</span>
+        >
+          <div
+            style={{
+              width: `${pct}%`,
+              height: '100%',
+              backgroundColor: color,
+              borderRadius: '4px',
+              transition: 'width 0.4s ease',
+            }}
+          />
+        </div>
+      )}
+      <span style={{ fontSize: '13px', fontVariantNumeric: 'tabular-nums' }}>
+        {formatWatts(watts)}
+        {maxWatts !== null && maxWatts > 0 && (
+          <span style={{ color: '#888', marginLeft: '4px' }}>
+            / {formatWatts(maxWatts)} ({formatPercent(watts, maxWatts)})
+          </span>
+        )}
+      </span>
    </div>
  );
 }

 // ---------------------------------------------------------------------------
-// Per-node metrics card
+// Per-chip card
 // ---------------------------------------------------------------------------

-function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) {
-  const { nodeName, podName, engineUtilization, boostFreqMhz, memoryLocalBytes, memorySystemBytes, energyMicrojoules } = metrics;
+function GpuChipCard({ chip }: { chip: GpuChipMetrics }) {
+  const rows: Array<{ name: string; value: React.ReactNode }> = [
+    { name: 'Node', value: chip.nodeName },
+    { name: 'GPU (PCI)', value: chip.chip },
+    {
+      name: 'Current Power',
+      value: chip.powerWatts !== null
+        ? <PowerBar watts={chip.powerWatts} maxWatts={chip.powerMaxWatts} />
+        : <StatusLabel status="warning">No data — needs ≥5m of scrape history</StatusLabel>,
+    },
+  ];

-  // Group engines by card
-  const byCard = new Map<string, typeof engineUtilization>();
-  for (const e of engineUtilization) {
-    if (!byCard.has(e.card)) byCard.set(e.card, []);
-    byCard.get(e.card)!.push(e);
-  }
-
-  const freqByCard = new Map(boostFreqMhz.map(f => [f.card, f.value]));
-  const memLocalByCard = new Map(memoryLocalBytes.map(m => [m.card, m.value]));
-  const memSysByCard = new Map(memorySystemBytes.map(m => [m.card, m.value]));
-  const energyByCard = new Map(energyMicrojoules.map(e => [e.card, e.value]));
-
-  const cards = Array.from(
-    new Set([
-      ...byCard.keys(),
-      ...freqByCard.keys(),
-      ...memLocalByCard.keys(),
-    ])
-  ).sort();
-
-  if (cards.length === 0) {
-    return (
-      <SectionBox title={`${nodeName} — No Metric Data`}>
-        <NameValueTable
-          rows={[
-            {
-              name: 'Pod',
-              value: podName,
-            },
-            {
-              name: 'Note',
-              value: 'No GPU metrics found. Ensure enableMonitoring: true is set in GpuDevicePlugin.',
-            },
-          ]}
-        />
-      </SectionBox>
-    );
+  if (chip.powerMaxWatts !== null && chip.powerMaxWatts > 0) {
+    rows.push({ name: 'TDP', value: formatWatts(chip.powerMaxWatts) });
  }

  return (
-    <>
-      {cards.map(card => {
-        const engines = byCard.get(card) ?? [];
-        const freq = freqByCard.get(card);
-        const memLocal = memLocalByCard.get(card);
-        const memSys = memSysByCard.get(card);
-        const energy = energyByCard.get(card);
+    <SectionBox title={`${chip.nodeName} — ${chip.chip}`}>
+      <NameValueTable rows={rows} />
+    </SectionBox>
+  );
+}

-        const rows: Array<{ name: string; value: React.ReactNode }> = [
-          { name: 'Node', value: nodeName },
-          { name: 'Plugin Pod', value: podName },
-          { name: 'GPU Card', value: card },
-        ];
+// ---------------------------------------------------------------------------
+// Requirements info box
+// ---------------------------------------------------------------------------

-        if (freq !== undefined) {
-          rows.push({ name: 'Boost Frequency', value: formatFreq(freq) });
-        }
-
-        if (memLocal !== undefined) {
-          rows.push({ name: 'VRAM (local)', value: formatBytes(memLocal) });
-        }
-        if (memSys !== undefined && memSys > 0) {
-          rows.push({ name: 'System Memory', value: formatBytes(memSys) });
-        }
-
-        if (energy !== undefined) {
-          rows.push({
-            name: 'Energy (cumulative)',
-            value: `${(energy / 1e6).toFixed(2)} J`,
-          });
-        }
-
-        // Engine utilization rows
-        for (const e of engines) {
-          rows.push({
-            name: `Engine: ${e.engine}`,
-            value: <UtilizationBar pct={e.pct} />,
-          });
-        }
-
-        return (
-          <SectionBox key={`${nodeName}-${card}`} title={`${nodeName} — ${card}`}>
-            <NameValueTable rows={rows} />
-          </SectionBox>
-        );
-      })}
-    </>
+function MetricRequirements() {
+  return (
+    <SectionBox title="Metric Availability">
+      <NameValueTable
+        rows={[
+          {
+            name: 'Power (W)',
+            value: (
+              <>
+                <StatusLabel status="success">Available — discrete GPU nodes</StatusLabel>
+                <div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
+                  Source: <code>node_hwmon_energy_joule_total</code> via node-exporter hwmon collector (enabled by default).
+                  Requires the i915 kernel driver on the node. iGPU nodes do not expose hwmon sensors.
+                </div>
+              </>
+            ),
+          },
+          {
+            name: 'Frequency (MHz)',
+            value: (
+              <>
+                <StatusLabel status="error">Not available</StatusLabel>
+                <div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
+                  i915 exposes <code>gt_*_freq_mhz</code> via DRM sysfs but node-exporter&apos;s{' '}
+                  <code>--collector.drm</code> flag is AMD-only and does not read these files.
+                  A custom exporter or textfile-collector sidecar writing these values would be required.
+                </div>
+              </>
+            ),
+          },
+          {
+            name: 'Utilization (%)',
+            value: (
+              <>
+                <StatusLabel status="error">Not available</StatusLabel>
+                <div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
+                  No standard Prometheus collector exposes i915 engine busy percentage.
+                  Would require intel-gpu-top, XPU Manager, or a custom DRM-based exporter.
+                </div>
+              </>
+            ),
+          },
+          {
+            name: 'iGPU nodes',
+            value: (
+              <>
+                <StatusLabel status="error">No metrics available</StatusLabel>
+                <div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
+                  The integrated GPU driver does not expose hwmon sensors. No Prometheus metrics
+                  are available for iGPU nodes regardless of configuration.
+                </div>
+              </>
+            ),
+          },
+        ]}
+      />
+    </SectionBox>
  );
 }

@@ -158,38 +177,33 @@ function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) {
 // ---------------------------------------------------------------------------

 export default function MetricsPage() {
-  const { pluginPods, pluginInstalled, loading: ctxLoading } = useIntelGpuContext();
+  const { gpuNodes, loading: ctxLoading } = useIntelGpuContext();

-  const [metricsMap, setMetricsMap] = useState<Map<string, GpuNodeMetrics | 'error'>>(new Map());
+  const [metrics, setMetrics] = useState<GpuMetrics | null>(null);
+  const [fetchError, setFetchError] = useState<string | null>(null);
  const [fetching, setFetching] = useState(false);

-  const fetchAll = useCallback(async (pods: IntelGpuPod[]) => {
-    if (pods.length === 0) return;
+  const doFetch = useCallback(async () => {
    setFetching(true);
-
-    const results = await Promise.all(
-      pods.map(async pod => {
-        const name = pod.metadata.name;
-        const namespace = pod.metadata.namespace ?? 'kube-system';
-        const nodeName = pod.spec?.nodeName ?? name;
-        const result = await fetchGpuPluginMetrics(name, namespace, nodeName);
-        return { name, result };
-      })
-    );
-
-    const map = new Map<string, GpuNodeMetrics | 'error'>();
-    for (const { name, result } of results) {
-      map.set(name, result ?? 'error');
+    setFetchError(null);
+    try {
+      const result = await fetchGpuMetrics();
+      setMetrics(result);
+      if (!result) {
+        setFetchError('Could not reach Prometheus. Ensure kube-prometheus-stack is installed in the monitoring namespace.');
+      }
+    } catch (e: unknown) {
+      setFetchError(e instanceof Error ? e.message : String(e));
+    } finally {
+      setFetching(false);
    }
-    setMetricsMap(map);
-    setFetching(false);
  }, []);

  useEffect(() => {
-    if (!ctxLoading && pluginPods.length > 0) {
-      void fetchAll(pluginPods);
+    if (!ctxLoading) {
+      void doFetch();
    }
-  }, [ctxLoading, pluginPods, fetchAll]);
+  }, [ctxLoading, doFetch]);

  if (ctxLoading) {
    return <Loader title="Loading Intel GPU data..." />;
@@ -200,8 +214,8 @@ export default function MetricsPage() {
      <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
        <SectionHeader title="Intel GPU — Metrics" />
        <button
-          onClick={() => void fetchAll(pluginPods)}
-          disabled={fetching || pluginPods.length === 0}
+          onClick={() => void doFetch()}
+          disabled={fetching}
          aria-label="Refresh metrics"
          style={{
            padding: '6px 16px',
@@ -218,94 +232,86 @@ export default function MetricsPage() {
        </button>
      </div>

-      {!pluginInstalled && (
-        <SectionBox title="Intel GPU Plugin Not Detected">
+      <MetricRequirements />
+
+      {fetching && !metrics && <Loader title="Querying Prometheus for GPU metrics..." />}
+
+      {fetchError && (
+        <SectionBox title="Prometheus Unreachable">
          <NameValueTable
            rows={[
              {
-                name: 'Status',
-                value: (
-                  <StatusLabel status="warning">No Intel GPU device plugin pods found</StatusLabel>
-                ),
+                name: 'Error',
+                value: <StatusLabel status="error">{fetchError}</StatusLabel>,
              },
              {
-                name: 'Note',
-                value: 'Install the Intel GPU device plugin and set enableMonitoring: true to expose Prometheus metrics.',
+                name: 'Checked services',
+                value: 'kube-prometheus-stack-prometheus:9090, prometheus-operated:9090, prometheus:9090 (monitoring namespace)',
              },
            ]}
          />
        </SectionBox>
      )}

-      {pluginInstalled && pluginPods.length === 0 && (
-        <SectionBox title="No Plugin Pods Found">
-          <NameValueTable
-            rows={[
-              {
-                name: 'Status',
-                value: (
-                  <StatusLabel status="warning">Plugin detected via CRD but no pods found</StatusLabel>
-                ),
-              },
-            ]}
-          />
-        </SectionBox>
-      )}
-
-      {pluginPods.length > 0 && metricsMap.size === 0 && fetching && (
-        <Loader title="Fetching GPU metrics..." />
-      )}
-
-      {pluginPods.length > 0 && metricsMap.size === 0 && !fetching && (
-        <SectionBox title="Metrics Unavailable">
+      {metrics && metrics.chips.length === 0 && (
+        <SectionBox title="No i915 Metrics in Prometheus">
          <NameValueTable
            rows={[
              {
                name: 'Status',
                value: (
                  <StatusLabel status="warning">
-                    Could not fetch metrics from any plugin pod
+                    Prometheus reachable — no node_hwmon_chip_names&#123;chip_name=&quot;i915&quot;&#125; found
                  </StatusLabel>
                ),
              },
              {
-                name: 'Requirements',
-                value: 'Set enableMonitoring: true in GpuDevicePlugin spec and ensure port 9090 is accessible via kube-apiserver proxy.',
+                name: 'GPU Nodes',
+                value: gpuNodes.length > 0 ? gpuNodes.map(n => n.metadata.name).join(', ') : 'None detected',
              },
              {
-                name: 'Plugin Pods Found',
-                value: pluginPods.map(p => p.metadata.name).join(', '),
+                name: 'Likely cause',
+                value: 'node-exporter is not running on the GPU nodes, or the hwmon collector is disabled.',
              },
            ]}
          />
        </SectionBox>
      )}

-      {Array.from(metricsMap.entries()).map(([podName, metrics]) => {
-        if (metrics === 'error') {
-          return (
-            <SectionBox key={podName} title={`${podName} — Metrics Unavailable`}>
-              <NameValueTable
-                rows={[
-                  {
-                    name: 'Status',
-                    value: (
-                      <StatusLabel status="error">
-                        Failed to fetch metrics from pod
-                      </StatusLabel>
-                    ),
-                  },
-                  {
-                    name: 'Hint',
-                    value: 'Ensure enableMonitoring: true is set in the GpuDevicePlugin CR and the pod is running.',
-                  },
-                ]}
-              />
-            </SectionBox>
-          );
-        }
-        return <NodeMetricsCard key={podName} metrics={metrics} />;
-      })}
+      {metrics && metrics.chips.length > 0 && (
+        <>
+          <SectionBox title="GPU Power Summary">
+            <NameValueTable
+              rows={[
+                {
+                  name: 'GPUs Monitored',
+                  value: String(metrics.chips.length),
+                },
+                {
+                  name: 'Total Power',
+                  value: (() => {
+                    const total = metrics.chips.reduce((s, c) => s + (c.powerWatts ?? 0), 0);
+                    const maxTotal = metrics.chips.reduce((s, c) => s + (c.powerMaxWatts ?? 0), 0);
+                    return <PowerBar watts={total} maxWatts={maxTotal > 0 ? maxTotal : null} />;
+                  })(),
+                },
+                {
+                  name: 'Last Fetched',
+                  value: new Date(metrics.fetchedAt).toLocaleTimeString(),
+                },
+                {
+                  name: 'Query',
+                  value: 'rate(node_hwmon_energy_joule_total[5m]) joined with node_hwmon_chip_names{chip_name="i915"}',
+                },
+              ]}
+            />
+          </SectionBox>
+
+          {metrics.chips.map(chip => (
+            <GpuChipCard key={`${chip.instance}-${chip.chip}`} chip={chip} />
+          ))}
+        </>
+      )}
    </>
  );
 }