fix: switch Metrics page to Prometheus/node-exporter i915 hwmon source

The Intel GPU device plugin's -enable-monitoring flag registers a monitoring K8s resource type; it does not expose a Prometheus endpoint. Real GPU power metrics come from node-exporter's hwmon collector, which scrapes the sensors exposed by the i915 kernel driver.

- Rewrite src/api/metrics.ts: query kube-prometheus-stack Prometheus for node_hwmon_energy_joule_total (rate → watts), node_hwmon_power_max_watt (TDP), joined with node_hwmon_chip_names{chip_name="i915"} to identify GPU chips. Instance → node name resolved via node_uname_info.
- Rewrite src/components/MetricsPage.tsx: shows per-chip current power (W) with bar vs TDP, total fleet power summary, last-fetched timestamp. Auto-discovers Prometheus service in monitoring namespace.
- Update artifacthub-pkg.yml checksum for repackaged v0.2.0 tarball.

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
2026-02-18 21:37:16 -05:00
parent a226f0191c
commit 4b4e565a1a
3 changed files with 276 additions and 431 deletions
@@ -46,7 +46,7 @@ links:

 changes:
  - kind: added
-    description: "Metrics page: real-time engine utilization, boost frequency, VRAM usage, and energy from Intel GPU device plugin Prometheus endpoint (port 9090)"
+    description: "Metrics page: real-time GPU power draw (W) and TDP via node-exporter i915 hwmon metrics in kube-prometheus-stack"
  - kind: changed
    description: "Sidebar label changed to intel-gpu"
  - kind: removed
@@ -70,6 +70,6 @@ changes:

 annotations:
  headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.2.0/headlamp-intel-gpu-plugin-0.2.0.tar.gz"
-  headlamp/plugin/archive-checksum: "sha256:404be582bd13c167f61785028eb6eb91dd621106cbe76038f2c071a576a1a442"
+  headlamp/plugin/archive-checksum: "sha256:cbcd20916d72e91ccc36143c74680fbeb2f045e1cbe6d1cc0d844b198e2a1ea5"
  headlamp/plugin/version-compat: ">=0.20.0"
  headlamp/plugin/distro-compat: "in-cluster,web,app"
@@ -1,16 +1,15 @@
 /**
- * Prometheus text format parser for Intel GPU device plugin metrics.
+ * Intel GPU metrics via Prometheus (kube-prometheus-stack).
 *
- * Fetches raw metrics from the Intel GPU device plugin pod (port 9090)
- * via the Kubernetes API proxy and parses key metric families.
+ * The Intel i915/Xe GPU drivers expose hwmon sensors that node-exporter
+ * scrapes automatically. Only chips whose hwmon name is "i915" are matched
+ * by the queries below. We query Prometheus for:
+ *   - node_hwmon_energy_joule_total  (chip_name="i915") → rate = power in W
+ *   - node_hwmon_power_max_watt      (same chip)        → TDP
+ *   - node_hwmon_chip_names          (chip_name="i915") → identify GPU chips
+ *   - node_uname_info                                   → instance → nodename
 *
- * Metrics exposed by intel-gpu-plugin when enableMonitoring: true:
- *   gpu_i915_engine_active_ticks  — engine busy ticks (per card, engine)
- *   gpu_i915_engine_total_ticks   — engine total ticks (for utilization %)
- *   gpu_i915_energy_microjoules   — cumulative energy (µJ → power = delta/dt)
- *   gpu_i915_gt_boost_freq_mhz    — current GT boost frequency (MHz)
- *   gpu_i915_memory_local         — local (VRAM) memory usage (bytes)
- *   gpu_i915_memory_system        — system memory usage (bytes)
+ * Queries go through the Kubernetes API proxy to the in-cluster Prometheus
+ * service: /api/v1/namespaces/monitoring/services/{svc}:{port}/proxy/...
 */

 import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
@@ -19,239 +18,152 @@ import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
 // Types
 // ---------------------------------------------------------------------------

-export interface MetricSample {
-  labels: Record<string, string>;
-  value: number;
-}
-
-export interface MetricFamily {
-  name: string;
-  help: string;
-  type: string;
-  samples: MetricSample[];
-}
-
-export type ParsedMetrics = Map<string, MetricFamily>;
-
-export interface GpuNodeMetrics {
-  /** Node name this metric set was fetched from (via plugin pod) */
+export interface GpuChipMetrics {
+  /** Kubernetes node name (e.g. "buttons") */
  nodeName: string;
-  /** Pod name of the intel-gpu-plugin daemonset pod */
-  podName: string;
-  /** Engine utilization per (card, engine): 0–100 */
-  engineUtilization: Array<{ card: string; engine: string; pct: number }>;
-  /** Current GT boost frequency in MHz per card */
-  boostFreqMhz: Array<{ card: string; value: number }>;
-  /** Local VRAM usage in bytes per card */
-  memoryLocalBytes: Array<{ card: string; value: number }>;
-  /** System memory usage in bytes per card */
-  memorySystemBytes: Array<{ card: string; value: number }>;
-  /** Cumulative energy in µJ per card (raw counter; compute delta for power) */
-  energyMicrojoules: Array<{ card: string; value: number }>;
-  /** Raw parsed metric families for advanced use */
-  raw: ParsedMetrics;
+  /** PCI chip address (e.g. "0000:09:01_0_0000:0a:00_0") */
+  chip: string;
+  /** node-exporter instance (IP:port) */
+  instance: string;
+  /** Current power draw in watts (rate of energy counter, null if unavailable) */
+  powerWatts: number | null;
+  /** Maximum / TDP power in watts */
+  powerMaxWatts: number | null;
+}
+
+/** Result of one metrics fetch: the per-chip readings plus the time they were assembled. */
+export interface GpuMetrics {
+  /** One entry per series returned by the i915 chip identification query. */
+  chips: GpuChipMetrics[];
+  /** ISO timestamp of when metrics were fetched */
+  fetchedAt: string;
 }

 // ---------------------------------------------------------------------------
-// Prometheus text format parser
+// Prometheus query helper
 // ---------------------------------------------------------------------------

-const LABEL_PAIR_RE = /(\w+)="([^"]*)"/g;
-
-function parseLabels(labelStr: string): Record<string, string> {
-  const labels: Record<string, string> = {};
-  let match: RegExpExecArray | null;
-  const re = new RegExp(LABEL_PAIR_RE.source, 'g');
-  while ((match = re.exec(labelStr)) !== null) {
-    const key = match[1];
-    const val = match[2];
-    if (key && val !== undefined) {
-      labels[key] = val;
-    }
-  }
-  return labels;
+/** One element of the `data.result` array in a Prometheus query response. */
+interface PrometheusResult {
+  /** Label name → label value for the series (e.g. `chip`, `instance`). */
+  metric: Record<string, string>;
+  /**
+   * Sample pair. Index 1 is the sample value encoded as a string; callers
+   * convert it with parseFloat.
+   * NOTE(review): index 0 is presumably the sample timestamp — confirm
+   * against the Prometheus HTTP API documentation.
+   */
+  value: [number, string];
 }

-export function parsePrometheusText(text: string): ParsedMetrics {
-  const families = new Map<string, MetricFamily>();
-  let currentName = '';
-  let currentHelp = '';
-  let currentType = '';
-
-  for (const rawLine of text.split('\n')) {
-    const line = rawLine.trim();
-    if (!line) continue;
-
-    if (line.startsWith('# HELP ')) {
-      const rest = line.slice(7);
-      const spaceIdx = rest.indexOf(' ');
-      currentName = spaceIdx >= 0 ? rest.slice(0, spaceIdx) : rest;
-      currentHelp = spaceIdx >= 0 ? rest.slice(spaceIdx + 1) : '';
-      continue;
-    }
-
-    if (line.startsWith('# TYPE ')) {
-      const rest = line.slice(7);
-      const spaceIdx = rest.indexOf(' ');
-      currentType = spaceIdx >= 0 ? rest.slice(spaceIdx + 1) : '';
-      continue;
-    }
-
-    if (line.startsWith('#')) continue;
-
-    const openBrace = line.indexOf('{');
-    const closeBrace = line.lastIndexOf('}');
-
-    let metricName: string;
-    let labels: Record<string, string>;
-    let valuePart: string;
-
-    if (openBrace >= 0 && closeBrace > openBrace) {
-      metricName = line.slice(0, openBrace);
-      labels = parseLabels(line.slice(openBrace + 1, closeBrace));
-      valuePart = line.slice(closeBrace + 1).trim();
-    } else {
-      const spaceIdx = line.lastIndexOf(' ');
-      if (spaceIdx < 0) continue;
-      metricName = line.slice(0, spaceIdx);
-      labels = {};
-      valuePart = line.slice(spaceIdx + 1).trim();
-    }
-
-    const valueTokens = valuePart.split(' ');
-    const valueStr = valueTokens[0] ?? '';
-    const value = parseFloat(valueStr);
-    if (!Number.isFinite(value)) continue;
-
-    const familyKey = metricName;
-    let family = families.get(familyKey);
-    if (!family) {
-      family = {
-        name: familyKey,
-        help: metricName === currentName ? currentHelp : '',
-        type: metricName === currentName ? currentType : '',
-        samples: [],
-      };
-      families.set(familyKey, family);
-    }
-
-    family.samples.push({ labels, value });
-  }
-
-  return families;
-}
-
-// ---------------------------------------------------------------------------
-// Extract Intel GPU metrics from the parsed map
-// ---------------------------------------------------------------------------
-
-function samplesFor(families: ParsedMetrics, name: string): MetricSample[] {
-  return families.get(name)?.samples ?? [];
-}
-
-export function extractGpuNodeMetrics(
-  families: ParsedMetrics,
-  nodeName: string,
-  podName: string
-): GpuNodeMetrics {
-  const activeSamples = samplesFor(families, 'gpu_i915_engine_active_ticks');
-  const totalSamples = samplesFor(families, 'gpu_i915_engine_total_ticks');
-
-  // Build utilization: active/total per (card, engine)
-  const engineUtilization: GpuNodeMetrics['engineUtilization'] = [];
-  for (const active of activeSamples) {
-    const card = active.labels['card'] ?? active.labels['gpu'] ?? 'gpu0';
-    const engine = active.labels['engine'] ?? 'render/0';
-    const totalSample = totalSamples.find(
-      s =>
-        (s.labels['card'] ?? s.labels['gpu']) === card &&
-        s.labels['engine'] === engine
-    );
-    const total = totalSample?.value ?? 0;
-    const pct = total > 0 ? Math.min(100, Math.round((active.value / total) * 100)) : 0;
-    engineUtilization.push({ card, engine, pct });
-  }
-
-  // Boost frequency
-  const boostFreqMhz = samplesFor(families, 'gpu_i915_gt_boost_freq_mhz').map(s => ({
-    card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
-    value: s.value,
-  }));
-
-  // Memory
-  const memoryLocalBytes = samplesFor(families, 'gpu_i915_memory_local').map(s => ({
-    card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
-    value: s.value,
-  }));
-  const memorySystemBytes = samplesFor(families, 'gpu_i915_memory_system').map(s => ({
-    card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
-    value: s.value,
-  }));
-
-  // Energy
-  const energyMicrojoules = samplesFor(families, 'gpu_i915_energy_microjoules').map(s => ({
-    card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
-    value: s.value,
-  }));
-
-  return {
-    nodeName,
-    podName,
-    engineUtilization,
-    boostFreqMhz,
-    memoryLocalBytes,
-    memorySystemBytes,
-    energyMicrojoules,
-    raw: families,
+/**
+ * Shape of the JSON body expected back from the Prometheus `/api/v1/query`
+ * endpoint. The query helpers only consume `data.result` when `status` is
+ * exactly "success".
+ */
+interface PrometheusResponse {
+  status: string;
+  data: {
+    resultType: string;
+    result: PrometheusResult[];
   };
 }

-// ---------------------------------------------------------------------------
-// Fetch metrics from an Intel GPU device plugin pod
-// ---------------------------------------------------------------------------
-
 /**
- * Fetches and parses Prometheus metrics from an Intel GPU device plugin pod.
- *
- * The proxy path is:
- *   /api/v1/namespaces/{namespace}/pods/{podName}:9090/proxy/metrics
- *
- * Returns null if the pod is not exposing metrics (enableMonitoring: false)
- * or if the proxy request fails.
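+ * Service discovery: candidate Prometheus services, probed in order.
+ * Tries the kube-prometheus-stack default name first, then falls back to
+ * prometheus-operated and finally prometheus (all in the monitoring
+ * namespace, port 9090).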
 */
-export async function fetchGpuPluginMetrics(
-  podName: string,
-  namespace: string,
-  nodeName: string
-): Promise<GpuNodeMetrics | null> {
-  const path = `/api/v1/namespaces/${namespace}/pods/${podName}:9090/proxy/metrics`;
+const PROMETHEUS_SERVICES = [
+  { namespace: 'monitoring', service: 'kube-prometheus-stack-prometheus', port: '9090' },
+  { namespace: 'monitoring', service: 'prometheus-operated', port: '9090' },
+  { namespace: 'monitoring', service: 'prometheus', port: '9090' },
+];

-  try {
-    const raw: unknown = await ApiProxy.request(path, {
-      method: 'GET',
-      isJSON: false,
-    });
+async function queryPrometheus(
+  query: string,
+  prometheusPath: string
+): Promise<PrometheusResult[]> {
+  const encoded = encodeURIComponent(query);
+  const path = `${prometheusPath}/api/v1/query?query=${encoded}`;

-    if (typeof raw !== 'string') return null;
+  const raw = await ApiProxy.request(path, { method: 'GET' }) as PrometheusResponse;

-    const families = parsePrometheusText(raw);
-    return extractGpuNodeMetrics(families, nodeName, podName);
-  } catch {
-    return null;
+  if (raw?.status !== 'success') return [];
+  return raw.data?.result ?? [];
+}
+
+/**
+ * Probes each candidate in PROMETHEUS_SERVICES, in order, through the
+ * Kubernetes API service proxy and returns the proxy base path of the first
+ * one that answers the trivial query `1` with status "success".
+ *
+ * A request that throws is treated as "not this one" and the next candidate
+ * is tried.
+ *
+ * @returns The proxy base path (without a trailing slash), or null when no
+ *   candidate responds successfully.
+ */
+async function findPrometheusPath(): Promise<string | null> {
+  for (const candidate of PROMETHEUS_SERVICES) {
+    const proxyBase = `/api/v1/namespaces/${candidate.namespace}/services/${candidate.service}:${candidate.port}/proxy`;
+    const probeUrl = `${proxyBase}/api/v1/query?query=1`;
+    try {
+      const probe = await ApiProxy.request(probeUrl, { method: 'GET' }) as PrometheusResponse;
+      if (probe?.status === 'success') {
+        return proxyBase;
+      }
+    } catch {
+      // Unreachable or failing service: fall through to the next candidate.
+    }
   }
+  return null;
+}
+
+// ---------------------------------------------------------------------------
+// Metrics fetch
+// ---------------------------------------------------------------------------
+
+/**
+ * Fetches current power draw and maximum power for every i915 hwmon chip
+ * known to Prometheus.
+ *
+ * Four instant queries run in parallel (chip identification, 5m energy rate,
+ * max power, node_uname_info) and are joined client-side. Readings are keyed
+ * by (instance, chip) — the same pair the PromQL joins match on — because a
+ * chip label is only meaningful within one node-exporter instance; keying by
+ * chip alone lets one node's reading overwrite another's.
+ *
+ * A reading that is missing or does not parse to a finite number is reported
+ * as null. If a chip yields several series (e.g. multiple sensors), the last
+ * one in the result wins.
+ *
+ * @returns The assembled metrics, or null when no candidate Prometheus
+ *   service answers. Rejects if any of the four queries rejects.
+ */
+export async function fetchGpuMetrics(): Promise<GpuMetrics | null> {
+  const prometheusPath = await findPrometheusPath();
+  if (!prometheusPath) return null;
+
+  // Run queries in parallel
+  const [chipResults, energyRateResults, powerMaxResults, unameResults] = await Promise.all([
+    // i915 chip identification
+    queryPrometheus('node_hwmon_chip_names{chip_name="i915"}', prometheusPath),
+    // Current power (rate of cumulative energy counter)
+    queryPrometheus(
+      'rate(node_hwmon_energy_joule_total[5m]) * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}',
+      prometheusPath
+    ),
+    // TDP / max power
+    queryPrometheus(
+      'node_hwmon_power_max_watt * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}',
+      prometheusPath
+    ),
+    // instance → nodename mapping
+    queryPrometheus('node_uname_info', prometheusPath),
+  ]);
+
+  // Build instance → nodename map
+  const instanceToNode = new Map<string, string>();
+  for (const r of unameResults) {
+    const inst = r.metric['instance'];
+    const nodename = r.metric['nodename'] ?? r.metric['node'] ?? inst;
+    if (inst) instanceToNode.set(inst, nodename);
+  }
+
+  // Unambiguous composite key: JSON array encoding cannot collide for
+  // distinct (instance, chip) pairs regardless of the characters they contain.
+  const chipKey = (instance: string, chip: string): string => JSON.stringify([instance, chip]);
+
+  // Index a query result by (instance, chip), skipping series without a chip
+  // label and values Prometheus reports as NaN/±Inf.
+  const indexByChip = (results: typeof energyRateResults): Map<string, number> => {
+    const byChip = new Map<string, number>();
+    for (const r of results) {
+      const chip = r.metric['chip'];
+      const value = parseFloat(r.value[1]);
+      if (chip && Number.isFinite(value)) {
+        byChip.set(chipKey(r.metric['instance'] ?? '', chip), value);
+      }
+    }
+    return byChip;
+  };
+
+  const chipToPower = indexByChip(energyRateResults);
+  const chipToMaxPower = indexByChip(powerMaxResults);
+
+  // Assemble per-chip metrics from the chip identification results
+  const chips: GpuChipMetrics[] = chipResults.map(r => {
+    const chip = r.metric['chip'] ?? '';
+    const instance = r.metric['instance'] ?? '';
+    const key = chipKey(instance, chip);
+
+    return {
+      nodeName: instanceToNode.get(instance) ?? instance,
+      chip,
+      instance,
+      powerWatts: chipToPower.get(key) ?? null,
+      powerMaxWatts: chipToMaxPower.get(key) ?? null,
+    };
+  });
+
+  return {
+    chips,
+    fetchedAt: new Date().toISOString(),
+  };
 }

 // ---------------------------------------------------------------------------
 // Formatting helpers
 // ---------------------------------------------------------------------------

-export function formatBytes(bytes: number): string {
-  if (bytes >= 1e9) return `${(bytes / 1e9).toFixed(1)} GB`;
-  if (bytes >= 1e6) return `${(bytes / 1e6).toFixed(1)} MB`;
-  if (bytes >= 1e3) return `${(bytes / 1e3).toFixed(1)} KB`;
-  return `${bytes} B`;
+/** Formats a power value as watts with exactly one decimal place, e.g. "12.3 W". */
+export function formatWatts(w: number): string {
+  const oneDecimal = w.toFixed(1);
+  return oneDecimal + ' W';
 }

-export function formatFreq(mhz: number): string {
-  return `${Math.round(mhz)} MHz`;
+/**
+ * Formats `used` as a whole-number percentage of `max`, e.g. "25%".
+ * Returns an em dash when `max` is zero or negative, since no meaningful
+ * ratio exists.
+ */
+export function formatPercent(used: number, max: number): string {
+  return max <= 0 ? '—' : Math.round((used / max) * 100) + '%';
 }
@@ -1,9 +1,9 @@
 /**
- * MetricsPage — real-time Intel GPU metrics from the device plugin pods.
+ * MetricsPage — Intel GPU power metrics from Prometheus (node-exporter hwmon).
 *
- * Fetches Prometheus metrics from each Intel GPU device plugin pod (port 9090)
- * and displays per-card engine utilization, GPU frequency, memory usage,
- * and cumulative energy. Requires `enableMonitoring: true` in GpuDevicePlugin.
+ * The Intel i915/Xe GPU driver exposes hwmon sensors which node-exporter scrapes.
+ * This page queries kube-prometheus-stack for real-time GPU power draw
+ * (derived from node_hwmon_energy_joule_total rate) and TDP per GPU node.
 */

 import {
@@ -15,141 +15,82 @@ import {
 } from '@kinvolk/headlamp-plugin/lib/CommonComponents';
 import React, { useCallback, useEffect, useState } from 'react';
 import { useIntelGpuContext } from '../api/IntelGpuDataContext';
-import {
-  fetchGpuPluginMetrics,
-  formatBytes,
-  formatFreq,
-  GpuNodeMetrics,
-} from '../api/metrics';
-import { IntelGpuPod } from '../api/k8s';
+import { fetchGpuMetrics, formatPercent, formatWatts, GpuChipMetrics, GpuMetrics } from '../api/metrics';

 // ---------------------------------------------------------------------------
-// Utilization bar
+// Power bar
 // ---------------------------------------------------------------------------

-function UtilizationBar({ pct }: { pct: number }) {
-  const color = pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
+function PowerBar({ watts, maxWatts }: { watts: number; maxWatts: number | null }) {
+  const pct = maxWatts && maxWatts > 0 ? Math.min(100, Math.round((watts / maxWatts) * 100)) : null;
+  const color = pct === null ? '#0071c5' : pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
+
  return (
    <div style={{ display: 'flex', alignItems: 'center', gap: '8px' }}>
-      <div
-        style={{
-          width: '100px',
-          height: '8px',
-          backgroundColor: '#e0e0e0',
-          borderRadius: '4px',
-          overflow: 'hidden',
-          flexShrink: 0,
-        }}
-      >
+      {pct !== null && (
        <div
          style={{
-            width: `${pct}%`,
-            height: '100%',
-            backgroundColor: color,
+            width: '100px',
+            height: '8px',
+            backgroundColor: '#e0e0e0',
            borderRadius: '4px',
-            transition: 'width 0.3s ease',
+            overflow: 'hidden',
+            flexShrink: 0,
          }}
-        />
-      </div>
-      <span style={{ fontSize: '12px', fontVariantNumeric: 'tabular-nums' }}>{pct}%</span>
+        >
+          <div
+            style={{
+              width: `${pct}%`,
+              height: '100%',
+              backgroundColor: color,
+              borderRadius: '4px',
+              transition: 'width 0.4s ease',
+            }}
+          />
+        </div>
+      )}
+      <span style={{ fontSize: '13px', fontVariantNumeric: 'tabular-nums' }}>
+        {formatWatts(watts)}
+        {maxWatts !== null && maxWatts > 0 && (
+          <span style={{ color: '#888', marginLeft: '4px' }}>
+            / {formatWatts(maxWatts)} ({formatPercent(watts, maxWatts)})
+          </span>
+        )}
+      </span>
    </div>
  );
 }

 // ---------------------------------------------------------------------------
-// Per-node metrics card
+// Per-chip card
 // ---------------------------------------------------------------------------

-function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) {
-  const { nodeName, podName, engineUtilization, boostFreqMhz, memoryLocalBytes, memorySystemBytes, energyMicrojoules } = metrics;
+function GpuChipCard({ chip }: { chip: GpuChipMetrics }) {
+  const rows: Array<{ name: string; value: React.ReactNode }> = [
+    { name: 'Node', value: chip.nodeName },
+    { name: 'GPU (PCI)', value: chip.chip },
+  ];

-  // Group engines by card
-  const byCard = new Map<string, typeof engineUtilization>();
-  for (const e of engineUtilization) {
-    if (!byCard.has(e.card)) byCard.set(e.card, []);
-    byCard.get(e.card)!.push(e);
+  if (chip.powerWatts !== null) {
+    rows.push({
+      name: 'Current Power',
+      value: <PowerBar watts={chip.powerWatts} maxWatts={chip.powerMaxWatts} />,
+    });
+  } else {
+    rows.push({
+      name: 'Current Power',
+      value: <StatusLabel status="warning">No data (needs ≥5m of scrape history)</StatusLabel>,
+    });
  }

-  const freqByCard = new Map(boostFreqMhz.map(f => [f.card, f.value]));
-  const memLocalByCard = new Map(memoryLocalBytes.map(m => [m.card, m.value]));
-  const memSysByCard = new Map(memorySystemBytes.map(m => [m.card, m.value]));
-  const energyByCard = new Map(energyMicrojoules.map(e => [e.card, e.value]));
-
-  const cards = Array.from(
-    new Set([
-      ...byCard.keys(),
-      ...freqByCard.keys(),
-      ...memLocalByCard.keys(),
-    ])
-  ).sort();
-
-  if (cards.length === 0) {
-    return (
-      <SectionBox title={`${nodeName} — No Metric Data`}>
-        <NameValueTable
-          rows={[
-            {
-              name: 'Pod',
-              value: podName,
-            },
-            {
-              name: 'Note',
-              value: 'No GPU metrics found. Ensure enableMonitoring: true is set in GpuDevicePlugin.',
-            },
-          ]}
-        />
-      </SectionBox>
-    );
+  if (chip.powerMaxWatts !== null && chip.powerMaxWatts > 0) {
+    rows.push({ name: 'TDP', value: formatWatts(chip.powerMaxWatts) });
  }

  return (
-    <>
-      {cards.map(card => {
-        const engines = byCard.get(card) ?? [];
-        const freq = freqByCard.get(card);
-        const memLocal = memLocalByCard.get(card);
-        const memSys = memSysByCard.get(card);
-        const energy = energyByCard.get(card);
-
-        const rows: Array<{ name: string; value: React.ReactNode }> = [
-          { name: 'Node', value: nodeName },
-          { name: 'Plugin Pod', value: podName },
-          { name: 'GPU Card', value: card },
-        ];
-
-        if (freq !== undefined) {
-          rows.push({ name: 'Boost Frequency', value: formatFreq(freq) });
-        }
-
-        if (memLocal !== undefined) {
-          rows.push({ name: 'VRAM (local)', value: formatBytes(memLocal) });
-        }
-        if (memSys !== undefined && memSys > 0) {
-          rows.push({ name: 'System Memory', value: formatBytes(memSys) });
-        }
-
-        if (energy !== undefined) {
-          rows.push({
-            name: 'Energy (cumulative)',
-            value: `${(energy / 1e6).toFixed(2)} J`,
-          });
-        }
-
-        // Engine utilization rows
-        for (const e of engines) {
-          rows.push({
-            name: `Engine: ${e.engine}`,
-            value: <UtilizationBar pct={e.pct} />,
-          });
-        }
-
-        return (
-          <SectionBox key={`${nodeName}-${card}`} title={`${nodeName} — ${card}`}>
-            <NameValueTable rows={rows} />
-          </SectionBox>
-        );
-      })}
-    </>
+    <SectionBox title={`${chip.nodeName} — ${chip.chip}`}>
+      <NameValueTable rows={rows} />
+    </SectionBox>
  );
 }

@@ -158,38 +99,33 @@ function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) {
 // ---------------------------------------------------------------------------

 export default function MetricsPage() {
-  const { pluginPods, pluginInstalled, loading: ctxLoading } = useIntelGpuContext();
+  const { gpuNodes, loading: ctxLoading } = useIntelGpuContext();

-  const [metricsMap, setMetricsMap] = useState<Map<string, GpuNodeMetrics | 'error'>>(new Map());
+  const [metrics, setMetrics] = useState<GpuMetrics | null>(null);
+  const [fetchError, setFetchError] = useState<string | null>(null);
  const [fetching, setFetching] = useState(false);

-  const fetchAll = useCallback(async (pods: IntelGpuPod[]) => {
-    if (pods.length === 0) return;
+  const doFetch = useCallback(async () => {
    setFetching(true);
-
-    const results = await Promise.all(
-      pods.map(async pod => {
-        const name = pod.metadata.name;
-        const namespace = pod.metadata.namespace ?? 'kube-system';
-        const nodeName = pod.spec?.nodeName ?? name;
-        const result = await fetchGpuPluginMetrics(name, namespace, nodeName);
-        return { name, result };
-      })
-    );
-
-    const map = new Map<string, GpuNodeMetrics | 'error'>();
-    for (const { name, result } of results) {
-      map.set(name, result ?? 'error');
+    setFetchError(null);
+    try {
+      const result = await fetchGpuMetrics();
+      setMetrics(result);
+      if (!result) {
+        setFetchError('Could not reach Prometheus. Ensure kube-prometheus-stack is installed in the monitoring namespace.');
+      }
+    } catch (e: unknown) {
+      setFetchError(e instanceof Error ? e.message : String(e));
+    } finally {
+      setFetching(false);
    }
-    setMetricsMap(map);
-    setFetching(false);
  }, []);

  useEffect(() => {
-    if (!ctxLoading && pluginPods.length > 0) {
-      void fetchAll(pluginPods);
+    if (!ctxLoading) {
+      void doFetch();
    }
-  }, [ctxLoading, pluginPods, fetchAll]);
+  }, [ctxLoading, doFetch]);

  if (ctxLoading) {
    return <Loader title="Loading Intel GPU data..." />;
@@ -200,8 +136,8 @@ export default function MetricsPage() {
      <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
        <SectionHeader title="Intel GPU — Metrics" />
        <button
-          onClick={() => void fetchAll(pluginPods)}
-          disabled={fetching || pluginPods.length === 0}
+          onClick={() => void doFetch()}
+          disabled={fetching}
          aria-label="Refresh metrics"
          style={{
            padding: '6px 16px',
@@ -218,94 +154,91 @@ export default function MetricsPage() {
        </button>
      </div>

-      {!pluginInstalled && (
-        <SectionBox title="Intel GPU Plugin Not Detected">
-          <NameValueTable
-            rows={[
-              {
-                name: 'Status',
-                value: (
-                  <StatusLabel status="warning">No Intel GPU device plugin pods found</StatusLabel>
-                ),
-              },
-              {
-                name: 'Note',
-                value: 'Install the Intel GPU device plugin and set enableMonitoring: true to expose Prometheus metrics.',
-              },
-            ]}
-          />
-        </SectionBox>
-      )}
+      {fetching && !metrics && <Loader title="Querying Prometheus for GPU metrics..." />}

-      {pluginInstalled && pluginPods.length === 0 && (
-        <SectionBox title="No Plugin Pods Found">
-          <NameValueTable
-            rows={[
-              {
-                name: 'Status',
-                value: (
-                  <StatusLabel status="warning">Plugin detected via CRD but no pods found</StatusLabel>
-                ),
-              },
-            ]}
-          />
-        </SectionBox>
-      )}
-
-      {pluginPods.length > 0 && metricsMap.size === 0 && fetching && (
-        <Loader title="Fetching GPU metrics..." />
-      )}
-
-      {pluginPods.length > 0 && metricsMap.size === 0 && !fetching && (
+      {fetchError && (
        <SectionBox title="Metrics Unavailable">
+          <NameValueTable
+            rows={[
+              {
+                name: 'Error',
+                value: <StatusLabel status="error">{fetchError}</StatusLabel>,
+              },
+              {
+                name: 'Data Source',
+                value: 'node_hwmon_energy_joule_total (chip_name="i915") via kube-prometheus-stack',
+              },
+              {
+                name: 'Requirements',
+                value: 'kube-prometheus-stack installed in monitoring namespace with node-exporter enabled',
+              },
+            ]}
+          />
+        </SectionBox>
+      )}
+
+      {metrics && metrics.chips.length === 0 && (
+        <SectionBox title="No i915 GPU Metrics Found">
          <NameValueTable
            rows={[
              {
                name: 'Status',
                value: (
                  <StatusLabel status="warning">
-                    Could not fetch metrics from any plugin pod
+                    Prometheus is reachable but no i915 hwmon chips found
                  </StatusLabel>
                ),
              },
              {
-                name: 'Requirements',
-                value: 'Set enableMonitoring: true in GpuDevicePlugin spec and ensure port 9090 is accessible via kube-apiserver proxy.',
+                name: 'Note',
+                value: 'The i915 driver exposes hwmon sensors on discrete Intel GPU nodes. ' +
+                  'Ensure node-exporter is running on GPU nodes with hwmon collector enabled.',
              },
              {
-                name: 'Plugin Pods Found',
-                value: pluginPods.map(p => p.metadata.name).join(', '),
+                name: 'GPU Nodes',
+                value: gpuNodes.length > 0
+                  ? gpuNodes.map(n => n.metadata.name).join(', ')
+                  : 'None detected',
              },
            ]}
          />
        </SectionBox>
      )}

-      {Array.from(metricsMap.entries()).map(([podName, metrics]) => {
-        if (metrics === 'error') {
-          return (
-            <SectionBox key={podName} title={`${podName} — Metrics Unavailable`}>
-              <NameValueTable
-                rows={[
-                  {
-                    name: 'Status',
-                    value: (
-                      <StatusLabel status="error">
-                        Failed to fetch metrics from pod
-                      </StatusLabel>
-                    ),
-                  },
-                  {
-                    name: 'Hint',
-                    value: 'Ensure enableMonitoring: true is set in the GpuDevicePlugin CR and the pod is running.',
-                  },
-                ]}
-              />
-            </SectionBox>
-          );
-        }
-        return <NodeMetricsCard key={podName} metrics={metrics} />;
-      })}
+      {metrics && metrics.chips.length > 0 && (
+        <>
+          <SectionBox title="GPU Power Summary">
+            <NameValueTable
+              rows={[
+                {
+                  name: 'GPUs Monitored',
+                  value: String(metrics.chips.length),
+                },
+                {
+                  name: 'Total Power',
+                  value: (() => {
+                    const total = metrics.chips.reduce((s, c) => s + (c.powerWatts ?? 0), 0);
+                    const maxTotal = metrics.chips.reduce((s, c) => s + (c.powerMaxWatts ?? 0), 0);
+                    return <PowerBar watts={total} maxWatts={maxTotal > 0 ? maxTotal : null} />;
+                  })(),
+                },
+                {
+                  name: 'Last Fetched',
+                  value: new Date(metrics.fetchedAt).toLocaleTimeString(),
+                },
+                {
+                  name: 'Data Source',
+                  value: 'node-exporter hwmon · i915 driver · rate(node_hwmon_energy_joule_total[5m])',
+                },
+              ]}
+            />
+          </SectionBox>
+
+          {metrics.chips.map(chip => (
+            <GpuChipCard key={`${chip.instance}-${chip.chip}`} chip={chip} />
+          ))}
+        </>
+      )}
    </>
  );
 }