chore: bump to v0.3.0

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Happy <yesreply@happy.engineering>
docs: document metric availability and requirements in MetricsPage
2026-02-19 05:57:13 -05:00 · 2026-02-18 22:07:19 -05:00 · 2026-02-18 21:37:16 -05:00 · 2026-02-18 21:23:36 -05:00
5 changed files with 527 additions and 23 deletions
@@ -1,4 +1,4 @@
-version: "0.1.0"
+version: "0.3.0"
 name: headlamp-intel-gpu-plugin
 displayName: Intel GPU
 description: >-
@@ -7,13 +7,15 @@ description: >-
  allocation, pods requesting Intel GPU resources, and injects Intel GPU
  sections into native Node and Pod detail pages. Supports discrete (i915),
  Xe, and integrated GPU nodes with graceful degradation when the device
-  plugin operator is not installed.
+  plugin operator is not installed. Includes a Metrics page showing real-time
+  GPU power draw and TDP for discrete (i915) GPUs, read from node-exporter
+  hwmon metrics in Prometheus (kube-prometheus-stack).
 createdAt: "2026-02-18T00:00:00Z"
 license: Apache-2.0
 category: monitoring-logging

 homeURL: https://github.com/privilegedescalation/headlamp-intel-gpu-plugin
-appVersion: "0.1.0"
+appVersion: "0.3.0"

 keywords:
  - headlamp
@@ -43,6 +45,14 @@ links:
    url: https://intel.github.io/intel-device-plugins-for-kubernetes/

 changes:
+  - kind: added
+    description: "Metrics page: document which metrics require what infrastructure (power via hwmon works out of the box; frequency and utilization need custom exporters)"
+  - kind: added
+    description: "Metrics page: real-time GPU power draw (W) and TDP via node-exporter i915 hwmon metrics in kube-prometheus-stack"
+  - kind: changed
+    description: "Sidebar label changed to intel-gpu"
+  - kind: removed
+    description: "Removed app bar health badge"
  - kind: added
    description: "Overview dashboard: plugin health, GPU node summary, allocation bar, active GPU pods"
  - kind: added
@@ -61,7 +71,7 @@ changes:
    description: "App bar health badge: hidden when no Intel GPU plugin detected"

 annotations:
-  headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.1.0/headlamp-intel-gpu-plugin-0.1.0.tar.gz"
-  headlamp/plugin/archive-checksum: "sha256:d6a50567d0f9e537f0edadac334d6a03cd182f5b64b47264577f2213fd882687"
+  headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.3.0/headlamp-intel-gpu-plugin-0.3.0.tar.gz"
+  headlamp/plugin/archive-checksum: "sha256:fdc53099ee3123680f24fe4a319b753ca3d030aac31abd4e3f383221085c9c2d"
  headlamp/plugin/version-compat: ">=0.20.0"
  headlamp/plugin/distro-compat: "in-cluster,web,app"
@@ -1,6 +1,6 @@
 {
  "name": "headlamp-intel-gpu-plugin",
-  "version": "0.1.0",
+  "version": "0.3.0",
  "description": "Headlamp plugin for Intel GPU device plugin visibility and monitoring",
  "repository": {
    "type": "git",
@@ -0,0 +1,169 @@
+/**
+ * Intel GPU metrics via Prometheus (kube-prometheus-stack).
+ *
+ * The Intel i915/Xe GPU driver exposes hwmon sensors that node-exporter
+ * scrapes automatically. We query Prometheus for:
+ *   - node_hwmon_energy_joule_total  (chip_name="i915") → rate = power in W
+ *   - node_hwmon_power_max_watt      (same chip)        → TDP
+ *   - node_hwmon_chip_names          (chip_name="i915") → identify GPU chips
+ *   - node_uname_info                                   → instance → nodename
+ *
+ * Queries go through the Kubernetes API proxy to the in-cluster Prometheus
+ * service: /api/v1/namespaces/monitoring/services/{svc}:{port}/proxy/...
+ */
+
+import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+/**
+ * Power readings for a single i915 hwmon chip, as assembled by fetchGpuMetrics.
+ */
+export interface GpuChipMetrics {
+  /** Kubernetes node name (e.g. "buttons"); falls back to the instance string when node_uname_info has no entry for it */
+  nodeName: string;
+  /** PCI chip address (e.g. "0000:09:01_0_0000:0a:00_0"), taken from the "chip" label */
+  chip: string;
+  /** node-exporter instance (IP:port), taken from the "instance" label */
+  instance: string;
+  /** Current power draw in watts (rate of energy counter, null if unavailable) */
+  powerWatts: number | null;
+  /** Maximum / TDP power in watts (null when the max-power query returned no sample for this chip) */
+  powerMaxWatts: number | null;
+}
+
+/**
+ * Result of one metrics fetch: every detected GPU chip plus the fetch time.
+ */
+export interface GpuMetrics {
+  /** One entry per node_hwmon_chip_names{chip_name="i915"} sample; may be empty */
+  chips: GpuChipMetrics[];
+  /** ISO timestamp of when metrics were fetched */
+  fetchedAt: string;
+}
+
+// ---------------------------------------------------------------------------
+// Prometheus query helper
+// ---------------------------------------------------------------------------
+
+/**
+ * One entry of a Prometheus query result as consumed by this module.
+ */
+interface PrometheusResult {
+  /** Label set of the sample (e.g. "chip", "instance", "nodename") */
+  metric: Record<string, string>;
+  /**
+   * Sample pair; index 1 is the value as a string and is parsed with parseFloat.
+   * NOTE(review): index 0 is presumably the sample timestamp per the Prometheus
+   * HTTP API — it is not read anywhere in this module.
+   */
+  value: [number, string];
+}
+
+/**
+ * Envelope returned by the Prometheus query endpoint, limited to the fields
+ * this module reads.
+ */
+interface PrometheusResponse {
+  /** Only the value 'success' is treated as a usable response */
+  status: string;
+  data: {
+    /** Kind of result returned; not inspected by this module */
+    resultType: string;
+    /** Result entries; callers treat a missing list as empty */
+    result: PrometheusResult[];
+  };
+}
+
+/**
+ * Service discovery: candidate Prometheus services, probed in list order.
+ * Tries the kube-prometheus-stack default name first, then prometheus-operated,
+ * then a plain "prometheus" service — all in the monitoring namespace on port 9090.
+ */
+const PROMETHEUS_SERVICES = [
+  { namespace: 'monitoring', service: 'kube-prometheus-stack-prometheus', port: '9090' },
+  { namespace: 'monitoring', service: 'prometheus-operated', port: '9090' },
+  { namespace: 'monitoring', service: 'prometheus', port: '9090' },
+];
+
+/**
+ * Run a single query against Prometheus through the Kubernetes API proxy.
+ *
+ * @param query - PromQL expression; it is URL-encoded before being sent.
+ * @param prometheusPath - Proxy base path of the Prometheus service.
+ * @returns The result entries, or an empty array when the response status is
+ *   not 'success' or the response carries no result list. Errors thrown by
+ *   ApiProxy.request are not caught here.
+ */
+async function queryPrometheus(
+  query: string,
+  prometheusPath: string
+): Promise<PrometheusResult[]> {
+  const url = prometheusPath + '/api/v1/query?query=' + encodeURIComponent(query);
+  const response = (await ApiProxy.request(url, { method: 'GET' })) as PrometheusResponse;
+
+  return response?.status === 'success' ? (response.data?.result ?? []) : [];
+}
+
+/**
+ * Probe each candidate in PROMETHEUS_SERVICES, in order, with a trivial query.
+ *
+ * @returns The API-proxy base path of the first service whose response status
+ *   is 'success', or null when none of the candidates answered that way.
+ */
+async function findPrometheusPath(): Promise<string | null> {
+  for (const candidate of PROMETHEUS_SERVICES) {
+    const proxyBase =
+      '/api/v1/namespaces/' +
+      candidate.namespace +
+      '/services/' +
+      candidate.service +
+      ':' +
+      candidate.port +
+      '/proxy';
+    try {
+      const probe = (await ApiProxy.request(proxyBase + '/api/v1/query?query=1', {
+        method: 'GET',
+      })) as PrometheusResponse;
+      if (probe?.status !== 'success') continue;
+      return proxyBase;
+    } catch {
+      // This candidate threw (missing or unreachable service); move on to the next one.
+    }
+  }
+  return null;
+}
+
+// ---------------------------------------------------------------------------
+// Metrics fetch
+// ---------------------------------------------------------------------------
+
+/**
+ * Fetch per-GPU power metrics for every i915 hwmon chip known to Prometheus.
+ *
+ * Locates a reachable Prometheus service, runs the chip, power-rate, max-power
+ * and node-name queries in parallel, and joins the results per chip.
+ *
+ * Readings are keyed by (instance, chip) rather than by chip alone: the chip
+ * label is a PCI address, so GPUs sitting at the same address on different
+ * nodes would otherwise overwrite each other's power and TDP values.
+ *
+ * @returns The assembled metrics, or null when no Prometheus service responded.
+ *   Errors thrown by the individual queries are not caught here.
+ */
+export async function fetchGpuMetrics(): Promise<GpuMetrics | null> {
+  const prometheusPath = await findPrometheusPath();
+  if (!prometheusPath) return null;
+
+  // JSON-encoding the pair gives an unambiguous composite map key.
+  const readingKey = (metric: Record<string, string>): string =>
+    JSON.stringify([metric['instance'] ?? '', metric['chip'] ?? '']);
+
+  // Index samples by (instance, chip). Samples without a chip label, or whose
+  // value does not parse to a finite number (e.g. "NaN"), are treated as absent
+  // so the UI reports "no data" instead of rendering "NaN W".
+  const indexByChip = (results: PrometheusResult[]): Map<string, number> => {
+    const byChip = new Map<string, number>();
+    for (const r of results) {
+      const value = parseFloat(r.value[1]);
+      if (r.metric['chip'] && Number.isFinite(value)) {
+        byChip.set(readingKey(r.metric), value);
+      }
+    }
+    return byChip;
+  };
+
+  // Run queries in parallel
+  const [chipResults, energyRateResults, powerMaxResults, unameResults] = await Promise.all([
+    // i915 chip identification
+    queryPrometheus('node_hwmon_chip_names{chip_name="i915"}', prometheusPath),
+    // Current power (rate of cumulative energy counter)
+    queryPrometheus(
+      'rate(node_hwmon_energy_joule_total[5m]) * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}',
+      prometheusPath
+    ),
+    // TDP / max power
+    queryPrometheus(
+      'node_hwmon_power_max_watt * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}',
+      prometheusPath
+    ),
+    // instance → nodename mapping
+    queryPrometheus('node_uname_info', prometheusPath),
+  ]);
+
+  // Build instance → nodename map
+  const instanceToNode = new Map<string, string>();
+  for (const r of unameResults) {
+    const inst = r.metric['instance'];
+    const nodename = r.metric['nodename'] ?? r.metric['node'] ?? inst;
+    if (inst) instanceToNode.set(inst, nodename);
+  }
+
+  const chipToPower = indexByChip(energyRateResults);
+  const chipToMaxPower = indexByChip(powerMaxResults);
+
+  // Assemble per-chip metrics from the chip identification results
+  const chips: GpuChipMetrics[] = chipResults.map(r => {
+    const chip = r.metric['chip'] ?? '';
+    const instance = r.metric['instance'] ?? '';
+    const key = readingKey(r.metric);
+
+    return {
+      // Fall back to the raw instance when node_uname_info has no entry for it.
+      nodeName: instanceToNode.get(instance) ?? instance,
+      chip,
+      instance,
+      // `??` keeps a legitimate 0 W reading; only a missing entry becomes null.
+      powerWatts: chipToPower.get(key) ?? null,
+      powerMaxWatts: chipToMaxPower.get(key) ?? null,
+    };
+  });
+
+  return {
+    chips,
+    fetchedAt: new Date().toISOString(),
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Formatting helpers
+// ---------------------------------------------------------------------------
+
+/** Render a wattage with one decimal place followed by " W" (e.g. 12.34 → "12.3 W"). */
+export function formatWatts(w: number): string {
+  const rounded = w.toFixed(1);
+  return rounded + ' W';
+}
+
+/**
+ * Express used/max as a whole-number percentage string (e.g. 1, 4 → "25%").
+ * Returns an em dash when max is zero or negative.
+ */
+export function formatPercent(used: number, max: number): string {
+  return max <= 0 ? '—' : Math.round((used / max) * 100) + '%';
+}
@@ -0,0 +1,317 @@
+/**
+ * MetricsPage — Intel GPU metrics from Prometheus (node-exporter hwmon).
+ *
+ * METRIC AVAILABILITY
+ * -------------------
+ * Power (current W, TDP)
+ *   Source:   node_hwmon_energy_joule_total, node_hwmon_power_max_watt
+ *   Driver:   i915 hwmon sysfs (/sys/class/drm/card{N}/device/hwmon/)
+ *   Scraped:  node-exporter hwmon collector (enabled by default)
+ *   Nodes:    Discrete GPU nodes only (i915 driver exposes hwmon; iGPU driver does not)
+ *   No extra config required — works out of the box with kube-prometheus-stack.
+ *
+ * GPU Frequency (current, boost, min, max MHz)
+ *   Source:   DRM sysfs (/sys/class/drm/card{N}/gt_{x}_freq_mhz)
+ *   Driver:   i915 kernel driver
+ *   Scraped:  NOT available -- node-exporter --collector.drm is AMD-only and does not
+ *             read i915 gt_freq sysfs files. Would require a custom exporter or
+ *             node-exporter textfile collector sidecar writing these values.
+ *
+ * GPU Utilization (engine busy %)
+ *   Source:   Not exposed via hwmon or any standard Prometheus collector for i915.
+ *             Would require intel-gpu-top, XPU Manager, or a custom DRM-based exporter.
+ *
+ * Integrated GPU (iGPU) nodes
+ *   The iGPU driver does not expose hwmon sensors. No Prometheus metrics are
+ *   available for iGPU nodes regardless of configuration.
+ */
+
+import {
+  Loader,
+  NameValueTable,
+  SectionBox,
+  SectionHeader,
+  StatusLabel,
+} from '@kinvolk/headlamp-plugin/lib/CommonComponents';
+import React, { useCallback, useEffect, useState } from 'react';
+import { useIntelGpuContext } from '../api/IntelGpuDataContext';
+import { fetchGpuMetrics, formatPercent, formatWatts, GpuChipMetrics, GpuMetrics } from '../api/metrics';
+
+// ---------------------------------------------------------------------------
+// Power bar
+// ---------------------------------------------------------------------------
+
+/**
+ * Inline power readout: an optional 100px usage bar followed by the wattage text.
+ *
+ * The bar and the "/ max (pct)" suffix are rendered only when maxWatts is a
+ * positive number. The fill width is capped at 100%; its colour is #f57c00 from
+ * 70% and #d32f2f from 90% of maxWatts, otherwise #0071c5.
+ */
+function PowerBar({ watts, maxWatts }: { watts: number; maxWatts: number | null }) {
+  let pct: number | null = null;
+  if (maxWatts && maxWatts > 0) {
+    pct = Math.min(100, Math.round((watts / maxWatts) * 100));
+  }
+
+  let color = '#0071c5';
+  if (pct !== null && pct >= 90) {
+    color = '#d32f2f';
+  } else if (pct !== null && pct >= 70) {
+    color = '#f57c00';
+  }
+
+  // Grey track with a coloured fill; omitted entirely when no percentage is known.
+  const bar =
+    pct === null ? null : (
+      <div
+        style={{
+          width: '100px',
+          height: '8px',
+          backgroundColor: '#e0e0e0',
+          borderRadius: '4px',
+          overflow: 'hidden',
+          flexShrink: 0,
+        }}
+      >
+        <div
+          style={{
+            width: `${pct}%`,
+            height: '100%',
+            backgroundColor: color,
+            borderRadius: '4px',
+            transition: 'width 0.4s ease',
+          }}
+        />
+      </div>
+    );
+
+  const maxSuffix =
+    maxWatts !== null && maxWatts > 0 ? (
+      <span style={{ color: '#888', marginLeft: '4px' }}>
+        / {formatWatts(maxWatts)} ({formatPercent(watts, maxWatts)})
+      </span>
+    ) : null;
+
+  return (
+    <div style={{ display: 'flex', alignItems: 'center', gap: '8px' }}>
+      {bar}
+      <span style={{ fontSize: '13px', fontVariantNumeric: 'tabular-nums' }}>
+        {formatWatts(watts)}
+        {maxSuffix}
+      </span>
+    </div>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Per-chip card
+// ---------------------------------------------------------------------------
+
+/**
+ * Section box for one GPU chip: node, PCI address, current power, and — when a
+ * positive max-power reading exists — a TDP row.
+ */
+function GpuChipCard({ chip }: { chip: GpuChipMetrics }) {
+  const { nodeName, chip: pciAddress, powerWatts, powerMaxWatts } = chip;
+
+  // Without a power reading there is nothing to plot, so show a hint instead of the bar.
+  const powerCell =
+    powerWatts === null ? (
+      <StatusLabel status="warning">No data — needs ≥5m of scrape history</StatusLabel>
+    ) : (
+      <PowerBar watts={powerWatts} maxWatts={powerMaxWatts} />
+    );
+
+  const rows: Array<{ name: string; value: React.ReactNode }> = [
+    { name: 'Node', value: nodeName },
+    { name: 'GPU (PCI)', value: pciAddress },
+    { name: 'Current Power', value: powerCell },
+    ...(powerMaxWatts !== null && powerMaxWatts > 0
+      ? [{ name: 'TDP', value: formatWatts(powerMaxWatts) }]
+      : []),
+  ];
+
+  return (
+    <SectionBox title={`${nodeName} — ${pciAddress}`}>
+      <NameValueTable rows={rows} />
+    </SectionBox>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Requirements info box
+// ---------------------------------------------------------------------------
+
+/**
+ * Static "Metric Availability" box: one row per metric category (power,
+ * frequency, utilization, iGPU nodes), each with a status label and a short
+ * explanation. Takes no props and reads no state; the content is fixed text.
+ */
+function MetricRequirements() {
+  return (
+    <SectionBox title="Metric Availability">
+      <NameValueTable
+        rows={[
+          {
+            name: 'Power (W)',
+            value: (
+              <>
+                <StatusLabel status="success">Available — discrete GPU nodes</StatusLabel>
+                <div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
+                  Source: <code>node_hwmon_energy_joule_total</code> via node-exporter hwmon collector (enabled by default).
+                  Requires the i915 kernel driver on the node. iGPU nodes do not expose hwmon sensors.
+                </div>
+              </>
+            ),
+          },
+          {
+            name: 'Frequency (MHz)',
+            value: (
+              <>
+                <StatusLabel status="error">Not available</StatusLabel>
+                <div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
+                  i915 exposes <code>gt_*_freq_mhz</code> via DRM sysfs but node-exporter&apos;s{' '}
+                  <code>--collector.drm</code> flag is AMD-only and does not read these files.
+                  A custom exporter or textfile-collector sidecar writing these values would be required.
+                </div>
+              </>
+            ),
+          },
+          {
+            name: 'Utilization (%)',
+            value: (
+              <>
+                <StatusLabel status="error">Not available</StatusLabel>
+                <div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
+                  No standard Prometheus collector exposes i915 engine busy percentage.
+                  Would require intel-gpu-top, XPU Manager, or a custom DRM-based exporter.
+                </div>
+              </>
+            ),
+          },
+          {
+            name: 'iGPU nodes',
+            value: (
+              <>
+                <StatusLabel status="error">No metrics available</StatusLabel>
+                <div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
+                  The integrated GPU driver does not expose hwmon sensors. No Prometheus metrics
+                  are available for iGPU nodes regardless of configuration.
+                </div>
+              </>
+            ),
+          },
+        ]}
+      />
+    </SectionBox>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Main page
+// ---------------------------------------------------------------------------
+
+/**
+ * Combine per-chip readings into fleet totals for the summary row.
+ *
+ * Only chips that currently report a power value are counted, so a chip with no
+ * reading is neither shown as 0 W nor allowed to inflate the TDP total. The TDP
+ * total is reported only when every counted chip has a positive TDP; otherwise
+ * the percentage would compare power and TDP over different sets of GPUs.
+ *
+ * @returns null when no chip has a power reading.
+ */
+function summarizePower(
+  chips: GpuChipMetrics[]
+): { watts: number; maxWatts: number | null } | null {
+  const measured = chips.filter(c => c.powerWatts !== null);
+  if (measured.length === 0) return null;
+
+  const watts = measured.reduce((sum, c) => sum + (c.powerWatts ?? 0), 0);
+  const allHaveTdp = measured.every(c => c.powerMaxWatts !== null && c.powerMaxWatts > 0);
+  const maxWatts = allHaveTdp
+    ? measured.reduce((sum, c) => sum + (c.powerMaxWatts ?? 0), 0)
+    : null;
+
+  return { watts, maxWatts };
+}
+
+/**
+ * Metrics page: fetches GPU power metrics once the Intel GPU context has
+ * finished loading, and again whenever the Refresh button is pressed.
+ *
+ * Renders, in order: the header with the Refresh button, the static metric
+ * availability box, a loader during the first fetch, an error box when the
+ * fetch failed or no Prometheus service responded, a hint box when Prometheus
+ * answered but returned no i915 chips, and otherwise a fleet summary followed
+ * by one card per chip.
+ */
+export default function MetricsPage() {
+  const { gpuNodes, loading: ctxLoading } = useIntelGpuContext();
+
+  const [metrics, setMetrics] = useState<GpuMetrics | null>(null);
+  const [fetchError, setFetchError] = useState<string | null>(null);
+  const [fetching, setFetching] = useState(false);
+
+  const doFetch = useCallback(async () => {
+    setFetching(true);
+    setFetchError(null);
+    try {
+      const result = await fetchGpuMetrics();
+      setMetrics(result);
+      // A null result means no candidate Prometheus service answered.
+      if (!result) {
+        setFetchError('Could not reach Prometheus. Ensure kube-prometheus-stack is installed in the monitoring namespace.');
+      }
+    } catch (e: unknown) {
+      setFetchError(e instanceof Error ? e.message : String(e));
+    } finally {
+      setFetching(false);
+    }
+  }, []);
+
+  useEffect(() => {
+    if (!ctxLoading) {
+      void doFetch();
+    }
+  }, [ctxLoading, doFetch]);
+
+  if (ctxLoading) {
+    return <Loader title="Loading Intel GPU data..." />;
+  }
+
+  // Fleet totals over the chips that actually have a power reading.
+  const powerSummary = metrics ? summarizePower(metrics.chips) : null;
+
+  return (
+    <>
+      <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
+        <SectionHeader title="Intel GPU — Metrics" />
+        <button
+          onClick={() => void doFetch()}
+          disabled={fetching}
+          aria-label="Refresh metrics"
+          style={{
+            padding: '6px 16px',
+            backgroundColor: 'transparent',
+            color: 'var(--mui-palette-primary-main, #0071c5)',
+            border: '1px solid var(--mui-palette-primary-main, #0071c5)',
+            borderRadius: '4px',
+            // Do not advertise the button as clickable while it is disabled.
+            cursor: fetching ? 'default' : 'pointer',
+            fontSize: '13px',
+            fontWeight: 500,
+          }}
+        >
+          {fetching ? 'Refreshing…' : 'Refresh'}
+        </button>
+      </div>
+
+      <MetricRequirements />
+
+      {fetching && !metrics && <Loader title="Querying Prometheus for GPU metrics..." />}
+
+      {fetchError && (
+        <SectionBox title="Prometheus Unreachable">
+          <NameValueTable
+            rows={[
+              {
+                name: 'Error',
+                value: <StatusLabel status="error">{fetchError}</StatusLabel>,
+              },
+              {
+                name: 'Checked services',
+                value: 'kube-prometheus-stack-prometheus:9090, prometheus-operated:9090, prometheus:9090 (monitoring namespace)',
+              },
+            ]}
+          />
+        </SectionBox>
+      )}
+
+      {metrics && metrics.chips.length === 0 && (
+        <SectionBox title="No i915 Metrics in Prometheus">
+          <NameValueTable
+            rows={[
+              {
+                name: 'Status',
+                value: (
+                  <StatusLabel status="warning">
+                    Prometheus reachable — no node_hwmon_chip_names&#123;chip_name=&quot;i915&quot;&#125; found
+                  </StatusLabel>
+                ),
+              },
+              {
+                name: 'GPU Nodes',
+                value: gpuNodes.length > 0 ? gpuNodes.map(n => n.metadata.name).join(', ') : 'None detected',
+              },
+              {
+                name: 'Likely cause',
+                value: 'node-exporter is not running on the GPU nodes, or the hwmon collector is disabled.',
+              },
+            ]}
+          />
+        </SectionBox>
+      )}
+
+      {metrics && metrics.chips.length > 0 && (
+        <>
+          <SectionBox title="GPU Power Summary">
+            <NameValueTable
+              rows={[
+                {
+                  name: 'GPUs Monitored',
+                  value: String(metrics.chips.length),
+                },
+                {
+                  name: 'Total Power',
+                  // With no readings at all, show the same hint as the per-chip
+                  // cards instead of a misleading "0.0 W".
+                  value: powerSummary ? (
+                    <PowerBar watts={powerSummary.watts} maxWatts={powerSummary.maxWatts} />
+                  ) : (
+                    <StatusLabel status="warning">No data — needs ≥5m of scrape history</StatusLabel>
+                  ),
+                },
+                {
+                  name: 'Last Fetched',
+                  value: new Date(metrics.fetchedAt).toLocaleTimeString(),
+                },
+                {
+                  name: 'Query',
+                  value: 'rate(node_hwmon_energy_joule_total[5m]) joined with node_hwmon_chip_names{chip_name="i915"}',
+                },
+              ]}
+            />
+          </SectionBox>
+
+          {metrics.chips.map(chip => (
+            <GpuChipCard key={`${chip.instance}-${chip.chip}`} chip={chip} />
+          ))}
+        </>
+      )}
+    </>
+  );
+}
@@ -1,20 +1,17 @@
 /**
 * headlamp-intel-gpu-plugin — entry point.
 *
- * Registers sidebar entries, routes, detail view sections, table column
- * processors, and app bar action for Intel GPU device plugin visibility
- * in Headlamp.
+ * Registers sidebar entries, routes, detail view sections, and table column
+ * processors for Intel GPU device plugin visibility in Headlamp.
 *
 * Surfaces Intel GPU information in the following places:
- *   - Dedicated sidebar section: Overview / Device Plugins / Nodes / Pods
+ *   - Dedicated sidebar section: Overview / Device Plugins / Nodes / Pods / Metrics
 *   - Native Node detail page: Intel GPU section (capacity, utilization, pods)
 *   - Native Pod detail page: GPU resource requests per container
 *   - Native Nodes table: GPU Type and GPU Devices columns
- *   - App bar: health badge (hidden when plugin not installed)
 */

 import {
-  registerAppBarAction,
  registerDetailsViewSection,
  registerResourceTableColumnsProcessor,
  registerRoute,
@@ -22,9 +19,9 @@ import {
 } from '@kinvolk/headlamp-plugin/lib';
 import React from 'react';
 import { IntelGpuDataProvider } from './api/IntelGpuDataContext';
-import AppBarGpuBadge from './components/AppBarGpuBadge';
 import DevicePluginsPage from './components/DevicePluginsPage';
 import { buildNodeGpuColumns } from './components/integrations/NodeColumns';
+import MetricsPage from './components/MetricsPage';
 import NodeDetailSection from './components/NodeDetailSection';
 import NodesPage from './components/NodesPage';
 import OverviewPage from './components/OverviewPage';
@@ -38,7 +35,7 @@ import PodsPage from './components/PodsPage';
 registerSidebarEntry({
  parent: null,
  name: 'intel-gpu',
-  label: 'Intel GPU',
+  label: 'intel-gpu',
  url: '/intel-gpu',
  icon: 'mdi:gpu',
 });
@@ -75,6 +72,14 @@ registerSidebarEntry({
  icon: 'mdi:cube-outline',
 });

+registerSidebarEntry({
+  parent: 'intel-gpu',
+  name: 'intel-gpu-metrics',
+  label: 'Metrics',
+  url: '/intel-gpu/metrics',
+  icon: 'mdi:chart-line',
+});
+
 // ---------------------------------------------------------------------------
 // Routes
 // ---------------------------------------------------------------------------
@@ -127,6 +132,18 @@ registerRoute({
  ),
 });

+registerRoute({
+  path: '/intel-gpu/metrics',
+  sidebar: 'intel-gpu-metrics',
+  name: 'intel-gpu-metrics',
+  exact: true,
+  component: () => (
+    <IntelGpuDataProvider>
+      <MetricsPage />
+    </IntelGpuDataProvider>
+  ),
+});
+
 // ---------------------------------------------------------------------------
 // Detail view section — Node pages
 // Inject Intel GPU section into native Node detail page for GPU nodes.
@@ -164,12 +181,3 @@ registerResourceTableColumnsProcessor(({ id, columns }) => {
  return columns;
 });

-// ---------------------------------------------------------------------------
-// App bar action — Intel GPU health badge
-// ---------------------------------------------------------------------------
-
-registerAppBarAction(() => (
-  <IntelGpuDataProvider>
-    <AppBarGpuBadge />
-  </IntelGpuDataProvider>
-));
Author	SHA1	Message	Date
Chris Farhood	2eb19f8401	chore: bump to v0.3.0 Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Happy <yesreply@happy.engineering>	2026-02-19 05:57:13 -05:00
Chris Farhood	cc0ad5b286	docs: document metric availability and requirements in MetricsPage Add a file-level comment and in-page requirements section explaining exactly what is and isn't available for each metric type: Power (W) -- available on discrete GPU nodes via node-exporter hwmon collector + i915 driver (no extra config) Frequency (MHz) -- NOT available; node-exporter --collector.drm is AMD-only and does not read i915 gt_freq sysfs Utilization (%) -- NOT available; no standard Prometheus collector supports i915 engine busy metrics iGPU nodes -- no metrics at all (iGPU driver has no hwmon) The in-page MetricRequirements component surfaces this information directly in the UI so operators know what to expect and why. Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Happy <yesreply@happy.engineering>	2026-02-18 22:07:19 -05:00
Chris Farhood	4b4e565a1a	fix: switch Metrics page to Prometheus/node-exporter i915 hwmon source The Intel GPU device plugin -enable-monitoring flag registers a monitoring K8s resource type (not a Prometheus endpoint). Real GPU power metrics come from node-exporter's hwmon collector which scrapes the i915 kernel driver. - Rewrite src/api/metrics.ts: query kube-prometheus-stack Prometheus for node_hwmon_energy_joule_total (rate → watts), node_hwmon_power_max_watt (TDP), joined with node_hwmon_chip_names{chip_name="i915"} to identify GPU chips. Instance → node name resolved via node_uname_info. - Rewrite src/components/MetricsPage.tsx: shows per-chip current power (W) with bar vs TDP, total fleet power summary, last-fetched timestamp. Auto-discovers Prometheus service in monitoring namespace. - Update artifacthub-pkg.yml checksum for repackaged v0.2.0 tarball. Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Happy <yesreply@happy.engineering>	2026-02-18 21:37:16 -05:00
Chris Farhood	a226f0191c	feat: add Metrics page, remove app bar badge, fix sidebar label - Add src/api/metrics.ts: Prometheus text parser + fetchGpuPluginMetrics() fetching from Intel GPU device plugin pods (port 9090). Extracts engine utilization (active/total ticks → %), boost frequency (MHz), VRAM and system memory usage, cumulative energy (µJ). - Add src/components/MetricsPage.tsx: per-card metrics display with inline utilization bars, graceful fallback when enableMonitoring is not set. - Register Metrics sidebar entry (mdi:chart-line) and route /intel-gpu/metrics. - Remove registerAppBarAction and AppBarGpuBadge (colored info bubble). - Fix sidebar parent label: 'Intel GPU' → 'intel-gpu'. - Bump to v0.2.0; update artifacthub-pkg.yml with new archive URL and checksum. Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Happy <yesreply@happy.engineering>	2026-02-18 21:23:36 -05:00