feat: add Metrics page, remove app bar badge, fix sidebar label

- Add src/api/metrics.ts: Prometheus text parser + fetchGpuPluginMetrics()
  fetching from Intel GPU device plugin pods (port 9090). Extracts engine
  utilization (active/total ticks → %), boost frequency (MHz), VRAM and
  system memory usage, cumulative energy (µJ).

- Add src/components/MetricsPage.tsx: per-card metrics display with inline
  utilization bars, graceful fallback when enableMonitoring is not set.

- Register Metrics sidebar entry (mdi:chart-line) and route /intel-gpu/metrics.

- Remove registerAppBarAction and AppBarGpuBadge (colored info bubble).

- Fix sidebar parent label: 'Intel GPU' → 'intel-gpu'.

- Bump to v0.2.0; update artifacthub-pkg.yml with new archive URL and checksum.

Generated with [Claude Code](https://claude.ai/code)
via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
2026-02-18 21:23:36 -05:00
parent 3c045e54be
commit 6465124516
5 changed files with 607 additions and 23 deletions
+13 -5
View File
@@ -1,4 +1,4 @@
version: "0.1.0"
version: "0.2.0"
name: headlamp-intel-gpu-plugin
displayName: Intel GPU
description: >-
@@ -7,13 +7,15 @@ description: >-
allocation, pods requesting Intel GPU resources, and injects Intel GPU
sections into native Node and Pod detail pages. Supports discrete (i915),
Xe, and integrated GPU nodes with graceful degradation when the device
plugin operator is not installed.
plugin operator is not installed. Includes a Metrics page showing real-time
engine utilization, GPU frequency, VRAM usage, and energy from the device
plugin's Prometheus endpoint.
createdAt: "2026-02-18T00:00:00Z"
license: Apache-2.0
category: monitoring-logging
homeURL: https://github.com/privilegedescalation/headlamp-intel-gpu-plugin
appVersion: "0.1.0"
appVersion: "0.2.0"
keywords:
- headlamp
@@ -43,6 +45,12 @@ links:
url: https://intel.github.io/intel-device-plugins-for-kubernetes/
changes:
- kind: added
description: "Metrics page: real-time engine utilization, boost frequency, VRAM usage, and energy from Intel GPU device plugin Prometheus endpoint (port 9090)"
- kind: changed
description: "Sidebar label changed to intel-gpu"
- kind: removed
description: "Removed app bar health badge"
- kind: added
description: "Overview dashboard: plugin health, GPU node summary, allocation bar, active GPU pods"
- kind: added
@@ -61,7 +69,7 @@ changes:
description: "App bar health badge: hidden when no Intel GPU plugin detected"
annotations:
headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.1.0/headlamp-intel-gpu-plugin-0.1.0.tar.gz"
headlamp/plugin/archive-checksum: "sha256:d6a50567d0f9e537f0edadac334d6a03cd182f5b64b47264577f2213fd882687"
headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.2.0/headlamp-intel-gpu-plugin-0.2.0.tar.gz"
headlamp/plugin/archive-checksum: "sha256:404be582bd13c167f61785028eb6eb91dd621106cbe76038f2c071a576a1a442"
headlamp/plugin/version-compat: ">=0.20.0"
headlamp/plugin/distro-compat: "in-cluster,web,app"
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "headlamp-intel-gpu-plugin",
"version": "0.1.0",
"version": "0.2.0",
"description": "Headlamp plugin for Intel GPU device plugin visibility and monitoring",
"repository": {
"type": "git",
+257
View File
@@ -0,0 +1,257 @@
/**
* Prometheus text format parser for Intel GPU device plugin metrics.
*
* Fetches raw metrics from the Intel GPU device plugin pod (port 9090)
* via the Kubernetes API proxy and parses key metric families.
*
* Metrics exposed by intel-gpu-plugin when enableMonitoring: true:
* gpu_i915_engine_active_ticks — engine busy ticks (per card, engine)
* gpu_i915_engine_total_ticks — engine total ticks (for utilization %)
* gpu_i915_energy_microjoules — cumulative energy (µJ → power = delta/dt)
* gpu_i915_gt_boost_freq_mhz — current GT boost frequency (MHz)
* gpu_i915_memory_local — local (VRAM) memory usage (bytes)
* gpu_i915_memory_system — system memory usage (bytes)
*/
import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
/** One sample line of a metric family: its label set plus the numeric value. */
export interface MetricSample {
/** Label name/value pairs from the `{...}` part of the line; `{}` when the sample has no labels. */
labels: Record<string, string>;
/** Parsed sample value. The text parser only stores finite numbers. */
value: number;
}
/** All samples that share one metric name, plus the metadata seen for that name. */
export interface MetricFamily {
/** Metric name exactly as it appears on the sample lines. */
name: string;
/** Text of the `# HELP` line for this metric, or '' when none was matched. */
help: string;
/** Text of the `# TYPE` line for this metric, or '' when none was matched. */
type: string;
/** Samples in the order they appeared in the scraped text. */
samples: MetricSample[];
}
/** Parsed scrape result: metric name → family holding that metric's samples. */
export type ParsedMetrics = Map<string, MetricFamily>;
/**
 * Intel GPU metrics extracted from a single device plugin pod's scrape,
 * grouped per card (and per engine for utilization).
 */
export interface GpuNodeMetrics {
/** Node name this metric set was fetched from (via plugin pod) */
nodeName: string;
/** Pod name of the intel-gpu-plugin daemonset pod */
podName: string;
/** Engine utilization per (card, engine): rounded percentage 0–100 (active ticks / total ticks, capped at 100) */
engineUtilization: Array<{ card: string; engine: string; pct: number }>;
/** Current GT boost frequency in MHz per card */
boostFreqMhz: Array<{ card: string; value: number }>;
/** Local VRAM usage in bytes per card */
memoryLocalBytes: Array<{ card: string; value: number }>;
/** System memory usage in bytes per card */
memorySystemBytes: Array<{ card: string; value: number }>;
/** Cumulative energy in µJ per card (raw counter; compute delta for power) */
energyMicrojoules: Array<{ card: string; value: number }>;
/** Raw parsed metric families for advanced use */
raw: ParsedMetrics;
}
// ---------------------------------------------------------------------------
// Prometheus text format parser
// ---------------------------------------------------------------------------
/**
 * Matches one `name="value"` label pair. The value group accepts backslash
 * escape pairs, so an escaped quote (`\"`) inside a value does not terminate it.
 */
const LABEL_PAIR_RE = /(\w+)="((?:[^"\\]|\\.)*)"/g;

/**
 * Decodes the three escapes the Prometheus text format defines for label
 * values (`\\`, `\"`, `\n`). Any other backslash sequence is left untouched.
 */
function unescapeLabelValue(raw: string): string {
  if (!raw.includes('\\')) return raw;
  return raw.replace(/\\(.)/g, (whole: string, ch: string) => {
    if (ch === 'n') return '\n';
    if (ch === '\\' || ch === '"') return ch;
    return whole;
  });
}

/**
 * Parses the text between `{` and `}` of a sample line into a label map.
 *
 * @param labelStr Label section without the surrounding braces,
 *   e.g. `card="card0",engine="render/0"`.
 * @returns Label name → decoded value. Text that does not form a
 *   `name="value"` pair is ignored; a repeated name keeps its last value.
 */
function parseLabels(labelStr: string): Record<string, string> {
  const labels: Record<string, string> = {};
  // Fresh regex per call: a shared /g regex carries `lastIndex` between calls.
  const re = new RegExp(LABEL_PAIR_RE.source, 'g');
  let match: RegExpExecArray | null;
  while ((match = re.exec(labelStr)) !== null) {
    const key = match[1];
    const val = match[2];
    if (key && val !== undefined) {
      labels[key] = unescapeLabelValue(val);
    }
  }
  return labels;
}
/**
 * Parses Prometheus text exposition format into metric families keyed by
 * metric name.
 *
 * Handling notes:
 * - `# HELP` / `# TYPE` lines are remembered per metric name and attached to
 *   the family when its first sample is seen; other comment lines are ignored.
 * - A sample is `name[{labels}] value [timestamp]`, separated by any
 *   whitespace. The optional trailing timestamp is ignored.
 * - Samples whose value is not a finite number (e.g. `NaN`, `+Inf`) and lines
 *   without a value are skipped.
 *
 * @param text Raw body of a `/metrics` response.
 * @returns Map of metric name → family; empty when no sample could be parsed.
 */
export function parsePrometheusText(text: string): ParsedMetrics {
  const families = new Map<string, MetricFamily>();
  // Metadata is tracked by metric name so a TYPE line without a preceding HELP
  // line (or HELP/TYPE for interleaved metrics) is still attributed correctly.
  const helpByName = new Map<string, string>();
  const typeByName = new Map<string, string>();
  const metaRe = /^#\s+(HELP|TYPE)\s+(\S+)(?:\s+(.*))?$/;

  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim();
    if (!line) continue;

    if (line.startsWith('#')) {
      const meta = metaRe.exec(line);
      const keyword = meta?.[1];
      const name = meta?.[2];
      if (keyword !== undefined && name !== undefined) {
        const target = keyword === 'HELP' ? helpByName : typeByName;
        target.set(name, meta?.[3] ?? '');
      }
      continue;
    }

    const openBrace = line.indexOf('{');
    const closeBrace = line.lastIndexOf('}');

    let metricName: string;
    let labels: Record<string, string>;
    let valuePart: string;

    if (openBrace >= 0 && closeBrace > openBrace) {
      metricName = line.slice(0, openBrace).trim();
      labels = parseLabels(line.slice(openBrace + 1, closeBrace));
      valuePart = line.slice(closeBrace + 1).trim();
    } else {
      // Split at the FIRST whitespace: everything after the name is
      // "value [timestamp]", so splitting at the last space would turn the
      // value into part of the name and the timestamp into the value.
      const firstGap = line.search(/\s/);
      if (firstGap < 0) continue;
      metricName = line.slice(0, firstGap);
      labels = {};
      valuePart = line.slice(firstGap + 1).trim();
    }

    const valueStr = valuePart.split(/\s+/)[0] ?? '';
    const value = parseFloat(valueStr);
    if (!Number.isFinite(value)) continue;

    let family = families.get(metricName);
    if (!family) {
      family = {
        name: metricName,
        help: helpByName.get(metricName) ?? '',
        type: typeByName.get(metricName) ?? '',
        samples: [],
      };
      families.set(metricName, family);
    }
    family.samples.push({ labels, value });
  }

  return families;
}
// ---------------------------------------------------------------------------
// Extract Intel GPU metrics from the parsed map
// ---------------------------------------------------------------------------
/** Samples recorded for metric `name`, or an empty list when the family is absent. */
function samplesFor(families: ParsedMetrics, name: string): MetricSample[] {
  return families.get(name)?.samples ?? [];
}

/** Card id of a sample: the `card` label, else the `gpu` label, else 'gpu0'. */
function cardOf(sample: MetricSample): string {
  return sample.labels['card'] ?? sample.labels['gpu'] ?? 'gpu0';
}

/** Engine id of a sample: the `engine` label, else 'render/0'. */
function engineOf(sample: MetricSample): string {
  return sample.labels['engine'] ?? 'render/0';
}

/** Maps every sample of metric `name` to a `{ card, value }` pair. */
function perCardValues(
  families: ParsedMetrics,
  name: string
): Array<{ card: string; value: number }> {
  return samplesFor(families, name).map(s => ({ card: cardOf(s), value: s.value }));
}

/**
 * Builds the per-card Intel GPU view from parsed metric families.
 *
 * Engine utilization is `active_ticks / total_ticks` per (card, engine),
 * rounded and clamped to 0–100; it is 0 when no positive total exists for the
 * pair. Active and total samples are matched using the same label defaults,
 * so samples that carry no `card`/`gpu` or `engine` label still pair up.
 *
 * @param families Parsed scrape output.
 * @param nodeName Node the plugin pod runs on (copied into the result).
 * @param podName Plugin pod the metrics were scraped from (copied into the result).
 * @returns Extracted metrics plus the untouched `families` as `raw`.
 */
export function extractGpuNodeMetrics(
  families: ParsedMetrics,
  nodeName: string,
  podName: string
): GpuNodeMetrics {
  // Index totals once by (card, engine). The first sample for a pair wins,
  // mirroring a first-match lookup.
  const totalTicks = new Map<string, number>();
  for (const total of samplesFor(families, 'gpu_i915_engine_total_ticks')) {
    const key = JSON.stringify([cardOf(total), engineOf(total)]);
    if (!totalTicks.has(key)) totalTicks.set(key, total.value);
  }

  const engineUtilization: GpuNodeMetrics['engineUtilization'] = [];
  for (const active of samplesFor(families, 'gpu_i915_engine_active_ticks')) {
    const card = cardOf(active);
    const engine = engineOf(active);
    const total = totalTicks.get(JSON.stringify([card, engine])) ?? 0;
    const pct =
      total > 0 ? Math.max(0, Math.min(100, Math.round((active.value / total) * 100))) : 0;
    engineUtilization.push({ card, engine, pct });
  }

  return {
    nodeName,
    podName,
    engineUtilization,
    boostFreqMhz: perCardValues(families, 'gpu_i915_gt_boost_freq_mhz'),
    memoryLocalBytes: perCardValues(families, 'gpu_i915_memory_local'),
    memorySystemBytes: perCardValues(families, 'gpu_i915_memory_system'),
    energyMicrojoules: perCardValues(families, 'gpu_i915_energy_microjoules'),
    raw: families,
  };
}
// ---------------------------------------------------------------------------
// Fetch metrics from an Intel GPU device plugin pod
// ---------------------------------------------------------------------------
/** True when `value` is an object exposing a `text()` method (e.g. a fetch `Response`). */
function hasTextMethod(value: unknown): value is { text: () => Promise<unknown> } {
  return (
    typeof value === 'object' &&
    value !== null &&
    typeof (value as { text?: unknown }).text === 'function'
  );
}

/**
 * Normalizes the result of a non-JSON proxy request to its body text.
 *
 * Accepts either an already-decoded string or a `Response`-like object whose
 * body is read through `text()`. Returns null for anything else.
 */
async function readMetricsBody(raw: unknown): Promise<string | null> {
  if (typeof raw === 'string') return raw;
  if (hasTextMethod(raw)) {
    const body: unknown = await raw.text();
    return typeof body === 'string' ? body : null;
  }
  return null;
}

/**
 * Fetches and parses Prometheus metrics from an Intel GPU device plugin pod.
 *
 * The proxy path is:
 *   /api/v1/namespaces/{namespace}/pods/{podName}:9090/proxy/metrics
 *
 * NOTE(review): with `isJSON: false` the Headlamp request helper appears to
 * resolve to the fetch `Response` rather than a string — confirm against the
 * Headlamp version in use. Both shapes are accepted here.
 *
 * @param podName Name of the device plugin pod to scrape.
 * @param namespace Namespace of that pod.
 * @param nodeName Node name copied into the returned metrics.
 * @returns Extracted metrics, or null when the request or parsing throws
 *   (e.g. metrics endpoint not enabled or proxy unreachable) or the response
 *   body cannot be read as text. Failures are deliberately not rethrown so a
 *   single pod cannot break the page.
 */
export async function fetchGpuPluginMetrics(
  podName: string,
  namespace: string,
  nodeName: string
): Promise<GpuNodeMetrics | null> {
  const path = `/api/v1/namespaces/${namespace}/pods/${podName}:9090/proxy/metrics`;

  try {
    const raw: unknown = await ApiProxy.request(path, {
      method: 'GET',
      isJSON: false,
    });

    const body = await readMetricsBody(raw);
    if (body === null) return null;

    const families = parsePrometheusText(body);
    return extractGpuNodeMetrics(families, nodeName, podName);
  } catch {
    return null;
  }
}
// ---------------------------------------------------------------------------
// Formatting helpers
// ---------------------------------------------------------------------------
/**
 * Formats a byte count with decimal (SI) units and one fractional digit,
 * e.g. `1500` → `"1.5 KB"`. Values below 1000 (and NaN) are printed as-is
 * with a `B` suffix.
 *
 * A value that would round up to `1000.0` of its unit is promoted to the next
 * unit, so 999 999 bytes reads `"1.0 MB"` rather than `"1000.0 KB"`.
 *
 * @param bytes Size in bytes.
 * @returns Human-readable size string.
 */
export function formatBytes(bytes: number): string {
  // Written as a negated >= so NaN falls through to the plain-bytes output.
  if (!(bytes >= 1e3)) return `${bytes} B`;

  const scaled = (unitSize: number): string => (bytes / unitSize).toFixed(1);

  if (bytes >= 1e9) return `${scaled(1e9)} GB`;

  if (bytes >= 1e6) {
    const mb = scaled(1e6);
    return mb === '1000.0' ? `${scaled(1e9)} GB` : `${mb} MB`;
  }

  const kb = scaled(1e3);
  return kb === '1000.0' ? `${scaled(1e6)} MB` : `${kb} KB`;
}
/**
 * Renders a frequency as a whole number of megahertz.
 *
 * @param mhz Frequency in MHz; rounded to the nearest integer.
 * @returns The rounded value followed by ` MHz`.
 */
export function formatFreq(mhz: number): string {
  const wholeMhz = Math.round(mhz);
  return [wholeMhz, 'MHz'].join(' ');
}
+311
View File
@@ -0,0 +1,311 @@
/**
* MetricsPage — real-time Intel GPU metrics from the device plugin pods.
*
* Fetches Prometheus metrics from each Intel GPU device plugin pod (port 9090)
* and displays per-card engine utilization, GPU frequency, memory usage,
* and cumulative energy. Requires `enableMonitoring: true` in GpuDevicePlugin.
*/
import {
Loader,
NameValueTable,
SectionBox,
SectionHeader,
StatusLabel,
} from '@kinvolk/headlamp-plugin/lib/CommonComponents';
import React, { useCallback, useEffect, useState } from 'react';
import { useIntelGpuContext } from '../api/IntelGpuDataContext';
import {
fetchGpuPluginMetrics,
formatBytes,
formatFreq,
GpuNodeMetrics,
} from '../api/metrics';
import { IntelGpuPod } from '../api/k8s';
// ---------------------------------------------------------------------------
// Utilization bar
// ---------------------------------------------------------------------------
/**
 * Inline horizontal bar plus numeric label for a utilization percentage.
 *
 * The fill is red at 90% and above, orange at 70% and above, and blue
 * otherwise. `pct` is used directly as the fill width in percent.
 */
function UtilizationBar({ pct }: { pct: number }) {
  let fillColor = '#0071c5';
  if (pct >= 90) {
    fillColor = '#d32f2f';
  } else if (pct >= 70) {
    fillColor = '#f57c00';
  }

  const rowStyle: React.CSSProperties = {
    display: 'flex',
    alignItems: 'center',
    gap: '8px',
  };
  const trackStyle: React.CSSProperties = {
    width: '100px',
    height: '8px',
    backgroundColor: '#e0e0e0',
    borderRadius: '4px',
    overflow: 'hidden',
    flexShrink: 0,
  };
  const fillStyle: React.CSSProperties = {
    width: `${pct}%`,
    height: '100%',
    backgroundColor: fillColor,
    borderRadius: '4px',
    transition: 'width 0.3s ease',
  };
  const labelStyle: React.CSSProperties = {
    fontSize: '12px',
    fontVariantNumeric: 'tabular-nums',
  };

  return (
    <div style={rowStyle}>
      <div style={trackStyle}>
        <div style={fillStyle} />
      </div>
      <span style={labelStyle}>{pct}%</span>
    </div>
  );
}
// ---------------------------------------------------------------------------
// Per-node metrics card
// ---------------------------------------------------------------------------
/**
 * Renders the metrics scraped from one plugin pod as one section per GPU card.
 *
 * A card is listed when ANY metric (engine utilization, boost frequency,
 * local memory, system memory or energy) reports it. When no card is found a
 * single "No Metric Data" section is rendered instead.
 */
function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) {
  const {
    nodeName,
    podName,
    engineUtilization,
    boostFreqMhz,
    memoryLocalBytes,
    memorySystemBytes,
    energyMicrojoules,
  } = metrics;

  // Group engines by card, preserving sample order within each card.
  const byCard = new Map<string, GpuNodeMetrics['engineUtilization']>();
  for (const e of engineUtilization) {
    const bucket = byCard.get(e.card);
    if (bucket) {
      bucket.push(e);
    } else {
      byCard.set(e.card, [e]);
    }
  }

  // card → value lookup; a repeated card keeps its last value.
  const indexByCard = (
    entries: ReadonlyArray<{ card: string; value: number }>
  ): Map<string, number> => {
    const index = new Map<string, number>();
    for (const { card, value } of entries) index.set(card, value);
    return index;
  };

  const freqByCard = indexByCard(boostFreqMhz);
  const memLocalByCard = indexByCard(memoryLocalBytes);
  const memSysByCard = indexByCard(memorySystemBytes);
  const energyByCard = indexByCard(energyMicrojoules);

  // Union over every metric so a card that only reports e.g. energy or system
  // memory is not silently hidden.
  const cards = Array.from(
    new Set([
      ...byCard.keys(),
      ...freqByCard.keys(),
      ...memLocalByCard.keys(),
      ...memSysByCard.keys(),
      ...energyByCard.keys(),
    ])
  ).sort();

  if (cards.length === 0) {
    return (
      <SectionBox title={`${nodeName} — No Metric Data`}>
        <NameValueTable
          rows={[
            {
              name: 'Pod',
              value: podName,
            },
            {
              name: 'Note',
              value: 'No GPU metrics found. Ensure enableMonitoring: true is set in GpuDevicePlugin.',
            },
          ]}
        />
      </SectionBox>
    );
  }

  return (
    <>
      {cards.map(card => {
        const engines = byCard.get(card) ?? [];
        const freq = freqByCard.get(card);
        const memLocal = memLocalByCard.get(card);
        const memSys = memSysByCard.get(card);
        const energy = energyByCard.get(card);

        const rows: Array<{ name: string; value: React.ReactNode }> = [
          { name: 'Node', value: nodeName },
          { name: 'Plugin Pod', value: podName },
          { name: 'GPU Card', value: card },
        ];

        if (freq !== undefined) {
          rows.push({ name: 'Boost Frequency', value: formatFreq(freq) });
        }
        if (memLocal !== undefined) {
          rows.push({ name: 'VRAM (local)', value: formatBytes(memLocal) });
        }
        // System memory is only shown when it is actually in use.
        if (memSys !== undefined && memSys > 0) {
          rows.push({ name: 'System Memory', value: formatBytes(memSys) });
        }
        if (energy !== undefined) {
          rows.push({
            name: 'Energy (cumulative)',
            // Counter is in microjoules; display joules.
            value: `${(energy / 1e6).toFixed(2)} J`,
          });
        }

        // Engine utilization rows
        for (const e of engines) {
          rows.push({
            name: `Engine: ${e.engine}`,
            value: <UtilizationBar pct={e.pct} />,
          });
        }

        return (
          // Separator keeps node and card readable in the title, matching the
          // "<name> — <detail>" form used by the other section titles.
          <SectionBox key={`${nodeName}-${card}`} title={`${nodeName} — ${card}`}>
            <NameValueTable rows={rows} />
          </SectionBox>
        );
      })}
    </>
  );
}
// ---------------------------------------------------------------------------
// Main page
// ---------------------------------------------------------------------------
/**
 * Metrics page: scrapes every Intel GPU device plugin pod and renders the
 * per-card results, with a manual Refresh button.
 *
 * State handling:
 * - Each fetch gets a sequence number; results of a fetch that was superseded
 *   (newer fetch started, pods disappeared, component unmounted) are dropped,
 *   so stale responses can never overwrite newer ones.
 * - While pods exist but no result has arrived yet, a loader is shown.
 * - When every pod failed, a summary section with the requirements is shown
 *   above the per-pod error sections.
 * - When the plugin pods disappear, previously shown metrics are cleared.
 */
export default function MetricsPage() {
  const { pluginPods, pluginInstalled, loading: ctxLoading } = useIntelGpuContext();
  const [metricsMap, setMetricsMap] = useState<Map<string, GpuNodeMetrics | 'error'>>(new Map());
  const [fetching, setFetching] = useState(false);
  // Sequence number of the most recently started (still valid) fetch.
  const requestSeq = React.useRef(0);

  const fetchAll = useCallback(async (pods: IntelGpuPod[]) => {
    if (pods.length === 0) return;

    const requestId = ++requestSeq.current;
    setFetching(true);

    try {
      const results = await Promise.all(
        pods.map(async pod => {
          const name = pod.metadata.name;
          const namespace = pod.metadata.namespace ?? 'kube-system';
          const nodeName = pod.spec?.nodeName ?? name;
          const result = await fetchGpuPluginMetrics(name, namespace, nodeName);
          return { name, result };
        })
      );

      // A newer fetch (or an invalidation) happened while we were waiting.
      if (requestId !== requestSeq.current) return;

      const map = new Map<string, GpuNodeMetrics | 'error'>();
      for (const { name, result } of results) {
        map.set(name, result ?? 'error');
      }
      setMetricsMap(map);
    } finally {
      // Only the latest fetch owns the spinner.
      if (requestId === requestSeq.current) setFetching(false);
    }
  }, []);

  // Invalidate any in-flight fetch when the page unmounts.
  useEffect(() => {
    const seq = requestSeq;
    return () => {
      seq.current += 1;
    };
  }, []);

  useEffect(() => {
    if (ctxLoading) return;

    if (pluginPods.length > 0) {
      void fetchAll(pluginPods);
      return;
    }

    // No pods any more: drop in-flight results and anything shown for old pods.
    requestSeq.current += 1;
    setFetching(false);
    setMetricsMap(prev => (prev.size === 0 ? prev : new Map()));
  }, [ctxLoading, pluginPods, fetchAll]);

  if (ctxLoading) {
    return <Loader title="Loading Intel GPU data..." />;
  }

  const hasPods = pluginPods.length > 0;
  const entries = Array.from(metricsMap.entries());
  // Pods exist but the first fetch has not delivered yet.
  const awaitingFirstResult = hasPods && entries.length === 0;
  const allFailed = entries.length > 0 && entries.every(([, m]) => m === 'error');

  return (
    <>
      <div
        style={{
          display: 'flex',
          justifyContent: 'space-between',
          alignItems: 'center',
          marginBottom: '20px',
        }}
      >
        <SectionHeader title="Intel GPU — Metrics" />
        <button
          onClick={() => void fetchAll(pluginPods)}
          disabled={fetching || pluginPods.length === 0}
          aria-label="Refresh metrics"
          style={{
            padding: '6px 16px',
            backgroundColor: 'transparent',
            color: 'var(--mui-palette-primary-main, #0071c5)',
            border: '1px solid var(--mui-palette-primary-main, #0071c5)',
            borderRadius: '4px',
            cursor: 'pointer',
            fontSize: '13px',
            fontWeight: 500,
          }}
        >
          {fetching ? 'Refreshing…' : 'Refresh'}
        </button>
      </div>

      {!pluginInstalled && (
        <SectionBox title="Intel GPU Plugin Not Detected">
          <NameValueTable
            rows={[
              {
                name: 'Status',
                value: (
                  <StatusLabel status="warning">No Intel GPU device plugin pods found</StatusLabel>
                ),
              },
              {
                name: 'Note',
                value: 'Install the Intel GPU device plugin and set enableMonitoring: true to expose Prometheus metrics.',
              },
            ]}
          />
        </SectionBox>
      )}

      {pluginInstalled && pluginPods.length === 0 && (
        <SectionBox title="No Plugin Pods Found">
          <NameValueTable
            rows={[
              {
                name: 'Status',
                value: (
                  <StatusLabel status="warning">Plugin detected via CRD but no pods found</StatusLabel>
                ),
              },
            ]}
          />
        </SectionBox>
      )}

      {awaitingFirstResult && <Loader title="Fetching GPU metrics..." />}

      {hasPods && allFailed && (
        <SectionBox title="Metrics Unavailable">
          <NameValueTable
            rows={[
              {
                name: 'Status',
                value: (
                  <StatusLabel status="warning">
                    Could not fetch metrics from any plugin pod
                  </StatusLabel>
                ),
              },
              {
                name: 'Requirements',
                value: 'Set enableMonitoring: true in GpuDevicePlugin spec and ensure port 9090 is accessible via kube-apiserver proxy.',
              },
              {
                name: 'Plugin Pods Found',
                value: pluginPods.map(p => p.metadata.name).join(', '),
              },
            ]}
          />
        </SectionBox>
      )}

      {entries.map(([podName, metrics]) => {
        if (metrics === 'error') {
          return (
            <SectionBox key={podName} title={`${podName} — Metrics Unavailable`}>
              <NameValueTable
                rows={[
                  {
                    name: 'Status',
                    value: (
                      <StatusLabel status="error">
                        Failed to fetch metrics from pod
                      </StatusLabel>
                    ),
                  },
                  {
                    name: 'Hint',
                    value: 'Ensure enableMonitoring: true is set in the GpuDevicePlugin CR and the pod is running.',
                  },
                ]}
              />
            </SectionBox>
          );
        }
        return <NodeMetricsCard key={podName} metrics={metrics} />;
      })}
    </>
  );
}
+25 -17
View File
@@ -1,20 +1,17 @@
/**
* headlamp-intel-gpu-plugin — entry point.
*
* Registers sidebar entries, routes, detail view sections, table column
* processors, and app bar action for Intel GPU device plugin visibility
* in Headlamp.
* Registers sidebar entries, routes, detail view sections, and table column
* processors for Intel GPU device plugin visibility in Headlamp.
*
* Surfaces Intel GPU information in the following places:
* - Dedicated sidebar section: Overview / Device Plugins / Nodes / Pods
* - Dedicated sidebar section: Overview / Device Plugins / Nodes / Pods / Metrics
* - Native Node detail page: Intel GPU section (capacity, utilization, pods)
* - Native Pod detail page: GPU resource requests per container
* - Native Nodes table: GPU Type and GPU Devices columns
* - App bar: health badge (hidden when plugin not installed)
*/
import {
registerAppBarAction,
registerDetailsViewSection,
registerResourceTableColumnsProcessor,
registerRoute,
@@ -22,9 +19,9 @@ import {
} from '@kinvolk/headlamp-plugin/lib';
import React from 'react';
import { IntelGpuDataProvider } from './api/IntelGpuDataContext';
import AppBarGpuBadge from './components/AppBarGpuBadge';
import DevicePluginsPage from './components/DevicePluginsPage';
import { buildNodeGpuColumns } from './components/integrations/NodeColumns';
import MetricsPage from './components/MetricsPage';
import NodeDetailSection from './components/NodeDetailSection';
import NodesPage from './components/NodesPage';
import OverviewPage from './components/OverviewPage';
@@ -38,7 +35,7 @@ import PodsPage from './components/PodsPage';
registerSidebarEntry({
parent: null,
name: 'intel-gpu',
label: 'Intel GPU',
label: 'intel-gpu',
url: '/intel-gpu',
icon: 'mdi:gpu',
});
@@ -75,6 +72,14 @@ registerSidebarEntry({
icon: 'mdi:cube-outline',
});
registerSidebarEntry({
parent: 'intel-gpu',
name: 'intel-gpu-metrics',
label: 'Metrics',
url: '/intel-gpu/metrics',
icon: 'mdi:chart-line',
});
// ---------------------------------------------------------------------------
// Routes
// ---------------------------------------------------------------------------
@@ -127,6 +132,18 @@ registerRoute({
),
});
registerRoute({
path: '/intel-gpu/metrics',
sidebar: 'intel-gpu-metrics',
name: 'intel-gpu-metrics',
exact: true,
component: () => (
<IntelGpuDataProvider>
<MetricsPage />
</IntelGpuDataProvider>
),
});
// ---------------------------------------------------------------------------
// Detail view section — Node pages
// Inject Intel GPU section into native Node detail page for GPU nodes.
@@ -164,12 +181,3 @@ registerResourceTableColumnsProcessor(({ id, columns }) => {
return columns;
});
// ---------------------------------------------------------------------------
// App bar action — Intel GPU health badge
// ---------------------------------------------------------------------------
registerAppBarAction(() => (
<IntelGpuDataProvider>
<AppBarGpuBadge />
</IntelGpuDataProvider>
));