diff --git a/artifacthub-pkg.yml b/artifacthub-pkg.yml index cae3ef5..bb009e6 100644 --- a/artifacthub-pkg.yml +++ b/artifacthub-pkg.yml @@ -1,4 +1,4 @@ -version: "0.1.0" +version: "0.2.0" name: headlamp-intel-gpu-plugin displayName: Intel GPU description: >- @@ -7,13 +7,15 @@ description: >- allocation, pods requesting Intel GPU resources, and injects Intel GPU sections into native Node and Pod detail pages. Supports discrete (i915), Xe, and integrated GPU nodes with graceful degradation when the device - plugin operator is not installed. + plugin operator is not installed. Includes a Metrics page showing real-time + engine utilization, GPU frequency, VRAM usage, and energy from the device + plugin's Prometheus endpoint. createdAt: "2026-02-18T00:00:00Z" license: Apache-2.0 category: monitoring-logging homeURL: https://github.com/privilegedescalation/headlamp-intel-gpu-plugin -appVersion: "0.1.0" +appVersion: "0.2.0" keywords: - headlamp @@ -43,6 +45,12 @@ links: url: https://intel.github.io/intel-device-plugins-for-kubernetes/ changes: + - kind: added + description: "Metrics page: real-time engine utilization, boost frequency, VRAM usage, and energy from Intel GPU device plugin Prometheus endpoint (port 9090)" + - kind: changed + description: "Sidebar label changed to intel-gpu" + - kind: removed + description: "Removed app bar health badge" - kind: added description: "Overview dashboard: plugin health, GPU node summary, allocation bar, active GPU pods" - kind: added @@ -61,7 +69,7 @@ changes: description: "App bar health badge: hidden when no Intel GPU plugin detected" annotations: - headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.1.0/headlamp-intel-gpu-plugin-0.1.0.tar.gz" - headlamp/plugin/archive-checksum: "sha256:d6a50567d0f9e537f0edadac334d6a03cd182f5b64b47264577f2213fd882687" + headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.2.0/headlamp-intel-gpu-plugin-0.2.0.tar.gz" + headlamp/plugin/archive-checksum: "sha256:404be582bd13c167f61785028eb6eb91dd621106cbe76038f2c071a576a1a442" headlamp/plugin/version-compat: ">=0.20.0" headlamp/plugin/distro-compat: "in-cluster,web,app" diff --git a/package.json b/package.json index f4152fc..f484454 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "headlamp-intel-gpu-plugin", - "version": "0.1.0", + "version": "0.2.0", "description": "Headlamp plugin for Intel GPU device plugin visibility and monitoring", "repository": { "type": "git", diff --git a/src/api/metrics.ts b/src/api/metrics.ts new file mode 100644 index 0000000..dc88403 --- /dev/null +++ b/src/api/metrics.ts @@ -0,0 +1,257 @@ +/** + * Prometheus text format parser for Intel GPU device plugin metrics. + * + * Fetches raw metrics from the Intel GPU device plugin pod (port 9090) + * via the Kubernetes API proxy and parses key metric families. + * + * Metrics exposed by intel-gpu-plugin when enableMonitoring: true: + * gpu_i915_engine_active_ticks — engine busy ticks (per card, engine) + * gpu_i915_engine_total_ticks — engine total ticks (for utilization %) + * gpu_i915_energy_microjoules — cumulative energy (µJ → power = delta/dt) + * gpu_i915_gt_boost_freq_mhz — current GT boost frequency (MHz) + * gpu_i915_memory_local — local (VRAM) memory usage (bytes) + * gpu_i915_memory_system — system memory usage (bytes) + */ + +import { ApiProxy } from '@kinvolk/headlamp-plugin/lib'; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface MetricSample { + labels: Record; + value: number; +} + +export interface MetricFamily { + name: string; + help: string; + type: string; + samples: MetricSample[]; +} + +export type ParsedMetrics = Map; + +export interface GpuNodeMetrics { + /** Node name this metric set was fetched from (via plugin pod) */ + nodeName: string; + /** Pod name of the intel-gpu-plugin daemonset pod */ + podName: string; + /** Engine utilization per (card, engine): 0–100 */ + engineUtilization: Array<{ card: string; engine: string; pct: number }>; + /** Current GT boost frequency in MHz per card */ + boostFreqMhz: Array<{ card: string; value: number }>; + /** Local VRAM usage in bytes per card */ + memoryLocalBytes: Array<{ card: string; value: number }>; + /** System memory usage in bytes per card */ + memorySystemBytes: Array<{ card: string; value: number }>; + /** Cumulative energy in µJ per card (raw counter; compute delta for power) */ + energyMicrojoules: Array<{ card: string; value: number }>; + /** Raw parsed metric families for advanced use */ + raw: ParsedMetrics; +} + +// --------------------------------------------------------------------------- +// Prometheus text format parser +// --------------------------------------------------------------------------- + +const LABEL_PAIR_RE = /(\w+)="([^"]*)"/g; + +function parseLabels(labelStr: string): Record { + const labels: Record = {}; + let match: RegExpExecArray | null; + const re = new RegExp(LABEL_PAIR_RE.source, 'g'); + while ((match = re.exec(labelStr)) !== null) { + const key = match[1]; + const val = match[2]; + if (key && val !== undefined) { + labels[key] = val; + } + } + return labels; +} + +export function parsePrometheusText(text: string): ParsedMetrics { + const families = new Map(); + let currentName = ''; + let currentHelp = ''; + let currentType = ''; + + for (const rawLine of text.split('\n')) { + const line = rawLine.trim(); + if (!line) continue; + + if (line.startsWith('# HELP ')) { + const rest = line.slice(7); + const spaceIdx = rest.indexOf(' '); + currentName = spaceIdx >= 0 ? rest.slice(0, spaceIdx) : rest; + currentHelp = spaceIdx >= 0 ? rest.slice(spaceIdx + 1) : ''; + continue; + } + + if (line.startsWith('# TYPE ')) { + const rest = line.slice(7); + const spaceIdx = rest.indexOf(' '); + currentType = spaceIdx >= 0 ? rest.slice(spaceIdx + 1) : ''; + continue; + } + + if (line.startsWith('#')) continue; + + const openBrace = line.indexOf('{'); + const closeBrace = line.lastIndexOf('}'); + + let metricName: string; + let labels: Record; + let valuePart: string; + + if (openBrace >= 0 && closeBrace > openBrace) { + metricName = line.slice(0, openBrace); + labels = parseLabels(line.slice(openBrace + 1, closeBrace)); + valuePart = line.slice(closeBrace + 1).trim(); + } else { + const spaceIdx = line.lastIndexOf(' '); + if (spaceIdx < 0) continue; + metricName = line.slice(0, spaceIdx); + labels = {}; + valuePart = line.slice(spaceIdx + 1).trim(); + } + + const valueTokens = valuePart.split(' '); + const valueStr = valueTokens[0] ?? ''; + const value = parseFloat(valueStr); + if (!Number.isFinite(value)) continue; + + const familyKey = metricName; + let family = families.get(familyKey); + if (!family) { + family = { + name: familyKey, + help: metricName === currentName ? currentHelp : '', + type: metricName === currentName ? currentType : '', + samples: [], + }; + families.set(familyKey, family); + } + + family.samples.push({ labels, value }); + } + + return families; +} + +// --------------------------------------------------------------------------- +// Extract Intel GPU metrics from the parsed map +// --------------------------------------------------------------------------- + +function samplesFor(families: ParsedMetrics, name: string): MetricSample[] { + return families.get(name)?.samples ?? []; +} + +export function extractGpuNodeMetrics( + families: ParsedMetrics, + nodeName: string, + podName: string +): GpuNodeMetrics { + const activeSamples = samplesFor(families, 'gpu_i915_engine_active_ticks'); + const totalSamples = samplesFor(families, 'gpu_i915_engine_total_ticks'); + + // Build utilization: active/total per (card, engine) + const engineUtilization: GpuNodeMetrics['engineUtilization'] = []; + for (const active of activeSamples) { + const card = active.labels['card'] ?? active.labels['gpu'] ?? 'gpu0'; + const engine = active.labels['engine'] ?? 'render/0'; + const totalSample = totalSamples.find( + s => + (s.labels['card'] ?? s.labels['gpu']) === card && + s.labels['engine'] === engine + ); + const total = totalSample?.value ?? 0; + const pct = total > 0 ? Math.min(100, Math.round((active.value / total) * 100)) : 0; + engineUtilization.push({ card, engine, pct }); + } + + // Boost frequency + const boostFreqMhz = samplesFor(families, 'gpu_i915_gt_boost_freq_mhz').map(s => ({ + card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0', + value: s.value, + })); + + // Memory + const memoryLocalBytes = samplesFor(families, 'gpu_i915_memory_local').map(s => ({ + card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0', + value: s.value, + })); + const memorySystemBytes = samplesFor(families, 'gpu_i915_memory_system').map(s => ({ + card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0', + value: s.value, + })); + + // Energy + const energyMicrojoules = samplesFor(families, 'gpu_i915_energy_microjoules').map(s => ({ + card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0', + value: s.value, + })); + + return { + nodeName, + podName, + engineUtilization, + boostFreqMhz, + memoryLocalBytes, + memorySystemBytes, + energyMicrojoules, + raw: families, + }; +} + +// --------------------------------------------------------------------------- +// Fetch metrics from an Intel GPU device plugin pod +// --------------------------------------------------------------------------- + +/** + * Fetches and parses Prometheus metrics from an Intel GPU device plugin pod. + * + * The proxy path is: + * /api/v1/namespaces/{namespace}/pods/{podName}:9090/proxy/metrics + * + * Returns null if the pod is not exposing metrics (enableMonitoring: false) + * or if the proxy request fails. + */ +export async function fetchGpuPluginMetrics( + podName: string, + namespace: string, + nodeName: string +): Promise { + const path = `/api/v1/namespaces/${namespace}/pods/${podName}:9090/proxy/metrics`; + + try { + const raw: unknown = await ApiProxy.request(path, { + method: 'GET', + isJSON: false, + }); + + if (typeof raw !== 'string') return null; + + const families = parsePrometheusText(raw); + return extractGpuNodeMetrics(families, nodeName, podName); + } catch { + return null; + } +} + +// --------------------------------------------------------------------------- +// Formatting helpers +// --------------------------------------------------------------------------- + +export function formatBytes(bytes: number): string { + if (bytes >= 1e9) return `${(bytes / 1e9).toFixed(1)} GB`; + if (bytes >= 1e6) return `${(bytes / 1e6).toFixed(1)} MB`; + if (bytes >= 1e3) return `${(bytes / 1e3).toFixed(1)} KB`; + return `${bytes} B`; +} + +export function formatFreq(mhz: number): string { + return `${Math.round(mhz)} MHz`; +} diff --git a/src/components/MetricsPage.tsx b/src/components/MetricsPage.tsx new file mode 100644 index 0000000..711abe0 --- /dev/null +++ b/src/components/MetricsPage.tsx @@ -0,0 +1,311 @@ +/** + * MetricsPage — real-time Intel GPU metrics from the device plugin pods. + * + * Fetches Prometheus metrics from each Intel GPU device plugin pod (port 9090) + * and displays per-card engine utilization, GPU frequency, memory usage, + * and cumulative energy. Requires `enableMonitoring: true` in GpuDevicePlugin. + */ + +import { + Loader, + NameValueTable, + SectionBox, + SectionHeader, + StatusLabel, +} from '@kinvolk/headlamp-plugin/lib/CommonComponents'; +import React, { useCallback, useEffect, useState } from 'react'; +import { useIntelGpuContext } from '../api/IntelGpuDataContext'; +import { + fetchGpuPluginMetrics, + formatBytes, + formatFreq, + GpuNodeMetrics, +} from '../api/metrics'; +import { IntelGpuPod } from '../api/k8s'; + +// --------------------------------------------------------------------------- +// Utilization bar +// --------------------------------------------------------------------------- + +function UtilizationBar({ pct }: { pct: number }) { + const color = pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5'; + return ( +
+
+
+
+ {pct}% +
+ ); +} + +// --------------------------------------------------------------------------- +// Per-node metrics card +// --------------------------------------------------------------------------- + +function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) { + const { nodeName, podName, engineUtilization, boostFreqMhz, memoryLocalBytes, memorySystemBytes, energyMicrojoules } = metrics; + + // Group engines by card + const byCard = new Map(); + for (const e of engineUtilization) { + if (!byCard.has(e.card)) byCard.set(e.card, []); + byCard.get(e.card)!.push(e); + } + + const freqByCard = new Map(boostFreqMhz.map(f => [f.card, f.value])); + const memLocalByCard = new Map(memoryLocalBytes.map(m => [m.card, m.value])); + const memSysByCard = new Map(memorySystemBytes.map(m => [m.card, m.value])); + const energyByCard = new Map(energyMicrojoules.map(e => [e.card, e.value])); + + const cards = Array.from( + new Set([ + ...byCard.keys(), + ...freqByCard.keys(), + ...memLocalByCard.keys(), + ]) + ).sort(); + + if (cards.length === 0) { + return ( + + + + ); + } + + return ( + <> + {cards.map(card => { + const engines = byCard.get(card) ?? []; + const freq = freqByCard.get(card); + const memLocal = memLocalByCard.get(card); + const memSys = memSysByCard.get(card); + const energy = energyByCard.get(card); + + const rows: Array<{ name: string; value: React.ReactNode }> = [ + { name: 'Node', value: nodeName }, + { name: 'Plugin Pod', value: podName }, + { name: 'GPU Card', value: card }, + ]; + + if (freq !== undefined) { + rows.push({ name: 'Boost Frequency', value: formatFreq(freq) }); + } + + if (memLocal !== undefined) { + rows.push({ name: 'VRAM (local)', value: formatBytes(memLocal) }); + } + if (memSys !== undefined && memSys > 0) { + rows.push({ name: 'System Memory', value: formatBytes(memSys) }); + } + + if (energy !== undefined) { + rows.push({ + name: 'Energy (cumulative)', + value: `${(energy / 1e6).toFixed(2)} J`, + }); + } + + // Engine utilization rows + for (const e of engines) { + rows.push({ + name: `Engine: ${e.engine}`, + value: , + }); + } + + return ( + + + + ); + })} + + ); +} + +// --------------------------------------------------------------------------- +// Main page +// --------------------------------------------------------------------------- + +export default function MetricsPage() { + const { pluginPods, pluginInstalled, loading: ctxLoading } = useIntelGpuContext(); + + const [metricsMap, setMetricsMap] = useState>(new Map()); + const [fetching, setFetching] = useState(false); + + const fetchAll = useCallback(async (pods: IntelGpuPod[]) => { + if (pods.length === 0) return; + setFetching(true); + + const results = await Promise.all( + pods.map(async pod => { + const name = pod.metadata.name; + const namespace = pod.metadata.namespace ?? 'kube-system'; + const nodeName = pod.spec?.nodeName ?? name; + const result = await fetchGpuPluginMetrics(name, namespace, nodeName); + return { name, result }; + }) + ); + + const map = new Map(); + for (const { name, result } of results) { + map.set(name, result ?? 'error'); + } + setMetricsMap(map); + setFetching(false); + }, []); + + useEffect(() => { + if (!ctxLoading && pluginPods.length > 0) { + void fetchAll(pluginPods); + } + }, [ctxLoading, pluginPods, fetchAll]); + + if (ctxLoading) { + return ; + } + + return ( + <> +
+ + +
+ + {!pluginInstalled && ( + + No Intel GPU device plugin pods found + ), + }, + { + name: 'Note', + value: 'Install the Intel GPU device plugin and set enableMonitoring: true to expose Prometheus metrics.', + }, + ]} + /> + + )} + + {pluginInstalled && pluginPods.length === 0 && ( + + Plugin detected via CRD but no pods found + ), + }, + ]} + /> + + )} + + {pluginPods.length > 0 && metricsMap.size === 0 && fetching && ( + + )} + + {pluginPods.length > 0 && metricsMap.size === 0 && !fetching && ( + + + Could not fetch metrics from any plugin pod + + ), + }, + { + name: 'Requirements', + value: 'Set enableMonitoring: true in GpuDevicePlugin spec and ensure port 9090 is accessible via kube-apiserver proxy.', + }, + { + name: 'Plugin Pods Found', + value: pluginPods.map(p => p.metadata.name).join(', '), + }, + ]} + /> + + )} + + {Array.from(metricsMap.entries()).map(([podName, metrics]) => { + if (metrics === 'error') { + return ( + + + Failed to fetch metrics from pod + + ), + }, + { + name: 'Hint', + value: 'Ensure enableMonitoring: true is set in the GpuDevicePlugin CR and the pod is running.', + }, + ]} + /> + + ); + } + return ; + })} + + ); +} diff --git a/src/index.tsx b/src/index.tsx index 2d8da11..f05b5e0 100644 --- a/src/index.tsx +++ b/src/index.tsx @@ -1,20 +1,17 @@ /** * headlamp-intel-gpu-plugin — entry point. * - * Registers sidebar entries, routes, detail view sections, table column - * processors, and app bar action for Intel GPU device plugin visibility - * in Headlamp. + * Registers sidebar entries, routes, detail view sections, and table column + * processors for Intel GPU device plugin visibility in Headlamp. * * Surfaces Intel GPU information in the following places: - * - Dedicated sidebar section: Overview / Device Plugins / Nodes / Pods + * - Dedicated sidebar section: Overview / Device Plugins / Nodes / Pods / Metrics * - Native Node detail page: Intel GPU section (capacity, utilization, pods) * - Native Pod detail page: GPU resource requests per container * - Native Nodes table: GPU Type and GPU Devices columns - * - App bar: health badge (hidden when plugin not installed) */ import { - registerAppBarAction, registerDetailsViewSection, registerResourceTableColumnsProcessor, registerRoute, @@ -22,9 +19,9 @@ import { } from '@kinvolk/headlamp-plugin/lib'; import React from 'react'; import { IntelGpuDataProvider } from './api/IntelGpuDataContext'; -import AppBarGpuBadge from './components/AppBarGpuBadge'; import DevicePluginsPage from './components/DevicePluginsPage'; import { buildNodeGpuColumns } from './components/integrations/NodeColumns'; +import MetricsPage from './components/MetricsPage'; import NodeDetailSection from './components/NodeDetailSection'; import NodesPage from './components/NodesPage'; import OverviewPage from './components/OverviewPage'; @@ -38,7 +35,7 @@ import PodsPage from './components/PodsPage'; registerSidebarEntry({ parent: null, name: 'intel-gpu', - label: 'Intel GPU', + label: 'intel-gpu', url: '/intel-gpu', icon: 'mdi:gpu', }); @@ -75,6 +72,14 @@ registerSidebarEntry({ icon: 'mdi:cube-outline', }); +registerSidebarEntry({ + parent: 'intel-gpu', + name: 'intel-gpu-metrics', + label: 'Metrics', + url: '/intel-gpu/metrics', + icon: 'mdi:chart-line', +}); + // --------------------------------------------------------------------------- // Routes // --------------------------------------------------------------------------- @@ -127,6 +132,18 @@ registerRoute({ ), }); +registerRoute({ + path: '/intel-gpu/metrics', + sidebar: 'intel-gpu-metrics', + name: 'intel-gpu-metrics', + exact: true, + component: () => ( + + + + ), +}); + // --------------------------------------------------------------------------- // Detail view section — Node pages // Inject Intel GPU section into native Node detail page for GPU nodes. @@ -164,12 +181,3 @@ registerResourceTableColumnsProcessor(({ id, columns }) => { return columns; }); -// --------------------------------------------------------------------------- -// App bar action — Intel GPU health badge -// --------------------------------------------------------------------------- - -registerAppBarAction(() => ( - - - -));