feat: add Metrics page, remove app bar badge, fix sidebar label
- Add src/api/metrics.ts: Prometheus text parser + fetchGpuPluginMetrics() fetching from Intel GPU device plugin pods (port 9090). Extracts engine utilization (active/total ticks → %), boost frequency (MHz), VRAM and system memory usage, cumulative energy (µJ). - Add src/components/MetricsPage.tsx: per-card metrics display with inline utilization bars, graceful fallback when enableMonitoring is not set. - Register Metrics sidebar entry (mdi:chart-line) and route /intel-gpu/metrics. - Remove registerAppBarAction and AppBarGpuBadge (colored info bubble). - Fix sidebar parent label: 'Intel GPU' → 'intel-gpu'. - Bump to v0.2.0; update artifacthub-pkg.yml with new archive URL and checksum. Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
+13
-5
@@ -1,4 +1,4 @@
|
|||||||
version: "0.1.0"
|
version: "0.2.0"
|
||||||
name: headlamp-intel-gpu-plugin
|
name: headlamp-intel-gpu-plugin
|
||||||
displayName: Intel GPU
|
displayName: Intel GPU
|
||||||
description: >-
|
description: >-
|
||||||
@@ -7,13 +7,15 @@ description: >-
|
|||||||
allocation, pods requesting Intel GPU resources, and injects Intel GPU
|
allocation, pods requesting Intel GPU resources, and injects Intel GPU
|
||||||
sections into native Node and Pod detail pages. Supports discrete (i915),
|
sections into native Node and Pod detail pages. Supports discrete (i915),
|
||||||
Xe, and integrated GPU nodes with graceful degradation when the device
|
Xe, and integrated GPU nodes with graceful degradation when the device
|
||||||
plugin operator is not installed.
|
plugin operator is not installed. Includes a Metrics page showing real-time
|
||||||
|
engine utilization, GPU frequency, VRAM usage, and energy from the device
|
||||||
|
plugin's Prometheus endpoint.
|
||||||
createdAt: "2026-02-18T00:00:00Z"
|
createdAt: "2026-02-18T00:00:00Z"
|
||||||
license: Apache-2.0
|
license: Apache-2.0
|
||||||
category: monitoring-logging
|
category: monitoring-logging
|
||||||
|
|
||||||
homeURL: https://github.com/privilegedescalation/headlamp-intel-gpu-plugin
|
homeURL: https://github.com/privilegedescalation/headlamp-intel-gpu-plugin
|
||||||
appVersion: "0.1.0"
|
appVersion: "0.2.0"
|
||||||
|
|
||||||
keywords:
|
keywords:
|
||||||
- headlamp
|
- headlamp
|
||||||
@@ -43,6 +45,12 @@ links:
|
|||||||
url: https://intel.github.io/intel-device-plugins-for-kubernetes/
|
url: https://intel.github.io/intel-device-plugins-for-kubernetes/
|
||||||
|
|
||||||
changes:
|
changes:
|
||||||
|
- kind: added
|
||||||
|
description: "Metrics page: real-time engine utilization, boost frequency, VRAM usage, and energy from Intel GPU device plugin Prometheus endpoint (port 9090)"
|
||||||
|
- kind: changed
|
||||||
|
description: "Sidebar label changed to intel-gpu"
|
||||||
|
- kind: removed
|
||||||
|
description: "Removed app bar health badge"
|
||||||
- kind: added
|
- kind: added
|
||||||
description: "Overview dashboard: plugin health, GPU node summary, allocation bar, active GPU pods"
|
description: "Overview dashboard: plugin health, GPU node summary, allocation bar, active GPU pods"
|
||||||
- kind: added
|
- kind: added
|
||||||
@@ -61,7 +69,7 @@ changes:
|
|||||||
description: "App bar health badge: hidden when no Intel GPU plugin detected"
|
description: "App bar health badge: hidden when no Intel GPU plugin detected"
|
||||||
|
|
||||||
annotations:
|
annotations:
|
||||||
headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.1.0/headlamp-intel-gpu-plugin-0.1.0.tar.gz"
|
headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.2.0/headlamp-intel-gpu-plugin-0.2.0.tar.gz"
|
||||||
headlamp/plugin/archive-checksum: "sha256:d6a50567d0f9e537f0edadac334d6a03cd182f5b64b47264577f2213fd882687"
|
headlamp/plugin/archive-checksum: "sha256:404be582bd13c167f61785028eb6eb91dd621106cbe76038f2c071a576a1a442"
|
||||||
headlamp/plugin/version-compat: ">=0.20.0"
|
headlamp/plugin/version-compat: ">=0.20.0"
|
||||||
headlamp/plugin/distro-compat: "in-cluster,web,app"
|
headlamp/plugin/distro-compat: "in-cluster,web,app"
|
||||||
|
|||||||
+1
-1
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "headlamp-intel-gpu-plugin",
|
"name": "headlamp-intel-gpu-plugin",
|
||||||
"version": "0.1.0",
|
"version": "0.2.0",
|
||||||
"description": "Headlamp plugin for Intel GPU device plugin visibility and monitoring",
|
"description": "Headlamp plugin for Intel GPU device plugin visibility and monitoring",
|
||||||
"repository": {
|
"repository": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
|
|||||||
@@ -0,0 +1,257 @@
|
|||||||
|
/**
|
||||||
|
* Prometheus text format parser for Intel GPU device plugin metrics.
|
||||||
|
*
|
||||||
|
* Fetches raw metrics from the Intel GPU device plugin pod (port 9090)
|
||||||
|
* via the Kubernetes API proxy and parses key metric families.
|
||||||
|
*
|
||||||
|
* Metrics exposed by intel-gpu-plugin when enableMonitoring: true:
|
||||||
|
* gpu_i915_engine_active_ticks — engine busy ticks (per card, engine)
|
||||||
|
* gpu_i915_engine_total_ticks — engine total ticks (for utilization %)
|
||||||
|
* gpu_i915_energy_microjoules — cumulative energy (µJ → power = delta/dt)
|
||||||
|
* gpu_i915_gt_boost_freq_mhz — current GT boost frequency (MHz)
|
||||||
|
* gpu_i915_memory_local — local (VRAM) memory usage (bytes)
|
||||||
|
* gpu_i915_memory_system — system memory usage (bytes)
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Types
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/** A single sample of a metric family: its label set and numeric value. */
export interface MetricSample {
  /** Label name → label value pairs parsed from the sample line. */
  labels: { [labelName: string]: string };
  /** Numeric value of the sample. */
  value: number;
}
|
||||||
|
|
||||||
|
/** All samples that share one metric name, plus the metadata parsed for it. */
export interface MetricFamily {
  /** Metric name exactly as it appears on the sample lines. */
  name: string;
  /** Text of the `# HELP` line for this metric, or '' when none was seen. */
  help: string;
  /** Text of the `# TYPE` line for this metric, or '' when none was seen. */
  type: string;
  /** Samples in the order they were encountered. */
  samples: MetricSample[];
}
|
||||||
|
|
||||||
|
/** Parsed exposition: metric name → the family holding that metric's samples. */
export type ParsedMetrics = Map<string, MetricFamily>;
|
||||||
|
|
||||||
|
/** Metrics extracted from one Intel GPU device plugin pod. */
export interface GpuNodeMetrics {
  /** Name of the node the plugin pod runs on. */
  nodeName: string;
  /** Name of the intel-gpu-plugin daemonset pod the metrics came from. */
  podName: string;
  /** Utilization percentage (0–100) for each (card, engine) pair. */
  engineUtilization: { card: string; engine: string; pct: number }[];
  /** GT boost frequency in MHz, one entry per card. */
  boostFreqMhz: { card: string; value: number }[];
  /** Local (VRAM) memory usage in bytes, one entry per card. */
  memoryLocalBytes: { card: string; value: number }[];
  /** System memory usage in bytes, one entry per card. */
  memorySystemBytes: { card: string; value: number }[];
  /** Cumulative energy in µJ per card (raw counter; take a delta to derive power). */
  energyMicrojoules: { card: string; value: number }[];
  /** The full parsed metric map, kept for callers that need other families. */
  raw: ParsedMetrics;
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Prometheus text format parser
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Matches one `name="value"` label pair.
 *
 * The value part accepts backslash escapes, so an escaped quote (`\"`) inside
 * a value does not terminate it early.
 */
const LABEL_PAIR_RE = /(\w+)="((?:[^"\\]|\\.)*)"/g;

/**
 * Decodes the escape sequences of the Prometheus text format inside a label
 * value: `\\` → backslash, `\"` → double quote, `\n` → line feed.
 * Any other backslash sequence is left untouched.
 */
function unescapeLabelValue(raw: string): string {
  if (!raw.includes('\\')) return raw;
  return raw.replace(/\\(.)/g, (whole: string, ch: string) => {
    if (ch === 'n') return '\n';
    if (ch === '"' || ch === '\\') return ch;
    return whole;
  });
}

/**
 * Parses the text between the braces of a sample line into a label map.
 *
 * @param labelStr Label section without the surrounding braces,
 *                 e.g. `card="card0",engine="render/0"`.
 * @returns Label name → decoded label value. Text that does not form a
 *          `name="value"` pair is ignored; when a name repeats, the last
 *          occurrence wins.
 */
function parseLabels(labelStr: string): Record<string, string> {
  const labels: Record<string, string> = {};
  // matchAll works on a private copy of the regex, so the shared /g pattern
  // never carries lastIndex state from one call to the next.
  for (const match of labelStr.matchAll(LABEL_PAIR_RE)) {
    const key = match[1];
    const rawValue = match[2];
    if (key === undefined || rawValue === undefined) continue;
    labels[key] = unescapeLabelValue(rawValue);
  }
  return labels;
}
|
||||||
|
|
||||||
|
/**
 * Parses a Prometheus text-format exposition into metric families.
 *
 * Handled line kinds:
 *  - `# HELP <name> <text>` and `# TYPE <name> <type>`: remembered per metric
 *    name and attached to the family when its first sample is seen.
 *  - any other `#` line: ignored.
 *  - sample lines: `<name>{<labels>} <value> [timestamp]` or
 *    `<name> <value> [timestamp]`. The optional timestamp is discarded.
 *
 * Samples whose value does not parse to a finite number (this includes NaN
 * and ±Inf) are skipped.
 *
 * @param text Raw response body of a `/metrics` endpoint.
 * @returns Map from metric name to its family, in first-seen order.
 */
export function parsePrometheusText(text: string): ParsedMetrics {
  const families = new Map<string, MetricFamily>();
  // Metadata is stored per metric name so that a family without its own
  // TYPE/HELP line never picks up the metadata of the family before it.
  const metadata = new Map<string, { help: string; type: string }>();
  const directiveRe = /^#\s+(HELP|TYPE)\s+(\S+)(?:\s+(.*))?$/;

  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim();
    if (!line) continue;

    if (line.startsWith('#')) {
      const directive = directiveRe.exec(line);
      const directiveName = directive?.[2];
      if (directive && directiveName !== undefined) {
        const entry = metadata.get(directiveName) ?? { help: '', type: '' };
        const payload = directive[3] ?? '';
        if (directive[1] === 'HELP') {
          entry.help = payload;
        } else {
          entry.type = payload;
        }
        metadata.set(directiveName, entry);
      }
      continue;
    }

    const openBrace = line.indexOf('{');
    const closeBrace = line.lastIndexOf('}');

    let metricName: string;
    let labels: Record<string, string>;
    let valuePart: string;

    if (openBrace >= 0 && closeBrace > openBrace) {
      metricName = line.slice(0, openBrace).trim();
      labels = parseLabels(line.slice(openBrace + 1, closeBrace));
      valuePart = line.slice(closeBrace + 1).trim();
    } else {
      // The name ends at the FIRST whitespace. Splitting at the last one
      // would swallow the value into the name whenever a timestamp follows.
      const separatorIdx = line.search(/\s/);
      if (separatorIdx < 0) continue;
      metricName = line.slice(0, separatorIdx);
      labels = {};
      valuePart = line.slice(separatorIdx + 1).trim();
    }

    // First token is the value; a second token, if present, is the timestamp.
    const valueStr = valuePart.split(/\s+/)[0] ?? '';
    const value = parseFloat(valueStr);
    if (!Number.isFinite(value)) continue;

    let family = families.get(metricName);
    if (!family) {
      const info = metadata.get(metricName);
      family = {
        name: metricName,
        help: info?.help ?? '',
        type: info?.type ?? '',
        samples: [],
      };
      families.set(metricName, family);
    }

    family.samples.push({ labels, value });
  }

  return families;
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Extract Intel GPU metrics from the parsed map
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Looks up the samples recorded for one metric name.
 *
 * @returns The family's sample array, or an empty array when the family is
 *          not present in the map.
 */
function samplesFor(families: ParsedMetrics, name: string): MetricSample[] {
  const family = families.get(name);
  if (family == null) {
    return [];
  }
  return family.samples ?? [];
}
|
||||||
|
|
||||||
|
/**
 * Resolves the card identifier of a sample: the `card` label, else the `gpu`
 * label, else the fixed fallback 'gpu0'.
 */
function cardLabelOf(labels: Record<string, string>): string {
  return labels['card'] ?? labels['gpu'] ?? 'gpu0';
}

/** Resolves the engine identifier of a sample, falling back to 'render/0'. */
function engineLabelOf(labels: Record<string, string>): string {
  return labels['engine'] ?? 'render/0';
}

/** Maps every sample of one metric family to a `{ card, value }` entry. */
function perCardValues(
  families: ParsedMetrics,
  metricName: string
): Array<{ card: string; value: number }> {
  return samplesFor(families, metricName).map(sample => ({
    card: cardLabelOf(sample.labels),
    value: sample.value,
  }));
}

/**
 * Builds the per-node metric summary from a parsed exposition.
 *
 * Engine utilization is `active_ticks / total_ticks` per (card, engine),
 * rounded to a whole percent and clamped to 0–100. An engine without a
 * matching total sample, or with a total of 0, is reported as 0%.
 * NOTE(review): the ratio uses the raw tick values as exposed; if these are
 * cumulative counters the result is an average since the counters started
 * rather than an instantaneous reading — confirm against the plugin.
 *
 * @param families Parsed metric families.
 * @param nodeName Node name, copied into the result.
 * @param podName  Plugin pod name, copied into the result.
 */
export function extractGpuNodeMetrics(
  families: ParsedMetrics,
  nodeName: string,
  podName: string
): GpuNodeMetrics {
  // The same label defaults are applied to active and total samples, so
  // samples that lack a card/engine label still pair up with each other.
  const engineKey = (labels: Record<string, string>): string =>
    `${cardLabelOf(labels)}\u0000${engineLabelOf(labels)}`;

  // Index totals once instead of scanning the list for every active sample.
  const totalTicks = new Map<string, number>();
  for (const sample of samplesFor(families, 'gpu_i915_engine_total_ticks')) {
    const key = engineKey(sample.labels);
    // First sample wins when a (card, engine) pair is duplicated.
    if (!totalTicks.has(key)) totalTicks.set(key, sample.value);
  }

  const engineUtilization: GpuNodeMetrics['engineUtilization'] = samplesFor(
    families,
    'gpu_i915_engine_active_ticks'
  ).map(active => {
    const total = totalTicks.get(engineKey(active.labels)) ?? 0;
    const pct =
      total > 0 ? Math.max(0, Math.min(100, Math.round((active.value / total) * 100))) : 0;
    return {
      card: cardLabelOf(active.labels),
      engine: engineLabelOf(active.labels),
      pct,
    };
  });

  return {
    nodeName,
    podName,
    engineUtilization,
    boostFreqMhz: perCardValues(families, 'gpu_i915_gt_boost_freq_mhz'),
    memoryLocalBytes: perCardValues(families, 'gpu_i915_memory_local'),
    memorySystemBytes: perCardValues(families, 'gpu_i915_memory_system'),
    energyMicrojoules: perCardValues(families, 'gpu_i915_energy_microjoules'),
    raw: families,
  };
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Fetch metrics from an Intel GPU device plugin pod
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Requests the Prometheus exposition of one Intel GPU device plugin pod and
 * converts it into a GpuNodeMetrics record.
 *
 * The request is sent to the pod proxy path
 *   /api/v1/namespaces/{namespace}/pods/{podName}:9090/proxy/metrics
 *
 * @param podName   Name of the plugin pod to query.
 * @param namespace Namespace of that pod.
 * @param nodeName  Node name to record in the result.
 * @returns The extracted metrics, or null when the request throws or the
 *          response body is not a string (for example when the pod does not
 *          serve metrics).
 */
export async function fetchGpuPluginMetrics(
  podName: string,
  namespace: string,
  nodeName: string
): Promise<GpuNodeMetrics | null> {
  const metricsUrl = `/api/v1/namespaces/${namespace}/pods/${podName}:9090/proxy/metrics`;
  const requestOptions = { method: 'GET', isJSON: false };

  try {
    const body: unknown = await ApiProxy.request(metricsUrl, requestOptions);
    return typeof body === 'string'
      ? extractGpuNodeMetrics(parsePrometheusText(body), nodeName, podName)
      : null;
  } catch {
    // Best effort: any failure is reported to the caller as "no metrics".
    return null;
  }
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Formatting helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Formats a byte count with decimal (power-of-1000) units and one decimal
 * place, e.g. `1500` → "1.5 KB".
 *
 * Values below 1000, negative values and non-finite values are printed as-is
 * with a " B" suffix. When rounding to one decimal would produce "1000.0",
 * the next larger unit is used instead (999 999 → "1.0 MB", not "1000.0 KB").
 *
 * @param bytes Size in bytes.
 * @returns Human-readable size string.
 */
export function formatBytes(bytes: number): string {
  if (!Number.isFinite(bytes) || bytes < 1e3) return `${bytes} B`;

  const units = ['KB', 'MB', 'GB', 'TB'];
  let unitIdx = 0;
  let text = (bytes / 1e3).toFixed(1);
  // Step up while the *rounded* figure reaches 1000, so the unit is chosen
  // from what will actually be displayed.
  while (unitIdx < units.length - 1 && Number(text) >= 1000) {
    unitIdx += 1;
    text = (bytes / 1000 ** (unitIdx + 1)).toFixed(1);
  }
  return `${text} ${units[unitIdx]}`;
}
|
||||||
|
|
||||||
|
/**
 * Formats a frequency given in MHz as a whole number followed by " MHz".
 *
 * @param mhz Frequency in MHz; rounded to the nearest integer.
 */
export function formatFreq(mhz: number): string {
  const rounded = Math.round(mhz);
  return String(rounded).concat(' MHz');
}
|
||||||
@@ -0,0 +1,311 @@
|
|||||||
|
/**
|
||||||
|
* MetricsPage — real-time Intel GPU metrics from the device plugin pods.
|
||||||
|
*
|
||||||
|
* Fetches Prometheus metrics from each Intel GPU device plugin pod (port 9090)
|
||||||
|
* and displays per-card engine utilization, GPU frequency, memory usage,
|
||||||
|
* and cumulative energy. Requires `enableMonitoring: true` in GpuDevicePlugin.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import {
|
||||||
|
Loader,
|
||||||
|
NameValueTable,
|
||||||
|
SectionBox,
|
||||||
|
SectionHeader,
|
||||||
|
StatusLabel,
|
||||||
|
} from '@kinvolk/headlamp-plugin/lib/CommonComponents';
|
||||||
|
import React, { useCallback, useEffect, useState } from 'react';
|
||||||
|
import { useIntelGpuContext } from '../api/IntelGpuDataContext';
|
||||||
|
import {
|
||||||
|
fetchGpuPluginMetrics,
|
||||||
|
formatBytes,
|
||||||
|
formatFreq,
|
||||||
|
GpuNodeMetrics,
|
||||||
|
} from '../api/metrics';
|
||||||
|
import { IntelGpuPod } from '../api/k8s';
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Utilization bar
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Inline horizontal bar with a numeric percentage next to it.
 *
 * The fill is red from 90%, orange from 70%, and blue below that.
 *
 * @param pct Percentage to display; used directly as the fill width.
 */
function UtilizationBar({ pct }: { pct: number }) {
  let fillColor = '#0071c5';
  if (pct >= 90) {
    fillColor = '#d32f2f';
  } else if (pct >= 70) {
    fillColor = '#f57c00';
  }

  const rowStyle: React.CSSProperties = {
    display: 'flex',
    alignItems: 'center',
    gap: '8px',
  };
  const trackStyle: React.CSSProperties = {
    width: '100px',
    height: '8px',
    backgroundColor: '#e0e0e0',
    borderRadius: '4px',
    overflow: 'hidden',
    flexShrink: 0,
  };
  const fillStyle: React.CSSProperties = {
    width: `${pct}%`,
    height: '100%',
    backgroundColor: fillColor,
    borderRadius: '4px',
    transition: 'width 0.3s ease',
  };
  const labelStyle: React.CSSProperties = {
    fontSize: '12px',
    fontVariantNumeric: 'tabular-nums',
  };

  return (
    <div style={rowStyle}>
      <div style={trackStyle}>
        <div style={fillStyle} />
      </div>
      <span style={labelStyle}>{pct}%</span>
    </div>
  );
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Per-node metrics card
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Renders the metrics of one plugin pod as one section per GPU card.
 *
 * A card is shown when ANY metric mentions it (engine utilization, boost
 * frequency, VRAM, system memory or energy), so a card that only reports
 * system memory or energy is still listed. When no metric mentions any card,
 * a single "No Metric Data" section is rendered instead.
 *
 * @param metrics Metrics extracted for one plugin pod.
 */
function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) {
  const {
    nodeName,
    podName,
    engineUtilization,
    boostFreqMhz,
    memoryLocalBytes,
    memorySystemBytes,
    energyMicrojoules,
  } = metrics;

  // Group engine entries by card, preserving their original order.
  const byCard = new Map<string, typeof engineUtilization>();
  for (const e of engineUtilization) {
    const existing = byCard.get(e.card);
    if (existing) {
      existing.push(e);
    } else {
      byCard.set(e.card, [e]);
    }
  }

  // card → value lookup; when a card appears twice the later entry wins.
  const indexByCard = (entries: Array<{ card: string; value: number }>) => {
    const index = new Map<string, number>();
    for (const { card, value } of entries) index.set(card, value);
    return index;
  };

  const freqByCard = indexByCard(boostFreqMhz);
  const memLocalByCard = indexByCard(memoryLocalBytes);
  const memSysByCard = indexByCard(memorySystemBytes);
  const energyByCard = indexByCard(energyMicrojoules);

  // Union over every metric source, not only engines / frequency / VRAM.
  const cards = Array.from(
    new Set([
      ...byCard.keys(),
      ...freqByCard.keys(),
      ...memLocalByCard.keys(),
      ...memSysByCard.keys(),
      ...energyByCard.keys(),
    ])
  ).sort();

  if (cards.length === 0) {
    return (
      <SectionBox title={`${nodeName} — No Metric Data`}>
        <NameValueTable
          rows={[
            {
              name: 'Pod',
              value: podName,
            },
            {
              name: 'Note',
              value: 'No GPU metrics found. Ensure enableMonitoring: true is set in GpuDevicePlugin.',
            },
          ]}
        />
      </SectionBox>
    );
  }

  return (
    <>
      {cards.map(card => {
        const engines = byCard.get(card) ?? [];
        const freq = freqByCard.get(card);
        const memLocal = memLocalByCard.get(card);
        const memSys = memSysByCard.get(card);
        const energy = energyByCard.get(card);

        const rows: Array<{ name: string; value: React.ReactNode }> = [
          { name: 'Node', value: nodeName },
          { name: 'Plugin Pod', value: podName },
          { name: 'GPU Card', value: card },
        ];

        if (freq !== undefined) {
          rows.push({ name: 'Boost Frequency', value: formatFreq(freq) });
        }

        if (memLocal !== undefined) {
          rows.push({ name: 'VRAM (local)', value: formatBytes(memLocal) });
        }
        // A system-memory reading of 0 is not shown.
        if (memSys !== undefined && memSys > 0) {
          rows.push({ name: 'System Memory', value: formatBytes(memSys) });
        }

        if (energy !== undefined) {
          rows.push({
            name: 'Energy (cumulative)',
            // The counter is in µJ; divide by 1e6 to display joules.
            value: `${(energy / 1e6).toFixed(2)} J`,
          });
        }

        // One row per engine with an inline utilization bar.
        for (const e of engines) {
          rows.push({
            name: `Engine: ${e.engine}`,
            value: <UtilizationBar pct={e.pct} />,
          });
        }

        return (
          <SectionBox key={`${nodeName}-${card}`} title={`${nodeName} — ${card}`}>
            <NameValueTable rows={rows} />
          </SectionBox>
        );
      })}
    </>
  );
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Main page
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
 * Metrics page: fetches metrics from every Intel GPU plugin pod supplied by
 * the data context and renders one NodeMetricsCard (or an error section) per
 * pod, with a manual Refresh button.
 *
 * Display states:
 *  - context still loading            → full-page loader
 *  - plugin not installed             → "Intel GPU Plugin Not Detected"
 *  - installed but no pods            → "No Plugin Pods Found"
 *  - pods known, no results yet       → "Fetching GPU metrics..." loader
 *  - every pod failed                 → "Metrics Unavailable" summary
 *  - per pod                          → metrics card or per-pod error section
 */
export default function MetricsPage() {
  const { pluginPods, pluginInstalled, loading: ctxLoading } = useIntelGpuContext();

  const [metricsMap, setMetricsMap] = useState<Map<string, GpuNodeMetrics | 'error'>>(new Map());
  const [fetching, setFetching] = useState(false);

  // Id of the most recent fetch. Results of an older fetch, or of one that
  // finishes after unmount, are discarded instead of overwriting state.
  const latestRequest = React.useRef(0);

  useEffect(() => {
    return () => {
      // Invalidate whatever is still in flight when the page unmounts.
      latestRequest.current += 1;
    };
  }, []);

  const fetchAll = useCallback(async (pods: IntelGpuPod[]) => {
    latestRequest.current += 1;
    const requestId = latestRequest.current;

    if (pods.length === 0) {
      // Drop results that belong to pods which no longer exist. Returning the
      // previous map when it is already empty avoids a needless re-render.
      setMetricsMap(prev => (prev.size === 0 ? prev : new Map()));
      setFetching(false);
      return;
    }

    setFetching(true);
    try {
      const results = await Promise.all(
        pods.map(async pod => {
          const name = pod.metadata.name;
          const namespace = pod.metadata.namespace ?? 'kube-system';
          const nodeName = pod.spec?.nodeName ?? name;
          const result = await fetchGpuPluginMetrics(name, namespace, nodeName);
          return { name, result };
        })
      );

      if (requestId !== latestRequest.current) return;

      const map = new Map<string, GpuNodeMetrics | 'error'>();
      for (const { name, result } of results) {
        map.set(name, result ?? 'error');
      }
      setMetricsMap(map);
    } finally {
      // Only the newest request may clear the flag; an older one finishing
      // late must not hide the "Refreshing…" state of its successor.
      if (requestId === latestRequest.current) {
        setFetching(false);
      }
    }
  }, []);

  useEffect(() => {
    if (!ctxLoading) {
      void fetchAll(pluginPods);
    }
  }, [ctxLoading, pluginPods, fetchAll]);

  if (ctxLoading) {
    return <Loader title="Loading Intel GPU data..." />;
  }

  const hasPods = pluginPods.length > 0;
  const entries = Array.from(metricsMap.entries());
  // No results yet: the first fetch is running or about to start.
  const awaitingFirstResult = hasPods && entries.length === 0;
  const allFailed =
    hasPods && entries.length > 0 && entries.every(([, metrics]) => metrics === 'error');

  return (
    <>
      <div
        style={{
          display: 'flex',
          justifyContent: 'space-between',
          alignItems: 'center',
          marginBottom: '20px',
        }}
      >
        <SectionHeader title="Intel GPU — Metrics" />
        <button
          type="button"
          onClick={() => void fetchAll(pluginPods)}
          disabled={fetching || !hasPods}
          aria-label="Refresh metrics"
          style={{
            padding: '6px 16px',
            backgroundColor: 'transparent',
            color: 'var(--mui-palette-primary-main, #0071c5)',
            border: '1px solid var(--mui-palette-primary-main, #0071c5)',
            borderRadius: '4px',
            cursor: 'pointer',
            fontSize: '13px',
            fontWeight: 500,
          }}
        >
          {fetching ? 'Refreshing…' : 'Refresh'}
        </button>
      </div>

      {!pluginInstalled && (
        <SectionBox title="Intel GPU Plugin Not Detected">
          <NameValueTable
            rows={[
              {
                name: 'Status',
                value: (
                  <StatusLabel status="warning">No Intel GPU device plugin pods found</StatusLabel>
                ),
              },
              {
                name: 'Note',
                value:
                  'Install the Intel GPU device plugin and set enableMonitoring: true to expose Prometheus metrics.',
              },
            ]}
          />
        </SectionBox>
      )}

      {pluginInstalled && !hasPods && (
        <SectionBox title="No Plugin Pods Found">
          <NameValueTable
            rows={[
              {
                name: 'Status',
                value: (
                  <StatusLabel status="warning">Plugin detected via CRD but no pods found</StatusLabel>
                ),
              },
            ]}
          />
        </SectionBox>
      )}

      {awaitingFirstResult && <Loader title="Fetching GPU metrics..." />}

      {allFailed && (
        <SectionBox title="Metrics Unavailable">
          <NameValueTable
            rows={[
              {
                name: 'Status',
                value: (
                  <StatusLabel status="warning">
                    Could not fetch metrics from any plugin pod
                  </StatusLabel>
                ),
              },
              {
                name: 'Requirements',
                value:
                  'Set enableMonitoring: true in GpuDevicePlugin spec and ensure port 9090 is accessible via kube-apiserver proxy.',
              },
              {
                name: 'Plugin Pods Found',
                value: pluginPods.map(p => p.metadata.name).join(', '),
              },
            ]}
          />
        </SectionBox>
      )}

      {entries.map(([podName, metrics]) => {
        if (metrics === 'error') {
          return (
            <SectionBox key={podName} title={`${podName} — Metrics Unavailable`}>
              <NameValueTable
                rows={[
                  {
                    name: 'Status',
                    value: (
                      <StatusLabel status="error">Failed to fetch metrics from pod</StatusLabel>
                    ),
                  },
                  {
                    name: 'Hint',
                    value:
                      'Ensure enableMonitoring: true is set in the GpuDevicePlugin CR and the pod is running.',
                  },
                ]}
              />
            </SectionBox>
          );
        }
        return <NodeMetricsCard key={podName} metrics={metrics} />;
      })}
    </>
  );
}
|
||||||
+25
-17
@@ -1,20 +1,17 @@
|
|||||||
/**
|
/**
|
||||||
* headlamp-intel-gpu-plugin — entry point.
|
* headlamp-intel-gpu-plugin — entry point.
|
||||||
*
|
*
|
||||||
* Registers sidebar entries, routes, detail view sections, table column
|
* Registers sidebar entries, routes, detail view sections, and table column
|
||||||
* processors, and app bar action for Intel GPU device plugin visibility
|
* processors for Intel GPU device plugin visibility in Headlamp.
|
||||||
* in Headlamp.
|
|
||||||
*
|
*
|
||||||
* Surfaces Intel GPU information in the following places:
|
* Surfaces Intel GPU information in the following places:
|
||||||
* - Dedicated sidebar section: Overview / Device Plugins / Nodes / Pods
|
* - Dedicated sidebar section: Overview / Device Plugins / Nodes / Pods / Metrics
|
||||||
* - Native Node detail page: Intel GPU section (capacity, utilization, pods)
|
* - Native Node detail page: Intel GPU section (capacity, utilization, pods)
|
||||||
* - Native Pod detail page: GPU resource requests per container
|
* - Native Pod detail page: GPU resource requests per container
|
||||||
* - Native Nodes table: GPU Type and GPU Devices columns
|
* - Native Nodes table: GPU Type and GPU Devices columns
|
||||||
* - App bar: health badge (hidden when plugin not installed)
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import {
|
import {
|
||||||
registerAppBarAction,
|
|
||||||
registerDetailsViewSection,
|
registerDetailsViewSection,
|
||||||
registerResourceTableColumnsProcessor,
|
registerResourceTableColumnsProcessor,
|
||||||
registerRoute,
|
registerRoute,
|
||||||
@@ -22,9 +19,9 @@ import {
|
|||||||
} from '@kinvolk/headlamp-plugin/lib';
|
} from '@kinvolk/headlamp-plugin/lib';
|
||||||
import React from 'react';
|
import React from 'react';
|
||||||
import { IntelGpuDataProvider } from './api/IntelGpuDataContext';
|
import { IntelGpuDataProvider } from './api/IntelGpuDataContext';
|
||||||
import AppBarGpuBadge from './components/AppBarGpuBadge';
|
|
||||||
import DevicePluginsPage from './components/DevicePluginsPage';
|
import DevicePluginsPage from './components/DevicePluginsPage';
|
||||||
import { buildNodeGpuColumns } from './components/integrations/NodeColumns';
|
import { buildNodeGpuColumns } from './components/integrations/NodeColumns';
|
||||||
|
import MetricsPage from './components/MetricsPage';
|
||||||
import NodeDetailSection from './components/NodeDetailSection';
|
import NodeDetailSection from './components/NodeDetailSection';
|
||||||
import NodesPage from './components/NodesPage';
|
import NodesPage from './components/NodesPage';
|
||||||
import OverviewPage from './components/OverviewPage';
|
import OverviewPage from './components/OverviewPage';
|
||||||
@@ -38,7 +35,7 @@ import PodsPage from './components/PodsPage';
|
|||||||
registerSidebarEntry({
|
registerSidebarEntry({
|
||||||
parent: null,
|
parent: null,
|
||||||
name: 'intel-gpu',
|
name: 'intel-gpu',
|
||||||
label: 'Intel GPU',
|
label: 'intel-gpu',
|
||||||
url: '/intel-gpu',
|
url: '/intel-gpu',
|
||||||
icon: 'mdi:gpu',
|
icon: 'mdi:gpu',
|
||||||
});
|
});
|
||||||
@@ -75,6 +72,14 @@ registerSidebarEntry({
|
|||||||
icon: 'mdi:cube-outline',
|
icon: 'mdi:cube-outline',
|
||||||
});
|
});
|
||||||
|
|
||||||
|
registerSidebarEntry({
|
||||||
|
parent: 'intel-gpu',
|
||||||
|
name: 'intel-gpu-metrics',
|
||||||
|
label: 'Metrics',
|
||||||
|
url: '/intel-gpu/metrics',
|
||||||
|
icon: 'mdi:chart-line',
|
||||||
|
});
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Routes
|
// Routes
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
@@ -127,6 +132,18 @@ registerRoute({
|
|||||||
),
|
),
|
||||||
});
|
});
|
||||||
|
|
||||||
|
registerRoute({
|
||||||
|
path: '/intel-gpu/metrics',
|
||||||
|
sidebar: 'intel-gpu-metrics',
|
||||||
|
name: 'intel-gpu-metrics',
|
||||||
|
exact: true,
|
||||||
|
component: () => (
|
||||||
|
<IntelGpuDataProvider>
|
||||||
|
<MetricsPage />
|
||||||
|
</IntelGpuDataProvider>
|
||||||
|
),
|
||||||
|
});
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Detail view section — Node pages
|
// Detail view section — Node pages
|
||||||
// Inject Intel GPU section into native Node detail page for GPU nodes.
|
// Inject Intel GPU section into native Node detail page for GPU nodes.
|
||||||
@@ -164,12 +181,3 @@ registerResourceTableColumnsProcessor(({ id, columns }) => {
|
|||||||
return columns;
|
return columns;
|
||||||
});
|
});
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// App bar action — Intel GPU health badge
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
registerAppBarAction(() => (
|
|
||||||
<IntelGpuDataProvider>
|
|
||||||
<AppBarGpuBadge />
|
|
||||||
</IntelGpuDataProvider>
|
|
||||||
));
|
|
||||||
|
|||||||
Reference in New Issue
Block a user