Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 2eb19f8401 | |||
| cc0ad5b286 | |||
| 4b4e565a1a |
+7
-5
@@ -1,4 +1,4 @@
|
|||||||
version: "0.2.0"
|
version: "0.3.0"
|
||||||
name: headlamp-intel-gpu-plugin
|
name: headlamp-intel-gpu-plugin
|
||||||
displayName: Intel GPU
|
displayName: Intel GPU
|
||||||
description: >-
|
description: >-
|
||||||
@@ -15,7 +15,7 @@ license: Apache-2.0
|
|||||||
category: monitoring-logging
|
category: monitoring-logging
|
||||||
|
|
||||||
homeURL: https://github.com/privilegedescalation/headlamp-intel-gpu-plugin
|
homeURL: https://github.com/privilegedescalation/headlamp-intel-gpu-plugin
|
||||||
appVersion: "0.2.0"
|
appVersion: "0.3.0"
|
||||||
|
|
||||||
keywords:
|
keywords:
|
||||||
- headlamp
|
- headlamp
|
||||||
@@ -46,7 +46,9 @@ links:
|
|||||||
|
|
||||||
changes:
|
changes:
|
||||||
- kind: added
|
- kind: added
|
||||||
description: "Metrics page: real-time engine utilization, boost frequency, VRAM usage, and energy from Intel GPU device plugin Prometheus endpoint (port 9090)"
|
description: "Metrics page: document which metrics require what infrastructure (power via hwmon works out of the box; frequency and utilization need custom exporters)"
|
||||||
|
- kind: added
|
||||||
|
description: "Metrics page: real-time GPU power draw (W) and TDP via node-exporter i915 hwmon metrics in kube-prometheus-stack"
|
||||||
- kind: changed
|
- kind: changed
|
||||||
description: "Sidebar label changed to intel-gpu"
|
description: "Sidebar label changed to intel-gpu"
|
||||||
- kind: removed
|
- kind: removed
|
||||||
@@ -69,7 +71,7 @@ changes:
|
|||||||
description: "App bar health badge: hidden when no Intel GPU plugin detected"
|
description: "App bar health badge: hidden when no Intel GPU plugin detected"
|
||||||
|
|
||||||
annotations:
|
annotations:
|
||||||
headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.2.0/headlamp-intel-gpu-plugin-0.2.0.tar.gz"
|
headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.3.0/headlamp-intel-gpu-plugin-0.3.0.tar.gz"
|
||||||
headlamp/plugin/archive-checksum: "sha256:404be582bd13c167f61785028eb6eb91dd621106cbe76038f2c071a576a1a442"
|
headlamp/plugin/archive-checksum: "sha256:fdc53099ee3123680f24fe4a319b753ca3d030aac31abd4e3f383221085c9c2d"
|
||||||
headlamp/plugin/version-compat: ">=0.20.0"
|
headlamp/plugin/version-compat: ">=0.20.0"
|
||||||
headlamp/plugin/distro-compat: "in-cluster,web,app"
|
headlamp/plugin/distro-compat: "in-cluster,web,app"
|
||||||
|
|||||||
+1
-1
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "headlamp-intel-gpu-plugin",
|
"name": "headlamp-intel-gpu-plugin",
|
||||||
"version": "0.2.0",
|
"version": "0.3.0",
|
||||||
"description": "Headlamp plugin for Intel GPU device plugin visibility and monitoring",
|
"description": "Headlamp plugin for Intel GPU device plugin visibility and monitoring",
|
||||||
"repository": {
|
"repository": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
|
|||||||
+132
-220
@@ -1,16 +1,15 @@
|
|||||||
/**
|
/**
|
||||||
* Prometheus text format parser for Intel GPU device plugin metrics.
|
* Intel GPU metrics via Prometheus (kube-prometheus-stack).
|
||||||
*
|
*
|
||||||
* Fetches raw metrics from the Intel GPU device plugin pod (port 9090)
|
* The Intel i915/Xe GPU driver exposes hwmon sensors that node-exporter
|
||||||
* via the Kubernetes API proxy and parses key metric families.
|
* scrapes automatically. We query Prometheus for the following (the hwmon queries match chip_name="i915" only):
|
||||||
|
* - node_hwmon_energy_joule_total (chip_name="i915") → rate = power in W
|
||||||
|
* - node_hwmon_power_max_watt (same chip) → TDP
|
||||||
|
* - node_hwmon_chip_names (chip_name="i915") → identify GPU chips
|
||||||
|
* - node_uname_info → instance → nodename
|
||||||
*
|
*
|
||||||
* Metrics exposed by intel-gpu-plugin when enableMonitoring: true:
|
* Queries go through the Kubernetes API proxy to the in-cluster Prometheus
|
||||||
* gpu_i915_engine_active_ticks — engine busy ticks (per card, engine)
|
* service: /api/v1/namespaces/monitoring/services/{svc}:{port}/proxy/...
|
||||||
* gpu_i915_engine_total_ticks — engine total ticks (for utilization %)
|
|
||||||
* gpu_i915_energy_microjoules — cumulative energy (µJ → power = delta/dt)
|
|
||||||
* gpu_i915_gt_boost_freq_mhz — current GT boost frequency (MHz)
|
|
||||||
* gpu_i915_memory_local — local (VRAM) memory usage (bytes)
|
|
||||||
* gpu_i915_memory_system — system memory usage (bytes)
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
|
import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
|
||||||
@@ -19,239 +18,152 @@ import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
|
|||||||
// Types
|
// Types
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
export interface MetricSample {
|
export interface GpuChipMetrics {
|
||||||
labels: Record<string, string>;
|
/** Kubernetes node name (e.g. "buttons") */
|
||||||
value: number;
|
|
||||||
}
|
|
||||||
|
|
||||||
export interface MetricFamily {
|
|
||||||
name: string;
|
|
||||||
help: string;
|
|
||||||
type: string;
|
|
||||||
samples: MetricSample[];
|
|
||||||
}
|
|
||||||
|
|
||||||
export type ParsedMetrics = Map<string, MetricFamily>;
|
|
||||||
|
|
||||||
export interface GpuNodeMetrics {
|
|
||||||
/** Node name this metric set was fetched from (via plugin pod) */
|
|
||||||
nodeName: string;
|
nodeName: string;
|
||||||
/** Pod name of the intel-gpu-plugin daemonset pod */
|
/** PCI chip address (e.g. "0000:09:01_0_0000:0a:00_0") */
|
||||||
podName: string;
|
chip: string;
|
||||||
/** Engine utilization per (card, engine): 0–100 */
|
/** node-exporter instance (IP:port) */
|
||||||
engineUtilization: Array<{ card: string; engine: string; pct: number }>;
|
instance: string;
|
||||||
/** Current GT boost frequency in MHz per card */
|
/** Power draw in watts, computed as the 5-minute rate of the cumulative energy counter (null if unavailable) */
|
||||||
boostFreqMhz: Array<{ card: string; value: number }>;
|
powerWatts: number | null;
|
||||||
/** Local VRAM usage in bytes per card */
|
/** Maximum / TDP power in watts (null if unavailable) */
|
||||||
memoryLocalBytes: Array<{ card: string; value: number }>;
|
powerMaxWatts: number | null;
|
||||||
/** System memory usage in bytes per card */
|
}
|
||||||
memorySystemBytes: Array<{ card: string; value: number }>;
|
|
||||||
/** Cumulative energy in µJ per card (raw counter; compute delta for power) */
|
export interface GpuMetrics {
|
||||||
energyMicrojoules: Array<{ card: string; value: number }>;
|
chips: GpuChipMetrics[];
|
||||||
/** Raw parsed metric families for advanced use */
|
/** ISO timestamp of when metrics were fetched */
|
||||||
raw: ParsedMetrics;
|
fetchedAt: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Prometheus text format parser
|
// Prometheus query helper
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
const LABEL_PAIR_RE = /(\w+)="([^"]*)"/g;
|
interface PrometheusResult {
|
||||||
|
metric: Record<string, string>;
|
||||||
function parseLabels(labelStr: string): Record<string, string> {
|
value: [number, string];
|
||||||
const labels: Record<string, string> = {};
|
|
||||||
let match: RegExpExecArray | null;
|
|
||||||
const re = new RegExp(LABEL_PAIR_RE.source, 'g');
|
|
||||||
while ((match = re.exec(labelStr)) !== null) {
|
|
||||||
const key = match[1];
|
|
||||||
const val = match[2];
|
|
||||||
if (key && val !== undefined) {
|
|
||||||
labels[key] = val;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return labels;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export function parsePrometheusText(text: string): ParsedMetrics {
|
interface PrometheusResponse {
|
||||||
const families = new Map<string, MetricFamily>();
|
status: string;
|
||||||
let currentName = '';
|
data: {
|
||||||
let currentHelp = '';
|
resultType: string;
|
||||||
let currentType = '';
|
result: PrometheusResult[];
|
||||||
|
|
||||||
for (const rawLine of text.split('\n')) {
|
|
||||||
const line = rawLine.trim();
|
|
||||||
if (!line) continue;
|
|
||||||
|
|
||||||
if (line.startsWith('# HELP ')) {
|
|
||||||
const rest = line.slice(7);
|
|
||||||
const spaceIdx = rest.indexOf(' ');
|
|
||||||
currentName = spaceIdx >= 0 ? rest.slice(0, spaceIdx) : rest;
|
|
||||||
currentHelp = spaceIdx >= 0 ? rest.slice(spaceIdx + 1) : '';
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (line.startsWith('# TYPE ')) {
|
|
||||||
const rest = line.slice(7);
|
|
||||||
const spaceIdx = rest.indexOf(' ');
|
|
||||||
currentType = spaceIdx >= 0 ? rest.slice(spaceIdx + 1) : '';
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (line.startsWith('#')) continue;
|
|
||||||
|
|
||||||
const openBrace = line.indexOf('{');
|
|
||||||
const closeBrace = line.lastIndexOf('}');
|
|
||||||
|
|
||||||
let metricName: string;
|
|
||||||
let labels: Record<string, string>;
|
|
||||||
let valuePart: string;
|
|
||||||
|
|
||||||
if (openBrace >= 0 && closeBrace > openBrace) {
|
|
||||||
metricName = line.slice(0, openBrace);
|
|
||||||
labels = parseLabels(line.slice(openBrace + 1, closeBrace));
|
|
||||||
valuePart = line.slice(closeBrace + 1).trim();
|
|
||||||
} else {
|
|
||||||
const spaceIdx = line.lastIndexOf(' ');
|
|
||||||
if (spaceIdx < 0) continue;
|
|
||||||
metricName = line.slice(0, spaceIdx);
|
|
||||||
labels = {};
|
|
||||||
valuePart = line.slice(spaceIdx + 1).trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
const valueTokens = valuePart.split(' ');
|
|
||||||
const valueStr = valueTokens[0] ?? '';
|
|
||||||
const value = parseFloat(valueStr);
|
|
||||||
if (!Number.isFinite(value)) continue;
|
|
||||||
|
|
||||||
const familyKey = metricName;
|
|
||||||
let family = families.get(familyKey);
|
|
||||||
if (!family) {
|
|
||||||
family = {
|
|
||||||
name: familyKey,
|
|
||||||
help: metricName === currentName ? currentHelp : '',
|
|
||||||
type: metricName === currentName ? currentType : '',
|
|
||||||
samples: [],
|
|
||||||
};
|
|
||||||
families.set(familyKey, family);
|
|
||||||
}
|
|
||||||
|
|
||||||
family.samples.push({ labels, value });
|
|
||||||
}
|
|
||||||
|
|
||||||
return families;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Extract Intel GPU metrics from the parsed map
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
function samplesFor(families: ParsedMetrics, name: string): MetricSample[] {
|
|
||||||
return families.get(name)?.samples ?? [];
|
|
||||||
}
|
|
||||||
|
|
||||||
export function extractGpuNodeMetrics(
|
|
||||||
families: ParsedMetrics,
|
|
||||||
nodeName: string,
|
|
||||||
podName: string
|
|
||||||
): GpuNodeMetrics {
|
|
||||||
const activeSamples = samplesFor(families, 'gpu_i915_engine_active_ticks');
|
|
||||||
const totalSamples = samplesFor(families, 'gpu_i915_engine_total_ticks');
|
|
||||||
|
|
||||||
// Build utilization: active/total per (card, engine)
|
|
||||||
const engineUtilization: GpuNodeMetrics['engineUtilization'] = [];
|
|
||||||
for (const active of activeSamples) {
|
|
||||||
const card = active.labels['card'] ?? active.labels['gpu'] ?? 'gpu0';
|
|
||||||
const engine = active.labels['engine'] ?? 'render/0';
|
|
||||||
const totalSample = totalSamples.find(
|
|
||||||
s =>
|
|
||||||
(s.labels['card'] ?? s.labels['gpu']) === card &&
|
|
||||||
s.labels['engine'] === engine
|
|
||||||
);
|
|
||||||
const total = totalSample?.value ?? 0;
|
|
||||||
const pct = total > 0 ? Math.min(100, Math.round((active.value / total) * 100)) : 0;
|
|
||||||
engineUtilization.push({ card, engine, pct });
|
|
||||||
}
|
|
||||||
|
|
||||||
// Boost frequency
|
|
||||||
const boostFreqMhz = samplesFor(families, 'gpu_i915_gt_boost_freq_mhz').map(s => ({
|
|
||||||
card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
|
|
||||||
value: s.value,
|
|
||||||
}));
|
|
||||||
|
|
||||||
// Memory
|
|
||||||
const memoryLocalBytes = samplesFor(families, 'gpu_i915_memory_local').map(s => ({
|
|
||||||
card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
|
|
||||||
value: s.value,
|
|
||||||
}));
|
|
||||||
const memorySystemBytes = samplesFor(families, 'gpu_i915_memory_system').map(s => ({
|
|
||||||
card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
|
|
||||||
value: s.value,
|
|
||||||
}));
|
|
||||||
|
|
||||||
// Energy
|
|
||||||
const energyMicrojoules = samplesFor(families, 'gpu_i915_energy_microjoules').map(s => ({
|
|
||||||
card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
|
|
||||||
value: s.value,
|
|
||||||
}));
|
|
||||||
|
|
||||||
return {
|
|
||||||
nodeName,
|
|
||||||
podName,
|
|
||||||
engineUtilization,
|
|
||||||
boostFreqMhz,
|
|
||||||
memoryLocalBytes,
|
|
||||||
memorySystemBytes,
|
|
||||||
energyMicrojoules,
|
|
||||||
raw: families,
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Fetch metrics from an Intel GPU device plugin pod
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetches and parses Prometheus metrics from an Intel GPU device plugin pod.
|
* Service discovery: find the Prometheus service.
|
||||||
*
|
* Tries the kube-prometheus-stack default name first, then falls back to prometheus-operated and finally prometheus (all in the monitoring namespace, port 9090).
|
||||||
* The proxy path is:
|
|
||||||
* /api/v1/namespaces/{namespace}/pods/{podName}:9090/proxy/metrics
|
|
||||||
*
|
|
||||||
* Returns null if the pod is not exposing metrics (enableMonitoring: false)
|
|
||||||
* or if the proxy request fails.
|
|
||||||
*/
|
*/
|
||||||
export async function fetchGpuPluginMetrics(
|
const PROMETHEUS_SERVICES = [
|
||||||
podName: string,
|
{ namespace: 'monitoring', service: 'kube-prometheus-stack-prometheus', port: '9090' },
|
||||||
namespace: string,
|
{ namespace: 'monitoring', service: 'prometheus-operated', port: '9090' },
|
||||||
nodeName: string
|
{ namespace: 'monitoring', service: 'prometheus', port: '9090' },
|
||||||
): Promise<GpuNodeMetrics | null> {
|
];
|
||||||
const path = `/api/v1/namespaces/${namespace}/pods/${podName}:9090/proxy/metrics`;
|
|
||||||
|
|
||||||
try {
|
async function queryPrometheus(
|
||||||
const raw: unknown = await ApiProxy.request(path, {
|
query: string,
|
||||||
method: 'GET',
|
prometheusPath: string
|
||||||
isJSON: false,
|
): Promise<PrometheusResult[]> {
|
||||||
});
|
const encoded = encodeURIComponent(query);
|
||||||
|
const path = `${prometheusPath}/api/v1/query?query=${encoded}`;
|
||||||
|
|
||||||
if (typeof raw !== 'string') return null;
|
const raw = await ApiProxy.request(path, { method: 'GET' }) as PrometheusResponse;
|
||||||
|
|
||||||
const families = parsePrometheusText(raw);
|
if (raw?.status !== 'success') return [];
|
||||||
return extractGpuNodeMetrics(families, nodeName, podName);
|
return raw.data?.result ?? [];
|
||||||
} catch {
|
}
|
||||||
return null;
|
|
||||||
|
async function findPrometheusPath(): Promise<string | null> {
|
||||||
|
for (const { namespace, service, port } of PROMETHEUS_SERVICES) {
|
||||||
|
const basePath = `/api/v1/namespaces/${namespace}/services/${service}:${port}/proxy`;
|
||||||
|
try {
|
||||||
|
const raw = await ApiProxy.request(`${basePath}/api/v1/query?query=1`, { method: 'GET' }) as PrometheusResponse;
|
||||||
|
if (raw?.status === 'success') return basePath;
|
||||||
|
} catch {
|
||||||
|
// try next
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Metrics fetch
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
export async function fetchGpuMetrics(): Promise<GpuMetrics | null> {
|
||||||
|
const prometheusPath = await findPrometheusPath();
|
||||||
|
if (!prometheusPath) return null;
|
||||||
|
|
||||||
|
// Run queries in parallel
|
||||||
|
const [chipResults, energyRateResults, powerMaxResults, unameResults] = await Promise.all([
|
||||||
|
// i915 chip identification
|
||||||
|
queryPrometheus('node_hwmon_chip_names{chip_name="i915"}', prometheusPath),
|
||||||
|
// Current power (rate of cumulative energy counter)
|
||||||
|
queryPrometheus(
|
||||||
|
'rate(node_hwmon_energy_joule_total[5m]) * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}',
|
||||||
|
prometheusPath
|
||||||
|
),
|
||||||
|
// TDP / max power
|
||||||
|
queryPrometheus(
|
||||||
|
'node_hwmon_power_max_watt * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}',
|
||||||
|
prometheusPath
|
||||||
|
),
|
||||||
|
// instance → nodename mapping
|
||||||
|
queryPrometheus('node_uname_info', prometheusPath),
|
||||||
|
]);
|
||||||
|
|
||||||
|
// Build instance → nodename map
|
||||||
|
const instanceToNode = new Map<string, string>();
|
||||||
|
for (const r of unameResults) {
|
||||||
|
const inst = r.metric['instance'];
|
||||||
|
const nodename = r.metric['nodename'] ?? r.metric['node'] ?? inst;
|
||||||
|
if (inst) instanceToNode.set(inst, nodename);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build chip → power map
|
||||||
|
const chipToPower = new Map<string, number>();
|
||||||
|
for (const r of energyRateResults) {
|
||||||
|
const chip = r.metric['chip'];
|
||||||
|
if (chip) chipToPower.set(chip, parseFloat(r.value[1]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build chip → max power map
|
||||||
|
const chipToMaxPower = new Map<string, number>();
|
||||||
|
for (const r of powerMaxResults) {
|
||||||
|
const chip = r.metric['chip'];
|
||||||
|
if (chip) chipToMaxPower.set(chip, parseFloat(r.value[1]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assemble per-chip metrics from the chip identification results
|
||||||
|
const chips: GpuChipMetrics[] = chipResults.map(r => {
|
||||||
|
const chip = r.metric['chip'] ?? '';
|
||||||
|
const instance = r.metric['instance'] ?? '';
|
||||||
|
const nodeName = instanceToNode.get(instance) ?? instance;
|
||||||
|
const powerWatts = chipToPower.has(chip) ? chipToPower.get(chip)! : null;
|
||||||
|
const powerMaxWatts = chipToMaxPower.has(chip) ? chipToMaxPower.get(chip)! : null;
|
||||||
|
|
||||||
|
return { nodeName, chip, instance, powerWatts, powerMaxWatts };
|
||||||
|
});
|
||||||
|
|
||||||
|
return {
|
||||||
|
chips,
|
||||||
|
fetchedAt: new Date().toISOString(),
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Formatting helpers
|
// Formatting helpers
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
export function formatBytes(bytes: number): string {
|
export function formatWatts(w: number): string {
|
||||||
if (bytes >= 1e9) return `${(bytes / 1e9).toFixed(1)} GB`;
|
return `${w.toFixed(1)} W`;
|
||||||
if (bytes >= 1e6) return `${(bytes / 1e6).toFixed(1)} MB`;
|
|
||||||
if (bytes >= 1e3) return `${(bytes / 1e3).toFixed(1)} KB`;
|
|
||||||
return `${bytes} B`;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export function formatFreq(mhz: number): string {
|
export function formatPercent(used: number, max: number): string {
|
||||||
return `${Math.round(mhz)} MHz`;
|
if (max <= 0) return '—';
|
||||||
|
return `${Math.round((used / max) * 100)}%`;
|
||||||
}
|
}
|
||||||
|
|||||||
+206
-200
@@ -1,9 +1,29 @@
|
|||||||
/**
|
/**
|
||||||
* MetricsPage — real-time Intel GPU metrics from the device plugin pods.
|
* MetricsPage — Intel GPU metrics from Prometheus (node-exporter hwmon).
|
||||||
*
|
*
|
||||||
* Fetches Prometheus metrics from each Intel GPU device plugin pod (port 9090)
|
* METRIC AVAILABILITY
|
||||||
* and displays per-card engine utilization, GPU frequency, memory usage,
|
* -------------------
|
||||||
* and cumulative energy. Requires `enableMonitoring: true` in GpuDevicePlugin.
|
* Power (current W, TDP)
|
||||||
|
* Source: node_hwmon_energy_joule_total, node_hwmon_power_max_watt
|
||||||
|
* Driver: i915 hwmon sysfs (/sys/class/drm/card{N}/device/hwmon/)
|
||||||
|
* Scraped: node-exporter hwmon collector (enabled by default)
|
||||||
|
* Nodes: Discrete GPU nodes only (hwmon sensors are exposed for discrete cards; integrated GPUs do not expose them)
|
||||||
|
* No extra config required — works out of the box with kube-prometheus-stack.
|
||||||
|
*
|
||||||
|
* GPU Frequency (current, boost, min, max MHz)
|
||||||
|
* Source: DRM sysfs (/sys/class/drm/card{N}/gt_{x}_freq_mhz)
|
||||||
|
* Driver: i915 kernel driver
|
||||||
|
* Scraped: NOT available -- node-exporter --collector.drm is AMD-only and does not
|
||||||
|
* read i915 gt_freq sysfs files. Would require a custom exporter or
|
||||||
|
* node-exporter textfile collector sidecar writing these values.
|
||||||
|
*
|
||||||
|
* GPU Utilization (engine busy %)
|
||||||
|
* Source: Not exposed via hwmon or any standard Prometheus collector for i915.
|
||||||
|
* Would require intel-gpu-top, XPU Manager, or a custom DRM-based exporter.
|
||||||
|
*
|
||||||
|
* Integrated GPU (iGPU) nodes
|
||||||
|
* The iGPU driver does not expose hwmon sensors. No Prometheus metrics are
|
||||||
|
* available for iGPU nodes regardless of configuration.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import {
|
import {
|
||||||
@@ -15,141 +35,140 @@ import {
|
|||||||
} from '@kinvolk/headlamp-plugin/lib/CommonComponents';
|
} from '@kinvolk/headlamp-plugin/lib/CommonComponents';
|
||||||
import React, { useCallback, useEffect, useState } from 'react';
|
import React, { useCallback, useEffect, useState } from 'react';
|
||||||
import { useIntelGpuContext } from '../api/IntelGpuDataContext';
|
import { useIntelGpuContext } from '../api/IntelGpuDataContext';
|
||||||
import {
|
import { fetchGpuMetrics, formatPercent, formatWatts, GpuChipMetrics, GpuMetrics } from '../api/metrics';
|
||||||
fetchGpuPluginMetrics,
|
|
||||||
formatBytes,
|
|
||||||
formatFreq,
|
|
||||||
GpuNodeMetrics,
|
|
||||||
} from '../api/metrics';
|
|
||||||
import { IntelGpuPod } from '../api/k8s';
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Utilization bar
|
// Power bar
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
function UtilizationBar({ pct }: { pct: number }) {
|
function PowerBar({ watts, maxWatts }: { watts: number; maxWatts: number | null }) {
|
||||||
const color = pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
|
const pct = maxWatts && maxWatts > 0 ? Math.min(100, Math.round((watts / maxWatts) * 100)) : null;
|
||||||
|
const color = pct === null ? '#0071c5' : pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div style={{ display: 'flex', alignItems: 'center', gap: '8px' }}>
|
<div style={{ display: 'flex', alignItems: 'center', gap: '8px' }}>
|
||||||
<div
|
{pct !== null && (
|
||||||
style={{
|
|
||||||
width: '100px',
|
|
||||||
height: '8px',
|
|
||||||
backgroundColor: '#e0e0e0',
|
|
||||||
borderRadius: '4px',
|
|
||||||
overflow: 'hidden',
|
|
||||||
flexShrink: 0,
|
|
||||||
}}
|
|
||||||
>
|
|
||||||
<div
|
<div
|
||||||
style={{
|
style={{
|
||||||
width: `${pct}%`,
|
width: '100px',
|
||||||
height: '100%',
|
height: '8px',
|
||||||
backgroundColor: color,
|
backgroundColor: '#e0e0e0',
|
||||||
borderRadius: '4px',
|
borderRadius: '4px',
|
||||||
transition: 'width 0.3s ease',
|
overflow: 'hidden',
|
||||||
|
flexShrink: 0,
|
||||||
}}
|
}}
|
||||||
/>
|
>
|
||||||
</div>
|
<div
|
||||||
<span style={{ fontSize: '12px', fontVariantNumeric: 'tabular-nums' }}>{pct}%</span>
|
style={{
|
||||||
|
width: `${pct}%`,
|
||||||
|
height: '100%',
|
||||||
|
backgroundColor: color,
|
||||||
|
borderRadius: '4px',
|
||||||
|
transition: 'width 0.4s ease',
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
<span style={{ fontSize: '13px', fontVariantNumeric: 'tabular-nums' }}>
|
||||||
|
{formatWatts(watts)}
|
||||||
|
{maxWatts !== null && maxWatts > 0 && (
|
||||||
|
<span style={{ color: '#888', marginLeft: '4px' }}>
|
||||||
|
/ {formatWatts(maxWatts)} ({formatPercent(watts, maxWatts)})
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</span>
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Per-node metrics card
|
// Per-chip card
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) {
|
function GpuChipCard({ chip }: { chip: GpuChipMetrics }) {
|
||||||
const { nodeName, podName, engineUtilization, boostFreqMhz, memoryLocalBytes, memorySystemBytes, energyMicrojoules } = metrics;
|
const rows: Array<{ name: string; value: React.ReactNode }> = [
|
||||||
|
{ name: 'Node', value: chip.nodeName },
|
||||||
|
{ name: 'GPU (PCI)', value: chip.chip },
|
||||||
|
{
|
||||||
|
name: 'Current Power',
|
||||||
|
value: chip.powerWatts !== null
|
||||||
|
? <PowerBar watts={chip.powerWatts} maxWatts={chip.powerMaxWatts} />
|
||||||
|
: <StatusLabel status="warning">No data — needs ≥5m of scrape history</StatusLabel>,
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
// Group engines by card
|
if (chip.powerMaxWatts !== null && chip.powerMaxWatts > 0) {
|
||||||
const byCard = new Map<string, typeof engineUtilization>();
|
rows.push({ name: 'TDP', value: formatWatts(chip.powerMaxWatts) });
|
||||||
for (const e of engineUtilization) {
|
|
||||||
if (!byCard.has(e.card)) byCard.set(e.card, []);
|
|
||||||
byCard.get(e.card)!.push(e);
|
|
||||||
}
|
|
||||||
|
|
||||||
const freqByCard = new Map(boostFreqMhz.map(f => [f.card, f.value]));
|
|
||||||
const memLocalByCard = new Map(memoryLocalBytes.map(m => [m.card, m.value]));
|
|
||||||
const memSysByCard = new Map(memorySystemBytes.map(m => [m.card, m.value]));
|
|
||||||
const energyByCard = new Map(energyMicrojoules.map(e => [e.card, e.value]));
|
|
||||||
|
|
||||||
const cards = Array.from(
|
|
||||||
new Set([
|
|
||||||
...byCard.keys(),
|
|
||||||
...freqByCard.keys(),
|
|
||||||
...memLocalByCard.keys(),
|
|
||||||
])
|
|
||||||
).sort();
|
|
||||||
|
|
||||||
if (cards.length === 0) {
|
|
||||||
return (
|
|
||||||
<SectionBox title={`${nodeName} — No Metric Data`}>
|
|
||||||
<NameValueTable
|
|
||||||
rows={[
|
|
||||||
{
|
|
||||||
name: 'Pod',
|
|
||||||
value: podName,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: 'Note',
|
|
||||||
value: 'No GPU metrics found. Ensure enableMonitoring: true is set in GpuDevicePlugin.',
|
|
||||||
},
|
|
||||||
]}
|
|
||||||
/>
|
|
||||||
</SectionBox>
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<>
|
<SectionBox title={`${chip.nodeName} — ${chip.chip}`}>
|
||||||
{cards.map(card => {
|
<NameValueTable rows={rows} />
|
||||||
const engines = byCard.get(card) ?? [];
|
</SectionBox>
|
||||||
const freq = freqByCard.get(card);
|
);
|
||||||
const memLocal = memLocalByCard.get(card);
|
}
|
||||||
const memSys = memSysByCard.get(card);
|
|
||||||
const energy = energyByCard.get(card);
|
|
||||||
|
|
||||||
const rows: Array<{ name: string; value: React.ReactNode }> = [
|
// ---------------------------------------------------------------------------
|
||||||
{ name: 'Node', value: nodeName },
|
// Requirements info box
|
||||||
{ name: 'Plugin Pod', value: podName },
|
// ---------------------------------------------------------------------------
|
||||||
{ name: 'GPU Card', value: card },
|
|
||||||
];
|
|
||||||
|
|
||||||
if (freq !== undefined) {
|
function MetricRequirements() {
|
||||||
rows.push({ name: 'Boost Frequency', value: formatFreq(freq) });
|
return (
|
||||||
}
|
<SectionBox title="Metric Availability">
|
||||||
|
<NameValueTable
|
||||||
if (memLocal !== undefined) {
|
rows={[
|
||||||
rows.push({ name: 'VRAM (local)', value: formatBytes(memLocal) });
|
{
|
||||||
}
|
name: 'Power (W)',
|
||||||
if (memSys !== undefined && memSys > 0) {
|
value: (
|
||||||
rows.push({ name: 'System Memory', value: formatBytes(memSys) });
|
<>
|
||||||
}
|
<StatusLabel status="success">Available — discrete GPU nodes</StatusLabel>
|
||||||
|
<div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
|
||||||
if (energy !== undefined) {
|
Source: <code>node_hwmon_energy_joule_total</code> via node-exporter hwmon collector (enabled by default).
|
||||||
rows.push({
|
Requires the i915 kernel driver on the node. iGPU nodes do not expose hwmon sensors.
|
||||||
name: 'Energy (cumulative)',
|
</div>
|
||||||
value: `${(energy / 1e6).toFixed(2)} J`,
|
</>
|
||||||
});
|
),
|
||||||
}
|
},
|
||||||
|
{
|
||||||
// Engine utilization rows
|
name: 'Frequency (MHz)',
|
||||||
for (const e of engines) {
|
value: (
|
||||||
rows.push({
|
<>
|
||||||
name: `Engine: ${e.engine}`,
|
<StatusLabel status="error">Not available</StatusLabel>
|
||||||
value: <UtilizationBar pct={e.pct} />,
|
<div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
|
||||||
});
|
i915 exposes <code>gt_*_freq_mhz</code> via DRM sysfs but node-exporter's{' '}
|
||||||
}
|
<code>--collector.drm</code> flag is AMD-only and does not read these files.
|
||||||
|
A custom exporter or textfile-collector sidecar writing these values would be required.
|
||||||
return (
|
</div>
|
||||||
<SectionBox key={`${nodeName}-${card}`} title={`${nodeName} — ${card}`}>
|
</>
|
||||||
<NameValueTable rows={rows} />
|
),
|
||||||
</SectionBox>
|
},
|
||||||
);
|
{
|
||||||
})}
|
name: 'Utilization (%)',
|
||||||
</>
|
value: (
|
||||||
|
<>
|
||||||
|
<StatusLabel status="error">Not available</StatusLabel>
|
||||||
|
<div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
|
||||||
|
No standard Prometheus collector exposes i915 engine busy percentage.
|
||||||
|
Would require intel-gpu-top, XPU Manager, or a custom DRM-based exporter.
|
||||||
|
</div>
|
||||||
|
</>
|
||||||
|
),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: 'iGPU nodes',
|
||||||
|
value: (
|
||||||
|
<>
|
||||||
|
<StatusLabel status="error">No metrics available</StatusLabel>
|
||||||
|
<div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
|
||||||
|
The integrated GPU driver does not expose hwmon sensors. No Prometheus metrics
|
||||||
|
are available for iGPU nodes regardless of configuration.
|
||||||
|
</div>
|
||||||
|
</>
|
||||||
|
),
|
||||||
|
},
|
||||||
|
]}
|
||||||
|
/>
|
||||||
|
</SectionBox>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -158,38 +177,33 @@ function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) {
|
|||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
export default function MetricsPage() {
|
export default function MetricsPage() {
|
||||||
const { pluginPods, pluginInstalled, loading: ctxLoading } = useIntelGpuContext();
|
const { gpuNodes, loading: ctxLoading } = useIntelGpuContext();
|
||||||
|
|
||||||
const [metricsMap, setMetricsMap] = useState<Map<string, GpuNodeMetrics | 'error'>>(new Map());
|
const [metrics, setMetrics] = useState<GpuMetrics | null>(null);
|
||||||
|
const [fetchError, setFetchError] = useState<string | null>(null);
|
||||||
const [fetching, setFetching] = useState(false);
|
const [fetching, setFetching] = useState(false);
|
||||||
|
|
||||||
const fetchAll = useCallback(async (pods: IntelGpuPod[]) => {
|
const doFetch = useCallback(async () => {
|
||||||
if (pods.length === 0) return;
|
|
||||||
setFetching(true);
|
setFetching(true);
|
||||||
|
setFetchError(null);
|
||||||
const results = await Promise.all(
|
try {
|
||||||
pods.map(async pod => {
|
const result = await fetchGpuMetrics();
|
||||||
const name = pod.metadata.name;
|
setMetrics(result);
|
||||||
const namespace = pod.metadata.namespace ?? 'kube-system';
|
if (!result) {
|
||||||
const nodeName = pod.spec?.nodeName ?? name;
|
setFetchError('Could not reach Prometheus. Ensure kube-prometheus-stack is installed in the monitoring namespace.');
|
||||||
const result = await fetchGpuPluginMetrics(name, namespace, nodeName);
|
}
|
||||||
return { name, result };
|
} catch (e: unknown) {
|
||||||
})
|
setFetchError(e instanceof Error ? e.message : String(e));
|
||||||
);
|
} finally {
|
||||||
|
setFetching(false);
|
||||||
const map = new Map<string, GpuNodeMetrics | 'error'>();
|
|
||||||
for (const { name, result } of results) {
|
|
||||||
map.set(name, result ?? 'error');
|
|
||||||
}
|
}
|
||||||
setMetricsMap(map);
|
|
||||||
setFetching(false);
|
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (!ctxLoading && pluginPods.length > 0) {
|
if (!ctxLoading) {
|
||||||
void fetchAll(pluginPods);
|
void doFetch();
|
||||||
}
|
}
|
||||||
}, [ctxLoading, pluginPods, fetchAll]);
|
}, [ctxLoading, doFetch]);
|
||||||
|
|
||||||
if (ctxLoading) {
|
if (ctxLoading) {
|
||||||
return <Loader title="Loading Intel GPU data..." />;
|
return <Loader title="Loading Intel GPU data..." />;
|
||||||
@@ -200,8 +214,8 @@ export default function MetricsPage() {
|
|||||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
|
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
|
||||||
<SectionHeader title="Intel GPU — Metrics" />
|
<SectionHeader title="Intel GPU — Metrics" />
|
||||||
<button
|
<button
|
||||||
onClick={() => void fetchAll(pluginPods)}
|
onClick={() => void doFetch()}
|
||||||
disabled={fetching || pluginPods.length === 0}
|
disabled={fetching}
|
||||||
aria-label="Refresh metrics"
|
aria-label="Refresh metrics"
|
||||||
style={{
|
style={{
|
||||||
padding: '6px 16px',
|
padding: '6px 16px',
|
||||||
@@ -218,94 +232,86 @@ export default function MetricsPage() {
|
|||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{!pluginInstalled && (
|
<MetricRequirements />
|
||||||
<SectionBox title="Intel GPU Plugin Not Detected">
|
|
||||||
|
{fetching && !metrics && <Loader title="Querying Prometheus for GPU metrics..." />}
|
||||||
|
|
||||||
|
{fetchError && (
|
||||||
|
<SectionBox title="Prometheus Unreachable">
|
||||||
<NameValueTable
|
<NameValueTable
|
||||||
rows={[
|
rows={[
|
||||||
{
|
{
|
||||||
name: 'Status',
|
name: 'Error',
|
||||||
value: (
|
value: <StatusLabel status="error">{fetchError}</StatusLabel>,
|
||||||
<StatusLabel status="warning">No Intel GPU device plugin pods found</StatusLabel>
|
|
||||||
),
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: 'Note',
|
name: 'Checked services',
|
||||||
value: 'Install the Intel GPU device plugin and set enableMonitoring: true to expose Prometheus metrics.',
|
value: 'kube-prometheus-stack-prometheus:9090, prometheus-operated:9090, prometheus:9090 (monitoring namespace)',
|
||||||
},
|
},
|
||||||
]}
|
]}
|
||||||
/>
|
/>
|
||||||
</SectionBox>
|
</SectionBox>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
{pluginInstalled && pluginPods.length === 0 && (
|
{metrics && metrics.chips.length === 0 && (
|
||||||
<SectionBox title="No Plugin Pods Found">
|
<SectionBox title="No i915 Metrics in Prometheus">
|
||||||
<NameValueTable
|
|
||||||
rows={[
|
|
||||||
{
|
|
||||||
name: 'Status',
|
|
||||||
value: (
|
|
||||||
<StatusLabel status="warning">Plugin detected via CRD but no pods found</StatusLabel>
|
|
||||||
),
|
|
||||||
},
|
|
||||||
]}
|
|
||||||
/>
|
|
||||||
</SectionBox>
|
|
||||||
)}
|
|
||||||
|
|
||||||
{pluginPods.length > 0 && metricsMap.size === 0 && fetching && (
|
|
||||||
<Loader title="Fetching GPU metrics..." />
|
|
||||||
)}
|
|
||||||
|
|
||||||
{pluginPods.length > 0 && metricsMap.size === 0 && !fetching && (
|
|
||||||
<SectionBox title="Metrics Unavailable">
|
|
||||||
<NameValueTable
|
<NameValueTable
|
||||||
rows={[
|
rows={[
|
||||||
{
|
{
|
||||||
name: 'Status',
|
name: 'Status',
|
||||||
value: (
|
value: (
|
||||||
<StatusLabel status="warning">
|
<StatusLabel status="warning">
|
||||||
Could not fetch metrics from any plugin pod
|
Prometheus reachable — no node_hwmon_chip_names{chip_name="i915"} found
|
||||||
</StatusLabel>
|
</StatusLabel>
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: 'Requirements',
|
name: 'GPU Nodes',
|
||||||
value: 'Set enableMonitoring: true in GpuDevicePlugin spec and ensure port 9090 is accessible via kube-apiserver proxy.',
|
value: gpuNodes.length > 0 ? gpuNodes.map(n => n.metadata.name).join(', ') : 'None detected',
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: 'Plugin Pods Found',
|
name: 'Likely cause',
|
||||||
value: pluginPods.map(p => p.metadata.name).join(', '),
|
value: 'node-exporter is not running on the GPU nodes, or the hwmon collector is disabled.',
|
||||||
},
|
},
|
||||||
]}
|
]}
|
||||||
/>
|
/>
|
||||||
</SectionBox>
|
</SectionBox>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
{Array.from(metricsMap.entries()).map(([podName, metrics]) => {
|
{metrics && metrics.chips.length > 0 && (
|
||||||
if (metrics === 'error') {
|
<>
|
||||||
return (
|
<SectionBox title="GPU Power Summary">
|
||||||
<SectionBox key={podName} title={`${podName} — Metrics Unavailable`}>
|
<NameValueTable
|
||||||
<NameValueTable
|
rows={[
|
||||||
rows={[
|
{
|
||||||
{
|
name: 'GPUs Monitored',
|
||||||
name: 'Status',
|
value: String(metrics.chips.length),
|
||||||
value: (
|
},
|
||||||
<StatusLabel status="error">
|
{
|
||||||
Failed to fetch metrics from pod
|
name: 'Total Power',
|
||||||
</StatusLabel>
|
value: (() => {
|
||||||
),
|
const total = metrics.chips.reduce((s, c) => s + (c.powerWatts ?? 0), 0);
|
||||||
},
|
const maxTotal = metrics.chips.reduce((s, c) => s + (c.powerMaxWatts ?? 0), 0);
|
||||||
{
|
return <PowerBar watts={total} maxWatts={maxTotal > 0 ? maxTotal : null} />;
|
||||||
name: 'Hint',
|
})(),
|
||||||
value: 'Ensure enableMonitoring: true is set in the GpuDevicePlugin CR and the pod is running.',
|
},
|
||||||
},
|
{
|
||||||
]}
|
name: 'Last Fetched',
|
||||||
/>
|
value: new Date(metrics.fetchedAt).toLocaleTimeString(),
|
||||||
</SectionBox>
|
},
|
||||||
);
|
{
|
||||||
}
|
name: 'Query',
|
||||||
return <NodeMetricsCard key={podName} metrics={metrics} />;
|
value: 'rate(node_hwmon_energy_joule_total[5m]) joined with node_hwmon_chip_names{chip_name="i915"}',
|
||||||
})}
|
},
|
||||||
|
]}
|
||||||
|
/>
|
||||||
|
</SectionBox>
|
||||||
|
|
||||||
|
{metrics.chips.map(chip => (
|
||||||
|
<GpuChipCard key={`${chip.instance}-${chip.chip}`} chip={chip} />
|
||||||
|
))}
|
||||||
|
</>
|
||||||
|
)}
|
||||||
</>
|
</>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user