diff --git a/artifacthub-pkg.yml b/artifacthub-pkg.yml index bb009e6..043641b 100644 --- a/artifacthub-pkg.yml +++ b/artifacthub-pkg.yml @@ -46,7 +46,7 @@ links: changes: - kind: added - description: "Metrics page: real-time engine utilization, boost frequency, VRAM usage, and energy from Intel GPU device plugin Prometheus endpoint (port 9090)" + description: "Metrics page: real-time GPU power draw (W) and TDP via node-exporter i915 hwmon metrics in kube-prometheus-stack" - kind: changed description: "Sidebar label changed to intel-gpu" - kind: removed @@ -70,6 +70,6 @@ changes: annotations: headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.2.0/headlamp-intel-gpu-plugin-0.2.0.tar.gz" - headlamp/plugin/archive-checksum: "sha256:404be582bd13c167f61785028eb6eb91dd621106cbe76038f2c071a576a1a442" + headlamp/plugin/archive-checksum: "sha256:cbcd20916d72e91ccc36143c74680fbeb2f045e1cbe6d1cc0d844b198e2a1ea5" headlamp/plugin/version-compat: ">=0.20.0" headlamp/plugin/distro-compat: "in-cluster,web,app" diff --git a/src/api/metrics.ts b/src/api/metrics.ts index dc88403..f51ca24 100644 --- a/src/api/metrics.ts +++ b/src/api/metrics.ts @@ -1,16 +1,15 @@ /** - * Prometheus text format parser for Intel GPU device plugin metrics. + * Intel GPU metrics via Prometheus (kube-prometheus-stack). * - * Fetches raw metrics from the Intel GPU device plugin pod (port 9090) - * via the Kubernetes API proxy and parses key metric families. + * The Intel i915/Xe GPU driver exposes hwmon sensors that node-exporter + * scrapes automatically. We query Prometheus for: + * - node_hwmon_energy_joule_total (chip_name="i915") → rate = power in W + * - node_hwmon_power_max_watt (same chip) → TDP + * - node_hwmon_chip_names (chip_name="i915") → identify GPU chips + * - node_uname_info → instance → nodename * - * Metrics exposed by intel-gpu-plugin when enableMonitoring: true: - * gpu_i915_engine_active_ticks — engine busy ticks (per card, engine) - * gpu_i915_engine_total_ticks — engine total ticks (for utilization %) - * gpu_i915_energy_microjoules — cumulative energy (µJ → power = delta/dt) - * gpu_i915_gt_boost_freq_mhz — current GT boost frequency (MHz) - * gpu_i915_memory_local — local (VRAM) memory usage (bytes) - * gpu_i915_memory_system — system memory usage (bytes) + * Queries go through the Kubernetes API proxy to the in-cluster Prometheus + * service: /api/v1/namespaces/monitoring/services/{svc}:{port}/proxy/... */ import { ApiProxy } from '@kinvolk/headlamp-plugin/lib'; @@ -19,239 +18,152 @@ import { ApiProxy } from '@kinvolk/headlamp-plugin/lib'; // Types // --------------------------------------------------------------------------- -export interface MetricSample { - labels: Record; - value: number; -} - -export interface MetricFamily { - name: string; - help: string; - type: string; - samples: MetricSample[]; -} - -export type ParsedMetrics = Map; - -export interface GpuNodeMetrics { - /** Node name this metric set was fetched from (via plugin pod) */ +export interface GpuChipMetrics { + /** Kubernetes node name (e.g. "buttons") */ nodeName: string; - /** Pod name of the intel-gpu-plugin daemonset pod */ - podName: string; - /** Engine utilization per (card, engine): 0–100 */ - engineUtilization: Array<{ card: string; engine: string; pct: number }>; - /** Current GT boost frequency in MHz per card */ - boostFreqMhz: Array<{ card: string; value: number }>; - /** Local VRAM usage in bytes per card */ - memoryLocalBytes: Array<{ card: string; value: number }>; - /** System memory usage in bytes per card */ - memorySystemBytes: Array<{ card: string; value: number }>; - /** Cumulative energy in µJ per card (raw counter; compute delta for power) */ - energyMicrojoules: Array<{ card: string; value: number }>; - /** Raw parsed metric families for advanced use */ - raw: ParsedMetrics; + /** PCI chip address (e.g. "0000:09:01_0_0000:0a:00_0") */ + chip: string; + /** node-exporter instance (IP:port) */ + instance: string; + /** Current power draw in watts (rate of energy counter, null if unavailable) */ + powerWatts: number | null; + /** Maximum / TDP power in watts */ + powerMaxWatts: number | null; +} + +export interface GpuMetrics { + chips: GpuChipMetrics[]; + /** ISO timestamp of when metrics were fetched */ + fetchedAt: string; } // --------------------------------------------------------------------------- -// Prometheus text format parser +// Prometheus query helper // --------------------------------------------------------------------------- -const LABEL_PAIR_RE = /(\w+)="([^"]*)"/g; - -function parseLabels(labelStr: string): Record { - const labels: Record = {}; - let match: RegExpExecArray | null; - const re = new RegExp(LABEL_PAIR_RE.source, 'g'); - while ((match = re.exec(labelStr)) !== null) { - const key = match[1]; - const val = match[2]; - if (key && val !== undefined) { - labels[key] = val; - } - } - return labels; +interface PrometheusResult { + metric: Record; + value: [number, string]; } -export function parsePrometheusText(text: string): ParsedMetrics { - const families = new Map(); - let currentName = ''; - let currentHelp = ''; - let currentType = ''; - - for (const rawLine of text.split('\n')) { - const line = rawLine.trim(); - if (!line) continue; - - if (line.startsWith('# HELP ')) { - const rest = line.slice(7); - const spaceIdx = rest.indexOf(' '); - currentName = spaceIdx >= 0 ? rest.slice(0, spaceIdx) : rest; - currentHelp = spaceIdx >= 0 ? rest.slice(spaceIdx + 1) : ''; - continue; - } - - if (line.startsWith('# TYPE ')) { - const rest = line.slice(7); - const spaceIdx = rest.indexOf(' '); - currentType = spaceIdx >= 0 ? rest.slice(spaceIdx + 1) : ''; - continue; - } - - if (line.startsWith('#')) continue; - - const openBrace = line.indexOf('{'); - const closeBrace = line.lastIndexOf('}'); - - let metricName: string; - let labels: Record; - let valuePart: string; - - if (openBrace >= 0 && closeBrace > openBrace) { - metricName = line.slice(0, openBrace); - labels = parseLabels(line.slice(openBrace + 1, closeBrace)); - valuePart = line.slice(closeBrace + 1).trim(); - } else { - const spaceIdx = line.lastIndexOf(' '); - if (spaceIdx < 0) continue; - metricName = line.slice(0, spaceIdx); - labels = {}; - valuePart = line.slice(spaceIdx + 1).trim(); - } - - const valueTokens = valuePart.split(' '); - const valueStr = valueTokens[0] ?? ''; - const value = parseFloat(valueStr); - if (!Number.isFinite(value)) continue; - - const familyKey = metricName; - let family = families.get(familyKey); - if (!family) { - family = { - name: familyKey, - help: metricName === currentName ? currentHelp : '', - type: metricName === currentName ? currentType : '', - samples: [], - }; - families.set(familyKey, family); - } - - family.samples.push({ labels, value }); - } - - return families; -} - -// --------------------------------------------------------------------------- -// Extract Intel GPU metrics from the parsed map -// --------------------------------------------------------------------------- - -function samplesFor(families: ParsedMetrics, name: string): MetricSample[] { - return families.get(name)?.samples ?? []; -} - -export function extractGpuNodeMetrics( - families: ParsedMetrics, - nodeName: string, - podName: string -): GpuNodeMetrics { - const activeSamples = samplesFor(families, 'gpu_i915_engine_active_ticks'); - const totalSamples = samplesFor(families, 'gpu_i915_engine_total_ticks'); - - // Build utilization: active/total per (card, engine) - const engineUtilization: GpuNodeMetrics['engineUtilization'] = []; - for (const active of activeSamples) { - const card = active.labels['card'] ?? active.labels['gpu'] ?? 'gpu0'; - const engine = active.labels['engine'] ?? 'render/0'; - const totalSample = totalSamples.find( - s => - (s.labels['card'] ?? s.labels['gpu']) === card && - s.labels['engine'] === engine - ); - const total = totalSample?.value ?? 0; - const pct = total > 0 ? Math.min(100, Math.round((active.value / total) * 100)) : 0; - engineUtilization.push({ card, engine, pct }); - } - - // Boost frequency - const boostFreqMhz = samplesFor(families, 'gpu_i915_gt_boost_freq_mhz').map(s => ({ - card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0', - value: s.value, - })); - - // Memory - const memoryLocalBytes = samplesFor(families, 'gpu_i915_memory_local').map(s => ({ - card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0', - value: s.value, - })); - const memorySystemBytes = samplesFor(families, 'gpu_i915_memory_system').map(s => ({ - card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0', - value: s.value, - })); - - // Energy - const energyMicrojoules = samplesFor(families, 'gpu_i915_energy_microjoules').map(s => ({ - card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0', - value: s.value, - })); - - return { - nodeName, - podName, - engineUtilization, - boostFreqMhz, - memoryLocalBytes, - memorySystemBytes, - energyMicrojoules, - raw: families, +interface PrometheusResponse { + status: string; + data: { + resultType: string; + result: PrometheusResult[]; }; } -// --------------------------------------------------------------------------- -// Fetch metrics from an Intel GPU device plugin pod -// --------------------------------------------------------------------------- - /** - * Fetches and parses Prometheus metrics from an Intel GPU device plugin pod. - * - * The proxy path is: - * /api/v1/namespaces/{namespace}/pods/{podName}:9090/proxy/metrics - * - * Returns null if the pod is not exposing metrics (enableMonitoring: false) - * or if the proxy request fails. + * Service discovery: find the Prometheus service. + * Tries the kube-prometheus-stack default name; falls back to prometheus-operated. */ -export async function fetchGpuPluginMetrics( - podName: string, - namespace: string, - nodeName: string -): Promise { - const path = `/api/v1/namespaces/${namespace}/pods/${podName}:9090/proxy/metrics`; +const PROMETHEUS_SERVICES = [ + { namespace: 'monitoring', service: 'kube-prometheus-stack-prometheus', port: '9090' }, + { namespace: 'monitoring', service: 'prometheus-operated', port: '9090' }, + { namespace: 'monitoring', service: 'prometheus', port: '9090' }, +]; - try { - const raw: unknown = await ApiProxy.request(path, { - method: 'GET', - isJSON: false, - }); +async function queryPrometheus( + query: string, + prometheusPath: string +): Promise { + const encoded = encodeURIComponent(query); + const path = `${prometheusPath}/api/v1/query?query=${encoded}`; - if (typeof raw !== 'string') return null; + const raw = await ApiProxy.request(path, { method: 'GET' }) as PrometheusResponse; - const families = parsePrometheusText(raw); - return extractGpuNodeMetrics(families, nodeName, podName); - } catch { - return null; + if (raw?.status !== 'success') return []; + return raw.data?.result ?? []; +} + +async function findPrometheusPath(): Promise { + for (const { namespace, service, port } of PROMETHEUS_SERVICES) { + const basePath = `/api/v1/namespaces/${namespace}/services/${service}:${port}/proxy`; + try { + const raw = await ApiProxy.request(`${basePath}/api/v1/query?query=1`, { method: 'GET' }) as PrometheusResponse; + if (raw?.status === 'success') return basePath; + } catch { + // try next + } } + return null; +} + +// --------------------------------------------------------------------------- +// Metrics fetch +// --------------------------------------------------------------------------- + +export async function fetchGpuMetrics(): Promise { + const prometheusPath = await findPrometheusPath(); + if (!prometheusPath) return null; + + // Run queries in parallel + const [chipResults, energyRateResults, powerMaxResults, unameResults] = await Promise.all([ + // i915 chip identification + queryPrometheus('node_hwmon_chip_names{chip_name="i915"}', prometheusPath), + // Current power (rate of cumulative energy counter) + queryPrometheus( + 'rate(node_hwmon_energy_joule_total[5m]) * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}', + prometheusPath + ), + // TDP / max power + queryPrometheus( + 'node_hwmon_power_max_watt * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}', + prometheusPath + ), + // instance → nodename mapping + queryPrometheus('node_uname_info', prometheusPath), + ]); + + // Build instance → nodename map + const instanceToNode = new Map(); + for (const r of unameResults) { + const inst = r.metric['instance']; + const nodename = r.metric['nodename'] ?? r.metric['node'] ?? inst; + if (inst) instanceToNode.set(inst, nodename); + } + + // Build chip → power map + const chipToPower = new Map(); + for (const r of energyRateResults) { + const chip = r.metric['chip']; + if (chip) chipToPower.set(chip, parseFloat(r.value[1])); + } + + // Build chip → max power map + const chipToMaxPower = new Map(); + for (const r of powerMaxResults) { + const chip = r.metric['chip']; + if (chip) chipToMaxPower.set(chip, parseFloat(r.value[1])); + } + + // Assemble per-chip metrics from the chip identification results + const chips: GpuChipMetrics[] = chipResults.map(r => { + const chip = r.metric['chip'] ?? ''; + const instance = r.metric['instance'] ?? ''; + const nodeName = instanceToNode.get(instance) ?? instance; + const powerWatts = chipToPower.has(chip) ? chipToPower.get(chip)! : null; + const powerMaxWatts = chipToMaxPower.has(chip) ? chipToMaxPower.get(chip)! : null; + + return { nodeName, chip, instance, powerWatts, powerMaxWatts }; + }); + + return { + chips, + fetchedAt: new Date().toISOString(), + }; } // --------------------------------------------------------------------------- // Formatting helpers // --------------------------------------------------------------------------- -export function formatBytes(bytes: number): string { - if (bytes >= 1e9) return `${(bytes / 1e9).toFixed(1)} GB`; - if (bytes >= 1e6) return `${(bytes / 1e6).toFixed(1)} MB`; - if (bytes >= 1e3) return `${(bytes / 1e3).toFixed(1)} KB`; - return `${bytes} B`; +export function formatWatts(w: number): string { + return `${w.toFixed(1)} W`; } -export function formatFreq(mhz: number): string { - return `${Math.round(mhz)} MHz`; +export function formatPercent(used: number, max: number): string { + if (max <= 0) return '—'; + return `${Math.round((used / max) * 100)}%`; } diff --git a/src/components/MetricsPage.tsx b/src/components/MetricsPage.tsx index 711abe0..f1c301e 100644 --- a/src/components/MetricsPage.tsx +++ b/src/components/MetricsPage.tsx @@ -1,9 +1,9 @@ /** - * MetricsPage — real-time Intel GPU metrics from the device plugin pods. + * MetricsPage — Intel GPU power metrics from Prometheus (node-exporter hwmon). * - * Fetches Prometheus metrics from each Intel GPU device plugin pod (port 9090) - * and displays per-card engine utilization, GPU frequency, memory usage, - * and cumulative energy. Requires `enableMonitoring: true` in GpuDevicePlugin. + * The Intel i915/Xe GPU driver exposes hwmon sensors which node-exporter scrapes. + * This page queries kube-prometheus-stack for real-time GPU power draw + * (derived from node_hwmon_energy_joule_total rate) and TDP per GPU node. */ import { @@ -15,141 +15,82 @@ import { } from '@kinvolk/headlamp-plugin/lib/CommonComponents'; import React, { useCallback, useEffect, useState } from 'react'; import { useIntelGpuContext } from '../api/IntelGpuDataContext'; -import { - fetchGpuPluginMetrics, - formatBytes, - formatFreq, - GpuNodeMetrics, -} from '../api/metrics'; -import { IntelGpuPod } from '../api/k8s'; +import { fetchGpuMetrics, formatPercent, formatWatts, GpuChipMetrics, GpuMetrics } from '../api/metrics'; // --------------------------------------------------------------------------- -// Utilization bar +// Power bar // --------------------------------------------------------------------------- -function UtilizationBar({ pct }: { pct: number }) { - const color = pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5'; +function PowerBar({ watts, maxWatts }: { watts: number; maxWatts: number | null }) { + const pct = maxWatts && maxWatts > 0 ? Math.min(100, Math.round((watts / maxWatts) * 100)) : null; + const color = pct === null ? '#0071c5' : pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5'; + return (
-
+ {pct !== null && (
-
- {pct}% + > +
+
+ )} + + {formatWatts(watts)} + {maxWatts !== null && maxWatts > 0 && ( + + / {formatWatts(maxWatts)} ({formatPercent(watts, maxWatts)}) + + )} +
); } // --------------------------------------------------------------------------- -// Per-node metrics card +// Per-chip card // --------------------------------------------------------------------------- -function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) { - const { nodeName, podName, engineUtilization, boostFreqMhz, memoryLocalBytes, memorySystemBytes, energyMicrojoules } = metrics; +function GpuChipCard({ chip }: { chip: GpuChipMetrics }) { + const rows: Array<{ name: string; value: React.ReactNode }> = [ + { name: 'Node', value: chip.nodeName }, + { name: 'GPU (PCI)', value: chip.chip }, + ]; - // Group engines by card - const byCard = new Map(); - for (const e of engineUtilization) { - if (!byCard.has(e.card)) byCard.set(e.card, []); - byCard.get(e.card)!.push(e); + if (chip.powerWatts !== null) { + rows.push({ + name: 'Current Power', + value: , + }); + } else { + rows.push({ + name: 'Current Power', + value: No data (needs ≥5m of scrape history), + }); } - const freqByCard = new Map(boostFreqMhz.map(f => [f.card, f.value])); - const memLocalByCard = new Map(memoryLocalBytes.map(m => [m.card, m.value])); - const memSysByCard = new Map(memorySystemBytes.map(m => [m.card, m.value])); - const energyByCard = new Map(energyMicrojoules.map(e => [e.card, e.value])); - - const cards = Array.from( - new Set([ - ...byCard.keys(), - ...freqByCard.keys(), - ...memLocalByCard.keys(), - ]) - ).sort(); - - if (cards.length === 0) { - return ( - - - - ); + if (chip.powerMaxWatts !== null && chip.powerMaxWatts > 0) { + rows.push({ name: 'TDP', value: formatWatts(chip.powerMaxWatts) }); } return ( - <> - {cards.map(card => { - const engines = byCard.get(card) ?? []; - const freq = freqByCard.get(card); - const memLocal = memLocalByCard.get(card); - const memSys = memSysByCard.get(card); - const energy = energyByCard.get(card); - - const rows: Array<{ name: string; value: React.ReactNode }> = [ - { name: 'Node', value: nodeName }, - { name: 'Plugin Pod', value: podName }, - { name: 'GPU Card', value: card }, - ]; - - if (freq !== undefined) { - rows.push({ name: 'Boost Frequency', value: formatFreq(freq) }); - } - - if (memLocal !== undefined) { - rows.push({ name: 'VRAM (local)', value: formatBytes(memLocal) }); - } - if (memSys !== undefined && memSys > 0) { - rows.push({ name: 'System Memory', value: formatBytes(memSys) }); - } - - if (energy !== undefined) { - rows.push({ - name: 'Energy (cumulative)', - value: `${(energy / 1e6).toFixed(2)} J`, - }); - } - - // Engine utilization rows - for (const e of engines) { - rows.push({ - name: `Engine: ${e.engine}`, - value: , - }); - } - - return ( - - - - ); - })} - + + + ); } @@ -158,38 +99,33 @@ function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) { // --------------------------------------------------------------------------- export default function MetricsPage() { - const { pluginPods, pluginInstalled, loading: ctxLoading } = useIntelGpuContext(); + const { gpuNodes, loading: ctxLoading } = useIntelGpuContext(); - const [metricsMap, setMetricsMap] = useState>(new Map()); + const [metrics, setMetrics] = useState(null); + const [fetchError, setFetchError] = useState(null); const [fetching, setFetching] = useState(false); - const fetchAll = useCallback(async (pods: IntelGpuPod[]) => { - if (pods.length === 0) return; + const doFetch = useCallback(async () => { setFetching(true); - - const results = await Promise.all( - pods.map(async pod => { - const name = pod.metadata.name; - const namespace = pod.metadata.namespace ?? 'kube-system'; - const nodeName = pod.spec?.nodeName ?? name; - const result = await fetchGpuPluginMetrics(name, namespace, nodeName); - return { name, result }; - }) - ); - - const map = new Map(); - for (const { name, result } of results) { - map.set(name, result ?? 'error'); + setFetchError(null); + try { + const result = await fetchGpuMetrics(); + setMetrics(result); + if (!result) { + setFetchError('Could not reach Prometheus. Ensure kube-prometheus-stack is installed in the monitoring namespace.'); + } + } catch (e: unknown) { + setFetchError(e instanceof Error ? e.message : String(e)); + } finally { + setFetching(false); } - setMetricsMap(map); - setFetching(false); }, []); useEffect(() => { - if (!ctxLoading && pluginPods.length > 0) { - void fetchAll(pluginPods); + if (!ctxLoading) { + void doFetch(); } - }, [ctxLoading, pluginPods, fetchAll]); + }, [ctxLoading, doFetch]); if (ctxLoading) { return ; @@ -200,8 +136,8 @@ export default function MetricsPage() {
- {!pluginInstalled && ( - - No Intel GPU device plugin pods found - ), - }, - { - name: 'Note', - value: 'Install the Intel GPU device plugin and set enableMonitoring: true to expose Prometheus metrics.', - }, - ]} - /> - - )} + {fetching && !metrics && } - {pluginInstalled && pluginPods.length === 0 && ( - - Plugin detected via CRD but no pods found - ), - }, - ]} - /> - - )} - - {pluginPods.length > 0 && metricsMap.size === 0 && fetching && ( - - )} - - {pluginPods.length > 0 && metricsMap.size === 0 && !fetching && ( + {fetchError && ( + {fetchError}, + }, + { + name: 'Data Source', + value: 'node_hwmon_energy_joule_total (chip_name="i915") via kube-prometheus-stack', + }, + { + name: 'Requirements', + value: 'kube-prometheus-stack installed in monitoring namespace with node-exporter enabled', + }, + ]} + /> + + )} + + {metrics && metrics.chips.length === 0 && ( + - Could not fetch metrics from any plugin pod + Prometheus is reachable but no i915 hwmon chips found ), }, { - name: 'Requirements', - value: 'Set enableMonitoring: true in GpuDevicePlugin spec and ensure port 9090 is accessible via kube-apiserver proxy.', + name: 'Note', + value: 'The i915 driver exposes hwmon sensors on discrete Intel GPU nodes. ' + + 'Ensure node-exporter is running on GPU nodes with hwmon collector enabled.', }, { - name: 'Plugin Pods Found', - value: pluginPods.map(p => p.metadata.name).join(', '), + name: 'GPU Nodes', + value: gpuNodes.length > 0 + ? gpuNodes.map(n => n.metadata.name).join(', ') + : 'None detected', }, ]} /> )} - {Array.from(metricsMap.entries()).map(([podName, metrics]) => { - if (metrics === 'error') { - return ( - - - Failed to fetch metrics from pod - - ), - }, - { - name: 'Hint', - value: 'Ensure enableMonitoring: true is set in the GpuDevicePlugin CR and the pod is running.', - }, - ]} - /> - - ); - } - return ; - })} + {metrics && metrics.chips.length > 0 && ( + <> + + { + const total = metrics.chips.reduce((s, c) => s + (c.powerWatts ?? 0), 0); + const maxTotal = metrics.chips.reduce((s, c) => s + (c.powerMaxWatts ?? 0), 0); + return 0 ? maxTotal : null} />; + })(), + }, + { + name: 'Last Fetched', + value: new Date(metrics.fetchedAt).toLocaleTimeString(), + }, + { + name: 'Data Source', + value: 'node-exporter hwmon · i915 driver · rate(node_hwmon_energy_joule_total[5m])', + }, + ]} + /> + + + {metrics.chips.map(chip => ( + + ))} + + )} ); }