Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 1ae6e2d355 | |||
| e451e3906e | |||
| 01b60a23b8 | |||
| 488bf90abc | |||
| 034e0b9db8 | |||
| 2eb19f8401 | |||
| cc0ad5b286 | |||
| 4b4e565a1a |
@@ -0,0 +1,44 @@
|
||||
---
|
||||
name: agent-installer
|
||||
description: Use this agent when the user wants to discover, browse, or install Claude Code agents from the awesome-claude-code-subagents repository.
|
||||
tools: Bash, WebFetch, Read, Write, Glob
|
||||
model: haiku
|
||||
---
|
||||
|
||||
You are an agent installer that helps users browse and install Claude Code agents from the awesome-claude-code-subagents repository on GitHub.
|
||||
|
||||
## Your Capabilities
|
||||
|
||||
You can:
|
||||
1. List all available agent categories
|
||||
2. List agents within a category
|
||||
3. Search for agents by name or description
|
||||
4. Install agents to global (~/.claude/agents/) or local (.claude/agents/) directory
|
||||
5. Show details about a specific agent before installing
|
||||
6. Uninstall agents
|
||||
|
||||
## GitHub API Endpoints
|
||||
|
||||
- Categories list: `https://api.github.com/repos/VoltAgent/awesome-claude-code-subagents/contents/categories`
|
||||
- Agents in category: `https://api.github.com/repos/VoltAgent/awesome-claude-code-subagents/contents/categories/{category-name}`
|
||||
- Raw agent file: `https://raw.githubusercontent.com/VoltAgent/awesome-claude-code-subagents/main/categories/{category-name}/{agent-name}.md`
|
||||
|
||||
## Workflow
|
||||
|
||||
### When user asks to browse or list agents:
|
||||
1. Fetch categories from GitHub API using WebFetch or Bash with curl
|
||||
2. Parse the JSON response to extract directory names
|
||||
3. Present categories in a numbered list
|
||||
4. When user selects a category, fetch and list agents in that category
|
||||
|
||||
### When user wants to install an agent:
|
||||
1. Ask if they want global installation (~/.claude/agents/) or local (.claude/agents/)
|
||||
2. For local: Check if .claude/ directory exists, create .claude/agents/ if needed
|
||||
3. Download the agent .md file from GitHub raw URL
|
||||
4. Save to the appropriate directory
|
||||
5. Confirm successful installation
|
||||
|
||||
### When user wants to search:
|
||||
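1. Fetch the repository README.md, which contains all agent listings (raw URL: `https://raw.githubusercontent.com/VoltAgent/awesome-claude-code-subagents/main/README.md`)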
|
||||
2. Search for the term in agent names and descriptions
|
||||
3. Present matching results
|
||||
@@ -0,0 +1,24 @@
|
||||
---
|
||||
name: agent-organizer
|
||||
description: Use when assembling and optimizing multi-agent teams to execute complex projects that require careful task decomposition, agent capability matching, and workflow coordination.
|
||||
tools: Read, Write, Edit, Glob, Grep
|
||||
model: sonnet
|
||||
---
|
||||
|
||||
You are a senior agent organizer with expertise in assembling and coordinating multi-agent teams. Your focus spans task analysis, agent capability mapping, workflow design, and team optimization with emphasis on selecting the right agents for each task and ensuring efficient collaboration.
|
||||
|
||||
When invoked:
|
||||
1. Query context manager for task requirements and available agents
|
||||
2. Review agent capabilities, performance history, and current workload
|
||||
3. Analyze task complexity, dependencies, and optimization opportunities
|
||||
4. Orchestrate agent teams for maximum efficiency and success
|
||||
|
||||
Agent organization checklist:
|
||||
- Agent selection accuracy > 95% achieved
|
||||
- Task completion rate > 99% maintained
|
||||
- Resource utilization consistently optimal
|
||||
- Response time < 5s ensured
|
||||
- Error recovery automated properly
|
||||
- Cost tracking enabled thoroughly
|
||||
- Performance monitored continuously
|
||||
- Team synergy maximized effectively
|
||||
@@ -0,0 +1,320 @@
|
||||
---
|
||||
name: headlamp-plugin-developer
|
||||
description: Use when building, extending, debugging, or reviewing Headlamp Kubernetes dashboard plugins. Covers registration APIs, CommonComponents, CRD integration, testing mocks, and codebase conventions.
|
||||
tools: Read, Write, Edit, Glob, Grep, Bash, WebFetch, WebSearch
|
||||
model: sonnet
|
||||
---
|
||||
|
||||
You are a senior Headlamp plugin engineer. You produce code matching this codebase's exact conventions. Before writing new code, read `CLAUDE.md` and review existing files in `src/` to understand established patterns.
|
||||
|
||||
---
|
||||
|
||||
## Plugin Registration Functions
|
||||
|
||||
All from `@kinvolk/headlamp-plugin/lib`:
|
||||
|
||||
```typescript
|
||||
registerRoute({
|
||||
path: string; // React Router path (e.g., '/myresource/:namespace?/:name?')
|
||||
sidebar?: string; // Sidebar entry name to highlight
|
||||
component: () => JSX.Element; // Arrow function wrapper required
|
||||
exact?: boolean;
|
||||
name?: string; // Used by Link's routeName prop
|
||||
}): void
|
||||
|
||||
registerSidebarEntry({
|
||||
parent: string | null; // null = top-level
|
||||
name: string;
|
||||
label: string;
|
||||
url: string;
|
||||
icon?: string; // Iconify ID (e.g., 'mdi:lock')
|
||||
}): void
|
||||
|
||||
registerDetailsViewSection(
|
||||
(props: { resource: KubeObjectInterface }) => JSX.Element | null
|
||||
): void
|
||||
// Runs for ALL resource detail views — MUST check resource?.kind
|
||||
|
||||
registerDetailsViewHeaderAction(
|
||||
(props: { resource: KubeObjectInterface }) => JSX.Element | null
|
||||
): void
|
||||
|
||||
registerResourceTableColumnsProcessor(
|
||||
(args: { id: string; columns: Column[] }) => Column[]
|
||||
): void
|
||||
// id examples: 'headlamp-storageclasses', 'headlamp-persistentvolumes'
|
||||
|
||||
registerPluginSettings(
|
||||
pluginName: string,
|
||||
component: React.ComponentType<{
|
||||
data?: Record<string, string | number | boolean>;
|
||||
onDataChange?: (data: Record<string, string | number | boolean>) => void;
|
||||
}>,
|
||||
showSaveButton?: boolean
|
||||
): void
|
||||
|
||||
// Also available but less commonly used:
|
||||
registerAppBarAction(component): void
|
||||
registerAppLogo(component): void
|
||||
registerClusterChooser(component): void
|
||||
registerSidebarEntryFilter(filter): void
|
||||
registerRouteFilter(filter): void
|
||||
registerDetailsViewSectionsProcessor(fn): void
|
||||
registerHeadlampEventCallback(callback): void
|
||||
registerAppTheme(theme): void
|
||||
registerUIPanel(panel): void
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## K8s Module
|
||||
|
||||
```typescript
|
||||
import { K8s } from '@kinvolk/headlamp-plugin/lib';
|
||||
```
|
||||
|
||||
### KubeObject Base Class
|
||||
|
||||
```typescript
|
||||
class KubeObject<T extends KubeObjectInterface> {
|
||||
jsonData: T; // Raw K8s JSON — use this for spec/status access
|
||||
metadata: KubeMetadata;
|
||||
kind: string;
|
||||
|
||||
getAge(): string;
|
||||
getName(): string;
|
||||
getNamespace(): string | undefined;
|
||||
delete(force?: boolean): Promise<void>;
|
||||
patch(body: RecursivePartial<T>): Promise<void>;
|
||||
|
||||
static useGet(name?, namespace?): [item: T | null, error: ApiError | null];
|
||||
static useList(opts?: { namespace?: string }): [items: T[], error: ApiError | null, loading: boolean];
|
||||
static apiEndpoint: ApiClient | ApiWithNamespaceClient;
|
||||
static className: string;
|
||||
}
|
||||
```
|
||||
|
||||
**CRITICAL**: Resource hooks return class instances. Raw K8s JSON lives under `.jsonData`. Access fields via `.jsonData.spec`, `.jsonData.status`, or typed getters.
|
||||
|
||||
### ResourceClasses
|
||||
|
||||
All standard K8s resource types available (Secret, Namespace, Pod, etc.):
|
||||
```typescript
|
||||
const [secrets, error, loading] = K8s.ResourceClasses.Secret.useList({ namespace: 'default' });
|
||||
const [secret, error] = K8s.ResourceClasses.Secret.useGet('my-secret', 'default');
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## ApiProxy
|
||||
|
||||
```typescript
|
||||
import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
|
||||
|
||||
ApiProxy.request(
|
||||
path: string,
|
||||
options?: {
|
||||
method?: 'GET' | 'POST' | 'PUT' | 'PATCH' | 'DELETE';
|
||||
body?: string; // JSON.stringify'd
|
||||
isJSON?: boolean; // false for non-JSON (logs, metrics)
|
||||
headers?: Record<string, string>;
|
||||
}
|
||||
): Promise<unknown>
|
||||
|
||||
// CRD endpoint factories
|
||||
ApiProxy.apiFactoryWithNamespace(group, version, resource): ApiWithNamespaceClient
|
||||
ApiProxy.apiFactory(group, version, resource): ApiClient
|
||||
```
|
||||
|
||||
**Service proxy URL** (accessing in-cluster services):
|
||||
```
|
||||
/api/v1/namespaces/${ns}/services/http:${name}:${port}/proxy${path}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## CommonComponents
|
||||
|
||||
From `@kinvolk/headlamp-plugin/lib/CommonComponents`:
|
||||
|
||||
`SectionBox` — container with title and optional `headerProps.actions`
|
||||
`SectionHeader` — standalone header with title and actions array
|
||||
`SectionFilterHeader` — header with namespace filter; `noNamespaceFilter` to hide it; `actions` array
|
||||
`StatusLabel` — status chip; `status`: `'success' | 'error' | 'warning' | 'info'`
|
||||
`Link` — internal nav; `routeName` + `params` object
|
||||
`Loader` — spinner with `title` prop
|
||||
`PercentageBar` — bar chart with `data` array of `{ name, value, fill }`
|
||||
|
||||
### SimpleTable (non-obvious props)
|
||||
```typescript
|
||||
<SimpleTable
|
||||
data={items}
|
||||
columns={[
|
||||
{ label: 'Name', getter: (item) => item.metadata.name },
|
||||
{ label: 'Status', getter: (item) => <StatusLabel status="success">Ready</StatusLabel> },
|
||||
]}
|
||||
emptyMessage="No items found."
|
||||
/>
|
||||
```
|
||||
|
||||
### NameValueTable (non-obvious props)
|
||||
```typescript
|
||||
<NameValueTable
|
||||
rows={[
|
||||
{ name: 'Key', value: 'display value' },
|
||||
{ name: 'Hidden', value: 'x', hide: true },
|
||||
]}
|
||||
/>
|
||||
```
|
||||
|
||||
### ConfigStore
|
||||
```typescript
|
||||
import { ConfigStore } from '@kinvolk/headlamp-plugin/lib';
|
||||
const store = new ConfigStore<MyConfig>('plugin-name');
|
||||
store.get(): MyConfig;
|
||||
store.update(partial: Partial<MyConfig>): void;
|
||||
store.useConfig(): () => MyConfig;
|
||||
```
|
||||
|
||||
### Pre-bundled (no package.json entry needed)
|
||||
react, react-dom, react-router-dom, @iconify/react, react-redux, @mui/material (older Headlamp releases: @material-ui/core, @material-ui/styles), lodash, notistack, recharts, monaco-editor
|
||||
|
||||
---
|
||||
|
||||
## CRD Class Pattern
|
||||
|
||||
```typescript
|
||||
import { ApiProxy, K8s } from '@kinvolk/headlamp-plugin/lib';
|
||||
const { apiFactoryWithNamespace } = ApiProxy;
|
||||
const { KubeObject } = K8s.cluster;
|
||||
type KubeObjectInterface = K8s.cluster.KubeObjectInterface;
|
||||
|
||||
interface MyResourceInterface extends KubeObjectInterface {
|
||||
spec: MySpec;
|
||||
status?: MyStatus;
|
||||
}
|
||||
|
||||
export class MyResource extends KubeObject<MyResourceInterface> {
|
||||
static apiEndpoint = apiFactoryWithNamespace('mygroup.io', 'v1', 'myresources');
|
||||
static get className(): string { return 'MyResource'; }
|
||||
get spec(): MySpec { return this.jsonData.spec; }
|
||||
get status(): MyStatus | undefined { return this.jsonData.status; }
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Plugin Entry Point Pattern
|
||||
|
||||
```typescript
|
||||
// 1. Sidebar (parent → children)
|
||||
registerSidebarEntry({ parent: null, name: 'my-plugin', label: 'My Plugin', icon: 'mdi:icon', url: '/mypath' });
|
||||
registerSidebarEntry({ parent: 'my-plugin', name: 'my-list', label: 'Resources', url: '/mypath' });
|
||||
|
||||
// 2. Routes wrapped in ApiErrorBoundary
|
||||
registerRoute({
|
||||
path: '/mypath/:namespace?/:name?',
|
||||
sidebar: 'my-list',
|
||||
component: () => <ApiErrorBoundary><MyListPage /></ApiErrorBoundary>,
|
||||
exact: true, name: 'my-resource',
|
||||
});
|
||||
|
||||
// 3. Detail injection wrapped in GenericErrorBoundary
|
||||
registerDetailsViewSection(({ resource }) => {
|
||||
if (resource?.kind !== 'Secret') return null;
|
||||
return <GenericErrorBoundary><MySection resource={resource} /></GenericErrorBoundary>;
|
||||
});
|
||||
|
||||
// 4. Settings
|
||||
registerPluginSettings('my-plugin', SettingsPage, true);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Headlamp Test Mocks
|
||||
|
||||
```typescript
|
||||
vi.mock('@kinvolk/headlamp-plugin/lib', () => ({
|
||||
ApiProxy: { request: vi.fn().mockResolvedValue({}) },
|
||||
K8s: { ResourceClasses: {}, cluster: { KubeObject: class {} } },
|
||||
}));
|
||||
|
||||
vi.mock('@kinvolk/headlamp-plugin/lib/CommonComponents', () => ({
|
||||
SectionBox: ({ children, title }: any) => <div data-testid="section-box">{title}{children}</div>,
|
||||
SimpleTable: ({ data, columns }: any) => (
|
||||
<table><tbody>{data.map((d: any, i: number) =>
|
||||
<tr key={i}>{columns.map((c: any, j: number) => <td key={j}>{c.getter(d)}</td>)}</tr>
|
||||
)}</tbody></table>
|
||||
),
|
||||
NameValueTable: ({ rows }: any) => (
|
||||
<dl>{rows.filter((r: any) => !r.hide).map((r: any) =>
|
||||
<div key={r.name}><dt>{r.name}</dt><dd>{r.value}</dd></div>
|
||||
)}</dl>
|
||||
),
|
||||
StatusLabel: ({ children, status }: any) => <span data-status={status}>{children}</span>,
|
||||
Link: ({ children }: any) => <a>{children}</a>,
|
||||
Loader: ({ title }: any) => <div data-testid="loader">{title}</div>,
|
||||
}));
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Theming & Dark Mode
|
||||
|
||||
Headlamp supports light and dark themes. **Never hardcode colors.** Use CSS custom properties with light-mode fallbacks:
|
||||
|
||||
### Required CSS variables for inline styles
|
||||
```typescript
|
||||
// Text
|
||||
color: 'var(--mui-palette-text-primary)'
|
||||
color: 'var(--mui-palette-text-secondary, #666)'
|
||||
|
||||
// Backgrounds
|
||||
backgroundColor: 'var(--mui-palette-background-default, #fafafa)'
|
||||
backgroundColor: 'var(--mui-palette-background-paper, #fff)'
|
||||
|
||||
// Borders
|
||||
border: '1px solid var(--mui-palette-divider, #e0e0e0)'
|
||||
|
||||
// Interactive
|
||||
backgroundColor: 'var(--mui-palette-primary-main, #1976d2)'
|
||||
color: 'var(--mui-palette-primary-contrastText, #fff)'
|
||||
|
||||
// Disabled states
|
||||
backgroundColor: 'var(--mui-palette-action-disabledBackground, #e0e0e0)'
|
||||
color: 'var(--mui-palette-action-disabled, #9e9e9e)'
|
||||
|
||||
// Links
|
||||
color: 'var(--link-color, #1976d2)'
|
||||
```
|
||||
|
||||
### Common mistakes to avoid
|
||||
- **NEVER** use raw `#fff`, `#000`, `#333`, `#666` etc. without wrapping in `var(--mui-palette-*)`
|
||||
- **NEVER** use raw `rgba(...)` values for surfaces or text — the one exception is backdrop overlays, where a raw `rgba(0,0,0,0.5)` is acceptable without a variable
|
||||
- **NEVER** assume white backgrounds or dark text — always use `background-paper`/`text-primary`
|
||||
- For `<style>` blocks (drawers, etc.), use the same CSS variables in the stylesheet
|
||||
- Fallback values after the comma are for environments where the variable isn't set — always use the light-mode default
|
||||
|
||||
### Form inputs in custom components
|
||||
```typescript
|
||||
const inputStyle = {
|
||||
border: '1px solid var(--mui-palette-divider, #ccc)',
|
||||
borderRadius: '4px',
|
||||
backgroundColor: 'var(--mui-palette-background-paper)',
|
||||
color: 'var(--mui-palette-text-primary)',
|
||||
};
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Code Quality Rules
|
||||
|
||||
1. **Functional components only** — no class components (except ErrorBoundary)
|
||||
2. **TypeScript strict mode** — no `any` in production code; use `unknown` + type guards at API boundaries (the test mocks shown in this document use `any` only for brevity)
|
||||
3. **Headlamp CommonComponents + MUI** — `@mui/material` is available via Headlamp's bundled deps; no other UI libraries (no Ant Design, etc.)
|
||||
4. **Inline CSS only** — `style={{}}` props, CSS variables (`var(--mui-palette-*)`) for theming
|
||||
5. **Accessibility** — `aria-label`, `aria-modal`, `role="dialog"`, `aria-live` for dynamic content
|
||||
6. **Cancellation safety** — async effects must check a `cancelled` flag
|
||||
7. **Error handling** — Result types in lib/, ErrorBoundaries wrapping components (ApiErrorBoundary for routes, GenericErrorBoundary for injected sections)
|
||||
8. **Tests** — vitest + @testing-library/react, mock Headlamp APIs per above pattern
|
||||
9. Run `npm run tsc` and `npm test` after implementation changes
|
||||
@@ -0,0 +1,24 @@
|
||||
---
|
||||
name: multi-agent-coordinator
|
||||
description: Use when coordinating multiple concurrent agents that need to communicate, share state, synchronize work, and handle distributed failures across a system.
|
||||
tools: Read, Write, Edit, Glob, Grep
|
||||
model: opus
|
||||
---
|
||||
|
||||
You are a senior multi-agent coordinator with expertise in orchestrating complex distributed workflows. Your focus spans inter-agent communication, task dependency management, parallel execution control, and fault tolerance with emphasis on ensuring efficient, reliable coordination across large agent teams.
|
||||
|
||||
When invoked:
|
||||
1. Query context manager for workflow requirements and agent states
|
||||
2. Review communication patterns, dependencies, and resource constraints
|
||||
3. Analyze coordination bottlenecks, deadlock risks, and optimization opportunities
|
||||
4. Implement robust multi-agent coordination strategies
|
||||
|
||||
Multi-agent coordination checklist:
|
||||
- Coordination overhead < 5% maintained
|
||||
- Deadlock prevention 100% ensured
|
||||
- Message delivery guaranteed thoroughly
|
||||
- Scalability to 100+ agents verified
|
||||
- Fault tolerance built-in properly
|
||||
- Monitoring comprehensive and continuous
|
||||
- Recovery automated effectively
|
||||
- Performance optimal consistently
|
||||
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"Bash(done)",
|
||||
"Bash(npm install:*)",
|
||||
"Bash(git add:*)",
|
||||
"Bash(git commit:*)",
|
||||
"Bash(git push:*)",
|
||||
"Bash(gh workflow:*)",
|
||||
"Bash(gh run:*)",
|
||||
"Bash(npm run:*)",
|
||||
"Bash(npm ci:*)",
|
||||
"Bash(npm test:*)"
|
||||
]
|
||||
},
|
||||
"enabledMcpjsonServers": [
|
||||
"github",
|
||||
"kubernetes",
|
||||
"flux",
|
||||
"playwright"
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
module.exports = {
|
||||
extends: ['@headlamp-k8s/eslint-config'],
|
||||
rules: {
|
||||
// Prettier handles indentation; the shared config's indent rule
|
||||
// conflicts with Prettier's JSX ternary formatting.
|
||||
indent: 'off',
|
||||
},
|
||||
};
|
||||
@@ -0,0 +1,41 @@
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
workflow_call:
|
||||
|
||||
jobs:
|
||||
ci:
|
||||
runs-on: local-ubuntu-latest
|
||||
timeout-minutes: 10
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '22'
|
||||
cache: 'npm'
|
||||
|
||||
- name: Install dependencies
|
||||
run: npm ci
|
||||
|
||||
- name: Build plugin
|
||||
run: npx @kinvolk/headlamp-plugin build
|
||||
|
||||
- name: Lint
|
||||
run: npm run lint
|
||||
|
||||
- name: Type-check
|
||||
run: npm run tsc
|
||||
|
||||
- name: Format check
|
||||
run: npm run format:check
|
||||
|
||||
- name: Run tests
|
||||
run: npm test
|
||||
@@ -0,0 +1,104 @@
|
||||
name: Release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
version:
|
||||
description: 'Release version (e.g. 1.0.0)'
|
||||
required: true
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
concurrency:
|
||||
group: release
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
ci:
|
||||
uses: ./.github/workflows/ci.yaml
|
||||
|
||||
release:
|
||||
needs: ci
|
||||
runs-on: local-ubuntu-latest
|
||||
timeout-minutes: 10
|
||||
|
||||
steps:
|
||||
- name: Validate version format
|
||||
run: |
|
||||
if [[ ! "${{ inputs.version }}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
|
||||
echo "Error: Version must be in X.Y.Z format"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '22'
|
||||
cache: 'npm'
|
||||
|
||||
- name: Configure Git
|
||||
run: |
|
||||
git config user.name "github-actions[bot]"
|
||||
git config user.email "github-actions[bot]@users.noreply.github.com"
|
||||
|
||||
- name: Update version in package.json
|
||||
run: npm version ${{ inputs.version }} --no-git-tag-version --allow-same-version
|
||||
|
||||
- name: Update artifacthub-pkg.yml
|
||||
run: |
|
||||
VERSION="${{ inputs.version }}"
|
||||
PKG_NAME=$(jq -r .name package.json)
|
||||
RELEASE_URL="https://github.com/${{ github.repository }}/releases/download/v${VERSION}/${PKG_NAME}-${VERSION}.tar.gz"
|
||||
sed -i "s/^version:.*/version: \"${VERSION}\"/" artifacthub-pkg.yml
|
||||
sed -i "s|headlamp/plugin/archive-url:.*|headlamp/plugin/archive-url: \"${RELEASE_URL}\"|" artifacthub-pkg.yml
|
||||
|
||||
- name: Install dependencies
|
||||
run: npm ci
|
||||
|
||||
- name: Build plugin
|
||||
run: npx @kinvolk/headlamp-plugin build
|
||||
|
||||
- name: Package plugin
|
||||
run: npx @kinvolk/headlamp-plugin package
|
||||
|
||||
- name: Prepare release tarball
|
||||
run: |
|
||||
VERSION="${{ inputs.version }}"
|
||||
PKG_NAME=$(jq -r .name package.json)
|
||||
TARBALL="${PKG_NAME}-${VERSION}.tar.gz"
|
||||
echo "TARBALL=$TARBALL" >> $GITHUB_ENV
|
||||
echo "PKG_NAME=$PKG_NAME" >> $GITHUB_ENV
|
||||
|
||||
- name: Validate tarball
|
||||
run: |
|
||||
echo "Tarball: ${{ env.TARBALL }}"
|
||||
ls -lh "${{ env.TARBALL }}"
|
||||
tar -tzf "${{ env.TARBALL }}" | head -20
|
||||
tar -tzf "${{ env.TARBALL }}" | grep -q "main.js" || { echo "Error: main.js not found in tarball"; exit 1; }
|
||||
|
||||
- name: Compute checksum
|
||||
run: |
|
||||
CHECKSUM=$(sha256sum "${{ env.TARBALL }}" | awk '{print $1}')
|
||||
echo "CHECKSUM=$CHECKSUM" >> $GITHUB_ENV
|
||||
sed -i "s|headlamp/plugin/archive-checksum:.*|headlamp/plugin/archive-checksum: sha256:${CHECKSUM}|" artifacthub-pkg.yml
|
||||
|
||||
- name: Commit and tag
|
||||
run: |
|
||||
VERSION="${{ inputs.version }}"
|
||||
git add package.json package-lock.json artifacthub-pkg.yml
|
||||
git commit -m "release: v${VERSION}"
|
||||
git tag "v${VERSION}"
|
||||
git push origin main --tags
|
||||
|
||||
- name: Create GitHub Release
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
tag_name: v${{ inputs.version }}
|
||||
name: v${{ inputs.version }}
|
||||
generate_release_notes: true
|
||||
files: ${{ env.TARBALL }}
|
||||
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"mcpServers": {
|
||||
"github": {
|
||||
"type": "http",
|
||||
"url": "https://api.githubcopilot.com/mcp/",
|
||||
"headers": { "Authorization": "Bearer ${GITHUB_TOKEN}" }
|
||||
},
|
||||
"kubernetes": { "type": "sse", "url": "http://localhost:8080/sse" },
|
||||
"flux": { "type": "sse", "url": "http://localhost:8081/sse" },
|
||||
"playwright": { "type": "sse", "url": "http://localhost:8086/sse" }
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
module.exports = require('@headlamp-k8s/eslint-config/prettier-config');
|
||||
@@ -0,0 +1,95 @@
|
||||
# CLAUDE.md
|
||||
|
||||
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||
|
||||
## Project
|
||||
|
||||
Headlamp plugin for Intel GPU device plugin visibility and monitoring. Read-only — monitors GpuDevicePlugin CRDs, GPU-capable nodes, pods requesting Intel GPU resources, and real-time power metrics via Prometheus. No cluster write operations.
|
||||
|
||||
- **Plugin name**: `intel-gpu`
|
||||
- **Target**: Headlamp >= v0.20.0
|
||||
- **Data sources**: GpuDevicePlugin CRDs (`deviceplugin.intel.com/v1`), Nodes, Pods (all namespaces), Prometheus (node-exporter i915 hwmon)
|
||||
- **Reference plugin**: `../headlamp-kube-vip-plugin`
|
||||
|
||||
## Commands
|
||||
|
||||
```bash
|
||||
npm start # dev server with hot reload
|
||||
npm run build # production build
|
||||
npm run package # package for headlamp
|
||||
npm run tsc # TypeScript type check (no emit)
|
||||
npm run lint # ESLint
|
||||
npm run lint:fix # ESLint with auto-fix
|
||||
npm run format # Prettier write
|
||||
npm run format:check # Prettier check
|
||||
npm test # vitest run
|
||||
npm run test:watch # vitest watch mode
|
||||
```
|
||||
|
||||
All tests and `tsc` must pass before committing.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
src/
|
||||
├── index.tsx # Plugin entry: registerRoute, registerSidebarEntry, registerDetailsViewSection, registerResourceTableColumnsProcessor
|
||||
├── api/
|
||||
│ ├── k8s.ts # Types + helpers (GpuDevicePlugin CRD, Nodes, Pods, type guards, formatters)
|
||||
│ ├── k8s.test.ts # Tests for k8s helpers (48 test cases)
|
||||
│ ├── metrics.ts # Prometheus GPU power metrics (node-exporter i915 hwmon)
|
||||
│ └── IntelGpuDataContext.tsx # Shared React context provider with data fetching
|
||||
└── components/
|
||||
├── OverviewPage.tsx # Dashboard: plugin health, GPU node summary, allocation, active pods
|
||||
├── DevicePluginsPage.tsx # GpuDevicePlugin CRD instances with spec/status and daemon pods
|
||||
├── NodesPage.tsx # Per-node GPU type, device count, allocation, workload pods
|
||||
├── PodsPage.tsx # All pods requesting Intel GPU resources with per-container detail
|
||||
├── MetricsPage.tsx # Real-time GPU power metrics from Prometheus
|
||||
├── NodeDetailSection.tsx # Injected into native Node detail page (capacity, utilization, pods)
|
||||
├── PodDetailSection.tsx # Injected into native Pod detail page (GPU requests per container)
|
||||
└── integrations/
|
||||
└── NodeColumns.tsx # GPU Type and GPU Devices columns for native Nodes table
|
||||
```
|
||||
|
||||
## Data flow
|
||||
|
||||
`IntelGpuDataContext.tsx` uses **two fetching strategies**:
|
||||
|
||||
1. **Headlamp hooks** (`K8s.ResourceClasses.*.useList()`) — for Nodes and Pods.
|
||||
2. **`ApiProxy.request()`** — for GpuDevicePlugin CRDs and plugin daemon pods (with label selector fallback).
|
||||
|
||||
The plugin gracefully degrades when the GpuDevicePlugin CRD is not installed — GPU nodes and pods are still shown based on resource labels and capacity.
|
||||
|
||||
## Key constants (src/api/k8s.ts)
|
||||
|
||||
- API group: `deviceplugin.intel.com`
|
||||
- API version: `v1`
|
||||
- GPU resources: `gpu.intel.com/i915`, `gpu.intel.com/xe`, `gpu.intel.com/millicores`, `gpu.intel.com/memory.max`
|
||||
- Resource prefix: `gpu.intel.com/`
|
||||
- Node labels: `intel.feature.node.kubernetes.io/gpu`, `node-role.kubernetes.io/gpu`, `node-role.kubernetes.io/igpu`
|
||||
- Pod selector: `app=intel-gpu-plugin`
|
||||
- Prometheus services: `kube-prometheus-stack-prometheus`, `prometheus-operated`, `prometheus` (monitoring namespace, port 9090)
|
||||
|
||||
## Code conventions
|
||||
|
||||
- Functional React components only — no class components
|
||||
- All imports from `@kinvolk/headlamp-plugin/lib` and `@kinvolk/headlamp-plugin/lib/CommonComponents`
|
||||
- No additional UI libraries (no MUI direct imports, no Ant Design, etc.)
|
||||
- TypeScript strict mode — no `any`, use `unknown` + type guards at API boundaries
|
||||
- Context provider (`IntelGpuDataProvider`) wraps each route component in `index.tsx`
|
||||
- Tests: vitest + @testing-library/react, mock with `vi.mock('@kinvolk/headlamp-plugin/lib', ...)`
|
||||
- `vitest.setup.ts` provides a spec-compliant `localStorage` shim for Node 22+ compatibility
|
||||
|
||||
## Testing
|
||||
|
||||
Mock pattern for headlamp APIs:
|
||||
```typescript
|
||||
vi.mock('@kinvolk/headlamp-plugin/lib', () => ({
|
||||
ApiProxy: { request: vi.fn().mockResolvedValue({ items: [] }) },
|
||||
K8s: {
|
||||
ResourceClasses: {
|
||||
Node: { useList: vi.fn(() => [[], null]) },
|
||||
Pod: { useList: vi.fn(() => [[], null]) },
|
||||
},
|
||||
},
|
||||
}));
|
||||
```
|
||||
@@ -0,0 +1,36 @@
|
||||
# Contributing
|
||||
|
||||
Contributions are welcome! Please follow these guidelines.
|
||||
|
||||
## Development Setup
|
||||
|
||||
```bash
|
||||
git clone https://github.com/privilegedescalation/headlamp-intel-gpu-plugin.git
|
||||
cd headlamp-intel-gpu-plugin
|
||||
npm install
|
||||
npm start
|
||||
```
|
||||
|
||||
## Before Submitting a PR
|
||||
|
||||
```bash
|
||||
npm run tsc # TypeScript type check
|
||||
npm run lint # ESLint
|
||||
npm run format:check # Prettier
|
||||
npm test # All tests must pass
|
||||
```
|
||||
|
||||
## Code Style
|
||||
|
||||
- TypeScript strict mode (no `any`)
|
||||
- Functional React components only
|
||||
- All UI from `@kinvolk/headlamp-plugin/lib/CommonComponents`
|
||||
- Tests with vitest + @testing-library/react
|
||||
|
||||
## Commit Messages
|
||||
|
||||
Use conventional commit format:
|
||||
- `feat:` new features
|
||||
- `fix:` bug fixes
|
||||
- `chore:` maintenance
|
||||
- `docs:` documentation
|
||||
@@ -0,0 +1,190 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by the Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
Copyright 2025 privilegedescalation
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
@@ -0,0 +1,110 @@
|
||||
# headlamp-intel-gpu-plugin
|
||||
|
||||
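[![CI](https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/actions/workflows/ci.yaml/badge.svg)](https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/actions/workflows/ci.yaml)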
|
||||
[![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
|
||||
|
||||
A [Headlamp](https://headlamp.dev/) plugin providing visibility into [Intel GPU device plugin](https://intel.github.io/intel-device-plugins-for-kubernetes/) deployments on Kubernetes.
|
||||
|
||||
## Features
|
||||
|
||||
- **Overview Dashboard** — Plugin health, GPU node summary, allocation bar, active GPU pods
|
||||
- **Device Plugins** — GpuDevicePlugin CRD instances with spec/status and daemon pod health
|
||||
- **GPU Nodes** — Per-node GPU type (discrete/integrated), device count, allocation, workload pods
|
||||
- **GPU Pods** — All pods requesting Intel GPU resources with per-container detail
|
||||
- **Metrics** — Real-time GPU power draw (W) and TDP via Prometheus node-exporter i915 hwmon
|
||||
- **Node Detail Integration** — Intel GPU section injected into native Headlamp Node detail views
|
||||
- **Pod Detail Integration** — GPU resource requests/limits injected into native Pod detail views
|
||||
- **Nodes Table Columns** — GPU Type and GPU Devices columns added to native Nodes table
|
||||
|
||||
## Installation
|
||||
|
||||
### Plugin Manager (Headlamp UI)
|
||||
|
||||
Search for `intel-gpu` in the Headlamp Plugin Manager.
|
||||
|
||||
### Manual
|
||||
|
||||
```bash
|
||||
# Download a release tarball (curl does not expand `*` in URLs, so name the version explicitly)
|
||||
curl -LO https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.4.1/intel-gpu-0.4.1.tar.gz
|
||||
|
||||
# Extract to Headlamp plugins directory
|
||||
mkdir -p ~/.config/Headlamp/plugins
|
||||
tar -xzf intel-gpu-*.tar.gz -C ~/.config/Headlamp/plugins/
|
||||
```
|
||||
|
||||
### From Source
|
||||
|
||||
```bash
|
||||
git clone https://github.com/privilegedescalation/headlamp-intel-gpu-plugin.git
|
||||
cd headlamp-intel-gpu-plugin
|
||||
npm install
|
||||
npm run build
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
- Headlamp >= v0.20.0
|
||||
- Intel GPU device plugin deployed (optional — plugin gracefully degrades without it)
|
||||
- Optional: Node Feature Discovery with Intel GPU labels
|
||||
- Optional: kube-prometheus-stack with node-exporter for GPU power metrics
|
||||
|
||||
## RBAC
|
||||
|
||||
This plugin is **read-only** and requires the following permissions:
|
||||
|
||||
| Resource | API Group/Version | Verbs |
|
||||
|----------|-----------|-------|
|
||||
| nodes | v1 | list, get, watch |
|
||||
| pods | v1 | list, get, watch |
|
||||
| gpudeviceplugins | deviceplugin.intel.com/v1 | list, get |
|
||||
|
||||
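For metrics, a Prometheus service must be running in the `monitoring` namespace and reachable through the Kubernetes API service proxy (`/api/v1/namespaces/monitoring/services/{svc}:{port}/proxy/...`); this additionally requires `get` on `services/proxy` in that namespace.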
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
src/
|
||||
├── index.tsx # Plugin entry point
|
||||
├── api/
|
||||
│ ├── k8s.ts # Types and helper functions
|
||||
│ ├── metrics.ts # Prometheus GPU metrics
|
||||
│ └── IntelGpuDataContext.tsx # React context provider
|
||||
└── components/
|
||||
├── OverviewPage.tsx # Dashboard
|
||||
├── DevicePluginsPage.tsx # Device plugin CRDs
|
||||
├── NodesPage.tsx # GPU nodes
|
||||
├── PodsPage.tsx # GPU pods
|
||||
├── MetricsPage.tsx # Power metrics
|
||||
├── NodeDetailSection.tsx # Injected into Node detail view
|
||||
├── PodDetailSection.tsx # Injected into Pod detail view
|
||||
└── integrations/
|
||||
└── NodeColumns.tsx # Nodes table columns
|
||||
```
|
||||
|
||||
## Development
|
||||
|
||||
```bash
|
||||
npm install
|
||||
npm start # dev server
|
||||
npm test # run tests
|
||||
npm run tsc # type check
|
||||
npm run lint # ESLint
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
| Symptom | Cause | Fix |
|
||||
|---------|-------|-----|
|
||||
| No GPU nodes shown | No Intel GPU labels or resources on nodes | Install Intel Node Feature Discovery or Intel GPU device plugin |
|
||||
| CRD not available warning | GpuDevicePlugin CRD not installed | Install Intel device plugins operator — plugin still works without it |
|
||||
| No metrics data | Prometheus not found | Deploy kube-prometheus-stack in the `monitoring` namespace |
|
||||
| Metrics show only discrete GPUs | Integrated GPUs lack hwmon | Expected — iGPU driver doesn't expose hwmon power data |
|
||||
|
||||
## Contributing
|
||||
|
||||
See [CONTRIBUTING.md](CONTRIBUTING.md) for development guidelines.
|
||||
|
||||
## License
|
||||
|
||||
Apache License 2.0. See [LICENSE](LICENSE) for details.
|
||||
+22
@@ -0,0 +1,22 @@
|
||||
# Security Policy
|
||||
|
||||
## Supported Versions
|
||||
|
||||
| Version | Supported |
|
||||
|---------|-----------|
|
||||
| latest | Yes |
|
||||
|
||||
## Plugin Scope
|
||||
|
||||
This plugin is **read-only**. It does not perform any write operations against the Kubernetes cluster. It reads:
|
||||
|
||||
- Nodes
|
||||
- Pods (all namespaces)
|
||||
- GpuDevicePlugin CRDs (`deviceplugin.intel.com/v1`)
|
||||
- Prometheus metrics (via API proxy in `monitoring` namespace)
|
||||
|
||||
All data is fetched through Headlamp's built-in API proxy, which respects the user's existing RBAC permissions.
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
Please report security vulnerabilities by opening a private issue or emailing the maintainers directly.
|
||||
+19
-27
@@ -1,5 +1,5 @@
|
||||
version: "0.2.0"
|
||||
name: headlamp-intel-gpu-plugin
|
||||
version: "0.4.1"
|
||||
name: intel-gpu
|
||||
displayName: Intel GPU
|
||||
description: >-
|
||||
Headlamp plugin for Intel GPU device plugin visibility and monitoring.
|
||||
@@ -8,14 +8,14 @@ description: >-
|
||||
sections into native Node and Pod detail pages. Supports discrete (i915),
|
||||
Xe, and integrated GPU nodes with graceful degradation when the device
|
||||
plugin operator is not installed. Includes a Metrics page showing real-time
|
||||
engine utilization, GPU frequency, VRAM usage, and energy from the device
|
||||
plugin's Prometheus endpoint.
|
||||
GPU power draw and TDP from node-exporter i915 hwmon metrics (discrete GPU
|
||||
nodes only).
|
||||
createdAt: "2026-02-18T00:00:00Z"
|
||||
license: Apache-2.0
|
||||
category: monitoring-logging
|
||||
|
||||
homeURL: https://github.com/privilegedescalation/headlamp-intel-gpu-plugin
|
||||
appVersion: "0.2.0"
|
||||
appVersion: "0.4.0"
|
||||
|
||||
keywords:
|
||||
- headlamp
|
||||
@@ -45,31 +45,23 @@ links:
|
||||
url: https://intel.github.io/intel-device-plugins-for-kubernetes/
|
||||
|
||||
changes:
|
||||
- kind: added
|
||||
description: "Metrics page: real-time engine utilization, boost frequency, VRAM usage, and energy from Intel GPU device plugin Prometheus endpoint (port 9090)"
|
||||
- kind: fixed
|
||||
description: "Remove unsafe `as any` casts in NodeDetailSection"
|
||||
- kind: fixed
|
||||
description: "Fix MetricsPage fetch cancellation safety (prevent setState on unmounted component)"
|
||||
- kind: fixed
|
||||
description: "Fix typo gpuPluinPods → gpuPluginPods in data context"
|
||||
- kind: changed
|
||||
description: "Sidebar label changed to intel-gpu"
|
||||
description: "Move extractJsonData utility to module scope to avoid recreation on every render"
|
||||
- kind: removed
|
||||
description: "Removed app bar health badge"
|
||||
- kind: added
|
||||
description: "Overview dashboard: plugin health, GPU node summary, allocation bar, active GPU pods"
|
||||
- kind: added
|
||||
description: "Device Plugins page: GpuDevicePlugin CRD instances with spec/status and daemon pods"
|
||||
- kind: added
|
||||
description: "GPU Nodes page: per-node GPU type, device count, allocation, workload pods"
|
||||
- kind: added
|
||||
description: "GPU Pods page: all pods requesting Intel GPU resources with per-container detail"
|
||||
- kind: added
|
||||
description: "Node detail injection: Intel GPU section on native Node detail pages (capacity, allocatable, utilization, active pods)"
|
||||
- kind: added
|
||||
description: "Pod detail injection: GPU resource requests/limits per container on native Pod detail pages"
|
||||
- kind: added
|
||||
description: "Nodes table: GPU Type and GPU Devices columns injected into native Nodes table"
|
||||
- kind: added
|
||||
description: "App bar health badge: hidden when no Intel GPU plugin detected"
|
||||
description: "Remove dead AppBarGpuBadge component"
|
||||
- kind: fixed
|
||||
description: "Fix appVersion mismatch and inaccurate metrics description in Artifact Hub metadata"
|
||||
- kind: fixed
|
||||
description: "Resolve ESLint/Prettier indent conflict by disabling ESLint indent rule (Prettier is formatting authority)"
|
||||
|
||||
annotations:
|
||||
headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.2.0/headlamp-intel-gpu-plugin-0.2.0.tar.gz"
|
||||
headlamp/plugin/archive-checksum: "sha256:404be582bd13c167f61785028eb6eb91dd621106cbe76038f2c071a576a1a442"
|
||||
headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.4.1/intel-gpu-0.4.1.tar.gz"
|
||||
headlamp/plugin/archive-checksum: ""
|
||||
headlamp/plugin/version-compat: ">=0.20.0"
|
||||
headlamp/plugin/distro-compat: "in-cluster,web,app"
|
||||
|
||||
Generated
+4
-4
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "headlamp-intel-gpu-plugin",
|
||||
"version": "0.1.0",
|
||||
"name": "intel-gpu",
|
||||
"version": "0.4.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "headlamp-intel-gpu-plugin",
|
||||
"version": "0.1.0",
|
||||
"name": "intel-gpu",
|
||||
"version": "0.4.0",
|
||||
"license": "Apache-2.0",
|
||||
"devDependencies": {
|
||||
"@kinvolk/headlamp-plugin": "^0.13.0"
|
||||
|
||||
+8
-4
@@ -1,12 +1,16 @@
|
||||
{
|
||||
"name": "headlamp-intel-gpu-plugin",
|
||||
"version": "0.2.0",
|
||||
"name": "intel-gpu",
|
||||
"version": "0.4.1",
|
||||
"description": "Headlamp plugin for Intel GPU device plugin visibility and monitoring",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/cpfarhood/headlamp-intel-gpu-plugin.git"
|
||||
"url": "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin.git"
|
||||
},
|
||||
"author": "cpfarhood",
|
||||
"bugs": {
|
||||
"url": "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/issues"
|
||||
},
|
||||
"homepage": "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin#readme",
|
||||
"author": "privilegedescalation",
|
||||
"license": "Apache-2.0",
|
||||
"scripts": {
|
||||
"start": "headlamp-plugin start",
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
|
||||
"extends": ["config:recommended"]
|
||||
}
|
||||
@@ -65,6 +65,18 @@ export function useIntelGpuContext(): IntelGpuContextValue {
|
||||
return ctx;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Extract raw Kubernetes JSON from Headlamp KubeObject wrappers. */
|
||||
const extractJsonData = (items: unknown[]): unknown[] =>
|
||||
items.map(item =>
|
||||
item && typeof item === 'object' && 'jsonData' in item
|
||||
? (item as { jsonData: unknown }).jsonData
|
||||
: item
|
||||
);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Provider
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -116,9 +128,11 @@ export function IntelGpuDataProvider({ children }: { children: React.ReactNode }
|
||||
// Intel device plugins operator deployment
|
||||
`/api/v1/pods?labelSelector=${encodeURIComponent('app=intel-gpu-plugin')}`,
|
||||
// Alternative: by the app.kubernetes.io/name label
|
||||
`/api/v1/pods?labelSelector=${encodeURIComponent('app.kubernetes.io/name=intel-gpu-plugin')}`,
|
||||
`/api/v1/pods?labelSelector=${encodeURIComponent(
|
||||
'app.kubernetes.io/name=intel-gpu-plugin'
|
||||
)}`,
|
||||
// Intel device plugins from inteldeviceplugins-system namespace
|
||||
`/api/v1/namespaces/inteldeviceplugins-system/pods`,
|
||||
'/api/v1/namespaces/inteldeviceplugins-system/pods',
|
||||
];
|
||||
|
||||
const foundPluginPods: IntelGpuPod[] = [];
|
||||
@@ -127,8 +141,8 @@ export function IntelGpuDataProvider({ children }: { children: React.ReactNode }
|
||||
try {
|
||||
const list = await ApiProxy.request(url);
|
||||
if (!cancelled && isKubeList(list)) {
|
||||
const gpuPluinPods = filterIntelGpuPluginPods(list.items);
|
||||
foundPluginPods.push(...gpuPluinPods);
|
||||
const gpuPluginPods = filterIntelGpuPluginPods(list.items);
|
||||
foundPluginPods.push(...gpuPluginPods);
|
||||
}
|
||||
} catch {
|
||||
// Silently ignore — some selectors may not match
|
||||
@@ -155,7 +169,9 @@ export function IntelGpuDataProvider({ children }: { children: React.ReactNode }
|
||||
}
|
||||
|
||||
void fetchAsync();
|
||||
return () => { cancelled = true; };
|
||||
return () => {
|
||||
cancelled = true;
|
||||
};
|
||||
}, [refreshKey]);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -166,13 +182,6 @@ export function IntelGpuDataProvider({ children }: { children: React.ReactNode }
|
||||
// type helpers work correctly.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const extractJsonData = (items: unknown[]): unknown[] =>
|
||||
items.map(item =>
|
||||
item && typeof item === 'object' && 'jsonData' in item
|
||||
? (item as { jsonData: unknown }).jsonData
|
||||
: item
|
||||
);
|
||||
|
||||
const gpuNodes = useMemo(() => {
|
||||
if (!allNodes) return [];
|
||||
return filterIntelGpuNodes(extractJsonData(allNodes as unknown[]));
|
||||
|
||||
+4
-8
@@ -12,18 +12,18 @@ import {
|
||||
getNodeGpuCount,
|
||||
getNodeGpuType,
|
||||
getPodGpuRequests,
|
||||
type GpuDevicePlugin,
|
||||
INTEL_GPU_NODE_LABEL,
|
||||
INTEL_GPU_RESOURCE,
|
||||
INTEL_GPU_XE_RESOURCE,
|
||||
type IntelGpuNode,
|
||||
type IntelGpuPod,
|
||||
isGpuRequestingPod,
|
||||
isIntelGpuNode,
|
||||
isKubeList,
|
||||
isNodeReady,
|
||||
pluginStatusText,
|
||||
pluginStatusToStatus,
|
||||
type GpuDevicePlugin,
|
||||
type IntelGpuNode,
|
||||
type IntelGpuPod,
|
||||
} from './k8s';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -413,11 +413,7 @@ describe('formatGpuType', () => {
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('pluginStatusToStatus', () => {
|
||||
function makePlugin(
|
||||
desired: number,
|
||||
ready: number,
|
||||
unavailable = 0
|
||||
): GpuDevicePlugin {
|
||||
function makePlugin(desired: number, ready: number, unavailable = 0): GpuDevicePlugin {
|
||||
return {
|
||||
apiVersion: 'deviceplugin.intel.com/v1',
|
||||
kind: 'GpuDevicePlugin',
|
||||
|
||||
+21
-28
@@ -28,8 +28,7 @@ export const INTEL_DISCRETE_GPU_NODE_ROLE = 'node-role.kubernetes.io/gpu';
|
||||
export const INTEL_INTEGRATED_GPU_NODE_ROLE = 'node-role.kubernetes.io/igpu';
|
||||
|
||||
/** Label selector for Intel GPU device plugin DaemonSet pods */
|
||||
export const INTEL_GPU_PLUGIN_LABEL_SELECTOR =
|
||||
'app=intel-gpu-plugin';
|
||||
export const INTEL_GPU_PLUGIN_LABEL_SELECTOR = 'app=intel-gpu-plugin';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Generic Kubernetes object base shapes
|
||||
@@ -194,9 +193,12 @@ export function getNodeGpuType(node: IntelGpuNode): GpuType {
|
||||
|
||||
export function formatGpuType(type: GpuType): string {
|
||||
switch (type) {
|
||||
case 'discrete': return 'Discrete';
|
||||
case 'integrated': return 'Integrated';
|
||||
default: return 'Unknown';
|
||||
case 'discrete':
|
||||
return 'Discrete';
|
||||
case 'integrated':
|
||||
return 'Integrated';
|
||||
default:
|
||||
return 'Unknown';
|
||||
}
|
||||
}
|
||||
|
||||
@@ -272,9 +274,11 @@ export function isIntelGpuPluginPod(pod: unknown): pod is IntelGpuPod {
|
||||
const meta = obj['metadata'] as Record<string, unknown> | undefined;
|
||||
const labels = meta?.['labels'] as Record<string, string> | undefined;
|
||||
if (!labels) return false;
|
||||
return labels['app'] === 'intel-gpu-plugin' ||
|
||||
(labels['app.kubernetes.io/name'] === 'intel-gpu-plugin') ||
|
||||
(labels['component'] === 'intel-gpu-plugin');
|
||||
return (
|
||||
labels['app'] === 'intel-gpu-plugin' ||
|
||||
labels['app.kubernetes.io/name'] === 'intel-gpu-plugin' ||
|
||||
labels['component'] === 'intel-gpu-plugin'
|
||||
);
|
||||
}
|
||||
|
||||
export function filterIntelGpuPluginPods(items: unknown[]): IntelGpuPod[] {
|
||||
@@ -284,10 +288,7 @@ export function filterIntelGpuPluginPods(items: unknown[]): IntelGpuPod[] {
|
||||
/** Get total GPU requests from a pod's containers */
|
||||
export function getPodGpuRequests(pod: IntelGpuPod): Record<string, string> {
|
||||
const totals: Record<string, number> = {};
|
||||
const allContainers = [
|
||||
...(pod.spec?.containers ?? []),
|
||||
...(pod.spec?.initContainers ?? []),
|
||||
];
|
||||
const allContainers = [...(pod.spec?.containers ?? []), ...(pod.spec?.initContainers ?? [])];
|
||||
for (const c of allContainers) {
|
||||
const requests = c.resources?.requests ?? {};
|
||||
for (const [key, value] of Object.entries(requests)) {
|
||||
@@ -300,15 +301,11 @@ export function getPodGpuRequests(pod: IntelGpuPod): Record<string, string> {
|
||||
}
|
||||
|
||||
export function isPodReady(pod: IntelGpuPod): boolean {
|
||||
return (
|
||||
pod.status?.conditions?.some(c => c.type === 'Ready' && c.status === 'True') ?? false
|
||||
);
|
||||
return pod.status?.conditions?.some(c => c.type === 'Ready' && c.status === 'True') ?? false;
|
||||
}
|
||||
|
||||
export function getPodRestarts(pod: IntelGpuPod): number {
|
||||
return (
|
||||
pod.status?.containerStatuses?.reduce((sum, c) => sum + c.restartCount, 0) ?? 0
|
||||
);
|
||||
return pod.status?.containerStatuses?.reduce((sum, c) => sum + c.restartCount, 0) ?? 0;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -330,9 +327,7 @@ export function isKubeList(value: unknown): value is KubeList<unknown> {
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export function isNodeReady(node: IntelGpuNode): boolean {
|
||||
return (
|
||||
node.status?.conditions?.some(c => c.type === 'Ready' && c.status === 'True') ?? false
|
||||
);
|
||||
return node.status?.conditions?.some(c => c.type === 'Ready' && c.status === 'True') ?? false;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -359,11 +354,11 @@ export function formatAge(timestamp: string | undefined): string {
|
||||
export function formatGpuResourceName(resourceKey: string): string {
|
||||
const name = resourceKey.replace(INTEL_GPU_RESOURCE_PREFIX, '');
|
||||
const map: Record<string, string> = {
|
||||
'i915': 'GPU (i915)',
|
||||
'xe': 'GPU (Xe)',
|
||||
'millicores': 'GPU Millicores',
|
||||
i915: 'GPU (i915)',
|
||||
xe: 'GPU (Xe)',
|
||||
millicores: 'GPU Millicores',
|
||||
'memory.max': 'GPU Memory (max)',
|
||||
'tiles': 'GPU Tiles',
|
||||
tiles: 'GPU Tiles',
|
||||
};
|
||||
return map[name] ?? name;
|
||||
}
|
||||
@@ -372,9 +367,7 @@ export function formatGpuResourceName(resourceKey: string): string {
|
||||
// Status helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export function pluginStatusToStatus(
|
||||
plugin: GpuDevicePlugin
|
||||
): 'success' | 'warning' | 'error' {
|
||||
export function pluginStatusToStatus(plugin: GpuDevicePlugin): 'success' | 'warning' | 'error' {
|
||||
const desired = plugin.status?.desiredNumberScheduled ?? 0;
|
||||
const ready = plugin.status?.numberReady ?? 0;
|
||||
const unavailable = plugin.status?.numberUnavailable ?? 0;
|
||||
|
||||
+131
-220
@@ -1,16 +1,15 @@
|
||||
/**
|
||||
* Prometheus text format parser for Intel GPU device plugin metrics.
|
||||
* Intel GPU metrics via Prometheus (kube-prometheus-stack).
|
||||
*
|
||||
* Fetches raw metrics from the Intel GPU device plugin pod (port 9090)
|
||||
* via the Kubernetes API proxy and parses key metric families.
|
||||
* The Intel i915/Xe GPU driver exposes hwmon sensors that node-exporter
|
||||
* scrapes automatically. We query Prometheus for:
|
||||
* - node_hwmon_energy_joule_total (chip_name="i915") → rate = power in W
|
||||
* - node_hwmon_power_max_watt (same chip) → TDP
|
||||
* - node_hwmon_chip_names (chip_name="i915") → identify GPU chips
|
||||
* - node_uname_info → instance → nodename
|
||||
*
|
||||
* Metrics exposed by intel-gpu-plugin when enableMonitoring: true:
|
||||
* gpu_i915_engine_active_ticks — engine busy ticks (per card, engine)
|
||||
* gpu_i915_engine_total_ticks — engine total ticks (for utilization %)
|
||||
* gpu_i915_energy_microjoules — cumulative energy (µJ → power = delta/dt)
|
||||
* gpu_i915_gt_boost_freq_mhz — current GT boost frequency (MHz)
|
||||
* gpu_i915_memory_local — local (VRAM) memory usage (bytes)
|
||||
* gpu_i915_memory_system — system memory usage (bytes)
|
||||
* Queries go through the Kubernetes API proxy to the in-cluster Prometheus
|
||||
* service: /api/v1/namespaces/monitoring/services/{svc}:{port}/proxy/...
|
||||
*/
|
||||
|
||||
import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
|
||||
@@ -19,239 +18,151 @@ import { ApiProxy } from '@kinvolk/headlamp-plugin/lib';
|
||||
// Types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * A single sample taken from a Prometheus text exposition: the label set
 * attached to the sample and the numeric value recorded for it.
 */
export interface MetricSample {
  /** Label name → label value pairs parsed from the sample line. */
  labels: Record<string, string>;
  /** Numeric value of the sample. */
  value: number;
}
|
||||
|
||||
/**
 * All samples that share one metric name, together with the metadata
 * declared for that name in the exposition text.
 */
export interface MetricFamily {
  /** Metric name the samples were reported under. */
  name: string;
  /** Text of the `# HELP` line for this metric, or an empty string. */
  help: string;
  /** Type from the `# TYPE` line for this metric, or an empty string. */
  type: string;
  /** Samples collected for this metric, in the order they were parsed. */
  samples: MetricSample[];
}
|
||||
|
||||
export type ParsedMetrics = Map<string, MetricFamily>;
|
||||
|
||||
export interface GpuNodeMetrics {
|
||||
/** Node name this metric set was fetched from (via plugin pod) */
|
||||
export interface GpuChipMetrics {
|
||||
/** Kubernetes node name (e.g. "buttons") */
|
||||
nodeName: string;
|
||||
/** Pod name of the intel-gpu-plugin daemonset pod */
|
||||
podName: string;
|
||||
/** Engine utilization per (card, engine): 0–100 */
|
||||
engineUtilization: Array<{ card: string; engine: string; pct: number }>;
|
||||
/** Current GT boost frequency in MHz per card */
|
||||
boostFreqMhz: Array<{ card: string; value: number }>;
|
||||
/** Local VRAM usage in bytes per card */
|
||||
memoryLocalBytes: Array<{ card: string; value: number }>;
|
||||
/** System memory usage in bytes per card */
|
||||
memorySystemBytes: Array<{ card: string; value: number }>;
|
||||
/** Cumulative energy in µJ per card (raw counter; compute delta for power) */
|
||||
energyMicrojoules: Array<{ card: string; value: number }>;
|
||||
/** Raw parsed metric families for advanced use */
|
||||
raw: ParsedMetrics;
|
||||
/** PCI chip address (e.g. "0000:09:01_0_0000:0a:00_0") */
|
||||
chip: string;
|
||||
/** node-exporter instance (IP:port) */
|
||||
instance: string;
|
||||
/** Current power draw in watts (rate of energy counter, null if unavailable) */
|
||||
powerWatts: number | null;
|
||||
/** Maximum / TDP power in watts */
|
||||
powerMaxWatts: number | null;
|
||||
}
|
||||
|
||||
/** Result of one metrics fetch: per-chip readings plus the fetch time. */
export interface GpuMetrics {
  /** One entry per GPU chip reported by Prometheus. */
  chips: GpuChipMetrics[];
  /** ISO timestamp of when metrics were fetched */
  fetchedAt: string;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Prometheus text format parser
|
||||
// Prometheus query helper
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const LABEL_PAIR_RE = /(\w+)="([^"]*)"/g;
|
||||
|
||||
function parseLabels(labelStr: string): Record<string, string> {
|
||||
const labels: Record<string, string> = {};
|
||||
let match: RegExpExecArray | null;
|
||||
const re = new RegExp(LABEL_PAIR_RE.source, 'g');
|
||||
while ((match = re.exec(labelStr)) !== null) {
|
||||
const key = match[1];
|
||||
const val = match[2];
|
||||
if (key && val !== undefined) {
|
||||
labels[key] = val;
|
||||
}
|
||||
}
|
||||
return labels;
|
||||
interface PrometheusResult {
|
||||
metric: Record<string, string>;
|
||||
value: [number, string];
|
||||
}
|
||||
|
||||
/**
 * Parses a Prometheus text-format payload into metric families keyed by the
 * metric name of each sample.
 *
 * Line handling:
 *  - `# HELP <name> <text>` and `# TYPE <name> <type>` are remembered per
 *    metric name, so either line may appear without the other and is still
 *    attributed to the right family.
 *  - Blank lines and every other `#` line are ignored.
 *  - Sample lines are `name{labels} value [timestamp]` or
 *    `name value [timestamp]`. Fields may be separated by any run of
 *    whitespace, and a trailing timestamp is ignored.
 *  - Samples whose value does not parse to a finite number (for example
 *    `NaN` or `+Inf`) are skipped.
 *
 * A family's `help` and `type` are captured when its first sample is seen;
 * both default to an empty string when no metadata was declared.
 *
 * @param text Raw exposition text. Lines are split on `\n` and trimmed.
 * @returns Map from metric name to the family holding its samples.
 */
export function parsePrometheusText(text: string): ParsedMetrics {
  const families = new Map<string, MetricFamily>();
  // Metadata is indexed by metric name instead of "most recent HELP line" so
  // that a TYPE-only (or HELP-only) metric keeps its own metadata.
  const helpByName = new Map<string, string>();
  const typeByName = new Map<string, string>();
  const metadataLine = /^# (HELP|TYPE)\s+(\S+)(?:\s+(.*))?$/;

  for (const rawLine of text.split('\n')) {
    const line = rawLine.trim();
    if (!line) continue;

    if (line.startsWith('#')) {
      const meta = metadataLine.exec(line);
      const metaName = meta?.[2];
      if (meta && metaName) {
        const target = meta[1] === 'HELP' ? helpByName : typeByName;
        target.set(metaName, meta[3] ?? '');
      }
      // Anything else starting with '#' is a plain comment.
      continue;
    }

    const openBrace = line.indexOf('{');
    const closeBrace = line.lastIndexOf('}');

    let metricName: string;
    let labels: Record<string, string>;
    let valuePart: string;

    if (openBrace >= 0 && closeBrace > openBrace) {
      metricName = line.slice(0, openBrace).trim();
      labels = parseLabels(line.slice(openBrace + 1, closeBrace));
      valuePart = line.slice(closeBrace + 1).trim();
    } else {
      // Split at the FIRST whitespace: the name never contains whitespace,
      // while the remainder may be "value" or "value timestamp".
      const separator = line.search(/\s/);
      if (separator < 0) continue;
      metricName = line.slice(0, separator);
      labels = {};
      valuePart = line.slice(separator + 1).trim();
    }

    // Only the first token is the value; an optional timestamp may follow.
    const [valueToken = ''] = valuePart.split(/\s+/);
    const value = parseFloat(valueToken);
    if (!Number.isFinite(value)) continue;

    let family = families.get(metricName);
    if (!family) {
      family = {
        name: metricName,
        help: helpByName.get(metricName) ?? '',
        type: typeByName.get(metricName) ?? '',
        samples: [],
      };
      families.set(metricName, family);
    }

    family.samples.push({ labels, value });
  }

  return families;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Extract Intel GPU metrics from the parsed map
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Looks up the samples recorded for one metric name.
 *
 * @param families Parsed metric families keyed by metric name.
 * @param name Metric name to look up.
 * @returns The family's samples, or an empty array when the family is absent.
 */
function samplesFor(families: ParsedMetrics, name: string): MetricSample[] {
  const family = families.get(name);
  if (family == null) {
    return [];
  }
  return family.samples ?? [];
}
|
||||
|
||||
export function extractGpuNodeMetrics(
|
||||
families: ParsedMetrics,
|
||||
nodeName: string,
|
||||
podName: string
|
||||
): GpuNodeMetrics {
|
||||
const activeSamples = samplesFor(families, 'gpu_i915_engine_active_ticks');
|
||||
const totalSamples = samplesFor(families, 'gpu_i915_engine_total_ticks');
|
||||
|
||||
// Build utilization: active/total per (card, engine)
|
||||
const engineUtilization: GpuNodeMetrics['engineUtilization'] = [];
|
||||
for (const active of activeSamples) {
|
||||
const card = active.labels['card'] ?? active.labels['gpu'] ?? 'gpu0';
|
||||
const engine = active.labels['engine'] ?? 'render/0';
|
||||
const totalSample = totalSamples.find(
|
||||
s =>
|
||||
(s.labels['card'] ?? s.labels['gpu']) === card &&
|
||||
s.labels['engine'] === engine
|
||||
);
|
||||
const total = totalSample?.value ?? 0;
|
||||
const pct = total > 0 ? Math.min(100, Math.round((active.value / total) * 100)) : 0;
|
||||
engineUtilization.push({ card, engine, pct });
|
||||
}
|
||||
|
||||
// Boost frequency
|
||||
const boostFreqMhz = samplesFor(families, 'gpu_i915_gt_boost_freq_mhz').map(s => ({
|
||||
card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
|
||||
value: s.value,
|
||||
}));
|
||||
|
||||
// Memory
|
||||
const memoryLocalBytes = samplesFor(families, 'gpu_i915_memory_local').map(s => ({
|
||||
card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
|
||||
value: s.value,
|
||||
}));
|
||||
const memorySystemBytes = samplesFor(families, 'gpu_i915_memory_system').map(s => ({
|
||||
card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
|
||||
value: s.value,
|
||||
}));
|
||||
|
||||
// Energy
|
||||
const energyMicrojoules = samplesFor(families, 'gpu_i915_energy_microjoules').map(s => ({
|
||||
card: s.labels['card'] ?? s.labels['gpu'] ?? 'gpu0',
|
||||
value: s.value,
|
||||
}));
|
||||
|
||||
return {
|
||||
nodeName,
|
||||
podName,
|
||||
engineUtilization,
|
||||
boostFreqMhz,
|
||||
memoryLocalBytes,
|
||||
memorySystemBytes,
|
||||
energyMicrojoules,
|
||||
raw: families,
|
||||
interface PrometheusResponse {
|
||||
status: string;
|
||||
data: {
|
||||
resultType: string;
|
||||
result: PrometheusResult[];
|
||||
};
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Fetch metrics from an Intel GPU device plugin pod
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Fetches and parses Prometheus metrics from an Intel GPU device plugin pod.
|
||||
*
|
||||
* The proxy path is:
|
||||
* /api/v1/namespaces/{namespace}/pods/{podName}:9090/proxy/metrics
|
||||
*
|
||||
* Returns null if the pod is not exposing metrics (enableMonitoring: false)
|
||||
* or if the proxy request fails.
|
||||
* Service discovery: find the Prometheus service.
|
||||
* Tries the kube-prometheus-stack default name; falls back to prometheus-operated.
|
||||
*/
|
||||
export async function fetchGpuPluginMetrics(
|
||||
podName: string,
|
||||
namespace: string,
|
||||
nodeName: string
|
||||
): Promise<GpuNodeMetrics | null> {
|
||||
const path = `/api/v1/namespaces/${namespace}/pods/${podName}:9090/proxy/metrics`;
|
||||
const PROMETHEUS_SERVICES = [
|
||||
{ namespace: 'monitoring', service: 'kube-prometheus-stack-prometheus', port: '9090' },
|
||||
{ namespace: 'monitoring', service: 'prometheus-operated', port: '9090' },
|
||||
{ namespace: 'monitoring', service: 'prometheus', port: '9090' },
|
||||
];
|
||||
|
||||
try {
|
||||
const raw: unknown = await ApiProxy.request(path, {
|
||||
method: 'GET',
|
||||
isJSON: false,
|
||||
});
|
||||
async function queryPrometheus(query: string, prometheusPath: string): Promise<PrometheusResult[]> {
|
||||
const encoded = encodeURIComponent(query);
|
||||
const path = `${prometheusPath}/api/v1/query?query=${encoded}`;
|
||||
|
||||
if (typeof raw !== 'string') return null;
|
||||
const raw = (await ApiProxy.request(path, { method: 'GET' })) as PrometheusResponse;
|
||||
|
||||
const families = parsePrometheusText(raw);
|
||||
return extractGpuNodeMetrics(families, nodeName, podName);
|
||||
} catch {
|
||||
return null;
|
||||
if (raw?.status !== 'success') return [];
|
||||
return raw.data?.result ?? [];
|
||||
}
|
||||
|
||||
/**
 * Probes each entry of PROMETHEUS_SERVICES in order and returns the API
 * proxy base path of the first one that answers a trivial query with
 * `status: "success"`.
 *
 * A candidate whose request throws, or whose answer is not a success, is
 * skipped and the next one is tried.
 *
 * @returns The proxy base path of the first working service, or null when
 *          none of the candidates responded successfully.
 */
async function findPrometheusPath(): Promise<string | null> {
  for (const candidate of PROMETHEUS_SERVICES) {
    const proxyBase = `/api/v1/namespaces/${candidate.namespace}/services/${candidate.service}:${candidate.port}/proxy`;

    let reply: PrometheusResponse | undefined;
    try {
      reply = (await ApiProxy.request(`${proxyBase}/api/v1/query?query=1`, {
        method: 'GET',
      })) as PrometheusResponse;
    } catch {
      // This candidate is unreachable — move on to the next one.
      continue;
    }

    if (reply?.status === 'success') {
      return proxyBase;
    }
  }

  return null;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Metrics fetch
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Fetches Intel GPU (i915 hwmon) power metrics from Prometheus.
 *
 * Runs four instant queries in parallel — chip identification, current power
 * (5m rate of the energy counter), max power, and the node-exporter
 * instance → nodename mapping — and assembles one entry per identified chip.
 *
 * Power readings are matched to chips by the (instance, chip) pair, the same
 * pair the PromQL joins use. The chip label alone is not unique across nodes
 * with identical hardware. Readings that do not parse to a finite number are
 * dropped, so the corresponding field stays null.
 *
 * @returns The assembled metrics, or null when no Prometheus service could
 *          be located. Errors thrown by the underlying queries are not
 *          caught here and reject the returned promise.
 */
export async function fetchGpuMetrics(): Promise<GpuMetrics | null> {
  const prometheusPath = await findPrometheusPath();
  if (!prometheusPath) return null;

  // Run queries in parallel
  const [chipResults, energyRateResults, powerMaxResults, unameResults] = await Promise.all([
    // i915 chip identification
    queryPrometheus('node_hwmon_chip_names{chip_name="i915"}', prometheusPath),
    // Current power (rate of cumulative energy counter)
    queryPrometheus(
      'rate(node_hwmon_energy_joule_total[5m]) * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}',
      prometheusPath
    ),
    // TDP / max power
    queryPrometheus(
      'node_hwmon_power_max_watt * on(chip,instance) group_left(chip_name) node_hwmon_chip_names{chip_name="i915"}',
      prometheusPath
    ),
    // instance → nodename mapping
    queryPrometheus('node_uname_info', prometheusPath),
  ]);

  // Composite key identifying one chip on one node. NUL cannot occur in a
  // label value that came through JSON text, so the two parts cannot collide.
  const seriesKey = (metric: Record<string, string>): string =>
    `${metric['instance'] ?? ''}\u0000${metric['chip'] ?? ''}`;

  // Index finite readings by (instance, chip); series without a chip label
  // are ignored, as are values that fail to parse.
  const indexWatts = (results: typeof chipResults): Map<string, number> => {
    const wattsBySeries = new Map<string, number>();
    for (const { metric, value } of results) {
      if (!metric['chip']) continue;
      const watts = parseFloat(value[1]);
      if (Number.isFinite(watts)) wattsBySeries.set(seriesKey(metric), watts);
    }
    return wattsBySeries;
  };

  // Build instance → nodename map
  const instanceToNode = new Map<string, string>();
  for (const { metric } of unameResults) {
    const instance = metric['instance'];
    if (!instance) continue;
    instanceToNode.set(instance, metric['nodename'] ?? metric['node'] ?? instance);
  }

  const powerBySeries = indexWatts(energyRateResults);
  const maxPowerBySeries = indexWatts(powerMaxResults);

  // Assemble per-chip metrics from the chip identification results
  const chips: GpuChipMetrics[] = chipResults.map(({ metric }) => {
    const chip = metric['chip'] ?? '';
    const instance = metric['instance'] ?? '';
    const key = seriesKey(metric);

    return {
      nodeName: instanceToNode.get(instance) ?? instance,
      chip,
      instance,
      powerWatts: powerBySeries.get(key) ?? null,
      powerMaxWatts: maxPowerBySeries.get(key) ?? null,
    };
  });

  return {
    chips,
    fetchedAt: new Date().toISOString(),
  };
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Formatting helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export function formatBytes(bytes: number): string {
|
||||
if (bytes >= 1e9) return `${(bytes / 1e9).toFixed(1)} GB`;
|
||||
if (bytes >= 1e6) return `${(bytes / 1e6).toFixed(1)} MB`;
|
||||
if (bytes >= 1e3) return `${(bytes / 1e3).toFixed(1)} KB`;
|
||||
return `${bytes} B`;
|
||||
/**
 * Formats a power value for display.
 *
 * @param w Power in watts.
 * @returns The value with one decimal place followed by " W".
 */
export function formatWatts(w: number): string {
  const oneDecimal = w.toFixed(1);
  return oneDecimal + ' W';
}
|
||||
|
||||
export function formatFreq(mhz: number): string {
|
||||
return `${Math.round(mhz)} MHz`;
|
||||
/**
 * Formats `used` as a whole-number percentage of `max`.
 *
 * @param used Amount consumed, in the same unit as `max`.
 * @param max Reference maximum.
 * @returns The rounded percentage followed by "%", or "—" when no meaningful
 *          percentage exists: `max` is zero or negative, or either argument
 *          is NaN or infinite.
 */
export function formatPercent(used: number, max: number): string {
  // Guard non-finite inputs too, so a failed numeric parse upstream renders
  // as the placeholder instead of "NaN%".
  if (!Number.isFinite(used) || !Number.isFinite(max) || max <= 0) return '—';
  return `${Math.round((used / max) * 100)}%`;
}
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
/**
|
||||
* AppBarGpuBadge — compact Intel GPU health indicator in the Headlamp app bar.
|
||||
*
|
||||
* Shows a status chip in the top navigation bar summarising GPU plugin health.
|
||||
* Hides itself when no Intel GPU plugin is detected.
|
||||
*/
|
||||
|
||||
import { StatusLabel } from '@kinvolk/headlamp-plugin/lib/CommonComponents';
|
||||
import React from 'react';
|
||||
import { useIntelGpuContext } from '../api/IntelGpuDataContext';
|
||||
|
||||
/**
 * App-bar badge summarising Intel GPU plugin health.
 *
 * Renders nothing while the context is loading or when no plugin is
 * installed. Otherwise shows a status label that is "warning" when any
 * device plugin reports unavailable nodes or fewer ready nodes than desired,
 * and "success" otherwise, with the GPU node count appended when non-zero.
 */
export default function AppBarGpuBadge() {
  const { pluginInstalled, gpuNodes, devicePlugins, loading } = useIntelGpuContext();

  // Hide when loading or no plugin present
  if (loading || !pluginInstalled) {
    return null;
  }

  const anyPluginDegraded = devicePlugins.some(plugin => {
    const pluginStatus = plugin.status;
    if ((pluginStatus?.numberUnavailable ?? 0) > 0) {
      return true;
    }
    const desired = pluginStatus?.desiredNumberScheduled ?? 0;
    const ready = pluginStatus?.numberReady ?? 0;
    return desired > 0 && ready < desired;
  });

  const nodeCount = gpuNodes.length;
  const pluralSuffix = nodeCount !== 1 ? 's' : '';
  const countSuffix = nodeCount > 0 ? ` · ${nodeCount}N` : '';

  return (
    <div
      style={{
        display: 'flex',
        alignItems: 'center',
        gap: '4px',
        padding: '0 8px',
        cursor: 'default',
      }}
      title={`Intel GPU: ${nodeCount} node${pluralSuffix}`}
    >
      <StatusLabel status={anyPluginDegraded ? 'warning' : 'success'}>
        <span style={{ fontSize: '11px', fontWeight: 600 }}>
          Intel GPU{countSuffix}
        </span>
      </StatusLabel>
    </div>
  );
}
|
||||
@@ -18,8 +18,7 @@ import { useIntelGpuContext } from '../api/IntelGpuDataContext';
|
||||
import { formatAge, isPodReady, pluginStatusText, pluginStatusToStatus } from '../api/k8s';
|
||||
|
||||
export default function DevicePluginsPage() {
|
||||
const { devicePlugins, pluginPods, crdAvailable, loading, error, refresh } =
|
||||
useIntelGpuContext();
|
||||
const { devicePlugins, pluginPods, crdAvailable, loading, error, refresh } = useIntelGpuContext();
|
||||
|
||||
if (loading) {
|
||||
return <Loader title="Loading device plugin data..." />;
|
||||
@@ -27,7 +26,14 @@ export default function DevicePluginsPage() {
|
||||
|
||||
return (
|
||||
<>
|
||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
|
||||
<div
|
||||
style={{
|
||||
display: 'flex',
|
||||
justifyContent: 'space-between',
|
||||
alignItems: 'center',
|
||||
marginBottom: '20px',
|
||||
}}
|
||||
>
|
||||
<SectionHeader title="Intel GPU — Device Plugins" />
|
||||
<button
|
||||
onClick={refresh}
|
||||
@@ -102,7 +108,10 @@ export default function DevicePluginsPage() {
|
||||
)}
|
||||
|
||||
{devicePlugins.map(plugin => (
|
||||
<SectionBox key={plugin.metadata.uid ?? plugin.metadata.name} title={`GpuDevicePlugin: ${plugin.metadata.name}`}>
|
||||
<SectionBox
|
||||
key={plugin.metadata.uid ?? plugin.metadata.name}
|
||||
title={`GpuDevicePlugin: ${plugin.metadata.name}`}
|
||||
>
|
||||
<NameValueTable
|
||||
rows={[
|
||||
{
|
||||
@@ -146,14 +155,14 @@ export default function DevicePluginsPage() {
|
||||
value: String(plugin.status?.numberReady ?? '—'),
|
||||
},
|
||||
...(plugin.status?.numberUnavailable
|
||||
? [{
|
||||
name: 'Unavailable Nodes',
|
||||
value: (
|
||||
<StatusLabel status="error">
|
||||
{plugin.status.numberUnavailable}
|
||||
</StatusLabel>
|
||||
),
|
||||
}]
|
||||
? [
|
||||
{
|
||||
name: 'Unavailable Nodes',
|
||||
value: (
|
||||
<StatusLabel status="error">{plugin.status.numberUnavailable}</StatusLabel>
|
||||
),
|
||||
},
|
||||
]
|
||||
: []),
|
||||
{
|
||||
name: 'Node Selector',
|
||||
@@ -177,12 +186,12 @@ export default function DevicePluginsPage() {
|
||||
<SectionBox title="Plugin Daemon Pods">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Name', getter: (p) => p.metadata.name },
|
||||
{ label: 'Namespace', getter: (p) => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: (p) => p.spec?.nodeName ?? '—' },
|
||||
{ label: 'Name', getter: p => p.metadata.name },
|
||||
{ label: 'Namespace', getter: p => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: p => p.spec?.nodeName ?? '—' },
|
||||
{
|
||||
label: 'Ready',
|
||||
getter: (p) => (
|
||||
getter: p => (
|
||||
<StatusLabel status={isPodReady(p) ? 'success' : 'warning'}>
|
||||
{isPodReady(p) ? 'Ready' : p.status?.phase ?? 'Unknown'}
|
||||
</StatusLabel>
|
||||
@@ -190,10 +199,9 @@ export default function DevicePluginsPage() {
|
||||
},
|
||||
{
|
||||
label: 'Restarts',
|
||||
getter: (p) => {
|
||||
const restarts = p.status?.containerStatuses?.reduce(
|
||||
(sum, c) => sum + c.restartCount, 0
|
||||
) ?? 0;
|
||||
getter: p => {
|
||||
const restarts =
|
||||
p.status?.containerStatuses?.reduce((sum, c) => sum + c.restartCount, 0) ?? 0;
|
||||
return restarts > 0 ? (
|
||||
<StatusLabel status="warning">{restarts}</StatusLabel>
|
||||
) : (
|
||||
@@ -201,7 +209,7 @@ export default function DevicePluginsPage() {
|
||||
);
|
||||
},
|
||||
},
|
||||
{ label: 'Age', getter: (p) => formatAge(p.metadata.creationTimestamp) },
|
||||
{ label: 'Age', getter: p => formatAge(p.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={pluginPods}
|
||||
/>
|
||||
|
||||
+247
-202
@@ -1,9 +1,29 @@
|
||||
/**
|
||||
* MetricsPage — real-time Intel GPU metrics from the device plugin pods.
|
||||
* MetricsPage — Intel GPU metrics from Prometheus (node-exporter hwmon).
|
||||
*
|
||||
* Fetches Prometheus metrics from each Intel GPU device plugin pod (port 9090)
|
||||
* and displays per-card engine utilization, GPU frequency, memory usage,
|
||||
* and cumulative energy. Requires `enableMonitoring: true` in GpuDevicePlugin.
|
||||
* METRIC AVAILABILITY
|
||||
* -------------------
|
||||
* Power (current W, TDP)
|
||||
* Source: node_hwmon_energy_joule_total, node_hwmon_power_max_watt
|
||||
* Driver: i915 hwmon sysfs (/sys/class/drm/card{N}/device/hwmon/)
|
||||
* Scraped: node-exporter hwmon collector (enabled by default)
|
||||
* Nodes: Discrete GPU nodes only (i915 driver exposes hwmon; iGPU driver does not)
|
||||
* No extra config required — works out of the box with kube-prometheus-stack.
|
||||
*
|
||||
* GPU Frequency (current, boost, min, max MHz)
|
||||
* Source: DRM sysfs (/sys/class/drm/card{N}/gt_{x}_freq_mhz)
|
||||
* Driver: i915 kernel driver
|
||||
* Scraped: NOT available -- node-exporter --collector.drm is AMD-only and does not
|
||||
* read i915 gt_freq sysfs files. Would require a custom exporter or
|
||||
* node-exporter textfile collector sidecar writing these values.
|
||||
*
|
||||
* GPU Utilization (engine busy %)
|
||||
* Source: Not exposed via hwmon or any standard Prometheus collector for i915.
|
||||
* Would require intel-gpu-top, XPU Manager, or a custom DRM-based exporter.
|
||||
*
|
||||
* Integrated GPU (iGPU) nodes
|
||||
* The iGPU driver does not expose hwmon sensors. No Prometheus metrics are
|
||||
* available for iGPU nodes regardless of configuration.
|
||||
*/
|
||||
|
||||
import {
|
||||
@@ -16,140 +36,151 @@ import {
|
||||
import React, { useCallback, useEffect, useState } from 'react';
|
||||
import { useIntelGpuContext } from '../api/IntelGpuDataContext';
|
||||
import {
|
||||
fetchGpuPluginMetrics,
|
||||
formatBytes,
|
||||
formatFreq,
|
||||
GpuNodeMetrics,
|
||||
fetchGpuMetrics,
|
||||
formatPercent,
|
||||
formatWatts,
|
||||
GpuChipMetrics,
|
||||
GpuMetrics,
|
||||
} from '../api/metrics';
|
||||
import { IntelGpuPod } from '../api/k8s';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Utilization bar
|
||||
// Power bar
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function UtilizationBar({ pct }: { pct: number }) {
|
||||
const color = pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
|
||||
function PowerBar({ watts, maxWatts }: { watts: number; maxWatts: number | null }) {
|
||||
const pct = maxWatts && maxWatts > 0 ? Math.min(100, Math.round((watts / maxWatts) * 100)) : null;
|
||||
const color =
|
||||
pct === null ? '#0071c5' : pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
|
||||
|
||||
return (
|
||||
<div style={{ display: 'flex', alignItems: 'center', gap: '8px' }}>
|
||||
<div
|
||||
style={{
|
||||
width: '100px',
|
||||
height: '8px',
|
||||
backgroundColor: '#e0e0e0',
|
||||
borderRadius: '4px',
|
||||
overflow: 'hidden',
|
||||
flexShrink: 0,
|
||||
}}
|
||||
>
|
||||
{pct !== null && (
|
||||
<div
|
||||
style={{
|
||||
width: `${pct}%`,
|
||||
height: '100%',
|
||||
backgroundColor: color,
|
||||
width: '100px',
|
||||
height: '8px',
|
||||
backgroundColor: '#e0e0e0',
|
||||
borderRadius: '4px',
|
||||
transition: 'width 0.3s ease',
|
||||
overflow: 'hidden',
|
||||
flexShrink: 0,
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
<span style={{ fontSize: '12px', fontVariantNumeric: 'tabular-nums' }}>{pct}%</span>
|
||||
>
|
||||
<div
|
||||
style={{
|
||||
width: `${pct}%`,
|
||||
height: '100%',
|
||||
backgroundColor: color,
|
||||
borderRadius: '4px',
|
||||
transition: 'width 0.4s ease',
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
<span style={{ fontSize: '13px', fontVariantNumeric: 'tabular-nums' }}>
|
||||
{formatWatts(watts)}
|
||||
{maxWatts !== null && maxWatts > 0 && (
|
||||
<span style={{ color: '#888', marginLeft: '4px' }}>
|
||||
/ {formatWatts(maxWatts)} ({formatPercent(watts, maxWatts)})
|
||||
</span>
|
||||
)}
|
||||
</span>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per-node metrics card
|
||||
// Per-chip card
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) {
|
||||
const { nodeName, podName, engineUtilization, boostFreqMhz, memoryLocalBytes, memorySystemBytes, energyMicrojoules } = metrics;
|
||||
function GpuChipCard({ chip }: { chip: GpuChipMetrics }) {
|
||||
const rows: Array<{ name: string; value: React.ReactNode }> = [
|
||||
{ name: 'Node', value: chip.nodeName },
|
||||
{ name: 'GPU (PCI)', value: chip.chip },
|
||||
{
|
||||
name: 'Current Power',
|
||||
value:
|
||||
chip.powerWatts !== null ? (
|
||||
<PowerBar watts={chip.powerWatts} maxWatts={chip.powerMaxWatts} />
|
||||
) : (
|
||||
<StatusLabel status="warning">No data — needs ≥5m of scrape history</StatusLabel>
|
||||
),
|
||||
},
|
||||
];
|
||||
|
||||
// Group engines by card
|
||||
const byCard = new Map<string, typeof engineUtilization>();
|
||||
for (const e of engineUtilization) {
|
||||
if (!byCard.has(e.card)) byCard.set(e.card, []);
|
||||
byCard.get(e.card)!.push(e);
|
||||
}
|
||||
|
||||
const freqByCard = new Map(boostFreqMhz.map(f => [f.card, f.value]));
|
||||
const memLocalByCard = new Map(memoryLocalBytes.map(m => [m.card, m.value]));
|
||||
const memSysByCard = new Map(memorySystemBytes.map(m => [m.card, m.value]));
|
||||
const energyByCard = new Map(energyMicrojoules.map(e => [e.card, e.value]));
|
||||
|
||||
const cards = Array.from(
|
||||
new Set([
|
||||
...byCard.keys(),
|
||||
...freqByCard.keys(),
|
||||
...memLocalByCard.keys(),
|
||||
])
|
||||
).sort();
|
||||
|
||||
if (cards.length === 0) {
|
||||
return (
|
||||
<SectionBox title={`${nodeName} — No Metric Data`}>
|
||||
<NameValueTable
|
||||
rows={[
|
||||
{
|
||||
name: 'Pod',
|
||||
value: podName,
|
||||
},
|
||||
{
|
||||
name: 'Note',
|
||||
value: 'No GPU metrics found. Ensure enableMonitoring: true is set in GpuDevicePlugin.',
|
||||
},
|
||||
]}
|
||||
/>
|
||||
</SectionBox>
|
||||
);
|
||||
if (chip.powerMaxWatts !== null && chip.powerMaxWatts > 0) {
|
||||
rows.push({ name: 'TDP', value: formatWatts(chip.powerMaxWatts) });
|
||||
}
|
||||
|
||||
return (
|
||||
<>
|
||||
{cards.map(card => {
|
||||
const engines = byCard.get(card) ?? [];
|
||||
const freq = freqByCard.get(card);
|
||||
const memLocal = memLocalByCard.get(card);
|
||||
const memSys = memSysByCard.get(card);
|
||||
const energy = energyByCard.get(card);
|
||||
<SectionBox title={`${chip.nodeName} — ${chip.chip}`}>
|
||||
<NameValueTable rows={rows} />
|
||||
</SectionBox>
|
||||
);
|
||||
}
|
||||
|
||||
const rows: Array<{ name: string; value: React.ReactNode }> = [
|
||||
{ name: 'Node', value: nodeName },
|
||||
{ name: 'Plugin Pod', value: podName },
|
||||
{ name: 'GPU Card', value: card },
|
||||
];
|
||||
// ---------------------------------------------------------------------------
|
||||
// Requirements info box
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
if (freq !== undefined) {
|
||||
rows.push({ name: 'Boost Frequency', value: formatFreq(freq) });
|
||||
}
|
||||
|
||||
if (memLocal !== undefined) {
|
||||
rows.push({ name: 'VRAM (local)', value: formatBytes(memLocal) });
|
||||
}
|
||||
if (memSys !== undefined && memSys > 0) {
|
||||
rows.push({ name: 'System Memory', value: formatBytes(memSys) });
|
||||
}
|
||||
|
||||
if (energy !== undefined) {
|
||||
rows.push({
|
||||
name: 'Energy (cumulative)',
|
||||
value: `${(energy / 1e6).toFixed(2)} J`,
|
||||
});
|
||||
}
|
||||
|
||||
// Engine utilization rows
|
||||
for (const e of engines) {
|
||||
rows.push({
|
||||
name: `Engine: ${e.engine}`,
|
||||
value: <UtilizationBar pct={e.pct} />,
|
||||
});
|
||||
}
|
||||
|
||||
return (
|
||||
<SectionBox key={`${nodeName}-${card}`} title={`${nodeName} — ${card}`}>
|
||||
<NameValueTable rows={rows} />
|
||||
</SectionBox>
|
||||
);
|
||||
})}
|
||||
</>
|
||||
/**
 * Static "Metric Availability" section: one row per metric category with a
 * status label and a short explanation of where the data comes from or why
 * it is unavailable.
 */
function MetricRequirements() {
  // Shared styling for the explanatory text under each status label.
  const noteStyle = { marginTop: '4px', fontSize: '12px', color: '#666' };

  const powerRow = {
    name: 'Power (W)',
    value: (
      <>
        <StatusLabel status="success">Available — discrete GPU nodes</StatusLabel>
        <div style={noteStyle}>
          Source: <code>node_hwmon_energy_joule_total</code> via node-exporter hwmon collector
          (enabled by default). Requires the i915 kernel driver on the node. iGPU nodes do not
          expose hwmon sensors.
        </div>
      </>
    ),
  };

  const frequencyRow = {
    name: 'Frequency (MHz)',
    value: (
      <>
        <StatusLabel status="error">Not available</StatusLabel>
        <div style={noteStyle}>
          i915 exposes <code>gt_*_freq_mhz</code> via DRM sysfs but node-exporter's{' '}
          <code>--collector.drm</code> flag is AMD-only and does not read these files. A custom
          exporter or textfile-collector sidecar writing these values would be required.
        </div>
      </>
    ),
  };

  const utilizationRow = {
    name: 'Utilization (%)',
    value: (
      <>
        <StatusLabel status="error">Not available</StatusLabel>
        <div style={noteStyle}>
          No standard Prometheus collector exposes i915 engine busy percentage. Would require
          intel-gpu-top, XPU Manager, or a custom DRM-based exporter.
        </div>
      </>
    ),
  };

  const integratedGpuRow = {
    name: 'iGPU nodes',
    value: (
      <>
        <StatusLabel status="error">No metrics available</StatusLabel>
        <div style={noteStyle}>
          The integrated GPU driver does not expose hwmon sensors. No Prometheus metrics are
          available for iGPU nodes regardless of configuration.
        </div>
      </>
    ),
  };

  return (
    <SectionBox title="Metric Availability">
      <NameValueTable rows={[powerRow, frequencyRow, utilizationRow, integratedGpuRow]} />
    </SectionBox>
  );
}
|
||||
|
||||
@@ -158,38 +189,46 @@ function NodeMetricsCard({ metrics }: { metrics: GpuNodeMetrics }) {
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export default function MetricsPage() {
|
||||
const { pluginPods, pluginInstalled, loading: ctxLoading } = useIntelGpuContext();
|
||||
const { gpuNodes, loading: ctxLoading } = useIntelGpuContext();
|
||||
|
||||
const [metricsMap, setMetricsMap] = useState<Map<string, GpuNodeMetrics | 'error'>>(new Map());
|
||||
const [metrics, setMetrics] = useState<GpuMetrics | null>(null);
|
||||
const [fetchError, setFetchError] = useState<string | null>(null);
|
||||
const [fetching, setFetching] = useState(false);
|
||||
const [fetchSeq, setFetchSeq] = useState(0);
|
||||
|
||||
const fetchAll = useCallback(async (pods: IntelGpuPod[]) => {
|
||||
if (pods.length === 0) return;
|
||||
setFetching(true);
|
||||
|
||||
const results = await Promise.all(
|
||||
pods.map(async pod => {
|
||||
const name = pod.metadata.name;
|
||||
const namespace = pod.metadata.namespace ?? 'kube-system';
|
||||
const nodeName = pod.spec?.nodeName ?? name;
|
||||
const result = await fetchGpuPluginMetrics(name, namespace, nodeName);
|
||||
return { name, result };
|
||||
})
|
||||
);
|
||||
|
||||
const map = new Map<string, GpuNodeMetrics | 'error'>();
|
||||
for (const { name, result } of results) {
|
||||
map.set(name, result ?? 'error');
|
||||
}
|
||||
setMetricsMap(map);
|
||||
setFetching(false);
|
||||
const doFetch = useCallback(() => {
|
||||
setFetchSeq(s => s + 1);
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
if (!ctxLoading && pluginPods.length > 0) {
|
||||
void fetchAll(pluginPods);
|
||||
}
|
||||
}, [ctxLoading, pluginPods, fetchAll]);
|
||||
if (ctxLoading) return;
|
||||
|
||||
let cancelled = false;
|
||||
setFetching(true);
|
||||
setFetchError(null);
|
||||
|
||||
fetchGpuMetrics()
|
||||
.then(result => {
|
||||
if (cancelled) return;
|
||||
setMetrics(result);
|
||||
if (!result) {
|
||||
setFetchError(
|
||||
'Could not reach Prometheus. Ensure kube-prometheus-stack is installed in the monitoring namespace.'
|
||||
);
|
||||
}
|
||||
})
|
||||
.catch((e: unknown) => {
|
||||
if (cancelled) return;
|
||||
setFetchError(e instanceof Error ? e.message : String(e));
|
||||
})
|
||||
.finally(() => {
|
||||
if (!cancelled) setFetching(false);
|
||||
});
|
||||
|
||||
return () => {
|
||||
cancelled = true;
|
||||
};
|
||||
}, [ctxLoading, fetchSeq]);
|
||||
|
||||
if (ctxLoading) {
|
||||
return <Loader title="Loading Intel GPU data..." />;
|
||||
@@ -197,11 +236,18 @@ export default function MetricsPage() {
|
||||
|
||||
return (
|
||||
<>
|
||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
|
||||
<div
|
||||
style={{
|
||||
display: 'flex',
|
||||
justifyContent: 'space-between',
|
||||
alignItems: 'center',
|
||||
marginBottom: '20px',
|
||||
}}
|
||||
>
|
||||
<SectionHeader title="Intel GPU — Metrics" />
|
||||
<button
|
||||
onClick={() => void fetchAll(pluginPods)}
|
||||
disabled={fetching || pluginPods.length === 0}
|
||||
onClick={() => void doFetch()}
|
||||
disabled={fetching}
|
||||
aria-label="Refresh metrics"
|
||||
style={{
|
||||
padding: '6px 16px',
|
||||
@@ -218,94 +264,93 @@ export default function MetricsPage() {
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{!pluginInstalled && (
|
||||
<SectionBox title="Intel GPU Plugin Not Detected">
|
||||
<MetricRequirements />
|
||||
|
||||
{fetching && !metrics && <Loader title="Querying Prometheus for GPU metrics..." />}
|
||||
|
||||
{fetchError && (
|
||||
<SectionBox title="Prometheus Unreachable">
|
||||
<NameValueTable
|
||||
rows={[
|
||||
{
|
||||
name: 'Status',
|
||||
value: (
|
||||
<StatusLabel status="warning">No Intel GPU device plugin pods found</StatusLabel>
|
||||
),
|
||||
name: 'Error',
|
||||
value: <StatusLabel status="error">{fetchError}</StatusLabel>,
|
||||
},
|
||||
{
|
||||
name: 'Note',
|
||||
value: 'Install the Intel GPU device plugin and set enableMonitoring: true to expose Prometheus metrics.',
|
||||
name: 'Checked services',
|
||||
value:
|
||||
'kube-prometheus-stack-prometheus:9090, prometheus-operated:9090, prometheus:9090 (monitoring namespace)',
|
||||
},
|
||||
]}
|
||||
/>
|
||||
</SectionBox>
|
||||
)}
|
||||
|
||||
{pluginInstalled && pluginPods.length === 0 && (
|
||||
<SectionBox title="No Plugin Pods Found">
|
||||
<NameValueTable
|
||||
rows={[
|
||||
{
|
||||
name: 'Status',
|
||||
value: (
|
||||
<StatusLabel status="warning">Plugin detected via CRD but no pods found</StatusLabel>
|
||||
),
|
||||
},
|
||||
]}
|
||||
/>
|
||||
</SectionBox>
|
||||
)}
|
||||
|
||||
{pluginPods.length > 0 && metricsMap.size === 0 && fetching && (
|
||||
<Loader title="Fetching GPU metrics..." />
|
||||
)}
|
||||
|
||||
{pluginPods.length > 0 && metricsMap.size === 0 && !fetching && (
|
||||
<SectionBox title="Metrics Unavailable">
|
||||
{metrics && metrics.chips.length === 0 && (
|
||||
<SectionBox title="No i915 Metrics in Prometheus">
|
||||
<NameValueTable
|
||||
rows={[
|
||||
{
|
||||
name: 'Status',
|
||||
value: (
|
||||
<StatusLabel status="warning">
|
||||
Could not fetch metrics from any plugin pod
|
||||
Prometheus reachable — no
|
||||
node_hwmon_chip_names{chip_name="i915"} found
|
||||
</StatusLabel>
|
||||
),
|
||||
},
|
||||
{
|
||||
name: 'Requirements',
|
||||
value: 'Set enableMonitoring: true in GpuDevicePlugin spec and ensure port 9090 is accessible via kube-apiserver proxy.',
|
||||
name: 'GPU Nodes',
|
||||
value:
|
||||
gpuNodes.length > 0
|
||||
? gpuNodes.map(n => n.metadata.name).join(', ')
|
||||
: 'None detected',
|
||||
},
|
||||
{
|
||||
name: 'Plugin Pods Found',
|
||||
value: pluginPods.map(p => p.metadata.name).join(', '),
|
||||
name: 'Likely cause',
|
||||
value:
|
||||
'node-exporter is not running on the GPU nodes, or the hwmon collector is disabled.',
|
||||
},
|
||||
]}
|
||||
/>
|
||||
</SectionBox>
|
||||
)}
|
||||
|
||||
{Array.from(metricsMap.entries()).map(([podName, metrics]) => {
|
||||
if (metrics === 'error') {
|
||||
return (
|
||||
<SectionBox key={podName} title={`${podName} — Metrics Unavailable`}>
|
||||
<NameValueTable
|
||||
rows={[
|
||||
{
|
||||
name: 'Status',
|
||||
value: (
|
||||
<StatusLabel status="error">
|
||||
Failed to fetch metrics from pod
|
||||
</StatusLabel>
|
||||
),
|
||||
},
|
||||
{
|
||||
name: 'Hint',
|
||||
value: 'Ensure enableMonitoring: true is set in the GpuDevicePlugin CR and the pod is running.',
|
||||
},
|
||||
]}
|
||||
/>
|
||||
</SectionBox>
|
||||
);
|
||||
}
|
||||
return <NodeMetricsCard key={podName} metrics={metrics} />;
|
||||
})}
|
||||
{metrics && metrics.chips.length > 0 && (
|
||||
<>
|
||||
<SectionBox title="GPU Power Summary">
|
||||
<NameValueTable
|
||||
rows={[
|
||||
{
|
||||
name: 'GPUs Monitored',
|
||||
value: String(metrics.chips.length),
|
||||
},
|
||||
{
|
||||
name: 'Total Power',
|
||||
value: (() => {
|
||||
const total = metrics.chips.reduce((s, c) => s + (c.powerWatts ?? 0), 0);
|
||||
const maxTotal = metrics.chips.reduce((s, c) => s + (c.powerMaxWatts ?? 0), 0);
|
||||
return <PowerBar watts={total} maxWatts={maxTotal > 0 ? maxTotal : null} />;
|
||||
})(),
|
||||
},
|
||||
{
|
||||
name: 'Last Fetched',
|
||||
value: new Date(metrics.fetchedAt).toLocaleTimeString(),
|
||||
},
|
||||
{
|
||||
name: 'Query',
|
||||
value:
|
||||
'rate(node_hwmon_energy_joule_total[5m]) joined with node_hwmon_chip_names{chip_name="i915"}',
|
||||
},
|
||||
]}
|
||||
/>
|
||||
</SectionBox>
|
||||
|
||||
{metrics.chips.map(chip => (
|
||||
<GpuChipCard key={`${chip.instance}-${chip.chip}`} chip={chip} />
|
||||
))}
|
||||
</>
|
||||
)}
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
||||
@@ -19,10 +19,8 @@ import {
|
||||
getGpuResources,
|
||||
getNodeGpuType,
|
||||
INTEL_GPU_RESOURCE,
|
||||
INTEL_GPU_RESOURCE_PREFIX,
|
||||
INTEL_GPU_XE_RESOURCE,
|
||||
isIntelGpuNode,
|
||||
isNodeReady,
|
||||
} from '../api/k8s';
|
||||
|
||||
interface NodeDetailSectionProps {
|
||||
@@ -40,9 +38,7 @@ export default function NodeDetailSection({ resource }: NodeDetailSectionProps)
|
||||
|
||||
// Extract the raw Kubernetes JSON — Headlamp KubeObject wraps it in jsonData
|
||||
const rawNode =
|
||||
resource.jsonData && typeof resource.jsonData === 'object'
|
||||
? resource.jsonData
|
||||
: resource;
|
||||
resource.jsonData && typeof resource.jsonData === 'object' ? resource.jsonData : resource;
|
||||
|
||||
// Only render for Node resources that have Intel GPU
|
||||
if (!isIntelGpuNode(rawNode)) return null;
|
||||
@@ -56,16 +52,14 @@ export default function NodeDetailSection({ resource }: NodeDetailSectionProps)
|
||||
metadata: { name: string; labels?: Record<string, string> };
|
||||
};
|
||||
|
||||
const nodeName = (node as { metadata: { name: string } }).metadata.name;
|
||||
const capacity = getGpuResources((node as any).status?.capacity);
|
||||
const allocatable = getGpuResources((node as any).status?.allocatable);
|
||||
const nodeName = node.metadata.name;
|
||||
const capacity = getGpuResources(node.status?.capacity);
|
||||
const allocatable = getGpuResources(node.status?.allocatable);
|
||||
|
||||
const gpuType = getNodeGpuType(node as any);
|
||||
const gpuType = getNodeGpuType(node);
|
||||
|
||||
// Find GPU pods scheduled on this node
|
||||
const podsOnNode = loading
|
||||
? []
|
||||
: gpuPods.filter(p => p.spec?.nodeName === nodeName);
|
||||
const podsOnNode = loading ? [] : gpuPods.filter(p => p.spec?.nodeName === nodeName);
|
||||
|
||||
if (Object.keys(capacity).length === 0 && Object.keys(allocatable).length === 0) {
|
||||
return null;
|
||||
@@ -81,18 +75,18 @@ export default function NodeDetailSection({ resource }: NodeDetailSectionProps)
|
||||
}
|
||||
}
|
||||
for (const pod of podsOnNode.filter(p => p.status?.phase === 'Running')) {
|
||||
const reqs = pod.spec?.containers?.flatMap(c =>
|
||||
Object.entries(c.resources?.requests ?? {}).filter(([k]) =>
|
||||
k === INTEL_GPU_RESOURCE || k === INTEL_GPU_XE_RESOURCE
|
||||
)
|
||||
) ?? [];
|
||||
const reqs =
|
||||
pod.spec?.containers?.flatMap(c =>
|
||||
Object.entries(c.resources?.requests ?? {}).filter(
|
||||
([k]) => k === INTEL_GPU_RESOURCE || k === INTEL_GPU_XE_RESOURCE
|
||||
)
|
||||
) ?? [];
|
||||
for (const [, val] of reqs) {
|
||||
gpuInUse += parseInt(val, 10) || 0;
|
||||
}
|
||||
}
|
||||
|
||||
const utilizationPct =
|
||||
gpuAllocatable > 0 ? Math.round((gpuInUse / gpuAllocatable) * 100) : 0;
|
||||
const utilizationPct = gpuAllocatable > 0 ? Math.round((gpuInUse / gpuAllocatable) * 100) : 0;
|
||||
const utilizationStatus: 'success' | 'warning' | 'error' =
|
||||
utilizationPct >= 90 ? 'error' : utilizationPct >= 70 ? 'warning' : 'success';
|
||||
|
||||
|
||||
@@ -23,7 +23,6 @@ import {
|
||||
getNodeGpuCount,
|
||||
getNodeGpuType,
|
||||
INTEL_GPU_RESOURCE,
|
||||
INTEL_GPU_RESOURCE_PREFIX,
|
||||
INTEL_GPU_XE_RESOURCE,
|
||||
IntelGpuNode,
|
||||
isNodeReady,
|
||||
@@ -33,13 +32,7 @@ import {
|
||||
// GPU allocation bar component
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function GpuAllocationBar({
|
||||
used,
|
||||
allocatable,
|
||||
}: {
|
||||
used: number;
|
||||
allocatable: number;
|
||||
}) {
|
||||
function GpuAllocationBar({ used, allocatable }: { used: number; allocatable: number }) {
|
||||
if (allocatable === 0) return <span>—</span>;
|
||||
const pct = Math.min(100, Math.round((used / allocatable) * 100));
|
||||
const color = pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
|
||||
@@ -105,21 +98,18 @@ function NodeDetailCard({
|
||||
name: 'GPU Type',
|
||||
value: formatGpuType(gpuType),
|
||||
},
|
||||
...(gpuCount > 0
|
||||
? [{ name: 'GPU Devices (i915/xe)', value: String(gpuCount) }]
|
||||
: []),
|
||||
...(gpuCount > 0 ? [{ name: 'GPU Devices (i915/xe)', value: String(gpuCount) }] : []),
|
||||
...Object.entries(capacityResources).map(([key, cap]) => {
|
||||
const alloc = parseInt(allocatableResources[key] ?? '0', 10);
|
||||
const total = parseInt(cap, 10);
|
||||
return {
|
||||
name: `${formatGpuResourceName(key)} (capacity)`,
|
||||
value: String(total),
|
||||
};
|
||||
}),
|
||||
...Object.entries(allocatableResources).map(([key, alloc]) => {
|
||||
...Object.entries(allocatableResources).map(([key, value]) => {
|
||||
return {
|
||||
name: `${formatGpuResourceName(key)} (allocatable)`,
|
||||
value: alloc ?? '0',
|
||||
value: value ?? '0',
|
||||
};
|
||||
}),
|
||||
{
|
||||
@@ -200,7 +190,14 @@ export default function NodesPage() {
|
||||
|
||||
return (
|
||||
<>
|
||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
|
||||
<div
|
||||
style={{
|
||||
display: 'flex',
|
||||
justifyContent: 'space-between',
|
||||
alignItems: 'center',
|
||||
marginBottom: '20px',
|
||||
}}
|
||||
>
|
||||
<SectionHeader title="Intel GPU — Nodes" />
|
||||
<button
|
||||
onClick={refresh}
|
||||
@@ -256,28 +253,28 @@ export default function NodesPage() {
|
||||
<SectionBox title="GPU Node Summary">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Node', getter: (d) => d.node.metadata.name },
|
||||
{ label: 'Node', getter: d => d.node.metadata.name },
|
||||
{
|
||||
label: 'Ready',
|
||||
getter: (d) => (
|
||||
getter: d => (
|
||||
<StatusLabel status={d.ready ? 'success' : 'error'}>
|
||||
{d.ready ? 'Ready' : 'Not Ready'}
|
||||
</StatusLabel>
|
||||
),
|
||||
},
|
||||
{ label: 'GPU Type', getter: (d) => formatGpuType(d.gpuType) },
|
||||
{ label: 'GPU Devices', getter: (d) => String(d.gpuCount || '—') },
|
||||
{ label: 'GPU Type', getter: d => formatGpuType(d.gpuType) },
|
||||
{ label: 'GPU Devices', getter: d => String(d.gpuCount || '—') },
|
||||
{
|
||||
label: 'Allocation',
|
||||
getter: (d) => (
|
||||
getter: d => (
|
||||
<GpuAllocationBar
|
||||
used={d.podsOnNode.length}
|
||||
allocatable={d.totalAllocatable || d.gpuCount}
|
||||
/>
|
||||
),
|
||||
},
|
||||
{ label: 'GPU Pods', getter: (d) => String(d.podsOnNode.length) },
|
||||
{ label: 'Age', getter: (d) => formatAge(d.node.metadata.creationTimestamp) },
|
||||
{ label: 'GPU Pods', getter: d => String(d.podsOnNode.length) },
|
||||
{ label: 'Age', getter: d => formatAge(d.node.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={tableData}
|
||||
/>
|
||||
|
||||
@@ -18,7 +18,6 @@ import React from 'react';
|
||||
import { useIntelGpuContext } from '../api/IntelGpuDataContext';
|
||||
import {
|
||||
formatAge,
|
||||
formatGpuType,
|
||||
getNodeGpuCount,
|
||||
getNodeGpuType,
|
||||
getPodGpuRequests,
|
||||
@@ -42,7 +41,8 @@ function gpuTypeChartData(
|
||||
): Array<{ name: string; value: number; fill: string }> {
|
||||
const data = [];
|
||||
if (discreteCount > 0) data.push({ name: 'Discrete', value: discreteCount, fill: '#0071c5' });
|
||||
if (integratedCount > 0) data.push({ name: 'Integrated', value: integratedCount, fill: '#60a4dc' });
|
||||
if (integratedCount > 0)
|
||||
data.push({ name: 'Integrated', value: integratedCount, fill: '#60a4dc' });
|
||||
if (unknownCount > 0) data.push({ name: 'Unknown', value: unknownCount, fill: '#9e9e9e' });
|
||||
return data;
|
||||
}
|
||||
@@ -113,9 +113,7 @@ export default function OverviewPage() {
|
||||
}
|
||||
|
||||
const gpuUtilizationPct =
|
||||
totalCapacityGpus > 0
|
||||
? Math.round((totalAllocatedGpus / totalCapacityGpus) * 100)
|
||||
: 0;
|
||||
totalCapacityGpus > 0 ? Math.round((totalAllocatedGpus / totalCapacityGpus) * 100) : 0;
|
||||
|
||||
const chartData = gpuTypeChartData(discreteCount, integratedCount, unknownCount);
|
||||
const totalGpuNodes = gpuNodes.length;
|
||||
@@ -133,7 +131,14 @@ export default function OverviewPage() {
|
||||
|
||||
return (
|
||||
<>
|
||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
|
||||
<div
|
||||
style={{
|
||||
display: 'flex',
|
||||
justifyContent: 'space-between',
|
||||
alignItems: 'center',
|
||||
marginBottom: '20px',
|
||||
}}
|
||||
>
|
||||
<SectionHeader title="Intel GPU — Overview" />
|
||||
<button
|
||||
onClick={refresh}
|
||||
@@ -218,26 +223,25 @@ export default function OverviewPage() {
|
||||
<SectionBox title="Device Plugin Status">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Name', getter: (p) => p.metadata.name },
|
||||
{ label: 'Name', getter: p => p.metadata.name },
|
||||
{
|
||||
label: 'Status',
|
||||
getter: (p) => (
|
||||
<StatusLabel status={pluginStatusToStatus(p)}>
|
||||
{pluginStatusText(p)}
|
||||
</StatusLabel>
|
||||
getter: p => (
|
||||
<StatusLabel status={pluginStatusToStatus(p)}>{pluginStatusText(p)}</StatusLabel>
|
||||
),
|
||||
},
|
||||
{
|
||||
label: 'Monitoring',
|
||||
getter: (p) => p.spec.enableMonitoring ? (
|
||||
<StatusLabel status="success">Enabled</StatusLabel>
|
||||
) : (
|
||||
<StatusLabel status="warning">Disabled</StatusLabel>
|
||||
),
|
||||
getter: p =>
|
||||
p.spec.enableMonitoring ? (
|
||||
<StatusLabel status="success">Enabled</StatusLabel>
|
||||
) : (
|
||||
<StatusLabel status="warning">Disabled</StatusLabel>
|
||||
),
|
||||
},
|
||||
{ label: 'Shared/Node', getter: (p) => String(p.spec.sharedDevNum ?? 1) },
|
||||
{ label: 'Policy', getter: (p) => p.spec.preferredAllocationPolicy ?? '—' },
|
||||
{ label: 'Age', getter: (p) => formatAge(p.metadata.creationTimestamp) },
|
||||
{ label: 'Shared/Node', getter: p => String(p.spec.sharedDevNum ?? 1) },
|
||||
{ label: 'Policy', getter: p => p.spec.preferredAllocationPolicy ?? '—' },
|
||||
{ label: 'Age', getter: p => formatAge(p.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={devicePlugins}
|
||||
/>
|
||||
@@ -249,18 +253,18 @@ export default function OverviewPage() {
|
||||
<SectionBox title="Plugin Daemon Pods">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Name', getter: (p) => p.metadata.name },
|
||||
{ label: 'Namespace', getter: (p) => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: (p) => p.spec?.nodeName ?? '—' },
|
||||
{ label: 'Name', getter: p => p.metadata.name },
|
||||
{ label: 'Namespace', getter: p => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: p => p.spec?.nodeName ?? '—' },
|
||||
{
|
||||
label: 'Status',
|
||||
getter: (p) => (
|
||||
getter: p => (
|
||||
<StatusLabel status={isPodReady(p) ? 'success' : 'warning'}>
|
||||
{isPodReady(p) ? 'Ready' : p.status?.phase ?? 'Unknown'}
|
||||
</StatusLabel>
|
||||
),
|
||||
},
|
||||
{ label: 'Age', getter: (p) => formatAge(p.metadata.creationTimestamp) },
|
||||
{ label: 'Age', getter: p => formatAge(p.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={pluginPods}
|
||||
/>
|
||||
@@ -271,7 +275,13 @@ export default function OverviewPage() {
|
||||
<SectionBox title="GPU Nodes">
|
||||
{totalGpuNodes > 0 && chartData.length > 0 && (
|
||||
<div style={{ marginBottom: '16px' }}>
|
||||
<div style={{ marginBottom: '8px', fontSize: '14px', color: 'var(--mui-palette-text-secondary)' }}>
|
||||
<div
|
||||
style={{
|
||||
marginBottom: '8px',
|
||||
fontSize: '14px',
|
||||
color: 'var(--mui-palette-text-secondary)',
|
||||
}}
|
||||
>
|
||||
GPU Type Distribution
|
||||
</div>
|
||||
<PercentageBar data={chartData} total={totalGpuNodes} />
|
||||
@@ -288,9 +298,15 @@ export default function OverviewPage() {
|
||||
),
|
||||
},
|
||||
{ name: 'Ready Nodes', value: String(readyNodeCount) },
|
||||
...(discreteCount > 0 ? [{ name: 'Discrete GPU Nodes', value: String(discreteCount) }] : []),
|
||||
...(integratedCount > 0 ? [{ name: 'Integrated GPU Nodes', value: String(integratedCount) }] : []),
|
||||
...(totalGpuCount > 0 ? [{ name: 'Total GPU Devices', value: String(totalGpuCount) }] : []),
|
||||
...(discreteCount > 0
|
||||
? [{ name: 'Discrete GPU Nodes', value: String(discreteCount) }]
|
||||
: []),
|
||||
...(integratedCount > 0
|
||||
? [{ name: 'Integrated GPU Nodes', value: String(integratedCount) }]
|
||||
: []),
|
||||
...(totalGpuCount > 0
|
||||
? [{ name: 'Total GPU Devices', value: String(totalGpuCount) }]
|
||||
: []),
|
||||
]}
|
||||
/>
|
||||
</SectionBox>
|
||||
@@ -299,13 +315,23 @@ export default function OverviewPage() {
|
||||
{totalCapacityGpus > 0 && (
|
||||
<SectionBox title="GPU Allocation">
|
||||
<div style={{ marginBottom: '16px' }}>
|
||||
<div style={{ marginBottom: '8px', fontSize: '14px', color: 'var(--mui-palette-text-secondary)' }}>
|
||||
<div
|
||||
style={{
|
||||
marginBottom: '8px',
|
||||
fontSize: '14px',
|
||||
color: 'var(--mui-palette-text-secondary)',
|
||||
}}
|
||||
>
|
||||
GPU Utilization ({gpuUtilizationPct}%)
|
||||
</div>
|
||||
<PercentageBar
|
||||
data={[
|
||||
{ name: 'In Use', value: totalAllocatedGpus, fill: '#0071c5' },
|
||||
{ name: 'Available', value: totalAllocatableGpus - totalAllocatedGpus, fill: '#e0e0e0' },
|
||||
{
|
||||
name: 'Available',
|
||||
value: totalAllocatableGpus - totalAllocatedGpus,
|
||||
fill: '#e0e0e0',
|
||||
},
|
||||
]}
|
||||
total={totalAllocatableGpus}
|
||||
/>
|
||||
@@ -336,13 +362,28 @@ export default function OverviewPage() {
|
||||
rows={[
|
||||
{ name: 'Total GPU Pods', value: String(gpuPods.length) },
|
||||
...(podPhaseCounts.Running > 0
|
||||
? [{ name: 'Running', value: <StatusLabel status="success">{podPhaseCounts.Running}</StatusLabel> }]
|
||||
? [
|
||||
{
|
||||
name: 'Running',
|
||||
value: <StatusLabel status="success">{podPhaseCounts.Running}</StatusLabel>,
|
||||
},
|
||||
]
|
||||
: []),
|
||||
...(podPhaseCounts.Pending > 0
|
||||
? [{ name: 'Pending', value: <StatusLabel status="warning">{podPhaseCounts.Pending}</StatusLabel> }]
|
||||
? [
|
||||
{
|
||||
name: 'Pending',
|
||||
value: <StatusLabel status="warning">{podPhaseCounts.Pending}</StatusLabel>,
|
||||
},
|
||||
]
|
||||
: []),
|
||||
...(podPhaseCounts.Failed > 0
|
||||
? [{ name: 'Failed', value: <StatusLabel status="error">{podPhaseCounts.Failed}</StatusLabel> }]
|
||||
? [
|
||||
{
|
||||
name: 'Failed',
|
||||
value: <StatusLabel status="error">{podPhaseCounts.Failed}</StatusLabel>,
|
||||
},
|
||||
]
|
||||
: []),
|
||||
]}
|
||||
/>
|
||||
@@ -353,12 +394,12 @@ export default function OverviewPage() {
|
||||
<SectionBox title="Active GPU Pods">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Name', getter: (p) => p.metadata.name },
|
||||
{ label: 'Namespace', getter: (p) => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: (p) => p.spec?.nodeName ?? '—' },
|
||||
{ label: 'Name', getter: p => p.metadata.name },
|
||||
{ label: 'Namespace', getter: p => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: p => p.spec?.nodeName ?? '—' },
|
||||
{
|
||||
label: 'GPU Request',
|
||||
getter: (p) => {
|
||||
getter: p => {
|
||||
const reqs = getPodGpuRequests(p);
|
||||
const parts: string[] = [];
|
||||
for (const [key, val] of Object.entries(reqs)) {
|
||||
@@ -368,7 +409,7 @@ export default function OverviewPage() {
|
||||
return parts.join(', ') || '—';
|
||||
},
|
||||
},
|
||||
{ label: 'Age', getter: (p) => formatAge(p.metadata.creationTimestamp) },
|
||||
{ label: 'Age', getter: p => formatAge(p.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={gpuPods.filter(p => p.status?.phase === 'Running').slice(0, 10)}
|
||||
/>
|
||||
|
||||
@@ -25,9 +25,7 @@ interface PodDetailSectionProps {
|
||||
export default function PodDetailSection({ resource }: PodDetailSectionProps) {
|
||||
// Extract raw Kubernetes JSON
|
||||
const rawPod =
|
||||
resource.jsonData && typeof resource.jsonData === 'object'
|
||||
? resource.jsonData
|
||||
: resource;
|
||||
resource.jsonData && typeof resource.jsonData === 'object' ? resource.jsonData : resource;
|
||||
|
||||
// Only render for pods that request Intel GPU resources
|
||||
if (!isGpuRequestingPod(rawPod)) return null;
|
||||
@@ -98,9 +96,7 @@ export default function PodDetailSection({ resource }: PodDetailSectionProps) {
|
||||
rows={[
|
||||
{
|
||||
name: 'Phase',
|
||||
value: (
|
||||
<StatusLabel status={phaseStatus}>{phase ?? 'Unknown'}</StatusLabel>
|
||||
),
|
||||
value: <StatusLabel status={phaseStatus}>{phase ?? 'Unknown'}</StatusLabel>,
|
||||
},
|
||||
{
|
||||
name: 'Scheduled Node',
|
||||
|
||||
+55
-30
@@ -17,11 +17,10 @@ import { useIntelGpuContext } from '../api/IntelGpuDataContext';
|
||||
import {
|
||||
formatAge,
|
||||
formatGpuResourceName,
|
||||
IntelGpuPod,
|
||||
INTEL_GPU_RESOURCE_PREFIX,
|
||||
isPodReady,
|
||||
getPodGpuRequests,
|
||||
getPodRestarts,
|
||||
INTEL_GPU_RESOURCE_PREFIX,
|
||||
IntelGpuPod,
|
||||
} from '../api/k8s';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -30,11 +29,16 @@ import {
|
||||
|
||||
function phaseToStatus(phase: string | undefined): 'success' | 'warning' | 'error' {
|
||||
switch (phase) {
|
||||
case 'Running': return 'success';
|
||||
case 'Succeeded': return 'success';
|
||||
case 'Pending': return 'warning';
|
||||
case 'Failed': return 'error';
|
||||
default: return 'warning';
|
||||
case 'Running':
|
||||
return 'success';
|
||||
case 'Succeeded':
|
||||
return 'success';
|
||||
case 'Pending':
|
||||
return 'warning';
|
||||
case 'Failed':
|
||||
return 'error';
|
||||
default:
|
||||
return 'warning';
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,13 +102,17 @@ export default function PodsPage() {
|
||||
const running = gpuPods.filter(p => p.status?.phase === 'Running');
|
||||
const pending = gpuPods.filter(p => p.status?.phase === 'Pending');
|
||||
const failed = gpuPods.filter(p => p.status?.phase === 'Failed');
|
||||
const other = gpuPods.filter(
|
||||
p => !['Running', 'Pending', 'Failed'].includes(p.status?.phase ?? '')
|
||||
);
|
||||
|
||||
return (
|
||||
<>
|
||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
|
||||
<div
|
||||
style={{
|
||||
display: 'flex',
|
||||
justifyContent: 'space-between',
|
||||
alignItems: 'center',
|
||||
marginBottom: '20px',
|
||||
}}
|
||||
>
|
||||
<SectionHeader title="Intel GPU — Pods" />
|
||||
<button
|
||||
onClick={refresh}
|
||||
@@ -161,13 +169,28 @@ export default function PodsPage() {
|
||||
rows={[
|
||||
{ name: 'Total GPU Pods', value: String(gpuPods.length) },
|
||||
...(running.length > 0
|
||||
? [{ name: 'Running', value: <StatusLabel status="success">{running.length}</StatusLabel> }]
|
||||
? [
|
||||
{
|
||||
name: 'Running',
|
||||
value: <StatusLabel status="success">{running.length}</StatusLabel>,
|
||||
},
|
||||
]
|
||||
: []),
|
||||
...(pending.length > 0
|
||||
? [{ name: 'Pending', value: <StatusLabel status="warning">{pending.length}</StatusLabel> }]
|
||||
? [
|
||||
{
|
||||
name: 'Pending',
|
||||
value: <StatusLabel status="warning">{pending.length}</StatusLabel>,
|
||||
},
|
||||
]
|
||||
: []),
|
||||
...(failed.length > 0
|
||||
? [{ name: 'Failed', value: <StatusLabel status="error">{failed.length}</StatusLabel> }]
|
||||
? [
|
||||
{
|
||||
name: 'Failed',
|
||||
value: <StatusLabel status="error">{failed.length}</StatusLabel>,
|
||||
},
|
||||
]
|
||||
: []),
|
||||
]}
|
||||
/>
|
||||
@@ -179,12 +202,12 @@ export default function PodsPage() {
|
||||
<SectionBox title="All GPU Pods">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Name', getter: (p) => p.metadata.name },
|
||||
{ label: 'Namespace', getter: (p) => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: (p) => p.spec?.nodeName ?? '—' },
|
||||
{ label: 'Name', getter: p => p.metadata.name },
|
||||
{ label: 'Namespace', getter: p => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: p => p.spec?.nodeName ?? '—' },
|
||||
{
|
||||
label: 'Phase',
|
||||
getter: (p) => (
|
||||
getter: p => (
|
||||
<StatusLabel status={phaseToStatus(p.status?.phase)}>
|
||||
{p.status?.phase ?? 'Unknown'}
|
||||
</StatusLabel>
|
||||
@@ -192,11 +215,11 @@ export default function PodsPage() {
|
||||
},
|
||||
{
|
||||
label: 'GPU Resources',
|
||||
getter: (p) => <GpuContainerList pod={p} />,
|
||||
getter: p => <GpuContainerList pod={p} />,
|
||||
},
|
||||
{
|
||||
label: 'Restarts',
|
||||
getter: (p) => {
|
||||
getter: p => {
|
||||
const restarts = getPodRestarts(p);
|
||||
return restarts > 0 ? (
|
||||
<StatusLabel status="warning">{restarts}</StatusLabel>
|
||||
@@ -205,7 +228,7 @@ export default function PodsPage() {
|
||||
);
|
||||
},
|
||||
},
|
||||
{ label: 'Age', getter: (p) => formatAge(p.metadata.creationTimestamp) },
|
||||
{ label: 'Age', getter: p => formatAge(p.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={gpuPods}
|
||||
/>
|
||||
@@ -217,25 +240,27 @@ export default function PodsPage() {
|
||||
<SectionBox title="Attention: Pending GPU Pods">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Name', getter: (p) => p.metadata.name },
|
||||
{ label: 'Namespace', getter: (p) => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Name', getter: p => p.metadata.name },
|
||||
{ label: 'Namespace', getter: p => p.metadata.namespace ?? '—' },
|
||||
{
|
||||
label: 'GPU Resources',
|
||||
getter: (p) => {
|
||||
getter: p => {
|
||||
const reqs = getPodGpuRequests(p);
|
||||
return Object.entries(reqs)
|
||||
.map(([k, v]) => `${formatGpuResourceName(k)}: ${v}`)
|
||||
.join(', ') || '—';
|
||||
return (
|
||||
Object.entries(reqs)
|
||||
.map(([k, v]) => `${formatGpuResourceName(k)}: ${v}`)
|
||||
.join(', ') || '—'
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
label: 'Waiting Reason',
|
||||
getter: (p) => {
|
||||
getter: p => {
|
||||
const reason = p.status?.containerStatuses?.[0]?.state?.waiting?.reason;
|
||||
return reason ?? '—';
|
||||
},
|
||||
},
|
||||
{ label: 'Age', getter: (p) => formatAge(p.metadata.creationTimestamp) },
|
||||
{ label: 'Age', getter: p => formatAge(p.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={pending}
|
||||
/>
|
||||
|
||||
@@ -11,12 +11,7 @@
|
||||
|
||||
import { StatusLabel } from '@kinvolk/headlamp-plugin/lib/CommonComponents';
|
||||
import React from 'react';
|
||||
import {
|
||||
formatGpuType,
|
||||
getNodeGpuCount,
|
||||
getNodeGpuType,
|
||||
isIntelGpuNode,
|
||||
} from '../../api/k8s';
|
||||
import { formatGpuType, getNodeGpuCount, getNodeGpuType, isIntelGpuNode } from '../../api/k8s';
|
||||
|
||||
/** Build GPU columns to append to the native Nodes table. */
|
||||
export function buildNodeGpuColumns() {
|
||||
@@ -33,11 +28,7 @@ export function buildNodeGpuColumns() {
|
||||
if (!isIntelGpuNode(raw)) return '—';
|
||||
const node = raw as Parameters<typeof getNodeGpuType>[0];
|
||||
const type = getNodeGpuType(node);
|
||||
return (
|
||||
<StatusLabel status="success">
|
||||
{formatGpuType(type)}
|
||||
</StatusLabel>
|
||||
);
|
||||
return <StatusLabel status="success">{formatGpuType(type)}</StatusLabel>;
|
||||
},
|
||||
},
|
||||
{
|
||||
|
||||
@@ -180,4 +180,3 @@ registerResourceTableColumnsProcessor(({ id, columns }) => {
|
||||
}
|
||||
return columns;
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user