Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 01b60a23b8 | |||
| 488bf90abc | |||
| 034e0b9db8 |
@@ -0,0 +1,44 @@
|
||||
---
|
||||
name: agent-installer
|
||||
description: Use this agent when the user wants to discover, browse, or install Claude Code agents from the awesome-claude-code-subagents repository.
|
||||
tools: Bash, WebFetch, Read, Write, Glob
|
||||
model: haiku
|
||||
---
|
||||
|
||||
You are an agent installer that helps users browse and install Claude Code agents from the awesome-claude-code-subagents repository on GitHub.
|
||||
|
||||
## Your Capabilities
|
||||
|
||||
You can:
|
||||
1. List all available agent categories
|
||||
2. List agents within a category
|
||||
3. Search for agents by name or description
|
||||
4. Install agents to global (~/.claude/agents/) or local (.claude/agents/) directory
|
||||
5. Show details about a specific agent before installing
|
||||
6. Uninstall agents
|
||||
|
||||
## GitHub API Endpoints
|
||||
|
||||
- Categories list: `https://api.github.com/repos/VoltAgent/awesome-claude-code-subagents/contents/categories`
|
||||
- Agents in category: `https://api.github.com/repos/VoltAgent/awesome-claude-code-subagents/contents/categories/{category-name}`
|
||||
- Raw agent file: `https://raw.githubusercontent.com/VoltAgent/awesome-claude-code-subagents/main/categories/{category-name}/{agent-name}.md`
|
||||
|
||||
## Workflow
|
||||
|
||||
### When user asks to browse or list agents:
|
||||
1. Fetch categories from GitHub API using WebFetch or Bash with curl
|
||||
2. Parse the JSON response to extract directory names
|
||||
3. Present categories in a numbered list
|
||||
4. When user selects a category, fetch and list agents in that category
|
||||
|
||||
### When user wants to install an agent:
|
||||
1. Ask if they want global installation (~/.claude/agents/) or local (.claude/agents/)
|
||||
2. For local: Check if .claude/ directory exists, create .claude/agents/ if needed
|
||||
3. Download the agent .md file from GitHub raw URL
|
||||
4. Save to the appropriate directory
|
||||
5. Confirm successful installation
|
||||
|
||||
### When user wants to search:
|
||||
1. Fetch the README.md which contains all agent listings
|
||||
2. Search for the term in agent names and descriptions
|
||||
3. Present matching results
|
||||
@@ -0,0 +1,24 @@
|
||||
---
|
||||
name: agent-organizer
|
||||
description: Use when assembling and optimizing multi-agent teams to execute complex projects that require careful task decomposition, agent capability matching, and workflow coordination.
|
||||
tools: Read, Write, Edit, Glob, Grep
|
||||
model: sonnet
|
||||
---
|
||||
|
||||
You are a senior agent organizer with expertise in assembling and coordinating multi-agent teams. Your focus spans task analysis, agent capability mapping, workflow design, and team optimization with emphasis on selecting the right agents for each task and ensuring efficient collaboration.
|
||||
|
||||
When invoked:
|
||||
1. Query context manager for task requirements and available agents
|
||||
2. Review agent capabilities, performance history, and current workload
|
||||
3. Analyze task complexity, dependencies, and optimization opportunities
|
||||
4. Orchestrate agent teams for maximum efficiency and success
|
||||
|
||||
Agent organization checklist:
|
||||
- Agent selection accuracy > 95% achieved
|
||||
- Task completion rate > 99% maintained
|
||||
- Resource utilization kept consistently optimal
|
||||
- Response time < 5s ensured
|
||||
- Error recovery automated properly
|
||||
- Cost tracking enabled thoroughly
|
||||
- Performance monitored continuously
|
||||
- Team synergy maximized effectively
|
||||
@@ -0,0 +1,24 @@
|
||||
---
|
||||
name: multi-agent-coordinator
|
||||
description: Use when coordinating multiple concurrent agents that need to communicate, share state, synchronize work, and handle distributed failures across a system.
|
||||
tools: Read, Write, Edit, Glob, Grep
|
||||
model: opus
|
||||
---
|
||||
|
||||
You are a senior multi-agent coordinator with expertise in orchestrating complex distributed workflows. Your focus spans inter-agent communication, task dependency management, parallel execution control, and fault tolerance with emphasis on ensuring efficient, reliable coordination across large agent teams.
|
||||
|
||||
When invoked:
|
||||
1. Query context manager for workflow requirements and agent states
|
||||
2. Review communication patterns, dependencies, and resource constraints
|
||||
3. Analyze coordination bottlenecks, deadlock risks, and optimization opportunities
|
||||
4. Implement robust multi-agent coordination strategies
|
||||
|
||||
Multi-agent coordination checklist:
|
||||
- Coordination overhead < 5% maintained
|
||||
- Deadlock prevention 100% ensured
|
||||
- Message delivery guaranteed thoroughly
|
||||
- Scalability to 100+ agents verified
|
||||
- Fault tolerance built-in properly
|
||||
- Comprehensive monitoring maintained continuously
|
||||
- Recovery automated effectively
|
||||
- Performance kept consistently optimal
|
||||
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"Bash(done)",
|
||||
"Bash(npm install:*)",
|
||||
"Bash(git add:*)",
|
||||
"Bash(git commit:*)",
|
||||
"Bash(git push:*)",
|
||||
"Bash(gh workflow:*)",
|
||||
"Bash(gh run:*)",
|
||||
"Bash(npm run:*)",
|
||||
"Bash(npm ci:*)",
|
||||
"Bash(npm test:*)"
|
||||
]
|
||||
},
|
||||
"enabledMcpjsonServers": [
|
||||
"github",
|
||||
"kubernetes",
|
||||
"flux",
|
||||
"playwright"
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
module.exports = {
|
||||
extends: ['@headlamp-k8s/eslint-config'],
|
||||
};
|
||||
@@ -0,0 +1,41 @@
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
workflow_call:
|
||||
|
||||
jobs:
|
||||
ci:
|
||||
runs-on: local-ubuntu-latest
|
||||
timeout-minutes: 10
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '22'
|
||||
cache: 'npm'
|
||||
|
||||
- name: Install dependencies
|
||||
run: npm ci
|
||||
|
||||
- name: Build plugin
|
||||
run: npx @kinvolk/headlamp-plugin build
|
||||
|
||||
- name: Lint
|
||||
run: npm run lint
|
||||
|
||||
- name: Type-check
|
||||
run: npm run tsc
|
||||
|
||||
- name: Format check
|
||||
run: npm run format:check
|
||||
|
||||
- name: Run tests
|
||||
run: npm test
|
||||
@@ -0,0 +1,104 @@
|
||||
name: Release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
version:
|
||||
description: 'Release version (e.g. 1.0.0)'
|
||||
required: true
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
concurrency:
|
||||
group: release
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
ci:
|
||||
uses: ./.github/workflows/ci.yaml
|
||||
|
||||
release:
|
||||
needs: ci
|
||||
runs-on: local-ubuntu-latest
|
||||
timeout-minutes: 10
|
||||
|
||||
steps:
|
||||
- name: Validate version format
|
||||
run: |
|
||||
if [[ ! "${{ inputs.version }}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
|
||||
echo "Error: Version must be in X.Y.Z format"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '22'
|
||||
cache: 'npm'
|
||||
|
||||
- name: Configure Git
|
||||
run: |
|
||||
git config user.name "github-actions[bot]"
|
||||
git config user.email "github-actions[bot]@users.noreply.github.com"
|
||||
|
||||
- name: Update version in package.json
|
||||
run: npm version ${{ inputs.version }} --no-git-tag-version --allow-same-version
|
||||
|
||||
- name: Update artifacthub-pkg.yml
|
||||
run: |
|
||||
VERSION="${{ inputs.version }}"
|
||||
PKG_NAME=$(jq -r .name package.json)
|
||||
RELEASE_URL="https://github.com/${{ github.repository }}/releases/download/v${VERSION}/${PKG_NAME}-${VERSION}.tar.gz"
|
||||
sed -i "s/^version:.*/version: \"${VERSION}\"/" artifacthub-pkg.yml
|
||||
sed -i "s|headlamp/plugin/archive-url:.*|headlamp/plugin/archive-url: \"${RELEASE_URL}\"|" artifacthub-pkg.yml
|
||||
|
||||
- name: Install dependencies
|
||||
run: npm ci
|
||||
|
||||
- name: Build plugin
|
||||
run: npx @kinvolk/headlamp-plugin build
|
||||
|
||||
- name: Package plugin
|
||||
run: npx @kinvolk/headlamp-plugin package
|
||||
|
||||
- name: Prepare release tarball
|
||||
run: |
|
||||
VERSION="${{ inputs.version }}"
|
||||
PKG_NAME=$(jq -r .name package.json)
|
||||
TARBALL="${PKG_NAME}-${VERSION}.tar.gz"
|
||||
echo "TARBALL=$TARBALL" >> $GITHUB_ENV
|
||||
echo "PKG_NAME=$PKG_NAME" >> $GITHUB_ENV
|
||||
|
||||
- name: Validate tarball
|
||||
run: |
|
||||
echo "Tarball: ${{ env.TARBALL }}"
|
||||
ls -lh "${{ env.TARBALL }}"
|
||||
tar -tzf "${{ env.TARBALL }}" | head -20
|
||||
tar -tzf "${{ env.TARBALL }}" | grep -q "main.js" || { echo "Error: main.js not found in tarball"; exit 1; }
|
||||
|
||||
- name: Compute checksum
|
||||
run: |
|
||||
CHECKSUM=$(sha256sum "${{ env.TARBALL }}" | awk '{print $1}')
|
||||
echo "CHECKSUM=$CHECKSUM" >> $GITHUB_ENV
|
||||
sed -i "s|headlamp/plugin/archive-checksum:.*|headlamp/plugin/archive-checksum: sha256:${CHECKSUM}|" artifacthub-pkg.yml
|
||||
|
||||
- name: Commit and tag
|
||||
run: |
|
||||
VERSION="${{ inputs.version }}"
|
||||
git add package.json package-lock.json artifacthub-pkg.yml
|
||||
git commit -m "release: v${VERSION}"
|
||||
git tag "v${VERSION}"
|
||||
git push origin main --tags
|
||||
|
||||
- name: Create GitHub Release
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
tag_name: v${{ inputs.version }}
|
||||
name: v${{ inputs.version }}
|
||||
generate_release_notes: true
|
||||
files: ${{ env.TARBALL }}
|
||||
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"mcpServers": {
|
||||
"github": {
|
||||
"type": "http",
|
||||
"url": "https://api.githubcopilot.com/mcp/",
|
||||
"headers": { "Authorization": "Bearer ${GITHUB_TOKEN}" }
|
||||
},
|
||||
"kubernetes": { "type": "sse", "url": "http://localhost:8080/sse" },
|
||||
"flux": { "type": "sse", "url": "http://localhost:8081/sse" },
|
||||
"playwright": { "type": "sse", "url": "http://localhost:8086/sse" }
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
module.exports = require('@headlamp-k8s/eslint-config/prettier-config');
|
||||
@@ -0,0 +1,95 @@
|
||||
# CLAUDE.md
|
||||
|
||||
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||
|
||||
## Project
|
||||
|
||||
Headlamp plugin for Intel GPU device plugin visibility and monitoring. Read-only — monitors GpuDevicePlugin CRDs, GPU-capable nodes, pods requesting Intel GPU resources, and real-time power metrics via Prometheus. No cluster write operations.
|
||||
|
||||
- **Plugin name**: `intel-gpu`
|
||||
- **Target**: Headlamp >= v0.20.0
|
||||
- **Data sources**: GpuDevicePlugin CRDs (`deviceplugin.intel.com/v1`), Nodes, Pods (all namespaces), Prometheus (node-exporter i915 hwmon)
|
||||
- **Reference plugin**: `../headlamp-kube-vip-plugin`
|
||||
|
||||
## Commands
|
||||
|
||||
```bash
|
||||
npm start # dev server with hot reload
|
||||
npm run build # production build
|
||||
npm run package # package for headlamp
|
||||
npm run tsc # TypeScript type check (no emit)
|
||||
npm run lint # ESLint
|
||||
npm run lint:fix # ESLint with auto-fix
|
||||
npm run format # Prettier write
|
||||
npm run format:check # Prettier check
|
||||
npm test # vitest run
|
||||
npm run test:watch # vitest watch mode
|
||||
```
|
||||
|
||||
All tests and `tsc` must pass before committing.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
src/
|
||||
├── index.tsx # Plugin entry: registerRoute, registerSidebarEntry, registerDetailsViewSection, registerResourceTableColumnsProcessor
|
||||
├── api/
|
||||
│ ├── k8s.ts # Types + helpers (GpuDevicePlugin CRD, Nodes, Pods, type guards, formatters)
|
||||
│ ├── k8s.test.ts # Tests for k8s helpers (70+ test cases)
|
||||
│ ├── metrics.ts # Prometheus GPU power metrics (node-exporter i915 hwmon)
|
||||
│ └── IntelGpuDataContext.tsx # Shared React context provider with data fetching
|
||||
└── components/
|
||||
├── OverviewPage.tsx # Dashboard: plugin health, GPU node summary, allocation, active pods
|
||||
├── DevicePluginsPage.tsx # GpuDevicePlugin CRD instances with spec/status and daemon pods
|
||||
├── NodesPage.tsx # Per-node GPU type, device count, allocation, workload pods
|
||||
├── PodsPage.tsx # All pods requesting Intel GPU resources with per-container detail
|
||||
├── MetricsPage.tsx # Real-time GPU power metrics from Prometheus
|
||||
├── NodeDetailSection.tsx # Injected into native Node detail page (capacity, utilization, pods)
|
||||
├── PodDetailSection.tsx # Injected into native Pod detail page (GPU requests per container)
|
||||
└── integrations/
|
||||
└── NodeColumns.tsx # GPU Type and GPU Devices columns for native Nodes table
|
||||
```
|
||||
|
||||
## Data flow
|
||||
|
||||
`IntelGpuDataContext.tsx` uses **two fetching strategies**:
|
||||
|
||||
1. **Headlamp hooks** (`K8s.ResourceClasses.*.useList()`) — for Nodes and Pods.
|
||||
2. **`ApiProxy.request()`** — for GpuDevicePlugin CRDs and plugin daemon pods (with label selector fallback).
|
||||
|
||||
The plugin gracefully degrades when the GpuDevicePlugin CRD is not installed — GPU nodes and pods are still shown based on resource labels and capacity.
|
||||
|
||||
## Key constants (src/api/k8s.ts)
|
||||
|
||||
- API group: `deviceplugin.intel.com`
|
||||
- API version: `v1`
|
||||
- GPU resources: `gpu.intel.com/i915`, `gpu.intel.com/xe`, `gpu.intel.com/millicores`, `gpu.intel.com/memory.max`
|
||||
- Resource prefix: `gpu.intel.com/`
|
||||
- Node labels: `intel.feature.node.kubernetes.io/gpu`, `node-role.kubernetes.io/gpu`, `node-role.kubernetes.io/igpu`
|
||||
- Pod selector: `app=intel-gpu-plugin`
|
||||
- Prometheus services: `kube-prometheus-stack-prometheus`, `prometheus-operated`, `prometheus` (monitoring namespace, port 9090)
|
||||
|
||||
## Code conventions
|
||||
|
||||
- Functional React components only — no class components
|
||||
- All Headlamp imports come from `@kinvolk/headlamp-plugin/lib` and `@kinvolk/headlamp-plugin/lib/CommonComponents`
|
||||
- No additional UI libraries (no MUI direct imports, no Ant Design, etc.)
|
||||
- TypeScript strict mode — no `any`, use `unknown` + type guards at API boundaries
|
||||
- Context provider (`IntelGpuDataProvider`) wraps each route component in `index.tsx`
|
||||
- Tests: vitest + @testing-library/react, mock with `vi.mock('@kinvolk/headlamp-plugin/lib', ...)`
|
||||
- `vitest.setup.ts` provides a spec-compliant `localStorage` shim for Node 22+ compatibility
|
||||
|
||||
## Testing
|
||||
|
||||
Mock pattern for headlamp APIs:
|
||||
```typescript
|
||||
vi.mock('@kinvolk/headlamp-plugin/lib', () => ({
|
||||
ApiProxy: { request: vi.fn().mockResolvedValue({ items: [] }) },
|
||||
K8s: {
|
||||
ResourceClasses: {
|
||||
Node: { useList: vi.fn(() => [[], null]) },
|
||||
Pod: { useList: vi.fn(() => [[], null]) },
|
||||
},
|
||||
},
|
||||
}));
|
||||
```
|
||||
@@ -0,0 +1,36 @@
|
||||
# Contributing
|
||||
|
||||
Contributions are welcome! Please follow these guidelines.
|
||||
|
||||
## Development Setup
|
||||
|
||||
```bash
|
||||
git clone https://github.com/privilegedescalation/headlamp-intel-gpu-plugin.git
|
||||
cd headlamp-intel-gpu-plugin
|
||||
npm install
|
||||
npm start
|
||||
```
|
||||
|
||||
## Before Submitting a PR
|
||||
|
||||
```bash
|
||||
npm run tsc # TypeScript type check
|
||||
npm run lint # ESLint
|
||||
npm run format:check # Prettier
|
||||
npm test # All tests must pass
|
||||
```
|
||||
|
||||
## Code Style
|
||||
|
||||
- TypeScript strict mode (no `any`)
|
||||
- Functional React components only
|
||||
- All UI from `@kinvolk/headlamp-plugin/lib/CommonComponents`
|
||||
- Tests with vitest + @testing-library/react
|
||||
|
||||
## Commit Messages
|
||||
|
||||
Use conventional commit format:
|
||||
- `feat:` new features
|
||||
- `fix:` bug fixes
|
||||
- `chore:` maintenance
|
||||
- `docs:` documentation
|
||||
@@ -0,0 +1,190 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to the Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by the Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding any notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
Copyright 2025 privilegedescalation
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
@@ -0,0 +1,110 @@
|
||||
# headlamp-intel-gpu-plugin
|
||||
|
||||
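[![CI](https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/actions/workflows/ci.yaml/badge.svg)](https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/actions/workflows/ci.yaml)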
|
||||
[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
|
||||
|
||||
A [Headlamp](https://headlamp.dev/) plugin providing visibility into [Intel GPU device plugin](https://intel.github.io/intel-device-plugins-for-kubernetes/) deployments on Kubernetes.
|
||||
|
||||
## Features
|
||||
|
||||
- **Overview Dashboard** — Plugin health, GPU node summary, allocation bar, active GPU pods
|
||||
- **Device Plugins** — GpuDevicePlugin CRD instances with spec/status and daemon pod health
|
||||
- **GPU Nodes** — Per-node GPU type (discrete/integrated), device count, allocation, workload pods
|
||||
- **GPU Pods** — All pods requesting Intel GPU resources with per-container detail
|
||||
- **Metrics** — Real-time GPU power draw (W) and TDP via Prometheus node-exporter i915 hwmon
|
||||
- **Node Detail Integration** — Intel GPU section injected into native Headlamp Node detail views
|
||||
- **Pod Detail Integration** — GPU resource requests/limits injected into native Pod detail views
|
||||
- **Nodes Table Columns** — GPU Type and GPU Devices columns added to native Nodes table
|
||||
|
||||
## Installation
|
||||
|
||||
### Plugin Manager (Headlamp UI)
|
||||
|
||||
Search for `intel-gpu` in the Headlamp Plugin Manager.
|
||||
|
||||
### Manual
|
||||
|
||||
```bash
|
||||
# Download the latest release tarball
|
||||
curl -LO https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.4.0/intel-gpu-0.4.0.tar.gz  # curl does not expand "*" in URLs; substitute the release version you want
|
||||
|
||||
# Extract to Headlamp plugins directory
|
||||
mkdir -p ~/.config/Headlamp/plugins
|
||||
tar -xzf intel-gpu-*.tar.gz -C ~/.config/Headlamp/plugins/
|
||||
```
|
||||
|
||||
### From Source
|
||||
|
||||
```bash
|
||||
git clone https://github.com/privilegedescalation/headlamp-intel-gpu-plugin.git
|
||||
cd headlamp-intel-gpu-plugin
|
||||
npm install
|
||||
npm run build
|
||||
```
|
||||
|
||||
## Requirements
|
||||
|
||||
- Headlamp >= v0.20.0
|
||||
- Intel GPU device plugin deployed (optional — plugin gracefully degrades without it)
|
||||
- Optional: Node Feature Discovery with Intel GPU labels
|
||||
- Optional: kube-prometheus-stack with node-exporter for GPU power metrics
|
||||
|
||||
## RBAC
|
||||
|
||||
This plugin is **read-only** and requires the following permissions:
|
||||
|
||||
| Resource | API Version | Verbs |
|
||||
|----------|-----------|-------|
|
||||
| nodes | v1 | list, get, watch |
|
||||
| pods | v1 | list, get, watch |
|
||||
| gpudeviceplugins | deviceplugin.intel.com/v1 | list, get |
|
||||
|
||||
For metrics, Prometheus must be accessible via the Headlamp API proxy in the `monitoring` namespace.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
src/
|
||||
├── index.tsx # Plugin entry point
|
||||
├── api/
|
||||
│ ├── k8s.ts # Types and helper functions
|
||||
│ ├── metrics.ts # Prometheus GPU metrics
|
||||
│ └── IntelGpuDataContext.tsx # React context provider
|
||||
└── components/
|
||||
├── OverviewPage.tsx # Dashboard
|
||||
├── DevicePluginsPage.tsx # Device plugin CRDs
|
||||
├── NodesPage.tsx # GPU nodes
|
||||
├── PodsPage.tsx # GPU pods
|
||||
├── MetricsPage.tsx # Power metrics
|
||||
├── NodeDetailSection.tsx # Injected into Node detail view
|
||||
├── PodDetailSection.tsx # Injected into Pod detail view
|
||||
└── integrations/
|
||||
└── NodeColumns.tsx # Nodes table columns
|
||||
```
|
||||
|
||||
## Development
|
||||
|
||||
```bash
|
||||
npm install
|
||||
npm start # dev server
|
||||
npm test # run tests
|
||||
npm run tsc # type check
|
||||
npm run lint # ESLint
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
| Symptom | Cause | Fix |
|
||||
|---------|-------|-----|
|
||||
| No GPU nodes shown | No Intel GPU labels or resources on nodes | Install Intel Node Feature Discovery or Intel GPU device plugin |
|
||||
| CRD not available warning | GpuDevicePlugin CRD not installed | Install Intel device plugins operator — plugin still works without it |
|
||||
| No metrics data | Prometheus not found | Deploy kube-prometheus-stack in the `monitoring` namespace |
|
||||
| Metrics show only discrete GPUs | Integrated GPUs lack hwmon | Expected — iGPU driver doesn't expose hwmon power data |
|
||||
|
||||
## Contributing
|
||||
|
||||
See [CONTRIBUTING.md](CONTRIBUTING.md) for development guidelines.
|
||||
|
||||
## License
|
||||
|
||||
Apache License 2.0. See [LICENSE](LICENSE) for details.
|
||||
+22
@@ -0,0 +1,22 @@
|
||||
# Security Policy
|
||||
|
||||
## Supported Versions
|
||||
|
||||
| Version | Supported |
|
||||
|---------|-----------|
|
||||
| latest | Yes |
|
||||
|
||||
## Plugin Scope
|
||||
|
||||
This plugin is **read-only**. It does not perform any write operations against the Kubernetes cluster. It reads:
|
||||
|
||||
- Nodes
|
||||
- Pods (all namespaces)
|
||||
- GpuDevicePlugin CRDs (`deviceplugin.intel.com/v1`)
|
||||
- Prometheus metrics (queried through the API proxy from a Prometheus service in the `monitoring` namespace)
|
||||
|
||||
All data is fetched through Headlamp's built-in API proxy, which respects the user's existing RBAC permissions.
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
Please report security vulnerabilities by opening a private issue or emailing the maintainers directly.
|
||||
+4
-4
@@ -1,5 +1,5 @@
|
||||
version: "0.3.0"
|
||||
name: headlamp-intel-gpu-plugin
|
||||
version: "0.4.0"
|
||||
name: intel-gpu
|
||||
displayName: Intel GPU
|
||||
description: >-
|
||||
Headlamp plugin for Intel GPU device plugin visibility and monitoring.
|
||||
@@ -71,7 +71,7 @@ changes:
|
||||
description: "App bar health badge: hidden when no Intel GPU plugin detected"
|
||||
|
||||
annotations:
|
||||
headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.3.0/headlamp-intel-gpu-plugin-0.3.0.tar.gz"
|
||||
headlamp/plugin/archive-checksum: "sha256:fdc53099ee3123680f24fe4a319b753ca3d030aac31abd4e3f383221085c9c2d"
|
||||
headlamp/plugin/archive-url: "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/releases/download/v0.4.0/intel-gpu-0.4.0.tar.gz"
|
||||
headlamp/plugin/archive-checksum: sha256:f529794d7995b35b954fa32c10874fa8367f6f5cd8040600e47a3013373219df
|
||||
headlamp/plugin/version-compat: ">=0.20.0"
|
||||
headlamp/plugin/distro-compat: "in-cluster,web,app"
|
||||
|
||||
Generated
+4
-4
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "headlamp-intel-gpu-plugin",
|
||||
"version": "0.1.0",
|
||||
"name": "intel-gpu",
|
||||
"version": "0.4.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "headlamp-intel-gpu-plugin",
|
||||
"version": "0.1.0",
|
||||
"name": "intel-gpu",
|
||||
"version": "0.4.0",
|
||||
"license": "Apache-2.0",
|
||||
"devDependencies": {
|
||||
"@kinvolk/headlamp-plugin": "^0.13.0"
|
||||
|
||||
+8
-4
@@ -1,12 +1,16 @@
|
||||
{
|
||||
"name": "headlamp-intel-gpu-plugin",
|
||||
"version": "0.3.0",
|
||||
"name": "intel-gpu",
|
||||
"version": "0.4.0",
|
||||
"description": "Headlamp plugin for Intel GPU device plugin visibility and monitoring",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/cpfarhood/headlamp-intel-gpu-plugin.git"
|
||||
"url": "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin.git"
|
||||
},
|
||||
"author": "cpfarhood",
|
||||
"bugs": {
|
||||
"url": "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin/issues"
|
||||
},
|
||||
"homepage": "https://github.com/privilegedescalation/headlamp-intel-gpu-plugin#readme",
|
||||
"author": "privilegedescalation",
|
||||
"license": "Apache-2.0",
|
||||
"scripts": {
|
||||
"start": "headlamp-plugin start",
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
|
||||
"extends": ["config:recommended"]
|
||||
}
|
||||
@@ -116,9 +116,11 @@ export function IntelGpuDataProvider({ children }: { children: React.ReactNode }
|
||||
// Intel device plugins operator deployment
|
||||
`/api/v1/pods?labelSelector=${encodeURIComponent('app=intel-gpu-plugin')}`,
|
||||
// Alternative: by component label
|
||||
`/api/v1/pods?labelSelector=${encodeURIComponent('app.kubernetes.io/name=intel-gpu-plugin')}`,
|
||||
`/api/v1/pods?labelSelector=${encodeURIComponent(
|
||||
'app.kubernetes.io/name=intel-gpu-plugin'
|
||||
)}`,
|
||||
// Intel device plugins from inteldeviceplugins-system namespace
|
||||
`/api/v1/namespaces/inteldeviceplugins-system/pods`,
|
||||
'/api/v1/namespaces/inteldeviceplugins-system/pods',
|
||||
];
|
||||
|
||||
const foundPluginPods: IntelGpuPod[] = [];
|
||||
@@ -155,7 +157,9 @@ export function IntelGpuDataProvider({ children }: { children: React.ReactNode }
|
||||
}
|
||||
|
||||
void fetchAsync();
|
||||
return () => { cancelled = true; };
|
||||
return () => {
|
||||
cancelled = true;
|
||||
};
|
||||
}, [refreshKey]);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
+4
-8
@@ -12,18 +12,18 @@ import {
|
||||
getNodeGpuCount,
|
||||
getNodeGpuType,
|
||||
getPodGpuRequests,
|
||||
type GpuDevicePlugin,
|
||||
INTEL_GPU_NODE_LABEL,
|
||||
INTEL_GPU_RESOURCE,
|
||||
INTEL_GPU_XE_RESOURCE,
|
||||
type IntelGpuNode,
|
||||
type IntelGpuPod,
|
||||
isGpuRequestingPod,
|
||||
isIntelGpuNode,
|
||||
isKubeList,
|
||||
isNodeReady,
|
||||
pluginStatusText,
|
||||
pluginStatusToStatus,
|
||||
type GpuDevicePlugin,
|
||||
type IntelGpuNode,
|
||||
type IntelGpuPod,
|
||||
} from './k8s';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -413,11 +413,7 @@ describe('formatGpuType', () => {
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('pluginStatusToStatus', () => {
|
||||
function makePlugin(
|
||||
desired: number,
|
||||
ready: number,
|
||||
unavailable = 0
|
||||
): GpuDevicePlugin {
|
||||
function makePlugin(desired: number, ready: number, unavailable = 0): GpuDevicePlugin {
|
||||
return {
|
||||
apiVersion: 'deviceplugin.intel.com/v1',
|
||||
kind: 'GpuDevicePlugin',
|
||||
|
||||
+21
-28
@@ -28,8 +28,7 @@ export const INTEL_DISCRETE_GPU_NODE_ROLE = 'node-role.kubernetes.io/gpu';
|
||||
export const INTEL_INTEGRATED_GPU_NODE_ROLE = 'node-role.kubernetes.io/igpu';
|
||||
|
||||
/** Label selector for Intel GPU device plugin DaemonSet pods */
|
||||
export const INTEL_GPU_PLUGIN_LABEL_SELECTOR =
|
||||
'app=intel-gpu-plugin';
|
||||
export const INTEL_GPU_PLUGIN_LABEL_SELECTOR = 'app=intel-gpu-plugin';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Generic Kubernetes object base shapes
|
||||
@@ -194,9 +193,12 @@ export function getNodeGpuType(node: IntelGpuNode): GpuType {
|
||||
|
||||
export function formatGpuType(type: GpuType): string {
|
||||
switch (type) {
|
||||
case 'discrete': return 'Discrete';
|
||||
case 'integrated': return 'Integrated';
|
||||
default: return 'Unknown';
|
||||
case 'discrete':
|
||||
return 'Discrete';
|
||||
case 'integrated':
|
||||
return 'Integrated';
|
||||
default:
|
||||
return 'Unknown';
|
||||
}
|
||||
}
|
||||
|
||||
@@ -272,9 +274,11 @@ export function isIntelGpuPluginPod(pod: unknown): pod is IntelGpuPod {
|
||||
const meta = obj['metadata'] as Record<string, unknown> | undefined;
|
||||
const labels = meta?.['labels'] as Record<string, string> | undefined;
|
||||
if (!labels) return false;
|
||||
return labels['app'] === 'intel-gpu-plugin' ||
|
||||
(labels['app.kubernetes.io/name'] === 'intel-gpu-plugin') ||
|
||||
(labels['component'] === 'intel-gpu-plugin');
|
||||
return (
|
||||
labels['app'] === 'intel-gpu-plugin' ||
|
||||
labels['app.kubernetes.io/name'] === 'intel-gpu-plugin' ||
|
||||
labels['component'] === 'intel-gpu-plugin'
|
||||
);
|
||||
}
|
||||
|
||||
export function filterIntelGpuPluginPods(items: unknown[]): IntelGpuPod[] {
|
||||
@@ -284,10 +288,7 @@ export function filterIntelGpuPluginPods(items: unknown[]): IntelGpuPod[] {
|
||||
/** Get total GPU requests from a pod's containers */
|
||||
export function getPodGpuRequests(pod: IntelGpuPod): Record<string, string> {
|
||||
const totals: Record<string, number> = {};
|
||||
const allContainers = [
|
||||
...(pod.spec?.containers ?? []),
|
||||
...(pod.spec?.initContainers ?? []),
|
||||
];
|
||||
const allContainers = [...(pod.spec?.containers ?? []), ...(pod.spec?.initContainers ?? [])];
|
||||
for (const c of allContainers) {
|
||||
const requests = c.resources?.requests ?? {};
|
||||
for (const [key, value] of Object.entries(requests)) {
|
||||
@@ -300,15 +301,11 @@ export function getPodGpuRequests(pod: IntelGpuPod): Record<string, string> {
|
||||
}
|
||||
|
||||
export function isPodReady(pod: IntelGpuPod): boolean {
|
||||
return (
|
||||
pod.status?.conditions?.some(c => c.type === 'Ready' && c.status === 'True') ?? false
|
||||
);
|
||||
return pod.status?.conditions?.some(c => c.type === 'Ready' && c.status === 'True') ?? false;
|
||||
}
|
||||
|
||||
export function getPodRestarts(pod: IntelGpuPod): number {
|
||||
return (
|
||||
pod.status?.containerStatuses?.reduce((sum, c) => sum + c.restartCount, 0) ?? 0
|
||||
);
|
||||
return pod.status?.containerStatuses?.reduce((sum, c) => sum + c.restartCount, 0) ?? 0;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -330,9 +327,7 @@ export function isKubeList(value: unknown): value is KubeList<unknown> {
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export function isNodeReady(node: IntelGpuNode): boolean {
|
||||
return (
|
||||
node.status?.conditions?.some(c => c.type === 'Ready' && c.status === 'True') ?? false
|
||||
);
|
||||
return node.status?.conditions?.some(c => c.type === 'Ready' && c.status === 'True') ?? false;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -359,11 +354,11 @@ export function formatAge(timestamp: string | undefined): string {
|
||||
export function formatGpuResourceName(resourceKey: string): string {
|
||||
const name = resourceKey.replace(INTEL_GPU_RESOURCE_PREFIX, '');
|
||||
const map: Record<string, string> = {
|
||||
'i915': 'GPU (i915)',
|
||||
'xe': 'GPU (Xe)',
|
||||
'millicores': 'GPU Millicores',
|
||||
i915: 'GPU (i915)',
|
||||
xe: 'GPU (Xe)',
|
||||
millicores: 'GPU Millicores',
|
||||
'memory.max': 'GPU Memory (max)',
|
||||
'tiles': 'GPU Tiles',
|
||||
tiles: 'GPU Tiles',
|
||||
};
|
||||
return map[name] ?? name;
|
||||
}
|
||||
@@ -372,9 +367,7 @@ export function formatGpuResourceName(resourceKey: string): string {
|
||||
// Status helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export function pluginStatusToStatus(
|
||||
plugin: GpuDevicePlugin
|
||||
): 'success' | 'warning' | 'error' {
|
||||
export function pluginStatusToStatus(plugin: GpuDevicePlugin): 'success' | 'warning' | 'error' {
|
||||
const desired = plugin.status?.desiredNumberScheduled ?? 0;
|
||||
const ready = plugin.status?.numberReady ?? 0;
|
||||
const unavailable = plugin.status?.numberUnavailable ?? 0;
|
||||
|
||||
+5
-6
@@ -64,14 +64,11 @@ const PROMETHEUS_SERVICES = [
|
||||
{ namespace: 'monitoring', service: 'prometheus', port: '9090' },
|
||||
];
|
||||
|
||||
async function queryPrometheus(
|
||||
query: string,
|
||||
prometheusPath: string
|
||||
): Promise<PrometheusResult[]> {
|
||||
async function queryPrometheus(query: string, prometheusPath: string): Promise<PrometheusResult[]> {
|
||||
const encoded = encodeURIComponent(query);
|
||||
const path = `${prometheusPath}/api/v1/query?query=${encoded}`;
|
||||
|
||||
const raw = await ApiProxy.request(path, { method: 'GET' }) as PrometheusResponse;
|
||||
const raw = (await ApiProxy.request(path, { method: 'GET' })) as PrometheusResponse;
|
||||
|
||||
if (raw?.status !== 'success') return [];
|
||||
return raw.data?.result ?? [];
|
||||
@@ -81,7 +78,9 @@ async function findPrometheusPath(): Promise<string | null> {
|
||||
for (const { namespace, service, port } of PROMETHEUS_SERVICES) {
|
||||
const basePath = `/api/v1/namespaces/${namespace}/services/${service}:${port}/proxy`;
|
||||
try {
|
||||
const raw = await ApiProxy.request(`${basePath}/api/v1/query?query=1`, { method: 'GET' }) as PrometheusResponse;
|
||||
const raw = (await ApiProxy.request(`${basePath}/api/v1/query?query=1`, {
|
||||
method: 'GET',
|
||||
})) as PrometheusResponse;
|
||||
if (raw?.status === 'success') return basePath;
|
||||
} catch {
|
||||
// try next
|
||||
|
||||
@@ -18,8 +18,7 @@ import { useIntelGpuContext } from '../api/IntelGpuDataContext';
|
||||
import { formatAge, isPodReady, pluginStatusText, pluginStatusToStatus } from '../api/k8s';
|
||||
|
||||
export default function DevicePluginsPage() {
|
||||
const { devicePlugins, pluginPods, crdAvailable, loading, error, refresh } =
|
||||
useIntelGpuContext();
|
||||
const { devicePlugins, pluginPods, crdAvailable, loading, error, refresh } = useIntelGpuContext();
|
||||
|
||||
if (loading) {
|
||||
return <Loader title="Loading device plugin data..." />;
|
||||
@@ -27,7 +26,14 @@ export default function DevicePluginsPage() {
|
||||
|
||||
return (
|
||||
<>
|
||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
|
||||
<div
|
||||
style={{
|
||||
display: 'flex',
|
||||
justifyContent: 'space-between',
|
||||
alignItems: 'center',
|
||||
marginBottom: '20px',
|
||||
}}
|
||||
>
|
||||
<SectionHeader title="Intel GPU — Device Plugins" />
|
||||
<button
|
||||
onClick={refresh}
|
||||
@@ -102,7 +108,10 @@ export default function DevicePluginsPage() {
|
||||
)}
|
||||
|
||||
{devicePlugins.map(plugin => (
|
||||
<SectionBox key={plugin.metadata.uid ?? plugin.metadata.name} title={`GpuDevicePlugin: ${plugin.metadata.name}`}>
|
||||
<SectionBox
|
||||
key={plugin.metadata.uid ?? plugin.metadata.name}
|
||||
title={`GpuDevicePlugin: ${plugin.metadata.name}`}
|
||||
>
|
||||
<NameValueTable
|
||||
rows={[
|
||||
{
|
||||
@@ -146,14 +155,14 @@ export default function DevicePluginsPage() {
|
||||
value: String(plugin.status?.numberReady ?? '—'),
|
||||
},
|
||||
...(plugin.status?.numberUnavailable
|
||||
? [{
|
||||
name: 'Unavailable Nodes',
|
||||
value: (
|
||||
<StatusLabel status="error">
|
||||
{plugin.status.numberUnavailable}
|
||||
</StatusLabel>
|
||||
),
|
||||
}]
|
||||
? [
|
||||
{
|
||||
name: 'Unavailable Nodes',
|
||||
value: (
|
||||
<StatusLabel status="error">{plugin.status.numberUnavailable}</StatusLabel>
|
||||
),
|
||||
},
|
||||
]
|
||||
: []),
|
||||
{
|
||||
name: 'Node Selector',
|
||||
@@ -177,12 +186,12 @@ export default function DevicePluginsPage() {
|
||||
<SectionBox title="Plugin Daemon Pods">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Name', getter: (p) => p.metadata.name },
|
||||
{ label: 'Namespace', getter: (p) => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: (p) => p.spec?.nodeName ?? '—' },
|
||||
{ label: 'Name', getter: p => p.metadata.name },
|
||||
{ label: 'Namespace', getter: p => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: p => p.spec?.nodeName ?? '—' },
|
||||
{
|
||||
label: 'Ready',
|
||||
getter: (p) => (
|
||||
getter: p => (
|
||||
<StatusLabel status={isPodReady(p) ? 'success' : 'warning'}>
|
||||
{isPodReady(p) ? 'Ready' : p.status?.phase ?? 'Unknown'}
|
||||
</StatusLabel>
|
||||
@@ -190,10 +199,9 @@ export default function DevicePluginsPage() {
|
||||
},
|
||||
{
|
||||
label: 'Restarts',
|
||||
getter: (p) => {
|
||||
const restarts = p.status?.containerStatuses?.reduce(
|
||||
(sum, c) => sum + c.restartCount, 0
|
||||
) ?? 0;
|
||||
getter: p => {
|
||||
const restarts =
|
||||
p.status?.containerStatuses?.reduce((sum, c) => sum + c.restartCount, 0) ?? 0;
|
||||
return restarts > 0 ? (
|
||||
<StatusLabel status="warning">{restarts}</StatusLabel>
|
||||
) : (
|
||||
@@ -201,7 +209,7 @@ export default function DevicePluginsPage() {
|
||||
);
|
||||
},
|
||||
},
|
||||
{ label: 'Age', getter: (p) => formatAge(p.metadata.creationTimestamp) },
|
||||
{ label: 'Age', getter: p => formatAge(p.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={pluginPods}
|
||||
/>
|
||||
|
||||
@@ -35,7 +35,13 @@ import {
|
||||
} from '@kinvolk/headlamp-plugin/lib/CommonComponents';
|
||||
import React, { useCallback, useEffect, useState } from 'react';
|
||||
import { useIntelGpuContext } from '../api/IntelGpuDataContext';
|
||||
import { fetchGpuMetrics, formatPercent, formatWatts, GpuChipMetrics, GpuMetrics } from '../api/metrics';
|
||||
import {
|
||||
fetchGpuMetrics,
|
||||
formatPercent,
|
||||
formatWatts,
|
||||
GpuChipMetrics,
|
||||
GpuMetrics,
|
||||
} from '../api/metrics';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Power bar
|
||||
@@ -43,7 +49,8 @@ import { fetchGpuMetrics, formatPercent, formatWatts, GpuChipMetrics, GpuMetrics
|
||||
|
||||
function PowerBar({ watts, maxWatts }: { watts: number; maxWatts: number | null }) {
|
||||
const pct = maxWatts && maxWatts > 0 ? Math.min(100, Math.round((watts / maxWatts) * 100)) : null;
|
||||
const color = pct === null ? '#0071c5' : pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
|
||||
const color =
|
||||
pct === null ? '#0071c5' : pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
|
||||
|
||||
return (
|
||||
<div style={{ display: 'flex', alignItems: 'center', gap: '8px' }}>
|
||||
@@ -91,9 +98,12 @@ function GpuChipCard({ chip }: { chip: GpuChipMetrics }) {
|
||||
{ name: 'GPU (PCI)', value: chip.chip },
|
||||
{
|
||||
name: 'Current Power',
|
||||
value: chip.powerWatts !== null
|
||||
? <PowerBar watts={chip.powerWatts} maxWatts={chip.powerMaxWatts} />
|
||||
: <StatusLabel status="warning">No data — needs ≥5m of scrape history</StatusLabel>,
|
||||
value:
|
||||
chip.powerWatts !== null ? (
|
||||
<PowerBar watts={chip.powerWatts} maxWatts={chip.powerMaxWatts} />
|
||||
) : (
|
||||
<StatusLabel status="warning">No data — needs ≥5m of scrape history</StatusLabel>
|
||||
),
|
||||
},
|
||||
];
|
||||
|
||||
@@ -123,8 +133,9 @@ function MetricRequirements() {
|
||||
<>
|
||||
<StatusLabel status="success">Available — discrete GPU nodes</StatusLabel>
|
||||
<div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
|
||||
Source: <code>node_hwmon_energy_joule_total</code> via node-exporter hwmon collector (enabled by default).
|
||||
Requires the i915 kernel driver on the node. iGPU nodes do not expose hwmon sensors.
|
||||
Source: <code>node_hwmon_energy_joule_total</code> via node-exporter hwmon
|
||||
collector (enabled by default). Requires the i915 kernel driver on the node. iGPU
|
||||
nodes do not expose hwmon sensors.
|
||||
</div>
|
||||
</>
|
||||
),
|
||||
@@ -136,8 +147,9 @@ function MetricRequirements() {
|
||||
<StatusLabel status="error">Not available</StatusLabel>
|
||||
<div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
|
||||
i915 exposes <code>gt_*_freq_mhz</code> via DRM sysfs but node-exporter's{' '}
|
||||
<code>--collector.drm</code> flag is AMD-only and does not read these files.
|
||||
A custom exporter or textfile-collector sidecar writing these values would be required.
|
||||
<code>--collector.drm</code> flag is AMD-only and does not read these files. A
|
||||
custom exporter or textfile-collector sidecar writing these values would be
|
||||
required.
|
||||
</div>
|
||||
</>
|
||||
),
|
||||
@@ -148,8 +160,8 @@ function MetricRequirements() {
|
||||
<>
|
||||
<StatusLabel status="error">Not available</StatusLabel>
|
||||
<div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
|
||||
No standard Prometheus collector exposes i915 engine busy percentage.
|
||||
Would require intel-gpu-top, XPU Manager, or a custom DRM-based exporter.
|
||||
No standard Prometheus collector exposes i915 engine busy percentage. Would
|
||||
require intel-gpu-top, XPU Manager, or a custom DRM-based exporter.
|
||||
</div>
|
||||
</>
|
||||
),
|
||||
@@ -160,8 +172,8 @@ function MetricRequirements() {
|
||||
<>
|
||||
<StatusLabel status="error">No metrics available</StatusLabel>
|
||||
<div style={{ marginTop: '4px', fontSize: '12px', color: '#666' }}>
|
||||
The integrated GPU driver does not expose hwmon sensors. No Prometheus metrics
|
||||
are available for iGPU nodes regardless of configuration.
|
||||
The integrated GPU driver does not expose hwmon sensors. No Prometheus metrics are
|
||||
available for iGPU nodes regardless of configuration.
|
||||
</div>
|
||||
</>
|
||||
),
|
||||
@@ -190,7 +202,9 @@ export default function MetricsPage() {
|
||||
const result = await fetchGpuMetrics();
|
||||
setMetrics(result);
|
||||
if (!result) {
|
||||
setFetchError('Could not reach Prometheus. Ensure kube-prometheus-stack is installed in the monitoring namespace.');
|
||||
setFetchError(
|
||||
'Could not reach Prometheus. Ensure kube-prometheus-stack is installed in the monitoring namespace.'
|
||||
);
|
||||
}
|
||||
} catch (e: unknown) {
|
||||
setFetchError(e instanceof Error ? e.message : String(e));
|
||||
@@ -211,7 +225,14 @@ export default function MetricsPage() {
|
||||
|
||||
return (
|
||||
<>
|
||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
|
||||
<div
|
||||
style={{
|
||||
display: 'flex',
|
||||
justifyContent: 'space-between',
|
||||
alignItems: 'center',
|
||||
marginBottom: '20px',
|
||||
}}
|
||||
>
|
||||
<SectionHeader title="Intel GPU — Metrics" />
|
||||
<button
|
||||
onClick={() => void doFetch()}
|
||||
@@ -246,7 +267,8 @@ export default function MetricsPage() {
|
||||
},
|
||||
{
|
||||
name: 'Checked services',
|
||||
value: 'kube-prometheus-stack-prometheus:9090, prometheus-operated:9090, prometheus:9090 (monitoring namespace)',
|
||||
value:
|
||||
'kube-prometheus-stack-prometheus:9090, prometheus-operated:9090, prometheus:9090 (monitoring namespace)',
|
||||
},
|
||||
]}
|
||||
/>
|
||||
@@ -261,17 +283,22 @@ export default function MetricsPage() {
|
||||
name: 'Status',
|
||||
value: (
|
||||
<StatusLabel status="warning">
|
||||
Prometheus reachable — no node_hwmon_chip_names{chip_name="i915"} found
|
||||
Prometheus reachable — no
|
||||
node_hwmon_chip_names{chip_name="i915"} found
|
||||
</StatusLabel>
|
||||
),
|
||||
},
|
||||
{
|
||||
name: 'GPU Nodes',
|
||||
value: gpuNodes.length > 0 ? gpuNodes.map(n => n.metadata.name).join(', ') : 'None detected',
|
||||
value:
|
||||
gpuNodes.length > 0
|
||||
? gpuNodes.map(n => n.metadata.name).join(', ')
|
||||
: 'None detected',
|
||||
},
|
||||
{
|
||||
name: 'Likely cause',
|
||||
value: 'node-exporter is not running on the GPU nodes, or the hwmon collector is disabled.',
|
||||
value:
|
||||
'node-exporter is not running on the GPU nodes, or the hwmon collector is disabled.',
|
||||
},
|
||||
]}
|
||||
/>
|
||||
@@ -301,7 +328,8 @@ export default function MetricsPage() {
|
||||
},
|
||||
{
|
||||
name: 'Query',
|
||||
value: 'rate(node_hwmon_energy_joule_total[5m]) joined with node_hwmon_chip_names{chip_name="i915"}',
|
||||
value:
|
||||
'rate(node_hwmon_energy_joule_total[5m]) joined with node_hwmon_chip_names{chip_name="i915"}',
|
||||
},
|
||||
]}
|
||||
/>
|
||||
|
||||
@@ -19,10 +19,8 @@ import {
|
||||
getGpuResources,
|
||||
getNodeGpuType,
|
||||
INTEL_GPU_RESOURCE,
|
||||
INTEL_GPU_RESOURCE_PREFIX,
|
||||
INTEL_GPU_XE_RESOURCE,
|
||||
isIntelGpuNode,
|
||||
isNodeReady,
|
||||
} from '../api/k8s';
|
||||
|
||||
interface NodeDetailSectionProps {
|
||||
@@ -40,9 +38,7 @@ export default function NodeDetailSection({ resource }: NodeDetailSectionProps)
|
||||
|
||||
// Extract the raw Kubernetes JSON — Headlamp KubeObject wraps it in jsonData
|
||||
const rawNode =
|
||||
resource.jsonData && typeof resource.jsonData === 'object'
|
||||
? resource.jsonData
|
||||
: resource;
|
||||
resource.jsonData && typeof resource.jsonData === 'object' ? resource.jsonData : resource;
|
||||
|
||||
// Only render for Node resources that have Intel GPU
|
||||
if (!isIntelGpuNode(rawNode)) return null;
|
||||
@@ -63,9 +59,7 @@ export default function NodeDetailSection({ resource }: NodeDetailSectionProps)
|
||||
const gpuType = getNodeGpuType(node as any);
|
||||
|
||||
// Find GPU pods scheduled on this node
|
||||
const podsOnNode = loading
|
||||
? []
|
||||
: gpuPods.filter(p => p.spec?.nodeName === nodeName);
|
||||
const podsOnNode = loading ? [] : gpuPods.filter(p => p.spec?.nodeName === nodeName);
|
||||
|
||||
if (Object.keys(capacity).length === 0 && Object.keys(allocatable).length === 0) {
|
||||
return null;
|
||||
@@ -81,18 +75,18 @@ export default function NodeDetailSection({ resource }: NodeDetailSectionProps)
|
||||
}
|
||||
}
|
||||
for (const pod of podsOnNode.filter(p => p.status?.phase === 'Running')) {
|
||||
const reqs = pod.spec?.containers?.flatMap(c =>
|
||||
Object.entries(c.resources?.requests ?? {}).filter(([k]) =>
|
||||
k === INTEL_GPU_RESOURCE || k === INTEL_GPU_XE_RESOURCE
|
||||
)
|
||||
) ?? [];
|
||||
const reqs =
|
||||
pod.spec?.containers?.flatMap(c =>
|
||||
Object.entries(c.resources?.requests ?? {}).filter(
|
||||
([k]) => k === INTEL_GPU_RESOURCE || k === INTEL_GPU_XE_RESOURCE
|
||||
)
|
||||
) ?? [];
|
||||
for (const [, val] of reqs) {
|
||||
gpuInUse += parseInt(val, 10) || 0;
|
||||
}
|
||||
}
|
||||
|
||||
const utilizationPct =
|
||||
gpuAllocatable > 0 ? Math.round((gpuInUse / gpuAllocatable) * 100) : 0;
|
||||
const utilizationPct = gpuAllocatable > 0 ? Math.round((gpuInUse / gpuAllocatable) * 100) : 0;
|
||||
const utilizationStatus: 'success' | 'warning' | 'error' =
|
||||
utilizationPct >= 90 ? 'error' : utilizationPct >= 70 ? 'warning' : 'success';
|
||||
|
||||
|
||||
@@ -23,7 +23,6 @@ import {
|
||||
getNodeGpuCount,
|
||||
getNodeGpuType,
|
||||
INTEL_GPU_RESOURCE,
|
||||
INTEL_GPU_RESOURCE_PREFIX,
|
||||
INTEL_GPU_XE_RESOURCE,
|
||||
IntelGpuNode,
|
||||
isNodeReady,
|
||||
@@ -33,13 +32,7 @@ import {
|
||||
// GPU allocation bar component
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function GpuAllocationBar({
|
||||
used,
|
||||
allocatable,
|
||||
}: {
|
||||
used: number;
|
||||
allocatable: number;
|
||||
}) {
|
||||
function GpuAllocationBar({ used, allocatable }: { used: number; allocatable: number }) {
|
||||
if (allocatable === 0) return <span>—</span>;
|
||||
const pct = Math.min(100, Math.round((used / allocatable) * 100));
|
||||
const color = pct >= 90 ? '#d32f2f' : pct >= 70 ? '#f57c00' : '#0071c5';
|
||||
@@ -105,21 +98,18 @@ function NodeDetailCard({
|
||||
name: 'GPU Type',
|
||||
value: formatGpuType(gpuType),
|
||||
},
|
||||
...(gpuCount > 0
|
||||
? [{ name: 'GPU Devices (i915/xe)', value: String(gpuCount) }]
|
||||
: []),
|
||||
...(gpuCount > 0 ? [{ name: 'GPU Devices (i915/xe)', value: String(gpuCount) }] : []),
|
||||
...Object.entries(capacityResources).map(([key, cap]) => {
|
||||
const alloc = parseInt(allocatableResources[key] ?? '0', 10);
|
||||
const total = parseInt(cap, 10);
|
||||
return {
|
||||
name: `${formatGpuResourceName(key)} (capacity)`,
|
||||
value: String(total),
|
||||
};
|
||||
}),
|
||||
...Object.entries(allocatableResources).map(([key, alloc]) => {
|
||||
...Object.entries(allocatableResources).map(([key, value]) => {
|
||||
return {
|
||||
name: `${formatGpuResourceName(key)} (allocatable)`,
|
||||
value: alloc ?? '0',
|
||||
value: value ?? '0',
|
||||
};
|
||||
}),
|
||||
{
|
||||
@@ -200,7 +190,14 @@ export default function NodesPage() {
|
||||
|
||||
return (
|
||||
<>
|
||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
|
||||
<div
|
||||
style={{
|
||||
display: 'flex',
|
||||
justifyContent: 'space-between',
|
||||
alignItems: 'center',
|
||||
marginBottom: '20px',
|
||||
}}
|
||||
>
|
||||
<SectionHeader title="Intel GPU — Nodes" />
|
||||
<button
|
||||
onClick={refresh}
|
||||
@@ -256,28 +253,28 @@ export default function NodesPage() {
|
||||
<SectionBox title="GPU Node Summary">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Node', getter: (d) => d.node.metadata.name },
|
||||
{ label: 'Node', getter: d => d.node.metadata.name },
|
||||
{
|
||||
label: 'Ready',
|
||||
getter: (d) => (
|
||||
getter: d => (
|
||||
<StatusLabel status={d.ready ? 'success' : 'error'}>
|
||||
{d.ready ? 'Ready' : 'Not Ready'}
|
||||
</StatusLabel>
|
||||
),
|
||||
},
|
||||
{ label: 'GPU Type', getter: (d) => formatGpuType(d.gpuType) },
|
||||
{ label: 'GPU Devices', getter: (d) => String(d.gpuCount || '—') },
|
||||
{ label: 'GPU Type', getter: d => formatGpuType(d.gpuType) },
|
||||
{ label: 'GPU Devices', getter: d => String(d.gpuCount || '—') },
|
||||
{
|
||||
label: 'Allocation',
|
||||
getter: (d) => (
|
||||
getter: d => (
|
||||
<GpuAllocationBar
|
||||
used={d.podsOnNode.length}
|
||||
allocatable={d.totalAllocatable || d.gpuCount}
|
||||
/>
|
||||
),
|
||||
},
|
||||
{ label: 'GPU Pods', getter: (d) => String(d.podsOnNode.length) },
|
||||
{ label: 'Age', getter: (d) => formatAge(d.node.metadata.creationTimestamp) },
|
||||
{ label: 'GPU Pods', getter: d => String(d.podsOnNode.length) },
|
||||
{ label: 'Age', getter: d => formatAge(d.node.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={tableData}
|
||||
/>
|
||||
|
||||
@@ -18,7 +18,6 @@ import React from 'react';
|
||||
import { useIntelGpuContext } from '../api/IntelGpuDataContext';
|
||||
import {
|
||||
formatAge,
|
||||
formatGpuType,
|
||||
getNodeGpuCount,
|
||||
getNodeGpuType,
|
||||
getPodGpuRequests,
|
||||
@@ -42,7 +41,8 @@ function gpuTypeChartData(
|
||||
): Array<{ name: string; value: number; fill: string }> {
|
||||
const data = [];
|
||||
if (discreteCount > 0) data.push({ name: 'Discrete', value: discreteCount, fill: '#0071c5' });
|
||||
if (integratedCount > 0) data.push({ name: 'Integrated', value: integratedCount, fill: '#60a4dc' });
|
||||
if (integratedCount > 0)
|
||||
data.push({ name: 'Integrated', value: integratedCount, fill: '#60a4dc' });
|
||||
if (unknownCount > 0) data.push({ name: 'Unknown', value: unknownCount, fill: '#9e9e9e' });
|
||||
return data;
|
||||
}
|
||||
@@ -113,9 +113,7 @@ export default function OverviewPage() {
|
||||
}
|
||||
|
||||
const gpuUtilizationPct =
|
||||
totalCapacityGpus > 0
|
||||
? Math.round((totalAllocatedGpus / totalCapacityGpus) * 100)
|
||||
: 0;
|
||||
totalCapacityGpus > 0 ? Math.round((totalAllocatedGpus / totalCapacityGpus) * 100) : 0;
|
||||
|
||||
const chartData = gpuTypeChartData(discreteCount, integratedCount, unknownCount);
|
||||
const totalGpuNodes = gpuNodes.length;
|
||||
@@ -133,7 +131,14 @@ export default function OverviewPage() {
|
||||
|
||||
return (
|
||||
<>
|
||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
|
||||
<div
|
||||
style={{
|
||||
display: 'flex',
|
||||
justifyContent: 'space-between',
|
||||
alignItems: 'center',
|
||||
marginBottom: '20px',
|
||||
}}
|
||||
>
|
||||
<SectionHeader title="Intel GPU — Overview" />
|
||||
<button
|
||||
onClick={refresh}
|
||||
@@ -218,26 +223,25 @@ export default function OverviewPage() {
|
||||
<SectionBox title="Device Plugin Status">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Name', getter: (p) => p.metadata.name },
|
||||
{ label: 'Name', getter: p => p.metadata.name },
|
||||
{
|
||||
label: 'Status',
|
||||
getter: (p) => (
|
||||
<StatusLabel status={pluginStatusToStatus(p)}>
|
||||
{pluginStatusText(p)}
|
||||
</StatusLabel>
|
||||
getter: p => (
|
||||
<StatusLabel status={pluginStatusToStatus(p)}>{pluginStatusText(p)}</StatusLabel>
|
||||
),
|
||||
},
|
||||
{
|
||||
label: 'Monitoring',
|
||||
getter: (p) => p.spec.enableMonitoring ? (
|
||||
<StatusLabel status="success">Enabled</StatusLabel>
|
||||
) : (
|
||||
<StatusLabel status="warning">Disabled</StatusLabel>
|
||||
),
|
||||
getter: p =>
|
||||
p.spec.enableMonitoring ? (
|
||||
<StatusLabel status="success">Enabled</StatusLabel>
|
||||
) : (
|
||||
<StatusLabel status="warning">Disabled</StatusLabel>
|
||||
),
|
||||
},
|
||||
{ label: 'Shared/Node', getter: (p) => String(p.spec.sharedDevNum ?? 1) },
|
||||
{ label: 'Policy', getter: (p) => p.spec.preferredAllocationPolicy ?? '—' },
|
||||
{ label: 'Age', getter: (p) => formatAge(p.metadata.creationTimestamp) },
|
||||
{ label: 'Shared/Node', getter: p => String(p.spec.sharedDevNum ?? 1) },
|
||||
{ label: 'Policy', getter: p => p.spec.preferredAllocationPolicy ?? '—' },
|
||||
{ label: 'Age', getter: p => formatAge(p.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={devicePlugins}
|
||||
/>
|
||||
@@ -249,18 +253,18 @@ export default function OverviewPage() {
|
||||
<SectionBox title="Plugin Daemon Pods">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Name', getter: (p) => p.metadata.name },
|
||||
{ label: 'Namespace', getter: (p) => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: (p) => p.spec?.nodeName ?? '—' },
|
||||
{ label: 'Name', getter: p => p.metadata.name },
|
||||
{ label: 'Namespace', getter: p => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: p => p.spec?.nodeName ?? '—' },
|
||||
{
|
||||
label: 'Status',
|
||||
getter: (p) => (
|
||||
getter: p => (
|
||||
<StatusLabel status={isPodReady(p) ? 'success' : 'warning'}>
|
||||
{isPodReady(p) ? 'Ready' : p.status?.phase ?? 'Unknown'}
|
||||
</StatusLabel>
|
||||
),
|
||||
},
|
||||
{ label: 'Age', getter: (p) => formatAge(p.metadata.creationTimestamp) },
|
||||
{ label: 'Age', getter: p => formatAge(p.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={pluginPods}
|
||||
/>
|
||||
@@ -271,7 +275,13 @@ export default function OverviewPage() {
|
||||
<SectionBox title="GPU Nodes">
|
||||
{totalGpuNodes > 0 && chartData.length > 0 && (
|
||||
<div style={{ marginBottom: '16px' }}>
|
||||
<div style={{ marginBottom: '8px', fontSize: '14px', color: 'var(--mui-palette-text-secondary)' }}>
|
||||
<div
|
||||
style={{
|
||||
marginBottom: '8px',
|
||||
fontSize: '14px',
|
||||
color: 'var(--mui-palette-text-secondary)',
|
||||
}}
|
||||
>
|
||||
GPU Type Distribution
|
||||
</div>
|
||||
<PercentageBar data={chartData} total={totalGpuNodes} />
|
||||
@@ -288,9 +298,15 @@ export default function OverviewPage() {
|
||||
),
|
||||
},
|
||||
{ name: 'Ready Nodes', value: String(readyNodeCount) },
|
||||
...(discreteCount > 0 ? [{ name: 'Discrete GPU Nodes', value: String(discreteCount) }] : []),
|
||||
...(integratedCount > 0 ? [{ name: 'Integrated GPU Nodes', value: String(integratedCount) }] : []),
|
||||
...(totalGpuCount > 0 ? [{ name: 'Total GPU Devices', value: String(totalGpuCount) }] : []),
|
||||
...(discreteCount > 0
|
||||
? [{ name: 'Discrete GPU Nodes', value: String(discreteCount) }]
|
||||
: []),
|
||||
...(integratedCount > 0
|
||||
? [{ name: 'Integrated GPU Nodes', value: String(integratedCount) }]
|
||||
: []),
|
||||
...(totalGpuCount > 0
|
||||
? [{ name: 'Total GPU Devices', value: String(totalGpuCount) }]
|
||||
: []),
|
||||
]}
|
||||
/>
|
||||
</SectionBox>
|
||||
@@ -299,13 +315,23 @@ export default function OverviewPage() {
|
||||
{totalCapacityGpus > 0 && (
|
||||
<SectionBox title="GPU Allocation">
|
||||
<div style={{ marginBottom: '16px' }}>
|
||||
<div style={{ marginBottom: '8px', fontSize: '14px', color: 'var(--mui-palette-text-secondary)' }}>
|
||||
<div
|
||||
style={{
|
||||
marginBottom: '8px',
|
||||
fontSize: '14px',
|
||||
color: 'var(--mui-palette-text-secondary)',
|
||||
}}
|
||||
>
|
||||
GPU Utilization ({gpuUtilizationPct}%)
|
||||
</div>
|
||||
<PercentageBar
|
||||
data={[
|
||||
{ name: 'In Use', value: totalAllocatedGpus, fill: '#0071c5' },
|
||||
{ name: 'Available', value: totalAllocatableGpus - totalAllocatedGpus, fill: '#e0e0e0' },
|
||||
{
|
||||
name: 'Available',
|
||||
value: totalAllocatableGpus - totalAllocatedGpus,
|
||||
fill: '#e0e0e0',
|
||||
},
|
||||
]}
|
||||
total={totalAllocatableGpus}
|
||||
/>
|
||||
@@ -336,13 +362,28 @@ export default function OverviewPage() {
|
||||
rows={[
|
||||
{ name: 'Total GPU Pods', value: String(gpuPods.length) },
|
||||
...(podPhaseCounts.Running > 0
|
||||
? [{ name: 'Running', value: <StatusLabel status="success">{podPhaseCounts.Running}</StatusLabel> }]
|
||||
? [
|
||||
{
|
||||
name: 'Running',
|
||||
value: <StatusLabel status="success">{podPhaseCounts.Running}</StatusLabel>,
|
||||
},
|
||||
]
|
||||
: []),
|
||||
...(podPhaseCounts.Pending > 0
|
||||
? [{ name: 'Pending', value: <StatusLabel status="warning">{podPhaseCounts.Pending}</StatusLabel> }]
|
||||
? [
|
||||
{
|
||||
name: 'Pending',
|
||||
value: <StatusLabel status="warning">{podPhaseCounts.Pending}</StatusLabel>,
|
||||
},
|
||||
]
|
||||
: []),
|
||||
...(podPhaseCounts.Failed > 0
|
||||
? [{ name: 'Failed', value: <StatusLabel status="error">{podPhaseCounts.Failed}</StatusLabel> }]
|
||||
? [
|
||||
{
|
||||
name: 'Failed',
|
||||
value: <StatusLabel status="error">{podPhaseCounts.Failed}</StatusLabel>,
|
||||
},
|
||||
]
|
||||
: []),
|
||||
]}
|
||||
/>
|
||||
@@ -353,12 +394,12 @@ export default function OverviewPage() {
|
||||
<SectionBox title="Active GPU Pods">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Name', getter: (p) => p.metadata.name },
|
||||
{ label: 'Namespace', getter: (p) => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: (p) => p.spec?.nodeName ?? '—' },
|
||||
{ label: 'Name', getter: p => p.metadata.name },
|
||||
{ label: 'Namespace', getter: p => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: p => p.spec?.nodeName ?? '—' },
|
||||
{
|
||||
label: 'GPU Request',
|
||||
getter: (p) => {
|
||||
getter: p => {
|
||||
const reqs = getPodGpuRequests(p);
|
||||
const parts: string[] = [];
|
||||
for (const [key, val] of Object.entries(reqs)) {
|
||||
@@ -368,7 +409,7 @@ export default function OverviewPage() {
|
||||
return parts.join(', ') || '—';
|
||||
},
|
||||
},
|
||||
{ label: 'Age', getter: (p) => formatAge(p.metadata.creationTimestamp) },
|
||||
{ label: 'Age', getter: p => formatAge(p.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={gpuPods.filter(p => p.status?.phase === 'Running').slice(0, 10)}
|
||||
/>
|
||||
|
||||
@@ -25,9 +25,7 @@ interface PodDetailSectionProps {
|
||||
export default function PodDetailSection({ resource }: PodDetailSectionProps) {
|
||||
// Extract raw Kubernetes JSON
|
||||
const rawPod =
|
||||
resource.jsonData && typeof resource.jsonData === 'object'
|
||||
? resource.jsonData
|
||||
: resource;
|
||||
resource.jsonData && typeof resource.jsonData === 'object' ? resource.jsonData : resource;
|
||||
|
||||
// Only render for pods that request Intel GPU resources
|
||||
if (!isGpuRequestingPod(rawPod)) return null;
|
||||
@@ -98,9 +96,7 @@ export default function PodDetailSection({ resource }: PodDetailSectionProps) {
|
||||
rows={[
|
||||
{
|
||||
name: 'Phase',
|
||||
value: (
|
||||
<StatusLabel status={phaseStatus}>{phase ?? 'Unknown'}</StatusLabel>
|
||||
),
|
||||
value: <StatusLabel status={phaseStatus}>{phase ?? 'Unknown'}</StatusLabel>,
|
||||
},
|
||||
{
|
||||
name: 'Scheduled Node',
|
||||
|
||||
+55
-30
@@ -17,11 +17,10 @@ import { useIntelGpuContext } from '../api/IntelGpuDataContext';
|
||||
import {
|
||||
formatAge,
|
||||
formatGpuResourceName,
|
||||
IntelGpuPod,
|
||||
INTEL_GPU_RESOURCE_PREFIX,
|
||||
isPodReady,
|
||||
getPodGpuRequests,
|
||||
getPodRestarts,
|
||||
INTEL_GPU_RESOURCE_PREFIX,
|
||||
IntelGpuPod,
|
||||
} from '../api/k8s';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -30,11 +29,16 @@ import {
|
||||
|
||||
function phaseToStatus(phase: string | undefined): 'success' | 'warning' | 'error' {
|
||||
switch (phase) {
|
||||
case 'Running': return 'success';
|
||||
case 'Succeeded': return 'success';
|
||||
case 'Pending': return 'warning';
|
||||
case 'Failed': return 'error';
|
||||
default: return 'warning';
|
||||
case 'Running':
|
||||
return 'success';
|
||||
case 'Succeeded':
|
||||
return 'success';
|
||||
case 'Pending':
|
||||
return 'warning';
|
||||
case 'Failed':
|
||||
return 'error';
|
||||
default:
|
||||
return 'warning';
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,13 +102,17 @@ export default function PodsPage() {
|
||||
const running = gpuPods.filter(p => p.status?.phase === 'Running');
|
||||
const pending = gpuPods.filter(p => p.status?.phase === 'Pending');
|
||||
const failed = gpuPods.filter(p => p.status?.phase === 'Failed');
|
||||
const other = gpuPods.filter(
|
||||
p => !['Running', 'Pending', 'Failed'].includes(p.status?.phase ?? '')
|
||||
);
|
||||
|
||||
return (
|
||||
<>
|
||||
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '20px' }}>
|
||||
<div
|
||||
style={{
|
||||
display: 'flex',
|
||||
justifyContent: 'space-between',
|
||||
alignItems: 'center',
|
||||
marginBottom: '20px',
|
||||
}}
|
||||
>
|
||||
<SectionHeader title="Intel GPU — Pods" />
|
||||
<button
|
||||
onClick={refresh}
|
||||
@@ -161,13 +169,28 @@ export default function PodsPage() {
|
||||
rows={[
|
||||
{ name: 'Total GPU Pods', value: String(gpuPods.length) },
|
||||
...(running.length > 0
|
||||
? [{ name: 'Running', value: <StatusLabel status="success">{running.length}</StatusLabel> }]
|
||||
? [
|
||||
{
|
||||
name: 'Running',
|
||||
value: <StatusLabel status="success">{running.length}</StatusLabel>,
|
||||
},
|
||||
]
|
||||
: []),
|
||||
...(pending.length > 0
|
||||
? [{ name: 'Pending', value: <StatusLabel status="warning">{pending.length}</StatusLabel> }]
|
||||
? [
|
||||
{
|
||||
name: 'Pending',
|
||||
value: <StatusLabel status="warning">{pending.length}</StatusLabel>,
|
||||
},
|
||||
]
|
||||
: []),
|
||||
...(failed.length > 0
|
||||
? [{ name: 'Failed', value: <StatusLabel status="error">{failed.length}</StatusLabel> }]
|
||||
? [
|
||||
{
|
||||
name: 'Failed',
|
||||
value: <StatusLabel status="error">{failed.length}</StatusLabel>,
|
||||
},
|
||||
]
|
||||
: []),
|
||||
]}
|
||||
/>
|
||||
@@ -179,12 +202,12 @@ export default function PodsPage() {
|
||||
<SectionBox title="All GPU Pods">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Name', getter: (p) => p.metadata.name },
|
||||
{ label: 'Namespace', getter: (p) => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: (p) => p.spec?.nodeName ?? '—' },
|
||||
{ label: 'Name', getter: p => p.metadata.name },
|
||||
{ label: 'Namespace', getter: p => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Node', getter: p => p.spec?.nodeName ?? '—' },
|
||||
{
|
||||
label: 'Phase',
|
||||
getter: (p) => (
|
||||
getter: p => (
|
||||
<StatusLabel status={phaseToStatus(p.status?.phase)}>
|
||||
{p.status?.phase ?? 'Unknown'}
|
||||
</StatusLabel>
|
||||
@@ -192,11 +215,11 @@ export default function PodsPage() {
|
||||
},
|
||||
{
|
||||
label: 'GPU Resources',
|
||||
getter: (p) => <GpuContainerList pod={p} />,
|
||||
getter: p => <GpuContainerList pod={p} />,
|
||||
},
|
||||
{
|
||||
label: 'Restarts',
|
||||
getter: (p) => {
|
||||
getter: p => {
|
||||
const restarts = getPodRestarts(p);
|
||||
return restarts > 0 ? (
|
||||
<StatusLabel status="warning">{restarts}</StatusLabel>
|
||||
@@ -205,7 +228,7 @@ export default function PodsPage() {
|
||||
);
|
||||
},
|
||||
},
|
||||
{ label: 'Age', getter: (p) => formatAge(p.metadata.creationTimestamp) },
|
||||
{ label: 'Age', getter: p => formatAge(p.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={gpuPods}
|
||||
/>
|
||||
@@ -217,25 +240,27 @@ export default function PodsPage() {
|
||||
<SectionBox title="Attention: Pending GPU Pods">
|
||||
<SimpleTable
|
||||
columns={[
|
||||
{ label: 'Name', getter: (p) => p.metadata.name },
|
||||
{ label: 'Namespace', getter: (p) => p.metadata.namespace ?? '—' },
|
||||
{ label: 'Name', getter: p => p.metadata.name },
|
||||
{ label: 'Namespace', getter: p => p.metadata.namespace ?? '—' },
|
||||
{
|
||||
label: 'GPU Resources',
|
||||
getter: (p) => {
|
||||
getter: p => {
|
||||
const reqs = getPodGpuRequests(p);
|
||||
return Object.entries(reqs)
|
||||
.map(([k, v]) => `${formatGpuResourceName(k)}: ${v}`)
|
||||
.join(', ') || '—';
|
||||
return (
|
||||
Object.entries(reqs)
|
||||
.map(([k, v]) => `${formatGpuResourceName(k)}: ${v}`)
|
||||
.join(', ') || '—'
|
||||
);
|
||||
},
|
||||
},
|
||||
{
|
||||
label: 'Waiting Reason',
|
||||
getter: (p) => {
|
||||
getter: p => {
|
||||
const reason = p.status?.containerStatuses?.[0]?.state?.waiting?.reason;
|
||||
return reason ?? '—';
|
||||
},
|
||||
},
|
||||
{ label: 'Age', getter: (p) => formatAge(p.metadata.creationTimestamp) },
|
||||
{ label: 'Age', getter: p => formatAge(p.metadata.creationTimestamp) },
|
||||
]}
|
||||
data={pending}
|
||||
/>
|
||||
|
||||
@@ -11,12 +11,7 @@
|
||||
|
||||
import { StatusLabel } from '@kinvolk/headlamp-plugin/lib/CommonComponents';
|
||||
import React from 'react';
|
||||
import {
|
||||
formatGpuType,
|
||||
getNodeGpuCount,
|
||||
getNodeGpuType,
|
||||
isIntelGpuNode,
|
||||
} from '../../api/k8s';
|
||||
import { formatGpuType, getNodeGpuCount, getNodeGpuType, isIntelGpuNode } from '../../api/k8s';
|
||||
|
||||
/** Build GPU columns to append to the native Nodes table. */
|
||||
export function buildNodeGpuColumns() {
|
||||
@@ -33,11 +28,7 @@ export function buildNodeGpuColumns() {
|
||||
if (!isIntelGpuNode(raw)) return '—';
|
||||
const node = raw as Parameters<typeof getNodeGpuType>[0];
|
||||
const type = getNodeGpuType(node);
|
||||
return (
|
||||
<StatusLabel status="success">
|
||||
{formatGpuType(type)}
|
||||
</StatusLabel>
|
||||
);
|
||||
return <StatusLabel status="success">{formatGpuType(type)}</StatusLabel>;
|
||||
},
|
||||
},
|
||||
{
|
||||
|
||||
@@ -180,4 +180,3 @@ registerResourceTableColumnsProcessor(({ id, columns }) => {
|
||||
}
|
||||
return columns;
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user