Files
paperclip/server/src/services/gitea-skills.ts
T
Chris Farhood 33ab4f8cdd fork: address PR #19 review findings for Gitea skill support
- Fix GitHub Enterprise regression: dispatcher now probes for Gitea only
  on non-github.com hosts and falls back to the GitHub path for unknown
  hosts, preserving GHE support that the earlier strict github.com match
  broke.
- Refactor readUrlSkillImports into a flat dispatcher with a sibling
  readGitHubUrlSkillImports helper, mirroring readGiteaUrlSkillImports.
- Add SSRF guard (isPrivateOrLoopbackHost + assertPublicHost) in
  gitea-fetch; short-circuit probeGiteaHost and reject parseGiteaSourceUrl
  for loopback / RFC1918 / link-local literal IPs.
- Throw on fetchGiteaTreeBlobPaths cap-hit instead of silently returning a
  partial blob listing (would hide SKILL.md files).
- Validate non-empty repo in parseGiteaSourceUrl after .git strip.
- Remove dead resolveGiteaCommitSha + GiteaCommitResponse (unused since
  the branches-endpoint follow-up).
- Tests updated and extended.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-09 23:07:51 -04:00

301 lines
8.7 KiB
TypeScript

import path from "node:path";
import { unprocessable } from "../errors.js";
import {
assertPublicHost,
giteaApiBase,
giteaFetch,
getGiteaHostProbe,
giteaHostProbeCache,
isPrivateOrLoopbackHost,
resolveRawGiteaUrl,
resolveRawGiteaUrlLegacy,
setGiteaHostProbe,
} from "./gitea-fetch.js";
export {
assertPublicHost,
giteaApiBase,
giteaHostProbeCache,
isPrivateOrLoopbackHost,
resolveRawGiteaUrl,
resolveRawGiteaUrlLegacy,
setGiteaHostProbe,
getGiteaHostProbe,
};
// Milliseconds probeGiteaHost waits before aborting its version-endpoint request.
const PROBE_TIMEOUT_MS = 3000;
// Page size requested for each tree-listing call in fetchGiteaTreeBlobPaths; also
// used there to compute the entry count reported when the page cap is hit.
const GITEA_TREE_PAGE_LIMIT = 1000;
/**
 * Components of a Gitea/Forgejo repository URL, as produced by
 * parseGiteaSourceUrl.
 */
export type GiteaSourceUrl = {
// Host name taken from the parsed URL (`URL.hostname`, so any port is not included).
hostname: string;
// First path segment of the URL.
owner: string;
// Second path segment of the URL with a trailing ".git" removed.
repo: string;
// Ref segment of a `/tree/{ref}/…` or `/blob/{ref}/…` URL; "main" when the URL names none.
ref: string;
// Path after the ref for tree URLs; for blob URLs the POSIX dirname of filePath;
// "" for bare repo URLs.
basePath: string;
// Path after the ref for blob URLs; null for every other URL form.
filePath: string | null;
// true for `/tree/{ref}/…` and `/blob/{ref}/…` URLs, false for bare repo URLs.
explicitRef: boolean;
};
/**
 * Shape assumed for the JSON returned by fetchGiteaBranch. Every field is
 * optional because the payload is cast to this type, not validated.
 */
export type GiteaBranchResponse = {
name?: string;
// `id` is read by resolveGiteaPinnedRef and used as the pinned ref.
commit?: { id?: string; url?: string };
};
/**
 * Shape assumed for the repository JSON fetched by resolveGiteaDefaultBranch.
 * Only `default_branch` is read; it is optional because the payload is cast,
 * not validated.
 */
export type GiteaRepoResponse = {
default_branch?: string;
};
/**
 * One entry of a tree listing. fetchGiteaTreeBlobPaths reads only `path` and
 * `type`; the remaining fields are declared but not read in this file.
 */
export type GiteaTreeEntry = {
// Collected by fetchGiteaTreeBlobPaths when `type` is "blob".
path?: string;
// fetchGiteaTreeBlobPaths keeps entries whose type is exactly "blob".
type?: string;
mode?: string;
sha?: string;
size?: number;
url?: string;
};
/**
 * Shape assumed for one page of the tree listing fetched by
 * fetchGiteaTreeBlobPaths. Fields are optional because the payload is cast,
 * not validated.
 */
export type GiteaTreeResponse = {
sha?: string;
// Entries on this page; a non-array value is treated as an empty page.
tree?: GiteaTreeEntry[];
// Truthy while more pages remain; drives the pagination loop.
truncated?: boolean;
};
/**
 * Normalise an arbitrary value to a trimmed, non-empty string.
 *
 * @param value - Any value; only strings are considered.
 * @returns The trimmed text, or `null` when `value` is not a string or is
 *   empty / whitespace-only.
 */
function asString(value: unknown): string | null {
  if (typeof value === "string") {
    const text = value.trim();
    if (text !== "") {
      return text;
    }
  }
  return null;
}
/**
 * Type guard for "some non-null object that is not an array".
 *
 * Class instances (e.g. `Date`) also pass; the check is only about ruling
 * out primitives, `null`, functions and arrays before property access.
 */
function isPlainRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
/**
 * Parse a Gitea/Forgejo HTTPS repo URL into its components.
 * Mirrors parseGitHubSourceUrl (server/src/services/company-skills.ts:634-660).
 *
 * Accepted forms:
 *   https://{host}/{owner}/{repo}
 *   https://{host}/{owner}/{repo}.git
 *   https://{host}/{owner}/{repo}/tree/{ref}/{basePath...}
 *   https://{host}/{owner}/{repo}/blob/{ref}/{filePath}
 *
 * `explicitRef` is only set when the URL actually names a ref. A URL that
 * stops at `/tree` or `/blob` falls back to the placeholder ref "main" and
 * is reported as non-explicit, so callers can still resolve the repository's
 * real default branch.
 *
 * NOTE(review): only `URL.hostname` is carried forward, so an explicit port
 * in the URL is not preserved — confirm that is intended.
 *
 * @param rawUrl - URL text supplied by the user.
 * @returns The parsed components.
 * @throws unprocessable error when the URL is malformed, is not HTTPS, or has
 *   no owner/repo; also propagates whatever assertPublicHost throws for the
 *   host name.
 */
export function parseGiteaSourceUrl(rawUrl: string): GiteaSourceUrl {
  let url: URL;
  try {
    url = new URL(rawUrl);
  } catch {
    throw unprocessable("Invalid Gitea URL");
  }
  if (url.protocol !== "https:") {
    throw unprocessable("Gitea source URL must use HTTPS");
  }

  // Empty segments (leading, trailing or doubled slashes) are dropped.
  const [owner, repoSegment, kind, refSegment, ...rest] = url.pathname
    .split("/")
    .filter(Boolean);
  if (owner === undefined || repoSegment === undefined) {
    throw unprocessable("Invalid Gitea URL");
  }
  // A segment that is just ".git" leaves an empty repo name after stripping.
  const repo = repoSegment.replace(/\.git$/i, "");
  if (!repo) {
    throw unprocessable("Invalid Gitea URL: owner and repo are required");
  }

  let ref = "main";
  let basePath = "";
  let filePath: string | null = null;
  let explicitRef = false;
  if (kind === "tree" || kind === "blob") {
    if (refSegment !== undefined) {
      ref = refSegment;
      explicitRef = true;
    }
    const remainder = rest.join("/");
    if (kind === "tree") {
      basePath = remainder;
    } else {
      filePath = remainder;
      basePath = remainder ? path.posix.dirname(remainder) : "";
    }
  }

  // SSRF guard: runs last so malformed URLs keep reporting their own errors.
  assertPublicHost(url.hostname);
  return { hostname: url.hostname, owner, repo, ref, basePath, filePath, explicitRef };
}
/**
 * Probe a hostname to determine if it hosts a Gitea/Forgejo instance.
 *
 * GETs `https://{host}/api/v1/version` with a PROBE_TIMEOUT_MS timeout and
 * treats the host as Gitea only when the response is OK and its JSON body is
 * an object with a string `version`. The outcome is stored through
 * setGiteaHostProbe under the lower-cased host name and reused on later calls.
 *
 * Returns false without contacting the host when isPrivateOrLoopbackHost
 * flags it, to avoid being used as an SSRF probe.
 *
 * NOTE(review): network errors and timeouts are cached as `false` exactly
 * like a definitive "not Gitea" answer, so one transient failure sticks until
 * the cache entry is replaced. Confirm this is acceptable.
 *
 * @param hostname - Host to probe; matched case-insensitively.
 * @returns `true` when the host answered like a Gitea/Forgejo API.
 */
export async function probeGiteaHost(hostname: string): Promise<boolean> {
  const key = hostname.toLowerCase();
  const cached = getGiteaHostProbe(key);
  if (cached !== undefined) return cached;

  if (isPrivateOrLoopbackHost(key)) {
    setGiteaHostProbe(key, false);
    return false;
  }

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), PROBE_TIMEOUT_MS);
  let result = false;
  try {
    const response = await fetch(`https://${key}/api/v1/version`, {
      method: "GET",
      signal: controller.signal,
      headers: { accept: "application/json" },
    });
    if (response.ok) {
      // A body that is not valid JSON counts as "not gitea", not as an error.
      const data: unknown = await response.json().catch(() => null);
      result = isPlainRecord(data) && typeof data.version === "string";
    } else {
      // Nothing will read this body; cancel it so the underlying connection
      // is released now instead of whenever the response is garbage-collected.
      await response.body?.cancel();
    }
  } catch {
    // network error, abort, parse error — all treated as "not gitea"
  } finally {
    clearTimeout(timer);
  }

  setGiteaHostProbe(key, result);
  return result;
}
/**
 * Look up a repository's default branch through the API.
 *
 * @param owner - Repository owner, interpolated into the URL as-is.
 * @param repo - Repository name, interpolated into the URL as-is.
 * @param apiBase - API base URL the `/repos/...` path is appended to.
 * @returns The trimmed `default_branch` from the response, or "main" when the
 *   field is missing, not a string, or blank.
 * @throws Whatever fetchGiteaJson throws for a failed request.
 */
export async function resolveGiteaDefaultBranch(
  owner: string,
  repo: string,
  apiBase: string,
): Promise<string> {
  const repoUrl = `${apiBase}/repos/${owner}/${repo}`;
  const repoInfo = await fetchGiteaJson<GiteaRepoResponse>(repoUrl);
  const defaultBranch = asString(repoInfo.default_branch);
  if (defaultBranch === null) {
    return "main";
  }
  return defaultBranch;
}
/**
 * Turn a parsed Gitea URL into a commit SHA to pin plus a ref to track.
 * Mirrors resolveGitHubPinnedRef (server/src/services/company-skills.ts:662-676).
 *
 * - A ref that is already a 40-character hex SHA is returned unchanged as the
 *   pinned ref; it doubles as the tracking ref only when the URL named it.
 * - Otherwise the tracking ref is the URL's ref when explicit, else the
 *   repository's default branch, and the pinned ref is the commit id reported
 *   for that ref by fetchGiteaBranch.
 *
 * @param parsed - Output of parseGiteaSourceUrl.
 * @returns `pinnedRef` (commit SHA) and `trackingRef` (ref name, or null).
 * @throws unprocessable error when the branch lookup yields no commit id;
 *   also propagates failures from the underlying API calls.
 */
export async function resolveGiteaPinnedRef(parsed: GiteaSourceUrl): Promise<{
  pinnedRef: string;
  trackingRef: string | null;
}> {
  const { hostname, owner, repo, ref, explicitRef } = parsed;

  const isFullSha = /^[0-9a-f]{40}$/i.test(ref.trim());
  if (isFullSha) {
    return { pinnedRef: ref, trackingRef: explicitRef ? ref : null };
  }

  const apiBase = giteaApiBase(hostname);
  let trackingRef: string;
  if (explicitRef) {
    trackingRef = ref;
  } else {
    trackingRef = await resolveGiteaDefaultBranch(owner, repo, apiBase);
  }

  // Resolved through the branches endpoint rather than
  // /repos/{o}/{r}/commits/{ref}: per the original author's note the commits
  // endpoint only resolves SHAs and 404s for a branch name.
  // NOTE(review): that note also says the branches endpoint accepts tags —
  // confirm against the Gitea API before relying on it.
  const { commit } = await fetchGiteaBranch(apiBase, owner, repo, trackingRef);
  const pinnedRef = asString(commit?.id);
  if (pinnedRef === null) {
    throw unprocessable(`Failed to resolve Gitea ref ${trackingRef}`);
  }
  return { pinnedRef, trackingRef };
}
/**
 * Fetch the full list of blob paths in a repo tree at a given ref.
 *
 * Requests the recursive tree and keeps following pages
 * (`?recursive=true&per_page=1000&page=N`) while the response reports
 * `truncated`. The git-trees endpoint names its page-size parameter
 * `per_page`, not `limit`.
 *
 * @param apiBase - API base URL the `/repos/...` path is appended to.
 * @param owner - Repository owner, interpolated into the URL as-is.
 * @param repo - Repository name, interpolated into the URL as-is.
 * @param ref - Tree ref, interpolated into the URL as-is.
 * @returns Paths of every entry whose `type` is "blob", in response order.
 * @throws unprocessable error when the tree is still truncated after the
 *   page cap; also propagates failures from fetchGiteaJson.
 */
export async function fetchGiteaTreeBlobPaths(
  apiBase: string,
  owner: string,
  repo: string,
  ref: string,
): Promise<string[]> {
  // hard cap so a misconfigured host can't make us loop forever
  const MAX_PAGES = 50;
  const firstPageUrl =
    `${apiBase}/repos/${owner}/${repo}/git/trees/${ref}` +
    `?recursive=true&per_page=${GITEA_TREE_PAGE_LIMIT}`;

  const blobPaths: string[] = [];
  for (let page = 1; page <= MAX_PAGES; page += 1) {
    // The first request carries no page parameter and relies on the server default.
    const url = page === 1 ? firstPageUrl : `${firstPageUrl}&page=${page}`;
    const data = await fetchGiteaJson<GiteaTreeResponse>(url);
    const entries = Array.isArray(data.tree) ? data.tree : [];
    for (const entry of entries) {
      if (entry.type === "blob" && typeof entry.path === "string") {
        blobPaths.push(entry.path);
      }
    }
    if (!data.truncated) {
      return blobPaths;
    }
  }

  // Tree still truncated at the page cap — refuse rather than silently
  // import a partial skill listing, which would hide SKILL.md files.
  throw unprocessable(
    `Gitea repo tree for ${owner}/${repo}@${ref} exceeds ${MAX_PAGES * GITEA_TREE_PAGE_LIMIT} entries; refusing to import a partial listing.`,
  );
}
/**
 * Download a raw file from a Gitea/Forgejo repo as text.
 *
 * The URL built by resolveRawGiteaUrl is tried first (the modern
 * /raw/branch/{ref}/{path} form); only a 404 from it triggers a second
 * attempt with the URL from resolveRawGiteaUrlLegacy (the legacy
 * /raw/{ref}/{path} form).
 *
 * @returns The response body of whichever request succeeded.
 * @throws unprocessable error naming the URL and status when the first
 *   request fails with anything other than 404, or when the legacy request
 *   fails.
 */
export async function fetchGiteaText(
  hostname: string,
  owner: string,
  repo: string,
  ref: string,
  filePath: string,
): Promise<string> {
  // Fresh options object per request, as both attempts send the same header.
  const plainTextRequest = () => ({ headers: { accept: "text/plain" } });

  const modernUrl = resolveRawGiteaUrl(hostname, owner, repo, ref, filePath);
  const modern = await giteaFetch(modernUrl, plainTextRequest());
  if (modern.ok) {
    return modern.text();
  }
  if (modern.status !== 404) {
    throw unprocessable(`Failed to fetch ${modernUrl}: ${modern.status}`);
  }

  const legacyUrl = resolveRawGiteaUrlLegacy(hostname, owner, repo, ref, filePath);
  const legacy = await giteaFetch(legacyUrl, plainTextRequest());
  if (legacy.ok) {
    return legacy.text();
  }
  throw unprocessable(`Failed to fetch ${legacyUrl}: ${legacy.status}`);
}
/**
 * Fetch the API record for a single branch, e.g. to learn the latest commit
 * SHA on a tracking branch during update checks.
 *
 * @param apiBase - API base URL the `/repos/...` path is appended to.
 * @param owner - Repository owner, interpolated into the URL as-is.
 * @param repo - Repository name, interpolated into the URL as-is.
 * @param branch - Branch name; percent-encoded before being placed in the URL.
 * @returns The response JSON, cast to GiteaBranchResponse.
 * @throws Whatever fetchGiteaJson throws for a failed request.
 */
export async function fetchGiteaBranch(
  apiBase: string,
  owner: string,
  repo: string,
  branch: string,
): Promise<GiteaBranchResponse> {
  const branchSegment = encodeURIComponent(branch);
  const branchUrl = `${apiBase}/repos/${owner}/${repo}/branches/${branchSegment}`;
  return fetchGiteaJson<GiteaBranchResponse>(branchUrl);
}
/**
 * GET a URL through giteaFetch, asking for JSON, and return the parsed body.
 *
 * The body is cast to `T` without runtime validation, so callers must treat
 * every field as possibly missing.
 *
 * @param url - Absolute URL to request.
 * @returns The parsed JSON body, typed as `T`.
 * @throws unprocessable error naming the URL and status for a non-OK response.
 */
export async function fetchGiteaJson<T>(url: string): Promise<T> {
  const jsonRequest = { headers: { accept: "application/json" } };
  const response = await giteaFetch(url, jsonRequest);
  if (response.ok) {
    const payload: unknown = await response.json();
    return payload as T;
  }
  throw unprocessable(`Failed to fetch ${url}: ${response.status}`);
}