fix(db): wait for/retry DB DNS resolution before drizzle-kit migrate (GRO-2163)
CI / Test (pull_request) Successful in 10s
CI / Lint & Typecheck (pull_request) Successful in 16s
CI / Build & Push Docker Images (pull_request) Failing after 30m28s

A fresh migrate-schema pod occasionally hits a transient CoreDNS miss
(EAI_AGAIN) on groombook-postgres-rw.<ns>.svc on its first attempt.
With backoffLimit: 2 the retry pod usually wins, but three unlucky
attempts in a row trips BackoffLimitExceeded and the Job is recreated
on every Flux reconcile (3+ Completed events observed in 8 min in uat).

Add packages/db/scripts/wait-for-db.mjs: a tiny no-deps Node 22 script
that parses DATABASE_URL, resolves the hostname via node:dns.promises
with exponential backoff (12 attempts, ~30s total) and only exits 0
once a real IP is returned. EAI_AGAIN / ENOTFOUND / EAI_NODATA are
retried; any other DNS error is surfaced so drizzle-kit gets a clear
message instead of being starved by retries.

Wire it as a pnpm `pre-migrate` (and `pre-seed` / `pre-reset`) hook
in @groombook/db so pnpm auto-runs it before any of the data-plane
commands. Mirrors the belt-and-braces pattern used in GRO-1985
(disable Corepack download fallback): do not try to outsmart CoreDNS,
just do not ask drizzle-kit to perform the very first DNS lookup of a
freshly-scheduled pod.

Defaults are env-tunable (WAIT_FOR_DB_MAX_ATTEMPTS, _BASE_DELAY_MS,
_MAX_DELAY_MS, _SKIP) so a future uat-debug pod can sidestep the
wait if needed.

Refs: GRO-2163, GRO-1985.
This commit is contained in:
Flea Flicker
2026-06-08 00:31:38 +00:00
parent f67b96ddfe
commit 323f6d6bcb
2 changed files with 107 additions and 0 deletions
+3
View File
@@ -18,7 +18,10 @@
"scripts": { "scripts": {
"build": "tsc --project .", "build": "tsc --project .",
"generate": "drizzle-kit generate", "generate": "drizzle-kit generate",
"pre-migrate": "node ./scripts/wait-for-db.mjs",
"migrate": "drizzle-kit migrate", "migrate": "drizzle-kit migrate",
"pre-seed": "node ./scripts/wait-for-db.mjs",
"pre-reset": "node ./scripts/wait-for-db.mjs",
"seed": "tsx src/seed.ts", "seed": "tsx src/seed.ts",
"reset": "tsx src/reset.ts && drizzle-kit migrate && tsx src/seed.ts", "reset": "tsx src/reset.ts && drizzle-kit migrate && tsx src/seed.ts",
"studio": "drizzle-kit studio", "studio": "drizzle-kit studio",
+104
View File
@@ -0,0 +1,104 @@
#!/usr/bin/env node
// wait-for-db.mjs
//
// GRO-2163: wait for / retry DNS resolution of the database hostname derived
// from DATABASE_URL before invoking `drizzle-kit migrate`. The first attempt
// of a fresh migrate-schema pod occasionally hits a transient CoreDNS miss
// (EAI_AGAIN) on `groombook-postgres-rw.<ns>.svc`; with backoffLimit: 2 the
// retry pod usually wins, but three unlucky attempts in a row trips
// BackoffLimitExceeded. Resolving once here, with backoff, removes the dice
// roll at the source so the first attempt reliably succeeds.
//
// Mirrors the belt-and-braces pattern used in GRO-1985 (no Corepack
// download fallback): we don't try to outsmart CoreDNS, we just don't ask
// drizzle-kit to do the very first DNS lookup of a freshly-scheduled pod.
//
// Configuration (env):
// WAIT_FOR_DB_MAX_ATTEMPTS default 12 (~30s of total wait at default backoff)
// WAIT_FOR_DB_BASE_DELAY_MS default 500
// WAIT_FOR_DB_MAX_DELAY_MS default 5000
// WAIT_FOR_DB_SKIP default unset; set to "1" to skip (debug only)
//
// On success: exit 0. On exhaustion: exit 1 so the Job's backoff is
// preserved (we don't want to silently mask a real outage by giving up
// after 30s and letting drizzle-kit fail with a less-actionable error).
import { setTimeout as delay } from "node:timers/promises";
import dns from "node:dns/promises";
const MAX_ATTEMPTS = Number(process.env.WAIT_FOR_DB_MAX_ATTEMPTS ?? 12);
const BASE_DELAY_MS = Number(process.env.WAIT_FOR_DB_BASE_DELAY_MS ?? 500);
const MAX_DELAY_MS = Number(process.env.WAIT_FOR_DB_MAX_DELAY_MS ?? 5000);
function parseHost(databaseUrl) {
try {
return new URL(databaseUrl).hostname || null;
} catch {
return null;
}
}
async function resolveOnce(host) {
const start = Date.now();
const result = await dns.lookup(host);
return { address: result.address, ms: Date.now() - start };
}
async function main() {
if (process.env.WAIT_FOR_DB_SKIP === "1") {
console.log("[wait-for-db] WAIT_FOR_DB_SKIP=1, skipping");
return;
}
const databaseUrl = process.env.DATABASE_URL;
if (!databaseUrl) {
// Don't gate the migrate on a misconfigured env — let drizzle-kit fail
// loudly with its own clear error.
console.warn("[wait-for-db] DATABASE_URL not set; skipping");
return;
}
const host = parseHost(databaseUrl);
if (!host) {
console.warn(`[wait-for-db] could not parse hostname from DATABASE_URL; skipping`);
return;
}
console.log(
`[wait-for-db] host=${host} max_attempts=${MAX_ATTEMPTS} ` +
`base_delay_ms=${BASE_DELAY_MS} max_delay_ms=${MAX_DELAY_MS}`,
);
for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
try {
const { address, ms } = await resolveOnce(host);
console.log(`[wait-for-db] ok attempt=${attempt} host=${host} -> ${address} (${ms}ms)`);
return;
} catch (err) {
const code = err?.code ?? "UNKNOWN";
const transient = code === "EAI_AGAIN" || code === "ENOTFOUND" || code === "EAI_NODATA";
if (!transient) {
// Hard error (e.g. invalid hostname): surface and let drizzle-kit fail
// with a real error rather than spinning.
console.error(`[wait-for-db] non-transient DNS error attempt=${attempt} code=${code}: ${err.message}`);
process.exit(1);
}
if (attempt === MAX_ATTEMPTS) {
console.error(
`[wait-for-db] exhausted attempts=${MAX_ATTEMPTS} host=${host} last_code=${code}; exiting 1`,
);
process.exit(1);
}
const backoff = Math.min(
MAX_DELAY_MS,
BASE_DELAY_MS * 2 ** (attempt - 1) + Math.floor(Math.random() * BASE_DELAY_MS),
);
console.log(
`[wait-for-db] transient attempt=${attempt} code=${code} retry_in_ms=${backoff}`,
);
await delay(backoff);
}
}
}
main().catch((err) => {
console.error(`[wait-for-db] fatal: ${err?.message ?? err}`);
process.exit(1);
});