agentify-help/routes/corpus-gather.ts

35 lines
1.4 KiB
TypeScript

/**
* Corpus Gather — browser-based web crawling.
*
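* In the cloud (Fly.io) deployment this module is only a stub: it forwards
* /api/corpus/gather* requests to the worker when CORPUS_GATHER_WORKER_URL
* is set, and responds with 503 otherwise.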
* Browser crawling runs on the dedicated Hetzner worker (wellspring-builder)
* which has Chromium installed. Deploy corpus-gather as a separate worker
* service with playwright-core and point it at the same DATABASE_URL.
*
* To run locally with full capability: NODE_ENV=development tsx routes/corpus-gather-worker.ts
*/
import type { Express } from "express";
/** Base URL of the corpus-gather worker; `null` when the env var is unset or empty. */
const WORKER_URL: string | null = process.env["CORPUS_GATHER_WORKER_URL"] || null;
/**
 * Registers the `/api/corpus/gather*` catch-all route on the given Express app.
 *
 * When `WORKER_URL` is set, each request is forwarded to the worker with the same
 * method, path, query string, JSON-serialized body and (if present) `Authorization`
 * header, and the worker's status code and body are relayed back to the caller.
 * When `WORKER_URL` is not set, the route answers 503 with a configuration hint.
 *
 * Responses produced by this route itself:
 * - 503 — no worker URL configured.
 * - 502 — the request to the worker failed, or the worker replied with a
 *   non-empty body that is not valid JSON.
 *
 * @param app - Express application the route is attached to.
 */
export function registerCorpusGatherRoutes(app: Express) {
  // `req`/`res` stay loosely typed, as before: only the `Express` type is imported here.
  app.all("/api/corpus/gather*", async (req: any, res: any) => {
    if (!WORKER_URL) {
      res.status(503).json({
        error: "corpus-gather-worker not configured",
        hint: "Set CORPUS_GATHER_WORKER_URL to the Hetzner worker URL, or run corpus-gather-worker.ts locally.",
      });
      return;
    }

    try {
      // Drop trailing slashes so a base like "http://host/" does not yield "//api/...".
      const base = WORKER_URL.replace(/\/+$/, "");
      // `req.url` keeps the query string; `req.path` would silently discard it.
      const target = `${base}${req.url}`;

      const headers: Record<string, string> = { "content-type": "application/json" };
      // Forward the caller's credentials only when present instead of sending an empty header.
      if (req.headers.authorization) {
        headers["authorization"] = req.headers.authorization;
      }

      const carriesBody = !["GET", "HEAD"].includes(req.method);
      const upstream = await fetch(target, {
        method: req.method,
        headers,
        body: carriesBody ? JSON.stringify(req.body) : undefined,
      });

      // Read as text first: HEAD/204 replies have no body, and `upstream.json()`
      // would throw on them and be misreported as an unreachable worker.
      const raw = await upstream.text();
      if (raw.length === 0) {
        return res.status(upstream.status).end();
      }

      let data: unknown;
      try {
        data = JSON.parse(raw);
      } catch {
        // The worker was reached but did not speak JSON (e.g. an HTML error page).
        return res.status(502).json({
          error: "corpus-gather-worker returned a non-JSON response",
          upstreamStatus: upstream.status,
        });
      }
      return res.status(upstream.status).json(data);
    } catch (e: unknown) {
      const detail = e instanceof Error ? e.message : String(e);
      return res.status(502).json({ error: "corpus-gather-worker unreachable", detail });
    }
  });
}