/**
 * Corpus Gather — browser-based web crawling (routes/corpus-gather.ts).
 *
 * This module is a stub in the cloud (Fly.io) deployment: it performs no
 * crawling itself and only relays requests to a separate worker.
 * Browser crawling runs on the dedicated Hetzner worker (wellspring-builder),
 * which has Chromium installed. Deploy corpus-gather as a separate worker
 * service with playwright-core and point it at the same DATABASE_URL.
 *
 * To run locally with full capability:
 *   NODE_ENV=development tsx routes/corpus-gather-worker.ts
 */
import type { Express, Request, Response } from "express";

/**
 * Base URL of the corpus-gather worker, or `null` when not configured.
 * `||` (not `??`) is deliberate: an empty env var counts as "unset".
 * Trailing slashes are stripped so that joining with a request URL that
 * starts with "/" never produces "//".
 */
const WORKER_URL =
  (process.env.CORPUS_GATHER_WORKER_URL || "").replace(/\/+$/, "") || null;

/** HTTP methods for which no request body is forwarded to the worker. */
const BODYLESS_METHODS: readonly string[] = ["GET", "HEAD"];

/**
 * Registers a catch-all route for `/api/corpus/gather*` that relays every
 * request to the corpus-gather worker.
 *
 * Responses produced by the route:
 * - 503 when `CORPUS_GATHER_WORKER_URL` is not set;
 * - 502 when the worker cannot be reached, or answers with a body that is
 *   not JSON;
 * - otherwise the worker's own status code, with its JSON body (or with no
 *   body when the worker sent none).
 *
 * @param app Express application the route is attached to.
 */
export function registerCorpusGatherRoutes(app: Express): void {
  app.all("/api/corpus/gather*", async (req: Request, res: Response) => {
    if (!WORKER_URL) {
      res.status(503).json({
        error: "corpus-gather-worker not configured",
        hint: "Set CORPUS_GATHER_WORKER_URL to the Hetzner worker URL, or run corpus-gather-worker.ts locally.",
      });
      return;
    }

    // Only forward Authorization when the caller actually sent one; an empty
    // header value is not equivalent to "no credentials".
    const headers: Record<string, string> = { "content-type": "application/json" };
    const authorization = req.headers.authorization;
    if (authorization) {
      headers.authorization = authorization;
    }

    let status: number;
    let rawBody: string;
    try {
      // req.url (unlike req.path) keeps the query string, so filters and
      // pagination parameters reach the worker.
      const upstream = await fetch(`${WORKER_URL}${req.url}`, {
        method: req.method,
        headers,
        body: BODYLESS_METHODS.includes(req.method) ? undefined : JSON.stringify(req.body),
      });
      status = upstream.status;
      rawBody = await upstream.text();
    } catch (e: unknown) {
      const detail = e instanceof Error ? e.message : String(e);
      res.status(502).json({ error: "corpus-gather-worker unreachable", detail });
      return;
    }

    // The worker may legitimately answer without a body (e.g. 204 or a HEAD
    // request); relay the status alone instead of failing on JSON parsing.
    if (rawBody.length === 0) {
      res.status(status).end();
      return;
    }

    let payload: unknown;
    try {
      payload = JSON.parse(rawBody);
    } catch {
      // The worker was reachable but did not speak JSON (e.g. an HTML error
      // page from a reverse proxy); report that distinctly from a network
      // failure.
      res.status(502).json({
        error: "corpus-gather-worker returned a non-JSON response",
        detail: `upstream status ${status}`,
      });
      return;
    }

    res.status(status).json(payload);
  });
}