/**
 * Corpus Gather — browser-based web crawling.
 *
 * This module is stubbed in the cloud (Fly.io) deployment.
 * Browser crawling runs on the dedicated Hetzner worker (wellspring-builder),
 * which has Chromium installed. Deploy corpus-gather as a separate worker
 * service with playwright-core and point it at the same DATABASE_URL.
 *
 * To run locally with full capability:
 *   NODE_ENV=development tsx routes/corpus-gather-worker.ts
 */
import type { Express, Request, Response } from "express";

/** HTTP methods for which no request body is forwarded to the worker. */
const BODYLESS_METHODS = new Set(["GET", "HEAD"]);

/**
 * Reads CORPUS_GATHER_WORKER_URL from the environment.
 *
 * Trailing slashes are stripped so that appending a request URL (which always
 * starts with "/") never produces a double slash.
 *
 * @returns The normalized worker base URL, or `null` when the variable is
 *          unset or blank.
 */
function resolveWorkerBaseUrl(): string | null {
  const raw = (process.env.CORPUS_GATHER_WORKER_URL || "").trim();
  if (raw === "") return null;
  return raw.replace(/\/+$/, "");
}

/**
 * Registers the `/api/corpus/gather` routes on `app`.
 *
 * When CORPUS_GATHER_WORKER_URL is set, every request on these routes is
 * proxied to the worker (same method, same path and query string, JSON body,
 * caller's `authorization` header) and the worker's status and JSON payload
 * are relayed back. When it is not set, the routes answer 503 with a hint.
 *
 * The environment variable is resolved when this function is called rather
 * than at import time, so configuration loaded after module evaluation is
 * still honoured.
 *
 * @param app Express application to attach the routes to.
 */
export function registerCorpusGatherRoutes(app: Express) {
  const workerBaseUrl = resolveWorkerBaseUrl();

  async function gatherHandler(req: Request, res: Response) {
    if (!workerBaseUrl) {
      return res.status(503).json({
        error: "corpus-gather-worker not configured",
        hint: "Set CORPUS_GATHER_WORKER_URL to the Hetzner worker URL.",
      });
    }

    // Only forward an authorization header the caller actually sent; an empty
    // header value can be rejected as malformed by the upstream.
    const headers: Record<string, string> = { "content-type": "application/json" };
    if (req.headers.authorization) {
      headers.authorization = req.headers.authorization;
    }

    let upstreamStatus: number;
    let rawBody: string;
    try {
      // req.url (unlike req.path) keeps the query string, so filters and
      // pagination parameters reach the worker.
      const upstream = await fetch(`${workerBaseUrl}${req.url}`, {
        method: req.method,
        headers,
        body: BODYLESS_METHODS.has(req.method) ? undefined : JSON.stringify(req.body),
      });
      upstreamStatus = upstream.status;
      // Read as text first: the body may legitimately be empty (204, HEAD) or
      // not JSON at all, and neither case means the worker was unreachable.
      rawBody = await upstream.text();
    } catch (e: unknown) {
      return res.status(502).json({
        error: "corpus-gather-worker unreachable",
        detail: e instanceof Error ? e.message : String(e),
      });
    }

    if (rawBody === "") {
      // Nothing to relay beyond the status code.
      return res.status(upstreamStatus).end();
    }

    let data: unknown;
    try {
      data = JSON.parse(rawBody);
    } catch {
      return res.status(502).json({
        error: "corpus-gather-worker returned a non-JSON response",
        upstreamStatus,
      });
    }
    return res.status(upstreamStatus).json(data);
  }

  app.all("/api/corpus/gather", gatherHandler);
  app.all("/api/corpus/gather/*path", gatherHandler);
}