35 lines
1.4 KiB
TypeScript
35 lines
1.4 KiB
TypeScript
/**
 * Corpus Gather — browser-based web crawling.
 *
 * This module is stubbed in the cloud (Fly.io) deployment.
 * Browser crawling runs on the dedicated Hetzner worker (wellspring-builder),
 * which has Chromium installed. Deploy corpus-gather as a separate worker
 * service with playwright-core and point it at the same DATABASE_URL.
 *
 * To run locally with full capability: NODE_ENV=development tsx routes/corpus-gather-worker.ts
 */
|
|
import type { Express } from "express";
|
|
|
|
/**
 * Base URL of the remote corpus-gather worker, read once at module load from
 * the `CORPUS_GATHER_WORKER_URL` environment variable.
 *
 * Surrounding whitespace and trailing slashes are stripped so the value can be
 * concatenated directly with a request path that begins with "/". The `||`
 * (rather than `??`) is deliberate: an unset, empty, or blank variable all
 * yield `null`, meaning "no worker configured".
 */
const WORKER_URL = process.env.CORPUS_GATHER_WORKER_URL?.trim().replace(/\/+$/, "") || null;
|
|
|
|
/**
 * Registers a catch-all route for `/api/corpus/gather*` that proxies requests
 * to the external corpus-gather worker.
 *
 * When `WORKER_URL` is set, each request is forwarded with the same HTTP
 * method, path and query string, a JSON content type, and the caller's
 * `authorization` header; the worker's status code and JSON body are relayed
 * back. When `WORKER_URL` is not set, every request receives a 503 with a
 * configuration hint.
 *
 * Responses produced by the handler:
 * - worker status + JSON body: the worker answered with JSON
 * - worker status, no body: the worker answered with an empty body
 * - 502: the fetch failed, or the worker answered with a non-JSON body
 * - 503: `WORKER_URL` is not configured
 *
 * @param app Express application the route is attached to.
 */
export function registerCorpusGatherRoutes(app: Express) {
  app.all("/api/corpus/gather*", async (req: any, res: any) => {
    if (!WORKER_URL) {
      return res.status(503).json({
        error: "corpus-gather-worker not configured",
        hint: "Set CORPUS_GATHER_WORKER_URL to the Hetzner worker URL, or run corpus-gather-worker.ts locally.",
      });
    }

    try {
      // req.url carries the path AND the query string; req.path would silently
      // drop query parameters on their way to the worker.
      const upstream = await fetch(`${WORKER_URL}${req.url}`, {
        method: req.method,
        headers: { "content-type": "application/json", "authorization": req.headers.authorization || "" },
        // GET/HEAD requests must not carry a body.
        body: ["GET", "HEAD"].includes(req.method) ? undefined : JSON.stringify(req.body),
      });

      // Read the body as text first so that empty or non-JSON replies are not
      // misreported as the worker being unreachable.
      const raw = await upstream.text();
      if (raw.length === 0) {
        return res.status(upstream.status).end();
      }

      let data: unknown;
      try {
        data = JSON.parse(raw);
      } catch {
        return res.status(502).json({
          error: "corpus-gather-worker returned a non-JSON response",
          detail: `upstream responded ${upstream.status} with a non-JSON body`,
        });
      }

      return res.status(upstream.status).json(data);
    } catch (e: unknown) {
      // Network-level failure (DNS, connection refused, aborted, ...).
      const detail = e instanceof Error ? e.message : String(e);
      return res.status(502).json({ error: "corpus-gather-worker unreachable", detail });
    }
  });
}
|