feat: add routes/corpus-gather.ts
This commit is contained in:
parent
9f36274926
commit
dc6e6aa3d5
1 changed files with 35 additions and 0 deletions
35
routes/corpus-gather.ts
Normal file
35
routes/corpus-gather.ts
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
/**
|
||||||
|
* Corpus Gather — browser-based web crawling.
|
||||||
|
*
|
||||||
|
* This module is stubbed in the cloud (Fly.io) deployment.
|
||||||
|
* Browser crawling runs on the dedicated Hetzner worker (wellspring-builder)
|
||||||
|
* which has Chromium installed. Deploy corpus-gather as a separate worker
|
||||||
|
* service with playwright-core and point it at the same DATABASE_URL.
|
||||||
|
*
|
||||||
|
* To run locally with full capability: NODE_ENV=development tsx routes/corpus-gather-worker.ts
|
||||||
|
*/
|
||||||
|
import type { Express } from "express";
|
||||||
|
|
||||||
|
// Base URL of the corpus-gather worker that requests are forwarded to.
// An unset OR empty CORPUS_GATHER_WORKER_URL both count as "not configured" (null).
const WORKER_URL: string | null = process.env.CORPUS_GATHER_WORKER_URL
  ? process.env.CORPUS_GATHER_WORKER_URL
  : null;
|
||||||
|
|
||||||
|
/**
 * Registers a catch-all handler for `/api/corpus/gather*` on the given app.
 *
 * When `WORKER_URL` is set, each request is forwarded to the worker with the
 * same method, path and query string, a JSON-serialized body (for methods
 * other than GET/HEAD) and the caller's `Authorization` header, and the
 * worker's status code and JSON payload are relayed back to the client.
 *
 * Responses:
 * - worker status + JSON body — worker answered with JSON;
 * - worker status, empty body — worker answered without a body (e.g. HEAD, 204);
 * - 502 — worker could not be reached, or it answered with a non-JSON body;
 * - 503 — `WORKER_URL` is not configured.
 *
 * @param app Express application to attach the route to.
 */
export function registerCorpusGatherRoutes(app: Express) {
  app.all("/api/corpus/gather*", async (req: any, res: any) => {
    if (!WORKER_URL) {
      return res.status(503).json({
        error: "corpus-gather-worker not configured",
        hint: "Set CORPUS_GATHER_WORKER_URL to the Hetzner worker URL, or run corpus-gather-worker.ts locally.",
      });
    }

    try {
      // Tolerate a trailing slash in the configured base URL so the joined
      // URL never contains "//api/...".
      const base = WORKER_URL.replace(/\/+$/, "");

      // req.path excludes the query string; recover it from req.url so query
      // parameters are not silently dropped on the way to the worker.
      const rawUrl: string = typeof req.url === "string" ? req.url : "";
      const queryStart = rawUrl.indexOf("?");
      const search = queryStart === -1 ? "" : rawUrl.slice(queryStart);

      // Only forward Authorization when the caller actually sent one, instead
      // of sending an empty header value.
      const headers: Record<string, string> = { "content-type": "application/json" };
      const authorization = req.headers?.authorization;
      if (typeof authorization === "string" && authorization !== "") {
        headers.authorization = authorization;
      }

      // NOTE(review): req.body is presumably populated by a JSON body parser
      // mounted elsewhere — confirm; JSON.stringify(undefined) sends no body.
      const upstream = await fetch(`${base}${req.path}${search}`, {
        method: req.method,
        headers,
        body: ["GET", "HEAD"].includes(req.method) ? undefined : JSON.stringify(req.body),
      });

      // Read as text first: Response.json() throws on empty or non-JSON
      // bodies, which would otherwise be misreported as "unreachable".
      const text = await upstream.text();
      if (text === "") {
        return res.status(upstream.status).end();
      }

      let data: unknown;
      try {
        data = JSON.parse(text);
      } catch {
        return res.status(502).json({
          error: "corpus-gather-worker returned a non-JSON response",
          upstreamStatus: upstream.status,
        });
      }
      return res.status(upstream.status).json(data);
    } catch (e: unknown) {
      // Network-level failure (DNS, connection refused, aborted request, ...).
      const detail = e instanceof Error ? e.message : String(e);
      return res.status(502).json({ error: "corpus-gather-worker unreachable", detail });
    }
  });
}
|
||||||
Loading…
Add table
Reference in a new issue