feat: add routes/corpus-gather.ts
This commit is contained in:
parent
9f36274926
commit
dc6e6aa3d5
1 changed files with 35 additions and 0 deletions
35
routes/corpus-gather.ts
Normal file
35
routes/corpus-gather.ts
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Corpus Gather — browser-based web crawling.
|
||||
*
|
||||
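* This module is stubbed in the cloud (Fly.io) deployment: it runs no browser
* itself. When CORPUS_GATHER_WORKER_URL is set, it forwards the corpus gather
* API routes to that worker; otherwise it answers 503.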
|
||||
* Browser crawling runs on the dedicated Hetzner worker (wellspring-builder)
|
||||
* which has Chromium installed. Deploy corpus-gather as a separate worker
|
||||
* service with playwright-core and point it at the same DATABASE_URL.
|
||||
*
|
||||
* To run locally with full capability: NODE_ENV=development tsx routes/corpus-gather-worker.ts
|
||||
*/
|
||||
import type { Express } from "express";
|
||||
|
||||
/**
 * Base URL of the external corpus-gather worker, read once at module load.
 * `null` when CORPUS_GATHER_WORKER_URL is unset or empty.
 */
const WORKER_URL = process.env.CORPUS_GATHER_WORKER_URL
  ? process.env.CORPUS_GATHER_WORKER_URL
  : null;
|
||||
|
||||
/**
 * Registers a catch-all handler for `/api/corpus/gather*` that proxies every
 * request to the external corpus-gather worker.
 *
 * Behaviour:
 * - `WORKER_URL` not configured: responds 503 with a configuration hint.
 * - Worker cannot be reached (network error while fetching or reading the
 *   response): responds 502 with the underlying error message.
 * - Worker answers with an empty body: the upstream status is forwarded with
 *   no body.
 * - Worker answers with JSON: the upstream status and JSON are forwarded.
 * - Worker answers with a non-JSON body: responds 502 with a distinct error
 *   so the failure is not mistaken for the worker being unreachable.
 *
 * @param app Express application the route is attached to.
 */
export function registerCorpusGatherRoutes(app: Express) {
  // `any` is kept for req/res so the handler signature stays exactly as callers
  // and the route typings already expect it.
  app.all("/api/corpus/gather*", async (req: any, res: any) => {
    if (!WORKER_URL) {
      return res.status(503).json({
        error: "corpus-gather-worker not configured",
        hint: "Set CORPUS_GATHER_WORKER_URL to the Hetzner worker URL, or run corpus-gather-worker.ts locally.",
      });
    }

    // Strip trailing slashes from the base so the joined URL never contains
    // "//api/..."; use req.url (not req.path) so the query string is forwarded.
    const target = `${WORKER_URL.replace(/\/+$/, "")}${req.url}`;

    // GET/HEAD must not carry a body. JSON.stringify(undefined) is undefined,
    // so a missing req.body also results in no body being sent.
    const payload = ["GET", "HEAD"].includes(req.method) ? undefined : JSON.stringify(req.body);

    const headers: Record<string, string> = {};
    if (payload !== undefined) {
      headers["content-type"] = "application/json";
    }
    // Only forward credentials the client actually sent; never emit an empty
    // Authorization header.
    if (typeof req.headers.authorization === "string" && req.headers.authorization !== "") {
      headers["authorization"] = req.headers.authorization;
    }

    let upstreamStatus: number;
    let rawBody: string;
    try {
      const upstream = await fetch(target, { method: req.method, headers, body: payload });
      upstreamStatus = upstream.status;
      rawBody = await upstream.text();
    } catch (e: unknown) {
      const detail = e instanceof Error ? e.message : String(e);
      return res.status(502).json({ error: "corpus-gather-worker unreachable", detail });
    }

    // Empty responses (e.g. 204) are valid and have nothing to parse.
    if (rawBody === "") {
      return res.status(upstreamStatus).end();
    }

    let data: unknown;
    try {
      data = JSON.parse(rawBody);
    } catch {
      // The worker was reachable but did not speak JSON (e.g. an HTML error
      // page from a reverse proxy); report that explicitly.
      return res.status(502).json({
        error: "corpus-gather-worker returned a non-JSON response",
        upstreamStatus,
        detail: rawBody.slice(0, 500),
      });
    }

    return res.status(upstreamStatus).json(data);
  });
}
|
||||
Loading…
Add table
Reference in a new issue