From a3ae3b248f35f5f173cdaabbaa9715b087894f42 Mon Sep 17 00:00:00 2001 From: mariosemes Date: Thu, 26 Mar 2026 22:47:40 +0100 Subject: [PATCH] Use remote Chromium container instead of local Puppeteer launch - Add browserless/chromium container to docker-compose - Add docker-compose.dev.yml for local dev (Chromium on port 3001) - Browser scraper connects via WebSocket (CHROMIUM_WS env var) - Falls back to local launch if CHROMIUM_WS not set - Remove Chromium install from Dockerfile (smaller image) - Auto-reconnect on browser disconnect Tested: remote Chromium connects in ~500ms, HG Spot scrapes in ~2.2s total. No longer blocks the Node.js event loop. Co-Authored-By: Claude Opus 4.6 (1M context) --- .env.example | 1 + Dockerfile | 5 ++- docker-compose.dev.yml | 9 +++++ docker-compose.yml | 10 ++++++ src/server/config.ts | 1 + src/server/scraper/browser-scraper.ts | 49 +++++++++++++++++++-------- 6 files changed, 58 insertions(+), 17 deletions(-) create mode 100644 docker-compose.dev.yml diff --git a/.env.example b/.env.example index 3cadc53..b25b3a4 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,4 @@ PORT=3000 DATABASE_PATH=./data/pricehunter.db STORES_DIR=./stores +CHROMIUM_WS=ws://localhost:3001 diff --git a/Dockerfile b/Dockerfile index 322a8a3..ad37a12 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,9 +19,7 @@ RUN npx tsc FROM node:20-alpine WORKDIR /app -# Install Chromium for Puppeteer -RUN apk add --no-cache chromium -ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser +# Skip Chromium download — we use a remote Chromium container ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true COPY package*.json ./ @@ -35,6 +33,7 @@ ENV NODE_ENV=production ENV PORT=3000 ENV DATABASE_PATH=/app/data/pricehunter.db ENV STORES_DIR=/app/stores +ENV CHROMIUM_WS=ws://chromium:3000 EXPOSE 3000 VOLUME /app/data diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 0000000..1631a36 --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,9 @@ +services: + chromium: + image: ghcr.io/browserless/chromium + ports: + - "3001:3000" + environment: + - MAX_CONCURRENT_SESSIONS=5 + - CONNECTION_TIMEOUT=30000 + restart: unless-stopped diff --git a/docker-compose.yml b/docker-compose.yml index c4f2a84..b88996b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,5 +10,15 @@ services: - NODE_ENV=production - DATABASE_PATH=/app/data/pricehunter.db - STORES_DIR=/app/stores + - CHROMIUM_WS=ws://chromium:3000 - PORT=3000 + depends_on: + - chromium + restart: unless-stopped + + chromium: + image: ghcr.io/browserless/chromium + environment: + - MAX_CONCURRENT_SESSIONS=5 + - CONNECTION_TIMEOUT=30000 restart: unless-stopped diff --git a/src/server/config.ts b/src/server/config.ts index 459cddf..7380abf 100644 --- a/src/server/config.ts +++ b/src/server/config.ts @@ -5,5 +5,6 @@ export const config = { host: process.env.HOST || '0.0.0.0', databasePath: process.env.DATABASE_PATH || './data/pricehunter.db', storesDir: process.env.STORES_DIR || './stores', + chromiumWs: process.env.CHROMIUM_WS || '', isProduction: process.env.NODE_ENV === 'production', }; diff --git a/src/server/scraper/browser-scraper.ts b/src/server/scraper/browser-scraper.ts index 9badc4a..a978002 100644 --- a/src/server/scraper/browser-scraper.ts +++ b/src/server/scraper/browser-scraper.ts @@ -1,6 +1,7 @@ import type { Browser } from 'puppeteer'; import type { Store } from '../models/store.js'; import type { ScrapedItem } from './result-parser.js'; +import { config } from '../config.js'; let browser: Browser | null = null; @@ -14,20 +15,37 @@ function log(msg: string) { async function getBrowser(): Promise { if (browser && browser.connected) return browser; - log('Launching Chromium...'); const puppeteer = await import('puppeteer'); - browser = await puppeteer.default.launch({ - headless: true, - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - '--disable-extensions', - '--no-first-run', - ], + + if (config.chromiumWs) { + // Connect to remote Chromium (Docker container) + log(`Connecting to remote Chromium at ${config.chromiumWs}`); + browser = await puppeteer.default.connect({ + browserWSEndpoint: config.chromiumWs, + }); + log('Connected to remote Chromium'); + } else { + // Fall back to local launch + log('Launching local Chromium...'); + browser = await puppeteer.default.launch({ + headless: true, + args: [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-gpu', + '--disable-extensions', + '--no-first-run', + ], + }); + log('Local Chromium launched'); + } + + // Reconnect if browser disconnects + browser.on('disconnected', () => { + log('Browser disconnected'); + browser = null; }); - log('Chromium launched'); return browser; } @@ -73,7 +91,6 @@ export async function scrapeStoreWithBrowser(store: Store, searchUrl: string): P } log(`${store.name}: selector found`); - // Brief wait for remaining renders await new Promise((r) => setTimeout(r, 300)); log(`${store.name}: extracting products...`); @@ -124,7 +141,11 @@ export async function warmupBrowser(): Promise { export async function closeBrowser(): Promise { if (browser) { - await browser.close(); + if (config.chromiumWs) { + browser.disconnect(); + } else { + await browser.close(); + } browser = null; } }