Files
PriceHunter/src/server/scraper/browser-scraper.ts
mariosemes 0e2e8d1766 Pre-launch Chromium on server startup to avoid cold-start blocking
Chromium cold launch takes several seconds and blocks the event
loop, preventing SSE events from flushing. Now the browser is
warmed up during server startup if any store uses render_js,
so the first search doesn't pay the launch penalty.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 22:42:15 +01:00

131 lines
3.8 KiB
TypeScript

import type { Browser } from 'puppeteer';
import type { Store } from '../models/store.js';
import type { ScrapedItem } from './result-parser.js';
// Shared Chromium instance reused across scrapes; null until getBrowser() launches it
// and reset to null by closeBrowser().
let browser: Browser | null = null;
// Passed as the `timeout` option to page.waitForSelector (Puppeteer timeouts are in milliseconds).
const SELECTOR_TIMEOUT = 15_000;
// Passed as the `timeout` option to page.goto (Puppeteer timeouts are in milliseconds).
const NAVIGATION_TIMEOUT = 15_000;
/**
 * Writes a message to stdout, tagged with this module's name so scraper
 * output can be told apart from other server logs.
 *
 * @param msg - Text to print after the `[browser-scraper]` tag.
 */
function log(msg: string): void {
  const tagged = '[browser-scraper] ' + msg;
  console.log(tagged);
}
/** Launch currently in progress, shared by overlapping callers; null when none is pending. */
let pendingLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared Chromium instance, launching it on first use or after
 * the previous instance has disconnected.
 *
 * Callers that arrive while a launch is already in progress share that
 * launch instead of each starting their own Chromium process; previously
 * every overlapping caller launched a browser and all but the last one
 * assigned were orphaned.
 *
 * @returns The connected browser.
 * @throws Whatever the dynamic `puppeteer` import or `launch()` rejects with.
 *         A failed launch is not cached, so the next call retries.
 */
async function getBrowser(): Promise<Browser> {
  if (browser && browser.connected) return browser;
  if (pendingLaunch) return pendingLaunch;

  const launch = (async (): Promise<Browser> => {
    log('Launching Chromium...');
    // Imported lazily so puppeteer is only loaded when a browser is actually needed.
    const puppeteer = await import('puppeteer');
    const launched = await puppeteer.default.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-extensions',
        '--no-first-run',
      ],
    });
    browser = launched;
    log('Chromium launched');
    return launched;
  })();
  pendingLaunch = launch;

  try {
    return await launch;
  } finally {
    // Clear only our own launch so a newer in-flight launch is never clobbered.
    if (pendingLaunch === launch) pendingLaunch = null;
  }
}
/** Outcome of rendering a store's search page in the headless browser. */
export interface BrowserScrapeResult {
// Products extracted from the page; empty when the container selector never appeared.
items: ScrapedItem[];
// scrapeStoreWithBrowser currently always sets this to an empty string.
html: string;
// HTTP status of the navigation response; scrapeStoreWithBrowser falls back to 200 when there is no response object.
statusCode: number;
}
/**
 * Renders a store's search page in the shared headless browser and extracts
 * product entries using the store's configured CSS selectors.
 *
 * @param store - Store configuration: selectors plus optional user agent and
 *   extra headers (as a JSON string in `headers_json`).
 * @param searchUrl - Fully built search URL to navigate to.
 * @returns Extracted items with the navigation status code. `items` is empty
 *   when the container selector does not appear in time; `html` is always
 *   an empty string.
 * @throws Propagates errors from obtaining the browser, opening the page,
 *   navigation (including the navigation timeout) and in-page extraction.
 */
export async function scrapeStoreWithBrowser(store: Store, searchUrl: string): Promise<BrowserScrapeResult> {
  log(`Scraping ${store.name}: ${searchUrl}`);
  const b = await getBrowser();
  const page = await b.newPage();
  try {
    if (store.user_agent) {
      await page.setUserAgent(store.user_agent);
    }
    if (store.headers_json) {
      // Custom headers are best-effort: a malformed value must not abort the
      // scrape, but it is logged so the misconfiguration can be noticed.
      try {
        const headers = JSON.parse(store.headers_json);
        await page.setExtraHTTPHeaders(headers);
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        log(`${store.name}: ignoring invalid headers_json (${reason})`);
      }
    }

    log(`${store.name}: navigating...`);
    const response = await page.goto(searchUrl, {
      waitUntil: 'domcontentloaded',
      timeout: NAVIGATION_TIMEOUT,
    });
    // goto may resolve without a response object; treat that as a 200.
    const statusCode = response?.status() ?? 200;
    log(`${store.name}: DOM loaded (status ${statusCode})`);

    log(`${store.name}: waiting for selector "${store.sel_container}"...`);
    try {
      await page.waitForSelector(store.sel_container, { timeout: SELECTOR_TIMEOUT });
    } catch {
      log(`${store.name}: selector not found, returning empty`);
      return { items: [], html: '', statusCode };
    }
    log(`${store.name}: selector found`);

    // Brief wait for remaining renders
    await new Promise((r) => setTimeout(r, 300));

    log(`${store.name}: extracting products...`);
    // The callback is serialized and executed inside the page, so it must
    // stay self-contained and only use the `selectors` argument.
    const items = await page.evaluate((selectors) => {
      const containers = document.querySelectorAll(selectors.container);
      const results: Array<{ name: string; priceText: string; link: string; image: string | null }> = [];
      containers.forEach((el) => {
        const nameEl = el.querySelector(selectors.name);
        const priceEl = el.querySelector(selectors.price);
        const linkEl = el.querySelector(selectors.link) as HTMLAnchorElement | null;
        const imageEl = selectors.image ? el.querySelector(selectors.image) as HTMLImageElement | null : null;
        const name = nameEl?.textContent?.trim() || '';
        const priceText = priceEl?.textContent?.trim() || '';
        const link = linkEl?.getAttribute('href') || '';
        // Fall back to data-src when src is absent or empty.
        const image = imageEl?.getAttribute('src') || imageEl?.getAttribute('data-src') || null;
        // Entries without both a name and a price are skipped.
        if (name && priceText) {
          results.push({ name, priceText, link, image });
        }
      });
      return results;
    }, {
      container: store.sel_container,
      name: store.sel_name,
      price: store.sel_price,
      link: store.sel_link,
      image: store.sel_image || null,
    });
    log(`${store.name}: found ${items.length} products`);
    return { items, html: '', statusCode };
  } finally {
    log(`${store.name}: closing page`);
    // A failing close (e.g. the browser already went away) must not replace
    // the scrape's real outcome, whether that is a result or an error.
    try {
      await page.close();
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      log(`${store.name}: failed to close page (${reason})`);
    }
    log(`${store.name}: done`);
  }
}
/**
 * Ensures the shared browser is available ahead of time by requesting it
 * once and waiting for it, logging before and after.
 *
 * @throws Whatever `getBrowser()` rejects with.
 */
export async function warmupBrowser(): Promise<void> {
  log('Warming up browser...');
  const ready = getBrowser();
  await ready;
  log('Browser ready');
}
/**
 * Closes the shared browser, if one exists. Does nothing when no browser
 * has been stored.
 *
 * The module-level reference is cleared before the close is awaited, so a
 * rejected `close()` cannot leave a stale handle behind and no caller is
 * handed a browser that is in the middle of shutting down.
 *
 * @throws Whatever `browser.close()` rejects with.
 */
export async function closeBrowser(): Promise<void> {
  const closing = browser;
  if (!closing) return;
  // Detach first: from here on, a new browser request triggers a fresh launch.
  browser = null;
  await closing.close();
}