Chromium cold launch takes several seconds and blocks the event loop, preventing SSE events from flushing. Now the browser is warmed up during server startup if any store uses render_js, so the first search doesn't pay the launch penalty. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
131 lines
3.8 KiB
TypeScript
131 lines
3.8 KiB
TypeScript
import type { Browser } from 'puppeteer';
|
|
import type { Store } from '../models/store.js';
|
|
import type { ScrapedItem } from './result-parser.js';
|
|
|
|
/** Upper bound, in milliseconds, passed to `page.goto` when loading a search page. */
const NAVIGATION_TIMEOUT = 15_000;

/** Upper bound, in milliseconds, for a store's container selector to appear. */
const SELECTOR_TIMEOUT = 15_000;

/**
 * Module-wide Chromium handle reused across scrapes.
 * `null` until the first launch, and reset to `null` by `closeBrowser()`.
 */
let browser: Browser | null = null;
|
|
|
|
/**
 * Prints a diagnostic line to the console, tagged with this module's prefix.
 *
 * @param msg - Text to print after the `[browser-scraper]` tag.
 */
function log(msg: string) {
  const line = '[browser-scraper] ' + msg;
  console.log(line);
}
|
|
|
|
/**
 * Launch currently in progress, shared by every `getBrowser()` caller that
 * arrives before it settles. `null` whenever no launch is pending.
 */
let pendingLaunch: Promise<Browser> | null = null;

/**
 * Starts a new headless Chromium process and stores it in the module-level
 * `browser` handle.
 *
 * @returns The freshly launched browser.
 */
async function launchChromium(): Promise<Browser> {
  log('Launching Chromium...');
  // Loaded lazily so puppeteer is only imported when a launch is actually needed.
  const puppeteer = await import('puppeteer');
  const launched = await puppeteer.default.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu',
      '--disable-extensions',
      '--no-first-run',
    ],
  });
  browser = launched;
  log('Chromium launched');
  return launched;
}

/**
 * Returns the shared Chromium instance, launching it when there is none yet
 * or when the previous one is no longer connected.
 *
 * Concurrent callers that arrive while a launch is in flight all receive the
 * same pending launch, so only one Chromium process is started. Without this,
 * each caller would launch its own browser and all but the last one assigned
 * to `browser` would be left running with no reference to close them.
 *
 * @returns The connected browser instance.
 * @throws Rejects with the launch error if Chromium cannot be started; the
 *   pending launch is cleared so a later call can retry.
 */
async function getBrowser(): Promise<Browser> {
  if (browser && browser.connected) return browser;

  if (!pendingLaunch) {
    pendingLaunch = launchChromium().finally(() => {
      // Clear on success and failure alike so the next call re-evaluates state.
      pendingLaunch = null;
    });
  }

  return pendingLaunch;
}
|
|
|
|
/**
 * Outcome of a single browser-rendered store search.
 */
export interface BrowserScrapeResult {
  /** Products extracted from the rendered page; empty when nothing matched. */
  items: Array<ScrapedItem>;

  /** Page markup slot; the scraper in this module always fills it with an empty string. */
  html: string;

  /** HTTP status of the navigation response (200 is used when no response object is available). */
  statusCode: number;
}
|
|
|
|
/**
 * Loads a store's search page in the shared headless browser and extracts
 * product data using the store's configured CSS selectors.
 *
 * Steps: open a new page, apply the store's optional user agent and extra
 * headers, navigate, wait for the container selector, pause briefly, then
 * read name / price / link / image from every container element. Containers
 * missing a name or a price text are skipped.
 *
 * @param store - Store configuration. Reads `name`, `user_agent`,
 *   `headers_json` (a JSON string of extra request headers) and the
 *   `sel_container` / `sel_name` / `sel_price` / `sel_link` / `sel_image` selectors.
 * @param searchUrl - URL of the search results page to navigate to.
 * @returns The extracted items and the navigation status code. `items` is
 *   empty when the container selector does not appear within the timeout.
 *   `html` is always an empty string.
 * @throws Errors from opening the page, navigating, or evaluating the
 *   extraction script are not caught here and propagate to the caller.
 */
export async function scrapeStoreWithBrowser(store: Store, searchUrl: string): Promise<BrowserScrapeResult> {
  log(`Scraping ${store.name}: ${searchUrl}`);
  const b = await getBrowser();
  const page = await b.newPage();

  try {
    if (store.user_agent) {
      await page.setUserAgent(store.user_agent);
    }

    if (store.headers_json) {
      // Extra headers are best-effort: a bad value must not abort the scrape,
      // but it is logged so a misconfigured store can be diagnosed.
      try {
        const headers = JSON.parse(store.headers_json);
        await page.setExtraHTTPHeaders(headers);
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        log(`${store.name}: ignoring headers_json (${reason})`);
      }
    }

    log(`${store.name}: navigating...`);
    const response = await page.goto(searchUrl, {
      waitUntil: 'domcontentloaded',
      timeout: NAVIGATION_TIMEOUT,
    });

    // A missing response object is reported as 200.
    const statusCode = response?.status() ?? 200;
    log(`${store.name}: DOM loaded (status ${statusCode})`);

    log(`${store.name}: waiting for selector "${store.sel_container}"...`);
    try {
      await page.waitForSelector(store.sel_container, { timeout: SELECTOR_TIMEOUT });
    } catch {
      // No container within the timeout is reported as an empty result, not an error.
      log(`${store.name}: selector not found, returning empty`);
      return { items: [], html: '', statusCode };
    }
    log(`${store.name}: selector found`);

    // Brief wait for remaining renders
    await new Promise((r) => setTimeout(r, 300));

    log(`${store.name}: extracting products...`);
    // The callback runs inside the page, so it may only use its `selectors`
    // argument and DOM globals - nothing from this module's scope.
    const items = await page.evaluate((selectors) => {
      const containers = document.querySelectorAll(selectors.container);
      const results: Array<{ name: string; priceText: string; link: string; image: string | null }> = [];

      containers.forEach((el) => {
        const nameEl = el.querySelector(selectors.name);
        const priceEl = el.querySelector(selectors.price);
        const linkEl = el.querySelector(selectors.link) as HTMLAnchorElement | null;
        const imageEl = selectors.image ? el.querySelector(selectors.image) as HTMLImageElement | null : null;

        const name = nameEl?.textContent?.trim() || '';
        const priceText = priceEl?.textContent?.trim() || '';
        // Raw attribute value, exactly as written in the markup.
        const link = linkEl?.getAttribute('href') || '';
        // Fall back to `data-src` when `src` is absent or empty.
        const image = imageEl?.getAttribute('src') || imageEl?.getAttribute('data-src') || null;

        // A product without both a name and a price text is skipped.
        if (name && priceText) {
          results.push({ name, priceText, link, image });
        }
      });

      return results;
    }, {
      container: store.sel_container,
      name: store.sel_name,
      price: store.sel_price,
      link: store.sel_link,
      image: store.sel_image || null,
    });

    log(`${store.name}: found ${items.length} products`);

    return { items, html: '', statusCode };
  } finally {
    log(`${store.name}: closing page`);
    // A failure while closing must not replace the scrape result or hide the
    // error that got us here, so it is logged instead of rethrown.
    try {
      await page.close();
      log(`${store.name}: done`);
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      log(`${store.name}: failed to close page (${reason})`);
    }
  }
}
|
|
|
|
/**
 * Makes sure the shared browser is up before it is first needed, so the
 * first scrape does not have to wait for the Chromium launch.
 *
 * @returns Resolves once the browser is available.
 * @throws Rejects with whatever error `getBrowser()` raises.
 */
export async function warmupBrowser(): Promise<void> {
  log('Warming up browser...');
  const ready = getBrowser();
  await ready;
  log('Browser ready');
}
|
|
|
|
/**
 * Shuts down the shared browser, if one exists, and resets the module-level
 * handle to `null` once the close has completed. Does nothing when no
 * browser has been launched.
 *
 * @returns Resolves after the browser has been closed (or immediately when there is none).
 */
export async function closeBrowser(): Promise<void> {
  if (browser === null) return;

  await browser.close();
  browser = null;
}
|