- Add browser-scraper.ts using Puppeteer for JS-heavy stores
- Add render_js flag to store model, migration, YAML sync, and UI
- Scraper engine auto-selects cheerio vs Puppeteer based on flag
- Store forms include JS rendering toggle in Advanced section
- Create first store config: HG Spot (Croatian electronics retailer)
- Update Dockerfile with Chromium for production Puppeteer support

Tested: HG Spot returns 15 products per page with correct names, prices (EUR), links, and images using headless browser rendering.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
116 lines
3.3 KiB
TypeScript
116 lines
3.3 KiB
TypeScript
import puppeteer, { type Browser } from 'puppeteer';
|
|
import type { Store } from '../models/store.js';
|
|
import type { ScrapedItem } from './result-parser.js';
|
|
|
|
/**
 * Shared Puppeteer browser instance for this module.
 *
 * `null` until `getBrowser()` launches one, and reset to `null` again by
 * `closeBrowser()`.
 */
let browser: Browser | null = null;
|
|
|
|
/**
 * Maximum time, in milliseconds, to wait for a store's product container
 * selector to appear after navigation has finished.
 */
const PAGE_TIMEOUT = 30_000;

/** Maximum time, in milliseconds, allowed for the initial `page.goto` navigation. */
const NAVIGATION_TIMEOUT = 20_000;
|
|
|
|
/**
 * In-flight `puppeteer.launch()` call, if any. Callers of `getBrowser()` that
 * arrive while it is pending await this promise instead of starting Chromium
 * again.
 */
let pendingLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared browser, launching headless Chromium on first use or
 * when the previous instance is no longer connected.
 *
 * Calls that overlap while a launch is still in progress share that single
 * launch. Without this, each overlapping call would start its own Chromium
 * process and every one except the last assigned to `browser` would be
 * orphaned, out of reach of `closeBrowser()`.
 *
 * @returns The connected shared browser instance.
 * @throws Whatever `puppeteer.launch` rejects with. A failed launch is
 *   forgotten, so the next call retries.
 */
async function getBrowser(): Promise<Browser> {
  if (browser?.connected) return browser;

  if (pendingLaunch === null) {
    pendingLaunch = puppeteer.launch({
      headless: true,
      // NOTE(review): these flags look like the usual set for running Chromium
      // inside a container (no sandbox, no reliance on /dev/shm, no GPU) —
      // confirm against the deployment image before changing them.
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-extensions',
        '--no-first-run',
      ],
    });
  }

  const launch = pendingLaunch;
  try {
    const launched = await launch;
    browser = launched;
    return launched;
  } finally {
    // Clear only our own launch: after a failure a newer launch may already
    // have been started by another caller and must stay shared.
    if (pendingLaunch === launch) {
      pendingLaunch = null;
    }
  }
}
|
|
|
|
/** Outcome of scraping a single search page through the headless browser. */
export interface BrowserScrapeResult {
  /**
   * Products read from the rendered DOM. Empty when the store's container
   * selector never appeared on the page.
   */
  items: ScrapedItem[];

  /** Page markup as returned by `page.content()` at the end of the scrape. */
  html: string;

  /**
   * HTTP status of the navigation response; 200 is assumed when navigation
   * yields no response object.
   */
  statusCode: number;
}
|
|
|
|
/**
 * Scrapes one store search page using the shared headless browser, so that
 * results rendered by client-side JavaScript are present in the DOM.
 *
 * Steps: open a fresh page, apply the store's optional user agent and extra
 * headers, navigate to `searchUrl`, wait for the product container selector,
 * then read name / price / link / image from every container element.
 *
 * @param store - Store configuration. Reads `user_agent`, `headers_json` and
 *   the `sel_container` / `sel_name` / `sel_price` / `sel_link` / `sel_image`
 *   selectors.
 * @param searchUrl - URL of the search results page to load.
 * @returns The extracted items, the final page HTML and the navigation status
 *   code. `items` is empty when the container selector does not appear within
 *   `PAGE_TIMEOUT`.
 * @throws If the browser or page cannot be obtained, if navigation fails or
 *   exceeds `NAVIGATION_TIMEOUT`, or if DOM extraction fails.
 */
export async function scrapeStoreWithBrowser(store: Store, searchUrl: string): Promise<BrowserScrapeResult> {
  const b = await getBrowser();
  const page = await b.newPage();

  try {
    // Set user agent if configured
    if (store.user_agent) {
      await page.setUserAgent(store.user_agent);
    }

    // Extra headers are best-effort: malformed JSON, or a value Puppeteer
    // rejects, must not abort the scrape, so the page simply goes without them.
    if (store.headers_json) {
      try {
        const headers = JSON.parse(store.headers_json);
        await page.setExtraHTTPHeaders(headers);
      } catch { /* ignore invalid headers */ }
    }

    // Navigate and let network activity settle so client-side rendering can run.
    const response = await page.goto(searchUrl, {
      waitUntil: 'networkidle2',
      timeout: NAVIGATION_TIMEOUT,
    });

    // goto() can resolve without a response object; report that case as 200.
    const statusCode = response?.status() ?? 200;

    // Wait for the product container to appear
    try {
      await page.waitForSelector(store.sel_container, { timeout: PAGE_TIMEOUT });
    } catch {
      // Container might not exist if no results — return empty
      const html = await page.content();
      return { items: [], html, statusCode };
    }

    // Small extra wait for any remaining renders
    await new Promise((r) => setTimeout(r, 500));

    // Extract product data from the rendered DOM. The callback executes inside
    // the page, so everything it needs is passed in through `selectors`.
    const items = await page.evaluate((selectors) => {
      const containers = document.querySelectorAll(selectors.container);
      const results: Array<{ name: string; priceText: string; link: string; image: string | null }> = [];

      containers.forEach((el) => {
        const nameEl = el.querySelector(selectors.name);
        const priceEl = el.querySelector(selectors.price);
        const linkEl = el.querySelector(selectors.link) as HTMLAnchorElement | null;
        const imageEl = selectors.image ? el.querySelector(selectors.image) as HTMLImageElement | null : null;

        const name = nameEl?.textContent?.trim() || '';
        const priceText = priceEl?.textContent?.trim() || '';
        // Raw attribute values: `href` is returned exactly as written in the markup.
        const link = linkEl?.getAttribute('href') || '';
        // Fall back to `data-src` when `src` is missing or empty.
        const image = imageEl?.getAttribute('src') || imageEl?.getAttribute('data-src') || null;

        // A container without both a name and a price is not treated as a product.
        if (name && priceText) {
          results.push({ name, priceText, link, image });
        }
      });

      return results;
    }, {
      container: store.sel_container,
      name: store.sel_name,
      price: store.sel_price,
      link: store.sel_link,
      image: store.sel_image || null,
    });

    const html = await page.content();

    return { items, html, statusCode };
  } finally {
    // Cleanup is best-effort. If the page or browser has already died,
    // close() rejects, and letting that escape from `finally` would replace
    // the real scrape error or discard a successful result.
    try {
      await page.close();
    } catch {
      /* page already gone — nothing left to release */
    }
  }
}
|
|
|
|
/**
 * Shuts down the shared browser, if one exists, and clears the module-level
 * reference so that a later `getBrowser()` call launches a fresh instance.
 *
 * Does nothing when no browser is currently held. The reference is cleared
 * only after `close()` has completed.
 */
export async function closeBrowser(): Promise<void> {
  if (browser === null) return;

  await browser.close();
  browser = null;
}
|