Add Puppeteer browser scraping and HG Spot store config

- Add browser-scraper.ts using Puppeteer for JS-heavy stores
- Add render_js flag to store model, migration, YAML sync, and UI
- Scraper engine auto-selects cheerio vs Puppeteer based on flag
- Store forms include JS rendering toggle in Advanced section
- Create first store config: HG Spot (Croatian electronics retailer)
- Update Dockerfile with Chromium for production Puppeteer support

Tested: HG Spot returns 15 products per page with correct names,
prices (EUR), links, and images using headless browser rendering.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
mariosemes
2026-03-26 21:36:20 +01:00
parent 97fb8d9663
commit 130ab30fcc
13 changed files with 1037 additions and 39 deletions

View File

@@ -0,0 +1,115 @@
import puppeteer, { type Browser } from 'puppeteer';
import type { Store } from '../models/store.js';
import type { ScrapedItem } from './result-parser.js';
/** Navigation budget in milliseconds, passed as the `timeout` option of `page.goto`. */
const NAVIGATION_TIMEOUT = 20 * 1000;

/** Budget in milliseconds for the store's container selector to appear once navigation has finished. */
const PAGE_TIMEOUT = 30 * 1000;

/** Module-wide browser handle: `null` until the first launch, then reused while it stays connected. */
let browser: Browser | null = null;
/** Launch currently in progress, shared by every caller that arrives before it settles. */
let pendingLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared headless browser, launching it on first use or after the
 * previous instance has disconnected.
 *
 * Callers that arrive while a launch is still in progress await that same
 * launch instead of starting their own. Without this, parallel scrapes on a
 * cold start would each spawn a browser and all but the last one assigned to
 * the module handle would be orphaned and never closed.
 *
 * @returns The connected, shared browser instance.
 * @throws Whatever `puppeteer.launch` rejects with; the in-flight marker is
 *   cleared on failure so the next call retries from scratch.
 */
async function getBrowser(): Promise<Browser> {
  if (browser?.connected) return browser;
  if (pendingLaunch) return pendingLaunch;

  pendingLaunch = puppeteer.launch({
    headless: true,
    // NOTE(review): these look like the usual flags for running Chromium
    // inside a container (no sandbox, no /dev/shm reliance, no GPU) —
    // confirm against the Dockerfile this module ships with.
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu',
      '--disable-extensions',
      '--no-first-run',
    ],
  });

  try {
    browser = await pendingLaunch;
    return browser;
  } finally {
    // Always clear the marker: on success the handle lives in `browser`,
    // on failure a later call must be free to attempt a new launch.
    pendingLaunch = null;
  }
}
/**
 * Outcome of one browser-rendered scrape, as returned by
 * `scrapeStoreWithBrowser`.
 */
export interface BrowserScrapeResult {
/** Products extracted from the rendered page; empty when the container selector never appeared. */
items: ScrapedItem[];
/** Markup of the page as returned by `page.content()` at the end of the scrape. */
html: string;
/** HTTP status of the navigation response, or 200 when navigation yielded no response object. */
statusCode: number;
}
/**
 * Loads a store's search page in the shared headless browser and extracts
 * product entries from the rendered DOM.
 *
 * Steps: open a fresh page, apply the store's optional user agent and extra
 * headers, navigate, wait for the store's container selector, then read name,
 * price text, link and image from every container element.
 *
 * @param store - Store configuration supplying the CSS selectors
 *   (`sel_container`, `sel_name`, `sel_price`, `sel_link`, optional
 *   `sel_image`) and the optional `user_agent` / `headers_json` overrides.
 * @param searchUrl - Fully built search URL to navigate to.
 * @returns Extracted items, the page markup, and the navigation status code
 *   (200 when navigation produced no response object). `items` is empty when
 *   the container selector does not appear within `PAGE_TIMEOUT`.
 * @throws Errors from opening the page, navigating (including the
 *   `NAVIGATION_TIMEOUT` timeout) or evaluating in the page propagate to the
 *   caller. Failures while closing the page do not.
 */
export async function scrapeStoreWithBrowser(store: Store, searchUrl: string): Promise<BrowserScrapeResult> {
  const sharedBrowser = await getBrowser();
  const page = await sharedBrowser.newPage();

  try {
    // Set user agent if configured
    if (store.user_agent) {
      await page.setUserAgent(store.user_agent);
    }

    // Set extra headers if configured
    if (store.headers_json) {
      try {
        const headers: Record<string, string> = JSON.parse(store.headers_json);
        await page.setExtraHTTPHeaders(headers);
      } catch {
        // Deliberate best-effort: malformed or unusable header config must
        // not abort the scrape, so the page simply keeps default headers.
      }
    }

    // Navigate to the page
    const response = await page.goto(searchUrl, {
      waitUntil: 'networkidle2',
      timeout: NAVIGATION_TIMEOUT,
    });
    const statusCode = response?.status() ?? 200;

    // Wait for the product container to appear
    try {
      await page.waitForSelector(store.sel_container, { timeout: PAGE_TIMEOUT });
    } catch {
      // Container might not exist if no results — return empty
      const html = await page.content();
      return { items: [], html, statusCode };
    }

    // Small extra wait for any remaining renders
    await new Promise((resolve) => setTimeout(resolve, 500));

    // Extract product data from the rendered DOM. The callback runs inside
    // the page, so it may only use its `selectors` argument and DOM globals.
    const items = await page.evaluate((selectors) => {
      const results: Array<{ name: string; priceText: string; link: string; image: string | null }> = [];

      document.querySelectorAll(selectors.container).forEach((el) => {
        const nameEl = el.querySelector(selectors.name);
        const priceEl = el.querySelector(selectors.price);
        const linkEl = el.querySelector(selectors.link);
        const imageEl = selectors.image ? el.querySelector(selectors.image) : null;

        const name = nameEl?.textContent?.trim() || '';
        const priceText = priceEl?.textContent?.trim() || '';
        // Raw attribute values are returned as written in the markup.
        const link = linkEl?.getAttribute('href') || '';
        // `||` (not `??`) so an empty `src` falls through to `data-src`.
        const image = imageEl?.getAttribute('src') || imageEl?.getAttribute('data-src') || null;

        // An entry without both a name and a price is not a usable product.
        if (name && priceText) {
          results.push({ name, priceText, link, image });
        }
      });

      return results;
    }, {
      container: store.sel_container,
      name: store.sel_name,
      price: store.sel_price,
      link: store.sel_link,
      image: store.sel_image || null,
    });

    const html = await page.content();
    return { items, html, statusCode };
  } finally {
    // Cleanup is best-effort: a rejection thrown from `finally` would discard
    // a successful result or replace the real scrape error with a
    // "failed to close" error.
    try {
      await page.close();
    } catch {
      /* page or browser already gone — nothing left to release */
    }
  }
}
/**
 * Closes the shared browser, if one has been launched, and clears the module
 * handle so the next scrape launches a fresh instance.
 *
 * The handle is cleared before `close()` is awaited. The module state is
 * therefore reset even when `close()` rejects, and no caller can pick up the
 * instance while it is shutting down.
 *
 * @returns Resolves once the browser has closed; resolves immediately when no
 *   browser is open.
 * @throws Whatever `Browser.close` rejects with; the handle is cleared anyway.
 */
export async function closeBrowser(): Promise<void> {
  const closing = browser;
  if (!closing) return;

  // Detach first: from this point on the instance must not be reused.
  browser = null;
  await closing.close();
}