Add Puppeteer browser scraping and HG Spot store config

- Add browser-scraper.ts using Puppeteer for JS-heavy stores
- Add render_js flag to store model, migration, YAML sync, and UI
- Scraper engine auto-selects cheerio vs Puppeteer based on flag
- Store forms include JS rendering toggle in Advanced section
- Create first store config: HG Spot (Croatian electronics retailer)
- Update Dockerfile with Chromium for production Puppeteer support

Tested: HG Spot returns 15 products per page with correct names,
prices (EUR), links, and images using headless browser rendering.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
mariosemes
2026-03-26 21:36:20 +01:00
parent 97fb8d9663
commit 130ab30fcc
13 changed files with 1037 additions and 39 deletions

View File

@@ -0,0 +1 @@
-- Adds the per-store render_js flag to the stores table.
-- Stored as an integer; rows that already exist get the default 0.
ALTER TABLE stores ADD COLUMN render_js INTEGER NOT NULL DEFAULT 0;

View File

@@ -7,6 +7,7 @@ export interface Store {
base_url: string;
search_url: string;
enabled: number;
render_js: number;
sel_container: string;
sel_name: string;
sel_price: string;
@@ -38,6 +39,7 @@ export interface CreateStoreInput {
sel_price: string;
sel_link: string;
sel_image?: string;
render_js?: boolean;
rate_limit?: number;
rate_window?: number;
proxy_url?: string;
@@ -118,11 +120,12 @@ export function createStore(input: CreateStoreInput): Store {
db.run(`
INSERT INTO stores (name, slug, base_url, search_url, sel_container, sel_name, sel_price, sel_link, sel_image,
rate_limit, rate_window, proxy_url, user_agent, headers_json, currency, category_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
render_js, rate_limit, rate_window, proxy_url, user_agent, headers_json, currency, category_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`, [
input.name, slug, input.base_url, input.search_url,
input.sel_container, input.sel_name, input.sel_price, input.sel_link, input.sel_image || null,
input.render_js ? 1 : 0,
input.rate_limit ?? 2, input.rate_window ?? 1000,
input.proxy_url || null, input.user_agent || null, input.headers_json || null,
input.currency || 'EUR', input.category_id || null,

View File

@@ -35,6 +35,7 @@ export const storeRoutes: FastifyPluginAsync = async (app) => {
sel_price: { type: 'string', minLength: 1 },
sel_link: { type: 'string', minLength: 1 },
sel_image: { type: 'string' },
render_js: { type: 'boolean' },
rate_limit: { type: 'number' },
rate_window: { type: 'number' },
proxy_url: { type: 'string' },

View File

@@ -2,6 +2,7 @@ import type { FastifyPluginAsync } from 'fastify';
import { getStoreById } from '../models/store.js';
import { logScrape, getLogsByStore, getStoreHealth } from '../models/scrape-log.js';
import { scrapeStore } from '../scraper/http-scraper.js';
import { scrapeStoreWithBrowser } from '../scraper/browser-scraper.js';
import { normalizeResult } from '../scraper/result-parser.js';
export const testRoutes: FastifyPluginAsync = async (app) => {
@@ -26,7 +27,9 @@ export const testRoutes: FastifyPluginAsync = async (app) => {
const startTime = Date.now();
try {
const result = await scrapeStore(store, searchUrl);
const result = store.render_js
? await scrapeStoreWithBrowser(store, searchUrl)
: await scrapeStore(store, searchUrl);
const duration = Date.now() - startTime;
const products = result.items.map((item) =>

View File

@@ -0,0 +1,115 @@
import puppeteer, { type Browser } from 'puppeteer';
import type { Store } from '../models/store.js';
import type { ScrapedItem } from './result-parser.js';
/** Shared browser instance; assigned by getBrowser() when it launches and reset to null by closeBrowser(). */
let browser: Browser | null = null;
/** Timeout passed to page.waitForSelector while waiting for the product container (Puppeteer timeouts are in milliseconds). */
const PAGE_TIMEOUT = 30_000;
/** Timeout passed to page.goto for the initial navigation (Puppeteer timeouts are in milliseconds). */
const NAVIGATION_TIMEOUT = 20_000;
/**
 * Launch currently in flight, if any. Concurrent callers of getBrowser() await
 * this same promise so that only one browser process is started.
 */
let pendingLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared headless browser, launching it when there is no instance
 * yet or the previous one is no longer connected.
 *
 * The connected check and the assignment of `browser` are separated by an
 * asynchronous launch, so without coordination several simultaneous callers
 * would each start their own browser and all but the last would be orphaned.
 * Callers arriving during a launch therefore share that launch.
 *
 * @returns The connected shared browser instance.
 * @throws Whatever `puppeteer.launch` rejects with. The in-flight marker is
 *   cleared in that case, so a later call attempts a fresh launch.
 */
async function getBrowser(): Promise<Browser> {
  if (browser && browser.connected) return browser;

  if (!pendingLaunch) {
    pendingLaunch = puppeteer
      .launch({
        headless: true,
        // Sandbox, /dev/shm, GPU and extensions are switched off.
        // NOTE(review): presumably chosen for the containerised deployment — confirm.
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-gpu',
          '--disable-extensions',
          '--no-first-run',
        ],
      })
      .then((launched) => {
        browser = launched;
        return launched;
      })
      .finally(() => {
        // Runs on success and on failure: the launch is no longer in flight.
        pendingLaunch = null;
      });
  }

  return pendingLaunch;
}
/** Result returned by scrapeStoreWithBrowser for a single search page. */
export interface BrowserScrapeResult {
/** Items extracted from the rendered DOM; empty when the container selector never appeared. */
items: ScrapedItem[];
/** Page markup as returned by page.content(). */
html: string;
/** Status of the navigation response; 200 is used when page.goto yields no response object. */
statusCode: number;
}
/**
 * Scrapes one store search page by rendering it in the shared headless browser
 * and reading product data out of the resulting DOM.
 *
 * Steps: open a new page, apply the store's user agent and extra headers when
 * configured, navigate to `searchUrl`, wait for the container selector, then
 * collect name / price text / link / image for every container element.
 *
 * @param store Store configuration. Reads `user_agent`, `headers_json` and the
 *   `sel_container`, `sel_name`, `sel_price`, `sel_link`, `sel_image` selectors.
 * @param searchUrl Fully built search URL to navigate to.
 * @returns Extracted items, the page HTML and the navigation status code. When
 *   the container selector does not appear within PAGE_TIMEOUT the result has
 *   an empty `items` array rather than an error.
 * @throws Errors from opening the page or from `page.goto` (including the
 *   navigation timeout) propagate to the caller.
 *
 * NOTE(review): this function never reads `store.proxy_url`; confirm whether
 * browser-rendered scrapes are expected to go through the configured proxy.
 */
export async function scrapeStoreWithBrowser(store: Store, searchUrl: string): Promise<BrowserScrapeResult> {
  const b = await getBrowser();
  const page = await b.newPage();

  try {
    // Set user agent if configured
    if (store.user_agent) {
      await page.setUserAgent(store.user_agent);
    }

    // Set extra headers if configured. Best effort: unparsable JSON or headers
    // Puppeteer rejects are ignored and the scrape continues without them.
    if (store.headers_json) {
      try {
        const headers = JSON.parse(store.headers_json);
        await page.setExtraHTTPHeaders(headers);
      } catch { /* ignore invalid headers */ }
    }

    // Navigate to the page
    const response = await page.goto(searchUrl, {
      waitUntil: 'networkidle2',
      timeout: NAVIGATION_TIMEOUT,
    });
    const statusCode = response?.status() ?? 200;

    // Wait for the product container to appear
    try {
      await page.waitForSelector(store.sel_container, { timeout: PAGE_TIMEOUT });
    } catch {
      // Container might not exist if no results — return empty
      const html = await page.content();
      return { items: [], html, statusCode };
    }

    // Small extra wait for any remaining renders
    await new Promise((r) => setTimeout(r, 500));

    // Extract product data from the rendered DOM. The callback is executed in
    // the page context, so it can only use the selectors passed as its argument.
    const items = await page.evaluate((selectors) => {
      const containers = document.querySelectorAll(selectors.container);
      const results: Array<{ name: string; priceText: string; link: string; image: string | null }> = [];

      containers.forEach((el) => {
        const nameEl = el.querySelector(selectors.name);
        const priceEl = el.querySelector(selectors.price);
        const linkEl = el.querySelector(selectors.link) as HTMLAnchorElement | null;
        const imageEl = selectors.image ? el.querySelector(selectors.image) as HTMLImageElement | null : null;

        const name = nameEl?.textContent?.trim() || '';
        const priceText = priceEl?.textContent?.trim() || '';
        const link = linkEl?.getAttribute('href') || '';
        const image = imageEl?.getAttribute('src') || imageEl?.getAttribute('data-src') || null;

        // Entries without both a name and a price text are skipped.
        if (name && priceText) {
          results.push({ name, priceText, link, image });
        }
      });

      return results;
    }, {
      container: store.sel_container,
      name: store.sel_name,
      price: store.sel_price,
      link: store.sel_link,
      image: store.sel_image || null,
    });

    const html = await page.content();
    return { items, html, statusCode };
  } finally {
    // Closing can reject when the page or browser is already gone. That must
    // not discard a successful result or replace the error being propagated.
    await page.close().catch(() => undefined);
  }
}
/**
 * Closes the shared browser if one is currently held and drops the reference.
 * Does nothing when no browser is held.
 */
export async function closeBrowser(): Promise<void> {
  if (!browser) return;

  await browser.close();
  browser = null;
}

View File

@@ -3,6 +3,7 @@ import type { Store } from '../models/store.js';
import { getEnabledStores, getStoresByCategory, getStoresByGroup, getStoresByIds } from '../models/store.js';
import { logScrape } from '../models/scrape-log.js';
import { scrapeStore } from './http-scraper.js';
import { scrapeStoreWithBrowser } from './browser-scraper.js';
import { normalizeResult, type Product } from './result-parser.js';
import { getLimiter } from './rate-limiter.js';
@@ -66,7 +67,10 @@ export async function search(options: SearchOptions): Promise<SearchResult> {
const rateLimiter = getLimiter(store.id, 1, Math.floor(store.rate_window / store.rate_limit));
try {
const result = await rateLimiter.schedule(() => scrapeStore(store, searchUrl));
const scrapeFn = store.render_js
? () => scrapeStoreWithBrowser(store, searchUrl)
: () => scrapeStore(store, searchUrl);
const result = await rateLimiter.schedule(scrapeFn);
const duration = Date.now() - storeStart;
const products = result.items.map((item) =>

View File

@@ -8,6 +8,7 @@ export interface StoreFileConfig {
base_url: string;
search_url: string;
enabled?: boolean;
render_js?: boolean;
category?: string;
currency?: string;
selectors: {
@@ -98,7 +99,7 @@ export function syncFromFiles(storesDir: string): { created: number; updated: nu
if (existing) {
db.run(`
UPDATE stores SET
name = ?, base_url = ?, search_url = ?, enabled = ?,
name = ?, base_url = ?, search_url = ?, enabled = ?, render_js = ?,
sel_container = ?, sel_name = ?, sel_price = ?, sel_link = ?, sel_image = ?,
rate_limit = ?, rate_window = ?, proxy_url = ?, user_agent = ?, headers_json = ?,
currency = ?, category_id = ?, updated_at = datetime('now')
@@ -106,6 +107,7 @@ export function syncFromFiles(storesDir: string): { created: number; updated: nu
`, [
config.name, config.base_url, config.search_url,
config.enabled === false ? 0 : 1,
config.render_js ? 1 : 0,
config.selectors.container, config.selectors.name,
config.selectors.price, config.selectors.link,
config.selectors.image || null,
@@ -117,14 +119,15 @@ export function syncFromFiles(storesDir: string): { created: number; updated: nu
updated++;
} else {
db.run(`
INSERT INTO stores (name, slug, base_url, search_url, enabled,
INSERT INTO stores (name, slug, base_url, search_url, enabled, render_js,
sel_container, sel_name, sel_price, sel_link, sel_image,
rate_limit, rate_window, proxy_url, user_agent, headers_json,
currency, category_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`, [
config.name, slug, config.base_url, config.search_url,
config.enabled === false ? 0 : 1,
config.render_js ? 1 : 0,
config.selectors.container, config.selectors.name,
config.selectors.price, config.selectors.link,
config.selectors.image || null,
@@ -161,6 +164,7 @@ function storeToConfig(store: any, categoryName?: string): StoreFileConfig {
if (store.sel_image) config.selectors.image = store.sel_image;
if (store.enabled === 0) config.enabled = false;
if (store.render_js) config.render_js = true;
if (categoryName) config.category = categoryName;
if (store.currency && store.currency !== 'EUR') config.currency = store.currency;
if (store.rate_limit && store.rate_limit !== 2) config.rate_limit = store.rate_limit;