Add Puppeteer browser scraping and HG Spot store config

- Add browser-scraper.ts using Puppeteer for JS-heavy stores
- Add render_js flag to store model, migration, YAML sync, and UI
- Scraper engine auto-selects cheerio vs Puppeteer based on flag
- Store forms include JS rendering toggle in Advanced section
- Create first store config: HG Spot (Croatian electronics retailer)
- Update Dockerfile with Chromium for production Puppeteer support

Tested: HG Spot returns 15 products per page with correct names,
prices (EUR), links, and images using headless browser rendering.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
mariosemes
2026-03-26 21:36:20 +01:00
parent 97fb8d9663
commit 130ab30fcc
13 changed files with 1037 additions and 39 deletions

View File

@@ -19,6 +19,11 @@ RUN npx tsc
FROM node:20-alpine
WORKDIR /app
# Install Chromium for Puppeteer
RUN apk add --no-cache chromium
ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
COPY package*.json ./
RUN npm ci --omit=dev
@@ -29,6 +34,7 @@ COPY src/server/db/migrations ./dist/server/db/migrations
ENV NODE_ENV=production
ENV PORT=3000
ENV DATABASE_PATH=/app/data/pricehunter.db
ENV STORES_DIR=/app/stores
EXPOSE 3000
VOLUME /app/data

880
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -22,6 +22,7 @@
"dotenv": "^16.4.7",
"fastify": "^5.2.1",
"p-limit": "^6.2.0",
"puppeteer": "^24.40.0",
"sql.js": "^1.11.0",
"yaml": "^2.8.3"
},

View File

@@ -12,7 +12,7 @@
let form = $state({
name: '', base_url: '', search_url: '',
sel_container: '', sel_name: '', sel_price: '', sel_link: '', sel_image: '',
category_id: '', currency: 'EUR', rate_limit: 2,
category_id: '', currency: 'EUR', rate_limit: 2, render_js: false,
user_agent: '', proxy_url: '', headers_json: '',
});
@@ -24,8 +24,8 @@
sel_container: store.sel_container, sel_name: store.sel_name, sel_price: store.sel_price,
sel_link: store.sel_link, sel_image: store.sel_image || '',
category_id: store.category_id?.toString() || '', currency: store.currency,
rate_limit: store.rate_limit, user_agent: store.user_agent || '',
proxy_url: store.proxy_url || '', headers_json: store.headers_json || '',
rate_limit: store.rate_limit, render_js: !!store.render_js,
user_agent: store.user_agent || '', headers_json: store.headers_json || '',
};
loading = false;
});
@@ -114,6 +114,17 @@
<section class="card p-5">
<h2 class="text-xs font-semibold text-text-primary uppercase tracking-wider mb-4">Advanced</h2>
<div class="flex items-center gap-3 mb-4 px-1">
<button type="button" onclick={() => form.render_js = !form.render_js}
class="w-8 h-[18px] rounded-full transition-colors relative {form.render_js ? 'bg-accent' : 'bg-surface-hover border border-surface-border'}"
aria-label="Toggle JavaScript rendering">
<span class="absolute top-[2px] w-[14px] h-[14px] bg-white rounded-full shadow transition-all {form.render_js ? 'right-[2px]' : 'left-[2px]'}"></span>
</button>
<div>
<span class="text-sm text-text-primary">JavaScript Rendering</span>
<p class="text-2xs text-text-tertiary">Use a headless browser for JS-heavy stores</p>
</div>
</div>
<div class="grid grid-cols-2 gap-4">
<div><label class="label">Rate Limit</label><input type="number" bind:value={form.rate_limit} min="1" max="10" class="input-field" /></div>
<div><label class="label">User Agent</label><input type="text" bind:value={form.user_agent} class="input-field" /></div>

View File

@@ -10,7 +10,7 @@
let form = $state({
name: '', base_url: '', search_url: '',
sel_container: '', sel_name: '', sel_price: '', sel_link: '', sel_image: '',
category_id: '', currency: 'EUR', rate_limit: 2,
category_id: '', currency: 'EUR', rate_limit: 2, render_js: false,
user_agent: '', proxy_url: '', headers_json: '',
});
@@ -111,6 +111,17 @@
<section class="card p-5">
<h2 class="text-xs font-semibold text-text-primary uppercase tracking-wider mb-4">Advanced</h2>
<div class="flex items-center gap-3 mb-4 px-1">
<button type="button" onclick={() => form.render_js = !form.render_js}
class="w-8 h-[18px] rounded-full transition-colors relative {form.render_js ? 'bg-accent' : 'bg-surface-hover border border-surface-border'}"
aria-label="Toggle JavaScript rendering">
<span class="absolute top-[2px] w-[14px] h-[14px] bg-white rounded-full shadow transition-all {form.render_js ? 'right-[2px]' : 'left-[2px]'}"></span>
</button>
<div>
<span class="text-sm text-text-primary">JavaScript Rendering</span>
<p class="text-2xs text-text-tertiary">Use a headless browser for JS-heavy stores (slower but handles dynamic content)</p>
</div>
</div>
<div class="grid grid-cols-2 gap-4">
<div>
<label class="label">Rate Limit (req/sec)</label>

View File

@@ -0,0 +1 @@
-- Adds the per-store render_js flag to the stores table.
-- Stored as an integer (0 or 1); the NOT NULL DEFAULT 0 means rows that
-- existed before this migration get 0.
ALTER TABLE stores ADD COLUMN render_js INTEGER NOT NULL DEFAULT 0;

View File

@@ -7,6 +7,7 @@ export interface Store {
base_url: string;
search_url: string;
enabled: number;
render_js: number;
sel_container: string;
sel_name: string;
sel_price: string;
@@ -38,6 +39,7 @@ export interface CreateStoreInput {
sel_price: string;
sel_link: string;
sel_image?: string;
render_js?: boolean;
rate_limit?: number;
rate_window?: number;
proxy_url?: string;
@@ -118,11 +120,12 @@ export function createStore(input: CreateStoreInput): Store {
db.run(`
INSERT INTO stores (name, slug, base_url, search_url, sel_container, sel_name, sel_price, sel_link, sel_image,
rate_limit, rate_window, proxy_url, user_agent, headers_json, currency, category_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
render_js, rate_limit, rate_window, proxy_url, user_agent, headers_json, currency, category_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`, [
input.name, slug, input.base_url, input.search_url,
input.sel_container, input.sel_name, input.sel_price, input.sel_link, input.sel_image || null,
input.render_js ? 1 : 0,
input.rate_limit ?? 2, input.rate_window ?? 1000,
input.proxy_url || null, input.user_agent || null, input.headers_json || null,
input.currency || 'EUR', input.category_id || null,

View File

@@ -35,6 +35,7 @@ export const storeRoutes: FastifyPluginAsync = async (app) => {
sel_price: { type: 'string', minLength: 1 },
sel_link: { type: 'string', minLength: 1 },
sel_image: { type: 'string' },
render_js: { type: 'boolean' },
rate_limit: { type: 'number' },
rate_window: { type: 'number' },
proxy_url: { type: 'string' },

View File

@@ -2,6 +2,7 @@ import type { FastifyPluginAsync } from 'fastify';
import { getStoreById } from '../models/store.js';
import { logScrape, getLogsByStore, getStoreHealth } from '../models/scrape-log.js';
import { scrapeStore } from '../scraper/http-scraper.js';
import { scrapeStoreWithBrowser } from '../scraper/browser-scraper.js';
import { normalizeResult } from '../scraper/result-parser.js';
export const testRoutes: FastifyPluginAsync = async (app) => {
@@ -26,7 +27,9 @@ export const testRoutes: FastifyPluginAsync = async (app) => {
const startTime = Date.now();
try {
const result = await scrapeStore(store, searchUrl);
const result = store.render_js
? await scrapeStoreWithBrowser(store, searchUrl)
: await scrapeStore(store, searchUrl);
const duration = Date.now() - startTime;
const products = result.items.map((item) =>

View File

@@ -0,0 +1,115 @@
import puppeteer, { type Browser } from 'puppeteer';
import type { Store } from '../models/store.js';
import type { ScrapedItem } from './result-parser.js';
/** Shared headless browser instance; launched lazily by `getBrowser`. */
let browser: Browser | null = null;

/**
 * Launch currently in flight, if any. Concurrent callers await this same
 * promise instead of each spawning (and leaking) their own browser process.
 */
let pendingLaunch: Promise<Browser> | null = null;

/** Maximum time (ms) to wait for the product container selector to appear. */
const PAGE_TIMEOUT = 30_000;

/** Maximum time (ms) allowed for the initial page navigation. */
const NAVIGATION_TIMEOUT = 20_000;

/**
 * Returns the shared browser, launching one when none exists or the
 * previous instance is no longer connected.
 *
 * Safe to call concurrently: only one launch is ever in flight, and every
 * caller that arrives while it is pending receives the same instance.
 *
 * @returns The connected shared browser.
 * @throws Whatever `puppeteer.launch` rejects with; the failed launch is not
 *   cached, so the next call retries.
 */
async function getBrowser(): Promise<Browser> {
  if (browser && browser.connected) return browser;
  if (pendingLaunch) return pendingLaunch;

  const launch = puppeteer.launch({
    headless: true,
    // NOTE(review): these look like the usual flags for running Chromium
    // inside a container image — confirm they are still all required.
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-gpu',
      '--disable-extensions',
      '--no-first-run',
    ],
  });
  pendingLaunch = launch;

  try {
    const launched = await launch;
    browser = launched;
    return launched;
  } finally {
    // Clear on success and on failure so a rejected launch is never reused.
    pendingLaunch = null;
  }
}
/** Outcome of scraping one search page with the headless browser. */
export interface BrowserScrapeResult {
/** Products extracted from the rendered DOM; empty when the container selector never appeared. */
items: ScrapedItem[];
/** Serialized page HTML as returned by `page.content()` after rendering. */
html: string;
/** HTTP status of the navigation response; 200 is reported when no response object is available. */
statusCode: number;
}
/**
 * Converts a store's `headers_json` string into a header map Puppeteer accepts.
 *
 * String values are kept, numbers and booleans are stringified, and any
 * other value is dropped, so one odd entry no longer discards every header.
 *
 * @returns The header map, or `null` when the input is not valid JSON or
 *   does not describe a plain object.
 */
function parseExtraHeaders(headersJson: string): Record<string, string> | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(headersJson);
  } catch {
    return null;
  }
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return null;
  }

  const headers: Record<string, string> = {};
  for (const [key, value] of Object.entries(parsed)) {
    if (typeof value === 'string') {
      headers[key] = value;
    } else if (typeof value === 'number' || typeof value === 'boolean') {
      headers[key] = String(value);
    }
  }
  return headers;
}

/**
 * Scrapes one search results page by rendering it in the shared headless
 * browser and reading products out of the live DOM.
 *
 * NOTE(review): `store.proxy_url` is not applied anywhere in this function.
 *
 * @param store - Store configuration: CSS selectors plus optional user agent
 *   and extra headers.
 * @param searchUrl - Fully built search URL to navigate to.
 * @returns Extracted items, the rendered HTML, and the navigation status code.
 *   When the container selector does not appear within `PAGE_TIMEOUT`, the
 *   result has an empty `items` array.
 * @throws If the browser cannot be obtained, the page cannot be opened, or
 *   navigation fails or times out.
 */
export async function scrapeStoreWithBrowser(store: Store, searchUrl: string): Promise<BrowserScrapeResult> {
  const b = await getBrowser();
  const page = await b.newPage();

  try {
    if (store.user_agent) {
      await page.setUserAgent(store.user_agent);
    }

    if (store.headers_json) {
      const headers = parseExtraHeaders(store.headers_json);
      if (headers) {
        try {
          await page.setExtraHTTPHeaders(headers);
        } catch {
          // Extra headers are best-effort: a rejected header must not abort the scrape.
        }
      }
    }

    const response = await page.goto(searchUrl, {
      waitUntil: 'networkidle2',
      timeout: NAVIGATION_TIMEOUT,
    });
    const statusCode = response?.status() ?? 200;

    try {
      await page.waitForSelector(store.sel_container, { timeout: PAGE_TIMEOUT });
    } catch {
      // The container is absent when the search has no results — report an empty page.
      const html = await page.content();
      return { items: [], html, statusCode };
    }

    // Short grace period so late client-side renders can settle.
    await new Promise((r) => setTimeout(r, 500));

    // This callback is executed inside the page, so it may only use the
    // serializable `selectors` argument, never variables from this module.
    const items = await page.evaluate((selectors) => {
      const containers = document.querySelectorAll(selectors.container);
      const results: Array<{ name: string; priceText: string; link: string; image: string | null }> = [];

      containers.forEach((el) => {
        const nameEl = el.querySelector(selectors.name);
        const priceEl = el.querySelector(selectors.price);
        const linkEl = el.querySelector(selectors.link);
        const imageEl = selectors.image ? el.querySelector(selectors.image) : null;

        const name = nameEl?.textContent?.trim() || '';
        const priceText = priceEl?.textContent?.trim() || '';
        const link = linkEl?.getAttribute('href') || '';
        // Fall back to data-src when src is missing or empty.
        const image = imageEl?.getAttribute('src') || imageEl?.getAttribute('data-src') || null;

        // Entries without both a name and a price are skipped.
        if (name && priceText) {
          results.push({ name, priceText, link, image });
        }
      });

      return results;
    }, {
      container: store.sel_container,
      name: store.sel_name,
      price: store.sel_price,
      link: store.sel_link,
      image: store.sel_image || null,
    });

    const html = await page.content();
    return { items, html, statusCode };
  } finally {
    try {
      await page.close();
    } catch {
      // Closing can fail when the browser has already gone away; swallowing it
      // here keeps the original scrape error (or result) from being replaced.
    }
  }
}
/**
 * Closes the shared headless browser, if one has been launched.
 *
 * The module-level reference is cleared before the close is awaited, so a
 * failing close cannot leave a stale handle behind and a second concurrent
 * call becomes a no-op instead of closing the same browser twice.
 *
 * @throws Whatever `browser.close()` rejects with.
 */
export async function closeBrowser(): Promise<void> {
  const current = browser;
  if (!current) return;

  browser = null;
  await current.close();
}

View File

@@ -3,6 +3,7 @@ import type { Store } from '../models/store.js';
import { getEnabledStores, getStoresByCategory, getStoresByGroup, getStoresByIds } from '../models/store.js';
import { logScrape } from '../models/scrape-log.js';
import { scrapeStore } from './http-scraper.js';
import { scrapeStoreWithBrowser } from './browser-scraper.js';
import { normalizeResult, type Product } from './result-parser.js';
import { getLimiter } from './rate-limiter.js';
@@ -66,7 +67,10 @@ export async function search(options: SearchOptions): Promise<SearchResult> {
const rateLimiter = getLimiter(store.id, 1, Math.floor(store.rate_window / store.rate_limit));
try {
const result = await rateLimiter.schedule(() => scrapeStore(store, searchUrl));
const scrapeFn = store.render_js
? () => scrapeStoreWithBrowser(store, searchUrl)
: () => scrapeStore(store, searchUrl);
const result = await rateLimiter.schedule(scrapeFn);
const duration = Date.now() - storeStart;
const products = result.items.map((item) =>

View File

@@ -8,6 +8,7 @@ export interface StoreFileConfig {
base_url: string;
search_url: string;
enabled?: boolean;
render_js?: boolean;
category?: string;
currency?: string;
selectors: {
@@ -98,7 +99,7 @@ export function syncFromFiles(storesDir: string): { created: number; updated: nu
if (existing) {
db.run(`
UPDATE stores SET
name = ?, base_url = ?, search_url = ?, enabled = ?,
name = ?, base_url = ?, search_url = ?, enabled = ?, render_js = ?,
sel_container = ?, sel_name = ?, sel_price = ?, sel_link = ?, sel_image = ?,
rate_limit = ?, rate_window = ?, proxy_url = ?, user_agent = ?, headers_json = ?,
currency = ?, category_id = ?, updated_at = datetime('now')
@@ -106,6 +107,7 @@ export function syncFromFiles(storesDir: string): { created: number; updated: nu
`, [
config.name, config.base_url, config.search_url,
config.enabled === false ? 0 : 1,
config.render_js ? 1 : 0,
config.selectors.container, config.selectors.name,
config.selectors.price, config.selectors.link,
config.selectors.image || null,
@@ -117,14 +119,15 @@ export function syncFromFiles(storesDir: string): { created: number; updated: nu
updated++;
} else {
db.run(`
INSERT INTO stores (name, slug, base_url, search_url, enabled,
INSERT INTO stores (name, slug, base_url, search_url, enabled, render_js,
sel_container, sel_name, sel_price, sel_link, sel_image,
rate_limit, rate_window, proxy_url, user_agent, headers_json,
currency, category_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`, [
config.name, slug, config.base_url, config.search_url,
config.enabled === false ? 0 : 1,
config.render_js ? 1 : 0,
config.selectors.container, config.selectors.name,
config.selectors.price, config.selectors.link,
config.selectors.image || null,
@@ -161,6 +164,7 @@ function storeToConfig(store: any, categoryName?: string): StoreFileConfig {
if (store.sel_image) config.selectors.image = store.sel_image;
if (store.enabled === 0) config.enabled = false;
if (store.render_js) config.render_js = true;
if (categoryName) config.category = categoryName;
if (store.currency && store.currency !== 'EUR') config.currency = store.currency;
if (store.rate_limit && store.rate_limit !== 2) config.rate_limit = store.rate_limit;

14
stores/hgspot.yaml Normal file
View File

@@ -0,0 +1,14 @@
# Store config: HG Spot (Croatian electronics retailer).
name: HG Spot
base_url: https://www.hgspot.hr
# NOTE(review): {query} is presumably replaced with the search term by the
# scraper — confirm against the URL builder.
search_url: https://www.hgspot.hr/pretraga?q={query}&page=0
category: Electronics
currency: EUR
# Scrape this store through the headless browser instead of plain HTTP.
render_js: true
rate_limit: 1
# CSS selectors; all keys must be nested under `selectors`, because the sync
# code reads them as selectors.container, selectors.name, and so on.
selectors:
  container: "section.product-container-comm"
  name: "h2.product-name"
  price: "div.product-price"
  link: "h2.product-name a"
  image: ".product-image img"