Add Puppeteer browser scraping and HG Spot store config
- Add browser-scraper.ts using Puppeteer for JS-heavy stores
- Add render_js flag to store model, migration, YAML sync, and UI
- Scraper engine auto-selects cheerio vs Puppeteer based on flag
- Store forms include JS rendering toggle in Advanced section
- Create first store config: HG Spot (Croatian electronics retailer)
- Update Dockerfile with Chromium for production Puppeteer support

Tested: HG Spot returns 15 products per page with correct names, prices (EUR), links, and images using headless browser rendering.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -19,6 +19,11 @@ RUN npx tsc
|
||||
FROM node:20-alpine
|
||||
WORKDIR /app
|
||||
|
||||
# Install the distro Chromium build and point Puppeteer at it.
RUN apk add --no-cache chromium
ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser
# Must be set before `npm ci` so the install step does not fetch a bundled browser.
# PUPPETEER_SKIP_DOWNLOAD is the name documented for current Puppeteer releases;
# the older PUPPETEER_SKIP_CHROMIUM_DOWNLOAD name is kept for backward compatibility.
ENV PUPPETEER_SKIP_DOWNLOAD=true
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
|
||||
|
||||
COPY package*.json ./
|
||||
RUN npm ci --omit=dev
|
||||
|
||||
@@ -29,6 +34,7 @@ COPY src/server/db/migrations ./dist/server/db/migrations
|
||||
ENV NODE_ENV=production
|
||||
ENV PORT=3000
|
||||
ENV DATABASE_PATH=/app/data/pricehunter.db
|
||||
ENV STORES_DIR=/app/stores
|
||||
|
||||
EXPOSE 3000
|
||||
VOLUME /app/data
|
||||
|
||||
880
package-lock.json
generated
880
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -22,6 +22,7 @@
|
||||
"dotenv": "^16.4.7",
|
||||
"fastify": "^5.2.1",
|
||||
"p-limit": "^6.2.0",
|
||||
"puppeteer": "^24.40.0",
|
||||
"sql.js": "^1.11.0",
|
||||
"yaml": "^2.8.3"
|
||||
},
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
let form = $state({
|
||||
name: '', base_url: '', search_url: '',
|
||||
sel_container: '', sel_name: '', sel_price: '', sel_link: '', sel_image: '',
|
||||
category_id: '', currency: 'EUR', rate_limit: 2,
|
||||
category_id: '', currency: 'EUR', rate_limit: 2, render_js: false,
|
||||
user_agent: '', proxy_url: '', headers_json: '',
|
||||
});
|
||||
|
||||
@@ -24,8 +24,8 @@
|
||||
sel_container: store.sel_container, sel_name: store.sel_name, sel_price: store.sel_price,
|
||||
sel_link: store.sel_link, sel_image: store.sel_image || '',
|
||||
category_id: store.category_id?.toString() || '', currency: store.currency,
|
||||
rate_limit: store.rate_limit, user_agent: store.user_agent || '',
|
||||
proxy_url: store.proxy_url || '', headers_json: store.headers_json || '',
|
||||
rate_limit: store.rate_limit, render_js: !!store.render_js,
|
||||
// Reload proxy_url as well: the form state declares it, and this object replaces the whole state.
user_agent: store.user_agent || '', proxy_url: store.proxy_url || '', headers_json: store.headers_json || '',
|
||||
};
|
||||
loading = false;
|
||||
});
|
||||
@@ -114,6 +114,17 @@
|
||||
|
||||
<section class="card p-5">
|
||||
<h2 class="text-xs font-semibold text-text-primary uppercase tracking-wider mb-4">Advanced</h2>
|
||||
<div class="flex items-center gap-3 mb-4 px-1">
|
||||
<button type="button" onclick={() => form.render_js = !form.render_js}
|
||||
class="w-8 h-[18px] rounded-full transition-colors relative {form.render_js ? 'bg-accent' : 'bg-surface-hover border border-surface-border'}"
|
||||
aria-label="Toggle JavaScript rendering">
|
||||
<span class="absolute top-[2px] w-[14px] h-[14px] bg-white rounded-full shadow transition-all {form.render_js ? 'right-[2px]' : 'left-[2px]'}"></span>
|
||||
</button>
|
||||
<div>
|
||||
<span class="text-sm text-text-primary">JavaScript Rendering</span>
|
||||
<p class="text-2xs text-text-tertiary">Use a headless browser for JS-heavy stores</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="grid grid-cols-2 gap-4">
|
||||
<div><label class="label">Rate Limit</label><input type="number" bind:value={form.rate_limit} min="1" max="10" class="input-field" /></div>
|
||||
<div><label class="label">User Agent</label><input type="text" bind:value={form.user_agent} class="input-field" /></div>
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
let form = $state({
|
||||
name: '', base_url: '', search_url: '',
|
||||
sel_container: '', sel_name: '', sel_price: '', sel_link: '', sel_image: '',
|
||||
category_id: '', currency: 'EUR', rate_limit: 2,
|
||||
category_id: '', currency: 'EUR', rate_limit: 2, render_js: false,
|
||||
user_agent: '', proxy_url: '', headers_json: '',
|
||||
});
|
||||
|
||||
@@ -111,6 +111,17 @@
|
||||
|
||||
<section class="card p-5">
|
||||
<h2 class="text-xs font-semibold text-text-primary uppercase tracking-wider mb-4">Advanced</h2>
|
||||
<div class="flex items-center gap-3 mb-4 px-1">
|
||||
<button type="button" onclick={() => form.render_js = !form.render_js}
|
||||
class="w-8 h-[18px] rounded-full transition-colors relative {form.render_js ? 'bg-accent' : 'bg-surface-hover border border-surface-border'}"
|
||||
aria-label="Toggle JavaScript rendering">
|
||||
<span class="absolute top-[2px] w-[14px] h-[14px] bg-white rounded-full shadow transition-all {form.render_js ? 'right-[2px]' : 'left-[2px]'}"></span>
|
||||
</button>
|
||||
<div>
|
||||
<span class="text-sm text-text-primary">JavaScript Rendering</span>
|
||||
<p class="text-2xs text-text-tertiary">Use a headless browser for JS-heavy stores (slower but handles dynamic content)</p>
|
||||
</div>
|
||||
</div>
|
||||
<div class="grid grid-cols-2 gap-4">
|
||||
<div>
|
||||
<label class="label">Rate Limit (req/sec)</label>
|
||||
|
||||
1
src/server/db/migrations/002_add_render_js.sql
Normal file
1
src/server/db/migrations/002_add_render_js.sql
Normal file
@@ -0,0 +1 @@
|
||||
ALTER TABLE stores ADD COLUMN render_js INTEGER NOT NULL DEFAULT 0;
|
||||
@@ -7,6 +7,7 @@ export interface Store {
|
||||
base_url: string;
|
||||
search_url: string;
|
||||
enabled: number;
|
||||
render_js: number;
|
||||
sel_container: string;
|
||||
sel_name: string;
|
||||
sel_price: string;
|
||||
@@ -38,6 +39,7 @@ export interface CreateStoreInput {
|
||||
sel_price: string;
|
||||
sel_link: string;
|
||||
sel_image?: string;
|
||||
render_js?: boolean;
|
||||
rate_limit?: number;
|
||||
rate_window?: number;
|
||||
proxy_url?: string;
|
||||
@@ -118,11 +120,12 @@ export function createStore(input: CreateStoreInput): Store {
|
||||
|
||||
db.run(`
|
||||
INSERT INTO stores (name, slug, base_url, search_url, sel_container, sel_name, sel_price, sel_link, sel_image,
|
||||
rate_limit, rate_window, proxy_url, user_agent, headers_json, currency, category_id)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
render_js, rate_limit, rate_window, proxy_url, user_agent, headers_json, currency, category_id)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
`, [
|
||||
input.name, slug, input.base_url, input.search_url,
|
||||
input.sel_container, input.sel_name, input.sel_price, input.sel_link, input.sel_image || null,
|
||||
input.render_js ? 1 : 0,
|
||||
input.rate_limit ?? 2, input.rate_window ?? 1000,
|
||||
input.proxy_url || null, input.user_agent || null, input.headers_json || null,
|
||||
input.currency || 'EUR', input.category_id || null,
|
||||
|
||||
@@ -35,6 +35,7 @@ export const storeRoutes: FastifyPluginAsync = async (app) => {
|
||||
sel_price: { type: 'string', minLength: 1 },
|
||||
sel_link: { type: 'string', minLength: 1 },
|
||||
sel_image: { type: 'string' },
|
||||
render_js: { type: 'boolean' },
|
||||
rate_limit: { type: 'number' },
|
||||
rate_window: { type: 'number' },
|
||||
proxy_url: { type: 'string' },
|
||||
|
||||
@@ -2,6 +2,7 @@ import type { FastifyPluginAsync } from 'fastify';
|
||||
import { getStoreById } from '../models/store.js';
|
||||
import { logScrape, getLogsByStore, getStoreHealth } from '../models/scrape-log.js';
|
||||
import { scrapeStore } from '../scraper/http-scraper.js';
|
||||
import { scrapeStoreWithBrowser } from '../scraper/browser-scraper.js';
|
||||
import { normalizeResult } from '../scraper/result-parser.js';
|
||||
|
||||
export const testRoutes: FastifyPluginAsync = async (app) => {
|
||||
@@ -26,7 +27,9 @@ export const testRoutes: FastifyPluginAsync = async (app) => {
|
||||
const startTime = Date.now();
|
||||
|
||||
try {
|
||||
const result = await scrapeStore(store, searchUrl);
|
||||
const result = store.render_js
|
||||
? await scrapeStoreWithBrowser(store, searchUrl)
|
||||
: await scrapeStore(store, searchUrl);
|
||||
const duration = Date.now() - startTime;
|
||||
|
||||
const products = result.items.map((item) =>
|
||||
|
||||
115
src/server/scraper/browser-scraper.ts
Normal file
115
src/server/scraper/browser-scraper.ts
Normal file
@@ -0,0 +1,115 @@
|
||||
import puppeteer, { type Browser } from 'puppeteer';
|
||||
import type { Store } from '../models/store.js';
|
||||
import type { ScrapedItem } from './result-parser.js';
|
||||
|
||||
let browser: Browser | null = null;
|
||||
|
||||
const PAGE_TIMEOUT = 30_000;
|
||||
const NAVIGATION_TIMEOUT = 20_000;
|
||||
|
||||
/** In-flight launch shared by concurrent callers so only one Chromium process is started. */
let pendingLaunch: Promise<Browser> | null = null;

/**
 * Returns the shared headless browser, launching it on first use or after it disconnected.
 *
 * Callers that arrive while a launch is still in progress receive the same pending
 * promise instead of starting another browser process.
 *
 * @returns The connected module-level browser instance.
 * @throws Propagates any error raised by `puppeteer.launch`; the next call retries the launch.
 */
async function getBrowser(): Promise<Browser> {
  if (browser?.connected) return browser;

  if (!pendingLaunch) {
    pendingLaunch = puppeteer
      .launch({
        headless: true,
        // NOTE(review): --no-sandbox disables Chromium's sandbox; confirm this is
        // acceptable for the environment the scraper runs in.
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-gpu',
          '--disable-extensions',
          '--no-first-run',
        ],
      })
      .then((launched) => {
        browser = launched;
        return launched;
      })
      .finally(() => {
        // Clear the marker on success and on failure so a failed launch can be retried.
        pendingLaunch = null;
      });
  }

  return pendingLaunch;
}
|
||||
|
||||
/** Outcome of one headless-browser scrape of a store search page. */
export interface BrowserScrapeResult {
  /** HTTP status of the main navigation; 200 is used when no response object is available. */
  statusCode: number;
  /** Serialized markup of the page as returned by `page.content()`. */
  html: string;
  /** Raw product entries read from the rendered DOM; empty when the container never appeared. */
  items: ScrapedItem[];
}
|
||||
|
||||
/**
 * Turns the store's `headers_json` text into a header map Puppeteer accepts.
 *
 * Only a JSON object is accepted. String values are kept, numbers and booleans are
 * converted to strings, and any other value is skipped so that a single bad entry
 * does not discard the remaining headers.
 *
 * @param raw - JSON text taken from the store configuration.
 * @returns The usable headers; empty when the text is not a JSON object.
 */
function parseExtraHeaders(raw: string): Record<string, string> {
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Invalid JSON is treated as "no extra headers" (best effort, as before).
    return {};
  }

  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) return {};

  const headers: Record<string, string> = {};
  for (const [key, value] of Object.entries(parsed)) {
    if (typeof value === 'string') {
      headers[key] = value;
    } else if (typeof value === 'number' || typeof value === 'boolean') {
      headers[key] = String(value);
    }
  }
  return headers;
}

/**
 * Scrapes a search results page with the shared headless browser so that listings
 * rendered by client-side JavaScript are present in the DOM before extraction.
 *
 * @param store - Store configuration; reads `user_agent`, `headers_json` and the `sel_*` selectors.
 * @param searchUrl - Search URL to load.
 * @returns The extracted raw items, the rendered page HTML and the navigation status code.
 *   Items are empty when the product container does not appear within `PAGE_TIMEOUT`.
 * @throws Propagates errors from opening the page and from `page.goto`, including the
 *   navigation timeout.
 */
export async function scrapeStoreWithBrowser(store: Store, searchUrl: string): Promise<BrowserScrapeResult> {
  const b = await getBrowser();
  const page = await b.newPage();

  try {
    // Set user agent if configured
    if (store.user_agent) {
      await page.setUserAgent(store.user_agent);
    }

    // Set extra headers if configured; header problems never abort the scrape
    if (store.headers_json) {
      const headers = parseExtraHeaders(store.headers_json);
      if (Object.keys(headers).length > 0) {
        try {
          await page.setExtraHTTPHeaders(headers);
        } catch { /* ignore headers the browser rejects */ }
      }
    }

    // Navigate to the page
    const response = await page.goto(searchUrl, {
      waitUntil: 'networkidle2',
      timeout: NAVIGATION_TIMEOUT,
    });

    const statusCode = response?.status() ?? 200;

    // Wait for the product container to appear
    try {
      await page.waitForSelector(store.sel_container, { timeout: PAGE_TIMEOUT });
    } catch {
      // Container might not exist if no results — return empty
      const html = await page.content();
      return { items: [], html, statusCode };
    }

    // Small extra wait for any remaining renders
    await new Promise((r) => setTimeout(r, 500));

    // Extract product data from the rendered DOM (this callback runs inside the page)
    const items = await page.evaluate((selectors) => {
      const containers = document.querySelectorAll(selectors.container);
      const results: Array<{ name: string; priceText: string; link: string; image: string | null }> = [];

      containers.forEach((el) => {
        const nameEl = el.querySelector(selectors.name);
        const priceEl = el.querySelector(selectors.price);
        const linkEl = el.querySelector(selectors.link) as HTMLAnchorElement | null;
        const imageEl = selectors.image ? el.querySelector(selectors.image) as HTMLImageElement | null : null;

        const name = nameEl?.textContent?.trim() || '';
        const priceText = priceEl?.textContent?.trim() || '';
        const link = linkEl?.getAttribute('href') || '';
        const image = imageEl?.getAttribute('src') || imageEl?.getAttribute('data-src') || null;

        // Entries without both a name and a price are dropped
        if (name && priceText) {
          results.push({ name, priceText, link, image });
        }
      });

      return results;
    }, {
      container: store.sel_container,
      name: store.sel_name,
      price: store.sel_price,
      link: store.sel_link,
      image: store.sel_image || null,
    });

    const html = await page.content();

    return { items, html, statusCode };
  } finally {
    // A failing close (for example when the browser is already gone) must not
    // replace the scrape result or hide the error that ended the try block.
    await page.close().catch(() => undefined);
  }
}
|
||||
|
||||
/**
 * Shuts down the shared headless browser if one is held.
 *
 * The module-level reference is cleared before the close is awaited, so the stale
 * handle is dropped even when `close()` rejects.
 *
 * @throws Propagates any error raised by `browser.close()`.
 */
export async function closeBrowser(): Promise<void> {
  const current = browser;
  if (!current) return;

  browser = null;
  await current.close();
}
|
||||
@@ -3,6 +3,7 @@ import type { Store } from '../models/store.js';
|
||||
import { getEnabledStores, getStoresByCategory, getStoresByGroup, getStoresByIds } from '../models/store.js';
|
||||
import { logScrape } from '../models/scrape-log.js';
|
||||
import { scrapeStore } from './http-scraper.js';
|
||||
import { scrapeStoreWithBrowser } from './browser-scraper.js';
|
||||
import { normalizeResult, type Product } from './result-parser.js';
|
||||
import { getLimiter } from './rate-limiter.js';
|
||||
|
||||
@@ -66,7 +67,10 @@ export async function search(options: SearchOptions): Promise<SearchResult> {
|
||||
const rateLimiter = getLimiter(store.id, 1, Math.floor(store.rate_window / store.rate_limit));
|
||||
|
||||
try {
|
||||
const result = await rateLimiter.schedule(() => scrapeStore(store, searchUrl));
|
||||
const scrapeFn = store.render_js
|
||||
? () => scrapeStoreWithBrowser(store, searchUrl)
|
||||
: () => scrapeStore(store, searchUrl);
|
||||
const result = await rateLimiter.schedule(scrapeFn);
|
||||
const duration = Date.now() - storeStart;
|
||||
|
||||
const products = result.items.map((item) =>
|
||||
|
||||
@@ -8,6 +8,7 @@ export interface StoreFileConfig {
|
||||
base_url: string;
|
||||
search_url: string;
|
||||
enabled?: boolean;
|
||||
render_js?: boolean;
|
||||
category?: string;
|
||||
currency?: string;
|
||||
selectors: {
|
||||
@@ -98,7 +99,7 @@ export function syncFromFiles(storesDir: string): { created: number; updated: nu
|
||||
if (existing) {
|
||||
db.run(`
|
||||
UPDATE stores SET
|
||||
name = ?, base_url = ?, search_url = ?, enabled = ?,
|
||||
name = ?, base_url = ?, search_url = ?, enabled = ?, render_js = ?,
|
||||
sel_container = ?, sel_name = ?, sel_price = ?, sel_link = ?, sel_image = ?,
|
||||
rate_limit = ?, rate_window = ?, proxy_url = ?, user_agent = ?, headers_json = ?,
|
||||
currency = ?, category_id = ?, updated_at = datetime('now')
|
||||
@@ -106,6 +107,7 @@ export function syncFromFiles(storesDir: string): { created: number; updated: nu
|
||||
`, [
|
||||
config.name, config.base_url, config.search_url,
|
||||
config.enabled === false ? 0 : 1,
|
||||
config.render_js ? 1 : 0,
|
||||
config.selectors.container, config.selectors.name,
|
||||
config.selectors.price, config.selectors.link,
|
||||
config.selectors.image || null,
|
||||
@@ -117,14 +119,15 @@ export function syncFromFiles(storesDir: string): { created: number; updated: nu
|
||||
updated++;
|
||||
} else {
|
||||
db.run(`
|
||||
INSERT INTO stores (name, slug, base_url, search_url, enabled,
|
||||
INSERT INTO stores (name, slug, base_url, search_url, enabled, render_js,
|
||||
sel_container, sel_name, sel_price, sel_link, sel_image,
|
||||
rate_limit, rate_window, proxy_url, user_agent, headers_json,
|
||||
currency, category_id)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
`, [
|
||||
config.name, slug, config.base_url, config.search_url,
|
||||
config.enabled === false ? 0 : 1,
|
||||
config.render_js ? 1 : 0,
|
||||
config.selectors.container, config.selectors.name,
|
||||
config.selectors.price, config.selectors.link,
|
||||
config.selectors.image || null,
|
||||
@@ -161,6 +164,7 @@ function storeToConfig(store: any, categoryName?: string): StoreFileConfig {
|
||||
|
||||
if (store.sel_image) config.selectors.image = store.sel_image;
|
||||
if (store.enabled === 0) config.enabled = false;
|
||||
if (store.render_js) config.render_js = true;
|
||||
if (categoryName) config.category = categoryName;
|
||||
if (store.currency && store.currency !== 'EUR') config.currency = store.currency;
|
||||
if (store.rate_limit && store.rate_limit !== 2) config.rate_limit = store.rate_limit;
|
||||
|
||||
14
stores/hgspot.yaml
Normal file
14
stores/hgspot.yaml
Normal file
@@ -0,0 +1,14 @@
|
||||
name: HG Spot
|
||||
base_url: https://www.hgspot.hr
|
||||
search_url: https://www.hgspot.hr/pretraga?q={query}&page=0
|
||||
category: Electronics
|
||||
currency: EUR
|
||||
render_js: true
|
||||
rate_limit: 1
|
||||
|
||||
selectors:
|
||||
container: "section.product-container-comm"
|
||||
name: "h2.product-name"
|
||||
price: "div.product-price"
|
||||
link: "h2.product-name a"
|
||||
image: ".product-image img"
|
||||
Reference in New Issue
Block a user