Add Puppeteer browser scraping and HG Spot store config
- Add browser-scraper.ts using Puppeteer for JS-heavy stores - Add render_js flag to store model, migration, YAML sync, and UI - Scraper engine auto-selects cheerio vs Puppeteer based on flag - Store forms include JS rendering toggle in Advanced section - Create first store config: HG Spot (Croatian electronics retailer) - Update Dockerfile with Chromium for production Puppeteer support Tested: HG Spot returns 15 products per page with correct names, prices (EUR), links, and images using headless browser rendering. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
115
src/server/scraper/browser-scraper.ts
Normal file
115
src/server/scraper/browser-scraper.ts
Normal file
@@ -0,0 +1,115 @@
|
||||
import puppeteer, { type Browser } from 'puppeteer';
|
||||
import type { Store } from '../models/store.js';
|
||||
import type { ScrapedItem } from './result-parser.js';
|
||||
|
||||
/**
 * Shared browser instance for this module. It is assigned by getBrowser()
 * when a launch happens and reset to null by closeBrowser().
 */
let browser: Browser | null = null;

/** Milliseconds to wait for the store's container selector to appear. */
const PAGE_TIMEOUT = 30_000;

/** Milliseconds allowed for the page navigation itself. */
const NAVIGATION_TIMEOUT = 20_000;
|
||||
|
||||
/**
 * Return the module's shared browser, launching a new headless instance when
 * none is cached or the cached one is no longer connected.
 *
 * @returns The connected browser instance that was reused or just launched.
 */
async function getBrowser(): Promise<Browser> {
  // Reuse the cached instance while it is still connected.
  if (browser?.connected) {
    return browser;
  }

  // NOTE(review): these flags look chosen for running Chromium inside a
  // container (sandbox and /dev/shm restrictions) — confirm against the
  // deployment image before changing them.
  const launchArgs = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--disable-extensions',
    '--no-first-run',
  ];

  const launched = await puppeteer.launch({ headless: true, args: launchArgs });
  browser = launched;
  return launched;
}
|
||||
|
||||
/** Outcome of a single browser-rendered scrape of a store search page. */
export interface BrowserScrapeResult {
  /** Products extracted from the rendered DOM; empty when the container selector never appeared. */
  items: ScrapedItem[];

  /** Serialized HTML of the page as returned by `page.content()` after rendering. */
  html: string;

  /** HTTP status of the navigation response, or 200 when navigation produced no response object. */
  statusCode: number;
}
|
||||
|
||||
/**
 * Scrape a store's search results by rendering the page in the shared
 * headless browser and reading product data out of the live DOM.
 *
 * Steps: open a new page, apply the store's optional user agent and extra
 * headers, navigate to `searchUrl`, wait for the store's container selector,
 * then extract name / price text / link / image for every container element.
 *
 * @param store - Store configuration. Reads `user_agent`, `headers_json`,
 *   `sel_container`, `sel_name`, `sel_price`, `sel_link` and `sel_image`.
 * @param searchUrl - URL of the search results page to load.
 * @returns The extracted items, the rendered page HTML and the navigation
 *   status code. `items` is empty when the container selector does not show
 *   up within `PAGE_TIMEOUT`.
 * @throws Errors from acquiring the browser, opening the page, or navigating
 *   (for example a navigation timeout) propagate to the caller.
 */
export async function scrapeStoreWithBrowser(store: Store, searchUrl: string): Promise<BrowserScrapeResult> {
  const b = await getBrowser();
  const page = await b.newPage();

  try {
    // Set user agent if configured
    if (store.user_agent) {
      await page.setUserAgent(store.user_agent);
    }

    // Set extra headers if configured. Both malformed JSON and header maps
    // that Puppeteer rejects are ignored on purpose: a bad header config
    // should not prevent the scrape from running.
    if (store.headers_json) {
      try {
        const headers: Record<string, string> = JSON.parse(store.headers_json);
        await page.setExtraHTTPHeaders(headers);
      } catch { /* ignore invalid headers */ }
    }

    // Navigate and let network activity settle so client-side rendering has
    // a chance to run before the DOM is inspected.
    const response = await page.goto(searchUrl, {
      waitUntil: 'networkidle2',
      timeout: NAVIGATION_TIMEOUT,
    });

    // goto() can resolve without a response object; treat that as 200.
    const statusCode = response?.status() ?? 200;

    // Wait for the product container to appear
    try {
      await page.waitForSelector(store.sel_container, { timeout: PAGE_TIMEOUT });
    } catch {
      // Container might not exist if no results — return empty
      const html = await page.content();
      return { items: [], html, statusCode };
    }

    // Small extra wait for any remaining renders
    await new Promise((r) => setTimeout(r, 500));

    // Extract product data from the rendered DOM. The callback runs inside
    // the page, so it must be self-contained: everything it needs is passed
    // in through the serializable `selectors` argument.
    const items = await page.evaluate((selectors) => {
      const containers = document.querySelectorAll(selectors.container);
      const results: Array<{ name: string; priceText: string; link: string; image: string | null }> = [];

      containers.forEach((el) => {
        const nameEl = el.querySelector(selectors.name);
        const priceEl = el.querySelector(selectors.price);
        const linkEl = el.querySelector(selectors.link) as HTMLAnchorElement | null;
        const imageEl = selectors.image ? el.querySelector(selectors.image) as HTMLImageElement | null : null;

        const name = nameEl?.textContent?.trim() || '';
        const priceText = priceEl?.textContent?.trim() || '';
        // Raw attribute value: the href is returned exactly as written in the markup.
        const link = linkEl?.getAttribute('href') || '';
        // Fall back to data-src when src is missing or empty.
        const image = imageEl?.getAttribute('src') || imageEl?.getAttribute('data-src') || null;

        // Entries without both a name and a price text are skipped.
        if (name && priceText) {
          results.push({ name, priceText, link, image });
        }
      });

      return results;
    }, {
      container: store.sel_container,
      name: store.sel_name,
      price: store.sel_price,
      link: store.sel_link,
      image: store.sel_image || null,
    });

    const html = await page.content();

    return { items, html, statusCode };
  } finally {
    // Closing can reject when the browser crashed or disconnected mid-scrape.
    // An exception escaping `finally` would replace the real scrape error, or
    // throw away a result that was already computed, so a failed close is
    // deliberately ignored here.
    try {
      await page.close();
    } catch { /* page or browser already gone */ }
  }
}
|
||||
|
||||
/**
 * Close the shared browser if one is currently held and clear the cached
 * reference. Does nothing when no browser is held.
 */
export async function closeBrowser(): Promise<void> {
  if (!browser) {
    return;
  }

  await browser.close();
  browser = null;
}
|
||||
Reference in New Issue
Block a user