Add step-by-step logging to browser scraper and skip HTML capture
Logs each phase (launch, navigate, wait for selector, extract, close) so we can diagnose where Puppeteer gets stuck. Also skips the expensive page.content() call, since the full HTML is only needed for the test endpoint, not for search.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -7,9 +7,14 @@ let browser: Browser | null = null;
|
|||||||
const SELECTOR_TIMEOUT = 15_000;
|
const SELECTOR_TIMEOUT = 15_000;
|
||||||
const NAVIGATION_TIMEOUT = 15_000;
|
const NAVIGATION_TIMEOUT = 15_000;
|
||||||
|
|
||||||
|
function log(msg: string) {
|
||||||
|
console.log(`[browser-scraper] ${msg}`);
|
||||||
|
}
|
||||||
|
|
||||||
async function getBrowser(): Promise<Browser> {
|
async function getBrowser(): Promise<Browser> {
|
||||||
if (browser && browser.connected) return browser;
|
if (browser && browser.connected) return browser;
|
||||||
|
|
||||||
|
log('Launching Chromium...');
|
||||||
const puppeteer = await import('puppeteer');
|
const puppeteer = await import('puppeteer');
|
||||||
browser = await puppeteer.default.launch({
|
browser = await puppeteer.default.launch({
|
||||||
headless: true,
|
headless: true,
|
||||||
@@ -22,6 +27,7 @@ async function getBrowser(): Promise<Browser> {
|
|||||||
'--no-first-run',
|
'--no-first-run',
|
||||||
],
|
],
|
||||||
});
|
});
|
||||||
|
log('Chromium launched');
|
||||||
|
|
||||||
return browser;
|
return browser;
|
||||||
}
|
}
|
||||||
@@ -33,44 +39,44 @@ export interface BrowserScrapeResult {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function scrapeStoreWithBrowser(store: Store, searchUrl: string): Promise<BrowserScrapeResult> {
|
export async function scrapeStoreWithBrowser(store: Store, searchUrl: string): Promise<BrowserScrapeResult> {
|
||||||
|
log(`Scraping ${store.name}: ${searchUrl}`);
|
||||||
const b = await getBrowser();
|
const b = await getBrowser();
|
||||||
const page = await b.newPage();
|
const page = await b.newPage();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Set user agent if configured
|
|
||||||
if (store.user_agent) {
|
if (store.user_agent) {
|
||||||
await page.setUserAgent(store.user_agent);
|
await page.setUserAgent(store.user_agent);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set extra headers if configured
|
|
||||||
if (store.headers_json) {
|
if (store.headers_json) {
|
||||||
try {
|
try {
|
||||||
const headers = JSON.parse(store.headers_json);
|
const headers = JSON.parse(store.headers_json);
|
||||||
await page.setExtraHTTPHeaders(headers);
|
await page.setExtraHTTPHeaders(headers);
|
||||||
} catch { /* ignore invalid headers */ }
|
} catch { /* ignore */ }
|
||||||
}
|
}
|
||||||
|
|
||||||
// Navigate to the page
|
log(`${store.name}: navigating...`);
|
||||||
const response = await page.goto(searchUrl, {
|
const response = await page.goto(searchUrl, {
|
||||||
waitUntil: 'domcontentloaded',
|
waitUntil: 'domcontentloaded',
|
||||||
timeout: NAVIGATION_TIMEOUT,
|
timeout: NAVIGATION_TIMEOUT,
|
||||||
});
|
});
|
||||||
|
|
||||||
const statusCode = response?.status() ?? 200;
|
const statusCode = response?.status() ?? 200;
|
||||||
|
log(`${store.name}: DOM loaded (status ${statusCode})`);
|
||||||
|
|
||||||
// Wait for the product container to appear
|
log(`${store.name}: waiting for selector "${store.sel_container}"...`);
|
||||||
try {
|
try {
|
||||||
await page.waitForSelector(store.sel_container, { timeout: SELECTOR_TIMEOUT });
|
await page.waitForSelector(store.sel_container, { timeout: SELECTOR_TIMEOUT });
|
||||||
} catch {
|
} catch {
|
||||||
// Container might not exist if no results — return empty
|
log(`${store.name}: selector not found, returning empty`);
|
||||||
const html = await page.content();
|
return { items: [], html: '', statusCode };
|
||||||
return { items: [], html, statusCode };
|
|
||||||
}
|
}
|
||||||
|
log(`${store.name}: selector found`);
|
||||||
|
|
||||||
// Small extra wait for any remaining renders
|
// Brief wait for remaining renders
|
||||||
await new Promise((r) => setTimeout(r, 500));
|
await new Promise((r) => setTimeout(r, 300));
|
||||||
|
|
||||||
// Extract product data from the rendered DOM
|
log(`${store.name}: extracting products...`);
|
||||||
const items = await page.evaluate((selectors) => {
|
const items = await page.evaluate((selectors) => {
|
||||||
const containers = document.querySelectorAll(selectors.container);
|
const containers = document.querySelectorAll(selectors.container);
|
||||||
const results: Array<{ name: string; priceText: string; link: string; image: string | null }> = [];
|
const results: Array<{ name: string; priceText: string; link: string; image: string | null }> = [];
|
||||||
@@ -100,11 +106,13 @@ export async function scrapeStoreWithBrowser(store: Store, searchUrl: string): P
|
|||||||
image: store.sel_image || null,
|
image: store.sel_image || null,
|
||||||
});
|
});
|
||||||
|
|
||||||
const html = await page.content();
|
log(`${store.name}: found ${items.length} products`);
|
||||||
|
|
||||||
return { items, html, statusCode };
|
return { items, html: '', statusCode };
|
||||||
} finally {
|
} finally {
|
||||||
|
log(`${store.name}: closing page`);
|
||||||
await page.close();
|
await page.close();
|
||||||
|
log(`${store.name}: done`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user