Initial commit: Price Hunter — self-hosted price comparison engine
Complete application scaffolding with:
- Backend: Node.js + Fastify + sql.js (SQLite)
- Frontend: SvelteKit + Tailwind CSS
- Scraper engine with parallel fan-out, rate limiting, cheerio-based parsing
- Store management with CSS selector config and per-store test pages
- Docker setup for single-command deployment

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
120
src/server/scraper/engine.ts
Normal file
120
src/server/scraper/engine.ts
Normal file
@@ -0,0 +1,120 @@
|
||||
import pLimit from 'p-limit';
|
||||
import type { Store } from '../models/store.js';
|
||||
import { getEnabledStores, getStoresByCategory, getStoresByGroup, getStoresByIds } from '../models/store.js';
|
||||
import { logScrape } from '../models/scrape-log.js';
|
||||
import { scrapeStore } from './http-scraper.js';
|
||||
import { normalizeResult, type Product } from './result-parser.js';
|
||||
import { getLimiter } from './rate-limiter.js';
|
||||
|
||||
/** Upper bound on the number of store scrapes in flight at once during a search. */
const MAX_CONCURRENCY = 5;

/** Overall time budget for a single search, in milliseconds (60 seconds). */
const SEARCH_TIMEOUT = 60 * 1000;
|
||||
|
||||
/**
 * Input to {@link search}: the text to look for plus optional filters that
 * narrow which stores are queried.
 *
 * Only one filter is applied per search, in this precedence order:
 * `storeIds`, then `groupId`, then `categoryId`. When none is set, every
 * enabled store is searched.
 */
export interface SearchOptions {
  /** Search text; it is URL-encoded before being placed into each store's search URL. */
  query: string;
  /** Explicit store ids to search. An empty array is treated the same as omitting it. */
  storeIds?: Array<number>;
  /** Search the stores of this group; consulted only when `storeIds` is absent or empty. */
  groupId?: number;
  /** Search the stores of this category; consulted only when neither `storeIds` nor `groupId` applies. */
  categoryId?: number;
}
|
||||
|
||||
/**
 * Outcome of {@link search}: the merged product list plus bookkeeping about
 * the run.
 */
export interface SearchResult {
  /** Products from all searched stores, ordered by ascending price with unpriced items last. */
  results: Product[];
  /** Summary of the search run. */
  meta: {
    /** The query text exactly as it was passed in. */
    query: string;
    /** Wall-clock time spent in the search, in milliseconds. */
    duration: number;
    /** Number of stores selected for this search. */
    storeCount: number;
    /** Length of `results`. */
    totalResults: number;
    /**
     * Failures encountered during the search. Store-level failures carry the
     * store's id and name; a search-wide timeout is reported with
     * `storeId: 0` and `storeName: 'System'`.
     */
    errors: { storeId: number; storeName: string; error: string }[];
  };
}
|
||||
|
||||
/**
 * Runs `query` against a set of stores in parallel and returns the merged
 * results sorted by price.
 *
 * Store selection precedence: explicit `storeIds`, then `groupId`, then
 * `categoryId`; when none of them is set, every enabled store is searched.
 *
 * Scrape failures do not reject the returned promise: each failing store is
 * logged through `logScrape` and listed in `meta.errors`. When the whole
 * search exceeds `SEARCH_TIMEOUT`, the products of the stores that already
 * finished are still returned, together with a `System` entry
 * ("Search timed out") in `meta.errors`.
 *
 * @param options - The query text plus optional store filters.
 * @returns The products sorted by ascending price (unpriced items last) and
 *   metadata describing the run.
 */
export async function search(options: SearchOptions): Promise<SearchResult> {
  const startTime = Date.now();
  const { query } = options;

  // Determine which stores to scrape
  let stores: Store[];
  if (options.storeIds?.length) {
    stores = getStoresByIds(options.storeIds);
  } else if (options.groupId) {
    stores = getStoresByGroup(options.groupId);
  } else if (options.categoryId) {
    stores = getStoresByCategory(options.categoryId);
  } else {
    stores = getEnabledStores();
  }

  if (stores.length === 0) {
    return {
      results: [],
      meta: { query, duration: Date.now() - startTime, storeCount: 0, totalResults: 0, errors: [] },
    };
  }

  const limit = pLimit(MAX_CONCURRENCY);
  const errors: SearchResult['meta']['errors'] = [];

  // One slot per store, filled as soon as that store's scrape completes. This
  // keeps partial results available if the overall timeout fires, while still
  // merging in store order so the stable sort below orders ties predictably.
  const productsByStore: Product[][] = stores.map(() => []);

  // Set once the search result is being assembled. Scrapes that are still
  // running keep going in the background (and are still logged), but must not
  // mutate data that has already been handed back to the caller.
  let finished = false;

  const scrapePromises = stores.map((store, index) =>
    limit(async () => {
      const storeStart = Date.now();

      try {
        // URL building and limiter lookup live inside the try so that a bad
        // store configuration is reported against that store instead of
        // aborting the whole search.
        const searchUrl = store.search_url.replace('{query}', encodeURIComponent(query));
        // NOTE(review): the arguments look like (maxConcurrent, minTime in ms) — confirm against rate-limiter.ts.
        const rateLimiter = getLimiter(store.id, 1, Math.floor(store.rate_window / store.rate_limit));

        const result = await rateLimiter.schedule(() => scrapeStore(store, searchUrl));
        const duration = Date.now() - storeStart;

        const products = result.items.map((item) =>
          normalizeResult(item, store.id, store.name, store.base_url, store.currency)
        );

        logScrape(store.id, query, true, products.length, duration);
        if (!finished) {
          productsByStore[index] = products;
        }
      } catch (err: unknown) {
        const duration = Date.now() - storeStart;
        const errorMessage = err instanceof Error ? err.message : String(err);
        logScrape(store.id, query, false, 0, duration, errorMessage);
        if (!finished) {
          errors.push({ storeId: store.id, storeName: store.name, error: errorMessage });
        }
      }
    })
  );

  // Overall timeout. It resolves with a sentinel instead of rejecting, so a
  // real timeout can be told apart from an unexpected failure.
  const TIMED_OUT = Symbol('search-timeout');
  let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
  const timeoutPromise = new Promise<typeof TIMED_OUT>((resolve) => {
    timeoutHandle = setTimeout(() => resolve(TIMED_OUT), SEARCH_TIMEOUT);
  });

  try {
    const outcome = await Promise.race([Promise.all(scrapePromises), timeoutPromise]);
    if (outcome === TIMED_OUT) {
      // Timeout — the slots filled so far are still returned below.
      errors.push({ storeId: 0, storeName: 'System', error: 'Search timed out' });
    }
  } catch (err: unknown) {
    // Only reachable if the per-store error handling itself failed (for
    // example, logging threw). Report the real cause rather than a timeout.
    const errorMessage = err instanceof Error ? err.message : String(err);
    errors.push({ storeId: 0, storeName: 'System', error: errorMessage });
  } finally {
    finished = true;
    // Do not leave the timer pending after the search has settled.
    clearTimeout(timeoutHandle);
  }

  const allProducts = productsByStore.flat();

  // Sort by price ascending, nulls last
  allProducts.sort((a, b) => {
    if (a.price === null && b.price === null) return 0;
    if (a.price === null) return 1;
    if (b.price === null) return -1;
    return a.price - b.price;
  });

  return {
    results: allProducts,
    meta: {
      query,
      duration: Date.now() - startTime,
      storeCount: stores.length,
      totalResults: allProducts.length,
      errors,
    },
  };
}
|
||||
Reference in New Issue
Block a user