From 72b463714dc02eb7db1178fb49408262d047b3d4 Mon Sep 17 00:00:00 2001 From: nguyentrungthat <80239428+nguentrungthat@users.noreply.github.com> Date: Wed, 1 Apr 2026 12:03:53 +0700 Subject: [PATCH] Update scrap --- package-lock.json | 7 + package.json | 1 + scrap_new_items.js | 375 ++++++++++++++++++++++----------------------- 3 files changed, 193 insertions(+), 190 deletions(-) diff --git a/package-lock.json b/package-lock.json index 7ac6755..c3749c7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,6 +13,7 @@ "cheerio": "^1.1.0", "dayjs": "^1.11.18", "dotenv": "^17.2.3", + "fs": "^0.0.1-security", "googleapis": "^166.0.0", "luxon": "^3.7.2", "mysql2": "^3.15.2", @@ -1507,6 +1508,12 @@ "node": ">=0.8" } }, + "node_modules/fs": { + "version": "0.0.1-security", + "resolved": "https://registry.npmjs.org/fs/-/fs-0.0.1-security.tgz", + "integrity": "sha512-3XY9e1pP0CVEUCdj5BmfIZxRBTSDycnbqhIOGec9QYtmVH2fbLpj86CFWkrNOkt/Fvty4KZG5lTglL9j/gJ87w==", + "license": "ISC" + }, "node_modules/fs-constants": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", diff --git a/package.json b/package.json index b87616c..9308afc 100644 --- a/package.json +++ b/package.json @@ -13,6 +13,7 @@ "cheerio": "^1.1.0", "dayjs": "^1.11.18", "dotenv": "^17.2.3", + "fs": "^0.0.1-security", "googleapis": "^166.0.0", "luxon": "^3.7.2", "mysql2": "^3.15.2", diff --git a/scrap_new_items.js b/scrap_new_items.js index d86ff19..097f71d 100644 --- a/scrap_new_items.js +++ b/scrap_new_items.js @@ -3,17 +3,19 @@ import puppeteer from "puppeteer"; import axios from "axios"; import mysql from "mysql2/promise"; import { DateTime } from "luxon"; +import path from "node:path"; +import fs from "node:fs"; -// Define function promise waiting for a given time -async function wait(ms) { - return new Promise((resolve) => setTimeout(resolve, ms)); -} +const LOG_FILE = path.join(process.cwd(), "newitems.log"); +const MAX_LOG_SIZE = 20 * 1024 * 1024; // 20MB + +// --- HELPERS --- +const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); async function safeGetContent(page) { try { return await page.content(); } catch (err) { - // Nếu navigation xảy ra → chờ page ổn định lại rồi đọc tiếp if (err.message.includes("Execution context was destroyed")) { await page.waitForNetworkIdle({ idleTime: 1000 }).catch(() => {}); return await page.content(); @@ -24,71 +26,107 @@ async function safeGetContent(page) { function parseAndConvertToSydney(dateString, fromTimezone, formatDate) { const currentYear = new Date().getFullYear(); - - // Detect format: "Dec-9 23:05" or "9-Dec 23:05" - let format = "MMM-d HH:mm"; - if (/^\d/.test(dateString[0])) { - format = "d-MMM HH:mm"; - } - + let format = /^\d/.test(dateString[0]) ? "d-MMM HH:mm" : "MMM-d HH:mm"; const fullDateString = `${dateString} ${currentYear}`; - const dt = DateTime.fromFormat(fullDateString, `${format} yyyy`, { zone: fromTimezone }); - if (!dt.isValid) { - // console.log("❌ Invalid Luxon parse:", dt.invalidReason, fullDateString, format); - return null; - } - + if (!dt.isValid) return null; const systemTz = Intl.DateTimeFormat().resolvedOptions().timeZone; return dt.setZone(systemTz).toFormat(formatDate); } -async function scrapeWithPuppeteer(store) { - // console.log(`Fetching with scrapeWithPuppeteer`); - try { - const browser = await puppeteer.launch({ - headless: true, - args: ["--no-sandbox", "--disable-setuid-sandbox"], - }); - const page = await browser.newPage(); - await page.goto(store.data, { waitUntil: "networkidle2" }); +// --- LOGIC LẤY CONFIG TỪ DB --- +async function getConfigs(pool) { + let configs = []; + const [keywords] = await pool.execute("SELECT id, name FROM hot_item_keyword WHERE level = 1"); + const [keywordsCustom] = await pool.execute("SELECT id, name, url FROM newitems_config"); + const [markets] = await pool.execute("SELECT id, url, market_code, shipping_postcode, timezone, country_name FROM ebay_site WHERE flag = 1"); + + const keyWord = keywords?.map((el) => el.name)?.join("+"); + + markets.forEach((m) => { + const postCode = m.shipping_postcode ? `&_stpos=${m.shipping_postcode}` : ""; + // URL FixedPrice + configs.push({ + data: `${m.url}/sch/i.html?_from=R40&_nkw=${keyWord}&_sacat=0&_sop=10${postCode}`, + type_custom: "cisco", + config_id: null, + from_site: m.market_code, + timezone: m.timezone, + listingType: "FixedPrice", + }); + // URL Auction + configs.push({ + data: `${m.url}/sch/i.html?_from=R40&_nkw=${keyWord}&_sacat=0&_sop=1${postCode}`, + type_custom: "cisco", + config_id: null, + from_site: m.market_code, + timezone: m.timezone, + listingType: "Auction", + }); + }); + + if (keywordsCustom.length > 0) { + keywordsCustom.forEach((k) => { + const matchedMarket = findMarketDataFromSearchUrl(k.url, markets); + if (matchedMarket) { + configs.push({ + data: k.url, + type_custom: "custom", + config_id: k.id, + from_site: matchedMarket.market_code, + timezone: matchedMarket.timezone, + listingType: k.url?.includes("_sop=10") ? "FixedPrice" : "Auction", + }); + } + }); + } + return configs; +} + +function findMarketDataFromSearchUrl(searchUrl, marketDatas) { + if (!searchUrl?.trim()) return null; + try { + const searchHost = new URL(searchUrl.trim()).host; + for (const data of marketDatas) { + if (searchUrl.includes(data.url)) return data; + const marketHost = new URL(data.url).host; + if (searchHost === marketHost) return data; + } + } catch (e) { + return null; + } + return null; +} + +// --- CORE SCRAPER --- +async function scrapeWithPuppeteer(browser, store) { + let page = null; + try { + page = await browser.newPage(); + + // Tối ưu RAM: Chặn các request không cần thiết + await page.setRequestInterception(true); + page.on("request", (req) => { + if (["image", "stylesheet", "font", "media"].includes(req.resourceType())) req.abort(); + else req.continue(); + }); + + await page.goto(store.data, { waitUntil: "networkidle2", timeout: 60000 }); - // ----- RETRY HANDLE (Cloudflare / Slow load) ----- let retries = 0; while (retries < 10) { - try { - // console.log(`Retry ${retries + 1}`); - const html = await safeGetContent(page); - - // Detect Cloudflare or other blocking messages - if (html.includes("Checking your browser")) { - await wait(2000); - retries++; - continue; - } - - const hasItems = await page.$("li.s-card--horizontal"); - if (hasItems) break; - - await wait(2000); - retries++; - } catch (err) { + const html = await safeGetContent(page); + if (html.includes("Checking your browser")) { await wait(2000); retries++; + continue; } + if (await page.$("li.s-card--horizontal")) break; + await wait(2000); + retries++; } - // Wait for cards or detect Cloudflare - const html = await safeGetContent(page); - const needBrowserCheck = html.includes("Checking your browser"); - if (needBrowserCheck) { - await page.waitForSelector("li.s-card--horizontal", { timeout: 15000 }).catch(() => null); - } - - // ================================================ - // MAIN SCRAPING LOGIC (FULL CONVERT FROM PHP) - // ================================================ const items = await page?.$$eval( "li.s-card--horizontal", (nodes, store) => { @@ -117,11 +155,12 @@ async function scrapeWithPuppeteer(store) { nodes.forEach((node) => { const payload = {}; // ---------------- LINK + ID ---------------- - const linkEl = node.querySelector("div.su-media__image a"); + const linkEl = node.querySelector("div.su-image a"); // if (!linkEl) return; - - payload.link_detail = linkEl?.href || ""; - const idMatch = linkEl.href.match(/\/itm\/(\d+)/); + const linkDetail = linkEl && linkEl?.href ? linkEl?.href : ""; + if (!linkDetail) return; + payload.link_detail = linkDetail; + const idMatch = linkDetail.match(/\/itm\/(\d+)/); if (!idMatch) return; payload.id = idMatch[1]; @@ -271,164 +310,120 @@ async function scrapeWithPuppeteer(store) { store ); - const results = items.map((item) => { + return items.map((item) => { if (!item.time) return item; - - // PROCESS START TIME + END TIME const timeConvert = parseAndConvertToSydney(item.time, store.timezone, "yyyy/MM/dd HH:mm"); - const timestamp = new Date(timeConvert).getTime() / 1000; - - item.timeConvert = timeConvert; - item.start_time = timestamp; - item.end_time = timestamp + 2592000; - item.start_time_string = timeConvert; - // delete item.time; - return { - ...item, - }; + const ts = new Date(timeConvert).getTime() / 1000; + return { ...item, timeConvert, start_time: ts, end_time: item.end_time || ts + 2592000, start_time_string: timeConvert }; }); - - await browser.close(); - return results; } catch (err) { - console.log("Error scrapeWithPuppeteer:", err); + console.error(`Lỗi tại ${store.data}:`, err.message); return []; + } finally { + if (page) await page.close(); } } -function findMarketDataFromSearchUrl(searchUrl, marketDatas) { - if (!searchUrl || !searchUrl.trim()) { - return null; - } - - searchUrl = searchUrl.trim(); - - let searchHost; +function trimLogFileIfNeeded() { try { - searchHost = new URL(searchUrl).host; - } catch (e) { - searchHost = null; + if (!fs.existsSync(LOG_FILE)) return; + + const stats = fs.statSync(LOG_FILE); + if (stats.size > MAX_LOG_SIZE) { + const data = fs.readFileSync(LOG_FILE, "utf8"); + const lines = data.split("\n"); + // Giữ lại 70% dòng mới nhất + const keepLines = Math.floor(lines.length * 0.7); + const trimmed = lines.slice(-keepLines).join("\n"); + + fs.writeFileSync(LOG_FILE, trimmed + "\n", "utf8"); + console.log(`\n[SYSTEM] Log file trimmed (Size: ${(stats.size / 1024 / 1024).toFixed(2)}MB > 20MB)`); + } + } catch (err) { + console.error("Error trimming log file:", err); } - - for (const data of marketDatas) { - if (!data.url) continue; - - // 1) direct substring match - if (searchUrl.includes(data.url)) { - return data; - } - - // 2) fallback: host comparison - let marketHost = null; - try { - marketHost = new URL(data.url).host; - } catch (e) { - marketHost = null; - } - - if (searchHost && marketHost && searchHost === marketHost) { - return data; - } - } - - return null; } +// --- MAIN PROCESS --- async function main() { - const db = await mysql.createConnection({ - host: process.env.MYSQL_HOST, // e.g. '127.0.0.1' or remote IP + const pool = mysql.createPool({ + host: process.env.MYSQL_HOST, user: process.env.MYSQL_USER, password: process.env.MYSQL_PASSWORD, database: process.env.MYSQL_DB_NAME, + waitForConnections: true, + connectionLimit: 5, }); - // console.log("✅ Connected to MySQL"); - - // const errors = []; - let configs = []; - const [keywords] = await db.execute("SELECT id, name FROM hot_item_keyword WHERE level = 1"); - const [keywordsCustom] = await db.execute("SELECT id, name, url FROM newitems_config"); - const [markets] = await db.execute("SELECT id, url, market_code, shipping_postcode, timezone, country_name FROM ebay_site WHERE flag = 1"); - const keyWord = keywords?.map((el) => el.name)?.join("+"); - markets.forEach((m) => { - const postCode = m.shipping_postcode ? `&_stpos=${m.shipping_postcode}` : ""; - - // URL FixedPrice (_sop=10) - configs.push({ - data: `${m.url}/sch/i.html?_from=R40&_nkw=${keyWord}&_sacat=0&_sop=10${postCode}`, - type_custom: "cisco", - config_id: null, - from_site: m.market_code, - timezone: m.timezone, - listingType: "FixedPrice", - }); - - // URL Auction (_sop=1) - configs.push({ - data: `${m.url}/sch/i.html?_from=R40&_nkw=${keyWord}&_sacat=0&_sop=1${postCode}`, - type_custom: "cisco", - config_id: null, - from_site: m.market_code, - timezone: m.timezone, - listingType: "Auction", - }); + const browser = await puppeteer.launch({ + headless: true, + args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage", "--single-process"], }); - if (keywordsCustom.length > 0) { - keywordsCustom.forEach((k) => { - const matchedMarket = findMarketDataFromSearchUrl(k.url, markets); - if (matchedMarket) { - configs.push({ - data: k.url, - type_custom: "custom", - config_id: k.id, - from_site: matchedMarket.market_code, - timezone: matchedMarket.timezone, - listingType: k.url?.includes("_sop=10") ? "FixedPrice" : "Auction", - }); - } - }); - } - // console.log(`Total configs to process: ${configs.length}`); - for (const store of configs) { - // console.log(`Processing ${store.data}`); - // let items = await scrapeWithCheerio(store); - let items = await scrapeWithPuppeteer(store); - for (const item of items) { - // 2️⃣ Check if record exists - const [rows] = await db.execute("SELECT id FROM items WHERE id = ?", [item.id]); - if (rows.length > 0 || item.id === "123456") continue; + console.log("🚀 Bắt đầu phiên làm việc mới..."); - // 3️⃣ Insert new record - const title = (item.title || "").replace("Opens in a new window or tab", "").trim(); - console.log(`Processing ${store.data}`); - console.log({ ...item, title }); - await axios - .post( - process.env.API_DISTI_HOST + "/api/items/insert", - { ...item, title }, - { - headers: { - "x-key": "CanTho#1", - }, + let runCount = 0; + const MAX_RUNS = 60; // Chạy 60 chu kỳ (~1 tiếng) + const ONE_HOUR_MS = 60 * 60 * 1000; + const SESSION_START = Date.now(); // Thời điểm bắt đầu chạy script + const MAX_TIME_EXTENSIONS = SESSION_START + ONE_HOUR_MS; // Thời điểm phải kết thúc + + while (runCount < MAX_RUNS && Date.now() < MAX_TIME_EXTENSIONS) { + runCount++; + const startTime = Date.now(); + console.log(`--- Chu kỳ ${runCount}/${MAX_RUNS} --- ${MAX_TIME_EXTENSIONS - Date.now()} ---`); + + try { + const configs = await getConfigs(pool); + for (const store of configs) { + if (Date.now() > MAX_TIME_EXTENSIONS) break; + const items = await scrapeWithPuppeteer(browser, store); + for (const item of items) { + const [rows] = await pool.execute("SELECT id FROM items WHERE id = ?", [item.id]); + if (rows.length > 0) continue; + const title = (item.title || "").replace("Opens in a new window or tab", "").trim(); + console.log(`Processing ${store.data}`); + console.log({ ...item, title }); + try { + const res = await axios + .post( + `${process.env.API_DISTI_HOST}/api/items/insert`, + { ...item, title }, + { + headers: { "x-key": "CanTho#1" }, + } + ) + .then((res) => { + console.log(res.data, item.id, item.timeConvert, item.time); + }) + .catch((err) => { + console.log(err); + }); + } catch (e) { + console.error(`❌ Lỗi API ID ${item.id}:`, e.message); } - ) - .then((res) => { - console.log(res.data, item.id, item.timeConvert, item.time); - }) - .catch((err) => { - console.log(err); - }); + } + } + } catch (err) { + console.error("Lỗi chu kỳ:", err.message); } - } - // console.log("✅ Done scraping."); - await db.end(); - // console.log("🔌 MySQL connection closed"); + const duration = Date.now() - startTime; + const delay = Math.max(0, 60000 - duration); + console.log(`Hoàn thành chu kỳ trong ${duration / 1000}s. Nghỉ ${delay / 1000}s.`); + + if (runCount < MAX_RUNS && Date.now() + delay < MAX_TIME_EXTENSIONS) await wait(delay); + } + + console.log("🏁 Đã chạy đủ 60 lần. Đang làm mới tiến trình..."); + await browser.close(); + await pool.end(); + // Thực hiện dọn log một lần cuối trước khi thoát process hoàn toàn + trimLogFileIfNeeded(); process.exit(0); } main().catch((err) => { - console.error(err); + console.error("FATAL ERROR:", err); process.exit(1); });