Update scrap

This commit is contained in:
nguyentrungthat 2026-04-01 12:03:53 +07:00
parent 3b0f4992a2
commit 72b463714d
3 changed files with 193 additions and 190 deletions

7
package-lock.json generated
View File

@ -13,6 +13,7 @@
"cheerio": "^1.1.0", "cheerio": "^1.1.0",
"dayjs": "^1.11.18", "dayjs": "^1.11.18",
"dotenv": "^17.2.3", "dotenv": "^17.2.3",
"fs": "^0.0.1-security",
"googleapis": "^166.0.0", "googleapis": "^166.0.0",
"luxon": "^3.7.2", "luxon": "^3.7.2",
"mysql2": "^3.15.2", "mysql2": "^3.15.2",
@ -1507,6 +1508,12 @@
"node": ">=0.8" "node": ">=0.8"
} }
}, },
"node_modules/fs": {
"version": "0.0.1-security",
"resolved": "https://registry.npmjs.org/fs/-/fs-0.0.1-security.tgz",
"integrity": "sha512-3XY9e1pP0CVEUCdj5BmfIZxRBTSDycnbqhIOGec9QYtmVH2fbLpj86CFWkrNOkt/Fvty4KZG5lTglL9j/gJ87w==",
"license": "ISC"
},
"node_modules/fs-constants": { "node_modules/fs-constants": {
"version": "1.0.0", "version": "1.0.0",
"resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz",

View File

@ -13,6 +13,7 @@
"cheerio": "^1.1.0", "cheerio": "^1.1.0",
"dayjs": "^1.11.18", "dayjs": "^1.11.18",
"dotenv": "^17.2.3", "dotenv": "^17.2.3",
"fs": "^0.0.1-security",
"googleapis": "^166.0.0", "googleapis": "^166.0.0",
"luxon": "^3.7.2", "luxon": "^3.7.2",
"mysql2": "^3.15.2", "mysql2": "^3.15.2",

View File

@ -3,17 +3,19 @@ import puppeteer from "puppeteer";
import axios from "axios"; import axios from "axios";
import mysql from "mysql2/promise"; import mysql from "mysql2/promise";
import { DateTime } from "luxon"; import { DateTime } from "luxon";
import path from "node:path";
import fs from "node:fs";
// Define function promise waiting for a given time const LOG_FILE = path.join(process.cwd(), "newitems.log");
async function wait(ms) { const MAX_LOG_SIZE = 20 * 1024 * 1024; // 20MB
return new Promise((resolve) => setTimeout(resolve, ms));
} // --- HELPERS ---
/**
 * Pause for a given duration.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves (with no value) once the timer fires.
 */
const wait = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
async function safeGetContent(page) { async function safeGetContent(page) {
try { try {
return await page.content(); return await page.content();
} catch (err) { } catch (err) {
// Nếu navigation xảy ra → chờ page ổn định lại rồi đọc tiếp
if (err.message.includes("Execution context was destroyed")) { if (err.message.includes("Execution context was destroyed")) {
await page.waitForNetworkIdle({ idleTime: 1000 }).catch(() => {}); await page.waitForNetworkIdle({ idleTime: 1000 }).catch(() => {});
return await page.content(); return await page.content();
@ -24,71 +26,107 @@ async function safeGetContent(page) {
function parseAndConvertToSydney(dateString, fromTimezone, formatDate) { function parseAndConvertToSydney(dateString, fromTimezone, formatDate) {
const currentYear = new Date().getFullYear(); const currentYear = new Date().getFullYear();
let format = /^\d/.test(dateString[0]) ? "d-MMM HH:mm" : "MMM-d HH:mm";
// Detect format: "Dec-9 23:05" or "9-Dec 23:05"
let format = "MMM-d HH:mm";
if (/^\d/.test(dateString[0])) {
format = "d-MMM HH:mm";
}
const fullDateString = `${dateString} ${currentYear}`; const fullDateString = `${dateString} ${currentYear}`;
const dt = DateTime.fromFormat(fullDateString, `${format} yyyy`, { zone: fromTimezone }); const dt = DateTime.fromFormat(fullDateString, `${format} yyyy`, { zone: fromTimezone });
if (!dt.isValid) { if (!dt.isValid) return null;
// console.log("❌ Invalid Luxon parse:", dt.invalidReason, fullDateString, format);
return null;
}
const systemTz = Intl.DateTimeFormat().resolvedOptions().timeZone; const systemTz = Intl.DateTimeFormat().resolvedOptions().timeZone;
return dt.setZone(systemTz).toFormat(formatDate); return dt.setZone(systemTz).toFormat(formatDate);
} }
async function scrapeWithPuppeteer(store) { // --- LOGIC LẤY CONFIG TỪ DB ---
// console.log(`Fetching with scrapeWithPuppeteer`); async function getConfigs(pool) {
try { let configs = [];
const browser = await puppeteer.launch({ const [keywords] = await pool.execute("SELECT id, name FROM hot_item_keyword WHERE level = 1");
headless: true, const [keywordsCustom] = await pool.execute("SELECT id, name, url FROM newitems_config");
args: ["--no-sandbox", "--disable-setuid-sandbox"], const [markets] = await pool.execute("SELECT id, url, market_code, shipping_postcode, timezone, country_name FROM ebay_site WHERE flag = 1");
});
const page = await browser.newPage(); const keyWord = keywords?.map((el) => el.name)?.join("+");
await page.goto(store.data, { waitUntil: "networkidle2" });
markets.forEach((m) => {
const postCode = m.shipping_postcode ? `&_stpos=${m.shipping_postcode}` : "";
// URL FixedPrice
configs.push({
data: `${m.url}/sch/i.html?_from=R40&_nkw=${keyWord}&_sacat=0&_sop=10${postCode}`,
type_custom: "cisco",
config_id: null,
from_site: m.market_code,
timezone: m.timezone,
listingType: "FixedPrice",
});
// URL Auction
configs.push({
data: `${m.url}/sch/i.html?_from=R40&_nkw=${keyWord}&_sacat=0&_sop=1${postCode}`,
type_custom: "cisco",
config_id: null,
from_site: m.market_code,
timezone: m.timezone,
listingType: "Auction",
});
});
if (keywordsCustom.length > 0) {
keywordsCustom.forEach((k) => {
const matchedMarket = findMarketDataFromSearchUrl(k.url, markets);
if (matchedMarket) {
configs.push({
data: k.url,
type_custom: "custom",
config_id: k.id,
from_site: matchedMarket.market_code,
timezone: matchedMarket.timezone,
listingType: k.url?.includes("_sop=10") ? "FixedPrice" : "Auction",
});
}
});
}
return configs;
}
/**
 * Find the market record that a search URL belongs to.
 *
 * A market matches when its `url` occurs as a substring of the search URL or,
 * failing that, when both URLs parse and share the same host.
 *
 * @param {string} searchUrl - Search URL to classify; surrounding whitespace is ignored.
 * @param {Iterable<{url: string}>} marketDatas - Candidate market records.
 * @returns {object|null} The first matching market record, or null when none matches.
 */
function findMarketDataFromSearchUrl(searchUrl, marketDatas) {
  const trimmedUrl = typeof searchUrl === "string" ? searchUrl.trim() : "";
  if (!trimmedUrl) return null;

  // Parse each URL on its own so one malformed value cannot abort the whole search.
  const hostOf = (value) => {
    try {
      return new URL(value).host;
    } catch {
      return null;
    }
  };

  // May be null: the substring match below still works for unparseable search URLs.
  const searchHost = hostOf(trimmedUrl);

  for (const data of marketDatas ?? []) {
    const marketUrl = data?.url;
    // Skip rows without a usable URL; an empty string would match every search URL.
    if (typeof marketUrl !== "string" || !marketUrl) continue;

    // 1) Direct substring match.
    if (trimmedUrl.includes(marketUrl)) return data;

    // 2) Fallback: same host (covers scheme or path differences).
    const marketHost = hostOf(marketUrl);
    if (searchHost && marketHost && searchHost === marketHost) return data;
  }

  return null;
}
// --- CORE SCRAPER ---
async function scrapeWithPuppeteer(browser, store) {
let page = null;
try {
page = await browser.newPage();
// Tối ưu RAM: Chặn các request không cần thiết
await page.setRequestInterception(true);
page.on("request", (req) => {
if (["image", "stylesheet", "font", "media"].includes(req.resourceType())) req.abort();
else req.continue();
});
await page.goto(store.data, { waitUntil: "networkidle2", timeout: 60000 });
// ----- RETRY HANDLE (Cloudflare / Slow load) -----
let retries = 0; let retries = 0;
while (retries < 10) { while (retries < 10) {
try {
// console.log(`Retry ${retries + 1}`);
const html = await safeGetContent(page); const html = await safeGetContent(page);
// Detect Cloudflare or other blocking messages
if (html.includes("Checking your browser")) { if (html.includes("Checking your browser")) {
await wait(2000); await wait(2000);
retries++; retries++;
continue; continue;
} }
if (await page.$("li.s-card--horizontal")) break;
const hasItems = await page.$("li.s-card--horizontal");
if (hasItems) break;
await wait(2000);
retries++;
} catch (err) {
await wait(2000); await wait(2000);
retries++; retries++;
} }
}
// Wait for cards or detect Cloudflare
const html = await safeGetContent(page);
const needBrowserCheck = html.includes("Checking your browser");
if (needBrowserCheck) {
await page.waitForSelector("li.s-card--horizontal", { timeout: 15000 }).catch(() => null);
}
// ================================================
// MAIN SCRAPING LOGIC (FULL CONVERT FROM PHP)
// ================================================
const items = await page?.$$eval( const items = await page?.$$eval(
"li.s-card--horizontal", "li.s-card--horizontal",
(nodes, store) => { (nodes, store) => {
@ -117,11 +155,12 @@ async function scrapeWithPuppeteer(store) {
nodes.forEach((node) => { nodes.forEach((node) => {
const payload = {}; const payload = {};
// ---------------- LINK + ID ---------------- // ---------------- LINK + ID ----------------
const linkEl = node.querySelector("div.su-media__image a"); const linkEl = node.querySelector("div.su-image a");
// if (!linkEl) return; // if (!linkEl) return;
const linkDetail = linkEl && linkEl?.href ? linkEl?.href : "";
payload.link_detail = linkEl?.href || ""; if (!linkDetail) return;
const idMatch = linkEl.href.match(/\/itm\/(\d+)/); payload.link_detail = linkDetail;
const idMatch = linkDetail.match(/\/itm\/(\d+)/);
if (!idMatch) return; if (!idMatch) return;
payload.id = idMatch[1]; payload.id = idMatch[1];
@ -271,146 +310,87 @@ async function scrapeWithPuppeteer(store) {
store store
); );
const results = items.map((item) => { return items.map((item) => {
if (!item.time) return item; if (!item.time) return item;
// PROCESS START TIME + END TIME
const timeConvert = parseAndConvertToSydney(item.time, store.timezone, "yyyy/MM/dd HH:mm"); const timeConvert = parseAndConvertToSydney(item.time, store.timezone, "yyyy/MM/dd HH:mm");
const timestamp = new Date(timeConvert).getTime() / 1000; const ts = new Date(timeConvert).getTime() / 1000;
return { ...item, timeConvert, start_time: ts, end_time: item.end_time || ts + 2592000, start_time_string: timeConvert };
item.timeConvert = timeConvert;
item.start_time = timestamp;
item.end_time = timestamp + 2592000;
item.start_time_string = timeConvert;
// delete item.time;
return {
...item,
};
}); });
await browser.close();
return results;
} catch (err) { } catch (err) {
console.log("Error scrapeWithPuppeteer:", err); console.error(`Lỗi tại ${store.data}:`, err.message);
return []; return [];
} finally {
if (page) await page.close();
} }
} }
function findMarketDataFromSearchUrl(searchUrl, marketDatas) { function trimLogFileIfNeeded() {
if (!searchUrl || !searchUrl.trim()) {
return null;
}
searchUrl = searchUrl.trim();
let searchHost;
try { try {
searchHost = new URL(searchUrl).host; if (!fs.existsSync(LOG_FILE)) return;
} catch (e) {
searchHost = null;
}
for (const data of marketDatas) { const stats = fs.statSync(LOG_FILE);
if (!data.url) continue; if (stats.size > MAX_LOG_SIZE) {
const data = fs.readFileSync(LOG_FILE, "utf8");
const lines = data.split("\n");
// Giữ lại 70% dòng mới nhất
const keepLines = Math.floor(lines.length * 0.7);
const trimmed = lines.slice(-keepLines).join("\n");
// 1) direct substring match fs.writeFileSync(LOG_FILE, trimmed + "\n", "utf8");
if (searchUrl.includes(data.url)) { console.log(`\n[SYSTEM] Log file trimmed (Size: ${(stats.size / 1024 / 1024).toFixed(2)}MB > 20MB)`);
return data;
} }
} catch (err) {
// 2) fallback: host comparison console.error("Error trimming log file:", err);
let marketHost = null;
try {
marketHost = new URL(data.url).host;
} catch (e) {
marketHost = null;
} }
if (searchHost && marketHost && searchHost === marketHost) {
return data;
}
}
return null;
} }
// --- MAIN PROCESS ---
async function main() { async function main() {
const db = await mysql.createConnection({ const pool = mysql.createPool({
host: process.env.MYSQL_HOST, // e.g. '127.0.0.1' or remote IP host: process.env.MYSQL_HOST,
user: process.env.MYSQL_USER, user: process.env.MYSQL_USER,
password: process.env.MYSQL_PASSWORD, password: process.env.MYSQL_PASSWORD,
database: process.env.MYSQL_DB_NAME, database: process.env.MYSQL_DB_NAME,
waitForConnections: true,
connectionLimit: 5,
}); });
// console.log("✅ Connected to MySQL"); const browser = await puppeteer.launch({
headless: true,
// const errors = []; args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage", "--single-process"],
let configs = [];
const [keywords] = await db.execute("SELECT id, name FROM hot_item_keyword WHERE level = 1");
const [keywordsCustom] = await db.execute("SELECT id, name, url FROM newitems_config");
const [markets] = await db.execute("SELECT id, url, market_code, shipping_postcode, timezone, country_name FROM ebay_site WHERE flag = 1");
const keyWord = keywords?.map((el) => el.name)?.join("+");
markets.forEach((m) => {
const postCode = m.shipping_postcode ? `&_stpos=${m.shipping_postcode}` : "";
// URL FixedPrice (_sop=10)
configs.push({
data: `${m.url}/sch/i.html?_from=R40&_nkw=${keyWord}&_sacat=0&_sop=10${postCode}`,
type_custom: "cisco",
config_id: null,
from_site: m.market_code,
timezone: m.timezone,
listingType: "FixedPrice",
}); });
// URL Auction (_sop=1) console.log("🚀 Bắt đầu phiên làm việc mới...");
configs.push({
data: `${m.url}/sch/i.html?_from=R40&_nkw=${keyWord}&_sacat=0&_sop=1${postCode}`,
type_custom: "cisco",
config_id: null,
from_site: m.market_code,
timezone: m.timezone,
listingType: "Auction",
});
});
if (keywordsCustom.length > 0) { let runCount = 0;
keywordsCustom.forEach((k) => { const MAX_RUNS = 60; // Chạy 60 chu kỳ (~1 tiếng)
const matchedMarket = findMarketDataFromSearchUrl(k.url, markets); const ONE_HOUR_MS = 60 * 60 * 1000;
if (matchedMarket) { const SESSION_START = Date.now(); // Thời điểm bắt đầu chạy script
configs.push({ const MAX_TIME_EXTENSIONS = SESSION_START + ONE_HOUR_MS; // Thời điểm phải kết thúc
data: k.url,
type_custom: "custom", while (runCount < MAX_RUNS && Date.now() < MAX_TIME_EXTENSIONS) {
config_id: k.id, runCount++;
from_site: matchedMarket.market_code, const startTime = Date.now();
timezone: matchedMarket.timezone, console.log(`--- Chu kỳ ${runCount}/${MAX_RUNS} --- ${MAX_TIME_EXTENSIONS - Date.now()} ---`);
listingType: k.url?.includes("_sop=10") ? "FixedPrice" : "Auction",
}); try {
} const configs = await getConfigs(pool);
});
}
// console.log(`Total configs to process: ${configs.length}`);
for (const store of configs) { for (const store of configs) {
// console.log(`Processing ${store.data}`); if (Date.now() > MAX_TIME_EXTENSIONS) break;
// let items = await scrapeWithCheerio(store); const items = await scrapeWithPuppeteer(browser, store);
let items = await scrapeWithPuppeteer(store);
for (const item of items) { for (const item of items) {
// 2⃣ Check if record exists const [rows] = await pool.execute("SELECT id FROM items WHERE id = ?", [item.id]);
const [rows] = await db.execute("SELECT id FROM items WHERE id = ?", [item.id]); if (rows.length > 0) continue;
if (rows.length > 0 || item.id === "123456") continue;
// 3⃣ Insert new record
const title = (item.title || "").replace("Opens in a new window or tab", "").trim(); const title = (item.title || "").replace("Opens in a new window or tab", "").trim();
console.log(`Processing ${store.data}`); console.log(`Processing ${store.data}`);
console.log({ ...item, title }); console.log({ ...item, title });
await axios try {
const res = await axios
.post( .post(
process.env.API_DISTI_HOST + "/api/items/insert", `${process.env.API_DISTI_HOST}/api/items/insert`,
{ ...item, title }, { ...item, title },
{ {
headers: { headers: { "x-key": "CanTho#1" },
"x-key": "CanTho#1",
},
} }
) )
.then((res) => { .then((res) => {
@ -419,16 +399,31 @@ async function main() {
.catch((err) => { .catch((err) => {
console.log(err); console.log(err);
}); });
} catch (e) {
console.error(`❌ Lỗi API ID ${item.id}:`, e.message);
} }
} }
// console.log("✅ Done scraping."); }
} catch (err) {
console.error("Lỗi chu kỳ:", err.message);
}
await db.end(); const duration = Date.now() - startTime;
// console.log("🔌 MySQL connection closed"); const delay = Math.max(0, 60000 - duration);
console.log(`Hoàn thành chu kỳ trong ${duration / 1000}s. Nghỉ ${delay / 1000}s.`);
if (runCount < MAX_RUNS && Date.now() + delay < MAX_TIME_EXTENSIONS) await wait(delay);
}
console.log("🏁 Đã chạy đủ 60 lần. Đang làm mới tiến trình...");
await browser.close();
await pool.end();
// Thực hiện dọn log một lần cuối trước khi thoát process hoàn toàn
trimLogFileIfNeeded();
process.exit(0); process.exit(0);
} }
main().catch((err) => { main().catch((err) => {
console.error(err); console.error("FATAL ERROR:", err);
process.exit(1); process.exit(1);
}); });