// SoldOut/scrap_new_items.js (JavaScript; listed by the file viewer as 430 lines, 14 KiB)

import "dotenv/config";
import puppeteer from "puppeteer";
import axios from "axios";
import mysql from "mysql2/promise";
import { DateTime } from "luxon";
import path from "node:path";
import fs from "node:fs";
// Absolute path of the log file, resolved against the current working directory.
const LOG_FILE = path.resolve("newitems.log");
// Size threshold in bytes (20 MiB) above which the log file is trimmed.
const MAX_LOG_SIZE = 20 * 1024 ** 2;
// --- HELPERS ---
/**
 * Pause helper for async code.
 *
 * @param {number} ms - Delay in milliseconds, handed to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves (with no value) once the timer fires.
 */
const wait = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
/**
 * Read the page's HTML, retrying once when the first attempt fails with an
 * "Execution context was destroyed" error.
 *
 * Any other failure is rethrown untouched. The thrown value is inspected
 * defensively: if it has no string `message` (e.g. a thrown string or
 * `undefined`), it is rethrown as-is instead of triggering a secondary
 * TypeError that would hide the real cause.
 *
 * @param {{ content: Function, waitForNetworkIdle: Function }} page - Puppeteer-like page.
 * @returns {Promise<string>} The HTML returned by `page.content()`.
 * @throws Whatever `page.content()` throws, except the single retried case.
 */
async function safeGetContent(page) {
  try {
    return await page.content();
  } catch (err) {
    const message = typeof err?.message === "string" ? err.message : "";
    if (!message.includes("Execution context was destroyed")) throw err;

    // Best effort: let the page settle before the retry. A failure or timeout
    // here is deliberately ignored; the retry below surfaces any real problem.
    await page.waitForNetworkIdle({ idleTime: 1000 }).catch(() => {});
    return await page.content();
  }
}
/**
 * Parse a year-less timestamp such as "5-Jan 10:23" or "Jan-5 10:23" that is
 * expressed in `fromTimezone`, and re-format it in the timezone of the machine
 * running this script.
 *
 * NOTE: despite the function name, the target zone is whatever
 * `Intl.DateTimeFormat().resolvedOptions().timeZone` reports, not a
 * hard-coded Sydney zone.
 *
 * The input carries no year, so the current year is assumed. If that lands
 * more than one day in the future, the previous year is used instead. This
 * covers a late-December timestamp being read in early January, which would
 * otherwise come out almost a year ahead.
 * NOTE(review): this assumes callers only pass past timestamps (the visible
 * caller passes listing dates) — confirm if other callers exist.
 *
 * @param {string} dateString - Text in "d-MMM HH:mm" form (when it starts with a digit) or "MMM-d HH:mm" form.
 * @param {string} fromTimezone - Zone the text is expressed in, passed to luxon as `zone`.
 * @param {string} formatDate - luxon format string for the output.
 * @returns {string|null} The formatted date, or null when the input is not a
 *   non-empty string or cannot be parsed.
 */
function parseAndConvertToSydney(dateString, fromTimezone, formatDate) {
  if (typeof dateString !== "string" || dateString === "") return null;

  const ONE_DAY_MS = 24 * 60 * 60 * 1000;
  const layout = /^\d/.test(dateString) ? "d-MMM HH:mm" : "MMM-d HH:mm";
  const currentYear = new Date().getFullYear();

  let dt = DateTime.fromFormat(`${dateString} ${currentYear}`, `${layout} yyyy`, { zone: fromTimezone });
  if (!dt.isValid) return null;

  // Year rollover: the one-day tolerance absorbs clock skew.
  if (dt.toMillis() > Date.now() + ONE_DAY_MS) {
    dt = dt.minus({ years: 1 });
  }

  const systemTz = Intl.DateTimeFormat().resolvedOptions().timeZone;
  return dt.setZone(systemTz).toFormat(formatDate);
}
// --- LOAD CONFIG FROM DB ---
/**
 * Assemble the list of search configurations to scrape.
 *
 * Reads three tables through `pool.execute`: level-1 rows of
 * `hot_item_keyword`, all rows of `newitems_config`, and rows of `ebay_site`
 * with `flag = 1`. For every market it emits two keyword searches (one tagged
 * "FixedPrice" using `_sop=10`, one tagged "Auction" using `_sop=1`). It then
 * appends one entry per custom URL whose market can be resolved with
 * `findMarketDataFromSearchUrl`; unresolved custom URLs are skipped.
 *
 * @param {{ execute: Function }} pool - mysql2-style pool; `execute` resolves to `[rows, ...]`.
 * @returns {Promise<Array<{data: string, type_custom: string, config_id: (number|null),
 *   from_site: string, timezone: string, listingType: string}>>}
 */
async function getConfigs(pool) {
  const [keywordRows] = await pool.execute("SELECT id, name FROM hot_item_keyword WHERE level = 1");
  const [customRows] = await pool.execute("SELECT id, name, url FROM newitems_config");
  const [marketRows] = await pool.execute(
    "SELECT id, url, market_code, shipping_postcode, timezone, country_name FROM ebay_site WHERE flag = 1"
  );

  // All keyword names joined into a single "+"-separated search term.
  const keywordQuery = keywordRows?.map(({ name }) => name)?.join("+");
  const result = [];

  for (const market of marketRows) {
    const postcodeParam = market.shipping_postcode ? `&_stpos=${market.shipping_postcode}` : "";
    const buildKeywordSearch = (sortCode, listingType) => ({
      data: `${market.url}/sch/i.html?_from=R40&_nkw=${keywordQuery}&_sacat=0&_sop=${sortCode}${postcodeParam}`,
      type_custom: "cisco",
      config_id: null,
      from_site: market.market_code,
      timezone: market.timezone,
      listingType,
    });
    // Fixed-price search first, then the auction search, for each market.
    result.push(buildKeywordSearch(10, "FixedPrice"), buildKeywordSearch(1, "Auction"));
  }

  for (const custom of customRows) {
    const market = findMarketDataFromSearchUrl(custom.url, marketRows);
    if (!market) continue;
    const isFixedPrice = custom.url?.includes("_sop=10");
    result.push({
      data: custom.url,
      type_custom: "custom",
      config_id: custom.id,
      from_site: market.market_code,
      timezone: market.timezone,
      listingType: isFixedPrice ? "FixedPrice" : "Auction",
    });
  }

  return result;
}
/**
 * Find the market row that a search URL belongs to.
 *
 * Matching happens in two passes over `marketDatas`:
 *   1. exact host equality between the search URL and the market's `url`;
 *   2. only if no host matched, the legacy fallback: the search URL contains
 *      the market's `url` as a substring.
 *
 * Host equality is checked first on purpose. A plain substring test makes
 * "https://www.ebay.com.au/..." match a market whose url is
 * "https://www.ebay.com" whenever that market is listed earlier.
 * Each market URL is parsed on its own, so a malformed row is skipped
 * instead of aborting the lookup.
 *
 * @param {string} searchUrl - Search URL to classify (surrounding whitespace is ignored).
 * @param {Array<{url: string}>} marketDatas - Candidate market rows.
 * @returns {object|null} The matching row from `marketDatas`, or null when the
 *   search URL is empty or unparsable, or when nothing matches.
 */
function findMarketDataFromSearchUrl(searchUrl, marketDatas) {
  // Host of a URL string, or null when the value is not a parsable URL.
  const hostOf = (value) => {
    if (typeof value !== "string") return null;
    try {
      return new URL(value.trim()).host;
    } catch {
      return null;
    }
  };

  const target = typeof searchUrl === "string" ? searchUrl.trim() : "";
  if (!target) return null;

  const searchHost = hostOf(target);
  if (!searchHost) return null;

  const markets = Array.isArray(marketDatas) ? marketDatas : [];

  const byHost = markets.find((market) => hostOf(market?.url) === searchHost);
  if (byHost) return byHost;

  // Legacy fallback. Empty market URLs are excluded because every string
  // "contains" the empty string.
  const bySubstring = markets.find(
    (market) => typeof market?.url === "string" && market.url !== "" && target.includes(market.url)
  );
  return bySubstring ?? null;
}
// --- CORE SCRAPER ---
/**
 * In-page extractor handed to `page.$$eval`: turns result-card elements into
 * plain payload objects.
 *
 * Puppeteer serialises this function and runs it inside the browser page, so
 * it must stay self-contained: it may only use its arguments and browser
 * globals, never module-level bindings of this file.
 *
 * A card is skipped when any of these holds:
 *   - it has no link, or the link has no `/itm/<digits>` id;
 *   - it does not match `store.listingType` ("Auction" needs a bid count;
 *     anything else needs "Buy It Now" or "Best Offer");
 *   - it names an item location that does not map to `store.from_site`;
 *   - it has no seller or no price.
 *
 * @param {Element[]} nodes - Matched card elements.
 * @param {{listingType?: string, from_site?: string, type_custom?: string, config_id?: number}} store
 * @returns {object[]} One payload per accepted card.
 */
function extractCardsInPage(nodes, store) {
  const COUNTRY_TO_MARKET = [
    { country_name: "Australia", market_code: "EBAY_AU" },
    { country_name: "United Kingdom", market_code: "EBAY_GB" },
    { country_name: "United States", market_code: "EBAY_US" },
    { country_name: "Canada", market_code: "EBAY_ENCA" },
  ];
  const SECONDS_PER_UNIT = { d: 86400, h: 3600, m: 60, s: 1 };

  // "1d 2h" / "2h 30m" / "5 minutes" -> number of seconds (0 when empty).
  const timeLeftToSeconds = (str) => {
    if (!str) return 0;
    const regex = /(\d+)\s*(d|day|h|hour|m|minute|s|second)s?/gi;
    let total = 0;
    let match;
    while ((match = regex.exec(str))) {
      const value = Number.parseInt(match[1], 10);
      // Every alternative of the unit group starts with d, h, m or s.
      total += value * SECONDS_PER_UNIT[match[2].toLowerCase()[0]];
    }
    return total;
  };

  const listingType = store.listingType || "FixedPrice";
  const typeCustom = store.type_custom || "custom";
  const expectedSite = store.from_site;
  const results = [];

  for (const node of nodes) {
    const payload = {};

    // ---------------- LINK + ID ----------------
    const linkDetail = node.querySelector("div.su-image a")?.href || "";
    if (!linkDetail) continue;
    payload.link_detail = linkDetail;
    const idMatch = linkDetail.match(/\/itm\/(\d+)/);
    if (!idMatch) continue;
    payload.id = idMatch[1];

    // ---------------- NAME ----------------
    const titleEl = node.querySelector(".s-card__title");
    if (titleEl) {
      const name = titleEl.textContent.replace(/New\s*listing/i, "").trim();
      payload.title = name;
      payload.description = name;
    }

    // ---------------- CONDITION ----------------
    const condEl = node.querySelector(".s-card__subtitle");
    payload.condition_item = condEl?.textContent.trim() || "";

    // ---------------- IMAGE ----------------
    const pictureEl = node.querySelector("img.s-card__image");
    if (pictureEl) payload.picture = pictureEl.getAttribute("src") || "";

    // ---------------- PRICE + CURRENCY ----------------
    const priceEl = node.querySelector(".s-card__price");
    if (priceEl) {
      // Strip every thousands separator, not just the first one, so that
      // "1,234,567.00" parses as 1234567.00.
      let text = priceEl.textContent.replace(/,/g, "").trim();
      text = text.replace("£", "GBP ").replace("$", "");
      payload.priceText = text;
      const match = text.match(/([A-Za-z]{3})?\s?([\d.,]+)\s?([A-Za-z]{3})?/);
      if (match) {
        payload.currencyID = match[1] || match[3] || "";
        payload.price = match[2] || "";
        if (!payload.currencyID?.trim()) {
          // Second chance for 2-letter prefixes such as "AU 25.50".
          const shortCode = text.match(/([A-Za-z]{2,3})\s?([\d.]+)/);
          if (shortCode) payload.currencyID = shortCode[1] || "";
        }
      }
    }

    // ---------------- AUCTION / BIDS ----------------
    const bidsEl = node.querySelector(".s-card__attribute-row .su-styled-text.large");
    if (bidsEl) {
      const txt = bidsEl.textContent.trim();
      // Accept the singular too: "1 bid" as well as "0 bids" / "3 bids".
      if (/\bbids?\b/i.test(txt)) {
        const bid = txt.match(/\d+/);
        if (bid) payload.bidCount = bid[0];
        payload.is_auctionList = 1;
      }
      if (/Buy It Now/i.test(txt)) payload.buyItNowAvailable = true;
      if (/Best Offer/i.test(txt)) payload.makeOffer = true;
    }

    // Keep only cards of the listing type this search is meant to collect.
    if (listingType === "Auction" && !payload.is_auctionList) continue;
    if (listingType !== "Auction" && !payload.buyItNowAvailable && !payload.makeOffer) continue;

    // ---------------- TIME LEFT (auction) ----------------
    const leftEl = node.querySelector(".s-card__time-left");
    if (leftEl) {
      const leftText = leftEl.textContent.trim();
      payload.timeLeft = leftText;
      payload.end_time = Math.floor(Date.now() / 1000 + timeLeftToSeconds(leftText));
    }

    // ---------------- SHIPPING & FROM SITE ----------------
    node.querySelectorAll(".s-card__attribute-row .su-styled-text.secondary.large").forEach((sp) => {
      const txt = sp.textContent;
      const lower = txt?.toLowerCase();

      // Shipping cost.
      if (lower?.includes("delivery")) {
        const cost = txt.match(/\d+\.\d+/);
        if (cost) payload.shipping_cost = cost[0];
        if (lower.includes("free") || !payload.shipping_cost) payload.shipping_cost = 0;
      }

      // Item location -> market code.
      if (lower?.includes("from") || lower?.includes("located in")) {
        payload.country = txt;
        const site = COUNTRY_TO_MARKET.find((entry) => txt.includes(entry.country_name));
        if (site) payload.from_site = site.market_code;
      }
    });
    if (!payload.country) payload.from_site = expectedSite;
    // A card that names a location must map to the market being scraped.
    if (payload.country && (payload.from_site !== expectedSite || !payload.from_site)) continue;
    delete payload.country;

    // ---------------- SELLER / FEEDBACK ----------------
    node.querySelectorAll(".s-card__attribute-row .su-styled-text.primary.large").forEach((sp) => {
      const text = sp.textContent;
      // e.g. "96.9% positive (105)"
      const feedback = text.match(/^([\d.]+%)\s*\w*\s*\(([^)]+)\)/);
      if (feedback) {
        payload.feedbackPercent = feedback[1];
        payload.feedbackScore = feedback[2];
      } else {
        payload.seller = text.trim();
      }
    });
    if (!payload.seller || !payload.price) continue;

    // ---------------- LISTING DATE (Buy it now listing) ----------------
    // Only the raw text is kept here; the timezone conversion happens in
    // Node after $$eval returns.
    const dateEl = node.querySelector(".s-card__attribute-row .su-styled-text.secondary.bold.large");
    if (dateEl) {
      const text = dateEl.textContent.trim();
      if (!Number.isNaN(Date.parse(text))) payload.time = text;
    }

    payload.current_time = Date.now();
    payload.type_custom = typeCustom;
    payload.listingType = listingType;
    payload.config_id = store.config_id || null;
    results.push(payload);
  }

  return results;
}

/**
 * Open one search URL in a fresh page, wait for result cards, and return the
 * extracted listings.
 *
 * Image, stylesheet, font and media requests are aborted to save memory. The
 * page is polled up to 10 times, 2 s apart, until a result card exists and no
 * "Checking your browser" interstitial is in the HTML. After that the cards
 * are extracted with `extractCardsInPage`.
 *
 * For items carrying a listing date (`time`), `start_time` is derived through
 * `parseAndConvertToSydney`, and `end_time` defaults to 30 days later when
 * none was set from the time-left text. When that date cannot be converted,
 * the item is returned without start/end fields; no epoch-0 timestamp is
 * fabricated.
 *
 * The function never throws: errors are logged and produce `[]`. A failure
 * while closing the page is logged too and does not discard the scraped items.
 *
 * @param {object} browser - Puppeteer browser (needs `newPage()`).
 * @param {{data: string, timezone?: string, listingType?: string, from_site?: string,
 *   type_custom?: string, config_id?: (number|null)}} store - Search configuration.
 * @returns {Promise<object[]>} Extracted item payloads (possibly empty).
 */
async function scrapeWithPuppeteer(browser, store) {
  const CARD_SELECTOR = "li.s-card--horizontal";
  const BLOCKED_RESOURCE_TYPES = ["image", "stylesheet", "font", "media"];
  const DEFAULT_LISTING_SECONDS = 2592000; // 30 days
  let page = null;

  try {
    page = await browser.newPage();

    // Save RAM: block requests the scraper does not need.
    await page.setRequestInterception(true);
    page.on("request", (req) => {
      if (BLOCKED_RESOURCE_TYPES.includes(req.resourceType())) req.abort();
      else req.continue();
    });

    await page.goto(store.data, { waitUntil: "networkidle2", timeout: 60000 });

    for (let attempt = 0; attempt < 10; attempt++) {
      const html = await safeGetContent(page);
      const challenged = html.includes("Checking your browser");
      if (!challenged && (await page.$(CARD_SELECTOR))) break;
      await wait(2000);
    }

    const items = await page.$$eval(CARD_SELECTOR, extractCardsInPage, store);

    return items.map((item) => {
      if (!item.time) return item;
      const timeConvert = parseAndConvertToSydney(item.time, store.timezone, "yyyy/MM/dd HH:mm");
      // The in-page Date.parse check is looser than the luxon format used for
      // conversion, so the conversion can still fail.
      if (!timeConvert) return item;
      const ts = new Date(timeConvert).getTime() / 1000;
      if (!Number.isFinite(ts)) return item;
      return {
        ...item,
        timeConvert,
        start_time: ts,
        end_time: item.end_time || ts + DEFAULT_LISTING_SECONDS,
        start_time_string: timeConvert,
      };
    });
  } catch (err) {
    console.error(`Lỗi tại ${store.data}:`, err.message);
    return [];
  } finally {
    if (page) {
      try {
        await page.close();
      } catch (closeErr) {
        // Throwing from `finally` would replace the return value above.
        console.error(`Failed to close page for ${store.data}:`, closeErr?.message);
      }
    }
  }
}
/**
 * Shrink the log file once it exceeds MAX_LOG_SIZE, keeping the newest 70 %
 * of its lines (rounded down).
 *
 * The newline that terminates the file is not counted as a line, so repeated
 * trims do not pile up blank lines at the end. If the keep-count rounds down
 * to zero (a single oversized line), the file is emptied; `slice(-0)` would
 * otherwise keep everything and never shrink it.
 *
 * Best effort: any filesystem error is logged and swallowed so that log
 * maintenance can never take the process down.
 */
function trimLogFileIfNeeded() {
  try {
    if (!fs.existsSync(LOG_FILE)) return;

    const { size } = fs.statSync(LOG_FILE);
    if (size <= MAX_LOG_SIZE) return;

    const lines = fs.readFileSync(LOG_FILE, "utf8").split("\n");
    // A file ending in "\n" yields a trailing empty element; drop it.
    if (lines.length > 0 && lines[lines.length - 1] === "") lines.pop();

    // Keep the newest 70 % of the lines.
    const keepCount = Math.floor(lines.length * 0.7);
    const kept = keepCount > 0 ? lines.slice(-keepCount) : [];
    const output = kept.length > 0 ? `${kept.join("\n")}\n` : "";
    fs.writeFileSync(LOG_FILE, output, "utf8");

    const sizeMb = (size / 1024 / 1024).toFixed(2);
    const limitMb = MAX_LOG_SIZE / 1024 / 1024;
    console.log(`\n[SYSTEM] Log file trimmed (Size: ${sizeMb}MB > ${limitMb}MB)`);
  } catch (err) {
    console.error("Error trimming log file:", err);
  }
}
// --- MAIN PROCESS ---
/**
 * Run one scraping session and then exit the process.
 *
 * A session is at most MAX_RUNS cycles and at most one hour, whichever ends
 * first. Each cycle:
 *   1. reloads the search configurations from the database;
 *   2. scrapes every configuration;
 *   3. POSTs each item whose id is not yet in the `items` table to the insert API;
 *   4. sleeps so that a cycle takes at least 60 s, unless that sleep would
 *      cross the deadline.
 *
 * Errors inside a cycle are logged and the loop carries on. A failed API call
 * for one item is logged with its id and does not stop the remaining items.
 * Browser, pool and log trimming are handled in `finally`, so they also run
 * when launching the browser fails.
 *
 * Always ends with `process.exit(0)` on the normal path; an error that
 * escapes (e.g. the browser failing to launch) propagates to the caller.
 */
async function main() {
  const pool = mysql.createPool({
    host: process.env.MYSQL_HOST,
    user: process.env.MYSQL_USER,
    password: process.env.MYSQL_PASSWORD,
    database: process.env.MYSQL_DB_NAME,
    waitForConnections: true,
    connectionLimit: 5,
  });
  let browser = null;

  try {
    browser = await puppeteer.launch({
      headless: true,
      args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage", "--single-process"],
    });
    console.log("🚀 Bắt đầu phiên làm việc mới...");

    const MAX_RUNS = 60; // 60 cycles, roughly one hour
    const ONE_HOUR_MS = 60 * 60 * 1000;
    const CYCLE_MS = 60000;
    const sessionDeadline = Date.now() + ONE_HOUR_MS; // hard stop for this session
    let runCount = 0;

    while (runCount < MAX_RUNS && Date.now() < sessionDeadline) {
      runCount++;
      const startTime = Date.now();
      console.log(`--- Chu kỳ ${runCount}/${MAX_RUNS} --- ${sessionDeadline - Date.now()} ---`);

      try {
        const configs = await getConfigs(pool);
        for (const store of configs) {
          if (Date.now() > sessionDeadline) break;

          const items = await scrapeWithPuppeteer(browser, store);
          for (const item of items) {
            // Skip items that are already stored.
            const [rows] = await pool.execute("SELECT id FROM items WHERE id = ?", [item.id]);
            if (rows.length > 0) continue;

            const title = (item.title || "").replace("Opens in a new window or tab", "").trim();
            const payload = { ...item, title };
            console.log(`Processing ${store.data}`);
            console.log(payload);

            try {
              // NOTE(review): the API key is hard-coded in source; it should
              // come from configuration (e.g. an environment variable) and be
              // rotated. Left unchanged here to avoid breaking deployments.
              const res = await axios.post(`${process.env.API_DISTI_HOST}/api/items/insert`, payload, {
                headers: { "x-key": "CanTho#1" },
              });
              console.log(res.data, item.id, item.timeConvert, item.time);
            } catch (e) {
              // Log the item id and message, not the whole axios error object.
              console.error(`❌ Lỗi API ID ${item.id}:`, e.message);
            }
          }
        }
      } catch (err) {
        console.error("Lỗi chu kỳ:", err.message);
      }

      const duration = Date.now() - startTime;
      const delay = Math.max(0, CYCLE_MS - duration);
      console.log(`Hoàn thành chu kỳ trong ${duration / 1000}s. Nghỉ ${delay / 1000}s.`);
      if (runCount < MAX_RUNS && Date.now() + delay < sessionDeadline) await wait(delay);
    }

    console.log("🏁 Đã chạy đủ 60 lần. Đang làm mới tiến trình...");
  } finally {
    // Each cleanup step is independent: one failing must not skip the next.
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        console.error("Error closing browser:", closeErr?.message);
      }
    }
    try {
      await pool.end();
    } catch (poolErr) {
      console.error("Error closing MySQL pool:", poolErr?.message);
    }
    // Trim the log one last time before the process exits.
    trimLogFileIfNeeded();
  }

  process.exit(0);
}
// Script entry point: start the session. Any error that escapes main() is
// logged and the process exits with status 1.
main().catch((err) => {
console.error("FATAL ERROR:", err);
process.exit(1);
});