SoldOut/scrap_new_items.js

435 lines
13 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import "dotenv/config";
import puppeteer from "puppeteer";
import axios from "axios";
import mysql from "mysql2/promise";
import { DateTime } from "luxon";
/**
 * Pause for a given amount of time.
 *
 * @param {number} ms - Delay in milliseconds, handed to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves (with no value) once the timer fires.
 */
async function wait(ms) {
  const timerElapsed = new Promise((done) => {
    setTimeout(done, ms);
  });
  return timerElapsed;
}
/**
 * Read the page's HTML, tolerating one navigation that happens mid-read.
 *
 * If `page.content()` fails with "Execution context was destroyed", the
 * function waits for the network to go idle (best effort) and reads the
 * content once more. Every other failure is rethrown unchanged.
 *
 * @param {object} page - Object exposing `content()` and `waitForNetworkIdle()`
 *   (a Puppeteer page in this file).
 * @returns {Promise<string>} The page HTML.
 * @throws Whatever `page.content()` threw when it is not the navigation error,
 *   or whatever the second `page.content()` call throws.
 */
async function safeGetContent(page) {
  try {
    return await page.content();
  } catch (err) {
    // Guard the message lookup: a thrown value without a string `message`
    // must be rethrown as-is instead of triggering a TypeError here.
    const message = typeof err?.message === "string" ? err.message : "";

    // A navigation happened: wait for the page to settle, then read again.
    if (message.includes("Execution context was destroyed")) {
      // Best effort only: a timeout while waiting for idle must not prevent the retry.
      await page.waitForNetworkIdle({ idleTime: 1000 }).catch(() => {});
      return await page.content();
    }
    throw err;
  }
}
/**
 * Parse a year-less listing timestamp ("Dec-9 23:05" or "9-Dec 23:05") that is
 * expressed in `fromTimezone` and re-format it in the machine's time zone.
 *
 * NOTE: despite the name, the target zone is whatever
 * `Intl.DateTimeFormat().resolvedOptions().timeZone` reports, not a fixed
 * Sydney zone.
 *
 * The text carries no year, so the current year (as seen in `fromTimezone`)
 * is assumed. If that puts the result more than one day in the future, the
 * date is taken to belong to the previous year (e.g. "Dec-31" read on Jan 2).
 *
 * @param {string} dateString - Date text without a year.
 * @param {string} fromTimezone - Zone the text is expressed in (passed to Luxon).
 * @param {string} formatDate - Luxon output format, e.g. "yyyy/MM/dd HH:mm".
 * @returns {?string} Formatted date, or null when the input cannot be parsed.
 */
function parseAndConvertToSydney(dateString, fromTimezone, formatDate) {
  if (typeof dateString !== "string") {
    return null;
  }
  const text = dateString.trim();
  if (text === "") {
    return null;
  }

  // Take "now" in the source zone so the assumed year is right around New Year.
  const nowInSourceZone = DateTime.now().setZone(fromTimezone);
  if (!nowInSourceZone.isValid) {
    return null;
  }

  // Detect format: a leading digit means day-first ("9-Dec 23:05").
  const format = /^\d/.test(text) ? "d-MMM HH:mm" : "MMM-d HH:mm";
  let dt = DateTime.fromFormat(`${text} ${nowInSourceZone.year}`, `${format} yyyy`, {
    zone: fromTimezone,
  });
  if (!dt.isValid) {
    return null;
  }

  // Year rollover: a date clearly in the future belongs to the previous year.
  // The one-day slack tolerates small clock differences.
  if (dt > nowInSourceZone.plus({ days: 1 })) {
    dt = dt.minus({ years: 1 });
  }

  const systemTz = Intl.DateTimeFormat().resolvedOptions().timeZone;
  return dt.setZone(systemTz).toFormat(formatDate);
}
/**
 * Open one search-results URL in headless Chromium and extract the listing
 * cards (`li.s-card--horizontal`) that match the requested listing type and
 * marketplace.
 *
 * The browser is always closed, including when scraping fails.
 *
 * @param {object} store - Scrape configuration (must be serialisable, it is
 *   forwarded into the page context).
 * @param {string} store.data - URL to open.
 * @param {string} [store.listingType="FixedPrice"] - "Auction" keeps only cards
 *   showing bids; any other value keeps only Buy It Now / Best Offer cards.
 * @param {string} [store.from_site] - Market code the items must ship from.
 * @param {string} [store.timezone] - Zone the listing dates are expressed in.
 * @param {string} [store.type_custom="custom"] - Copied onto every item.
 * @param {?number} [store.config_id] - Copied onto every item (null if absent).
 * @returns {Promise<object[]>} Extracted items; an empty array when anything fails.
 */
async function scrapeWithPuppeteer(store) {
  const CARD_SELECTOR = "li.s-card--horizontal";
  const CHALLENGE_TEXT = "Checking your browser";
  const TIME_FORMAT = "yyyy/MM/dd HH:mm";
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: true,
      args: ["--no-sandbox", "--disable-setuid-sandbox"],
    });
    const page = await browser.newPage();
    await page.goto(store.data, { waitUntil: "networkidle2" });

    // ----- RETRY HANDLE (Cloudflare / slow load) -----
    // Up to 10 attempts, 2 s apart, until the challenge text is gone and at
    // least one card is present.
    let retries = 0;
    while (retries < 10) {
      try {
        const html = await safeGetContent(page);
        if (!html.includes(CHALLENGE_TEXT)) {
          const hasItems = await page.$(CARD_SELECTOR);
          if (hasItems) break;
        }
      } catch (err) {
        // Deliberate best effort: the page may be mid-navigation, so count
        // this as a failed attempt and try again.
      }
      await wait(2000);
      retries++;
    }

    // Still on the challenge page: give the cards one last chance to appear.
    const html = await safeGetContent(page);
    if (html.includes(CHALLENGE_TEXT)) {
      await page.waitForSelector(CARD_SELECTOR, { timeout: 15000 }).catch(() => null);
    }

    // ================================================
    // MAIN SCRAPING LOGIC (ported from PHP)
    // The callback runs inside the page, so it must be self-contained and
    // cannot reference anything from this module.
    // ================================================
    const items = await page.$$eval(
      CARD_SELECTOR,
      (nodes, cfg) => {
        const results = [];

        // Turn "2d 3h", "5 minutes", ... into a number of seconds.
        const stringToTimestamp = (str) => {
          if (!str) return 0;
          const regex = /(\d+)\s*(d|day|h|hour|m|minute|s|second)s?/gi;
          let total = 0;
          let match;
          while ((match = regex.exec(str))) {
            const value = parseInt(match[1], 10);
            const unit = match[2].toLowerCase();
            if (unit === "d" || unit === "day") total += value * 86400;
            else if (unit === "h" || unit === "hour") total += value * 3600;
            else if (unit === "m" || unit === "minute") total += value * 60;
            else if (unit === "s" || unit === "second") total += value;
          }
          return total;
        };

        nodes.forEach((node) => {
          const payload = {};

          // ---------------- LINK + ID ----------------
          const linkEl = node.querySelector("div.su-media__image a");
          // A card without a link has no item id; skip it instead of throwing
          // (a throw here would abort the extraction of the whole page).
          if (!linkEl) return;
          payload.link_detail = linkEl.href || "";
          const idMatch = linkEl.href.match(/\/itm\/(\d+)/);
          if (!idMatch) return;
          payload.id = idMatch[1];

          // ---------------- NAME ----------------
          const titleEl = node.querySelector(".s-card__title");
          if (titleEl) {
            const name = titleEl.textContent.replace(/New\s*listing/i, "").trim();
            payload.title = name;
            payload.description = name;
          }

          // ---------------- CONDITION ----------------
          const condEl = node.querySelector(".s-card__subtitle");
          payload.condition_item = condEl?.textContent.trim() || "";

          // ---------------- IMAGE ----------------
          const pictureEl = node.querySelector("img.s-card__image");
          if (pictureEl) payload.picture = pictureEl.getAttribute("src") || "";

          // ---------------- PRICE + CURRENCY ----------------
          const priceEl = node.querySelector(".s-card__price");
          if (priceEl) {
            // Drop every thousands separator, not just the first one.
            let text = priceEl.textContent.replace(/,/g, "").trim();
            text = text.replace("£", "GBP ").replace("$", "");
            payload.priceText = text;
            const match = text.match(/([A-Za-z]{3})?\s?([\d.,]+)\s?([A-Za-z]{3})?/);
            if (match) {
              payload.currencyID = match[1] || match[3] || "";
              payload.price = match[2] || "";
              if (!payload.currencyID?.trim()) {
                // Fallback for 2-3 letter prefixes such as "AU 12.00".
                const match1 = text.match(/([A-Za-z]{2,3})\s?([\d.]+)/);
                if (match1) {
                  payload.currencyID = match1[1] || "";
                }
              }
            }
          }

          // ---------------- AUCTION / BIDS ----------------
          const bidsEl = node.querySelector(".s-card__attribute-row .su-styled-text.large");
          if (bidsEl) {
            const txt = bidsEl.textContent.trim();
            if (/bids/i.test(txt)) {
              const bid = txt.match(/\d+/);
              if (bid) payload.bidCount = bid[0];
              payload.is_auctionList = 1;
            }
            if (/Buy It Now/i.test(txt)) payload.buyItNowAvailable = true;
            if (/Best Offer/i.test(txt)) payload.makeOffer = true;
          }

          // Keep only the requested listing type.
          const type = cfg.listingType || "FixedPrice";
          if (type === "Auction" && !payload.is_auctionList) return;
          if (type !== "Auction" && !payload.buyItNowAvailable && !payload.makeOffer) return;

          // ---------------- TIME LEFT (auction) ----------------
          const leftEl = node.querySelector(".s-card__time-left");
          if (leftEl) {
            const leftText = leftEl.textContent.trim();
            payload.timeLeft = leftText;
            payload.end_time = Math.floor(Date.now() / 1000 + stringToTimestamp(leftText));
          }

          // ---------------- SHIPPING & FROM SITE ----------------
          node
            .querySelectorAll(".s-card__attribute-row .su-styled-text.secondary.large")
            .forEach((sp) => {
              const txt = sp.textContent;
              const lower = txt?.toLowerCase();

              // Shipping cost.
              if (lower?.includes("delivery")) {
                const m = txt.match(/\d+\.\d+/);
                if (m) payload.shipping_cost = m[0];
                if (lower.includes("free") || !payload.shipping_cost) payload.shipping_cost = 0;
              }

              // Country the item ships from.
              if (lower?.includes("from") || lower?.includes("located in")) {
                payload.country = txt;
                const markets = [
                  { country_name: "Australia", market_code: "EBAY_AU" },
                  { country_name: "United Kingdom", market_code: "EBAY_GB" },
                  { country_name: "United States", market_code: "EBAY_US" },
                  { country_name: "Canada", market_code: "EBAY_ENCA" },
                ];
                for (const site of markets) {
                  if (txt.includes(site.country_name)) {
                    payload.from_site = site.market_code;
                    break;
                  }
                }
              }
            });

          // No country shown: assume the configured market. A country that
          // maps to another (or to no known) market drops the card.
          const fromSite = cfg.from_site;
          if (!payload.country) payload.from_site = fromSite;
          if (payload.country && (payload.from_site !== fromSite || !payload.from_site)) return;
          delete payload.country;

          // ---------------- SELLER / FEEDBACK ----------------
          node
            .querySelectorAll(".s-card__attribute-row .su-styled-text.primary.large")
            .forEach((sp) => {
              const text = sp.textContent;
              // e.g. "96.9% positive (105)"
              const m = text.match(/^([\d.]+%)\s*\w*\s*\(([^)]+)\)/);
              if (m) {
                payload.feedbackPercent = m[1];
                payload.feedbackScore = m[2];
              } else {
                payload.seller = text.trim();
              }
            });
          if (!payload.seller || !payload.price) return;

          // ---------------- LISTING DATE (Buy It Now listing) ----------------
          const dateEl = node.querySelector(
            ".s-card__attribute-row .su-styled-text.secondary.bold.large"
          );
          if (dateEl) {
            const text = dateEl.textContent.trim();
            // NOTE: kept identical to the PHP logic. The raw text is only
            // recorded here; the zone conversion happens back in Node.
            const ts = Date.parse(text);
            if (!isNaN(ts)) {
              payload.time = text;
            }
          }

          payload.current_time = Date.now();
          payload.type_custom = cfg.type_custom || "custom";
          payload.listingType = type;
          payload.config_id = cfg.config_id || null;
          results.push(payload);
        });
        return results;
      },
      store
    );

    // Derive start/end timestamps from the listing date where one was found.
    return items.map((item) => {
      if (!item.time) return item;
      const timeConvert = parseAndConvertToSydney(item.time, store.timezone, TIME_FORMAT);
      // Unparsable date: leave the item untouched rather than stamping it
      // with epoch 0 (which is what `new Date(null)` would produce).
      if (!timeConvert) return item;
      // `timeConvert` is expressed in the system zone, which is also Luxon's
      // default zone, so parsing it back with the same format round-trips.
      const timestamp = DateTime.fromFormat(timeConvert, TIME_FORMAT).toSeconds();
      return {
        ...item,
        timeConvert,
        start_time: timestamp,
        end_time: timestamp + 2592000, // 30 days after the listing date
        start_time_string: timeConvert,
      };
    });
  } catch (err) {
    console.log("Error scrapeWithPuppeteer:", err);
    return [];
  } finally {
    // Always release the Chromium process, even after a failure.
    if (browser) {
      await browser.close().catch((closeErr) => {
        console.log("Error closing browser:", closeErr);
      });
    }
  }
}
/**
 * Find the market entry that a search URL belongs to.
 *
 * Entries are checked in order. An entry matches when its `url` occurs inside
 * the (trimmed) search URL or, failing that, when both URLs parse and share
 * the same host. Entries without a `url` are skipped.
 *
 * @param {string} searchUrl - Search URL to classify.
 * @param {Iterable<object>} marketDatas - Candidate entries carrying a `url`.
 * @returns {?object} The first matching entry, or null when the search URL is
 *   empty/blank or nothing matches.
 */
function findMarketDataFromSearchUrl(searchUrl, marketDatas) {
  // Host of a URL string, or null when the string is not a parsable URL.
  const hostOf = (rawUrl) => {
    try {
      return new URL(rawUrl).host;
    } catch {
      return null;
    }
  };

  if (!searchUrl) {
    return null;
  }
  const needle = searchUrl.trim();
  if (!needle) {
    return null;
  }

  const needleHost = hostOf(needle);
  for (const candidate of marketDatas) {
    const marketUrl = candidate.url;
    if (!marketUrl) {
      continue;
    }
    // 1) direct substring match, 2) fallback: same non-empty host
    const containsMarketUrl = needle.includes(marketUrl);
    if (containsMarketUrl || (needleHost && needleHost === hostOf(marketUrl))) {
      return candidate;
    }
  }
  return null;
}
/**
 * Script entry point.
 *
 * 1. Loads level-1 keywords, custom search configs and enabled markets from MySQL.
 * 2. Builds one FixedPrice (`_sop=10`) and one Auction (`_sop=1`) search URL
 *    per market, plus one config per custom URL that maps to a known market.
 * 3. Scrapes every config and posts each item not yet present in the `items`
 *    table to `${API_DISTI_HOST}/api/items/insert`.
 *
 * The MySQL connection is closed on both the success and the failure path;
 * on success the process exits with status 0.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const db = await mysql.createConnection({
    host: process.env.MYSQL_HOST, // e.g. '127.0.0.1' or remote IP
    user: process.env.MYSQL_USER,
    password: process.env.MYSQL_PASSWORD,
    database: process.env.MYSQL_DB_NAME,
  });

  try {
    const [keywords] = await db.execute("SELECT id, name FROM hot_item_keyword WHERE level = 1");
    const [keywordsCustom] = await db.execute("SELECT id, name, url FROM newitems_config");
    const [markets] = await db.execute(
      "SELECT id, url, market_code, shipping_postcode, timezone, country_name FROM ebay_site WHERE flag = 1"
    );

    // Encode each keyword so characters such as "&", "#" or "=" cannot break
    // the query string; "+" between keywords is the separator.
    const keyWord = (keywords ?? []).map((el) => encodeURIComponent(el.name ?? "")).join("+");

    // Sort order per listing type: _sop=10 for FixedPrice, _sop=1 for Auction.
    const listingSorts = [
      ["FixedPrice", 10],
      ["Auction", 1],
    ];

    const configs = [];
    for (const m of markets) {
      const postCode = m.shipping_postcode ? `&_stpos=${m.shipping_postcode}` : "";
      for (const [listingType, sop] of listingSorts) {
        configs.push({
          data: `${m.url}/sch/i.html?_from=R40&_nkw=${keyWord}&_sacat=0&_sop=${sop}${postCode}`,
          type_custom: "cisco",
          config_id: null,
          from_site: m.market_code,
          timezone: m.timezone,
          listingType,
        });
      }
    }

    // Custom search URLs are used only when they map to an enabled market.
    for (const k of keywordsCustom) {
      const matchedMarket = findMarketDataFromSearchUrl(k.url, markets);
      if (!matchedMarket) continue;
      configs.push({
        data: k.url,
        type_custom: "custom",
        config_id: k.id,
        from_site: matchedMarket.market_code,
        timezone: matchedMarket.timezone,
        listingType: k.url?.includes("_sop=10") ? "FixedPrice" : "Auction",
      });
    }

    for (const store of configs) {
      const items = await scrapeWithPuppeteer(store);
      for (const item of items) {
        // Placeholder id that must never be inserted; checked first to save a query.
        if (item.id === "123456") continue;

        // Skip items that are already stored.
        const [rows] = await db.execute("SELECT id FROM items WHERE id = ?", [item.id]);
        if (rows.length > 0) continue;

        // Insert the new record through the API.
        const title = (item.title || "").replace("Opens in a new window or tab", "").trim();
        const record = { ...item, title };
        console.log(`Processing ${store.data}`);
        console.log(record);
        try {
          // NOTE(review): the API key is hard-coded in source; consider
          // loading it from the environment like the other settings.
          const res = await axios.post(`${process.env.API_DISTI_HOST}/api/items/insert`, record, {
            headers: {
              "x-key": "CanTho#1",
            },
          });
          console.log(res.data, item.id, item.timeConvert, item.time);
        } catch (err) {
          // One failed insert must not stop the rest of the run.
          console.log(err);
        }
      }
    }
  } finally {
    // Close the connection even when scraping failed. Log a close failure
    // instead of throwing, so it cannot mask the original error.
    await db.end().catch((closeErr) => {
      console.error(closeErr);
    });
  }
  process.exit(0);
}
// Run the script. Any failure that escapes main() is printed to stderr and
// the process exits with status 1.
const exitOnFailure = (failure) => {
  console.error(failure);
  process.exit(1);
};
main().catch(exitOnFailure);