Update scrap New Items
This commit is contained in:
parent
4caefde9f8
commit
36cedb761f
|
|
@ -14,6 +14,7 @@
|
||||||
"dayjs": "^1.11.18",
|
"dayjs": "^1.11.18",
|
||||||
"dotenv": "^17.2.3",
|
"dotenv": "^17.2.3",
|
||||||
"googleapis": "^166.0.0",
|
"googleapis": "^166.0.0",
|
||||||
|
"luxon": "^3.7.2",
|
||||||
"mysql2": "^3.15.2",
|
"mysql2": "^3.15.2",
|
||||||
"nodemailer": "^7.0.9",
|
"nodemailer": "^7.0.9",
|
||||||
"puppeteer": "^24.24.1",
|
"puppeteer": "^24.24.1",
|
||||||
|
|
@ -2225,6 +2226,15 @@
|
||||||
"url": "https://github.com/sponsors/wellwelwel"
|
"url": "https://github.com/sponsors/wellwelwel"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/luxon": {
|
||||||
|
"version": "3.7.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/luxon/-/luxon-3.7.2.tgz",
|
||||||
|
"integrity": "sha512-vtEhXh/gNjI9Yg1u4jX/0YVPMvxzHuGgCm6tC5kZyb08yjGWGnqAjGJvcXbqQR2P3MyMEFnRbpcdFS6PBcLqew==",
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=12"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/make-fetch-happen": {
|
"node_modules/make-fetch-happen": {
|
||||||
"version": "9.1.0",
|
"version": "9.1.0",
|
||||||
"resolved": "https://registry.npmjs.org/make-fetch-happen/-/make-fetch-happen-9.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/make-fetch-happen/-/make-fetch-happen-9.1.0.tgz",
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@
|
||||||
"dayjs": "^1.11.18",
|
"dayjs": "^1.11.18",
|
||||||
"dotenv": "^17.2.3",
|
"dotenv": "^17.2.3",
|
||||||
"googleapis": "^166.0.0",
|
"googleapis": "^166.0.0",
|
||||||
|
"luxon": "^3.7.2",
|
||||||
"mysql2": "^3.15.2",
|
"mysql2": "^3.15.2",
|
||||||
"nodemailer": "^7.0.9",
|
"nodemailer": "^7.0.9",
|
||||||
"puppeteer": "^24.24.1",
|
"puppeteer": "^24.24.1",
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,426 @@
|
||||||
|
import "dotenv/config";
|
||||||
|
import puppeteer from "puppeteer";
|
||||||
|
import axios from "axios";
|
||||||
|
import mysql from "mysql2/promise";
|
||||||
|
import { DateTime } from "luxon";
|
||||||
|
|
||||||
|
/**
 * Pause the calling async flow for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds, handed straight to `setTimeout`.
 * @returns {Promise<void>} Promise that fulfils (with no value) once the timer fires.
 */
async function wait(ms) {
  await new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
||||||
|
|
||||||
|
/**
 * Read the page's HTML, tolerating one navigation that happens mid-read.
 *
 * If `page.content()` fails because the execution context was destroyed
 * (the page navigated while we were reading), wait for the network to go
 * idle and read once more. Any other failure is rethrown unchanged.
 *
 * @param {object} page - Object exposing `content()` and `waitForNetworkIdle()`
 *   (a Puppeteer page in this file).
 * @returns {Promise<string>} The serialized HTML of the page.
 * @throws Whatever `page.content()` rejected with, when it is not the
 *   "Execution context was destroyed" error, or when the second read fails too.
 */
async function safeGetContent(page) {
  try {
    return await page.content();
  } catch (err) {
    // The rejection value is not guaranteed to be an Error; never let the
    // inspection itself throw and hide the real failure.
    const message = typeof err?.message === "string" ? err.message : "";
    if (!message.includes("Execution context was destroyed")) {
      throw err;
    }

    // A navigation happened: wait for the page to settle, then read again.
    // The idle wait is best-effort, so its own timeout is deliberately ignored.
    await page.waitForNetworkIdle({ idleTime: 1000 }).catch(() => {});
    return await page.content();
  }
}
|
||||||
|
|
||||||
|
/**
 * Parse a year-less listing date such as "Dec-9 23:05" or "9-Dec 23:05" and
 * re-express it in the machine's local time zone.
 *
 * The input carries no year, so the current calendar year is appended before
 * parsing. Despite the function name, the target zone is whatever
 * `Intl.DateTimeFormat().resolvedOptions().timeZone` reports, not a
 * hard-coded Sydney zone.
 *
 * @param {string} dateString - Date text in "MMM-d HH:mm" or "d-MMM HH:mm" form.
 * @param {string} fromTimezone - Zone the text is expressed in (passed to Luxon as `zone`).
 * @param {string} formatDate - Luxon format string used for the returned value.
 * @returns {string|null} The formatted local date, or `null` when Luxon cannot parse the input.
 */
function parseAndConvertToSydney(dateString, fromTimezone, formatDate) {
  const year = new Date().getFullYear();

  // A leading digit means day-first ("9-Dec"); otherwise month-first ("Dec-9").
  const startsWithDigit = /^\d/.test(dateString[0]);
  const pattern = startsWithDigit ? "d-MMM HH:mm" : "MMM-d HH:mm";

  const stamped = `${dateString} ${year}`;
  const parsed = DateTime.fromFormat(stamped, `${pattern} yyyy`, { zone: fromTimezone });

  if (parsed.isValid) {
    const localZone = Intl.DateTimeFormat().resolvedOptions().timeZone;
    return parsed.setZone(localZone).toFormat(formatDate);
  }

  console.log("❌ Invalid Luxon parse:", parsed.invalidReason, stamped, pattern);
  return null;
}
|
||||||
|
|
||||||
|
/**
 * Load one search-results URL in headless Chromium and extract its listing cards.
 *
 * Flow: launch browser → open `store.data` → poll up to 10 times (2 s apart)
 * until result cards appear or the "Checking your browser" interstitial goes
 * away → extract every `li.s-card--horizontal` card inside the page →
 * convert the listing date of fixed-price cards to start/end timestamps.
 *
 * @param {object} store - Scrape configuration.
 * @param {string} store.data - URL of the search page to open.
 * @param {string} [store.listingType] - "Auction" or anything else (treated as fixed price; default "FixedPrice").
 * @param {string} [store.from_site] - Market code a card must belong to in order to be kept.
 * @param {string} [store.timezone] - Zone used to interpret the card's listing date.
 * @param {string} [store.type_custom] - Copied onto each item (default "custom").
 * @param {*} [store.config_id] - Copied onto each item (default null).
 * @returns {Promise<object[]>} Extracted items; an empty array when anything fails.
 *   Errors are logged, never thrown.
 */
async function scrapeWithPuppeteer(store) {
  let browser = null;
  try {
    browser = await puppeteer.launch({
      headless: true,
      args: ["--no-sandbox", "--disable-setuid-sandbox"],
    });
    const page = await browser.newPage();
    await page.goto(store.data, { waitUntil: "networkidle2" });

    // ----- RETRY HANDLE (Cloudflare / slow load) -----
    // Each attempt either finds result cards (stop) or waits 2 s and retries.
    for (let attempt = 1; attempt <= 10; attempt++) {
      console.log(`Retry ${attempt}`);
      try {
        const html = await safeGetContent(page);
        const blocked = html.includes("Checking your browser");
        if (!blocked && (await page.$("li.s-card--horizontal"))) break;
      } catch {
        // Best effort: a failed probe just counts as one more retry.
      }
      await wait(2000);
    }

    // Still on the browser-check interstitial: give the cards one last chance to show up.
    const html = await safeGetContent(page);
    if (html.includes("Checking your browser")) {
      await page.waitForSelector("li.s-card--horizontal", { timeout: 15000 }).catch(() => null);
    }

    // ================================================
    // MAIN SCRAPING LOGIC (FULL CONVERT FROM PHP)
    // ================================================
    // The callback below is serialized and executed inside the page, so it
    // must be self-contained: no references to anything in this module.
    const items = await page.$$eval(
      "li.s-card--horizontal",
      (nodes, cfg) => {
        const results = [];

        const markets = [
          { country_name: "Australia", market_code: "EBAY_AU" },
          { country_name: "United Kingdom", market_code: "EBAY_GB" },
          { country_name: "United States", market_code: "EBAY_US" },
          { country_name: "Canada", market_code: "EBAY_ENCA" },
        ];

        // Sum a "2d 3h 5m" style duration into seconds.
        const stringToTimestamp = (str) => {
          if (!str) return 0;

          const regex = /(\d+)\s*(d|day|h|hour|m|minute|s|second)s?/gi;
          let total = 0;
          let match;

          while ((match = regex.exec(str))) {
            const value = parseInt(match[1], 10);
            const unit = match[2].toLowerCase();

            if (unit === "d" || unit === "day") total += value * 86400;
            else if (unit === "h" || unit === "hour") total += value * 3600;
            else if (unit === "m" || unit === "minute") total += value * 60;
            else if (unit === "s" || unit === "second") total += value;
          }

          return total;
        };

        nodes.forEach((node) => {
          const payload = {};

          // ---------------- LINK + ID ----------------
          // A card without an image link has no item id; skip it instead of
          // throwing, which would abort extraction for the whole page.
          const linkEl = node.querySelector("div.su-media__image a");
          const href = linkEl?.href || "";
          payload.link_detail = href;
          const idMatch = href.match(/\/itm\/(\d+)/);
          if (!idMatch) return;

          payload.id = idMatch[1];

          // ---------------- NAME ----------------
          const titleEl = node.querySelector(".s-card__title");
          if (titleEl) {
            const name = titleEl.textContent.replace(/New\s*listing/i, "").trim();
            payload.title = name;
            payload.description = name;
          }

          // ---------------- CONDITION ----------------
          const condEl = node.querySelector(".s-card__subtitle");
          payload.condition_item = condEl?.textContent.trim() || "";

          // ---------------- IMAGE ----------------
          const pictureEl = node.querySelector("img.s-card__image");
          if (pictureEl) payload.picture = pictureEl.getAttribute("src") || "";

          // ---------------- PRICE + CURRENCY ----------------
          const priceEl = node.querySelector(".s-card__price");
          if (priceEl) {
            let text = priceEl.textContent.replace(/,/g, "").trim();
            text = text.replace("£", "GBP").replace("$", "");

            // Optional currency code before or after the amount.
            const m = text.match(/([A-Za-z]{2,4})?\s?([\d.]+)\s?([A-Za-z]{2,4})?/);
            if (m) {
              payload.currencyID = m[1] || m[3] || "AU";
              payload.price = m[2] || "";
            }
          }

          // ---------------- AUCTION / BIDS ----------------
          const bidsEl = node.querySelector(".s-card__attribute-row .su-styled-text.large");
          if (bidsEl) {
            const txt = bidsEl.textContent.trim();

            if (/bids/i.test(txt)) {
              const bid = txt.match(/\d+/);
              if (bid) payload.bidCount = bid[0];
              payload.is_auctionList = 1;
            }

            if (/Buy It Now/i.test(txt)) payload.buyItNowAvailable = true;
            if (/Best Offer/i.test(txt)) payload.makeOffer = true;
          }

          // Keep only cards of the requested listing type.
          const type = cfg.listingType || "FixedPrice";
          if (type === "Auction" && !payload.is_auctionList) return;
          if (type !== "Auction" && !payload.buyItNowAvailable && !payload.makeOffer) return;

          // ---------------- TIME LEFT (auction) ----------------
          const leftEl = node.querySelector(".s-card__time-left");
          if (leftEl) {
            const leftText = leftEl.textContent.trim();
            payload.timeLeft = leftText;
            payload.end_time = Math.floor(Date.now() / 1000 + stringToTimestamp(leftText));
          }

          // ---------------- SHIPPING & FROM SITE ----------------
          node.querySelectorAll(".s-card__attribute-row .su-styled-text.secondary.large").forEach((sp) => {
            const txt = sp.textContent || "";
            const lower = txt.toLowerCase();

            // Shipping cost: first decimal number, or 0 when free / not stated.
            if (lower.includes("delivery")) {
              const m = txt.match(/\d+\.\d+/);
              if (m) payload.shipping_cost = m[0];
              if (lower.includes("free") || !payload.shipping_cost) payload.shipping_cost = 0;
            }

            // Country the item ships from, mapped to a market code when known.
            if (lower.includes("from") || lower.includes("located in")) {
              payload.country = txt;
              for (const site of markets) {
                if (txt.includes(site.country_name)) {
                  payload.from_site = site.market_code;
                  break;
                }
              }
            }
          });

          // No country shown on the card: assume the market being scraped.
          const from_site = cfg.from_site;
          if (!payload.country) payload.from_site = from_site;

          // Country shown but unknown or different from the scraped market: drop the card.
          if ((payload.country && payload.from_site !== from_site) || (payload.country && !payload.from_site)) return;
          delete payload.country;

          // ---------------- SELLER / FEEDBACK ----------------
          node.querySelectorAll(".s-card__attribute-row .su-styled-text.primary.large").forEach((sp) => {
            const text = sp.textContent;
            // e.g. "96.9% positive (105)"
            const m = text.match(/^([\d.]+%)\s*\w*\s*\(([^)]+)\)/);
            if (m) {
              payload.feedbackPercent = m[1];
              payload.feedbackScore = m[2];
            } else {
              payload.seller = text.trim();
            }
          });

          if (!payload.seller || !payload.price) return;

          // ---------------- START TIME / END TIME (Buy It Now listing) ----------------
          // Only the raw text is captured here; conversion to timestamps is
          // done outside the page, where the date library is available.
          const dateEl = node.querySelector(".s-card__attribute-row .su-styled-text.secondary.bold.large");
          if (dateEl) {
            const text = dateEl.textContent.trim();
            // Same validity check as the PHP original: keep only text the engine can parse.
            const ts = Date.parse(text);
            if (!isNaN(ts)) {
              payload.time = text;
            }
          }

          payload.current_time = Date.now();
          payload.type_custom = cfg.type_custom || "custom";
          payload.listingType = type;
          payload.config_id = cfg.config_id || null;

          results.push(payload);
        });

        return results;
      },
      store
    );

    // PROCESS START TIME + END TIME for cards that carried a listing date.
    const results = items.map((item) => {
      if (!item.time) return item;

      const timeConvert = parseAndConvertToSydney(item.time, store.timezone, "yyyy/MM/dd HH:mm");
      const timestamp = timeConvert ? new Date(timeConvert).getTime() / 1000 : NaN;

      // Unparseable date: keep the item but do not invent epoch-0 timestamps.
      if (!Number.isFinite(timestamp)) return item;

      return {
        ...item,
        timeConvert,
        start_time: timestamp,
        end_time: timestamp + 2592000, // start + 30 days
        start_time_string: timeConvert,
      };
    });

    return results;
  } catch (err) {
    console.error("Error scrapeWithPuppeteer:", err);
    return [];
  } finally {
    // Always release the Chromium process, including on failure paths.
    if (browser) {
      await browser.close().catch((closeErr) => {
        console.error("Error closing browser:", closeErr);
      });
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Find the market record a search URL belongs to.
 *
 * Matching order:
 *  1. exact host equality between the search URL and a market's `url`;
 *  2. otherwise, the market whose `url` is contained in the search URL,
 *     preferring the longest such `url`.
 *
 * Host equality is tried first, and the longest substring wins, because one
 * market URL can be a textual prefix of another ("https://www.ebay.com" vs
 * "https://www.ebay.com.au"); a first-substring-wins scan would return the
 * wrong market for the longer one.
 *
 * @param {string} searchUrl - Search URL to classify; surrounding whitespace is ignored.
 * @param {Array<{url?: string}>} marketDatas - Market records; entries without a `url` are skipped.
 * @returns {object|null} The matching market record, or `null` when the URL is
 *   empty / not a string, the list is not an array, or nothing matches.
 */
function findMarketDataFromSearchUrl(searchUrl, marketDatas) {
  if (typeof searchUrl !== "string" || !Array.isArray(marketDatas)) {
    return null;
  }

  const trimmedUrl = searchUrl.trim();
  if (!trimmedUrl) {
    return null;
  }

  // Host of an absolute URL, or null when the text is not a parseable URL.
  const hostOf = (value) => {
    try {
      return new URL(value).host;
    } catch {
      return null;
    }
  };

  const candidates = marketDatas.filter((data) => typeof data?.url === "string" && data.url);

  // 1) exact host match
  const searchHost = hostOf(trimmedUrl);
  if (searchHost) {
    const byHost = candidates.find((data) => hostOf(data.url) === searchHost);
    if (byHost) {
      return byHost;
    }
  }

  // 2) fallback: most specific (longest) market URL contained in the search URL
  let best = null;
  for (const data of candidates) {
    if (trimmedUrl.includes(data.url) && (best === null || data.url.length > best.url.length)) {
      best = data;
    }
  }

  return best;
}
|
||||||
|
|
||||||
|
/**
 * Script entry point: build the list of search pages to scrape, scrape each
 * one, and submit every item that is not already in the `items` table to the
 * insert API.
 *
 * Search pages come from two sources:
 *  - every enabled market (`ebay_site.flag = 1`) × the level-1 keywords joined
 *    with "+", once sorted for fixed-price (`_sop=10`) and once for auctions (`_sop=1`);
 *  - every custom URL in `newitems_config` that can be matched to a market.
 *
 * The MySQL connection is closed whether or not the run succeeds. On success
 * the process exits with code 0; on failure the error propagates to the caller.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const db = await mysql.createConnection({
    host: process.env.MYSQL_HOST, // e.g. '127.0.0.1' or remote IP
    user: process.env.MYSQL_USER,
    password: process.env.MYSQL_PASSWORD,
    database: process.env.MYSQL_DB_NAME,
  });

  console.log("✅ Connected to MySQL");

  try {
    const configs = [];
    const [keywords] = await db.execute("SELECT id, name FROM hot_item_keyword WHERE level = 1");
    const [keywordsCustom] = await db.execute("SELECT id, name, url FROM newitems_config");
    const [markets] = await db.execute("SELECT id, url, market_code, shipping_postcode, timezone, country_name FROM ebay_site WHERE flag = 1");

    // NOTE(review): keyword names are placed in the query string unescaped —
    // confirm they never contain characters such as "&" or "#".
    const keyWord = keywords.map((el) => el.name).join("+");

    for (const m of markets) {
      const postCode = m.shipping_postcode ? `&_stpos=${m.shipping_postcode}` : "";
      const shared = {
        type_custom: "cisco",
        config_id: null,
        from_site: m.market_code,
        timezone: m.timezone,
      };

      // URL FixedPrice (_sop=10)
      configs.push({
        data: `${m.url}/sch/i.html?_from=R40&_nkw=${keyWord}&_sacat=0&_sop=10${postCode}`,
        ...shared,
        listingType: "FixedPrice",
      });

      // URL Auction (_sop=1)
      configs.push({
        data: `${m.url}/sch/i.html?_from=R40&_nkw=${keyWord}&_sacat=0&_sop=1${postCode}`,
        ...shared,
        listingType: "Auction",
      });
    }

    // Custom URLs are only scraped when they can be tied to a known market.
    for (const k of keywordsCustom) {
      const matchedMarket = findMarketDataFromSearchUrl(k.url, markets);
      if (!matchedMarket) continue;

      configs.push({
        data: k.url,
        type_custom: "custom",
        config_id: k.id,
        from_site: matchedMarket.market_code,
        timezone: matchedMarket.timezone,
        listingType: k.url?.includes("_sop=10") ? "FixedPrice" : "Auction",
      });
    }

    console.log(`Total configs to process: ${configs.length}`);

    for (const store of configs) {
      console.log(`Processing ${store.data}`);
      const items = await scrapeWithPuppeteer(store);

      for (const item of items) {
        // Placeholder id is never inserted; check it before spending a query.
        if (item.id === "123456") continue;

        // Skip items that already exist.
        const [rows] = await db.execute("SELECT id FROM items WHERE id = ?", [item.id]);
        if (rows.length > 0) continue;

        // Insert the new record through the API.
        const title = (item.title || "").replace("Opens in a new window or tab", "").trim();
        try {
          const res = await axios.post(
            process.env.API_DISTI_HOST + "/api/items/insert",
            { ...item, title },
            {
              headers: {
                // NOTE(review): hard-coded API key committed to source —
                // consider moving it to an environment variable.
                "x-key": "CanTho#1",
              },
            }
          );
          console.log(res.data, item.id, item.timeConvert, item.time);
        } catch (err) {
          // One failed insert must not stop the remaining items.
          console.error(err);
        }
      }
    }

    console.log("✅ Done scraping.");
  } finally {
    // Release the connection on both success and failure paths.
    await db.end();
    console.log("🔌 MySQL connection closed");
  }

  process.exit(0);
}
|
||||||
|
|
||||||
|
// Run the scraper; any error that escapes main() is printed and the process
// exits with a non-zero status.
main().catch(function reportFatal(error) {
  console.error(error);
  process.exit(1);
});
|
||||||
Loading…
Reference in New Issue