diff --git a/package-lock.json b/package-lock.json
index 78a55e3..7ac6755 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -14,6 +14,7 @@
         "dayjs": "^1.11.18",
         "dotenv": "^17.2.3",
         "googleapis": "^166.0.0",
+        "luxon": "^3.7.2",
         "mysql2": "^3.15.2",
         "nodemailer": "^7.0.9",
         "puppeteer": "^24.24.1",
@@ -2225,6 +2226,15 @@
         "url": "https://github.com/sponsors/wellwelwel"
       }
     },
+    "node_modules/luxon": {
+      "version": "3.7.2",
+      "resolved": "https://registry.npmjs.org/luxon/-/luxon-3.7.2.tgz",
+      "integrity": "sha512-vtEhXh/gNjI9Yg1u4jX/0YVPMvxzHuGgCm6tC5kZyb08yjGWGnqAjGJvcXbqQR2P3MyMEFnRbpcdFS6PBcLqew==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=12"
+      }
+    },
     "node_modules/make-fetch-happen": {
       "version": "9.1.0",
       "resolved": "https://registry.npmjs.org/make-fetch-happen/-/make-fetch-happen-9.1.0.tgz",
diff --git a/package.json b/package.json
index 7f2e075..b87616c 100644
--- a/package.json
+++ b/package.json
@@ -14,6 +14,7 @@
     "dayjs": "^1.11.18",
     "dotenv": "^17.2.3",
     "googleapis": "^166.0.0",
+    "luxon": "^3.7.2",
     "mysql2": "^3.15.2",
     "nodemailer": "^7.0.9",
     "puppeteer": "^24.24.1",
diff --git a/scrap_new_items.js b/scrap_new_items.js
new file mode 100644
index 0000000..b197941
--- /dev/null
+++ b/scrap_new_items.js
@@ -0,0 +1,465 @@
+import "dotenv/config";
+import puppeteer from "puppeteer";
+import axios from "axios";
+import mysql from "mysql2/promise";
+import { DateTime } from "luxon";
+
+// CSS selector matching one search-result card.
+const CARD_SELECTOR = "li.s-card--horizontal";
+// Text found in the HTML while the browser-check interstitial is shown.
+const BLOCK_MARKER = "Checking your browser";
+const MAX_LOAD_ATTEMPTS = 10;
+const RETRY_DELAY_MS = 2000;
+const LISTING_TIME_FORMAT = "yyyy/MM/dd HH:mm";
+// Fixed-price listings get end_time = start_time + 30 days.
+const LISTING_DURATION_SECONDS = 2592000;
+// NOTE(review): this key used to be hard-coded at the call site. The literal is
+// kept as a fallback so current deployments keep working; set API_DISTI_KEY.
+const INSERT_API_KEY = process.env.API_DISTI_KEY ?? "CanTho#1";
+
+/**
+ * Resolve after the given delay.
+ * @param {number} ms - Delay in milliseconds.
+ * @returns {Promise<void>}
+ */
+function wait(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+/**
+ * Read the page HTML, retrying once if a navigation interrupted the read.
+ * @param {import("puppeteer").Page} page - Page to read.
+ * @returns {Promise<string>} The HTML of the page.
+ */
+async function safeGetContent(page) {
+  try {
+    return await page.content();
+  } catch (err) {
+    // A navigation happened mid-read: let the page settle, then read again.
+    if (err?.message?.includes("Execution context was destroyed")) {
+      await page.waitForNetworkIdle({ idleTime: 1000 }).catch(() => {});
+      return await page.content();
+    }
+    throw err;
+  }
+}
+
+/**
+ * Parse a year-less listing date ("Dec-9 23:05" or "9-Dec 23:05") expressed in
+ * `fromTimezone` and render it in the time zone of this process.
+ *
+ * Despite the name, the target is the system zone, not a fixed Sydney zone.
+ *
+ * @param {string} dateString - Date text scraped from a card.
+ * @param {string} fromTimezone - IANA zone the text is expressed in.
+ * @param {string} formatDate - Luxon format of the returned string.
+ * @returns {string|null} The formatted date, or null when parsing fails.
+ */
+function parseAndConvertToSydney(dateString, fromTimezone, formatDate) {
+  const text = typeof dateString === "string" ? dateString.trim() : "";
+  if (!text) return null;
+
+  // Day-first strings start with a digit, month-first strings with a letter.
+  const format = /^\d/.test(text) ? "d-MMM HH:mm" : "MMM-d HH:mm";
+  const fullDateString = `${text} ${new Date().getFullYear()}`;
+
+  let dt = DateTime.fromFormat(fullDateString, `${format} yyyy`, { zone: fromTimezone });
+  if (!dt.isValid) {
+    console.log("❌ Invalid Luxon parse:", dt.invalidReason, fullDateString, format);
+    return null;
+  }
+
+  // The text carries no year. Right after New Year a December listing would
+  // land almost a year in the future, so such dates are moved back one year.
+  if (dt > DateTime.now().plus({ days: 1 })) {
+    dt = dt.minus({ years: 1 });
+  }
+
+  const systemTz = Intl.DateTimeFormat().resolvedOptions().timeZone;
+  return dt.setZone(systemTz).toFormat(formatDate);
+}
+
+/**
+ * Turn result cards into plain listing payloads.
+ *
+ * Puppeteer serializes this function and runs it inside the page, so it must
+ * not reference anything from this module; its helpers live in its own body.
+ *
+ * @param {Element[]} nodes - Card elements matched by the selector.
+ * @param {object} store - Search config (listingType, from_site, type_custom, config_id).
+ * @returns {object[]} One payload per card that passed every filter.
+ */
+function extractCards(nodes, store) {
+  const markets = [
+    { country_name: "Australia", market_code: "EBAY_AU" },
+    { country_name: "United Kingdom", market_code: "EBAY_GB" },
+    { country_name: "United States", market_code: "EBAY_US" },
+    { country_name: "Canada", market_code: "EBAY_ENCA" },
+  ];
+  const unitSeconds = { d: 86400, day: 86400, h: 3600, hour: 3600, m: 60, minute: 60, s: 1, second: 1 };
+
+  // "1d 2h" -> seconds; 0 when no duration is found.
+  const stringToTimestamp = (str) => {
+    if (!str) return 0;
+    let total = 0;
+    for (const [, value, unit] of str.matchAll(/(\d+)\s*(d|day|h|hour|m|minute|s|second)s?/gi)) {
+      total += Number.parseInt(value, 10) * unitSeconds[unit.toLowerCase()];
+    }
+    return total;
+  };
+
+  const type = store.listingType || "FixedPrice";
+  const fromSite = store.from_site;
+  const results = [];
+
+  for (const node of nodes) {
+    const payload = {};
+
+    // ---------------- LINK + ID ----------------
+    // A card without an item link is skipped. Dereferencing the missing anchor
+    // used to throw and discard every card of the page.
+    const href = node.querySelector("div.su-media__image a")?.href || "";
+    const idMatch = href.match(/\/itm\/(\d+)/);
+    if (!idMatch) continue;
+    payload.link_detail = href;
+    payload.id = idMatch[1];
+
+    // ---------------- NAME ----------------
+    const titleEl = node.querySelector(".s-card__title");
+    if (titleEl) {
+      const name = titleEl.textContent.replace(/New\s*listing/i, "").trim();
+      payload.title = name;
+      payload.description = name;
+    }
+
+    // ---------------- CONDITION ----------------
+    payload.condition_item = node.querySelector(".s-card__subtitle")?.textContent.trim() || "";
+
+    // ---------------- IMAGE ----------------
+    const pictureEl = node.querySelector("img.s-card__image");
+    if (pictureEl) payload.picture = pictureEl.getAttribute("src") || "";
+
+    // ---------------- PRICE + CURRENCY ----------------
+    const priceEl = node.querySelector(".s-card__price");
+    if (priceEl) {
+      const text = priceEl.textContent.replace(/,/g, "").trim().replace("£", "GBP").replace("$", "");
+      const m = text.match(/([A-Za-z]{2,4})?\s?([\d.]+)\s?([A-Za-z]{2,4})?/);
+      if (m) {
+        payload.currencyID = m[1] || m[3] || "AU";
+        payload.price = m[2] || "";
+      }
+    }
+
+    // ---------------- AUCTION / BIDS ----------------
+    const bidsEl = node.querySelector(".s-card__attribute-row .su-styled-text.large");
+    if (bidsEl) {
+      const txt = bidsEl.textContent.trim();
+      if (/bids/i.test(txt)) {
+        const bid = txt.match(/\d+/);
+        if (bid) payload.bidCount = bid[0];
+        payload.is_auctionList = 1;
+      }
+      if (/Buy It Now/i.test(txt)) payload.buyItNowAvailable = true;
+      if (/Best Offer/i.test(txt)) payload.makeOffer = true;
+    }
+
+    // Keep only cards of the requested listing type.
+    if (type === "Auction" && !payload.is_auctionList) continue;
+    if (type !== "Auction" && !payload.buyItNowAvailable && !payload.makeOffer) continue;
+
+    // ---------------- TIME LEFT (auction) ----------------
+    const leftEl = node.querySelector(".s-card__time-left");
+    if (leftEl) {
+      const leftText = leftEl.textContent.trim();
+      payload.timeLeft = leftText;
+      payload.end_time = Math.floor(Date.now() / 1000 + stringToTimestamp(leftText));
+    }
+
+    // ---------------- SHIPPING & FROM SITE ----------------
+    let hasCountry = false;
+    for (const sp of node.querySelectorAll(".s-card__attribute-row .su-styled-text.secondary.large")) {
+      const txt = sp.textContent || "";
+      const lower = txt.toLowerCase();
+
+      if (lower.includes("delivery")) {
+        const m = txt.match(/\d+\.\d+/);
+        if (m) payload.shipping_cost = m[0];
+        if (lower.includes("free") || !payload.shipping_cost) payload.shipping_cost = 0;
+      }
+
+      if (lower.includes("from") || lower.includes("located in")) {
+        hasCountry = true;
+        const market = markets.find((site) => txt.includes(site.country_name));
+        if (market) payload.from_site = market.market_code;
+      }
+    }
+
+    // No location line: assume the searched site. Otherwise keep the card only
+    // when its location resolves to that same site.
+    if (!hasCountry) payload.from_site = fromSite;
+    else if (!payload.from_site || payload.from_site !== fromSite) continue;
+
+    // ---------------- SELLER / FEEDBACK ----------------
+    for (const sp of node.querySelectorAll(".s-card__attribute-row .su-styled-text.primary.large")) {
+      const text = sp.textContent;
+      // e.g. "96.9% positive (105)"
+      const m = text.match(/^([\d.]+%)\s*\w*\s*\(([^)]+)\)/);
+      if (m) {
+        payload.feedbackPercent = m[1];
+        payload.feedbackScore = m[2];
+      } else {
+        payload.seller = text.trim();
+      }
+    }
+    if (!payload.seller || !payload.price) continue;
+
+    // ---------------- LISTING DATE (Buy It Now) ----------------
+    // Only the raw text is kept here; the Node side converts it with Luxon.
+    const dateEl = node.querySelector(".s-card__attribute-row .su-styled-text.secondary.bold.large");
+    if (dateEl) {
+      const text = dateEl.textContent.trim();
+      if (!Number.isNaN(Date.parse(text))) payload.time = text;
+    }
+
+    payload.current_time = Date.now();
+    payload.type_custom = store.type_custom || "custom";
+    payload.listingType = type;
+    payload.config_id = store.config_id || null;
+
+    results.push(payload);
+  }
+
+  return results;
+}
+
+/**
+ * Poll the page until result cards are present or the attempts run out.
+ * @param {import("puppeteer").Page} page - Page that was just navigated.
+ * @returns {Promise<void>}
+ */
+async function waitForCards(page) {
+  for (let attempt = 1; attempt <= MAX_LOAD_ATTEMPTS; attempt++) {
+    console.log(`Retry ${attempt}`);
+    try {
+      const html = await safeGetContent(page);
+      if (!html.includes(BLOCK_MARKER) && (await page.$(CARD_SELECTOR))) break;
+    } catch (err) {
+      // Reads can fail while the page is still navigating; report and retry.
+      console.warn(`Attempt ${attempt} failed:`, err?.message ?? err);
+    }
+    await wait(RETRY_DELAY_MS);
+  }
+
+  // Still on the interstitial after polling: allow one longer, final wait.
+  const html = await safeGetContent(page);
+  if (html.includes(BLOCK_MARKER)) {
+    await page.waitForSelector(CARD_SELECTOR, { timeout: 15000 }).catch(() => null);
+  }
+}
+
+/**
+ * Add start/end timestamps to an item that carries a scraped listing date.
+ * @param {object} item - Payload produced in the page.
+ * @param {string} timezone - IANA zone of the site the item was scraped from.
+ * @returns {object} The item, extended with time fields when `time` parses.
+ */
+function withListingTimes(item, timezone) {
+  if (!item.time) return item;
+
+  const timeConvert = parseAndConvertToSydney(item.time, timezone, LISTING_TIME_FORMAT);
+  // An unparsable date used to become epoch 0; leave the time fields unset instead.
+  if (!timeConvert) return item;
+
+  const timestamp = DateTime.fromFormat(timeConvert, LISTING_TIME_FORMAT).toSeconds();
+  return {
+    ...item,
+    timeConvert,
+    start_time: timestamp,
+    end_time: timestamp + LISTING_DURATION_SECONDS,
+    start_time_string: timeConvert,
+  };
+}
+
+/**
+ * Scrape one search URL and return the listings found on it.
+ * @param {object} store - Search config; `data` is the URL to open.
+ * @returns {Promise<object[]>} Listing payloads; empty when scraping fails.
+ */
+async function scrapeWithPuppeteer(store) {
+  let browser;
+  try {
+    browser = await puppeteer.launch({
+      headless: true,
+      args: ["--no-sandbox", "--disable-setuid-sandbox"],
+    });
+    const page = await browser.newPage();
+    await page.goto(store.data, { waitUntil: "networkidle2" });
+    await waitForCards(page);
+
+    const items = await page.$$eval(CARD_SELECTOR, extractCards, store);
+    return items.map((item) => withListingTimes(item, store.timezone));
+  } catch (err) {
+    console.error("Error scrapeWithPuppeteer:", err);
+    return [];
+  } finally {
+    // Always release Chromium, including when scraping threw.
+    await browser?.close().catch((err) => console.warn("Failed to close browser:", err));
+  }
+}
+
+/**
+ * Host part of a URL, or null when the text is not a valid absolute URL.
+ * @param {string} url - URL text.
+ * @returns {string|null}
+ */
+function hostOf(url) {
+  try {
+    return new URL(url).host;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Find the market a search URL belongs to.
+ *
+ * Hosts are compared first: a plain substring test would let
+ * "https://www.ebay.com" claim "https://www.ebay.com.au/...". The substring
+ * test only remains as a fallback when no host matches.
+ *
+ * @param {string} searchUrl - Search URL from the config table.
+ * @param {object[]} marketDatas - Market rows; each may carry a `url`.
+ * @returns {object|null} The matching market row, or null.
+ */
+function findMarketDataFromSearchUrl(searchUrl, marketDatas) {
+  const trimmedUrl = typeof searchUrl === "string" ? searchUrl.trim() : "";
+  if (!trimmedUrl) return null;
+
+  const candidates = (marketDatas ?? []).filter((data) => data?.url);
+  const searchHost = hostOf(trimmedUrl);
+  const byHost = searchHost ? candidates.find((data) => hostOf(data.url) === searchHost) : undefined;
+
+  return byHost ?? candidates.find((data) => trimmedUrl.includes(data.url)) ?? null;
+}
+
+/**
+ * Build the list of searches to run: two per market plus the custom URLs.
+ * @param {object[]} keywords - Rows with a `name`.
+ * @param {object[]} keywordsCustom - Rows with `id` and `url`.
+ * @param {object[]} markets - Market rows.
+ * @returns {object[]} Search configs consumed by scrapeWithPuppeteer.
+ */
+function buildConfigs(keywords, keywordsCustom, markets) {
+  // Encode every keyword so "&", "#" or spaces cannot break the query string.
+  const keyWord = keywords.map((el) => encodeURIComponent(el.name ?? "")).join("+");
+  // _sop=10 is used for the fixed-price search, _sop=1 for the auction search.
+  const searches = [
+    { sop: 10, listingType: "FixedPrice" },
+    { sop: 1, listingType: "Auction" },
+  ];
+  const configs = [];
+
+  for (const m of markets) {
+    const postCode = m.shipping_postcode ? `&_stpos=${encodeURIComponent(m.shipping_postcode)}` : "";
+    for (const { sop, listingType } of searches) {
+      configs.push({
+        data: `${m.url}/sch/i.html?_from=R40&_nkw=${keyWord}&_sacat=0&_sop=${sop}${postCode}`,
+        type_custom: "cisco",
+        config_id: null,
+        from_site: m.market_code,
+        timezone: m.timezone,
+        listingType,
+      });
+    }
+  }
+
+  for (const k of keywordsCustom) {
+    const matchedMarket = findMarketDataFromSearchUrl(k.url, markets);
+    if (!matchedMarket) {
+      console.warn(`Skipping custom config ${k.id}: no market matches ${k.url}`);
+      continue;
+    }
+    configs.push({
+      data: k.url,
+      type_custom: "custom",
+      config_id: k.id,
+      from_site: matchedMarket.market_code,
+      timezone: matchedMarket.timezone,
+      listingType: k.url.includes("_sop=10") ? "FixedPrice" : "Auction",
+    });
+  }
+
+  return configs;
+}
+
+/**
+ * Post an item to the insert API unless it is already stored.
+ * @param {import("mysql2/promise").Connection} db - Open connection.
+ * @param {object} item - Listing payload.
+ * @returns {Promise<void>}
+ */
+async function insertIfNew(db, item) {
+  // NOTE(review): "123456" looks like a placeholder id; confirm why it is excluded.
+  if (item.id === "123456") return;
+
+  const [rows] = await db.execute("SELECT id FROM items WHERE id = ?", [item.id]);
+  if (rows.length > 0) return;
+
+  const title = (item.title || "").replace("Opens in a new window or tab", "").trim();
+  try {
+    const res = await axios.post(
+      `${process.env.API_DISTI_HOST}/api/items/insert`,
+      { ...item, title },
+      { headers: { "x-key": INSERT_API_KEY } }
+    );
+    console.log(res.data, item.id, item.timeConvert, item.time);
+  } catch (err) {
+    // One failed insert must not stop the run.
+    console.error(err);
+  }
+}
+
+/**
+ * Load the search configs, scrape each one and insert the new items.
+ * @returns {Promise<void>}
+ */
+async function main() {
+  const db = await mysql.createConnection({
+    host: process.env.MYSQL_HOST, // e.g. '127.0.0.1' or remote IP
+    user: process.env.MYSQL_USER,
+    password: process.env.MYSQL_PASSWORD,
+    database: process.env.MYSQL_DB_NAME,
+  });
+  console.log("✅ Connected to MySQL");
+
+  try {
+    const [keywords] = await db.execute("SELECT id, name FROM hot_item_keyword WHERE level = 1");
+    const [keywordsCustom] = await db.execute("SELECT id, name, url FROM newitems_config");
+    const [markets] = await db.execute(
+      "SELECT id, url, market_code, shipping_postcode, timezone, country_name FROM ebay_site WHERE flag = 1"
+    );
+
+    const configs = buildConfigs(keywords, keywordsCustom, markets);
+    console.log(`Total configs to process: ${configs.length}`);
+
+    for (const store of configs) {
+      console.log(`Processing ${store.data}`);
+      const items = await scrapeWithPuppeteer(store);
+      for (const item of items) {
+        await insertIfNew(db, item);
+      }
+    }
+    console.log("✅ Done scraping.");
+  } finally {
+    // Close the connection even when a query or a scrape throws.
+    await db.end();
+    console.log("🔌 MySQL connection closed");
+  }
+
+  process.exit(0);
+}
+
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});