Update index.js

This commit is contained in:
nguyentrungthat 2025-10-22 11:01:11 +07:00
parent 5237ceaba1
commit 703fbcffe7
1 changed file with 100 additions and 41 deletions

141
index.js
View File

@ -3,11 +3,11 @@ import puppeteer from "puppeteer";
import axios from "axios"; import axios from "axios";
import * as cheerio from "cheerio"; import * as cheerio from "cheerio";
import nodemailer from "nodemailer"; import nodemailer from "nodemailer";
import path from "path";
import dayjs from "dayjs"; import dayjs from "dayjs";
import mysql from "mysql2/promise"; import mysql from "mysql2/promise";
const EMAILS = ["andrew.ng@apactech.io", "dev@apactech.io"]; // const EMAILS = ["andrew.ng@apactech.io", "dev@apactech.io"];
const EMAILS = ["andrew.ng@apactech.io"];
const LIST_STORE = [ const LIST_STORE = [
{ {
@ -28,7 +28,12 @@ const LIST_STORE = [
}, },
]; ];
async function scrapeWithPuppeteer(url) { // Define function promise waiting for a given time
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds, handed straight to setTimeout.
 * @returns {Promise<void>} Settles (with no value) once the timer fires.
 */
async function wait(ms) {
  const timer = new Promise((done) => {
    setTimeout(done, ms);
  });
  return timer;
}
async function scrapeWithPuppeteer(url, name) {
try { try {
const browser = await puppeteer.launch({ const browser = await puppeteer.launch({
headless: true, headless: true,
@ -37,13 +42,36 @@ async function scrapeWithPuppeteer(url) {
const page = await browser.newPage(); const page = await browser.newPage();
await page.goto(url, { waitUntil: "networkidle2" }); await page.goto(url, { waitUntil: "networkidle2" });
let retries = 0;
while (retries < 10) {
try {
console.log(`${name} retry ${retries + 1}`);
const html = await page.content();
// Detect Cloudflare or other blocking messages
if (html.includes("Checking your browser")) {
await wait(5000);
retries++;
continue;
}
const hasItems = await page.$("li.s-card--horizontal");
if (hasItems) break;
await wait(2000);
retries++;
} catch (err) {
await wait(2000);
retries++;
}
}
// Wait for cards or detect Cloudflare // Wait for cards or detect Cloudflare
const html = await page.content(); const html = await page.content();
const needBrowserCheck = html.includes("Checking your browser"); const needBrowserCheck = html.includes("Checking your browser");
if (needBrowserCheck) { if (needBrowserCheck) {
await page.waitForSelector("li.s-card--horizontal", { timeout: 15000 }).catch(() => null); await page.waitForSelector("li.s-card--horizontal", { timeout: 15000 }).catch(() => null);
} }
const items = await page.$$eval("li.s-card--horizontal", (nodes) => { const items = await page.$$eval("li.s-card--horizontal", (nodes) => {
const results = []; const results = [];
nodes.forEach((node) => { nodes.forEach((node) => {
@ -77,14 +105,14 @@ async function scrapeWithPuppeteer(url) {
const priceEl = node.querySelector(".s-card__price"); const priceEl = node.querySelector(".s-card__price");
if (priceEl) { if (priceEl) {
let txt = priceEl.textContent.replace(",", "").trim(); let txt = priceEl.textContent.replace(",", "").trim();
txt = txt.replace("£", "GBP ").replace("$", "USD "); txt = txt.replace("£", "GBP").replace("$", "USD");
const match = txt.match(/([A-Za-z]{3})?\s?([\d.,]+)\s?([A-Za-z]{3})?/); const match = txt.match(/([A-Za-z]{3})?\s?([\d.,]+)\s?([A-Za-z]{3})?/);
if (match) { if (match) {
payload.currency = match[1] || ""; payload.currency = match[1] || match[3] || "";
payload.price = match[2] || ""; payload.price = match[2] || "";
if (!payload.currency?.trim()) payload.currency = "USD";
} }
} }
if (payload.id) results.push(payload); if (payload.id) results.push(payload);
}); });
return results; return results;
@ -131,38 +159,67 @@ async function scrapeWithCheerio(url) {
const currency = match[1] || match[3] || ""; const currency = match[1] || match[3] || "";
payload.price = match[2]; payload.price = match[2];
payload.currency = currency.replace("£", "GBP").replace("$", "USD"); payload.currency = currency.replace("£", "GBP").replace("$", "USD");
if(!payload.currency?.trim()) payload.currency = "USD"; if (!payload.currency?.trim()) payload.currency = "USD";
} }
if (payload.id) items.push(payload); if (payload.id) items.push(payload);
}); });
return items; return items;
} }
/**
 * Email a scraping report to every address in EMAILS.
 *
 * For a normal (non-error) report the sold-out items export is downloaded
 * from the API at API_DISTI_HOST and attached as an .xlsx file. Error reports
 * are sent as plain text without contacting the export API, so a failing
 * export endpoint cannot prevent the error report from going out.
 *
 * Best-effort: any failure is logged and never thrown to the caller.
 *
 * @param {string} subject - Mail subject line.
 * @param {string} body - Plain-text mail body.
 * @param {boolean} [isError=false] - When true, send without the Excel attachment.
 * @returns {Promise<void>}
 */
async function sendMail(subject, body, isError = false) {
  let transporter = null;
  try {
    const attachments = [];

    if (!isError) {
      // NOTE(review): the API key is hard-coded in source; consider moving it
      // to an environment variable alongside API_DISTI_HOST.
      const response = await axios({
        url: `${process.env.API_DISTI_HOST}/api/export/items-sold-out`,
        method: "GET",
        responseType: "arraybuffer",
        timeout: 60000,
        headers: {
          "x-key": "CanTho#1",
        },
      });

      // Attach straight from memory; nothing is written to disk.
      attachments.push({
        filename: `items_sold_out_${dayjs().format("YYYY_MM_DD")}.xlsx`,
        content: response.data,
      });
    }

    transporter = nodemailer.createTransport({
      host: process.env.MAIL_HOST,
      port: process.env.MAIL_PORT,
      secure: true,
      auth: {
        user: process.env.MAIL_USERNAME,
        pass: process.env.MAIL_PASSWORD,
      },
      connectionTimeout: 10000, // fail fast instead of hanging on a dead SMTP host
      pool: true,
    });

    await transporter.sendMail({
      from: process.env.MAIL_USERNAME,
      to: EMAILS.join(","),
      subject,
      text: body,
      attachments,
    });

    console.log(
      isError
        ? "✅ Email sent successfully!"
        : "✅ Email sent successfully with Excel attachment!"
    );
  } catch (err) {
    console.error("❌ Failed to send email:", err.message);
  } finally {
    // A pooled transport keeps its SMTP connections open until closed, which
    // would otherwise keep the process alive after the report is sent.
    transporter?.close();
  }
}
async function main() { async function main() {
// 1⃣ Connect to MySQL
const db = await mysql.createConnection({ const db = await mysql.createConnection({
host: process.env.MYSQL_HOST, // e.g. '127.0.0.1' or remote IP host: process.env.MYSQL_HOST, // e.g. '127.0.0.1' or remote IP
user: process.env.MYSQL_USER, user: process.env.MYSQL_USER,
@ -178,8 +235,7 @@ async function main() {
for (const store of LIST_STORE) { for (const store of LIST_STORE) {
console.log(`Processing ${store.name}`); console.log(`Processing ${store.name}`);
let items = await scrapeWithCheerio(store.url); let items = await scrapeWithCheerio(store.url);
if (!items) items = await scrapeWithPuppeteer(store.url); if (!items) items = await scrapeWithPuppeteer(store.url, store.name);
let count = 0; let count = 0;
for (const item of items) { for (const item of items) {
// 2⃣ Check if record exists // 2⃣ Check if record exists
@ -188,18 +244,19 @@ async function main() {
// 3⃣ Insert new record // 3⃣ Insert new record
const priceText = item.price || ""; const priceText = item.price || "";
const title = (item.name || "").replace("Opens in a new window or tab", "").trim();
const amount = parseFloat(priceText.replace(/[^\d.]/g, "").replace(/,/g, "")); const amount = parseFloat(priceText.replace(/[^\d.]/g, "").replace(/,/g, ""));
await db.execute( await db.execute(
`INSERT INTO items_sold_out (id, name, \`condition\`, price, currency, link_detail, shop_name, sold_out_date, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, `INSERT INTO items_sold_out (id, name, \`condition\`, price, currency, link_detail, shop_name, sold_out_date, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
[ [
item.id, item.id || "",
item.name, title,
item.condition, item.condition || "",
amount || 0, amount || 0,
item.currency, item.currency || "",
item.link_detail, item.link_detail || "",
store.name, store.name || "",
item.sold_out_date, item.sold_out_date || "",
dayjs().format("YYYY-MM-DD HH:mm:ss"), dayjs().format("YYYY-MM-DD HH:mm:ss"),
dayjs().format("YYYY-MM-DD HH:mm:ss"), dayjs().format("YYYY-MM-DD HH:mm:ss"),
] ]
@ -211,20 +268,22 @@ async function main() {
inserted.push({ name: store.name, count }); inserted.push({ name: store.name, count });
} }
// 4⃣ Send email report
if (errors.length > 3) { if (errors.length > 3) {
const msg = errors.map((e) => `URL: ${e.url}\nMessage: ${e.message}`).join("\n\n"); const msg = errors.map((e) => `URL: ${e.url}\nMessage: ${e.message}`).join("\n\n");
await sendMail("[New Items] - Scraping Sold Out Error Report", msg); await sendMail("[New Items] - Scraping Sold Out Error Report", msg, true);
} else { } else {
const msg = inserted.map((i) => `Shop: ${i.name}\nInserted: ${i.count}`).join("\n\n"); const msg = inserted.map((i) => `Shop: ${i.name}\nSold: ${i.count} items`).join("\n\n");
await sendMail("[New Items] - Scraping Sold Out Success", msg); await sendMail("[New Items] - Scraping Sold Out Success", msg);
} }
console.log("✅ Done scraping."); console.log("✅ Done scraping.");
// 5⃣ Close MySQL connection
await db.end(); await db.end();
console.log("🔌 MySQL connection closed"); console.log("🔌 MySQL connection closed");
process.exit(0);
} }
main().catch((err) => console.error(err)); main().catch((err) => {
console.error(err);
process.exit(1);
});