258 lines
12 KiB
JavaScript
258 lines
12 KiB
JavaScript
import { access, constants, mkdir, readFile, writeFile } from "fs/promises";
|
|
import * as path from "path";
|
|
import { dirname } from "path";
|
|
import browser from "../system/browser.js";
|
|
import CONSTANTS from "../system/constants.js";
|
|
import {
|
|
extractDomain,
|
|
extractModelId,
|
|
extractNumber,
|
|
} from "../system/ultils.js";
|
|
import { ScrapModel } from "./scrap-model.js";
|
|
|
|
/**
 * Scraper for Gumtree search-result pages.
 *
 * Walks every result page of each configured search URL, extracts the listed
 * ads and stores them per keyword in `this.results`.
 *
 * Members such as `web_bid`, `scrap_config_id`, `results`, `extractUrls` and
 * `filterItemByKeyword` are not defined in this class; they are presumably
 * provided by `ScrapModel` — verify against the base class.
 */
export class GumtreeScrapModel extends ScrapModel {
  /**
   * Returns the path of the JSON file used to persist cookies for the site
   * (`<PROFILE_PATH>/<domain of web_bid.origin_url>.json`).
   *
   * As a side effect it starts a best-effort, non-blocking creation of the
   * parent directory. Code that must be sure the directory exists (see
   * `saveCookies`) awaits its own `mkdir` instead of relying on this.
   *
   * @returns {string} Cookie file path.
   */
  COOKIE_FILE_PATH = () => {
    const filePath = path.join(
      CONSTANTS.PROFILE_PATH,
      `${extractDomain(this.web_bid.origin_url)}.json`
    );

    // `mkdir` with `recursive: true` succeeds when the directory already
    // exists, so no prior existence check is needed. The rejection is handled
    // so a permission problem cannot surface as an unhandled rejection.
    mkdir(dirname(filePath), { recursive: true }).catch((err) => {
      console.warn("Could not create cookie directory:", err.message);
    });

    return filePath;
  };

  /**
   * Entry point: scrapes every configured URL sequentially and stores the
   * keyword-filtered items in `this.results[keyword]`.
   *
   * @returns {Promise<void>}
   */
  action = async () => {
    const urlsData = this.extractUrls();

    for (const item of urlsData) {
      const data = await this.scrapOnMultiplePage(item);
      const results = this.filterItemByKeyword(item.keyword, data);
      this.results[item.keyword] = results;
      console.log({ results: this.results });
    }
  };

  /**
   * Loads previously saved cookies (if any) into the given page.
   *
   * A missing or corrupt cookie file is not an error: the page is simply used
   * without stored cookies. Any other failure (e.g. permission denied, or
   * `page.setCookie` rejecting) is propagated.
   *
   * @param {object} page - Page object exposing `setCookie(...cookies)`.
   * @returns {Promise<void>}
   */
  async loadCookiesIfExist(page) {
    let cookies;

    try {
      const cookieJson = await readFile(this.COOKIE_FILE_PATH(), "utf-8");
      cookies = JSON.parse(cookieJson);
    } catch (error) {
      if (error.code === "ENOENT") {
        console.log("No cookie file found, continue without loading cookies");
        return;
      }
      if (error instanceof SyntaxError) {
        // A truncated/corrupt file must not abort the whole scrape.
        console.warn(
          "Cookie file is not valid JSON, continue without loading cookies"
        );
        return;
      }
      throw error;
    }

    if (Array.isArray(cookies) && cookies.length > 0) {
      await page.setCookie(...cookies);
      console.log("Cookies loaded to page");
    }
  }

  /**
   * Builds the URL of a given result page by inserting a `page-N` path
   * segment right after the keyword segment of `item.url`.
   *
   * The keyword is matched against both the raw and the percent-decoded path
   * segments, so keywords containing spaces or other encoded characters are
   * found as well. When the keyword is not part of the path, `item.url` is
   * returned unchanged.
   *
   * @param {{url: string, keyword: string}} item - Search definition.
   * @param {number} [current_page=1] - 1-based page number.
   * @returns {string} URL of the requested page.
   * @throws {TypeError} If `item.url` is not a valid absolute URL (only when
   *   `current_page > 1`).
   */
  buildPaginatedUrl(item, current_page = 1) {
    if (current_page <= 1) {
      return item.url;
    }

    const url = new URL(item.url);

    // Malformed percent sequences make decodeURIComponent throw; fall back to
    // the raw segment in that case.
    const decodeSegment = (segment) => {
      try {
        return decodeURIComponent(segment);
      } catch {
        return segment;
      }
    };

    // Split the path and drop the empty parts produced by leading/trailing "/".
    const parts = url.pathname.split("/").filter(Boolean);
    const keywordIndex = parts.findIndex(
      (segment) =>
        segment === item.keyword || decodeSegment(segment) === item.keyword
    );

    if (keywordIndex === -1) {
      // Fallback when the keyword does not appear in the URL path.
      return item.url;
    }

    // Insert "page-N" right after the keyword segment.
    parts.splice(keywordIndex + 1, 0, `page-${current_page}`);

    url.pathname = `/${parts.join("/")}`;
    return url.toString();
  }

  /**
   * Persists the page's cookies, merged with the ones already stored on disk.
   * Cookies are de-duplicated by `name|domain`; on conflict the cookie coming
   * from the page wins. File-system failures are logged, not thrown.
   *
   * @param {object} page - Page object exposing `cookies()`.
   * @returns {Promise<void>}
   */
  async saveCookies(page) {
    const cookies = await page.cookies();

    try {
      const cookiePath = this.COOKIE_FILE_PATH();

      let existingCookies = [];
      try {
        const parsed = JSON.parse(await readFile(cookiePath, "utf-8"));
        if (Array.isArray(parsed)) {
          existingCookies = parsed;
        }
      } catch (err) {
        // No cookie file yet is the normal first-run case; anything else is
        // worth a warning, but we still start from an empty list.
        if (err.code !== "ENOENT") {
          console.warn("Ignoring unreadable cookie file:", err.message);
        }
      }

      // Merge by name + domain; later entries (fresh page cookies) override.
      const byNameAndDomain = new Map();
      for (const cookie of [...existingCookies, ...cookies]) {
        byNameAndDomain.set(`${cookie.name}|${cookie.domain}`, cookie);
      }
      const uniqueCookies = [...byNameAndDomain.values()];

      // Make sure the target directory exists before writing.
      await mkdir(dirname(cookiePath), { recursive: true });
      await writeFile(cookiePath, JSON.stringify(uniqueCookies, null, 2));
      console.log(`✅ Saved ${uniqueCookies.length} cookies.`);
    } catch (err) {
      console.error("❌ Failed to save cookies:", err);
    }
  }

  /**
   * Scrapes `item` starting at `current_page` and continues through the
   * following result pages until the last page is reached, a page yields no
   * items, or a page fails.
   *
   * Never rejects: a failing page is logged and ends the walk, and the items
   * collected so far are returned.
   *
   * @param {{url: string, keyword: string}} item - Search definition.
   * @param {number} [current_page=1] - 1-based page to start from.
   * @returns {Promise<object[]>} Items of all visited pages, in page order.
   */
  scrapOnMultiplePage = async (item, current_page = 1) => {
    const collected = [];

    for (let pageNumber = current_page; ; pageNumber += 1) {
      const outcome = await this.#scrapPage(item, pageNumber);

      if (outcome === null) {
        break;
      }

      collected.push(...outcome.items);

      // Stop on the last page or when the page produced no results.
      if (!outcome.items.length || pageNumber >= outcome.lastPageNumber) {
        break;
      }
    }

    return collected;
  };

  /**
   * Scrapes one result page in its own browser context and always releases
   * the page and context afterwards.
   *
   * @param {{url: string, keyword: string}} item - Search definition.
   * @param {number} pageNumber - 1-based page number.
   * @returns {Promise<{items: object[], lastPageNumber: number} | null>}
   *   `null` when the page could not be scraped.
   */
  async #scrapPage(item, pageNumber) {
    let context;
    let newPage;

    try {
      context = await browser.createBrowserContext();
      newPage = await context.newPage();
      await newPage.emulateTimezone("Australia/Sydney");

      // Hide the automation flag from scripts running in the page.
      await newPage.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, "webdriver", {
          get: () => false,
        });
      });

      const url = this.buildPaginatedUrl(item, pageNumber);

      await newPage.setExtraHTTPHeaders(this.#buildRequestHeaders(item));
      await this.loadCookiesIfExist(newPage);
      await newPage.goto(url);
      await this.saveCookies(newPage);

      const items = await this.getItemsInHtml({ ...item, url }, newPage);

      console.log({ current_page: pageNumber });

      // The "last page" pagination link tells how many pages exist; without
      // it the current page is treated as the last one.
      const lastBtn = await newPage.$(
        ".page-number-navigation__link.page-number-navigation__link-last"
      );
      const lastPageNumber = lastBtn
        ? this.extractPageNumber(
            await lastBtn.evaluate((a) => a.getAttribute("href"))
          )
        : pageNumber;

      return { items, lastPageNumber };
    } catch (error) {
      console.log(error);
      return null;
    } finally {
      await this.closePageAndContext(newPage, context);
    }
  }

  /**
   * Builds the extra HTTP headers sent with every request of a page.
   *
   * NOTE(review): the `Cookie` value is a hard-coded snapshot of a captured
   * browser session. It will go stale and probably belongs in configuration
   * rather than in source control — confirm whether it is still required now
   * that cookies are persisted via `saveCookies` / `loadCookiesIfExist`.
   *
   * @param {{url: string}} item - Search definition; its URL is the Referer.
   * @returns {Record<string, string>} Header name → value.
   */
  #buildRequestHeaders(item) {
    // Kept in three chunks (joined without separator) to stay reviewable;
    // the resulting header value is a single line.
    const capturedCookie = [
      "machId=Ooi7FqYBXo3ARpf8UgUnyaqDXb8gu6gWi5RBb6AYKA5uU27aumFtkY31jSNqB-rWbMi4_YrC66nS_6ixprMIf0ZVwQv6r_llGU0q; libtg=a; bs=%7B%22st%22%3A%7B%7D%7D; afx_csid_hs=ec41291cab664d00a9a715f651fafebf8dda253152a003f; _gcl_au=1.1.442579364.1748923314; _sharedID=3bf469ac-7844-43cd-8f97-578eabfcedde; _sharedID_cst=zix7LPQsHA%3D%3D; _ga=GA1.1.1525016630.1748923314; _fbp=fb.2.1748923314432.133704941733557798; __gsas=ID=3feeb077fda5501c:T=1748923189:RT=1748923189:S=ALNI_Mbsr7RafzbystD4r52ksjRS4S23zQ; s_ecid=MCMID%7C37798711041403725812860372477407048703; AMCVS_50BE5F5858D2477A0A495C7F%40AdobeOrg=1; _lr_geo_location_state=SG; _lr_geo_location=VN; _cc_id=7e5f418613248c588934fb2345dd9bac; panoramaId_expiry=1749527989238; panoramaId=1f8fc411f0404b0c540182b8fcd04945a7023f3b737fa7dcdac189479761ed2b; panoramaIdType=panoIndiv; optimizelyEndUserId=oeu1748923315172r0.2636919564518334; DM_SitId1958=1; DM_SitId1958SecId15098=1; afx_ptpce=1; _ga_TC0NXL1S6B=deleted; _lr_env_src_ats=false; uuid=410C8105-3511-44FC-998F-499E8D02C682; aam_tnt=aamsegid%3D6797281%2Caamsegid%3D6880889; aam_uuid=38041763738279131192814186141414595589; __utmc=160852194; __utmz=160852194.1748931762.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); DM_SitId1958SecId15099=1; afx_profile_da=1; afx_syndr=1748997579162; afx_profile_hs=%7B%22acid%22%3A%22da141af1ddce2856532f636d429ee1aaf7709fbd%22%2C%22prid%22%3A%22CBmqA1KYADpw7phCiqki9LCJKtakXP1T9daPv46II%2BCD8OxOEvqXyJN%2BLRXlRVmg2WbQMWDpvSljMflz8UN15cgfZ2SO2UMG%2FoGVoX%2BMZDZlgoJq7E4YAJOovVXxybLQd%2Bv9aeSYfoTB2wYIsD1TQJGLFeebO%2BDxTeffm2o4qhXLHj3Lj02TGn21W5GMWBiknNuosNKK8lEooKTNsCeCRw%3D%3D%22%2C%22pridsd%22%3A%222025-06-04T00%3A37%3A32.3575981%2B00%3A00%22%2C%22consent%22%3A%7B%22dnt%22%3Afalse%7D%2C%22pv%22%3A%222025.6.2.1%22%7D; ",
      "afx_profile=%7B%22acid%22%3A%22da141af1ddce2856532f636d429ee1aaf7709fbd%22%2C%22prid%22%3A%22CBmqA1KYADpw7phCiqki9LCJKtakXP1T9daPv46II%2BCD8OxOEvqXyJN%2BLRXlRVmg2WbQMWDpvSljMflz8UN15cgfZ2SO2UMG%2FoGVoX%2BMZDZlgoJq7E4YAJOovVXxybLQd%2Bv9aeSYfoTB2wYIsD1TQJGLFeebO%2BDxTeffm2o4qhXLHj3Lj02TGn21W5GMWBiknNuosNKK8lEooKTNsCeCRw%3D%3D%22%2C%22pridsd%22%3A%222025-06-04T00%3A37%3A32.3575981%2B00%3A00%22%2C%22consent%22%3A%7B%22dnt%22%3Afalse%7D%2C%22pv%22%3A%222025.6.2.1%22%7D; __utma=160852194.1525016630.1748923314.1748931762.1748999624.2; optimizelySession=0; aam_dfp=aamsegid%3D6797281%2C6880889%2C7220740%2C7333813%2C8458228%2C8458232%2C9320660%2C9448391; _lr_sampling_rate=100; kndctr_50BE5F5858D2477A0A495C7F_AdobeOrg_identity=CiYzNzc5ODcxMTA0MTQwMzcyNTgxMjg2MDM3MjQ3NzQwNzA0ODcwM1IRCOeLyJ_zMhgBKgRTR1AzMAHwAaW2t8zzMg==; PEAKHOUR_VISIT=683fff8f780a5460000003137b91a2b1; __rp_ch=683fff8f780a5460000003137b91a30f:smXduwBNl3z822Ybp00oZcPPLV3__dJEOFcljzXBmjLgXU2-xUTbqQbLyq7sX0AdFq816gpMpa1-88AVag; AMCV_50BE5F5858D2477A0A495C7F%40AdobeOrg=-408604571%7CMCMID%7C37798711041403725812860372477407048703%7CMCIDTS%7C20243%7CMCAID%7CNONE%7CMCOPTOUT-1749031987s%7CNONE%7CMCAAMLH-1749629587%7C3%7CMCAAMB-1749629587%7Cj8Odv6LonN4r3an7LhD3WZrU1bUpAkFkkiY1ncBR96t2PTI%7CvVersion%7C4.6.0%7CMCCIDH%7C-1081851458; kndctr_50BE5F5858D2477A0A495C7F_AdobeOrg_cluster=sgp3; _lr_retry_request=true; up=%7B%22upData%22%3A%22eJy1kcEOgjAQRL9m7%2B22CB4VLibe9AdKBSUibVgw8e%2FdKomYaEKMXNqXzWZmOoV4DYh1fuIT1CowqEwALs585zdviAAjFLVzvmh5bnkuEy2WzO2wS1eG7Wa3D1wyH0xXMFc2bJSmpgLidKxsK7KO2TPjoIpSSjGoyqmyHBvTEJs%2BP4GNnk4PByF0NN2ha%2FvZcr%2FLzlc0mQv1zXEUVf1e8beG%2F%2FuVjX8Z6SRgnN0BgF606w%3D%3D%22%7D; afx_dcr=; afx_orref=; __gads=ID=dd76a5311e79db68:T=1748923190:RT=1749025285:S=ALNI_MaQdPDjdG0-l54Mw-5zkEZq3ruzwA; __eoi=ID=34a32a91b325f0d8:T=1748923190:RT=1749025285:S=AA-AfjZsqCgmvwIEsjsGZWkO7Mjv; ",
      "__rtbh.lid=%7B%22eventType%22%3A%22lid%22%2C%22id%22%3A%22XCQntaL7xSSUJ3wNfloA%22%2C%22expiryDate%22%3A%222026-06-04T08%3A23%3A48.727Z%22%7D; __rtbh.uid=%7B%22eventType%22%3A%22uid%22%2C%22id%22%3A%22undefined%22%2C%22expiryDate%22%3A%222026-06-04T08%3A23%3A48.779Z%22%7D; _uetsid=7caa7710402f11f08d410f2f6e2194d2; _uetvid=7caabae0402f11f0ad477f10ec12ba4d; nol_fpid=wxumrg5hye2hhnzso9vqwynhtcfrc1748923314|1748923314929|1749025428972|1749025429412; cto_bundle=C4hywl9rRTJJVXZPJTJGMVMlMkZWYjJYS1Z3OTlmY3BwZWklMkJVRDMzUGQxZk12OGhKU2JTbzZlSG9MU2J4JTJCVG5iNndUOXhtMjJmTXpzQjhKV2RjVzNBeGJINGQ1RWxOazBUazQxQXFEek8wSGFIYUpUZzRhNWlpVmlGVTElMkZYeTZicyUyRmpkU0hFS1BQaFJ6YmgwSzM5SXo1bVdaa1lBc0FOM2NpU28xR1A3STlnVmhuR1Nlbjk4ZVBvT2ZrdUhCZVJuNzdlajNpZzg4bGxxZWlZQkNoVmpLRjBhU25wJTJGVnclM0QlM0Q; cto_bidid=bTR-CV9JMmNVbnVJOTVEVWl1YWdiNEsybEFlYzR5Q2JLQnVLdjBkcU1VU1d0JTJGVkdqVSUyRnQ0QzZaem5IcVBRcUY4Vm9EVFBLJTJCa2dPU2lyWWVPbUk4YVQ1TG5tQzJGTjQ0dEp2d1pUSjZOcXNkNEltVDBvb3RmOFZJUktkVXZUS0dGQTV3Zw; _ga_TC0NXL1S6B=GS2.1.s1749024783$o6$g1$t1749025457$j60$l0$h0",
    ].join("");

    return {
      // The wildcard entry must be "*/*"; a bare "/" is not a valid media range.
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
      "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      Referer: item.url,
      "Accept-Language": "en-US,en;q=0.9",
      Connection: "keep-alive",
      "Device-Memory": "8",
      Dpr: "1",
      Priority: "u=0, i",
      Downlink: "1.7",
      Rtt: "150",
      Ect: "4g",
      Cookie: capturedCookie,
    };
  }

  /**
   * Closes the page and its browser context. Each step is best-effort:
   * a failure is logged as a warning and never thrown, so this is safe to
   * call from a `finally` block with possibly undefined arguments.
   *
   * @param {object} [newPage] - Page to close, if one was opened.
   * @param {object} [context] - Browser context to close, if one was created.
   * @returns {Promise<void>}
   */
  async closePageAndContext(newPage, context) {
    try {
      if (newPage && !newPage.isClosed?.()) {
        await newPage.close();
      }
    } catch (err) {
      console.warn("newPage close error:", err.message);
    }

    try {
      await context?.close();
    } catch (err) {
      console.warn("context close error:", err.message);
    }
  }

  /**
   * Extracts the number N from the first `page-N` occurrence in a URL.
   *
   * @param {string | null | undefined} url - URL or href to inspect.
   * @returns {number} The page number, or 1 when none is found or `url` is
   *   not a string (e.g. the link had no `href` attribute).
   */
  extractPageNumber(url) {
    if (typeof url !== "string") {
      return 1;
    }

    const match = url.match(/page-(\d+)/);
    return match ? Number.parseInt(match[1], 10) : 1;
  }

  /**
   * Extracts the ads listed on an already loaded result page.
   *
   * Each candidate anchor is scrolled into view and given a short pause
   * before it is read — presumably so lazily loaded thumbnails get their
   * `src`; confirm before changing the delay.
   *
   * @param {{keyword: string}} data - Search definition of the current page.
   * @param {object} page - Loaded page exposing `$$(selector)`.
   * @returns {Promise<object[]>} One entry per anchor that has an `href`.
   */
  getItemsInHtml = async (data, page) => {
    const SCROLL_SETTLE_MS = 800;
    const ABSOLUTE_URL = /^https?:\/\//i;

    const elements = await page.$$(
      "#react-root > div > div:nth-child(2) > div > div:nth-child(2) > main > section > div:first-child > div a"
    );

    const results = [];

    for (const el of elements) {
      // Scroll to the element before reading it.
      await el.evaluate((node) =>
        node.scrollIntoView({ behavior: "smooth", block: "center" })
      );

      await new Promise((resolve) => setTimeout(resolve, SCROLL_SETTLE_MS));

      const href = await el
        .evaluate((a) => a.getAttribute("href"))
        .catch(() => null);

      // Anchors without a target cannot become items; skip the DOM queries.
      if (!href) {
        continue;
      }

      const imageUrl = await el
        .$eval("img.user-ad-image__thumbnail", (img) => {
          const src = img.getAttribute("src") || img.getAttribute("data-src");
          return src && src.trim() !== "" ? src : null;
        })
        .catch(() => null);

      const name = await el
        .$eval(".user-ad-row-new-design__title-span", (node) =>
          node.textContent.trim()
        )
        .catch(() => null);

      const current_price = await el
        .$eval(".user-ad-price-new-design__price", (node) =>
          node.textContent.trim()
        )
        .catch(() => null);

      // Relative hrefs are prefixed with the site origin (unchanged format);
      // an href that is already absolute is used as-is instead of being
      // glued behind the origin.
      const itemUrl = ABSOLUTE_URL.test(href)
        ? href
        : `${this.web_bid.origin_url}${href}`;

      results.push({
        url: itemUrl,
        image_url: imageUrl,
        name,
        keyword: data.keyword,
        model: extractModelId(itemUrl),
        current_price: extractNumber(current_price),
        scrap_config_id: this.scrap_config_id,
      });
    }

    return results;
  };
}
|