// bid-tool/scrape-data-keyword/models/gumtree-scrap-model copy.js
//
// 258 lines
// 12 KiB
// JavaScript

import { access, constants, mkdir, readFile, writeFile } from "fs/promises";
import * as path from "path";
import { dirname } from "path";
import browser from "../system/browser.js";
import CONSTANTS from "../system/constants.js";
import {
extractDomain,
extractModelId,
extractNumber,
} from "../system/ultils.js";
import { ScrapModel } from "./scrap-model.js";
/**
 * Scraper for Gumtree keyword-search result pages.
 *
 * For every URL/keyword pair returned by `this.extractUrls()` it walks the
 * paginated result list in a fresh browser context per page, collects the
 * listing rows and stores the keyword-filtered items in `this.results`.
 *
 * NOTE(review): `extractUrls`, `filterItemByKeyword`, `results`, `web_bid` and
 * `scrap_config_id` are not defined in this class — presumably they come from
 * `ScrapModel`; confirm against the base class.
 */
export class GumtreeScrapModel extends ScrapModel {
  /**
   * Builds the path of the cookie file for the current site:
   * `<CONSTANTS.PROFILE_PATH>/<domain of web_bid.origin_url>.json`.
   *
   * The containing directory is created in the background (best effort); the
   * path is returned synchronously, so callers that write to it must not rely
   * on the directory already existing.
   *
   * @returns {string} Cookie file path.
   */
  COOKIE_FILE_PATH = () => {
    const filePath = path.join(
      CONSTANTS.PROFILE_PATH,
      `${extractDomain(this.web_bid.origin_url)}.json`
    );
    // `recursive: true` makes mkdir a no-op when the directory already exists,
    // so no prior existence check is needed. The promise is intentionally not
    // awaited (this getter is synchronous) but its rejection is handled so it
    // can never surface as an unhandled rejection.
    mkdir(dirname(filePath), { recursive: true }).catch((err) => {
      console.warn("Failed to create cookie directory:", err?.message ?? err);
    });
    return filePath;
  };

  /**
   * Scrapes every configured URL sequentially and stores the filtered items
   * under `this.results[keyword]`.
   *
   * @returns {Promise<void>}
   */
  action = async () => {
    const urlsData = this.extractUrls();
    for (const item of urlsData) {
      const data = await this.scrapOnMultiplePage(item);
      const results = this.filterItemByKeyword(item.keyword, data);
      this.results[item.keyword] = results;
      console.log({ results: this.results });
    }
  };

  /**
   * Loads previously saved cookies (if any) into the given page.
   *
   * A missing or unparsable cookie file is not an error: the page simply
   * starts without stored cookies. Any other failure (I/O error, `setCookie`
   * rejection) is rethrown.
   *
   * @param {object} page - Page exposing `setCookie(...cookies)`.
   * @returns {Promise<void>}
   */
  async loadCookiesIfExist(page) {
    let cookieJson;
    try {
      cookieJson = await readFile(this.COOKIE_FILE_PATH(), "utf-8");
    } catch (error) {
      if (error.code === "ENOENT") {
        console.log("No cookie file found, continue without loading cookies");
        return;
      }
      throw error;
    }

    let cookies;
    try {
      cookies = JSON.parse(cookieJson);
    } catch (error) {
      // A corrupt cookie file must not block scraping altogether.
      console.warn(
        "Cookie file is not valid JSON, continue without loading cookies:",
        error.message
      );
      return;
    }

    // Only spread real arrays; any other JSON value is ignored.
    if (Array.isArray(cookies) && cookies.length > 0) {
      await page.setCookie(...cookies);
      console.log("Cookies loaded to page");
    }
  }

  /**
   * Returns the URL of result page `current_page` for `item`.
   *
   * Page 1 is `item.url` itself. For later pages a `page-N` path segment is
   * inserted right after the path segment equal to `item.keyword` (compared
   * both raw and percent-decoded). When no such segment exists the original
   * `item.url` is returned unchanged.
   *
   * @param {{url: string, keyword: string}} item - Search URL and its keyword.
   * @param {number} [current_page=1] - 1-based page number.
   * @returns {string} URL to load for that page.
   */
  buildPaginatedUrl(item, current_page = 1) {
    if (current_page <= 1) {
      return item.url;
    }
    const url = new URL(item.url);
    // Split the pathname into its non-empty segments.
    const parts = url.pathname.split("/").filter(Boolean);
    const matchesKeyword = (segment) => {
      if (segment === item.keyword) {
        return true;
      }
      try {
        // Keywords containing spaces etc. appear percent-encoded in the path.
        return decodeURIComponent(segment) === item.keyword;
      } catch {
        return false; // malformed escape sequence: cannot be the keyword
      }
    };
    const keywordIndex = parts.findIndex(matchesKeyword);
    if (keywordIndex === -1) {
      // Fallback when the keyword is not part of the path.
      return item.url;
    }
    // Insert "page-N" right after the keyword segment.
    parts.splice(keywordIndex + 1, 0, `page-${current_page}`);
    url.pathname = "/" + parts.join("/");
    return url.toString();
  }

  /**
   * Persists the page's cookies to the cookie file, merged with the cookies
   * already stored there. Cookies are de-duplicated by `name|domain`; the
   * page's value wins over the stored one.
   *
   * Failures while reading/writing the file are logged, not thrown. A failure
   * of `page.cookies()` itself is propagated.
   *
   * @param {object} page - Page exposing `cookies()`.
   * @returns {Promise<void>}
   */
  async saveCookies(page) {
    const cookies = await page.cookies();
    try {
      const filePath = this.COOKIE_FILE_PATH();

      let existingCookies = [];
      try {
        const parsed = JSON.parse(await readFile(filePath, "utf-8"));
        if (Array.isArray(parsed)) {
          existingCookies = parsed;
        }
      } catch (err) {
        // No cookie file yet is expected; anything else is worth a warning
        // because the stored cookies are about to be replaced.
        if (err.code !== "ENOENT") {
          console.warn("Ignoring unreadable cookie file:", err.message);
        }
      }

      // Merge by name + domain; later entries (fresh cookies) overwrite.
      const byKey = new Map();
      for (const cookie of [...existingCookies, ...cookies]) {
        byKey.set(`${cookie.name}|${cookie.domain}`, cookie);
      }
      const uniqueCookies = [...byKey.values()];

      // Make sure the target directory exists before writing.
      await mkdir(dirname(filePath), { recursive: true });
      await writeFile(filePath, JSON.stringify(uniqueCookies, null, 2));
      console.log(`✅ Saved ${uniqueCookies.length} cookies.`);
    } catch (err) {
      console.error("❌ Failed to save cookies:", err);
    }
  }

  /**
   * Builds the extra HTTP headers sent with every request of a scraping page.
   *
   * NOTE(review): the Cookie value is a hard-coded browser session capture.
   * It is sent on every request the page makes and will go stale; it should
   * move to configuration / the cookie file. Kept here to preserve behavior.
   *
   * @param {{url: string}} item - Search item; its URL is used as Referer.
   * @returns {Record<string, string>} Header name → value map.
   */
  buildExtraHeaders(item) {
    // Joined from parts so the header value never contains line breaks.
    const cookieHeader = [
      "machId=Ooi7FqYBXo3ARpf8UgUnyaqDXb8gu6gWi5RBb6AYKA5uU27aumFtkY31jSNqB-rWbMi4_YrC66nS_6ixprMIf0ZVwQv6r_llGU0q; libtg=a; bs=%7B%22st%22%3A%7B%7D%7D; afx_csid_hs=ec41291cab664d00a9a715f651fafebf8dda253152a003f; _gcl_au=1.1.442579364.1748923314; _sharedID=3bf469ac-7844-43cd-8f97-578eabfcedde; _sharedID_cst=zix7LPQsHA%3D%3D; _ga=GA1.1.1525016630.1748923314; _fbp=fb.2.1748923314432.133704941733557798; __gsas=ID=3feeb077fda5501c:T=1748923189:RT=1748923189:S=ALNI_Mbsr7RafzbystD4r52ksjRS4S23zQ; s_ecid=MCMID%7C37798711041403725812860372477407048703; AMCVS_50BE5F5858D2477A0A495C7F%40AdobeOrg=1; _lr_geo_location_state=SG; _lr_geo_location=VN; _cc_id=7e5f418613248c588934fb2345dd9bac; panoramaId_expiry=1749527989238; panoramaId=1f8fc411f0404b0c540182b8fcd04945a7023f3b737fa7dcdac189479761ed2b; panoramaIdType=panoIndiv; optimizelyEndUserId=oeu1748923315172r0.2636919564518334; DM_SitId1958=1; DM_SitId1958SecId15098=1; afx_ptpce=1; _ga_TC0NXL1S6B=deleted; _lr_env_src_ats=false; uuid=410C8105-3511-44FC-998F-499E8D02C682; aam_tnt=aamsegid%3D6797281%2Caamsegid%3D6880889; aam_uuid=38041763738279131192814186141414595589; __utmc=160852194; __utmz=160852194.1748931762.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); DM_SitId1958SecId15099=1; afx_profile_da=1; afx_syndr=1748997579162; afx_profile_hs=%7B%22acid%22%3A%22da141af1ddce2856532f636d429ee1aaf7709fbd%22%2C%22prid%22%3A%22CBmqA1KYADpw7phCiqki9LCJKtakXP1T9daPv46II%2BCD8OxOEvqXyJN%2BLRXlRVmg2WbQMWDpvSljMflz8UN15cgfZ2SO2UMG%2FoGVoX%2BMZDZlgoJq7E4YAJOovVXxybLQd%2Bv9aeSYfoTB2wYIsD1TQJGLFeebO%2BDxTeffm2o4qhXLHj3Lj02TGn21W5GMWBiknNuosNKK8lEooKTNsCeCRw%3D%3D%22%2C%22pridsd%22%3A%222025-06-04T00%3A37%3A32.3575981%2B00%3A00%22%2C%22consent%22%3A%7B%22dnt%22%3Afalse%7D%2C%22pv%22%3A%222025.6.2.1%22%7D",
      "afx_profile=%7B%22acid%22%3A%22da141af1ddce2856532f636d429ee1aaf7709fbd%22%2C%22prid%22%3A%22CBmqA1KYADpw7phCiqki9LCJKtakXP1T9daPv46II%2BCD8OxOEvqXyJN%2BLRXlRVmg2WbQMWDpvSljMflz8UN15cgfZ2SO2UMG%2FoGVoX%2BMZDZlgoJq7E4YAJOovVXxybLQd%2Bv9aeSYfoTB2wYIsD1TQJGLFeebO%2BDxTeffm2o4qhXLHj3Lj02TGn21W5GMWBiknNuosNKK8lEooKTNsCeCRw%3D%3D%22%2C%22pridsd%22%3A%222025-06-04T00%3A37%3A32.3575981%2B00%3A00%22%2C%22consent%22%3A%7B%22dnt%22%3Afalse%7D%2C%22pv%22%3A%222025.6.2.1%22%7D; __utma=160852194.1525016630.1748923314.1748931762.1748999624.2; optimizelySession=0; aam_dfp=aamsegid%3D6797281%2C6880889%2C7220740%2C7333813%2C8458228%2C8458232%2C9320660%2C9448391; _lr_sampling_rate=100; kndctr_50BE5F5858D2477A0A495C7F_AdobeOrg_identity=CiYzNzc5ODcxMTA0MTQwMzcyNTgxMjg2MDM3MjQ3NzQwNzA0ODcwM1IRCOeLyJ_zMhgBKgRTR1AzMAHwAaW2t8zzMg==; PEAKHOUR_VISIT=683fff8f780a5460000003137b91a2b1; __rp_ch=683fff8f780a5460000003137b91a30f:smXduwBNl3z822Ybp00oZcPPLV3__dJEOFcljzXBmjLgXU2-xUTbqQbLyq7sX0AdFq816gpMpa1-88AVag; AMCV_50BE5F5858D2477A0A495C7F%40AdobeOrg=-408604571%7CMCMID%7C37798711041403725812860372477407048703%7CMCIDTS%7C20243%7CMCAID%7CNONE%7CMCOPTOUT-1749031987s%7CNONE%7CMCAAMLH-1749629587%7C3%7CMCAAMB-1749629587%7Cj8Odv6LonN4r3an7LhD3WZrU1bUpAkFkkiY1ncBR96t2PTI%7CvVersion%7C4.6.0%7CMCCIDH%7C-1081851458; kndctr_50BE5F5858D2477A0A495C7F_AdobeOrg_cluster=sgp3; _lr_retry_request=true; up=%7B%22upData%22%3A%22eJy1kcEOgjAQRL9m7%2B22CB4VLibe9AdKBSUibVgw8e%2FdKomYaEKMXNqXzWZmOoV4DYh1fuIT1CowqEwALs585zdviAAjFLVzvmh5bnkuEy2WzO2wS1eG7Wa3D1wyH0xXMFc2bJSmpgLidKxsK7KO2TPjoIpSSjGoyqmyHBvTEJs%2BP4GNnk4PByF0NN2ha%2FvZcr%2FLzlc0mQv1zXEUVf1e8beG%2F%2FuVjX8Z6SRgnN0BgF606w%3D%3D%22%7D; afx_dcr=; afx_orref=; __gads=ID=dd76a5311e79db68:T=1748923190:RT=1749025285:S=ALNI_MaQdPDjdG0-l54Mw-5zkEZq3ruzwA; __eoi=ID=34a32a91b325f0d8:T=1748923190:RT=1749025285:S=AA-AfjZsqCgmvwIEsjsGZWkO7Mjv",
      "__rtbh.lid=%7B%22eventType%22%3A%22lid%22%2C%22id%22%3A%22XCQntaL7xSSUJ3wNfloA%22%2C%22expiryDate%22%3A%222026-06-04T08%3A23%3A48.727Z%22%7D; __rtbh.uid=%7B%22eventType%22%3A%22uid%22%2C%22id%22%3A%22undefined%22%2C%22expiryDate%22%3A%222026-06-04T08%3A23%3A48.779Z%22%7D; _uetsid=7caa7710402f11f08d410f2f6e2194d2; _uetvid=7caabae0402f11f0ad477f10ec12ba4d; nol_fpid=wxumrg5hye2hhnzso9vqwynhtcfrc1748923314|1748923314929|1749025428972|1749025429412; cto_bundle=C4hywl9rRTJJVXZPJTJGMVMlMkZWYjJYS1Z3OTlmY3BwZWklMkJVRDMzUGQxZk12OGhKU2JTbzZlSG9MU2J4JTJCVG5iNndUOXhtMjJmTXpzQjhKV2RjVzNBeGJINGQ1RWxOazBUazQxQXFEek8wSGFIYUpUZzRhNWlpVmlGVTElMkZYeTZicyUyRmpkU0hFS1BQaFJ6YmgwSzM5SXo1bVdaa1lBc0FOM2NpU28xR1A3STlnVmhuR1Nlbjk4ZVBvT2ZrdUhCZVJuNzdlajNpZzg4bGxxZWlZQkNoVmpLRjBhU25wJTJGVnclM0QlM0Q; cto_bidid=bTR-CV9JMmNVbnVJOTVEVWl1YWdiNEsybEFlYzR5Q2JLQnVLdjBkcU1VU1d0JTJGVkdqVSUyRnQ0QzZaem5IcVBRcUY4Vm9EVFBLJTJCa2dPU2lyWWVPbUk4YVQ1TG5tQzJGTjQ0dEp2d1pUSjZOcXNkNEltVDBvb3RmOFZJUktkVXZUS0dGQTV3Zw; _ga_TC0NXL1S6B=GS2.1.s1749024783$o6$g1$t1749025457$j60$l0$h0",
    ].join("; ");

    return {
      // The wildcard media range is "*/*"; a bare "/" is not a valid range.
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
      "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      Referer: item.url,
      "Accept-Language": "en-US,en;q=0.9",
      Connection: "keep-alive",
      "Device-Memory": "8",
      Dpr: "1",
      Priority: "u=0, i",
      Downlink: "1.7",
      Rtt: "150",
      Ect: "4g",
      Cookie: cookieHeader,
    };
  }

  /**
   * Scrapes result page `current_page` of `item` and, recursively, all
   * following pages, returning the concatenated listing items.
   *
   * Each page is loaded in its own browser context, which is always closed
   * again (success or failure). A failure while scraping a page is logged and
   * yields `[]` for that page; a failure of a later page keeps the items
   * already collected. Failing to create the context/page itself rejects.
   *
   * @param {{url: string, keyword: string}} item - Search URL and its keyword.
   * @param {number} [current_page=1] - 1-based page to start from.
   * @returns {Promise<object[]>} Items as produced by `getItemsInHtml`.
   */
  scrapOnMultiplePage = async (item, current_page = 1) => {
    const context = await browser.createBrowserContext();
    let newPage;
    let result = [];
    let hasNextPage = false;

    try {
      newPage = await context.newPage();
      await newPage.emulateTimezone("Australia/Sydney");
      // Hide the automation flag before any page script runs.
      await newPage.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, "webdriver", {
          get: () => false,
        });
      });

      try {
        const url = this.buildPaginatedUrl(item, current_page);
        if (current_page > 1 && url === item.url) {
          // buildPaginatedUrl fell back to page 1: loading it again would only
          // produce duplicates of the first page.
          console.warn(
            `Cannot build URL for page ${current_page} of ${item.url}; stopping pagination`
          );
          return [];
        }

        await newPage.setExtraHTTPHeaders(this.buildExtraHeaders(item));
        await this.loadCookiesIfExist(newPage);
        await newPage.goto(url);
        await this.saveCookies(newPage);
        result = await this.getItemsInHtml({ ...item, url }, newPage);
        console.log({ current_page });

        // The "last page" link, when present, tells how many pages exist.
        const lastBtn = await newPage.$(
          ".page-number-navigation__link.page-number-navigation__link-last"
        );
        const lastPageNumber = lastBtn
          ? this.extractPageNumber(
              await lastBtn.evaluate((a) => a.getAttribute("href"))
            )
          : current_page;
        hasNextPage = result.length > 0 && current_page < lastPageNumber;
      } catch (error) {
        console.log(error);
        return [];
      }
    } finally {
      // Release the page/context before moving on to the next page.
      await this.closePageAndContext(newPage, context);
    }

    if (!hasNextPage) {
      return result;
    }

    // Recurse into the next page.
    try {
      const nextResults = await this.scrapOnMultiplePage(
        item,
        current_page + 1
      );
      return [...result, ...nextResults];
    } catch (error) {
      console.log(error);
      return result;
    }
  };

  /**
   * Closes the page and then its browser context. Never throws: close errors
   * are logged as warnings. Missing arguments are tolerated.
   *
   * @param {object} [newPage] - Page to close (skipped if already closed).
   * @param {object} [context] - Browser context to close.
   * @returns {Promise<void>}
   */
  async closePageAndContext(newPage, context) {
    try {
      if (newPage && !newPage.isClosed?.()) {
        await newPage.close();
      }
    } catch (err) {
      console.warn("newPage close error:", err.message);
    }
    try {
      await context?.close();
    } catch (err) {
      console.warn("context close error:", err.message);
    }
  }

  /**
   * Extracts the page number from a URL containing a `page-N` segment.
   *
   * @param {string|null|undefined} url - URL or href to inspect.
   * @returns {number} The page number, or 1 when none is found or `url` is
   *   not a string.
   */
  extractPageNumber(url) {
    const match = typeof url === "string" ? url.match(/page-(\d+)/) : null;
    return match ? Number.parseInt(match[1], 10) : 1;
  }

  /**
   * Collects the listing rows of the currently loaded result page.
   *
   * Every anchor matched by the result-list selector is scrolled into view,
   * then its href, thumbnail, title and price are read. Anchors without an
   * href are skipped; missing thumbnail/title/price become `null`.
   *
   * @param {{keyword: string}} data - Search item; its keyword is copied onto
   *   every result.
   * @param {object} page - Loaded page exposing `$$`.
   * @returns {Promise<object[]>} One record per listing.
   */
  getItemsInHtml = async (data, page) => {
    const elements = await page.$$(
      "#react-root > div > div:nth-child(2) > div > div:nth-child(2) > main > section > div:first-child > div a"
    );
    const results = [];
    for (const el of elements) {
      // Scroll to the element before reading it.
      await el.evaluate((node) =>
        node.scrollIntoView({ behavior: "smooth", block: "center" })
      );
      // Fixed pause after scrolling — presumably to let lazy-loaded content
      // settle; TODO confirm and replace with an explicit wait.
      await new Promise((r) => setTimeout(r, 800));

      const href = await el
        .evaluate((a) => a.getAttribute("href"))
        .catch(() => null);
      if (!href) {
        continue;
      }

      const imageUrl = await el
        .$eval("img.user-ad-image__thumbnail", (img) => {
          const src = img.getAttribute("src") || img.getAttribute("data-src");
          return src && src.trim() !== "" ? src : null;
        })
        .catch(() => null);
      const name = await el
        .$eval(".user-ad-row-new-design__title-span", (node) =>
          node.textContent.trim()
        )
        .catch(() => null);
      const current_price = await el
        .$eval(".user-ad-price-new-design__price", (node) =>
          node.textContent.trim()
        )
        .catch(() => null);

      const itemUrl = `${this.web_bid.origin_url}${href}`;
      results.push({
        url: itemUrl,
        image_url: imageUrl,
        name,
        keyword: data.keyword,
        model: extractModelId(itemUrl),
        current_price: extractNumber(current_price),
        scrap_config_id: this.scrap_config_id,
      });
    }
    return results;
  };
}