import { access, constants, mkdir, readFile, writeFile } from "fs/promises";
import * as path from "path";
import { dirname } from "path";
import browser from "../system/browser.js";
import CONSTANTS from "../system/constants.js";
import {
  extractDomain,
  extractModelId,
  extractNumber,
} from "../system/ultils.js";
import { ScrapModel } from "./scrap-model.js";

/** Selector of the "last page" link in the pagination bar. */
const LAST_PAGE_LINK_SELECTOR =
  ".page-number-navigation__link.page-number-navigation__link-last";

/** Selector of the anchor elements that wrap each listing on a result page. */
const LISTING_LINK_SELECTOR =
  "#react-root > div > div:nth-child(2) > div > div:nth-child(2) > main > section > div:first-child > div a";

/**
 * Cookie pairs sent in the hard-coded `Cookie` request header.
 *
 * They are kept as a list and joined with "; " so the resulting header value
 * is a single line (an HTTP header value must not contain line breaks).
 *
 * NOTE(review): these are captured session cookies committed to source. They
 * will expire and arguably belong in configuration — confirm with the owner.
 */
const REQUEST_COOKIE_PAIRS = Object.freeze([
  "machId=Ooi7FqYBXo3ARpf8UgUnyaqDXb8gu6gWi5RBb6AYKA5uU27aumFtkY31jSNqB-rWbMi4_YrC66nS_6ixprMIf0ZVwQv6r_llGU0q",
  "libtg=a",
  "bs=%7B%22st%22%3A%7B%7D%7D",
  "afx_csid_hs=ec41291cab664d00a9a715f651fafebf8dda253152a003f",
  "_gcl_au=1.1.442579364.1748923314",
  "_sharedID=3bf469ac-7844-43cd-8f97-578eabfcedde",
  "_sharedID_cst=zix7LPQsHA%3D%3D",
  "_ga=GA1.1.1525016630.1748923314",
  "_fbp=fb.2.1748923314432.133704941733557798",
  "__gsas=ID=3feeb077fda5501c:T=1748923189:RT=1748923189:S=ALNI_Mbsr7RafzbystD4r52ksjRS4S23zQ",
  "s_ecid=MCMID%7C37798711041403725812860372477407048703",
  "AMCVS_50BE5F5858D2477A0A495C7F%40AdobeOrg=1",
  "_lr_geo_location_state=SG",
  "_lr_geo_location=VN",
  "_cc_id=7e5f418613248c588934fb2345dd9bac",
  "panoramaId_expiry=1749527989238",
  "panoramaId=1f8fc411f0404b0c540182b8fcd04945a7023f3b737fa7dcdac189479761ed2b",
  "panoramaIdType=panoIndiv",
  "optimizelyEndUserId=oeu1748923315172r0.2636919564518334",
  "DM_SitId1958=1",
  "DM_SitId1958SecId15098=1",
  "afx_ptpce=1",
  "_ga_TC0NXL1S6B=deleted",
  "_lr_env_src_ats=false",
  "uuid=410C8105-3511-44FC-998F-499E8D02C682",
  "aam_tnt=aamsegid%3D6797281%2Caamsegid%3D6880889",
  "aam_uuid=38041763738279131192814186141414595589",
  "__utmc=160852194",
  "__utmz=160852194.1748931762.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
  "DM_SitId1958SecId15099=1",
  "afx_profile_da=1",
  "afx_syndr=1748997579162",
  "afx_profile_hs=%7B%22acid%22%3A%22da141af1ddce2856532f636d429ee1aaf7709fbd%22%2C%22prid%22%3A%22CBmqA1KYADpw7phCiqki9LCJKtakXP1T9daPv46II%2BCD8OxOEvqXyJN%2BLRXlRVmg2WbQMWDpvSljMflz8UN15cgfZ2SO2UMG%2FoGVoX%2BMZDZlgoJq7E4YAJOovVXxybLQd%2Bv9aeSYfoTB2wYIsD1TQJGLFeebO%2BDxTeffm2o4qhXLHj3Lj02TGn21W5GMWBiknNuosNKK8lEooKTNsCeCRw%3D%3D%22%2C%22pridsd%22%3A%222025-06-04T00%3A37%3A32.3575981%2B00%3A00%22%2C%22consent%22%3A%7B%22dnt%22%3Afalse%7D%2C%22pv%22%3A%222025.6.2.1%22%7D",
  "afx_profile=%7B%22acid%22%3A%22da141af1ddce2856532f636d429ee1aaf7709fbd%22%2C%22prid%22%3A%22CBmqA1KYADpw7phCiqki9LCJKtakXP1T9daPv46II%2BCD8OxOEvqXyJN%2BLRXlRVmg2WbQMWDpvSljMflz8UN15cgfZ2SO2UMG%2FoGVoX%2BMZDZlgoJq7E4YAJOovVXxybLQd%2Bv9aeSYfoTB2wYIsD1TQJGLFeebO%2BDxTeffm2o4qhXLHj3Lj02TGn21W5GMWBiknNuosNKK8lEooKTNsCeCRw%3D%3D%22%2C%22pridsd%22%3A%222025-06-04T00%3A37%3A32.3575981%2B00%3A00%22%2C%22consent%22%3A%7B%22dnt%22%3Afalse%7D%2C%22pv%22%3A%222025.6.2.1%22%7D",
  "__utma=160852194.1525016630.1748923314.1748931762.1748999624.2",
  "optimizelySession=0",
  "aam_dfp=aamsegid%3D6797281%2C6880889%2C7220740%2C7333813%2C8458228%2C8458232%2C9320660%2C9448391",
  "_lr_sampling_rate=100",
  "kndctr_50BE5F5858D2477A0A495C7F_AdobeOrg_identity=CiYzNzc5ODcxMTA0MTQwMzcyNTgxMjg2MDM3MjQ3NzQwNzA0ODcwM1IRCOeLyJ_zMhgBKgRTR1AzMAHwAaW2t8zzMg==",
  "PEAKHOUR_VISIT=683fff8f780a5460000003137b91a2b1",
  "__rp_ch=683fff8f780a5460000003137b91a30f:smXduwBNl3z822Ybp00oZcPPLV3__dJEOFcljzXBmjLgXU2-xUTbqQbLyq7sX0AdFq816gpMpa1-88AVag",
  "AMCV_50BE5F5858D2477A0A495C7F%40AdobeOrg=-408604571%7CMCMID%7C37798711041403725812860372477407048703%7CMCIDTS%7C20243%7CMCAID%7CNONE%7CMCOPTOUT-1749031987s%7CNONE%7CMCAAMLH-1749629587%7C3%7CMCAAMB-1749629587%7Cj8Odv6LonN4r3an7LhD3WZrU1bUpAkFkkiY1ncBR96t2PTI%7CvVersion%7C4.6.0%7CMCCIDH%7C-1081851458",
  "kndctr_50BE5F5858D2477A0A495C7F_AdobeOrg_cluster=sgp3",
  "_lr_retry_request=true",
  "up=%7B%22upData%22%3A%22eJy1kcEOgjAQRL9m7%2B22CB4VLibe9AdKBSUibVgw8e%2FdKomYaEKMXNqXzWZmOoV4DYh1fuIT1CowqEwALs585zdviAAjFLVzvmh5bnkuEy2WzO2wS1eG7Wa3D1wyH0xXMFc2bJSmpgLidKxsK7KO2TPjoIpSSjGoyqmyHBvTEJs%2BP4GNnk4PByF0NN2ha%2FvZcr%2FLzlc0mQv1zXEUVf1e8beG%2F%2FuVjX8Z6SRgnN0BgF606w%3D%3D%22%7D",
  "afx_dcr=",
  "afx_orref=",
  "__gads=ID=dd76a5311e79db68:T=1748923190:RT=1749025285:S=ALNI_MaQdPDjdG0-l54Mw-5zkEZq3ruzwA",
  "__eoi=ID=34a32a91b325f0d8:T=1748923190:RT=1749025285:S=AA-AfjZsqCgmvwIEsjsGZWkO7Mjv",
  "__rtbh.lid=%7B%22eventType%22%3A%22lid%22%2C%22id%22%3A%22XCQntaL7xSSUJ3wNfloA%22%2C%22expiryDate%22%3A%222026-06-04T08%3A23%3A48.727Z%22%7D",
  "__rtbh.uid=%7B%22eventType%22%3A%22uid%22%2C%22id%22%3A%22undefined%22%2C%22expiryDate%22%3A%222026-06-04T08%3A23%3A48.779Z%22%7D",
  "_uetsid=7caa7710402f11f08d410f2f6e2194d2",
  "_uetvid=7caabae0402f11f0ad477f10ec12ba4d",
  "nol_fpid=wxumrg5hye2hhnzso9vqwynhtcfrc1748923314|1748923314929|1749025428972|1749025429412",
  "cto_bundle=C4hywl9rRTJJVXZPJTJGMVMlMkZWYjJYS1Z3OTlmY3BwZWklMkJVRDMzUGQxZk12OGhKU2JTbzZlSG9MU2J4JTJCVG5iNndUOXhtMjJmTXpzQjhKV2RjVzNBeGJINGQ1RWxOazBUazQxQXFEek8wSGFIYUpUZzRhNWlpVmlGVTElMkZYeTZicyUyRmpkU0hFS1BQaFJ6YmgwSzM5SXo1bVdaa1lBc0FOM2NpU28xR1A3STlnVmhuR1Nlbjk4ZVBvT2ZrdUhCZVJuNzdlajNpZzg4bGxxZWlZQkNoVmpLRjBhU25wJTJGVnclM0QlM0Q",
  "cto_bidid=bTR-CV9JMmNVbnVJOTVEVWl1YWdiNEsybEFlYzR5Q2JLQnVLdjBkcU1VU1d0JTJGVkdqVSUyRnQ0QzZaem5IcVBRcUY4Vm9EVFBLJTJCa2dPU2lyWWVPbUk4YVQ1TG5tQzJGTjQ0dEp2d1pUSjZOcXNkNEltVDBvb3RmOFZJUktkVXZUS0dGQTV3Zw",
  "_ga_TC0NXL1S6B=GS2.1.s1749024783$o6$g1$t1749025457$j60$l0$h0",
]);

/**
 * Percent-decodes one URL path segment, returning it unchanged when it is
 * not valid percent-encoding.
 */
function decodePathSegment(segment) {
  try {
    return decodeURIComponent(segment);
  } catch {
    return segment;
  }
}

/**
 * Scraper for Gumtree search result pages.
 *
 * For every URL entry returned by `extractUrls()` it walks the paginated
 * result list, extracts the listings and stores the keyword-filtered items
 * in `this.results[keyword]`.
 */
export class GumtreeScrapModel extends ScrapModel {
  /**
   * Returns the path of the JSON cookie file for this site
   * (`<PROFILE_PATH>/<domain>.json`).
   *
   * As a side effect it kicks off creation of the parent directory. That
   * work is intentionally not awaited (the method stays synchronous), so a
   * caller that is about to write the file must ensure the directory itself.
   *
   * @returns {string} Cookie file path.
   */
  COOKIE_FILE_PATH = () => {
    const filePath = path.join(
      CONSTANTS.PROFILE_PATH,
      `${extractDomain(this.web_bid.origin_url)}.json`
    );
    const dir = dirname(filePath);

    // Make sure the cookie directory exists. The trailing catch keeps a
    // failing mkdir from surfacing as an unhandled promise rejection.
    access(dir, constants.F_OK)
      .catch(() => mkdir(dir, { recursive: true }))
      .catch((err) =>
        console.warn("Failed to create cookie directory:", err.message)
      );

    return filePath;
  };

  /**
   * Scrapes every configured URL and stores the filtered results per keyword.
   *
   * @returns {Promise<void>}
   */
  action = async () => {
    const urlsData = this.extractUrls();

    for (const item of urlsData) {
      const data = await this.scrapOnMultiplePage(item);
      const results = this.filterItemByKeyword(item.keyword, data);
      this.results[item.keyword] = results;
      console.log({ results: this.results });
    }
  };

  /**
   * Loads previously saved cookies into `page`, if a cookie file exists.
   *
   * A missing or unparsable cookie file is not fatal: scraping continues
   * without stored cookies. Any other error is rethrown.
   *
   * @param {object} page - Browser page exposing `setCookie(...cookies)`.
   * @returns {Promise<void>}
   */
  async loadCookiesIfExist(page) {
    let cookieJson;
    try {
      cookieJson = await readFile(this.COOKIE_FILE_PATH(), "utf-8");
    } catch (error) {
      if (error.code === "ENOENT") {
        console.log("No cookie file found, continue without loading cookies");
        return;
      }
      throw error;
    }

    let cookies;
    try {
      cookies = JSON.parse(cookieJson);
    } catch (error) {
      // A corrupt file would otherwise abort every scrape and never be
      // rewritten, because saving happens only after a successful load.
      console.warn(
        "Cookie file is not valid JSON, continue without loading cookies:",
        error.message
      );
      return;
    }

    if (Array.isArray(cookies) && cookies.length > 0) {
      await page.setCookie(...cookies);
      console.log("Cookies loaded to page");
    }
  }

  /**
   * Builds the URL of result page `current_page` by inserting a `page-N`
   * path segment right after the keyword segment of `item.url`.
   *
   * @param {{url: string, keyword: string}} item - Search entry.
   * @param {number} [current_page=1] - 1-based page number.
   * @returns {string} `item.url` for page 1 or when the keyword segment is
   *   not found in the path; otherwise the paginated URL.
   */
  buildPaginatedUrl(item, current_page = 1) {
    if (current_page <= 1) {
      return item.url;
    }

    const url = new URL(item.url);

    // Split the pathname into its non-empty segments.
    const parts = url.pathname.split("/").filter(Boolean);

    // Match the raw segment, or its decoded form so that a keyword that is
    // percent-encoded in the URL (e.g. containing a space) is still found.
    const keywordIndex = parts.findIndex(
      (part) =>
        part === item.keyword || decodePathSegment(part) === item.keyword
    );

    if (keywordIndex === -1) {
      // Fallback: the keyword is not part of the path.
      return item.url;
    }

    parts.splice(keywordIndex + 1, 0, `page-${current_page}`);
    url.pathname = `/${parts.join("/")}`;
    return url.toString();
  }

  /**
   * Merges the page's current cookies into the cookie file. Cookies are
   * de-duplicated by `name|domain`; the newest value wins.
   *
   * Failures while reading/merging/writing the file are logged, not thrown.
   *
   * @param {object} page - Browser page exposing `cookies()`.
   * @returns {Promise<void>}
   */
  async saveCookies(page) {
    const cookies = await page.cookies();

    try {
      const cookieFile = this.COOKIE_FILE_PATH();

      let existingCookies = [];
      try {
        const parsed = JSON.parse(await readFile(cookieFile, "utf-8"));
        if (Array.isArray(parsed)) {
          existingCookies = parsed;
        }
      } catch (err) {
        // No file yet is expected; anything else is worth a warning, but the
        // file is rewritten below either way.
        if (err.code !== "ENOENT") {
          console.warn("Ignoring unreadable cookie file:", err.message);
        }
      }

      // A Map keeps first-insertion order like the plain-object merge did,
      // without treating cookie data as object property names.
      const byNameAndDomain = new Map();
      for (const cookie of [...existingCookies, ...cookies]) {
        byNameAndDomain.set(`${cookie.name}|${cookie.domain}`, cookie);
      }
      const uniqueCookies = [...byNameAndDomain.values()];

      // COOKIE_FILE_PATH() only *starts* creating the directory; wait for it
      // here so the write cannot race the mkdir.
      await mkdir(dirname(cookieFile), { recursive: true });
      await writeFile(cookieFile, JSON.stringify(uniqueCookies, null, 2));
      console.log(`✅ Saved ${uniqueCookies.length} cookies.`);
    } catch (err) {
      console.error("❌ Failed to save cookies:", err);
    }
  }

  /**
   * Builds the extra HTTP headers sent with every request of a scrape.
   *
   * @param {{url: string}} item - Search entry; its URL is used as Referer.
   * @returns {Record<string, string>} Header name/value pairs.
   */
  buildRequestHeaders(item) {
    return {
      // The catch-all media range is "*/*"; a bare "/" is not valid.
      Accept:
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
      "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      Referer: item.url,
      "Accept-Language": "en-US,en;q=0.9",
      Connection: "keep-alive",
      "Device-Memory": "8",
      Dpr: "1",
      Priority: "u=0, i",
      Downlink: "1.7",
      Rtt: "150",
      Ect: "4g",
      Cookie: REQUEST_COOKIE_PAIRS.join("; "),
    };
  }

  /**
   * Scrapes `item` starting at `current_page` and keeps following the
   * pagination until a page yields no items or the last page is reached.
   *
   * A failure while loading/parsing a page is logged and ends the walk; the
   * items collected so far are returned. A failure to set up the browser
   * context/page for the FIRST page is rethrown; on later pages it is logged
   * and the items collected so far are returned.
   *
   * @param {{url: string, keyword: string}} item - Search entry.
   * @param {number} [current_page=1] - 1-based page to start from.
   * @returns {Promise<object[]>} Items of all visited pages, in page order.
   */
  scrapOnMultiplePage = async (item, current_page = 1) => {
    const collected = [];

    for (let pageNumber = current_page; ; pageNumber += 1) {
      let pageData;
      try {
        pageData = await this.scrapSinglePage(item, pageNumber);
      } catch (error) {
        if (pageNumber === current_page) {
          throw error;
        }
        console.log(error);
        break;
      }

      collected.push(...pageData.items);

      // Stop on an empty page or once the last page has been processed.
      if (!pageData.items.length || pageNumber >= pageData.lastPageNumber) {
        break;
      }
    }

    return collected;
  };

  /**
   * Scrapes one result page in its own browser context, which is always
   * closed again before returning.
   *
   * @param {{url: string, keyword: string}} item - Search entry.
   * @param {number} pageNumber - 1-based page number to load.
   * @returns {Promise<{items: object[], lastPageNumber: number}>} The page's
   *   items and the last page number advertised by the pagination bar
   *   (`pageNumber` when there is no "last" link). On a load/parse error the
   *   error is logged and `items` is empty.
   */
  async scrapSinglePage(item, pageNumber) {
    const context = await browser.createBrowserContext();
    let newPage = null;

    try {
      newPage = await context.newPage();
      await newPage.emulateTimezone("Australia/Sydney");
      // Report navigator.webdriver as false in every new document.
      await newPage.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, "webdriver", {
          get: () => false,
        });
      });

      try {
        const url = this.buildPaginatedUrl(item, pageNumber);
        await newPage.setExtraHTTPHeaders(this.buildRequestHeaders(item));
        await this.loadCookiesIfExist(newPage);
        await newPage.goto(url);
        await this.saveCookies(newPage);

        const items = await this.getItemsInHtml({ ...item, url }, newPage);
        console.log({ current_page: pageNumber });

        // Without a "last page" link, treat the current page as the last.
        const lastBtn = await newPage.$(LAST_PAGE_LINK_SELECTOR);
        const lastPageNumber = lastBtn
          ? this.extractPageNumber(
              await lastBtn.evaluate((a) => a.getAttribute("href"))
            )
          : pageNumber;

        return { items, lastPageNumber };
      } catch (error) {
        console.log(error);
        return { items: [], lastPageNumber: pageNumber };
      }
    } finally {
      // One context is opened per page; release it on every path.
      await this.closePageAndContext(newPage, context);
    }
  }

  /**
   * Closes the page and then its browser context. Close errors are logged
   * as warnings and never thrown.
   *
   * @param {object|null} newPage - Page to close; may be null/undefined.
   * @param {object|null} context - Browser context to close; may be null.
   * @returns {Promise<void>}
   */
  async closePageAndContext(newPage, context) {
    try {
      if (newPage && !newPage.isClosed?.()) {
        await newPage.close();
      }
    } catch (err) {
      console.warn("newPage close error:", err.message);
    }

    try {
      await context?.close();
    } catch (err) {
      console.warn("context close error:", err.message);
    }
  }

  /**
   * Extracts the page number from a URL containing a `page-N` segment.
   *
   * @param {string|null|undefined} url - URL or href value.
   * @returns {number} The page number, or 1 when none is present (including
   *   when `url` is null/undefined).
   */
  extractPageNumber(url) {
    const match = String(url ?? "").match(/page-(\d+)/);
    return match ? Number.parseInt(match[1], 10) : 1;
  }

  /**
   * Extracts the listings from an already loaded result page.
   *
   * @param {{keyword: string}} data - Search entry; `keyword` is copied onto
   *   every result.
   * @param {object} page - Browser page to read from.
   * @returns {Promise<object[]>} One record per listing link that has an
   *   href: `url`, `image_url`, `name`, `keyword`, `model`, `current_price`
   *   and `scrap_config_id`.
   */
  getItemsInHtml = async (data, page) => {
    const elements = await page.$$(LISTING_LINK_SELECTOR);
    const results = [];

    for (const el of elements) {
      // Scroll the element into view before reading it, then pause briefly.
      // NOTE(review): presumably this lets lazily loaded thumbnails resolve
      // their src — confirm before shortening the delay.
      await el.evaluate((node) =>
        node.scrollIntoView({ behavior: "smooth", block: "center" })
      );
      await new Promise((resolve) => setTimeout(resolve, 800));

      const href = await el
        .evaluate((a) => a.getAttribute("href"))
        .catch(() => null);

      const imageUrl = await el
        .$eval("img.user-ad-image__thumbnail", (img) => {
          const src = img.getAttribute("src") || img.getAttribute("data-src");
          return src && src.trim() !== "" ? src : null;
        })
        .catch(() => null);

      const name = await el
        .$eval(".user-ad-row-new-design__title-span", (node) =>
          node.textContent.trim()
        )
        .catch(() => null);

      const current_price = await el
        .$eval(".user-ad-price-new-design__price", (node) =>
          node.textContent.trim()
        )
        .catch(() => null);

      if (!href) {
        continue;
      }

      // Relative hrefs are prefixed with the site origin; an href that is
      // already absolute is used as is.
      const absoluteUrl = /^https?:\/\//i.test(href)
        ? href
        : `${this.web_bid.origin_url}${href}`;

      results.push({
        url: absoluteUrl,
        image_url: imageUrl,
        name,
        keyword: data.keyword,
        model: extractModelId(absoluteUrl),
        current_price: extractNumber(current_price),
        scrap_config_id: this.scrap_config_id,
      });
    }

    return results;
  };
}