From 326c9e71adbd17cf24100bfd3fabff6669f3ebcf Mon Sep 17 00:00:00 2001 From: Admin Date: Sat, 7 Jun 2025 10:27:25 +0700 Subject: [PATCH] gumtree --- scrape-data-keyword/index.js | 4 + .../models/gumtree-scrap-model copy.js | 257 ++++++++++ .../models/gumtree-scrap-model.js | 21 + scrape-data-keyword/package-lock.json | 482 ++++++++++++++++++ scrape-data-keyword/package.json | 3 + .../services/scrap-configs-service.js | 8 + scrape-data-keyword/system/browser.js | 6 +- scrape-data-keyword/system/constants.js | 11 + scrape-data-keyword/system/ultils.js | 4 + 9 files changed, 794 insertions(+), 2 deletions(-) create mode 100644 scrape-data-keyword/models/gumtree-scrap-model copy.js create mode 100644 scrape-data-keyword/models/gumtree-scrap-model.js create mode 100644 scrape-data-keyword/system/constants.js diff --git a/scrape-data-keyword/index.js b/scrape-data-keyword/index.js index 8ca0a3c..8294c5b 100644 --- a/scrape-data-keyword/index.js +++ b/scrape-data-keyword/index.js @@ -15,6 +15,10 @@ const init = async () => { "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36" ); + await page.evaluateOnNewDocument(() => { + Object.defineProperty(navigator, "webdriver", { get: () => false }); + }); + const models = ScrapConfigsService.scrapModels(scrapConfigs, page); console.log(`Loaded ${models.length} scrape models`); diff --git a/scrape-data-keyword/models/gumtree-scrap-model copy.js b/scrape-data-keyword/models/gumtree-scrap-model copy.js new file mode 100644 index 0000000..c0453a6 --- /dev/null +++ b/scrape-data-keyword/models/gumtree-scrap-model copy.js @@ -0,0 +1,257 @@ +import { access, constants, mkdir, readFile, writeFile } from "fs/promises"; +import * as path from "path"; +import { dirname } from "path"; +import browser from "../system/browser.js"; +import CONSTANTS from "../system/constants.js"; +import { + extractDomain, + extractModelId, + extractNumber, +} from "../system/ultils.js"; +import { ScrapModel } from "./scrap-model.js"; + +export class GumtreeScrapModel extends ScrapModel { + COOKIE_FILE_PATH = () => { + const filePath = path.join( + CONSTANTS.PROFILE_PATH, + `${extractDomain(this.web_bid.origin_url)}.json` + ); + const dir = dirname(filePath); + + // Đảm bảo thư mục chứa cookie tồn tại + access(dir, constants.F_OK).catch(() => mkdir(dir, { recursive: true })); + + return filePath; + }; + action = async () => { + const urlsData = this.extractUrls(); + + for (let item of urlsData) { + const data = await this.scrapOnMultiplePage(item); + const results = this.filterItemByKeyword(item.keyword, data); + this.results[item.keyword] = results; + console.log({ results: this.results }); + } + }; + + async loadCookiesIfExist(page) { + try { + // Đọc file cookie + const cookieJson = await readFile(this.COOKIE_FILE_PATH(), "utf-8"); + const cookies = JSON.parse(cookieJson); + + if (cookies && cookies.length > 0) { + // Set cookie lên page + await page.setCookie(...cookies); + console.log("Cookies loaded to page"); + } + } catch (error) { + if (error.code === "ENOENT") { + console.log("No cookie file found, continue without loading cookies"); + } else { + throw error; + } + } + } + + buildPaginatedUrl(item, current_page = 1) { + if (current_page <= 1) { + return item.url; + } + + const url = new URL(item.url); + + // Lấy pathname và tách các phần + const parts = url.pathname.split("/").filter(Boolean); // loại bỏ phần rỗng do dấu / + const keywordIndex = parts.findIndex((p) => p === item.keyword); + + if (keywordIndex === -1) { + // fallback nếu không tìm thấy keyword trong url + return item.url; + } + + // Chèn "page-N" sau keyword + parts.splice(keywordIndex + 1, 0, `page-${current_page}`); + + // Gán lại pathname và trả về + url.pathname = "/" + parts.join("/"); + return url.toString(); + } + + async saveCookies(page) { + const cookies = await page.cookies(); + + try { + let existingCookies = []; + try { + const data = await readFile(this.COOKIE_FILE_PATH(), "utf-8"); + existingCookies = JSON.parse(data); + } catch (err) { + // Nếu chưa có file cookies thì bỏ qua + } + + // Gộp cookie theo tên + domain + const merged = [...existingCookies, ...cookies]; + const uniqueCookies = Object.values( + merged.reduce((acc, c) => { + acc[`${c.name}|${c.domain}`] = c; + return acc; + }, {}) + ); + + await writeFile( + this.COOKIE_FILE_PATH(), + JSON.stringify(uniqueCookies, null, 2) + ); + console.log(`✅ Saved ${uniqueCookies.length} cookies.`); + } catch (err) { + console.error("❌ Failed to save cookies:", err); + } + } + + scrapOnMultiplePage = async (item, current_page = 1) => { + const context = await browser.createBrowserContext(); + + const newPage = await context.newPage(); + await newPage.emulateTimezone("Australia/Sydney"); + + await newPage.evaluateOnNewDocument(() => { + Object.defineProperty(navigator, "webdriver", { + get: () => false, + }); + }); + + try { + const url = this.buildPaginatedUrl(item, current_page); + + await newPage.setExtraHTTPHeaders({ + Accept: + "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,/;q=0.8", + "User-Agent": + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + Referer: item.url, + "Accept-Language": "en-US,en;q=0.9", + Connection: "keep-alive", + "Device-Memory": "8", + Dpr: "1", + Priority: "u=0, i", + Downlink: "1.7", + Rtt: "150", + Ect: "4g", + Cookie: `machId=Ooi7FqYBXo3ARpf8UgUnyaqDXb8gu6gWi5RBb6AYKA5uU27aumFtkY31jSNqB-rWbMi4_YrC66nS_6ixprMIf0ZVwQv6r_llGU0q; libtg=a; bs=%7B%22st%22%3A%7B%7D%7D; afx_csid_hs=ec41291cab664d00a9a715f651fafebf8dda253152a003f; _gcl_au=1.1.442579364.1748923314; _sharedID=3bf469ac-7844-43cd-8f97-578eabfcedde; _sharedID_cst=zix7LPQsHA%3D%3D; _ga=GA1.1.1525016630.1748923314; _fbp=fb.2.1748923314432.133704941733557798; __gsas=ID=3feeb077fda5501c:T=1748923189:RT=1748923189:S=ALNI_Mbsr7RafzbystD4r52ksjRS4S23zQ; s_ecid=MCMID%7C37798711041403725812860372477407048703; AMCVS_50BE5F5858D2477A0A495C7F%40AdobeOrg=1; _lr_geo_location_state=SG; _lr_geo_location=VN; _cc_id=7e5f418613248c588934fb2345dd9bac; panoramaId_expiry=1749527989238; panoramaId=1f8fc411f0404b0c540182b8fcd04945a7023f3b737fa7dcdac189479761ed2b; panoramaIdType=panoIndiv; optimizelyEndUserId=oeu1748923315172r0.2636919564518334; DM_SitId1958=1; DM_SitId1958SecId15098=1; afx_ptpce=1; _ga_TC0NXL1S6B=deleted; _lr_env_src_ats=false; uuid=410C8105-3511-44FC-998F-499E8D02C682; aam_tnt=aamsegid%3D6797281%2Caamsegid%3D6880889; aam_uuid=38041763738279131192814186141414595589; __utmc=160852194; __utmz=160852194.1748931762.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); DM_SitId1958SecId15099=1; afx_profile_da=1; afx_syndr=1748997579162; afx_profile_hs=%7B%22acid%22%3A%22da141af1ddce2856532f636d429ee1aaf7709fbd%22%2C%22prid%22%3A%22CBmqA1KYADpw7phCiqki9LCJKtakXP1T9daPv46II%2BCD8OxOEvqXyJN%2BLRXlRVmg2WbQMWDpvSljMflz8UN15cgfZ2SO2UMG%2FoGVoX%2BMZDZlgoJq7E4YAJOovVXxybLQd%2Bv9aeSYfoTB2wYIsD1TQJGLFeebO%2BDxTeffm2o4qhXLHj3Lj02TGn21W5GMWBiknNuosNKK8lEooKTNsCeCRw%3D%3D%22%2C%22pridsd%22%3A%222025-06-04T00%3A37%3A32.3575981%2B00%3A00%22%2C%22consent%22%3A%7B%22dnt%22%3Afalse%7D%2C%22pv%22%3A%222025.6.2.1%22%7D; afx_profile=%7B%22acid%22%3A%22da141af1ddce2856532f636d429ee1aaf7709fbd%22%2C%22prid%22%3A%22CBmqA1KYADpw7phCiqki9LCJKtakXP1T9daPv46II%2BCD8OxOEvqXyJN%2BLRXlRVmg2WbQMWDpvSljMflz8UN15cgfZ2SO2UMG%2FoGVoX%2BMZDZlgoJq7E4YAJOovVXxybLQd%2Bv9aeSYfoTB2wYIsD1TQJGLFeebO%2BDxTeffm2o4qhXLHj3Lj02TGn21W5GMWBiknNuosNKK8lEooKTNsCeCRw%3D%3D%22%2C%22pridsd%22%3A%222025-06-04T00%3A37%3A32.3575981%2B00%3A00%22%2C%22consent%22%3A%7B%22dnt%22%3Afalse%7D%2C%22pv%22%3A%222025.6.2.1%22%7D; __utma=160852194.1525016630.1748923314.1748931762.1748999624.2; optimizelySession=0; aam_dfp=aamsegid%3D6797281%2C6880889%2C7220740%2C7333813%2C8458228%2C8458232%2C9320660%2C9448391; _lr_sampling_rate=100; kndctr_50BE5F5858D2477A0A495C7F_AdobeOrg_identity=CiYzNzc5ODcxMTA0MTQwMzcyNTgxMjg2MDM3MjQ3NzQwNzA0ODcwM1IRCOeLyJ_zMhgBKgRTR1AzMAHwAaW2t8zzMg==; PEAKHOUR_VISIT=683fff8f780a5460000003137b91a2b1; __rp_ch=683fff8f780a5460000003137b91a30f:smXduwBNl3z822Ybp00oZcPPLV3__dJEOFcljzXBmjLgXU2-xUTbqQbLyq7sX0AdFq816gpMpa1-88AVag; AMCV_50BE5F5858D2477A0A495C7F%40AdobeOrg=-408604571%7CMCMID%7C37798711041403725812860372477407048703%7CMCIDTS%7C20243%7CMCAID%7CNONE%7CMCOPTOUT-1749031987s%7CNONE%7CMCAAMLH-1749629587%7C3%7CMCAAMB-1749629587%7Cj8Odv6LonN4r3an7LhD3WZrU1bUpAkFkkiY1ncBR96t2PTI%7CvVersion%7C4.6.0%7CMCCIDH%7C-1081851458; kndctr_50BE5F5858D2477A0A495C7F_AdobeOrg_cluster=sgp3; _lr_retry_request=true; up=%7B%22upData%22%3A%22eJy1kcEOgjAQRL9m7%2B22CB4VLibe9AdKBSUibVgw8e%2FdKomYaEKMXNqXzWZmOoV4DYh1fuIT1CowqEwALs585zdviAAjFLVzvmh5bnkuEy2WzO2wS1eG7Wa3D1wyH0xXMFc2bJSmpgLidKxsK7KO2TPjoIpSSjGoyqmyHBvTEJs%2BP4GNnk4PByF0NN2ha%2FvZcr%2FLzlc0mQv1zXEUVf1e8beG%2F%2FuVjX8Z6SRgnN0BgF606w%3D%3D%22%7D; afx_dcr=; afx_orref=; __gads=ID=dd76a5311e79db68:T=1748923190:RT=1749025285:S=ALNI_MaQdPDjdG0-l54Mw-5zkEZq3ruzwA; __eoi=ID=34a32a91b325f0d8:T=1748923190:RT=1749025285:S=AA-AfjZsqCgmvwIEsjsGZWkO7Mjv; __rtbh.lid=%7B%22eventType%22%3A%22lid%22%2C%22id%22%3A%22XCQntaL7xSSUJ3wNfloA%22%2C%22expiryDate%22%3A%222026-06-04T08%3A23%3A48.727Z%22%7D; __rtbh.uid=%7B%22eventType%22%3A%22uid%22%2C%22id%22%3A%22undefined%22%2C%22expiryDate%22%3A%222026-06-04T08%3A23%3A48.779Z%22%7D; _uetsid=7caa7710402f11f08d410f2f6e2194d2; _uetvid=7caabae0402f11f0ad477f10ec12ba4d; nol_fpid=wxumrg5hye2hhnzso9vqwynhtcfrc1748923314|1748923314929|1749025428972|1749025429412; cto_bundle=C4hywl9rRTJJVXZPJTJGMVMlMkZWYjJYS1Z3OTlmY3BwZWklMkJVRDMzUGQxZk12OGhKU2JTbzZlSG9MU2J4JTJCVG5iNndUOXhtMjJmTXpzQjhKV2RjVzNBeGJINGQ1RWxOazBUazQxQXFEek8wSGFIYUpUZzRhNWlpVmlGVTElMkZYeTZicyUyRmpkU0hFS1BQaFJ6YmgwSzM5SXo1bVdaa1lBc0FOM2NpU28xR1A3STlnVmhuR1Nlbjk4ZVBvT2ZrdUhCZVJuNzdlajNpZzg4bGxxZWlZQkNoVmpLRjBhU25wJTJGVnclM0QlM0Q; cto_bidid=bTR-CV9JMmNVbnVJOTVEVWl1YWdiNEsybEFlYzR5Q2JLQnVLdjBkcU1VU1d0JTJGVkdqVSUyRnQ0QzZaem5IcVBRcUY4Vm9EVFBLJTJCa2dPU2lyWWVPbUk4YVQ1TG5tQzJGTjQ0dEp2d1pUSjZOcXNkNEltVDBvb3RmOFZJUktkVXZUS0dGQTV3Zw; _ga_TC0NXL1S6B=GS2.1.s1749024783$o6$g1$t1749025457$j60$l0$h0`, + }); + + await this.loadCookiesIfExist(newPage); + + await newPage.goto(url); + + await this.saveCookies(newPage); + + const result = await this.getItemsInHtml({ ...item, url }, newPage); + + console.log({ current_page }); + // Kiểm tra nếu là trang cuối hoặc không còn kết quả + const lastBtn = await newPage.$( + ".page-number-navigation__link.page-number-navigation__link-last" + ); + const lastPageNumber = lastBtn + ? this.extractPageNumber( + await lastBtn.evaluate((a) => a.getAttribute("href")) + ) + : current_page; + + await this.closePageAndContext(newPage, context); + + if (!result.length || current_page >= lastPageNumber) { + return result; + } + + // Đệ quy lấy trang tiếp theo + const nextResults = await this.scrapOnMultiplePage( + item, + current_page + 1 + ); + return [...result, ...nextResults]; + } catch (error) { + console.log(error); + return []; + } + }; + + async closePageAndContext(newPage, context) { + // try { + // if (newPage && !newPage.isClosed?.()) { + // await newPage.close(); + // } + // } catch (err) { + // console.warn("newPage close error:", err.message); + // } + // try { + // await context?.close(); + // } catch (err) { + // console.warn("context close error:", err.message); + // } + } + + extractPageNumber(url) { + const match = url.match(/page-(\d+)/); + return match ? parseInt(match[1], 10) : 1; // Trả về 1 nếu không tìm thấy số trang + } + + getItemsInHtml = async (data, page) => { + const elements = await page.$$( + "#react-root > div > div:nth-child(2) > div > div:nth-child(2) > main > section > div:first-child > div a" + ); + + const results = []; + + for (const el of elements) { + // Scroll tới phần tử trước khi thao tác + await el.evaluate((node) => + node.scrollIntoView({ behavior: "smooth", block: "center" }) + ); + + await new Promise((r) => setTimeout(r, 800)); + + const url = await el + .evaluate((a) => a.getAttribute("href")) + .catch(() => null); + + const { imageUrl, className } = await el + .$eval("img.user-ad-image__thumbnail", (img) => { + const src = img.getAttribute("src") || img.getAttribute("data-src"); + return { + imageUrl: src && src.trim() !== "" ? src : null, + className: img.className, // Lấy class name đầy đủ của thẻ img + }; + }) + .catch(() => ({ imageUrl: null, className: null })); + + const name = await el + .$eval(".user-ad-row-new-design__title-span", (el) => + el.textContent.trim() + ) + .catch(() => null); + + const current_price = await el + .$eval(".user-ad-price-new-design__price", (el) => + el.textContent.trim() + ) + .catch(() => null); + + if (url) { + results.push({ + url: `${this.web_bid.origin_url}${url}`, + image_url: imageUrl, + name, + keyword: data.keyword, + model: extractModelId(`${this.web_bid.origin_url}${url}`), + current_price: extractNumber(current_price), + scrap_config_id: this.scrap_config_id, + }); + } + } + + return results; + }; +} diff --git a/scrape-data-keyword/models/gumtree-scrap-model.js b/scrape-data-keyword/models/gumtree-scrap-model.js new file mode 100644 index 0000000..b8245c7 --- /dev/null +++ b/scrape-data-keyword/models/gumtree-scrap-model.js @@ -0,0 +1,21 @@ +import { ScrapModel } from "./scrap-model.js"; +import axios from "axios"; + +export class GumtreeScrapModel extends ScrapModel { + action = async () => { + const urlsData = this.extractUrls(); + + const { data } = await axios({ + method: "POST", + baseURL: process.env.WVM_API, + url: "gumtree-scrap", + timeout: 300000, + data: { + data: urlsData, + scrap_config_id: this.scrap_config_id, + }, + }); + + this.results = data; + }; +} diff --git a/scrape-data-keyword/package-lock.json b/scrape-data-keyword/package-lock.json index 1ead6fe..5dfdae7 100644 --- a/scrape-data-keyword/package-lock.json +++ b/scrape-data-keyword/package-lock.json @@ -10,8 +10,11 @@ "license": "ISC", "dependencies": { "axios": "^1.8.2", + "cheerio": "^1.0.0", "dotenv": "^16.4.7", "lodash": "^4.17.21", + "node-ssh": "^13.2.1", + "playwright": "^1.52.0", "puppeteer": "^24.4.0", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-stealth": "^2.11.2" @@ -150,6 +153,15 @@ "node": ">=0.10.0" } }, + "node_modules/asn1": { + "version": "0.2.6", + "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.6.tgz", + "integrity": "sha512-ix/FxPn0MDjeyJ7i/yoHGFt/EX6LyNbxSEhPPXODPL+KB0VPk86UYfL0lMdy+KCnv+fmvIzySwaK5COwqVbWTQ==", + "license": "MIT", + "dependencies": { + "safer-buffer": "~2.1.0" + } + }, "node_modules/ast-types": { "version": "0.13.4", "resolved": "https://registry.npmjs.org/ast-types/-/ast-types-0.13.4.tgz", @@ -272,6 +284,21 @@ "node": ">=10.0.0" } }, + "node_modules/bcrypt-pbkdf": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/bcrypt-pbkdf/-/bcrypt-pbkdf-1.0.2.tgz", + "integrity": "sha512-qeFIXtP4MSoi6NLqO12WfqARWWuCKi2Rn/9hJLEmtB5yTNr9DqFWkJRCf2qShWzPeAMRnOgCrq0sg/KLv5ES9w==", + "license": "BSD-3-Clause", + "dependencies": { + "tweetnacl": "^0.14.3" + } + }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==", + "license": "ISC" + }, "node_modules/brace-expansion": { "version": "1.1.11", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", @@ -291,6 +318,15 @@ "node": "*" } }, + "node_modules/buildcheck": { + "version": "0.0.6", + "resolved": "https://registry.npmjs.org/buildcheck/-/buildcheck-0.0.6.tgz", + "integrity": "sha512-8f9ZJCUXyT1M35Jx7MkBgmBMo3oHTTBIPLiY9xyL0pl3T5RwcPEY8cUHr5LBNfu/fk6c2T4DJZuVM/8ZZT2D2A==", + "optional": true, + "engines": { + "node": ">=10.0.0" + } + }, "node_modules/call-bind-apply-helpers": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", @@ -313,6 +349,48 @@ "node": ">=6" } }, + "node_modules/cheerio": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0.tgz", + "integrity": "sha512-quS9HgjQpdaXOvsZz82Oz7uxtXiy6UIsIQcpBj7HRw2M63Skasm9qlDocAM7jNuaxdhpPU7c4kJN+gA5MCu4ww==", + "license": "MIT", + "dependencies": { + "cheerio-select": "^2.1.0", + "dom-serializer": "^2.0.0", + "domhandler": "^5.0.3", + "domutils": "^3.1.0", + "encoding-sniffer": "^0.2.0", + "htmlparser2": "^9.1.0", + "parse5": "^7.1.2", + "parse5-htmlparser2-tree-adapter": "^7.0.0", + "parse5-parser-stream": "^7.1.2", + "undici": "^6.19.5", + "whatwg-mimetype": "^4.0.0" + }, + "engines": { + "node": ">=18.17" + }, + "funding": { + "url": "https://github.com/cheeriojs/cheerio?sponsor=1" + } + }, + "node_modules/cheerio-select": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz", + "integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0", + "css-select": "^5.1.0", + "css-what": "^6.1.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, "node_modules/chromium-bidi": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-5.1.0.tgz", @@ -418,6 +496,48 @@ } } }, + "node_modules/cpu-features": { + "version": "0.0.10", + "resolved": "https://registry.npmjs.org/cpu-features/-/cpu-features-0.0.10.tgz", + "integrity": "sha512-9IkYqtX3YHPCzoVg1Py+o9057a3i0fp7S530UWokCSaFVTc7CwXPRiOjRjBQQ18ZCNafx78YfnG+HALxtVmOGA==", + "hasInstallScript": true, + "optional": true, + "dependencies": { + "buildcheck": "~0.0.6", + "nan": "^2.19.0" + }, + "engines": { + "node": ">=10.0.0" + } + }, + "node_modules/css-select": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz", + "integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-what": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz", + "integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, "node_modules/data-uri-to-buffer": { "version": "6.0.2", "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz", @@ -482,6 +602,61 @@ "integrity": "sha512-jJF48UdryzKiWhJ1bLKr7BFWUQCEIT5uCNbDLqkQJBtkFxYzILJH44WN0PDKMIlGDN7Utb8vyUY85C3w4R/t2g==", "license": "BSD-3-Clause" }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.2", + "entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "BSD-2-Clause" + }, + "node_modules/domhandler": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "license": "BSD-2-Clause", + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz", + "integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==", + "license": "BSD-2-Clause", + "dependencies": { + "dom-serializer": "^2.0.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, "node_modules/dotenv": { "version": "16.5.0", "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.5.0.tgz", @@ -514,6 +689,19 @@ "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", "license": "MIT" }, + "node_modules/encoding-sniffer": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.0.tgz", + "integrity": "sha512-ju7Wq1kg04I3HtiYIOrUrdfdDvkyO9s5XM8QAj/bN61Yo/Vb4vgJxy5vi4Yxk01gWHbrofpPtpxM8bKger9jhg==", + "license": "MIT", + "dependencies": { + "iconv-lite": "^0.6.3", + "whatwg-encoding": "^3.1.1" + }, + "funding": { + "url": "https://github.com/fb55/encoding-sniffer?sponsor=1" + } + }, "node_modules/end-of-stream": { "version": "1.4.4", "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", @@ -523,6 +711,18 @@ "once": "^1.4.0" } }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/env-paths": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz", @@ -758,6 +958,20 @@ "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", "license": "ISC" }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/function-bind": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", @@ -920,6 +1134,25 @@ "node": ">= 0.4" } }, + "node_modules/htmlparser2": { + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz", + "integrity": "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.1.0", + "entities": "^4.5.0" + } + }, "node_modules/http-proxy-agent": { "version": "7.0.2", "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", @@ -946,6 +1179,18 @@ "node": ">= 14" } }, + "node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/import-fresh": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", @@ -1034,6 +1279,18 @@ "node": ">=0.10.0" } }, + "node_modules/is-stream": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz", + "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==", + "license": "MIT", + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/isobject": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", @@ -1127,6 +1384,30 @@ "node": ">=12" } }, + "node_modules/make-dir": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-3.1.0.tgz", + "integrity": "sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==", + "license": "MIT", + "dependencies": { + "semver": "^6.0.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/make-dir/node_modules/semver": { + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + } + }, "node_modules/math-intrinsics": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", @@ -1217,6 +1498,13 @@ "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", "license": "MIT" }, + "node_modules/nan": { + "version": "2.22.2", + "resolved": "https://registry.npmjs.org/nan/-/nan-2.22.2.tgz", + "integrity": "sha512-DANghxFkS1plDdRsX0X9pm0Z6SJNN6gBdtXfanwoZ8hooC5gosGFSBGRYHUVPz1asKA/kMRqDRdHrluZ61SpBQ==", + "license": "MIT", + "optional": true + }, "node_modules/netmask": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/netmask/-/netmask-2.0.2.tgz", @@ -1226,6 +1514,35 @@ "node": ">= 0.4.0" } }, + "node_modules/node-ssh": { + "version": "13.2.1", + "resolved": "https://registry.npmjs.org/node-ssh/-/node-ssh-13.2.1.tgz", + "integrity": "sha512-rfl4GWMygQfzlExPkQ2LWyya5n2jOBm5vhEnup+4mdw7tQhNpJWbP5ldr09Jfj93k5SfY5lxcn8od5qrQ/6mBg==", + "license": "MIT", + "dependencies": { + "is-stream": "^2.0.0", + "make-dir": "^3.1.0", + "sb-promise-queue": "^2.1.0", + "sb-scandir": "^3.1.0", + "shell-escape": "^0.2.0", + "ssh2": "^1.14.0" + }, + "engines": { + "node": ">= 10" + } + }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } + }, "node_modules/once": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", @@ -1297,6 +1614,55 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/parse5": { + "version": "7.3.0", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz", + "integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==", + "license": "MIT", + "dependencies": { + "entities": "^6.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-htmlparser2-tree-adapter": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz", + "integrity": "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==", + "license": "MIT", + "dependencies": { + "domhandler": "^5.0.3", + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-parser-stream": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz", + "integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==", + "license": "MIT", + "dependencies": { + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5/node_modules/entities": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.0.tgz", + "integrity": "sha512-aKstq2TDOndCn4diEyp9Uq/Flu2i1GlLkc6XIDQSDMuaFE3OPW5OphLCyQ5SpSJZTb4reN+kTcYru5yIfXoRPw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/path-is-absolute": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", @@ -1318,6 +1684,36 @@ "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", "license": "ISC" }, + "node_modules/playwright": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.52.0.tgz", + "integrity": "sha512-JAwMNMBlxJ2oD1kce4KPtMkDeKGHQstdpFPcPH3maElAXon/QZeTvtsfXmTMRyO9TslfoYOXkSsvao2nE1ilTw==", + "license": "Apache-2.0", + "dependencies": { + "playwright-core": "1.52.0" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.52.0.tgz", + "integrity": "sha512-l2osTgLXSMeuLZOML9qYODUQoPPnUsKsb5/P6LJ2e6uPKXUdPK5WYhN4z03G+YNbWmGDY4YENauNu4ZKczreHg==", + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/progress": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", @@ -1570,6 +1966,33 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", + "license": "MIT" + }, + "node_modules/sb-promise-queue": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/sb-promise-queue/-/sb-promise-queue-2.1.1.tgz", + "integrity": "sha512-qXfdcJQMxMljxmPprn4Q4hl3pJmoljSCzUvvEBa9Kscewnv56n0KqrO6yWSrGLOL9E021wcGdPa39CHGKA6G0w==", + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/sb-scandir": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/sb-scandir/-/sb-scandir-3.1.1.tgz", + "integrity": "sha512-Q5xiQMtoragW9z8YsVYTAZcew+cRzdVBefPbb9theaIKw6cBo34WonP9qOCTKgyAmn/Ch5gmtAxT/krUgMILpA==", + "license": "MIT", + "dependencies": { + "sb-promise-queue": "^2.1.0" + }, + "engines": { + "node": ">= 8" + } + }, "node_modules/semver": { "version": "7.7.2", "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz", @@ -1618,6 +2041,12 @@ "node": ">=0.10.0" } }, + "node_modules/shell-escape": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/shell-escape/-/shell-escape-0.2.0.tgz", + "integrity": "sha512-uRRBT2MfEOyxuECseCZd28jC1AJ8hmqqneWQ4VWUTgCAFvb3wKU1jLqj6egC4Exrr88ogg3dp+zroH4wJuaXzw==", + "license": "MIT" + }, "node_modules/smart-buffer": { "version": "4.2.0", "resolved": "https://registry.npmjs.org/smart-buffer/-/smart-buffer-4.2.0.tgz", @@ -1672,6 +2101,23 @@ "integrity": "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==", "license": "BSD-3-Clause" }, + "node_modules/ssh2": { + "version": "1.16.0", + "resolved": "https://registry.npmjs.org/ssh2/-/ssh2-1.16.0.tgz", + "integrity": "sha512-r1X4KsBGedJqo7h8F5c4Ybpcr5RjyP+aWIG007uBPRjmdQWfEiVLzSK71Zji1B9sKxwaCvD8y8cwSkYrlLiRRg==", + "hasInstallScript": true, + "dependencies": { + "asn1": "^0.2.6", + "bcrypt-pbkdf": "^1.0.2" + }, + "engines": { + "node": ">=10.16.0" + }, + "optionalDependencies": { + "cpu-features": "~0.0.10", + "nan": "^2.20.0" + } + }, "node_modules/streamx": { "version": "2.22.0", "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.22.0.tgz", @@ -1751,12 +2197,27 @@ "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", "license": "0BSD" }, + "node_modules/tweetnacl": { + "version": "0.14.5", + "resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz", + "integrity": "sha512-KXXFFdAbFXY4geFIwoyNK+f5Z1b7swfXABfL7HXCmoIWMKU3dmS26672A4EeQtDzLKy7SXmfBu51JolvEKwtGA==", + "license": "Unlicense" + }, "node_modules/typed-query-selector": { "version": "2.12.0", "resolved": "https://registry.npmjs.org/typed-query-selector/-/typed-query-selector-2.12.0.tgz", "integrity": "sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==", "license": "MIT" }, + "node_modules/undici": { + "version": "6.21.3", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.21.3.tgz", + "integrity": "sha512-gBLkYIlEnSp8pFbT64yFgGE6UIB9tAkhukC23PmMDCe5Nd+cRqKxSjw5y54MK2AZMgZfJWMaNE4nYUHgi1XEOw==", + "license": "MIT", + "engines": { + "node": ">=18.17" + } + }, "node_modules/undici-types": { "version": "6.21.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", @@ -1773,6 +2234,27 @@ "node": ">= 10.0.0" } }, + "node_modules/whatwg-encoding": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", + "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", + "license": "MIT", + "dependencies": { + "iconv-lite": "0.6.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-mimetype": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz", + "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==", + "license": "MIT", + "engines": { + "node": ">=18" + } + }, "node_modules/wrap-ansi": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", diff --git a/scrape-data-keyword/package.json b/scrape-data-keyword/package.json index f489048..dd28755 100644 --- a/scrape-data-keyword/package.json +++ b/scrape-data-keyword/package.json @@ -13,8 +13,11 @@ "type": "module", "dependencies": { "axios": "^1.8.2", + "cheerio": "^1.0.0", "dotenv": "^16.4.7", "lodash": "^4.17.21", + "node-ssh": "^13.2.1", + "playwright": "^1.52.0", "puppeteer": "^24.4.0", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-stealth": "^2.11.2" diff --git a/scrape-data-keyword/services/scrap-configs-service.js b/scrape-data-keyword/services/scrap-configs-service.js index fa7ba78..189ddcb 100644 --- a/scrape-data-keyword/services/scrap-configs-service.js +++ b/scrape-data-keyword/services/scrap-configs-service.js @@ -1,5 +1,6 @@ import { AllbidsScrapModel } from "../models/allbids-scrap-model.js"; import { GraysScrapModel } from "../models/grays-scrap-model.js"; +import { GumtreeScrapModel } from "../models/gumtree-scrap-model.js"; import { LangtonsScrapModel } from "../models/langtons-scrap-model.js"; import { LawsonsScrapModel } from "../models/lawsons-scrap-model.js"; import { PicklesScrapModel } from "../models/pickles-scrap-model.js"; @@ -42,6 +43,13 @@ export class ScrapConfigsService { page: page, }); } + case "https://www.gumtree.com.au": { + return new GumtreeScrapModel({ + ...scrapConfig, + scrap_config_id: scrapConfig.id, + page: page, + }); + } default: { return null; } diff --git a/scrape-data-keyword/system/browser.js b/scrape-data-keyword/system/browser.js index 8aeb541..05184ba 100644 --- a/scrape-data-keyword/system/browser.js +++ b/scrape-data-keyword/system/browser.js @@ -1,10 +1,12 @@ // import puppeteer from 'puppeteer'; import puppeteer from "puppeteer-extra"; import StealthPlugin from "puppeteer-extra-plugin-stealth"; +import CONSTANTS from "./constants.js"; puppeteer.use(StealthPlugin()); + const browser = await puppeteer.launch({ - headless: process.env.ENVIRONMENT === "prod" ? "new" : false, + headless: process.env.ENVIRONMENT === "prod" ? true : false, // userDataDir: CONSTANTS.PROFILE_PATH, // Thư mục lưu profile timeout: 60000, args: [ @@ -29,7 +31,7 @@ const browser = await puppeteer.launch({ "--disable-threaded-animation", // Giảm animation chạy trên nhiều thread "--disable-threaded-scrolling", // Tắt cuộn trang đa luồng "--disable-logging", // Tắt log debug - "--blink-settings=imagesEnabled=false", // Không tải hình ảnh, + // "--blink-settings=imagesEnabled=false", // Không tải hình ảnh, "--disable-background-timer-throttling", // Tránh việc throttling các timer khi chạy nền. "--disable-webrtc", "--disable-ipc-flooding-protection", // Nếu có extension cần IPC, cái này giúp tối ưu. diff --git a/scrape-data-keyword/system/constants.js b/scrape-data-keyword/system/constants.js new file mode 100644 index 0000000..e65a9ec --- /dev/null +++ b/scrape-data-keyword/system/constants.js @@ -0,0 +1,11 @@ +import * as path from "path"; +import { fileURLToPath } from "url"; // ✅ Cần import từ 'url' + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +const CONSTANTS = { + PROFILE_PATH: path.join(__dirname, "profiles"), +}; + +export default CONSTANTS; diff --git a/scrape-data-keyword/system/ultils.js b/scrape-data-keyword/system/ultils.js index 61094bd..bf88345 100644 --- a/scrape-data-keyword/system/ultils.js +++ b/scrape-data-keyword/system/ultils.js @@ -29,6 +29,10 @@ export function extractModelId(url) { const match = url.match(/-(\d+)(?:[\?#]|$)/); return match ? match[1] : null; } + case "https://www.gumtree.com.au": { + const match = url.match(/\/(\d+)(?:\/)?$/); + return match ? match[1] : null; + } default: return null; }