# Puppeteer

https://github.com/xtforgame/azcrawlercore

```bash
npm install -g yarn
yarn install
yarn test
```

https://www.twse.com.tw/robots.txt

爬蟲類型:

1. 不需要瀏覽器功能
2. 需要虛擬瀏覽器(Puppeteer)

網站目標類型:

1. 需解析html
2. 直接可以取得結構化資料:json, csv => example 1
3. 須執行js(or session, cookie)

// example 1
https://www.twse.com.tw/zh/page/trading/exchange/MI_INDEX.html

// example 2
https://shopline.tw
https://sso.shoplineapp.com/users/sign_in
ID: xtxmlxtxml@gmail.com
PWD:qqqqqqqq
<!-- NOTE(review): login credentials are stored in plain text in this note; consider rotating them and moving them out of the document. -->

// example 3
https://rent.591.com.tw/

---

chrome
chromium

```typescript=
import fs from 'fs';
import path from 'path';
import moment from 'moment';
import puppeteer, { launch, Browser } from 'puppeteer';
import useProxy from 'puppeteer-page-proxy';
import { promiseWait, promiseWaitFor } from '~/utils';

export type PuppeteerLaunchOptions = Parameters<typeof launch>[0];

/**
 * Minimal Puppeteer crawler used as a walkthrough example.
 * This variant keeps the browser open after scraping (see the long wait in `run`).
 */
export default class CrawlerBase {
  /**
   * Builds the options passed to `puppeteer.launch`.
   *
   * @param debug - When true, returns a headful configuration with DevTools
   *   opened and `slowMo: 250`; otherwise a headless configuration.
   * @returns Launch options; both variants request a 1920x1080 window.
   */
  getPuppeteerLaunchOptions(debug : boolean = false) : PuppeteerLaunchOptions {
    const args = [
      `--window-size=1920,1080`,
    ];
    const options : PuppeteerLaunchOptions = debug ? {
      devtools: true,
      headless: false,
      slowMo: 250,
      args,
    } : {
      headless: true,
      args,
    };
    return options;
  }

  /**
   * Opens a new page with a 1920x1080 viewport.
   *
   * @param browser - Browser instance to open the page in.
   * @param url - Optional URL; when non-empty the page navigates to it and
   *   waits for `networkidle2` before returning.
   * @returns The newly created page.
   */
  async newPage(browser: Browser, url: string = '') {
    const page = await browser.newPage();
    await page.setViewport({
      width: 1920,
      height: 1080,
    });
    if (url) {
      await page.goto(url, {
        waitUntil: 'networkidle2',
      });
    }
    return page;
  }

  /**
   * Launches a debug browser, loads https://httpbin.org and logs a few
   * pieces of text extracted from the page.
   *
   * Errors raised while scraping are logged, not rethrown.
   *
   * @returns Always resolves to `1`.
   */
  async run(): Promise<number> {
    const browser = await puppeteer.launch(this.getPuppeteerLaunchOptions(true));
    try {
      const page = await this.newPage(browser);
      // console.log('page');
      await page.goto('https://httpbin.org', {
        waitUntil: 'networkidle2',
      });
      const session = await page.target().createCDPSession();
      await session.send('Page.enable');
      // Presumably milliseconds; promiseWait is defined in ~/utils.
      await promiseWait(1000);

      // The callback runs inside the page, so this console.log goes to the
      // browser console and `debugger` pauses only while DevTools is open.
      const lists = await page.$$eval('.opblock-tag-section a.nostyle span', ($texts) => {
        console.log('$texts :', $texts);
        debugger;
        return $texts.map(t => t.innerHTML);
      });
      console.log('lists :', lists);

      // $$eval receives every match (possibly none); fall back to ''.
      const title = await page.$$eval('hgroup h2.title', ($divs) => {
        // debugger;
        return $divs?.[0]?.innerHTML ?? '';
      });
      // $eval receives only the first match.
      const title2 = await page.$eval('hgroup h2.title', ($div) => {
        // debugger;
        return $div?.innerHTML ?? '';
      });
      console.log('title :', title);
      console.log('title2 :', title2);

      // Long pause before closing; presumably leaves the debug browser open
      // for manual inspection in DevTools.
      await promiseWait(1000000);
    } catch (error) {
      console.log('error :', error);
    } finally {
      // Close the browser whether scraping succeeded or failed.
      await browser.close();
    }
    return 1;
  }
}
```

```typescript=
import fs from 'fs';
import path from 'path';
import moment from 'moment';
import puppeteer, { launch, Browser } from 'puppeteer';
import useProxy from 'puppeteer-page-proxy';
import { promiseWait, promiseWaitFor } from '~/utils';

export type PuppeteerLaunchOptions = Parameters<typeof launch>[0];

/**
 * Minimal Puppeteer crawler used as a walkthrough example.
 * This variant closes the browser right after scraping (the long wait is
 * commented out in `run`).
 */
export default class CrawlerBase {
  /**
   * Builds the options passed to `puppeteer.launch`.
   *
   * @param debug - When true, returns a headful configuration with DevTools
   *   opened and `slowMo: 250`; otherwise a headless configuration.
   * @returns Launch options; both variants request a 1920x1080 window.
   */
  getPuppeteerLaunchOptions(debug : boolean = false) : PuppeteerLaunchOptions {
    const args = [
      `--window-size=1920,1080`,
    ];
    const options : PuppeteerLaunchOptions = debug ? {
      devtools: true,
      headless: false,
      slowMo: 250,
      args,
    } : {
      headless: true,
      args,
    };
    return options;
  }

  /**
   * Opens a new page with a 1920x1080 viewport.
   *
   * @param browser - Browser instance to open the page in.
   * @param url - Optional URL; when non-empty the page navigates to it and
   *   waits for `networkidle2` before returning.
   * @returns The newly created page.
   */
  async newPage(browser: Browser, url: string = '') {
    const page = await browser.newPage();
    await page.setViewport({
      width: 1920,
      height: 1080,
    });
    if (url) {
      await page.goto(url, {
        waitUntil: 'networkidle2',
      });
    }
    return page;
  }

  /**
   * Launches a debug browser, loads https://httpbin.org and logs a few
   * pieces of text extracted from the page.
   *
   * Errors raised while scraping are logged, not rethrown.
   *
   * @returns Always resolves to `1`.
   */
  async run(): Promise<number> {
    const browser = await puppeteer.launch(this.getPuppeteerLaunchOptions(true));
    try {
      const page = await this.newPage(browser);
      // console.log('page');
      await page.goto('https://httpbin.org', {
        waitUntil: 'networkidle2',
      });
      const session = await page.target().createCDPSession();
      await session.send('Page.enable');
      // Presumably milliseconds; promiseWait is defined in ~/utils.
      await promiseWait(1000);

      // The callback runs inside the page, so this console.log goes to the
      // browser console and `debugger` pauses only while DevTools is open.
      const lists = await page.$$eval('.opblock-tag-section a.nostyle span', ($texts) => {
        console.log('$texts :', $texts);
        debugger;
        return $texts.map(t => t.innerHTML);
      });
      console.log('lists :', lists);

      // $$eval receives every match (possibly none); fall back to ''.
      const title = await page.$$eval('hgroup h2.title', ($divs) => {
        // debugger;
        return $divs?.[0]?.innerHTML ?? '';
      });
      // $eval receives only the first match.
      const title2 = await page.$eval('hgroup h2.title', ($div) => {
        // debugger;
        return $div?.innerHTML ?? '';
      });
      console.log('title :', title);
      console.log('title2 :', title2);
      // await promiseWait(1000000);
    } catch (error) {
      console.log('error :', error);
    } finally {
      // Close the browser whether scraping succeeded or failed.
      await browser.close();
    }
    return 1;
  }
}
```

`await promiseWait(1000000);`

`document.querySelectorAll('hgroup h2.title')`

`page.$eval` == `document.querySelector`
`page.$$eval` == `document.querySelectorAll`