# Puppeteer
https://github.com/xtforgame/azcrawlercore
npm install -g yarn
yarn install
yarn test
https://www.twse.com.tw/robots.txt
爬蟲類型:
1. 不需要瀏覽器功能
2. 需要虛擬瀏覽器(Puppeteer)
網站目標類型:
1. 需解析 HTML
2. 可以直接取得結構化資料:JSON、CSV => example 1
3. 須執行 JS(或需要 session、cookie)
// example 1
https://www.twse.com.tw/zh/page/trading/exchange/MI_INDEX.html
// example 2
https://shopline.tw
https://sso.shoplineapp.com/users/sign_in
ID: xtxmlxtxml@gmail.com
PWD:qqqqqqqq
// example 3
https://rent.591.com.tw/
---
chrome
chromium
```typescript=
import fs from 'fs';
import path from 'path';
import moment from 'moment';
import puppeteer, { launch, Browser } from 'puppeteer';
import useProxy from 'puppeteer-page-proxy';
import { promiseWait, promiseWaitFor } from '~/utils';
/** Type of the first parameter accepted by Puppeteer's `launch` function. */
export type PuppeteerLaunchOptions = Parameters<typeof launch>[0];
/**
 * Minimal Puppeteer crawler used as the walkthrough example: it launches a
 * browser, opens https://httpbin.org and reads a few elements out of the page.
 */
export default class CrawlerBase {
  /**
   * Builds the options handed to `puppeteer.launch`.
   *
   * @param debug - When `true`, returns a headed configuration with DevTools
   *   open and `slowMo: 250`; otherwise a headless configuration.
   * @returns The launch options. Both variants carry the same
   *   `--window-size=1920,1080` argument.
   */
  getPuppeteerLaunchOptions(debug : boolean = false) : PuppeteerLaunchOptions {
    const args = ['--window-size=1920,1080'];
    if (debug) {
      return {
        devtools: true,
        headless: false,
        slowMo: 250,
        args,
      };
    }
    return {
      headless: true,
      args,
    };
  }

  /**
   * Opens a new tab with a 1920x1080 viewport and optionally navigates it.
   *
   * @param browser - Browser in which the tab is created.
   * @param url - Address to load; when empty, no navigation is performed.
   * @returns The prepared page.
   * @throws Rethrows any error raised while sizing or navigating the page. The
   *   half-initialised tab is closed first, because the caller never receives
   *   it and therefore could not close it itself.
   */
  async newPage(browser: Browser, url: string = '') {
    const page = await browser.newPage();
    try {
      await page.setViewport({
        width: 1920,
        height: 1080,
      });
      if (url) {
        await page.goto(url, {
          waitUntil: 'networkidle2',
        });
      }
    } catch (error) {
      try {
        await page.close();
      } catch {
        // Best-effort cleanup: the original failure is the error worth reporting.
      }
      throw error;
    }
    return page;
  }

  /**
   * Runs the demo crawl against https://httpbin.org in debug (headed) mode.
   *
   * Errors raised while crawling are logged and swallowed; the browser is
   * closed in every case.
   *
   * @returns Always `1`.
   */
  async run() {
    const browser = await puppeteer.launch(this.getPuppeteerLaunchOptions(true));
    try {
      const page = await this.newPage(browser);
      // console.log('page');
      await page.goto('https://httpbin.org', {
        waitUntil: 'networkidle2',
      });

      // Open a DevTools-protocol session on the page and enable its Page domain.
      // Nothing below reads from the session.
      const session = await page.target().createCDPSession();
      await session.send('Page.enable');
      // presumably pauses for the given number of milliseconds (helper lives in ~/utils) - confirm
      await promiseWait(1000);

      // The callback runs inside the page, so this console.log goes to the
      // browser console, not to Node's stdout.
      const lists = await page.$$eval('.opblock-tag-section a.nostyle span', ($texts) => {
        console.log('$texts :', $texts);
        // Breakpoint for the DevTools window opened by the debug launch options.
        debugger;
        return $texts.map(t => t.innerHTML);
      });
      console.log('lists :', lists);

      // Same element read twice to contrast $$eval (all matches) with $eval (first match).
      const title = await page.$$eval('hgroup h2.title', ($divs) => {
        // debugger;
        return $divs?.[0]?.innerHTML || '';
      });
      const title2 = await page.$eval('hgroup h2.title', ($div) => {
        // debugger;
        return $div?.innerHTML || '';
      });
      console.log('title :', title);
      console.log('title2 :', title2);

      // Long pause, apparently to keep the headed browser open for manual inspection.
      await promiseWait(1000000);
    } catch (error) {
      console.log('error :', error);
    } finally {
      // Close the browser on every exit path, not only the ones that reach the end.
      await browser.close();
    }
    return 1;
  }
}
```
```typescript=
import fs from 'fs';
import path from 'path';
import moment from 'moment';
import puppeteer, { launch, Browser } from 'puppeteer';
import useProxy from 'puppeteer-page-proxy';
import { promiseWait, promiseWaitFor } from '~/utils';
/** Type of the first parameter accepted by Puppeteer's `launch` function. */
export type PuppeteerLaunchOptions = Parameters<typeof launch>[0];
/**
 * Minimal Puppeteer crawler used as the walkthrough example: it launches a
 * browser, opens https://httpbin.org, reads a few elements out of the page and
 * then closes the browser.
 */
export default class CrawlerBase {
  /**
   * Builds the options handed to `puppeteer.launch`.
   *
   * @param debug - When `true`, returns a headed configuration with DevTools
   *   open and `slowMo: 250`; otherwise a headless configuration.
   * @returns The launch options. Both variants carry the same
   *   `--window-size=1920,1080` argument.
   */
  getPuppeteerLaunchOptions(debug : boolean = false) : PuppeteerLaunchOptions {
    const args = ['--window-size=1920,1080'];
    if (debug) {
      return {
        devtools: true,
        headless: false,
        slowMo: 250,
        args,
      };
    }
    return {
      headless: true,
      args,
    };
  }

  /**
   * Opens a new tab with a 1920x1080 viewport and optionally navigates it.
   *
   * @param browser - Browser in which the tab is created.
   * @param url - Address to load; when empty, no navigation is performed.
   * @returns The prepared page.
   * @throws Rethrows any error raised while sizing or navigating the page. The
   *   half-initialised tab is closed first, because the caller never receives
   *   it and therefore could not close it itself.
   */
  async newPage(browser: Browser, url: string = '') {
    const page = await browser.newPage();
    try {
      await page.setViewport({
        width: 1920,
        height: 1080,
      });
      if (url) {
        await page.goto(url, {
          waitUntil: 'networkidle2',
        });
      }
    } catch (error) {
      try {
        await page.close();
      } catch {
        // Best-effort cleanup: the original failure is the error worth reporting.
      }
      throw error;
    }
    return page;
  }

  /**
   * Runs the demo crawl against https://httpbin.org in debug (headed) mode.
   *
   * Errors raised while crawling are logged and swallowed; the browser is
   * closed in every case.
   *
   * @returns Always `1`.
   */
  async run() {
    const browser = await puppeteer.launch(this.getPuppeteerLaunchOptions(true));
    try {
      const page = await this.newPage(browser);
      // console.log('page');
      await page.goto('https://httpbin.org', {
        waitUntil: 'networkidle2',
      });

      // Open a DevTools-protocol session on the page and enable its Page domain.
      // Nothing below reads from the session.
      const session = await page.target().createCDPSession();
      await session.send('Page.enable');
      // presumably pauses for the given number of milliseconds (helper lives in ~/utils) - confirm
      await promiseWait(1000);

      // The callback runs inside the page, so this console.log goes to the
      // browser console, not to Node's stdout.
      const lists = await page.$$eval('.opblock-tag-section a.nostyle span', ($texts) => {
        console.log('$texts :', $texts);
        // Breakpoint for the DevTools window opened by the debug launch options.
        debugger;
        return $texts.map(t => t.innerHTML);
      });
      console.log('lists :', lists);

      // Same element read twice to contrast $$eval (all matches) with $eval (first match).
      const title = await page.$$eval('hgroup h2.title', ($divs) => {
        // debugger;
        return $divs?.[0]?.innerHTML || '';
      });
      const title2 = await page.$eval('hgroup h2.title', ($div) => {
        // debugger;
        return $div?.innerHTML || '';
      });
      console.log('title :', title);
      console.log('title2 :', title2);
      // Disabled long pause; the browser closes as soon as the values are logged.
      // await promiseWait(1000000);
    } catch (error) {
      console.log('error :', error);
    } finally {
      // Close the browser on every exit path, not only the ones that reach the end.
      await browser.close();
    }
    return 1;
  }
}
```
await promiseWait(1000000);
document.querySelectorAll('hgroup h2.title')
page.$eval 相當於 document.querySelector(只取第一個符合的元素)
page.$$eval 相當於 document.querySelectorAll(取得所有符合的元素)