---
disqus: ahb0222
GA : G-CQ4L16KHK4
---
# R爬取動態網頁資料(RSelenium)以河川監測數據為例
> [color=#40f1ef][name=LHB阿好伯, 2021/02/15][:earth_africa:](https://www.facebook.com/LHB0222/)
###### tags: `R`
[TOC]

之前介紹到使用[R 爬取動態網頁資料-RSelenium](/zXg7IeGVQFK0EvvpSWQZ0w)可以爬蟲
經過一段時間的努力,完成了一個版本的抓取程式碼XD
資料非常多所以先測試只抓取單一流域的資料
程式碼如下
```r=
library(devtools)
library(RSelenium)
library(stringr)
library(plyr)
# Open a Chrome session and load the river water-quality query page.
remDr <- remoteDriver(browserName = "chrome")
remDr$open()
remDr$navigate("https://wq.epa.gov.tw/EWQP/zh/EnvWaterMonitoring/RiverWaterQuality.aspx")

# XPath of the element with class "Advanced" inside the first list item.
waterdata_npath <- '//li[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//*[contains(concat( " ", @class, " " ), concat( " ", "Advanced", " " ))]'

# Click that element.
advanced_btn <- remDr$findElement(using = "xpath", value = waterdata_npath)
advanced_btn$clickElement()

# Show the text of the unit drop-down, split into one entry per line.
unit_dropdown <- remDr$findElement(
  using = "xpath",
  value = "//*[(@id = 'CPH1_ddl_Unit')]"
)
str_split(unit_dropdown$getElementText(), pattern = "\\n")
```
先取得縣市、流域、測站、年、月資料大小
後續作為for迴圈的參數
```r=+
# County / city: select the 7th option of the unit drop-down.
unit_xpath2 <- "//*[@id='CPH1_ddl_Unit']/option[7]"
btn2 <- remDr$findElement(using = "xpath", value = unit_xpath2)
btn2$clickElement()

# River basin: select the 3rd option and keep its label.
unit_xpath3 <- "//*[(@id = 'CPH1_ddl_River')]/option[3]"
btn3 <- remDr$findElement(using = "xpath", value = unit_xpath3)
btn3$clickElement()
river_label <- as.character(btn3$getElementText())
Rivername <- strsplit(river_label, "\n")

# Monitoring sites: the drop-down text is split on "\n " and the number of
# pieces is used as the site count (presumably one piece per option).
site_dropdown <- remDr$findElement(
  using = "xpath",
  value = "//*[(@id = 'CPH1_ddl_Site')]"
)
site_labels <- str_split(site_dropdown$getElementText(), pattern = "\\n ")[[1]]
Site_n <- length(site_labels)

# Read the label of the first site, then select it.
unit_xpath4 <- "//*[(@id = 'CPH1_ddl_Site')]/option[1]"
btn4 <- remDr$findElement(using = "xpath", value = unit_xpath4)
site_label <- as.character(btn4$getElementText())
Sitename <- strsplit(site_label, "\n")
btn4$clickElement()

# Years: same counting approach on the year drop-down.
year_dropdown <- remDr$findElement(
  using = "xpath",
  value = "//*[(@id = 'CPH1_ddl_Year')]"
)
year_labels <- str_split(year_dropdown$getElementText(), pattern = "\\n ")[[1]]
year_n <- length(year_labels)

# Containers filled by the scraping loop further down.
DetectData3 <- NA
DetectData4 <- data.frame()
DetectData5 <- data.frame()
DetectData6 <- data.frame()
```
利用前面得到的數量來進行迴圈讀取資料
在這過程中最大的問題在於最後的資料合併
這部分有兩個問題
1. 資料格式
得到的資料會是一個list,先顯示檢測項目再顯示檢測結果
eg :
:::success
"懸浮固體(mg/L)" "6.7" "生化需氧量(mg/L)" "5.9" "氨氮(mg/L)" "3.5" "溶氧(滴定法)(mg/L)" "7.3"
:::
根本不能資料合併還好還有規律
一個簡單的迴圈解決
:::spoiler
```r
for (k in c(1:(length(DetectData3)/2))) {
DetectData4[1,k] <- DetectData3[k*2]
colnames(DetectData4)[k] <- DetectData3[k+(k-1)]
}
```
:::
2. 資料合併
這邊就遇到他並不是每次檢驗項目都一樣QQ
所以無法使用內建的rbind()進行合併
若是使用dplyr 套件中的 bind_rows()進行合併會出現許多新欄位
最後使用plyr::rbind.fill()才成功合併
```r=+
# Scrape every site / year / month combination of the selected river basin.
#
# For each combination the query button is pressed, the text of the
# "dataCards" element is read, and one row (Site, years, months plus one
# column per test item) is appended to DetectData6 with plyr::rbind.fill(),
# which fills items missing from a given query with NA.
#
# Uses `remDr`, `Site_n` and `DetectData6` created by the earlier chunks.

# Text of the drop-down with element id `id`, split on "\n " into labels.
dropdown_labels <- function(id) {
  dropdown <- remDr$findElement(
    using = "xpath",
    value = paste0("//*[(@id = '", id, "')]")
  )
  str_split(dropdown$getElementText(), pattern = "\\n ")[[1]]
}

# Turn the alternating vector "item, value, item, value, ..." into a one-row
# data frame whose column names are the items. Returns NULL when there is no
# complete item/value pair, so the caller can skip the cbind().
readings_to_row <- function(cells) {
  n_pairs <- length(cells) %/% 2
  if (n_pairs == 0) {
    return(NULL)
  }
  idx <- seq_len(n_pairs)
  row <- as.data.frame(as.list(cells[2 * idx]), stringsAsFactors = FALSE)
  colnames(row) <- cells[2 * idx - 1]
  row
}

for (h in seq_len(Site_n)) {
  # Read the site label before clicking, then select the site.
  unit_xpath4 <- paste0("//*[(@id = 'CPH1_ddl_Site')]/option[", h, "]")
  btn4 <- remDr$findElement(using = "xpath", value = unit_xpath4)
  Sitename <- strsplit(as.character(btn4$getElementText()), "\n")
  btn4$clickElement()
  print(paste0("Site : ", Sitename))

  # The year list is re-read for every site.
  year_n <- length(dropdown_labels("CPH1_ddl_Year"))

  # Walk the year options from the last one to the first.
  for (i in rev(seq_len(year_n))) {
    unit_xpath5 <- paste0("//*[(@id = 'CPH1_ddl_Year')]/option[", i, "]")
    btn5 <- remDr$findElement(using = "xpath", value = unit_xpath5)
    btn5$clickElement()
    yeardata <- dropdown_labels("CPH1_ddl_Year")[i]
    print(dim(DetectData6))
    print(paste0("year : ", yeardata))

    # The month list is re-read for every year.
    Month_n <- length(dropdown_labels("CPH1_ddl_Month"))

    for (j in seq_len(Month_n)) {
      unit_xpath6 <- paste0("//*[(@id = 'CPH1_ddl_Month')]/option[", j, "]")
      btn6 <- remDr$findElement(using = "xpath", value = unit_xpath6)
      btn6$clickElement()
      monthdata <- remDr$findElement(
        using = "xpath",
        value = unit_xpath6
      )$getElementText()[[1]]
      print(paste0("month : ", monthdata))

      # Query button: move the mouse onto it, then click.
      xpath <- "//*[(@id = 'CPH1_Submit1')]"
      btn <- remDr$findElement(using = "xpath", value = xpath)
      remDr$mouseMoveToLocation(webElement = btn)
      remDr$click()

      # Result text: item name and value on alternating lines.
      cards <- remDr$findElement(
        using = "xpath",
        value = "//*[contains(concat( ' ', @class, ' ' ), concat( ' ', 'dataCards', ' ' ))]"
      )
      DetectData3 <- as.character(
        str_split(cards$getElementText(), pattern = "\\n")[[1]]
      )

      # Build the readings from scratch on every query; reusing the previous
      # data frame would leak columns of items that are absent this time.
      DetectData4 <- readings_to_row(DetectData3)
      DetectData5 <- data.frame(
        Site = Sitename[[1]],
        years = yeardata,
        months = monthdata
      )
      if (!is.null(DetectData4)) {
        DetectData5 <- cbind(DetectData5, DetectData4)
      }
      DetectData6 <- rbind.fill(DetectData6, DetectData5)

      # Short pause between queries (R is case-sensitive: Sys.sleep).
      Sys.sleep(0.5)
    }
  }
}
View(DetectData6)
```
設計好程式碼後再來就是靜靜地等待了
看他自己在那邊跑也是很爽XD

最後單一個流域就抓到1187次的檢測數據
不到一百行程式碼使用不到半小時完成
要是手動複製貼上應該花很多時間XD
# Python版本
```python=
import io
import sys
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf-8')
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
# Selenium 4 removed the find_element_by_* helpers; locate elements with
# find_element(By.XPATH, ...) instead (also available in Selenium 3).
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select


def option_labels(driver, select_id):
    """Return the visible text of every <option> under the element with this id."""
    xpath = "//*[(@id = '" + select_id + "')]/option"
    return [option.text for option in driver.find_elements(By.XPATH, xpath)]


def selected_label(driver, select_id):
    """Return the text of the currently selected option of the given drop-down."""
    dropdown = driver.find_element(By.XPATH, "//*[(@id = '" + select_id + "')]")
    return Select(dropdown).first_selected_option.text


def parse_readings(card_text):
    """Convert alternating "item" / "value" lines into an {item: value} dict.

    A trailing item without a value is dropped.
    """
    lines = card_text.split('\n')
    return dict(zip(lines[0::2], lines[1::2]))


driver = webdriver.Chrome()
driver.get("https://wq.epa.gov.tw/EWQP/zh/EnvWaterMonitoring/RiverWaterQuality.aspx")
elem = driver.find_element(By.XPATH, '//li[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//*[contains(concat( " ", @class, " " ), concat( " ", "Advanced", " " ))]')
elem.click()

# County / city: select the 7th option of the unit drop-down.
unit_xpath2 = "//*[@id='CPH1_ddl_Unit']/option[7]"
btn2 = driver.find_element(By.XPATH, unit_xpath2)
btn2.click()

# River basin: select the 3rd option.
unit_xpath3 = "//*[(@id = 'CPH1_ddl_River')]/option[3]"
btn3 = driver.find_element(By.XPATH, unit_xpath3)
btn3.click()

# Monitoring sites: list the available options.
site_labels = option_labels(driver, 'CPH1_ddl_Site')
Site_n = len(site_labels)
for label in site_labels:
    print(label)

# Years: list the available options.
year_labels = option_labels(driver, 'CPH1_ddl_Year')
year_n = len(year_labels)
for label in year_labels:
    print(label)

# Months: list the available options.
month_labels = option_labels(driver, 'CPH1_ddl_Month')
Month_n = len(month_labels)
for label in month_labels:
    print(label)

# This version only loops over the months of the site and year that are
# currently selected on the page; their labels are recorded in every row.
site_name = selected_label(driver, 'CPH1_ddl_Site')
year_data = selected_label(driver, 'CPH1_ddl_Year')

data_Xpath = "//*[contains(concat( ' ', @class, ' ' ), concat( ' ', 'dataCards', ' ' ))]"
records = []
for j in range(1, Month_n + 1):
    unit_xpath6 = "//*[(@id = 'CPH1_ddl_Month')]/option[" + str(j) + "]"
    btn6 = driver.find_element(By.XPATH, unit_xpath6)
    btn6.click()
    month_data = driver.find_element(By.XPATH, unit_xpath6).text
    print("month : " + month_data)

    # Query button
    xpath = "//*[(@id = 'CPH1_Submit1')]"
    btn = driver.find_element(By.XPATH, xpath)
    btn.click()

    # Read the first "dataCards" element, as the R version does; a month
    # without any such element yields a row with no readings.
    cards = driver.find_elements(By.XPATH, data_Xpath)
    readings = parse_readings(cards[0].text) if cards else {}

    record = {"Site": site_name, "years": year_data, "months": month_data}
    record.update(readings)
    records.append(record)

# Building the frame from a list of dicts fills items missing in some months
# with NaN, the pandas counterpart of plyr::rbind.fill() in the R version.
DetectData6 = pd.DataFrame(records)
print(DetectData6)
```

🌟全文可以至下方連結觀看或是補充
全文分享至
https://www.facebook.com/LHB0222/
https://www.instagram.com/ahb0222/
有疑問想討論的都歡迎於下方留言
喜歡的幫我分享給所有的朋友 \o/
有所錯誤歡迎指教
# [:page_with_curl: 全部文章列表](https://hackmd.io/@LHB-0222/AllWritings)
