---
disqus: ahb0222
GA : G-CQ4L16KHK4
---

# R爬取動態網頁資料(RSelenium)以河川監測數據為例

> [color=#40f1ef][name=LHB阿好伯, 2021/02/15][:earth_africa:](https://www.facebook.com/LHB0222/)

###### tags: `R`

[TOC]

![](https://i.imgur.com/tXEF6i4.jpg)

之前介紹到使用[R 爬取動態網頁資料-RSelenium](/zXg7IeGVQFK0EvvpSWQZ0w)可以爬蟲
經過一段時間的努力完成了一個版本的抓取程式碼XD
資料非常多所以先測試只抓取單一流域的資料
程式碼如下

```r=
library(devtools)
library(RSelenium)
library(stringr)
library(plyr)

# Open a Chrome session through remoteDriver() (default server settings).
remDr <- remoteDriver(browserName = "chrome")
remDr$open()

# Load the river water-quality query page.
remDr$navigate("https://wq.epa.gov.tw/EWQP/zh/EnvWaterMonitoring/RiverWaterQuality.aspx")

# XPath of the element with class "Advanced" inside the first list item.
waterdata_npath <- '//li[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//*[contains(concat( " ", @class, " " ), concat( " ", "Advanced", " " ))]'

# Click it.
nextBtn <- remDr$findElement(using = "xpath", value = waterdata_npath)
nextBtn$clickElement()

# Show the text of the unit drop-down, split on newlines.
unit_xpath1 <- remDr$findElement(
  using = "xpath",
  value = "//*[(@id = 'CPH1_ddl_Unit')]"
)
str_split(unit_xpath1$getElementText(), pattern = "\\n")
```

先取得縣市、流域、測站、年、月資料大小
後續作為for迴圈的參數

```r=+
# County / city: pick the 7th option of the unit drop-down.
unit_xpath2 <- "//*[@id='CPH1_ddl_Unit']/option[7]"
btn2 <- remDr$findElement(using = "xpath", value = unit_xpath2)
btn2$clickElement()

# River basin: pick the 3rd option of the river drop-down.
unit_xpath3 <- "//*[(@id = 'CPH1_ddl_River')]/option[3]"
btn3 <- remDr$findElement(using = "xpath", value = unit_xpath3)
btn3$clickElement()
Rivername <- strsplit(as.character(btn3$getElementText()), "\n")

# Monitoring site: count the entries of the site drop-down, then select
# the first one.
Site_n <- length(str_split(
  remDr$findElement(
    using = "xpath",
    value = "//*[(@id = 'CPH1_ddl_Site')]"
  )$getElementText(),
  pattern = "\\n "
)[[1]])
unit_xpath4 <- "//*[(@id = 'CPH1_ddl_Site')]/option[1]"
btn4 <- remDr$findElement(using = "xpath", value = unit_xpath4)
Sitename <- strsplit(as.character(btn4$getElementText()), "\n")
btn4$clickElement()

# Year: count the entries of the year drop-down.
year_n <- length(str_split(
  remDr$findElement(
    using = "xpath",
    value = "//*[(@id = 'CPH1_ddl_Year')]"
  )$getElementText(),
  pattern = "\\n "
)[[1]])

# Containers used by the scraping loop.
DetectData3 <- NA
DetectData4 <- data.frame()
DetectData5 <- data.frame()
DetectData6 <- data.frame()
```

利用前面得到的數量來進行迴圈讀取資料
在這過程中最大的問題在於最後的資料合併
這部分有兩個問題

1. 資料格式
得到的資料會是一個list先顯示檢測項目再顯示檢測結果
eg :

:::success
"懸浮固體(mg/L)" "6.7"
"生化需氧量(mg/L)" "5.9"
"氨氮(mg/L)" "3.5"
"溶氧(滴定法)(mg/L)" "7.3"
:::

根本不能資料合併還好還有規律
一個簡單的迴圈解決

:::spoiler
```r
for (k in c(1:(length(DetectData3)/2))) {
  DetectData4[1,k] <- DetectData3[k*2]
  colnames(DetectData4)[k] <- DetectData3[k+(k-1)]
}
```
:::

2. 資料合併
這邊就遇到他並不是每次檢驗項目都一樣QQ
所以無法使用內建的rbind()進行合併
若是使用dplyr 套件中的 bind_rows()進行合併會出現許多新欄位
最後使用plyr::rbind.fill()才成功合併

```r=+
# Return the entries of the drop-down with the given element id, obtained by
# splitting the element text on "\n " (same split as the counting code above).
dropdown_options <- function(id) {
  select_box <- remDr$findElement(
    using = "xpath",
    value = paste0("//*[(@id = '", id, "')]")
  )
  str_split(select_box$getElementText(), pattern = "\\n ")[[1]]
}

DetectData6 <- data.frame()

for (h in seq_len(Site_n)) {
  # Select the h-th monitoring site.
  unit_xpath4 <- paste0("//*[(@id = 'CPH1_ddl_Site')]/option[", h, "]")
  btn4 <- remDr$findElement(using = "xpath", value = unit_xpath4)
  Sitename <- strsplit(as.character(btn4$getElementText()), "\n")
  btn4$clickElement()
  print(paste0("Site : ", Sitename))

  # Year entries are read once per site, after the site has been selected.
  year_options <- dropdown_options("CPH1_ddl_Year")
  year_n <- length(year_options)

  # Walk the year options from the last one to the first.
  for (i in rev(seq_len(year_n))) {
    unit_xpath5 <- paste0("//*[(@id = 'CPH1_ddl_Year')]/option[", i, "]")
    btn5 <- remDr$findElement(using = "xpath", value = unit_xpath5)
    btn5$clickElement()
    yeardata <- year_options[i]
    print(dim(DetectData6))
    print(paste0("year : ", yeardata))

    # Month entries available for the selected year.
    Month_n <- length(dropdown_options("CPH1_ddl_Month"))

    for (j in seq_len(Month_n)) {
      unit_xpath6 <- paste0("//*[(@id = 'CPH1_ddl_Month')]/option[", j, "]")
      btn6 <- remDr$findElement(using = "xpath", value = unit_xpath6)
      btn6$clickElement()
      monthdata <- remDr$findElement(
        using = "xpath",
        value = unit_xpath6
      )$getElementText()[[1]]
      print(paste0("month : ", monthdata))

      # Query button: move the mouse onto it, then click.
      xpath <- "//*[(@id = 'CPH1_Submit1')]"
      btn <- remDr$findElement(using = "xpath", value = xpath)
      remDr$mouseMoveToLocation(webElement = btn)
      remDr$click()

      # Text of the "dataCards" element: item name and value on alternating
      # lines.
      DetectData3 <- as.character(str_split(
        remDr$findElement(
          using = "xpath",
          value = "//*[contains(concat( ' ', @class, ' ' ), concat( ' ', 'dataCards', ' ' ))]"
        )$getElementText(),
        pattern = "\\n"
      )[[1]])

      # Start from an empty frame every month; otherwise a month with fewer
      # items keeps the extra columns (and values) of the previous month.
      DetectData4 <- data.frame()
      n_items <- length(DetectData3) %/% 2
      for (k in seq_len(n_items)) {
        DetectData4[1, k] <- DetectData3[2 * k]
        colnames(DetectData4)[k] <- DetectData3[2 * k - 1]
      }

      DetectData5 <- data.frame(
        Site = Sitename[[1]],
        years = yeardata,
        months = monthdata
      )
      if (n_items > 0) {
        DetectData5 <- cbind(DetectData5, DetectData4)
      }

      # rbind.fill() pads columns missing on either side with NA. The result
      # is accumulated on every pass so partial data survives an interrupted
      # run.
      DetectData6 <- rbind.fill(DetectData6, DetectData5)

      # Short pause between queries (R is case-sensitive: Sys.sleep).
      Sys.sleep(0.5)
    }
  }
}

View(DetectData6)
```

設計好程式碼後再來就是靜靜地等待了
看他自己那邊跑也是很爽XD
![](https://i.imgur.com/19ypEGS.gif)

最後單一個流域就抓到1187次的檢測數據
不到一百行程式碼使用不到半小時完成
要是手動複製貼上應該花很多時間XD

# Python版本

```python=
import io
import sys
import time

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd

driver = webdriver.Chrome()
driver.get("https://wq.epa.gov.tw/EWQP/zh/EnvWaterMonitoring/RiverWaterQuality.aspx")

# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
# Selenium releases no longer provide.
elem = driver.find_element(By.XPATH, '//li[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//*[contains(concat( " ", @class, " " ), concat( " ", "Advanced", " " ))]')
elem.click()

# County / city
unit_xpath2 = "//*[@id='CPH1_ddl_Unit']/option[7]"
btn2 = driver.find_element(By.XPATH, unit_xpath2)
btn2.click()
print(btn2.text)

# River basin
unit_xpath3 = "//*[(@id = 'CPH1_ddl_River')]/option[3]"
btn3 = driver.find_element(By.XPATH, unit_xpath3)
btn3.click()

# Monitoring sites
unit_xpath4 = "//*[(@id = 'CPH1_ddl_Site')]/option"
btn4 = driver.find_elements(By.XPATH, unit_xpath4)
Site_n = len(btn4)
for value in btn4:
    print(value.text)
# Text of the currently selected site (empty string if none is selected).
site_name = next((opt.text for opt in btn4 if opt.is_selected()), "")

# Years
unit_xpath5 = "//*[(@id = 'CPH1_ddl_Year')]/option"
btn5 = driver.find_elements(By.XPATH, unit_xpath5)
for value in btn5:
    print(value.text)
year_n = len(btn5)
# Text of the currently selected year (empty string if none is selected).
year_data = next((opt.text for opt in btn5 if opt.is_selected()), "")

# Accumulated result: one row per queried month.
DetectData6 = pd.DataFrame()

# Months
unit_xpath6 = "//*[(@id = 'CPH1_ddl_Month')]/option"
btn6 = driver.find_elements(By.XPATH, unit_xpath6)
for value in btn6:
    print(value.text)
Month_n = len(btn6)

for j in range(1, Month_n + 1):
    unit_xpath6 = "//*[(@id = 'CPH1_ddl_Month')]/option[" + str(j) + "]"
    btn6 = driver.find_element(By.XPATH, unit_xpath6)
    btn6.click()
    month_data = driver.find_element(By.XPATH, unit_xpath6).text
    print("month : " + month_data)

    # Query button
    xpath = "//*[(@id = 'CPH1_Submit1')]"
    btn = driver.find_element(By.XPATH, xpath)
    btn.click()

    # Text of the first "dataCards" element: item name and value on
    # alternating lines.
    data_Xpath = "//*[contains(concat( ' ', @class, ' ' ), concat( ' ', 'dataCards', ' ' ))]"
    data3 = driver.find_elements(By.XPATH, data_Xpath)
    lines = data3[0].text.split('\n') if data3 else []

    # Pair every item name (even index) with the value that follows it.
    row = {"Site": site_name, "years": year_data, "months": month_data}
    row.update(zip(lines[0::2], lines[1::2]))

    # pd.concat aligns on column names and fills missing items with NaN,
    # which plays the role of plyr::rbind.fill() in the R version.
    DetectData6 = pd.concat([DetectData6, pd.DataFrame([row])], ignore_index=True)

    # Short pause between queries, as in the R version.
    time.sleep(0.5)

print(DetectData6)
```

![](https://i.imgur.com/p3V0DCB.png)

🌟全文可以至下方連結觀看或是補充

全文分享至

https://www.facebook.com/LHB0222/

https://www.instagram.com/ahb0222/

有疑問想討論的都歡迎於下方留言

喜歡的幫我分享給所有的朋友 \o/

有所錯誤歡迎指教

# [:page_with_curl: 全部文章列表](https://hackmd.io/@LHB-0222/AllWritings)

![](https://i.imgur.com/47HlvGH.png)