# Takagi Lab Study Session #2
## Installing what you need
* Upgrade pip itself
```
$ python3 -m pip install --upgrade pip
```
* Beautiful Soup
```
$ pip install beautifulsoup4
```
* icrawler
```
$ pip install icrawler
```
* requests
```
$ pip install requests
```
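* lxml and pandas (used by the scraping and CSV examples below)
```
$ pip install lxml pandas
```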
* selenium (pinned to 4.1.0, which still supports the older `find_element_by_*` calls used in the scripts below)
```
$ pip uninstall selenium
$ pip install selenium==4.1.0
```
* Download the [Chrome Driver](https://sites.google.com/a/chromium.org/chromedriver/downloads)
    * Match the driver to your google-chrome version (the sketch after this list shows how to confirm the two agree)
    * Move the driver into the directory you use for this session
        * Making a directory such as `assets` and putting the driver in there is even better!
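To double-check that the browser and the driver actually agree, a minimal sketch like the following can print both version strings (assuming the driver was saved as `assets/chromedriver`):
```python=
from selenium import webdriver

# Assumption: the driver was saved as assets/chromedriver
driver = webdriver.Chrome(executable_path="assets/chromedriver")
print(driver.capabilities["browserVersion"])                 # the browser's version
print(driver.capabilities["chrome"]["chromedriverVersion"])  # the driver's version
driver.quit()
```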
## Checking the driver
```python=
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

def set_up_chrome_driver():
    options = Options()
    # Put the absolute path of your driver here
    # (executable_path still works on selenium 4.1.0; it is deprecated in newer versions)
    chrome_driver = webdriver.Chrome(options=options, executable_path="/absolute/path/to/chromedriver")
    return chrome_driver

def move_to_yahoo_home(chrome_driver):
    chrome_driver.get("https://www.yahoo.co.jp/")
    time.sleep(0.5)

if __name__ == '__main__':
    chrome_driver = set_up_chrome_driver()
    move_to_yahoo_home(chrome_driver)
    chrome_driver.quit()
```
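The `Options` object is where browser flags go. If you would rather not have a window pop up, Chrome's standard headless flag can be added before the driver is created; a minimal sketch (same path assumption as above):
```python=
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")  # run Chrome without opening a window
driver = webdriver.Chrome(options=options, executable_path="/absolute/path/to/chromedriver")
driver.get("https://www.yahoo.co.jp/")
print(driver.title)  # prove the page loaded even though nothing is shown
driver.quit()
```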
## Collecting images from an image search
```python=
from icrawler.builtin import BingImageCrawler

# Search settings
select_word = "ねこ+ロボ"
select_num = 100
# Set the download directory
crawler = BingImageCrawler(storage={"root_dir": "assets/" + select_word})
# Run the Bing search and download: keyword is the query, max_num caps the count
crawler.crawl(keyword=select_word, max_num=select_num)
```
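icrawler ships crawlers for several search engines behind the same interface, so switching engines is a one-line change. A minimal sketch using `GoogleImageCrawler` (the `root_dir` name here is just an example):
```python=
from icrawler.builtin import GoogleImageCrawler

# Same storage/crawl arguments as the Bing version above
crawler = GoogleImageCrawler(storage={"root_dir": "assets/google_test"})
crawler.crawl(keyword="ねこ+ロボ", max_num=10)
```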
## Collecting images from a web page
```python=
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
import os

def get_image(url):
    res = requests.get(url)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'lxml')
    img_tags = soup.select('img')
    for i, img_tag in enumerate(img_tags):
        src = img_tag.get('src')
        if src is None:  # some <img> tags have no src attribute
            continue
        # urljoin resolves relative and protocol-relative URLs against the page URL
        img_url = urljoin(url, src)
        img_res = requests.get(img_url)
        if img_res.status_code == 200:
            file_name = "./assets/moodle/img_" + str(i) + ".jpg"
            with open(file_name, 'wb') as f:
                f.write(img_res.content)

if __name__ == '__main__':
    os.makedirs("./assets/moodle/", exist_ok=True)
    get_image("https://kadai-moodle.kagawa-u.ac.jp/")
```
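The script above saves everything with a `.jpg` extension even though some images may be PNG or GIF. One way around that (a sketch, not part of the original script) is to derive the extension from the response's `Content-Type` header:
```python=
import requests

# Example URL; any direct image link works here
img_res = requests.get("https://www.python.org/static/img/python-logo.png")
content_type = img_res.headers.get("Content-Type", "")  # e.g. "image/png"
ext = "." + content_type.split("/")[-1] if content_type.startswith("image/") else ".jpg"
print(ext)  # -> ".png"
```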
## Trying automated login
* Look at how the page works under the hood
    * Press `Ctrl` + `Shift` + `I` to open the developer tools; the field names used below (`username`, `password`, `loginbtn`) come from inspecting the login form this way
* Set up the JSON file (the script below reads it from `./json/login.json`)
```json
{
    "id": "your student ID here",
    "pswd": "your password here"
}
```
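Since this file will hold your real password, keep it out of anything you share or commit (for example, by adding `json/` to your `.gitignore`).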
* The login Python script
```python=
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import json

def set_up_chrome_driver():
    options = Options()
    chrome_driver = webdriver.Chrome(options=options, executable_path=r'/usr/bin/chromedriver')
    return chrome_driver

def login_moodle(chrome_driver, user_id, pswd):
    chrome_driver.get("https://kadai-moodle.kagawa-u.ac.jp/login/index.php")
    # Field names and the button id come from the developer tools
    chrome_driver.find_element_by_name("username").send_keys(user_id)
    time.sleep(0.5)
    chrome_driver.find_element_by_name("password").send_keys(pswd)
    time.sleep(0.5)
    chrome_driver.find_element_by_id('loginbtn').click()
    time.sleep(0.5)

if __name__ == '__main__':
    with open('./json/login.json', 'r') as open_json:
        session = json.load(open_json)
    user_id = session['id']
    pswd = session['pswd']
    chrome_driver = set_up_chrome_driver()
    login_moodle(chrome_driver, user_id, pswd)
    chrome_driver.quit()
```
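The fixed `time.sleep(0.5)` waits are fragile on a slow connection. Selenium's explicit waits block until an element actually shows up; a minimal sketch that could replace the sleep-then-click around the login button (`chrome_driver` is the driver from the script above):
```python=
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Wait up to 10 seconds for the login button to exist, then click it
button = WebDriverWait(chrome_driver, 10).until(
    EC.presence_of_element_located((By.ID, "loginbtn"))
)
button.click()
```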
## Combining Selenium and Beautiful Soup
```python=
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import json

def set_up_chrome_driver():
    options = Options()
    chrome_driver = webdriver.Chrome(options=options, executable_path=r'/usr/bin/chromedriver')
    return chrome_driver

def login_moodle(chrome_driver, user_id, pswd):
    chrome_driver.get("https://kadai-moodle.kagawa-u.ac.jp/login/index.php")
    chrome_driver.find_element_by_name("username").send_keys(user_id)
    time.sleep(0.5)
    chrome_driver.find_element_by_name("password").send_keys(pswd)
    time.sleep(0.5)
    chrome_driver.find_element_by_id('loginbtn').click()
    time.sleep(0.5)

def get_list_a_tag_href(chrome_driver):
    chrome_driver.get("https://kadai-moodle.kagawa-u.ac.jp/my/")
    # Hand the page Selenium rendered over to Beautiful Soup for parsing
    html_data = chrome_driver.page_source.encode("utf-8")
    soup = BeautifulSoup(html_data, 'html.parser')
    elems = soup.select('a')
    href_list = []
    for elem in elems:
        href_list.append(elem.get('href'))
    print(href_list)

if __name__ == '__main__':
    with open('./json/login.json', 'r') as open_json:
        session = json.load(open_json)
    user_id = session['id']
    pswd = session['pswd']
    chrome_driver = set_up_chrome_driver()
    login_moodle(chrome_driver, user_id, pswd)
    get_list_a_tag_href(chrome_driver)
    chrome_driver.quit()
```
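The printed list contains `None` entries (anchors with no `href`) and plenty of unrelated links. A small filter cleans it up; the `"course"` substring here is just a hypothetical example of what you might match on:
```python=
href_list = ["https://kadai-moodle.kagawa-u.ac.jp/course/view.php?id=1", None, "#"]

# Drop None and keep only links that look like course pages (hypothetical filter)
course_links = [h for h in href_list if h and "course" in h]
print(course_links)
```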
## Turning a table into CSV
```python=
from bs4 import BeautifulSoup
import requests
import os
import pandas as pd
import io

def table_to_df(url):
    res = requests.get(url)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'lxml')
    # Grab the race-results table by its class names
    table = soup.select_one('[class="db_h_race_results nk_tb_common"]')
    # pandas can parse an HTML <table> string straight into a DataFrame
    with io.StringIO(str(table)) as f:
        df = pd.read_html(f)[0]
    return df

def save_df_as_csv(df, csv_file):
    df.to_csv(csv_file)

if __name__ == '__main__':
    os.makedirs("./assets/csv/", exist_ok=True)
    df = table_to_df("https://db.netkeiba.com/horse/2017100720")
    save_df_as_csv(df, "./assets/csv/table.csv")
```
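`to_csv` writes the DataFrame's row index as an extra unnamed first column by default. If you only want the table's own columns, pass `index=False` (shown here against the `df` from the script above):
```python=
df.to_csv("./assets/csv/table.csv", index=False)  # drop the row-number column
```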