# 第2回高木研の勉強会 ## 必要なものをインストール * pip自体をアップグレード ``` $ python3 -m pip install --upgrade pip ``` * Beautiful Soup ``` $ pip install beautifulsoup4 ``` * iclawler ``` $ pip install icrawler ``` * requests ``` $ pip install requests ``` * selenium ``` $ pip uninstall selenium $ pip install selenium==4.1.0 ``` * [Chrome Driver](https://sites.google.com/a/chromium.org/chromedriver/downloads)をDL * google-chromeのversionに合わせる * driverを勉強用のディレクトリに移す * `assets`みたいなディレクトリを作成し、中に入れると直良! ## driverの確認 ```python= from selenium import webdriver from selenium.webdriver.chrome.options import Options import time def set_up_chrome_driver(): options = Options() chrome_driver = webdriver.Chrome(options=options, executable_path=driverの絶対pathを指定) return chrome_driver def move_on_google_home(chrome_driver): chrome_driver.get("https://www.yahoo.co.jp/") time.sleep(0.5) if __name__=='__main__': chrome_driver = set_up_chrome_driver() move_on_google_home(chrome_driver) chrome_driver.quit() ``` ## 画像検索した画像を集めよう ```python= from icrawler.builtin import BingImageCrawler #情報を入力 select_word = "ねこ+ロボ" select_num = 100 #ダウンロード先のフォルダーを指定する crawler = BingImageCrawler(storage={"root_dir": "assets/" + select_word}) #google検索&ダウンロード #検索キーワードとダウンロード数を決定する crawler.crawl(keyword=select_word, max_num=int(select_num)) ``` ## ホームページの画像を集めよう ```python= from bs4 import BeautifulSoup import requests import os def get_image(url): res_race = requests.get(url) res_race.raise_for_status() soup_race = BeautifulSoup(res_race.content, 'lxml') img_tags = soup_race.select('img') for i, img_tag in enumerate(img_tags): if 'https:' in img_tag.get('src'): img_url = img_tag.get('src') else: img_url = "https:" + img_tag.get('src') img_res = requests.get(img_url) img_data =img_res.content if img_res.status_code == 200: file_name = "./assets/moodel/img_" + str(i) + ".jpg" with open(file_name, 'wb') as f: f.write(img_data) if __name__=='__main__': os.makedirs("./assets/moodel/", exist_ok=True) get_image("https://kadai-moodle.kagawa-u.ac.jp/") ``` ## 自動ログインをしてみよう * HomePageの中の仕組みを見よう `Cntr` + `Shift` + `I`を押す * JSONファイルの設定 ```json! { "id":"ここに自分の学籍番号", "pswd":"ここに自分のパスワード" } ``` * loginのpythonファイル ```python= from selenium import webdriver from selenium.webdriver.chrome.options import Options import time import json def set_up_chrome_driver(): options = Options() chrome_driver = webdriver.Chrome(options=options, executable_path=r'/usr/bin/chromedriver') return chrome_driver def login_moodel(chrome_driver,id,pswd): chrome_driver.get("https://kadai-moodle.kagawa-u.ac.jp/login/index.php") chrome_driver.find_element_by_name("username").send_keys(id) time.sleep(0.5) chrome_driver.find_element_by_name("password").send_keys(pswd) time.sleep(0.5) chrome_driver.find_elements_by_id('loginbtn')[0].click() time.sleep(0.5) if __name__=='__main__': open_json = open('./json/login.json','r') session = json.load(open_json) id=session['id'] pswd=session['pswd'] chrome_driver = set_up_chrome_driver() login_moodel(chrome_driver,id,pswd) chrome_driver.quit() ``` ## seleniumとbeutifulsoupを組み合わせ ```python= from selenium import webdriver from selenium.webdriver.chrome.options import Options from bs4 import BeautifulSoup import time import json def set_up_chrome_driver(): options = Options() chrome_driver = webdriver.Chrome(options=options, executable_path=r'/usr/bin/chromedriver') return chrome_driver def login_moodel(chrome_driver,id,pswd): chrome_driver.get("https://kadai-moodle.kagawa-u.ac.jp/login/index.php") chrome_driver.find_element_by_name("username").send_keys(id) time.sleep(0.5) chrome_driver.find_element_by_name("password").send_keys(pswd) time.sleep(0.5) chrome_driver.find_elements_by_id('loginbtn')[0].click() time.sleep(0.5) def get_list_a_tag_href(chrome_driver): chrome_driver.get("https://kadai-moodle.kagawa-u.ac.jp/my/") html_data = chrome_driver.page_source.encode("utf-8") soup = BeautifulSoup(html_data, 'html.parser') elems = soup.select('a') href_list = [] for elem in elems: href = elem.get('href') href_list.append(href) print(href_list) if __name__=='__main__': open_json = open('./json/login.json','r') session = json.load(open_json) id=session['id'] pswd=session['pswd'] chrome_driver = set_up_chrome_driver() login_moodel(chrome_driver,id,pswd) get_list_a_tag_href(chrome_driver) chrome_driver.quit() ``` ## tableをcsv化 ```python= from bs4 import BeautifulSoup import requests import os import pandas as pd import io def table_to_df(url): res_race = requests.get(url) res_race.raise_for_status() soup_horse = BeautifulSoup(res_race.content, 'lxml') table_horse = soup_horse.select('[class="db_h_race_results nk_tb_common"]') with io.StringIO(str(table_horse)) as f: # print(f) df = pd.read_html(f)[0] return df def save_df_as_csv(df,csv_file): df.to_csv(csv_file) if __name__=='__main__': os.makedirs("./assets/csv/", exist_ok=True) df= table_to_df("https://db.netkeiba.com/horse/2017100720") save_df_as_csv(df, "./assets/csv/tabel.csv") ```