# Shopee

## Install Selenium

[selenium-docker](https://github.com/SeleniumHQ/docker-selenium)

Select the ==chrome== version:

```bash
docker run -d --name selenium -p 4444:4444 -p 7900:7900 --shm-size 2g selenium/standalone-chrome:dev
```

Port `4444` is the WebDriver endpoint used by the scripts below; port `7900` serves the noVNC view of the browser.

## Log in to Shopee & save cookies

#### Set up Selenium

```python=
import json
import os

# Fill in your credentials here (or export them in the shell instead).
# Do not commit real credentials.
os.environ["SHOPEE_USERNAME"] = ""
os.environ["SHOPEE_PASSWORD"] = ""

import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

options = webdriver.ChromeOptions()
# 2 = block: suppress the browser's "allow notifications?" prompt.
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)
# options.add_argument('blink-settings=imagesEnabled=false')  # disable image loading
# options.add_argument('headless')  # headless mode

driver = webdriver.Remote(
    command_executor='http://localhost:4444/wd/hub',
    options=options
)
```

#### Log in to Shopee

```python=
driver.get('https://shopee.tw/buyer/login?next=https%3A%2F%2Fshopee.tw%2F')

# Explicit wait: block until each field can actually receive input,
# not merely until it exists in the DOM.
wait = WebDriverWait(driver, 10)

# NOTE: these absolute XPaths mirror the page layout at the time of writing
# and will break when Shopee changes its markup.
username_elem = wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div/div[2]/div/div/div/div[2]/form/div/div[2]/div[2]/div[1]/input')))
username_elem.send_keys(os.environ['SHOPEE_USERNAME'])

password_elem = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main"]/div/div[2]/div/div/div/div[2]/form/div/div[2]/div[3]/div[1]/input')))
password_elem.send_keys(os.environ['SHOPEE_PASSWORD'])

# Wait until the login button is visible and enabled before clicking it.
wait = WebDriverWait(driver, 5)
login_btn = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main"]/div/div[2]/div/div/div/div[2]/form/div/div[2]/button')))
login_btn.click()
```

#### Check message

Open [`http://localhost:7900`](http://localhost:7900) (noVNC; the default password of the docker-selenium image is `secret`) to check the login message and complete any verification step in the browser before saving cookies.

#### Save cookies

```python=
# `json` is imported in the set-up cell above.
cookies = driver.get_cookies()
with open("cookies.json", "w") as f:
    json.dump(cookies, f)
```

`cookies.json` holds your logged-in session — keep it private and out of version control.

#### 
Quit Selenium

```python=
driver.quit()
```

## Start scraping

For every keyword the script opens the search page, visits up to `MAX_ITEMS` results one by one, and writes the collected title / specification / description to `./output/<keyword>.csv`.

```python=
import os
import requests
import json
import pandas as pd
from tqdm.auto import tqdm
from time import sleep
from urllib.parse import quote
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Search terms to scrape — example value, replace with your own list.
keywords = ["膠原蛋白"]
# Maximum number of search results to open per keyword.
MAX_ITEMS = 10

# NOTE: these XPaths mirror the product-page layout at the time of writing
# and will break when Shopee changes its markup.
ITEM_XPATH = '//*[@data-sqe="item"]'
LINK_XPATH = './/a[@data-sqe="link"]'
TITLE_XPATH = '//*[@id="main"]/div/div[2]/div[1]/div[1]/div/div[2]/section[1]/section[2]/div/div[1]/span'
STANDARD_XPATH = '//*[@id="main"]/div/div[2]/div[1]/div[1]/div/div[*]/div[*]/div/div[1]/div[1]/section[1]'
DESCRIPTION_XPATH = '//*[@id="main"]/div/div[2]/div[1]/div[1]/div/div[*]/div[*]/div/div[1]/div[1]/section[2]'

# Set up Selenium
options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)
# options.add_argument('blink-settings=imagesEnabled=false')
# options.add_argument('headless')

driver = webdriver.Remote(
    command_executor='http://localhost:4444/wd/hub',
    options=options
)

try:
    # Cookies can only be added for the domain that is currently loaded,
    # so open the site first.
    driver.get('https://shopee.tw/')

    # Load the cookies
    with open("cookies.json", "r") as f:
        cookies = json.load(f)

    for cookie in cookies:
        driver.add_cookie(cookie)

    os.makedirs('./output', exist_ok=True)

    for keyword in tqdm(keywords):
        # Percent-encode the keyword so spaces and non-ASCII terms form a valid URL.
        search_url = f'https://shopee.tw/search?keyword={quote(keyword)}'
        driver.get(search_url)

        data_list = []

        for idx in range(MAX_ITEMS):
            try:
                # Re-locate the result cards on every pass: the references from
                # the previous pass go stale after navigating away and back.
                blocks = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.XPATH, ITEM_XPATH))
                )
                if idx >= len(blocks):
                    break  # fewer results than MAX_ITEMS

                link_element = blocks[idx].find_element(By.XPATH, LINK_XPATH)
                link_element.click()

                sleep(5)  # Give a little pause for the page to load completely

                title = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, TITLE_XPATH))
                ).text

                standard = WebDriverWait(driver, 15).until(
                    EC.visibility_of_element_located((By.XPATH, STANDARD_XPATH))
                ).text

                description = WebDriverWait(driver, 15).until(
                    EC.visibility_of_element_located((By.XPATH, DESCRIPTION_XPATH))
                ).text

                data_list.append({
                    'Title': title,
                    'Standard': standard,
                    'Description': description
                })

                driver.back()
                sleep(2)  # Give a little pause for the page to load completely

            except Exception as e:
                # Best effort per item: report, then return to the search
                # results so the next item does not start from a product page.
                print(f"Error occurred while processing item {idx+1}: {e}")
                driver.get(search_url)

        # Write one CSV per keyword, once all of its items have been visited.
        if data_list:
            df = pd.DataFrame(data_list)
            df.to_csv(f'./output/{keyword}.csv', encoding='utf-8', index=False)
finally:
    # Always release the browser session, even if the run aborts.
    driver.quit()
```

Example output:

```
                                               Title  \
0  專利天然酵母NMN+日本專利穀胱甘肽+法國西印度櫻桃維他命C【青春專科】 [現貨供應] VI...
1  大醫生技圓酵母穀胱甘肽膠囊 30顆[買3送1買6送3]另添神經醯胺(賽洛美) 莓果多酚 維他...
2  【現貨速發】美 白 產品 素顏 曬黑 膠原蛋白 煥活美肌 亮白 防曬 提亮肌膚 皮膚保養 維...
3  嚴萃【日本膠原蛋白】NIPPI 膠原蛋白粉 蜂王乳 賽洛美 維生素C 玻尿酸 胜肽 膠原蛋白...

                                         Description
0  🌱食用方法:一日2顆,可於任一餐飯後補充(不需分開吃)\n🌱補充注意事項:\n1. 18 歲...
1  🍒優惠組合:同商品買3送1、買6送3(下單"買"的數量即可) \n🍒優惠組合的贈品,系統自動...
2  💞規格:1包60粒\n💞使用方法:每日4粒,晚餐前各使用2粒\n💞儲存方法:陰涼乾燥\n15...
3  💡限時多盒優惠 關注再領折扣\n每個女人,都值得更好的愛自己!\n美妍首選👉【日本膠原蛋白】...
```