# Shopee
## Install Selenium
Use the [docker-selenium](https://github.com/SeleniumHQ/docker-selenium) images and select the ==chrome== version.
```bash
# Start a standalone Chrome Selenium server in the background.
# Port 4444 is the WebDriver endpoint the Python snippets below connect to;
# port 7900 is opened in a browser later on to watch the session.
docker run -d --name selenium -p 4444:4444 -p 7900:7900 --shm-size 2g selenium/standalone-chrome:dev
```
## Log in to Shopee & save cookies
#### Set up Selenium
```python=
import os

import pandas as pd
import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Shopee credentials. The login step reads them back from the environment,
# so fill both values in before running it.
os.environ["SHOPEE_USERNAME"] = ""
os.environ["SHOPEE_PASSWORD"] = ""

# Chrome preferences: notification content setting = 2
# (presumably "block", so no permission pop-up appears -- confirm against Chrome docs).
prefs = {"profile.default_content_setting_values.notifications": 2}

options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", prefs)
# Optional switches, disabled by default:
# options.add_argument('blink-settings=imagesEnabled=false')  # do not load images
# options.add_argument('headless')  # headless mode

# Connect to the Selenium server started by the docker command above.
driver = webdriver.Remote(
    command_executor='http://localhost:4444/wd/hub',
    options=options,
)
```
#### Login Shopee
```python=
# Open the Shopee login page; after a successful login it redirects to the home page.
driver.get('https://shopee.tw/buyer/login?next=https%3A%2F%2Fshopee.tw%2F')

# One explicit wait (up to 10 s) shared by every element lookup below.
wait = WebDriverWait(driver, 10)

# Indexing os.environ raises a KeyError naming the missing variable,
# instead of handing None to send_keys when a credential is not set.
username_elem = wait.until(EC.presence_of_element_located(
    (By.XPATH, '/html/body/div[1]/div/div[2]/div/div/div/div[2]/form/div/div[2]/div[2]/div[1]/input')
))
username_elem.send_keys(os.environ['SHOPEE_USERNAME'])

password_elem = wait.until(EC.presence_of_element_located(
    (By.XPATH, '//*[@id="main"]/div/div[2]/div/div/div/div[2]/form/div/div[2]/div[3]/div[1]/input')
))
password_elem.send_keys(os.environ['SHOPEE_PASSWORD'])

# Wait until the button is visible and enabled before clicking, rather than
# grabbing it immediately with a bare find_element.
login_btn = wait.until(EC.element_to_be_clickable(
    (By.XPATH, '//*[@id="main"]/div/div[2]/div/div/div/div[2]/form/div/div[2]/button')
))
login_btn.click()
```
#### Check message
Open [`http://localhost:7900`](http://localhost:7900) to watch the browser session and check the login message.
#### Save cookies
```python=
import json  # the setup snippet for this session does not import json

# Dump the logged-in session's cookies to disk so the scraping script can
# load them instead of logging in again.
cookies = driver.get_cookies()
with open("cookies.json", "w", encoding="utf-8") as f:
    json.dump(cookies, f)
```
#### Quit Selenium
```python=
# Quit the driver, ending the browser session opened for the login.
driver.quit()
```
## Start scraping
```python=
import json
import os
from time import sleep
from urllib.parse import quote

import pandas as pd
import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.auto import tqdm

# Search terms to scrape; one CSV is written per keyword.
# Fill this in before running, e.g. ["collagen", "vitamin c"].
keywords = []

MAX_ITEMS = 10           # number of search results to visit per keyword
OUTPUT_DIR = './output'  # one <keyword>.csv per keyword is written here

# Locators for the search-result page and the product page.
ITEM_XPATH = '//*[@data-sqe="item"]'
LINK_XPATH = './/a[@data-sqe="link"]'
TITLE_XPATH = '//*[@id="main"]/div/div[2]/div[1]/div[1]/div/div[2]/section[1]/section[2]/div/div[1]/span'
STANDARD_XPATH = '//*[@id="main"]/div/div[2]/div[1]/div[1]/div/div[*]/div[*]/div/div[1]/div[1]/section[1]'
DESCRIPTION_XPATH = '//*[@id="main"]/div/div[2]/div[1]/div[1]/div/div[*]/div[*]/div/div[1]/div[1]/section[2]'

# Set up Selenium
options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)
# options.add_argument('blink-settings=imagesEnabled=false')
# options.add_argument('headless')
driver = webdriver.Remote(
    command_executor='http://localhost:4444/wd/hub',
    options=options,
)

# try/finally so the remote session is always released, even when a step
# below raises; otherwise the session stays open on the Selenium server.
try:
    # Cookies can only be added for the domain currently loaded in the browser,
    # so open the site first.
    driver.get('https://shopee.tw/')

    # Load the cookies saved by the login step.
    with open("cookies.json", "r", encoding="utf-8") as f:
        cookies = json.load(f)
    for cookie in cookies:
        driver.add_cookie(cookie)

    # to_csv does not create missing directories.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for keyword in tqdm(keywords):
        # Percent-encode the keyword so spaces, '&', '#', etc. do not corrupt the URL.
        search_url = f'https://shopee.tw/search?keyword={quote(keyword, safe="")}'
        driver.get(search_url)

        data_list = []
        for index in range(MAX_ITEMS):
            # The result elements must be looked up again on every pass:
            # navigating to a product page and back invalidates old references.
            try:
                blocks = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.XPATH, ITEM_XPATH))
                )
            except TimeoutException:
                print(f"No search results loaded for {keyword!r}; skipping remaining items")
                break

            # Fewer results than MAX_ITEMS: nothing left to visit.
            if index >= len(blocks):
                break

            try:
                link_element = blocks[index].find_element(By.XPATH, LINK_XPATH)
                link_element.click()
                sleep(5)  # Give a little pause for the page to load completely

                title = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, TITLE_XPATH))
                ).text
                standard = WebDriverWait(driver, 15).until(
                    EC.visibility_of_element_located((By.XPATH, STANDARD_XPATH))
                ).text
                description = WebDriverWait(driver, 15).until(
                    EC.visibility_of_element_located((By.XPATH, DESCRIPTION_XPATH))
                ).text

                data_list.append({
                    'Title': title,
                    'Standard': standard,
                    'Description': description
                })

                driver.back()
                sleep(2)  # Give a little pause for the page to load completely
            except WebDriverException as e:
                print(f"Error occurred while processing item {index+1}: {e}")
                # The browser may be stuck on the product page; reload the
                # search results so the next item starts from a known state.
                driver.get(search_url)

        # Write the CSV once per keyword, and only when something was scraped.
        if data_list:
            df = pd.DataFrame(data_list)
            df.to_csv(f'{OUTPUT_DIR}/{keyword}.csv', encoding='utf-8', index=False)
finally:
    driver.quit()
```
```
Title \
0 專利天然酵母NMN+日本專利穀胱甘肽+法國西印度櫻桃維他命C【青春專科】 [現貨供應] VI...
1 大醫生技圓酵母穀胱甘肽膠囊 30顆[買3送1買6送3]另添神經醯胺(賽洛美) 莓果多酚 維他...
2 【現貨速發】美 白 產品 素顏 曬黑 膠原蛋白 煥活美肌 亮白 防曬 提亮肌膚 皮膚保養 維...
3 嚴萃【日本膠原蛋白】NIPPI 膠原蛋白粉 蜂王乳 賽洛美 維生素C 玻尿酸 胜肽 膠原蛋白...
Description
0 🌱食用方法:一日2顆,可於任一餐飯後補充(不需分開吃)\n🌱補充注意事項:\n1. 18 歲...
1 🍒優惠組合:同商品買3送1、買6送3(下單"買"的數量即可) \n🍒優惠組合的贈品,系統自動...
2 💞規格:1包60粒\n💞使用方法:每日4粒,晚餐前各使用2粒\n💞儲存方法:陰涼乾燥\n15...
3 💡限時多盒優惠 關注再領折扣\n每個女人,都值得更好的愛自己!\n美妍首選👉【日本膠原蛋白】...
```