# python data seo analysis
Csv file
https://drive.google.com/open?id=181kBDudGYxBMEYUZT6BTv4hmMwNKtU63
我已經完成整個項目了,請參考。如果有可以改進的地方,希望可以分享給我~~
* def pivot_date_url =>完成Impressions, str to int,(read_csv後直接轉型態)
* def get_x_and_y => 準備好線性回歸的 datax, datay
* linear_regression_slope => 線性回歸,丟出斜率
型態的問題處理比較久

```python=
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Enable inline matplotlib rendering via the IPython line magic
# (get_ipython is supplied by the IPython/Jupyter runtime, not imported here).
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['font.family']='DFKai-SB' # font chosen so Chinese text displays in plots (for Win10)
# https://vimsky.com/zh-tw/examples/detail/python-method-pandas.pivot_table.html
# https://www.codecademy.com/learn/data-processing-pandas/modules/dspath-intro-pandas
# gsc data source format
# column: Date(is Object;str), Page(is Object;str), Clicks(is Int), Impressions(is Object;str), CTR(is Object;str), Position(is float)
def pivot_date_url(file_path, value_matric='Impressions'):
    """Load a GSC export CSV and pivot it into a Date x Page table.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.
        value_matric: Name of the column whose values fill the pivot table.

    Returns:
        A tuple ``(pivot, pages)`` where ``pivot`` is a DataFrame indexed by
        ``Date`` with one column per ``Page`` (missing combinations filled
        with 0) and ``pages`` is the list of page column labels.
    """
    gsc_df = pd.read_csv(file_path, encoding='utf8')
    print('各column type')
    print(gsc_df.dtypes)

    # Impressions is read as text (e.g. "1,423") only when some value carries
    # a thousands separator; if pandas already parsed it as a number the
    # .str accessor would raise, so the cleanup is applied conditionally.
    if ('Impressions' in gsc_df.columns
            and not pd.api.types.is_numeric_dtype(gsc_df['Impressions'])):
        gsc_df['Impressions'] = (
            gsc_df['Impressions']
            .astype(str)
            .str.replace(',', '', regex=False)
            .astype(int)
        )

    # Pivot: rows are dates, columns are pages, duplicates are summed.
    # The string 'sum' avoids the deprecation of passing the builtin.
    gsc_df_date_url = gsc_df.pivot_table(
        values=value_matric, index='Date', columns='Page', aggfunc='sum'
    )
    # A page with no row on a given date counts as 0, not NaN.
    gsc_df_date_url = gsc_df_date_url.fillna(0)
    return gsc_df_date_url, list(gsc_df_date_url.columns)
# codertw.com/程式語言/462517 有dataframe抓資料的方法,loc、iloc
def get_x_and_y(gsc_df_date_url, page='https://starthealthy.nestle.com.tw/',
                start_date='2019-10-01', end_date='2019-12-31'):
    """Build the x / y series for a regression over one page's date range.

    Args:
        gsc_df_date_url: Pivot table indexed by date label with one column
            per page.
        page: Column label of the page to extract.
        start_date: First index label of the range (inclusive).
        end_date: Last index label of the range (inclusive).

    Returns:
        A tuple ``(datax, datay)``: ``datax`` is the list ``[1, 2, ..., n]``
        and ``datay`` is the selected column slice cast to int, where ``n``
        is the number of rows in the slice.
    """
    # Label-based slice of the page column; cast to int because the pivot
    # may hold floats after fillna.
    datay = gsc_df_date_url.loc[start_date:end_date, page].astype(int)

    # Number the observations from the slice itself so x and y always have
    # the same length, even when a boundary date is not an exact index
    # label or when start_date equals end_date.
    datax = list(range(1, len(datay) + 1))
    return datax, datay
def linear_regression_slope(datax, datay, exponential=1):
    """Fit a polynomial to the points and report its leading coefficient.

    Args:
        datax: Sequence of x values.
        datay: Sequence of y values, same length as ``datax``.
        exponential: Degree of the polynomial passed to ``np.polyfit``.

    Returns:
        A tuple ``(leading_coefficient, coefficients, poly)`` where
        ``coefficients`` is the array returned by ``np.polyfit`` (highest
        power first) and ``poly`` is the matching ``np.poly1d``. With the
        default degree of 1 the leading coefficient is the line's slope.
    """
    xs = np.array(datax)
    ys = np.array(datay)
    fit = np.polyfit(xs, ys, exponential)
    return fit[0], fit, np.poly1d(fit)
# In[2]:
def export_slope_as_df(file_path, value_matric='Impressions',
                       start_date='2019-10-01', end_date='2019-12-31'):
    """Compute a regression result for every page in a GSC export.

    Args:
        file_path: CSV path forwarded to ``pivot_date_url``.
        value_matric: Metric column to pivot and regress on.
        start_date: First date label of the regression window (inclusive).
        end_date: Last date label of the regression window (inclusive).

    Returns:
        A dict mapping each page to the tuple returned by
        ``linear_regression_slope`` (element 0 is the slope). The dict also
        contains the seed entry ``"page": "slope"``; despite the function
        name, no DataFrame is built.
    """
    # Forward the requested metric instead of always using the default.
    gsc_df, page_list = pivot_date_url(file_path, value_matric)
    print(start_date + end_date)

    # The header-like seed entry is kept so existing consumers of the dict
    # see the same keys as before.
    slopes = {"page": "slope"}
    for page in page_list:
        # Forward the requested window; previously the defaults were always
        # used regardless of the arguments.
        datax, datay = get_x_and_y(gsc_df, page, start_date, end_date)
        slopes[page] = linear_regression_slope(datax, datay)
    return slopes
# In[3]:
# Run the full pipeline on the Nestle export; values are regression tuples
# whose first element is the slope.
nestle_slope_dict = export_slope_as_df(
    "Nestle_181001_Raw_2.csv"
)
# In[4]:
# Dump the whole {page: result} mapping.
print('列出所有資料,用{page:slope}的形式,dictionary')
print(nestle_slope_dict)
# In[5]:
# Show only the slope for the section1 page.
print('列出以下網址的斜率,https://starthealthy.nestle.com.tw/section1.aspx')
section1_result = nestle_slope_dict['https://starthealthy.nestle.com.tw/section1.aspx']
print(section1_result[0])
# In[6]:
```
黃底就是斜率

```python=
if isinstance(gsc_df_date_url.loc[end_date,page], str):
print(type(gsc_df_date_url.loc[end_date,page]))
datay = gsc_df_date_url.loc[start_date:end_date,page].astype(int)
#檢查datay是否為str:是的話轉成int
else:
datay = gsc_df_date_url.loc[start_date:end_date,page]
#gsc是一個dataframe, 使用行、列的key搭配loc抓取特定範圍內的資料
#https://www.geeksforgeeks.org/python-check-if-a-variable-is-string/
```
可以考慮改成
```python=
# datay 需要 int 型別,否則 np.array() 出錯
datay = gsc_df_date_url.loc[start_date:end_date,page].astype(int)
# gsc是一個dataframe, 使用行、列的key搭配loc抓取特定範圍內的資料
# https://www.geeksforgeeks.org/python-check-if-a-variable-is-string/
```