# 10.Friday, 26-07-2019, Scrape - Clean Data - Draw Chart ### Scrape We can take a TAG SPAN in another SPAN: Ex: - In a list tag product_itemT - Take a span have STYLE, in another SPAN with class is 'rating-content' - Split this string by ':' and take the park with index 1 and replace '%' by a blank to take only the number ``` <span class="rating-content"> <span style="width:80%"> </span> </span> rating = product_item.select_one(".rating-content span")["style"].split(':')[1].replace("%", "") ``` Check a character is number or not Ex: - Take a TAG with name review - Split() default is a blank: ['100','nhan','xet'] - Take the first element [0]: 100 - Take from second position [1:]: 1 (to check when a product have "Chua co nhan xet") ``` <p class="review">(100 nhan xet)</p> if product_item.select_one(".review").text.split()[0][1:].isdigit(): review = product_item.select_one(".review").text.split()[0][1:] ``` Check if a string exist or not ``` if '<i class="tikicon icon-tikinow-20"></i>' in str(product_item): tiki_now = 1 ``` ### Clean Data 1. Get all data and pour into a table (raw data) 2. Create new table with ROW_NUMBER = 1 to remove duplicated data (1 second) 3. Continue to clean data in this table: change data type, change column data type, .... 4. Pour the cleaned data to main table 5. Create PK ``` --CLEANSING DATA -----CREATE NEW TABLE WITH CLEAN DATA CREATE TABLE pro_tmp AS WITH CTE AS( SELECT *, ROW_NUMBER() OVER( PARTITION BY productid ORDER BY productid DESC ) AS num FROM products_bkup ) SELECT * FROM CTE WHERE num = 1; -----CHANGE DATA TYPE UPDATE pro_tmp SET productid = CAST(productid AS INTEGER); ALTER TABLE pro_tmp ALTER COLUMN productid TYPE INTEGER USING productid::INTEGER; -- UPDATE pro_tmp SET price = CAST(CASE WHEN COALESCE(price,'') = '' THEN '0' ELSE price END AS FLOAT); ALTER TABLE pro_tmp ALTER COLUMN price TYPE DOUBLE PRECISION USING price::DOUBLE PRECISION; -- UPDATE pro_tmp SET seller_id = CAST(seller_id AS INTEGER); ALTER TABLE pro_tmp ALTER COLUMN seller_id TYPE INTEGER USING seller_id::INTEGER; -- ALTER TABLE pro_tmp ALTER COLUMN tiki_now TYPE INTEGER USING tiki_now::INTEGER; ALTER TABLE pro_tmp ALTER COLUMN rating TYPE INTEGER USING rating::INTEGER; ALTER TABLE pro_tmp ALTER COLUMN comment TYPE INTEGER USING comment::INTEGER; -----MAIN TABLE CREATE TABLE Products AS SELECT productid ,seller_id ,productname ,price ,brand ,image ,tiki_now ,rating ,comment ,category FROM pro_tmp; ALTER TABLE Products ADD PRIMARY KEY (productid); ``` ### Draw Chart ``` import matplotlib.pyplot as plt from matplotlib.pyplot import figure import plotly import chart_studio chart_studio.tools.set_credentials_file(username='thechain', api_key='Tb4kz7txNipqO2nmLwaB') import chart_studio.plotly as py import plotly.graph_objs as go import plotly.io as pio import plotly.express as px ``` - Prepare needed SQL - Execute and pour result into a list - Set this list into a DataFrame Set the size of chart ``` plt.figure(figsize=(20,20)) ``` - Bar Chart ``` df = pd.DataFrame( [[ij for ij in i] for i in data] ) #create DataFrame from result of query df.rename(columns={0: 'Category', 1: 'Item'}, inplace=True); #Rename column in DataFrame df = df.sort_values(['Item'], ascending=[1]); #Sort in DataFrame fig = px.bar(df, x='Category', y='Item', hover_data=['Category', 'Item'], color='Item', labels={'pop':'Amount of products in each category'}, height=600) fig.show() ``` - Multi Bar Chart (Group) ``` df = pd.DataFrame( [[ij for ij in i] for i in data] ) #create DataFrame from result of query df.rename(columns={0: 'Category', 1: 'Item', 2: 'TIKINOW'}, inplace=True); #Rename column in DataFrame df = df.sort_values(['Item'], ascending=[1]); #Sort in DataFrame # Draw Chart fig = go.Figure() # Add 2 bar charts together fig.add_trace(go.Bar( x=df['Category'], y=df['Item'], name='Item', marker_color='indigo' ) ) fig.add_trace(go.Bar( x=df['Category'], y=df['TIKINOW'], name='Tiki NOW', marker_color='blueviolet' ) ) fig.update_layout(barmode='group') fig.show() ``` - Bubble Chart ``` df = pd.DataFrame( [[ij for ij in i] for i in data] ) #create DataFrame from result of query df.rename(columns={0: 'Category', 1: 'Item', 2: 'Star', 3: 'Price'}, inplace=True); #Rename column in DataFrame fig = px.scatter(df, x="Price", y="Star", size="Item", color="Category", hover_name="Item", log_x=True, size_max=60) fig.show() ``` - Stack Bar Chart ``` df = pd.DataFrame( [[ij for ij in i] for i in data] ) #create DataFrame from result of query df.rename(columns={0: 'Category', 1: 'Star', 2: 'Comment'}, inplace=True); df = df.sort_values(['Star'], ascending=[0]); fig = go.Figure() fig.add_trace(go.Bar( y=df['Category'], x=df['Star'], name='Rating', orientation='h', marker=dict( color='rgba(246, 78, 139, 0.6)', line=dict(color='rgba(246, 78, 139, 1.0)', width=3) ) ) ) fig.add_trace(go.Bar( y=df['Category'], x=df['Comment'], name='Interesting', orientation='h', marker=dict( color='rgba(58, 71, 80, 0.6)', line=dict(color='rgba(58, 71, 80, 1.0)', width=3) ) ) ) fig.update_layout(barmode='stack') fig.show() ```