# 10.Friday, 26-07-2019, Scrape - Clean Data - Draw Chart
### Scrape
We can take a TAG SPAN in another SPAN:
Ex:
- In a list tag product_itemT
- Take a span have STYLE, in another SPAN with class is 'rating-content'
- Split this string by ':' and take the park with index 1 and replace '%' by a blank to take only the number
```
<span class="rating-content">
<span style="width:80%">
</span>
</span>
rating = product_item.select_one(".rating-content span")["style"].split(':')[1].replace("%", "")
```
Check a character is number or not
Ex:
- Take a TAG with name review
- Split() default is a blank: ['100','nhan','xet']
- Take the first element [0]: 100
- Take from second position [1:]: 1 (to check when a product have "Chua co nhan xet")
```
<p class="review">(100 nhan xet)</p>
if product_item.select_one(".review").text.split()[0][1:].isdigit():
review = product_item.select_one(".review").text.split()[0][1:]
```
Check if a string exist or not
```
if '<i class="tikicon icon-tikinow-20"></i>' in str(product_item):
tiki_now = 1
```
### Clean Data
1. Get all data and pour into a table (raw data)
2. Create new table with ROW_NUMBER = 1 to remove duplicated data (1 second)
3. Continue to clean data in this table: change data type, change column data type, ....
4. Pour the cleaned data to main table
5. Create PK
```
--CLEANSING DATA
-----CREATE NEW TABLE WITH CLEAN DATA
CREATE TABLE pro_tmp AS
WITH CTE AS(
SELECT *, ROW_NUMBER() OVER( PARTITION BY productid ORDER BY productid DESC ) AS num
FROM products_bkup
)
SELECT * FROM CTE WHERE num = 1;
-----CHANGE DATA TYPE
UPDATE pro_tmp SET productid = CAST(productid AS INTEGER);
ALTER TABLE pro_tmp ALTER COLUMN productid TYPE INTEGER USING productid::INTEGER;
--
UPDATE pro_tmp SET price = CAST(CASE WHEN COALESCE(price,'') = '' THEN '0' ELSE price END AS FLOAT);
ALTER TABLE pro_tmp ALTER COLUMN price TYPE DOUBLE PRECISION USING price::DOUBLE PRECISION;
--
UPDATE pro_tmp SET seller_id = CAST(seller_id AS INTEGER);
ALTER TABLE pro_tmp ALTER COLUMN seller_id TYPE INTEGER USING seller_id::INTEGER;
--
ALTER TABLE pro_tmp ALTER COLUMN tiki_now TYPE INTEGER USING tiki_now::INTEGER;
ALTER TABLE pro_tmp ALTER COLUMN rating TYPE INTEGER USING rating::INTEGER;
ALTER TABLE pro_tmp ALTER COLUMN comment TYPE INTEGER USING comment::INTEGER;
-----MAIN TABLE
CREATE TABLE Products AS
SELECT productid
,seller_id
,productname
,price
,brand
,image
,tiki_now
,rating
,comment
,category
FROM pro_tmp;
ALTER TABLE Products ADD PRIMARY KEY (productid);
```
### Draw Chart
```
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import plotly
import chart_studio
chart_studio.tools.set_credentials_file(username='thechain', api_key='Tb4kz7txNipqO2nmLwaB')
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px
```
- Prepare needed SQL
- Execute and pour result into a list
- Set this list into a DataFrame
Set the size of chart
```
plt.figure(figsize=(20,20))
```
- Bar Chart
```
df = pd.DataFrame( [[ij for ij in i] for i in data] ) #create DataFrame from result of query
df.rename(columns={0: 'Category', 1: 'Item'}, inplace=True); #Rename column in DataFrame
df = df.sort_values(['Item'], ascending=[1]); #Sort in DataFrame
fig = px.bar(df, x='Category', y='Item',
hover_data=['Category', 'Item'], color='Item',
labels={'pop':'Amount of products in each category'}, height=600)
fig.show()
```
- Multi Bar Chart (Group)
```
df = pd.DataFrame( [[ij for ij in i] for i in data] ) #create DataFrame from result of query
df.rename(columns={0: 'Category', 1: 'Item', 2: 'TIKINOW'}, inplace=True); #Rename column in DataFrame
df = df.sort_values(['Item'], ascending=[1]); #Sort in DataFrame
# Draw Chart
fig = go.Figure()
# Add 2 bar charts together
fig.add_trace(go.Bar( x=df['Category'],
y=df['Item'],
name='Item',
marker_color='indigo'
)
)
fig.add_trace(go.Bar( x=df['Category'],
y=df['TIKINOW'],
name='Tiki NOW',
marker_color='blueviolet'
)
)
fig.update_layout(barmode='group')
fig.show()
```
- Bubble Chart
```
df = pd.DataFrame( [[ij for ij in i] for i in data] ) #create DataFrame from result of query
df.rename(columns={0: 'Category', 1: 'Item', 2: 'Star', 3: 'Price'}, inplace=True); #Rename column in DataFrame
fig = px.scatter(df, x="Price", y="Star",
size="Item",
color="Category",
hover_name="Item",
log_x=True, size_max=60)
fig.show()
```
- Stack Bar Chart
```
df = pd.DataFrame( [[ij for ij in i] for i in data] ) #create DataFrame from result of query
df.rename(columns={0: 'Category', 1: 'Star', 2: 'Comment'}, inplace=True);
df = df.sort_values(['Star'], ascending=[0]);
fig = go.Figure()
fig.add_trace(go.Bar( y=df['Category'],
x=df['Star'],
name='Rating',
orientation='h',
marker=dict( color='rgba(246, 78, 139, 0.6)',
line=dict(color='rgba(246, 78, 139, 1.0)', width=3)
)
)
)
fig.add_trace(go.Bar( y=df['Category'],
x=df['Comment'],
name='Interesting',
orientation='h',
marker=dict( color='rgba(58, 71, 80, 0.6)',
line=dict(color='rgba(58, 71, 80, 1.0)', width=3)
)
)
)
fig.update_layout(barmode='stack')
fig.show()
```