# Exploratory Data Analysis - Example
# Step 0: Imports and Reading Data
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the "ggplot" stylesheet to every matplotlib figure drawn below
plt.style.use("ggplot")
# Let pandas display up to 200 columns before truncating wide frames
pd.options.display.max_columns = 200
```
```python
# Load the coaster dataset; the path is relative to the working directory
df = pd.read_csv("data/coaster_db.csv")
```
# Step 1: Data Understanding
```python
# Dimensions of the raw table as (rows, columns)
df.shape
```
(1087, 56)
```python
# Preview the first five rows of the raw table
df.head()
```
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>coaster_name</th>
<th>Length</th>
<th>Speed</th>
<th>Location</th>
<th>Status</th>
<th>Opening date</th>
<th>Type</th>
<th>Manufacturer</th>
<th>Height restriction</th>
<th>Model</th>
<th>Height</th>
<th>Inversions</th>
<th>Lift/launch system</th>
<th>Cost</th>
<th>Trains</th>
<th>Park section</th>
<th>Duration</th>
<th>Capacity</th>
<th>G-force</th>
<th>Designer</th>
<th>Max vertical angle</th>
<th>Drop</th>
<th>Soft opening date</th>
<th>Fast Lane available</th>
<th>Replaced</th>
<th>Track layout</th>
<th>Fastrack available</th>
<th>Soft opening date.1</th>
<th>Closing date</th>
<th>Opened</th>
<th>Replaced by</th>
<th>Website</th>
<th>Flash Pass Available</th>
<th>Must transfer from wheelchair</th>
<th>Theme</th>
<th>Single rider line available</th>
<th>Restraint Style</th>
<th>Flash Pass available</th>
<th>Acceleration</th>
<th>Restraints</th>
<th>Name</th>
<th>year_introduced</th>
<th>latitude</th>
<th>longitude</th>
<th>Type_Main</th>
<th>opening_date_clean</th>
<th>speed1</th>
<th>speed2</th>
<th>speed1_value</th>
<th>speed1_unit</th>
<th>speed_mph</th>
<th>height_value</th>
<th>height_unit</th>
<th>height_ft</th>
<th>Inversions_clean</th>
<th>Gforce_clean</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>Switchback Railway</td>
<td>600 ft (180 m)</td>
<td>6 mph (9.7 km/h)</td>
<td>Coney Island</td>
<td>Removed</td>
<td>June 16, 1884</td>
<td>Wood</td>
<td>LaMarcus Adna Thompson</td>
<td>NaN</td>
<td>Lift Packed</td>
<td>50 ft (15 m)</td>
<td>NaN</td>
<td>gravity</td>
<td>NaN</td>
<td>NaN</td>
<td>Coney Island Cyclone Site</td>
<td>1:00</td>
<td>1600 riders per hour</td>
<td>2.9</td>
<td>LaMarcus Adna Thompson</td>
<td>30°</td>
<td>43 ft (13 m)</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>Gravity pulled coaster</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1884</td>
<td>40.5740</td>
<td>-73.9780</td>
<td>Wood</td>
<td>1884-06-16</td>
<td>6 mph</td>
<td>9.7 km/h</td>
<td>6.0</td>
<td>mph</td>
<td>6.0</td>
<td>50.0</td>
<td>ft</td>
<td>NaN</td>
<td>0</td>
<td>2.9</td>
</tr>
<tr>
<th>1</th>
<td>Flip Flap Railway</td>
<td>NaN</td>
<td>NaN</td>
<td>Sea Lion Park</td>
<td>Removed</td>
<td>1895</td>
<td>Wood</td>
<td>Lina Beecher</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1.0</td>
<td>NaN</td>
<td>NaN</td>
<td>a single car. Riders are arranged 1 across in ...</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>12</td>
<td>Lina Beecher</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1902</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1895</td>
<td>40.5780</td>
<td>-73.9790</td>
<td>Wood</td>
<td>1895-01-01</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1</td>
<td>12.0</td>
</tr>
<tr>
<th>2</th>
<td>Switchback Railway (Euclid Beach Park)</td>
<td>NaN</td>
<td>NaN</td>
<td>Cleveland, Ohio, United States</td>
<td>Closed</td>
<td>NaN</td>
<td>Other</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1895</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1896</td>
<td>41.5800</td>
<td>-81.5700</td>
<td>Other</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>0</td>
<td>NaN</td>
</tr>
<tr>
<th>3</th>
<td>Loop the Loop (Coney Island)</td>
<td>NaN</td>
<td>NaN</td>
<td>Other</td>
<td>Removed</td>
<td>1901</td>
<td>Steel</td>
<td>Edwin Prescott</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1.0</td>
<td>NaN</td>
<td>NaN</td>
<td>a single car. Riders are arranged 2 across in ...</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>Edward A. Green</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>Switchback Railway</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1910</td>
<td>NaN</td>
<td>Giant Racer</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1901</td>
<td>40.5745</td>
<td>-73.9780</td>
<td>Steel</td>
<td>1901-01-01</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1</td>
<td>NaN</td>
</tr>
<tr>
<th>4</th>
<td>Loop the Loop (Young's Pier)</td>
<td>NaN</td>
<td>NaN</td>
<td>Other</td>
<td>Removed</td>
<td>1901</td>
<td>Steel</td>
<td>Edwin Prescott</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1.0</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>Edward A. Green</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1912</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1901</td>
<td>39.3538</td>
<td>-74.4342</td>
<td>Steel</td>
<td>1901-01-01</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1</td>
<td>NaN</td>
</tr>
</tbody>
</table>
</div>
```python
# List every column label in the raw table
df.columns
```
Index(['coaster_name', 'Length', 'Speed', 'Location', 'Status', 'Opening date',
'Type', 'Manufacturer', 'Height restriction', 'Model', 'Height',
'Inversions', 'Lift/launch system', 'Cost', 'Trains', 'Park section',
'Duration', 'Capacity', 'G-force', 'Designer', 'Max vertical angle',
'Drop', 'Soft opening date', 'Fast Lane available', 'Replaced',
'Track layout', 'Fastrack available', 'Soft opening date.1',
'Closing date', 'Opened', 'Replaced by', 'Website',
'Flash Pass Available', 'Must transfer from wheelchair', 'Theme',
'Single rider line available', 'Restraint Style',
'Flash Pass available', 'Acceleration', 'Restraints', 'Name',
'year_introduced', 'latitude', 'longitude', 'Type_Main',
'opening_date_clean', 'speed1', 'speed2', 'speed1_value', 'speed1_unit',
'speed_mph', 'height_value', 'height_unit', 'height_ft',
'Inversions_clean', 'Gforce_clean'],
dtype='object')
```python
# Show the dtype pandas inferred for each column
df.dtypes
```
coaster_name object
Length object
Speed object
Location object
Status object
Opening date object
Type object
Manufacturer object
Height restriction object
Model object
Height object
Inversions float64
Lift/launch system object
Cost object
Trains object
Park section object
Duration object
Capacity object
G-force object
Designer object
Max vertical angle object
Drop object
Soft opening date object
Fast Lane available object
Replaced object
Track layout object
Fastrack available object
Soft opening date.1 object
Closing date object
Opened object
Replaced by object
Website object
Flash Pass Available object
Must transfer from wheelchair object
Theme object
Single rider line available object
Restraint Style object
Flash Pass available object
Acceleration object
Restraints object
Name object
year_introduced int64
latitude float64
longitude float64
Type_Main object
opening_date_clean object
speed1 object
speed2 object
speed1_value float64
speed1_unit object
speed_mph float64
height_value float64
height_unit object
height_ft float64
Inversions_clean int64
Gforce_clean float64
dtype: object
```python
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()
```
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Inversions</th>
<th>year_introduced</th>
<th>latitude</th>
<th>longitude</th>
<th>speed1_value</th>
<th>speed_mph</th>
<th>height_value</th>
<th>height_ft</th>
<th>Inversions_clean</th>
<th>Gforce_clean</th>
</tr>
</thead>
<tbody>
<tr>
<th>count</th>
<td>932.000000</td>
<td>1087.000000</td>
<td>812.000000</td>
<td>812.000000</td>
<td>937.000000</td>
<td>937.000000</td>
<td>965.000000</td>
<td>171.000000</td>
<td>1087.000000</td>
<td>362.000000</td>
</tr>
<tr>
<th>mean</th>
<td>1.547210</td>
<td>1994.986201</td>
<td>38.373484</td>
<td>-41.595373</td>
<td>53.850374</td>
<td>48.617289</td>
<td>89.575171</td>
<td>101.996491</td>
<td>1.326587</td>
<td>3.824006</td>
</tr>
<tr>
<th>std</th>
<td>2.114073</td>
<td>23.475248</td>
<td>15.516596</td>
<td>72.285227</td>
<td>23.385518</td>
<td>16.678031</td>
<td>136.246444</td>
<td>67.329092</td>
<td>2.030854</td>
<td>0.989998</td>
</tr>
<tr>
<th>min</th>
<td>0.000000</td>
<td>1884.000000</td>
<td>-48.261700</td>
<td>-123.035700</td>
<td>5.000000</td>
<td>5.000000</td>
<td>4.000000</td>
<td>13.100000</td>
<td>0.000000</td>
<td>0.800000</td>
</tr>
<tr>
<th>25%</th>
<td>0.000000</td>
<td>1989.000000</td>
<td>35.031050</td>
<td>-84.552200</td>
<td>40.000000</td>
<td>37.300000</td>
<td>44.000000</td>
<td>51.800000</td>
<td>0.000000</td>
<td>3.400000</td>
</tr>
<tr>
<th>50%</th>
<td>0.000000</td>
<td>2000.000000</td>
<td>40.289800</td>
<td>-76.653600</td>
<td>50.000000</td>
<td>49.700000</td>
<td>79.000000</td>
<td>91.200000</td>
<td>0.000000</td>
<td>4.000000</td>
</tr>
<tr>
<th>75%</th>
<td>3.000000</td>
<td>2010.000000</td>
<td>44.799600</td>
<td>2.778100</td>
<td>63.000000</td>
<td>58.000000</td>
<td>113.000000</td>
<td>131.200000</td>
<td>2.000000</td>
<td>4.500000</td>
</tr>
<tr>
<th>max</th>
<td>14.000000</td>
<td>2022.000000</td>
<td>63.230900</td>
<td>153.426500</td>
<td>240.000000</td>
<td>149.100000</td>
<td>3937.000000</td>
<td>377.300000</td>
<td>14.000000</td>
<td>12.000000</td>
</tr>
</tbody>
</table>
</div>
# Step 2: Data Preparation
```python
# Peek at the first three rows before subsetting the columns
df.head(3)
```
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>coaster_name</th>
<th>Length</th>
<th>Speed</th>
<th>Location</th>
<th>Status</th>
<th>Opening date</th>
<th>Type</th>
<th>Manufacturer</th>
<th>Height restriction</th>
<th>Model</th>
<th>Height</th>
<th>Inversions</th>
<th>Lift/launch system</th>
<th>Cost</th>
<th>Trains</th>
<th>Park section</th>
<th>Duration</th>
<th>Capacity</th>
<th>G-force</th>
<th>Designer</th>
<th>Max vertical angle</th>
<th>Drop</th>
<th>Soft opening date</th>
<th>Fast Lane available</th>
<th>Replaced</th>
<th>Track layout</th>
<th>Fastrack available</th>
<th>Soft opening date.1</th>
<th>Closing date</th>
<th>Opened</th>
<th>Replaced by</th>
<th>Website</th>
<th>Flash Pass Available</th>
<th>Must transfer from wheelchair</th>
<th>Theme</th>
<th>Single rider line available</th>
<th>Restraint Style</th>
<th>Flash Pass available</th>
<th>Acceleration</th>
<th>Restraints</th>
<th>Name</th>
<th>year_introduced</th>
<th>latitude</th>
<th>longitude</th>
<th>Type_Main</th>
<th>opening_date_clean</th>
<th>speed1</th>
<th>speed2</th>
<th>speed1_value</th>
<th>speed1_unit</th>
<th>speed_mph</th>
<th>height_value</th>
<th>height_unit</th>
<th>height_ft</th>
<th>Inversions_clean</th>
<th>Gforce_clean</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>Switchback Railway</td>
<td>600 ft (180 m)</td>
<td>6 mph (9.7 km/h)</td>
<td>Coney Island</td>
<td>Removed</td>
<td>June 16, 1884</td>
<td>Wood</td>
<td>LaMarcus Adna Thompson</td>
<td>NaN</td>
<td>Lift Packed</td>
<td>50 ft (15 m)</td>
<td>NaN</td>
<td>gravity</td>
<td>NaN</td>
<td>NaN</td>
<td>Coney Island Cyclone Site</td>
<td>1:00</td>
<td>1600 riders per hour</td>
<td>2.9</td>
<td>LaMarcus Adna Thompson</td>
<td>30°</td>
<td>43 ft (13 m)</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>Gravity pulled coaster</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1884</td>
<td>40.574</td>
<td>-73.978</td>
<td>Wood</td>
<td>1884-06-16</td>
<td>6 mph</td>
<td>9.7 km/h</td>
<td>6.0</td>
<td>mph</td>
<td>6.0</td>
<td>50.0</td>
<td>ft</td>
<td>NaN</td>
<td>0</td>
<td>2.9</td>
</tr>
<tr>
<th>1</th>
<td>Flip Flap Railway</td>
<td>NaN</td>
<td>NaN</td>
<td>Sea Lion Park</td>
<td>Removed</td>
<td>1895</td>
<td>Wood</td>
<td>Lina Beecher</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1.0</td>
<td>NaN</td>
<td>NaN</td>
<td>a single car. Riders are arranged 1 across in ...</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>12</td>
<td>Lina Beecher</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1902</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1895</td>
<td>40.578</td>
<td>-73.979</td>
<td>Wood</td>
<td>1895-01-01</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1</td>
<td>12.0</td>
</tr>
<tr>
<th>2</th>
<td>Switchback Railway (Euclid Beach Park)</td>
<td>NaN</td>
<td>NaN</td>
<td>Cleveland, Ohio, United States</td>
<td>Closed</td>
<td>NaN</td>
<td>Other</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1895</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>1896</td>
<td>41.580</td>
<td>-81.570</td>
<td>Other</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>NaN</td>
<td>0</td>
<td>NaN</td>
</tr>
</tbody>
</table>
</div>
```python
# Keep only the 15 columns used in the rest of the analysis; all other raw
# columns (Length, Speed, Type, Height, Cost, ...) are left out. The trailing
# .copy() makes df an independent frame rather than a slice of the raw data.
df = df[
    [
        "coaster_name",
        "Location",
        "Status",
        "Opening date",
        "Manufacturer",
        "Opened",
        "year_introduced",
        "latitude",
        "longitude",
        "Type_Main",
        "opening_date_clean",
        "speed_mph",
        "height_ft",
        "Inversions_clean",
        "Gforce_clean",
    ]
].copy()
```
```python
# Confirm the row count is unchanged and only the selected columns remain
df.shape
```
(1087, 15)
```python
# Check the dtypes of the columns that were kept
df.dtypes
```
coaster_name object
Location object
Status object
Opening date object
Manufacturer object
Opened object
year_introduced int64
latitude float64
longitude float64
Type_Main object
opening_date_clean object
speed_mph float64
height_ft float64
Inversions_clean int64
Gforce_clean float64
dtype: object
```python
# Example of removing a column: drop "Opened" and rebind the result to df
df = df.drop(columns=["Opened"])
```
```python
# Parse the cleaned opening-date strings (object dtype) into datetime values
df["opening_date_clean"] = pd.to_datetime(df["opening_date_clean"])
# Display the dtypes again to confirm the conversion
df.dtypes
```
coaster_name object
Location object
Status object
Opening date object
Manufacturer object
year_introduced int64
latitude float64
longitude float64
Type_Main object
opening_date_clean datetime64[ns]
speed_mph float64
height_ft float64
Inversions_clean int64
Gforce_clean float64
dtype: object
```python
# Rename the lower-case / suffixed columns to a consistent Title_Case scheme.
# DataFrame.rename ignores keys that do not match an existing column label
# (errors="ignore" is the default), so each key must match a column exactly;
# a misspelled key is skipped without any warning.
df = df.rename(
    columns={
        "coaster_name": "Coaster_Name",
        "Opening date": "Opening_Date",
        "year_introduced": "Year_Introduced",
        "latitude": "Latitude",
        "longitude": "Longitude",
        "opening_date_clean": "Opening_Date_Clean",
        "speed_mph": "Speed_mph",
        "height_ft": "Height_ft",
        "Gforce_clean": "Gforce",
        "Inversions_clean": "Inversions",
    }
)
```
```python
# Preview the first three rows to check the new column names
df.head(3)
```
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Coaster_Name</th>
<th>Location</th>
<th>Status</th>
<th>Opening_Date</th>
<th>Manufacturer</th>
<th>Year_Introduced</th>
<th>Latitude</th>
<th>Longitude</th>
<th>Type_Main</th>
<th>opening_date_clean</th>
<th>Speed_mph</th>
<th>Height_ft</th>
<th>Inversions</th>
<th>Gforce</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>Switchback Railway</td>
<td>Coney Island</td>
<td>Removed</td>
<td>June 16, 1884</td>
<td>LaMarcus Adna Thompson</td>
<td>1884</td>
<td>40.574</td>
<td>-73.978</td>
<td>Wood</td>
<td>1884-06-16</td>
<td>6.0</td>
<td>NaN</td>
<td>0</td>
<td>2.9</td>
</tr>
<tr>
<th>1</th>
<td>Flip Flap Railway</td>
<td>Sea Lion Park</td>
<td>Removed</td>
<td>1895</td>
<td>Lina Beecher</td>
<td>1895</td>
<td>40.578</td>
<td>-73.979</td>
<td>Wood</td>
<td>1895-01-01</td>
<td>NaN</td>
<td>NaN</td>
<td>1</td>
<td>12.0</td>
</tr>
<tr>
<th>2</th>
<td>Switchback Railway (Euclid Beach Park)</td>
<td>Cleveland, Ohio, United States</td>
<td>Closed</td>
<td>NaN</td>
<td>NaN</td>
<td>1896</td>
<td>41.580</td>
<td>-81.570</td>
<td>Other</td>
<td>NaT</td>
<td>NaN</td>
<td>NaN</td>
<td>0</td>
<td>NaN</td>
</tr>
</tbody>
</table>
</div>
```python
# Count the missing (NaN / NaT) values in each column
df.isna().sum()
```
Coaster_Name 0
Location 0
Status 213
Opening_Date 250
Manufacturer 59
Year_Introduced 0
Latitude 275
Longitude 275
Type_Main 0
opening_date_clean 250
Speed_mph 150
Height_ft 916
Inversions 0
Gforce 725
dtype: int64
```python
# Boolean mask: True where a row repeats an earlier row in every column
df.duplicated()
```
0 False
1 False
2 False
3 False
4 False
...
1082 False
1083 False
1084 False
1085 False
1086 False
Length: 1087, dtype: bool
```python
# Show the fully duplicated rows (an empty frame means there are none)
df.loc[df.duplicated()]
```
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Coaster_Name</th>
<th>Location</th>
<th>Status</th>
<th>Opening_Date</th>
<th>Manufacturer</th>
<th>Year_Introduced</th>
<th>Latitude</th>
<th>Longitude</th>
<th>Type_Main</th>
<th>opening_date_clean</th>
<th>Speed_mph</th>
<th>Height_ft</th>
<th>Inversions</th>
<th>Gforce</th>
</tr>
</thead>
<tbody>
</tbody>
</table>
</div>
```python
# Boolean mask: True where Coaster_Name already appeared in an earlier row
df.duplicated(subset=["Coaster_Name"])
```
0 False
1 False
2 False
3 False
4 False
...
1082 True
1083 False
1084 True
1085 False
1086 False
Length: 1087, dtype: bool
```python
# Show the rows whose Coaster_Name repeats an earlier row
df.loc[df.duplicated(subset=["Coaster_Name"])]
```
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Coaster_Name</th>
<th>Location</th>
<th>Status</th>
<th>Opening_Date</th>
<th>Manufacturer</th>
<th>Year_Introduced</th>
<th>Latitude</th>
<th>Longitude</th>
<th>Type_Main</th>
<th>opening_date_clean</th>
<th>Speed_mph</th>
<th>Height_ft</th>
<th>Inversions</th>
<th>Gforce</th>
</tr>
</thead>
<tbody>
<tr>
<th>43</th>
<td>Crystal Beach Cyclone</td>
<td>Crystal Beach Park</td>
<td>Removed</td>
<td>1926</td>
<td>Traver Engineering</td>
<td>1927</td>
<td>42.8617</td>
<td>-79.0598</td>
<td>Wood</td>
<td>1926-01-01</td>
<td>60.0</td>
<td>NaN</td>
<td>0</td>
<td>4.0</td>
</tr>
<tr>
<th>60</th>
<td>Derby Racer</td>
<td>Revere Beach</td>
<td>Removed</td>
<td>1911</td>
<td>Fred W. Pearce</td>
<td>1937</td>
<td>42.4200</td>
<td>-70.9860</td>
<td>Wood</td>
<td>1911-01-01</td>
<td>NaN</td>
<td>NaN</td>
<td>0</td>
<td>NaN</td>
</tr>
<tr>
<th>61</th>
<td>Blue Streak (Conneaut Lake)</td>
<td>Conneaut Lake Park</td>
<td>Closed</td>
<td>May 23, 1938</td>
<td>NaN</td>
<td>1938</td>
<td>41.6349</td>
<td>-80.3180</td>
<td>Wood</td>
<td>1938-05-23</td>
<td>50.0</td>
<td>NaN</td>
<td>0</td>
<td>NaN</td>
</tr>
<tr>
<th>167</th>
<td>Big Thunder Mountain Railroad</td>
<td>Other</td>
<td>NaN</td>
<td>NaN</td>
<td>Arrow Development (California and Florida)Dyna...</td>
<td>1980</td>
<td>NaN</td>
<td>NaN</td>
<td>Steel</td>
<td>NaT</td>
<td>35.0</td>
<td>NaN</td>
<td>0</td>
<td>NaN</td>
</tr>
<tr>
<th>237</th>
<td>Thunder Run (Canada's Wonderland)</td>
<td>Canada's Wonderland</td>
<td>Operating</td>
<td>May 23, 1981 as Blauer Enzian, 1986 as Thunder...</td>
<td>Mack Rides</td>
<td>1986</td>
<td>43.8427</td>
<td>-79.5423</td>
<td>Steel</td>
<td>1981-05-23</td>
<td>39.8</td>
<td>32.8</td>
<td>0</td>
<td>NaN</td>
</tr>
<tr>
<th>...</th>
<td>...</td>
<td>...</td>
<td>...</td>
<td>...</td>
<td>...</td>
<td>...</td>
<td>...</td>
<td>...</td>
<td>...</td>
<td>...</td>
<td>...</td>
<td>...</td>
<td>...</td>
<td>...</td>
</tr>
<tr>
<th>1063</th>
<td>Lil' Devil Coaster</td>
<td>Six Flags Great Adventure</td>
<td>Operating</td>
<td>1999 as Road Runner Railway; 2021 as Lil' Devi...</td>
<td>Zamperla</td>
<td>2021</td>
<td>40.1343</td>
<td>-74.4434</td>
<td>Steel</td>
<td>1999-01-01</td>
<td>NaN</td>
<td>NaN</td>
<td>0</td>
<td>NaN</td>
</tr>
<tr>
<th>1064</th>
<td>Little Dipper (Conneaut Lake Park)</td>
<td>Conneaut Lake Park</td>
<td>Operating</td>
<td>1950s</td>
<td>Allan Herschell Company</td>
<td>2021</td>
<td>41.6343</td>
<td>-80.3165</td>
<td>Steel</td>
<td>1950-01-01</td>
<td>NaN</td>
<td>NaN</td>
<td>0</td>
<td>NaN</td>
</tr>
<tr>
<th>1080</th>
<td>Iron Gwazi</td>
<td>Busch Gardens Tampa Bay</td>
<td>Under construction</td>
<td>NaN</td>
<td>Rocky Mountain Construction</td>
<td>2022</td>
<td>28.0339</td>
<td>-82.4231</td>
<td>Steel</td>
<td>NaT</td>
<td>76.0</td>
<td>NaN</td>
<td>2</td>
<td>NaN</td>
</tr>
<tr>
<th>1082</th>
<td>American Dreier Looping</td>
<td>Other</td>
<td>NaN</td>
<td>NaN</td>
<td>Anton Schwarzkopf</td>
<td>2022</td>
<td>NaN</td>
<td>NaN</td>
<td>Steel</td>
<td>NaT</td>
<td>53.0</td>
<td>NaN</td>
<td>3</td>
<td>4.7</td>
</tr>
<tr>
<th>1084</th>
<td>Tron Lightcycle Power Run</td>
<td>Other</td>
<td>NaN</td>
<td>June 16, 2016</td>
<td>Vekoma</td>
<td>2022</td>
<td>NaN</td>
<td>NaN</td>
<td>Steel</td>
<td>2016-06-16</td>
<td>59.3</td>
<td>NaN</td>
<td>0</td>
<td>4.0</td>
</tr>
</tbody>
</table>
<p>97 rows × 14 columns</p>
</div>
```python
# Inspect one duplicated coaster: select every row carrying this name
df.loc[df["Coaster_Name"] == "Crystal Beach Cyclone"]
```
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Coaster_Name</th>
<th>Location</th>
<th>Status</th>
<th>Opening_Date</th>
<th>Manufacturer</th>
<th>Year_Introduced</th>
<th>Latitude</th>
<th>Longitude</th>
<th>Type_Main</th>
<th>opening_date_clean</th>
<th>Speed_mph</th>
<th>Height_ft</th>
<th>Inversions</th>
<th>Gforce</th>
</tr>
</thead>
<tbody>
<tr>
<th>39</th>
<td>Crystal Beach Cyclone</td>
<td>Crystal Beach Park</td>
<td>Removed</td>
<td>1926</td>
<td>Traver Engineering</td>
<td>1926</td>
<td>42.8617</td>
<td>-79.0598</td>
<td>Wood</td>
<td>1926-01-01</td>
<td>60.0</td>
<td>NaN</td>
<td>0</td>
<td>4.0</td>
</tr>
<tr>
<th>43</th>
<td>Crystal Beach Cyclone</td>
<td>Crystal Beach Park</td>
<td>Removed</td>
<td>1926</td>
<td>Traver Engineering</td>
<td>1927</td>
<td>42.8617</td>
<td>-79.0598</td>
<td>Wood</td>
<td>1926-01-01</td>
<td>60.0</td>
<td>NaN</td>
<td>0</td>
<td>4.0</td>
</tr>
</tbody>
</table>
</div>
```python
# Keep the first row of each (Coaster_Name, Location, Opening_Date)
# combination and renumber the index from 0
df = df.drop_duplicates(
    subset=["Coaster_Name", "Location", "Opening_Date"]
).reset_index(drop=True)
```
```python
# Dimensions after removing the duplicated coasters
df.shape
```
(990, 14)
# Step 3: Feature Understanding
Univariate analysis
- Histogram
- KDE
- Boxplot
```python
# Number of coasters per introduction year, most frequent year first
df["Year_Introduced"].value_counts()
```
Year_Introduced
1999 46
2000 45
1998 30
2001 29
2002 28
..
1956 1
1959 1
1961 1
1895 1
1884 1
Name: count, Length: 101, dtype: int64
```python
# Bar chart of the ten years with the most coaster introductions
ax = (
    df["Year_Introduced"]
    .value_counts()
    .head(10)
    .plot(kind="bar", title="Top 10 Years Coasters Introduced")
)
ax.set_xlabel("Year Introduced")
ax.set_ylabel("Count")
```

```python
# Histogram of coaster speeds using 20 bins
ax = df["Speed_mph"].plot.hist(bins=20, title="Coaster Speed (mph)")
ax.set_xlabel("Speed (mph)")
```

```python
# Kernel density estimate of coaster speeds
ax = df["Speed_mph"].plot.kde(title="Coaster Speed (mph)")
ax.set_xlabel("Speed (mph)")
```

# Step 4: Feature Relationship
- Scatterplot
- Heatmap Correlation
- Pairplot
- Groupby comparisons
```python
# Scatter plot of speed against height using the pandas plotting accessor
df.plot.scatter(
    x="Speed_mph",
    y="Height_ft",
    title="Coaster Speed vs. Height",
)
plt.show()
```

```python
# Same scatter drawn with seaborn, with points coloured by introduction year
sns.scatterplot(
    data=df,
    x="Speed_mph",
    y="Height_ft",
    hue="Year_Introduced",
)
```

```python
# Multiple comparison: pairwise plots of the selected numeric features,
# coloured by Type_Main. `features` is reused by the correlation cells below.
features = ["Year_Introduced", "Speed_mph", "Height_ft", "Inversions", "Gforce"]
sns.pairplot(df, vars=features, hue="Type_Main")
plt.show()
```

```python
# Correlation matrix of the features, computed only on rows where none of
# the selected features is missing
df[features].dropna().corr()
```
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>Year_Introduced</th>
<th>Speed_mph</th>
<th>Height_ft</th>
<th>Inversions</th>
<th>Gforce</th>
</tr>
</thead>
<tbody>
<tr>
<th>Year_Introduced</th>
<td>1.000000</td>
<td>0.171978</td>
<td>0.135413</td>
<td>-0.209887</td>
<td>0.160247</td>
</tr>
<tr>
<th>Speed_mph</th>
<td>0.171978</td>
<td>1.000000</td>
<td>0.733999</td>
<td>-0.028705</td>
<td>0.607383</td>
</tr>
<tr>
<th>Height_ft</th>
<td>0.135413</td>
<td>0.733999</td>
<td>1.000000</td>
<td>-0.079736</td>
<td>0.466482</td>
</tr>
<tr>
<th>Inversions</th>
<td>-0.209887</td>
<td>-0.028705</td>
<td>-0.079736</td>
<td>1.000000</td>
<td>0.275991</td>
</tr>
<tr>
<th>Gforce</th>
<td>0.160247</td>
<td>0.607383</td>
<td>0.466482</td>
<td>0.275991</td>
<td>1.000000</td>
</tr>
</tbody>
</table>
</div>
```python
# Heatmap of the feature correlation matrix, with each cell's value annotated
df_corr = df[features].dropna().corr()
sns.heatmap(df_corr, annot=True)
```

# Step 5: Ask a Question about the data
### What are the locations with the fastest roller coasters (minimum of 10 coasters)?
```python
# Average coaster speed per named location (the catch-all 'Other' is
# excluded). Only locations with at least 10 coasters that have a recorded
# speed are kept: "count" is the number of non-null Speed_mph values, and
# ">=" makes a location with exactly 10 qualify, matching the stated
# minimum of 10. Bars are drawn in ascending order of mean speed.
ax = (
    df.query("Location != 'Other'")
    .groupby("Location")["Speed_mph"]
    .agg(["mean", "count"])
    .query("count >= 10")
    .sort_values("mean")["mean"]
    .plot(kind="barh", figsize=(12, 5), title="Average Coaster Speed by Location")
)
ax.set_xlabel("Average Coaster Speed")
```
