# Lab 1-3
[TOC]
# Code
```python
import os
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
```
```python
import warnings
warnings.filterwarnings('ignore')
```
## Reading and processing dataset
```python
dataset_root = 'datasets/nsl-kdd'
```
```python
train_file = os.path.join(dataset_root, 'KDDTrain+.txt')
test_file = os.path.join(dataset_root, 'KDDTest+.txt')
```
```python
# Original KDD dataset feature names obtained from
# http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names
# http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html
header_names = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack_type', 'success_pred']
```
```python
# Differentiating between nominal, binary, and numeric features
# root_shell is marked as a continuous feature in the kddcup.names
# file, but it is supposed to be a binary feature according to the
# dataset documentation
col_names = np.array(header_names)
nominal_idx = [1, 2, 3]
binary_idx = [6, 11, 13, 14, 20, 21]
numeric_idx = list(set(range(41)).difference(nominal_idx).difference(binary_idx))
nominal_cols = col_names[nominal_idx].tolist()
binary_cols = col_names[binary_idx].tolist()
numeric_cols = col_names[numeric_idx].tolist()
```
```python
# training_attack_types.txt maps each of the 22 different attacks to 1 of 4 categories
# file obtained from http://kdd.ics.uci.edu/databases/kddcup99/training_attack_types
category = defaultdict(list)
category['benign'].append('normal')
with open('datasets/training_attack_types.txt', 'r') as f:
    for line in f.readlines():
        attack, cat = line.strip().split(' ')
        category[cat].append(attack)
attack_mapping = dict((v,k) for k in category for v in category[k])
```
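As a quick sanity check, the inverted mapping can be spot-checked; the expected values below assume the standard `training_attack_types` file (where, e.g., `back` is listed as a DoS attack):
```python
# Spot-check the attack -> category mapping (values assume the standard file)
print(attack_mapping['normal'])  # 'benign'
print(attack_mapping['back'])    # 'dos'
```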
## Generating and analyzing train and test sets
```python
train_df = pd.read_csv(train_file, names=header_names)
train_df['attack_category'] = train_df['attack_type'] \
    .map(lambda x: attack_mapping[x])
train_df.drop(['success_pred'], axis=1, inplace=True)
test_df = pd.read_csv(test_file, names=header_names)
test_df['attack_category'] = test_df['attack_type'] \
    .map(lambda x: attack_mapping[x])
test_df.drop(['success_pred'], axis=1, inplace=True)
```
```python
train_attack_types = train_df['attack_type'].value_counts()
train_attack_cats = train_df['attack_category'].value_counts()
```
```python
test_attack_types = test_df['attack_type'].value_counts()
test_attack_cats = test_df['attack_category'].value_counts()
```
```python
train_attack_types.plot(kind='barh', figsize=(20,10), fontsize=20)
```
*(output: horizontal bar chart of attack-type counts in the training set)*

```python
train_attack_cats.plot(kind='barh', figsize=(20,10), fontsize=30)
```
*(output: horizontal bar chart of attack-category counts in the training set)*

```python
test_attack_types.plot(kind='barh', figsize=(20,10), fontsize=15)
```
*(output: horizontal bar chart of attack-type counts in the test set)*

```python
test_attack_cats.plot(kind='barh', figsize=(20,10), fontsize=30)
```
*(output: horizontal bar chart of attack-category counts in the test set)*

```python
# Let's take a look at the binary features
# By definition, all of these features should have a min of 0.0 and a max of 1.0
train_df[binary_cols].describe().transpose()
```
|                | count    | mean     | std      | min | 25% | 50% | 75% | max |
|----------------|----------|----------|----------|-----|-----|-----|-----|-----|
| land           | 125973.0 | 0.000198 | 0.014086 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| logged_in      | 125973.0 | 0.395736 | 0.489010 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| root_shell     | 125973.0 | 0.001342 | 0.036603 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| su_attempted   | 125973.0 | 0.001103 | 0.045154 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 |
| is_host_login  | 125973.0 | 0.000008 | 0.002817 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| is_guest_login | 125973.0 | 0.009423 | 0.096612 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
```python
# Wait a minute... the su_attempted column has a max value of 2.0?
train_df.groupby(['su_attempted']).size()
```
su_attempted
0 125893
1 21
2 59
dtype: int64
```python
# Let's fix this discrepancy and assume that su_attempted=2 -> su_attempted=0
train_df['su_attempted'].replace(2, 0, inplace=True)
# Q1: replace su_attempted value 2 with 0 in the test data (one possible answer follows below)
train_df.groupby(['su_attempted']).size()
```
su_attempted
0 125952
1 21
dtype: int64
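A minimal answer to Q1, mirroring the fix applied to the training set above (the same `replace` call is used on `test_df` later in this lab):
```python
# Q1 (one possible answer): apply the same su_attempted fix to the test set
test_df['su_attempted'].replace(2, 0, inplace=True)
test_df.groupby(['su_attempted']).size()
```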
```python
# Next, we notice that the num_outbound_cmds column only takes on one value!
train_df.groupby(['num_outbound_cmds']).size()
```
num_outbound_cmds
0 125973
dtype: int64
```python
# Now, that's not a very useful feature - let's drop it from the dataset
train_df.drop('num_outbound_cmds', axis=1, inplace=True)
test_df.drop('num_outbound_cmds', axis=1, inplace=True)
numeric_cols.remove('num_outbound_cmds')
```
## Data preparation
```python
train_Y = train_df['attack_category']
train_x_raw = train_df.drop(['attack_category','attack_type'], axis=1)
test_Y = test_df['attack_category']
test_x_raw = test_df.drop(['attack_category','attack_type'], axis=1)
```
```python
combined_df_raw = pd.concat([train_x_raw, test_x_raw])
combined_df = pd.get_dummies(combined_df_raw, columns=nominal_cols, drop_first=True)
# Split back into train and test; .copy() avoids pandas' SettingWithCopyWarning
# when the numeric columns are rescaled in place later
train_x = combined_df[:len(train_x_raw)].copy()
test_x = combined_df[len(train_x_raw):].copy()
# Store dummy variable feature names
dummy_variables = list(set(train_x) - set(combined_df_raw))
```
```python
train_x.describe()
```
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>duration</th>
<th>src_bytes</th>
<th>dst_bytes</th>
<th>land</th>
<th>wrong_fragment</th>
<th>urgent</th>
<th>hot</th>
<th>num_failed_logins</th>
<th>logged_in</th>
<th>num_compromised</th>
<th>...</th>
<th>flag_REJ</th>
<th>flag_RSTO</th>
<th>flag_RSTOS0</th>
<th>flag_RSTR</th>
<th>flag_S0</th>
<th>flag_S1</th>
<th>flag_S2</th>
<th>flag_S3</th>
<th>flag_SF</th>
<th>flag_SH</th>
</tr>
</thead>
<tbody>
<tr>
<th>count</th>
<td>125973.00000</td>
<td>1.259730e+05</td>
<td>1.259730e+05</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>...</td>
<td>125973.00000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
</tr>
<tr>
<th>mean</th>
<td>287.14465</td>
<td>4.556674e+04</td>
<td>1.977911e+04</td>
<td>0.000198</td>
<td>0.022687</td>
<td>0.000111</td>
<td>0.204409</td>
<td>0.001222</td>
<td>0.395736</td>
<td>0.279250</td>
<td>...</td>
<td>0.08917</td>
<td>0.012399</td>
<td>0.000818</td>
<td>0.019218</td>
<td>0.276655</td>
<td>0.002897</td>
<td>0.001008</td>
<td>0.000389</td>
<td>0.594929</td>
<td>0.002151</td>
</tr>
<tr>
<th>std</th>
<td>2604.51531</td>
<td>5.870331e+06</td>
<td>4.021269e+06</td>
<td>0.014086</td>
<td>0.253530</td>
<td>0.014366</td>
<td>2.149968</td>
<td>0.045239</td>
<td>0.489010</td>
<td>23.942042</td>
<td>...</td>
<td>0.28499</td>
<td>0.110661</td>
<td>0.028583</td>
<td>0.137292</td>
<td>0.447346</td>
<td>0.053750</td>
<td>0.031736</td>
<td>0.019719</td>
<td>0.490908</td>
<td>0.046332</td>
</tr>
<tr>
<th>min</th>
<td>0.00000</td>
<td>0.000000e+00</td>
<td>0.000000e+00</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>25%</th>
<td>0.00000</td>
<td>0.000000e+00</td>
<td>0.000000e+00</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>50%</th>
<td>0.00000</td>
<td>4.400000e+01</td>
<td>0.000000e+00</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>75%</th>
<td>0.00000</td>
<td>2.760000e+02</td>
<td>5.160000e+02</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>max</th>
<td>42908.00000</td>
<td>1.379964e+09</td>
<td>1.309937e+09</td>
<td>1.000000</td>
<td>3.000000</td>
<td>3.000000</td>
<td>77.000000</td>
<td>5.000000</td>
<td>1.000000</td>
<td>7479.000000</td>
<td>...</td>
<td>1.00000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
</tr>
</tbody>
</table>
<p>8 rows × 118 columns</p>
```python
# Example statistics for the 'duration' feature before scaling
train_x['duration'].describe()
```
count 125973.00000
mean 287.14465
std 2604.51531
min 0.00000
25% 0.00000
50% 0.00000
75% 0.00000
max 42908.00000
Name: duration, dtype: float64
```python
# Experimenting with StandardScaler on the single 'duration' feature
from sklearn.preprocessing import StandardScaler
durations = train_x['duration'].values.reshape(-1, 1)
standard_scaler = StandardScaler().fit(durations)
scaled_durations = standard_scaler.transform(durations)
pd.Series(scaled_durations.flatten()).describe()
```
count 1.259730e+05
mean 2.549477e-17
std 1.000004e+00
min -1.102492e-01
25% -1.102492e-01
50% -1.102492e-01
75% -1.102492e-01
max 1.636428e+01
dtype: float64
```python
# Experimenting with MinMaxScaler on the single 'duration' feature
from sklearn.preprocessing import MinMaxScaler
# Q2: Use MinMaxScaler, similar to StandardScaler above
# Usage: fit, transform, describe (one possible answer follows below)
```
count 125973.000000
mean 0.006692
std 0.060700
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 1.000000
dtype: float64
```python
# Experimenting with RobustScaler on the single 'duration' feature
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler().fit(durations)
robust_scaled_durations = robust_scaler.transform(durations)
pd.Series(robust_scaled_durations.flatten()).describe()
```
count 125973.00000
mean 287.14465
std 2604.51531
min 0.00000
25% 0.00000
50% 0.00000
75% 0.00000
max 42908.00000
dtype: float64
```python
# Let's proceed with StandardScaler - apply it to all the numeric columns
# Q3: change to another scaler and test its effectiveness (sketch below)
standard_scaler = StandardScaler().fit(train_x[numeric_cols])
train_x[numeric_cols] = \
    standard_scaler.transform(train_x[numeric_cols])
test_x[numeric_cols] = \
    standard_scaler.transform(test_x[numeric_cols])
```
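For Q3, one way to proceed is to swap the cell above for an alternative scaler (MinMaxScaler here is only an example) and re-run the classifiers that follow to compare test error. Note that this sketch should *replace* the StandardScaler transform rather than run after it, to avoid scaling twice:
```python
# Q3 (sketch): fit an alternative scaler on the training split only,
# apply it to both splits, then re-run the classifiers below to compare
min_max_scaler = MinMaxScaler().fit(train_x[numeric_cols])
train_x[numeric_cols] = min_max_scaler.transform(train_x[numeric_cols])
test_x[numeric_cols] = min_max_scaler.transform(test_x[numeric_cols])
```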
```python
train_x.describe()
```
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>duration</th>
<th>src_bytes</th>
<th>dst_bytes</th>
<th>land</th>
<th>wrong_fragment</th>
<th>urgent</th>
<th>hot</th>
<th>num_failed_logins</th>
<th>logged_in</th>
<th>num_compromised</th>
<th>...</th>
<th>flag_REJ</th>
<th>flag_RSTO</th>
<th>flag_RSTOS0</th>
<th>flag_RSTR</th>
<th>flag_S0</th>
<th>flag_S1</th>
<th>flag_S2</th>
<th>flag_S3</th>
<th>flag_SF</th>
<th>flag_SH</th>
</tr>
</thead>
<tbody>
<tr>
<th>count</th>
<td>1.259730e+05</td>
<td>1.259730e+05</td>
<td>1.259730e+05</td>
<td>125973.000000</td>
<td>1.259730e+05</td>
<td>1.259730e+05</td>
<td>1.259730e+05</td>
<td>1.259730e+05</td>
<td>125973.000000</td>
<td>1.259730e+05</td>
<td>...</td>
<td>125973.00000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
</tr>
<tr>
<th>mean</th>
<td>2.549477e-17</td>
<td>-4.512349e-19</td>
<td>7.614590e-19</td>
<td>0.000198</td>
<td>4.230328e-19</td>
<td>4.455945e-18</td>
<td>-2.244894e-17</td>
<td>2.989431e-18</td>
<td>0.395736</td>
<td>-6.549957e-18</td>
<td>...</td>
<td>0.08917</td>
<td>0.012399</td>
<td>0.000818</td>
<td>0.019218</td>
<td>0.276655</td>
<td>0.002897</td>
<td>0.001008</td>
<td>0.000389</td>
<td>0.594929</td>
<td>0.002151</td>
</tr>
<tr>
<th>std</th>
<td>1.000004e+00</td>
<td>1.000004e+00</td>
<td>1.000004e+00</td>
<td>0.014086</td>
<td>1.000004e+00</td>
<td>1.000004e+00</td>
<td>1.000004e+00</td>
<td>1.000004e+00</td>
<td>0.489010</td>
<td>1.000004e+00</td>
<td>...</td>
<td>0.28499</td>
<td>0.110661</td>
<td>0.028583</td>
<td>0.137292</td>
<td>0.447346</td>
<td>0.053750</td>
<td>0.031736</td>
<td>0.019719</td>
<td>0.490908</td>
<td>0.046332</td>
</tr>
<tr>
<th>min</th>
<td>-1.102492e-01</td>
<td>-7.762241e-03</td>
<td>-4.918644e-03</td>
<td>0.000000</td>
<td>-8.948642e-02</td>
<td>-7.735985e-03</td>
<td>-9.507567e-02</td>
<td>-2.702282e-02</td>
<td>0.000000</td>
<td>-1.166364e-02</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>25%</th>
<td>-1.102492e-01</td>
<td>-7.762241e-03</td>
<td>-4.918644e-03</td>
<td>0.000000</td>
<td>-8.948642e-02</td>
<td>-7.735985e-03</td>
<td>-9.507567e-02</td>
<td>-2.702282e-02</td>
<td>0.000000</td>
<td>-1.166364e-02</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>50%</th>
<td>-1.102492e-01</td>
<td>-7.754745e-03</td>
<td>-4.918644e-03</td>
<td>0.000000</td>
<td>-8.948642e-02</td>
<td>-7.735985e-03</td>
<td>-9.507567e-02</td>
<td>-2.702282e-02</td>
<td>0.000000</td>
<td>-1.166364e-02</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>75%</th>
<td>-1.102492e-01</td>
<td>-7.715224e-03</td>
<td>-4.790326e-03</td>
<td>0.000000</td>
<td>-8.948642e-02</td>
<td>-7.735985e-03</td>
<td>-9.507567e-02</td>
<td>-2.702282e-02</td>
<td>1.000000</td>
<td>-1.166364e-02</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>max</th>
<td>1.636428e+01</td>
<td>2.350675e+02</td>
<td>3.257486e+02</td>
<td>1.000000</td>
<td>1.174348e+01</td>
<td>2.088191e+02</td>
<td>3.571955e+01</td>
<td>1.104972e+02</td>
<td>1.000000</td>
<td>3.123689e+02</td>
<td>...</td>
<td>1.00000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
</tr>
</tbody>
</table>
<p>8 rows × 118 columns</p>
```python
train_Y_bin = train_Y.apply(lambda x: 0 if x == 'benign' else 1)
test_Y_bin = test_Y.apply(lambda x: 0 if x == 'benign' else 1)
```
```python
# 5-class classification version
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss
classifier = DecisionTreeClassifier(random_state=17)
classifier.fit(train_x, train_Y)
pred_y = classifier.predict(test_x)
results = confusion_matrix(test_Y, pred_y)
error = zero_one_loss(test_Y, pred_y)
print(results)
print(error)
```
[[9365 56 289 1 0]
[1541 5998 97 0 0]
[ 675 220 1528 0 0]
[2278 1 14 277 4]
[ 179 0 5 5 11]]
0.237979063165
```python
from sklearn.neighbors import KNeighborsClassifier
# Q4: Implement KNeighborsClassifier (one possible answer follows below)
print(results)
print(error)
```
[[9457 57 193 2 2]
[1675 5894 67 0 0]
[ 670 156 1597 0 0]
[2369 2 37 126 40]
[ 176 0 4 7 13]]
0.242059971611
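One possible answer to Q4, mirroring the decision-tree cell above; the exact hyperparameters behind the printed matrix are not recorded, so scikit-learn's default k=5 is assumed here:
```python
# Q4 (one possible answer): k-nearest neighbors, default k=5 assumed
classifier = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
classifier.fit(train_x, train_Y)
pred_y = classifier.predict(test_x)
results = confusion_matrix(test_Y, pred_y)
error = zero_one_loss(test_Y, pred_y)
```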
```python
from sklearn.svm import LinearSVC
# Q5: Implement LinearSVC (one possible answer follows below)
print(results)
print(error)
```
[[9008 292 407 3 1]
[1972 5654 10 0 0]
[ 714 122 1500 87 0]
[2472 1 1 100 0]
[ 185 2 0 4 9]]
0.278255855216
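One possible answer to Q5; again, the exact settings behind the printed matrix are not recorded, so defaults with a fixed random_state are assumed:
```python
# Q5 (one possible answer): linear SVM with default hyperparameters
classifier = LinearSVC(random_state=17)
classifier.fit(train_x, train_Y)
pred_y = classifier.predict(test_x)
results = confusion_matrix(test_Y, pred_y)
error = zero_one_loss(test_Y, pred_y)
```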
## Dealing with class imbalance
```python
test_Y.value_counts().apply(lambda x: x/float(len(test_Y)))
```
benign 0.430758
dos 0.338715
r2l 0.114177
probe 0.107479
u2r 0.008872
Name: attack_category, dtype: float64
```python
train_Y.value_counts().apply(lambda x: x/float(len(train_Y)))
```
benign 0.534583
dos 0.364578
probe 0.092528
r2l 0.007899
u2r 0.000413
Name: attack_category, dtype: float64
```python
print(pd.Series(train_Y).value_counts())
```
benign 67343
dos 45927
probe 11656
r2l 995
u2r 52
Name: attack_category, dtype: int64
```python
from imblearn.over_sampling import SMOTE
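# Note: newer imbalanced-learn releases rename ratio= to sampling_strategy=
# and fit_sample() to fit_resample(); the older API is used here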
sm = SMOTE(ratio='auto', random_state=0)
train_x_sm, train_Y_sm = sm.fit_sample(train_x, train_Y)
print(pd.Series(train_Y_sm).value_counts())
```
probe 67343
dos 67343
r2l 67343
benign 67343
u2r 67343
dtype: int64
```python
from imblearn.under_sampling import RandomUnderSampler
mean_class_size = int(pd.Series(train_Y).value_counts().sum()/5)
ratio = {'benign': mean_class_size,
         'dos': mean_class_size,
         'probe': mean_class_size,
         'r2l': mean_class_size,
         'u2r': mean_class_size}
rus = RandomUnderSampler(ratio=ratio, random_state=0, replacement=True)
train_x_rus, train_Y_rus = rus.fit_sample(train_x_sm, train_Y_sm)
print(pd.Series(train_Y_rus).value_counts())
```
r2l 25194
benign 25194
u2r 25194
probe 25194
dos 25194
dtype: int64
```python
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss
classifier = DecisionTreeClassifier(random_state=17)
classifier.fit(train_x_rus, train_Y_rus)
pred_y = classifier.predict(test_x)
results = confusion_matrix(test_Y, pred_y)
error = zero_one_loss(test_Y, pred_y)
print(results)
print(error)
```
[[9369 73 258 6 5]
[1221 5768 647 0 0]
[ 270 170 1980 1 2]
[1829 2 369 369 5]
[ 62 0 108 21 9]]
0.223962029808
## Attempting unsupervised learning
```python
# First, let's visualize the dataset (only numeric cols)
from sklearn.decomposition import PCA
# Use PCA to reduce dimensionality so we can visualize the dataset on a 2d plot
pca = PCA(n_components=2)
train_x_pca_cont = pca.fit_transform(train_x[numeric_cols])
plt.figure(figsize=(15,10))
colors = ['navy', 'turquoise', 'darkorange', 'red', 'purple']
for color, cat in zip(colors, category.keys()):
    plt.scatter(train_x_pca_cont[train_Y==cat, 0], train_x_pca_cont[train_Y==cat, 1],
                color=color, alpha=.8, lw=2, label=cat)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.show()
```

```python
# Apply k-means (k=5, only using numeric cols) + PCA + plot
from sklearn.cluster import KMeans
# Fit the training data to a k-means clustering estimator model
kmeans = KMeans(n_clusters=5, random_state=17).fit(train_x[numeric_cols])
# Retrieve the labels assigned to each training sample
kmeans_y = kmeans.labels_
# Plot in 2d with train_x_pca_cont
plt.figure(figsize=(15,10))
colors = ['navy', 'turquoise', 'darkorange', 'red', 'purple']
for color, cat in zip(colors, range(5)):
    plt.scatter(train_x_pca_cont[kmeans_y==cat, 0],
                train_x_pca_cont[kmeans_y==cat, 1],
                color=color, alpha=.8, lw=2, label=cat)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.show()
```

```python
print('Total number of features: {}'.format(len(train_x.columns)))
print('Total number of continuous features: {}'.format(len(train_x[numeric_cols].columns)))
```
Total number of features: 118
Total number of continuous features: 31
```python
from sklearn.metrics import completeness_score, \
    homogeneity_score, v_measure_score
print('Completeness: {}'.format(completeness_score(test_Y, pred_y)))
print('Homogeneity: {}'.format(homogeneity_score(test_Y, pred_y)))
print('V-measure: {}'.format(v_measure_score(test_Y, pred_y)))
```
Completeness: 0.5313035431629083
Homogeneity: 0.4373279456197804
V-measure: 0.4797570380949557
## Using "Attribute Ratio" (AR) feature selection
```python
averages = train_df.loc[:, numeric_cols].mean()
```
```python
averages_per_class = train_df[numeric_cols+['attack_category']].groupby('attack_category').mean()
```
```python
AR = {}
for col in numeric_cols:
    AR[col] = max(averages_per_class[col])/averages[col]
```
```python
AR
```
{'duration': 7.2258291572125568,
'src_bytes': 8.4640642049489454,
'dst_bytes': 9.1548543553434012,
'wrong_fragment': 2.7428963354889282,
'urgent': 173.03983516483518,
'hot': 40.774516817095183,
'num_failed_logins': 46.038556418455919,
'num_compromised': 4.3385392749839271,
'num_root': 2.6091432537726016,
'num_file_creations': 62.233624927703879,
'num_shells': 326.11353550295854,
'num_access_files': 4.6948792486583191,
'count': 2.1174082949142403,
'srv_count': 1.1773191099992069,
'serror_rate': 2.6310546426370025,
'srv_serror_rate': 2.6432463184901405,
'rerror_rate': 3.6455860878284372,
'srv_rerror_rate': 3.6677418023254122,
'same_srv_rate': 1.507961200604778,
'diff_srv_rate': 4.0690854850685172,
'srv_diff_host_rate': 3.0815657101101674,
'dst_host_count': 1.3428596865228266,
'dst_host_srv_count': 1.6453161847397422,
'dst_host_same_srv_rate': 1.5575788279744123,
'dst_host_diff_srv_rate': 4.8373418489732671,
'dst_host_same_src_port_rate': 4.3930803788834885,
'dst_host_srv_diff_host_rate': 5.7568806827546997,
'dst_host_serror_rate': 2.6293396511769247,
'dst_host_srv_serror_rate': 2.6731595957142456,
'dst_host_rerror_rate': 3.2795669242442695,
'dst_host_srv_rerror_rate': 3.667920527965804}
```python
def binary_AR(df, col):
    # Per-class ratio of rows where col == 1 to rows where col == 0;
    # the AR score is the maximum of that ratio over the attack categories
    series_zero = df[df[col] == 0].groupby('attack_category').size()
    series_one = df[df[col] == 1].groupby('attack_category').size()
    return max(series_one/series_zero)
```
```python
# Recreating dataframes with 2-class and 5-class labels
labels2 = ['normal', 'attack']
labels5 = ['normal', 'dos', 'probe', 'r2l', 'u2r']
train_df = pd.read_csv(train_file, names=header_names)
train_df['attack_category'] = train_df['attack_type'] \
    .map(lambda x: attack_mapping[x])
train_df.drop(['success_pred'], axis=1, inplace=True)
test_df = pd.read_csv(test_file, names=header_names)
test_df['attack_category'] = test_df['attack_type'] \
    .map(lambda x: attack_mapping[x])
test_df.drop(['success_pred'], axis=1, inplace=True)
train_attack_types = train_df['attack_type'].value_counts()
train_attack_cats = train_df['attack_category'].value_counts()
test_attack_types = test_df['attack_type'].value_counts()
test_attack_cats = test_df['attack_category'].value_counts()
```
```python
train_df['su_attempted'].replace(2, 0, inplace=True)
test_df['su_attempted'].replace(2, 0, inplace=True)
train_df.drop('num_outbound_cmds', axis=1, inplace=True)
test_df.drop('num_outbound_cmds', axis=1, inplace=True)
```
```python
train_df['labels2'] = train_df.apply(lambda x: 'normal' if 'normal' in x['attack_type'] else 'attack', axis=1)
test_df['labels2'] = test_df.apply(lambda x: 'normal' if 'normal' in x['attack_type'] else 'attack', axis=1)
combined_df = pd.concat([train_df, test_df])
original_cols = combined_df.columns
combined_df = pd.get_dummies(combined_df, columns=nominal_cols, drop_first=True)
added_cols = list(set(combined_df.columns) - set(original_cols))
```
```python
combined_df.attack_category = pd.Categorical(combined_df.attack_category)
combined_df.labels2 = pd.Categorical(combined_df.labels2)
combined_df['labels5'] = combined_df['attack_category'].cat.codes
combined_df['labels2'] = combined_df['labels2'].cat.codes
```
```python
train_df = combined_df[:len(train_df)].copy()
test_df = combined_df[len(train_df):].copy()
```
```python
for col in binary_cols + dummy_variables:
    cur_AR = binary_AR(train_df, col)
    if cur_AR:
        AR[col] = cur_AR
```
```python
train_df[train_df.service_Z39_50 == 1].groupby('attack_category').size()
```
attack_category
benign 0
dos 851
probe 11
r2l 0
u2r 0
dtype: int64
```python
len(binary_cols+added_cols)
```
87
```python
AR = dict((k, v) for k, v in AR.items() if not np.isnan(v))
sorted_AR = sorted(AR.items(), key=lambda x: x[1], reverse=True)
```
```python
sorted_AR
```
[('protocol_type_tcp', inf),
('num_shells', 326.11353550295854),
('urgent', 173.03983516483518),
('num_file_creations', 62.233624927703879),
('flag_SF', 51.0),
('num_failed_logins', 46.038556418455919),
('hot', 40.774516817095183),
('logged_in', 10.569767441860465),
('dst_bytes', 9.1548543553434012),
('src_bytes', 8.4640642049489454),
('duration', 7.2258291572125568),
('dst_host_srv_diff_host_rate', 5.7568806827546997),
('dst_host_diff_srv_rate', 4.8373418489732671),
('num_access_files', 4.6948792486583191),
('dst_host_same_src_port_rate', 4.3930803788834885),
('num_compromised', 4.3385392749839271),
('diff_srv_rate', 4.0690854850685172),
('dst_host_srv_rerror_rate', 3.667920527965804),
('srv_rerror_rate', 3.6677418023254122),
('rerror_rate', 3.6455860878284372),
('dst_host_rerror_rate', 3.2795669242442695),
('srv_diff_host_rate', 3.0815657101101674),
('flag_S0', 2.965034965034965),
('wrong_fragment', 2.7428963354889282),
('dst_host_srv_serror_rate', 2.6731595957142456),
('srv_serror_rate', 2.6432463184901405),
('serror_rate', 2.6310546426370025),
('dst_host_serror_rate', 2.6293396511769247),
('num_root', 2.6091432537726016),
('count', 2.1174082949142403),
('service_telnet', 1.8888888888888888),
('dst_host_srv_count', 1.6453161847397422),
('dst_host_same_srv_rate', 1.5575788279744123),
('service_ftp_data', 1.5447570332480818),
('same_srv_rate', 1.507961200604778),
('dst_host_count', 1.3428596865228266),
('service_http', 1.2988666621151088),
('srv_count', 1.1773191099992069),
('root_shell', 1.0),
('service_private', 0.72528123149792778),
('service_eco_i', 0.54037267080745344),
('is_guest_login', 0.45894428152492667),
('service_ftp', 0.45680819912152271),
('flag_REJ', 0.32650506429953341),
('flag_RSTR', 0.23005487547488393),
('protocol_type_udp', 0.22644739478045495),
('service_other', 0.16945921541085582),
('service_domain_u', 0.15493320070658045),
('service_smtp', 0.11654010677454654),
('service_ecr_i', 0.066012116147900562),
('flag_RSTO', 0.048472075869336141),
('service_finger', 0.026095310440358364),
('flag_SH', 0.023263980335352472),
('service_Z39_50', 0.018879226195758277),
('service_uucp', 0.017029097834270781),
('service_courier', 0.0160615915577089),
('service_auth', 0.015544843445957898),
('service_bgp', 0.015455027858848501),
('service_uucp_path', 0.014938896377980597),
('service_iso_tsap', 0.014916467780429593),
('service_whois', 0.014804339660163068),
('service_imap4', 0.013729168965897804),
('service_nnsp', 0.013729168965897804),
('service_vmnet', 0.013371284834844774),
('service_time', 0.012142983074753174),
('service_ctf', 0.011853092158893123),
('service_csnet_ns', 0.011741639864299247),
('service_supdup', 0.011630212119209674),
('service_http_443', 0.011518808915514052),
('service_discard', 0.011451978769793203),
('service_domain', 0.011184746471740902),
('service_daytime', 0.011073441352588941),
('service_gopher', 0.010672945733022314),
('service_efs', 0.010517283108539242),
('service_exec', 0.010228322555100963),
('service_systat', 0.010117227879561),
('service_link', 0.0099839465177138081),
('service_hostnames', 0.0098284960422163597),
('service_name', 0.0094068001494538346),
('service_klogin', 0.0093402487802733952),
('service_login', 0.0092293493308721729),
('service_mtp', 0.0091406473160334858),
('service_echo', 0.0091406473160334858),
('service_urp_i', 0.0089745894762075992),
('flag_RSTOS0', 0.0089154332208084483),
('service_ldap', 0.0088524734206133025),
('service_netbios_dgm', 0.0086087624903920055),
('service_sunrpc', 0.0080995653891742393),
('service_netbios_ssn', 0.0076572030365527231),
('service_netstat', 0.0075466731018142726),
('service_netbios_ns', 0.0073698756333486874),
('service_kshell', 0.0063985975676564043),
('service_ssh', 0.0061560706305043159),
('service_nntp', 0.0061560706305043159),
('flag_S1', 0.0053895076289152306),
('service_sql_net', 0.0050991377423731778),
('flag_S3', 0.0030241935483870967),
('service_pop_3', 0.0027696293759399615),
('service_ntp_u', 0.0025009304056568663),
('flag_S2', 0.0017702011186481019),
('service_rje', 0.0015466575012888812),
('service_remote_job', 0.0015466575012888812),
('service_pop_2', 0.0015264845061822622),
('service_printer', 0.0013517933064428214),
('service_shell', 0.0011553385359898854),
('su_attempted', 0.001006036217303823),
('service_X11', 0.00099589749687853022),
('service_pm_dump', 0.00042914771264269161),
('land', 0.00039207998431680063),
('service_aol', 0.00017161489617298782),
('service_http_8001', 0.00017161489617298782),
('service_harvest', 0.00017161489617298782),
('service_urh_i', 0.00014851558671082529),
('service_red_i', 0.00011880894037276305),
('service_http_2784', 8.5800085800085798e-05),
('service_tim_i', 7.4252279544982031e-05),
('service_tftp_u', 4.4550044550044547e-05),
('is_host_login', 1.4849573817231445e-05)]
```python
# Only keep features with AR value >= 0.01
features_to_use = []
for x, y in sorted_AR:
    if y >= 0.01:
        features_to_use.append(x)
features_to_use
```
['protocol_type_tcp',
'num_shells',
'urgent',
'num_file_creations',
'flag_SF',
'num_failed_logins',
'hot',
'logged_in',
'dst_bytes',
'src_bytes',
'duration',
'dst_host_srv_diff_host_rate',
'dst_host_diff_srv_rate',
'num_access_files',
'dst_host_same_src_port_rate',
'num_compromised',
'diff_srv_rate',
'dst_host_srv_rerror_rate',
'srv_rerror_rate',
'rerror_rate',
'dst_host_rerror_rate',
'srv_diff_host_rate',
'flag_S0',
'wrong_fragment',
'dst_host_srv_serror_rate',
'srv_serror_rate',
'serror_rate',
'dst_host_serror_rate',
'num_root',
'count',
'service_telnet',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'service_ftp_data',
'same_srv_rate',
'dst_host_count',
'service_http',
'srv_count',
'root_shell',
'service_private',
'service_eco_i',
'is_guest_login',
'service_ftp',
'flag_REJ',
'flag_RSTR',
'protocol_type_udp',
'service_other',
'service_domain_u',
'service_smtp',
'service_ecr_i',
'flag_RSTO',
'service_finger',
'flag_SH',
'service_Z39_50',
'service_uucp',
'service_courier',
'service_auth',
'service_bgp',
'service_uucp_path',
'service_iso_tsap',
'service_whois',
'service_imap4',
'service_nnsp',
'service_vmnet',
'service_time',
'service_ctf',
'service_csnet_ns',
'service_supdup',
'service_http_443',
'service_discard',
'service_domain',
'service_daytime',
'service_gopher',
'service_efs',
'service_exec',
'service_systat']
```python
len(features_to_use)
```
76
```python
len(sorted_AR) - len(features_to_use)
```
42
```python
train_df_trimmed = train_df[features_to_use].copy()
test_df_trimmed = test_df[features_to_use].copy()
```
```python
numeric_cols_to_use = list(set(numeric_cols).intersection(features_to_use))
```
```python
# The recreated (trimmed) dataframes hold raw values, so the selected numeric features must be rescaled
standard_scaler = StandardScaler()
train_df_trimmed[numeric_cols_to_use] = standard_scaler.fit_transform(train_df_trimmed[numeric_cols_to_use])
test_df_trimmed[numeric_cols_to_use] = standard_scaler.transform(test_df_trimmed[numeric_cols_to_use])
```
## Applying advanced ensembling
```python
kmeans = KMeans(n_clusters=8, random_state=17)
kmeans.fit(train_df_trimmed[numeric_cols_to_use])
kmeans_train_y = kmeans.labels_
```
```python
pd.crosstab(kmeans_train_y, train_Y_bin)
```
| kmeans cluster (row_0) | train_Y_bin = 0 (benign) | train_Y_bin = 1 (attack) |
|---|---|---|
| 0 | 63569 | 6457 |
| 1 | 2784 | 11443 |
| 2 | 126 | 34700 |
| 3 | 1 | 0 |
| 4 | 628 | 4335 |
| 5 | 167 | 757 |
| 6 | 0 | 884 |
| 7 | 68 | 54 |
```python
train_df['kmeans_y'] = kmeans_train_y
train_df_trimmed['kmeans_y'] = kmeans_train_y
```
```python
kmeans_test_y = kmeans.predict(test_df_trimmed[numeric_cols_to_use])
test_df['kmeans_y'] = kmeans_test_y
```
```python
pca8 = PCA(n_components=2)
train_df_trimmed_pca8 = pca8.fit_transform(train_df_trimmed)
plt.figure(figsize=(15,10))
colors8 = ['navy', 'turquoise', 'darkorange', 'red', 'purple', 'green', 'magenta', 'black']
labels8 = [0,1,2,3,4,5,6,7]
for color, cat in zip(colors8, labels8):
    plt.scatter(train_df_trimmed_pca8[train_df.kmeans_y==cat, 0], train_df_trimmed_pca8[train_df.kmeans_y==cat, 1],
                color=color, alpha=.8, lw=2, label=cat)
```

```python
pd.crosstab(test_df.kmeans_y, test_df.labels2)
```
| kmeans_y | labels2 = 0 | labels2 = 1 |
|---|---|---|
| 0 | 4795 | 9515 |
| 1 | 5131 | 87 |
| 2 | 1997 | 6 |
| 4 | 427 | 51 |
| 5 | 1 | 10 |
| 6 | 8 | 37 |
| 7 | 474 | 5 |
```python
# Ensembling strategy (a dispatch sketch follows below):
# 1. Clusters with an aggregate size of fewer than 200 samples are treated
#    as outliers, and all of their samples are assigned the attack label.
# 2. Clusters with more than 95% of samples belonging to a single class
#    (either attack or benign) are assigned the dominant label wholesale.
# 3. For each remaining cluster, we train a separate random forest classifier.
```
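A minimal sketch of how the three rules above could be dispatched per cluster, using the training-set labels. This only illustrates the stated thresholds; it is not the lab's recorded implementation, and the per-cluster choices below were made by inspection:
```python
# Sketch only: pick a strategy for each cluster from its training labels
def choose_strategy(cluster_labels, min_size=200, purity=0.95):
    counts = cluster_labels.value_counts()
    total = counts.sum()
    if total < min_size:
        return 'outlier (assign attack label)'               # rule 1
    if counts.max() / total > purity:
        return 'dominant label: {}'.format(counts.idxmax())  # rule 2
    return 'train a random forest'                           # rule 3

for c in sorted(train_df.kmeans_y.unique()):
    print(c, choose_strategy(train_df[train_df.kmeans_y == c]['labels2']))
```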
```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
```
### Cluster 0 - Random Forest Classifier (Strategy Option 3)
```python
train_y0 = train_df[train_df.kmeans_y==0]
test_y0 = test_df[test_df.kmeans_y==0]
rfc = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=17).fit(train_y0.drop(['labels2', 'labels5', 'kmeans_y', 'attack_category', 'attack_type'], axis=1), train_y0['labels2'])
pred_y0 = rfc.predict(test_y0.drop(['labels2', 'labels5', 'kmeans_y', 'attack_category', 'attack_type'], axis=1))
print("cluster {} score is {}, {}".format(0, accuracy_score(pred_y0, test_y0['labels2']), accuracy_score(pred_y0, test_y0['labels2'], normalize=False)))
print(confusion_matrix(test_y0['labels2'], pred_y0))
```
cluster 0 score is 0.7673654786862334, 10981
[[1618 3177]
[ 152 9363]]
### Cluster 1 - Dominant Label Zero (Strategy Option 2)
```python
print(confusion_matrix(test_df[test_df.kmeans_y==1]['labels2'], np.zeros(len(test_df[test_df.kmeans_y==1]))))
```
[[5131 0]
[ 87 0]]
### Cluster 2 - Dominant Label Zero (Strategy Option 2)
```python
print(confusion_matrix(test_df[test_df.kmeans_y==2]['labels2'], np.zeros(len(test_df[test_df.kmeans_y==2]))))
```
[[1997 0]
[ 6 0]]
### Cluster 3 - Empty Cluster
### Cluster 4 - Random Forest Classifier (Strategy Option 3)
```python
train_y0 = train_df[train_df.kmeans_y==4]
test_y0 = test_df[test_df.kmeans_y==4]
rfc = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=17).fit(train_y0.drop(['labels2', 'labels5', 'kmeans_y', 'attack_category', 'attack_type'], axis=1), train_y0['labels2'])
pred_y0 = rfc.predict(test_y0.drop(['labels2', 'labels5', 'kmeans_y', 'attack_category', 'attack_type'], axis=1))
print("cluster {} score is {}, {}".format(4, accuracy_score(pred_y0, test_y0['labels2']), accuracy_score(pred_y0, test_y0['labels2'], normalize=False)))
print(confusion_matrix(test_y0['labels2'], pred_y0))
```
cluster 4 score is 0.9309623430962343, 445
[[405 22]
[ 11 40]]
### Cluster 5 - Outlier/Attack (Strategy Option 1)
```python
print(confusion_matrix(test_df[test_df.kmeans_y==5]['labels2'], np.ones(len(test_df[test_df.kmeans_y==5]))))
```
[[ 0 1]
[ 0 10]]
### Cluster 6 - Outlier/Attack (Strategy Option 1)
```python
print(confusion_matrix(test_df[test_df.kmeans_y==6]['labels2'], np.ones(len(test_df[test_df.kmeans_y==6]))))
```
[[ 0 8]
[ 0 37]]
### Cluster 7 - Dominant Label Zero (Strategy Option 2)
```python
print(confusion_matrix(test_df[test_df.kmeans_y==7]['labels2'], np.zeros(len(test_df[test_df.kmeans_y==7]))))
```
[[474 0]
[ 5 0]]
### Combined Results: k-means + Random Forest Classifier ensembling with AR feature selection
```python
# combined results:
num_samples = 22544
false_pos = 3177 + 22 + 1 + 8
false_neg = 152 + 87 + 6 + 11 + 5
print('True positive %: {}'.format(1-(false_pos/num_samples)))
print('True negative %: {}'.format(1-(false_neg/num_samples)))
```
True positive %: 0.8577004968062456
True negative %: 0.9884226401703335
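Taken together, the per-cluster results above misclassify 3208 + 261 = 3469 of the 22544 test samples, so the ensemble reaches an overall two-class accuracy of roughly 84.6%.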
###### tags: `ML Security`