# Lab 1-3
[TOC]
# Code
```python
import os
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
```
```python
import warnings
warnings.filterwarnings('ignore')
```
## Reading and processing dataset
```python
dataset_root = 'datasets/nsl-kdd'
```
```python
train_file = os.path.join(dataset_root, 'KDDTrain+.txt')
test_file = os.path.join(dataset_root, 'KDDTest+.txt')
```
```python
# Original KDD dataset feature names obtained from
# http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names
# http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html
header_names = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack_type', 'success_pred']
```
```python
# Differentiating between nominal, binary, and numeric features
# root_shell is marked as a continuous feature in the kddcup.names
# file, but it is supposed to be a binary feature according to the
# dataset documentation
col_names = np.array(header_names)
nominal_idx = [1, 2, 3]
binary_idx = [6, 11, 13, 14, 20, 21]
numeric_idx = list(set(range(41)).difference(nominal_idx).difference(binary_idx))
nominal_cols = col_names[nominal_idx].tolist()
binary_cols = col_names[binary_idx].tolist()
numeric_cols = col_names[numeric_idx].tolist()
```
```python
# training_attack_types.txt maps each of the 22 different attacks to 1 of 4 categories
# file obtained from http://kdd.ics.uci.edu/databases/kddcup99/training_attack_types
category = defaultdict(list)
category['benign'].append('normal')
with open('datasets/training_attack_types.txt', 'r') as f:
    for line in f.readlines():
        attack, cat = line.strip().split(' ')
        category[cat].append(attack)
attack_mapping = dict((v,k) for k in category for v in category[k])
```
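As a quick sanity check, the inverted mapping can be spot-checked; the expected values below assume the standard `training_attack_types` file (where, e.g., `back` is listed as a DoS attack):
```python
# Spot-check the attack -> category mapping (values assume the standard file)
print(attack_mapping['normal'])  # 'benign'
print(attack_mapping['back'])    # 'dos'
```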
## Generating and analyzing train and test sets
```python
train_df = pd.read_csv(train_file, names=header_names)
train_df['attack_category'] = train_df['attack_type'] \
    .map(lambda x: attack_mapping[x])
train_df.drop(['success_pred'], axis=1, inplace=True)
test_df = pd.read_csv(test_file, names=header_names)
test_df['attack_category'] = test_df['attack_type'] \
    .map(lambda x: attack_mapping[x])
test_df.drop(['success_pred'], axis=1, inplace=True)
```
```python
train_attack_types = train_df['attack_type'].value_counts()
train_attack_cats = train_df['attack_category'].value_counts()
```
```python
test_attack_types = test_df['attack_type'].value_counts()
test_attack_cats = test_df['attack_category'].value_counts()
```
```python
train_attack_types.plot(kind='barh', figsize=(20,10), fontsize=20)
```
*(output: horizontal bar chart of attack-type counts in the training set)*

```python
train_attack_cats.plot(kind='barh', figsize=(20,10), fontsize=30)
```
*(output: horizontal bar chart of attack-category counts in the training set)*

```python
test_attack_types.plot(kind='barh', figsize=(20,10), fontsize=15)
```
*(output: horizontal bar chart of attack-type counts in the test set)*

```python
test_attack_cats.plot(kind='barh', figsize=(20,10), fontsize=30)
```
*(output: horizontal bar chart of attack-category counts in the test set)*

```python
# Let's take a look at the binary features
# By definition, all of these features should have a min of 0.0 and a max of 1.0
train_df[binary_cols].describe().transpose()
```
|                | count    | mean     | std      | min | 25% | 50% | 75% | max |
|----------------|----------|----------|----------|-----|-----|-----|-----|-----|
| land           | 125973.0 | 0.000198 | 0.014086 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| logged_in      | 125973.0 | 0.395736 | 0.489010 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| root_shell     | 125973.0 | 0.001342 | 0.036603 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| su_attempted   | 125973.0 | 0.001103 | 0.045154 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 |
| is_host_login  | 125973.0 | 0.000008 | 0.002817 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| is_guest_login | 125973.0 | 0.009423 | 0.096612 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
```python
# Wait a minute... the su_attempted column has a max value of 2.0?
train_df.groupby(['su_attempted']).size()
```
su_attempted
0 125893
1 21
2 59
dtype: int64
```python
# Let's fix this discrepancy and assume that su_attempted=2 -> su_attempted=0
train_df['su_attempted'].replace(2, 0, inplace=True)
# Q1: replace su_attempted value 2 with 0 in the test data (one possible answer follows below)
train_df.groupby(['su_attempted']).size()
```
su_attempted
0 125952
1 21
dtype: int64
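A minimal answer to Q1, mirroring the fix applied to the training set above (the same `replace` call is used on `test_df` later in this lab):
```python
# Q1 (one possible answer): apply the same su_attempted fix to the test set
test_df['su_attempted'].replace(2, 0, inplace=True)
test_df.groupby(['su_attempted']).size()
```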
```python
# Next, we notice that the num_outbound_cmds column only takes on one value!
train_df.groupby(['num_outbound_cmds']).size()
```
num_outbound_cmds
0 125973
dtype: int64
```python
# Now, that's not a very useful feature - let's drop it from the dataset
train_df.drop('num_outbound_cmds', axis=1, inplace=True)
test_df.drop('num_outbound_cmds', axis=1, inplace=True)
numeric_cols.remove('num_outbound_cmds')
```
## Data preparation
```python
train_Y = train_df['attack_category']
train_x_raw = train_df.drop(['attack_category','attack_type'], axis=1)
test_Y = test_df['attack_category']
test_x_raw = test_df.drop(['attack_category','attack_type'], axis=1)
```
```python
combined_df_raw = pd.concat([train_x_raw, test_x_raw])
combined_df = pd.get_dummies(combined_df_raw, columns=nominal_cols, drop_first=True)
# Split back into train and test; .copy() avoids pandas' SettingWithCopyWarning
# when the numeric columns are rescaled in place later
train_x = combined_df[:len(train_x_raw)].copy()
test_x = combined_df[len(train_x_raw):].copy()
# Store dummy variable feature names
dummy_variables = list(set(train_x) - set(combined_df_raw))
```
```python
train_x.describe()
```
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>duration</th>
<th>src_bytes</th>
<th>dst_bytes</th>
<th>land</th>
<th>wrong_fragment</th>
<th>urgent</th>
<th>hot</th>
<th>num_failed_logins</th>
<th>logged_in</th>
<th>num_compromised</th>
<th>...</th>
<th>flag_REJ</th>
<th>flag_RSTO</th>
<th>flag_RSTOS0</th>
<th>flag_RSTR</th>
<th>flag_S0</th>
<th>flag_S1</th>
<th>flag_S2</th>
<th>flag_S3</th>
<th>flag_SF</th>
<th>flag_SH</th>
</tr>
</thead>
<tbody>
<tr>
<th>count</th>
<td>125973.00000</td>
<td>1.259730e+05</td>
<td>1.259730e+05</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>...</td>
<td>125973.00000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
</tr>
<tr>
<th>mean</th>
<td>287.14465</td>
<td>4.556674e+04</td>
<td>1.977911e+04</td>
<td>0.000198</td>
<td>0.022687</td>
<td>0.000111</td>
<td>0.204409</td>
<td>0.001222</td>
<td>0.395736</td>
<td>0.279250</td>
<td>...</td>
<td>0.08917</td>
<td>0.012399</td>
<td>0.000818</td>
<td>0.019218</td>
<td>0.276655</td>
<td>0.002897</td>
<td>0.001008</td>
<td>0.000389</td>
<td>0.594929</td>
<td>0.002151</td>
</tr>
<tr>
<th>std</th>
<td>2604.51531</td>
<td>5.870331e+06</td>
<td>4.021269e+06</td>
<td>0.014086</td>
<td>0.253530</td>
<td>0.014366</td>
<td>2.149968</td>
<td>0.045239</td>
<td>0.489010</td>
<td>23.942042</td>
<td>...</td>
<td>0.28499</td>
<td>0.110661</td>
<td>0.028583</td>
<td>0.137292</td>
<td>0.447346</td>
<td>0.053750</td>
<td>0.031736</td>
<td>0.019719</td>
<td>0.490908</td>
<td>0.046332</td>
</tr>
<tr>
<th>min</th>
<td>0.00000</td>
<td>0.000000e+00</td>
<td>0.000000e+00</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>25%</th>
<td>0.00000</td>
<td>0.000000e+00</td>
<td>0.000000e+00</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>50%</th>
<td>0.00000</td>
<td>4.400000e+01</td>
<td>0.000000e+00</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>75%</th>
<td>0.00000</td>
<td>2.760000e+02</td>
<td>5.160000e+02</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>max</th>
<td>42908.00000</td>
<td>1.379964e+09</td>
<td>1.309937e+09</td>
<td>1.000000</td>
<td>3.000000</td>
<td>3.000000</td>
<td>77.000000</td>
<td>5.000000</td>
<td>1.000000</td>
<td>7479.000000</td>
<td>...</td>
<td>1.00000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
</tr>
</tbody>
</table>
<p>8 rows × 118 columns</p>
```python
# Example statistics for the 'duration' feature before scaling
train_x['duration'].describe()
```
count 125973.00000
mean 287.14465
std 2604.51531
min 0.00000
25% 0.00000
50% 0.00000
75% 0.00000
max 42908.00000
Name: duration, dtype: float64
```python
# Experimenting with StandardScaler on the single 'duration' feature
from sklearn.preprocessing import StandardScaler
durations = train_x['duration'].values.reshape(-1, 1)
standard_scaler = StandardScaler().fit(durations)
scaled_durations = standard_scaler.transform(durations)
pd.Series(scaled_durations.flatten()).describe()
```
count 1.259730e+05
mean 2.549477e-17
std 1.000004e+00
min -1.102492e-01
25% -1.102492e-01
50% -1.102492e-01
75% -1.102492e-01
max 1.636428e+01
dtype: float64
```python
# Experimenting with MinMaxScaler on the single 'duration' feature
from sklearn.preprocessing import MinMaxScaler
# Q2: Use MinMaxScaler, similar to StandardScaler above
# Usage: fit, transform, describe (one possible answer follows below)
```
count 125973.000000
mean 0.006692
std 0.060700
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 1.000000
dtype: float64
```python
# Experimenting with RobustScaler on the single 'duration' feature
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler().fit(durations)
robust_scaled_durations = robust_scaler.transform(durations)
pd.Series(robust_scaled_durations.flatten()).describe()
```
count 125973.00000
mean 287.14465
std 2604.51531
min 0.00000
25% 0.00000
50% 0.00000
75% 0.00000
max 42908.00000
dtype: float64
```python
# Let's proceed with StandardScaler - apply it to all the numeric columns
# Q3: change to another scaler and test its effectiveness (sketch below)
standard_scaler = StandardScaler().fit(train_x[numeric_cols])
train_x[numeric_cols] = \
    standard_scaler.transform(train_x[numeric_cols])
test_x[numeric_cols] = \
    standard_scaler.transform(test_x[numeric_cols])
```
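For Q3, one way to proceed is to swap the cell above for an alternative scaler (MinMaxScaler here is only an example) and re-run the classifiers that follow to compare test error. Note that this sketch should *replace* the StandardScaler transform rather than run after it, to avoid scaling twice:
```python
# Q3 (sketch): fit an alternative scaler on the training split only,
# apply it to both splits, then re-run the classifiers below to compare
min_max_scaler = MinMaxScaler().fit(train_x[numeric_cols])
train_x[numeric_cols] = min_max_scaler.transform(train_x[numeric_cols])
test_x[numeric_cols] = min_max_scaler.transform(test_x[numeric_cols])
```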
```python
train_x.describe()
```
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>duration</th>
<th>src_bytes</th>
<th>dst_bytes</th>
<th>land</th>
<th>wrong_fragment</th>
<th>urgent</th>
<th>hot</th>
<th>num_failed_logins</th>
<th>logged_in</th>
<th>num_compromised</th>
<th>...</th>
<th>flag_REJ</th>
<th>flag_RSTO</th>
<th>flag_RSTOS0</th>
<th>flag_RSTR</th>
<th>flag_S0</th>
<th>flag_S1</th>
<th>flag_S2</th>
<th>flag_S3</th>
<th>flag_SF</th>
<th>flag_SH</th>
</tr>
</thead>
<tbody>
<tr>
<th>count</th>
<td>1.259730e+05</td>
<td>1.259730e+05</td>
<td>1.259730e+05</td>
<td>125973.000000</td>
<td>1.259730e+05</td>
<td>1.259730e+05</td>
<td>1.259730e+05</td>
<td>1.259730e+05</td>
<td>125973.000000</td>
<td>1.259730e+05</td>
<td>...</td>
<td>125973.00000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
<td>125973.000000</td>
</tr>
<tr>
<th>mean</th>
<td>2.549477e-17</td>
<td>-4.512349e-19</td>
<td>7.614590e-19</td>
<td>0.000198</td>
<td>4.230328e-19</td>
<td>4.455945e-18</td>
<td>-2.244894e-17</td>
<td>2.989431e-18</td>
<td>0.395736</td>
<td>-6.549957e-18</td>
<td>...</td>
<td>0.08917</td>
<td>0.012399</td>
<td>0.000818</td>
<td>0.019218</td>
<td>0.276655</td>
<td>0.002897</td>
<td>0.001008</td>
<td>0.000389</td>
<td>0.594929</td>
<td>0.002151</td>
</tr>
<tr>
<th>std</th>
<td>1.000004e+00</td>
<td>1.000004e+00</td>
<td>1.000004e+00</td>
<td>0.014086</td>
<td>1.000004e+00</td>
<td>1.000004e+00</td>
<td>1.000004e+00</td>
<td>1.000004e+00</td>
<td>0.489010</td>
<td>1.000004e+00</td>
<td>...</td>
<td>0.28499</td>
<td>0.110661</td>
<td>0.028583</td>
<td>0.137292</td>
<td>0.447346</td>
<td>0.053750</td>
<td>0.031736</td>
<td>0.019719</td>
<td>0.490908</td>
<td>0.046332</td>
</tr>
<tr>
<th>min</th>
<td>-1.102492e-01</td>
<td>-7.762241e-03</td>
<td>-4.918644e-03</td>
<td>0.000000</td>
<td>-8.948642e-02</td>
<td>-7.735985e-03</td>
<td>-9.507567e-02</td>
<td>-2.702282e-02</td>
<td>0.000000</td>
<td>-1.166364e-02</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>25%</th>
<td>-1.102492e-01</td>
<td>-7.762241e-03</td>
<td>-4.918644e-03</td>
<td>0.000000</td>
<td>-8.948642e-02</td>
<td>-7.735985e-03</td>
<td>-9.507567e-02</td>
<td>-2.702282e-02</td>
<td>0.000000</td>
<td>-1.166364e-02</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>50%</th>
<td>-1.102492e-01</td>
<td>-7.754745e-03</td>
<td>-4.918644e-03</td>
<td>0.000000</td>
<td>-8.948642e-02</td>
<td>-7.735985e-03</td>
<td>-9.507567e-02</td>
<td>-2.702282e-02</td>
<td>0.000000</td>
<td>-1.166364e-02</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>75%</th>
<td>-1.102492e-01</td>
<td>-7.715224e-03</td>
<td>-4.790326e-03</td>
<td>0.000000</td>
<td>-8.948642e-02</td>
<td>-7.735985e-03</td>
<td>-9.507567e-02</td>
<td>-2.702282e-02</td>
<td>1.000000</td>
<td>-1.166364e-02</td>
<td>...</td>
<td>0.00000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>0.000000</td>
<td>1.000000</td>
<td>0.000000</td>
</tr>
<tr>
<th>max</th>
<td>1.636428e+01</td>
<td>2.350675e+02</td>
<td>3.257486e+02</td>
<td>1.000000</td>
<td>1.174348e+01</td>
<td>2.088191e+02</td>
<td>3.571955e+01</td>
<td>1.104972e+02</td>
<td>1.000000</td>
<td>3.123689e+02</td>
<td>...</td>
<td>1.00000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
<td>1.000000</td>
</tr>
</tbody>
</table>
<p>8 rows × 118 columns</p>
```python
train_Y_bin = train_Y.apply(lambda x: 0 if x == 'benign' else 1)
test_Y_bin = test_Y.apply(lambda x: 0 if x == 'benign' else 1)
```
```python
# 5-class classification version
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss
classifier = DecisionTreeClassifier(random_state=17)
classifier.fit(train_x, train_Y)
pred_y = classifier.predict(test_x)
results = confusion_matrix(test_Y, pred_y)
error = zero_one_loss(test_Y, pred_y)
print(results)
print(error)
```
[[9365 56 289 1 0]
[1541 5998 97 0 0]
[ 675 220 1528 0 0]
[2278 1 14 277 4]
[ 179 0 5 5 11]]
0.237979063165
```python
from sklearn.neighbors import KNeighborsClassifier
# Q4: Implement KNeighborsClassifier (one possible answer follows below)
print(results)
print(error)
```
[[9457 57 193 2 2]
[1675 5894 67 0 0]
[ 670 156 1597 0 0]
[2369 2 37 126 40]
[ 176 0 4 7 13]]
0.242059971611
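One possible answer to Q4, mirroring the decision-tree cell above; the exact hyperparameters behind the printed matrix are not recorded, so scikit-learn's default k=5 is assumed here:
```python
# Q4 (one possible answer): k-nearest neighbors, default k=5 assumed
classifier = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
classifier.fit(train_x, train_Y)
pred_y = classifier.predict(test_x)
results = confusion_matrix(test_Y, pred_y)
error = zero_one_loss(test_Y, pred_y)
```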
```python
from sklearn.svm import LinearSVC
# Q5: Implement LinearSVC (one possible answer follows below)
print(results)
print(error)
```
[[9008 292 407 3 1]
[1972 5654 10 0 0]
[ 714 122 1500 87 0]
[2472 1 1 100 0]
[ 185 2 0 4 9]]
0.278255855216
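One possible answer to Q5; again, the exact settings behind the printed matrix are not recorded, so defaults with a fixed random_state are assumed:
```python
# Q5 (one possible answer): linear SVM with default hyperparameters
classifier = LinearSVC(random_state=17)
classifier.fit(train_x, train_Y)
pred_y = classifier.predict(test_x)
results = confusion_matrix(test_Y, pred_y)
error = zero_one_loss(test_Y, pred_y)
```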
## Dealing with class imbalance
```python
test_Y.value_counts().apply(lambda x: x/float(len(test_Y)))
```
benign 0.430758
dos 0.338715
r2l 0.114177
probe 0.107479
u2r 0.008872
Name: attack_category, dtype: float64
```python
train_Y.value_counts().apply(lambda x: x/float(len(train_Y)))
```
benign 0.534583
dos 0.364578
probe 0.092528
r2l 0.007899
u2r 0.000413
Name: attack_category, dtype: float64
```python
print(pd.Series(train_Y).value_counts())
```
benign 67343
dos 45927
probe 11656
r2l 995
u2r 52
Name: attack_category, dtype: int64
```python
from imblearn.over_sampling import SMOTE
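# Note: newer imbalanced-learn releases rename ratio= to sampling_strategy=
# and fit_sample() to fit_resample(); the older API is used here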
sm = SMOTE(ratio='auto', random_state=0)
train_x_sm, train_Y_sm = sm.fit_sample(train_x, train_Y)
print(pd.Series(train_Y_sm).value_counts())
```
probe 67343
dos 67343
r2l 67343
benign 67343
u2r 67343
dtype: int64
```python
from imblearn.under_sampling import RandomUnderSampler
mean_class_size = int(pd.Series(train_Y).value_counts().sum()/5)
ratio = {'benign': mean_class_size,
         'dos': mean_class_size,
         'probe': mean_class_size,
         'r2l': mean_class_size,
         'u2r': mean_class_size}
rus = RandomUnderSampler(ratio=ratio, random_state=0, replacement=True)
train_x_rus, train_Y_rus = rus.fit_sample(train_x_sm, train_Y_sm)
print(pd.Series(train_Y_rus).value_counts())
```
r2l 25194
benign 25194
u2r 25194
probe 25194
dos 25194
dtype: int64
```python
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss
classifier = DecisionTreeClassifier(random_state=17)
classifier.fit(train_x_rus, train_Y_rus)
pred_y = classifier.predict(test_x)
results = confusion_matrix(test_Y, pred_y)
error = zero_one_loss(test_Y, pred_y)
print(results)
print(error)
```
[[9369 73 258 6 5]
[1221 5768 647 0 0]
[ 270 170 1980 1 2]
[1829 2 369 369 5]
[ 62 0 108 21 9]]
0.223962029808
## Attempting unsupervised learning
```python
# First, let's visualize the dataset (only numeric cols)
from sklearn.decomposition import PCA
# Use PCA to reduce dimensionality so we can visualize the dataset on a 2d plot
pca = PCA(n_components=2)
train_x_pca_cont = pca.fit_transform(train_x[numeric_cols])
plt.figure(figsize=(15,10))
colors = ['navy', 'turquoise', 'darkorange', 'red', 'purple']
for color, cat in zip(colors, category.keys()):
    plt.scatter(train_x_pca_cont[train_Y==cat, 0], train_x_pca_cont[train_Y==cat, 1],
                color=color, alpha=.8, lw=2, label=cat)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.show()
```

```python
# Apply k-means (k=5, only using numeric cols) + PCA + plot
from sklearn.cluster import KMeans
# Fit the training data to a k-means clustering estimator model
kmeans = KMeans(n_clusters=5, random_state=17).fit(train_x[numeric_cols])
# Retrieve the labels assigned to each training sample
kmeans_y = kmeans.labels_
# Plot in 2d with train_x_pca_cont
plt.figure(figsize=(15,10))
colors = ['navy', 'turquoise', 'darkorange', 'red', 'purple']
for color, cat in zip(colors, range(5)):
    plt.scatter(train_x_pca_cont[kmeans_y==cat, 0],
                train_x_pca_cont[kmeans_y==cat, 1],
                color=color, alpha=.8, lw=2, label=cat)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.show()
```

```python
print('Total number of features: {}'.format(len(train_x.columns)))
print('Total number of continuous features: {}'.format(len(train_x[numeric_cols].columns)))
```
Total number of features: 118
Total number of continuous features: 31
```python
from sklearn.metrics import completeness_score, \
    homogeneity_score, v_measure_score
print('Completeness: {}'.format(completeness_score(test_Y, pred_y)))
print('Homogeneity: {}'.format(homogeneity_score(test_Y, pred_y)))
print('V-measure: {}'.format(v_measure_score(test_Y, pred_y)))
```
Completeness: 0.5313035431629083
Homogeneity: 0.4373279456197804
V-measure: 0.4797570380949557
## Using "Attribute Ratio" (AR) feature selection
```python
averages = train_df.loc[:, numeric_cols].mean()
```
```python
averages_per_class = train_df[numeric_cols+['attack_category']].groupby('attack_category').mean()
```
```python
AR = {}
for col in numeric_cols:
    AR[col] = max(averages_per_class[col])/averages[col]
```
```python
AR
```
{'duration': 7.2258291572125568,
'src_bytes': 8.4640642049489454,
'dst_bytes': 9.1548543553434012,
'wrong_fragment': 2.7428963354889282,
'urgent': 173.03983516483518,
'hot': 40.774516817095183,
'num_failed_logins': 46.038556418455919,
'num_compromised': 4.3385392749839271,
'num_root': 2.6091432537726016,
'num_file_creations': 62.233624927703879,
'num_shells': 326.11353550295854,
'num_access_files': 4.6948792486583191,
'count': 2.1174082949142403,
'srv_count': 1.1773191099992069,
'serror_rate': 2.6310546426370025,
'srv_serror_rate': 2.6432463184901405,
'rerror_rate': 3.6455860878284372,
'srv_rerror_rate': 3.6677418023254122,
'same_srv_rate': 1.507961200604778,
'diff_srv_rate': 4.0690854850685172,
'srv_diff_host_rate': 3.0815657101101674,
'dst_host_count': 1.3428596865228266,
'dst_host_srv_count': 1.6453161847397422,
'dst_host_same_srv_rate': 1.5575788279744123,
'dst_host_diff_srv_rate': 4.8373418489732671,
'dst_host_same_src_port_rate': 4.3930803788834885,
'dst_host_srv_diff_host_rate': 5.7568806827546997,
'dst_host_serror_rate': 2.6293396511769247,
'dst_host_srv_serror_rate': 2.6731595957142456,
'dst_host_rerror_rate': 3.2795669242442695,
'dst_host_srv_rerror_rate': 3.667920527965804}
```python
def binary_AR(df, col):
    # Per-class ratio of rows where col == 1 to rows where col == 0;
    # the AR score is the maximum of that ratio over the attack categories
    series_zero = df[df[col] == 0].groupby('attack_category').size()
    series_one = df[df[col] == 1].groupby('attack_category').size()
    return max(series_one/series_zero)
```
```python
# Recreating dataframes with 2-class and 5-class labels
labels2 = ['normal', 'attack']
labels5 = ['normal', 'dos', 'probe', 'r2l', 'u2r']
train_df = pd.read_csv(train_file, names=header_names)
train_df['attack_category'] = train_df['attack_type'] \
    .map(lambda x: attack_mapping[x])
train_df.drop(['success_pred'], axis=1, inplace=True)
test_df = pd.read_csv(test_file, names=header_names)
test_df['attack_category'] = test_df['attack_type'] \
    .map(lambda x: attack_mapping[x])
test_df.drop(['success_pred'], axis=1, inplace=True)
train_attack_types = train_df['attack_type'].value_counts()
train_attack_cats = train_df['attack_category'].value_counts()
test_attack_types = test_df['attack_type'].value_counts()
test_attack_cats = test_df['attack_category'].value_counts()
```
```python
train_df['su_attempted'].replace(2, 0, inplace=True)
test_df['su_attempted'].replace(2, 0, inplace=True)
train_df.drop('num_outbound_cmds', axis=1, inplace=True)
test_df.drop('num_outbound_cmds', axis=1, inplace=True)
```
```python
train_df['labels2'] = train_df.apply(lambda x: 'normal' if 'normal' in x['attack_type'] else 'attack', axis=1)
test_df['labels2'] = test_df.apply(lambda x: 'normal' if 'normal' in x['attack_type'] else 'attack', axis=1)
combined_df = pd.concat([train_df, test_df])
original_cols = combined_df.columns
combined_df = pd.get_dummies(combined_df, columns=nominal_cols, drop_first=True)
added_cols = list(set(combined_df.columns) - set(original_cols))
```
```python
combined_df.attack_category = pd.Categorical(combined_df.attack_category)
combined_df.labels2 = pd.Categorical(combined_df.labels2)
combined_df['labels5'] = combined_df['attack_category'].cat.codes
combined_df['labels2'] = combined_df['labels2'].cat.codes
```
```python
train_df = combined_df[:len(train_df)].copy()
test_df = combined_df[len(train_df):].copy()
```
```python
for col in binary_cols + dummy_variables:
    cur_AR = binary_AR(train_df, col)
    if cur_AR:
        AR[col] = cur_AR
```
```python
train_df[train_df.service_Z39_50 == 1].groupby('attack_category').size()
```
attack_category
benign 0
dos 851
probe 11
r2l 0
u2r 0
dtype: int64
```python
len(binary_cols+added_cols)
```
87
```python
AR = dict((k, v) for k, v in AR.items() if not np.isnan(v))
sorted_AR = sorted(AR.items(), key=lambda x: x[1], reverse=True)
```
```python
sorted_AR
```
[('protocol_type_tcp', inf),
('num_shells', 326.11353550295854),
('urgent', 173.03983516483518),
('num_file_creations', 62.233624927703879),
('flag_SF', 51.0),
('num_failed_logins', 46.038556418455919),
('hot', 40.774516817095183),
('logged_in', 10.569767441860465),
('dst_bytes', 9.1548543553434012),
('src_bytes', 8.4640642049489454),
('duration', 7.2258291572125568),
('dst_host_srv_diff_host_rate', 5.7568806827546997),
('dst_host_diff_srv_rate', 4.8373418489732671),
('num_access_files', 4.6948792486583191),
('dst_host_same_src_port_rate', 4.3930803788834885),
('num_compromised', 4.3385392749839271),
('diff_srv_rate', 4.0690854850685172),
('dst_host_srv_rerror_rate', 3.667920527965804),
('srv_rerror_rate', 3.6677418023254122),
('rerror_rate', 3.6455860878284372),
('dst_host_rerror_rate', 3.2795669242442695),
('srv_diff_host_rate', 3.0815657101101674),
('flag_S0', 2.965034965034965),
('wrong_fragment', 2.7428963354889282),
('dst_host_srv_serror_rate', 2.6731595957142456),
('srv_serror_rate', 2.6432463184901405),
('serror_rate', 2.6310546426370025),
('dst_host_serror_rate', 2.6293396511769247),
('num_root', 2.6091432537726016),
('count', 2.1174082949142403),
('service_telnet', 1.8888888888888888),
('dst_host_srv_count', 1.6453161847397422),
('dst_host_same_srv_rate', 1.5575788279744123),
('service_ftp_data', 1.5447570332480818),
('same_srv_rate', 1.507961200604778),
('dst_host_count', 1.3428596865228266),
('service_http', 1.2988666621151088),
('srv_count', 1.1773191099992069),
('root_shell', 1.0),
('service_private', 0.72528123149792778),
('service_eco_i', 0.54037267080745344),
('is_guest_login', 0.45894428152492667),
('service_ftp', 0.45680819912152271),
('flag_REJ', 0.32650506429953341),
('flag_RSTR', 0.23005487547488393),
('protocol_type_udp', 0.22644739478045495),
('service_other', 0.16945921541085582),
('service_domain_u', 0.15493320070658045),
('service_smtp', 0.11654010677454654),
('service_ecr_i', 0.066012116147900562),
('flag_RSTO', 0.048472075869336141),
('service_finger', 0.026095310440358364),
('flag_SH', 0.023263980335352472),
('service_Z39_50', 0.018879226195758277),
('service_uucp', 0.017029097834270781),
('service_courier', 0.0160615915577089),
('service_auth', 0.015544843445957898),
('service_bgp', 0.015455027858848501),
('service_uucp_path', 0.014938896377980597),
('service_iso_tsap', 0.014916467780429593),
('service_whois', 0.014804339660163068),
('service_imap4', 0.013729168965897804),
('service_nnsp', 0.013729168965897804),
('service_vmnet', 0.013371284834844774),
('service_time', 0.012142983074753174),
('service_ctf', 0.011853092158893123),
('service_csnet_ns', 0.011741639864299247),
('service_supdup', 0.011630212119209674),
('service_http_443', 0.011518808915514052),
('service_discard', 0.011451978769793203),
('service_domain', 0.011184746471740902),
('service_daytime', 0.011073441352588941),
('service_gopher', 0.010672945733022314),
('service_efs', 0.010517283108539242),
('service_exec', 0.010228322555100963),
('service_systat', 0.010117227879561),
('service_link', 0.0099839465177138081),
('service_hostnames', 0.0098284960422163597),
('service_name', 0.0094068001494538346),
('service_klogin', 0.0093402487802733952),
('service_login', 0.0092293493308721729),
('service_mtp', 0.0091406473160334858),
('service_echo', 0.0091406473160334858),
('service_urp_i', 0.0089745894762075992),
('flag_RSTOS0', 0.0089154332208084483),
('service_ldap', 0.0088524734206133025),
('service_netbios_dgm', 0.0086087624903920055),
('service_sunrpc', 0.0080995653891742393),
('service_netbios_ssn', 0.0076572030365527231),
('service_netstat', 0.0075466731018142726),
('service_netbios_ns', 0.0073698756333486874),
('service_kshell', 0.0063985975676564043),
('service_ssh', 0.0061560706305043159),
('service_nntp', 0.0061560706305043159),
('flag_S1', 0.0053895076289152306),
('service_sql_net', 0.0050991377423731778),
('flag_S3', 0.0030241935483870967),
('service_pop_3', 0.0027696293759399615),
('service_ntp_u', 0.0025009304056568663),
('flag_S2', 0.0017702011186481019),
('service_rje', 0.0015466575012888812),
('service_remote_job', 0.0015466575012888812),
('service_pop_2', 0.0015264845061822622),
('service_printer', 0.0013517933064428214),
('service_shell', 0.0011553385359898854),
('su_attempted', 0.001006036217303823),
('service_X11', 0.00099589749687853022),
('service_pm_dump', 0.00042914771264269161),
('land', 0.00039207998431680063),
('service_aol', 0.00017161489617298782),
('service_http_8001', 0.00017161489617298782),
('service_harvest', 0.00017161489617298782),
('service_urh_i', 0.00014851558671082529),
('service_red_i', 0.00011880894037276305),
('service_http_2784', 8.5800085800085798e-05),
('service_tim_i', 7.4252279544982031e-05),
('service_tftp_u', 4.4550044550044547e-05),
('is_host_login', 1.4849573817231445e-05)]
```python
# Only keep features with AR value >= 0.01
features_to_use = []
for x, y in sorted_AR:
    if y >= 0.01:
        features_to_use.append(x)
features_to_use
```
['protocol_type_tcp',
'num_shells',
'urgent',
'num_file_creations',
'flag_SF',
'num_failed_logins',
'hot',
'logged_in',
'dst_bytes',
'src_bytes',
'duration',
'dst_host_srv_diff_host_rate',
'dst_host_diff_srv_rate',
'num_access_files',
'dst_host_same_src_port_rate',
'num_compromised',
'diff_srv_rate',
'dst_host_srv_rerror_rate',
'srv_rerror_rate',
'rerror_rate',
'dst_host_rerror_rate',
'srv_diff_host_rate',
'flag_S0',
'wrong_fragment',
'dst_host_srv_serror_rate',
'srv_serror_rate',
'serror_rate',
'dst_host_serror_rate',
'num_root',
'count',
'service_telnet',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'service_ftp_data',
'same_srv_rate',
'dst_host_count',
'service_http',
'srv_count',
'root_shell',
'service_private',
'service_eco_i',
'is_guest_login',
'service_ftp',
'flag_REJ',
'flag_RSTR',
'protocol_type_udp',
'service_other',
'service_domain_u',
'service_smtp',
'service_ecr_i',
'flag_RSTO',
'service_finger',
'flag_SH',
'service_Z39_50',
'service_uucp',
'service_courier',
'service_auth',
'service_bgp',
'service_uucp_path',
'service_iso_tsap',
'service_whois',
'service_imap4',
'service_nnsp',
'service_vmnet',
'service_time',
'service_ctf',
'service_csnet_ns',
'service_supdup',
'service_http_443',
'service_discard',
'service_domain',
'service_daytime',
'service_gopher',
'service_efs',
'service_exec',
'service_systat']
```python
len(features_to_use)
```
76
```python
len(sorted_AR) - len(features_to_use)
```
42
```python
train_df_trimmed = train_df[features_to_use].copy()
test_df_trimmed = test_df[features_to_use].copy()
```
```python
numeric_cols_to_use = list(set(numeric_cols).intersection(features_to_use))
```
```python
# The recreated (trimmed) dataframes hold raw values, so the selected numeric features must be rescaled
standard_scaler = StandardScaler()
train_df_trimmed[numeric_cols_to_use] = standard_scaler.fit_transform(train_df_trimmed[numeric_cols_to_use])
test_df_trimmed[numeric_cols_to_use] = standard_scaler.transform(test_df_trimmed[numeric_cols_to_use])
```
## Applying advanced ensembling
```python
kmeans = KMeans(n_clusters=8, random_state=17)
kmeans.fit(train_df_trimmed[numeric_cols_to_use])
kmeans_train_y = kmeans.labels_
```
```python
pd.crosstab(kmeans_train_y, train_Y_bin)
```
| kmeans cluster (row_0) | train_Y_bin = 0 (benign) | train_Y_bin = 1 (attack) |
|---|---|---|
| 0 | 63569 | 6457 |
| 1 | 2784 | 11443 |
| 2 | 126 | 34700 |
| 3 | 1 | 0 |
| 4 | 628 | 4335 |
| 5 | 167 | 757 |
| 6 | 0 | 884 |
| 7 | 68 | 54 |
```python
train_df['kmeans_y'] = kmeans_train_y
train_df_trimmed['kmeans_y'] = kmeans_train_y
```
```python
kmeans_test_y = kmeans.predict(test_df_trimmed[numeric_cols_to_use])
test_df['kmeans_y'] = kmeans_test_y
```
```python
pca8 = PCA(n_components=2)
train_df_trimmed_pca8 = pca8.fit_transform(train_df_trimmed)
plt.figure(figsize=(15,10))
colors8 = ['navy', 'turquoise', 'darkorange', 'red', 'purple', 'green', 'magenta', 'black']
labels8 = [0,1,2,3,4,5,6,7]
for color, cat in zip(colors8, labels8):
    plt.scatter(train_df_trimmed_pca8[train_df.kmeans_y==cat, 0], train_df_trimmed_pca8[train_df.kmeans_y==cat, 1],
                color=color, alpha=.8, lw=2, label=cat)
```

```python
pd.crosstab(test_df.kmeans_y, test_df.labels2)
```
| kmeans_y | labels2 = 0 | labels2 = 1 |
|---|---|---|
| 0 | 4795 | 9515 |
| 1 | 5131 | 87 |
| 2 | 1997 | 6 |
| 4 | 427 | 51 |
| 5 | 1 | 10 |
| 6 | 8 | 37 |
| 7 | 474 | 5 |
```python
# Ensembling strategy (a dispatch sketch follows below):
# 1. Clusters with an aggregate size of fewer than 200 samples are treated
#    as outliers, and all of their samples are assigned the attack label.
# 2. Clusters with more than 95% of samples belonging to a single class
#    (either attack or benign) are assigned the dominant label wholesale.
# 3. For each remaining cluster, we train a separate random forest classifier.
```
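A minimal sketch of how the three rules above could be dispatched per cluster, using the training-set labels. This only illustrates the stated thresholds; it is not the lab's recorded implementation, and the per-cluster choices below were made by inspection:
```python
# Sketch only: pick a strategy for each cluster from its training labels
def choose_strategy(cluster_labels, min_size=200, purity=0.95):
    counts = cluster_labels.value_counts()
    total = counts.sum()
    if total < min_size:
        return 'outlier (assign attack label)'               # rule 1
    if counts.max() / total > purity:
        return 'dominant label: {}'.format(counts.idxmax())  # rule 2
    return 'train a random forest'                           # rule 3

for c in sorted(train_df.kmeans_y.unique()):
    print(c, choose_strategy(train_df[train_df.kmeans_y == c]['labels2']))
```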
```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
```
### Cluster 0 - Random Forest Classifier (Strategy Option 3)
```python
train_y0 = train_df[train_df.kmeans_y==0]
test_y0 = test_df[test_df.kmeans_y==0]
rfc = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=17).fit(train_y0.drop(['labels2', 'labels5', 'kmeans_y', 'attack_category', 'attack_type'], axis=1), train_y0['labels2'])
pred_y0 = rfc.predict(test_y0.drop(['labels2', 'labels5', 'kmeans_y', 'attack_category', 'attack_type'], axis=1))
print("cluster {} score is {}, {}".format(0, accuracy_score(pred_y0, test_y0['labels2']), accuracy_score(pred_y0, test_y0['labels2'], normalize=False)))
print(confusion_matrix(test_y0['labels2'], pred_y0))
```
cluster 0 score is 0.7673654786862334, 10981
[[1618 3177]
[ 152 9363]]
### Cluster 1 - Dominant Label Zero (Strategy Option 2)
```python
print(confusion_matrix(test_df[test_df.kmeans_y==1]['labels2'], np.zeros(len(test_df[test_df.kmeans_y==1]))))
```
[[5131 0]
[ 87 0]]
### Cluster 2 - Dominant Label Zero (Strategy Option 2)
```python
print(confusion_matrix(test_df[test_df.kmeans_y==2]['labels2'], np.zeros(len(test_df[test_df.kmeans_y==2]))))
```
[[1997 0]
[ 6 0]]
### Cluster 3 - Empty Cluster
### Cluster 4 - Random Forest Classifier (Strategy Option 3)
```python
train_y0 = train_df[train_df.kmeans_y==4]
test_y0 = test_df[test_df.kmeans_y==4]
rfc = RandomForestClassifier(n_estimators=500, max_depth=20, random_state=17).fit(train_y0.drop(['labels2', 'labels5', 'kmeans_y', 'attack_category', 'attack_type'], axis=1), train_y0['labels2'])
pred_y0 = rfc.predict(test_y0.drop(['labels2', 'labels5', 'kmeans_y', 'attack_category', 'attack_type'], axis=1))
print("cluster {} score is {}, {}".format(4, accuracy_score(pred_y0, test_y0['labels2']), accuracy_score(pred_y0, test_y0['labels2'], normalize=False)))
print(confusion_matrix(test_y0['labels2'], pred_y0))
```
cluster 4 score is 0.9309623430962343, 445
[[405 22]
[ 11 40]]
### Cluster 5 - Outlier/Attack (Strategy Option 1)
```python
print(confusion_matrix(test_df[test_df.kmeans_y==5]['labels2'], np.ones(len(test_df[test_df.kmeans_y==5]))))
```
[[ 0 1]
[ 0 10]]
### Cluster 6 - Outlier/Attack (Strategy Option 1)
```python
print(confusion_matrix(test_df[test_df.kmeans_y==6]['labels2'], np.ones(len(test_df[test_df.kmeans_y==6]))))
```
[[ 0 8]
[ 0 37]]
### Cluster 7 - Dominant Label Zero (Strategy Option 2)
```python
print(confusion_matrix(test_df[test_df.kmeans_y==7]['labels2'], np.zeros(len(test_df[test_df.kmeans_y==7]))))
```
[[474 0]
[ 5 0]]
### Combined Results: k-means + Random Forest Classifier ensembling with AR feature selection
```python
# combined results:
num_samples = 22544
false_pos = 3177 + 22 + 1 + 8
false_neg = 152 + 87 + 6 + 11 + 5
print('True positive %: {}'.format(1-(false_pos/num_samples)))
print('True negative %: {}'.format(1-(false_neg/num_samples)))
```
True positive %: 0.8577004968062456
True negative %: 0.9884226401703335
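Taken together, the per-cluster results above misclassify 3208 + 261 = 3469 of the 22544 test samples, so the ensemble reaches an overall two-class accuracy of roughly 84.6%.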
###### tags: `ML Security`