Intrusion detection(Binary classification) with PCA and RandomForest

'''
Project: NGuard
Dataset: http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/MachineLearningCSV.zip
'''
'\nProject: NGuard\nDataset: http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/MachineLearningCSV.zip\n' 

Extracting the parquet files, which contain the network flows captured on Monday, Tuesday, Wednesday, Thursday and Friday

# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')


import pandas as pd
import numpy as np
import glob
import pathlib
import os
from joblib import dump,load

def extract_dataset():
  '''
  Make sure the dataset files are unpacked in the current working directory.

  If "merged.parquet" is not present in the cwd, the archive at
  "<cwd>/drive/MyDrive/dataset.zip" is extracted into the cwd.
  The cwd is returned as a string in both cases.
  '''
  here = os.getcwd()

  # Cache hit: a previous run already produced the merged file, nothing to unpack.
  if os.path.exists("merged.parquet"):
    return here

  import zipfile
  with zipfile.ZipFile(f"{here}/drive/MyDrive/dataset.zip","r") as archive:
    archive.extractall()
  return here


def read_as_dataframe(master_file):
  '''
  Return the combined dataframe, using "merged.parquet" in the cwd as a cache.

  master_file: list of dataframes to concatenate when no cache exists.

  When "merged.parquet" exists it is read and returned and master_file is
  ignored. Otherwise the frames are concatenated with a fresh index,
  written to "merged.parquet" (without the index) and returned.
  '''
  cache = pathlib.Path("merged.parquet")

  if cache.exists():
    return pd.read_parquet(cache,engine='pyarrow')

  combined = pd.concat(master_file,ignore_index=True)
  combined.to_parquet('merged.parquet',index=False)
  return combined
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True). 

Extract the data from zip file

# extract_dataset() unpacks the archive when needed and returns the current
# working directory; the per-day parquet files are read from that location
dirpath = extract_dataset()

Read parquet file as dataframe

def _read_day(stem):
  '''Load "<dirpath>/<stem>.parquet" with the pyarrow engine.'''
  return pd.read_parquet(f'{dirpath}/{stem}.parquet',engine="pyarrow")


# Monday to Wednesday are stored as one file per day.
mon = _read_day('monday')
tues = _read_day('tuesday')
wed = _read_day('wednesday')

# Thursday and Friday are split over several files; stitch the parts together
# and renumber the rows.
thurs = pd.concat([_read_day(part) for part in ('thursday1', 'thursday2')],
                  ignore_index=True)
fri = pd.concat([_read_day(part) for part in ('friday1', 'friday2', 'friday3')],
                ignore_index=True)

Display the total numbers of data in each dataframe

# For every day: the distinct labels it contains and how many labelled rows it has.
for day, frame in (('monday', mon), ('tuesday', tues), ('wednesday', wed),
                   ('thursday', thurs), ('friday', fri)):
  day_labels = frame[' Label']
  print(day, day_labels.unique(),day_labels.count())
monday ['BENIGN'] 529918
tuesday ['BENIGN' 'FTP-Patator' 'SSH-Patator'] 445909
wednesday ['BENIGN' 'DoS slowloris' 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye'
 'Heartbleed'] 692703
thursday ['BENIGN' 'Web Attack � Brute Force' 'Web Attack � XSS'
 'Web Attack � Sql Injection' 'Infiltration'] 458968
friday ['BENIGN' 'Bot' 'PortScan' 'DDoS'] 703245 

We can see that Monday contains only benign traffic, while the other days have a mix of attack and benign labels. We therefore exclude the Monday data and build a combined dataset from the other days.

# Each attack-bearing day is de-duplicated on its own before being handed to
# read_as_dataframe.
merged = read_as_dataframe([day.drop_duplicates() for day in (tues, wed, thurs, fri)])

label_counts = merged[' Label'].value_counts()
print(label_counts)
BENIGN                        1635861
DoS Hulk                       172849
DDoS                           128016
PortScan                        90819
DoS GoldenEye                   10286
FTP-Patator                      5933
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
Bot                              1953
Web Attack � Brute Force         1470
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name:  Label, dtype: int64 

From the counts above we can see that Infiltration, Web Attack Sql Injection and Heartbleed have very few samples compared with the other classes.

We leave them out of the training data and keep them for testing instead, since they act as attack types the model has never seen.

# Classes that are held out of `merged` and kept only in `new_test`.
rare_labels = ('Infiltration', 'Web Attack � Sql Injection', 'Heartbleed')

# One frame per rare class, in the order listed above.
held_out = [merged.loc[merged[' Label'] == rare] for rare in rare_labels]
new_test = pd.concat(held_out,ignore_index=True)

# Remove those rows from the training pool.
for subset in held_out:
  merged.drop(subset.index,inplace=True)

# Release the temporary per-class frames.
del held_out
del subset

Statistics of merged dataframe

# Column names first, then the (rows, columns) shape of the merged frame.
for summary in (merged.columns, merged.shape):
  print(summary)
Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count',
       ' SYN Flag Count', ' RST Flag Count', ' PSH Flag Count',
       ' ACK Flag Count', ' URG Flag Count', ' CWE Flag Count',
       ' ECE Flag Count', ' Down/Up Ratio', ' Average Packet Size',
       ' Avg Fwd Segment Size', ' Avg Bwd Segment Size',
       ' Fwd Header Length.1', 'Fwd Avg Bytes/Bulk', ' Fwd Avg Packets/Bulk',
       ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk', ' Bwd Avg Packets/Bulk',
       'Bwd Avg Bulk Rate', 'Subflow Fwd Packets', ' Subflow Fwd Bytes',
       ' Subflow Bwd Packets', ' Subflow Bwd Bytes', 'Init_Win_bytes_forward',
       ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward', 'Active Mean', ' Active Std', ' Active Max',
       ' Active Min', 'Idle Mean', ' Idle Std', ' Idle Max', ' Idle Min',
       ' Label'],
      dtype='object')
(2061671, 79) 
Intrusion detection can be approached either as anomaly detection or as supervised binary classification.

We will proceed with the latter, converting all attack class labels into a single class, ‘anomaly’.

With this approach we are left with 2 classes: Benign and Anomaly (Intrusion).

original_labels = merged[' Label']
print(original_labels.unique())

# Keep 'BENIGN' as it is and collapse every other label into 'ANOMALOUS'.
labels = original_labels.where(original_labels == 'BENIGN', 'ANOMALOUS')
print(labels.unique())

# Share of each class, in percent of all rows.
val = labels.value_counts()
total = val['BENIGN']+val['ANOMALOUS']
print('benign:',(val['BENIGN']/total)*100 )
print('anomalous:',(val['ANOMALOUS']/total*100 ))
['BENIGN' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris' 'DoS Slowhttptest'
 'DoS Hulk' 'DoS GoldenEye' 'Web Attack � Brute Force' 'Web Attack � XSS'
 'Bot' 'PortScan' 'DDoS']
['BENIGN' 'ANOMALOUS']
benign: 79.34636515719531
anomalous: 20.653634842804696 

From the above we see that the dataset is quite imbalanced for supervised learning. Such imbalance can cause the model to overfit to a particular class, especially benign.

Data processing

Before splitting the dataset into train and test sets, there are some considerations to take into account.


The following is dropped from the dataset: Destination Port. Reason: it carries no significance, since a host can run a service on any port. For example, an SSH service can listen on any port, so traffic to port 22 does not necessarily indicate an attack.

# Turn +/- infinity into NaN so one missing-value check covers both cases.
merged.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
# Names of the columns that contain at least one NaN.
merged.columns[merged.isna().any()]
Index(['Flow Bytes/s', ' Flow Packets/s'], dtype='object') 
# Same check via isnull(): names of the columns holding at least one null.
merged.columns[merged.isnull().any()]
Index(['Flow Bytes/s', ' Flow Packets/s'], dtype='object') 

From the above we can see that [‘Flow Bytes/s’, ‘ Flow Packets/s’] contain NaN, null or infinite values, so we clean the data by dropping these columns too. In addition, the ‘ Fwd Header Length.1’ column is redundant with the ‘ Fwd Header Length’ column.

After dropping these columns and separating X and y as the input matrix and target vector, with Benign encoded as 1 and Anomalous as 0, we obtain the following.

# Drop the destination port, the two rate columns that contain NaN/inf values
# and the duplicated forward-header-length column.
merged.drop([' Destination Port','Flow Bytes/s',' Flow Packets/s',' Fwd Header Length.1'],inplace=True,axis=1)
# Shorter column names; the order matches the remaining 75 columns.
merged.columns = ['Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts',
       'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
       'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len',
       'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max',
       'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt',
       'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt',
       'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio',
       'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
       'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg',
       'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg',
       'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts',
       'Subflow Bwd Byts', 'Init Fwd Win Byts', 'Init Bwd Win Byts',
       'Fwd Act Data Pkts', 'Fwd Seg Size Min', 'Active Mean', 'Active Std',
       'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max',
       'Idle Min', 'Label']
# Input matrix: every column except the label.
X = merged.drop(['Label'],axis=1)
# Target vector: 1 for BENIGN, 0 for any attack label.
# The encoding is computed from the label column alone. Assigning through a
# row mask (merged[mask] = 0) would overwrite every feature of the selected
# rows as well and leave an object-dtype label column behind.
y = (merged['Label'] == 'BENIGN').astype(int)

Training Phase

Since the dataset is quite large (about 2.06 million rows, roughly 20.6 lakh), it is unnecessary to train on all of it. So, for the train/test split, we take only about 10 percent for training and keep the rest for testing. We use stratify to maintain the proportion of the two classes for now.

from sklearn.model_selection import train_test_split
# 10% of the rows go to training, the rest to testing; stratify keeps the class
# ratio of y in both parts. random_state is fixed so that the split, and every
# artefact fitted on it, is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X,y,stratify=y,train_size=0.10,random_state=0)
print("X_train.shape",X_train.shape)
print("X_test.shape",X_test.shape)
X_train.shape (206167, 74)
X_test.shape (1855504, 74) 

Dimensionality Reduction with PCA

Before applying Principal Component Analysis, we need to scale the input matrix. We do that using the standard scaler. Refer to: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

Following the paper ‘https://doi.org/10.3390/electronics8030322’, we reduce the 74 features to 10 principal components.

# Does the feature matrix contain any value below zero / above zero?
has_negative = (X < 0).values.any()
has_positive = (X > 0).values.any()
print("finite negative value",has_negative)
print("finite positive value",has_positive)
finite negative value True
finite positive value True 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Standardise the features: the scaler is fitted on the training split only
# and then applied to that same split.
train_matrix = X_train.values
ss = StandardScaler()
ss.fit(train_matrix)
X_Scale = ss.transform(train_matrix)
del train_matrix
print("X_scale shape:",X_Scale.shape)
# Persist the fitted scaler.
dump(ss, 'bscaler.joblib')


# Project the scaled features onto 10 principal components.
pca = PCA(n_components=10)
principal_components = pca.fit_transform(X_Scale)
print("X_pca shape:",principal_components.shape)
# Persist the fitted PCA.
dump(pca, 'bpca.joblib')
X_scale shape: (206167, 74)
X_pca shape: (206167, 10)





['bpca.joblib'] 
# Share of the total variance captured by each fitted principal component.
pca.explained_variance_ratio_
array([0.23016039, 0.14102824, 0.08945196, 0.0645161 , 0.04670917,
       0.04254841, 0.03680265, 0.03450822, 0.03041049, 0.03030438]) 

The training data is still imbalanced, at a ratio of nearly 80:20, so we apply SMOTE to balance the training set by oversampling the minority class.

Refer to: https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html

from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=2)
# NOTE: X is rebound here. From this point on it holds the resampled PCA
# components of the training split, and Y holds the matching resampled labels.
X,Y = sm.fit_resample(principal_components,Y_train.values.astype('int'))

print(X.shape)
# Report the resampled labels (Y), not the full label vector (y).
print(Y.shape)
(327172, 10)
(2061671,) 

Training with Random Forest classifier

Refer to: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

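Here a random forest, an ensemble technique, is used to train the model. A successive-halving grid search over max_depth and min_samples_split selects the hyperparameters, using the number of trees (capped at 10) as the resource. The final forest is then trained with the best parameters found (in the run shown below: unlimited depth, min_samples_split=5 and 9 trees) and random_state=10.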

# The experimental import must come before HalvingGridSearchCV is imported.
from sklearn.experimental import enable_halving_search_cv  
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Candidate settings; the halving search uses the number of trees as its
# resource, with at most 10 trees per candidate.
param_grid = {
    "max_depth": [10, None],
    "min_samples_split": [5, 10],
}
clf = RandomForestClassifier(n_jobs=-1,random_state=0)
search = HalvingGridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    resource='n_estimators',
    max_resources=10,
    random_state=0,
)
search.fit(X, Y)
print(search.best_params_ )
{'max_depth': None, 'min_samples_split': 5, 'n_estimators': 9} 
# NOTE: to train a larger forest than the search selected, set
# search.best_params_['n_estimators'] (e.g. to 50) before this point.
final_params = dict(search.best_params_, random_state=10, n_jobs=-1)
clf = RandomForestClassifier(**final_params)
clf.fit(X,Y)
RandomForestClassifier(min_samples_split=5, n_estimators=9, n_jobs=-1,
                       random_state=10) 
# Persist the fitted classifier to 'binary.joblib'.
dump(clf, 'binary.joblib') 
['binary.joblib'] 
def _to_components(features):
  '''Apply the fitted scaler and then the fitted PCA to raw feature values.'''
  return pca.transform(ss.transform(features))


# Hard class predictions for the held-back test split.
Y_predicted = clf.predict(_to_components(X_test.values))

Verifying

def test_performance(y_actual,y_predicted,y_score=None):
    '''
    Print classification metrics and plot the confusion matrix.

    The confusion matrix is laid out with class 1 first and class 0 second,
    rows being the true class and columns the predicted class.

    Parameters
    ----------
    y_actual : array-like
        True class labels (1 or 0).
    y_predicted : array-like
        Hard class predictions (1 or 0).
    y_score : array-like, optional
        Scores for class 1, for example ``predict_proba(...)[:, 1]``.
        When given, the ROC curve and its AUC are computed from these scores.
        When omitted, the hard predictions are used as before; the curve then
        has a single operating point and the printed AUC equals the
        macro-averaged recall.
    '''
    from sklearn.metrics import (ConfusionMatrixDisplay, auc,
                                 classification_report, confusion_matrix,
                                 f1_score, precision_score, recall_score,
                                 roc_curve)

    # Micro-averaged F1; for single-label predictions this is the same number
    # as the accuracy printed further down.
    score = f1_score(y_actual,y_predicted,average='micro')
    print('F1 Score: %.3f' % score)

    cmatrix = confusion_matrix(y_actual,y_predicted,labels=[1,0])
    cm_obj = ConfusionMatrixDisplay(cmatrix,display_labels=[1,0])
    cm_obj.plot()
    cm_obj.ax_.set(
                    title='Sklearn Confusion Matrix for Benign and Intrusions', 
                    xlabel='Predicted Class', 
                    ylabel='True Class')

    print('Precison',precision_score(y_actual, y_predicted, average='macro'))
    print('Recall',recall_score(y_actual, y_predicted, average='macro'))

    # Name the four cells once: first row is true class 1, second row is
    # true class 0; first column is predicted 1, second column is predicted 0.
    (true1_pred1, true1_pred0), (true0_pred1, true0_pred0) = cmatrix
    total = true1_pred1+true1_pred0+true0_pred1+true0_pred0

    print('Misclassification',(true1_pred0+true0_pred1)/total)
    print('\n')
    print('Accuracy',(true1_pred1+true0_pred0)/total)
    # Share of true class-0 samples that were predicted as class 1.
    print('FPR(A classicifed as B)',(true0_pred1)/(true0_pred1+true0_pred0))

    # Prefer continuous scores for the ROC curve when the caller supplies them.
    roc_input = y_predicted if y_score is None else y_score
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_actual, roc_input)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print('roc auc',roc_auc)
    print("Classification report:" "\n",classification_report(y_actual,y_predicted))
# Score the hard predictions against the integer-encoded test labels.
test_performance(Y_test.astype('int'), Y_predicted)
F1 Score: 0.990
Precison 0.9879654453757049
Recall 0.9802614457684058
Misclassification 0.010370767187782941


Accuracy 0.989629232812217
FPR(A classicifed as B) 0.03569928163056554
roc auc 0.9802614457684058
Classification report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.97    383229
           1       0.99      1.00      0.99   1472275

    accuracy                           0.99   1855504
   macro avg       0.99      0.98      0.98   1855504
weighted avg       0.99      0.99      0.99   1855504 

Verifying with new_test

# Remove the same four columns that were removed from the training data.
new_test = new_test.drop(columns=[' Destination Port','Flow Bytes/s',' Flow Packets/s',' Fwd Header Length.1'])
# Apply the same short column names as used for the training data.
new_test.columns = ['Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts',
       'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
       'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len',
       'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max',
       'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt',
       'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt',
       'URG Flag Cnt', 'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio',
       'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
       'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg',
       'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg',
       'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts',
       'Subflow Bwd Byts', 'Init Fwd Win Byts', 'Init Bwd Win Byts',
       'Fwd Act Data Pkts', 'Fwd Seg Size Min', 'Active Mean', 'Active Std',
       'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max',
       'Idle Min', 'Label']
# Unique feature rows of the held-out classes, label column removed.
unseen_features = new_test.drop(columns=['Label']).drop_duplicates()
# Same preprocessing chain as for the training data: scaler first, then PCA.
xx = ss.transform(unseen_features.values)
xx = pca.transform(xx)
del unseen_features
p = clf.predict(xx)
# How many samples were assigned to each predicted class.
np.unique(p,return_counts=True)
(array([0, 1]), array([ 3, 65]))