diff --git a/code/machine_learning_models/random_forest.py b/code/machine_learning_models/random_forest.py
new file mode 100644
index 0000000000000000000000000000000000000000..c030c0511029b2b0c8c11229082e454eba81d390
--- /dev/null
+++ b/code/machine_learning_models/random_forest.py
@@ -0,0 +1,107 @@
+"""Train and evaluate a Random Forest classifier on the NSL-KDD training set.
+
+Running (or importing) this module loads ``KDDTrain+.arff``, pre-processes it
+with the helpers from ``utilities``, fits the model, reports its quality and
+exposes ``predict`` for classifying new samples.
+"""
+
+import numpy as np
+import pandas as pd
+from scipy.io import arff
+
+import warnings
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+
+from utilities import ordinal_encode, heat_map, plot_features, plot_confusion_matrix, normalize, print_high_confidence_samples
+
+# Silence library warnings so the console output stays readable.
+warnings.filterwarnings("ignore")
+
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import classification_report, confusion_matrix
+
+# Constants
+X_data = 'count'
+y_data = 'class'  # name of the target column
+arff_path = "nsl-kdd-dataset/" + "KDDTrain+.arff"
+sc = StandardScaler()
+enc = LabelEncoder()
+model_name = "Random Forest"  # passed to the plotting helpers to label figures
+
+# Getting data frame (ARFF)
+data, meta = arff.loadarff(arff_path)
+df = pd.DataFrame(data)
+# Decode bytes cells to str; every other value is left untouched.
+df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
+
+# Data inspection and pre-processing
+print("Before: \n", df)
+
+# predict() below only distinguishes "anomaly" (class 1) from "normal", so the
+# target column must hold exactly two classes; fail early with a clear reason.
+is_threat = df[y_data].unique()
+if len(is_threat) != 2:
+    raise ValueError(
+        f"Expected exactly 2 classes in column '{y_data}', "
+        f"found {len(is_threat)}: {', '.join(map(str, is_threat))}"
+    )
+
+numerical_columns = df.select_dtypes(include=np.number).columns
+label_columns = df.select_dtypes(include=object, exclude=np.number).columns
+
+# Normalize data
+ordinal_encode(df=df, categories=is_threat, target=y_data)
+normalize(df, y_data, sc, enc)
+print("After: \n", df)
+
+# Correlation
+heat_map(df, model_name=model_name)
+
+# Data preparation
+X = df.select_dtypes(include=np.number).drop(columns=y_data)  # X must be independent
+y = df[[y_data]]  # y must ideally be dependent on X
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+# Initialize and train the Random Forest model
+model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
+model.fit(X_train, y_train.values.ravel())
+
+# Predictions
+y_prediction = model.predict(X_test)
+
+# Plot Confusion Matrix
+plot_confusion_matrix(confusion_matrix=confusion_matrix(y_test, y_prediction),
+                      accuracy=model.score(X_test, y_test),
+                      model_name=model_name)
+
+print(classification_report(y_test, y_prediction))
+
+
+# Prediction function
+def predict(prediction_input):
+    """Classify samples with the trained Random Forest.
+
+    Args:
+        prediction_input: Rows of feature values; a DataFrame is built from
+            them using ``X.columns`` as the column labels.
+
+    Returns:
+        A list with ``"anomaly"`` for every row predicted as class 1 and
+        ``"normal"`` otherwise, or ``None`` when the input is empty.
+    """
+    if len(prediction_input) == 0:
+        return None
+    input_df = pd.DataFrame(prediction_input, columns=X.columns)
+    # Apply the module-level scaler ``sc`` to the numeric columns before predicting.
+    input_df[numerical_columns] = sc.transform(input_df[numerical_columns])
+    return ["anomaly" if x == 1 else "normal" for x in model.predict(input_df)]
+
+
+print_high_confidence_samples(model, X)
+
+# Feature Importance Plot
+features = pd.DataFrame(model.feature_importances_, index=X.columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
+
+plot_features(features, "Higher importance = More impact on classification", model_name=model_name)
diff --git a/code/machine_learning_models/resulting_figures/Decision Tree - Features.png b/code/machine_learning_models/resulting_figures/Decision Tree - Feature Importance.png
similarity index 100%
rename from code/machine_learning_models/resulting_figures/Decision Tree - Features.png
rename to code/machine_learning_models/resulting_figures/Decision Tree - Feature Importance.png
diff --git a/code/machine_learning_models/resulting_figures/Logistic Regression - Confusion Matrix.png b/code/machine_learning_models/resulting_figures/Logistic Regression - Confusion Matrix.png
index 2eef5ed9a66d318db590eb6d0e25a4b0083034c4..899ea6e6ff2bf328ecf37c2d6166d87ff3d052a9 100644
Binary files a/code/machine_learning_models/resulting_figures/Logistic Regression - Confusion Matrix.png and b/code/machine_learning_models/resulting_figures/Logistic Regression - Confusion Matrix.png differ
diff --git a/code/machine_learning_models/resulting_figures/Logistic 
Regression - Counts.png b/code/machine_learning_models/resulting_figures/Logistic Regression - Counts.png index 2eef5ed9a66d318db590eb6d0e25a4b0083034c4..b8ba6161600e62b8222d00a3abdce740105aa78c 100644 Binary files a/code/machine_learning_models/resulting_figures/Logistic Regression - Counts.png and b/code/machine_learning_models/resulting_figures/Logistic Regression - Counts.png differ diff --git a/code/machine_learning_models/resulting_figures/Logistic Regression - Data Correlations.png b/code/machine_learning_models/resulting_figures/Logistic Regression - Data Correlations.png new file mode 100644 index 0000000000000000000000000000000000000000..c029361b010c1403c01b3cac34965bb095651607 Binary files /dev/null and b/code/machine_learning_models/resulting_figures/Logistic Regression - Data Correlations.png differ diff --git a/code/machine_learning_models/resulting_figures/Logistic Regression - Feature Importance.png b/code/machine_learning_models/resulting_figures/Logistic Regression - Feature Importance.png new file mode 100644 index 0000000000000000000000000000000000000000..7b79d8afa59157795b0145eca45e70cc63d395ea Binary files /dev/null and b/code/machine_learning_models/resulting_figures/Logistic Regression - Feature Importance.png differ diff --git a/code/machine_learning_models/resulting_figures/Logistic Regression - Features.png b/code/machine_learning_models/resulting_figures/Logistic Regression - Features.png deleted file mode 100644 index 2eef5ed9a66d318db590eb6d0e25a4b0083034c4..0000000000000000000000000000000000000000 Binary files a/code/machine_learning_models/resulting_figures/Logistic Regression - Features.png and /dev/null differ diff --git a/code/machine_learning_models/resulting_figures/Logistic Regression - Heat Map.png b/code/machine_learning_models/resulting_figures/Logistic Regression - Heat Map.png deleted file mode 100644 index 2eef5ed9a66d318db590eb6d0e25a4b0083034c4..0000000000000000000000000000000000000000 Binary files 
a/code/machine_learning_models/resulting_figures/Logistic Regression - Heat Map.png and /dev/null differ diff --git a/code/machine_learning_models/resulting_figures/Random Forest - Confusion Matrix.png b/code/machine_learning_models/resulting_figures/Random Forest - Confusion Matrix.png new file mode 100644 index 0000000000000000000000000000000000000000..54e83b77680adf294ae69f5eba5d7fc601632ea8 Binary files /dev/null and b/code/machine_learning_models/resulting_figures/Random Forest - Confusion Matrix.png differ diff --git a/code/machine_learning_models/resulting_figures/Random Forest - Data Correlations.png b/code/machine_learning_models/resulting_figures/Random Forest - Data Correlations.png new file mode 100644 index 0000000000000000000000000000000000000000..c029361b010c1403c01b3cac34965bb095651607 Binary files /dev/null and b/code/machine_learning_models/resulting_figures/Random Forest - Data Correlations.png differ diff --git a/code/machine_learning_models/resulting_figures/Random Forest - Feature Importance.png b/code/machine_learning_models/resulting_figures/Random Forest - Feature Importance.png new file mode 100644 index 0000000000000000000000000000000000000000..fc18888dfb789c50062a043200f4782f58c71862 Binary files /dev/null and b/code/machine_learning_models/resulting_figures/Random Forest - Feature Importance.png differ diff --git a/code/machine_learning_models/utilities.py b/code/machine_learning_models/utilities.py index b32dec3d3210c280fc7e43998001b1643d4aecc4..a4e6eacc06eeca8011d61f18b5840069ff163dfc 100644 --- a/code/machine_learning_models/utilities.py +++ b/code/machine_learning_models/utilities.py @@ -44,7 +44,7 @@ def heat_map(df, model_name=None): plt.figure(figsize=(15, 12)) sns.heatmap(corr) if model_name: - save_plot(model_name + " - Heat Map") + save_plot(model_name + " - Data Correlations") if show_plots: plt.show() @@ -102,7 +102,7 @@ def plot_features(features, info_text: str = None, model_name=None): y_max = plt.ylim()[0] 
plt.text(x_max, y_max, info_text, verticalalignment='bottom', horizontalalignment='right') if model_name: - save_plot(model_name + " - Features") + save_plot(model_name + " - Feature Importance") if show_plots: plt.show()