diff --git a/code/machine_learning_models/decision_tree.py b/code/machine_learning_models/decision_tree.py
index 32ff0fbb30f04ebc3c60fb506e7adaccfa570de5..2d22b7679046cbbf10ff9b7a179fe4d61c8c1887 100644
--- a/code/machine_learning_models/decision_tree.py
+++ b/code/machine_learning_models/decision_tree.py
@@ -3,66 +3,45 @@ import pandas as pd
 import seaborn as sns
 import warnings
-from scipy.io import arff
 from sklearn.metrics import classification_report, confusion_matrix
-from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler, LabelEncoder
 from sklearn.tree import DecisionTreeClassifier
 from utilities import plot_counts
-from utilities import plot_features, ordinal_encode, normalize, plot_confusion_matrix
+from utilities import plot_features, ordinal_encode, normalize, plot_confusion_matrix, print_high_confidence_samples, import_data
 
 warnings.filterwarnings("ignore")
 
 # Constants
 y_data = 'class'
-read_csv = False
-csv_path = "nsl-kdd-dataset/" + "fruits_dataset.csv"
-arff_path = "nsl-kdd-dataset/" + "KDDTest+.arff"
+df_train, df_test, model_name = import_data(
+    train_file_path = "nsl-kdd-dataset/" + "KDDTrain+.arff",
+    test_file_path = "nsl-kdd-dataset/" + "KDDTest+.arff",
+    model_name = "Decision Tree")
 sc = StandardScaler()
 enc = LabelEncoder()
-model_name = "Decision Tree"
 
-if read_csv:
-    # Getting data frame (CSV)
-    df = pd.read_csv(csv_path)
-else:
-    # Getting data frame (ARFF)
-    data, meta = arff.loadarff(arff_path)
-    df = pd.DataFrame(data)
-    df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
-
-X = df.select_dtypes(include=[np.number])
-y = df[y_data]
-
-is_threat = df[y_data].unique()
+is_threat = df_train[y_data].unique()
 if len(is_threat) != 2:
     raise Exception("Target must be a binary decision.")
 
-ordinal_encode(df= df, categories = is_threat, target = y_data)
-normalize(df, y_data, sc, enc)
-
-sns.countplot(x = y_data, data = df)
-plot_counts(model_name=model_name)
+# Normalize data
+ordinal_encode(df = df_train, categories = is_threat, target = y_data)
+ordinal_encode(df = df_test, categories = is_threat, target = y_data)
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
+normalize(df_train, df_test, y_data, sc, enc)
 
-def train(dct_model):
-    dct_model.fit(X_train, y_train)
-    y_prediction = dct_model.predict(X_test)
-    print("Classification report: \n", classification_report(y_test, y_prediction))
-    plot_confusion_matrix(confusion_matrix = confusion_matrix(y_test, y_prediction),
-                          accuracy = dct_model.score(X_test, y_test),
-                          model_name=model_name)
+# Plot absolute quantities of class 0 and class 1
+sns.countplot(x = y_data, data = df_train)
+plot_counts(model_name = model_name)
 
-dtc = DecisionTreeClassifier()
-train(dtc)
-features = pd.DataFrame(dtc.feature_importances_,
-                        index= X.columns,
-                        columns=['Importance']).sort_values(by='Importance', ascending=False)
-plot_features(features, model_name=model_name)
+# Separate X and y
+X_train = df_train.select_dtypes(include=[np.number]).drop(columns = [y_data])
+X_test = df_test.select_dtypes(include=[np.number]).drop(columns = [y_data])
+y_train = df_train[[y_data]]
+y_test = df_test[[y_data]]
 
 # Different parameters
 # Criterion modifiers ('criterion'):
@@ -89,17 +68,26 @@ plot_features(features, model_name=model_name)
 # What is overfitting?
 # Machine learning model "memorizes" the training data, rather than understanding the underlying pattern
 
-# Different parameters
-dtc2 = DecisionTreeClassifier(criterion ='entropy', ccp_alpha = 0.04)
-train(dtc2)
-features = pd.DataFrame(dtc2.feature_importances_,
-                        index= X.columns,
+# Training model
+dtc = DecisionTreeClassifier()
+dtc.fit(X_train, y_train)
+y_prediction = dtc.predict(X_test)
+print("Classification report: \n", classification_report(y_test, y_prediction))
+plot_confusion_matrix(confusion_matrix = confusion_matrix(y_test, y_prediction),
+                      accuracy = dtc.score(X_test, y_test),
+                      model_name=model_name)
+
+# Determine feature importance
+features = pd.DataFrame(dtc.feature_importances_,
+                        index= X_train.columns,
                         columns=['Importance']).sort_values(by='Importance', ascending=False)
-plot_features(features, model_name=model_name + "2")
+plot_features(features, model_name = model_name)
 
 def predict(prediction_input):
     if len(prediction_input) == 0:
         return
-    input_data = pd.DataFrame(prediction_input, columns = X.columns)
+    input_data = pd.DataFrame(prediction_input, columns = X_train.columns)
     return dtc.predict(input_data)
+
+print_high_confidence_samples(model = dtc, x = X_train)
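Note: the comment block above explains overfitting, and the removed dtc2 experiment countered it by pairing criterion ='entropy' with cost-complexity pruning (ccp_alpha = 0.04). A minimal sketch of that effect on synthetic stand-in data (not the NSL-KDD set loaded above): the unpruned tree nearly memorizes its training split, while the pruned tree trades training accuracy for generalization.

# Illustrative sketch, synthetic stand-in data only.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=2000, n_features=20, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# Unpruned: grows until leaves are pure, so it tends to memorize the training split.
unpruned = DecisionTreeClassifier(random_state=42).fit(X_tr, y_tr)
# Pruned: ccp_alpha > 0 removes branches whose complexity outweighs their gain.
pruned = DecisionTreeClassifier(criterion='entropy', ccp_alpha=0.04, random_state=42).fit(X_tr, y_tr)

print("unpruned train/test:", unpruned.score(X_tr, y_tr), unpruned.score(X_te, y_te))
print("pruned   train/test:", pruned.score(X_tr, y_tr), pruned.score(X_te, y_te))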
diff --git a/code/machine_learning_models/knn.py b/code/machine_learning_models/knn.py
index 7ce8704b6ad7ff748f34b7ba07b9fd7fa4bbc810..518e5162e4a1e6f93eaaac370bbad489d958b9a0 100644
--- a/code/machine_learning_models/knn.py
+++ b/code/machine_learning_models/knn.py
@@ -3,62 +3,54 @@ import warnings
 import numpy as np
 import pandas as pd
 import seaborn as sns
-from scipy.io import arff
 from sklearn.metrics import classification_report, confusion_matrix
-from sklearn.model_selection import train_test_split
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.preprocessing import StandardScaler, LabelEncoder
-from utilities import ordinal_encode, normalize, plot_confusion_matrix, plot_counts
+from utilities import ordinal_encode, normalize, plot_confusion_matrix, plot_counts, import_data
 
 warnings.filterwarnings("ignore")
 
 # Constants
 y_data = 'class'
-read_csv = False
-csv_path = "nsl-kdd-dataset/" + "fruits_dataset.csv"
-arff_path = "nsl-kdd-dataset/" + "KDDTrain+.arff"
+df_train, df_test, model_name = import_data(
+    train_file_path = "nsl-kdd-dataset/" + "KDDTrain+.arff",
+    test_file_path = "nsl-kdd-dataset/" + "KDDTest+.arff",
+    model_name = "KNN")
 sc = StandardScaler()
 enc = LabelEncoder()
-model_name = "KNN"
 
-if read_csv:
-    # Getting data frame (CSV)
-    df = pd.read_csv(csv_path)
-else:
-    # Getting data frame (ARFF)
-    data, meta = arff.loadarff(arff_path)
-    df = pd.DataFrame(data)
-    df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
-
-X = df.select_dtypes(include=[np.number])
-y = df[y_data]
-
-is_threat = df[y_data].unique()
+is_threat = df_train[y_data].unique()
 if len(is_threat) != 2:
     raise Exception("Target must be a binary decision.")
 
-ordinal_encode(df= df, categories = is_threat, target = y_data)
-normalize(df, y_data, sc, enc)
+# Normalize data
+ordinal_encode(df = df_train, categories = is_threat, target = y_data)
+ordinal_encode(df = df_test, categories = is_threat, target = y_data)
 
-sns.countplot(x = y_data, data = df)
-plot_counts(model_name=model_name)
+normalize(df_train, df_test, y_data, sc, enc)
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
+# Plot absolute quantities of class 0 and class 1
+sns.countplot(x = y_data, data = df_train)
+plot_counts(model_name = model_name)
 
-def train(knn_model):
-    knn_model.fit(X_train, y_train)
-    y_prediction = knn_model.predict(X_test)
-    print("Classification report: \n", classification_report(y_test, y_prediction))
-    plot_confusion_matrix(confusion_matrix = confusion_matrix(y_test, y_prediction),
-                          accuracy = knn_model.score(X_test, y_test),
-                          model_name=model_name)
+# Separate X and y
+X_train = df_train.select_dtypes(include=[np.number]).drop(columns = [y_data])
+X_test = df_test.select_dtypes(include=[np.number]).drop(columns = [y_data])
+y_train = df_train[[y_data]]
+y_test = df_test[[y_data]]
 
-knn = KNeighborsClassifier()
-train(knn)
+# Training model
+knn_model = KNeighborsClassifier()
+knn_model.fit(X_train, y_train)
+y_prediction = knn_model.predict(X_test)
+print("Classification report: \n", classification_report(y_test, y_prediction))
+plot_confusion_matrix(confusion_matrix = confusion_matrix(y_test, y_prediction),
+                      accuracy = knn_model.score(X_test, y_test),
+                      model_name=model_name)
 
 def predict(prediction_input):
     if len(prediction_input) == 0:
         return
-    input_data = pd.DataFrame(prediction_input, columns = X.columns)
-    return knn.predict(input_data)
+    input_data = pd.DataFrame(prediction_input, columns = X_train.columns)
+    return knn_model.predict(input_data)
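Note: the patch keeps KNeighborsClassifier at its default n_neighbors=5. Since KNN has no training phase to regularize, k is its main dial: a small k overfits noise, a large k blurs class boundaries. A hedged sketch on synthetic stand-in data (not this pipeline) of choosing k by cross-validation:

# Illustrative sketch, synthetic stand-in data only.
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=1000, n_features=10, random_state=42)

# Mean 5-fold accuracy for a few candidate neighborhood sizes.
scores = {k: cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5).mean()
          for k in (1, 3, 5, 7, 11, 21)}
best_k = max(scores, key=scores.get)
print(scores, "-> best k:", best_k)

Because KNN is distance-based, the StandardScaler applied inside normalize() matters more here than for the tree models: an unscaled feature with a large range would dominate the distance metric.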
diff --git a/code/machine_learning_models/logistic_regression.py b/code/machine_learning_models/logistic_regression.py
index ae4cad2f9ca69952113cb5c57094d03c76723670..152d74050ba2c3033b40fccda10c4fca7738e0b8 100644
--- a/code/machine_learning_models/logistic_regression.py
+++ b/code/machine_learning_models/logistic_regression.py
@@ -1,66 +1,55 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
-from scipy.io import arff
 import warnings
 from sklearn.preprocessing import StandardScaler, LabelEncoder
-from utilities import ordinal_encode, heat_map, plot_features, plot_confusion_matrix, normalize, print_high_confidence_samples, plot_counts
+from utilities import ordinal_encode, heat_map, plot_features, plot_confusion_matrix, normalize, \
+    print_high_confidence_samples, plot_counts, import_data
 
 warnings.filterwarnings("ignore")
-from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import classification_report, confusion_matrix
 
 # Constants
-X_data = 'count'
 y_data = 'class'
-read_csv = False
-csv_path = "nsl-kdd-dataset/" + "fruits_dataset.csv"
-arff_path = "nsl-kdd-dataset/" + "KDDTrain+.arff"
+df_train, df_test, model_name = import_data(
+    train_file_path = "nsl-kdd-dataset/" + "KDDTrain+.arff",
+    test_file_path = "nsl-kdd-dataset/" + "KDDTest+.arff",
+    model_name = "Logistic Regression")
 sc = StandardScaler()
 enc = LabelEncoder()
-model_name = "Logistic Regression"
-
-if read_csv:
-    # Getting data frame (CSV)
-    df = pd.read_csv(csv_path)
-else:
-    # Getting data frame (ARFF)
-    data, meta = arff.loadarff(arff_path)
-    df = pd.DataFrame(data)
-    df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
 
 # Data inspection and pre-processing
-print("Before: \n", df)
-is_threat = df[y_data].unique()
+is_threat = df_train[y_data].unique()
 if len(is_threat) != 2:
     raise Exception("Logistic Regression only works for binary classification.")
 
-numerical_columns = df.select_dtypes(include=np.number).columns
-label_columns = df.select_dtypes(include=object, exclude=np.number).columns
+numerical_columns = df_train.select_dtypes(include = np.number).columns
+label_columns = df_train.select_dtypes(include=object, exclude = np.number).columns
 
 # Normalize data
-ordinal_encode(df= df, categories = is_threat, target = y_data)
-normalize(df, y_data, sc, enc)
-print("After: \n", df)
+ordinal_encode(df = df_train, categories = is_threat, target = y_data)
+ordinal_encode(df = df_test, categories = is_threat, target = y_data)
+
+normalize(df_train, df_test, y_data, sc, enc)
 
 # Correlation
-heat_map(df, model_name=model_name)
+heat_map(df_train, model_name = model_name)
 
 # Count plot
-sns.countplot(x = y_data, data = df)
+sns.countplot(x = y_data, data = df_train)
 plot_counts(model_name=model_name)
 
-# Data preparation
-X = df.select_dtypes(include = np.number).drop(columns = y_data) # X must be independent
-y = df[[y_data]] # y must ideally be dependent on X, and in case of logistic regression, must be a binary decision
-
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
+# Separate X and y
+X_train = df_train.select_dtypes(include=[np.number]).drop(columns = [y_data])
+X_test = df_test.select_dtypes(include=[np.number]).drop(columns = [y_data])
+y_train = df_train[[y_data]]
+y_test = df_test[[y_data]]
 
 # 'penalty'
 # Helps preventing overfitting.
@@ -96,19 +85,23 @@ y_prediction = model.predict(X_test)
 plot_confusion_matrix(confusion_matrix = confusion_matrix(y_test, y_prediction),
                       accuracy = model.score(X_test, y_test),
                       model_name=model_name)
-print(classification_report(y_test, y_prediction))
+print("Classification report: \n", classification_report(y_test, y_prediction))
+
+# Display feature importance
+features = pd.DataFrame(model.coef_[0], index= X_train.columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
+plot_features(features = features,
+              info_text= "Positive coefficient = More probable to land in class 1 \n"
+                         "Negative coefficient = Less probable to land in class 1",
+              model_name=model_name)
+
+# Display samples for which the model is 90% confident
+print_high_confidence_samples(model, X_train)
 
 def predict(prediction_input):
     if len(prediction_input) == 0:
         return
-    input_df = pd.DataFrame(prediction_input, columns = X.columns)
+    input_df = pd.DataFrame(prediction_input, columns = X_train.columns)
     input_df[numerical_columns] = sc.transform(input_df[numerical_columns])
     return ["anomaly" if x == 1 else "normal" for x in model.predict(input_df)]
 
-print_high_confidence_samples(model, X)
-features = pd.DataFrame(model.coef_[0], index= X.columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
-
-plot_features(features, "Positive coefficient = More probable to land in class 1\n"
-              "Negative coefficient = Less probable to land in class 1",
-              model_name=model_name)
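Note: the info_text above states the sign convention — a positive coefficient makes class 1 more probable, a negative one less. In logistic regression this has a precise reading: each coefficient is a log-odds contribution, so exp(coef) is the factor by which the odds of class 1 change per one-unit increase in that feature (after StandardScaler, one unit is one standard deviation). A small sketch on synthetic stand-in data:

# Illustrative sketch, synthetic stand-in data only.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=1000, n_features=5, random_state=42)
clf = LogisticRegression().fit(X, y)

# odds_ratio > 1 pushes toward class 1, odds_ratio < 1 pushes away from it.
coefs = pd.DataFrame({'coef': clf.coef_[0], 'odds_ratio': np.exp(clf.coef_[0])},
                     index=[f"f{i}" for i in range(X.shape[1])])
print(coefs.sort_values(by='coef', ascending=False))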
"nsl-kdd-dataset/" + "KDDTrain+.arff", + test_file_path = "nsl-kdd-dataset/" + "KDDTest+.arff", + model_name = "Random Forest") sc = StandardScaler() enc = LabelEncoder() -model_name = "Random Forest" -data, meta = arff.loadarff(arff_path) -df = pd.DataFrame(data) -df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x) +print("Before Processing: \n", df_train) -print("Before Processing: \n", df) +is_threat = df_train[y_data].unique() +numerical_columns = df_train.select_dtypes(include=np.number).columns +label_columns = df_train.select_dtypes(include=object, exclude=np.number).columns -is_threat = df[y_data].unique() -numerical_columns = df.select_dtypes(include=np.number).columns -label_columns = df.select_dtypes(include=object, exclude=np.number).columns +# Normalize data +ordinal_encode(df = df_train, categories = is_threat, target = y_data) +ordinal_encode(df = df_test, categories = is_threat, target = y_data) -# Normalize and encode data -ordinal_encode(df=df, categories=is_threat, target=y_data) -normalize(df, y_data, sc, enc) -print("After Processing: \n", df) +normalize(df_train, df_test, y_data, sc, enc) # Correlation -heat_map(df, model_name=model_name) +heat_map(df_train, model_name = model_name) -# Data Preparation -X = df.select_dtypes(include = np.number).drop(columns = y_data) # X must be independent -y = df[[y_data]] # y must ideally be dependent on X - -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) +# Separate X and y +X_train = df_train.select_dtypes(include=[np.number]).drop(columns = [y_data]) +X_test = df_test.select_dtypes(include=[np.number]).drop(columns = [y_data]) +y_train = df_train[[y_data]] +y_test = df_test[[y_data]] # Train Random Forest Model model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42) @@ -57,20 +55,14 @@ plot_confusion_matrix(confusion_matrix=confusion_matrix(y_test, y_prediction), accuracy=model.score(X_test, y_test), model_name=model_name) -print(classification_report(y_test, y_prediction)) +print("Classification Report: \n", classification_report(y_test, y_prediction)) -# Prediction function -def predict(prediction_input): - if len(prediction_input) == 0: - return - input_df = pd.DataFrame(prediction_input, columns=X.columns) - input_df[numerical_columns] = sc.transform(input_df[numerical_columns]) - return ["anomaly" if x == 1 else "normal" for x in model.predict(input_df)] -print_high_confidence_samples(model, X) +# Get high confidence samples for which the model is 90% confident +print_high_confidence_samples(model, X_train) # Feature Importance Plot -features = pd.DataFrame(model.feature_importances_, index=X.columns, columns=['Importance']).sort_values(by='Importance', ascending=False) +features = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=['Importance']).sort_values(by='Importance', ascending=False) plot_features(features, "Higher importance = More impact on classification", model_name=model_name) # Precision-Recall Curve @@ -81,5 +73,14 @@ plot_precision_recall_curve(precision, recall, model_name) # Learning Curve print("Calculating Learning Curve") -train_sizes, train_scores, test_scores = learning_curve(model, X, y.values.ravel(), cv=5, scoring="accuracy") +train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train.values.ravel(), cv=5, scoring="accuracy") plot_learning_curve(train_sizes, train_scores, test_scores, model_name) + +# Prediction function +def predict(prediction_input): + if 
diff --git a/code/machine_learning_models/utilities.py b/code/machine_learning_models/utilities.py
index 5e54172c0eea104fe2d4ddc88e1ab524232e9e02..502f3b719e9f1e2061b27c01a2a0ca109e2774f6 100644
--- a/code/machine_learning_models/utilities.py
+++ b/code/machine_learning_models/utilities.py
@@ -4,6 +4,8 @@ import numpy as np
 import pandas as pd
 import seaborn as sns
 from matplotlib import pyplot as plt
+from scipy.io import arff
+from sklearn.base import BaseEstimator
 from sklearn.preprocessing import OrdinalEncoder
 
 show_plots = False
@@ -106,14 +108,18 @@
     if show_plots:
         plt.show()
 
-def normalize(df, exclude, numerical_scaler, label_scaler):
+def normalize(df_train, df_test, exclude, numerical_scaler, label_scaler):
-    scale_targets = df.select_dtypes(include=np.number).drop(columns=exclude).columns
-    df[scale_targets] = numerical_scaler.fit_transform(df[scale_targets])
+    scale_targets = df_train.select_dtypes(include=np.number).drop(columns=exclude).columns
+    df_train[scale_targets] = numerical_scaler.fit_transform(df_train[scale_targets])
+    df_test[scale_targets] = numerical_scaler.transform(df_test[scale_targets])
 
-    labels = df.select_dtypes(include=object, exclude=np.number).columns
+    labels = df_train.select_dtypes(include=object, exclude=np.number).columns
     for label in labels:
-        df[label] = label_scaler.fit_transform(df[label])
+        df_train[label] = label_scaler.fit_transform(df_train[label])
+        df_test[label] = label_scaler.transform(df_test[label])
+
+
 def plot_confusion_matrix(confusion_matrix: List[List[int]], accuracy: float, model_name=None) -> None:
     if len(confusion_matrix) != 2 or any(len(row) != 2 for row in confusion_matrix):
@@ -186,3 +192,15 @@ def plot_learning_curve(train_sizes, train_scores, test_scores, model_name=None)
 
 def save_plot(name):
     plt.savefig("resulting_figures/" + name, dpi=300, bbox_inches='tight')
+
+def import_data(train_file_path: str, test_file_path: str, model_name: str):
+    data, meta = arff.loadarff(train_file_path)
+    df_train = pd.DataFrame(data)
+    df_train = df_train.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
+
+    # Importing test data set
+    data, meta = arff.loadarff(test_file_path)
+    df_test = pd.DataFrame(data)
+    df_test = df_test.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
+
+    return df_train, df_test, model_name
\ No newline at end of file
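Note: the reworked normalize is the heart of this change set — the scaler and encoder are fit on df_train only and merely applied to df_test, so test-set statistics no longer leak into preprocessing. A hypothetical usage sketch mirroring the callers above ("Example" is a placeholder model name); one caveat the current code does not guard against is that LabelEncoder.transform raises on categories that appear only in the test set:

# Illustrative usage of the new utilities, following the call pattern of the model scripts.
from sklearn.preprocessing import StandardScaler, LabelEncoder
from utilities import import_data, ordinal_encode, normalize

df_train, df_test, name = import_data(
    train_file_path="nsl-kdd-dataset/" + "KDDTrain+.arff",
    test_file_path="nsl-kdd-dataset/" + "KDDTest+.arff",
    model_name="Example")  # placeholder name, not one of the project's models

# As in the callers, the target is ordinal-encoded first so that normalize()
# can locate it among the numeric columns and exclude it from scaling.
labels = df_train['class'].unique()
ordinal_encode(df=df_train, categories=labels, target='class')
ordinal_encode(df=df_test, categories=labels, target='class')

# Fit on df_train, transform df_test: no test-set leakage.
normalize(df_train, df_test, 'class', StandardScaler(), LabelEncoder())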