From 11caccf8ce58291b1124f55ebe1be11495027b66 Mon Sep 17 00:00:00 2001 From: VladLub <vladlubkovskiy1@gmail.com> Date: Sun, 23 Mar 2025 17:12:28 +0100 Subject: [PATCH] simpl epush so i can continue on the pc --- code/machine_learning_models/decision_tree.py | 50 ++++++++++--------- code/machine_learning_models/utilities.py | 7 ++- code/{gui/gui.py => main.py} | 18 +++++++ 3 files changed, 48 insertions(+), 27 deletions(-) rename code/{gui/gui.py => main.py} (92%) diff --git a/code/machine_learning_models/decision_tree.py b/code/machine_learning_models/decision_tree.py index 8b45961..6d83d6e 100644 --- a/code/machine_learning_models/decision_tree.py +++ b/code/machine_learning_models/decision_tree.py @@ -2,41 +2,45 @@ import numpy as np import pandas as pd import seaborn as sns import warnings +import sys +import os -from sklearn.metrics import classification_report, confusion_matrix +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from sklearn.metrics import classification_report, confusion_matrix from sklearn.preprocessing import StandardScaler, LabelEncoder from sklearn.tree import DecisionTreeClassifier - -from utilities import plot_counts -from utilities import plot_features, ordinal_encode, normalize, plot_confusion_matrix, print_high_confidence_samples, import_data +import utilities as util warnings.filterwarnings("ignore") - # Constants y_data = 'class' y_columns = ['normal', 'anomaly'] -df_train, df_test, model_name = import_data( - train_file_path = "nsl-kdd-dataset/" + "KDDTrain+.arff", - test_file_path = "nsl-kdd-dataset/" + "KDDTest+.arff", - model_name = "Decision Tree") +train_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'nsl-kdd-dataset', 'KDDTrain+.arff')) +test_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'nsl-kdd-dataset', 'KDDTest+.arff')) + +df_train, df_test, model_name = util.import_data( + train_file_path=train_file_path, + test_file_path=test_file_path, + model_name="Decision Tree" +) sc = StandardScaler() enc = LabelEncoder() # Normalize data -ordinal_encode(df = df_train, categories = y_columns, target = y_data) -ordinal_encode(df = df_test, categories = y_columns, target = y_data) +util.ordinal_encode(df=df_train, categories=y_columns, target=y_data) +util.ordinal_encode(df=df_test, categories=y_columns, target=y_data) -normalize(df_train, df_test, y_data, sc, enc) +util.normalize(df_train, df_test, y_data, sc, enc) # Plot absolute quantities of class 0 and class 1 -sns.countplot(x = y_data, data = df_train) -plot_counts(model_name = model_name) +sns.countplot(x=y_data, data=df_train) +util.plot_counts(model_name=model_name) # Separate X and y -X_train = df_train.select_dtypes(include=[np.number]).drop(columns = [y_data]) -X_test = df_test.select_dtypes(include=[np.number]).drop(columns = [y_data]) +X_train = df_train.select_dtypes(include=[np.number]).drop(columns=[y_data]) +X_test = df_test.select_dtypes(include=[np.number]).drop(columns=[y_data]) y_train = df_train[[y_data]] y_test = df_test[[y_data]] @@ -71,20 +75,20 @@ dtc = DecisionTreeClassifier() dtc.fit(X_train, y_train) y_prediction = dtc.predict(X_test) print("Classification report: \n", classification_report(y_test, y_prediction)) -plot_confusion_matrix(confusion_matrix = confusion_matrix(y_test, y_prediction), - accuracy = dtc.score(X_test, y_test), - model_name=model_name) +util.plot_confusion_matrix(confusion_matrix=confusion_matrix(y_test, y_prediction), + accuracy=dtc.score(X_test, y_test), + model_name=model_name) # Determine feature importance features = pd.DataFrame(dtc.feature_importances_, - index= X_train.columns, + index=X_train.columns, columns=['Importance']).sort_values(by='Importance', ascending=False) -plot_features(features, model_name = model_name) +util.plot_features(features, model_name=model_name) def predict(prediction_input): if len(prediction_input) == 0: return - input_data = pd.DataFrame(prediction_input, columns = X_train.columns) + input_data = pd.DataFrame(prediction_input, columns=X_train.columns) return dtc.predict(input_data) -print_high_confidence_samples(model = dtc, x = X_train) +util.print_high_confidence_samples(model=dtc, x=X_train) \ No newline at end of file diff --git a/code/machine_learning_models/utilities.py b/code/machine_learning_models/utilities.py index cfbca48..8c2f3a6 100644 --- a/code/machine_learning_models/utilities.py +++ b/code/machine_learning_models/utilities.py @@ -179,15 +179,14 @@ def save_plot(name): # Data processing -def import_data(train_file_path: str, test_file_path: str, model_name: str): +def import_data(train_file_path, test_file_path, model_name): data, meta = arff.loadarff(train_file_path) df_train = pd.DataFrame(data) - df_train = df_train.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x) + df_train = df_train.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x) - # Importing test data set data, meta = arff.loadarff(test_file_path) df_test = pd.DataFrame(data) - df_test = df_test.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x) + df_test = df_test.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x) return df_train, df_test, model_name diff --git a/code/gui/gui.py b/code/main.py similarity index 92% rename from code/gui/gui.py rename to code/main.py index e10add7..525e5e1 100644 --- a/code/gui/gui.py +++ b/code/main.py @@ -3,6 +3,24 @@ from tkinter import scrolledtext, ttk, Menu import subprocess import threading import os +import sys + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'machine_learning_models'))) + +from machine_learning_models import utilities as util + +train_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'machine_learning_models', 'nsl-kdd-dataset', 'KDDTrain+.arff')) +test_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'machine_learning_models', 'nsl-kdd-dataset', 'KDDTest+.arff')) + + +# Import data using the correct paths +df_train, df_test, model_name = util.import_data( + train_file_path=train_file_path, + test_file_path=test_file_path, + model_name=None +) + +from machine_learning_models import decision_tree, random_forest, knn, logistic_regression from matplotlib import pyplot as plt from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg -- GitLab