From 2dbcd144b46cc110126702e828e7bb44e07f8584 Mon Sep 17 00:00:00 2001
From: Daniel Yang <t5wol3yv@duck.com>
Date: Mon, 3 Mar 2025 13:11:48 +0100
Subject: [PATCH] Remove pipeline (to keep it uniform); now using KDDTrain+
 and KDDTest+

---
 code/machine_learning_models/knn.py | 38 ++++++++++-------------------
 1 file changed, 13 insertions(+), 25 deletions(-)

diff --git a/code/machine_learning_models/knn.py b/code/machine_learning_models/knn.py
index e79b523..88638b2 100644
--- a/code/machine_learning_models/knn.py
+++ b/code/machine_learning_models/knn.py
@@ -5,8 +5,6 @@ import seaborn as sns
 from sklearn.metrics import classification_report, confusion_matrix
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.preprocessing import StandardScaler, LabelEncoder
-from sklearn.pipeline import Pipeline
-from sklearn.model_selection import train_test_split
 
 from utilities import ordinal_encode, normalize, plot_confusion_matrix, plot_counts, import_data, plot_roc_curve
 
@@ -14,6 +12,7 @@ warnings.filterwarnings("ignore")
 
 # Constants
 y_data = 'class'
+y_values = ['normal', 'anomaly']
 df_train, df_test, model_name = import_data(
     train_file_path="nsl-kdd-dataset/" + "KDDTrain+.arff",
     test_file_path="nsl-kdd-dataset/" + "KDDTest+.arff",
@@ -22,13 +21,9 @@ df_train, df_test, model_name = import_data(
 sc = StandardScaler()
 enc = LabelEncoder()
 
-is_threat = df_train[y_data].unique()
-if len(is_threat) != 2:
-    raise ValueError("Target must be a binary decision.")
-
 # Normalize data
-ordinal_encode(df=df_train, categories=is_threat, target=y_data)
-ordinal_encode(df=df_test, categories=is_threat, target=y_data)
+ordinal_encode(df=df_train, categories=y_values, target=y_data)
+ordinal_encode(df=df_test, categories=y_values, target=y_data)
 
 normalize(df_train, df_test, y_data, sc, enc)
 
@@ -37,33 +32,26 @@ sns.countplot(x=y_data, data=df_train)
 plot_counts(model_name=model_name)
 
 # Separate X and y
-X = df_train.select_dtypes(include=[np.number]).drop(columns=[y_data])
-y = df_train[y_data]
-
-# Split data into training and testing sets
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-
-# Create a pipeline for scaling and KNN
-pipeline = Pipeline([
-    ('scaler', StandardScaler()),
-    ('knn', KNeighborsClassifier())
-])
+X_train = df_train.select_dtypes(include=[np.number]).drop(columns=[y_data])
+X_test = df_test.select_dtypes(include=[np.number]).drop(columns=[y_data])
+y_train = df_train[y_data]  # 1-D Series: scikit-learn expects a 1-D target, not a one-column DataFrame
+y_test = df_test[y_data]  # same 1-D shape, so the metrics and ROC helper get the type they received before
 
 # Training model
-pipeline.fit(X_train, y_train)
-y_prediction = pipeline.predict(X_test)
+model = KNeighborsClassifier()
+model.fit(X_train, y_train)
+y_prediction = model.predict(X_test)
 print("Classification report: \n", classification_report(y_test, y_prediction))
 plot_confusion_matrix(confusion_matrix=confusion_matrix(y_test, y_prediction),
-                      accuracy=pipeline.score(X_test, y_test),
+                      accuracy=model.score(X_test, y_test),
                       model_name=model_name)
 
 # Calculate prediction probabilities for ROC curve
-y_score = pipeline.predict_proba(X_test)[:, 1]
-
+y_score = model.predict_proba(X_test)[:, 1]
 plot_roc_curve(y_test, y_score, model_name=model_name)
 
 def predict(prediction_input):
     if len(prediction_input) == 0:
         return
     input_data = pd.DataFrame(prediction_input, columns=X_train.columns)
-    return pipeline.predict(input_data)
\ No newline at end of file
+    return model.predict(input_data)
\ No newline at end of file
-- 
GitLab