From 2dbcd144b46cc110126702e828e7bb44e07f8584 Mon Sep 17 00:00:00 2001 From: Daniel Yang <t5wol3yv@duck.com> Date: Mon, 3 Mar 2025 13:11:48 +0100 Subject: [PATCH] removed pipeline (to keep it uniform), now using kddtrain+ and kddtest+ --- code/machine_learning_models/knn.py | 38 ++++++++++------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/code/machine_learning_models/knn.py b/code/machine_learning_models/knn.py index e79b523..88638b2 100644 --- a/code/machine_learning_models/knn.py +++ b/code/machine_learning_models/knn.py @@ -5,8 +5,6 @@ import seaborn as sns from sklearn.metrics import classification_report, confusion_matrix from sklearn.neighbors import KNeighborsClassifier from sklearn.preprocessing import StandardScaler, LabelEncoder -from sklearn.pipeline import Pipeline -from sklearn.model_selection import train_test_split from utilities import ordinal_encode, normalize, plot_confusion_matrix, plot_counts, import_data, plot_roc_curve @@ -14,6 +12,7 @@ warnings.filterwarnings("ignore") # Constants y_data = 'class' +y_values = ['normal', 'anomaly'] df_train, df_test, model_name = import_data( train_file_path="nsl-kdd-dataset/" + "KDDTrain+.arff", test_file_path="nsl-kdd-dataset/" + "KDDTest+.arff", @@ -22,13 +21,9 @@ df_train, df_test, model_name = import_data( sc = StandardScaler() enc = LabelEncoder() -is_threat = df_train[y_data].unique() -if len(is_threat) != 2: - raise ValueError("Target must be a binary decision.") - # Normalize data -ordinal_encode(df=df_train, categories=is_threat, target=y_data) -ordinal_encode(df=df_test, categories=is_threat, target=y_data) +ordinal_encode(df=df_train, categories=y_values, target=y_data) +ordinal_encode(df=df_test, categories=y_values, target=y_data) normalize(df_train, df_test, y_data, sc, enc) @@ -37,33 +32,26 @@ sns.countplot(x=y_data, data=df_train) plot_counts(model_name=model_name) # Separate X and y -X = df_train.select_dtypes(include=[np.number]).drop(columns=[y_data]) -y = df_train[y_data] - -# Split data into training and testing sets -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - -# Create a pipeline for scaling and KNN -pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('knn', KNeighborsClassifier()) -]) +X_train = df_train.select_dtypes(include=[np.number]).drop(columns = [y_data]) +X_test = df_test.select_dtypes(include=[np.number]).drop(columns = [y_data]) +y_train = df_train[[y_data]] +y_test = df_test[[y_data]] # Training model -pipeline.fit(X_train, y_train) -y_prediction = pipeline.predict(X_test) +model = KNeighborsClassifier() +model.fit(X_train, y_train) +y_prediction = model.predict(X_test) print("Classification report: \n", classification_report(y_test, y_prediction)) plot_confusion_matrix(confusion_matrix=confusion_matrix(y_test, y_prediction), - accuracy=pipeline.score(X_test, y_test), + accuracy=model.score(X_test, y_test), model_name=model_name) # Calculate prediction probabilities for ROC curve -y_score = pipeline.predict_proba(X_test)[:, 1] - +y_score = model.predict_proba(X_test)[:, 1] plot_roc_curve(y_test, y_score, model_name=model_name) def predict(prediction_input): if len(prediction_input) == 0: return input_data = pd.DataFrame(prediction_input, columns=X_train.columns) - return pipeline.predict(input_data) \ No newline at end of file + return model.predict(input_data) \ No newline at end of file -- GitLab