Skip to content
Snippets Groups Projects
Commit 2dbcd144 authored by Daniel Yang
Browse files

Removed the scikit-learn Pipeline (to keep it uniform); now training on KDDTrain+ and evaluating on KDDTest+

parent 52054a99
No related branches found
No related tags found
No related merge requests found
......@@ -5,8 +5,6 @@ import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from utilities import ordinal_encode, normalize, plot_confusion_matrix, plot_counts, import_data, plot_roc_curve
......@@ -14,6 +12,7 @@ warnings.filterwarnings("ignore")
# Constants
y_data = 'class'
y_values = ['normal', 'anomaly']
df_train, df_test, model_name = import_data(
train_file_path="nsl-kdd-dataset/" + "KDDTrain+.arff",
test_file_path="nsl-kdd-dataset/" + "KDDTest+.arff",
......@@ -22,13 +21,9 @@ df_train, df_test, model_name = import_data(
sc = StandardScaler()
enc = LabelEncoder()
is_threat = df_train[y_data].unique()
if len(is_threat) != 2:
raise ValueError("Target must be a binary decision.")
# Normalize data
ordinal_encode(df=df_train, categories=is_threat, target=y_data)
ordinal_encode(df=df_test, categories=is_threat, target=y_data)
ordinal_encode(df=df_train, categories=y_values, target=y_data)
ordinal_encode(df=df_test, categories=y_values, target=y_data)
normalize(df_train, df_test, y_data, sc, enc)
......@@ -37,33 +32,26 @@ sns.countplot(x=y_data, data=df_train)
plot_counts(model_name=model_name)
# Separate X and y
X = df_train.select_dtypes(include=[np.number]).drop(columns=[y_data])
y = df_train[y_data]
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a pipeline for scaling and KNN
pipeline = Pipeline([
('scaler', StandardScaler()),
('knn', KNeighborsClassifier())
])
X_train = df_train.select_dtypes(include=[np.number]).drop(columns = [y_data])
X_test = df_test.select_dtypes(include=[np.number]).drop(columns = [y_data])
y_train = df_train[[y_data]]
y_test = df_test[[y_data]]
# Training model
pipeline.fit(X_train, y_train)
y_prediction = pipeline.predict(X_test)
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_prediction = model.predict(X_test)
print("Classification report: \n", classification_report(y_test, y_prediction))
plot_confusion_matrix(confusion_matrix=confusion_matrix(y_test, y_prediction),
accuracy=pipeline.score(X_test, y_test),
accuracy=model.score(X_test, y_test),
model_name=model_name)
# Calculate prediction probabilities for ROC curve
y_score = pipeline.predict_proba(X_test)[:, 1]
y_score = model.predict_proba(X_test)[:, 1]
plot_roc_curve(y_test, y_score, model_name=model_name)
def predict(prediction_input):
if len(prediction_input) == 0:
return
input_data = pd.DataFrame(prediction_input, columns=X_train.columns)
return pipeline.predict(input_data)
\ No newline at end of file
return model.predict(input_data)
\ No newline at end of file
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment