Skip to content
Snippets Groups Projects
Commit 11caccf8 authored by VladLub's avatar VladLub
Browse files

simple push so I can continue on the PC

parent 384d401a
No related branches found
No related tags found
No related merge requests found
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import sys
import os
from sklearn.metrics import classification_report, confusion_matrix

# Make the parent directory importable so `utilities` resolves regardless of
# the current working directory.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
import utilities as util

warnings.filterwarnings("ignore")

# Constants
y_data = 'class'                    # name of the target column
y_columns = ['normal', 'anomaly']   # ordinal category order for the target

# Resolve dataset paths relative to this file so the script runs from any CWD.
train_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'nsl-kdd-dataset', 'KDDTrain+.arff'))
test_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'nsl-kdd-dataset', 'KDDTest+.arff'))

df_train, df_test, model_name = util.import_data(
    train_file_path=train_file_path,
    test_file_path=test_file_path,
    model_name="Decision Tree"
)

sc = StandardScaler()
enc = LabelEncoder()

# Encode the target labels and normalize features on both splits in place.
util.ordinal_encode(df=df_train, categories=y_columns, target=y_data)
util.ordinal_encode(df=df_test, categories=y_columns, target=y_data)
util.normalize(df_train, df_test, y_data, sc, enc)

# Plot absolute quantities of class 0 and class 1
sns.countplot(x=y_data, data=df_train)
util.plot_counts(model_name=model_name)

# Separate X and y (numeric feature columns only; target column excluded).
X_train = df_train.select_dtypes(include=[np.number]).drop(columns=[y_data])
X_test = df_test.select_dtypes(include=[np.number]).drop(columns=[y_data])
y_train = df_train[[y_data]]
y_test = df_test[[y_data]]
dtc = DecisionTreeClassifier()

# Fit on the training split and evaluate on the held-out test split.
dtc.fit(X_train, y_train)
y_prediction = dtc.predict(X_test)
print("Classification report: \n", classification_report(y_test, y_prediction))
util.plot_confusion_matrix(confusion_matrix=confusion_matrix(y_test, y_prediction),
                           accuracy=dtc.score(X_test, y_test),
                           model_name=model_name)

# Determine feature importance
features = pd.DataFrame(dtc.feature_importances_,
                        index=X_train.columns,
                        columns=['Importance']).sort_values(by='Importance', ascending=False)
util.plot_features(features, model_name=model_name)
def predict(prediction_input):
    """Predict class labels for raw feature rows using the trained tree.

    prediction_input: iterable of rows whose values line up with
    X_train's columns. An empty input returns None (no prediction).
    """
    if len(prediction_input) == 0:
        return
    # Wrap the raw rows in a DataFrame with the training column names so the
    # model sees features in the order it was fitted on.
    input_data = pd.DataFrame(prediction_input, columns=X_train.columns)
    return dtc.predict(input_data)


util.print_high_confidence_samples(model=dtc, x=X_train)
\ No newline at end of file
......@@ -179,15 +179,14 @@ def save_plot(name):
# Data processing
def import_data(train_file_path: str, test_file_path: str, model_name: str):
def import_data(train_file_path, test_file_path, model_name):
data, meta = arff.loadarff(train_file_path)
df_train = pd.DataFrame(data)
df_train = df_train.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
df_train = df_train.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
# Importing test data set
data, meta = arff.loadarff(test_file_path)
df_test = pd.DataFrame(data)
df_test = df_test.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
df_test = df_test.map(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
return df_train, df_test, model_name
......
......@@ -3,6 +3,24 @@ from tkinter import scrolledtext, ttk, Menu
import subprocess
import threading
import os
import sys
# Must run before the `machine_learning_models` imports below: makes that
# package directory importable from this file's location.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'machine_learning_models')))
from machine_learning_models import utilities as util
# Dataset paths resolved relative to this file so the GUI works from any CWD.
train_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'machine_learning_models', 'nsl-kdd-dataset', 'KDDTrain+.arff'))
test_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'machine_learning_models', 'nsl-kdd-dataset', 'KDDTest+.arff'))
# Import data using the correct paths
# NOTE(review): this loads both ARFF datasets at module import time, which
# blocks GUI startup; model_name=None is passed straight through by
# import_data — confirm downstream code tolerates a None model name.
df_train, df_test, model_name = util.import_data(
    train_file_path=train_file_path,
    test_file_path=test_file_path,
    model_name=None
)
# NOTE(review): these imports sit below the data load deliberately or not —
# importing the model modules may itself trigger their training scripts.
from machine_learning_models import decision_tree, random_forest, knn, logistic_regression
from matplotlib import pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment