Commit f7c64002 authored by Daniel Yang

cleaned up code:

added an import_data method for loading the data sets; models now train on the provided KDDTrain+ split and evaluate on KDDTest+ instead of splitting a single file; a lot of refactoring
parent d03c7b3c
@@ -3,66 +3,45 @@ import pandas as pd
import seaborn as sns
import warnings
from scipy.io import arff
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from utilities import plot_counts
from utilities import plot_features, ordinal_encode, normalize, plot_confusion_matrix
from utilities import plot_features, ordinal_encode, normalize, plot_confusion_matrix, print_high_confidence_samples, import_data
warnings.filterwarnings("ignore")
# Constants
y_data = 'class'
read_csv = False
csv_path = "nsl-kdd-dataset/" + "fruits_dataset.csv"
arff_path = "nsl-kdd-dataset/" + "KDDTest+.arff"
df_train, df_test, model_name = import_data(
    train_file_path = "nsl-kdd-dataset/" + "KDDTrain+.arff",
    test_file_path = "nsl-kdd-dataset/" + "KDDTest+.arff",
    model_name = "Decision Tree")
sc = StandardScaler()
enc = LabelEncoder()
model_name = "Decision Tree"
if read_csv:
    # Getting data frame (CSV)
    df = pd.read_csv(csv_path)
else:
    # Getting data frame (ARFF)
    data, meta = arff.loadarff(arff_path)
    df = pd.DataFrame(data)
    df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
X = df.select_dtypes(include=[np.number])
y = df[y_data]
is_threat = df[y_data].unique()
is_threat = df_train[y_data].unique()
if len(is_threat) != 2:
raise Exception("Target must be a binary decision.")
ordinal_encode(df= df, categories = is_threat, target = y_data)
normalize(df, y_data, sc, enc)
sns.countplot(x = y_data, data = df)
plot_counts(model_name=model_name)
# Encode and normalize data
ordinal_encode(df = df_train, categories = is_threat, target = y_data)
ordinal_encode(df = df_test, categories = is_threat, target = y_data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
normalize(df_train, df_test, y_data, sc, enc)
def train(dct_model):
    dct_model.fit(X_train, y_train)
    y_prediction = dct_model.predict(X_test)
    print("Classification report: \n", classification_report(y_test, y_prediction))
    plot_confusion_matrix(confusion_matrix = confusion_matrix(y_test, y_prediction),
                          accuracy = dct_model.score(X_test, y_test),
                          model_name=model_name)
# Plot absolute quantities of class 0 and class 1
sns.countplot(x = y_data, data = df_train)
plot_counts(model_name = model_name)
dtc = DecisionTreeClassifier()
train(dtc)
features = pd.DataFrame(dtc.feature_importances_,
                        index= X.columns,
                        columns=['Importance']).sort_values(by='Importance', ascending=False)
plot_features(features, model_name=model_name)
# Separate X and y
X_train = df_train.select_dtypes(include=[np.number]).drop(columns = [y_data])
X_test = df_test.select_dtypes(include=[np.number]).drop(columns = [y_data])
y_train = df_train[[y_data]]
y_test = df_test[[y_data]]
# Different parameters
# Criterion modifiers ('criterion'):
@@ -89,17 +68,26 @@ plot_features(features, model_name=model_name)
# What is overfitting?
# Machine learning model "memorizes" the training data, rather than understanding the underlying pattern
# Different parameters
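# A quick way to check for this (a sketch using the variables defined above;
# the entropy/ccp_alpha settings mirror the pruned tree tried below):
# a large train/test accuracy gap suggests memorization, and
# cost-complexity pruning (ccp_alpha) usually narrows it.
for alpha in (0.0, 0.04):
    tree = DecisionTreeClassifier(criterion='entropy', ccp_alpha=alpha)
    tree.fit(X_train, y_train)
    print(alpha, tree.score(X_train, y_train), tree.score(X_test, y_test))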
dtc2 = DecisionTreeClassifier(criterion ='entropy', ccp_alpha = 0.04)
train(dtc2)
features = pd.DataFrame(dtc2.feature_importances_,
                        index= X.columns,
# Training model
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_prediction = dtc.predict(X_test)
print("Classification report: \n", classification_report(y_test, y_prediction))
plot_confusion_matrix(confusion_matrix = confusion_matrix(y_test, y_prediction),
                      accuracy = dtc.score(X_test, y_test),
                      model_name=model_name)
# Determine feature importance
features = pd.DataFrame(dtc.feature_importances_,
                        index= X_train.columns,
                        columns=['Importance']).sort_values(by='Importance', ascending=False)
plot_features(features, model_name=model_name + "2")
plot_features(features, model_name = model_name)
def predict(prediction_input):
    if len(prediction_input) == 0:
        return
    input_data = pd.DataFrame(prediction_input, columns = X.columns)
    input_data = pd.DataFrame(prediction_input, columns = X_train.columns)
    return dtc.predict(input_data)
print_high_confidence_samples(model = dtc, x = X_train)
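# For reference, a hypothetical call to the predict helper above; the
# all-zeros row is only a placeholder standing in for one already-encoded,
# already-scaled feature vector.
sample = [[0.0] * len(X_train.columns)]
print(predict(sample))  # encoded label, e.g. array([0]) or array([1])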
@@ -3,62 +3,54 @@ import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.io import arff
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from utilities import ordinal_encode, normalize, plot_confusion_matrix, plot_counts
from utilities import ordinal_encode, normalize, plot_confusion_matrix, plot_counts, import_data
warnings.filterwarnings("ignore")
# Constants
y_data = 'class'
read_csv = False
csv_path = "nsl-kdd-dataset/" + "fruits_dataset.csv"
arff_path = "nsl-kdd-dataset/" + "KDDTrain+.arff"
df_train, df_test, model_name = import_data(
    train_file_path = "nsl-kdd-dataset/" + "KDDTrain+.arff",
    test_file_path = "nsl-kdd-dataset/" + "KDDTest+.arff",
    model_name = "KNN")
sc = StandardScaler()
enc = LabelEncoder()
model_name = "KNN"
if read_csv:
    # Getting data frame (CSV)
    df = pd.read_csv(csv_path)
else:
    # Getting data frame (ARFF)
    data, meta = arff.loadarff(arff_path)
    df = pd.DataFrame(data)
    df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
X = df.select_dtypes(include=[np.number])
y = df[y_data]
is_threat = df[y_data].unique()
is_threat = df_train[y_data].unique()
if len(is_threat) != 2:
raise Exception("Target must be a binary decision.")
ordinal_encode(df= df, categories = is_threat, target = y_data)
normalize(df, y_data, sc, enc)
# Encode and normalize data
ordinal_encode(df = df_train, categories = is_threat, target = y_data)
ordinal_encode(df = df_test, categories = is_threat, target = y_data)
sns.countplot(x = y_data, data = df)
plot_counts(model_name=model_name)
normalize(df_train, df_test, y_data, sc, enc)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# Plot absolute quantities of class 0 and class 1
sns.countplot(x = y_data, data = df_train)
plot_counts(model_name = model_name)
def train(knn_model):
    knn_model.fit(X_train, y_train)
    y_prediction = knn_model.predict(X_test)
    print("Classification report: \n", classification_report(y_test, y_prediction))
    plot_confusion_matrix(confusion_matrix = confusion_matrix(y_test, y_prediction),
                          accuracy = knn_model.score(X_test, y_test),
                          model_name=model_name)
# Separate X and y
X_train = df_train.select_dtypes(include=[np.number]).drop(columns = [y_data])
X_test = df_test.select_dtypes(include=[np.number]).drop(columns = [y_data])
y_train = df_train[[y_data]]
y_test = df_test[[y_data]]
knn = KNeighborsClassifier()
train(knn)
# Training model
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
y_prediction = knn_model.predict(X_test)
print("Classification report: \n", classification_report(y_test, y_prediction))
plot_confusion_matrix(confusion_matrix = confusion_matrix(y_test, y_prediction),
                      accuracy = knn_model.score(X_test, y_test),
                      model_name=model_name)
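# KNeighborsClassifier defaults to n_neighbors = 5. As a sketch (not part of
# this commit, and strictly illustrative: tuning against the test split leaks
# information, so a validation split would be the cleaner choice), k could be
# swept like this; smaller k tends to fit noise, larger k smooths the boundary.
for k in (1, 5, 15):
    acc = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).score(X_test, y_test)
    print(k, acc)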
def predict(prediction_input):
    if len(prediction_input) == 0:
        return
    input_data = pd.DataFrame(prediction_input, columns = X_train.columns)
    return knn.predict(input_data)
    return knn_model.predict(input_data)
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.io import arff
import warnings
from sklearn.preprocessing import StandardScaler, LabelEncoder
from utilities import ordinal_encode, heat_map, plot_features, plot_confusion_matrix, normalize, print_high_confidence_samples, plot_counts
from utilities import ordinal_encode, heat_map, plot_features, plot_confusion_matrix, normalize, \
    print_high_confidence_samples, plot_counts, import_data
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
# Constants
X_data = 'count'
y_data = 'class'
read_csv = False
csv_path = "nsl-kdd-dataset/" + "fruits_dataset.csv"
arff_path = "nsl-kdd-dataset/" + "KDDTrain+.arff"
df_train, df_test, model_name = import_data(
    train_file_path = "nsl-kdd-dataset/" + "KDDTrain+.arff",
    test_file_path = "nsl-kdd-dataset/" + "KDDTest+.arff",
    model_name = "Logistic Regression")
sc = StandardScaler()
enc = LabelEncoder()
model_name = "Logistic Regression"
if read_csv:
    # Getting data frame (CSV)
    df = pd.read_csv(csv_path)
else:
    # Getting data frame (ARFF)
    data, meta = arff.loadarff(arff_path)
    df = pd.DataFrame(data)
    df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
# Data inspection and pre-processing
print("Before: \n", df)
is_threat = df[y_data].unique()
is_threat = df_train[y_data].unique()
if len(is_threat) != 2:
raise Exception("Logistic Regression only works for binary classification.")
numerical_columns = df.select_dtypes(include=np.number).columns
label_columns = df.select_dtypes(include=object, exclude=np.number).columns
numerical_columns = df_train.select_dtypes(include = np.number).columns
label_columns = df_train.select_dtypes(include=object, exclude = np.number).columns
# Encode and normalize data
ordinal_encode(df= df, categories = is_threat, target = y_data)
normalize(df, y_data, sc, enc)
print("After: \n", df)
ordinal_encode(df = df_train, categories = is_threat, target = y_data)
ordinal_encode(df = df_test, categories = is_threat, target = y_data)
normalize(df_train, df_test, y_data, sc, enc)
# Correlation
heat_map(df, model_name=model_name)
heat_map(df_train, model_name = model_name)
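# heat_map is assumed here to wrap a seaborn correlation heatmap; a rough
# inline equivalent (given that every column is numeric after normalization):
import matplotlib.pyplot as plt
sns.heatmap(df_train.corr(), cmap='coolwarm')
plt.show()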
# Count plot
sns.countplot(x = y_data, data = df)
sns.countplot(x = y_data, data = df_train)
plot_counts(model_name=model_name)
# Data preparation
X = df.select_dtypes(include = np.number).drop(columns = y_data) # X must be independent
y = df[[y_data]] # y must ideally be dependent on X, and in case of logistic regression, must be a binary decision
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# Separate X and y
X_train = df_train.select_dtypes(include=[np.number]).drop(columns = [y_data])
X_test = df_test.select_dtypes(include=[np.number]).drop(columns = [y_data])
y_train = df_train[[y_data]]
y_test = df_test[[y_data]]
# 'penalty'
# Helps prevent overfitting.
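# A minimal sketch of the idea, using the variables above: C is the inverse
# regularization strength, so smaller C penalizes large coefficients harder.
for C in (0.01, 1.0):
    lr = LogisticRegression(penalty='l2', C=C, max_iter=1000)
    lr.fit(X_train, y_train.values.ravel())
    print(C, np.abs(lr.coef_).max())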
@@ -96,19 +85,23 @@ y_prediction = model.predict(X_test)
plot_confusion_matrix(confusion_matrix = confusion_matrix(y_test, y_prediction),
                      accuracy = model.score(X_test, y_test),
                      model_name=model_name)
print(classification_report(y_test, y_prediction))
print("Classification report: \n", classification_report(y_test, y_prediction))
# Display feature importance
features = pd.DataFrame(model.coef_[0], index= X_train.columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
plot_features(features = features,
info_text= "Positive coefficient = More probable to land in class 1 \n"
"Negative coefficient = Less probable to land in class 1",
model_name=model_name)
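# The reading in info_text follows from the model form; a hand-computed
# sketch for one training row, matching predict_proba up to floating point:
# P(class 1) = sigmoid(x . coef + intercept), so a positive coefficient
# pushes the probability toward class 1 as that feature grows.
row = X_train.iloc[[0]]
z = (row.values @ model.coef_[0] + model.intercept_[0])[0]
print(1.0 / (1.0 + np.exp(-z)), model.predict_proba(row)[0, 1])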
# Display samples for which the model is 90% confident
print_high_confidence_samples(model, X_train)
def predict(prediction_input):
    if len(prediction_input) == 0:
        return
    input_df = pd.DataFrame(prediction_input, columns = X.columns)
    input_df = pd.DataFrame(prediction_input, columns = X_train.columns)
    input_df[numerical_columns] = sc.transform(input_df[numerical_columns])
    return ["anomaly" if x == 1 else "normal" for x in model.predict(input_df)]
print_high_confidence_samples(model, X)
features = pd.DataFrame(model.coef_[0], index= X.columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
plot_features(features, "Positive coefficient = More probable to land in class 1\n"
"Negative coefficient = Less probable to land in class 1",
model_name=model_name)
@@ -8,42 +8,40 @@ from sklearn.model_selection import train_test_split, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from utilities import plot_precision_recall_curve, plot_learning_curve
from utilities import plot_precision_recall_curve, plot_learning_curve, import_data
from utilities import ordinal_encode, heat_map, plot_features, plot_confusion_matrix, normalize, print_high_confidence_samples
warnings.filterwarnings("ignore")
# Constants
X_data = 'count'
y_data = 'class'
arff_path = "nsl-kdd-dataset/" + "KDDTrain+.arff"
df_train, df_test, model_name = import_data(
    train_file_path = "nsl-kdd-dataset/" + "KDDTrain+.arff",
    test_file_path = "nsl-kdd-dataset/" + "KDDTest+.arff",
    model_name = "Random Forest")
sc = StandardScaler()
enc = LabelEncoder()
model_name = "Random Forest"
data, meta = arff.loadarff(arff_path)
df = pd.DataFrame(data)
df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
print("Before Processing: \n", df_train)
print("Before Processing: \n", df)
is_threat = df_train[y_data].unique()
numerical_columns = df_train.select_dtypes(include=np.number).columns
label_columns = df_train.select_dtypes(include=object, exclude=np.number).columns
is_threat = df[y_data].unique()
numerical_columns = df.select_dtypes(include=np.number).columns
label_columns = df.select_dtypes(include=object, exclude=np.number).columns
# Encode and normalize data
ordinal_encode(df = df_train, categories = is_threat, target = y_data)
ordinal_encode(df = df_test, categories = is_threat, target = y_data)
# Normalize and encode data
ordinal_encode(df=df, categories=is_threat, target=y_data)
normalize(df, y_data, sc, enc)
print("After Processing: \n", df)
normalize(df_train, df_test, y_data, sc, enc)
# Correlation
heat_map(df, model_name=model_name)
heat_map(df_train, model_name = model_name)
# Data Preparation
X = df.select_dtypes(include = np.number).drop(columns = y_data) # X must be independent
y = df[[y_data]] # y must ideally be dependent on X
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Separate X and y
X_train = df_train.select_dtypes(include=[np.number]).drop(columns = [y_data])
X_test = df_test.select_dtypes(include=[np.number]).drop(columns = [y_data])
y_train = df_train[[y_data]]
y_test = df_test[[y_data]]
# Train Random Forest Model
model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
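# Side note (a sketch, not part of this commit): because the forest
# bootstraps its trees, out-of-bag samples give a cheap generalization
# estimate to sanity-check against the KDDTest+ results.
oob_model = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
oob_model.fit(X_train, y_train.values.ravel())
print("OOB accuracy:", oob_model.oob_score_)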
@@ -57,20 +55,14 @@ plot_confusion_matrix(confusion_matrix=confusion_matrix(y_test, y_prediction),
                      accuracy=model.score(X_test, y_test),
                      model_name=model_name)
print(classification_report(y_test, y_prediction))
print("Classification Report: \n", classification_report(y_test, y_prediction))
# Prediction function
def predict(prediction_input):
    if len(prediction_input) == 0:
        return
    input_df = pd.DataFrame(prediction_input, columns=X.columns)
    input_df[numerical_columns] = sc.transform(input_df[numerical_columns])
    return ["anomaly" if x == 1 else "normal" for x in model.predict(input_df)]
print_high_confidence_samples(model, X)
# Print the samples for which the model is at least 90% confident
print_high_confidence_samples(model, X_train)
# Feature Importance Plot
features = pd.DataFrame(model.feature_importances_, index=X.columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
features = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
plot_features(features, "Higher importance = More impact on classification", model_name=model_name)
# Precision-Recall Curve
@@ -81,5 +73,14 @@ plot_precision_recall_curve(precision, recall, model_name)
# Learning Curve
print("Calculating Learning Curve")
train_sizes, train_scores, test_scores = learning_curve(model, X, y.values.ravel(), cv=5, scoring="accuracy")
train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train.values.ravel(), cv=5, scoring="accuracy")
plot_learning_curve(train_sizes, train_scores, test_scores, model_name)
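# For interpretation, a small sketch printing the mean scores behind the
# plotted curve: converging train and validation accuracy suggests more data
# would not help much, while a persistent gap points back to overfitting.
for n, tr, va in zip(train_sizes, train_scores.mean(axis=1), test_scores.mean(axis=1)):
    print(n, round(tr, 3), round(va, 3))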
# Prediction function
def predict(prediction_input):
    if len(prediction_input) == 0:
        return
    input_df = pd.DataFrame(prediction_input, columns=X_train.columns)
    input_df[numerical_columns] = sc.transform(input_df[numerical_columns])
    return ["anomaly" if x == 1 else "normal" for x in model.predict(input_df)]
@@ -4,6 +4,8 @@ import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.io import arff
from sklearn.base import BaseEstimator
from sklearn.preprocessing import OrdinalEncoder
show_plots = False
@@ -106,14 +108,18 @@ def plot_features(features, info_text: str = None, model_name=None):
    if show_plots:
        plt.show()
def normalize(df, exclude, numerical_scaler, label_scaler):
def normalize(df_train, df_test, exclude, numerical_scaler, label_scaler):
    scale_targets = df.select_dtypes(include=np.number).drop(columns=exclude).columns
    df[scale_targets] = numerical_scaler.fit_transform(df[scale_targets])
    scale_targets = df_train.select_dtypes(include=np.number).drop(columns=exclude).columns
    df_train[scale_targets] = numerical_scaler.fit_transform(df_train[scale_targets])
    df_test[scale_targets] = numerical_scaler.transform(df_test[scale_targets])
    labels = df.select_dtypes(include=object, exclude=np.number).columns
    labels = df_train.select_dtypes(include=object, exclude=np.number).columns
    for label in labels:
        df[label] = label_scaler.fit_transform(df[label])
        df_train[label] = label_scaler.fit_transform(df_train[label])
        df_test[label] = label_scaler.transform(df_test[label])
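# The asymmetry above is deliberate: the scalers are fit on the training
# frame only and merely applied to the test frame, so test statistics never
# leak into training. A self-contained sketch of that contract:
from sklearn.preprocessing import StandardScaler
demo_scaler = StandardScaler()
demo_train = np.array([[0.0], [10.0]])           # mean 5.0, std 5.0 learned here
demo_scaler.fit_transform(demo_train)
print(demo_scaler.transform(np.array([[5.0]])))  # [[0.]] via training statistics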
def plot_confusion_matrix(confusion_matrix: List[List[int]], accuracy: float, model_name=None) -> None:
    if len(confusion_matrix) != 2 or any(len(row) != 2 for row in confusion_matrix):
@@ -186,3 +192,15 @@ def plot_learning_curve(train_sizes, train_scores, test_scores, model_name=None)
def save_plot(name):
    plt.savefig("resulting_figures/" + name, dpi=300, bbox_inches='tight')
def import_data(train_file_path: str, test_file_path: str, model_name: str):
    # Importing training data set
    data, meta = arff.loadarff(train_file_path)
    df_train = pd.DataFrame(data)
    df_train = df_train.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
    # Importing test data set
    data, meta = arff.loadarff(test_file_path)
    df_test = pd.DataFrame(data)
    df_test = df_test.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
    return df_train, df_test, model_name
\ No newline at end of file
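# A possible tightening of import_data (a sketch only, behavior unchanged):
# factor the duplicated load-and-decode steps into a helper.
def _load_arff(path: str) -> pd.DataFrame:
    data, _meta = arff.loadarff(path)
    df = pd.DataFrame(data)
    return df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

def import_data(train_file_path: str, test_file_path: str, model_name: str):
    return _load_arff(train_file_path), _load_arff(test_file_path), model_name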