Daniel Yang authored
added import_data method for importing data, now using the given KDDTest+ and KDDTrain+ data for training instead of only KDDTrain+, a lot of refactoring
utilities.py 6.73 KiB
from typing import List
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.io import arff
from sklearn.base import BaseEstimator
from sklearn.preprocessing import OrdinalEncoder
show_plots = False
def ordinal_encode(df, categories, target):
"""
Applies ordinal encoding to a specified categorical column in a DataFrame.
Parameters:
df (pd.DataFrame): The input DataFrame.
categories (list): A list containing the ordered categories for encoding.
target (str): The column name to be encoded.
Raises:
TypeError: If categories are not provided or do not contain exactly two elements.
Modifies:
The function modifies the input DataFrame by replacing the target column with its encoded values.
"""
    if categories is None or len(categories) != 2:
        raise TypeError("Categories must be provided and contain exactly two elements")
    df[target] = OrdinalEncoder(categories=[categories]).fit_transform(df[[target]])
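# Usage sketch for ordinal_encode (the column name and category order below are
# hypothetical, not taken from the project's data):
#   df = pd.DataFrame({"class": ["normal", "anomaly", "normal"]})
#   ordinal_encode(df, ["normal", "anomaly"], "class")
#   # "class" is now encoded as 0.0 / 1.0, following the order given in `categories`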
def heat_map(df, model_name=None):
"""
Generates a heatmap of the correlation matrix for numerical features in the DataFrame.
Parameters:
df (pd.DataFrame): The input DataFrame.
Modifies:
Displays a heatmap visualization of the feature correlations.
"""
    # Drop columns that contain NaN values (dropna returns a new DataFrame, so reassign)
    df = df.dropna(axis='columns')
ndf = df[[col for col in df.columns if df[col].nunique() > 1 and pd.api.types.is_numeric_dtype(df[col])]]
corr = ndf.corr()
plt.figure(figsize=(15, 12))
sns.heatmap(corr)
if model_name:
save_plot(model_name + " - Data Correlations")
if show_plots:
plt.show()
def plot_counts(model_name=None):
    """
    Saves and/or shows the count plot that the caller has already drawn on the
    current matplotlib figure.
    """
if model_name:
save_plot(model_name + " - Counts")
if show_plots:
plt.show()
def plot_xy(df, x, y, model_name=None):
"""
Creates a scatter plot for two numerical columns.
Parameters:
df (pd.DataFrame): The input DataFrame.
x (str): The column name for the x-axis.
y (str): The column name for the y-axis.
Modifies:
Displays a scatter plot of the two selected features.
"""
plt.scatter(df[x], df[y])
plt.xlabel(x)
plt.ylabel(y)
if model_name:
save_plot(model_name + " - XY")
if show_plots:
plt.show()
def plot_features(features, info_text: str = None, model_name=None):
"""
Parameters:
columns (list): The list of feature names used in the model.
Modifies:
Displays a horizontal bar chart representing feature importance.
"""
plt.figure(figsize=(10, 10))
# Add labels to bars
for index, value in enumerate(features['Importance']):
plt.text(value + 0.005, index, f'{value:.4f}', fontsize=10, verticalalignment='center')
plt.barh(features.index, features.Importance)
plt.xlabel('Importance')
plt.title('Feature Importance')
plt.gca().invert_yaxis()
# Fix text cutoff issue
plt.subplots_adjust(left=0.3) # ⬅ Increase left margin for feature names
plt.tight_layout()
if info_text:
x_max = plt.xlim()[1]
y_max = plt.ylim()[0]
plt.text(x_max, y_max, info_text, verticalalignment='bottom', horizontalalignment='right')
if model_name:
save_plot(model_name + " - Feature Importance")
if show_plots:
plt.show()
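# Usage sketch for plot_features (assumes a fitted estimator that exposes
# feature_importances_, e.g. a tree-based model; `model` and `x_train` are
# hypothetical names):
#   features = pd.DataFrame({"Importance": model.feature_importances_},
#                           index=x_train.columns).sort_values("Importance", ascending=False)
#   plot_features(features, info_text="example run", model_name="ExampleModel")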
def normalize(df_train, df_test, exclude, numerical_scaler, label_scaler):
    """
    Scales the numerical columns of both DataFrames (except those listed in
    `exclude`) with `numerical_scaler` and encodes the object-dtype columns with
    `label_scaler`. Both scalers are fit on the training set only and then
    applied to the test set.
    """
scale_targets = df_train.select_dtypes(include=np.number).drop(columns=exclude).columns
df_train[scale_targets] = numerical_scaler.fit_transform(df_train[scale_targets])
df_test[scale_targets] = numerical_scaler.transform(df_test[scale_targets])
labels = df_train.select_dtypes(include=object, exclude=np.number).columns
for label in labels:
df_train[label] = label_scaler.fit_transform(df_train[label])
df_test[label] = label_scaler.transform(df_test[label])
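# Usage sketch for normalize (column name and scaler choices are only examples;
# MinMaxScaler / LabelEncoder are not necessarily what the project uses):
#   from sklearn.preprocessing import MinMaxScaler, LabelEncoder
#   normalize(df_train, df_test,
#             exclude=["class"],  # e.g. an already-encoded numeric label column to leave unscaled
#             numerical_scaler=MinMaxScaler(), label_scaler=LabelEncoder())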
def plot_confusion_matrix(confusion_matrix: List[List[int]], accuracy: float, model_name=None) -> None:
if len(confusion_matrix) != 2 or any(len(row) != 2 for row in confusion_matrix):
raise ValueError("Confusion matrices must be 2x2")
plt.figure(figsize=(8, 6))
labels = [[f'True Positive\n{confusion_matrix[0][0]}', f'False Positive\n{confusion_matrix[0][1]}'],
[f'False Negative\n{confusion_matrix[1][0]}', f'True Negative\n{confusion_matrix[1][1]}']]
plt.title(f"Confusion matrix | Accuracy: {accuracy * 100:.2f}%")
sns.heatmap(confusion_matrix, annot=labels, fmt='', cmap='Blues', cbar=False)
if model_name:
save_plot(model_name + " - Confusion Matrix")
if show_plots:
plt.show()
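# Usage sketch for plot_confusion_matrix (the 2x2 matrix below is a made-up
# example, not a real result):
#   cm = [[50, 3], [5, 42]]
#   plot_confusion_matrix(cm, accuracy=0.92, model_name="ExampleModel")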
def print_high_confidence_samples(model, x: pd.DataFrame):
# Get predicted probabilities
predicted_probabilities = pd.DataFrame(model.predict_proba(x)[:, 1],
columns=['confidence level']) # Probability of being class 1
# Filter samples where the model is at least 90% sure
high_confidence_samples = predicted_probabilities[predicted_probabilities['confidence level'] > 0.9]
print(high_confidence_samples.head())
def plot_precision_recall_curve(precision, recall, model_name=None):
"""
A Precision-Recall curve shows the trade-off between precision
(how many predicted positives are actually correct) and recall
(how many actual positives were correctly identified).
A good curve is mostly at the top and right.
"""
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.', label="Precision-Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.grid()
if model_name:
save_plot(model_name + " - Precision Recall Curve")
if show_plots:
plt.show()
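# Usage sketch for plot_precision_recall_curve (uses sklearn's
# precision_recall_curve; `model`, `x_test` and `y_test` are assumed to exist):
#   from sklearn.metrics import precision_recall_curve
#   precision, recall, _ = precision_recall_curve(y_test, model.predict_proba(x_test)[:, 1])
#   plot_precision_recall_curve(precision, recall, model_name="ExampleModel")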
def plot_learning_curve(train_sizes, train_scores, test_scores, model_name=None):
"""
A learning curve helps diagnose overfitting or underfitting by plotting
training and validation performance as training size increases.
"""
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_mean, 'o-', label="Training Score", color="blue")
plt.plot(train_sizes, test_mean, 'o-', label="Validation Score", color="red")
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color="blue")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color="red")
plt.xlabel("Training Size")
plt.ylabel("Accuracy")
plt.title("Learning Curve")
plt.legend()
plt.grid()
if model_name:
save_plot(model_name + " - Learning Curve")
if show_plots:
plt.show()
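# Usage sketch for plot_learning_curve (uses sklearn's learning_curve helper;
# `model`, `x_train` and `y_train` are assumed to exist):
#   from sklearn.model_selection import learning_curve
#   train_sizes, train_scores, test_scores = learning_curve(model, x_train, y_train, cv=5)
#   plot_learning_curve(train_sizes, train_scores, test_scores, model_name="ExampleModel")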
def save_plot(name):
    """Saves the current figure into the resulting_figures directory (which must already exist)."""
    plt.savefig("resulting_figures/" + name, dpi=300, bbox_inches='tight')
def import_data(train_file_path: str, test_file_path: str, model_name: str):
    """
    Loads the training and test ARFF files into DataFrames, decoding the byte
    strings produced by scipy's ARFF reader, and passes the model name through.
    """
    # Importing training data set
    data, meta = arff.loadarff(train_file_path)
    df_train = pd.DataFrame(data)
    df_train = df_train.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
    # Importing test data set
    data, meta = arff.loadarff(test_file_path)
    df_test = pd.DataFrame(data)
    df_test = df_test.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
    return df_train, df_test, model_name
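# Usage sketch for import_data (the file paths below are hypothetical; the
# NSL-KDD ARFF files may live elsewhere in this repository):
#   df_train, df_test, name = import_data("data/KDDTrain+.arff", "data/KDDTest+.arff", "ExampleModel")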