Commit 384d401a authored by Daniel Yang

refactored code

parent 67dd971a
@@ -31,9 +31,6 @@ ordinal_encode(df = df_test, categories = y_values, target = y_data)
 normalize(df_train, df_test, y_data, sc, enc)
 
-# Correlation
-heat_map(df_train, model_name = model_name)
-
 # Separate X and y
 X_train = df_train.select_dtypes(include=[np.number]).drop(columns = [y_data])
 X_test = df_test.select_dtypes(include=[np.number]).drop(columns = [y_data])
@@ -42,48 +39,61 @@ y_test = df_test[[y_data]]
 # Train Random Forest Model
 model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
-model.fit(X_train, y_train.values.ravel())
 
-# Predictions
-y_prediction = model.predict(X_test)
+# Prediction function
+def predict(prediction_input):
+    if len(prediction_input) == 0:
+        return
+    input_df = pd.DataFrame(prediction_input, columns=X_train.columns)
+    input_df[numerical_columns] = sc.transform(input_df[numerical_columns])
+    return ["anomaly" if x == 1 else "normal" for x in model.predict(input_df)]
 
-# Plot Confusion Matrix
-plot_confusion_matrix(confusion_matrix=confusion_matrix(y_test, y_prediction),
-                      accuracy=model.score(X_test, y_test),
-                      model_name=model_name)
-print("Classification Report: \n", classification_report(y_test, y_prediction))
+
+def train():
+    model.fit(X_train, y_train.values.ravel())
+    graphs()
+    print("Training complete.")
 
-# Get high confidence samples for which the model is 90% confident
-print_high_confidence_samples(model, X_train)
+
+def graphs():
+    # Correlation
+    heat_map(df_train, model_name=model_name)
 
-# Feature Importance Plot
-features = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
-plot_features(features, "Higher importance = More impact on classification", model_name=model_name)
+    # Predictions
+    y_prediction = model.predict(X_test)
 
-# Precision-Recall Curve
-print("Calculating Precision Recall Curve")
-y_scores = model.predict_proba(X_test)[:, 1]
-precision, recall, _ = precision_recall_curve(y_test, y_scores)
-plot_precision_recall_curve(precision, recall, model_name)
+    # Plot Confusion Matrix
+    plot_confusion_matrix(confusion_matrix=confusion_matrix(y_test, y_prediction),
+                          accuracy=model.score(X_test, y_test),
+                          model_name=model_name)
+    print("Classification Report: \n", classification_report(y_test, y_prediction))
 
-# Learning Curve
-print("Calculating Learning Curve")
-train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train.values.ravel(), cv=5, scoring="accuracy")
-plot_learning_curve(train_sizes, train_scores, test_scores, model_name)
+    # Get high confidence samples for which the model is 90% confident
+    print_high_confidence_samples(model, X_train)
 
-# Calculate prediction probabilities for ROC curve
-y_score = model.predict_proba(X_test)[:, 1]
+    # Feature Importance Plot
+    features = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=['Importance']).sort_values(
+        by='Importance', ascending=False)
+    plot_features(features, "Higher importance = More impact on classification", model_name=model_name)
 
-# Plot ROC curve using the function from utilities
-plot_roc_curve(y_test, y_score, model_name=model_name)
+    # Precision-Recall Curve
+    print("Calculating Precision Recall Curve")
+    y_scores = model.predict_proba(X_test)[:, 1]
+    precision, recall, _ = precision_recall_curve(y_test, y_scores)
+    plot_precision_recall_curve(precision, recall, model_name)
 
-# Prediction function
-def predict(prediction_input):
-    if len(prediction_input) == 0:
-        return
-    input_df = pd.DataFrame(prediction_input, columns=X_train.columns)
-    input_df[numerical_columns] = sc.transform(input_df[numerical_columns])
-    return ["anomaly" if x == 1 else "normal" for x in model.predict(input_df)]
+    # Learning Curve
+    print("Calculating Learning Curve")
+    train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train.values.ravel(), cv=5,
+                                                            scoring="accuracy")
+    plot_learning_curve(train_sizes, train_scores, test_scores, model_name)
+
+    # Calculate prediction probabilities for ROC curve
+    y_score = model.predict_proba(X_test)[:, 1]
+
+    # Plot ROC curve using the function from utilities
+    plot_roc_curve(y_test, y_score, model_name=model_name)
+
+
+if __name__ == "__main__":
+    train()