Commit 384d401a authored by Daniel Yang

refactored code

parent 67dd971a
@@ -31,9 +31,6 @@ ordinal_encode(df = df_test, categories = y_values, target = y_data)
normalize(df_train, df_test, y_data, sc, enc)
# Correlation
heat_map(df_train, model_name = model_name)
# Separate X and y
X_train = df_train.select_dtypes(include=[np.number]).drop(columns = [y_data])
X_test = df_test.select_dtypes(include=[np.number]).drop(columns = [y_data])
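# Illustrative sketch, not part of this commit: ordinal_encode, normalize and
# heat_map come from the project's utilities module, which this diff does not
# show. Assuming heat_map draws a correlation heatmap of the numeric columns and
# labels it with the model name, a minimal version could look like this:
def heat_map_sketch(df, model_name):
    import matplotlib.pyplot as plt
    import seaborn as sns
    # Correlate only the numeric columns, mirroring the select_dtypes call above
    correlations = df.select_dtypes(include="number").corr()
    sns.heatmap(correlations, cmap="coolwarm")
    plt.title(f"Feature correlation ({model_name})")
    plt.tight_layout()
    plt.show()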
@@ -42,48 +39,61 @@ y_test = df_test[[y_data]]
# Train Random Forest Model
model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
model.fit(X_train, y_train.values.ravel())
# Predictions
y_prediction = model.predict(X_test)
# Prediction function
def predict(prediction_input):
if len(prediction_input) == 0:
return
input_df = pd.DataFrame(prediction_input, columns=X_train.columns)
input_df[numerical_columns] = sc.transform(input_df[numerical_columns])
return ["anomaly" if x == 1 else "normal" for x in model.predict(input_df)]
# Plot Confusion Matrix
plot_confusion_matrix(confusion_matrix=confusion_matrix(y_test, y_prediction),
accuracy=model.score(X_test, y_test),
model_name=model_name)
def train():
model.fit(X_train, y_train.values.ravel())
graphs()
print("Training complete.")
print("Classification Report: \n", classification_report(y_test, y_prediction))
def graphs():
# Correlation
heat_map(df_train, model_name=model_name)
# Predictions
y_prediction = model.predict(X_test)
# Get high confidence samples for which the model is 90% confident
print_high_confidence_samples(model, X_train)
# Plot Confusion Matrix
plot_confusion_matrix(confusion_matrix=confusion_matrix(y_test, y_prediction),
accuracy=model.score(X_test, y_test),
model_name=model_name)
# Feature Importance Plot
features = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
plot_features(features, "Higher importance = More impact on classification", model_name=model_name)
print("Classification Report: \n", classification_report(y_test, y_prediction))
# Precision-Recall Curve
print("Calculating Precision Recall Curve")
y_scores = model.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_scores)
plot_precision_recall_curve(precision, recall, model_name)
# Get high confidence samples for which the model is 90% confident
print_high_confidence_samples(model, X_train)
# Learning Curve
print("Calculating Learning Curve")
train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train.values.ravel(), cv=5, scoring="accuracy")
plot_learning_curve(train_sizes, train_scores, test_scores, model_name)
# Feature Importance Plot
features = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=['Importance']).sort_values(
by='Importance', ascending=False)
plot_features(features, "Higher importance = More impact on classification", model_name=model_name)
# Calculate prediction probabilities for ROC curve
y_score = model.predict_proba(X_test)[:, 1]
# Precision-Recall Curve
print("Calculating Precision Recall Curve")
y_scores = model.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_scores)
plot_precision_recall_curve(precision, recall, model_name)
# Plot ROC curve using the function from utilities
plot_roc_curve(y_test, y_score, model_name=model_name)
# Learning Curve
print("Calculating Learning Curve")
train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train.values.ravel(), cv=5,
scoring="accuracy")
plot_learning_curve(train_sizes, train_scores, test_scores, model_name)
# Calculate prediction probabilities for ROC curve
y_score = model.predict_proba(X_test)[:, 1]
# Plot ROC curve using the function from utilities
plot_roc_curve(y_test, y_score, model_name=model_name)
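# Illustrative sketch, not part of this commit: print_high_confidence_samples is
# imported from the project's utilities and is not shown in this diff. Assuming
# it reports the rows the model classifies with at least 90% probability, a
# minimal version could look like this (name and threshold are assumptions):
def print_high_confidence_samples_sketch(model, X, threshold=0.9):
    # predict_proba gives one probability per class; keep rows whose best class
    # probability meets the threshold
    probabilities = model.predict_proba(X)
    confident_mask = probabilities.max(axis=1) >= threshold
    print(f"{confident_mask.sum()} samples predicted with >= {threshold:.0%} confidence")
    print(X[confident_mask].head())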
if __name__ == "__main__":
train()
# Prediction function
def predict(prediction_input):
if len(prediction_input) == 0:
return
input_df = pd.DataFrame(prediction_input, columns=X_train.columns)
input_df[numerical_columns] = sc.transform(input_df[numerical_columns])
return ["anomaly" if x == 1 else "normal" for x in model.predict(input_df)]