9 Final results
Let’s make our final predictions using so far unseen data. After this, we cannot adjust the models anymore without risking data leakage.
glm_ultimate_pred <- predict(glm_final_fit, final_test_df2, type = "prob") %>%
  mutate(Prediction = as.factor(if_else(.pred_True < 0.46, "False", "True")))
glm_submission <- bind_cols(PassengerId = final_test_df2$PassengerId, Transported = glm_ultimate_pred$Prediction)
write_csv(glm_submission, file = "Submissions/GLM submission 2024-02-13 19_40.csv")
glmnet_ultimate_pred <- predict(glmnet_final_fit, final_test_df2, type = "class")
glmnet_submission <- bind_cols(PassengerId = final_test_df2$PassengerId, Transported = glmnet_ultimate_pred$.pred_class)
write_csv(glmnet_submission, file = "Submissions/GLMNET submission 2024-02-13 19_40.csv")
gam_ultimate_pred <- predict(gam_final_fit, final_test_df2, type = "class")
gam_submission <- bind_cols(PassengerId = final_test_df2$PassengerId, Transported = gam_ultimate_pred$.pred_class)
write_csv(gam_submission, file = "Submissions/GAM submission 2024-02-13 19_40.csv")
svm_linear_ultimate_pred <- predict(svm_linear_final_fit, final_test_df2, type = "class")
svm_linear_submission <- bind_cols(PassengerId = final_test_df2$PassengerId, Transported = svm_linear_ultimate_pred$.pred_class)
write_csv(svm_linear_submission, file = "Submissions/SVM linear submission 2024-02-13 19_40.csv")
svm_poly_ultimate_pred <- predict(svm_poly_final_fit, final_test_df2, type = "class")
svm_poly_submission <- bind_cols(PassengerId = final_test_df2$PassengerId, Transported = svm_poly_ultimate_pred$.pred_class)
write_csv(svm_poly_submission, file = "Submissions/SVM poly submission 2024-02-13 19_40.csv")
svm_rbf_ultimate_pred <- predict(svm_rbf_final_fit, final_test_df2, type = "class")
svm_rbf_submission <- bind_cols(PassengerId = final_test_df2$PassengerId, Transported = svm_rbf_ultimate_pred$.pred_class)
write_csv(svm_rbf_submission, file = "Submissions/SVM rbf submission 2024-02-13 19_40.csv")
nb_ultimate_pred <- predict(nb_final_fit, final_test_df2, type = "class")
nb_submission <- bind_cols(PassengerId = final_test_df2$PassengerId, Transported = nb_ultimate_pred$.pred_class)
write_csv(nb_submission, file = "Submissions/NB submission 2024-02-13 19_40.csv")
knn_ultimate_pred <- predict(knn_final_fit, final_test_df2, type = "class")
knn_submission <- bind_cols(PassengerId = final_test_df2$PassengerId, Transported = knn_ultimate_pred$.pred_class)
write_csv(knn_submission, file = "Submissions/KNN submission 2024-02-13 19_40.csv")
c5.0_ultimate_pred <- predict(c5.0_final_fit, final_test_df2, type = "class")
c5.0_submission <- bind_cols(PassengerId = final_test_df2$PassengerId, Transported = c5.0_ultimate_pred$.pred_class)
write_csv(c5.0_submission, file = "Submissions/C5_0 submission 2024-02-13 19_40.csv")
rf_ultimate_pred <- predict(rf_final_fit, final_test_df2, type = "class")
rf_submission <- bind_cols(PassengerId = final_test_df2$PassengerId, Transported = rf_ultimate_pred$.pred_class)
write_csv(rf_submission, file = "Submissions/RF submission 2024-02-13 19_40.csv")
xgb_ultimate_pred <- predict(xgb_final_fit, final_test_df2, type = "class")
xgb_submission <- bind_cols(PassengerId = final_test_df2$PassengerId, Transported = xgb_ultimate_pred$.pred_class)
write_csv(xgb_submission, file = "Submissions/XGB submission 2024-02-13 19_40.csv")Now we summarise both resampling and test results in a table.
modelresults <- readxl::read_xlsx("Extra/Modelresults.xlsx")
modelresults2 <- modelresults %>%
  mutate(`Resample Accuracy` = c(glm_final_acc, glmnet_final_acc, gam_final_acc, svm_linear_final_acc, svm_poly_final_acc,
                                 svm_rbf_final_acc, nb_final_acc, knn_final_acc, c5.0_final_acc, rf_final_acc, xgb_final_acc),
         `Test Data Accuracy` = c(0.79565, 0.79775, 0.79775, 0.79424, NA, 0.79658, 0.73182, 0.77671, 0.76852,
                                  0.78045, 0.78068)) %>%
  mutate(across(.cols = c(2:3), .fns = ~ round(.x, 3)))
knitr::kable(modelresults2, align = "lcc") %>%
  kableExtra::kable_styling(bootstrap_options = "striped")| Model | Resample Accuracy | Test Data Accuracy | 
|---|---|---|
| Logistic Regression (GLM) | 0.801 | 0.796 | 
| Penalized logistic regression (Lasso/Ridge) | 0.809 | 0.798 | 
| Generalized Additive Models (GAM) | 0.807 | 0.798 | 
| Support Vector Machines (SVM-linear) | 0.804 | 0.794 | 
| Support Vector Machines (SVM-poly) | 0.804 | NA | 
| Support Vector Machines (SVM-rbf) | 0.805 | 0.797 | 
| Naive Bayes | 0.731 | 0.732 | 
| K-nearest neighbors | 0.782 | 0.777 | 
| C5.0 | 0.772 | 0.769 | 
| RandomForest | 0.794 | 0.780 | 
| Boosted tree-model (XGBoost) | 0.795 | 0.781 | 
We see that our resampling methods give reasonably accurate estimates of accuracy. Apart from the Naive Bayes model that has low performance, most other models do well with this data set. The explanation for the Naive Bayes underperformance is likely that it doesn’t do so well on data that has been converted to dummy variables - it probably requires a different recipe to reach similar performance as the other models.
The simplest models seem to also have the best accuracy with the best two being penalized regression as well as general additive models. I was hoping to score above 80% but it seems as if more can be done with the data.