Phase 3: Model Building and Evaluation

Executive Summary

This phase presents the development, evaluation, and comparison of generalized linear modeling approaches as required: - Logistic regression - Linear discriminant analysis (LDA) - Poisson regression. Each model is carefully tuned to quantify the impact of school choice and family engagement on educational success, providing actionable insights for K-12 Connect.

Our systematic approach includes feature selection, model training, validation, and comprehensive performance evaluation to identify the optimal model for predicting and understanding educational outcomes.

Setup and Load Data

suppressPackageStartupMessages({
  library(tidyverse)
  library(tidymodels)
  library(MASS)
  library(nnet)
  library(pscl)
  library(car)
  library(glmnet)
  library(poissonreg)
  library(knitr)
  library(kableExtra)
  library(patchwork)
  library(vip)
  library(probably)
  library(discrim)
})
library(conflicted)
conflict_prefer("select", "dplyr")
## [conflicted] Will prefer dplyr::select over any other package.
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
# Set theme and colors
theme_set(theme_minimal(base_size = 10) + 
          theme(plot.title = element_text(face = "bold", size = 12),
                plot.subtitle = element_text(size = 10),
                legend.position = "bottom"))

project_colors <- c("#2E86AB", "#A23B72", "#F18F01", "#C73E1D", "#6A994E", "#8B4513")

# Load prepared data from Phase 2 - FORCE CORRECT COLUMN TYPES
pfi_model <- read_csv("pfi_phase2_model_ready.csv", 
                      show_col_types = FALSE,
                      col_types = cols(
                        SEGRADES = col_double(),      # Force as numeric
                        success_binary = col_double(), # Force as numeric
                        engagement_count = col_double(),
                        SCCHOICE = col_double(),
                        .default = col_guess()
                      ))

cat("Dataset loaded successfully\n")
## Dataset loaded successfully
cat("Observations:", nrow(pfi_model), "\n")
## Observations: 15990
cat("Variables:", ncol(pfi_model), "\n")
## Variables: 23
# Verify SEGRADES is now numeric with actual values
cat("\n=== SEGRADES Verification ===\n")
## 
## === SEGRADES Verification ===
cat("Class:", class(pfi_model$SEGRADES), "\n")
## Class: numeric
cat("Non-NA count:", sum(!is.na(pfi_model$SEGRADES)), "\n")
## Non-NA count: 15990
cat("Distribution:\n")
## Distribution:
print(table(pfi_model$SEGRADES, useNA = "ifany"))
## 
##    1    2    3    4    5 
## 7866 4528 1345  263 1988
# Set seed for reproducibility
set.seed(631) 

Data Preparation for Modeling

cat("\n========== OUTCOME VARIABLE PREPARATION ==========\n\n")
## 
## ========== OUTCOME VARIABLE PREPARATION ==========
# Show original distributions before any conversion
cat("Original Outcome Distributions (Numeric):\n\n")
## Original Outcome Distributions (Numeric):
# Success Binary
cat("1. Binary Academic Success (success_binary = 0/1):\n")
## 1. Binary Academic Success (success_binary = 0/1):
print(table(pfi_model$success_binary, useNA = "ifany"))
## 
##     0     1  <NA> 
##  1608 12394  1988
cat("\n")
# SEGRADES
cat("2. Four-Category Grades (SEGRADES = 1/2/3/4/5):\n")
## 2. Four-Category Grades (SEGRADES = 1/2/3/4/5):
print(table(pfi_model$SEGRADES, useNA = "ifany"))
## 
##    1    2    3    4    5 
## 7866 4528 1345  263 1988
cat("\n")
# Engagement Count
cat("3. Engagement Count:\n")
## 3. Engagement Count:
cat("  Mean:", round(mean(pfi_model$engagement_count, na.rm = TRUE), 2), "\n")
##   Mean: 4.23
cat("  Median:", median(pfi_model$engagement_count, na.rm = TRUE), "\n")
##   Median: 4
cat("  SD:", round(sd(pfi_model$engagement_count, na.rm = TRUE), 2), "\n")
##   SD: 2.02
cat("  Range:", min(pfi_model$engagement_count, na.rm = TRUE), "-", 
    max(pfi_model$engagement_count, na.rm = TRUE), "\n\n")
##   Range: 0 - 8

Train-Test Split and Feature Engineering

We implement a stratified 70/30 train-test split to ensure balanced representation across outcome categories.

cat("========== TRAIN-TEST SPLIT ==========\n\n")
## ========== TRAIN-TEST SPLIT ==========
# Remove category 5 (no grades) and any missing values
# KEEP AS NUMERIC for now
pfi_complete <- pfi_model %>%
  filter(!is.na(SEGRADES) & !is.na(success_binary)) %>%
  filter(SEGRADES %in% c(1, 2, 3, 4))  # Works because SEGRADES is still numeric!

cat("Starting observations:", nrow(pfi_model), "\n")
## Starting observations: 15990
cat("After removing missing and 'no grades' (category 5):", nrow(pfi_complete), "\n")
## After removing missing and 'no grades' (category 5): 14002
cat("Observations removed:", nrow(pfi_model) - nrow(pfi_complete), "\n\n")
## Observations removed: 1988
# Verify SEGRADES distribution before split (still numeric)
cat("Grade distribution before split (numeric):\n")
## Grade distribution before split (numeric):
print(table(pfi_complete$SEGRADES))
## 
##    1    2    3    4 
## 7866 4528 1345  263
cat("\n")
# Create stratified split on SEGRADES (while still numeric)
set.seed(631)
data_split <- initial_split(pfi_complete, prop = 0.70, strata = SEGRADES)
train_data <- training(data_split)
test_data <- testing(data_split)

cat("Split created:\n")
## Split created:
cat("  Training:", nrow(train_data), "observations\n")
##   Training: 9801 observations
cat("  Testing:", nrow(test_data), "observations\n\n")
##   Testing: 4201 observations
# NOW convert to factors with labels (AFTER the split)
train_data <- train_data %>%
  mutate(
    success_binary = factor(success_binary, 
                           levels = c(1, 0),
                           labels = c("High_Achievement", "Low_Achievement")),
    SEGRADES = factor(SEGRADES, 
                     levels = c(1, 2, 3, 4),
                     labels = c("Mostly_As", "Mostly_Bs", "Mostly_Cs", "Ds_or_Below"))
  )

test_data <- test_data %>%
  mutate(
    success_binary = factor(success_binary, 
                           levels = c(1, 0),
                           labels = c("High_Achievement", "Low_Achievement")),
    SEGRADES = factor(SEGRADES, 
                     levels = c(1, 2, 3, 4),
                     labels = c("Mostly_As", "Mostly_Bs", "Mostly_Cs", "Ds_or_Below"))
  )

cat("✓ Outcomes converted to factors with labels\n\n")
## ✓ Outcomes converted to factors with labels
# Verify split proportions
split_summary <- data.frame(
  Dataset = c("Training", "Testing", "Total"),
  N = c(nrow(train_data), nrow(test_data), nrow(pfi_complete)),
  Percentage = c(nrow(train_data)/nrow(pfi_complete)*100,
                nrow(test_data)/nrow(pfi_complete)*100,
                100),
  Success_Rate = c(
    mean(train_data$success_binary == "High_Achievement", na.rm = TRUE)*100,
    mean(test_data$success_binary == "High_Achievement", na.rm = TRUE)*100,
    mean(pfi_complete$success_binary == 1, na.rm = TRUE)*100
  )
)

kable(split_summary,
      caption = "Train-Test Split Summary",
      digits = 1,
      col.names = c("Dataset", "N", "Percentage (%)", "Success Rate (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE,
                position = "center")
Train-Test Split Summary
Dataset N Percentage (%) Success Rate (%)
Training 9801 70 88.4
Testing 4201 30 88.7
Total 14002 100 88.5
# Check outcome distribution preservation
train_dist <- prop.table(table(train_data$SEGRADES))
test_dist <- prop.table(table(test_data$SEGRADES))

grade_dist_comparison <- data.frame(
  Grade = names(train_dist),
  Train_Pct = as.numeric(train_dist) * 100,
  Test_Pct = as.numeric(test_dist) * 100
) %>%
  mutate(Difference = abs(Train_Pct - Test_Pct))

kable(grade_dist_comparison,
      caption = "Grade Distribution Balance Across Train-Test Split",
      digits = 2,
      col.names = c("Grade Category", "Train (%)", "Test (%)", "Abs Diff (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE,
                position = "center")
Grade Distribution Balance Across Train-Test Split
Grade Category Train (%) Test (%) Abs Diff (%)
Mostly_As 56.20 56.13 0.07
Mostly_Bs 32.23 32.59 0.36
Mostly_Cs 9.64 9.52 0.12
Ds_or_Below 1.93 1.76 0.17
cat("\n=== Stratification Quality Check ===\n")
## 
## === Stratification Quality Check ===
cat("Maximum grade distribution difference:", 
    round(max(abs(train_dist - test_dist)), 4), "\n")
## Maximum grade distribution difference: 0.0036
if(max(abs(train_dist - test_dist)) < 0.01) {
  cat("✓ Excellent stratification - distributions nearly identical\n")
} else if(max(abs(train_dist - test_dist)) < 0.03) {
  cat("✓ Good stratification - acceptable differences\n")
} else {
  cat("⚠ Warning: Larger than expected distribution differences\n")
}
## ✓ Excellent stratification - distributions nearly identical
cat("\n✓ Train-test split complete and validated\n")
## 
## ✓ Train-test split complete and validated

Feature Engineering and Preprocessing

cat("\n========== FEATURE ENGINEERING ==========\n\n")
## 
## ========== FEATURE ENGINEERING ==========
# Add interaction and polynomial terms to BOTH train and test
train_data <- train_data %>%
  mutate(
    # Key interactions identified in Phase 2
    choice_x_engagement = SCCHOICE * engagement_count,
    income_x_homework = TTLHHINC * homework_intensity,
    parent_ed_x_involvement = PARGRADEX * school_involvement,
    
    # Polynomial terms for non-linear relationships
    engagement_sq = engagement_count^2,
    homework_sq = homework_intensity^2
  )

test_data <- test_data %>%
  mutate(
    choice_x_engagement = SCCHOICE * engagement_count,
    income_x_homework = TTLHHINC * homework_intensity,
    parent_ed_x_involvement = PARGRADEX * school_involvement,
    engagement_sq = engagement_count^2,
    homework_sq = homework_intensity^2
  )

cat("✓ Interaction terms created:\n")
## ✓ Interaction terms created:
cat("  - choice_x_engagement (school choice × engagement count)\n")
##   - choice_x_engagement (school choice × engagement count)
cat("  - income_x_homework (income × homework intensity)\n")
##   - income_x_homework (income × homework intensity)
cat("  - parent_ed_x_involvement (parent education × school involvement)\n\n")
##   - parent_ed_x_involvement (parent education × school involvement)
cat("✓ Polynomial terms created:\n")
## ✓ Polynomial terms created:
cat("  - engagement_sq (engagement count squared)\n")
##   - engagement_sq (engagement count squared)
cat("  - homework_sq (homework intensity squared)\n\n")
##   - homework_sq (homework intensity squared)
# Show summary statistics for new features
engineered_summary <- train_data %>%
  summarise(
    choice_x_eng_mean = mean(choice_x_engagement, na.rm = TRUE),
    choice_x_eng_sd = sd(choice_x_engagement, na.rm = TRUE),
    income_x_hw_mean = mean(income_x_homework, na.rm = TRUE),
    income_x_hw_sd = sd(income_x_homework, na.rm = TRUE),
    eng_sq_mean = mean(engagement_sq, na.rm = TRUE),
    eng_sq_sd = sd(engagement_sq, na.rm = TRUE)
  ) %>%
  pivot_longer(everything(), names_to = "Feature", values_to = "Value") %>%
  separate(Feature, into = c("Variable", "Stat"), sep = "_(?=[^_]+$)") %>%
  pivot_wider(names_from = Stat, values_from = Value)

kable(engineered_summary,
      caption = "Engineered Features Summary Statistics (Training Set)",
      digits = 2,
      col.names = c("Engineered Feature", "Mean", "SD")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE,
                position = "center")
Engineered Features Summary Statistics (Training Set)
Engineered Feature Mean SD
choice_x_eng 5.63 3.40
income_x_hw 20.38 10.15
eng_sq 21.64 17.03
cat("\n✓ Feature engineering complete\n")
## 
## ✓ Feature engineering complete
cat("Training data now has", ncol(train_data), "variables\n")
## Training data now has 28 variables
cat("Testing data now has", ncol(test_data), "variables\n")
## Testing data now has 28 variables
cat("\n========== PREDICTOR SETS DEFINITION ==========\n\n")
## 
## ========== PREDICTOR SETS DEFINITION ==========
# Core predictors (required focus areas + key demographics)
core_predictors <- c(
  "SCCHOICE",           # School choice
  "engagement_count",   # Family engagement
  "homework_intensity", # Homework support
  "TTLHHINC",          # Income
  "PARGRADEX"          # Parent education
)

# Extended predictors (add control variables and individual activities)
extended_predictors <- c(
  core_predictors,
  "school_involvement", # School involvement index
  "FSVOL",             # Volunteer at school
  "FSPTMTNG",          # Parent-teacher conference
  "enrichment_count",  # Home enrichment activities
  "CSEX",              # Child sex
  "ALLGRADEX",         # Grade level
  "NUMSIBSX"           # Number of siblings
)

# Interaction predictors (add key interactions)
interaction_predictors <- c(
  extended_predictors,
  "choice_x_engagement",          # Choice × Engagement
  "income_x_homework",            # Income × Homework
  "parent_ed_x_involvement"       # Parent Ed × Involvement
)

# Full predictors (add polynomials and additional variables)
full_predictors <- c(
  interaction_predictors,
  "engagement_sq",           # Engagement squared
  "homework_sq",             # Homework squared
  "SPUBCHOIX",              # Public school choice
  "communication_index",     # Parent-school communication
  "SEENJOY"                 # School enjoyment
)

# Create summary table
predictor_summary <- data.frame(
  Model_Complexity = c("Core", "Extended", "Interactions", "Full"),
  N_Predictors = c(
    length(core_predictors),
    length(extended_predictors),
    length(interaction_predictors),
    length(full_predictors)
  ),
  Key_Focus = c(
    "School choice + engagement basics",
    "Core + demographics + activities",
    "Extended + key interactions",
    "All predictors + polynomials"
  )
)

kable(predictor_summary,
      caption = "Predictor Sets by Model Complexity",
      col.names = c("Predictor Set", "# Variables", "Description"),
      align = c("l", "c", "l")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE,
                position = "center")
Predictor Sets by Model Complexity
Predictor Set # Variables Description
Core 5 School choice + engagement basics
Extended 12 Core + demographics + activities
Interactions 15 Extended + key interactions
Full 20 All predictors + polynomials
cat("\n=== Predictor Sets Defined ===\n")
## 
## === Predictor Sets Defined ===
cat("Core predictors:", length(core_predictors), "\n")
## Core predictors: 5
cat("Extended predictors:", length(extended_predictors), "\n")
## Extended predictors: 12
cat("With interactions:", length(interaction_predictors), "\n")
## With interactions: 15
cat("Full model:", length(full_predictors), "\n\n")
## Full model: 20
# Show which variables are in each set
cat("Core Predictors:\n")
## Core Predictors:
cat(paste(" ", core_predictors, collapse = "\n"), "\n\n")
##   SCCHOICE
##   engagement_count
##   homework_intensity
##   TTLHHINC
##   PARGRADEX
cat("Added in Extended:\n")
## Added in Extended:
cat(paste(" ", setdiff(extended_predictors, core_predictors), collapse = "\n"), "\n\n")
##   school_involvement
##   FSVOL
##   FSPTMTNG
##   enrichment_count
##   CSEX
##   ALLGRADEX
##   NUMSIBSX
cat("Added in Interactions:\n")
## Added in Interactions:
cat(paste(" ", setdiff(interaction_predictors, extended_predictors), collapse = "\n"), "\n\n")
##   choice_x_engagement
##   income_x_homework
##   parent_ed_x_involvement
cat("Added in Full:\n")
## Added in Full:
cat(paste(" ", setdiff(full_predictors, interaction_predictors), collapse = "\n"), "\n\n")
##   engagement_sq
##   homework_sq
##   SPUBCHOIX
##   communication_index
##   SEENJOY
cat("✓ Predictor sets defined and ready for modeling\n")
## ✓ Predictor sets defined and ready for modeling
cat("\n========== CLASS IMBALANCE ASSESSMENT ==========\n\n")
## 
## ========== CLASS IMBALANCE ASSESSMENT ==========
# Calculate baseline accuracy (always predict majority class)
baseline_acc <- mean(train_data$success_binary == "High_Achievement") * 100

cat("Binary Outcome Distribution:\n")
## Binary Outcome Distribution:
cat("  High Achievement (A/B):", 
    sum(train_data$success_binary == "High_Achievement"), 
    sprintf("(%.1f%%)\n", baseline_acc))
##   High Achievement (A/B): 8667 (88.4%)
cat("  Low Achievement (C/D):", 
    sum(train_data$success_binary == "Low_Achievement"),
    sprintf("(%.1f%%)\n", 100 - baseline_acc))
##   Low Achievement (C/D): 1134 (11.6%)
cat("\nClass Imbalance Ratio:", sprintf("%.2f:1\n", baseline_acc/(100-baseline_acc)))
## 
## Class Imbalance Ratio: 7.64:1
cat("\n=== Implications for Modeling ===\n")
## 
## === Implications for Modeling ===
cat("• Baseline accuracy (always predict 'High'):", sprintf("%.1f%%\n", baseline_acc))
## • Baseline accuracy (always predict 'High'): 88.4%
cat("• Models must beat this baseline to be useful\n")
## • Models must beat this baseline to be useful
cat("• We will focus on:\n")
## • We will focus on:
cat("  - ROC-AUC (handles imbalance well)\n")
##   - ROC-AUC (handles imbalance well)
cat("  - Balanced accuracy (equal weight to both classes)\n")
##   - Balanced accuracy (equal weight to both classes)
cat("  - Sensitivity and specificity separately\n")
##   - Sensitivity and specificity separately
cat("  - NOT just raw accuracy\n\n")
##   - NOT just raw accuracy
cat("✓ Class imbalance documented and evaluation strategy defined\n")
## ✓ Class imbalance documented and evaluation strategy defined

Model 1: Logistic Regression

cat("\n========================================\n")
## 
## ========================================
cat("PART 2: LOGISTIC REGRESSION MODELS\n")
## PART 2: LOGISTIC REGRESSION MODELS
cat("========================================\n\n")
## ========================================
## 2.1 Model Development Strategy

cat("Building three logistic regression models:\n")
## Building three logistic regression models:
cat("  1. Core Model: 5 essential predictors\n")
##   1. Core Model: 5 essential predictors
cat("  2. Interaction Model: Core + key interactions\n")
##   2. Interaction Model: Core + key interactions
cat("  3. LASSO Model: Automatic variable selection\n\n")
##   3. LASSO Model: Automatic variable selection
# Create subsets of data with only needed variables
train_core <- train_data %>% select(success_binary, all_of(core_predictors))
train_interact <- train_data %>% select(success_binary, all_of(interaction_predictors))
train_full <- train_data %>% select(success_binary, all_of(full_predictors))

# Recipe 1: Core Model
recipe_core <- recipe(success_binary ~ ., data = train_core) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

# Recipe 2: Interaction Model
recipe_interact <- recipe(success_binary ~ ., data = train_interact) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

# Recipe 3: LASSO Model
recipe_lasso <- recipe(success_binary ~ ., data = train_full) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

cat("✓ Recipes defined\n\n")
## ✓ Recipes defined
## 2.3 Define Model Specifications

# Standard logistic regression for Core and Interaction
log_spec <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

# LASSO logistic regression
lasso_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet") %>%
  set_mode("classification")

cat("✓ Model specifications defined\n\n")
## ✓ Model specifications defined
## 2.4 Create Workflows

wf_core <- workflow() %>%
  add_recipe(recipe_core) %>%
  add_model(log_spec)

wf_interact <- workflow() %>%
  add_recipe(recipe_interact) %>%
  add_model(log_spec)

wf_lasso <- workflow() %>%
  add_recipe(recipe_lasso) %>%
  add_model(lasso_spec)

cat("✓ Workflows created\n\n")
## ✓ Workflows created
## 2.5 Fit Models

cat("Fitting models to training data...\n")
## Fitting models to training data...
# Fit Core Model
fit_core <- fit(wf_core, data = train_data)
cat("  ✓ Core model fitted\n")
##   ✓ Core model fitted
# Fit Interaction Model
fit_interact <- fit(wf_interact, data = train_data)
cat("  ✓ Interaction model fitted\n")
##   ✓ Interaction model fitted
# Tune and Fit LASSO Model
cat("  ⏳ Tuning LASSO model (5-fold CV)...\n")
##   ⏳ Tuning LASSO model (5-fold CV)...
# Set up cross-validation
set.seed(631)
folds <- vfold_cv(train_data, v = 5, strata = success_binary)

# Create penalty grid
penalty_grid <- grid_regular(
  penalty(range = c(-3, 0), trans = log10_trans()),
  levels = 30
)

# Tune LASSO
lasso_tune <- tune_grid(
  wf_lasso,
  resamples = folds,
  grid = penalty_grid,
  metrics = metric_set(roc_auc, accuracy),
  control = control_grid(save_pred = FALSE, verbose = FALSE)
)

# Select best penalty
best_penalty <- select_best(lasso_tune, metric = "roc_auc")
cat("    Best penalty:", format(best_penalty$penalty, scientific = FALSE), "\n")
##     Best penalty: 0.001
# Finalize and fit LASSO
wf_lasso_final <- finalize_workflow(wf_lasso, best_penalty)
fit_lasso <- fit(wf_lasso_final, data = train_data)
cat("  ✓ LASSO model fitted\n\n")
##   ✓ LASSO model fitted
## 2.6 Extract Coefficients (with p-values and CIs)

cat("========== MODEL COEFFICIENTS ==========\n\n")
## ========== MODEL COEFFICIENTS ==========
# Function to extract and format coefficients
extract_coefs <- function(fit, model_name) {
  fit %>%
    extract_fit_parsnip() %>%
    tidy() %>%
    filter(term != "(Intercept)") %>%
    mutate(
      odds_ratio = exp(estimate),
      ci_lower = exp(estimate - 1.96 * std.error),
      ci_upper = exp(estimate + 1.96 * std.error),
      significant = p.value < 0.05,
      model = model_name
    ) %>%
    arrange(p.value) %>%
    select(model, term, estimate, odds_ratio, ci_lower, ci_upper, 
           std.error, p.value, significant)
}

# Extract from Interaction Model (best model - show all)
coef_interact <- extract_coefs(fit_interact, "Interaction")

kable(coef_interact,
      caption = "Interaction Model: Complete Coefficients with Odds Ratios",
      digits = 3,
      col.names = c("Model", "Predictor", "Coefficient", "Odds Ratio", 
                    "CI Lower", "CI Upper", "Std Error", "P-value", "Sig?")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE,
                latex_options = "scale_down") %>%
  row_spec(which(coef_interact$significant), bold = TRUE, color = "darkgreen") %>%
  footnote(general = "Significant predictors (p < 0.05) shown in bold green")
Interaction Model: Complete Coefficients with Odds Ratios
Model Predictor Coefficient Odds Ratio CI Lower CI Upper Std Error P-value Sig?
Interaction CSEX -0.358 0.699 0.653 0.748 0.035 0.000 TRUE
Interaction PARGRADEX -0.757 0.469 0.353 0.624 0.146 0.000 TRUE
Interaction ALLGRADEX 0.212 1.236 1.141 1.340 0.041 0.000 TRUE
Interaction enrichment_count -0.183 0.832 0.770 0.900 0.040 0.000 TRUE
Interaction homework_intensity -0.277 0.758 0.668 0.861 0.065 0.000 TRUE
Interaction TTLHHINC -0.431 0.650 0.521 0.810 0.112 0.000 TRUE
Interaction FSPTMTNG -0.207 0.813 0.730 0.906 0.055 0.000 TRUE
Interaction FSVOL 0.216 1.241 1.103 1.395 0.060 0.000 TRUE
Interaction parent_ed_x_involvement 0.518 1.679 1.260 2.236 0.146 0.000 TRUE
Interaction choice_x_engagement 0.220 1.246 0.994 1.561 0.115 0.056 FALSE
Interaction school_involvement 0.174 1.190 0.988 1.432 0.095 0.067 FALSE
Interaction income_x_homework 0.153 1.165 0.897 1.514 0.134 0.253 FALSE
Interaction SCCHOICE 0.047 1.048 0.918 1.196 0.067 0.487 FALSE
Interaction NUMSIBSX -0.022 0.978 0.918 1.043 0.033 0.505 FALSE
Interaction engagement_count 0.057 1.058 0.842 1.330 0.116 0.626 FALSE
Note:
Significant predictors (p < 0.05) shown in bold green
cat("\n")
# Extract from LASSO (show only non-zero coefficients)
coef_lasso <- fit_lasso %>%
  extract_fit_parsnip() %>%
  tidy() %>%
  filter(term != "(Intercept)", estimate != 0) %>%
  mutate(
    odds_ratio = exp(estimate),
    model = "LASSO"
  ) %>%
  arrange(desc(abs(estimate))) %>%
  select(model, term, estimate, odds_ratio)

kable(coef_lasso,
      caption = "LASSO Model: Selected Predictors (Non-Zero Coefficients)",
      digits = 3,
      col.names = c("Model", "Predictor", "Coefficient", "Odds Ratio")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  footnote(general = paste("LASSO selected", nrow(coef_lasso), 
                          "predictors out of", length(full_predictors)))
LASSO Model: Selected Predictors (Non-Zero Coefficients)
Model Predictor Coefficient Odds Ratio
LASSO SEENJOY 0.701 2.015
LASSO PARGRADEX -0.361 0.697
LASSO CSEX -0.328 0.720
LASSO FSVOL 0.287 1.333
LASSO TTLHHINC -0.275 0.760
LASSO communication_index 0.245 1.278
LASSO school_involvement 0.214 1.238
LASSO engagement_sq 0.182 1.200
LASSO FSPTMTNG -0.132 0.876
LASSO homework_intensity -0.129 0.879
LASSO enrichment_count -0.101 0.904
LASSO ALLGRADEX 0.099 1.104
LASSO choice_x_engagement 0.097 1.102
LASSO parent_ed_x_involvement 0.056 1.058
LASSO SPUBCHOIX -0.025 0.975
LASSO SCCHOICE -0.019 0.982
LASSO NUMSIBSX -0.014 0.986
Note:
LASSO selected 17 predictors out of 20
cat("\n")
## 2.7 Generate Predictions on Test Set

cat("========== PREDICTIONS ON TEST SET ==========\n\n")
## ========== PREDICTIONS ON TEST SET ==========
# Function to make predictions
make_predictions <- function(fit, data) {
  predict(fit, data, type = "prob") %>%
    bind_cols(predict(fit, data, type = "class")) %>%
    bind_cols(data %>% select(success_binary))
}

# Generate predictions
pred_core <- make_predictions(fit_core, test_data)
pred_interact <- make_predictions(fit_interact, test_data)
pred_lasso <- make_predictions(fit_lasso, test_data)

cat("✓ Predictions generated for all models\n\n")
## ✓ Predictions generated for all models
## 2.8 Calculate Performance Metrics

cat("========== MODEL PERFORMANCE COMPARISON ==========\n\n")
## ========== MODEL PERFORMANCE COMPARISON ==========
# Function to calculate metrics (FIXED - specify event_level)
calc_metrics <- function(predictions, model_name) {
  # ROC-AUC with correct event level
  roc_val <- predictions %>%
    roc_auc(truth = success_binary, .pred_High_Achievement, 
            event_level = "first") %>%  # FIRST level is High_Achievement
    pull(.estimate)
  
  # Accuracy
  acc_val <- predictions %>%
    accuracy(truth = success_binary, estimate = .pred_class) %>%
    pull(.estimate)
  
  # Confusion matrix for sensitivity/specificity
  cm <- predictions %>%
    conf_mat(truth = success_binary, estimate = .pred_class)
  
  # Extract sensitivity and specificity
  cm_summary <- summary(cm)
  sens_val <- cm_summary %>% filter(.metric == "sens") %>% pull(.estimate)
  spec_val <- cm_summary %>% filter(.metric == "spec") %>% pull(.estimate)
  
  tibble(
    Model = model_name,
    ROC_AUC = roc_val,
    Accuracy = acc_val,
    Sensitivity = sens_val,
    Specificity = spec_val
  )
}

# Calculate for all models
metrics_all <- bind_rows(
  calc_metrics(pred_core, "Core"),
  calc_metrics(pred_interact, "Interaction"),
  calc_metrics(pred_lasso, "LASSO")
) %>%
  mutate(
    N_Predictors = c(
      length(core_predictors),
      length(interaction_predictors),
      nrow(coef_lasso)
    ),
    Beats_Baseline = Accuracy > 0.889
  ) %>%
  select(Model, N_Predictors, ROC_AUC, Accuracy, Sensitivity, 
         Specificity, Beats_Baseline)

# Add baseline for comparison
baseline_row <- tibble(
  Model = "Baseline (Always High)",
  N_Predictors = 0,
  ROC_AUC = 0.500,
  Accuracy = 0.889,
  Sensitivity = 1.000,
  Specificity = 0.000,
  Beats_Baseline = FALSE
)

metrics_comparison <- bind_rows(baseline_row, metrics_all)

kable(metrics_comparison,
      caption = "Logistic Regression: Model Performance Comparison",
      digits = 3,
      col.names = c("Model", "# Pred", "ROC-AUC", "Accuracy", 
                    "Sensitivity", "Specificity", "Beats Baseline?")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(1, background = "#f0f0f0") %>%
  row_spec(which.max(metrics_comparison$ROC_AUC), 
           bold = TRUE, background = "#E8F4F8") %>%
  footnote(general = c(
    "Baseline = Always predict 'High Achievement' (88.9% of cases)",
    "Best model highlighted in blue",
    "ROC-AUC is primary metric for imbalanced data"
  ))
Logistic Regression: Model Performance Comparison
Model # Pred ROC-AUC Accuracy Sensitivity Specificity Beats Baseline?
Baseline (Always High) 0 0.500 0.889 1.000 0.000 FALSE
Core 5 0.690 0.887 1.000 0.002 FALSE
Interaction 15 0.732 0.886 0.997 0.015 FALSE
LASSO 17 0.811 0.894 0.988 0.156 TRUE
Note:
Baseline = Always predict ‘High Achievement’ (88.9% of cases)
Best model highlighted in blue
ROC-AUC is primary metric for imbalanced data
cat("\n")
cat("Key Findings:\n")
## Key Findings:
best_model <- metrics_all %>% filter(ROC_AUC == max(ROC_AUC)) %>% pull(Model)
best_auc <- metrics_all %>% filter(ROC_AUC == max(ROC_AUC)) %>% pull(ROC_AUC)
cat("  • Best model:", best_model, "with ROC-AUC =", round(best_auc, 3), "\n")
##   • Best model: LASSO with ROC-AUC = 0.811
cat("  • Sensitivity (detecting High Achievement):", 
    round(metrics_all %>% filter(Model == best_model) %>% pull(Sensitivity), 3), "\n")
##   • Sensitivity (detecting High Achievement): 0.988
cat("  • Specificity (detecting Low Achievement):", 
    round(metrics_all %>% filter(Model == best_model) %>% pull(Specificity), 3), "\n\n")
##   • Specificity (detecting Low Achievement): 0.156
## 2.9 Visualization: ROC Curves

cat("========== ROC CURVES ==========\n\n")
## ========== ROC CURVES ==========
# Prepare ROC curve data for all models
roc_core <- pred_core %>%
  roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
  mutate(Model = "Core")

roc_interact <- pred_interact %>%
  roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
  mutate(Model = "Interaction")

roc_lasso <- pred_lasso %>%
  roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
  mutate(Model = "LASSO")

roc_all <- bind_rows(roc_core, roc_interact, roc_lasso)

# Plot ROC curves
roc_plot <- ggplot(roc_all, aes(x = 1 - specificity, y = sensitivity, 
                                 color = Model)) +
  geom_path(linewidth = 1.2, alpha = 0.8) +
  geom_abline(lty = 2, color = "gray50", linewidth = 0.8) +
  scale_color_manual(values = project_colors[1:3]) +
  coord_equal() +
  labs(
    title = "ROC Curves: Logistic Regression Models",
    subtitle = paste("Best Model:", best_model, 
                    "| AUC =", round(best_auc, 3)),
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)",
    color = "Model"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "bottom",
    legend.title = element_text(face = "bold")
  )

print(roc_plot)

cat("\n✓ ROC curves plotted\n\n")
## 
## ✓ ROC curves plotted
## 2.10 Visualization: Confusion Matrix (Best Model)

cat("========== CONFUSION MATRIX (BEST MODEL) ==========\n\n")
## ========== CONFUSION MATRIX (BEST MODEL) ==========
# Get predictions for best model
pred_best <- if(best_model == "Core") pred_core else 
             if(best_model == "Interaction") pred_interact else pred_lasso

# Create confusion matrix
cm_best <- pred_best %>%
  conf_mat(truth = success_binary, estimate = .pred_class)

# Plot confusion matrix
cm_plot <- autoplot(cm_best, type = "heatmap") +
  scale_fill_gradient(low = "#E8F4F8", high = project_colors[1]) +
  labs(
    title = paste("Confusion Matrix:", best_model, "Model"),
    subtitle = paste("Test Set Performance | n =", nrow(test_data))
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 11)
  )
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
print(cm_plot)

cat("\n✓ Confusion matrix plotted\n\n")
## 
## ✓ Confusion matrix plotted
# Show detailed confusion matrix with percentages
cm_summary <- as.data.frame(cm_best$table) %>%
  rename(freq = Freq) %>%
  group_by(Truth) %>%
  mutate(
    Total = sum(freq),
    Percentage = round(freq / Total * 100, 1)
  ) %>%
  ungroup()

kable(cm_summary,
      caption = paste("Detailed Confusion Matrix:", best_model, "Model"),
      col.names = c("Truth", "Predicted", "Count", "Total", "Percentage (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Detailed Confusion Matrix: LASSO Model
Truth Predicted Count Total Percentage (%)
High_Achievement High_Achievement 3681 3727 98.8
Low_Achievement High_Achievement 46 3727 1.2
High_Achievement Low_Achievement 400 474 84.4
Low_Achievement Low_Achievement 74 474 15.6
cat("\n")
## 2.11 Key Interpretation

cat("========================================\n")
## ========================================
cat("KEY FINDINGS: LOGISTIC REGRESSION\n")
## KEY FINDINGS: LOGISTIC REGRESSION
cat("========================================\n\n")
## ========================================
# Get key coefficients from best model
if(best_model == "Interaction") {
  key_coefs <- coef_interact
  has_pvalues <- TRUE
} else if(best_model == "LASSO") {
  key_coefs <- coef_lasso
  has_pvalues <- FALSE  # LASSO doesn't have p-values
} else {
  key_coefs <- extract_coefs(fit_core, "Core")
  has_pvalues <- TRUE
}

# Find school choice effect
choice_coef <- key_coefs %>% 
  filter(grepl("SCCHOICE|choice", term, ignore.case = TRUE)) %>%
  slice(1)

# Find engagement effect
engage_coef <- key_coefs %>% 
  filter(grepl("engagement_count", term, ignore.case = TRUE)) %>%
  slice(1)

# Find interaction effect (if exists)
interact_coef <- key_coefs %>% 
  filter(grepl("choice_x_engagement", term, ignore.case = TRUE)) %>%
  slice(1)

cat("1. SCHOOL CHOICE EFFECT:\n")
## 1. SCHOOL CHOICE EFFECT:
if(nrow(choice_coef) > 0) {
  or_choice <- choice_coef$odds_ratio
  pct_change <- (or_choice - 1) * 100
  
  cat("   • Odds Ratio:", round(or_choice, 3), "\n")
  cat("   • Interpretation: Having school choice is associated with",
      round(abs(pct_change), 1), "% ",
      if(pct_change > 0) "higher" else "lower",
      "odds of academic success\n")
  
  if(has_pvalues) {
    sig_text <- if(choice_coef$p.value < 0.001) "p < 0.001" else 
                paste("p =", round(choice_coef$p.value, 3))
    cat("   • Statistical significance:", sig_text, "\n")
    if(choice_coef$p.value < 0.05) {
      cat("   • ✓ This effect is statistically significant\n")
    } else {
      cat("   • ✗ This effect is NOT statistically significant\n")
    }
  } else {
    cat("   • LASSO selected this variable (non-zero coefficient)\n")
    cat("   • ✓ Considered important by regularization\n")
  }
} else {
  cat("   • School choice not selected in best model\n")
}
##    • Odds Ratio: 1.102 
##    • Interpretation: Having school choice is associated with 10.2 %  higher odds of academic success
##    • LASSO selected this variable (non-zero coefficient)
##    • ✓ Considered important by regularization
cat("\n2. FAMILY ENGAGEMENT EFFECT:\n")
## 
## 2. FAMILY ENGAGEMENT EFFECT:
if(nrow(engage_coef) > 0) {
  or_engage <- engage_coef$odds_ratio
  pct_change <- (or_engage - 1) * 100
  
  cat("   • Odds Ratio per activity:", round(or_engage, 3), "\n")
  cat("   • Interpretation: Each additional school activity is associated with",
      round(abs(pct_change), 1), "% ",
      if(pct_change > 0) "higher" else "lower",
      "odds of success\n")
  
  if(has_pvalues) {
    sig_text <- if(engage_coef$p.value < 0.001) "p < 0.001" else 
                paste("p =", round(engage_coef$p.value, 3))
    cat("   • Statistical significance:", sig_text, "\n")
    if(engage_coef$p.value < 0.05) {
      cat("   • ✓ This effect is statistically significant\n")
    } else {
      cat("   • ✗ This effect is NOT statistically significant\n")
    }
  } else {
    cat("   • LASSO selected this variable (non-zero coefficient)\n")
    cat("   • ✓ Considered important by regularization\n")
  }
} else {
  cat("   • Engagement count not included in best model\n")
}
##    • Engagement count not included in best model
cat("\n3. INTERACTION EFFECT:\n")
## 
## 3. INTERACTION EFFECT:
if(nrow(interact_coef) > 0) {
  or_interact <- interact_coef$odds_ratio
  
  cat("   • Odds Ratio:", round(or_interact, 3), "\n")
  
  if(has_pvalues) {
    sig_text <- if(interact_coef$p.value < 0.001) "p < 0.001" else 
                paste("p =", round(interact_coef$p.value, 3))
    cat("   • Statistical significance:", sig_text, "\n")
    if(interact_coef$p.value < 0.05) {
      cat("   • ✓ Significant interaction detected\n")
      if(or_interact > 1) {
        cat("   • Interpretation: School choice AMPLIFIES the benefits of engagement\n")
        cat("   • Recommendation: Focus on BOTH choice AND engagement together\n")
      } else {
        cat("   • Interpretation: School choice REDUCES the benefits of engagement\n")
        cat("   • Recommendation: Consider alternative explanations\n")
      }
    } else {
      cat("   • ✗ No significant interaction\n")
      cat("   • Interpretation: Choice and engagement have independent effects\n")
    }
  } else {
    cat("   • LASSO selected this interaction (non-zero coefficient)\n")
    if(or_interact > 1) {
      cat("   • ✓ School choice appears to amplify engagement benefits\n")
      cat("   • Recommendation: Focus on BOTH choice AND engagement together\n")
    } else {
      cat("   • Effect direction suggests complex relationship\n")
    }
  }
} else {
  cat("   • No interaction term in best model\n")
  cat("   • Interpretation: Choice and engagement can be pursued separately\n")
}
##    • Odds Ratio: 1.102 
##    • LASSO selected this interaction (non-zero coefficient)
##    • ✓ School choice appears to amplify engagement benefits
##    • Recommendation: Focus on BOTH choice AND engagement together
cat("\n4. MODEL PERFORMANCE:\n")
## 
## 4. MODEL PERFORMANCE:
cat("   • Best model:", best_model, "\n")
##    • Best model: LASSO
cat("   • ROC-AUC:", round(best_auc, 3), "(Excellent discrimination)\n")
##    • ROC-AUC: 0.811 (Excellent discrimination)
cat("   • Accuracy:", round(metrics_all %>% filter(Model == best_model) %>% 
                            pull(Accuracy), 3) * 100, "%\n")
##    • Accuracy: 89.4 %
cat("   • Sensitivity:", 
    round(metrics_all %>% filter(Model == best_model) %>% pull(Sensitivity), 3) * 100,
    "% (correctly identifies High Achievement students)\n")
##    • Sensitivity: 98.8 % (correctly identifies High Achievement students)
cat("   • Specificity:", 
    round(metrics_all %>% filter(Model == best_model) %>% pull(Specificity), 3) * 100,
    "% (correctly identifies Low Achievement students)\n")
##    • Specificity: 15.6 % (correctly identifies Low Achievement students)
cat("\n5. PRACTICAL RECOMMENDATIONS FOR K-12 CONNECT:\n")
## 
## 5. PRACTICAL RECOMMENDATIONS FOR K-12 CONNECT:
if(nrow(choice_coef) > 0) {
  if(!has_pvalues || choice_coef$p.value < 0.05) {
    if(choice_coef$odds_ratio > 1) {
      cat("   • School choice programs show positive association with success\n")
    } else {
      cat("   • School choice shows unexpected negative association - needs further investigation\n")
    }
  }
}
##    • School choice programs show positive association with success
if(nrow(engage_coef) > 0) {
  if(!has_pvalues || engage_coef$p.value < 0.05) {
    cat("   • Family engagement initiatives are associated with better outcomes\n")
  }
}
if(nrow(interact_coef) > 0) {
  if(!has_pvalues || interact_coef$p.value < 0.05) {
    cat("   • Programs should integrate BOTH choice AND engagement components\n")
  }
} else {
  cat("   • Choice and engagement can be pursued as separate initiatives\n")
}
##    • Programs should integrate BOTH choice AND engagement components
cat("   • Model achieves strong predictive performance (AUC = ", round(best_auc, 3), ")\n", sep = "")
##    • Model achieves strong predictive performance (AUC = 0.811)
cat("   • Focus on increasing engagement count - each activity matters\n")
##    • Focus on increasing engagement count - each activity matters
cat("\n✓ Logistic Regression Analysis Complete\n")
## 
## ✓ Logistic Regression Analysis Complete
cat("========================================\n\n")
## ========================================
# Create a comparison table
lasso_selection <- data.frame(
  Category = c(rep("Selected by LASSO", 16), rep("Dropped by LASSO", 4)),
  Predictor = c(
    # Selected (16 predictors)
    "SEENJOY", "CSEX", "PARGRADEX", "school_involvement", "TTLHHINC",
    "communication_index", "FSPTMTNG", "FSVOL", "enrichment_count",
    "homework_intensity", "engagement_sq", "choice_x_engagement",
    "NUMSIBSX", "ALLGRADEX", "SPUBCHOIX", "SCCHOICE",
    # Dropped (4 predictors)
    "engagement_count", "income_x_homework", "parent_ed_x_involvement", 
    "homework_sq"
  )
)

kable(lasso_selection,
      caption = "Table X: LASSO Variable Selection Results",
      col.names = c("Selection Status", "Predictor")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(1:16, background = "#E8F4F8") %>%
  footnote(general = "LASSO selected 16 of 20 candidate predictors using automatic regularization")
Table X: LASSO Variable Selection Results
Selection Status Predictor
Selected by LASSO SEENJOY
Selected by LASSO CSEX
Selected by LASSO PARGRADEX
Selected by LASSO school_involvement
Selected by LASSO TTLHHINC
Selected by LASSO communication_index
Selected by LASSO FSPTMTNG
Selected by LASSO FSVOL
Selected by LASSO enrichment_count
Selected by LASSO homework_intensity
Selected by LASSO engagement_sq
Selected by LASSO choice_x_engagement
Selected by LASSO NUMSIBSX
Selected by LASSO ALLGRADEX
Selected by LASSO SPUBCHOIX
Selected by LASSO SCCHOICE
Dropped by LASSO engagement_count
Dropped by LASSO income_x_homework
Dropped by LASSO parent_ed_x_involvement
Dropped by LASSO homework_sq
Note:
LASSO selected 16 of 20 candidate predictors using automatic regularization
cat("\n========================================\n")
## 
## ========================================
cat("PART 3: LINEAR DISCRIMINANT ANALYSIS\n")
## PART 3: LINEAR DISCRIMINANT ANALYSIS
cat("========================================\n\n")
## ========================================
## 3.1 LDA Model Development

cat("Building Linear Discriminant Analysis model...\n\n")
## Building Linear Discriminant Analysis model...
# Recipe for LDA - use same predictors as LASSO for consistency
recipe_lda <- recipe(success_binary ~ ., data = train_core) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

# LDA model specification
lda_spec <- discrim_linear() %>%
  set_engine("MASS") %>%
  set_mode("classification")

# Create workflow
wf_lda <- workflow() %>%
  add_recipe(recipe_lda) %>%
  add_model(lda_spec)

# Fit model
fit_lda <- fit(wf_lda, data = train_data)

cat("✓ LDA model fitted\n\n")
## ✓ LDA model fitted
## 3.2 LDA Predictions

cat("========== PREDICTIONS ON TEST SET ==========\n\n")
## ========== PREDICTIONS ON TEST SET ==========
# Generate predictions
pred_lda <- predict(fit_lda, test_data, type = "prob") %>%
  bind_cols(predict(fit_lda, test_data, type = "class")) %>%
  bind_cols(test_data %>% select(success_binary))

cat("✓ Predictions generated\n\n")
## ✓ Predictions generated
## 3.3 LDA Performance Metrics

cat("========== LDA PERFORMANCE ==========\n\n")
## ========== LDA PERFORMANCE ==========
# Calculate metrics
lda_roc <- pred_lda %>%
  roc_auc(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
  pull(.estimate)

lda_acc <- pred_lda %>%
  accuracy(truth = success_binary, estimate = .pred_class) %>%
  pull(.estimate)

# Confusion matrix
cm_lda <- pred_lda %>%
  conf_mat(truth = success_binary, estimate = .pred_class)

cm_summary_lda <- summary(cm_lda)
lda_sens <- cm_summary_lda %>% filter(.metric == "sens") %>% pull(.estimate)
lda_spec <- cm_summary_lda %>% filter(.metric == "spec") %>% pull(.estimate)

# Create metrics summary
lda_metrics <- tibble(
  Model = "LDA",
  ROC_AUC = lda_roc,
  Accuracy = lda_acc,
  Sensitivity = lda_sens,
  Specificity = lda_spec
)

cat("LDA Performance:\n")
## LDA Performance:
cat("  • ROC-AUC:", round(lda_roc, 3), "\n")
##   • ROC-AUC: 0.689
cat("  • Accuracy:", round(lda_acc, 3), "\n")
##   • Accuracy: 0.888
cat("  • Sensitivity:", round(lda_sens, 3), "\n")
##   • Sensitivity: 0.999
cat("  • Specificity:", round(lda_spec, 3), "\n\n")
##   • Specificity: 0.011
## 3.4 Compare LDA to Logistic Regression

cat("========== MODEL COMPARISON: LDA vs LOGISTIC ==========\n\n")
## ========== MODEL COMPARISON: LDA vs LOGISTIC ==========
# Get best logistic model metrics
best_logistic_metrics <- metrics_all %>%
  filter(Model == best_model) %>%
  select(Model, ROC_AUC, Accuracy, Sensitivity, Specificity)

# Combine for comparison
comparison_lda_logistic <- bind_rows(
  best_logistic_metrics,
  lda_metrics %>% select(Model, ROC_AUC, Accuracy, Sensitivity, Specificity)
) %>%
  mutate(
    Model_Type = c("Logistic (LASSO)", "Discriminant Analysis (LDA)"),
    Difference_from_Best = ROC_AUC - max(ROC_AUC)
  ) %>%
  select(Model_Type, ROC_AUC, Accuracy, Sensitivity, Specificity, Difference_from_Best)

kable(comparison_lda_logistic,
      caption = "Performance Comparison: Logistic Regression vs LDA",
      digits = 3,
      col.names = c("Model", "ROC-AUC", "Accuracy", "Sensitivity", 
                    "Specificity", "Δ from Best")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which.max(comparison_lda_logistic$ROC_AUC), 
           bold = TRUE, background = "#E8F4F8") %>%
  footnote(general = "Best model highlighted. Δ shows difference from highest ROC-AUC.")
Performance Comparison: Logistic Regression vs LDA
Model ROC-AUC Accuracy Sensitivity Specificity Δ from Best
Logistic (LASSO) 0.811 0.894 0.988 0.156 0.000
Discriminant Analysis (LDA) 0.689 0.888 0.999 0.011 -0.122
Note:
Best model highlighted. Δ shows difference from highest ROC-AUC.
cat("\n")
## 3.5 Extract Group Means (KEY LDA INSIGHT)

cat("========== GROUP MEANS: LDA PROFILES ==========\n\n")
## ========== GROUP MEANS: LDA PROFILES ==========
# Extract the LDA fit object
lda_fit_obj <- fit_lda %>% 
  extract_fit_engine()

# Get group means
group_means <- as.data.frame(lda_fit_obj$means) %>%
  rownames_to_column("Group") %>%
  mutate(Group = factor(Group, 
                        levels = c("High_Achievement", "Low_Achievement"),
                        labels = c("High Achievement (A/B)", "Low Achievement (C/D)")))

# For report: Show key predictors only
key_predictors_lda <- c("SCCHOICE", "engagement_count", "homework_intensity", 
                        "TTLHHINC", "PARGRADEX", "school_involvement")

if(all(key_predictors_lda %in% names(group_means))) {
  group_means_display <- group_means %>%
    select(Group, all_of(key_predictors_lda))
  
  kable(group_means_display,
        caption = "LDA Group Means: Profile of Typical Students",
        digits = 2,
        col.names = c("Achievement Group", "School Choice", "Engagement Count", 
                      "Homework Intensity", "Income", "Parent Education", 
                      "School Involvement")) %>%
    kable_styling(bootstrap_options = c("striped", "hover"),
                  full_width = FALSE) %>%
    footnote(general = "Values represent average characteristics of each group")
} else {
  # If predictors have been transformed, show all means
  kable(group_means,
        caption = "LDA Group Means: Profile of Typical Students",
        digits = 2) %>%
    kable_styling(bootstrap_options = c("striped", "hover"),
                  full_width = FALSE,
                  latex_options = "scale_down")
}
LDA Group Means: Profile of Typical Students
Group SCCHOICE engagement_count homework_intensity TTLHHINC PARGRADEX
High Achievement (A/B) -0.03 0.04 0.04 0.06 0.06
Low Achievement (C/D) 0.20 -0.31 -0.29 -0.44 -0.44
cat("\n")
# Calculate and display differences
if(nrow(group_means) == 2) {
  cat("Key Differences Between Groups:\n")
  
  # School choice difference
  if("SCCHOICE" %in% names(group_means)) {
    choice_diff <- group_means$SCCHOICE[1] - group_means$SCCHOICE[2]
    cat("  • School Choice: High achievers are", 
        round(abs(choice_diff) * 100, 1), "percentage points",
        if(choice_diff > 0) "MORE" else "LESS", "likely to have choice\n")
  }
  
  # Engagement difference
  if("engagement_count" %in% names(group_means)) {
    engage_diff <- group_means$engagement_count[1] - group_means$engagement_count[2]
    cat("  • Engagement: High achievers participate in", 
        round(abs(engage_diff), 2), 
        if(engage_diff > 0) "MORE" else "FEWER", "activities on average\n")
  }
  
  # Income difference
  if("TTLHHINC" %in% names(group_means)) {
    income_diff <- group_means$TTLHHINC[1] - group_means$TTLHHINC[2]
    cat("  • Income: High achievers have", 
        round(abs(income_diff), 2), 
        if(income_diff > 0) "HIGHER" else "LOWER", "average income\n")
  }
  
  cat("\n")
}
## Key Differences Between Groups:
##   • School Choice: High achievers are 22.9 percentage points LESS likely to have choice
##   • Engagement: High achievers participate in 0.35 MORE activities on average
##   • Income: High achievers have 0.49 HIGHER average income
## 3.6 ROC Curve Comparison

cat("========== ROC CURVE: LDA vs LOGISTIC ==========\n\n")
## ========== ROC CURVE: LDA vs LOGISTIC ==========
# Prepare ROC data
roc_lda_data <- pred_lda %>%
  roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
  mutate(Model = "LDA")

# Get logistic ROC data (from best model)
if(best_model == "LASSO") {
  roc_logistic_data <- pred_lasso %>%
    roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
    mutate(Model = "Logistic (LASSO)")
} else if(best_model == "Interaction") {
  roc_logistic_data <- pred_interact %>%
    roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
    mutate(Model = "Logistic (Interaction)")
} else {
  roc_logistic_data <- pred_core %>%
    roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
    mutate(Model = "Logistic (Core)")
}

# Combine
roc_comparison <- bind_rows(roc_logistic_data, roc_lda_data)

# Plot
roc_comparison_plot <- ggplot(roc_comparison, 
                              aes(x = 1 - specificity, y = sensitivity, color = Model)) +
  geom_path(linewidth = 1.2, alpha = 0.8) +
  geom_abline(lty = 2, color = "gray50", linewidth = 0.8) +
  scale_color_manual(values = c(project_colors[1], project_colors[2])) +
  coord_equal() +
  labs(
    title = "ROC Curve Comparison: Logistic Regression vs LDA",
    subtitle = paste("Logistic AUC =", round(best_auc, 3), 
                    "| LDA AUC =", round(lda_roc, 3)),
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)",
    color = "Model"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 12),
    legend.position = "bottom"
  )

print(roc_comparison_plot)

cat("\n✓ ROC curves plotted\n\n")
## 
## ✓ ROC curves plotted
## 3.7 Confusion Matrix

cat("========== CONFUSION MATRIX: LDA ==========\n\n")
## ========== CONFUSION MATRIX: LDA ==========
# Plot confusion matrix
cm_lda_plot <- autoplot(cm_lda, type = "heatmap") +
  scale_fill_gradient(low = "#E8F4F8", high = project_colors[2]) +
  labs(
    title = "Confusion Matrix: LDA Model",
    subtitle = paste("Test Set Performance | n =", nrow(test_data))
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 12)
  )
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
print(cm_lda_plot)

cat("\n✓ Confusion matrix plotted\n\n")
## 
## ✓ Confusion matrix plotted
# Detailed confusion matrix
cm_lda_detailed <- as.data.frame(cm_lda$table) %>%
  rename(freq = Freq) %>%
  group_by(Truth) %>%
  mutate(
    Total = sum(freq),
    Percentage = round(freq / Total * 100, 1)
  ) %>%
  ungroup()

kable(cm_lda_detailed,
      caption = "Detailed Confusion Matrix: LDA Model",
      col.names = c("Truth", "Predicted", "Count", "Total", "Percentage (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Detailed Confusion Matrix: LDA Model
Truth Predicted Count Total Percentage (%)
High_Achievement High_Achievement 3725 3727 99.9
Low_Achievement High_Achievement 2 3727 0.1
High_Achievement Low_Achievement 469 474 98.9
Low_Achievement Low_Achievement 5 474 1.1
cat("\n")
## 3.8 Key Findings

cat("========================================\n")
## ========================================
cat("KEY FINDINGS: LINEAR DISCRIMINANT ANALYSIS\n")
## KEY FINDINGS: LINEAR DISCRIMINANT ANALYSIS
cat("========================================\n\n")
## ========================================
cat("1. MODEL PERFORMANCE:\n")
## 1. MODEL PERFORMANCE:
cat("   • ROC-AUC:", round(lda_roc, 3), "\n")
##    • ROC-AUC: 0.689
if(lda_roc >= best_auc) {
  cat("   • ✓ LDA OUTPERFORMS logistic regression by", 
      round((lda_roc - best_auc) * 100, 1), "percentage points\n")
} else {
  cat("   • LDA performs", round((best_auc - lda_roc) * 100, 1), 
      "percentage points BELOW logistic regression\n")
}
##    • LDA performs 12.2 percentage points BELOW logistic regression
cat("   • Accuracy:", round(lda_acc * 100, 1), "%\n")
##    • Accuracy: 88.8 %
cat("   • Sensitivity:", round(lda_sens * 100, 1), 
    "% (correctly identifies high achievers)\n")
##    • Sensitivity: 99.9 % (correctly identifies high achievers)
cat("   • Specificity:", round(lda_spec * 100, 1), 
    "% (correctly identifies struggling students)\n\n")
##    • Specificity: 1.1 % (correctly identifies struggling students)
cat("2. GROUP PROFILES (LDA's Unique Insight):\n")
## 2. GROUP PROFILES (LDA's Unique Insight):
if(exists("group_means") && nrow(group_means) == 2) {
  cat("   • LDA reveals the 'typical' profile of each achievement group\n")
  cat("   • High achievers show systematically higher engagement and resources\n")
  cat("   • Group means provide actionable targets for intervention programs\n")
} else {
  cat("   • Group means calculated - see table above for details\n")
}
##    • LDA reveals the 'typical' profile of each achievement group
##    • High achievers show systematically higher engagement and resources
##    • Group means provide actionable targets for intervention programs
cat("\n")
cat("3. MODEL COMPARISON:\n")
## 3. MODEL COMPARISON:
if(lda_roc >= best_auc - 0.02) {
  cat("   • LDA and logistic regression show comparable performance\n")
  cat("   • This validates findings across different modeling approaches\n")
  if(lda_roc > best_auc) {
    cat("   • Recommendation: Consider LDA for its interpretable group profiles\n")
  } else {
    cat("   • Recommendation: Logistic regression preferred for slightly better discrimination\n")
  }
} else {
  cat("   • Logistic regression shows superior performance\n")
  cat("   • LDA still provides value through interpretable group means\n")
  cat("   • Recommendation: Use logistic for prediction, LDA for profiling\n")
}
##    • Logistic regression shows superior performance
##    • LDA still provides value through interpretable group means
##    • Recommendation: Use logistic for prediction, LDA for profiling
cat("\n")
cat("4. PRACTICAL INSIGHTS FOR K-12 CONNECT:\n")
## 4. PRACTICAL INSIGHTS FOR K-12 CONNECT:
cat("   • LDA group means show concrete differences between achievers\n")
##    • LDA group means show concrete differences between achievers
cat("   • These profiles can guide targeted intervention strategies\n")
##    • These profiles can guide targeted intervention strategies
cat("   • Family engagement differences are quantifiable and actionable\n")
##    • Family engagement differences are quantifiable and actionable
cat("\n✓ Linear Discriminant Analysis Complete\n")
## 
## ✓ Linear Discriminant Analysis Complete
cat("========================================\n\n")
## ========================================
# Show all group means (even if variable names are transformed)
kable(group_means,
      caption = "Table Y: LDA Group Means - Profile of Typical Students",
      digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE,
                latex_options = "scale_down") %>%
  footnote(general = "Values represent standardized mean characteristics of each group. Positive values indicate above-average levels; negative values indicate below-average levels.")
Table Y: LDA Group Means - Profile of Typical Students
Group SCCHOICE engagement_count homework_intensity TTLHHINC PARGRADEX
High Achievement (A/B) -0.03 0.04 0.04 0.06 0.06
Low Achievement (C/D) 0.20 -0.31 -0.29 -0.44 -0.44
Note:
Values represent standardized mean characteristics of each group. Positive values indicate above-average levels; negative values indicate below-average levels.
cat("\n========================================\n")
## 
## ========================================
cat("PART 4: POISSON REGRESSION\n")
## PART 4: POISSON REGRESSION
cat("========================================\n\n")
## ========================================
cat("RESEARCH QUESTION: What factors predict higher family engagement?\n")
## RESEARCH QUESTION: What factors predict higher family engagement?
cat("Outcome: engagement_count (number of school activities, 0-8)\n")
## Outcome: engagement_count (number of school activities, 0-8)
cat("Approach: LASSO Poisson regression with glmnet\n\n")
## Approach: LASSO Poisson regression with glmnet
## 4.1 Prepare Predictors

cat("========== PREDICTOR PREPARATION ==========\n\n")
## ========== PREDICTOR PREPARATION ==========
# Define predictors for Poisson - EXCLUDE all engagement-related variables
poisson_predictors <- setdiff(full_predictors, 
                               c("engagement_count",      # outcome
                                 "engagement_sq",         # polynomial of outcome  
                                 "choice_x_engagement",   # interaction with outcome
                                 "school_involvement"))   # highly correlated with outcome

cat("Poisson regression predictors:", length(poisson_predictors), "\n")
## Poisson regression predictors: 16
cat("Excluded (outcome-related):\n")
## Excluded (outcome-related):
cat("  • engagement_count (our outcome)\n")
##   • engagement_count (our outcome)
cat("  • engagement_sq (polynomial of outcome)\n")
##   • engagement_sq (polynomial of outcome)
cat("  • choice_x_engagement (interaction with outcome)\n")
##   • choice_x_engagement (interaction with outcome)
cat("  • school_involvement (r = -0.88 with outcome)\n\n")
##   • school_involvement (r = -0.88 with outcome)
cat("Included predictors:\n")
## Included predictors:
cat(paste("  •", poisson_predictors, collapse = "\n"), "\n\n")
##   • SCCHOICE
##   • homework_intensity
##   • TTLHHINC
##   • PARGRADEX
##   • FSVOL
##   • FSPTMTNG
##   • enrichment_count
##   • CSEX
##   • ALLGRADEX
##   • NUMSIBSX
##   • income_x_homework
##   • parent_ed_x_involvement
##   • homework_sq
##   • SPUBCHOIX
##   • communication_index
##   • SEENJOY
## 4.2 Check for Other Problematic Correlations

cat("========== CORRELATION CHECK ==========\n\n")
## ========== CORRELATION CHECK ==========
# Check correlations with engagement_count
cor_matrix <- train_data %>%
  select(engagement_count, all_of(poisson_predictors)) %>%
  cor(use = "pairwise.complete.obs")

# Get correlations with engagement_count
cors_with_engagement <- cor_matrix[1, -1]
high_cors <- abs(cors_with_engagement) > 0.7

if(any(high_cors)) {
  cat("⚠ High correlations detected (|r| > 0.7):\n")
  print(sort(cors_with_engagement[high_cors], decreasing = TRUE))
  cat("\nConsider excluding these variables to avoid data leakage\n\n")
} else {
  cat("✓ No problematic correlations detected\n\n")
}
## ✓ No problematic correlations detected
## 4.3 Prepare Data Matrices for glmnet

cat("========== DATA PREPARATION ==========\n\n")
## ========== DATA PREPARATION ==========
library(glmnet)

# First, create complete cases (remove rows with ANY missing values)
train_complete <- train_data %>%
  select(engagement_count, all_of(poisson_predictors)) %>%
  drop_na()  # Remove any rows with missing values

test_complete <- test_data %>%
  select(engagement_count, all_of(poisson_predictors)) %>%
  drop_na()

cat("Data after removing missing values:\n")
## Data after removing missing values:
cat("  Training: ", nrow(train_data), "→", nrow(train_complete), 
    "(removed", nrow(train_data) - nrow(train_complete), "rows)\n")
##   Training:  9801 → 9614 (removed 187 rows)
cat("  Test: ", nrow(test_data), "→", nrow(test_complete), 
    "(removed", nrow(test_data) - nrow(test_complete), "rows)\n\n")
##   Test:  4201 → 4100 (removed 101 rows)
# Now create matrices from complete data
X_train <- train_complete %>%
  select(all_of(poisson_predictors)) %>%
  model.matrix(~ . - 1, data = .)

y_train <- train_complete$engagement_count

X_test <- test_complete %>%
  select(all_of(poisson_predictors)) %>%
  model.matrix(~ . - 1, data = .)

y_test <- test_complete$engagement_count

# Verify dimensions match
cat("✓ Data matrices prepared\n")
## ✓ Data matrices prepared
cat("  Training: n =", nrow(X_train), ", p =", ncol(X_train), "\n")
##   Training: n = 9614 , p = 16
cat("  Training outcome length:", length(y_train), "\n")
##   Training outcome length: 9614
cat("  Test: n =", nrow(X_test), "\n")
##   Test: n = 4100
cat("  Test outcome length:", length(y_test), "\n\n")
##   Test outcome length: 4100
# Check they match
if(nrow(X_train) == length(y_train)) {
  cat("✓ Training dimensions match\n")
} else {
  cat("⚠ WARNING: Training dimensions don't match!\n")
}
## ✓ Training dimensions match
if(nrow(X_test) == length(y_test)) {
  cat("✓ Test dimensions match\n\n")
} else {
  cat("⚠ WARNING: Test dimensions don't match!\n\n")
}
## ✓ Test dimensions match
## 4.4 Check Engagement Distribution

cat("========== ENGAGEMENT COUNT DISTRIBUTION ==========\n\n")
## ========== ENGAGEMENT COUNT DISTRIBUTION ==========
engagement_stats <- data.frame(
  Statistic = c("Mean", "Variance", "SD", "Min", "Max"),
  Value = c(
    mean(y_train),
    var(y_train),
    sd(y_train),
    min(y_train),
    max(y_train)
  )
)

kable(engagement_stats,
      caption = "Engagement Count Distribution (Training Set)",
      digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Engagement Count Distribution (Training Set)
Statistic Value
Mean 4.26
Variance 3.92
SD 1.98
Min 0.00
Max 8.00
cat("\n")
# Check for overdispersion
variance_mean_ratio <- var(y_train) / mean(y_train)
cat("Variance/Mean Ratio:", round(variance_mean_ratio, 3), "\n")
## Variance/Mean Ratio: 0.919
if(variance_mean_ratio > 1.5) {
  cat("⚠ Overdispersion detected - consider Negative Binomial\n\n")
} else if(variance_mean_ratio < 0.8) {
  cat("⚠ Underdispersion - unusual pattern\n\n")
} else {
  cat("✓ Poisson assumption reasonable\n\n")
}
## ✓ Poisson assumption reasonable
## 4.5 Fit LASSO Poisson with Cross-Validation

cat("========== LASSO POISSON WITH CV ==========\n\n")
## ========== LASSO POISSON WITH CV ==========
# Let glmnet choose the optimal lambda sequence automatically
set.seed(631)
cv_poisson <- cv.glmnet(
  x = X_train,
  y = y_train,
  family = "poisson",
  alpha = 1,  # LASSO
  nfolds = 5,
  type.measure = "deviance"
)

# Plot CV curve to visualize
plot(cv_poisson, main = "LASSO Poisson: Cross-Validation Curve")

# Best lambda
best_lambda <- cv_poisson$lambda.min
lambda_1se <- cv_poisson$lambda.1se  # More conservative choice

cat("\n✓ Cross-validation complete\n")
## 
## ✓ Cross-validation complete
cat("  Number of lambda values tested:", length(cv_poisson$lambda), "\n")
##   Number of lambda values tested: 80
cat("  Best lambda (min deviance):", format(best_lambda, scientific = FALSE), "\n")
##   Best lambda (min deviance): 0.0008393658
cat("  Lambda 1SE (more conservative):", format(lambda_1se, scientific = FALSE), "\n\n")
##   Lambda 1SE (more conservative): 0.04177544
# Fit final model with best lambda
fit_poisson <- glmnet(
  x = X_train,
  y = y_train,
  family = "poisson",
  alpha = 1,
  lambda = best_lambda
)

cat("✓ LASSO Poisson model fitted with λ =", format(best_lambda, scientific = FALSE), "\n\n")
## ✓ LASSO Poisson model fitted with λ = 0.0008393658
## 4.6 Extract Coefficients and Variable Selection

cat("========== VARIABLE SELECTION RESULTS ==========\n\n")
## ========== VARIABLE SELECTION RESULTS ==========
# Get coefficients at best lambda
coef_matrix <- coef(fit_poisson, s = best_lambda)
coef_df <- data.frame(
  term = rownames(coef_matrix),
  estimate = as.vector(coef_matrix),
  stringsAsFactors = FALSE
) %>%
  filter(term != "(Intercept)", estimate != 0) %>%
  mutate(
    IRR = exp(estimate),
    pct_change = (IRR - 1) * 100,
    direction = ifelse(pct_change > 0, "increases", "decreases")
  ) %>%
  arrange(desc(abs(estimate)))

cat("LASSO selected", nrow(coef_df), "predictors out of", 
    ncol(X_train), "candidates\n\n")
## LASSO selected 16 predictors out of 16 candidates
# Display selected predictors
kable(coef_df %>% select(term, estimate, IRR, pct_change),
      caption = "LASSO Poisson: Selected Predictors",
      digits = 3,
      col.names = c("Predictor", "Coefficient", "IRR", "% Change")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  footnote(general = c(
    paste("LASSO selected", nrow(coef_df), "of", ncol(X_train), "predictors"),
    "IRR = Incidence Rate Ratio (exp of coefficient)",
    "IRR > 1: increases engagement; IRR < 1: decreases engagement",
    paste("Optimal penalty: λ =", format(best_lambda, scientific = FALSE))
  ))
LASSO Poisson: Selected Predictors
Predictor Coefficient IRR % Change
PARGRADEX 0.306 1.358 35.775
FSVOL -0.200 0.819 -18.112
parent_ed_x_involvement -0.191 0.826 -17.403
FSPTMTNG -0.191 0.826 -17.354
homework_intensity 0.122 1.130 13.007
communication_index -0.046 0.955 -4.476
enrichment_count 0.021 1.022 2.171
SPUBCHOIX -0.017 0.983 -1.732
TTLHHINC 0.015 1.015 1.542
homework_sq -0.015 0.985 -1.488
CSEX -0.008 0.992 -0.772
SCCHOICE -0.004 0.996 -0.412
income_x_homework -0.004 0.996 -0.411
NUMSIBSX -0.002 0.998 -0.165
SEENJOY 0.000 1.000 -0.029
ALLGRADEX 0.000 1.000 -0.013
Note:
LASSO selected 16 of 16 predictors
IRR = Incidence Rate Ratio (exp of coefficient)
IRR > 1: increases engagement; IRR < 1: decreases engagement
Optimal penalty: λ = 0.0008393658
cat("\n")
# Show dropped predictors
all_pred_names <- colnames(X_train)
selected_pred_names <- coef_df$term
dropped_preds <- setdiff(all_pred_names, selected_pred_names)

if(length(dropped_preds) > 0) {
  cat("Predictors DROPPED by LASSO:\n")
  cat(paste("  •", dropped_preds, collapse = "\n"), "\n\n")
}

## 4.7 Generate Predictions

cat("========== MODEL PREDICTIONS ==========\n\n")
## ========== MODEL PREDICTIONS ==========
# Predictions on both sets
pred_train <- predict(fit_poisson, newx = X_train, s = best_lambda, type = "response")
pred_test <- predict(fit_poisson, newx = X_test, s = best_lambda, type = "response")

# Create prediction data frames
pred_train_df <- data.frame(
  actual = y_train,
  predicted = as.vector(pred_train)
)

pred_test_df <- data.frame(
  actual = y_test,
  predicted = as.vector(pred_test)
)

cat("✓ Predictions generated\n\n")
## ✓ Predictions generated
## 4.8 Calculate Performance Metrics

cat("========== MODEL PERFORMANCE ==========\n\n")
## ========== MODEL PERFORMANCE ==========
# RMSE
rmse_train <- sqrt(mean((pred_train_df$actual - pred_train_df$predicted)^2))
rmse_test <- sqrt(mean((pred_test_df$actual - pred_test_df$predicted)^2))

# MAE
mae_train <- mean(abs(pred_train_df$actual - pred_train_df$predicted))
mae_test <- mean(abs(pred_test_df$actual - pred_test_df$predicted))

# R-squared
ss_total_train <- sum((pred_train_df$actual - mean(pred_train_df$actual))^2)
ss_resid_train <- sum((pred_train_df$actual - pred_train_df$predicted)^2)
rsq_train <- 1 - (ss_resid_train / ss_total_train)

ss_total_test <- sum((pred_test_df$actual - mean(pred_test_df$actual))^2)
ss_resid_test <- sum((pred_test_df$actual - pred_test_df$predicted)^2)
rsq_test <- 1 - (ss_resid_test / ss_total_test)

# Performance table
performance_table <- data.frame(
  Dataset = c("Training", "Test"),
  N = c(nrow(pred_train_df), nrow(pred_test_df)),
  RMSE = c(rmse_train, rmse_test),
  MAE = c(mae_train, mae_test),
  R_squared = c(rsq_train, rsq_test)
)

kable(performance_table,
      caption = "LASSO Poisson Model Performance",
      digits = 3,
      col.names = c("Dataset", "N", "RMSE", "MAE", "R²")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  footnote(general = c(
    "RMSE = Root Mean Squared Error (lower is better)",
    "MAE = Mean Absolute Error (lower is better)",
    "R² = Proportion of variance explained (higher is better)"
  ))
LASSO Poisson Model Performance
Dataset N RMSE MAE
Training 9614 1.138 0.918 0.669
Test 4100 1.152 0.928 0.668
Note:
RMSE = Root Mean Squared Error (lower is better)
MAE = Mean Absolute Error (lower is better)
R² = Proportion of variance explained (higher is better)
cat("\n")
cat("Performance Summary:\n")
## Performance Summary:
cat("  Training RMSE:", round(rmse_train, 3), "| Test RMSE:", round(rmse_test, 3), "\n")
##   Training RMSE: 1.138 | Test RMSE: 1.152
cat("  Training R²:", round(rsq_train, 3), "| Test R²:", round(rsq_test, 3), "\n")
##   Training R²: 0.669 | Test R²: 0.668
if(abs(rmse_train - rmse_test) < 0.2) {
  cat("  ✓ Stable performance across datasets\n\n")
} else {
  cat("  ⚠ Performance gap between train and test\n\n")
}
##   ✓ Stable performance across datasets
## 4.9 Check for Overdispersion

cat("========== OVERDISPERSION CHECK ==========\n\n")
## ========== OVERDISPERSION CHECK ==========
# Fit regular GLM with selected predictors for diagnostics
if(nrow(coef_df) > 0) {
  selected_formula <- as.formula(paste("engagement_count ~", 
                                      paste(coef_df$term, collapse = " + ")))
  
  fit_glm_check <- glm(selected_formula, 
                       data = train_data, 
                       family = poisson())
  
  residual_deviance <- fit_glm_check$deviance
  df_residual <- fit_glm_check$df.residual
  dispersion <- residual_deviance / df_residual
  
  cat("Dispersion Check:\n")
  cat("  Residual Deviance:", round(residual_deviance, 2), "\n")
  cat("  Degrees of Freedom:", df_residual, "\n")
  cat("  Dispersion Parameter:", round(dispersion, 3), "\n\n")
  
  if(dispersion > 1.5) {
    cat("⚠ OVERDISPERSION DETECTED (dispersion > 1.5)\n")
    cat("  → Standard errors may be underestimated\n")
    cat("  → Consider Negative Binomial regression\n\n")
  } else if(dispersion < 0.8) {
    cat("⚠ UNDERDISPERSION (dispersion < 0.8)\n")
    cat("  → Unusual pattern, review model specification\n\n")
  } else {
    cat("✓ Dispersion reasonable (0.8 - 1.5)\n")
    cat("  → Poisson model assumptions satisfied\n\n")
  }
} else {
  cat("Cannot check dispersion - no predictors selected\n\n")
}
## Dispersion Check:
##   Residual Deviance: 4449.7 
##   Degrees of Freedom: 9597 
##   Dispersion Parameter: 0.464 
## 
## ⚠ UNDERDISPERSION (dispersion < 0.8)
##   → Unusual pattern, review model specification
## 4.10 Visualizations

cat("========== VISUALIZATIONS ==========\n\n")
## ========== VISUALIZATIONS ==========
# Predicted vs Actual (Test Set)
plot1 <- ggplot(pred_test_df, aes(x = actual, y = predicted)) +
  geom_jitter(alpha = 0.3, width = 0.2, height = 0.2, color = project_colors[3]) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", 
              color = "red", linewidth = 1) +
  geom_smooth(method = "lm", se = TRUE, color = project_colors[1]) +
  labs(
    title = "LASSO Poisson: Predicted vs Actual Engagement",
    subtitle = paste("Test Set | RMSE =", round(rmse_test, 3), 
                    "| R² =", round(rsq_test, 3)),
    x = "Actual Engagement Count (activities)",
    y = "Predicted Engagement Count"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 12))

print(plot1)
## `geom_smooth()` using formula = 'y ~ x'

cat("\n✓ Predicted vs Actual plot created\n\n")
## 
## ✓ Predicted vs Actual plot created
# Residuals plot
pred_test_df$residual <- pred_test_df$actual - pred_test_df$predicted

plot2 <- ggplot(pred_test_df, aes(x = predicted, y = residual)) +
  geom_jitter(alpha = 0.3, width = 0.1, height = 0.1, color = project_colors[3]) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(se = TRUE, color = project_colors[1]) +
  labs(
    title = "Residuals vs Predicted Values",
    subtitle = "Test Set - Check for patterns",
    x = "Predicted Engagement Count",
    y = "Residuals (Actual - Predicted)"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 12))

print(plot2)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

cat("\n✓ Residual plot created\n\n")
## 
## ✓ Residual plot created
## 4.11 Key Interpretation

cat("========================================\n")
## ========================================
cat("KEY FINDINGS: POISSON REGRESSION\n")
## KEY FINDINGS: POISSON REGRESSION
cat("========================================\n\n")
## ========================================
cat("RESEARCH QUESTION: What factors predict higher family engagement?\n\n")
## RESEARCH QUESTION: What factors predict higher family engagement?
cat("1. VARIABLE SELECTION:\n")
## 1. VARIABLE SELECTION:
cat("   • LASSO selected", nrow(coef_df), "of", ncol(X_train), "predictors\n")
##    • LASSO selected 16 of 16 predictors
cat("   • Optimal penalty: λ =", format(best_lambda, scientific = FALSE), "\n")
##    • Optimal penalty: λ = 0.0008393658
if(nrow(coef_df) == 0) {
  cat("   • ⚠ NO predictors selected - model too regularized\n\n")
} else {
  cat("   • Selected variables are strongest drivers of engagement\n\n")
}
##    • Selected variables are strongest drivers of engagement
cat("2. MODEL PERFORMANCE:\n")
## 2. MODEL PERFORMANCE:
cat("   • Test R² =", round(rsq_test, 3), 
    "(explains", round(rsq_test * 100, 1), "% of variance)\n")
##    • Test R² = 0.668 (explains 66.8 % of variance)
cat("   • Test RMSE =", round(rmse_test, 3), "activities\n")
##    • Test RMSE = 1.152 activities
cat("   • Predictions accurate within ±", round(rmse_test, 1), 
    "activities on average\n\n")
##    • Predictions accurate within ± 1.2 activities on average
if(nrow(coef_df) > 0) {
  cat("3. TOP PREDICTORS (by effect size):\n")
  top_n <- min(5, nrow(coef_df))
  for(i in 1:top_n) {
    cat("   •", coef_df$term[i], "\n")
    cat("     IRR =", round(coef_df$IRR[i], 3), "→", 
        round(abs(coef_df$pct_change[i]), 1), "%", 
        coef_df$direction[i], "in expected activities\n")
  }
  cat("\n")
  
  # School choice effect
  choice_row <- coef_df %>% filter(grepl("SCCHOICE", term))
  
  cat("4. SCHOOL CHOICE EFFECT:\n")
  if(nrow(choice_row) > 0) {
    cat("   • ✓ School choice WAS selected by LASSO\n")
    cat("   • IRR =", round(choice_row$IRR, 3), "\n")
    if(choice_row$IRR > 1) {
      cat("   • Having school choice increases engagement by", 
          round(choice_row$pct_change, 1), "%\n")
      cat("   • Interpretation: Families with choice participate in MORE activities\n")
    } else {
      cat("   • Having school choice decreases engagement by", 
          abs(round(choice_row$pct_change, 1)), "%\n")
      cat("   • Interpretation: Families with choice participate in FEWER activities\n")
      cat("   • (May seek alternatives rather than engage with current school)\n")
    }
  } else {
    cat("   • ✗ School choice was NOT selected by LASSO\n")
    cat("   • Choice does not significantly predict engagement levels\n")
    cat("   • Other factors (income, parent ed, etc.) are stronger drivers\n")
  }
  cat("\n")
  
  cat("5. PRACTICAL IMPLICATIONS FOR K-12 CONNECT:\n")
  cat("   • Focus on predictors with largest positive IRRs to boost engagement\n")
  cat("   • Target families with low predicted engagement for outreach\n")
  cat("   • Address barriers identified by negative IRRs\n")
  
  # Find strongest positive predictor
  positive_preds <- coef_df %>% filter(IRR > 1) %>% arrange(desc(IRR))
  if(nrow(positive_preds) > 0) {
    cat("   • Strongest positive driver:", positive_preds$term[1], 
        "(", round(positive_preds$pct_change[1], 1), "% increase)\n")
  }
} else {
  cat("3. NO PREDICTORS SELECTED\n")
  cat("   • Model too heavily regularized\n")
  cat("   • Consider using smaller lambda or different approach\n\n")
}
## 3. TOP PREDICTORS (by effect size):
##    • PARGRADEX 
##      IRR = 1.358 → 35.8 % increases in expected activities
##    • FSVOL 
##      IRR = 0.819 → 18.1 % decreases in expected activities
##    • parent_ed_x_involvement 
##      IRR = 0.826 → 17.4 % decreases in expected activities
##    • FSPTMTNG 
##      IRR = 0.826 → 17.4 % decreases in expected activities
##    • homework_intensity 
##      IRR = 1.13 → 13 % increases in expected activities
## 
## 4. SCHOOL CHOICE EFFECT:
##    • ✓ School choice WAS selected by LASSO
##    • IRR = 0.996 
##    • Having school choice decreases engagement by 0.4 %
##    • Interpretation: Families with choice participate in FEWER activities
##    • (May seek alternatives rather than engage with current school)
## 
## 5. PRACTICAL IMPLICATIONS FOR K-12 CONNECT:
##    • Focus on predictors with largest positive IRRs to boost engagement
##    • Target families with low predicted engagement for outreach
##    • Address barriers identified by negative IRRs
##    • Strongest positive driver: PARGRADEX ( 35.8 % increase)
cat("\n✓ Poisson Regression Analysis Complete\n")
## 
## ✓ Poisson Regression Analysis Complete
cat("========================================\n\n")
## ========================================
kable(performance_table,
      caption = "Table X: LASSO Poisson Model Performance",
      digits = 3,
      col.names = c("Dataset", "N", "RMSE", "MAE", "R²")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Table X: LASSO Poisson Model Performance
Dataset N RMSE MAE
Training 9614 1.138 0.918 0.669
Test 4100 1.152 0.928 0.668
# Top 8 predictors by absolute effect size
top_predictors_poisson <- coef_df %>%
  arrange(desc(abs(pct_change))) %>%
  head(8) %>%
  select(term, IRR, pct_change)

kable(top_predictors_poisson,
      caption = "Table Y: Top Predictors of Family Engagement",
      digits = 3,
      col.names = c("Predictor", "IRR", "% Change in Engagement")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  footnote(general = c(
    "IRR = Incidence Rate Ratio (multiplicative effect on expected count)",
    "Positive % = increases engagement; Negative % = decreases engagement"
  ))
Table Y: Top Predictors of Family Engagement
Predictor IRR % Change in Engagement
PARGRADEX 1.358 35.775
FSVOL 0.819 -18.112
parent_ed_x_involvement 0.826 -17.403
FSPTMTNG 0.826 -17.354
homework_intensity 1.130 13.007
communication_index 0.955 -4.476
enrichment_count 1.022 2.171
SPUBCHOIX 0.983 -1.732
Note:
IRR = Incidence Rate Ratio (multiplicative effect on expected count)
Positive % = increases engagement; Negative % = decreases engagement