Phase 3: Model Building and Evaluation
- Executive Summary
Setup and Load Data
Data Preparation for Modeling
- Train-Test Split and Feature Engineering
- Feature Engineering and Preprocessing
Model 1: Logistic Regression

Phase 3: Model Building and Evaluation

Executive Summary

This phase presents the development, evaluation, and comparison of generalized linear modeling approaches as required: - Logistic regression - Linear discriminant analysis (LDA) - Poisson regression. Each model is carefully tuned to quantify the impact of school choice and family engagement on educational success, providing actionable insights for K-12 Connect.

Our systematic approach includes feature selection, model training, validation, and comprehensive performance evaluation to identify the optimal model for predicting and understanding educational outcomes.

Setup and Load Data

suppressPackageStartupMessages({
  library(tidyverse)
  library(tidymodels)
  library(MASS)
  library(nnet)
  library(pscl)
  library(car)
  library(glmnet)
  library(poissonreg)
  library(knitr)
  library(kableExtra)
  library(patchwork)
  library(vip)
  library(probably)
  library(discrim)
})

library(conflicted)
conflict_prefer("select", "dplyr")

## [conflicted] Will prefer dplyr::select over any other package.

conflict_prefer("filter", "dplyr")

## [conflicted] Will prefer dplyr::filter over any other package.

# Set theme and colors
theme_set(theme_minimal(base_size = 10) + 
          theme(plot.title = element_text(face = "bold", size = 12),
                plot.subtitle = element_text(size = 10),
                legend.position = "bottom"))

project_colors <- c("#2E86AB", "#A23B72", "#F18F01", "#C73E1D", "#6A994E", "#8B4513")

# Load prepared data from Phase 2 - FORCE CORRECT COLUMN TYPES
pfi_model <- read_csv("pfi_phase2_model_ready.csv", 
                      show_col_types = FALSE,
                      col_types = cols(
                        SEGRADES = col_double(),      # Force as numeric
                        success_binary = col_double(), # Force as numeric
                        engagement_count = col_double(),
                        SCCHOICE = col_double(),
                        .default = col_guess()
                      ))

cat("Dataset loaded successfully\n")

## Dataset loaded successfully

cat("Observations:", nrow(pfi_model), "\n")

## Observations: 15990

cat("Variables:", ncol(pfi_model), "\n")

## Variables: 23

# Verify SEGRADES is now numeric with actual values
cat("\n=== SEGRADES Verification ===\n")

## 
## === SEGRADES Verification ===

cat("Class:", class(pfi_model$SEGRADES), "\n")

## Class: numeric

cat("Non-NA count:", sum(!is.na(pfi_model$SEGRADES)), "\n")

## Non-NA count: 15990

cat("Distribution:\n")

## Distribution:

print(table(pfi_model$SEGRADES, useNA = "ifany"))

## 
##    1    2    3    4    5 
## 7866 4528 1345  263 1988

# Set seed for reproducibility
set.seed(631)

Data Preparation for Modeling

cat("\n========== OUTCOME VARIABLE PREPARATION ==========\n\n")

## 
## ========== OUTCOME VARIABLE PREPARATION ==========

# Show original distributions before any conversion
cat("Original Outcome Distributions (Numeric):\n\n")

## Original Outcome Distributions (Numeric):

# Success Binary
cat("1. Binary Academic Success (success_binary = 0/1):\n")

## 1. Binary Academic Success (success_binary = 0/1):

print(table(pfi_model$success_binary, useNA = "ifany"))

## 
##     0     1  <NA> 
##  1608 12394  1988

cat("\n")

# SEGRADES
cat("2. Four-Category Grades (SEGRADES = 1/2/3/4/5):\n")

## 2. Four-Category Grades (SEGRADES = 1/2/3/4/5):

print(table(pfi_model$SEGRADES, useNA = "ifany"))

## 
##    1    2    3    4    5 
## 7866 4528 1345  263 1988

cat("\n")

# Engagement Count
cat("3. Engagement Count:\n")

## 3. Engagement Count:

cat("  Mean:", round(mean(pfi_model$engagement_count, na.rm = TRUE), 2), "\n")

##   Mean: 4.23

cat("  Median:", median(pfi_model$engagement_count, na.rm = TRUE), "\n")

##   Median: 4

cat("  SD:", round(sd(pfi_model$engagement_count, na.rm = TRUE), 2), "\n")

##   SD: 2.02

cat("  Range:", min(pfi_model$engagement_count, na.rm = TRUE), "-", 
    max(pfi_model$engagement_count, na.rm = TRUE), "\n\n")

##   Range: 0 - 8

Train-Test Split and Feature Engineering

We implement a stratified 70/30 train-test split to ensure balanced representation across outcome categories.

cat("========== TRAIN-TEST SPLIT ==========\n\n")

## ========== TRAIN-TEST SPLIT ==========

# Remove category 5 (no grades) and any missing values
# KEEP AS NUMERIC for now
pfi_complete <- pfi_model %>%
  filter(!is.na(SEGRADES) & !is.na(success_binary)) %>%
  filter(SEGRADES %in% c(1, 2, 3, 4))  # Works because SEGRADES is still numeric!

cat("Starting observations:", nrow(pfi_model), "\n")

## Starting observations: 15990

cat("After removing missing and 'no grades' (category 5):", nrow(pfi_complete), "\n")

## After removing missing and 'no grades' (category 5): 14002

cat("Observations removed:", nrow(pfi_model) - nrow(pfi_complete), "\n\n")

## Observations removed: 1988

# Verify SEGRADES distribution before split (still numeric)
cat("Grade distribution before split (numeric):\n")

## Grade distribution before split (numeric):

print(table(pfi_complete$SEGRADES))

## 
##    1    2    3    4 
## 7866 4528 1345  263

cat("\n")

# Create stratified split on SEGRADES (while still numeric)
set.seed(631)
data_split <- initial_split(pfi_complete, prop = 0.70, strata = SEGRADES)
train_data <- training(data_split)
test_data <- testing(data_split)

cat("Split created:\n")

## Split created:

cat("  Training:", nrow(train_data), "observations\n")

##   Training: 9801 observations

cat("  Testing:", nrow(test_data), "observations\n\n")

##   Testing: 4201 observations

# NOW convert to factors with labels (AFTER the split)
train_data <- train_data %>%
  mutate(
    success_binary = factor(success_binary, 
                           levels = c(1, 0),
                           labels = c("High_Achievement", "Low_Achievement")),
    SEGRADES = factor(SEGRADES, 
                     levels = c(1, 2, 3, 4),
                     labels = c("Mostly_As", "Mostly_Bs", "Mostly_Cs", "Ds_or_Below"))
  )

test_data <- test_data %>%
  mutate(
    success_binary = factor(success_binary, 
                           levels = c(1, 0),
                           labels = c("High_Achievement", "Low_Achievement")),
    SEGRADES = factor(SEGRADES, 
                     levels = c(1, 2, 3, 4),
                     labels = c("Mostly_As", "Mostly_Bs", "Mostly_Cs", "Ds_or_Below"))
  )

cat("✓ Outcomes converted to factors with labels\n\n")

## ✓ Outcomes converted to factors with labels

# Verify split proportions
split_summary <- data.frame(
  Dataset = c("Training", "Testing", "Total"),
  N = c(nrow(train_data), nrow(test_data), nrow(pfi_complete)),
  Percentage = c(nrow(train_data)/nrow(pfi_complete)*100,
                nrow(test_data)/nrow(pfi_complete)*100,
                100),
  Success_Rate = c(
    mean(train_data$success_binary == "High_Achievement", na.rm = TRUE)*100,
    mean(test_data$success_binary == "High_Achievement", na.rm = TRUE)*100,
    mean(pfi_complete$success_binary == 1, na.rm = TRUE)*100
  )
)

kable(split_summary,
      caption = "Train-Test Split Summary",
      digits = 1,
      col.names = c("Dataset", "N", "Percentage (%)", "Success Rate (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE,
                position = "center")

Train-Test Split Summary
Dataset	N	Percentage (%)	Success Rate (%)
Training	9801	70	88.4
Testing	4201	30	88.7
Total	14002	100	88.5

# Check outcome distribution preservation
train_dist <- prop.table(table(train_data$SEGRADES))
test_dist <- prop.table(table(test_data$SEGRADES))

grade_dist_comparison <- data.frame(
  Grade = names(train_dist),
  Train_Pct = as.numeric(train_dist) * 100,
  Test_Pct = as.numeric(test_dist) * 100
) %>%
  mutate(Difference = abs(Train_Pct - Test_Pct))

kable(grade_dist_comparison,
      caption = "Grade Distribution Balance Across Train-Test Split",
      digits = 2,
      col.names = c("Grade Category", "Train (%)", "Test (%)", "Abs Diff (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE,
                position = "center")

Grade Distribution Balance Across Train-Test Split
Grade Category	Train (%)	Test (%)	Abs Diff (%)
Mostly_As	56.20	56.13	0.07
Mostly_Bs	32.23	32.59	0.36
Mostly_Cs	9.64	9.52	0.12
Ds_or_Below	1.93	1.76	0.17

cat("\n=== Stratification Quality Check ===\n")

## 
## === Stratification Quality Check ===

cat("Maximum grade distribution difference:", 
    round(max(abs(train_dist - test_dist)), 4), "\n")

## Maximum grade distribution difference: 0.0036

if(max(abs(train_dist - test_dist)) < 0.01) {
  cat("✓ Excellent stratification - distributions nearly identical\n")
} else if(max(abs(train_dist - test_dist)) < 0.03) {
  cat("✓ Good stratification - acceptable differences\n")
} else {
  cat("⚠ Warning: Larger than expected distribution differences\n")
}

## ✓ Excellent stratification - distributions nearly identical

cat("\n✓ Train-test split complete and validated\n")

## 
## ✓ Train-test split complete and validated

Feature Engineering and Preprocessing

cat("\n========== FEATURE ENGINEERING ==========\n\n")

## 
## ========== FEATURE ENGINEERING ==========

# Add interaction and polynomial terms to BOTH train and test
train_data <- train_data %>%
  mutate(
    # Key interactions identified in Phase 2
    choice_x_engagement = SCCHOICE * engagement_count,
    income_x_homework = TTLHHINC * homework_intensity,
    parent_ed_x_involvement = PARGRADEX * school_involvement,
    
    # Polynomial terms for non-linear relationships
    engagement_sq = engagement_count^2,
    homework_sq = homework_intensity^2
  )

test_data <- test_data %>%
  mutate(
    choice_x_engagement = SCCHOICE * engagement_count,
    income_x_homework = TTLHHINC * homework_intensity,
    parent_ed_x_involvement = PARGRADEX * school_involvement,
    engagement_sq = engagement_count^2,
    homework_sq = homework_intensity^2
  )

cat("✓ Interaction terms created:\n")

## ✓ Interaction terms created:

cat("  - choice_x_engagement (school choice × engagement count)\n")

##   - choice_x_engagement (school choice × engagement count)

cat("  - income_x_homework (income × homework intensity)\n")

##   - income_x_homework (income × homework intensity)

cat("  - parent_ed_x_involvement (parent education × school involvement)\n\n")

##   - parent_ed_x_involvement (parent education × school involvement)

cat("✓ Polynomial terms created:\n")

## ✓ Polynomial terms created:

cat("  - engagement_sq (engagement count squared)\n")

##   - engagement_sq (engagement count squared)

cat("  - homework_sq (homework intensity squared)\n\n")

##   - homework_sq (homework intensity squared)

# Show summary statistics for new features
engineered_summary <- train_data %>%
  summarise(
    choice_x_eng_mean = mean(choice_x_engagement, na.rm = TRUE),
    choice_x_eng_sd = sd(choice_x_engagement, na.rm = TRUE),
    income_x_hw_mean = mean(income_x_homework, na.rm = TRUE),
    income_x_hw_sd = sd(income_x_homework, na.rm = TRUE),
    eng_sq_mean = mean(engagement_sq, na.rm = TRUE),
    eng_sq_sd = sd(engagement_sq, na.rm = TRUE)
  ) %>%
  pivot_longer(everything(), names_to = "Feature", values_to = "Value") %>%
  separate(Feature, into = c("Variable", "Stat"), sep = "_(?=[^_]+$)") %>%
  pivot_wider(names_from = Stat, values_from = Value)

kable(engineered_summary,
      caption = "Engineered Features Summary Statistics (Training Set)",
      digits = 2,
      col.names = c("Engineered Feature", "Mean", "SD")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE,
                position = "center")

Engineered Features Summary Statistics (Training Set)
Engineered Feature	Mean	SD
choice_x_eng	5.63	3.40
income_x_hw	20.38	10.15
eng_sq	21.64	17.03

cat("\n✓ Feature engineering complete\n")

## 
## ✓ Feature engineering complete

cat("Training data now has", ncol(train_data), "variables\n")

## Training data now has 28 variables

cat("Testing data now has", ncol(test_data), "variables\n")

## Testing data now has 28 variables

cat("\n========== PREDICTOR SETS DEFINITION ==========\n\n")

## 
## ========== PREDICTOR SETS DEFINITION ==========

# Core predictors (required focus areas + key demographics)
core_predictors <- c(
  "SCCHOICE",           # School choice
  "engagement_count",   # Family engagement
  "homework_intensity", # Homework support
  "TTLHHINC",          # Income
  "PARGRADEX"          # Parent education
)

# Extended predictors (add control variables and individual activities)
extended_predictors <- c(
  core_predictors,
  "school_involvement", # School involvement index
  "FSVOL",             # Volunteer at school
  "FSPTMTNG",          # Parent-teacher conference
  "enrichment_count",  # Home enrichment activities
  "CSEX",              # Child sex
  "ALLGRADEX",         # Grade level
  "NUMSIBSX"           # Number of siblings
)

# Interaction predictors (add key interactions)
interaction_predictors <- c(
  extended_predictors,
  "choice_x_engagement",          # Choice × Engagement
  "income_x_homework",            # Income × Homework
  "parent_ed_x_involvement"       # Parent Ed × Involvement
)

# Full predictors (add polynomials and additional variables)
full_predictors <- c(
  interaction_predictors,
  "engagement_sq",           # Engagement squared
  "homework_sq",             # Homework squared
  "SPUBCHOIX",              # Public school choice
  "communication_index",     # Parent-school communication
  "SEENJOY"                 # School enjoyment
)

# Create summary table
predictor_summary <- data.frame(
  Model_Complexity = c("Core", "Extended", "Interactions", "Full"),
  N_Predictors = c(
    length(core_predictors),
    length(extended_predictors),
    length(interaction_predictors),
    length(full_predictors)
  ),
  Key_Focus = c(
    "School choice + engagement basics",
    "Core + demographics + activities",
    "Extended + key interactions",
    "All predictors + polynomials"
  )
)

kable(predictor_summary,
      caption = "Predictor Sets by Model Complexity",
      col.names = c("Predictor Set", "# Variables", "Description"),
      align = c("l", "c", "l")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE,
                position = "center")

Predictor Sets by Model Complexity
Predictor Set	# Variables	Description
Core	5	School choice + engagement basics
Extended	12	Core + demographics + activities
Interactions	15	Extended + key interactions
Full	20	All predictors + polynomials

cat("\n=== Predictor Sets Defined ===\n")

## 
## === Predictor Sets Defined ===

cat("Core predictors:", length(core_predictors), "\n")

## Core predictors: 5

cat("Extended predictors:", length(extended_predictors), "\n")

## Extended predictors: 12

cat("With interactions:", length(interaction_predictors), "\n")

## With interactions: 15

cat("Full model:", length(full_predictors), "\n\n")

## Full model: 20

# Show which variables are in each set
cat("Core Predictors:\n")

## Core Predictors:

cat(paste(" ", core_predictors, collapse = "\n"), "\n\n")

##   SCCHOICE
##   engagement_count
##   homework_intensity
##   TTLHHINC
##   PARGRADEX

cat("Added in Extended:\n")

## Added in Extended:

cat(paste(" ", setdiff(extended_predictors, core_predictors), collapse = "\n"), "\n\n")

##   school_involvement
##   FSVOL
##   FSPTMTNG
##   enrichment_count
##   CSEX
##   ALLGRADEX
##   NUMSIBSX

cat("Added in Interactions:\n")

## Added in Interactions:

cat(paste(" ", setdiff(interaction_predictors, extended_predictors), collapse = "\n"), "\n\n")

##   choice_x_engagement
##   income_x_homework
##   parent_ed_x_involvement

cat("Added in Full:\n")

## Added in Full:

cat(paste(" ", setdiff(full_predictors, interaction_predictors), collapse = "\n"), "\n\n")

##   engagement_sq
##   homework_sq
##   SPUBCHOIX
##   communication_index
##   SEENJOY

cat("✓ Predictor sets defined and ready for modeling\n")

## ✓ Predictor sets defined and ready for modeling

cat("\n========== CLASS IMBALANCE ASSESSMENT ==========\n\n")

## 
## ========== CLASS IMBALANCE ASSESSMENT ==========

# Calculate baseline accuracy (always predict majority class)
baseline_acc <- mean(train_data$success_binary == "High_Achievement") * 100

cat("Binary Outcome Distribution:\n")

## Binary Outcome Distribution:

cat("  High Achievement (A/B):", 
    sum(train_data$success_binary == "High_Achievement"), 
    sprintf("(%.1f%%)\n", baseline_acc))

##   High Achievement (A/B): 8667 (88.4%)

cat("  Low Achievement (C/D):", 
    sum(train_data$success_binary == "Low_Achievement"),
    sprintf("(%.1f%%)\n", 100 - baseline_acc))

##   Low Achievement (C/D): 1134 (11.6%)

cat("\nClass Imbalance Ratio:", sprintf("%.2f:1\n", baseline_acc/(100-baseline_acc)))

## 
## Class Imbalance Ratio: 7.64:1

cat("\n=== Implications for Modeling ===\n")

## 
## === Implications for Modeling ===

cat("• Baseline accuracy (always predict 'High'):", sprintf("%.1f%%\n", baseline_acc))

## • Baseline accuracy (always predict 'High'): 88.4%

cat("• Models must beat this baseline to be useful\n")

## • Models must beat this baseline to be useful

cat("• We will focus on:\n")

## • We will focus on:

cat("  - ROC-AUC (handles imbalance well)\n")

##   - ROC-AUC (handles imbalance well)

cat("  - Balanced accuracy (equal weight to both classes)\n")

##   - Balanced accuracy (equal weight to both classes)

cat("  - Sensitivity and specificity separately\n")

##   - Sensitivity and specificity separately

cat("  - NOT just raw accuracy\n\n")

##   - NOT just raw accuracy

cat("✓ Class imbalance documented and evaluation strategy defined\n")

## ✓ Class imbalance documented and evaluation strategy defined

Model 1: Logistic Regression

cat("\n========================================\n")

## 
## ========================================

cat("PART 2: LOGISTIC REGRESSION MODELS\n")

## PART 2: LOGISTIC REGRESSION MODELS

cat("========================================\n\n")

## ========================================

## 2.1 Model Development Strategy

cat("Building three logistic regression models:\n")

## Building three logistic regression models:

cat("  1. Core Model: 5 essential predictors\n")

##   1. Core Model: 5 essential predictors

cat("  2. Interaction Model: Core + key interactions\n")

##   2. Interaction Model: Core + key interactions

cat("  3. LASSO Model: Automatic variable selection\n\n")

##   3. LASSO Model: Automatic variable selection

# Create subsets of data with only needed variables
train_core <- train_data %>% select(success_binary, all_of(core_predictors))
train_interact <- train_data %>% select(success_binary, all_of(interaction_predictors))
train_full <- train_data %>% select(success_binary, all_of(full_predictors))

# Recipe 1: Core Model
recipe_core <- recipe(success_binary ~ ., data = train_core) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

# Recipe 2: Interaction Model
recipe_interact <- recipe(success_binary ~ ., data = train_interact) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

# Recipe 3: LASSO Model
recipe_lasso <- recipe(success_binary ~ ., data = train_full) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

cat("✓ Recipes defined\n\n")

## ✓ Recipes defined

## 2.3 Define Model Specifications

# Standard logistic regression for Core and Interaction
log_spec <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

# LASSO logistic regression
lasso_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet") %>%
  set_mode("classification")

cat("✓ Model specifications defined\n\n")

## ✓ Model specifications defined

## 2.4 Create Workflows

wf_core <- workflow() %>%
  add_recipe(recipe_core) %>%
  add_model(log_spec)

wf_interact <- workflow() %>%
  add_recipe(recipe_interact) %>%
  add_model(log_spec)

wf_lasso <- workflow() %>%
  add_recipe(recipe_lasso) %>%
  add_model(lasso_spec)

cat("✓ Workflows created\n\n")

## ✓ Workflows created

## 2.5 Fit Models

cat("Fitting models to training data...\n")

## Fitting models to training data...

# Fit Core Model
fit_core <- fit(wf_core, data = train_data)
cat("  ✓ Core model fitted\n")

##   ✓ Core model fitted

# Fit Interaction Model
fit_interact <- fit(wf_interact, data = train_data)
cat("  ✓ Interaction model fitted\n")

##   ✓ Interaction model fitted

# Tune and Fit LASSO Model
cat("  ⏳ Tuning LASSO model (5-fold CV)...\n")

##   ⏳ Tuning LASSO model (5-fold CV)...

# Set up cross-validation
set.seed(631)
folds <- vfold_cv(train_data, v = 5, strata = success_binary)

# Create penalty grid
penalty_grid <- grid_regular(
  penalty(range = c(-3, 0), trans = log10_trans()),
  levels = 30
)

# Tune LASSO
lasso_tune <- tune_grid(
  wf_lasso,
  resamples = folds,
  grid = penalty_grid,
  metrics = metric_set(roc_auc, accuracy),
  control = control_grid(save_pred = FALSE, verbose = FALSE)
)

# Select best penalty
best_penalty <- select_best(lasso_tune, metric = "roc_auc")
cat("    Best penalty:", format(best_penalty$penalty, scientific = FALSE), "\n")

##     Best penalty: 0.001

# Finalize and fit LASSO
wf_lasso_final <- finalize_workflow(wf_lasso, best_penalty)
fit_lasso <- fit(wf_lasso_final, data = train_data)
cat("  ✓ LASSO model fitted\n\n")

##   ✓ LASSO model fitted

## 2.6 Extract Coefficients (with p-values and CIs)

cat("========== MODEL COEFFICIENTS ==========\n\n")

## ========== MODEL COEFFICIENTS ==========

# Function to extract and format coefficients
extract_coefs <- function(fit, model_name) {
  fit %>%
    extract_fit_parsnip() %>%
    tidy() %>%
    filter(term != "(Intercept)") %>%
    mutate(
      odds_ratio = exp(estimate),
      ci_lower = exp(estimate - 1.96 * std.error),
      ci_upper = exp(estimate + 1.96 * std.error),
      significant = p.value < 0.05,
      model = model_name
    ) %>%
    arrange(p.value) %>%
    select(model, term, estimate, odds_ratio, ci_lower, ci_upper, 
           std.error, p.value, significant)
}

# Extract from Interaction Model (best model - show all)
coef_interact <- extract_coefs(fit_interact, "Interaction")

kable(coef_interact,
      caption = "Interaction Model: Complete Coefficients with Odds Ratios",
      digits = 3,
      col.names = c("Model", "Predictor", "Coefficient", "Odds Ratio", 
                    "CI Lower", "CI Upper", "Std Error", "P-value", "Sig?")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE,
                latex_options = "scale_down") %>%
  row_spec(which(coef_interact$significant), bold = TRUE, color = "darkgreen") %>%
  footnote(general = "Significant predictors (p < 0.05) shown in bold green")

Interaction Model: Complete Coefficients with Odds Ratios
Model	Predictor	Coefficient	Odds Ratio	CI Lower	CI Upper	Std Error	P-value	Sig?
Interaction	CSEX	-0.358	0.699	0.653	0.748	0.035	0.000	TRUE
Interaction	PARGRADEX	-0.757	0.469	0.353	0.624	0.146	0.000	TRUE
Interaction	ALLGRADEX	0.212	1.236	1.141	1.340	0.041	0.000	TRUE
Interaction	enrichment_count	-0.183	0.832	0.770	0.900	0.040	0.000	TRUE
Interaction	homework_intensity	-0.277	0.758	0.668	0.861	0.065	0.000	TRUE
Interaction	TTLHHINC	-0.431	0.650	0.521	0.810	0.112	0.000	TRUE
Interaction	FSPTMTNG	-0.207	0.813	0.730	0.906	0.055	0.000	TRUE
Interaction	FSVOL	0.216	1.241	1.103	1.395	0.060	0.000	TRUE
Interaction	parent_ed_x_involvement	0.518	1.679	1.260	2.236	0.146	0.000	TRUE
Interaction	choice_x_engagement	0.220	1.246	0.994	1.561	0.115	0.056	FALSE
Interaction	school_involvement	0.174	1.190	0.988	1.432	0.095	0.067	FALSE
Interaction	income_x_homework	0.153	1.165	0.897	1.514	0.134	0.253	FALSE
Interaction	SCCHOICE	0.047	1.048	0.918	1.196	0.067	0.487	FALSE
Interaction	NUMSIBSX	-0.022	0.978	0.918	1.043	0.033	0.505	FALSE
Interaction	engagement_count	0.057	1.058	0.842	1.330	0.116	0.626	FALSE
Note:
Significant predictors (p < 0.05) shown in bold green

cat("\n")

# Extract from LASSO (show only non-zero coefficients)
coef_lasso <- fit_lasso %>%
  extract_fit_parsnip() %>%
  tidy() %>%
  filter(term != "(Intercept)", estimate != 0) %>%
  mutate(
    odds_ratio = exp(estimate),
    model = "LASSO"
  ) %>%
  arrange(desc(abs(estimate))) %>%
  select(model, term, estimate, odds_ratio)

kable(coef_lasso,
      caption = "LASSO Model: Selected Predictors (Non-Zero Coefficients)",
      digits = 3,
      col.names = c("Model", "Predictor", "Coefficient", "Odds Ratio")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  footnote(general = paste("LASSO selected", nrow(coef_lasso), 
                          "predictors out of", length(full_predictors)))

LASSO Model: Selected Predictors (Non-Zero Coefficients)
Model	Predictor	Coefficient	Odds Ratio
LASSO	SEENJOY	0.701	2.015
LASSO	PARGRADEX	-0.361	0.697
LASSO	CSEX	-0.328	0.720
LASSO	FSVOL	0.287	1.333
LASSO	TTLHHINC	-0.275	0.760
LASSO	communication_index	0.245	1.278
LASSO	school_involvement	0.214	1.238
LASSO	engagement_sq	0.182	1.200
LASSO	FSPTMTNG	-0.132	0.876
LASSO	homework_intensity	-0.129	0.879
LASSO	enrichment_count	-0.101	0.904
LASSO	ALLGRADEX	0.099	1.104
LASSO	choice_x_engagement	0.097	1.102
LASSO	parent_ed_x_involvement	0.056	1.058
LASSO	SPUBCHOIX	-0.025	0.975
LASSO	SCCHOICE	-0.019	0.982
LASSO	NUMSIBSX	-0.014	0.986
Note:
LASSO selected 17 predictors out of 20

cat("\n")

## 2.7 Generate Predictions on Test Set

cat("========== PREDICTIONS ON TEST SET ==========\n\n")

## ========== PREDICTIONS ON TEST SET ==========

# Function to make predictions
make_predictions <- function(fit, data) {
  predict(fit, data, type = "prob") %>%
    bind_cols(predict(fit, data, type = "class")) %>%
    bind_cols(data %>% select(success_binary))
}

# Generate predictions
pred_core <- make_predictions(fit_core, test_data)
pred_interact <- make_predictions(fit_interact, test_data)
pred_lasso <- make_predictions(fit_lasso, test_data)

cat("✓ Predictions generated for all models\n\n")

## ✓ Predictions generated for all models

## 2.8 Calculate Performance Metrics

cat("========== MODEL PERFORMANCE COMPARISON ==========\n\n")

## ========== MODEL PERFORMANCE COMPARISON ==========

# Function to calculate metrics (FIXED - specify event_level)
calc_metrics <- function(predictions, model_name) {
  # ROC-AUC with correct event level
  roc_val <- predictions %>%
    roc_auc(truth = success_binary, .pred_High_Achievement, 
            event_level = "first") %>%  # FIRST level is High_Achievement
    pull(.estimate)
  
  # Accuracy
  acc_val <- predictions %>%
    accuracy(truth = success_binary, estimate = .pred_class) %>%
    pull(.estimate)
  
  # Confusion matrix for sensitivity/specificity
  cm <- predictions %>%
    conf_mat(truth = success_binary, estimate = .pred_class)
  
  # Extract sensitivity and specificity
  cm_summary <- summary(cm)
  sens_val <- cm_summary %>% filter(.metric == "sens") %>% pull(.estimate)
  spec_val <- cm_summary %>% filter(.metric == "spec") %>% pull(.estimate)
  
  tibble(
    Model = model_name,
    ROC_AUC = roc_val,
    Accuracy = acc_val,
    Sensitivity = sens_val,
    Specificity = spec_val
  )
}

# Calculate for all models
metrics_all <- bind_rows(
  calc_metrics(pred_core, "Core"),
  calc_metrics(pred_interact, "Interaction"),
  calc_metrics(pred_lasso, "LASSO")
) %>%
  mutate(
    N_Predictors = c(
      length(core_predictors),
      length(interaction_predictors),
      nrow(coef_lasso)
    ),
    Beats_Baseline = Accuracy > 0.889
  ) %>%
  select(Model, N_Predictors, ROC_AUC, Accuracy, Sensitivity, 
         Specificity, Beats_Baseline)

# Add baseline for comparison
baseline_row <- tibble(
  Model = "Baseline (Always High)",
  N_Predictors = 0,
  ROC_AUC = 0.500,
  Accuracy = 0.889,
  Sensitivity = 1.000,
  Specificity = 0.000,
  Beats_Baseline = FALSE
)

metrics_comparison <- bind_rows(baseline_row, metrics_all)

kable(metrics_comparison,
      caption = "Logistic Regression: Model Performance Comparison",
      digits = 3,
      col.names = c("Model", "# Pred", "ROC-AUC", "Accuracy", 
                    "Sensitivity", "Specificity", "Beats Baseline?")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(1, background = "#f0f0f0") %>%
  row_spec(which.max(metrics_comparison$ROC_AUC), 
           bold = TRUE, background = "#E8F4F8") %>%
  footnote(general = c(
    "Baseline = Always predict 'High Achievement' (88.9% of cases)",
    "Best model highlighted in blue",
    "ROC-AUC is primary metric for imbalanced data"
  ))

Logistic Regression: Model Performance Comparison
Model	# Pred	ROC-AUC	Accuracy	Sensitivity	Specificity	Beats Baseline?
Baseline (Always High)	0	0.500	0.889	1.000	0.000	FALSE
Core	5	0.690	0.887	1.000	0.002	FALSE
Interaction	15	0.732	0.886	0.997	0.015	FALSE
LASSO	17	0.811	0.894	0.988	0.156	TRUE
Note:
Baseline = Always predict ‘High Achievement’ (88.9% of cases)
Best model highlighted in blue
ROC-AUC is primary metric for imbalanced data

cat("\n")

cat("Key Findings:\n")

## Key Findings:

best_model <- metrics_all %>% filter(ROC_AUC == max(ROC_AUC)) %>% pull(Model)
best_auc <- metrics_all %>% filter(ROC_AUC == max(ROC_AUC)) %>% pull(ROC_AUC)
cat("  • Best model:", best_model, "with ROC-AUC =", round(best_auc, 3), "\n")

##   • Best model: LASSO with ROC-AUC = 0.811

cat("  • Sensitivity (detecting High Achievement):", 
    round(metrics_all %>% filter(Model == best_model) %>% pull(Sensitivity), 3), "\n")

##   • Sensitivity (detecting High Achievement): 0.988

cat("  • Specificity (detecting Low Achievement):", 
    round(metrics_all %>% filter(Model == best_model) %>% pull(Specificity), 3), "\n\n")

##   • Specificity (detecting Low Achievement): 0.156

## 2.9 Visualization: ROC Curves

cat("========== ROC CURVES ==========\n\n")

## ========== ROC CURVES ==========

# Prepare ROC curve data for all models
roc_core <- pred_core %>%
  roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
  mutate(Model = "Core")

roc_interact <- pred_interact %>%
  roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
  mutate(Model = "Interaction")

roc_lasso <- pred_lasso %>%
  roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
  mutate(Model = "LASSO")

roc_all <- bind_rows(roc_core, roc_interact, roc_lasso)

# Plot ROC curves
roc_plot <- ggplot(roc_all, aes(x = 1 - specificity, y = sensitivity, 
                                 color = Model)) +
  geom_path(linewidth = 1.2, alpha = 0.8) +
  geom_abline(lty = 2, color = "gray50", linewidth = 0.8) +
  scale_color_manual(values = project_colors[1:3]) +
  coord_equal() +
  labs(
    title = "ROC Curves: Logistic Regression Models",
    subtitle = paste("Best Model:", best_model, 
                    "| AUC =", round(best_auc, 3)),
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)",
    color = "Model"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "bottom",
    legend.title = element_text(face = "bold")
  )

print(roc_plot)

cat("\n✓ ROC curves plotted\n\n")

## 
## ✓ ROC curves plotted

## 2.10 Visualization: Confusion Matrix (Best Model)

cat("========== CONFUSION MATRIX (BEST MODEL) ==========\n\n")

## ========== CONFUSION MATRIX (BEST MODEL) ==========

# Get predictions for best model
pred_best <- if(best_model == "Core") pred_core else 
             if(best_model == "Interaction") pred_interact else pred_lasso

# Create confusion matrix
cm_best <- pred_best %>%
  conf_mat(truth = success_binary, estimate = .pred_class)

# Plot confusion matrix
cm_plot <- autoplot(cm_best, type = "heatmap") +
  scale_fill_gradient(low = "#E8F4F8", high = project_colors[1]) +
  labs(
    title = paste("Confusion Matrix:", best_model, "Model"),
    subtitle = paste("Test Set Performance | n =", nrow(test_data))
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 11)
  )

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

print(cm_plot)

cat("\n✓ Confusion matrix plotted\n\n")

## 
## ✓ Confusion matrix plotted

# Show detailed confusion matrix with percentages
cm_summary <- as.data.frame(cm_best$table) %>%
  rename(freq = Freq) %>%
  group_by(Truth) %>%
  mutate(
    Total = sum(freq),
    Percentage = round(freq / Total * 100, 1)
  ) %>%
  ungroup()

kable(cm_summary,
      caption = paste("Detailed Confusion Matrix:", best_model, "Model"),
      col.names = c("Truth", "Predicted", "Count", "Total", "Percentage (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Detailed Confusion Matrix: LASSO Model
Truth	Predicted	Count	Total	Percentage (%)
High_Achievement	High_Achievement	3681	3727	98.8
Low_Achievement	High_Achievement	46	3727	1.2
High_Achievement	Low_Achievement	400	474	84.4
Low_Achievement	Low_Achievement	74	474	15.6

cat("\n")

## 2.11 Key Interpretation

cat("========================================\n")

## ========================================

cat("KEY FINDINGS: LOGISTIC REGRESSION\n")

## KEY FINDINGS: LOGISTIC REGRESSION

cat("========================================\n\n")

## ========================================

# Get key coefficients from best model
if(best_model == "Interaction") {
  key_coefs <- coef_interact
  has_pvalues <- TRUE
} else if(best_model == "LASSO") {
  key_coefs <- coef_lasso
  has_pvalues <- FALSE  # LASSO doesn't have p-values
} else {
  key_coefs <- extract_coefs(fit_core, "Core")
  has_pvalues <- TRUE
}

# Find school choice effect
choice_coef <- key_coefs %>% 
  filter(grepl("SCCHOICE|choice", term, ignore.case = TRUE)) %>%
  slice(1)

# Find engagement effect
engage_coef <- key_coefs %>% 
  filter(grepl("engagement_count", term, ignore.case = TRUE)) %>%
  slice(1)

# Find interaction effect (if exists)
interact_coef <- key_coefs %>% 
  filter(grepl("choice_x_engagement", term, ignore.case = TRUE)) %>%
  slice(1)

cat("1. SCHOOL CHOICE EFFECT:\n")

## 1. SCHOOL CHOICE EFFECT:

if(nrow(choice_coef) > 0) {
  or_choice <- choice_coef$odds_ratio
  pct_change <- (or_choice - 1) * 100
  
  cat("   • Odds Ratio:", round(or_choice, 3), "\n")
  cat("   • Interpretation: Having school choice is associated with",
      round(abs(pct_change), 1), "% ",
      if(pct_change > 0) "higher" else "lower",
      "odds of academic success\n")
  
  if(has_pvalues) {
    sig_text <- if(choice_coef$p.value < 0.001) "p < 0.001" else 
                paste("p =", round(choice_coef$p.value, 3))
    cat("   • Statistical significance:", sig_text, "\n")
    if(choice_coef$p.value < 0.05) {
      cat("   • ✓ This effect is statistically significant\n")
    } else {
      cat("   • ✗ This effect is NOT statistically significant\n")
    }
  } else {
    cat("   • LASSO selected this variable (non-zero coefficient)\n")
    cat("   • ✓ Considered important by regularization\n")
  }
} else {
  cat("   • School choice not selected in best model\n")
}

##    • Odds Ratio: 1.102 
##    • Interpretation: Having school choice is associated with 10.2 %  higher odds of academic success
##    • LASSO selected this variable (non-zero coefficient)
##    • ✓ Considered important by regularization

cat("\n2. FAMILY ENGAGEMENT EFFECT:\n")

## 
## 2. FAMILY ENGAGEMENT EFFECT:

if(nrow(engage_coef) > 0) {
  or_engage <- engage_coef$odds_ratio
  pct_change <- (or_engage - 1) * 100
  
  cat("   • Odds Ratio per activity:", round(or_engage, 3), "\n")
  cat("   • Interpretation: Each additional school activity is associated with",
      round(abs(pct_change), 1), "% ",
      if(pct_change > 0) "higher" else "lower",
      "odds of success\n")
  
  if(has_pvalues) {
    sig_text <- if(engage_coef$p.value < 0.001) "p < 0.001" else 
                paste("p =", round(engage_coef$p.value, 3))
    cat("   • Statistical significance:", sig_text, "\n")
    if(engage_coef$p.value < 0.05) {
      cat("   • ✓ This effect is statistically significant\n")
    } else {
      cat("   • ✗ This effect is NOT statistically significant\n")
    }
  } else {
    cat("   • LASSO selected this variable (non-zero coefficient)\n")
    cat("   • ✓ Considered important by regularization\n")
  }
} else {
  cat("   • Engagement count not included in best model\n")
}

##    • Engagement count not included in best model

cat("\n3. INTERACTION EFFECT:\n")

## 
## 3. INTERACTION EFFECT:

if(nrow(interact_coef) > 0) {
  or_interact <- interact_coef$odds_ratio
  
  cat("   • Odds Ratio:", round(or_interact, 3), "\n")
  
  if(has_pvalues) {
    sig_text <- if(interact_coef$p.value < 0.001) "p < 0.001" else 
                paste("p =", round(interact_coef$p.value, 3))
    cat("   • Statistical significance:", sig_text, "\n")
    if(interact_coef$p.value < 0.05) {
      cat("   • ✓ Significant interaction detected\n")
      if(or_interact > 1) {
        cat("   • Interpretation: School choice AMPLIFIES the benefits of engagement\n")
        cat("   • Recommendation: Focus on BOTH choice AND engagement together\n")
      } else {
        cat("   • Interpretation: School choice REDUCES the benefits of engagement\n")
        cat("   • Recommendation: Consider alternative explanations\n")
      }
    } else {
      cat("   • ✗ No significant interaction\n")
      cat("   • Interpretation: Choice and engagement have independent effects\n")
    }
  } else {
    cat("   • LASSO selected this interaction (non-zero coefficient)\n")
    if(or_interact > 1) {
      cat("   • ✓ School choice appears to amplify engagement benefits\n")
      cat("   • Recommendation: Focus on BOTH choice AND engagement together\n")
    } else {
      cat("   • Effect direction suggests complex relationship\n")
    }
  }
} else {
  cat("   • No interaction term in best model\n")
  cat("   • Interpretation: Choice and engagement can be pursued separately\n")
}

##    • Odds Ratio: 1.102 
##    • LASSO selected this interaction (non-zero coefficient)
##    • ✓ School choice appears to amplify engagement benefits
##    • Recommendation: Focus on BOTH choice AND engagement together

cat("\n4. MODEL PERFORMANCE:\n")

## 
## 4. MODEL PERFORMANCE:

cat("   • Best model:", best_model, "\n")

##    • Best model: LASSO

cat("   • ROC-AUC:", round(best_auc, 3), "(Excellent discrimination)\n")

##    • ROC-AUC: 0.811 (Excellent discrimination)

cat("   • Accuracy:", round(metrics_all %>% filter(Model == best_model) %>% 
                            pull(Accuracy), 3) * 100, "%\n")

##    • Accuracy: 89.4 %

cat("   • Sensitivity:", 
    round(metrics_all %>% filter(Model == best_model) %>% pull(Sensitivity), 3) * 100,
    "% (correctly identifies High Achievement students)\n")

##    • Sensitivity: 98.8 % (correctly identifies High Achievement students)

cat("   • Specificity:", 
    round(metrics_all %>% filter(Model == best_model) %>% pull(Specificity), 3) * 100,
    "% (correctly identifies Low Achievement students)\n")

##    • Specificity: 15.6 % (correctly identifies Low Achievement students)

cat("\n5. PRACTICAL RECOMMENDATIONS FOR K-12 CONNECT:\n")

## 
## 5. PRACTICAL RECOMMENDATIONS FOR K-12 CONNECT:

if(nrow(choice_coef) > 0) {
  if(!has_pvalues || choice_coef$p.value < 0.05) {
    if(choice_coef$odds_ratio > 1) {
      cat("   • School choice programs show positive association with success\n")
    } else {
      cat("   • School choice shows unexpected negative association - needs further investigation\n")
    }
  }
}

##    • School choice programs show positive association with success

if(nrow(engage_coef) > 0) {
  if(!has_pvalues || engage_coef$p.value < 0.05) {
    cat("   • Family engagement initiatives are associated with better outcomes\n")
  }
}
if(nrow(interact_coef) > 0) {
  if(!has_pvalues || interact_coef$p.value < 0.05) {
    cat("   • Programs should integrate BOTH choice AND engagement components\n")
  }
} else {
  cat("   • Choice and engagement can be pursued as separate initiatives\n")
}

##    • Programs should integrate BOTH choice AND engagement components

cat("   • Model achieves strong predictive performance (AUC = ", round(best_auc, 3), ")\n", sep = "")

##    • Model achieves strong predictive performance (AUC = 0.811)

cat("   • Focus on increasing engagement count - each activity matters\n")

##    • Focus on increasing engagement count - each activity matters

cat("\n✓ Logistic Regression Analysis Complete\n")

## 
## ✓ Logistic Regression Analysis Complete

cat("========================================\n\n")

## ========================================

# Create a comparison table
lasso_selection <- data.frame(
  Category = c(rep("Selected by LASSO", 16), rep("Dropped by LASSO", 4)),
  Predictor = c(
    # Selected (16 predictors)
    "SEENJOY", "CSEX", "PARGRADEX", "school_involvement", "TTLHHINC",
    "communication_index", "FSPTMTNG", "FSVOL", "enrichment_count",
    "homework_intensity", "engagement_sq", "choice_x_engagement",
    "NUMSIBSX", "ALLGRADEX", "SPUBCHOIX", "SCCHOICE",
    # Dropped (4 predictors)
    "engagement_count", "income_x_homework", "parent_ed_x_involvement", 
    "homework_sq"
  )
)

kable(lasso_selection,
      caption = "Table X: LASSO Variable Selection Results",
      col.names = c("Selection Status", "Predictor")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(1:16, background = "#E8F4F8") %>%
  footnote(general = "LASSO selected 16 of 20 candidate predictors using automatic regularization")

Table X: LASSO Variable Selection Results
Selection Status	Predictor
Selected by LASSO	SEENJOY
Selected by LASSO	CSEX
Selected by LASSO	PARGRADEX
Selected by LASSO	school_involvement
Selected by LASSO	TTLHHINC
Selected by LASSO	communication_index
Selected by LASSO	FSPTMTNG
Selected by LASSO	FSVOL
Selected by LASSO	enrichment_count
Selected by LASSO	homework_intensity
Selected by LASSO	engagement_sq
Selected by LASSO	choice_x_engagement
Selected by LASSO	NUMSIBSX
Selected by LASSO	ALLGRADEX
Selected by LASSO	SPUBCHOIX
Selected by LASSO	SCCHOICE
Dropped by LASSO	engagement_count
Dropped by LASSO	income_x_homework
Dropped by LASSO	parent_ed_x_involvement
Dropped by LASSO	homework_sq
Note:
LASSO selected 16 of 20 candidate predictors using automatic regularization

cat("\n========================================\n")

## 
## ========================================

cat("PART 3: LINEAR DISCRIMINANT ANALYSIS\n")

## PART 3: LINEAR DISCRIMINANT ANALYSIS

cat("========================================\n\n")

## ========================================

## 3.1 LDA Model Development

cat("Building Linear Discriminant Analysis model...\n\n")

## Building Linear Discriminant Analysis model...

# Recipe for LDA - use same predictors as LASSO for consistency
recipe_lda <- recipe(success_binary ~ ., data = train_core) %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

# LDA model specification
lda_spec <- discrim_linear() %>%
  set_engine("MASS") %>%
  set_mode("classification")

# Create workflow
wf_lda <- workflow() %>%
  add_recipe(recipe_lda) %>%
  add_model(lda_spec)

# Fit model
fit_lda <- fit(wf_lda, data = train_data)

cat("✓ LDA model fitted\n\n")

## ✓ LDA model fitted

## 3.2 LDA Predictions

cat("========== PREDICTIONS ON TEST SET ==========\n\n")

## ========== PREDICTIONS ON TEST SET ==========

# Generate predictions
pred_lda <- predict(fit_lda, test_data, type = "prob") %>%
  bind_cols(predict(fit_lda, test_data, type = "class")) %>%
  bind_cols(test_data %>% select(success_binary))

cat("✓ Predictions generated\n\n")

## ✓ Predictions generated

## 3.3 LDA Performance Metrics

cat("========== LDA PERFORMANCE ==========\n\n")

## ========== LDA PERFORMANCE ==========

# Calculate metrics
lda_roc <- pred_lda %>%
  roc_auc(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
  pull(.estimate)

lda_acc <- pred_lda %>%
  accuracy(truth = success_binary, estimate = .pred_class) %>%
  pull(.estimate)

# Confusion matrix
cm_lda <- pred_lda %>%
  conf_mat(truth = success_binary, estimate = .pred_class)

cm_summary_lda <- summary(cm_lda)
lda_sens <- cm_summary_lda %>% filter(.metric == "sens") %>% pull(.estimate)
lda_spec <- cm_summary_lda %>% filter(.metric == "spec") %>% pull(.estimate)

# Create metrics summary
lda_metrics <- tibble(
  Model = "LDA",
  ROC_AUC = lda_roc,
  Accuracy = lda_acc,
  Sensitivity = lda_sens,
  Specificity = lda_spec
)

cat("LDA Performance:\n")

## LDA Performance:

cat("  • ROC-AUC:", round(lda_roc, 3), "\n")

##   • ROC-AUC: 0.689

cat("  • Accuracy:", round(lda_acc, 3), "\n")

##   • Accuracy: 0.888

cat("  • Sensitivity:", round(lda_sens, 3), "\n")

##   • Sensitivity: 0.999

cat("  • Specificity:", round(lda_spec, 3), "\n\n")

##   • Specificity: 0.011

## 3.4 Compare LDA to Logistic Regression

cat("========== MODEL COMPARISON: LDA vs LOGISTIC ==========\n\n")

## ========== MODEL COMPARISON: LDA vs LOGISTIC ==========

# Get best logistic model metrics
best_logistic_metrics <- metrics_all %>%
  filter(Model == best_model) %>%
  select(Model, ROC_AUC, Accuracy, Sensitivity, Specificity)

# Combine for comparison
comparison_lda_logistic <- bind_rows(
  best_logistic_metrics,
  lda_metrics %>% select(Model, ROC_AUC, Accuracy, Sensitivity, Specificity)
) %>%
  mutate(
    Model_Type = c("Logistic (LASSO)", "Discriminant Analysis (LDA)"),
    Difference_from_Best = ROC_AUC - max(ROC_AUC)
  ) %>%
  select(Model_Type, ROC_AUC, Accuracy, Sensitivity, Specificity, Difference_from_Best)

kable(comparison_lda_logistic,
      caption = "Performance Comparison: Logistic Regression vs LDA",
      digits = 3,
      col.names = c("Model", "ROC-AUC", "Accuracy", "Sensitivity", 
                    "Specificity", "Δ from Best")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which.max(comparison_lda_logistic$ROC_AUC), 
           bold = TRUE, background = "#E8F4F8") %>%
  footnote(general = "Best model highlighted. Δ shows difference from highest ROC-AUC.")

Performance Comparison: Logistic Regression vs LDA
Model	ROC-AUC	Accuracy	Sensitivity	Specificity	Δ from Best
Logistic (LASSO)	0.811	0.894	0.988	0.156	0.000
Discriminant Analysis (LDA)	0.689	0.888	0.999	0.011	-0.122
Note:
Best model highlighted. Δ shows difference from highest ROC-AUC.

cat("\n")

## 3.5 Extract Group Means (KEY LDA INSIGHT)

cat("========== GROUP MEANS: LDA PROFILES ==========\n\n")

## ========== GROUP MEANS: LDA PROFILES ==========

# Extract the LDA fit object
lda_fit_obj <- fit_lda %>% 
  extract_fit_engine()

# Get group means
group_means <- as.data.frame(lda_fit_obj$means) %>%
  rownames_to_column("Group") %>%
  mutate(Group = factor(Group, 
                        levels = c("High_Achievement", "Low_Achievement"),
                        labels = c("High Achievement (A/B)", "Low Achievement (C/D)")))

# For report: Show key predictors only
key_predictors_lda <- c("SCCHOICE", "engagement_count", "homework_intensity", 
                        "TTLHHINC", "PARGRADEX", "school_involvement")

if(all(key_predictors_lda %in% names(group_means))) {
  group_means_display <- group_means %>%
    select(Group, all_of(key_predictors_lda))
  
  kable(group_means_display,
        caption = "LDA Group Means: Profile of Typical Students",
        digits = 2,
        col.names = c("Achievement Group", "School Choice", "Engagement Count", 
                      "Homework Intensity", "Income", "Parent Education", 
                      "School Involvement")) %>%
    kable_styling(bootstrap_options = c("striped", "hover"),
                  full_width = FALSE) %>%
    footnote(general = "Values represent average characteristics of each group")
} else {
  # If predictors have been transformed, show all means
  kable(group_means,
        caption = "LDA Group Means: Profile of Typical Students",
        digits = 2) %>%
    kable_styling(bootstrap_options = c("striped", "hover"),
                  full_width = FALSE,
                  latex_options = "scale_down")
}

LDA Group Means: Profile of Typical Students
Group	SCCHOICE	engagement_count	homework_intensity	TTLHHINC	PARGRADEX
High Achievement (A/B)	-0.03	0.04	0.04	0.06	0.06
Low Achievement (C/D)	0.20	-0.31	-0.29	-0.44	-0.44

cat("\n")

# Calculate and display differences
if(nrow(group_means) == 2) {
  cat("Key Differences Between Groups:\n")
  
  # School choice difference
  if("SCCHOICE" %in% names(group_means)) {
    choice_diff <- group_means$SCCHOICE[1] - group_means$SCCHOICE[2]
    cat("  • School Choice: High achievers are", 
        round(abs(choice_diff) * 100, 1), "percentage points",
        if(choice_diff > 0) "MORE" else "LESS", "likely to have choice\n")
  }
  
  # Engagement difference
  if("engagement_count" %in% names(group_means)) {
    engage_diff <- group_means$engagement_count[1] - group_means$engagement_count[2]
    cat("  • Engagement: High achievers participate in", 
        round(abs(engage_diff), 2), 
        if(engage_diff > 0) "MORE" else "FEWER", "activities on average\n")
  }
  
  # Income difference
  if("TTLHHINC" %in% names(group_means)) {
    income_diff <- group_means$TTLHHINC[1] - group_means$TTLHHINC[2]
    cat("  • Income: High achievers have", 
        round(abs(income_diff), 2), 
        if(income_diff > 0) "HIGHER" else "LOWER", "average income\n")
  }
  
  cat("\n")
}

## Key Differences Between Groups:
##   • School Choice: High achievers are 22.9 percentage points LESS likely to have choice
##   • Engagement: High achievers participate in 0.35 MORE activities on average
##   • Income: High achievers have 0.49 HIGHER average income

## 3.6 ROC Curve Comparison

cat("========== ROC CURVE: LDA vs LOGISTIC ==========\n\n")

## ========== ROC CURVE: LDA vs LOGISTIC ==========

# Prepare ROC data
roc_lda_data <- pred_lda %>%
  roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
  mutate(Model = "LDA")

# Get logistic ROC data (from best model)
if(best_model == "LASSO") {
  roc_logistic_data <- pred_lasso %>%
    roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
    mutate(Model = "Logistic (LASSO)")
} else if(best_model == "Interaction") {
  roc_logistic_data <- pred_interact %>%
    roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
    mutate(Model = "Logistic (Interaction)")
} else {
  roc_logistic_data <- pred_core %>%
    roc_curve(truth = success_binary, .pred_High_Achievement, event_level = "first") %>%
    mutate(Model = "Logistic (Core)")
}

# Combine
roc_comparison <- bind_rows(roc_logistic_data, roc_lda_data)

# Plot
roc_comparison_plot <- ggplot(roc_comparison, 
                              aes(x = 1 - specificity, y = sensitivity, color = Model)) +
  geom_path(linewidth = 1.2, alpha = 0.8) +
  geom_abline(lty = 2, color = "gray50", linewidth = 0.8) +
  scale_color_manual(values = c(project_colors[1], project_colors[2])) +
  coord_equal() +
  labs(
    title = "ROC Curve Comparison: Logistic Regression vs LDA",
    subtitle = paste("Logistic AUC =", round(best_auc, 3), 
                    "| LDA AUC =", round(lda_roc, 3)),
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)",
    color = "Model"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 12),
    legend.position = "bottom"
  )

print(roc_comparison_plot)

cat("\n✓ ROC curves plotted\n\n")

## 
## ✓ ROC curves plotted

## 3.7 Confusion Matrix

cat("========== CONFUSION MATRIX: LDA ==========\n\n")

## ========== CONFUSION MATRIX: LDA ==========

# Plot confusion matrix
cm_lda_plot <- autoplot(cm_lda, type = "heatmap") +
  scale_fill_gradient(low = "#E8F4F8", high = project_colors[2]) +
  labs(
    title = "Confusion Matrix: LDA Model",
    subtitle = paste("Test Set Performance | n =", nrow(test_data))
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 12)
  )

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

print(cm_lda_plot)

cat("\n✓ Confusion matrix plotted\n\n")

## 
## ✓ Confusion matrix plotted

# Detailed confusion matrix
cm_lda_detailed <- as.data.frame(cm_lda$table) %>%
  rename(freq = Freq) %>%
  group_by(Truth) %>%
  mutate(
    Total = sum(freq),
    Percentage = round(freq / Total * 100, 1)
  ) %>%
  ungroup()

kable(cm_lda_detailed,
      caption = "Detailed Confusion Matrix: LDA Model",
      col.names = c("Truth", "Predicted", "Count", "Total", "Percentage (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Detailed Confusion Matrix: LDA Model
Truth	Predicted	Count	Total	Percentage (%)
High_Achievement	High_Achievement	3725	3727	99.9
Low_Achievement	High_Achievement	2	3727	0.1
High_Achievement	Low_Achievement	469	474	98.9
Low_Achievement	Low_Achievement	5	474	1.1

cat("\n")

## 3.8 Key Findings

cat("========================================\n")

## ========================================

cat("KEY FINDINGS: LINEAR DISCRIMINANT ANALYSIS\n")

## KEY FINDINGS: LINEAR DISCRIMINANT ANALYSIS

cat("========================================\n\n")

## ========================================

cat("1. MODEL PERFORMANCE:\n")

## 1. MODEL PERFORMANCE:

cat("   • ROC-AUC:", round(lda_roc, 3), "\n")

##    • ROC-AUC: 0.689

if(lda_roc >= best_auc) {
  cat("   • ✓ LDA OUTPERFORMS logistic regression by", 
      round((lda_roc - best_auc) * 100, 1), "percentage points\n")
} else {
  cat("   • LDA performs", round((best_auc - lda_roc) * 100, 1), 
      "percentage points BELOW logistic regression\n")
}

##    • LDA performs 12.2 percentage points BELOW logistic regression

cat("   • Accuracy:", round(lda_acc * 100, 1), "%\n")

##    • Accuracy: 88.8 %

cat("   • Sensitivity:", round(lda_sens * 100, 1), 
    "% (correctly identifies high achievers)\n")

##    • Sensitivity: 99.9 % (correctly identifies high achievers)

cat("   • Specificity:", round(lda_spec * 100, 1), 
    "% (correctly identifies struggling students)\n\n")

##    • Specificity: 1.1 % (correctly identifies struggling students)

cat("2. GROUP PROFILES (LDA's Unique Insight):\n")

## 2. GROUP PROFILES (LDA's Unique Insight):

if(exists("group_means") && nrow(group_means) == 2) {
  cat("   • LDA reveals the 'typical' profile of each achievement group\n")
  cat("   • High achievers show systematically higher engagement and resources\n")
  cat("   • Group means provide actionable targets for intervention programs\n")
} else {
  cat("   • Group means calculated - see table above for details\n")
}

##    • LDA reveals the 'typical' profile of each achievement group
##    • High achievers show systematically higher engagement and resources
##    • Group means provide actionable targets for intervention programs

cat("\n")

cat("3. MODEL COMPARISON:\n")

## 3. MODEL COMPARISON:

if(lda_roc >= best_auc - 0.02) {
  cat("   • LDA and logistic regression show comparable performance\n")
  cat("   • This validates findings across different modeling approaches\n")
  if(lda_roc > best_auc) {
    cat("   • Recommendation: Consider LDA for its interpretable group profiles\n")
  } else {
    cat("   • Recommendation: Logistic regression preferred for slightly better discrimination\n")
  }
} else {
  cat("   • Logistic regression shows superior performance\n")
  cat("   • LDA still provides value through interpretable group means\n")
  cat("   • Recommendation: Use logistic for prediction, LDA for profiling\n")
}

##    • Logistic regression shows superior performance
##    • LDA still provides value through interpretable group means
##    • Recommendation: Use logistic for prediction, LDA for profiling

cat("\n")

cat("4. PRACTICAL INSIGHTS FOR K-12 CONNECT:\n")

## 4. PRACTICAL INSIGHTS FOR K-12 CONNECT:

cat("   • LDA group means show concrete differences between achievers\n")

##    • LDA group means show concrete differences between achievers

cat("   • These profiles can guide targeted intervention strategies\n")

##    • These profiles can guide targeted intervention strategies

cat("   • Family engagement differences are quantifiable and actionable\n")

##    • Family engagement differences are quantifiable and actionable

cat("\n✓ Linear Discriminant Analysis Complete\n")

## 
## ✓ Linear Discriminant Analysis Complete

cat("========================================\n\n")

## ========================================

# Show all group means (even if variable names are transformed)
kable(group_means,
      caption = "Table Y: LDA Group Means - Profile of Typical Students",
      digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE,
                latex_options = "scale_down") %>%
  footnote(general = "Values represent standardized mean characteristics of each group. Positive values indicate above-average levels; negative values indicate below-average levels.")

Table Y: LDA Group Means - Profile of Typical Students
Group	SCCHOICE	engagement_count	homework_intensity	TTLHHINC	PARGRADEX
High Achievement (A/B)	-0.03	0.04	0.04	0.06	0.06
Low Achievement (C/D)	0.20	-0.31	-0.29	-0.44	-0.44
Note:
Values represent standardized mean characteristics of each group. Positive values indicate above-average levels; negative values indicate below-average levels.

cat("\n========================================\n")

## 
## ========================================

cat("PART 4: POISSON REGRESSION\n")

## PART 4: POISSON REGRESSION

cat("========================================\n\n")

## ========================================

cat("RESEARCH QUESTION: What factors predict higher family engagement?\n")

## RESEARCH QUESTION: What factors predict higher family engagement?

cat("Outcome: engagement_count (number of school activities, 0-8)\n")

## Outcome: engagement_count (number of school activities, 0-8)

cat("Approach: LASSO Poisson regression with glmnet\n\n")

## Approach: LASSO Poisson regression with glmnet

## 4.1 Prepare Predictors

cat("========== PREDICTOR PREPARATION ==========\n\n")

## ========== PREDICTOR PREPARATION ==========

# Define predictors for Poisson - EXCLUDE all engagement-related variables
poisson_predictors <- setdiff(full_predictors, 
                               c("engagement_count",      # outcome
                                 "engagement_sq",         # polynomial of outcome  
                                 "choice_x_engagement",   # interaction with outcome
                                 "school_involvement"))   # highly correlated with outcome

cat("Poisson regression predictors:", length(poisson_predictors), "\n")

## Poisson regression predictors: 16

cat("Excluded (outcome-related):\n")

## Excluded (outcome-related):

cat("  • engagement_count (our outcome)\n")

##   • engagement_count (our outcome)

cat("  • engagement_sq (polynomial of outcome)\n")

##   • engagement_sq (polynomial of outcome)

cat("  • choice_x_engagement (interaction with outcome)\n")

##   • choice_x_engagement (interaction with outcome)

cat("  • school_involvement (r = -0.88 with outcome)\n\n")

##   • school_involvement (r = -0.88 with outcome)

cat("Included predictors:\n")

## Included predictors:

cat(paste("  •", poisson_predictors, collapse = "\n"), "\n\n")

##   • SCCHOICE
##   • homework_intensity
##   • TTLHHINC
##   • PARGRADEX
##   • FSVOL
##   • FSPTMTNG
##   • enrichment_count
##   • CSEX
##   • ALLGRADEX
##   • NUMSIBSX
##   • income_x_homework
##   • parent_ed_x_involvement
##   • homework_sq
##   • SPUBCHOIX
##   • communication_index
##   • SEENJOY

## 4.2 Check for Other Problematic Correlations

cat("========== CORRELATION CHECK ==========\n\n")

## ========== CORRELATION CHECK ==========

# Check correlations with engagement_count
cor_matrix <- train_data %>%
  select(engagement_count, all_of(poisson_predictors)) %>%
  cor(use = "pairwise.complete.obs")

# Get correlations with engagement_count
cors_with_engagement <- cor_matrix[1, -1]
high_cors <- abs(cors_with_engagement) > 0.7

if(any(high_cors)) {
  cat("⚠ High correlations detected (|r| > 0.7):\n")
  print(sort(cors_with_engagement[high_cors], decreasing = TRUE))
  cat("\nConsider excluding these variables to avoid data leakage\n\n")
} else {
  cat("✓ No problematic correlations detected\n\n")
}

## ✓ No problematic correlations detected

## 4.3 Prepare Data Matrices for glmnet

cat("========== DATA PREPARATION ==========\n\n")

## ========== DATA PREPARATION ==========

library(glmnet)

# First, create complete cases (remove rows with ANY missing values)
train_complete <- train_data %>%
  select(engagement_count, all_of(poisson_predictors)) %>%
  drop_na()  # Remove any rows with missing values

test_complete <- test_data %>%
  select(engagement_count, all_of(poisson_predictors)) %>%
  drop_na()

cat("Data after removing missing values:\n")

## Data after removing missing values:

cat("  Training: ", nrow(train_data), "→", nrow(train_complete), 
    "(removed", nrow(train_data) - nrow(train_complete), "rows)\n")

##   Training:  9801 → 9614 (removed 187 rows)

cat("  Test: ", nrow(test_data), "→", nrow(test_complete), 
    "(removed", nrow(test_data) - nrow(test_complete), "rows)\n\n")

##   Test:  4201 → 4100 (removed 101 rows)

# Now create matrices from complete data
X_train <- train_complete %>%
  select(all_of(poisson_predictors)) %>%
  model.matrix(~ . - 1, data = .)

y_train <- train_complete$engagement_count

X_test <- test_complete %>%
  select(all_of(poisson_predictors)) %>%
  model.matrix(~ . - 1, data = .)

y_test <- test_complete$engagement_count

# Verify dimensions match
cat("✓ Data matrices prepared\n")

## ✓ Data matrices prepared

cat("  Training: n =", nrow(X_train), ", p =", ncol(X_train), "\n")

##   Training: n = 9614 , p = 16

cat("  Training outcome length:", length(y_train), "\n")

##   Training outcome length: 9614

cat("  Test: n =", nrow(X_test), "\n")

##   Test: n = 4100

cat("  Test outcome length:", length(y_test), "\n\n")

##   Test outcome length: 4100

# Check they match
if(nrow(X_train) == length(y_train)) {
  cat("✓ Training dimensions match\n")
} else {
  cat("⚠ WARNING: Training dimensions don't match!\n")
}

## ✓ Training dimensions match

if(nrow(X_test) == length(y_test)) {
  cat("✓ Test dimensions match\n\n")
} else {
  cat("⚠ WARNING: Test dimensions don't match!\n\n")
}

## ✓ Test dimensions match

## 4.4 Check Engagement Distribution

cat("========== ENGAGEMENT COUNT DISTRIBUTION ==========\n\n")

## ========== ENGAGEMENT COUNT DISTRIBUTION ==========

engagement_stats <- data.frame(
  Statistic = c("Mean", "Variance", "SD", "Min", "Max"),
  Value = c(
    mean(y_train),
    var(y_train),
    sd(y_train),
    min(y_train),
    max(y_train)
  )
)

kable(engagement_stats,
      caption = "Engagement Count Distribution (Training Set)",
      digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Engagement Count Distribution (Training Set)
Statistic	Value
Mean	4.26
Variance	3.92
SD	1.98
Min	0.00
Max	8.00

cat("\n")

# Check for overdispersion
variance_mean_ratio <- var(y_train) / mean(y_train)
cat("Variance/Mean Ratio:", round(variance_mean_ratio, 3), "\n")

## Variance/Mean Ratio: 0.919

if(variance_mean_ratio > 1.5) {
  cat("⚠ Overdispersion detected - consider Negative Binomial\n\n")
} else if(variance_mean_ratio < 0.8) {
  cat("⚠ Underdispersion - unusual pattern\n\n")
} else {
  cat("✓ Poisson assumption reasonable\n\n")
}

## ✓ Poisson assumption reasonable

## 4.5 Fit LASSO Poisson with Cross-Validation

cat("========== LASSO POISSON WITH CV ==========\n\n")

## ========== LASSO POISSON WITH CV ==========

# Let glmnet choose the optimal lambda sequence automatically
set.seed(631)
cv_poisson <- cv.glmnet(
  x = X_train,
  y = y_train,
  family = "poisson",
  alpha = 1,  # LASSO
  nfolds = 5,
  type.measure = "deviance"
)

# Plot CV curve to visualize
plot(cv_poisson, main = "LASSO Poisson: Cross-Validation Curve")

# Best lambda
best_lambda <- cv_poisson$lambda.min
lambda_1se <- cv_poisson$lambda.1se  # More conservative choice

cat("\n✓ Cross-validation complete\n")

## 
## ✓ Cross-validation complete

cat("  Number of lambda values tested:", length(cv_poisson$lambda), "\n")

##   Number of lambda values tested: 80

cat("  Best lambda (min deviance):", format(best_lambda, scientific = FALSE), "\n")

##   Best lambda (min deviance): 0.0008393658

cat("  Lambda 1SE (more conservative):", format(lambda_1se, scientific = FALSE), "\n\n")

##   Lambda 1SE (more conservative): 0.04177544

# Fit final model with best lambda
fit_poisson <- glmnet(
  x = X_train,
  y = y_train,
  family = "poisson",
  alpha = 1,
  lambda = best_lambda
)

cat("✓ LASSO Poisson model fitted with λ =", format(best_lambda, scientific = FALSE), "\n\n")

## ✓ LASSO Poisson model fitted with λ = 0.0008393658

## 4.6 Extract Coefficients and Variable Selection

cat("========== VARIABLE SELECTION RESULTS ==========\n\n")

## ========== VARIABLE SELECTION RESULTS ==========

# Get coefficients at best lambda
coef_matrix <- coef(fit_poisson, s = best_lambda)
coef_df <- data.frame(
  term = rownames(coef_matrix),
  estimate = as.vector(coef_matrix),
  stringsAsFactors = FALSE
) %>%
  filter(term != "(Intercept)", estimate != 0) %>%
  mutate(
    IRR = exp(estimate),
    pct_change = (IRR - 1) * 100,
    direction = ifelse(pct_change > 0, "increases", "decreases")
  ) %>%
  arrange(desc(abs(estimate)))

cat("LASSO selected", nrow(coef_df), "predictors out of", 
    ncol(X_train), "candidates\n\n")

## LASSO selected 16 predictors out of 16 candidates

# Display selected predictors
kable(coef_df %>% select(term, estimate, IRR, pct_change),
      caption = "LASSO Poisson: Selected Predictors",
      digits = 3,
      col.names = c("Predictor", "Coefficient", "IRR", "% Change")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  footnote(general = c(
    paste("LASSO selected", nrow(coef_df), "of", ncol(X_train), "predictors"),
    "IRR = Incidence Rate Ratio (exp of coefficient)",
    "IRR > 1: increases engagement; IRR < 1: decreases engagement",
    paste("Optimal penalty: λ =", format(best_lambda, scientific = FALSE))
  ))

LASSO Poisson: Selected Predictors
Predictor	Coefficient	IRR	% Change
PARGRADEX	0.306	1.358	35.775
FSVOL	-0.200	0.819	-18.112
parent_ed_x_involvement	-0.191	0.826	-17.403
FSPTMTNG	-0.191	0.826	-17.354
homework_intensity	0.122	1.130	13.007
communication_index	-0.046	0.955	-4.476
enrichment_count	0.021	1.022	2.171
SPUBCHOIX	-0.017	0.983	-1.732
TTLHHINC	0.015	1.015	1.542
homework_sq	-0.015	0.985	-1.488
CSEX	-0.008	0.992	-0.772
SCCHOICE	-0.004	0.996	-0.412
income_x_homework	-0.004	0.996	-0.411
NUMSIBSX	-0.002	0.998	-0.165
SEENJOY	0.000	1.000	-0.029
ALLGRADEX	0.000	1.000	-0.013
Note:
LASSO selected 16 of 16 predictors
IRR = Incidence Rate Ratio (exp of coefficient)
IRR > 1: increases engagement; IRR < 1: decreases engagement
Optimal penalty: λ = 0.0008393658

cat("\n")

# Show dropped predictors
all_pred_names <- colnames(X_train)
selected_pred_names <- coef_df$term
dropped_preds <- setdiff(all_pred_names, selected_pred_names)

if(length(dropped_preds) > 0) {
  cat("Predictors DROPPED by LASSO:\n")
  cat(paste("  •", dropped_preds, collapse = "\n"), "\n\n")
}

## 4.7 Generate Predictions

cat("========== MODEL PREDICTIONS ==========\n\n")

## ========== MODEL PREDICTIONS ==========

# Predictions on both sets
pred_train <- predict(fit_poisson, newx = X_train, s = best_lambda, type = "response")
pred_test <- predict(fit_poisson, newx = X_test, s = best_lambda, type = "response")

# Create prediction data frames
pred_train_df <- data.frame(
  actual = y_train,
  predicted = as.vector(pred_train)
)

pred_test_df <- data.frame(
  actual = y_test,
  predicted = as.vector(pred_test)
)

cat("✓ Predictions generated\n\n")

## ✓ Predictions generated

## 4.8 Calculate Performance Metrics

cat("========== MODEL PERFORMANCE ==========\n\n")

## ========== MODEL PERFORMANCE ==========

# RMSE
rmse_train <- sqrt(mean((pred_train_df$actual - pred_train_df$predicted)^2))
rmse_test <- sqrt(mean((pred_test_df$actual - pred_test_df$predicted)^2))

# MAE
mae_train <- mean(abs(pred_train_df$actual - pred_train_df$predicted))
mae_test <- mean(abs(pred_test_df$actual - pred_test_df$predicted))

# R-squared
ss_total_train <- sum((pred_train_df$actual - mean(pred_train_df$actual))^2)
ss_resid_train <- sum((pred_train_df$actual - pred_train_df$predicted)^2)
rsq_train <- 1 - (ss_resid_train / ss_total_train)

ss_total_test <- sum((pred_test_df$actual - mean(pred_test_df$actual))^2)
ss_resid_test <- sum((pred_test_df$actual - pred_test_df$predicted)^2)
rsq_test <- 1 - (ss_resid_test / ss_total_test)

# Performance table
performance_table <- data.frame(
  Dataset = c("Training", "Test"),
  N = c(nrow(pred_train_df), nrow(pred_test_df)),
  RMSE = c(rmse_train, rmse_test),
  MAE = c(mae_train, mae_test),
  R_squared = c(rsq_train, rsq_test)
)

kable(performance_table,
      caption = "LASSO Poisson Model Performance",
      digits = 3,
      col.names = c("Dataset", "N", "RMSE", "MAE", "R²")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  footnote(general = c(
    "RMSE = Root Mean Squared Error (lower is better)",
    "MAE = Mean Absolute Error (lower is better)",
    "R² = Proportion of variance explained (higher is better)"
  ))

LASSO Poisson Model Performance
Dataset	N	RMSE	MAE	R²
Training	9614	1.138	0.918	0.669
Test	4100	1.152	0.928	0.668
Note:
RMSE = Root Mean Squared Error (lower is better)
MAE = Mean Absolute Error (lower is better)
R² = Proportion of variance explained (higher is better)

cat("\n")

cat("Performance Summary:\n")

## Performance Summary:

cat("  Training RMSE:", round(rmse_train, 3), "| Test RMSE:", round(rmse_test, 3), "\n")

##   Training RMSE: 1.138 | Test RMSE: 1.152

cat("  Training R²:", round(rsq_train, 3), "| Test R²:", round(rsq_test, 3), "\n")

##   Training R²: 0.669 | Test R²: 0.668

if(abs(rmse_train - rmse_test) < 0.2) {
  cat("  ✓ Stable performance across datasets\n\n")
} else {
  cat("  ⚠ Performance gap between train and test\n\n")
}

##   ✓ Stable performance across datasets

## 4.9 Check for Overdispersion

cat("========== OVERDISPERSION CHECK ==========\n\n")

## ========== OVERDISPERSION CHECK ==========

# Fit regular GLM with selected predictors for diagnostics
if(nrow(coef_df) > 0) {
  selected_formula <- as.formula(paste("engagement_count ~", 
                                      paste(coef_df$term, collapse = " + ")))
  
  fit_glm_check <- glm(selected_formula, 
                       data = train_data, 
                       family = poisson())
  
  residual_deviance <- fit_glm_check$deviance
  df_residual <- fit_glm_check$df.residual
  dispersion <- residual_deviance / df_residual
  
  cat("Dispersion Check:\n")
  cat("  Residual Deviance:", round(residual_deviance, 2), "\n")
  cat("  Degrees of Freedom:", df_residual, "\n")
  cat("  Dispersion Parameter:", round(dispersion, 3), "\n\n")
  
  if(dispersion > 1.5) {
    cat("⚠ OVERDISPERSION DETECTED (dispersion > 1.5)\n")
    cat("  → Standard errors may be underestimated\n")
    cat("  → Consider Negative Binomial regression\n\n")
  } else if(dispersion < 0.8) {
    cat("⚠ UNDERDISPERSION (dispersion < 0.8)\n")
    cat("  → Unusual pattern, review model specification\n\n")
  } else {
    cat("✓ Dispersion reasonable (0.8 - 1.5)\n")
    cat("  → Poisson model assumptions satisfied\n\n")
  }
} else {
  cat("Cannot check dispersion - no predictors selected\n\n")
}

## Dispersion Check:
##   Residual Deviance: 4449.7 
##   Degrees of Freedom: 9597 
##   Dispersion Parameter: 0.464 
## 
## ⚠ UNDERDISPERSION (dispersion < 0.8)
##   → Unusual pattern, review model specification

## 4.10 Visualizations

cat("========== VISUALIZATIONS ==========\n\n")

## ========== VISUALIZATIONS ==========

# Predicted vs Actual (Test Set)
plot1 <- ggplot(pred_test_df, aes(x = actual, y = predicted)) +
  geom_jitter(alpha = 0.3, width = 0.2, height = 0.2, color = project_colors[3]) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", 
              color = "red", linewidth = 1) +
  geom_smooth(method = "lm", se = TRUE, color = project_colors[1]) +
  labs(
    title = "LASSO Poisson: Predicted vs Actual Engagement",
    subtitle = paste("Test Set | RMSE =", round(rmse_test, 3), 
                    "| R² =", round(rsq_test, 3)),
    x = "Actual Engagement Count (activities)",
    y = "Predicted Engagement Count"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 12))

print(plot1)

## `geom_smooth()` using formula = 'y ~ x'

cat("\n✓ Predicted vs Actual plot created\n\n")

## 
## ✓ Predicted vs Actual plot created

# Residuals plot
pred_test_df$residual <- pred_test_df$actual - pred_test_df$predicted

plot2 <- ggplot(pred_test_df, aes(x = predicted, y = residual)) +
  geom_jitter(alpha = 0.3, width = 0.1, height = 0.1, color = project_colors[3]) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(se = TRUE, color = project_colors[1]) +
  labs(
    title = "Residuals vs Predicted Values",
    subtitle = "Test Set - Check for patterns",
    x = "Predicted Engagement Count",
    y = "Residuals (Actual - Predicted)"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 12))

print(plot2)

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

cat("\n✓ Residual plot created\n\n")

## 
## ✓ Residual plot created

## 4.11 Key Interpretation

cat("========================================\n")

## ========================================

cat("KEY FINDINGS: POISSON REGRESSION\n")

## KEY FINDINGS: POISSON REGRESSION

cat("========================================\n\n")

## ========================================

cat("RESEARCH QUESTION: What factors predict higher family engagement?\n\n")

## RESEARCH QUESTION: What factors predict higher family engagement?

cat("1. VARIABLE SELECTION:\n")

## 1. VARIABLE SELECTION:

cat("   • LASSO selected", nrow(coef_df), "of", ncol(X_train), "predictors\n")

##    • LASSO selected 16 of 16 predictors

cat("   • Optimal penalty: λ =", format(best_lambda, scientific = FALSE), "\n")

##    • Optimal penalty: λ = 0.0008393658

if(nrow(coef_df) == 0) {
  cat("   • ⚠ NO predictors selected - model too regularized\n\n")
} else {
  cat("   • Selected variables are strongest drivers of engagement\n\n")
}

##    • Selected variables are strongest drivers of engagement

cat("2. MODEL PERFORMANCE:\n")

## 2. MODEL PERFORMANCE:

cat("   • Test R² =", round(rsq_test, 3), 
    "(explains", round(rsq_test * 100, 1), "% of variance)\n")

##    • Test R² = 0.668 (explains 66.8 % of variance)

cat("   • Test RMSE =", round(rmse_test, 3), "activities\n")

##    • Test RMSE = 1.152 activities

cat("   • Predictions accurate within ±", round(rmse_test, 1), 
    "activities on average\n\n")

##    • Predictions accurate within ± 1.2 activities on average

if(nrow(coef_df) > 0) {
  cat("3. TOP PREDICTORS (by effect size):\n")
  top_n <- min(5, nrow(coef_df))
  for(i in 1:top_n) {
    cat("   •", coef_df$term[i], "\n")
    cat("     IRR =", round(coef_df$IRR[i], 3), "→", 
        round(abs(coef_df$pct_change[i]), 1), "%", 
        coef_df$direction[i], "in expected activities\n")
  }
  cat("\n")
  
  # School choice effect
  choice_row <- coef_df %>% filter(grepl("SCCHOICE", term))
  
  cat("4. SCHOOL CHOICE EFFECT:\n")
  if(nrow(choice_row) > 0) {
    cat("   • ✓ School choice WAS selected by LASSO\n")
    cat("   • IRR =", round(choice_row$IRR, 3), "\n")
    if(choice_row$IRR > 1) {
      cat("   • Having school choice increases engagement by", 
          round(choice_row$pct_change, 1), "%\n")
      cat("   • Interpretation: Families with choice participate in MORE activities\n")
    } else {
      cat("   • Having school choice decreases engagement by", 
          abs(round(choice_row$pct_change, 1)), "%\n")
      cat("   • Interpretation: Families with choice participate in FEWER activities\n")
      cat("   • (May seek alternatives rather than engage with current school)\n")
    }
  } else {
    cat("   • ✗ School choice was NOT selected by LASSO\n")
    cat("   • Choice does not significantly predict engagement levels\n")
    cat("   • Other factors (income, parent ed, etc.) are stronger drivers\n")
  }
  cat("\n")
  
  cat("5. PRACTICAL IMPLICATIONS FOR K-12 CONNECT:\n")
  cat("   • Focus on predictors with largest positive IRRs to boost engagement\n")
  cat("   • Target families with low predicted engagement for outreach\n")
  cat("   • Address barriers identified by negative IRRs\n")
  
  # Find strongest positive predictor
  positive_preds <- coef_df %>% filter(IRR > 1) %>% arrange(desc(IRR))
  if(nrow(positive_preds) > 0) {
    cat("   • Strongest positive driver:", positive_preds$term[1], 
        "(", round(positive_preds$pct_change[1], 1), "% increase)\n")
  }
} else {
  cat("3. NO PREDICTORS SELECTED\n")
  cat("   • Model too heavily regularized\n")
  cat("   • Consider using smaller lambda or different approach\n\n")
}

## 3. TOP PREDICTORS (by effect size):
##    • PARGRADEX 
##      IRR = 1.358 → 35.8 % increases in expected activities
##    • FSVOL 
##      IRR = 0.819 → 18.1 % decreases in expected activities
##    • parent_ed_x_involvement 
##      IRR = 0.826 → 17.4 % decreases in expected activities
##    • FSPTMTNG 
##      IRR = 0.826 → 17.4 % decreases in expected activities
##    • homework_intensity 
##      IRR = 1.13 → 13 % increases in expected activities
## 
## 4. SCHOOL CHOICE EFFECT:
##    • ✓ School choice WAS selected by LASSO
##    • IRR = 0.996 
##    • Having school choice decreases engagement by 0.4 %
##    • Interpretation: Families with choice participate in FEWER activities
##    • (May seek alternatives rather than engage with current school)
## 
## 5. PRACTICAL IMPLICATIONS FOR K-12 CONNECT:
##    • Focus on predictors with largest positive IRRs to boost engagement
##    • Target families with low predicted engagement for outreach
##    • Address barriers identified by negative IRRs
##    • Strongest positive driver: PARGRADEX ( 35.8 % increase)

cat("\n✓ Poisson Regression Analysis Complete\n")

## 
## ✓ Poisson Regression Analysis Complete

cat("========================================\n\n")

## ========================================

kable(performance_table,
      caption = "Table X: LASSO Poisson Model Performance",
      digits = 3,
      col.names = c("Dataset", "N", "RMSE", "MAE", "R²")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Table X: LASSO Poisson Model Performance
Dataset	N	RMSE	MAE	R²
Training	9614	1.138	0.918	0.669
Test	4100	1.152	0.928	0.668

# Top 8 predictors by absolute effect size
top_predictors_poisson <- coef_df %>%
  arrange(desc(abs(pct_change))) %>%
  head(8) %>%
  select(term, IRR, pct_change)

kable(top_predictors_poisson,
      caption = "Table Y: Top Predictors of Family Engagement",
      digits = 3,
      col.names = c("Predictor", "IRR", "% Change in Engagement")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  footnote(general = c(
    "IRR = Incidence Rate Ratio (multiplicative effect on expected count)",
    "Positive % = increases engagement; Negative % = decreases engagement"
  ))

Table Y: Top Predictors of Family Engagement
Predictor	IRR	% Change in Engagement
PARGRADEX	1.358	35.775
FSVOL	0.819	-18.112
parent_ed_x_involvement	0.826	-17.403
FSPTMTNG	0.826	-17.354
homework_intensity	1.130	13.007
communication_index	0.955	-4.476
enrichment_count	1.022	2.171
SPUBCHOIX	0.983	-1.732
Note:
IRR = Incidence Rate Ratio (multiplicative effect on expected count)
Positive % = increases engagement; Negative % = decreases engagement

Impact of School Choice and Family Engagement on K-12 Educational Success

Phase 3: Generalized Linear Model Development

2026-05-15