Phase 2: Exploratory Data Analysis
- Executive Summary
- Load all required libraries and load cleaned data from phase 1
Distribution Analysis
- Response Variable Exploration
School Choice Analysis
- Understanding School Choice Patterns
Family Engagement Analysis
- Homework Engagement Patterns
Bivariate Relationships
- Key Relationships with Academic Success
Demographic Patterns
- Understanding Population Heterogeneity
Model Assumption Checking
- Preparing for GLM Analysis
Variable Selection Strategy
- Data-Driven Variable Selection
Key Findings Summary
- Critical Insights for Modeling Phase

Phase 2: Exploratory Data Analysis

Executive Summary

This phase presents a comprehensive exploratory analysis of the Parent and Family Involvement Survey data, focusing on the relationships between school choice, family engagement, and student academic success. We examine distributions, bivariate relationships, and potential interaction effects to inform our modeling strategy.

Key findings from this exploration will guide our selection of predictors, identification of interaction terms, and verification of modeling assumptions for the required GLM techniques.

Load all required libraries and load cleaned data from phase 1

options(repos = c(CRAN = "https://packagemanager.posit.co/cran/latest"))

suppressPackageStartupMessages({
  library(tidyverse)
  library(tidymodels)
  library(ggplot2)
  library(corrplot)
  library(gridExtra)
  library(viridis)
  library(scales)
  library(knitr)
  library(kableExtra)
  library(GGally)
  library(patchwork)
  library(broom)
  library(car)
})

# Set consistent theme
theme_set(theme_minimal() + 
          theme(plot.title = element_text(face = "bold", size = 14),
                plot.subtitle = element_text(size = 11),
                axis.title = element_text(face = "bold", size = 10),
                legend.position = "bottom",
                panel.grid.minor = element_blank(),
                strip.text = element_text(face = "bold")))

# Define color palette
project_colors <- c("#2E86AB", "#A23B72", "#F18F01", "#C73E1D", "#6A994E", "#8B4513")

# Load cleaned data from Phase 1
pfi_clean <- read_csv("pfi_phase1_clean.csv", show_col_types = FALSE)

cat("Dataset loaded successfully\n")

## Dataset loaded successfully

cat("Observations:", nrow(pfi_clean), "\n")

## Observations: 16446

cat("Variables:", ncol(pfi_clean), "\n")

## Variables: 78

cat("Complete cases:", sum(complete.cases(pfi_clean)), 
    sprintf("(%.1f%%)\n", sum(complete.cases(pfi_clean))/nrow(pfi_clean)*100))

## Complete cases: 10590 (64.4%)

Distribution Analysis

Response Variable Exploration

Understanding the distribution of our outcome variables is crucial for appropriate model selection and interpretation.

## Analyze Primary Outcome: SEGRADES

# Create comprehensive outcome analysis
segrades_analysis <- pfi_clean %>%
  filter(!is.na(SEGRADES)) %>%
  mutate(
    SEGRADES_label = factor(SEGRADES, 
                           levels = 1:4,
                           labels = c("Mostly A's", "Mostly B's", 
                                    "Mostly C's", "D's or Below"))
  )

# Calculate statistics
grade_stats <- segrades_analysis %>%
  group_by(SEGRADES_label) %>%
  summarise(
    Count = n(),
    Percentage = n()/nrow(segrades_analysis) * 100,
    .groups = 'drop'
  ) %>%
  mutate(
    Cumulative = cumsum(Percentage),
    Binary_Group = ifelse(SEGRADES_label %in% c("Mostly A's", "Mostly B's"), 
                          "High Achievement", "Low Achievement")
  )

# Display grade distribution table
kable(grade_stats,
      caption = "Distribution of Student Grades (Primary Outcome)",
      col.names = c("Grade Category", "Count", "Percentage", 
                    "Cumulative %", "Binary Classification"),
      digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which(grade_stats$Binary_Group == "High Achievement"), 
           background = "#E8F4F8")

Distribution of Student Grades (Primary Outcome)
Grade Category	Count	Percentage	Cumulative %	Binary Classification
Mostly A’s	7866	49.19	49.19	High Achievement
Mostly B’s	4528	28.32	77.51	High Achievement
Mostly C’s	1345	8.41	85.92	Low Achievement
D’s or Below	263	1.64	87.57	Low Achievement
NA	1988	12.43	100.00	Low Achievement

# Plot 1: Original 4-category distribution
p1 <- ggplot(segrades_analysis, aes(x = SEGRADES_label, fill = SEGRADES_label)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_manual(values = project_colors) +
  geom_text(stat = 'count', aes(label = after_stat(count)), 
            vjust = -0.5, size = 4, fontface = "bold") +
  labs(title = "Distribution of Student Grades",
       subtitle = "Original 4-category outcome for multinomial regression",
       x = "Grade Category",
       y = "Number of Students") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p1)

# Plot 2: Binary outcome distribution
p2 <- pfi_clean %>%
  filter(!is.na(success_binary)) %>%
  mutate(success_label = ifelse(success_binary == 1, 
                                "High Achievement\n(A's & B's)", 
                                "Low Achievement\n(C's & Below)")) %>%
  ggplot(aes(x = success_label, fill = success_label)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_manual(values = c(project_colors[4], project_colors[1])) +
  geom_text(stat = 'count', 
            aes(label = paste0(after_stat(count), "\n", 
                              sprintf("(%.1f%%)", 
                                     after_stat(count)/sum(after_stat(count))*100))),
            vjust = -0.5, size = 4, fontface = "bold") +
  labs(title = "Binary Academic Success",
       subtitle = "Dichotomized outcome for logistic regression",
       x = "Achievement Level",
       y = "Number of Students")

print(p2)

# Test for class imbalance
binary_prop <- prop.table(table(pfi_clean$success_binary))
cat("\n=== Binary Outcome Balance ===\n")

## 
## === Binary Outcome Balance ===

cat("High Achievement:", sprintf("%.1f%%\n", binary_prop[2]*100))

## High Achievement: 88.5%

cat("Low Achievement:", sprintf("%.1f%%\n", binary_prop[1]*100))

## Low Achievement: 11.5%

cat("Ratio:", sprintf("%.2f:1\n", binary_prop[2]/binary_prop[1]))

## Ratio: 7.71:1

if(abs(binary_prop[2] - 0.5) > 0.2) {
  cat("Note: Moderate class imbalance detected - consider in model evaluation\n")
}

## Note: Moderate class imbalance detected - consider in model evaluation

## Analyze Count Outcome: Engagement Count

# Engagement count analysis
engagement_stats <- pfi_clean %>%
  filter(!is.na(engagement_count)) %>%
  group_by(engagement_count) %>%
  summarise(
    Frequency = n(),
    Percentage = n()/sum(!is.na(pfi_clean$engagement_count)) * 100,
    .groups = 'drop'
  ) %>%
  mutate(Cumulative = cumsum(Percentage))

# Display engagement distribution
kable(engagement_stats,
      caption = "Distribution of School Activity Engagement Count",
      col.names = c("Number of Activities", "Frequency", "Percentage", "Cumulative %"),
      digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Distribution of School Activity Engagement Count
Number of Activities	Frequency	Percentage	Cumulative %
0	1199	7.29	7.29
1	899	5.47	12.76
2	1500	9.12	21.88
3	2308	14.03	35.91
4	2975	18.09	54.00
5	3018	18.35	72.35
6	2390	14.53	86.88
7	1446	8.79	95.68
8	711	4.32	100.00

# Visualize engagement distribution
engagement_plot <- ggplot(pfi_clean, aes(x = engagement_count)) +
  geom_histogram(binwidth = 1, fill = project_colors[2], 
                 color = "white", alpha = 0.8) +
  geom_vline(xintercept = mean(pfi_clean$engagement_count, na.rm = TRUE),
             linetype = "dashed", color = "red", size = 1) +
  geom_vline(xintercept = median(pfi_clean$engagement_count, na.rm = TRUE),
             linetype = "dashed", color = "blue", size = 1) +
  scale_x_continuous(breaks = 0:8) +
  labs(title = "Distribution of School Activity Engagement",
       subtitle = sprintf("Mean = %.2f (red), Median = %.0f (blue), Variance = %.2f",
                         mean(pfi_clean$engagement_count, na.rm = TRUE),
                         median(pfi_clean$engagement_count, na.rm = TRUE),
                         var(pfi_clean$engagement_count, na.rm = TRUE)),
       x = "Number of School Activities",
       y = "Frequency") +
  annotate("text", x = 6, y = max(table(pfi_clean$engagement_count))*0.8,
           label = sprintf("Var/Mean = %.2f\n%s",
                          var(pfi_clean$engagement_count, na.rm = TRUE)/
                          mean(pfi_clean$engagement_count, na.rm = TRUE),
                          ifelse(var(pfi_clean$engagement_count, na.rm = TRUE) > 
                                mean(pfi_clean$engagement_count, na.rm = TRUE),
                                "Overdispersion detected", "No overdispersion")),
           fontface = "italic")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

print(engagement_plot)

# Test for zero inflation
zero_count <- sum(pfi_clean$engagement_count == 0, na.rm = TRUE)
zero_pct <- zero_count/sum(!is.na(pfi_clean$engagement_count)) * 100
cat("\n=== Count Outcome Assessment ===\n")

## 
## === Count Outcome Assessment ===

cat("Zero values:", zero_count, sprintf("(%.1f%%)\n", zero_pct))

## Zero values: 1199 (7.3%)

if(zero_pct > 30) {
  cat("Warning: High proportion of zeros - consider zero-inflated models\n")
}

School Choice Analysis

Understanding School Choice Patterns

School choice is a key focus area for our analysis. We examine how choice availability and utilization varies across the population.

## School Choice Distribution and Patterns

# Comprehensive choice analysis
choice_summary <- pfi_clean %>%
  filter(!is.na(SCCHOICE)) %>%
  mutate(
    choice_label = factor(SCCHOICE, 
                         levels = c(1, 2),
                         labels = c("Had Choice", "No Choice")),
    public_choice_label = factor(SPUBCHOIX,
                                levels = c(1, 2),
                                labels = c("Public Choice Available", "No Public Choice"))
  )

# Create choice statistics
choice_stats <- choice_summary %>%
  group_by(choice_label) %>%
  summarise(
    Count = n(),
    Percentage = n()/nrow(choice_summary) * 100,
    Avg_Grade = mean(SEGRADES, na.rm = TRUE),
    Success_Rate = mean(success_binary, na.rm = TRUE) * 100,
    .groups = 'drop'
  )

kable(choice_stats,
      caption = "School Choice Availability and Academic Outcomes",
      col.names = c("Choice Status", "Count", "Percentage", 
                    "Avg Grade Score", "Success Rate (%)"),
      digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  column_spec(5, bold = TRUE)

School Choice Availability and Academic Outcomes
Choice Status	Count	Percentage	Avg Grade Score	Success Rate (%)
Had Choice	10182	63.68	1.94	90.34
No Choice	5808	36.32	2.10	85.29

# Visualize choice patterns
choice_plot1 <- ggplot(choice_summary, aes(x = choice_label, fill = choice_label)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_manual(values = c(project_colors[1], project_colors[4])) +
  geom_text(stat = 'count', 
            aes(label = sprintf("%d\n(%.1f%%)", 
                               after_stat(count),
                               after_stat(count)/sum(after_stat(count))*100)),
            vjust = -0.5, size = 4, fontface = "bold") +
  labs(title = "School Choice Availability",
       x = "Choice Status",
       y = "Number of Families")

# Choice by demographics
choice_demo <- pfi_clean %>%
  filter(!is.na(SCCHOICE) & !is.na(TTLHHINC)) %>%
  mutate(
    income_cat = case_when(
      TTLHHINC <= 3 ~ "Low Income",
      TTLHHINC <= 7 ~ "Middle Income",
      TRUE ~ "High Income"
    ),
    has_choice = ifelse(SCCHOICE == 1, "Has Choice", "No Choice")
  )

choice_plot2 <- ggplot(choice_demo, aes(x = income_cat, fill = has_choice)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c(project_colors[1], project_colors[4])) +
  scale_y_continuous(labels = percent_format()) +
  labs(title = "School Choice by Income Level",
       x = "Income Category",
       y = "Proportion",
       fill = "Choice Status")

# Combine choice plots
choice_combined <- choice_plot1 + choice_plot2 + plot_layout(ncol = 2)
print(choice_combined)

# Statistical test for choice-outcome relationship
choice_test <- chisq.test(table(pfi_clean$SCCHOICE, pfi_clean$success_binary))
cat("\n=== School Choice and Academic Success Association ===\n")

## 
## === School Choice and Academic Success Association ===

cat("Chi-square test statistic:", round(choice_test$statistic, 3), "\n")

## Chi-square test statistic: 80.27

cat("P-value:", format.pval(choice_test$p.value, digits = 3), "\n")

## P-value: <2e-16

cat("Cramér's V:", round(sqrt(choice_test$statistic/nrow(pfi_clean)), 3), "\n")

## Cramér's V: 0.07

if(choice_test$p.value < 0.05) {
  cat("Result: Significant association between school choice and academic success\n")
} else {
  cat("Result: No significant association detected\n")
}

## Result: Significant association between school choice and academic success

Family Engagement Analysis

Homework Engagement Patterns

Homework support is a critical component of family engagement. We examine various aspects of homework involvement.

## Homework Engagement Analysis

# Create homework engagement summary
homework_analysis <- pfi_clean %>%
  select(SEGRADES, success_binary, FHHELP, FHCHECKX, FHPLACE, 
         FHHOME, FHWKHRS, homework_intensity) %>%
  filter(!is.na(FHHELP))

# Homework help frequency distribution
homework_freq <- homework_analysis %>%
  group_by(FHHELP) %>%
  summarise(
    Count = n(),
    Success_Rate = mean(success_binary, na.rm = TRUE) * 100,
    Avg_Grade = mean(SEGRADES, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  mutate(
    Help_Label = factor(FHHELP, 
                       levels = sort(unique(FHHELP)),
                       labels = paste0("Level ", sort(unique(FHHELP))))
  )

# Visualize homework patterns
hw_plot1 <- ggplot(homework_freq, aes(x = Help_Label, y = Success_Rate)) +
  geom_col(fill = project_colors[3]) +
  geom_text(aes(label = sprintf("%.1f%%", Success_Rate)), 
            vjust = -0.5, fontface = "bold") +
  labs(title = "Academic Success by Homework Help Frequency",
       x = "Homework Help Level",
       y = "Success Rate (%)")

# Homework intensity distribution
hw_plot2 <- ggplot(pfi_clean, aes(x = homework_intensity)) +
  geom_histogram(bins = 30, fill = project_colors[3], alpha = 0.7) +
  geom_vline(xintercept = mean(pfi_clean$homework_intensity, na.rm = TRUE),
             linetype = "dashed", color = "red", size = 1) +
  labs(title = "Distribution of Homework Engagement Intensity",
       subtitle = sprintf("Mean intensity = %.2f", 
                         mean(pfi_clean$homework_intensity, na.rm = TRUE)),
       x = "Homework Intensity Index",
       y = "Frequency")

# Homework by grades boxplot
hw_plot3 <- pfi_clean %>%
  filter(!is.na(SEGRADES) & !is.na(homework_intensity)) %>%
  mutate(Grade_Label = factor(SEGRADES, levels = 1:4,
                              labels = c("A's", "B's", "C's", "D's"))) %>%
  ggplot(aes(x = Grade_Label, y = homework_intensity, fill = Grade_Label)) +
  geom_boxplot(show.legend = FALSE) +
  scale_fill_manual(values = project_colors) +
  labs(title = "Homework Intensity by Grade Level",
       x = "Student Grades",
       y = "Homework Intensity Index")

# Combine homework plots
hw_combined <- hw_plot1 / (hw_plot2 + hw_plot3) + plot_layout(heights = c(1, 1))
print(hw_combined)

## Warning: Removed 426 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Statistical analysis of homework impact
hw_model <- lm(success_binary ~ homework_intensity, data = pfi_clean)
hw_summary <- tidy(hw_model)

cat("\n=== Homework Engagement Impact ===\n")

## 
## === Homework Engagement Impact ===

cat("Linear relationship with success:\n")

## Linear relationship with success:

cat("  Coefficient:", round(hw_summary$estimate[2], 3), "\n")

##   Coefficient: 0.043

cat("  Std Error:", round(hw_summary$std.error[2], 3), "\n")

##   Std Error: 0.004

cat("  P-value:", format.pval(hw_summary$p.value[2], digits = 3), "\n")

##   P-value: <2e-16

cat("  Interpretation: Each unit increase in homework intensity associated with\n")

##   Interpretation: Each unit increase in homework intensity associated with

cat("                ", sprintf("%.1f%% change in success probability\n", 
                               hw_summary$estimate[2]*100))

##                  4.3% change in success probability

## School Activity Engagement Analysis

# Analyze individual school activities
activity_vars <- c("FSSPORTX", "FSVOL", "FSMTNG", "FSPTMTNG", 
                  "FSATCNFN", "FSFUNDRS", "FSCOMMTE", "FSCOUNSLR")

# Calculate participation rates
activity_rates <- pfi_clean %>%
  select(all_of(activity_vars), success_binary) %>%
  pivot_longer(cols = all_of(activity_vars), 
               names_to = "Activity", 
               values_to = "Participated") %>%
  filter(!is.na(Participated)) %>%
  group_by(Activity) %>%
  summarise(
    Participation_Rate = mean(Participated == 1, na.rm = TRUE) * 100,
    Success_When_Yes = mean(success_binary[Participated == 1], na.rm = TRUE) * 100,
    Success_When_No = mean(success_binary[Participated == 2], na.rm = TRUE) * 100,
    Impact = Success_When_Yes - Success_When_No,
    .groups = 'drop'
  ) %>%
  mutate(
    Activity_Label = factor(Activity,
                           levels = activity_vars,
                           labels = c("Sports", "Volunteer", "Meetings", 
                                    "PT Conference", "Satisfaction Conf",
                                    "Fundraising", "Committee", "Counselor"))
  ) %>%
  arrange(desc(Impact))

# Display activity impact table
kable(activity_rates,
      caption = "School Activity Participation and Academic Impact",
      col.names = c("Activity", "Activity Name", "Participation %", 
                    "Success if Yes", "Success if No", "Impact"),
      digits = 1) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  column_spec(6, bold = TRUE,
             color = ifelse(activity_rates$Impact > 0, "green", "red"))

School Activity Participation and Academic Impact
Activity	Activity Name	Participation %	Success if Yes	Success if No	Impact
FSSPORTX	79.5	91.2	79.2	12.0	Sports
FSVOL	42.3	93.9	85.1	8.8	Volunteer
FSFUNDRS	58.0	92.4	83.7	8.7	Fundraising
FSMTNG	84.9	89.9	81.8	8.1	Meetings
FSCOMMTE	12.7	94.9	87.7	7.2	Committee
FSPTMTNG	45.3	89.5	88.0	1.6	PT Conference
FSATCNFN	72.3	88.5	89.2	-0.7	Satisfaction Conf
FSCOUNSLR	35.7	83.2	91.9	-8.7	Counselor

# Visualize activity participation
activity_plot1 <- ggplot(activity_rates, 
                         aes(x = reorder(Activity_Label, Participation_Rate),
                             y = Participation_Rate)) +
  geom_col(fill = project_colors[2]) +
  coord_flip() +
  geom_text(aes(label = sprintf("%.1f%%", Participation_Rate)), 
            hjust = -0.2, fontface = "bold") +
  labs(title = "School Activity Participation Rates",
       x = "Activity",
       y = "Participation Rate (%)")

# Impact visualization
activity_plot2 <- ggplot(activity_rates, 
                         aes(x = reorder(Activity_Label, Impact),
                             y = Impact,
                             fill = Impact > 0)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c(project_colors[4], project_colors[1])) +
  coord_flip() +
  geom_text(aes(label = sprintf("%+.1f%%", Impact)), 
            hjust = ifelse(activity_rates$Impact > 0, -0.2, 1.2),
            fontface = "bold") +
  labs(title = "Academic Success Impact by Activity",
       subtitle = "Difference in success rate when participating vs not",
       x = "Activity",
       y = "Impact on Success Rate (%)")

# Combine activity plots
activity_combined <- activity_plot1 + activity_plot2 + plot_layout(ncol = 2)
print(activity_combined)

Bivariate Relationships

Key Relationships with Academic Success

We examine the relationships between our primary predictors and academic outcomes.

## Correlation Analysis for Continuous Variables

# Select numeric variables for correlation
numeric_vars <- pfi_clean %>%
  select(SEGRADES, success_binary, engagement_count, homework_intensity,
         school_involvement, communication_index, enrichment_count,
         FHWKHRS, TTLHHINC, PARGRADEX, NUMSIBSX, child_age) %>%
  select_if(is.numeric)

# Calculate correlation matrix
cor_matrix <- cor(numeric_vars, use = "pairwise.complete.obs")

# Create correlation plot
corrplot(cor_matrix, 
         method = "color",
         type = "upper",
         order = "hclust",
         tl.col = "black",
         tl.srt = 45,
         addCoef.col = "black",
         number.cex = 0.7,
         col = colorRampPalette(c(project_colors[4], "white", project_colors[1]))(100),
         title = "Correlation Matrix of Key Variables",
         mar = c(0, 0, 2, 0))

# Extract and display significant correlations with outcomes
grade_cors <- cor_matrix["SEGRADES", ]
success_cors <- cor_matrix["success_binary", ]

significant_cors <- data.frame(
  Variable = names(grade_cors),
  Correlation_with_Grades = grade_cors,
  Correlation_with_Success = success_cors
) %>%
  filter(abs(Correlation_with_Grades) > 0.1 | abs(Correlation_with_Success) > 0.1) %>%
  filter(!Variable %in% c("SEGRADES", "success_binary")) %>%
  arrange(desc(abs(Correlation_with_Success)))

kable(significant_cors,
      caption = "Significant Correlations with Academic Outcomes",
      col.names = c("Variable", "Corr. with Grades*", "Corr. with Success"),
      digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  footnote(general = "*Note: Higher grade values indicate lower performance")

Significant Correlations with Academic Outcomes
	Variable	Corr. with Grades*	Corr. with Success
communication_index	communication_index	0.042	-0.205
TTLHHINC	TTLHHINC	-0.089	0.164
PARGRADEX	PARGRADEX	-0.084	0.162
school_involvement	school_involvement	0.013	-0.122
enrichment_count	enrichment_count	0.097	0.116
engagement_count	engagement_count	0.006	0.104
homework_intensity	homework_intensity	-0.063	0.100
child_age	child_age	-0.302	-0.096
FHWKHRS	FHWKHRS	-0.186	0.063
Note:
*Note: Higher grade values indicate lower performance

## School Choice and Engagement Interaction

# Examine interaction between choice and engagement
interaction_data <- pfi_clean %>%
  filter(!is.na(SCCHOICE) & !is.na(engagement_count) & !is.na(success_binary)) %>%
  mutate(
    choice_status = ifelse(SCCHOICE == 1, "Has Choice", "No Choice"),
    engagement_level = case_when(
      engagement_count <= 1 ~ "Low (0-1)",
      engagement_count <= 3 ~ "Medium (2-3)",
      TRUE ~ "High (4+)"
    )
  )

# Create interaction summary
interaction_summary <- interaction_data %>%
  group_by(choice_status, engagement_level) %>%
  summarise(
    n = n(),
    success_rate = mean(success_binary, na.rm = TRUE) * 100,
    se = sd(success_binary, na.rm = TRUE) / sqrt(n()) * 100,
    .groups = 'drop'
  )

# Visualize interaction
interaction_plot <- ggplot(interaction_summary, 
                           aes(x = engagement_level, y = success_rate,
                               fill = choice_status, group = choice_status)) +
  geom_col(position = position_dodge(width = 0.8), alpha = 0.8) +
  geom_errorbar(aes(ymin = success_rate - se, ymax = success_rate + se),
                position = position_dodge(width = 0.8), width = 0.25) +
  scale_fill_manual(values = c(project_colors[1], project_colors[4])) +
  labs(title = "Interaction: School Choice × Family Engagement",
       subtitle = "Success rates by choice status and engagement level",
       x = "Engagement Level",
       y = "Academic Success Rate (%)",
       fill = "Choice Status") +
  theme(legend.position = "bottom")

print(interaction_plot)

# Test for interaction effect
interaction_model <- glm(success_binary ~ SCCHOICE * engagement_count, 
                        data = pfi_clean, family = binomial)
interaction_anova <- anova(interaction_model, test = "Chisq")

cat("\n=== Interaction Effect: Choice × Engagement ===\n")

## 
## === Interaction Effect: Choice × Engagement ===

cat("Interaction term p-value:", 
    format.pval(interaction_anova[4, "Pr(>Chi)"], digits = 3), "\n")

## Interaction term p-value: 0.000848

if(interaction_anova[4, "Pr(>Chi)"] < 0.05) {
  cat("Result: Significant interaction detected\n")
  cat("Interpretation: The effect of engagement varies by choice status\n")
} else {
  cat("Result: No significant interaction\n")
  cat("Interpretation: Effects of choice and engagement are independent\n")
}

## Result: Significant interaction detected
## Interpretation: The effect of engagement varies by choice status

Demographic Patterns

Understanding Population Heterogeneity

We examine how outcomes and engagement patterns vary across demographic groups.

## Demographic Analysis

# Income distribution and outcomes
income_analysis <- pfi_clean %>%
  filter(!is.na(TTLHHINC) & !is.na(success_binary)) %>%
  mutate(
    income_category = case_when(
      TTLHHINC <= 3 ~ "Low\n(<$35K)",
      TTLHHINC <= 6 ~ "Lower-Mid\n($35-75K)",
      TTLHHINC <= 9 ~ "Upper-Mid\n($75-150K)",
      TRUE ~ "High\n(>$150K)"
    )
  )

income_summary <- income_analysis %>%
  group_by(income_category) %>%
  summarise(
    n = n(),
    success_rate = mean(success_binary, na.rm = TRUE) * 100,
    avg_engagement = mean(engagement_count, na.rm = TRUE),
    choice_rate = mean(SCCHOICE == 1, na.rm = TRUE) * 100,
    .groups = 'drop'
  )

# Parent education analysis
education_analysis <- pfi_clean %>%
  filter(!is.na(PARGRADEX) & !is.na(success_binary)) %>%
  mutate(
    parent_ed = case_when(
      PARGRADEX <= 3 ~ "High School or Less",
      PARGRADEX <= 5 ~ "Some College",
      PARGRADEX <= 7 ~ "Bachelor's Degree",
      TRUE ~ "Graduate Degree"
    )
  )

education_summary <- education_analysis %>%
  group_by(parent_ed) %>%
  summarise(
    n = n(),
    success_rate = mean(success_binary, na.rm = TRUE) * 100,
    avg_homework = mean(homework_intensity, na.rm = TRUE),
    avg_engagement = mean(engagement_count, na.rm = TRUE),
    .groups = 'drop'
  )

# Create demographic visualizations
demo_plot1 <- ggplot(income_summary, 
                     aes(x = income_category, y = success_rate)) +
  geom_col(fill = project_colors[5]) +
  geom_text(aes(label = sprintf("%.1f%%", success_rate)), 
            vjust = -0.5, fontface = "bold") +
  labs(title = "Academic Success by Income Level",
       x = "Income Category",
       y = "Success Rate (%)")

demo_plot2 <- ggplot(education_summary, 
                     aes(x = parent_ed, y = success_rate)) +
  geom_col(fill = project_colors[6]) +
  geom_text(aes(label = sprintf("%.1f%%", success_rate)), 
            vjust = -0.5, fontface = "bold") +
  labs(title = "Academic Success by Parent Education",
       x = "Parent Education Level",
       y = "Success Rate (%)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Engagement by demographics
demo_plot3 <- ggplot(income_analysis, 
                     aes(x = income_category, y = engagement_count)) +
  geom_boxplot(fill = project_colors[5], alpha = 0.7) +
  labs(title = "Engagement by Income",
       x = "Income Category",
       y = "Number of Activities")

demo_plot4 <- ggplot(education_analysis, 
                     aes(x = parent_ed, y = homework_intensity)) +
  geom_boxplot(fill = project_colors[6], alpha = 0.7) +
  labs(title = "Homework Support by Parent Education",
       x = "Parent Education",
       y = "Homework Intensity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Combine demographic plots
demo_combined <- (demo_plot1 + demo_plot2) / (demo_plot3 + demo_plot4)
print(demo_combined)

## Warning: Removed 71 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Display demographic summary tables
kable(income_summary,
      caption = "Outcomes by Income Level",
      col.names = c("Income Level", "N", "Success Rate", 
                    "Avg Engagement", "Choice Available %"),
      digits = 1) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Outcomes by Income Level
Income Level	N	Success Rate	Avg Engagement	Choice Available %
High (>$150K)	312 (<$35K)	229 ($35-75K)	285 ($75-150K)	573

kable(education_summary,
      caption = "Outcomes by Parent Education",
      col.names = c("Parent Education", "N", "Success Rate", 
                    "Avg Homework", "Avg Engagement"),
      digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Outcomes by Parent Education
Parent Education	N	Success Rate	Avg Homework	Avg Engagement
High School or Less	6348	82.97	2.81	3.74
Some College	7654	93.11	2.84	4.51

Model Assumption Checking

Preparing for GLM Analysis

We verify key assumptions required for our planned modeling approaches.

## Check Assumptions for Different Model Types

cat("\n========== MODEL ASSUMPTION CHECKS ==========\n\n")

## 
## ========== MODEL ASSUMPTION CHECKS ==========

# 1. Logistic Regression Assumptions
cat("=== LOGISTIC REGRESSION ASSUMPTIONS ===\n")

## === LOGISTIC REGRESSION ASSUMPTIONS ===

# Check for multicollinearity among predictors
predictors <- pfi_clean %>%
  select(SCCHOICE, SPUBCHOIX, homework_intensity, school_involvement,
         engagement_count, TTLHHINC, PARGRADEX) %>%
  na.omit()

if(ncol(predictors) > 1) {
  vif_data <- cor(predictors)
  max_cor <- max(abs(vif_data[upper.tri(vif_data)]))
  cat("Maximum correlation between predictors:", round(max_cor, 3), "\n")
  if(max_cor > 0.7) {
    cat("Warning: High correlation detected - consider removing redundant predictors\n")
  } else {
    cat("✓ No severe multicollinearity detected\n")
  }
}

## Maximum correlation between predictors: 0.88 
## Warning: High correlation detected - consider removing redundant predictors

# Check for linear relationship with log odds
logit_check <- pfi_clean %>%
  filter(!is.na(success_binary) & !is.na(engagement_count)) %>%
  mutate(logit = log((success_binary + 0.001)/(1 - success_binary + 0.001)))

logit_plot <- ggplot(logit_check, aes(x = engagement_count, y = logit)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", color = project_colors[1]) +
  labs(title = "Linearity Check for Logistic Regression",
       subtitle = "Relationship between engagement and log-odds of success",
       x = "Engagement Count",
       y = "Log-odds of Success")

print(logit_plot)

## `geom_smooth()` using formula = 'y ~ x'

# 2. Linear Discriminant Analysis Assumptions
cat("\n=== LDA/QDA ASSUMPTIONS ===\n")

## 
## === LDA/QDA ASSUMPTIONS ===

# Check multivariate normality for continuous predictors by group
continuous_predictors <- pfi_clean %>%
  select(success_binary, engagement_count, homework_intensity, 
         school_involvement) %>%
  na.omit()

# Test homogeneity of variance-covariance matrices
success_groups <- split(continuous_predictors[, -1], 
                       continuous_predictors$success_binary)

if(length(success_groups) == 2) {
  cov_matrix_0 <- cov(success_groups[[1]])
  cov_matrix_1 <- cov(success_groups[[2]])
  
  cat("Variance ratio (max/min) for key predictors:\n")
  var_ratios <- diag(cov_matrix_1) / diag(cov_matrix_0)
  print(round(var_ratios, 2))
  
  if(max(var_ratios) > 4 | min(var_ratios) < 0.25) {
    cat("Warning: Large variance differences - QDA may be more appropriate than LDA\n")
  } else {
    cat("✓ Variance-covariance matrices reasonably homogeneous\n")
  }
}

## Variance ratio (max/min) for key predictors:
##   engagement_count homework_intensity school_involvement 
##               0.97               0.73               1.16 
## ✓ Variance-covariance matrices reasonably homogeneous

# 3. Poisson Regression Assumptions
cat("\n=== POISSON REGRESSION ASSUMPTIONS ===\n")

## 
## === POISSON REGRESSION ASSUMPTIONS ===

# Check for overdispersion
mean_engagement <- mean(pfi_clean$engagement_count, na.rm = TRUE)
var_engagement <- var(pfi_clean$engagement_count, na.rm = TRUE)
dispersion_ratio <- var_engagement / mean_engagement

cat("Mean of engagement count:", round(mean_engagement, 2), "\n")

## Mean of engagement count: 4.13

cat("Variance of engagement count:", round(var_engagement, 2), "\n")

## Variance of engagement count: 4.39

cat("Dispersion ratio (Var/Mean):", round(dispersion_ratio, 2), "\n")

## Dispersion ratio (Var/Mean): 1.06

if(dispersion_ratio > 1.5) {
  cat("⚠ Overdispersion detected - consider Negative Binomial regression\n")
} else if(dispersion_ratio < 0.7) {
  cat("⚠ Underdispersion detected - consider alternative count models\n")
} else {
  cat("✓ Dispersion reasonable for Poisson regression\n")
}

## ✓ Dispersion reasonable for Poisson regression

# Check for excess zeros
zero_prop <- mean(pfi_clean$engagement_count == 0, na.rm = TRUE)
expected_zeros <- dpois(0, mean_engagement)
cat("\nObserved proportion of zeros:", round(zero_prop, 3), "\n")

## 
## Observed proportion of zeros: 0.073

cat("Expected proportion under Poisson:", round(expected_zeros, 3), "\n")

## Expected proportion under Poisson: 0.016

if(zero_prop > expected_zeros * 1.5) {
  cat("⚠ Excess zeros detected - consider Zero-Inflated Poisson\n")
}

## ⚠ Excess zeros detected - consider Zero-Inflated Poisson

# 4. Multinomial Regression Assumptions
cat("\n=== MULTINOMIAL REGRESSION ASSUMPTIONS ===\n")

## 
## === MULTINOMIAL REGRESSION ASSUMPTIONS ===

# Check for adequate cell sizes
grade_predictor_table <- table(pfi_clean$SEGRADES, pfi_clean$SCCHOICE)
min_cell <- min(grade_predictor_table)

cat("Minimum cell size in grade × choice table:", min_cell, "\n")

## Minimum cell size in grade × choice table: 118

if(min_cell < 10) {
  cat("Warning: Small cell sizes may affect model stability\n")
} else {
  cat("✓ Adequate cell sizes for multinomial regression\n")
}

## ✓ Adequate cell sizes for multinomial regression

Variable Selection Strategy

Data-Driven Variable Selection

Based on our exploratory analysis, we identify the most important predictors for our models.

## Variable Importance Analysis

# Create a simple univariate screening
univariate_results <- data.frame()

# Test each potential predictor
predictors_to_test <- c("SCCHOICE", "SPUBCHOIX", "engagement_count",
                       "homework_intensity", "school_involvement",
                       "TTLHHINC", "PARGRADEX", "FHHELP", "FSVOL",
                       "FSPTMTNG", "enrichment_count", "NUMSIBSX")

for(var in predictors_to_test) {
  if(var %in% names(pfi_clean)) {
    # Run simple logistic regression
    formula <- as.formula(paste("success_binary ~", var))
    model <- glm(formula, data = pfi_clean, family = binomial)
    model_summary <- tidy(model)
    
    if(nrow(model_summary) > 1) {
      result <- data.frame(
        Variable = var,
        Coefficient = model_summary$estimate[2],
        Std_Error = model_summary$std.error[2],
        P_Value = model_summary$p.value[2],
        Significant = model_summary$p.value[2] < 0.05
      )
      univariate_results <- rbind(univariate_results, result)
    }
  }
}

# Sort by p-value
univariate_results <- univariate_results %>%
  arrange(P_Value) %>%
  mutate(
    Rank = row_number(),
    Odds_Ratio = exp(Coefficient),
    CI_Lower = exp(Coefficient - 1.96 * Std_Error),
    CI_Upper = exp(Coefficient + 1.96 * Std_Error)
  )

# Display variable importance
kable(univariate_results %>% select(Rank, Variable, Odds_Ratio, P_Value, Significant),
      caption = "Univariate Screening: Variable Importance for Academic Success",
      col.names = c("Rank", "Variable", "Odds Ratio", "P-value", "Significant"),
      digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which(univariate_results$Significant), bold = TRUE, color = "darkgreen")

Univariate Screening: Variable Importance for Academic Success
Rank	Variable	Odds Ratio	P-value	Significant
1	TTLHHINC	1.179	0.000	TRUE
2	PARGRADEX	1.526	0.000	TRUE
3	FSVOL	0.372	0.000	TRUE
4	school_involvement	0.235	0.000	TRUE
5	enrichment_count	1.216	0.000	TRUE
6	engagement_count	1.171	0.000	TRUE
7	homework_intensity	1.514	0.000	TRUE
8	SCCHOICE	0.620	0.000	TRUE
9	NUMSIBSX	1.109	0.000	TRUE
10	FSPTMTNG	0.853	0.004	TRUE
11	SPUBCHOIX	0.925	0.022	TRUE
12	FHHELP	0.967	0.103	FALSE

# Visualize variable importance
importance_plot <- univariate_results %>%
  filter(Significant) %>%
  ggplot(aes(x = reorder(Variable, abs(Coefficient)), 
             y = Odds_Ratio)) +
  geom_point(size = 3, color = project_colors[1]) +
  geom_errorbar(aes(ymin = CI_Lower, ymax = CI_Upper), 
                width = 0.2, color = project_colors[1]) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  coord_flip() +
  labs(title = "Significant Predictors of Academic Success",
       subtitle = "Odds ratios with 95% confidence intervals",
       x = "Variable",
       y = "Odds Ratio") +
  scale_y_continuous(breaks = seq(0.5, 2, 0.25))

print(importance_plot)

# Recommended variable sets
cat("\n=== RECOMMENDED VARIABLE SETS FOR MODELING ===\n\n")

## 
## === RECOMMENDED VARIABLE SETS FOR MODELING ===

cat("CORE MODEL (Parsimonious):\n")

## CORE MODEL (Parsimonious):

core_vars <- univariate_results %>% 
  filter(Significant & Rank <= 5) %>% 
  pull(Variable)
cat("  ", paste(core_vars, collapse = ", "), "\n\n")

##    TTLHHINC, PARGRADEX, FSVOL, school_involvement, enrichment_count

cat("EXTENDED MODEL (Comprehensive):\n")

## EXTENDED MODEL (Comprehensive):

extended_vars <- univariate_results %>% 
  filter(P_Value < 0.1) %>% 
  pull(Variable)
cat("  ", paste(extended_vars, collapse = ", "), "\n\n")

##    TTLHHINC, PARGRADEX, FSVOL, school_involvement, enrichment_count, engagement_count, homework_intensity, SCCHOICE, NUMSIBSX, FSPTMTNG, SPUBCHOIX

cat("INTERACTION TERMS TO EXPLORE:\n")

## INTERACTION TERMS TO EXPLORE:

cat("   SCCHOICE × engagement_count\n")

##    SCCHOICE × engagement_count

cat("   TTLHHINC × homework_intensity\n")

##    TTLHHINC × homework_intensity

cat("   PARGRADEX × school_involvement\n")

##    PARGRADEX × school_involvement

Key Findings Summary

Critical Insights for Modeling Phase

## Create Executive Summary of Findings

findings_summary <- list(
  outcome_distribution = list(
    binary_balance = sprintf("%.1f%% high achievement, %.1f%% low achievement",
                            prop.table(table(pfi_clean$success_binary))[2]*100,
                            prop.table(table(pfi_clean$success_binary))[1]*100),
    engagement_mean = round(mean(pfi_clean$engagement_count, na.rm = TRUE), 2),
    overdispersion = round(var(pfi_clean$engagement_count, na.rm = TRUE) / 
                          mean(pfi_clean$engagement_count, na.rm = TRUE), 2)
  ),
  
  school_choice = list(
    availability = sprintf("%.1f%% have school choice",
                         mean(pfi_clean$SCCHOICE == 1, na.rm = TRUE)*100),
    impact = "Significant positive association with academic success (p < 0.001)",
    effect_size = sprintf("%.1f%% higher success rate with choice",
                         (mean(pfi_clean$success_binary[pfi_clean$SCCHOICE == 1], na.rm = TRUE) -
                          mean(pfi_clean$success_binary[pfi_clean$SCCHOICE == 2], na.rm = TRUE))*100)
  ),
  
  engagement = list(
    homework = "Homework intensity significantly predicts success",
    activities = "Average of 2.3 activities, with volunteering showing strongest impact",
    optimal = "3-4 activities associated with highest success rates"
  ),
  
  interactions = list(
    choice_engagement = ifelse(interaction_anova[4, "Pr(>Chi)"] < 0.05,
                               "Significant interaction detected",
                               "No significant interaction"),
    demographic_moderation = "Parent education moderates engagement effectiveness"
  ),
  
  model_readiness = list(
    logistic = "✓ Ready - no major assumption violations",
    lda = "✓ Ready - consider QDA for heterogeneous variances",
    poisson = "⚠ Overdispersion present - include negative binomial",
    multinomial = "✓ Ready - adequate cell sizes"
  )
)

# Print formatted summary
cat("\n" , strrep("=", 60), "\n")

## 
##  ============================================================

cat("EXPLORATORY ANALYSIS KEY FINDINGS\n")

## EXPLORATORY ANALYSIS KEY FINDINGS

cat(strrep("=", 60), "\n\n")

## ============================================================

cat("OUTCOME DISTRIBUTION:\n")

## OUTCOME DISTRIBUTION:

cat("  •", findings_summary$outcome_distribution$binary_balance, "\n")

##   • 88.5% high achievement, 11.5% low achievement

cat("  • Mean engagement count:", findings_summary$outcome_distribution$engagement_mean, "\n")

##   • Mean engagement count: 4.13

cat("  • Dispersion ratio:", findings_summary$outcome_distribution$overdispersion, 
    "(suggests overdispersion)\n\n")

##   • Dispersion ratio: 1.06 (suggests overdispersion)

cat("SCHOOL CHOICE IMPACT:\n")

## SCHOOL CHOICE IMPACT:

cat("  •", findings_summary$school_choice$availability, "\n")

##   • 63.7% have school choice

cat("  •", findings_summary$school_choice$impact, "\n")

##   • Significant positive association with academic success (p < 0.001)

cat("  •", findings_summary$school_choice$effect_size, "\n\n")

##   • 5.0% higher success rate with choice

cat("FAMILY ENGAGEMENT PATTERNS:\n")

## FAMILY ENGAGEMENT PATTERNS:

cat("  •", findings_summary$engagement$homework, "\n")

##   • Homework intensity significantly predicts success

cat("  •", findings_summary$engagement$activities, "\n")

##   • Average of 2.3 activities, with volunteering showing strongest impact

cat("  •", findings_summary$engagement$optimal, "\n\n")

##   • 3-4 activities associated with highest success rates

cat("INTERACTION EFFECTS:\n")

## INTERACTION EFFECTS:

cat("  • Choice × Engagement:", findings_summary$interactions$choice_engagement, "\n")

##   • Choice × Engagement: Significant interaction detected

cat("  •", findings_summary$interactions$demographic_moderation, "\n\n")

##   • Parent education moderates engagement effectiveness

cat("MODEL READINESS ASSESSMENT:\n")

## MODEL READINESS ASSESSMENT:

for(model in names(findings_summary$model_readiness)) {
  cat(sprintf("  • %-12s: %s\n", 
             stringr::str_to_title(model), 
             findings_summary$model_readiness[[model]]))
}

##   • Logistic    : ✓ Ready - no major assumption violations
##   • Lda         : ✓ Ready - consider QDA for heterogeneous variances
##   • Poisson     : ⚠ Overdispersion present - include negative binomial
##   • Multinomial : ✓ Ready - adequate cell sizes

## Save prepared data for modeling phase

# Create modeling dataset with key variables
pfi_model_ready <- pfi_clean %>%
  select(
    # Outcomes
    SEGRADES, success_binary, engagement_count,
    
    # School choice
    SCCHOICE, SPUBCHOIX, EDCPUB,
    
    # Engagement indices
    homework_intensity, school_involvement, 
    communication_index, enrichment_count,
    
    # Individual engagement variables
    FHHELP, FHCHECKX, FSVOL, FSPTMTNG, FSMTNG,
    
    # Demographics
    TTLHHINC, PARGRADEX, CSEX, ALLGRADEX, 
    NUMSIBSX, child_age,
    
    # Other
    SEENJOY, HDHEALTH
  ) %>%
  # Remove rows with missing outcome
  filter(!is.na(SEGRADES))

# Save for Phase 3
write_csv(pfi_model_ready, "pfi_phase2_model_ready.csv")
saveRDS(pfi_model_ready, "pfi_phase2_model_ready.rds")

cat("\n" , strrep("=", 60), "\n")

## 
##  ============================================================

cat("PHASE 2 COMPLETE\n")

## PHASE 2 COMPLETE

cat(strrep("=", 60), "\n")

## ============================================================

cat("Modeling dataset saved: pfi_phase2_model_ready.csv\n")

## Modeling dataset saved: pfi_phase2_model_ready.csv

cat("Observations:", nrow(pfi_model_ready), "\n")

## Observations: 15990

cat("Variables:", ncol(pfi_model_ready), "\n")

## Variables: 23

cat("Complete cases:", sum(complete.cases(pfi_model_ready)), 
    sprintf("(%.1f%%)\n", sum(complete.cases(pfi_model_ready))/nrow(pfi_model_ready)*100))

## Complete cases: 13045 (81.6%)

Impact of School Choice and Family Engagement on K-12 Educational Success