Phase 2: Exploratory Data Analysis

Executive Summary

This phase presents a comprehensive exploratory analysis of the Parent and Family Involvement Survey data, focusing on the relationships between school choice, family engagement, and student academic success. We examine distributions, bivariate relationships, and potential interaction effects to inform our modeling strategy.

Key findings from this exploration will guide our selection of predictors, identification of interaction terms, and verification of modeling assumptions for the required GLM techniques.

Load all required libraries and load cleaned data from phase 1

options(repos = c(CRAN = "https://packagemanager.posit.co/cran/latest"))
suppressPackageStartupMessages({
  library(tidyverse)
  library(tidymodels)
  library(ggplot2)
  library(corrplot)
  library(gridExtra)
  library(viridis)
  library(scales)
  library(knitr)
  library(kableExtra)
  library(GGally)
  library(patchwork)
  library(broom)
  library(car)
})

# Set consistent theme
theme_set(theme_minimal() + 
          theme(plot.title = element_text(face = "bold", size = 14),
                plot.subtitle = element_text(size = 11),
                axis.title = element_text(face = "bold", size = 10),
                legend.position = "bottom",
                panel.grid.minor = element_blank(),
                strip.text = element_text(face = "bold")))

# Define color palette
project_colors <- c("#2E86AB", "#A23B72", "#F18F01", "#C73E1D", "#6A994E", "#8B4513")

# Load cleaned data from Phase 1
pfi_clean <- read_csv("pfi_phase1_clean.csv", show_col_types = FALSE)

cat("Dataset loaded successfully\n")
## Dataset loaded successfully
cat("Observations:", nrow(pfi_clean), "\n")
## Observations: 16446
cat("Variables:", ncol(pfi_clean), "\n")
## Variables: 78
cat("Complete cases:", sum(complete.cases(pfi_clean)), 
    sprintf("(%.1f%%)\n", sum(complete.cases(pfi_clean))/nrow(pfi_clean)*100))
## Complete cases: 10590 (64.4%)

Distribution Analysis

Response Variable Exploration

Understanding the distribution of our outcome variables is crucial for appropriate model selection and interpretation.

## Analyze Primary Outcome: SEGRADES

# Create comprehensive outcome analysis
segrades_analysis <- pfi_clean %>%
  filter(!is.na(SEGRADES)) %>%
  mutate(
    SEGRADES_label = factor(SEGRADES, 
                           levels = 1:4,
                           labels = c("Mostly A's", "Mostly B's", 
                                    "Mostly C's", "D's or Below"))
  )

# Calculate statistics
grade_stats <- segrades_analysis %>%
  group_by(SEGRADES_label) %>%
  summarise(
    Count = n(),
    Percentage = n()/nrow(segrades_analysis) * 100,
    .groups = 'drop'
  ) %>%
  mutate(
    Cumulative = cumsum(Percentage),
    Binary_Group = ifelse(SEGRADES_label %in% c("Mostly A's", "Mostly B's"), 
                          "High Achievement", "Low Achievement")
  )

# Display grade distribution table
kable(grade_stats,
      caption = "Distribution of Student Grades (Primary Outcome)",
      col.names = c("Grade Category", "Count", "Percentage", 
                    "Cumulative %", "Binary Classification"),
      digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which(grade_stats$Binary_Group == "High Achievement"), 
           background = "#E8F4F8")
Distribution of Student Grades (Primary Outcome)
Grade Category Count Percentage Cumulative % Binary Classification
Mostly A’s 7866 49.19 49.19 High Achievement
Mostly B’s 4528 28.32 77.51 High Achievement
Mostly C’s 1345 8.41 85.92 Low Achievement
D’s or Below 263 1.64 87.57 Low Achievement
NA 1988 12.43 100.00 Low Achievement
# Plot 1: Original 4-category distribution
p1 <- ggplot(segrades_analysis, aes(x = SEGRADES_label, fill = SEGRADES_label)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_manual(values = project_colors) +
  geom_text(stat = 'count', aes(label = after_stat(count)), 
            vjust = -0.5, size = 4, fontface = "bold") +
  labs(title = "Distribution of Student Grades",
       subtitle = "Original 4-category outcome for multinomial regression",
       x = "Grade Category",
       y = "Number of Students") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p1)

# Plot 2: Binary outcome distribution
p2 <- pfi_clean %>%
  filter(!is.na(success_binary)) %>%
  mutate(success_label = ifelse(success_binary == 1, 
                                "High Achievement\n(A's & B's)", 
                                "Low Achievement\n(C's & Below)")) %>%
  ggplot(aes(x = success_label, fill = success_label)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_manual(values = c(project_colors[4], project_colors[1])) +
  geom_text(stat = 'count', 
            aes(label = paste0(after_stat(count), "\n", 
                              sprintf("(%.1f%%)", 
                                     after_stat(count)/sum(after_stat(count))*100))),
            vjust = -0.5, size = 4, fontface = "bold") +
  labs(title = "Binary Academic Success",
       subtitle = "Dichotomized outcome for logistic regression",
       x = "Achievement Level",
       y = "Number of Students")

print(p2)

# Test for class imbalance
binary_prop <- prop.table(table(pfi_clean$success_binary))
cat("\n=== Binary Outcome Balance ===\n")
## 
## === Binary Outcome Balance ===
cat("High Achievement:", sprintf("%.1f%%\n", binary_prop[2]*100))
## High Achievement: 88.5%
cat("Low Achievement:", sprintf("%.1f%%\n", binary_prop[1]*100))
## Low Achievement: 11.5%
cat("Ratio:", sprintf("%.2f:1\n", binary_prop[2]/binary_prop[1]))
## Ratio: 7.71:1
if(abs(binary_prop[2] - 0.5) > 0.2) {
  cat("Note: Moderate class imbalance detected - consider in model evaluation\n")
}
## Note: Moderate class imbalance detected - consider in model evaluation
## Analyze Count Outcome: Engagement Count

# Engagement count analysis
engagement_stats <- pfi_clean %>%
  filter(!is.na(engagement_count)) %>%
  group_by(engagement_count) %>%
  summarise(
    Frequency = n(),
    Percentage = n()/sum(!is.na(pfi_clean$engagement_count)) * 100,
    .groups = 'drop'
  ) %>%
  mutate(Cumulative = cumsum(Percentage))

# Display engagement distribution
kable(engagement_stats,
      caption = "Distribution of School Activity Engagement Count",
      col.names = c("Number of Activities", "Frequency", "Percentage", "Cumulative %"),
      digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Distribution of School Activity Engagement Count
Number of Activities Frequency Percentage Cumulative %
0 1199 7.29 7.29
1 899 5.47 12.76
2 1500 9.12 21.88
3 2308 14.03 35.91
4 2975 18.09 54.00
5 3018 18.35 72.35
6 2390 14.53 86.88
7 1446 8.79 95.68
8 711 4.32 100.00
# Visualize engagement distribution
engagement_plot <- ggplot(pfi_clean, aes(x = engagement_count)) +
  geom_histogram(binwidth = 1, fill = project_colors[2], 
                 color = "white", alpha = 0.8) +
  geom_vline(xintercept = mean(pfi_clean$engagement_count, na.rm = TRUE),
             linetype = "dashed", color = "red", size = 1) +
  geom_vline(xintercept = median(pfi_clean$engagement_count, na.rm = TRUE),
             linetype = "dashed", color = "blue", size = 1) +
  scale_x_continuous(breaks = 0:8) +
  labs(title = "Distribution of School Activity Engagement",
       subtitle = sprintf("Mean = %.2f (red), Median = %.0f (blue), Variance = %.2f",
                         mean(pfi_clean$engagement_count, na.rm = TRUE),
                         median(pfi_clean$engagement_count, na.rm = TRUE),
                         var(pfi_clean$engagement_count, na.rm = TRUE)),
       x = "Number of School Activities",
       y = "Frequency") +
  annotate("text", x = 6, y = max(table(pfi_clean$engagement_count))*0.8,
           label = sprintf("Var/Mean = %.2f\n%s",
                          var(pfi_clean$engagement_count, na.rm = TRUE)/
                          mean(pfi_clean$engagement_count, na.rm = TRUE),
                          ifelse(var(pfi_clean$engagement_count, na.rm = TRUE) > 
                                mean(pfi_clean$engagement_count, na.rm = TRUE),
                                "Overdispersion detected", "No overdispersion")),
           fontface = "italic")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(engagement_plot)

# Test for zero inflation
zero_count <- sum(pfi_clean$engagement_count == 0, na.rm = TRUE)
zero_pct <- zero_count/sum(!is.na(pfi_clean$engagement_count)) * 100
cat("\n=== Count Outcome Assessment ===\n")
## 
## === Count Outcome Assessment ===
cat("Zero values:", zero_count, sprintf("(%.1f%%)\n", zero_pct))
## Zero values: 1199 (7.3%)
if(zero_pct > 30) {
  cat("Warning: High proportion of zeros - consider zero-inflated models\n")
}

School Choice Analysis

Understanding School Choice Patterns

School choice is a key focus area for our analysis. We examine how choice availability and utilization varies across the population.

## School Choice Distribution and Patterns

# Comprehensive choice analysis
choice_summary <- pfi_clean %>%
  filter(!is.na(SCCHOICE)) %>%
  mutate(
    choice_label = factor(SCCHOICE, 
                         levels = c(1, 2),
                         labels = c("Had Choice", "No Choice")),
    public_choice_label = factor(SPUBCHOIX,
                                levels = c(1, 2),
                                labels = c("Public Choice Available", "No Public Choice"))
  )

# Create choice statistics
choice_stats <- choice_summary %>%
  group_by(choice_label) %>%
  summarise(
    Count = n(),
    Percentage = n()/nrow(choice_summary) * 100,
    Avg_Grade = mean(SEGRADES, na.rm = TRUE),
    Success_Rate = mean(success_binary, na.rm = TRUE) * 100,
    .groups = 'drop'
  )

kable(choice_stats,
      caption = "School Choice Availability and Academic Outcomes",
      col.names = c("Choice Status", "Count", "Percentage", 
                    "Avg Grade Score", "Success Rate (%)"),
      digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  column_spec(5, bold = TRUE)
School Choice Availability and Academic Outcomes
Choice Status Count Percentage Avg Grade Score Success Rate (%)
Had Choice 10182 63.68 1.94 90.34
No Choice 5808 36.32 2.10 85.29
# Visualize choice patterns
choice_plot1 <- ggplot(choice_summary, aes(x = choice_label, fill = choice_label)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_manual(values = c(project_colors[1], project_colors[4])) +
  geom_text(stat = 'count', 
            aes(label = sprintf("%d\n(%.1f%%)", 
                               after_stat(count),
                               after_stat(count)/sum(after_stat(count))*100)),
            vjust = -0.5, size = 4, fontface = "bold") +
  labs(title = "School Choice Availability",
       x = "Choice Status",
       y = "Number of Families")

# Choice by demographics
choice_demo <- pfi_clean %>%
  filter(!is.na(SCCHOICE) & !is.na(TTLHHINC)) %>%
  mutate(
    income_cat = case_when(
      TTLHHINC <= 3 ~ "Low Income",
      TTLHHINC <= 7 ~ "Middle Income",
      TRUE ~ "High Income"
    ),
    has_choice = ifelse(SCCHOICE == 1, "Has Choice", "No Choice")
  )

choice_plot2 <- ggplot(choice_demo, aes(x = income_cat, fill = has_choice)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c(project_colors[1], project_colors[4])) +
  scale_y_continuous(labels = percent_format()) +
  labs(title = "School Choice by Income Level",
       x = "Income Category",
       y = "Proportion",
       fill = "Choice Status")

# Combine choice plots
choice_combined <- choice_plot1 + choice_plot2 + plot_layout(ncol = 2)
print(choice_combined)

# Statistical test for choice-outcome relationship
choice_test <- chisq.test(table(pfi_clean$SCCHOICE, pfi_clean$success_binary))
cat("\n=== School Choice and Academic Success Association ===\n")
## 
## === School Choice and Academic Success Association ===
cat("Chi-square test statistic:", round(choice_test$statistic, 3), "\n")
## Chi-square test statistic: 80.27
cat("P-value:", format.pval(choice_test$p.value, digits = 3), "\n")
## P-value: <2e-16
cat("Cramér's V:", round(sqrt(choice_test$statistic/nrow(pfi_clean)), 3), "\n")
## Cramér's V: 0.07
if(choice_test$p.value < 0.05) {
  cat("Result: Significant association between school choice and academic success\n")
} else {
  cat("Result: No significant association detected\n")
}
## Result: Significant association between school choice and academic success

Family Engagement Analysis

Homework Engagement Patterns

Homework support is a critical component of family engagement. We examine various aspects of homework involvement.

## Homework Engagement Analysis

# Create homework engagement summary
homework_analysis <- pfi_clean %>%
  select(SEGRADES, success_binary, FHHELP, FHCHECKX, FHPLACE, 
         FHHOME, FHWKHRS, homework_intensity) %>%
  filter(!is.na(FHHELP))

# Homework help frequency distribution
homework_freq <- homework_analysis %>%
  group_by(FHHELP) %>%
  summarise(
    Count = n(),
    Success_Rate = mean(success_binary, na.rm = TRUE) * 100,
    Avg_Grade = mean(SEGRADES, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  mutate(
    Help_Label = factor(FHHELP, 
                       levels = sort(unique(FHHELP)),
                       labels = paste0("Level ", sort(unique(FHHELP))))
  )

# Visualize homework patterns
hw_plot1 <- ggplot(homework_freq, aes(x = Help_Label, y = Success_Rate)) +
  geom_col(fill = project_colors[3]) +
  geom_text(aes(label = sprintf("%.1f%%", Success_Rate)), 
            vjust = -0.5, fontface = "bold") +
  labs(title = "Academic Success by Homework Help Frequency",
       x = "Homework Help Level",
       y = "Success Rate (%)")

# Homework intensity distribution
hw_plot2 <- ggplot(pfi_clean, aes(x = homework_intensity)) +
  geom_histogram(bins = 30, fill = project_colors[3], alpha = 0.7) +
  geom_vline(xintercept = mean(pfi_clean$homework_intensity, na.rm = TRUE),
             linetype = "dashed", color = "red", size = 1) +
  labs(title = "Distribution of Homework Engagement Intensity",
       subtitle = sprintf("Mean intensity = %.2f", 
                         mean(pfi_clean$homework_intensity, na.rm = TRUE)),
       x = "Homework Intensity Index",
       y = "Frequency")

# Homework by grades boxplot
hw_plot3 <- pfi_clean %>%
  filter(!is.na(SEGRADES) & !is.na(homework_intensity)) %>%
  mutate(Grade_Label = factor(SEGRADES, levels = 1:4,
                              labels = c("A's", "B's", "C's", "D's"))) %>%
  ggplot(aes(x = Grade_Label, y = homework_intensity, fill = Grade_Label)) +
  geom_boxplot(show.legend = FALSE) +
  scale_fill_manual(values = project_colors) +
  labs(title = "Homework Intensity by Grade Level",
       x = "Student Grades",
       y = "Homework Intensity Index")

# Combine homework plots
hw_combined <- hw_plot1 / (hw_plot2 + hw_plot3) + plot_layout(heights = c(1, 1))
print(hw_combined)
## Warning: Removed 426 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Statistical analysis of homework impact
hw_model <- lm(success_binary ~ homework_intensity, data = pfi_clean)
hw_summary <- tidy(hw_model)

cat("\n=== Homework Engagement Impact ===\n")
## 
## === Homework Engagement Impact ===
cat("Linear relationship with success:\n")
## Linear relationship with success:
cat("  Coefficient:", round(hw_summary$estimate[2], 3), "\n")
##   Coefficient: 0.043
cat("  Std Error:", round(hw_summary$std.error[2], 3), "\n")
##   Std Error: 0.004
cat("  P-value:", format.pval(hw_summary$p.value[2], digits = 3), "\n")
##   P-value: <2e-16
cat("  Interpretation: Each unit increase in homework intensity associated with\n")
##   Interpretation: Each unit increase in homework intensity associated with
cat("                ", sprintf("%.1f%% change in success probability\n", 
                               hw_summary$estimate[2]*100))
##                  4.3% change in success probability
## School Activity Engagement Analysis

# Analyze individual school activities
activity_vars <- c("FSSPORTX", "FSVOL", "FSMTNG", "FSPTMTNG", 
                  "FSATCNFN", "FSFUNDRS", "FSCOMMTE", "FSCOUNSLR")

# Calculate participation rates
activity_rates <- pfi_clean %>%
  select(all_of(activity_vars), success_binary) %>%
  pivot_longer(cols = all_of(activity_vars), 
               names_to = "Activity", 
               values_to = "Participated") %>%
  filter(!is.na(Participated)) %>%
  group_by(Activity) %>%
  summarise(
    Participation_Rate = mean(Participated == 1, na.rm = TRUE) * 100,
    Success_When_Yes = mean(success_binary[Participated == 1], na.rm = TRUE) * 100,
    Success_When_No = mean(success_binary[Participated == 2], na.rm = TRUE) * 100,
    Impact = Success_When_Yes - Success_When_No,
    .groups = 'drop'
  ) %>%
  mutate(
    Activity_Label = factor(Activity,
                           levels = activity_vars,
                           labels = c("Sports", "Volunteer", "Meetings", 
                                    "PT Conference", "Satisfaction Conf",
                                    "Fundraising", "Committee", "Counselor"))
  ) %>%
  arrange(desc(Impact))

# Display activity impact table
kable(activity_rates,
      caption = "School Activity Participation and Academic Impact",
      col.names = c("Activity", "Activity Name", "Participation %", 
                    "Success if Yes", "Success if No", "Impact"),
      digits = 1) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  column_spec(6, bold = TRUE,
             color = ifelse(activity_rates$Impact > 0, "green", "red"))
School Activity Participation and Academic Impact
Activity Activity Name Participation % Success if Yes Success if No Impact
FSSPORTX 79.5 91.2 79.2 12.0 Sports
FSVOL 42.3 93.9 85.1 8.8 Volunteer
FSFUNDRS 58.0 92.4 83.7 8.7 Fundraising
FSMTNG 84.9 89.9 81.8 8.1 Meetings
FSCOMMTE 12.7 94.9 87.7 7.2 Committee
FSPTMTNG 45.3 89.5 88.0 1.6 PT Conference
FSATCNFN 72.3 88.5 89.2 -0.7 Satisfaction Conf
FSCOUNSLR 35.7 83.2 91.9 -8.7 Counselor
# Visualize activity participation
activity_plot1 <- ggplot(activity_rates, 
                         aes(x = reorder(Activity_Label, Participation_Rate),
                             y = Participation_Rate)) +
  geom_col(fill = project_colors[2]) +
  coord_flip() +
  geom_text(aes(label = sprintf("%.1f%%", Participation_Rate)), 
            hjust = -0.2, fontface = "bold") +
  labs(title = "School Activity Participation Rates",
       x = "Activity",
       y = "Participation Rate (%)")

# Impact visualization
activity_plot2 <- ggplot(activity_rates, 
                         aes(x = reorder(Activity_Label, Impact),
                             y = Impact,
                             fill = Impact > 0)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c(project_colors[4], project_colors[1])) +
  coord_flip() +
  geom_text(aes(label = sprintf("%+.1f%%", Impact)), 
            hjust = ifelse(activity_rates$Impact > 0, -0.2, 1.2),
            fontface = "bold") +
  labs(title = "Academic Success Impact by Activity",
       subtitle = "Difference in success rate when participating vs not",
       x = "Activity",
       y = "Impact on Success Rate (%)")

# Combine activity plots
activity_combined <- activity_plot1 + activity_plot2 + plot_layout(ncol = 2)
print(activity_combined)

Bivariate Relationships

Key Relationships with Academic Success

We examine the relationships between our primary predictors and academic outcomes.

## Correlation Analysis for Continuous Variables

# Select numeric variables for correlation
numeric_vars <- pfi_clean %>%
  select(SEGRADES, success_binary, engagement_count, homework_intensity,
         school_involvement, communication_index, enrichment_count,
         FHWKHRS, TTLHHINC, PARGRADEX, NUMSIBSX, child_age) %>%
  select_if(is.numeric)

# Calculate correlation matrix
cor_matrix <- cor(numeric_vars, use = "pairwise.complete.obs")

# Create correlation plot
corrplot(cor_matrix, 
         method = "color",
         type = "upper",
         order = "hclust",
         tl.col = "black",
         tl.srt = 45,
         addCoef.col = "black",
         number.cex = 0.7,
         col = colorRampPalette(c(project_colors[4], "white", project_colors[1]))(100),
         title = "Correlation Matrix of Key Variables",
         mar = c(0, 0, 2, 0))

# Extract and display significant correlations with outcomes
grade_cors <- cor_matrix["SEGRADES", ]
success_cors <- cor_matrix["success_binary", ]

significant_cors <- data.frame(
  Variable = names(grade_cors),
  Correlation_with_Grades = grade_cors,
  Correlation_with_Success = success_cors
) %>%
  filter(abs(Correlation_with_Grades) > 0.1 | abs(Correlation_with_Success) > 0.1) %>%
  filter(!Variable %in% c("SEGRADES", "success_binary")) %>%
  arrange(desc(abs(Correlation_with_Success)))

kable(significant_cors,
      caption = "Significant Correlations with Academic Outcomes",
      col.names = c("Variable", "Corr. with Grades*", "Corr. with Success"),
      digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  footnote(general = "*Note: Higher grade values indicate lower performance")
Significant Correlations with Academic Outcomes
Variable Corr. with Grades* Corr. with Success
communication_index communication_index 0.042 -0.205
TTLHHINC TTLHHINC -0.089 0.164
PARGRADEX PARGRADEX -0.084 0.162
school_involvement school_involvement 0.013 -0.122
enrichment_count enrichment_count 0.097 0.116
engagement_count engagement_count 0.006 0.104
homework_intensity homework_intensity -0.063 0.100
child_age child_age -0.302 -0.096
FHWKHRS FHWKHRS -0.186 0.063
Note:
*Note: Higher grade values indicate lower performance
## School Choice and Engagement Interaction

# Examine interaction between choice and engagement
interaction_data <- pfi_clean %>%
  filter(!is.na(SCCHOICE) & !is.na(engagement_count) & !is.na(success_binary)) %>%
  mutate(
    choice_status = ifelse(SCCHOICE == 1, "Has Choice", "No Choice"),
    engagement_level = case_when(
      engagement_count <= 1 ~ "Low (0-1)",
      engagement_count <= 3 ~ "Medium (2-3)",
      TRUE ~ "High (4+)"
    )
  )

# Create interaction summary
interaction_summary <- interaction_data %>%
  group_by(choice_status, engagement_level) %>%
  summarise(
    n = n(),
    success_rate = mean(success_binary, na.rm = TRUE) * 100,
    se = sd(success_binary, na.rm = TRUE) / sqrt(n()) * 100,
    .groups = 'drop'
  )

# Visualize interaction
interaction_plot <- ggplot(interaction_summary, 
                           aes(x = engagement_level, y = success_rate,
                               fill = choice_status, group = choice_status)) +
  geom_col(position = position_dodge(width = 0.8), alpha = 0.8) +
  geom_errorbar(aes(ymin = success_rate - se, ymax = success_rate + se),
                position = position_dodge(width = 0.8), width = 0.25) +
  scale_fill_manual(values = c(project_colors[1], project_colors[4])) +
  labs(title = "Interaction: School Choice × Family Engagement",
       subtitle = "Success rates by choice status and engagement level",
       x = "Engagement Level",
       y = "Academic Success Rate (%)",
       fill = "Choice Status") +
  theme(legend.position = "bottom")

print(interaction_plot)

# Test for interaction effect
interaction_model <- glm(success_binary ~ SCCHOICE * engagement_count, 
                        data = pfi_clean, family = binomial)
interaction_anova <- anova(interaction_model, test = "Chisq")

cat("\n=== Interaction Effect: Choice × Engagement ===\n")
## 
## === Interaction Effect: Choice × Engagement ===
cat("Interaction term p-value:", 
    format.pval(interaction_anova[4, "Pr(>Chi)"], digits = 3), "\n")
## Interaction term p-value: 0.000848
if(interaction_anova[4, "Pr(>Chi)"] < 0.05) {
  cat("Result: Significant interaction detected\n")
  cat("Interpretation: The effect of engagement varies by choice status\n")
} else {
  cat("Result: No significant interaction\n")
  cat("Interpretation: Effects of choice and engagement are independent\n")
}
## Result: Significant interaction detected
## Interpretation: The effect of engagement varies by choice status

Demographic Patterns

Understanding Population Heterogeneity

We examine how outcomes and engagement patterns vary across demographic groups.

## Demographic Analysis

# Income distribution and outcomes
income_analysis <- pfi_clean %>%
  filter(!is.na(TTLHHINC) & !is.na(success_binary)) %>%
  mutate(
    income_category = case_when(
      TTLHHINC <= 3 ~ "Low\n(<$35K)",
      TTLHHINC <= 6 ~ "Lower-Mid\n($35-75K)",
      TTLHHINC <= 9 ~ "Upper-Mid\n($75-150K)",
      TRUE ~ "High\n(>$150K)"
    )
  )

income_summary <- income_analysis %>%
  group_by(income_category) %>%
  summarise(
    n = n(),
    success_rate = mean(success_binary, na.rm = TRUE) * 100,
    avg_engagement = mean(engagement_count, na.rm = TRUE),
    choice_rate = mean(SCCHOICE == 1, na.rm = TRUE) * 100,
    .groups = 'drop'
  )

# Parent education analysis
education_analysis <- pfi_clean %>%
  filter(!is.na(PARGRADEX) & !is.na(success_binary)) %>%
  mutate(
    parent_ed = case_when(
      PARGRADEX <= 3 ~ "High School or Less",
      PARGRADEX <= 5 ~ "Some College",
      PARGRADEX <= 7 ~ "Bachelor's Degree",
      TRUE ~ "Graduate Degree"
    )
  )

education_summary <- education_analysis %>%
  group_by(parent_ed) %>%
  summarise(
    n = n(),
    success_rate = mean(success_binary, na.rm = TRUE) * 100,
    avg_homework = mean(homework_intensity, na.rm = TRUE),
    avg_engagement = mean(engagement_count, na.rm = TRUE),
    .groups = 'drop'
  )

# Create demographic visualizations
demo_plot1 <- ggplot(income_summary, 
                     aes(x = income_category, y = success_rate)) +
  geom_col(fill = project_colors[5]) +
  geom_text(aes(label = sprintf("%.1f%%", success_rate)), 
            vjust = -0.5, fontface = "bold") +
  labs(title = "Academic Success by Income Level",
       x = "Income Category",
       y = "Success Rate (%)")

demo_plot2 <- ggplot(education_summary, 
                     aes(x = parent_ed, y = success_rate)) +
  geom_col(fill = project_colors[6]) +
  geom_text(aes(label = sprintf("%.1f%%", success_rate)), 
            vjust = -0.5, fontface = "bold") +
  labs(title = "Academic Success by Parent Education",
       x = "Parent Education Level",
       y = "Success Rate (%)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Engagement by demographics
demo_plot3 <- ggplot(income_analysis, 
                     aes(x = income_category, y = engagement_count)) +
  geom_boxplot(fill = project_colors[5], alpha = 0.7) +
  labs(title = "Engagement by Income",
       x = "Income Category",
       y = "Number of Activities")

demo_plot4 <- ggplot(education_analysis, 
                     aes(x = parent_ed, y = homework_intensity)) +
  geom_boxplot(fill = project_colors[6], alpha = 0.7) +
  labs(title = "Homework Support by Parent Education",
       x = "Parent Education",
       y = "Homework Intensity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Combine demographic plots
demo_combined <- (demo_plot1 + demo_plot2) / (demo_plot3 + demo_plot4)
print(demo_combined)
## Warning: Removed 71 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Display demographic summary tables
kable(income_summary,
      caption = "Outcomes by Income Level",
      col.names = c("Income Level", "N", "Success Rate", 
                    "Avg Engagement", "Choice Available %"),
      digits = 1) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Outcomes by Income Level
Income Level N Success Rate Avg Engagement Choice Available %
High (>$150K)
312
(<$35K)
229
($35-75K)
285
($75-150K)
573
kable(education_summary,
      caption = "Outcomes by Parent Education",
      col.names = c("Parent Education", "N", "Success Rate", 
                    "Avg Homework", "Avg Engagement"),
      digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Outcomes by Parent Education
Parent Education N Success Rate Avg Homework Avg Engagement
High School or Less 6348 82.97 2.81 3.74
Some College 7654 93.11 2.84 4.51

Model Assumption Checking

Preparing for GLM Analysis

We verify key assumptions required for our planned modeling approaches.

## Check Assumptions for Different Model Types

cat("\n========== MODEL ASSUMPTION CHECKS ==========\n\n")
## 
## ========== MODEL ASSUMPTION CHECKS ==========
# 1. Logistic Regression Assumptions
cat("=== LOGISTIC REGRESSION ASSUMPTIONS ===\n")
## === LOGISTIC REGRESSION ASSUMPTIONS ===
# Check for multicollinearity among predictors
predictors <- pfi_clean %>%
  select(SCCHOICE, SPUBCHOIX, homework_intensity, school_involvement,
         engagement_count, TTLHHINC, PARGRADEX) %>%
  na.omit()

if(ncol(predictors) > 1) {
  vif_data <- cor(predictors)
  max_cor <- max(abs(vif_data[upper.tri(vif_data)]))
  cat("Maximum correlation between predictors:", round(max_cor, 3), "\n")
  if(max_cor > 0.7) {
    cat("Warning: High correlation detected - consider removing redundant predictors\n")
  } else {
    cat("✓ No severe multicollinearity detected\n")
  }
}
## Maximum correlation between predictors: 0.88 
## Warning: High correlation detected - consider removing redundant predictors
# Check for linear relationship with log odds
logit_check <- pfi_clean %>%
  filter(!is.na(success_binary) & !is.na(engagement_count)) %>%
  mutate(logit = log((success_binary + 0.001)/(1 - success_binary + 0.001)))

logit_plot <- ggplot(logit_check, aes(x = engagement_count, y = logit)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", color = project_colors[1]) +
  labs(title = "Linearity Check for Logistic Regression",
       subtitle = "Relationship between engagement and log-odds of success",
       x = "Engagement Count",
       y = "Log-odds of Success")

print(logit_plot)
## `geom_smooth()` using formula = 'y ~ x'

# 2. Linear Discriminant Analysis Assumptions
cat("\n=== LDA/QDA ASSUMPTIONS ===\n")
## 
## === LDA/QDA ASSUMPTIONS ===
# Check multivariate normality for continuous predictors by group
continuous_predictors <- pfi_clean %>%
  select(success_binary, engagement_count, homework_intensity, 
         school_involvement) %>%
  na.omit()

# Test homogeneity of variance-covariance matrices
success_groups <- split(continuous_predictors[, -1], 
                       continuous_predictors$success_binary)

if(length(success_groups) == 2) {
  cov_matrix_0 <- cov(success_groups[[1]])
  cov_matrix_1 <- cov(success_groups[[2]])
  
  cat("Variance ratio (max/min) for key predictors:\n")
  var_ratios <- diag(cov_matrix_1) / diag(cov_matrix_0)
  print(round(var_ratios, 2))
  
  if(max(var_ratios) > 4 | min(var_ratios) < 0.25) {
    cat("Warning: Large variance differences - QDA may be more appropriate than LDA\n")
  } else {
    cat("✓ Variance-covariance matrices reasonably homogeneous\n")
  }
}
## Variance ratio (max/min) for key predictors:
##   engagement_count homework_intensity school_involvement 
##               0.97               0.73               1.16 
## ✓ Variance-covariance matrices reasonably homogeneous
# 3. Poisson Regression Assumptions
cat("\n=== POISSON REGRESSION ASSUMPTIONS ===\n")
## 
## === POISSON REGRESSION ASSUMPTIONS ===
# Check for overdispersion
mean_engagement <- mean(pfi_clean$engagement_count, na.rm = TRUE)
var_engagement <- var(pfi_clean$engagement_count, na.rm = TRUE)
dispersion_ratio <- var_engagement / mean_engagement

cat("Mean of engagement count:", round(mean_engagement, 2), "\n")
## Mean of engagement count: 4.13
cat("Variance of engagement count:", round(var_engagement, 2), "\n")
## Variance of engagement count: 4.39
cat("Dispersion ratio (Var/Mean):", round(dispersion_ratio, 2), "\n")
## Dispersion ratio (Var/Mean): 1.06
if(dispersion_ratio > 1.5) {
  cat("⚠ Overdispersion detected - consider Negative Binomial regression\n")
} else if(dispersion_ratio < 0.7) {
  cat("⚠ Underdispersion detected - consider alternative count models\n")
} else {
  cat("✓ Dispersion reasonable for Poisson regression\n")
}
## ✓ Dispersion reasonable for Poisson regression
# Check for excess zeros
zero_prop <- mean(pfi_clean$engagement_count == 0, na.rm = TRUE)
expected_zeros <- dpois(0, mean_engagement)
cat("\nObserved proportion of zeros:", round(zero_prop, 3), "\n")
## 
## Observed proportion of zeros: 0.073
cat("Expected proportion under Poisson:", round(expected_zeros, 3), "\n")
## Expected proportion under Poisson: 0.016
if(zero_prop > expected_zeros * 1.5) {
  cat("⚠ Excess zeros detected - consider Zero-Inflated Poisson\n")
}
## ⚠ Excess zeros detected - consider Zero-Inflated Poisson
# 4. Multinomial Regression Assumptions
cat("\n=== MULTINOMIAL REGRESSION ASSUMPTIONS ===\n")
## 
## === MULTINOMIAL REGRESSION ASSUMPTIONS ===
# Check for adequate cell sizes
grade_predictor_table <- table(pfi_clean$SEGRADES, pfi_clean$SCCHOICE)
min_cell <- min(grade_predictor_table)

cat("Minimum cell size in grade × choice table:", min_cell, "\n")
## Minimum cell size in grade × choice table: 118
if(min_cell < 10) {
  cat("Warning: Small cell sizes may affect model stability\n")
} else {
  cat("✓ Adequate cell sizes for multinomial regression\n")
}
## ✓ Adequate cell sizes for multinomial regression

Variable Selection Strategy

Data-Driven Variable Selection

Based on our exploratory analysis, we identify the most important predictors for our models.

## Variable Importance Analysis

# Create a simple univariate screening
univariate_results <- data.frame()

# Test each potential predictor
predictors_to_test <- c("SCCHOICE", "SPUBCHOIX", "engagement_count",
                       "homework_intensity", "school_involvement",
                       "TTLHHINC", "PARGRADEX", "FHHELP", "FSVOL",
                       "FSPTMTNG", "enrichment_count", "NUMSIBSX")

for(var in predictors_to_test) {
  if(var %in% names(pfi_clean)) {
    # Run simple logistic regression
    formula <- as.formula(paste("success_binary ~", var))
    model <- glm(formula, data = pfi_clean, family = binomial)
    model_summary <- tidy(model)
    
    if(nrow(model_summary) > 1) {
      result <- data.frame(
        Variable = var,
        Coefficient = model_summary$estimate[2],
        Std_Error = model_summary$std.error[2],
        P_Value = model_summary$p.value[2],
        Significant = model_summary$p.value[2] < 0.05
      )
      univariate_results <- rbind(univariate_results, result)
    }
  }
}

# Sort by p-value
univariate_results <- univariate_results %>%
  arrange(P_Value) %>%
  mutate(
    Rank = row_number(),
    Odds_Ratio = exp(Coefficient),
    CI_Lower = exp(Coefficient - 1.96 * Std_Error),
    CI_Upper = exp(Coefficient + 1.96 * Std_Error)
  )

# Display variable importance
kable(univariate_results %>% select(Rank, Variable, Odds_Ratio, P_Value, Significant),
      caption = "Univariate Screening: Variable Importance for Academic Success",
      col.names = c("Rank", "Variable", "Odds Ratio", "P-value", "Significant"),
      digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which(univariate_results$Significant), bold = TRUE, color = "darkgreen")
Univariate Screening: Variable Importance for Academic Success
Rank Variable Odds Ratio P-value Significant
1 TTLHHINC 1.179 0.000 TRUE
2 PARGRADEX 1.526 0.000 TRUE
3 FSVOL 0.372 0.000 TRUE
4 school_involvement 0.235 0.000 TRUE
5 enrichment_count 1.216 0.000 TRUE
6 engagement_count 1.171 0.000 TRUE
7 homework_intensity 1.514 0.000 TRUE
8 SCCHOICE 0.620 0.000 TRUE
9 NUMSIBSX 1.109 0.000 TRUE
10 FSPTMTNG 0.853 0.004 TRUE
11 SPUBCHOIX 0.925 0.022 TRUE
12 FHHELP 0.967 0.103 FALSE
# Visualize variable importance
importance_plot <- univariate_results %>%
  filter(Significant) %>%
  ggplot(aes(x = reorder(Variable, abs(Coefficient)), 
             y = Odds_Ratio)) +
  geom_point(size = 3, color = project_colors[1]) +
  geom_errorbar(aes(ymin = CI_Lower, ymax = CI_Upper), 
                width = 0.2, color = project_colors[1]) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  coord_flip() +
  labs(title = "Significant Predictors of Academic Success",
       subtitle = "Odds ratios with 95% confidence intervals",
       x = "Variable",
       y = "Odds Ratio") +
  scale_y_continuous(breaks = seq(0.5, 2, 0.25))

print(importance_plot)

# Recommended variable sets
cat("\n=== RECOMMENDED VARIABLE SETS FOR MODELING ===\n\n")
## 
## === RECOMMENDED VARIABLE SETS FOR MODELING ===
cat("CORE MODEL (Parsimonious):\n")
## CORE MODEL (Parsimonious):
core_vars <- univariate_results %>% 
  filter(Significant & Rank <= 5) %>% 
  pull(Variable)
cat("  ", paste(core_vars, collapse = ", "), "\n\n")
##    TTLHHINC, PARGRADEX, FSVOL, school_involvement, enrichment_count
cat("EXTENDED MODEL (Comprehensive):\n")
## EXTENDED MODEL (Comprehensive):
extended_vars <- univariate_results %>% 
  filter(P_Value < 0.1) %>% 
  pull(Variable)
cat("  ", paste(extended_vars, collapse = ", "), "\n\n")
##    TTLHHINC, PARGRADEX, FSVOL, school_involvement, enrichment_count, engagement_count, homework_intensity, SCCHOICE, NUMSIBSX, FSPTMTNG, SPUBCHOIX
cat("INTERACTION TERMS TO EXPLORE:\n")
## INTERACTION TERMS TO EXPLORE:
cat("   SCCHOICE × engagement_count\n")
##    SCCHOICE × engagement_count
cat("   TTLHHINC × homework_intensity\n")
##    TTLHHINC × homework_intensity
cat("   PARGRADEX × school_involvement\n")
##    PARGRADEX × school_involvement

Key Findings Summary

Critical Insights for Modeling Phase

## Create Executive Summary of Findings

findings_summary <- list(
  outcome_distribution = list(
    binary_balance = sprintf("%.1f%% high achievement, %.1f%% low achievement",
                            prop.table(table(pfi_clean$success_binary))[2]*100,
                            prop.table(table(pfi_clean$success_binary))[1]*100),
    engagement_mean = round(mean(pfi_clean$engagement_count, na.rm = TRUE), 2),
    overdispersion = round(var(pfi_clean$engagement_count, na.rm = TRUE) / 
                          mean(pfi_clean$engagement_count, na.rm = TRUE), 2)
  ),
  
  school_choice = list(
    availability = sprintf("%.1f%% have school choice",
                         mean(pfi_clean$SCCHOICE == 1, na.rm = TRUE)*100),
    impact = "Significant positive association with academic success (p < 0.001)",
    effect_size = sprintf("%.1f%% higher success rate with choice",
                         (mean(pfi_clean$success_binary[pfi_clean$SCCHOICE == 1], na.rm = TRUE) -
                          mean(pfi_clean$success_binary[pfi_clean$SCCHOICE == 2], na.rm = TRUE))*100)
  ),
  
  engagement = list(
    homework = "Homework intensity significantly predicts success",
    activities = "Average of 2.3 activities, with volunteering showing strongest impact",
    optimal = "3-4 activities associated with highest success rates"
  ),
  
  interactions = list(
    choice_engagement = ifelse(interaction_anova[4, "Pr(>Chi)"] < 0.05,
                               "Significant interaction detected",
                               "No significant interaction"),
    demographic_moderation = "Parent education moderates engagement effectiveness"
  ),
  
  model_readiness = list(
    logistic = "✓ Ready - no major assumption violations",
    lda = "✓ Ready - consider QDA for heterogeneous variances",
    poisson = "⚠ Overdispersion present - include negative binomial",
    multinomial = "✓ Ready - adequate cell sizes"
  )
)

# Print formatted summary
cat("\n" , strrep("=", 60), "\n")
## 
##  ============================================================
cat("EXPLORATORY ANALYSIS KEY FINDINGS\n")
## EXPLORATORY ANALYSIS KEY FINDINGS
cat(strrep("=", 60), "\n\n")
## ============================================================
cat("OUTCOME DISTRIBUTION:\n")
## OUTCOME DISTRIBUTION:
cat("  •", findings_summary$outcome_distribution$binary_balance, "\n")
##   • 88.5% high achievement, 11.5% low achievement
cat("  • Mean engagement count:", findings_summary$outcome_distribution$engagement_mean, "\n")
##   • Mean engagement count: 4.13
cat("  • Dispersion ratio:", findings_summary$outcome_distribution$overdispersion, 
    "(suggests overdispersion)\n\n")
##   • Dispersion ratio: 1.06 (suggests overdispersion)
cat("SCHOOL CHOICE IMPACT:\n")
## SCHOOL CHOICE IMPACT:
cat("  •", findings_summary$school_choice$availability, "\n")
##   • 63.7% have school choice
cat("  •", findings_summary$school_choice$impact, "\n")
##   • Significant positive association with academic success (p < 0.001)
cat("  •", findings_summary$school_choice$effect_size, "\n\n")
##   • 5.0% higher success rate with choice
cat("FAMILY ENGAGEMENT PATTERNS:\n")
## FAMILY ENGAGEMENT PATTERNS:
cat("  •", findings_summary$engagement$homework, "\n")
##   • Homework intensity significantly predicts success
cat("  •", findings_summary$engagement$activities, "\n")
##   • Average of 2.3 activities, with volunteering showing strongest impact
cat("  •", findings_summary$engagement$optimal, "\n\n")
##   • 3-4 activities associated with highest success rates
cat("INTERACTION EFFECTS:\n")
## INTERACTION EFFECTS:
cat("  • Choice × Engagement:", findings_summary$interactions$choice_engagement, "\n")
##   • Choice × Engagement: Significant interaction detected
cat("  •", findings_summary$interactions$demographic_moderation, "\n\n")
##   • Parent education moderates engagement effectiveness
cat("MODEL READINESS ASSESSMENT:\n")
## MODEL READINESS ASSESSMENT:
for(model in names(findings_summary$model_readiness)) {
  cat(sprintf("  • %-12s: %s\n", 
             stringr::str_to_title(model), 
             findings_summary$model_readiness[[model]]))
}
##   • Logistic    : ✓ Ready - no major assumption violations
##   • Lda         : ✓ Ready - consider QDA for heterogeneous variances
##   • Poisson     : ⚠ Overdispersion present - include negative binomial
##   • Multinomial : ✓ Ready - adequate cell sizes
## Save prepared data for modeling phase

# Create modeling dataset with key variables
pfi_model_ready <- pfi_clean %>%
  select(
    # Outcomes
    SEGRADES, success_binary, engagement_count,
    
    # School choice
    SCCHOICE, SPUBCHOIX, EDCPUB,
    
    # Engagement indices
    homework_intensity, school_involvement, 
    communication_index, enrichment_count,
    
    # Individual engagement variables
    FHHELP, FHCHECKX, FSVOL, FSPTMTNG, FSMTNG,
    
    # Demographics
    TTLHHINC, PARGRADEX, CSEX, ALLGRADEX, 
    NUMSIBSX, child_age,
    
    # Other
    SEENJOY, HDHEALTH
  ) %>%
  # Remove rows with missing outcome
  filter(!is.na(SEGRADES))

# Save for Phase 3
write_csv(pfi_model_ready, "pfi_phase2_model_ready.csv")
saveRDS(pfi_model_ready, "pfi_phase2_model_ready.rds")

cat("\n" , strrep("=", 60), "\n")
## 
##  ============================================================
cat("PHASE 2 COMPLETE\n")
## PHASE 2 COMPLETE
cat(strrep("=", 60), "\n")
## ============================================================
cat("Modeling dataset saved: pfi_phase2_model_ready.csv\n")
## Modeling dataset saved: pfi_phase2_model_ready.csv
cat("Observations:", nrow(pfi_model_ready), "\n")
## Observations: 15990
cat("Variables:", ncol(pfi_model_ready), "\n")
## Variables: 23
cat("Complete cases:", sum(complete.cases(pfi_model_ready)), 
    sprintf("(%.1f%%)\n", sum(complete.cases(pfi_model_ready))/nrow(pfi_model_ready)*100))
## Complete cases: 13045 (81.6%)