Bonus lecture

Last time

Paired-samples t-tests

  • aka one-sample t-tests on difference scores
library(tidyverse)

Today

  • Repeated measures
  • Intro to text analysis
  • Bootstrapping?

Repeated Measures

  • How does the number of trials per participant affect statistical power?
  • Can more trials compensate for fewer participants?
  • How do we analyze experiments with multiple trials per participant?

Key Concepts

  • In cognitive experiments, we have:
    • Multiple participants (n)
    • Multiple trials per participant (k)
    • Two or more conditions
    • Each trial produces a measurement
  • Multiple trials can:
    1. Improve measurement precision
    2. Reduce within-subject variability
    3. Increase reliability

Simulation: Effect of Trials

(Function to simulate experiment here)

Code
simulate_cognitive_experiment <- function(
    n_participants = 20,    # number of participants
    n_trials = 50,         # trials per condition per participant
    true_effect = 0.5,     # mean difference between conditions
    participant_sd = 0.25,   # between-participant variability
    trial_sd = 1.0,        # within-participant (trial) variability
    seed = 123             # for reproducibility
) {
  
  set.seed(seed)
  
  # Generate participant random effects (individual differences)
  participant_effects <- rnorm(n_participants, mean = 0, sd = participant_sd)
  
  # Create a data frame with all combinations of participants and trials
  experiment_data <- expand_grid(
    participant = 1:n_participants,
    trial = 1:n_trials,
    condition = c("A", "B")
  ) %>%
    # Add participant-level random effects
    mutate(
      participant_effect = rep(participant_effects, each = n_trials * 2),
      # Add condition effect (only for condition B)
      condition_effect = if_else(condition == "B", true_effect, 0),
      # Generate trial-level noise
      trial_noise = rnorm(n(), mean = 0, sd = trial_sd),
      # Compute response time (or other DV)
      response = 0.5 + participant_effect + condition_effect + trial_noise
    ) %>%
    # Reorder columns for clarity
    select(participant, condition, trial, response)
  
  return(experiment_data)
}
# Generate example data
data <- simulate_cognitive_experiment(
  n_participants = 20,
  n_trials = 100,
  true_effect = 0.5,
  participant_sd = 0.25,
  trial_sd = 1.0
)

# View first few rows
data
# A tibble: 4,000 × 4
   participant condition trial response
         <int> <chr>     <int>    <dbl>
 1           1 A             1   -0.708
 2           1 B             1    0.642
 3           1 A             2   -0.666
 4           1 B             2    0.131
 5           1 A             3   -0.265
 6           1 B             3   -0.827
 7           1 A             4    1.20 
 8           1 B             4    1.01 
 9           1 A             5   -0.778
10           1 B             5    2.11 
# ℹ 3,990 more rows
# Basic summary statistics
data %>%
  group_by(condition) %>%
  summarise(
    mean_rt = mean(response),
    sd_rt = sd(response),
    n_trials = n()
  )
# A tibble: 2 × 4
  condition mean_rt sd_rt n_trials
  <chr>       <dbl> <dbl>    <int>
1 A           0.544  1.02     2000
2 B           1.04   1.01     2000
# Participant-level summary
data %>%
  group_by(participant, condition) %>%
  summarise(
    mean_rt = mean(response),
    sd_rt = sd(response),
    n_trials = n(),
    .groups = "drop"
  ) 
# A tibble: 40 × 5
   participant condition mean_rt sd_rt n_trials
         <int> <chr>       <dbl> <dbl>    <int>
 1           1 A           0.345 0.935      100
 2           1 B           0.842 0.967      100
 3           2 A           0.562 0.929      100
 4           2 B           0.853 1.03       100
 5           3 A           1.07  1.04       100
 6           3 B           1.27  0.914      100
 7           4 A           0.558 1.01       100
 8           4 B           0.991 1.03       100
 9           5 A           0.561 1.02       100
10           5 B           1.12  1.14       100
# ℹ 30 more rows
# Calculate effect size for each participant
participant_effects <- data %>%
  group_by(participant, condition) %>%
  summarise(
    mean_rt = mean(response),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = condition,
    values_from = mean_rt
  ) %>%
  mutate(effect_size = B - A)
# A tibble: 20 × 4
   participant      A     B effect_size
         <int>  <dbl> <dbl>       <dbl>
 1           1  0.345 0.842       0.497
 2           2  0.562 0.853       0.291
 3           3  1.07  1.27        0.200
 4           4  0.558 0.991       0.433
 5           5  0.561 1.12        0.564
 6           6  0.977 1.38        0.401
 7           7  0.597 1.15        0.555
 8           8  0.139 0.881       0.742
 9           9  0.315 0.911       0.596
10          10  0.395 1.01        0.620
11          11  0.752 1.17        0.416
12          12  0.543 0.947       0.405
13          13  0.662 1.17        0.510
14          14  0.488 0.975       0.486
15          15  0.480 0.927       0.447
16          16  0.845 1.38        0.534
17          17  0.562 1.20        0.634
18          18 -0.107 0.688       0.796
19          19  0.582 0.992       0.410
20          20  0.561 0.940       0.379
# Aggregating across different trial numbers
participant_effects20 <- data %>%
  filter(trial <= 20) %>% 
  group_by(participant, condition) %>%
  summarise(
    mean_rt = mean(response),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = condition,
    values_from = mean_rt
  ) %>%
  mutate(effect_size = B - A)
# A tibble: 20 × 4
   participant       A     B effect_size
         <int>   <dbl> <dbl>       <dbl>
 1           1  0.238  1.04       0.800 
 2           2  0.627  0.696      0.0686
 3           3  1.15   1.30       0.158 
 4           4  0.324  1.05       0.721 
 5           5  0.524  1.11       0.583 
 6           6  1.05   1.58       0.529 
 7           7  0.590  1.21       0.616 
 8           8  0.0702 0.980      0.910 
 9           9  0.127  1.07       0.947 
10          10  0.321  0.900      0.579 
11          11  0.904  1.40       0.495 
12          12  0.413  0.736      0.322 
13          13  0.507  1.24       0.729 
14          14  0.901  0.945      0.0440
15          15  0.401  1.12       0.715 
16          16  0.812  1.14       0.332 
17          17  0.568  1.38       0.809 
18          18 -0.146  0.841      0.987 
19          19  0.610  1.15       0.538 
20          20  0.955  0.977      0.0217
t.test(participant_effects20$A, participant_effects20$B, 
       paired = TRUE)$statistic
        t 
-8.090232 
t.test(participant_effects$A, participant_effects$B, 
       paired = TRUE)$statistic
        t 
-15.56358 

Why More Trials Help

Law of Large Numbers

  • Individual trial measurements are noisy
  • Mean of many trials is more stable
  • Reduces measurement error

Standard Error Formula

  • SE = σ / √n
  • More trials → smaller SE of participant means
  • Smaller SE → More power
Code
participant_effects %>% 
  mutate(Trials = 100) %>% 
  full_join(
    mutate(participant_effects20, Trials = 20)
  ) %>% 
  pivot_longer(cols = c("A", "B"),
               names_to = "condition",
               values_to = "avg_response") %>% 
  ggplot(., aes(x = condition, 
                     y = avg_response, 
                     fill = as.factor(Trials))) +
  geom_boxplot() +
  labs(title = "Reduced Variability with More Trials", 
       y = "Mean Score", 
       fill = "Number of Trials") +
  theme_minimal() +
  theme(legend.position = "bottom")

Practical Guidelines

  • More trials generally help, but with diminishing returns
  • Consider:
    • Participant fatigue
    • Practice effects
    • Time constraints
    • Resource limitations

Analysis Approaches

Summary by Dan McNeish

  • Participant-level Analysis
    • Average trials for each participant
    • Run paired t-test on participant means
    • Simple but may lose information
  • Multilevel modeling
    • Use all information
    • Especially useful when interested in individual differences
  • Clustered errors and GEE
    • Don’t care about individual differences
  • Fixed effects models
    • Useful when relevant contextual factors are not measured

Coming Soon (PSY 613)…

  • Mixed-effects models
    • Account for trial-level variation
    • Handle missing data
    • Model individual differences
  • Power analysis for repeated measures
library(lme4)
lmer(response ~ 1 + condition + (1|participant),
     data = data) %>% 
  summary()
Linear mixed model fit by REML ['lmerMod']
Formula: response ~ 1 + condition + (1 | participant)
   Data: data

REML criterion at convergence: 11358.8

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.2348 -0.6633 -0.0035  0.6758  3.4111 

Random effects:
 Groups      Name        Variance Std.Dev.
 participant (Intercept) 0.0416   0.2039  
 Residual                0.9888   0.9944  
Number of obs: 4000, groups:  participant, 20

Fixed effects:
            Estimate Std. Error t value
(Intercept)  0.54415    0.05074   10.72
conditionB   0.49573    0.03145   15.77

Correlation of Fixed Effects:
           (Intr)
conditionB -0.310

Text analysis

Data came from a study of parents of young children, collected during 2020.

Parents answered the question: “How do you feel about the COVID-19 vaccine in terms of its safety and effectiveness, and what are your plans in terms of whether or not to get it?”

# install.packages(c("textdata", "tidytext", "wordcloud"))
library(tidytext)
library(textdata)
library(wordcloud)
rapid = read_csv("https://raw.githubusercontent.com/uopsych/psy611/refs/heads/master/data/rapid_short.csv")

Data Preparation

Let’s examine and clean our text data:

# Look at the structure of our data
glimpse(rapid)
Rows: 6,554
Columns: 7
$ ...1         <dbl> 23435, 23436, 23437, 23439, 23440, 23444, 23445, 23451, 2…
$ CaregiverID  <chr> "R3ID00020", "R3ID00032", "R3ID00032", "R3ID00036", "R3ID…
$ StartDate    <dttm> 2021-06-11 08:15:52, 2021-06-09 10:59:01, 2021-11-10 09:…
$ CaregiverAge <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ DEMO.006.a   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ DEMO.014     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ HEALTH.030   <chr> "I am not planning on it right now but i am open to the f…

Cleaning text requires the use of “regular expressions.” This is like a sub-dialect of coding. The website regex101.com is very useful for navigating this code, but of course, AI is great too.

# Create a clean text column
rapid_clean <- rapid %>%
  mutate(
    response = HEALTH.030,
    # Convert to lowercase
    response = tolower(response),
    # Remove punctuation
    response = str_remove_all(response, "\\?"),
    response = str_replace_all(response, "[[:punct:]]", " "),
    # Remove extra whitespace
    response = str_squish(response)
  )

head(rapid_clean$response) # show first few responses
[1] "i am not planning on it right now but i am open to the future"                                                  
[2] "i dont feel it is safe yet or ready"                                                                            
[3] "im not sure about the vaccine and its effectiveness im still weary and havent decided on taking the vaccine yet"
[4] "i dont feel like it works i had to get it for work"                                                             
[5] "i already got both doses"                                                                                       
[6] "i got it"                                                                                                       

Breaking Text into Words

We’ll use tidytext to tokenize our responses:

words_df <- rapid_clean %>%
  unnest_tokens(word, response) %>%
  # Remove stop words
  anti_join(stop_words)

# View most common words
words_df %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 10) %>%
  knitr::kable()
word n
vaccine 1789
safe 1747
vaccinated 1457
feel 1210
received 976
effective 952
covid 514
booster 463
plan 460
children 449

Visualizing Common Words

Let’s create a word cloud:

words_df %>%
  count(word) %>%
  with(wordcloud(word, n, 
                max.words = 50, 
                colors = brewer.pal(8, "Dark2")))

Finding Themes: Bigrams

Let’s look at word pairs to understand context better:

Code
bigrams_df <- rapid_clean %>%
  unnest_tokens(bigram, response, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  filter(!is.na(word1)) %>%
  filter(!is.na(word2))

# View top bigrams
bigrams_df %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE) %>%
  slice_head(n = 10) %>%
  knitr::kable()
bigram n
term effects 154
covid 19 118
feel confident 65
19 vaccine 47
dont feel 46
dont trust 44
covid vaccine 43
feel comfortable 43
children vaccinated 41
feel safe 39

Sentiment Analysis

Let’s analyze the emotional content of responses:

get_sentiments("bing") %>%
  sample_n(10)
# A tibble: 10 × 2
   word          sentiment
   <chr>         <chr>    
 1 shark         negative 
 2 awsome        positive 
 3 irrepressible negative 
 4 humiliation   negative 
 5 disgruntled   negative 
 6 preferably    positive 
 7 comforting    positive 
 8 immobilized   negative 
 9 bored         negative 
10 eye-catching  positive 

Sentiment Analysis

# Add sentiment scores
sentiment_df <- words_df %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(CaregiverID, StartDate, sentiment) %>%
  pivot_wider(names_from = sentiment, 
              values_from = n, 
              values_fill = 0) %>%
  mutate(sentiment_score = positive - negative)

sentiment_df
# A tibble: 4,070 × 5
   CaregiverID StartDate           positive negative sentiment_score
   <chr>       <dttm>                 <int>    <int>           <int>
 1 R3ID00032   2021-06-09 10:59:01        2        0               2
 2 R3ID00032   2021-11-10 09:57:51        1        1               0
 3 R3ID00049   2021-12-10 08:17:19        1        0               1
 4 R3ID00083   2021-11-14 09:11:26        0        1              -1
 5 R3ID00084   2021-06-11 10:58:56        1        0               1
 6 R3ID00084   2021-10-13 09:21:12        1        0               1
 7 R3ID00117   2021-12-08 10:30:31        0        1              -1
 8 R3ID00166   2021-03-17 22:15:04        1        0               1
 9 R3ID00168   2021-12-08 10:13:24        1        0               1
10 R3ID00188   2021-10-14 09:17:07        1        0               1
# ℹ 4,060 more rows

Sentiment Analysis

Code
# View distribution of sentiment
ggplot(sentiment_df, aes(x = sentiment_score)) +
  geom_histogram(binwidth = 1, fill = "steelblue", alpha = 0.7) +
  geom_vline(aes(xintercept = 0), linetype = "dashed") +
  labs(title = "Distribution of Sentiment Scores",
       x = "Sentiment Score (Positive - Negative)",
       y = "Count") +
  theme_minimal()

Does sentiment change over time?

Code
sentiment_df %>% 
  ggplot(aes(x = StartDate)) +
  geom_smooth(aes(y = positive, color = "positive sentiment")) +
  geom_smooth(aes(y = negative, color = "negative sentiment")) +
  scale_color_manual(values = c("#1B4965", "#FF9F1C")) +
  labs(
    x = "Time (2020)",
    y = "Sentiment score"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

Key Findings

  • Most common words reveal attitudes about vaccine safety and effectiveness
  • Common bigrams show personal experiences (“already got”, “fully vaccinated”)
  • Sentiment analysis reveals mixed but generally positive attitudes

Resources for Learning More