A single data frame

After having cleaned all individual studies and put the data into a shared format, we combine the studies into a single data frame.

Code
# combine data

# Recursively collect the cleaned study files (cleaned*.rds) below "papers"
files <- list.files(
  path = "papers",
  pattern = "^cleaned.*\\.rds$",
  full.names = TRUE,
  recursive = TRUE
)

# Read every study into a list; each element is named after its file
# (directory and extension stripped)
studies_data <- setNames(
  lapply(files, \(path) {
    # drop the old labels carried over from Stata files
    study <- haven::zap_labels(readRDS(path))
    # cast the identifier columns to character so that they have the same
    # type in every study before the rows are stacked
    mutate(
      study,
      subject_id = as.character(subject_id),
      scale = as.character(scale)
    )
  }),
  tools::file_path_sans_ext(basename(files))
)

# Stack all studies into a single data frame
data <- bind_rows(studies_data)

Control and intervention group selection

Several experiments have multiple control and/or intervention groups. For our meta-analysis, we pair exactly one control group with one treatment group per experiment. Where we had to choose among several groups, we coded our choice in the variables intervention_selection and control_selection.

Code
# Keep one intervention arm and one control arm per experiment.
# Both conditions below must hold for a row to be kept.
data <- data |>
  filter(
    # control rows, rows without a coded intervention selection, and rows
    # whose intervention label equals the selected one pass
    condition == "control" |
      is.na(intervention_selection) |
      intervention_label == intervention_selection,
    # same logic for the control selection: treatment rows, rows without a
    # coded control selection, and rows with the selected control label pass
    condition == "treatment" |
      is.na(control_selection) |
      control_label == control_selection
  )

# check
# data |> 
#   group_by(paper_id, condition) |> 
#   summarise(sum(!is.na(accuracy_raw)))

# check transformation with dias paper
# data |> 
#   filter(paper_id == "dias_2020") |> 
#   filter(experiment_id == 1) |> 
#   distinct(condition, control_label, control_selection, intervention_label, intervention_selection) 

Collapse accuracy measures

Code
data <- data |>
  # collapse all raw accuracy scores on different scales into a binary scale
  mutate(
    # numeric version of scale; scale values that cannot be parsed as a
    # number (the non-numeric, e.g. binary, scales) become NA here
    scale_numeric = as.numeric(scale),
    # helper variable: TRUE when the number of scale points is odd, i.e. the
    # scale has a midpoint; NA when scale_numeric is NA
    midpoint_scale = scale_numeric %% 2 != 0,
    accuracy = case_when(
      # for scales with a midpoint, code the midpoint itself as missing;
      # NA_real_ (not a bare logical NA) keeps every branch of type double,
      # which dplyr versions before 1.1.0 require
      midpoint_scale &
        accuracy_raw == (scale_numeric / 2) + 0.5 ~ NA_real_,
      # split the remaining scores at half the scale length
      # NOTE(review): on odd scales a non-integer score strictly between
      # scale / 2 and the midpoint (e.g. 3.7 on a 7-point scale) is coded 1;
      # confirm accuracy_raw only holds integer ratings for those scales
      accuracy_raw <= scale_numeric / 2 ~ 0,
      accuracy_raw > scale_numeric / 2 ~ 1,
      # rows where scale_numeric is NA match none of the rules above and
      # keep their raw score
      TRUE ~ accuracy_raw
    )
  )

# check
# data |>
#   distinct(scale, accuracy, accuracy_raw)

Make unique paper, experiment, and subject identifiers

Code
# Build identifiers that stay unique once all papers are pooled
data <- data |>
  mutate(
    # experiment key: "<paper_id>_<experiment_id>"
    unique_experiment_id = paste0(paper_id, "_", experiment_id),
    # participant key: "<paper_id>_<experiment_id>_<subject_id>"
    unique_subject_id = paste(paper_id, experiment_id, subject_id, sep = "_")
  )

Use deviation coding

Code
# Deviation-coded numeric helpers: the two levels of each factor are mapped
# to +0.5 and -0.5
data <- data |>
  mutate(
    # "true" -> 0.5, any other non-missing veracity -> -0.5
    veracity_numeric = if_else(
      condition = veracity == "true",
      true = 0.5,
      false = -0.5
    ),
    # "treatment" -> 0.5, any other non-missing condition -> -0.5
    condition_numeric = if_else(
      condition = condition == "treatment",
      true = 0.5,
      false = -0.5
    )
  )

Exclude long-term effects

Some studies measured long-term effects (flagged in the variable long_term).

Code
# list the papers that contain at least one row flagged as long-term
data |>
  filter(long_term %in% TRUE) |>
  distinct(paper_id)
# A tibble: 1 × 1
  paper_id  
  <chr>     
1 guess_2020
Code
# detailed check
# data |> 
#   filter(paper_id == "guess_2020") |> 
#   group_by(long_term) |> 
#   count()

We want to exclude these from the main analysis (including them would likely reduce the estimated average treatment effect, as effects tend to fade over time).

Code
# Drop the long-term measurements: keep rows whose long_term flag is FALSE
# or missing
data <- data |>
  filter(long_term %in% c(FALSE, NA))

# check
# table(data$long_term, useNA = "always")

Save data

Code
# Write the combined data frame to disk in two formats

# CSV copy
write_csv(data, "data.csv")

# RDS copy
saveRDS(object = data, file = "data.rds")