Results: Critiques & Key Issues

Setup and libraries
library("tidyverse")
library("jsonlite")
library("knitr")
library("kableExtra")
library("DT")
library("reticulate")

# Theme colors (matching results_ratings.qmd)
UJ_ORANGE <- "#f19e4b"
UJ_GREEN  <- "#99bb66"
UJ_BLUE   <- "#4e79a7"

# Severity colors for badges
# NOTE(review): these three constants are not referenced by severity_badge()
# below, which hard-codes the same hex values inline -- keep the two in sync.
SEV_NECESSARY <- "#e74c3c"   # Red
SEV_OPTIONAL  <- "#f39c12"   # Orange
SEV_UNSURE    <- "#95a5a6"   # Gray

# Project ggplot2 theme: theme_minimal plus a few house-style overrides
# (no minor gridlines, plot-aligned titles, legend underneath the panel).
theme_uj <- function(base_size = 11) {
  uj_overrides <- theme(
    panel.grid.minor = element_blank(),
    plot.title.position = "plot",
    legend.position = "bottom"
  )
  theme_minimal(base_size = base_size) + uj_overrides
}

# Function to create severity badge HTML
severity_badge <- function(severity) {
  # Render a severity label as a small colored HTML <span> badge.
  #   label containing "necessary" -> red, "optional" -> orange,
  #   any other non-empty label -> gray "Unsure", empty/NA -> neutral dash.
  # Returns a single HTML string. Colors mirror the SEV_* constants above.
  make_badge <- function(color, label) {
    sprintf(
      '<span style="background-color:%s;color:white;padding:2px 6px;border-radius:3px;font-size:0.8em;">%s</span>',
      color, label
    )
  }

  # Fix: the original evaluated `if (sev != "")` on NA input, which errors
  # ("missing value where TRUE/FALSE needed"). Treat NA/empty as "no label".
  if (length(severity) == 0 || is.na(severity)) {
    return(make_badge("#bdc3c7", "—"))
  }

  sev <- tolower(trimws(severity))
  if (grepl("necessary", sev)) {
    make_badge("#e74c3c", "🔴 Necessary")
  } else if (grepl("optional", sev)) {
    make_badge("#f39c12", "🟠 Optional")
  } else if (sev != "") {
    make_badge("#95a5a6", "⚪ Unsure")
  } else {
    make_badge("#bdc3c7", "—")
  }
}

# Function to truncate text for table display
truncate_text <- function(text, max_chars = 80) {
  # Vectorized truncation: any string longer than `max_chars` is cut and
  # suffixed with "...". The original used a scalar `if` on `nchar(text)`,
  # which errors ("the condition has length > 1") when given a character
  # vector, e.g. a whole table column. `ifelse` handles both cases.
  ifelse(
    nchar(text) > max_chars,
    paste0(substr(text, 1, max_chars), "..."),
    text
  )
}

This chapter compares qualitative critiques — the key methodological and interpretive issues identified by the LLM — against the human expert critiques documented by Unjournal evaluation managers in Coda.

We first assess alignment using LLM-based comparison: GPT-5.2 Pro evaluates coverage (the proportion of human concerns GPT identified) and precision (whether GPT's issues are substantive). Human checks of this alignment are underway. See LLM-Based Assessment for aggregate metrics.

Data Sources

GPT-5.2 Pro Key Issues: Structured output from the focal evaluation run (January 2026), where the model was prompted to identify 5-15 key issues per paper, ordered from most to least important.

Human Expert Critiques: Curated content from the “Key critiques & issues with paper” column in The Unjournal’s internal tracking database (Coda), written by evaluation managers synthesizing evaluator feedback. These use severity labels (“Necessary”, “Optional but important”, “Unsure”) and cite specific evaluator comments.

Load comparison data
# Load the matched comparison data
comparison_file <- "results/key_issues_comparison.json"
# Candidate result-file paths: older and newer naming conventions.
comparison_results_file <- c(
  "results/key_issue_comp_results.json",
  "results/key_issues_comparison_results.json"
)
# Keep the first candidate that exists; `[1]` yields NA when neither does
# (guarded with !is.na() below).
comparison_results_file <- comparison_results_file[file.exists(comparison_results_file)][1]

if (file.exists(comparison_file)) {
  comparison_data <- fromJSON(comparison_file)
  n_papers <- nrow(comparison_data)
} else {
  comparison_data <- NULL
  n_papers <- 0
}

# Load LLM-based comparison results (coverage/precision) if available
if (!is.null(comparison_data) && !is.na(comparison_results_file)) {
  llm_results_raw <- fromJSON(comparison_results_file)
  # Flatten the nested "comparison" record into top-level columns; any_of()
  # tolerates whichever schema (new or old) the file actually contains.
  llm_results <- llm_results_raw |>
    as_tibble() |>
    unnest_wider(comparison) |>
    select(
      gpt_paper,
      coverage_pct,
      precision_pct,
      # New format uses matched_pairs, unmatched_human, unmatched_llm
      any_of(c("matched_pairs", "unmatched_human", "unmatched_llm")),
      # Old format used missed_issues, extra_issues (keep for backward compat)
      any_of(c("missed_issues", "extra_issues")),
      overall_rating,
      overall_justification,
      detailed_notes
    )

  # Attach the per-paper LLM assessment columns onto the comparison rows.
  comparison_data <- comparison_data |>
    left_join(llm_results, by = "gpt_paper")
}

# Update paper count after loading
if (!is.null(comparison_data)) {
  n_papers <- nrow(comparison_data)
}

# Load human and LLM ratings for context tables
# Criterion name mapping: human -> standardized
# Names are the raw criterion labels found in data/all_ratings.rds; values
# are the standardized keys shared with the LLM metrics files.
criterion_map_human <- c(
  "overall" = "overall",
  "claims" = "claims_evidence",
  "methods" = "methods",
  "adv_knowledge" = "advancing_knowledge",
  "logic_comms" = "logic_communication",
  "open_sci" = "open_science",
  "gp_relevance" = "global_relevance",
  "real_world" = "real_world"
)

# Display names for nice labels
# Keyed by the standardized criterion keys produced by criterion_map_human.
criterion_display <- c(
  "overall" = "Overall assessment",
  "claims_evidence" = "Claims & evidence",
  "methods" = "Methods",
  "advancing_knowledge" = "Advancing knowledge",
  "logic_communication" = "Logic & communication",
  "open_science" = "Open science",
  "global_relevance" = "Global relevance",
  "real_world" = "Real-world relevance"
)

# Normalize paper names for fuzzy matching across data sources:
# underscores/periods become spaces, whitespace runs collapse to a single
# space, and the result is lower-cased and trimmed.
normalize_paper_name <- function(x) {
  spaced <- gsub("[_.]", " ", x)
  collapsed <- gsub("\\s+", " ", spaced)
  tolower(trimws(collapsed))
}

# Human ratings: one row per (paper, criterion), averaging evaluator
# midpoint ratings; NULL when the source file is absent.
human_ratings <- if (file.exists("data/all_ratings.rds")) {
  readRDS("data/all_ratings.rds") |>
    # Drop journal-prediction criteria, which have no LLM counterpart here.
    filter(criteria != "merits_journal", criteria != "journal_predict") |>
    mutate(
      criterion_std = criterion_map_human[as.character(criteria)],
      paper_norm = normalize_paper_name(label_paper)
    ) |>
    filter(!is.na(criterion_std)) |>
    group_by(paper = label_paper, paper_norm, criterion = criterion_std) |>
    summarise(human_mean = round(mean(middle_rating, na.rm = TRUE), 0), .groups = "drop")
} else {
  NULL
}

# LLM ratings: prefer the focal-run export, fall back to the generic file;
# NULL when neither exists.
llm_ratings <- if (file.exists("data/metrics_long_gpt5_pro_jan2026.csv")) {
  read_csv("data/metrics_long_gpt5_pro_jan2026.csv", show_col_types = FALSE) |>
    mutate(paper_norm = normalize_paper_name(paper)) |>
    select(paper_llm = paper, paper_norm, criterion = metric, llm_rating = midpoint)
} else if (file.exists("data/metrics_long.csv")) {
  read_csv("data/metrics_long.csv", show_col_types = FALSE) |>
    mutate(paper_norm = normalize_paper_name(paper)) |>
    select(paper_llm = paper, paper_norm, criterion = metric, llm_rating = midpoint)
} else {
  NULL
}

# Combine ratings if both available - join on normalized paper name AND criterion
# full_join keeps papers rated by only one side; coalesce() picks whichever
# display name is present.
if (!is.null(human_ratings) && !is.null(llm_ratings)) {
  combined_ratings <- human_ratings |>
    full_join(llm_ratings, by = c("paper_norm", "criterion")) |>
    mutate(
      display_name = criterion_display[criterion],
      paper_final = coalesce(paper, paper_llm)
    ) |>
    select(paper = paper_final, paper_norm, criterion, display_name, human_mean, llm_rating)
} else {
  combined_ratings <- NULL
}
Parse human critiques into individual issues
import json
import re
from pathlib import Path

# Severity parsing regex patterns
# Matches a line that OPENS with a recognized severity/section label,
# followed by whitespace, a colon, or end of line.
SEVERITY_HEADER_RE = re.compile(
    r"^(necessary|optional|optional but important|unsure|less important|possibly relevant|"
    r"probably important|probably correct|probably|note|notes|optional/desirable|"
    r"probably useful|maybe|possibly|unsure if correct|limitations)(\s|:|$)",
    re.IGNORECASE,
)
# Matches "1. " / "2) " style enumeration prefixes (number captured).
ENUM_RE = re.compile(r"^\s*(\d+)[\.\)]\s+")
# Matches evaluator attributions at line start, e.g. "[E1]", "DR:", "NotebookLM ".
EVALUATOR_RE = re.compile(r"^\[?(E\d+|Evaluator|DR|NotebookLM)\]?[\s:\[]", re.IGNORECASE)

def normalize_severity(header):
    """Map a severity header string to 'necessary', 'optional', 'unsure', or ''.

    Checks are ordered: "necessary" wins over "optional", which wins over
    the softer markers; anything unrecognized yields the empty string.
    """
    lowered = header.lower()
    if "necessary" in lowered:
        return "necessary"
    if "optional" in lowered:
        return "optional"
    soft_markers = ("unsure", "possibly", "probably", "less important", "maybe", "limitations")
    for marker in soft_markers:
        if marker in lowered:
            return "unsure"
    return ""

def extract_short_label(text, max_words=12):
    """Extract a descriptive short label from issue text.

    Applies a cascade of heuristics, returning the first that yields a
    plausible label: (1) detect and recover from meta-commentary,
    (2) "Label [Evaluator] ..." and "Label E2: ..." prefixes, (3) quoted
    key phrases, (4) "Label: description" / "Label — description" forms,
    (5) descriptive noun-phrase regexes, (6) the first sentence/clause.
    Falls back to a word-truncated prefix of the cleaned text, or the
    placeholder "[Meta-commentary - see full text]" when nothing
    substantive can be recovered.
    """
    original = text.strip()

    # Meta-commentary patterns to skip
    META_SKIP_PATTERNS = [
        r'^I chose',
        r'^I selected',
        r'^I think',
        r'^I would',
        r'^I believe',
        r'^These are',
        r'^This is (a|the|my)',
        r'^Note[s]?\s*[:)]',
        r'^DR\s*[:)]',
        r'^NotebookLM\s*[:)]',
        r'^The following',
        r'^Summary\s*[:)]',
        r'^Quote from',
        r'^See also',
        r'^As noted',
        r'^and that',  # Fragments starting with conjunctions
        r'^and it',
        r"^and that it's",
        r'^or that',
        r'^but that',
        r'^but it',
        r'^which ',
        r'^that ',
        r'^to be ',
        r'^it is ',
        r'^it would ',
    ]

    # First clean: Remove leading evaluator tags (DR —, [E1], etc.)
    # NOTE(review): evaluator surnames are hard-coded here (and again below);
    # new evaluators will not be stripped until added to both regexes.
    cleaned_for_meta_check = re.sub(
        r'^[\s\[]*(?:E\d+|Evaluator\s*\d*|DR|NotebookLM|Treich|Triech|Capitan|Reiley|Manheim|Espinosa|Seinkmane|Cloud|Tsai)[\]:\s—\-]+',
        '', original, flags=re.IGNORECASE
    ).strip()

    # Check for meta-commentary patterns AFTER removing evaluator prefix
    for pattern in META_SKIP_PATTERNS:
        if re.match(pattern, cleaned_for_meta_check, re.IGNORECASE):
            # Try to find actual content after the meta-commentary
            # Look for a quoted phrase or substantive content after colon/period
            quote_in_meta = re.search(r'["""\']([^"""\'\n]{15,80})["""\']', cleaned_for_meta_check)
            if quote_in_meta:
                quoted = quote_in_meta.group(1).strip()
                # Reject quotes that read like sentence fragments ("the ...").
                if not re.match(r'^(the |a |an |this )', quoted.lower()):
                    return quoted
            # Look for content after a colon
            after_meta = re.split(r'[:\n]', cleaned_for_meta_check, maxsplit=1)
            if len(after_meta) > 1 and len(after_meta[1].strip()) > 20:
                # Found substantive content: run the normal pipeline on it.
                original = after_meta[1].strip()
                break
            else:
                # Return a generic placeholder if no content found
                return "[Meta-commentary - see full text]"

    # FIRST: Check if text starts with "Label [Evaluator] more text" pattern
    # Extract just the label part before the evaluator tag
    # Match common evaluator patterns: [E1], [Treich], [Triech], [Reiley], [Capitan], etc.
    label_before_eval = re.match(r'^([A-Z][^[\n]{10,80}?)\s*\[(?:E\d+|[A-Z][a-z]+)\]', original)
    if label_before_eval:
        label = label_before_eval.group(1).strip()
        label = re.sub(r'[\s:,;—\-]+$', '', label)
        if 3 <= len(label.split()) <= max_words:
            return label

    # Also check for "Label E2: more text" pattern (evaluator without brackets)
    label_before_eval2 = re.match(r'^([A-Z][^\n]{10,80}?)\s+(?:E\d+|DR)\s*:', original)
    if label_before_eval2:
        label = label_before_eval2.group(1).strip()
        label = re.sub(r'[\s:,;—\-]+$', '', label)
        if 3 <= len(label.split()) <= max_words:
            return label

    # Remove evaluator tags, names, and quotes at start
    clean = re.sub(r'^\[?(?:E\d+|Evaluator\s*\d*|DR|NotebookLM|Treich|Triech|Capitan|Reiley|Manheim|Espinosa|Seinkmane|Cloud|Tsai)\]?[\s:\[]*', '', original, flags=re.IGNORECASE)
    clean = re.sub(r'^[""\'\[\(]', '', clean)
    clean = re.sub(r'^Author[\'s]*\s*response:?\s*', '', clean, flags=re.IGNORECASE)

    # Skip if remaining text starts with a name (single capitalized word followed by colon)
    name_check = re.match(r'^([A-Z][a-z]+)\s*:', clean)
    if name_check and len(name_check.group(1)) < 15:
        clean = clean[name_check.end():].strip()

    # Look for quoted phrases that are key terms (handle both straight and curly quotes)
    quote_match = re.search(r'["""\']([^"""\'\n]{15,80})["""\']', clean[:250])
    if quote_match:
        quoted = quote_match.group(1).strip()
        if not re.match(r'^(the |a |an |this )', quoted.lower()):
            return quoted

    # Look for "Label: description" or "Label — description" patterns
    # Separators are a colon or an em dash only; plain hyphens are deliberately
    # not treated as separators, so compound words like "pre-registered" stay intact.
    label_match = re.match(r'^([^:\n]{10,80}?)(?:\s*[:\—]\s*|\s+—\s+)', clean)
    if label_match:
        label = label_match.group(1).strip()
        label = re.sub(r'[""\']+$', '', label)
        # Skip if it ends with a preposition or conjunction (incomplete phrase)
        if not re.search(r'\s+(or|and|of|for|to|in|on|by|with|the|a|an)$', label.lower()):
            # Also reject single capitalized words (likely a name, not a label).
            if not re.match(r'^[A-Z][a-z]+$', label) and len(label.split()) <= max_words:
                return label

    # Look for descriptive noun phrases (allow hyphens in words)
    patterns = [
        r'((?:Lack of |Missing |Unclear |Weak |Strong |Potential )?[A-Z][a-z]+(?:[-\s]+[a-z-]+){1,5}(?:\s+(?:effects?|bias|issues?|concerns?|limitations?|assumptions?|analysis|validity|problems?|risks?|errors?|size|needs?|checking)))',
        r'((?:Small |Large |Limited |Significant )[a-z-]+(?:\s+[a-z-]+){0,5})',
        r'([A-Z][a-z]+\s+(?:sample|data|model|method|approach|design|analysis|claims?|effects?)(?:\s+[a-z-]+){0,3})',
        r'(Consider\s+[a-z-]+(?:[-\s]+[a-z-]+){0,5})',
        r'(Deviations?\s+from\s+(?:the\s+)?[a-z-]+(?:[-\s]+[a-z-]+){0,5})',
    ]
    for pattern in patterns:
        match = re.search(pattern, clean[:250])
        if match:
            label = match.group(1).strip()
            # Remove trailing preposition phrases like "or degrees of", "and more", etc.
            # Repeated until the label stops shrinking.
            while True:
                new_label = re.sub(r'\s+(or|and|with|for|to|of|in|on|by|including)(\s+\w+)*\s*$', '', label)
                if new_label == label:
                    break
                label = new_label
            if 10 < len(label) < 120 and len(label.split()) <= max_words:
                return label

    # Take first sentence or clause
    first_clause = re.split(r'(?<=[.!?])\s+|\n', clean)[0].strip()
    first_clause = re.sub(r'^(The |A |An )', '', first_clause)
    first_clause = re.sub(r'\s+(including|or|and|with|for|to|of|in|on|by)\s*$', '', first_clause)

    # Remove evaluator tags from middle/end of text
    first_clause = re.sub(r'\s*\[(?:E\d+|[A-Z][a-z]+)\]\s*', ' ', first_clause)
    first_clause = re.sub(r'\s+(?:E\d+|DR)\s*:.*$', '', first_clause)
    first_clause = re.sub(r'\s+', ' ', first_clause).strip()

    words = first_clause.split()
    if 3 <= len(words) <= max_words:
        return first_clause
    elif len(words) > max_words:
        # Truncate at a natural break (conjunction/comma/dash) if one falls in
        # the second half of the truncated text; otherwise hard-cut with "...".
        text = ' '.join(words[:max_words])
        last_break = max(text.rfind(' or '), text.rfind(' and '), text.rfind(', '), text.rfind(' — '))
        if last_break > len(text) // 2:
            return text[:last_break].strip(' ,—')
        return text + '...'

    # Fallback
    key_terms = re.findall(r'[A-Z][a-z]+(?:\s+[a-z]+){0,2}', clean[:150])
    if key_terms:
        return ' / '.join(key_terms[:3])

    words = clean.split()[:max_words]
    return ' '.join(words) + ('...' if len(clean.split()) > max_words else '')

def extract_llm_label(text, max_words=10):
    """Produce a short display label for an LLM-generated issue string.

    Strips any leading enumeration, prefers a descriptive "Topic: ..."
    prefix when present, and otherwise falls back to the first clause
    truncated to at most ``max_words`` words.
    """
    # Strip leading numbering such as "1. " or "2) ".
    stripped = re.sub(r'^[\d\.\)]+\s*', '', text.strip())

    # LLM issues often follow a "Topic: detailed explanation" format.
    topic = re.match(r'^([^:]{8,60}):\s*', stripped)
    if topic:
        candidate = topic.group(1).strip()
        # Only accept multi-word topics; single words are too terse.
        if len(candidate.split()) >= 2:
            return candidate

    # Fall back to the first clause, word-truncated if necessary.
    clause = re.split(r'[;.]', stripped)[0].strip()
    tokens = clause.split()
    if len(tokens) <= max_words:
        return clause
    return ' '.join(tokens[:max_words]) + '...'

def split_human_issues(text):
    """Parse human critique text into individual issues with severity labels.

    Returns a list of dicts with keys "text" (the cleaned issue body),
    "severity" ("necessary"/"optional"/"unsure"/""), and "short_label".
    Issue boundaries are detected from severity headers, numbered list
    items, short "label" lines followed by detail, and (heuristically)
    blank lines after long passages. Severity is sticky: once a header is
    seen it applies to all following issues until the next header.
    """
    if not text:
        return []

    issues = []
    current_severity = ""
    current_issue_lines = []

    def flush_issue():
        # Finalize the accumulated lines into one issue entry, discarding
        # fragments that are too short or are themselves just a header.
        nonlocal current_issue_lines
        if not current_issue_lines:
            return
        full_text = "\n".join(current_issue_lines).strip()
        # Clean leading punctuation/numbering
        cleaned = re.sub(r"^[\-\*\d\.\)\s]+", "", full_text).strip()
        if not cleaned or len(cleaned) < 10:
            current_issue_lines = []
            return
        # Skip if it's just a severity label
        if SEVERITY_HEADER_RE.match(cleaned):
            current_issue_lines = []
            return

        short_label = extract_short_label(cleaned)
        # Skip if this is meta-commentary (the label extraction will return placeholder)
        if short_label == "[Meta-commentary - see full text]":
            current_issue_lines = []
            return

        issues.append({
            "text": cleaned,
            "severity": current_severity,
            "short_label": short_label
        })
        current_issue_lines = []

    lines = text.splitlines()
    i = 0
    while i < len(lines):
        line = lines[i].rstrip()
        raw = line.strip()

        # Check for severity header
        if raw and SEVERITY_HEADER_RE.match(raw):
            flush_issue()
            # NOTE(review): only the FIRST word before the colon is passed, so
            # multi-word headers like "less important" normalize to "" rather
            # than "unsure" -- confirm this is intended.
            current_severity = normalize_severity(raw.split(':')[0].split()[0])
            i += 1
            continue

        # Check for numbered issue start
        if ENUM_RE.match(raw):
            flush_issue()
            # Remove the number prefix
            issue_text = re.sub(r"^\s*\d+[\.\)]\s*", "", raw)
            current_issue_lines = [issue_text] if issue_text else []
            i += 1
            continue

        # Check for issue label line (short line followed by longer explanation)
        # e.g., "Ceiling effects with Likert scale measures"
        if raw and not EVALUATOR_RE.match(raw) and len(raw) < 80:
            # Look ahead - if next non-empty line starts with [E or is indented, this is a label
            next_idx = i + 1
            while next_idx < len(lines) and not lines[next_idx].strip():
                next_idx += 1
            if next_idx < len(lines):
                next_line = lines[next_idx].strip()
                if (EVALUATOR_RE.match(next_line) or
                    lines[next_idx].startswith(' ') or
                    next_line.startswith('"') or
                    next_line.startswith("'")):
                    # This short line is likely a label for a new issue
                    flush_issue()
                    current_issue_lines = [raw]
                    i += 1
                    continue

        # Empty line - might signal new issue in some formats
        if not raw:
            # Don't flush on every empty line - only if we have significant content
            if current_issue_lines and len("\n".join(current_issue_lines)) > 100:
                # Check if next content looks like new issue
                next_idx = i + 1
                while next_idx < len(lines) and not lines[next_idx].strip():
                    next_idx += 1
                if next_idx < len(lines):
                    next_line = lines[next_idx].strip()
                    if ENUM_RE.match(next_line) or (len(next_line) < 80 and not EVALUATOR_RE.match(next_line)):
                        flush_issue()
            i += 1
            continue

        # Regular content line - add to current issue
        if current_issue_lines:
            current_issue_lines.append(raw)
        else:
            current_issue_lines = [raw]
        i += 1

    # Flush whatever remains at end of input.
    flush_issue()
    return issues

# Load comparison data and results
comparison_file = Path("results/key_issues_comparison.json")
results_file = Path("results/key_issues_comparison_results.json")

# paper_id -> list of parsed human issue dicts (text/severity/short_label)
parsed_issues = {}
# paper_id -> LLM comparison dict (matched_pairs, unmatched_*, ...)
comparison_results = {}

if comparison_file.exists():
    comparison = json.loads(comparison_file.read_text())
    for item in comparison:
        paper_id = item.get("gpt_paper", "")
        coda_critique = item.get("coda_critique", "")
        parsed_issues[paper_id] = split_human_issues(coda_critique)

if results_file.exists():
    results = json.loads(results_file.read_text())
    for r in results:
        paper_id = r.get("gpt_paper", "")
        comparison_results[paper_id] = r.get("comparison", {})

We matched 14 papers with both GPT-5.2 Pro key issues and human expert critiques.

Overview Statistics

Show code
if (!is.null(comparison_data) && n_papers > 0) {
  # Build a one-row summary, then pivot to a long Metric/Value table.
  # gpt_key_issues is a list-column, so issues are counted with length().
  overview <- comparison_data |>
    mutate(
      num_gpt_issues = sapply(gpt_key_issues, length)
    ) |>
    summarise(
      `Papers Compared` = n(),
      `Avg GPT Issues per Paper` = round(mean(num_gpt_issues), 1),
      `Min GPT Issues` = min(num_gpt_issues),
      `Max GPT Issues` = max(num_gpt_issues),
      `Avg Coda Critique Length (chars)` = round(mean(coda_critique_length)),
      `Min Coda Length` = min(coda_critique_length),
      `Max Coda Length` = max(coda_critique_length)
    ) |>
    # Pivoting coerces all metrics into one numeric column, which is why
    # integer metrics render as e.g. "14.0" in the output table.
    pivot_longer(everything(), names_to = "Metric", values_to = "Value")

  kable(overview, align = c("l", "r"))
}
Table 4.1: Summary of key issues comparison data
Metric Value
Papers Compared 14.0
Avg GPT Issues per Paper 11.6
Min GPT Issues 10.0
Max GPT Issues 12.0
Avg Coda Critique Length (chars) 2311.0
Min Coda Length 1197.0
Max Coda Length 4998.0

Paper-by-Paper Comparison

Each paper section shows:

  • Assessment badge: Coverage and precision metrics with overall rating
  • LLM Assessment Summary: GPT-5.2 Pro’s justification of the comparison
  • Missed Issues: Human expert issues that GPT failed to capture
  • Extra Issues: GPT issues not present in human critique
  • Full texts: Expandable sections with complete human critique and GPT issues
Render paper comparisons
# Access parsed issues and comparison results from Python
# `py` is the reticulate bridge exposing the Python session's globals
# (the parsed_issues / comparison_results dicts become named R lists).
parsed_issues_list <- py$parsed_issues
comparison_results_list <- py$comparison_results

if (!is.null(comparison_data) && n_papers > 0) {

  for (i in 1:nrow(comparison_data)) {
    row <- comparison_data[i, ]
    paper_id <- row$gpt_paper

    # Paper header
    cat(sprintf("\n### %s\n\n", paper_id))
    cat(sprintf("**Coda title:** %s\n\n", row$coda_title))

    # Get comparison results for this paper
    paper_comparison <- comparison_results_list[[paper_id]]
    gpt_issues <- row$gpt_key_issues[[1]]

    # Show LLM assessment badge
    if (!is.na(row$coverage_pct)) {
      coverage_val <- as.numeric(row$coverage_pct)
      precision_val <- as.numeric(row$precision_pct)
      avg_score <- (coverage_val + precision_val) / 2
      rating <- if (!is.null(row$overall_rating)) row$overall_rating else "Unknown"

      # Calculate weighted coverage: mean of match_quality across all human issues (0 for unmatched)
      matched_pairs_data <- if (!is.null(paper_comparison$matched_pairs)) paper_comparison$matched_pairs else list()
      unmatched_human_data <- if (!is.null(paper_comparison$unmatched_human)) paper_comparison$unmatched_human else list()
      n_matched <- length(matched_pairs_data)
      n_unmatched <- length(unmatched_human_data)
      n_total_human <- n_matched + n_unmatched

      if (n_total_human > 0) {
        # Sum of match_quality for matched issues + 0 for each unmatched issue
        match_qualities <- sapply(matched_pairs_data, function(x) as.numeric(x$match_quality))
        weighted_coverage <- round(sum(match_qualities) / n_total_human, 0)
      } else {
        weighted_coverage <- NA
      }

      # Determine badge color based on rating
      if (rating %in% c("Good", "Excellent")) {
        badge_class <- "callout-tip"
      } else if (rating == "Moderate") {
        badge_class <- "callout-note"
      } else {
        badge_class <- "callout-warning"
      }

      cat(sprintf("::: {.%s appearance=\"simple\"}\n", badge_class))
      if (!is.na(weighted_coverage)) {
        cat(sprintf("**Coverage:** %s%% | **Weighted Coverage:** %s%% | **Precision:** %s%%\n",
                    row$coverage_pct, weighted_coverage, row$precision_pct))
      } else {
        cat(sprintf("**Coverage:** %s%% | **Precision:** %s%%\n",
                    row$coverage_pct, row$precision_pct))
      }
      cat(":::\n\n")
    }

    # ---- RATINGS CONTEXT TABLE (collapsible) ----
    if (!is.null(combined_ratings)) {
      # Match paper using normalized name for exact matching
      paper_id_norm <- normalize_paper_name(paper_id)
      paper_ratings <- combined_ratings |>
        filter(paper_norm == paper_id_norm) |>
        filter(!is.na(display_name)) |>  # Only rows with valid display names
        arrange(criterion != "overall", display_name)  # Overall first, then alphabetical

      if (nrow(paper_ratings) > 0) {
        cat('<details style="margin-bottom:1em;">\n')
        cat('<summary style="font-size:0.9em;color:#666;cursor:pointer;">📊 Human vs LLM ratings for this paper</summary>\n')
        cat('<div style="margin-top:0.5em;overflow-x:auto;">\n')
        cat('<table style="font-size:0.85em;border-collapse:collapse;width:100%;">\n')
        cat('<tr style="background:#f5f5f5;"><th style="padding:6px 12px;text-align:left;">Criterion</th><th style="padding:6px 12px;text-align:center;">Human (avg)</th><th style="padding:6px 12px;text-align:center;">LLM (mid)</th></tr>\n')

        for (j in 1:nrow(paper_ratings)) {
          r <- paper_ratings[j, ]
          h_val <- if (!is.na(r$human_mean)) as.character(r$human_mean) else "—"
          l_val <- if (!is.na(r$llm_rating)) as.character(round(r$llm_rating, 0)) else "—"
          crit_name <- r$display_name
          cat(sprintf('<tr><td style="padding:6px 12px;">%s</td><td style="padding:6px 12px;text-align:center;">%s</td><td style="padding:6px 12px;text-align:center;">%s</td></tr>\n',
                      crit_name, h_val, l_val))
        }
        cat('</table>\n')
        cat('<div style="font-size:0.75em;color:#888;margin-top:0.25em;">Human = avg of evaluator midpoints; LLM = model midpoint rating</div>\n')
        cat('</div>\n')
        cat('</details>\n\n')
      }
    }

    # ---- LLM ASSESSMENT SUMMARY ----
    if (!is.null(row$overall_justification) && nchar(row$overall_justification) > 0) {
      cat("#### LLM Assessment Summary\n\n")
      cat(row$overall_justification)
      cat("\n\n")
    }

    # ---- DETAILED NOTES (expandable) ----
    if (!is.null(row$detailed_notes) && nchar(row$detailed_notes) > 0) {
      cat("::: {.callout-note collapse='true'}\n")
      cat("## Detailed Comparison Notes\n\n")
      cat(row$detailed_notes)
      cat("\n:::\n\n")
    }

    # ---- MATCHED PAIRS (New format from enhanced LLM comparison) ----
    matched_pairs <- if (!is.null(paper_comparison$matched_pairs)) paper_comparison$matched_pairs else list()

    if (length(matched_pairs) > 0) {
      cat(sprintf("#### ✅ Matched Issues (%d human issues with LLM coverage)\n\n", length(matched_pairs)))

      for (j in seq_along(matched_pairs)) {
        pair <- matched_pairs[[j]]
        label <- if (!is.null(pair$label)) pair$label else "Unnamed issue"
        match_quality <- if (!is.null(pair$match_quality)) pair$match_quality else "?"
        match_explanation <- if (!is.null(pair$match_explanation)) pair$match_explanation else ""
        detailed_discussion <- if (!is.null(pair$detailed_discussion)) pair$detailed_discussion else ""
        human_idx <- if (!is.null(pair$human_issue_index)) pair$human_issue_index else j
        llm_indices <- if (!is.null(pair$llm_issue_indices)) pair$llm_issue_indices else c()

        # Match quality color
        if (is.numeric(match_quality)) {
          if (match_quality >= 70) {
            quality_color <- "#27ae60"  # Green
          } else if (match_quality >= 40) {
            quality_color <- "#f39c12"  # Orange
          } else {
            quality_color <- "#e74c3c"  # Red
          }
        } else {
          quality_color <- "#95a5a6"  # Gray
        }

        # Each matched issue is collapsible to reduce clutter
        cat(sprintf('<details style="margin:0.75em 0;padding:0;background:#f0fff0;border-left:4px solid %s;border-radius:4px;">\n', quality_color))
        llm_tags <- if (length(llm_indices) > 0) paste(sapply(llm_indices, function(x) sprintf("L%d", x)), collapse = ", ") else ""
        cat(sprintf('<summary style="padding:0.75em;cursor:pointer;display:flex;justify-content:space-between;align-items:center;">\n'))
        cat(sprintf('<span><strong style="font-size:1.05em;">H%d: %s</strong> <span style="font-size:0.85em;color:#666;">↔ %s</span></span>\n',
                    human_idx, gsub("<", "&lt;", gsub(">", "&gt;", label)), llm_tags))
        cat(sprintf('<span style="background-color:%s;color:white;padding:3px 8px;border-radius:12px;font-weight:bold;font-size:0.9em;">%s%%</span>\n', quality_color, match_quality))
        cat('</summary>\n')
        cat('<div style="padding:0 0.75em 0.75em 0.75em;">\n')

        if (nchar(match_explanation) > 0) {
          cat(sprintf('<div style="font-size:0.9em;margin-bottom:0.5em;">%s</div>\n', gsub("<", "&lt;", gsub(">", "&gt;", match_explanation))))
        }

        # ---- SHOW ACTUAL HUMAN ISSUE TEXT ----
        # First check if human_issue_text is directly available in data
        human_text <- if (!is.null(pair$human_issue_text)) pair$human_issue_text else NULL

        # If not, try to extract from coda_critique using label keywords
        if (is.null(human_text) || nchar(human_text) == 0) {
          # Extract keywords from label (split on / and space, take significant words)
          keywords <- unlist(strsplit(tolower(label), "[/ ]+"))
          keywords <- keywords[nchar(keywords) > 4]  # Only words > 4 chars

          # Search coda_critique for lines containing these keywords
          coda_lines <- unlist(strsplit(row$coda_critique, "\n"))
          for (kw in keywords[1:min(3, length(keywords))]) {
            # Use fixed matching to avoid regex issues with special chars
            matches <- coda_lines[grepl(kw, tolower(coda_lines), fixed = TRUE)]
            if (length(matches) > 0) {
              human_text <- paste(matches[1:min(2, length(matches))], collapse = " ")
              break
            }
          }
        }

        if (!is.null(human_text) && nchar(human_text) > 10) {
          cat('<details style="margin-top:0.75em;background:#fff3e0;padding:0.5em;border-radius:4px;" open>\n')
          cat('<summary style="font-size:0.9em;color:#e65100;cursor:pointer;font-weight:500;">👤 Human expert critique text</summary>\n')
          # Matched human critique text, rendered in an orange-accented box.
          # `human_text` is bound earlier in the (off-screen) matched-issue loop.
          # NOTE(review): here and throughout, only "<" and ">" are escaped;
          # "&" is not converted to "&amp;" — confirm this is acceptable for the
          # critique text being rendered.
          cat(sprintf('<div style="margin-top:0.5em;padding:0.5em;background:white;border-left:3px solid #e65100;font-size:0.85em;">%s</div>\n',
                      gsub("<", "&lt;", gsub(">", "&gt;", human_text))))
          cat('</details>\n')
        }

        # ---- SHOW ACTUAL LLM ISSUE TEXT(S) ----
        # For each matched LLM issue index, print the corresponding entry of
        # `gpt_issues` (both bound earlier, outside this chunk) in a blue box.
        # The <details ... open> starts expanded by default.
        if (length(llm_indices) > 0 && length(gpt_issues) > 0) {
          cat('<details style="margin-top:0.75em;background:#e8f4fd;padding:0.5em;border-radius:4px;" open>\n')
          cat('<summary style="font-size:0.9em;color:#1565c0;cursor:pointer;font-weight:500;">🤖 Matched LLM issue text</summary>\n')
          cat('<div style="margin-top:0.5em;">\n')
          for (idx in llm_indices) {
            # Guard against out-of-range indices coming from the comparison JSON.
            if (idx <= length(gpt_issues)) {
              llm_text <- gpt_issues[idx]
              cat(sprintf('<div style="margin:0.5em 0;padding:0.5em;background:white;border-left:3px solid #1565c0;font-size:0.85em;">\n'))
              # Label each entry "L<idx>:" to match the numbering used in the
              # assessment summary text.
              cat(sprintf('<strong style="color:#1565c0;">L%d:</strong> %s\n', idx, gsub("<", "&lt;", gsub(">", "&gt;", llm_text))))
              cat('</div>\n')
            }
          }
          cat('</div>\n')
          cat('</details>\n')
        }

        # Optional collapsed panel with the match's detailed discussion, if any.
        if (nchar(detailed_discussion) > 0) {
          cat('<details style="margin-top:0.5em;">\n')
          cat('<summary style="font-size:0.85em;color:#666;cursor:pointer;">Show detailed discussion</summary>\n')
          cat(sprintf('<div style="margin-top:0.5em;padding:0.5em;background:#fafafa;font-size:0.85em;border-radius:4px;">%s</div>\n',
                      gsub("<", "&lt;", gsub(">", "&gt;", detailed_discussion))))
          cat('</details>\n')
        }

        cat('</div>\n')  # Close content wrapper
        cat('</details>\n')  # Close collapsible matched issue
      }
      cat('\n')
    }

    # ---- UNMATCHED HUMAN ISSUES (New format, collapsible) ----
    # Human-raised issues the LLM did not cover. Each list item is expected to
    # carry $index, $brief_description and $why_missed; all three have
    # fallbacks, so partially populated JSON still renders.
    unmatched_human <- if (!is.null(paper_comparison$unmatched_human)) paper_comparison$unmatched_human else list()

    if (length(unmatched_human) > 0) {
      cat('<details style="margin-bottom:1em;">\n')
      cat(sprintf('<summary style="color:#c62828;font-weight:bold;cursor:pointer;">⚠️ Unmatched Human Issues (%d not captured by LLM)</summary>\n', length(unmatched_human)))
      cat('<div style="margin-top:0.5em;">\n')

      for (j in seq_along(unmatched_human)) {
        item <- unmatched_human[[j]]
        # Fall back to the loop position when the JSON omits an explicit index.
        idx <- if (!is.null(item$index)) item$index else j
        brief <- if (!is.null(item$brief_description)) item$brief_description else "No description"
        why_missed <- if (!is.null(item$why_missed)) item$why_missed else ""

        # Red-accented box; "H<idx>" mirrors the human-issue numbering used in
        # the summary text.
        cat(sprintf('<div style="margin:0.5em 0;padding:0.75em;background:#ffebee;border-left:4px solid #c62828;font-size:0.9em;">\n'))
        cat(sprintf('<strong>H%d:</strong> %s\n', idx, gsub("<", "&lt;", gsub(">", "&gt;", brief))))
        if (nchar(why_missed) > 0) {
          cat(sprintf('<div style="margin-top:0.5em;font-size:0.85em;color:#666;"><em>Why missed:</em> %s</div>\n',
                      gsub("<", "&lt;", gsub(">", "&gt;", why_missed))))
        }
        cat('</div>\n')
      }
      cat('</div>\n')
      cat('</details>\n\n')
    }

    # ---- UNMATCHED LLM ISSUES (New format) ----
    # Mirror of the block above for LLM-only issues; uses $why_extra instead of
    # $why_missed and blue styling instead of red.
    unmatched_llm <- if (!is.null(paper_comparison$unmatched_llm)) paper_comparison$unmatched_llm else list()

    if (length(unmatched_llm) > 0) {
      cat('<details style="margin-bottom:1.5em;">\n')
      cat(sprintf('<summary style="color:#1565c0;font-weight:bold;cursor:pointer;">📋 Unmatched LLM Issues (%d not in human critique)</summary>\n', length(unmatched_llm)))
      cat('<div style="margin-top:0.5em;">\n')

      for (j in seq_along(unmatched_llm)) {
        item <- unmatched_llm[[j]]
        idx <- if (!is.null(item$index)) item$index else j
        brief <- if (!is.null(item$brief_description)) item$brief_description else "No description"
        why_extra <- if (!is.null(item$why_extra)) item$why_extra else ""

        cat(sprintf('<div style="margin:0.5em 0;padding:0.5em;background:#e3f2fd;border-left:4px solid #1565c0;font-size:0.9em;">\n'))
        cat(sprintf('<strong>L%d:</strong> %s\n', idx, gsub("<", "&lt;", gsub(">", "&gt;", brief))))
        if (nchar(why_extra) > 0) {
          cat(sprintf('<div style="margin-top:0.5em;font-size:0.85em;color:#666;"><em>Why extra:</em> %s</div>\n',
                      gsub("<", "&lt;", gsub(">", "&gt;", why_extra))))
        }
        cat('</div>\n')
      }
      cat('</div>\n')
      cat('</details>\n\n')
    }

    # ---- FALLBACK: Old format missed/extra issues (if new format not available) ----
    # Only entered when the comparison JSON has neither matched pairs nor
    # new-format unmatched human issues; renders the legacy flat
    # $missed_issues / $extra_issues character vectors instead.
    if (length(matched_pairs) == 0 && length(unmatched_human) == 0) {
      # Use old format
      missed_issues <- if (!is.null(paper_comparison$missed_issues)) paper_comparison$missed_issues else character(0)

      if (length(missed_issues) > 0) {
        cat(sprintf("#### ⚠️ Issues Human Experts Raised That GPT Missed (%d)\n\n", length(missed_issues)))
        for (j in seq_along(missed_issues)) {
          issue_text <- missed_issues[[j]]
          issue_escaped <- gsub("<", "&lt;", gsub(">", "&gt;", issue_text))
          cat(sprintf('<div style="margin:0.5em 0;padding:0.75em;background:#ffebee;border-left:4px solid #c62828;font-size:0.9em;">\n'))
          cat(sprintf('%s\n', issue_escaped))
          cat('</div>\n')
        }
        cat('\n')
      }

      extra_issues <- if (!is.null(paper_comparison$extra_issues)) paper_comparison$extra_issues else character(0)

      if (length(extra_issues) > 0) {
        cat('<details style="margin-bottom:1.5em;">\n')
        cat(sprintf('<summary style="color:#1565c0;font-weight:bold;cursor:pointer;">📋 Extra Issues GPT Identified (%d not in human critique)</summary>\n', length(extra_issues)))
        cat('<div style="margin-top:0.5em;">\n')
        for (j in seq_along(extra_issues)) {
          issue_text <- extra_issues[[j]]
          issue_escaped <- gsub("<", "&lt;", gsub(">", "&gt;", issue_text))
          cat(sprintf('<div style="margin:0.5em 0;padding:0.5em;background:#e3f2fd;border-left:4px solid #1565c0;font-size:0.9em;">\n'))
          cat(sprintf('%s\n', issue_escaped))
          cat('</div>\n')
        }
        cat('</div>\n')
        cat('</details>\n\n')
      }
    }

    # ---- FULL HUMAN CRITIQUE (expandable) ----
    # `row` is the current paper's record from the outer (off-screen) loop;
    # presumably row$coda_critique holds the full Coda critique text — confirm
    # against the loop header above this chunk. white-space:pre-wrap preserves
    # the critique's original line breaks.
    cat('<details style="margin-bottom:1em;">\n')
    cat('<summary style="color:#e65100;cursor:pointer;">📄 View full human critique</summary>\n')
    cat('<div style="margin-top:0.5em;padding:0.75em;background:#fffbf0;border:1px solid #ffe0b2;border-radius:4px;font-size:0.9em;white-space:pre-wrap;">\n')
    cat(gsub("<", "&lt;", gsub(">", "&gt;", row$coda_critique)))
    cat('\n</div>\n')
    cat('</details>\n\n')

    # ---- FULL GPT ISSUES (expandable) ----
    # Complete numbered list of the LLM's key issues for this paper.
    if (length(gpt_issues) > 0) {
      cat('<details style="margin-bottom:1em;">\n')
      cat(sprintf('<summary style="color:#1565c0;cursor:pointer;">🤖 View all GPT key issues (%d)</summary>\n', length(gpt_issues)))
      cat('<div style="margin-top:0.5em;">\n')
      for (j in seq_along(gpt_issues)) {
        issue <- gpt_issues[j]
        # Strip any leading enumeration the source text carries (e.g. "1) " or
        # "1. ") so the rendered list numbers don't double up, then drop a
        # stray leading ". " left behind by the first pass.
        issue_clean <- gsub("^[0-9]+[)\\.]\\s*", "", issue)
        issue_clean <- gsub("^\\. ", "", issue_clean)
        issue_escaped <- gsub("<", "&lt;", gsub(">", "&gt;", issue_clean))
        cat(sprintf('<div style="margin:0.5em 0;padding:0.5em;background:#f0f7ff;border-left:4px solid #4e79a7;font-size:0.9em;">\n'))
        cat(sprintf('<strong>%d.</strong> %s\n', j, issue_escaped))
        cat('</div>\n')
      }
      cat('</div>\n')
      cat('</details>\n\n')
    }

    # Horizontal rule separating consecutive papers in the rendered report.
    cat("---\n\n")
  }
}

Acemoglu_et_al._2024

Coda title: Misperceptions and Demand for Democracy under Authoritarianism

Coverage: 50% | Weighted Coverage: 39% | Precision: 33%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 88 92
Advancing knowledge 89 92
Claims & evidence 88 90
Global relevance 91 95
Logic & communication 90 92
Methods 84 88
Open science 62 68
Real-world relevance 89 —
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

The LLM captures the spillover/interference concern very well and partially captures the human construct-validity/external-validity worry about framing and over-strong ‘demand’ interpretation. However, it misses two prominent human points: (i) the strong sensitivity of field precision/significance to including prior election controls, and (ii) the lab design-analysis suggestion to use pre–post differences. Many LLM critiques are additional and not present in the human list, lowering precision.

Matching threshold used: match_quality >= 30. Matched LLM issues: L2, L5, L6, L11. Unmatched human issues reflect more specification-robustness and within-subject (pre/post) analysis concerns, which are distinct from the LLM’s emphasis on identification assumptions (SUTVA/IV), message epistemics, and external validity/sampling.

✅ Matched Issues (2 human issues with LLM coverage)

H2: Spillovers/interference attenuate estimated treatment effects ↔︎ L5 95%
Both flag that untreated units may be indirectly exposed to treatment information, violating SUTVA and likely attenuating (biasing toward zero) estimated effects.
👤 Human expert critique text
Spillovers attenuate estimates (~lower bounds): [E2] “adjacent untreated neighborhoods may have been indirectly exposed to treatment messages via word-of-mouth or sharing of pamphlets. If spillovers occurred, the estimated treatment effects could be attenuated or biased.”
🤖 Matched LLM issue text

L5: Potential interference/spillovers across neighborhoods (social discussion, adjacency, shared spaces) are plausible in a city setting; the design and analysis largely assume SUTVA without bounding or testing spillovers.

Show detailed discussion
The human critique emphasizes that spillovers (e.g., word-of-mouth, shared pamphlets) would make the reported effects lower bounds and potentially biased. The LLM issue similarly points to plausible interference in a dense urban setting and notes the analysis largely assumes SUTVA without tests or bounds. Both are focused on the same identification threat: contamination of control neighborhoods. The LLM additionally suggests design/analysis remedies (testing/bounding), whereas the human note mainly highlights the direction of bias (attenuation).
H3: Framing/interpretation and external validity of “demand” claims ↔︎ L2, L6, L11 60%
Both question whether the messages and outcomes identify normative commitments to democracy/media freedom versus context-specific persuasion, and both raise limits on generalizing beyond the specific Turkish context.
👤 Human expert critique text
“These framings are substantively compelling and contextually salient in Turkey, but they do not isolate support for democracy or media freedom as normative commitments.” “Demand for democracy” wording is too strong; their results may reflect the specific salience of (natural disasters) and corruption, particularly in light of a recent earthquake with widespread reports of harm linked to corruption. This may limit the external generalizability.
🤖 Matched LLM issue text

L2: Treatment scripts/pamphlets present causal claims (democracy → fewer disaster deaths; media independence → less corruption) as near-laws, but the cited evidence is largely observational/context-dependent; this risks conflating persuasion by normative/valence messaging with “accurate information correction.”

L6: External validity is limited: the field experiment is in a single city/province with selective neighborhood inclusion, and the online sample is Facebook-ad recruited with non-representative composition; generalization to national electorates or different authoritarian contexts is uncertain.

L11: The paper sometimes slides from compliance-adjusted per-contact effects to broad claims about the impact of “accurate information campaigns”; clearer separation of reduced-form assignment effects, LATE, and mechanism interpretation would improve causal communication.

Show detailed discussion
The human critique argues the framings are contextually salient but do not cleanly isolate intrinsic democratic or media-freedom commitments, warning that “demand for democracy” may overstate what is being measured and may be driven by earthquake/disaster and corruption salience. The LLM’s L2 overlaps by arguing the scripts present strong causal claims that may function as persuasive/valence messaging rather than “accurate information correction,” which similarly complicates interpretation of what attitude is being moved. The LLM’s L6 directly overlaps with the human concern about generalizability, though the LLM frames it more as sampling/site limitations (single city, Facebook sample) rather than conceptual generalizability of the construct. L11 partially aligns by noting the paper may overgeneralize from specific estimands/mechanisms to broad claims about “accurate information campaigns,” which is adjacent to the human worry about overly strong “demand for democracy” language. The match is not perfect because the human issue is especially about construct validity (normative commitment vs instrumental evaluation), while the LLM spreads the concern across message epistemics, sample/site external validity, and over-claiming.
⚠️ Unmatched Human Issues (2 not captured by LLM)
H1: Field results are highly sensitive to including prior election results as a control; without it, SEs are much larger and significance disappears.
Why missed: The LLM list does not discuss covariate adjustment sensitivity, precision/power dependence on controls, or highlighting specification fragility in Table 2; it focuses more on interference, compliance/IV interpretation, and implementation validity.
H4: Lab experiment should analyze change scores (post–pre) rather than only post-treatment outcomes to leverage baseline and improve inference.
Why missed: No LLM issue mentions the pre/post measurement structure or recommends difference-in-differences/change-score modeling for the lab experiment; the LLM critiques center on messaging validity, identification threats, and reporting/measurement concerns elsewhere.
📋 Unmatched LLM Issues (8 not in human critique)
L1: Misperceptions measured relative to V-Dem treated as ground truth needs epistemic nuance.
Why extra: Human critiques did not raise concerns about using expert-coded indices as the benchmark for ‘misperceptions’.
L3: Partisan branding/opposition-party entanglement confounds ‘non-partisan information’ interpretation.
Why extra: Humans did not flag partisan delivery/branding as a central threat (they focused more on spillovers, controls, and construct validity).
L4: 2SLS/compliance estimand: exclusion restriction concerns because assignment affects multiple channels beyond conversations.
Why extra: Humans did not discuss IV exclusion restrictions or multi-channel violations for the field compliance analysis.
L7: Data integrity concerns: turnout values >1 and documentation inconsistencies for the online turnout variable.
Why extra: Human notes did not mention potential data coding/validation issues.
L8: Placebo messages contain substantive content, potentially muddying the placebo/demand interpretation.
Why extra: Humans did not critique placebo construction or experimenter-demand isolation.
L9: Heterogeneity analysis uses many median splits/subgroups; potential multiple-testing and preregistration clarity needed.
Why extra: Humans did not raise multiplicity/heterogeneity specification concerns.
L10: Persistence claims tied to 2024 municipal election need stronger alternative-explanations discussion.
Why extra: Humans did not focus on longer-run/persistence identification threats across elections.
L12: Non-partisan narrative vs partisan implementation needs transparency.
Why extra: Not explicitly raised by humans; overlaps somewhat with L3 but remains distinct from the human-listed issues.
📄 View full human critique

Optional but important Sensitivity to controls: [Eval. manager, also relates to E1 critique] The field experiment’s power depends strongly on including the value of the prior election in the regression analyses. In Table 2 columns 1-2, the standard error declines by half when the prior election result is included in the intention-to-treat (ITT) estimate for the field experiment. Without including it, the estimate is too noisy to be considered “statistically significant.” [This should be noticed and emphasized.] Spillovers attenuate estimates (~lower bounds): [E2] “adjacent untreated neighborhoods may have been indirectly exposed to treatment messages via word-of-mouth or sharing of pamphlets. If spillovers occurred, the estimated treatment effects could be attenuated or biased.” Unsure “These framings are substantively compelling and contextually salient in Turkey, but they do not isolate support for democracy or media freedom as normative commitments.” “Demand for democracy” wording is too strong; their results may reflect the specific salience of (natural disasters) and corruption, particularly in light of a recent earthquake with widespread reports of harm linked to corruption. This may limit the external generalizability. DR: Unsure because I think the authors are fairly cautious in their statements, and their interpretation seems to be largely about understanding the instrumental link between democracy and good outcomes E1: “why don’t the authors use the difference between post- and pre-treatment values instead of only the post-treatment values” [in the lab experiment]?

🤖 View all GPT key issues (12)

1. The paper operationalizes “misperceptions” by comparing respondents’ beliefs to V-Dem indices treated as ground truth; V-Dem is expert-coded (not citizens’ lived experience) and not “official data,” so the epistemic basis for labeling beliefs as misperceived needs more nuance.

2. Treatment scripts/pamphlets present causal claims (democracy → fewer disaster deaths; media independence → less corruption) as near-laws, but the cited evidence is largely observational/context-dependent; this risks conflating persuasion by normative/valence messaging with “accurate information correction.”

3. Field implementation is intertwined with opposition parties and includes explicitly partisan branding in some arms (especially the basic treatment), complicating interpretation that effects reflect non-partisan information rather than standard partisan persuasion.

4. The main field estimand is a 2SLS effect of “completed conversations” instrumented by assignment; exclusion restrictions are not fully defended because assignment also changes pamphlet exposure, canvasser presence/intensity, and possibly other neighborhood-level channels.

5. Potential interference/spillovers across neighborhoods (social discussion, adjacency, shared spaces) are plausible in a city setting; the design and analysis largely assume SUTVA without bounding or testing spillovers.

6. External validity is limited: the field experiment is in a single city/province with selective neighborhood inclusion, and the online sample is Facebook-ad recruited with non-representative composition; generalization to national electorates or different authoritarian contexts is uncertain.

7. Administrative turnout values exceeding 1 in some ballot boxes (max >1) and documentation inconsistencies about the online “turnout” variable require explicit reconciliation to ensure data integrity and interpretability.

8. The placebo messages still contain substantive propositions (“some people think … has worsened”), so the placebo may not cleanly isolate experimenter demand; this could attenuate differences and complicate interpretation.

9. Heterogeneity analyses rely heavily on median splits and multiple subgroup comparisons; if not fully pre-registered, multiple-testing adjustments or a clearer separation of confirmatory vs exploratory analyses are needed.

10. Persistence claims using the 2024 municipal election are suggestive but need stronger discussion of alternative explanations (subsequent campaigns, compositional/administrative changes, ballot-box mapping stability across elections).

11. The paper sometimes slides from compliance-adjusted per-contact effects to broad claims about the impact of “accurate information campaigns”; clearer separation of reduced-form assignment effects, LATE, and mechanism interpretation would improve causal communication.

12. The “non-partisan approach” narrative is somewhat inconsistent with the realized partisan delivery; clarifying what was attempted vs what was implemented would improve transparency.


Adena_and_Hager_2024

Coda title: Does online fundraising increase charitable giving? A nationwide field experiment on Facebook

Coverage: 92% | Weighted Coverage: 63% | Precision: 67%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 86 85
Advancing knowledge 85 84
Claims & evidence 88 82
Global relevance 78 72
Logic & communication 87 90
Methods 86 86
Open science 62 54
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

The LLM covers nearly all major human concerns (windowing/seasonality, spillovers/interference, winsorization/ROI sensitivity, competitor effects, and HTE multiplicity), but misses the specific methodological critique about balance-table hypothesis testing and adds several extra issues (mechanisms, factorial power, alliance imbalance, reproducibility) not present in the human list.

Several matches are conceptually aligned but differ in emphasis: humans repeatedly foreground preregistration adherence and specific missing inferential elements (confidence intervals, F-tests), while the LLM often reframes these as sensitivity/assumption-dependence or multiplicity/validation problems. The strongest alignments are spillovers/interference (H11-L1), tail handling/winsorization (H1/H10-L4), and windowing/seasonality (H3-L5). The weakest alignments are the preregistered causal-trees omission (H5) and the missing HTE F-tests (H9), which the LLM addresses only indirectly via general heterogeneity/multiplicity concerns.

✅ Matched Issues (11 human issues with LLM coverage)

H1: Winsorization/transforming outcome affects inference ↔︎ L4 75%
Both note that winsorizing PLZ-day donations at €1,000 is consequential in a heavy-tailed setting and can change estimated effects and inference.
👤 Human expert critique text
Transforming the outcome variable — winsorizing location-days with gifts of over 1000 euros.
🤖 Matched LLM issue text

L4: Winsorization and heavy tails: PLZ-day winsorization at €1,000 may affect inference and ROI in a heavy-tailed donation distribution; robustness is mentioned but should be more fully quantified (e.g., alternative cutoffs, quantile/Poisson models).

Show detailed discussion
The human critique frames winsorization as a potential deviation from the pre-registered plan and a key researcher degree of freedom (transforming the outcome). The LLM focuses less on preregistration per se and more on the statistical consequences of a €1,000 cap under heavy tails, calling for fuller robustness checks. Both imply that tail-handling choices can materially move treatment effects and downstream conclusions. The overlap is strong on the transformation concern, but the LLM does not explicitly emphasize preregistration deviations, lowering the match quality.
H2: Per-capita normalization and estimand interpretation ↔︎ L2 55%
Both raise concerns about interpreting effects when outcomes are scaled by population/area and when exposure varies within assigned areas.
🤖 Matched LLM issue text

L2: Treatment intensity vs assignment: Postal-code assignment does not ensure uniform exposure; Facebook’s delivery algorithm can create endogenous within-area intensity and composition, making per-capita effects harder to interpret as “advertising causes X euros per million residents.”

Show detailed discussion
The human critique flags that donations are normalized by zip-code population, which changes interpretation and may embed assumptions about linear scaling with residents. The LLM raises a related but broader point: assignment at the postal-code level does not guarantee uniform exposure because platform delivery creates endogenous intensity, making “€ per million residents” interpretations fragile. Both are ultimately about the estimand implied by aggregation/scaling choices. The LLM does not directly critique the population-normalization choice as such, so the match is partial.
H3: Outcome window length and seasonal dynamics ↔︎ L5 80%
Both question how the chosen post-treatment measurement window affects conclusions, especially given end-of-year seasonality.
👤 Human expert critique text
Transforming the outcome variable — winsorizing location-days with gifts of over 1000 euros.
🤖 Matched LLM issue text

L5: Post-period choice and seasonality: Ending outcomes at Dec 31 is substantively motivated but couples treatment persistence to extreme seasonal dynamics; more explicit dynamic/event-study estimates would better separate persistence from seasonal noise.

Show detailed discussion
The human issue highlights the degree of freedom in selecting the measurement window length, which can materially affect estimated persistence and ROI. The LLM specifically critiques the Dec 31 endpoint as confounding persistence with extreme seasonal donation dynamics and recommends more explicit dynamic/event-study modeling. Both imply that results may be sensitive to windowing choices. The LLM adds a concrete seasonal mechanism and a methodological fix, making the match strong.
H4: Competing-charity effects and crowd-out inference ↔︎ L6, L11 85%
Both address analyzing spillovers to competing charities/crowding out and the limits of available competitor data for strong conclusions.
👤 Human expert critique text
Investigating the effects on competing charities (additional dimension of analysis) Interpretation/statistical inference for heterogeneous treatment effects:
🤖 Matched LLM issue text

L6: Competitor/crowding-out measurement mismatch: Competitor datasets are incomplete (often online-only), overlap across sources is unresolved, and sector-wide giving is unobserved, limiting the strength of conclusions about total charitable-sector effects.

L11: Profitability depends on assumptions: Long-run ROI uses an assumed lifetime-value multiplier and implicitly assumes competitor crowd-out does not offset social impact; uncertainty propagation around ROI should be more explicit.

Show detailed discussion
The human critique notes that the paper adds an extra analytical dimension by investigating competing charities, implicitly raising concerns about scope creep or interpretive overreach. The LLM similarly treats competitor/crowding-out conclusions as constrained by incomplete and mismatched datasets and connects this to social-impact interpretation and ROI assumptions. Both are aligned that competitor effects are hard to measure well and conclusions may be too strong. The LLM is more specific about data limitations, while the human critique is more about the analytic expansion/degree of freedom.
H5: Planned causal-tree heterogeneity not implemented ↔︎ L10 45%
The human critique notes a preregistered causal-tree approach was not carried out; the LLM flags heterogeneity analyses as exploratory and at risk of false positives.
👤 Human expert critique text
Preregistered, not done: “We also plan to use causal trees to search for meaningful heterogeneity in a structured way using all available pre-treatment covariates…”
🤖 Matched LLM issue text

L10: Heterogeneity/multiplicity risk: Many subgroup interactions are reported with strong managerial recommendations, but these appear exploratory and risk false positives without multiplicity control or holdout validation.

Show detailed discussion
The human issue is a concrete preregistration deviation: a promised structured heterogeneity search via causal trees was not done. The LLM does not mention this specific omission, but it does critique the heterogeneity section as exploratory with multiplicity risk and calls for stronger validation, which is directionally similar to why a structured approach was preregistered. Both are concerned about how heterogeneity is discovered/validated. Because the LLM does not discuss adherence to the preregistered causal-tree plan, the match remains moderate.
H6: Overstated conclusions without uncertainty intervals ↔︎ L3, L11 70%
Both argue interpretations (especially profitability/long-run effects) should be more cautious and should foreground uncertainty rather than point estimates.
👤 Human expert critique text
Capitan: “interpreting covariate balance tables as a test of randomization is conceptually incorrect … report standardized differences in covariates without relying on hypothesis tests”
🤖 Matched LLM issue text

L3: Revenue effects are borderline: Donation revenue results are weaker than frequency (e.g., long-term p<0.1), so ROI/profitability conclusions are sensitive to specification, tail behavior, and aggregation choices.

L11: Profitability depends on assumptions: Long-run ROI uses an assumed lifetime-value multiplier and implicitly assumes competitor crowd-out does not offset social impact; uncertainty propagation around ROI should be more explicit.

Show detailed discussion
The human critique explicitly complains about missing or underemphasized confidence intervals and uncertainty bounds in narrative discussion, leading to overselling. The LLM similarly calls the revenue effects borderline and emphasizes sensitivity of ROI/profitability to assumptions, urging explicit uncertainty propagation for ROI. Both converge on the need to present uncertainty prominently when effects are weak or assumption-dependent. The LLM frames uncertainty partly through robustness/sensitivity rather than the specific reporting failure of CIs, so it is not a perfect match.
H7: HTE sign-flips suggest noise over theory ↔︎ L10 55%
Both question heterogeneous treatment effects that look unstable and may reflect noise rather than meaningful subgroup differences.
👤 Human expert critique text
“The coefficients with the biggest high-low differences all ended up estimating positive treatment effects for one half of the data and negative treatment effects for the other half, which helped the difference become big enough for statistical significance. But it’s hard for me to imagine that negative treatment effects are genuine, rather than mere noise, because I don’t see a good theory for why Save the Children ads would cause donations to decrease among half the population.”
🤖 Matched LLM issue text

L10: Heterogeneity/multiplicity risk: Many subgroup interactions are reported with strong managerial recommendations, but these appear exploratory and risk false positives without multiplicity control or holdout validation.

Show detailed discussion
The human critique focuses on a substantive interpretability red flag: large high-low subgroup differences arise because one subgroup is estimated positive and the other negative, which seems theoretically implausible and likely noise. The LLM does not single out sign reversals, but it broadly warns that many subgroup interactions are exploratory and prone to false positives without controls/validation. Both imply that the heterogeneity patterns may not be reliable enough for strong managerial recommendations. The human critique is more theory-driven (implausible negative effects), while the LLM is more statistical/multiplicity-driven.
H8: Multiple testing risk in HTE exploration ↔︎ L10 80%
Both highlight multiplicity/multiple-hypothesis-testing risks in reported heterogeneous treatment effects and suggest more structured/validated approaches.
🤖 Matched LLM issue text

L10: Heterogeneity/multiplicity risk: Many subgroup interactions are reported with strong managerial recommendations, but these appear exploratory and risk false positives without multiplicity control or holdout validation.

Show detailed discussion
The human critique explicitly flags MHT bias in HTE testing and notes that the preregistered causal-tree approach might have mitigated this problem. The LLM mirrors this by warning that many subgroup interactions look exploratory and could be false positives without multiplicity control or holdout validation. Both therefore converge on the same statistical threat: overinterpreting selected significant subgroup findings. The LLM adds concrete remedies (multiplicity control/validation) whereas the human critique points to the (unused) causal-tree plan.
H9: Insufficient formal testing for HTE claims ↔︎ L10 40%
The human critique notes missing omnibus tests (e.g., F-tests) for HTE significance; the LLM more generally critiques HTE reporting as exploratory without rigorous error control.
🤖 Matched LLM issue text

L10: Heterogeneity/multiplicity risk: Many subgroup interactions are reported with strong managerial recommendations, but these appear exploratory and risk false positives without multiplicity control or holdout validation.

Show detailed discussion
The human issue is specific: the authors report HTEs but do not conduct appropriate joint tests (e.g., F-tests) to determine which are statistically significant. The LLM does not mention F-tests, but its multiplicity/false-positive critique implies that the inferential framework for HTE claims is weak. Both point to shortcomings in statistical inference supporting heterogeneity conclusions. Because the LLM’s point is broader and not about the specific missing tests, the match is limited.
H10: Profitability/ROI highly sensitive to trimming ↔︎ L4, L3, L11 85%
Both argue that winsorization/trimming in a heavy-tailed outcome can materially alter ROI and profitability conclusions, which are already sensitive to assumptions/specifications.
👤 Human expert critique text
Profitability: Trimming (Winsorizing) particularly problematic for these estimates
🤖 Matched LLM issue text

L4: Winsorization and heavy tails: PLZ-day winsorization at €1,000 may affect inference and ROI in a heavy-tailed donation distribution; robustness is mentioned but should be more fully quantified (e.g., alternative cutoffs, quantile/Poisson models).

L3: Revenue effects are borderline: Donation revenue results are weaker than frequency (e.g., long-term p<0.1), so ROI/profitability conclusions are sensitive to specification, tail behavior, and aggregation choices.

L11: Profitability depends on assumptions: Long-run ROI uses an assumed lifetime-value multiplier and implicitly assumes competitor crowd-out does not offset social impact; uncertainty propagation around ROI should be more explicit.

Show detailed discussion
The human critique stresses that trimming/winsorizing is especially problematic for profitability estimates because ROI is driven by tail donations. The LLM similarly emphasizes that the €1,000 cap may strongly affect inference under heavy tails and separately notes that revenue/ROI conclusions are borderline and assumption-dependent (lifetime value, specifications). Both therefore see ROI as fragile to tail-handling and modeling choices. The LLM expands beyond trimming to include broader ROI-assumption uncertainty, but the shared core concern is strong.
H11: Spillovers/interference rely on strong assumptions ↔︎ L1 90%
Both critique the spillover/interference analysis as relying on arbitrary modeling/measurement assumptions and producing implausibly large indirect effects relative to direct effects.
👤 Human expert critique text
Estimating and Interpreting spillovers: “spillovers shows an indirect effect (the effect of treating neighboring postal codes) that is ten times higher than the direct effect (the effect of treating one’s own postal code).”
🤖 Matched LLM issue text

L1: Interference/SUTVA: Geo-randomized Facebook targeting plausibly induces cross-postal-code spillovers and location misclassification; the paper’s neighbor-share regression is suggestive but not a randomized test, so the magnitude/direction of interference remains uncertain and complicates causal estimands.

Show detailed discussion
The human critique highlights that estimated spillovers are an order of magnitude larger than direct effects and depend on arbitrary distance cutoffs and functional-form assumptions, raising interpretability concerns. The LLM likewise frames this as an interference/SUTVA problem: neighbor-share regressions are suggestive rather than randomized tests, and cross-area spillovers and misclassification complicate the causal estimand. Both agree that spillovers are not cleanly identified and should be interpreted cautiously. The LLM adds the causal-inference framing (SUTVA/estimands), while the human critique focuses more on the magnitude and arbitrariness of specific modeling choices.
⚠️ Unmatched Human Issues (1 not captured by LLM)
H12: Covariate balance tables should not be used as randomization tests; prefer standardized differences over hypothesis tests.
Why missed: No LLM issue explicitly critiques balance-table hypothesis testing or recommends standardized differences; related concerns (e.g., pretreatment imbalance) are mentioned only in a different context (LLM L7) and not framed as a conceptual error about testing randomization.
📋 Unmatched LLM Issues (4 not in human critique)
L7: Pretreatment imbalance and residual confounding for 23-charity alliance outcomes despite DiD adjustments.
Why extra: The human critiques provided do not specifically discuss the 23-charity alliance imbalance/DiD identification threat.
L8: Mechanism claims (‘power of asking’) exceed what is directly identified/tested.
Why extra: Humans focus on uncertainty, HTE inference, spillovers, and preregistration/transformations, not on mechanism identification.
L9: Factorial design content differences are mostly underpowered/not distinguishable; avoid equivalence-style rhetoric.
Why extra: The human list does not mention factorial/video-type comparisons or equivalence vs low power.
L12: Replication constraints from proprietary/aggregated data and unclear code/data availability.
Why extra: Human issues emphasize preregistration deviations and inference/interpretation, not reproducibility constraints.
📄 View full human critique

Optional but important: Deviations from the pre-registered analysis plan or degrees of freedom, including: transforming the outcome variable — winsorizing location-days with gifts of over 1000 euros; donations normalized by the population of each zip code; length of measurement window; investigating the effects on competing charities (additional dimension of analysis). Preregistered, not done: “We also plan to use causal trees to search for meaningful heterogeneity in a structured way using all available pre-treatment covariates…” Interpretation of main results: lack of emphasis on uncertainty bounds
Reiley: “the authors oversell some of their results, mainly by failing to provide confidence intervals rather than point estimates in some of their discussions.” Capitan: “The campaign appears profitable by point estimate, but the confidence intervals and strong behavioral assumptions warrant a more cautious interpretation. Highlighting this uncertainty explicitly…” Interpretation/statistical inference for heterogeneous treatment effects: “The coefficients with the biggest high-low differences all ended up estimating positive treatment effects for one half of the data and negative treatment effects for the other half, which helped the difference become big enough for statistical significance. But it’s hard for me to imagine that negative treatment effects are genuine, rather than mere noise, because I don’t see a good theory for why Save the Children ads would cause donations to decrease among half the population” HTE tests subject to MHT bias (the ‘causal trees’ approach they proposed might have dealt with this) Missing statistical tests for HTEs: “the authors did not conduct F-tests to conclude which of these HTEs were statistically significant” Profitability: Trimming (Winsorizing) particularly problematic for these estimates Estimating and Interpreting spillovers: “spillovers shows an indirect effect (the effect of treating neighboring postal codes) that is ten times higher than the direct effect (the effect of treating one’s own postal code).” spillover result relies on a number of arbitrary assumptions, such as the distance over which spillovers can versus cannot occur, and the functional form through which we measure these spillover effects. Possibly relevant but maybe controversial Capitan: “interpreting covariate balance tables as a test of randomization is conceptually incorrect … report standardized differences in covariates without relying on hypothesis tests”

🤖 View all GPT key issues (12)

1. Interference/SUTVA: Geo-randomized Facebook targeting plausibly induces cross-postal-code spillovers and location misclassification; the paper’s neighbor-share regression is suggestive but not a randomized test, so the magnitude/direction of interference remains uncertain and complicates causal estimands.

2. Treatment intensity vs assignment: Postal-code assignment does not ensure uniform exposure; Facebook’s delivery algorithm can create endogenous within-area intensity and composition, making per-capita effects harder to interpret as “advertising causes X euros per million residents.”

3. Revenue effects are borderline: Donation revenue results are weaker than frequency (e.g., long-term p<0.1), so ROI/profitability conclusions are sensitive to specification, tail behavior, and aggregation choices.

4. Winsorization and heavy tails: PLZ-day winsorization at €1,000 may affect inference and ROI in a heavy-tailed donation distribution; robustness is mentioned but should be more fully quantified (e.g., alternative cutoffs, quantile/Poisson models).

5. Post-period choice and seasonality: Ending outcomes at Dec 31 is substantively motivated but couples treatment persistence to extreme seasonal dynamics; more explicit dynamic/event-study estimates would better separate persistence from seasonal noise.

6. Competitor/crowding-out measurement mismatch: Competitor datasets are incomplete (often online-only), overlap across sources is unresolved, and sector-wide giving is unobserved, limiting the strength of conclusions about total charitable-sector effects.

7. Pretreatment imbalance for the 23-charity alliance outcomes: The need for DiD/lagged-outcome adjustment is appropriate, but residual confounding and differing seasonal patterns across areas remain possible; inference is also relatively weak (often p<0.1).

8. Mechanism claims exceed direct tests: The conclusion that effects primarily reflect the “power of asking” is plausible but not directly isolated from alternative mechanisms (salience, trust, concurrent campaigns/news shocks).

9. Factorial design interpretation: Differences across video type and impression allocation are mostly not statistically distinguishable; emphasizing “no matter the content, online fundraising works” should be framed as limited power to detect meaningful differences rather than equivalence.

10. Heterogeneity/multiplicity risk: Many subgroup interactions are reported with strong managerial recommendations, but these appear exploratory and risk false positives without multiplicity control or holdout validation.

11. Profitability depends on assumptions: Long-run ROI uses an assumed lifetime-value multiplier and implicitly assumes competitor crowd-out does not offset social impact; uncertainty propagation around ROI should be more explicit.

12. Replication constraints: Although preregistered and well-documented, proprietary and aggregated data plus unclear code availability limit independent reproducibility of key transformations (aggregation, neighbor calculations, data merges).


Benabou_et_al._2023

Coda title: Willful Ignorance and Moral Behavior

Coverage: 50% | Weighted Coverage: 28% | Precision: 20%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 86 84
Advancing knowledge 85 83
Claims & evidence 88 82
Global relevance 66 58
Logic & communication 88 86
Methods 84 78
Open science 74 64
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

Only one of the two human expert concerns is even partially captured by the LLM list (interpretation of null/non-significant results). The core methodological point about Likert ceiling effects and appropriate censored-outcome modeling is entirely absent from the LLM issues.

The overlap is thematic rather than technical: the LLM warns against overly strong ‘no effect/no structure’ conclusions from weak evidence, aligning with H2’s ‘absence of evidence is not evidence of absence’ point. However, the human critique’s key technical mechanism (ceiling effects causing underestimation and underpower; Tobit with IPW) is not reflected in any LLM issue. Most LLM issues instead focus on construct validity and interpretation of cross-task moral preference measures, suggesting the LLM critique targets different sections/questions than the human reviewer.

✅ Matched Issues (1 human issue with LLM coverage)

H2: Overstating null differences from non-significance ↔︎ L2, L5 55%
H2 criticizes interpreting a non-significant difference as evidence of no difference, especially under low power. L2 similarly argues the paper’s headline conclusion is too strong given the evidence, and L5 cautions against strong interpretation of null effects in a particular manipulation.
👤 Human expert critique text
Quote from authors (to avoid straw-manning) “However, we do not find that belief updating in response to information differs significantly between information avoiders and seekers (average belief updating: 0.15 vs. 0.20 points, p = 0.55). Hence, differences in belief updating are unlikely to explain the treatment effect heterogeneity between information avoiders and seekers in our experiment.”
🤖 Matched LLM issue text

L2: The headline conclusion of “no stable individual preference types” is stronger than what low pairwise correlations among binary indicators can establish; latent-variable models (multidimensional factor/IRT, mixture models) could test for structured heterogeneity more directly.

L5: The “real stakes” manipulation in the trolley is probabilistic (10% implementation); while a robustness exercise shows similar probabilities matter in SVO tradeoffs, moral responsibility may respond differently, so null effects should be interpreted cautiously.

Show detailed discussion
The human expert focuses on a specific statistical inference problem: the authors treat p=0.55 as implying “differences are unlikely,” despite a numerically meaningful gap (0.15 vs 0.20) and likely low power due to ceiling effects. The LLM raises an analogous interpretive concern at a more conceptual level: weak/low correlations (or null effects) do not justify strong claims like “no stable types,” and null results should be stated cautiously. Unlike the human critique, the LLM does not discuss power, effect sizes, or ceiling-induced censoring; it mainly emphasizes that the evidentiary standard for strong ‘no difference/no structure’ conclusions is not met. L5 is only partially related because it is about a particular manipulation’s null effect and external interpretation, not about underpowered tests or censored outcomes.
⚠️ Unmatched Human Issues (1 not captured by LLM)
H1: Ceiling effects in Likert beliefs; suggest Tobit/censoring model (possibly with IPW).
Why missed: No LLM issue addresses censoring/ceiling effects in Likert-scale outcomes, nor proposes Tobit or similar models to correct underestimated treatment effects due to bounded scales.
📋 Unmatched LLM Issues (8 not in human critique)
L1: Construct validity: ‘deontological’ option differs across games; multidimensionality.
Why extra: Human critiques focus on Likert belief updating measurement/statistical inference, not on cross-task construct validity of moral games.
L3: Pearson correlations on dichotomous choices; attenuation/measurement error and alternative estimators.
Why extra: Human issues concern ceiling/censoring in Likert belief measures and interpretation of non-significance, not correlation estimators for binary indicators.
L4: Trolley (SAL) interpretation confounded by defaults/omission/fairness/indirect mapping.
Why extra: Not discussed by the human expert, who instead critiques belief-updating analysis.
L6: Common consequence frame across tasks may drive apparent preferences.
Why extra: Outside the human critique scope (which is about belief updating statistics).
L7: External validity limits from online mostly-student sample.
Why extra: No corresponding human issue; human critique is methodological/statistical within-sample.
L8: Some games may measure compliance/experimenter demand rather than moral philosophy.
Why extra: Human critique does not address demand/compliance confounds.
L9: Multiple-hypothesis testing; narrative may overweight small significant links.
Why extra: Human critique is about interpreting a non-significant difference and ceiling effects, not multiple-comparisons control.
L10: Open-science clarity: provide permanent links to data/code/materials.
Why extra: Not raised in the human critique.
📄 View full human critique

Optional but important Ceiling effects with Likert scale measures [Treich] Assuming that the authors evaluated the beliefs twice, I might have some concerns here. One issue is that most of the participants on this question are distributed at the highest level of the Likert scale (about 70% of the participants report the maximum value looking at Figure A5). When assessing a difference between treatment groups or a treatment effect, ceiling effects are important as they can lead to considerably underestimate the difference. I would suggest using here a Tobit model to take this issue into account. (I assume that there is no issue with combining it with inverse probability weighting.) 2. Reporting on lack of difference in beliefs [Treich] Another related issue concerns the difference in beliefs. The authors write that the difference in beliefs is 0.15 for information avoiders and 0.20 for information seekers (page 18). However, note that the difference is non-negligible (it is about 33% larger). The lack of significance for the difference does not mean that there is no difference (well-known motto: the absence of evidence is not the evidence of absence). This is particularly true in the case of underpowered tests. And as I mentioned above, this is likely to be the case here because of the ceiling effects. If we look at Table A9, we see that the average beliefs are 4.69 for information avoiders and 4.59 for information seekers. It seems that information seekers have more room to update their beliefs on this Likert scale than information avoiders (because of the ceiling effect). Quote from authors (to avoid straw-manning) “However, we do not find that belief updating in response to information differs significantly between information avoiders and seekers (average belief updating: 0.15 vs. 0.20 points, p = 0.55). Hence, differences in belief updating are unlikely to explain the treatment effect heterogeneity between information avoiders and seekers in our experiment.”

🤖 View all GPT key issues (10)

1. Construct validity: the “deontological” option differs across games (harm/omission in trolley, honesty in lying, anti-corruption in bribe, purity in repugnant statement, rule-following, expressive/non-pivotal voting), so weak cross-task correlations may reflect multidimensional moral domains rather than absence of stable traits.

2. The headline conclusion of “no stable individual preference types” is stronger than what low pairwise correlations among binary indicators can establish; latent-variable models (multidimensional factor/IRT, mixture models) could test for structured heterogeneity more directly.

3. Reliance on Pearson correlations of dichotomous choices risks attenuation from measurement error and base-rate differences; reliability-adjusted associations or alternative estimators would better quantify “consistency.”

4. SAL trolley interpretation is confounded by default effects, omission/commission, reallocation of promised aid, fairness across regions, and the indirect nature of “statistical lives via donations,” which may not cleanly map to standard trolley ‘instrumental harm’ constructs.

5. The “real stakes” manipulation in the trolley is probabilistic (10% implementation); while a robustness exercise shows similar probabilities matter in SVO tradeoffs, moral responsibility may respond differently, so null effects should be interpreted cautiously.

6. Many EVM tasks share a common consequence frame (15€ charity donation vs 2€ to another subject); stable preferences about charity vs direct transfers, or about ‘experimenter-intended morality,’ could interact with means manipulations.

7. External validity is limited by an online, mostly-student lab pool; the extent to which observed context-dependence generalizes to high-stakes institutional settings (triage, bribery, AI policy) is uncertain.

8. Some games (e.g., group donation with non-pivotality, rule-following task) may tap compliance/norm sensitivity toward the experiment itself rather than moral philosophy distinctions, complicating interpretation as deontology vs consequentialism.

9. Multiple-hypothesis considerations are noted for some correlations, but the paper’s narrative may still overweight small significant links (e.g., trolley–lying r≈0.16) without a unified error-rate control for exploratory patterns.

10. Open-science/replication usability would be clearer if the manuscript explicitly provided permanent links to anonymized data, code, and experimental materials beyond the preregistration reference.


Bilal_and_Kaenzig_2024

Coda title: The Macroeconomic Impact of Climate Change: Global vs. Local Temperature

Coverage: 100% | Weighted Coverage: 76% | Precision: 55%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 90 92
Advancing knowledge 95 95
Claims & evidence 89 88
Global relevance 93 97
Logic & communication 92 92
Methods 85 82
Open science 66 62
Real-world relevance 90
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

All human expert concerns are covered to a meaningful degree (full coverage), especially around finite-sample inference, horizon length, lag/dynamics sensitivity, and extrapolation/nonlinearity. However, several LLM-raised concerns (identification/common shocks, mediation via extremes, panel time-FE confounding, broader uncertainty propagation, and replicability) are not present in the human critique, reducing precision.

Scoring rule applied: a human issue is counted as covered if at least one matched LLM issue has match_quality ≥ 30. Precision counts an LLM issue as matching if it appears in any matched_pairs list with match_quality ≥ 30. Some conceptual adjacency (e.g., H1 vs L1) was not treated as a match because the core critique differs (inference validity vs causal confounding).

✅ Matched Issues (5 human issues with LLM coverage)

H1: Finite-sample inference with many parameters ↔︎ L5 85%
Both critiques worry that short time series with relatively many parameters can make standard (asymptotic) confidence intervals unreliable. The LLM frames this as long-horizon LP finite-sample/SE bias issues, aligning closely with the human’s asymptotic-validity concern.
👤 Human expert critique text
Authors responded (September 2025) by introducing the BU dataset, which spans 160 years. And note they performed inference robustness using bootstrapping techniques in the appendix.
🤖 Matched LLM issue text

L5: Long-horizon LP and finite-sample inference: overlapping cumulative-growth outcomes, persistent regressors, multiple controls/dummies, and 10-year horizons can generate biased standard errors and sensitivity to specification choices despite robustness checks.

Show detailed discussion
The human critique focuses on the ratio of observations to parameters (≈50 observations, ≥7 parameters) and explicitly questions whether asymptotic approximations justify the reported frequentist confidence intervals, recommending simulation-based validation. The LLM similarly flags finite-sample inference problems in long-horizon local projections (overlapping outcomes, persistent regressors, multiple controls) that can bias standard errors and make results specification-sensitive. Both are fundamentally about inference credibility when T is small and the design is complex. The LLM is more specific about LP pathologies (overlap/persistence) rather than recommending Monte Carlo validation, but the underlying inferential risk is the same.
H2: Long-run effects beyond 10 years ↔︎ L5, L3, L9 75%
Both identify that limiting attention to a 10-year horizon may miss important longer-run damage dynamics and can materially change SCC/damage interpretations. The LLM additionally ties this to persistence-to-welfare mapping and the structural SCC translation.
👤 Human expert critique text
[NotebookLM] . The revised paper directly addresses this by converting their estimates to the damage implied by a permanent 1°C rise in temperature, yielding the much larger long-run figure of 22–34% GDP reduction. NotebookLM: Evaluator 2 requested allowing the local temperature model to include non-linear or permanent growth effects for a fair comparison with previous literature. While the authors maintain their main conservative assumption of level effects (not permanent growth effects), they clarify that their local temperature estimates generate a 3% long-run GDP reduction, which is consistent with the “level effects” benchmark of the conventional literature. They explicitly report that imposing non-linear specifications on their shocks did not materially change the core estimates.
🤖 Matched LLM issue text

L5: Long-horizon LP and finite-sample inference: overlapping cumulative-growth outcomes, persistent regressors, multiple controls/dummies, and 10-year horizons can generate biased standard errors and sensitivity to specification choices despite robustness checks.

L3: Magnitude interpretation: the 12% “per 1°C shock” effect partly reflects the post-shock persistence of temperature; the welfare mapping depends critically on how persistence is estimated and translated into a damage kernel.

L9: Structural welfare/SCC translation is model-dependent: representative-agent neoclassical model with damages only through TFP omits adaptation, sectoral/region heterogeneity, risk/uncertainty, and endogenous mitigation; the functional-form restriction for the damage kernel extrapolates beyond observed horizons.

Show detailed discussion
The human asks for impacts beyond 10 years and an SCC robustness check allowing longer-run effects, motivated by concern that damages may not mean-revert quickly. The LLM’s L5 directly echoes the horizon/inference difficulty at 10-year horizons and implies those issues only intensify when extending horizons. L3 relates because translating an impulse response into welfare damages depends critically on estimated temperature persistence and the implied damage kernel—exactly the long-run mapping the human wants explored. L9 overlaps more loosely: it highlights that SCC/welfare translation depends on a structural model and kernel assumptions, which become more consequential as one pushes to longer-run effects beyond observed horizons.
H3: Lag length / dynamics specification sensitivity ↔︎ L6, L5 55%
Both critiques raise sensitivity to dynamic specification choices (lags/dynamics) and imply that results can change depending on how persistence is modeled. The human frames it as adding longer lags for geoscience realism; the LLM frames it as shock-construction and LP/specification dependence.
👤 Human expert critique text
NotebookLM: Evaluator 2 requested allowing the local temperature model to include non-linear or permanent growth effects for a fair comparison with previous literature. While the authors maintain their main conservative assumption of level effects (not permanent growth effects), they clarify that their local temperature estimates generate a 3% long-run GDP reduction, which is consistent with the “level effects” benchmark of the conventional literature. They explicitly report that imposing non-linear specifications on their shocks did not materially change the core estimates.
🤖 Matched LLM issue text

L6: Temperature-shock construction: the Hamilton (2018) multi-step forecast-error approach (choice of h and p) is defensible but not uniquely grounded; shock definitions can embed assumptions about persistence that influence implied damages.

L5: Long-horizon LP and finite-sample inference: overlapping cumulative-growth outcomes, persistent regressors, multiple controls/dummies, and 10-year horizons can generate biased standard errors and sensitivity to specification choices despite robustness checks.

Show detailed discussion
The human issue is motivated by geoscience: longer climate cycles suggest using more lags (especially in a longer historical sample) to capture dynamics appropriately. The LLM’s L6 targets the temperature-shock construction (multi-step forecast errors) where choices like horizon and lag order embed assumptions about persistence, which can affect implied damages. L5 is adjacent because it notes specification sensitivity in long-horizon LPs with persistent regressors and many controls, which includes sensitivity to lag structure. The overlap is substantive but not exact: the human is primarily about physical/climate-cycle realism in lag selection, whereas the LLM emphasizes econometric identification and embedded persistence assumptions.
H4: Nonlinearities and growth vs level effects ↔︎ L4 80%
Both focus on whether linear/level-effect specifications miss important nonlinear or alternative (e.g., growth) damage channels. The LLM frames this as weak identification of nonlinearities/tails from small historical shocks, aligning with the human’s request for nonlinear or permanent-growth modeling for comparability.
👤 Human expert critique text
NotebookLM: Evaluator 2 requested allowing the local temperature model to include non-linear or permanent growth effects for a fair comparison with previous literature. While the authors maintain their main conservative assumption of level effects (not permanent growth effects), they clarify that their local temperature estimates generate a 3% long-run GDP reduction, which is consistent with the “level effects” benchmark of the conventional literature. They explicitly report that imposing non-linear specifications on their shocks did not materially change the core estimates.
🤖 Matched LLM issue text

L4: Limited information about nonlinearities and tails: the historical global shocks are small; extrapolating linearly to multi-degree warming and to potential tipping points/adaptation regimes is weakly identified.

Show detailed discussion
The human critique asks for nonlinear or permanent growth effects in the local temperature model to enable fair comparison with prior literature, even if the authors prefer a conservative level-effect baseline. The LLM similarly argues that historical shocks are small and provide limited information about nonlinearities and tail risks, making linear extrapolation questionable and nonlinear regimes weakly identified. Both are pointing to model-form risk: the chosen functional form can drive conclusions about long-run damages. The LLM leans more toward tail/tipping-point concerns, while the human leans more toward comparability with the existing “growth vs level effects” literature.
H5: Out-of-sample scaling and linear extrapolation ↔︎ L4, L2 85%
Both critiques highlight that scaling results from typical small temperature shocks to a 1°C change (and beyond) is an out-of-sample extrapolation that relies on linearity. The LLM adds that the underlying shock type may also be externally invalid for anthropogenic warming patterns.
👤 Human expert critique text
Out-of-Sample Extrapolation: Evaluator 1 suggested acknowledging how far out of sample the 1°C calibration is, given that shocks are typically around 0.3°C. The 2025 version notes that scaling up to a 1°C increase requires assuming linearity, thereby explicitly addressing this methodological limitation, although they suggest that the consistency of their linear estimates across shock sizes supports the approach.
🤖 Matched LLM issue text

L4: Limited information about nonlinearities and tails: the historical global shocks are small; extrapolating linearly to multi-degree warming and to potential tipping points/adaptation regimes is weakly identified.

L2: External validity: natural variability shocks (ENSO/volcano/other internal variability) may not be representative of damages from sustained anthropogenic warming, due to different spatial/seasonal patterns and joint dynamics of climate variables.

Show detailed discussion
The human explicitly notes that typical shocks are around 0.3°C and that calibrating to 1°C requires a linearity assumption, urging acknowledgement of how far out-of-sample that step is. The LLM’s L4 matches directly by emphasizing limited identification of nonlinearities from small shocks and the weakness of linear extrapolation to larger warming and tail events. The LLM’s L2 overlaps by stressing external validity: natural-variability shocks may not represent sustained anthropogenic warming, so extrapolation is problematic not only in magnitude but also in kind. Compared to the human, the LLM broadens the critique from “linearity scaling” to “representativeness of the underlying shock process.”
📋 Unmatched LLM Issues (5 not in human critique)
L1: Identification/confounding: temperature innovations may correlate with unobserved global shocks
Why extra: The human critique focuses on inference validity and horizon length rather than omitted-variable/common-shock confounding in identifying causal temperature effects.
L7: Extreme-events mediation is suggestive but not causal; construction/aggregation choices matter
Why extra: The human issues do not discuss the paper’s proposed mediation through extremes or concerns about reanalysis indices and bottom-up channel aggregation.
L8: Panel evidence lacks time fixed effects; residual common-factor confounding
Why extra: The human critique does not raise the panel-design/time-FE concern; it concentrates on the main time-series sample size and long-run dynamics.
L10: Uncertainty propagation incomplete (climate IRF, measurement error, external validity)
Why extra: While the human questions frequentist CI validity in small samples, they do not broaden to integrating multiple layers of uncertainty beyond reduced-form estimation.
L11: Replicability/code release and exact data-processing pipeline
Why extra: The human critique does not mention computational reproducibility, code availability, or versioning of data transformations.
📄 View full human critique

Necessary Small sample size needs augmentation and robustness checking

E2: BK’s main results are calculated using a regression of around 50 observations. The regression appears to have at least 7 parameters. …not clear that the asymptotic approximations required to justify their frequentist confidence intervals are appropriate. I would suggest providing simulation evidence to illustrate the validity of their confidence regions in their setting, with time series data, few observations, and lots of parameters. Authors responded (September 2025) by introducing the BU dataset, which spans 160 years. And note they performed inference robustness using bootstrapping techniques in the appendix.

Optional but important Consider longer-term effects

E2: “It would be good to show the impacts for more than 10 years out, and to include a robustness check SCC calculation which allows for longer run effects” [NotebookLM] . The revised paper directly addresses this by converting their estimates to the damage implied by a permanent 1°C rise in temperature, yielding the much larger long-run figure of 22–34% GDP reduction. Sept version “In the longer BU sample, the same temperature shock leads to a peak effect at 18% after 5 years with a 95% confidence interval of (6%, 30%), … In both cases, impacts do not fully mean-revert even after 10 years. “

And it explicitly includes a section on Sensitivity that details how the SCC changes under alternative assumptions. 2. Longer lags to reflect geoscience [NotebookLM]: Evaluator 1 requested increasing the number of lags, especially given the longer cycles noted in geoscience. The newer version notes that in the longer BU sample, they use 4 to 8 lags to capture the complexity of the historical series.

Less important, I guess Non-Linear Effects and Comparability: NotebookLM: Evaluator 2 requested allowing the local temperature model to include non-linear or permanent growth effects for a fair comparison with previous literature. While the authors maintain their main conservative assumption of level effects (not permanent growth effects), they clarify that their local temperature estimates generate a 3% long-run GDP reduction, which is consistent with the “level effects” benchmark of the conventional literature. They explicitly report that imposing non-linear specifications on their shocks did not materially change the core estimates. Out-of-Sample Extrapolation: Evaluator 1 suggested acknowledging how far out of sample the 1°C calibration is, given that shocks are typically around 0.3°C. The 2025 version notes that scaling up to a 1°C increase requires assuming linearity, thereby explicitly addressing this methodological limitation, although they suggest that the consistency of their linear estimates across shock sizes supports the approach.

🤖 View all GPT key issues (11)

1. Time-series identification risk: global temperature innovations may still correlate with unobserved global shocks (policy, finance, technology, geopolitics) that affect GDP at multi-year horizons, and the small annual sample (≈60) makes this hard to rule out conclusively.

2. External validity: natural variability shocks (ENSO/volcano/other internal variability) may not be representative of damages from sustained anthropogenic warming, due to different spatial/seasonal patterns and joint dynamics of climate variables.

3. Magnitude interpretation: the 12% “per 1°C shock” effect partly reflects the post-shock persistence of temperature; the welfare mapping depends critically on how persistence is estimated and translated into a damage kernel.

4. Limited information about nonlinearities and tails: the historical global shocks are small; extrapolating linearly to multi-degree warming and to potential tipping points/adaptation regimes is weakly identified.

5. Long-horizon LP and finite-sample inference: overlapping cumulative-growth outcomes, persistent regressors, multiple controls/dummies, and 10-year horizons can generate biased standard errors and sensitivity to specification choices despite robustness checks.

6. Temperature-shock construction: the Hamilton (2018) multi-step forecast-error approach (choice of h and p) is defensible but not uniquely grounded; shock definitions can embed assumptions about persistence that influence implied damages.

7. Mediation via extreme events is suggestive but not causal: extreme indices rely on reanalysis and threshold/smoothing choices; the bottom-up aggregation assumes separability and correct dynamic persistence and may omit other correlated channels.

8. Panel-country evidence still lacks time fixed effects for global shocks; while global controls and alternative designs are explored, any residual common-factor confounding remains a concern.

9. Structural welfare/SCC translation is model-dependent: representative-agent neoclassical model with damages only through TFP omits adaptation, sectoral/region heterogeneity, risk/uncertainty, and endogenous mitigation; the functional-form restriction for the damage kernel extrapolates beyond observed horizons.

10. Uncertainty propagation: reported confidence intervals mainly reflect reduced-form estimation uncertainty; deeper uncertainties (climate impulse response, external validity, long-horizon extrapolation, measurement error in extremes) are not fully integrated.

11. Replicability: while data sources are public and appendices are detailed, full computational reproducibility likely requires releasing code and exact data-processing pipelines (shocks, weighting, extreme event construction, smoothing choices, and versioning).


Blimpo_and_Castaneda-Dower_2025

Coda title: Asymmetry in Civic Information: An Experiment on Tax Participation among Informal Firms in Togo

Coverage: 100% | Weighted Coverage: 78% | Precision: 58%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 79 71
Advancing knowledge 82 72
Claims & evidence 80 66
Global relevance 88 70
Logic & communication 85 76
Methods 78 64
Open science 62 28
Real-world relevance 90
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

All human expert concerns are captured by at least one LLM issue (often multiple), including the key identification/measurement threats (enumerator-intervention confounding, visit-based mechanism concerns, and receipts/missingness). The LLM additionally flags several important but distinct issues (SE clustering, robustness sensitivity, multiple testing, timeline clarity, and open-science) that were not raised by the human experts, lowering precision.

Match threshold used for coverage: match_quality >= 30. Precision counts an LLM issue as ‘matched’ if it substantively overlaps any human issue at or above that threshold; under this rule, L1, L2, L3, L4, L5, L8, and L10 are matched (7/12). The closest-to-miss match is H2↔︎L4/L10 because the human requests a specific reduced-form test of treatment effects on visits, while the LLM mainly cautions against post-treatment conditioning and notes lack of direct enforcement data.

✅ Matched Issues (3 human issues with LLM coverage)

H1: Enumerator-delivered intervention threatens measurement and IV exclusion ↔︎ L2, L3, L5, L8 85%
Both critiques worry that intervention delivery and data collection are entangled, creating differential reporting/engagement and violating the 2SLS exclusion restriction. The LLM expands this into attention effects and broader reporting noise beyond the IV concern.
👤 Human expert critique text
  1. Same people used as enumerators and intervention — may frustrate causal identification (for outcomes without receipts) [E2 notes] “The intervention was delivered by the same enumerators who conducted the baseline and endline surveys. This is a bit concerning because it introduces the possibility of endline reporting varying because respondents have greater knowledge of and trust in the enumeration team. While this does not concern validated measures like tax payment—which relies on showing a receipt—it does make me a bit more wary of interpreting reports of economic activity. I also think it leads to an exclusion restriction violation for the 2SLS strategy, which currently assumes the treatment can only operate through the channel of improving knowledge. Can the authors test this in some way in the data, i.e., by looking at proxies for familiarity with or trust in the enumerator? This leads me to think the authors might consider dropping the 2SLS and replacing it with correlational analysis between knowledge and tax payments.”
🤖 Matched LLM issue text
L2: The binary “pays taxes” outcome may be differentially misreported after training because the intervention changes respondents’ understanding/classification of payments (TPU vs other taxes/fees/bribes), threatening comparability across arms.

L3: Treatment firms received substantially more contact (training + follow-up consultations) than controls, so effects on economic activity and even tax behavior may partly reflect attention/Hawthorne effects rather than information about the fiscal contract per se.

L5: The 2SLS “knowledge take-up” LATE relies on a constructed endline measure (including an ‘accountability opinion’ proxy) and likely violates exclusion (assignment affects outcomes through channels other than the measured knowledge items), so IV coefficients should not be interpreted straightforwardly as causal effects of ‘knowledge.’

L8: Economic activity measures (open orders; estimated monthly revenue) are potentially noisy and seasonally variable, and the paper does not fully rule out survey-timing or reporting effects tied to differential enumerator engagement across arms.

Show detailed discussion
The human expert’s core concern is that using the same enumerators to deliver treatment and collect endline data could change trust/familiarity and thus self-reports, especially for non-validated outcomes, and that this undermines the 2SLS exclusion restriction (treatment affects outcomes through more than “knowledge”). The LLM covers the same conceptual risks via (L5) explicit exclusion-restriction doubts about the IV/knowledge LATE, and via (L2, L8) differential misreporting and survey/reporting effects driven by engagement. The LLM also adds an attention/Hawthorne framing (L3), generalizing beyond “same people” to “more contact” as a competing channel. The human proposes a concrete diagnostic (proxies for enumerator familiarity/trust, possibly dropping 2SLS), whereas the LLM emphasizes interpretation cautions and alternative channel threats without the specific enumerator-familiarity test.
H2: Tax-official visit channel needs direct causal testing ↔︎ L4, L10 55%
The human asks to test whether treatment directly affects visits; the LLM does not phrase it as that exact test but raises closely related concerns about using visits in mechanism analysis and the lack of direct enforcement data.
👤 Human expert critique text
  1. They should [E2] “test whether the treatment affected the likelihood of a visit directly?” ‘While the treatment itself could affect the likelihood of a visit, the patterns that we observe are still instructive.’ DR:
🤖 Matched LLM issue text
L4: Conditioning on (or splitting by) post-treatment variables such as “visited by a tax official” risks post-treatment selection/collider bias; such results should be framed as descriptive rather than evidence of bargaining-power mechanisms.

L10: The mechanism story (‘empowerment’ reallocates audits from low- to high-revenue firms) is plausible but not directly tested with administrative enforcement data; evidence is indirect and partly based on self-reports and descriptive patterns.

Show detailed discussion
The human critique is narrowly about identification: if the intervention changes the likelihood of a tax-official visit, then interpretations that condition on (or treat visits as exogenous) are questionable, and the authors should test the treatment’s effect on visits. The LLM’s nearest match is (L4), which warns that conditioning on post-treatment “visited by a tax official” can induce post-treatment selection/collider bias, implying that visit-based splits should be descriptive rather than causal. The LLM also echoes the human’s desire for more direct evidence on enforcement patterns via (L10), noting the mechanism is not directly tested with administrative enforcement data and relies on self-reports/descriptive patterns. However, the LLM does not explicitly call for the specific reduced-form test “treatment → visit” as the human does, so the overlap is partial rather than exact. Conceptually, both critiques challenge the credibility of the visit-based mechanism story without stronger causal or administrative evidence.
H3: Receipts missingness and zero-imputation bias tax amounts ↔︎ L1 95%
Both point to the same measurement problem: revenue/payment amounts are observed only for a subset with receipts and missing values are treated as zeros, making results sensitive and potentially biased. The human additionally suggests constructing an alternative indirect outcome for robustness.
👤 Human expert critique text
  1. Same people used as enumerators and intervention — may frustrate causal identification (for outcomes without receipts) E1 “… measures of tax participation and tax payments are self-reported by the MIEs’ owners.” … although they do ask “owners to provide proof of tax payment through receipts … which resulted in a considerable mass of missing values … imputed as zeros…” implying “a noisy measure of the treatment impact on this variable, which the authors [recognize]” consider using survey information to “construct an indirect measure of liabilities paid by the MIEs, as an alternative outcome variable for robustness checks.”
🤖 Matched LLM issue text
L1: Tax revenue amounts are based on a small subset with receipts (≈60/383) and the analysis imputes all missing/unverified amounts as zero, conflating nonpayment with non-verification and making the “revenue increased” claim highly sensitive to missing-data assumptions.
Show detailed discussion
The human expert flags that tax payments are largely self-reported, receipts are missing for many firms, and imputing missing receipts as zero creates noise and possibly bias; they suggest exploring an indirect/constructed liabilities-paid measure using survey information as a robustness outcome. The LLM (L1) closely matches this: it highlights the small verified-receipt subset, the conflation of nonpayment with non-verification, and high sensitivity of the “revenue increased” conclusion to missing-data assumptions. The LLM frames the issue as a threat to the main claim’s stability under alternative missingness assumptions, whereas the human frames it as a limitation of admin validation and proposes an alternative measurement/imputation strategy. Both converge on the need to treat the tax-amount findings as fragile unless backed by stronger handling of missingness or alternative outcome construction.
📋 Unmatched LLM Issues (5 not in human critique)
L6: Standard errors may need clustering by area/enforcement environment
Why extra: The human critiques do not discuss inference/SE specification or clustered sampling implications.
L7: Treatment effects appear specification-sensitive; need robustness/prespecification
Why extra: Humans focus on measurement/identification threats (enumerators, visits, receipts) rather than robustness to controls/specifications.
L9: Heterogeneity analyses risk multiple-testing/selective emphasis
Why extra: Human issues do not mention multiple comparisons or exploratory heterogeneity concerns.
L11: Baseline/timeline clarity about pre-treatment measures could be improved
Why extra: Human critiques do not raise timing/baseline definitional clarity as an issue.
L12: Open-science/replicability elements (data/code/preregistration) not evident
Why extra: Humans do not comment on transparency, preregistration, or replication materials.
📄 View full human critique

Optional but important 1. Same people used as enumerators and intervention — may frustrate causal identification (for outcomes without receipts) [E2 notes] “The intervention was delivered by the same enumerators who conducted the baseline and endline surveys. This is a bit concerning because it introduces the possibility of endline reporting varying because respondents have greater knowledge of and trust in the enumeration team. While this does not concern validated measures like tax payment—which relies on showing a receipt—it does make me a bit more wary of interpreting reports of economic activity. I also think it leads to an exclusion restriction violation for the 2SLS strategy, which currently assumes the treatment can only operate through the channel of improving knowledge. Can the authors test this in some way in the data, i.e., by looking at proxies for familiarity with or trust in the enumerator? This leads me to think the authors might consider dropping the 2SLS and replacing it with correlational analysis between knowledge and tax payments.” 2. They should [E2] “test whether the treatment affected the likelihood of a visit directly?” ‘While the treatment itself could affect the likelihood of a visit, the patterns that we observe are still instructive.’ DR: Unsure 1. Lack of admin data limits accuracy, consider further imputation work E1 “… measures of tax participation and tax payments are self-reported by the MIEs’ owners.” … although they do ask “owners to provide proof of tax payment through receipts … which resulted in a considerable mass of missing values … imputed as zeros…” implying “a noisy measure of the treatment impact on this variable, which the authors [recognize]” consider using survey information to “construct an indirect measure of liabilities paid by the MIEs, as an alternative outcome variable for robustness checks.” DR: But it’s unclear how much can be done here, and they don’t seem to be overclaiming

🤖 View all GPT key issues (12)

1. Tax revenue amounts are based on a small subset with receipts (≈60/383) and the analysis imputes all missing/unverified amounts as zero, conflating nonpayment with non-verification and making the “revenue increased” claim highly sensitive to missing-data assumptions.

2. The binary “pays taxes” outcome may be differentially misreported after training because the intervention changes respondents’ understanding/classification of payments (TPU vs other taxes/fees/bribes), threatening comparability across arms.

3. Treatment firms received substantially more contact (training + follow-up consultations) than controls, so effects on economic activity and even tax behavior may partly reflect attention/Hawthorne effects rather than information about the fiscal contract per se.

4. Conditioning on (or splitting by) post-treatment variables such as “visited by a tax official” risks post-treatment selection/collider bias; such results should be framed as descriptive rather than evidence of bargaining-power mechanisms.

5. The 2SLS “knowledge take-up” LATE relies on a constructed endline measure (including an ‘accountability opinion’ proxy) and likely violates exclusion (assignment affects outcomes through channels other than the measured knowledge items), so IV coefficients should not be interpreted straightforwardly as causal effects of ‘knowledge.’

6. Standard errors are not clearly clustered by enumeration area/neighborhood despite clustered sampling and likely correlated enforcement environments; uncertainty may be understated.

7. Main treatment effect significance varies notably by specification (from 10% without controls to 1% with controls), suggesting sensitivity and the need for robustness checks (e.g., randomization inference, clustered SEs, alternative specifications, or pre-specified primary model).

8. Economic activity measures (open orders; estimated monthly revenue) are potentially noisy and seasonally variable, and the paper does not fully rule out survey-timing or reporting effects tied to differential enumerator engagement across arms.

9. Heterogeneity analyses (revenue quartiles, revenue-to-assets ratio, tax unpredictability) appear exploratory without multiple-testing adjustments or a clear count of attempted interactions, raising the risk of selective emphasis.

10. The mechanism story (‘empowerment’ reallocates audits from low- to high-revenue firms) is plausible but not directly tested with administrative enforcement data; evidence is indirect and partly based on self-reports and descriptive patterns.

11. Timeline/baseline clarity could be improved (sampling-frame survey vs experimental baseline), to confirm all baseline covariates and outcomes used in ANCOVA are strictly pre-treatment and comparable across arms.

12. Replicability/open-science elements (data/code availability, pre-registration, analysis reproducibility) are not evident in the manuscript, limiting external verification and reuse.


Bruers_2021

Coda title: The animal welfare cost of meat: evidence from a survey of hypothetical scenarios among Belgian consumers

Coverage: 50% | Weighted Coverage: 38% | Precision: 17%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 67 46
Advancing knowledge 76 57
Claims & evidence 59 42
Global relevance 75 68
Logic & communication 77 62
Methods 51 35
Open science 55 20
Real-world relevance 80
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

The LLM captures the main substantive human concern about confounding/construct validity in using WTP proxies (especially via its construct-validity framing), but it misses the meta-level human point about whether issues were already acknowledged. Most LLM issues are additional, broader methodological critiques not present in the human notes, lowering precision.

Mapping rule used: a match requires the same underlying threat to validity (e.g., confounding/construct validity), not merely both being ‘limitations.’ H2 aligns most closely with L6 (construct validity) and partially with L2 (scenario aversion/contamination). No LLM issue addresses H1’s evaluation-of-critique framing (acknowledged vs omitted), so it remains unmatched.

✅ Matched Issues (1 human issue with LLM coverage)

H2: Cultured-meat WTP confounds altruistic valuation ↔︎ L6, L2 75%
Both critiques argue the paper’s willingness-to-pay measures may not cleanly capture the intended animal-welfare/altruistic construct because responses can be driven by other attitudes or reactions. The LLM frames this as construct-validity/measurement error; the human critique specifies concrete confounds unique to cultured meat.
👤 Human expert critique text
[marginal altruistic WTP] is approximated through the WTP for the replacement of one kilogram of conventional meat supply with one kilogram of cultured meat. However, it is unclear whether the cultured-meat WTP effectively reflects altruistic preferences. For instance, consumers could expect the quality to differ between the two products, they could be reluctant to try new food (food neophobia), they could hold naturalistic views about food (i.e., a reluctance against food tech), etc. At the altruistic level, individuals could also think that cultured meat would prevent the birth of some animals, which, even if their net lifetime welfare is negative, would not be socially desirable.
🤖 Matched LLM issue text

L6: Construct validity: responses may reflect moral attitudes, social desirability, or misunderstanding rather than welfare judgments; the lack of correlation between representative-agent valuations and MWTP for cruelty-free meat suggests the measures are capturing different constructs or are noisy/unstable.

L2: The welfare-neutral animal is intended to anchor the zero point, but many respondents still exhibit positive valuations for the neutral animal (especially in means), indicating the relative measure may not isolate animal welfare from general aversion to the scenario (identity loss, being non-human, fear/disgust).

Show detailed discussion
The human critique (Espinosa) targets a specific operationalization: using WTP to replace conventional meat with cultured meat as a proxy for marginal altruistic WTP for animal welfare. It argues this proxy is confounded by expected quality differences, food neophobia, naturalness/anti-tech views, and even beliefs about preventing animal births—factors that can move WTP independent of altruistic concern for suffering. The LLM raises a broader construct-validity concern that survey responses may reflect moral attitudes, social desirability, misunderstanding, or general reactions rather than welfare judgments, and (via L2) that valuations can reflect aversion to the scenario rather than the intended attribute. Compared to the human issue, the LLM critique is less tied to the cultured-meat substitution specifically, but it covers the same core risk: the measured WTP may not identify altruistic animal-welfare preferences.
⚠️ Unmatched Human Issues (1 not captured by LLM)
H1: Several stated limitations are already acknowledged by the author, so critiques shouldn’t be framed as omissions/mistakes; at least one critique was genuinely new.
Why missed: The LLM issues focus on methodological/statistical validity and interpretation, not on meta-evaluation of whether limitations were already disclosed in the original paper or whether critiques are ‘omissions’ versus acknowledged caveats.
📋 Unmatched LLM Issues (10 not in human critique)
L1: Strong, unvalidated assumptions linking pill-scenario WTP/WTA to ‘external animal welfare costs’ (separability, δa=1, zero-point meaning).
Why extra: Human critique did not discuss the pill-scenario identification assumptions or comparability/zero-point foundations.
L3: Extreme attrition/selection and confidence-based exclusions leading to tiny effective samples and potential upward bias.
Why extra: Human critique did not address sample attrition, selection, or confidence-exclusion bias.
L4: Ad hoc outlier handling for mean WTA that materially changes headline results.
Why extra: Human critique did not mention outlier rules or robustness of mean estimates.
L5: Unjustified reliance on WTA vs WTP given huge WTA–WTP gap; inconsistent use of mean/median.
Why extra: Human critique did not focus on the WTA–WTP gap or normative justification for choosing WTA.
L7: Weak statistical inference: low power, multiple testing, limited modelling; heterogeneity questions unresolved.
Why extra: Human critique did not raise inference/power/modelling concerns.
L8: Per-kg conversion depends heavily on assumed edible-meat yields; not stress-tested.
Why extra: Human critique did not discuss yield assumptions or per-kg conversion sensitivity.
L9: Limited external validity (Belgian panel) yet broad global/practitioner conclusions are suggested.
Why extra: Human critique did not address population generalizability or cross-country comparability.
L10: Carbon-tax backfire calculation mixes contexts (Belgian welfare costs with US parameters) and under-propagates uncertainty.
Why extra: Human critique did not discuss the policy simulation or cross-context parameter mixing.
L11: Reporting/replicability gaps: missing full wording, scripts, and data/code.
Why extra: Human critique did not mention reproducibility, appendices, or data/code availability.
L12: Overstated interpretation: contingent estimates framed too strongly as likely/robust.
Why extra: Human critique did not focus on rhetorical overclaiming relative to acknowledged uncertainty.
📄 View full human critique

Necessary None

Optional but important Many of the limitations are already acknowledged by the author (in the paper?), and thus not “omissions/mistakes”. At least one issue was not noted in the original paper (which the author agrees with).

Espinosa [marginal altruistic WTP] is approximated through the WTP for the replacement of one kilogram of conventional meat supply with one kilogram of cultured meat. However, it is unclear whether the cultured-meat WTP effectively reflects altruistic preferences. For instance, consumers could expect the quality to differ between the two products, they could be reluctant to try new food (food neophobia), they could hold naturalistic views about food (i.e., a reluctance against food tech), etc. At the altruistic level, individuals could also think that cultured meat would prevent the birth of some animals, which, even if their net lifetime welfare is negative, would not be socially desirable. Authors’ response: …I agree with [this] criticism: there are many confounding factors to measure the marginal WTP using questions about cultivated meat. The same goes for hypothetical questions about e.g. meat from animals that are genetically programmed to not feel any suffering. I don’t know of another method used in the literature that captures the full marginal WTP to avoid all suffering of a farmed animal, without confounding elements. As animals on animal welfare certified farms still experience some non-zero suffering, traditional research methodologies involving WTP for animal-welfare-labelled meat do not capture the full WTP to avoid all suffering, and also involve confounding factors such as the level of trust in the certification scheme.

🤖 View all GPT key issues (12)

1. The identification of “external animal welfare costs” with respondents’ pill-scenario WTP/WTA relies on strong, unvalidated assumptions (additive separability, correct interpretation of the vignette, δa=1 interpersonal/species comparability, and a meaningful zero point tied to non-existence/neutral welfare).

2. The welfare-neutral animal is intended to anchor the zero point, but many respondents still exhibit positive valuations for the neutral animal (especially in means), indicating the relative measure may not isolate animal welfare from general aversion to the scenario (identity loss, being non-human, fear/disgust).

3. Extreme attrition and selection: core median estimates use very small subsamples after exclusions (e.g., N≈57/N≈66), and excluding below-median confidence responses likely biases estimates upward (the paper notes low-confidence respondents often report zero/low WTA).

4. Outlier handling for mean WTA is ad hoc (rank “10× jump” rule; effective €1,000,000 cutoff) and changes headline results by orders of magnitude; no preregistered or theory-grounded robust estimation strategy is provided.

5. The WTA–WTP gap is enormous and central; interpreting WTA as the “right” welfare measure is not justified within a clear rights baseline or welfare-theoretic loss function, and the paper alternates between mean/median and WTA/WTP to motivate conclusions.

6. Construct validity: responses may reflect moral attitudes, social desirability, or misunderstanding rather than welfare judgments; the lack of correlation between representative-agent valuations and MWTP for cruelty-free meat suggests the measures are capturing different constructs or are noisy/unstable.

7. Statistical inference is limited: small effective samples, multiple tests, low power, and no multivariate modelling; key heterogeneity questions (knowledge, diet, engagement, comprehension) are left largely unresolved.

8. The per-kg conversion (animal valuation divided by assumed edible meat yield) is a key driver of the “chicken >> beef” conclusion but is not stress-tested for alternative yield assumptions, production systems, or uncertainty in yields.

9. External validity is narrow (Flemish-speaking Belgian online panel, 18–65); nevertheless, results are used to motivate broad global/practitioner conclusions and compared numerically to other countries’ parameters.

10. The carbon-tax backfire calculation mixes contexts (Belgian welfare costs with US prices/elasticities/consumption) and relies on strong functional-form assumptions; uncertainty is not propagated, yet the policy recommendation (flat meat tax) is stated relatively strongly.

11. Reporting/replicability gaps: the manuscript does not (in the provided text) specify full questionnaire wording in an appendix, detailed data cleaning scripts, or provide data/code, limiting reproducibility and auditability.

12. Interpretation sometimes overstates what the data show: given sensitivity analyses and acknowledged unreliability, claims about welfare costs “likely” exceeding consumer surplus and climate costs should be framed as highly contingent scenarios rather than robust estimates.


Clancy_2024

Coda title: The Returns to Science In the Presence of Technological Risks

Coverage: 100% | Weighted Coverage: 68% | Precision: 42%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 79 73
Advancing knowledge 76 80
Claims & evidence 83 64
Global relevance 79 84
Logic & communication 78 85
Methods 72 58
Open science 74 43
Real-world relevance 70
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

All human issues are at least partially covered (100% coverage), with a strong match on the superforecaster-vs-expert weighting and moderate matches on discounting and scope/AI omission. Precision is lower because many LLM-identified issues (normative welfare choices, lag calibration, replicability, etc.) are not present in the human critique set provided.

The main gap is that the LLM does not explicitly foreground the human’s headline fairness/omission claim—i.e., comparing broad historical benefits of science against a narrow biotech risk while neglecting AI risk estimates and other domains (nuclear/climate rationale). Instead, it touches the issue indirectly via ‘AI/engineering may dominate capabilities’ (L3) and via contestability of biotech-attribution assumptions (L9). Conversely, the LLM contributes several additional model-structure and methodology critiques (forecast-to-hazard mapping, welfare function, lags, replicability) that are not reflected in the human issues listed here.

✅ Matched Issues (3 human issues with LLM coverage)

H1: Scope bias: biotech focus ignores AI risk ↔︎ L3, L9 55%
The human critique argues the paper unfairly pits all benefits of science against (mostly) biotech risks while omitting AI and other risk domains; the LLM similarly flags that conclusions about shifting risk with “science pace” may fail if AI/engineering dominates (L3) and that biotech-risk framing rests on contestable attribution/assumption choices (L9).
👤 Human expert critique text
Unfair to compare all benefits of science to only biotech risk; neglect of AI Risk. NotebookLM: Evaluator 3 noted that AI risk estimates (both catastrophic and extinction) from the Existential Risk Persuasion Tournament (XPT) are much higher than those for biotech risks (e.g., superforecasters see AI extinction probability as 38x higher than bio extinction probability).
🤖 Matched LLM issue text

L3: The “time of perils” is modeled as a discrete step increase with constant hazards (d, d_x) and a one-year shift from pausing/accelerating science; real risk likely evolves endogenously and may not be meaningfully shifted by marginal changes in science pace, especially if AI/engineering dominate capabilities.

L9: The claim that non-extinction mortality risks from advanced biotech are too small to offset historical benefits hinges on assuming historical benefit trends persist and are attributable to ‘science’ at the stated shares (e.g., 56% of life expectancy gains); these attribution fractions are only weakly identified.

Show detailed discussion
Humans emphasize a key omission and fairness concern: the analysis is structurally tilted (“stacks the deck”) because it aggregates broad historical benefits of science but compares them primarily to biotech harms, while sidelining AI risk (and inadequately explaining why nuclear/climate are treated as ‘sunk costs’). The LLM does not directly call out the omission of AI risk estimates or the apples-to-oranges comparison, but L3 gestures at the same underlying vulnerability: marginal changes in “science” may not move the frontier risk if AI/capability drivers are elsewhere. L9 partially overlaps by challenging the claim that biotech non-extinction risks are too small, noting the dependence on historical-benefit persistence and weakly identified attribution shares. Overall, the LLM covers the ‘model may be mis-scoped / mis-framed around biotech’ concern, but it does not match the human critique’s central complaint about explicitly excluding AI risk estimates and the implied one-sided comparison.
H2: Discounting choice and long-run extrapolation fragility ↔︎ L2, L10 60%
The human critique targets the paper’s 2% discount rate as inconsistently justified and based on extrapolating a short-horizon AI forecast; the LLM flags that results are highly sensitive to the discounting/hazard construct (L2) and that the extension across regimes treats discounting inconsistently (L10).
👤 Human expert critique text
According to the author, this choice was based on the focus on funding ‘fundamental science’ (perhaps ‘academic’ work) and the assumption that AI progress mainly comes from private, applied, for-profit research. But he conceded this assumption may be “overly strong”. 2. Choice of 2% discount rate is not consistently justified
🤖 Matched LLM issue text

L2: The “epistemic regime” discounting (constant annual p) combines forecast invalidation, catastrophic risk, and structural change into one hazard and assumes policy has no effect on regime transition in the baseline; conclusions about long-run benefits vs perils are highly sensitive to this construct.

L10: The extinction-risk extension uses a reduced-form term for the value of the next regime (λW) without a consistent treatment of discounting, population, and income dynamics across regimes; the resulting “break-even λ” numbers can be misleadingly precise.

Show detailed discussion
Humans focus on a concrete parameter-choice critique: why 2% is chosen, the inconsistency with criticisms made elsewhere, and the implicit extrapolation of ~20-year AI timelines to millennia—plus sensitivity of conclusions to that rate. The LLM reframes the issue more structurally: L2 criticizes the ‘epistemic regime’ discounting as a compound hazard with strong assumptions and high sensitivity, without centering on the specific 2% numerical justification. L10 adds that the extinction-risk extension can yield misleadingly precise break-even numbers because discounting/population/income dynamics are not treated consistently across regimes. Thus, the overlap is real (discounting drives conclusions and is not robust), but the LLM does not explicitly address the particular Open Phil-derived 2% rationale and the alleged ad-hoc nature of the author’s post-hoc justification.
H3: Overconfident preference for superforecasters vs experts ↔︎ L11 90%
Both the human critique and L11 question whether the paper is too confident in privileging superforecaster estimates over domain expert estimates when that choice flips the sign of the recommendation.
👤 Human expert critique text
NotebookLM: Evaluators 1 and 3 argued that Clancy’s stated personal preference (3 in 4 chance) for the more optimistic Superforecaster estimates (which result in a positive return to science) over the Domain Expert estimates (which suggest accelerating science may be a net negative) was “too strong” or required more ambivalence
🤖 Matched LLM issue text

L11: Arguments for preferring superforecasters over domain experts (intersubjective accuracy, correlated pessimism, limited track-record evidence) are suggestive but not robustly validated; conclusions about whether to accelerate science under extinction risk remain epistemically fragile.

Show detailed discussion
Humans report reviewers’ concern that the author’s stated personal preference (e.g., ‘3 in 4 chance’) for the superforecaster-derived optimistic view is too strong and should be more ambivalent given the stakes and uncertainty. L11 closely matches by arguing that the reasons offered for favoring superforecasters (track record, correlated pessimism, intersubjective accuracy) are not robustly validated and leave the accelerate-vs-pause conclusion epistemically fragile. The LLM frames it as an evidential/validation shortfall rather than a tone/credence-calibration problem, but the substance is the same: the justification for weighting forecaster classes is not strong enough to support confident conclusions. This is a near-direct alignment.
📋 Unmatched LLM Issues (7 not in human critique)
L1: Conflation of average returns, one-year pause effects, and marginal philanthropic acceleration under diminishing returns/general equilibrium.
Why extra: Human critiques provided here focus on scope (AI omission), discounting justification, and forecaster weighting; they do not raise this conceptual-equivalence/identification problem.
L4: Strong, multi-step transformations from XPT forecasts into annual hazards with unpropagated uncertainty.
Why extra: Humans mention discount-rate extrapolation and sensitivity but not the specific forecast-to-hazard conversion pipeline and missing uncertainty propagation.
L5: Normative welfare-function commitments (log income, healthy-year weighting, population ethics shifts) materially drive results.
Why extra: Human issues listed do not address normative welfare aggregation choices or comparability across model versions.
L6: Population/health mechanism likely overstates long-run effects; ‘more realistic health’ patch adds hardcoded assumptions.
Why extra: Not raised in the human critiques provided; this is a model-mechanism critique beyond discounting/scope.
L7: Calibration of long science→invention and diffusion lags relies on indirect proxies and drives results.
Why extra: Humans did not flag lag structure/calibration as a key weakness in the provided critique set.
L8: ROI comparison/counterfactual is not policy-relevant; linear conversion from global pause to per-dollar effects is suspect.
Why extra: Humans did not critique the chosen ROI metric/counterfactual framing in the provided issues.
L12: Limited replicability due to missing packaged code/data and insufficient reproducible workflows/sensitivity analysis.
Why extra: Humans here did not raise reproducibility concerns.
📄 View full human critique

Necessary None

Optional but important Unfair to compare all benefits of science to only biotech risk; neglect of AI Risk “stacks the deck against concluding in favor of a science slowdown…” NotebookLM: Evaluator 3 noted that AI risk estimates (both catastrophic and extinction) from the Existential Risk Persuasion Tournament (XPT) are much higher than those for biotech risks (e.g., superforecasters see AI extinction probability as 38x higher than bio extinction probability).

[NotebookLM] Author’s Response: The author, Matt Clancy, acknowledged that “all three reviewers noted at various points […] that this analysis should be extended to include AI risk”. He agreed that the report “should have spent more time on this question” of why it ignores the impact of science on other risk areas. He provided an extensive response attempting to model AI risks post-evaluation.

According to the author, this choice was based on the focus on funding ‘fundamental science’ (perhaps ‘academic’ work) and the assumption that AI progress mainly comes from private, applied, for-profit research. But he conceded this assumption may be “overly strong”. Also, he saw other risks (nuclear weapons and climate change) as primarily “sunk costs of technological progress”. But this was not explained in the initial paper, so it should be flagged as an ~issue, omission or error.

  1. Choice of 2% discount rate is not consistently justified [NotebookLM, checked manually] [E2] The author chose 2% (driven mainly by the 1.6% annual probability of transformative AI derived from the Open Philanthropy Worldview contest, assembling credences of AGI arriving before 2043). … this is implicitly extrapolating a forecast designed to be resolved in ~20 years out for thousands of years, which is the same criticism the author used to dismiss alternative, lower discount rates implied by XPT participants’ forecasts (0.05%–0.7%) E1 did simple calculations that confirmed that the results are sensitive to this choice of discount factor [NotebookLM] … [Author accepts] that this implicitly extrapolates a short-term forecast over a much longer timeframe. Justified the choice by noting that a higher discount rate limits the impact of assumptions about the distant future, arguing that this shortcut “has less severe consequences for the model’s results when the implied discount rate is higher” The author’s response might be seen as an ad-hoc justification, and it was not mentioned in the paper → flag as an ~issue, omission or error.

Unsure if correct/relevant NotebookLM: Evaluators 1 and 3 argued that Clancy’s stated personal preference (3 in 4 chance) for the more optimistic Superforecaster estimates (which result in a positive return to science) over the Domain Expert estimates (which suggest accelerating science may be a net negative) was “too strong” or required more ambivalence

🤖 View all GPT key issues (12)

1. The analysis conflates (i) the average return to global science spending, (ii) the effect of a hypothetical one-year global pause in science, and (iii) the marginal impact of metascience/philanthropic acceleration; these are not equivalent under diminishing returns, portfolio composition, and general equilibrium responses.

2. The “epistemic regime” discounting (constant annual p) combines forecast invalidation, catastrophic risk, and structural change into one hazard and assumes policy has no effect on regime transition in the baseline; conclusions about long-run benefits vs perils are highly sensitive to this construct.

3. The “time of perils” is modeled as a discrete step increase with constant hazards (d, d_x) and a one-year shift from pausing/accelerating science; real risk likely evolves endogenously and may not be meaningfully shifted by marginal changes in science pace, especially if AI/engineering dominate capabilities.

4. Mapping XPT forecasts into annual mortality and extinction hazards involves many strong transformations (conditioning on regime exit via GDP/extinction questions; imposing q0/q1/q2 step form; annualizing 5-year probabilities; severity assumptions for >1% and >10% events); uncertainty from this pipeline is not propagated into results.

5. The welfare function embeds strong normative commitments (log income, 2 log-points per healthy year, aggregation over persons and time) that materially drive results; the treatment of population ethics shifts across model versions (constant growth vs fixed births), affecting comparability and interpretation of the headline ROI.

6. The baseline model’s health/population mechanism (a one-year pause permanently reduces population growth) likely overstates long-run effects; the “more realistic health” correction helps but introduces other hardcoded assumptions (US SSA survival curves, ceiling at age 120, diminishing returns via log(birthyear−1800)).

7. The choice and calibration of long lags (20 years science→invention plus ~54 years diffusion) rely on indirect proxies (patent citation patterns, GDP/life expectancy convergence) that conflate science with other determinants; results depend on these lags because costs are assumed near-immediate while benefits are delayed.

8. The comparison metric (ROI vs giving $1 to a $50k earner) is not a policy-relevant counterfactual for most funders, and the conversion from a one-year global pause to per-dollar marginal effects assumes near-linearity and a correct denominator for “science spending.”

9. The claim that non-extinction mortality risks from advanced biotech are too small to offset historical benefits hinges on assuming historical benefit trends persist and are attributable to ‘science’ at the stated shares (e.g., 56% of life expectancy gains); these attribution fractions are only weakly identified.

10. The extinction-risk extension uses a reduced-form term for the value of the next regime (λW) without a consistent treatment of discounting, population, and income dynamics across regimes; the resulting “break-even λ” numbers can be misleadingly precise.

11. Arguments for preferring superforecasters over domain experts (intersubjective accuracy, correlated pessimism, limited track-record evidence) are suggestive but not robustly validated; conclusions about whether to accelerate science under extinction risk remain epistemically fragile.

12. Replicability is limited: the report describes a Python simulation and spreadsheet steps but does not clearly provide code/data artifacts, and many parameter choices involve manual judgment calls that would benefit from packaged reproducible workflows and probabilistic sensitivity analysis.


Dullaghan_and_Zhang_2022

Coda title: Forecasts estimate limited cultured meat production through 2050 (EA forum post)

Coverage: 88% | Weighted Coverage: 59% | Precision: 55%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 68 72
Advancing knowledge 69 70
Claims & evidence 72 68
Global relevance 70 80
Logic & communication 74 77
Methods 56 58
Open science 71 78
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

The LLM captures most major human critiques (7/8) at least partially—especially small-sample limitations and over-reliance on TEA anchors—though it misses the elicitation-collaboration point and does not reproduce some human critiques’ specific factual details (e.g., the FGF2 unit error). Precision is moderate because several LLM issues introduce additional, plausible concerns (resolution protocols, construct ambiguity, welfare-mechanism analysis, macro-scenario conditioning) not raised by the human experts.

Scoring threshold used: match_quality >= 30 counts toward coverage; an LLM issue counts toward precision if it matches at least one human issue at >= 30. Partial matches mainly arise where both sides critique “technical inaccuracies,” but refer to different concrete mistakes (H2 vs L4) or where one side targets narrative misuse of conditional probabilities (H3) while the other targets question complexity more generally (L9).

✅ Matched Issues (7 human issues with LLM coverage)

H1: Tiny sample undermines inference and aggregation ↔︎ L1, L5, L6 90%
Both critiques emphasize that n=6 is too small for strong conclusions and that aggregation/inference can be driven by outliers. The LLM additionally focuses on pooling-rule justification and statistical-testing validity as specific failure modes of small-n.
👤 Human expert critique text
Small sample limitations: somewhat overstated/mis-stated claims from these small samples.
🤖 Matched LLM issue text

L1: Very small and potentially unrepresentative panel (n=6) for long-horizon technology-scaling forecasts; no demonstrated calibration for this domain/horizon, and no quantification of aggregate uncertainty or sensitivity to individual forecasters.

L5: Aggregation choice (geometric mean of odds) is not justified or stress-tested against alternative pooling rules (linear pool, extremizing, performance weighting, leave-one-out), despite the tiny panel and potentially influential outliers.

L6: Correlation and “statistical significance (p<0.05)” claims are not credible with n=6 and many tested pairs; multiple-comparisons and model assumptions are not addressed, risking spurious inferences about which signposts matter.

Show detailed discussion
The human critique argues the sample (one expert + five forecasters) is too small to support the strength of some claims, and flags a concrete example where aggregate reporting “hides disagreement.” The LLM similarly centers the small, potentially unrepresentative panel (L1), and extends the critique to methodological consequences: un-justified aggregation choices and lack of sensitivity checks (L5), plus unreliable p-values/correlations with many comparisons at n=6 (L6). The human framing is primarily about over-claiming from limited evidence and interpretability of disagreement; the LLM frames it as statistical and aggregation invalidity. These are highly aligned, with the LLM being more formal about which analyses become non-credible under small-n.
H2: Material input-cost framing errors may bias forecasts ↔︎ L4, L8 55%
Both point to potentially important errors or mis-specifications in technical input-cost information provided to forecasters. However, the human issue is a specific units/typo error (FGF2 cost per gram vs per kg), while the LLM highlights a different wording/reference error in a different input-cost question.
🤖 Matched LLM issue text

L4: Acknowledged wording/reference error in the amino-acids input-cost question (mixing “recombinant proteins” and non-amino-acid examples) plausibly biases an important ‘crux’ variable and complicates interpretation of correlations and narratives.

L8: Heavy reliance on a few techno-economic analyses as anchors without a structured approach to combining first-principles engineering constraints with reference-class growth; this may over-weight specific assumptions (e.g., sterility standards, media costs).

Show detailed discussion
The human expert flags a concrete, high-magnitude table error: reporting Delft’s FGF2 costs per kg when the source was per gram, potentially distorting comparisons by ~1000x and possibly missing the “eliminate FGF2” possibility. The LLM does not identify that exact FGF2 unit error, but raises a closely related class of problem: a wording/reference error in a key cost question (L4) and broader risks of anchoring on TEAs and their assumptions (L8). In both cases, the concern is that mistaken or misleading technical cost inputs can shift forecasts and downstream narratives. The match is partial because the specific factual error (FGF2 units) and its implications are not explicitly captured by the LLM issues.
H3: Confusion from conditional/compound question interpretation ↔︎ L9 70%
Both critiques target misinterpretation risk when outcomes are conditional on other events or thresholds. The human critique focuses on the authors’ narrative mischaracterizing what a conditional forecast implies, while the LLM focuses on compound, cognitively demanding question design that increases interpretation variance.
👤 Human expert critique text
  1. Mischaracterization of conditional claims: Consumer approval did not appear as a major constraint as measured by the probability of large-scale anti-cultured meat protests (18%), or the majority of survey respondents saying they would be willing to try cultured meat (70%) (conditional upon >10,000 cumulative metric tons of cellular meat produced at any price by 2031).
🤖 Matched LLM issue text
L9: Some questions are compound and cognitively demanding (multiple thresholds, time windows, price conditions, species restrictions), increasing noise and interpretation variance across forecasters.
Show detailed discussion
The human critique argues the paper improperly treats conditional probabilities (e.g., willingness to try/protest probabilities conditional on >10kt production) as evidence that consumer approval is not a barrier, when forecasters could believe approval is a barrier precisely because the condition is unlikely. The LLM’s L9 flags that multi-threshold, condition-heavy questions increase noise and cross-forecaster variance, which directly contributes to the kind of confusion the human highlights. The LLM doesn’t explicitly call out the specific narrative error (drawing an unconditional conclusion from conditional statements), but it does identify the design feature that makes such mischaracterizations likely. Thus the overlap is substantial but not exact.
H4: TEAs treated as forecasts rather than conditionals ↔︎ L8 65%
Both critiques warn that techno-economic analyses (TEAs) can be misused as anchors or implicit predictions rather than conditional scenario estimates. The LLM frames this as over-reliance on a few TEAs and lack of a structured combination approach, while the human emphasizes explicitly clarifying TEAs are not future predictions and are tied to present-day costs.
👤 Human expert critique text
[Manheim] A potentially critical issue was found with the table provided to forecasters and experts for question 24, which says that the Delft analysis had a cost of FGF2/kg of 1.3-2.3B/kg, while the report itself, in table 3, this was the cost per gram, making the numbers approximately 3x what was reported for Humbird, not 3,000x. (The Risner et al price was, in fact, 1000x higher - but was also focused on the promise of eliminating FGF2, i.e. setting the price to zero. Notably, this is a possibility that the forecasts do not seem to take into account.) I do not know enough about this to check it in depth. [Seinkmane] They make further incorrect statements on growth factors, such as: “(growth factors are various hormones, cytokines, vitamins, and some other proteins that promote cell growth)”. Such a definition indeed can be found on Google, however in cell biology it is normally only protein signalling molecules, especially those affecting cell proliferation, that are classified as growth factors, certainly not vitamins and in most current literature not other cytokines either (see wiki article for quick review). Therefore for any further forecasts and synthesis I would strongly recommend further consultations with relevant experts, both in the CM field and in adjacent fields (wider cell biology and bioprocess engineering backgrounds).
🤖 Matched LLM issue text

L8: Heavy reliance on a few techno-economic analyses as anchors without a structured approach to combining first-principles engineering constraints with reference-class growth; this may over-weight specific assumptions (e.g., sterility standards, media costs).

Show detailed discussion
The human issue is that the background material did not clearly communicate TEAs as conditional estimates (often based on current costs) rather than predictive forecasts of future costs. LLM issue L8 overlaps by criticizing heavy reliance on a small set of TEAs as anchors and not integrating them with other modeling approaches or reference classes. Both are concerned that readers/forecasters may overweight TEA outputs and misinterpret what they mean. The human critique is more about explanatory clarity and epistemic status; the LLM critique is more about methodological over-weighting and combination of evidence. These are closely related but not identical.
H5: Domain-definition inaccuracies suggest missing technical expertise ↔︎ L4, L8 45%
Both critiques identify technical inaccuracies in the background/questions that could mislead forecasters. The human critique specifically calls out an incorrect definition of “growth factors” and recommends more cell-biology/bioprocess expertise; the LLM cites a different technical wording/reference error and general TEA-anchoring weakness.
🤖 Matched LLM issue text

L4: Acknowledged wording/reference error in the amino-acids input-cost question (mixing “recombinant proteins” and non-amino-acid examples) plausibly biases an important ‘crux’ variable and complicates interpretation of correlations and narratives.

L8: Heavy reliance on a few techno-economic analyses as anchors without a structured approach to combining first-principles engineering constraints with reference-class growth; this may over-weight specific assumptions (e.g., sterility standards, media costs).

Show detailed discussion
The human expert points to a concrete definitional mistake (growth factors incorrectly including vitamins/various cytokines) and infers insufficient consultation with relevant biological/bioprocessing experts. The LLM similarly notes a problematic technical phrasing/reference mix-up in an input-cost question (L4), implying that domain details were mishandled, and more broadly that the technical basis was overly anchored to a few TEAs (L8). Both critiques converge on “technical slippage” that can distort elicitation, but they differ on the specific scientific content (growth-factor definition vs amino-acids/recombinant-protein question). As a result, the match is moderate rather than strong.
H6: Over-reliance on limited TEA sources/anchors ↔︎ L8 85%
Both critiques argue the background relied too heavily on a narrow set of techno-economic analyses, risking anchoring and outdated or incomplete assumptions. The human suggests diversifying literature sources; the LLM frames this as lacking a structured approach to combining TEAs with other evidence.
👤 Human expert critique text
  1. Diversify sources in formulating background for forecasters [Seinkmane] The authors admit themselves that they relied primarily on one source – the Humbird TEA – with the addition of a couple other papers. I have compiled a list of literature (see relevant tab) that would hopefully provide other and more recent sources to facilitate future forecasts. Aside from peer-reviewed publications and regulatory dossiers + patents available that have already been submitted for product approval, GFI releases information regularly, including their state of industry reports and reports focusing on specific topics such as media ingredients.
🤖 Matched LLM issue text
L8: Heavy reliance on a few techno-economic analyses as anchors without a structured approach to combining first-principles engineering constraints with reference-class growth; this may over-weight specific assumptions (e.g., sterility standards, media costs).
Show detailed discussion
The human issue explicitly notes the authors leaned primarily on the Humbird TEA plus a few other papers and recommends broadening to additional peer-reviewed work, regulatory dossiers, patents, and industry reports. LLM issue L8 closely matches by criticizing heavy reliance on a few TEAs as anchors and not combining them systematically with engineering constraints and reference-class growth. Both are concerned about anchoring effects and missing newer or alternative assumptions that would change forecasts. The LLM adds a methodological prescription (structured synthesis), while the human provides concrete source categories and examples. Overall, these are strongly aligned.
H8: Insufficient specification of biological product assumptions ↔︎ L9 60%
Both critiques raise that unclear biological/species/tissue constraints in questions can change interpretations and cost implications. The human focuses on defining cell type (species and tissue) because it affects media needs and cost; the LLM notes that species restrictions and other compound constraints increase interpretation variance.
🤖 Matched LLM issue text

L9: Some questions are compound and cognitively demanding (multiple thresholds, time windows, price conditions, species restrictions), increasing noise and interpretation variance across forecasters.

Show detailed discussion
The human critique argues that focusing on “cow cells” without specifying the relevant cell/tissue type leaves major cost drivers underdetermined, since media requirements differ substantially by cell type and species. LLM issue L9 addresses a related design problem: complex questions with species restrictions and multiple thresholds increase noise and variance in interpretation. The LLM does not explicitly emphasize the bioprocess cost mechanism (cell type → media requirements → cost), but it does capture the general risk that underspecified biological constraints reduce comparability across forecasts. This makes the match moderate.
⚠️ Unmatched Human Issues (1 not captured by LLM)
H7: Forecasters/experts lacked opportunity to discuss and update forecasts (structured collaboration).
Why missed: No LLM issue addresses the elicitation process design choice of preventing discussion/iterative updating (e.g., Tetlock-style collaboration); the LLM focuses on question design, aggregation, and technical framing instead.
📋 Unmatched LLM Issues (5 not in human critique)
L2: Ambiguity in the core metric “>51% from animal cells” (mass vs calories vs value).
Why extra: Human critiques did not raise definitional ambiguity for the headline construct or hybrid-product resolution; they focused more on sample size, TEA framing, and specific technical/background errors.
L3: High resolution/measurement risk for global volumes and lack of a detailed pre-committed resolution protocol.
Why extra: Humans did not discuss outcome verifiability, proprietary data, or formal resolution rules/data sources for endpoints.
L7: Mixing forecasts with intervention recommendations without modeling counterfactual leverage.
Why extra: Humans did not critique the decision-analysis leap from probabilities to philanthropic strategy; their focus stayed on elicitation validity and technical framing.
L10: Welfare impact mechanism (displacement, rebound, market expansion) acknowledged but not analyzed.
Why extra: Human issues did not emphasize downstream animal-welfare impact modeling; they emphasized accuracy/interpretability of the forecasts and inputs.
L11: Underspecified “conditional on no transformative AI” and missing macro-scenario sensitivity.
Why extra: Humans did not raise scenario-conditioning or macro uncertainty framing beyond the specific conditional-probability mischaracterization about consumer acceptance.
📄 View full human critique

Necessary Small sample limitations [Manheim] severely limit the conclusions which can be drawn from the difference in views between the single expert and the five forecasters. Somewhat overstated/mis-stated claims from these small samples: Manheim — “The aggregate reported results, most notably, a 9% chance of greater than 50m metric tons of cultured meat sold in 2051, hide the disagreement, in this case, the fact that two forecasters reported a greater than 40% probability of this occurring.”

  1. Magnitude of units error in the table for Delft overstating the cost of FGF2 1000x [Manheim] A potentially critical issue was found with the table provided to forecasters and experts for question 24, which says that the Delft analysis had a cost of FGF2/kg of 1.3-2.3B/kg, while in the report itself, in table 3, this was the cost per gram, making the numbers approximately 3x what was reported for Humbird, not 3,000x. (The Risner et al price was, in fact, 1000x higher - but was also focused on the promise of eliminating FGF2, i.e. setting the price to zero. Notably, this is a possibility that the forecasts do not seem to take into account.) I do not know enough about this to check it in depth. Author’s response: Hi, Yeah that looks like a typo at first glance. Plausible it influenced things a little, though unsure it’s critical to the end results.

  2. Mischaracterization of conditional claims The authors stated: Consumer approval did not appear as a major constraint as measured by the probability of large-scale anti-cultured meat protests (18%), or the majority of survey respondents saying they would be willing to try cultured meat (70%) (conditional upon >10,000 cumulative metric tons of cellular meat produced at any price by 2031). I assume he is saying that the ‘probability of protest predicted to be low’ and ‘willingness to try’ being high are both signs that consumer approval is not a major barrier. But as you say this is conditional upon >10 kt of production. So the forecasters might indeed think “This is unlikely to be accepted by consumers, and there will probably be massive protests if it is rolled out. And that will be a major barrier. But in the unlikely scenario that consumers seem to like it, and there are no big protests, that makes it far more likely that 10kt will be produced”.

Optional but important [Manheim] “the presentation of the Techno-Economic Analyses did not clarify that these analyses are conditional estimates, not predictions, and that high price scenarios were all based on the present-day costs… not predicted future cost.” More consultation with cell biology and bioprocessing expertise in formulating background for forecasters (specific definitional mistakes reflect a lack of expertise) [Seinkmane] They make further incorrect statements on growth factors, such as: “(growth factors are various hormones, cytokines, vitamins, and some other proteins that promote cell growth)”. Such a definition indeed can be found on Google, however in cell biology it is normally only protein signalling molecules, especially those affecting cell proliferation, that are classified as growth factors, certainly not vitamins and in most current literature not other cytokines either (see wiki article for quick review). Therefore for any further forecasts and synthesis I would strongly recommend further consultations with relevant experts, both in the CM field and in adjacent fields (wider cell biology and bioprocess engineering backgrounds). 3. Diversify sources in formulating background for forecasters [Seinkmane] The authors admit themselves that they relied primarily on one source – the Humbird TEA – with the addition of a couple other papers. I have compiled a list of literature (see relevant tab) that would hopefully provide other and more recent sources to facilitate future forecasts. Aside from peer-reviewed publications and regulatory dossiers + patents available that have already been submitted for product approval, GFI releases information regularly, including their state of industry reports and reports focusing on specific topics such as media ingredients.

Less important/unsure Lack of discussion and updating in expert elicitation [Manheim] It is somewhat unfortunate that the forecasters and the expert were not given an opportunity to discuss and update, as this is an important part of what enables superforecaster success. Specifically, Gardner and Tetlock’s (2015) book points out that discussion dynamics and structured collaboration significantly amplify performance compared to individual experts, due to shared reasoning and cross-examination of assumptions.

  1. Define the cell type more clearly [Seinkmane] Neil & Linch have focused on beef (their questions refer to “cow cells”) and they don’t define cell type very specifically… for future forecasts CM type should be better-defined, as both the species and the tissue type influence the media requirement – and therefore the cost – to a great extent
🤖 View all GPT key issues (11)

1. Very small and potentially unrepresentative panel (n=6) for long-horizon technology-scaling forecasts; no demonstrated calibration for this domain/horizon, and no quantification of aggregate uncertainty or sensitivity to individual forecasters.

2. Ambiguous key construct: “>51% from animal cells” was not specified as weight/volume/calories/value; this can change both forecasters’ interpretations and eventual resolution, especially for hybrid products.

3. Resolution and measurement risk for global production/sales volumes is high (proprietary data, inconsistent reporting); the paper does not fully pre-commit to a detailed resolution protocol and authoritative data sources for the main endpoints.

4. Acknowledged wording/reference error in the amino-acids input-cost question (mixing “recombinant proteins” and non-amino-acid examples) plausibly biases an important ‘crux’ variable and complicates interpretation of correlations and narratives.

5. Aggregation choice (geometric mean of odds) is not justified or stress-tested against alternative pooling rules (linear pool, extremizing, performance weighting, leave-one-out), despite the tiny panel and potentially influential outliers.

6. Correlation and “statistical significance (p<0.05)” claims are not credible with n=6 and many tested pairs; multiple-comparisons and model assumptions are not addressed, risking spurious inferences about which signposts matter.

7. The report mixes descriptive forecasts with strategic recommendations, but does not model counterfactual philanthropic leverage; probabilities of outcomes do not directly imply which interventions are best.

8. Heavy reliance on a few techno-economic analyses as anchors without a structured approach to combining first-principles engineering constraints with reference-class growth; this may over-weight specific assumptions (e.g., sterility standards, media costs).

9. Some questions are compound and cognitively demanding (multiple thresholds, time windows, price conditions, species restrictions), increasing noise and interpretation variance across forecasters.

10. Key welfare mechanism (degree of displacement of conventional meat, rebound effects, market expansion) is acknowledged but not analyzed, limiting direct usefulness for animal-welfare impact estimation.

11. The “conditional on no transformative AI” assumption is underspecified and may complicate interpretation; the paper does not discuss how sensitive conclusions are to broader macro/tech scenarios (policy shocks, pandemics, geopolitical shifts).


Frech_et_al._2023

Coda title: Pharmaceutical Pricing and R&D as a Global Public Good

Coverage: 50% | Weighted Coverage: 42% | Precision: 33%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 52 70
Advancing knowledge 59 62
Claims & evidence 53 66
Global relevance 71 82
Logic & communication 76 78
Methods 64 58
Open science 85 45
Real-world relevance 65
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

The LLM captures the main human concern about over-interpreting the GDP correlation (including a close analogue to the willingness-to-pay alternative), but it misses the human’s separate critique about overconfident BOTEC cost-effectiveness claims. Many LLM points are additional, more technical measurement/replication critiques not present in the human list.

Matching rule used: an LLM issue is considered a match if it addresses the same underlying threat to inference/interpretation raised by the human issue. For H1, L2/L7/L8 are core overlaps (identification/non-diagnostic correlation), while L10 is a partial overlap (inference fragility reinforcing non-causal interpretation). H2 has no clear analogue among L1–L12.

✅ Matched Issues (1 human issue with LLM coverage)

H1: GDP correlation over-interpreted as causal effect ↔︎ L2, L7, L8, L10 85%
Both critiques argue the paper treats a GDP–(price/contribution) correlation as evidence for the authors’ mechanism, despite alternative explanations and weak identification. The LLM list expands this into specific reasons the GDP relationship is non-diagnostic (mechanical scaling, non-testable theory mapping, and confounding elasticities).
👤 Human expert critique text
[E2] The authors interpret the positive correlation between GDP and drug prices as evidence supporting their hypothesis that drug prices are affected by how much countries internalize returns from R&D (because countries with higher GDP also benefit more from future innovation). However, the relationship between GDP and drug prices could be driven by many factors, chief among them the willingness to pay for current health benefits.
🤖 Matched LLM issue text

L2: The main empirical result (log contribution strongly explained by log GDP with elasticity > 1) is plausibly mechanical because the contribution measure is built from drug spending/sales, which naturally scales with GDP; this does not identify public-goods strategic behavior.

L7: Theoretical constructs (Lindahl optimum; Nash interior/corner equilibria; bargaining) are not mapped to testable implications beyond the GDP correlation; the empirical work does not discriminate among the proposed models.

L8: The “exploitation hypothesis” test (elasticity > 1) is not a clean test of Olson–Zeckhauser in this setting because it conflates general health/pharma spending elasticities with strategic under-contribution to R&D.

L10: Outlier sensitivity and leverage are not thoroughly assessed (the US and Japan dominate totals; negative/zero contributions are dropped in logs), which can affect coefficient estimates and inference in small samples.

Show detailed discussion
The human critique (E1) frames the key problem as omitted variable bias and overly simplistic regression—GDP is correlated with other determinants, so causal language is too strong. It also (E2) highlights an alternative channel: willingness-to-pay for current health benefits could drive higher prices in richer countries, independent of internalizing R&D returns. The LLM critique overlaps strongly by arguing the GDP relationship may be mechanically induced by how the dependent variable is constructed (L2) and that the empirical work doesn’t cleanly test the theory beyond the GDP correlation (L7). L8 aligns closely with the human’s willingness-to-pay point by noting the elasticity result conflates general spending elasticities with strategic “under-contribution,” and L10 adds that small-sample leverage/outliers could further undermine inference, reinforcing the human’s caution about interpretation.
⚠️ Unmatched Human Issues (1 not captured by LLM)
H2: BOTEC/cost-effectiveness claims stated too confidently; high sensitivity to assumptions and heterogeneity across indications.
Why missed: None of the LLM issues directly address the paper’s back-of-the-envelope cost-effectiveness (e.g., $/QALY) claims, their sensitivity analyses, or across-indication variation; the LLM focuses on the cross-country pricing/R&D contribution framework instead.
📋 Unmatched LLM Issues (8 not in human critique)
L1: Outcome (‘contribution’ in one year) may not reflect incentive-relevant global expected profits for marginal R&D.
Why extra: This is a construct/measurement-validity critique about aligning the dependent variable with R&D incentives; the human critique focuses on causal over-interpretation of GDP correlations and BOTEC confidence.
L3: Marginal cost weakly identified; MC choices can change levels/signs of contributions.
Why extra: The human critique does not discuss marginal-cost identification or robustness of MC assumptions.
L4: Assuming common marginal cost across countries is problematic due to system-specific downstream costs.
Why extra: Not raised by the human critique; it is a specific cross-country cost comparability issue.
L5: Cross-country price indices may be distorted by mix, timing, coverage, and confidential discounts.
Why extra: The human critique does not flag price-index measurement/comparability problems.
L6: Ignoring quantity/access differences risks mislabeling lower spending as free-riding.
Why extra: The human critique does not discuss utilization, access, launch lags, or coverage as confounders.
L9: National branded-originator revenues built from strong assumptions (total sales × brand share) without auditing consistency with index basket.
Why extra: This is a data-construction/auditing concern not present in the human critique.
L11: Policy conclusion ‘ROW should pay more’ lacks explicit welfare tradeoff (dynamic innovation vs static access/affordability).
Why extra: The human critique centers on identification/interpretation and BOTEC confidence, not on welfare accounting for the policy recommendation.
L12: Replication barriers: code/data wrangling opacity and missing uncertainty propagation.
Why extra: The human critique does not mention reproducibility, code availability, or uncertainty propagation.
📄 View full human critique

Optional but important

  1. Causal claims too strong — GDP correlated with other important determinants of outcomes

[E1] The primary regression analyzed in the paper is overly simplistic and may be subject to omitted variable bias. There could be other factors correlated with GDP that are not included in the model. Therefore, results derived from this model should be interpreted with caution as the estimated effect of GDP may be overstated or misattributed.

[E2] The authors interpret the positive correlation between GDP and drug prices as evidence supporting their hypothesis that drug prices are affected by how much countries internalize returns from R&D (because countries with higher GDP also benefit more from future innovation). However, the relationship between GDP and drug prices could be driven by many factors, chief among them the willingness to pay for current health benefits.

Unsure if correct

  1. [E2] BOTECs stated too confidently, not incorporating sensitivity to key assumptions

The authors, citing numbers from another one of their studies, claim that new drugs launched in the U.S. are very cost-effective—around $40,000 per Quality-Adjusted Life Year (QALY) (Frech et al. 2022). Two main criticisms of these back-of-the-envelope calculations: First, estimates of drug cost-effectiveness rely on many assumptions and are generally highly sensitive to them. Second, there is likely broad variation across indications in the cost-effectiveness of drugs.

🤖 View all GPT key issues (12)

1. The dependent variable (“contribution” = quasi-rents on all branded-originator sales in a single year) is not clearly the incentive-relevant object for R&D, which depends on expected global profit streams for marginal projects; life-cycle stage, risk, and therapeutic heterogeneity are ignored.

2. The main empirical result (log contribution strongly explained by log GDP with elasticity > 1) is plausibly mechanical because the contribution measure is built from drug spending/sales, which naturally scales with GDP; this does not identify public-goods strategic behavior.

3. Marginal cost (MC) is weakly identified: using Turkey’s low price index as MC and using generic-entry price erosion (then adjusting by an average rebate) both rely on strong assumptions and can materially change levels and even signs of contributions.

4. Assuming a common MC across countries is particularly problematic because the paper’s MC includes distribution and downstream costs that vary across systems (wages, geography, pharmacy/PBM margins), confounding cross-country markups.

5. Cross-country price indices may reflect differences in drug mix, launch timing, reimbursement/coverage, and confidential discounts; interpreting them as “same products, different prices” risks mismeasurement of both prices and implied quasi-rents.

6. The analysis treats lower spending/markups as free-riding, but does not incorporate quantity/access differences (e.g., launch lags, reimbursement refusals), which could mean lower prices coincide with reduced consumption of newer drugs.

7. Theoretical constructs (Lindahl optimum; Nash interior/corner equilibria; bargaining) are not mapped to testable implications beyond the GDP correlation; the empirical work does not discriminate among the proposed models.

8. The “exploitation hypothesis” test (elasticity > 1) is not a clean test of Olson–Zeckhauser in this setting because it conflates general health/pharma spending elasticities with strategic under-contribution to R&D.

9. The construction of national branded-originator revenues from total sales × brand share assumes comparability across countries and alignment with the RAND price index basket; potential inconsistencies are not audited.

10. Outlier sensitivity and leverage are not thoroughly assessed (the US and Japan dominate totals; negative/zero contributions are dropped in logs), which can affect coefficient estimates and inference in small samples.

11. The policy conclusion that ROW countries should pay more is not supported by a welfare calculation that explicitly weighs dynamic innovation gains against static access/affordability losses and distributional concerns under differential pricing.

12. Replication barriers remain: no shared code, limited detail on data wrangling/transformations from the RAND tables, and no uncertainty propagation for MC assumptions or index measurement error.


Green_et_al._2025

Coda title: Meaningfully reducing consumption of meat and animal products is an unsolved problem: A meta-analysis

Coverage: 86% | Weighted Coverage: 60% | Precision: 58%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 57 80
Advancing knowledge 55 77
Claims & evidence 60 82
Global relevance 89 86
Logic & communication 60 85
Methods 32 74
Open science 80 91
Real-world relevance 89
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

The LLM captures most major human-identified problems (missing-data imputation, single-effect selection, and inadequate risk-of-bias/attrition coverage) and partially reflects broader systematic-review rigor concerns. It misses the specific critique about small-study exclusion affecting publication-bias detectability and does not directly address the missing reproducible search strategy; conversely, it introduces several additional conceptual and interpretability issues not mentioned by the human expert.

Match-quality scores are lower where the LLM addresses adjacent process-integrity concerns (single-coder screening, post hoc eligibility) but does not explicitly cover the human’s key reporting expectations (fully reproducible search strategy, explicit inclusion/exclusion text, documented screening justifications). Precision is moderated because multiple LLM points (outcome heterogeneity, decision-unit translation, substitution inference, generalizability, sparse-category k) are absent from the human critique.

✅ Matched Issues (6 human issues with LLM coverage)

H1: Systematic review process not standard or rigorous ↔︎ L4, L5 45%
The human critique flags non-adherence to standard systematic-review methods; the LLM raises related process-rigor concerns via single-coder screening/extraction (L4) and post hoc eligibility changes (L5).
👤 Human expert critique text
General: “The authors have not followed standard methods for systematic reviews…” — this should at least be mentioned. Improper missing data handling: Assigning SMD = 0.01 to non-significant unreported effects introduces systematic bias by ignoring imputation variance.
🤖 Matched LLM issue text

L4: Single-coder screening and extraction: inclusion/exclusion decisions and data extraction were primarily conducted by one author, increasing risk of missed studies, inconsistent coding, and bias; inter-rater reliability is not reported.

L5: Post hoc eligibility refinements: several inclusion decisions were made after data collection began (e.g., RPM category definition and exclusions for “substitution-inducing” or low-agency interventions), creating researcher degrees of freedom unless accompanied by preregistration or systematic robustness checks.

Show detailed discussion
The human issue is broad, asserting the review did not follow standard systematic review methods overall. The LLM does not explicitly call out missing PRISMA-style elements at a high level, but it does identify two concrete deviations that contribute to non-standard methods: reliance on one screener/coder (L4) and eligibility refinements after data collection began (L5). These LLM points overlap with the human’s “methods not standard” concern, but they do not fully capture the full scope implied by the human critique (e.g., reproducible search documentation). Thus the match is substantive but incomplete.
H2: Insufficiently documented screening and eligibility decisions ↔︎ L4, L5 35%
The human critique emphasizes missing reproducible search and clear inclusion/exclusion criteria; the LLM overlaps mainly on the transparency/credibility of screening and eligibility decisions (L4, L5) rather than the search strategy itself.
👤 Human expert critique text
Related: “a fully reproducible search strategy, clearly articulated inclusion and exclusion criteria …, and justification for screening decisions are not comprehensively documented in the manuscript or supplement.”
🤖 Matched LLM issue text

L4: Single-coder screening and extraction: inclusion/exclusion decisions and data extraction were primarily conducted by one author, increasing risk of missed studies, inconsistent coding, and bias; inter-rater reliability is not reported.

L5: Post hoc eligibility refinements: several inclusion decisions were made after data collection began (e.g., RPM category definition and exclusions for “substitution-inducing” or low-agency interventions), creating researcher degrees of freedom unless accompanied by preregistration or systematic robustness checks.

Show detailed discussion
The human issue focuses on missing core documentation: a reproducible search strategy, explicit inclusion/exclusion criteria, and justification for screening decisions. The LLM does not mention search strings, databases, dates, or reproducibility of the search, so that central component is not covered. However, L4 and L5 do align with the human concern about screening decisions being insufficiently robust/transparent: single-coder screening (L4) raises risk of inconsistent inclusion decisions, and post hoc eligibility refinements (L5) raise concerns about researcher degrees of freedom. Overall, the overlap is partial because the LLM critiques process integrity rather than the specific missing reporting elements the human highlights.
H3: Biased imputation for missing or null effects ↔︎ L2 95%
Both critiques target the same practice: coding inadequately reported null results as a small fixed SMD, which can bias the pooled estimate near zero.
👤 Human expert critique text
Improper missing data handling: Assigning SMD = 0.01 to non-significant unreported effects introduces systematic bias by ignoring imputation variance.
🤖 Matched LLM issue text

L2: Imputation of “unspecified nulls”: coding outcomes reported as null but lacking sufficient statistics as SMD=0.01 is arbitrary and can materially affect a pooled effect this close to zero; the count of such cases and robustness to alternative codings are not clearly presented in the main text.

Show detailed discussion
The human critique argues that assigning SMD=0.01 to non-significant unreported effects induces systematic bias and ignores imputation variance. L2 closely mirrors this by calling the coding arbitrary and potentially influential, and by noting missing reporting on how many cases were affected and whether robustness checks were done. The main difference is emphasis: the human explicitly frames it as missing-data handling that should incorporate imputation uncertainty/variance, whereas the LLM emphasizes arbitrariness, sensitivity analysis, and transparency about counts. Substantively, they converge on the same threat to validity and likely direction of distortion.
H4: Single-effect selection discards within-study information ↔︎ L1, L9 90%
Both identify loss of information and potential bias from selecting only one effect per study; the LLM adds that the specific “longest follow-up” rule needs sensitivity analyses and relates to dependence handling (L9).
👤 Human expert critique text
Single outcome selection wastes data: Extracting only one effect per study discards valuable information (multilevel modeling could work better)
🤖 Matched LLM issue text

L1: Outcome selection rule: extracting only the longest-follow-up consumption outcome per intervention is a consequential choice that may systematically shrink estimates and is not accompanied by sensitivity analyses using alternative rules (e.g., prespecified primary endpoints or multivariate follow-up modeling).

L9: Dependence and weighting: RVE is appropriate for multi-arm dependence, but results may still be sensitive to how multiple interventions from the same study are represented; additional checks (e.g., per-study aggregation under alternative models) would strengthen robustness.

Show detailed discussion
The human critique frames the problem as wasted data and suggests multilevel modeling as a better alternative to retain multiple outcomes/effects. L1 aligns strongly by criticizing the “longest follow-up only” rule as consequential and undersupported by sensitivity analyses, and it points toward alternative rules or modeling strategies. L9 partially overlaps by discussing how dependence and representation of multiple interventions per study could affect results, reinforcing the broader point that within-study multiplicity should be modeled/checked rather than collapsed arbitrarily. The LLM framing is more about rule choice and robustness checks, while the human framing is more about statistical efficiency and multilevel modeling as the remedy.
H5: Risk-of-bias assessment misses key RCT bias domains ↔︎ L7, L8 85%
Both critiques argue the bias assessment is incomplete; the LLM enumerates omitted domains (including attrition and selective reporting) and adds context on selective reporting complexities (L8).
👤 Human expert critique text
Risk-of-bias assessment is inadequate: The informal approach omits critical bias sources like selective reporting and attrition.
🤖 Matched LLM issue text

L7: Limited risk-of-bias assessment: the coded bias indicators (self-report/objective, preregistration/open data, publication status) do not fully address common RCT bias domains (attrition, allocation concealment, contamination, compliance, clustering/ICC handling).

L8: Publication-bias modeling assumptions: selection models and significance-based approaches assume selection on p<0.05 and sign, but selective reporting may occur at the outcome/analysis level and incentives differ across gray vs journal literature; this complicates interpretation of “bias-corrected” means.

Show detailed discussion
The human critique states the risk-of-bias approach is informal/inadequate and specifically flags missing selective reporting and attrition among critical sources. L7 directly matches this by stating that the coded indicators do not cover common RCT bias domains and explicitly lists attrition and related threats. L8 connects more indirectly by discussing outcome/analysis-level selective reporting and how publication-bias models may not capture it, which complements (but does not replace) a formal RoB assessment critique. Compared to the human, the LLM is more granular about methodological domains, while the human emphasizes that the approach is too informal and incomplete.
H6: Attrition bias in RCTs insufficiently addressed ↔︎ L7 70%
The human critique faults the manuscript for not discussing attrition bias; the LLM notes attrition is not covered by their risk-of-bias indicators, which is a closely related omission.
👤 Human expert critique text
Risk-of-bias assessment is inadequate: The informal approach omits critical bias sources like selective reporting and attrition. No discussion of attrition bias in RCTs… “concerning given the known non-randomness of attrition in dietary interventions”
🤖 Matched LLM issue text

L7: Limited risk-of-bias assessment: the coded bias indicators (self-report/objective, preregistration/open data, publication status) do not fully address common RCT bias domains (attrition, allocation concealment, contamination, compliance, clustering/ICC handling).

Show detailed discussion
The human issue is specifically about the lack of discussion of attrition bias and why it matters (non-random attrition in dietary interventions). L7 overlaps by identifying attrition as a missing bias domain in the study’s risk-of-bias coding, implying it was not systematically evaluated. The LLM does not explicitly echo the human’s emphasis on diet-intervention-specific attrition patterns or the need for a narrative discussion, focusing instead on measurement/assessment omissions. So the match is strong on the underlying gap (attrition not adequately handled) but not perfectly aligned on the “discussion” aspect and domain-specific rationale.
⚠️ Unmatched Human Issues (1 not captured by LLM)
H7: Excluding studies with n<25 per group is unwarranted and may hinder publication-bias detection by restricting the range of standard errors.
Why missed: No LLM issue flags the small-sample exclusion criterion or its implications for diagnosing publication bias via standard-error variation; LLM publication-bias comments focus on selection-model assumptions rather than eligibility restrictions.
📋 Unmatched LLM Issues (5 not in human critique)
L3: Pooling heterogeneous outcome constructs and conversion pathways risks non-comparability.
Why extra: The human critique did not raise construct heterogeneity or effect-size conversion comparability as a key limitation.
L6: Very sparse evidence for choice architecture category leads to over-interpretation risk.
Why extra: Humans did not focus on low-k category-specific inference or narrative overreach due to extremely wide uncertainty.
L10: Limited translation of SMDs into decision-relevant absolute units or practical impact.
Why extra: Human critiques concentrated on review methods, missing data, and bias assessment rather than interpretability/decision metrics.
L11: RPM vs MAP inference gap due to unmeasured substitution; substitution discussion should be reframed as evidence gap.
Why extra: Humans did not flag this specific conceptual inference gap about substitution effects.
L12: Generalizability constraints from eligibility choices may exclude policy-relevant interventions.
Why extra: Humans did not emphasize external validity limits arising from the review’s control/voluntariness requirements.
📄 View full human critique

Necessary General: “The authors have not followed standard methods for systematic reviews…” — this should at least be mentioned. Related: Missing “a fully reproducible search strategy, clearly articulated inclusion and exclusion criteria …, and justification for screening decisions are not comprehensively documented in the manuscript or supplement.” Improper missing data handling: Assigning SMD = 0.01 to non-significant unreported effects introduces systematic bias by ignoring imputation variance.

Optional/desirable Single outcome selection wastes data: Extracting only one effect per study discards valuable information (multilevel modeling could work better). Risk-of-bias assessment is inadequate: The informal approach omits critical bias sources like selective reporting and attrition. No discussion of attrition bias in RCTs… “concerning given the known non-randomness of attrition in dietary interventions”

Probably useful but more debatable Excluding studies with n<25 per group is ‘unwarranted’ because it makes publication bias harder to detect by ‘restricting the range of standard errors.’ [But one could make a case that this is a reasonable practical screening choice]

🤖 View all GPT key issues (12)

1. Outcome selection rule: extracting only the longest-follow-up consumption outcome per intervention is a consequential choice that may systematically shrink estimates and is not accompanied by sensitivity analyses using alternative rules (e.g., prespecified primary endpoints or multivariate follow-up modeling).

2. Imputation of “unspecified nulls”: coding outcomes reported as null but lacking sufficient statistics as SMD=0.01 is arbitrary and can materially affect a pooled effect this close to zero; the count of such cases and robustness to alternative codings are not clearly presented in the main text.

3. Heterogeneous outcome constructs: pooling across very different consumption measures (binary self-report reduction, frequency counts, purchasing data, servings) and multiple effect-size conversion pathways risks non-comparability and additional error; limited sensitivity analyses restrict to more homogeneous outcome types.

4. Single-coder screening and extraction: inclusion/exclusion decisions and data extraction were primarily conducted by one author, increasing risk of missed studies, inconsistent coding, and bias; inter-rater reliability is not reported.

5. Post hoc eligibility refinements: several inclusion decisions were made after data collection began (e.g., RPM category definition and exclusions for “substitution-inducing” or low-agency interventions), creating researcher degrees of freedom unless accompanied by preregistration or systematic robustness checks.

6. Sparse evidence for choice architecture under inclusion criteria: only 2 studies (3 estimates) qualify, yielding extremely wide uncertainty; narrative comparisons across theoretical categories can be over-interpreted despite low k.

7. Limited risk-of-bias assessment: the coded bias indicators (self-report/objective, preregistration/open data, publication status) do not fully address common RCT bias domains (attrition, allocation concealment, contamination, compliance, clustering/ICC handling).

8. Publication-bias modeling assumptions: selection models and significance-based approaches assume selection on p<0.05 and sign, but selective reporting may occur at the outcome/analysis level and incentives differ across gray vs journal literature; this complicates interpretation of “bias-corrected” means.

9. Dependence and weighting: RVE is appropriate for multi-arm dependence, but results may still be sensitive to how multiple interventions from the same study are represented; additional checks (e.g., per-study aggregation under alternative models) would strengthen robustness.

10. Limited translation to decision-relevant units: effects are reported mainly as SMDs, with minimal mapping to absolute reductions (e.g., meals/week, grams, CO2e) or cost-effectiveness, making “meaningfully reducing” hard to interpret for practitioners.

11. RPM vs MAP inference gap: the larger RPM pooled effect is clearly identified, but substitution to other MAP is unmeasured in most RPM studies; discussion of substitution remains speculative and should be framed as a key evidence gap rather than an implied conclusion.

12. Generalizability constraints: requirements for pure controls and voluntary choice, while improving interpretability, may exclude policy-relevant interventions (institutional defaults/portioning) and thereby limit external validity for real-world implementation contexts.


McGuire_et_al._2024

Coda title: The wellbeing cost-effectiveness of StrongMinds and Friendship Bench: Combining a systematic review and meta-analysis with charity-related data

Coverage: 75% | Weighted Coverage: 55% | Precision: 83%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 87 83
Advancing knowledge 88 82
Claims & evidence 88 78
Global relevance 93 90
Logic & communication 82 88
Methods 86 76
Open science 79 62
Real-world relevance 93
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

Most human critiques are reflected in the LLM’s identified issues, especially around discretionary/ad hoc adjustments and the linear decay assumption. The main gap is the human request for a clear, visual multiverse/sensitivity depiction, which the LLM does not explicitly surface.

Scoring rules used: a human issue is counted as covered if any matched LLM issue yields match_quality ≥ 30 (3 of 4 covered). An LLM issue is counted as matching if it appears in any matched_pairs entry (10 of 12 matched). Match qualities are lower where the LLM flags analogous selective-handling choices (e.g., L2/L8) rather than the specific outlier/high-RoB exclusion policy emphasized by the human expert.

✅ Matched Issues (3 human issues with LLM coverage)

H1: Ad hoc adjustments need standardized decision rules ↔︎ L2, L4, L5, L7, L8, L9, L10, L11, L12 70%
Both critiques point to numerous discretionary, project-specific adjustments that materially affect results. The LLM enumerates several such modeling/discounting/weighting choices, aligning with the human request for explicit, consistent rules governing adjustments.
👤 Human expert critique text
  1. “Lack of Systematized/Standardized Adjustment Rules” — Both evaluators noted multiple “project-specific” and “subjective” analytical choices and ad-hoc adjustments. E2 particularly emphasized the approach to weighting and outliers. E1 — “I would be substantially more confident in the comparative cost-effectiveness estimates if the authors provided a clear set of rules for the adjustments they are making, along with a list of these adjustments to be applied consistently across all evaluated projects.” 2. Clearer/visual depiction of sensitivity analysis/multiverse analysis — Both evaluators requested a concise visual summary (such as a specification curve or analogue of Figure 2) showing how the large number of analytical choices and adjustments systematically affect the final estimates. E2 called this their “most significant statistical recommendation”.
🤖 Matched LLM issue text
L2: Handling of influential long-term follow-ups is ad hoc (exclude from core time model, then apply a 1.54 multiplicative compromise); this is a structural modeling choice without clear precedent and materially affects results.

L4: Charity M&E pre–post effects are “corrected” using a pseudo-synthetic control from other trials’ control arms; exchangeability is not established, so residual bias (regression to mean, measurement context, incentives, selection/attrition) could be large.

L5: Subjective weights across evidence sources (general RCTs vs charity-related RCTs vs M&E) are central to the headline estimates; the procedure is transparent but remains a major source of discretion, especially for StrongMinds where the RCT conflicts with other evidence.

L7: Dosage adjustment for Friendship Bench is driven by extremely low reported attendance (1.12 sessions) and a chosen log(+1) dose-response form; empirical support for this functional form and parameterization is limited.

L8: The “Iran study” adjustment treats larger effects as likely biased based on contextual suspicion; while potentially prudent, it risks discarding true contextual heterogeneity and is not validated against independent indicators of study quality.

L9: Publication-bias adjustment uses an average of multiple correction methods (some known to perform poorly under heterogeneity); the resulting discount is not clearly tied to the estimand under multilevel dependence and time moderation.

L10: The replication (0.51) and response-bias (0.85) discounts for M&E data are generic and not estimated in the charity context; they may double-count some biases or miss others, and materially alter the M&E contribution.

L11: StrongMinds RCT relevance assessment is plausible but partly speculative (pilot/partner/adolescents/COVID); the decision to downweight rather than treat as a serious negative update is consequential and could be contested.

L12: Cost modeling includes counterfactual partner adjustments for StrongMinds; these assumptions are important to $/WELLBY and would benefit from clearer empirical grounding and uncertainty propagation.

Show detailed discussion
The human experts emphasize the meta-level concern: many adjustments appear subjective and would be more credible if governed by a clearly stated, consistently applied rulebook (including how to treat outliers and weighting). The LLM echoes this by flagging multiple concrete places where judgment calls drive headline estimates—e.g., the long-term follow-up handling (L2), evidence-source weighting (L5), bespoke study/context adjustments (L8, L11), M&E discounts (L10), publication-bias correction choices (L9), and cost-counterfactual assumptions (L12). Where the human critique asks for systematization and a catalog of rules/adjustments, the LLM critique mostly diagnoses the specific adjustment points and why each is consequential. The overlap is strong on “discretion/ad hoc choices,” but the LLM is less explicit about proposing a standardized protocol applied uniformly across projects, hence not a near-perfect match.
H2: Excluding outliers/high-bias data may be arbitrary ↔︎ L2, L8 55%
The human issue targets exclusion rules for outliers and high-RoB studies, arguing for transparent inclusion and sensitivity checks. The LLM raises closely related concerns about discarding/discounting influential evidence (long-term follow-ups in L2; suspicious-context study adjustment in L8) as potentially unjustified or not validated.
👤 Human expert critique text
  1. “Lack of Systematized/Standardized Adjustment Rules” — Both evaluators noted multiple “project-specific” and “subjective” analytical choices and ad-hoc adjustments. E2 particularly emphasized the approach to weighting and outliers. E1 — “I would be substantially more confident in the comparative cost-effectiveness estimates if the authors provided a clear set of rules for the adjustments they are making, along with a list of these adjustments to be applied consistently across all evaluated projects.” 1. “Arbitrary Exclusion of Outliers and High Risk of Bias (RoB) Studies” — Evaluator 2 critiqued the removal of outliers (e.g., effect sizes >2 SDs) and high-risk bias studies, arguing it seems “arbitrary” and that it’s “always better to include the full data” to transparently show the impact of such exclusions.
🤖 Matched LLM issue text
L2: Handling of influential long-term follow-ups is ad hoc (exclude from core time model, then apply a 1.54 multiplicative compromise); this is a structural modeling choice without clear precedent and materially affects results.

L8: The “Iran study” adjustment treats larger effects as likely biased based on contextual suspicion; while potentially prudent, it risks discarding true contextual heterogeneity and is not validated against independent indicators of study quality.

Show detailed discussion
Evaluator 2’s critique is about principled data inclusion: outlier trimming (e.g., >2 SD) and dropping high-RoB studies can look arbitrary, and the preferred remedy is showing results with and without such exclusions. The LLM does not directly discuss a generic outlier rule or systematic high-RoB exclusion, but it does highlight analogous “removal/discounting” decisions that can function like selective exclusion. In L2, the long-term follow-up is effectively sidelined from the core time model and then partially reintroduced via a multiplicative compromise, which parallels concerns about selectively handling influential datapoints. In L8, the adjustment that treats a larger effect as likely biased due to contextual suspicion similarly risks discarding real heterogeneity without independent validation. The match is therefore related-but-not-identical: the LLM critiques particular exclusion-like choices rather than the paper’s explicit outlier/high-RoB exclusion policy.
H4: Linear decay assumption may misstate total effects ↔︎ L1 95%
Both critiques directly challenge the choice of linear decay over time for psychotherapy effects and note that total-effect estimates are highly sensitive to this assumption. The LLM adds the triangular AUC framing and emphasizes nonlinear/long-tail alternatives.
👤 Human expert critique text
  1. Linear decay model — Evaluator 2 expressed surprise at the linear choice, suggesting exponential models are more common in psychological phenomena and that linear decay might lead to “overestimation”.
🤖 Matched LLM issue text
L1: Total-effect-over-time relies on a linear decay assumption and triangular area-under-curve; psychotherapy effects may decay nonlinearly or have long tails, making total WELLBYs highly assumption-sensitive.
Show detailed discussion
Evaluator 2 questions why the authors assume linear decay when exponential or other nonlinear patterns are common in psychological phenomena and worries linear decay could overestimate effects. The LLM’s L1 is essentially the same technical concern: a linear decay with triangular area-under-curve can misrepresent persistence (e.g., long tails) and makes total WELLBYs assumption-sensitive. The LLM frames the consequence in terms of integrating effect over time (AUC) and sensitivity of total benefits, while the human critique frames it as model choice plausibility and potential bias direction. Both converge on the need to justify the functional form and/or show robustness across plausible decay models.
⚠️ Unmatched Human Issues (1 not captured by LLM)
H3: Clearer/visual depiction of sensitivity or multiverse analysis (e.g., specification curve).
Why missed: The LLM issues critique many assumption-sensitive choices but do not explicitly request a consolidated visual summary (specification curve/multiverse plot) or emphasize presentation/communication of sensitivity results as a primary recommendation.
📋 Unmatched LLM Issues (2 not in human critique)
L3: Normative/construct validity concerns in WELLBY conversion and combining outcome constructs.
Why extra: Human critiques (as provided) focus on adjustment rules, exclusions, sensitivity visualization, and decay functional form, not on the philosophical/measurement assumptions of mapping SD-years to WELLBYs or mixing MHa with SWB constructs.
L6: Large household spillover component driven by sparse, low-quality evidence.
Why extra: The human issues do not mention spillover estimation quality or the dominance/sparsity of the spillover evidence base, whereas the LLM flags this as a major driver of totals and uncertainty.
📄 View full human critique

Necessary

  1. “Lack of Systematized/Standardized Adjustment Rules” — Both evaluators noted multiple “project-specific” and “subjective” analytical choices and ad-hoc adjustments. E2 particularly emphasized the approach to weighting and outliers. E1 — “I would be substantially more confident in the comparative cost-effectiveness estimates if the authors provided a clear set of rules for the adjustments they are making, along with a list of these adjustments to be applied consistently across all evaluated projects.”

Optional but important

  1. “Arbitrary Exclusion of Outliers and High Risk of Bias (RoB) Studies” — Evaluator 2 critiqued the removal of outliers (e.g., effect sizes >2 SDs) and high-risk bias studies, arguing it seems “arbitrary” and that it’s “always better to include the full data” to transparently show the impact of such exclusions.

  2. Clearer/visual depiction of sensitivity analysis/multiverse analysis — Both evaluators requested a concise visual summary (such as a specification curve or analogue of Figure 2) showing how the large number of analytical choices and adjustments systematically affect the final estimates. E2 called this their “most significant statistical recommendation”.

  3. Linear decay model — Evaluator 2 expressed surprise at the linear choice, suggesting exponential models are more common in psychological phenomena and that linear decay might lead to “overestimation”.

🤖 View all GPT key issues (12)

1. Total-effect-over-time relies on a linear decay assumption and triangular area-under-curve; psychotherapy effects may decay nonlinearly or have long tails, making total WELLBYs highly assumption-sensitive.

2. Handling of influential long-term follow-ups is ad hoc (exclude from core time model, then apply a 1.54 multiplicative compromise); this is a structural modeling choice without clear precedent and materially affects results.

3. The WELLBY conversion (SD-years → WELLBYs via a single “SD=2 on 0–10 ladder” mapping) and combining affective mental health (MHa) with SWB outcomes embed normative/construct assumptions that can shift cross-intervention comparisons.

4. Charity M&E pre–post effects are “corrected” using a pseudo-synthetic control from other trials’ control arms; exchangeability is not established, so residual bias (regression to mean, measurement context, incentives, selection/attrition) could be large.

5. Subjective weights across evidence sources (general RCTs vs charity-related RCTs vs M&E) are central to the headline estimates; the procedure is transparent but remains a major source of discretion, especially for StrongMinds where the RCT conflicts with other evidence.

6. Household spillovers contribute ~30–40% of total benefits but are estimated from very sparse and heterogeneous evidence (few interventions, one dominant study) and acknowledged as very low-quality; the spillover prior strongly affects totals.

7. Dosage adjustment for Friendship Bench is driven by extremely low reported attendance (1.12 sessions) and a chosen log(+1) dose-response form; empirical support for this functional form and parameterization is limited.

8. The “Iran study” adjustment treats larger effects as likely biased based on contextual suspicion; while potentially prudent, it risks discarding true contextual heterogeneity and is not validated against independent indicators of study quality.

9. Publication-bias adjustment uses an average of multiple correction methods (some known to perform poorly under heterogeneity); the resulting discount is not clearly tied to the estimand under multilevel dependence and time moderation.

10. The replication (0.51) and response-bias (0.85) discounts for M&E data are generic and not estimated in the charity context; they may double-count some biases or miss others, and materially alter the M&E contribution.

11. StrongMinds RCT relevance assessment is plausible but partly speculative (pilot/partner/adolescents/COVID); the decision to downweight rather than treat as a serious negative update is consequential and could be contested.

12. Cost modeling includes counterfactual partner adjustments for StrongMinds; these assumptions are important to $/WELLBY and would benefit from clearer empirical grounding and uncertainty propagation.


Peterman_et_al._2025

Coda title: Social Safety Nets, Women’s Economic

Coverage: 40% | Weighted Coverage: 18% | Precision: 18%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 82
Advancing knowledge 83
Claims & evidence 81
Global relevance 92
Logic & communication 76
Methods 78
Open science 64
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

The human critiques focus on ASQ-3 measurement validity, trial reporting, and within-study multiplicity, while the LLM critiques largely target meta-analysis methodology (heterogeneity, RVE, moderator testing, representativeness, open science). Only two human issues have even partial conceptual overlap with the LLM list, yielding low coverage and very low precision.

Interpretation is constrained by apparent topic mismatch: the LLM issues read like a critique of a meta-analysis (social safety nets/agency/cost-benefit, RVE, heterogeneity), whereas the human issues clearly concern an RCT using ASQ-3 in India (validation, cutoffs, blinding, CONSORT). The only meaningful bridges are (a) general measurement validity/context dependence (H1~L9) and (b) general multiple-testing/pre-specification logic (H2~L5). If the intention was to compare critiques of the same paper, the LLM issue set likely corresponds to a different manuscript or an earlier/different evaluation target.

✅ Matched Issues (2 human issues with LLM coverage)

H1: Need context-specific validation of measures ↔︎ L9 35%
H1 asks for evidence the ASQ-3 instrument is validated in India; L9 similarly raises that outcomes can be noisy and context-dependent, requiring stronger measurement-validity discussion.
👤 Human expert critique text
ASQ regional validation: “it would be useful to show that the ASQ3 has been validated in India.”
🤖 Matched LLM issue text

L9: Agency measurement is known to be noisy and context-dependent; the paper notes some limitations (e.g., decision-making ambiguity) but does not fully assess how measurement heterogeneity may attenuate or distort pooled results.

Show detailed discussion
The human critique is concrete and instrument-specific: show that ASQ-3 has been validated in India (or otherwise justify its use). The LLM critique is broader and about measurement heterogeneity and context dependence, but in a different substantive domain (agency measures) rather than ASQ-3. Both are ultimately about construct validity and whether the measurement means the same thing across settings. However, L9 does not explicitly call for regional validation evidence, so the overlap is conceptual rather than direct.
H2: Multiple comparisons and pre-specification concerns ↔︎ L5 55%
H2 criticizes presenting many outcomes/items without multiple-comparison adjustments or a clearly pre-specified plan; L5 similarly flags multiple-testing risks and recommends pre-specification/correction strategies.
👤 Human expert critique text
Lack of MHT comparisons “The authors present results for an overall child development index, domain-specific indices, as well as individual ASQ items. They do this without accounting for multiple comparisons or presenting this as a pre-specified plan” DR: I checked the PAPs (3 pdfs) on NotebookLM and it said “the detailed breakdown into the multiple component indices and individual items… was not explicitly included in the PAP’s definition of the outcome.”
🤖 Matched LLM issue text

L5: Moderator/meta-regression analysis appears underpowered for key subgroups and vulnerable to multiple-testing false positives; a pre-specified moderator hierarchy and/or correction strategy would strengthen inference.

Show detailed discussion
The human critique is focused on within-study reporting: many ASQ domains and individual items are analyzed, raising familywise error concerns and questions about whether the breakdown was pre-specified in a PAP. The LLM critique is focused on meta-analytic moderator/meta-regression testing, but it targets the same statistical vulnerability: exploring many comparisons can create false positives unless hierarchy/corrections are used. Both recommend stronger discipline around what was planned versus explored and how inferential error rates are controlled. The mismatch is that H2 is about outcome multiplicity in a trial paper, while L5 is about multiplicity in subgroup/moderator analysis, so they are related but not the same operational problem.
⚠️ Unmatched Human Issues (3 not captured by LLM)
H3: ASQ-3 is a screening tool; report effects on ‘at-risk’ proportions using cutoffs, not only index scores.
Why missed: No LLM issue discusses using clinically/diagnostically meaningful ASQ-3 cutoffs or re-expressing results as risk/proportion-at-risk rather than continuous indices.
H4: Include a CONSORT flow diagram for participant flow.
Why missed: LLM issues focus on meta-analysis reporting/reproducibility and do not mention CONSORT-style trial reporting elements.
H5: Parent-reported ASQ-3 in an unblinded trial may bias results (demand effects); should be emphasized as a key limitation.
Why missed: LLM issues do not address blinding, respondent-report bias, or demand effects as threats to internal validity in an RCT context.
📋 Unmatched LLM Issues (9 not in human critique)
L1: Very high heterogeneity undermines pooled effect interpretability; need prediction intervals/emphasis on dispersion.
Why extra: Human critiques are about ASQ-3 measurement/analysis choices within a single study, not heterogeneity across a meta-analytic evidence base.
L2: Internal inconsistencies/typos in reported magnitudes and confidence intervals need reconciliation.
Why extra: No human issue raises numerical inconsistencies, traceability to code, or reporting typos.
L3: Outcome harmonization across disparate constructs using standardized effects risks conflating mechanisms; need construct-validity justification.
Why extra: Human critiques are specific to ASQ-3 child development measurement, not harmonizing multiple construct families in a meta-analysis.
L4: Insufficient transparency on robust variance estimation (RVE) settings and sensitivity to within-study correlation assumptions.
Why extra: Human critiques do not mention RVE, dependence among effect sizes, or meta-analytic estimation choices.
L6: Modality comparisons may be confounded by correlated program design features; causal claims should be more cautious.
Why extra: Human issues do not discuss cross-program modality comparisons or confounding among intervention features.
L7: RCT-only inclusion may reduce representativeness for national programs evaluated quasi-experimentally.
Why extra: Human critiques are not about evidence-inclusion criteria or external validity from restricting study designs in a review/meta-analysis.
L8: Regional and modality concentration makes conclusions fragile for underrepresented settings/interventions.
Why extra: Aside from a narrow request for ASQ-3 validation in India, human critiques do not address evidence-base concentration in a synthesis context.
L10: Cost-benefit evidence is sparse and not harmonized; women-specific benefits often omitted.
Why extra: Human critiques do not discuss cost-benefit analysis or value-for-money claims.
L11: Open-science posture incomplete (delayed repository; extraction form not public) limiting reproducibility.
Why extra: Human critiques do not mention data/code sharing, extraction forms, or reproducibility infrastructure.
📄 View full human critique

Optional but important DR — I chose the critiques that seemed important to me, and vetted them quickly in NotebookLM, dropping some that the authors already seem to address, at least in the August version. But I’m not an expert in this area. ASQ regional validation: “it would be useful to show that the ASQ3 has been validated in India.” Lack of MHT comparisons “The authors present results for an overall child development index, domain-specific indices, as well as individual ASQ items. They do this without accounting for multiple comparisons or presenting this as a pre-specified plan” DR: I checked the PAPs (3 pdfs) on NotebookLM and it said “the detailed breakdown into the multiple component indices and individual items… was not explicitly included in the PAP’s definition of the outcome.” “ASQ3 is meant as a screening tool for early childhood development, not a test of child’s abilities. Typically, it relies on cutoffs to determine whether a child is “at risk” of suboptimal development in certain domains. An index score for the ASQ3 were used in the analysis, but it would be useful to see the analysis for differences in proportion of children at risk of developmental delays.”

“Possibly relevant but I’m not sure”: Consort flow diagram — “the study should include a consort flow diagram” I’m not sure if this is desirable practice in this context. I don’t think it’s common practice in the economics literature. “ASQ3 is parent reported. This trial was not blinded, so results could have been biased, which is worth mentioning as a key limitation” They seem to recognize this to an extent, even if it’s not emphasized. At least they discuss the issue of ‘experimental demand effects’ in the appendix and provide evidence suggesting it may not be important.

Limitations for LLM benchmarking: David Reinstein is not an expert on this. Only 1 strong evaluation, and the critiques are not always in the economics wheelhouse.

🤖 View all GPT key issues (11)

1. Extremely high heterogeneity (I² often ~80–95% and τ² > ~0.02) means the pooled mean effect may be a weak policy summary; the paper should more strongly emphasize dispersion/prediction intervals and limits to generalization.

2. Several apparent internal inconsistencies/typos in reported magnitudes and intervals (e.g., public works pooled effect reported with different values; at least one implausible confidence interval) need reconciliation and traceability to code output.

3. Outcome harmonization across very different constructs (savings/assets/expenditures vs autonomy/voice/decision-making) using standardized effects risks conflating mechanisms and complicates interpretation; stronger justification and construct-validity discussion is needed.

4. RVE implementation details are not fully transparent in the main text (e.g., assumed within-study correlation rho, small-sample corrections, sensitivity to rho), which can matter with many dependent effect sizes per study.

5. Moderator/meta-regression analysis appears underpowered for key subgroups and vulnerable to multiple-testing false positives; a pre-specified moderator hierarchy and/or correction strategy would strengthen inference.

6. Modality comparisons (UCT vs CCT vs in-kind, etc.) may be confounded by correlated design features (targeting rules, recipient identity, enforcement intensity, complementary programming, baseline contexts) that are only coarsely measured; causal interpretation of modality differences should be more cautious.

7. Restricting inclusion to RCTs improves internal validity but may reduce representativeness for large-scale national programs commonly evaluated quasi-experimentally; this could bias conclusions about real-world SSN performance.

8. The evidence base is heavily concentrated in certain regions (notably sub-Saharan Africa) and in cash modalities; conclusions about underrepresented modalities (social care, public works, fee waivers) and regions are therefore fragile.

9. Agency measurement is known to be noisy and context-dependent; the paper notes some limitations (e.g., decision-making ambiguity) but does not fully assess how measurement heterogeneity may attenuate or distort pooled results.

10. Cost-benefit evidence is sparse and not harmonized; most studies omit women-specific benefits, making any overall “value for money” implication highly uncertain and potentially a lower bound in unknown ways.

11. Open-science posture is incomplete at present (public repository only “upon acceptance”; extraction form not public), limiting immediate reproducibility and external auditing of coding decisions.


Weaver_et_al._2025

Coda title: Cash Transfers for Child Development: Experimental Evidence from India

Coverage: 80% | Weighted Coverage: 56% | Precision: 50%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 90
Advancing knowledge 89
Claims & evidence 88
Global relevance 95
Logic & communication 90
Methods 85
Open science 62
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

The LLM covers most major human concerns (4/5), especially the key multiple-testing and sanitation-heterogeneity issues, but it misses one entire human request (CONSORT/DAG) and only partially addresses the intervention-context/interacting-programs concern. Conversely, half of the LLM’s points add new (often valuable) critiques not raised by humans, lowering precision against the human issue set.

Match threshold used: match_quality >= 30. H2 is treated as matched but only moderately because LLM addresses messaging/bundling clarity but not the specific concern about concurrent CCTs/health services and resulting interaction effects. H4 is treated as a low-confidence match because it shares the measurement-validity theme but not the breastmilk-specific omission.

✅ Matched Issues (4 human issues with LLM coverage)

H1: Sanitation heterogeneity and multiple-testing risk ↔︎ L2, L7, L8 90%
Both critiques warn that the sanitation-subgroup result is fragile: the LLM highlights post-treatment/non-random sanitation measurement and measurement error, and also flags multiple outcomes/heterogeneities without clear multiple-testing control—directly aligning with the human concern about overstated significance.
👤 Human expert critique text
  1. Subgroup analysis risks overstating statistical significance without adjustments for MHT. Particular signs of this: Tsai notes “‘nutrition only helped for the high-sanitation subgroup’ claim, noting that ‘other studies in LMICs have compared joint nutrition and sanitation interventions to only nutrition interventions directly, and effects on stunting, wasting, and underweight are not consistently better.’” DR: Although the authors couch their language about this as “suggestive evidence” of mediation, they emphasize it quite a lot in the abstract, intro, etc — a reasonable reader is likely to have the takeaway that the treatment had positive impacts on anthropometric outcomes in areas with better sanitation (and that these differences were ‘statistically significant’)
🤖 Matched LLM issue text
L2: Sanitation heterogeneity relies on non-random, post-treatment sanitation measurement; interaction estimates should be presented as suggestive and vulnerable to confounding by correlated area characteristics (health access, markets, disease ecology), despite balance on treatment and residualization checks.

L7: The manuscript analyzes many outcomes and heterogeneities; it should more explicitly define primary families, address multiple-hypothesis testing (beyond indices), and distinguish confirmatory vs exploratory findings.

L8: Post-treatment sanitation index includes respondent perceptions of neighbors and enumerator observations; reliability and potential measurement error (including enumerator subjectivity) deserve more discussion and sensitivity analyses using alternative sanitation proxies.

Show detailed discussion
The human expert’s core worry is interpretive: the paper’s emphasis (abstract/intro) could leave readers believing anthropometric effects are genuinely “statistically significant” in high-sanitation areas, despite subgroup/MHT concerns. The LLM converges on the same vulnerability but frames it more as identification and measurement: sanitation is measured post-treatment and may proxy for correlated area characteristics (L2) and has reliability issues (L8). The LLM also adds a general manuscript-wide multiple-testing critique (L7), which supports the human point that subgroup significance can be overstated without adjustments. Overall, the LLM covers both the statistical-significance overstatement channel (MHT) and a deeper reason the heterogeneity could be spurious (post-treatment, confounded sanitation measure).
H2: Intervention details and messaging/implementation context ↔︎ L1, L6 60%
Both request clearer articulation of what the intervention actually was and how messaging operated; the LLM focuses on bundling/interpretation of “unconditional cash” (L1) and on measuring actual IVR exposure (L6), which partially overlaps with the human request for more messaging/context detail.
👤 Human expert critique text
  1. Subgroup analysis risks overstating statistical significance without adjustments for MHT. Particular signs of this: Tsai notes “‘nutrition only helped for the high-sanitation subgroup’ claim, noting that ‘other studies in LMICs have compared joint nutrition and sanitation interventions to only nutrition interventions directly, and effects on stunting, wasting, and underweight are not consistently better.’” 2. Need more detail on the intervention context: E1 asked for more details on the messaging: “How were participants ‘encouraged’ to use the funds for nutritious foods?” Tsai particularly wanted more information on concurrent conditional cash transfers (CCTs) and health services targeting early childhood, noting that there might be interaction effects.
🤖 Matched LLM issue text
L1: Clarify that the estimated effects are for a bundled intervention (cash delivered to women + nutrition framing/IVR), not a generic income shock; claims about “unconditional cash” should be qualified accordingly.

L6: Evidence that IVR messaging had little effect is indirect (based on having registered a mobile number); stronger tests using call delivery/connection data or instruments for actual exposure would improve interpretability.

Show detailed discussion
The human critique asks for richer contextual detail: what exactly participants were told/encouraged to do with cash, and how concurrent programs (CCTs/health services) might interact with the intervention. The LLM similarly pushes for clearer intervention definition (cash + nutrition framing/IVR rather than a pure income shock) and better evidence on messaging exposure (call connection/delivery), which speaks to the “how did messaging work?” component. However, the LLM does not explicitly raise the possibility of concurrent CCTs/health-service targeting creating interaction effects or confounding the interpretation, which is a central part of the human issue. So the overlap is substantial on messaging/packaging clarity, but incomplete on the broader policy-context/interacting-programs concern.
H3: Multiple comparisons across many indices/outcomes ↔︎ L7 95%
Both critiques directly flag a multiple-hypothesis testing problem arising from many reported indices/items/heterogeneities without clear family definitions or adjustments.
👤 Human expert critique text
  1. MHT issue for multiple indices: E1 “The authors present results for an overall child development index, domain-specific indices, as well as individual ASQ items. They do this without accounting for multiple comparisons or presenting this as a pre-specified plan.” [NotebookLM] — But “The PaPs prioritize specifications and randomization inference for the primary hypotheses, which focus on anthropometric measures”
🤖 Matched LLM issue text
L7: The manuscript analyzes many outcomes and heterogeneities; it should more explicitly define primary families, address multiple-hypothesis testing (beyond indices), and distinguish confirmatory vs exploratory findings.
Show detailed discussion
The human expert points specifically to child development indices and ASQ items being analyzed without MHT correction or clarity that this was pre-specified. The LLM generalizes the same concern across the manuscript—many outcomes and heterogeneities—and calls for explicit primary families and clearer confirmatory vs exploratory labeling. The LLM framing is slightly broader (covering indices “beyond” any single domain), but it squarely covers the human complaint about multiple comparisons. This is one of the cleanest overlaps between the two critiques.
H4: Diet measurement limitations (including missing components) ↔︎ L5 35%
The human issue about excluding breastmilk from child consumption is a specific measurement limitation; the LLM raises broader dietary recall/portion/allocation measurement error concerns that relate only indirectly.
🤖 Matched LLM issue text

L5: Dietary intake measurement, while unusually detailed, still depends on 24-hour recall and complex allocation/portion measurement; the paper could better assess robustness to measurement error and potential differential misreporting.

Show detailed discussion
The human critique is narrowly about an omitted intake source (breastmilk) and how that affects child food consumption accounting. The LLM does not mention breastmilk specifically, but it does question the robustness of dietary measurement based on 24-hour recall and complex allocation/portioning, including potential differential misreporting. Conceptually, both are “diet measurement validity” issues, but they point to different failure modes (systematic omission vs recall/measurement error). Because the LLM doesn’t directly engage the breastmilk omission, the match is partial rather than strong.
⚠️ Unmatched Human Issues (1 not captured by LLM)
H5: Request for CONSORT-style flow chart and/or DAG
Why missed: None of the LLM issues explicitly mention adding a CONSORT diagram or a DAG as reporting/clarity improvements; the LLM focused more on identification, measurement, and inference concerns than on these specific presentation standards.
📋 Unmatched LLM Issues (6 not in human critique)
L3: COVID-truncated year-2 subsample and mode/composition changes
Why extra: Human critiques did not raise concerns about COVID-driven sampling/mode changes affecting year-to-year comparability.
L4: Parent-reported ASQ outcomes susceptible to reporting/demand effects
Why extra: Humans flagged multiple testing around ASQ items but did not emphasize measurement bias from parent reporting or experimenter demand.
L9: Non-causal interpretation of “explained share” mediation-style correlations
Why extra: Human issues focused on subgroup/MHT and context details, not on the causal interpretability of correlation-based channel decomposition.
L10: Year-1 non-food spending missing due to survey form error
Why extra: Humans did not mention this data gap or its implications for welfare/budget-share interpretation.
L11: External validity limits (Jharkhand/ICDS registrants; wealth differences)
Why extra: Humans did not discuss generalizability beyond the study setting or sampling frame.
L12: Replication readiness (code/data availability; construction scripts)
Why extra: Humans did not raise reproducibility, data/code access, or index-construction transparency as a critique.
📄 View full human critique

Optional but important

  1. Subgroup analysis risks overstating statistical significance without adjustments for MHT. Particular signs of this: Tsai notes “‘nutrition only helped for the high-sanitation subgroup’ claim, noting that ‘other studies in LMICs have compared joint nutrition and sanitation interventions to only nutrition interventions directly, and effects on stunting, wasting, and underweight are not consistently better.’”

DR: Although the authors couch their language about this as “suggestive evidence” of mediation, they emphasize it quite a lot in the abstract, intro, etc — a reasonable reader is likely to have the takeaway that the treatment had positive impacts on anthropometric outcomes in areas with better sanitation (and that these differences were ‘statistically significant’)

  1. Need more detail on the intervention context: E1 asked for more details on the messaging: “How were participants ‘encouraged’ to use the funds for nutritious foods?” Tsai particularly wanted more information on concurrent conditional cash transfers (CCTs) and health services targeting early childhood, noting that there might be interaction effects.

Unsure/maybe less important

  1. MHT issue for multiple indices: E1 “The authors present results for an overall child development index, domain-specific indices, as well as individual ASQ items. They do this without accounting for multiple comparisons or presenting this as a pre-specified plan.” [NotebookLM] — But “The PaPs prioritize specifications and randomization inference for the primary hypotheses, which focus on anthropometric measures”

  2. “Exclusion of breastmilk from child food consumption” accounting. NotebookLM suggests that the authors recognize this limitation.

  3. Both evaluators mentioned including flow charts: a consort diagram (evaluator 1) and a DAG (evaluator 2). DR: I’m not sure whether this is an important standard?

🤖 View all GPT key issues (12)

1. Clarify that the estimated effects are for a bundled intervention (cash delivered to women + nutrition framing/IVR), not a generic income shock; claims about “unconditional cash” should be qualified accordingly.

2. Sanitation heterogeneity relies on non-random, post-treatment sanitation measurement; interaction estimates should be presented as suggestive and vulnerable to confounding by correlated area characteristics (health access, markets, disease ecology), despite balance on treatment and residualization checks.

3. Year-2 calorie/nutrient outcomes are based on a COVID-truncated in-person subsample; comparisons across years (e.g., “empowerment grows over time”) may be partly driven by sample composition and survey mode changes.

4. Child functional development is measured via parent-reported ASQ items; differential reporting/experimenter-demand effects cannot be fully ruled out without more objective assessor-based measures or validation subsamples.

5. Dietary intake measurement, while unusually detailed, still depends on 24-hour recall and complex allocation/portion measurement; the paper could better assess robustness to measurement error and potential differential misreporting.

6. Evidence that IVR messaging had little effect is indirect (based on having registered a mobile number); stronger tests using call delivery/connection data or instruments for actual exposure would improve interpretability.

7. The manuscript analyzes many outcomes and heterogeneities; it should more explicitly define primary families, address multiple-hypothesis testing (beyond indices), and distinguish confirmatory vs exploratory findings.

8. Post-treatment sanitation index includes respondent perceptions of neighbors and enumerator observations; reliability and potential measurement error (including enumerator subjectivity) deserve more discussion and sensitivity analyses using alternative sanitation proxies.

9. The paper infers that intermediate-channel changes (knowledge, empowerment, AWC services) explain little via cross-sectional correlations; this “explained share” approach is not causal and may be misleading if interpreted too literally.

10. Non-food spending was not measured in year 1 due to a survey form error; this limits the welfare/accounting interpretation (budget shares, non-food mechanisms) during the key period of transfer receipt.

11. External validity beyond Jharkhand and beyond ICDS-linked registrants is good but not complete; the sample is slightly wealthier than the state average and depends on engagement with AWCs, which may differ in other states/settings.

12. Replication readiness is unclear from the manuscript text excerpt: despite registry/PAPs, the paper should state whether code/data (or restricted-access procedures) are available and provide full construction scripts for complex nutrition and sanitation indices.


Williams_et_al._2024

Coda title: Global potential for natural regeneration in deforested tropical regions

Coverage: 88% | Weighted Coverage: 56% | Precision: 67%

📊 Human vs LLM ratings for this paper
Criterion Human (avg) LLM (mid)
Overall assessment 63 87
Advancing knowledge 67 85
Claims & evidence 48 82
Global relevance 81 92
Logic & communication 68 90
Methods 43 75
Open science 64 62
Real-world relevance 75
Human = avg of evaluator midpoints; LLM = model midpoint rating

LLM Assessment Summary

The LLM captures most major human concerns (7/8), including confounding and label omission, and partially captures data leakage and label-definition problems. It misses one prominent human-identified flaw: the specific logical error of validating a 2030 prediction against historical (2000–2016) outcomes. Several LLM points extend beyond the human critique (calibration, carbon downscaling, misuse risk, reproducibility), lowering precision but adding relevant methodological depth.

Match-quality threshold used for coverage: >=30%. Some matches are partial because the LLM addresses the same broad failure mode (e.g., temporal inconsistency) but not the exact human framing (e.g., predictors being influenced by regeneration itself). Precision counts an LLM issue as ‘matched’ if it supports at least one human issue at >=30% quality; by that rule, 8/12 LLM issues align with human critiques.

✅ Matched Issues (7 human issues with LLM coverage)

H1: Temporal leakage / contemporaneous predictors in training ↔︎ L7, L8 75%
Both critiques flag problematic time alignment between predictors and the 2000–2016 regrowth outcome, which can inflate apparent performance. The LLM frames it as temporal inconsistency and mixed time windows rather than explicitly as post-outcome leakage.
👤 Human expert critique text
  1. Data leakage/contemporaneous predictors [mentioned: both evaluators and UJ field specialists] — The ML model seems to be fitted on data/variables that includes post-regeneration outcomes, or that could be influenced by these outcomes; or “using future information to predict past events, or accidentally including validation data in the training data.” This can lead to overly optimistic estimates of model performance and poor generalization to new data.
🤖 Matched LLM issue text
L7: Predictor rasters have mixed spatial/temporal resolutions (30 m to ~1 km; climate normals 1970–2000; NPP 2000–2015; burn area 2001–2017), yet outputs are presented at 30 m; this can create visually fine-scale artifacts and overconfidence in local precision.

L8: Updating key forest covariates to 2018 while training on 2000–2016 outcomes introduces temporal inconsistency; combined with known version differences in GFW products, this may yield local incoherence and affect transfer to 2015/2030 predictions.

Show detailed discussion
The human reviewers argue that key predictors (e.g., NPP, burned area, road density) are measured during the same window as the outcome and may be influenced by regeneration, creating leakage and optimistic accuracy. The LLM similarly notes predictors spanning 2000–2015/2017 and other time-window mismatches, plus inconsistencies from updating covariates to later years. However, the LLM emphasizes resolution/temporal-mismatch artifacts and transfer incoherence, while the human critique emphasizes the causal direction problem (predictors containing information affected by the outcome). Substantively, both point to the same risk: inflated in-sample validation and degraded real-world generalization due to temporally entangled inputs.
H2: Outdated / inconsistent forest-change product versions ↔︎ L8 50%
Both mention that product/version choices in GFW/GFC-derived inputs can affect results and comparability over time. The human critique focuses on newer improved datasets (e.g., GEDI-informed updates), while the LLM highlights version/temporal inconsistency more generally.
🤖 Matched LLM issue text

L8: Updating key forest covariates to 2018 while training on 2000–2016 outcomes introduces temporal inconsistency; combined with known version differences in GFW products, this may yield local incoherence and affect transfer to 2015/2030 predictions.

Show detailed discussion
The human issue is specifically that the study relies on an older GFC gain label layer (v1.5) and should consider newer updates that may materially change training labels and conclusions. The LLM does not explicitly call for switching to the latest gain dataset or cite GEDI-based improvements, but it does warn that known version differences in GFW products and updating covariates to 2018 can yield incoherent signals and affect transfer to future predictions. Thus, both are concerned that dataset vintage/versioning undermines validity, but they stress different remedies: the humans want updated gain labels; the LLM stresses temporal consistency and comparability.
H3: Socioeconomic confounding undermines “biophysical potential” claims ↔︎ L3, L9 90%
Both critiques argue that observational patterns conflate biophysical suitability with human land-use/governance, so outputs cannot be interpreted as purely biophysical potential. The LLM explicitly flags omitted-variable bias/proxying when socioeconomic covariates are excluded from prediction.
👤 Human expert critique text
  1. “Confounding by Socioeconomic Factors and Predictor Choice:” [Both evaluators agreed] While the authors train models with biophysical and socioeconomic predictors, their spatial predictions are derived from a model that includes only biophysical variables. Inferring purely biophysical potential from observational data is not feasible with the proposed method. The difficulty lies in the fact that biophysical and socioeconomic conditions are deeply interconnected. In practice, the model will predict high regeneration probability in locations where both factors are favorable. The proposed method cannot support the interpretation of the estimates as ‘purely biophysical potential for natural forest regrowth’.
🤖 Matched LLM issue text
L3: “Non-regeneration” points are sampled from a broad domain that likely includes lands under active management or repeated clearing; thus the model mixes biophysical capacity, human land-use decisions, and detectability rather than isolating biophysical potential.

L9: Dropping socioeconomic covariates for prediction stability may increase omitted-variable bias, with biophysical variables acting as proxies for land-use intensity/governance; this complicates the “biophysical conditions can support” interpretation.

Show detailed discussion
The human reviewers emphasize that biophysical and socioeconomic conditions are intertwined and that using a biophysical-only prediction model cannot support the interpretation of ‘pure biophysical potential.’ The LLM makes the same point via two angles: (i) negatives and the domain include actively managed lands, so the model learns land-use decisions as much as ecology; and (ii) dropping socioeconomic covariates causes omitted-variable bias, with biophysical variables acting as proxies for governance/pressure. The human critique is framed as an interpretation/identification problem (“you cannot infer pure biophysical potential”), while the LLM frames it as a statistical bias/problem setup issue. These are essentially the same concern, expressed with different causal/statistical language.
H4: Outcome definition misses important regrowth processes ↔︎ L1 45%
Both raise that the chosen regrowth label misses meaningful forms of recovery. The human critique focuses on intensive-margin recovery within existing forests, whereas the LLM focuses on early/slow regeneration and persistence limitations.
👤 Human expert critique text
  1. Data leakage/contemporaneous predictors [mentioned: both evaluators and UJ field specialists] — The ML model seems to be fitted on data/variables that includes post-regeneration outcomes, or that could be influenced by these outcomes; or “using future information to predict past events, or accidentally including validation data in the training data.” This can lead to overly optimistic estimates of model performance and poor generalization to new data. Here, the issue is the inclusion of “Net primary production, burned area, road density, and soil characteristics were recorded over the same time period as the outcome, and these predictors incorporate information that is influenced by regeneration itself.”
🤖 Matched LLM issue text
L1: The target variable (“natural regrowth” as >5 m vegetation gain from 2000–2016) is a partial proxy for restoration success and likely misses early regeneration and slow-growing systems; it also does not ensure persistence beyond 2016.
Show detailed discussion
The human issue is that the study only captures extensive-margin gain (new forest pixels) and ignores intensive-margin recovery (canopy densification/regrowth within existing forest). The LLM similarly critiques the target definition as an imperfect proxy that can miss early regeneration and slow-growing ecosystems and may not ensure persistence beyond 2016. These overlap in the general idea that “regrowth” is incompletely operationalized, but they are not the same measurement gap: intensive-margin recovery is specifically about within-forest changes rather than new forest appearance. Hence the match is partial: both identify label incompleteness, but for different missing components.
H6: Label definition of “natural regrowth” is problematic ↔︎ L1, L2 45%
Both critiques question whether the label truly represents the intended concept of natural regeneration. The LLM focuses on proxy/measurement limitations and label error, while the human critique focuses on conceptual misclassification (natural vs human-assisted).
👤 Human expert critique text
  1. Definition of Natural Regrowth (Inherited from Fagan et al.) [Cloud]: The model relies on Fagan et al.’s definition, which treats all non-plantation gain as “natural regrowth,” conflating truly natural processes with human-assisted regeneration.
🤖 Matched LLM issue text
L1: The target variable (“natural regrowth” as >5 m vegetation gain from 2000–2016) is a partial proxy for restoration success and likely misses early regeneration and slow-growing systems; it also does not ensure persistence beyond 2016.

L2: The upstream regrowth map used for labels has acknowledged humid-biome omission and low producer’s accuracy for area-adjusted estimates, creating systematic false negatives that can bias model training and geographic patterns.

Show detailed discussion
The human reviewer concern is definitional: inheriting Fagan et al.’s label may conflate natural processes with human-assisted regeneration (anything non-plantation counted as “natural”). The LLM does not explicitly discuss assisted vs natural categories, but it does argue the target is only a proxy for restoration success and that upstream labels have systematic omission/accuracy problems that can bias training. Conceptually, both are about construct validity: what the label actually measures versus what the paper claims. The human critique is about mislabeling the mechanism/type of regrowth; the LLM is about measurement error and proxy validity more broadly.
H7: Omission error in labels inflates area potential ↔︎ L2, L6 92%
Both identify that the upstream regrowth map has high omission/low producer’s accuracy, which biases training and can overstate available/potential area. The LLM explicitly notes systematic false negatives and missing uncertainty propagation.
👤 Human expert critique text
  1. Overestimation due to Omission Error (Inherited from Fagan et al.): Fagan et al. reported very high omission errors for regrowth (low area-based Producer’s Accuracy of ~18.7%). The Williams et al. potential area (215 Mha) includes already-regrown areas, substantially overestimating the area truly available for future regeneration.
🤖 Matched LLM issue text
L2: The upstream regrowth map used for labels has acknowledged humid-biome omission and low producer’s accuracy for area-adjusted estimates, creating systematic false negatives that can bias model training and geographic patterns.

L6: The extremely narrow confidence intervals on the headline area estimates are not credible as full uncertainty bounds because they omit major sources of uncertainty (label noise, transfer/non-stationarity, predictor error/resolution mismatch, and model specification uncertainty).

Show detailed discussion
The human critique ties directly to Fagan et al.’s reported low producer’s accuracy (~18.7%) and argues this leads to overestimation of the ‘available’ area because already-regrown areas may be miscounted or misrepresented. The LLM similarly highlights that the label layer has systematic false negatives (omission), which will bias model training and spatial patterns, and further notes that such label noise is not reflected in uncertainty intervals. The two critiques align closely on the technical failure mode (label omission) and its downstream consequence (biased area estimates). The LLM extends the point by connecting it to uncertainty accounting, while the human critique emphasizes inherited error from the benchmark dataset.
H8: Headline area estimates lack credible validation/uncertainty ↔︎ L5, L6 55%
Both raise that the headline area number is not adequately supported by validation against historical benchmarks or by realistic uncertainty. The LLM focuses on missing uncertainty sources and over-optimistic validation; the human critique emphasizes the discrepancy with Fagan et al. and the absence of a hindcast.
👤 Human expert critique text
  1. Data leakage/contemporaneous predictors [mentioned: both evaluators and UJ field specialists] — The ML model seems to be fitted on data/variables that includes post-regeneration outcomes, or that could be influenced by these outcomes; or “using future information to predict past events, or accidentally including validation data in the training data.” This can lead to overly optimistic estimates of model performance and poor generalization to new data. 2. “Confounding by Socioeconomic Factors and Predictor Choice:” [Both evaluators agreed] While the authors train models with biophysical and socioeconomic predictors, their spatial predictions are derived from a model that includes only biophysical variables. Inferring purely biophysical potential from observational data is not feasible with the proposed method. The difficulty lies in the fact that biophysical and socioeconomic conditions are deeply interconnected. In practice, the model will predict high regeneration probability in locations where both factors are favorable. The proposed method cannot support the interpretation of the estimates as ‘purely biophysical potential for natural forest regrowth’.
🤖 Matched LLM issue text
L5: . Validation uses random splits with a post hoc autocorrelation-distance diagnostic but not fully spatially blocked cross-validation; performance may be optimistic and appears regionally variable (notably weaker in parts of Southeast Asia).

L6: . The extremely narrow confidence intervals on the headline area estimates are not credible as full uncertainty bounds because they omit major sources of uncertainty (label noise, transfer/non-stationarity, predictor error/resolution mismatch, and model specification uncertainty).

Show detailed discussion
The human reviewers stress that the 215 Mha estimate is far larger than Fagan et al.’s ~31.6 Mha benchmark and that the authors fail to report a hindcast for 2000–2016 to check consistency. The LLM does not name that specific discrepancy, but it argues the reported confidence intervals are implausibly narrow because they omit major uncertainties (label noise, non-stationarity, predictor error, specification uncertainty), and it warns that non-spatially-blocked validation can be optimistic. Both are ultimately challenging evidentiary support for the headline magnitude, but they differ in emphasis: humans want a concrete historical back-test against the benchmark; the LLM critiques the general uncertainty/validation framework.
⚠️ Unmatched Human Issues (1 not captured by LLM)
H5: Validation compares 2030 predictions to 2000–2016 outcomes (not true future validation).
Why missed: The LLM critique focuses on cross-validation design (random vs spatial blocking) rather than the specific logical mismatch of ‘validating’ a 2030 map using historical regrowth labels from 2000–2016.
📋 Unmatched LLM Issues (4 not in human critique)
L4: Limited calibration/threshold analysis despite probability-to-area aggregation.
Why extra: Human critiques did not focus on calibration, threshold choice, or precision–recall tradeoffs as a central methodological risk.
L10: Carbon layer resampling from 1 km to 30 m without uncertainty/covariance handling.
Why extra: Human critiques centered on regeneration mapping/validation rather than the downstream carbon accumulation estimation workflow.
L11: Risk of policy/operational misuse; need clearer guidance on additionality/permanence.
Why extra: Human critiques were primarily technical (data leakage, confounding, validation) rather than interpretability/communications risks for end users.
L12: Code availability only on request limits reproducibility/auditing.
Why extra: Human critiques did not explicitly raise transparency/reproducibility as a key issue.
📄 View full human critique

Necessary

  1. Data leakage/contemporaneous predictors [mentioned: both evaluators and UJ field specialists] — The ML model seems to be fitted on data/variables that includes post-regeneration outcomes, or that could be influenced by these outcomes; or “using future information to predict past events, or accidentally including validation data in the training data.” This can lead to overly optimistic estimates of model performance and poor generalization to new data.

Here, the issue is the inclusion of “Net primary production, burned area, road density, and soil characteristics were recorded over the same time period as the outcome, and these predictors incorporate information that is influenced by regeneration itself.”

[E1] The outcome variable indicates patches of natural forest regeneration which occurred between 2000–2012 and persisted to 2016. At least three of the predictor variables seen by the training algorithm are also recorded during this period rather than prior to it: net primary production, burned area, and road density. In addition, the authors use variables from the ESA CCI land cover dataset as recorded in the year 2000, overlapping with the outcome.

Optional but important

  1. Reliance on Older GFC Gain Data: [Cloud] The study relies on an older version of the Global Forest Change (GFC) gain data (v1.5, covering 2000-2012/2016). Since the initial analysis for Fagan et al. (2022), improved GFC datasets have been released (e.g., GMD 2022 update using GEDI lidar).

  2. “Confounding by Socioeconomic Factors and Predictor Choice:” [Both evaluators agreed] While the authors train models with biophysical and socioeconomic predictors, their spatial predictions are derived from a model that includes only biophysical variables. Inferring purely biophysical potential from observational data is not feasible with the proposed method. The difficulty lies in the fact that biophysical and socioeconomic conditions are deeply interconnected. In practice, the model will predict high regeneration probability in locations where both factors are favorable. The proposed method cannot support the interpretation of the estimates as ‘purely biophysical potential for natural forest regrowth’.

  3. Neglect of Intensive Margin Regrowth: [Cloud] The study focuses exclusively on the extensive margin (new forest area gain). It completely overlooks the intensive margin – canopy regeneration and forest recovery within existing, potentially degraded, forest pixels.

  4. “Validation” uses past data [Both evaluators/consensus] — The confusion matrix appears to validate the model by comparing its predictions for 2030 against the Fagan et al. (2022) regrowth data for 2000-2016. Validating a model designed to predict future potential against past outcomes is not a true test of its predictive power for the future.

Unsure if correct/relevant

  1. Definition of Natural Regrowth (Inherited from Fagan et al.) [Cloud]: The model relies on Fagan et al.’s definition, which treats all non-plantation gain as “natural regrowth,” conflating truly natural processes with human-assisted regeneration.

  2. Overestimation due to Omission Error (Inherited from Fagan et al.): Fagan et al. reported very high omission errors for regrowth (low area-based Producer’s Accuracy of ~18.7%). The Williams et al. potential area (215 Mha) includes already-regrown areas, substantially overestimating the area truly available for future regeneration.

  3. Discrepancy in Estimated Magnitude and Lack of Historical Validation: [Both evaluators] The authors’ estimate of 215 Mha of expected natural regeneration is strikingly larger than the benchmark from Fagan et al. (2022) of 31.6 Mha ± 11.9 Mha (for 2000-2016). Williams et al. do not report their own model’s prediction for the 2000-2016 historical period (a “hindcast”), a crucial missing validation against this benchmark.

🤖 View all GPT key issues (12)

1. The target variable (“natural regrowth” as >5 m vegetation gain from 2000–2016) is a partial proxy for restoration success and likely misses early regeneration and slow-growing systems; it also does not ensure persistence beyond 2016.

2. The upstream regrowth map used for labels has acknowledged humid-biome omission and low producer’s accuracy for area-adjusted estimates, creating systematic false negatives that can bias model training and geographic patterns.

3. “Non-regeneration” points are sampled from a broad domain that likely includes lands under active management or repeated clearing; thus the model mixes biophysical capacity, human land-use decisions, and detectability rather than isolating biophysical potential.

4. The paper reports high balanced-class accuracy but provides limited information on calibration and threshold-dependent tradeoffs (precision/recall), which are central when aggregating probabilities into expected areas and when using binary maps for targeting.

5. Validation uses random splits with a post hoc autocorrelation-distance diagnostic but not fully spatially blocked cross-validation; performance may be optimistic and appears regionally variable (notably weaker in parts of Southeast Asia).

6. The extremely narrow confidence intervals on the headline area estimates are not credible as full uncertainty bounds because they omit major sources of uncertainty (label noise, transfer/non-stationarity, predictor error/resolution mismatch, and model specification uncertainty).

7. Predictor rasters have mixed spatial/temporal resolutions (30 m to ~1 km; climate normals 1970–2000; NPP 2000–2015; burn area 2001–2017), yet outputs are presented at 30 m; this can create visually fine-scale artifacts and overconfidence in local precision.

8. Updating key forest covariates to 2018 while training on 2000–2016 outcomes introduces temporal inconsistency; combined with known version differences in GFW products, this may yield local incoherence and affect transfer to 2015/2030 predictions.

9. Dropping socioeconomic covariates for prediction stability may increase omitted-variable bias, with biophysical variables acting as proxies for land-use intensity/governance; this complicates the “biophysical conditions can support” interpretation.

10. Carbon accumulation estimates rely on resampling a 1 km carbon potential layer to 30 m with implicit within-km homogeneity and without propagating uncertainty or accounting for covariance between regeneration probability and carbon potential.

11. The map is framed as a first-stage input, but discussion of offsets/targeting could be misread as identifying realizable, additional, and permanent sequestration opportunities; clearer operational guidance on use and limits would reduce misuse.

12. Code is only available “on request,” limiting reproducibility and independent auditing of modelling choices (variable selection, sampling, preprocessing, and aggregation).


LLM-Based Assessment

The comparison between GPT key issues and human critiques was assessed using GPT-5.2 Pro, which evaluated coverage (the proportion of human concerns GPT identified) and precision (the proportion of GPT issues that match a substantive human concern).

  • Coverage: Proportion of consensus human issues that have any LLM match (match quality ≥ 30%)
  • Weighted Coverage: Mean match quality across all human issues (treating unmatched issues as 0%)
  • Precision: Proportion of LLM issues that match a human concern

Note: Any interpretation or narrative commentary in this section is written by an LLM (Codex). The values themselves come from the GPT-5.2 Pro comparison outputs.

Caveat on Precision

“Precision” only reflects whether LLM issues match the consensus human issues curated in Coda—those prioritized by evaluation managers as the most important concerns. Many LLM-identified issues may have been noted by one or more individual human evaluators but were not included in this curated set. A low precision score does not necessarily mean the LLM raised irrelevant concerns; it may simply reflect issues that weren’t prioritized in the consensus summary.

Show code
# Flag indicating usable LLM comparison results: comparison_data must exist,
# cover at least one paper, expose a coverage_pct column, and that column
# must contain at least one non-missing value.
has_llm_results <-
  !is.null(comparison_data) &&
  n_papers > 0 &&
  "coverage_pct" %in% names(comparison_data) &&
  any(!is.na(comparison_data[["coverage_pct"]]))
Show code
if (has_llm_results) {
  # Helper: "min-max" range string for a numeric vector.
  fmt_range <- function(v) {
    paste0(min(v, na.rm = TRUE), "-", max(v, na.rm = TRUE))
  }

  # Keep only papers where both LLM-assessed metrics are present.
  llm_results <- comparison_data |>
    filter(!is.na(coverage_pct), !is.na(precision_pct))

  # One-row summary reshaped into a two-column (Metric, Value) table.
  summary_stats <- llm_results |>
    summarise(
      `Papers Assessed` = n(),
      `Mean Coverage (%)` = round(mean(coverage_pct, na.rm = TRUE), 1),
      `Mean Precision (%)` = round(mean(precision_pct, na.rm = TRUE), 1),
      `Coverage Range` = fmt_range(coverage_pct),
      `Precision Range` = fmt_range(precision_pct)
    ) |>
    mutate(across(everything(), as.character)) |>
    pivot_longer(everything(), names_to = "Metric", values_to = "Value")

  kable(summary_stats, align = c("l", "r"))
}
LLM assessment of GPT vs human critique alignment
Metric Value
Papers Assessed 14
Mean Coverage (%) 74.9
Mean Precision (%) 46.9
Coverage Range 40-100
Precision Range 17-83
Show code
if (has_llm_results) {
  # Papers with both metrics, plus a truncated label for plotting.
  llm_results <- comparison_data |>
    filter(!is.na(coverage_pct), !is.na(precision_pct)) |>
    mutate(paper_short = str_trunc(gpt_paper, 25))

  # Mean reference lines and a shared 0-100 axis break sequence.
  mean_coverage <- mean(llm_results$coverage_pct)
  mean_precision <- mean(llm_results$precision_pct)
  pct_breaks <- seq(0, 100, 20)

  ggplot(llm_results, aes(x = coverage_pct, y = precision_pct)) +
    geom_point(size = 4, color = UJ_ORANGE, alpha = 0.8) +
    geom_text(aes(label = paper_short), hjust = -0.1, vjust = 0.5, size = 2.5, check_overlap = TRUE) +
    geom_vline(xintercept = mean_coverage, linetype = "dashed", color = UJ_BLUE) +
    geom_hline(yintercept = mean_precision, linetype = "dashed", color = UJ_BLUE) +
    scale_x_continuous(limits = c(0, 100), breaks = pct_breaks) +
    scale_y_continuous(limits = c(0, 100), breaks = pct_breaks) +
    labs(
      x = "Coverage (%): Human issues captured by GPT",
      y = "Precision (%): GPT issues that are substantive",
      subtitle = "Dashed lines = means"
    ) +
    theme_uj()
}

Figure 4.1: Coverage vs Precision across papers (LLM-assessed)

Show code
if (has_llm_results) {
  # Per-paper metrics sorted by coverage (highest first); transmute keeps
  # only the display columns, in the order shown.
  ratings_table <- comparison_data |>
    filter(!is.na(coverage_pct)) |>
    transmute(
      Paper = str_trunc(gpt_paper, 35),
      `Coverage (%)` = coverage_pct,
      `Precision (%)` = precision_pct,
      Rating = overall_rating
    ) |>
    arrange(desc(`Coverage (%)`))

  kable(ratings_table, align = c("l", "r", "r", "l"))
}
Table 4.2: Per-paper LLM assessment results
Paper Coverage (%) Precision (%) Rating
Bilal_and_Kaenzig_2024 100 55 Good
Blimpo_and_Castaneda-Dower_2025 100 58 Good
Clancy_2024 100 42 Good
Adena_and_Hager_2024 92 67 Good
Dullaghan_and_Zhang_2022 88 55 Good
Williams_et_al._2024 88 67 Good
Green_et_al._2025 86 58 Good
Weaver_et_al._2025 80 50 Moderate
McGuire_et_al._2024 75 83 Good
Acemoglu_et_al._2024 50 33 Moderate
Benabou_et_al._2023 50 20 Poor
Bruers_2021 50 17 Moderate
Frech_et_al._2023 50 33 Moderate
Peterman_et_al._2025 40 18 Poor
Show code
if (has_llm_results) {
  # Ratings as an ordered factor so empty categories still appear
  # (count(..., .drop = FALSE) keeps zero-count levels).
  rating_levels <- c("Poor", "Moderate", "Good", "Excellent")
  rating_counts <- comparison_data |>
    filter(!is.na(overall_rating)) |>
    mutate(Rating = factor(overall_rating, levels = rating_levels)) |>
    count(Rating, .drop = FALSE)

  # Red-to-green palette matching the rating scale.
  rating_palette <- c(
    Poor = "#e74c3c",
    Moderate = "#f39c12",
    Good = "#27ae60",
    Excellent = "#2ecc71"
  )

  ggplot(rating_counts, aes(x = Rating, y = n, fill = Rating)) +
    geom_col(alpha = 0.9) +
    geom_text(aes(label = n), vjust = -0.3, size = 5) +
    scale_fill_manual(values = rating_palette, drop = FALSE) +
    labs(
      x = "Overall Rating",
      y = "Number of Papers"
    ) +
    theme_uj() +
    theme(legend.position = "none")
}

Figure 4.2: Distribution of overall ratings across papers

  • Coverage: Percentage of human-identified issues that GPT also captured (in some form). Higher = GPT missed fewer human concerns.
  • Precision: Percentage of GPT issues that are substantive rather than generic or spurious. Higher = GPT’s critiques are more targeted.
  • Overall Rating: Qualitative assessment (Excellent/Good/Moderate/Poor) based on both coverage and precision metrics.

Summary and Observations

Show code
if (!is.null(comparison_data) && n_papers > 0) {

  # Per-paper count of GPT key issues alongside the length of the human
  # (Coda) critique, used by the distribution and length-vs-count figures.
  # lengths() is the vectorized, type-stable replacement for
  # sapply(gpt_key_issues, length).
  issue_counts <- comparison_data |>
    mutate(
      num_gpt_issues = lengths(gpt_key_issues),
      paper_short = str_trunc(gpt_paper, 30)
    ) |>
    select(paper_short, num_gpt_issues, coda_critique_length)

  # Flatten the per-paper parsed human issues into one (paper_id, severity)
  # data frame. vapply() enforces a length-1 character severity per issue —
  # the same assumption sapply's silent simplification relied on, now made
  # explicit (it errors loudly instead of returning a list).
  all_human_issues <- do.call(rbind, lapply(names(parsed_issues_list), function(paper_id) {
    issues <- parsed_issues_list[[paper_id]]
    if (is.null(issues) || length(issues) == 0) return(NULL)
    data.frame(
      paper_id = paper_id,
      severity = vapply(issues, function(x) x$severity, character(1)),
      stringsAsFactors = FALSE
    )
  }))

  # Normalize free-text severity labels into four ordered categories for
  # plotting; .drop = FALSE keeps zero-count categories visible.
  if (!is.null(all_human_issues) && nrow(all_human_issues) > 0) {
    severity_dist <- all_human_issues |>
      mutate(
        severity = case_when(
          grepl("necessary", tolower(severity)) ~ "Necessary",
          grepl("optional", tolower(severity)) ~ "Optional",
          severity != "" ~ "Unsure",
          TRUE ~ "Unclassified"
        ),
        severity = factor(severity, levels = c("Necessary", "Optional", "Unsure", "Unclassified"))
      ) |>
      count(severity, .drop = FALSE)
  }
}

Aggregate Statistics

Show code
if (!is.null(comparison_data) && n_papers > 0) {
  # Totals of parsed human issues and GPT issues across all papers.
  total_human_issues <- if (!is.null(all_human_issues)) nrow(all_human_issues) else 0
  total_gpt_issues <- sum(issue_counts$num_gpt_issues)

  # Severity counts via case-insensitive substring match on the raw labels.
  necessary_count <- if (!is.null(all_human_issues)) sum(grepl("necessary", tolower(all_human_issues$severity))) else 0
  optional_count <- if (!is.null(all_human_issues)) sum(grepl("optional", tolower(all_human_issues$severity))) else 0

  aggregate_stats <- data.frame(
    Metric = c(
      "Papers Compared",
      "Total Human Issues Parsed",
      "  - Necessary",
      "  - Optional",
      "  - Unsure/Other",
      "Total LLM Issues",
      "Avg Human Issues per Paper",
      "Avg LLM Issues per Paper"
    ),
    # Format the column as character: counts as whole numbers, averages with
    # one decimal. A numeric column containing any decimal value made kable
    # print every count as e.g. "14.0".
    Value = c(
      as.character(c(
        n_papers,
        total_human_issues,
        necessary_count,
        optional_count,
        total_human_issues - necessary_count - optional_count,
        total_gpt_issues
      )),
      sprintf("%.1f", total_human_issues / n_papers),
      sprintf("%.1f", total_gpt_issues / n_papers)
    )
  )

  kable(aggregate_stats, align = c("l", "r"))
}
Table 4.3: Aggregate statistics across all papers
Metric Value
Papers Compared 14.0
Total Human Issues Parsed 53.0
- Necessary 7.0
- Optional 37.0
- Unsure/Other 9.0
Total LLM Issues 163.0
Avg Human Issues per Paper 3.8
Avg LLM Issues per Paper 11.6
Show code
if (!is.null(all_human_issues) && nrow(all_human_issues) > 0) {
  # Palette keyed to the normalized severity categories.
  severity_palette <- c(
    Necessary = "#e74c3c",
    Optional = "#f39c12",
    Unsure = "#95a5a6",
    Unclassified = "#bdc3c7"
  )

  ggplot(severity_dist, aes(x = severity, y = n, fill = severity)) +
    geom_col(alpha = 0.9) +
    geom_text(aes(label = n), vjust = -0.3, size = 5) +
    scale_fill_manual(values = severity_palette, drop = FALSE) +
    labs(
      x = "Severity Label",
      y = "Number of Issues"
    ) +
    theme_uj() +
    theme(legend.position = "none")
}

Figure 4.3: Distribution of human issue severity labels

Issue Count Distribution

Show code
if (!is.null(comparison_data) && n_papers > 0) {
  # Mean issue count, used for both the reference line and the subtitle.
  mean_issues <- mean(issue_counts$num_gpt_issues)

  ggplot(issue_counts,
         aes(x = reorder(paper_short, num_gpt_issues), y = num_gpt_issues)) +
    geom_col(fill = UJ_ORANGE, alpha = 0.8) +
    geom_hline(yintercept = mean_issues,
               linetype = "dashed", color = UJ_BLUE, linewidth = 1) +
    coord_flip() +
    labs(
      x = NULL,
      y = "Number of Key Issues",
      subtitle = paste0("Dashed line = mean (", round(mean_issues, 1), " issues)")
    ) +
    theme_uj()
}

Figure 4.4: Number of key issues identified by GPT-5.2 Pro per paper

Coda Critique Length vs GPT Issue Count

Show code
if (!is.null(comparison_data) && n_papers > 0) {
  # Scatter of human critique length vs GPT issue count, with an OLS fit
  # (shaded band = standard error).
  length_vs_count <- ggplot(issue_counts,
                            aes(x = coda_critique_length, y = num_gpt_issues)) +
    geom_point(size = 3, color = UJ_ORANGE, alpha = 0.7) +
    geom_smooth(method = "lm", se = TRUE, color = UJ_BLUE, alpha = 0.2) +
    labs(
      x = "Human Critique Length (characters)",
      y = "GPT-5.2 Pro Issue Count"
    ) +
    theme_uj()

  length_vs_count
}

Figure 4.5: Relationship between human critique detail and GPT issue count

Observable Structural Differences

The following are verifiable structural differences between the two data sources (not assessments of quality or coverage):

Aspect GPT-5.2 Pro Human Expert (Coda)
Format Numbered bullet points (array of strings) Free-form prose
Structure Ordered list (prompted: “most to least important”) Often uses severity labels (“Necessary”, “Optional but important”)
Source attribution None (single model output) Often cites specific evaluators (E1, E2, names)
Length Constrained by prompt (~10-12 issues) Unconstrained, highly variable

Questions for Manual Review

The side-by-side comparisons above are provided for manual expert assessment. Key questions to investigate:

  1. Coverage: What proportion of human-identified issues appear in the GPT output (in some form, to some extent)? (This is the focal question).
  2. Precision: Are GPT issues substantive or does the model identify spurious/generic concerns?
  3. Severity alignment: Does GPT’s importance ordering correlate with human severity labels?
  4. Missed issues: Are there critical human concerns that GPT systematically misses?
  5. Novel issues: Does GPT surface valid concerns that humans overlooked?
Show code
if (has_llm_results) {
  # Emit a Quarto tip callout confirming the LLM assessment has been run.
  writeLines(c(
    "::: {.callout-tip}",
    "## LLM Assessment Complete",
    "",
    "The comparisons above have been assessed using GPT-5.2 Pro. See the [LLM-Based Assessment](#sec-llm-assessment) section for coverage and precision metrics.",
    ":::"
  ))
} else {
  # Emit a warning callout: the page is raw data pending assessment.
  writeLines(c(
    "::: {.callout-warning}",
    "## No Automated Assessment Yet",
    "",
    "This page displays raw data for manual review. The claims about coverage, precision, and alignment **have not been assessed**—they require either:",
    "",
    "- Manual expert review of each paper's comparisons",
    "- LLM-based comparison (run the `key-issues-comparison` chunk in `methods.qmd`)",
    "",
    "Any conclusions about \"high overlap\" or \"systematic differences\" should emerge from that analysis, not be assumed.",
    ":::"
  ))
}
LLM Assessment Complete

The comparisons above have been assessed using GPT-5.2 Pro. See the LLM-Based Assessment section for coverage and precision metrics.

Manual Annotation Tool

To systematically assess concordance between human and LLM critiques, use the Issue Annotation Tool.

To update the annotation data after changes to human critiques or LLM responses:

# Generate annotation data (parses human critiques into individual issues)
python3 tools/build_issue_annotation_data.py

Annotation workflow:

For each human-identified issue:

  1. Match score (0-1): How well do LLM issues capture this concern?

    • 0 = Not addressed at all
    • 0.5 = Partially captured or tangentially related
    • 1 = Fully captured by one or more LLM issues
  2. Confidence (0-1): How certain are you of this assessment?

  3. Context flag: Check this box if the human critique references information the LLM didn’t have (e.g., appendix or preregistration materials that were not shared with the LLM at the time this evaluation was run)

  4. Link to LLM issues: Select which LLM issues (L1, L2, …) correspond to this human issue

  5. Discussion: Explain your reasoning, note partial matches, or flag ambiguities

Export annotations as JSON/CSV for analysis. Annotations are auto-saved to browser localStorage.

Annotation Data Schema

The tool parses human critiques using heuristics:

  • Severity labels normalized to: necessary, optional, unsure
  • Evaluator attributions (E1, E2, DR) preserved in issue text
  • Issue boundaries detected via enumeration, sentence breaks, and section headers

Review parsed issues in the UI and edit/add/remove as needed before annotating.