Setup and libraries
source("setup_params.R")

library("tidyverse")
library("janitor")
library("stringr")
library("here")
library("knitr")
library("kableExtra")
library("ggrepel")
library("scales")
library("jsonlite")
library("purrr")
library("tibble")
library("plotly")

# Theme and colors
UJ_ORANGE <- "#f19e4b"   # LLM primary
UJ_GREEN  <- "#99bb66"   # Human
UJ_BLUE   <- "#4e79a7"

MODEL_COLORS <- c(
  "GPT-5 Pro" = "#f19e4b",
  "GPT-5.2 Pro" = "#c9302c",
  "GPT-4o-mini" = "#74aa9c",
  "Claude Sonnet 4" = "#d4a574",
  "Gemini 2.0 Flash" = "#7986cb",
  "Human" = "#99bb66"
)

theme_uj <- function(base_size = 11) {
  theme_minimal(base_size = base_size) +
    theme(
      panel.grid.minor = element_blank(),
      plot.title.position = "plot",
      legend.position = "bottom"
    )
}

canon_metric <- function(x) dplyr::recode(
  x,
  "advancing_knowledge" = "adv_knowledge",
  "open_science"        = "open_sci",
  "logic_communication" = "logic_comms",
  "global_relevance"    = "gp_relevance",
  "claims_evidence"     = "claims",
  .default = x
)

`%||%` <- function(x, y) if (!is.null(x)) x else y

# Pricing per 1M tokens (USD)
pricing <- tribble(
  ~model,              ~input_per_m, ~output_per_m,
  "GPT-5 Pro",         15.00,        60.00,
  "GPT-5.2 Pro",       15.00,        60.00,
  "GPT-4o-mini",       0.15,         0.60,
  "Claude Sonnet 4",   3.00,         15.00,
  "Gemini 2.0 Flash",  0.075,        0.30
)
Load human evaluation data
# Load human ratings from Unjournal
UJmap <- read_delim("data/UJ_map.csv", delim = ";", show_col_types = FALSE) |>
  mutate(label_paper_title = research, label_paper = paper) |>
  select(c("label_paper_title", "label_paper"))

rsx <- read_csv("data/rsx_evalr_rating.csv", show_col_types = FALSE) |>
  clean_names() |>
  mutate(label_paper_title = research) |>
  select(-c("research"))

research <- read_csv("data/research.csv", show_col_types = FALSE) |>
  clean_names() |>
  filter(status == "50_published evaluations (on PubPub, by Unjournal)") |>
  left_join(UJmap, by = c("label_paper_title")) |>
  mutate(doi = str_trim(doi)) |>
  mutate(label_paper = case_when(
    doi == "https://doi.org/10.3386/w31162" ~ "Walker et al. 2023",
    doi == "doi.org/10.3386/w32728" ~ "Hahn et al. 2025",
    doi == "https://doi.org/10.3386/w30011" ~ "Bhat et al. 2022",
    doi == "10.1093/wbro/lkae010" ~ "Crawfurd et al. 2023",
    TRUE ~ label_paper
  )) |>
  left_join(rsx, by = c("label_paper_title"))

key_map <- research |>
  transmute(label_paper_title = str_trim(label_paper_title), label_paper = label_paper) |>
  filter(!is.na(label_paper_title)) |>
  distinct(label_paper_title, label_paper) |>
  group_by(label_paper_title) |>
  slice(1) |>
  ungroup()

rsx_research <- rsx |>
  mutate(label_paper_title = str_trim(label_paper_title)) |>
  left_join(key_map, by = "label_paper_title", relationship = "many-to-one")

metrics_human <- rsx_research |>
  mutate(criteria = canon_metric(criteria)) |>
  filter(criteria %in% c("overall", "claims", "methods", "adv_knowledge", "logic_comms", "open_sci", "gp_relevance")) |>
  transmute(
    paper = label_paper, criteria, evaluator, model = "Human",
    mid = as.numeric(middle_rating),
    lo = suppressWarnings(as.numeric(lower_ci)),
    hi = suppressWarnings(as.numeric(upper_ci))
  ) |>
  filter(!is.na(paper), !is.na(mid)) |>
  mutate(
    lo = ifelse(is.finite(lo), pmax(0, pmin(100, lo)), NA_real_),
    hi = ifelse(is.finite(hi), pmax(0, pmin(100, hi)), NA_real_)
  ) |>
  mutate(across(c(mid, lo, hi), ~ round(.x, 4))) |>
  distinct(paper, criteria, model, evaluator, mid, lo, hi)

human_avg <- metrics_human |>
  filter(criteria == "overall") |>
  group_by(paper) |>
  summarise(
    human_mid = mean(mid, na.rm = TRUE),
    human_lo = mean(lo, na.rm = TRUE),
    human_hi = mean(hi, na.rm = TRUE),
    n_human = n(),
    .groups = "drop"
  )

# Summary stats for inline use
n_human_papers <- n_distinct(metrics_human$paper)
n_human_evaluators <- n_distinct(metrics_human$evaluator)
Load LLM evaluation data (all models)
model_dirs <- list(
  "gpt5_pro_updated_jan2026" = "GPT-5 Pro",
  "gpt52_pro_focal_jan2026" = "GPT-5.2 Pro",
  "gpt_4o_mini_2024_07_18" = "GPT-4o-mini",
  "claude_sonnet_4_20250514" = "Claude Sonnet 4",
  "gemini_2.0_flash" = "Gemini 2.0 Flash"
)

parse_response <- function(path, model_name) {
  tryCatch({
    r <- jsonlite::fromJSON(path, simplifyVector = FALSE)
    paper <- basename(path) |>
      str_replace("\\.response\\.json$", "") |>
      str_replace_all("_", " ")

    parsed <- NULL
    if (!is.null(r$output_text) && nchar(r$output_text) > 0) {
      parsed <- jsonlite::fromJSON(r$output_text, simplifyVector = TRUE)
    } else if (!is.null(r$parsed)) {
      parsed <- r$parsed
    } else if (!is.null(r$output)) {
      msg <- purrr::detect(r$output, ~ .x$type == "message", .default = NULL)
      if (!is.null(msg) && length(msg$content) > 0) {
        parsed <- jsonlite::fromJSON(msg$content[[1]]$text, simplifyVector = TRUE)
      }
    }

    if (is.null(parsed)) return(NULL)

    metrics <- parsed$metrics
    metric_rows <- list()
    tier_rows <- list()

    for (nm in names(metrics)) {
      if (nm %in% c("tier_should", "tier_will")) {
        tier_rows[[length(tier_rows) + 1]] <- tibble(
          paper = paper, model = model_name, tier_kind = nm,
          score = metrics[[nm]]$score,
          ci_lower = metrics[[nm]]$ci_lower,
          ci_upper = metrics[[nm]]$ci_upper
        )
      } else {
        metric_rows[[length(metric_rows) + 1]] <- tibble(
          paper = paper, model = model_name, metric = nm,
          midpoint = metrics[[nm]]$midpoint,
          lower_bound = metrics[[nm]]$lower_bound,
          upper_bound = metrics[[nm]]$upper_bound
        )
      }
    }

    input_tok <- r$usage$input_tokens %||% r$input_tokens
    output_tok <- r$usage$output_tokens %||% r$output_tokens
    reasoning_tok <- r$usage$output_tokens_details$reasoning_tokens

    list(
      metrics = bind_rows(metric_rows),
      tiers = bind_rows(tier_rows),
      tokens = tibble(
        paper = paper, model = model_name,
        input_tokens = input_tok %||% NA_integer_,
        output_tokens = output_tok %||% NA_integer_,
        reasoning_tokens = reasoning_tok %||% NA_integer_
      ),
      summary = tibble(
        paper = paper, model = model_name,
        assessment_summary = parsed$assessment_summary
      )
    )
  }, error = function(e) NULL)
}

load_all_llm <- function() {
  all_metrics <- list()
  all_tiers <- list()
  all_tokens <- list()
  all_summaries <- list()

  for (dir_name in names(model_dirs)) {
    model_name <- model_dirs[[dir_name]]
    json_dir <- here("results", dir_name, "json")

    if (dir.exists(json_dir)) {
      files <- list.files(json_dir, pattern = "\\.response\\.json$", full.names = TRUE)

      for (f in files) {
        result <- parse_response(f, model_name)
        if (!is.null(result)) {
          all_metrics[[length(all_metrics) + 1]] <- result$metrics
          all_tiers[[length(all_tiers) + 1]] <- result$tiers
          all_tokens[[length(all_tokens) + 1]] <- result$tokens
          all_summaries[[length(all_summaries) + 1]] <- result$summary
        }
      }
    }
  }

  list(
    metrics = bind_rows(all_metrics) |> mutate(criteria = canon_metric(metric)),
    tiers = bind_rows(all_tiers),
    tokens = bind_rows(all_tokens),
    summaries = bind_rows(all_summaries)
  )
}

llm_data <- load_all_llm()
llm_metrics <- llm_data$metrics
llm_tiers <- llm_data$tiers
llm_tokens <- llm_data$tokens
llm_summaries <- llm_data$summaries

# Summary stats for inline use
n_llm_models <- n_distinct(llm_metrics$model)
n_llm_papers <- n_distinct(llm_metrics$paper)
llm_model_names <- unique(llm_metrics$model)
llm_model_list <- paste(llm_model_names, collapse = ", ")

# Papers with both human and LLM data
matched_papers <- intersect(
 unique(metrics_human$paper),
 unique(llm_metrics$paper)
)
n_matched <- length(matched_papers)

# Cost summary
cost_data <- llm_tokens |>
  left_join(pricing, by = "model") |>
  mutate(
    reasoning_tokens = coalesce(reasoning_tokens, 0L),
    cost_usd = (input_tokens * input_per_m + output_tokens * output_per_m) / 1e6
  )

total_cost <- sum(cost_data$cost_usd, na.rm = TRUE)
total_evaluations <- nrow(llm_tokens)

# Per-model summaries
model_summary <- cost_data |>
  group_by(model) |>
  summarise(
    n_papers = n(),
    avg_input = round(mean(input_tokens, na.rm = TRUE)),
    avg_output = round(mean(output_tokens, na.rm = TRUE)),
    avg_reasoning = round(mean(reasoning_tokens[reasoning_tokens > 0], na.rm = TRUE)),
    total_cost = sum(cost_usd, na.rm = TRUE),
    cost_per_paper = mean(cost_usd, na.rm = TRUE),
    .groups = "drop"
  )

We compare LLM-generated research evaluations against human expert reviews from The Unjournal. The analysis covers 60 papers evaluated by 5 frontier LLMs (GPT-5 Pro, GPT-5.2 Pro, GPT-4o-mini, Claude Sonnet 4, Gemini 2.0 Flash); 45 of these papers also have human evaluations, allowing direct comparison. The human evaluations come from 49 expert reviewers across 49 papers in The Unjournal’s database.

Evaluation Overview

The previous Methods chapter explains our process and pipeline.1 Our approach generally mirrors The Unjournal’s rubric and guidelines, requesting a range of percentile metrics as well as normative and predictive ‘journal tier’ metrics.2

Each numeric field carries a midpoint (the model’s median estimate) and a 90% credible interval; wide intervals indicate acknowledged uncertainty.
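
For concreteness, here is a minimal sketch of the shape of one parsed response, written as an R list. The field names follow the parsing code in the setup chunk above (midpoint / lower_bound / upper_bound for percentile metrics; score / ci_lower / ci_upper for tiers); the numbers are invented for illustration.

# Illustrative only: field names mirror parse_response() above; values are made up
example_parsed <- list(
  assessment_summary = "Short diagnostic summary of the paper ...",
  metrics = list(
    overall     = list(midpoint = 72, lower_bound = 58, upper_bound = 84),  # 0-100 percentile
    methods     = list(midpoint = 65, lower_bound = 50, upper_bound = 78),
    tier_should = list(score = 3.6, ci_lower = 3.0, ci_upper = 4.2)         # 0-5 journal tier
  )
)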

Note: Future Models

We are working to extend this comparison to newer frontier models as they become available and funding permits, and to adapt the approach to bespoke agent-based tools such as Refine.ink.

Token Usage and Cost

Token costs are computed from API-reported input and output tokens (GPT-5 Pro runs with reasoning effort set to high). The boxplot below uses a log scale so per-paper costs can be compared across models; reasoning tokens are reported where the API provides them.
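
As a quick arithmetic check of the cost formula (token counts times the per-million prices in the pricing table above), here is the per-paper cost implied by the GPT-5 Pro averages reported in Table 3.1 below:

# Back-of-envelope check using the GPT-5 Pro averages from Table 3.1
# (avg input tokens * input price + avg output tokens * output price) / 1e6
(38516 * 15.00 + 6400 * 60.00) / 1e6
#> [1] 0.96174   # about $0.96 per paper, matching the Cost/paper column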

Show code
if (nrow(llm_tokens) > 0) {

  cost_table <- model_summary |>
    transmute(
      Model = model,
      Papers = n_papers,
      `Avg input` = scales::comma(avg_input),
      `Avg output` = scales::comma(avg_output),
      `Avg reasoning` = ifelse(is.finite(avg_reasoning) & avg_reasoning > 0,
                                scales::comma(avg_reasoning), "—"),
      `Total cost` = scales::dollar(total_cost, accuracy = 0.01),
      `Cost/paper` = scales::dollar(cost_per_paper, accuracy = 0.001)
    ) |>
    arrange(desc(as.numeric(gsub("[$,]", "", `Total cost`))))

  knitr::kable(cost_table, align = c("l", rep("r", 6)))
}
Table 3.1: Token usage and estimated cost per model
Model Papers Avg input Avg output Avg reasoning Total cost Cost/paper
GPT-5 Pro 60 38,516 6,400 4,717 $57.71 $0.962
GPT-5.2 Pro 14 45,143 2,889 707 $11.91 $0.850
Claude Sonnet 4 15 116,386 731 — $5.40 $0.360
GPT-4o-mini 60 71,581 470 — $0.66 $0.011
Gemini 2.0 Flash 59 17,821 1,022 — $0.10 $0.002

Total cost across all 208 LLM evaluations: $75.77.

Show code
if (nrow(cost_data) > 0) {

  ggplot(cost_data, aes(x = reorder(model, cost_usd, FUN = median), y = cost_usd, fill = model)) +
    geom_boxplot(alpha = 0.7, outlier.shape = NA) +
    geom_jitter(width = 0.15, alpha = 0.5, size = 2) +
    scale_fill_manual(values = MODEL_COLORS, guide = "none") +
    scale_y_log10(labels = scales::dollar_format(accuracy = 0.001)) +
    coord_flip() +
    labs(x = NULL, y = "Cost per paper (USD, log scale)") +
    theme_uj()
}
Figure 3.1: Cost per paper by model (log scale)

Interactive Forest Plot: Human vs LLM Ratings

The forest plot below shows overall percentile ratings for each of the 45 papers with matched human and LLM evaluations. Human evaluator ratings are shown in green (individual evaluators as small dots, mean as diamond). LLM ratings from different models are shown with distinct colors.

Interpretation: points show midpoints; horizontal bars show 90% credible intervals.

Hover over any point to see details. Click legend items to show/hide models.

Show code
forest_human <- metrics_human |>
  filter(criteria == "overall") |>
  transmute(paper, model = "Human", evaluator, mid, lo, hi, is_mean = FALSE)

forest_human_means <- human_avg |>
  transmute(paper, model = "Human", evaluator = "Mean",
            mid = human_mid, lo = human_lo, hi = human_hi, is_mean = TRUE)

forest_llm <- llm_metrics |>
  filter(criteria == "overall") |>
  transmute(paper, model, evaluator = model,
            mid = midpoint, lo = lower_bound, hi = upper_bound, is_mean = FALSE)

forest_data <- bind_rows(forest_human, forest_human_means, forest_llm) |>
  filter(paper %in% matched_papers)

if (nrow(forest_data) > 0) {

  paper_order <- forest_data |>
    filter(model == "Human", is_mean) |>
    arrange(desc(mid)) |>
    pull(paper)

  forest_data <- forest_data |>
    mutate(
      paper = factor(paper, levels = rev(paper_order)),
      paper_num = as.numeric(paper),
      hover_text = paste0(
        "<b>", paper, "</b><br>",
        model, ifelse(evaluator != model & !is_mean, paste0(" (", evaluator, ")"), ""), "<br>",
        "Rating: ", round(mid, 1),
        ifelse(!is.na(lo) & !is.na(hi), paste0(" [", round(lo, 1), "–", round(hi, 1), "]"), "")
      )
    )

  p <- plot_ly() |>
    add_segments(x = 25, xend = 25, y = 0.5, yend = length(paper_order) + 0.5,
                 line = list(color = "lightgray", dash = "dot"),
                 showlegend = FALSE, hoverinfo = "none") |>
    add_segments(x = 50, xend = 50, y = 0.5, yend = length(paper_order) + 0.5,
                 line = list(color = "gray", dash = "dot"),
                 showlegend = FALSE, hoverinfo = "none") |>
    add_segments(x = 75, xend = 75, y = 0.5, yend = length(paper_order) + 0.5,
                 line = list(color = "lightgray", dash = "dot"),
                 showlegend = FALSE, hoverinfo = "none")

  models_to_plot <- c("Human", setdiff(unique(forest_data$model), "Human"))

  for (m in models_to_plot) {
    model_data <- forest_data |> filter(model == m)

    if (m == "Human") {
      indiv <- model_data |> filter(!is_mean)
      if (nrow(indiv) > 0) {
        p <- p |> add_trace(
          data = indiv, type = "scatter", mode = "markers",
          x = ~mid, y = ~paper_num,
          marker = list(color = MODEL_COLORS[m], size = 6, opacity = 0.4),
          text = ~hover_text, hoverinfo = "text",
          name = "Human (individual)", legendgroup = "Human"
        )
      }
      means <- model_data |> filter(is_mean)
      if (nrow(means) > 0) {
        p <- p |> add_trace(
          data = means, type = "scatter", mode = "markers",
          x = ~mid, y = ~paper_num,
          error_x = list(type = "data", symmetric = FALSE,
                         arrayminus = ~(mid - lo), array = ~(hi - mid),
                         color = MODEL_COLORS[m], thickness = 1.5),
          marker = list(color = MODEL_COLORS[m], size = 12, symbol = "diamond"),
          text = ~hover_text, hoverinfo = "text",
          name = "Human (mean)", legendgroup = "Human"
        )
      }
    } else {
      p <- p |> add_trace(
        data = model_data, type = "scatter", mode = "markers",
        x = ~mid, y = ~paper_num,
        error_x = list(type = "data", symmetric = FALSE,
                       arrayminus = ~(mid - lo), array = ~(hi - mid),
                       color = MODEL_COLORS[m], thickness = 1.5),
        marker = list(color = MODEL_COLORS[m], size = 10),
        text = ~hover_text, hoverinfo = "text", name = m
      )
    }
  }

  p <- p |> layout(
    xaxis = list(title = "Overall percentile (0–100)", range = c(0, 105), zeroline = FALSE),
    yaxis = list(title = "", ticktext = rev(paper_order), tickvals = seq_along(paper_order), tickmode = "array"),  # tick labels match the reversed factor levels used for paper_num
    legend = list(orientation = "h", y = -0.1, x = 0.5, xanchor = "center"),
    hovermode = "closest",
    margin = list(l = 200)
  )

  p

} else {
  cat("No matched human-LLM data available for forest plot.\n")
}
Figure 3.2: Interactive forest plot: Human and LLM overall ratings by paper

Human vs LLM Scatter Plot

Each point represents a paper, with human mean rating on the x-axis and LLM rating on the y-axis. Points on the diagonal indicate perfect agreement.

Note: human means carry their own sampling variance, so the correlations reported here are bounded above by human inter-rater reliability.
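
One rough way to gauge that bound, under classical measurement-error assumptions (illustrative only, not part of the main analysis): treating the human-human alpha for the overall criterion (about 0.55, Table 3.4 below) loosely as the reliability of a single human rating, even a predictor that tracked the underlying consensus perfectly would correlate with individual noisy ratings at roughly the square root of that reliability.

# Very rough attenuation ceiling (illustrative): reliability ~ 0.55 implies a
# maximum correlation with single noisy human ratings of about sqrt(0.55).
sqrt(0.55)
#> [1] 0.7416198
# Averaging several evaluators raises this ceiling somewhat (Spearman-Brown).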

Show code
scatter_data <- llm_metrics |>
  filter(criteria == "overall") |>
  inner_join(human_avg, by = "paper") |>
  mutate(
    diff = midpoint - human_mid,
    paper_short = str_trunc(paper, 25),
    # Ensure no NA values break ggplotly
    human_lo = coalesce(human_lo, human_mid),
    human_hi = coalesce(human_hi, human_mid),
    lower_bound = coalesce(lower_bound, midpoint),
    upper_bound = coalesce(upper_bound, midpoint)
  ) |>
  filter(!is.na(human_mid), !is.na(midpoint))

if (nrow(scatter_data) > 0) {

  p <- ggplot(scatter_data, aes(
    x = human_mid, y = midpoint, color = model,
    text = paste0(
      "<b>", paper, "</b><br>",
      "Human: ", round(human_mid, 1), " [", round(human_lo, 1), "–", round(human_hi, 1), "]<br>",
      "LLM: ", round(midpoint, 1), " [", round(lower_bound, 1), "–", round(upper_bound, 1), "]<br>",
      "Diff: ", sprintf("%+.1f", diff)
    )
  )) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
    geom_point(size = 3.5, alpha = 0.7) +
    scale_color_manual(values = MODEL_COLORS) +
    coord_cartesian(xlim = c(0, 100), ylim = c(0, 100)) +
    labs(x = "Human mean rating (0–100)", y = "LLM rating (0–100)", color = "Model") +
    theme_uj()

  # Use tryCatch to handle ggplotly errors gracefully
  tryCatch({
    ggplotly(p, tooltip = "text") |>
      layout(
        legend = list(orientation = "h", y = -0.12, x = 0.5, xanchor = "center"),
        hoverlabel = list(bgcolor = "white")
      )
  }, error = function(e) {
    # Fallback to static plot if ggplotly fails
    print(p)
    message("Note: Interactive plot unavailable, showing static version.")
  })
} else {
  cat("No matching data for scatter plot.\n")
}
Figure 3.3: Interactive scatter: Human vs LLM overall ratings

Model Agreement Statistics

We assess agreement between LLM and human evaluators using multiple metrics:

  • Pearson r: Linear correlation between ratings
  • Spearman ρ: Rank-based correlation (robust to outliers)
  • Mean bias: Average difference (LLM − Human); positive = LLM rates higher
  • RMSE/MAE: Error magnitude measures
Show code
if (nrow(scatter_data) > 0) {

  agreement_tbl <- scatter_data |>
    group_by(Model = model) |>
    summarise(
      N = n(),
      `Pearson r` = round(cor(human_mid, midpoint, use = "complete.obs"), 3),
      `Spearman ρ` = round(cor(human_mid, midpoint, method = "spearman", use = "complete.obs"), 3),
      `Mean bias` = sprintf("%+.1f", mean(midpoint - human_mid, na.rm = TRUE)),
      RMSE = round(sqrt(mean((midpoint - human_mid)^2, na.rm = TRUE)), 1),
      MAE = round(mean(abs(midpoint - human_mid), na.rm = TRUE), 1),
      .groups = "drop"
    ) |>
    arrange(desc(`Pearson r`))

  knitr::kable(agreement_tbl, align = c("l", rep("r", 6)))
}
Table 3.2: Agreement between each LLM and human mean ratings (overall criterion)
Model N Pearson r Spearman ρ Mean bias RMSE MAE
Gemini 2.0 Flash 44 0.351 0.321 -5.0 14.4 11.6
GPT-5 Pro 45 0.342 0.517 +6.4 15.4 10.3
GPT-5.2 Pro 9 0.333 0.471 -2.7 19.0 14.7
Claude Sonnet 4 10 0.197 0.272 +1.5 9.1 6.3
GPT-4o-mini 45 0.135 0.116 +0.0 20.2 13.1

Inter-Rater Agreement (Krippendorff’s Alpha)

Krippendorff’s alpha measures agreement among multiple raters, accounting for chance agreement. Values above 0.8 indicate good agreement; 0.67–0.8 is acceptable for exploratory research. Because human evaluators themselves disagree, human-human agreement puts a practical ceiling on the alpha any LLM can achieve against human ratings.
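
A minimal toy example of the data layout kripp.alpha() expects (raters in rows, subjects in columns, which is why the analysis code below transposes its papers-by-raters data frame); the ratings here are made up:

# Toy example (made-up ratings): irr::kripp.alpha() wants a raters-by-subjects
# matrix; the "interval" method suits 0-100 ratings.
if (requireNamespace("irr", quietly = TRUE)) {
  toy <- rbind(
    rater_A = c(80, 55, 62, 40, 90),
    rater_B = c(74, 50, 70, 45, 85)
  )
  irr::kripp.alpha(toy, method = "interval")$value
}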

Show code
# Install irr if needed: install.packages("irr")
if (requireNamespace("irr", quietly = TRUE)) {

  library(irr)

  # Prepare data: each row = paper, each column = rater (human mean + each LLM)
  compute_alpha <- function(criterion) {

    # Human mean per paper
    human_crit <- metrics_human |>
      filter(criteria == criterion, paper %in% matched_papers) |>
      group_by(paper) |>
      summarise(Human = mean(mid, na.rm = TRUE), .groups = "drop")

    # LLM ratings per paper
    llm_crit <- llm_metrics |>
      filter(criteria == criterion, paper %in% matched_papers) |>
      select(paper, model, midpoint) |>
      pivot_wider(names_from = model, values_from = midpoint)

    # Combine
    combined <- human_crit |>
      inner_join(llm_crit, by = "paper") |>
      select(-paper)

    if (nrow(combined) < 3 || ncol(combined) < 2) return(NA_real_)

    # Krippendorff's alpha (interval scale)
    tryCatch({
      ka <- kripp.alpha(t(as.matrix(combined)), method = "interval")
      ka$value
    }, error = function(e) NA_real_)
  }

  criteria_list <- c("overall", "claims", "methods", "adv_knowledge",
                     "logic_comms", "open_sci", "gp_relevance")

  alpha_results <- tibble(
    Criterion = c("Overall", "Claims", "Methods", "Adv. Knowledge",
                  "Logic & Comms", "Open Science", "Global Relevance"),
    criterion_key = criteria_list
  ) |>
    rowwise() |>
    mutate(
      `Krippendorff's α` = compute_alpha(criterion_key),
      `α (formatted)` = ifelse(is.na(`Krippendorff's α`), "—",
                                sprintf("%.3f", `Krippendorff's α`)),
      Interpretation = case_when(
        is.na(`Krippendorff's α`) ~ "Insufficient data",
        `Krippendorff's α` >= 0.8 ~ "Good",
        `Krippendorff's α` >= 0.67 ~ "Acceptable",
        `Krippendorff's α` >= 0.4 ~ "Fair",
        TRUE ~ "Poor"
      )
    ) |>
    ungroup() |>
    select(Criterion, `α (formatted)`, Interpretation) |>
    rename(`Krippendorff's α` = `α (formatted)`)

  knitr::kable(alpha_results, align = c("l", "r", "l"))

} else {
  cat("*Install the `irr` package for Krippendorff's alpha: `install.packages('irr')`*\n")
}
Table 3.3: Inter-rater agreement: Krippendorff’s alpha by criterion
Criterion Krippendorff’s α Interpretation
Overall 0.221 Poor
Claims 0.335 Poor
Methods 0.192 Poor
Adv. Knowledge 0.144 Poor
Logic & Comms 0.021 Poor
Open Science 0.093 Poor
Global Relevance 0.189 Poor

Human-Human vs Human-LLM Agreement (Baseline Context)3

To interpret human-LLM agreement, we must compare against human-human agreement as an upper bound. If human evaluators only agree with each other at α = 0.5, then expecting LLMs to achieve α = 0.8 with humans would be unrealistic. The table below shows both:

  • α_HH: Krippendorff’s alpha among human evaluators (treating each evaluator as a separate rater)
  • α_HL: Krippendorff’s alpha between the human mean and each LLM model
Show code
if (requireNamespace("irr", quietly = TRUE)) {

  library(irr)

  criteria_list <- c("overall", "claims", "methods", "adv_knowledge",
                     "logic_comms", "open_sci", "gp_relevance")

  criterion_labels <- c(
    overall = "Overall", claims = "Claims", methods = "Methods",
    adv_knowledge = "Adv. Knowledge", logic_comms = "Logic & Comms",
    open_sci = "Open Science", gp_relevance = "Global Relevance"
  )

  # Compute human-human alpha (each evaluator as separate rater)
  compute_hh_alpha <- function(criterion) {
    human_wide <- metrics_human |>
      filter(criteria == criterion, paper %in% matched_papers) |>
      select(paper, evaluator, mid) |>
      distinct() |>
      pivot_wider(names_from = paper, values_from = mid)

    if (nrow(human_wide) < 2 || ncol(human_wide) < 3) return(NA_real_)

    M <- as.matrix(human_wide[, -1, drop = FALSE])
    tryCatch({
      irr::kripp.alpha(M, method = "interval")$value
    }, error = function(e) NA_real_)
  }

  # Compute human-LLM alpha for a specific model
  compute_hl_alpha <- function(criterion, model_name) {
    human_mean <- metrics_human |>
      filter(criteria == criterion, paper %in% matched_papers) |>
      group_by(paper) |>
      summarise(Human = mean(mid, na.rm = TRUE), .groups = "drop")

    llm_rating <- llm_metrics |>
      filter(criteria == criterion, model == model_name, paper %in% matched_papers) |>
      select(paper, midpoint) |>
      rename(LLM = midpoint)

    combined <- inner_join(human_mean, llm_rating, by = "paper")
    if (nrow(combined) < 3) return(NA_real_)

    M <- rbind(Human = combined$Human, LLM = combined$LLM)
    tryCatch({
      irr::kripp.alpha(M, method = "interval")$value
    }, error = function(e) NA_real_)
  }

  # Get unique LLM models
  llm_models <- unique(llm_metrics$model)

  # Build comparison table
  baseline_results <- tibble(Criterion = criterion_labels[criteria_list]) |>
    mutate(criterion_key = criteria_list) |>
    rowwise() |>
    mutate(
      `α_HH` = compute_hh_alpha(criterion_key)
    ) |>
    ungroup()

  # Add columns for each LLM model
  for (mod in llm_models) {
    col_name <- paste0("α_HL (", mod, ")")
    baseline_results <- baseline_results |>
      rowwise() |>
      mutate(!!col_name := compute_hl_alpha(criterion_key, mod)) |>
      ungroup()
  }

  # Format for display
  baseline_display <- baseline_results |>
    select(-criterion_key) |>
    mutate(across(where(is.numeric), ~ ifelse(is.na(.x), "—", sprintf("%.2f", .x))))

  knitr::kable(baseline_display, align = c("l", rep("r", ncol(baseline_display) - 1)))
}
Table 3.4: Human-human vs Human-LLM agreement by criterion (Krippendorff’s α)
Criterion α_HH α_HL (GPT-5 Pro) α_HL (GPT-5.2 Pro) α_HL (GPT-4o-mini) α_HL (Claude Sonnet 4) α_HL (Gemini 2.0 Flash)
Overall 0.55 0.25 0.36 0.14 0.14 0.26
Claims 0.42 0.20 0.26 0.19 0.21 0.18
Methods 0.54 0.29 0.07 0.03 -0.22 0.16
Adv. Knowledge 0.20 0.18 0.20 0.13 0.08 0.07
Logic & Comms 0.33 -0.04 -0.19 -0.00 0.25 -0.05
Open Science 0.03 -0.06 -0.61 -0.16 0.30 0.01
Global Relevance 0.33 0.25 0.51 0.04 -0.20 0.18

Interpretation: If α_HL approaches or exceeds α_HH, the LLM is performing at or above the level of agreement humans achieve with each other—a meaningful benchmark for “human-level” evaluation consistency.

Uncertainty Interval Coverage (LLM Credible Interval Assessment)4

LLMs report 90% credible intervals alongside their point estimates. If these intervals are well-specified, approximately 90% of human mean ratings should fall within them. Lower coverage suggests the LLM is overconfident; higher coverage suggests unnecessary uncertainty.
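
The coverage rule itself is a simple containment check per paper (illustrative numbers):

# Illustrative numbers: a human mean of 61 lies inside the LLM's 90% interval
# [58, 84], so this paper would count toward coverage.
human_mid <- 61; lower_bound <- 58; upper_bound <- 84
human_mid >= lower_bound & human_mid <= upper_bound
#> [1] TRUE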

Show code
# Compute CI coverage: % of human means within LLM [lower_bound, upper_bound]
compute_coverage <- function(criterion, model_name) {
  human_mean <- metrics_human |>
    filter(criteria == criterion, paper %in% matched_papers) |>
    group_by(paper) |>
    summarise(human_mid = mean(mid, na.rm = TRUE), .groups = "drop")

  llm_ci <- llm_metrics |>
    filter(criteria == criterion, model == model_name, paper %in% matched_papers) |>
    select(paper, lower_bound, upper_bound)

  combined <- inner_join(human_mean, llm_ci, by = "paper") |>
    filter(!is.na(lower_bound), !is.na(upper_bound))

  if (nrow(combined) < 3) return(NA_real_)

  # Check if human mean falls within LLM CI
  covered <- combined |>
    mutate(in_ci = human_mid >= lower_bound & human_mid <= upper_bound) |>
    summarise(coverage = mean(in_ci, na.rm = TRUE)) |>
    pull(coverage)

  covered
}

criteria_list <- c("overall", "claims", "methods", "adv_knowledge",
                   "logic_comms", "open_sci", "gp_relevance")

criterion_labels <- c(
  overall = "Overall", claims = "Claims", methods = "Methods",
  adv_knowledge = "Adv. Knowledge", logic_comms = "Logic & Comms",
  open_sci = "Open Science", gp_relevance = "Global Relevance"
)

llm_models <- unique(llm_metrics$model)

coverage_results <- tibble(Criterion = criterion_labels[criteria_list]) |>
  mutate(criterion_key = criteria_list)

for (mod in llm_models) {
  col_name <- mod
  coverage_results <- coverage_results |>
    rowwise() |>
    mutate(!!col_name := compute_coverage(criterion_key, mod)) |>
    ungroup()
}

# Format for display
coverage_display <- coverage_results |>
  select(-criterion_key) |>
  mutate(across(where(is.numeric), ~ ifelse(is.na(.x), "—", scales::percent(.x, accuracy = 1))))

knitr::kable(coverage_display, align = c("l", rep("r", ncol(coverage_display) - 1)))
Table 3.5: Share of human mean ratings within LLM 90% credible intervals
Criterion GPT-5 Pro GPT-5.2 Pro GPT-4o-mini Claude Sonnet 4 Gemini 2.0 Flash
Overall 64% 44% 49% 70% 66%
Claims 39% 44% 44% 43% 56%
Methods 59% 22% 55% 40% 60%
Adv. Knowledge 55% 44% 52% 60% 65%
Logic & Comms 36% 44% 62% 50% 68%
Open Science 33% 33% 40% 60% 48%
Global Relevance 49% 89% 44% 60% 59%

Interpretation: Coverage near 90% indicates well-specified uncertainty. Coverage below 70% suggests systematic overconfidence; coverage above 95% may indicate overly wide intervals that provide little information.

Statistical Significance of Agreement Measures5

To assess whether observed correlations are statistically meaningful and to compare models, we compute bootstrap 95% confidence intervals for Pearson correlations. This nonparametric approach makes no distributional assumptions and accounts for the small sample sizes typical in evaluation studies.

Show code
set.seed(42)  # reproducibility

# Bootstrap function for correlation
boot_cor <- function(data, n_boot = 1000) {
  cors <- numeric(n_boot)
  n <- nrow(data)
  for (i in 1:n_boot) {
    idx <- sample(1:n, n, replace = TRUE)
    # Use tryCatch to handle edge cases (e.g., constant values in sample)
    cors[i] <- tryCatch(
      cor(data$human[idx], data$llm[idx], use = "complete.obs"),
      error = function(e) NA_real_,
      warning = function(w) NA_real_
    )
  }
  # Remove NAs before computing quantiles
  cors_valid <- cors[!is.na(cors)]
  if (length(cors_valid) < n_boot * 0.5) {
    # Too many failed bootstraps - return NAs
    return(c(estimate = NA_real_, ci_lo = NA_real_, ci_hi = NA_real_))
  }
  c(estimate = cor(data$human, data$llm, use = "complete.obs"),
    ci_lo = quantile(cors_valid, 0.025, na.rm = TRUE),
    ci_hi = quantile(cors_valid, 0.975, na.rm = TRUE))
}

# Prepare data: overall rating comparison
human_overall <- metrics_human |>
  filter(criteria == "overall", paper %in% matched_papers) |>
  group_by(paper) |>
  summarise(human = mean(mid, na.rm = TRUE), .groups = "drop")

llm_models <- unique(llm_metrics$model)

boot_results <- map_dfr(llm_models, function(mod) {
  llm_overall <- llm_metrics |>
    filter(criteria == "overall", model == mod, paper %in% matched_papers) |>
    group_by(paper) |>
    summarise(llm = mean(midpoint, na.rm = TRUE), .groups = "drop")

  combined <- inner_join(human_overall, llm_overall, by = "paper")

  if (nrow(combined) >= 5) {
    res <- boot_cor(combined)
    ci_lo <- res["ci_lo.2.5%"]
    ci_hi <- res["ci_hi.97.5%"]
    # Handle different quantile naming conventions
    if (is.na(ci_lo)) ci_lo <- res["ci_lo"]
    if (is.na(ci_hi)) ci_hi <- res["ci_hi"]

    tibble(
      Model = mod,
      n = nrow(combined),
      r = res["estimate"],
      `95% CI` = if (is.na(ci_lo) || is.na(ci_hi)) "—" else sprintf("[%.2f, %.2f]", ci_lo, ci_hi)
    )
  } else {
    tibble(Model = mod, n = nrow(combined), r = NA_real_, `95% CI` = "—")
  }
})

if (nrow(boot_results) > 0) {
  boot_results <- boot_results |>
    mutate(r = ifelse(is.na(r), "—", sprintf("%.2f", r))) |>
    arrange(desc(as.numeric(ifelse(r == "—", "-1", r))))

  knitr::kable(boot_results, align = c("l", "r", "r", "r"))
}
Table 3.6: Bootstrap 95% CIs for human-LLM correlation (overall ratings)
Model n r 95% CI
Gemini 2.0 Flash 44 0.35 [0.10, 0.56]
GPT-5 Pro 45 0.34 [0.09, 0.59]
GPT-5.2 Pro 9 0.33 [-0.47, 0.88]
Claude Sonnet 4 10 0.20 [-0.66, 0.94]
GPT-4o-mini 45 0.13 [-0.15, 0.46]

Interpretation: Non-overlapping confidence intervals suggest statistically distinguishable performance. Wide intervals reflect uncertainty due to limited sample size—a common constraint in evaluation research where obtaining more human judgments is costly.

Information-Theoretic Perspective: Mutual Information6

Beyond correlation, we can ask: how much information do LLM ratings convey about human ratings? We use mutual information (MI), a fundamental concept from information theory (Shannon 1948; Cover and Thomas 2006).

What is mutual information? MI quantifies the reduction in uncertainty about one variable when you learn the value of another. Formally, MI(X;Y) = H(X) - H(X|Y), where H is entropy (average surprise/uncertainty). In plain terms:

  • Before seeing the LLM rating, you have some uncertainty about what the human rating will be
  • After seeing the LLM rating, how much does your uncertainty decrease?
  • MI measures this decrease, in bits (the same unit used in data compression and communication)

Why use MI instead of (or alongside) correlation?

Property | Correlation (r) | Mutual Information
Captures linear relationships | Yes | Yes
Captures nonlinear relationships | No | Yes
Detects threshold effects | No | Yes
Scale | -1 to +1 | 0 to ∞ bits
Sensitive to outliers | Yes | Less so

A practical example: suppose the LLM predicts human ratings perfectly when the paper is methodology-focused, but randomly when it’s theory-focused. Correlation might show r = 0.5 (moderate), but MI would reveal this is inconsistent performance rather than uniformly moderate performance. This distinction matters for deciding when to trust LLM evaluations.

How to interpret MI values (for 5-bin discretization):

  • 0 bits: Independence—LLM ratings tell you nothing about human ratings
  • ~0.5 bits: Weak association—knowing LLM rating slightly reduces uncertainty
  • ~1.0 bits: Moderate association—knowing LLM rating roughly halves your uncertainty
  • ~1.5 bits: Strong association—substantial predictive value
  • 2.32 bits (maximum): Perfect prediction—the LLM quintile perfectly determines the human quintile

We discretize ratings into quintiles (5 bins) because MI is easiest to estimate reliably for discrete distributions; this preserves ordinal structure while avoiding overfitting to noise in the continuous ratings.
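
As a sanity check on the scale (illustrative, separate from the analysis code below): with five equally likely bins and perfect agreement, the joint distribution is diagonal and MI equals log2(5) ≈ 2.32 bits.

# Perfect agreement across 5 equally likely quintiles gives MI = log2(5) bits
x <- rep(1:5, each = 20)                 # 100 hypothetical "human" quintiles
joint <- table(x, x) / length(x)         # perfectly dependent joint distribution
px <- rowSums(joint); py <- colSums(joint)
sum(ifelse(joint > 0, joint * log2(joint / outer(px, py)), 0))
#> [1] 2.321928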

Show code
# Discretize into quintiles
discretize <- function(x, n_bins = 5) {
  breaks <- quantile(x, probs = seq(0, 1, length.out = n_bins + 1), na.rm = TRUE)
  breaks <- unique(breaks)
  if (length(breaks) < 2) return(rep(1, length(x)))
  cut(x, breaks = breaks, labels = FALSE, include.lowest = TRUE)
}

# Compute mutual information: MI(X;Y) = sum p(x,y) log2(p(x,y) / p(x)p(y))
compute_mi <- function(x, y) {
  if (length(x) < 5 || length(y) < 5) return(NA_real_)

  x_disc <- discretize(x)
  y_disc <- discretize(y)

  valid <- !is.na(x_disc) & !is.na(y_disc)
  x_disc <- x_disc[valid]
  y_disc <- y_disc[valid]

  if (length(x_disc) < 5) return(NA_real_)

  # Joint distribution p(x,y)
  joint <- table(x_disc, y_disc) / length(x_disc)
  # Marginal distributions p(x), p(y)
  px <- rowSums(joint)
  py <- colSums(joint)

  # MI = sum over all (x,y) of p(x,y) * log2(p(x,y) / (p(x)*p(y)))
  mi <- 0
  for (i in seq_along(px)) {
    for (j in seq_along(py)) {
      if (joint[i, j] > 0 && px[i] > 0 && py[j] > 0) {
        mi <- mi + joint[i, j] * log2(joint[i, j] / (px[i] * py[j]))
      }
    }
  }
  mi
}

llm_models <- unique(llm_metrics$model)

mi_results <- map_dfr(llm_models, function(mod) {
  llm_overall <- llm_metrics |>
    filter(criteria == "overall", model == mod, paper %in% matched_papers) |>
    group_by(paper) |>
    summarise(llm = mean(midpoint, na.rm = TRUE), .groups = "drop")

  combined <- inner_join(human_overall, llm_overall, by = "paper")
  mi_val <- compute_mi(combined$human, combined$llm)

  max_mi <- log2(5)  # ≈ 2.32 bits for 5 bins
  norm_mi <- if (!is.na(mi_val)) mi_val / max_mi else NA_real_

  tibble(
    Model = mod,
    n = nrow(combined),
    `MI (bits)` = ifelse(is.na(mi_val), "—", sprintf("%.2f", mi_val)),
    `Normalized` = ifelse(is.na(norm_mi), "—", sprintf("%.0f%%", norm_mi * 100))
  )
})

if (nrow(mi_results) > 0) {
  knitr::kable(mi_results, align = c("l", "r", "r", "r"))
}
Table 3.7: Mutual information between human and LLM overall ratings
Model n MI (bits) Normalized
GPT-5 Pro 45 0.60 26%
GPT-5.2 Pro 9 1.61 70%
GPT-4o-mini 45 0.30 13%
Claude Sonnet 4 10 0.77 33%
Gemini 2.0 Flash 44 0.14 6%

Interpretation: Normalized MI above 50% indicates substantial predictive value. If MI is low despite moderate correlation, investigate which paper characteristics the LLM struggles with—this points toward targeted improvements or hybrid human-AI workflows where humans handle cases the LLM finds difficult.

Agreement by Evaluation Criterion

Show code
# Compute correlations for each criterion and model
criterion_agreement <- llm_metrics |>
  filter(paper %in% matched_papers) |>
  inner_join(
    metrics_human |>
      filter(paper %in% matched_papers) |>
      group_by(paper, criteria) |>
      summarise(human_mid = mean(mid, na.rm = TRUE), .groups = "drop"),
    by = c("paper", "criteria")
  ) |>
  group_by(Model = model, Criterion = criteria) |>
  summarise(
    n = n(),
    r = cor(midpoint, human_mid, use = "complete.obs"),
    rho = cor(midpoint, human_mid, method = "spearman", use = "complete.obs"),
    .groups = "drop"
  ) |>
  filter(n >= 3)  # need at least 3 papers for meaningful correlation

if (nrow(criterion_agreement) > 0) {

  # Pivot for display
  criterion_labels <- c(
    overall = "Overall", claims = "Claims", methods = "Methods",
    adv_knowledge = "Adv. Knowledge", logic_comms = "Logic & Comms",
    open_sci = "Open Science", gp_relevance = "Global Relevance"
  )

  corr_table <- criterion_agreement |>
    mutate(
      Criterion = criterion_labels[Criterion],
      `r / ρ` = sprintf("%.2f / %.2f", r, rho)
    ) |>
    select(Model, Criterion, `r / ρ`) |>
    pivot_wider(names_from = Model, values_from = `r / ρ`, values_fill = "—")

  # Order criteria
  corr_table <- corr_table |>
    mutate(Criterion = factor(Criterion, levels = criterion_labels)) |>
    arrange(Criterion)

  knitr::kable(corr_table, align = c("l", rep("r", ncol(corr_table) - 1)))
}
Table 3.8: Pearson and Spearman correlations by evaluation criterion
Criterion Claude Sonnet 4 GPT-4o-mini GPT-5 Pro GPT-5.2 Pro Gemini 2.0 Flash
Overall 0.20 / 0.27 0.13 / 0.12 0.34 / 0.52 0.33 / 0.47 0.35 / 0.32
Claims 0.46 / 0.28 0.18 / 0.28 0.21 / 0.39 0.25 / 0.32 0.21 / 0.05
Methods -0.47 / -0.61 0.03 / 0.12 0.34 / 0.54 0.02 / 0.05 0.19 / 0.19
Adv. Knowledge 0.17 / 0.23 0.12 / 0.04 0.30 / 0.46 0.16 / 0.15 0.10 / 0.10
Logic & Comms 0.52 / 0.56 -0.01 / -0.02 0.19 / 0.28 -0.25 / -0.27 0.07 / 0.08
Open Science 0.29 / 0.44 -0.13 / -0.13 0.17 / 0.12 -0.60 / -0.51 -0.00 / -0.09
Global Relevance -0.25 / -0.21 0.03 / 0.09 0.35 / 0.43 0.50 / 0.36 0.23 / 0.20

Rating Distribution by Model

Show code
all_overall <- bind_rows(
  metrics_human |> filter(criteria == "overall") |> transmute(paper, model, rating = mid),
  llm_metrics |> filter(criteria == "overall") |> transmute(paper, model, rating = midpoint)
) |>
  filter(paper %in% matched_papers) |>
  filter(model != "Claude Sonnet 4")

if (nrow(all_overall) > 0) {
  ggplot(all_overall, aes(x = rating, fill = model)) +
    geom_density(alpha = 0.5, adjust = 1.2) +
    scale_fill_manual(values = MODEL_COLORS) +
    labs(x = "Overall percentile (0–100)", y = "Density", fill = "Evaluator") +
    theme_uj() +
    theme(legend.position = "right")
}
Figure 3.4: Distribution of overall ratings by evaluator type

Criteria-Level Comparison

Show code
metric_labels <- c(
  overall = "Overall", claims = "Claims", methods = "Methods",
  adv_knowledge = "Adv. Knowledge", logic_comms = "Logic & Comms",
  open_sci = "Open Science", gp_relevance = "Global Relevance"
)

criteria_means <- bind_rows(
  metrics_human |>
    filter(criteria %in% names(metric_labels), paper %in% matched_papers) |>
    group_by(criteria) |>
    summarise(mean_rating = mean(mid, na.rm = TRUE), .groups = "drop") |>
    mutate(model = "Human"),
  llm_metrics |>
    filter(criteria %in% names(metric_labels), paper %in% matched_papers) |>
    group_by(model, criteria) |>
    summarise(mean_rating = mean(midpoint, na.rm = TRUE), .groups = "drop")
) |>
  mutate(criteria_label = factor(metric_labels[criteria], levels = metric_labels))

if (nrow(criteria_means) > 0) {
  ggplot(criteria_means, aes(x = criteria_label, y = model, fill = mean_rating)) +
    geom_tile(color = "white", linewidth = 0.5) +
    geom_text(aes(label = round(mean_rating, 0)), size = 4, color = "white", fontface = "bold") +
    scale_fill_gradient(low = "#f0f0f0", high = UJ_ORANGE, limits = c(40, 80), name = "Mean") +
    labs(x = NULL, y = NULL) +
    theme_uj() +
    theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 10), panel.grid = element_blank())
}
Figure 3.5: Mean rating by model and criterion

Evaluation Priorities: What Predicts Journal Tier Judgments?7

How do evaluators translate quality assessments into journal tier predictions? By correlating each criterion with “where should this publish?” judgments, we can see which factors each evaluator type weighs most heavily. Divergence here reveals systematic differences in evaluation priorities—what we might call “taste.”

Show code
# Load human tier data if available
human_tiers_file <- "data/all_jtiers.csv"

if (file.exists(human_tiers_file)) {

  human_tiers_raw <- read_csv(human_tiers_file, show_col_types = FALSE) |>
    clean_names()

  # Check what columns are available and find the tier prediction column
  # Typically 'merits_journal' or similar
  tier_col <- intersect(names(human_tiers_raw), c("merits_journal", "journal_tier", "tier_should", "mid"))

  if (length(tier_col) > 0) {

    # Prepare human tier data
    human_tier_data <- human_tiers_raw |>
      filter(!is.na(.data[[tier_col[1]]])) |>
      rename(tier_should = !!tier_col[1])

    # Get paper identifier column
    paper_col <- intersect(names(human_tier_data), c("label_paper", "paper", "research"))
    if (length(paper_col) > 0) {
      human_tier_data <- human_tier_data |> rename(paper = !!paper_col[1])
    }

    metrics_to_cor <- c("overall", "claims", "methods", "adv_knowledge",
                        "logic_comms", "open_sci", "gp_relevance")

    # === Human Correlations ===
    human_wide <- metrics_human |>
      filter(criteria %in% metrics_to_cor, paper %in% matched_papers) |>
      group_by(paper, criteria) |>
      summarise(mid = mean(mid, na.rm = TRUE), .groups = "drop") |>
      pivot_wider(names_from = criteria, values_from = mid)

    # Join with human tier predictions
    human_tier_match <- human_tier_data |>
      group_by(paper) |>
      summarise(tier_should = mean(tier_should, na.rm = TRUE), .groups = "drop")

    human_cors_data <- human_wide |>
      inner_join(human_tier_match, by = "paper")

    if (nrow(human_cors_data) >= 5) {
      cors_human <- tibble(
        metric = metrics_to_cor,
        correlation = sapply(metrics_to_cor, function(m) {
          if (m %in% names(human_cors_data) && !all(is.na(human_cors_data[[m]]))) {
            cor(human_cors_data[[m]], human_cors_data$tier_should, use = "pairwise.complete.obs")
          } else NA_real_
        }),
        source = "Human"
      )
    } else {
      cors_human <- tibble(metric = character(), correlation = numeric(), source = character())
    }

    # === LLM Correlations ===
    # primary_model is otherwise first defined in a later chunk; define it here so this chunk runs standalone
    primary_model <- if ("GPT-5 Pro" %in% unique(llm_metrics$model)) "GPT-5 Pro" else unique(llm_metrics$model)[1]

    llm_wide <- llm_metrics |>
      filter(criteria %in% metrics_to_cor, model == primary_model, paper %in% matched_papers) |>
      group_by(paper, criteria) |>
      summarise(mid = mean(midpoint, na.rm = TRUE), .groups = "drop") |>
      pivot_wider(names_from = criteria, values_from = mid)

    llm_tier_match <- llm_tiers |>
      filter(tier_kind == "tier_should", model == primary_model) |>
      group_by(paper) |>
      summarise(tier_should = mean(score, na.rm = TRUE), .groups = "drop")

    llm_cors_data <- llm_wide |>
      inner_join(llm_tier_match, by = "paper")

    if (nrow(llm_cors_data) >= 5) {
      cors_llm <- tibble(
        metric = metrics_to_cor,
        correlation = sapply(metrics_to_cor, function(m) {
          if (m %in% names(llm_cors_data) && !all(is.na(llm_cors_data[[m]]))) {
            cor(llm_cors_data[[m]], llm_cors_data$tier_should, use = "pairwise.complete.obs")
          } else NA_real_
        }),
        source = "LLM"
      )
    } else {
      cors_llm <- tibble(metric = character(), correlation = numeric(), source = character())
    }

    # === Combine and plot ===
    cors_combined <- bind_rows(cors_human, cors_llm) |>
      filter(!is.na(correlation)) |>
      mutate(
        metric_label = case_when(
          metric == "overall" ~ "Overall",
          metric == "claims" ~ "Claims & Evidence",
          metric == "methods" ~ "Methods",
          metric == "adv_knowledge" ~ "Advancing Knowledge",
          metric == "logic_comms" ~ "Logic & Communication",
          metric == "open_sci" ~ "Open Science",
          metric == "gp_relevance" ~ "Global Relevance",
          TRUE ~ metric
        )
      )

    if (nrow(cors_combined) > 0) {
      # Order by average correlation
      metric_order_cors <- cors_combined |>
        group_by(metric_label) |>
        summarise(avg_cor = mean(correlation, na.rm = TRUE), .groups = "drop") |>
        arrange(desc(avg_cor)) |>
        pull(metric_label)

      cors_wide <- cors_combined |>
        mutate(metric_label = factor(metric_label, levels = metric_order_cors)) |>
        pivot_wider(names_from = source, values_from = correlation)

      # Dumbbell plot
      ggplot(cors_wide, aes(y = metric_label)) +
        geom_segment(aes(x = Human, xend = LLM, yend = metric_label),
                     color = "gray50", linewidth = 1.2, alpha = 0.4) +
        geom_point(aes(x = Human), color = UJ_GREEN, size = 5, alpha = 0.9) +
        geom_point(aes(x = LLM), color = UJ_ORANGE, size = 5, alpha = 0.9) +
        geom_vline(xintercept = 0, linetype = "dashed", color = "gray40") +
        geom_text(aes(x = Human, label = sprintf("%.2f", Human)),
                  hjust = 1.4, size = 3.5, color = UJ_GREEN, fontface = "bold") +
        geom_text(aes(x = LLM, label = sprintf("%.2f", LLM)),
                  hjust = -0.4, size = 3.5, color = UJ_ORANGE, fontface = "bold") +
        scale_x_continuous(
          limits = c(min(c(cors_wide$Human, cors_wide$LLM), na.rm = TRUE) - 0.15,
                     max(c(cors_wide$Human, cors_wide$LLM), na.rm = TRUE) + 0.15)
        ) +
        labs(
          x = "Correlation with 'Where should this publish?'",
          y = NULL,
          subtitle = paste0("Green = Human, Orange = ", primary_model)
        ) +
        theme_uj() +
        theme(
          panel.grid.major.x = element_line(color = "grey90"),
          panel.grid.major.y = element_blank(),
          axis.text.y = element_text(size = 11)
        )
    } else {
      cat("Insufficient correlation data for dumbbell plot.\n")
    }
  } else {
    cat("Tier prediction column not found in human data.\n")
  }
} else {
  cat("Human tier data file not found.\n")
}
Tier prediction column not found in human data.

Interpretation: Large divergences indicate different evaluation philosophies. For example, if humans weight “open science” heavily but LLMs don’t, this suggests the LLM may undervalue reproducibility when forming overall judgments.


Systematic Rating Differences by Paper and Criterion8

The heatmap below shows Human − LLM differences for each paper and criterion. Green indicates humans rated higher than the LLM; orange indicates the LLM rated higher. Papers are ordered by their overall rating difference.

Show code
metric_order <- c("overall", "claims", "methods", "adv_knowledge",
                  "logic_comms", "open_sci", "gp_relevance")
metric_lab <- c(
  overall = "Overall",
  claims = "Claims & Evidence",
  methods = "Methods",
  adv_knowledge = "Adv. Knowledge",
  logic_comms = "Logic & Comms",
  open_sci = "Open Science",
  gp_relevance = "Global Relevance"
)

# Use the first/primary LLM model for comparison (GPT-5 Pro if available)
primary_model <- if ("GPT-5 Pro" %in% unique(llm_metrics$model)) "GPT-5 Pro" else unique(llm_metrics$model)[1]

H_mean <- metrics_human |>
  filter(criteria %in% metric_order, paper %in% matched_papers) |>
  group_by(paper, criteria) |>
  summarise(h = mean(mid, na.rm = TRUE), .groups = "drop")

L_mean <- llm_metrics |>
  filter(criteria %in% metric_order, model == primary_model, paper %in% matched_papers) |>
  group_by(paper, criteria) |>
  summarise(l = mean(midpoint, na.rm = TRUE), .groups = "drop")

Ddiff <- inner_join(H_mean, L_mean, by = c("paper", "criteria")) |>
  mutate(diff = h - l) |>
  mutate(crit = factor(criteria, levels = metric_order, labels = metric_lab[metric_order]))

# Order papers by overall difference (Human−LLM)
ord_p <- Ddiff |>
  filter(criteria == "overall") |>
  arrange(desc(diff)) |>
  pull(paper)

if (length(ord_p) > 0 && nrow(Ddiff) > 0) {
  ggplot(Ddiff, aes(x = factor(paper, levels = ord_p), y = crit, fill = diff)) +
    geom_tile(color = "white", linewidth = 0.25) +
    scale_fill_gradient2(
      low = UJ_ORANGE, mid = "grey95", high = UJ_GREEN, midpoint = 0,
      name = "Human − LLM",
      limits = c(-30, 30),
      oob = scales::squish
    ) +
    labs(x = NULL, y = NULL,
         subtitle = paste0("Primary model: ", primary_model)) +
    theme_uj() +
    theme(
      axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1, size = 7),
      axis.text.y = element_text(size = 11),
      panel.grid = element_blank()
    )
} else {
  cat("Insufficient data for gap heatmap.\n")
}
Figure 3.6: Human − LLM rating differences by paper and criterion

This visualization helps identify: (1) papers with systematic disagreement across all criteria, (2) criteria where humans and LLMs consistently diverge, and (3) potential patterns related to paper characteristics.

Papers with Largest Human-LLM Disagreement9

Understanding why specific papers generate disagreement can inform both LLM limitations and potential human biases. The table below shows papers where humans and LLMs diverged most.

Show code
# Compute mean overall ratings and differences
H_overall <- metrics_human |>
  filter(criteria == "overall", paper %in% matched_papers) |>
  group_by(paper) |>
  summarise(human_rating = mean(mid, na.rm = TRUE), .groups = "drop")

L_overall <- llm_metrics |>
  filter(criteria == "overall", model == primary_model, paper %in% matched_papers) |>
  group_by(paper) |>
  summarise(llm_rating = mean(midpoint, na.rm = TRUE), .groups = "drop")

rating_diffs <- H_overall |>
  inner_join(L_overall, by = "paper") |>
  mutate(diff = human_rating - llm_rating)

# Helper: truncate long titles
truncate_title <- function(title, max_len = 50) {
  if (nchar(title) <= max_len) return(title)
  paste0(substr(title, 1, max_len - 3), "...")
}

# Top 5 where humans rated higher
top_human_pref <- rating_diffs |>
  filter(diff > 0) |>
  arrange(desc(diff)) |>
  slice_head(n = 5) |>
  mutate(
    Paper = sapply(paper, truncate_title),
    `Human Rating` = round(human_rating, 0),
    `LLM Rating` = round(llm_rating, 0),
    `Difference` = sprintf("%+.0f", diff),
    Direction = "Human > LLM"
  ) |>
  select(Paper, `Human Rating`, `LLM Rating`, Difference, Direction)

# Top 5 where LLM rated higher
top_llm_pref <- rating_diffs |>
  filter(diff < 0) |>
  arrange(diff) |>
  slice_head(n = 5) |>
  mutate(
    Paper = sapply(paper, truncate_title),
    `Human Rating` = round(human_rating, 0),
    `LLM Rating` = round(llm_rating, 0),
    `Difference` = sprintf("%+.0f", diff),
    Direction = "LLM > Human"
  ) |>
  select(Paper, `Human Rating`, `LLM Rating`, Difference, Direction)

# Combine
top_disagree <- bind_rows(top_human_pref, top_llm_pref)

if (nrow(top_disagree) > 0) {
  knitr::kable(top_disagree, align = c("l", "r", "r", "r", "l"))
} else {
  cat("Insufficient data for disagreement analysis.\n")
}
Table 3.9: Papers with largest human vs. LLM rating differences
Paper Human Rating LLM Rating Difference Direction
Bruers 2021 80 46 +34 Human > LLM
Bhattacharya and Packalen 2020 70 58 +12 Human > LLM
Blimpo and Castaneda-Dower 2025 80 71 +9 Human > LLM
Epperson and Gerster 2024 89 82 +7 Human > LLM
Chuard et al. 2022 80 75 +5 Human > LLM
Bergeron et al. 2024 40 85 -45 LLM > Human
Bettle 2023 32 75 -42 LLM > Human
Williams et al. 2024 50 87 -37 LLM > Human
Frech et al. 2023 40 70 -30 LLM > Human
Green et al. 2025 57 80 -23 LLM > Human

These outliers merit qualitative investigation: examining the paper characteristics, human evaluator comments, and LLM rationales may reveal systematic patterns in what drives disagreement.


Note: GPT-5 Pro evaluations include extended reasoning traces. See the Appendix for full assessment summaries and reasoning traces per paper.

Prompt Comparison: Legacy vs Updated GPT-5 Pro

We compare GPT-5 Pro evaluations between two prompt versions:

  • Legacy (Oct 2024): Original combined schema without assessment summary
  • Updated (Jan 2026): Modular prompt with explicit diagnostic assessment summary
Load legacy vs updated GPT-5 Pro data
# Load legacy GPT-5 data
legacy_metrics <- read_csv("data/metrics_long_gpt-5.csv", show_col_types = FALSE) |>
  transmute(
    paper_raw = paper,
    paper = str_replace_all(paper, " ", "_") |> str_replace_all("\\.", ""),  # normalize
    metric = metric,
    legacy_mid = midpoint,
    legacy_lo = lower_bound,
    legacy_hi = upper_bound
  )

# Load updated GPT-5 Pro data
updated_metrics <- read_csv("data/metrics_long_gpt5_pro_jan2026.csv", show_col_types = FALSE) |>
  transmute(
    paper = str_replace_all(paper, "\\.", ""),  # remove dots for matching
    metric = metric,
    updated_mid = midpoint,
    updated_lo = lower_bound,
    updated_hi = upper_bound
  )

# Join by normalized paper name and metric
prompt_comparison <- legacy_metrics |>
  inner_join(updated_metrics, by = c("paper", "metric")) |>
  mutate(
    diff = updated_mid - legacy_mid,
    metric_label = case_when(
      metric == "overall" ~ "Overall",
      metric == "claims_evidence" ~ "Claims",
      metric == "methods" ~ "Methods",
      metric == "advancing_knowledge" ~ "Adv. Knowledge",
      metric == "logic_communication" ~ "Logic & Comms",
      metric == "open_science" ~ "Open Science",
      metric == "global_relevance" ~ "Global Relevance",
      TRUE ~ metric
    )
  )

n_matched_papers <- n_distinct(prompt_comparison$paper)

We matched 57 papers across both runs.

Show code
overall_comp <- prompt_comparison |> filter(metric == "overall")

if (nrow(overall_comp) > 0) {
  cor_val <- cor(overall_comp$legacy_mid, overall_comp$updated_mid, use = "complete.obs")

  ggplot(overall_comp, aes(x = legacy_mid, y = updated_mid)) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray50") +
    geom_point(alpha = 0.7, size = 2.5, color = UJ_ORANGE) +
    geom_smooth(method = "lm", se = TRUE, alpha = 0.2, color = UJ_BLUE) +
    annotate("text", x = 55, y = 95,
             label = paste0("r = ", round(cor_val, 2)),
             size = 5, fontface = "bold") +
    labs(
      x = "Legacy Prompt (Oct 2024)",
      y = "Updated Prompt (Jan 2026)",
      title = "GPT-5 Pro: Overall Rating Comparison",
      subtitle = "Dashed line = perfect agreement"
    ) +
    coord_fixed(xlim = c(40, 100), ylim = c(40, 100)) +
    theme_uj()
}
Figure 3.7: Legacy vs Updated GPT-5 Pro ratings (overall metric)
Show code
if (nrow(prompt_comparison) > 0) {
  metric_order <- prompt_comparison |>
    group_by(metric_label) |>
    summarise(mean_diff = mean(diff, na.rm = TRUE)) |>
    arrange(mean_diff) |>
    pull(metric_label)

  prompt_comparison |>
    mutate(metric_label = factor(metric_label, levels = metric_order)) |>
    ggplot(aes(x = metric_label, y = diff)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
    geom_boxplot(fill = UJ_ORANGE, alpha = 0.6, outlier.alpha = 0.3) +
    labs(
      x = NULL,
      y = "Rating Difference (Updated - Legacy)",
      title = "Prompt Version Effect by Criterion",
      subtitle = "Positive = updated prompt rates higher"
    ) +
    theme_uj() +
    theme(axis.text.x = element_text(angle = 30, hjust = 1))
}
Figure 3.8: Rating difference (Updated - Legacy) by criterion
Show code
if (nrow(prompt_comparison) > 0) {
  prompt_comparison |>
    group_by(Criterion = metric_label) |>
    summarise(
      `Legacy Mean` = round(mean(legacy_mid, na.rm = TRUE), 1),
      `Updated Mean` = round(mean(updated_mid, na.rm = TRUE), 1),
      `Mean Diff` = round(mean(diff, na.rm = TRUE), 1),
      `SD Diff` = round(sd(diff, na.rm = TRUE), 1),
      .groups = "drop"
    ) |>
    kable() |>
    kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
}
Table 3.10: Summary statistics: Legacy vs Updated prompt
Criterion Legacy Mean Updated Mean Mean Diff SD Diff
Adv. Knowledge 84.5 80.4 -4.1 4.8
Claims 83.2 77.5 -5.7 4.8
Global Relevance 85.8 82.6 -3.2 6.3
Logic & Comms 86.4 83.6 -2.8 4.8
Methods 81.3 73.6 -7.7 4.6
Open Science 67.9 54.8 -13.1 8.9
Overall 86.0 80.4 -5.6 4.4

The updated prompt, with its explicit diagnostic assessment, produces systematically lower ratings: mean differences range from about -3 to -13 points across criteria, with the largest drop for Open Science. The correlation between prompt versions (Figure 3.7) indicates how consistent the ratings remain across the prompt change.


  1. We submit each PDF through a single, schema-enforced Responses call. The system prompt mirrors the Unjournal rubric, blocks extrinsic priors (authors, venue, web context), and requires strict JSON: a ~1k-word diagnostic summary plus numeric midpoints and 90% credible intervals for every metric. We enforce well-formed intervals (lower < midpoint < upper for 0–100 metrics; ci_lower < score < ci_upper for 0–5 tiers). GPT-5 Pro runs with reasoning effort = “high”, so traces are available for audit.↩︎

  2. 0–100 percentiles are relative to a reference group of roughly ‘all serious work you have read in this area in the last three years’: overall (global quality/impact), claims_evidence (claim clarity/support), methods (design/identification/robustness), advancing_knowledge (contribution to field/practice), logic_communication (internal consistency and exposition), open_science (reproducibility, data/code availability), and global_relevance (decision and priority usefulness). Journal tiers (0–5, continuous) capture where the work should publish vs. will publish.↩︎

  3. This section was added with AI assistance to address feedback about contextualizing agreement metrics.↩︎

  4. This analysis addresses feedback about LLM uncertainty quantification.↩︎

  5. Bootstrap inference was added with AI assistance to quantify uncertainty in agreement metrics.↩︎

  6. Information-theoretic interpretation added with AI assistance.↩︎

  7. This analysis was added with AI assistance to reveal differences in what humans vs. LLMs emphasize when predicting journal outcomes.↩︎

  8. This visualization was added with AI assistance to identify patterns in human-LLM disagreement.↩︎

  9. This table was added with AI assistance to facilitate qualitative investigation of outliers.↩︎