We compare LLM-generated research evaluations against human expert reviews from The Unjournal. This analysis covers 60 papers evaluated by 5 frontier LLMs (GPT-5 Pro, GPT-5.2 Pro, GPT-4o-mini, Claude Sonnet 4, Gemini 2.0 Flash), with 45 papers having both human and LLM evaluations for direct comparison. Human evaluations come from 49 expert reviewers across 49 papers in The Unjournal’s database.
Evaluation Overview
The previous Methods) chapter explains our process and pipeline.1 This generally mirrors The Unjournal’s rubric and guidelines, requesting a range of percentile metrics and normative and predictive ‘journal tiers’ metrics.2
Each numeric field carries a midpoint (model’s 50% belief) and a 90% credible interval; wide intervals indicate acknowledged uncertainty.
Future Models
We are working to extend this comparison to newer frontier models as they become available and our funding permits, as well as adapting these to bespoke agent-based tools such as Refine.ink.
Token Usage and Cost
Token costs are computed from API-reported input/output tokens (GPT-5 Pro runs with reasoning = high). Use the boxplot log scale to compare efficiency frontiers; reasoning tokens are reported when available.
Table 3.1: Token usage and estimated cost per model
Model
Papers
Avg input
Avg output
Avg reasoning
Total cost
Cost/paper
GPT-5 Pro
60
38,516
6,400
4,717
$57.71
$0.962
GPT-5.2 Pro
14
45,143
2,889
707
$11.91
$0.850
Claude Sonnet 4
15
116,386
731
—
$5.40
$0.360
GPT-4o-mini
60
71,581
470
—
$0.66
$0.011
Gemini 2.0 Flash
59
17,821
1,022
—
$0.10
$0.002
Total cost across all 208 LLM evaluations: $75.77.
Show code
if (nrow(cost_data) >0) {ggplot(cost_data, aes(x =reorder(model, cost_usd, FUN = median), y = cost_usd, fill = model)) +geom_boxplot(alpha =0.7, outlier.shape =NA) +geom_jitter(width =0.15, alpha =0.5, size =2) +scale_fill_manual(values = MODEL_COLORS, guide ="none") +scale_y_log10(labels = scales::dollar_format(accuracy =0.001)) +coord_flip() +labs(x =NULL, y ="Cost per paper (USD, log scale)") +theme_uj()}
Figure 3.1: Cost per paper by model (log scale)
Interactive Forest Plot: Human vs LLM Ratings
The forest plot below shows overall percentile ratings for each of the 45 papers with matched human and LLM evaluations. Human evaluator ratings are shown in green (individual evaluators as small dots, mean as diamond). LLM ratings from different models are shown with distinct colors.
Interpretation: points are midpoints, horizontal bars are the model’s 90% credible interval.
Hover over any point to see details. Click legend items to show/hide models.
Show code
forest_human <- metrics_human |>filter(criteria =="overall") |>transmute(paper, model ="Human", evaluator, mid, lo, hi, is_mean =FALSE)forest_human_means <- human_avg |>transmute(paper, model ="Human", evaluator ="Mean",mid = human_mid, lo = human_lo, hi = human_hi, is_mean =TRUE)forest_llm <- llm_metrics |>filter(criteria =="overall") |>transmute(paper, model, evaluator = model,mid = midpoint, lo = lower_bound, hi = upper_bound, is_mean =FALSE)forest_data <-bind_rows(forest_human, forest_human_means, forest_llm) |>filter(paper %in% matched_papers)if (nrow(forest_data) >0) { paper_order <- forest_data |>filter(model =="Human", is_mean) |>arrange(desc(mid)) |>pull(paper) forest_data <- forest_data |>mutate(paper =factor(paper, levels =rev(paper_order)),paper_num =as.numeric(paper),hover_text =paste0("<b>", paper, "</b><br>", model, ifelse(evaluator != model &!is_mean, paste0(" (", evaluator, ")"), ""), "<br>","Rating: ", round(mid, 1),ifelse(!is.na(lo) &!is.na(hi), paste0(" [", round(lo, 1), "–", round(hi, 1), "]"), "") ) ) p <-plot_ly() |>add_segments(x =25, xend =25, y =0.5, yend =length(paper_order) +0.5,line =list(color ="lightgray", dash ="dot"),showlegend =FALSE, hoverinfo ="none") |>add_segments(x =50, xend =50, y =0.5, yend =length(paper_order) +0.5,line =list(color ="gray", dash ="dot"),showlegend =FALSE, hoverinfo ="none") |>add_segments(x =75, xend =75, y =0.5, yend =length(paper_order) +0.5,line =list(color ="lightgray", dash ="dot"),showlegend =FALSE, hoverinfo ="none") models_to_plot <-c("Human", setdiff(unique(forest_data$model), "Human"))for (m in models_to_plot) { model_data <- forest_data |>filter(model == m)if (m =="Human") { indiv <- model_data |>filter(!is_mean)if (nrow(indiv) >0) { p <- p |>add_trace(data = indiv, type ="scatter", mode ="markers",x =~mid, y =~paper_num,marker =list(color = MODEL_COLORS[m], size =6, opacity =0.4),text =~hover_text, hoverinfo ="text",name ="Human (individual)", legendgroup ="Human" ) } means <- model_data |>filter(is_mean)if (nrow(means) >0) { p <- p |>add_trace(data = means, type ="scatter", mode ="markers",x =~mid, y =~paper_num,error_x =list(type ="data", symmetric =FALSE,arrayminus =~(mid - lo), array =~(hi - mid),color = MODEL_COLORS[m], thickness =1.5),marker =list(color = MODEL_COLORS[m], size =12, symbol ="diamond"),text =~hover_text, hoverinfo ="text",name ="Human (mean)", legendgroup ="Human" ) } } else { p <- p |>add_trace(data = model_data, type ="scatter", mode ="markers",x =~mid, y =~paper_num,error_x =list(type ="data", symmetric =FALSE,arrayminus =~(mid - lo), array =~(hi - mid),color = MODEL_COLORS[m], thickness =1.5),marker =list(color = MODEL_COLORS[m], size =10),text =~hover_text, hoverinfo ="text", name = m ) } } p <- p |>layout(xaxis =list(title ="Overall percentile (0–100)", range =c(0, 105), zeroline =FALSE),yaxis =list(title ="", ticktext = paper_order, tickvals =seq_along(paper_order), tickmode ="array"),legend =list(orientation ="h", y =-0.1, x =0.5, xanchor ="center"),hovermode ="closest",margin =list(l =200) ) p} else {cat("No matched human-LLM data available for forest plot.\n")}
Figure 3.2: Interactive forest plot: Human and LLM overall ratings by paper
Human vs LLM Scatter Plot
Each point represents a paper, with human mean rating on the x-axis and LLM rating on the y-axis. Points on the diagonal indicate perfect agreement.
Note: human means carry their own variance; correlations here are bounded by human inter-rater noise.
Show code
scatter_data <- llm_metrics |>filter(criteria =="overall") |>inner_join(human_avg, by ="paper") |>mutate(diff = midpoint - human_mid,paper_short =str_trunc(paper, 25),# Ensure no NA values break ggplotlyhuman_lo =coalesce(human_lo, human_mid),human_hi =coalesce(human_hi, human_mid),lower_bound =coalesce(lower_bound, midpoint),upper_bound =coalesce(upper_bound, midpoint) ) |>filter(!is.na(human_mid), !is.na(midpoint))if (nrow(scatter_data) >0) { p <-ggplot(scatter_data, aes(x = human_mid, y = midpoint, color = model,text =paste0("<b>", paper, "</b><br>","Human: ", round(human_mid, 1), " [", round(human_lo, 1), "–", round(human_hi, 1), "]<br>","LLM: ", round(midpoint, 1), " [", round(lower_bound, 1), "–", round(upper_bound, 1), "]<br>","Diff: ", sprintf("%+.1f", diff) ) )) +geom_abline(slope =1, intercept =0, linetype ="dashed", color ="grey50") +geom_point(size =3.5, alpha =0.7) +scale_color_manual(values = MODEL_COLORS) +coord_cartesian(xlim =c(0, 100), ylim =c(0, 100)) +labs(x ="Human mean rating (0–100)", y ="LLM rating (0–100)", color ="Model") +theme_uj()# Use tryCatch to handle ggplotly errors gracefullytryCatch({ggplotly(p, tooltip ="text") |>layout(legend =list(orientation ="h", y =-0.12, x =0.5, xanchor ="center"),hoverlabel =list(bgcolor ="white") ) }, error =function(e) {# Fallback to static plot if ggplotly failsprint(p)message("Note: Interactive plot unavailable, showing static version.") })} else {cat("No matching data for scatter plot.\n")}
Figure 3.3: Interactive scatter: Human vs LLM overall ratings
Model Agreement Statistics
We assess agreement between LLM and human evaluators using multiple metrics:
Pearson r: Linear correlation between ratings
Spearman ρ: Rank-based correlation (robust to outliers)
Mean bias: Average difference (LLM − Human); positive = LLM rates higher
Table 3.2: Agreement between each LLM and human mean ratings (overall criterion)
Model
N
Pearson r
Spearman ρ
Mean bias
RMSE
MAE
Gemini 2.0 Flash
44
0.351
0.321
-5.0
14.4
11.6
GPT-5 Pro
45
0.342
0.517
+6.4
15.4
10.3
GPT-5.2 Pro
9
0.333
0.471
-2.7
19.0
14.7
Claude Sonnet 4
10
0.197
0.272
+1.5
9.1
6.3
GPT-4o-mini
45
0.135
0.116
+0.0
20.2
13.1
Inter-Rater Agreement (Krippendorff’s Alpha)
Krippendorff’s alpha measures agreement among multiple raters, accounting for chance agreement. Values above 0.8 indicate good agreement; 0.67–0.8 is acceptable for exploratory research. Ceiling effects apply because human evaluators are not perfectly aligned.
Show code
# Install irr if needed: install.packages("irr")if (requireNamespace("irr", quietly =TRUE)) {library(irr)# Prepare data: each row = paper, each column = rater (human mean + each LLM) compute_alpha <-function(criterion) {# Human mean per paper human_crit <- metrics_human |>filter(criteria == criterion, paper %in% matched_papers) |>group_by(paper) |>summarise(Human =mean(mid, na.rm =TRUE), .groups ="drop")# LLM ratings per paper llm_crit <- llm_metrics |>filter(criteria == criterion, paper %in% matched_papers) |>select(paper, model, midpoint) |>pivot_wider(names_from = model, values_from = midpoint)# Combine combined <- human_crit |>inner_join(llm_crit, by ="paper") |>select(-paper)if (nrow(combined) <3||ncol(combined) <2) return(NA_real_)# Krippendorff's alpha (interval scale)tryCatch({ ka <-kripp.alpha(t(as.matrix(combined)), method ="interval") ka$value }, error =function(e) NA_real_) } criteria_list <-c("overall", "claims", "methods", "adv_knowledge","logic_comms", "open_sci", "gp_relevance") alpha_results <-tibble(Criterion =c("Overall", "Claims", "Methods", "Adv. Knowledge","Logic & Comms", "Open Science", "Global Relevance"),criterion_key = criteria_list ) |>rowwise() |>mutate(`Krippendorff's α`=compute_alpha(criterion_key),`α (formatted)`=ifelse(is.na(`Krippendorff's α`), "—",sprintf("%.3f", `Krippendorff's α`)),Interpretation =case_when(is.na(`Krippendorff's α`) ~"Insufficient data",`Krippendorff's α`>=0.8~"Good",`Krippendorff's α`>=0.67~"Acceptable",`Krippendorff's α`>=0.4~"Fair",TRUE~"Poor" ) ) |>ungroup() |>select(Criterion, `α (formatted)`, Interpretation) |>rename(`Krippendorff's α`=`α (formatted)`) knitr::kable(alpha_results, align =c("l", "r", "l"))} else {cat("*Install the `irr` package for Krippendorff's alpha: `install.packages('irr')`*\n")}
Table 3.3: Inter-rater agreement: Krippendorff’s alpha by criterion
Criterion
Krippendorff's α
Interpretation
Overall
0.221
Poor
Claims
0.335
Poor
Methods
0.192
Poor
Adv. Knowledge
0.144
Poor
Logic & Comms
0.021
Poor
Open Science
0.093
Poor
Global Relevance
0.189
Poor
Human-Human vs Human-LLM Agreement (Baseline Context)3
To interpret human-LLM agreement, we must compare against human-human agreement as an upper bound. If human evaluators only agree with each other at α = 0.5, then expecting LLMs to achieve α = 0.8 with humans would be unrealistic. The table below shows both:
αHH: Krippendorff’s alpha among human evaluators (treating each evaluator as a separate rater)
αHL: Krippendorff’s alpha between human mean and each LLM model
Show code
if (requireNamespace("irr", quietly =TRUE)) {library(irr) criteria_list <-c("overall", "claims", "methods", "adv_knowledge","logic_comms", "open_sci", "gp_relevance") criterion_labels <-c(overall ="Overall", claims ="Claims", methods ="Methods",adv_knowledge ="Adv. Knowledge", logic_comms ="Logic & Comms",open_sci ="Open Science", gp_relevance ="Global Relevance" )# Compute human-human alpha (each evaluator as separate rater) compute_hh_alpha <-function(criterion) { human_wide <- metrics_human |>filter(criteria == criterion, paper %in% matched_papers) |>select(paper, evaluator, mid) |>distinct() |>pivot_wider(names_from = paper, values_from = mid)if (nrow(human_wide) <2||ncol(human_wide) <3) return(NA_real_) M <-as.matrix(human_wide[, -1, drop =FALSE])tryCatch({ irr::kripp.alpha(M, method ="interval")$value }, error =function(e) NA_real_) }# Compute human-LLM alpha for a specific model compute_hl_alpha <-function(criterion, model_name) { human_mean <- metrics_human |>filter(criteria == criterion, paper %in% matched_papers) |>group_by(paper) |>summarise(Human =mean(mid, na.rm =TRUE), .groups ="drop") llm_rating <- llm_metrics |>filter(criteria == criterion, model == model_name, paper %in% matched_papers) |>select(paper, midpoint) |>rename(LLM = midpoint) combined <-inner_join(human_mean, llm_rating, by ="paper")if (nrow(combined) <3) return(NA_real_) M <-rbind(Human = combined$Human, LLM = combined$LLM)tryCatch({ irr::kripp.alpha(M, method ="interval")$value }, error =function(e) NA_real_) }# Get unique LLM models llm_models <-unique(llm_metrics$model)# Build comparison table baseline_results <-tibble(Criterion = criterion_labels[criteria_list]) |>mutate(criterion_key = criteria_list) |>rowwise() |>mutate(`α_HH`=compute_hh_alpha(criterion_key) ) |>ungroup()# Add columns for each LLM modelfor (mod in llm_models) { col_name <-paste0("α_HL (", mod, ")") baseline_results <- baseline_results |>rowwise() |>mutate(!!col_name :=compute_hl_alpha(criterion_key, mod)) |>ungroup() }# Format for display baseline_display <- baseline_results |>select(-criterion_key) |>mutate(across(where(is.numeric), ~ifelse(is.na(.x), "—", sprintf("%.2f", .x)))) knitr::kable(baseline_display, align =c("l", rep("r", ncol(baseline_display) -1)))}
Table 3.4: Human-human vs Human-LLM agreement by criterion (Krippendorff’s α)
Criterion
α_HH
α_HL (GPT-5 Pro)
α_HL (GPT-5.2 Pro)
α_HL (GPT-4o-mini)
α_HL (Claude Sonnet 4)
α_HL (Gemini 2.0 Flash)
Overall
0.55
0.25
0.36
0.14
0.14
0.26
Claims
0.42
0.20
0.26
0.19
0.21
0.18
Methods
0.54
0.29
0.07
0.03
-0.22
0.16
Adv. Knowledge
0.20
0.18
0.20
0.13
0.08
0.07
Logic & Comms
0.33
-0.04
-0.19
-0.00
0.25
-0.05
Open Science
0.03
-0.06
-0.61
-0.16
0.30
0.01
Global Relevance
0.33
0.25
0.51
0.04
-0.20
0.18
Interpretation: If αHL approaches or exceeds αHH, the LLM is performing at or above the level of agreement humans achieve with each other—a meaningful benchmark for “human-level” evaluation consistency.
LLMs report 90% credible intervals alongside their point estimates. If these intervals are well-specified, approximately 90% of human mean ratings should fall within them. Lower coverage suggests the LLM is overconfident; higher coverage suggests unnecessary uncertainty.
Show code
# Compute CI coverage: % of human means within LLM [lower_bound, upper_bound]compute_coverage <-function(criterion, model_name) { human_mean <- metrics_human |>filter(criteria == criterion, paper %in% matched_papers) |>group_by(paper) |>summarise(human_mid =mean(mid, na.rm =TRUE), .groups ="drop") llm_ci <- llm_metrics |>filter(criteria == criterion, model == model_name, paper %in% matched_papers) |>select(paper, lower_bound, upper_bound) combined <-inner_join(human_mean, llm_ci, by ="paper") |>filter(!is.na(lower_bound), !is.na(upper_bound))if (nrow(combined) <3) return(NA_real_)# Check if human mean falls within LLM CI covered <- combined |>mutate(in_ci = human_mid >= lower_bound & human_mid <= upper_bound) |>summarise(coverage =mean(in_ci, na.rm =TRUE)) |>pull(coverage) covered}criteria_list <-c("overall", "claims", "methods", "adv_knowledge","logic_comms", "open_sci", "gp_relevance")criterion_labels <-c(overall ="Overall", claims ="Claims", methods ="Methods",adv_knowledge ="Adv. Knowledge", logic_comms ="Logic & Comms",open_sci ="Open Science", gp_relevance ="Global Relevance")llm_models <-unique(llm_metrics$model)coverage_results <-tibble(Criterion = criterion_labels[criteria_list]) |>mutate(criterion_key = criteria_list)for (mod in llm_models) { col_name <- mod coverage_results <- coverage_results |>rowwise() |>mutate(!!col_name :=compute_coverage(criterion_key, mod)) |>ungroup()}# Format for displaycoverage_display <- coverage_results |>select(-criterion_key) |>mutate(across(where(is.numeric), ~ifelse(is.na(.x), "—", scales::percent(.x, accuracy =1))))knitr::kable(coverage_display, align =c("l", rep("r", ncol(coverage_display) -1)))
Table 3.5: Share of human mean ratings within LLM 90% credible intervals
Criterion
GPT-5 Pro
GPT-5.2 Pro
GPT-4o-mini
Claude Sonnet 4
Gemini 2.0 Flash
Overall
64%
44%
49%
70%
66%
Claims
39%
44%
44%
43%
56%
Methods
59%
22%
55%
40%
60%
Adv. Knowledge
55%
44%
52%
60%
65%
Logic & Comms
36%
44%
62%
50%
68%
Open Science
33%
33%
40%
60%
48%
Global Relevance
49%
89%
44%
60%
59%
Interpretation: Coverage near 90% indicates well-specified uncertainty. Coverage below 70% suggests systematic overconfidence; coverage above 95% may indicate overly wide intervals that provide little information.
To assess whether observed correlations are statistically meaningful and to compare models, we compute bootstrap 95% confidence intervals for Pearson correlations. This nonparametric approach makes no distributional assumptions and accounts for the small sample sizes typical in evaluation studies.
Show code
set.seed(42) # reproducibility# Bootstrap function for correlationboot_cor <-function(data, n_boot =1000) { cors <-numeric(n_boot) n <-nrow(data)for (i in1:n_boot) { idx <-sample(1:n, n, replace =TRUE)# Use tryCatch to handle edge cases (e.g., constant values in sample) cors[i] <-tryCatch(cor(data$human[idx], data$llm[idx], use ="complete.obs"),error =function(e) NA_real_,warning =function(w) NA_real_ ) }# Remove NAs before computing quantiles cors_valid <- cors[!is.na(cors)]if (length(cors_valid) < n_boot *0.5) {# Too many failed bootstraps - return NAsreturn(c(estimate =NA_real_, ci_lo =NA_real_, ci_hi =NA_real_)) }c(estimate =cor(data$human, data$llm, use ="complete.obs"),ci_lo =quantile(cors_valid, 0.025, na.rm =TRUE),ci_hi =quantile(cors_valid, 0.975, na.rm =TRUE))}# Prepare data: overall rating comparisonhuman_overall <- metrics_human |>filter(criteria =="overall", paper %in% matched_papers) |>group_by(paper) |>summarise(human =mean(mid, na.rm =TRUE), .groups ="drop")llm_models <-unique(llm_metrics$model)boot_results <-map_dfr(llm_models, function(mod) { llm_overall <- llm_metrics |>filter(criteria =="overall", model == mod, paper %in% matched_papers) |>group_by(paper) |>summarise(llm =mean(midpoint, na.rm =TRUE), .groups ="drop") combined <-inner_join(human_overall, llm_overall, by ="paper")if (nrow(combined) >=5) { res <-boot_cor(combined) ci_lo <- res["ci_lo.2.5%"] ci_hi <- res["ci_hi.97.5%"]# Handle different quantile naming conventionsif (is.na(ci_lo)) ci_lo <- res["ci_lo"]if (is.na(ci_hi)) ci_hi <- res["ci_hi"]tibble(Model = mod,n =nrow(combined),r = res["estimate"],`95% CI`=if (is.na(ci_lo) ||is.na(ci_hi)) "—"elsesprintf("[%.2f, %.2f]", ci_lo, ci_hi) ) } else {tibble(Model = mod, n =nrow(combined), r =NA_real_, `95% CI`="—") }})if (nrow(boot_results) >0) { boot_results <- boot_results |>mutate(r =ifelse(is.na(r), "—", sprintf("%.2f", r))) |>arrange(desc(as.numeric(ifelse(r =="—", "-1", r)))) knitr::kable(boot_results, align =c("l", "r", "r", "r"))}
Table 3.6: Bootstrap 95% CIs for human-LLM correlation (overall ratings)
Model
n
r
95% CI
Gemini 2.0 Flash
44
0.35
[0.10, 0.56]
GPT-5 Pro
45
0.34
[0.09, 0.59]
GPT-5.2 Pro
9
0.33
[-0.47, 0.88]
Claude Sonnet 4
10
0.20
[-0.66, 0.94]
GPT-4o-mini
45
0.13
[-0.15, 0.46]
Interpretation: Non-overlapping confidence intervals suggest statistically distinguishable performance. Wide intervals reflect uncertainty due to limited sample size—a common constraint in evaluation research where obtaining more human judgments is costly.
Beyond correlation, we can ask: how much information do LLM ratings convey about human ratings? We use mutual information (MI), a fundamental concept from information theory (Shannon 1948; Cover and Thomas 2006).
What is mutual information? MI quantifies the reduction in uncertainty about one variable when you learn the value of another. Formally, MI(X;Y) = H(X) - H(X|Y), where H is entropy (average surprise/uncertainty). In plain terms:
Before seeing the LLM rating, you have some uncertainty about what the human rating will be
After seeing the LLM rating, how much does your uncertainty decrease?
MI measures this decrease, in bits (the same unit used in data compression and communication)
Why use MI instead of (or alongside) correlation?
Property
Correlation (r)
Mutual Information
Captures linear relationships
Yes
Yes
Captures nonlinear relationships
No
Yes
Detects threshold effects
No
Yes
Scale
-1 to +1
0 to ∞ bits
Sensitive to outliers
Yes
Less so
A practical example: suppose the LLM predicts human ratings perfectly when the paper is methodology-focused, but randomly when it’s theory-focused. Correlation might show r = 0.5 (moderate), but MI would reveal this is inconsistent performance rather than uniformly moderate performance. This distinction matters for deciding when to trust LLM evaluations.
How to interpret MI values (for 5-bin discretization):
0 bits: Independence—LLM ratings tell you nothing about human ratings
~1.0 bits: Moderate association—knowing LLM rating roughly halves your uncertainty
~1.5 bits: Strong association—substantial predictive value
2.32 bits (maximum): Perfect prediction—the LLM quintile perfectly determines the human quintile
We discretize ratings into quintiles (5 bins) because MI is formally defined for discrete distributions. This preserves ordinal structure while avoiding overfitting to noise in continuous ratings.
Table 3.7: Mutual information between human and LLM overall ratings
Model
n
MI (bits)
Normalized
GPT-5 Pro
45
0.60
26%
GPT-5.2 Pro
9
1.61
70%
GPT-4o-mini
45
0.30
13%
Claude Sonnet 4
10
0.77
33%
Gemini 2.0 Flash
44
0.14
6%
Interpretation: Normalized MI above 50% indicates substantial predictive value. If MI is low despite moderate correlation, investigate which paper characteristics the LLM struggles with—this points toward targeted improvements or hybrid human-AI workflows where humans handle cases the LLM finds difficult.
Evaluation Priorities: What Predicts Journal Tier Judgments?7
How do evaluators translate quality assessments into journal tier predictions? By correlating each criterion with “where should this publish?” judgments, we can see which factors each evaluator type weighs most heavily. Divergence here reveals systematic differences in evaluation priorities—what we might call “taste.”
Show code
# Load human tier data if availablehuman_tiers_file <-"data/all_jtiers.csv"if (file.exists(human_tiers_file)) { human_tiers_raw <-read_csv(human_tiers_file, show_col_types =FALSE) |>clean_names()# Check what columns are available and find the tier prediction column# Typically 'merits_journal' or similar tier_col <-intersect(names(human_tiers_raw), c("merits_journal", "journal_tier", "tier_should", "mid"))if (length(tier_col) >0) {# Prepare human tier data human_tier_data <- human_tiers_raw |>filter(!is.na(.data[[tier_col[1]]])) |>rename(tier_should =!!tier_col[1])# Get paper identifier column paper_col <-intersect(names(human_tier_data), c("label_paper", "paper", "research"))if (length(paper_col) >0) { human_tier_data <- human_tier_data |>rename(paper =!!paper_col[1]) } metrics_to_cor <-c("overall", "claims", "methods", "adv_knowledge","logic_comms", "open_sci", "gp_relevance")# === Human Correlations === human_wide <- metrics_human |>filter(criteria %in% metrics_to_cor, paper %in% matched_papers) |>group_by(paper, criteria) |>summarise(mid =mean(mid, na.rm =TRUE), .groups ="drop") |>pivot_wider(names_from = criteria, values_from = mid)# Join with human tier predictions human_tier_match <- human_tier_data |>group_by(paper) |>summarise(tier_should =mean(tier_should, na.rm =TRUE), .groups ="drop") human_cors_data <- human_wide |>inner_join(human_tier_match, by ="paper")if (nrow(human_cors_data) >=5) { cors_human <-tibble(metric = metrics_to_cor,correlation =sapply(metrics_to_cor, function(m) {if (m %in%names(human_cors_data) &&!all(is.na(human_cors_data[[m]]))) {cor(human_cors_data[[m]], human_cors_data$tier_should, use ="pairwise.complete.obs") } elseNA_real_ }),source ="Human" ) } else { cors_human <-tibble(metric =character(), correlation =numeric(), source =character()) }# === LLM Correlations === llm_wide <- llm_metrics |>filter(criteria %in% metrics_to_cor, model == primary_model, paper %in% matched_papers) |>group_by(paper, criteria) |>summarise(mid =mean(midpoint, na.rm =TRUE), .groups ="drop") |>pivot_wider(names_from = criteria, values_from = mid) llm_tier_match <- llm_tiers |>filter(tier_kind =="tier_should", model == primary_model) |>group_by(paper) |>summarise(tier_should =mean(score, na.rm =TRUE), .groups ="drop") llm_cors_data <- llm_wide |>inner_join(llm_tier_match, by ="paper")if (nrow(llm_cors_data) >=5) { cors_llm <-tibble(metric = metrics_to_cor,correlation =sapply(metrics_to_cor, function(m) {if (m %in%names(llm_cors_data) &&!all(is.na(llm_cors_data[[m]]))) {cor(llm_cors_data[[m]], llm_cors_data$tier_should, use ="pairwise.complete.obs") } elseNA_real_ }),source ="LLM" ) } else { cors_llm <-tibble(metric =character(), correlation =numeric(), source =character()) }# === Combine and plot === cors_combined <-bind_rows(cors_human, cors_llm) |>filter(!is.na(correlation)) |>mutate(metric_label =case_when( metric =="overall"~"Overall", metric =="claims"~"Claims & Evidence", metric =="methods"~"Methods", metric =="adv_knowledge"~"Advancing Knowledge", metric =="logic_comms"~"Logic & Communication", metric =="open_sci"~"Open Science", metric =="gp_relevance"~"Global Relevance",TRUE~ metric ) )if (nrow(cors_combined) >0) {# Order by average correlation metric_order_cors <- cors_combined |>group_by(metric_label) |>summarise(avg_cor =mean(correlation, na.rm =TRUE), .groups ="drop") |>arrange(desc(avg_cor)) |>pull(metric_label) cors_wide <- cors_combined |>mutate(metric_label =factor(metric_label, levels = metric_order_cors)) |>pivot_wider(names_from = source, values_from = correlation)# Dumbbell plotggplot(cors_wide, aes(y = metric_label)) +geom_segment(aes(x = Human, xend = LLM, yend = metric_label),color ="gray50", linewidth =1.2, alpha =0.4) +geom_point(aes(x = Human), color = UJ_GREEN, size =5, alpha =0.9) +geom_point(aes(x = LLM), color = UJ_ORANGE, size =5, alpha =0.9) +geom_vline(xintercept =0, linetype ="dashed", color ="gray40") +geom_text(aes(x = Human, label =sprintf("%.2f", Human)),hjust =1.4, size =3.5, color = UJ_GREEN, fontface ="bold") +geom_text(aes(x = LLM, label =sprintf("%.2f", LLM)),hjust =-0.4, size =3.5, color = UJ_ORANGE, fontface ="bold") +scale_x_continuous(limits =c(min(c(cors_wide$Human, cors_wide$LLM), na.rm =TRUE) -0.15,max(c(cors_wide$Human, cors_wide$LLM), na.rm =TRUE) +0.15) ) +labs(x ="Correlation with 'Where should this publish?'",y =NULL,subtitle =paste0("Green = Human, Orange = ", primary_model) ) +theme_uj() +theme(panel.grid.major.x =element_line(color ="grey90"),panel.grid.major.y =element_blank(),axis.text.y =element_text(size =11) ) } else {cat("Insufficient correlation data for dumbbell plot.\n") } } else {cat("Tier prediction column not found in human data.\n") }} else {cat("Human tier data file not found.\n")}
Tier prediction column not found in human data.
Interpretation: Large divergences indicate different evaluation philosophies. For example, if humans weight “open science” heavily but LLMs don’t, this suggests the LLM may undervalue reproducibility when forming overall judgments.
Systematic Rating Differences by Paper and Criterion8
The heatmap below shows Human − LLM differences for each paper and criterion. Green indicates humans rated higher than the LLM; orange indicates the LLM rated higher. Papers are ordered by their overall rating difference.
Show code
metric_order <-c("overall", "claims", "methods", "adv_knowledge","logic_comms", "open_sci", "gp_relevance")metric_lab <-c(overall ="Overall",claims ="Claims & Evidence",methods ="Methods",adv_knowledge ="Adv. Knowledge",logic_comms ="Logic & Comms",open_sci ="Open Science",gp_relevance ="Global Relevance")# Use the first/primary LLM model for comparison (GPT-5 Pro if available)primary_model <-if ("GPT-5 Pro"%in%unique(llm_metrics$model)) "GPT-5 Pro"elseunique(llm_metrics$model)[1]H_mean <- metrics_human |>filter(criteria %in% metric_order, paper %in% matched_papers) |>group_by(paper, criteria) |>summarise(h =mean(mid, na.rm =TRUE), .groups ="drop")L_mean <- llm_metrics |>filter(criteria %in% metric_order, model == primary_model, paper %in% matched_papers) |>group_by(paper, criteria) |>summarise(l =mean(midpoint, na.rm =TRUE), .groups ="drop")Ddiff <-inner_join(H_mean, L_mean, by =c("paper", "criteria")) |>mutate(diff = h - l) |>mutate(crit =factor(criteria, levels = metric_order, labels = metric_lab[metric_order]))# Order papers by overall difference (Human−LLM)ord_p <- Ddiff |>filter(criteria =="overall") |>arrange(desc(diff)) |>pull(paper)if (length(ord_p) >0&&nrow(Ddiff) >0) {ggplot(Ddiff, aes(x =factor(paper, levels = ord_p), y = crit, fill = diff)) +geom_tile(color ="white", linewidth =0.25) +scale_fill_gradient2(low = UJ_ORANGE, mid ="grey95", high = UJ_GREEN, midpoint =0,name ="Human − LLM",limits =c(-30, 30),oob = scales::squish ) +labs(x =NULL, y =NULL,subtitle =paste0("Primary model: ", primary_model)) +theme_uj() +theme(axis.text.x =element_text(angle =60, hjust =1, vjust =1, size =7),axis.text.y =element_text(size =11),panel.grid =element_blank() )} else {cat("Insufficient data for gap heatmap.\n")}
Figure 3.6: Human − LLM rating differences by paper and criterion
This visualization helps identify: (1) papers with systematic disagreement across all criteria, (2) criteria where humans and LLMs consistently diverge, and (3) potential patterns related to paper characteristics.
Understanding why specific papers generate disagreement can inform both LLM limitations and potential human biases. The table below shows papers where humans and LLMs diverged most.
Table 3.9: Papers with largest human vs. LLM rating differences
Paper
Human Rating
LLM Rating
Difference
Direction
Bruers 2021
80
46
+34
Human > LLM
Bhattacharya and Packalen 2020
70
58
+12
Human > LLM
Blimpo and Castaneda-Dower 2025
80
71
+9
Human > LLM
Epperson and Gerster 2024
89
82
+7
Human > LLM
Chuard et al. 2022
80
75
+5
Human > LLM
Bergeron et al. 2024
40
85
-45
LLM > Human
Bettle 2023
32
75
-42
LLM > Human
Williams et al. 2024
50
87
-37
LLM > Human
Frech et al. 2023
40
70
-30
LLM > Human
Green et al. 2025
57
80
-23
LLM > Human
These outliers merit qualitative investigation: examining the paper characteristics, human evaluator comments, and LLM rationales may reveal systematic patterns in what drives disagreement.
Note: GPT-5 Pro evaluations include extended reasoning traces. See the Appendix for full assessment summaries and reasoning traces per paper.
Prompt Comparison: Legacy vs Updated GPT-5 Pro
We compare GPT-5 Pro evaluations between two prompt versions:
Legacy (Oct 2024): Original combined schema without assessment summary
Updated (Jan 2026): Modular prompt with explicit diagnostic assessment summary
Table 3.10: Summary statistics: Legacy vs Updated prompt
Criterion
Legacy Mean
Updated Mean
Mean Diff
SD Diff
Adv. Knowledge
84.5
80.4
-4.1
4.8
Claims
83.2
77.5
-5.7
4.8
Global Relevance
85.8
82.6
-3.2
6.3
Logic & Comms
86.4
83.6
-2.8
4.8
Methods
81.3
73.6
-7.7
4.6
Open Science
67.9
54.8
-13.1
8.9
Overall
86.0
80.4
-5.6
4.4
The updated prompt with explicit diagnostic assessment appears to produce systematically different ratings. The correlation between prompt versions indicates the degree of consistency across the prompt change.
Cover, Thomas M, and Joy A Thomas, “Elements of information theory,” (Wiley-Interscience, 2006).
Shannon, Claude E, “A mathematical theory of communication,”The Bell System Technical Journal, 27 (1948), 379–423.
We submit each PDF through a single, schema-enforced Responses call. The system prompt mirrors the Unjournal rubric, blocks extrinsic priors (authors, venue, web context), and requires strict JSON: a ~1k-word diagnostic summary plus numeric midpoints and 90% credible intervals for every metric. We enforce well-formed intervals (lower < midpoint < upper for 0–100 metrics; ci_lower < score < ci_upper for 0–5 tiers). GPT-5 Pro runs with reasoning effort = “high”, so traces are available for audit.↩︎
0–100 percentiles relative to a reference group of ~‘all serious work you have read in this area in the last three years’, overall (global quality/impact), claims_evidence (claim clarity/support), methods (design/identification/robustness), advancing_knowledge (contribution to field/practice), logic_communication (internal consistency and exposition), open_science (reproducibility, data/code availability), abd global_relevance (decision and priority usefulness). Journal tiers: (0–5, continuous) capture where the work should publish vs. will publish.↩︎
This section was added with AI assistance to address feedback about contextualizing agreement metrics.↩︎
This analysis addresses feedback about LLM uncertainty quantification.↩︎
Bootstrap inference was added with AI assistance to quantify uncertainty in agreement metrics.↩︎
Information-theoretic interpretation added with AI assistance.↩︎
This analysis was added with AI assistance to reveal differences in what humans vs. LLMs emphasize when predicting journal outcomes.↩︎
This visualization was added with AI assistance to identify patterns in human-LLM disagreement.↩︎
This table was added with AI assistance to facilitate qualitative investigation of outliers.↩︎
Source Code
---title: "Results: Ratings"format: html: defaultengine: knitr---```{r}#| label: setup#| code-summary: "Setup and libraries"#| code-fold: true#| message: false#| warning: falsesource("setup_params.R")library("tidyverse")library("janitor")library("stringr")library("here")library("knitr")library("kableExtra")library("ggrepel")library("scales")library("jsonlite")library("purrr")library("tibble")library("plotly")# Theme and colorsUJ_ORANGE <-"#f19e4b"# LLM primaryUJ_GREEN <-"#99bb66"# HumanUJ_BLUE <-"#4e79a7"MODEL_COLORS <-c("GPT-5 Pro"="#f19e4b","GPT-5.2 Pro"="#c9302c","GPT-4o-mini"="#74aa9c","Claude Sonnet 4"="#d4a574","Gemini 2.0 Flash"="#7986cb","Human"="#99bb66")theme_uj <-function(base_size =11) {theme_minimal(base_size = base_size) +theme(panel.grid.minor =element_blank(),plot.title.position ="plot",legend.position ="bottom" )}canon_metric <-function(x) dplyr::recode( x,"advancing_knowledge"="adv_knowledge","open_science"="open_sci","logic_communication"="logic_comms","global_relevance"="gp_relevance","claims_evidence"="claims",.default = x)`%||%`<-function(x, y) if (!is.null(x)) x else y# Pricing per 1M tokens (USD)pricing <-tribble(~model, ~input_per_m, ~output_per_m,"GPT-5 Pro", 15.00, 60.00,"GPT-5.2 Pro", 15.00, 60.00,"GPT-4o-mini", 0.15, 0.60,"Claude Sonnet 4", 3.00, 15.00,"Gemini 2.0 Flash", 0.075, 0.30)``````{r}#| label: load-human-data#| code-fold: true#| code-summary: "Load human evaluation data"#| message: false# Load human ratings from UnjournalUJmap <-read_delim("data/UJ_map.csv", delim =";", show_col_types =FALSE) |>mutate(label_paper_title = research, label_paper = paper) |>select(c("label_paper_title", "label_paper"))rsx <-read_csv("data/rsx_evalr_rating.csv", show_col_types =FALSE) |>clean_names() |>mutate(label_paper_title = research) |>select(-c("research"))research <-read_csv("data/research.csv", show_col_types =FALSE) |>clean_names() |>filter(status =="50_published evaluations (on PubPub, by Unjournal)") |>left_join(UJmap, by =c("label_paper_title")) |>mutate(doi =str_trim(doi)) |>mutate(label_paper =case_when( doi =="https://doi.org/10.3386/w31162"~"Walker et al. 2023", doi =="doi.org/10.3386/w32728"~"Hahn et al. 2025", doi =="https://doi.org/10.3386/w30011"~"Bhat et al. 2022", doi =="10.1093/wbro/lkae010"~"Crawfurd et al. 2023",TRUE~ label_paper )) |>left_join(rsx, by =c("label_paper_title"))key_map <- research |>transmute(label_paper_title =str_trim(label_paper_title), label_paper = label_paper) |>filter(!is.na(label_paper_title)) |>distinct(label_paper_title, label_paper) |>group_by(label_paper_title) |>slice(1) |>ungroup()rsx_research <- rsx |>mutate(label_paper_title =str_trim(label_paper_title)) |>left_join(key_map, by ="label_paper_title", relationship ="many-to-one")metrics_human <- rsx_research |>mutate(criteria =canon_metric(criteria)) |>filter(criteria %in%c("overall", "claims", "methods", "adv_knowledge", "logic_comms", "open_sci", "gp_relevance")) |>transmute(paper = label_paper, criteria, evaluator, model ="Human",mid =as.numeric(middle_rating),lo =suppressWarnings(as.numeric(lower_ci)),hi =suppressWarnings(as.numeric(upper_ci)) ) |>filter(!is.na(paper), !is.na(mid)) |>mutate(lo =ifelse(is.finite(lo), pmax(0, pmin(100, lo)), NA_real_),hi =ifelse(is.finite(hi), pmax(0, pmin(100, hi)), NA_real_) ) |>mutate(across(c(mid, lo, hi), ~round(.x, 4))) |>distinct(paper, criteria, model, evaluator, mid, lo, hi)human_avg <- metrics_human |>filter(criteria =="overall") |>group_by(paper) |>summarise(human_mid =mean(mid, na.rm =TRUE),human_lo =mean(lo, na.rm =TRUE),human_hi =mean(hi, na.rm =TRUE),n_human =n(),.groups ="drop" )# Summary stats for inline usen_human_papers <-n_distinct(metrics_human$paper)n_human_evaluators <-n_distinct(metrics_human$evaluator)``````{r}#| label: load-llm-data#| code-fold: true#| code-summary: "Load LLM evaluation data (all models)"#| message: falsemodel_dirs <-list("gpt5_pro_updated_jan2026"="GPT-5 Pro","gpt52_pro_focal_jan2026"="GPT-5.2 Pro","gpt_4o_mini_2024_07_18"="GPT-4o-mini","claude_sonnet_4_20250514"="Claude Sonnet 4","gemini_2.0_flash"="Gemini 2.0 Flash")parse_response <-function(path, model_name) {tryCatch({ r <- jsonlite::fromJSON(path, simplifyVector =FALSE) paper <-basename(path) |>str_replace("\\.response\\.json$", "") |>str_replace_all("_", " ") parsed <-NULLif (!is.null(r$output_text) &&nchar(r$output_text) >0) { parsed <- jsonlite::fromJSON(r$output_text, simplifyVector =TRUE) } elseif (!is.null(r$parsed)) { parsed <- r$parsed } elseif (!is.null(r$output)) { msg <- purrr::detect(r$output, ~ .x$type =="message", .default =NULL)if (!is.null(msg) &&length(msg$content) >0) { parsed <- jsonlite::fromJSON(msg$content[[1]]$text, simplifyVector =TRUE) } }if (is.null(parsed)) return(NULL) metrics <- parsed$metrics metric_rows <-list() tier_rows <-list()for (nm innames(metrics)) {if (nm %in%c("tier_should", "tier_will")) { tier_rows[[length(tier_rows) +1]] <-tibble(paper = paper, model = model_name, tier_kind = nm,score = metrics[[nm]]$score,ci_lower = metrics[[nm]]$ci_lower,ci_upper = metrics[[nm]]$ci_upper ) } else { metric_rows[[length(metric_rows) +1]] <-tibble(paper = paper, model = model_name, metric = nm,midpoint = metrics[[nm]]$midpoint,lower_bound = metrics[[nm]]$lower_bound,upper_bound = metrics[[nm]]$upper_bound ) } } input_tok <- r$usage$input_tokens %||% r$input_tokens output_tok <- r$usage$output_tokens %||% r$output_tokens reasoning_tok <- r$usage$output_tokens_details$reasoning_tokenslist(metrics =bind_rows(metric_rows),tiers =bind_rows(tier_rows),tokens =tibble(paper = paper, model = model_name,input_tokens = input_tok %||%NA_integer_,output_tokens = output_tok %||%NA_integer_,reasoning_tokens = reasoning_tok %||%NA_integer_ ),summary =tibble(paper = paper, model = model_name,assessment_summary = parsed$assessment_summary ) ) }, error =function(e) NULL)}load_all_llm <-function() { all_metrics <-list() all_tiers <-list() all_tokens <-list() all_summaries <-list()for (dir_name innames(model_dirs)) { model_name <- model_dirs[[dir_name]] json_dir <-here("results", dir_name, "json")if (dir.exists(json_dir)) { files <-list.files(json_dir, pattern ="\\.response\\.json$", full.names =TRUE)for (f in files) { result <-parse_response(f, model_name)if (!is.null(result)) { all_metrics[[length(all_metrics) +1]] <- result$metrics all_tiers[[length(all_tiers) +1]] <- result$tiers all_tokens[[length(all_tokens) +1]] <- result$tokens all_summaries[[length(all_summaries) +1]] <- result$summary } } } }list(metrics =bind_rows(all_metrics) |>mutate(criteria =canon_metric(metric)),tiers =bind_rows(all_tiers),tokens =bind_rows(all_tokens),summaries =bind_rows(all_summaries) )}llm_data <-load_all_llm()llm_metrics <- llm_data$metricsllm_tiers <- llm_data$tiersllm_tokens <- llm_data$tokensllm_summaries <- llm_data$summaries# Summary stats for inline usen_llm_models <-n_distinct(llm_metrics$model)n_llm_papers <-n_distinct(llm_metrics$paper)llm_model_names <-unique(llm_metrics$model)llm_model_list <-paste(llm_model_names, collapse =", ")# Papers with both human and LLM datamatched_papers <-intersect(unique(metrics_human$paper),unique(llm_metrics$paper))n_matched <-length(matched_papers)# Cost summarycost_data <- llm_tokens |>left_join(pricing, by ="model") |>mutate(reasoning_tokens =coalesce(reasoning_tokens, 0L),cost_usd = (input_tokens * input_per_m + output_tokens * output_per_m) /1e6 )total_cost <-sum(cost_data$cost_usd, na.rm =TRUE)total_evaluations <-nrow(llm_tokens)# Per-model summariesmodel_summary <- cost_data |>group_by(model) |>summarise(n_papers =n(),avg_input =round(mean(input_tokens, na.rm =TRUE)),avg_output =round(mean(output_tokens, na.rm =TRUE)),avg_reasoning =round(mean(reasoning_tokens[reasoning_tokens >0], na.rm =TRUE)),total_cost =sum(cost_usd, na.rm =TRUE),cost_per_paper =mean(cost_usd, na.rm =TRUE),.groups ="drop" )```We compare LLM-generated research evaluations against human expert reviews from [The Unjournal](https://unjournal.pubpub.org). This analysis covers `r n_llm_papers` papers evaluated by `r n_llm_models` frontier LLMs (`r llm_model_list`), with `r n_matched` papers having both human and LLM evaluations for direct comparison. Human evaluations come from `r n_human_evaluators` expert reviewers across `r n_human_papers` papers in The Unjournal's database.## Evaluation OverviewThe previous [Methods](methods.qmd)) chapter explains our process and pipeline.[^results_ratings-1] This generally mirrors The Unjournal's rubric and [guidelines](https://globalimpact.gitbook.io/the-unjournal-project-and-communication-space/policies-projects-evaluation-workflow/evaluation/guidelines-for-evaluators), requesting a range of percentile metrics and normative and predictive 'journal tiers' metrics.^[0–100 percentiles relative to a reference group of ~'all serious work you have read in this area in the last three years', overall (global quality/impact), claims_evidence (claim clarity/support), methods (design/identification/robustness), advancing_knowledge (contribution to field/practice), logic_communication (internal consistency and exposition), open_science (reproducibility, data/code availability), abd global_relevance (decision and priority usefulness). Journal tiers: (0–5, continuous) capture where the work should publish vs. will publish.][^results_ratings-1]: We submit each PDF through a single, schema-enforced Responses call. The system prompt mirrors the Unjournal rubric, blocks extrinsic priors (authors, venue, web context), and requires strict JSON: a \~1k-word diagnostic summary plus numeric midpoints and 90% credible intervals for every metric. We enforce well-formed intervals (lower \< midpoint \< upper for 0–100 metrics; ci_lower \< score \< ci_upper for 0–5 tiers). GPT-5 Pro runs with reasoning effort = "high", so traces are available for audit.Each numeric field carries a midpoint (model’s 50% belief) and a 90% credible interval; wide intervals indicate acknowledged uncertainty.::: callout-note## Future ModelsWe are working to extend this comparison to newer frontier models as they become available and our funding permits, as well as adapting these to bespoke agent-based tools such as Refine.ink.:::### Token Usage and CostToken costs are computed from API-reported input/output tokens (GPT-5 Pro runs with `reasoning` = high). Use the boxplot log scale to compare efficiency frontiers; reasoning tokens are reported when available.```{r}#| label: tbl-cost-overview#| tbl-cap: "Token usage and estimated cost per model"if (nrow(llm_tokens) >0) { cost_table <- model_summary |>transmute(Model = model,Papers = n_papers,`Avg input`= scales::comma(avg_input),`Avg output`= scales::comma(avg_output),`Avg reasoning`=ifelse(is.finite(avg_reasoning) & avg_reasoning >0, scales::comma(avg_reasoning), "—"),`Total cost`= scales::dollar(total_cost, accuracy =0.01),`Cost/paper`= scales::dollar(cost_per_paper, accuracy =0.001) ) |>arrange(desc(as.numeric(gsub("[$,]", "", `Total cost`)))) knitr::kable(cost_table, align =c("l", rep("r", 6)))}```Total cost across all `r total_evaluations` LLM evaluations: **`r scales::dollar(total_cost, accuracy = 0.01)`**.```{r}#| label: fig-cost-box#| fig-cap: "Cost per paper by model (log scale)"#| fig-width: 9#| fig-height: 5if (nrow(cost_data) >0) {ggplot(cost_data, aes(x =reorder(model, cost_usd, FUN = median), y = cost_usd, fill = model)) +geom_boxplot(alpha =0.7, outlier.shape =NA) +geom_jitter(width =0.15, alpha =0.5, size =2) +scale_fill_manual(values = MODEL_COLORS, guide ="none") +scale_y_log10(labels = scales::dollar_format(accuracy =0.001)) +coord_flip() +labs(x =NULL, y ="Cost per paper (USD, log scale)") +theme_uj()}```<!-- DR 7-1-26: Consider overlap between the content above and the methods chapter. We should have some introduction in this chapter but we don't need to talk about procedures in depth. Costs probably does belong here because it's tied to the practical implications. -->------------------------------------------------------------------------## Interactive Forest Plot: Human vs LLM RatingsThe forest plot below shows **overall percentile ratings** for each of the `r n_matched` papers with matched human and LLM evaluations. Human evaluator ratings are shown in green (individual evaluators as small dots, mean as diamond). LLM ratings from different models are shown with distinct colors.Interpretation: points are midpoints, horizontal bars are the model’s 90% credible interval.Hover over any point to see details. Click legend items to show/hide models.```{r}#| label: fig-forest-plotly#| fig-cap: "Interactive forest plot: Human and LLM overall ratings by paper"#| fig-width: 12#| fig-height: 15forest_human <- metrics_human |>filter(criteria =="overall") |>transmute(paper, model ="Human", evaluator, mid, lo, hi, is_mean =FALSE)forest_human_means <- human_avg |>transmute(paper, model ="Human", evaluator ="Mean",mid = human_mid, lo = human_lo, hi = human_hi, is_mean =TRUE)forest_llm <- llm_metrics |>filter(criteria =="overall") |>transmute(paper, model, evaluator = model,mid = midpoint, lo = lower_bound, hi = upper_bound, is_mean =FALSE)forest_data <-bind_rows(forest_human, forest_human_means, forest_llm) |>filter(paper %in% matched_papers)if (nrow(forest_data) >0) { paper_order <- forest_data |>filter(model =="Human", is_mean) |>arrange(desc(mid)) |>pull(paper) forest_data <- forest_data |>mutate(paper =factor(paper, levels =rev(paper_order)),paper_num =as.numeric(paper),hover_text =paste0("<b>", paper, "</b><br>", model, ifelse(evaluator != model &!is_mean, paste0(" (", evaluator, ")"), ""), "<br>","Rating: ", round(mid, 1),ifelse(!is.na(lo) &!is.na(hi), paste0(" [", round(lo, 1), "–", round(hi, 1), "]"), "") ) ) p <-plot_ly() |>add_segments(x =25, xend =25, y =0.5, yend =length(paper_order) +0.5,line =list(color ="lightgray", dash ="dot"),showlegend =FALSE, hoverinfo ="none") |>add_segments(x =50, xend =50, y =0.5, yend =length(paper_order) +0.5,line =list(color ="gray", dash ="dot"),showlegend =FALSE, hoverinfo ="none") |>add_segments(x =75, xend =75, y =0.5, yend =length(paper_order) +0.5,line =list(color ="lightgray", dash ="dot"),showlegend =FALSE, hoverinfo ="none") models_to_plot <-c("Human", setdiff(unique(forest_data$model), "Human"))for (m in models_to_plot) { model_data <- forest_data |>filter(model == m)if (m =="Human") { indiv <- model_data |>filter(!is_mean)if (nrow(indiv) >0) { p <- p |>add_trace(data = indiv, type ="scatter", mode ="markers",x =~mid, y =~paper_num,marker =list(color = MODEL_COLORS[m], size =6, opacity =0.4),text =~hover_text, hoverinfo ="text",name ="Human (individual)", legendgroup ="Human" ) } means <- model_data |>filter(is_mean)if (nrow(means) >0) { p <- p |>add_trace(data = means, type ="scatter", mode ="markers",x =~mid, y =~paper_num,error_x =list(type ="data", symmetric =FALSE,arrayminus =~(mid - lo), array =~(hi - mid),color = MODEL_COLORS[m], thickness =1.5),marker =list(color = MODEL_COLORS[m], size =12, symbol ="diamond"),text =~hover_text, hoverinfo ="text",name ="Human (mean)", legendgroup ="Human" ) } } else { p <- p |>add_trace(data = model_data, type ="scatter", mode ="markers",x =~mid, y =~paper_num,error_x =list(type ="data", symmetric =FALSE,arrayminus =~(mid - lo), array =~(hi - mid),color = MODEL_COLORS[m], thickness =1.5),marker =list(color = MODEL_COLORS[m], size =10),text =~hover_text, hoverinfo ="text", name = m ) } } p <- p |>layout(xaxis =list(title ="Overall percentile (0–100)", range =c(0, 105), zeroline =FALSE),yaxis =list(title ="", ticktext = paper_order, tickvals =seq_along(paper_order), tickmode ="array"),legend =list(orientation ="h", y =-0.1, x =0.5, xanchor ="center"),hovermode ="closest",margin =list(l =200) ) p} else {cat("No matched human-LLM data available for forest plot.\n")}```## Human vs LLM Scatter PlotEach point represents a paper, with human mean rating on the x-axis and LLM rating on the y-axis. Points on the diagonal indicate perfect agreement.Note: human means carry their own variance; correlations here are bounded by human inter-rater noise.```{r}#| label: fig-scatter-interactive#| fig-cap: "Interactive scatter: Human vs LLM overall ratings"#| fig-width: 10#| fig-height: 8scatter_data <- llm_metrics |>filter(criteria =="overall") |>inner_join(human_avg, by ="paper") |>mutate(diff = midpoint - human_mid,paper_short =str_trunc(paper, 25),# Ensure no NA values break ggplotlyhuman_lo =coalesce(human_lo, human_mid),human_hi =coalesce(human_hi, human_mid),lower_bound =coalesce(lower_bound, midpoint),upper_bound =coalesce(upper_bound, midpoint) ) |>filter(!is.na(human_mid), !is.na(midpoint))if (nrow(scatter_data) >0) { p <-ggplot(scatter_data, aes(x = human_mid, y = midpoint, color = model,text =paste0("<b>", paper, "</b><br>","Human: ", round(human_mid, 1), " [", round(human_lo, 1), "–", round(human_hi, 1), "]<br>","LLM: ", round(midpoint, 1), " [", round(lower_bound, 1), "–", round(upper_bound, 1), "]<br>","Diff: ", sprintf("%+.1f", diff) ) )) +geom_abline(slope =1, intercept =0, linetype ="dashed", color ="grey50") +geom_point(size =3.5, alpha =0.7) +scale_color_manual(values = MODEL_COLORS) +coord_cartesian(xlim =c(0, 100), ylim =c(0, 100)) +labs(x ="Human mean rating (0–100)", y ="LLM rating (0–100)", color ="Model") +theme_uj()# Use tryCatch to handle ggplotly errors gracefullytryCatch({ggplotly(p, tooltip ="text") |>layout(legend =list(orientation ="h", y =-0.12, x =0.5, xanchor ="center"),hoverlabel =list(bgcolor ="white") ) }, error =function(e) {# Fallback to static plot if ggplotly failsprint(p)message("Note: Interactive plot unavailable, showing static version.") })} else {cat("No matching data for scatter plot.\n")}```## Model Agreement StatisticsWe assess agreement between LLM and human evaluators using multiple metrics:- **Pearson r**: Linear correlation between ratings- **Spearman ρ**: Rank-based correlation (robust to outliers)- **Mean bias**: Average difference (LLM − Human); positive = LLM rates higher- **RMSE/MAE**: Error magnitude measures```{r}#| label: tbl-agreement#| tbl-cap: "Agreement between each LLM and human mean ratings (overall criterion)"if (nrow(scatter_data) >0) { agreement_tbl <- scatter_data |>group_by(Model = model) |>summarise(N =n(),`Pearson r`=round(cor(human_mid, midpoint, use ="complete.obs"), 3),`Spearman ρ`=round(cor(human_mid, midpoint, method ="spearman", use ="complete.obs"), 3),`Mean bias`=sprintf("%+.1f", mean(midpoint - human_mid, na.rm =TRUE)),RMSE =round(sqrt(mean((midpoint - human_mid)^2, na.rm =TRUE)), 1),MAE =round(mean(abs(midpoint - human_mid), na.rm =TRUE), 1),.groups ="drop" ) |>arrange(desc(`Pearson r`)) knitr::kable(agreement_tbl, align =c("l", rep("r", 6)))}```### Inter-Rater Agreement (Krippendorff's Alpha)Krippendorff's alpha measures agreement among multiple raters, accounting for chance agreement. Values above 0.8 indicate good agreement; 0.67–0.8 is acceptable for exploratory research. Ceiling effects apply because human evaluators are not perfectly aligned.```{r}#| label: tbl-krippendorff#| tbl-cap: "Inter-rater agreement: Krippendorff's alpha by criterion"# Install irr if needed: install.packages("irr")if (requireNamespace("irr", quietly =TRUE)) {library(irr)# Prepare data: each row = paper, each column = rater (human mean + each LLM) compute_alpha <-function(criterion) {# Human mean per paper human_crit <- metrics_human |>filter(criteria == criterion, paper %in% matched_papers) |>group_by(paper) |>summarise(Human =mean(mid, na.rm =TRUE), .groups ="drop")# LLM ratings per paper llm_crit <- llm_metrics |>filter(criteria == criterion, paper %in% matched_papers) |>select(paper, model, midpoint) |>pivot_wider(names_from = model, values_from = midpoint)# Combine combined <- human_crit |>inner_join(llm_crit, by ="paper") |>select(-paper)if (nrow(combined) <3||ncol(combined) <2) return(NA_real_)# Krippendorff's alpha (interval scale)tryCatch({ ka <-kripp.alpha(t(as.matrix(combined)), method ="interval") ka$value }, error =function(e) NA_real_) } criteria_list <-c("overall", "claims", "methods", "adv_knowledge","logic_comms", "open_sci", "gp_relevance") alpha_results <-tibble(Criterion =c("Overall", "Claims", "Methods", "Adv. Knowledge","Logic & Comms", "Open Science", "Global Relevance"),criterion_key = criteria_list ) |>rowwise() |>mutate(`Krippendorff's α`=compute_alpha(criterion_key),`α (formatted)`=ifelse(is.na(`Krippendorff's α`), "—",sprintf("%.3f", `Krippendorff's α`)),Interpretation =case_when(is.na(`Krippendorff's α`) ~"Insufficient data",`Krippendorff's α`>=0.8~"Good",`Krippendorff's α`>=0.67~"Acceptable",`Krippendorff's α`>=0.4~"Fair",TRUE~"Poor" ) ) |>ungroup() |>select(Criterion, `α (formatted)`, Interpretation) |>rename(`Krippendorff's α`=`α (formatted)`) knitr::kable(alpha_results, align =c("l", "r", "l"))} else {cat("*Install the `irr` package for Krippendorff's alpha: `install.packages('irr')`*\n")}```<!-- CLAUDE GENERATED: Human-human vs Human-LLM baseline comparison -->### Human-Human vs Human-LLM Agreement (Baseline Context)[^results_ratings-2][^results_ratings-2]: This section was added with AI assistance to address feedback about contextualizing agreement metrics.To interpret human-LLM agreement, we must compare against **human-human agreement** as an upper bound. If human evaluators only agree with each other at α = 0.5, then expecting LLMs to achieve α = 0.8 with humans would be unrealistic. The table below shows both:- **α~HH~**: Krippendorff's alpha among human evaluators (treating each evaluator as a separate rater)- **α~HL~**: Krippendorff's alpha between human mean and each LLM model```{r}#| label: tbl-human-baseline#| tbl-cap: "Human-human vs Human-LLM agreement by criterion (Krippendorff's α)"#| code-fold: trueif (requireNamespace("irr", quietly =TRUE)) {library(irr) criteria_list <-c("overall", "claims", "methods", "adv_knowledge","logic_comms", "open_sci", "gp_relevance") criterion_labels <-c(overall ="Overall", claims ="Claims", methods ="Methods",adv_knowledge ="Adv. Knowledge", logic_comms ="Logic & Comms",open_sci ="Open Science", gp_relevance ="Global Relevance" )# Compute human-human alpha (each evaluator as separate rater) compute_hh_alpha <-function(criterion) { human_wide <- metrics_human |>filter(criteria == criterion, paper %in% matched_papers) |>select(paper, evaluator, mid) |>distinct() |>pivot_wider(names_from = paper, values_from = mid)if (nrow(human_wide) <2||ncol(human_wide) <3) return(NA_real_) M <-as.matrix(human_wide[, -1, drop =FALSE])tryCatch({ irr::kripp.alpha(M, method ="interval")$value }, error =function(e) NA_real_) }# Compute human-LLM alpha for a specific model compute_hl_alpha <-function(criterion, model_name) { human_mean <- metrics_human |>filter(criteria == criterion, paper %in% matched_papers) |>group_by(paper) |>summarise(Human =mean(mid, na.rm =TRUE), .groups ="drop") llm_rating <- llm_metrics |>filter(criteria == criterion, model == model_name, paper %in% matched_papers) |>select(paper, midpoint) |>rename(LLM = midpoint) combined <-inner_join(human_mean, llm_rating, by ="paper")if (nrow(combined) <3) return(NA_real_) M <-rbind(Human = combined$Human, LLM = combined$LLM)tryCatch({ irr::kripp.alpha(M, method ="interval")$value }, error =function(e) NA_real_) }# Get unique LLM models llm_models <-unique(llm_metrics$model)# Build comparison table baseline_results <-tibble(Criterion = criterion_labels[criteria_list]) |>mutate(criterion_key = criteria_list) |>rowwise() |>mutate(`α_HH`=compute_hh_alpha(criterion_key) ) |>ungroup()# Add columns for each LLM modelfor (mod in llm_models) { col_name <-paste0("α_HL (", mod, ")") baseline_results <- baseline_results |>rowwise() |>mutate(!!col_name :=compute_hl_alpha(criterion_key, mod)) |>ungroup() }# Format for display baseline_display <- baseline_results |>select(-criterion_key) |>mutate(across(where(is.numeric), ~ifelse(is.na(.x), "—", sprintf("%.2f", .x)))) knitr::kable(baseline_display, align =c("l", rep("r", ncol(baseline_display) -1)))}```**Interpretation**: If α~HL~ approaches or exceeds α~HH~, the LLM is performing at or above the level of agreement humans achieve with each other—a meaningful benchmark for "human-level" evaluation consistency.<!-- END CLAUDE GENERATED --><!-- CLAUDE GENERATED: Calibration analysis -->### Uncertainty Interval Coverage (LLM Credible Interval Assessment)[^results_ratings-3][^results_ratings-3]: This analysis addresses feedback about LLM uncertainty quantification.LLMs report 90% credible intervals alongside their point estimates. If these intervals are **well-specified**, approximately 90% of human mean ratings should fall within them. Lower coverage suggests the LLM is overconfident; higher coverage suggests unnecessary uncertainty.```{r}#| label: tbl-ci-coverage#| tbl-cap: "Share of human mean ratings within LLM 90% credible intervals"#| code-fold: true# Compute CI coverage: % of human means within LLM [lower_bound, upper_bound]compute_coverage <-function(criterion, model_name) { human_mean <- metrics_human |>filter(criteria == criterion, paper %in% matched_papers) |>group_by(paper) |>summarise(human_mid =mean(mid, na.rm =TRUE), .groups ="drop") llm_ci <- llm_metrics |>filter(criteria == criterion, model == model_name, paper %in% matched_papers) |>select(paper, lower_bound, upper_bound) combined <-inner_join(human_mean, llm_ci, by ="paper") |>filter(!is.na(lower_bound), !is.na(upper_bound))if (nrow(combined) <3) return(NA_real_)# Check if human mean falls within LLM CI covered <- combined |>mutate(in_ci = human_mid >= lower_bound & human_mid <= upper_bound) |>summarise(coverage =mean(in_ci, na.rm =TRUE)) |>pull(coverage) covered}criteria_list <-c("overall", "claims", "methods", "adv_knowledge","logic_comms", "open_sci", "gp_relevance")criterion_labels <-c(overall ="Overall", claims ="Claims", methods ="Methods",adv_knowledge ="Adv. Knowledge", logic_comms ="Logic & Comms",open_sci ="Open Science", gp_relevance ="Global Relevance")llm_models <-unique(llm_metrics$model)coverage_results <-tibble(Criterion = criterion_labels[criteria_list]) |>mutate(criterion_key = criteria_list)for (mod in llm_models) { col_name <- mod coverage_results <- coverage_results |>rowwise() |>mutate(!!col_name :=compute_coverage(criterion_key, mod)) |>ungroup()}# Format for displaycoverage_display <- coverage_results |>select(-criterion_key) |>mutate(across(where(is.numeric), ~ifelse(is.na(.x), "—", scales::percent(.x, accuracy =1))))knitr::kable(coverage_display, align =c("l", rep("r", ncol(coverage_display) -1)))```**Interpretation**: Coverage near 90% indicates well-specified uncertainty. Coverage below 70% suggests systematic overconfidence; coverage above 95% may indicate overly wide intervals that provide little information.<!-- END CLAUDE GENERATED --><!-- CLAUDE GENERATED: Formal statistical tests -->### Statistical Significance of Agreement Measures[^results_ratings-4][^results_ratings-4]: Bootstrap inference was added with AI assistance to quantify uncertainty in agreement metrics.To assess whether observed correlations are statistically meaningful and to compare models, we compute **bootstrap 95% confidence intervals** for Pearson correlations. This nonparametric approach makes no distributional assumptions and accounts for the small sample sizes typical in evaluation studies.```{r}#| label: tbl-bootstrap-ci#| tbl-cap: "Bootstrap 95% CIs for human-LLM correlation (overall ratings)"#| code-fold: trueset.seed(42) # reproducibility# Bootstrap function for correlationboot_cor <-function(data, n_boot =1000) { cors <-numeric(n_boot) n <-nrow(data)for (i in1:n_boot) { idx <-sample(1:n, n, replace =TRUE)# Use tryCatch to handle edge cases (e.g., constant values in sample) cors[i] <-tryCatch(cor(data$human[idx], data$llm[idx], use ="complete.obs"),error =function(e) NA_real_,warning =function(w) NA_real_ ) }# Remove NAs before computing quantiles cors_valid <- cors[!is.na(cors)]if (length(cors_valid) < n_boot *0.5) {# Too many failed bootstraps - return NAsreturn(c(estimate =NA_real_, ci_lo =NA_real_, ci_hi =NA_real_)) }c(estimate =cor(data$human, data$llm, use ="complete.obs"),ci_lo =quantile(cors_valid, 0.025, na.rm =TRUE),ci_hi =quantile(cors_valid, 0.975, na.rm =TRUE))}# Prepare data: overall rating comparisonhuman_overall <- metrics_human |>filter(criteria =="overall", paper %in% matched_papers) |>group_by(paper) |>summarise(human =mean(mid, na.rm =TRUE), .groups ="drop")llm_models <-unique(llm_metrics$model)boot_results <-map_dfr(llm_models, function(mod) { llm_overall <- llm_metrics |>filter(criteria =="overall", model == mod, paper %in% matched_papers) |>group_by(paper) |>summarise(llm =mean(midpoint, na.rm =TRUE), .groups ="drop") combined <-inner_join(human_overall, llm_overall, by ="paper")if (nrow(combined) >=5) { res <-boot_cor(combined) ci_lo <- res["ci_lo.2.5%"] ci_hi <- res["ci_hi.97.5%"]# Handle different quantile naming conventionsif (is.na(ci_lo)) ci_lo <- res["ci_lo"]if (is.na(ci_hi)) ci_hi <- res["ci_hi"]tibble(Model = mod,n =nrow(combined),r = res["estimate"],`95% CI`=if (is.na(ci_lo) ||is.na(ci_hi)) "—"elsesprintf("[%.2f, %.2f]", ci_lo, ci_hi) ) } else {tibble(Model = mod, n =nrow(combined), r =NA_real_, `95% CI`="—") }})if (nrow(boot_results) >0) { boot_results <- boot_results |>mutate(r =ifelse(is.na(r), "—", sprintf("%.2f", r))) |>arrange(desc(as.numeric(ifelse(r =="—", "-1", r)))) knitr::kable(boot_results, align =c("l", "r", "r", "r"))}```**Interpretation**: Non-overlapping confidence intervals suggest statistically distinguishable performance. Wide intervals reflect uncertainty due to limited sample size—a common constraint in evaluation research where obtaining more human judgments is costly.<!-- END CLAUDE GENERATED --><!-- CLAUDE GENERATED: Information-theoretic measures -->### Information-Theoretic Perspective: Mutual Information[^results_ratings-5][^results_ratings-5]: Information-theoretic interpretation added with AI assistance.Beyond correlation, we can ask: **how much information do LLM ratings convey about human ratings?** We use *mutual information* (MI), a fundamental concept from information theory [@cover2006elements; @shannon1948mathematical].**What is mutual information?** MI quantifies the *reduction in uncertainty* about one variable when you learn the value of another. Formally, MI(X;Y) = H(X) - H(X\|Y), where H is entropy (average surprise/uncertainty). In plain terms:- Before seeing the LLM rating, you have some uncertainty about what the human rating will be- After seeing the LLM rating, how much does your uncertainty decrease?- MI measures this decrease, in *bits* (the same unit used in data compression and communication)**Why use MI instead of (or alongside) correlation?**| Property | Correlation (r) | Mutual Information ||----------------------------------|-----------------|--------------------|| Captures linear relationships | Yes | Yes || Captures nonlinear relationships | No | Yes || Detects threshold effects | No | Yes || Scale | -1 to +1 | 0 to ∞ bits || Sensitive to outliers | Yes | Less so |A practical example: suppose the LLM predicts human ratings perfectly when the paper is methodology-focused, but randomly when it's theory-focused. Correlation might show r = 0.5 (moderate), but MI would reveal this is *inconsistent* performance rather than *uniformly moderate* performance. This distinction matters for deciding when to trust LLM evaluations.**How to interpret MI values (for 5-bin discretization):**- **0 bits**: Independence—LLM ratings tell you nothing about human ratings- **\~0.5 bits**: Weak association—knowing LLM rating slightly reduces uncertainty- **\~1.0 bits**: Moderate association—knowing LLM rating roughly halves your uncertainty- **\~1.5 bits**: Strong association—substantial predictive value- **2.32 bits** (maximum): Perfect prediction—the LLM quintile perfectly determines the human quintileWe discretize ratings into quintiles (5 bins) because MI is formally defined for discrete distributions. This preserves ordinal structure while avoiding overfitting to noise in continuous ratings.```{r}#| label: tbl-mutual-info#| tbl-cap: "Mutual information between human and LLM overall ratings"#| code-fold: true# Discretize into quintilesdiscretize <-function(x, n_bins =5) { breaks <-quantile(x, probs =seq(0, 1, length.out = n_bins +1), na.rm =TRUE) breaks <-unique(breaks)if (length(breaks) <2) return(rep(1, length(x)))cut(x, breaks = breaks, labels =FALSE, include.lowest =TRUE)}# Compute mutual information: MI(X;Y) = sum p(x,y) log2(p(x,y) / p(x)p(y))compute_mi <-function(x, y) {if (length(x) <5||length(y) <5) return(NA_real_) x_disc <-discretize(x) y_disc <-discretize(y) valid <-!is.na(x_disc) &!is.na(y_disc) x_disc <- x_disc[valid] y_disc <- y_disc[valid]if (length(x_disc) <5) return(NA_real_)# Joint distribution p(x,y) joint <-table(x_disc, y_disc) /length(x_disc)# Marginal distributions p(x), p(y) px <-rowSums(joint) py <-colSums(joint)# MI = sum over all (x,y) of p(x,y) * log2(p(x,y) / (p(x)*p(y))) mi <-0for (i inseq_along(px)) {for (j inseq_along(py)) {if (joint[i, j] >0&& px[i] >0&& py[j] >0) { mi <- mi + joint[i, j] *log2(joint[i, j] / (px[i] * py[j])) } } } mi}llm_models <-unique(llm_metrics$model)mi_results <-map_dfr(llm_models, function(mod) { llm_overall <- llm_metrics |>filter(criteria =="overall", model == mod, paper %in% matched_papers) |>group_by(paper) |>summarise(llm =mean(midpoint, na.rm =TRUE), .groups ="drop") combined <-inner_join(human_overall, llm_overall, by ="paper") mi_val <-compute_mi(combined$human, combined$llm) max_mi <-log2(5) # ≈ 2.32 bits for 5 bins norm_mi <-if (!is.na(mi_val)) mi_val / max_mi elseNA_real_tibble(Model = mod,n =nrow(combined),`MI (bits)`=ifelse(is.na(mi_val), "—", sprintf("%.2f", mi_val)),`Normalized`=ifelse(is.na(norm_mi), "—", sprintf("%.0f%%", norm_mi *100)) )})if (nrow(mi_results) >0) { knitr::kable(mi_results, align =c("l", "r", "r", "r"))}```**Interpretation**: Normalized MI above 50% indicates substantial predictive value. If MI is low despite moderate correlation, investigate which paper characteristics the LLM struggles with—this points toward targeted improvements or hybrid human-AI workflows where humans handle cases the LLM finds difficult.<!-- END CLAUDE GENERATED -->### Agreement by Evaluation Criterion```{r}#| label: tbl-agreement-by-criterion#| tbl-cap: "Pearson and Spearman correlations by evaluation criterion"# Compute correlations for each criterion and modelcriterion_agreement <- llm_metrics |>filter(paper %in% matched_papers) |>inner_join( metrics_human |>filter(paper %in% matched_papers) |>group_by(paper, criteria) |>summarise(human_mid =mean(mid, na.rm =TRUE), .groups ="drop"),by =c("paper", "criteria") ) |>group_by(Model = model, Criterion = criteria) |>summarise(n =n(),r =cor(midpoint, human_mid, use ="complete.obs"),rho =cor(midpoint, human_mid, method ="spearman", use ="complete.obs"),.groups ="drop" ) |>filter(n >=3) # need at least 3 papers for meaningful correlationif (nrow(criterion_agreement) >0) {# Pivot for display criterion_labels <-c(overall ="Overall", claims ="Claims", methods ="Methods",adv_knowledge ="Adv. Knowledge", logic_comms ="Logic & Comms",open_sci ="Open Science", gp_relevance ="Global Relevance" ) corr_table <- criterion_agreement |>mutate(Criterion = criterion_labels[Criterion],`r / ρ`=sprintf("%.2f / %.2f", r, rho) ) |>select(Model, Criterion, `r / ρ`) |>pivot_wider(names_from = Model, values_from =`r / ρ`, values_fill ="—")# Order criteria corr_table <- corr_table |>mutate(Criterion =factor(Criterion, levels = criterion_labels)) |>arrange(Criterion) knitr::kable(corr_table, align =c("l", rep("r", ncol(corr_table) -1)))}```## Rating Distribution by Model```{r}#| label: fig-rating-distribution#| fig-cap: "Distribution of overall ratings by evaluator type"#| fig-width: 10#| fig-height: 6all_overall <-bind_rows( metrics_human |>filter(criteria =="overall") |>transmute(paper, model, rating = mid), llm_metrics |>filter(criteria =="overall") |>transmute(paper, model, rating = midpoint)) |>filter(paper %in% matched_papers) |>filter(model !="Claude Sonnet 4")if (nrow(all_overall) >0) {ggplot(all_overall, aes(x = rating, fill = model)) +geom_density(alpha =0.5, adjust =1.2) +scale_fill_manual(values = MODEL_COLORS) +labs(x ="Overall percentile (0–100)", y ="Density", fill ="Evaluator") +theme_uj() +theme(legend.position ="right")}```## Criteria-Level Comparison```{r}#| label: fig-criteria-heatmap#| fig-cap: "Mean rating by model and criterion"#| fig-width: 11#| fig-height: 6metric_labels <-c(overall ="Overall", claims ="Claims", methods ="Methods",adv_knowledge ="Adv. Knowledge", logic_comms ="Logic & Comms",open_sci ="Open Science", gp_relevance ="Global Relevance")criteria_means <-bind_rows( metrics_human |>filter(criteria %in%names(metric_labels), paper %in% matched_papers) |>group_by(criteria) |>summarise(mean_rating =mean(mid, na.rm =TRUE), .groups ="drop") |>mutate(model ="Human"), llm_metrics |>filter(criteria %in%names(metric_labels), paper %in% matched_papers) |>group_by(model, criteria) |>summarise(mean_rating =mean(midpoint, na.rm =TRUE), .groups ="drop")) |>mutate(criteria_label =factor(metric_labels[criteria], levels = metric_labels))if (nrow(criteria_means) >0) {ggplot(criteria_means, aes(x = criteria_label, y = model, fill = mean_rating)) +geom_tile(color ="white", linewidth =0.5) +geom_text(aes(label =round(mean_rating, 0)), size =4, color ="white", fontface ="bold") +scale_fill_gradient(low ="#f0f0f0", high = UJ_ORANGE, limits =c(40, 80), name ="Mean") +labs(x =NULL, y =NULL) +theme_uj() +theme(axis.text.x =element_text(angle =30, hjust =1, size =10), panel.grid =element_blank())}```<!-- CLAUDE GENERATED: Tier correlation dumbbell plot -->## Evaluation Priorities: What Predicts Journal Tier Judgments?[^results_ratings-6][^results_ratings-6]: This analysis was added with AI assistance to reveal differences in what humans vs. LLMs emphasize when predicting journal outcomes.How do evaluators translate quality assessments into journal tier predictions? By correlating each criterion with "where should this publish?" judgments, we can see which factors each evaluator type weighs most heavily. Divergence here reveals systematic differences in evaluation priorities—what we might call "taste."```{r}#| label: fig-tier-correlations#| fig-cap: "Correlation of each criterion with journal tier predictions"#| fig-width: 12#| fig-height: 7#| code-fold: true# Load human tier data if availablehuman_tiers_file <-"data/all_jtiers.csv"if (file.exists(human_tiers_file)) { human_tiers_raw <-read_csv(human_tiers_file, show_col_types =FALSE) |>clean_names()# Check what columns are available and find the tier prediction column# Typically 'merits_journal' or similar tier_col <-intersect(names(human_tiers_raw), c("merits_journal", "journal_tier", "tier_should", "mid"))if (length(tier_col) >0) {# Prepare human tier data human_tier_data <- human_tiers_raw |>filter(!is.na(.data[[tier_col[1]]])) |>rename(tier_should =!!tier_col[1])# Get paper identifier column paper_col <-intersect(names(human_tier_data), c("label_paper", "paper", "research"))if (length(paper_col) >0) { human_tier_data <- human_tier_data |>rename(paper =!!paper_col[1]) } metrics_to_cor <-c("overall", "claims", "methods", "adv_knowledge","logic_comms", "open_sci", "gp_relevance")# === Human Correlations === human_wide <- metrics_human |>filter(criteria %in% metrics_to_cor, paper %in% matched_papers) |>group_by(paper, criteria) |>summarise(mid =mean(mid, na.rm =TRUE), .groups ="drop") |>pivot_wider(names_from = criteria, values_from = mid)# Join with human tier predictions human_tier_match <- human_tier_data |>group_by(paper) |>summarise(tier_should =mean(tier_should, na.rm =TRUE), .groups ="drop") human_cors_data <- human_wide |>inner_join(human_tier_match, by ="paper")if (nrow(human_cors_data) >=5) { cors_human <-tibble(metric = metrics_to_cor,correlation =sapply(metrics_to_cor, function(m) {if (m %in%names(human_cors_data) &&!all(is.na(human_cors_data[[m]]))) {cor(human_cors_data[[m]], human_cors_data$tier_should, use ="pairwise.complete.obs") } elseNA_real_ }),source ="Human" ) } else { cors_human <-tibble(metric =character(), correlation =numeric(), source =character()) }# === LLM Correlations === llm_wide <- llm_metrics |>filter(criteria %in% metrics_to_cor, model == primary_model, paper %in% matched_papers) |>group_by(paper, criteria) |>summarise(mid =mean(midpoint, na.rm =TRUE), .groups ="drop") |>pivot_wider(names_from = criteria, values_from = mid) llm_tier_match <- llm_tiers |>filter(tier_kind =="tier_should", model == primary_model) |>group_by(paper) |>summarise(tier_should =mean(score, na.rm =TRUE), .groups ="drop") llm_cors_data <- llm_wide |>inner_join(llm_tier_match, by ="paper")if (nrow(llm_cors_data) >=5) { cors_llm <-tibble(metric = metrics_to_cor,correlation =sapply(metrics_to_cor, function(m) {if (m %in%names(llm_cors_data) &&!all(is.na(llm_cors_data[[m]]))) {cor(llm_cors_data[[m]], llm_cors_data$tier_should, use ="pairwise.complete.obs") } elseNA_real_ }),source ="LLM" ) } else { cors_llm <-tibble(metric =character(), correlation =numeric(), source =character()) }# === Combine and plot === cors_combined <-bind_rows(cors_human, cors_llm) |>filter(!is.na(correlation)) |>mutate(metric_label =case_when( metric =="overall"~"Overall", metric =="claims"~"Claims & Evidence", metric =="methods"~"Methods", metric =="adv_knowledge"~"Advancing Knowledge", metric =="logic_comms"~"Logic & Communication", metric =="open_sci"~"Open Science", metric =="gp_relevance"~"Global Relevance",TRUE~ metric ) )if (nrow(cors_combined) >0) {# Order by average correlation metric_order_cors <- cors_combined |>group_by(metric_label) |>summarise(avg_cor =mean(correlation, na.rm =TRUE), .groups ="drop") |>arrange(desc(avg_cor)) |>pull(metric_label) cors_wide <- cors_combined |>mutate(metric_label =factor(metric_label, levels = metric_order_cors)) |>pivot_wider(names_from = source, values_from = correlation)# Dumbbell plotggplot(cors_wide, aes(y = metric_label)) +geom_segment(aes(x = Human, xend = LLM, yend = metric_label),color ="gray50", linewidth =1.2, alpha =0.4) +geom_point(aes(x = Human), color = UJ_GREEN, size =5, alpha =0.9) +geom_point(aes(x = LLM), color = UJ_ORANGE, size =5, alpha =0.9) +geom_vline(xintercept =0, linetype ="dashed", color ="gray40") +geom_text(aes(x = Human, label =sprintf("%.2f", Human)),hjust =1.4, size =3.5, color = UJ_GREEN, fontface ="bold") +geom_text(aes(x = LLM, label =sprintf("%.2f", LLM)),hjust =-0.4, size =3.5, color = UJ_ORANGE, fontface ="bold") +scale_x_continuous(limits =c(min(c(cors_wide$Human, cors_wide$LLM), na.rm =TRUE) -0.15,max(c(cors_wide$Human, cors_wide$LLM), na.rm =TRUE) +0.15) ) +labs(x ="Correlation with 'Where should this publish?'",y =NULL,subtitle =paste0("Green = Human, Orange = ", primary_model) ) +theme_uj() +theme(panel.grid.major.x =element_line(color ="grey90"),panel.grid.major.y =element_blank(),axis.text.y =element_text(size =11) ) } else {cat("Insufficient correlation data for dumbbell plot.\n") } } else {cat("Tier prediction column not found in human data.\n") }} else {cat("Human tier data file not found.\n")}```**Interpretation**: Large divergences indicate different evaluation philosophies. For example, if humans weight "open science" heavily but LLMs don't, this suggests the LLM may undervalue reproducibility when forming overall judgments.<!-- END CLAUDE GENERATED -->------------------------------------------------------------------------<!-- CLAUDE GENERATED: Human-LLM gap heatmap -->## Systematic Rating Differences by Paper and Criterion[^results_ratings-7][^results_ratings-7]: This visualization was added with AI assistance to identify patterns in human-LLM disagreement.The heatmap below shows **Human − LLM** differences for each paper and criterion. Green indicates humans rated higher than the LLM; orange indicates the LLM rated higher. Papers are ordered by their overall rating difference.```{r}#| label: fig-gap-heatmap#| fig-cap: "Human − LLM rating differences by paper and criterion"#| fig-width: 14#| fig-height: 8#| code-fold: truemetric_order <-c("overall", "claims", "methods", "adv_knowledge","logic_comms", "open_sci", "gp_relevance")metric_lab <-c(overall ="Overall",claims ="Claims & Evidence",methods ="Methods",adv_knowledge ="Adv. Knowledge",logic_comms ="Logic & Comms",open_sci ="Open Science",gp_relevance ="Global Relevance")# Use the first/primary LLM model for comparison (GPT-5 Pro if available)primary_model <-if ("GPT-5 Pro"%in%unique(llm_metrics$model)) "GPT-5 Pro"elseunique(llm_metrics$model)[1]H_mean <- metrics_human |>filter(criteria %in% metric_order, paper %in% matched_papers) |>group_by(paper, criteria) |>summarise(h =mean(mid, na.rm =TRUE), .groups ="drop")L_mean <- llm_metrics |>filter(criteria %in% metric_order, model == primary_model, paper %in% matched_papers) |>group_by(paper, criteria) |>summarise(l =mean(midpoint, na.rm =TRUE), .groups ="drop")Ddiff <-inner_join(H_mean, L_mean, by =c("paper", "criteria")) |>mutate(diff = h - l) |>mutate(crit =factor(criteria, levels = metric_order, labels = metric_lab[metric_order]))# Order papers by overall difference (Human−LLM)ord_p <- Ddiff |>filter(criteria =="overall") |>arrange(desc(diff)) |>pull(paper)if (length(ord_p) >0&&nrow(Ddiff) >0) {ggplot(Ddiff, aes(x =factor(paper, levels = ord_p), y = crit, fill = diff)) +geom_tile(color ="white", linewidth =0.25) +scale_fill_gradient2(low = UJ_ORANGE, mid ="grey95", high = UJ_GREEN, midpoint =0,name ="Human − LLM",limits =c(-30, 30),oob = scales::squish ) +labs(x =NULL, y =NULL,subtitle =paste0("Primary model: ", primary_model)) +theme_uj() +theme(axis.text.x =element_text(angle =60, hjust =1, vjust =1, size =7),axis.text.y =element_text(size =11),panel.grid =element_blank() )} else {cat("Insufficient data for gap heatmap.\n")}```This visualization helps identify: (1) papers with systematic disagreement across all criteria, (2) criteria where humans and LLMs consistently diverge, and (3) potential patterns related to paper characteristics.<!-- END CLAUDE GENERATED --><!-- CLAUDE GENERATED: Top disagreement cases -->## Papers with Largest Human-LLM Disagreement[^results_ratings-8][^results_ratings-8]: This table was added with AI assistance to facilitate qualitative investigation of outliers.Understanding *why* specific papers generate disagreement can inform both LLM limitations and potential human biases. The table below shows papers where humans and LLMs diverged most.```{r}#| label: tbl-top-disagreement#| tbl-cap: "Papers with largest human vs. LLM rating differences"#| code-fold: true# Compute mean overall ratings and differencesH_overall <- metrics_human |>filter(criteria =="overall", paper %in% matched_papers) |>group_by(paper) |>summarise(human_rating =mean(mid, na.rm =TRUE), .groups ="drop")L_overall <- llm_metrics |>filter(criteria =="overall", model == primary_model, paper %in% matched_papers) |>group_by(paper) |>summarise(llm_rating =mean(midpoint, na.rm =TRUE), .groups ="drop")rating_diffs <- H_overall |>inner_join(L_overall, by ="paper") |>mutate(diff = human_rating - llm_rating)# Helper: truncate long titlestruncate_title <-function(title, max_len =50) {if (nchar(title) <= max_len) return(title)paste0(substr(title, 1, max_len -3), "...")}# Top 5 where humans rated highertop_human_pref <- rating_diffs |>filter(diff >0) |>arrange(desc(diff)) |>slice_head(n =5) |>mutate(Paper =sapply(paper, truncate_title),`Human Rating`=round(human_rating, 0),`LLM Rating`=round(llm_rating, 0),`Difference`=sprintf("%+.0f", diff),Direction ="Human > LLM" ) |>select(Paper, `Human Rating`, `LLM Rating`, Difference, Direction)# Top 5 where LLM rated highertop_llm_pref <- rating_diffs |>filter(diff <0) |>arrange(diff) |>slice_head(n =5) |>mutate(Paper =sapply(paper, truncate_title),`Human Rating`=round(human_rating, 0),`LLM Rating`=round(llm_rating, 0),`Difference`=sprintf("%+.0f", diff),Direction ="LLM > Human" ) |>select(Paper, `Human Rating`, `LLM Rating`, Difference, Direction)# Combinetop_disagree <-bind_rows(top_human_pref, top_llm_pref)if (nrow(top_disagree) >0) { knitr::kable(top_disagree, align =c("l", "r", "r", "r", "l"))} else {cat("Insufficient data for disagreement analysis.\n")}```These outliers merit qualitative investigation: examining the paper characteristics, human evaluator comments, and LLM rationales may reveal systematic patterns in what drives disagreement.<!-- END CLAUDE GENERATED -->------------------------------------------------------------------------*Note: GPT-5 Pro evaluations include extended reasoning traces. See the [Appendix](appendix_llm_traces.qmd) for full assessment summaries and reasoning traces per paper.*## Prompt Comparison: Legacy vs Updated GPT-5 ProWe compare GPT-5 Pro evaluations between two prompt versions:- **Legacy (Oct 2024):** Original combined schema without assessment summary- **Updated (Jan 2026):** Modular prompt with explicit diagnostic assessment summary```{r}#| label: load-legacy-comparison#| code-fold: true#| code-summary: "Load legacy vs updated GPT-5 Pro data"#| message: false# Load legacy GPT-5 datalegacy_metrics <-read_csv("data/metrics_long_gpt-5.csv", show_col_types =FALSE) |>transmute(paper_raw = paper,paper =str_replace_all(paper, " ", "_") |>str_replace_all("\\.", ""), # normalizemetric = metric,legacy_mid = midpoint,legacy_lo = lower_bound,legacy_hi = upper_bound )# Load updated GPT-5 Pro dataupdated_metrics <-read_csv("data/metrics_long_gpt5_pro_jan2026.csv", show_col_types =FALSE) |>transmute(paper =str_replace_all(paper, "\\.", ""), # remove dots for matchingmetric = metric,updated_mid = midpoint,updated_lo = lower_bound,updated_hi = upper_bound )# Join by normalized paper name and metricprompt_comparison <- legacy_metrics |>inner_join(updated_metrics, by =c("paper", "metric")) |>mutate(diff = updated_mid - legacy_mid,metric_label =case_when( metric =="overall"~"Overall", metric =="claims_evidence"~"Claims", metric =="methods"~"Methods", metric =="advancing_knowledge"~"Adv. Knowledge", metric =="logic_communication"~"Logic & Comms", metric =="open_science"~"Open Science", metric =="global_relevance"~"Global Relevance",TRUE~ metric ) )n_matched_papers <-n_distinct(prompt_comparison$paper)```We matched `r n_matched_papers` papers across both runs.```{r}#| label: fig-prompt-scatter#| fig-cap: "Legacy vs Updated GPT-5 Pro ratings (overall metric)"#| fig-width: 8#| fig-height: 7overall_comp <- prompt_comparison |>filter(metric =="overall")if (nrow(overall_comp) >0) { cor_val <-cor(overall_comp$legacy_mid, overall_comp$updated_mid, use ="complete.obs")ggplot(overall_comp, aes(x = legacy_mid, y = updated_mid)) +geom_abline(slope =1, intercept =0, linetype ="dashed", color ="gray50") +geom_point(alpha =0.7, size =2.5, color = UJ_ORANGE) +geom_smooth(method ="lm", se =TRUE, alpha =0.2, color = UJ_BLUE) +annotate("text", x =55, y =95,label =paste0("r = ", round(cor_val, 2)),size =5, fontface ="bold") +labs(x ="Legacy Prompt (Oct 2024)",y ="Updated Prompt (Jan 2026)",title ="GPT-5 Pro: Overall Rating Comparison",subtitle ="Dashed line = perfect agreement" ) +coord_fixed(xlim =c(40, 100), ylim =c(40, 100)) +theme_uj()}``````{r}#| label: fig-prompt-diff-by-metric#| fig-cap: "Rating difference (Updated - Legacy) by criterion"#| fig-width: 10#| fig-height: 5if (nrow(prompt_comparison) >0) { metric_order <- prompt_comparison |>group_by(metric_label) |>summarise(mean_diff =mean(diff, na.rm =TRUE)) |>arrange(mean_diff) |>pull(metric_label) prompt_comparison |>mutate(metric_label =factor(metric_label, levels = metric_order)) |>ggplot(aes(x = metric_label, y = diff)) +geom_hline(yintercept =0, linetype ="dashed", color ="gray50") +geom_boxplot(fill = UJ_ORANGE, alpha =0.6, outlier.alpha =0.3) +labs(x =NULL,y ="Rating Difference (Updated - Legacy)",title ="Prompt Version Effect by Criterion",subtitle ="Positive = updated prompt rates higher" ) +theme_uj() +theme(axis.text.x =element_text(angle =30, hjust =1))}``````{r}#| label: tbl-prompt-summary#| tbl-cap: "Summary statistics: Legacy vs Updated prompt"if (nrow(prompt_comparison) >0) { prompt_comparison |>group_by(Criterion = metric_label) |>summarise(`Legacy Mean`=round(mean(legacy_mid, na.rm =TRUE), 1),`Updated Mean`=round(mean(updated_mid, na.rm =TRUE), 1),`Mean Diff`=round(mean(diff, na.rm =TRUE), 1),`SD Diff`=round(sd(diff, na.rm =TRUE), 1),.groups ="drop" ) |>kable() |>kable_styling(bootstrap_options =c("striped", "hover"), full_width =FALSE)}```The updated prompt with explicit diagnostic assessment appears to produce systematically different ratings. The correlation between prompt versions indicates the degree of consistency across the prompt change.