LLM vs. Human Ratings

Overview

This page compares the LLM‑generated ratings (gpt-5) from the previous section with human evaluations across the Unjournal’s criteria. We (i) harmonize the two sources to a common set of metrics and paper IDs, (ii) compute paired scores per (paper, metric), and (iii) summarize agreement using distributional views and reliability statistics.

Data & Harmonization

  • Sources. LLM ratings come from results/metrics_long.csv (rendered in the previous chapter). Human ratings are imported from a hand‑coded spreadsheet and mapped to LLM paper IDs via UJ_map.csv.
  • Metric alignment. Human criteria are recoded to the LLM schema (e.g., claims → claims_evidence, adv_knowledge → advancing_knowledge, etc.). If humans provided both gp_relevance and real_world, we fold both into the single LLM metric global_relevance (see the short example after this list).
  • Uncertainty fields. Where available, we carry lower/upper bounds for both LLM and humans; these are used in optional uncertainty checks but do not affect the mid‑point comparisons below.
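
To make the fold‑in concrete, here is a minimal sketch with made‑up values (the paper name “Doe 2024” and the scores are illustrative; the column names mirror the human file): two criteria rows map to the same LLM metric and are then collapsed by the per‑(paper, metric) averaging used in the pipeline below.

import pandas as pd

# Hypothetical human rows for one paper: both criteria map to global_relevance
toy = pd.DataFrame({
    "paper":         ["Doe 2024", "Doe 2024"],
    "criteria":      ["gp_relevance", "real_world"],
    "middle_rating": [60, 70],
})
toy["metric"] = toy["criteria"].map({"gp_relevance": "global_relevance",
                                     "real_world":   "global_relevance"})

# Collapsing repeats per (paper, metric) yields one midpoint: (60 + 70) / 2 = 65
folded = toy.groupby(["paper", "metric"], as_index=False)["middle_rating"].mean()
print(folded)  # Doe 2024, global_relevance, 65.0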
Show code
import os, re, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from scipy.stats import pearsonr, spearmanr

# Glue for inline text (MyST)
try:
    from myst_nb import glue
except Exception:
    # Fallback: still define a function so doc runs; inline {{glue: ...}} needs myst-nb installed.
    def glue(name, val, display=False):
        globals()[name] = val


# Matplotlib defaults for a clean look
plt.rcParams.update({
    "figure.dpi": 120,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "axes.grid": True,
    "grid.alpha": 0.25,
    "axes.titlesize": 11,
    "axes.labelsize": 10,
    "legend.fontsize": 9
})
# ---- Unjournal brand palette (hex) --------------------------------
# From theme SCSS:
#   $unjournal-green:  #99bb66
#   $unjournal-orange: #f19e4b
UJ_ORANGE = "#f19e4b"   # LLM
UJ_GREEN  = "#99bb66"   # Human

# Helpers to make brand-consistent variants & Plotly-friendly RGBA
import colorsys
from matplotlib import colors as mcolors
from matplotlib.colors import LinearSegmentedColormap

def adjust_lightness(hex_color: str, factor: float = 0.80) -> str:
    """
    Darken/lighten a color by scaling lightness in HLS.
    factor < 1 → darker; factor > 1 → lighter. 0.80 ≈ 20% darker.
    """
    r, g, b = mcolors.to_rgb(hex_color)
    h, l, s = colorsys.rgb_to_hls(r, g, b)
    l = max(0, min(1, l * factor))
    r2, g2, b2 = colorsys.hls_to_rgb(h, l, s)
    return mcolors.to_hex((r2, g2, b2))

def rgba_hex(hex_color: str, alpha: float) -> str:
    """Convert hex '#rrggbb' → 'rgba(r,g,b,a)' for Plotly."""
    r, g, b = mcolors.to_rgb(hex_color)
    return f"rgba({int(round(255*r))},{int(round(255*g))},{int(round(255*b))},{alpha})"

# Darker accents (same hue) for outlines & heatmap extremes
UJ_ORANGE_D = adjust_lightness(UJ_ORANGE, 0.78)  # ~22% darker
UJ_GREEN_D  = adjust_lightness(UJ_GREEN,  0.72)  # ~28% darker

# Diverging map using brand colors with darker extremes and soft center
def make_uj_div():
    return LinearSegmentedColormap.from_list(
        "uj_div_brand",
        [
            (0.00, UJ_GREEN_D),            # deep brand green
            (0.40, UJ_GREEN),              # main brand green
            (0.50, "#f7f7f7"),             # neutral center
            (0.60, UJ_ORANGE),             # main brand orange
            (1.00, UJ_ORANGE_D)            # deep brand orange
        ],
        N=256
    )

UJ_DIV = make_uj_div()  # used by both heatmaps



from matplotlib.colors import TwoSlopeNorm  # used by the difference heatmap; LinearSegmentedColormap already imported above
Show code
# Files
LLM_FILE   = "results/metrics_long.csv"
HUMAN_FILE = "UJ_ratings/rsx_evalr_rating (7).csv"
IMAP_FILE  = "UJ_ratings/UJ_map.csv"

llm   = pd.read_csv(LLM_FILE)
human = pd.read_csv(HUMAN_FILE)
imap  = pd.read_csv(IMAP_FILE)

# 1) Map human 'criteria' → LLM metric names
crit_map = {
    "overall":          "overall",
    "claims":           "claims_evidence",
    "methods":          "methods",
    "adv_knowledge":    "advancing_knowledge",
    "logic_comms":      "logic_communication",
    "open_sci":         "open_science",
    "gp_relevance":     "global_relevance",
    "real_world":       "global_relevance",  # fold-in
    "merits_journal":   None,
    "journal_predict":  None
}
human["metric"] = human["criteria"].map(crit_map)
human = human[human["metric"].notna()].copy()

# 2) Normalize titles to join robustly
def _norm_title(s: str) -> str:
    s = str(s)
    s = s.replace("’", "'").replace("–","-").replace("—","-")
    s = re.sub(r"\s+", " ", s).strip()
    return s

human["research_norm"] = human["research"].map(_norm_title).str.lower()
imap["research_norm"]  = imap["research"].map(_norm_title).str.lower()

# 3) Attach LLM paper id to human rows
human = human.merge(imap[["paper","research_norm"]], on="research_norm", how="left")

# 4) Diagnostics
missing_id  = sorted(human[human.paper.isna()]["research"].unique())
missing_llm = sorted(set(llm.paper.unique()) - set(human.paper.dropna().unique()))
# if missing_id:
#     print("⚠️ Human titles without LLM id (fill 'paper' in UJ_map.csv):")
#     for t in missing_id: print("   •", t)
# if missing_llm:
#     print("\n⚠️ LLM papers with no human match:")
#     for p in missing_llm: print("   •", p)

# 5) Collapse human repeats to means (per paper×metric)
human_use = (human
    .dropna(subset=["paper","metric","middle_rating"])
    .groupby(["paper","metric"], as_index=False)
    .agg(midpoint_human=("middle_rating","mean"),
         lower_human   =("lower_CI","mean"),
         upper_human   =("upper_CI","mean"),
         n_human       =("middle_rating","size"))
)

# 6) Select/rename LLM columns
llm_use = (llm[["paper","metric","midpoint","lower_bound","upper_bound"]]
           .rename(columns={"midpoint":"midpoint_llm",
                            "lower_bound":"lower_llm",
                            "upper_bound":"upper_llm"}))

# 7) Merge pairs
merged = llm_use.merge(human_use, on=["paper","metric"], how="inner")

# print(f"✅ merged rows: {len(merged)}  "
#       f"({merged.paper.nunique()} papers × {merged.metric.nunique()} metrics)")

# Long form for distribution plots
ratings_long = pd.concat([
    merged[["paper","metric","midpoint_llm"]]
        .rename(columns={"midpoint_llm":"score"}).assign(rater="LLM"),
    merged[["paper","metric","midpoint_human"]]
        .rename(columns={"midpoint_human":"score"}).assign(rater="Human")
], ignore_index=True)
Show code
# Keep per‑rater rows (after mapping; before averaging)
human_raw = human.dropna(subset=["paper","metric","middle_rating"]).copy()

# Try to detect a rater id column; otherwise make one per (paper, metric)
cands = [c for c in human_raw.columns if c.lower() in
         ["rater","reviewer","evaluator","coder","annotator","user","name",
          "id","uid","user_id","evaluator_id","reviewer_id"]]
if cands:
    rid_col = cands[0]
    human_raw["rater_id"] = human_raw[rid_col].astype(str)
else:
    human_raw["rater_id"] = (human_raw
                             .groupby(["paper","metric"])
                             .cumcount()
                             .add(1)
                             .map(lambda i: f"R{i}"))

# One row per paper×metric with LLM midpoint (for joining)
llm_single = llm_use[["paper","metric","midpoint_llm"]].drop_duplicates()
Show code
def pair_llm_human(df: pd.DataFrame) -> pd.DataFrame:
    """Return (paper, metric, LLM, Human) single-row pairs."""
    return (df[["paper","metric","midpoint_llm","midpoint_human"]]
            .dropna()
            .rename(columns={"midpoint_llm":"LLM","midpoint_human":"Human"}))

def bin_together(a, b, n_bins=5, strategy="quantile"):
    """
    Bin two continuous arrays with shared edges (for κ).
    - 'quantile' bins are robust to skew.
    - fallback to equal-width if quantiles collapse.
    """
    both = pd.Series(np.r_[a, b]).dropna()
    if both.nunique() <= max(3, n_bins):
        # Already discrete-ish: map unique sorted values to 0..K-1
        uniq = np.sort(both.unique())
        mapping = {v:i for i,v in enumerate(uniq)}
        return np.vectorize(mapping.get)(a), np.vectorize(mapping.get)(b), len(uniq)
    if strategy == "quantile":
        qs = both.quantile(np.linspace(0, 1, n_bins+1)).values
        edges = np.unique(qs)
        if len(edges) - 1 < 2:
            strategy = "equal"
    if strategy == "equal":
        lo, hi = float(both.min()), float(both.max())
        edges = np.linspace(lo, hi, n_bins+1)
    edges[0] -= 1e-9; edges[-1] += 1e-9
    a_bin = pd.cut(a, bins=edges, labels=False, include_lowest=True)
    b_bin = pd.cut(b, bins=edges, labels=False, include_lowest=True)
    k = int(np.nanmax([a_bin.max(), b_bin.max()]) + 1)
    return np.asarray(a_bin), np.asarray(b_bin), k

def weighted_kappa(a, b, k=None, weights="quadratic"):
    """
    Cohen's weighted kappa for ordinal labels 0..k-1.
    weights: 'quadratic' (default), 'linear', or None (unweighted).
    """
    a = np.asarray(a, dtype=int)
    b = np.asarray(b, dtype=int)
    if k is None:
        k = int(max(a.max(), b.max()) + 1)
    if k <= 1:  # not enough categories
        return np.nan

    # Observed agreement matrix (proportions)
    M = np.zeros((k, k), dtype=float)
    for i, j in zip(a, b):
        if np.isfinite(i) and np.isfinite(j):
            M[i, j] += 1
    if M.sum() == 0:
        return np.nan
    M = M / M.sum()

    # Expected by marginals
    r = M.sum(axis=1, keepdims=True)
    c = M.sum(axis=0, keepdims=True)
    E = r @ c

    # Weight matrix
    I = np.arange(k)[:, None]
    J = np.arange(k)[None, :]
    if weights == "quadratic":
        W = ((I - J)**2) / ((k - 1)**2 if k > 1 else 1)
    elif weights == "linear":
        W = np.abs(I - J) / (k - 1 if k > 1 else 1)
    else:
        W = (I != J).astype(float)


    num = (W * M).sum()
    den = (W * E).sum()
    return np.nan if den == 0 else 1.0 - num / den

def agreement_summary(merged_df, n_bins=5, bin_strategy="quantile"):
    """
    Per-metric agreement: Pearson, Spearman, mean/median bias, κ (unweighted & quadratic).
    """
    pair = pair_llm_human(merged_df)
    rows = []
    for metric, g in pair.groupby("metric"):
        llm = g["LLM"].to_numpy()
        hum = g["Human"].to_numpy()

        pr, pr_p = (np.nan, np.nan)
        sr, sr_p = (np.nan, np.nan)
        if len(g) >= 2:
            try: pr, pr_p = pearsonr(llm, hum)
            except Exception: pass
            try: sr, sr_p = spearmanr(llm, hum)
            except Exception: pass

        diff = llm - hum
        mb, medb = float(np.mean(diff)), float(np.median(diff))

        a_bin, b_bin, k = bin_together(llm, hum, n_bins=n_bins, strategy=bin_strategy)
        k_unw = weighted_kappa(a_bin, b_bin, k=k, weights=None)
        k_quad = weighted_kappa(a_bin, b_bin, k=k, weights="quadratic")

        rows.append(dict(
            metric=metric, n=len(g),
            pearson=pr, spearman=sr,
            mean_bias=mb, median_bias=medb,
            kappa_unw=k_unw, kappa_quad=k_quad
        ))
    out = pd.DataFrame(rows).sort_values("metric").reset_index(drop=True)
    return out

def overall_stats(merged_df, n_bins=5, bin_strategy="quantile"):
    """
    Overall (across all metrics) Pearson and κ (unweighted & quadratic),
    plus overall MAD.
    """
    pair = pair_llm_human(merged_df)
    llm = pair["LLM"].to_numpy()
    hum = pair["Human"].to_numpy()
    pr = pearsonr(llm, hum)[0] if len(pair) >= 2 else np.nan
    mad = float(np.mean(np.abs(llm - hum))) if len(pair) else np.nan
    a_bin, b_bin, k = bin_together(llm, hum, n_bins=n_bins, strategy=bin_strategy)
    k_unw = weighted_kappa(a_bin, b_bin, k=k, weights=None)
    k_quad = weighted_kappa(a_bin, b_bin, k=k, weights="quadratic")
    return dict(pearson=pr, mad=mad, kappa_unw=k_unw, kappa_quad=k_quad, n=len(pair))

Distribution of ratings

Show code
import plotly.graph_objects as go
import colorsys

# --- Brand helpers (self-contained) --------------------------------
def _hex_to_rgba_str(hex_color: str, alpha: float) -> str:
    h = hex_color.lstrip("#")
    r = int(h[0:2], 16); g = int(h[2:4], 16); b = int(h[4:6], 16)
    return f"rgba({r},{g},{b},{alpha})"

def _darken_hex(hex_color: str, factor: float = 0.78) -> str:
    """Darken/lighten while preserving hue via HLS (factor<1 → darker)."""
    h = hex_color.lstrip("#")
    r = int(h[0:2], 16)/255.0; g = int(h[2:4], 16)/255.0; b = int(h[4:6], 16)/255.0
    H, L, S = colorsys.rgb_to_hls(r, g, b)
    L = max(0, min(1, L * factor))
    r2, g2, b2 = colorsys.hls_to_rgb(H, L, S)
    return "#{:02x}{:02x}{:02x}".format(int(round(r2*255)), int(round(g2*255)), int(round(b2*255)))

# Brand colors from SCSS / setup
# UJ_GREEN, UJ_ORANGE are defined in your setup chunk:
# UJ_GREEN = "#99bb66"; UJ_ORANGE = "#f19e4b"
UJ_GREEN_D = _darken_hex(UJ_GREEN, 0.75)   # slightly darker outline
UJ_ORANGE_D = _darken_hex(UJ_ORANGE, 0.78) # optional (not required here)

def _ellipse_xy(cx, cy, rx, ry, n=60):
    t = np.linspace(0, 2*np.pi, n)
    return cx + rx*np.cos(t), cy + ry*np.sin(t)

def _fix_ci_bounds(df, lo_col, hi_col):
    """Ensure lo <= hi; if not, swap."""
    df = df.copy()
    bad = df[lo_col] > df[hi_col]
    if bad.any():
        lo = df.loc[bad, lo_col].copy()
        df.loc[bad, lo_col] = df.loc[bad, hi_col].values
        df.loc[bad, hi_col] = lo.values
    return df

metrics = sorted(merged["metric"].unique())

# -------- Build robust human horizontal bands and LLM vertical bands per metric --------
ellipse_rows = {}   # per metric: DataFrame with columns: paper, cx, cy, rx, ry, hum_mid, llm_mid
hum_points   = {}   # per metric: individual human dots with y=LLM midpoint

for m in metrics:
    # rater-level human rows for this metric
    H = human_raw.loc[human_raw["metric"] == m,
                      ["paper","middle_rating","lower_CI","upper_CI"]].copy()
    # sanitize bounds if present
    if {"lower_CI","upper_CI"}.issubset(H.columns):
        H = _fix_ci_bounds(H, "lower_CI", "upper_CI")

    # per-paper union across raters: use min(lower_CI, points) and max(upper_CI, points)
    grp = H.groupby("paper", as_index=False).agg(
        hum_mean=("middle_rating","mean"),
        lo_pt=("middle_rating","min"),
        hi_pt=("middle_rating","max"),
        lo_ci=("lower_CI","min"),
        hi_ci=("upper_CI","max")
    )

    # combine CIs + points into one union band
    grp["lo_all"] = grp[["lo_pt","lo_ci"]].min(axis=1, skipna=True)
    grp["hi_all"] = grp[["hi_pt","hi_ci"]].max(axis=1, skipna=True)

    # if CIs missing entirely, fall back to point range
    grp["lo_all"] = grp["lo_all"].fillna(grp["lo_pt"])
    grp["hi_all"] = grp["hi_all"].fillna(grp["hi_pt"])

    # clean any remaining inversions
    bad = grp["lo_all"] > grp["hi_all"]
    if bad.any():
        swap_lo = grp.loc[bad, "lo_all"].copy()
        grp.loc[bad, "lo_all"] = grp.loc[bad, "hi_all"].values
        grp.loc[bad, "hi_all"] = swap_lo.values

    grp["cx"] = 0.5*(grp["lo_all"] + grp["hi_all"])
    grp["rx"] = (grp["hi_all"] - grp["lo_all"]).clip(lower=0) / 2.0

    # Join LLM mid & CI (one row per paper×metric in merged)
    L = (merged.loc[merged["metric"]==m,
                    ["paper","midpoint_llm","lower_llm","upper_llm","midpoint_human"]]
               .dropna(subset=["paper","midpoint_llm"]))
    L = _fix_ci_bounds(L, "lower_llm", "upper_llm")

    J = (grp.merge(L, on="paper", how="inner")
             .rename(columns={"midpoint_llm":"llm_mid", "midpoint_human":"hum_mid"}))

    # vertical radius from LLM CI half-width
    J["ry"] = (J["upper_llm"] - J["lower_llm"]).clip(lower=0) / 2.0
    J["cy"] = J["llm_mid"]
    J["cx"] = J["cx"].astype(float)
    J["cy"] = J["cy"].astype(float)

    ellipse_rows[m] = J[["paper","cx","cy","rx","ry","hum_mid","llm_mid","lo_all","hi_all"]].copy()

    # individual human dots (x) at the LLM y for that paper
    hum_points[m] = (H.merge(J[["paper","cy"]], on="paper", how="inner")
                       .rename(columns={"middle_rating":"human_x", "cy":"llm_y"}))

# -------- Global square axis range across all metrics --------
all_x = np.concatenate([df["cx"].to_numpy() for df in ellipse_rows.values() if len(df)])
all_y = np.concatenate([df["cy"].to_numpy() for df in ellipse_rows.values() if len(df)])
if all_x.size and all_y.size:
    lo = float(min(all_x.min(), all_y.min()))
    hi = float(max(all_x.max(), all_y.max()))
else:
    lo, hi = 0.0, 100.0
pad = 0.05 * (hi - lo if hi > lo else 1.0)
x0, x1 = lo - pad, hi + pad
y0, y1 = lo - pad, hi + pad

# -------- Build the Plotly figure --------
fig = go.Figure()
annots = []
trace_spans = []  # (start_idx, end_idx) per metric

for m in metrics:
    start_idx = len(fig.data)

    J = ellipse_rows[m]
    I = hum_points[m]

    # Draw each ellipse as a filled polygon (brand colors)
    for row in J.itertuples(index=False):
        ex, ey = _ellipse_xy(row.cx, row.cy, row.rx, row.ry, n=60)
        fig.add_trace(go.Scatter(
            x=ex, y=ey, mode="lines",
            line=dict(color=_hex_to_rgba_str(UJ_GREEN_D, 0.65), width=1),
            fill="toself", fillcolor=_hex_to_rgba_str(UJ_ORANGE, 0.12),
            hovertemplate=(
                "<b>%{text}</b><br>"
                "Human band: [%{customdata[0]:.1f}, %{customdata[1]:.1f}]<br>"
                "Center (Human, LLM) = (%{customdata[2]:.1f}, %{customdata[3]:.1f})<extra></extra>"
            ),
            text=[row.paper]*len(ex),
            customdata=np.column_stack([np.repeat(row.lo_all, len(ex)),
                                        np.repeat(row.hi_all, len(ex)),
                                        np.repeat(row.cx, len(ex)),
                                        np.repeat(row.cy, len(ex))]),
            showlegend=False,
            visible=False
        ))

    # Individual human dots
    fig.add_trace(go.Scatter(
        x=I["human_x"], y=I["llm_y"],
        mode="markers", name="Human raters",
        marker=dict(size=5, color=UJ_GREEN, opacity=0.50,
                    line=dict(width=0.5, color="rgba(50,50,50,0.25)")),
        text=I["paper"] + " — " + I["human_x"].round(1).astype(str),
        hovertemplate="<b>%{text}</b><br>LLM=%{y:.1f}<extra></extra>",
        showlegend=True, visible=False
    ))

    # Centers (orange diamonds)
    fig.add_trace(go.Scatter(
        x=J["hum_mid"], y=J["llm_mid"],
        mode="markers", name="Midpoints",
        marker=dict(size=9, color=UJ_ORANGE, symbol="diamond",
                    line=dict(color="rgba(50,50,50,0.35)", width=0.6)),
        text=J["paper"],
        hovertemplate="<b>%{text}</b><br>Human mid=%{x:.1f}<br>LLM mid=%{y:.1f}<extra></extra>",
        showlegend=True, visible=False
    ))

    # Single legend stub for ellipses
    fig.add_trace(go.Scatter(
        x=[None], y=[None], mode="lines", name="CI (human × LLM)",
        line=dict(color=_hex_to_rgba_str(UJ_GREEN_D, 0.70)), fill="toself",
        showlegend=True, visible=False
    ))

    # Correlations on midpoints for the caption
    if len(J) >= 2:
        r_p = pearsonr(J["hum_mid"], J["llm_mid"])[0]
        r_s = spearmanr(J["hum_mid"], J["llm_mid"])[0]
    else:
        r_p = np.nan; r_s = np.nan

    annots.append(dict(
        x=x0 + 0.05*(x1-x0), y=y1 - 0.05*(y1-y0), xref="x", yref="y",
        showarrow=False, font=dict(size=12, color="#222"),
        text=f"{m.replace('_',' ').title()} — Pearson r = {r_p:.2f}; Spearman ρ = {r_s:.2f}"
    ))
    trace_spans.append((start_idx, len(fig.data)-1))

# 45° equality line
fig.add_shape(type="line", x0=x0, y0=x0, x1=x1, y1=y1,
              line=dict(color="rgba(120,120,120,0.5)", dash="dot"))

# Dropdown: one entry per metric
buttons = []
total = len(fig.data)
for i, m in enumerate(metrics):
    vis = [False]*total
    s, e = trace_spans[i]
    for k in range(s, e+1):
        vis[k] = True
    buttons.append(dict(
        method="update", label=m.replace("_"," ").title(),
        args=[{"visible": vis},
              {"title": f"LLM vs Human — CI ellipses (union of human range/CI) — {m.replace('_',' ').title()}",
               "annotations": [annots[i]]}]
    ))

# Initialize first metric visible
init_vis = [False]*len(fig.data)
s0, e0 = trace_spans[0]
for k in range(s0, e0+1):
    init_vis[k] = True

fig.update_layout(
    template="plotly_white",
    title=f"LLM vs Human — CI ellipses (union of human range/CI) — {metrics[0].replace('_',' ').title()}",
    xaxis_title="Human (x)",
    yaxis_title="LLM (y)",
    xaxis=dict(range=[x0, x1]),
    yaxis=dict(range=[y0, y1], scaleanchor="x", scaleratio=1),

    # --- Legend below the plot ---
    legend=dict(
        orientation="h",
        yanchor="top", y=-0.16,          # push below the plotting area
        xanchor="center", x=0.50,        # centered
        bgcolor="rgba(0,0,0,0)"
    ),

    # Leave enough space at bottom for legend + dropdown
    margin=dict(l=60, r=20, t=70, b=160),

    # --- Dropdown below the legend ---
    updatemenus=[dict(
        type="dropdown",
        direction="up",
        showactive=True,
        buttons=buttons,
        x=0.50, xanchor="center",
        y=-0.30, yanchor="top",          # a bit further down than legend
        pad={"t": 2, "b": 2}
    )]
)

fig.update_traces(visible=False)
for k, v in enumerate(init_vis):
    fig.data[k].visible = v

fig
Figure 3.1: LLM vs Human with robust CI ellipses. The horizontal span covers the union of all human raters’ CIs and point ratings; the vertical span comes from the LLM CI. Centers at midpoints; the dropdown selects the metric.

For each paper (selected metric), the ellipse is centered vertically at the LLM midpoint and horizontally at the middle of the combined human band; the orange diamonds mark the (Human, LLM) midpoint pairs. The horizontal radius covers the union of all human evidence for that paper — combining individual raters’ point scores and their CIs — while the vertical radius reflects the LLM CI. This makes the ellipse wide enough to contain every human point horizontally (points that fall outside their own stated CI are noted in a separate diagnostic check).
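
As a minimal numeric sketch of the union‑band construction (made‑up rater values; column names mirror human_raw above):

import pandas as pd

# Three hypothetical raters for one paper on one metric
H = pd.DataFrame({
    "paper":         ["Doe 2024"] * 3,
    "middle_rating": [55, 62, 70],
    "lower_CI":      [50, 58, 60],
    "upper_CI":      [65, 68, 80],
})

band = H.groupby("paper").agg(lo_pt=("middle_rating", "min"),
                              hi_pt=("middle_rating", "max"),
                              lo_ci=("lower_CI", "min"),
                              hi_ci=("upper_CI", "max"))
lo_all = band[["lo_pt", "lo_ci"]].min(axis=1)          # 50
hi_all = band[["hi_pt", "hi_ci"]].max(axis=1)          # 80
cx = 0.5 * (lo_all + hi_all)                           # horizontal center: 65
rx = 0.5 * (hi_all - lo_all)                           # horizontal radius: 15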

Show code
from matplotlib.gridspec import GridSpec

def _plot_hist_for_metric(ax, df_long, metric, bins, title=None):
    g = df_long[df_long["metric"] == metric]
    ax.hist(g[g["rater"]=="LLM"]["score"],   bins=bins, alpha=0.55, label="LLM",
            color=UJ_ORANGE, edgecolor="white")
    ax.hist(g[g["rater"]=="Human"]["score"], bins=bins, alpha=0.55, label="Human",
            color=UJ_GREEN,  edgecolor="white")
    ax.set_title(title or metric.replace("_", " ").title())
    ax.set_xlabel("Score"); ax.set_ylabel("Count")

metrics_all = list(ratings_long["metric"].dropna().unique())
has_overall = "overall" in metrics_all
metrics_others = [m for m in sorted(metrics_all) if m != "overall"]

# bins logic
smin, smax = ratings_long["score"].min(), ratings_long["score"].max()
bins = np.arange(0, 101, 5) if (0 <= smin and smax <= 100) else np.linspace(smin, smax, 21)

if has_overall:
    cols = 3
    rows_other = math.ceil(len(metrics_others) / cols) or 1
    fig = plt.figure(figsize=(cols*4.6, 1.2*3.8 + rows_other*3.6))
    gs = GridSpec(nrows=rows_other+1, ncols=cols, figure=fig,
                  height_ratios=[1.3] + [1]*rows_other)

    # Big top row: Overall spans all columns
    ax0 = fig.add_subplot(gs[0, :])
    _plot_hist_for_metric(ax0, ratings_long, "overall", bins, title="Overall")

    # Bottom grid: remaining metrics
    for idx, m in enumerate(metrics_others):
        r = 1 + idx // cols
        c = idx % cols
        ax = fig.add_subplot(gs[r, c])
        _plot_hist_for_metric(ax, ratings_long, m, bins)

    # Turn off any empty slots
    total_slots = rows_other * cols
    for empty in range(len(metrics_others), total_slots):
        r = 1 + empty // cols
        c = empty % cols
        fig.add_subplot(gs[r, c]).axis("off")

    # Shared legend once (top center)
    handles, labels = ax0.get_legend_handles_labels()
    fig.legend(handles, labels, loc="upper center", ncol=2, frameon=False)
    plt.tight_layout(rect=(0,0,1,0.94))
else:
    # fallback to simple grid if no overall
    metrics = sorted(metrics_all)
    n = len(metrics); cols = 3; rows = math.ceil(n/cols)
    fig, axes = plt.subplots(rows, cols, figsize=(cols*4.6, rows*3.6), sharex=True)
    axes = np.atleast_2d(axes); axes_flat = axes.flat
    for ax, m in zip(axes_flat, metrics):
        _plot_hist_for_metric(ax, ratings_long, m, bins)
    for ax in list(axes_flat)[n:]:
        ax.axis("off")
    handles, labels = axes_flat[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc="upper center", ncol=2, frameon=False)
    plt.tight_layout(rect=(0,0,1,0.94))
Figure 3.2: Distributions by metric. Top: Overall (full width). Bottom: remaining metrics in a 2×3 grid.

Differences by paper and metric

For each (paper, metric) pair, Figure 3.3 plots the LLM − Human difference. Green means the LLM midpoint is lower than the human midpoint; orange means it is higher. A quick numeric companion to the stripe patterns described below follows the list.

  • Vertical stripes (by metric) suggest a metric‑specific bias (LLM systematically up/down on that metric).
  • Horizontal stripes (by paper) flag paper‑specific disagreement (LLM consistently above/below for that paper across metrics).
  • Isolated bright cells highlight individual outliers worth auditing qualitatively.
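
As a quick numeric check of those patterns (a sketch using the merged table defined earlier), the per‑metric and per‑paper means of the differences summarize the vertical and horizontal stripes respectively:

diff = merged["midpoint_llm"] - merged["midpoint_human"]

# Metric-specific bias: mean LLM − Human per metric (vertical stripes)
metric_bias = diff.groupby(merged["metric"]).mean().sort_values()

# Paper-specific disagreement: mean LLM − Human per paper (horizontal stripes)
paper_bias = diff.groupby(merged["paper"]).mean().sort_values()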
Show code
pair = pair_llm_human(merged)
pair["diff"] = pair["LLM"].astype(float) - pair["Human"].astype(float)

# Row order: sort by signed difference on the 'overall' metric (LLM−Human)
order_overall = (pair.loc[pair["metric"]=="overall"]
                   .groupby("paper")["diff"].mean()
                   .sort_values())  # ascending: LLM lower (green) at top, higher (orange) at bottom

hm = pair.pivot_table(index="paper", columns="metric", values="diff", aggfunc="mean")
# apply row order; append any papers lacking 'overall' at the bottom
ordered_rows = list(order_overall.index) + [p for p in hm.index if p not in order_overall.index]
hm = hm.loc[ordered_rows]

Z = hm.values.astype(float)

# Robust centered normalization (as before)
pos = Z[Z > 0]; neg = -Z[Z < 0]
p_hi = np.nanpercentile(pos, 97.5) if pos.size else 0.0
n_hi = np.nanpercentile(neg, 97.5) if neg.size else 0.0
span = max(p_hi, n_hi)
if not np.isfinite(span) or span == 0:
    span = float(np.nanmax(np.abs(Z))) if np.isfinite(np.nanmax(np.abs(Z))) else 1.0

norm = TwoSlopeNorm(vmin=-span, vcenter=0.0, vmax=span)

fig_hm, ax = plt.subplots(
    figsize=(max(6.5, 0.8*hm.shape[1]), max(6.0, 0.40*hm.shape[0]))
)
im = ax.imshow(Z, cmap=UJ_DIV, norm=norm, aspect="auto", interpolation="nearest")

ax.set_xticks(np.arange(hm.shape[1]), labels=[c.replace("_", " ").title() for c in hm.columns], rotation=40, ha="right")
ax.set_yticks(np.arange(hm.shape[0]), labels=hm.index.tolist())

ax.set_xticks(np.arange(-0.5, hm.shape[1], 1), minor=True)
ax.set_yticks(np.arange(-0.5, hm.shape[0], 1), minor=True)
ax.grid(which="minor", color="white", linewidth=0.6)
ax.tick_params(which="minor", bottom=False, left=False)

cb = plt.colorbar(im, ax=ax, fraction=0.03, pad=0.02)
cb.set_label("LLM − Human (points)")
tick_vals = np.linspace(-span, span, 5)
cb.set_ticks(tick_vals); cb.set_ticklabels([f"{t:.1f}" for t in tick_vals])

ax.set_title("Heatmap: LLM − Human differences by paper × metric (sorted by overall difference)")
plt.tight_layout()
Figure 3.3: LLM − Human difference by paper × metric (robust, centered scale; rows sorted by signed overall difference).

Agreement metrics

Table 3.1 summarizes agreement. Pearson (r) captures linear alignment of levels; Spearman (ρ) captures rank agreement (a monotonic relationship) and is robust to scale differences. We report the mean and median of (LLM − Human) to summarize the average shift and the central tendency of the differences, along with Cohen’s κ. Because scores are continuous, we first discretize both LLM and Human ratings using shared quantile cutpoints (5 bins), then compute an unweighted κ (exact category agreement beyond chance) and a quadratic‑weighted κ, which penalizes larger disagreements more than smaller ones.
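
To illustrate the binning‑plus‑κ step on toy data (a sketch assuming the bin_together and weighted_kappa helpers defined above): identical score vectors give κ = 1, while adding noise lowers both statistics, with adjacent‑bin disagreements hurting the quadratic‑weighted κ less than the unweighted one.

import numpy as np

a = np.array([20.0, 35.0, 50.0, 65.0, 80.0])                 # hypothetical LLM midpoints
b_perfect = a.copy()                                          # identical human midpoints
b_noisy   = a + np.array([5.0, -8.0, 12.0, -4.0, 30.0])       # perturbed human midpoints

# Perfect agreement: every pair falls in the same bin, so κ = 1.0
ai, bi, k = bin_together(a, b_perfect, n_bins=5)
print(weighted_kappa(ai, bi, k=k, weights="quadratic"))       # 1.0

# Noisy agreement: κ drops below 1; compare unweighted vs quadratic
ai, bi, k = bin_together(a, b_noisy, n_bins=5)
print(weighted_kappa(ai, bi, k=k, weights=None),
      weighted_kappa(ai, bi, k=k, weights="quadratic"))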

As another indicator of agreement, we check whether each side’s credible interval contains the other’s point estimate — for example, if a human CI is [55, 75] and the LLM midpoint is 68, then llm_in_human is True for that (paper, metric). Table 3.3 shows the coverage shares by rating criterion.

Table 3.2 shows “calibration” results based on a simple linear regression of human on LLM midpoints; Figure 3.4 plots the width of the human raters’ CI against the absolute difference in ratings; and Table 3.4 lists the papers with the largest absolute difference between LLM and human ratings.
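
As a worked example of the calibration in Table 3.2, using the overall‑metric fit (Human ≈ 47.94 + 0.357·LLM): applying the fit shrinks LLM scores toward the human center.

slope, intercept = 0.357, 47.941            # overall-metric fit from Table 3.2
for llm_score in (70, 90):
    print(llm_score, round(intercept + slope * llm_score, 1))   # ≈ 72.9 and ≈ 80.1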

Show code
agree_cat = agreement_summary(merged, n_bins=5, bin_strategy="quantile")

ov = overall_stats(merged, n_bins=5, bin_strategy="quantile")


agree_view = (agree_cat
              .rename(columns={"kappa_unw":"kappa_unwtd","kappa_quad":"kappa_quad"})
              .loc[:, ["metric","n","pearson","spearman","mean_bias","median_bias","kappa_unwtd","kappa_quad"]]
              .sort_values("metric"))
agree_view = agree_view.round({"pearson":3, "spearman":3, "mean_bias":2, "median_bias":2, "kappa_unwtd":3, "kappa_quad":3})
agree_view
Table 3.1: Agreement metrics by metric (Pearson, Spearman, mean/median bias, κ unweighted/weighted).
metric n pearson spearman mean_bias median_bias kappa_unwtd kappa_quad
0 advancing_knowledge 27 0.198 0.259 7.12 6.0 0.052 0.257
1 claims_evidence 6 0.410 0.471 -3.75 -3.0 -0.154 0.400
2 global_relevance 28 0.135 0.109 8.21 8.5 -0.040 0.070
3 logic_communication 28 0.001 0.130 10.01 9.0 -0.029 0.041
4 methods 27 0.309 0.403 6.60 5.5 0.148 0.288
5 open_science 28 0.138 0.103 -6.01 -7.0 0.009 0.137
6 overall 28 0.280 0.309 1.64 1.0 0.125 0.374
Show code
from scipy.stats import linregress

def calibrate_per_metric(merged_df):
    P = pair_llm_human(merged_df)
    rows = []
    for m, g in P.groupby("metric"):
        x, y = g["LLM"].to_numpy(), g["Human"].to_numpy()
        if len(g) >= 2:
            lr = linregress(x, y)  # slope, intercept, rvalue, pvalue, stderr
            yhat = lr.slope * x + lr.intercept
            mad_pre  = float(np.mean(np.abs(x - y)))
            mad_post = float(np.mean(np.abs(yhat - y)))
            rows.append(dict(
                metric=m,
                n=len(g),
                slope=lr.slope, intercept=lr.intercept,
                r2=lr.rvalue**2,
                MAD_pre=mad_pre, MAD_post=mad_post,
                MAD_delta=mad_pre - mad_post
            ))
        else:
            rows.append(dict(metric=m, n=len(g),
                             slope=np.nan, intercept=np.nan, r2=np.nan,
                             MAD_pre=np.nan, MAD_post=np.nan, MAD_delta=np.nan))
    out = pd.DataFrame(rows).sort_values("metric").reset_index(drop=True)
    return out

calib = calibrate_per_metric(merged).copy()
calib_rounded = calib.copy()
for c in ["slope","intercept","r2","MAD_pre","MAD_post","MAD_delta"]:
    calib_rounded[c] = calib_rounded[c].round(3)
calib_rounded
Table 3.2: Calibration of LLM to Human by metric (Human ≈ intercept + slope·LLM). Includes pre-/post- MAD.
metric n slope intercept r2 MAD_pre MAD_post MAD_delta
0 advancing_knowledge 27 0.284 50.232 0.039 11.216 8.812 2.404
1 claims_evidence 6 0.220 61.234 0.168 11.250 5.859 5.391
2 global_relevance 28 0.255 53.573 0.018 14.589 12.000 2.589
3 logic_communication 28 0.002 72.823 0.000 13.464 9.802 3.663
4 methods 27 0.404 39.774 0.096 12.099 9.427 2.672
5 open_science 28 0.174 57.509 0.019 17.976 13.346 4.631
6 overall 28 0.357 47.941 0.078 9.149 7.837 1.312
Show code
cov = (merged.dropna(subset=["lower_llm","upper_llm","lower_human","upper_human",
                             "midpoint_llm","midpoint_human"])
              .copy())

cov["human_in_llm"] = (cov["midpoint_human"] >= cov["lower_llm"]) & (cov["midpoint_human"] <= cov["upper_llm"])
cov["llm_in_human"] = (cov["midpoint_llm"]   >= cov["lower_human"]) & (cov["midpoint_llm"]   <= cov["upper_human"])
cov["both_cover"]   = cov["human_in_llm"] & cov["llm_in_human"]

coverage_tbl = (cov.groupby("metric", as_index=False)
                  .agg(N=("paper","size"),
                       human_in_llm=("human_in_llm","mean"),
                       llm_in_human=("llm_in_human","mean"),
                       both_cover=("both_cover","mean")))
coverage_tbl[["human_in_llm","llm_in_human","both_cover"]] = \
    coverage_tbl[["human_in_llm","llm_in_human","both_cover"]].round(2)

# Overall coverage (glue)
overall_cov = dict(
    N=int(len(cov)),
    human_in_llm=float(cov["human_in_llm"].mean()),
    llm_in_human=float(cov["llm_in_human"].mean()),
    both=float(cov["both_cover"].mean())
)
# glue("cov_both", overall_cov["both"])
# glue("cov_hinl", overall_cov["human_in_llm"])
# glue("cov_linh", overall_cov["llm_in_human"])

coverage_tbl
Table 3.3: CI coverage: does one interval contain the other’s point estimate?
metric N human_in_llm llm_in_human both_cover
0 advancing_knowledge 24 0.67 0.62 0.54
1 claims_evidence 6 0.67 0.67 0.67
2 global_relevance 25 0.36 0.44 0.28
3 logic_communication 25 0.36 0.60 0.36
4 methods 24 0.62 0.54 0.50
5 open_science 25 0.48 0.40 0.28
6 overall 25 0.60 0.60 0.52
Show code
use = merged.dropna(subset=["lower_human","upper_human","midpoint_llm","midpoint_human"]).copy()
use["human_width"] = use["upper_human"] - use["lower_human"]
use["abs_diff"]    = (use["midpoint_llm"] - use["midpoint_human"]).abs()

rho, pval = (np.nan, np.nan)
if len(use) >= 3:
    rho, pval = spearmanr(use["human_width"], use["abs_diff"])

fig, ax = plt.subplots(figsize=(6.2, 4.6))
ax.scatter(use["human_width"], use["abs_diff"], s=28, alpha=.75, color=UJ_ORANGE, edgecolor="white")
ax.set_xlabel("Human CI width")
ax.set_ylabel("|LLM − Human|")
ax.set_title(f"uncertainty vs disagreement  (Spearman ρ = {rho:.2f}, p = {pval:.3f})")
ax.grid(True, alpha=.25)
plt.tight_layout()
Figure 3.4: Human uncertainty (CI width) vs |LLM − Human|. Spearman correlation reported.
Show code
out = merged.copy()
out["abs_diff"] = (out["midpoint_llm"] - out["midpoint_human"]).abs()
out["human_in_llm"] = (out["midpoint_human"] >= out["lower_llm"]) & (out["midpoint_human"] <= out["upper_llm"])
out["llm_in_human"] = (out["midpoint_llm"]   >= out["lower_human"]) & (out["midpoint_llm"]   <= out["upper_human"])
cols = ["paper","metric","midpoint_llm","midpoint_human","abs_diff","human_in_llm","llm_in_human"]
outliers = (out.sort_values("abs_diff", ascending=False)
              .loc[:, cols]
              .head(15))
outliers
Table 3.4: Top absolute differences (LLM vs Human). Flags show whether each point lies within the other’s CI.
paper metric midpoint_llm midpoint_human abs_diff human_in_llm llm_in_human
23 Alcott et al. 2024 global_relevance 78 16.00 62.00 False False
49 Bettle 2023 methods 68 10.00 58.00 False False
20 Alcott et al. 2024 advancing_knowledge 90 32.50 57.50 False False
63 Bruers 2021 open_science 30 80.00 50.00 False False
51 Bettle 2023 logic_communication 78 30.00 48.00 False False
4 Aghion et al. 2017 open_science 40 87.50 47.50 False False
52 Bettle 2023 open_science 55 10.00 45.00 False False
139 Kremer et al. 2022 global_relevance 95 51.25 43.75 False False
82 Chuard et al. 2022 open_science 55 96.00 41.00 False False
48 Bettle 2023 overall 68 32.50 35.50 False False
75 Carson et al. 2023 logic_communication 85 50.00 35.00 False False
39 Barberio et al. 2022 logic_communication 80 45.00 35.00 False False
61 Bruers 2021 advancing_knowledge 55 90.00 35.00 False False
58 Bruers 2021 overall 48 80.00 32.00 False False
3 Aghion et al. 2017 logic_communication 88 57.50 30.50 False False