LLM vs. Human Ratings

Overview

This page compares the LLM‑generated ratings (gpt-5) from the previous section with human evaluations across the Unjournal’s criteria. We (i) harmonize the two sources to a common set of metrics and paper IDs, (ii) compute paired scores per (paper, metric), and (iii) summarize agreement using distributional views and reliability statistics.

Data & Harmonization

  • Sources. LLM ratings come from results/metrics_long.csv (rendered in the previous chapter). Human ratings are imported from a hand‑coded spreadsheet and mapped to LLM paper IDs via UJ_map.csv.
  • Metric alignment. Human criteria are recoded to the LLM schema (e.g., claims → claims_evidence, adv_knowledge → advancing_knowledge, etc.). If humans provided both gp_relevance and real_world, we fold both into the single LLM metric global_relevance (see the short example after this list).
  • Uncertainty fields. Where available, we carry lower/upper bounds for both LLM and humans; these are used in optional uncertainty checks but do not affect the mid‑point comparisons below.
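
To make the fold‑in concrete, here is a minimal sketch with made‑up values (the paper name “Doe 2024” and the scores are illustrative; the column names mirror the human file): two criteria rows map to the same LLM metric and are then collapsed by the per‑(paper, metric) averaging used in the pipeline below.

import pandas as pd

# Hypothetical human rows for one paper: both criteria map to global_relevance
toy = pd.DataFrame({
    "paper":         ["Doe 2024", "Doe 2024"],
    "criteria":      ["gp_relevance", "real_world"],
    "middle_rating": [60, 70],
})
toy["metric"] = toy["criteria"].map({"gp_relevance": "global_relevance",
                                     "real_world":   "global_relevance"})

# Collapsing repeats per (paper, metric) yields one midpoint: (60 + 70) / 2 = 65
folded = toy.groupby(["paper", "metric"], as_index=False)["middle_rating"].mean()
print(folded)  # Doe 2024, global_relevance, 65.0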
Show code
import os, re, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from scipy.stats import pearsonr, spearmanr

# Glue for inline text (MyST)
try:
    from myst_nb import glue
except Exception:
    # Fallback: still define a function so doc runs; inline {{glue: ...}} needs myst-nb installed.
    def glue(name, val, display=False):
        globals()[name] = val


# Matplotlib defaults for a clean look
plt.rcParams.update({
    "figure.dpi": 120,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "axes.grid": True,
    "grid.alpha": 0.25,
    "axes.titlesize": 11,
    "axes.labelsize": 10,
    "legend.fontsize": 9
})
# ---- Unjournal brand palette (hex) --------------------------------
# From theme SCSS:
#   $unjournal-green:  #99bb66
#   $unjournal-orange: #f19e4b
UJ_ORANGE = "#f19e4b"   # LLM
UJ_GREEN  = "#99bb66"   # Human

# Helpers to make brand-consistent variants & Plotly-friendly RGBA
import colorsys
from matplotlib import colors as mcolors
from matplotlib.colors import LinearSegmentedColormap

def adjust_lightness(hex_color: str, factor: float = 0.80) -> str:
    """
    Darken/lighten a color by scaling lightness in HLS.
    factor < 1 → darker; factor > 1 → lighter. 0.80 ≈ 20% darker.
    """
    r, g, b = mcolors.to_rgb(hex_color)
    h, l, s = colorsys.rgb_to_hls(r, g, b)
    l = max(0, min(1, l * factor))
    r2, g2, b2 = colorsys.hls_to_rgb(h, l, s)
    return mcolors.to_hex((r2, g2, b2))

def rgba_hex(hex_color: str, alpha: float) -> str:
    """Convert hex '#rrggbb' → 'rgba(r,g,b,a)' for Plotly."""
    r, g, b = mcolors.to_rgb(hex_color)
    return f"rgba({int(round(255*r))},{int(round(255*g))},{int(round(255*b))},{alpha})"

# Darker accents (same hue) for outlines & heatmap extremes
UJ_ORANGE_D = adjust_lightness(UJ_ORANGE, 0.78)  # ~22% darker
UJ_GREEN_D  = adjust_lightness(UJ_GREEN,  0.72)  # ~28% darker

# Diverging map using brand colors with darker extremes and soft center
def make_uj_div():
    return LinearSegmentedColormap.from_list(
        "uj_div_brand",
        [
            (0.00, UJ_GREEN_D),            # deep brand green
            (0.40, UJ_GREEN),              # main brand green
            (0.50, "#f7f7f7"),             # neutral center
            (0.60, UJ_ORANGE),             # main brand orange
            (1.00, UJ_ORANGE_D)            # deep brand orange
        ],
        N=256
    )

UJ_DIV = make_uj_div()  # used by both heatmaps



from matplotlib.colors import TwoSlopeNorm  # used by the difference heatmap; LinearSegmentedColormap already imported above
Show code
# Files
LLM_FILE   = "results/metrics_long.csv"
HUMAN_FILE = "UJ_ratings/rsx_evalr_rating (7).csv"
IMAP_FILE  = "UJ_ratings/UJ_map.csv"

llm   = pd.read_csv(LLM_FILE)
human = pd.read_csv(HUMAN_FILE)
imap  = pd.read_csv(IMAP_FILE)

# 1) Map human 'criteria' → LLM metric names
crit_map = {
    "overall":          "overall",
    "claims":           "claims_evidence",
    "methods":          "methods",
    "adv_knowledge":    "advancing_knowledge",
    "logic_comms":      "logic_communication",
    "open_sci":         "open_science",
    "gp_relevance":     "global_relevance",
    "real_world":       "global_relevance",  # fold-in
    "merits_journal":   None,
    "journal_predict":  None
}
human["metric"] = human["criteria"].map(crit_map)
human = human[human["metric"].notna()].copy()

# 2) Normalize titles to join robustly
def _norm_title(s: str) -> str:
    s = str(s)
    s = s.replace("’", "'").replace("–","-").replace("—","-")
    s = re.sub(r"\s+", " ", s).strip()
    return s

human["research_norm"] = human["research"].map(_norm_title).str.lower()
imap["research_norm"]  = imap["research"].map(_norm_title).str.lower()

# 3) Attach LLM paper id to human rows
human = human.merge(imap[["paper","research_norm"]], on="research_norm", how="left")

# 4) Diagnostics
missing_id  = sorted(human[human.paper.isna()]["research"].unique())
missing_llm = sorted(set(llm.paper.unique()) - set(human.paper.dropna().unique()))
# if missing_id:
#     print("⚠️ Human titles without LLM id (fill 'paper' in UJ_map.csv):")
#     for t in missing_id: print("   •", t)
# if missing_llm:
#     print("\n⚠️ LLM papers with no human match:")
#     for p in missing_llm: print("   •", p)

# 5) Collapse human repeats to means (per paper×metric)
human_use = (human
    .dropna(subset=["paper","metric","middle_rating"])
    .groupby(["paper","metric"], as_index=False)
    .agg(midpoint_human=("middle_rating","mean"),
         lower_human   =("lower_CI","mean"),
         upper_human   =("upper_CI","mean"),
         n_human       =("middle_rating","size"))
)

# 6) Select/rename LLM columns
llm_use = (llm[["paper","metric","midpoint","lower_bound","upper_bound"]]
           .rename(columns={"midpoint":"midpoint_llm",
                            "lower_bound":"lower_llm",
                            "upper_bound":"upper_llm"}))

# 7) Merge pairs
merged = llm_use.merge(human_use, on=["paper","metric"], how="inner")

# print(f"✅ merged rows: {len(merged)}  "
#       f"({merged.paper.nunique()} papers × {merged.metric.nunique()} metrics)")

# Long form for distribution plots
ratings_long = pd.concat([
    merged[["paper","metric","midpoint_llm"]]
        .rename(columns={"midpoint_llm":"score"}).assign(rater="LLM"),
    merged[["paper","metric","midpoint_human"]]
        .rename(columns={"midpoint_human":"score"}).assign(rater="Human")
], ignore_index=True)
Show code
# Keep per‑rater rows (after mapping; before averaging)
human_raw = human.dropna(subset=["paper","metric","middle_rating"]).copy()

# Try to detect a rater id column; otherwise make one per (paper, metric)
cands = [c for c in human_raw.columns if c.lower() in
         ["rater","reviewer","evaluator","coder","annotator","user","name",
          "id","uid","user_id","evaluator_id","reviewer_id"]]
if cands:
    rid_col = cands[0]
    human_raw["rater_id"] = human_raw[rid_col].astype(str)
else:
    human_raw["rater_id"] = (human_raw
                             .groupby(["paper","metric"])
                             .cumcount()
                             .add(1)
                             .map(lambda i: f"R{i}"))

# One row per paper×metric with LLM midpoint (for joining)
llm_single = llm_use[["paper","metric","midpoint_llm"]].drop_duplicates()
Show code
def pair_llm_human(df: pd.DataFrame) -> pd.DataFrame:
    """Return (paper, metric, LLM, Human) single-row pairs."""
    return (df[["paper","metric","midpoint_llm","midpoint_human"]]
            .dropna()
            .rename(columns={"midpoint_llm":"LLM","midpoint_human":"Human"}))

def bin_together(a, b, n_bins=5, strategy="quantile"):
    """
    Bin two continuous arrays with shared edges (for κ).
    - 'quantile' bins are robust to skew.
    - fallback to equal-width if quantiles collapse.
    """
    both = pd.Series(np.r_[a, b]).dropna()
    if both.nunique() <= max(3, n_bins):
        # Already discrete-ish: map unique sorted values to 0..K-1
        uniq = np.sort(both.unique())
        mapping = {v:i for i,v in enumerate(uniq)}
        return np.vectorize(mapping.get)(a), np.vectorize(mapping.get)(b), len(uniq)
    if strategy == "quantile":
        qs = both.quantile(np.linspace(0, 1, n_bins+1)).values
        edges = np.unique(qs)
        if len(edges) - 1 < 2:
            strategy = "equal"
    if strategy == "equal":
        lo, hi = float(both.min()), float(both.max())
        edges = np.linspace(lo, hi, n_bins+1)
    edges[0] -= 1e-9; edges[-1] += 1e-9
    a_bin = pd.cut(a, bins=edges, labels=False, include_lowest=True)
    b_bin = pd.cut(b, bins=edges, labels=False, include_lowest=True)
    k = int(np.nanmax([a_bin.max(), b_bin.max()]) + 1)
    return np.asarray(a_bin), np.asarray(b_bin), k

def weighted_kappa(a, b, k=None, weights="quadratic"):
    """
    Cohen's weighted kappa for ordinal labels 0..k-1.
    weights: 'quadratic' (default), 'linear', or None (unweighted).
    """
    a = np.asarray(a, dtype=int)
    b = np.asarray(b, dtype=int)
    if k is None:
        k = int(max(a.max(), b.max()) + 1)
    if k <= 1:  # not enough categories
        return np.nan

    # Observed agreement matrix (proportions)
    M = np.zeros((k, k), dtype=float)
    for i, j in zip(a, b):
        if np.isfinite(i) and np.isfinite(j):
            M[i, j] += 1
    if M.sum() == 0:
        return np.nan
    M = M / M.sum()

    # Expected by marginals
    r = M.sum(axis=1, keepdims=True)
    c = M.sum(axis=0, keepdims=True)
    E = r @ c

    # Weight matrix
    I = np.arange(k)[:, None]
    J = np.arange(k)[None, :]
    if weights == "quadratic":
        W = ((I - J)**2) / ((k - 1)**2 if k > 1 else 1)
    elif weights == "linear":
        W = np.abs(I - J) / (k - 1 if k > 1 else 1)
    else:
        W = (I != J).astype(float)


    num = (W * M).sum()
    den = (W * E).sum()
    return np.nan if den == 0 else 1.0 - num / den

def agreement_summary(merged_df, n_bins=5, bin_strategy="quantile"):
    """
    Per-metric agreement: Pearson, Spearman, mean/median bias, κ (unweighted & quadratic).
    """
    pair = pair_llm_human(merged_df)
    rows = []
    for metric, g in pair.groupby("metric"):
        llm = g["LLM"].to_numpy()
        hum = g["Human"].to_numpy()

        pr, pr_p = (np.nan, np.nan)
        sr, sr_p = (np.nan, np.nan)
        if len(g) >= 2:
            try: pr, pr_p = pearsonr(llm, hum)
            except Exception: pass
            try: sr, sr_p = spearmanr(llm, hum)
            except Exception: pass

        diff = llm - hum
        mb, medb = float(np.mean(diff)), float(np.median(diff))

        a_bin, b_bin, k = bin_together(llm, hum, n_bins=n_bins, strategy=bin_strategy)
        k_unw = weighted_kappa(a_bin, b_bin, k=k, weights=None)
        k_quad = weighted_kappa(a_bin, b_bin, k=k, weights="quadratic")

        rows.append(dict(
            metric=metric, n=len(g),
            pearson=pr, spearman=sr,
            mean_bias=mb, median_bias=medb,
            kappa_unw=k_unw, kappa_quad=k_quad
        ))
    out = pd.DataFrame(rows).sort_values("metric").reset_index(drop=True)
    return out

def overall_stats(merged_df, n_bins=5, bin_strategy="quantile"):
    """
    Overall (across all metrics) Pearson and κ (unweighted & quadratic),
    plus overall MAD.
    """
    pair = pair_llm_human(merged_df)
    llm = pair["LLM"].to_numpy()
    hum = pair["Human"].to_numpy()
    pr = pearsonr(llm, hum)[0] if len(pair) >= 2 else np.nan
    mad = float(np.mean(np.abs(llm - hum))) if len(pair) else np.nan
    a_bin, b_bin, k = bin_together(llm, hum, n_bins=n_bins, strategy=bin_strategy)
    k_unw = weighted_kappa(a_bin, b_bin, k=k, weights=None)
    k_quad = weighted_kappa(a_bin, b_bin, k=k, weights="quadratic")
    return dict(pearson=pr, mad=mad, kappa_unw=k_unw, kappa_quad=k_quad, n=len(pair))

Distribution of ratings

Show code
import plotly.graph_objects as go
import colorsys

# --- Brand helpers (self-contained) --------------------------------
def _hex_to_rgba_str(hex_color: str, alpha: float) -> str:
    h = hex_color.lstrip("#")
    r = int(h[0:2], 16); g = int(h[2:4], 16); b = int(h[4:6], 16)
    return f"rgba({r},{g},{b},{alpha})"

def _darken_hex(hex_color: str, factor: float = 0.78) -> str:
    """Darken/lighten while preserving hue via HLS (factor<1 → darker)."""
    h = hex_color.lstrip("#")
    r = int(h[0:2], 16)/255.0; g = int(h[2:4], 16)/255.0; b = int(h[4:6], 16)/255.0
    H, L, S = colorsys.rgb_to_hls(r, g, b)
    L = max(0, min(1, L * factor))
    r2, g2, b2 = colorsys.hls_to_rgb(H, L, S)
    return "#{:02x}{:02x}{:02x}".format(int(round(r2*255)), int(round(g2*255)), int(round(b2*255)))

# Brand colors from SCSS / setup
# UJ_GREEN, UJ_ORANGE are defined in your setup chunk:
# UJ_GREEN = "#99bb66"; UJ_ORANGE = "#f19e4b"
UJ_GREEN_D = _darken_hex(UJ_GREEN, 0.75)   # slightly darker outline
UJ_ORANGE_D = _darken_hex(UJ_ORANGE, 0.78) # optional (not required here)

def _ellipse_xy(cx, cy, rx, ry, n=60):
    t = np.linspace(0, 2*np.pi, n)
    return cx + rx*np.cos(t), cy + ry*np.sin(t)

def _fix_ci_bounds(df, lo_col, hi_col):
    """Ensure lo <= hi; if not, swap."""
    df = df.copy()
    bad = df[lo_col] > df[hi_col]
    if bad.any():
        lo = df.loc[bad, lo_col].copy()
        df.loc[bad, lo_col] = df.loc[bad, hi_col].values
        df.loc[bad, hi_col] = lo.values
    return df

metrics = sorted(merged["metric"].unique())

# -------- Build robust human horizontal bands and LLM vertical bands per metric --------
ellipse_rows = {}   # per metric: DataFrame with columns: paper, cx, cy, rx, ry, hum_mid, llm_mid
hum_points   = {}   # per metric: individual human dots with y=LLM midpoint

for m in metrics:
    # rater-level human rows for this metric
    H = human_raw.loc[human_raw["metric"] == m,
                      ["paper","middle_rating","lower_CI","upper_CI"]].copy()
    # sanitize bounds if present
    if {"lower_CI","upper_CI"}.issubset(H.columns):
        H = _fix_ci_bounds(H, "lower_CI", "upper_CI")

    # per-paper union across raters: use min(lower_CI, points) and max(upper_CI, points)
    grp = H.groupby("paper", as_index=False).agg(
        hum_mean=("middle_rating","mean"),
        lo_pt=("middle_rating","min"),
        hi_pt=("middle_rating","max"),
        lo_ci=("lower_CI","min"),
        hi_ci=("upper_CI","max")
    )

    # combine CIs + points into one union band
    grp["lo_all"] = grp[["lo_pt","lo_ci"]].min(axis=1, skipna=True)
    grp["hi_all"] = grp[["hi_pt","hi_ci"]].max(axis=1, skipna=True)

    # if CIs missing entirely, fall back to point range
    grp["lo_all"] = grp["lo_all"].fillna(grp["lo_pt"])
    grp["hi_all"] = grp["hi_all"].fillna(grp["hi_pt"])

    # clean any remaining inversions
    bad = grp["lo_all"] > grp["hi_all"]
    if bad.any():
        swap_lo = grp.loc[bad, "lo_all"].copy()
        grp.loc[bad, "lo_all"] = grp.loc[bad, "hi_all"].values
        grp.loc[bad, "hi_all"] = swap_lo.values

    grp["cx"] = 0.5*(grp["lo_all"] + grp["hi_all"])
    grp["rx"] = (grp["hi_all"] - grp["lo_all"]).clip(lower=0) / 2.0

    # Join LLM mid & CI (one row per paper×metric in merged)
    L = (merged.loc[merged["metric"]==m,
                    ["paper","midpoint_llm","lower_llm","upper_llm","midpoint_human"]]
               .dropna(subset=["paper","midpoint_llm"]))
    L = _fix_ci_bounds(L, "lower_llm", "upper_llm")

    J = (grp.merge(L, on="paper", how="inner")
             .rename(columns={"midpoint_llm":"llm_mid", "midpoint_human":"hum_mid"}))

    # vertical radius from LLM CI half-width
    J["ry"] = (J["upper_llm"] - J["lower_llm"]).clip(lower=0) / 2.0
    J["cy"] = J["llm_mid"]
    J["cx"] = J["cx"].astype(float)
    J["cy"] = J["cy"].astype(float)

    ellipse_rows[m] = J[["paper","cx","cy","rx","ry","hum_mid","llm_mid","lo_all","hi_all"]].copy()

    # individual human dots (x) at the LLM y for that paper
    hum_points[m] = (H.merge(J[["paper","cy"]], on="paper", how="inner")
                       .rename(columns={"middle_rating":"human_x", "cy":"llm_y"}))

# -------- Global square axis range across all metrics --------
all_x = np.concatenate([df["cx"].to_numpy() for df in ellipse_rows.values() if len(df)])
all_y = np.concatenate([df["cy"].to_numpy() for df in ellipse_rows.values() if len(df)])
if all_x.size and all_y.size:
    lo = float(min(all_x.min(), all_y.min()))
    hi = float(max(all_x.max(), all_y.max()))
else:
    lo, hi = 0.0, 100.0
pad = 0.05 * (hi - lo if hi > lo else 1.0)
x0, x1 = lo - pad, hi + pad
y0, y1 = lo - pad, hi + pad

# -------- Build the Plotly figure --------
fig = go.Figure()
annots = []
trace_spans = []  # (start_idx, end_idx) per metric

for m in metrics:
    start_idx = len(fig.data)

    J = ellipse_rows[m]
    I = hum_points[m]

    # Draw each ellipse as a filled polygon (brand colors)
    for row in J.itertuples(index=False):
        ex, ey = _ellipse_xy(row.cx, row.cy, row.rx, row.ry, n=60)
        fig.add_trace(go.Scatter(
            x=ex, y=ey, mode="lines",
            line=dict(color=_hex_to_rgba_str(UJ_GREEN_D, 0.65), width=1),
            fill="toself", fillcolor=_hex_to_rgba_str(UJ_ORANGE, 0.12),
            hovertemplate=(
                "<b>%{text}</b><br>"
                "Human band: [%{customdata[0]:.1f}, %{customdata[1]:.1f}]<br>"
                "Center (Human, LLM) = (%{customdata[2]:.1f}, %{customdata[3]:.1f})<extra></extra>"
            ),
            text=[row.paper]*len(ex),
            customdata=np.column_stack([np.repeat(row.lo_all, len(ex)),
                                        np.repeat(row.hi_all, len(ex)),
                                        np.repeat(row.cx, len(ex)),
                                        np.repeat(row.cy, len(ex))]),
            showlegend=False,
            visible=False
        ))

    # Individual human dots
    fig.add_trace(go.Scatter(
        x=I["human_x"], y=I["llm_y"],
        mode="markers", name="Human raters",
        marker=dict(size=5, color=UJ_GREEN, opacity=0.50,
                    line=dict(width=0.5, color="rgba(50,50,50,0.25)")),
        text=I["paper"] + " — " + I["human_x"].round(1).astype(str),
        hovertemplate="<b>%{text}</b><br>LLM=%{y:.1f}<extra></extra>",
        showlegend=True, visible=False
    ))

    # Centers (orange diamonds)
    fig.add_trace(go.Scatter(
        x=J["hum_mid"], y=J["llm_mid"],
        mode="markers", name="Midpoints",
        marker=dict(size=9, color=UJ_ORANGE, symbol="diamond",
                    line=dict(color="rgba(50,50,50,0.35)", width=0.6)),
        text=J["paper"],
        hovertemplate="<b>%{text}</b><br>Human mid=%{x:.1f}<br>LLM mid=%{y:.1f}<extra></extra>",
        showlegend=True, visible=False
    ))

    # Single legend stub for ellipses
    fig.add_trace(go.Scatter(
        x=[None], y=[None], mode="lines", name="CI (human × LLM)",
        line=dict(color=_hex_to_rgba_str(UJ_GREEN_D, 0.70)), fill="toself",
        showlegend=True, visible=False
    ))

    # Correlations on midpoints for the caption
    if len(J) >= 2:
        r_p = pearsonr(J["hum_mid"], J["llm_mid"])[0]
        r_s = spearmanr(J["hum_mid"], J["llm_mid"])[0]
    else:
        r_p = np.nan; r_s = np.nan

    annots.append(dict(
        x=x0 + 0.05*(x1-x0), y=y1 - 0.05*(y1-y0), xref="x", yref="y",
        showarrow=False, font=dict(size=12, color="#222"),
        text=f"{m.replace('_',' ').title()} — Pearson r = {r_p:.2f}; Spearman ρ = {r_s:.2f}"
    ))
    trace_spans.append((start_idx, len(fig.data)-1))

# 45° equality line
fig.add_shape(type="line", x0=x0, y0=x0, x1=x1, y1=y1,
              line=dict(color="rgba(120,120,120,0.5)", dash="dot"))

# Dropdown: one entry per metric
buttons = []
total = len(fig.data)
for i, m in enumerate(metrics):
    vis = [False]*total
    s, e = trace_spans[i]
    for k in range(s, e+1):
        vis[k] = True
    buttons.append(dict(
        method="update", label=m.replace("_"," ").title(),
        args=[{"visible": vis},
              {"title": f"LLM vs Human — CI ellipses (union of human range/CI) — {m.replace('_',' ').title()}",
               "annotations": [annots[i]]}]
    ))

# Initialize first metric visible
init_vis = [False]*len(fig.data)
s0, e0 = trace_spans[0]
for k in range(s0, e0+1):
    init_vis[k] = True

fig.update_layout(
    template="plotly_white",
    title=f"LLM vs Human — CI ellipses (union of human range/CI) — {metrics[0].replace('_',' ').title()}",
    xaxis_title="Human (x)",
    yaxis_title="LLM (y)",
    xaxis=dict(range=[x0, x1]),
    yaxis=dict(range=[y0, y1], scaleanchor="x", scaleratio=1),

    # --- Legend below the plot ---
    legend=dict(
        orientation="h",
        yanchor="top", y=-0.16,          # push below the plotting area
        xanchor="center", x=0.50,        # centered
        bgcolor="rgba(0,0,0,0)"
    ),

    # Leave enough space at bottom for legend + dropdown
    margin=dict(l=60, r=20, t=70, b=160),

    # --- Dropdown below the legend ---
    updatemenus=[dict(
        type="dropdown",
        direction="up",
        showactive=True,
        buttons=buttons,
        x=0.50, xanchor="center",
        y=-0.30, yanchor="top",          # a bit further down than legend
        pad={"t": 2, "b": 2}
    )]
)

fig.update_traces(visible=False)
for k, v in enumerate(init_vis):
    fig.data[k].visible = v

fig
Figure 3.1: LLM vs Human with robust CI ellipses. The horizontal span covers the union of all human raters’ CIs and point ratings; the vertical span comes from the LLM CI. Centers at midpoints; the dropdown selects the metric.

For each paper (selected metric), the ellipse is centered vertically at the LLM midpoint and horizontally at the middle of the combined human band; the orange diamonds mark the (Human, LLM) midpoint pairs. The horizontal radius covers the union of all human evidence for that paper — combining individual raters’ point scores and their CIs — while the vertical radius reflects the LLM CI. This makes the ellipse wide enough to contain every human point horizontally (points that fall outside their own stated CI are noted in a separate diagnostic check).
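
As a minimal numeric sketch of the union‑band construction (made‑up rater values; column names mirror human_raw above):

import pandas as pd

# Three hypothetical raters for one paper on one metric
H = pd.DataFrame({
    "paper":         ["Doe 2024"] * 3,
    "middle_rating": [55, 62, 70],
    "lower_CI":      [50, 58, 60],
    "upper_CI":      [65, 68, 80],
})

band = H.groupby("paper").agg(lo_pt=("middle_rating", "min"),
                              hi_pt=("middle_rating", "max"),
                              lo_ci=("lower_CI", "min"),
                              hi_ci=("upper_CI", "max"))
lo_all = band[["lo_pt", "lo_ci"]].min(axis=1)          # 50
hi_all = band[["hi_pt", "hi_ci"]].max(axis=1)          # 80
cx = 0.5 * (lo_all + hi_all)                           # horizontal center: 65
rx = 0.5 * (hi_all - lo_all)                           # horizontal radius: 15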

Show code
from matplotlib.gridspec import GridSpec

def _plot_hist_for_metric(ax, df_long, metric, bins, title=None):
    g = df_long[df_long["metric"] == metric]
    ax.hist(g[g["rater"]=="LLM"]["score"],   bins=bins, alpha=0.55, label="LLM",
            color=UJ_ORANGE, edgecolor="white")
    ax.hist(g[g["rater"]=="Human"]["score"], bins=bins, alpha=0.55, label="Human",
            color=UJ_GREEN,  edgecolor="white")
    ax.set_title(title or metric.replace("_", " ").title())
    ax.set_xlabel("Score"); ax.set_ylabel("Count")

metrics_all = list(ratings_long["metric"].dropna().unique())
has_overall = "overall" in metrics_all
metrics_others = [m for m in sorted(metrics_all) if m != "overall"]

# bins logic
smin, smax = ratings_long["score"].min(), ratings_long["score"].max()
bins = np.arange(0, 101, 5) if (0 <= smin and smax <= 100) else np.linspace(smin, smax, 21)

if has_overall:
    cols = 3
    rows_other = math.ceil(len(metrics_others) / cols) or 1
    fig = plt.figure(figsize=(cols*4.6, 1.2*3.8 + rows_other*3.6))
    gs = GridSpec(nrows=rows_other+1, ncols=cols, figure=fig,
                  height_ratios=[1.3] + [1]*rows_other)

    # Big top row: Overall spans all columns
    ax0 = fig.add_subplot(gs[0, :])
    _plot_hist_for_metric(ax0, ratings_long, "overall", bins, title="Overall")

    # Bottom grid: remaining metrics
    for idx, m in enumerate(metrics_others):
        r = 1 + idx // cols
        c = idx % cols
        ax = fig.add_subplot(gs[r, c])
        _plot_hist_for_metric(ax, ratings_long, m, bins)

    # Turn off any empty slots
    total_slots = rows_other * cols
    for empty in range(len(metrics_others), total_slots):
        r = 1 + empty // cols
        c = empty % cols
        fig.add_subplot(gs[r, c]).axis("off")

    # Shared legend once (top center)
    handles, labels = ax0.get_legend_handles_labels()
    fig.legend(handles, labels, loc="upper center", ncol=2, frameon=False)
    plt.tight_layout(rect=(0,0,1,0.94))
else:
    # fallback to simple grid if no overall
    metrics = sorted(metrics_all)
    n = len(metrics); cols = 3; rows = math.ceil(n/cols)
    fig, axes = plt.subplots(rows, cols, figsize=(cols*4.6, rows*3.6), sharex=True)
    axes = np.atleast_2d(axes); axes_flat = axes.flat
    for ax, m in zip(axes_flat, metrics):
        _plot_hist_for_metric(ax, ratings_long, m, bins)
    for ax in list(axes_flat)[n:]:
        ax.axis("off")
    handles, labels = axes_flat[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc="upper center", ncol=2, frameon=False)
    plt.tight_layout(rect=(0,0,1,0.94))
Figure 3.2: Distributions by metric. Top: Overall (full width). Bottom: remaining metrics in a 2×3 grid.

Differences by paper and metric

For each (paper, metric) pair, Figure 3.3 plots the LLM − Human difference. Green means the LLM midpoint is lower than the human midpoint; orange means it is higher. A quick numeric companion to the stripe patterns described below follows the list.

  • Vertical stripes (by metric) suggest a metric‑specific bias (LLM systematically up/down on that metric).
  • Horizontal stripes (by paper) flag paper‑specific disagreement (LLM consistently above/below for that paper across metrics).
  • Isolated bright cells highlight individual outliers worth auditing qualitatively.
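
As a quick numeric check of those patterns (a sketch using the merged table defined earlier), the per‑metric and per‑paper means of the differences summarize the vertical and horizontal stripes respectively:

diff = merged["midpoint_llm"] - merged["midpoint_human"]

# Metric-specific bias: mean LLM − Human per metric (vertical stripes)
metric_bias = diff.groupby(merged["metric"]).mean().sort_values()

# Paper-specific disagreement: mean LLM − Human per paper (horizontal stripes)
paper_bias = diff.groupby(merged["paper"]).mean().sort_values()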
Show code
pair = pair_llm_human(merged)
pair["diff"] = pair["LLM"].astype(float) - pair["Human"].astype(float)

# Row order: sort by signed difference on the 'overall' metric (LLM−Human)
order_overall = (pair.loc[pair["metric"]=="overall"]
                   .groupby("paper")["diff"].mean()
                   .sort_values())  # ascending: LLM lower (green) at top, higher (orange) at bottom

hm = pair.pivot_table(index="paper", columns="metric", values="diff", aggfunc="mean")
# apply row order; append any papers lacking 'overall' at the bottom
ordered_rows = list(order_overall.index) + [p for p in hm.index if p not in order_overall.index]
hm = hm.loc[ordered_rows]

Z = hm.values.astype(float)

# Robust centered normalization (as before)
pos = Z[Z > 0]; neg = -Z[Z < 0]
p_hi = np.nanpercentile(pos, 97.5) if pos.size else 0.0
n_hi = np.nanpercentile(neg, 97.5) if neg.size else 0.0
span = max(p_hi, n_hi)
if not np.isfinite(span) or span == 0:
    span = float(np.nanmax(np.abs(Z))) if np.isfinite(np.nanmax(np.abs(Z))) else 1.0

norm = TwoSlopeNorm(vmin=-span, vcenter=0.0, vmax=span)

fig_hm, ax = plt.subplots(
    figsize=(max(6.5, 0.8*hm.shape[1]), max(6.0, 0.40*hm.shape[0]))
)
im = ax.imshow(Z, cmap=UJ_DIV, norm=norm, aspect="auto", interpolation="nearest")

ax.set_xticks(np.arange(hm.shape[1]), labels=[c.replace("_", " ").title() for c in hm.columns], rotation=40, ha="right")
ax.set_yticks(np.arange(hm.shape[0]), labels=hm.index.tolist())

ax.set_xticks(np.arange(-0.5, hm.shape[1], 1), minor=True)
ax.set_yticks(np.arange(-0.5, hm.shape[0], 1), minor=True)
ax.grid(which="minor", color="white", linewidth=0.6)
ax.tick_params(which="minor", bottom=False, left=False)

cb = plt.colorbar(im, ax=ax, fraction=0.03, pad=0.02)
cb.set_label("LLM − Human (points)")
tick_vals = np.linspace(-span, span, 5)
cb.set_ticks(tick_vals); cb.set_ticklabels([f"{t:.1f}" for t in tick_vals])

ax.set_title("Heatmap: LLM − Human differences by paper × metric (sorted by overall difference)")
plt.tight_layout()
Figure 3.3: LLM − Human difference by paper × metric (robust, centered scale; rows sorted by signed overall difference).

Agreement metrics

Table 3.1 summarizes agreement. Pearson (r) captures linear alignment of levels; Spearman (ρ) captures rank agreement (a monotonic relationship) and is robust to scale differences. We report the mean and median of (LLM − Human) to summarize the average shift and the central tendency of the differences, along with Cohen’s κ. Because scores are continuous, we first discretize both LLM and Human ratings using shared quantile cutpoints (5 bins), then compute an unweighted κ (exact category agreement beyond chance) and a quadratic‑weighted κ, which penalizes larger disagreements more than smaller ones.
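
To illustrate the binning‑plus‑κ step on toy data (a sketch assuming the bin_together and weighted_kappa helpers defined above): identical score vectors give κ = 1, while adding noise lowers both statistics, with adjacent‑bin disagreements hurting the quadratic‑weighted κ less than the unweighted one.

import numpy as np

a = np.array([20.0, 35.0, 50.0, 65.0, 80.0])                 # hypothetical LLM midpoints
b_perfect = a.copy()                                          # identical human midpoints
b_noisy   = a + np.array([5.0, -8.0, 12.0, -4.0, 30.0])       # perturbed human midpoints

# Perfect agreement: every pair falls in the same bin, so κ = 1.0
ai, bi, k = bin_together(a, b_perfect, n_bins=5)
print(weighted_kappa(ai, bi, k=k, weights="quadratic"))       # 1.0

# Noisy agreement: κ drops below 1; compare unweighted vs quadratic
ai, bi, k = bin_together(a, b_noisy, n_bins=5)
print(weighted_kappa(ai, bi, k=k, weights=None),
      weighted_kappa(ai, bi, k=k, weights="quadratic"))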

As another indicator of agreement, we check whether each side’s credible interval contains the other’s point estimate — for example, if a human CI is [55, 75] and the LLM midpoint is 68, then llm_in_human is True for that (paper, metric). Table 3.3 shows the coverage shares by rating criterion.

Table 3.2 shows “calibration” results based on a simple linear regression of human on LLM midpoints; Figure 3.4 plots the width of the human raters’ CI against the absolute difference in ratings; and Table 3.4 lists the papers with the largest absolute difference between LLM and human ratings.
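
As a worked example of the calibration in Table 3.2, using the overall‑metric fit (Human ≈ 47.94 + 0.357·LLM): applying the fit shrinks LLM scores toward the human center.

slope, intercept = 0.357, 47.941            # overall-metric fit from Table 3.2
for llm_score in (70, 90):
    print(llm_score, round(intercept + slope * llm_score, 1))   # ≈ 72.9 and ≈ 80.1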

Show code
agree_cat = agreement_summary(merged, n_bins=5, bin_strategy="quantile")

ov = overall_stats(merged, n_bins=5, bin_strategy="quantile")


agree_view = (agree_cat
              .rename(columns={"kappa_unw":"kappa_unwtd","kappa_quad":"kappa_quad"})
              .loc[:, ["metric","n","pearson","spearman","mean_bias","median_bias","kappa_unwtd","kappa_quad"]]
              .sort_values("metric"))
agree_view = agree_view.round({"pearson":3, "spearman":3, "mean_bias":2, "median_bias":2, "kappa_unwtd":3, "kappa_quad":3})
agree_view
Table 3.1: Agreement metrics by metric (Pearson, Spearman, mean/median bias, κ unweighted/weighted).
metric n pearson spearman mean_bias median_bias kappa_unwtd kappa_quad
0 advancing_knowledge 27 0.198 0.259 7.12 6.0 0.052 0.257
1 claims_evidence 6 0.410 0.471 -3.75 -3.0 -0.154 0.400
2 global_relevance 28 0.135 0.109 8.21 8.5 -0.040 0.070
3 logic_communication 28 0.001 0.130 10.01 9.0 -0.029 0.041
4 methods 27 0.309 0.403 6.60 5.5 0.148 0.288
5 open_science 28 0.138 0.103 -6.01 -7.0 0.009 0.137
6 overall 28 0.280 0.309 1.64 1.0 0.125 0.374
Show code
from scipy.stats import linregress

def calibrate_per_metric(merged_df):
    P = pair_llm_human(merged_df)
    rows = []
    for m, g in P.groupby("metric"):
        x, y = g["LLM"].to_numpy(), g["Human"].to_numpy()
        if len(g) >= 2:
            lr = linregress(x, y)  # slope, intercept, rvalue, pvalue, stderr
            yhat = lr.slope * x + lr.intercept
            mad_pre  = float(np.mean(np.abs(x - y)))
            mad_post = float(np.mean(np.abs(yhat - y)))
            rows.append(dict(
                metric=m,
                n=len(g),
                slope=lr.slope, intercept=lr.intercept,
                r2=lr.rvalue**2,
                MAD_pre=mad_pre, MAD_post=mad_post,
                MAD_delta=mad_pre - mad_post
            ))
        else:
            rows.append(dict(metric=m, n=len(g),
                             slope=np.nan, intercept=np.nan, r2=np.nan,
                             MAD_pre=np.nan, MAD_post=np.nan, MAD_delta=np.nan))
    out = pd.DataFrame(rows).sort_values("metric").reset_index(drop=True)
    return out

calib = calibrate_per_metric(merged).copy()
calib_rounded = calib.copy()
for c in ["slope","intercept","r2","MAD_pre","MAD_post","MAD_delta"]:
    calib_rounded[c] = calib_rounded[c].round(3)
calib_rounded
Table 3.2: Calibration of LLM to Human by metric (Human ≈ intercept + slope·LLM). Includes pre-/post- MAD.
metric n slope intercept r2 MAD_pre MAD_post MAD_delta
0 advancing_knowledge 27 0.284 50.232 0.039 11.216 8.812 2.404
1 claims_evidence 6 0.220 61.234 0.168 11.250 5.859 5.391
2 global_relevance 28 0.255 53.573 0.018 14.589 12.000 2.589
3 logic_communication 28 0.002 72.823 0.000 13.464 9.802 3.663
4 methods 27 0.404 39.774 0.096 12.099 9.427 2.672
5 open_science 28 0.174 57.509 0.019 17.976 13.346 4.631
6 overall 28 0.357 47.941 0.078 9.149 7.837 1.312
Show code
cov = (merged.dropna(subset=["lower_llm","upper_llm","lower_human","upper_human",
                             "midpoint_llm","midpoint_human"])
              .copy())

cov["human_in_llm"] = (cov["midpoint_human"] >= cov["lower_llm"]) & (cov["midpoint_human"] <= cov["upper_llm"])
cov["llm_in_human"] = (cov["midpoint_llm"]   >= cov["lower_human"]) & (cov["midpoint_llm"]   <= cov["upper_human"])
cov["both_cover"]   = cov["human_in_llm"] & cov["llm_in_human"]

coverage_tbl = (cov.groupby("metric", as_index=False)
                  .agg(N=("paper","size"),
                       human_in_llm=("human_in_llm","mean"),
                       llm_in_human=("llm_in_human","mean"),
                       both_cover=("both_cover","mean")))
coverage_tbl[["human_in_llm","llm_in_human","both_cover"]] = \
    coverage_tbl[["human_in_llm","llm_in_human","both_cover"]].round(2)

# Overall coverage (glue)
overall_cov = dict(
    N=int(len(cov)),
    human_in_llm=float(cov["human_in_llm"].mean()),
    llm_in_human=float(cov["llm_in_human"].mean()),
    both=float(cov["both_cover"].mean())
)
# glue("cov_both", overall_cov["both"])
# glue("cov_hinl", overall_cov["human_in_llm"])
# glue("cov_linh", overall_cov["llm_in_human"])

coverage_tbl
Table 3.3: CI coverage: does one interval contain the other’s point estimate?
metric N human_in_llm llm_in_human both_cover
0 advancing_knowledge 24 0.67 0.62 0.54
1 claims_evidence 6 0.67 0.67 0.67
2 global_relevance 25 0.36 0.44 0.28
3 logic_communication 25 0.36 0.60 0.36
4 methods 24 0.62 0.54 0.50
5 open_science 25 0.48 0.40 0.28
6 overall 25 0.60 0.60 0.52
Show code
use = merged.dropna(subset=["lower_human","upper_human","midpoint_llm","midpoint_human"]).copy()
use["human_width"] = use["upper_human"] - use["lower_human"]
use["abs_diff"]    = (use["midpoint_llm"] - use["midpoint_human"]).abs()

rho, pval = (np.nan, np.nan)
if len(use) >= 3:
    rho, pval = spearmanr(use["human_width"], use["abs_diff"])

fig, ax = plt.subplots(figsize=(6.2, 4.6))
ax.scatter(use["human_width"], use["abs_diff"], s=28, alpha=.75, color=UJ_ORANGE, edgecolor="white")
ax.set_xlabel("Human CI width")
ax.set_ylabel("|LLM − Human|")
ax.set_title(f"uncertainty vs disagreement  (Spearman ρ = {rho:.2f}, p = {pval:.3f})")
ax.grid(True, alpha=.25)
plt.tight_layout()
Figure 3.4: Human uncertainty (CI width) vs |LLM − Human|. Spearman correlation reported.
Show code
out = merged.copy()
out["abs_diff"] = (out["midpoint_llm"] - out["midpoint_human"]).abs()
out["human_in_llm"] = (out["midpoint_human"] >= out["lower_llm"]) & (out["midpoint_human"] <= out["upper_llm"])
out["llm_in_human"] = (out["midpoint_llm"]   >= out["lower_human"]) & (out["midpoint_llm"]   <= out["upper_human"])
cols = ["paper","metric","midpoint_llm","midpoint_human","abs_diff","human_in_llm","llm_in_human"]
outliers = (out.sort_values("abs_diff", ascending=False)
              .loc[:, cols]
              .head(15))
outliers
Table 3.4: Top absolute differences (LLM vs Human). Flags show whether each point lies within the other’s CI.
paper metric midpoint_llm midpoint_human abs_diff human_in_llm llm_in_human
23 Alcott et al. 2024 global_relevance 78 16.00 62.00 False False
49 Bettle 2023 methods 68 10.00 58.00 False False
20 Alcott et al. 2024 advancing_knowledge 90 32.50 57.50 False False
63 Bruers 2021 open_science 30 80.00 50.00 False False
51 Bettle 2023 logic_communication 78 30.00 48.00 False False
4 Aghion et al. 2017 open_science 40 87.50 47.50 False False
52 Bettle 2023 open_science 55 10.00 45.00 False False
139 Kremer et al. 2022 global_relevance 95 51.25 43.75 False False
82 Chuard et al. 2022 open_science 55 96.00 41.00 False False
48 Bettle 2023 overall 68 32.50 35.50 False False
75 Carson et al. 2023 logic_communication 85 50.00 35.00 False False
39 Barberio et al. 2022 logic_communication 80 45.00 35.00 False False
61 Bruers 2021 advancing_knowledge 55 90.00 35.00 False False
58 Bruers 2021 overall 48 80.00 32.00 False False
3 Aghion et al. 2017 logic_communication 88 57.50 30.50 False False