library("tidyverse")library("jsonlite")library("knitr")library("kableExtra")library("DT")library("reticulate")# Theme colors (matching results_ratings.qmd)UJ_ORANGE <-"#f19e4b"UJ_GREEN <-"#99bb66"UJ_BLUE <-"#4e79a7"# Severity colors for badgesSEV_NECESSARY <-"#e74c3c"# RedSEV_OPTIONAL <-"#f39c12"# OrangeSEV_UNSURE <-"#95a5a6"# Graytheme_uj <-function(base_size =11) {theme_minimal(base_size = base_size) +theme(panel.grid.minor =element_blank(),plot.title.position ="plot",legend.position ="bottom" )}# Function to create severity badge HTMLseverity_badge <-function(severity) { sev <-tolower(trimws(severity))if (grepl("necessary", sev)) {return('<span style="background-color:#e74c3c;color:white;padding:2px 6px;border-radius:3px;font-size:0.8em;">🔴 Necessary</span>') } elseif (grepl("optional", sev)) {return('<span style="background-color:#f39c12;color:white;padding:2px 6px;border-radius:3px;font-size:0.8em;">🟠 Optional</span>') } elseif (sev !="") {return('<span style="background-color:#95a5a6;color:white;padding:2px 6px;border-radius:3px;font-size:0.8em;">⚪ Unsure</span>') }return('<span style="background-color:#bdc3c7;color:white;padding:2px 6px;border-radius:3px;font-size:0.8em;">—</span>')}# Function to truncate text for table displaytruncate_text <-function(text, max_chars =80) {if (nchar(text) > max_chars) {paste0(substr(text, 1, max_chars), "...") } else { text }}
This chapter compares qualitative critiques: the key methodological and interpretive issues identified by GPT-5.2 Pro are set against the human expert critiques documented by Unjournal evaluation managers in Coda.
We assess alignment using LLM-based comparison: GPT-5.2 Pro evaluates coverage (what proportion of human concerns GPT identified) and precision (whether GPT issues are substantive). See LLM-Based Assessment for aggregate metrics.
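To make this concrete, the sketch below shows the approximate shape of one per-paper comparison record that each section of this chapter is rendered from. The top-level keys (gpt_paper, comparison) match the loading code later in this chapter; the nested field names are illustrative assumptions inferred from the rendered output (coverage, precision, rating, summary, missed and extra issues), not the exact schema of the results file.

# Illustrative sketch only: assumed shape of one per-paper comparison record.
# Nested field names are hypothetical, inferred from the rendered sections below.
example_record = {
    "gpt_paper": "Adena_and_Hager_2024",
    "comparison": {
        "coverage": 0.68,    # share of human concerns GPT also raised
        "precision": 0.92,   # share of GPT issues judged substantive
        "rating": "Good",    # overall alignment rating
        "summary": "GPT captures most of the core substantive concerns ...",
        "missed_issues": ["Pre-registration deviations ...", "..."],
        "extra_gpt_issues": ["Platform delivery endogeneity ...", "..."],
    },
}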
Data Sources
GPT-5.2 Pro Key Issues: Structured output from the focal evaluation run (January 2026), where the model was prompted to identify 5-15 key issues per paper, ordered from most to least important.
Human Expert Critiques: Curated content from the “Key critiques & issues with paper” column in The Unjournal’s internal tracking database (Coda), written by evaluation managers synthesizing evaluator feedback. These use severity labels (“Necessary”, “Optional but important”, “Unsure”) and cite specific evaluator comments.
import json
import re
from pathlib import Path

# Severity parsing regex patterns
SEVERITY_HEADER_RE = re.compile(
    r"^(necessary|optional|optional but important|unsure|less important|possibly relevant|"
    r"probably important|probably correct|probably|note|notes|optional/desirable|"
    r"probably useful|maybe|possibly|unsure if correct|limitations)(\s|:|$)",
    re.IGNORECASE,
)
ENUM_RE = re.compile(r"^\s*(\d+)[\.\)]\s+")
EVALUATOR_RE = re.compile(r"^\[?(E\d+|Evaluator|DR|NotebookLM)\]?[\s:\[]", re.IGNORECASE)


def normalize_severity(header):
    h = header.lower()
    if "necessary" in h:
        return "necessary"
    if "optional" in h:
        return "optional"
    if any(word in h for word in ["unsure", "possibly", "probably", "less important", "maybe", "limitations"]):
        return "unsure"
    return ""


def extract_short_label(text, max_words=12):
    """Extract a descriptive short label from issue text."""
    original = text.strip()

    # Meta-commentary patterns to skip
    META_SKIP_PATTERNS = [
        r'^I chose',
        r'^I selected',
        r'^I think',
        r'^I would',
        r'^I believe',
        r'^These are',
        r'^This is (a|the|my)',
        r'^Note[s]?\s*[:)]',
        r'^DR\s*[:)]',
        r'^NotebookLM\s*[:)]',
        r'^The following',
        r'^Summary\s*[:)]',
        r'^Quote from',
        r'^See also',
        r'^As noted',
        r'^and that',  # Fragments starting with conjunctions
        r'^and it',
        r"^and that it's",
        r'^or that',
        r'^but that',
        r'^but it',
        r'^which ',
        r'^that ',
        r'^to be ',
        r'^it is ',
        r'^it would ',
    ]

    # First clean: Remove leading evaluator tags (DR —, [E1], etc.)
    cleaned_for_meta_check = re.sub(
        r'^[\s\[]*(?:E\d+|Evaluator\s*\d*|DR|NotebookLM|Treich|Triech|Capitan|Reiley|Manheim|Espinosa|Seinkmane|Cloud|Tsai)[\]:\s—\-]+',
        '', original, flags=re.IGNORECASE
    ).strip()

    # Check for meta-commentary patterns AFTER removing evaluator prefix
    for pattern in META_SKIP_PATTERNS:
        if re.match(pattern, cleaned_for_meta_check, re.IGNORECASE):
            # Try to find actual content after the meta-commentary
            # Look for a quoted phrase or substantive content after colon/period
            quote_in_meta = re.search(r'["""\']([^"""\'\n]{15,80})["""\']', cleaned_for_meta_check)
            if quote_in_meta:
                quoted = quote_in_meta.group(1).strip()
                if not re.match(r'^(the |a |an |this )', quoted.lower()):
                    return quoted
            # Look for content after a colon
            after_meta = re.split(r'[:\n]', cleaned_for_meta_check, maxsplit=1)
            if len(after_meta) > 1 and len(after_meta[1].strip()) > 20:
                original = after_meta[1].strip()
                break
            else:
                # Return a generic placeholder if no content found
                return "[Meta-commentary - see full text]"

    # FIRST: Check if text starts with "Label [Evaluator] more text" pattern
    # Extract just the label part before the evaluator tag
    # Match common evaluator patterns: [E1], [Treich], [Triech], [Reiley], [Capitan], etc.
    label_before_eval = re.match(r'^([A-Z][^[\n]{10,80}?)\s*\[(?:E\d+|[A-Z][a-z]+)\]', original)
    if label_before_eval:
        label = label_before_eval.group(1).strip()
        label = re.sub(r'[\s:,;—\-]+$', '', label)
        if 3 <= len(label.split()) <= max_words:
            return label

    # Also check for "Label E2: more text" pattern (evaluator without brackets)
    label_before_eval2 = re.match(r'^([A-Z][^\n]{10,80}?)\s+(?:E\d+|DR)\s*:', original)
    if label_before_eval2:
        label = label_before_eval2.group(1).strip()
        label = re.sub(r'[\s:,;—\-]+$', '', label)
        if 3 <= len(label.split()) <= max_words:
            return label

    # Remove evaluator tags, names, and quotes at start
    clean = re.sub(
        r'^\[?(?:E\d+|Evaluator\s*\d*|DR|NotebookLM|Treich|Triech|Capitan|Reiley|Manheim|Espinosa|Seinkmane|Cloud|Tsai)\]?[\s:\[]*',
        '', original, flags=re.IGNORECASE
    )
    clean = re.sub(r'^[""\'\[\(]', '', clean)
    clean = re.sub(r'^Author[\'s]*\s*response:?\s*', '', clean, flags=re.IGNORECASE)

    # Skip if remaining text starts with a name (single capitalized word followed by colon)
    name_check = re.match(r'^([A-Z][a-z]+)\s*:', clean)
    if name_check and len(name_check.group(1)) < 15:
        clean = clean[name_check.end():].strip()

    # Look for quoted phrases that are key terms (handle both straight and curly quotes)
    quote_match = re.search(r'["""\']([^"""\'\n]{15,80})["""\']', clean[:250])
    if quote_match:
        quoted = quote_match.group(1).strip()
        if not re.match(r'^(the |a |an |this )', quoted.lower()):
            return quoted

    # Look for "Label: description" or "Label — description" patterns
    # Use (?<!\w) to avoid matching hyphens in compound words like "pre-registered"
    label_match = re.match(r'^([^:\n]{10,80}?)(?:\s*[:\—]\s*|\s+—\s+)', clean)
    if label_match:
        label = label_match.group(1).strip()
        label = re.sub(r'[""\']+$', '', label)
        # Skip if it ends with a preposition or conjunction (incomplete phrase)
        if not re.search(r'\s+(or|and|of|for|to|in|on|by|with|the|a|an)$', label.lower()):
            if not re.match(r'^[A-Z][a-z]+$', label) and len(label.split()) <= max_words:
                return label

    # Look for descriptive noun phrases (allow hyphens in words)
    patterns = [
        r'((?:Lack of |Missing |Unclear |Weak |Strong |Potential )?[A-Z][a-z]+(?:[-\s]+[a-z-]+){1,5}(?:\s+(?:effects?|bias|issues?|concerns?|limitations?|assumptions?|analysis|validity|problems?|risks?|errors?|size|needs?|checking)))',
        r'((?:Small |Large |Limited |Significant )[a-z-]+(?:\s+[a-z-]+){0,5})',
        r'([A-Z][a-z]+\s+(?:sample|data|model|method|approach|design|analysis|claims?|effects?)(?:\s+[a-z-]+){0,3})',
        r'(Consider\s+[a-z-]+(?:[-\s]+[a-z-]+){0,5})',
        r'(Deviations?\s+from\s+(?:the\s+)?[a-z-]+(?:[-\s]+[a-z-]+){0,5})',
    ]
    for pattern in patterns:
        match = re.search(pattern, clean[:250])
        if match:
            label = match.group(1).strip()
            # Remove trailing preposition phrases like "or degrees of", "and more", etc.
            while True:
                new_label = re.sub(r'\s+(or|and|with|for|to|of|in|on|by|including)(\s+\w+)*\s*$', '', label)
                if new_label == label:
                    break
                label = new_label
            if 10 < len(label) < 120 and len(label.split()) <= max_words:
                return label

    # Take first sentence or clause
    first_clause = re.split(r'(?<=[.!?])\s+|\n', clean)[0].strip()
    first_clause = re.sub(r'^(The |A |An )', '', first_clause)
    first_clause = re.sub(r'\s+(including|or|and|with|for|to|of|in|on|by)\s*$', '', first_clause)
    # Remove evaluator tags from middle/end of text
    first_clause = re.sub(r'\s*\[(?:E\d+|[A-Z][a-z]+)\]\s*', ' ', first_clause)
    first_clause = re.sub(r'\s+(?:E\d+|DR)\s*:.*$', '', first_clause)
    first_clause = re.sub(r'\s+', ' ', first_clause).strip()
    words = first_clause.split()
    if 3 <= len(words) <= max_words:
        return first_clause
    elif len(words) > max_words:
        text = ' '.join(words[:max_words])
        last_break = max(text.rfind(' or '), text.rfind(' and '), text.rfind(', '), text.rfind(' — '))
        if last_break > len(text) // 2:
            return text[:last_break].strip(' ,—')
        return text + '...'

    # Fallback
    key_terms = re.findall(r'[A-Z][a-z]+(?:\s+[a-z]+){0,2}', clean[:150])
    if key_terms:
        return ' / '.join(key_terms[:3])
    words = clean.split()[:max_words]
    return ' '.join(words) + ('...' if len(clean.split()) > max_words else '')


def extract_llm_label(text, max_words=10):
    """Extract a short label from LLM issue text."""
    # Clean up numbering
    clean = re.sub(r'^[\d\.\)]+\s*', '', text.strip())
    # LLM issues often have format "Topic: detailed explanation"
    colon_match = re.match(r'^([^:]{8,60}):\s*', clean)
    if colon_match:
        label = colon_match.group(1).strip()
        # Make sure it's descriptive enough
        if len(label.split()) >= 2:
            return label
    # Take first clause
    first_part = re.split(r'[;.]', clean)[0].strip()
    words = first_part.split()
    if len(words) <= max_words:
        return first_part
    return ' '.join(words[:max_words]) + '...'


def split_human_issues(text):
    """Parse human critique text into individual issues with severity labels."""
    if not text:
        return []
    issues = []
    current_severity = ""
    current_issue_lines = []

    def flush_issue():
        nonlocal current_issue_lines
        if not current_issue_lines:
            return
        full_text = "\n".join(current_issue_lines).strip()
        # Clean leading punctuation/numbering
        cleaned = re.sub(r"^[\-\*\d\.\)\s]+", "", full_text).strip()
        if not cleaned or len(cleaned) < 10:
            current_issue_lines = []
            return
        # Skip if it's just a severity label
        if SEVERITY_HEADER_RE.match(cleaned):
            current_issue_lines = []
            return
        short_label = extract_short_label(cleaned)
        # Skip if this is meta-commentary (the label extraction will return placeholder)
        if short_label == "[Meta-commentary - see full text]":
            current_issue_lines = []
            return
        issues.append({
            "text": cleaned,
            "severity": current_severity,
            "short_label": short_label
        })
        current_issue_lines = []

    lines = text.splitlines()
    i = 0
    while i < len(lines):
        line = lines[i].rstrip()
        raw = line.strip()

        # Check for severity header
        if raw and SEVERITY_HEADER_RE.match(raw):
            flush_issue()
            current_severity = normalize_severity(raw.split(':')[0].split()[0])
            i += 1
            continue

        # Check for numbered issue start
        if ENUM_RE.match(raw):
            flush_issue()
            # Remove the number prefix
            issue_text = re.sub(r"^\s*\d+[\.\)]\s*", "", raw)
            current_issue_lines = [issue_text] if issue_text else []
            i += 1
            continue

        # Check for issue label line (short line followed by longer explanation)
        # e.g., "Ceiling effects with Likert scale measures"
        if raw and not EVALUATOR_RE.match(raw) and len(raw) < 80:
            # Look ahead - if next non-empty line starts with [E or is indented, this is a label
            next_idx = i + 1
            while next_idx < len(lines) and not lines[next_idx].strip():
                next_idx += 1
            if next_idx < len(lines):
                next_line = lines[next_idx].strip()
                if (EVALUATOR_RE.match(next_line) or lines[next_idx].startswith(' ') or next_line.startswith('"') or next_line.startswith("'")):
                    # This short line is likely a label for a new issue
                    flush_issue()
                    current_issue_lines = [raw]
                    i += 1
                    continue

        # Empty line - might signal new issue in some formats
        if not raw:
            # Don't flush on every empty line - only if we have significant content
            if current_issue_lines and len("\n".join(current_issue_lines)) > 100:
                # Check if next content looks like new issue
                next_idx = i + 1
                while next_idx < len(lines) and not lines[next_idx].strip():
                    next_idx += 1
                if next_idx < len(lines):
                    next_line = lines[next_idx].strip()
                    if ENUM_RE.match(next_line) or (len(next_line) < 80 and not EVALUATOR_RE.match(next_line)):
                        flush_issue()
            i += 1
            continue

        # Regular content line - add to current issue
        if current_issue_lines:
            current_issue_lines.append(raw)
        else:
            current_issue_lines = [raw]
        i += 1

    flush_issue()
    return issues


# Load comparison data and results
comparison_file = Path("results/key_issues_comparison.json")
results_file = Path("results/key_issues_comparison_results.json")

parsed_issues = {}
comparison_results = {}

if comparison_file.exists():
    comparison = json.loads(comparison_file.read_text())
    for item in comparison:
        paper_id = item.get("gpt_paper", "")
        coda_critique = item.get("coda_critique", "")
        parsed_issues[paper_id] = split_human_issues(coda_critique)

if results_file.exists():
    results = json.loads(results_file.read_text())
    for r in results:
        paper_id = r.get("gpt_paper", "")
        comparison_results[paper_id] = r.get("comparison", {})
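The aggregate metrics referenced above (see LLM-Based Assessment) can be pooled from comparison_results. Below is a minimal sketch, assuming each nested comparison dict exposes numeric coverage and precision values on a 0-1 scale; this is an assumption about the stored schema, not the report's actual aggregation code.

# Minimal aggregation sketch: mean coverage and precision across papers.
# Assumes numeric "coverage"/"precision" keys in each comparison dict; adjust
# to the actual schema of key_issues_comparison_results.json if it differs.
def mean_metric(results, key):
    values = [c.get(key) for c in results.values() if isinstance(c.get(key), (int, float))]
    return sum(values) / len(values) if values else None

print(
    f"Papers with comparisons: {len(comparison_results)} | "
    f"mean coverage: {mean_metric(comparison_results, 'coverage')} | "
    f"mean precision: {mean_metric(comparison_results, 'precision')}"
)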
We matched 14 papers with both GPT-5.2 Pro key issues and human expert critiques.
Coda title: Misperceptions and Demand for Democracy under Authoritarianism
Coverage: 60% | Precision: 90% | Rating: Moderate
LLM Assessment Summary
GPT captures two central human concerns—spillovers and over-interpretation/external-validity limits of the ‘demand for democracy’ framing—but misses the experts’ most technical points about sensitivity to a key control and the preferred pre–post specification in the lab.
Detailed Comparison Notes
The largest gap is that the human critiques emphasize a specific, high-leverage robustness/power issue (SE halves when controlling for prior election results), whereas GPT focuses more on identification/interpretation threats (SUTVA, exclusion restrictions, partisanship) and reporting/analysis hygiene (placebo content, multiple testing). Those extra GPT points are mostly plausible and substantive but are not the same priorities raised by the experts.
⚠️ Issues Human Experts Raised That GPT Missed (2)
Sensitivity to controls/power: the field ITT estimate’s standard error and statistical significance depend heavily on including the prior election result as a covariate (Table 2 cols 1–2). This dependence should be highlighted and discussed as a key robustness/power concern.
Lab experiment specification: experts ask why the authors do not use pre–post (change-score) outcomes (difference between post- and pre-treatment) rather than only post-treatment values.
📋 Extra Issues GPT Identified (9 not in human critique)
Epistemic/measurement critique of labeling beliefs as “misperceptions” using V-Dem as ground truth
Concern that treatments communicate strong causal claims (democracy → fewer disaster deaths; media independence → less corruption) based on largely observational evidence, potentially conflating persuasion/valence with information correction
Partisan entanglement of field implementation (opposition involvement/branding) undermining a ‘non-partisan information’ interpretation
Data integrity/measurement concerns about administrative turnout values exceeding 1 and documentation inconsistencies
Placebo arm not “clean” because it still contains substantive propositions, complicating interpretation
Multiple comparisons/median splits in heterogeneity analyses and need for adjustment or clearer confirmatory vs exploratory framing
Persistence interpretation using the 2024 municipal election needing stronger discussion of alternative explanations (later campaigns, mapping stability, administrative changes)
Clarity issues distinguishing ITT vs compliance-adjusted effects and avoiding over-claiming about ‘accurate information campaigns’
📄 View full human critique
Optional but important

Sensitivity to controls: [Eval. manager, also relates to E1 critique] The field experiment's power depends strongly on including the value of the prior election in the regression analyses. In Table 2 columns 1-2, the standard error declines by half when the prior election result is included in the intention-to-treat (ITT) estimate for the field experiment. Without including it, the estimate is too noisy to be considered "statistically significant." [This should be noticed and emphasized.]

Spillovers attenuate estimates (~lower bounds): [E2] "adjacent untreated neighborhoods may have been indirectly exposed to treatment messages via word-of-mouth or sharing of pamphlets. If spillovers occurred, the estimated treatment effects could be attenuated or biased."

Unsure

"These framings are substantively compelling and contextually salient in Turkey, but they do not isolate support for democracy or media freedom as normative commitments." "Demand for democracy" wording is too strong; their results may reflect the specific salience of (natural disasters) and corruption, particularly in light of a recent earthquake with widespread reports of harm linked to corruption. This may limit the external generalizability.

DR: Unsure because I think the authors are fairly cautious in their statements, and their interpretation seems to be largely about understanding the instrumental link between democracy and good outcomes.

E1: "why don't the authors use the difference between post- and pre-treatment values instead of only the post-treatment values" [in the lab experiment]?
🤖 View all GPT key issues (12)
1. The paper operationalizes “misperceptions” by comparing respondents’ beliefs to V-Dem indices treated as ground truth; V-Dem is expert-coded (not citizens’ lived experience) and not “official data,” so the epistemic basis for labeling beliefs as misperceived needs more nuance.
2. Treatment scripts/pamphlets present causal claims (democracy → fewer disaster deaths; media independence → less corruption) as near-laws, but the cited evidence is largely observational/context-dependent; this risks conflating persuasion by normative/valence messaging with “accurate information correction.”
3. Field implementation is intertwined with opposition parties and includes explicitly partisan branding in some arms (especially the basic treatment), complicating interpretation that effects reflect non-partisan information rather than standard partisan persuasion.
4. The main field estimand is a 2SLS effect of “completed conversations” instrumented by assignment; exclusion restrictions are not fully defended because assignment also changes pamphlet exposure, canvasser presence/intensity, and possibly other neighborhood-level channels.
5. Potential interference/spillovers across neighborhoods (social discussion, adjacency, shared spaces) are plausible in a city setting; the design and analysis largely assume SUTVA without bounding or testing spillovers.
6. External validity is limited: the field experiment is in a single city/province with selective neighborhood inclusion, and the online sample is Facebook-ad recruited with non-representative composition; generalization to national electorates or different authoritarian contexts is uncertain.
7. Administrative turnout values exceeding 1 in some ballot boxes (max >1) and documentation inconsistencies about the online “turnout” variable require explicit reconciliation to ensure data integrity and interpretability.
8. The placebo messages still contain substantive propositions (“some people think … has worsened”), so the placebo may not cleanly isolate experimenter demand; this could attenuate differences and complicate interpretation.
9. Heterogeneity analyses rely heavily on median splits and multiple subgroup comparisons; if not fully pre-registered, multiple-testing adjustments or a clearer separation of confirmatory vs exploratory analyses are needed.
10. Persistence claims using the 2024 municipal election are suggestive but need stronger discussion of alternative explanations (subsequent campaigns, compositional/administrative changes, ballot-box mapping stability across elections).
11. The paper sometimes slides from compliance-adjusted per-contact effects to broad claims about the impact of “accurate information campaigns”; clearer separation of reduced-form assignment effects, LATE, and mechanism interpretation would improve causal communication.
12. The “non-partisan approach” narrative is somewhat inconsistent with the realized partisan delivery; clarifying what was attempted vs what was implemented would improve transparency.
Adena_and_Hager_2024
Coda title: Does online fundraising increase charitable giving? A nationwide field experiment on Facebook
Coverage: 68% | Precision: 92% | Rating: Good
LLM Assessment Summary
GPT captures most of the core substantive concerns (winsorization/ROI sensitivity, spillovers/interference, window choice, competitor effects limits, and multiplicity in HTEs), but it misses several expert-emphasized points about pre-registration deviations, confidence intervals/uncertainty communication, and specific shortcomings in HTE and spillover inference.
Detailed Comparison Notes
The biggest gap is that human critiques focus heavily on (i) adherence to the pre-analysis plan (including the unimplemented causal-trees HTE plan), and (ii) inference/communication issues (confidence intervals, joint tests for HTEs, and interpretability of negative subgroup effects). GPT’s spillover critique matches the theme (interference) but not the experts’ emphasis on arbitrariness/sensitivity and the unusually large indirect-effect magnitude. GPT’s additional points (platform delivery endogeneity, mechanism, factorial-design power, reproducibility) are plausible and largely non-spurious, even if not raised by the cited experts.
⚠️ Issues Human Experts Raised That GPT Missed (8)
Pre-registration deviations / researcher degrees of freedom beyond winsorization (explicitly flagging departures from the pre-registered analysis plan)
Outcome normalization choice: donations normalized by ZIP-code population and how that affects interpretation/inference
Pre-registered heterogeneity method not implemented: planned causal trees approach for structured heterogeneity search
HTE statistical inference gaps: missing joint tests (e.g., F-tests) to establish which HTEs are significant
HTE interpretability: large positive/negative split effects likely noise without theory for negative effects; overselling of HTE patterns
Spillover modeling arbitrariness details: sensitivity to assumed spillover radius/distance cutoff and functional form; the striking claim that indirect effect is ~10x direct effect
Conceptual critique of using covariate-balance hypothesis tests as a ‘test of randomization’; preference for standardized differences
📋 Extra Issues GPT Identified (5 not in human critique)
Treatment intensity vs. assignment driven by Facebook’s delivery algorithm potentially inducing endogenous exposure differences within treated ZIP codes
Mechanism claim (“power of asking”) not directly tested against alternative mechanisms (salience/trust/news shocks)
Factorial design/power: content and impression-allocation comparisons not statistically distinguishable and should be framed as low power rather than equivalence
Replication/reproducibility constraints due to proprietary/aggregated data and unclear code availability
Pre-treatment imbalance concerns for the 23-charity alliance outcomes framed as potential residual confounding/seasonality issues
📄 View full human critique
Optional but important

Deviations from the pre-registered analysis plan or degrees of freedom, including:
Transforming the outcome variable — winsorizing location-days with gifts of over 1000 euros.
Donations normalized by the population of each zip code.
Length of measurement window.
Investigating the effects on competing charities (additional dimension of analysis).

Preregistered, not done: "We also plan to use causal trees to search for meaningful heterogeneity in a structured way using all available pre-treatment covariates…"

Interpretation of main results: lack of emphasis on uncertainty bounds.

Reiley: "the authors oversell some of their results, mainly by failing to provide confidence intervals rather than point estimates in some of their discussions."

Capitan: "The campaign appears profitable by point estimate, but the confidence intervals and strong behavioral assumptions warrant a more cautious interpretation. Highlighting this uncertainty explicitly…"

Interpretation/statistical inference for heterogeneous treatment effects: "The coefficients with the biggest high-low differences all ended up estimating positive treatment effects for one half of the data and negative treatment effects for the other half, which helped the difference become big enough for statistical significance. But it's hard for me to imagine that negative treatment effects are genuine, rather than mere noise, because I don't see a good theory for why Save the Children ads would cause donations to decrease among half the population."

HTE tests subject to MHT bias (the 'causal trees' approach they proposed might have dealt with this).

Missing statistical tests for HTEs: "the authors did not conduct F-tests to conclude which of these HTEs were statistically significant"

Profitability: Trimming (Winsorizing) particularly problematic for these estimates.

Estimating and Interpreting spillovers: "spillovers shows an indirect effect (the effect of treating neighboring postal codes) that is ten times higher than the direct effect (the effect of treating one's own postal code)." The spillover result relies on a number of arbitrary assumptions, such as the distance over which spillovers can versus cannot occur, and the functional form through which we measure these spillover effects.

Possibly relevant but maybe controversial

Capitan: "interpreting covariate balance tables as a test of randomization is conceptually incorrect … report standardized differences in covariates without relying on hypothesis tests"
🤖 View all GPT key issues (12)
1. Interference/SUTVA: Geo-randomized Facebook targeting plausibly induces cross-postal-code spillovers and location misclassification; the paper’s neighbor-share regression is suggestive but not a randomized test, so the magnitude/direction of interference remains uncertain and complicates causal estimands.
2. Treatment intensity vs assignment: Postal-code assignment does not ensure uniform exposure; Facebook’s delivery algorithm can create endogenous within-area intensity and composition, making per-capita effects harder to interpret as “advertising causes X euros per million residents.”
3. Revenue effects are borderline: Donation revenue results are weaker than frequency (e.g., long-term p<0.1), so ROI/profitability conclusions are sensitive to specification, tail behavior, and aggregation choices.
4. Winsorization and heavy tails: PLZ-day winsorization at €1,000 may affect inference and ROI in a heavy-tailed donation distribution; robustness is mentioned but should be more fully quantified (e.g., alternative cutoffs, quantile/Poisson models).
5. Post-period choice and seasonality: Ending outcomes at Dec 31 is substantively motivated but couples treatment persistence to extreme seasonal dynamics; more explicit dynamic/event-study estimates would better separate persistence from seasonal noise.
6. Competitor/crowding-out measurement mismatch: Competitor datasets are incomplete (often online-only), overlap across sources is unresolved, and sector-wide giving is unobserved, limiting the strength of conclusions about total charitable-sector effects.
7. Pretreatment imbalance for the 23-charity alliance outcomes: The need for DiD/lagged-outcome adjustment is appropriate, but residual confounding and differing seasonal patterns across areas remain possible; inference is also relatively weak (often p<0.1).
8. Mechanism claims exceed direct tests: The conclusion that effects primarily reflect the “power of asking” is plausible but not directly isolated from alternative mechanisms (salience, trust, concurrent campaigns/news shocks).
9. Factorial design interpretation: Differences across video type and impression allocation are mostly not statistically distinguishable; emphasizing “no matter the content, online fundraising works” should be framed as limited power to detect meaningful differences rather than equivalence.
10. Heterogeneity/multiplicity risk: Many subgroup interactions are reported with strong managerial recommendations, but these appear exploratory and risk false positives without multiplicity control or holdout validation.
11. Profitability depends on assumptions: Long-run ROI uses an assumed lifetime-value multiplier and implicitly assumes competitor crowd-out does not offset social impact; uncertainty propagation around ROI should be more explicit.
12. Replication constraints: Although preregistered and well-documented, proprietary and aggregated data plus unclear code availability limit independent reproducibility of key transformations (aggregation, neighbor calculations, data merges).
Benabou_et_al._2023
Coda title: Willful Ignorance and Moral Behavior
Coverage: 5% | Precision: 10% | Rating: Poor
LLM Assessment Summary
The human critiques focus narrowly on ceiling effects in Likert belief measures and on improper inference from non-significance; GPT’s issues do not address these points and instead critique construct validity and modeling in an apparently different part of the paper.
Detailed Comparison Notes
There is essentially no direct overlap: GPT does not mention Likert ceiling/censoring, Tobit/censored models, power concerns due to ceiling effects, or the logical error of treating non-significance as evidence of no effect. Most GPT points may be substantively reasonable in another context, but they do not align with the specific expert concerns provided.
⚠️ Issues Human Experts Raised That GPT Missed (4)
Ceiling effects in Likert-scale belief measures (most responses at the maximum), which can attenuate estimated treatment effects and differences between groups
Suggestion to use an appropriate censored/outcome model (e.g., Tobit) to account for the ceiling effect (and consider how it interacts with inverse probability weighting)
Misinterpretation of non-significant differences in belief updating as evidence of no difference (“absence of evidence” vs “evidence of absence”), especially under potential low power driven by ceiling effects
Group differences in available ‘room to update’ (information seekers vs avoiders) induced by the ceiling, affecting comparability of belief-updating magnitudes
📋 Extra Issues GPT Identified (10 not in human critique)
Construct validity concerns about pooling ‘deontological’ choices across heterogeneous moral domains/tasks
Overstated conclusion about absence of stable preference types; suggestion of latent variable / IRT / mixture models
Problems using Pearson correlations on dichotomous measures; attenuation and alternative estimators
Confounds in the trolley-style task interpretation (default/omission/commission/fairness/indirectness)
Caution about probabilistic ‘real stakes’ manipulation and interpretation of null effects
Shared framing across tasks potentially driving common behavior (charity vs direct transfer; demand effects)
Limited external validity due to online/student sample
Some tasks may measure compliance/experimenter-demand rather than moral constructs
Multiple-hypothesis/testing and narrative over-weighting small significant links
Open-science/replication clarity: permanent links to data/code/materials
📄 View full human critique
Optional but important

1. Ceiling effects with Likert scale measures [Treich] Assuming that the authors evaluated the beliefs twice, I might have some concerns here. One issue is that most of the participants on this question are distributed at the highest level of the Likert scale (about 70% of the participants report the maximum value looking at Figure A5). When assessing a difference between treatment groups or a treatment effect, ceiling effects are important as they can lead to considerably underestimate the difference. I would suggest using here a Tobit model to take this issue into account. (I assume that there is no issue with combining it with inverse probability weighting.)

2. Reporting on lack of difference in beliefs [Triech] Another related issue concerns the difference in beliefs. The authors write that the difference in beliefs is 0.15 for information avoiders and 0.20 for information seekers (page 18). However, note that the difference is non-negligible (it is about 33% larger). The lack of significance for the difference does not mean that there is no difference (well-known motto: the absence of evidence is not the evidence of absence). This is particularly true in the case of underpowered tests. And as I mentioned above, this is likely to be the case here because of the ceiling effects. If we look at Table A9, we see that the average beliefs are 4.69 for information avoiders and 4.59 for information seekers. It seems that information seekers have more room to update their beliefs on this Likert scale than information avoiders (because of the ceiling effect).

Quote from authors (to avoid straw-manning): "However, we do not find that belief updating in response to information differs significantly between information avoiders and seekers (average belief updating: 0.15 vs. 0.20 points, p = 0.55). Hence, differences in belief updating are unlikely to explain the treatment effect heterogeneity between information avoiders and seekers in our experiment."
🤖 View all GPT key issues (10)
1. Construct validity: the “deontological” option differs across games (harm/omission in trolley, honesty in lying, anti-corruption in bribe, purity in repugnant statement, rule-following, expressive/non-pivotal voting), so weak cross-task correlations may reflect multidimensional moral domains rather than absence of stable traits.
2. The headline conclusion of “no stable individual preference types” is stronger than what low pairwise correlations among binary indicators can establish; latent-variable models (multidimensional factor/IRT, mixture models) could test for structured heterogeneity more directly.
3. Reliance on Pearson correlations of dichotomous choices risks attenuation from measurement error and base-rate differences; reliability-adjusted associations or alternative estimators would better quantify “consistency.”
4. SAL trolley interpretation is confounded by default effects, omission/commission, reallocation of promised aid, fairness across regions, and the indirect nature of “statistical lives via donations,” which may not cleanly map to standard trolley ‘instrumental harm’ constructs.
5. The “real stakes” manipulation in the trolley is probabilistic (10% implementation); while a robustness exercise shows similar probabilities matter in SVO tradeoffs, moral responsibility may respond differently, so null effects should be interpreted cautiously.
6. Many EVM tasks share a common consequence frame (15€ charity donation vs 2€ to another subject); stable preferences about charity vs direct transfers, or about ‘experimenter-intended morality,’ could interact with means manipulations.
7. External validity is limited by an online, mostly-student lab pool; the extent to which observed context-dependence generalizes to high-stakes institutional settings (triage, bribery, AI policy) is uncertain.
8. Some games (e.g., group donation with non-pivotality, rule-following task) may tap compliance/norm sensitivity toward the experiment itself rather than moral philosophy distinctions, complicating interpretation as deontology vs consequentialism.
9. Multiple-hypothesis considerations are noted for some correlations, but the paper’s narrative may still overweight small significant links (e.g., trolley–lying r≈0.16) without a unified error-rate control for exploratory patterns.
10. Open-science/replication usability would be clearer if the manuscript explicitly provided permanent links to anonymized data, code, and experimental materials beyond the preregistration reference.
Bilal_and_Kaenzig_2024
Coda title: The Macroeconomic Impact of Climate Change: Global vs. Local Temperature
Coverage: 85% | Precision: 90% | Rating: Good
LLM Assessment Summary
GPT captures the core human critiques around small-sample/finite-sample inference, extrapolation/linearity, and horizon limitations, but it misses some of the most concrete methodological asks (notably simulation validation and the explicit ‘more lags’ request). It also adds several plausible but broader concerns not highlighted by the experts.
Detailed Comparison Notes
Human critiques are relatively focused (sample size/inference validity; longer horizons/lags; nonlinearity/comparability; out-of-sample scaling). GPT covers these themes but expands into identification/external-validity and SCC-modeling critiques, which are substantively relevant yet not directly reflected in the provided expert comments.
⚠️ Issues Human Experts Raised That GPT Missed (3)
Explicit recommendation to validate finite-sample confidence intervals with simulation evidence tailored to “time series + few observations + many parameters” (GPT notes finite-sample inference concerns but not the concrete simulation-based diagnostic).
Specific suggestion to increase the number of lags to reflect longer geoscience/climate cycles (GPT discusses persistence/shock-construction choices, but not the targeted ‘add more lags’ robustness request in the local temperature model).
Targeted request to report impacts beyond 10 years / provide an SCC robustness calculation that explicitly allows longer-run effects (GPT flags 10-year-horizon issues and SCC model-dependence, but not the concrete ‘show >10 years out’ deliverable as clearly).
📋 Extra Issues GPT Identified (8 not in human critique)
Confounding from unobserved global shocks correlated with temperature innovations (policy/finance/technology/geopolitics) affecting GDP.
External validity concern: natural-variability shocks (ENSO/volcano/internal variability) may not map to anthropogenic warming damages.
Detailed critique of temperature-shock construction choices (Hamilton multi-step forecast error; sensitivity to h and p) and how embedded persistence assumptions affect damages.
Causal skepticism about the extreme-events mediation / bottom-up aggregation channel (measurement and separability assumptions).
Panel design concern about residual common-factor confounding and missing time fixed effects for global shocks.
Uncertainty propagation critique (deep uncertainties not integrated beyond reduced-form CI).
Replicability/code-release and pipeline versioning concerns.
📄 View full human critique
Necessary Small sample size needs augmentation and robustness checking
E2: BK’s main results are calculated using a regression of around 50 observations. The regression appears to have at least 7 parameters. …not clear that the asymptotic approximations required to justify their frequentist confidence intervals are appropriate. I would suggest providing simulation evidence to illustrate the validity of their confidence regions in their setting, with time series data, few observations, and lots of parameters. Authors responded (September 2025) by introducing the BU dataset, which spans 160 years. And note they performed inference robustness using bootstrapping techniques in the appendix.
Optional but important Consider longer-term effects
E2: “It would be good to show the impacts for more than 10 years out, and to include a robustness check SCC calculation which allows for longer run effects” [NotebookLM] . The revised paper directly addresses this by converting their estimates to the damage implied by a permanent 1°C rise in temperature, yielding the much larger long-run figure of 22–34% GDP reduction. Sept version “In the longer BU sample, the same temperature shock leads to a peak effect at 18% after 5 years with a 95% confidence interval of (6%, 30%), … In both cases, impacts do not fully mean-revert even after 10 years. “
And it explicitly includes a section on Sensitivity that details how the SCC changes under alternative assumptions.

2. Longer lags to reflect geoscience [NotebookLM]: Evaluator 1 requested increasing the number of lags, especially given the longer cycles noted in geoscience. The newer version notes that in the longer BU sample, they use 4 to 8 lags to capture the complexity of the historical series.

Less important, I guess

Non-Linear Effects and Comparability: NotebookLM: Evaluator 2 requested allowing the local temperature model to include non-linear or permanent growth effects for a fair comparison with previous literature. While the authors maintain their main conservative assumption of level effects (not permanent growth effects), they clarify that their local temperature estimates generate a 3% long-run GDP reduction, which is consistent with the "level effects" benchmark of the conventional literature. They explicitly report that imposing non-linear specifications on their shocks did not materially change the core estimates.

Out-of-Sample Extrapolation: Evaluator 1 suggested acknowledging how far out of sample the 1°C calibration is, given that shocks are typically around 0.3°C. The 2025 version notes that scaling up to a 1°C increase requires assuming linearity, thereby explicitly addressing this methodological limitation, although they suggest that the consistency of their linear estimates across shock sizes supports the approach.
🤖 View all GPT key issues (11)
1. Time-series identification risk: global temperature innovations may still correlate with unobserved global shocks (policy, finance, technology, geopolitics) that affect GDP at multi-year horizons, and the small annual sample (≈60) makes this hard to rule out conclusively.
2. External validity: natural variability shocks (ENSO/volcano/other internal variability) may not be representative of damages from sustained anthropogenic warming, due to different spatial/seasonal patterns and joint dynamics of climate variables.
3. Magnitude interpretation: the 12% “per 1°C shock” effect partly reflects the post-shock persistence of temperature; the welfare mapping depends critically on how persistence is estimated and translated into a damage kernel.
4. Limited information about nonlinearities and tails: the historical global shocks are small; extrapolating linearly to multi-degree warming and to potential tipping points/adaptation regimes is weakly identified.
5. Long-horizon LP and finite-sample inference: overlapping cumulative-growth outcomes, persistent regressors, multiple controls/dummies, and 10-year horizons can generate biased standard errors and sensitivity to specification choices despite robustness checks.
6. Temperature-shock construction: the Hamilton (2018) multi-step forecast-error approach (choice of h and p) is defensible but not uniquely grounded; shock definitions can embed assumptions about persistence that influence implied damages.
7. Mediation via extreme events is suggestive but not causal: extreme indices rely on reanalysis and threshold/smoothing choices; the bottom-up aggregation assumes separability and correct dynamic persistence and may omit other correlated channels.
8. Panel-country evidence still lacks time fixed effects for global shocks; while global controls and alternative designs are explored, any residual common-factor confounding remains a concern.
9. Structural welfare/SCC translation is model-dependent: representative-agent neoclassical model with damages only through TFP omits adaptation, sectoral/region heterogeneity, risk/uncertainty, and endogenous mitigation; the functional-form restriction for the damage kernel extrapolates beyond observed horizons.
10. Uncertainty propagation: reported confidence intervals mainly reflect reduced-form estimation uncertainty; deeper uncertainties (climate impulse response, external validity, long-horizon extrapolation, measurement error in extremes) are not fully integrated.
11. Replicability: while data sources are public and appendices are detailed, full computational reproducibility likely requires releasing code and exact data-processing pipelines (shocks, weighting, extreme event construction, smoothing choices, and versioning).
Blimpo_and_Castaneda-Dower_2025
Coda title: Asymmetry in Civic Information: An Experiment on Tax Participation among Informal Firms in Togo
Coverage: 80% | Precision: 90% | Rating: Good
LLM Assessment Summary
GPT captures the core human concerns about missing receipt-based verification, self-reporting threats, and (especially) exclusion restriction problems for the 2SLS, but it misses the most specific enumerator/intervention overlap critique and the explicit suggestion to test direct treatment effects on visits and enumerator-trust proxies.
Detailed Comparison Notes
The human critique about enumerators delivering the intervention is only indirectly reflected in GPT’s broader ‘attention/differential engagement’ points; GPT does explicitly flag IV exclusion issues, aligning well with the expert’s main econometric concern. GPT also goes beyond the experts with several generally valid econometric/reporting issues (clustering, multiple testing, robustness), which boosts breadth but reduces strict alignment.
⚠️ Issues Human Experts Raised That GPT Missed (5)
Specific concern that the same people served as both enumerators and intervention deliverers, potentially increasing respondent trust/familiarity and biasing self-reported endline outcomes (especially economic activity) even absent real effects.
Human-suggested diagnostic: test in the data for changes in (or proxies for) respondent familiarity/trust in enumerators that could drive differential reporting.
Human-suggested remedy: consider dropping the 2SLS strategy entirely (given exclusion restriction concerns) and instead present correlational associations between knowledge and outcomes.
Explicit recommendation to test whether the treatment directly affected the probability of receiving a tax official visit (not just how to interpret results conditional on visits).
Suggestion to use survey data to construct an alternative/indirect measure of tax liabilities paid (as a robustness outcome) rather than relying primarily on receipt-validated payments with heavy missingness.
📋 Extra Issues GPT Identified (9 not in human critique)
Potential lack of appropriate clustering of standard errors given clustered sampling/enforcement environments.
Specification sensitivity (large changes in significance across models) and the need for robustness approaches (e.g., randomization inference, pre-specified primary model).
Multiple-testing concerns in heterogeneity analyses and the need for adjustments or clearer reporting of the search space.
Post-treatment conditioning/collider bias warning when splitting/conditioning on visits (related to, but distinct from, the human request to test direct effects on visits).
Attention/Hawthorne effects from differential contact intensity (training + consultations) as an alternative channel beyond knowledge.
Concerns about noise/seasonality and survey-timing effects for economic activity measures.
Mechanism not directly testable without administrative enforcement data (humans noted admin-data limitations mainly for outcome measurement rather than mechanism testing).
Timeline/baseline definition clarity (ensuring covariates/outcomes are strictly pre-treatment).
Open-science/replicability gaps (data/code/preregistration not evident).
📄 View full human critique
Optional but important

1. Same people used as enumerators and intervention — may frustrate causal identification (for outcomes without receipts) [E2 notes] "The intervention was delivered by the same enumerators who conducted the baseline and endline surveys. This is a bit concerning because it introduces the possibility of endline reporting varying because respondents have greater knowledge of and trust in the enumeration team. While this does not concern validated measures like tax payment—which relies on showing a receipt—it does make me a bit more wary of interpreting reports of economic activity. I also think it leads to an exclusion restriction violation for the 2SLS strategy, which currently assumes the treatment can only operate through the channel of improving knowledge. Can the authors test this in some way in the data, i.e., by looking at proxies for familiarity with or trust in the enumerator? This leads me to think the authors might consider dropping the 2SLS and replacing it with correlational analysis between knowledge and tax payments."

2. They should [E2] "test whether the treatment affected the likelihood of a visit directly?" 'While the treatment itself could affect the likelihood of a visit, the patterns that we observe are still instructive.' DR: Unsure

1. Lack of admin data limits accuracy, consider further imputation work E1 "… measures of tax participation and tax payments are self-reported by the MIEs' owners." … although they do ask "owners to provide proof of tax payment through receipts … which resulted in a considerable mass of missing values … imputed as zeros…" implying "a noisy measure of the treatment impact on this variable, which the authors [recognize]" consider using survey information to "construct an indirect measure of liabilities paid by the MIEs, as an alternative outcome variable for robustness checks." DR: But it's unclear how much can be done here, and they don't seem to be overclaiming
🤖 View all GPT key issues (12)
1. Tax revenue amounts are based on a small subset with receipts (≈60/383) and the analysis imputes all missing/unverified amounts as zero, conflating nonpayment with non-verification and making the “revenue increased” claim highly sensitive to missing-data assumptions.
2. The binary “pays taxes” outcome may be differentially misreported after training because the intervention changes respondents’ understanding/classification of payments (TPU vs other taxes/fees/bribes), threatening comparability across arms.
3. Treatment firms received substantially more contact (training + follow-up consultations) than controls, so effects on economic activity and even tax behavior may partly reflect attention/Hawthorne effects rather than information about the fiscal contract per se.
4. Conditioning on (or splitting by) post-treatment variables such as “visited by a tax official” risks post-treatment selection/collider bias; such results should be framed as descriptive rather than evidence of bargaining-power mechanisms.
5. The 2SLS “knowledge take-up” LATE relies on a constructed endline measure (including an ‘accountability opinion’ proxy) and likely violates exclusion (assignment affects outcomes through channels other than the measured knowledge items), so IV coefficients should not be interpreted straightforwardly as causal effects of ‘knowledge.’
6. Standard errors are not clearly clustered by enumeration area/neighborhood despite clustered sampling and likely correlated enforcement environments; uncertainty may be understated.
7. Main treatment effect significance varies notably by specification (from 10% without controls to 1% with controls), suggesting sensitivity and the need for robustness checks (e.g., randomization inference, clustered SEs, alternative specifications, or pre-specified primary model).
8. Economic activity measures (open orders; estimated monthly revenue) are potentially noisy and seasonally variable, and the paper does not fully rule out survey-timing or reporting effects tied to differential enumerator engagement across arms.
9. Heterogeneity analyses (revenue quartiles, revenue-to-assets ratio, tax unpredictability) appear exploratory without multiple-testing adjustments or a clear count of attempted interactions, raising the risk of selective emphasis.
10. The mechanism story (‘empowerment’ reallocates audits from low- to high-revenue firms) is plausible but not directly tested with administrative enforcement data; evidence is indirect and partly based on self-reports and descriptive patterns.
11. Timeline/baseline clarity could be improved (sampling-frame survey vs experimental baseline), to confirm all baseline covariates and outcomes used in ANCOVA are strictly pre-treatment and comparable across arms.
12. Replicability/open-science elements (data/code availability, pre-registration, analysis reproducibility) are not evident in the manuscript, limiting external verification and reuse.
Bruers_2021
Coda title: The animal welfare cost of meat: evidence from a survey of hypothetical scenarios among Belgian consumers
Coverage: 45% | Precision: 90% | Rating: Moderate
LLM Assessment Summary
GPT captures the general idea of construct-validity/confounding but does not directly flag the expert’s central criticism about cultured-meat WTP being an invalid proxy for altruistic animal-welfare WTP and the specific confounds involved. Most GPT issues are substantive and plausible, but they largely go beyond the human critique rather than matching it.
Detailed Comparison Notes
The human critique is narrowly focused on one proxy-measurement problem (cultured meat as a confounded measure of altruistic WTP), whereas GPT provides a broad methodological audit of the paper (identification, attrition, outliers, inference, external validity, policy extrapolation). Hence precision is high (issues are relevant to paper quality) but alignment/coverage relative to the specific expert critique is only partial.
⚠️ Issues Human Experts Raised That GPT Missed (3)
Key construct-validity concern specific to the paper’s proxy: using WTP for replacing conventional meat with cultured/cultivated meat may not identify altruistic animal-welfare preferences because it is confounded by expected product quality differences, food neophobia, and “naturalness/anti-food-tech” attitudes.
Cultured-meat substitution may change beliefs about animal population dynamics (e.g., preventing animal births), which can affect welfare judgments independently of “reducing suffering,” so the implied marginal altruistic WTP may be conceptually mis-specified.
The expert critique’s broader point that many stated limitations may already be acknowledged in the paper (i.e., not true omissions) is not addressed by GPT.
📋 Extra Issues GPT Identified (11 not in human critique)
Overstated interpretation relative to acknowledged uncertainty/sensitivity.
📄 View full human critique
Necessary None
Optional but important Many of the limitations are already acknowledged by the author (in the paper?), and thus not “omissions/mistakes”. At least one issue was not noted in the original paper (which the author agrees with).
Espinosa: [marginal altruistic WTP] is approximated through the WTP for the replacement of one kilogram of conventional meat supply with one kilogram of cultured meat. However, it is unclear whether the cultured-meat WTP effectively reflects altruistic preferences. For instance, consumers could expect the quality to differ between the two products, they could be reluctant to try new food (food neophobia), they could hold naturalistic views about food (i.e., a reluctance against food tech), etc. At the altruistic level, individuals could also think that cultured meat would prevent the birth of some animals, which, even if their net lifetime welfare is negative, would not be socially desirable.

Authors' response: …I agree with [this] criticism: there are many confounding factors to measure the marginal WTP using questions about cultivated meat. The same goes for hypothetical questions about e.g. meat from animals that are genetically programmed to not feel any suffering. I don't know of another method used in the literature that captures the full marginal WTP to avoid all suffering of a farmed animal, without confounding elements. As animals on animal welfare certified farms still experience some non-zero suffering, traditional research methodologies involving WTP for animal-welfare-labelled meat do not capture the full WTP to avoid all suffering, and also involve confounding factors such as the level of trust in the certification scheme.
🤖 View all GPT key issues (12)
1. The identification of “external animal welfare costs” with respondents’ pill-scenario WTP/WTA relies on strong, unvalidated assumptions (additive separability, correct interpretation of the vignette, δa=1 interpersonal/species comparability, and a meaningful zero point tied to non-existence/neutral welfare).
2. The welfare-neutral animal is intended to anchor the zero point, but many respondents still exhibit positive valuations for the neutral animal (especially in means), indicating the relative measure may not isolate animal welfare from general aversion to the scenario (identity loss, being non-human, fear/disgust).
3. Extreme attrition and selection: core median estimates use very small subsamples after exclusions (e.g., N≈57/N≈66), and excluding below-median confidence responses likely biases estimates upward (the paper notes low-confidence respondents often report zero/low WTA).
4. Outlier handling for mean WTA is ad hoc (rank “10× jump” rule; effective €1,000,000 cutoff) and changes headline results by orders of magnitude; no preregistered or theory-grounded robust estimation strategy is provided.
5. The WTA–WTP gap is enormous and central; interpreting WTA as the “right” welfare measure is not justified within a clear rights baseline or welfare-theoretic loss function, and the paper alternates between mean/median and WTA/WTP to motivate conclusions.
6. Construct validity: responses may reflect moral attitudes, social desirability, or misunderstanding rather than welfare judgments; the lack of correlation between representative-agent valuations and MWTP for cruelty-free meat suggests the measures are capturing different constructs or are noisy/unstable.
7. Statistical inference is limited: small effective samples, multiple tests, low power, and no multivariate modelling; key heterogeneity questions (knowledge, diet, engagement, comprehension) are left largely unresolved.
8. The per-kg conversion (animal valuation divided by assumed edible meat yield) is a key driver of the “chicken >> beef” conclusion but is not stress-tested for alternative yield assumptions, production systems, or uncertainty in yields.
9. External validity is narrow (Flemish-speaking Belgian online panel, 18–65); nevertheless, results are used to motivate broad global/practitioner conclusions and compared numerically to other countries’ parameters.
10. The carbon-tax backfire calculation mixes contexts (Belgian welfare costs with US prices/elasticities/consumption) and relies on strong functional-form assumptions; uncertainty is not propagated, yet the policy recommendation (flat meat tax) is stated relatively strongly.
11. Reporting/replicability gaps: the manuscript does not (in the provided text) specify full questionnaire wording in an appendix, detailed data cleaning scripts, or provide data/code, limiting reproducibility and auditability.
12. Interpretation sometimes overstates what the data show: given sensitivity analyses and acknowledged unreliability, claims about welfare costs “likely” exceeding consumer surplus and climate costs should be framed as highly contingent scenarios rather than robust estimates.
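Issue 8 above is ultimately a division: the per-kilogram welfare cost is the whole-animal valuation divided by an assumed edible yield. A minimal R sketch with purely hypothetical numbers (not the paper's valuations or yields) shows how sensitive the per-kg figures, and hence the species comparison, are to the yield assumption:

```r
# Per-kg welfare cost = whole-animal valuation / assumed edible yield.
# All numbers below are hypothetical placeholders, not the paper's estimates.
valuation_eur <- c(chicken = 10, cow = 500)   # valuation per animal
yield_kg      <- c(chicken = 1.5, cow = 250)  # assumed edible meat per animal

valuation_eur / yield_kg
# chicken ~6.7 EUR/kg vs cow ~2.0 EUR/kg under these assumptions

# Halving the assumed chicken yield doubles its per-kg figure, so the
# species ranking and its magnitude hinge on the yield assumptions.
valuation_eur / (yield_kg * c(chicken = 0.5, cow = 1))
```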
Clancy_2024
Coda title: The Returns to Science In the Presence of Technological Risks
Coverage: 45% | Precision: 95% | Rating: Moderate
LLM Assessment Summary
GPT captures one of the three main expert concerns well (the fragility of preferring superforecasters over domain experts) and only tangentially relates to the discounting critique, but it largely misses the experts’ central scope/framing complaint about ignoring AI risk. Most GPT issues are substantive, but they are mostly additional technical/modeling critiques not raised by the human experts.
Detailed Comparison Notes
The closest GPT comes to the experts’ AI-omission point is a brief mention that risks may not be shifted by marginal science pace ‘especially if AI/engineering dominate capabilities,’ but it does not flag the evaluators’ main complaint: that AI risk levels are plausibly much higher than bio risk and should be included in the benefit–risk comparison. Likewise, GPT’s general critique of the discounting/hazard construct does not specifically address the evaluators’ objection to the 2% rate being derived from a short-horizon TAI forecast and extrapolated over very long horizons.
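The discount-rate sensitivity noted here and in the critique below is easy to make concrete. A minimal arithmetic sketch in R, using the 2% rate and the XPT-implied 0.05%–0.7% range cited in the evaluation (horizons chosen purely for illustration):

```r
# Present value of a constant stream of one benefit-unit per year under a
# constant annual discount/hazard rate r, over a horizon of T years.
pv_stream <- function(r, T) sum((1 - r)^(0:(T - 1)))

rates <- c(rate_2pct = 0.02, xpt_high = 0.007, xpt_low = 0.0005)
sapply(rates, pv_stream, T = 10000)   # long-horizon totals, roughly 1/r: 50, ~143, ~2000
sapply(rates, pv_stream, T = 100)     # value accruing in the first century only
# At 2%, ~87% of the discounted value accrues in the first century; at 0.05%,
# only ~5% does, so extrapolating the short-horizon forecast over millennia
# largely determines how much weight the far future gets.
```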
⚠️ Issues Human Experts Raised That GPT Missed (3)
Core omission/“stacking the deck”: the paper compares historical benefits of science against biotech risk while largely ignoring AI risk (and does not justify treating AI progress as outside ‘science’/academic funding); it also insufficiently explains why other major risks (nukes, climate) are treated as effectively sunk costs.
Specific critique of the 2% discount rate: inconsistent/insufficient justification, including implicitly extrapolating a short-horizon TAI forecast (meant to resolve in ~20 years) over millennia, and high sensitivity of results to this choice.
Reviewer concern that the paper’s scope framing is unfair in a way that biases against concluding in favor of a science slowdown (beyond technical model critiques)—i.e., a “question framing”/comparative-baseline critique rather than a parameter/structure critique.
📋 Extra Issues GPT Identified (10 not in human critique)
Conflation of average returns to science, effect of a one-year global pause, and marginal metascience/philanthropic acceleration (non-equivalence under diminishing returns/general equilibrium).
Detailed concerns about the hazard/“epistemic regime” construction and sensitivity to that modeling choice (beyond the human critique’s focus on a particular discount rate).
Critiques of the XPT-to-hazard transformation pipeline and lack of uncertainty propagation/sensitivity analysis.
Normative assumptions in the welfare function and population ethics shifts across model versions affecting headline ROI.
Potential overstatement in health/population mechanisms (permanent growth effects from a one-year pause) and hardcoded demographic assumptions.
Questioning calibration of long lags from science to invention/diffusion and dependence of results on these lag choices.
Policy-relevance critique of the ROI comparator/counterfactual and linear conversion from a global pause to per-dollar marginal effects.
Weak identification of attribution fractions (e.g., science share of life expectancy gains) underlying key conclusions.
Replicability concerns (lack of clearly provided code/data artifacts; need for reproducible workflow).
📄 View full human critique
Necessary None
Optional but important Unfair to compare all benefits of science to only biotech risk; neglect of AI Risk “stacks the deck against concluding in favor of a science slowdown…” NotebookLM: Evaluator 3 noted that AI risk estimates (both catastrophic and extinction) from the Existential Risk Persuasion Tournament (XPT) are much higher than those for biotech risks (e.g., superforecasters see AI extinction probability as 38x higher than bio extinction probability),
[NotebookLM] Author’s Response: The author, Matt Clancy, acknowledged that “all three reviewers noted at various points […] that this analysis should be extended to include AI risk”. He agreed that the report “should have spent more time on this question” of why it ignores the impact of science on other risk areas. He provided an extensive response attempting to model AI risks post-evaluation.
According to the author, this choice was based on the focus on funding ‘fundamental science’ (perhaps ‘academic’ work) and the assumption that AI progress mainly comes from private, applied, for-profit research. But he conceded this assumption may be “overly strong”. Also, he saw other risks (nuclear weapons and climate change) as primarily “sunk costs of technological progress”. But this was not explained in the initial paper, so it should be flagged as an ~issue, omission or error.
Choice of 2% discount rate is not consistently justified [NotebookLM, checked manually] [E2] The author chose 2% (driven mainly by the 1.6% annual probability of transformative AI derived from the Open Philanthropy Worldview contest, assembling credences of AGI arriving before 2043). … this is implicitly extrapolating a forecast designed to be resolved in ~20 years out for thousands of years, which is the same criticism the author used to dismiss alternative, lower discount rates implied by XPT participants’ forecasts (0.05%–0.7%). E1 did simple calculations that confirmed that the results are sensitive to this choice of discount factor [NotebookLM] … [Author accepts] that this implicitly extrapolates a short-term forecast over a much longer timeframe. He justified the choice by noting that a higher discount rate limits the impact of assumptions about the distant future, arguing that this shortcut “has less severe consequences for the model’s results when the implied discount rate is higher”. The author’s response might be seen as an ad-hoc justification, and it was not mentioned in the paper → flag as an ~issue, omission or error.
Unsure if correct/relevant NotebookLM: Evaluators 1 and 3 argued that Clancy’s stated personal preference (3 in 4 chance) for the more optimistic Superforecaster estimates (which result in a positive return to science) over the Domain Expert estimates (which suggest accelerating science may be a net negative) was “too strong” or required more ambivalence
🤖 View all GPT key issues (12)
1. The analysis conflates (i) the average return to global science spending, (ii) the effect of a hypothetical one-year global pause in science, and (iii) the marginal impact of metascience/philanthropic acceleration; these are not equivalent under diminishing returns, portfolio composition, and general equilibrium responses.
2. The “epistemic regime” discounting (constant annual p) combines forecast invalidation, catastrophic risk, and structural change into one hazard and assumes policy has no effect on regime transition in the baseline; conclusions about long-run benefits vs perils are highly sensitive to this construct.
3. The “time of perils” is modeled as a discrete step increase with constant hazards (d, d_x) and a one-year shift from pausing/accelerating science; real risk likely evolves endogenously and may not be meaningfully shifted by marginal changes in science pace, especially if AI/engineering dominate capabilities.
4. Mapping XPT forecasts into annual mortality and extinction hazards involves many strong transformations (conditioning on regime exit via GDP/extinction questions; imposing q0/q1/q2 step form; annualizing 5-year probabilities; severity assumptions for >1% and >10% events); uncertainty from this pipeline is not propagated into results.
5. The welfare function embeds strong normative commitments (log income, 2 log-points per healthy year, aggregation over persons and time) that materially drive results; the treatment of population ethics shifts across model versions (constant growth vs fixed births), affecting comparability and interpretation of the headline ROI.
6. The baseline model’s health/population mechanism (a one-year pause permanently reduces population growth) likely overstates long-run effects; the “more realistic health” correction helps but introduces other hardcoded assumptions (US SSA survival curves, ceiling at age 120, diminishing returns via log(birthyear−1800)).
7. The choice and calibration of long lags (20 years science→invention plus ~54 years diffusion) rely on indirect proxies (patent citation patterns, GDP/life expectancy convergence) that conflate science with other determinants; results depend on these lags because costs are assumed near-immediate while benefits are delayed.
8. The comparison metric (ROI vs giving $1 to a $50k earner) is not a policy-relevant counterfactual for most funders, and the conversion from a one-year global pause to per-dollar marginal effects assumes near-linearity and a correct denominator for “science spending.”
9. The claim that non-extinction mortality risks from advanced biotech are too small to offset historical benefits hinges on assuming historical benefit trends persist and are attributable to ‘science’ at the stated shares (e.g., 56% of life expectancy gains); these attribution fractions are only weakly identified.
10. The extinction-risk extension uses a reduced-form term for the value of the next regime (λW) without a consistent treatment of discounting, population, and income dynamics across regimes; the resulting “break-even λ” numbers can be misleadingly precise.
11. Arguments for preferring superforecasters over domain experts (intersubjective accuracy, correlated pessimism, limited track-record evidence) are suggestive but not robustly validated; conclusions about whether to accelerate science under extinction risk remain epistemically fragile.
12. Replicability is limited: the report describes a Python simulation and spreadsheet steps but does not clearly provide code/data artifacts, and many parameter choices involve manual judgment calls that would benefit from packaged reproducible workflows and probabilistic sensitivity analysis.
Dullaghan_and_Zhang_2022
Coda title: Forecasts estimate limited cultured meat production through 2050 (EA forum post)
Coverage: 45% | Precision: 85% | Rating: Moderate
LLM Assessment Summary
GPT captures the major meta-critique about the tiny sample and partially overlaps with concerns about TEA anchoring, but it misses several central, concrete expert points (notably the FGF2 unit error and the conditional-claim misinterpretation) and several domain-specific framing/definition problems. Many GPT-added points are still plausible and substantive, but they go beyond what the human experts emphasized.
Detailed Comparison Notes
On TEAs, GPT’s “heavy reliance on a few techno-economic analyses as anchors” aligns with the experts’ concern about over-reliance on Humbird and insufficient source diversity, but GPT does not explicitly flag the experts’ key nuance that TEAs are conditional scenarios using present-day costs (not forecasts of future costs). The most consequential expert-identified issue—the 1000x FGF2 unit typo—is not reflected in GPT’s list, even though it is a concrete, potentially high-impact error.
⚠️ Issues Human Experts Raised That GPT Missed (7)
Specific magnitude/unit error in the Delft FGF2 cost table (gram vs kg; ~1000x misstatement) and the possibility that this materially anchored forecasts
Mischaracterization of conditional claims (treating conditional probabilities about protests/willingness-to-try as unconditional evidence that consumer approval is not a major constraint)
Need to clearly communicate that the TEAs are conditional/scenario estimates (often using present-day input costs) rather than predictions of future costs, and that this framing may have misled forecasters
Need for deeper cell-biology/bioprocessing expert input to avoid definitional/technical mistakes (e.g., incorrect definition of “growth factors” including vitamins/cytokines)
Diversifying and updating the background literature provided to forecasters beyond heavy reliance on Humbird (e.g., more recent sources, patents, regulatory dossiers, GFI reports)
Elicitation design: lack of opportunity for discussion/structured collaboration and updating among forecasters (which Tetlock-style forecasting suggests improves performance)
Insufficient specification of the cell type/tissue type (beyond ‘cow cells’) despite its strong influence on media needs and cost
📋 Extra Issues GPT Identified (9 not in human critique)
Ambiguity in the core definition “>51% from animal cells” (weight vs calories vs value), especially relevant for hybrid products
Insufficiently pre-committed resolution protocol / authoritative data sources for global production and sales endpoints given proprietary/inconsistent reporting
Potential wording/reference error in an amino-acids input-cost question (mixing recombinant proteins and non-amino-acid examples) as a biasing anchor
Unjustified aggregation rule (geometric mean of odds) and lack of robustness checks to alternative pooling/extremizing/leave-one-out approaches (see the pooling sketch after this list)
Use of correlation and p<0.05 significance claims with n=6 without multiple-comparisons control or credible statistical assumptions
Conflation of descriptive forecasts with strategic recommendations without modeling counterfactual philanthropic leverage
Insufficient analysis of displacement/rebound/market-expansion mechanisms relevant to animal-welfare impact
Underspecified “conditional on no transformative AI” assumption and lack of sensitivity to broader macro scenarios
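The aggregation-rule item above is straightforward to illustrate. A minimal R sketch with hypothetical probabilities (not the report's forecasts) showing how much the pooled estimate depends on the pooling rule and on any single member of a six-person panel:

```r
# Pooling a tiny forecaster panel: linear pool (mean of probabilities)
# vs geometric mean of odds. Probabilities are hypothetical, chosen to
# include one optimistic outlier; they are not the report's forecasts.
p <- c(0.02, 0.03, 0.05, 0.05, 0.10, 0.45)

geo_mean_odds <- function(p) {
  odds <- p / (1 - p)
  g <- exp(mean(log(odds)))
  g / (1 + g)
}

c(linear = mean(p), geo_mean_odds = geo_mean_odds(p))
# ~0.12 vs ~0.07: the pooling rule alone moves the headline probability.

# Leave-one-out: with n = 6, dropping the outlier shifts either pool a lot.
sapply(seq_along(p), function(i) {
  c(linear = mean(p[-i]), geo_mean_odds = geo_mean_odds(p[-i]))
})
```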
📄 View full human critique
Necessary
Small sample limitations [Manheim]: severely limits the conclusions which can be drawn from the difference in views between the single expert and the five forecasters.
Somewhat overstated/mis-stated claims from these small samples: Manheim: “The aggregate reported results, most notably, a 9% chance of greater than 50m metric tons of cultured meat sold in 2051, hide the disagreement, in this case, the fact that two forecasters reported a greater than 40% probability of this occurring.”
Magnitude of units error in the table for Delft overstating the cost of FGF2 1000x [Manheim]: A potentially critical issue was found with the table provided to forecasters and experts for question 24, which says that the Delft analysis had a cost of FGF2/kg of 1.3-2.3B/kg, while in the report itself (table 3) this was the cost per gram, making the numbers approximately 3x what was reported for Humbird, not 3,000x. (The Risner et al price was, in fact, 1000x higher - but was also focused on the promise of eliminating FGF2, i.e. setting the price to zero. Notably, this is a possibility that the forecasts do not seem to take into account.) I do not know enough about this to check it in depth. Author’s response: Hi, Yeah that looks like a typo at first glance. Plausible it influenced things a little, though unsure it’s critical to the end results.
Mischaracterization of conditional claims: The authors stated: Consumer approval did not appear as a major constraint as measured by the probability of large-scale anti-cultured meat protests (18%), or the majority of survey respondents saying they would be willing to try cultured meat (70%) (conditional upon >10,000 cumulative metric tons of cellular meat produced at any price by 2031). I assume he is saying that the ‘probability of protest predicted to be low’ and ‘willingness to try’ being high are both signs that consumer approval is not a major barrier. But as you say this is conditional upon >10 kt of production. So the forecasters might indeed think “This is unlikely to be accepted by consumers, and there will probably be massive protests if it is rolled out. And that will be a major barrier. But in the unlikely scenario that consumers seem to like it, and there are no big protests, that makes it far more likely that 10kt will be produced.”
Optional but important
[Manheim] “the presentation of the Techno-Economic Analyses did not clarify that these analyses are conditional estimates, not predictions, and that high price scenarios were all based on the present-day costs… not predicted future cost.”
More consultation with cell biology and bioprocessing expertise in formulating background for forecasters (specific definitional mistakes reflect a lack of expertise) [Seinkmane]: They make further incorrect statements on growth factors, such as: “(growth factors are various hormones, cytokines, vitamins, and some other proteins that promote cell growth)”. Such a definition indeed can be found on Google, however in cell biology it is normally only protein signalling molecules, especially those affecting cell proliferation, that are classified as growth factors, certainly not vitamins and in most current literature not other cytokines either (see wiki article for quick review). Therefore for any further forecasts and synthesis I would strongly recommend further consultations with relevant experts, both in the CM field and in adjacent fields (wider cell biology and bioprocess engineering backgrounds).
Diversify sources in formulating background for forecasters [Seinkmane]: The authors admit themselves that they relied primarily on one source – the Humbird TEA – with the addition of a couple other papers. I have compiled a list of literature (see relevant tab) that would hopefully provide other and more recent sources to facilitate future forecasts. Aside from peer-reviewed publications and regulatory dossiers + patents available that have already been submitted for product approval, GFI releases information regularly, including their state of industry reports and reports focusing on specific topics such as media ingredients.
Less important/unsure
Lack of discussion and updating in expert elicitation [Manheim]: It is somewhat unfortunate that the forecasters and the expert were not given an opportunity to discuss and update, as this is an important part of what enables superforecaster success. Specifically, Gardner and Tetlock’s (2015) book points out that discussion dynamics and structured collaboration significantly amplify performance compared to individual experts, due to shared reasoning and cross-examination of assumptions.
Define the cell type more clearly [Seinkmane] Neil & Linch have focused on beef (their questions refer to “cow cells”) and they don’t define cell type very specifically… for future forecasts CM type should be better-defined, as both the species and the tissue type influence the media requirement – and therefore the cost – to a great extent
🤖 View all GPT key issues (11)
1. Very small and potentially unrepresentative panel (n=6) for long-horizon technology-scaling forecasts; no demonstrated calibration for this domain/horizon, and no quantification of aggregate uncertainty or sensitivity to individual forecasters.
2. Ambiguous key construct: “>51% from animal cells” was not specified as weight/volume/calories/value; this can change both forecasters’ interpretations and eventual resolution, especially for hybrid products.
3. Resolution and measurement risk for global production/sales volumes is high (proprietary data, inconsistent reporting); the paper does not fully pre-commit to a detailed resolution protocol and authoritative data sources for the main endpoints.
4. Acknowledged wording/reference error in the amino-acids input-cost question (mixing “recombinant proteins” and non-amino-acid examples) plausibly biases an important ‘crux’ variable and complicates interpretation of correlations and narratives.
5. Aggregation choice (geometric mean of odds) is not justified or stress-tested against alternative pooling rules (linear pool, extremizing, performance weighting, leave-one-out), despite the tiny panel and potentially influential outliers.
6. Correlation and “statistical significance (p<0.05)” claims are not credible with n=6 and many tested pairs; multiple-comparisons and model assumptions are not addressed, risking spurious inferences about which signposts matter.
7. The report mixes descriptive forecasts with strategic recommendations, but does not model counterfactual philanthropic leverage; probabilities of outcomes do not directly imply which interventions are best.
8. Heavy reliance on a few techno-economic analyses as anchors without a structured approach to combining first-principles engineering constraints with reference-class growth; this may over-weight specific assumptions (e.g., sterility standards, media costs).
9. Some questions are compound and cognitively demanding (multiple thresholds, time windows, price conditions, species restrictions), increasing noise and interpretation variance across forecasters.
10. Key welfare mechanism (degree of displacement of conventional meat, rebound effects, market expansion) is acknowledged but not analyzed, limiting direct usefulness for animal-welfare impact estimation.
11. The “conditional on no transformative AI” assumption is underspecified and may complicate interpretation; the paper does not discuss how sensitive conclusions are to broader macro/tech scenarios (policy shocks, pandemics, geopolitical shifts).
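GPT's sixth issue (the n=6 significance claims) can be quantified with a short calculation: with six observations, a Pearson correlation must be extremely large to clear p < 0.05, and testing many signpost pairs at that threshold still invites false positives. A minimal R sketch; the 15-pair count is a hypothetical illustration:

```r
# Two-sided critical |r| for a Pearson correlation at alpha = 0.05 with n = 6
n <- 6
t_crit <- qt(0.975, df = n - 2)
r_crit <- t_crit / sqrt(t_crit^2 + (n - 2))
r_crit  # ~0.81: only very large correlations can be "significant" at n = 6

# With, say, 15 signpost pairs tested and no true relationships, the chance
# of at least one |r| clearing that bar is still sizeable under independence:
1 - (1 - 0.05)^15  # ~0.54
```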
Frech_et_al._2023
Coda title: Pharmaceutical Pricing and R&D as a Global Public Good
Coverage: 70% | Precision: 90% | Rating: Good
LLM Assessment Summary
GPT captures the experts’ central critique that the GDP–price relationship is not causally identified and may reflect omitted factors like willingness to pay, but it misses the experts’ specific concern about overconfident cost-effectiveness BOTECs and heterogeneity across indications. Most additional GPT issues are relevant methodological weaknesses rather than spurious points.
Detailed Comparison Notes
The human critiques are narrow (mainly causal/OVB and BOTEC confidence), while GPT provides a broader methods-and-measurement audit. GPT’s points about mechanical scaling and conflation with general spending elasticities substantially overlap with the experts’ ‘GDP correlated with other determinants’ and ‘willingness to pay’ concerns, even if not phrased as classic omitted-variable bias. The only clear gap is the paper’s cost-effectiveness BOTEC claims and their sensitivity/heterogeneity.
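The 'mechanical scaling' point is worth making concrete. A minimal simulation sketch in R, with entirely hypothetical parameters: if pharmaceutical spending is simply income-elastic and quasi-rents are a constant share of spending, a log-log regression of the constructed contribution on GDP recovers an elasticity above one with no strategic free-riding anywhere in the data-generating process:

```r
set.seed(1)
# Hypothetical cross-section of 30 countries; log GDP drawn arbitrarily.
log_gdp <- rnorm(30, mean = 27, sd = 1)

# Assume (illustratively) an income elasticity of pharma spending of 1.2,
# and quasi-rents equal to a constant share of spending; no strategic
# behavior is built into this data-generating process.
log_spending     <- 1.2 * log_gdp + rnorm(30, sd = 0.3)
log_contribution <- log(0.6) + log_spending

coef(lm(log_contribution ~ log_gdp))["log_gdp"]
# Recovers ~1.2 (> 1) purely from income-elastic spending plus scaling, so an
# elasticity above one cannot by itself distinguish free-riding from ordinary
# income effects on health/pharma spending.
```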
⚠️ Issues Human Experts Raised That GPT Missed (2)
Overconfidence in back-of-the-envelope cost-effectiveness (e.g., ~$40k/QALY) claims without clear sensitivity analysis to key assumptions
Failure to acknowledge/handle heterogeneity in cost-effectiveness across drugs/indications (variation around the average BOTEC)
📋 Extra Issues GPT Identified (10 not in human critique)
Questioning whether the dependent variable (single-year quasi-rents) is the incentive-relevant object for R&D decisions (lifecycle/global expected profits)
Mechanical/logical dependence of the constructed ‘contribution’ measure on GDP via spending/sales scaling, undermining identification of strategic free-riding
Weak/assumption-heavy marginal cost identification (Turkey price index proxy; generic-entry erosion; rebate adjustments) and the problematic assumption of common MC across countries
Potential mismeasurement in cross-country price indices due to mix, launch timing, coverage, and confidential discounts
Not accounting for quantity/access differences (launch lags, reimbursement/coverage) that could confound interpretations of low prices as free-riding
Theory-to-empirics mismatch: models are not translated into discriminating, testable implications beyond the GDP correlation
Elasticity>1 is not a clean test of Olson–Zeckhauser/public-goods logic; could reflect standard income elasticity of health/pharma spending
Data construction/auditing concerns (brand-share assumptions; alignment with price-index basket) and limited leverage/outlier diagnostics in small samples
Policy recommendations lack explicit welfare analysis balancing innovation incentives vs static access/affordability and distributional impacts
Replication/transparency gaps (code/data wrangling detail; uncertainty propagation for key assumptions)
📄 View full human critique
Optional but important
Causal claims too strong — GDP correlated with other important determinants of outcomes
[E1] The primary regression analyzed in the paper is overly simplistic and may be subject to omitted variable bias. There could be other factors correlated with GDP that are not included in the model. Therefore, results derived from this model should be interpreted with caution as the estimated effect of GDP may be overstated or misattributed.
[E2] The authors interpret the positive correlation between GDP and drug prices as evidence supporting their hypothesis that drug prices are affected by how much countries internalize returns from R&D (because countries with higher GDP also benefit more from future innovation). However, the relationship between GDP and drug prices could be driven by many factors, chief among them the willingness to pay for current health benefits.
Unsure if correct
[E2] BOTECs stated too confidently, not incorporating sensitivity to key assumptions
The authors, citing numbers from another one of their studies, claim that new drugs launched in the U.S. are very cost-effective—around $40,000 per Quality-Adjusted Life Year (QALY) (Frech et al. 2022). Two main criticisms of these back-of-the-envelope calculations: First, estimates of drug cost-effectiveness rely on many assumptions and are generally highly sensitive to them. Second, there is likely broad variation across indications in the cost-effectiveness of drugs.
🤖 View all GPT key issues (12)
1. The dependent variable (“contribution” = quasi-rents on all branded-originator sales in a single year) is not clearly the incentive-relevant object for R&D, which depends on expected global profit streams for marginal projects; life-cycle stage, risk, and therapeutic heterogeneity are ignored.
2. The main empirical result (log contribution strongly explained by log GDP with elasticity > 1) is plausibly mechanical because the contribution measure is built from drug spending/sales, which naturally scales with GDP; this does not identify public-goods strategic behavior.
3. Marginal cost (MC) is weakly identified: using Turkey’s low price index as MC and using generic-entry price erosion (then adjusting by an average rebate) both rely on strong assumptions and can materially change levels and even signs of contributions.
4. Assuming a common MC across countries is particularly problematic because the paper’s MC includes distribution and downstream costs that vary across systems (wages, geography, pharmacy/PBM margins), confounding cross-country markups.
5. Cross-country price indices may reflect differences in drug mix, launch timing, reimbursement/coverage, and confidential discounts; interpreting them as “same products, different prices” risks mismeasurement of both prices and implied quasi-rents.
6. The analysis treats lower spending/markups as free-riding, but does not incorporate quantity/access differences (e.g., launch lags, reimbursement refusals), which could mean lower prices coincide with reduced consumption of newer drugs.
7. Theoretical constructs (Lindahl optimum; Nash interior/corner equilibria; bargaining) are not mapped to testable implications beyond the GDP correlation; the empirical work does not discriminate among the proposed models.
8. The “exploitation hypothesis” test (elasticity > 1) is not a clean test of Olson–Zeckhauser in this setting because it conflates general health/pharma spending elasticities with strategic under-contribution to R&D.
9. The construction of national branded-originator revenues from total sales × brand share assumes comparability across countries and alignment with the RAND price index basket; potential inconsistencies are not audited.
10. Outlier sensitivity and leverage are not thoroughly assessed (the US and Japan dominate totals; negative/zero contributions are dropped in logs), which can affect coefficient estimates and inference in small samples.
11. The policy conclusion that ROW countries should pay more is not supported by a welfare calculation that explicitly weighs dynamic innovation gains against static access/affordability losses and distributional concerns under differential pricing.
12. Replication barriers remain: no shared code, limited detail on data wrangling/transformations from the RAND tables, and no uncertainty propagation for MC assumptions or index measurement error.
Green_et_al._2025
Coda title: Meaningfully reducing consumption of meat and animal products is an unsolved problem: A meta-analysis
Coverage: 75% | Precision: 92% | Rating: Good
LLM Assessment Summary
GPT captures most core human critiques (improper null imputation, single-outcome selection, and inadequate risk-of-bias coverage including attrition) but misses the explicitly highlighted systematic-review transparency/reproducibility deficits and the n<25 exclusion concern.
Detailed Comparison Notes
The strongest overlap is on (i) the SMD=0.01 handling of missing/non-significant results and (ii) the wasted-data problem from selecting only one effect per study, plus (iii) risk-of-bias gaps (GPT explicitly lists attrition and selective reporting). The main shortfall is that GPT reframes the systematic-review-methods critique into coder/researcher-degrees-of-freedom issues without directly calling out missing reproducible search strings and incomplete inclusion/exclusion documentation, which are central to the human assessment.
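To see why the SMD = 0.01 imputation matters for a pooled effect near zero, here is a minimal inverse-variance pooling sketch with hypothetical effect sizes and standard errors (not the meta-analysis data); the imputed value and its assumed standard error are analyst choices:

```r
# Hypothetical reported effects (SMD) and standard errors
yi_reported  <- c(0.15, 0.08, 0.20, 0.05)
sei_reported <- c(0.06, 0.05, 0.09, 0.07)

# Suppose 4 further studies reported "no significant effect" without usable
# statistics; coding them as SMD = 0.01 (with an assumed SE) shifts the pool.
yi_imputed  <- c(yi_reported, rep(0.01, 4))
sei_imputed <- c(sei_reported, rep(0.07, 4))

pool <- function(yi, sei) {   # simple inverse-variance (fixed-effect) pool
  w <- 1 / sei^2
  sum(w * yi) / sum(w)
}
c(reported_only = pool(yi_reported, sei_reported),
  with_imputed  = pool(yi_imputed, sei_imputed))
# ~0.11 vs ~0.06 here: near a pooled effect close to zero, the imputed value
# and its assumed SE can materially change the headline estimate.
```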
⚠️ Issues Human Experts Raised That GPT Missed (3)
Failure to follow standard systematic-review reporting/methods specifically: lack of a fully reproducible search strategy (full queries, databases, dates, etc.)
Lack of clearly articulated inclusion/exclusion criteria and explicit justification/documentation for screening decisions (beyond the single-coder concern)
Questionable exclusion rule of omitting studies with n<25 per group (and the implication that this can impede publication-bias detection by restricting the SE range)
📋 Extra Issues GPT Identified (9 not in human critique)
Heterogeneous outcome constructs and conversions being pooled (non-comparability across very different measures)
Single-coder screening/extraction and missing inter-rater reliability reporting
Post hoc eligibility refinements creating researcher degrees of freedom absent preregistration/robustness checks
Extremely sparse evidence for choice architecture under the stated criteria (very low k leading to over-interpretation risk)
Concerns about publication-bias model assumptions (selection on p-values/sign vs outcome/analysis-level selective reporting, differing incentives in gray literature)
Dependence/weighting sensitivity beyond RVE (how multiple interventions per study are represented; need alternative aggregations)
Limited translation of SMDs into decision-relevant absolute units or cost-effectiveness
RPM vs MAP inference gap: substitution to other MAP largely unmeasured, so conclusions should be framed as an evidence gap
External validity/generalizability limitations due to restrictive eligibility (pure controls, voluntary choice), potentially excluding policy-relevant interventions
📄 View full human critique
Necessary
General: “The authors have not followed standard methods for systematic reviews…” — this should at least be mentioned. Related: Missing “a fully reproducible search strategy, clearly articulated inclusion and exclusion criteria …, and justification for screening decisions are not comprehensively documented in the manuscript or supplement.”
Improper missing data handling: Assigning SMD = 0.01 to non-significant unreported effects introduces systematic bias by ignoring imputation variance.
Optional/desirable
Single outcome selection wastes data: Extracting only one effect per study discards valuable information (multilevel modeling could work better).
Risk-of-bias assessment is inadequate: The informal approach omits critical bias sources like selective reporting and attrition. No discussion of attrition bias in RCTs… “concerning given the known non-randomness of attrition in dietary interventions”
Probably useful but more debatable
Excluding studies with n<25 per group is ‘unwarranted’ because it makes publication bias harder to detect by ‘restricting the range of standard errors.’ [But one could make a case that this is a reasonable practical screening choice]
🤖 View all GPT key issues (12)
1. Outcome selection rule: extracting only the longest-follow-up consumption outcome per intervention is a consequential choice that may systematically shrink estimates and is not accompanied by sensitivity analyses using alternative rules (e.g., prespecified primary endpoints or multivariate follow-up modeling).
2. Imputation of “unspecified nulls”: coding outcomes reported as null but lacking sufficient statistics as SMD=0.01 is arbitrary and can materially affect a pooled effect this close to zero; the count of such cases and robustness to alternative codings are not clearly presented in the main text.
3. Heterogeneous outcome constructs: pooling across very different consumption measures (binary self-report reduction, frequency counts, purchasing data, servings) and multiple effect-size conversion pathways risks non-comparability and additional error; limited sensitivity analyses restrict to more homogeneous outcome types.
4. Single-coder screening and extraction: inclusion/exclusion decisions and data extraction were primarily conducted by one author, increasing risk of missed studies, inconsistent coding, and bias; inter-rater reliability is not reported.
5. Post hoc eligibility refinements: several inclusion decisions were made after data collection began (e.g., RPM category definition and exclusions for “substitution-inducing” or low-agency interventions), creating researcher degrees of freedom unless accompanied by preregistration or systematic robustness checks.
6. Sparse evidence for choice architecture under inclusion criteria: only 2 studies (3 estimates) qualify, yielding extremely wide uncertainty; narrative comparisons across theoretical categories can be over-interpreted despite low k.
7. Limited risk-of-bias assessment: the coded bias indicators (self-report/objective, preregistration/open data, publication status) do not fully address common RCT bias domains (attrition, allocation concealment, contamination, compliance, clustering/ICC handling).
8. Publication-bias modeling assumptions: selection models and significance-based approaches assume selection on p<0.05 and sign, but selective reporting may occur at the outcome/analysis level and incentives differ across gray vs journal literature; this complicates interpretation of “bias-corrected” means.
9. Dependence and weighting: RVE is appropriate for multi-arm dependence, but results may still be sensitive to how multiple interventions from the same study are represented; additional checks (e.g., per-study aggregation under alternative models) would strengthen robustness.
10. Limited translation to decision-relevant units: effects are reported mainly as SMDs, with minimal mapping to absolute reductions (e.g., meals/week, grams, CO2e) or cost-effectiveness, making “meaningfully reducing” hard to interpret for practitioners.
11. RPM vs MAP inference gap: the larger RPM pooled effect is clearly identified, but substitution to other MAP is unmeasured in most RPM studies; discussion of substitution remains speculative and should be framed as a key evidence gap rather than an implied conclusion.
12. Generalizability constraints: requirements for pure controls and voluntary choice, while improving interpretability, may exclude policy-relevant interventions (institutional defaults/portioning) and thereby limit external validity for real-world implementation contexts.
McGuire_et_al._2024
Coda title: The wellbeing cost-effectiveness of StrongMinds and Friendship Bench: Combining a systematic review and meta-analysis with charity-related data
Coverage: 55% | Precision: 95% | Rating: Moderate
LLM Assessment Summary
GPT captures the core concern about the linear decay model and broadly echoes the experts’ theme of ad hoc/subjective adjustments, but it misses the experts’ key recommendation for a visual multiverse/sensitivity summary and does not directly flag the specific outlier/RoB exclusion critique.
Detailed Comparison Notes
The GPT list is largely methodologically substantive and overlaps conceptually with the experts’ main ‘standardized rules’ complaint via multiple points about discretionary weighting/adjustments. However, the human critiques focus on transparency and principled standardization (including explicit inclusion/exclusion rules and a compact sensitivity visualization), whereas GPT expands into additional domains (outcome metric mapping, spillovers, M&E design validity, publication-bias tooling, and cost modeling) that are plausible but not raised by the experts.
⚠️ Issues Human Experts Raised That GPT Missed (2)
Arbitrary exclusion of outliers and high risk-of-bias (RoB) studies (experts explicitly objected to removing these and preferred showing full-data impacts transparently).
Request for a clear, visual multiverse/sensitivity-analysis summary (e.g., specification curve / analogue of Figure 2) to show how many analytic choices move the final estimates—called the most significant statistical recommendation.
📋 Extra Issues GPT Identified (10 not in human critique)
WELLBY conversion and construct/normative assumptions (mapping SD-years to 0–10 life satisfaction; combining different outcome constructs).
Use of a pseudo-synthetic control for charity M&E pre–post effects without established exchangeability (risk of regression to mean, selection/attrition, measurement/incentive artifacts).
Household spillover benefits comprising a large share of total effects despite very sparse/heterogeneous evidence; spillover prior strongly driving totals.
Dose-response functional form choice for Friendship Bench (log(+1) transformation) and limited empirical grounding for parameterization.
Specific ad hoc adjustment decisions (e.g., ‘Iran study’ contextual suspicion) potentially conflating bias with true heterogeneity.
Publication-bias correction approach (averaging multiple methods; concerns under heterogeneity/dependence/time moderation).
Generic replication/response-bias discounts applied to M&E data that may double-count or miss biases and materially affect results.
Treatment of StrongMinds RCT relevance and the consequential decision to downweight rather than treat as strong negative evidence.
Cost-model counterfactual/partner adjustments for StrongMinds and need for better empirical grounding and uncertainty propagation.
Handling of long-term follow-ups via a structural compromise (exclude from core time model then apply a multiplicative factor) as a major discretionary modeling choice.
📄 View full human critique
Necessary
“Lack of Systematized/Standardized Adjustment Rules” — Both evaluators noted multiple “project-specific” and “subjective” analytical choices and ad-hoc adjustments. E2 particularly emphasized the approach to weighting and outliers. E1 — “I would be substantially more confident in the comparative cost-effectiveness estimates if the authors provided a clear set of rules for the adjustments they are making, along with a list of these adjustments to be applied consistently across all evaluated projects.”
Optional but important
“Arbitrary Exclusion of Outliers and High Risk of Bias (RoB) Studies” — Evaluator 2 critiqued the removal of outliers (e.g., effect sizes >2 SDs) and high-risk bias studies, arguing it seems “arbitrary” and that it’s “always better to include the full data” to transparently show the impact of such exclusions.
Clearer/visual depiction of sensitivity analysis/multiverse analysis — Both evaluators requested a concise visual summary (such as a specification curve or analogue of Figure 2) showing how the large number of analytical choices and adjustments systematically affect the final estimates. E2 called this their “most significant statistical recommendation”.
Linear decay model — Evaluator 2 expressed surprise at the linear choice, suggesting exponential models are more common in psychological phenomena and that linear decay might lead to “overestimation”
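To make the decay-model point concrete, a minimal R sketch with hypothetical parameters (an initial effect of 0.5 SD and a 0.4 SD one-year follow-up, fitted by both a linear and an exponential curve; not the paper's estimates):

```r
# Fit two decay curves through the same two hypothetical points:
# 0.5 SD at t = 0 and 0.4 SD at t = 1 year (illustrative numbers only).
d0 <- 0.5; d1 <- 0.4

# Linear decay to zero: slope 0.1 SD/year, hitting zero at t = 5 years.
# Total effect = triangular area under the curve.
slope <- d0 - d1
total_linear <- d0 * (d0 / slope) / 2

# Exponential decay through the same points, integrated to infinity.
k <- log(d0 / d1)
total_exponential <- d0 / k

c(linear = total_linear, exponential = total_exponential)
# The two totals differ by almost a factor of two here; which form yields the
# larger total depends on how each is calibrated and truncated, which is why
# the decay assumption matters so much for total WELLBYs.
```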
🤖 View all GPT key issues (12)
1. Total-effect-over-time relies on a linear decay assumption and triangular area-under-curve; psychotherapy effects may decay nonlinearly or have long tails, making total WELLBYs highly assumption-sensitive.
2. Handling of influential long-term follow-ups is ad hoc (exclude from core time model, then apply a 1.54 multiplicative compromise); this is a structural modeling choice without clear precedent and materially affects results.
3. The WELLBY conversion (SD-years → WELLBYs via a single “SD=2 on 0–10 ladder” mapping) and combining affective mental health (MHa) with SWB outcomes embed normative/construct assumptions that can shift cross-intervention comparisons.
4. Charity M&E pre–post effects are “corrected” using a pseudo-synthetic control from other trials’ control arms; exchangeability is not established, so residual bias (regression to mean, measurement context, incentives, selection/attrition) could be large.
5. Subjective weights across evidence sources (general RCTs vs charity-related RCTs vs M&E) are central to the headline estimates; the procedure is transparent but remains a major source of discretion, especially for StrongMinds where the RCT conflicts with other evidence.
6. Household spillovers contribute ~30–40% of total benefits but are estimated from very sparse and heterogeneous evidence (few interventions, one dominant study) and acknowledged as very low-quality; the spillover prior strongly affects totals.
7. Dosage adjustment for Friendship Bench is driven by extremely low reported attendance (1.12 sessions) and a chosen log(+1) dose-response form; empirical support for this functional form and parameterization is limited.
8. The “Iran study” adjustment treats larger effects as likely biased based on contextual suspicion; while potentially prudent, it risks discarding true contextual heterogeneity and is not validated against independent indicators of study quality.
9. Publication-bias adjustment uses an average of multiple correction methods (some known to perform poorly under heterogeneity); the resulting discount is not clearly tied to the estimand under multilevel dependence and time moderation.
10. The replication (0.51) and response-bias (0.85) discounts for M&E data are generic and not estimated in the charity context; they may double-count some biases or miss others, and materially alter the M&E contribution.
11. StrongMinds RCT relevance assessment is plausible but partly speculative (pilot/partner/adolescents/COVID); the decision to downweight rather than treat as a serious negative update is consequential and could be contested.
12. Cost modeling includes counterfactual partner adjustments for StrongMinds; these assumptions are important to $/WELLBY and would benefit from clearer empirical grounding and uncertainty propagation.
Peterman_et_al._2025
Coda title: Social Safety Nets, Women’s Economic
Coverage: 15% | Precision: 20% | Rating: Poor
LLM Assessment Summary
The human critiques focus on ASQ-3 measurement validity, pre-specification, multiple comparisons, and bias from unblinded parent reports, while the GPT issues largely address meta-analysis/RVE/heterogeneity and evidence-base composition—suggesting it is critiquing a different kind of paper. As a result, overlap is minimal and most GPT points do not align with the expert concerns.
Detailed Comparison Notes
There is slight conceptual overlap only in the general notion of measurement validity/noise and multiple-testing risk, but GPT frames these in terms of meta-regression/moderators rather than the core human concern: many ASQ outcomes/items analyzed without adjustment and without clear PAP pre-specification. The mismatch strongly suggests the GPT key issues were generated for a different manuscript (systematic review/meta-analysis) than the one the human critiques target (an RCT using ASQ-3 in India).
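The multiple-comparisons concern (an overall index, domain indices, and individual ASQ items tested without adjustment) can be illustrated with base R's p.adjust; the p-values below are hypothetical:

```r
# Hypothetical p-values for an overall index, domain indices, and items
p_raw <- c(0.03, 0.04, 0.01, 0.20, 0.08, 0.35,
           0.02, 0.15, 0.60, 0.45, 0.04, 0.07,
           0.30, 0.55, 0.09, 0.25, 0.12, 0.70)

sum(p_raw < 0.05)                             # "significant" without adjustment
sum(p.adjust(p_raw, method = "holm") < 0.05)  # family-wise error control
sum(p.adjust(p_raw, method = "BH") < 0.05)    # false-discovery-rate control
# In this example, five unadjusted "hits" shrink to none under either
# correction, which is why pre-specification or explicit MHT adjustment
# matters when many related outcomes are reported.
```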
⚠️ Issues Human Experts Raised That GPT Missed (6)
Need to establish/mention ASQ-3 regional (India-specific) validation and implications if not validated
Multiple-comparisons problem from reporting many outcomes (overall index, domain indices, individual ASQ items) without correction or clear pre-specification
Departure from the pre-analysis plan (PAP): detailed disaggregation into component indices/items not clearly pre-specified
ASQ-3 is a screening tool typically interpreted via risk cutoffs; paper should report effects on the proportion of children ‘at risk’ of developmental delay (not just mean index scores)
Risk of bias from parent-reported ASQ outcomes in an unblinded trial (demand/expectancy effects) should be highlighted as a key limitation
Recommendation (optional) to include a CONSORT flow diagram for transparency on enrollment/attrition
📋 Extra Issues GPT Identified (11 not in human critique)
Meta-analysis heterogeneity (I²/τ²) and need for prediction intervals/limits to generalization
Internal inconsistencies/typos in pooled effects and confidence intervals requiring reconciliation with code outputs
Questionable harmonization/construct validity when standardizing across very different outcome constructs
Lack of transparency/sensitivity analysis for robust variance estimation (RVE) parameters (e.g., within-study rho, small-sample corrections)
Underpowered moderator/meta-regressions and multiple-testing risk in subgroup analyses; need correction or pre-specified hierarchy
Confounding in modality comparisons (UCT vs CCT vs in-kind) due to correlated design features; causal interpretation should be more cautious
Restriction to RCTs may reduce representativeness for real-world national programs often evaluated quasi-experimentally
Regional/modality concentration of the evidence base limiting external validity for underrepresented regions/modalities
Measurement heterogeneity/noise in ‘agency’ outcomes and its effect on pooled estimates
Sparse/non-harmonized cost-benefit evidence and missing women-specific benefits limiting value-for-money conclusions
Optional but important
DR — I chose the critiques that seemed important to me, and vetted them quickly in NotebookLM, dropping some that the authors already seem to address, at least in the August version. But I’m not an expert in this area.
ASQ regional validation: “it would be useful to show that the ASQ3 has been validated in India.”
Lack of MHT comparisons: “The authors present results for an overall child development index, domain-specific indices, as well as individual ASQ items. They do this without accounting for multiple comparisons or presenting this as a pre-specified plan” DR: I checked the PAPs (3 pdfs) on NotebookLM and it said “the detailed breakdown into the multiple component indices and individual items… was not explicitly included in the PAP’s definition of the outcome.”
“ASQ3 is meant as a screening tool for early childhood development, not a test of child’s abilities. Typically, it relies on cutoffs to determine whether a child is “at risk” of suboptimal development in certain domains. An index score for the ASQ3 were used in the analysis, but it would be useful to see the analysis for differences in proportion of children at risk of developmental delays.”
Possibly relevant but I’m not sure
Consort flow diagram — “the study should include a consort flow diagram”. I’m not sure if this is desirable practice in this context; I don’t think it’s common practice in the economics literature.
“ASQ3 is parent reported. This trial was not blinded, so results could have been biased, which is worth mentioning as a key limitation” They seem to recognize this to an extent, even if it’s not emphasized. At least they discuss the issue of ‘experimental demand effects’ in the appendix and provide evidence suggesting it may not be important.
Limitations for LLM benchmarking: David Reinstein is not an expert on this. Only 1 strong evaluation, and the critiques are not always in the economics wheelhouse.
🤖 View all GPT key issues (11)
1. Extremely high heterogeneity (I² often ~80–95% and τ² > ~0.02) means the pooled mean effect may be a weak policy summary; the paper should more strongly emphasize dispersion/prediction intervals and limits to generalization.
2. Several apparent internal inconsistencies/typos in reported magnitudes and intervals (e.g., public works pooled effect reported with different values; at least one implausible confidence interval) need reconciliation and traceability to code output.
3. Outcome harmonization across very different constructs (savings/assets/expenditures vs autonomy/voice/decision-making) using standardized effects risks conflating mechanisms and complicates interpretation; stronger justification and construct-validity discussion is needed.
4. RVE implementation details are not fully transparent in the main text (e.g., assumed within-study correlation rho, small-sample corrections, sensitivity to rho), which can matter with many dependent effect sizes per study.
5. Moderator/meta-regression analysis appears underpowered for key subgroups and vulnerable to multiple-testing false positives; a pre-specified moderator hierarchy and/or correction strategy would strengthen inference.
6. Modality comparisons (UCT vs CCT vs in-kind, etc.) may be confounded by correlated design features (targeting rules, recipient identity, enforcement intensity, complementary programming, baseline contexts) that are only coarsely measured; causal interpretation of modality differences should be more cautious.
7. Restricting inclusion to RCTs improves internal validity but may reduce representativeness for large-scale national programs commonly evaluated quasi-experimentally; this could bias conclusions about real-world SSN performance.
8. The evidence base is heavily concentrated in certain regions (notably sub-Saharan Africa) and in cash modalities; conclusions about underrepresented modalities (social care, public works, fee waivers) and regions are therefore fragile.
9. Agency measurement is known to be noisy and context-dependent; the paper notes some limitations (e.g., decision-making ambiguity) but does not fully assess how measurement heterogeneity may attenuate or distort pooled results.
10. Cost-benefit evidence is sparse and not harmonized; most studies omit women-specific benefits, making any overall “value for money” implication highly uncertain and potentially a lower bound in unknown ways.
11. Open-science posture is incomplete at present (public repository only “upon acceptance”; extraction form not public), limiting immediate reproducibility and external auditing of coding decisions.
Weaver_et_al._2025
Coda title: Cash Transfers for Child Development: Experimental Evidence from India
Coverage: 70% | Precision: 90% | Rating: Good
LLM Assessment Summary
GPT captures the central expert concern about sanitation subgroup claims and multiple testing/over-interpretation, and partially overlaps on messaging/implementation (via IVR exposure). It misses the key context concern about concurrent programs and the critique about narrative emphasis, but adds several plausible, substantive methodological issues.
Detailed Comparison Notes
On the most important human point (subgroup/heterogeneity and MHT), GPT aligns strongly: it flags both multiple comparisons and the non-random/post-treatment nature of sanitation measurement. The second human point (intervention context and concurrent services) is only partially addressed: GPT focuses on IVR exposure measurement and bundling, but not on other contemporaneous programs (CCTs/health services) that could drive or modify effects.
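The subgroup/MHT concern can also be illustrated by simulation: even when treatment interacts with nothing, testing several subgroup interactions at the 5% level will often turn up at least one 'significant' moderator. A minimal sketch with a hypothetical design (not the trial's data):

```r
set.seed(42)
# Null simulation: outcome unrelated to treatment and to 8 candidate
# moderators; test each treatment x moderator interaction at the 5% level.
# Design parameters are hypothetical, not taken from the trial.
one_trial <- function(n = 1000, k = 8) {
  treat <- rbinom(n, 1, 0.5)
  y     <- rnorm(n)
  any(sapply(1:k, function(j) {
    m   <- rnorm(n)
    fit <- lm(y ~ treat * m)
    coef(summary(fit))[4, 4] < 0.05   # row 4 = treat:m interaction p-value
  }))
}
mean(replicate(500, one_trial()))
# Roughly a third of null trials show at least one "significant" interaction
# (about 1 - 0.95^8), which is why unadjusted subgroup findings need
# cautious framing and explicit MHT handling.
```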
⚠️ Issues Human Experts Raised That GPT Missed (5)
Need more detail on intervention context beyond the IVR/cash bundle—especially how participants were actually encouraged to spend on nutritious foods (implementation fidelity and messaging content/delivery).
Potential confounding/interaction with concurrent programs (e.g., conditional cash transfers and other early-childhood health services) that could affect interpretation of impacts and heterogeneity.
Concern that the paper’s framing/abstract language may overemphasize ‘statistically significant’ sanitation-subgroup anthropometric results even while caveating them in-text (i.e., reader takeaway risk).
Exclusion of breastmilk from child food consumption measurement/accounting (not central, but raised).
Requests for clearer study schematics (CONSORT-style flow chart; DAG) to improve transparency (lower priority, but mentioned).
📋 Extra Issues GPT Identified (10 not in human critique)
Clarification that impacts are for a bundled intervention (cash to women + nutrition framing/IVR), not a generic income shock/unconditional cash effect.
COVID-truncated subsample and mode/composition differences across years potentially affecting comparisons over time.
Parent-reported ASQ outcomes: risk of differential reporting/experimenter-demand effects and need for more objective validation.
Dietary measurement limitations (24-hour recall, portion allocation) and robustness to measurement error/differential misreporting.
IVR exposure measurement is indirect; suggestion to use call delivery/connection data or stronger exposure measures.
Sanitation index reliability concerns (perceptions of neighbors, enumerator subjectivity) and sensitivity to alternative proxies.
‘Explained share’/channel decomposition based on cross-sectional correlations is non-causal and could mislead if interpreted mechanistically.
Non-food spending missing in year 1 (form error) limits welfare/budget share interpretation during transfer receipt.
External validity limits (Jharkhand/ICDS-linked registrants; sample slightly wealthier than state average).
Replication readiness/data-and-code availability and index construction scripts not clearly stated.
📄 View full human critique
Optional but important
Subgroup analysis risks overstating statistical significance without adjustments for MHT. Particular signs of this: Tsai notes “‘nutrition only helped for the high-sanitation subgroup’ claim, noting that ‘other studies in LMICs have compared joint nutrition and sanitation interventions to only nutrition interventions directly, and effects on stunting, wasting, and underweight are not consistently better.’”
DR: Although the authors couch their language about this as “suggestive evidence” of mediation, they emphasize it quite a lot in the abstract, intro, etc — a reasonable reader is likely to have the takeaway that the treatment had positive impacts on anthropometric outcomes in areas with better sanitation (and that these differences were ‘statistically significant’)
Need more detail on the intervention context: E1 asked for more details on the messaging: “How were participants ‘encouraged’ to use the funds for nutritious foods?” Tsai particularly wanted more information on concurrent conditional cash transfers (CCTs) and health services targeting early childhood, noting that there might be interaction effects.
Unsure/maybe less important
MHT issue for multiple indices: E1 “The authors present results for an overall child development index, domain-specific indices, as well as individual ASQ items. They do this without accounting for multiple comparisons or presenting this as a pre-specified plan.” [NotebookLM] — But “The PaPs prioritize specifications and randomization inference for the primary hypotheses, which focus on anthropometric measures”
“Exclusion of breastmilk from child food consumption” accounting. NotebookLM suggests that the authors recognize this limitation.
Both evaluators mentioned including flow charts: a consort diagram (evaluator 1) and a DAG (evaluator 2). DR: I’m not sure whether this is an important standard?
🤖 View all GPT key issues (12)
1. Clarify that the estimated effects are for a bundled intervention (cash delivered to women + nutrition framing/IVR), not a generic income shock; claims about “unconditional cash” should be qualified accordingly.
2. Sanitation heterogeneity relies on non-random, post-treatment sanitation measurement; interaction estimates should be presented as suggestive and vulnerable to confounding by correlated area characteristics (health access, markets, disease ecology), despite balance on treatment and residualization checks.
3. Year-2 calorie/nutrient outcomes are based on a COVID-truncated in-person subsample; comparisons across years (e.g., “empowerment grows over time”) may be partly driven by sample composition and survey mode changes.
4. Child functional development is measured via parent-reported ASQ items; differential reporting/experimenter-demand effects cannot be fully ruled out without more objective assessor-based measures or validation subsamples.
5. Dietary intake measurement, while unusually detailed, still depends on 24-hour recall and complex allocation/portion measurement; the paper could better assess robustness to measurement error and potential differential misreporting.
6. Evidence that IVR messaging had little effect is indirect (based on having registered a mobile number); stronger tests using call delivery/connection data or instruments for actual exposure would improve interpretability.
7. The manuscript analyzes many outcomes and heterogeneities; it should more explicitly define primary families, address multiple-hypothesis testing (beyond indices), and distinguish confirmatory vs exploratory findings.
8. Post-treatment sanitation index includes respondent perceptions of neighbors and enumerator observations; reliability and potential measurement error (including enumerator subjectivity) deserve more discussion and sensitivity analyses using alternative sanitation proxies.
9. The paper infers that intermediate-channel changes (knowledge, empowerment, AWC services) explain little via cross-sectional correlations; this “explained share” approach is not causal and may be misleading if interpreted too literally.
10. Non-food spending was not measured in year 1 due to a survey form error; this limits the welfare/accounting interpretation (budget shares, non-food mechanisms) during the key period of transfer receipt.
11. External validity beyond Jharkhand and beyond ICDS-linked registrants is good but not complete; the sample is slightly wealthier than the state average and depends on engagement with AWCs, which may differ in other states/settings.
12. Replication readiness is unclear from the manuscript text excerpt: despite registry/PAPs, the paper should state whether code/data (or restricted-access procedures) are available and provide full construction scripts for complex nutrition and sanitation indices.
Williams_et_al._2024
Coda title: Global potential for natural regeneration in deforested tropical regions
Coverage: 35% | Precision: 92% | Rating: Moderate
LLM Assessment Summary
GPT captures the socioeconomic–biophysical confounding concern and label/omission-error problems, but misses the central expert-flagged threats to validity: contemporaneous-predictor leakage and the flawed future-oriented “validation,” plus several other major human critiques (intensive margin, outdated GFC gain reliance, and missing hindcast/benchmark reconciliation).
Detailed Comparison Notes
Several GPT points are high-quality, paper-relevant methodological critiques (spatial CV, uncertainty propagation, resolution mismatch), but they do not substitute for directly identifying leakage from predictors recorded during/after the regrowth window, which the human experts treat as necessary and potentially fatal to causal/predictive interpretation. GPT’s discussion of temporal inconsistency (e.g., covariates updated to 2018) is related to time-mismatch but is not the same as the experts’ leakage concern about predictors observed contemporaneously with the outcome.
⚠️ Issues Human Experts Raised That GPT Missed (6)
Primary concern: data leakage / contemporaneous predictors — key predictors (e.g., NPP, burned area, road density, some land cover) are measured over the same period as (or overlapping with) the 2000–2012/2016 regrowth outcome, potentially encoding post-outcome information and inflating performance.
“Validation” problem: comparing a 2030-oriented prediction/claim against historical regrowth observations (2000–2016) is not a true validation of future predictive ability.
Neglect of the intensive margin of regrowth (canopy recovery / biomass increases within existing forest pixels), focusing only on extensive-margin new forest gain.
Reliance on an older GFC gain product (v1.5) when newer improved datasets (e.g., incorporating GEDI) exist, potentially affecting label quality and conclusions.
Need for a hindcast and reconciliation of magnitude: the paper does not report model-predicted regrowth for 2000–2016 to benchmark against Fagan et al. (2022), despite a large discrepancy (e.g., 215 Mha vs ~31.6 Mha ± 11.9 Mha).
Definition issue: “natural regrowth” inherits Fagan et al.’s definition that may conflate natural regeneration with human-assisted regeneration (beyond just plantations).
📋 Extra Issues GPT Identified (8 not in human critique)
Lack of calibration/threshold tradeoff reporting (precision–recall, calibration curves), important when converting probabilities to areas and decisions.
Insufficient spatially blocked cross-validation (beyond random splits), with risk of spatial autocorrelation inflating performance estimates.
Mixed spatial/temporal resolution predictors presented at 30 m outputs, risking fine-scale artifacts and overconfidence in local precision.
Temporal inconsistency from updating covariates to later years (e.g., 2018) while training on 2000–2016 outcomes.
Carbon accumulation methodology concerns (resampling 1 km carbon layers to 30 m without uncertainty/covariance handling).
Reproducibility limitation: code only available “on request.”
Risk of policy/operational misinterpretation (map could be read as identifying realizable/additional/permanent sequestration opportunities without sufficient caveats).
📄 View full human critique
Necessary
Data leakage/contemporaneous predictors [mentioned: both evaluators and UJ field specialists] — The ML model seems to be fitted on data/variables that includes post-regeneration outcomes, or that could be influenced by these outcomes; or “using future information to predict past events, or accidentally including validation data in the training data.” This can lead to overly optimistic estimates of model performance and poor generalization to new data.
Here, the issue is the inclusion of “Net primary production, burned area, road density, and soil characteristics were recorded over the same time period as the outcome, and these predictors incorporate information that is influenced by regeneration itself.”
[E1] The outcome variable indicates patches of natural forest regeneration which occurred between 2000–2012 and persisted to 2016. At least three of the predictor variables seen by the training algorithm are also recorded during this period rather than prior to it: net primary production, burned area, and road density. In addition, the authors use variables from the ESA CCI land cover dataset as recorded in the year 2000, overlapping with the outcome.
Optional but important
Reliance on Older GFC Gain Data: [Cloud] The study relies on an older version of the Global Forest Change (GFC) gain data (v1.5, covering 2000-2012/2016). Since the initial analysis for Fagan et al. (2022), improved GFC datasets have been released (e.g., GMD 2022 update using GEDI lidar).
“Confounding by Socioeconomic Factors and Predictor Choice:” [Both evaluators agreed] While the authors train models with biophysical and socioeconomic predictors, their spatial predictions are derived from a model that includes only biophysical variables. Inferring purely biophysical potential from observational data is not feasible with the proposed method. The difficulty lies in the fact that biophysical and socioeconomic conditions are deeply interconnected. In practice, the model will predict high regeneration probability in locations where both factors are favorable. The proposed method cannot support the interpretation of the estimates as ‘purely biophysical potential for natural forest regrowth’.
Neglect of Intensive Margin Regrowth: [Cloud] The study focuses exclusively on the extensive margin (new forest area gain). It completely overlooks the intensive margin – canopy regeneration and forest recovery within existing, potentially degraded, forest pixels.
“Validation” uses past data [Both evaluators/consensus] — The confusion matrix appears to validate the model by comparing its predictions for 2030 against the Fagan et al. (2022) regrowth data for 2000-2016. Validating a model designed to predict future potential against past outcomes is not a true test of its predictive power for the future.
Unsure if correct/relevant
Definition of Natural Regrowth (Inherited from Fagan et al.) [Cloud]: The model relies on Fagan et al.’s definition, which treats all non-plantation gain as “natural regrowth,” conflating truly natural processes with human-assisted regeneration.
Overestimation due to Omission Error (Inherited from Fagan et al.): Fagan et al. reported very high omission errors for regrowth (low area-based Producer’s Accuracy of ~18.7%). The Williams et al. potential area (215 Mha) includes already-regrown areas, substantially overestimating the area truly available for future regeneration.
Discrepancy in Estimated Magnitude and Lack of Historical Validation: [Both evaluators] The authors’ estimate of 215 Mha of expected natural regeneration is strikingly larger than the benchmark from Fagan et al. (2022) of 31.6 Mha ± 11.9 Mha (for 2000-2016). Williams et al. do not report their own model’s prediction for the 2000-2016 historical period (a “hindcast”), a crucial missing validation against this benchmark.
🤖 View all GPT key issues (12)
1. The target variable (“natural regrowth” as >5 m vegetation gain from 2000–2016) is a partial proxy for restoration success and likely misses early regeneration and slow-growing systems; it also does not ensure persistence beyond 2016.
2. The upstream regrowth map used for labels has acknowledged humid-biome omission and low producer’s accuracy for area-adjusted estimates, creating systematic false negatives that can bias model training and geographic patterns.
3. “Non-regeneration” points are sampled from a broad domain that likely includes lands under active management or repeated clearing; thus the model mixes biophysical capacity, human land-use decisions, and detectability rather than isolating biophysical potential.
4. The paper reports high balanced-class accuracy but provides limited information on calibration and threshold-dependent tradeoffs (precision/recall), which are central when aggregating probabilities into expected areas and when using binary maps for targeting.
5. Validation uses random splits with a post hoc autocorrelation-distance diagnostic but not fully spatially blocked cross-validation; performance may be optimistic and appears regionally variable (notably weaker in parts of Southeast Asia).
6. The extremely narrow confidence intervals on the headline area estimates are not credible as full uncertainty bounds because they omit major sources of uncertainty (label noise, transfer/non-stationarity, predictor error/resolution mismatch, and model specification uncertainty).
7. Predictor rasters have mixed spatial/temporal resolutions (30 m to ~1 km; climate normals 1970–2000; NPP 2000–2015; burn area 2001–2017), yet outputs are presented at 30 m; this can create visually fine-scale artifacts and overconfidence in local precision.
8. Updating key forest covariates to 2018 while training on 2000–2016 outcomes introduces temporal inconsistency; combined with known version differences in GFW products, this may yield local incoherence and affect transfer to 2015/2030 predictions.
9. Dropping socioeconomic covariates for prediction stability may increase omitted-variable bias, with biophysical variables acting as proxies for land-use intensity/governance; this complicates the “biophysical conditions can support” interpretation.
10. Carbon accumulation estimates rely on resampling a 1 km carbon potential layer to 30 m with implicit within-km homogeneity and without propagating uncertainty or accounting for covariance between regeneration probability and carbon potential.
11. The map is framed as a first-stage input, but discussion of offsets/targeting could be misread as identifying realizable, additional, and permanent sequestration opportunities; clearer operational guidance on use and limits would reduce misuse.
12. Code is only available “on request,” limiting reproducibility and independent auditing of modelling choices (variable selection, sampling, preprocessing, and aggregation).
LLM-Based Assessment
The comparison between GPT key issues and human critiques was assessed using GPT-5.2 Pro, which evaluated coverage (what proportion of human concerns GPT identified) and precision (whether GPT issues are substantive).
Note: Any interpretation or narrative commentary in this section is written by an LLM (Codex). The values themselves come from the GPT-5.2 Pro comparison outputs.
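For readers who want to sanity-check the headline numbers, the sketch below derives rough coverage and precision figures from the matched/unmatched issue counts. It assumes the enhanced comparison format used by this chapter's rendering code (`matched_pairs`, `unmatched_human`, `unmatched_llm`); the reported `coverage_pct` and `precision_pct` are assigned directly by GPT-5.2 Pro, so this is only an approximate cross-check, not the source of the reported metrics.

```r
# Approximate coverage/precision from issue counts -- a cross-check sketch only.
library(tidyverse)
library(jsonlite)

results <- fromJSON("results/key_issues_comparison_results.json", simplifyVector = FALSE)

approx_metrics <- map_dfr(results, function(r) {
  comp <- r$comparison
  # LLM issues referenced by at least one matched pair
  matched_llm_idx <- unique(unlist(map(comp$matched_pairs, "llm_issue_indices")))
  tibble(
    gpt_paper          = r$gpt_paper,
    approx_coverage    = 100 * length(comp$matched_pairs) /
                         max(length(comp$matched_pairs) + length(comp$unmatched_human), 1),
    approx_precision   = 100 * length(matched_llm_idx) /
                         max(length(matched_llm_idx) + length(comp$unmatched_llm), 1),
    reported_coverage  = if (is.null(comp$coverage_pct)) NA_real_ else as.numeric(comp$coverage_pct),
    reported_precision = if (is.null(comp$precision_pct)) NA_real_ else as.numeric(comp$precision_pct)
  )
})
```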
Show code
# Check if LLM comparison results are available
has_llm_results <- !is.null(comparison_data) && n_papers > 0 &&
  "coverage_pct" %in% names(comparison_data) &&
  any(!is.na(comparison_data$coverage_pct))
if (!is.null(all_human_issues) && nrow(all_human_issues) > 0) {
  # Define severity colors
  sev_colors <- c(
    "Necessary" = "#e74c3c", "Optional" = "#f39c12",
    "Unsure" = "#95a5a6", "Unclassified" = "#bdc3c7"
  )
  ggplot(severity_dist, aes(x = severity, y = n, fill = severity)) +
    geom_col(alpha = 0.9) +
    geom_text(aes(label = n), vjust = -0.3, size = 5) +
    scale_fill_manual(values = sev_colors, drop = FALSE) +
    labs(x = "Severity Label", y = "Number of Issues") +
    theme_uj() +
    theme(legend.position = "none")
}
Figure 4.3: Distribution of human issue severity labels
Issue Count Distribution
Show code
if (!is.null(comparison_data) && n_papers > 0) {
  issue_counts |>
    ggplot(aes(x = reorder(paper_short, num_gpt_issues), y = num_gpt_issues)) +
    geom_col(fill = UJ_ORANGE, alpha = 0.8) +
    geom_hline(
      yintercept = mean(issue_counts$num_gpt_issues),
      linetype = "dashed", color = UJ_BLUE, linewidth = 1
    ) +
    coord_flip() +
    labs(
      x = NULL,
      y = "Number of Key Issues",
      subtitle = paste0(
        "Dashed line = mean (",
        round(mean(issue_counts$num_gpt_issues), 1), " issues)"
      )
    ) +
    theme_uj()
}
Figure 4.4: Number of key issues identified by GPT-5.2 Pro per paper
Coda Critique Length vs GPT Issue Count
Show code
if (!is.null(comparison_data) && n_papers > 0) {
  ggplot(issue_counts, aes(x = coda_critique_length, y = num_gpt_issues)) +
    geom_point(size = 3, color = UJ_ORANGE, alpha = 0.7) +
    geom_smooth(method = "lm", se = TRUE, color = UJ_BLUE, alpha = 0.2) +
    labs(
      x = "Human Critique Length (characters)",
      y = "GPT-5.2 Pro Issue Count"
    ) +
    theme_uj()
}
Figure 4.5: Relationship between human critique detail and GPT issue count
Observable Structural Differences
The following are verifiable structural differences between the two data sources (not assessments of quality or coverage):
| Aspect | GPT-5.2 Pro | Human Expert (Coda) |
|--------------------|-----------------------------------------------------|----------------------------------------------------------------------|
| Format | Numbered bullet points (array of strings) | Free-form prose |
| Structure | Ordered list (prompted: "most to least important") | Often uses severity labels ("Necessary", "Optional but important") |
| Source attribution | None (single model output) | Often cites specific evaluators (E1, E2, names) |
| Length | Constrained by prompt (~10-12 issues) | Unconstrained, highly variable |
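To make the format contrast concrete, the short sketch below loads the same matched-comparison file used elsewhere in this chapter and contrasts the structured GPT issue lists with the length of the unstructured Coda prose. It is a convenience check, not part of the assessment pipeline.

```r
# Structural contrast: list of discrete GPT issues vs. free-form Coda critique text.
library(tidyverse)
library(jsonlite)

comparison_data <- fromJSON("results/key_issues_comparison.json")

comparison_data |>
  transmute(
    paper           = str_trunc(gpt_paper, 30),
    gpt_issue_count = map_int(gpt_key_issues, length),  # structured array of strings
    coda_chars      = coda_critique_length              # unstructured prose length
  ) |>
  arrange(desc(coda_chars))
```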
Questions for Manual Review
The side-by-side comparisons above are provided for manual expert assessment. Key questions to investigate:
1. Coverage: What proportion of human-identified issues appear in the GPT output, in some form and to some extent? (This is the focal question.)
2. Precision: Are GPT issues substantive, or does the model identify spurious or generic concerns?
3. Severity alignment: Does GPT's importance ordering correlate with human severity labels? (See the sketch after this list.)
4. Missed issues: Are there critical human concerns that GPT systematically misses?
5. Novel issues: Does GPT surface valid concerns that humans overlooked?
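For question 3, one possible check (a sketch only, with toy rows standing in for the real annotation-tool export) is to compare the best-ranked LLM issue linked to each human issue across severity groups. The column names here are illustrative and may differ from the actual export.

```r
library(tidyverse)

# Toy rows in place of the annotation export: `severity` is the normalized human
# label; `llm_links` holds the linked LLM issue indices (L1 = GPT's top-ranked issue).
annotations <- tibble(
  severity  = c("necessary", "necessary", "optional", "unsure"),
  llm_links = list(c(2L), c(1L, 4L), c(7L), integer(0))
)

annotations |>
  mutate(
    best_llm_rank = map_dbl(llm_links, \(idx) if (length(idx) == 0) NA_real_ else as.numeric(min(idx))),
    severity      = factor(severity, levels = c("necessary", "optional", "unsure"))
  ) |>
  group_by(severity) |>
  summarise(
    n_issues         = n(),
    share_matched    = mean(!is.na(best_llm_rank)),
    median_best_rank = median(best_llm_rank, na.rm = TRUE),
    .groups = "drop"
  )
# If GPT's ordering tracks human severity, "necessary" issues should show higher
# match rates and earlier (smaller) best ranks than "optional" or "unsure" ones.
```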
Show code
if (has_llm_results) {
  cat('::: {.callout-tip}\n')
  cat('## LLM Assessment Complete\n\n')
  cat('The comparisons above have been assessed using GPT-5.2 Pro. See the [LLM-Based Assessment](#sec-llm-assessment) section for coverage and precision metrics.\n')
  cat(':::\n')
} else {
  cat('::: {.callout-warning}\n')
  cat('## No Automated Assessment Yet\n\n')
  cat('This page displays raw data for manual review. The claims about coverage, precision, and alignment **have not been assessed**—they require either:\n\n')
  cat('- Manual expert review of each paper\'s comparisons\n')
  cat('- LLM-based comparison (run the `key-issues-comparison` chunk in `methods.qmd`)\n\n')
  cat('Any conclusions about "high overlap" or "systematic differences" should emerge from that analysis, not be assumed.\n')
  cat(':::\n')
}
LLM Assessment Complete
The comparisons above have been assessed using GPT-5.2 Pro. See the LLM-Based Assessment section for coverage and precision metrics.
Manual Annotation Tool
To systematically assess concordance between human and LLM critiques, use the Issue Annotation Tool. (Jan 8 2026: we are working on a hosted tool; for now, follow the instructions below.)
# 1. Generate annotation data (parses human critiques into individual issues)
python3 tools/build_issue_annotation_data.py

# 2. Open the annotation UI in a browser
open tools/issue_annotation_ui/index.html
Annotation workflow:
For each human-identified issue:
1. Match score (0-1): How well do LLM issues capture this concern?
   - 0 = Not addressed at all
   - 0.5 = Partially captured or tangentially related
   - 1 = Fully captured by one or more LLM issues
2. Confidence (0-1): How certain are you of this assessment?
3. Context flag: Check this box if the human critique references information the LLM didn't have (e.g., appendix or preregistration materials not shared with the LLM when this evaluation was run)
4. Link to LLM issues: Select which LLM issues (L1, L2, …) correspond to this human issue
5. Discussion: Explain your reasoning, note partial matches, or flag ambiguities
Export annotations as JSON/CSV for analysis. Annotations are auto-saved to browser localStorage.
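Once annotations are exported, a summary along the following lines could approximate per-paper coverage from the manual match scores. This is a sketch under assumed column names (`paper_id`, `match_score`, `confidence`, `context_flag`) and an illustrative file name; check both against the actual export before use.

```r
library(tidyverse)

# Hypothetical export columns: paper_id, match_score (0/0.5/1),
# confidence (0-1), context_flag (TRUE/FALSE).
annot <- read_csv("annotations_export.csv", show_col_types = FALSE)  # path is illustrative

annot |>
  group_by(paper_id) |>
  summarise(
    n_issues        = n(),
    manual_coverage = mean(match_score),                       # 0-1 analogue of coverage
    weighted_cov    = weighted.mean(match_score, confidence),  # downweight low-confidence calls
    n_context_gaps  = sum(context_flag),                       # issues the LLM lacked inputs for
    .groups = "drop"
  ) |>
  arrange(manual_coverage)
```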
Annotation Data Schema
The tool parses human critiques using heuristics:
- Severity labels normalized to: necessary, optional, unsure
- Evaluator attributions (E1, E2, DR) preserved in issue text
- Issue boundaries detected via enumeration, sentence breaks, and section headers
Review parsed issues in the UI and edit/add/remove as needed before annotating.
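For orientation, the sketch below shows the rough shape of one parsed-issue record (full text, normalized severity, short label) and a quick severity tally over a list of such records. The field names follow the parser's output; the example content is illustrative.

```r
library(tidyverse)

# Illustrative record shape for a single parsed issue.
example_issue <- list(
  text        = "Data leakage/contemporaneous predictors: key predictors are measured over the same period as the outcome ...",
  severity    = "necessary",
  short_label = "Data leakage/contemporaneous predictors"
)

parsed_issues <- list(example_issue)  # in practice: all parsed issues for a paper

tibble(severity = map_chr(parsed_issues, "severity")) |>
  count(severity)  # expect only "necessary", "optional", "unsure", or ""
```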