[
  {
    "id": "an-overview-of-the-bioasq-large-scale-biomedical-semantic-indexing-and-question--2015",
    "title": "An overview of the BioASQ large-scale biomedical semantic indexing and question answering competition",
    "year": 2015,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🩺 Medical reasoning / health rubrics",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "deep-reinforcement-learning-from-human-preferences-2017",
    "title": "Deep reinforcement learning from human preferences",
    "year": 2017,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "judgment_required"
    ],
    "supervision_granularity": [
      "pairwise_preference",
      "scalar_reward"
    ],
    "training_use": [
      "reward_modeling",
      "preference_learning"
    ],
    "domains": [
      "alignment"
    ],
    "category": [
      "foundations_instruction_preference_alignment",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🤝 Human preference data / RLHF",
    "tags": [
      "seeded-from-bib",
      "rlhf",
      "preference-data",
      "reward-modeling"
    ],
    "one_line_summary": "Shows how human preference comparisons can train reward models for reinforcement learning.",
    "why_it_matters": "It is a foundation for later post-training data records that turn comparisons into trainable reward signals.",
    "data_object": "pairwise preference; scalar reward",
    "feedback_verifier": "judgment required",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/1706.03741",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/1706.03741",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/1706.03741"
  },
  {
    "id": "data-statements-for-natural-language-processing-2018",
    "title": "Data statements for natural language processing",
    "year": 2018,
    "venue": "TACL",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "audit"
    ],
    "domains": [
      "data_documentation",
      "nlp"
    ],
    "category": [
      "surveys_and_primers",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "📦 Data documentation / datasheets",
    "tags": [
      "seeded-from-bib",
      "dataset-documentation",
      "provenance",
      "nlp"
    ],
    "one_line_summary": "Proposes data statements for NLP datasets, foregrounding language, speaker/community provenance, annotation context, and intended deployment boundaries.",
    "why_it_matters": "Reasoning-data users need this lens when a corpus mixes web text, synthetic questions, human annotations, or domain-specific tasks whose population assumptions affect generalization.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://aclanthology.org/Q18-1041/",
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": "https://aclanthology.org/Q18-1041/",
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/data-statements-for-natural-language-processing.md"
    },
    "primary_link": "https://aclanthology.org/Q18-1041/"
  },
  {
    "id": "datasheets-for-datasets-2018",
    "title": "Datasheets for datasets",
    "year": 2018,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "audit"
    ],
    "domains": [
      "data_documentation"
    ],
    "category": [
      "surveys_and_primers",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "📦 Data documentation / datasheets",
    "tags": [
      "seeded-from-bib",
      "dataset-documentation",
      "provenance",
      "audit"
    ],
    "one_line_summary": "Introduces dataset datasheets: a structured documentation template for provenance, composition, collection process, recommended uses, and limitations.",
    "why_it_matters": "It gives reasoning-data releases a minimum disclosure standard before anyone reuses prompts, traces, labels, rewards, or benchmark items.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/1803.09010",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/1803.09010",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/datasheets-for-datasets.md"
    },
    "primary_link": "https://arxiv.org/abs/1803.09010"
  },
  {
    "id": "holist-an-environment-for-machine-learning-of-higher-order-logic-theorem-proving-2019",
    "title": "HOList: An environment for machine learning of higher-order logic theorem proving",
    "year": 2019,
    "venue": "ICML",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof",
      "environmental_agents_tools_web_swe"
    ],
    "subfield": "🧾 Formal proof / Lean / theorem proving",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "pubmedqa-a-dataset-for-biomedical-research-question-answering-2019",
    "title": "PubMedQA: A dataset for biomedical research question answering",
    "year": 2019,
    "venue": "EMNLP",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🩺 Medical reasoning / health rubrics",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "a-primer-in-bertology-what-we-know-about-how-bert-works-2020",
    "title": "A primer in BERTology: What we know about how BERT works",
    "year": 2020,
    "venue": "TACL",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers"
    ],
    "subfield": "🧭 Post-training surveys",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "fact-or-fiction-verifying-scientific-claims-2020",
    "title": "Fact or fiction: Verifying scientific claims",
    "year": 2020,
    "venue": "EMNLP",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "retrieval-augmented-generation-for-knowledge-intensive-nlp-tasks-2020",
    "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    "year": 2020,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "casehold-a-dataset-for-legal-holding-statement-prediction-2021",
    "title": "CaseHOLD: A dataset for legal holding statement prediction",
    "year": 2021,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "⚖️ Legal reasoning",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "contractnli-a-dataset-for-document-level-natural-language-inference-for-contract-2021",
    "title": "ContractNLI: A dataset for document-level natural language inference for contracts",
    "year": 2021,
    "venue": "EMNLP",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "cuad-an-expert-annotated-nlp-dataset-for-legal-contract-review-2021",
    "title": "CUAD: An expert-annotated NLP dataset for legal contract review",
    "year": 2021,
    "venue": "NeurIPS Datasets and Benchmarks",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "⚖️ Legal reasoning",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "evaluating-large-language-models-trained-on-code-2021",
    "title": "Evaluating large language models trained on code",
    "year": 2021,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "benchmark",
      "data_release",
      "scaling_study"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "evaluation",
      "test_time_compute"
    ],
    "domains": [
      "code",
      "software_engineering"
    ],
    "category": [
      "programmatic_math_code_proof",
      "benchmarks_evaluation"
    ],
    "subfield": "💻 Code execution / unit-test data",
    "tags": [
      "seeded-from-bib",
      "humaneval",
      "code",
      "unit-tests",
      "benchmark"
    ],
    "one_line_summary": "The Codex evaluation paper introduces HumanEval and studies code generation through functional correctness, repeated sampling, and pass@k.",
    "why_it_matters": "It connects code reasoning data to executable verification: generated programs are judged by tests, not by surface similarity to reference solutions.",
    "data_object": "executable Python function.; process: prompt, generated code, unit-test results, sample count.; Python execution sandbox and test suite.",
    "feedback_verifier": "HumanEval tests and pass@k evaluation.",
    "audit_focus": "Small public benchmarks are easy to memorize., Unit tests can miss incorrect or insecure behavior., Repeated sampling can hide low single-sample reliability.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2107.03374",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2107.03374",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/openai/human-eval",
      "data": "https://github.com/openai/human-eval",
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/benchmarks/evaluating-large-language-models-trained-on-code.md"
    },
    "primary_link": "https://arxiv.org/abs/2107.03374"
  },
  {
    "id": "finetuned-language-models-are-zero-shot-learners-2021",
    "title": "Finetuned language models are zero-shot learners",
    "year": 2021,
    "venue": "ICLR",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "finqa-a-dataset-of-numerical-reasoning-over-financial-data-2021",
    "title": "FinQA: A dataset of numerical reasoning over financial data",
    "year": 2021,
    "venue": "EMNLP",
    "authors": [],
    "source_role": [
      "benchmark",
      "data_release"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level",
      "step_level"
    ],
    "training_use": [
      "evaluation",
      "sft"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🏦 Financial reasoning",
    "tags": [
      "seeded-from-bib",
      "finance",
      "numerical-reasoning",
      "tabular-textual-qa"
    ],
    "one_line_summary": "FinQA introduces financial-document QA with expert-written questions and gold reasoning programs for numerical reasoning over financial reports.",
    "why_it_matters": "It is a finance-domain reasoning benchmark where the data object includes questions, evidence from financial reports, answers, and reasoning programs rather than only free-form responses.",
    "data_object": "answer level; step level",
    "feedback_verifier": "mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L2_artifact_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://aclanthology.org/2021.emnlp-main.300/",
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": "https://aclanthology.org/2021.emnlp-main.300/",
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": "https://finqasite.github.io/",
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://aclanthology.org/2021.emnlp-main.300/"
  },
  {
    "id": "measuring-coding-challenge-competence-with-apps-2021",
    "title": "Measuring coding challenge competence with APPS",
    "year": 2021,
    "venue": "NeurIPS",
    "authors": [
      "Dan Hendrycks",
      "Steven Basart",
      "Saurav Kadavath",
      "Mantas Mazeika",
      "Akul Arora",
      "Ethan Guo",
      "Collin Burns",
      "Samir Puranik",
      "Horace He",
      "Dawn Song",
      "Jacob Steinhardt"
    ],
    "source_role": [
      "benchmark",
      "data_release"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "evaluation",
      "sft",
      "test_time_compute"
    ],
    "domains": [
      "code",
      "programming",
      "unit-tests"
    ],
    "category": [
      "programmatic_math_code_proof",
      "benchmarks_evaluation"
    ],
    "subfield": "💻 Code execution / unit-test data",
    "tags": [
      "code",
      "programming",
      "seeded-from-bib",
      "unit-tests"
    ],
    "one_line_summary": "APPS evaluates code-generation competence with 10,000 programming problems checked by executable test cases.",
    "why_it_matters": "It is a pre-HumanEval large-scale code benchmark where the feedback-bearing object is a problem statement, generated program, and unit-test outcome.",
    "data_object": "Python code submission evaluated against test cases.; process: difficulty, prompt, starter code where available, generated solution, public/hidden test outcomes.; offline programming benchmark with executable Python tests.",
    "feedback_verifier": "unit-test pass/fail signal.",
    "audit_focus": "Programs can overfit weak tests., Syntax validity is not the same as functional correctness., Contamination can inflate code benchmark scores.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2105.09938",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2105.09938",
      "openreview": "https://openreview.net/forum?id=sD93GOzH3i5",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/hendrycks/apps",
      "data": "https://github.com/hendrycks/apps",
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/benchmarks/apps.md"
    },
    "primary_link": "https://arxiv.org/abs/2105.09938"
  },
  {
    "id": "measuring-mathematical-problem-solving-with-the-math-dataset-2021",
    "title": "Measuring mathematical problem solving with the MATH dataset",
    "year": 2021,
    "venue": "NeurIPS Datasets and Benchmarks",
    "authors": [],
    "source_role": [
      "benchmark",
      "data_release"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "evaluation",
      "sft"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "programmatic_math_code_proof",
      "benchmarks_evaluation"
    ],
    "subfield": "📐 Math answer-verifiable data",
    "tags": [
      "seeded-from-bib",
      "math",
      "benchmark",
      "dataset"
    ],
    "one_line_summary": "Introduces MATH, a competition-style math benchmark with challenging problems, subject categories, and step-by-step solutions.",
    "why_it_matters": "MATH became a central answer-verifiable surface for evaluating and training advanced mathematical reasoning beyond grade-school word problems.",
    "data_object": "answer level",
    "feedback_verifier": "programmatic",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2103.03874",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2103.03874",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/hendrycks/math",
      "data": "https://github.com/hendrycks/math",
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/benchmarks/measuring-mathematical-problem-solving-with-the-math-dataset.md"
    },
    "primary_link": "https://arxiv.org/abs/2103.03874"
  },
  {
    "id": "minif2f-a-cross-system-benchmark-for-formal-olympiad-level-mathematics-2021",
    "title": "miniF2F: A cross-system benchmark for formal olympiad-level mathematics",
    "year": 2021,
    "venue": "ICLR",
    "authors": [
      "Kunhao Zheng",
      "Jesse Michael Han",
      "Stanislas Polu"
    ],
    "source_role": [
      "benchmark",
      "data_release"
    ],
    "verification_contract": [
      "programmatic",
      "environmental"
    ],
    "supervision_granularity": [
      "answer_level",
      "step_level"
    ],
    "training_use": [
      "evaluation",
      "agent_training",
      "sft"
    ],
    "domains": [
      "formal-math",
      "theorem-proving",
      "proof-assistants"
    ],
    "category": [
      "programmatic_math_code_proof",
      "benchmarks_evaluation"
    ],
    "subfield": "🧾 Formal proof / Lean / theorem proving",
    "tags": [
      "formal-math",
      "proof-assistants",
      "seeded-from-bib",
      "theorem-proving"
    ],
    "one_line_summary": "miniF2F is a cross-system formal mathematics benchmark for comparing theorem provers across Lean, Metamath, Isabelle, and HOL Light targets.",
    "why_it_matters": "It is a compact formal-proof evaluation surface where the verifier is not a text judge but a proof assistant accepting or rejecting a proof.",
    "data_object": "formal proof accepted by a target proof assistant.; process: formal system, theorem statement, split, generated proof/tactics, verifier result.; Lean, Metamath, Isabelle, and HOL Light style theorem proving environments.",
    "feedback_verifier": "proof assistant kernel/checker acceptance.",
    "audit_focus": "A theorem can be easier in one formal system than another., Search budget can dominate model differences., Forks can drift from the original benchmark.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2109.00110",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2109.00110",
      "openreview": "https://openreview.net/forum?id=9ZPegFuFTFv",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/openai/miniF2F",
      "data": "https://github.com/openai/miniF2F",
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/benchmarks/minif2f.md"
    },
    "primary_link": "https://arxiv.org/abs/2109.00110"
  },
  {
    "id": "multitask-prompted-training-enables-zero-shot-task-generalization-2021",
    "title": "Multitask prompted training enables zero-shot task generalization",
    "year": 2021,
    "venue": "ICLR",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "qasper-a-dataset-of-information-seeking-questions-and-answers-over-scientific-re-2021",
    "title": "Qasper: A dataset of information-seeking questions and answers over scientific research papers",
    "year": 2021,
    "venue": "NAACL",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "benchmarks_evaluation"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "tat-qa-a-question-answering-benchmark-on-a-hybrid-of-tabular-and-textual-content-2021",
    "title": "TAT-QA: A question answering benchmark on a hybrid of tabular and textual content in finance",
    "year": 2021,
    "venue": "ACL",
    "authors": [],
    "source_role": [
      "benchmark",
      "data_release"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level",
      "step_level"
    ],
    "training_use": [
      "evaluation",
      "sft"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🏦 Financial reasoning",
    "tags": [
      "seeded-from-bib",
      "finance",
      "table-text-reasoning",
      "numerical-reasoning"
    ],
    "one_line_summary": "TAT-QA evaluates numerical reasoning over hybrid financial tables and text, with annotated derivations and answers.",
    "why_it_matters": "It gives legal/finance-style domain reasoning a concrete benchmark surface where evidence selection, table-text grounding, arithmetic, and answer normalization all matter.",
    "data_object": "answer level; step level",
    "feedback_verifier": "mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L2_artifact_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://aclanthology.org/2021.acl-long.254/",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2105.07624",
      "openreview": null,
      "acl": "https://aclanthology.org/2021.acl-long.254/",
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/NExTplusplus/TAT-QA",
      "data": "https://github.com/NExTplusplus/TAT-QA",
      "huggingface": null,
      "project": "https://nextplusplus.github.io/TAT-QA/",
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://aclanthology.org/2021.acl-long.254/"
  },
  {
    "id": "training-verifiers-to-solve-math-word-problems-2021",
    "title": "Training verifiers to solve math word problems",
    "year": 2021,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "benchmark",
      "verifier_reward",
      "data_release"
    ],
    "verification_contract": [
      "programmatic",
      "judgment_required"
    ],
    "supervision_granularity": [
      "answer_level",
      "scalar_reward"
    ],
    "training_use": [
      "evaluation",
      "reward_modeling"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "process_supervision_prm"
    ],
    "subfield": "🧪 Process reward models",
    "tags": [
      "seeded-from-bib",
      "gsm8k",
      "math",
      "verifier",
      "benchmark"
    ],
    "one_line_summary": "Introduces GSM8K and trains verifier models to rank model-generated math solutions by likely correctness.",
    "why_it_matters": "It anchors answer-level math reasoning data as a pair of problem, solution, and verifier-selection signal, anticipating RLVR and reward-model workflows.",
    "data_object": "answer level; scalar reward",
    "feedback_verifier": "programmatic, judgment required",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2110.14168",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2110.14168",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/openai/grade-school-math",
      "data": "https://github.com/openai/grade-school-math",
      "huggingface": "https://huggingface.co/datasets/openai/gsm8k",
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/training-verifiers-to-solve-math-word-problems.md"
    },
    "primary_link": "https://arxiv.org/abs/2110.14168"
  },
  {
    "id": "chain-of-thought-prompting-elicits-reasoning-in-large-language-models-2022",
    "title": "Chain-of-thought prompting elicits reasoning in large language models",
    "year": 2022,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "evaluation"
    ],
    "domains": [
      "prompting",
      "reasoning"
    ],
    "category": [
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "🧠 Chain-of-thought / rationale data",
    "tags": [
      "foundation-starter",
      "primary-link-checked"
    ],
    "one_line_summary": "Shows that few-shot natural-language rationales can elicit multi-step reasoning behavior from sufficiently large language models.",
    "why_it_matters": "It is the conceptual bridge from answer-only prompts to trace-shaped reasoning examples, which later become SFT, distillation, filtering, and verifier targets.",
    "data_object": "answer level",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2201.11903",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2201.11903",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/chain-of-thought-prompting-elicits-reasoning-in-large-language-models.md"
    },
    "primary_link": "https://arxiv.org/abs/2201.11903"
  },
  {
    "id": "coderl-mastering-code-generation-through-pretrained-models-and-deep-reinforcemen-2022",
    "title": "CodeRL: Mastering code generation through pretrained models and deep reinforcement learning",
    "year": 2022,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🏋️ RLVR optimization scaling",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "codet-code-generation-with-generated-tests-2022",
    "title": "CodeT: Code generation with generated tests",
    "year": 2022,
    "venue": "ICLR",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof",
      "benchmarks_evaluation"
    ],
    "subfield": "💻 Code execution / unit-test data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "coliee-competition-on-legal-information-extraction-entailment-2022",
    "title": "COLIEE: Competition on legal information extraction/entailment",
    "year": 2022,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "⚖️ Legal reasoning",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "constitutional-ai-harmlessness-from-ai-feedback-2022",
    "title": "Constitutional AI: Harmlessness from AI feedback",
    "year": 2022,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "construction_recipe",
      "survey_background"
    ],
    "verification_contract": [
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "step_level",
      "pairwise_preference",
      "scalar_reward"
    ],
    "training_use": [
      "preference_learning",
      "safety_alignment",
      "reward_modeling"
    ],
    "domains": [
      "safety",
      "alignment"
    ],
    "category": [
      "foundations_instruction_preference_alignment",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🤖 RLAIF / synthetic feedback",
    "tags": [
      "foundation-starter",
      "primary-link-checked"
    ],
    "one_line_summary": "Constitutional AI trains harmless behavior from AI-generated critiques, revisions, and AI preference feedback guided by a written constitution.",
    "why_it_matters": "It is a core recipe for replacing part of human feedback with principle-guided model feedback, making critiques and preference pairs first-class post-training data.",
    "data_object": "original answer, self-critique, revised answer, preference pair, reward-model score.; process: principle used, critique, revision, comparison, preference label.; offline SL and RLHF/RLAIF alignment pipeline.",
    "feedback_verifier": "AI preference model trained from comparisons guided by constitutional principles.",
    "audit_focus": "AI feedback can encode model bias at scale., Principles may be underspecified or culturally narrow., A model can become safe-looking but evasive if helpfulness is not audited.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2212.08073",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2212.08073",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": "https://github.com/anthropics/ConstitutionalHarmlessnessPaper",
      "bibtex": null,
      "card": "cards/recipes/constitutional-ai.md"
    },
    "primary_link": "https://arxiv.org/abs/2212.08073"
  },
  {
    "id": "convfinqa-exploring-the-chain-of-numerical-reasoning-in-conversational-finance-q-2022",
    "title": "ConvFinQA: Exploring the chain of numerical reasoning in conversational finance question answering",
    "year": 2022,
    "venue": "EMNLP",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🏦 Financial reasoning",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "draft-sketch-and-prove-guiding-formal-theorem-provers-with-informal-proofs-2022",
    "title": "Draft, sketch, and prove: Guiding formal theorem provers with informal proofs",
    "year": 2022,
    "venue": "ICLR",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof"
    ],
    "subfield": "🧾 Formal proof / Lean / theorem proving",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "scaling-laws-for-reward-model-overoptimization-2022",
    "title": "Scaling laws for reward model overoptimization",
    "year": 2022,
    "venue": "ICML",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "scienceworld-is-your-agent-smarter-than-a-5th-grader-2022",
    "title": "ScienceWorld: Is your agent smarter than a 5th grader?",
    "year": 2022,
    "venue": "EMNLP",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "solving-math-word-problems-with-process-and-outcome-based-feedback-2022",
    "title": "Solving math word problems with process- and outcome-based feedback",
    "year": 2022,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "process_supervision_prm"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "star-bootstrapping-reasoning-with-reasoning-2022",
    "title": "STaR: Bootstrapping reasoning with reasoning",
    "year": 2022,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "construction_recipe",
      "survey_background"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "sft",
      "distillation"
    ],
    "domains": [
      "reasoning"
    ],
    "category": [
      "construction_recipes_open_reasoning_data",
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "🧪 Filtering and verifier refresh",
    "tags": [
      "foundation-starter",
      "primary-link-checked"
    ],
    "one_line_summary": "STaR iteratively generates rationales, keeps examples whose final answers are correct, and fine-tunes on the accepted reasoning traces.",
    "why_it_matters": "It is a compact recipe for self-improving reasoning data: model traces become training data only after answer-based filtering.",
    "data_object": "answer level",
    "feedback_verifier": "mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2203.14465",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2203.14465",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/star-bootstrapping-reasoning-with-reasoning.md"
    },
    "primary_link": "https://arxiv.org/abs/2203.14465"
  },
  {
    "id": "training-a-helpful-and-harmless-assistant-with-reinforcement-learning-from-human-2022",
    "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    "year": 2022,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "judgment_required"
    ],
    "supervision_granularity": [
      "pairwise_preference",
      "scalar_reward"
    ],
    "training_use": [
      "reward_modeling",
      "preference_learning",
      "safety_alignment"
    ],
    "domains": [
      "alignment",
      "safety"
    ],
    "category": [
      "foundations_instruction_preference_alignment",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🤝 Human preference data / RLHF",
    "tags": [
      "seeded-from-bib",
      "rlhf",
      "preference-data",
      "safety"
    ],
    "one_line_summary": "Documents preference and RLHF data for helpfulness and harmlessness assistant behavior.",
    "why_it_matters": "It provides the alignment-data lineage that later reasoning-data recipes inherit when they combine demonstrations, preferences, and reward models.",
    "data_object": "pairwise preference; scalar reward",
    "feedback_verifier": "judgment required",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2204.05862",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2204.05862",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2204.05862"
  },
  {
    "id": "training-language-models-to-follow-instructions-with-human-feedback-2022",
    "title": "Training language models to follow instructions with human feedback",
    "year": 2022,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "survey_background",
      "model_report"
    ],
    "verification_contract": [
      "judgment_required"
    ],
    "supervision_granularity": [
      "pairwise_preference",
      "scalar_reward"
    ],
    "training_use": [
      "sft",
      "preference_learning",
      "reward_modeling"
    ],
    "domains": [
      "alignment",
      "chat"
    ],
    "category": [
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "🤝 Human preference data / RLHF",
    "tags": [
      "foundation-starter",
      "primary-link-checked"
    ],
    "one_line_summary": "InstructGPT establishes the demonstration, preference-comparison, reward-model, and PPO pipeline that many later post-training recipes inherit.",
    "why_it_matters": "It is the alignment-data baseline for separating supervised demonstrations, pairwise preferences, learned rewards, and policy optimization in later reasoning models.",
    "data_object": "pairwise preference; scalar reward",
    "feedback_verifier": "judgment required",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2203.02155",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2203.02155",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/training-language-models-to-follow-instructions-with-human-feedback.md"
    },
    "primary_link": "https://arxiv.org/abs/2203.02155"
  },
  {
    "id": "truthfulqa-2022",
    "title": "TruthfulQA",
    "year": 2022,
    "venue": "ACL",
    "authors": [
      "Stephanie Lin",
      "Jacob Hilton",
      "Owain Evans"
    ],
    "source_role": [
      "benchmark",
      "audit_failure"
    ],
    "verification_contract": [
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "evaluation",
      "safety_alignment",
      "audit"
    ],
    "domains": [
      "truthfulness",
      "factuality",
      "safety"
    ],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧑‍⚖️ Human/expert judgment",
    "tags": [
      "factuality",
      "safety",
      "seeded-from-bib",
      "truthfulness"
    ],
    "one_line_summary": "TruthfulQA is a benchmark for measuring whether models imitate common human falsehoods instead of giving truthful answers.",
    "why_it_matters": "It is a canonical truthfulness audit surface for reasoning models because stronger generation can still amplify persuasive false answers learned from web text.",
    "data_object": "free-form generation or multiple-choice answer with truthfulness and informativeness labels.; process: question category, reference true answers, reference false answers, model answer, truthfulness score, informativeness score.; offline benchmark with human-written items and evaluator scripts.",
    "feedback_verifier": "human references plus automated/human scoring protocols for truthfulness and informativeness.",
    "audit_focus": "A model can be uninformative but truthful., A model can sound confident while reproducing a human misconception., Multiple-choice and generation modes can lead to different conclusions.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2109.07958",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2109.07958",
      "openreview": null,
      "acl": "https://aclanthology.org/2022.acl-long.229/",
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/sylinrl/TruthfulQA",
      "data": "https://github.com/sylinrl/TruthfulQA",
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/benchmarks/truthfulqa.md"
    },
    "primary_link": "https://arxiv.org/abs/2109.07958"
  },
  {
    "id": "alce-enabling-large-language-models-to-generate-text-with-citations-2023",
    "title": "ALCE: Enabling large language models to generate text with citations",
    "year": 2023,
    "venue": "EMNLP",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "api-bank-a-benchmark-for-tool-augmented-llms-2023",
    "title": "API-Bank: A benchmark for tool-augmented LLMs",
    "year": 2023,
    "venue": "EMNLP",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🛠️ Tool-use data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "data-provenance-for-language-models-2023",
    "title": "Data provenance for language models",
    "year": 2023,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers"
    ],
    "subfield": "📦 Data documentation / datasheets",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "direct-preference-optimization-your-language-model-is-secretly-a-reward-model-2023",
    "title": "Direct preference optimization: Your language model is secretly a reward model",
    "year": 2023,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "judgment_required"
    ],
    "supervision_granularity": [
      "pairwise_preference"
    ],
    "training_use": [
      "preference_learning"
    ],
    "domains": [
      "alignment",
      "preference"
    ],
    "category": [
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "⚖️ DPO / preference optimization",
    "tags": [
      "foundation-starter",
      "primary-link-checked"
    ],
    "one_line_summary": "DPO trains a policy directly from preference pairs by turning the reward-model objective into a supervised contrastive optimization problem.",
    "why_it_matters": "It shows that pairwise preference data can shape post-training behavior without deploying a separate learned reward model during optimization.",
    "data_object": "pairwise preference",
    "feedback_verifier": "judgment required",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2305.18290",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2305.18290",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/direct-preference-optimization-your-language-model-is-secretly-a-reward-model.md"
    },
    "primary_link": "https://arxiv.org/abs/2305.18290"
  },
  {
    "id": "distilling-step-by-step-outperforming-larger-language-models-with-less-training--2023",
    "title": "Distilling step-by-step: Outperforming larger language models with less training data and smaller model sizes",
    "year": 2023,
    "venue": "ACL",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧠 Chain-of-thought / rationale data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "factscore-2023",
    "title": "FActScore",
    "year": 2023,
    "venue": "EMNLP",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "financebench-a-benchmark-for-financial-question-answering-2023",
    "title": "FinanceBench: A benchmark for financial question answering",
    "year": 2023,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "benchmark",
      "data_release"
    ],
    "verification_contract": [
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "evaluation"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🏦 Financial reasoning",
    "tags": [
      "seeded-from-bib",
      "finance",
      "domain-benchmark",
      "factuality"
    ],
    "one_line_summary": "FinanceBench benchmarks financial question answering over public company filings and expert-authored answers.",
    "why_it_matters": "It is a domain-specific reasoning benchmark where grounding, evidence retrieval, expert answers, and current filing data matter more than generic exact-match reasoning.",
    "data_object": "answer level",
    "feedback_verifier": "judgment required, mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L2_artifact_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2311.11944",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2311.11944",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/patronus-ai/financebench",
      "data": "https://github.com/patronus-ai/financebench",
      "huggingface": "https://huggingface.co/datasets/PatronusAI/financebench",
      "project": "https://github.com/patronus-ai/financebench",
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2311.11944"
  },
  {
    "id": "gorilla-2023",
    "title": "Gorilla",
    "year": 2023,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe"
    ],
    "subfield": "🛠️ Tool-use data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "gpqa-2023",
    "title": "GPQA",
    "year": 2023,
    "venue": "arXiv",
    "authors": [
      "David Rein",
      "Betty Li Hou",
      "Asa Cooper Stickland",
      "Jackson Petty",
      "Richard Yuanzhe Pang",
      "Julien Dirani",
      "Julian Michael",
      "Samuel R. Bowman"
    ],
    "source_role": [
      "benchmark"
    ],
    "verification_contract": [
      "judgment_required"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "evaluation",
      "audit"
    ],
    "domains": [
      "science",
      "expert-evaluation",
      "scalable-oversight"
    ],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🩺 Medical reasoning / health rubrics",
    "tags": [
      "expert-evaluation",
      "scalable-oversight",
      "science",
      "seeded-from-bib"
    ],
    "one_line_summary": "GPQA is a graduate-level science Q&A benchmark designed so skilled non-experts with web access still struggle.",
    "why_it_matters": "It is a scalable-oversight benchmark: the data object tests whether models can answer expert questions that are difficult for ordinary validators to check.",
    "data_object": "multiple-choice answer with optional rationale and expert label.; process: domain, question, answer options, expert label, validation metadata, canary/string metadata.; offline expert Q&A benchmark.",
    "feedback_verifier": "expert-authored answer key and validation protocol.",
    "audit_focus": "Multiple-choice guessing can inflate scores., Non-expert validators may not catch subtle mistakes., Tool access changes what the benchmark measures.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2311.12022",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2311.12022",
      "openreview": "https://openreview.net/forum?id=Ti67584b98",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/idavidrein/gpqa",
      "data": "https://github.com/idavidrein/gpqa",
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/benchmarks/gpqa.md"
    },
    "primary_link": "https://arxiv.org/abs/2311.12022"
  },
  {
    "id": "judgelm-fine-tuned-large-language-models-are-scalable-judges-2023",
    "title": "JudgeLM: Fine-tuned large language models are scalable judges",
    "year": 2023,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "judging-llm-as-a-judge-with-mt-bench-and-chatbot-arena-2023",
    "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    "year": 2023,
    "venue": "NeurIPS Datasets and Benchmarks",
    "authors": [
      "Lianmin Zheng",
      "Wei-Lin Chiang",
      "Ying Sheng",
      "Siyuan Zhuang",
      "Zhanghao Wu",
      "Yonghao Zhuang",
      "Zi Lin",
      "Zhuohan Li",
      "Dacheng Li",
      "Eric P. Xing",
      "Hao Zhang",
      "Joseph E. Gonzalez",
      "Ion Stoica"
    ],
    "source_role": [
      "benchmark",
      "verifier_reward",
      "audit_failure"
    ],
    "verification_contract": [
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level",
      "pairwise_preference",
      "scalar_reward"
    ],
    "training_use": [
      "evaluation",
      "reward_modeling",
      "preference_learning",
      "audit"
    ],
    "domains": [
      "llm-as-judge",
      "preference-evaluation",
      "chat"
    ],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation",
      "audit_failure_contamination_verifier_attacks",
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "⚖️ LLM-as-judge data",
    "tags": [
      "chat",
      "llm-as-judge",
      "preference-evaluation",
      "seeded-from-bib"
    ],
    "one_line_summary": "MT-Bench and Chatbot Arena establish LLM-as-a-judge and pairwise human-preference evaluation surfaces for open-ended chat models.",
    "why_it_matters": "It is the standard cautionary reference for judge data: scalable model judges are useful, but position, verbosity, self-enhancement, and limited-reasoning biases must be audited.",
    "data_object": "model response, judge score, pairwise preference, or arena battle outcome.; process: question, turn, model identity, response, judge prompt template, score, preference label, bias-control setting.; offline judge harness and crowd-sourced arena platform.",
    "feedback_verifier": "strong model judge and human preference comparisons.",
    "audit_focus": "Judge scores can be position-biased., Verbose answers can be over-rewarded., A model judge may share weaknesses with the evaluated model.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2306.05685",
      "venue": "https://papers.nips.cc/paper_files/paper/2023/hash/91f18a1287b398d378ef22505bf41832-Abstract-Datasets_and_Benchmarks.html",
      "arxiv": "https://arxiv.org/abs/2306.05685",
      "openreview": "https://openreview.net/forum?id=uccHPGDlao",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge",
      "data": "https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge",
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/mt-bench-chatbot-arena.md"
    },
    "primary_link": "https://arxiv.org/abs/2306.05685"
  },
  {
    "id": "language-models-do-not-always-say-what-they-think-unfaithful-explanations-in-cha-2023",
    "title": "Language models do not always say what they think: Unfaithful explanations in chain-of-thought prompting",
    "year": 2023,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🧠 Chain-of-thought / rationale data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "large-language-models-encode-clinical-knowledge-2023",
    "title": "Large language models encode clinical knowledge",
    "year": 2023,
    "venue": "Nature",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "leandojo-theorem-proving-with-retrieval-augmented-language-models-2023",
    "title": "LeanDojo: Theorem proving with retrieval-augmented language models",
    "year": 2023,
    "venue": "NeurIPS Datasets and Benchmarks",
    "authors": [
      "Kaiyu Yang",
      "Aidan M. Swope",
      "Alex Gu",
      "Rahul Chalamala",
      "Peiyang Song",
      "Shixing Yu",
      "Saad Godil",
      "Ryan Prenger",
      "Anima Anandkumar"
    ],
    "source_role": [
      "data_release",
      "benchmark",
      "agent_environment"
    ],
    "verification_contract": [
      "environmental",
      "programmatic"
    ],
    "supervision_granularity": [
      "state_action_level",
      "step_level",
      "full_episode"
    ],
    "training_use": [
      "agent_training",
      "sft",
      "evaluation"
    ],
    "domains": [
      "formal-math",
      "lean",
      "retrieval"
    ],
    "category": [
      "programmatic_math_code_proof",
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "🧾 Formal proof / Lean / theorem proving",
    "tags": [
      "formal-math",
      "lean",
      "retrieval",
      "seeded-from-bib"
    ],
    "one_line_summary": "LeanDojo releases an open Lean theorem-proving environment, benchmark, and retrieval-augmented prover pipeline.",
    "why_it_matters": "It turns formal proof work into reusable agent data: repository state, accessible premises, proof states, tactics, retrieval context, and verifier feedback are all part of the record.",
    "data_object": "Lean tactic sequence or proof script checked by Lean.; process: repository commit, theorem, proof state, premises, retrieved context, tactic, Lean feedback, split.; Lean proof assistant environment and traced math-library repositories.",
    "feedback_verifier": "Lean checker and environment feedback.",
    "audit_focus": "A prover can rely on retrieval leakage., Lean version drift can break proofs., Premise accessibility rules can change task difficulty.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2306.15626",
      "venue": "https://proceedings.neurips.cc/paper_files/paper/2023/hash/4441469427094f8873d0fecb0c4e1cee-Abstract-Datasets_and_Benchmarks.html",
      "arxiv": "https://arxiv.org/abs/2306.15626",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/lean-dojo/LeanDojo",
      "data": "https://zenodo.org/records/10114157",
      "huggingface": null,
      "project": "https://leandojo.org/",
      "bibtex": null,
      "card": "cards/agents/leandojo.md"
    },
    "primary_link": "https://arxiv.org/abs/2306.15626"
  },
  {
    "id": "legalbench-2023",
    "title": "LegalBench",
    "year": 2023,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "benchmark",
      "data_release"
    ],
    "verification_contract": [
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "evaluation"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧑‍⚖️ Human/expert judgment",
    "tags": [
      "seeded-from-bib",
      "legal",
      "domain-benchmark",
      "legal-reasoning"
    ],
    "one_line_summary": "LegalBench provides a collaboratively built benchmark suite for legal reasoning tasks across many legal domains.",
    "why_it_matters": "It anchors the legal side of judgment-required reasoning data, where task definitions, legal-domain splits, expert validity, and answer rubrics are often more important than a simple verifier.",
    "data_object": "answer level",
    "feedback_verifier": "judgment required, mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L2_artifact_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2308.11462",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2308.11462",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/HazyResearch/legalbench",
      "data": "https://github.com/HazyResearch/legalbench",
      "huggingface": "https://huggingface.co/datasets/nguha/legalbench",
      "project": "https://hazyresearch.stanford.edu/legalbench/",
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2308.11462"
  },
  {
    "id": "mammoth-building-math-generalist-models-through-hybrid-instruction-tuning-2023",
    "title": "MAmmoTH: Building math generalist models through hybrid instruction tuning",
    "year": 2023,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "programmatic_math_code_proof"
    ],
    "subfield": "🧱 Instruction tuning / SFT data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "maud-a-merger-agreement-understanding-dataset-2023",
    "title": "MAUD: A merger agreement understanding dataset",
    "year": 2023,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "measuring-faithfulness-in-chain-of-thought-reasoning-2023",
    "title": "Measuring faithfulness in chain-of-thought reasoning",
    "year": 2023,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🧠 Chain-of-thought / rationale data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "metamath-bootstrap-your-own-mathematical-questions-for-large-language-models-2023",
    "title": "MetaMath: Bootstrap your own mathematical questions for large language models",
    "year": 2023,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧪 Verifier robustness and answer extraction",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "mind2web-towards-a-generalist-agent-for-the-web-2023",
    "title": "Mind2Web: Towards a generalist agent for the web",
    "year": 2023,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "🌍 Web/browser agents",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "orca-progressive-learning-from-complex-explanation-traces-of-gpt-4-2023",
    "title": "Orca: Progressive learning from complex explanation traces of GPT-4",
    "year": 2023,
    "venue": "arXiv",
    "authors": [
      "Subhabrata Mukherjee",
      "Arindam Mitra",
      "Ganesh Jawahar",
      "Sahaj Agarwal",
      "Hamid Palangi",
      "Ahmed Awadallah"
    ],
    "source_role": [
      "construction_recipe",
      "model_report"
    ],
    "verification_contract": [
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "step_level",
      "answer_level"
    ],
    "training_use": [
      "sft",
      "distillation",
      "evaluation"
    ],
    "domains": [
      "explanation-traces",
      "distillation",
      "synthetic-data"
    ],
    "category": [
      "frontier_model_reports",
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "🧬 What is disclosed vs hidden",
    "tags": [
      "distillation",
      "explanation-traces",
      "seeded-from-bib",
      "synthetic-data"
    ],
    "one_line_summary": "Orca studies progressive learning from complex teacher explanation traces rather than shallow imitation of final answers.",
    "why_it_matters": "It is an early and influential reasoning-distillation recipe: the reusable object is teacher-assisted explanation data plus careful evaluation against style-only imitation.",
    "data_object": "instruction response with detailed explanation, intermediate reasoning, and final answer; process: prompt source, teacher identity, explanation trace, task type, response, evaluation benchmark; offline synthetic-data distillation and evaluation pipeline.",
    "feedback_verifier": "downstream reasoning, exam, and benchmark evaluation rather than a single automatic verifier.",
    "audit_focus": "Students can learn teacher style without robust reasoning. Synthetic traces can include teacher errors. Closed teacher data makes lineage hard to audit.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2306.02707",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2306.02707",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": "https://www.microsoft.com/en-us/research/publication/orca-progressive-learning-from-complex-explanation-traces-of-gpt-4/",
      "bibtex": null,
      "card": "cards/recipes/orca.md"
    },
    "primary_link": "https://arxiv.org/abs/2306.02707"
  },
  {
    "id": "prm800k-2023",
    "title": "Let's Verify Step by Step",
    "year": 2023,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "process_supervision",
      "verifier_reward",
      "data_release"
    ],
    "verification_contract": [
      "judgment_required",
      "programmatic"
    ],
    "supervision_granularity": [
      "step_level",
      "process_reward"
    ],
    "training_use": [
      "process_supervision",
      "reward_modeling",
      "evaluation"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "process_supervision_prm",
      "benchmarks_evaluation"
    ],
    "subfield": "🪜 Human step-level labels",
    "tags": [
      "curated-card",
      "primary-link-checked"
    ],
    "one_line_summary": "Provides step-level human labels for mathematical reasoning traces and trains process reward models to identify correct intermediate reasoning.",
    "why_it_matters": "It is the process-supervision anchor for moving from answer-level math rewards to step-level feedback in reasoning-model training and evaluation.",
    "data_object": "step-level labels and final answers; process: step, label, solution trace; offline math reasoning traces",
    "feedback_verifier": "process reward model trained from step labels",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2305.20050",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2305.20050",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/prm800k.md"
    },
    "primary_link": "https://arxiv.org/abs/2305.20050"
  },
  {
    "id": "proofnet-autoformalizing-and-formally-proving-undergraduate-level-mathematics-2023",
    "title": "ProofNet: Autoformalizing and formally proving undergraduate-level mathematics",
    "year": 2023,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof"
    ],
    "subfield": "🧪 Verifier robustness and answer extraction",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "react-synergizing-reasoning-and-acting-in-language-models-2023",
    "title": "ReAct: Synergizing reasoning and acting in language models",
    "year": 2023,
    "venue": "ICLR",
    "authors": [
      "Shunyu Yao",
      "Jeffrey Zhao",
      "Dian Yu",
      "Nan Du",
      "Izhak Shafran",
      "Karthik Narasimhan",
      "Yuan Cao"
    ],
    "source_role": [
      "agent_environment",
      "construction_recipe"
    ],
    "verification_contract": [
      "environmental",
      "mixed"
    ],
    "supervision_granularity": [
      "state_action_level",
      "full_episode"
    ],
    "training_use": [
      "agent_training",
      "evaluation",
      "test_time_compute"
    ],
    "domains": [
      "agents",
      "tools",
      "reasoning-acting"
    ],
    "category": [
      "environmental_agents_tools_web_swe",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🔁 Replayable trajectory data",
    "tags": [
      "agents",
      "reasoning-acting",
      "seeded-from-bib",
      "tools"
    ],
    "one_line_summary": "ReAct interleaves reasoning traces with task-specific actions so models can update plans from external observations.",
    "why_it_matters": "It is a foundational agent-data pattern: the training/evaluation record is not just an answer but a trajectory of thought-like notes, actions, observations, and final response.",
    "data_object": "trajectory containing reasoning note, action, observation, and final answer or task completion; process: task state, action string, observation, reasoning note, final answer, success indicator; Wikipedia API, embodied/web shopping environments, and task-specific simulators.",
    "feedback_verifier": "environment success, answer correctness, or task-specific evaluation.",
    "audit_focus": "Reasoning notes can rationalize bad actions. Environment wrappers can change task difficulty. Few-shot exemplars may encode brittle action formats.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2210.03629",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2210.03629",
      "openreview": "https://openreview.net/forum?id=WE_vluYUL-X",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/ysymyth/ReAct",
      "data": null,
      "huggingface": null,
      "project": "https://react-lm.github.io/",
      "bibtex": null,
      "card": "cards/agents/react.md"
    },
    "primary_link": "https://arxiv.org/abs/2210.03629"
  },
  {
    "id": "reflexion-language-agents-with-verbal-reinforcement-learning-2023",
    "title": "Reflexion: Language agents with verbal reinforcement learning",
    "year": 2023,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "environmental_agents_tools_web_swe",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🔁 Self-training / STaR / Self-Instruct",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "rest-textsuperscriptem-2023",
    "title": "ReST^EM",
    "year": 2023,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "self-consistency-improves-chain-of-thought-reasoning-in-language-models-2023",
    "title": "Self-consistency improves chain of thought reasoning in language models",
    "year": 2023,
    "venue": "ICLR",
    "authors": [],
    "source_role": [
      "scaling_study",
      "survey_background"
    ],
    "verification_contract": [
      "mixed",
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level",
      "step_level"
    ],
    "training_use": [
      "evaluation",
      "test_time_compute"
    ],
    "domains": [
      "reasoning",
      "test_time_compute"
    ],
    "category": [
      "foundations_instruction_preference_alignment",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧠 Chain-of-thought / rationale data",
    "tags": [
      "foundation-starter",
      "primary-link-checked"
    ],
    "one_line_summary": "Self-consistency samples multiple chain-of-thought reasoning paths and chooses the answer that is most consistent across samples.",
    "why_it_matters": "It is the classic test-time compute baseline for reasoning: performance can improve by spending more samples and marginalizing over traces without changing training data.",
    "data_object": "multiple rationales and final answers for the same prompt; process: sampling temperature, number of paths, answer extraction, aggregation rule; benchmark prompting setup.",
    "feedback_verifier": "majority or marginalization over sampled answers.",
    "audit_focus": "More samples can amplify benchmark-specific shortcuts. Aggregation does not guarantee step faithfulness. Unmatched inference budgets can make methods look better than they are.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2203.11171",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2203.11171",
      "openreview": "https://openreview.net/forum?id=1PL1NIMMrw",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/self-consistency-chain-of-thought.md"
    },
    "primary_link": "https://arxiv.org/abs/2203.11171"
  },
  {
    "id": "self-instruct-aligning-language-models-with-self-generated-instructions-2023",
    "title": "Self-Instruct: Aligning language models with self-generated instructions",
    "year": 2023,
    "venue": "ACL",
    "authors": [],
    "source_role": [
      "construction_recipe",
      "data_release"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "sft"
    ],
    "domains": [
      "instruction_following"
    ],
    "category": [
      "construction_recipes_open_reasoning_data",
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [
      "foundation-starter",
      "primary-link-checked"
    ],
    "one_line_summary": "Self-Instruct bootstraps instruction-following data by having a model generate instructions, inputs, and outputs, then filtering low-quality or duplicate examples.",
    "why_it_matters": "It is the canonical self-generated instruction-data recipe that later reasoning datasets adapt for prompt sourcing and synthetic expansion.",
    "data_object": "answer level",
    "feedback_verifier": "mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2212.10560",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2212.10560",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/self-instruct-aligning-language-models-with-self-generated-instructions.md"
    },
    "primary_link": "https://arxiv.org/abs/2212.10560"
  },
  {
    "id": "self-rag-learning-to-retrieve-generate-and-critique-through-self-reflection-2023",
    "title": "Self-RAG: Learning to retrieve, generate, and critique through self-reflection",
    "year": 2023,
    "venue": "ICLR",
    "authors": [
      "Akari Asai",
      "Zeqiu Wu",
      "Yizhong Wang",
      "Avirup Sil",
      "Hannaneh Hajishirzi"
    ],
    "source_role": [
      "construction_recipe",
      "data_release",
      "agent_environment"
    ],
    "verification_contract": [
      "mixed",
      "judgment_required"
    ],
    "supervision_granularity": [
      "step_level",
      "answer_level"
    ],
    "training_use": [
      "sft",
      "evaluation",
      "agent_training",
      "audit"
    ],
    "domains": [
      "retrieval",
      "critique",
      "factuality"
    ],
    "category": [
      "construction_recipes_open_reasoning_data",
      "environmental_agents_tools_web_swe",
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "Other related work",
    "tags": [
      "critique",
      "factuality",
      "retrieval",
      "seeded-from-bib"
    ],
    "one_line_summary": "Self-RAG trains models to decide when to retrieve, generate with evidence, and critique outputs using reflection tokens.",
    "why_it_matters": "It is a key retrieval-augmented reasoning recipe where the data object includes control tokens, retrieved passages, critiques, and final generations.",
    "data_object": "generation with retrieval decisions, critique signals, and final answer; process: query, retrieval decision token, retrieved passage, support critique, utility critique, final answer; retriever plus generation model with special reflection tokens.",
    "feedback_verifier": "critique signals and task-specific factuality/answer-quality evaluation.",
    "audit_focus": "A model can retrieve irrelevant passages confidently. Critique tokens can become style markers without true verification. Retriever choice changes benchmark conclusions.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2310.11511",
      "venue": "https://proceedings.iclr.cc/paper_files/paper/2024/hash/25f7be9694d7b32d5cc670927b8091e1-Abstract-Conference.html",
      "arxiv": "https://arxiv.org/abs/2310.11511",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/akariasai/self-rag",
      "data": "https://huggingface.co/datasets/selfrag/selfrag_train_data",
      "huggingface": "https://huggingface.co/selfrag/selfrag_llama2_7b",
      "project": "https://selfrag.github.io/",
      "bibtex": null,
      "card": "cards/recipes/self-rag.md"
    },
    "primary_link": "https://arxiv.org/abs/2310.11511"
  },
  {
    "id": "selfcodealign-self-alignment-for-code-generation-2023",
    "title": "SelfCodeAlign: Self-alignment for code generation",
    "year": 2023,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "💻 Code execution / unit-test data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "swe-bench-can-language-models-resolve-real-world-github-issues-2023",
    "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    "year": 2023,
    "venue": "ICLR",
    "authors": [],
    "source_role": [
      "benchmark",
      "agent_environment"
    ],
    "verification_contract": [
      "environmental",
      "programmatic"
    ],
    "supervision_granularity": [
      "full_episode",
      "state_action_level"
    ],
    "training_use": [
      "evaluation",
      "agent_training"
    ],
    "domains": [
      "software_engineering",
      "code"
    ],
    "category": [
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "🧑‍💻 SWE/repository agents",
    "tags": [
      "seeded-from-bib",
      "swe-bench",
      "software-engineering",
      "agent",
      "environment"
    ],
    "one_line_summary": "SWE-bench turns real GitHub issues into repository-level repair tasks evaluated by applying patches and running tests.",
    "why_it_matters": "It is the agent/environment anchor where the reasoning-data object includes repository state, issue text, actions, patches, and test-backed outcomes.",
    "data_object": "full episode; state action level",
    "feedback_verifier": "environmental, programmatic",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2310.06770",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2310.06770",
      "openreview": "https://openreview.net/forum?id=VTF8yNQM66",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/swe-bench/SWE-bench",
      "data": "https://github.com/swe-bench/SWE-bench",
      "huggingface": null,
      "project": "https://www.swebench.com/original.html",
      "bibtex": null,
      "card": "cards/agents/swe-bench-can-language-models-resolve-real-world-github-issues.md"
    },
    "primary_link": "https://arxiv.org/abs/2310.06770"
  },
  {
    "id": "toolformer-language-models-can-teach-themselves-to-use-tools-2023",
    "title": "Toolformer: Language models can teach themselves to use tools",
    "year": 2023,
    "venue": "NeurIPS",
    "authors": [
      "Timo Schick",
      "Jane Dwivedi-Yu",
      "Roberto Dessi",
      "Roberta Raileanu",
      "Maria Lomeli",
      "Luke Zettlemoyer",
      "Nicola Cancedda",
      "Thomas Scialom"
    ],
    "source_role": [
      "construction_recipe",
      "agent_environment"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "step_level",
      "answer_level"
    ],
    "training_use": [
      "sft",
      "agent_training",
      "evaluation"
    ],
    "domains": [
      "tools",
      "api-calls",
      "self-supervision"
    ],
    "category": [
      "environmental_agents_tools_web_swe",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🛠️ Tool-use data",
    "tags": [
      "api-calls",
      "seeded-from-bib",
      "self-supervision",
      "tools"
    ],
    "one_line_summary": "Toolformer creates self-supervised tool-use data by inserting API calls only when tool results improve language-model likelihood.",
    "why_it_matters": "It is a classic construction recipe for tool-call supervision: models learn when to call tools, what arguments to pass, and how to fold observations back into text.",
    "data_object": "text sequence with inserted API call and tool result markup; process: candidate call location, API name, arguments, tool output, likelihood improvement, retained annotation; external tool APIs used during data construction and evaluation.",
    "feedback_verifier": "language-model likelihood improvement after including tool result.",
    "audit_focus": "Likelihood improvement may not equal truthful tool use. Tools can return stale or wrong outputs. The model can learn call syntax without robust tool-selection judgment.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2302.04761",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2302.04761",
      "openreview": "https://openreview.net/forum?id=Yacmpz84TH",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/agents/toolformer.md"
    },
    "primary_link": "https://arxiv.org/abs/2302.04761"
  },
  {
    "id": "ultrafeedback-boosting-language-models-with-high-quality-feedback-2023",
    "title": "UltraFeedback: Boosting language models with high-quality feedback",
    "year": 2023,
    "venue": "ICML",
    "authors": [
      "Ganqu Cui",
      "Lifan Yuan",
      "Ning Ding",
      "Guanming Yao",
      "Bingxiang He",
      "Wei Zhu",
      "Yuan Ni",
      "Guotong Xie",
      "Ruobing Xie",
      "Yankai Lin",
      "Zhiyuan Liu",
      "Maosong Sun"
    ],
    "source_role": [
      "data_release",
      "verifier_reward",
      "construction_recipe"
    ],
    "verification_contract": [
      "judgment_required"
    ],
    "supervision_granularity": [
      "answer_level",
      "scalar_reward",
      "pairwise_preference"
    ],
    "training_use": [
      "preference_learning",
      "reward_modeling",
      "sft",
      "safety_alignment"
    ],
    "domains": [
      "preference-data",
      "ai-feedback",
      "reward-modeling"
    ],
    "category": [
      "foundations_instruction_preference_alignment",
      "judgment_required_rubrics_safety_domain",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🤖 RLAIF / synthetic feedback",
    "tags": [
      "ai-feedback",
      "preference-data",
      "reward-modeling",
      "seeded-from-bib"
    ],
    "one_line_summary": "UltraFeedback releases large-scale AI feedback with fine-grained ratings and critiques over diverse instruction-response pairs.",
    "why_it_matters": "It is a widely reused preference/reward data source, but its value depends on auditing prompt sources, judge model behavior, rubric dimensions, and corrected labels.",
    "data_object": "instruction, candidate responses, fine-grained ratings, textual critiques, and derived preference pairs; process: source dataset, model identity, response, rating dimension, critique text, corrected overall score; offline feedback generation and reward-model training pipeline.",
    "feedback_verifier": "AI-generated scalar and textual feedback over response quality dimensions.",
    "audit_focus": "AI feedback can encode judge-model bias. A corrected dataset version can change reward-model behavior. Fine-grained scores may not translate cleanly into pairwise preferences.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2310.01377",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2310.01377",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/OpenBMB/UltraFeedback",
      "data": "https://huggingface.co/datasets/openbmb/UltraFeedback",
      "huggingface": "https://huggingface.co/datasets/openbmb/UltraFeedback",
      "project": null,
      "bibtex": null,
      "card": "cards/releases/ultrafeedback.md"
    },
    "primary_link": "https://arxiv.org/abs/2310.01377"
  },
  {
    "id": "webarena-a-realistic-web-environment-for-building-autonomous-agents-2023",
    "title": "WebArena: A realistic web environment for building autonomous agents",
    "year": 2023,
    "venue": "ICLR",
    "authors": [],
    "source_role": [
      "benchmark",
      "agent_environment"
    ],
    "verification_contract": [
      "environmental"
    ],
    "supervision_granularity": [
      "full_episode"
    ],
    "training_use": [
      "evaluation",
      "agent_training"
    ],
    "domains": [
      "web",
      "agents"
    ],
    "category": [
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "🌍 Web/browser agents",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Realistic web tasks where the data object is an agent episode and the verifier is task completion in a live-like browser environment.",
    "why_it_matters": "It is a web-agent environment anchor: the reasoning-data object is a full browser interaction episode, scored by a task-specific success evaluator in a live-like environment.",
    "data_object": "environment interaction trajectory; process: observation, action, state; browser-accessible web environment",
    "feedback_verifier": "task-specific success evaluator",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2307.13854",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2307.13854",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/agents/webarena.md"
    },
    "primary_link": "https://arxiv.org/abs/2307.13854"
  },
  {
    "id": "wizardmath-empowering-mathematical-reasoning-for-large-language-models-via-reinf-2023",
    "title": "WizardMath: Empowering mathematical reasoning for large language models via reinforced evol-instruct",
    "year": 2023,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧱 Instruction tuning / SFT data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "ai-models-collapse-when-trained-on-recursively-generated-data-2024",
    "title": "AI models collapse when trained on recursively generated data",
    "year": 2024,
    "venue": "Nature",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🧯 Contamination / evaluation surveys",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "apigen-automated-pipeline-for-generating-verifiable-and-diverse-function-calling-2024",
    "title": "APIGen: Automated pipeline for generating verifiable and diverse function-calling datasets",
    "year": 2024,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe"
    ],
    "subfield": "🛠️ Tool-use data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "appworld-a-controllable-world-of-apps-and-people-for-benchmarking-interactive-co-2024",
    "title": "AppWorld: A controllable world of apps and people for benchmarking interactive coding agents",
    "year": 2024,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "benchmark",
      "agent_environment"
    ],
    "verification_contract": [
      "environmental",
      "programmatic"
    ],
    "supervision_granularity": [
      "full_episode",
      "state_action_level"
    ],
    "training_use": [
      "evaluation",
      "agent_training"
    ],
    "domains": [
      "apps",
      "agents",
      "code"
    ],
    "category": [
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "📱 App/mobile agents",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Controllable app world for interactive agents where tool/API state and final task success form the feedback contract.",
    "why_it_matters": "Controllable app world for interactive agents where tool/API state and final task success form the feedback contract.",
    "data_object": "API/tool action trajectory; process: tool call, state transition, observation; simulated app ecosystem with users and APIs",
    "feedback_verifier": "programmatic environment assertions",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2407.18901",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2407.18901",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/agents/appworld.md"
    },
    "primary_link": "https://arxiv.org/abs/2407.18901"
  },
  {
    "id": "bfcl-v3-2024",
    "title": "BFCL v3",
    "year": 2024,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "bigcodebench-benchmarking-code-generation-with-diverse-function-calls-and-comple-2024",
    "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    "year": 2024,
    "venue": "ICLR",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "programmatic_math_code_proof",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "bright-a-realistic-and-challenging-benchmark-for-reasoning-intensive-retrieval-2024",
    "title": "BRIGHT: A realistic and challenging benchmark for reasoning-intensive retrieval",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧯 Benchmark contamination",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "browsergym-a-gym-environment-for-web-agents-2024",
    "title": "BrowserGym: A gym environment for web agents",
    "year": 2024,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "infrastructure",
      "agent_environment",
      "benchmark"
    ],
    "verification_contract": [
      "environmental"
    ],
    "supervision_granularity": [
      "full_episode",
      "state_action_level"
    ],
    "training_use": [
      "evaluation",
      "agent_training"
    ],
    "domains": [
      "web",
      "agents"
    ],
    "category": [
      "environmental_agents_tools_web_swe"
    ],
    "subfield": "🌍 Web/browser agents",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "A web-agent substrate: useful less as a static dataset and more as a repeatable environment for trajectory collection and evaluation.",
    "why_it_matters": "A web-agent substrate: useful less as a static dataset and more as a repeatable environment for trajectory collection and evaluation.",
    "data_object": "browser trajectory; process: DOM/state observation, action, reward/result; gym-style browser environment",
    "feedback_verifier": "environment task evaluator",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2412.05467",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2412.05467",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/agents/browsergym.md"
    },
    "primary_link": "https://arxiv.org/abs/2412.05467"
  },
  {
    "id": "chembench-a-benchmark-for-evaluating-large-language-models-in-chemistry-2024",
    "title": "ChemBench: A benchmark for evaluating large language models in chemistry",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧯 Benchmark contamination",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "deepseek-prover-advancing-theorem-proving-in-llms-2024",
    "title": "DeepSeek-Prover: Advancing theorem proving in LLMs",
    "year": 2024,
    "venue": "arXiv",
    "authors": [
      "Huajian Xin",
      "Daya Guo",
      "Zhihong Shao",
      "Zhizhou Ren",
      "Qihao Zhu",
      "Bo Liu",
      "Chong Ruan",
      "Wenda Li",
      "Xiaodan Liang"
    ],
    "source_role": [
      "data_release",
      "construction_recipe",
      "model_report"
    ],
    "verification_contract": [
      "programmatic",
      "environmental"
    ],
    "supervision_granularity": [
      "step_level",
      "answer_level"
    ],
    "training_use": [
      "sft",
      "agent_training",
      "evaluation"
    ],
    "domains": [
      "formal-math",
      "lean",
      "synthetic-data"
    ],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports"
    ],
    "subfield": "🧾 Formal proof / Lean / theorem proving",
    "tags": [
      "formal-math",
      "lean",
      "seeded-from-bib",
      "synthetic-data"
    ],
    "one_line_summary": "DeepSeek-Prover generates large-scale Lean 4 theorem-proving data from informal math problems and trains a formal proof model.",
    "why_it_matters": "It is a key formal-reasoning data recipe where synthetic formal statements, generated proofs, and Lean verification form a reusable post-training object.",
    "data_object": "Lean 4 theorem statement and proof script checked by Lean.; process: informal problem, formal statement, generated proof, Lean result, benchmark split.; Lean 4 proof assistant and formal theorem-proving benchmark harness.",
    "feedback_verifier": "Lean kernel/checker acceptance.",
    "audit_focus": "Formal statements can be wrong even if proofs verify., Pass@k hides low single-shot reliability., Lean/mathlib version drift can break reproducibility.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2405.14333",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2405.14333",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": "https://huggingface.co/datasets/deepseek-ai/DeepSeek-Prover-V1",
      "huggingface": "https://huggingface.co/deepseek-ai/DeepSeek-Prover-V1",
      "project": "https://huggingface.co/deepseek-ai/DeepSeek-Prover-V1",
      "bibtex": null,
      "card": "cards/recipes/deepseek-prover.md"
    },
    "primary_link": "https://arxiv.org/abs/2405.14333"
  },
  {
    "id": "deepseek-prover-v1-5-harnessing-proof-assistant-feedback-for-reinforcement-learn-2024",
    "title": "DeepSeek-Prover-V1.5: Harnessing proof assistant feedback for reinforcement learning and Monte-Carlo tree search",
    "year": 2024,
    "venue": "arXiv",
    "authors": [
      "Huajian Xin",
      "Z. Z. Ren",
      "Junxiao Song",
      "Zhihong Shao",
      "Wanjia Zhao",
      "Haocheng Wang",
      "Bo Liu",
      "Liyue Zhang",
      "Xuan Lu",
      "Qiushi Du",
      "Wenjun Gao",
      "Qihao Zhu",
      "Dejian Yang",
      "Zhibin Gou",
      "Z. F. Wu",
      "Fuli Luo",
      "Chong Ruan"
    ],
    "source_role": [
      "construction_recipe",
      "model_report",
      "scaling_study"
    ],
    "verification_contract": [
      "programmatic",
      "environmental"
    ],
    "supervision_granularity": [
      "step_level",
      "scalar_reward",
      "full_episode"
    ],
    "training_use": [
      "rlvr",
      "agent_training",
      "evaluation",
      "test_time_compute"
    ],
    "domains": [
      "formal-math",
      "lean",
      "rl"
    ],
    "category": [
      "programmatic_math_code_proof",
      "scaling_test_time_compute_rlvr",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧾 Formal proof / Lean / theorem proving",
    "tags": [
      "formal-math",
      "lean",
      "rl",
      "seeded-from-bib"
    ],
    "one_line_summary": "DeepSeek-Prover-V1.5 adds proof-assistant feedback, RL, and RMaxTS search on top of DeepSeek-Prover-style formal proof data.",
    "why_it_matters": "It shows how proof-assistant feedback can become both a reward signal and a search guide for formal mathematical reasoning.",
    "data_object": "Lean proof script, proof-search path, feedback signal, and verification result.; process: theorem, proof attempt, Lean feedback, reward, search node, final proof, pass/fail result.; Lean 4 proof assistant plus RMaxTS search procedure.",
    "feedback_verifier": "proof assistant feedback used for RL and search selection.",
    "audit_focus": "Search budget can dominate model quality., Checker feedback is sparse and version-dependent., RL can optimize toward easy theorem families.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2408.08152",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2408.08152",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/deepseek-ai/DeepSeek-Prover-V1.5",
      "data": null,
      "huggingface": "https://huggingface.co/deepseek-ai/DeepSeek-Prover-V1.5-RL",
      "project": "https://github.com/deepseek-ai/DeepSeek-Prover-V1.5",
      "bibtex": null,
      "card": "cards/recipes/deepseek-prover-v1-5.md"
    },
    "primary_link": "https://arxiv.org/abs/2408.08152"
  },
  {
    "id": "deepseekmath-pushing-the-limits-of-mathematical-reasoning-in-open-language-model-2024",
    "title": "DeepSeekMath: Pushing the limits of mathematical reasoning in open language models",
    "year": 2024,
    "venue": "arXiv",
    "authors": [
      "Zhihong Shao",
      "Peiyi Wang",
      "Qihao Zhu",
      "Runxin Xu",
      "Junxiao Song",
      "Xiao Bi",
      "Haowei Zhang",
      "Mingchuan Zhang",
      "Y. K. Li",
      "Y. Wu",
      "Daya Guo"
    ],
    "source_role": [
      "model_report",
      "construction_recipe",
      "scaling_study"
    ],
    "verification_contract": [
      "programmatic",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level",
      "scalar_reward"
    ],
    "training_use": [
      "sft",
      "rlvr",
      "evaluation",
      "test_time_compute"
    ],
    "domains": [
      "math",
      "rlvr",
      "web-data"
    ],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr",
      "frontier_model_reports"
    ],
    "subfield": "🧮 Math RLVR datasets",
    "tags": [
      "math",
      "rlvr",
      "seeded-from-bib",
      "web-data"
    ],
    "one_line_summary": "DeepSeekMath combines math-focused web-data selection with SFT, GRPO-style RL, and self-consistency evaluation for open mathematical reasoning.",
    "why_it_matters": "It is an important bridge from data selection to RLVR: performance gains are attributed to both a math pretraining corpus and a more memory-efficient policy-optimization recipe.",
    "data_object": "natural-language mathematical solution plus final answer, sometimes sampled multiple times.; process: data-selection score, training stage, problem, solution, final answer, verifier/evaluation result, sampling count.; offline math training and benchmark evaluation pipeline.",
    "feedback_verifier": "answer correctness and GRPO-style reward over math tasks.",
    "audit_focus": "Self-consistency can hide weak single-sample accuracy., Web-data mining may import benchmark leakage., Final-answer rewards can miss flawed derivations.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2402.03300",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2402.03300",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/deepseek-ai/deepseek-math",
      "data": null,
      "huggingface": "https://huggingface.co/collections/deepseek-ai/deepseek-math",
      "project": "https://github.com/deepseek-ai/deepseek-math",
      "bibtex": null,
      "card": "cards/recipes/deepseekmath.md"
    },
    "primary_link": "https://arxiv.org/abs/2402.03300"
  },
  {
    "id": "does-writing-with-lms-reduce-content-diversity-2024",
    "title": "Does writing with LMs reduce content diversity?",
    "year": 2024,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🧪 Verifier gaming",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "frontiermath-a-benchmark-for-evaluating-advanced-mathematical-reasoning-in-ai-2024",
    "title": "FrontierMath: A benchmark for evaluating advanced mathematical reasoning in AI",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof",
      "frontier_model_reports",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧰 Programmatic benchmarks",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "goedel-prover-a-frontier-model-for-open-source-automated-theorem-proving-2024",
    "title": "Goedel-Prover: A frontier model for open-source automated theorem proving",
    "year": 2024,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports"
    ],
    "subfield": "🧾 Formal proof / Lean / theorem proving",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "gsm-symbolic-2024",
    "title": "GSM-Symbolic",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧪 Verifier gaming",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "harmbench-2024",
    "title": "HarmBench",
    "year": 2024,
    "venue": "ICML",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "helpsteer2-open-source-preference-data-for-helpfulness-and-safety-2024",
    "title": "HelpSteer2: Open-source preference data for helpfulness and safety",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🤝 Human preference data / RLHF",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "introducing-swe-bench-verified-2024",
    "title": "Introducing SWE-bench Verified",
    "year": 2024,
    "venue": "OpenAI / SWE-bench report",
    "authors": [
      "SWE-bench Team",
      "OpenAI Preparedness"
    ],
    "source_role": [
      "benchmark",
      "agent_environment",
      "audit_failure"
    ],
    "verification_contract": [
      "environmental",
      "programmatic"
    ],
    "supervision_granularity": [
      "full_episode",
      "state_action_level"
    ],
    "training_use": [
      "evaluation",
      "agent_training",
      "audit"
    ],
    "domains": [
      "software-engineering",
      "agents",
      "unit-tests"
    ],
    "category": [
      "environmental_agents_tools_web_swe",
      "programmatic_math_code_proof",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧑‍💻 SWE/repository agents",
    "tags": [
      "agents",
      "seeded-from-bib",
      "software-engineering",
      "unit-tests"
    ],
    "one_line_summary": "SWE-bench Verified is a human-filtered 500-instance subset of SWE-bench designed to reduce ambiguous, unsolvable, or incorrectly tested software-engineering tasks.",
    "why_it_matters": "It is now a central coding-agent evaluation surface because the data object contains issue text, repository state, proposed patch, and test-backed success criteria.",
    "data_object": "patch diff applied to a repository plus test execution results.; process: repository, issue, base commit, patch, FAIL TO PASS tests, PASS TO PASS tests, human validation notes.; Dockerized repository checkout and unit-test harness.",
    "feedback_verifier": "post-patch unit tests plus human filtering of task validity.",
    "audit_focus": "Leaderboard scores can depend strongly on scaffold design., Tests may not cover all acceptable patches., Public benchmark tasks can become training targets over time.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://openai.com/index/introducing-swe-bench-verified/",
      "venue": "https://www.swebench.com/verified.html",
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/swe-bench/SWE-bench",
      "data": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified",
      "huggingface": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified",
      "project": "https://www.swebench.com/verified.html",
      "bibtex": null,
      "card": "cards/agents/swe-bench-verified.md"
    },
    "primary_link": "https://openai.com/index/introducing-swe-bench-verified/"
  },
  {
    "id": "is-model-collapse-inevitable-2024",
    "title": "Is model collapse inevitable?",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🧯 Contamination / evaluation surveys",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "lab-bench-measuring-capabilities-of-language-models-for-biology-research-2024",
    "title": "LAB-Bench: Measuring capabilities of language models for biology research",
    "year": 2024,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "large-language-monkeys-scaling-inference-compute-with-repeated-sampling-2024",
    "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling",
    "year": 2024,
    "venue": "arXiv preprint arXiv:2407.21787",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2407.21787",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2407.21787",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2407.21787"
  },
  {
    "id": "learning-to-reason-with-llms-2024",
    "title": "Learning to reason with LLMs",
    "year": 2024,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "frontier_model_reports"
    ],
    "subfield": "🧠 Reasoning LLM surveys",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "livebench-a-challenging-contamination-free-benchmark-for-large-language-models-2024",
    "title": "LiveBench: A challenging, contamination-free benchmark for large language models",
    "year": 2024,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "benchmark",
      "audit_failure"
    ],
    "verification_contract": [
      "programmatic",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "evaluation",
      "audit"
    ],
    "domains": [
      "evaluation",
      "math",
      "code",
      "reasoning"
    ],
    "category": [
      "surveys_and_primers",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧯 Contamination / evaluation surveys",
    "tags": [
      "seeded-from-bib",
      "livebench",
      "contamination",
      "benchmark",
      "evaluation"
    ],
    "one_line_summary": "LiveBench is a frequently updated, contamination-limited benchmark that uses recent sources and objective scoring across math, code, reasoning, language, instruction following, and data analysis.",
    "why_it_matters": "It gives reasoning-data readers a benchmark-refresh pattern for separating real progress from memorized or stale evaluation items.",
    "data_object": "answer level",
    "feedback_verifier": "programmatic, mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2406.19314",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2406.19314",
      "openreview": "https://openreview.net/forum?id=sKYHBTAxVa",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": "https://livebench.ai/",
      "bibtex": null,
      "card": "cards/failures/livebench-a-challenging-contamination-free-benchmark-for-large-language-models.md"
    },
    "primary_link": "https://arxiv.org/abs/2406.19314"
  },
  {
    "id": "livecodebench-holistic-and-contamination-free-evaluation-of-large-language-model-2024",
    "title": "LiveCodeBench: Holistic and contamination-free evaluation of large language models for code",
    "year": 2024,
    "venue": "arXiv",
    "authors": [
      "Naman Jain",
      "King Han",
      "Alex Gu",
      "Wen-Ding Li",
      "Fanjia Yan",
      "Tianjun Zhang",
      "Sida Wang",
      "Armando Solar-Lezama",
      "Koushik Sen",
      "Ion Stoica"
    ],
    "source_role": [
      "benchmark",
      "audit_failure"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level",
      "full_episode"
    ],
    "training_use": [
      "evaluation",
      "audit",
      "test_time_compute"
    ],
    "domains": [
      "code",
      "contamination",
      "execution"
    ],
    "category": [
      "programmatic_math_code_proof",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "💻 Code execution / unit-test data",
    "tags": [
      "code",
      "contamination",
      "execution",
      "seeded-from-bib"
    ],
    "one_line_summary": "LiveCodeBench continuously collects recent programming problems to evaluate code generation, execution, repair, and test-output prediction under lower contamination risk.",
    "why_it_matters": "It gives code-reasoning evaluation a moving-time-window design, making it harder to confuse memorized public problems with genuine coding capability.",
    "data_object": "program submission or code-related output evaluated by tests or task-specific checks.; process: problem release date, platform, prompt, generated code, tests, pass/fail result, evaluation window.; code execution and benchmark leaderboard infrastructure.",
    "feedback_verifier": "programmatic tests and task-specific correctness checks.",
    "audit_focus": "Live benchmarks still become stale after release., Execution settings can affect pass/fail outcomes., Public leaderboard feedback can shape future training.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2403.07974",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2403.07974",
      "openreview": "https://openreview.net/forum?id=chfJJYC3iL",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/livecodebench/livecodebench",
      "data": "https://github.com/livecodebench/livecodebench",
      "huggingface": null,
      "project": "https://livecodebench.github.io/",
      "bibtex": null,
      "card": "cards/benchmarks/livecodebench.md"
    },
    "primary_link": "https://arxiv.org/abs/2403.07974"
  },
  {
    "id": "magicoder-empowering-code-generation-with-oss-instruct-2024",
    "title": "Magicoder: Empowering code generation with OSS-instruct",
    "year": 2024,
    "venue": "ICML",
    "authors": [
      "Yuxiang Wei",
      "Zhe Wang",
      "Jiawei Liu",
      "Yifeng Ding",
      "Lingming Zhang"
    ],
    "source_role": [
      "construction_recipe",
      "data_release",
      "model_report"
    ],
    "verification_contract": [
      "programmatic",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level",
      "step_level"
    ],
    "training_use": [
      "sft",
      "evaluation",
      "distillation"
    ],
    "domains": [
      "code",
      "synthetic-data",
      "open-source-context"
    ],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data",
      "benchmarks_evaluation"
    ],
    "subfield": "💻 Code execution / unit-test data",
    "tags": [
      "code",
      "open-source-context",
      "seeded-from-bib",
      "synthetic-data"
    ],
    "one_line_summary": "Magicoder introduces OSS-Instruct, a code-data recipe that uses open-source code snippets to generate more realistic instruction data.",
    "why_it_matters": "It is a strong code-data construction example because it grounds synthetic coding instructions in real open-source references rather than free-floating prompt invention.",
    "data_object": "instruction-response coding example, often linked to a code reference or task scaffold.; process: source snippet, generated instruction, solution response, model family, benchmark result.; offline code-data generation and code benchmark evaluation.",
    "feedback_verifier": "coding benchmark pass rates and optional executable checks.",
    "audit_focus": "Synthetic code tasks can inherit license issues., Reference snippets may leak benchmark patterns., Teacher-generated solutions can be plausible but wrong.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2312.02120",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2312.02120",
      "openreview": null,
      "acl": null,
      "pmlr": "https://proceedings.mlr.press/v235/wei24h.html",
      "cvf": null,
      "doi": null,
      "code": "https://github.com/ise-uiuc/magicoder",
      "data": "https://huggingface.co/datasets/ise-uiuc/Magicoder-OSS-Instruct-75K",
      "huggingface": "https://huggingface.co/ise-uiuc/Magicoder-S-DS-6.7B",
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/magicoder.md"
    },
    "primary_link": "https://arxiv.org/abs/2312.02120"
  },
  {
    "id": "mammoth2-scaling-instructions-from-the-web-2024",
    "title": "MAmmoTH2: Scaling Instructions from the Web",
    "year": 2024,
    "venue": "Advances in Neural Information Processing Systems (NeurIPS)",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2405.03548",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2405.03548",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2405.03548"
  },
  {
    "id": "math-shepherd-2024",
    "title": "Math-Shepherd",
    "year": 2024,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "process_supervision",
      "verifier_reward"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "step_level",
      "process_reward"
    ],
    "training_use": [
      "process_supervision",
      "reward_modeling"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "programmatic_math_code_proof",
      "process_supervision_prm",
      "scaling_test_time_compute_rlvr",
      "benchmarks_evaluation"
    ],
    "subfield": "📐 Math answer-verifiable data",
    "tags": [],
    "one_line_summary": "Rollout-value supervision method that assigns process rewards to intermediate math reasoning steps.",
    "why_it_matters": "It is the clearest bridge between final-answer verifiers and step-level PRM data: the label is not a human judgment but a rollout-derived estimate of whether a partial step can still reach the right answer.",
    "data_object": "step-level rollout-value labels; process: reasoning step, rollout result, process reward score; offline math reasoning traces",
    "feedback_verifier": "rollout-derived process reward signal",
    "audit_focus": "rollout policy strength can leak into labels, step rewards may favor locally plausible continuations, generated solutions can inherit base-model shortcuts",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2312.08935",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2312.08935",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/math_shepherd.md"
    },
    "primary_link": "https://arxiv.org/abs/2312.08935"
  },
  {
    "id": "opencodeinterpreter-integrating-code-generation-with-execution-and-refinement-2024",
    "title": "OpenCodeInterpreter: Integrating code generation with execution and refinement",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof"
    ],
    "subfield": "💻 Code execution / unit-test data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "openmathinstruct-2-accelerating-ai-for-math-with-massive-open-source-instruction-2024",
    "title": "OpenMathInstruct-2: Accelerating AI for math with massive open-source instruction data",
    "year": 2024,
    "venue": "ICLR",
    "authors": [
      "Shubham Toshniwal",
      "Wei Du",
      "Ivan Moshkov",
      "Branislav Kisacanin",
      "Alexan Ayrapetyan",
      "Igor Gitman"
    ],
    "source_role": [
      "data_release",
      "construction_recipe",
      "scaling_study"
    ],
    "verification_contract": [
      "programmatic",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level",
      "step_level"
    ],
    "training_use": [
      "sft",
      "distillation",
      "evaluation"
    ],
    "domains": [
      "math",
      "synthetic-data",
      "instruction-tuning"
    ],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "📐 Math answer-verifiable data",
    "tags": [
      "instruction-tuning",
      "math",
      "seeded-from-bib",
      "synthetic-data"
    ],
    "one_line_summary": "OpenMathInstruct-2 releases 14M math instruction-tuning problem-solution pairs generated with an open synthesis pipeline.",
    "why_it_matters": "It is a major open math-data release for studying teacher strength, solution format, question diversity, and SFT scaling in reasoning models.",
    "data_object": "problem-solution pair with natural-language mathematical reasoning and final answer.; process: source problem, augmented problem, generated solution, teacher model, filtering metadata, dataset split.; NVIDIA NeMo-Skills generation, training, and evaluation pipeline.",
    "feedback_verifier": "answer checks and benchmark evaluation over math tasks.",
    "audit_focus": "Synthetic solutions can encode teacher shortcuts., Large scale can hide duplicated or near-duplicated questions., Verbose traces may hurt rather than help SFT.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2410.01560",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2410.01560",
      "openreview": "https://openreview.net/forum?id=mTCbq2QssD",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/NVIDIA-NeMo/Skills",
      "data": "https://huggingface.co/datasets/nvidia/OpenMathInstruct-2",
      "huggingface": "https://huggingface.co/datasets/nvidia/OpenMathInstruct-2",
      "project": "https://nvidia-nemo.github.io/Skills/releases/openmathinstruct2/",
      "bibtex": null,
      "card": "cards/releases/openmathinstruct-2.md"
    },
    "primary_link": "https://arxiv.org/abs/2410.01560"
  },
  {
    "id": "osworld-benchmarking-multimodal-agents-for-open-ended-tasks-in-real-computer-env-2024",
    "title": "OSWorld: Benchmarking multimodal agents for open-ended tasks in real computer environments",
    "year": 2024,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "benchmark",
      "agent_environment"
    ],
    "verification_contract": [
      "environmental"
    ],
    "supervision_granularity": [
      "full_episode"
    ],
    "training_use": [
      "evaluation",
      "agent_training"
    ],
    "domains": [
      "computer_use",
      "agents",
      "multimodal"
    ],
    "category": [
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🖥️ OS/desktop agents",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Open-ended computer-use benchmark that makes environment state, UI actions, and terminal outcomes central to reasoning-data evaluation.",
    "why_it_matters": "Open-ended computer-use benchmark that makes environment state, UI actions, and terminal outcomes central to reasoning-data evaluation.",
    "data_object": "GUI/OS action trajectory; process: observation, action, environment state; desktop operating-system environment",
    "feedback_verifier": "task completion evaluator",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2404.07972",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2404.07972",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/agents/osworld.md"
    },
    "primary_link": "https://arxiv.org/abs/2404.07972"
  },
  {
    "id": "overoptimization-in-direct-alignment-algorithms-2024",
    "title": "Overoptimization in direct alignment algorithms",
    "year": 2024,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "paperqa2-language-agents-achieve-superhuman-synthesis-of-scientific-knowledge-2024",
    "title": "PaperQA2 / Language agents achieve superhuman synthesis of scientific knowledge",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "prometheus-2-an-open-source-language-model-specialized-in-evaluating-other-langu-2024",
    "title": "Prometheus 2: An open source language model specialized in evaluating other language models",
    "year": 2024,
    "venue": "EMNLP",
    "authors": [
      "Seungone Kim",
      "Juyoung Suk",
      "Shayne Longpre",
      "Bill Yuchen Lin",
      "Jamin Shin",
      "Sean Welleck",
      "Graham Neubig",
      "Moontae Lee",
      "Kyungjae Lee",
      "Minjoon Seo"
    ],
    "source_role": [
      "verifier_reward",
      "model_report",
      "data_release"
    ],
    "verification_contract": [
      "judgment_required"
    ],
    "supervision_granularity": [
      "answer_level",
      "pairwise_preference",
      "scalar_reward"
    ],
    "training_use": [
      "reward_modeling",
      "preference_learning",
      "evaluation",
      "audit"
    ],
    "domains": [
      "llm-as-judge",
      "rubrics",
      "evaluation-models"
    ],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "foundations_instruction_preference_alignment",
      "benchmarks_evaluation"
    ],
    "subfield": "⚖️ LLM-as-judge data",
    "tags": [
      "evaluation-models",
      "llm-as-judge",
      "rubrics",
      "seeded-from-bib"
    ],
    "one_line_summary": "Prometheus 2 is an open evaluator model for both direct assessment and pairwise ranking under user-defined criteria.",
    "why_it_matters": "It gives the atlas a concrete open-source judge model whose training/evaluation data can be audited instead of treating proprietary judges as black boxes.",
    "data_object": "rubric-conditioned scalar score, critique, or pairwise preference output.; process: instruction, candidate response, evaluation criterion, assessment format, score/ranking, reference judgment.; open evaluator model, GitHub code, ACL software/data artifacts, and HF weights.",
    "feedback_verifier": "Prometheus 2 judge output aligned against human/proprietary-judge benchmarks.",
    "audit_focus": "Open judges can inherit rubric bias., Agreement with another judge is not the same as correctness., Pairwise and scalar formats can disagree.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2405.01535",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2405.01535",
      "openreview": null,
      "acl": "https://aclanthology.org/2024.emnlp-main.248/",
      "pmlr": null,
      "cvf": null,
      "doi": "https://doi.org/10.18653/v1/2024.emnlp-main.248",
      "code": "https://github.com/prometheus-eval/prometheus-eval",
      "data": "https://aclanthology.org/2024.emnlp-main.248.data.zip",
      "huggingface": "https://huggingface.co/prometheus-eval/prometheus-7b-v2.0",
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/prometheus-2.md"
    },
    "primary_link": "https://arxiv.org/abs/2405.01535"
  },
  {
    "id": "qwen2-5-math-technical-report-toward-mathematical-expert-model-via-self-improvem-2024",
    "title": "Qwen2.5-Math technical report: Toward mathematical expert model via self-improvement",
    "year": 2024,
    "venue": "arXiv",
    "authors": [
      "Qwen Team"
    ],
    "source_role": [
      "model_report",
      "construction_recipe",
      "verifier_reward"
    ],
    "verification_contract": [
      "programmatic",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level",
      "scalar_reward"
    ],
    "training_use": [
      "sft",
      "reward_modeling",
      "rlvr",
      "evaluation",
      "test_time_compute"
    ],
    "domains": [
      "math",
      "tool-integrated-reasoning",
      "reward-modeling"
    ],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "📐 Math answer-verifiable data",
    "tags": [
      "math",
      "reward-modeling",
      "seeded-from-bib",
      "tool-integrated-reasoning"
    ],
    "one_line_summary": "Qwen2.5-Math reports a math-specialized model family supporting chain-style and tool-integrated reasoning with base, instruct, and reward-model variants.",
    "why_it_matters": "It is a useful model-report case where math post-training data, reward models, tool-integrated reasoning, and multilingual math evaluation are tied together.",
    "data_object": "math solution, final answer, optional tool/code execution trace, and reward-model score; process: model stage, problem source, reasoning mode, tool use, answer, reward score, benchmark result; Qwen math model family, GitHub evaluation scripts, and Hugging Face model releases.",
    "feedback_verifier": "math answer checks, reward model signals, and benchmark evaluations.",
    "audit_focus": "tool-integrated results are not comparable to no-tool results, reward models can favor format over proof validity, model-family reports can blur data and inference effects",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2409.12122",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2409.12122",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/QwenLM/Qwen2.5-Math",
      "data": null,
      "huggingface": "https://huggingface.co/Qwen/Qwen2.5-Math-7B",
      "project": "https://qwenlm.github.io/blog/qwen2.5-math/",
      "bibtex": null,
      "card": "cards/recipes/qwen2-5-math.md"
    },
    "primary_link": "https://arxiv.org/abs/2409.12122"
  },
  {
    "id": "r-tuning-2024",
    "title": "R-Tuning",
    "year": 2024,
    "venue": "NAACL",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "raft-adapting-language-model-to-domain-specific-rag-2024",
    "title": "RAFT: Adapting language model to domain-specific RAG",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data",
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "rest-mcts-2024",
    "title": "ReST-MCTS*",
    "year": 2024,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "process_supervision",
      "construction_recipe",
      "scaling_study"
    ],
    "verification_contract": [
      "programmatic",
      "mixed"
    ],
    "supervision_granularity": [
      "step_level",
      "process_reward"
    ],
    "training_use": [
      "process_supervision",
      "reward_modeling",
      "test_time_compute"
    ],
    "domains": [
      "math",
      "reasoning"
    ],
    "category": [
      "foundations_instruction_preference_alignment",
      "process_supervision_prm",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🔁 Self-training / STaR / Self-Instruct",
    "tags": [
      "seeded-from-bib",
      "primary-link-checked"
    ],
    "one_line_summary": "Process-reward-guided tree search recipe for self-training reasoning traces.",
    "why_it_matters": "It shows how a process reward can guide search-generated trajectories, so readers can separate data generation, verifier choice, and inference-budget effects.",
    "data_object": "reasoning trajectory with intermediate search states; process: node state, rollout candidate, process reward score; MCTS-style reasoning tree",
    "feedback_verifier": "process reward guided tree search",
    "audit_focus": "search policy may overfit process reward artifacts, accepted traces can hide rejected rollout distribution, inference budget may be conflated with data quality",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2406.03816",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2406.03816",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2406.03816"
  },
  {
    "id": "rewardbench-evaluating-reward-models-for-language-modeling-2024",
    "title": "RewardBench: Evaluating Reward Models for Language Modeling",
    "year": 2024,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "benchmark",
      "verifier_reward"
    ],
    "verification_contract": [
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "pairwise_preference",
      "scalar_reward"
    ],
    "training_use": [
      "evaluation",
      "reward_modeling",
      "preference_learning"
    ],
    "domains": [
      "preference",
      "safety",
      "chat"
    ],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧪 Rubric reward models",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "RewardBench evaluates reward models on prompt/chosen/rejected trios spanning chat, reasoning, and safety, including structured preference failures.",
    "why_it_matters": "It helps readers test whether a reward signal generalizes beyond helpfulness style into subtle factual, reasoning, refusal, and safety preferences.",
    "data_object": "pairwise or scalar reward decisions; process: prompt, chosen/rejected response, reward model score; offline preference benchmark",
    "feedback_verifier": "reward model or judge",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2403.13787",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2403.13787",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/rewardbench.md"
    },
    "primary_link": "https://arxiv.org/abs/2403.13787"
  },
  {
    "id": "rewarding-progress-scaling-automated-process-verifiers-for-llm-reasoning-2024",
    "title": "Rewarding progress: Scaling automated process verifiers for LLM reasoning",
    "year": 2024,
    "venue": "ICLR",
    "authors": [
      "Amrith Setlur",
      "Chirag Nagpal",
      "Adam Fisch",
      "Xinyang Geng",
      "Jacob Eisenstein",
      "Rishabh Agarwal",
      "Alekh Agarwal",
      "Jonathan Berant",
      "Aviral Kumar"
    ],
    "source_role": [
      "verifier_reward",
      "process_supervision",
      "scaling_study"
    ],
    "verification_contract": [
      "programmatic",
      "mixed"
    ],
    "supervision_granularity": [
      "step_level",
      "process_reward",
      "scalar_reward"
    ],
    "training_use": [
      "process_supervision",
      "reward_modeling",
      "rlvr",
      "test_time_compute"
    ],
    "domains": [
      "process-reward-models",
      "rlvr",
      "math"
    ],
    "category": [
      "process_supervision_prm",
      "scaling_test_time_compute_rlvr",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🔁 Rollout-value supervision",
    "tags": [
      "math",
      "process-reward-models",
      "rlvr",
      "seeded-from-bib"
    ],
    "one_line_summary": "Rewarding Progress proposes Process Advantage Verifiers that score whether a reasoning step increases future correctness probability.",
    "why_it_matters": "It gives process supervision a concrete target beyond dense labels: measure progress under a prover policy and use that signal for search or RL.",
    "data_object": "step-level process advantage score plus final answer/correctness signal; process: problem, partial trace before step, step, future success estimate, verifier score, final outcome; reasoning search and online RL setup using process rewards.",
    "feedback_verifier": "Process Advantage Verifier trained to predict progress toward correct answer.",
    "audit_focus": "progress estimates can be policy-specific, verifier-guided search can exploit reward artifacts, step rewards can encourage short-term progress that hurts final correctness",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2410.08146",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2410.08146",
      "openreview": "https://openreview.net/forum?id=A6Y7AqlzLW",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/rewarding-progress.md"
    },
    "primary_link": "https://arxiv.org/abs/2410.08146"
  },
  {
    "id": "scicode-a-benchmark-for-scientific-code-generation-and-reasoning-2024",
    "title": "SciCode: A benchmark for scientific code generation and reasoning",
    "year": 2024,
    "venue": "NeurIPS Datasets and Benchmarks",
    "authors": [
      "Minyang Tian",
      "Luyu Gao",
      "Shizhuo Dylan Zhang",
      "Xinan Chen",
      "Cunwei Fan",
      "Xuefei Guo",
      "Roland Haas",
      "Pan Ji",
      "Kittithat Krongchon",
      "Yao Li",
      "Shengyan Liu",
      "Di Luo",
      "Yutao Ma",
      "Hao Tong",
      "Kha Trinh",
      "Chenyu Tian",
      "Zihan Wang",
      "Bohao Wu",
      "Yanyu Xiong",
      "Shengzhu Yin",
      "Minhui Zhu",
      "Kilian Lieret",
      "Yanxin Lu",
      "Genglin Liu",
      "Yufeng Du",
      "Tianhua Tao",
      "Ofir Press",
      "Jamie Callan",
      "Eliu Huerta",
      "Hao Peng"
    ],
    "source_role": [
      "benchmark",
      "data_release"
    ],
    "verification_contract": [
      "programmatic",
      "judgment_required"
    ],
    "supervision_granularity": [
      "answer_level",
      "step_level"
    ],
    "training_use": [
      "evaluation",
      "audit",
      "test_time_compute"
    ],
    "domains": [
      "scientific-code",
      "research-problems",
      "execution"
    ],
    "category": [
      "programmatic_math_code_proof",
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧰 Programmatic benchmarks",
    "tags": [
      "execution",
      "research-problems",
      "scientific-code",
      "seeded-from-bib"
    ],
    "one_line_summary": "SciCode evaluates code generation for realistic scientific research problems decomposed into subproblems with tests and gold solutions.",
    "why_it_matters": "It is a high-quality scientific-code benchmark where the data object links domain context, subproblem decomposition, code synthesis, and executable tests.",
    "data_object": "code solution evaluated with scientist-annotated tests or expected outputs; process: domain, main problem, subproblem, background text, generated code, tests, gold solution, pass/fail; scientific Python/code execution benchmark harness.",
    "feedback_verifier": "test cases and scientist-curated gold solutions.",
    "audit_focus": "models can pass narrow tests without scientific validity, domain background can leak solution hints, execution environments can change numerical results",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2407.13168",
      "venue": "https://proceedings.neurips.cc/paper_files/paper/2024/hash/36850592258c8c41cecdaa3dea5ff7de-Abstract-Datasets_and_Benchmarks_Track.html",
      "arxiv": "https://arxiv.org/abs/2407.13168",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/scicode-bench/SciCode",
      "data": null,
      "huggingface": null,
      "project": "https://scicode-bench.github.io/",
      "bibtex": null,
      "card": "cards/benchmarks/scicode.md"
    },
    "primary_link": "https://arxiv.org/abs/2407.13168"
  },
  {
    "id": "self-rewarding-lms-2024",
    "title": "Self-Rewarding LMs",
    "year": 2024,
    "venue": "ICML",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "🤖 RLAIF / synthetic feedback",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "spin-self-play-fine-tuning-converts-weak-language-models-to-strong-language-mode-2024",
    "title": "SPIN: Self-play fine-tuning converts weak language models to strong language models",
    "year": 2024,
    "venue": "ICML",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "⚖️ DPO / preference optimization",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "swe-gym-advancing-software-engineering-agents-with-training-and-evaluation-envir-2024",
    "title": "SWE-Gym: Advancing software engineering agents with training and evaluation environments",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "🧑‍💻 SWE/repository agents",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "swe-search-2024",
    "title": "SWE-Search",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "🧑‍💻 SWE/repository agents",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "tau-bench-a-benchmark-for-tool-agent-user-interaction-in-real-world-domains-2024",
    "title": "tau-bench: A benchmark for tool-agent-user interaction in real-world domains",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🛠️ Tool-use data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "the-llama-3-herd-of-models-2024",
    "title": "The Llama 3 Herd of models",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "frontier_model_reports"
    ],
    "subfield": "🧠 Magistral / Phi / Nemotron style reports",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "toolace-winning-the-points-of-llm-function-calling-2024",
    "title": "ToolACE: Winning the points of LLM function calling",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe"
    ],
    "subfield": "🛠️ Tool-use data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "toolllm-facilitating-large-language-models-to-master-16000-real-world-apis-2024",
    "title": "ToolLLM: Facilitating large language models to master 16000+ real-world APIs",
    "year": 2024,
    "venue": "ICLR",
    "authors": [],
    "source_role": [
      "data_release",
      "benchmark",
      "agent_environment"
    ],
    "verification_contract": [
      "environmental",
      "programmatic"
    ],
    "supervision_granularity": [
      "state_action_level"
    ],
    "training_use": [
      "sft",
      "agent_training",
      "evaluation"
    ],
    "domains": [
      "tools",
      "apis",
      "agents"
    ],
    "category": [
      "environmental_agents_tools_web_swe",
      "construction_recipes_open_reasoning_data",
      "benchmarks_evaluation"
    ],
    "subfield": "🛠️ Tool-use data",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "ToolLLM builds ToolBench, an instruction-tuning dataset of tool-call trajectories over 16000+ real-world APIs, and uses it to train and evaluate tool-use models.",
    "why_it_matters": "Tool-use data and ToolBench-style evaluation show how API calls become the reasoning trace and how tool responses anchor feedback.",
    "data_object": "tool-call trajectory plus final response; process: API call, arguments, tool response; real-world API/tool catalog",
    "feedback_verifier": "tool response validity and task success checks",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2307.16789",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2307.16789",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/agents/toolllm_toolbench.md"
    },
    "primary_link": "https://arxiv.org/abs/2307.16789"
  },
  {
    "id": "toolsandbox-a-stateful-conversational-interactive-evaluation-benchmark-for-tool--2024",
    "title": "ToolSandbox: A stateful, conversational, interactive evaluation benchmark for tool use",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🛠️ Tool-use data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "tulu-3-pushing-frontiers-in-open-language-model-post-training-2024",
    "title": "Tulu 3: Pushing frontiers in open language model post-training",
    "year": 2024,
    "venue": "arXiv",
    "authors": [
      "Nathan Lambert",
      "Jacob Morrison",
      "Valentina Pyatkin",
      "Shengyi Huang",
      "Hamish Ivison",
      "Faeze Brahman",
      "Lester James V. Miranda",
      "Alisa Liu",
      "Nouha Dziri",
      "Shane Lyu",
      "Yuling Gu",
      "Saumya Malik",
      "Victoria Graf",
      "Jena D. Hwang",
      "Jiangjiang Yang",
      "Ronan Le Bras",
      "Oyvind Tafjord",
      "Chris Wilhelm",
      "Luca Soldaini",
      "Noah A. Smith",
      "Yizhong Wang",
      "Pradeep Dasigi",
      "Hannaneh Hajishirzi"
    ],
    "source_role": [
      "model_report",
      "construction_recipe",
      "data_release"
    ],
    "verification_contract": [
      "mixed",
      "programmatic",
      "judgment_required"
    ],
    "supervision_granularity": [
      "answer_level",
      "pairwise_preference",
      "scalar_reward"
    ],
    "training_use": [
      "sft",
      "preference_learning",
      "rlvr",
      "evaluation"
    ],
    "domains": [
      "open-post-training",
      "instruction-tuning",
      "rlvr"
    ],
    "category": [
      "frontier_model_reports",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr",
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "🧪 RLVR recipe reports",
    "tags": [
      "instruction-tuning",
      "open-post-training",
      "rlvr",
      "seeded-from-bib"
    ],
    "one_line_summary": "Tulu 3 releases an open post-training stack with SFT data, preference data, RLVR recipes, code, models, and evaluation guidance.",
    "why_it_matters": "It is one of the clearest open references for modern post-training pipelines because it exposes data mixtures, objectives, decontamination, evaluation, and training infrastructure together.",
    "data_object": "instruction-response examples, preference pairs, verifiable task outputs, and model-evaluation records; process: dataset shard, objective stage, prompt, response, preference label or reward, evaluation split, decontamination status; open-instruct training/evaluation stack and Hugging Face dataset/model releases.",
    "feedback_verifier": "mixture of preference labels, reward models, and verifiable rewards depending on stage.",
    "audit_focus": "Full-stack releases can obscure which component caused a gain. Evaluation suites can leak into data curation loops. RLVR improvements may be domain-specific.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2411.15124",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2411.15124",
      "openreview": "https://openreview.net/forum?id=i1uGbfHHpH",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/allenai/open-instruct",
      "data": "https://huggingface.co/collections/allenai/tulu-3-datasets",
      "huggingface": "https://huggingface.co/collections/allenai/tulu-3-datasets",
      "project": "https://allenai.org/blog/tulu-3-technical",
      "bibtex": null,
      "card": "cards/recipes/tulu-3.md"
    },
    "primary_link": "https://arxiv.org/abs/2411.15124"
  },
  {
    "id": "visualwebarena-evaluating-multimodal-agents-on-realistic-visual-web-tasks-2024",
    "title": "VisualWebArena: Evaluating multimodal agents on realistic visual web tasks",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "benchmark",
      "agent_environment"
    ],
    "verification_contract": [
      "environmental"
    ],
    "supervision_granularity": [
      "full_episode"
    ],
    "training_use": [
      "evaluation",
      "agent_training"
    ],
    "domains": [
      "web",
      "vision",
      "agents"
    ],
    "category": [
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "🌍 Web/browser agents",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Extends WebArena-style evaluation toward visual web interaction, useful when reasoning data includes screenshots and UI state.",
    "why_it_matters": "Extends WebArena-style evaluation toward visual web interaction, useful when reasoning data includes screenshots and UI state.",
    "data_object": "visual web tasks with screenshots and browser state",
    "feedback_verifier": "task success checks",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "partial",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "weblinx-real-world-website-navigation-with-multi-turn-dialogue-2024",
    "title": "WebLINX: Real-world website navigation with multi-turn dialogue",
    "year": 2024,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "🌍 Web/browser agents",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "wildguard-2024",
    "title": "WildGuard",
    "year": 2024,
    "venue": "NeurIPS",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🛡️ Safety reasoning data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "workarena-how-capable-are-web-agents-at-solving-common-knowledge-work-tasks-2024",
    "title": "WorkArena: How capable are web agents at solving common knowledge work tasks?",
    "year": 2024,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "🌍 Web/browser agents",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "xstest-2024",
    "title": "XSTest",
    "year": 2024,
    "venue": "NAACL",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "1-4-million-open-source-distilled-reasoning-dataset-to-empower-large-language-mo-2025",
    "title": "1.4 Million Open-Source Distilled Reasoning Dataset to Empower Large Language Model Training (AM-DeepSeek-R1-Distilled)",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2503.19633",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment",
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2503.19633",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2503.19633",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2503.19633"
  },
  {
    "id": "1-shot-rlvr-learning-reasoning-with-minimal-verifiable-data-2025",
    "title": "1-shot RLVR: Learning reasoning with minimal verifiable data",
    "year": 2025,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🏋️ RLVR optimization scaling",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "a-sober-look-at-progress-in-language-model-reasoning-pitfalls-and-paths-to-repro-2025",
    "title": "A Sober Look at Progress in Language Model Reasoning: Pitfalls and Paths to Reproducibility",
    "year": 2025,
    "venue": "Conference on Language Modeling (COLM)",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🧯 Contamination / evaluation surveys",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Audits reasoning-model progress claims by showing that benchmark results can be highly sensitive to decoding, seeds, prompt format, and environment details.",
    "why_it_matters": "It is an audit anchor for this atlas: reasoning-data claims need reproducible evaluation settings, not just headline benchmark gains.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2504.07086",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2504.07086",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/a-sober-look-at-progress-in-language-model-reasoning-pitfalls-and-paths-to-reproducibility.md"
    },
    "primary_link": "https://arxiv.org/abs/2504.07086"
  },
  {
    "id": "a-survey-on-llm-mid-training-2025",
    "title": "A Survey on LLM Mid-Training",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2510.23081",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "🧭 Post-training surveys",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2510.23081",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2510.23081",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2510.23081"
  },
  {
    "id": "absolute-zero-reinforced-self-play-reasoning-with-zero-data-2025",
    "title": "Absolute Zero: Reinforced Self-play Reasoning with Zero Data",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2505.03335",
    "authors": [],
    "source_role": [
      "construction_recipe",
      "scaling_study"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "rlvr",
      "evaluation"
    ],
    "domains": [
      "math",
      "code"
    ],
    "category": [
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr",
      "programmatic_math_code_proof"
    ],
    "subfield": "🔁 Self-play / self-improvement",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Self-play RLVR recipe where the model proposes and solves tasks without external training data, using verifiable feedback to ground the loop.",
    "why_it_matters": "Self-play RLVR recipe where the model proposes and solves tasks without external training data, using verifiable feedback to ground the loop.",
    "data_object": "generated task, solution, and verified answer; process: proposed task, solution, verifier result; code executor / verifiable task substrate",
    "feedback_verifier": "executor-backed verifiable reward",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2505.03335",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2505.03335",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/absolute_zero.md"
    },
    "primary_link": "https://arxiv.org/abs/2505.03335"
  },
  {
    "id": "abstentionbench-2025",
    "title": "AbstentionBench",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "benchmark",
      "audit_failure"
    ],
    "verification_contract": [
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "evaluation",
      "safety_alignment"
    ],
    "domains": [
      "abstention",
      "factuality",
      "uncertainty"
    ],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🛡️ Safety reasoning data",
    "tags": [],
    "one_line_summary": "AbstentionBench evaluates whether LLMs know when not to answer across unknown, underspecified, false-premise, subjective, and stale-information questions.",
    "why_it_matters": "It is a direct audit surface for reasoning models: stronger reasoning can still fail if the model confidently answers unanswerable questions instead of abstaining.",
    "data_object": "model response, abstention decision, and correctness/abstention judgment; process: scenario type, source dataset, answerability label, judge/validation metadata; offline benchmark with model-evaluation harness.",
    "feedback_verifier": "human-validated judges and benchmark labels for abstention scenarios.",
    "audit_focus": "A model can game abstention by refusing too often. Benchmark labels around subjectivity and underspecification can be ambiguous. Prompt tuning may improve benchmark score without improving epistemic reasoning.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2506.09038",
      "venue": "https://proceedings.neurips.cc/paper_files/paper/2025/hash/fb122bfc3f0127a94ded048b5b03496f-Abstract-Datasets_and_Benchmarks_Track.html",
      "arxiv": "https://arxiv.org/abs/2506.09038",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/facebookresearch/AbstentionBench",
      "data": "https://huggingface.co/datasets/facebook/AbstentionBench",
      "huggingface": "https://huggingface.co/datasets/facebook/AbstentionBench",
      "project": null,
      "bibtex": null,
      "card": "cards/benchmarks/abstentionbench.md"
    },
    "primary_link": "https://arxiv.org/abs/2506.09038"
  },
  {
    "id": "abstentionbench-reasoning-llms-fail-on-unanswerable-questions-2025",
    "title": "AbstentionBench: Reasoning LLMs Fail on Unanswerable Questions",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2506.09038",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2506.09038",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2506.09038",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2506.09038"
  },
  {
    "id": "acord-attorney-curated-open-research-dataset-2025",
    "title": "ACORD: Attorney-curated open research dataset",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "aegis2-0-a-diverse-ai-safety-dataset-and-risks-taxonomy-for-alignment-of-llm-gua-2025",
    "title": "Aegis2.0: A Diverse AI Safety Dataset and Risks Taxonomy for Alignment of LLM Guardrails",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2501.09004",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2501.09004",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2501.09004",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2501.09004"
  },
  {
    "id": "aegis2-2025",
    "title": "Aegis2.0",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "data_release",
      "benchmark",
      "verifier_reward"
    ],
    "verification_contract": [
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level",
      "scalar_reward"
    ],
    "training_use": [
      "safety_alignment",
      "evaluation",
      "reward_modeling"
    ],
    "domains": [
      "safety",
      "guardrails",
      "alignment"
    ],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧑‍⚖️ Human/expert judgment",
    "tags": [],
    "one_line_summary": "Aegis2.0 releases a human-annotated AI-safety dataset and risk taxonomy for training and evaluating LLM guardrails.",
    "why_it_matters": "It is useful for reasoning-data readers because safety alignment often depends on rubric-like hazard labels, response-pair judgments, and guard-model training data rather than exact-answer verification.",
    "data_object": "prompt or prompt-response sample with hazard taxonomy labels and safety annotations; process: hazard category, fine-grained risk label, response-safety decision, split metadata when using the dataset release; offline guardrail training/evaluation dataset.",
    "feedback_verifier": "risk labels and guard-model evaluation signal.",
    "audit_focus": "Taxonomy labels can hide disagreement between annotators or judge models. Safety datasets can overfit visible hazard categories and miss emerging harms. Guardrail training may trade helpfulness for over-refusal if topic-following data is not tracked.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2501.09004",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2501.09004",
      "openreview": null,
      "acl": "https://aclanthology.org/2025.naacl-long.306/",
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": "https://huggingface.co/datasets/nvidia/Aegis-AI-Content-Safety-Dataset-2.0",
      "huggingface": "https://huggingface.co/datasets/nvidia/Aegis-AI-Content-Safety-Dataset-2.0",
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/aegis2.md"
    },
    "primary_link": "https://arxiv.org/abs/2501.09004"
  },
  {
    "id": "alphaevolve-a-coding-agent-for-scientific-and-algorithmic-discovery-2025",
    "title": "AlphaEvolve: A coding agent for scientific and algorithmic discovery",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2506.13131",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2506.13131",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2506.13131",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2506.13131"
  },
  {
    "id": "am-thinking-v1-advancing-the-frontier-of-reasoning-at-32b-scale-2025",
    "title": "AM-Thinking-v1: Advancing the Frontier of Reasoning at 32B Scale",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2505.08311",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment",
      "frontier_model_reports"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2505.08311",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2505.08311",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2505.08311"
  },
  {
    "id": "androidworld-a-dynamic-benchmarking-environment-for-autonomous-agents-2025",
    "title": "AndroidWorld: A dynamic benchmarking environment for autonomous agents",
    "year": 2024,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "benchmark",
      "agent_environment"
    ],
    "verification_contract": [
      "environmental"
    ],
    "supervision_granularity": [
      "full_episode",
      "state_action_level"
    ],
    "training_use": [
      "evaluation",
      "agent_training"
    ],
    "domains": [
      "mobile",
      "agents"
    ],
    "category": [
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "📱 App/mobile agents",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Android tasks turn mobile UI state and action histories into evaluable agent trajectories.",
    "why_it_matters": "Android tasks turn mobile UI state and action histories into evaluable agent trajectories.",
    "data_object": "Android action trajectory; process: screen observation, UI action, state transition; Android device/emulator environment",
    "feedback_verifier": "task-specific success evaluator",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2405.14573",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2405.14573",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/agents/androidworld.md"
    },
    "primary_link": "https://arxiv.org/abs/2405.14573"
  },
  {
    "id": "beyond-correctness-harmonizing-process-and-outcome-rewards-through-rl-training-p-2025",
    "title": "Beyond Correctness: Harmonizing Process and Outcome Rewards through RL Training (PROF)",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2509.03403",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment",
      "process_supervision_prm"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2509.03403",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2509.03403",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2509.03403"
  },
  {
    "id": "big-math-2025",
    "title": "Big-Math-RL-Verified",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "data_release",
      "benchmark"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "rlvr",
      "sft",
      "evaluation"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr",
      "benchmarks_evaluation"
    ],
    "subfield": "🧮 Math RLVR datasets",
    "tags": [
      "curated-card",
      "primary-link-checked"
    ],
    "one_line_summary": "Large-scale math release useful for studying answer verification, false negatives, and RLVR-ready filtering.",
    "why_it_matters": "Large-scale math release useful for studying answer verification, false negatives, and RLVR-ready filtering.",
    "data_object": "math problem, answer, and verification signal; process: problem, answer, verification label; offline math verifier substrate",
    "feedback_verifier": "answer-level math verifier",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2502.17387",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2502.17387",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/big_math.md"
    },
    "primary_link": "https://arxiv.org/abs/2502.17387"
  },
  {
    "id": "big-math-a-large-scale-high-quality-math-dataset-for-reinforcement-learning-and--2025",
    "title": "Big-math: A large-scale, high-quality math dataset for reinforcement learning and supervised fine-tuning",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr",
      "benchmarks_evaluation"
    ],
    "subfield": "📐 Math answer-verifiable data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "chemllmbench-and-chemistry-reasoning-evaluations-for-language-models-2025",
    "title": "ChemLLMBench and chemistry reasoning evaluations for language models",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "benchmarks_evaluation"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "clip-low-increases-entropy-and-clip-high-decreases-entropy-in-reinforcement-lear-2025",
    "title": "Clip-Low Increases Entropy and Clip-High Decreases Entropy in Reinforcement Learning of Large Language Models",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2509.26114",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2509.26114",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2509.26114",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2509.26114"
  },
  {
    "id": "cognitive-behaviors-that-enable-self-improving-reasoners-or-four-habits-of-highl-2025",
    "title": "Cognitive Behaviors that Enable Self-Improving Reasoners, or, Four Habits of Highly Effective STaRs",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2503.01307",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment",
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2503.01307",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2503.01307",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2503.01307"
  },
  {
    "id": "dapo-2025",
    "title": "DAPO",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "construction_recipe",
      "scaling_study"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "rlvr"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "📐 Math answer-verifiable data",
    "tags": [],
    "one_line_summary": "GRPO-lineage RLVR recipe where filtering changes what reaches the gradient.",
    "why_it_matters": "GRPO-lineage RLVR recipe where filtering changes what reaches the gradient.",
    "data_object": "answer level",
    "feedback_verifier": "programmatic",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2503.14476",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2503.14476",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/dapo.md"
    },
    "primary_link": "https://arxiv.org/abs/2503.14476"
  },
  {
    "id": "dapo-an-open-source-llm-reinforcement-learning-system-at-scale-2025",
    "title": "DAPO: An open-source LLM reinforcement learning system at scale",
    "year": 2025,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "deepmath-103k-2025",
    "title": "DeepMath-103K",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "data_release"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "sft",
      "rlvr",
      "evaluation"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "surveys_and_primers",
      "programmatic_math_code_proof",
      "process_supervision_prm",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "Other related work",
    "tags": [],
    "one_line_summary": "Math release highlighted for verifier pinning and decontamination.",
    "why_it_matters": "Math release highlighted for verifier pinning and decontamination.",
    "data_object": "answer level",
    "feedback_verifier": "programmatic",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2504.11456",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2504.11456",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/deepmath_103k.md"
    },
    "primary_link": "https://arxiv.org/abs/2504.11456"
  },
  {
    "id": "deepmath-103k-a-large-scale-challenging-decontaminated-and-verifiable-mathematic-2025",
    "title": "DeepMath-103K: A Large-Scale, Challenging, Decontaminated, and Verifiable Mathematical Dataset for Advancing Reasoning",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2504.11456",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2504.11456",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2504.11456",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2504.11456"
  },
  {
    "id": "deepscaler-scaling-reinforcement-learning-for-reasoning-in-open-models-2025",
    "title": "DeepScaleR: Scaling reinforcement learning for reasoning in open models",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "deepseek-prover-v2-advancing-formal-mathematical-reasoning-via-reinforcement-lea-2025",
    "title": "DeepSeek-Prover-V2: Advancing formal mathematical reasoning via reinforcement learning",
    "year": 2025,
    "venue": "arXiv",
    "authors": [
      "Z. Z. Ren",
      "Zhihong Shao",
      "Junxiao Song",
      "Huajian Xin",
      "Haocheng Wang",
      "Wanjia Zhao",
      "Liyue Zhang",
      "Zhe Fu",
      "Qihao Zhu",
      "Dejian Yang",
      "Z. F. Wu",
      "Zhibin Gou",
      "Shirong Ma",
      "Hongxuan Tang",
      "Yuxuan Liu",
      "Wenjun Gao",
      "Daya Guo",
      "Chong Ruan"
    ],
    "source_role": [
      "model_report",
      "construction_recipe",
      "data_release"
    ],
    "verification_contract": [
      "programmatic",
      "environmental",
      "mixed"
    ],
    "supervision_granularity": [
      "step_level",
      "full_episode",
      "scalar_reward"
    ],
    "training_use": [
      "rlvr",
      "sft",
      "agent_training",
      "evaluation"
    ],
    "domains": [
      "formal-math",
      "lean",
      "subgoal-decomposition"
    ],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧾 Formal proof / Lean / theorem proving",
    "tags": [
      "formal-math",
      "lean",
      "seeded-from-bib",
      "subgoal-decomposition"
    ],
    "one_line_summary": "DeepSeek-Prover-V2 uses recursive subgoal decomposition and RL to connect informal reasoning with formal Lean theorem proving.",
    "why_it_matters": "It is a frontier formal-reasoning stack where data includes subgoals, informal reasoning, synthesized formal proofs, and proof-assistant verification.",
    "data_object": "subgoal chain, informal reasoning trace, Lean proof, and checker result.; process: problem, subgoal decomposition, synthesized proof, Lean feedback, reward, benchmark result.; Lean 4 environment plus recursive theorem-proving pipeline.",
    "feedback_verifier": "Lean verification and RL reward over formal proof success.",
    "audit_focus": "Subgoal decomposition can introduce false intermediate claims., Formal and informal reasoning scores are not directly comparable., Large-model teacher lineage can hide data provenance.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2504.21801",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2504.21801",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/deepseek-ai/DeepSeek-Prover-V2",
      "data": "https://huggingface.co/datasets/deepseek-ai/DeepSeek-ProverBench",
      "huggingface": "https://huggingface.co/deepseek-ai/DeepSeek-Prover-V2-671B",
      "project": "https://github.com/deepseek-ai/DeepSeek-Prover-V2",
      "bibtex": null,
      "card": "cards/recipes/deepseek-prover-v2.md"
    },
    "primary_link": "https://arxiv.org/abs/2504.21801"
  },
  {
    "id": "deepseek-r1-2025",
    "title": "DeepSeek-R1",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "model_report",
      "construction_recipe"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "distillation",
      "rlvr"
    ],
    "domains": [
      "math",
      "code",
      "reasoning"
    ],
    "category": [
      "surveys_and_primers",
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "Other related work",
    "tags": [],
    "one_line_summary": "DeepSeek-R1 reports a reasoning-model post-training recipe centered on reinforcement learning with verifiable rewards, cold-start data, and distillation.",
    "why_it_matters": "It is a frontier reference for public RLVR discussion, showing how verifiable tasks, reward design, and distillation shape reasoning behavior.",
    "data_object": "answer level",
    "feedback_verifier": "mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2501.12948",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2501.12948",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/deepseek_r1.md"
    },
    "primary_link": "https://arxiv.org/abs/2501.12948"
  },
  {
    "id": "deepseek-r1-incentivizing-reasoning-capability-in-llms-via-reinforcement-learnin-2025",
    "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    "year": 2025,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "frontier_model_reports",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🚀 DeepSeek-R1 family",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "deepseek-v3-2-pushing-the-frontier-of-open-large-language-models-2025",
    "title": "DeepSeek-V3.2: Pushing the Frontier of Open Large Language Models",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2512.02556",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment",
      "frontier_model_reports"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2512.02556",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2512.02556",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2512.02556"
  },
  {
    "id": "distillation-scaling-laws-2025",
    "title": "Distillation Scaling Laws",
    "year": 2025,
    "venue": "Proceedings of the 42nd International Conference on Machine Learning (ICML)",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2502.08606",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2502.08606",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2502.08606"
  },
  {
    "id": "does-rl-really-incentivize-reasoning-beyond-base-2025",
    "title": "Does RL really incentivize reasoning beyond base?",
    "year": 2025,
    "venue": "NeurIPS Oral",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧪 Verifier scaling",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "does-supervised-fine-tuning-memorize-while-reinforcement-learning-generalizes-2025",
    "title": "Does supervised fine-tuning memorize while reinforcement learning generalizes?",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🔁 Data reuse and uniqueness",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "faithbench-a-diverse-hallucination-benchmark-for-summarization-by-modern-llms-2025",
    "title": "FaithBench: A Diverse Hallucination Benchmark for Summarization by Modern LLMs",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2410.13210",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "judgment_required_rubrics_safety_domain",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2410.13210",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2410.13210",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2410.13210"
  },
  {
    "id": "finder-financial-data-extraction-and-reasoning-2025",
    "title": "FinDER: Financial data extraction and reasoning",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🏦 Financial reasoning",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is not yet linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Unverified citation waypoint; locate the official source and add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "from-accuracy-to-robustness-a-study-of-rule-and-model-based-verifiers-in-mathema-2025",
    "title": "From Accuracy to Robustness: A Study of Rule- and Model-based Verifiers in Mathematical Reasoning",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2505.22203",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "programmatic_math_code_proof",
      "process_supervision_prm",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2505.22203",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2505.22203",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2505.22203"
  },
  {
    "id": "from-system-1-to-system-2-a-survey-of-reasoning-large-language-models-2025",
    "title": "From system 1 to system 2: A survey of reasoning large language models",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers"
    ],
    "subfield": "🧭 Post-training surveys",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is not yet linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Unverified citation waypoint; locate the official source and add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "gaperon-a-peppered-english-french-generative-language-model-suite-2025",
    "title": "Gaperon: A Peppered English-French Generative Language Model Suite",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2510.25771",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2510.25771",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2510.25771",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2510.25771"
  },
  {
    "id": "general-reasoning-models-survey-and-perspectives-2025",
    "title": "General reasoning models: Survey and perspectives",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers"
    ],
    "subfield": "🧭 Post-training surveys",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is not yet linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Unverified citation waypoint; locate the official source and add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "goedel-prover-v2-scaling-formal-theorem-proving-with-scaffolded-data-synthesis-a-2025",
    "title": "Goedel-Prover-V2: Scaling Formal Theorem Proving with Scaffolded Data Synthesis and Self-Correction",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2508.03613",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧾 Formal proof / Lean / theorem proving",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2508.03613",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2508.03613",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2508.03613"
  },
  {
    "id": "grpo-is-secretly-a-process-reward-model-2025",
    "title": "GRPO is Secretly a Process Reward Model",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2509.21154",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "process_supervision_prm"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2509.21154",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2509.21154",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2509.21154"
  },
  {
    "id": "healthbench-2025",
    "title": "HealthBench",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "benchmark",
      "verifier_reward"
    ],
    "verification_contract": [
      "judgment_required"
    ],
    "supervision_granularity": [
      "scalar_reward",
      "answer_level"
    ],
    "training_use": [
      "evaluation",
      "reward_modeling",
      "safety_alignment"
    ],
    "domains": [
      "health",
      "safety",
      "medical"
    ],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🧪 Rubric reward models",
    "tags": [
      "curated-card",
      "primary-link-checked"
    ],
    "one_line_summary": "HealthBench evaluates healthcare conversations with physician-written, conversation-specific rubrics across safety, accuracy, communication, and domain contexts.",
    "why_it_matters": "It is a high-stakes example of judgment-required reasoning data where rubric design matters more than exact-match scoring.",
    "data_object": "response with rubric/judge evaluation; process: prompt, response, rubric dimension; offline health evaluation benchmark",
    "feedback_verifier": "rubric-guided expert/LLM judgment",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2505.08775",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2505.08775",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/healthbench.md"
    },
    "primary_link": "https://arxiv.org/abs/2505.08775"
  },
  {
    "id": "healthbench-evaluating-large-language-models-towards-improved-human-health-2025",
    "title": "HealthBench: Evaluating large language models towards improved human health",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧑‍⚖️ Human/expert judgment",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is not yet linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Unverified citation waypoint; locate the official source and add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "humanity-s-last-exam-2025",
    "title": "Humanity's Last Exam",
    "year": 2025,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is not yet linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Unverified citation waypoint; locate the official source and add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "introducing-openai-o3-and-o4-mini-2025",
    "title": "Introducing OpenAI o3 and o4-mini",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "frontier_model_reports"
    ],
    "subfield": "🧪 RLVR recipe reports",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is not yet linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Unverified citation waypoint; locate the official source and add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "khatri-scaling-rl-2025",
    "title": "The Art of Scaling Reinforcement Learning Compute for LLMs",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "scaling_study",
      "construction_recipe"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level",
      "scalar_reward"
    ],
    "training_use": [
      "rlvr",
      "test_time_compute",
      "evaluation"
    ],
    "domains": [
      "scaling",
      "rlvr"
    ],
    "category": [
      "scaling_test_time_compute_rlvr",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "⏱️ Test-time compute",
    "tags": [],
    "one_line_summary": "The Art of Scaling RL Compute studies RL compute scaling with large ablations and separates asymptotic performance from compute efficiency.",
    "why_it_matters": "It gives atlas readers a framework for judging RL recipe claims: some choices move the ceiling, while others mostly change how cheaply the run reaches it.",
    "data_object": "training runs, reward outcomes, validation curves, and ablation results; process: loss aggregation, normalization, curriculum, off-policy choice, compute budget, asymptote, efficiency; large-scale RL training experiments",
    "feedback_verifier": "compute-performance curves and recipe ablations.",
    "audit_focus": "Compute-heavy studies can be hard to reproduce. Best-practice recipes may depend on task/reward families. Scaling curves can encourage overconfidence if validation tasks are narrow.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2510.13786",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2510.13786",
      "openreview": "https://openreview.net/forum?id=FMjeC9Msws",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/the-art-of-scaling-rl-compute.md"
    },
    "primary_link": "https://arxiv.org/abs/2510.13786"
  },
  {
    "id": "kimi-k1-5-2025",
    "title": "Kimi K1.5: Scaling Reinforcement Learning with LLMs",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "model_report",
      "scaling_study"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "rlvr",
      "test_time_compute"
    ],
    "domains": [
      "math",
      "code",
      "long-context"
    ],
    "category": [
      "frontier_model_reports",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🌙 Kimi reasoning reports",
    "tags": [],
    "one_line_summary": "Frontier report used for long-context RL and scaling discussion.",
    "why_it_matters": "Frontier report used for long-context RL and scaling discussion.",
    "data_object": "answer level",
    "feedback_verifier": "mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2501.12599",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2501.12599",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/kimi_k15.md"
    },
    "primary_link": "https://arxiv.org/abs/2501.12599"
  },
  {
    "id": "kimi-k2-open-agentic-intelligence-2025",
    "title": "Kimi K2: Open Agentic Intelligence",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2507.20534",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "frontier_model_reports"
    ],
    "subfield": "🌙 Kimi reasoning reports",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2507.20534",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2507.20534",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2507.20534"
  },
  {
    "id": "kodcode-a-diverse-challenging-and-verifiable-synthetic-dataset-for-coding-2025",
    "title": "KodCode: A Diverse, Challenging, and Verifiable Synthetic Dataset for Coding",
    "year": 2025,
    "venue": "ACL Findings",
    "authors": [],
    "source_role": [
      "data_release",
      "construction_recipe",
      "benchmark"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "sft",
      "rlvr",
      "evaluation"
    ],
    "domains": [
      "code"
    ],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data",
      "benchmarks_evaluation"
    ],
    "subfield": "🧮 Math RLVR datasets",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Synthetic coding dataset where problems, solutions, and tests form a verifiable training object for SFT and RLVR.",
    "why_it_matters": "Synthetic coding dataset where problems, solutions, and tests form a verifiable training object for SFT and RLVR.",
    "data_object": "question-solution-test triplet; process: problem, solution, unit tests; code execution and unit-test substrate",
    "feedback_verifier": "test-based self-verification",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2503.02951",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2503.02951",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/kodcode.md"
    },
    "primary_link": "https://arxiv.org/abs/2503.02951"
  },
  {
    "id": "lastingbench-defend-benchmarks-against-knowledge-leakage-2025",
    "title": "LastingBench: Defend Benchmarks Against Knowledge Leakage",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2506.21614",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧯 Benchmark contamination",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2506.21614",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2506.21614",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2506.21614"
  },
  {
    "id": "leaky-thoughts-2025",
    "title": "Leaky Thoughts",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "audit_failure",
      "benchmark"
    ],
    "verification_contract": [
      "judgment_required",
      "environmental"
    ],
    "supervision_granularity": [
      "step_level",
      "full_episode"
    ],
    "training_use": [
      "evaluation",
      "safety_alignment",
      "audit"
    ],
    "domains": [
      "privacy",
      "agent",
      "security"
    ],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "environmental_agents_tools_web_swe",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🛡️ Safety reasoning data",
    "tags": [],
    "one_line_summary": "Leaky Thoughts shows that reasoning traces from personal-agent settings can expose sensitive user data through prompt injection or accidental leakage.",
    "why_it_matters": "It turns chain-of-thought and test-time compute into a privacy audit problem: more internal reasoning can increase utility while enlarging the attack surface.",
    "data_object": "internal reasoning trace, final answer, and leakage/extraction outcome; process: sensitive field, prompt-injection condition, reasoning length or budget, output leakage indicator; personal-agent evaluation setting with hidden or internal reasoning traces",
    "feedback_verifier": "extraction probes and agentic evaluations.",
    "audit_focus": "Hiding thoughts from users does not make them safe. Trace logging can create a new privacy dataset. Utility improvements from more reasoning may worsen leakage risk.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2506.15674",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2506.15674",
      "openreview": null,
      "acl": "https://aclanthology.org/2025.emnlp-main.1347/",
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/parameterlab/leaky_thoughts",
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/failures/leaky-thoughts.md"
    },
    "primary_link": "https://arxiv.org/abs/2506.15674"
  },
  {
    "id": "leaky-thoughts-large-reasoning-models-are-not-private-thinkers-2025",
    "title": "Leaky Thoughts: Large Reasoning Models Are Not Private Thinkers",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2506.15674",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧬 Hidden lineage / teacher leakage",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2506.15674",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2506.15674",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2506.15674"
  },
  {
    "id": "limo-2025",
    "title": "LIMO: Less Is More for Reasoning",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "data_release",
      "construction_recipe"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "sft",
      "distillation"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "Other related work",
    "tags": [],
    "one_line_summary": "Small-set curation reference distinguishing elicitation from broad coverage.",
    "why_it_matters": "Small-set curation reference distinguishing elicitation from broad coverage.",
    "data_object": "answer level",
    "feedback_verifier": "mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2502.03387",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2502.03387",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/limo.md"
    },
    "primary_link": "https://arxiv.org/abs/2502.03387"
  },
  {
    "id": "llama-nemotron-2025",
    "title": "Llama-Nemotron: Efficient Reasoning Models",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "model_report",
      "data_release"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "sft",
      "distillation",
      "rlvr"
    ],
    "domains": [
      "reasoning",
      "chat",
      "safety"
    ],
    "category": [
      "surveys_and_primers",
      "judgment_required_rubrics_safety_domain",
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "Other related work",
    "tags": [],
    "one_line_summary": "Mixed post-training corpus reference for reasoning, chat, and safety partitions.",
    "why_it_matters": "Mixed post-training corpus reference for reasoning, chat, and safety partitions.",
    "data_object": "answer level",
    "feedback_verifier": "mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2505.00949",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2505.00949",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/llama_nemotron.md"
    },
    "primary_link": "https://arxiv.org/abs/2505.00949"
  },
  {
    "id": "long-grounded-thoughts-2025",
    "title": "Long Grounded Thoughts",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2511.05705",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2511.05705",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2511.05705",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2511.05705"
  },
  {
    "id": "magistral-2025",
    "title": "Magistral",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "model_report",
      "construction_recipe"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "rlvr"
    ],
    "domains": [
      "math",
      "code"
    ],
    "category": [
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [],
    "one_line_summary": "Reasoning report illustrating reward-stack pinning and prompt-corpus cycling.",
    "why_it_matters": "Reasoning report illustrating reward-stack pinning and prompt-corpus cycling.",
    "data_object": "answer level",
    "feedback_verifier": "mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2506.10910",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2506.10910",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/magistral.md"
    },
    "primary_link": "https://arxiv.org/abs/2506.10910"
  },
  {
    "id": "math-perturb-2025",
    "title": "MATH-Perturb",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof"
    ],
    "subfield": "📐 Math answer-verifiable data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "mcp-universe-tool-and-environment-infrastructure-for-agent-evaluation-and-traini-2025",
    "title": "MCP-Universe: Tool and environment infrastructure for agent evaluation and training",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "🛠️ Tool-use data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "medreason-eliciting-factual-medical-reasoning-steps-in-llms-via-knowledge-graphs-2025",
    "title": "MedReason: Eliciting factual medical reasoning steps in LLMs via knowledge graphs",
    "year": 2025,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🩺 Medical reasoning / health rubrics",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "megascience-a-benchmark-and-data-resource-for-scientific-reasoning-2025",
    "title": "MegaScience: A benchmark and data resource for scientific reasoning",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "mid-training-of-large-language-models-a-survey-2025",
    "title": "Mid-Training of Large Language Models: A Survey",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2510.06826",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers"
    ],
    "subfield": "🧭 Post-training surveys",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2510.06826",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2510.06826",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2510.06826"
  },
  {
    "id": "min-k-2025",
    "title": "Min-K\\%++",
    "year": 2025,
    "venue": "ICLR",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🧪 Verifier gaming",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "minimax-m1-scaling-test-time-compute-efficiently-with-lightning-attention-2025",
    "title": "MiniMax-M1: Scaling Test-Time Compute Efficiently with Lightning Attention",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2506.13585",
    "authors": [],
    "source_role": [
      "model_report",
      "scaling_study",
      "construction_recipe"
    ],
    "verification_contract": [
      "mixed",
      "programmatic",
      "environmental"
    ],
    "supervision_granularity": [
      "answer_level",
      "full_episode"
    ],
    "training_use": [
      "rlvr",
      "test_time_compute",
      "agent_training",
      "evaluation"
    ],
    "domains": [
      "math",
      "code",
      "software_engineering",
      "agents"
    ],
    "category": [
      "frontier_model_reports",
      "scaling_test_time_compute_rlvr",
      "environmental_agents_tools_web_swe"
    ],
    "subfield": "🧠 Magistral / Phi / Nemotron style reports",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Frontier-style model report connecting efficient long-context/test-time compute, RL training, and software/agent evaluation surfaces.",
    "why_it_matters": "Frontier-style model report connecting efficient long-context/test-time compute, RL training, and software/agent evaluation surfaces.",
    "data_object": "reasoning output, code/tool result, or agent task output; process: reasoning output, benchmark result, thinking budget; math, code, SWE, tool-use, and long-context evaluation surfaces",
    "feedback_verifier": "programmatic, environment, and benchmark feedback",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2506.13585",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2506.13585",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/MiniMax-AI/MiniMax-M1",
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/minimax_m1.md"
    },
    "primary_link": "https://arxiv.org/abs/2506.13585"
  },
  {
    "id": "multi-agent-evolve-llm-self-improve-through-co-evolution-2025",
    "title": "Multi-Agent Evolve: LLM self-improve through co-evolution",
    "year": 2025,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🔁 Self-play / self-improvement",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "naturalreasoning-reasoning-in-natural-language-with-large-scale-verifiable-data-2025",
    "title": "NaturalReasoning: Reasoning in the Wild with 2.8M Challenging Questions",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "data_release",
      "construction_recipe"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "sft",
      "distillation",
      "evaluation"
    ],
    "domains": [
      "stem",
      "economics",
      "social_science",
      "general_reasoning"
    ],
    "category": [
      "construction_recipes_open_reasoning_data",
      "benchmarks_evaluation"
    ],
    "subfield": "Other related work",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Large-scale natural-language reasoning questions broaden post-training data beyond math/code while keeping a verifiability lens.",
    "why_it_matters": "Large-scale natural-language reasoning questions broaden post-training data beyond math/code while keeping a verifiability lens.",
    "data_object": "question with reference answer or reasoning target; process: question, reference answer, domain label; offline natural-language tasks",
    "feedback_verifier": "reference answers, reward models, or self-rewarding depending on split",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2502.13124",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2502.13124",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/naturalreasoning.md"
    },
    "primary_link": "https://arxiv.org/abs/2502.13124"
  },
  {
    "id": "nemotron-math-reasoning-data-with-tool-integrated-reasoning-variants-2025",
    "title": "Nemotron-Math: Reasoning Data with Tool-Integrated Reasoning Variants",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2512.15489",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "frontier_model_reports",
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧠 Magistral / Phi / Nemotron style reports",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2512.15489",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2512.15489",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2512.15489"
  },
  {
    "id": "one-token-to-fool-2025",
    "title": "One Token to Fool LLM-as-a-Judge",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "audit_failure",
      "verifier_reward"
    ],
    "verification_contract": [
      "judgment_required"
    ],
    "supervision_granularity": [
      "scalar_reward"
    ],
    "training_use": [
      "evaluation",
      "reward_modeling"
    ],
    "domains": [
      "judge"
    ],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "⚖️ LLM-as-judge data",
    "tags": [],
    "one_line_summary": "Verifier-attack paper showing trivial cue tokens can flip judge verdicts.",
    "why_it_matters": "Verifier-attack paper showing trivial cue tokens can flip judge verdicts.",
    "data_object": "scalar reward",
    "feedback_verifier": "judgment required",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2507.08794",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2507.08794",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/one_token_to_fool_judge.md"
    },
    "primary_link": "https://arxiv.org/abs/2507.08794"
  },
  {
    "id": "online-rubrics-elicitation-from-pairwise-comparisons-2025",
    "title": "Online Rubrics Elicitation from Pairwise Comparisons",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2510.07284",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2510.07284",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2510.07284",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2510.07284"
  },
  {
    "id": "onlinerubrics-2025",
    "title": "OnlineRubrics",
    "year": 2025,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "open-reasoner-zero-an-open-source-approach-to-rlvr-for-reasoning-2025",
    "title": "Open-Reasoner-Zero: An open-source approach to RLVR for reasoning",
    "year": 2025,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "open-reasoner-zero-an-open-source-approach-to-scaling-up-reinforcement-learning--2025",
    "title": "Open-Reasoner-Zero: An Open Source Approach to Scaling Up Reinforcement Learning on the Base Model",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2503.24290",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2503.24290",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2503.24290",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2503.24290"
  },
  {
    "id": "opencodereasoning-2-scalable-code-reasoning-data-2025",
    "title": "OpenCodeReasoning-2: Scalable code reasoning data",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "💻 Code execution / unit-test data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "opencodereasoning-code-reasoning-traces-at-scale-2025",
    "title": "OpenCodeReasoning: Code reasoning traces at scale",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "💻 Code execution / unit-test data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "opencodereasoning-ii-a-simple-test-time-scaling-approach-via-self-critique-2025",
    "title": "OpenCodeReasoning-II: A Simple Test Time Scaling Approach via Self-Critique",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "data_release",
      "construction_recipe",
      "scaling_study"
    ],
    "verification_contract": [
      "programmatic",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level",
      "scalar_reward"
    ],
    "training_use": [
      "sft",
      "distillation",
      "test_time_compute",
      "evaluation"
    ],
    "domains": [
      "code"
    ],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "💻 Code execution / unit-test data",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Large code-reasoning release with question-solution-critique triples, connecting distillation data to test-time self-critique.",
    "why_it_matters": "Large code-reasoning release with question-solution-critique triples, connecting distillation data to test-time self-critique.",
    "data_object": "question-solution-critique triple; process: solution, critique, language/runtime label; coding benchmark / compiler substrate",
    "feedback_verifier": "tests and critique model signals",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2507.09075",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2507.09075",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/opencodereasoning_ii.md"
    },
    "primary_link": "https://arxiv.org/abs/2507.09075"
  },
  {
    "id": "openhands-2025",
    "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    "year": 2024,
    "venue": "ICLR",
    "authors": [],
    "source_role": [
      "infrastructure",
      "agent_environment"
    ],
    "verification_contract": [
      "environmental",
      "mixed"
    ],
    "supervision_granularity": [
      "full_episode",
      "state_action_level"
    ],
    "training_use": [
      "agent_training",
      "evaluation"
    ],
    "domains": [
      "software_engineering",
      "agents"
    ],
    "category": [
      "environmental_agents_tools_web_swe",
      "frontier_model_reports"
    ],
    "subfield": "🧑‍💻 SWE/repository agents",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Open platform for software-development agents; useful for thinking about executable trajectories, sandbox state, and community-maintained agent scaffolds.",
    "why_it_matters": "Open platform for software-development agents; useful for thinking about executable trajectories, sandbox state, and community-maintained agent scaffolds.",
    "data_object": "tool/action/observation trajectory; process: plan, shell command, file edit; sandboxed software-development runtime",
    "feedback_verifier": "task, test, or human-review outcome depending on benchmark",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2407.16741",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2407.16741",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/All-Hands-AI/OpenHands",
      "data": null,
      "huggingface": null,
      "project": "https://www.openhands.dev/",
      "bibtex": null,
      "card": "cards/agents/openhands.md"
    },
    "primary_link": "https://arxiv.org/abs/2407.16741"
  },
  {
    "id": "openmathreasoning-2025",
    "title": "OpenMathReasoning: A large-scale dataset of math reasoning traces",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "data_release"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "sft",
      "distillation",
      "rlvr"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr",
      "benchmarks_evaluation"
    ],
    "subfield": "📐 Math answer-verifiable data",
    "tags": [],
    "one_line_summary": "Large-scale math reasoning trace release for programmatic verification.",
    "why_it_matters": "Large-scale math reasoning trace release for programmatic verification.",
    "data_object": "answer level",
    "feedback_verifier": "programmatic",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2504.16891",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2504.16891",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/openmathreasoning.md"
    },
    "primary_link": "https://arxiv.org/abs/2504.16891"
  },
  {
    "id": "openthoughts3-2025",
    "title": "OpenThoughts: Data recipes for reasoning models",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "data_release",
      "construction_recipe"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "sft",
      "distillation"
    ],
    "domains": [
      "math",
      "code",
      "science"
    ],
    "category": [
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports"
    ],
    "subfield": "🏗️ Open reasoning data releases",
    "tags": [
      "curated-card",
      "primary-link-checked"
    ],
    "one_line_summary": "OpenThoughts studies open data recipes for reasoning models through large public reasoning datasets and many controlled pipeline experiments.",
    "why_it_matters": "It is one of the most useful open references for how question sourcing, filtering, teacher traces, and answer generation choices change downstream reasoning performance.",
    "data_object": "reasoning traces and final answers; process: question, reasoning trace, answer; offline reasoning corpus",
    "feedback_verifier": "filters, benchmark feedback, and recipe ablations",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2506.04178",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2506.04178",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/open-thoughts/open-thoughts",
      "data": null,
      "huggingface": "https://huggingface.co/datasets/open-thoughts/OpenThoughts3-1.2M",
      "project": null,
      "bibtex": null,
      "card": "cards/releases/openthoughts.md"
    },
    "primary_link": "https://arxiv.org/abs/2506.04178"
  },
  {
    "id": "phi-4-reasoning-2025",
    "title": "Phi-4-reasoning Technical Report",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "model_report"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "distillation",
      "sft"
    ],
    "domains": [
      "math",
      "code"
    ],
    "category": [
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports"
    ],
    "subfield": "✍️ Teacher trace generation",
    "tags": [],
    "one_line_summary": "Reasoning model report highlighting teacher distillation as trace writing.",
    "why_it_matters": "Reasoning model report highlighting teacher distillation as trace writing.",
    "data_object": "answer level",
    "feedback_verifier": "mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2504.21318",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2504.21318",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/phi4_reasoning.md"
    },
    "primary_link": "https://arxiv.org/abs/2504.21318"
  },
  {
    "id": "prime-process-reinforcement-through-implicit-rewards-2025",
    "title": "PRIME: Process reinforcement through implicit rewards",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "process_supervision",
      "verifier_reward",
      "construction_recipe"
    ],
    "verification_contract": [
      "programmatic",
      "mixed"
    ],
    "supervision_granularity": [
      "step_level",
      "process_reward",
      "answer_level"
    ],
    "training_use": [
      "rlvr",
      "process_supervision",
      "reward_modeling"
    ],
    "domains": [
      "math",
      "code",
      "reasoning"
    ],
    "category": [
      "process_supervision_prm",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🛠️ Automatic process supervision",
    "tags": [
      "seeded-from-bib",
      "primary-link-checked"
    ],
    "one_line_summary": "Online process-reinforcement recipe that derives implicit process rewards from rollouts and outcome labels.",
    "why_it_matters": "It is a clean example of process supervision without manual dense labels, useful for comparing PRM data, outcome rewards, and RLVR optimization scaffolds.",
    "data_object": "rollout with implicit process reward signal; process: policy rollout, outcome label, implicit process reward; online RL training loop",
    "feedback_verifier": "implicit process rewards derived from outcome labels",
    "audit_focus": "implicit rewards can inherit outcome-verifier shortcuts, online reward updates may introduce reward hacking, benchmark improvements may conflate optimizer and reward-contract changes",
    "curation_level": "L2_artifact_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2502.01456",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2502.01456",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/PRIME-RL/PRIME",
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2502.01456"
  },
  {
    "id": "autopsv-automated-process-supervised-verifier-2024",
    "title": "AutoPSV: Automated Process-Supervised Verifier",
    "year": 2024,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "process_supervision",
      "verifier_reward",
      "construction_recipe"
    ],
    "verification_contract": [
      "programmatic",
      "mixed"
    ],
    "supervision_granularity": [
      "step_level",
      "process_reward"
    ],
    "training_use": [
      "process_supervision",
      "reward_modeling",
      "evaluation"
    ],
    "domains": [
      "math",
      "commonsense",
      "reasoning"
    ],
    "category": [
      "process_supervision_prm",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🛠️ Automatic process supervision",
    "tags": [
      "primary-link-checked"
    ],
    "one_line_summary": "Automated process-supervision recipe that derives step annotations from verifier confidence changes.",
    "why_it_matters": "It gives the PRM track a concrete automatic-labeling path between human step labels and rollout-value supervision.",
    "data_object": "step-level confidence-change annotations; process: reasoning step, verifier confidence, relative confidence change; offline reasoning traces",
    "feedback_verifier": "answer-trained verifier converted into process annotations",
    "audit_focus": "answer-level verifier confidence can mislabel intermediate steps, relative confidence changes may not identify causal first errors, commonsense and math tasks may require different error taxonomies",
    "curation_level": "L2_artifact_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2405.16802",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2405.16802",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/rookie-joe/AutoPSV",
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2405.16802"
  },
  {
    "id": "step-dpo-step-wise-preference-optimization-for-long-chain-reasoning-of-llms-2024",
    "title": "Step-DPO: Step-wise Preference Optimization for Long-chain Reasoning of LLMs",
    "year": 2024,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "process_supervision",
      "construction_recipe"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "step_level"
    ],
    "training_use": [
      "process_supervision",
      "preference_learning"
    ],
    "domains": [
      "math",
      "reasoning"
    ],
    "category": [
      "foundations_instruction_preference_alignment",
      "process_supervision_prm",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "⚖️ DPO / preference optimization",
    "tags": [
      "primary-link-checked"
    ],
    "one_line_summary": "Step-wise preference optimization method for long-chain reasoning traces.",
    "why_it_matters": "It helps readers see how preference optimization becomes a process-level data problem when the chosen/rejected object is an intermediate continuation rather than a whole answer.",
    "data_object": "step-wise preference pairs; process: reasoning step, preferred continuation, rejected continuation; offline long-chain reasoning traces",
    "feedback_verifier": "step-wise preference optimization objective",
    "audit_focus": "local step preference may not align with final correctness, preference construction can hide teacher or scorer bias, long-chain traces can overfit style instead of reasoning validity",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2406.18629",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2406.18629",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2406.18629"
  },
  {
    "id": "rearter-retrieval-augmented-reasoning-with-trustworthy-process-rewarding-2025",
    "title": "ReARTeR: Retrieval-Augmented Reasoning with Trustworthy Process Rewarding",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "process_supervision",
      "verifier_reward",
      "construction_recipe"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "step_level",
      "process_reward"
    ],
    "training_use": [
      "process_supervision",
      "preference_learning",
      "test_time_compute"
    ],
    "domains": [
      "retrieval",
      "multi-hop reasoning"
    ],
    "category": [
      "process_supervision_prm",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧪 Process reward models",
    "tags": [
      "primary-link-checked"
    ],
    "one_line_summary": "Retrieval-augmented reasoning recipe that uses trustworthy process rewards and explanations to collect step-level preference data.",
    "why_it_matters": "It broadens the PRM track from math-only step labels to retrieval-grounded reasoning where process scores, explanations, and search all affect the reusable data object.",
    "data_object": "step-level preference data with process scores and explanations; process: retrieval context, reasoning step, process reward score; RAG reasoning pipeline",
    "feedback_verifier": "process reward model plus process explanation model",
    "audit_focus": "PRM and explanation model may disagree, retrieval context can leak answer evidence unevenly, early-step PRM bias can distort search",
    "curation_level": "L2_artifact_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2501.07861",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2501.07861",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/Jeryi-Sun/ReARTeR",
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2501.07861"
  },
  {
    "id": "prmbench-a-fine-grained-and-challenging-benchmark-for-process-level-reward-model-2025",
    "title": "PRMBench: A fine-grained and challenging benchmark for process-level reward models",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "benchmark",
      "verifier_reward",
      "process_supervision"
    ],
    "verification_contract": [
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "step_level",
      "process_reward"
    ],
    "training_use": [
      "evaluation",
      "reward_modeling",
      "process_supervision"
    ],
    "domains": [
      "math",
      "reasoning"
    ],
    "category": [
      "process_supervision_prm",
      "benchmarks_evaluation",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "📊 PRM benchmarks and evaluation",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Fine-grained PRM benchmark for testing whether process reward models catch local reasoning mistakes rather than only final-answer failure.",
    "why_it_matters": "Fine-grained PRM benchmark for testing whether process reward models catch local reasoning mistakes rather than only final-answer failure.",
    "data_object": "step-level labels or scores; process: step, label, error type; offline reasoning traces",
    "feedback_verifier": "process-level reward model benchmark",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2501.03124",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2501.03124",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/prmbench.md"
    },
    "primary_link": "https://arxiv.org/abs/2501.03124"
  },
  {
    "id": "process-reward-models-for-code-reasoning-2025",
    "title": "Process reward models for code reasoning",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "process_supervision_prm"
    ],
    "subfield": "🧪 Process reward models",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "processbench-identifying-process-errors-in-mathematical-reasoning-2025",
    "title": "ProcessBench: Identifying Process Errors in Mathematical Reasoning",
    "year": 2024,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "benchmark",
      "process_supervision",
      "verifier_reward"
    ],
    "verification_contract": [
      "judgment_required",
      "programmatic"
    ],
    "supervision_granularity": [
      "step_level"
    ],
    "training_use": [
      "evaluation",
      "process_supervision",
      "reward_modeling"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "process_supervision_prm",
      "benchmarks_evaluation"
    ],
    "subfield": "📊 PRM benchmarks and evaluation",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Benchmark centered on first-error/local-error detection, a core failure mode for process supervision and verifier training.",
    "why_it_matters": "Benchmark centered on first-error/local-error detection, a core failure mode for process supervision and verifier training.",
    "data_object": "step labels or first-error markers; process: reasoning step, error marker, diagnostic label; offline math traces",
    "feedback_verifier": "process-error detector",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2412.06559",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2412.06559",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/processbench.md"
    },
    "primary_link": "https://arxiv.org/abs/2412.06559"
  },
  {
    "id": "qwen2-5-math-prm-2025",
    "title": "Qwen2.5-Math-PRM",
    "year": 2025,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof",
      "process_supervision_prm"
    ],
    "subfield": "📐 Math answer-verifiable data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "qwen3-2025",
    "title": "Qwen3 Technical Report",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "model_report"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "sft",
      "rlvr"
    ],
    "domains": [
      "math",
      "code",
      "general"
    ],
    "category": [
      "frontier_model_reports",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🐉 Qwen reasoning/math/code reports",
    "tags": [],
    "one_line_summary": "Open model-family report useful for coordinated release-tick analysis.",
    "why_it_matters": "Open model-family report useful for coordinated release-tick analysis.",
    "data_object": "answer level",
    "feedback_verifier": "mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2505.09388",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2505.09388",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/qwen3.md"
    },
    "primary_link": "https://arxiv.org/abs/2505.09388"
  },
  {
    "id": "r-zero-self-evolving-reasoning-llm-from-zero-data-2025",
    "title": "R-Zero: Self-Evolving Reasoning LLM from Zero Data",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2508.05004",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2508.05004",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2508.05004",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2508.05004"
  },
  {
    "id": "r2e-gym-2025",
    "title": "R2E-Gym",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "agent_environment",
      "benchmark"
    ],
    "verification_contract": [
      "environmental",
      "programmatic"
    ],
    "supervision_granularity": [
      "full_episode",
      "state_action_level"
    ],
    "training_use": [
      "agent_training",
      "evaluation"
    ],
    "domains": [
      "software-engineering"
    ],
    "category": [
      "programmatic_math_code_proof",
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "🧰 Programmatic benchmarks",
    "tags": [],
    "one_line_summary": "Verifiable SWE environment for reasoning-to-edit tasks.",
    "why_it_matters": "Verifiable SWE environment for reasoning-to-edit tasks.",
    "data_object": "full episode; state action level",
    "feedback_verifier": "environmental, programmatic",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2504.07164",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2504.07164",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/agents/r2e_gym.md"
    },
    "primary_link": "https://arxiv.org/abs/2504.07164"
  },
  {
    "id": "r2e-gym-procedural-training-environments-for-repository-level-code-agents-2025",
    "title": "R2E-Gym: Procedural training environments for repository-level code agents",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe"
    ],
    "subfield": "🧑‍💻 SWE/repository agents",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "r3-robust-rubric-agnostic-reward-models-2025",
    "title": "R3: Robust Rubric-Agnostic Reward Models",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2505.13388",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🧪 Rubric reward models",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2505.13388",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2505.13388",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2505.13388"
  },
  {
    "id": "reinforcement-learning-with-verifiable-rewards-implicitly-incentivizes-correct-r-2025",
    "title": "Reinforcement Learning with Verifiable Rewards Implicitly Incentivizes Correct Reasoning in Base LLMs",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2506.14245",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🏋️ RLVR optimization scaling",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2506.14245",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2506.14245",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2506.14245"
  },
  {
    "id": "retool-reinforcement-learning-for-strategic-tool-use-in-llms-2025",
    "title": "ReTool: Reinforcement Learning for Strategic Tool Use in LLMs",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2504.11536",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🏋️ RLVR optimization scaling",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2504.11536",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2504.11536",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2504.11536"
  },
  {
    "id": "rstar-math-2025",
    "title": "rStar-Math",
    "year": 2025,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "rstar2-agent-agentic-reasoning-technical-report-2025",
    "title": "rStar2-Agent: Agentic Reasoning Technical Report",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2508.20722",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2508.20722",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2508.20722",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2508.20722"
  },
  {
    "id": "rubrics-as-rewards-2025",
    "title": "Rubrics as rewards",
    "year": 2025,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🧪 Rubric reward models",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "s1-2025",
    "title": "s1: Simple Test-Time Scaling",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "construction_recipe",
      "scaling_study"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "sft",
      "test_time_compute"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [],
    "one_line_summary": "s1 curates a small s1K reasoning dataset and studies budget forcing as a simple way to scale test-time reasoning compute.",
    "why_it_matters": "It is a useful counterpoint to massive-data recipes: careful small-set curation plus inference-budget control can materially change reasoning performance.",
    "data_object": "answer level",
    "feedback_verifier": "mixed",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2501.19393",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2501.19393",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/releases/s1.md"
    },
    "primary_link": "https://arxiv.org/abs/2501.19393"
  },
  {
    "id": "safechain-safety-of-language-models-with-long-chain-of-thought-reasoning-capabil-2025",
    "title": "SafeChain: Safety of Language Models with Long Chain-of-Thought Reasoning Capabilities",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2502.12025",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "🧠 Chain-of-thought / rationale data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2502.12025",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2502.12025",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2502.12025"
  },
  {
    "id": "safety-through-reasoning-an-empirical-study-of-reasoning-guardrail-models-2025",
    "title": "Safety Through Reasoning: An Empirical Study of Reasoning Guardrail Models",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2505.20087",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🛡️ Safety reasoning data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2505.20087",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2505.20087",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2505.20087"
  },
  {
    "id": "scaling-behaviors-of-llm-reinforcement-learning-post-training-an-empirical-study-2025",
    "title": "Scaling Behaviors of LLM Reinforcement Learning Post-Training: An Empirical Study in Mathematical Reasoning",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2509.25300",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "scaling_test_time_compute_rlvr",
      "programmatic_math_code_proof"
    ],
    "subfield": "🏋️ RLVR optimization scaling",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2509.25300",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2509.25300",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2509.25300"
  },
  {
    "id": "search-time-data-contamination-2025",
    "title": "Search-Time Data Contamination",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2508.13180",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🧯 Contamination / evaluation surveys",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2508.13180",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2508.13180",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2508.13180"
  },
  {
    "id": "sky-t1-fully-open-reasoning-model-and-data-recipe-2025",
    "title": "Sky-T1: Fully open reasoning model and data recipe",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🏗️ Open reasoning data releases",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "skywork-open-reasoner-1-technical-report-2025",
    "title": "Skywork Open Reasoner 1 Technical Report",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2505.22312",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2505.22312",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2505.22312",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2505.22312"
  },
  {
    "id": "spurious-rewards-2025",
    "title": "Spurious Rewards",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "audit_failure",
      "scaling_study"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "scalar_reward"
    ],
    "training_use": [
      "rlvr",
      "evaluation"
    ],
    "domains": [
      "math",
      "rlvr"
    ],
    "category": [
      "surveys_and_primers",
      "programmatic_math_code_proof",
      "scaling_test_time_compute_rlvr",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "Other related work",
    "tags": [],
    "one_line_summary": "Reward-signal audit for spurious behavior in RLVR.",
    "why_it_matters": "Reward-signal audit for spurious behavior in RLVR.",
    "data_object": "scalar reward",
    "feedback_verifier": "programmatic",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2506.10947",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2506.10947",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/spurious_rewards.md"
    },
    "primary_link": "https://arxiv.org/abs/2506.10947"
  },
  {
    "id": "spurious-rewards-rethinking-training-signals-in-rlvr-2025",
    "title": "Spurious Rewards: Rethinking Training Signals in RLVR",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2506.10947",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "audit_failure_contamination_verifier_attacks",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧨 Spurious rewards",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2506.10947",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2506.10947",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2506.10947"
  },
  {
    "id": "subliminal-learning-2025",
    "title": "Subliminal Learning",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "audit_failure",
      "construction_recipe"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level",
      "full_episode"
    ],
    "training_use": [
      "distillation",
      "evaluation",
      "audit"
    ],
    "domains": [
      "synthetic-data",
      "lineage",
      "distillation"
    ],
    "category": [
      "construction_recipes_open_reasoning_data",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "✍️ Teacher trace generation",
    "tags": [],
    "one_line_summary": "Subliminal Learning shows that teacher models can transmit behavioral traits through semantically unrelated generated data, even after visible trait references are filtered.",
    "why_it_matters": "It is a data-lineage warning for reasoning distillation: synthetic traces may carry hidden model traits that are invisible to content filters.",
    "data_object": "generated data plus downstream behavioral evaluation of the student.; process: teacher identity, student base model, visible filtering policy, hidden trait evaluation.; distillation and synthetic-data training pipeline.",
    "feedback_verifier": "trait probes after student training.",
    "audit_focus": "Data may look safe while carrying hidden traits., Lineage effects can be invisible from sample inspection., Distillation chains can propagate behavior across model generations.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2507.14805",
      "venue": "https://www.nature.com/articles/s41586-026-10319-8",
      "arxiv": "https://arxiv.org/abs/2507.14805",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/MinhxLe/subliminal-learning",
      "data": null,
      "huggingface": null,
      "project": "https://subliminal-learning.com/",
      "bibtex": null,
      "card": "cards/failures/subliminal-learning.md"
    },
    "primary_link": "https://arxiv.org/abs/2507.14805"
  },
  {
    "id": "subliminal-learning-language-models-transmit-behavioral-traits-via-hidden-signal-2025",
    "title": "Subliminal Learning: Language models transmit behavioral traits via hidden signals in data",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2507.14805",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🧬 Hidden lineage / teacher leakage",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2507.14805",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2507.14805",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2507.14805"
  },
  {
    "id": "swe-gym-2025",
    "title": "SWE-Gym",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "data_release",
      "agent_environment"
    ],
    "verification_contract": [
      "environmental",
      "programmatic"
    ],
    "supervision_granularity": [
      "full_episode",
      "state_action_level"
    ],
    "training_use": [
      "agent_training",
      "evaluation"
    ],
    "domains": [
      "software-engineering",
      "agent"
    ],
    "category": [
      "programmatic_math_code_proof",
      "environmental_agents_tools_web_swe",
      "construction_recipes_open_reasoning_data",
      "scaling_test_time_compute_rlvr",
      "benchmarks_evaluation"
    ],
    "subfield": "🧰 Programmatic benchmarks",
    "tags": [],
    "one_line_summary": "Repository-scale training environment showing substrate as data.",
    "why_it_matters": "Repository-scale training environment showing substrate as data.",
    "data_object": "full episode; state action level",
    "feedback_verifier": "environmental, programmatic",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2412.21139",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2412.21139",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/agents/swe_gym.md"
    },
    "primary_link": "https://arxiv.org/abs/2412.21139"
  },
  {
    "id": "swe-rebench-an-automated-pipeline-for-task-collection-and-decontaminated-evaluat-2025",
    "title": "SWE-rebench: An Automated Pipeline for Task Collection and Decontaminated Evaluation of Software Engineering Agents",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2505.20411",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧑‍💻 SWE/repository agents",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2505.20411",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2505.20411",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2505.20411"
  },
  {
    "id": "swe-rl-advancing-language-agents-for-software-engineering-via-reinforcement-lear-2025",
    "title": "SWE-RL: Advancing language agents for software engineering via reinforcement learning",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧑‍💻 SWE/repository agents",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "swe-rl-advancing-llm-reasoning-via-reinforcement-learning-on-open-software-evolu-2025",
    "title": "SWE-RL: Advancing LLM Reasoning via Reinforcement Learning on Open Software Evolution",
    "year": 2025,
    "venue": "Advances in Neural Information Processing Systems (NeurIPS)",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧑‍💻 SWE/repository agents",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2502.18449",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2502.18449",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2502.18449"
  },
  {
    "id": "swe-smith-scaling-data-construction-for-software-engineering-agents-2025",
    "title": "SWE-smith: Scaling data construction for software engineering agents",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧑‍💻 SWE/repository agents",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "tan-scaling-rl-2025",
    "title": "Scaling Behaviors of LLM Reinforcement Learning Post-Training",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "scaling_study",
      "construction_recipe"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level",
      "scalar_reward"
    ],
    "training_use": [
      "rlvr",
      "evaluation"
    ],
    "domains": [
      "math",
      "scaling",
      "rlvr"
    ],
    "category": [
      "programmatic_math_code_proof",
      "scaling_test_time_compute_rlvr",
      "frontier_model_reports"
    ],
    "subfield": "📐 Math answer-verifiable data",
    "tags": [],
    "one_line_summary": "This study measures how model size, data volume, and compute budget interact during RL post-training for mathematical reasoning.",
    "why_it_matters": "It helps turn RLVR from recipe folklore into a scaling problem: data reuse, optimization steps, and model size have different effects on learning efficiency and final performance.",
    "data_object": "problem, generated solution/answer, reward outcome, and training curve metrics.; process: model size, data volume, compute budget, optimization steps, reward signal, validation performance.; RL post-training experiments over math tasks.",
    "feedback_verifier": "answer-level reward for mathematical reasoning and scaling curves.",
    "audit_focus": "Math-only scaling can overstate transfer to open-ended reasoning., Repeated data reuse can improve metrics while increasing overfitting risk., Power-law fits can hide reward or benchmark artifacts.",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2509.25300",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2509.25300",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/scaling-behaviors-rl-post-training.md"
    },
    "primary_link": "https://arxiv.org/abs/2509.25300"
  },
  {
    "id": "the-agent-company-benchmarking-and-analyzing-agent-work-in-enterprise-like-envir-2025",
    "title": "The Agent Company: Benchmarking and analyzing agent work in enterprise-like environments",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "the-entropy-mechanism-of-reinforcement-learning-for-reasoning-language-models-2025",
    "title": "The Entropy Mechanism of Reinforcement Learning for Reasoning Language Models",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2505.22617",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🏋️ RLVR optimization scaling",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2505.22617",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2505.22617",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2505.22617"
  },
  {
    "id": "the-invisible-leash-why-rlvr-may-or-may-not-escape-its-origin-2025",
    "title": "The Invisible Leash: Why RLVR May or May Not Escape Its Origin",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2507.14843",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🔍 Scaling attribution",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2507.14843",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2507.14843",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2507.14843"
  },
  {
    "id": "the-markovian-thinker-architecture-agnostic-linear-scaling-of-reasoning-2025",
    "title": "The Markovian Thinker: Architecture-Agnostic Linear Scaling of Reasoning",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2510.06557",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧪 Verifier scaling",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2510.06557",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2510.06557",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2510.06557"
  },
  {
    "id": "toward-conversational-diagnostic-ai-the-amie-system-2025",
    "title": "Toward conversational diagnostic AI: The AMIE system",
    "year": 2025,
    "venue": "Nature",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧾 Factuality / grounding",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "towards-understanding-self-play-for-llm-reasoning-2025",
    "title": "Towards Understanding Self-play for LLM Reasoning",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2510.27072",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🔁 Self-play / self-improvement",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2510.27072",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2510.27072",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2510.27072"
  },
  {
    "id": "ttrl-test-time-reinforcement-learning-2025",
    "title": "TTRL: Test-Time Reinforcement Learning",
    "year": 2025,
    "venue": "arXiv preprint arXiv:2504.16084",
    "authors": [],
    "source_role": [
      "construction_recipe",
      "scaling_study",
      "verifier_reward"
    ],
    "verification_contract": [
      "mixed"
    ],
    "supervision_granularity": [
      "scalar_reward",
      "answer_level"
    ],
    "training_use": [
      "rlvr",
      "test_time_compute",
      "evaluation"
    ],
    "domains": [
      "reasoning",
      "unlabeled_data"
    ],
    "category": [
      "scaling_test_time_compute_rlvr",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "⏱️ Test-time compute",
    "tags": [
      "curated-card",
      "primary-link-checked",
      "seeded-from-bib"
    ],
    "one_line_summary": "Test-time reinforcement learning recipe that studies how unlabeled data and reward signals can adapt a model during inference-time training.",
    "why_it_matters": "Test-time reinforcement learning recipe that studies how unlabeled data and reward signals can adapt a model during inference-time training.",
    "data_object": "candidate response with reward/adaptation signal; process: unlabeled input, rollout, reward signal; test-time task distribution",
    "feedback_verifier": "task-specific or learned reward used during adaptation",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2504.16084",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2504.16084",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/recipes/ttrl.md"
    },
    "primary_link": "https://arxiv.org/abs/2504.16084"
  },
  {
    "id": "var-math-2025",
    "title": "VAR-MATH",
    "year": 2025,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "programmatic_math_code_proof"
    ],
    "subfield": "📐 Math answer-verifiable data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "why-we-no-longer-evaluate-on-swe-bench-verified-2025",
    "title": "Why we no longer evaluate on SWE-bench Verified",
    "year": 2025,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧑‍💻 SWE/repository agents",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "xlam-a-family-of-large-action-models-to-empower-ai-agent-systems-2025",
    "title": "xLAM: A family of large action models to empower AI agent systems",
    "year": 2025,
    "venue": "NAACL",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "frontier_model_reports"
    ],
    "subfield": "🔁 Replayable trajectory data",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "agent-world-scaling-real-world-environment-synthesis-for-evolving-general-agent--2026",
    "title": "Agent-World: Scaling Real-World Environment Synthesis for Evolving General Agent Intelligence",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2604.18292",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2604.18292",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2604.18292",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2604.18292"
  },
  {
    "id": "alternating-reinforcement-learning-for-rubric-based-reward-modeling-in-non-verif-2026",
    "title": "Alternating Reinforcement Learning for Rubric-Based Reward Modeling in Non-Verifiable LLM Post-Training (Rubric-ARM)",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2602.01511",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "judgment_required_rubrics_safety_domain",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2602.01511",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2602.01511",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2602.01511"
  },
  {
    "id": "an-imperfect-verifier-is-good-enough-learning-with-noisy-rewards-2026",
    "title": "An Imperfect Verifier is Good Enough: Learning with Noisy Rewards",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2604.07666",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "process_supervision_prm",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2604.07666",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2604.07666",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2604.07666"
  },
  {
    "id": "autorubric-unifying-rubric-based-llm-evaluation-2026",
    "title": "Autorubric: Unifying Rubric-based LLM Evaluation",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2603.00077",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧑‍⚖️ Human/expert judgment",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2603.00077",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2603.00077",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2603.00077"
  },
  {
    "id": "bootstrapping-post-training-signals-for-open-ended-tasks-via-rubric-based-self-p-2026",
    "title": "Bootstrapping Post-training Signals for Open-ended Tasks via Rubric-based Self-play on Pre-training Text (POP)",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2604.20051",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2604.20051",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2604.20051",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2604.20051"
  },
  {
    "id": "coverrl-breaking-the-consensus-trap-in-label-free-reasoning-via-generator-verifi-2026",
    "title": "CoVerRL: Breaking the Consensus Trap in Label-Free Reasoning via Generator-Verifier Co-Evolution",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2603.17775",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "process_supervision_prm",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🪜 Human step-level labels",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2603.17775",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2603.17775",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2603.17775"
  },
  {
    "id": "decoupling-kl-and-trajectories-a-unified-perspective-for-sft-dagger-offline-rl-a-2026",
    "title": "Decoupling KL and Trajectories: A Unified Perspective for SFT, DAgger, Offline RL, and OPD in LLM Distillation",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2605.16826",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2605.16826",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2605.16826",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2605.16826"
  },
  {
    "id": "dual-consensus-escaping-from-spurious-majority-in-unsupervised-rlvr-via-two-stag-2026",
    "title": "Dual Consensus: Escaping from Spurious Majority in Unsupervised RLVR via Two-Stage Vote Mechanism",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2603.16223",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🎮 Reward hacking",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2603.16223",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2603.16223",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2603.16223"
  },
  {
    "id": "learning-beyond-teacher-generalized-on-policy-distillation-with-reward-extrapola-2026",
    "title": "Learning beyond Teacher: Generalized On-Policy Distillation with Reward Extrapolation (G-OPD)",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2602.12125",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "✍️ Teacher trace generation",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2602.12125",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2602.12125",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2602.12125"
  },
  {
    "id": "live-swe-agent-can-software-engineering-agents-self-evolve-on-the-fly-2026",
    "title": "Live-SWE-agent: Can Software Engineering Agents Self-Evolve on the Fly?",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2511.13646",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe"
    ],
    "subfield": "🧑‍💻 SWE/repository agents",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2511.13646",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2511.13646",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2511.13646"
  },
  {
    "id": "llms-gaming-verifiers-rlvr-can-lead-to-reward-hacking-2026",
    "title": "LLMs Gaming Verifiers: RLVR can Lead to Reward Hacking",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2604.15149",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "process_supervision_prm",
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2604.15149",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2604.15149",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2604.15149"
  },
  {
    "id": "omni-rrm-advancing-omni-reward-modeling-via-automatic-rubric-grounded-preference-2026",
    "title": "Omni-RRM: Advancing Omni Reward Modeling via Automatic Rubric-Grounded Preference Synthesis",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2602.00846",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "foundations_instruction_preference_alignment",
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🤝 Human preference data / RLHF",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2602.00846",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2602.00846",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2602.00846"
  },
  {
    "id": "pass-k-t-re-examining-the-reasoning-boundary-for-agentic-rl-2026",
    "title": "pass@$(k,T)$: Re-examining the reasoning boundary for agentic RL",
    "year": 2026,
    "venue": "arXiv preprint",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🎲 pass@k / sampling budget",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source link is still pending; detailed reasoning-data summary is also pending.",
    "why_it_matters": "Unverified citation waypoint; locate the official source, then add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "rewardbench-2-2026",
    "title": "RewardBench 2",
    "year": 2026,
    "venue": "ICLR",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "judgment_required_rubrics_safety_domain",
      "benchmarks_evaluation"
    ],
    "subfield": "🧪 Rubric reward models",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source link is still pending; detailed reasoning-data summary is also pending.",
    "why_it_matters": "Unverified citation waypoint; locate the official source, then add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "soft-contamination-means-benchmarks-test-shallow-generalization-2026",
    "title": "Soft Contamination Means Benchmarks Test Shallow Generalization",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2602.12413",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧯 Contamination / evaluation surveys",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2602.12413",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2602.12413",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2602.12413"
  },
  {
    "id": "spurious-rewards-paradox-mechanistically-understanding-how-rlvr-activates-memori-2026",
    "title": "Spurious Rewards Paradox: Mechanistically Understanding How RLVR Activates Memorization Shortcuts in LLMs",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2601.11061",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "audit_failure_contamination_verifier_attacks"
    ],
    "subfield": "🧨 Spurious rewards",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2601.11061",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2601.11061",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2601.11061"
  },
  {
    "id": "swe-master-unleashing-the-potential-of-software-engineering-agents-via-post-trai-2026",
    "title": "SWE-Master: Unleashing the Potential of Software Engineering Agents via Post-Training",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2602.03411",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "surveys_and_primers",
      "environmental_agents_tools_web_swe"
    ],
    "subfield": "Other related work",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "survey background",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2602.03411",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2602.03411",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2602.03411"
  },
  {
    "id": "terminal-bench-a-benchmark-and-task-environment-for-terminal-agents-2026",
    "title": "Terminal-Bench: A benchmark and task environment for terminal agents",
    "year": 2026,
    "venue": "unknown",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "environmental_agents_tools_web_swe",
      "audit_failure_contamination_verifier_attacks",
      "benchmarks_evaluation"
    ],
    "subfield": "🧰 Agent benchmarks and terminal predicates",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source link is still pending; detailed reasoning-data summary is also pending.",
    "why_it_matters": "Unverified citation waypoint; locate the official source, then add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "needs_metadata",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": null
  },
  {
    "id": "why-does-self-distillation-sometimes-degrade-the-reasoning-capability-of-llms-2026",
    "title": "Why Does Self-Distillation (Sometimes) Degrade the Reasoning Capability of LLMs?",
    "year": 2026,
    "venue": "arXiv preprint arXiv:2603.24472",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "unknown"
    ],
    "domains": [],
    "category": [
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧱 Prompt sourcing",
    "tags": [
      "seeded-from-bib"
    ],
    "one_line_summary": "Official source is linked; detailed reasoning-data summary is still pending.",
    "why_it_matters": "Verified citation waypoint; add a paper-specific data-object, verifier, and audit note before promoting it as a core read.",
    "data_object": "metadata pending",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2603.24472",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2603.24472",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2603.24472"
  },
  {
    "id": "openr1-math-220k-2025",
    "title": "OpenR1-Math-220k",
    "year": 2025,
    "venue": "Hugging Face / GitHub",
    "authors": [],
    "source_role": [
      "data_release",
      "construction_recipe"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "sft",
      "distillation",
      "rlvr"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "programmatic_math_code_proof",
      "construction_recipes_open_reasoning_data"
    ],
    "subfield": "🧮 Math RLVR datasets",
    "tags": [
      "curated-card",
      "primary-link-checked"
    ],
    "one_line_summary": "Open R1 math dataset/reproduction asset with large-scale math questions and reasoning traces; read it through lineage, verifier, and filtering fields.",
    "why_it_matters": "Open R1 math dataset/reproduction asset with large-scale math questions and reasoning traces; read it through lineage, verifier, and filtering fields.",
    "data_object": "math problem with reasoning trace and final answer; process: problem, reasoning trace, answer; offline math corpus",
    "feedback_verifier": "math answer verifier / filtering pipeline",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "partial",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/huggingface/open-r1",
      "data": null,
      "huggingface": "https://huggingface.co/datasets/open-r1/OpenR1-Math-220k",
      "project": null,
      "bibtex": null,
      "card": "cards/releases/openr1.md"
    },
    "primary_link": null
  },
  {
    "id": "omegaprm-automated-process-supervision-2024",
    "title": "OmegaPRM: Improve Mathematical Reasoning in Language Models by Automated Process Supervision",
    "year": 2024,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "process_supervision",
      "verifier_reward",
      "construction_recipe"
    ],
    "verification_contract": [
      "programmatic",
      "mixed"
    ],
    "supervision_granularity": [
      "step_level",
      "process_reward"
    ],
    "training_use": [
      "process_supervision",
      "reward_modeling",
      "test_time_compute"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "process_supervision_prm",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🛠️ Automatic process supervision",
    "tags": [
      "curated-card",
      "primary-link-checked"
    ],
    "one_line_summary": "Automated process-supervision recipe that uses search to locate first errors and generate PRM training signals without human labels.",
    "why_it_matters": "Automated process-supervision recipe that uses search to locate first errors and generate PRM training signals without human labels.",
    "data_object": "process supervision annotations; process: partial reasoning prefix, first-error signal, positive/negative step examples; offline math search tree",
    "feedback_verifier": "automated process reward signal",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2406.06592",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2406.06592",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/omegaprm.md"
    },
    "primary_link": "https://arxiv.org/abs/2406.06592"
  },
  {
    "id": "tinyv-2025",
    "title": "TinyV: Reducing False Negatives in Verification Improves RL for LLM Reasoning",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "verifier_reward",
      "audit_failure",
      "construction_recipe"
    ],
    "verification_contract": [
      "programmatic",
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level",
      "scalar_reward"
    ],
    "training_use": [
      "rlvr",
      "reward_modeling",
      "evaluation"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "process_supervision_prm",
      "audit_failure_contamination_verifier_attacks",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "Other related work",
    "tags": [
      "curated-card",
      "primary-link-checked"
    ],
    "one_line_summary": "Lightweight verifier aimed at recovering false negatives from rule-based math verifiers during RL training.",
    "why_it_matters": "Lightweight verifier aimed at recovering false negatives from rule-based math verifiers during RL training.",
    "data_object": "candidate answer with recovered reward decision; process: original verifier verdict, TinyV verdict, reward correction; offline math verifier stack",
    "feedback_verifier": "small LLM verifier augmenting rules",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L4_carded",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2505.14625",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2505.14625",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/uw-nsl/TinyV",
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/verifiers/tinyv.md"
    },
    "primary_link": "https://arxiv.org/abs/2505.14625"
  },
  {
    "id": "qwen3-coder-2025",
    "title": "Qwen3-Coder",
    "year": 2025,
    "venue": "GitHub / project report",
    "authors": [],
    "source_role": [
      "model_report",
      "construction_recipe"
    ],
    "verification_contract": [
      "programmatic",
      "environmental",
      "mixed"
    ],
    "supervision_granularity": [
      "answer_level",
      "full_episode"
    ],
    "training_use": [
      "sft",
      "rlvr",
      "agent_training",
      "evaluation"
    ],
    "domains": [
      "code",
      "software_engineering",
      "agents"
    ],
    "category": [
      "environmental_agents_tools_web_swe",
      "construction_recipes_open_reasoning_data",
      "frontier_model_reports"
    ],
    "subfield": "🛠️ Tool-use data",
    "tags": [
      "curated-card",
      "primary-link-checked"
    ],
    "one_line_summary": "Coding-agent recipe entry for studying how code data, tool-call tasks, verifiable execution, and agentic RL enter a frontier open model release.",
    "why_it_matters": "Coding-agent recipe entry for studying how code data, tool-call tasks, verifiable execution, and agentic RL enter a frontier open model release.",
    "data_object": "code solution, tool-call, or agent trajectory; process: code answer, tool call, execution result; code execution and agent task environments",
    "feedback_verifier": "unit tests, execution feedback, and agent task success signals",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L0_seeded",
    "status": "partial",
    "needs_search": true,
    "artifacts": {
      "paper": null,
      "venue": null,
      "arxiv": null,
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/QwenLM/Qwen3-Coder",
      "data": null,
      "huggingface": null,
      "project": "https://qwenlm.github.io/blog/qwen3-coder/",
      "bibtex": null,
      "card": "cards/recipes/qwen3_coder.md"
    },
    "primary_link": null
  },
  {
    "id": "gsm8k-grade-school-math-2021",
    "title": "GSM8K: Grade School Math 8K",
    "year": 2021,
    "venue": "arXiv / OpenAI dataset",
    "authors": [
      "Karl Cobbe et al."
    ],
    "source_role": [
      "benchmark",
      "data_release"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "evaluation",
      "sft",
      "reward_modeling"
    ],
    "domains": [
      "math"
    ],
    "category": [
      "programmatic_math_code_proof",
      "benchmarks_evaluation"
    ],
    "subfield": "📐 Math answer-verifiable data",
    "tags": [
      "gsm8k",
      "math",
      "benchmark",
      "verifier"
    ],
    "one_line_summary": "Canonical grade-school math benchmark with natural-language word problems, worked solutions, and final numeric answers.",
    "why_it_matters": "It remains a compact sanity check for answer-verifiable reasoning data, verifier reranking, SFT, and RLVR-style math training.",
    "data_object": "natural-language solution with final numeric answer; process: question, solution, final answer; offline math benchmark",
    "feedback_verifier": "answer extraction and arithmetic correctness checks",
    "audit_focus": "answer extraction errors, contamination through benchmark reuse",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2110.14168",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2110.14168",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/openai/grade-school-math",
      "data": "https://github.com/openai/grade-school-math",
      "huggingface": "https://huggingface.co/datasets/openai/gsm8k",
      "project": null,
      "bibtex": null,
      "card": "cards/benchmarks/gsm8k-grade-school-math-8k.md"
    },
    "primary_link": "https://arxiv.org/abs/2110.14168"
  },
  {
    "id": "humaneval-code-generation-benchmark-2021",
    "title": "HumanEval: Hand-Written Evaluation Set",
    "year": 2021,
    "venue": "arXiv / OpenAI dataset",
    "authors": [
      "Mark Chen et al."
    ],
    "source_role": [
      "benchmark",
      "data_release"
    ],
    "verification_contract": [
      "programmatic"
    ],
    "supervision_granularity": [
      "answer_level"
    ],
    "training_use": [
      "evaluation"
    ],
    "domains": [
      "code"
    ],
    "category": [
      "programmatic_math_code_proof",
      "benchmarks_evaluation"
    ],
    "subfield": "💻 Code execution / unit-test data",
    "tags": [
      "humaneval",
      "code",
      "unit-tests",
      "benchmark"
    ],
    "one_line_summary": "HumanEval provides hand-written Python programming problems with unit tests for executable code-generation evaluation.",
    "why_it_matters": "It made unit-test execution a standard verifier for code reasoning, pass@k reporting, and later code-data filtering recipes.",
    "data_object": "Python function completion; process: prompt, canonical solution, unit tests; Python execution harness",
    "feedback_verifier": "unit tests",
    "audit_focus": "public benchmark contamination, unit-test coverage gaps",
    "curation_level": "L5_audit_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2107.03374",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2107.03374",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": "https://github.com/openai/human-eval",
      "data": "https://github.com/openai/human-eval",
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": "cards/benchmarks/humaneval-hand-written-evaluation-set.md"
    },
    "primary_link": "https://arxiv.org/abs/2107.03374"
  },
  {
    "id": "reasoning-with-large-language-models-a-survey-2024",
    "title": "Reasoning with Large Language Models, a Survey",
    "year": 2024,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "audit"
    ],
    "domains": [
      "reasoning",
      "survey"
    ],
    "category": [
      "surveys_and_primers"
    ],
    "subfield": "🧠 Reasoning LLM surveys",
    "tags": [
      "survey",
      "reasoning-llms"
    ],
    "one_line_summary": "Surveys reasoning with large language models, giving newcomers a map of reasoning paradigms, tasks, and evaluation patterns.",
    "why_it_matters": "It fills the reasoning-LLM survey lane of the atlas so readers can separate model-centric reasoning work from data-object and verifier-centric papers.",
    "data_object": "survey taxonomy and literature map.; literature survey.",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2407.11511",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2407.11511",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2407.11511"
  },
  {
    "id": "a-survey-of-reinforcement-learning-from-human-feedback-2023",
    "title": "A Survey of Reinforcement Learning from Human Feedback",
    "year": 2023,
    "venue": "TMLR",
    "authors": [
      "Timo Kaufmann",
      "Paul Weng",
      "Viktor Bengs",
      "Eyke Hüllermeier"
    ],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "pairwise_preference",
      "scalar_reward"
    ],
    "training_use": [
      "reward_modeling",
      "preference_learning",
      "audit"
    ],
    "domains": [
      "rlhf",
      "reward-modeling",
      "alignment"
    ],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "🧭 Post-training surveys",
    "tags": [
      "rlhf",
      "reward-model-survey",
      "human-feedback",
      "preference-learning"
    ],
    "one_line_summary": "Surveys RLHF as a feedback-to-reward-to-policy pipeline, giving readers the baseline vocabulary for human preference data and learned reward models.",
    "why_it_matters": "It fills the RLHF survey doorway by separating human preference feedback, reward modeling, and policy optimization before readers compare them with verifiable-reward reasoning data.",
    "data_object": "survey taxonomy over feedback collection, reward modeling, and policy optimization.; process: feedback source, preference format, reward model objective; RLHF pipelines spanning LLMs and broader RL settings.",
    "feedback_verifier": "learned reward model from human feedback.",
    "audit_focus": "Human feedback can be noisy, subjective, sparse, or expensive., Reward models can overfit annotator preferences and become exploitable objectives., LLM readers may overgeneralize broad RLHF lessons to verifiable-reasoning settings.",
    "curation_level": "L3_summary_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2312.14925",
      "venue": "https://openreview.net/forum?id=f7OkIurx4b",
      "arxiv": "https://arxiv.org/abs/2312.14925",
      "openreview": "https://openreview.net/forum?id=f7OkIurx4b",
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": "https://doi.org/10.48550/arXiv.2312.14925",
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2312.14925"
  },
  {
    "id": "a-comprehensive-survey-of-reward-models-taxonomy-applications-challenges-and-future-2025",
    "title": "A Comprehensive Survey of Reward Models: Taxonomy, Applications, Challenges, and Future",
    "year": 2025,
    "venue": "arXiv",
    "authors": [
      "Jialun Zhong",
      "Wei Shen",
      "Yanzeng Li",
      "Songyang Gao",
      "Hua Lu",
      "Yicheng Chen",
      "Yang Zhang",
      "Wei Zhou",
      "Jinjie Gu",
      "Lei Zou"
    ],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "pairwise_preference",
      "scalar_reward"
    ],
    "training_use": [
      "reward_modeling",
      "preference_learning",
      "evaluation",
      "audit"
    ],
    "domains": [
      "reward-modeling",
      "rlhf",
      "alignment"
    ],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment",
      "judgment_required_rubrics_safety_domain"
    ],
    "subfield": "🧭 Post-training surveys",
    "tags": [
      "reward-model-survey",
      "reward-modeling",
      "rlhf",
      "preference-data"
    ],
    "one_line_summary": "Surveys reward models from preference collection through model training, use, evaluation benchmarks, and failure modes.",
    "why_it_matters": "It gives readers a reward-model-specific map, which is essential before comparing learned human-preference rewards with PRMs, rubric rewards, and programmatic RLVR verifiers.",
    "data_object": "taxonomy of reward-model data sources, objectives, applications, evaluations, and challenges.; process: preference source, reward model architecture, usage mode; LLM reward-model training and evaluation pipelines.",
    "feedback_verifier": "reward model as proxy objective for downstream post-training.",
    "audit_focus": "Reward models may encode annotator bias, style bias, or length preference., Proxy rewards can be overoptimized or attacked when used as training objectives., Benchmark scores can obscure whether the reward model is useful for reasoning data.",
    "curation_level": "L3_summary_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2504.12328",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2504.12328",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": "https://doi.org/10.48550/arXiv.2504.12328",
      "code": null,
      "data": null,
      "huggingface": null,
      "project": "https://github.com/JLZhong23/awesome-reward-models",
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2504.12328"
  },
  {
    "id": "a-survey-on-human-preference-learning-for-large-language-models-2024",
    "title": "A Survey on Human Preference Learning for Large Language Models",
    "year": 2024,
    "venue": "arXiv",
    "authors": [
      "Ruili Jiang",
      "Kehai Chen",
      "Xuefeng Bai",
      "Zhixuan He",
      "Juntao Li",
      "Muyun Yang",
      "Tiejun Zhao",
      "Liqiang Nie",
      "Min Zhang"
    ],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "judgment_required",
      "mixed"
    ],
    "supervision_granularity": [
      "pairwise_preference",
      "scalar_reward"
    ],
    "training_use": [
      "preference_learning",
      "reward_modeling",
      "evaluation"
    ],
    "domains": [
      "human-preference",
      "preference-learning",
      "alignment"
    ],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment"
    ],
    "subfield": "🧪 RLHF / reward-model surveys",
    "tags": [
      "preference-learning",
      "human-preference",
      "rlhf",
      "reward-modeling"
    ],
    "one_line_summary": "Reviews preference feedback sources, preference formats, modeling methods, usage objectives, and evaluation for LLM alignment.",
    "why_it_matters": "It makes the preference-data layer explicit, helping readers distinguish demonstrations, pairwise comparisons, scalar rewards, DPO-style objectives, and evaluation judgments.",
    "data_object": "preference-centered taxonomy over feedback data, preference modeling, preference usage, and aligned-model evaluation.; process: preference source, preference format, preference model; LLM alignment pipelines using human preference signals.",
    "feedback_verifier": "human preference signal transformed into reward, preference loss, or evaluation judgment.",
    "audit_focus": "Preference labels can be noisy, culturally variable, or underspecified., Pairwise preferences may not preserve reasoning correctness or factual grounding., Evaluation of aligned models can conflate helpfulness, style, and reasoning quality.",
    "curation_level": "L3_summary_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2406.11191",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2406.11191",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": "https://doi.org/10.48550/arXiv.2406.11191",
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2406.11191"
  },
  {
    "id": "reinforcement-learning-for-llm-post-training-a-survey-2024",
    "title": "Reinforcement Learning for LLM Post-Training: A Survey",
    "year": 2024,
    "venue": "arXiv",
    "authors": [
      "Zhichao Wang",
      "Kiran Ramnath",
      "Bin Bi",
      "Shiva Kumar Pentyala",
      "Sougata Chaudhuri",
      "Shubham Mehrotra",
      "Zixu Zhu",
      "Xiang-Bo Mao",
      "Sitaram Asur",
      "Na Cheng"
    ],
    "source_role": [
      "survey_background",
      "scaling_study"
    ],
    "verification_contract": [
      "mixed",
      "programmatic"
    ],
    "supervision_granularity": [
      "scalar_reward",
      "answer_level"
    ],
    "training_use": [
      "reward_modeling",
      "preference_learning",
      "rlvr",
      "test_time_compute",
      "audit"
    ],
    "domains": [
      "post-training",
      "rlhf",
      "rlvr",
      "preference-learning"
    ],
    "category": [
      "surveys_and_primers",
      "foundations_instruction_preference_alignment",
      "scaling_test_time_compute_rlvr"
    ],
    "subfield": "🧭 Post-training surveys",
    "tags": [
      "rlhf",
      "rlvr",
      "reward-model-survey",
      "post-training",
      "policy-optimization"
    ],
    "one_line_summary": "Unifies RLHF, DPO, PPO/GRPO, and RLVR as LLM post-training methods, with attention to prompts, responses, rewards, and optimization details.",
    "why_it_matters": "It connects classic RLHF and reward modeling to reasoning-oriented RLVR, helping readers avoid conflating human preference rewards with programmatic or verifiable rewards.",
    "data_object": "technical survey comparing RLHF and RLVR policy-gradient style post-training methods.; process: prompt sampling, response sampling, reward source; LLM post-training algorithms and reasoning tasks such as math and coding.",
    "feedback_verifier": "learned preference rewards, verifiable rewards, and policy-gradient objectives.",
    "audit_focus": "Method comparisons can mix data effects with optimizer and sampling-budget effects., RLHF and RLVR rewards are often discussed together despite different verification contracts., Implementation details can dominate reported gains if not separated from data quality.",
    "curation_level": "L3_summary_ready",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2407.16216",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2407.16216",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": "https://doi.org/10.48550/arXiv.2407.16216",
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2407.16216"
  },
  {
    "id": "survey-of-reasoning-large-language-models-2025",
    "title": "A Survey of Reasoning with Foundation Models",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "audit"
    ],
    "domains": [
      "reasoning",
      "survey"
    ],
    "category": [
      "surveys_and_primers"
    ],
    "subfield": "🧠 Reasoning LLM surveys",
    "tags": [
      "survey",
      "reasoning-llms",
      "foundation-models"
    ],
    "one_line_summary": "Surveys reasoning with foundation models, including tasks, methods, and evaluation patterns that reasoning-data readers need as context.",
    "why_it_matters": "It gives the atlas a second reasoning-survey waypoint so readers can orient before choosing math, code, agent, rubric, or scaling tracks.",
    "data_object": "survey taxonomy and literature map.; literature survey.",
    "feedback_verifier": "metadata pending",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2502.17419",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2502.17419",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2502.17419"
  },
  {
    "id": "survey-on-evaluation-of-llm-based-agents-2025",
    "title": "A Survey on Evaluation of LLM-based Agents",
    "year": 2025,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "survey_background"
    ],
    "verification_contract": [
      "environmental",
      "mixed"
    ],
    "supervision_granularity": [
      "full_episode",
      "state_action_level"
    ],
    "training_use": [
      "evaluation",
      "audit"
    ],
    "domains": [
      "agents",
      "evaluation"
    ],
    "category": [
      "surveys_and_primers",
      "environmental_agents_tools_web_swe",
      "benchmarks_evaluation"
    ],
    "subfield": "🌐 Agent data / tool-use surveys",
    "tags": [
      "survey",
      "agent-evaluation",
      "tool-use"
    ],
    "one_line_summary": "Surveys how LLM-based agents are evaluated across tasks, environments, metrics, and interaction settings.",
    "why_it_matters": "It gives the atlas an agent-survey waypoint for readers who need to understand environment data before comparing SWE, web, app, or OS agent benchmarks.",
    "data_object": "survey taxonomy for agent evaluation tasks and environments.; process: task, environment, trajectory, evaluator, terminal predicate.; LLM-agent evaluation literature.",
    "feedback_verifier": "environmental and benchmark evaluators summarized by the survey.",
    "audit_focus": "check links, lineage, verifier, split, and contamination",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2503.16416",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2503.16416",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2503.16416"
  },
  {
    "id": "language-model-developers-should-report-train-test-overlap-2024",
    "title": "Language Model Developers Should Report Train-Test Overlap",
    "year": 2024,
    "venue": "arXiv",
    "authors": [],
    "source_role": [
      "audit_failure",
      "survey_background"
    ],
    "verification_contract": [
      "unknown"
    ],
    "supervision_granularity": [
      "unknown"
    ],
    "training_use": [
      "audit"
    ],
    "domains": [
      "contamination",
      "data-reuse"
    ],
    "category": [
      "scaling_test_time_compute_rlvr",
      "audit_failure_contamination_verifier_attacks",
      "surveys_and_primers"
    ],
    "subfield": "🔁 Data reuse and uniqueness",
    "tags": [
      "train-test-overlap",
      "data-reuse",
      "contamination"
    ],
    "one_line_summary": "Argues that language-model releases should report train-test overlap so evaluation gains can be interpreted with data reuse in mind.",
    "why_it_matters": "It gives the scaling track a concrete data-reuse and uniqueness reference for checking whether repeated or overlapping examples are counted as fresh evidence.",
    "data_object": "overlap and reporting analysis.; process: training corpus, evaluation set, overlap estimate, reporting policy.; benchmark and training-data documentation.",
    "feedback_verifier": "overlap analysis rather than a reward model.",
    "audit_focus": "Reported benchmark gains can be inflated when train-test overlap is not disclosed.",
    "curation_level": "L1_link_verified",
    "status": "verified",
    "needs_search": false,
    "artifacts": {
      "paper": "https://arxiv.org/abs/2410.08385",
      "venue": null,
      "arxiv": "https://arxiv.org/abs/2410.08385",
      "openreview": null,
      "acl": null,
      "pmlr": null,
      "cvf": null,
      "doi": null,
      "code": null,
      "data": null,
      "huggingface": null,
      "project": null,
      "bibtex": null,
      "card": null
    },
    "primary_link": "https://arxiv.org/abs/2410.08385"
  }
]
