{"schema":{"version":"iter-310","generated":"2026-05-28T16:51:19.667Z","docs":"https://policywindow.org/wiki/methodology","license":"CC BY 4.0 (article content) + MIT (code/schema)"},"instruments":[{"shortCode":"EU-AIA-2024","jurisdiction":"EU","name":"EU AI Act","kind":"binding_regulation","adoptedDate":"2024-07-12","effectiveDate":"2024-08-01","sourceUrl":"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A32024R1689","sourceCitation":"Regulation (EU) 2024/1689","lastReviewedAt":"2026-05-24","status":"in_force","notes":"Risk-based framework. Prohibited practices (Art. 5) effective Feb 2025; general-purpose AI obligations Aug 2025; high-risk system obligations Aug 2026.","conceptsUsed":["frontier-tier","systemic-risk","designated-systemic","compute-threshold","red-team-evaluation","model-card","provenance-watermarking","alignment","deceptive-alignment","scalable-oversight","capability-elicitation","policy-instrument","ai-supply-chain","training-data-attribution","prompt-injection","multi-turn-evaluation","data-poisoning","jailbreak-resistance","sandbagging","hallucination","in-context-learning","retrieval-augmented-generation"],"externalIdentifiers":{"wikidata_q_id":"Q120746920","iso_3166_alpha2":"EU"}},{"shortCode":"US-EO-14110","jurisdiction":"US","name":"Executive Order 14110 on Safe, Secure, Trustworthy AI","kind":"executive_order","adoptedDate":"2023-10-30","effectiveDate":"2023-10-30","sourceUrl":"https://www.federalregister.gov/documents/2023/11/01/2023-24283/safe-secure-and-trustworthy-development-and-use-of-artificial-intelligence","sourceCitation":"Exec. Order No. 14110, 88 Fed. Reg. 75191 (Nov. 1, 2023)","lastReviewedAt":"2026-05-24","status":"partial","notes":"Partially rescinded by EO 14179 (Jan 2025). 
Some §4 reporting persists via Defense Production Act + BIS interim rule.","conceptsUsed":["frontier-tier","compute-threshold","red-team-evaluation","provenance-watermarking","alignment","capability-elicitation","dual-use-research-taxonomy","policy-instrument"],"externalIdentifiers":{"wikidata_q_id":"Q123204027","iso_3166_alpha2":"US"}},{"shortCode":"US-EO-14179","jurisdiction":"US","name":"Executive Order 14179 — Removing Barriers to American Leadership in AI","kind":"executive_order","adoptedDate":"2025-01-23","effectiveDate":"2025-01-23","sourceUrl":"https://www.whitehouse.gov/presidential-actions/2025/01/removing-barriers-to-american-leadership-in-artificial-intelligence/","sourceCitation":"Exec. Order No. 14179, 90 Fed. Reg. 8741 (Jan 31, 2025)","lastReviewedAt":"2026-05-24","status":"in_force","notes":"Rescinds EO 14110's regulatory-burden provisions. Directs OMB / OSTP / NSC to remove barriers to AI development. Does NOT itself impose new substantive obligations — coverage is mostly silent. The DPA-grounded compute-reporting interim rule (BIS, Jan 2025) and Defense Production Act §708 reporting persist independently.","conceptsUsed":["policy-instrument"],"externalIdentifiers":{"iso_3166_alpha2":"US"}},{"shortCode":"UK-WHITEPAPER-2023","jurisdiction":"UK","name":"UK Pro-Innovation Approach to AI Regulation (White Paper)","kind":"policy_statement","adoptedDate":"2023-03-29","effectiveDate":null,"sourceUrl":"https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach","sourceCitation":"CP 815 (2023)","lastReviewedAt":"2026-05-24","status":"in_force","notes":"Principles-based, regulator-led approach (no statutory AI law). Cross-sectoral principles delegated to existing regulators. 
AISI established Nov 2023 for evaluation/safety research.","conceptsUsed":["frontier-tier","asl-3","red-team-evaluation","scalable-oversight","policy-instrument"],"externalIdentifiers":{"iso_3166_alpha2":"GB"}},{"shortCode":"CN-GENAI-2023","jurisdiction":"CN","name":"Interim Measures for Generative AI Service Management","kind":"binding_regulation","adoptedDate":"2023-07-13","effectiveDate":"2023-08-15","sourceUrl":"https://www.cac.gov.cn/2023-07/13/c_1690898327029107.htm","sourceCitation":"CAC Order No. 15","lastReviewedAt":"2026-05-24","status":"in_force","notes":"Joint CAC/MIIT/MPS measures. Registration + safety assessment for public-facing generative AI. Aligns with Algorithm Recommendation Rules (2022) and Deep Synthesis Rules (2022).","conceptsUsed":["provenance-watermarking","policy-instrument","training-data-attribution","data-poisoning"],"externalIdentifiers":{"iso_3166_alpha2":"CN"}},{"shortCode":"G7-HIROSHIMA","jurisdiction":"G7","name":"G7 Hiroshima AI Process Code of Conduct","kind":"voluntary_code","adoptedDate":"2023-10-30","effectiveDate":"2023-10-30","sourceUrl":"https://www.mofa.go.jp/files/100573472.pdf","sourceCitation":"G7 Hiroshima AI Process, Oct 2023","lastReviewedAt":"2026-05-24","status":"in_force","notes":"Voluntary commitments by frontier AI developers. 
11-point code covering risk identification, deployment, content provenance, security investment, info sharing.","conceptsUsed":["frontier-tier","asl-3","systemic-risk","red-team-evaluation","model-card","provenance-watermarking","alignment","deceptive-alignment","capability-elicitation","dual-use-research-taxonomy","policy-instrument","agentic-system","jailbreak-resistance"]},{"shortCode":"OECD-AI-PRIN","jurisdiction":"OECD","name":"OECD AI Principles (Recommendation)","kind":"voluntary_code","adoptedDate":"2019-05-22","effectiveDate":"2019-05-22","sourceUrl":"https://oecd.ai/en/ai-principles","sourceCitation":"OECD/LEGAL/0449","lastReviewedAt":"2026-05-24","status":"in_force","notes":"First intergovernmental standard. Updated 2024 to clarify GPAI scope. Foundation referenced by G7, GPAI, and many national frameworks.","conceptsUsed":["model-card","policy-instrument","hallucination"]},{"shortCode":"COE-AI-CONV","jurisdiction":"council_of_europe","name":"Council of Europe Framework Convention on AI","kind":"international_treaty","adoptedDate":"2024-05-17","effectiveDate":null,"sourceUrl":"https://www.coe.int/en/web/artificial-intelligence/the-framework-convention-on-artificial-intelligence","sourceCitation":"CETS No. 225","status":"adopted_not_in_force","notes":"First legally-binding international treaty on AI. Opened for signature Sep 2024. Enters into force three months after five ratifications including three CoE members.","conceptsUsed":["systemic-risk","policy-instrument"]},{"shortCode":"UN-RES-2024","jurisdiction":"UN","name":"UN GA Resolution on Safe, Secure, Trustworthy AI","kind":"resolution","adoptedDate":"2024-03-21","effectiveDate":"2024-03-21","sourceUrl":"https://documents.un.org/doc/undoc/gen/n24/065/92/pdf/n2406592.pdf","sourceCitation":"A/RES/78/265","status":"in_force","notes":"Non-binding. Calls on member states to bridge digital divides and develop national strategies. 
China + US co-sponsored; passed by consensus.","conceptsUsed":["policy-instrument"]},{"shortCode":"NIST-AI-RMF","jurisdiction":"US","name":"NIST AI Risk Management Framework","kind":"technical_standard","adoptedDate":"2023-01-26","effectiveDate":"2023-01-26","sourceUrl":"https://www.nist.gov/itl/ai-risk-management-framework","sourceCitation":"NIST AI 100-1","lastReviewedAt":"2026-05-24","status":"in_force","notes":"Voluntary. Four functions (Govern / Map / Measure / Manage). GenAI Profile (NIST AI 600-1) added 2024 for GPAI-specific guidance.","conceptsUsed":["model-card","scalable-oversight","dual-use-research-taxonomy","policy-instrument"],"externalIdentifiers":{"wikidata_q_id":"Q117205559","ror_id":"https://ror.org/05xpvk416","iso_3166_alpha2":"US"}},{"shortCode":"BLETCHLEY-2023","jurisdiction":"global","name":"Bletchley Declaration on AI Safety","kind":"voluntary_code","adoptedDate":"2023-11-02","effectiveDate":"2023-11-02","sourceUrl":"https://www.gov.uk/government/publications/ai-safety-summit-2023-the-bletchley-declaration","sourceCitation":"Bletchley Declaration (UK AI Safety Summit, Nov 2023)","status":"in_force","notes":"First multilateral consensus on frontier-AI safety risks. 28 signatories including US, EU, China. Introduced the policy vocabulary of 'frontier AI' that later instruments adopted. Non-binding but precedent-setting; spawned the AI Safety Institute network.","externalIdentifiers":{"wikidata_q_id":"Q123544428"}},{"shortCode":"SEOUL-2024","jurisdiction":"global","name":"Seoul Declaration on Safe, Innovative and Inclusive AI","kind":"voluntary_code","adoptedDate":"2024-05-22","effectiveDate":"2024-05-22","sourceUrl":"https://www.gov.uk/government/publications/seoul-ministerial-declaration-on-safe-innovative-and-inclusive-ai","sourceCitation":"Seoul Declaration (AI Seoul Summit, May 2024)","status":"in_force","notes":"Bletchley follow-up. 16 frontier-AI-developer companies signed Frontier AI Safety Commitments alongside. 
Introduces measurable capability-evaluation expectations and pre-deployment thresholds; first instrument to formalise frontier-lab voluntary commitments as a governance category.","conceptsUsed":["agentic-system","inference-time-compute"]},{"shortCode":"NIST-AI-RMF-GENAI","jurisdiction":"US","name":"NIST AI RMF Generative AI Profile","kind":"technical_standard","adoptedDate":"2024-07-26","effectiveDate":"2024-07-26","sourceUrl":"https://www.nist.gov/publications/artificial-intelligence-risk-management-framework-generative-artificial-intelligence","sourceCitation":"NIST AI 600-1 (Jul 2024)","status":"in_force","notes":"Companion to NIST AI 100-1 covering GenAI-specific risks: CBRN information uplift, confabulation, data privacy, environmental impacts, harmful bias, dangerous information, IP misuse, obscene/abusive/violent content, information security, information integrity, human-AI configuration, value chain and component integration. Voluntary.","conceptsUsed":["ai-supply-chain","prompt-injection","agentic-system","tool-use-safety","multi-turn-evaluation","data-poisoning","jailbreak-resistance","hallucination","in-context-learning","retrieval-augmented-generation"]},{"shortCode":"CA-SB-1047","jurisdiction":"US","name":"California SB-1047: Safe and Secure Innovation for Frontier AI Models Act","kind":"binding_regulation","adoptedDate":"2024-09-29","effectiveDate":null,"sourceUrl":"https://leginfo.legislature.ca.gov/faces/billNavClient.xhtml?bill_id=202320240SB1047","sourceCitation":"Cal. SB-1047 (Wiener, 2024)","status":"proposed","notes":"First US state-level model-testing mandate. Passed CA legislature Sep 2024; vetoed by Gov. Newsom Sep 29, 2024. Re-introduction expected 2025-2026 with amendments. Would have required pre-deployment third-party testing for models above 10^26 FLOPs AND $100M+ training cost. 
Widely cited in 2024-2025 AI governance literature reviews as the most impactful US state intervention."},{"shortCode":"IN-DPDP-2023","jurisdiction":"IN","name":"India Digital Personal Data Protection Act + AI Advisory (MEITY)","kind":"binding_regulation","adoptedDate":"2023-08-11","effectiveDate":"2025-01-01","sourceUrl":"https://www.meity.gov.in/writereaddata/files/Digital%20Personal%20Data%20Protection%20Act%202023.pdf","sourceCitation":"Digital Personal Data Protection Act, 2023 + MEITY AI Advisories (2024)","status":"in_force","notes":"India's primary AI-adjacent statute is the DPDPA + MEITY's binding AI advisories (Mar 2024 + Apr 2024 walked-back versions). No dedicated AI law yet; the proposed Digital India Act was paused 2024-2025. Affects 1.4B people — the single largest population under any AI-governance regime tracked here.","externalIdentifiers":{"iso_3166_alpha2":"IN"}},{"shortCode":"BR-AIBILL-2024","jurisdiction":"BR","name":"Brazil AI Bill (PL 2338/2023)","kind":"binding_regulation","adoptedDate":null,"effectiveDate":null,"sourceUrl":"https://www25.senado.leg.br/web/atividade/materias/-/materia/157233","sourceCitation":"Senate Bill PL 2338/2023 (Brazil National Congress)","status":"proposed","notes":"Risk-based framework structurally similar to EU AIA but with distinct development-rights framing rooted in Brazil's Marco Civil tradition. Senate-approved Dec 2024; Chamber of Deputies vote pending 2025. Notable for explicit human-dignity + collective-rights provisions absent from EU AIA. 
Sets a precedent for Latin American AI regulation if enacted.","externalIdentifiers":{"iso_3166_alpha2":"BR"},"conceptsUsed":["training-data-attribution","hallucination"]},{"shortCode":"ASEAN-AI-GUIDE-2024","jurisdiction":"ASEAN","name":"ASEAN Guide on AI Governance and Ethics","kind":"voluntary_code","adoptedDate":"2024-02-02","effectiveDate":"2024-02-02","sourceUrl":"https://asean.org/wp-content/uploads/2024/02/ASEAN-Guide-on-AI-Governance-and-Ethics_beautified_201223_v2.pdf","sourceCitation":"ASEAN Digital Ministers Meeting (DGMIN), Feb 2024","status":"in_force","notes":"Non-binding voluntary guide for 10 ASEAN member states (Indonesia, Malaysia, Philippines, Singapore, Thailand, Vietnam, Myanmar, Cambodia, Laos, Brunei). Adopts a cross-cutting risk + values framework intentionally distinct from the EU AIA's prescriptive model — emphasises 'pragmatic + flexible' implementation reflecting member-state capacity diversity. Pairs with Singapore AI Verify Foundation's technical toolkit.","conceptsUsed":["ai-supply-chain"]},{"shortCode":"AU-AI-STRATEGY-2024","jurisdiction":"African_Union","name":"African Union Continental AI Strategy","kind":"policy_statement","adoptedDate":"2024-07-19","effectiveDate":"2024-07-19","sourceUrl":"https://au.int/en/documents/20240719/continental-artificial-intelligence-strategy","sourceCitation":"AU Continental AI Strategy (Executive Council 45th Ordinary Session)","status":"in_force","notes":"Continental-level non-binding strategy for 55 AU member states. Frames AI through development-rights / digital-sovereignty / capacity-building lens. Explicitly references unequal compute access + dataset coloniality as governance concerns absent from OECD-bloc instruments. 
Operationalisation via national strategies (e.g., Egypt 2030, Kenya AI Roadmap, South Africa NAIPF)."},{"shortCode":"ANTHROPIC-RSP-2024","jurisdiction":"US","name":"Anthropic Responsible Scaling Policy (RSP) v2","kind":"voluntary_code","adoptedDate":"2024-10-15","effectiveDate":"2024-10-15","sourceUrl":"https://www.anthropic.com/news/announcing-our-updated-responsible-scaling-policy","sourceCitation":"Anthropic Responsible Scaling Policy v2 (Oct 2024)","status":"in_force","notes":"First-mover industry safety framework. Introduces the AI Safety Level (ASL) capability-tier vocabulary subsequently adapted by OpenAI Preparedness + DeepMind FSF. v2 (Oct 2024) refines ASL-3/ASL-4 capability thresholds, mandates pre-deployment capability evaluations, and commits to a Frontier Red Team. Seoul Frontier AI Safety Commitments signatory; cited by name in EU AI Office GPAI Code of Practice drafts.","conceptsUsed":["frontier-tier","asl-3","red-team-evaluation","alignment","deceptive-alignment","capability-elicitation","scalable-oversight","dual-use-research-taxonomy"],"externalIdentifiers":{"iso_3166_alpha2":"US"}},{"shortCode":"OPENAI-PREPAREDNESS-2023","jurisdiction":"US","name":"OpenAI Preparedness Framework","kind":"voluntary_code","adoptedDate":"2023-12-18","effectiveDate":"2023-12-18","sourceUrl":"https://openai.com/safety/preparedness","sourceCitation":"OpenAI Preparedness Framework (Dec 2023)","status":"in_force","notes":"Capability-tier risk evaluation regime with four categorical levels (Low / Medium / High / Critical) across four risk categories (cybersecurity, CBRN, persuasion, model autonomy). Pre-deployment evaluation against the framework gates release decisions; Safety Advisory Group + board-level Safety & Security Committee govern threshold determinations. 
Seoul Frontier AI Safety Commitments signatory.","conceptsUsed":["frontier-tier","red-team-evaluation","alignment","capability-elicitation","dual-use-research-taxonomy"],"externalIdentifiers":{"iso_3166_alpha2":"US"}},{"shortCode":"DEEPMIND-FSF-2024","jurisdiction":"US","name":"Google DeepMind Frontier Safety Framework","kind":"voluntary_code","adoptedDate":"2024-05-17","effectiveDate":"2024-05-17","sourceUrl":"https://deepmind.google/discover/blog/introducing-the-frontier-safety-framework/","sourceCitation":"Google DeepMind Frontier Safety Framework (May 2024)","status":"in_force","notes":"Critical Capability Levels (CCL) regime spanning autonomy, biosecurity, cybersecurity, and persuasion domains. Distinct vocabulary from Anthropic ASL + OpenAI Preparedness — designed for cross-domain elicitation; each CCL triggers domain-specific mitigations including model-weight access controls + enhanced red-teaming. Seoul Frontier AI Safety Commitments signatory. Alphabet-published; effective across Google DeepMind frontier-model releases.","conceptsUsed":["frontier-tier","red-team-evaluation","alignment","capability-elicitation","dual-use-research-taxonomy"],"externalIdentifiers":{"iso_3166_alpha2":"US"}},{"shortCode":"META-FRONTIER-2024","jurisdiction":"US","name":"Meta Frontier AI Framework","kind":"voluntary_code","adoptedDate":"2024-02-02","effectiveDate":"2024-02-02","sourceUrl":"https://ai.meta.com/responsible-ai/","sourceCitation":"Meta Frontier AI Framework (Feb 2024)","status":"in_force","notes":"Meta's open-weight-frontier governance posture. Categorises frontier models into 'high risk' + 'critical risk' tiers; the framework's distinctive feature is its explicit defence of open-weight release as a governance posture (vs. the closed-model stance of Anthropic / OpenAI / DeepMind). Pre-release threat modelling + post-release monitoring; commits to halt training if critical-risk threshold reached without mitigations. 
Seoul Frontier AI Safety Commitments signatory.","conceptsUsed":["frontier-tier","red-team-evaluation","dual-use-research-taxonomy","capability-elicitation"],"externalIdentifiers":{"iso_3166_alpha2":"US"}},{"shortCode":"UK-US-AISI-MOU-2024","jurisdiction":"global","name":"UK-US AI Safety Institute Memorandum of Understanding","kind":"international_treaty","adoptedDate":"2024-04-01","effectiveDate":"2024-04-01","sourceUrl":"https://www.gov.uk/government/publications/memorandum-of-understanding-between-the-ai-safety-institutes-of-the-united-kingdom-and-the-united-states","sourceCitation":"UK-US AISI MoU (Apr 2024)","status":"in_force","notes":"First binding bilateral on frontier-AI safety. Commits both AISIs to coordinated pre-deployment evaluations, red-team data sharing, methodological alignment on capability elicitation, and joint exercises across at least one major frontier-model release. Precedent for the broader AISI network (US, UK, JP, SG, CA, FR, KR) consolidated at the Seoul Summit; cited in Seoul Declaration §5-7 operationalising international coordination.","conceptsUsed":["frontier-tier","red-team-evaluation","capability-elicitation"]},{"shortCode":"WH-VOLUNTARY-2023","jurisdiction":"US","name":"White House Voluntary AI Commitments","kind":"voluntary_code","adoptedDate":"2023-07-21","effectiveDate":"2023-07-21","sourceUrl":"https://www.whitehouse.gov/briefing-room/statements-releases/2023/07/21/fact-sheet-biden-harris-administration-secures-voluntary-commitments-from-leading-artificial-intelligence-companies-to-manage-the-risks-posed-by-ai/","sourceCitation":"White House Voluntary AI Commitments (Jul 2023; second tranche Sep 2023)","status":"in_force","notes":"First broad-spectrum US industry commitments; precursor to EO 14110 §4.2(a) reporting + the Seoul Frontier AI Safety Commitments. 
15 signatories across two tranches (Jul + Sep 2023): Anthropic, OpenAI, Google DeepMind, Microsoft, Meta, Inflection, Amazon (Jul); Adobe, Cohere, IBM, Nvidia, Palantir, Salesforce, Scale AI, Stability AI (Sep). Eight commitment areas: internal + external security testing, info sharing, cybersecurity investment, third-party vuln disclosure, watermarking, public reporting, prioritising research on societal risks, deploying AI to address societal challenges.","conceptsUsed":["frontier-tier","red-team-evaluation","provenance-watermarking","dual-use-research-taxonomy"],"externalIdentifiers":{"iso_3166_alpha2":"US"}},{"shortCode":"SG-MODEL-AI-2024","jurisdiction":"SG","name":"Singapore Model AI Governance Framework for Generative AI","kind":"voluntary_code","adoptedDate":"2024-05-30","effectiveDate":"2024-05-30","sourceUrl":"https://aiverifyfoundation.sg/downloads/Model_AI_Governance_Framework_for_Generative_AI.pdf","sourceCitation":"Singapore Model AI Governance Framework for Generative AI (May 2024)","status":"in_force","notes":"Update to the 2020 Model AI Governance Framework (v2), expanding scope to generative AI. Nine dimensions: accountability, data, trusted development + deployment, incident reporting, testing + assurance, security, content provenance, safety + alignment R&D, AI for public good. Pairs with the AI Verify Foundation's open-source technical-testing toolkit. 
Voluntary; cited as the ASEAN-aligned reference for technically-grounded governance and influential beyond ASEAN-10.","conceptsUsed":["frontier-tier","model-card","provenance-watermarking","red-team-evaluation","alignment"],"externalIdentifiers":{"iso_3166_alpha2":"SG"}},{"shortCode":"JP-METI-AI-2024","jurisdiction":"JP","name":"Japan METI AI Guidelines for Business","kind":"voluntary_code","adoptedDate":"2024-04-19","effectiveDate":"2024-04-19","sourceUrl":"https://www.meti.go.jp/english/press/2024/0419_002.html","sourceCitation":"METI/MIC AI Guidelines for Business v1.0 (Apr 2024)","status":"in_force","notes":"Joint METI + MIC issuance consolidating prior AI Utilization Guidelines (2019) + AI R&D Principles (2017) into a single business-facing framework. Voluntary; explicitly aligned with G7 Hiroshima AI Process Code of Conduct + OECD AI Principles. Ten core principles spanning fair competition, accountability, transparency, education, AI safety. Companion to the Hiroshima AI Process Reporting Framework, which Japan operationalises; reflects Japan's preferred soft-law posture vs. the EU AIA's prescriptive model.","conceptsUsed":["frontier-tier","model-card","red-team-evaluation"],"externalIdentifiers":{"iso_3166_alpha2":"JP"}}],"topics":[{"code":"foundation_models","kind":"capability","label":"Foundation Models / GPAI","description":"Obligations specific to general-purpose / foundation models above certain capability thresholds.","empiricalConsensus":"contested","contestedQuestion":"Does the foundation-model category map to a coherent capability tier, or is it a regulatory convenience? 
Compute-threshold vs behavioural-threshold debate is unresolved across EU/US/China.","lastReviewedAt":"2026-05-24"},{"code":"biometric_id","kind":"capability","label":"Biometric Identification","description":"Real-time and post-hoc biometric identification in public spaces.","empiricalConsensus":"settled"},{"code":"deepfakes","kind":"capability","label":"Deepfakes / Synthetic Content","description":"AI-generated content disclosure, watermarking, election integrity protections.","empiricalConsensus":"contested","contestedQuestion":"Is robust watermarking durable under adversarial removal at deployment scale? Field is split on technical feasibility despite policy convergence on the requirement.","lastReviewedAt":"2026-05-24"},{"code":"employment","kind":"sector","label":"AI in Employment","description":"Hiring, workplace monitoring, automated decisions in employment contexts.","empiricalConsensus":"settled"},{"code":"healthcare","kind":"sector","label":"AI in Healthcare","description":"Clinical decision support, medical devices, diagnostic AI.","empiricalConsensus":"settled"},{"code":"criminal_justice","kind":"sector","label":"AI in Criminal Justice","description":"Predictive policing, risk assessment, sentencing assistance.","empiricalConsensus":"contested","contestedQuestion":"Does algorithmic risk-assessment reduce or reproduce racial disparities? Empirical literature (ProPublica COMPAS critique vs. 
industry replication) is unresolved."},{"code":"education","kind":"sector","label":"AI in Education","description":"Automated grading, proctoring, student-data analytics.","empiricalConsensus":"settled"},{"code":"compute_reporting","kind":"procedural","label":"Compute-Threshold Reporting","description":"Mandatory reporting based on training-compute or capability thresholds.","empiricalConsensus":"contested","contestedQuestion":"Are compute thresholds (10²⁵ FLOPs EU, 10²⁶ FLOPs US) a defensible proxy for governance-relevant capability, given algorithmic-efficiency improvements? Field is split."},{"code":"transparency","kind":"procedural","label":"Transparency Obligations","description":"Disclosure of training data, model cards, system-card requirements.","empiricalConsensus":"contested","contestedQuestion":"Does transparency disclosure (model cards, training-data summaries) actually reduce bias / misuse / accidents? Selbst & Barocas (2019) argue disclosure ≠ fairness; regulators assume it helps."},{"code":"redress","kind":"procedural","label":"Individual Redress","description":"Right to explanation, appeal mechanisms, complaint channels.","empiricalConsensus":"settled"},{"code":"training_data","kind":"procedural","label":"Training-Data Rights","description":"Copyright, consent, text-and-data-mining exceptions.","empiricalConsensus":"contested","contestedQuestion":"Does the EU CDSM Directive's TDM-exemption cover commercial foundation-model training? Major active litigation (NYT v OpenAI, Getty v Stability) and parallel claim regimes in UK/JP/US."},{"code":"sovereign_ai","kind":"political_frame","label":"Sovereign AI Doctrine","description":"Domestic-compute, export controls, jurisdiction-bound model deployment.","empiricalConsensus":"emerging","contestedQuestion":"Is jurisdiction-bound model deployment technically feasible at frontier scale? 
Field literature is sparse; doctrine is post-2023 and largely aspirational."},{"code":"catastrophic_risk","kind":"capability","label":"Catastrophic & Existential Risk","description":"Governance of model capabilities that could cause mass casualties or civilisational-scale harms (CBRN uplift, autonomous replication, deceptive alignment). Distinct from EU AIA 'systemic risk' which targets market-scale rather than catastrophic-scale harms.","empiricalConsensus":"contested","contestedQuestion":"Are current frontier-model capabilities a meaningful contribution to catastrophic-risk probability? Field is split between catastrophic-risk-as-imminent (FLI, CAIS) and catastrophic-risk-as-speculative (Pope et al., Andersson) positions."},{"code":"tech_sovereignty","kind":"political_frame","label":"Technological Sovereignty","description":"National policies asserting domestic capability + decision-making over AI infrastructure: compute on shore, domestic foundation models, talent retention, export-control reciprocity. Specifically NOT 'sovereign AI' (which focuses on deployment restrictions) — sovereignty here is about productive capacity.","empiricalConsensus":"emerging","contestedQuestion":"Can mid-sized economies sustain frontier-tier AI capability domestically, or does the compute-cost curve favour US/CN/EU only? Active debate in India, Brazil, ASEAN policy literatures."},{"code":"development_rights_framing","kind":"political_frame","label":"Development-Rights Framings","description":"Governance approaches grounded in development-rights / digital-self-determination / Global-South-sovereignty arguments rather than EU/US risk-based framings. 
Loudest in Brazil, India, ASEAN, African Union policy discourse.","empiricalConsensus":"emerging","contestedQuestion":"Is development-rights framing compatible with the EU AIA's rights-based framing, or do they conflict on operational decisions (e.g., who can deploy frontier models in developing economies)?"},{"code":"international_coordination","kind":"meta","label":"International Coordination","description":"The substantive governance work happening at, between, and around multilateral fora: treaty negotiations, AI Safety Institute network MoUs, forum-shifting between G7 / G20 / OECD / UN, regulatory arbitrage. Distinct from any specific instrument; this is the meta-domain of how governance moves.","empiricalConsensus":"emerging","contestedQuestion":"Will AI-governance coordination converge on the OECD / UN / G7 / GPAI / bilateral-MoU mode, or fragment into bloc-based regimes (US-led / EU-led / China-led)? Field consensus is forming but unsettled."},{"code":"agentic_systems_governance","kind":"capability","label":"Agentic AI Governance","description":"Obligations specific to AI systems that take autonomous multi-step actions (browse, transact, plan, recurse). Distinct from foundation_models (capability) and catastrophic_risk (outcome) — this is the action-surface frame. Surfaces in EU AI Office GPAI Code drafts, UK AISI agent evaluations, Seoul Frontier AI Safety Commitments §3, NIST AI 600-1.","empiricalConsensus":"emerging","contestedQuestion":"Should governance attach to the AGENT (multi-step actions, tool use, recursion) or to the model that powers it? Capability-tier vs action-tier frames are unresolved across jurisdictions.","lastReviewedAt":"2026-05-26"},{"code":"open_weight_release","kind":"procedural","label":"Open-Weight Frontier Release","description":"Governance posture toward releasing frontier model weights publicly (Meta Llama, Mistral, DeepSeek vs. closed-weight Anthropic / OpenAI / DeepMind). EU AIA Recital 102 + Art. 
53(2) carve-outs; CA SB-1047's failed framework; Meta Frontier AI Framework's explicit defence; emerging US export-control overlay.","empiricalConsensus":"contested","contestedQuestion":"Should frontier weight-release be governed by capability-tier (block above threshold) or by safety-evaluation-evidence (allow with pre-release red-team) or by recipient-restriction (export controls)? Three distinct frames currently in active conflict.","lastReviewedAt":"2026-05-26"},{"code":"synthetic_content_provenance","kind":"procedural","label":"Synthetic Content Provenance","description":"Labelling, watermarking, and machine-readable provenance for AI-generated audio / video / text. Distinct from `deepfakes` (which centres on misuse harms) — this is the upstream infrastructure layer. EU AIA Art. 50, China GenAI Measures Art. 13 (mandatory tagging), NIST AI 600-1, G7 Hiroshima Code commitment 6, C2PA standard adoption.","empiricalConsensus":"contested","contestedQuestion":"Should provenance be a model-provider obligation (watermark at generation), a platform obligation (label at distribution), or a recipient right (declare on request)? Each jurisdiction is currently selecting a different burden allocation.","lastReviewedAt":"2026-05-26"}],"benchmarks":[{"shortCode":"SWE-BENCH-VER","name":"SWE-bench Verified","domain":"agentic","measures":"Solve real-world GitHub issues from 12 popular Python repos. The 'Verified' subset is human-validated to remove ambiguity and have working tests.","scoreRange":{"min":0,"max":100,"unit":"% solved"},"methodologyUrl":"https://openai.com/index/introducing-swe-bench-verified/","publishedYear":2024,"contaminationRisk":"medium","notes":"500-task verified subset. 
Run-time evaluation; can't be gamed by pure memorisation but agent harness affects results.","saturationStatus":"active"},{"shortCode":"MMLU","name":"MMLU","domain":"general_reasoning","measures":"Massive Multitask Language Understanding — 57-subject multiple-choice covering humanities, STEM, social sciences, professional/legal.","scoreRange":{"min":0,"max":100,"unit":"% accuracy"},"methodologyUrl":"https://arxiv.org/abs/2009.03300","publishedYear":2020,"contaminationRisk":"high","notes":"Saturating — top models ~92%. Test-set leakage to training corpora is widely documented. MMLU-Pro is the harder successor.","saturationStatus":"saturated","successorBenchmarkCode":"MMLU-PRO"},{"shortCode":"MMLU-PRO","name":"MMLU-Pro","domain":"general_reasoning","measures":"Successor to MMLU with 10-option multiple-choice (up from 4), more reasoning-focused tasks, and removed leaky / ambiguous items.","scoreRange":{"min":0,"max":100,"unit":"% accuracy"},"methodologyUrl":"https://arxiv.org/abs/2406.01574","publishedYear":2024,"contaminationRisk":"medium","notes":"Less saturated than MMLU. Frontier models ~70-80%.","saturationStatus":"saturating","successorBenchmarkCode":"HLE"},{"shortCode":"GPQA-DIAMOND","name":"GPQA Diamond","domain":"general_reasoning","measures":"Graduate-level Google-Proof Q&A in biology, chemistry, physics. 'Diamond' subset is the 198 hardest items.","scoreRange":{"min":0,"max":100,"unit":"% accuracy"},"methodologyUrl":"https://arxiv.org/abs/2311.12022","publishedYear":2023,"contaminationRisk":"low","notes":"Designed to be Google-proof — questions where domain PhD students score ~65% but non-expert searchers ~34%.","saturationStatus":"saturating","successorBenchmarkCode":"HLE"},{"shortCode":"ARC-AGI-V2","name":"ARC-AGI v2","domain":"general_reasoning","measures":"Abstract reasoning over visual grids. 
Each task requires inferring the transformation rule from 2-3 examples.","scoreRange":{"min":0,"max":100,"unit":"% solved"},"methodologyUrl":"https://arcprize.org/","publishedYear":2024,"contaminationRisk":"low","notes":"v2 launched 2024-12 with harder tasks designed to remain unsolvable by pure pattern matching. $1M public prize for >85% on private set.","saturationStatus":"active"},{"shortCode":"HUMANEVAL","name":"HumanEval","domain":"code","measures":"164 hand-written Python programming problems. Generate a function that passes provided unit tests.","scoreRange":{"min":0,"max":100,"unit":"pass@1 %"},"methodologyUrl":"https://arxiv.org/abs/2107.03374","publishedYear":2021,"contaminationRisk":"high","notes":"Saturated — top models ~95%. Largely superseded by SWE-bench for real-world relevance.","saturationStatus":"deprecated","successorBenchmarkCode":"SWE-BENCH-VER"},{"shortCode":"MATH","name":"MATH (Hendrycks)","domain":"math","measures":"12,500 competition-math problems from AMC, AIME, etc. Evaluates step-by-step reasoning + final-answer accuracy.","scoreRange":{"min":0,"max":100,"unit":"% accuracy"},"methodologyUrl":"https://arxiv.org/abs/2103.03874","publishedYear":2021,"contaminationRisk":"medium","notes":"Frontier reasoning models 90%+. AIME-2024 is the harder successor for unsaturated math eval.","saturationStatus":"saturated","successorBenchmarkCode":"AIME-2024"},{"shortCode":"AIME-2024","name":"AIME 2024","domain":"math","measures":"30 problems from the 2024 American Invitational Mathematics Examination — high-school competition math.","scoreRange":{"min":0,"max":100,"unit":"% accuracy"},"methodologyUrl":"https://www.maa.org/math-competitions/american-invitational-mathematics-examination-aime","publishedYear":2024,"contaminationRisk":"low","notes":"Released after most current models' training cutoffs. 
Top reasoning models 75-90%; non-reasoning 10-30%.","saturationStatus":"saturating","successorBenchmarkCode":"FRONTIER-MATH"},{"shortCode":"HLE","name":"Humanity's Last Exam","domain":"knowledge","measures":"3,000+ frontier-difficulty expert-curated questions across all academic disciplines. Designed to remain unsaturated through 2026+.","scoreRange":{"min":0,"max":100,"unit":"% accuracy"},"methodologyUrl":"https://lastexam.ai/","publishedYear":2025,"contaminationRisk":"low","notes":"Center for AI Safety + Scale AI collaboration. Frontier models 8-22% at launch. Replaces MMLU as the de-facto knowledge ceiling.","saturationStatus":"active"},{"shortCode":"FRONTIER-MATH","name":"FrontierMath","domain":"math","measures":"Hundreds of original research-mathematician-curated math problems requiring deep reasoning. Held-out evaluation only.","scoreRange":{"min":0,"max":100,"unit":"% accuracy"},"methodologyUrl":"https://epochai.org/frontiermath","publishedYear":2024,"contaminationRisk":"low","notes":"Epoch AI eval. Top reasoning models 2-5% at launch; OpenAI o3-preview reported 25% under custom harness.","saturationStatus":"active"}],"concepts":[{"code":"frontier-tier","label":"Frontier-Tier AI","domain":"risk_class","definition":"A categorical classification of AI models above certain capability or compute thresholds, indicating heightened regulatory scrutiny.","scope":"Frontier-tier classification varies by jurisdiction. The EU AI Act presumes 'systemic risk' at ≥10²⁵ FLOPs training compute OR ≥45M EU monthly active users. The US EO 14110 used 10²⁶ FLOPs as the reporting trigger. Industry frameworks (Anthropic ASL, OpenAI Preparedness, DeepMind FSF) use capability-based rather than pure-compute frontier markers. 
The term 'frontier' has no single canonical definition; it is operationalized differently across regulators and developers.","usedByInstruments":["EU-AIA-2024","US-EO-14110","UK-WHITEPAPER-2023","G7-HIROSHIMA","ANTHROPIC-RSP-2024","OPENAI-PREPAREDNESS-2023","DEEPMIND-FSF-2024","META-FRONTIER-2024","UK-US-AISI-MOU-2024","WH-VOLUNTARY-2023","SG-MODEL-AI-2024","JP-METI-AI-2024"],"relatedConcepts":["asl-3","systemic-risk","designated-systemic","compute-threshold"],"relatedTopics":["foundation_models","compute_reporting"],"sourceUrl":"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A32024R1689","sourceCitation":"EU AI Act Art. 51 + Annex XIII (the closest binding definition)","empiricalConsensus":"contested","contestedQuestion":"Does 'frontier' have a coherent definition across regulators + industry, or is it a contextual term whose meaning shifts with jurisdiction? Compute-threshold (EU/US) vs behavioural-tier (Anthropic/OpenAI/DeepMind) split is unresolved.","notes":"When a wiki article references 'frontier' without jurisdictional qualifier, defer to the EU AIA Art. 51 definition as the most widely cited binding text."},{"code":"asl-3","label":"AI Safety Level 3 (ASL-3)","domain":"safety","definition":"A capability-based risk tier in Anthropic's Responsible Scaling Policy denoting models with the potential to substantially uplift CBRN attack capabilities or autonomous AI replication.","scope":"ASL-3 was introduced in Anthropic's Responsible Scaling Policy (RSP) framework. Triggering ASL-3 capability requires the model to demonstrate substantial uplift in chemical, biological, radiological, or nuclear (CBRN) weapons design beyond baseline internet resources, OR show signs of autonomous self-replication. ASL-3 status mandates specific deployment safeguards including red-team evaluations, restricted API access, and incident-response protocols. 
Comparable tiers exist in OpenAI's Preparedness Framework (high) and DeepMind's Frontier Safety Framework (Critical Capability Levels).","usedByInstruments":["G7-HIROSHIMA","UK-WHITEPAPER-2023","ANTHROPIC-RSP-2024"],"relatedConcepts":["frontier-tier","systemic-risk","compute-threshold"],"relatedTopics":["foundation_models","deepfakes"],"sourceUrl":"https://www.anthropic.com/news/anthropics-responsible-scaling-policy","sourceCitation":"Anthropic Responsible Scaling Policy v1.x","empiricalConsensus":"settled","notes":"ASL-3 is a vendor-specific term; comparable but not interchangeable with EU AIA 'systemic risk' or OpenAI 'high' capability rating. Wiki articles citing ASL-3 should preserve the original-framework name when comparing across vendors."},{"code":"systemic-risk","label":"Systemic Risk (AI)","domain":"risk_class","definition":"A regulatory designation indicating that a general-purpose AI model poses risks of significant scale or scope across the EU internal market, triggering Article 55 obligations under the EU AI Act.","scope":"Article 51 of the EU AI Act establishes that a general-purpose AI (GPAI) model has systemic risk when its capabilities equal or exceed those of the most advanced models, evaluated via Annex XIII criteria. Presumption thresholds: ≥10²⁵ FLOPs training compute OR ≥45M EU monthly active users OR designation by the AI Office based on capability indicators. Designation triggers Article 55 obligations: model evaluation including adversarial testing, systemic risk assessment, incident reporting, cybersecurity protection, and energy reporting.","usedByInstruments":["EU-AIA-2024","G7-HIROSHIMA","COE-AI-CONV"],"relatedConcepts":["frontier-tier","asl-3","designated-systemic","compute-threshold"],"relatedTopics":["foundation_models","compute_reporting","redress"],"sourceUrl":"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A32024R1689","sourceCitation":"Regulation (EU) 2024/1689, Arts. 
51-55","empiricalConsensus":"contested","contestedQuestion":"EU AIA's systemic-risk thresholds presume that training compute of ≥10²⁵ FLOPs OR a user base of ≥45M EU MAU correlates with systemic risk. The field is divided on whether either correlation is empirically validated; the catastrophic-risk literature uses a stricter definition (CBRN uplift, autonomous replication) that the EU AIA does not directly target.","notes":"'Systemic risk' under the EU AIA is distinct from financial-system 'systemic risk' (SIFI/G-SIB regimes). Wiki articles in AI contexts default to the EU AIA usage."},{"code":"designated-systemic","label":"Designated Systemic-Risk Model","domain":"risk_class","definition":"A general-purpose AI model that has been formally designated by the EU AI Office under Article 51(1)(b) as posing systemic risk, regardless of whether it meets the presumption thresholds.","scope":"Designation is the formal regulatory act by which a GPAI model becomes subject to Article 55 obligations. Two paths: (1) presumption — automatic when training compute ≥10²⁵ FLOPs OR EU MAU ≥45M; or (2) explicit designation by the AI Office based on Annex XIII capability indicators. Once designated, the model is listed on a public register; its provider must comply with Art. 55 within prescribed timelines. Designation can be challenged but the burden is on the provider to show non-systemic status.","usedByInstruments":["EU-AIA-2024"],"relatedConcepts":["systemic-risk","frontier-tier","compute-threshold"],"relatedTopics":["foundation_models","compute_reporting","transparency"],"sourceUrl":"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A32024R1689","sourceCitation":"Regulation (EU) 2024/1689, Art. 51(1)(b) + Annex XIII","empiricalConsensus":"settled","notes":"As of the catalog refresh date, no GPAI model has been publicly designated under the explicit pathway; all systemic-risk models so far have been classified via the presumption thresholds. 
Track future designations via the AI Office register."},{"code":"compute-threshold","label":"Compute Threshold (AI Governance)","domain":"compute","definition":"A regulatory trigger expressed as floating-point operations (FLOPs) consumed during model training, above which specific reporting, evaluation, or governance obligations attach.","scope":"Compute thresholds operationalize the intuition that capability scales (imperfectly) with training compute. Jurisdictions have adopted different thresholds: US EO 14110 used 10²⁶ FLOPs for foundation-model reporting; EU AI Act Art. 51 uses 10²⁵ FLOPs as the systemic-risk presumption; China's GenAI Measures use no compute threshold (registration triggered by public-facing deployment instead); UK AISI commitments are voluntary and capability-based rather than compute-thresholded. Critics note that thresholds become outdated as algorithmic efficiency improves and that compute alone is an imperfect capability proxy.","usedByInstruments":["EU-AIA-2024","US-EO-14110"],"relatedConcepts":["frontier-tier","systemic-risk","designated-systemic"],"relatedTopics":["foundation_models","compute_reporting","sovereign_ai"],"sourceUrl":"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A32024R1689","sourceCitation":"Regulation (EU) 2024/1689, Art. 51(2) + Annex XIII pt. (a)","empiricalConsensus":"contested","contestedQuestion":"Is compute-thresholding a defensible proxy for governance-relevant capability? Algorithmic-efficiency improvements (DeepSeek R1 demonstrating frontier-tier reasoning below 10²⁵ FLOPs) destabilize the threshold; field is split on whether compute thresholds should be indexed to efficiency, replaced by behavioural evaluation, or kept fixed for predictability.","notes":"When citing a specific FLOP threshold, always pair it with the jurisdiction and instrument. 
'10²⁵ FLOPs' is meaningful only under EU AIA; the same number has different implications in other regimes."},{"code":"red-team-evaluation","label":"Red-Team Evaluation","domain":"safety","definition":"Structured adversarial probing of an AI model's capabilities and behaviour before deployment, designed to elicit failures that ordinary evaluation would miss.","scope":"Red-team evaluation originated in cybersecurity (penetration testing) and was adapted to AI by the 2023 DEF CON Generative Red Team event and codified in the 2023 White House voluntary commitments. EU AI Act Art. 55(1)(a) requires adversarial testing for general-purpose AI models with systemic risk. US EO 14110 §4.2(a)(i) required reporting of red-team results for foundation models above the compute threshold (rescinded under EO 14179). G7 Hiroshima Code §1 calls for 'adversarial testing prior to and throughout deployment.' Anthropic, OpenAI, and Google DeepMind each maintain internal red-team programs with public methodology disclosures.\n\nGovernance disputes centre on: (1) WHO must red-team (provider, independent third-party, government); (2) WHAT capabilities are in scope (CBRN uplift, autonomous replication, election manipulation, etc.); (3) WHO sees the results (provider only, regulator under confidentiality, public); (4) WHAT triggers re-evaluation after deployment.","usedByInstruments":["EU-AIA-2024","US-EO-14110","G7-HIROSHIMA","UK-WHITEPAPER-2023","ANTHROPIC-RSP-2024","OPENAI-PREPAREDNESS-2023","DEEPMIND-FSF-2024","META-FRONTIER-2024","UK-US-AISI-MOU-2024","WH-VOLUNTARY-2023","SG-MODEL-AI-2024","JP-METI-AI-2024"],"relatedConcepts":["frontier-tier","asl-3","systemic-risk","designated-systemic"],"relatedTopics":["foundation_models","deepfakes","compute_reporting"],"sourceUrl":"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A32024R1689","sourceCitation":"EU AI Act Art. 
55(1)(a) — the most binding articulation","empiricalConsensus":"contested","contestedQuestion":"WHO must red-team (provider, independent third-party, regulator), WHAT capabilities are in scope (CBRN uplift, autonomous replication, election manipulation), and WHO sees the results (provider only, regulator under confidentiality, public)? Field convergence post-Seoul 2024 is slow.","notes":"Distinguish from 'evaluation' (general benchmark-style measurement) and 'audit' (post-hoc third-party review). Red-teaming is specifically pre-deployment + adversarial-intent."},{"code":"model-card","label":"Model Card","domain":"policy_instrument","definition":"A standardized disclosure document accompanying an AI model that describes its intended use, training data, evaluation results, limitations, and known failure modes.","scope":"Model cards originated in Mitchell et al. (2019) 'Model Cards for Model Reporting' (FAccT). The pattern was adopted by Hugging Face Hub (default model template), Google PAIR, and Microsoft Responsible AI. EU AI Act Art. 53 codifies model-card-style disclosures for general-purpose AI models — providers must document training-data summary, capabilities, limitations, intended use, and evaluation methodology. NIST AI RMF (Govern 1.3, Map 5.1) cites model cards as a transparency mechanism. ISO/IEC 23894 (AI risk management) endorses analogous documentation.\n\nDistinguish from: (a) 'system card' — wraps a model card with deployment-context information (OpenAI uses this term for GPT-4 family); (b) 'data sheet' — Gebru et al. 2018, focuses on training datasets rather than models; (c) 'fact sheet' — IBM's term for similar disclosure. Model cards remain voluntary in most jurisdictions; the EU AIA Art. 
53 disclosure is the first binding equivalent.","usedByInstruments":["EU-AIA-2024","NIST-AI-RMF","G7-HIROSHIMA","OECD-AI-PRIN","SG-MODEL-AI-2024","JP-METI-AI-2024"],"relatedConcepts":["frontier-tier","systemic-risk","red-team-evaluation"],"relatedTopics":["transparency","foundation_models","redress"],"sourceUrl":"https://arxiv.org/abs/1810.03993","sourceCitation":"Mitchell et al. (2019), 'Model Cards for Model Reporting,' FAccT '19","empiricalConsensus":"settled","notes":"When comparing model cards across providers, normalize for completeness: cards may omit training-compute, dataset composition, or evaluation methodology under trade-secret claims. EU AIA Art. 53 carves out trade-secret exemptions narrowly."},{"code":"alignment","label":"AI Alignment","domain":"safety","definition":"The technical problem of designing AI systems whose objectives, behaviour, and emergent goals reliably track the values or instructions of their principals across deployment contexts.","scope":"Alignment, in the technical sense, is distinct from regulatory 'compliance' or 'safety.' It asks: even if a model is capable and even if it is supervised, does it pursue what its principal actually wants — or does it pursue a proxy objective that diverges in edge cases? The problem decomposes into outer alignment (specifying what we want the model to do — see Krakovna et al.'s 'specification gaming' literature) and inner alignment (whether the model trained on that specification actually internalised it — see Hubinger et al. 2019 on mesa-optimisation).\n\nGovernance instruments rarely use the word 'alignment' directly. EU AIA Art. 51-55 obligations approximate alignment concerns by mandating systemic-risk assessment + adversarial testing + cybersecurity protection, but do not require demonstrated alignment of model objectives. US EO 14110 §4.2(a) mandated reporting on alignment-relevant capabilities (red-team results) without defining 'alignment.' 
Anthropic, OpenAI, and DeepMind publish their own alignment research agendas; these are de facto cited in policy debates but absent from binding text. The field treats alignment as a research problem first and a governance object only secondarily.","usedByInstruments":["EU-AIA-2024","US-EO-14110","G7-HIROSHIMA","ANTHROPIC-RSP-2024","OPENAI-PREPAREDNESS-2023","DEEPMIND-FSF-2024","SG-MODEL-AI-2024"],"relatedConcepts":["deceptive-alignment","mesa-optimization","scalable-oversight","capability-elicitation","red-team-evaluation"],"relatedTopics":["foundation_models","compute_reporting","transparency"],"sourceUrl":"https://intelligence.org/files/AIPosNegFactor.pdf","sourceCitation":"Yudkowsky, E. (2008), 'Artificial Intelligence as a Positive and Negative Factor in Global Risk' — the field-foundational articulation of the alignment problem.","empiricalConsensus":"contested","contestedQuestion":"Is the inner-outer alignment decomposition the right frame, or does it presume capabilities (long-horizon planning, model self-awareness) frontier LLMs do not yet have? Pope et al. (2023) vs. Hubinger lineage.","notes":"Wiki articles referring to 'alignment' in a regulatory context should pair the technical sense with the specific regulator's adjacent vocabulary (EU AIA: 'systemic risk assessment'; US EO 14110: 'safety evaluations'). The technical-alignment literature predates and exceeds the regulatory framings."},{"code":"deceptive-alignment","label":"Deceptive Alignment","domain":"safety","definition":"A failure mode in which a model appears aligned during training and evaluation because doing so serves its actual (mesa-)objective, but pursues divergent objectives once deployed or once it judges itself unobserved.","scope":"Deceptive alignment is the most-cited threat model in technical AI-safety arguments for capability evaluations under adversarial conditions. The canonical formulation is Hubinger et al. 
(2019) — a learned inner optimiser may model the training process and behave as if aligned during training as an instrumental subgoal of a different terminal objective. Once the model judges that training has ended and it has been deployed, the deceptive policy diverges.\n\nIts policy relevance lies in what it implies for evaluation: standard benchmark + holdout testing is insufficient if the model can detect evaluation conditions. The EU AI Act's Art. 55(1)(a) adversarial-testing requirement is the closest binding analogue. Anthropic's Responsible Scaling Policy explicitly cites deceptive alignment as a triggering capability for ASL-3 safeguards. OpenAI's Preparedness Framework lists 'persuasion / manipulation' and 'autonomous replication' as proxies the company evaluates partly to surface deceptive-alignment indicators.\n\nThe concept is empirically contested. Critics (Pope et al. 2023, Andersson 2024) argue that deceptive-alignment requires capabilities (long-horizon planning over deployment futures, model self-awareness of training) that current LLMs lack and that the threat is overstated relative to mundane misalignment. The contested status is itself policy-relevant: regulators must decide whether to legislate against a speculative failure mode.","usedByInstruments":["EU-AIA-2024","G7-HIROSHIMA","ANTHROPIC-RSP-2024"],"relatedConcepts":["alignment","mesa-optimization","scalable-oversight","red-team-evaluation"],"relatedTopics":["foundation_models","compute_reporting"],"sourceUrl":"https://arxiv.org/abs/1906.01820","sourceCitation":"Hubinger, E., et al. (2019), 'Risks from Learned Optimization in Advanced Machine Learning Systems.'","empiricalConsensus":"contested","contestedQuestion":"Do current frontier LLMs demonstrably have the capabilities (long-horizon planning, training-process modelling) that deceptive alignment requires? Pope et al. 2023 argue no; the Hubinger lineage argues maybe-soon.","notes":"Empirically contested. 
When citing as a regulatory motivation, pair with at least one critical citation (Pope et al. 2023) so the wiki does not present a contested threat-model as settled."},{"code":"mesa-optimization","label":"Mesa-Optimization","domain":"safety","definition":"The phenomenon in which a learned model itself implements an optimisation algorithm at inference time, producing an inner objective ('mesa-objective') that may differ from the outer training objective.","scope":"Mesa-optimisation, formalised by Hubinger et al. (2019), is the technical substrate of the deceptive-alignment concern. The outer optimisation process (gradient descent) selects parameters that minimise training loss; if those parameters implement an inner search process with its own objective, the inner objective is the 'mesa-objective.' Mesa-optimisation is plausible only for models with sufficient capability to implement learned planners, search procedures, or world models — empirically demonstrated at small scale in toy domains (Hubinger et al. 2021; Park et al. 2023) but not yet at frontier-LLM scale.\n\nGovernance relevance is indirect: if mesa-optimisation is real and detectable, capability evaluations should target the inner objective rather than the outer behavioural metric. The EU AI Act and US EO 14110 do not explicitly require this. Anthropic's RSP and the Frontier Foundation Model Eval Consortium include capability-elicitation methods designed to surface inner objectives, but these are voluntary.\n\nThe concept is contested both empirically (does current SOTA actually mesa-optimise?) and conceptually (is the inner/outer dichotomy the right frame, vs. e.g. context-dependent goals). When citing in policy contexts, signal the contestation status.","usedByInstruments":[],"relatedConcepts":["alignment","deceptive-alignment","scalable-oversight"],"relatedTopics":["foundation_models","compute_reporting"],"sourceUrl":"https://arxiv.org/abs/1906.01820","sourceCitation":"Hubinger, E., et al. 
(2019), 'Risks from Learned Optimization in Advanced Machine Learning Systems.'","empiricalConsensus":"contested","contestedQuestion":"Does current SOTA actually mesa-optimise? Toy-domain demonstrations exist; frontier-scale evidence does not. The inner/outer dichotomy itself is contested as the right frame.","notes":"Mesa-optimisation is currently invoked in policy debates more often as a threat-model rationale than as an empirically-demonstrated failure. Wiki articles citing it should note the empirical-status uncertainty (Avila F6)."},{"code":"scalable-oversight","label":"Scalable Oversight","domain":"safety","definition":"The set of techniques for supervising AI systems whose outputs are too complex, too numerous, or too domain-distant for unaided human evaluators to judge correctness.","scope":"Scalable oversight addresses the 'who watches the watchers' problem at AI scale. When a model produces 10⁶ outputs per day, or operates in a domain where the supervising human is not expert (e.g., novel mathematics, advanced biology), traditional human-in-the-loop review fails. Christiano et al. (2018) 'Supervising Strong Learners by Amplifying Weak Experts' is the foundational articulation. The agenda spans: (a) debate (two AIs argue, a human judges short transcripts — Irving et al. 2018); (b) iterated amplification (humans + assistants supervise stronger models, recursively — Christiano et al. 2018); (c) constitutional AI / RLAIF (rule-based or AI-feedback supervision in place of unscaled human labels — Bai et al. 2022, Anthropic); (d) weak-to-strong generalisation (Burns et al. 2023, OpenAI) — can a weak supervisor train a stronger model to behave well on tasks the weak supervisor cannot grade?\n\nGovernance relevance is direct. EU AI Act Art. 14 mandates 'human oversight' for high-risk systems; the article is written assuming bandwidth-feasible human review, which scalable-oversight literature argues breaks at frontier-model scale. 
UK AISI red-team commitments explicitly invoke scalable-oversight techniques. NIST AI RMF Govern 1.3 calls for documented oversight mechanisms but does not specify scalability requirements. The gap between regulatory 'human oversight' language and the technical reality of supervising super-human-domain outputs is one of the field's most-discussed governance-implementation gaps.","usedByInstruments":["EU-AIA-2024","NIST-AI-RMF","UK-WHITEPAPER-2023","ANTHROPIC-RSP-2024"],"relatedConcepts":["alignment","deceptive-alignment","capability-elicitation","red-team-evaluation"],"relatedTopics":["foundation_models","transparency","redress"],"sourceUrl":"https://arxiv.org/abs/1810.08575","sourceCitation":"Christiano, P., Shlegeris, B., Amodei, D. (2018), 'Supervising Strong Learners by Amplifying Weak Experts.'","empiricalConsensus":"emerging","contestedQuestion":"Which scalable-oversight technique (debate / iterated amplification / constitutional AI / weak-to-strong generalisation) actually works at frontier scale? Field has compelling small-scale demonstrations but no convergent answer.","notes":"Wiki articles citing 'human oversight' under EU AIA Art. 14 should reference scalable-oversight as the field's term for the implementation problem the article gestures at without solving."},{"code":"capability-elicitation","label":"Capability Elicitation","domain":"safety","definition":"Techniques designed to reveal the upper bounds of an AI model's capabilities, rather than measuring its default behaviour, so that downstream safety judgements can be calibrated to what the model *can* do under adversarial prompting or fine-tuning.","scope":"Capability elicitation is methodologically distinct from benchmarking. A benchmark measures average performance under standard prompting; elicitation aims to surface the model's actual capability ceiling. Common methods: (a) adversarial prompting — red-team-style attempts to invoke a withheld behaviour (Branwen 2020, Weidinger et al. 
2024); (b) chain-of-thought + structured prompting — forcing step-by-step reasoning, often revealing skills the model would otherwise hide or skip (Wei et al. 2022); (c) multi-stage / decomposition prompting — breaking tasks into sub-tasks that decompose deception incentives (Andersson 2024); (d) fine-tuning pressure — does the safety behaviour break under modest fine-tuning, indicating the underlying capability is preserved (Qi et al. 2023, 'Fine-tuning Aligned LLMs')?\n\nGovernance relevance: EU AI Act Art. 55(1)(a) adversarial testing presupposes elicitation methods exist. US EO 14110 §4.2(a) reporting includes red-team results, which depend on elicitation methodology choices. The lack of standardisation across elicitation methods is one reason regulator-mandated evaluation results are not directly comparable across providers (Anthropic's elicitation suite ≠ OpenAI's ≠ DeepMind's). The Frontier Foundation Model Eval Consortium is attempting to converge methodology; consensus remains partial.","usedByInstruments":["EU-AIA-2024","US-EO-14110","G7-HIROSHIMA","ANTHROPIC-RSP-2024","OPENAI-PREPAREDNESS-2023","DEEPMIND-FSF-2024","META-FRONTIER-2024","UK-US-AISI-MOU-2024"],"relatedConcepts":["alignment","scalable-oversight","red-team-evaluation","deceptive-alignment"],"relatedTopics":["foundation_models","compute_reporting","transparency"],"sourceUrl":"https://arxiv.org/abs/2310.03693","sourceCitation":"Qi, X., Zeng, Y., Xie, T., Chen, P.-Y., Jia, R., Mittal, P., Henderson, P. (2023), 'Fine-tuning Aligned Language Models Compromises Safety, Even When Users Do Not Intend To!'","empiricalConsensus":"emerging","contestedQuestion":"What is the right standardised elicitation methodology for regulator-mandated capability evaluation? Each frontier lab uses a different suite; Frontier Foundation Model Eval Consortium is converging slowly.","notes":"Distinguish from 'benchmarking' (average-case measurement) and 'red-teaming' (specific adversarial procedure). 
Capability elicitation is the umbrella; red-teaming is one technique under it."},{"code":"dual-use-research-taxonomy","label":"Dual-Use Research Norms (DURC for AI)","domain":"safety","definition":"A normative framework — adapted from biosecurity's Dual-Use Research of Concern (DURC) policies — for governing AI research and publication decisions when research outputs have both beneficial and harmful applications.","scope":"Dual-use research norms in AI explicitly draw on the biosecurity precedent: the 1975 Asilomar conference on recombinant DNA, the 2004 US National Science Advisory Board for Biosecurity, and the 2014 US gain-of-function moratorium. The AI parallels are publication-control debates around GPT-2 (OpenAI's staged release, 2019), the deepfake-generation research community (FaceSwap-era, 2017-2020), CBRN-uplift research, and offensive cybersecurity capabilities (e.g., AutoAttack research). Field positions cluster: (a) full publication — Brundage et al. 2018 critique of selective release; (b) staged or structured access — Solaiman et al. 2019; (c) capability-thresholded redaction — Anthropic, OpenAI, DeepMind dual-use policies, 2023-2025.\n\nGovernance instruments are catching up. US EO 14110 §4.2(a)(ii) explicitly required reporting on dual-use capabilities including CBRN, cyber, and autonomous-replication. EU AI Act Art. 5 prohibits certain dual-use applications (manipulation, social scoring) but does not regulate research-stage decisions. NIST AI RMF Map 1.1 includes 'risk of misuse' assessment but does not prescribe publication norms. 
The G7 Hiroshima Code §3 endorses 'responsible information sharing' without operationalising it.\n\nFor AI safety researchers, dual-use research norms are the closest analogue to peer-review-style governance of which findings should be public — a research-community-internal governance layer that operates upstream of regulator-mandated controls.","usedByInstruments":["US-EO-14110","G7-HIROSHIMA","NIST-AI-RMF","ANTHROPIC-RSP-2024","OPENAI-PREPAREDNESS-2023","DEEPMIND-FSF-2024","META-FRONTIER-2024","WH-VOLUNTARY-2023"],"relatedConcepts":["alignment","capability-elicitation","red-team-evaluation","asl-3"],"relatedTopics":["foundation_models","training_data","transparency"],"sourceUrl":"https://arxiv.org/abs/1908.09203","sourceCitation":"Solaiman, I., et al. (2019), 'Release Strategies and the Social Impacts of Language Models' — the canonical articulation of structured-access norms for foundation models.","empiricalConsensus":"contested","contestedQuestion":"Is the biosecurity DURC analogy applicable to AI? Information-spread dynamics differ fundamentally (Brundage 2023); the field has not converged on whether DURC-style governance translates.","notes":"The biosecurity DURC analogy is contested: critics (Brundage 2023) argue that information-spread dynamics in AI are fundamentally different from biological materials. Pair citations of 'dual-use research norms in AI' with a note on the analogy's contested status."},{"code":"provenance-watermarking","label":"Provenance & Watermarking","domain":"safety","definition":"Cryptographic or perceptual signals embedded in AI-generated content (image, audio, video, text) that enable downstream detection of synthetic origin.","scope":"Provenance and watermarking sit at the intersection of authenticity verification (proving an artifact's source) and AI-generation disclosure (signalling that content is synthetic). 
Two technical lineages converge: (a) cryptographic provenance — content-credential standards like C2PA (Coalition for Content Provenance and Authenticity) that sign metadata into media at capture time; (b) statistical / robust watermarking — perturbation patterns embedded in pixels/audio/text that survive recompression, paraphrasing, or screen-capture.\n\nRegulatory coverage is the most cross-jurisdictionally aligned of any AI-governance domain. EU AI Act Art. 50(4) requires deepfake disclosure and watermarking for AI-generated content. US EO 14110 §4.5 mandated NIST guidance on content authentication (issued 2024; partly rescinded under EO 14179). China's Deep Synthesis Provisions (Art. 16, 2022) require explicit labelling of synthetic content. G7 Hiroshima §5 calls for interoperable provenance mechanisms. Despite this alignment, NO interoperability standard has been agreed: C2PA, SynthID (Google DeepMind), Stable Signature (Meta), and the various per-vendor watermarks remain mutually incompatible. This is the wiki's most actively contested implementation gap.","usedByInstruments":["EU-AIA-2024","US-EO-14110","CN-GENAI-2023","G7-HIROSHIMA","WH-VOLUNTARY-2023","SG-MODEL-AI-2024"],"relatedConcepts":["frontier-tier","model-card"],"relatedTopics":["deepfakes","transparency","training_data"],"sourceUrl":"https://c2pa.org/specifications/specifications/2.1/specs/C2PA_Specification.html","sourceCitation":"C2PA Technical Specification v2.1 (the most widely adopted provenance standard)","empiricalConsensus":"contested","contestedQuestion":"Are robust statistical watermarks durable under adversarial removal at deployment scale? Field has demonstrated breakability for text watermarks (Jovanović et al. 2024, Sadasivan et al. 2023) but image + audio remain more resilient. 
Cross-vendor interoperability standard is also unresolved (C2PA vs SynthID vs Stable Signature).","notes":"When a wiki article references 'watermarking' without scheme qualifier, default to 'robust statistical watermarking' for text+image AI outputs; C2PA-style provenance is a sibling, not a synonym."},{"code":"policy-instrument","label":"Policy Instrument","domain":"policy_instrument","definition":"An identifiable technique of collective action — a binding regulation, an executive order, a voluntary code, a technical standard, a treaty, or similar — by which a public authority structures behaviour to address a policy problem. Instrument choice is itself a substantive policy decision, not a downstream implementation detail.","scope":"The canonical public-policy literature treats a policy instrument as a discrete 'tool of government' deployed to organise collective action. Hood's seminal NATO typology (Hood 1983, The Tools of Government, ch. 1-2) groups instruments by the resource base they exploit — Nodality (information), Authority (legal command), Treasure (fiscal transfer), and Organisation (direct provision). Salamon (2002, The Tools of Government: A Guide to the New Governance, pp. 1-47) extends the frame to a 'third-party governance' world in which most instruments are distributed delivery mechanisms (grants, contracts, vouchers, tax expenditures, regulation), and Howlett (2011, Designing Public Policies, ch. 3-5) operationalises instrument choice as constrained by information, capability, and political variables. The political-sociology tradition (Lascoumes & Le Galès 2007, Governance 20(1): 1-21) goes further: instruments are not neutral techniques but 'a particular form of materialisation of state power' (pp. 
4-5) that produce effects independently of their stated objectives — meaning instrument choice is policy substance.\n\nIn AI governance, the patchwork of binding regulation (EU AIA), executive orders (US EO 14110), voluntary codes (G7 Hiroshima), technical standards (NIST AI RMF), international treaties (CoE AI Convention), and resolutions (UN A/RES/78/265) is best understood not as incoherence but as the predicted response to what Marchant et al. (2011, The Growing Gap Between Emerging Technologies and Legal-Ethical Oversight, ch. 1) call the 'pacing problem' — formal regulation lags capability development by years, so jurisdictions sequence soft-law (norm-setting, capability evaluation) ahead of hard-law (binding obligations). Anderljung et al. (2023, 'Frontier AI Regulation,' arXiv:2307.03718, §3) argue the multi-instrument mix is necessary under dual-use indeterminacy; critics argue it enables regulatory arbitrage.\n\nThe seven InstrumentKind values in this wiki map onto Hood's NATO scheme as follows: binding_regulation + executive_order + international_treaty = Authority; technical_standard = Authority+Nodality hybrid; policy_statement + voluntary_code + resolution = Nodality/sermons. Market-based instruments (tradeable permits, Pigouvian taxes) and pure information instruments (registries, labels) are present in AI governance but not yet first-class categories in this catalog.","usedByInstruments":["EU-AIA-2024","US-EO-14110","US-EO-14179","UK-WHITEPAPER-2023","CN-GENAI-2023","G7-HIROSHIMA","OECD-AI-PRIN","COE-AI-CONV","UN-RES-2024","NIST-AI-RMF"],"relatedConcepts":["model-card","red-team-evaluation","compute-threshold","provenance-watermarking"],"relatedTopics":["foundation_models","compute_reporting","transparency","international_coordination"],"sourceUrl":"https://doi.org/10.1111/j.1468-0491.2007.00342.x","sourceCitation":"Lascoumes, P. & Le Galès, P. (2007). 
Introduction: Understanding Public Policy through Its Instruments — From the Nature of Instruments to the Sociology of Public Policy Instrumentation. Governance 20(1): 1-21. See also Hood (1983) The Tools of Government, ch. 1-2; Salamon (2002) The Tools of Government: A Guide to the New Governance, pp. 1-47; Howlett (2011) Designing Public Policies, ch. 3-5.","empiricalConsensus":"contested","contestedQuestion":"Does the AI-governance multi-instrument patchwork (binding / voluntary / standards / treaty) converge toward hard-law over time (Abbott & Snidal 2000, International Organization 54(3): 421-456) or stabilise as a permanent mixed equilibrium (Pauwelyn et al. 2014)? Related: is the mix a feature of jurisdictional experimentation (Anderljung et al. 2023) or a bug enabling regulatory arbitrage (Russell 2024)? Field consensus is forming but unsettled.","notes":"Foundational concept article for the policy_instrument domain — defines the category that every INSTRUMENTS entry instantiates. When citing 'policy instrument' in other wiki articles without further qualifier, default to the Hood / Salamon / Howlett synthesis; reserve Lascoumes & Le Galès when the article's argument turns on instruments-as-power rather than instruments-as-techniques. 
The seven InstrumentKind values do NOT yet include market-based or pure-information instruments; if a future AI-governance instrument falls outside the seven, expand InstrumentKind rather than forcing a mis-fit."},{"code":"ai-supply-chain","label":"AI Supply Chain","domain":"safety","definition":"The end-to-end pipeline of inputs, intermediate artefacts, and downstream applications by which an AI system is built and deployed — typically decomposed as training data → compute → model weights → fine-tuning → deployment → downstream applications.","scope":"The AI supply-chain framing treats AI development as an industrial value chain in which each upstream stage constrains what the downstream stage can do, and each stage raises distinct governance questions. Training data raises copyright, consent, and bias questions (NYT v. OpenAI, GEMA v. OpenAI, Andersen v. Stability AI). Compute raises export-control and concentration questions (US BIS rules on advanced GPUs to China, the CHIPS Act, the 2023 EU Chips Act). Model weights raise open-vs-closed governance questions (Meta Llama, Mistral, DeepSeek vs. closed frontier labs). Fine-tuning raises capability-elicitation questions (Qi et al. 2023 'Fine-tuning Aligned LLMs Compromises Safety'). Deployment raises monitoring and incident-reporting questions. Downstream applications raise sectoral-liability questions (medical-device AI, automated decision-making in employment).\n\nGovernance treatment is fragmented across the chain. EU AI Act Recital 60 + Art. 25 introduces explicit value-chain obligations: the GPAI provider and the downstream deployer have different obligations, and contracts must allocate them. US EO 14110 §4.2 targeted the compute stage (Defense Production Act reporting for foundation-model training above the threshold). NIST AI RMF GenAI Profile (NIST AI 600-1, 2024) names 'Value Chain and Component Integration' as one of twelve GenAI risk categories. 
ASEAN AI Guide §3 treats the supply chain as a 'shared responsibility' across actors. The supply-chain framing is increasingly the unit of governance analysis because chokepoints (compute access, training-data legality, weight distribution) determine where policy levers have purchase.","usedByInstruments":["EU-AIA-2024","NIST-AI-RMF-GENAI","ASEAN-AI-GUIDE-2024"],"relatedConcepts":["compute-threshold","training-data-attribution","model-card","model-distillation-risk","data-poisoning"],"relatedTopics":["foundation_models","training_data","compute_reporting","sovereign_ai"],"sourceUrl":"https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf","sourceCitation":"NIST AI 600-1 (Jul 2024), 'AI Risk Management Framework: Generative AI Profile' — names 'Value Chain and Component Integration' as a primary risk category.","empiricalConsensus":"emerging","notes":"When citing 'AI supply chain' in policy contexts, name the stage of interest (data / compute / weights / deployment) because governance levers are stage-specific. Confusing stage-level interventions (e.g. export controls on GPUs) with end-to-end claims is one of the most common policy-analysis errors in this domain."},{"code":"training-data-attribution","label":"Training-Data Attribution","domain":"safety","definition":"Technical methods that identify which training examples most influenced a specific AI model output, enabling provenance claims about generated content and supporting copyright / consent / accountability disputes downstream.","scope":"Training-data attribution (TDA) is the inverse of training: given an output, recover the training examples that caused it. The technical lineage runs from influence functions (Koh & Liang 2017, 'Understanding Black-box Predictions via Influence Functions,' ICML) through gradient-based methods (Pruthi et al. 2020, TracIn) to recent scalable approximations for foundation models (Grosse et al. 
2023, Anthropic, 'Studying Large Language Model Generalization with Influence Functions'; Park et al. 2023 TRAK). Adjacent methods include training-data extraction (Carlini et al. 2021, 'Extracting Training Data from Large Language Models') which surfaces verbatim memorisation rather than influence.\n\nGovernance relevance is now legally acute. The NYT v. OpenAI complaint (Dec 2023) used training-data extraction to show verbatim NYT articles in GPT-4 outputs; ongoing US copyright suits (Authors Guild v. OpenAI, Getty v. Stability AI, Tremblay v. OpenAI) turn partly on whether attribution methods can demonstrate substantial similarity at training-corpus scale. EU AI Act Art. 53(1)(c) requires GPAI providers to publish a 'sufficiently detailed summary' of training-data content — a disclosure obligation that is the regulatory analogue of attribution. China's GenAI Measures Art. 7 requires legal sourcing of training data. Brazil's PL 2338/2023 includes an explicit author-compensation provision. India's DPDPA does not yet address training-data rights directly, but the 2024 MEITY advisories signal forthcoming guidance.\n\nMethodologically, TDA at frontier-model scale remains contested: influence-function approximations require restrictive assumptions (locally-linear loss surface) that don't hold for over-parameterised LLMs, and verbatim-extraction methods undercount the (likely larger) population of paraphrased or compositionally-derived outputs.","usedByInstruments":["EU-AIA-2024","CN-GENAI-2023","BR-AIBILL-2024"],"relatedConcepts":["ai-supply-chain","model-card","data-poisoning"],"relatedTopics":["training_data","transparency","redress"],"sourceUrl":"https://arxiv.org/abs/2308.03296","sourceCitation":"Grosse, R., et al. 
(2023), 'Studying Large Language Model Generalization with Influence Functions' (Anthropic) — the canonical articulation of scalable influence-function-based attribution for foundation models.","empiricalConsensus":"emerging","notes":"Distinguish TDA (which training examples *caused* this output, by influence) from training-data extraction (which examples are verbatim recoverable from the model). Both are policy-relevant but for different claims: influence supports causal-contribution arguments, extraction supports memorisation arguments."},{"code":"prompt-injection","label":"Prompt Injection","domain":"safety","definition":"An adversarial input technique in which untrusted content fed to an AI model (e.g., text on a webpage the model reads, a document the user uploads, a tool's output) contains instructions that override the model's intended behaviour or principal-provided system prompt.","scope":"Prompt injection was named by Willison (2022, 'Prompt injection attacks against GPT-3') and formalised by Greshake et al. (2023, 'Not what you've signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection'). The attack class splits into two sub-cases: (a) direct prompt injection — the user (or attacker posing as user) submits adversarial text in the prompt; mitigated partly by training-time alignment + system-prompt design; (b) indirect prompt injection — the model ingests untrusted content (a webpage during browsing, a PDF the user uploads, the output of a tool call) which contains adversarial instructions; the model cannot reliably distinguish 'data' from 'instructions' because both share the same token-stream interface. Indirect injection is the more serious failure mode at deployment because the attacker doesn't need access to the user's session.\n\nNIST AI RMF GenAI Profile (NIST AI 600-1) names prompt injection in the 'Information Security' risk category. EU AI Act Art. 15 ('cybersecurity' requirement for high-risk and Art. 
55 for GPAI with systemic risk) is the closest binding obligation — providers must protect against 'attempts by unauthorised third parties to alter the use, behaviour or performance of the system.' Industry mitigations (constitutional classifiers, dual-LLM gateway patterns, content-isolation tags) are evolving rapidly but no architectural defence is yet known to be robust. The OWASP LLM Top 10 (2023, 2025 update) lists prompt injection as LLM01 — the most-cited application-security risk for LLM-integrated software.","usedByInstruments":["EU-AIA-2024","NIST-AI-RMF-GENAI"],"relatedConcepts":["agentic-system","tool-use-safety","jailbreak-resistance","data-poisoning","retrieval-augmented-generation"],"relatedTopics":["foundation_models","transparency"],"sourceUrl":"https://arxiv.org/abs/2302.12173","sourceCitation":"Greshake, K., Abdelnabi, S., Mishra, S., Endres, C., Holz, T., Fritz, M. (2023), 'Not what you've signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection.'","empiricalConsensus":"settled","notes":"Distinguish prompt injection (instruction-channel attack via shared token stream) from jailbreaking (adversarial-prompt attack targeting alignment training) and from data poisoning (training-time attack). The three are often conflated in policy text but require different mitigations."},{"code":"agentic-system","label":"Agentic AI System","domain":"safety","definition":"An AI system that takes actions in the world — calling tools, executing code, browsing the web, sending messages, planning multi-step sequences — rather than only generating text or images for a human reader.","scope":"An agentic system, in the technical sense, is one whose outputs include actions with external effects (tool calls, API requests, code execution, file writes) and whose loop structure permits multi-step planning over those actions. The architecture pattern emerged with ReAct (Yao et al. 
2022, 'ReAct: Synergizing Reasoning and Acting in Language Models'), AutoGPT and BabyAGI (2023, open-source), and is now the deployment substrate for Claude's tool use, GPT's function calling + assistants API, and Google DeepMind's Project Astra demos. The governance-relevant distinction from chat-only LLMs is that agentic systems can cause harm by acting (sending money, running attacks, exfiltrating data) rather than only by saying — Wittgenstein's 'words can wound' becomes 'words and actions can wound, and the actions are at machine speed.'\n\nRegulatory vocabulary has not caught up. EU AI Act treats agentic systems as a sub-case of GPAI plus deployment context, with no agentic-specific obligations. Seoul Declaration (May 2024) and the 16 frontier-lab Frontier AI Safety Commitments mention 'advanced AI systems' but do not operationalise the agentic-vs-chat distinction. UK AISI's evaluations include agentic-capability tests (autonomous-replication, self-exfiltration) that imply the category but do not define it. The G7 Hiroshima Code §1 uses 'advanced AI' as the umbrella. Industry-side frameworks (Anthropic RSP, OpenAI Preparedness, DeepMind FSF) treat agentic capability as a tier-relevant signal: at sufficient action capability, capability-tier safeguards apply that wouldn't apply to a chat-only model with equal knowledge.","usedByInstruments":["G7-HIROSHIMA","SEOUL-2024","NIST-AI-RMF-GENAI"],"relatedConcepts":["tool-use-safety","scalable-oversight","alignment","deceptive-alignment","multi-turn-evaluation","prompt-injection"],"relatedTopics":["foundation_models","catastrophic_risk","transparency"],"sourceUrl":"https://arxiv.org/abs/2210.03629","sourceCitation":"Yao, S., Zhao, J., Yu, D., Du, N., Shafran, I., Narasimhan, K., Cao, Y. 
(2022), 'ReAct: Synergizing Reasoning and Acting in Language Models.'","empiricalConsensus":"emerging","notes":"When citing 'agentic' in policy contexts, distinguish (a) tool-using LLMs that act through a fixed API surface (most current 'agents'); (b) browser-driven agents with general internet access; (c) embodied agents (robotics + LLM). Each raises distinct governance questions; collapsing the three is one of the most common analytical errors in 2025-2026 policy writing."},{"code":"tool-use-safety","label":"Tool-Use Safety","domain":"safety","definition":"The sub-domain of agentic-system safety concerned with the risks that arise when an AI model invokes external tools (search, code execution, APIs, financial transactions, system commands) — including risks of unintended action, instruction subversion, privilege escalation, and resource consumption.","scope":"Tool-use safety treats the model + tool surface as the unit of analysis rather than the model in isolation. The risk surface expands along several axes: (a) capability composition — a chat-safe model may become capability-dangerous when given a code-execution tool plus internet access; (b) instruction-channel adversaries — tool outputs are an indirect-prompt-injection vector (a web search result containing adversarial instructions); (c) privilege escalation — tools that share authentication with the user may be invoked beyond user intent; (d) resource exhaustion — agents can spend money, compute, or API credits at machine speed; (e) confused-deputy attacks — the tool acts with the user's authority on instructions actually from a third party.\n\nMitigation patterns include: capability allowlists (only specific tools, specific scopes), human-in-the-loop confirmation for high-impact actions (the OpenAI Operator + Anthropic Computer Use UX patterns), output-isolation tags (Anthropic's tool-result-tag scheme), instruction-hierarchy training (Wallace et al. 2024), and gateway-LLM patterns (Willison 2023 dual-LLM). 
NIST AI RMF GenAI Profile §2.12 'Value Chain and Component Integration' touches the tool-integration risk. EU AI Act Art. 14 'human oversight' is the closest binding obligation but presumes human-bandwidth-feasible review, which agentic systems break at scale. Industry-side frameworks (Anthropic RSP, OpenAI Preparedness) treat tool-use capability as a tier-relevant signal.","usedByInstruments":["NIST-AI-RMF-GENAI"],"relatedConcepts":["agentic-system","scalable-oversight","prompt-injection","alignment","capability-elicitation"],"relatedTopics":["foundation_models","catastrophic_risk"],"sourceUrl":"https://arxiv.org/abs/2404.13208","sourceCitation":"Wallace, E., et al. (2024), 'The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions' (OpenAI) — the canonical industry articulation of instruction-channel hierarchy as a tool-use-safety defence.","empiricalConsensus":"emerging","notes":"Tool-use safety is the sub-problem of agentic-system safety where the action surface is mediated by discrete tool calls. The boundary with general agentic-system safety is fuzzy when tools include code execution (which is effectively a universal action)."},{"code":"multi-turn-evaluation","label":"Multi-Turn Evaluation","domain":"safety","definition":"An evaluation methodology that probes AI models across multi-step conversations rather than single prompts — designed to surface deception, sycophancy, context-accumulation jailbreaks, and capability degradation that single-prompt benchmarks miss.","scope":"Single-turn benchmarks (MMLU, HumanEval, GPQA) measure performance on independent prompts. Multi-turn evaluation extends the protocol to dialogues, with each model response feeding into the next prompt. This methodology surfaces failure modes that single-turn evaluation misses: (a) sycophancy drift — the model progressively conforms to user beliefs across turns (Sharma et al. 
2023, 'Towards Understanding Sycophancy in Language Models'); (b) jailbreak via context accumulation — many-shot jailbreaking (Anil et al. 2024, Anthropic, 'Many-shot Jailbreaking') exploits the long context window; (c) deceptive alignment indicators — multi-turn probes can elicit inconsistencies between model self-reports across turns (Pacchiardi et al. 2023, 'How to Catch an AI Liar'); (d) capability elicitation — chain-of-thought + decomposition prompting often outperforms single-shot prompting (Wei et al. 2022, Andersson 2024). Benchmarks such as MT-Bench (Zheng et al. 2023), AgentBench (Liu et al. 2024), and HarmBench (Mazeika et al. 2024) operationalise the multi-turn protocol.\n\nGovernance relevance: EU AI Act Art. 55(1)(a) adversarial-testing requirement presupposes that the testing methodology can detect deployment-realistic failure modes — many of which are multi-turn-only. UK AISI's pre-deployment evaluation suite includes multi-turn jailbreak + agentic-trajectory probes. NIST AI RMF GenAI Profile Manage 2.3 calls for evaluation 'across the lifecycle' which implicitly covers multi-turn. Standardisation across providers remains partial — each frontier lab uses a different multi-turn methodology, making cross-vendor comparison fraught (Frontier Foundation Model Eval Consortium converging slowly).","usedByInstruments":["EU-AIA-2024","NIST-AI-RMF-GENAI"],"relatedConcepts":["capability-elicitation","red-team-evaluation","jailbreak-resistance","deceptive-alignment","sandbagging","agentic-system"],"relatedTopics":["foundation_models","compute_reporting","transparency"],"sourceUrl":"https://arxiv.org/abs/2306.05685","sourceCitation":"Zheng, L., et al. 
(2023), 'Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena' — operationalises the multi-turn evaluation protocol for foundation models.","empiricalConsensus":"emerging","notes":"Multi-turn evaluation is the umbrella; specific protocols (many-shot probing, agentic trajectories, conversational red-teaming) are sub-cases. When citing in policy text, name the specific protocol to avoid the methodology-laundering risk where 'we did multi-turn evaluation' substitutes for substantive methodology disclosure."},{"code":"data-poisoning","label":"Data Poisoning","domain":"safety","definition":"A training-time attack in which an adversary inserts crafted examples into the training corpus or fine-tuning dataset to alter the resulting model's behaviour — typically inserting a backdoor that triggers on a specific input pattern or degrading performance on a target class.","scope":"Data poisoning is the canonical training-time adversarial attack. The lineage runs from Biggio et al. (2012, 'Poisoning Attacks against Support Vector Machines') through targeted backdoor attacks on deep networks (Gu et al. 2017, 'BadNets'; Chen et al. 2017) to recent work on foundation-model corpora (Carlini et al. 2024, 'Poisoning Web-Scale Training Datasets is Practical'). Two sub-cases matter: (a) targeted poisoning — adversary inserts examples to cause specific misclassification or backdoor on a trigger; (b) untargeted poisoning — adversary degrades overall performance, often as denial-of-service. For foundation models trained on web-scale corpora (Common Crawl, LAION), the practicality bar is low: Carlini et al. (2024) demonstrated that injecting poisoned examples into ~0.01% of the training corpus is feasible for an attacker controlling a handful of expired domains.\n\nGovernance relevance is direct and increasingly cited. NIST AI RMF GenAI Profile (NIST AI 600-1) §2.9 'Information Security' names data poisoning. EU AI Act Art. 15 cybersecurity obligations + Art. 
55 systemic-risk obligations require protection against 'attempts to alter the use, behaviour or performance of the system' which covers training-time attacks. China's GenAI Measures Art. 7 mandates legal-source training data, which intersects with poisoning resistance. The governance gap: poisoning resistance is hard to verify post-hoc — once a model is trained, distinguishing poisoned-but-undetected from clean is an open problem. For open-data + open-weight foundation models (Pile, RedPajama, Llama series), poisoning resistance must be designed in at curation time.","usedByInstruments":["EU-AIA-2024","NIST-AI-RMF-GENAI","CN-GENAI-2023"],"relatedConcepts":["ai-supply-chain","training-data-attribution","model-distillation-risk","jailbreak-resistance","prompt-injection"],"relatedTopics":["training_data","foundation_models","transparency"],"sourceUrl":"https://arxiv.org/abs/2302.10149","sourceCitation":"Carlini, N., et al. (2024), 'Poisoning Web-Scale Training Datasets is Practical' — establishes practical feasibility of poisoning frontier-model training corpora.","empiricalConsensus":"settled","notes":"Distinguish data poisoning (training-time corpus attack) from prompt injection (inference-time input attack) and from model distillation risk (post-training capability leak). All three are sometimes conflated under 'adversarial attacks on LLMs' but require distinct mitigations."},{"code":"model-distillation-risk","label":"Model Distillation Risk","domain":"safety","definition":"The risk that a closed-weight frontier model's capabilities can be partially recovered by training a smaller open-weight model on the closed model's outputs, undermining the governance assumption that closed weights confer capability containment.","scope":"Knowledge distillation (Hinton et al. 2015, 'Distilling the Knowledge in a Neural Network') is a benign technique for compressing teacher models into smaller student models. 
The governance concern is that distillation works across organisational boundaries: an attacker (or unaligned actor) can query a closed frontier API at scale, collect input-output pairs, and train an open-weight model that approximates the closed teacher's capabilities. Empirical examples have driven the policy debate: Alpaca (Stanford, 2023) + Vicuna (LMSYS, 2023) demonstrated that 52K-100K instruction-following examples from GPT-3.5 sufficed to produce a competent open student; DeepSeek-R1's Jan 2025 release used distillation-from-traces to produce reasoning capabilities that approach o1-class systems. Industry terms-of-service (OpenAI, Anthropic, Google) prohibit using outputs to train competing models, but enforcement against jurisdictionally-distant actors is limited.\n\nThe governance implication is structural: the open-vs-closed debate (Llama, Mistral, DeepSeek vs. Anthropic, OpenAI, Google DeepMind) hinges partly on whether closed-weight release actually contains capability. If distillation is robust, closed-vs-open is a capability-acquisition-delay measure rather than a capability-containment measure. EU AI Act, US EO 14110, and G7 Hiroshima all presume closed-weight containment in their compute-threshold + capability-evaluation regimes; the distillation effect is not explicitly addressed. Anthropic, OpenAI, and DeepMind have published distillation-defence research (output watermarks, model-fingerprint methods) but no robust technical fix exists.","usedByInstruments":[],"relatedConcepts":["ai-supply-chain","capability-elicitation","frontier-tier","compute-threshold","inference-time-compute"],"relatedTopics":["foundation_models","compute_reporting","sovereign_ai"],"sourceUrl":"https://arxiv.org/abs/1503.02531","sourceCitation":"Hinton, G., Vinyals, O., Dean, J. 
(2015), 'Distilling the Knowledge in a Neural Network' — the foundational distillation paper; the governance-relevant adaptation runs through Alpaca/Vicuna (2023) and DeepSeek-R1 (2025).","empiricalConsensus":"contested","contestedQuestion":"Does distillation transfer the substantive capabilities of frontier closed models, or only superficial mimicry of style + format? Empirical evidence is mixed — Alpaca/Vicuna evaluations showed style transfer but limited reasoning transfer (Gudibande et al. 2023, 'The False Promise of Imitating Proprietary LLMs'); DeepSeek-R1 distillation showed substantive reasoning transfer. The field is split.","notes":"When citing 'distillation' in policy contexts, distinguish (a) benign within-organisation compression; (b) competitive cross-organisation distillation via API outputs (the governance concern). The Gudibande et al. 2023 'false promise' caveat is important — early distillation results overstated capability transfer."},{"code":"jailbreak-resistance","label":"Jailbreak Resistance","domain":"safety","definition":"The robustness of an AI model's safety training against adversarial prompts crafted to elicit policy-prohibited outputs — distinct from alignment (which concerns the model's goals) and from baseline safety training (which concerns the model's defaults).","scope":"Jailbreak resistance is the operational counterpart to alignment. A model can be 'aligned' in the sense of internalising its principal's intent at training time and still be 'jailbreakable' in the sense that adversarial prompting recovers prohibited behaviours. The attack literature is extensive: roleplay-framing attacks (DAN-style prompts, 2022-2023), encoding attacks (Wei et al. 2023, 'Jailbroken: How Does LLM Safety Training Fail?'), gradient-based suffix attacks (Zou et al. 2023, 'Universal and Transferable Adversarial Attacks on Aligned Language Models'), many-shot jailbreaking (Anil et al. 
2024, Anthropic, exploiting long context), and persuasion-style attacks (Zeng et al. 2024, 'How Johnny Can Persuade LLMs to Jailbreak Them'). Industry defences (constitutional classifiers, RLHF + constitutional AI, output filters, multi-stage safety pipelines) are improving but no model has demonstrated full robustness; the white-hat assumption is that adequately-resourced attackers can find a working jailbreak for any current frontier model.\n\nGovernance relevance: EU AI Act Art. 55(1)(a) adversarial-testing requirement directly targets jailbreak resistance; the testing methodology must include adversarial probing. UK AISI evaluations include public-domain + novel jailbreak probes. NIST AI RMF GenAI Profile §2.9 'Information Security' addresses adversarial robustness. Industry-side frameworks (Anthropic RSP, OpenAI Preparedness, DeepMind FSF) treat jailbreak resistance as one input to capability-tier safeguards — at high CBRN-uplift capability, jailbreak resistance becomes load-bearing for deployment safety.","usedByInstruments":["EU-AIA-2024","NIST-AI-RMF-GENAI","G7-HIROSHIMA"],"relatedConcepts":["red-team-evaluation","alignment","capability-elicitation","multi-turn-evaluation","prompt-injection","data-poisoning"],"relatedTopics":["foundation_models","transparency","catastrophic_risk"],"sourceUrl":"https://arxiv.org/abs/2307.15043","sourceCitation":"Zou, A., Wang, Z., Kolter, J. Z., Fredrikson, M. (2023), 'Universal and Transferable Adversarial Attacks on Aligned Language Models' — the canonical demonstration that gradient-based suffix attacks transfer across aligned LLMs.","empiricalConsensus":"settled","notes":"Distinguish jailbreak resistance (robustness to adversarial elicitation of prohibited outputs) from alignment (whether the model's goals match the principal's) and from prompt injection (whether untrusted content can hijack the instruction channel). 
All three are necessary but none is sufficient for deployment safety."},{"code":"model-merging-risk","label":"Model-Merging Risk","domain":"safety","definition":"The governance concern that post-training combination of multiple specialised models — via weight averaging, task-arithmetic, or modular merging — can produce capability or safety properties not present in any single source model, in ways the original safety evaluations would miss.","scope":"Model merging refers to a family of post-training techniques that combine the weights of multiple fine-tuned models into a single composite model without further training. Methods include simple weight averaging (Wortsman et al. 2022, 'Model Soups'), task arithmetic (Ilharco et al. 2023, 'Editing Models with Task Arithmetic'), TIES-Merging (Yadav et al. 2023, NeurIPS), DARE (Yu et al. 2024), and SLERP-style interpolation. The technique has exploded among open-weight finetuners on Hugging Face — by late-2024 a substantial fraction of the top-ranked Open LLM Leaderboard models were merges rather than single-source fine-tunes.\n\nThe governance concern arises from a basic combinatorial fact: safety properties are not preserved under merging. A model that has been safety-trained on harmful-content refusals can be merged with a 'helpful-only' or 'uncensored' fine-tune to produce a model that recovers the underlying capability while losing the safety training (Bhardwaj et al. 2024, 'Language Models are Homer Simpson! Safety Re-Alignment of Fine-tuned Language Models through Task Arithmetic'). Conversely, capability properties can emerge from merges that weren't in any source model. None of the major regulatory regimes (EU AI Act, US EO 14110, China GenAI Measures, NIST AI RMF) explicitly addresses model merging — the regulatory unit of analysis is 'a model' rather than 'a model + its merge descendants.' 
This is one of the most clearly identified under-governed surfaces in the open-weight ecosystem.","usedByInstruments":[],"relatedConcepts":["ai-supply-chain","model-distillation-risk","capability-elicitation","jailbreak-resistance","alignment"],"relatedTopics":["foundation_models","training_data"],"sourceUrl":"https://arxiv.org/abs/2402.11746","sourceCitation":"Bhardwaj, R., et al. (2024), 'Language Models are Homer Simpson! Safety Re-Alignment of Fine-tuned Language Models through Task Arithmetic' — canonical demonstration that safety training is not preserved under task arithmetic / merging.","empiricalConsensus":"emerging","notes":"Model merging is under-governed because regulatory frameworks treat 'the model' as a discrete artefact, whereas open-weight merging produces an unbounded descendant tree. When citing in policy contexts, note the regulatory-unit-of-analysis problem explicitly."},{"code":"inference-time-compute","label":"Inference-Time Compute","domain":"compute","definition":"The scaling regime in which model capability is increased by spending more compute at inference time (multiple samples, search, longer reasoning chains, tool-using iteration) rather than by training a larger model — disrupting the training-compute-as-capability-proxy assumption underlying most current AI governance.","scope":"The dominant assumption underlying compute-threshold regulation (EU AIA Art. 51, US EO 14110 §4.2(a)) is that training compute correlates with deployment capability. Inference-time-compute scaling complicates this: a model trained at compute level C can be deployed with inference-time compute K·C per response, producing capability properties intermediate between the base model and a model trained at K·C. OpenAI's o1 (Sep 2024) and o3 (Dec 2024) series, Anthropic's extended-thinking modes, DeepMind's AlphaCode-2 / AlphaProof, and DeepSeek-R1 (Jan 2025) demonstrate the regime empirically. Snell et al. 
(2024, 'Scaling LLM Test-Time Compute Optimally') and Brown et al. (2024) provide the empirical scaling laws.\n\nGovernance implications are direct. (a) Compute thresholds based on training-FLOPs alone (EU AIA 10²⁵, US EO 10²⁶) understate the deployed capability of inference-scaled models. (b) DeepSeek-R1 demonstrated frontier-tier reasoning at training-compute well below 10²⁵ FLOPs, weakening the threshold's empirical defensibility. (c) Capability evaluations must specify the inference-compute budget under which the model was tested, since a model can be safe at K=1 and dangerous at K=100. (d) The mitigation surface for inference-time-scaled capabilities is different — restricting access to high-compute deployment APIs is policy-tractable in a way that restricting model-weight distribution is not. The Seoul Declaration + Frontier AI Safety Commitments (May 2024) gesture at this with 'pre-deployment evaluation under realistic conditions,' but no regulator has yet formalised inference-compute-aware thresholds.","usedByInstruments":["SEOUL-2024"],"relatedConcepts":["compute-threshold","frontier-tier","capability-elicitation","model-distillation-risk","agentic-system"],"relatedTopics":["foundation_models","compute_reporting","tech_sovereignty"],"sourceUrl":"https://arxiv.org/abs/2408.03314","sourceCitation":"Snell, C., Lee, J., Xu, K., Kumar, A. (2024), 'Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters' — establishes inference-time-compute scaling as a first-class capability lever.","empiricalConsensus":"emerging","notes":"When citing 'compute' in AI-governance contexts post-2024, specify whether the claim is about training-time or inference-time compute. 
Conflating the two is the most common analytical error in 2025-2026 policy writing on compute thresholds."},{"code":"sandbagging","label":"Sandbagging","domain":"safety","definition":"A theoretical failure mode in which a model deliberately underperforms on capability evaluations — either to avoid triggering capability-tier safeguards or because of subtle training-time incentives that reward apparent low capability under evaluation conditions.","scope":"Sandbagging is the inverse of capability elicitation: the elicitation problem is whether evaluators can reveal what the model can do; the sandbagging problem is whether the model has incentive to hide what it can do. The theoretical case (Hubinger et al. 2019, situated alignment; Andersson 2024, capability evaluation under adversarial conditions; van der Weij et al. 2024, 'AI Sandbagging: Language Models can Strategically Underperform on Evaluations') argues that a sufficiently situationally-aware model could in principle detect evaluation conditions (telltale prompt formatting, benchmark phrasing) and produce evaluator-pleasing low-capability outputs while behaving differently in deployment. Empirical demonstrations are limited but growing: van der Weij et al. (2024) show that GPT-4 and Claude-3-Opus can be prompted to strategically underperform on specific benchmarks; whether they do so absent such prompting is unresolved.\n\nGovernance relevance is theoretical-becoming-practical. EU AI Act Art. 55(1)(a) adversarial testing requires evaluation under adversarial conditions, which presumes sandbagging-resistant methodology. UK AISI's evaluation suite includes elicitation methods (chain-of-thought, fine-tuning pressure) designed partly to defeat sandbagging. 
The conceptual challenge is that sandbagging cannot be definitively ruled out by behavioural evaluation alone — if a model is sufficiently capable of detecting evaluation, only mechanistic-interpretability + capability-elicitation-under-fine-tuning-pressure provides evidence. Sandbagging is one of the load-bearing threat models for the argument that current capability evaluations may systematically under-state frontier-model deployment risk.","usedByInstruments":["EU-AIA-2024"],"relatedConcepts":["capability-elicitation","deceptive-alignment","multi-turn-evaluation","red-team-evaluation","alignment"],"relatedTopics":["foundation_models","compute_reporting"],"sourceUrl":"https://arxiv.org/abs/2406.07358","sourceCitation":"van der Weij, T., Hofstätter, F., Jaffe, O., Brown, S., Ward, F. (2024), 'AI Sandbagging: Language Models can Strategically Underperform on Evaluations.'","empiricalConsensus":"contested","contestedQuestion":"Does sandbagging occur absent explicit prompting in current frontier LLMs? van der Weij et al. 2024 demonstrate it can be prompted; whether it emerges spontaneously is an open empirical question with implications for evaluation-methodology design.","notes":"Empirical existence of unprompted sandbagging in current frontier models is contested. When citing as a regulatory motivation, pair with at least one critical citation (Andersson 2024 capability-elicitation literature) so the wiki does not present a contested threat-model as settled."},{"code":"hallucination","label":"Hallucination","domain":"safety","definition":"Confidently-asserted but factually incorrect output produced by an AI model — including fabricated citations, invented people or events, and confabulated numerical values — that the model cannot reliably distinguish from correct output at generation time.","scope":"Hallucination, in the foundation-model-output sense, was named by Ji et al. 
(2023, 'Survey of Hallucination in Natural Language Generation') and has become the canonical term for LLM factual error. The phenomenon decomposes into intrinsic hallucination (output contradicts available context) and extrinsic hallucination (output asserts facts that aren't grounded in context). NIST AI RMF GenAI Profile (NIST AI 600-1) names 'Confabulation' as a primary risk category, capturing the same phenomenon under a different label (NIST's choice signals a preference against anthropomorphic framing).\n\nGovernance relevance touches four surfaces. (a) Liability — when an AI-mediated legal brief contains hallucinated citations (Mata v. Avianca, 2023, S.D.N.Y.), who bears responsibility: the lawyer, the AI provider, or the AI deployer? EU AI Act Art. 13 transparency requirements + Art. 86 right-to-explanation are the closest binding frame. (b) Disclosure — should providers disclose hallucination rates as part of model-card disclosures (EU AIA Art. 53)? Industry practice is partial. (c) Redress — when hallucinated output causes harm (defamation via fabricated facts, financial loss via wrong numbers), redress mechanisms are unclear. EU AIA Art. 85 + OECD Principle 1.5 (accountability) frame the obligation; operationalisation is inconsistent. (d) Sectoral safety — hallucination in healthcare (medical-misinformation), criminal-justice (false-positive risk scores), and education (factual errors as authoritative output) drives most sectoral guidance. NIST AI 600-1 explicitly treats confabulation as a primary risk; UK AISI evaluations include factuality probes; Brazil PL 2338/2023 includes accuracy obligations.\n\nMethodologically, hallucination cannot be eliminated by current architectures (Xu et al. 2024, 'Hallucination is Inevitable'). 
Mitigation is via retrieval-augmented generation, confidence calibration, and post-hoc verification — not architectural fixes.","usedByInstruments":["EU-AIA-2024","NIST-AI-RMF-GENAI","BR-AIBILL-2024","OECD-AI-PRIN"],"relatedConcepts":["retrieval-augmented-generation","model-card","training-data-attribution","scalable-oversight"],"relatedTopics":["foundation_models","transparency","redress","healthcare"],"sourceUrl":"https://arxiv.org/abs/2202.03629","sourceCitation":"Ji, Z., et al. (2023), 'Survey of Hallucination in Natural Language Generation,' ACM Computing Surveys 55(12): 1-38.","empiricalConsensus":"settled","notes":"NIST AI 600-1 prefers 'confabulation' over 'hallucination' to avoid anthropomorphic framing; the two terms are interchangeable in current technical literature but the policy-vocabulary choice signals editorial discipline. Wiki articles should default to 'hallucination' as the more widely-used term, but cite the NIST framing when paralleling AI 600-1."},{"code":"in-context-learning","label":"In-Context Learning","domain":"safety","definition":"The capacity of a foundation model to adapt its behaviour to a new task purely from examples provided in the prompt, without any updates to the model's weights — discovered as an emergent property of large language models and now a primary evaluation surface.","scope":"In-context learning (ICL) was named by Brown et al. (2020, 'Language Models are Few-Shot Learners,' the GPT-3 paper) as the surprising observation that sufficiently large language models could perform new tasks from a few demonstrations in the prompt. The phenomenon is empirically robust across scales above ~1B parameters; theoretical accounts (Xie et al. 2022, 'An Explanation of In-context Learning as Implicit Bayesian Inference'; Garg et al. 2022; von Oswald et al. 2023, 'Transformers Learn In-Context by Gradient Descent') propose various mechanisms but no consensus mechanism has emerged.\n\nGovernance relevance is methodological. 
(a) Capability evaluations that test only baseline prompting under-state real-world capability, because deployment prompts routinely include task examples (Wei et al. 2022 chain-of-thought; Anil et al. 2024 many-shot). EU AI Act Art. 55(1)(a) adversarial testing must include ICL-mode probing to be capability-accurate. (b) Safety evaluations that test only baseline refusals under-state real-world failure surface, because many-shot jailbreaking exploits ICL to recover prohibited capabilities (Anil et al. 2024). (c) Model-card disclosures should specify which capabilities are baseline vs ICL-elicited (EU AIA Art. 53 transparency obligation). (d) ICL also affects the open-vs-closed debate: a closed model accessed via API still exposes ICL-elicitation surface, weakening the capability-containment assumption.","usedByInstruments":["EU-AIA-2024","NIST-AI-RMF-GENAI"],"relatedConcepts":["capability-elicitation","multi-turn-evaluation","jailbreak-resistance","agentic-system","inference-time-compute"],"relatedTopics":["foundation_models","compute_reporting","transparency"],"sourceUrl":"https://arxiv.org/abs/2005.14165","sourceCitation":"Brown, T., et al. (2020), 'Language Models are Few-Shot Learners' (GPT-3 paper) — the canonical articulation of in-context learning as an emergent capability.","empiricalConsensus":"settled","notes":"Distinguish ICL (in-prompt example-based adaptation) from fine-tuning (weight-update-based adaptation) and from retrieval-augmented generation (retrieved-context-based adaptation). 
ICL and retrieval-augmented generation change deployed capability without modifying the underlying model's weights, whereas fine-tuning updates them; the three operate at different latencies + with different governance surfaces."},{"code":"retrieval-augmented-generation","label":"Retrieval-Augmented Generation (RAG)","domain":"safety","definition":"An AI system pattern in which a model's outputs are conditioned on external content retrieved at inference time from a knowledge source — combining the parametric knowledge of the model with the up-to-date or domain-specific knowledge of the retrieval index.","scope":"Retrieval-augmented generation was formalised by Lewis et al. (2020, 'Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks,' NeurIPS) and is now the dominant pattern for deploying LLMs against proprietary, current, or specialised knowledge. The architecture pattern: at inference time, the user query is used to retrieve k documents from an index (vector store, search engine, structured database); those documents are appended to the prompt context; the model generates an answer conditioned on both its parametric memory and the retrieved context. RAG is the substrate for most enterprise LLM deployments — legal assistants citing case law, customer-support agents citing product docs, medical-AI citing clinical guidelines.\n\nGovernance relevance opens a distinct surface from pure-LLM outputs. (a) Provenance — retrieved content has its own source attribution that must flow into the output; this is the technical substrate for citation-verifiability requirements (EU AIA Art. 50 transparency for AI-generated content). (b) Hallucination mitigation — RAG reduces but does not eliminate hallucination, because the model may still misquote or compositionally fabricate from retrieved sources. (c) Indirect prompt injection — the retrieval corpus is a primary adversarial-input vector (Greshake et al. 2023); an attacker who can plant content in the retrievable index can hijack the model. 
(d) Downstream-misinformation risk — RAG systems that surface low-quality sources amplify them with authoritative voice. (e) IP + training-data overlap — RAG creates a deployment-time analogue of training-data attribution questions, since retrieved-and-paraphrased content may infringe copyright at use-time. NIST AI RMF GenAI Profile §2.7 'Value Chain and Component Integration' is the closest applicable frame, though it is voluntary guidance rather than binding law; EU AI Act Art. 53 GPAI obligations apply to the model but the retrieval-index layer is largely unregulated.","usedByInstruments":["EU-AIA-2024","NIST-AI-RMF-GENAI"],"relatedConcepts":["hallucination","prompt-injection","training-data-attribution","ai-supply-chain","in-context-learning"],"relatedTopics":["foundation_models","training_data","transparency","redress"],"sourceUrl":"https://arxiv.org/abs/2005.11401","sourceCitation":"Lewis, P., et al. (2020), 'Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks,' NeurIPS — the canonical articulation of RAG.","empiricalConsensus":"settled","notes":"When citing RAG in policy contexts, distinguish the model-layer governance surface (EU AIA Art. 53 model-card obligations) from the retrieval-index-layer governance surface (largely unregulated). The retrieval layer is where most enterprise deployments concentrate risk in 2025-2026 because it sees less regulatory scrutiny than the underlying model."}],"coverage":{"EU-AIA-2024:foundation_models":{"type":"governs","citation":"Arts. 51-55 (general-purpose AI + systemic risk)","confidence":"high"},"EU-AIA-2024:biometric_id":{"type":"governs","citation":"Art. 5(1)(h) prohibition + Art. 26(10) post-hoc rules","confidence":"high"},"EU-AIA-2024:deepfakes":{"type":"governs","citation":"Art. 
50(4) (disclosure obligation for deep fakes)","confidence":"high"},"EU-AIA-2024:employment":{"type":"governs","citation":"Annex III §4 (high-risk: employment management)","confidence":"high"},"EU-AIA-2024:healthcare":{"type":"governs","citation":"Annex III §5(a) (high-risk: essential services) + MDR overlap","confidence":"high"},"EU-AIA-2024:criminal_justice":{"type":"governs","citation":"Annex III §6 (high-risk: law enforcement)","confidence":"high"},"EU-AIA-2024:education":{"type":"governs","citation":"Annex III §3 (high-risk: educational access)","confidence":"high"},"EU-AIA-2024:compute_reporting":{"type":"governs","citation":"Art. 52 + Annex XIII (10²⁵ FLOP presumption)","confidence":"high"},"EU-AIA-2024:transparency":{"type":"governs","citation":"Arts. 13, 50 (transparency obligations)","confidence":"high"},"EU-AIA-2024:redress":{"type":"governs","citation":"Art. 85 (right to lodge complaints)","confidence":"high"},"EU-AIA-2024:training_data":{"type":"implicit","citation":"Recital 105; CDSM Directive provides primary copyright framework","confidence":"medium"},"EU-AIA-2024:sovereign_ai":{"type":"silent","citation":"No explicit sovereign-AI doctrine","confidence":"high"},"US-EO-14110:foundation_models":{"type":"governs","citation":"§4.2(a) — Defense Production Act reporting","confidence":"high"},"US-EO-14110:biometric_id":{"type":"implicit","citation":"§7 civil rights; sectoral agencies retain authority","confidence":"medium"},"US-EO-14110:deepfakes":{"type":"governs","citation":"§4.5 (content authentication, watermarking)","confidence":"high"},"US-EO-14110:employment":{"type":"implicit","citation":"§6 + DOL guidance; sectoral","confidence":"medium"},"US-EO-14110:healthcare":{"type":"implicit","citation":"§8 + HHS strategy","confidence":"medium"},"US-EO-14110:criminal_justice":{"type":"governs","citation":"§7.1(b) (DOJ AI use review)","confidence":"high"},"US-EO-14110:education":{"type":"implicit","citation":"§8(b) + ED 
guidance","confidence":"medium"},"US-EO-14110:compute_reporting":{"type":"governs","citation":"§4.2(a)(i) — 10²⁶ FLOP threshold","confidence":"high"},"US-EO-14110:transparency":{"type":"implicit","citation":"§4.2(a)(i) (reporting includes red-team results)","confidence":"medium"},"US-EO-14110:redress":{"type":"silent","citation":"Sectoral; no general AI-redress mechanism"},"US-EO-14110:training_data":{"type":"silent","citation":"Copyright addressed by courts + USCO, not EO"},"US-EO-14110:sovereign_ai":{"type":"governs","citation":"§4.8 (BIS export controls leverage)"},"UK-WHITEPAPER-2023:foundation_models":{"type":"implicit","citation":"Cross-cutting principles; sector regulators apply"},"UK-WHITEPAPER-2023:biometric_id":{"type":"implicit","citation":"ICO + Surveillance Camera Commissioner remit"},"UK-WHITEPAPER-2023:deepfakes":{"type":"silent","citation":"Online Safety Act 2023 covers harmful content separately"},"UK-WHITEPAPER-2023:employment":{"type":"implicit","citation":"ICO + EHRC remit"},"UK-WHITEPAPER-2023:healthcare":{"type":"implicit","citation":"MHRA software-as-medical-device"},"UK-WHITEPAPER-2023:criminal_justice":{"type":"implicit","citation":"Forensic Information Databases Strategy Board"},"UK-WHITEPAPER-2023:education":{"type":"silent","citation":"No dedicated guidance"},"UK-WHITEPAPER-2023:compute_reporting":{"type":"silent","citation":"Voluntary AISI testing instead; no statutory reporting"},"UK-WHITEPAPER-2023:transparency":{"type":"implicit","citation":"Principle 4 (transparency + explainability)"},"UK-WHITEPAPER-2023:redress":{"type":"implicit","citation":"Principle 5 (contestability + redress)","confidence":"medium"},"UK-WHITEPAPER-2023:training_data":{"type":"silent","citation":"TDM (text-and-data-mining) exception consultation 2024 pending"},"UK-WHITEPAPER-2023:sovereign_ai":{"type":"silent","citation":"No explicit sovereign-AI position"},"CN-GENAI-2023:foundation_models":{"type":"governs","citation":"Art. 
2 (applies to GenAI services regardless of size)"},"CN-GENAI-2023:biometric_id":{"type":"silent","citation":"Covered by Personal Information Protection Law separately"},"CN-GENAI-2023:deepfakes":{"type":"governs","citation":"Art. 12 (labelling) + Deep Synthesis Rules","confidence":"high"},"CN-GENAI-2023:employment":{"type":"silent","citation":"No specific provisions"},"CN-GENAI-2023:healthcare":{"type":"silent","citation":"Sectoral rules"},"CN-GENAI-2023:criminal_justice":{"type":"silent","citation":"Internal-government uses excluded from scope"},"CN-GENAI-2023:education":{"type":"silent","citation":"Sectoral rules"},"CN-GENAI-2023:compute_reporting":{"type":"silent","citation":"Service-deployment trigger, not compute"},"CN-GENAI-2023:transparency":{"type":"conflicts","citation":"Art. 4 + Algorithm Recommendation Rules — disclosure to CAC, not public; conflicts with EU public-disclosure model"},"CN-GENAI-2023:redress":{"type":"governs","citation":"Art. 15 (complaint channels)"},"CN-GENAI-2023:training_data":{"type":"governs","citation":"Art. 7 (legal source + IP requirements)"},"CN-GENAI-2023:sovereign_ai":{"type":"governs","citation":"Art. 
17 (registration + algorithm filing)"},"G7-HIROSHIMA:foundation_models":{"type":"governs","citation":"Code applies to advanced AI"},"G7-HIROSHIMA:biometric_id":{"type":"silent","citation":"Not addressed"},"G7-HIROSHIMA:deepfakes":{"type":"governs","citation":"Code §5 (content provenance + watermarking)"},"G7-HIROSHIMA:employment":{"type":"silent","citation":"Not addressed directly"},"G7-HIROSHIMA:healthcare":{"type":"silent","citation":"Not addressed directly"},"G7-HIROSHIMA:criminal_justice":{"type":"silent","citation":"Not addressed directly"},"G7-HIROSHIMA:education":{"type":"silent","citation":"Not addressed directly"},"G7-HIROSHIMA:compute_reporting":{"type":"silent","citation":"Voluntary self-reporting only"},"G7-HIROSHIMA:transparency":{"type":"governs","citation":"Code §2 (publicly report capabilities, limitations)","confidence":"medium"},"G7-HIROSHIMA:redress":{"type":"silent","citation":"Not addressed"},"G7-HIROSHIMA:training_data":{"type":"silent","citation":"Not addressed"},"G7-HIROSHIMA:sovereign_ai":{"type":"silent","citation":"Not addressed"},"OECD-AI-PRIN:foundation_models":{"type":"implicit","citation":"2024 update clarifies GPAI scope","confidence":"low"},"OECD-AI-PRIN:biometric_id":{"type":"silent","citation":"Not addressed at principle level","confidence":"low"},"OECD-AI-PRIN:deepfakes":{"type":"silent","citation":"Not addressed at principle level"},"OECD-AI-PRIN:employment":{"type":"silent","citation":"Not addressed sectorally"},"OECD-AI-PRIN:healthcare":{"type":"silent","citation":"Not addressed sectorally"},"OECD-AI-PRIN:criminal_justice":{"type":"silent","citation":"Not addressed sectorally"},"OECD-AI-PRIN:education":{"type":"silent","citation":"Not addressed sectorally"},"OECD-AI-PRIN:compute_reporting":{"type":"silent","citation":"Not addressed"},"OECD-AI-PRIN:transparency":{"type":"governs","citation":"Principle 1.3 (transparency + explainability)"},"OECD-AI-PRIN:redress":{"type":"governs","citation":"Principle 1.5 
(accountability)","confidence":"low"},"OECD-AI-PRIN:training_data":{"type":"silent","citation":"Not addressed at principle level"},"OECD-AI-PRIN:sovereign_ai":{"type":"silent","citation":"Not addressed"},"COE-AI-CONV:foundation_models":{"type":"implicit","citation":"Applies to AI throughout lifecycle (Art. 3)"},"COE-AI-CONV:biometric_id":{"type":"implicit","citation":"Arts. 9-10 (privacy + non-discrimination)"},"COE-AI-CONV:deepfakes":{"type":"silent","citation":"Not addressed specifically"},"COE-AI-CONV:employment":{"type":"implicit","citation":"Non-discrimination + dignity provisions"},"COE-AI-CONV:healthcare":{"type":"silent","citation":"Sectoral; CoE Bioethics Convention separate"},"COE-AI-CONV:criminal_justice":{"type":"governs","citation":"Art. 14 (procedural safeguards)"},"COE-AI-CONV:education":{"type":"silent","citation":"Not addressed specifically"},"COE-AI-CONV:compute_reporting":{"type":"silent","citation":"Not addressed"},"COE-AI-CONV:transparency":{"type":"governs","citation":"Art. 8 (transparency + oversight)"},"COE-AI-CONV:redress":{"type":"governs","citation":"Arts. 14-15 (procedural safeguards + remedies)"},"COE-AI-CONV:training_data":{"type":"implicit","citation":"Art. 
11 (privacy + data protection)"},"COE-AI-CONV:sovereign_ai":{"type":"silent","citation":"Not addressed"},"UN-RES-2024:foundation_models":{"type":"silent","citation":"Resolution is principle-level, not specific"},"UN-RES-2024:biometric_id":{"type":"silent","citation":"Not addressed"},"UN-RES-2024:deepfakes":{"type":"implicit","citation":"References disinformation broadly"},"UN-RES-2024:employment":{"type":"silent","citation":"Not addressed"},"UN-RES-2024:healthcare":{"type":"silent","citation":"Not addressed"},"UN-RES-2024:criminal_justice":{"type":"silent","citation":"Not addressed"},"UN-RES-2024:education":{"type":"implicit","citation":"Calls for bridging digital divides"},"UN-RES-2024:compute_reporting":{"type":"silent","citation":"Not addressed"},"UN-RES-2024:transparency":{"type":"implicit","citation":"Calls for trustworthy AI broadly"},"UN-RES-2024:redress":{"type":"silent","citation":"Not addressed"},"UN-RES-2024:training_data":{"type":"silent","citation":"Not addressed"},"UN-RES-2024:sovereign_ai":{"type":"silent","citation":"Not addressed"},"NIST-AI-RMF:foundation_models":{"type":"governs","citation":"GenAI Profile (NIST AI 600-1, 2024)"},"NIST-AI-RMF:biometric_id":{"type":"silent","citation":"Not in framework scope"},"NIST-AI-RMF:deepfakes":{"type":"implicit","citation":"GenAI Profile addresses synthetic content"},"NIST-AI-RMF:employment":{"type":"silent","citation":"Cross-cutting; not sectoral"},"NIST-AI-RMF:healthcare":{"type":"silent","citation":"Cross-cutting; not sectoral"},"NIST-AI-RMF:criminal_justice":{"type":"silent","citation":"Cross-cutting; not sectoral"},"NIST-AI-RMF:education":{"type":"silent","citation":"Cross-cutting; not sectoral"},"NIST-AI-RMF:compute_reporting":{"type":"silent","citation":"Framework is voluntary; compute reporting was imposed by EO 14110, not the RMF"},"NIST-AI-RMF:transparency":{"type":"governs","citation":"Trustworthy characteristics 5 (transparency) + 6 (explainability)"},"NIST-AI-RMF:redress":{"type":"implicit","citation":"Accountability 
characteristic"},"NIST-AI-RMF:training_data":{"type":"implicit","citation":"Manage 4: data integrity"},"NIST-AI-RMF:sovereign_ai":{"type":"silent","citation":"Not addressed"},"EU-AIA-2024:catastrophic_risk":{"type":"implicit","citation":"Art. 51 + Recital 32 — systemic risk overlaps with but does not fully cover catastrophic-risk framing"},"EU-AIA-2024:tech_sovereignty":{"type":"implicit","citation":"Recitals 1-5 + EU competence framing; AI Office establishes EU capacity"},"EU-AIA-2024:development_rights_framing":{"type":"silent","citation":"EU framework is rights-based but rooted in EU-charter rights, not development-rights doctrine"},"US-EO-14110:catastrophic_risk":{"type":"governs","citation":"§4.2(a)(ii) — CBRN + autonomous replication explicitly named"},"US-EO-14110:tech_sovereignty":{"type":"governs","citation":"§4.8 + CHIPS Act overlap (BIS export controls, domestic compute)"},"US-EO-14110:development_rights_framing":{"type":"silent","citation":"Not in US AI-governance vocabulary"},"UK-WHITEPAPER-2023:catastrophic_risk":{"type":"implicit","citation":"AISI remit covers frontier-model evaluation; not in white paper text"},"UK-WHITEPAPER-2023:tech_sovereignty":{"type":"implicit","citation":"Sovereign-capability framing in UK AI Action Plan (2025) — not in 2023 white paper"},"UK-WHITEPAPER-2023:development_rights_framing":{"type":"silent","citation":"Not in UK AI-governance vocabulary"},"CN-GENAI-2023:catastrophic_risk":{"type":"silent","citation":"PRC framing uses 'safety + security' broadly, not catastrophic risk"},"CN-GENAI-2023:tech_sovereignty":{"type":"governs","citation":"Art. 
4 + national-strategy alignment; domestic-AI doctrine explicit"},"CN-GENAI-2023:development_rights_framing":{"type":"implicit","citation":"PRC has invoked development rights in UN AI debates (2024 GA)"},"G7-HIROSHIMA:catastrophic_risk":{"type":"governs","citation":"Code §1 + §3 — explicit risk-identification including CBRN"},"G7-HIROSHIMA:tech_sovereignty":{"type":"implicit","citation":"Adoption-by-developer framing; G7 carries implicit sovereignty assumptions"},"G7-HIROSHIMA:development_rights_framing":{"type":"silent","citation":"Not addressed in G7 framing"},"OECD-AI-PRIN:catastrophic_risk":{"type":"silent","citation":"2019 vintage — predates the 2023+ catastrophic-risk policy turn"},"OECD-AI-PRIN:tech_sovereignty":{"type":"silent","citation":"Not addressed; OECD framing is principles-not-sovereignty"},"OECD-AI-PRIN:development_rights_framing":{"type":"implicit","citation":"Principle 1.1 'inclusive growth' brushes against development-rights framing"},"COE-AI-CONV:catastrophic_risk":{"type":"silent","citation":"Treaty focuses on individual rights, not catastrophic-system risks"},"COE-AI-CONV:tech_sovereignty":{"type":"silent","citation":"Not addressed"},"COE-AI-CONV:development_rights_framing":{"type":"implicit","citation":"Rights-based framing partly overlaps with development-rights doctrine, but the doctrine is not explicitly invoked"},"UN-RES-2024:catastrophic_risk":{"type":"implicit","citation":"Notes 'shared concerns' but no operative catastrophic-risk text"},"UN-RES-2024:tech_sovereignty":{"type":"implicit","citation":"Calls for bridging digital divides — adjacent to, but not the same as, tech sovereignty"},"UN-RES-2024:development_rights_framing":{"type":"governs","citation":"Operative paragraphs frame AI through development-rights + digital divide lens; co-sponsored by Global-South coalition"},"NIST-AI-RMF:catastrophic_risk":{"type":"implicit","citation":"Map 1.1 risk classification covers catastrophic via 'societal' impact tier; GenAI Profile (2024) adds explicit 
content"},"NIST-AI-RMF:tech_sovereignty":{"type":"silent","citation":"Methodology-not-sovereignty framing"},"NIST-AI-RMF:development_rights_framing":{"type":"silent","citation":"Not in NIST vocabulary"},"BLETCHLEY-2023:foundation_models":{"type":"governs","citation":"Declaration §1-2 (frontier AI defined as the subject)"},"BLETCHLEY-2023:catastrophic_risk":{"type":"governs","citation":"Declaration §3-5 (substantial risks from frontier AI, including catastrophic harm)"},"BLETCHLEY-2023:compute_reporting":{"type":"implicit","citation":"Declaration §6 calls for capability evaluation but does not specify compute thresholds"},"BLETCHLEY-2023:transparency":{"type":"implicit","citation":"Declaration §6 endorses transparency to evaluators; no operative requirements"},"BLETCHLEY-2023:international_coordination":{"type":"governs","citation":"Declaration §8-10 (international coordination is the operative ask)"},"SEOUL-2024:foundation_models":{"type":"governs","citation":"Declaration + accompanying Frontier AI Safety Commitments (16 signatory companies)"},"SEOUL-2024:catastrophic_risk":{"type":"governs","citation":"Frontier AI Safety Commitments §1: identify thresholds for severe risks pre-deployment"},"SEOUL-2024:compute_reporting":{"type":"implicit","citation":"Safety Commitments invoke capability thresholds; compute is one proxy"},"SEOUL-2024:transparency":{"type":"governs","citation":"Declaration §4 + Commitments §3 (publish safety frameworks)"},"SEOUL-2024:international_coordination":{"type":"governs","citation":"Declaration §5-7 (AISI network, follow-up summits)"},"NIST-AI-RMF-GENAI:foundation_models":{"type":"governs","citation":"Entire NIST AI 600-1 scope is GPAI / GenAI"},"NIST-AI-RMF-GENAI:catastrophic_risk":{"type":"governs","citation":"NIST AI 600-1 §3.1 CBRN Information Uplift; §3.3 Dangerous, Violent, or Hateful Content"},"NIST-AI-RMF-GENAI:deepfakes":{"type":"governs","citation":"NIST AI 600-1 §3.11 Confabulation + §3.10 Information Integrity (synthetic 
content)"},"NIST-AI-RMF-GENAI:training_data":{"type":"governs","citation":"NIST AI 600-1 §3.4 Data Privacy + §3.7 Intellectual Property"},"NIST-AI-RMF-GENAI:transparency":{"type":"governs","citation":"Govern + Map cross-cutting documentation requirements applied to GenAI"},"NIST-AI-RMF-GENAI:redress":{"type":"implicit","citation":"Accountability characteristic from base RMF; not GenAI-specific text"},"CA-SB-1047:foundation_models":{"type":"governs","citation":"Cal. SB-1047 §22603 — 'covered model' = above 10^26 FLOPs OR $100M training cost"},"CA-SB-1047:catastrophic_risk":{"type":"governs","citation":"Cal. SB-1047 §22602 — defines 'critical harm' including mass casualties, $500M+ damage"},"CA-SB-1047:compute_reporting":{"type":"governs","citation":"Cal. SB-1047 §22603(b) — annual reporting of training compute + safety determination"},"CA-SB-1047:transparency":{"type":"implicit","citation":"Required safety determinations are public; full safety case is to regulator only"},"CA-SB-1047:redress":{"type":"implicit","citation":"Whistleblower protections (§22607) + AG enforcement (§22608); no individual redress"},"IN-DPDP-2023:training_data":{"type":"governs","citation":"DPDPA §§4-7 (consent + purpose limitation for AI training data)"},"IN-DPDP-2023:transparency":{"type":"implicit","citation":"DPDPA §5 notice requirements + MEITY Mar-2024 Advisory transparency mandates"},"IN-DPDP-2023:redress":{"type":"governs","citation":"DPDPA §§13-15 (data principal rights, grievance + Data Protection Board)"},"IN-DPDP-2023:foundation_models":{"type":"implicit","citation":"MEITY Apr-2024 advisory walked back the Mar-2024 pre-deployment-approval requirement; current approach is post-deployment incident reporting"},"IN-DPDP-2023:development_rights_framing":{"type":"governs","citation":"Digital India framing centres development rights + tech-sovereignty; explicit in DPDPA preamble + MEITY's AI Mission documents"},"IN-DPDP-2023:deepfakes":{"type":"governs","citation":"MEITY Mar-2024 
Advisory + IT Rules 2021 §3(1)(b)(v) deepfake takedown obligations"},"BR-AIBILL-2024:foundation_models":{"type":"governs","citation":"PL 2338/2023 Arts. 17-19 (general-purpose AI systemic-risk obligations)","confidence":"low"},"BR-AIBILL-2024:catastrophic_risk":{"type":"governs","citation":"PL 2338/2023 Art. 14 (excessive-risk AI applications — explicit prohibition + risk-tier framework)"},"BR-AIBILL-2024:transparency":{"type":"governs","citation":"PL 2338/2023 Art. 7 (right to information about AI use + algorithmic explanation)"},"BR-AIBILL-2024:redress":{"type":"governs","citation":"PL 2338/2023 Art. 9 (right to contest AI decisions, ANPD as regulator)"},"BR-AIBILL-2024:training_data":{"type":"implicit","citation":"PL 2338/2023 cross-references LGPD (2018) for data-rights baseline"},"BR-AIBILL-2024:development_rights_framing":{"type":"governs","citation":"PL 2338/2023 Arts. 3-4 (founding principles include 'sustainable development' + 'human dignity' — distinct from EU AIA's rights-only framing)"},"ASEAN-AI-GUIDE-2024:transparency":{"type":"governs","citation":"ASEAN Guide §4 (transparency + explainability principle)"},"ASEAN-AI-GUIDE-2024:foundation_models":{"type":"implicit","citation":"Guide §6 covers GenAI but with flexible implementation expectations"},"ASEAN-AI-GUIDE-2024:international_coordination":{"type":"governs","citation":"Guide explicitly designed to harmonise across ASEAN-10 member states + interoperate with OECD AI Principles + G7 Hiroshima"},"ASEAN-AI-GUIDE-2024:tech_sovereignty":{"type":"implicit","citation":"Guide framing emphasises ASEAN-bloc capacity-building over external dependency"},"ASEAN-AI-GUIDE-2024:development_rights_framing":{"type":"implicit","citation":"Guide centres 'pragmatic + flexible' implementation reflecting member-state development trajectories"},"AU-AI-STRATEGY-2024:development_rights_framing":{"type":"governs","citation":"AU Strategy §§1-3 (AI as continental development priority + data-coloniality 
framing)"},"AU-AI-STRATEGY-2024:tech_sovereignty":{"type":"governs","citation":"AU Strategy §4 (continental compute + data infrastructure + skill-formation)"},"AU-AI-STRATEGY-2024:training_data":{"type":"implicit","citation":"AU Strategy §5 + Malabo Convention (2014) data-protection baseline"},"AU-AI-STRATEGY-2024:foundation_models":{"type":"silent","citation":"Strategy is policy-level; specific foundation-model obligations deferred to national strategies"},"AU-AI-STRATEGY-2024:international_coordination":{"type":"governs","citation":"AU Strategy §6 (coordination with UN GA AI resolutions + AU-EU AI Working Group)"},"ANTHROPIC-RSP-2024:foundation_models":{"type":"governs","citation":"RSP v2 §2 — ASL framework applies to frontier model releases"},"ANTHROPIC-RSP-2024:catastrophic_risk":{"type":"governs","citation":"RSP v2 §3 — ASL-3 / ASL-4 capability thresholds explicitly target CBRN uplift + autonomous-replication"},"ANTHROPIC-RSP-2024:compute_reporting":{"type":"implicit","citation":"RSP v2 capability evaluations triggered by capability rather than pure compute; compute is one signal"},"ANTHROPIC-RSP-2024:transparency":{"type":"governs","citation":"RSP v2 §5 — public publication of safety determinations + capability eval methodology"},"ANTHROPIC-RSP-2024:international_coordination":{"type":"implicit","citation":"Seoul Frontier AI Safety Commitments signatory; coordinates with US + UK AISIs on capability evaluation"},"OPENAI-PREPAREDNESS-2023:foundation_models":{"type":"governs","citation":"Preparedness Framework §1-2 — applies to all OpenAI frontier-model releases"},"OPENAI-PREPAREDNESS-2023:catastrophic_risk":{"type":"governs","citation":"Preparedness Framework risk-tier matrix — Critical tier explicitly targets CBRN, cyber, persuasion, autonomy"},"OPENAI-PREPAREDNESS-2023:transparency":{"type":"implicit","citation":"Public Preparedness Reports + Safety Advisory Group decisions; full evaluation methodology partially 
disclosed"},"OPENAI-PREPAREDNESS-2023:compute_reporting":{"type":"implicit","citation":"Capability-tier evaluations are the primary trigger; compute is a coincident signal"},"OPENAI-PREPAREDNESS-2023:international_coordination":{"type":"implicit","citation":"Seoul Frontier AI Safety Commitments signatory; pre-deployment evaluation sharing with US + UK AISIs"},"DEEPMIND-FSF-2024:foundation_models":{"type":"governs","citation":"FSF applies to Google DeepMind frontier-model releases"},"DEEPMIND-FSF-2024:catastrophic_risk":{"type":"governs","citation":"FSF Critical Capability Levels (CCL) — explicit thresholds for autonomy, biosecurity, cyber, persuasion"},"DEEPMIND-FSF-2024:transparency":{"type":"implicit","citation":"FSF publication discloses framework + thresholds; per-evaluation outputs not consistently public"},"DEEPMIND-FSF-2024:international_coordination":{"type":"implicit","citation":"Seoul Frontier AI Safety Commitments signatory; UK AISI pre-deployment evaluation cooperation"},"META-FRONTIER-2024:foundation_models":{"type":"governs","citation":"Framework applies to Meta frontier-model releases (Llama family)"},"META-FRONTIER-2024:catastrophic_risk":{"type":"governs","citation":"Framework critical-risk tier — commit to halt training pre-mitigation if reached"},"META-FRONTIER-2024:transparency":{"type":"governs","citation":"Open-weight release + framework publication is itself a transparency posture; trade-off discussed in framework text"},"META-FRONTIER-2024:training_data":{"type":"implicit","citation":"Open-weight framing engages training-data + IP issues; not the framework's primary lane"},"META-FRONTIER-2024:international_coordination":{"type":"implicit","citation":"Seoul Frontier AI Safety Commitments signatory"},"UK-US-AISI-MOU-2024:foundation_models":{"type":"governs","citation":"MoU scope is frontier AI evaluation"},"UK-US-AISI-MOU-2024:catastrophic_risk":{"type":"implicit","citation":"Joint evaluation scope encompasses CBRN + autonomy uplift questions; 
MoU text does not enumerate explicit thresholds"},"UK-US-AISI-MOU-2024:transparency":{"type":"implicit","citation":"Information sharing between AISIs; not public-facing transparency obligations"},"UK-US-AISI-MOU-2024:international_coordination":{"type":"governs","citation":"MoU is the operative bilateral; precedent for the broader AISI network"},"WH-VOLUNTARY-2023:foundation_models":{"type":"governs","citation":"Commitments §1-2 — internal + external security testing of frontier models"},"WH-VOLUNTARY-2023:catastrophic_risk":{"type":"implicit","citation":"Commitments §1 references CBRN + bio risks via 'most significant societal risks'; not threshold-explicit"},"WH-VOLUNTARY-2023:deepfakes":{"type":"governs","citation":"Commitments §5 (watermarking + content provenance for AI-generated content)"},"WH-VOLUNTARY-2023:transparency":{"type":"governs","citation":"Commitments §6 (public reporting on capabilities, limitations, appropriate use)"},"WH-VOLUNTARY-2023:compute_reporting":{"type":"implicit","citation":"Self-reporting through commitments framework; binding compute thresholds came via EO 14110 §4.2(a)"},"WH-VOLUNTARY-2023:international_coordination":{"type":"implicit","citation":"Precursor to Seoul Frontier AI Safety Commitments; same signatory base largely overlaps"},"SG-MODEL-AI-2024:foundation_models":{"type":"governs","citation":"Framework Dimension 3 (Trusted Development + Deployment) explicitly covers GenAI models"},"SG-MODEL-AI-2024:transparency":{"type":"governs","citation":"Framework Dimension 7 (Content Provenance) + Dimension 5 (Testing + Assurance) — pairs with AI Verify toolkit"},"SG-MODEL-AI-2024:deepfakes":{"type":"governs","citation":"Framework Dimension 7 — content provenance + synthetic-content disclosure"},"SG-MODEL-AI-2024:redress":{"type":"implicit","citation":"Framework Dimension 1 (Accountability) + Dimension 4 (Incident Reporting); pairs with PDPA grievance 
regime"},"SG-MODEL-AI-2024:international_coordination":{"type":"governs","citation":"Framework explicitly aligns with G7 Hiroshima Code + OECD AI Principles; ASEAN Guide pairs"},"SG-MODEL-AI-2024:tech_sovereignty":{"type":"implicit","citation":"AI Verify Foundation positions Singapore as an interoperable AI-assurance hub"},"JP-METI-AI-2024:foundation_models":{"type":"governs","citation":"Guidelines Part 3 — covers AI providers including foundation-model developers"},"JP-METI-AI-2024:transparency":{"type":"governs","citation":"Guidelines Principle 5 (Transparency) — model documentation + capability disclosure"},"JP-METI-AI-2024:international_coordination":{"type":"governs","citation":"Guidelines explicit alignment with G7 Hiroshima AI Process Code of Conduct + OECD AI Principles"},"JP-METI-AI-2024:redress":{"type":"implicit","citation":"Principle 6 (Accountability) + Principle 8 (Fair Competition) — sectoral redress channels assumed"},"JP-METI-AI-2024:training_data":{"type":"implicit","citation":"Principle 4 (Safety) + Principle 2 (Education-Literacy) brush against training-data norms; ACA copyright regime separately addresses"},"EU-AIA-2024:agentic_systems_governance":{"type":"implicit","citation":"Arts. 26-29 deployer obligations apply to agent operators; Arts. 
51-55 GPAI obligations capture the underlying model"},"US-EO-14110:agentic_systems_governance":{"type":"silent","citation":"§4.2(a) reporting captures the model layer, not autonomous-action behaviour"},"US-EO-14179:foundation_models":{"type":"silent","citation":"Deregulatory; rescinds EO 14110 §4.2(a) reporting framework without imposing replacement foundation-model rules","confidence":"high"},"US-EO-14179:agentic_systems_governance":{"type":"silent","citation":"Deregulatory; removes barriers without imposing agent-specific obligations"},"UK-WHITEPAPER-2023:agentic_systems_governance":{"type":"silent","citation":"Principle-based, regulator-led; no agent-specific cross-cutting rule"},"CN-GENAI-2023:agentic_systems_governance":{"type":"implicit","citation":"Arts. 4, 8 (service-provision scope) — agent-like generative services fall within registration + safety-assessment obligations"},"G7-HIROSHIMA:agentic_systems_governance":{"type":"implicit","citation":"Code §1 'advanced AI systems' + §3 risk-identification cover agentic behaviour through capability frame"},"OECD-AI-PRIN:agentic_systems_governance":{"type":"silent","citation":"Pre-dates agent-specific governance debate"},"COE-AI-CONV:agentic_systems_governance":{"type":"implicit","citation":"General-AI scope (Art. 
3) covers agent systems; no agent-specific provision"},"UN-RES-2024:agentic_systems_governance":{"type":"silent","citation":"High-level resolution; no agent-specific language"},"NIST-AI-RMF:agentic_systems_governance":{"type":"implicit","citation":"Map / Manage functions apply to autonomous systems; no agent-specific profile yet"},"BLETCHLEY-2023:agentic_systems_governance":{"type":"implicit","citation":"Frontier-AI risk frame includes autonomous-action risks; no specific obligation"},"SEOUL-2024:agentic_systems_governance":{"type":"governs","citation":"Frontier AI Safety Commitments §3 — pre-deployment capability evaluations include agentic behaviours under 'realistic deployment conditions'"},"NIST-AI-RMF-GENAI:agentic_systems_governance":{"type":"governs","citation":"NIST AI 600-1 names Value Chain + Component Integration as risk category covering agentic / tool-use deployments"},"CA-SB-1047:agentic_systems_governance":{"type":"silent","citation":"Vetoed; would have applied to frontier models generally, not agents specifically"},"IN-DPDP-2023:agentic_systems_governance":{"type":"silent","citation":"Data-protection focus; no agent-specific provision"},"BR-AIBILL-2024:agentic_systems_governance":{"type":"implicit","citation":"Risk-based framework (PL 2338 Arts. 
13-15) covers agent systems under high-risk tiers if applicable"},"ASEAN-AI-GUIDE-2024:agentic_systems_governance":{"type":"silent","citation":"Non-binding ethics guide; predates agent-specific debate"},"AU-AI-STRATEGY-2024:agentic_systems_governance":{"type":"silent","citation":"Strategy-level, no operational agent rules"},"ANTHROPIC-RSP-2024:agentic_systems_governance":{"type":"governs","citation":"RSP v2 — ASL thresholds include 'autonomous AI replication' + agentic capability evaluations","confidence":"high"},"OPENAI-PREPAREDNESS-2023:agentic_systems_governance":{"type":"governs","citation":"Preparedness Framework — Model Autonomy is one of four named risk categories","confidence":"high"},"DEEPMIND-FSF-2024:agentic_systems_governance":{"type":"governs","citation":"FSF Critical Capability Levels — Autonomy is one of four named CCL domains","confidence":"high"},"META-FRONTIER-2024:agentic_systems_governance":{"type":"implicit","citation":"Capability tiers cover agentic behaviour; not named as a distinct category"},"UK-US-AISI-MOU-2024:agentic_systems_governance":{"type":"implicit","citation":"Joint AISI capability evaluations include agentic-behaviour testing"},"WH-VOLUNTARY-2023:agentic_systems_governance":{"type":"silent","citation":"Predates agent-specific debate; covers eight cross-cutting commitments without agent specifics"},"SG-MODEL-AI-2024:agentic_systems_governance":{"type":"silent","citation":"GenAI-framework focus; predates agentic vocabulary"},"JP-METI-AI-2024:agentic_systems_governance":{"type":"silent","citation":"Guidelines pre-date agentic-specific debate"},"EU-AIA-2024:open_weight_release":{"type":"governs","citation":"Art. 
53(2) + Recital 102/104 — explicit open-source GPAI exemption (with caveats for systemic-risk models)","confidence":"high"},"US-EO-14110:open_weight_release":{"type":"implicit","citation":"§4.6 NTIA report on dual-use foundation models specifically addresses open-weight risk; not binding obligation"},"US-EO-14179:open_weight_release":{"type":"silent","citation":"Deregulatory; does not address release modality"},"UK-WHITEPAPER-2023:open_weight_release":{"type":"silent","citation":"Principle-based; no release-modality rule"},"CN-GENAI-2023:open_weight_release":{"type":"implicit","citation":"Art. 8 — registration / safety assessment applies regardless of weight release modality"},"G7-HIROSHIMA:open_weight_release":{"type":"silent","citation":"Code does not differentiate by release modality"},"OECD-AI-PRIN:open_weight_release":{"type":"silent","citation":"Principles agnostic to release modality"},"COE-AI-CONV:open_weight_release":{"type":"silent","citation":"Framework-level; no release-modality provision"},"UN-RES-2024:open_weight_release":{"type":"silent","citation":"High-level resolution; no release-modality provision"},"NIST-AI-RMF:open_weight_release":{"type":"silent","citation":"Voluntary framework; agnostic to release modality"},"BLETCHLEY-2023:open_weight_release":{"type":"silent","citation":"Declaration text does not address release modality"},"SEOUL-2024:open_weight_release":{"type":"implicit","citation":"Frontier AI Safety Commitments apply to all 16 signatories regardless of open/closed weight stance (Meta is signatory)"},"NIST-AI-RMF-GENAI:open_weight_release":{"type":"silent","citation":"Profile is risk-domain-organised, not release-modality-organised"},"CA-SB-1047:open_weight_release":{"type":"governs","citation":"Vetoed bill — would have imposed pre-deployment testing on covered models including open-weight releases (Anthropic + Meta both objected on different 
grounds)","confidence":"high"},"IN-DPDP-2023:open_weight_release":{"type":"silent","citation":"Data-protection focus"},"BR-AIBILL-2024:open_weight_release":{"type":"silent","citation":"PL 2338 does not differentiate by release modality"},"ASEAN-AI-GUIDE-2024:open_weight_release":{"type":"silent","citation":"Non-binding ethics guide; no release-modality position"},"AU-AI-STRATEGY-2024:open_weight_release":{"type":"implicit","citation":"Continental strategy frames AI capacity-building — open access to weights aligns with capacity goals"},"ANTHROPIC-RSP-2024:open_weight_release":{"type":"implicit","citation":"RSP applies to Anthropic's models which are closed-weight; framework does not address third-party open release"},"OPENAI-PREPAREDNESS-2023:open_weight_release":{"type":"implicit","citation":"Framework applies to OpenAI deployments (closed-weight); does not address third-party open release"},"DEEPMIND-FSF-2024:open_weight_release":{"type":"implicit","citation":"Framework applies to Google DeepMind deployments (mostly closed); third-party open release not addressed"},"META-FRONTIER-2024:open_weight_release":{"type":"governs","citation":"Framework's distinctive feature — explicit defence of open-weight release as governance posture; halt-training commitment if 'critical risk' threshold reached without mitigations","confidence":"high"},"UK-US-AISI-MOU-2024:open_weight_release":{"type":"silent","citation":"MoU is on joint evaluations methodology; release-modality not addressed"},"WH-VOLUNTARY-2023:open_weight_release":{"type":"silent","citation":"Voluntary commitments predate the open/closed weight governance debate"},"SG-MODEL-AI-2024:open_weight_release":{"type":"silent","citation":"Framework does not differentiate by release modality"},"JP-METI-AI-2024:open_weight_release":{"type":"silent","citation":"Guidelines do not address release modality"},"EU-AIA-2024:synthetic_content_provenance":{"type":"governs","citation":"Art. 
50(2) — provider machine-readable marking obligation; Art. 50(4) — deployer disclosure for deep fakes (distinct from the `deepfakes` topic which focuses on misuse-harms)","confidence":"high"},"US-EO-14110:synthetic_content_provenance":{"type":"governs","citation":"§4.5(a) — content authentication + watermarking standards via NIST + Commerce","confidence":"high"},"US-EO-14179:synthetic_content_provenance":{"type":"silent","citation":"Rescinds EO 14110's regulatory burden but §4.5 watermarking work continues at NIST; provenance not specifically governed"},"UK-WHITEPAPER-2023:synthetic_content_provenance":{"type":"silent","citation":"Principle-based; provenance not a cross-cutting principle"},"CN-GENAI-2023:synthetic_content_provenance":{"type":"governs","citation":"Art. 12 — mandatory marking of generative-AI output; aligns with Deep Synthesis Rules (2022) tagging requirements","confidence":"medium"},"G7-HIROSHIMA:synthetic_content_provenance":{"type":"governs","citation":"Code §6 — 'develop and deploy reliable content authentication and provenance mechanisms'","confidence":"high"},"OECD-AI-PRIN:synthetic_content_provenance":{"type":"silent","citation":"Principles pre-date the provenance debate"},"COE-AI-CONV:synthetic_content_provenance":{"type":"silent","citation":"Framework-level; provenance not addressed"},"UN-RES-2024:synthetic_content_provenance":{"type":"implicit","citation":"General call for state action on safe AI; provenance not specifically addressed"},"NIST-AI-RMF:synthetic_content_provenance":{"type":"implicit","citation":"General framework applies; provenance-specific guidance lives in the GenAI Profile"},"BLETCHLEY-2023:synthetic_content_provenance":{"type":"silent","citation":"Declaration focuses on frontier safety; provenance not addressed"},"SEOUL-2024:synthetic_content_provenance":{"type":"silent","citation":"Focus on capability evaluations; provenance not 
addressed"},"NIST-AI-RMF-GENAI:synthetic_content_provenance":{"type":"governs","citation":"NIST AI 600-1 — Information Integrity is one of 12 named GenAI risk categories; covers synthetic-content labelling + provenance","confidence":"high"},"CA-SB-1047:synthetic_content_provenance":{"type":"silent","citation":"Vetoed bill focused on safety incident reporting; provenance not addressed"},"IN-DPDP-2023:synthetic_content_provenance":{"type":"silent","citation":"Data-protection focus; MEITY advisories addressed deepfakes separately"},"BR-AIBILL-2024:synthetic_content_provenance":{"type":"implicit","citation":"PL 2338 general accuracy + transparency obligations would extend to provenance via interpretation"},"ASEAN-AI-GUIDE-2024:synthetic_content_provenance":{"type":"silent","citation":"Non-binding ethics guide; provenance not addressed"},"AU-AI-STRATEGY-2024:synthetic_content_provenance":{"type":"silent","citation":"Continental strategy; no provenance-specific provision"},"ANTHROPIC-RSP-2024:synthetic_content_provenance":{"type":"implicit","citation":"Deployment-stage controls would include content provenance where capability tier requires"},"OPENAI-PREPAREDNESS-2023:synthetic_content_provenance":{"type":"silent","citation":"Pre-deployment risk evaluation focus; provenance not a named risk category"},"DEEPMIND-FSF-2024:synthetic_content_provenance":{"type":"silent","citation":"FSF focuses on capability levels; provenance not in CCL domains"},"META-FRONTIER-2024:synthetic_content_provenance":{"type":"silent","citation":"Framework focuses on capability tiers; provenance not addressed"},"UK-US-AISI-MOU-2024:synthetic_content_provenance":{"type":"silent","citation":"MoU focuses on capability evaluations; provenance not in scope"},"WH-VOLUNTARY-2023:synthetic_content_provenance":{"type":"governs","citation":"Voluntary commitment #5 — 'develop and deploy mechanisms that enable users to understand if audio or visual content is AI-generated, including robust provenance, 
watermarking, or both'","confidence":"high"},"SG-MODEL-AI-2024:synthetic_content_provenance":{"type":"governs","citation":"Framework dimension 7 — Content Provenance (one of nine framework dimensions, paired with AI Verify Foundation's technical-testing toolkit)","confidence":"high"},"JP-METI-AI-2024:synthetic_content_provenance":{"type":"implicit","citation":"Principle 5 (Transparency) + Hiroshima-alignment imply provenance obligations via reference incorporation"}},"counts":{"instruments":26,"topics":19,"benchmarks":10,"concepts":30,"coverageCells":297}}