{
  "schema_version": "core-reviewed-term-ai-handoff-v2",
  "version": "core-reviewed-term-ai-handoff-v2",
  "compat": {
    "backward_compatible_schema_versions": [
      "core-reviewed-term-ai-handoff-v1"
    ],
    "v1_fields_preserved": true
  },
  "contract_status": "preview",
  "distribution": "per_term_read_only_ai_handoff",
  "no_write_operations": true,
  "content_tier": "db_backed",
  "canonical": {
    "url": "https://core.yogoq.com/en-US/core/ai-evaluation",
    "slug": "ai-evaluation",
    "locale": "en-US"
  },
  "summary": {
    "schema_version": "core-reviewed-term-summary-v1",
    "term_id": "17e45979-823d-54d6-b3a9-317d422b8574",
    "canonical_slug": "ai-evaluation",
    "canonical_url": "https://core.yogoq.com/en-US/core/ai-evaluation",
    "locale": "en-US",
    "display_name": "AI Evaluation",
    "english_name": null,
    "abbreviation": "Evals",
    "short_definition": "AI evaluation tests whether AI outputs or actions meet quality, safety, cost, and business criteria. It is a prerequisite for production generative AI and agent workflows.",
    "content_tier": "db_backed",
    "quality": "reviewed",
    "publication_status": "published_reviewed",
    "version": "core-reviewed-term-summary-v1",
    "last_reviewed_at": "2026-07-04T15:30:00.000Z",
    "sources_count": 2,
    "limitations_key": "core-trust-policy-v1-2026-06-22"
  },
  "aliases": [
    {
      "text": "AI Evaluation",
      "type": "display_name",
      "locale": "en-US"
    },
    {
      "text": "Evals",
      "type": "abbreviation",
      "locale": null
    },
    {
      "text": "AI評価セット",
      "type": "katakana",
      "locale": "ja-JP"
    },
    {
      "text": "AI Evaluation",
      "type": "english_name",
      "locale": "en-US"
    },
    {
      "text": "AI評価",
      "type": "localized_title",
      "locale": "ja-JP"
    }
  ],
  "content": {
    "definition": {
      "key": "definition",
      "title": "一言でいうと",
      "text": "AI evaluation tests whether AI outputs or actions meet quality, safety, cost, and business criteria. It is a prerequisite for production generative AI and agent workflows.",
      "items": []
    },
    "formula": {
      "key": "formula",
      "title": "計算の考え方",
      "text": "AI evaluation uses scores for each task type and failure rates. Pass rate | Passing cases / eval cases | Shows minimum launch quality Critical failure rate | High-impact errors / eval cases | Captures incident risk Regression rate | Previously passing cases now failing / prior passing cases | Shows change impact",
      "items": [
        "Pass rate | Passing cases / eval cases | Shows minimum launch quality",
        "Critical failure rate | High-impact errors / eval cases | Captures incident risk",
        "Regression rate | Previously passing cases now failing / prior passing cases | Shows change impact"
      ]
    },
    "boundary": {
      "key": "boundary",
      "title": "含めるもの / 含めないもの",
      "text": "AI evaluation covers model, prompt, data, tools, UI, and operating rules. Include | Accuracy, grounding, format, safety, bias, tool actions, review load | Practical quality Exclude | One-off impressions, demo feel, model-name comparisons | Not reproducible Make explicit | Cases, rubric, pass line, reviewer, update cadence | Required for improvement",
      "items": [
        "Include | Accuracy, grounding, format, safety, bias, tool actions, review load | Practical quality",
        "Exclude | One-off impressions, demo feel, model-name comparisons | Not reproducible",
        "Make explicit | Cases, rubric, pass line, reviewer, update cadence | Required for improvement"
      ]
    },
    "usage": [
      {
        "key": "meaning",
        "title": "意味",
        "text": "AI evaluation is the practice of repeatedly measuring whether an AI system's answers, decisions, tool calls, and long-task behavior meet expected criteria. It applies not only to classification with known labels, but also to summarization, grounded answers, drafting, code generation, and agent execution. A production evaluation defines test cases, scoring criteria, passing thresholds, human review ownership, and update cadence. Because models, prompts, retrieval data, and tools change, evaluation is an operating system for quality rather than a one-time acceptance check.",
        "items": []
      },
      {
        "key": "usage",
        "title": "役立つ場面",
        "text": "Teams can compare model, prompt, retrieval, and fine-tuning options with evidence. Launch gates can block unacceptable failures before production. AI product improvement becomes KPI-driven rather than impression-driven.",
        "items": [
          "Teams can compare model, prompt, retrieval, and fine-tuning options with evidence.",
          "Launch gates can block unacceptable failures before production.",
          "AI product improvement becomes KPI-driven rather than impression-driven."
        ]
      },
      {
        "key": "usage",
        "title": "使い方のポイント",
        "text": null,
        "items": [
          "AI evaluation measures AI output and behavior reproducibly.",
          "It includes prompts, data, tools, UI, and operations, not only the model.",
          "Representative, failure, and boundary cases are needed for production confidence.",
          "Model or prompt changes require regression tests.",
          "Critical failure rate matters as much as average performance."
        ]
      },
      {
        "key": "drivers",
        "title": "何が数字を動かすか",
        "text": "Evaluation quality depends on representative cases, failure cases, scoring criteria, and regression testing. Case design | Include real user questions and known failures Rubric | Make pass/fail explainable Failure examples | Boundary and risk cases prevent incidents Regression | Verify changes do not break known-good behavior",
        "items": [
          "Case design | Include real user questions and known failures",
          "Rubric | Make pass/fail explainable",
          "Failure examples | Boundary and risk cases prevent incidents",
          "Regression | Verify changes do not break known-good behavior"
        ]
      }
    ],
    "misunderstandings": [
      {
        "key": "misunderstandings",
        "title": "判断するときの注意点",
        "text": "Do not launch on a high average score alone. High-impact use cases need critical-failure gates, not just averages. Easy eval sets create false confidence. Human review should check scorer alignment and rubric drift.",
        "items": [
          "High-impact use cases need critical-failure gates, not just averages.",
          "Easy eval sets create false confidence.",
          "Human review should check scorer alignment and rubric drift."
        ]
      },
      {
        "key": "misunderstandings",
        "title": "よくある誤解 / 落とし穴",
        "text": null,
        "items": [
          "Evaluation is not one-and-done. Model and data changes require re-evaluation.",
          "A few human spot checks are not enough without representative and failure cases.",
          "A high average score can hide unacceptable critical failures."
        ]
      }
    ],
    "examples": [
      {
        "key": "examples",
        "title": "最小例",
        "text": "A marketing team uses AI to draft campaign ideas. Initially reviewers judge outputs subjectively, so a model change quietly reduces quality. The team creates an eval set with good and bad prior examples and scores persona fit, brand tone, prohibited phrases, evidence, and CTA clarity. Every prompt change is compared on pass rate and critical failure rate. A brand-violation failure blocks release even if the average score improves. The discussion becomes evidence-based rather than opinion-based.",
        "items": []
      }
    ],
    "comparisons": [
      {
        "key": "comparisons",
        "title": "似ている言葉との違い",
        "text": "AI Evaluation | Tests outputs and behavior | Judges production quality A/B Test | Compares user response | Measures real-world impact after launch Monitoring | Tracks production behavior | Detects drift after release",
        "items": [
          "AI Evaluation | Tests outputs and behavior | Judges production quality",
          "A/B Test | Compares user response | Measures real-world impact after launch",
          "Monitoring | Tracks production behavior | Detects drift after release"
        ]
      },
      {
        "key": "related_metrics",
        "title": "一緒に見る指標",
        "text": "AI evaluation is the base layer for generative AI, prompting, tuning, and agents. Generative AI | Produces outputs to evaluate | Needs launch gates Prompt Engineering | Changes need measurement | Keeps iteration grounded AI Agent | Long tasks and tool use need evaluation | Success rate alone is insufficient",
        "items": [
          "Generative AI | Produces outputs to evaluate | Needs launch gates",
          "Prompt Engineering | Changes need measurement | Keeps iteration grounded",
          "AI Agent | Long tasks and tool use need evaluation | Success rate alone is insufficient"
        ]
      }
    ],
    "faq": [
      {
        "question": "Is AI evaluation only model comparison?",
        "answer": "No. It evaluates prompts, data, retrieval, tools, UI, and operating rules."
      },
      {
        "question": "How many cases are needed?",
        "answer": "It depends on risk. Start with representative and dangerous cases, then expand from usage logs."
      },
      {
        "question": "Is average score enough?",
        "answer": "No. Critical failures and prohibited behavior should be separate launch blockers."
      }
    ]
  },
  "source_refs": [
    "concept:ai-evaluation:en-US",
    "concept:ai-evaluation:ja-JP",
    "core-product-update-ai-term-pack-v1:ai-evaluation",
    "core-product-update-ai-term-pack-v1:Evals"
  ],
  "sources": {
    "source_refs": [
      "concept:ai-evaluation:en-US",
      "concept:ai-evaluation:ja-JP",
      "core-product-update-ai-term-pack-v1:ai-evaluation",
      "core-product-update-ai-term-pack-v1:Evals"
    ],
    "visible_sources": [
      {
        "label": "NIST: AI RMF",
        "url": "https://nvlpubs.nist.gov/nistpubs/ai/nist.ai.100-1.pdf",
        "kind": "tier_s"
      },
      {
        "label": "NIST: Generative AI Profile",
        "url": "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf",
        "kind": "tier_s"
      }
    ]
  },
  "related_terms": [
    {
      "topic_id": "slug:generative-ai",
      "canonical_slug": "generative-ai",
      "canonical_url": "https://core.yogoq.com/en-US/core/generative-ai",
      "title": "Generative AI",
      "relation_type": "used_together"
    },
    {
      "topic_id": "slug:prompt-engineering",
      "canonical_slug": "prompt-engineering",
      "canonical_url": "https://core.yogoq.com/en-US/core/prompt-engineering",
      "title": "Prompt Engineering",
      "relation_type": "used_together"
    },
    {
      "topic_id": "slug:fine-tuning",
      "canonical_slug": "fine-tuning",
      "canonical_url": "https://core.yogoq.com/en-US/core/fine-tuning",
      "title": "Fine-tuning",
      "relation_type": "used_together"
    },
    {
      "topic_id": "slug:model-evaluation",
      "canonical_slug": "model-evaluation",
      "canonical_url": "https://core.yogoq.com/en-US/core/model-evaluation",
      "title": "Model Evaluation",
      "relation_type": "confusable_with"
    }
  ],
  "relations": [
    {
      "term_id": "slug:generative-ai",
      "canonical_slug": "generative-ai",
      "canonical_url": "https://core.yogoq.com/en-US/core/generative-ai",
      "display_name": "Generative AI",
      "relation_type": "related",
      "source_relation_type": "used_together"
    },
    {
      "term_id": "slug:prompt-engineering",
      "canonical_slug": "prompt-engineering",
      "canonical_url": "https://core.yogoq.com/en-US/core/prompt-engineering",
      "display_name": "Prompt Engineering",
      "relation_type": "related",
      "source_relation_type": "used_together"
    },
    {
      "term_id": "slug:fine-tuning",
      "canonical_slug": "fine-tuning",
      "canonical_url": "https://core.yogoq.com/en-US/core/fine-tuning",
      "display_name": "Fine-tuning",
      "relation_type": "related",
      "source_relation_type": "used_together"
    },
    {
      "term_id": "slug:model-evaluation",
      "canonical_slug": "model-evaluation",
      "canonical_url": "https://core.yogoq.com/en-US/core/model-evaluation",
      "display_name": "Model Evaluation",
      "relation_type": "compare",
      "source_relation_type": "confusable_with"
    }
  ],
  "chunk_ids": [
    "core:chunk:ai-evaluation:en-US:definition:674f46a6e5da989a",
    "core:chunk:ai-evaluation:en-US:formula:c15f941f3af1c403",
    "core:chunk:ai-evaluation:en-US:boundary:47e39ee06c655226",
    "core:chunk:ai-evaluation:en-US:meaning:55fab6fca2eb6e76",
    "core:chunk:ai-evaluation:en-US:usage:7dfb722212b4d505",
    "core:chunk:ai-evaluation:en-US:usage:17205ea76364fc18",
    "core:chunk:ai-evaluation:en-US:drivers:7249ad86abe7ac58",
    "core:chunk:ai-evaluation:en-US:misunderstandings:03aa037c676e1849",
    "core:chunk:ai-evaluation:en-US:misunderstandings:fa544b7662daf80c",
    "core:chunk:ai-evaluation:en-US:examples:5f1fbe314aa7ca0b",
    "core:chunk:ai-evaluation:en-US:comparisons:81933e614bd09be3",
    "core:chunk:ai-evaluation:en-US:related_metrics:2a7049da97317bef",
    "core:chunk:ai-evaluation:en-US:faq:06a1052c55962a47",
    "core:chunk:ai-evaluation:en-US:faq:e90414eefb2532f4",
    "core:chunk:ai-evaluation:en-US:faq:e48fb90637cfbbdb"
  ],
  "chunks": [
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:definition:674f46a6e5da989a",
      "section_key": "definition",
      "heading": "一言でいうと",
      "text": "AI evaluation tests whether AI outputs or actions meet quality, safety, cost, and business criteria. It is a prerequisite for production generative AI and agent workflows.",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "674f46a6e5da989ad287e06bde59cfe8c3f9000fa4777bb9fc0c239d1b51e277"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:formula:c15f941f3af1c403",
      "section_key": "formula",
      "heading": "計算の考え方",
      "text": "AI evaluation uses scores for each task type and failure rates. Pass rate | Passing cases / eval cases | Shows minimum launch quality Critical failure rate | High-impact errors / eval cases | Captures incident risk Regression rate | Previously passing cases now failing / prior passing cases | Shows change impact Pass rate | Passing cases / eval cases | Shows minimum launch quality Critical failure rate | High-impact errors / eval cases | Captures incident risk Regression rate | Previously passing cases now failing / prior passing cases | Shows change impact",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "c15f941f3af1c4039b97742ec6f0157b0461d053daf60c8315f575f97156a465"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:boundary:47e39ee06c655226",
      "section_key": "boundary",
      "heading": "含めるもの / 含めないもの",
      "text": "AI evaluation covers model, prompt, data, tools, UI, and operating rules. Include | Accuracy, grounding, format, safety, bias, tool actions, review load | Practical quality Exclude | One-off impressions, demo feel, model-name comparisons | Not reproducible Make explicit | Cases, rubric, pass line, reviewer, update cadence | Required for improvement Include | Accuracy, grounding, format, safety, bias, tool actions, review load | Practical quality Exclude | One-off impressions, demo feel, model-name comparisons | Not reproducible Make explicit | Cases, rubric, pass line, reviewer, update cadence | Required for improvement",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "47e39ee06c65522672a7e3483d3413f2b25ae2d95484e514aacb78916ab313a4"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:meaning:55fab6fca2eb6e76",
      "section_key": "meaning",
      "heading": "意味",
      "text": "AI evaluation is the practice of repeatedly measuring whether an AI system's answers, decisions, tool calls, and long-task behavior meet expected criteria. It applies not only to classification with known labels, but also to summarization, grounded answers, drafting, code generation, and agent execution. A production evaluation defines test cases, scoring criteria, passing thresholds, human review ownership, and update cadence. Because models, prompts, retrieval data, and tools change, evaluation is an operating system for quality rather than a one-time acceptance check.",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "55fab6fca2eb6e76a180581c5e41c66e0ad3f9c7118d97bb36699dc91bdd6611"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:usage:7dfb722212b4d505",
      "section_key": "usage",
      "heading": "役立つ場面",
      "text": "Teams can compare model, prompt, retrieval, and fine-tuning options with evidence. Launch gates can block unacceptable failures before production. AI product improvement becomes KPI-driven rather than impression-driven. Teams can compare model, prompt, retrieval, and fine-tuning options with evidence. Launch gates can block unacceptable failures before production. AI product improvement becomes KPI-driven rather than impression-driven.",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "7dfb722212b4d50569fb8182b3e0b1025e53b0422524ad00e393147bf20c4d45"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:usage:17205ea76364fc18",
      "section_key": "usage",
      "heading": "使い方のポイント",
      "text": "AI evaluation measures AI output and behavior reproducibly. It includes prompts, data, tools, UI, and operations, not only the model. Representative, failure, and boundary cases are needed for production confidence. Model or prompt changes require regression tests. Critical failure rate matters as much as average performance.",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "17205ea76364fc18e06ff19f112f365a7f99c0dd3fbfc73aa969315779f0d827"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:drivers:7249ad86abe7ac58",
      "section_key": "drivers",
      "heading": "何が数字を動かすか",
      "text": "Evaluation quality depends on representative cases, failure cases, scoring criteria, and regression testing. Case design | Include real user questions and known failures Rubric | Make pass/fail explainable Failure examples | Boundary and risk cases prevent incidents Regression | Verify changes do not break known-good behavior Case design | Include real user questions and known failures Rubric | Make pass/fail explainable Failure examples | Boundary and risk cases prevent incidents Regression | Verify changes do not break known-good behavior",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "7249ad86abe7ac583d336759a682d433c28dd710c0d9cbb1216ccffb42f104ac"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:misunderstandings:03aa037c676e1849",
      "section_key": "misunderstandings",
      "heading": "判断するときの注意点",
      "text": "Do not launch on a high average score alone. High-impact use cases need critical-failure gates, not just averages. Easy eval sets create false confidence. Human review should check scorer alignment and rubric drift. High-impact use cases need critical-failure gates, not just averages. Easy eval sets create false confidence. Human review should check scorer alignment and rubric drift.",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "03aa037c676e1849a8b6cb13128ed730fa54880cd055d1ebeedd3dcc252c0ebe"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:misunderstandings:fa544b7662daf80c",
      "section_key": "misunderstandings",
      "heading": "よくある誤解 / 落とし穴",
      "text": "Evaluation is not one-and-done. Model and data changes require re-evaluation. A few human spot checks are not enough without representative and failure cases. A high average score can hide unacceptable critical failures.",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "fa544b7662daf80c7f716edf73bb370d89e82404d25c299b4aa8373600edc6a0"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:examples:5f1fbe314aa7ca0b",
      "section_key": "examples",
      "heading": "最小例",
      "text": "A marketing team uses AI to draft campaign ideas. Initially reviewers judge outputs subjectively, so a model change quietly reduces quality. The team creates an eval set with good and bad prior examples and scores persona fit, brand tone, prohibited phrases, evidence, and CTA clarity. Every prompt change is compared on pass rate and critical failure rate. A brand-violation failure blocks release even if the average score improves. The discussion becomes evidence-based rather than opinion-based.",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "5f1fbe314aa7ca0b1ce3e22ae45806277f1c519b158f1926cf529d466282d3c5"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:comparisons:81933e614bd09be3",
      "section_key": "comparisons",
      "heading": "似ている言葉との違い",
      "text": "AI Evaluation | Tests outputs and behavior | Judges production quality A/B Test | Compares user response | Measures real-world impact after launch Monitoring | Tracks production behavior | Detects drift after release AI Evaluation | Tests outputs and behavior | Judges production quality A/B Test | Compares user response | Measures real-world impact after launch Monitoring | Tracks production behavior | Detects drift after release",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "81933e614bd09be34b2376fd6668d25d9c25001f8f171e932e5f2de9de56f807"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:related_metrics:2a7049da97317bef",
      "section_key": "related_metrics",
      "heading": "一緒に見る指標",
      "text": "AI evaluation is the base layer for generative AI, prompting, tuning, and agents. Generative AI | Produces outputs to evaluate | Needs launch gates Prompt Engineering | Changes need measurement | Keeps iteration grounded AI Agent | Long tasks and tool use need evaluation | Success rate alone is insufficient Generative AI | Produces outputs to evaluate | Needs launch gates Prompt Engineering | Changes need measurement | Keeps iteration grounded AI Agent | Long tasks and tool use need evaluation | Success rate alone is insufficient",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "2a7049da97317bef5c143a9b75d13059d8d1d25d866f2d44d5c596804253c3ed"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:faq:06a1052c55962a47",
      "section_key": "faq",
      "heading": "Is AI evaluation only model comparison?",
      "text": "Is AI evaluation only model comparison? No. It evaluates prompts, data, retrieval, tools, UI, and operating rules.",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "06a1052c55962a4788d4584797c7cc27ba6bbe9bdf62233479715574aebbaba4"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:faq:e90414eefb2532f4",
      "section_key": "faq",
      "heading": "How many cases are needed?",
      "text": "How many cases are needed? It depends on risk. Start with representative and dangerous cases, then expand from usage logs.",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "e90414eefb2532f410b209d79d1ce2a7b52416ac8deb8244a93abc45bb0d633b"
    },
    {
      "chunk_id": "core:chunk:ai-evaluation:en-US:faq:e48fb90637cfbbdb",
      "section_key": "faq",
      "heading": "Is average score enough?",
      "text": "Is average score enough? No. Critical failures and prohibited behavior should be separate launch blockers.",
      "source_refs": [
        "concept:ai-evaluation:en-US",
        "concept:ai-evaluation:ja-JP",
        "core-product-update-ai-term-pack-v1:ai-evaluation",
        "core-product-update-ai-term-pack-v1:Evals"
      ],
      "content_hash": "e48fb90637cfbbdbad948ad7e62dc79094a248e771bcc7eb21777bf72ce2808d"
    }
  ],
  "content_hash": "159b20f6ca4bde2416d7bae534b671f3e83d2b277a7b42662f70d6f0b5371a14",
  "hash_algorithm": "sha256",
  "rag": {
    "chunk_id_policy": "core:chunk:{canonical_slug}:{locale}:{section_key}:{sha256_16}",
    "content_hash_policy": "sha256_stable_json",
    "source_refs_precision": "source_object_ref_or_source_key_current_snapshot"
  },
  "quality": {
    "content_tier": "db_backed",
    "quality": "reviewed",
    "publication_status": "published_reviewed",
    "source_count": 2,
    "last_reviewed_at": "2026-07-04T15:30:00.000Z",
    "trust_policy_version": "core-trust-policy-v1-2026-06-22"
  },
  "limitations": {
    "policy_version": "core-trust-policy-v1-2026-06-22",
    "professional_advice_boundary": "reference_only_not_professional_advice",
    "text": "This page is reference information for research and learning. For accounting, legal, finance, health, security, or other individual decisions, confirm against primary sources or qualified professionals.",
    "items": [
      "Public pages support general understanding and practical context; they are not professional advice for individual cases.",
      "Fast-changing information such as regulations, accounting standards, prices, product specs, and legal requirements should be checked against primary sources before final decisions.",
      "Even when AI-assisted drafting or audit is used, publication relies on quality gates and human-readable evidence."
    ]
  }
}