[{"data":1,"prerenderedAt":758},["ShallowReactive",2],{"repo-tree":3,"repo-\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fmetrics\u002Freadme":283},[4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,64,67,69,72,75,78,81,84,86,88,90,93,96,99,102,105,108,111,114,117,120,123,125,127,129,131,133,135,138,141,143,146,149,152,155,158,161,164,167,169,172,175,178,180,183,186,189,192,195,198,201,203,206,209,212,215,218,221,224,227,230,233,236,239,242,245,248,251,254,257,260,263,266,269,272,275,278,281],{"path":5,"title":6},"\u002Fagents\u002Fbackend-code-style","Backend Conventions",{"path":8,"title":9},"\u002Fagents\u002Fdatabase","Database",{"path":11,"title":12},"\u002Fagents\u002Fportal-code-style","Portal Conventions",{"path":14,"title":15},"\u002Fagents\u002Ftranslation","Translation",{"path":17,"title":18},"\u002Fconventions\u002Fbackend-coding","Backend coding conventions",{"path":20,"title":21},"\u002Fconventions\u002Ffrontend-coding","Frontend coding conventions",{"path":23,"title":24},"\u002Fdevelopment-process","Development process",{"path":26,"title":27},"\u002Flearning-api-preview-hetzner-setup","Learning API Preview on Hetzner + Cloudflare",{"path":29,"title":30},"\u002Flearning-api-preview-vm-plan","Learning API Preview VM Plan",{"path":32,"title":33},"\u002Fmonorepo-structure","Monorepo structure",{"path":35,"title":36},"\u002Foperations","Operations — bugs and support",{"path":38,"title":39},"\u002Fpostmortems\u002F2026-03-16_onboarding-currency-regression","Onboarding Zod transform silently broken — web signups assigned wrong checkout currency",{"path":41,"title":42},"\u002Fpostmortems\u002Freadme","Postmortems",{"path":44,"title":45},"\u002Fpostmortems\u002F_template","TEMPLATE",{"path":47,"title":48},"\u002Fpostmortems\u002Fposthog-comparison","Postmortem practice — comparison with PostHog",{"path":50,"title":51},"\u002Fpreview-environment-plan","Preview Environment Plan",{"path":53,"title":54},"\u002Fprinciples","Engineering principles",{"path":56,"title":57},"\u002Fworking-with-ai","Working with AI",{"path":59,"title":60},"\u002F.claude\u002Fskills\u002Feval-playground\u002Fskill","Eval Playground — Co-development Skill",{"path":62,"title":63},"\u002F.claude\u002Fskills\u002Ffigma-diff-section\u002Fskill","Figma Diff Section Pipeline",{"path":65,"title":66},"\u002Fagents","AGENTS.md",{"path":68,"title":66},"\u002Fclaude",{"path":70,"title":71},"\u002Freadme","Studyflash",{"path":73,"title":74},"\u002Fapps\u002Fcore-api\u002Fagents","Core API (apps\u002Fcore-api)",{"path":76,"title":77},"\u002Fapps\u002Fcore-api\u002Freadme","README",{"path":79,"title":80},"\u002Fapps\u002Femail-previews\u002Fagents","Email Previews (apps\u002Femail-previews)",{"path":82,"title":83},"\u002Fapps\u002Flanding-page\u002Fagents","Landing Page (apps\u002Flanding-page)",{"path":85,"title":83},"\u002Fapps\u002Flanding-page\u002Fclaude",{"path":87,"title":66},"\u002Fapps\u002Flearning-api\u002Fagents",{"path":89,"title":77},"\u002Fapps\u002Flearning-api\u002Freadme",{"path":91,"title":92},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Feval_metrics_design","Surface-Specific Eval Metrics Design",{"path":94,"title":95},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Ftest_set","Quiz Eval Test Set",{"path":97,"title":98},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Ffrontend\u002Freadme","React + TypeScript + Vite",{"path":100,"title":101},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fknown-issues\u002Fcontent-pillar-shallow-coverage\u002Freadme","Content pillar misses subtopics in dense documents",{"path":103,"title":104},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fknown-issues\u002Fdocling-empty-section-headers\u002Freadme","Empty section headers dropped by docling chunker",{"path":106,"title":107},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fknown-issues\u002Fdocling-table-reading-order\u002Freadme","Table\u002Fbox layout causes wrong reading order",{"path":109,"title":110},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fmetrics\u002Freadme","Quiz eval metrics — canonical rubrics",{"path":112,"title":113},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-04-12-quiz-summary-feedback-current-state","Quiz and Summary Feedback Current State",{"path":115,"title":116},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-04-24-quiz-eval-metrics","Quiz Evaluation Metrics",{"path":118,"title":119},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-05-01-quiz-eval-current-state","Quiz Eval Current State",{"path":121,"title":122},"\u002Fapps\u002Flearning-api\u002Fmonitoring\u002Freadme","Monitoring Stack",{"path":124,"title":77},"\u002Fapps\u002Flearning-api\u002Fshared\u002Freadme",{"path":126,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fflashcard_agent\u002Freadme",{"path":128,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fingestion_agent\u002Freadme",{"path":130,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fquiz_agent\u002Freadme",{"path":132,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fsummary_agent\u002Freadme",{"path":134,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Fparser\u002Freadme",{"path":136,"title":137},"\u002Fapps\u002Fmarketing-emails-preview\u002Fagents","Marketing Emails Preview (apps\u002Fmarketing-emails-preview)",{"path":139,"title":140},"\u002Fapps\u002Fmobile-app\u002Fagents","StudyFlash Mobile App - Claude Code Configuration",{"path":142,"title":140},"\u002Fapps\u002Fmobile-app\u002Fclaude",{"path":144,"title":145},"\u002Fapps\u002Fmountain-max\u002Fagents","Mountain Max (apps\u002Fmountain-max)",{"path":147,"title":148},"\u002Fapps\u002Fmountain-max\u002Fgame\u002Freadme","Mountain Max Game",{"path":150,"title":151},"\u002Fapps\u002Fportal\u002Fagents","Portal (apps\u002Fportal)",{"path":153,"title":154},"\u002Fapps\u002Fportal\u002Freadme","Nuxt Minimal Starter",{"path":156,"title":157},"\u002Fapps\u002Fportal\u002Fapp\u002Fcomposables\u002Ffiles\u002Freadme","File Upload Composables",{"path":159,"title":160},"\u002Fapps\u002Fportal\u002Fdocs\u002Flibrary-routing","Library Routing Documentation",{"path":162,"title":163},"\u002Fapps\u002Fsupabase\u002Fagents","Supabase (apps\u002Fsupabase)",{"path":165,"title":166},"\u002Fapps\u002Fwrapped\u002Fagents","Wrapped (apps\u002Fwrapped)",{"path":168,"title":98},"\u002Fapps\u002Fwrapped\u002Freadme",{"path":170,"title":171},"\u002Finfra\u002Freadme","infra\u002F",{"path":173,"title":174},"\u002Finfra\u002Fdns\u002Freadme","DNS Infrastructure",{"path":176,"title":177},"\u002Finfra\u002Fdokploy\u002Freadme","studyflash-dokploy",{"path":179,"title":77},"\u002Finfra\u002Fdokploy\u002Fsdk\u002Fnodejs\u002Freadme",{"path":181,"title":182},"\u002Finfra\u002Finfisical\u002Freadme","Infisical Infrastructure",{"path":184,"title":185},"\u002Finfra\u002Flearning-api\u002Freadme","Pulumi GCP TypeScript Template",{"path":187,"title":188},"\u002Finfra\u002Fopenreplay\u002Freadme","OpenReplay on Hetzner",{"path":190,"title":191},"\u002Finfra\u002Fscripts\u002Freadme","infra\u002Fscripts\u002F",{"path":193,"title":194},"\u002Finfra\u002Fturborepo-cache\u002Freadme","Turborepo Remote Cache Infrastructure",{"path":196,"title":197},"\u002Finternal\u002Fchatwoot\u002Freadme","Chatwoot Infrastructure",{"path":199,"title":200},"\u002Finternal\u002Fchatwoot\u002Fprovider\u002Freadme","studyflash-chatwoot-provider",{"path":202,"title":77},"\u002Finternal\u002Fchatwoot\u002Fprovider\u002Fsdk\u002Fnodejs\u002Freadme",{"path":204,"title":205},"\u002Finternal\u002Fdocs\u002Freadme","internal\u002Fdocs",{"path":207,"title":208},"\u002Finternal\u002Fsupport-bot\u002Fclaude","Support Bot (Maximilian)",{"path":210,"title":211},"\u002Finternal\u002Fsupport-bot\u002Freadme","Studyflash Customer Support Bot (Maximilian)",{"path":213,"title":214},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Faccount_issues","Account Issues",{"path":216,"title":217},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fbilling_invoice","Billing Invoice",{"path":219,"title":220},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fcontent_upload","Content Upload",{"path":222,"title":223},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fdata_loss","Data Loss",{"path":225,"title":226},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fflashcard_issues","Flashcard Issues",{"path":228,"title":229},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fgarbage","Garbage",{"path":231,"title":232},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fgeneral_how_to","General How To",{"path":234,"title":235},"\u002Finternal\u002Fsupport-bot\u002Fkb","Knowledge Base Index",{"path":237,"title":238},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Flanguage_issues","Language Issues",{"path":240,"title":241},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fmindmap_issues","Mindmap Issues",{"path":243,"title":244},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fmisunderstanding","Misunderstanding",{"path":246,"title":247},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fmock_exam_issues","Mock Exam Issues",{"path":249,"title":250},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fpodcast_issues","Podcast Issues",{"path":252,"title":253},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fquiz_issues","Quiz Issues",{"path":255,"title":256},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Frefund_request","Refund Request",{"path":258,"title":259},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fsubscription_cancellation","Subscription Cancellation",{"path":261,"title":262},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fsubscription_info","Subscription Info",{"path":264,"title":265},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fsummary_issues","Summary Issues",{"path":267,"title":268},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Ftechnical_errors","Technical Errors",{"path":270,"title":271},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fvideo_issues","Video Issues",{"path":273,"title":274},"\u002Fpackages\u002Fcommon\u002Fdocs\u002Fearly-access-features","Declarative Early Access Features",{"path":276,"title":277},"\u002Fpackages\u002Fcommon\u002Fscripts\u002Freadme","Common Package Scripts",{"path":279,"title":280},"\u002Fpackages\u002Fdevtools\u002Ffigma-plugins\u002Freadme","Figma plugins",{"path":282,"title":77},"\u002Fpackages\u002Fpulumi-infisical\u002Freadme",{"id":284,"title":110,"body":285,"description":750,"extension":751,"lastReviewed":752,"meta":753,"navigation":754,"owner":752,"path":109,"seo":755,"status":752,"stem":756,"tags":752,"__hash__":757},"repo\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fmetrics\u002FREADME.md",{"type":286,"value":287,"toc":736},"minimark",[288,292,305,309,316,330,335,363,369,372,377,382,387,401,406,409,414,419,429,440,443,448,461,465,479,491,494,502,507,511,551,566,569,577,582,587,599,608,611,616,621,626,646,654,658,668,677,681,684,705,709,712,733],[289,290,110],"h1",{"id":291},"quiz-eval-metrics-canonical-rubrics",[293,294,295,296,300,301,304],"p",{},"Single source of truth for what each metric measures. When you change a rubric here, update BOTH the workflow judge's prompt (in the metric's ",[297,298,299],"code",{},".py",") AND the cross-check bundle prep (in ",[297,302,303],{},"cli.py",") so the two judges stay aligned. Drift between them makes agreement numbers meaningless.",[306,307,308],"h2",{"id":308},"quiz_correctness",[293,310,311,315],{},[312,313,314],"strong",{},"What it measures:"," whether each question's marked-correct answer is supported by the source text the question was generated from, and distractors are not.",[293,317,318,321,322,325,326,329],{},[312,319,320],{},"Inputs:"," questions with ",[297,323,324],{},"chunk_id"," pointing to a topic-merged chunk whose full ",[297,327,328],{},"text"," is supplied.",[293,331,332],{},[312,333,334],{},"Classes (4-way, objective):",[336,337,338,345,351,357],"ul",{},[339,340,341,344],"li",{},[297,342,343],{},"CORRECT"," — source text EXPLICITLY supports the marked correct answer AND contradicts each distractor. Judge must be able to quote the supporting passage.",[339,346,347,350],{},[297,348,349],{},"INCORRECT_ANSWER"," — source contradicts a choice marked correct, or supports a choice marked as a distractor more strongly than the marked correct one.",[339,352,353,356],{},[297,354,355],{},"INCORRECT_DISTRACTOR"," — a distractor is also a valid correct answer per the source.",[339,358,359,362],{},[297,360,361],{},"UNSUPPORTED"," — source does not contain enough information to verify either way. Do NOT give benefit of the doubt.",[293,364,365,368],{},[312,366,367],{},"Notes:"," 4-way because the classes are categorically distinct (not ordinal degrees), so they stay multi-class for both workflow and agent judges. Agent calibration on sonnet: ~94% agreement with workflow across 5 decks.",[306,370,371],{"id":371},"quiz_distractor_quality",[293,373,374,376],{},[312,375,314],{}," whether distractors are plausible enough to tempt a student who has NOT studied the material.",[293,378,379,381],{},[312,380,320],{}," MCQ \u002F single_choice questions only (fill_in_the_blank and true_false skipped).",[293,383,384],{},[312,385,386],{},"Classes (binary, subjective):",[336,388,389,395],{},[339,390,391,394],{},[297,392,393],{},"PLAUSIBLE"," — a student without subject knowledge could reasonably be tempted to pick this. Common misconceptions, adjacent domain concepts with correct terminology, inverted causality using real facts, partial truths.",[339,396,397,400],{},[297,398,399],{},"NOT_PLAUSIBLE"," — eliminable without studying: contradicts an adjective in the question stem, contradicts culturally-embedded common knowledge, off-topic, or too generic to answer the question.",[293,402,403,405],{},[312,404,367],{}," subjective middle class (WEAK) was collapsed to binary because agent-vs-agent self-agreement on 3-way was ~72%, making any >70% workflow-vs-agent target impossible. Binary + opus gets agent self-agreement to ~91%.",[306,407,408],{"id":408},"quiz_redundancy",[293,410,411,413],{},[312,412,314],{}," whether two questions in the same deck test the same underlying fact (different phrasing \u002F choice sets count as redundant if the underlying tested fact is the same).",[293,415,416,418],{},[312,417,320],{}," all questions from a single generation.",[293,420,421,424,425,428],{},[312,422,423],{},"How:"," adapter over ",[297,426,427],{},"metrics\u002Fredundancy.py",". Questions become Flashcard-shaped (question + joined correct answers). Embedding-threshold retrieval (BM25 + cosine + RRF) produces candidate pairs; LLM verification confirms true redundancy. Returns redundancy groups.",[293,430,431,433,434,436,437,439],{},[312,432,367],{}," per-deck signal. Shares prompt + threshold with flashcard redundancy so the rubric stays consistent across surfaces. Update ",[297,435,427],{}," to change behavior; ",[297,438,408],{}," is a thin adapter.",[306,441,442],{"id":442},"quiz_relevance",[293,444,445,447],{},[312,446,314],{}," whether each question tests meaningful, generalisable subject-matter knowledge versus administrative\u002Flogistical trivia or non-generalisable illustrative-example details.",[293,449,450,452,453,456,457,460],{},[312,451,320],{}," questions only, plus an optional ",[297,454,455],{},"document_summary"," (pulled from ",[297,458,459],{},"content_pillars[\"document_summary\"]",") so the judge knows what \"on-topic\" means for this deck.",[293,462,463],{},[312,464,386],{},[336,466,467,473],{},[339,468,469,472],{},[297,470,471],{},"GOOD"," — tests a concept, definition, mechanism, relationship, or key fact from the subject matter.",[339,474,475,478],{},[297,476,477],{},"BAD"," — tests admin\u002Flogistics (professor email, deadlines, slide numbers, textbook ISBN, URLs), document\u002Fsource metadata (section headings as trivia), or one-off details from illustrative anecdotes.",[293,480,481,483,484,486,487,490],{},[312,482,367],{}," binary for the same reason as ",[297,485,371],{}," — a middle \"debatable\" class makes self-agreement fall below any usable threshold. ",[297,488,489],{},"good_rate = good \u002F total",". Mirrors the flashcard relevance LLM path.",[306,492,493],{"id":493},"quiz_difficulty",[293,495,496,498,499,501],{},[312,497,314],{}," whether the stem→correct-choice path is too easy — i.e. a student who has NOT studied the source can pick the right answer via stem-echo, tautology, grammatical cueing, or common cultural knowledge. Orthogonal to ",[297,500,371],{},", which judges the wrong-answer elimination path.",[293,503,504,506],{},[312,505,320],{}," questions only (no source grounding). Judge sees stem + all choices with correctness marks.",[293,508,509],{},[312,510,386],{},[336,512,513,545],{},[339,514,515,518,519],{},[297,516,517],{},"TRIVIAL"," — a non-student could arrive at the correct answer. Tells:\n",[336,520,521,527,533,539],{},[339,522,523,526],{},[312,524,525],{},"Stem-echo"," — distinctive stem term paraphrased in the correct choice but absent from distractors.",[339,528,529,532],{},[312,530,531],{},"Grammatical cueing"," — stem grammar (a\u002Fan, singular\u002Fplural, tense) fits only one choice.",[339,534,535,538],{},[312,536,537],{},"Common-knowledge"," — educated-adult general knowledge suffices (famous geography, basic biology).",[339,540,541,544],{},[312,542,543],{},"Tautology"," — the question's own terms give the answer (e.g. \"the dual aspect\" → \"two\").",[339,546,547,550],{},[297,548,549],{},"NOT_TRIVIAL"," — picking the correct answer requires specific subject knowledge.",[293,552,553,555,556,558,559,561,562,565],{},[312,554,367],{}," binary; reasoning MUST name the specific tell when ",[297,557,517],{},". Complements ",[297,560,371],{}," (right-answer angle vs wrong-answer angle of the ",[297,563,564],{},"too_easy"," user complaint).",[306,567,568],{"id":568},"quiz_leakage",[293,570,571,573,574,576],{},[312,572,314],{}," whether the stem LITERALLY reveals the correct-choice text. Strict literal-overlap check; paraphrased stem-echo belongs to ",[297,575,493],{},".",[293,578,579,581],{},[312,580,320],{}," questions only (stem + choices with correctness marks).",[293,583,584],{},[312,585,586],{},"Classes (binary, objective):",[336,588,589,594],{},[339,590,591,593],{},[297,592,477],{}," — a verbatim token or multi-word phrase from the correct-choice text appears in the stem in a way that distinguishes the correct choice from distractors. Includes fill-in-the-blank questions where the answer word appears outside the blank position.",[339,595,596,598],{},[297,597,471],{}," — no literal overlap that gives the correct choice away.",[293,600,601,603,604,607],{},[312,602,367],{}," port of the flashcard ",[297,605,606],{},"metrics\u002Fleakage.py"," rubric to quiz inputs; kept deliberately narrow (literal match only) so the LLM judge stays consistent. Conceptual\u002Fsemantic hints and shared domain vocabulary are NOT leakage — they fall under difficulty.",[306,609,610],{"id":610},"quiz_clarity",[293,612,613,615],{},[312,614,314],{}," whether a question is self-contained — a prepared student can answer it from the question text + choices alone.",[293,617,618,620],{},[312,619,320],{}," questions only (no source).",[293,622,623],{},[312,624,625],{},"Classes:",[336,627,628,634,640],{},[339,629,630,633],{},[297,631,632],{},"CLEAR"," — self-contained, one unambiguous interpretation.",[339,635,636,639],{},[297,637,638],{},"AMBIGUOUS"," — multiple plausible readings of what's being asked, or the right answer depends on how the student interprets the phrasing.",[339,641,642,645],{},[297,643,644],{},"CONTEXT_DEPENDENT"," — references material the student can't see (\"according to the passage\", \"the author argues\", dangling pronouns).",[293,647,648,650,651,576],{},[312,649,367],{}," no source grounding needed — clarity is a property of the question text itself. ",[297,652,653],{},"good_rate = clear \u002F total",[306,655,657],{"id":656},"quiz_structural-static-no-llm","quiz_structural (static, no LLM)",[293,659,660,663,664,667],{},[312,661,662],{},"rejection_rate:"," ",[297,665,666],{},"rejected_count \u002F raw_question_count",". Post-LLM-parse validation failures (invalid_choices, missing correct answer, wrong T\u002FF prefix). Flags malformed structured output from the generation prompt.",[293,669,670,663,673,676],{},[312,671,672],{},"chunk_yield_rate:",[297,674,675],{},"chunks_with_≥1_valid_question \u002F total_chunks",". Catches silent LLM-parse failures where a chunk yielded zero questions.",[306,678,680],{"id":679},"quiz_structural_bias-static-no-llm","quiz_structural_bias (static, no LLM)",[293,682,683],{},"Measures surface-cue giveaways a student could exploit without knowing the material. Computed only on single-correct questions (multi-correct MCQ reported as side-channel).",[336,685,686,695],{},[339,687,688,691,692,576],{},[312,689,690],{},"length_outlier_rate:"," fraction of single-correct questions where the correct answer is uniquely longest or shortest. ",[297,693,694],{},"good_rate = 1 - length_outlier_rate",[339,696,697,700,701,576],{},[312,698,699],{},"tf_true_rate:"," fraction of true_false questions where \"True\" is the correct answer. Balanced = 0.5; flagged outside ",[702,703,704],"span",{},"0.35, 0.65",[306,706,708],{"id":707},"calibration-workflow","Calibration workflow",[293,710,711],{},"Before cross-checking any LLM metric against subagents, confirm the rubric in this doc matches the rubric embedded in:",[336,713,714,724],{},[339,715,716,717,720,721,723],{},"the workflow prompt (search for ",[297,718,719],{},"_PROMPT = "," in the metric's ",[297,722,299],{},")",[339,725,726,727,730,731,723],{},"the bundle preparation (search for ",[297,728,729],{},"prepare-*-review"," in ",[297,732,303],{},[293,734,735],{},"If they've drifted, update all three to match before trusting any agreement numbers.",{"title":737,"searchDepth":738,"depth":738,"links":739},"",2,[740,741,742,743,744,745,746,747,748,749],{"id":308,"depth":738,"text":308},{"id":371,"depth":738,"text":371},{"id":408,"depth":738,"text":408},{"id":442,"depth":738,"text":442},{"id":493,"depth":738,"text":493},{"id":568,"depth":738,"text":568},{"id":610,"depth":738,"text":610},{"id":656,"depth":738,"text":657},{"id":679,"depth":738,"text":680},{"id":707,"depth":738,"text":708},"Single source of truth for what each metric measures. When you change a rubric here, update BOTH the workflow judge's prompt (in the metric's .py) AND the cross-check bundle prep (in cli.py) so the two judges stay aligned. Drift between them makes agreement numbers meaningless.","md",null,{},true,{"title":110,"description":750},"apps\u002Flearning-api\u002Fevals-playground\u002Fmetrics\u002FREADME","xFEdTIEJYVL4RdP4fjrXJdEeZRs_EQQSCA5Mjg7310c",1779007962950]