[{"data":1,"prerenderedAt":3045},["ShallowReactive",2],{"repo-tree":3,"repo-\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-04-24-quiz-eval-metrics":283},[4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,64,67,69,72,75,78,81,84,86,88,90,93,96,99,102,105,108,111,114,117,120,123,125,127,129,131,133,135,138,141,143,146,149,152,155,158,161,164,167,169,172,175,178,180,183,186,189,192,195,198,201,203,206,209,212,215,218,221,224,227,230,233,236,239,242,245,248,251,254,257,260,263,266,269,272,275,278,281],{"path":5,"title":6},"\u002Fagents\u002Fbackend-code-style","Backend Conventions",{"path":8,"title":9},"\u002Fagents\u002Fdatabase","Database",{"path":11,"title":12},"\u002Fagents\u002Fportal-code-style","Portal Conventions",{"path":14,"title":15},"\u002Fagents\u002Ftranslation","Translation",{"path":17,"title":18},"\u002Fconventions\u002Fbackend-coding","Backend coding conventions",{"path":20,"title":21},"\u002Fconventions\u002Ffrontend-coding","Frontend coding conventions",{"path":23,"title":24},"\u002Fdevelopment-process","Development process",{"path":26,"title":27},"\u002Flearning-api-preview-hetzner-setup","Learning API Preview on Hetzner + Cloudflare",{"path":29,"title":30},"\u002Flearning-api-preview-vm-plan","Learning API Preview VM Plan",{"path":32,"title":33},"\u002Fmonorepo-structure","Monorepo structure",{"path":35,"title":36},"\u002Foperations","Operations — bugs and support",{"path":38,"title":39},"\u002Fpostmortems\u002F2026-03-16_onboarding-currency-regression","Onboarding Zod transform silently broken — web signups assigned wrong checkout currency",{"path":41,"title":42},"\u002Fpostmortems\u002Freadme","Postmortems",{"path":44,"title":45},"\u002Fpostmortems\u002F_template","TEMPLATE",{"path":47,"title":48},"\u002Fpostmortems\u002Fposthog-comparison","Postmortem practice — comparison with PostHog",{"path":50,"title":51},"\u002Fpreview-environment-plan","Preview Environment 
Plan",{"path":53,"title":54},"\u002Fprinciples","Engineering principles",{"path":56,"title":57},"\u002Fworking-with-ai","Working with AI",{"path":59,"title":60},"\u002F.claude\u002Fskills\u002Feval-playground\u002Fskill","Eval Playground — Co-development Skill",{"path":62,"title":63},"\u002F.claude\u002Fskills\u002Ffigma-diff-section\u002Fskill","Figma Diff Section Pipeline",{"path":65,"title":66},"\u002Fagents","AGENTS.md",{"path":68,"title":66},"\u002Fclaude",{"path":70,"title":71},"\u002Freadme","Studyflash",{"path":73,"title":74},"\u002Fapps\u002Fcore-api\u002Fagents","Core API (apps\u002Fcore-api)",{"path":76,"title":77},"\u002Fapps\u002Fcore-api\u002Freadme","README",{"path":79,"title":80},"\u002Fapps\u002Femail-previews\u002Fagents","Email Previews (apps\u002Femail-previews)",{"path":82,"title":83},"\u002Fapps\u002Flanding-page\u002Fagents","Landing Page (apps\u002Flanding-page)",{"path":85,"title":83},"\u002Fapps\u002Flanding-page\u002Fclaude",{"path":87,"title":66},"\u002Fapps\u002Flearning-api\u002Fagents",{"path":89,"title":77},"\u002Fapps\u002Flearning-api\u002Freadme",{"path":91,"title":92},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Feval_metrics_design","Surface-Specific Eval Metrics Design",{"path":94,"title":95},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Ftest_set","Quiz Eval Test Set",{"path":97,"title":98},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Ffrontend\u002Freadme","React + TypeScript + Vite",{"path":100,"title":101},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fknown-issues\u002Fcontent-pillar-shallow-coverage\u002Freadme","Content pillar misses subtopics in dense documents",{"path":103,"title":104},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fknown-issues\u002Fdocling-empty-section-headers\u002Freadme","Empty section headers dropped by docling 
chunker",{"path":106,"title":107},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fknown-issues\u002Fdocling-table-reading-order\u002Freadme","Table\u002Fbox layout causes wrong reading order",{"path":109,"title":110},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fmetrics\u002Freadme","Quiz eval metrics — canonical rubrics",{"path":112,"title":113},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-04-12-quiz-summary-feedback-current-state","Quiz and Summary Feedback Current State",{"path":115,"title":116},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-04-24-quiz-eval-metrics","Quiz Evaluation Metrics",{"path":118,"title":119},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-05-01-quiz-eval-current-state","Quiz Eval Current State",{"path":121,"title":122},"\u002Fapps\u002Flearning-api\u002Fmonitoring\u002Freadme","Monitoring Stack",{"path":124,"title":77},"\u002Fapps\u002Flearning-api\u002Fshared\u002Freadme",{"path":126,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fflashcard_agent\u002Freadme",{"path":128,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fingestion_agent\u002Freadme",{"path":130,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fquiz_agent\u002Freadme",{"path":132,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fsummary_agent\u002Freadme",{"path":134,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Fparser\u002Freadme",{"path":136,"title":137},"\u002Fapps\u002Fmarketing-emails-preview\u002Fagents","Marketing Emails Preview (apps\u002Fmarketing-emails-preview)",{"path":139,"title":140},"\u002Fapps\u002Fmobile-app\u002Fagents","StudyFlash Mobile App - Claude Code Configuration",{"path":142,"title":140},"\u002Fapps\u002Fmobile-app\u002Fclaude",{"path":144,"title":145},"\u002Fapps\u002Fmountain-max\u002Fagents","Mountain 
Max (apps\u002Fmountain-max)",{"path":147,"title":148},"\u002Fapps\u002Fmountain-max\u002Fgame\u002Freadme","Mountain Max Game",{"path":150,"title":151},"\u002Fapps\u002Fportal\u002Fagents","Portal (apps\u002Fportal)",{"path":153,"title":154},"\u002Fapps\u002Fportal\u002Freadme","Nuxt Minimal Starter",{"path":156,"title":157},"\u002Fapps\u002Fportal\u002Fapp\u002Fcomposables\u002Ffiles\u002Freadme","File Upload Composables",{"path":159,"title":160},"\u002Fapps\u002Fportal\u002Fdocs\u002Flibrary-routing","Library Routing Documentation",{"path":162,"title":163},"\u002Fapps\u002Fsupabase\u002Fagents","Supabase (apps\u002Fsupabase)",{"path":165,"title":166},"\u002Fapps\u002Fwrapped\u002Fagents","Wrapped (apps\u002Fwrapped)",{"path":168,"title":98},"\u002Fapps\u002Fwrapped\u002Freadme",{"path":170,"title":171},"\u002Finfra\u002Freadme","infra\u002F",{"path":173,"title":174},"\u002Finfra\u002Fdns\u002Freadme","DNS Infrastructure",{"path":176,"title":177},"\u002Finfra\u002Fdokploy\u002Freadme","studyflash-dokploy",{"path":179,"title":77},"\u002Finfra\u002Fdokploy\u002Fsdk\u002Fnodejs\u002Freadme",{"path":181,"title":182},"\u002Finfra\u002Finfisical\u002Freadme","Infisical Infrastructure",{"path":184,"title":185},"\u002Finfra\u002Flearning-api\u002Freadme","Pulumi GCP TypeScript Template",{"path":187,"title":188},"\u002Finfra\u002Fopenreplay\u002Freadme","OpenReplay on Hetzner",{"path":190,"title":191},"\u002Finfra\u002Fscripts\u002Freadme","infra\u002Fscripts\u002F",{"path":193,"title":194},"\u002Finfra\u002Fturborepo-cache\u002Freadme","Turborepo Remote Cache Infrastructure",{"path":196,"title":197},"\u002Finternal\u002Fchatwoot\u002Freadme","Chatwoot 
Infrastructure",{"path":199,"title":200},"\u002Finternal\u002Fchatwoot\u002Fprovider\u002Freadme","studyflash-chatwoot-provider",{"path":202,"title":77},"\u002Finternal\u002Fchatwoot\u002Fprovider\u002Fsdk\u002Fnodejs\u002Freadme",{"path":204,"title":205},"\u002Finternal\u002Fdocs\u002Freadme","internal\u002Fdocs",{"path":207,"title":208},"\u002Finternal\u002Fsupport-bot\u002Fclaude","Support Bot (Maximilian)",{"path":210,"title":211},"\u002Finternal\u002Fsupport-bot\u002Freadme","Studyflash Customer Support Bot (Maximilian)",{"path":213,"title":214},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Faccount_issues","Account Issues",{"path":216,"title":217},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fbilling_invoice","Billing Invoice",{"path":219,"title":220},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fcontent_upload","Content Upload",{"path":222,"title":223},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fdata_loss","Data Loss",{"path":225,"title":226},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fflashcard_issues","Flashcard Issues",{"path":228,"title":229},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fgarbage","Garbage",{"path":231,"title":232},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fgeneral_how_to","General How To",{"path":234,"title":235},"\u002Finternal\u002Fsupport-bot\u002Fkb","Knowledge Base Index",{"path":237,"title":238},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Flanguage_issues","Language Issues",{"path":240,"title":241},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fmindmap_issues","Mindmap Issues",{"path":243,"title":244},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fmisunderstanding","Misunderstanding",{"path":246,"title":247},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fmock_exam_issues","Mock Exam Issues",{"path":249,"title":250},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fpodcast_issues","Podcast Issues",{"path":252,"title":253},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fquiz_issues","Quiz 
Issues",{"path":255,"title":256},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Frefund_request","Refund Request",{"path":258,"title":259},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fsubscription_cancellation","Subscription Cancellation",{"path":261,"title":262},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fsubscription_info","Subscription Info",{"path":264,"title":265},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fsummary_issues","Summary Issues",{"path":267,"title":268},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Ftechnical_errors","Technical Errors",{"path":270,"title":271},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fvideo_issues","Video Issues",{"path":273,"title":274},"\u002Fpackages\u002Fcommon\u002Fdocs\u002Fearly-access-features","Declarative Early Access Features",{"path":276,"title":277},"\u002Fpackages\u002Fcommon\u002Fscripts\u002Freadme","Common Package Scripts",{"path":279,"title":280},"\u002Fpackages\u002Fdevtools\u002Ffigma-plugins\u002Freadme","Figma plugins",{"path":282,"title":77},"\u002Fpackages\u002Fpulumi-infisical\u002Freadme",{"id":284,"title":116,"body":285,"description":3037,"extension":3038,"lastReviewed":3039,"meta":3040,"navigation":3041,"owner":3039,"path":115,"seo":3042,"status":3039,"stem":3043,"tags":3039,"__hash__":3044},"repo\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-04-24-quiz-eval-metrics.md",{"type":286,"value":287,"toc":2984},"minimark",[288,292,300,303,306,316,321,324,329,389,394,397,416,421,431,437,444,449,457,466,525,536,538,546,553,587,596,598,606,613,619,624,626,648,650,658,663,666,671,735,740,754,758,764,769,775,779,786,797,800,807,809,816,826,829,835,837,844,865,868,875,877,885,895,898,905,907,915,920,925,932,935,941,946,949,957,962,968,971,976,983,988,994,996,1004,1009,1012,1017,1031,1038,1042,1048,1053,1059,1062,1068,1086,1088,1094,1111,1113,1118,1135,1137,1143,1161,1163,1170,1175,1178,1185,1212,1218,1221,1225,1231,1234,1237,1274,1276,1295,1297,1316,1318,1323,1329,1340,1342,1349,1354,1
357,1433,1436,1440,1446,1449,1456,1461,1467,1469,1477,1482,1485,1491,1497,1507,1556,1560,1566,1573,1578,1584,1586,1594,1599,1602,1616,1622,1636,1639,1647,1651,1657,1660,1666,1671,1677,1679,1683,1686,1688,1695,1700,1703,1711,1717,1720,1728,1734,1741,1750,1752,1759,1764,1767,1772,1779,1785,1788,1797,1801,1804,1810,1820,1823,1828,1834,1848,1850,1854,1867,1871,2051,2054,2060,2064,2067,2165,2168,2172,2175,2179,2185,2189,2293,2296,2300,2303,2452,2455,2458,2464,2468,2500,2502,2506,2509,2513,2516,2520,2530,2537,2543,2545,2549,2560,2564,2725,2731,2734,2738,2804,2807,2811,2851,2854,2858,2861,2967,2974],[289,290,116],"h1",{"id":291},"quiz-evaluation-metrics",[293,294,295,299],"p",{},[296,297,298],"strong",{},"Date:"," 2026-04-24",[293,301,302],{},"Eight LLM-judged metrics plus two static diagnostics. The first three are dealbreakers — if any of them fails, the quiz is broken. The rest are quality signals. The two static metrics (structural health and structural bias) run without any LLM calls and catch pipeline\u002Fformatting issues the judges can't see.",[304,305],"hr",{},[307,308,310,311,315],"h2",{"id":309},"_1-quiz_correctness-source-grounded-answer-verification","1. ",[312,313,314],"code",{},"quiz_correctness"," — source-grounded answer verification",[293,317,318],{},[296,319,320],{},"\"Is the marked answer actually correct?\"",[293,322,323],{},"This is the only metric with source grounding at judge time — the judge sees the full chunk text the question was generated from.",[293,325,326],{},[296,327,328],{},"4 classes (not a scale — distinct failure modes):",[330,331,332,345],"table",{},[333,334,335],"thead",{},[336,337,338,342],"tr",{},[339,340,341],"th",{},"Class",[339,343,344],{},"What it means",[346,347,348,359,369,379],"tbody",{},[336,349,350,356],{},[351,352,353],"td",{},[312,354,355],{},"CORRECT",[351,357,358],{},"Source explicitly supports the marked answer AND contradicts each distractor. 
Judge must quote the passage.",[336,360,361,366],{},[351,362,363],{},[312,364,365],{},"INCORRECT_ANSWER",[351,367,368],{},"Source contradicts the \"correct\" answer, or supports a distractor better.",[336,370,371,376],{},[351,372,373],{},[312,374,375],{},"INCORRECT_DISTRACTOR",[351,377,378],{},"A distractor is also valid per the source (two right answers).",[336,380,381,386],{},[351,382,383],{},[312,384,385],{},"UNSUPPORTED",[351,387,388],{},"Source doesn't have enough info to verify. No benefit of the doubt.",[293,390,391],{},[296,392,393],{},"Why 4-way instead of binary?",[293,395,396],{},"Each failure mode has a different fix:",[398,399,400,406,411],"ul",{},[401,402,403,405],"li",{},[312,404,365],{}," → fix the generation prompt to ground better",[401,407,408,410],{},[312,409,375],{}," → fix distractor generation to check for overlap",[401,412,413,415],{},[312,414,385],{}," → the generator is hallucinating beyond the source",[293,417,418],{},[296,419,420],{},"Score:",[422,423,428],"pre",{"className":424,"code":426,"language":427},[425],"language-text","correct_rate = CORRECT \u002F total × 100\n","text",[312,429,426],{"__ignoreMap":430},"",[293,432,433,436],{},[296,434,435],{},"Across variants:"," Tight range (92–95%). All models are good here — correctness is largely a solved problem at this point. 
The biggest differentiator is not the model but the chunking strategy.",[293,438,439],{},[440,441],"img",{"alt":442,"src":443},"Correctness by variant","charts\u002Fcorrectness_by_variant.png",[445,446,448],"h3",{"id":447},"real-failure-examples-biology-deck","Real failure examples (biology deck)",[293,450,451],{},[296,452,453,454,456],{},"Failure 1 — ",[312,455,375],{}," (two right answers)",[458,459,460],"blockquote",{},[293,461,462,465],{},[296,463,464],{},"Q:"," Which of the following are examples of transcellular fluid?",[330,467,468,481],{},[333,469,470],{},[336,471,472,475,478],{},[339,473,474],{},"Choice",[339,476,477],{},"Marked as",[339,479,480],{},"Verdict",[346,482,483,494,503,512],{},[336,484,485,488,491],{},[351,486,487],{},"Cerebrospinal fluid",[351,489,490],{},"✓ Correct",[351,492,493],{},"Supported",[336,495,496,499,501],{},[351,497,498],{},"Synovial fluid",[351,500,490],{},[351,502,493],{},[336,504,505,508,510],{},[351,506,507],{},"Aqueous humor",[351,509,490],{},[351,511,493],{},[336,513,514,517,520],{},[351,515,516],{},"Blood plasma",[351,518,519],{},"✗ Distractor",[351,521,522],{},[296,523,524],{},"Problem",[293,526,527,530,531,535],{},[296,528,529],{},"Why it failed:"," Source says ",[532,533,534],"em",{},"\"blood plasma, a special ECF compartment\""," — so plasma is ALSO a valid answer. The question marks it as a distractor, so a student who picks a source-supported answer is graded wrong.",[304,537],{},[293,539,540],{},[296,541,542,543,545],{},"Failure 2 — ",[312,544,375],{}," (distractor is also valid)",[458,547,548],{},[293,549,550,552],{},[296,551,464],{}," A patient presents with significant edema. 
Which could be a contributing factor?",[330,554,555,565],{},[333,556,557],{},[336,558,559,561,563],{},[339,560,474],{},[339,562,477],{},[339,564,480],{},[346,566,567,576],{},[336,568,569,572,574],{},[351,570,571],{},"Decreased plasma proteins → reduced colloid osmotic pressure",[351,573,490],{},[351,575,493],{},[336,577,578,581,583],{},[351,579,580],{},"Increased capillary permeability",[351,582,519],{},[351,584,585],{},[296,586,524],{},[293,588,589,591,592,595],{},[296,590,529],{}," Source explicitly says ",[532,593,594],{},"\"Increased capillary permeability is the hallmark of the inflammatory response... forming an oedema.\""," The distractor is actually a valid cause of edema too!",[304,597],{},[293,599,600],{},[296,601,602,603,605],{},"Failure 3 — ",[312,604,385],{}," (inference beyond source)",[458,607,608],{},[293,609,610,612],{},[296,611,464],{}," If a small, non-polar molecule needs to move against a steep concentration gradient, which transport mechanism would be most efficient?",[293,614,615,618],{},[296,616,617],{},"Marked correct:"," Active transport",[293,620,621,623],{},[296,622,529],{}," Source says active transport moves things against gradients, and that small non-polar molecules pass through membranes easily — but it never says active transport is used for non-polar molecules or compares \"efficiency.\" The answer is a reasonable inference but isn't actually in the source.",[304,625],{},[458,627,628],{},[293,629,630,633,634,636,637,640,641,643,644,647],{},[296,631,632],{},"Note from Ahmed:"," The third failure mode (",[312,635,385],{},") is a fuck-up on my part. It sometimes penalizes reasonable inferences, which hurts evals when the LLM generates higher-order questions (UNDERSTAND\u002FAPPLY\u002FANALYZE) rather than pure recall. A question that requires the student to ",[532,638,639],{},"apply"," knowledge will naturally go beyond verbatim source text — but the metric flags it as unsupported. 
This creates tension between ",[312,642,314],{}," and ",[312,645,646],{},"blooms_score",": we want deeper questions, but the correctness metric punishes them. Need to revisit this.",[304,649],{},[307,651,653,654,657],{"id":652},"_2-quiz_triviality-can-you-solve-it-without-studying","2. ",[312,655,656],{},"quiz_triviality"," — can you solve it without studying?",[293,659,660],{},[296,661,662],{},"\"Does the question give away its own answer?\"",[293,664,665],{},"If a student can pick the correct answer without knowing the subject — through test-taking tricks or common knowledge — the question is useless. It gives false confidence.",[293,667,668],{},[296,669,670],{},"5 tell categories (first match wins):",[330,672,673,683],{},[333,674,675],{},[336,676,677,680],{},[339,678,679],{},"Tell",[339,681,682],{},"What it catches",[346,684,685,695,705,715,725],{},[336,686,687,692],{},[351,688,689],{},[312,690,691],{},"LITERAL_OVERLAP",[351,693,694],{},"Answer text appears verbatim in the stem",[336,696,697,702],{},[351,698,699],{},[312,700,701],{},"CATEGORY_CUE",[351,703,704],{},"Stem asks for a type, only one choice fits",[336,706,707,712],{},[351,708,709],{},[312,710,711],{},"GRAMMAR_CUE",[351,713,714],{},"Article\u002Ftense\u002Fplural fits only one choice",[336,716,717,722],{},[351,718,719],{},[312,720,721],{},"TAUTOLOGY",[351,723,724],{},"Stem contains a word whose definition IS the answer",[336,726,727,732],{},[351,728,729],{},[312,730,731],{},"COMMON_KNOWLEDGE",[351,733,734],{},"Educated adult knows it without studying",[293,736,737],{},[296,738,739],{},"What it doesn't check:",[398,741,742,749],{},[401,743,744,745,748],{},"Distractor quality (that's ",[312,746,747],{},"quiz_distractor_quality",")",[401,750,751,752,748],{},"Whether the answer is correct (that's ",[312,753,314],{},[293,755,756],{},[296,757,420],{},[422,759,762],{"className":760,"code":761,"language":427},[425],"good_rate = NOT_TRIVIAL \u002F total × 
100\n",[312,763,761],{"__ignoreMap":430},[293,765,766,768],{},[296,767,435],{}," Nano is worst (75–76%), flashlite is best (87%). The weaker the model, the more it leaks answers into its own stems.",[293,770,771],{},[440,772],{"alt":773,"src":774},"Triviality by variant","charts\u002Ftriviality_by_variant.png",[445,776,778],{"id":777},"real-failure-examples","Real failure examples",[293,780,781],{},[296,782,783,784],{},"Example 1 — ",[312,785,691],{},[458,787,788],{},[293,789,790,792,793,796],{},[296,791,464],{}," Defense mechanisms such as repression and denial are examples of __________.\n",[296,794,795],{},"Answer:"," defense mechanisms",[293,798,799],{},"The answer is literally the first two words of the stem. Zero studying required.",[458,801,802],{},[293,803,804],{},[532,805,806],{},"— generated by gpt-5.4-nano",[304,808],{},[293,810,811],{},[296,812,813,814],{},"Example 2 — ",[312,815,721],{},[458,817,818],{},[293,819,820,822,823,825],{},[296,821,464],{}," Comprehensiveness refers to whether a personality theory explains most or all known facts and observations within its domain. True\u002FFalse?\n",[296,824,795],{}," True",[293,827,828],{},"The statement IS the definition of the term. It's tautological — can't be false.",[458,830,831],{},[293,832,833],{},[532,834,806],{},[304,836],{},[293,838,839],{},[296,840,841,842],{},"Example 3 — ",[312,843,731],{},[458,845,846,851],{},[293,847,848,850],{},[296,849,464],{}," Which statement best explains the primary purpose of criminal law?",[398,852,853,856,859,862],{},[401,854,855],{},"✓ To protect society and maintain social order",[401,857,858],{},"✗ To compensate private individuals for personal losses",[401,860,861],{},"✗ To regulate only business transactions",[401,863,864],{},"✗ To resolve family disagreements without state involvement",[293,866,867],{},"Any adult knows criminal law is about protecting society. 
No legal education needed.",[458,869,870],{},[293,871,872],{},[532,873,874],{},"— generated by gpt-5.4",[304,876],{},[293,878,879],{},[296,880,881,882,884],{},"Example 4 — ",[312,883,731],{}," (the \"always\" giveaway)",[458,886,887],{},[293,888,889,891,892,894],{},[296,890,464],{}," Nudging is always highly effective for addressing large-scale environmental issues. True\u002FFalse?\n",[296,893,795],{}," False",[293,896,897],{},"The word \"always\" gives it away — nothing is \"always\" effective. A test-taking trick, not knowledge.",[458,899,900],{},[293,901,902],{},[532,903,904],{},"— generated by gemini-2.5-flash",[304,906],{},[307,908,910,911,914],{"id":909},"_3-blueprint_coverage-per-chunk-learning-objective-coverage","3. ",[312,912,913],{},"blueprint_coverage"," — per-chunk learning-objective coverage",[293,916,917],{},[296,918,919],{},"\"Did the quiz actually test what the source material teaches?\"",[293,921,922],{},[296,923,924],{},"Step 1 — Extract learning objectives (once per deck)",[293,926,927,928,931],{},"Before any quiz variant runs, an LLM reads each chunk and extracts a short list of learning objectives (LOs). 
These are cached in ",[312,929,930],{},"blueprint_cache\u002F"," so every variant gets judged against the same blueprint.",[293,933,934],{},"Example from a chunk about mitosis:",[422,936,939],{"className":937,"code":938,"language":427},[425],"LO1: Identify the four phases of mitosis\nLO2: Explain the role of spindle fibers\nLO3: Distinguish between mitosis and meiosis\n",[312,940,938],{"__ignoreMap":430},[293,942,943],{},[296,944,945],{},"Step 2 — Map questions to LOs",[293,947,948],{},"For each generated question, the judge checks: can the stem → correct-answer path be traced back to one of those LOs?",[398,950,951,954],{},[401,952,953],{},"\"What are the four phases of mitosis?\" → covers LO1 ✓",[401,955,956],{},"\"What color are spindle fibers under a microscope?\" → doesn't cover any LO ✗",[293,958,959],{},[296,960,961],{},"Step 3 — Score",[422,963,966],{"className":964,"code":965,"language":427},[425],"blueprint_coverage = covered_LOs \u002F total_LOs × 100\n",[312,967,965],{"__ignoreMap":430},[293,969,970],{},"If the deck has 20 LOs across all chunks and the quiz hits 14 of them → 70%.",[293,972,973],{},[296,974,975],{},"Why LOs instead of phrase extraction (like flashcard coverage)?",[293,977,978,979,982],{},"Quiz questions span multiple sentences — a single stem might synthesize info from three places in the chunk. Phrase-level matching undercounts that. LOs capture ",[532,980,981],{},"what the student should be able to do",", which is the right unit for quizzes.",[293,984,985,987],{},[296,986,435],{}," The biggest spread of any metric. 5.4 dominates at 72%, flash models are worst at 37–51%. More capable models cover more learning objectives; topic chunking doesn't help flash here.",[293,989,990],{},[440,991],{"alt":992,"src":993},"Blueprint coverage by variant","charts\u002Fcoverage_by_variant.png",[304,995],{},[307,997,999,1000,1003],{"id":998},"_4-quiz_relevance-subject-matter-vs-admin-trivia","4. 
",[312,1001,1002],{},"quiz_relevance"," — subject matter vs. admin trivia",[293,1005,1006],{},[296,1007,1008],{},"\"Is this question testing real knowledge, or is it asking what slide number something was on?\"",[293,1010,1011],{},"Students upload all kinds of documents — lecture slides full of professor contact info, deadlines, textbook ISBNs. The generator sometimes latches onto that junk and turns it into quiz questions. This metric catches those.",[293,1013,1014],{},[296,1015,1016],{},"Binary:",[398,1018,1019,1025],{},[401,1020,1021,1024],{},[312,1022,1023],{},"GOOD"," — tests a concept, definition, mechanism, relationship",[401,1026,1027,1030],{},[312,1028,1029],{},"BAD"," — tests admin logistics, document metadata, or one-off details that don't generalize",[293,1032,1033,1034,1037],{},"The judge receives the deck's ",[312,1035,1036],{},"document_summary"," so it knows what \"on-topic\" means for each deck. A question about Phoenix Park is trivia for a biology deck but might be relevant for a history deck — the summary provides that context.",[293,1039,1040],{},[296,1041,420],{},[422,1043,1046],{"className":1044,"code":1045,"language":427},[425],"good_rate = GOOD \u002F total × 100\n",[312,1047,1045],{"__ignoreMap":430},[293,1049,1050,1052],{},[296,1051,435],{}," Near-ceiling for everyone (96–99%). Relevance is mostly solved — generators rarely produce \"what's the professor's email\" questions. 
The remaining failures are edge cases like researcher names and peripheral dates.",[293,1054,1055],{},[440,1056],{"alt":1057,"src":1058},"Relevance by variant","charts\u002Frelevance_by_variant.png",[445,1060,778],{"id":1061},"real-failure-examples-1",[293,1063,1064,1067],{},[296,1065,1066],{},"Name trivia"," (colonial policing deck — the doc IS about colonial policing, but knowing a specific park name isn't the point):",[458,1069,1070,1078,1081],{},[293,1071,1072,1074,1075,1077],{},[296,1073,464],{}," The training of colonial police officers often took place at headquarters in __________ Park in Dublin.\n",[296,1076,795],{}," Phoenix",[293,1079,1080],{},"Understanding the Irish model's influence = GOOD. Memorizing the park name = BAD.",[293,1082,1083],{},[532,1084,1085],{},"— generated by gemini-3.1-pro-preview",[304,1087],{},[293,1089,1090,1093],{},[296,1091,1092],{},"Researcher surname"," (personality psychology deck):",[458,1095,1096,1104,1107],{},[293,1097,1098,1100,1101,1103],{},[296,1099,464],{}," The Five Factor Model originated by Tupes & Christal (1958) was later evolved by Digman & __________.\n",[296,1102,795],{}," Goldberg",[293,1105,1106],{},"Understanding the Five Factor Model = GOOD. 
Knowing a co-author's surname = BAD.",[293,1108,1109],{},[532,1110,904],{},[304,1112],{},[293,1114,1115,1093],{},[296,1116,1117],{},"Random decade",[458,1119,1120,1128,1131],{},[293,1121,1122,1124,1125,1127],{},[296,1123,464],{}," Initial attempts to link personality types to health conditions were made in the __________.\n",[296,1126,795],{}," 1960s",[293,1129,1130],{},"A peripheral date with zero pedagogical value.",[293,1132,1133],{},[532,1134,904],{},[304,1136],{},[293,1138,1139,1142],{},[296,1140,1141],{},"Formatting abbreviation"," (English analysis guide — the doc teaches analytical writing skills, not citation formatting):",[458,1144,1145,1153,1156],{},[293,1146,1147,1149,1150,1152],{},[296,1148,464],{}," When referring to more than one line in an analysis, you should use the abbreviation '__________'.\n",[296,1151,795],{}," ll.",[293,1154,1155],{},"Testing a formatting convention, not analytical skill.",[293,1157,1158],{},[532,1159,1160],{},"— generated by gemini-3.1-flash-lite-preview",[304,1162],{},[307,1164,1166,1167,1169],{"id":1165},"_5-quiz_distractor_quality-are-the-wrong-answers-good-fakes","5. ",[312,1168,747],{}," — are the wrong answers good fakes?",[293,1171,1172],{},[296,1173,1174],{},"\"Can a student eliminate distractors without studying?\"",[293,1176,1177],{},"Triviality asks if the stem gives away the right answer. Distractor quality asks the opposite: are the wrong answers obviously wrong? Same coin, opposite side.",[293,1179,1180,1181,1184],{},"The judge evaluates ",[296,1182,1183],{},"each distractor individually",", applying 4 checks in order:",[1186,1187,1188,1194,1200,1206],"ol",{},[401,1189,1190,1193],{},[296,1191,1192],{},"Stem-contradiction"," — does the stem itself rule this out? → WEAK",[401,1195,1196,1199],{},[296,1197,1198],{},"Cultural-elimination"," — would a non-student adult reject this? → WEAK",[401,1201,1202,1205],{},[296,1203,1204],{},"Category mismatch"," — wrong subject area entirely? 
→ ABSURD",[401,1207,1208,1211],{},[296,1209,1210],{},"Otherwise"," → PLAUSIBLE",[293,1213,1214,1215],{},"Key instruction: ",[532,1216,1217],{},"\"Grade from the perspective of a student who has NOT studied. A distractor that an expert sees through is still PLAUSIBLE if a confused student could be tempted.\"",[293,1219,1220],{},"Only applies to MCQ \u002F single-choice. True\u002Ffalse and fill-in-the-blank have no distractors to judge.",[293,1222,1223],{},[296,1224,420],{},[422,1226,1229],{"className":1227,"code":1228,"language":427},[425],"plausible_rate = PLAUSIBLE distractors \u002F total distractors × 100\n",[312,1230,1228],{"__ignoreMap":430},[445,1232,778],{"id":1233},"real-failure-examples-2",[293,1235,1236],{},"The nano model generates distractors addicted to absolutes — \"always\", \"guarantees\", \"eliminates all\", \"never\". Any test-wise student crosses those out immediately.",[458,1238,1239,1244,1267,1270],{},[293,1240,1241,1243],{},[296,1242,464],{}," Which combination of stated pros for cap-and-trade is most accurate?",[398,1245,1246,1253,1260],{},[401,1247,1248,1249,1252],{},"✗ \"It ",[296,1250,1251],{},"guarantees"," no complexity and no price uncertainty\" → WEAK",[401,1254,1255,1256,1259],{},"✗ \"It is ",[296,1257,1258],{},"always"," cheaper without trade-offs\" → WEAK",[401,1261,1262,1263,1266],{},"✗ \"It removes ",[296,1264,1265],{},"any"," need for program design\" → WEAK",[293,1268,1269],{},"All three eliminable on surface logic alone. No cap-and-trade knowledge needed.",[293,1271,1272],{},[532,1273,806],{},[304,1275],{},[458,1277,1278,1283,1288,1291],{},[293,1279,1280,1282],{},[296,1281,464],{}," Which policy argument is framed around intergenerational fairness?",[398,1284,1285],{},[401,1286,1287],{},"✗ \"Intergenerational fairness is the same as eliminating unemployment\" → ABSURD",[293,1289,1290],{},"Completely off-topic. 
Different policy domain entirely.",[293,1292,1293],{},[532,1294,806],{},[304,1296],{},[458,1298,1299,1304,1309,1312],{},[293,1300,1301,1303],{},[296,1302,464],{}," Which con of cap-and-trade is described as tied to who may obtain permits?",[398,1305,1306],{},[401,1307,1308],{},"✗ \"It eliminates the risk of loopholes by fixing permits at zero\" → WEAK",[293,1310,1311],{},"\"Fixing permits at zero\" is internally self-defeating \u002F nonsensical.",[293,1313,1314],{},[532,1315,806],{},[304,1317],{},[293,1319,1320,1322],{},[296,1321,435],{}," Nano is worst (73–74%), flash\u002Fpro models cluster around 80–84%. Tracks closely with triviality — the models that leak answers into stems also produce weak distractors.",[293,1324,1325],{},[440,1326],{"alt":1327,"src":1328},"Distractor quality by variant","charts\u002Fdistractor_quality_by_variant.png",[458,1330,1331],{},[293,1332,1333,1336,1337,1339],{},[296,1334,1335],{},"Note:"," This metric overlaps heavily with ",[312,1338,656],{},". Triviality catches giveaways in the stem→answer path; distractor quality catches eliminable wrong answers. In practice they often fire on the same questions — if the distractors are garbage, the question is also trivially solvable. We may want to merge these or drop one in a future iteration.",[304,1341],{},[307,1343,1345,1346,1348],{"id":1344},"_6-blooms_score-how-deep-do-the-questions-go","6. 
",[312,1347,646]," — how deep do the questions go?",[293,1350,1351],{},[296,1352,1353],{},"\"Is this quiz all recall, or does it actually make students think?\"",[293,1355,1356],{},"Each question is classified on a 4-level Bloom's taxonomy subset:",[330,1358,1359,1375],{},[333,1360,1361],{},[336,1362,1363,1366,1369,1372],{},[339,1364,1365],{},"Level",[339,1367,1368],{},"Weight",[339,1370,1371],{},"What it tests",[339,1373,1374],{},"Example",[346,1376,1377,1391,1405,1419],{},[336,1378,1379,1382,1385,1388],{},[351,1380,1381],{},"REMEMBER",[351,1383,1384],{},"1",[351,1386,1387],{},"Recall a fact, term, date",[351,1389,1390],{},"\"What is the capital of France?\"",[336,1392,1393,1396,1399,1402],{},[351,1394,1395],{},"UNDERSTAND",[351,1397,1398],{},"2",[351,1400,1401],{},"Explain, compare, paraphrase",[351,1403,1404],{},"\"Why is the left ventricle thicker than the right?\"",[336,1406,1407,1410,1413,1416],{},[351,1408,1409],{},"APPLY",[351,1411,1412],{},"3",[351,1414,1415],{},"Use knowledge in a new situation",[351,1417,1418],{},"\"A patient presents with X — which drug?\"",[336,1420,1421,1424,1427,1430],{},[351,1422,1423],{},"ANALYZE",[351,1425,1426],{},"4",[351,1428,1429],{},"Examine relationships, multi-step reasoning",[351,1431,1432],{},"\"Compare structural changes in CHF vs HCM\"",[293,1434,1435],{},"We stop at ANALYZE because generators essentially never produce EVALUATE\u002FCREATE-level items. Including unused classes just inflates disagreement.",[293,1437,1438],{},[296,1439,420],{},[422,1441,1444],{"className":1442,"code":1443,"language":427},[425],"blooms_score = mean_weight \u002F 4 × 100\n",[312,1445,1443],{"__ignoreMap":430},[293,1447,1448],{},"A quiz that's 100% REMEMBER scores 25%. A healthy mix of 60% REMEMBER + 30% UNDERSTAND + 10% APPLY scores 37.5%. Recall isn't a failure — it's the baseline. 
The score rewards variety, not the absence of recall.",[293,1450,1451,1452,1455],{},"The ",[312,1453,1454],{},"blooms_distribution"," field records the breakdown (e.g. 30% remember, 60% understand, 10% apply) — useful for spotting generators that collapse everything onto one level.",[293,1457,1458,1460],{},[296,1459,435],{}," All generators cluster in the 34–37% range. The distribution tells the story: ~60% REMEMBER, ~35% UNDERSTAND, ~2% APPLY, ~0% ANALYZE. There's a healthy REMEMBER\u002FUNDERSTAND base but almost no APPLY or ANALYZE questions. gem31pro-high leads at 37.4% with the best UNDERSTAND share (40%). Pushing generators toward more APPLY-level questions is the main lever left.",[293,1462,1463],{},[440,1464],{"alt":1465,"src":1466},"Bloom's distribution","charts\u002Fblooms_stacked.png",[304,1468],{},[307,1470,1472,1473,1476],{"id":1471},"_7-difficulty_good_rate-right-level-for-the-audience","7. ",[312,1474,1475],{},"difficulty_good_rate"," — right level for the audience?",[293,1478,1479],{},[296,1480,1481],{},"\"Is this quiz appropriately hard for who's taking it?\"",[293,1483,1484],{},"Bloom's alone isn't enough — two ANALYZE questions can differ wildly in solve effort. So this metric combines two axes:",[422,1486,1489],{"className":1487,"code":1488,"language":427},[425],"score = bloom(1–4) × complexity(1–5) → range 1–20\n",[312,1490,1488],{"__ignoreMap":430},[293,1492,1493,1496],{},[296,1494,1495],{},"Bloom"," = cognitive level (same as above).",[293,1498,1499,1502,1503,1506],{},[296,1500,1501],{},"Complexity"," = how hard the subject matter is ",[296,1504,1505],{},"for the target audience",". 
The judge is told the audience (high school \u002F university \u002F professional) and subject, so \"What are the four chambers of the heart?\" scores low complexity for a med student but higher for a middle-schooler.",[330,1508,1509,1518],{},[333,1510,1511],{},[336,1512,1513,1515],{},[339,1514,1501],{},[339,1516,1517],{},"Meaning",[346,1519,1520,1527,1534,1541,1548],{},[336,1521,1522,1524],{},[351,1523,1384],{},[351,1525,1526],{},"Basic fact any student in the field would know early on",[336,1528,1529,1531],{},[351,1530,1398],{},[351,1532,1533],{},"Standard introductory curriculum concept",[336,1535,1536,1538],{},[351,1537,1412],{},[351,1539,1540],{},"Connecting multiple concepts, moderate depth",[336,1542,1543,1545],{},[351,1544,1426],{},[351,1546,1547],{},"Deep understanding, nuanced reasoning",[336,1549,1550,1553],{},[351,1551,1552],{},"5",[351,1554,1555],{},"Expert-level, synthesis across sub-domains",[293,1557,1558],{},[296,1559,420],{},[422,1561,1564],{"className":1562,"code":1563,"language":427},[425],"difficulty_good_rate = questions with score >= 4 \u002F total × 100\n",[312,1565,1563],{"__ignoreMap":430},[293,1567,1568,1569,1572],{},"Questions scoring below 4 are both REMEMBER-level AND basic complexity — the easiest possible questions. ",[312,1570,1571],{},"difficulty_mean"," (the raw 1–20 average) is kept alongside as a diagnostic.",[293,1574,1575,1577],{},[296,1576,435],{}," Flat range (36–42%). Closely correlated with blooms — since most questions are REMEMBER, bloom(1) × complexity caps the score. gem31pro-high edges ahead at 42% but the difference is small.",[293,1579,1580],{},[440,1581],{"alt":1582,"src":1583},"Difficulty by variant","charts\u002Fdifficulty_by_variant.png",[304,1585],{},[307,1587,1589,1590,1593],{"id":1588},"_8-uniqueness_rate-does-the-quiz-ask-the-same-thing-twice","8. 
",[312,1591,1592],{},"uniqueness_rate"," — does the quiz ask the same thing twice?",[293,1595,1596],{},[296,1597,1598],{},"\"Did the generator produce 10 questions or 7 questions and 3 repeats?\"",[293,1600,1601],{},"Two-stage pipeline:",[293,1603,1604,1607,1608,1611,1612,1615],{},[296,1605,1606],{},"Stage 1 — Embedding filter."," Embed each question (stem + correct answer) with ",[312,1609,1610],{},"gemini-embedding-001",". Compute pairwise cosine similarity. Surface pairs above ",[296,1613,1614],{},"0.90",".",[293,1617,1618,1621],{},[296,1619,1620],{},"Stage 2 — LLM arbitration."," For each surfaced pair, a judge classifies:",[398,1623,1624,1630],{},[401,1625,1626,1629],{},[312,1627,1628],{},"LAZY_DUPLICATE"," — same fact, rephrased. A student who answers one learns nothing from the other.",[401,1631,1632,1635],{},[312,1633,1634],{},"REINFORCEMENT"," — same topic, different angle or cognitive level. Answering both strengthens understanding.",[293,1637,1638],{},"Examples:",[398,1640,1641,1644],{},[401,1642,1643],{},"\"What is photosynthesis?\" + \"Define the process of photosynthesis\" → LAZY_DUPLICATE (same fact, surface rephrasing)",[401,1645,1646],{},"\"What is photosynthesis?\" + \"How would photosynthesis be affected if CO₂ levels doubled?\" → REINFORCEMENT (recall vs. application)",[293,1648,1649],{},[296,1650,420],{},[422,1652,1655],{"className":1653,"code":1654,"language":427},[425],"uniqueness_rate = (1 - lazy_duplicate_questions \u002F total) × 100\n",[312,1656,1654],{"__ignoreMap":430},[293,1658,1659],{},"Reinforcement pairs do NOT count against the score.",[293,1661,1662,1665],{},[296,1663,1664],{},"Why two stages?"," Cosine alone flags too many reinforcement pairs as duplicates. LLM-only on every pair is expensive. The 0.90 threshold catches almost every true duplicate while the LLM only arbitrates the ambiguous ones.",[293,1667,1668,1670],{},[296,1669,435],{}," The widest split. 
Nano and 5.4 generate tons of duplicates (only 60–61% uniqueness), while flashlite barely produces any (96% uniqueness). This correlates with question count — models that generate more questions per chunk tend to repeat themselves more.",[293,1672,1673],{},[440,1674],{"alt":1675,"src":1676},"Uniqueness by variant","charts\u002Funiqueness_by_variant.png",[304,1678],{},[307,1680,1682],{"id":1681},"side-channel-diagnostics-static-no-llm","Side-channel diagnostics (static, no LLM)",[293,1684,1685],{},"The eight metrics above are all LLM-judged. Two additional static checks run without any LLM calls — they catch structural problems in the generator's output that a student could exploit or that indicate pipeline health issues.",[304,1687],{},[445,1689,1691,1694],{"id":1690},"quiz_structural-pipeline-parse-health",[312,1692,1693],{},"quiz_structural"," — pipeline parse health",[293,1696,1697],{},[296,1698,1699],{},"\"Did the generator produce valid, parseable questions?\"",[293,1701,1702],{},"Two sub-metrics that flag problems between the LLM's raw output and the question format we need:",[293,1704,1705,1710],{},[296,1706,1707],{},[312,1708,1709],{},"rejection_rate"," — what fraction of the LLM's raw output failed post-parse validation?",[422,1712,1715],{"className":1713,"code":1714,"language":427},[425],"rejection_rate = rejected_count \u002F raw_question_count\n",[312,1716,1714],{"__ignoreMap":430},[293,1718,1719],{},"Rejection reasons: invalid choice count (e.g. single_choice with \u003C2 or >4 choices), missing correct answer, multiple correct answers on a single_choice question, wrong True\u002FFalse prefix. 
These are structured-output failures — the generation prompt asked for one format, the model produced another.",[293,1721,1722,1727],{},[296,1723,1724],{},[312,1725,1726],{},"chunk_yield_rate"," — what fraction of source chunks produced at least one valid question?",[422,1729,1732],{"className":1730,"code":1731,"language":427},[425],"chunk_yield_rate = chunks_with_≥1_valid_question \u002F total_chunks\n",[312,1733,1731],{"__ignoreMap":430},[293,1735,1736,1737,1740],{},"A low yield means the LLM is silently failing on some chunks — either generating nothing or generating only questions that fail validation. Since ",[312,1738,1739],{},"process_chunk"," swallows schema-validation failures, chunk yield is the only way to catch these silent drops.",[293,1742,1743,1745,1746,1749],{},[296,1744,435],{}," Chunk yield is near-100% for all variants — models rarely fail to produce ",[532,1747,1748],{},"something"," for a chunk. Rejection rate is more interesting: 0–9% depending on deck complexity. 100% of rejections in the baseline were single_choice questions with multiple correct answers — the type contract isn't enforced strongly enough in the prompt.",[304,1751],{},[445,1753,1755,1758],{"id":1754},"quiz_structural_bias-surface-cue-giveaways",[312,1756,1757],{},"quiz_structural_bias"," — surface-cue giveaways",[293,1760,1761],{},[296,1762,1763],{},"\"Can a student guess the right answer just by looking at the shape of the choices?\"",[293,1765,1766],{},"No LLM, no subject knowledge — just character counts and True\u002FFalse tallies. If the correct answer is always the longest option, a test-wise student picks longest every time and beats random chance.",[1768,1769,1771],"h4",{"id":1770},"length-outlier-rate","Length outlier rate",[293,1773,1774,1775,1778],{},"For each single-correct question: is the correct answer ",[296,1776,1777],{},"uniquely"," the longest or shortest choice? 
If yes, a student can exploit the \"pick the longest\" heuristic without knowing anything.",[422,1780,1783],{"className":1781,"code":1782,"language":427},[425],"length_outlier_rate = (longest_correct + shortest_correct) \u002F single_correct_count\ngood_rate = (1 - length_outlier_rate) × 100\n",[312,1784,1782],{"__ignoreMap":430},[293,1786,1787],{},"Only computed on questions with exactly one correct choice — the \"pick the longest\" trick only works cleanly when there's one answer to pick. Multi-correct MCQ are tracked separately as a side-channel.",[293,1789,1790,1792,1793,1796],{},[296,1791,435],{}," This is the worst metric across the board. Every variant scores between 23–39% good_rate, meaning ",[296,1794,1795],{},"61–77% of single-correct questions have the correct answer as a unique length outlier."," The generation prompt already tells the model to match answer lengths — the model ignores that instruction ~6–7 times out of 10. flash-topic and 5.4 are worst (23–24%), gem31pro and nano-high are least bad (~38%) but still failing majority of the time. This is the single biggest structural fix remaining — a post-generation length-rewrite pass or a self-check step before emission.",[1768,1798,1800],{"id":1799},"tf-true-rate-balance","T\u002FF true-rate balance",[293,1802,1803],{},"For True\u002FFalse questions: what fraction have \"True\" as the correct answer?",[422,1805,1808],{"className":1806,"code":1807,"language":427},[425],"tf_true_rate = tf_true_count \u002F tf_count\n",[312,1809,1807],{"__ignoreMap":430},[293,1811,1812,1813,1819],{},"Balanced decks sit near 50%. Outside the ",[296,1814,1815],{},[1816,1817,1818],"span",{},"0.35, 0.65"," band → flagged. 
If a student notices the pattern leans one way, \"always pick False\" (or \"always pick True\") beats random.",[293,1821,1822],{},"Supports multilingual T\u002FF values — English \"True\", German \"Wahr\", French \"Vrai\", Spanish \"Verdadero\", etc.",[293,1824,1825,1827],{},[296,1826,435],{}," Most models produce reasonably balanced T\u002FF splits (81–95% balance score). flashlite and gem31pro are most balanced (93–95%), 5.4 is solid (92%). The outlier is nano-none at 65% — noticeably skewed, meaning \"always pick True\" (or False) gives an edge. nano-high is better at 76%, suggesting reasoning effort helps with T\u002FF balance. flash variants cluster around 81–83%.",[293,1829,1830],{},[440,1831],{"alt":1832,"src":1833},"Structural bias — length and T\u002FF balance","charts\u002Fstructural_bias_dual.png",[458,1835,1836],{},[293,1837,1838,1840,1841,1843,1844,1847],{},[296,1839,1335],{}," Neither of these metrics uses an LLM. They run on the raw question JSON in milliseconds and catch problems that the LLM-judged metrics miss entirely — ",[312,1842,656],{}," can catch \"the longest answer is always right\" as a pattern only if the judge notices it across many questions, but ",[312,1845,1846],{},"structural_bias"," catches it deterministically on every single question. 
They complement each other: triviality catches semantic giveaways, structural bias catches statistical ones.",[304,1849],{},[307,1851,1853],{"id":1852},"baseline-analysis-easy-gains","Baseline analysis & easy gains",[293,1855,1856,1857,1862,1863,1866],{},"The current production configuration is ",[296,1858,1859],{},[312,1860,1861],{},"flash-topic"," — ",[312,1864,1865],{},"gemini-2.5-flash"," with topic-level chunking, no reasoning effort.",[445,1868,1870],{"id":1869},"where-the-baseline-sits","Where the baseline sits",[330,1872,1873,1892],{},[333,1874,1875],{},[336,1876,1877,1880,1883,1886,1889],{},[339,1878,1879],{},"Metric",[339,1881,1882],{},"Score",[339,1884,1885],{},"Rank",[339,1887,1888],{},"Best variant",[339,1890,1891],{},"Gap",[346,1893,1894,1911,1927,1946,1962,1979,1997,2016,2033],{},[336,1895,1896,1899,1902,1905,1908],{},[351,1897,1898],{},"Correctness",[351,1900,1901],{},"93.5%",[351,1903,1904],{},"5\u002F8",[351,1906,1907],{},"gem31pro (95.1%)",[351,1909,1910],{},"+1.6pp",[336,1912,1913,1916,1919,1921,1924],{},[351,1914,1915],{},"Non-triviality",[351,1917,1918],{},"82.3%",[351,1920,1904],{},[351,1922,1923],{},"flashlite-high (87.6%)",[351,1925,1926],{},"+5.3pp",[336,1928,1929,1932,1935,1940,1943],{},[351,1930,1931],{},"Blueprint Coverage",[351,1933,1934],{},"41.5%",[351,1936,1937],{},[296,1938,1939],{},"6\u002F8",[351,1941,1942],{},"5.4 (72.3%)",[351,1944,1945],{},"+30.8pp",[336,1947,1948,1951,1954,1956,1959],{},[351,1949,1950],{},"Relevance",[351,1952,1953],{},"97.5%",[351,1955,1904],{},[351,1957,1958],{},"5.4 (98.9%)",[351,1960,1961],{},"+1.4pp",[336,1963,1964,1967,1970,1973,1976],{},[351,1965,1966],{},"Distractor Quality",[351,1968,1969],{},"84.1%",[351,1971,1972],{},"2\u002F8",[351,1974,1975],{},"gem31pro (84.3%)",[351,1977,1978],{},"+0.2pp",[336,1980,1981,1984,1987,1991,1994],{},[351,1982,1983],{},"Bloom's Depth",[351,1985,1986],{},"34.7%",[351,1988,1989],{},[296,1990,1939],{},[351,1992,1993],{},"gem31pro 
(37.4%)",[351,1995,1996],{},"+2.7pp",[336,1998,1999,2002,2005,2010,2013],{},[351,2000,2001],{},"Difficulty",[351,2003,2004],{},"35.7%",[351,2006,2007],{},[296,2008,2009],{},"8\u002F8",[351,2011,2012],{},"gem31pro (42.3%)",[351,2014,2015],{},"+6.6pp",[336,2017,2018,2021,2024,2027,2030],{},[351,2019,2020],{},"Uniqueness",[351,2022,2023],{},"91.0%",[351,2025,2026],{},"3\u002F8",[351,2028,2029],{},"flashlite-low (96.0%)",[351,2031,2032],{},"+5.0pp",[336,2034,2035,2038,2041,2045,2048],{},[351,2036,2037],{},"Length Balance",[351,2039,2040],{},"23.2%",[351,2042,2043],{},[296,2044,2009],{},[351,2046,2047],{},"nano-high (38.5%)",[351,2049,2050],{},"+15.3pp",[293,2052,2053],{},"Dead last on difficulty and length balance. Near-bottom on coverage and Bloom's depth. Strong on distractor quality and uniqueness. Mediocre on everything else.",[293,2055,2056],{},[440,2057],{"alt":2058,"src":2059},"Baseline vs alternatives","charts\u002Fbaseline_radar.png",[445,2061,2063],{"id":2062},"easy-gain-1-switch-from-topic-subtopic-chunking-free-no-model-change","Easy gain #1 — switch from topic → subtopic chunking (free, no model change)",[293,2065,2066],{},"Same model, near-identical speed (+1.1s), but ~61% higher generation cost ($0.0164 → $0.0265 per run). 
Just change the chunking parameter.",[330,2068,2069,2083],{},[333,2070,2071],{},[336,2072,2073,2075,2077,2080],{},[339,2074,1879],{},[339,2076,1861],{},[339,2078,2079],{},"flash (subtopic)",[339,2081,2082],{},"Delta",[346,2084,2085,2099,2111,2127,2139,2151],{},[336,2086,2087,2089,2091,2094],{},[351,2088,1931],{},[351,2090,1934],{},[351,2092,2093],{},"50.7%",[351,2095,2096],{},[296,2097,2098],{},"+9.2pp",[336,2100,2101,2103,2105,2108],{},[351,2102,1915],{},[351,2104,1918],{},[351,2106,2107],{},"83.0%",[351,2109,2110],{},"+0.8pp",[336,2112,2113,2116,2119,2122],{},[351,2114,2115],{},"Questions per gen",[351,2117,2118],{},"42.6",[351,2120,2121],{},"68.7",[351,2123,2124],{},[296,2125,2126],{},"+26.1",[336,2128,2129,2131,2133,2136],{},[351,2130,2020],{},[351,2132,2023],{},[351,2134,2135],{},"87.3%",[351,2137,2138],{},"-3.7pp",[336,2140,2141,2143,2145,2148],{},[351,2142,1966],{},[351,2144,1969],{},[351,2146,2147],{},"83.5%",[351,2149,2150],{},"-0.6pp",[336,2152,2153,2156,2159,2162],{},[351,2154,2155],{},"Generation time",[351,2157,2158],{},"20.7s",[351,2160,2161],{},"21.8s",[351,2163,2164],{},"+1.1s",[293,2166,2167],{},"Subtopic chunking gives the model smaller, more focused chunks → +9.2pp coverage and +61% more questions. The uniqueness drop (-3.7pp) is the tradeoff — more questions from the same material means more overlap risk — but 87% is still solid. Essentially free lunch.",[445,2169,2171],{"id":2170},"easy-gain-2-length-balancing-rewrite-pass","Easy gain #2 — length-balancing rewrite pass",[293,2173,2174],{},"23.2% length balance is the worst score across all metrics for the baseline. The generation prompt already says \"match answer lengths\" — models ignore it. You can't fix this with pure string manipulation — trimming \"The mitochondria converts glucose into ATP through oxidative phosphorylation\" to match \"Nucleus\" destroys the content. 
This needs a cheap LLM rewrite pass (nano-tier model, just rewording distractors to roughly match the correct answer's length) or a self-check step where the generator reviews and rewrites its own choices before emission. Every variant suffers here (best is 38.5%), so this fix benefits any model choice.",[445,2176,2178],{"id":2177},"easy-gain-3-tf-balance-enforcement-pure-code","Easy gain #3 — T\u002FF balance enforcement (pure code)",[293,2180,2181,2182,2184],{},"81.4% T\u002FF balance is fine but could be better. During generation, count the True\u002FFalse split. If it drifts outside ",[1816,2183,1818],{},", flip some question polarities (\"X is true\" → \"X is false\" with the answer inverted). Pure post-processing, no model cost.",[445,2186,2188],{"id":2187},"medium-effort-upgrade-to-gem31pro","Medium effort — upgrade to gem31pro",[330,2190,2191,2204],{},[333,2192,2193],{},[336,2194,2195,2197,2199,2202],{},[339,2196,1879],{},[339,2198,1861],{},[339,2200,2201],{},"gem31pro",[339,2203,2082],{},[346,2205,2206,2220,2233,2245,2256,2267,2279],{},[336,2207,2208,2210,2212,2215],{},[351,2209,1931],{},[351,2211,1934],{},[351,2213,2214],{},"60.1%",[351,2216,2217],{},[296,2218,2219],{},"+18.6pp",[336,2221,2222,2224,2226,2229],{},[351,2223,2001],{},[351,2225,2004],{},[351,2227,2228],{},"42.3%",[351,2230,2231],{},[296,2232,2015],{},[336,2234,2235,2237,2239,2242],{},[351,2236,1915],{},[351,2238,1918],{},[351,2240,2241],{},"85.9%",[351,2243,2244],{},"+3.6pp",[336,2246,2247,2249,2251,2254],{},[351,2248,1983],{},[351,2250,1986],{},[351,2252,2253],{},"37.4%",[351,2255,1996],{},[336,2257,2258,2260,2262,2265],{},[351,2259,1898],{},[351,2261,1901],{},[351,2263,2264],{},"95.1%",[351,2266,1910],{},[336,2268,2269,2271,2273,2276],{},[351,2270,2115],{},[351,2272,2118],{},[351,2274,2275],{},"71.4",[351,2277,2278],{},"+28.8",[336,2280,2281,2283,2285,2288],{},[351,2282,2155],{},[351,2284,2158],{},[351,2286,2287],{},"91.8s",[351,2289,2290],{},[296,2291,2292],{},"+71.1s 
(4.4×)",[293,2294,2295],{},"Gains across the board — coverage, difficulty, triviality, correctness all improve. But 4.4× slower and more expensive. Could work as a premium tier or for high-stakes decks only.",[445,2297,2299],{"id":2298},"interesting-outlier-the-overgenerate-then-filter-family-54-nano","Interesting outlier — the overgenerate-then-filter family (5.4, nano)",[293,2301,2302],{},"Both 5.4 and nano generate far more questions than flash — but a lot of them are duplicates. The question is whether the raw volume + dedup beats flash's smaller-but-cleaner output.",[330,2304,2305,2322],{},[333,2306,2307],{},[336,2308,2309,2311,2313,2316,2319],{},[339,2310,1879],{},[339,2312,1861],{},[339,2314,2315],{},"nano",[339,2317,2318],{},"nano-high",[339,2320,2321],{},"5.4",[346,2323,2324,2341,2358,2374,2401,2418,2435],{},[336,2325,2326,2328,2330,2333,2336],{},[351,2327,1931],{},[351,2329,1934],{},[351,2331,2332],{},"59.3%",[351,2334,2335],{},"64.8%",[351,2337,2338],{},[296,2339,2340],{},"72.3%",[336,2342,2343,2345,2349,2352,2355],{},[351,2344,2020],{},[351,2346,2347],{},[296,2348,2023],{},[351,2350,2351],{},"61.5%",[351,2353,2354],{},"70.4%",[351,2356,2357],{},"60.2%",[336,2359,2360,2363,2365,2368,2371],{},[351,2361,2362],{},"Raw questions",[351,2364,2118],{},[351,2366,2367],{},"113.0",[351,2369,2370],{},"104.4",[351,2372,2373],{},"158.3",[336,2375,2376,2381,2386,2391,2396],{},[351,2377,2378],{},[296,2379,2380],{},"Effective unique 
qs",[351,2382,2383],{},[296,2384,2385],{},"39",[351,2387,2388],{},[296,2389,2390],{},"69",[351,2392,2393],{},[296,2394,2395],{},"73",[351,2397,2398],{},[296,2399,2400],{},"95",[336,2402,2403,2405,2409,2412,2415],{},[351,2404,1966],{},[351,2406,2407],{},[296,2408,1969],{},[351,2410,2411],{},"74.0%",[351,2413,2414],{},"73.5%",[351,2416,2417],{},"74.1%",[336,2419,2420,2422,2426,2429,2432],{},[351,2421,1915],{},[351,2423,2424],{},[296,2425,1918],{},[351,2427,2428],{},"75.6%",[351,2430,2431],{},"76.0%",[351,2433,2434],{},"78.8%",[336,2436,2437,2439,2443,2446,2449],{},[351,2438,2155],{},[351,2440,2441],{},[296,2442,2158],{},[351,2444,2445],{},"76.3s",[351,2447,2448],{},"232.1s",[351,2450,2451],{},"63.1s",[293,2453,2454],{},"Even after dedup, nano-none produces ~69 effective unique questions vs flash-topic's 39 — and covers +17.9pp more learning objectives. But the quality gap is real: -10pp distractor quality, -6.7pp triviality. Nano produces more questions, covering more material, but those questions are easier to game.",[293,2456,2457],{},"5.4 has the same quality problems plus costs more. Its advantage over nano is pure coverage volume (+13pp), but nano gets most of the coverage gain at a fraction of 5.4's cost.",[293,2459,2460,2463],{},[296,2461,2462],{},"The interesting experiment:"," nano + deduplication + distractor-rewrite pass. Generate with nano's volume (cheap, fast-ish), dedup the lazy duplicates, then run a quick rewrite pass on the surviving questions to fix distractor length\u002Fquality. 
If the rewrite pass is cheap (and it should be — it's reformulating existing distractors, not generating new knowledge), this pipeline could combine nano's coverage with flash-level quality.",[445,2465,2467],{"id":2466},"priority-stack","Priority stack",[1186,2469,2470,2476,2482,2488,2494],{},[401,2471,2472,2475],{},[296,2473,2474],{},"Switch to subtopic chunking"," — free, +9pp coverage, ship today",[401,2477,2478,2481],{},[296,2479,2480],{},"Add length-balancing rewrite pass"," — cheap LLM call, fixes the worst metric across all variants",[401,2483,2484,2487],{},[296,2485,2486],{},"Add T\u002FF balance enforcement"," — pure code, minor but free",[401,2489,2490,2493],{},[296,2491,2492],{},"Evaluate gem31pro for premium tier"," — big quality jump, need to price the latency cost",[401,2495,2496,2499],{},[296,2497,2498],{},"Test nano + dedup + rewrite pipeline"," — nano's coverage at ~1\u002F6th the cost of 5.4, with a quality cleanup pass on top",[304,2501],{},[307,2503,2505],{"id":2504},"language-effects","Language effects",[293,2507,2508],{},"The eval set covers 4 languages: English (826 evals), German (368), French (289), Dutch (176).",[445,2510,2512],{"id":2511},"dutch-correctness-drops-across-the-board","Dutch correctness drops across the board",[293,2514,2515],{},"Dutch correctness averages 88.8% vs 94.6% for English — a -5.8pp gap that holds across every variant (85–93% Dutch vs 92–95% English) and both judges (GPT-5.4: 87% vs 95%, Gemini: 90% vs 95%). This is a real generator problem, not judge noise — models produce more incorrect answers in Dutch. Only 4 Dutch decks though, so small sample.",[445,2517,2519],{"id":2518},"judges-are-lenient-on-non-english-triviality-and-distractor-quality","Judges are lenient on non-English triviality and distractor quality",[293,2521,2522,2523,643,2526,2529],{},"Both judges consistently rate French and German as ",[532,2524,2525],{},"less trivial",[532,2527,2528],{},"better distractors"," than English. 
French non-triviality: 90% vs English 85% (GPT-5.4), 86% vs 74% (Gemini). The same pattern holds for German.",[293,2531,2532,2533,2536],{},"This is almost certainly a judge blind spot — catching \"common knowledge\" giveaways and grammar cues is harder in a language the judge is less fluent in. A French student would spot a giveaway that neither judge flags. This means ",[296,2534,2535],{},"non-English triviality and distractor quality scores are probably inflated"," — the real quality gap between languages may be larger than the numbers show.",[293,2538,2539],{},[440,2540],{"alt":2541,"src":2542},"Language effects — correctness drop and judge leniency","charts\u002Flanguage_effects.png",[304,2544],{},[307,2546,2548],{"id":2547},"cost-analysis","Cost analysis",[293,2550,2551,2552,2555,2556,2559],{},"Actual costs from ",[312,2553,2554],{},"llm_usage_logs"," (generation) and pipeline ",[312,2557,2558],{},"eval_cost"," fields (evaluation). Covers all 830 generation runs, each scored by 2 judges (1,659 evals in total).",[445,2561,2563],{"id":2562},"generation-cost-per-variant","Generation cost per variant",[330,2565,2566,2585],{},[333,2567,2568],{},[336,2569,2570,2573,2576,2579,2582],{},[339,2571,2572],{},"Variant",[339,2574,2575],{},"$\u002Frun",[339,2577,2578],{},"Total",[339,2580,2581],{},"Runs",[339,2583,2584],{},"Notes",[346,2586,2587,2604,2621,2639,2655,2671,2687,2703],{},[336,2588,2589,2592,2595,2598,2601],{},[351,2590,2591],{},"flashlite-low",[351,2593,2594],{},"$0.0042",[351,2596,2597],{},"$0.48",[351,2599,2600],{},"113",[351,2602,2603],{},"Cheapest. 
No reasoning tokens.",[336,2605,2606,2609,2612,2615,2618],{},[351,2607,2608],{},"flashlite-high",[351,2610,2611],{},"$0.0068",[351,2613,2614],{},"$0.76",[351,2616,2617],{},"112",[351,2619,2620],{},"+62% over low — reasoning effort bump",[336,2622,2623,2625,2628,2631,2633],{},[351,2624,1861],{},[351,2626,2627],{},"$0.0164",[351,2629,2630],{},"$1.84",[351,2632,2617],{},[351,2634,2635,2638],{},[296,2636,2637],{},"Baseline."," 2.4× flashlite-high",[336,2640,2641,2643,2646,2649,2652],{},[351,2642,2315],{},[351,2644,2645],{},"$0.0184",[351,2647,2648],{},"$1.95",[351,2650,2651],{},"106",[351,2653,2654],{},"Comparable to flash-topic",[336,2656,2657,2660,2663,2666,2668],{},[351,2658,2659],{},"flash",[351,2661,2662],{},"$0.0265",[351,2664,2665],{},"$2.97",[351,2667,2617],{},[351,2669,2670],{},"Same model, subtopic chunks = +61% cost",[336,2672,2673,2675,2678,2681,2684],{},[351,2674,2318],{},[351,2676,2677],{},"$0.0472",[351,2679,2680],{},"$3.02",[351,2682,2683],{},"64",[351,2685,2686],{},"Reasoning effort = 2.6× nano",[336,2688,2689,2691,2694,2697,2700],{},[351,2690,2321],{},[351,2692,2693],{},"$0.1081",[351,2695,2696],{},"$11.35",[351,2698,2699],{},"105",[351,2701,2702],{},"6.6× baseline",[336,2704,2705,2707,2712,2717,2719],{},[351,2706,2201],{},[351,2708,2709],{},[296,2710,2711],{},"$0.4457",[351,2713,2714],{},[296,2715,2716],{},"$47.25",[351,2718,2651],{},[351,2720,2721,2724],{},[296,2722,2723],{},"27× baseline."," Reasoning tokens dominate.",[293,2726,2727,2730],{},[296,2728,2729],{},"Total generation cost: $69.61"," across all 830 runs.",[293,2732,2733],{},"gem31pro is in a different universe — $0.45\u002Frun vs $0.016 for the baseline. Its coverage lead (+18.6pp) costs 27× more per generation. 
Nano gets most of that coverage gain (+17.9pp) at 1.1× the baseline cost — more than an order of magnitude better cost-efficiency.",[445,2735,2737],{"id":2736},"evaluation-cost","Evaluation cost",[330,2739,2740,2755],{},[333,2741,2742],{},[336,2743,2744,2747,2749,2752],{},[339,2745,2746],{},"Judge",[339,2748,2578],{},[339,2750,2751],{},"Evals",[339,2753,2754],{},"$\u002Feval",[346,2756,2757,2771,2785],{},[336,2758,2759,2762,2765,2768],{},[351,2760,2761],{},"Gemini 3.1 Pro",[351,2763,2764],{},"$185.85",[351,2766,2767],{},"830",[351,2769,2770],{},"$0.224",[336,2772,2773,2776,2779,2782],{},[351,2774,2775],{},"GPT-5.4",[351,2777,2778],{},"$40.06",[351,2780,2781],{},"829",[351,2783,2784],{},"$0.048",[336,2786,2787,2791,2796,2801],{},[351,2788,2789],{},[296,2790,2578],{},[351,2792,2793],{},[296,2794,2795],{},"$225.91",[351,2797,2798],{},[296,2799,2800],{},"1,659",[351,2802,2803],{},"$0.136",[293,2805,2806],{},"Gemini is 4.6× more expensive per eval than GPT-5.4. Since both judges preserve variant rankings on the metrics that matter (coverage ρ=0.98, triviality ρ=0.95), the obvious move for future evals is to drop to a single GPT-5.4 judge and cut eval cost by ~80%.",[445,2808,2810],{"id":2809},"total-pipeline-cost","Total pipeline cost",[330,2812,2813,2823],{},[333,2814,2815],{},[336,2816,2817,2820],{},[339,2818,2819],{},"Component",[339,2821,2822],{},"Cost",[346,2824,2825,2833,2840],{},[336,2826,2827,2830],{},[351,2828,2829],{},"Generation (830 runs)",[351,2831,2832],{},"$69.61",[336,2834,2835,2838],{},[351,2836,2837],{},"Evaluation (1,659 evals)",[351,2839,2795],{},[336,2841,2842,2846],{},[351,2843,2844],{},[296,2845,2578],{},[351,2847,2848],{},[296,2849,2850],{},"$295.52",[293,2852,2853],{},"Eval cost is 3.2× generation cost. 
For future experiments: single judge + fewer reps (2 instead of 3) would cut eval cost to ~$27 while preserving ranking accuracy.",[445,2855,2857],{"id":2856},"cost-efficiency-where-to-spend-the-next-dollar","Cost-efficiency: where to spend the next dollar",[293,2859,2860],{},"The question isn't \"which model is best\" — it's \"which model gives the most quality per dollar.\"",[330,2862,2863,2881],{},[333,2864,2865],{},[336,2866,2867,2869,2872,2875,2878],{},[339,2868,2572],{},[339,2870,2871],{},"Gen $\u002Frun",[339,2873,2874],{},"Coverage",[339,2876,2877],{},"Triviality",[339,2879,2880],{},"Coverage per $0.01",[346,2882,2883,2897,2911,2925,2939,2953],{},[336,2884,2885,2887,2890,2892,2894],{},[351,2886,2315],{},[351,2888,2889],{},"$0.018",[351,2891,2332],{},[351,2893,2428],{},[351,2895,2896],{},"32.2pp",[336,2898,2899,2901,2904,2906,2908],{},[351,2900,1861],{},[351,2902,2903],{},"$0.016",[351,2905,1934],{},[351,2907,1918],{},[351,2909,2910],{},"25.3pp",[336,2912,2913,2915,2918,2920,2922],{},[351,2914,2318],{},[351,2916,2917],{},"$0.047",[351,2919,2335],{},[351,2921,2431],{},[351,2923,2924],{},"13.7pp",[336,2926,2927,2929,2932,2934,2936],{},[351,2928,2659],{},[351,2930,2931],{},"$0.027",[351,2933,2093],{},[351,2935,2107],{},[351,2937,2938],{},"19.2pp",[336,2940,2941,2943,2946,2948,2950],{},[351,2942,2321],{},[351,2944,2945],{},"$0.108",[351,2947,2340],{},[351,2949,2434],{},[351,2951,2952],{},"6.7pp",[336,2954,2955,2957,2960,2962,2964],{},[351,2956,2201],{},[351,2958,2959],{},"$0.446",[351,2961,2214],{},[351,2963,2241],{},[351,2965,2966],{},"1.3pp",[293,2968,2969,2970,2973],{},"Nano delivers the best coverage-per-dollar by a wide margin. gem31pro's coverage is actually ",[532,2971,2972],{},"lower"," than 5.4 despite costing 4× more — its strength is triviality and difficulty, not coverage. 
The interesting frontier is nano + quality cleanup pass: nano's coverage volume at nano's price, with a cheap rewrite pass to fix the distractor\u002Ftriviality gap.",[293,2975,2976,2980],{},[440,2977],{"alt":2978,"src":2979},"Cost breakdown — generation vs evaluation","charts\u002Fcost_breakdown.png",[440,2981],{"alt":2982,"src":2983},"Cost efficiency — quality per dollar","charts\u002Fcost_efficiency.png",{"title":430,"searchDepth":2985,"depth":2985,"links":2986},2,[2987,2992,2996,2998,3002,3006,3008,3010,3012,3018,3027,3031],{"id":309,"depth":2985,"text":2988,"children":2989},"1. quiz_correctness — source-grounded answer verification",[2990],{"id":447,"depth":2991,"text":448},3,{"id":652,"depth":2985,"text":2993,"children":2994},"2. quiz_triviality — can you solve it without studying?",[2995],{"id":777,"depth":2991,"text":778},{"id":909,"depth":2985,"text":2997},"3. blueprint_coverage — per-chunk learning-objective coverage",{"id":998,"depth":2985,"text":2999,"children":3000},"4. quiz_relevance — subject matter vs. admin trivia",[3001],{"id":1061,"depth":2991,"text":778},{"id":1165,"depth":2985,"text":3003,"children":3004},"5. quiz_distractor_quality — are the wrong answers good fakes?",[3005],{"id":1233,"depth":2991,"text":778},{"id":1344,"depth":2985,"text":3007},"6. blooms_score — how deep do the questions go?",{"id":1471,"depth":2985,"text":3009},"7. difficulty_good_rate — right level for the audience?",{"id":1588,"depth":2985,"text":3011},"8. 
uniqueness_rate — does the quiz ask the same thing twice?",{"id":1681,"depth":2985,"text":1682,"children":3013},[3014,3016],{"id":1690,"depth":2991,"text":3015},"quiz_structural — pipeline parse health",{"id":1754,"depth":2991,"text":3017},"quiz_structural_bias — surface-cue giveaways",{"id":1852,"depth":2985,"text":1853,"children":3019},[3020,3021,3022,3023,3024,3025,3026],{"id":1869,"depth":2991,"text":1870},{"id":2062,"depth":2991,"text":2063},{"id":2170,"depth":2991,"text":2171},{"id":2177,"depth":2991,"text":2178},{"id":2187,"depth":2991,"text":2188},{"id":2298,"depth":2991,"text":2299},{"id":2466,"depth":2991,"text":2467},{"id":2504,"depth":2985,"text":2505,"children":3028},[3029,3030],{"id":2511,"depth":2991,"text":2512},{"id":2518,"depth":2991,"text":2519},{"id":2547,"depth":2985,"text":2548,"children":3032},[3033,3034,3035,3036],{"id":2562,"depth":2991,"text":2563},{"id":2736,"depth":2991,"text":2737},{"id":2809,"depth":2991,"text":2810},{"id":2856,"depth":2991,"text":2857},"Date: 2026-04-24","md",null,{},true,{"title":116,"description":3037},"apps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-04-24-quiz-eval-metrics","GDNzI7OKCIFYburmKeAtNwrqFsCXopdJxFsM6TVpgvI",1779007962950]