[{"data":1,"prerenderedAt":507},["ShallowReactive",2],{"repo-tree":3,"repo-\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-05-01-quiz-eval-current-state":283},[4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,64,67,69,72,75,78,81,84,86,88,90,93,96,99,102,105,108,111,114,117,120,123,125,127,129,131,133,135,138,141,143,146,149,152,155,158,161,164,167,169,172,175,178,180,183,186,189,192,195,198,201,203,206,209,212,215,218,221,224,227,230,233,236,239,242,245,248,251,254,257,260,263,266,269,272,275,278,281],{"path":5,"title":6},"\u002Fagents\u002Fbackend-code-style","Backend Conventions",{"path":8,"title":9},"\u002Fagents\u002Fdatabase","Database",{"path":11,"title":12},"\u002Fagents\u002Fportal-code-style","Portal Conventions",{"path":14,"title":15},"\u002Fagents\u002Ftranslation","Translation",{"path":17,"title":18},"\u002Fconventions\u002Fbackend-coding","Backend coding conventions",{"path":20,"title":21},"\u002Fconventions\u002Ffrontend-coding","Frontend coding conventions",{"path":23,"title":24},"\u002Fdevelopment-process","Development process",{"path":26,"title":27},"\u002Flearning-api-preview-hetzner-setup","Learning API Preview on Hetzner + Cloudflare",{"path":29,"title":30},"\u002Flearning-api-preview-vm-plan","Learning API Preview VM Plan",{"path":32,"title":33},"\u002Fmonorepo-structure","Monorepo structure",{"path":35,"title":36},"\u002Foperations","Operations — bugs and support",{"path":38,"title":39},"\u002Fpostmortems\u002F2026-03-16_onboarding-currency-regression","Onboarding Zod transform silently broken — web signups assigned wrong checkout currency",{"path":41,"title":42},"\u002Fpostmortems\u002Freadme","Postmortems",{"path":44,"title":45},"\u002Fpostmortems\u002F_template","TEMPLATE",{"path":47,"title":48},"\u002Fpostmortems\u002Fposthog-comparison","Postmortem practice — comparison with PostHog",{"path":50,"title":51},"\u002Fpreview-environment-plan","Preview Environment Plan",{"path":53,"title":54},"\u002Fprinciples","Engineering principles",{"path":56,"title":57},"\u002Fworking-with-ai","Working with AI",{"path":59,"title":60},"\u002F.claude\u002Fskills\u002Feval-playground\u002Fskill","Eval Playground — Co-development Skill",{"path":62,"title":63},"\u002F.claude\u002Fskills\u002Ffigma-diff-section\u002Fskill","Figma Diff Section Pipeline",{"path":65,"title":66},"\u002Fagents","AGENTS.md",{"path":68,"title":66},"\u002Fclaude",{"path":70,"title":71},"\u002Freadme","Studyflash",{"path":73,"title":74},"\u002Fapps\u002Fcore-api\u002Fagents","Core API (apps\u002Fcore-api)",{"path":76,"title":77},"\u002Fapps\u002Fcore-api\u002Freadme","README",{"path":79,"title":80},"\u002Fapps\u002Femail-previews\u002Fagents","Email Previews (apps\u002Femail-previews)",{"path":82,"title":83},"\u002Fapps\u002Flanding-page\u002Fagents","Landing Page (apps\u002Flanding-page)",{"path":85,"title":83},"\u002Fapps\u002Flanding-page\u002Fclaude",{"path":87,"title":66},"\u002Fapps\u002Flearning-api\u002Fagents",{"path":89,"title":77},"\u002Fapps\u002Flearning-api\u002Freadme",{"path":91,"title":92},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Feval_metrics_design","Surface-Specific Eval Metrics Design",{"path":94,"title":95},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Ftest_set","Quiz Eval Test Set",{"path":97,"title":98},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Ffrontend\u002Freadme","React + TypeScript + Vite",{"path":100,"title":101},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fknown-issues\u002Fcontent-pillar-shallow-coverage\u002Freadme","Content pillar misses subtopics in dense documents",{"path":103,"title":104},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fknown-issues\u002Fdocling-empty-section-headers\u002Freadme","Empty section headers dropped by docling chunker",{"path":106,"title":107},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fknown-issues\u002Fdocling-table-reading-order\u002Freadme","Table\u002Fbox layout causes wrong reading order",{"path":109,"title":110},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fmetrics\u002Freadme","Quiz eval metrics — canonical rubrics",{"path":112,"title":113},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-04-12-quiz-summary-feedback-current-state","Quiz and Summary Feedback Current State",{"path":115,"title":116},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-04-24-quiz-eval-metrics","Quiz Evaluation Metrics",{"path":118,"title":119},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-05-01-quiz-eval-current-state","Quiz Eval Current State",{"path":121,"title":122},"\u002Fapps\u002Flearning-api\u002Fmonitoring\u002Freadme","Monitoring Stack",{"path":124,"title":77},"\u002Fapps\u002Flearning-api\u002Fshared\u002Freadme",{"path":126,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fflashcard_agent\u002Freadme",{"path":128,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fingestion_agent\u002Freadme",{"path":130,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fquiz_agent\u002Freadme",{"path":132,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fsummary_agent\u002Freadme",{"path":134,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Fparser\u002Freadme",{"path":136,"title":137},"\u002Fapps\u002Fmarketing-emails-preview\u002Fagents","Marketing Emails Preview (apps\u002Fmarketing-emails-preview)",{"path":139,"title":140},"\u002Fapps\u002Fmobile-app\u002Fagents","StudyFlash Mobile App - Claude Code Configuration",{"path":142,"title":140},"\u002Fapps\u002Fmobile-app\u002Fclaude",{"path":144,"title":145},"\u002Fapps\u002Fmountain-max\u002Fagents","Mountain Max (apps\u002Fmountain-max)",{"path":147,"title":148},"\u002Fapps\u002Fmountain-max\u002Fgame\u002Freadme","Mountain Max Game",{"path":150,"title":151},"\u002Fapps\u002Fportal\u002Fagents","Portal (apps\u002Fportal)",{"path":153,"title":154},"\u002Fapps\u002Fportal\u002Freadme","Nuxt Minimal Starter",{"path":156,"title":157},"\u002Fapps\u002Fportal\u002Fapp\u002Fcomposables\u002Ffiles\u002Freadme","File Upload Composables",{"path":159,"title":160},"\u002Fapps\u002Fportal\u002Fdocs\u002Flibrary-routing","Library Routing Documentation",{"path":162,"title":163},"\u002Fapps\u002Fsupabase\u002Fagents","Supabase (apps\u002Fsupabase)",{"path":165,"title":166},"\u002Fapps\u002Fwrapped\u002Fagents","Wrapped (apps\u002Fwrapped)",{"path":168,"title":98},"\u002Fapps\u002Fwrapped\u002Freadme",{"path":170,"title":171},"\u002Finfra\u002Freadme","infra\u002F",{"path":173,"title":174},"\u002Finfra\u002Fdns\u002Freadme","DNS Infrastructure",{"path":176,"title":177},"\u002Finfra\u002Fdokploy\u002Freadme","studyflash-dokploy",{"path":179,"title":77},"\u002Finfra\u002Fdokploy\u002Fsdk\u002Fnodejs\u002Freadme",{"path":181,"title":182},"\u002Finfra\u002Finfisical\u002Freadme","Infisical Infrastructure",{"path":184,"title":185},"\u002Finfra\u002Flearning-api\u002Freadme","Pulumi GCP TypeScript Template",{"path":187,"title":188},"\u002Finfra\u002Fopenreplay\u002Freadme","OpenReplay on Hetzner",{"path":190,"title":191},"\u002Finfra\u002Fscripts\u002Freadme","infra\u002Fscripts\u002F",{"path":193,"title":194},"\u002Finfra\u002Fturborepo-cache\u002Freadme","Turborepo Remote Cache Infrastructure",{"path":196,"title":197},"\u002Finternal\u002Fchatwoot\u002Freadme","Chatwoot Infrastructure",{"path":199,"title":200},"\u002Finternal\u002Fchatwoot\u002Fprovider\u002Freadme","studyflash-chatwoot-provider",{"path":202,"title":77},"\u002Finternal\u002Fchatwoot\u002Fprovider\u002Fsdk\u002Fnodejs\u002Freadme",{"path":204,"title":205},"\u002Finternal\u002Fdocs\u002Freadme","internal\u002Fdocs",{"path":207,"title":208},"\u002Finternal\u002Fsupport-bot\u002Fclaude","Support Bot (Maximilian)",{"path":210,"title":211},"\u002Finternal\u002Fsupport-bot\u002Freadme","Studyflash Customer Support Bot (Maximilian)",{"path":213,"title":214},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Faccount_issues","Account Issues",{"path":216,"title":217},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fbilling_invoice","Billing Invoice",{"path":219,"title":220},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fcontent_upload","Content Upload",{"path":222,"title":223},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fdata_loss","Data Loss",{"path":225,"title":226},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fflashcard_issues","Flashcard Issues",{"path":228,"title":229},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fgarbage","Garbage",{"path":231,"title":232},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fgeneral_how_to","General How To",{"path":234,"title":235},"\u002Finternal\u002Fsupport-bot\u002Fkb","Knowledge Base Index",{"path":237,"title":238},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Flanguage_issues","Language Issues",{"path":240,"title":241},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fmindmap_issues","Mindmap Issues",{"path":243,"title":244},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fmisunderstanding","Misunderstanding",{"path":246,"title":247},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fmock_exam_issues","Mock Exam Issues",{"path":249,"title":250},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fpodcast_issues","Podcast Issues",{"path":252,"title":253},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fquiz_issues","Quiz Issues",{"path":255,"title":256},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Frefund_request","Refund Request",{"path":258,"title":259},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fsubscription_cancellation","Subscription Cancellation",{"path":261,"title":262},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fsubscription_info","Subscription Info",{"path":264,"title":265},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fsummary_issues","Summary Issues",{"path":267,"title":268},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Ftechnical_errors","Technical Errors",{"path":270,"title":271},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fvideo_issues","Video Issues",{"path":273,"title":274},"\u002Fpackages\u002Fcommon\u002Fdocs\u002Fearly-access-features","Declarative Early Access Features",{"path":276,"title":277},"\u002Fpackages\u002Fcommon\u002Fscripts\u002Freadme","Common Package Scripts",{"path":279,"title":280},"\u002Fpackages\u002Fdevtools\u002Ffigma-plugins\u002Freadme","Figma plugins",{"path":282,"title":77},"\u002Fpackages\u002Fpulumi-infisical\u002Freadme",{"id":284,"title":119,"body":285,"description":499,"extension":500,"lastReviewed":501,"meta":502,"navigation":503,"owner":501,"path":118,"seo":504,"status":501,"stem":505,"tags":501,"__hash__":506},"repo\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-05-01-quiz-eval-current-state.md",{"type":286,"value":287,"toc":491},"minimark",[288,292,304,309,317,333,340,344,347,368,371,377,403,407,410,420,423,440,447,451,454,457,475,478,481,485,488],[289,290,119],"h1",{"id":291},"quiz-eval-current-state",[293,294,295,299,300,303],"p",{},[296,297,298],"strong",{},"Date:"," 2026-05-01\n",[296,301,302],{},"Purpose:"," Durable summary of the current quiz eval baseline, retained reports, and immediate product implications.",[305,306,308],"h2",{"id":307},"what-this-consolidation-keeps","What This Consolidation Keeps",[293,310,311,312,316],{},"This state keeps the parts that are useful on ",[313,314,315],"code",{},"main",":",[318,319,320,324,327,330],"ul",{},[321,322,323],"li",{},"quiz eval framework and cross-check workflows",[321,325,326],{},"clean reports, charts, and dashboard",[321,328,329],{},"the latest April 24 report summarizing the 39-deck eval run",[321,331,332],{},"current-state feedback summary for quiz and summary complaints",[293,334,335,336,339],{},"It intentionally does ",[296,337,338],{},"not"," keep raw result JSON, raw logs, generated question dumps, binary test-set documents, broad experiment scratch files, or unrelated mobile\u002Finfra changes that came along for the ride in stacked branches.",[305,341,343],{"id":342},"baseline","Baseline",[293,345,346],{},"The production baseline evaluated in the April reports is:",[318,348,349,355,362,365],{},[321,350,351,352],{},"model: ",[313,353,354],{},"gemini-2.5-flash",[321,356,357,358,361],{},"chunking: topic-level merge, called ",[313,359,360],{},"flash-topic"," in the reports",[321,363,364],{},"prompt: production quiz generation prompt",[321,366,367],{},"no extra post-generation rewrite pass",[293,369,370],{},"The baseline is reliable on core correctness and distractor quality, but weak on coverage, cognitive depth, and structural answer cues.",[293,372,373,374,316],{},"Key baseline findings from ",[313,375,376],{},"2026-04-24-quiz-eval-metrics.md",[318,378,379,382,385,388,391,394,397,400],{},[321,380,381],{},"correctness: 93.5%",[321,383,384],{},"non-triviality: 82.3%",[321,386,387],{},"blueprint coverage: 41.5%",[321,389,390],{},"distractor quality: 84.1%",[321,392,393],{},"Bloom's depth: 34.7%",[321,395,396],{},"difficulty: 35.7%",[321,398,399],{},"uniqueness: 91.0%",[321,401,402],{},"length balance: 23.2%",[305,404,406],{"id":405},"latest-eval-report","Latest Eval Report",[293,408,409],{},"The latest retained report is:",[411,412,418],"pre",{"className":413,"code":415,"language":416,"meta":417},[414],"language-text","reports\u002F2026-04-24-quiz-eval-metrics.md\n","text","",[313,419,415],{"__ignoreMap":417},[293,421,422],{},"It summarizes the final April 23 pipeline run:",[318,424,425,428,431,434,437],{},[321,426,427],{},"39 decks",[321,429,430],{},"8 generator variants",[321,432,433],{},"2 judges",[321,435,436],{},"1,659 evals",[321,438,439],{},"cost data from generation and eval runs",[293,441,442,443,446],{},"Raw result JSON files are generated artifacts and are intentionally gitignored under ",[313,444,445],{},"reports\u002F*.json",".",[305,448,450],{"id":449},"current-best-understanding","Current Best Understanding",[293,452,453],{},"The biggest immediate product win is not a model swap. It is changing how quiz generation chunks the source material.",[293,455,456],{},"Priority stack:",[458,459,460,463,466,469,472],"ol",{},[321,461,462],{},"switch from topic-level to subtopic-level chunking",[321,464,465],{},"add a length-balancing rewrite\u002Fself-check pass",[321,467,468],{},"enforce T\u002FF balance in code",[321,470,471],{},"evaluate expensive stronger models only for premium\u002Fhigh-stakes cases",[321,473,474],{},"test a cheap overgenerate\u002Fdedup\u002Frewrite path before adopting nano-style high-volume generation",[293,476,477],{},"Subtopic chunking improved blueprint coverage by about 9 points and produced about 61% more questions in the April 24 report, with little latency cost. The tradeoff is slightly lower uniqueness, which should be handled by deduplication or redundancy filtering.",[293,479,480],{},"The worst structural failure is answer-length leakage. Across variants, most single-correct questions still make the correct answer a unique length outlier. Prompt instructions alone did not fix this; it needs a rewrite pass or self-check stage.",[305,482,484],{"id":483},"language-and-judge-caveats","Language And Judge Caveats",[293,486,487],{},"The latest report saw Dutch correctness underperform English. Non-English triviality and distractor-quality scores may also be inflated because judge models can miss language-specific giveaways.",[293,489,490],{},"Future evals should keep enough language coverage to distinguish generator issues from judge blind spots.",{"title":417,"searchDepth":492,"depth":492,"links":493},2,[494,495,496,497,498],{"id":307,"depth":492,"text":308},{"id":342,"depth":492,"text":343},{"id":405,"depth":492,"text":406},{"id":449,"depth":492,"text":450},{"id":483,"depth":492,"text":484},"Date: 2026-05-01\nPurpose: Durable summary of the current quiz eval baseline, retained reports, and immediate product implications.","md",null,{},true,{"title":119,"description":499},"apps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-05-01-quiz-eval-current-state","eh8f_R7Nkg1pFmrSuw-RSr8fj6Z04Rztf4uE5_gutOU",1779007962950]