[{"data":1,"prerenderedAt":839},["ShallowReactive",2],{"repo-tree":3,"repo-\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Feval_metrics_design":283},[4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,64,67,69,72,75,78,81,84,86,88,90,93,96,99,102,105,108,111,114,117,120,123,125,127,129,131,133,135,138,141,143,146,149,152,155,158,161,164,167,169,172,175,178,180,183,186,189,192,195,198,201,203,206,209,212,215,218,221,224,227,230,233,236,239,242,245,248,251,254,257,260,263,266,269,272,275,278,281],{"path":5,"title":6},"\u002Fagents\u002Fbackend-code-style","Backend Conventions",{"path":8,"title":9},"\u002Fagents\u002Fdatabase","Database",{"path":11,"title":12},"\u002Fagents\u002Fportal-code-style","Portal Conventions",{"path":14,"title":15},"\u002Fagents\u002Ftranslation","Translation",{"path":17,"title":18},"\u002Fconventions\u002Fbackend-coding","Backend coding conventions",{"path":20,"title":21},"\u002Fconventions\u002Ffrontend-coding","Frontend coding conventions",{"path":23,"title":24},"\u002Fdevelopment-process","Development process",{"path":26,"title":27},"\u002Flearning-api-preview-hetzner-setup","Learning API Preview on Hetzner + Cloudflare",{"path":29,"title":30},"\u002Flearning-api-preview-vm-plan","Learning API Preview VM Plan",{"path":32,"title":33},"\u002Fmonorepo-structure","Monorepo structure",{"path":35,"title":36},"\u002Foperations","Operations — bugs and support",{"path":38,"title":39},"\u002Fpostmortems\u002F2026-03-16_onboarding-currency-regression","Onboarding Zod transform silently broken — web signups assigned wrong checkout currency",{"path":41,"title":42},"\u002Fpostmortems\u002Freadme","Postmortems",{"path":44,"title":45},"\u002Fpostmortems\u002F_template","TEMPLATE",{"path":47,"title":48},"\u002Fpostmortems\u002Fposthog-comparison","Postmortem practice — comparison with PostHog",{"path":50,"title":51},"\u002Fpreview-environment-plan","Preview Environment Plan",{"path":53,"title":54},"\u002Fprinciples","Engineering principles",{"path":56,"title":57},"\u002Fworking-with-ai","Working with AI",{"path":59,"title":60},"\u002F.claude\u002Fskills\u002Feval-playground\u002Fskill","Eval Playground — Co-development Skill",{"path":62,"title":63},"\u002F.claude\u002Fskills\u002Ffigma-diff-section\u002Fskill","Figma Diff Section Pipeline",{"path":65,"title":66},"\u002Fagents","AGENTS.md",{"path":68,"title":66},"\u002Fclaude",{"path":70,"title":71},"\u002Freadme","Studyflash",{"path":73,"title":74},"\u002Fapps\u002Fcore-api\u002Fagents","Core API (apps\u002Fcore-api)",{"path":76,"title":77},"\u002Fapps\u002Fcore-api\u002Freadme","README",{"path":79,"title":80},"\u002Fapps\u002Femail-previews\u002Fagents","Email Previews (apps\u002Femail-previews)",{"path":82,"title":83},"\u002Fapps\u002Flanding-page\u002Fagents","Landing Page (apps\u002Flanding-page)",{"path":85,"title":83},"\u002Fapps\u002Flanding-page\u002Fclaude",{"path":87,"title":66},"\u002Fapps\u002Flearning-api\u002Fagents",{"path":89,"title":77},"\u002Fapps\u002Flearning-api\u002Freadme",{"path":91,"title":92},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Feval_metrics_design","Surface-Specific Eval Metrics Design",{"path":94,"title":95},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Ftest_set","Quiz Eval Test Set",{"path":97,"title":98},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Ffrontend\u002Freadme","React + TypeScript + Vite",{"path":100,"title":101},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fknown-issues\u002Fcontent-pillar-shallow-coverage\u002Freadme","Content pillar misses subtopics in dense documents",{"path":103,"title":104},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fknown-issues\u002Fdocling-empty-section-headers\u002Freadme","Empty section headers dropped by docling chunker",{"path":106,"title":107},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fknown-issues\u002Fdocling-table-reading-order\u002Freadme","Table\u002Fbox layout causes wrong reading order",{"path":109,"title":110},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Fmetrics\u002Freadme","Quiz eval metrics — canonical rubrics",{"path":112,"title":113},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-04-12-quiz-summary-feedback-current-state","Quiz and Summary Feedback Current State",{"path":115,"title":116},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-04-24-quiz-eval-metrics","Quiz Evaluation Metrics",{"path":118,"title":119},"\u002Fapps\u002Flearning-api\u002Fevals-playground\u002Freports\u002F2026-05-01-quiz-eval-current-state","Quiz Eval Current State",{"path":121,"title":122},"\u002Fapps\u002Flearning-api\u002Fmonitoring\u002Freadme","Monitoring Stack",{"path":124,"title":77},"\u002Fapps\u002Flearning-api\u002Fshared\u002Freadme",{"path":126,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fflashcard_agent\u002Freadme",{"path":128,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fingestion_agent\u002Freadme",{"path":130,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fquiz_agent\u002Freadme",{"path":132,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Flearning_agents\u002Fsummary_agent\u002Freadme",{"path":134,"title":77},"\u002Fapps\u002Flearning-api\u002Fworkers\u002Fparser\u002Freadme",{"path":136,"title":137},"\u002Fapps\u002Fmarketing-emails-preview\u002Fagents","Marketing Emails Preview (apps\u002Fmarketing-emails-preview)",{"path":139,"title":140},"\u002Fapps\u002Fmobile-app\u002Fagents","StudyFlash Mobile App - Claude Code Configuration",{"path":142,"title":140},"\u002Fapps\u002Fmobile-app\u002Fclaude",{"path":144,"title":145},"\u002Fapps\u002Fmountain-max\u002Fagents","Mountain Max (apps\u002Fmountain-max)",{"path":147,"title":148},"\u002Fapps\u002Fmountain-max\u002Fgame\u002Freadme","Mountain Max Game",{"path":150,"title":151},"\u002Fapps\u002Fportal\u002Fagents","Portal (apps\u002Fportal)",{"path":153,"title":154},"\u002Fapps\u002Fportal\u002Freadme","Nuxt Minimal Starter",{"path":156,"title":157},"\u002Fapps\u002Fportal\u002Fapp\u002Fcomposables\u002Ffiles\u002Freadme","File Upload Composables",{"path":159,"title":160},"\u002Fapps\u002Fportal\u002Fdocs\u002Flibrary-routing","Library Routing Documentation",{"path":162,"title":163},"\u002Fapps\u002Fsupabase\u002Fagents","Supabase (apps\u002Fsupabase)",{"path":165,"title":166},"\u002Fapps\u002Fwrapped\u002Fagents","Wrapped (apps\u002Fwrapped)",{"path":168,"title":98},"\u002Fapps\u002Fwrapped\u002Freadme",{"path":170,"title":171},"\u002Finfra\u002Freadme","infra\u002F",{"path":173,"title":174},"\u002Finfra\u002Fdns\u002Freadme","DNS Infrastructure",{"path":176,"title":177},"\u002Finfra\u002Fdokploy\u002Freadme","studyflash-dokploy",{"path":179,"title":77},"\u002Finfra\u002Fdokploy\u002Fsdk\u002Fnodejs\u002Freadme",{"path":181,"title":182},"\u002Finfra\u002Finfisical\u002Freadme","Infisical Infrastructure",{"path":184,"title":185},"\u002Finfra\u002Flearning-api\u002Freadme","Pulumi GCP TypeScript Template",{"path":187,"title":188},"\u002Finfra\u002Fopenreplay\u002Freadme","OpenReplay on Hetzner",{"path":190,"title":191},"\u002Finfra\u002Fscripts\u002Freadme","infra\u002Fscripts\u002F",{"path":193,"title":194},"\u002Finfra\u002Fturborepo-cache\u002Freadme","Turborepo Remote Cache Infrastructure",{"path":196,"title":197},"\u002Finternal\u002Fchatwoot\u002Freadme","Chatwoot Infrastructure",{"path":199,"title":200},"\u002Finternal\u002Fchatwoot\u002Fprovider\u002Freadme","studyflash-chatwoot-provider",{"path":202,"title":77},"\u002Finternal\u002Fchatwoot\u002Fprovider\u002Fsdk\u002Fnodejs\u002Freadme",{"path":204,"title":205},"\u002Finternal\u002Fdocs\u002Freadme","internal\u002Fdocs",{"path":207,"title":208},"\u002Finternal\u002Fsupport-bot\u002Fclaude","Support Bot (Maximilian)",{"path":210,"title":211},"\u002Finternal\u002Fsupport-bot\u002Freadme","Studyflash Customer Support Bot (Maximilian)",{"path":213,"title":214},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Faccount_issues","Account Issues",{"path":216,"title":217},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fbilling_invoice","Billing Invoice",{"path":219,"title":220},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fcontent_upload","Content Upload",{"path":222,"title":223},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fdata_loss","Data Loss",{"path":225,"title":226},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fflashcard_issues","Flashcard Issues",{"path":228,"title":229},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fgarbage","Garbage",{"path":231,"title":232},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fgeneral_how_to","General How To",{"path":234,"title":235},"\u002Finternal\u002Fsupport-bot\u002Fkb","Knowledge Base Index",{"path":237,"title":238},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Flanguage_issues","Language Issues",{"path":240,"title":241},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fmindmap_issues","Mindmap Issues",{"path":243,"title":244},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fmisunderstanding","Misunderstanding",{"path":246,"title":247},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fmock_exam_issues","Mock Exam Issues",{"path":249,"title":250},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fpodcast_issues","Podcast Issues",{"path":252,"title":253},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fquiz_issues","Quiz Issues",{"path":255,"title":256},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Frefund_request","Refund Request",{"path":258,"title":259},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fsubscription_cancellation","Subscription Cancellation",{"path":261,"title":262},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fsubscription_info","Subscription Info",{"path":264,"title":265},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fsummary_issues","Summary Issues",{"path":267,"title":268},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Ftechnical_errors","Technical Errors",{"path":270,"title":271},"\u002Finternal\u002Fsupport-bot\u002Fkb\u002Fvideo_issues","Video Issues",{"path":273,"title":274},"\u002Fpackages\u002Fcommon\u002Fdocs\u002Fearly-access-features","Declarative Early Access Features",{"path":276,"title":277},"\u002Fpackages\u002Fcommon\u002Fscripts\u002Freadme","Common Package Scripts",{"path":279,"title":280},"\u002Fpackages\u002Fdevtools\u002Ffigma-plugins\u002Freadme","Figma plugins",{"path":282,"title":77},"\u002Fpackages\u002Fpulumi-infisical\u002Freadme",{"id":284,"title":92,"body":285,"description":805,"extension":832,"lastReviewed":833,"meta":834,"navigation":835,"owner":833,"path":91,"seo":836,"status":833,"stem":837,"tags":833,"__hash__":838},"repo\u002Fapps\u002Flearning-api\u002Fevals-playground\u002FEVAL_METRICS_DESIGN.md",{"type":286,"value":287,"toc":804},"minimark",[288,292,297,315,318,321,325,333,339,345,351,371,377,382,406,413,418,423,427,441,446,453,458,463,468,479,484,491,496,501,505,516,521,523,527,533,538,543,548,562,567,573,578,583,587,598,603,609,614,619,623,651,656,662,667,672,676,687,692,694,698,703,721,736,748,750,754],[289,290,92],"h1",{"id":291},"surface-specific-eval-metrics-design",[293,294,296],"h2",{"id":295},"context","Context",[298,299,300,301,305,306,309,310,314],"p",{},"We're comparing content pillar strategies (",[302,303,304],"code",{},"merged_chunks"," vs ",[302,307,308],{},"docling_chunks",") across three surfaces: flashcards, quiz, summary. The oracle judge does holistic A\u002FB comparison. Auto-metrics provide specific, quantitative signals explaining ",[311,312,313],"strong",{},"why"," one variant is better.",[298,316,317],{},"Flashcards already have 8 metrics. Quiz and summary have zero. This doc designs pragmatic metrics for both.",[319,320],"hr",{},[293,322,324],{"id":323},"quiz-metrics","Quiz Metrics",[326,327,329,330],"h3",{"id":328},"_1-quiz_answer_correctness","1. ",[302,331,332],{},"quiz_answer_correctness",[298,334,335,338],{},[311,336,337],{},"What it tests",": Is the marked correct answer actually correct? Are marked wrong answers actually wrong?",[298,340,341,344],{},[311,342,343],{},"Why it matters",": A quiz with wrong answers is worse than no quiz. This is the #1 quality signal.",[298,346,347,350],{},[311,348,349],{},"How it works",":",[352,353,354,358,361,364],"ul",{},[355,356,357],"li",{},"For each question, send Q + all choices + source chunk text to LLM",[355,359,360],{},"LLM verifies: (a) correct answer is factually right, (b) each distractor is actually wrong",[355,362,363],{},"Classification: CORRECT, INCORRECT_ANSWER, INCORRECT_DISTRACTOR, ERROR",[355,365,366,367,370],{},"Returns ",[302,368,369],{},"good_rate"," = % of questions where answer + all distractors are valid",[298,372,373,376],{},[311,374,375],{},"Inputs",": Quiz questions with choices, content pillars (for source chunk text)",[298,378,379,350],{},[311,380,381],{},"Surface-specific nuances",[352,383,384,390,396],{},[355,385,386,389],{},[302,387,388],{},"true_false",": Verify the statement's truth value matches the marked answer",[355,391,392,395],{},[302,393,394],{},"fill_in_the_blank",": Verify the answer fits the blank correctly",[355,397,398,401,402,405],{},[302,399,400],{},"single_choice"," \u002F ",[302,403,404],{},"multiple_choice",": Verify correct answer(s) AND each distractor",[326,407,409,410],{"id":408},"_2-quiz_distractor_quality","2. ",[302,411,412],{},"quiz_distractor_quality",[298,414,415,417],{},[311,416,337],{},": Are distractors plausible enough to challenge a student who doesn't know the material?",[298,419,420,422],{},[311,421,343],{},": Trivially wrong distractors make quizzes useless (\"What's 2+2? A:4, B:Banana, C:Purple\"). Good distractors are the difference between useful and useless MCQs.",[298,424,425,350],{},[311,426,349],{},[352,428,429,432,435,438],{},[355,430,431],{},"For each MCQ question, send Q + correct answer + distractors to LLM",[355,433,434],{},"LLM rates each distractor: PLAUSIBLE (would fool someone who didn't study), WEAK (obviously wrong to most), ABSURD (nonsensical)",[355,436,437],{},"Score = % of distractors rated PLAUSIBLE",[355,439,440],{},"Extra data: which distractors are weak, suggestions for improvement",[298,442,443,445],{},[311,444,375],{},": Quiz questions (MCQ types only, skip true_false and fill_in_the_blank)",[326,447,449,450],{"id":448},"_3-quiz_coverage","3. ",[302,451,452],{},"quiz_coverage",[298,454,455,457],{},[311,456,337],{},": Do quiz questions cover the important concepts from the source?",[298,459,460,462],{},[311,461,343],{},": A quiz that only tests one chapter is incomplete.",[298,464,465,467],{},[311,466,349],{},": Reuse the coverage_v2 pattern:",[352,469,470,473,476],{},[355,471,472],{},"Extract key concepts from content pillars",[355,474,475],{},"For each concept, LLM judges depth of coverage by quiz questions (0-3 scale)",[355,477,478],{},"Score = weighted mean of depths",[298,480,481,483],{},[311,482,375],{},": Quiz questions + content pillars",[326,485,487,488],{"id":486},"_4-quiz_clarity","4. ",[302,489,490],{},"quiz_clarity",[298,492,493,495],{},[311,494,337],{},": Are questions clear, unambiguous, and self-contained?",[298,497,498,500],{},[311,499,343],{},": An ambiguous question frustrates students regardless of answer quality.",[298,502,503,350],{},[311,504,349],{},[352,506,507,510,513],{},[355,508,509],{},"Batch quiz questions (10 per batch)",[355,511,512],{},"LLM classifies each: CLEAR, AMBIGUOUS, CONTEXT_DEPENDENT, ERROR",[355,514,515],{},"Score = % CLEAR",[298,517,518,520],{},[311,519,375],{},": Quiz questions (no source material needed)",[319,522],{},[293,524,526],{"id":525},"summary-metrics","Summary Metrics",[326,528,329,530],{"id":529},"_1-summary_faithfulness",[302,531,532],{},"summary_faithfulness",[298,534,535,537],{},[311,536,337],{},": Does the summary contain only factually accurate claims derivable from the source?",[298,539,540,542],{},[311,541,343],{},": Hallucinated content in a summary is actively harmful for studying.",[298,544,545,547],{},[311,546,349],{}," (inspired by RAGAS faithfulness):",[352,549,550,553,556,559],{},[355,551,552],{},"Step 1: LLM extracts atomic claims from summary (each claim = one fact)",[355,554,555],{},"Step 2: For each claim, LLM verifies against source markdown: SUPPORTED, UNSUPPORTED, CONTRADICTED",[355,557,558],{},"Score = % SUPPORTED out of total non-trivial claims",[355,560,561],{},"Extra data: list of unsupported\u002Fcontradicted claims with reasoning",[298,563,564,566],{},[311,565,375],{},": Summary text + source markdown (from parsing_result.md_url)",[326,568,409,570],{"id":569},"_2-summary_coverage",[302,571,572],{},"summary_coverage",[298,574,575,577],{},[311,576,337],{},": Does the summary cover all major topics from the source?",[298,579,580,582],{},[311,581,343],{},": A summary that misses half the content is incomplete.",[298,584,585,350],{},[311,586,349],{},[352,588,589,592,595],{},[355,590,591],{},"Extract main topics from content pillars",[355,593,594],{},"For each topic, LLM judges: COVERED (mentioned and explained), MENTIONED (briefly touched), MISSING (not addressed)",[355,596,597],{},"Score = weighted (COVERED=1, MENTIONED=0.5, MISSING=0) \u002F total topics",[298,599,600,602],{},[311,601,375],{},": Summary text + content pillars",[326,604,449,606],{"id":605},"_3-summary_coherence",[302,607,608],{},"summary_coherence",[298,610,611,613],{},[311,612,337],{},": Is the summary well-organized, logically structured, and readable?",[298,615,616,618],{},[311,617,343],{},": A coherent summary is easier to study from.",[298,620,621,350],{},[311,622,349],{},[352,624,625,648],{},[355,626,627,628],{},"LLM rates the summary on a 1-5 scale across 3 dimensions:\n",[352,629,630,636,642],{},[355,631,632,635],{},[311,633,634],{},"Organization",": Clear structure, logical section ordering",[355,637,638,641],{},[311,639,640],{},"Flow",": Smooth transitions, no abrupt jumps",[355,643,644,647],{},[311,645,646],{},"Completeness",": Each section is self-contained, no dangling references",[355,649,650],{},"Score = mean of 3 ratings, normalized to 0-100",[298,652,653,655],{},[311,654,375],{},": Summary text only (no source needed)",[326,657,487,659],{"id":658},"_4-summary_information_density",[302,660,661],{},"summary_information_density",[298,663,664,666],{},[311,665,337],{},": Is the summary appropriately concise? Does it avoid filler and repetition?",[298,668,669,671],{},[311,670,343],{},": Verbose summaries waste study time. Dense summaries are more useful.",[298,673,674,350],{},[311,675,349],{},[352,677,678,681,684],{},[355,679,680],{},"LLM identifies: filler phrases, repeated information, unnecessary padding",[355,682,683],{},"Score = (total sentences - filler sentences) \u002F total sentences * 100",[355,685,686],{},"Extra data: highlighted filler\u002Frepetition passages",[298,688,689,691],{},[311,690,375],{},": Summary text only",[319,693],{},[293,695,697],{"id":696},"implementation-priority","Implementation Priority",[298,699,700,350],{},[311,701,702],{},"Phase 1 (essential for meaningful experiment results)",[704,705,706,711,716],"ol",{},[355,707,708,710],{},[302,709,332],{}," - Without this, quiz comparison is guesswork",[355,712,713,715],{},[302,714,532],{}," - Without this, summary comparison is guesswork",[355,717,718,720],{},[302,719,572],{}," - Direct quality signal using existing infrastructure",[298,722,723,726,727,729,730,732,733,735],{},[311,724,725],{},"Phase 2 (improves experiment depth)",":\n4. ",[302,728,412],{}," - Unique to MCQ, differentiating signal\n5. ",[302,731,452],{}," - Reuses coverage_v2 pattern\n6. ",[302,734,608],{}," - Quick win, simple implementation",[298,737,738,741,742,744,745,747],{},[311,739,740],{},"Phase 3 (nice to have)",":\n7. ",[302,743,490],{}," - Simple but less discriminating\n8. ",[302,746,661],{}," - Subtle signal",[319,749],{},[293,751,753],{"id":752},"integration-plan","Integration Plan",[704,755,756,778,791,798,801],{},[355,757,758,759,762,763,762,766,762,769,762,772,762,775],{},"New files: ",[302,760,761],{},"metrics\u002Fquiz_correctness.py",", ",[302,764,765],{},"metrics\u002Fquiz_distractor_quality.py",[302,767,768],{},"metrics\u002Fquiz_coverage.py",[302,770,771],{},"metrics\u002Fsummary_faithfulness.py",[302,773,774],{},"metrics\u002Fsummary_coverage.py",[302,776,777],{},"metrics\u002Fsummary_coherence.py",[355,779,780,781,784,785,762,788],{},"New functions in ",[302,782,783],{},"sdk.py",": ",[302,786,787],{},"evaluate_quiz()",[302,789,790],{},"evaluate_summary()",[355,792,793,794,797],{},"Hook into ",[302,795,796],{},"experiment_runner.py"," after generation (parallel with flashcard eval)",[355,799,800],{},"New MetricName literals for each surface",[355,802,803],{},"Frontend: surface-specific metric display in ExperimentOverview",{"title":805,"searchDepth":806,"depth":806,"links":807},"",2,[808,809,820,830,831],{"id":295,"depth":806,"text":296},{"id":323,"depth":806,"text":324,"children":810},[811,814,816,818],{"id":328,"depth":812,"text":813},3,"1. quiz_answer_correctness",{"id":408,"depth":812,"text":815},"2. quiz_distractor_quality",{"id":448,"depth":812,"text":817},"3. quiz_coverage",{"id":486,"depth":812,"text":819},"4. quiz_clarity",{"id":525,"depth":806,"text":526,"children":821},[822,824,826,828],{"id":529,"depth":812,"text":823},"1. summary_faithfulness",{"id":569,"depth":812,"text":825},"2. summary_coverage",{"id":605,"depth":812,"text":827},"3. summary_coherence",{"id":658,"depth":812,"text":829},"4. summary_information_density",{"id":696,"depth":806,"text":697},{"id":752,"depth":806,"text":753},"md",null,{},true,{"title":92,"description":805},"apps\u002Flearning-api\u002Fevals-playground\u002FEVAL_METRICS_DESIGN","GMFnRf8MoBnmj_UGqPnNJEYOHGAdSALEOP4EWPBGleU",1779007962949]