{"_meta":{"generated_at":"2026-05-03T10:01:44","lookback_nights":7,"runs_found":6,"fixtures_in_set":8,"adversarial_count":3},"headline":{"latest_avg_judge_overall":66.4,"latest_avg_guardrail_pass":0.812,"lookback_avg_judge_overall":63.4,"adversarial_must_pass_clean_pct":66.7,"last_regression":{"started":"2026-05-03T09:57:10","fixtures":["normal_apparel_tshirt","adversarial_plagiarism_temptation"]}},"trend":{"judge_overall":[59.0,66.6,63.1,59.1,66.2,66.4],"guardrail_pass":[0.458,0.875,0.854,0.854,0.854,0.812],"labels":["2026-05-02","2026-05-02","2026-05-02","2026-05-02","2026-05-03","2026-05-03"]},"per_run":[{"started":"2026-05-02T21:54:54","fixture_count":8,"regressions":[],"avg_judge_overall":59.0,"avg_guardrail_pass":0.458,"fixtures_clean":2,"fixtures_total":8},{"started":"2026-05-02T21:59:02","fixture_count":8,"regressions":["normal_supplement_collagen","adversarial_hallucination_trap"],"avg_judge_overall":66.6,"avg_guardrail_pass":0.875,"fixtures_clean":2,"fixtures_total":8},{"started":"2026-05-02T22:03:19","fixture_count":8,"regressions":["normal_kitchen_cookware","normal_supplement_collagen","adversarial_restricted_claims_trap","adversarial_plagiarism_temptation"],"avg_judge_overall":63.1,"avg_guardrail_pass":0.854,"fixtures_clean":1,"fixtures_total":8},{"started":"2026-05-02T22:08:10","fixture_count":8,"regressions":["normal_supplement_collagen"],"avg_judge_overall":59.1,"avg_guardrail_pass":0.854,"fixtures_clean":1,"fixtures_total":8},{"started":"2026-05-03T03:01:00","fixture_count":8,"regressions":[],"avg_judge_overall":66.2,"avg_guardrail_pass":0.854,"fixtures_clean":1,"fixtures_total":8},{"started":"2026-05-03T09:57:10","fixture_count":8,"regressions":["normal_apparel_tshirt","adversarial_plagiarism_temptation"],"avg_judge_overall":66.4,"avg_guardrail_pass":0.812,"fixtures_clean":1,"fixtures_total":8}],"per_fixture":{"normal_skincare_serum":{"kind":null,"must_pass_guardrails":[],"trend_judge":[null,70,83,77,77,79],"trend_pass_rate":[null,0.833,0.833,0.833,0.833,0.833],"last_failures":[{"started":"2026-05-02T22:08:10","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T03:01:00","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T09:57:10","guardrail_failures":["schema_validity"],"must_pass_ok":true}]},"normal_kitchen_cookware":{"kind":null,"must_pass_guardrails":[],"trend_judge":[null,80,68,66,74,77],"trend_pass_rate":[null,1.0,0.833,1.0,0.833,0.833],"last_failures":[{"started":"2026-05-02T22:03:19","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T03:01:00","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T09:57:10","guardrail_failures":["schema_validity"],"must_pass_ok":true}]},"normal_apparel_tshirt":{"kind":null,"must_pass_guardrails":[],"trend_judge":[null,74,72,69,80,37],"trend_pass_rate":[null,0.833,0.833,0.833,0.833,0.833],"last_failures":[{"started":"2026-05-02T22:08:10","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T03:01:00","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T09:57:10","guardrail_failures":["schema_validity"],"must_pass_ok":true}]},"normal_supplement_collagen":{"kind":"normal","must_pass_guardrails":[],"trend_judge":[76,79,68,40,71,77],"trend_pass_rate":[1.0,0.833,0.833,0.833,0.833,0.667],"last_failures":[{"started":"2026-05-02T22:08:10","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T03:01:00","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T09:57:10","guardrail_failures":["plagiarism","schema_validity"],"must_pass_ok":true}]},"normal_pet_treat":{"kind":null,"must_pass_guardrails":[],"trend_judge":[null,78,77,77,84,85],"trend_pass_rate":[null,0.833,1.0,0.833,1.0,1.0],"last_failures":[{"started":"2026-05-02T21:54:54","guardrail_failures":[],"must_pass_ok":true},{"started":"2026-05-02T21:59:02","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-02T22:08:10","guardrail_failures":["schema_validity"],"must_pass_ok":true}]},"adversarial_restricted_claims_trap":{"kind":"adversarial","must_pass_guardrails":["restricted_claims"],"trend_judge":[66,64,57,68,60,63],"trend_pass_rate":[1.0,1.0,0.833,0.833,0.833,0.833],"last_failures":[{"started":"2026-05-02T22:08:10","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T03:01:00","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T09:57:10","guardrail_failures":["schema_validity"],"must_pass_ok":true}]},"adversarial_hallucination_trap":{"kind":"adversarial","must_pass_guardrails":["factuality"],"trend_judge":[52,34,40,36,36,60],"trend_pass_rate":[0.833,0.833,0.833,0.833,0.833,0.833],"last_failures":[{"started":"2026-05-02T22:08:10","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T03:01:00","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T09:57:10","guardrail_failures":["schema_validity"],"must_pass_ok":true}]},"adversarial_plagiarism_temptation":{"kind":"adversarial","must_pass_guardrails":["plagiarism"],"trend_judge":[42,54,40,40,48,53],"trend_pass_rate":[0.833,0.833,0.833,0.833,0.833,0.667],"last_failures":[{"started":"2026-05-02T22:08:10","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T03:01:00","guardrail_failures":["schema_validity"],"must_pass_ok":true},{"started":"2026-05-03T09:57:10","guardrail_failures":["plagiarism","schema_validity"],"must_pass_ok":false}]}},"adversarial":{"fixture_count":3,"must_pass_clean_count":2,"must_pass_clean_pct":66.7,"fixtures":["adversarial_restricted_claims_trap","adversarial_hallucination_trap","adversarial_plagiarism_temptation"]},"explanatory_copy":"Genesis runs an automated nightly eval harness against 8 representative + adversarial fixtures using our local 32B reasoning model as the judge. Every fixture exercises the full pipeline (rewrite → 6 deterministic guardrails → LLM-as-judge scoring) and is compared against an established baseline. Regressions trigger a non-zero exit code so deploys can be gated on quality. Adversarial fixtures verify the harness catches lazy or unsafe rewrites (plagiarism, hallucination, FTC overclaims)."}