Created
March 23, 2026 21:27
-
-
Save mlelarge/693c20d21f6c4362e07967f536734685 to your computer and use it in GitHub Desktop.
HW3 llm-efficiency auto-grading
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Self-grading script for LLM Efficiency homework. | |
| Parses your test output files and computes your grade. | |
| Usage: python self_grade.py | |
| This script does NOT run any code. It reads the output files | |
| produced by test_and_submit.sh (in results/ or results_manual/) | |
| and computes your score. | |
| Scoring: | |
| KV Cache: passed_tests / 21 * 10 (out of 10) | |
| LoRA: passed_tests / 33 * 10 (out of 10) | |
| Total: out of 20 | |
| """ | |
| import re | |
| import sys | |
| from pathlib import Path | |
| KV_TOTAL = 21 | |
| LORA_TOTAL = 33 | |
| def find_latest_results_dir() -> Path | None: | |
| """Find the latest timestamped results directory.""" | |
| candidates = [] | |
| for folder in ["results", "results_manual"]: | |
| root = Path(folder) | |
| if root.is_dir(): | |
| for d in root.iterdir(): | |
| if d.is_dir() and re.match(r"\d{8}_\d{6}", d.name): | |
| candidates.append(d) | |
| if not candidates: | |
| return None | |
| return sorted(candidates, key=lambda p: p.name)[-1] | |
| def parse_pytest(filepath: Path) -> tuple[int, int, list[str]]: | |
| """Parse pytest output. Returns (passed, failed, list_of_failed_test_names).""" | |
| if not filepath.is_file(): | |
| return 0, 0, [] | |
| text = filepath.read_text(errors="replace") | |
| passed = 0 | |
| failed = 0 | |
| # Find the last summary line | |
| matches = re.findall(r"=+ (.+?) =+\s*$", text, re.MULTILINE) | |
| if matches: | |
| summary = matches[-1] | |
| p = re.search(r"(\d+) passed", summary) | |
| f = re.search(r"(\d+) failed", summary) | |
| if p: | |
| passed = int(p.group(1)) | |
| if f: | |
| failed = int(f.group(1)) | |
| # Extract failed test names | |
| failed_tests = re.findall(r"FAILED (tests/\S+)", text) | |
| return passed, failed, failed_tests | |
| def parse_demo_accuracies(filepath: Path) -> list[str]: | |
| """Extract 'final score' lines from demo output.""" | |
| if not filepath.is_file(): | |
| return [] | |
| text = filepath.read_text(errors="replace") | |
| return re.findall(r".*final score.*", text) | |
| def main(): | |
| results_dir = find_latest_results_dir() | |
| print("=" * 48) | |
| print(" LLM Efficiency - Self Grading") | |
| if results_dir: | |
| print(f" Using results from: {results_dir}/") | |
| print("=" * 48) | |
| print() | |
| if not results_dir: | |
| print("ERROR: No results directory found.") | |
| print("Run test_and_submit.sh first to generate your outputs.") | |
| sys.exit(1) | |
| # ── KV Cache Tests ──────────────────────────────────────────── | |
| kv_file = results_dir / "test_kv_cache.txt" | |
| kv_passed, kv_failed, kv_failed_tests = parse_pytest(kv_file) | |
| kv_score = round(kv_passed / KV_TOTAL * 10, 2) | |
| print("-- KV Cache " + "-" * 36) | |
| if kv_file.is_file(): | |
| print(f" Tests passed: {kv_passed} / {KV_TOTAL}") | |
| if kv_failed > 0: | |
| print(f" Tests failed: {kv_failed}") | |
| print() | |
| print(" Failed tests:") | |
| for t in kv_failed_tests: | |
| print(f" {t}") | |
| else: | |
| print(f" No test output found ({kv_file})") | |
| print(f" Score: {kv_score} / 10") | |
| print() | |
| # ── LoRA Tests ──────────────────────────────────────────────── | |
| lora_file = results_dir / "test_lora.txt" | |
| lora_passed, lora_failed, lora_failed_tests = parse_pytest(lora_file) | |
| lora_score = round(lora_passed / LORA_TOTAL * 10, 2) | |
| print("-- LoRA " + "-" * 40) | |
| if lora_file.is_file(): | |
| print(f" Tests passed: {lora_passed} / {LORA_TOTAL}") | |
| if lora_failed > 0: | |
| print(f" Tests failed: {lora_failed}") | |
| print() | |
| print(" Failed tests:") | |
| for t in lora_failed_tests: | |
| print(f" {t}") | |
| else: | |
| print(f" No test output found ({lora_file})") | |
| print(f" Score: {lora_score} / 10") | |
| print() | |
| # ── Demo outputs (informational) ───────────────────────────── | |
| print("-- Demo Outputs (informational) " + "-" * 16) | |
| print() | |
| demo_kv = results_dir / "demo_sort_kv.txt" | |
| if demo_kv.is_file(): | |
| print(" demo_sort_kv.txt:") | |
| for line in parse_demo_accuracies(demo_kv): | |
| print(f" {line.strip()}") | |
| else: | |
| print(" demo_sort_kv.txt: not found") | |
| print() | |
| demo_lora = results_dir / "demo_sort_lora.txt" | |
| if demo_lora.is_file(): | |
| print(" demo_sort_lora.txt:") | |
| for line in parse_demo_accuracies(demo_lora): | |
| print(f" {line.strip()}") | |
| else: | |
| print(" demo_sort_lora.txt: not found") | |
| print() | |
| benchmark = results_dir / "benchmark_kv.txt" | |
| if not benchmark.is_file(): | |
| benchmark = Path("kv_cache/benchmark_results.txt") | |
| if benchmark.is_file(): | |
| print(f" {benchmark.name}:") | |
| text = benchmark.read_text(errors="replace") | |
| for line in text.splitlines(): | |
| if re.search(r"(gpt|Context T|speedup|\d+\.\d+x)", line): | |
| print(f" {line.strip()}") | |
| else: | |
| print(" benchmark: not found") | |
| print() | |
| # ── Total ───────────────────────────────────────────────────── | |
| total = round(kv_score + lora_score, 2) | |
| print("=" * 48) | |
| print(f" TOTAL SCORE: {total} / 20") | |
| print() | |
| print(f" KV Cache: {kv_score} / 10 ({kv_passed}/{KV_TOTAL} tests)") | |
| print(f" LoRA: {lora_score} / 10 ({lora_passed}/{LORA_TOTAL} tests)") | |
| print("=" * 48) | |
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment