Create a test dataset with human-verified labels

Compare teacher and student models on the same test set to ensure the student meets quality requirements.

Important: Use Ground Truth

⚠️
Do not evaluate on teacher-labeled data. The student was trained on teacher labels, so comparing them on that data is circular. Use a held-out test set with human-verified ground truth labels.
graph LR
    A[Teacher Labels] -->|Training| B[Student Model]
    C[Ground Truth Test Set] -->|Evaluate| D[Teacher]
    C -->|Evaluate| E[Student]
    D --> F[Compare]
    E --> F

Prepare Test Set

If you don’t have a ground truth test set, create one:

from seeme import Client

client = Client()

# Ground-truth dataset for evaluation: versioned, with a test split and
# the full label set for the product classifier.
test_dataset = client.create_dataset(
    name="Test Set: Product Classifier",
    description="Human-verified ground truth for evaluation"
)

test_version = client.create_dataset_version(
    dataset_id=test_dataset.id,
    name="v1"
)

# Create the split and register every class label on the new version.
test_split = client.create_split(version_id=test_version.id, name="test")

label_names = ("good", "scratch", "dent", "discoloration", "crack")
for label_name in label_names:
    client.create_label(version_id=test_version.id, name=label_name)

# Upload and manually label test images.
# Labels must come from humans, not the teacher model, to avoid circularity.

Run Evaluation

Evaluate both models on the same test set:

# Run both models against the same held-out ground-truth split so the
# comparison is apples-to-apples.
eval_args = dict(
    dataset_id=test_dataset.id,
    version_id=test_version.id,
    split="test",
)

teacher_results = client.evaluate_model(model_id=teacher_model.id, **eval_args)
student_results = client.evaluate_model(model_id=student_model.id, **eval_args)

# Side-by-side metric table.
print("Evaluation Results")
print("=" * 50)
print(f"{'Metric':<20} {'Teacher':<15} {'Student':<15}")
print("-" * 50)
print(f"{'Accuracy':<20} {teacher_results['accuracy']:<15.2%} {student_results['accuracy']:<15.2%}")
print(f"{'Loss':<20} {teacher_results['loss']:<15.4f} {student_results['loss']:<15.4f}")

Detailed Comparison

Per-Class Performance

# Collect per-item predictions from both models on the same test split.
batch_args = dict(
    dataset_id=test_dataset.id,
    version_id=test_version.id,
    split="test",
)

teacher_preds = client.predict_batch(model_id=teacher_model.id, **batch_args)
student_preds = client.predict_batch(model_id=student_model.id, **batch_args)

# Calculate per-class metrics
def per_class_accuracy(predictions, labels):
    """Return per-class accuracy as a dict mapping label -> accuracy.

    Each prediction must expose ``ground_truth`` and ``prediction``
    attributes. A label with no ground-truth examples gets accuracy 0.

    Args:
        predictions: iterable of prediction objects.
        labels: class names to report on.

    Returns:
        dict mapping each label to correct / total (0 when total == 0).
    """
    # Single pass over the predictions instead of two full scans per
    # label (the original was O(len(labels) * len(predictions)) twice).
    totals = {}
    hits = {}
    for p in predictions:
        gt = p.ground_truth
        totals[gt] = totals.get(gt, 0) + 1
        if p.prediction == gt:
            hits[gt] = hits.get(gt, 0) + 1
    return {
        label: (hits.get(label, 0) / totals[label] if totals.get(label, 0) > 0 else 0)
        for label in labels
    }

labels = ["good", "scratch", "dent", "discoloration", "crack"]
teacher_per_class = per_class_accuracy(teacher_preds, labels)
student_per_class = per_class_accuracy(student_preds, labels)

# Render a per-class comparison table; "Gap" is teacher minus student.
print("\nPer-Class Accuracy")
print("=" * 60)
print(f"{'Class':<20} {'Teacher':<15} {'Student':<15} {'Gap':<10}")
print("-" * 60)

for class_name in labels:
    teacher_acc = teacher_per_class[class_name]
    student_acc = student_per_class[class_name]
    delta = teacher_acc - student_acc
    delta_str = "0%" if delta == 0 else f"{delta:+.1%}"
    print(f"{class_name:<20} {teacher_acc:<15.2%} {student_acc:<15.2%} {delta_str:<10}")

Find Where Student Struggles

# Find items where teacher is right but student is wrong — these are the
# distillation gaps worth inspecting manually.
student_failures = []

for t_pred, s_pred in zip(teacher_preds, student_preds):
    teacher_correct = t_pred.prediction == t_pred.ground_truth
    student_correct = s_pred.prediction == s_pred.ground_truth

    if teacher_correct and not student_correct:
        student_failures.append({
            "item_id": s_pred.item_id,
            "ground_truth": s_pred.ground_truth,
            "teacher_pred": t_pred.prediction,
            "student_pred": s_pred.prediction,
            "student_confidence": s_pred.confidence
        })

print(f"\nStudent failures (teacher correct, student wrong): {len(student_failures)}")

# Group by error type. BUG FIX: the original key concatenated ground truth
# and prediction with no separator (e.g. "gooddent"), which is ambiguous;
# an arrow makes it read as true-label → predicted-label.
from collections import Counter
error_types = Counter(f"{f['ground_truth']} → {f['student_pred']}" for f in student_failures)

print("\nMost common student errors:")
for error_type, count in error_types.most_common(5):
    print(f"  {error_type}: {count}")

Agreement Analysis

# How often do teacher and student agree?
# Materialize the pairs once: the original iterated both prediction
# sequences several times, which breaks if they are generators.
paired = list(zip(teacher_preds, student_preds))
agreements = sum(1 for t, s in paired if t.prediction == s.prediction)
# Guard against an empty test set (ZeroDivisionError in the original).
agreement_rate = agreements / len(paired) if paired else 0.0

print(f"\nTeacher-Student Agreement: {agreement_rate:.2%}")

# When they disagree, who is usually right?
disagreements = [(t, s) for t, s in paired if t.prediction != s.prediction]

teacher_right_on_disagree = sum(1 for t, s in disagreements
                                if t.prediction == t.ground_truth)
student_right_on_disagree = sum(1 for t, s in disagreements
                                if s.prediction == s.ground_truth)

# Guard the percentage math: zero disagreements crashed the original.
if disagreements:
    n_disagree = len(disagreements)
    print(f"When they disagree ({n_disagree} cases):")
    print(f"  Teacher correct: {teacher_right_on_disagree} ({teacher_right_on_disagree/n_disagree:.1%})")
    print(f"  Student correct: {student_right_on_disagree} ({student_right_on_disagree/n_disagree:.1%})")
    print(f"  Both wrong: {n_disagree - teacher_right_on_disagree - student_right_on_disagree}")
else:
    print("The models never disagree on this test set.")

Speed Comparison

import time

# Benchmark inference speed
def benchmark_model(client, model_id, test_images, n_runs=50):
    """Return the mean per-prediction latency in milliseconds.

    Runs up to 5 warmup predictions (excluded from timing), then times
    ``n_runs`` predictions cycling through ``test_images``.

    Args:
        client: object exposing ``predict(model_id=..., item=...)``.
        model_id: identifier of the model to benchmark.
        test_images: non-empty sequence of items to predict on (cycled).
        n_runs: number of timed predictions.

    Returns:
        float: average milliseconds per prediction.
    """
    # Warmup so lazy initialization / caches don't skew the measurement.
    for img in test_images[:5]:
        client.predict(model_id=model_id, item=img)

    # Timed runs. BUG FIX: use perf_counter (monotonic, high resolution)
    # instead of time.time(), which can jump with wall-clock adjustments.
    start = time.perf_counter()
    for i in range(n_runs):
        img = test_images[i % len(test_images)]
        client.predict(model_id=model_id, item=img)
    elapsed = time.perf_counter() - start

    return (elapsed / n_runs) * 1000  # ms per prediction

test_images = ["test1.jpg", "test2.jpg", "test3.jpg", "test4.jpg", "test5.jpg"]

teacher_latency = benchmark_model(client, teacher_model.id, test_images)
student_latency = benchmark_model(client, student_model.id, test_images)

# Latency / throughput table, one row per model, plus the speedup ratio.
print("\nInference Speed")
print("=" * 50)
print(f"{'Model':<20} {'Latency (ms)':<15} {'Throughput':<15}")
print("-" * 50)
for model_name, latency_ms in (("Teacher", teacher_latency), ("Student", student_latency)):
    print(f"{model_name:<20} {latency_ms:<15.1f} {1000/latency_ms:<15.1f}/sec")
print(f"{'Speedup':<20} {teacher_latency/student_latency:<15.1f}x")

Cost Comparison

# Estimate cost per 1000 predictions
def estimate_cost(model, latency_ms, is_external_llm=False, external_cost_per_call=0.01):
    """Estimate the USD cost of serving 1000 predictions.

    External LLMs are priced per API call; self-hosted models are priced
    by GPU time derived from the measured latency.

    Args:
        model: the model object (unused; kept for a uniform call shape).
        latency_ms: measured latency per prediction, in milliseconds.
        is_external_llm: True when the model is a pay-per-call API.
        external_cost_per_call: USD per API call for external models.

    Returns:
        float: estimated USD cost per 1000 predictions.
    """
    if is_external_llm:
        # Pay-per-call pricing scales linearly with volume.
        return external_cost_per_call * 1000

    # Self-hosted: convert latency into GPU-hours at example rates.
    gpu_cost_per_hour = 1.0  # $1/hour for GPU
    predictions_per_hour = 3600 * 1000 / latency_ms
    return (1000 / predictions_per_hour) * gpu_cost_per_hour

# Teacher is billed per external API call; student runs on our own GPU.
teacher_cost = estimate_cost(
    teacher_model,
    teacher_latency,
    is_external_llm=True,
    external_cost_per_call=0.01,
)
student_cost = estimate_cost(student_model, student_latency, is_external_llm=False)

print("\nCost Comparison (per 1000 predictions)")
print("=" * 50)
print(f"Teacher: ${teacher_cost:.2f}")
print(f"Student: ${student_cost:.4f}")
print(f"Savings: {teacher_cost / student_cost:.0f}x")

Summary Report

def distillation_summary(teacher_acc, student_acc, teacher_latency, student_latency,
                        teacher_size_mb, student_size_mb, teacher_cost, student_cost):
    """Generate distillation summary report.

    Args:
        teacher_acc / student_acc: accuracies as fractions (0..1).
        teacher_latency / student_latency: latency in milliseconds.
        teacher_size_mb / student_size_mb: model sizes in MB.
        teacher_cost / student_cost: USD per 1000 predictions.

    Returns:
        str: formatted comparison table plus a deployment recommendation.
    """

    accuracy_gap = teacher_acc - student_acc
    speedup = teacher_latency / student_latency
    size_reduction = teacher_size_mb / student_size_mb
    cost_reduction = teacher_cost / student_cost

    # BUG FIX: in the original template the "Latency" row was fused onto
    # the "Accuracy" row line and the column separators after the format
    # fields were missing; each metric now renders on its own table row.
    report = f"""
╔══════════════════════════════════════════════════════════════╗
║                  DISTILLATION SUMMARY                        ║
╠══════════════════════════════════════════════════════════════╣
║  Metric              │  Teacher    │  Student   │  Change    ║
╠──────────────────────┼─────────────┼────────────┼────────────╣
║  Accuracy            │  {teacher_acc:>8.2%}   │  {student_acc:>8.2%}  │  {accuracy_gap:>+7.2%}  ║
║  Latency (ms)        │  {teacher_latency:>8.1f}   │  {student_latency:>8.1f}  │  {speedup:>6.1f}x ⚡  ║
║  Model Size (MB)     │  {teacher_size_mb:>8.1f}   │  {student_size_mb:>8.1f}  │  {size_reduction:>6.1f}x 📦  ║
║  Cost per 1000       │  ${teacher_cost:>7.2f}   │  ${student_cost:>7.4f}  │  {cost_reduction:>6.0f}x 💰  ║
╚══════════════════════════════════════════════════════════════╝
"""

    # Recommendation thresholds: ≤3pp gap → deploy, ≤5pp → judgment call.
    if accuracy_gap <= 0.03:
        recommendation = "✅ RECOMMENDED: Student matches teacher quality. Deploy student."
    elif accuracy_gap <= 0.05:
        recommendation = "⚠️  ACCEPTABLE: Small accuracy gap. Consider if tradeoff is worth it."
    else:
        recommendation = "❌ NEEDS WORK: Accuracy gap too large. Add more training data."

    report += f"\n{recommendation}\n"

    return report

# Build and print the final teacher-vs-student comparison report.
report = distillation_summary(
    teacher_acc=teacher_results["accuracy"],
    student_acc=student_results["accuracy"],
    teacher_latency=teacher_latency,
    student_latency=student_latency,
    teacher_size_mb=teacher_model.size_mb,
    student_size_mb=student_model.size_mb,
    teacher_cost=teacher_cost,
    student_cost=student_cost,
)

print(report)

Decision Criteria

| Accuracy Gap | Recommendation |
| --- | --- |
| < 2% | ✅ Excellent - Deploy student |
| 2-5% | ✅ Good - Deploy if speed/cost matters |
| 5-10% | ⚠️ Consider - May need more data |
| > 10% | ❌ Too large - Iterate on distillation |

If Student Needs Improvement

If the accuracy gap is too large:

  1. Add more training data - Label more examples with the teacher
  2. Use a larger student - Try EfficientNet-B0 instead of MobileNet
  3. Focus on weak classes - Add more examples for classes where student struggles
  4. Improve teacher - If teacher accuracy is also low, improve it first
  5. Lower confidence threshold - Include more teacher predictions (with review)

Next Step

If the student meets your quality bar, proceed to Deploy to put it into production.