Create a test dataset with human-verified labels
Compare teacher and student models on the same test set to ensure the student meets quality requirements.
Important: Use Ground Truth
⚠️
Do not evaluate on teacher-labeled data. The student was trained on teacher labels, so comparing them on that data is circular. Use a held-out test set with human-verified ground truth labels.
graph LR
A[Teacher Labels] -->|Training| B[Student Model]
C[Ground Truth Test Set] -->|Evaluate| D[Teacher]
C -->|Evaluate| E[Student]
D --> F[Compare]
E --> F

Prepare Test Set
If you don’t have a ground truth test set, create one:
from seeme import Client

client = Client()

# Ground-truth dataset that will hold the human-verified labels
test_dataset = client.create_dataset(
    name="Test Set: Product Classifier",
    description="Human-verified ground truth for evaluation"
)
test_version = client.create_dataset_version(
    dataset_id=test_dataset.id,
    name="v1"
)

# Register the split and the label vocabulary
test_split = client.create_split(version_id=test_version.id, name="test")
defect_classes = ("good", "scratch", "dent", "discoloration", "crack")
for class_name in defect_classes:
    client.create_label(version_id=test_version.id, name=class_name)

# Upload and manually label test images
# This should be done by humans, not the teacher model!

Run Evaluation
Evaluate both models on the same test set:
# Score each model on the same held-out test split
teacher_results = client.evaluate_model(
    model_id=teacher_model.id,
    dataset_id=test_dataset.id,
    version_id=test_version.id,
    split="test"
)
student_results = client.evaluate_model(
    model_id=student_model.id,
    dataset_id=test_dataset.id,
    version_id=test_version.id,
    split="test"
)

# Side-by-side summary table
header = f"{'Metric':<20} {'Teacher':<15} {'Student':<15}"
print("Evaluation Results")
print("=" * 50)
print(header)
print("-" * 50)
print(f"{'Accuracy':<20} {teacher_results['accuracy']:<15.2%} {student_results['accuracy']:<15.2%}")
print(f"{'Loss':<20} {teacher_results['loss']:<15.4f} {student_results['loss']:<15.4f}")

Detailed Comparison
Per-Class Performance
# Get predictions from both models on the identical test split
shared_args = dict(
    dataset_id=test_dataset.id,
    version_id=test_version.id,
    split="test",
)
teacher_preds = client.predict_batch(model_id=teacher_model.id, **shared_args)
student_preds = client.predict_batch(model_id=student_model.id, **shared_args)
def per_class_accuracy(predictions, labels):
    """Compute per-class accuracy (recall): for each class, the fraction of
    items whose ground truth is that class that the model predicted correctly.

    Args:
        predictions: iterable of objects exposing ``ground_truth`` and
            ``prediction`` attributes.
        labels: class names to report on.

    Returns:
        Dict mapping each label to ``correct / total``, or 0 when the class
        has no items in ``predictions``.
    """
    # Single pass over predictions instead of two scans per label
    # (the original was O(len(labels) * len(predictions))).
    totals = {label: 0 for label in labels}
    hits = {label: 0 for label in labels}
    for pred in predictions:
        truth = pred.ground_truth
        if truth in totals:
            totals[truth] += 1
            if pred.prediction == truth:
                hits[truth] += 1
    return {label: hits[label] / totals[label] if totals[label] > 0 else 0
            for label in labels}
# Class list must match the labels created for the test dataset
labels = ["good", "scratch", "dent", "discoloration", "crack"]
teacher_per_class = per_class_accuracy(teacher_preds, labels)
student_per_class = per_class_accuracy(student_preds, labels)
print("\nPer-Class Accuracy")
print("=" * 60)
print(f"{'Class':<20} {'Teacher':<15} {'Student':<15} {'Gap':<10}")
print("-" * 60)
# Positive gap means the teacher beats the student on that class
for label in labels:
t_acc = teacher_per_class[label]
s_acc = student_per_class[label]
gap = t_acc - s_acc
gap_str = f"{gap:+.1%}" if gap != 0 else "0%"
print(f"{label:<20} {t_acc:<15.2%} {s_acc:<15.2%} {gap_str:<10}")

Find Where Student Struggles
# Find items where teacher is right but student is wrong
student_failures = []
for t_pred, s_pred in zip(teacher_preds, student_preds):
teacher_correct = t_pred.prediction == t_pred.ground_truth
student_correct = s_pred.prediction == s_pred.ground_truth
if teacher_correct and not student_correct:
student_failures.append({
"item_id": s_pred.item_id,
"ground_truth": s_pred.ground_truth,
"teacher_pred": t_pred.prediction,
"student_pred": s_pred.prediction,
"student_confidence": s_pred.confidence
})
print(f"\nStudent failures (teacher correct, student wrong): {len(student_failures)}")
# Group by error type
from collections import Counter
error_types = Counter(f"{f['ground_truth']} → {f['student_pred']}" for f in student_failures)
print("\nMost common student errors:")
for error_type, count in error_types.most_common(5):
print(f" {error_type}: {count}")

Agreement Analysis
# How often do teacher and student produce the same label?
agreements = sum(1 for t, s in zip(teacher_preds, student_preds)
                 if t.prediction == s.prediction)
agreement_rate = agreements / len(teacher_preds)
print(f"\nTeacher-Student Agreement: {agreement_rate:.2%}")

# When they disagree, which model matches ground truth more often?
disagreements = [(t, s) for t, s in zip(teacher_preds, student_preds)
                 if t.prediction != s.prediction]
teacher_right_on_disagree = sum(1 for t, _ in disagreements
                                if t.prediction == t.ground_truth)
student_right_on_disagree = sum(1 for _, s in disagreements
                                if s.prediction == s.ground_truth)
# Guard: if the models agree on every item, the percentage math below
# would otherwise raise ZeroDivisionError.
n_disagree = len(disagreements) or 1
print(f"When they disagree ({len(disagreements)} cases):")
print(f" Teacher correct: {teacher_right_on_disagree} ({teacher_right_on_disagree/n_disagree:.1%})")
print(f" Student correct: {student_right_on_disagree} ({student_right_on_disagree/n_disagree:.1%})")
print(f" Both wrong: {len(disagreements) - teacher_right_on_disagree - student_right_on_disagree}")

Speed Comparison
import time

def benchmark_model(client, model_id, test_images, n_runs=50, n_warmup=5):
    """Measure average single-prediction latency in milliseconds.

    Args:
        client: API client exposing ``predict(model_id=..., item=...)``.
        model_id: identifier of the model to benchmark.
        test_images: items cycled through during the timed runs.
        n_runs: number of timed predictions to average over.
        n_warmup: untimed predictions issued first so cold-start costs
            (connection setup, model load) don't skew the average.

    Returns:
        Mean latency per prediction, in milliseconds.
    """
    # Warmup runs — results discarded
    for img in test_images[:n_warmup]:
        client.predict(model_id=model_id, item=img)

    # perf_counter is monotonic and high-resolution; time.time() is
    # wall-clock and can jump (NTP adjustments), which corrupts benchmarks.
    start = time.perf_counter()
    for i in range(n_runs):
        img = test_images[i % len(test_images)]
        client.predict(model_id=model_id, item=img)
    elapsed = time.perf_counter() - start
    return (elapsed / n_runs) * 1000  # ms per prediction
# Benchmark both models on a handful of representative images
test_images = ["test1.jpg", "test2.jpg", "test3.jpg", "test4.jpg", "test5.jpg"]
teacher_latency = benchmark_model(client, teacher_model.id, test_images)
student_latency = benchmark_model(client, student_model.id, test_images)

print("\nInference Speed")
print("=" * 50)
print(f"{'Model':<20} {'Latency (ms)':<15} {'Throughput':<15}")
print("-" * 50)
for model_name, latency in (("Teacher", teacher_latency), ("Student", student_latency)):
    print(f"{model_name:<20} {latency:<15.1f} {1000/latency:<15.1f}/sec")
print(f"{'Speedup':<20} {teacher_latency/student_latency:<15.1f}x")

Cost Comparison
def estimate_cost(model, latency_ms, is_external_llm=False, external_cost_per_call=0.01):
    """Estimate the dollar cost of 1000 predictions.

    Args:
        model: model handle (currently unused; kept for interface
            stability and future per-model pricing).
        latency_ms: measured mean latency per prediction, in milliseconds.
        is_external_llm: True for pay-per-call hosted models, where
            latency is irrelevant to cost.
        external_cost_per_call: dollars per API call for external models.

    Returns:
        Estimated dollars per 1000 predictions.
    """
    if is_external_llm:
        # Pay-per-call pricing: cost scales with call count only.
        return external_cost_per_call * 1000
    # Self-hosted: bill GPU time. 1000 predictions occupy the GPU for
    # 1000 * (latency_ms / 1000) seconds = latency_ms seconds.
    gpu_cost_per_hour = 1.0  # example rate: $1/hour for GPU
    return gpu_cost_per_hour * latency_ms / 3600
# Teacher is an external pay-per-call LLM; student runs on our own GPU.
teacher_cost = estimate_cost(teacher_model, teacher_latency,
                             is_external_llm=True, external_cost_per_call=0.01)
student_cost = estimate_cost(student_model, student_latency,
                             is_external_llm=False)

print("\nCost Comparison (per 1000 predictions)")
print("=" * 50)
print(f"Teacher: ${teacher_cost:.2f}")
print(f"Student: ${student_cost:.4f}")
print(f"Savings: {teacher_cost / student_cost:.0f}x")

Summary Report
def distillation_summary(teacher_acc, student_acc, teacher_latency, student_latency,
                         teacher_size_mb, student_size_mb, teacher_cost, student_cost):
    """Generate distillation summary report.

    Args:
        teacher_acc, student_acc: accuracies as fractions (0.0-1.0).
        teacher_latency, student_latency: mean latency per prediction, ms.
        teacher_size_mb, student_size_mb: model sizes in megabytes.
        teacher_cost, student_cost: dollars per 1000 predictions.

    Returns:
        Printable multi-line report string ending with a deploy
        recommendation derived from the accuracy gap.
    """
    accuracy_gap = teacher_acc - student_acc  # positive = student is worse
    speedup = teacher_latency / student_latency
    size_reduction = teacher_size_mb / student_size_mb
    cost_reduction = teacher_cost / student_cost
    report = f"""
╔══════════════════════════════════════════════════════════════╗
║ DISTILLATION SUMMARY ║
╠══════════════════════════════════════════════════════════════╣
║ Metric │ Teacher │ Student │ Change ║
╠──────────────────────┼─────────────┼────────────┼────────────╣
║ Accuracy │ {teacher_acc:>8.2%} │ {student_acc:>8.2%} │ {accuracy_gap:>+7.2%} ║
║ Latency (ms) │ {teacher_latency:>8.1f} │ {student_latency:>8.1f} │ {speedup:>6.1f}x ⚡ ║
║ Model Size (MB) │ {teacher_size_mb:>8.1f} │ {student_size_mb:>8.1f} │ {size_reduction:>6.1f}x 📦 ║
║ Cost per 1000 │ ${teacher_cost:>7.2f} │ ${student_cost:>7.4f} │ {cost_reduction:>6.0f}x 💰 ║
╚══════════════════════════════════════════════════════════════╝
"""
    # Recommendation thresholds: <=3% gap counts as matching the teacher;
    # <=5% is borderline; anything larger needs more work.
    if accuracy_gap <= 0.03:
        recommendation = "✅ RECOMMENDED: Student matches teacher quality. Deploy student."
    elif accuracy_gap <= 0.05:
        recommendation = "⚠️ ACCEPTABLE: Small accuracy gap. Consider if tradeoff is worth it."
    else:
        recommendation = "❌ NEEDS WORK: Accuracy gap too large. Add more training data."
    report += f"\n{recommendation}\n"
    return report
# Assemble the final report from the measurements gathered above
report = distillation_summary(
    teacher_acc=teacher_results['accuracy'],
    teacher_latency=teacher_latency,
    teacher_size_mb=teacher_model.size_mb,
    teacher_cost=teacher_cost,
    student_acc=student_results['accuracy'],
    student_latency=student_latency,
    student_size_mb=student_model.size_mb,
    student_cost=student_cost,
)
print(report)

Decision Criteria
| Accuracy Gap | Recommendation |
|---|---|
| < 2% | ✅ Excellent - Deploy student |
| 2-5% | ✅ Good - Deploy if speed/cost matters |
| 5-10% | ⚠️ Consider - May need more data |
| > 10% | ❌ Too large - Iterate on distillation |
If Student Needs Improvement
If the accuracy gap is too large:
- Add more training data - Label more examples with the teacher
- Use a larger student - Try EfficientNet-B0 instead of MobileNet
- Focus on weak classes - Add more examples for classes where student struggles
- Improve teacher - If teacher accuracy is also low, improve it first
- Lower confidence threshold - Include more teacher predictions (with review)
Next Step
If the student meets your quality bar, proceed to Deploy to put it into production.