Evaluate Results
Thoroughly assess your finetuned model’s performance before deployment.
Evaluation Overview
graph LR
A[Trained Model] --> B[Validation Metrics]
A --> C[Test Set Evaluation]
A --> D[Per-Class Analysis]
A --> E[Error Analysis]
A --> F[Production Readiness]
Get Model Metrics
Evaluate on Test Set
Use a held-out test set that was never used during training:
Classification Metrics
Confusion Matrix
from collections import defaultdict

# Run the finetuned model over the held-out test split
predictions = client.predict_batch(
    model_id=model.id,
    dataset_id=dataset.id,
    version_id=version.id,
    split="test"
)

# Tally actual-vs-predicted label pairs
confusion = defaultdict(lambda: defaultdict(int))
for pred in predictions:
    confusion[pred.ground_truth][pred.prediction] += 1

# Render the confusion matrix as an aligned text table
labels = sorted({p.ground_truth for p in predictions})
print("\nConfusion Matrix")
print("=" * 50)
print(f"{'Actual/Pred':<15}", end="")
for label in labels:
    print(f"{label:<12}", end="")
print()
for actual in labels:
    print(f"{actual:<15}", end="")
    for predicted in labels:
        print(f"{confusion[actual][predicted]:<12}", end="")
print()
Per-Class Metrics
def calculate_per_class_metrics(predictions, labels):
    """Compute precision, recall, F1, and support for each class.

    Args:
        predictions: iterable of objects exposing ``prediction`` and
            ``ground_truth`` attributes.
        labels: class labels to report on; pairs involving labels outside
            this list are ignored (same behavior as before).

    Returns:
        dict mapping each label to a dict with keys ``"precision"``,
        ``"recall"``, ``"f1"``, and ``"support"``.
    """
    # Count tp/fp/fn in a single pass over predictions (O(n + k))
    # instead of three full scans per label (O(n * k)).
    tp = {label: 0 for label in labels}
    fp = {label: 0 for label in labels}
    fn = {label: 0 for label in labels}
    for p in predictions:
        actual, predicted = p.ground_truth, p.prediction
        if predicted == actual:
            if actual in tp:
                tp[actual] += 1
        else:
            if predicted in fp:
                fp[predicted] += 1
            if actual in fn:
                fn[actual] += 1

    metrics = {}
    for label in labels:
        t, f_pos, f_neg = tp[label], fp[label], fn[label]
        precision = t / (t + f_pos) if (t + f_pos) > 0 else 0
        recall = t / (t + f_neg) if (t + f_neg) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        metrics[label] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            # Support = number of true instances of this class in the test set.
            "support": t + f_neg
        }
    return metrics
# Compute per-class metrics and print them as an aligned summary table
metrics = calculate_per_class_metrics(predictions, labels)

print("\nPer-Class Metrics")
print("=" * 70)
print(f"{'Class':<15} {'Precision':<12} {'Recall':<12} {'F1':<12} {'Support':<10}")
print("-" * 70)
for label in labels:
    row = metrics[label]
    print(f"{label:<15} {row['precision']:<12.2%} {row['recall']:<12.2%} "
          f"{row['f1']:<12.2%} {row['support']:<10}")

# Macro averages weight every class equally, regardless of its support
n_classes = len(metrics)
avg_precision = sum(m['precision'] for m in metrics.values()) / n_classes
avg_recall = sum(m['recall'] for m in metrics.values()) / n_classes
avg_f1 = sum(m['f1'] for m in metrics.values()) / n_classes
print("-" * 70)
print(f"{'Macro Avg':<15} {avg_precision:<12.2%} {avg_recall:<12.2%} {avg_f1:<12.2%}")
Understanding Metrics
| Metric | What It Measures | When It Matters |
|---|---|---|
| Accuracy | Overall correct predictions | Balanced classes |
| Precision | Of predicted positives, how many are correct | Cost of false positives high |
| Recall | Of actual positives, how many were found | Cost of false negatives high |
| F1 Score | Balance of precision and recall | Need both |
Object Detection Metrics
# Evaluate an object-detection model at multiple IoU thresholds
det_results = client.evaluate_detection_model(
    model_id=model.id,
    dataset_id=dataset.id,
    version_id=version.id,
    split="test",
    iou_thresholds=[0.5, 0.75]
)

# Report mAP at each threshold, then the COCO-style averaged range
print("\nObject Detection Metrics")
print("=" * 50)
print(f"mAP@0.5: {det_results['mAP_50']:.2%}")
print(f"mAP@0.75: {det_results['mAP_75']:.2%}")
print(f"mAP@0.5:0.95: {det_results['mAP']:.2%}")

print("\nPer-Class AP@0.5:")
for label, ap in det_results['per_class_AP_50'].items():
print(f" {label}: {ap:.2%}")
Error Analysis
Find Misclassified Examples
# Collect every prediction that disagrees with its ground-truth label
errors = [p for p in predictions if p.prediction != p.ground_truth]
print(f"\nError Analysis")
print(f"Total errors: {len(errors)} / {len(predictions)} ({len(errors)/len(predictions):.1%})")

# Bucket the errors by their (actual -> predicted) confusion pair
error_types = defaultdict(list)
for e in errors:
    error_types[f"{e.ground_truth} → {e.prediction}"].append(e)

# Show the ten most frequent confusion pairs
print("\nMost Common Errors:")
for error_type, examples in sorted(error_types.items(), key=lambda x: -len(x[1]))[:10]:
print(f" {error_type}: {len(examples)} cases")
Analyze Low-Confidence Predictions
# Split predictions at the 70% confidence threshold
low_confidence = [p for p in predictions if p.confidence < 0.7]
print(f"\nLow Confidence Predictions (< 70%)")
print(f"Count: {len(low_confidence)} / {len(predictions)} ({len(low_confidence)/len(predictions):.1%})")

# Accuracy within the low-confidence bucket.
# Guard the division: if no prediction falls below the threshold the
# original code raised ZeroDivisionError.
low_conf_correct = sum(1 for p in low_confidence if p.prediction == p.ground_truth)
if low_confidence:
    print(f"Accuracy on low-confidence: {low_conf_correct/len(low_confidence):.1%}")

# Compare against the high-confidence bucket
high_confidence = [p for p in predictions if p.confidence >= 0.7]
high_conf_correct = sum(1 for p in high_confidence if p.prediction == p.ground_truth)
print(f"Accuracy on high-confidence: {high_conf_correct/len(high_confidence):.1%}")
Export Errors for Review
import csv

# Dump each misclassified example to CSV for manual inspection
with open("errors_to_review.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["item_id", "file_path", "actual", "predicted", "confidence"])
    writer.writerows(
        [e.item_id, e.file_path, e.ground_truth, e.prediction, f"{e.confidence:.2%}"]
        for e in errors
    )
print(f"Exported {len(errors)} errors to errors_to_review.csv")
Compare Models
Compare your new model against previous versions or baselines:
def compare_models(client, model_ids, dataset_id, version_id, split="test"):
    """Evaluate several models on one split and print a leaderboard.

    Returns a list of dicts (one per model, in the order given) with keys
    ``model_id``, ``name``, ``accuracy``, and ``loss``.
    """
    results = []
    for mid in model_ids:
        info = client.get_model(mid)
        evaluation = client.evaluate_model(
            model_id=mid,
            dataset_id=dataset_id,
            version_id=version_id,
            split=split
        )
        results.append({
            "model_id": mid,
            "name": info.name,
            "accuracy": evaluation['accuracy'],
            "loss": evaluation['loss']
        })

    # Leaderboard: best accuracy first (display only; return order is input order)
    print("\nModel Comparison")
    print("=" * 70)
    print(f"{'Model':<30} {'Accuracy':<15} {'Loss':<15}")
    print("-" * 70)
    for entry in sorted(results, key=lambda item: item['accuracy'], reverse=True):
        print(f"{entry['name']:<30} {entry['accuracy']:<15.2%} {entry['loss']:<15.4f}")
    return results
# Compare models
# NOTE(review): model_v1, model_v2, and baseline_model must be fetched
# earlier (e.g. via client.get_model) before this comparison runs.
results = compare_models(
client,
model_ids=[model_v1.id, model_v2.id, baseline_model.id],
dataset_id=dataset.id,
version_id=version.id
)
Statistical Significance
Ensure improvements are real, not random:
import numpy as np
from scipy import stats
def bootstrap_accuracy(predictions, n_bootstrap=1000, seed=None):
    """Estimate mean accuracy and a 95% CI via bootstrap resampling.

    Args:
        predictions: non-empty sequence of objects exposing ``prediction``
            and ``ground_truth`` attributes.
        n_bootstrap: number of bootstrap resamples to draw.
        seed: optional RNG seed for reproducible intervals.

    Returns:
        (mean_accuracy, ci_lower, ci_upper) tuple of floats.

    Raises:
        ValueError: if ``predictions`` is empty.
    """
    if not predictions:
        raise ValueError("predictions must not be empty")
    # Resample indices rather than the prediction objects themselves:
    # np.random.choice on tuple-like items coerces them into a 2-D array
    # and fails with "a must be 1-dimensional".
    rng = np.random.default_rng(seed)
    n = len(predictions)
    accuracies = []
    for _ in range(n_bootstrap):
        idx = rng.integers(0, n, size=n)  # sample with replacement
        correct = sum(1 for i in idx if predictions[i].prediction == predictions[i].ground_truth)
        accuracies.append(correct / n)
    # 95% confidence interval from the bootstrap distribution
    ci_lower = np.percentile(accuracies, 2.5)
    ci_upper = np.percentile(accuracies, 97.5)
    return float(np.mean(accuracies)), float(ci_lower), float(ci_upper)
# Bootstrap the accuracy estimate over the full test-set predictions
mean, ci_low, ci_high = bootstrap_accuracy(predictions)
print(f"\nAccuracy: {mean:.2%} (95% CI: {ci_low:.2%} - {ci_high:.2%})")
Production Readiness Checklist
Before deploying, verify:
ℹ️
Performance
- Test accuracy meets requirements
- Per-class performance is acceptable (no catastrophic failures)
- Confidence calibration is reasonable
Robustness
- Tested on edge cases
- Handles out-of-distribution inputs gracefully
- Performance consistent across data subsets
Operational
- Inference latency acceptable
- Model size fits deployment target
- Monitoring and alerting planned
Generate Evaluation Report
def generate_evaluation_report(client, model_id, dataset_id, version_id):
    """Build a markdown evaluation report for a trained model.

    Fetches the model and its test-split evaluation through ``client``,
    then returns the report as a single markdown string.
    """
    model = client.get_model(model_id)
    test_results = client.evaluate_model(
        model_id=model_id,
        dataset_id=dataset_id,
        version_id=version_id,
        split="test"
    )
    # Recommendation depends on a fixed 90% accuracy threshold
    recommendation = (
        '✅ Model meets accuracy threshold'
        if test_results['accuracy'] > 0.9
        else '⚠️ Consider improving model accuracy'
    )
    lines = [
        "",
        "# Model Evaluation Report",
        "## Model Information",
        f"- **Model ID:** {model.id}",
        f"- **Model Name:** {model.name}",
        f"- **Base Model:** {model.config.get('base_model', 'N/A')}",
        f"- **Training Dataset:** {dataset_id}",
        "## Overall Performance",
        f"- **Test Accuracy:** {test_results['accuracy']:.2%}",
        f"- **Test Loss:** {test_results['loss']:.4f}",
        "## Per-Class Performance",
        "| Class | Precision | Recall | F1 | Support |",
        "|-------|-----------|--------|-----|---------|",
    ]
    # One markdown table row per class (absent 'per_class' key -> no rows)
    for label, metrics in test_results.get('per_class', {}).items():
        lines.append(
            f"| {label} | {metrics['precision']:.2%} | {metrics['recall']:.2%} | "
            f"{metrics['f1']:.2%} | {metrics['support']} |"
        )
    lines += ["", "## Recommendations", recommendation]
    return "\n".join(lines) + "\n"
# Build the final evaluation report for the trained model
report = generate_evaluation_report(client, model.id, dataset.id, version.id)
print(report)
Next Step
If your model meets requirements, you’re ready for deployment. If not, proceed to Iterate & Improve to enhance performance.