Evaluate Results

Thoroughly assess your finetuned model’s performance before deployment.

Evaluation Overview

graph LR
    A[Trained Model] --> B[Validation Metrics]
    A --> C[Test Set Evaluation]
    A --> D[Per-Class Analysis]
    A --> E[Error Analysis]
    A --> F[Production Readiness]

Get Model Metrics

Evaluate on Test Set

Use a held-out test set that was never used during training:

Classification Metrics

Confusion Matrix

from collections import defaultdict

# Run inference over the held-out test split.
predictions = client.predict_batch(
    model_id=model.id,
    dataset_id=dataset.id,
    version_id=version.id,
    split="test"
)

# Tally (actual, predicted) pairs into a nested counter; missing
# combinations read back as 0 thanks to the defaultdict default.
confusion = defaultdict(lambda: defaultdict(int))
for item in predictions:
    confusion[item.ground_truth][item.prediction] += 1

# Render the matrix: one column per label, one row per actual label.
labels = sorted({item.ground_truth for item in predictions})

print("\nConfusion Matrix")
print("=" * 50)
header = f"{'Actual/Pred':<15}" + "".join(f"{label:<12}" for label in labels)
print(header)

for actual in labels:
    row = f"{actual:<15}" + "".join(
        f"{confusion[actual][predicted]:<12}" for predicted in labels
    )
    print(row)

Per-Class Metrics

def calculate_per_class_metrics(predictions, labels):
    """Calculate precision, recall, F1, and support for each class.

    Args:
        predictions: iterable of records exposing `.prediction` and
            `.ground_truth` attributes.
        labels: class labels to report on. Predictions whose labels fall
            outside this list do not contribute to any count (same
            behavior as before).

    Returns:
        Dict mapping each label to a dict with keys "precision",
        "recall", "f1", and "support".
    """
    # Count in a single pass over predictions (O(P + L)) instead of the
    # previous three full scans per label (O(P * L)); counts are identical.
    tp = dict.fromkeys(labels, 0)
    fp = dict.fromkeys(labels, 0)
    fn = dict.fromkeys(labels, 0)

    for p in predictions:
        if p.prediction == p.ground_truth:
            if p.prediction in tp:
                tp[p.prediction] += 1
        else:
            if p.prediction in fp:
                fp[p.prediction] += 1
            if p.ground_truth in fn:
                fn[p.ground_truth] += 1

    metrics = {}
    for label in labels:
        # Guard every ratio so classes with no predictions/instances
        # report 0 instead of raising ZeroDivisionError.
        precision = tp[label] / (tp[label] + fp[label]) if (tp[label] + fp[label]) > 0 else 0
        recall = tp[label] / (tp[label] + fn[label]) if (tp[label] + fn[label]) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        metrics[label] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            # Support = number of ground-truth instances of this class.
            "support": tp[label] + fn[label]
        }

    return metrics

# Compute per-class metrics and print them as a fixed-width table.
metrics = calculate_per_class_metrics(predictions, labels)

print("\nPer-Class Metrics")
print("=" * 70)
print(f"{'Class':<15} {'Precision':<12} {'Recall':<12} {'F1':<12} {'Support':<10}")
print("-" * 70)

for label in labels:
    m = metrics[label]
    print(f"{label:<15} {m['precision']:<12.2%} {m['recall']:<12.2%} "
          f"{m['f1']:<12.2%} {m['support']:<10}")

# Macro averages: unweighted mean over classes, accumulated in one pass.
totals = {"precision": 0.0, "recall": 0.0, "f1": 0.0}
for m in metrics.values():
    for key in totals:
        totals[key] += m[key]

n_classes = len(metrics)
avg_precision = totals["precision"] / n_classes
avg_recall = totals["recall"] / n_classes
avg_f1 = totals["f1"] / n_classes

print("-" * 70)
print(f"{'Macro Avg':<15} {avg_precision:<12.2%} {avg_recall:<12.2%} {avg_f1:<12.2%}")

Understanding Metrics

| Metric | What It Measures | When It Matters |
|--------|------------------|-----------------|
| Accuracy | Overall correct predictions | Balanced classes |
| Precision | Of predicted positives, how many are correct | Cost of false positives high |
| Recall | Of actual positives, how many were found | Cost of false negatives high |
| F1 Score | Balance of precision and recall | Need both |

Object Detection Metrics

# For object detection models: evaluate on the test split at IoU
# thresholds 0.5 and 0.75.
det_results = client.evaluate_detection_model(
    model_id=model.id,
    dataset_id=dataset.id,
    version_id=version.id,
    split="test",
    iou_thresholds=[0.5, 0.75]
)

print("\nObject Detection Metrics")
print("=" * 50)
# NOTE(review): assumes the result dict exposes 'mAP_50', 'mAP_75',
# 'mAP' (presumably averaged over IoU 0.5:0.95, per the label below),
# and 'per_class_AP_50' — confirm against the client library docs.
print(f"mAP@0.5: {det_results['mAP_50']:.2%}")
print(f"mAP@0.75: {det_results['mAP_75']:.2%}")
print(f"mAP@0.5:0.95: {det_results['mAP']:.2%}")

print("\nPer-Class AP@0.5:")
for label, ap in det_results['per_class_AP_50'].items():
    print(f"  {label}: {ap:.2%}")

Error Analysis

Find Misclassified Examples

# Collect every prediction that disagrees with its ground-truth label.
errors = [p for p in predictions if p.prediction != p.ground_truth]

print(f"\nError Analysis")
print(f"Total errors: {len(errors)} / {len(predictions)} ({len(errors)/len(predictions):.1%})")

# Group errors by their (actual, predicted) confusion pair.
error_types = defaultdict(list)
for e in errors:
    # Separator keeps the pair readable and unambiguous — plain
    # concatenation ("catdog") merged the two labels into one token and
    # could collide for different label pairs.
    key = f"{e.ground_truth} → {e.prediction}"
    error_types[key].append(e)

# Show the ten most frequent confusion pairs.
print("\nMost Common Errors:")
for error_type, examples in sorted(error_types.items(), key=lambda x: -len(x[1]))[:10]:
    print(f"  {error_type}: {len(examples)} cases")

Analyze Low-Confidence Predictions

# Find predictions with low confidence
low_confidence = [p for p in predictions if p.confidence < 0.7]

print(f"\nLow Confidence Predictions (< 70%)")
print(f"Count: {len(low_confidence)} / {len(predictions)} ({len(low_confidence)/len(predictions):.1%})")

# Check accuracy on low-confidence predictions
low_conf_correct = sum(1 for p in low_confidence if p.prediction == p.ground_truth)
print(f"Accuracy on low-confidence: {low_conf_correct/len(low_confidence):.1%}")

# vs high confidence
high_confidence = [p for p in predictions if p.confidence >= 0.7]
high_conf_correct = sum(1 for p in high_confidence if p.prediction == p.ground_truth)
print(f"Accuracy on high-confidence: {high_conf_correct/len(high_confidence):.1%}")

Export Errors for Review

import csv

# Dump the misclassified items to CSV so they can be reviewed by hand.
with open("errors_to_review.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["item_id", "file_path", "actual", "predicted", "confidence"])
    writer.writerows(
        [e.item_id, e.file_path, e.ground_truth, e.prediction, f"{e.confidence:.2%}"]
        for e in errors
    )

print(f"Exported {len(errors)} errors to errors_to_review.csv")

Compare Models

Compare your new model against previous versions or baselines:

def compare_models(client, model_ids, dataset_id, version_id, split="test"):
    """Evaluate several models on the same split and print a ranked table.

    Args:
        client: API client providing `get_model` and `evaluate_model`.
        model_ids: ids of the models to evaluate.
        dataset_id / version_id: dataset version holding the split.
        split: which split to evaluate on (defaults to "test").

    Returns:
        One summary dict per model ({model_id, name, accuracy, loss}),
        in the same order as `model_ids`.
    """
    summaries = []
    for candidate_id in model_ids:
        candidate = client.get_model(candidate_id)
        evaluation = client.evaluate_model(
            model_id=candidate_id,
            dataset_id=dataset_id,
            version_id=version_id,
            split=split
        )
        summaries.append({
            "model_id": candidate_id,
            "name": candidate.name,
            "accuracy": evaluation['accuracy'],
            "loss": evaluation['loss']
        })

    # Print the comparison table, best accuracy first.
    print("\nModel Comparison")
    print("=" * 70)
    print(f"{'Model':<30} {'Accuracy':<15} {'Loss':<15}")
    print("-" * 70)

    ranked = sorted(summaries, key=lambda row: row['accuracy'], reverse=True)
    for r in ranked:
        print(f"{r['name']:<30} {r['accuracy']:<15.2%} {r['loss']:<15.4f}")

    return summaries

# Compare the finetuned versions against the baseline on the shared test
# split. `model_v1`, `model_v2`, and `baseline_model` are assumed to come
# from earlier training steps — TODO confirm against the preceding pages.
results = compare_models(
    client,
    model_ids=[model_v1.id, model_v2.id, baseline_model.id],
    dataset_id=dataset.id,
    version_id=version.id
)

Statistical Significance

Ensure improvements are real, not random:

import numpy as np
from scipy import stats

def bootstrap_accuracy(predictions, n_bootstrap=1000):
    """Estimate accuracy with a bootstrap 95% confidence interval.

    Args:
        predictions: sequence of records exposing `.prediction` and
            `.ground_truth` attributes.
        n_bootstrap: number of bootstrap resamples to draw.

    Returns:
        Tuple (mean_accuracy, ci_lower, ci_upper) over the resamples.
    """
    # Precompute correctness once as a float array. Resampling a numeric
    # array is far cheaper than the original np.random.choice over the
    # object list, which re-wrapped it into an object ndarray and
    # re-evaluated the comparisons on every iteration.
    correct = np.fromiter(
        (1.0 if p.prediction == p.ground_truth else 0.0 for p in predictions),
        dtype=float,
        count=len(predictions),
    )

    # Sample with replacement; each resample's accuracy is just the mean
    # of the sampled 0/1 correctness values.
    accuracies = [
        float(np.random.choice(correct, size=len(correct), replace=True).mean())
        for _ in range(n_bootstrap)
    ]

    # 95% confidence interval from the bootstrap distribution.
    ci_lower = np.percentile(accuracies, 2.5)
    ci_upper = np.percentile(accuracies, 97.5)
    mean_acc = np.mean(accuracies)

    return mean_acc, ci_lower, ci_upper

# Report the bootstrap point estimate alongside its 95% interval.
mean, ci_low, ci_high = bootstrap_accuracy(predictions)
print(f"\nAccuracy: {mean:.2%} (95% CI: {ci_low:.2%} - {ci_high:.2%})")

Production Readiness Checklist

Before deploying, verify:

ℹ️

Performance

  • Test accuracy meets requirements
  • Per-class performance is acceptable (no catastrophic failures)
  • Confidence calibration is reasonable

Robustness

  • Tested on edge cases
  • Handles out-of-distribution inputs gracefully
  • Performance consistent across data subsets

Operational

  • Inference latency acceptable
  • Model size fits deployment target
  • Monitoring and alerting planned

Generate Evaluation Report

def generate_evaluation_report(client, model_id, dataset_id, version_id):
    """Build a markdown evaluation report for a trained model.

    Fetches the model record and its test-split metrics via the client,
    then renders model info, overall and per-class performance, and a
    go/no-go recommendation (accuracy threshold 0.9).

    Returns:
        The complete report as a markdown string.
    """
    model = client.get_model(model_id)
    evaluation = client.evaluate_model(
        model_id=model_id,
        dataset_id=dataset_id,
        version_id=version_id,
        split="test"
    )

    header = f"""
# Model Evaluation Report

## Model Information
- **Model ID:** {model.id}
- **Model Name:** {model.name}
- **Base Model:** {model.config.get('base_model', 'N/A')}
- **Training Dataset:** {dataset_id}

## Overall Performance
- **Test Accuracy:** {evaluation['accuracy']:.2%}
- **Test Loss:** {evaluation['loss']:.4f}

## Per-Class Performance
| Class | Precision | Recall | F1 | Support |
|-------|-----------|--------|-----|---------|
"""

    # One markdown table row per class; tolerate a missing 'per_class' key.
    rows = "".join(
        f"| {label} | {m['precision']:.2%} | {m['recall']:.2%} | {m['f1']:.2%} | {m['support']} |\n"
        for label, m in evaluation.get('per_class', {}).items()
    )

    verdict = ('✅ Model meets accuracy threshold'
               if evaluation['accuracy'] > 0.9
               else '⚠️ Consider improving model accuracy')
    footer = f"""
## Recommendations
{verdict}
"""

    return header + rows + footer

# Render and display the full markdown report for the current model.
report = generate_evaluation_report(client, model.id, dataset.id, version.id)
print(report)

Next Step

If your model meets requirements, you’re ready for deployment. If not, proceed to Iterate & Improve to enhance performance.