Evaluate Results

Thoroughly assess your finetuned model’s performance before deployment.

Evaluation Overview

graph LR
    A[Trained Model] --> B[Validation Metrics]
    A --> C[Test Set Evaluation]
    A --> D[Per-Class Analysis]
    A --> E[Error Analysis]
    A --> F[Production Readiness]

Get Model Metrics

Evaluate on Test Set

Use a held-out test set that was never used during training:

Classification Metrics

Confusion Matrix

from collections import defaultdict

# Run inference over the held-out test split.
predictions = client.predict_batch(
    model_id=model.id,
    dataset_id=dataset.id,
    version_id=version.id,
    split="test"
)

# Tally (actual, predicted) pairs into a nested counter; missing
# combinations read back as 0 thanks to the defaultdict default.
confusion = defaultdict(lambda: defaultdict(int))
for item in predictions:
    confusion[item.ground_truth][item.prediction] += 1

# Render the matrix: one column per label, one row per actual label.
labels = sorted({item.ground_truth for item in predictions})

print("\nConfusion Matrix")
print("=" * 50)
header = f"{'Actual/Pred':<15}" + "".join(f"{label:<12}" for label in labels)
print(header)

for actual in labels:
    row = f"{actual:<15}" + "".join(
        f"{confusion[actual][predicted]:<12}" for predicted in labels
    )
    print(row)

Per-Class Metrics

def calculate_per_class_metrics(predictions, labels):
    """Calculate precision, recall, F1, and support for each class.

    Args:
        predictions: iterable of records exposing `.prediction` and
            `.ground_truth` attributes.
        labels: class labels to report on. Predictions whose labels fall
            outside this list do not contribute to any count (same
            behavior as before).

    Returns:
        Dict mapping each label to a dict with keys "precision",
        "recall", "f1", and "support".
    """
    # Count in a single pass over predictions (O(P + L)) instead of the
    # previous three full scans per label (O(P * L)); counts are identical.
    tp = dict.fromkeys(labels, 0)
    fp = dict.fromkeys(labels, 0)
    fn = dict.fromkeys(labels, 0)

    for p in predictions:
        if p.prediction == p.ground_truth:
            if p.prediction in tp:
                tp[p.prediction] += 1
        else:
            if p.prediction in fp:
                fp[p.prediction] += 1
            if p.ground_truth in fn:
                fn[p.ground_truth] += 1

    metrics = {}
    for label in labels:
        # Guard every ratio so classes with no predictions/instances
        # report 0 instead of raising ZeroDivisionError.
        precision = tp[label] / (tp[label] + fp[label]) if (tp[label] + fp[label]) > 0 else 0
        recall = tp[label] / (tp[label] + fn[label]) if (tp[label] + fn[label]) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        metrics[label] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            # Support = number of ground-truth instances of this class.
            "support": tp[label] + fn[label]
        }

    return metrics

# Compute per-class metrics and print them as a fixed-width table.
metrics = calculate_per_class_metrics(predictions, labels)

print("\nPer-Class Metrics")
print("=" * 70)
print(f"{'Class':<15} {'Precision':<12} {'Recall':<12} {'F1':<12} {'Support':<10}")
print("-" * 70)

for label in labels:
    m = metrics[label]
    print(f"{label:<15} {m['precision']:<12.2%} {m['recall']:<12.2%} "
          f"{m['f1']:<12.2%} {m['support']:<10}")

# Macro averages: unweighted mean over classes, accumulated in one pass.
totals = {"precision": 0.0, "recall": 0.0, "f1": 0.0}
for m in metrics.values():
    for key in totals:
        totals[key] += m[key]

n_classes = len(metrics)
avg_precision = totals["precision"] / n_classes
avg_recall = totals["recall"] / n_classes
avg_f1 = totals["f1"] / n_classes

print("-" * 70)
print(f"{'Macro Avg':<15} {avg_precision:<12.2%} {avg_recall:<12.2%} {avg_f1:<12.2%}")

Understanding Metrics

| Metric | What It Measures | When It Matters |
|--------|------------------|-----------------|
| Accuracy | Overall correct predictions | Balanced classes |
| Precision | Of predicted positives, how many are correct | Cost of false positives high |
| Recall | Of actual positives, how many were found | Cost of false negatives high |
| F1 Score | Balance of precision and recall | Need both |

Object Detection Metrics

# For object detection models: evaluate on the test split at IoU
# thresholds 0.5 and 0.75.
det_results = client.evaluate_detection_model(
    model_id=model.id,
    dataset_id=dataset.id,
    version_id=version.id,
    split="test",
    iou_thresholds=[0.5, 0.75]
)

print("\nObject Detection Metrics")
print("=" * 50)
# NOTE(review): assumes the result dict exposes 'mAP_50', 'mAP_75',
# 'mAP' (presumably averaged over IoU 0.5:0.95, per the label below),
# and 'per_class_AP_50' — confirm against the client library docs.
print(f"mAP@0.5: {det_results['mAP_50']:.2%}")
print(f"mAP@0.75: {det_results['mAP_75']:.2%}")
print(f"mAP@0.5:0.95: {det_results['mAP']:.2%}")

print("\nPer-Class AP@0.5:")
for label, ap in det_results['per_class_AP_50'].items():
    print(f"  {label}: {ap:.2%}")

Error Analysis

Find Misclassified Examples

# Collect every prediction that disagrees with its ground-truth label.
errors = [p for p in predictions if p.prediction != p.ground_truth]

print(f"\nError Analysis")
print(f"Total errors: {len(errors)} / {len(predictions)} ({len(errors)/len(predictions):.1%})")

# Group errors by their (actual, predicted) confusion pair.
error_types = defaultdict(list)
for e in errors:
    # Separator keeps the pair readable and unambiguous — plain
    # concatenation ("catdog") merged the two labels into one token and
    # could collide for different label pairs.
    key = f"{e.ground_truth} → {e.prediction}"
    error_types[key].append(e)

# Show the ten most frequent confusion pairs.
print("\nMost Common Errors:")
for error_type, examples in sorted(error_types.items(), key=lambda x: -len(x[1]))[:10]:
    print(f"  {error_type}: {len(examples)} cases")

Analyze Low-Confidence Predictions

# Find predictions with low confidence
low_confidence = [p for p in predictions if p.confidence < 0.7]

print(f"\nLow Confidence Predictions (< 70%)")
print(f"Count: {len(low_confidence)} / {len(predictions)} ({len(low_confidence)/len(predictions):.1%})")

# Check accuracy on low-confidence predictions
low_conf_correct = sum(1 for p in low_confidence if p.prediction == p.ground_truth)
print(f"Accuracy on low-confidence: {low_conf_correct/len(low_confidence):.1%}")

# vs high confidence
high_confidence = [p for p in predictions if p.confidence >= 0.7]
high_conf_correct = sum(1 for p in high_confidence if p.prediction == p.ground_truth)
print(f"Accuracy on high-confidence: {high_conf_correct/len(high_confidence):.1%}")

Export Errors for Review

import csv

# Dump the misclassified items to CSV so they can be reviewed by hand.
with open("errors_to_review.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["item_id", "file_path", "actual", "predicted", "confidence"])
    writer.writerows(
        [e.item_id, e.file_path, e.ground_truth, e.prediction, f"{e.confidence:.2%}"]
        for e in errors
    )

print(f"Exported {len(errors)} errors to errors_to_review.csv")

Compare Models

Compare your new model against previous versions or baselines:

def compare_models(client, model_ids, dataset_id, version_id, split="test"):
    """Evaluate several models on the same split and print a ranked table.

    Args:
        client: API client providing `get_model` and `evaluate_model`.
        model_ids: ids of the models to evaluate.
        dataset_id / version_id: dataset version holding the split.
        split: which split to evaluate on (defaults to "test").

    Returns:
        One summary dict per model ({model_id, name, accuracy, loss}),
        in the same order as `model_ids`.
    """
    summaries = []
    for candidate_id in model_ids:
        candidate = client.get_model(candidate_id)
        evaluation = client.evaluate_model(
            model_id=candidate_id,
            dataset_id=dataset_id,
            version_id=version_id,
            split=split
        )
        summaries.append({
            "model_id": candidate_id,
            "name": candidate.name,
            "accuracy": evaluation['accuracy'],
            "loss": evaluation['loss']
        })

    # Print the comparison table, best accuracy first.
    print("\nModel Comparison")
    print("=" * 70)
    print(f"{'Model':<30} {'Accuracy':<15} {'Loss':<15}")
    print("-" * 70)

    ranked = sorted(summaries, key=lambda row: row['accuracy'], reverse=True)
    for r in ranked:
        print(f"{r['name']:<30} {r['accuracy']:<15.2%} {r['loss']:<15.4f}")

    return summaries

# Compare the finetuned versions against the baseline on the shared test
# split. `model_v1`, `model_v2`, and `baseline_model` are assumed to come
# from earlier training steps — TODO confirm against the preceding pages.
results = compare_models(
    client,
    model_ids=[model_v1.id, model_v2.id, baseline_model.id],
    dataset_id=dataset.id,
    version_id=version.id
)

Statistical Significance

Ensure improvements are real, not random:

import numpy as np
from scipy import stats

def bootstrap_accuracy(predictions, n_bootstrap=1000):
    """Estimate accuracy with a bootstrap 95% confidence interval.

    Args:
        predictions: sequence of records exposing `.prediction` and
            `.ground_truth` attributes.
        n_bootstrap: number of bootstrap resamples to draw.

    Returns:
        Tuple (mean_accuracy, ci_lower, ci_upper) over the resamples.
    """
    # Precompute correctness once as a float array. Resampling a numeric
    # array is far cheaper than the original np.random.choice over the
    # object list, which re-wrapped it into an object ndarray and
    # re-evaluated the comparisons on every iteration.
    correct = np.fromiter(
        (1.0 if p.prediction == p.ground_truth else 0.0 for p in predictions),
        dtype=float,
        count=len(predictions),
    )

    # Sample with replacement; each resample's accuracy is just the mean
    # of the sampled 0/1 correctness values.
    accuracies = [
        float(np.random.choice(correct, size=len(correct), replace=True).mean())
        for _ in range(n_bootstrap)
    ]

    # 95% confidence interval from the bootstrap distribution.
    ci_lower = np.percentile(accuracies, 2.5)
    ci_upper = np.percentile(accuracies, 97.5)
    mean_acc = np.mean(accuracies)

    return mean_acc, ci_lower, ci_upper

# Report the bootstrap point estimate alongside its 95% interval.
mean, ci_low, ci_high = bootstrap_accuracy(predictions)
print(f"\nAccuracy: {mean:.2%} (95% CI: {ci_low:.2%} - {ci_high:.2%})")

Production Readiness Checklist

Before deploying, verify:

ℹ️

Performance

  • Test accuracy meets requirements
  • Per-class performance is acceptable (no catastrophic failures)
  • Confidence calibration is reasonable

Robustness

  • Tested on edge cases
  • Handles out-of-distribution inputs gracefully
  • Performance consistent across data subsets

Operational

  • Inference latency acceptable
  • Model size fits deployment target
  • Monitoring and alerting planned

Generate Evaluation Report

def generate_evaluation_report(client, model_id, dataset_id, version_id):
    """Build a markdown evaluation report for a trained model.

    Fetches the model record and its test-split metrics via the client,
    then renders model info, overall and per-class performance, and a
    go/no-go recommendation (accuracy threshold 0.9).

    Returns:
        The complete report as a markdown string.
    """
    model = client.get_model(model_id)
    evaluation = client.evaluate_model(
        model_id=model_id,
        dataset_id=dataset_id,
        version_id=version_id,
        split="test"
    )

    header = f"""
# Model Evaluation Report

## Model Information
- **Model ID:** {model.id}
- **Model Name:** {model.name}
- **Base Model:** {model.config.get('base_model', 'N/A')}
- **Training Dataset:** {dataset_id}

## Overall Performance
- **Test Accuracy:** {evaluation['accuracy']:.2%}
- **Test Loss:** {evaluation['loss']:.4f}

## Per-Class Performance
| Class | Precision | Recall | F1 | Support |
|-------|-----------|--------|-----|---------|
"""

    # One markdown table row per class; tolerate a missing 'per_class' key.
    rows = "".join(
        f"| {label} | {m['precision']:.2%} | {m['recall']:.2%} | {m['f1']:.2%} | {m['support']} |\n"
        for label, m in evaluation.get('per_class', {}).items()
    )

    verdict = ('✅ Model meets accuracy threshold'
               if evaluation['accuracy'] > 0.9
               else '⚠️ Consider improving model accuracy')
    footer = f"""
## Recommendations
{verdict}
"""

    return header + rows + footer

# Render and display the full markdown report for the current model.
report = generate_evaluation_report(client, model.id, dataset.id, version.id)
print(report)

Next Step

If your model meets requirements, you’re ready for deployment. If not, proceed to Iterate & Improve to enhance performance.