Deploy Student Model

Deploy your distilled student model to production.

Optimize for Production

Before deployment, optimize the model for inference:
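The exact optimization call depends on your platform, but the INT8 quantization referenced below comes down to rescaling weights into 8-bit integers. As an illustration, here is a minimal NumPy sketch of symmetric per-tensor weight quantization (the function names are illustrative, not a platform API):

```python
import numpy as np

def quantize_int8(weights: np.ndarray):
    """Symmetric per-tensor INT8 quantization: w ≈ scale * q."""
    scale = np.abs(weights).max() / 127.0
    q = np.clip(np.round(weights / scale), -127, 127).astype(np.int8)
    return q, scale

def dequantize(q: np.ndarray, scale: float) -> np.ndarray:
    return q.astype(np.float32) * scale

w = np.random.randn(256, 256).astype(np.float32)
q, scale = quantize_int8(w)
w_hat = dequantize(q, scale)

print(f"Storage: {w.nbytes} -> {q.nbytes} bytes (4x smaller)")
print(f"Max round-trip error: {np.abs(w - w_hat).max():.4f}")
```

The 4x storage reduction is where most of the deployment savings come from; the round-trip error is what the accuracy check in the next section is guarding against.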

Verify Optimized Model

Ensure optimization didn’t hurt accuracy:

# Evaluate optimized model on test set
optimized_results = client.evaluate_model(
    model_id=optimized_model.id,
    dataset_id=test_dataset.id,
    version_id=test_version.id,
    split="test"
)

print("\nPost-Optimization Accuracy Check")
print("-" * 50)
print(f"Original student: {student_results['accuracy']:.2%}")
print(f"Optimized student: {optimized_results['accuracy']:.2%}")
print(f"Accuracy loss: {student_results['accuracy'] - optimized_results['accuracy']:.2%}")

if student_results['accuracy'] - optimized_results['accuracy'] > 0.01:
    print("⚠️  Warning: Optimization caused >1% accuracy loss")
    print("   Consider: FP16 quantization instead of INT8")

Deploy to Cloud API
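Deployment parameters vary by platform; as a sketch, the settings you would typically pass to a deploy call look like this (all field names here are illustrative, not a documented API):

```python
# Hypothetical deployment settings — map these onto your platform's deploy call.
deployment_config = {
    "model_id": "optimized-student-v1",  # placeholder identifier
    "instance_type": "cpu-small",        # an INT8 student usually fits on CPU
    "min_replicas": 1,
    "max_replicas": 4,                   # scale out under request load
    "max_error_rate": 0.05,              # roll back if >5% of requests fail
}

print("Deploying:", deployment_config["model_id"])
```

Note the CPU instance type: a key payoff of distillation is that the student no longer needs the GPU the teacher required.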

Use the Deployed Model
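Assuming the deployed endpoint accepts a JSON body with a base64-encoded input (a common pattern, not a documented contract for this platform), a request body can be built like this:

```python
import base64
import json

def build_request(file_bytes: bytes, model_version: str = "student-v1") -> str:
    """Build a JSON request body for a hypothetical prediction endpoint."""
    return json.dumps({
        "model_version": model_version,
        "input": base64.b64encode(file_bytes).decode("ascii"),
    })

# In practice: file_bytes = open(path, "rb").read()
body = build_request(b"example-bytes")
print(body)
```

Pinning `model_version` in each request makes later rollbacks and A/B comparisons traceable in your logs.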

Export for Edge/Mobile

For on-device deployment:
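A quick sanity check before exporting is whether the student fits the device's storage and memory budget. Serialized size can be estimated from parameter count and bit width (a rough rule of thumb that ignores container overhead; the helper is illustrative):

```python
def model_size_mb(n_params: int, bits_per_param: int) -> float:
    """Approximate serialized model size, ignoring format overhead."""
    return n_params * bits_per_param / 8 / 1e6

# Example: a 5M-parameter student at different precisions
for bits, name in [(32, "FP32"), (16, "FP16"), (8, "INT8")]:
    print(f"{name}: ~{model_size_mb(5_000_000, bits):.1f} MB")
```

At INT8, a 5M-parameter student is roughly 5 MB, comfortably within most mobile app size budgets.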

Retire the Teacher

Once the student is deployed and validated in production:

# Disable teacher post-processor (stop paying for teacher inference)
client.update_post_processor(
    processor_id=teacher_processor.id,
    enabled=False
)

print("Teacher post-processor disabled")

# Optionally: Update to use student for future labeling
# (active learning - student labels, humans review, retrain)
student_processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Student Auto-Labeler",
    model_type="classification",
    model_id=optimized_model.id,
    output_target="annotations",
    confidence_threshold=0.9,  # High threshold for auto-labeling
    enabled=True
)

Monitor Production Performance

Set up monitoring to catch accuracy degradation:

# Log predictions for monitoring
from datetime import datetime

def predict_with_logging(client, model_id, file_path):
    result = client.predict(model_id=model_id, item=file_path)

    # Log for monitoring
    client.log_prediction(
        model_id=model_id,
        prediction=result.prediction,
        confidence=result.confidence,
        metadata={
            "file": file_path,
            "timestamp": datetime.now().isoformat()
        }
    )

    return result

# Set up alerts for low confidence predictions
alert_config = {
    "low_confidence_threshold": 0.7,
    "low_confidence_alert_pct": 0.1,  # Alert if >10% predictions are low confidence
    "drift_detection": True
}

client.configure_model_monitoring(
    model_id=optimized_model.id,
    config=alert_config
)

Continuous Improvement

Keep improving the student over time:

graph LR
    A[Student in Production] --> B[Collect Low-Confidence Predictions]
    B --> C[Human Review]
    C --> D[Add to Training Data]
    D --> E[Retrain Student]
    E --> F[Evaluate]
    F --> G{Better?}
    G -->|Yes| H[Deploy New Student]
    G -->|No| A
    H --> A

# Collect predictions for review
low_conf_predictions = client.get_predictions(
    model_id=optimized_model.id,
    max_confidence=0.7,
    min_date="2024-01-01",
    limit=500
)

print(f"Found {len(low_conf_predictions)} low-confidence predictions to review")

# Add to training dataset for review
for pred in low_conf_predictions:
    client.create_dataset_item(
        version_id=review_version.id,
        split_id=review_split.id,
        file_path=pred.input_path,
        metadata={
            "source": "production_low_confidence",
            "original_prediction": pred.prediction,
            "original_confidence": pred.confidence
        }
    )

# After human review → retrain → evaluate → deploy if better

Deployment Checklist

Before deploying:

  • Optimized model (ONNX, quantized)
  • Verified accuracy after optimization
  • Tested inference latency
  • Set up monitoring and alerts
  • Documented model version and training data
  • Rollback plan in place

After deploying:

  • Verify predictions in production
  • Monitor confidence distribution
  • Track latency and throughput
  • Set up feedback collection
  • Schedule periodic retraining

Summary

You’ve completed the full distillation pipeline:

  1. ✅ Set up a teacher model
  2. ✅ Labeled data with the teacher
  3. ✅ Trained a small student model
  4. ✅ Evaluated and compared both models
  5. ✅ Deployed the optimized student

Results:

  • Smaller model (10-50x)
  • Faster inference (10-100x)
  • Lower cost (10-1000x)
  • Similar accuracy (within 2-5%)

Related Guides