Deploy Your Object Detection Model
Deploy your trained model to production—cloud, on-premise, or edge.
Deployment Options
| Option | Best For | Latency | Setup |
|---|---|---|---|
| Cloud API | Quick start, scaling | ~100-500ms | Instant |
| On-Premise | Data sovereignty, low latency | ~10-50ms | Docker/K8s |
| Edge/Mobile | Offline, real-time | ~5-30ms | Model export |
Cloud API Deployment
On-Premise Deployment
Docker Deployment
# Pull the inference image
docker pull seeme/inference:latest
# Run with GPU support
docker run -d \
--gpus all \
-p 8080:8080 \
-e SEEME_API_KEY=your-key \
-e MODEL_ID=your-model-id \
seeme/inference:latest
# Test the endpoint
curl -X POST http://localhost:8080/predict \
-F "file=@test.jpg"
Kubernetes Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
name: vehicle-detection
spec:
replicas: 3
selector:
matchLabels:
app: vehicle-detection
template:
metadata:
labels:
app: vehicle-detection
spec:
containers:
- name: inference
image: seeme/inference:latest
ports:
- containerPort: 8080
env:
- name: SEEME_API_KEY
valueFrom:
secretKeyRef:
name: seeme-secrets
key: api-key
- name: MODEL_ID
value: "your-model-id"
resources:
limits:
nvidia.com/gpu: 1
---
apiVersion: v1
kind: Service
metadata:
name: vehicle-detection
spec:
selector:
app: vehicle-detection
ports:
- port: 80
targetPort: 8080
type: LoadBalancer
Edge Deployment
Export Model
# Export to ONNX
export = client.export_model(
model_id=model.id,
format="onnx",
config={
"opset_version": 13,
"image_size": 640
}
)
# Download exported model
client.download_file(export.file_id, "./model.onnx")
# Export to TensorRT (NVIDIA)
export_trt = client.export_model(
model_id=model.id,
format="tensorrt",
config={
"precision": "fp16",
"batch_size": 1
}
)
Run on Edge Device
# Using ONNX Runtime
import onnxruntime as ort
import numpy as np
from PIL import Image
# Load model
session = ort.InferenceSession("model.onnx")
# Preprocess image
img = Image.open("test.jpg").resize((640, 640))
input_data = np.array(img).transpose(2, 0, 1)[np.newaxis] / 255.0
# Run inference
outputs = session.run(None, {"images": input_data.astype(np.float32)})
# Post-process detections
detections = postprocess_yolo(outputs, conf_threshold=0.5, iou_threshold=0.45)
Mobile Deployment
iOS (CoreML)
# Export to CoreML
export = client.export_model(
model_id=model.id,
format="coreml",
config={"include_nms": True}
)
// Swift inference
import CoreML
import Vision
let model = try VNCoreMLModel(for: VehicleDetection().model)
let request = VNCoreMLRequest(model: model) { request, error in
guard let results = request.results as? [VNRecognizedObjectObservation] else { return }
for detection in results {
print("Found \(detection.labels.first?.identifier ?? "unknown")")
print(" Confidence: \(detection.confidence)")
print(" Bounds: \(detection.boundingBox)")
}
}
Android (TensorFlow Lite)
# Export to TFLite
export = client.export_model(
model_id=model.id,
format="tflite",
config={
"quantization": "float16", # or "int8" for smaller size
"include_metadata": True
}
)
// Kotlin inference
val model = ObjectDetector.createFromFile(context, "model.tflite")
val image = TensorImage.fromBitmap(bitmap)
val results = model.detect(image)
for (detection in results) {
Log.d("Detection", "${detection.categories[0].label}: ${detection.categories[0].score}")
}
Batch Processing
Process multiple images efficiently:
# Batch prediction
results = client.predict_batch(
model_id=model.id,
file_paths=["img1.jpg", "img2.jpg", "img3.jpg"],
config={
"batch_size": 8,
"confidence_threshold": 0.5,
"nms_threshold": 0.45
}
)
for path, detections in results.items():
print(f"\n{path}:")
for det in detections:
print(f" {det.label}: {det.confidence:.2%}")
Configuration Options
Inference Parameters
| Parameter | Description | Default |
|---|---|---|
| confidence_threshold | Minimum confidence | 0.5 |
| nms_threshold | Non-max suppression IoU | 0.45 |
| max_detections | Maximum detections per image | 100 |
| classes | Filter to specific classes | All |
# Configured inference
results = client.predict(
model_id=model.id,
item="./test.jpg",
config={
"confidence_threshold": 0.7,
"nms_threshold": 0.4,
"max_detections": 50,
"classes": ["car", "truck"] # Only detect these
}
)
Best Practices
- Start with Cloud API - Validate before on-prem investment
- Use GPU for YOLO - CPU inference is much slower
- Batch when possible - Higher throughput than single images
- Set appropriate thresholds - Balance precision/recall for your use case
- Monitor latency - Track p50, p95, p99 response times