Deploy Your Object Detection Model

Deploy Your Object Detection Model

Deploy your trained model to production—cloud, on-premise, or edge.

Deployment Options

| Option | Best For | Latency | Setup |
|---|---|---|---|
| Cloud API | Quick start, scaling | ~100-500ms | Instant |
| On-Premise | Data sovereignty, low latency | ~10-50ms | Docker/K8s |
| Edge/Mobile | Offline, real-time | ~5-30ms | Model export |

Cloud API Deployment

On-Premise Deployment

Docker Deployment

# Pull the inference image
docker pull seeme/inference:latest

# Run with GPU support
# NOTE: --gpus requires the NVIDIA Container Toolkit to be installed on the host.
docker run -d \
  --gpus all \
  -p 8080:8080 \
  -e SEEME_API_KEY=your-key \
  -e MODEL_ID=your-model-id \
  seeme/inference:latest

# Test the endpoint
curl -X POST http://localhost:8080/predict \
  -F "file=@test.jpg"

Kubernetes Deployment

apiVersion: apps/v1
kind: Deployment
metadata:
  name: vehicle-detection
spec:
  replicas: 3  # three inference pods for availability and throughput
  selector:
    matchLabels:
      app: vehicle-detection
  template:
    metadata:
      labels:
        app: vehicle-detection
    spec:
      containers:
      - name: inference
        image: seeme/inference:latest
        ports:
        - containerPort: 8080
        env:
        # API key comes from a Secret so it is never committed to the manifest;
        # create it first: kubectl create secret generic seeme-secrets --from-literal=api-key=...
        - name: SEEME_API_KEY
          valueFrom:
            secretKeyRef:
              name: seeme-secrets
              key: api-key
        - name: MODEL_ID
          value: "your-model-id"
        resources:
          limits:
            nvidia.com/gpu: 1  # requires the NVIDIA device plugin on the cluster
---
# Service exposing the deployment through an external load balancer.
apiVersion: v1
kind: Service
metadata:
  name: vehicle-detection
spec:
  selector:
    app: vehicle-detection
  ports:
  - port: 80          # port exposed by the load balancer
    targetPort: 8080  # containerPort above
  type: LoadBalancer

Edge Deployment

Export Model

# Export to ONNX
export = client.export_model(
    model_id=model.id,
    format="onnx",
    config={
        "opset_version": 13,  # ONNX opset the graph is exported against
        "image_size": 640     # fixed input resolution; must match inference preprocessing
    }
)

# Download exported model
client.download_file(export.file_id, "./model.onnx")

# Export to TensorRT (NVIDIA)
export_trt = client.export_model(
    model_id=model.id,
    format="tensorrt",
    config={
        "precision": "fp16",  # half precision: faster and smaller, slight accuracy cost
        "batch_size": 1       # engine is built for a fixed batch size
    }
)

Run on Edge Device

# Using ONNX Runtime
import onnxruntime as ort
import numpy as np
from PIL import Image

# Load model
session = ort.InferenceSession("model.onnx")

# Preprocess image: force 3-channel RGB first — PNGs and camera frames may be
# RGBA or grayscale, which would make the HWC -> CHW transpose below produce a
# 4- or 1-channel tensor and fail at inference. Then resize to the export
# resolution, scale to [0, 1], and add a batch dimension.
img = Image.open("test.jpg").convert("RGB").resize((640, 640))
input_data = np.array(img).transpose(2, 0, 1)[np.newaxis] / 255.0

# Run inference ("images" must match the exported graph's input name)
outputs = session.run(None, {"images": input_data.astype(np.float32)})

# Post-process detections: decode boxes, filter by confidence, apply NMS
detections = postprocess_yolo(outputs, conf_threshold=0.5, iou_threshold=0.45)

Mobile Deployment

iOS (CoreML)

# Export to CoreML
export = client.export_model(
    model_id=model.id,
    format="coreml",
    config={"include_nms": True}  # bake NMS into the model so Vision returns final boxes
)
// Swift inference
import CoreML
import Vision

// Wrap the generated Core ML model for use with the Vision framework.
let model = try VNCoreMLModel(for: VehicleDetection().model)

// The completion handler receives object observations because NMS was
// included at export time (include_nms: true).
let request = VNCoreMLRequest(model: model) { request, error in
    guard let results = request.results as? [VNRecognizedObjectObservation] else { return }

    for detection in results {
        print("Found \(detection.labels.first?.identifier ?? "unknown")")
        print("  Confidence: \(detection.confidence)")
        // boundingBox is in normalized coordinates (origin bottom-left).
        print("  Bounds: \(detection.boundingBox)")
    }
}

// The request does nothing until it is performed against an image.
// `cgImage` is your input frame (e.g. from the camera or a photo).
let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
try handler.perform([request])

Android (TensorFlow Lite)

# Export to TFLite
export = client.export_model(
    model_id=model.id,
    format="tflite",
    config={
        "quantization": "float16",  # or "int8" for smaller size
        "include_metadata": True    # embeds labels/input info so the Task Library can load it
    }
)
// Kotlin inference
val model = ObjectDetector.createFromFile(context, "model.tflite")

val image = TensorImage.fromBitmap(bitmap)
val results = model.detect(image)

for (detection in results) {
    // NOTE(review): assumes every detection carries at least one category;
    // prefer categories.firstOrNull() if the model can emit empty category lists.
    Log.d("Detection", "${detection.categories[0].label}: ${detection.categories[0].score}")
}

Batch Processing

Process multiple images efficiently:

# Batch prediction
results = client.predict_batch(
    model_id=model.id,
    file_paths=["img1.jpg", "img2.jpg", "img3.jpg"],
    config={
        "batch_size": 8,  # images per forward pass
        "confidence_threshold": 0.5,
        "nms_threshold": 0.45
    }
)

# results maps each input path to its list of detections
for path, detections in results.items():
    print(f"\n{path}:")
    for det in detections:
        print(f"  {det.label}: {det.confidence:.2%}")

Configuration Options

Inference Parameters

| Parameter | Description | Default |
|---|---|---|
| confidence_threshold | Minimum confidence | 0.5 |
| nms_threshold | Non-max suppression IoU | 0.45 |
| max_detections | Maximum detections per image | 100 |
| classes | Filter to specific classes | All |
# Configured inference
results = client.predict(
    model_id=model.id,
    item="./test.jpg",
    config={
        "confidence_threshold": 0.7,  # stricter than the 0.5 default: fewer false positives
        "nms_threshold": 0.4,
        "max_detections": 50,
        "classes": ["car", "truck"]  # Only detect these
    }
)

Best Practices

  1. Start with Cloud API - Validate before on-prem investment
  2. Use GPU for YOLO - CPU inference is much slower
  3. Batch when possible - Higher throughput than single images
  4. Set appropriate thresholds - Balance precision/recall for your use case
  5. Monitor latency - Track p50, p95, p99 response times

Next Step

5. Monitor →