Deploy Your Object Detection Model
Deploy your trained model to production—cloud, on-premise, or edge.
Deployment Options
| Option | Best For | Latency | Setup |
|---|---|---|---|
| Cloud API | Quick start, scaling | ~100-500ms | Instant |
| On-Premise | Data sovereignty, low latency | ~10-50ms | Docker/K8s |
| Edge/Mobile | Offline, real-time | ~5-30ms | Model export |
Cloud API Deployment
On-Premise Deployment
Docker Deployment
# Pull the inference image
docker pull seeme/inference:latest
# Run with GPU support
docker run -d \
--gpus all \
-p 8080:8080 \
-e SEEME_API_KEY=your-key \
-e MODEL_ID=your-model-id \
seeme/inference:latest
# Test the endpoint
curl -X POST http://localhost:8080/predict \
-F "file=@test.jpg"
Kubernetes Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
name: vehicle-detection
spec:
replicas: 3
selector:
matchLabels:
app: vehicle-detection
template:
metadata:
labels:
app: vehicle-detection
spec:
containers:
- name: inference
image: seeme/inference:latest
ports:
- containerPort: 8080
env:
- name: SEEME_API_KEY
valueFrom:
secretKeyRef:
name: seeme-secrets
key: api-key
- name: MODEL_ID
value: "your-model-id"
resources:
limits:
nvidia.com/gpu: 1
---
apiVersion: v1
kind: Service
metadata:
name: vehicle-detection
spec:
selector:
app: vehicle-detection
ports:
- port: 80
targetPort: 8080
type: LoadBalancer
Edge Deployment
Export Model
# Export to ONNX
export = client.export_model(
model_id=model.id,
format="onnx",
config={
"opset_version": 13,
"image_size": 640
}
)
# Download exported model
client.download_file(export.file_id, "./model.onnx")
# Export to TensorRT (NVIDIA)
export_trt = client.export_model(
model_id=model.id,
format="tensorrt",
config={
"precision": "fp16",
"batch_size": 1
}
)
Run on Edge Device
# Using ONNX Runtime
import onnxruntime as ort
import numpy as np
from PIL import Image
# Load model
session = ort.InferenceSession("model.onnx")
# Preprocess image
img = Image.open("test.jpg").resize((640, 640))
input_data = np.array(img).transpose(2, 0, 1)[np.newaxis] / 255.0
# Run inference
outputs = session.run(None, {"images": input_data.astype(np.float32)})
# Post-process detections
detections = postprocess_yolo(outputs, conf_threshold=0.5, iou_threshold=0.45)
Mobile Deployment
iOS (CoreML)
# Export to CoreML
export = client.export_model(
model_id=model.id,
format="coreml",
config={"include_nms": True}
)
// Swift inference
import CoreML
import Vision
let model = try VNCoreMLModel(for: VehicleDetection().model)
let request = VNCoreMLRequest(model: model) { request, error in
guard let results = request.results as? [VNRecognizedObjectObservation] else { return }
for detection in results {
print("Found \(detection.labels.first?.identifier ?? "unknown")")
print(" Confidence: \(detection.confidence)")
print(" Bounds: \(detection.boundingBox)")
}
}
Android (TensorFlow Lite)
# Export to TFLite
export = client.export_model(
model_id=model.id,
format="tflite",
config={
"quantization": "float16", # or "int8" for smaller size
"include_metadata": True
}
)
// Kotlin inference
val model = ObjectDetector.createFromFile(context, "model.tflite")
val image = TensorImage.fromBitmap(bitmap)
val results = model.detect(image)
for (detection in results) {
Log.d("Detection", "${detection.categories[0].label}: ${detection.categories[0].score}")
}
Batch Processing
Process multiple images efficiently:
# Batch prediction
results = client.predict_batch(
model_id=model.id,
file_paths=["img1.jpg", "img2.jpg", "img3.jpg"],
config={
"batch_size": 8,
"confidence_threshold": 0.5,
"nms_threshold": 0.45
}
)
for path, detections in results.items():
print(f"\n{path}:")
for det in detections:
print(f" {det.label}: {det.confidence:.2%}")
Configuration Options
Inference Parameters
| Parameter | Description | Default |
|---|---|---|
| confidence_threshold | Minimum confidence | 0.5 |
| nms_threshold | Non-max suppression IoU | 0.45 |
| max_detections | Maximum detections per image | 100 |
| classes | Filter to specific classes | All |
# Configured inference
results = client.predict(
model_id=model.id,
item="./test.jpg",
config={
"confidence_threshold": 0.7,
"nms_threshold": 0.4,
"max_detections": 50,
"classes": ["car", "truck"] # Only detect these
}
)
Best Practices
- Start with Cloud API - Validate before on-prem investment
- Use GPU for YOLO - CPU inference is much slower
- Batch when possible - Higher throughput than single images
- Set appropriate thresholds - Balance precision/recall for your use case
- Monitor latency - Track p50, p95, p99 response times