Prepare Data for Finetuning
Quality data is the foundation of a good model. This guide covers how to prepare, structure, and upload your training data.
Data Requirements
Quantity Guidelines
| Task Type | Minimum | Recommended | Ideal |
|---|---|---|---|
| Image Classification | 50/class | 200-500/class | 1000+/class |
| Object Detection | 100 images | 300-500 images | 1000+ images |
| Text Classification | 100/class | 500/class | 2000+/class |
| NER | 200 sentences | 1000 sentences | 5000+ sentences |
Quality Checklist
Before you upload, confirm your data meets all of the following:
- Representative: Data reflects real-world distribution
- Balanced: Similar number of examples per class (or weighted training)
- Clean: No mislabeled or corrupt data
- Varied: Captures different conditions, angles, variations
- Split properly: Train/validation/test sets don’t leak
Create a Dataset
Define Labels
Create the categories your model will predict.
Create Splits
Split your data into training, validation, and optionally test sets.
| Split | Purpose | Typical Size |
|---|---|---|
| train | Model learns from this data | 70-80% |
| validation | Tune hyperparameters, monitor overfitting | 10-15% |
| test | Final evaluation (never used during training) | 10-15% |
# Create splits
train_split = client.create_split(
version_id=version.id,
name="train",
description="Training data"
)
val_split = client.create_split(
version_id=version.id,
name="validation",
description="Validation data for hyperparameter tuning"
)
test_split = client.create_split(
version_id=version.id,
name="test",
description="Held-out test set for final evaluation"
)Upload Data
Image Data
import os
import random
from seeme.types import Annotation
def upload_image_dataset(client, dataset_id, version_id, data_dir, train_split_id, val_split_id, labels, val_ratio=0.2):
"""
Upload images organized in label folders:
data_dir/
good/
img1.jpg
img2.jpg
scratch/
img3.jpg
...
"""
for label_name in os.listdir(data_dir):
label_dir = os.path.join(data_dir, label_name)
if not os.path.isdir(label_dir):
continue
images = [f for f in os.listdir(label_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
random.shuffle(images)
# Split into train/val
val_count = int(len(images) * val_ratio)
val_images = images[:val_count]
train_images = images[val_count:]
print(f"Label '{label_name}': {len(train_images)} train, {len(val_images)} val")
# Upload training images
for img_name in train_images:
img_path = os.path.join(label_dir, img_name)
item = client.create_dataset_item(
version_id=version_id,
split_id=train_split_id,
file_path=img_path
)
client.annotate(
dataset_id=dataset_id,
dataset_version_id=version_id,
annotation=Annotation(
label_id=labels[label_name].id,
split_id=train_split_id,
item_id=item.id
)
)
# Upload validation images
for img_name in val_images:
img_path = os.path.join(label_dir, img_name)
item = client.create_dataset_item(
version_id=version_id,
split_id=val_split_id,
file_path=img_path
)
client.annotate(
dataset_id=dataset_id,
dataset_version_id=version_id,
annotation=Annotation(
label_id=labels[label_name].id,
split_id=val_split_id,
item_id=item.id
)
)
# Upload your data
upload_image_dataset(
client=client,
dataset_id=dataset.id,
version_id=version.id,
data_dir="./data/product_images",
train_split_id=train_split.id,
val_split_id=val_split.id,
labels=labels,
val_ratio=0.2
)Text Data
import csv
from seeme.types import Annotation
def upload_text_dataset(client, dataset_id, version_id, csv_path, train_split_id, val_split_id, text_col, label_col, labels, val_ratio=0.2):
"""
Upload text data from CSV:
text,label
"Great product!",positive
"Terrible experience",negative
"""
with open(csv_path, 'r') as f:
reader = csv.DictReader(f)
rows = list(reader)
random.shuffle(rows)
val_count = int(len(rows) * val_ratio)
val_rows = rows[:val_count]
train_rows = rows[val_count:]
print(f"Total: {len(train_rows)} train, {len(val_rows)} val")
# Upload training data
for row in train_rows:
item = client.create_dataset_item(
version_id=version_id,
split_id=train_split_id,
text=row[text_col]
)
client.annotate(
dataset_id=dataset_id,
dataset_version_id=version_id,
annotation=Annotation(
label_id=labels[row[label_col]].id,
split_id=train_split_id,
item_id=item.id
)
)
# Upload validation data
for row in val_rows:
item = client.create_dataset_item(
version_id=version_id,
split_id=val_split_id,
text=row[text_col]
)
client.annotate(
dataset_id=dataset_id,
dataset_version_id=version_id,
annotation=Annotation(
label_id=labels[row[label_col]].id,
split_id=val_split_id,
item_id=item.id
)
)
upload_text_dataset(
client=client,
dataset_id=dataset.id,
version_id=version.id,
csv_path="./data/reviews.csv",
train_split_id=train_split.id,
val_split_id=val_split.id,
text_col="review_text",
label_col="sentiment",
labels=labels
)Object Detection Data (Bounding Boxes)
import json
from seeme.types import Annotation
def upload_detection_dataset(client, dataset_id, version_id, images_dir, annotations_file, train_split_id, val_split_id, labels, val_ratio=0.2):
"""
Upload object detection data. Annotations in COCO format:
{
"images": [{"id": 1, "file_name": "img1.jpg"}],
"annotations": [{"image_id": 1, "category_id": 1, "bbox": [x, y, w, h]}],
"categories": [{"id": 1, "name": "cat"}]
}
"""
with open(annotations_file, 'r') as f:
coco = json.load(f)
# Build lookup tables
categories = {c['id']: c['name'] for c in coco['categories']}
image_annotations = {}
for ann in coco['annotations']:
img_id = ann['image_id']
if img_id not in image_annotations:
image_annotations[img_id] = []
image_annotations[img_id].append(ann)
# Shuffle images
images = coco['images']
random.shuffle(images)
val_count = int(len(images) * val_ratio)
for i, img_info in enumerate(images):
split_id = val_split_id if i < val_count else train_split_id
img_path = os.path.join(images_dir, img_info['file_name'])
# Upload image
item = client.create_dataset_item(
version_id=version_id,
split_id=split_id,
file_path=img_path
)
# Add bounding box annotations
for ann in image_annotations.get(img_info['id'], []):
x, y, w, h = ann['bbox']
# Convert to normalized coordinates (0-1)
img_w = img_info.get('width', 1)
img_h = img_info.get('height', 1)
client.annotate(
dataset_id=dataset_id,
dataset_version_id=version_id,
annotation=Annotation(
label_id=labels[categories[ann['category_id']]].id,
split_id=split_id,
item_id=item.id,
bbox={
"x": x / img_w,
"y": y / img_h,
"width": w / img_w,
"height": h / img_h
}
)
)
print(f"Uploaded {len(images)} images with bounding boxes")Verify Your Dataset
After uploading, verify the data:
# Get dataset statistics
stats = client.get_dataset_stats(version_id=version.id)
print("Dataset Statistics:")
print(f" Total items: {stats['total_items']}")
print(f" Training items: {stats['splits']['train']}")
print(f" Validation items: {stats['splits']['validation']}")
print("\nLabel Distribution:")
for label, count in stats['label_counts'].items():
print(f" {label}: {count}")
# Check for class imbalance
max_count = max(stats['label_counts'].values())
min_count = min(stats['label_counts'].values())
imbalance_ratio = max_count / min_count
if imbalance_ratio > 5:
print(f"\n⚠️ Warning: Class imbalance detected (ratio: {imbalance_ratio:.1f}x)")
print(" Consider: oversampling, weighted loss, or collecting more data")Handle Class Imbalance
If some classes have far fewer examples than others:
Option 1: Weighted Training
# Configure weighted loss in training
job = client.create_job(
dataset_id=dataset.id,
version_id=version.id,
name="Weighted Training",
config={
"base_model": "efficientnet_b0",
"epochs": 20,
"class_weights": "balanced" # Auto-compute weights from class frequencies
}
)Option 2: Oversampling
# Manually oversample minority classes during upload
# or use data augmentation more heavily on minority classes
job = client.create_job(
dataset_id=dataset.id,
version_id=version.id,
name="With Oversampling",
config={
"base_model": "efficientnet_b0",
"epochs": 20,
"oversampling": True,
"oversampling_strategy": "minority" # Oversample minority classes
}
)Option 3: Collect More Data
Use Automated Labeling to quickly label more examples for underrepresented classes.
Data Versioning
Create a new version when you modify your dataset:
# Create new version (copies structure, not data)
version_v2 = client.create_dataset_version(
dataset_id=dataset.id,
name="v2",
description="Added 200 more scratch examples",
copy_from_version_id=version.id # Optionally copy data from v1
)
# Add new data to v2
# ... upload additional items ...
# Both versions are preserved - you can train on eitherBest Practices
- Inspect samples - Look at 50-100 random items to catch labeling errors
- Check edge cases - Ensure ambiguous examples are consistently labeled
- No data leakage - Never put augmented versions of the same image in both train and val
- Document your criteria - Write down labeling guidelines for consistency
- Version everything - Create new dataset versions instead of modifying in place
- Stratified splits - Ensure each split has proportional class representation
Next Step
With your data ready, proceed to Choose Base Model to select the right pre-trained model for finetuning.