Prepare Data for Finetuning

Quality data is the foundation of a good model. This guide covers how to prepare, structure, and upload your training data.

Data Requirements

Quantity Guidelines

| Task Type            | Minimum       | Recommended     | Ideal            |
|----------------------|---------------|-----------------|------------------|
| Image Classification | 50/class      | 200-500/class   | 1000+/class      |
| Object Detection     | 100 images    | 300-500 images  | 1000+ images     |
| Text Classification  | 100/class     | 500/class       | 2000+/class      |
| NER                  | 200 sentences | 1000 sentences  | 5000+ sentences  |

Quality Checklist

ℹ️
  • Representative: Data reflects real-world distribution
  • Balanced: Similar number of examples per class (or weighted training)
  • Clean: No mislabeled or corrupt data
  • Varied: Captures different conditions, angles, variations
  • Split properly: Train/validation/test sets don’t leak

Create a Dataset

Define Labels

Create the categories your model will predict.

Create Splits

Split your data into training, validation, and optionally test sets.

| Split      | Purpose                                       | Typical Size |
|------------|-----------------------------------------------|--------------|
| train      | Model learns from this data                   | 70-80%       |
| validation | Tune hyperparameters, monitor overfitting     | 10-15%       |
| test       | Final evaluation (never used during training) | 10-15%       |
# Create the train / validation / test splits on the dataset version.
# One create_split call per split, driven by a (name, description) table.
_split_specs = [
    ("train", "Training data"),
    ("validation", "Validation data for hyperparameter tuning"),
    ("test", "Held-out test set for final evaluation"),
]
train_split, val_split, test_split = (
    client.create_split(
        version_id=version.id,
        name=split_name,
        description=purpose
    )
    for split_name, purpose in _split_specs
)

Upload Data

Image Data

import os
import random
from seeme.types import Annotation

def upload_image_dataset(client, dataset_id, version_id, data_dir, train_split_id, val_split_id, labels, val_ratio=0.2):
    """
    Upload images organized in label folders:
    data_dir/
      good/
        img1.jpg
        img2.jpg
      scratch/
        img3.jpg
      ...

    Args:
        client: SDK client used to create dataset items and annotations.
        dataset_id: Target dataset id.
        version_id: Dataset version to upload into.
        data_dir: Root directory; each subdirectory name is a label name.
        train_split_id: Split id that receives the training portion.
        val_split_id: Split id that receives the validation portion.
        labels: Mapping of label name -> label object (must expose ``.id``).
        val_ratio: Fraction of each label's images held out for validation.
    """

    def _upload_split(label_dir, image_names, split_id, label_id):
        # Upload each file, then attach its classification label to the item.
        for img_name in image_names:
            item = client.create_dataset_item(
                version_id=version_id,
                split_id=split_id,
                file_path=os.path.join(label_dir, img_name)
            )
            client.annotate(
                dataset_id=dataset_id,
                dataset_version_id=version_id,
                annotation=Annotation(
                    label_id=label_id,
                    split_id=split_id,
                    item_id=item.id
                )
            )

    for label_name in os.listdir(data_dir):
        label_dir = os.path.join(data_dir, label_name)
        if not os.path.isdir(label_dir):
            continue  # skip stray files sitting next to the label folders

        images = [f for f in os.listdir(label_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
        random.shuffle(images)  # randomize before splitting so the split is unbiased

        # Splitting per label keeps class proportions identical in train and val.
        val_count = int(len(images) * val_ratio)
        val_images = images[:val_count]
        train_images = images[val_count:]

        print(f"Label '{label_name}': {len(train_images)} train, {len(val_images)} val")

        _upload_split(label_dir, train_images, train_split_id, labels[label_name].id)
        _upload_split(label_dir, val_images, val_split_id, labels[label_name].id)

# Upload your data.
# Expects ./data/product_images/<label>/<image files> on disk; with
# val_ratio=0.2, 20% of each label's images go to the validation split.
upload_image_dataset(
    client=client,
    dataset_id=dataset.id,
    version_id=version.id,
    data_dir="./data/product_images",
    train_split_id=train_split.id,
    val_split_id=val_split.id,
    labels=labels,
    val_ratio=0.2
)

Text Data

import csv
from seeme.types import Annotation

def upload_text_dataset(client, dataset_id, version_id, csv_path, train_split_id, val_split_id, text_col, label_col, labels, val_ratio=0.2):
    """
    Upload text data from CSV:
    text,label
    "Great product!",positive
    "Terrible experience",negative

    Args:
        client: SDK client used to create dataset items and annotations.
        dataset_id: Target dataset id.
        version_id: Dataset version to upload into.
        csv_path: Path to the CSV file (header row required).
        train_split_id: Split id that receives the training portion.
        val_split_id: Split id that receives the validation portion.
        text_col: Name of the CSV column holding the text.
        label_col: Name of the CSV column holding the label name.
        labels: Mapping of label name -> label object (must expose ``.id``).
        val_ratio: Fraction of rows held out for validation.
    """
    # newline='' is required by the csv module so quoted fields containing
    # newlines are parsed correctly.
    with open(csv_path, 'r', newline='') as f:
        rows = list(csv.DictReader(f))

    random.shuffle(rows)  # randomize before splitting so the split is unbiased
    val_count = int(len(rows) * val_ratio)
    val_rows = rows[:val_count]
    train_rows = rows[val_count:]

    print(f"Total: {len(train_rows)} train, {len(val_rows)} val")

    def _upload_split(batch, split_id):
        # Create a text item per row, then attach its label to the item.
        for row in batch:
            item = client.create_dataset_item(
                version_id=version_id,
                split_id=split_id,
                text=row[text_col]
            )
            client.annotate(
                dataset_id=dataset_id,
                dataset_version_id=version_id,
                annotation=Annotation(
                    label_id=labels[row[label_col]].id,
                    split_id=split_id,
                    item_id=item.id
                )
            )

    _upload_split(train_rows, train_split_id)
    _upload_split(val_rows, val_split_id)

# Upload reviews.csv (columns: review_text, sentiment). val_ratio is left
# at its default of 0.2, so 20% of rows land in the validation split.
upload_text_dataset(
    client=client,
    dataset_id=dataset.id,
    version_id=version.id,
    csv_path="./data/reviews.csv",
    train_split_id=train_split.id,
    val_split_id=val_split.id,
    text_col="review_text",
    label_col="sentiment",
    labels=labels
)

Object Detection Data (Bounding Boxes)

import json
from seeme.types import Annotation

def upload_detection_dataset(client, dataset_id, version_id, images_dir, annotations_file, train_split_id, val_split_id, labels, val_ratio=0.2):
    """
    Upload object detection data. Annotations in COCO format:
    {
      "images": [{"id": 1, "file_name": "img1.jpg", "width": 640, "height": 480}],
      "annotations": [{"image_id": 1, "category_id": 1, "bbox": [x, y, w, h]}],
      "categories": [{"id": 1, "name": "cat"}]
    }

    Args:
        client: SDK client used to create dataset items and annotations.
        dataset_id: Target dataset id.
        version_id: Dataset version to upload into.
        images_dir: Directory containing the image files.
        annotations_file: Path to the COCO-format JSON annotations.
        train_split_id: Split id that receives the training portion.
        val_split_id: Split id that receives the validation portion.
        labels: Mapping of category name -> label object (must expose ``.id``).
        val_ratio: Fraction of images held out for validation.

    Raises:
        ValueError: if an annotated image lacks width/height, which are
            required to normalize bounding boxes to 0-1.
    """
    with open(annotations_file, 'r') as f:
        coco = json.load(f)

    # Lookup tables: category id -> name, image id -> list of its annotations.
    categories = {c['id']: c['name'] for c in coco['categories']}
    image_annotations = {}
    for ann in coco['annotations']:
        image_annotations.setdefault(ann['image_id'], []).append(ann)

    # Shuffle a copy so the parsed COCO dict is left untouched; the first
    # val_ratio fraction of the shuffled order goes to validation.
    images = list(coco['images'])
    random.shuffle(images)
    val_count = int(len(images) * val_ratio)

    for i, img_info in enumerate(images):
        split_id = val_split_id if i < val_count else train_split_id
        img_path = os.path.join(images_dir, img_info['file_name'])

        # Upload image
        item = client.create_dataset_item(
            version_id=version_id,
            split_id=split_id,
            file_path=img_path
        )

        # Image dimensions are required to normalize boxes to 0-1. Failing
        # loudly beats silently uploading pixel coordinates as "normalized"
        # (the previous default of 1 did exactly that).
        img_w = img_info.get('width')
        img_h = img_info.get('height')

        for ann in image_annotations.get(img_info['id'], []):
            if not img_w or not img_h:
                raise ValueError(
                    f"Image {img_info['file_name']} is missing width/height; "
                    "cannot normalize bounding boxes"
                )
            x, y, w, h = ann['bbox']
            client.annotate(
                dataset_id=dataset_id,
                dataset_version_id=version_id,
                annotation=Annotation(
                    label_id=labels[categories[ann['category_id']]].id,
                    split_id=split_id,
                    item_id=item.id,
                    bbox={
                        "x": x / img_w,
                        "y": y / img_h,
                        "width": w / img_w,
                        "height": h / img_h
                    }
                )
            )

    print(f"Uploaded {len(images)} images with bounding boxes")

Verify Your Dataset

After uploading, verify the data:

# Get dataset statistics
stats = client.get_dataset_stats(version_id=version.id)

print("Dataset Statistics:")
print(f"  Total items: {stats['total_items']}")
print(f"  Training items: {stats['splits']['train']}")
print(f"  Validation items: {stats['splits']['validation']}")
print("\nLabel Distribution:")
for label, count in stats['label_counts'].items():
    print(f"  {label}: {count}")

# Check for class imbalance. Guard the ratio: min()/max() raise on an empty
# mapping, and a zero-count class would divide by zero.
label_counts = stats['label_counts']
if label_counts and min(label_counts.values()) > 0:
    imbalance_ratio = max(label_counts.values()) / min(label_counts.values())
    if imbalance_ratio > 5:
        print(f"\n⚠️  Warning: Class imbalance detected (ratio: {imbalance_ratio:.1f}x)")
        print("   Consider: oversampling, weighted loss, or collecting more data")
else:
    print("\n⚠️  Warning: a label has zero items (or no labels exist) - fix before training")

Handle Class Imbalance

If some classes have far fewer examples than others:

Option 1: Weighted Training

# Configure weighted loss in training.
# NOTE(review): "balanced" presumably weights classes inversely to their
# frequency, as the inline comment suggests - confirm in the training docs.
job = client.create_job(
    dataset_id=dataset.id,
    version_id=version.id,
    name="Weighted Training",
    config={
        "base_model": "efficientnet_b0",
        "epochs": 20,
        "class_weights": "balanced"  # Auto-compute weights from class frequencies
    }
)

Option 2: Oversampling

# Manually oversample minority classes during upload
# or use data augmentation more heavily on minority classes.
# NOTE(review): "minority" presumably duplicates minority-class samples until
# classes are balanced - confirm the strategy in the training docs.
job = client.create_job(
    dataset_id=dataset.id,
    version_id=version.id,
    name="With Oversampling",
    config={
        "base_model": "efficientnet_b0",
        "epochs": 20,
        "oversampling": True,
        "oversampling_strategy": "minority"  # Oversample minority classes
    }
)

Option 3: Collect More Data

Use Automated Labeling to quickly label more examples for underrepresented classes.

Data Versioning

Create a new version when you modify your dataset:

# Create new version (copies structure, not data).
# Both v1 and v2 remain addressable afterwards, so training jobs can pin
# either version.
version_v2 = client.create_dataset_version(
    dataset_id=dataset.id,
    name="v2",
    description="Added 200 more scratch examples",
    copy_from_version_id=version.id  # Optionally copy data from v1
)

# Add new data to v2
# ... upload additional items ...

# Both versions are preserved - you can train on either

Best Practices

  1. Inspect samples - Look at 50-100 random items to catch labeling errors
  2. Check edge cases - Ensure ambiguous examples are consistently labeled
  3. No data leakage - Never put augmented versions of the same image in both train and val
  4. Document your criteria - Write down labeling guidelines for consistency
  5. Version everything - Create new dataset versions instead of modifying in place
  6. Stratified splits - Ensure each split has proportional class representation

Next Step

With your data ready, proceed to Choose Base Model to select the right pre-trained model for finetuning.