Post-Processor Configuration

Configure post-processors to automatically process data when it is uploaded to datasets.

Basic Configuration

Configuration Options

Core Settings

| Setting | Description | Required |
| --- | --- | --- |
| `name` | Descriptive name | Yes |
| `model_type` | Type of processing | Yes |
| `model_id` | ID of model to use | Yes* |
| `output_target` | Where to store results | Yes |
| `enabled` | Active or paused | No (default: `true`) |
| `order` | Execution order | No (default: `1`) |

*Not required when using external providers

Output Targets

# output_target="text": the model's output replaces the item's text field
transcription_processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Transcription",
    model_type="stt",
    model_id=stt_model.id,
    output_target="text"
)

# output_target="annotations": predictions are stored as label annotations
entity_processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Entity Extraction",
    model_type="ner",
    model_id=ner_model.id,
    output_target="annotations"
)

# output_target="both": write the text field AND create annotations
document_processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="OCR + Classification",
    model_type="ocr",
    model_id=ocr_model.id,
    output_target="both"
)

Filtering Options

# Restrict which items the processor runs on and which predictions it keeps.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="High Confidence Only",
    model_type="classification",
    model_id=model.id,
    output_target="annotations",

    # Only keep predictions above threshold
    # (predictions scoring below 0.8 are discarded rather than stored)
    confidence_threshold=0.8,

    # Filter by file type — items with other extensions are skipped entirely
    file_types=["jpg", "png", "webp"],

    # Only process items in specific splits
    splits=["train", "validation"]
)

Label Management

# Control how predicted class names are turned into dataset labels.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Auto-Categorize",
    model_type="classification",
    model_id=model.id,
    output_target="annotations",

    # Automatically create new labels from predictions
    auto_create_labels=True,

    # Or map predictions to existing labels
    # NOTE: auto_create_labels and label_mapping are alternatives; both are
    # shown here only for illustration — typically you would use one or the other.
    label_mapping={
        "cat": "label_id_for_cat",
        "dog": "label_id_for_dog"
    },

    # Prefix for auto-created labels
    label_prefix="auto_"
)

External Providers

Use OpenAI, Anthropic, or other external APIs:

OpenAI

# Route processing to OpenAI instead of a hosted model
# (no model_id needed when an external provider is configured).
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="GPT-4 Analysis",
    model_type="llm",

    # External provider configuration
    external_provider="openai",
    external_model="gpt-4-turbo",
    external_config={
        # NOTE: avoid hard-coding keys in real code — load from an
        # environment variable or a secrets manager instead.
        "api_key": "sk-...",
        "temperature": 0.3,
        "max_tokens": 1000
    },

    # Custom prompt sent along with each item's content
    prompt="""
    Analyze this document and extract:
    - Main topic
    - Key entities
    - Sentiment

    Return as JSON.
    """,

    output_target="text"
)

Anthropic

# Same pattern as OpenAI, using Anthropic as the external provider.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Claude Analysis",
    model_type="llm",

    external_provider="anthropic",
    external_model="claude-3-5-sonnet-20241022",
    external_config={
        # NOTE: load the key from an environment variable in real code.
        "api_key": "sk-ant-...",
        "max_tokens": 2000
    },

    prompt="Summarize this document in 3 bullet points.",
    output_target="text"
)

Custom API

# Call an arbitrary HTTP endpoint instead of a built-in provider.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Custom Model",
    model_type="custom",

    external_provider="custom",
    external_config={
        # Endpoint that receives each item's content for prediction
        "endpoint": "https://my-api.com/predict",
        "api_key": "my-key",
        # Extra headers attached to every request
        "headers": {
            "X-Custom-Header": "value"
        },
        "timeout": 30  # request timeout — presumably seconds; confirm in API docs
    },

    output_target="text"
)

Advanced Configuration

Retry Settings

# Automatically retry failed items instead of dropping them.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Robust Processor",
    model_type="classification",
    model_id=model.id,
    output_target="annotations",

    # Retry on failure
    retry_config={
        "max_retries": 3,          # give up after 3 attempts
        "retry_delay": 5,          # seconds between attempts
        "retry_on": ["timeout", "rate_limit"]  # only these failure kinds are retried
    }
)

Batch Processing

# Group items into batches to reduce per-item overhead.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Batch Processor",
    model_type="classification",
    model_id=model.id,
    output_target="annotations",

    # Process in batches for efficiency
    batch_config={
        "batch_size": 32,      # items sent to the model per batch
        "max_concurrent": 4    # batches processed in parallel
    }
)

Conditional Processing

# Skip items that don't meet the given conditions.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Conditional Processor",
    model_type="ner",
    model_id=ner_model.id,
    output_target="annotations",

    # Only process if item has text
    conditions={
        "has_text": True,
        "text_min_length": 10  # also skip items with fewer than 10 characters of text
    }
)

Update Configuration

# Update processor settings
# (presumably a partial update — only the fields passed are changed; confirm in API docs)
client.update_post_processor(
    processor_id=processor.id,
    confidence_threshold=0.9,
    enabled=True
)

# Disable processor — pauses it without deleting its configuration
client.update_post_processor(
    processor_id=processor.id,
    enabled=False
)

# Change model — swap in a different model while keeping other settings
client.update_post_processor(
    processor_id=processor.id,
    model_id=new_model.id
)

Delete Post-Processor

# Delete processor (stops future processing)
# NOTE(review): presumably results already written to items are kept — confirm in API docs
client.delete_post_processor(processor_id=processor.id)

Configuration Examples

Audio Transcription

# Transcribe uploaded audio and store the transcript in each item's text field.
stt_processor = client.create_post_processor(
    dataset_id=audio_dataset.id,
    name="Whisper Transcription",
    model_type="stt",
    model_id=whisper_model.id,
    output_target="text",
    # Model-specific options passed through to the STT model
    config={
        "language": "en",          # presumably an ISO 639-1 code — confirm in model docs
        "timestamps": True,        # include segment-level timestamps
        "word_timestamps": False   # skip the more expensive per-word timestamps
    }
)

Document Processing

# Two-stage pipeline: lower `order` runs first, so OCR populates the text
# field before NER reads it.

# OCR first
ocr_processor = client.create_post_processor(
    dataset_id=docs_dataset.id,
    name="Extract Text",
    model_type="ocr",
    model_id=ocr_model.id,
    output_target="text",
    order=1
)

# Then NER — runs second, over the text produced by the OCR step
ner_processor = client.create_post_processor(
    dataset_id=docs_dataset.id,
    name="Find Entities",
    model_type="ner",
    model_id=ner_model.id,
    output_target="annotations",
    order=2
)

Best Practices

  1. Set appropriate thresholds - Filter low-confidence results
  2. Use order for dependencies - OCR before NER
  3. Enable auto_create_labels carefully - Review created labels
  4. Test before enabling - Run on sample data first
  5. Monitor processing - Check for failures regularly

Next Steps