Post-Processor Configuration

Configure post-processors to automatically process data when it is uploaded to datasets.

Basic Configuration

Configuration Options

Core Settings

| Setting | Description | Required |
| --- | --- | --- |
| `name` | Descriptive name | Yes |
| `model_type` | Type of processing | Yes |
| `model_id` | ID of model to use | Yes* |
| `output_target` | Where to store results | Yes |
| `enabled` | Active or paused | No (default: `true`) |
| `order` | Execution order | No (default: `1`) |

*Not required when using external providers

Output Targets

# output_target="text": the model's output replaces the item's text field
transcription_processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Transcription",
    model_type="stt",
    model_id=stt_model.id,
    output_target="text"
)

# output_target="annotations": predictions are stored as label annotations
entity_processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Entity Extraction",
    model_type="ner",
    model_id=ner_model.id,
    output_target="annotations"
)

# output_target="both": write the text field AND create annotations
document_processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="OCR + Classification",
    model_type="ocr",
    model_id=ocr_model.id,
    output_target="both"
)

Filtering Options

# Restrict which items the processor runs on and which predictions it keeps.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="High Confidence Only",
    model_type="classification",
    model_id=model.id,
    output_target="annotations",

    # Only keep predictions above threshold
    # (predictions scoring below 0.8 are discarded rather than stored)
    confidence_threshold=0.8,

    # Filter by file type — items with other extensions are skipped entirely
    file_types=["jpg", "png", "webp"],

    # Only process items in specific splits
    splits=["train", "validation"]
)

Label Management

# Control how predicted class names are turned into dataset labels.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Auto-Categorize",
    model_type="classification",
    model_id=model.id,
    output_target="annotations",

    # Automatically create new labels from predictions
    auto_create_labels=True,

    # Or map predictions to existing labels
    # NOTE: auto_create_labels and label_mapping are alternatives; both are
    # shown here only for illustration — typically you would use one or the other.
    label_mapping={
        "cat": "label_id_for_cat",
        "dog": "label_id_for_dog"
    },

    # Prefix for auto-created labels
    label_prefix="auto_"
)

External Providers

Use OpenAI, Anthropic, or other external APIs:

OpenAI

# Route processing to OpenAI instead of a hosted model
# (no model_id needed when an external provider is configured).
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="GPT-4 Analysis",
    model_type="llm",

    # External provider configuration
    external_provider="openai",
    external_model="gpt-4-turbo",
    external_config={
        # NOTE: avoid hard-coding keys in real code — load from an
        # environment variable or a secrets manager instead.
        "api_key": "sk-...",
        "temperature": 0.3,
        "max_tokens": 1000
    },

    # Custom prompt sent along with each item's content
    prompt="""
    Analyze this document and extract:
    - Main topic
    - Key entities
    - Sentiment

    Return as JSON.
    """,

    output_target="text"
)

Anthropic

# Same pattern as OpenAI, using Anthropic as the external provider.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Claude Analysis",
    model_type="llm",

    external_provider="anthropic",
    external_model="claude-3-5-sonnet-20241022",
    external_config={
        # NOTE: load the key from an environment variable in real code.
        "api_key": "sk-ant-...",
        "max_tokens": 2000
    },

    prompt="Summarize this document in 3 bullet points.",
    output_target="text"
)

Custom API

# Call an arbitrary HTTP endpoint instead of a built-in provider.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Custom Model",
    model_type="custom",

    external_provider="custom",
    external_config={
        # Endpoint that receives each item's content for prediction
        "endpoint": "https://my-api.com/predict",
        "api_key": "my-key",
        # Extra headers attached to every request
        "headers": {
            "X-Custom-Header": "value"
        },
        "timeout": 30  # request timeout — presumably seconds; confirm in API docs
    },

    output_target="text"
)

Advanced Configuration

Retry Settings

# Automatically retry failed items instead of dropping them.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Robust Processor",
    model_type="classification",
    model_id=model.id,
    output_target="annotations",

    # Retry on failure
    retry_config={
        "max_retries": 3,          # give up after 3 attempts
        "retry_delay": 5,          # seconds between attempts
        "retry_on": ["timeout", "rate_limit"]  # only these failure kinds are retried
    }
)

Batch Processing

# Group items into batches to reduce per-item overhead.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Batch Processor",
    model_type="classification",
    model_id=model.id,
    output_target="annotations",

    # Process in batches for efficiency
    batch_config={
        "batch_size": 32,      # items sent to the model per batch
        "max_concurrent": 4    # batches processed in parallel
    }
)

Conditional Processing

# Skip items that don't meet the given conditions.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Conditional Processor",
    model_type="ner",
    model_id=ner_model.id,
    output_target="annotations",

    # Only process if item has text
    conditions={
        "has_text": True,
        "text_min_length": 10  # also skip items with fewer than 10 characters of text
    }
)

Update Configuration

# Update processor settings
# (presumably a partial update — only the fields passed are changed; confirm in API docs)
client.update_post_processor(
    processor_id=processor.id,
    confidence_threshold=0.9,
    enabled=True
)

# Disable processor — pauses it without deleting its configuration
client.update_post_processor(
    processor_id=processor.id,
    enabled=False
)

# Change model — swap in a different model while keeping other settings
client.update_post_processor(
    processor_id=processor.id,
    model_id=new_model.id
)

Delete Post-Processor

# Delete processor (stops future processing)
# NOTE(review): presumably results already written to items are kept — confirm in API docs
client.delete_post_processor(processor_id=processor.id)

Configuration Examples

Audio Transcription

# Transcribe uploaded audio and store the transcript in each item's text field.
stt_processor = client.create_post_processor(
    dataset_id=audio_dataset.id,
    name="Whisper Transcription",
    model_type="stt",
    model_id=whisper_model.id,
    output_target="text",
    # Model-specific options passed through to the STT model
    config={
        "language": "en",          # presumably an ISO 639-1 code — confirm in model docs
        "timestamps": True,        # include segment-level timestamps
        "word_timestamps": False   # skip the more expensive per-word timestamps
    }
)

Document Processing

# Two-stage pipeline: lower `order` runs first, so OCR populates the text
# field before NER reads it.

# OCR first
ocr_processor = client.create_post_processor(
    dataset_id=docs_dataset.id,
    name="Extract Text",
    model_type="ocr",
    model_id=ocr_model.id,
    output_target="text",
    order=1
)

# Then NER — runs second, over the text produced by the OCR step
ner_processor = client.create_post_processor(
    dataset_id=docs_dataset.id,
    name="Find Entities",
    model_type="ner",
    model_id=ner_model.id,
    output_target="annotations",
    order=2
)

Best Practices

  1. Set appropriate thresholds - Filter low-confidence results
  2. Use order for dependencies - OCR before NER
  3. Enable auto_create_labels carefully - Review created labels
  4. Test before enabling - Run on sample data first
  5. Monitor processing - Check for failures regularly

Next Steps