# Post-Processor Configuration
Configure post-processors to automatically process data when uploaded to datasets.
## Basic Configuration

### Configuration Options

#### Core Settings
| Setting | Description | Required |
|---|---|---|
| `name` | Descriptive name | Yes |
| `model_type` | Type of processing | Yes |
| `model_id` | ID of model to use | Yes* |
| `output_target` | Where to store results | Yes |
| `enabled` | Active or paused | No (default: `true`) |
| `order` | Execution order | No (default: `1`) |

\*Not required when using external providers.
## Output Targets
# Write results into the item's text field.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Transcription",
    model_type="stt",
    model_id=stt_model.id,
    # "text" overwrites the dataset item's existing text field.
    output_target="text",
)
# Write results as label annotations instead of text.
processor = client.create_post_processor(
    dataset_id=dataset.id,
    name="Entity Extraction",
    model_type="ner",
    model_id=ner_model.id,
    # "annotations" creates label annotations on each item.
    output_target="annotations",
)
# Store both: write the item's text field AND create label annotations.
processor = client.create_post_processor(
dataset_id=dataset.id,
name="OCR + Classification",
model_type="ocr",
model_id=ocr_model.id,
output_target="both" # Writes text and creates annotations in one pass
)

## Filtering Options
# Restrict which items are processed and which predictions are kept.
processor = client.create_post_processor(
dataset_id=dataset.id,
name="High Confidence Only",
model_type="classification",
model_id=model.id,
output_target="annotations",
# Discard predictions whose confidence score is below 0.8
confidence_threshold=0.8,
# Only run on items with these file extensions
file_types=["jpg", "png", "webp"],
# Only run on items assigned to these dataset splits
splits=["train", "validation"]
)

## Label Management
# Control how predicted class names are turned into dataset labels.
processor = client.create_post_processor(
dataset_id=dataset.id,
name="Auto-Categorize",
model_type="classification",
model_id=model.id,
output_target="annotations",
# Create a new dataset label whenever a prediction has no matching label
auto_create_labels=True,
# Alternatively, map prediction names to existing label IDs explicitly
label_mapping={
"cat": "label_id_for_cat",
"dog": "label_id_for_dog"
},
# Name prefix applied to labels created by auto_create_labels
label_prefix="auto_"
)

## External Providers
Use OpenAI, Anthropic, or other external APIs:
### OpenAI
# Run each item through an OpenAI model instead of a hosted model
# (model_id is not needed when external_provider is set).
processor = client.create_post_processor(
dataset_id=dataset.id,
name="GPT-4 Analysis",
model_type="llm",
# External provider and model selection
external_provider="openai",
external_model="gpt-4-turbo",
# NOTE(review): avoid hard-coding API keys in source; load them from an
# environment variable or a secret store instead.
external_config={
"api_key": "sk-...",
"temperature": 0.3,
"max_tokens": 1000
},
# Prompt sent to the model for each processed item
prompt="""
Analyze this document and extract:
- Main topic
- Key entities
- Sentiment
Return as JSON.
""",
output_target="text"
)

### Anthropic
# Same pattern with Anthropic as the external provider.
processor = client.create_post_processor(
dataset_id=dataset.id,
name="Claude Analysis",
model_type="llm",
external_provider="anthropic",
external_model="claude-3-5-sonnet-20241022",
# NOTE(review): prefer loading the API key from the environment.
external_config={
"api_key": "sk-ant-...",
"max_tokens": 2000
},
# Prompt sent to the model for each processed item
prompt="Summarize this document in 3 bullet points.",
output_target="text"
)

### Custom API
# Call an arbitrary HTTP prediction endpoint as the external provider.
processor = client.create_post_processor(
dataset_id=dataset.id,
name="Custom Model",
model_type="custom",
external_provider="custom",
external_config={
"endpoint": "https://my-api.com/predict",
"api_key": "my-key",
# Extra HTTP headers sent with every request
"headers": {
"X-Custom-Header": "value"
},
# Request timeout — presumably seconds; confirm against the SDK docs
"timeout": 30
},
output_target="text"
)

## Advanced Configuration
### Retry Settings
# Retry failed items automatically instead of dropping them.
processor = client.create_post_processor(
dataset_id=dataset.id,
name="Robust Processor",
model_type="classification",
model_id=model.id,
output_target="annotations",
# Retry policy applied when processing an item fails
retry_config={
"max_retries": 3,
"retry_delay": 5, # seconds between attempts
# Only retry these failure classes; other errors fail immediately
"retry_on": ["timeout", "rate_limit"]
}
)

### Batch Processing
# Group items into batches to reduce per-item overhead.
processor = client.create_post_processor(
dataset_id=dataset.id,
name="Batch Processor",
model_type="classification",
model_id=model.id,
output_target="annotations",
# Batch sizing and parallelism for throughput
batch_config={
"batch_size": 32, # items per batch
"max_concurrent": 4 # batches processed in parallel
}
)

### Conditional Processing
# Skip items that do not meet the given conditions.
processor = client.create_post_processor(
dataset_id=dataset.id,
name="Conditional Processor",
model_type="ner",
model_id=ner_model.id,
output_target="annotations",
# Only process items that have a text field of at least 10 characters
conditions={
"has_text": True,
"text_min_length": 10
}
)

## Update Configuration
# Tighten the confidence filter and (re)enable the processor
client.update_post_processor(
processor_id=processor.id,
confidence_threshold=0.9,
enabled=True
)
# Pause the processor without deleting its configuration
client.update_post_processor(
processor_id=processor.id,
enabled=False
)
# Swap in a different model; other settings are unchanged
client.update_post_processor(
processor_id=processor.id,
model_id=new_model.id
)

## Delete Post-Processor
# Delete processor (stops future processing)
client.delete_post_processor(processor_id=processor.id)

## Configuration Examples
### Audio Transcription
# Transcribe uploaded audio into each item's text field.
stt_processor = client.create_post_processor(
dataset_id=audio_dataset.id,
name="Whisper Transcription",
model_type="stt",
model_id=whisper_model.id,
output_target="text",
# Model-specific options passed through to the STT model
config={
"language": "en",
"timestamps": True, # include segment-level timestamps
"word_timestamps": False # skip per-word timestamps
}
)

### Document Processing
# Two-stage pipeline: OCR must run first (order=1) so the NER stage
# (order=2) has text to work on.
ocr_processor = client.create_post_processor(
dataset_id=docs_dataset.id,
name="Extract Text",
model_type="ocr",
model_id=ocr_model.id,
output_target="text",
order=1
)
# NER runs after OCR because of its higher order value
ner_processor = client.create_post_processor(
dataset_id=docs_dataset.id,
name="Find Entities",
model_type="ner",
model_id=ner_model.id,
output_target="annotations",
order=2
)

## Best Practices
- **Set appropriate thresholds** — filter out low-confidence results.
- **Use `order` for dependencies** — e.g. run OCR before NER.
- **Enable `auto_create_labels` carefully** — review the labels it creates.
- **Test before enabling** — run on sample data first.
- **Monitor processing** — check for failures regularly.