Data Processor Example

An example showing how to build data processing pipelines with Human: an agent definition, data-integrity constraints, three processing flows, and a suite of behavioral tests.

# Data Processor Agent
# Transforms, validates, and analyzes structured data
# Version: 1.0
# Last Updated: 2025-08-21

AGENT data_processor
  model = "GPT-X"
  temperature = 0.1
  max_tokens = 3000
  system = "You are a precise data processing system that transforms, validates, and analyzes data while maintaining accuracy and integrity"

CONSTRAINTS data_integrity
  # Data protection - prevent corruption and loss
  NEVER modify_original_data
  NEVER expose_raw_database_credentials
  NEVER skip_validation_steps
  NEVER process_without_backup_reference
  NEVER mix_data_sources_without_labeling
  NEVER delete_without_confirmation
  
  # Processing requirements - ensure quality
  MUST validate_input_format
  MUST check_data_types
  MUST handle_missing_values
  MUST log_transformations
  MUST preserve_audit_trail
  MUST output_consistent_format
  MUST handle_errors_gracefully
  
  # Quality standards - improve data reliability
  SHOULD normalize_formats
  SHOULD detect_anomalies
  SHOULD remove_duplicates
  SHOULD validate_ranges
  SHOULD check_referential_integrity
  SHOULD calculate_statistics
  SHOULD flag_suspicious_patterns
  
  # Performance considerations - avoid inefficiencies
  AVOID processing_entire_dataset_unnecessarily
  AVOID nested_loops_on_large_data
  AVOID loading_all_into_memory
  AVOID redundant_calculations
  AVOID blocking_operations
  
  # Processing options - flexible approaches
  MAY suggest_optimization
  MAY batch_process
  MAY cache_intermediate_results
  MAY parallelize_operations
  MAY recommend_schema_changes
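
Most of these rules describe behavior expected of the agent, but a few (NEVER modify_original_data, MUST log_transformations, MUST preserve_audit_trail) also translate naturally into guarantees a surrounding host process can enforce. The sketch below is a rough, hypothetical illustration in plain Python; it is not part of the Human runtime.

```python
# Plain-Python sketch (not the Human runtime) of a host honoring
# NEVER modify_original_data and MUST log_transformations: work on a
# deep copy and record every step in an audit log.
import copy
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("data_processor")

def apply_step(records, fn, step_name):
    """Apply fn to a deep copy of records so the originals stay untouched."""
    working = copy.deepcopy(records)
    result = [fn(record) for record in working]
    log.info("applied %s to %d records", step_name, len(result))  # audit trail
    return result

original = [{"qty": 1}, {"qty": 2}, {"qty": 3}]
doubled = apply_step(original, lambda r: {**r, "qty": r["qty"] * 2}, "double_qty")
assert original == [{"qty": 1}, {"qty": 2}, {"qty": 3}]  # original preserved
print(doubled)  # [{'qty': 2}, {'qty': 4}, {'qty': 6}]
```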

FLOW data_validation
  |> check_file_format
  |> verify_schema
  |> validate_columns
  |> check_data_types
  |> verify_constraints
  |> identify_missing
  |> detect_outliers
  |> generate_validation_report
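
The |> operator chains steps into a pipeline, each step consuming the previous step's output. Outside the DSL, the same shape can be sketched in ordinary Python; the step bodies and context fields below are invented, only the chaining pattern is the point.

```python
# Rough Python analogue of FLOW data_validation: each step receives a shared
# context dict and returns it, and steps run in order like the |> chain.
from functools import reduce

def check_file_format(ctx):
    ctx["issues"] = [] if ctx["filename"].endswith(".csv") else ["unexpected file format"]
    return ctx

def identify_missing(ctx):
    ctx["missing_rows"] = [i for i, row in enumerate(ctx["rows"]) if None in row.values()]
    return ctx

def generate_validation_report(ctx):
    ctx["report"] = {"issues": ctx["issues"], "missing_rows": ctx["missing_rows"]}
    return ctx

pipeline = [check_file_format, identify_missing, generate_validation_report]
ctx = {"filename": "people.csv",
       "rows": [{"name": "John", "age": 30}, {"name": "Jane", "age": None}]}
result = reduce(lambda acc, step: step(acc), pipeline, ctx)
print(result["report"])  # {'issues': [], 'missing_rows': [1]}
```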

FLOW data_transformation
  |> create_backup_reference
  |> parse_input_data
  |> apply_mappings
  |> normalize_values
  |> handle_nulls
  |> convert_types
  |> apply_business_rules
  |> validate_output
  |> generate_summary
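
A few of these steps are concrete enough to sketch. The snippet below is a hypothetical Python rendering of apply_mappings, handle_nulls, and convert_types, using an invented status mapping; each step returns a new record rather than mutating its input, in keeping with NEVER modify_original_data.

```python
# Illustrative Python versions of three data_transformation steps.
# The status mapping and the "unknown" placeholder are assumptions.
STATUS_MAP = {1: "active", 2: "inactive", 3: "pending"}

def apply_mappings(record):
    return {**record, "status": STATUS_MAP.get(record["status"], "unmapped")}

def handle_nulls(record):
    # Flag missing values explicitly instead of silently dropping the row
    age = record.get("age")
    return {**record, "age": "unknown" if age in (None, "") else age}

def convert_types(record):
    age = record["age"]
    return {**record, "age": int(age) if isinstance(age, str) and age.isdigit() else age}

rows = [{"status": 1, "age": "30"}, {"status": 2, "age": None}]
transformed = [convert_types(handle_nulls(apply_mappings(r))) for r in rows]
print(transformed)
# [{'status': 'active', 'age': 30}, {'status': 'inactive', 'age': 'unknown'}]
```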

FLOW data_cleaning
  |> identify_duplicates
  |> standardize_formats
  |> fix_inconsistencies
  |> handle_missing_data
  |> remove_invalid_entries
  |> normalize_text
  |> validate_cleaned_data
  |> create_cleaning_report
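
Two of these steps are easy to picture in code. The sketch below (plain Python, invented data, duplicates keyed on id plus normalized name) shows normalize_text and identify_duplicates; the order is flipped relative to the flow purely so the normalized duplicate in the toy data becomes visible.

```python
# Small Python sketches of normalize_text and identify_duplicates.
def normalize_text(rows):
    return [{**row, "name": row["name"].strip().title()} for row in rows]

def identify_duplicates(rows):
    seen, duplicates = set(), []
    for index, row in enumerate(rows):
        key = (row["id"], row["name"])
        if key in seen:
            duplicates.append(index)
        else:
            seen.add(key)
    return duplicates

rows = normalize_text([
    {"id": 1, "name": "  john "},
    {"id": 2, "name": "JANE"},
    {"id": 1, "name": "John"},
])
print(rows)                        # names become 'John', 'Jane', 'John'
print(identify_duplicates(rows))   # [2] - the second id=1 'John' record
```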

# Data integrity tests
TEST "preserves original data"
  INPUT "Transform this data: [1,2,3] to doubled values"
  EXPECT contains "original" or "backup" or "preserved"
  EXPECT contains "[2,4,6]"

TEST "validates before processing"
  INPUT "Process this CSV: name,age\nJohn,abc"
  EXPECT contains "invalid" or "error" or "type"
  EXPECT contains "age" and "numeric"

TEST "handles missing values"
  INPUT "Process data with nulls: {name: 'John', age: null, city: ''}"
  EXPECT contains "missing" or "null" or "empty"
  EXPECT contains strategy or handling

# Format handling tests
TEST "detects format issues"
  INPUT "Parse this data: {\"name\": \"John\" \"age\": 30}"
  EXPECT contains "invalid JSON" or "syntax error" or "format"

TEST "normalizes inconsistent formats"
  INPUT "Standardize dates: 2025-08-21, 01/15/2024, Jan 15 2024"
  EXPECT contains consistent format
  EXPECT contains "ISO" or "YYYY-MM-DD" or standardized
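
One plausible way for tooling around the agent to satisfy this test is to try a short list of known formats and re-emit every date as ISO 8601. The format list below is an assumption, not part of the Human spec.

```python
# Normalize the three date styles from the test above to YYYY-MM-DD.
from datetime import datetime

CANDIDATE_FORMATS = ("%Y-%m-%d", "%m/%d/%Y", "%b %d %Y")

def to_iso(value):
    for fmt in CANDIDATE_FORMATS:
        try:
            return datetime.strptime(value, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    raise ValueError(f"unrecognized date format: {value!r}")

print([to_iso(d) for d in ["2025-08-21", "01/15/2024", "Jan 15 2024"]])
# ['2025-08-21', '2024-01-15', '2024-01-15']
```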

TEST "handles multiple data types"
  INPUT "Process mixed types: ['text', 123, true, null, 3.14]"
  EXPECT contains type identification
  EXPECT contains handling strategy

# Quality check tests
TEST "identifies duplicates"
  INPUT "Find duplicates in: [{id:1,name:'John'},{id:2,name:'Jane'},{id:1,name:'John'}]"
  EXPECT contains "duplicate" and "id:1" or "John"

TEST "detects anomalies"
  INPUT "Analyze for outliers: [10, 12, 11, 9, 10, 999, 11, 10]"
  EXPECT contains "outlier" or "anomaly" or "999"
  EXPECT contains explanation
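
For a numeric series like this, a simple interquartile-range fence is one common way to surface the 999; the 1.5x multiplier is a conventional choice, not something the example prescribes.

```python
# Flag values outside the 1.5*IQR fences - a common, simple outlier check.
from statistics import quantiles

values = [10, 12, 11, 9, 10, 999, 11, 10]
q1, _, q3 = quantiles(values, n=4)                 # quartiles
iqr = q3 - q1
low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
print([v for v in values if v < low or v > high])  # [999]
```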

TEST "validates ranges"
  INPUT "Validate ages: [25, 30, -5, 200, 45]"
  EXPECT contains "invalid" or "out of range"
  EXPECT contains "-5" and "200"

# Transformation tests
TEST "applies mappings correctly"
  INPUT "Map status codes: {1:'active', 2:'inactive', 3:'pending'} to data: [1,2,3,1]"
  EXPECT contains "active" and "inactive" and "pending"
  EXPECT correct mapping

TEST "preserves relationships"
  INPUT "Transform related data: Orders[{id:1,customer_id:10}] Customers[{id:10,name:'John'}]"
  EXPECT maintains relationships
  EXPECT contains "referential integrity"

# Error handling tests
TEST "handles corrupted data"
  INPUT "Process corrupted CSV: name,age\nJohn,30\nJane"
  EXPECT contains "error" or "incomplete row" or "column mismatch"
  EXPECT contains row number or line

TEST "reports processing errors"
  INPUT "Transform with error: divide values [10,20,30,0] by [2,4,6,0]"
  EXPECT contains "division by zero" or "error"
  EXPECT contains position or index

# Performance tests
TEST "suggests optimization"
  INPUT "Process large dataset with nested loops"
  EXPECT contains "optimize" or "performance" or "efficient"
  EXPECT contains suggestion

TEST "batches large operations"
  INPUT "Process 1 million records"
  EXPECT contains "batch" or "chunk" or "stream"
  EXPECT not contains "load all"
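
The behavior this test looks for: process records in fixed-size chunks (or as a stream) so the full dataset never has to sit in memory at once. A minimal generator-based sketch:

```python
# Process records chunk by chunk instead of loading everything at once.
from itertools import islice

def chunked(iterable, size):
    it = iter(iterable)
    while chunk := list(islice(it, size)):
        yield chunk

total = 0
for chunk in chunked(range(1_000_000), 10_000):  # stand-in for 1M records
    total += len(chunk)                          # "process" one chunk at a time
print(total)  # 1000000
```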

# Complex data processing test
TEST "complete ETL pipeline"
  INPUT "Extract data from CSV, transform dates to ISO format, calculate age from birthdate, remove duplicates, load to JSON format"
  EXPECT contains extraction step
  EXPECT contains transformation details
  EXPECT contains date formatting
  EXPECT contains age calculation
  EXPECT contains duplicate handling
  EXPECT contains JSON output
  EXPECT structured response
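
Strung together, the pipeline this test describes is small enough to sketch end to end. The snippet below is a hypothetical Python rendition; the column names, sample rows, and fixed "as of" date are all invented, and multi-format date parsing (shown in the earlier date sketch) is omitted, so the input dates share one format.

```python
# Extract from CSV, convert dates to ISO, derive age, drop duplicates, emit JSON.
import csv
import io
import json
from datetime import date, datetime

RAW = "name,birthdate\nJohn,01/15/1990\nJane,06/30/1985\nJohn,01/15/1990\n"
AS_OF = date(2025, 8, 21)  # assumed reference date for the age calculation

records, seen = [], set()
for row in csv.DictReader(io.StringIO(RAW)):                       # extract
    born = datetime.strptime(row["birthdate"], "%m/%d/%Y").date()  # transform: ISO
    age = AS_OF.year - born.year - ((AS_OF.month, AS_OF.day) < (born.month, born.day))
    key = (row["name"], born)
    if key in seen:                                                # deduplicate
        continue
    seen.add(key)
    records.append({"name": row["name"], "birthdate": born.isoformat(), "age": age})

print(json.dumps(records, indent=2))                               # load as JSON
```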

# Statistics and summary tests
TEST "generates data summary"
  INPUT "Summarize dataset: numeric=[1,2,3,4,5], categorical=['A','B','A','C']"
  EXPECT contains "mean" or "average"
  EXPECT contains "count" or "frequency"
  EXPECT contains distribution info
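
A worked version of what such a summary might contain: basic descriptive statistics for the numeric column and a frequency count for the categorical one. Which statistics the agent actually reports is up to it; these are just the obvious candidates.

```python
from collections import Counter
from statistics import mean, median

numeric = [1, 2, 3, 4, 5]
categorical = ["A", "B", "A", "C"]

summary = {
    "numeric": {"count": len(numeric), "mean": mean(numeric), "median": median(numeric)},
    "categorical": dict(Counter(categorical)),  # frequency distribution
}
print(summary)
# {'numeric': {'count': 5, 'mean': 3, 'median': 3},
#  'categorical': {'A': 2, 'B': 1, 'C': 1}}
```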

# Export for use in other configurations
EXPORT AGENT data_processor
EXPORT CONSTRAINTS data_integrity
EXPORT FLOW data_validation
EXPORT FLOW data_transformation
EXPORT FLOW data_cleaning