Skip to content

L2↔L5 Contract: Orchestration ↔ Evaluation ​

Canonical specification for communication between Orchestration Layer (L2) and Evaluation Layer (L5).

Overview ​

Purpose: Enable Orchestration (L2) to submit completion results to Evaluation (L5) for quality assessment, benchmarking, and feedback collection.

Direction: Bidirectional

  • L2 → L5: Task completion results from Orchestration to Evaluation
  • L5 → L2: Quality scores, benchmarks, and feedback from Evaluation to Orchestration

Transport: HTTP REST API with JSON payloads

Request Contract: L2 → L5

evaluation_request.v0 ​

json
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "https://mnemoverse.dev/schemas/evaluation_request.v0.json",
  "title": "evaluation_request.v0",
  "type": "object",
  "required": ["request_id", "task_id", "intent", "completion", "metrics", "deadline_ms"],
  "properties": {
    "request_id": { "type": "string" },
    "task_id": { "type": "string" },
    "intent": { "type": "string", "minLength": 1 },
    "completion": {
      "type": "object",
      "required": ["fragments", "total_tokens", "latency_ms"],
      "properties": {
        "fragments": {
          "type": "array",
          "items": {
            "type": "object",
            "required": ["id", "text", "lod", "cost_tokens"],
            "properties": {
              "id": { "type": "string" },
              "text": { "type": "string" },
              "lod": { "type": "string", "enum": ["macro", "micro", "atomic"] },
              "entities": { "type": "array", "items": { "type": "string" } },
              "cost_tokens": { "type": "integer", "minimum": 0 },
              "source": { "type": "string", "enum": ["L1", "L4", "cached"] }
            },
            "additionalProperties": false
          }
        },
        "total_tokens": { "type": "integer", "minimum": 0 },
        "latency_ms": { "type": "integer", "minimum": 0 }
      },
      "additionalProperties": false
    },
    "metrics": {
      "type": "object",
      "required": ["coverage_entities", "quality_threshold"],
      "properties": {
        "coverage_entities": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
        "quality_threshold": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
        "provider_latencies": {
          "type": "object",
          "properties": {
            "L1_ms": { "type": "integer", "minimum": 0 },
            "L4_ms": { "type": "integer", "minimum": 0 }
          },
          "additionalProperties": false
        }
      },
      "additionalProperties": false
    },
    "deadline_ms": { "type": "integer", "minimum": 100 },
    "evaluation_type": { 
      "type": "string", 
      "enum": ["quality_score", "benchmark", "feedback", "all"],
      "default": "all"
    },
    "context": {
      "type": "object",
      "properties": {
        "user_id": { "type": "string" },
        "session_id": { "type": "string" },
        "domain": { "type": "string", "enum": ["code", "documentation", "research", "general"] },
        "budget_used": {
          "type": "object",
          "properties": {
            "tokens_max": { "type": "integer", "minimum": 0 },
            "time_ms": { "type": "integer", "minimum": 0 }
          },
          "additionalProperties": false
        }
      },
      "additionalProperties": false
    }
  },
  "additionalProperties": false
}

Request Examples ​

Quality Score Evaluation:

json
{
  "request_id": "req_eval_001",
  "task_id": "task_550e8400-e29b-41d4-a716-446655440000",
  "intent": "Find authentication issues in React app and suggest debugging steps",
  "completion": {
    "fragments": [
      {
        "id": "frag_auth_issue_001",
        "text": "Authentication middleware in Express.js verifies JWT tokens using jsonwebtoken library...",
        "lod": "macro",
        "entities": ["jwt", "middleware", "authentication", "express"],
        "cost_tokens": 150,
        "source": "L1"
      },
      {
        "id": "frag_debug_steps_002",
        "text": "To debug JWT issues: 1. Check token expiration using jwt.decode(), 2. Verify secret key matches...",
        "lod": "micro", 
        "entities": ["debugging", "jwt", "troubleshooting"],
        "cost_tokens": 200,
        "source": "L4"
      }
    ],
    "total_tokens": 350,
    "latency_ms": 1250
  },
  "metrics": {
    "coverage_entities": 0.85,
    "quality_threshold": 0.8,
    "provider_latencies": {
      "L1_ms": 800,
      "L4_ms": 450
    }
  },
  "deadline_ms": 3000,
  "evaluation_type": "quality_score",
  "context": {
    "user_id": "user_123",
    "session_id": "session_456",
    "domain": "code",
    "budget_used": {
      "tokens_max": 2000,
      "time_ms": 2500
    }
  }
}

Benchmark Evaluation:

json
{
  "request_id": "req_eval_002",
  "task_id": "task_benchmark_001",
  "intent": "Explain transformer architecture components",
  "completion": {
    "fragments": [
      {
        "id": "frag_transformer_001",
        "text": "Transformer architecture consists of encoder-decoder blocks with multi-head attention mechanisms...",
        "lod": "macro",
        "entities": ["transformer", "attention", "encoder", "decoder"],
        "cost_tokens": 300,
        "source": "L1"
      }
    ],
    "total_tokens": 300,
    "latency_ms": 900
  },
  "metrics": {
    "coverage_entities": 0.92,
    "quality_threshold": 0.85
  },
  "deadline_ms": 2000,
  "evaluation_type": "benchmark",
  "context": {
    "domain": "research"
  }
}

Response Contract: L5 → L2

evaluation_response.v0 ​

json
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "https://mnemoverse.dev/schemas/evaluation_response.v0.json", 
  "title": "evaluation_response.v0",
  "type": "object",
  "required": ["request_id", "task_id", "results", "metadata"],
  "properties": {
    "request_id": { "type": "string" },
    "task_id": { "type": "string" },
    "results": {
      "type": "object",
      "properties": {
        "quality_score": {
          "type": "object",
          "properties": {
            "overall": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
            "dimensions": {
              "type": "object",
              "properties": {
                "relevance": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
                "completeness": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
                "accuracy": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
                "coherence": { "type": "number", "minimum": 0.0, "maximum": 1.0 }
              },
              "additionalProperties": false
            },
            "confidence": { "type": "number", "minimum": 0.0, "maximum": 1.0 }
          },
          "additionalProperties": false
        },
        "benchmark": {
          "type": "object",
          "properties": {
            "dataset": { "type": "string" },
            "score": { "type": "number", "minimum": 0.0, "maximum": 1.0 },
            "percentile": { "type": "number", "minimum": 0.0, "maximum": 100.0 },
            "comparison": {
              "type": "object",
              "properties": {
                "baseline": { "type": "number" },
                "improvement": { "type": "number" }
              },
              "additionalProperties": false
            }
          },
          "additionalProperties": false
        },
        "feedback": {
          "type": "object",
          "properties": {
            "strengths": { "type": "array", "items": { "type": "string" } },
            "improvements": { "type": "array", "items": { "type": "string" } },
            "recommendations": {
              "type": "array",
              "items": {
                "type": "object",
                "required": ["action", "priority"],
                "properties": {
                  "action": { "type": "string" },
                  "priority": { "type": "string", "enum": ["low", "medium", "high"] },
                  "rationale": { "type": "string" }
                },
                "additionalProperties": false
              }
            }
          },
          "additionalProperties": false
        }
      },
      "additionalProperties": false
    },
    "metadata": {
      "type": "object",
      "required": ["evaluation_latency_ms", "evaluator_version"],
      "properties": {
        "evaluation_latency_ms": { "type": "integer", "minimum": 0 },
        "evaluator_version": { "type": "string" },
        "cost_breakdown": {
          "type": "object",
          "properties": {
            "evaluation_tokens": { "type": "integer", "minimum": 0 },
            "benchmark_queries": { "type": "integer", "minimum": 0 },
            "total_cost_cents": { "type": "number", "minimum": 0 }
          },
          "additionalProperties": false
        },
        "cache_status": { "type": "string", "enum": ["hit", "miss", "partial"] }
      },
      "additionalProperties": false
    },
    "warnings": {
      "type": "array",
      "items": {
        "type": "object",
        "required": ["code", "message"],
        "properties": {
          "code": { "type": "string", "enum": ["LOW_CONFIDENCE", "PARTIAL_EVALUATION", "TIMEOUT_RISK", "BENCHMARK_UNAVAILABLE"] },
          "message": { "type": "string" },
          "impact": { "type": "string", "enum": ["low", "medium", "high"] }
        },
        "additionalProperties": false
      }
    },
    "error": {
      "type": "object",
      "required": ["code", "message", "retriable"],
      "properties": {
        "code": { "type": "string", "enum": ["INVALID_COMPLETION", "TIMEOUT", "SYSTEM_ERROR", "CAPACITY_EXCEEDED"] },
        "message": { "type": "string" },
        "retriable": { "type": "boolean" },
        "retry_after_ms": { "type": "integer", "minimum": 0 }
      },
      "additionalProperties": false
    }
  },
  "additionalProperties": false
}

Response Examples ​

Quality Score Response:

json
{
  "request_id": "req_eval_001",
  "task_id": "task_550e8400-e29b-41d4-a716-446655440000",
  "results": {
    "quality_score": {
      "overall": 0.87,
      "dimensions": {
        "relevance": 0.92,
        "completeness": 0.84,
        "accuracy": 0.89,
        "coherence": 0.83
      },
      "confidence": 0.91
    },
    "feedback": {
      "strengths": [
        "Comprehensive coverage of JWT authentication concepts",
        "Practical debugging steps with specific code examples",
        "Good balance of theoretical and implementation details"
      ],
      "improvements": [
        "Could include more error handling edge cases",
        "Missing discussion of JWT security best practices"
      ],
      "recommendations": [
        {
          "action": "Add section on JWT expiration handling",
          "priority": "medium",
          "rationale": "Common source of authentication bugs"
        },
        {
          "action": "Include CSRF protection considerations",
          "priority": "high", 
          "rationale": "Critical security aspect often overlooked"
        }
      ]
    }
  },
  "metadata": {
    "evaluation_latency_ms": 1500,
    "evaluator_version": "v0.1.2",
    "cost_breakdown": {
      "evaluation_tokens": 450,
      "benchmark_queries": 0,
      "total_cost_cents": 2.25
    },
    "cache_status": "miss"
  }
}

Benchmark Response:

json
{
  "request_id": "req_eval_002", 
  "task_id": "task_benchmark_001",
  "results": {
    "benchmark": {
      "dataset": "transformer_explanations_v2",
      "score": 0.76,
      "percentile": 82.5,
      "comparison": {
        "baseline": 0.68,
        "improvement": 0.08
      }
    },
    "quality_score": {
      "overall": 0.79,
      "dimensions": {
        "relevance": 0.85,
        "completeness": 0.72,
        "accuracy": 0.81,
        "coherence": 0.78
      },
      "confidence": 0.88
    }
  },
  "metadata": {
    "evaluation_latency_ms": 2100,
    "evaluator_version": "v0.1.2",
    "cost_breakdown": {
      "evaluation_tokens": 320,
      "benchmark_queries": 5,
      "total_cost_cents": 3.80
    },
    "cache_status": "hit"
  }
}

Error Response:

json
{
  "request_id": "req_eval_003",
  "task_id": "task_error_001", 
  "results": {},
  "metadata": {
    "evaluation_latency_ms": 3000,
    "evaluator_version": "v0.1.2"
  },
  "error": {
    "code": "TIMEOUT",
    "message": "Evaluation operation exceeded deadline of 3000ms",
    "retriable": true,
    "retry_after_ms": 1000
  }
}

HTTP API Specification ​

Base URL ​

https://evaluation.mnemoverse.dev/api/v0

Endpoints ​

POST /evaluate ​

Submit task completion for quality evaluation.

Request:

  • Method: POST
  • Path: /evaluate
  • Headers:
    • Content-Type: application/json
    • Authorization: Bearer {api_key}
  • Body: evaluation_request.v0

Response:

  • Status: 200 OK | 400 Bad Request | 500 Internal Server Error | 503 Service Unavailable | 504 Gateway Timeout
  • Headers:
    • Content-Type: application/json
    • X-Request-ID: {request_id}
  • Body: evaluation_response.v0

GET /benchmarks ​

List available benchmark datasets.

Response:

json
{
  "benchmarks": [
    {
      "id": "transformer_explanations_v2",
      "name": "Transformer Architecture Explanations",
      "domain": "research",
      "version": "2.1",
      "size": 1247
    },
    {
      "id": "code_debugging_v1",
      "name": "Code Debugging Tasks", 
      "domain": "code",
      "version": "1.3",
      "size": 892
    }
  ],
  "timestamp": "2025-09-06T10:00:00Z"
}

GET /health ​

Health check endpoint.

Response:

json
{
  "status": "healthy",
  "components": {
    "quality_evaluator": "healthy",
    "benchmark_engine": "healthy",
    "feedback_generator": "degraded",
    "cache": "healthy"
  },
  "timestamp": "2025-09-06T10:00:00Z"
}

Error Handling ​

Standard Error Codes:

  • INVALID_COMPLETION: Malformed completion data or missing required fields
  • TIMEOUT: Evaluation operation exceeded deadline
  • SYSTEM_ERROR: Internal evaluation system failure
  • CAPACITY_EXCEEDED: Evaluation system at capacity, retry later

HTTP Status Mappings:

  • INVALID_COMPLETION → 400 Bad Request
  • TIMEOUT → 504 Gateway Timeout
  • SYSTEM_ERROR → 500 Internal Server Error
  • CAPACITY_EXCEEDED → 503 Service Unavailable

Performance Characteristics ​

SLA Targets ​

  • Availability: 99.9% uptime
  • Latency: P95 < 5 seconds for quality scores, P95 < 15 seconds for benchmarks
  • Throughput: 50 requests/second per instance
  • Accuracy: Quality score correlation > 0.85 with human evaluators

Rate Limiting ​

  • Default: 500 requests/hour per API key
  • Burst: 5 requests/second
  • Headers: X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Reset

Integration Examples ​

Orchestration → Evaluation Quality Check

typescript
// L2 (Orchestration) calling L5 (Evaluation)
/**
 * Thin HTTP client that submits a task completion to the Evaluation layer's
 * `POST /evaluate` endpoint and returns the parsed response body.
 */
class EvaluationClient {
  /**
   * @param baseUrl - API root without a trailing slash; defaults to the
   *   Base URL documented in this contract.
   * @param apiKey - Bearer token sent in the `Authorization` header. The
   *   default is empty only so that zero-argument construction keeps working;
   *   `evaluate` rejects until a real key is supplied.
   */
  constructor(
    private readonly baseUrl: string = 'https://evaluation.mnemoverse.dev/api/v0',
    private readonly apiKey: string = ''
  ) {}

  /**
   * Builds an `evaluation_request.v0` payload from `taskResult` and posts it.
   *
   * @param taskResult - Completed orchestration task to be evaluated.
   * @returns The JSON body of a successful (2xx) response.
   * @throws Error when no API key is configured, or when the service answers
   *   with a non-2xx status. In the latter case the error carries `status`
   *   and, when the body contains a contract `error` object, also `code`,
   *   `retriable` and `retryAfterMs` so callers can decide whether to retry.
   */
  async evaluate(taskResult: TaskCompletion): Promise<EvaluationResults> {
    if (!this.apiKey) {
      // Fail fast instead of sending a meaningless "Bearer " header.
      throw new Error('EvaluationClient: apiKey is required');
    }

    const request: EvaluationRequest = {
      request_id: uuidv4(),
      task_id: taskResult.task_id,
      intent: taskResult.intent,
      completion: {
        fragments: taskResult.fragments,
        total_tokens: taskResult.total_tokens,
        latency_ms: taskResult.latency_ms
      },
      metrics: {
        coverage_entities: taskResult.coverage_entities,
        quality_threshold: 0.8,
        provider_latencies: taskResult.provider_latencies
      },
      deadline_ms: 5000,
      evaluation_type: "all",
      context: {
        user_id: taskResult.context.user_id,
        domain: taskResult.context.domain,
        budget_used: taskResult.budget_used
      }
    };

    const response = await fetch(`${this.baseUrl}/evaluate`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${this.apiKey}`
      },
      body: JSON.stringify(request)
    });

    if (!response.ok) {
      // Error responses are expected to carry an evaluation_response.v0 body
      // whose `error` object explains the failure; keep it if it parses.
      let details: Record<string, unknown> = {};
      try {
        const payload: unknown = await response.json();
        if (typeof payload === 'object' && payload !== null && 'error' in payload) {
          const candidate = (payload as { error?: unknown }).error;
          if (typeof candidate === 'object' && candidate !== null) {
            details = candidate as Record<string, unknown>;
          }
        }
      } catch {
        // Body was empty or not JSON; fall back to the status text alone.
      }

      const code = typeof details.code === 'string' ? details.code : undefined;
      const reason = typeof details.message === 'string' ? details.message : undefined;
      const suffix = code !== undefined ? ` [${code}]${reason !== undefined ? ` ${reason}` : ''}` : '';

      throw Object.assign(new Error(`Evaluation failed: ${response.statusText}${suffix}`), {
        status: response.status,
        code,
        retriable: typeof details.retriable === 'boolean' ? details.retriable : undefined,
        retryAfterMs: typeof details.retry_after_ms === 'number' ? details.retry_after_ms : undefined
      });
    }

    return response.json();
  }
}

Status: Canonical specification ready for implementation

Schema Location: /architecture/contracts/schemas/evaluation_*.v0.json (to be created)