Source code for martian_apart_hack_sdk.judge_specs

"""Specifications for judges."""

import dataclasses
from typing import Any, Dict, Literal, Optional


@dataclasses.dataclass(frozen=True)
class RubricJudgeSpec:
    """Immutable configuration for a judge that scores submissions with a rubric.

    A rubric judge pairs a free-text rubric with a language model. The model
    is prompted with the rubric and asked to produce a numerical score that
    lies between ``min_score`` and ``max_score``.

    Args:
        model_type (Literal["rubric_judge"]): Kind of judge; must be
            ``"rubric_judge"``.
        rubric (str): Text of the evaluation criteria the judge applies to
            each submission.
        model (str): Identifier of the language model that performs the
            evaluation.
        min_score (float): Lowest score the judge may assign.
        max_score (float): Highest score the judge may assign.
        prescript (Optional[str]): Optional text placed in the judging prompt
            *before* the rubric (instructions or context).
        postscript (Optional[str]): Optional text placed in the judging
            prompt *after* the rubric (instructions or processing steps).
        extract_variables (Optional[Dict[str, Any]]): Optional configuration
            for extracting variables from the evaluation.
        extract_judgement (Optional[Dict[str, Any]]): Optional configuration
            for extracting the final judgment details.

    Notes:
        When no ``prescript`` is given, the default is::

            You are a helpful assistant that scores responses between ${min_score} and ${max_score} according to the following rubric:

        ``${min_score}`` and ``${max_score}`` are substituted with the
        ``min_score`` and ``max_score`` args.

        When no ``postscript`` is given, the default is::

            Here's the conversation you are judging:
            <content>
            ${content}
            </content>

            Please evaluate the assistant's response in the conversation above according to the rubric.
            Think step-by-step to produce a score, and please provide a rationale for your score.
            Your score should be between ${min_score} and ${max_score}.

            Your response MUST include:
            1. A <rationale>...</rationale> tag containing your explanation
            2. A <score>...</score> tag containing your numerical score

        ``${content}`` is substituted with the content of the conversation
        being judged; ``${min_score}`` and ``${max_score}`` are substituted
        with the ``min_score`` and ``max_score`` args.

        The assembled judging prompt has this shape::

            {filled_prescript}

            <rubric>
            {filled_rubric}
            </rubric>

            {filled_postscript}

    Warning:
        A custom prescript or postscript must still contain the
        ``${min_score}``, ``${max_score}``, and ``${content}`` tags, and must
        tell the judge to emit the <rationale> and <score> tags in its
        response. Overriding the defaults is not recommended.

    Examples:
        >>> rubric = '''
        ... You are tasked with evaluating whether a restaurant recommendation is good.
        ... The scoring is as follows:
        ... - 1: If the recommendation doesn't meet any of the criteria.
        ... - 2: If the recommendation meets only some small part of the criteria.
        ... - 3: If the recommendation is reasonable, but not perfect.
        ... - 4: If the recommendation is almost perfect.
        ... - 5: If the recommendation is perfect.
        ... '''
        >>> rubric_judge_spec = RubricJudgeSpec(
        ...     model_type="rubric_judge",
        ...     rubric=rubric,
        ...     model="openai/openai/gpt-4o",
        ...     min_score=1,
        ...     max_score=5,
        ... )
    """

    # Required settings.
    model_type: Literal["rubric_judge"]
    rubric: str
    model: str
    min_score: float
    max_score: float

    # Optional settings; a value of None is left out of the API payload.
    prescript: Optional[str] = None
    postscript: Optional[str] = None
    extract_variables: Optional[Dict[str, Any]] = None
    extract_judgement: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this specification into a dictionary for API requests.

        Attributes whose value is ``None`` are dropped so that only
        explicitly configured settings are sent.

        Returns:
            Dict[str, Any]: Every attribute of this specification that is not
            ``None``, keyed by attribute name.
        """
        payload: Dict[str, Any] = {}
        for name, value in dataclasses.asdict(self).items():
            # Only top-level None values are skipped; nested values are kept.
            if value is None:
                continue
            payload[name] = value
        return payload
# ``JudgeSpec`` is kept as an alias of ``RubricJudgeSpec`` for backward
# compatibility and future extensibility: if other types of judges are added
# later, this name is meant to become a Union type over all of them.
JudgeSpec = RubricJudgeSpec