# Source code for evoproc.scorers

# scorers.py
"""
Scorers for procedure evolution.

- StructuralHygieneScorer: scores a *procedure JSON* using validators + structure heuristics.
- TaskEvalScorer: runs a procedure end-to-end and scores with a user-provided eval_fn.
- ProcScorerAdapter: adapts any "proc scorer" (that scores JSON) to GA-style "individual" scoring.

Usage with GA
-------------
from evoproc.scorers import StructuralHygieneScorer, ProcScorerAdapter
from src.validators import validate_procedure_structured

proc_scorer = StructuralHygieneScorer(validate_fn=validate_procedure_structured)
scorer = ProcScorerAdapter(proc_scorer)   # GA expects scorer.score(individual)
"""
from __future__ import annotations

import logging
from math import exp, isnan
from typing import Any, Callable, Dict, List, Optional, Protocol

from evoproc.helpers import _names

# A JSON object (e.g. a procedure document) represented as a plain dict.
JSONDict = Dict[str, Any]
# A validator takes a procedure JSON and returns a list of diagnostic dicts.
# StructuralHygieneScorer reads each diagnostic's "severity" key and counts
# the values "fatal" and "repairable".
Validator = Callable[[JSONDict], List[Dict[str, Any]]]


class HasProc(Protocol):
    """Minimal protocol for GA 'individual'-like objects.

    Any object that exposes a ``proc`` attribute holding the procedure JSON
    satisfies this protocol structurally; no inheritance is required.
    """

    # The procedure JSON carried by the individual.
    proc: JSONDict
class Scorer(Protocol):
    """GA-facing scorer protocol."""

    def score(self, ind: HasProc, **kwargs: Any) -> float:
        """Return a scalar fitness for the individual *ind*.

        Extra keyword arguments are accepted so implementations can take
        optional context; the protocol itself assigns them no meaning.
        """
        ...
class ProcScorer(Protocol):
    """Scores a raw procedure JSON (not an Individual)."""

    def score_proc(self, p: JSONDict) -> float:
        """Return a scalar fitness for the procedure JSON *p*."""
        ...
class ProcScorerAdapter(Scorer):
    """Adapter that lets a procedure-level scorer be used by GA code.

    GA code calls ``scorer.score(individual)``; this adapter forwards the
    individual's ``proc`` attribute to the wrapped scorer's ``score_proc``.
    """

    def __init__(self, proc_scorer: ProcScorer) -> None:
        """Wrap *proc_scorer*, an object providing ``score_proc(p)``."""
        self._proc_scorer = proc_scorer

    def score(self, ind: HasProc, **kwargs: Any) -> float:
        """Return the wrapped scorer's fitness for ``ind.proc``.

        Keyword arguments are accepted for interface compatibility and are
        ignored. The result is coerced to ``float`` so the declared return
        type holds even if the wrapped scorer returns an int.
        """
        return float(self._proc_scorer.score_proc(ind.proc))
class StructuralHygieneScorer:
    """Structural hygiene scorer for global-state procedures (scores *procedure JSON*).

    Components (higher is better; starts at `base`):

    - Validator penalties:
        * fatal diagnostics: -w_fatal each
        * repairable diags:  -w_repair each
    - Redefinition penalty: -w_redefine * (# vars redefined)
    - Unused outputs penalty: -w_unused * (# unused outputs)
    - Soft length cap: -w_len * (2 * sigmoid(0.7 * excess) - 1), where
      excess = max(0, n_steps - target_steps); the penalty is 0 at or below
      the target and approaches w_len as the excess grows
    - Extraction-first reward: +w_extract if step 1 looks like an extraction

    A missing or ``None`` ``"steps"`` entry is treated as an empty procedure,
    and a validator that returns ``None`` is treated as reporting no
    diagnostics.
    """

    # Substrings that mark the first step's description as an extraction step.
    _EXTRACTION_TOKENS = ("extract", "read", "gather", "identify", "parse")

    def __init__(
        self,
        validate_fn: Validator,
        *,
        base: float = 1.0,
        w_fatal: float = 1.0,
        w_repair: float = 0.2,
        w_redefine: float = 0.25,
        w_unused: float = 0.25,
        w_len: float = 0.3,
        target_steps: int = 6,
        w_extract: float = 0.25,
    ) -> None:
        """Store the validator and the weight of each score component.

        Parameters
        ----------
        validate_fn
            Called with the procedure JSON; returns diagnostic dicts whose
            ``"severity"`` key is inspected.
        base
            Starting score before penalties and rewards.
        w_fatal, w_repair
            Penalty per ``"fatal"`` / ``"repairable"`` diagnostic.
        w_redefine
            Penalty per redefined variable.
        w_unused
            Penalty per unused output.
        w_len
            Upper bound of the soft length penalty.
        target_steps
            Number of steps allowed before the length penalty starts.
        w_extract
            Reward added when the first step looks like an extraction.
        """
        self.validate_fn = validate_fn
        self.base = base
        self.w_fatal = w_fatal
        self.w_repair = w_repair
        self.w_redefine = w_redefine
        self.w_unused = w_unused
        self.w_len = w_len
        self.target_steps = target_steps
        self.w_extract = w_extract

    # ---- sub-metrics ---------------------------------------------------------

    @staticmethod
    def _steps(p: JSONDict) -> List[Dict[str, Any]]:
        """Return the procedure's steps, or an empty list if missing/None."""
        return p.get("steps") or []

    def _count_redefinitions(self, p: JSONDict) -> int:
        """Count output names that were already produced earlier.

        ``final_answer`` is exempt. ``_names`` comes from ``evoproc.helpers``
        and presumably yields the variable names of an output list
        (NOTE(review): confirm against the helper).
        """
        seen: set[str] = set()
        redefs = 0
        for step in self._steps(p):
            for name in _names(step.get("output", [])):
                if name == "final_answer":
                    continue
                if name in seen:
                    redefs += 1
                seen.add(name)
        return redefs

    def _count_unused_outputs(self, p: JSONDict) -> int:
        """Count outputs that never appear in any later step's inputs."""
        future_inputs: set[str] = set()
        unused_total = 0
        # Walk backwards so `future_inputs` holds exactly the inputs of the
        # steps that come after the one being inspected.
        for step in reversed(self._steps(p)):
            cur_inputs = set(_names(step.get("inputs", [])))
            for name in _names(step.get("output", [])):
                if name in {"final_answer", "problem_text"}:
                    continue
                if name not in future_inputs:
                    unused_total += 1
            # Only now expose this step's inputs to earlier steps' outputs.
            future_inputs |= cur_inputs
        return unused_total

    def _looks_extraction_first(self, p: JSONDict) -> bool:
        """Return True if the first step's description contains an extraction token."""
        steps = self._steps(p)
        if not steps:
            return False
        description = str(steps[0].get("stepDescription", "")).lower()
        return any(tok in description for tok in self._EXTRACTION_TOKENS)

    # ---- public API ----------------------------------------------------------

    def score_proc(self, p: JSONDict) -> float:
        """Return a scalar fitness for a procedure JSON."""
        score = self.base

        # 1) validator penalties (a None result counts as "no diagnostics")
        diags = self.validate_fn(p) or []
        fatal = sum(1 for d in diags if d.get("severity") == "fatal")
        repair = sum(1 for d in diags if d.get("severity") == "repairable")
        score -= self.w_fatal * fatal
        score -= self.w_repair * repair

        # 2) redefinitions
        score -= self.w_redefine * self._count_redefinitions(p)

        # 3) unused outputs
        score -= self.w_unused * self._count_unused_outputs(p)

        # 4) soft length cap: rescaled sigmoid, 0 at excess == 0, -> w_len
        excess = max(0, len(self._steps(p)) - self.target_steps)
        score -= self.w_len * (1 / (1 + exp(-0.7 * excess)) - 0.5) * 2

        # 5) extraction-first reward
        if self._looks_extraction_first(p):
            score += self.w_extract

        return float(score)
class TaskEvalScorer(Scorer):
    """Execute a procedure (via `run_steps_fn`) and grade it with `eval_fn`.

    Expects GA to call `score(individual)`.

    Parameters
    ----------
    run_steps_fn
        Callable that executes the procedure over the question with the given
        schema/model and returns a final `state` dict (e.g., your `run_steps`).
        Called as ``run_steps_fn(proc, question, final_answer_schema, model)``.
    eval_fn
        Callable `(state, proc) -> float` returning a scalar fitness.
    question
        The task prompt to run.
    final_answer_schema
        JSON schema passed to the last step runner.
    model
        LLM name for execution.
    strict_require_key
        If set, returns -1.0 when the key is missing from `state`.
    """

    # Fitness returned whenever the procedure cannot be run or graded.
    _FAILURE_SCORE = -1.0
    _log = logging.getLogger(__name__)

    def __init__(
        self,
        run_steps_fn: Callable[[JSONDict, str, Dict[str, Any], str], Dict[str, Any]],
        eval_fn: Callable[[Dict[str, Any], Dict[str, Any]], float],
        question: str,
        final_answer_schema: Dict[str, Any],
        model: str,
        strict_require_key: Optional[str] = None,
    ) -> None:
        """Store the runner, the grader and the fixed task configuration."""
        self.run_steps_fn = run_steps_fn
        self.eval_fn = eval_fn
        self.question = question
        self.final_answer_schema = final_answer_schema
        self.model = model
        self.strict_require_key = strict_require_key

    def score(self, ind: HasProc, **kwargs: Any) -> float:
        """Run ``ind.proc`` and return its fitness, or -1.0 on any failure.

        Returns -1.0 when running or grading raises, when
        `strict_require_key` is set but absent from the final state, or when
        `eval_fn` yields NaN (which would break fitness comparisons in the
        GA). Exceptions are logged with their traceback instead of being
        dropped silently, so a broken `eval_fn` or runner is diagnosable.
        Keyword arguments are accepted for interface compatibility and are
        ignored.
        """
        try:
            state = self.run_steps_fn(
                ind.proc,
                self.question,
                self.final_answer_schema,
                self.model,  # type: ignore[arg-type]
            )
            if self.strict_require_key and self.strict_require_key not in state:
                return self._FAILURE_SCORE
            fitness = float(self.eval_fn(state, ind.proc))
        except Exception:
            # Best-effort scoring: a failing individual gets the worst score
            # rather than aborting the whole GA run.
            self._log.warning(
                "TaskEvalScorer: procedure run/eval failed; scoring %s",
                self._FAILURE_SCORE,
                exc_info=True,
            )
            return self._FAILURE_SCORE

        if isnan(fitness):
            self._log.warning(
                "TaskEvalScorer: eval_fn returned NaN; scoring %s",
                self._FAILURE_SCORE,
            )
            return self._FAILURE_SCORE
        return fitness
# Public API of this module: type aliases, protocols, then concrete scorers.
__all__ = [
    "JSONDict",
    "Validator",
    "HasProc",
    "Scorer",
    "ProcScorer",
    "ProcScorerAdapter",
    "StructuralHygieneScorer",
    "TaskEvalScorer",
]