# scorers.py
"""
Scorers for procedure evolution.
- StructuralHygieneScorer: scores a *procedure JSON* using validators + structure heuristics.
- TaskEvalScorer: runs a procedure end-to-end and scores with a user-provided eval_fn.
- ProcScorerAdapter: adapts any "proc scorer" (that scores JSON) to GA-style "individual" scoring.
Usage with GA
-------------
from src.scoring import StructuralHygieneScorer, ProcScorerAdapter
from src.validators import validate_procedure_structured
proc_scorer = StructuralHygieneScorer(validate_fn=validate_procedure_structured)
scorer = ProcScorerAdapter(proc_scorer) # GA expects scorer.score(individual)
"""
from __future__ import annotations

import logging
from math import exp
from typing import Any, Callable, Dict, List, Optional, Protocol

from evoproc.helpers import _names
# A JSON object (e.g. a procedure) represented as a plain dict with string keys.
JSONDict = Dict[str, Any]
# A validator takes a procedure JSON and returns a list of diagnostic dicts;
# StructuralHygieneScorer reads each diagnostic's "severity" key.
Validator = Callable[[JSONDict], List[Dict[str, Any]]]
class HasProc(Protocol):
    """Minimal protocol for GA 'individual'-like objects.

    Any object that exposes a ``proc`` attribute satisfies this protocol
    structurally; the scorers in this module read nothing else from it.
    """

    # The procedure JSON carried by the individual.
    proc: JSONDict
class Scorer(Protocol):
    """GA-facing scorer protocol."""

    def score(self, ind: HasProc, **kwargs: Any) -> float:
        """Return a scalar fitness for the individual ``ind``."""
        ...
class ProcScorer(Protocol):
    """Scores a raw procedure JSON (not an Individual)."""

    def score_proc(self, p: JSONDict) -> float:
        """Return a scalar fitness for the procedure JSON ``p``."""
        ...
class ProcScorerAdapter(Scorer):
    """
    Adapter that lets a procedure-level scorer be used by GA code that calls
    `scorer.score(individual)`.
    """

    def __init__(self, proc_scorer: ProcScorer) -> None:
        """Wrap ``proc_scorer``; its ``score_proc`` is what :meth:`score` calls."""
        self._proc_scorer = proc_scorer

    def score(self, ind: HasProc, **kwargs: Any) -> float:
        """Score ``ind`` by passing ``ind.proc`` to the wrapped scorer.

        Extra keyword arguments are accepted to match the ``Scorer`` protocol
        signature and are ignored.
        """
        return self._proc_scorer.score_proc(ind.proc)
class StructuralHygieneScorer:
    """
    Structural hygiene scorer for global-state procedures (scores *procedure JSON*).

    Components (higher is better; starts at `base`):

    - Validator penalties:
        * fatal diagnostics: -w_fatal each
        * repairable diags:  -w_repair each
    - Redefinition penalty: -w_redefine * (# vars redefined)
    - Unused outputs penalty: -w_unused * (# unused outputs)
    - Soft length cap: -w_len * (2 * sigmoid(0.7 * excess) - 1), where
      excess = max(0, n_steps - target_steps). The penalty is 0 while the
      procedure has at most `target_steps` steps and approaches w_len as it grows.
    - Extraction-first reward: +w_extract if step 1 looks like an extraction

    A missing or null ``"steps"`` entry is treated as an empty procedure.
    """

    # Steepness of the logistic curve used by the soft length cap.
    LEN_STEEPNESS = 0.7
    # Lower-case substrings that mark a step description as "extraction-like".
    EXTRACTION_TOKENS = ("extract", "read", "gather", "identify", "parse")
    # Output names never counted as unused.
    UNUSED_EXEMPT = frozenset({"final_answer", "problem_text"})

    def __init__(
        self,
        validate_fn: Validator,
        *,
        base: float = 1.0,
        w_fatal: float = 1.0,
        w_repair: float = 0.2,
        w_redefine: float = 0.25,
        w_unused: float = 0.25,
        w_len: float = 0.3,
        target_steps: int = 6,
        w_extract: float = 0.25,
    ) -> None:
        """Store the validator and the weight of each score component.

        Arguments
        ---------
        validate_fn
            Callable returning a list of diagnostic dicts for a procedure.
        base
            Starting score before penalties and rewards.
        w_fatal, w_repair
            Penalty per diagnostic with severity "fatal" / "repairable".
        w_redefine
            Penalty per redefined output variable.
        w_unused
            Penalty per output that no later step takes as input.
        w_len
            Upper bound of the soft length penalty.
        target_steps
            Number of steps allowed before the length penalty starts.
        w_extract
            Reward when the first step looks like an extraction.
        """
        self.validate_fn = validate_fn
        self.base = base
        self.w_fatal = w_fatal
        self.w_repair = w_repair
        self.w_redefine = w_redefine
        self.w_unused = w_unused
        self.w_len = w_len
        self.target_steps = target_steps
        self.w_extract = w_extract

    # ---- sub-metrics ---------------------------------------------------------
    @staticmethod
    def _steps(p: JSONDict) -> List[Dict[str, Any]]:
        """Return the step list of ``p``; a missing or null "steps" yields []."""
        return p.get("steps") or []

    def _count_redefinitions(self, p: JSONDict) -> int:
        """Count output names that were already produced earlier in the procedure.

        "final_answer" is exempt and never counted.
        """
        seen = set()
        redefs = 0
        for step in self._steps(p):
            for name in _names(step.get("output", [])):
                if name == "final_answer":
                    continue
                if name in seen:
                    redefs += 1
                seen.add(name)
        return redefs

    def _count_unused_outputs(self, p: JSONDict) -> int:
        """Count outputs that never appear in any later step's inputs."""
        future_inputs: set[str] = set()
        unused_total = 0
        # Walk backwards so `future_inputs` holds exactly the inputs of the
        # steps that come after the one being examined.
        for step in reversed(self._steps(p)):
            cur_inputs = set(_names(step.get("inputs", [])))
            for name in _names(step.get("output", [])):
                if name in self.UNUSED_EXEMPT:
                    continue
                if name not in future_inputs:
                    unused_total += 1
            # Only now expose this step's inputs, so earlier outputs see them
            # as "future needs" while this step's own outputs did not.
            future_inputs |= cur_inputs
        return unused_total

    def _looks_extraction_first(self, p: JSONDict) -> bool:
        """Return True if the first step's description contains an extraction token."""
        steps = self._steps(p)
        if not steps:
            return False
        description = str(steps[0].get("stepDescription", "")).lower()
        return any(tok in description for tok in self.EXTRACTION_TOKENS)

    # ---- public API ----------------------------------------------------------
    def score_proc(self, p: JSONDict) -> float:
        """Return a scalar fitness for a procedure JSON."""
        score = self.base

        # 1) validator penalties
        diags = self.validate_fn(p)
        fatal = sum(1 for d in diags if d.get("severity") == "fatal")
        repair = sum(1 for d in diags if d.get("severity") == "repairable")
        score -= self.w_fatal * fatal
        score -= self.w_repair * repair

        # 2) redefinitions
        score -= self.w_redefine * self._count_redefinitions(p)

        # 3) unused outputs
        score -= self.w_unused * self._count_unused_outputs(p)

        # 4) soft length cap: logistic rescaled from [0.5, 1) to [0, 1)
        excess = max(0, len(self._steps(p)) - self.target_steps)
        score -= self.w_len * (1 / (1 + exp(-self.LEN_STEEPNESS * excess)) - 0.5) * 2  # ~[0, w_len]

        # 5) extraction-first reward
        if self._looks_extraction_first(p):
            score += self.w_extract

        return float(score)
class TaskEvalScorer(Scorer):
    """
    Execute a procedure (via `run_steps_fn`) and grade with a user-provided
    `eval_fn(state, proc) -> float`. Expects GA to call `score(individual)`.

    Arguments
    ---------
    run_steps_fn
        Callable that executes the procedure over the question with the given schema/model
        and returns a final `state` dict (e.g., your `run_steps`).
    eval_fn
        Callable `(state, proc) -> float` returning a scalar fitness.
    question
        The task prompt to run.
    final_answer_schema
        JSON schema passed to the last step runner.
    model
        LLM name for execution.
    strict_require_key
        If set, returns -1.0 when the key is missing from `state`.
    """

    # Fitness returned when the required key is missing or when running or
    # evaluating the procedure raises.
    FAILURE_SCORE = -1.0

    def __init__(
        self,
        run_steps_fn: Callable[[JSONDict, str, Dict[str, Any], str], Dict[str, Any]],
        eval_fn: Callable[[Dict[str, Any], Dict[str, Any]], float],
        question: str,
        final_answer_schema: Dict[str, Any],
        model: str,
        strict_require_key: Optional[str] = None,
    ) -> None:
        """Store the runner, the evaluator and the fixed task configuration."""
        self.run_steps_fn = run_steps_fn
        self.eval_fn = eval_fn
        self.question = question
        self.final_answer_schema = final_answer_schema
        self.model = model
        self.strict_require_key = strict_require_key

    def score(self, ind: HasProc, **kwargs: Any) -> float:
        """Run ``ind.proc`` and return its evaluated fitness.

        Returns ``float(eval_fn(state, ind.proc))`` on success. Returns -1.0
        when `strict_require_key` is set but absent from the final state, or
        when running or evaluating raises any ``Exception``; in the latter
        case the traceback is logged at DEBUG level so failures stay
        diagnosable. Extra keyword arguments are ignored.
        """
        try:
            state = self.run_steps_fn(
                ind.proc, self.question, self.final_answer_schema, self.model,  # type: ignore[arg-type]
            )
            if self.strict_require_key and self.strict_require_key not in state:
                return self.FAILURE_SCORE
            return float(self.eval_fn(state, ind.proc))
        except Exception:
            # Deliberate best-effort: a failing individual must not abort the
            # caller, so it receives the failure score. Log instead of
            # discarding the cause.
            logging.getLogger(__name__).debug(
                "TaskEvalScorer: run/eval failed; scoring individual as %s",
                self.FAILURE_SCORE,
                exc_info=True,
            )
            return self.FAILURE_SCORE
__all__ = [
"JSONDict",
"Validator",
"HasProc",
"Scorer",
"ProcScorer",
"ProcScorerAdapter",
"StructuralHygieneScorer",
"TaskEvalScorer",
]