Research Example: Generating evoproc_procedures for the GSM8K Dataset Using a Genetic Algorithm and Ollama Queries

Step 1: Import packages and necessary application functions & variables

import re
import math
from evoproc.ga_scaffold_structured import ProcedureGA, GAConfig
from evoproc.validators import validate_procedure_structured
from evoproc_procedures.models import Procedure
from evoproc_procedures.schemas import get_schema
from evoproc_procedures.prompts import create_procedure_prompt
from evoproc_procedures.ollama import query, repair_fn_ollama
from evoproc_procedures.runners import run_steps_stateful_minimal

Step 2: Import the GSM8K Dataset

from datasets import load_dataset

# Load the "main" configuration of GSM8K from the Hugging Face Hub,
# one split at a time (train for development, test for held-out evaluation).
train_dataset = load_dataset(path="openai/gsm8k", name="main", split="train")
test_dataset = load_dataset(path="openai/gsm8k", name="main", split="test")

Step 3: Set variable constants and instantiate necessary functions

# Schema for the final answer, looked up under the "gsm" key. It is passed as
# `final_answer_schema` to both `ga.run(...)` and `run_steps_fn(...)` below.
# NOTE(review): presumably this defines the final_answer / final_answer_numerical
# fields seen in the run state -- confirm against evoproc_procedures.schemas.
FINAL_SCHEMA = get_schema("gsm")

def run_steps_fn(proc_json, question, final_answer_schema, model, print_bool=False):
    """Execute a procedure against one question and return the resulting state.

    Thin adapter around ``run_steps_stateful_minimal`` that plugs in the
    Ollama ``query`` function as the backend.

    Args:
        proc_json: The procedure to execute (passed through unchanged).
        question: Problem text handed to the runner as ``problem_text``.
        final_answer_schema: Schema handed to the runner as ``answer_schema``.
        model: Model identifier forwarded to the runner.
        print_bool: Forwarded to the runner's ``print_bool`` flag.

    Returns:
        Whatever ``run_steps_stateful_minimal`` returns (the run state).
    """
    runner_kwargs = {
        "problem_text": question,
        "answer_schema": final_answer_schema,
        "model": model,
        "query_fn": query,  # backend-agnostic runner; we supply the Ollama query fn
        "print_bool": print_bool,
    }
    return run_steps_stateful_minimal(proc_json, **runner_kwargs)

def _extract_gold_number(gold_answer: str) -> float | None:
    # GSM8K gold answers are strings; often last number is the target
    nums = re.findall(r"-?\d+(?:\.\d+)?", gold_answer)
    return float(nums[-1]) if nums else None

def eval_fn(state, proc_json) -> float:
    """Return a fitness score in [0,1]."""
    # prefer model-extracted numeric if present, else try to parse its text
    pred_num = state.get("answer_numerical")
    if pred_num is None:
        try:
            pred_num = float(re.findall(r"-?\d+(?:\.\d+)?", state.get("answer",""))[-1])
        except Exception:
            return 0.0
    gold_num = state.get("_gold_num")  # we’ll inject this per item
    if gold_num is None:
        return 0.0
    # exact match or close within small tolerance
    return 1.0 if math.isclose(pred_num, gold_num, rel_tol=0, abs_tol=1e-6) else 0.0

Step 3b: Instantiate the Procedure Genetic Algorithm Object

# Wire the genetic algorithm to the Ollama backend and the structured
# procedure helpers imported above.
ga = ProcedureGA(
    model="gemma3:latest",
    # Backend call used for every LLM query.
    query_fn=query,
    # Builds the procedure-creation prompt for a given task description.
    create_proc_fn=lambda task: create_procedure_prompt(task),
    # JSON schema of the Procedure model.
    schema_json_fn=lambda: Procedure.model_json_schema(),
    # Pure validation function for candidate procedures.
    validate_fn=validate_procedure_structured,
    # Per the original author's note, the GA expects (proc, model) -> proc.
    repair_fn=repair_fn_ollama,
    # Deliberately tiny search budget with a fixed seed.
    cfg=GAConfig(
        population_size=3,
        max_generations=3,
        crossover_rate=0.7,
        mutation_rate=0.3,
        seed=42,
    ),
)

Step 4: For each Question-Answer pair, run the GA with the question as task_description

def run_gsm8k_batch(examples):
    """Evolve, execute and score one procedure per GSM8K example.

    For every example the GA is run with the question as the task
    description, the best procedure is executed once more to obtain a final
    state, and that state is scored against the gold number.

    Args:
        examples: Iterable of GSM8K-style mappings such as
            ``{"id": ..., "question": "...", "answer": "..."}``.

    Returns:
        A list with one dict per example containing the question, gold
        answer/number, GA fitness, best procedure, final state, predicted
        answer fields, a ``correct`` flag and the number of steps.
    """
    records = []
    for example in examples:
        example_id = example.get("id")
        problem = example["question"]
        gold_answer_text = example["answer"]
        gold_value = _extract_gold_number(gold_answer_text)

        # Two ways to drive the GA -- pick ONE.
        #
        # Option 1 (task-eval scoring): pass eval_fn and run_steps_fn so each
        # generation is scored by actually running the procedure.
        # winner, _history = ga.run(
        #     task_description=problem,
        #     final_answer_schema=FINAL_SCHEMA,
        #     eval_fn=lambda state, proc: eval_fn({**state, "_gold_num": gold_value}, proc),
        #     run_steps_fn=run_steps_fn,
        #     print_progress=False,
        # )
        #
        # Option 2 (hygiene scoring, active): omit the task-eval arguments so
        # the GA falls back to hygiene scoring each generation.
        winner, _history = ga.run(
            task_description=problem,
            final_answer_schema=FINAL_SCHEMA,
            eval_fn=None,
            print_progress=False,
        )

        # Execute the winning procedure one more time to capture its final state.
        end_state = run_steps_fn(
            winner.proc, problem, FINAL_SCHEMA, ga.model, print_bool=False
        )
        score = eval_fn({**end_state, "_gold_num": gold_value}, winner.proc)

        record = {
            "id": example_id,
            "question": problem,
            "gold_answer": gold_answer_text,
            "gold_num": gold_value,
            "fitness": winner.fitness,
            "procedure": winner.proc,  # JSON dict
            "state": end_state,  # full state returned by the runner
            "pred_answer": end_state.get("final_answer"),
            "pred_num": end_state.get("final_answer_numerical"),
            "correct": bool(score >= 1.0),
            "steps": len(winner.proc.get("steps", [])),
        }
        records.append(record)
    return records

# For testing purposes, just grab the first 2 training examples, as this will
# take a long time to run.
first_two = train_dataset.select(range(2))

# The returned list of per-example result dicts is the cell's value (shown below).
run_gsm8k_batch(first_two)

[{'id': None,
  'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
  'gold_answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72',
  'gold_num': 72.0,
  'fitness': 1.25,
  'procedure': {'NameDescription': 'Calculate the total clips sold in April and May.',
   'steps': [{'id': 1,
     'stepDescription': 'Extract the number of clips sold in April from the problem text.',
     'inputs': [{'name': 'problem_text',
       'description': 'The original problem text.'}],
     'output': [{'name': 'clips_sold_in_april',
       'description': 'The number of clips sold in April.'}]},
    {'id': 2,
     'stepDescription': 'Calculate the number of clips sold in May. This is half the number sold in April.',
     'inputs': [{'name': 'clips_sold_in_april',
       'description': 'The number of clips sold in April.'}],
     'output': [{'name': 'clips_sold_in_may',
       'description': 'The number of clips sold in May.'}]},
    {'id': 3,
     'stepDescription': 'Calculate the total number of clips sold in April and May.',
     'inputs': [{'name': 'clips_sold_in_april',
       'description': 'The number of clips sold in April.'},
      {'name': 'clips_sold_in_may',
       'description': 'The number of clips sold in May.'}],
     'output': [{'name': 'final_answer',
       'description': 'The total number of clips sold in April and May.'}]}]},
  'state': {'problem_text': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
   'clips_sold_in_april': 48,
   'clips_sold_in_may': 24,
   'final_answer': 'The total number of clips sold in April and May is 72.',
   'final_answer_numerical': 72,
   'confidence': 1,
   'units': 'clips'},
  'pred_answer': 'The total number of clips sold in April and May is 72.',
  'pred_num': 72,
  'correct': False,
  'steps': 3},
 {'id': None,
  'question': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',
  'gold_answer': 'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10',
  'gold_num': 10.0,
  'fitness': 1.25,
  'procedure': {'NameDescription': "Calculate Weng's earnings from babysitting.",
   'steps': [{'id': 1,
     'stepDescription': 'Extract the problem text from the input.',
     'inputs': [{'name': 'problem_text',
       'description': 'The problem description.'}],
     'output': [{'name': 'problem_text',
       'description': 'The problem description.'}]},
    {'id': 2,
     'stepDescription': 'Identify the hourly rate.',
     'inputs': [{'name': 'problem_text',
       'description': 'The problem description.'}],
     'output': [{'name': 'hourly_rate',
       'description': "Weng's hourly babysitting rate."}]},
    {'id': 3,
     'stepDescription': 'Calculate the babysitting time in hours.',
     'inputs': [{'name': 'problem_text',
       'description': 'The problem description.'},
      {'name': 'hourly_rate',
       'description': "Weng's hourly babysitting rate."}],
     'output': [{'name': 'babysitting_time_in_hours',
       'description': "Weng's babysitting time in hours."}]},
    {'id': 4,
     'stepDescription': "Calculate Weng's earnings.",
     'inputs': [{'name': 'hourly_rate',
       'description': "Weng's hourly babysitting rate."},
      {'name': 'babysitting_time_in_hours',
       'description': "Weng's babysitting time in hours."}],
     'output': [{'name': 'earnings',
       'description': "Weng's total earnings."}]},
    {'id': 5,
     'stepDescription': 'Output the final answer.',
     'inputs': [{'name': 'earnings', 'description': "Weng's total earnings."}],
     'output': [{'name': 'final_answer',
       'description': "Weng's earnings from babysitting."}]}]},
  'state': {'problem_text': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',
   'hourly_rate': 12,
   'babysitting_time_in_hours': 0.8333333333333334,
   'earnings': 10,
   'final_answer': 'The final answer is 10.',
   'final_answer_numerical': 10,
   'confidence': 1.0,
   'units': 'None'},
  'pred_answer': 'The final answer is 10.',
  'pred_num': 10,
  'correct': False,
  'steps': 5}]