{ "cells": [ { "cell_type": "markdown", "id": "e350c961", "metadata": {}, "source": [ "# Research Example: Generating evoproc_procedures for the GSM8K Dataset Using a Genetic Algorithm and OLLaMa Queries" ] }, { "cell_type": "markdown", "id": "59ad3473", "metadata": {}, "source": [ "## Step 1: Import packages and necessary application functions & variables" ] }, { "cell_type": "code", "execution_count": null, "id": "08cc3cf2", "metadata": {}, "outputs": [], "source": [ "import re\n", "import math\n", "from evoproc.ga_scaffold_structured import ProcedureGA, GAConfig\n", "from evoproc.validators import validate_procedure_structured\n", "from evoproc_procedures.models import Procedure\n", "from evoproc_procedures.schemas import get_schema\n", "from evoproc_procedures.prompts import create_procedure_prompt\n", "from evoproc_procedures.ollama import query, repair_fn_ollama\n", "from evoproc_procedures.runners import run_steps_stateful_minimal\n" ] }, { "cell_type": "markdown", "id": "7afda795", "metadata": {}, "source": [ "## Step 2: Import the GSM8K Dataset" ] }, { "cell_type": "code", "execution_count": 4, "id": "03411603", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": 5, "id": "661e6381", "metadata": {}, "outputs": [], "source": [ "train_dataset = load_dataset(\"openai/gsm8k\", \"main\", split=\"train\")\n", "test_dataset = load_dataset(\"openai/gsm8k\", \"main\", split=\"test\")" ] }, { "cell_type": "markdown", "id": "bf261a71", "metadata": {}, "source": [ "## Step 3: Set variable constants and instantiate necessary functions" ] }, { "cell_type": "code", "execution_count": 6, "id": "c8d6ce95", "metadata": {}, "outputs": [], "source": [ "FINAL_SCHEMA = get_schema(\"gsm\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "2beac706", "metadata": {}, "outputs": [], "source": [ "def run_steps_fn(proc_json, question, final_answer_schema, model, print_bool=False):\n", " # use your general runner (backend-agnostic; pass Ollama query fn)\n", " state = run_steps_stateful_minimal(\n", " proc_json,\n", " problem_text=question,\n", " answer_schema=final_answer_schema,\n", " model=model,\n", " query_fn=query,\n", " print_bool=print_bool,\n", " )\n", " return state\n", "\n", "def _extract_gold_number(gold_answer: str) -> float | None:\n", " # GSM8K gold answers are strings; often last number is the target\n", " nums = re.findall(r\"-?\\d+(?:\\.\\d+)?\", gold_answer)\n", " return float(nums[-1]) if nums else None\n", "\n", "def eval_fn(state, proc_json) -> float:\n", " \"\"\"Return a fitness score in [0,1].\"\"\"\n", " # prefer model-extracted numeric if present, else try to parse its text\n", " pred_num = state.get(\"answer_numerical\")\n", " if pred_num is None:\n", " try:\n", " pred_num = float(re.findall(r\"-?\\d+(?:\\.\\d+)?\", state.get(\"answer\",\"\"))[-1])\n", " except Exception:\n", " return 0.0\n", " gold_num = state.get(\"_gold_num\") # we’ll inject this per item\n", " if gold_num is None:\n", " return 0.0\n", " # exact match or close within small tolerance\n", " return 1.0 if math.isclose(pred_num, gold_num, rel_tol=0, abs_tol=1e-6) else 0.0" ] }, { "cell_type": "markdown", "id": "b31c78fa", "metadata": {}, "source": [ "## Step 3: Instantiate the Procedure Genetic Algorithm Object" ] }, { "cell_type": "code", "execution_count": 8, "id": "4ee1f4bb", "metadata": {}, "outputs": [], "source": [ "ga = ProcedureGA(\n", " model=\"gemma3:latest\",\n", " create_proc_fn=lambda task: create_procedure_prompt(task),\n", " query_fn=query, # backend call\n", " schema_json_fn=lambda: Procedure.model_json_schema(),\n", " validate_fn=validate_procedure_structured, # pure function\n", " repair_fn=repair_fn_ollama, # GA expects (proc, model) -> proc\n", " cfg=GAConfig(population_size=3, max_generations=3, crossover_rate=0.7, mutation_rate=0.3, seed=42),\n", ")" ] }, { "cell_type": "markdown", "id": "77e71602", "metadata": {}, "source": [ "## Step 4: For each Question-Answer pair, run the GA with the question as task_description" ] }, { "cell_type": "code", "execution_count": 9, "id": "ddafe810", "metadata": {}, "outputs": [], "source": [ "def run_gsm8k_batch(examples):\n", " \"\"\"\n", " examples: iterable of dicts like {\"id\": ..., \"question\": \"...\", \"answer\": \"...\"} (GSM8K format)\n", " Returns: list of per-item result dicts with procedure, state, and score\n", " \"\"\"\n", " results = []\n", " for ex in examples:\n", " qid = ex.get(\"id\")\n", " question = ex[\"question\"]\n", " gold_text = ex[\"answer\"]\n", " gold_num = _extract_gold_number(gold_text)\n", "\n", " # CHOOSE ONE OF THE FOLLOWING:\n", " # 1. Task-eval path: supply all three args so GA uses TaskEval scoring each generation\n", " # best, history = ga.run(\n", " # task_description=question,\n", " # final_answer_schema=FINAL_SCHEMA,\n", " # eval_fn=lambda state, proc: eval_fn({**state, \"_gold_num\": gold_num}, proc),\n", " # run_steps_fn=run_steps_fn,\n", " # print_progress=False,\n", " # )\n", "\n", " # 2. NO Task-eval path: don't supply all three args so GA uses Hygiene scoring each generation\n", " best, history = ga.run(\n", " task_description=question,\n", " final_answer_schema=FINAL_SCHEMA,\n", " eval_fn=None,\n", " print_progress=False,\n", " )\n", "\n", " # After GA finishes, run once more to collect the final state/answer\n", " final_state = run_steps_fn(best.proc, question, FINAL_SCHEMA, ga.model, print_bool=False)\n", "\n", " results.append({\n", " \"id\": qid,\n", " \"question\": question,\n", " \"gold_answer\": gold_text,\n", " \"gold_num\": gold_num,\n", " \"fitness\": best.fitness,\n", " \"procedure\": best.proc, # JSON dict\n", " \"state\": final_state, # includes \"answer\" and \"answer_numerical\"\n", " \"pred_answer\": final_state.get(\"final_answer\"),\n", " \"pred_num\": final_state.get(\"final_answer_numerical\"),\n", " \"correct\": bool(eval_fn({**final_state, \"_gold_num\": gold_num}, best.proc) >= 1.0),\n", " \"steps\": len(best.proc.get(\"steps\", [])),\n", " })\n", " return results" ] }, { "cell_type": "code", "execution_count": 10, "id": "581ef386", "metadata": {}, "outputs": [], "source": [ "# For testing purposes, just grab first 10 as this will take a long time to run\n", "first_two = train_dataset.select(range(2))" ] }, { "cell_type": "code", "execution_count": 11, "id": "8a2263dc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'id': None,\n", " 'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',\n", " 'gold_answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\\n#### 72',\n", " 'gold_num': 72.0,\n", " 'fitness': 1.25,\n", " 'procedure': {'NameDescription': 'Calculate the total clips sold in April and May.',\n", " 'steps': [{'id': 1,\n", " 'stepDescription': 'Extract the number of clips sold in April from the problem text.',\n", " 'inputs': [{'name': 'problem_text',\n", " 'description': 'The original problem text.'}],\n", " 'output': [{'name': 'clips_sold_in_april',\n", " 'description': 'The number of clips sold in April.'}]},\n", " {'id': 2,\n", " 'stepDescription': 'Calculate the number of clips sold in May. This is half the number sold in April.',\n", " 'inputs': [{'name': 'clips_sold_in_april',\n", " 'description': 'The number of clips sold in April.'}],\n", " 'output': [{'name': 'clips_sold_in_may',\n", " 'description': 'The number of clips sold in May.'}]},\n", " {'id': 3,\n", " 'stepDescription': 'Calculate the total number of clips sold in April and May.',\n", " 'inputs': [{'name': 'clips_sold_in_april',\n", " 'description': 'The number of clips sold in April.'},\n", " {'name': 'clips_sold_in_may',\n", " 'description': 'The number of clips sold in May.'}],\n", " 'output': [{'name': 'final_answer',\n", " 'description': 'The total number of clips sold in April and May.'}]}]},\n", " 'state': {'problem_text': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',\n", " 'clips_sold_in_april': 48,\n", " 'clips_sold_in_may': 24,\n", " 'final_answer': 'The total number of clips sold in April and May is 72.',\n", " 'final_answer_numerical': 72,\n", " 'confidence': 1,\n", " 'units': 'clips'},\n", " 'pred_answer': 'The total number of clips sold in April and May is 72.',\n", " 'pred_num': 72,\n", " 'correct': False,\n", " 'steps': 3},\n", " {'id': None,\n", " 'question': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',\n", " 'gold_answer': 'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\\n#### 10',\n", " 'gold_num': 10.0,\n", " 'fitness': 1.25,\n", " 'procedure': {'NameDescription': \"Calculate Weng's earnings from babysitting.\",\n", " 'steps': [{'id': 1,\n", " 'stepDescription': 'Extract the problem text from the input.',\n", " 'inputs': [{'name': 'problem_text',\n", " 'description': 'The problem description.'}],\n", " 'output': [{'name': 'problem_text',\n", " 'description': 'The problem description.'}]},\n", " {'id': 2,\n", " 'stepDescription': 'Identify the hourly rate.',\n", " 'inputs': [{'name': 'problem_text',\n", " 'description': 'The problem description.'}],\n", " 'output': [{'name': 'hourly_rate',\n", " 'description': \"Weng's hourly babysitting rate.\"}]},\n", " {'id': 3,\n", " 'stepDescription': 'Calculate the babysitting time in hours.',\n", " 'inputs': [{'name': 'problem_text',\n", " 'description': 'The problem description.'},\n", " {'name': 'hourly_rate',\n", " 'description': \"Weng's hourly babysitting rate.\"}],\n", " 'output': [{'name': 'babysitting_time_in_hours',\n", " 'description': \"Weng's babysitting time in hours.\"}]},\n", " {'id': 4,\n", " 'stepDescription': \"Calculate Weng's earnings.\",\n", " 'inputs': [{'name': 'hourly_rate',\n", " 'description': \"Weng's hourly babysitting rate.\"},\n", " {'name': 'babysitting_time_in_hours',\n", " 'description': \"Weng's babysitting time in hours.\"}],\n", " 'output': [{'name': 'earnings',\n", " 'description': \"Weng's total earnings.\"}]},\n", " {'id': 5,\n", " 'stepDescription': 'Output the final answer.',\n", " 'inputs': [{'name': 'earnings', 'description': \"Weng's total earnings.\"}],\n", " 'output': [{'name': 'final_answer',\n", " 'description': \"Weng's earnings from babysitting.\"}]}]},\n", " 'state': {'problem_text': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',\n", " 'hourly_rate': 12,\n", " 'babysitting_time_in_hours': 0.8333333333333334,\n", " 'earnings': 10,\n", " 'final_answer': 'The final answer is 10.',\n", " 'final_answer_numerical': 10,\n", " 'confidence': 1.0,\n", " 'units': 'None'},\n", " 'pred_answer': 'The final answer is 10.',\n", " 'pred_num': 10,\n", " 'correct': False,\n", " 'steps': 5}]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "run_gsm8k_batch(first_two)" ] }, { "cell_type": "code", "execution_count": null, "id": "09f7c84d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv (3.13.5)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.5" } }, "nbformat": 4, "nbformat_minor": 5 }