# Source code for fermilink.optimize.goal

"""Parse goal.md into a structured optimization goal specification.

Goal mode lets users describe optimization intent in natural-language
markdown instead of authoring a benchmark YAML contract and runner script
by hand.  This module parses the structured markdown into a dict that
the source-analysis / benchmark-generation pipeline can consume.
"""

from __future__ import annotations

import re
from typing import Any


# ---------------------------------------------------------------------------
# Goal detection
# ---------------------------------------------------------------------------

# Lower-cased heading fragments whose presence suggests a goal document.
# Several entries are aliases of one another, and some are substrings of
# longer entries (``## correctness`` is contained in
# ``## correctness constraints``); ``is_goal_markdown`` accounts for that.
_GOAL_SECTION_MARKERS = [
    "## package",
    "## target",
    "## editable scope",
    "## performance metric",
    "## correctness constraints",
    "## correctness",
    "## representative workloads",
    "## workloads",
    "# optimization goal",
]


def is_goal_markdown(text: str) -> bool:
    """Return *True* when *text* looks like a goal-structured markdown file.

    Detection is based on the presence of at least two known goal-section
    headings.  This intentionally stays loose so that users can omit
    optional sections while still triggering goal mode: matching is
    case-insensitive and substring based, so deeper heading levels such as
    ``### Package`` are recognised as well.

    Because some markers are substrings of longer markers, a marker hit is
    ignored when a longer marker containing it matched on the same line.
    Without this, a lone ``## Correctness Constraints`` heading would be
    counted as two headings and trigger goal mode by itself.
    """

    found: set[str] = set()
    # Markers never contain line breaks, so scanning line by line finds the
    # same occurrences as scanning the whole text would.
    for line in text.lower().splitlines():
        hits = [marker for marker in _GOAL_SECTION_MARKERS if marker in line]
        found.update(
            marker
            for marker in hits
            # Keep only the most specific marker(s) matched on this line.
            if not any(marker != other and marker in other for other in hits)
        )
    return len(found) >= 2



# ---------------------------------------------------------------------------
# Section extraction helpers
# ---------------------------------------------------------------------------

_HEADING_RE = re.compile(r"^(#{1,3})\s+(.+)$")


def _extract_sections(text: str) -> dict[str, str]:
    """Split markdown into ``{lowercased heading: body text}`` pairs."""

    sections: dict[str, str] = {}
    current_heading = ""
    current_lines: list[str] = []

    for line in text.splitlines():
        match = _HEADING_RE.match(line)
        if match:
            if current_heading:
                sections[current_heading] = "\n".join(current_lines).strip()
            current_heading = match.group(2).strip().lower()
            current_lines = []
        else:
            current_lines.append(line)

    if current_heading:
        sections[current_heading] = "\n".join(current_lines).strip()

    return sections


def _text_section(sections: dict[str, str], *keys: str) -> str:
    """Return the body of the first matching section, or empty string."""

    for key in keys:
        text = sections.get(key, "").strip()
        if text:
            return text
    return ""


def _first_line(sections: dict[str, str], key: str) -> str:
    """Return the stripped first line of section *key*, or ``""`` if blank."""

    body = _text_section(sections, key)
    if not body:
        return ""
    head, *_ = body.splitlines()
    return head.strip()


def _list_section(sections: dict[str, str], *keys: str) -> list[str]:
    """Extract a bullet-list section, trying *keys* in order."""

    for key in keys:
        text = sections.get(key, "").strip()
        if not text:
            continue
        items: list[str] = []
        for line in text.splitlines():
            stripped = line.strip()
            for prefix in ("- ", "* ", "+ "):
                if stripped.startswith(prefix):
                    stripped = stripped[len(prefix) :]
                    break
            stripped = stripped.strip()
            if stripped:
                items.append(stripped)
        if items:
            return items
    return []


def _code_blocks(sections: dict[str, str], key: str) -> list[str]:
    """Extract fenced code blocks from a section body."""

    text = sections.get(key, "")
    blocks: list[str] = []
    current: list[str] | None = None
    for line in text.splitlines():
        stripped = line.strip()
        if stripped.startswith("```"):
            if current is not None:
                blocks.append("\n".join(current))
                current = None
            else:
                current = []
        elif current is not None:
            current.append(line)
    return blocks


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------



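# [docs] (appears to be a documentation-link marker left over from the HTML rendering; not part of the module)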
def parse_goal(text: str) -> dict[str, Any]:
    """Turn a goal markdown document into a structured specification.

    The returned dict always contains the keys below.  Values are strings
    or lists of strings, and a missing section yields an empty value:

    * ``raw_text`` – the markdown exactly as passed in
    * ``package`` – first line of the ``package`` section
    * ``target`` – body of the ``target`` section
    * ``editable_scope`` – list entries from ``editable scope``, ``scope``
      or ``editable paths``
    * ``performance_metric`` – body of ``performance metric``, ``metric``
      or ``objective``
    * ``correctness_constraints`` – list entries from
      ``correctness constraints``, ``correctness`` or ``constraints``
    * ``workloads`` – list entries from ``representative workloads``,
      ``workloads``, ``cases`` or ``test cases``
    * ``language`` – first line of the ``language`` section
    * ``notes`` – body of the ``notes`` section
    * ``build_commands`` – fenced code blocks from the first of ``build``,
      ``setup`` or ``install`` that contains any

    Where several section names are listed, they are tried in that order
    and the first one with content is used.
    """

    sections = _extract_sections(text)

    spec: dict[str, Any] = {"raw_text": text}
    spec["package"] = _first_line(sections, "package")
    spec["target"] = _text_section(sections, "target")
    spec["editable_scope"] = _list_section(
        sections, "editable scope", "scope", "editable paths"
    )
    spec["performance_metric"] = _text_section(
        sections, "performance metric", "metric", "objective"
    )
    spec["correctness_constraints"] = _list_section(
        sections, "correctness constraints", "correctness", "constraints"
    )
    spec["workloads"] = _list_section(
        sections, "representative workloads", "workloads", "cases", "test cases"
    )
    spec["language"] = _first_line(sections, "language")
    spec["notes"] = _text_section(sections, "notes")

    # Use the first build-related section that actually holds code blocks.
    build_commands: list[str] = []
    for heading in ("build", "setup", "install"):
        build_commands = _code_blocks(sections, heading)
        if build_commands:
            break
    spec["build_commands"] = build_commands

    return spec