import re import json from typing import List, Dict, Any # ============================================================ # 1️⃣ CLEAN SPECIAL TOKENS # ============================================================ def clean_special_tokens(text: str) -> str: """ Remove model-specific special tokens but preserve structured tags. """ # Remove common HF chat artifacts text = re.sub(r"<\|im_start\|>.*?(\n|$)", "", text) text = re.sub(r"<\|im_end\|>", "", text) text = re.sub(r"<\|assistant\|>", "", text) text = re.sub(r"<\|user\|>", "", text) # Remove stray leading/trailing whitespace return text.strip() # ============================================================ # 2️⃣ PARSE THINKING # ============================================================ def parse_reasoning(text: str): """ Extract blocks. Supports: - Multiple think blocks - Broken/incomplete blocks (stream-safe) """ reasoning_blocks = [] # Complete blocks complete_pattern = r"(.*?)" for match in re.finditer(complete_pattern, text, re.DOTALL): reasoning_blocks.append(match.group(1).strip()) # Handle open block (no closing tag) if "" in text and "" not in text: open_pattern = r"(.*)" match = re.search(open_pattern, text, re.DOTALL) if match: reasoning_blocks.append(match.group(1).strip()) # Remove reasoning from text text = re.sub(complete_pattern, "", text, flags=re.DOTALL) text = re.sub(r".*", "", text, flags=re.DOTALL) return reasoning_blocks, text.strip() # ============================================================ # 3️⃣ PARSE TOOL CALLS # ============================================================ def parse_tool_calls(text: str): """ Extract JSON Handles: - Multiple calls - Missing closing tag - Invalid JSON """ tool_calls = [] # Fix truncated tool_call if "" in text and "" not in text: text += "" pattern = r"(.*?)" for match in re.finditer(pattern, text, re.DOTALL): raw = match.group(1).strip() try: parsed = json.loads(raw) tool_calls.append({ "name": parsed.get("name"), "arguments": parsed.get("arguments", {}), "raw": raw }) except json.JSONDecodeError: tool_calls.append({ "error": "Invalid JSON", "raw": raw }) # Remove tool blocks text = re.sub(pattern, "", text, flags=re.DOTALL) return tool_calls, text.strip() # ============================================================ # 4️⃣ MASTER PARSER # ============================================================ def parse_model_output(text: str) -> Dict[str, Any]: """ Master parser: - Clean tokens - Extract reasoning - Extract tool calls - Return structured output """ original_text = text text = clean_special_tokens(text) reasoning, text = parse_reasoning(text) tool_calls, text = parse_tool_calls(text) final_answer = text.strip() return { "raw_output": original_text, "reasoning": reasoning, "tool_calls": tool_calls, "final_answer": final_answer }