Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

prompt_builder.py +110 -0
render_utils.py +64 -0
visual_hint.py +112 -0

prompt_builder.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import json
+# System prompt sent to the model: explains how to read the red circle/arrow
+# hints drawn on the screenshot and fixes the required shape of the HTML output
+# (a 1080x2400 `#render-target` root, no Markdown fences, no explanations).
+SYSTEM_PROMPT = """You are an expert **UI State Transition Simulator** and **Frontend Developer**.
+Your task is to predict the **NEXT UI STATE** based on a screenshot of the current state and a user interaction.
+### 1. IMAGE INTERPRETATION RULES
+The input image contains visual cues denoting the user's action. You must interpret them as follows:
+*   **Red Circle**: Indicates a **Click** or **Long Press** target at that location.
+*   **Red Arrow**: Indicates a **Scroll** or **Swipe**.
+    *   The arrow points in the direction of finger movement.
+    *   *Example*: An arrow pointing UP means the finger slides up, pushing content up (Scrolling Down).
+*   **Note**: These cues exist ONLY to show the action. **DO NOT render these red circles or arrows in your output HTML.**
+### 2. CRITICAL STRUCTURAL RULES (MUST FOLLOW)
+*   **Format**: Output ONLY raw HTML. Start with `<!DOCTYPE html>` and end with `</html>`.
+*   **Root Element**: All visible content MUST be wrapped in:
+    `<div id="render-target"> ... </div>`
+*   **Container Style**: `#render-target` must have:
+    `width: 1080px; height: 2400px; position: relative; overflow: hidden;`
+    (Apply background colors and shadows here, NOT on the body).
+*   **Body Style**: The `<body>` tag must have `margin: 0; padding: 0; background: transparent;`.
+*   **Layout**: Do NOT center the body. Let `#render-target` sit at (0,0).
+### 3. CONTENT GENERATION LOGIC
+*   **Transition**: Analyze the action. If the user clicks a button, show the *result* (e.g., a menu opens, a checkbox checks, page navigates).
+*   **Images**: Use semantic text placeholders. DO NOT use real URLs.
+    *   Format: `<div style="...">[IMG: description]</div>`
+*   **Icons**: Use simple inline SVG paths or Unicode.
+### 4. OUTPUT REQUIREMENT
+*   Do NOT generate Markdown blocks (```html).
+*   Do NOT provide explanations or conversational text.
+*   Output the code directly.
+"""
+# Per-request user prompt. The three `{...}` fields are filled via str.format:
+# `instruction_str` (user intent), `semantic_desc` (natural-language description
+# of the action) and `action_json` (the raw action serialized as JSON).
+USER_PROMPT_TEMPLATE = """<image>
+### INPUT CONTEXT
+1.  **User Intent**: "{instruction_str}"
+2.  **Interaction Details**:
+    *   **Description**: {semantic_desc}
+    *   **Action Data**: {action_json}
+### COMMAND
+Based on the visual cues in the image and the interaction data above, generate the **HTML for the RESULTING UI STATE** (what the screen looks like *after* this action).
+"""
+def get_action_semantic_description(action):
+    action_type = action.get("action_type")
+    if action_type == "click":
+        x, y = action.get("x"), action.get("y")
+        return (
+            f"User performed a CLICK at coordinates ({x}, {y}). "
+            f"Expect the button/element at this location to trigger."
+        )
+    if action_type == "long_press":
+        x, y = action.get("x"), action.get("y")
+        return (
+            f"User performed a LONG PRESS at coordinates ({x}, {y}). "
+            f"Expect a context menu or selection state."
+        )
+    if action_type in ["scroll", "swipe"]:
+        direction = action.get("direction", "down")
+        return (
+            f"User SCROLLED {direction.upper()}. "
+            f"The content should move, revealing new items from the {direction} direction."
+        )
+    if action_type == "input_text":
+        text = action.get("text", "")
+        return (
+            f"User is TYPING the text: '{text}'. "
+            f"The focused input field MUST now contain this text."
+        )
+    if action_type == "open_app":
+        app_name = action.get("app_name", "app")
+        return (
+            f"System Context Switch: The user opened the app '{app_name}'. "
+            f"Show the home screen of this app."
+        )
+    if action_type == "navigate_back":
+        return "System Navigation: The user pressed BACK. Return to the previous screen."
+    if action_type == "navigate_home":
+        return "System Navigation: The user pressed HOME. Show the Desktop."
+    if action_type == "wait":
+        return "Action: WAIT. Keep the UI mostly unchanged unless loading completes."
+    return f"Perform action: {action_type}."
+def build_user_prompt(instruction_str, action, semantic_desc=None):
+    if semantic_desc is None:
+        semantic_desc = get_action_semantic_description(action)
+    action_json = json.dumps(action, ensure_ascii=False)
+    return USER_PROMPT_TEMPLATE.format(
+        instruction_str=instruction_str,
+        semantic_desc=semantic_desc,
+        action_json=action_json,
+    )

render_utils.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import os
+import re
+from io import BytesIO
+from PIL import Image
+from playwright.sync_api import sync_playwright
+def extract_clean_html(text):
+    """
+    清理模型输出，尽量提取完整 HTML。
+    """
+    text = text.replace("```html", "").replace("```", "")
+    start_match = re.search(r"<!DOCTYPE html>", text, re.IGNORECASE)
+    end_match = re.search(r"</html>", text, re.IGNORECASE)
+    if start_match and end_match:
+        start_idx = start_match.start()
+        end_idx = end_match.end()
+        if end_idx > start_idx:
+            return text[start_idx:end_idx]
+    return text.strip()
+def render_html_to_image(html, width=1080, height=2400):
+    """
+    用 Playwright 把 HTML 渲染成 PIL.Image。
+    """
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=True)
+        page = browser.new_page(viewport={"width": width, "height": height})
+        page.set_content(html, wait_until="domcontentloaded")
+        screenshot_bytes = page.screenshot(full_page=False)
+        browser.close()
+    return Image.open(BytesIO(screenshot_bytes)).convert("RGB")
+def save_demo_outputs(output_dir, hinted_image, html, rendered_image=None):
+    """
+    保存 demo 输出：
+    - input_with_hint.png
+    - predicted_next_ui.html
+    - predicted_next_ui.png
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    hinted_path = os.path.join(output_dir, "input_with_hint.png")
+    html_path = os.path.join(output_dir, "predicted_next_ui.html")
+    rendered_path = os.path.join(output_dir, "predicted_next_ui.png")
+    hinted_image.save(hinted_path)
+    with open(html_path, "w", encoding="utf-8") as f:
+        f.write(html)
+    if rendered_image is not None:
+        rendered_image.save(rendered_path)
+    print(f"Saved hinted image to: {hinted_path}")
+    print(f"Saved HTML to: {html_path}")
+    if rendered_image is not None:
+        print(f"Saved rendered image to: {rendered_path}")

visual_hint.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import math
+from PIL import Image, ImageDraw
+def draw_arrow_refined(draw, start, end, color=(255, 0, 0, 255), width=15, arrow_len=50):
+    x1, y1 = start
+    x2, y2 = end
+    length = math.hypot(x2 - x1, y2 - y1)
+    if length < 1e-5:
+        return
+    angle = math.atan2(y2 - y1, x2 - x1)
+    p1_x = x2 - arrow_len * math.cos(angle - math.pi / 6)
+    p1_y = y2 - arrow_len * math.sin(angle - math.pi / 6)
+    p2_x = x2 - arrow_len * math.cos(angle + math.pi / 6)
+    p2_y = y2 - arrow_len * math.sin(angle + math.pi / 6)
+    back_off = arrow_len * 0.8
+    line_end_x = x2 - (back_off / length) * (x2 - x1)
+    line_end_y = y2 - (back_off / length) * (y2 - y1)
+    draw.line([start, (line_end_x, line_end_y)], fill=color, width=width)
+    draw.polygon([(x2, y2), (p1_x, p1_y), (p2_x, p2_y)], fill=color)
+def build_visual_hint(image, action):
+    """
+    根据 action 在图像上叠加 visual hint:
+    - click / long_press / input_text: 红圈
+    - scroll / swipe: 红箭头
+    支持的 action 格式示例：
+    1) click:
+        {
+            "action_type": "click",
+            "x": 540,
+            "y": 1470
+        }
+    2) scroll:
+        {
+            "action_type": "scroll",
+            "direction": "down",
+            "x1": 540,
+            "y1": 1600,
+            "x2": 540,
+            "y2": 900
+        }
+    """
+    image = image.convert("RGBA")
+    overlay = Image.new("RGBA", image.size, (255, 255, 255, 0))
+    draw = ImageDraw.Draw(overlay)
+    width, height = image.size
+    action_type = action.get("action_type", "")
+    fill_color = (255, 0, 0, 100)
+    outline_color = (255, 0, 0, 255)
+    if action_type in ["scroll", "swipe"]:
+        x1 = action.get("x1")
+        y1 = action.get("y1")
+        x2 = action.get("x2")
+        y2 = action.get("y2")
+        direction = action.get("direction", "down").lower()
+        if None not in [x1, y1, x2, y2]:
+            start_point = (int(x1), int(y1))
+            end_point = (int(x2), int(y2))
+        else:
+            cx, cy = width // 2, height // 2
+            arrow_len = 300
+            if direction == "down":
+                start_point, end_point = (cx, cy + arrow_len // 2), (cx, cy - arrow_len // 2)
+            elif direction == "up":
+                start_point, end_point = (cx, cy - arrow_len // 2), (cx, cy + arrow_len // 2)
+            elif direction == "right":
+                start_point, end_point = (cx + arrow_len // 2, cy), (cx - arrow_len // 2, cy)
+            elif direction == "left":
+                start_point, end_point = (cx - arrow_len // 2, cy), (cx + arrow_len // 2, cy)
+            else:
+                start_point, end_point = (cx, cy + arrow_len // 2), (cx, cy - arrow_len // 2)
+        draw_arrow_refined(
+            draw,
+            start_point,
+            end_point,
+            color=outline_color,
+            width=15,
+            arrow_len=50,
+        )
+    elif action_type in ["click", "long_press", "input_text", "open_app"]:
+        x = action.get("x")
+        y = action.get("y")
+        if x is not None and y is not None:
+            x = int(x)
+            y = int(y)
+            radius = 30 if action_type == "input_text" else 40
+            draw.ellipse(
+                (x - radius, y - radius, x + radius, y + radius),
+                fill=fill_color,
+                outline=outline_color,
+                width=5,
+            )
+    return Image.alpha_composite(image, overlay).convert("RGB")