Upload folder using huggingface_hub
Browse files- prompt_builder.py +110 -0
- render_utils.py +64 -0
- visual_hint.py +112 -0
prompt_builder.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
# System prompt for the next-UI-state prediction model. It tells the model how
# to read the red-circle / red-arrow overlays on the input screenshot, fixes
# the HTML skeleton (a 1080x2400 `#render-target` root), and restricts the
# output to raw HTML only.
# NOTE(review): the text was recovered from a diff-view dump with its leading
# whitespace stripped; nested bullets (e.g. under "Red Arrow") may originally
# have been indented -- confirm against the upstream file.
SYSTEM_PROMPT = """You are an expert **UI State Transition Simulator** and **Frontend Developer**.
Your task is to predict the **NEXT UI STATE** based on a screenshot of the current state and a user interaction.

### 1. IMAGE INTERPRETATION RULES
The input image contains visual cues denoting the user's action. You must interpret them as follows:
* **Red Circle**: Indicates a **Click** or **Long Press** target at that location.
* **Red Arrow**: Indicates a **Scroll** or **Swipe**.
* The arrow points in the direction of finger movement.
* *Example*: An arrow pointing UP means the finger slides up, pushing content up (Scrolling Down).
* **Note**: These cues exist ONLY to show the action. **DO NOT render these red circles or arrows in your output HTML.**

### 2. CRITICAL STRUCTURAL RULES (MUST FOLLOW)
* **Format**: Output ONLY raw HTML. Start with `<!DOCTYPE html>` and end with `</html>`.
* **Root Element**: All visible content MUST be wrapped in:
`<div id="render-target"> ... </div>`
* **Container Style**: `#render-target` must have:
`width: 1080px; height: 2400px; position: relative; overflow: hidden;`
(Apply background colors and shadows here, NOT on the body).
* **Body Style**: The `<body>` tag must have `margin: 0; padding: 0; background: transparent;`.
* **Layout**: Do NOT center the body. Let `#render-target` sit at (0,0).

### 3. CONTENT GENERATION LOGIC
* **Transition**: Analyze the action. If the user clicks a button, show the *result* (e.g., a menu opens, a checkbox checks, page navigates).
* **Images**: Use semantic text placeholders. DO NOT use real URLs.
* Format: `<div style="...">[IMG: description]</div>`
* **Icons**: Use simple inline SVG paths or Unicode.

### 4. OUTPUT REQUIREMENT
* Do NOT generate Markdown blocks (```html).
* Do NOT provide explanations or conversational text.
* Output the code directly.
"""
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# Per-sample user prompt. It is filled with ``str.format`` using the fields
# ``instruction_str``, ``semantic_desc`` and ``action_json``; the leading
# ``<image>`` token is emitted verbatim.
# NOTE(review): leading whitespace was stripped by the diff-view dump this text
# was recovered from; the two "*" bullets may originally have been indented
# under item 2 -- confirm against the upstream file.
USER_PROMPT_TEMPLATE = """<image>
### INPUT CONTEXT
1. **User Intent**: "{instruction_str}"
2. **Interaction Details**:
* **Description**: {semantic_desc}
* **Action Data**: {action_json}

### COMMAND
Based on the visual cues in the image and the interaction data above, generate the **HTML for the RESULTING UI STATE** (what the screen looks like *after* this action).
"""
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def get_action_semantic_description(action):
    """Return a short English description of ``action`` for the user prompt.

    Args:
        action: Mapping describing the interaction. ``action_type`` selects
            the wording; depending on the type, ``x``/``y``, ``direction``,
            ``text`` or ``app_name`` are read with ``.get``.

    Returns:
        A one- or two-sentence description. Unrecognised (or missing) action
        types fall through to the generic ``"Perform action: <type>."``.
    """
    action_type = action.get("action_type")

    if action_type == "click":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a CLICK at coordinates ({x}, {y}). "
            f"Expect the button/element at this location to trigger."
        )

    if action_type == "long_press":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a LONG PRESS at coordinates ({x}, {y}). "
            f"Expect a context menu or selection state."
        )

    if action_type in ("scroll", "swipe"):
        # A key that is present but None/empty would otherwise crash on
        # ``.upper()``; treat it like a missing key and default to "down".
        direction = str(action.get("direction") or "down")
        return (
            f"User SCROLLED {direction.upper()}. "
            f"The content should move, revealing new items from the {direction} direction."
        )

    if action_type == "input_text":
        text = action.get("text", "")
        return (
            f"User is TYPING the text: '{text}'. "
            f"The focused input field MUST now contain this text."
        )

    if action_type == "open_app":
        app_name = action.get("app_name", "app")
        return (
            f"System Context Switch: The user opened the app '{app_name}'. "
            f"Show the home screen of this app."
        )

    if action_type == "navigate_back":
        return "System Navigation: The user pressed BACK. Return to the previous screen."

    if action_type == "navigate_home":
        return "System Navigation: The user pressed HOME. Show the Desktop."

    if action_type == "wait":
        return "Action: WAIT. Keep the UI mostly unchanged unless loading completes."

    return f"Perform action: {action_type}."
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def build_user_prompt(instruction_str, action, semantic_desc=None):
    """Fill ``USER_PROMPT_TEMPLATE`` for one interaction.

    Args:
        instruction_str: The user's intent, inserted verbatim between double
            quotes in the template.
        action: Mapping describing the interaction. It is serialised with
            ``json.dumps`` and, when ``semantic_desc`` is not supplied, also
            passed to ``get_action_semantic_description``.
        semantic_desc: Optional pre-computed description of the action. When
            ``None`` it is derived from ``action``.

    Returns:
        The formatted user prompt string.

    Raises:
        TypeError: Propagated from ``json.dumps`` if ``action`` contains
            values that are not JSON-serialisable.
    """
    if semantic_desc is None:
        semantic_desc = get_action_semantic_description(action)

    # ensure_ascii=False keeps non-ASCII text (e.g. typed input) readable in
    # the prompt instead of emitting \uXXXX escapes.
    action_json = json.dumps(action, ensure_ascii=False)

    return USER_PROMPT_TEMPLATE.format(
        instruction_str=instruction_str,
        semantic_desc=semantic_desc,
        action_json=action_json,
    )
|
render_utils.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
from io import BytesIO
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from playwright.sync_api import sync_playwright
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def extract_clean_html(text):
    """Clean raw model output and extract the HTML document where possible.

    Markdown code fences (the ``html``-tagged opening fence and any bare
    triple-backtick fence) are removed first. If a ``<!DOCTYPE html>`` marker
    is present (case-insensitive), the result starts at that marker and ends
    at the first ``</html>`` that follows it.

    Args:
        text: Raw text produced by the model.

    Returns:
        The extracted document. If no DOCTYPE marker is found, the
        fence-stripped text with surrounding whitespace removed. If a DOCTYPE
        is found but no closing ``</html>`` follows it (e.g. truncated
        output), everything from the DOCTYPE onward, stripped.
    """
    text = text.replace("```html", "").replace("```", "")

    start_match = re.search(r"<!DOCTYPE html>", text, re.IGNORECASE)
    if start_match is None:
        return text.strip()

    start_idx = start_match.start()

    # Look for the closing tag only *after* the DOCTYPE: a stray "</html>"
    # in leading chatter must not prevent extraction of the real document.
    end_match = re.compile(r"</html>", re.IGNORECASE).search(text, start_idx)
    if end_match is None:
        # No closing tag: still drop any chatter before the document.
        return text[start_idx:].strip()

    return text[start_idx:end_match.end()]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def render_html_to_image(html, width=1080, height=2400):
    """Render an HTML string with Playwright and return it as a ``PIL.Image``.

    A headless Chromium instance is launched, the HTML is loaded into a page
    whose viewport is ``width`` x ``height``, and a viewport-sized (not
    full-page) screenshot is taken.

    Args:
        html: HTML source to render.
        width: Viewport width in pixels.
        height: Viewport height in pixels.

    Returns:
        The screenshot converted to an RGB ``PIL.Image``.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page(viewport={"width": width, "height": height})
            page.set_content(html, wait_until="domcontentloaded")
            screenshot_bytes = page.screenshot(full_page=False)
        finally:
            # Close the browser even if loading or capturing fails, so a bad
            # document does not leave the browser open.
            browser.close()

    return Image.open(BytesIO(screenshot_bytes)).convert("RGB")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def save_demo_outputs(output_dir, hinted_image, html, rendered_image=None):
    """Save the demo artefacts into ``output_dir`` and print where they went.

    Files written:
        - ``input_with_hint.png``: ``hinted_image``
        - ``predicted_next_ui.html``: ``html`` (UTF-8)
        - ``predicted_next_ui.png``: ``rendered_image`` (only when given)

    Args:
        output_dir: Destination directory; created if it does not exist.
        hinted_image: Object with a ``save(path)`` method (presumably a
            ``PIL.Image`` -- the code only relies on ``save``).
        html: HTML text to write.
        rendered_image: Optional object with a ``save(path)`` method. When
            ``None`` the PNG of the prediction is not written.
    """
    os.makedirs(output_dir, exist_ok=True)

    hinted_path = os.path.join(output_dir, "input_with_hint.png")
    html_path = os.path.join(output_dir, "predicted_next_ui.html")
    rendered_path = os.path.join(output_dir, "predicted_next_ui.png")

    hinted_image.save(hinted_path)

    with open(html_path, "w", encoding="utf-8") as f:
        f.write(html)

    has_render = rendered_image is not None
    if has_render:
        rendered_image.save(rendered_path)

    print(f"Saved hinted image to: {hinted_path}")
    print(f"Saved HTML to: {html_path}")
    if has_render:
        print(f"Saved rendered image to: {rendered_path}")
|
visual_hint.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
from PIL import Image, ImageDraw
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def draw_arrow_refined(draw, start, end, color=(255, 0, 0, 255), width=15, arrow_len=50):
    """Draw an arrow from ``start`` to ``end``: a line shaft plus a filled head.

    The head is a triangle whose tip is ``end`` and whose two other corners
    lie ``arrow_len`` away from the tip, 30 degrees either side of the arrow
    axis. The shaft stops ``0.8 * arrow_len`` short of the tip so that its
    end is hidden under the head.

    Args:
        draw: Drawing object providing ``line(points, fill=, width=)`` and
            ``polygon(points, fill=)`` (e.g. a ``PIL.ImageDraw`` instance).
        start: ``(x, y)`` tail of the arrow.
        end: ``(x, y)`` tip of the arrow.
        color: Fill colour passed to both drawing calls.
        width: Shaft width passed to ``draw.line``.
        arrow_len: Length of the head's sides.

    Returns:
        None. Nothing is drawn when ``start`` and ``end`` (almost) coincide.
    """
    x1, y1 = start
    x2, y2 = end
    dx, dy = x2 - x1, y2 - y1

    length = math.hypot(dx, dy)
    if length < 1e-5:
        # Direction is undefined for a zero-length arrow.
        return

    angle = math.atan2(dy, dx)
    half_spread = math.pi / 6

    wing_a = (
        x2 - arrow_len * math.cos(angle - half_spread),
        y2 - arrow_len * math.sin(angle - half_spread),
    )
    wing_b = (
        x2 - arrow_len * math.cos(angle + half_spread),
        y2 - arrow_len * math.sin(angle + half_spread),
    )

    back_off = arrow_len * 0.8
    if length > back_off:
        shaft_end = (
            x2 - (back_off / length) * dx,
            y2 - (back_off / length) * dy,
        )
        draw.line([start, shaft_end], fill=color, width=width)
    # else: the arrow is shorter than the head's back-off. Backing off would
    # place the shaft end *behind* ``start`` and draw a line pointing the
    # wrong way, so only the head is drawn.

    draw.polygon([(x2, y2), wing_a, wing_b], fill=color)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def build_visual_hint(image, action):
    """Overlay a visual hint for ``action`` on ``image`` and return the result.

    - ``click`` / ``long_press`` / ``input_text`` / ``open_app``: a
      semi-transparent red circle at (``x``, ``y``), drawn only when both
      coordinates are present.
    - ``scroll`` / ``swipe``: a red arrow from (``x1``, ``y1``) to
      (``x2``, ``y2``). If any of the four coordinates is missing, a
      300-pixel arrow centred on the image is drawn instead, chosen from
      ``direction``.
    - Any other action type: the image is returned without an overlay.

    Supported action formats, for example:
        1) click:
            {
                "action_type": "click",
                "x": 540,
                "y": 1470
            }

        2) scroll:
            {
                "action_type": "scroll",
                "direction": "down",
                "x1": 540,
                "y1": 1600,
                "x2": 540,
                "y2": 900
            }

    Args:
        image: Source ``PIL.Image``; it is converted to RGBA for compositing.
        action: Mapping describing the interaction (see formats above).

    Returns:
        A new RGB ``PIL.Image`` with the hint composited on top.
    """
    image = image.convert("RGBA")
    # Draw on a transparent layer so the circle's alpha blends with the
    # screenshot when the two are composited.
    overlay = Image.new("RGBA", image.size, (255, 255, 255, 0))
    draw = ImageDraw.Draw(overlay)

    width, height = image.size
    action_type = action.get("action_type", "")

    fill_color = (255, 0, 0, 100)
    outline_color = (255, 0, 0, 255)

    if action_type in ("scroll", "swipe"):
        x1 = action.get("x1")
        y1 = action.get("y1")
        x2 = action.get("x2")
        y2 = action.get("y2")

        if None not in [x1, y1, x2, y2]:
            start_point = (int(x1), int(y1))
            end_point = (int(x2), int(y2))
        else:
            # ``direction`` is only needed for the fallback arrow; a missing,
            # None or empty value is treated as "down".
            direction = str(action.get("direction") or "down").lower()

            cx, cy = width // 2, height // 2
            half = 300 // 2

            # The arrow runs opposite to the named direction: for "down" it
            # goes from below the centre to above it (towards smaller y).
            fallback_arrows = {
                "down": ((cx, cy + half), (cx, cy - half)),
                "up": ((cx, cy - half), (cx, cy + half)),
                "right": ((cx + half, cy), (cx - half, cy)),
                "left": ((cx - half, cy), (cx + half, cy)),
            }
            # Unknown directions use the same arrow as "down".
            start_point, end_point = fallback_arrows.get(
                direction, fallback_arrows["down"]
            )

        draw_arrow_refined(
            draw,
            start_point,
            end_point,
            color=outline_color,
            width=15,
            arrow_len=50,
        )

    elif action_type in ("click", "long_press", "input_text", "open_app"):
        x = action.get("x")
        y = action.get("y")

        if x is not None and y is not None:
            x = int(x)
            y = int(y)
            # Text input gets a slightly smaller marker than taps.
            radius = 30 if action_type == "input_text" else 40

            draw.ellipse(
                (x - radius, y - radius, x + radius, y + radius),
                fill=fill_color,
                outline=outline_color,
                width=5,
            )

    return Image.alpha_composite(image, overlay).convert("RGB")
|