yhzheng1031 committed on
Commit
4da1734
·
verified ·
1 Parent(s): 0983922

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. prompt_builder.py +110 -0
  2. render_utils.py +64 -0
  3. visual_hint.py +112 -0
prompt_builder.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+
4
# System prompt for the next-UI-state predictor. It tells the model how to
# read the red circle / red arrow cues drawn on the input screenshot and pins
# the output contract: raw HTML only, with all visible content inside a
# 1080x2400 `#render-target` container and no Markdown fences or explanations.
SYSTEM_PROMPT = """You are an expert **UI State Transition Simulator** and **Frontend Developer**.
Your task is to predict the **NEXT UI STATE** based on a screenshot of the current state and a user interaction.

### 1. IMAGE INTERPRETATION RULES
The input image contains visual cues denoting the user's action. You must interpret them as follows:
* **Red Circle**: Indicates a **Click** or **Long Press** target at that location.
* **Red Arrow**: Indicates a **Scroll** or **Swipe**.
* The arrow points in the direction of finger movement.
* *Example*: An arrow pointing UP means the finger slides up, pushing content up (Scrolling Down).
* **Note**: These cues exist ONLY to show the action. **DO NOT render these red circles or arrows in your output HTML.**

### 2. CRITICAL STRUCTURAL RULES (MUST FOLLOW)
* **Format**: Output ONLY raw HTML. Start with `<!DOCTYPE html>` and end with `</html>`.
* **Root Element**: All visible content MUST be wrapped in:
`<div id="render-target"> ... </div>`
* **Container Style**: `#render-target` must have:
`width: 1080px; height: 2400px; position: relative; overflow: hidden;`
(Apply background colors and shadows here, NOT on the body).
* **Body Style**: The `<body>` tag must have `margin: 0; padding: 0; background: transparent;`.
* **Layout**: Do NOT center the body. Let `#render-target` sit at (0,0).

### 3. CONTENT GENERATION LOGIC
* **Transition**: Analyze the action. If the user clicks a button, show the *result* (e.g., a menu opens, a checkbox checks, page navigates).
* **Images**: Use semantic text placeholders. DO NOT use real URLs.
* Format: `<div style="...">[IMG: description]</div>`
* **Icons**: Use simple inline SVG paths or Unicode.

### 4. OUTPUT REQUIREMENT
* Do NOT generate Markdown blocks (```html).
* Do NOT provide explanations or conversational text.
* Output the code directly.
"""
36
+
37
+
38
# Per-request user prompt, filled with `str.format` using the fields
# `instruction_str`, `semantic_desc` and `action_json`.
# NOTE(review): the leading `<image>` is presumably the placeholder where the
# hinted screenshot is injected by the inference code - confirm with the caller.
USER_PROMPT_TEMPLATE = """<image>
### INPUT CONTEXT
1. **User Intent**: "{instruction_str}"
2. **Interaction Details**:
* **Description**: {semantic_desc}
* **Action Data**: {action_json}

### COMMAND
Based on the visual cues in the image and the interaction data above, generate the **HTML for the RESULTING UI STATE** (what the screen looks like *after* this action).
"""
48
+
49
+
50
def get_action_semantic_description(action):
    """Return a short English description of a UI action for the prompt.

    Args:
        action: Mapping describing the interaction. ``action_type`` selects
            the wording; depending on the type, ``x``/``y``, ``direction``,
            ``text`` or ``app_name`` are read as well.

    Returns:
        A one- or two-sentence description. Unknown action types fall back to
        ``"Perform action: <action_type>."``.
    """
    action_type = action.get("action_type")

    if action_type == "click":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a CLICK at coordinates ({x}, {y}). "
            f"Expect the button/element at this location to trigger."
        )

    if action_type == "long_press":
        x, y = action.get("x"), action.get("y")
        return (
            f"User performed a LONG PRESS at coordinates ({x}, {y}). "
            f"Expect a context menu or selection state."
        )

    if action_type in ("scroll", "swipe"):
        # ``dict.get`` only applies its default when the key is absent, so an
        # explicit null/empty direction must be normalised here; otherwise
        # ``.upper()`` would raise AttributeError.
        direction = action.get("direction")
        if not isinstance(direction, str) or not direction:
            direction = "down"
        return (
            f"User SCROLLED {direction.upper()}. "
            f"The content should move, revealing new items from the {direction} direction."
        )

    if action_type == "input_text":
        text = action.get("text")
        if text is None:
            # Avoid telling the model the user typed the literal 'None'.
            text = ""
        return (
            f"User is TYPING the text: '{text}'. "
            f"The focused input field MUST now contain this text."
        )

    if action_type == "open_app":
        app_name = action.get("app_name")
        if app_name is None:
            app_name = "app"
        return (
            f"System Context Switch: The user opened the app '{app_name}'. "
            f"Show the home screen of this app."
        )

    if action_type == "navigate_back":
        return "System Navigation: The user pressed BACK. Return to the previous screen."

    if action_type == "navigate_home":
        return "System Navigation: The user pressed HOME. Show the Desktop."

    if action_type == "wait":
        return "Action: WAIT. Keep the UI mostly unchanged unless loading completes."

    return f"Perform action: {action_type}."
98
+
99
+
100
def build_user_prompt(instruction_str, action, semantic_desc=None):
    """Fill ``USER_PROMPT_TEMPLATE`` for one interaction.

    Args:
        instruction_str: The user's intent, inserted verbatim.
        action: Action mapping; serialised to JSON (non-ASCII kept as-is) and
            also used to derive the description when none is supplied.
        semantic_desc: Optional pre-written description of the action. When
            ``None`` it is generated by ``get_action_semantic_description``.

    Returns:
        The formatted user prompt string.
    """
    description = (
        get_action_semantic_description(action)
        if semantic_desc is None
        else semantic_desc
    )
    fields = {
        "instruction_str": instruction_str,
        "semantic_desc": description,
        "action_json": json.dumps(action, ensure_ascii=False),
    }
    return USER_PROMPT_TEMPLATE.format(**fields)
render_utils.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from io import BytesIO
4
+ from PIL import Image
5
+ from playwright.sync_api import sync_playwright
6
+
7
+
8
def extract_clean_html(text):
    """Clean raw model output and extract the complete HTML document if present.

    Markdown code fences are removed first. If a ``<!DOCTYPE html ...>``
    declaration is found, the span from that declaration up to and including
    the first ``</html>`` that follows it is returned. Otherwise the
    fence-stripped text is returned with surrounding whitespace removed.

    Args:
        text: Raw model output; ``None`` or an empty string yields ``""``.

    Returns:
        The extracted HTML document, or the stripped text as a fallback.
    """
    if not text:
        return ""

    text = text.replace("```html", "").replace("```", "")

    start_match = re.search(r"<!DOCTYPE\s+html\b[^>]*>", text, re.IGNORECASE)
    if start_match:
        # Look for the closing tag only AFTER the doctype: a stray "</html>"
        # mentioned in leading chatter must not defeat the extraction.
        end_match = re.compile(r"</html\s*>", re.IGNORECASE).search(
            text, start_match.end()
        )
        if end_match:
            return text[start_match.start():end_match.end()]

    return text.strip()
24
+
25
+
26
def render_html_to_image(html, width=1080, height=2400):
    """Render an HTML string to a ``PIL.Image`` using headless Chromium.

    Args:
        html: HTML document to load into the page.
        width: Viewport width in pixels.
        height: Viewport height in pixels.

    Returns:
        An RGB ``PIL.Image`` of the viewport (not the full scrollable page).
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page(viewport={"width": width, "height": height})
            page.set_content(html, wait_until="domcontentloaded")
            screenshot_bytes = page.screenshot(full_page=False)
        finally:
            # Close the browser even when loading or capturing fails, so a bad
            # HTML sample does not skip the explicit shutdown.
            browser.close()

    return Image.open(BytesIO(screenshot_bytes)).convert("RGB")
38
+
39
+
40
def save_demo_outputs(output_dir, hinted_image, html, rendered_image=None):
    """Write the demo artefacts into ``output_dir`` and report their paths.

    Files written:
        - ``input_with_hint.png``: the screenshot with the visual hint.
        - ``predicted_next_ui.html``: the predicted HTML (UTF-8).
        - ``predicted_next_ui.png``: the rendered prediction, only when
          ``rendered_image`` is given.

    The directory is created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    hinted_path, html_path, rendered_path = (
        os.path.join(output_dir, file_name)
        for file_name in (
            "input_with_hint.png",
            "predicted_next_ui.html",
            "predicted_next_ui.png",
        )
    )
    has_render = rendered_image is not None

    # Write everything first, then report, matching the original ordering.
    hinted_image.save(hinted_path)
    with open(html_path, "w", encoding="utf-8") as html_file:
        html_file.write(html)
    if has_render:
        rendered_image.save(rendered_path)

    print(f"Saved hinted image to: {hinted_path}")
    print(f"Saved HTML to: {html_path}")
    if has_render:
        print(f"Saved rendered image to: {rendered_path}")
visual_hint.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from PIL import Image, ImageDraw
3
+
4
+
5
def draw_arrow_refined(draw, start, end, color=(255, 0, 0, 255), width=15, arrow_len=50):
    """Draw an arrow from ``start`` to ``end``: a shaft plus a filled head.

    Args:
        draw: Drawing object exposing ``line`` and ``polygon`` (the module
            uses ``PIL.ImageDraw.Draw``).
        start: ``(x, y)`` tail of the arrow.
        end: ``(x, y)`` tip of the arrow.
        color: Fill colour used for both shaft and head.
        width: Shaft thickness in pixels.
        arrow_len: Length of each side of the arrowhead.

    Nothing is drawn when ``start`` and ``end`` (almost) coincide.
    """
    x1, y1 = start
    x2, y2 = end
    dx, dy = x2 - x1, y2 - y1

    length = math.hypot(dx, dy)
    if length < 1e-5:
        return

    angle = math.atan2(dy, dx)
    half_spread = math.pi / 6  # each head edge is 30 degrees off the shaft

    p1 = (
        x2 - arrow_len * math.cos(angle - half_spread),
        y2 - arrow_len * math.sin(angle - half_spread),
    )
    p2 = (
        x2 - arrow_len * math.cos(angle + half_spread),
        y2 - arrow_len * math.sin(angle + half_spread),
    )

    # Stop the shaft short of the tip so its square end does not poke through
    # the head. Clamp to the arrow length: without the clamp, a short arrow
    # would place the shaft end BEHIND the start and draw a backwards line.
    back_off = min(arrow_len * 0.8, length)
    if back_off < length:
        shaft_end = (
            x2 - (back_off / length) * dx,
            y2 - (back_off / length) * dy,
        )
        draw.line([start, shaft_end], fill=color, width=width)

    draw.polygon([(x2, y2), p1, p2], fill=color)
26
+
27
+
28
def build_visual_hint(image, action):
    """Overlay a visual hint for ``action`` on a copy of ``image``.

    - ``click`` / ``long_press`` / ``input_text`` / ``open_app``: a
      semi-transparent red circle at ``(x, y)``, drawn only when both
      coordinates are present.
    - ``scroll`` / ``swipe``: a red arrow from ``(x1, y1)`` to ``(x2, y2)``.
      When any coordinate is missing, a 300 px arrow centred on the image is
      derived from ``direction``; it points opposite to the scroll direction
      (e.g. "down" points up). Unknown or missing directions are treated as
      "down".
    - Any other action type leaves the image without a hint.

    Example actions::

        {"action_type": "click", "x": 540, "y": 1470}

        {"action_type": "scroll", "direction": "down",
         "x1": 540, "y1": 1600, "x2": 540, "y2": 900}

    Args:
        image: Source ``PIL.Image``; it is converted, not modified in place.
        action: Mapping describing the interaction.

    Returns:
        A new RGB ``PIL.Image`` with the hint composited on top.
    """
    image = image.convert("RGBA")
    overlay = Image.new("RGBA", image.size, (255, 255, 255, 0))
    draw = ImageDraw.Draw(overlay)

    width, height = image.size
    action_type = action.get("action_type", "")

    fill_color = (255, 0, 0, 100)
    outline_color = (255, 0, 0, 255)

    if action_type in ("scroll", "swipe"):
        coords = [action.get(key) for key in ("x1", "y1", "x2", "y2")]

        if all(value is not None for value in coords):
            x1, y1, x2, y2 = coords
            start_point = (int(x1), int(y1))
            end_point = (int(x2), int(y2))
        else:
            # Only the fallback needs the direction. A null or non-string
            # value is treated as "down" instead of crashing on ``.lower()``.
            direction = action.get("direction")
            direction = direction.lower() if isinstance(direction, str) else "down"

            cx, cy = width // 2, height // 2
            half = 300 // 2
            fallback_arrows = {
                "down": ((cx, cy + half), (cx, cy - half)),
                "up": ((cx, cy - half), (cx, cy + half)),
                "right": ((cx + half, cy), (cx - half, cy)),
                "left": ((cx - half, cy), (cx + half, cy)),
            }
            start_point, end_point = fallback_arrows.get(
                direction, fallback_arrows["down"]
            )

        draw_arrow_refined(
            draw,
            start_point,
            end_point,
            color=outline_color,
            width=15,
            arrow_len=50,
        )

    elif action_type in ("click", "long_press", "input_text", "open_app"):
        x = action.get("x")
        y = action.get("y")

        if x is not None and y is not None:
            x = int(x)
            y = int(y)
            # Text input gets a slightly smaller marker than taps.
            radius = 30 if action_type == "input_text" else 40

            draw.ellipse(
                (x - radius, y - radius, x + radius, y + radius),
                fill=fill_color,
                outline=outline_color,
                width=5,
            )

    return Image.alpha_composite(image, overlay).convert("RGB")