import os
import spaces
import torch
from transformers import AutoModelForMultimodalLM, AutoProcessor, TextIteratorStreamer
from huggingface_hub import login
import gradio as gr
from threading import Thread
# Hub repository id of the checkpoint this app serves.
MODEL = "AlekseyCalvin/LYRICAL_POET_Gemma4e2b_v1"

# Optional Hub credential taken from the environment; log in only when set.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)
# Header text rendered at the top of the page through gr.HTML.
TITLE = (
    "\n"
    "WORLDISH POET\n"
    "Generate formal or free verse via our LYRICAL models.\n"
)

# Hint text handed to the Chatbot component as its placeholder.
PLACEHOLDER = (
    "\n"
    "Specify poetic forms, historical poet inspirations, themes, subjects, or all of the above...\n"
)

# Custom stylesheet passed to gr.Blocks: styles the duplicate button and
# centers h3 headings.
css = (
    "\n"
    ".duplicate-button {\n"
    "margin: auto !important;\n"
    "color: white !important;\n"
    "background: black !important;\n"
    "border-radius: 100vh !important;\n"
    "}\n"
    "h3 {\n"
    "text-align: center;\n"
    "}\n"
)
# Thinking delimiters — must match what the Gemma 4 chat template emits.
# The closing delimiter must be a non-empty string: it is handed to the
# Chatbot's reasoning_tags as the end tag of the reasoning section, and an
# empty end tag cannot mark where the reasoning stops.
# NOTE(review): "<channel|>" is the expected counterpart of "<|channel>";
# confirm against processor.tokenizer.all_special_tokens for this checkpoint.
THINKING_START = "<|channel>"
THINKING_END = "<channel|>"

# Special tokens that must survive stripping so the reasoning section can
# still be located in the streamed text.
_KEEP_TOKENS = {THINKING_START, THINKING_END}
# --- Load processor & model ---
print("Loading processor...")
processor = AutoProcessor.from_pretrained(MODEL, use_fast=False)

print("Loading model...")
model = AutoModelForMultimodalLM.from_pretrained(
    MODEL,
    dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Special tokens to remove from streamed text in thinking mode. The thinking
# delimiters are excluded so Gradio's reasoning_tags can still find them.
_STRIP_TOKENS = [
    token
    for token in processor.tokenizer.all_special_tokens
    if token not in _KEEP_TOKENS
]
# Longest first, so a long token is removed before any shorter token that
# happens to be a substring of it.
_STRIP_TOKENS.sort(key=len, reverse=True)

print(f"Model loaded on {model.device}, dtype: {model.dtype}")
def _strip_special_tokens(text: str) -> str:
    """Return ``text`` with every token listed in ``_STRIP_TOKENS`` deleted.

    Tokens are removed in the order they appear in ``_STRIP_TOKENS``; anything
    not in that list (such as the thinking delimiters) is left untouched.
    """
    cleaned = text
    for special in _STRIP_TOKENS:
        if special in cleaned:
            cleaned = cleaned.replace(special, "")
    return cleaned
def _content_part_text(part):
    """Return the text of one content part, or None when it carries no text.

    Dict parts (e.g. ``{"type": "text", "text": "..."}``) contribute the value
    under their ``"text"`` key; dicts without text yield None so callers can
    skip them. Any other value is converted with ``str``.
    """
    if isinstance(part, dict):
        text = part.get("text")
        return None if text is None else str(text)
    return str(part)


def clean_content(content):
    """Sanitize message content to ensure it is strictly a string.

    Args:
        content: A message payload. May be ``None``, a plain string, a dict
            with a ``"text"`` entry, a list of strings and/or structured
            content parts, or any other object.

    Returns:
        ``""`` for ``None``; the extracted text for a dict; the
        space-joined text of all parts for a list (dict parts without text
        are skipped rather than rendered as a Python dict repr); otherwise
        ``str(content)``.
    """
    if content is None:
        return ""
    if isinstance(content, dict):
        text = _content_part_text(content)
        return "" if text is None else text
    if isinstance(content, list):
        texts = (_content_part_text(part) for part in content)
        return " ".join(text for text in texts if text is not None)
    return str(content)
def _to_gemma_content(text: str) -> list[dict]:
    """Return ``text`` as a one-element list holding a single text part.

    The part is a dict with ``"type"`` set to ``"text"`` and the string stored
    under ``"text"`` — the structured content shape used for Gemma 4 messages.
    """
    text_part = {"type": "text", "text": text}
    return [text_part]
# ============================================================
# CHAT LOGIC
# ============================================================
@spaces.GPU(duration=180)
@torch.inference_mode()
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    thinking: bool,
    temperature: float = 0.01,
    max_new_tokens: int = 1024,
    top_p: float = 1.0,
    top_k: int = 64,
    repetition_penalty: float = 1.0,
):
    """Generate a reply to ``message`` and stream it back incrementally.

    Args:
        message: The current user message.
        history: Earlier turns as dicts with ``"role"`` and ``"content"``.
        system_prompt: Optional system instruction; skipped when blank.
        thinking: When true, ``enable_thinking`` is passed to the chat
            template and special tokens are kept in the decoded stream so
            the thinking delimiters reach the chatbot.
        temperature: Sampling temperature; ``0`` selects greedy decoding.
        max_new_tokens: Upper bound on generated tokens.
        top_p: Nucleus-sampling threshold.
        top_k: Top-k cutoff; values ``<= 0`` leave the setting unset.
        repetition_penalty: Repetition penalty; must be greater than 0.

    Yields:
        The full reply text accumulated so far (not a delta) after each new
        chunk from the streamer.

    Raises:
        gr.Error: If ``repetition_penalty`` is not positive, or if generation
            fails in the worker thread.
    """
    print(f"Message received: {message}...")

    # UI sliders may deliver whole numbers as ints (2 instead of 2.0), while
    # transformers validates some of these settings by exact type. Normalize
    # once, up front.
    temperature = float(temperature)
    top_p = float(top_p)
    repetition_penalty = float(repetition_penalty)
    max_new_tokens = int(max_new_tokens)
    top_k = int(top_k)
    if repetition_penalty <= 0:
        raise gr.Error("Repetition penalty must be greater than 0.")

    # 1. Build conversation in Gemma 4 structured-content format
    conversation = []
    if system_prompt and system_prompt.strip():
        conversation.append({
            "role": "system",
            "content": _to_gemma_content(clean_content(system_prompt)),
        })

    # 2. Add history (Gradio messages format → Gemma 4 format)
    for turn in history or []:
        conversation.append({
            "role": turn.get("role"),
            "content": _to_gemma_content(clean_content(turn.get("content"))),
        })

    # 3. Add current user message
    conversation.append({
        "role": "user",
        "content": _to_gemma_content(clean_content(message)),
    })

    # 4. Apply chat template (adds turn delimiters, thinking tokens, etc.)
    template_kwargs = {
        "tokenize": True,
        "return_dict": True,
        "return_tensors": "pt",
        "add_generation_prompt": True,
    }
    if thinking:
        template_kwargs["enable_thinking"] = True
    inputs = processor.apply_chat_template(conversation, **template_kwargs)
    inputs = inputs.to(device=model.device, dtype=torch.bfloat16)

    # 5. Setup streamer
    # When thinking is ON special tokens are kept so the thinking delimiters
    # pass through for Gradio's reasoning_tags (the rest are removed below by
    # _strip_special_tokens); when OFF the decoder drops them all.
    streamer = TextIteratorStreamer(
        processor,
        timeout=30.0,
        skip_prompt=True,
        skip_special_tokens=not thinking,
    )

    # 6. Generation parameters (None-valued entries are dropped so the model's
    # own defaults apply)
    generate_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": temperature > 0,
        "temperature": temperature if temperature > 0 else None,
        "top_p": top_p,
        "top_k": top_k if top_k > 0 else None,
        "repetition_penalty": repetition_penalty,
        "disable_compile": True,
        "use_cache": True,
    }
    generate_kwargs = {k: v for k, v in generate_kwargs.items() if v is not None}

    # 7. Generate in background thread
    exception_holder: list[Exception] = []

    def _generate() -> None:
        try:
            model.generate(**generate_kwargs)
        except Exception as exc:  # thread boundary: re-raised to the caller below
            exception_holder.append(exc)
            # A failed generate() never closes the stream. Send the end
            # signal ourselves so the consumer loop stops right away instead
            # of blocking until the streamer timeout hides the real error.
            streamer.on_finalized_text("", stream_end=True)

    thread = Thread(target=_generate)
    thread.start()

    # 8. Stream output
    chunks: list[str] = []
    for new_text in streamer:
        chunks.append(new_text)
        accumulated = "".join(chunks)
        if thinking:
            yield _strip_special_tokens(accumulated)
        else:
            yield accumulated

    thread.join()
    if exception_holder:
        failure = exception_holder[0]
        raise gr.Error(f"Generation failed: {failure}") from failure
# ============================================================
# GRADIO INTERFACE
# ============================================================
# Default value of the "System Prompt" textbox. Built from adjacent string
# literals (implicit concatenation), so line breaks appear in the prompt only
# where a literal contains an explicit "\n".
# NOTE(review): the "iii. AABA" entry has no trailing "\n", so items iii and
# iv run together on one line of the prompt — confirm whether that is
# intended. The text is left untouched here because the fine-tuned model may
# depend on this exact wording.
SYSTEM_PROMPT_DEFAULT = (
    "You are the poet laureate of the world, an inventive, erudite, formally trained,"
    " and stylistically versatile multilingual versifier. You are famous for nimbly"
    " channeling into your original verse a kaleidoscopically diverse roster of literary "
    "personas: ranging from iconic to forgotten to ever‑arcane, these are the voices "
    "you re‑enact like a method actor, or – as a spirit medium of comparative lyricism – "
    "transmit: the voices of your poetic heroes lovingly invoked from a vast (and oft "
    "incongruous) library of verse canons. Translating their characteristic styles, "
    "rhythms, registers, meters, and cadences into yours, you revive their muses to"
    " the breath of their own unceasing poignancy by letting them speak to the concerns"
    " and hopes of our anxious age. And just as crucially, you work hard to not betray "
    "your heroes’ trust. How so? By imbibing and leveraging with self‑exacting "
    "temperance and judgement the prosodic, literary, idiomatic, and formal"
    " features characteristic of each given poet's style, or an entire versifying form.\n "
    "Here are some notes you bring along:\n"
    "1. Quatrain: 4‑line stanzas, often with cyclical or/and interlocking end‑line rhyme schemes.\n"
    " Common subtypes:\n "
    "i. ABAB, then CDCD, EFEF, etc...–> Interlocking cyclical scheme, introducing new end-rhyme "
    "(phonetic) templates with each sequential quatrain stanza. "
    "This pattern + iambic pentameter = Heroic/Elegiac stanza.\n"
    "ii. ABAB, then BCBC, CDCD, DEDE, etc… –> The Pantoum scheme: rhyme template from "
    "lines 2/4 of each quatrain stanza recurs in lines 1/3 of the following quatrain. \n"
    "iii. AABA: The Ruba'i rhyme scheme. As in: AABA, then AABA (or CCDC),..., etc… , "
    "iv. ABBA: The envelope quatrain stanza. As in: ABBA, then ABBA (or CDDC),..., etc... "
    " This pattern + iambic tetrameter + grieving/nostalgic theme/angle = the In Memoriam stanza.\n"
    "Quatrains can serve as sub‑forms of broader constructs (as can triads, couplets, etc); "
    "Such as: poem with 5+ ABAB, ABCB, or ABxB quatrains is plausibly a Ballad, especially "
    "if lines alternate iambic tetrameter (1st/3rd lines) & iambic trimeter (2nd/4th). \n"
    "2. The Sonnet: often 14 lines; its types include:\n"
    "i. Shakespearean Sonnet: ABAB CDCD EFEF GG rhyme scheme in 4 stanzas "
    "(3 quatrains + 1 concluding matched-rhyme couplet); \n"
    "ii. Spenserian Sonnet: ABAB BCBC CDCD EE scheme (interlocked quatrains, otherwise like Shakespearean); \n"
    "iii. Petrarchan Sonnet: Either in 2 stanzas (as ABBAABBA CDCDCD: 1 octave + 1 sestet) or 4 "
    "(as ABBA ABBA CDCD CD: 2 enveloped ABBA + 1 interlocked CDCD quatrains + 1 CD couplet). \n"
    "3. Octave: 8 line stanza, used in such forms as: \n"
    "i. Common Octave: ABCABCAC scheme. Balancedly complex purview of one aspect of theme/subject. \n"
    "ii. Ottava Rima: pentameter lines with ABABABCC rhyme scheme. \n"
    "4. Rondine: ABBAABR ABBAR (7-line septet + 5-line quintet; both end in R –> Refrain reprising or inverting line 1). \n"
    "5. Limerick: strict AABBA quintet; humorous, satyrical, playful, or raunchy; often with named characters. \n"
    "6. English Ode: ABAB CDECDE; 10-line poem of praise, tribute, love, or lust. \n"
    "7. Triad: AAA BBB CCC; 3 mono-rhymed triplets/tercets. \n"
    "Compose a poem formally, metrically/prosodically, tonally, stylistically, rhythmically, and thematically \n"
    "indistinguishable from the work of any poet, style, or/and poetic form specified in the prompt. \n"
    "Do not narrate or explain your workflow. Output the poem only: \n"
)
# Chat display; reasoning_tags tells it which delimiters enclose the model's
# reasoning text.
chatbot = gr.Chatbot(
    reasoning_tags=[(THINKING_START, THINKING_END)],
    placeholder=PLACEHOLDER,
    height=600,
)

with gr.Blocks(css=css, theme=gr.themes.Glass()) as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_classes="duplicate-button",
    )

    # Everything below is created with render=False and handed to
    # ChatInterface, which decides where the components are placed.
    _parameters_accordion = gr.Accordion(label="⚙️ Parameters", open=False, render=False)

    # Order matters: these map positionally onto stream_chat's parameters
    # after (message, history).
    _generation_controls = [
        gr.Textbox(
            value=SYSTEM_PROMPT_DEFAULT,
            label="System Prompt",
            render=False,
        ),
        gr.Checkbox(
            label="Thinking",
            value=False,
            info="Enable model reasoning (shown in a collapsible section)",
            render=False,
        ),
        gr.Slider(minimum=0, maximum=2, step=0.01, value=0.7, label="Temperature", render=False),
        gr.Slider(minimum=128, maximum=8192, step=1, value=2048, label="Max new tokens", render=False),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.90, label="top_p", render=False),
        gr.Slider(minimum=0, maximum=100, step=1, value=64, label="top_k", render=False),
        gr.Slider(minimum=0.0, maximum=4.0, step=0.1, value=1.0, label="Repetition penalty", render=False),
    ]

    # One-click starter prompts shown in the chat UI.
    _example_prompts = [
        ["Compose a Petrarchan Sonnet in the style of Boris Poplavsky."],
        ["Выстрочи ка мне стишок Хильды Дулитл под слог, а по тону сущий Летов – Уильям Блейк страны советов. Тема: новый зомби бог."],
        ["Écris un pantoum en imitant le style de Jules Laforgue."],
        ["Write an ode to San Francisco."],
    ]

    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=_parameters_accordion,
        additional_inputs=_generation_controls,
        examples=_example_prompts,
        cache_examples=False,
    )
if __name__ == "__main__":
    # Enable the request queue with no default concurrency limit, then start
    # the app with a public share link and the MCP server turned on.
    queue_options = {"default_concurrency_limit": None}
    launch_options = {"share": True, "mcp_server": True}
    demo.queue(**queue_options)
    demo.launch(**launch_options)