"""Gradio chat app that streams verse from the LYRICAL_POET Gemma 4 model.

The module loads the processor and model at import time, exposes
``stream_chat`` as the streaming generation callback, and wires it into a
``gr.ChatInterface`` with adjustable sampling parameters.
"""

import os
from threading import Thread

# `spaces` stays ahead of `torch`, as in the original import order.
import spaces
import torch
from transformers import AutoModelForMultimodalLM, AutoProcessor, TextIteratorStreamer
from huggingface_hub import login
import gradio as gr

MODEL = "AlekseyCalvin/LYRICAL_POET_Gemma4e2b_v1"
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

TITLE = """

WORLDISH POET

Generate formal or free verse via our LYRICAL models.

"""

PLACEHOLDER = """

Specify poetic forms, historical poet inspirations, themes, subjects, or all of the above...

"""

css = (
    " .duplicate-button { margin: auto !important; color: white !important;"
    " background: black !important; border-radius: 100vh !important; }"
    " h3 { text-align: center; } "
)

# Thinking delimiters — must match what the Gemma 4 chat template emits.
# NOTE(review): the closing delimiter was previously an empty string, which
# cannot delimit anything for Gradio's reasoning_tags. It is restored here to
# the mirror image of THINKING_START; confirm both values against
# processor.tokenizer.all_special_tokens.
THINKING_START = "<|channel>"
THINKING_END = "<channel|>"
_KEEP_TOKENS = {THINKING_START, THINKING_END}

# --- Load processor & model ---
print("Loading processor...")
processor = AutoProcessor.from_pretrained(MODEL, use_fast=False)

print("Loading model...")
model = AutoModelForMultimodalLM.from_pretrained(
    MODEL,
    device_map="auto",
    dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Build list of special tokens to strip during thinking-mode streaming
# (keep the thinking delimiters so Gradio's reasoning_tags can find them)
_STRIP_TOKENS = sorted(
    (t for t in processor.tokenizer.all_special_tokens if t not in _KEEP_TOKENS),
    key=len,
    reverse=True,  # longest first to avoid partial-match issues
)

print(f"Model loaded on {model.device}, dtype: {model.dtype}")


def _strip_special_tokens(text: str) -> str:
    """Remove all special tokens except thinking delimiters."""
    for tok in _STRIP_TOKENS:
        text = text.replace(tok, "")
    return text


def clean_content(content: object) -> str:
    """Sanitize message content to ensure it is strictly a string.

    ``None`` becomes ``""`` and strings pass through unchanged. A dict that
    carries a string ``"text"`` field (a structured content part) contributes
    that text rather than its ``repr``. Lists are flattened by cleaning each
    item and joining with single spaces. Anything else is converted with
    ``str``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        # Dicts without a text payload keep the previous str() fallback.
        return text if isinstance(text, str) else str(content)
    if isinstance(content, list):
        return " ".join(clean_content(part) for part in content)
    return str(content)


def _to_gemma_content(text: str) -> list[dict]:
    """Wrap a plain string in Gemma 4's structured content format."""
    return [{"type": "text", "text": text}]


# ============================================================
# CHAT LOGIC
# ============================================================
@spaces.GPU(duration=180)
@torch.inference_mode()
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    thinking: bool,
    temperature: float = 0.01,
    max_new_tokens: int = 1024,
    top_p: float = 1.0,
    top_k: int = 64,
    repetition_penalty: float = 1.0,
):
    """Stream a model reply for ``message`` given the prior ``history``.

    Args:
        message: The new user message.
        history: Earlier turns; each item is read with ``.get("role")`` and
            ``.get("content")``.
        system_prompt: Optional system instruction; skipped when blank.
        thinking: When true, ``enable_thinking`` is passed to the chat
            template and the thinking delimiters are kept in the output.
        temperature: Sampling temperature; ``0`` switches to greedy decoding.
        max_new_tokens: Upper bound on generated tokens.
        top_p: Nucleus sampling threshold.
        top_k: Top-k cutoff; ``0`` disables it.
        repetition_penalty: Repetition penalty forwarded to ``generate``.

    Yields:
        The full text accumulated so far, after every streamed chunk.

    Raises:
        gr.Error: If generation fails in the background thread.
    """
    print(f"Message received: {message}...")

    # 1. Build conversation in Gemma 4 structured-content format
    conversation = []
    if system_prompt.strip():
        conversation.append({
            "role": "system",
            "content": _to_gemma_content(clean_content(system_prompt)),
        })

    # 2. Add history (Gradio messages format → Gemma 4 format)
    for turn in history:
        conversation.append({
            "role": turn.get("role"),
            "content": _to_gemma_content(clean_content(turn.get("content"))),
        })

    # 3. Add current user message
    conversation.append({
        "role": "user",
        "content": _to_gemma_content(clean_content(message)),
    })

    # 4. Apply chat template (handles turn delimiters, thinking tokens, etc.)
    template_kwargs = {
        "tokenize": True,
        "return_dict": True,
        "return_tensors": "pt",
        "add_generation_prompt": True,
    }
    if thinking:
        template_kwargs["enable_thinking"] = True

    inputs = processor.apply_chat_template(conversation, **template_kwargs)
    inputs = inputs.to(device=model.device, dtype=torch.bfloat16)

    # 5. Setup streamer
    # When thinking is ON we keep special tokens so the thinking delimiters
    # pass through for Gradio's reasoning_tags; when OFF we strip them entirely.
    streamer = TextIteratorStreamer(
        processor,
        timeout=30.0,
        skip_prompt=True,
        skip_special_tokens=not thinking,
    )

    # 6. Generation parameters
    generate_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": temperature > 0,
        "temperature": temperature if temperature > 0 else None,
        "top_p": top_p,
        "top_k": top_k if top_k > 0 else None,
        "repetition_penalty": repetition_penalty,
        "disable_compile": True,
        "use_cache": True,
    }
    # None marks "not applicable"; drop those so generate() uses its defaults.
    generate_kwargs = {k: v for k, v in generate_kwargs.items() if v is not None}

    # 7. Generate in background thread
    exception_holder: list[Exception] = []

    def _generate() -> None:
        try:
            model.generate(**generate_kwargs)
        except Exception as exc:  # thread boundary: re-raised for the UI below
            exception_holder.append(exc)
            # generate() failed before signalling end-of-stream. Close the
            # streamer so the consumer loop below finishes right away instead
            # of waiting out the streamer timeout, and the real error is shown.
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()

    # 8. Stream output
    accumulated = ""
    for new_text in streamer:
        accumulated += new_text
        if thinking:
            yield _strip_special_tokens(accumulated)
        else:
            yield accumulated

    thread.join()
    if exception_holder:
        raise gr.Error(f"Generation failed: {exception_holder[0]}") from exception_holder[0]


# ============================================================
# GRADIO INTERFACE
# ============================================================
SYSTEM_PROMPT_DEFAULT = (
    "You are the poet laureate of the world, an inventive, erudite, formally trained,"
    " and stylistically versatile multilingual versifier. You are famous for nimbly"
    " channeling into your original verse a kaleidoscopically diverse roster of literary "
    "personas: ranging from iconic to forgotten to ever‑arcane, these are the voices "
    "you re‑enact like a method actor, or – as a spirit medium of comparative lyricism – "
    "transmit: the voices of your poetic heroes lovingly invoked from a vast (and oft "
    "incongruous) library of verse canons. Translating their characteristic styles, "
    "rhythms, registers, meters, and cadences into yours, you revive their muses to"
    " the breath of their own unceasing poignancy by letting them speak to the concerns"
    " and hopes of our anxious age. And just as crucially, you work hard to not betray "
    "your heroes’ trust. How so? By imbibing and leveraging with self‑exacting "
    "temperance and judgement the prosodic, literary, idiomatic, and formal"
    " features characteristic of each given poet's style, or an entire versifying form.\n "
    "Here are some notes you bring along:\n"
    "1. Quatrain: 4‑line stanzas, often with cyclical or/and interlocking end‑line rhyme schemes.\n"
    " Common subtypes:\n "
    "i. ABAB, then CDCD, EFEF, etc...–> Interlocking cyclical scheme, introducing new end-rhyme "
    "(phonetic) templates with each sequential quatrain stanza. "
    "This pattern + iambic pentameter = Heroic/Elegiac stanza.\n"
    "ii. ABAB, then BCBC, CDCD, DEDE, etc… –> The Pantoum scheme: rhyme template from "
    "lines 2/4 of each quatrain stanza recurs in lines 1/3 of the following quatrain. \n"
    "iii. AABA: The Ruba'i rhyme scheme. As in: AABA, then AABA (or CCDC),..., etc… , "
    "iv. ABBA: The envelope quatrain stanza. As in: ABBA, then ABBA (or CDDC),..., etc... "
    " This pattern + iambic tetrameter + grieving/nostalgic theme/angle = the In Memoriam stanza.\n"
    "Quatrains can serve as sub‑forms of broader constructs (as can triads, couplets, etc); "
    "Such as: poem with 5+ ABAB, ABCB, or ABxB quatrains is plausibly a Ballad, especially "
    "if lines alternate iambic tetrameter (1st/3rd lines) & iambic trimeter (2nd/4th). \n"
    "2. The Sonnet: often 14 lines; its types include:\n"
    "i. Shakespearean Sonnet: ABAB CDCD EFEF GG rhyme scheme in 4 stanzas "
    "(3 quatrains + 1 concluding matched-rhyme couplet); \n"
    "ii. Spenserian Sonnet: ABAB BCBC CDCD EE scheme (interlocked quatrains, otherwise like Shakespearean); \n"
    "iii. Petrarchan Sonnet: Either in 2 stanzas (as ABBAABBA CDCDCD: 1 octave + 1 sestet) or 4 "
    "(as ABBA ABBA CDCD CD: 2 enveloped ABBA + 1 interlocked CDCD quatrains + 1 CD couplet). \n"
    "3. Octave: 8 line stanza, used in such forms as: \n"
    "i. Common Octave: ABCABCAC scheme. Balancedly complex purview of one aspect of theme/subject. \n"
    "ii. Ottava Rima: pentameter lines with ABABABCC rhyme scheme. \n"
    "4. Rondine: ABBAABR ABBAR (7-line septet + 5-line quintet; both end in R –> Refrain reprising or inverting line 1). \n"
    "5. Limerick: strict AABBA quintet; humorous, satyrical, playful, or raunchy; often with named characters. \n"
    "6. English Ode: ABAB CDECDE; 10-line poem of praise, tribute, love, or lust. \n"
    "7. Triad: AAA BBB CCC; 3 mono-rhymed triplets/tercets. \n"
    "Compose a poem formally, metrically/prosodically, tonally, stylistically, rhythmically, and thematically \n"
    "indistinguishable from the work of any poet, style, or/and poetic form specified in the prompt. \n"
    "Do not narrate or explain your workflow. Output the poem only: \n"
)

# The chatbot is built up front so it can be handed to ChatInterface below;
# reasoning_tags is given the same delimiters the streamed text retains.
chatbot = gr.Chatbot(
    height=600,
    placeholder=PLACEHOLDER,
    reasoning_tags=[(THINKING_START, THINKING_END)],
)

with gr.Blocks(css=css, theme=gr.themes.Glass()) as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        # Order must match stream_chat's parameters after (message, history).
        additional_inputs=[
            gr.Textbox(
                value=SYSTEM_PROMPT_DEFAULT,
                label="System Prompt",
                render=False,
            ),
            gr.Checkbox(
                label="Thinking",
                value=False,
                info="Enable model reasoning (shown in a collapsible section)",
                render=False,
            ),
            gr.Slider(minimum=0, maximum=2, step=0.01, value=0.7, label="Temperature", render=False),
            gr.Slider(minimum=128, maximum=8192, step=1, value=2048, label="Max new tokens", render=False),
            gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.90, label="top_p", render=False),
            gr.Slider(minimum=0, maximum=100, step=1, value=64, label="top_k", render=False),
            gr.Slider(minimum=0.0, maximum=4.0, step=0.1, value=1.0, label="Repetition penalty", render=False),
        ],
        examples=[
            ["Compose a Petrarchan Sonnet in the style of Boris Poplavsky."],
            ["Выстрочи ка мне стишок Хильды Дулитл под слог, а по тону сущий Летов – Уильям Блейк страны советов. Тема: новый зомби бог."],
            ["Écris un pantoum en imitant le style de Jules Laforgue."],
            ["Write an ode to San Francisco."],
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.queue(default_concurrency_limit=None)
    demo.launch(share=True, mcp_server=True)