# LYRICAL Machine Translation Russian2English Model Testing Hall

import os
import time
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
from huggingface_hub import hf_hub_download, HfFileSystem, ModelCard
from huggingface_hub import login
try:
    import charactertokenizer
except ImportError:
    # charactertokenizer incompatible with transformers 5.x (uses removed download_url)
    # Preserved for future iteration with older model versions if needed
    charactertokenizer = None
import gradio as gr
import tempfile
import requests
import re

from threading import Thread

# Alternative checkpoint, kept for reference:
#MODEL = "AlekseyCalvin/LYRICAL_MT_ru2en_28_charllama3bsophist_2.6b"

# Hub repo id of the checkpoint loaded by this demo.
MODEL = "AlekseyCalvin/LYRICAL_MT_ru2en_21_SystemGemma2_9b_4epochs"

# Hub access token read from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Only authenticate when a token is actually configured: calling login() with
# token=None falls back to an interactive prompt, which cannot be answered in
# a headless deployment and would break startup.
if HF_TOKEN:
    login(token=HF_TOKEN)

# HTML banner rendered at the top of the page.
TITLE = (
    "\n"
    "<h1><center>LYRICAL Machine Translation Russian2English Model Testing Hall </center></h1>\n"
    "<center>\n"
    "<p>The model is licensed under apache 2.0</p>\n"
    "</center>\n"
)

# HTML shown inside the chat window while the conversation is still empty.
PLACEHOLDER = (
    "\n"
    "<center>\n"
    "<p>Prototyping an LLM for song/poem translation fluently adaptive to complexly dimensioned fidelity constraints.</p>\n"
    "</center>\n"
)

# Custom stylesheet handed to the app at launch: styles the duplicate button
# and centers h3 headings.
css = (
    "\n"
    ".duplicate-button {\n"
    "    margin: auto !important;\n"
    "    color: white !important;\n"
    "    background: black !important;\n"
    "    border-radius: 100vh !important;\n"
    "}\n"
    "h3 {\n"
    "    text-align: center;\n"
    "}\n"
)

# ZeroGPU-oriented device handling: nothing in this section calls into CUDA
# explicitly at import time.
# NOTE: `device` is only a placeholder; stream_chat() moves its inputs to
# `model.device` and does not read this variable.
device = "cuda"  # Placeholder for clarity, actual device set by spaces.GPU context

# Optional 4-bit NF4 quantization config. Currently disabled, together with the
# commented-out `quantization_config` argument in the model load below.
#nf4_config = BitsAndBytesConfig(
#    load_in_4bit=True,
#    bnb_4bit_quant_type="nf4",
#    bnb_4bit_use_double_quant=True
#)

# Load the tokenizer for MODEL. trust_remote_code=True allows custom tokenizer
# code shipped in the model repo to be executed.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

# The model is loaded once at import time and shared by every request.
# NOTE(review): this relies on the `spaces` package making module-level loading
# with device_map="auto" safe on ZeroGPU — confirm against the Spaces docs.
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    dtype="auto",  # Let the checkpoint/config decide the dtype instead of forcing one here
#    quantization_config=nf4_config,
    device_map="auto",
    trust_remote_code=True,
    # NOTE(review): this flag suppresses shape-mismatch errors between the
    # checkpoint and the model definition — confirm it is still required.
    ignore_mismatched_sizes=True,
    use_kernels=True,  # Enable HF Kernel Hub optimizations
    # Attention kernel referenced by its Hub repo id. The id names a
    # FlashAttention-3 build (not Flash Attention 2) — TODO confirm that the
    # target hardware supports it.
    attn_implementation="kernels-community/vllm-flash-attn3",
)

def _content_to_text(content):
    """Flatten a chat message ``content`` value into plain text.

    Handles the shapes a chat UI may hand over: a plain string, ``None``,
    a single content part such as ``{"type": "text", "text": "..."}`` (or a
    multimodal message dict carrying a ``"text"`` key), or a list of such
    parts. Parts without text (e.g. file attachments) contribute nothing.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, list):
        pieces = (_content_to_text(part) for part in content)
        return "\n".join(piece for piece in pieces if piece)
    return str(content)


def format_chat(system_prompt, history, message):
    """Build the turn-delimited prompt string fed to the model.

    Args:
        system_prompt: Instruction text placed in the leading ``system`` turn.
        history: Previous turns. Each entry is either a message dict with
            ``role`` and ``content`` keys (``content`` may be a string or a
            list of typed parts), or a legacy ``(user_text, assistant_text)``
            pair. ``None`` is treated as an empty history.
        message: The new user message to translate.

    Returns:
        The full prompt, ending with an open ``assistant`` turn for the model
        to complete. Earlier assistant answers are prefixed with ``English:``.
    """
    formatted_chat = f"<start_of_turn>system\n{system_prompt}<end_of_turn>\n"

    for turn in history or []:
        if isinstance(turn, dict):
            # Messages format: {"role": "user"/"assistant", "content": ...}.
            # Content is flattened first so structured (list-of-parts) content
            # does not end up in the prompt as a Python repr.
            role = turn.get("role")
            text = _content_to_text(turn.get("content"))
            if role == "user":
                formatted_chat += f"<start_of_turn>user\n{text}<end_of_turn>\n"
            elif role == "assistant":
                formatted_chat += f"<start_of_turn>assistant\nEnglish:{text}<end_of_turn>\n"
            # Any other role is skipped, as before.
        else:
            # Legacy (user, assistant) pair; a missing answer becomes "".
            prompt, answer = turn
            prompt = _content_to_text(prompt)
            answer = _content_to_text(answer)
            formatted_chat += f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>assistant\nEnglish:{answer}<end_of_turn>\n"

    formatted_chat += f"<start_of_turn>user\n{_content_to_text(message)}<end_of_turn>\n<start_of_turn>assistant\n"
    return formatted_chat

@spaces.GPU  # No explicit duration is passed here, so the decorator's default applies
def stream_chat(
    message: str,
    history: list,  # Gradio 6+: list of dicts, not tuples
    system_prompt: str,
    temperature: float = 0.2,
    max_new_tokens: int = 512,
    top_p: float = 1.0,
    top_k: int = 0,
    repetition_penalty: float = 1.3,
):
    """Stream the model's reply to ``message`` as it is generated.

    Args:
        message: New user message (the text to translate).
        history: Earlier chat turns, forwarded to ``format_chat``.
        system_prompt: Instruction text for the leading system turn.
        temperature: Sampling temperature; ``0`` selects greedy decoding.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling threshold (used only when sampling).
        top_k: Top-k cutoff (used only when sampling).
        repetition_penalty: Repetition penalty passed to ``generate``.

    Yields:
        The accumulated reply text after each streamed chunk. Output is cut
        at the first literal ``<|endoftext|>`` if one appears.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once the stream has ended.
    """
    print(f'message: {message}')
    print(f'history: {history}')

    # UI sliders can deliver whole numbers as ``int`` (e.g. 2 instead of 2.0);
    # normalise the types before they reach the generation code.
    temperature = float(temperature)
    top_p = float(top_p)
    repetition_penalty = float(repetition_penalty)
    top_k = int(top_k)
    max_new_tokens = int(max_new_tokens)

    formatted_prompt = format_chat(system_prompt, history, message)

    # Tokenize and move every tensor to the device the model lives on.
    inputs = tokenizer(formatted_prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=5000.0,
        skip_prompt=True,
        skip_special_tokens=True
    )

    do_sample = temperature != 0

    generate_kwargs = dict(
        input_ids=inputs["input_ids"],
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        cache_implementation="static",
        use_cache=True,
    )
    # Forward the tokenizer's attention mask instead of leaving generate() to infer one.
    if "attention_mask" in inputs:
        generate_kwargs["attention_mask"] = inputs["attention_mask"]
    # Sampling-only knobs are meaningless for greedy decoding, so they are
    # passed only when sampling is actually enabled.
    if do_sample:
        generate_kwargs.update(temperature=temperature, top_p=top_p, top_k=top_k)

    generation_errors = []

    def _run_generation():
        # Grad mode is thread-local in PyTorch, so no_grad has to be entered
        # inside the worker; wrapping Thread.start() has no effect on it.
        try:
            with torch.no_grad():
                model.generate(**generate_kwargs)
        except Exception as exc:  # thread boundary: hand the error to the consumer
            generation_errors.append(exc)
            # Close the stream so the loop below stops right away instead of
            # blocking until the streamer timeout expires.
            streamer.end()

    Thread(target=_run_generation).start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        if "<|endoftext|>" in buffer:
            yield buffer.split("<|endoftext|>")[0]
            break
        yield buffer

    # Surface a worker failure to the caller rather than ending silently.
    if generation_errors:
        raise generation_errors[0]

# Chat display shared with the ChatInterface below: 600 units tall, showing
# the PLACEHOLDER HTML until the first message arrives.
chatbot = gr.Chatbot(placeholder=PLACEHOLDER, height=600)

# Page layout: title banner, duplicate button, then the chat interface with an
# accordion of generation settings. The additional inputs are passed to
# stream_chat positionally, in the order listed, after (message, history).
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")

    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Dials", open=True, render=False),
        additional_inputs=[
            gr.Textbox(
                value="You are a virtuosic multilingual poet, versatile literary translator, computational linguist, philologist, and singer-songwriter. Translate the below Russian poem, song lyric, or fragment by rigorously discerning its most holistically representative English adaptation possible, faithfully reproducing every source line's lyrical, semantic, formal, poetic, symbolic, cultural, rhythmic, idiomatic, & phonetic features. Fully conveying source meanings, results must also exhibit full fidelity to its meter, imagery, rhyme scheme, tone, and nuances of style and wordplay. \nCrucially, to translate Russian syllabotonic verse, start by analyzing source scansion, rhythm,  rhyme, and phonetic patterns to map the adaptation's formal constraints.\nThus, before translating, silently discern:\n1. Each source line's SYLLABLE COUNT\n2. Each line's likeliest syllable STRESS PATTERN (For ex.: if (/) = stressed, (x) = unstressed, then '/ x x / x x / x x / x x' is the stress pattern for the line 'Двигаю ручками, двигаю ножками' and its adaptation 'Moving my handy-arms, leg-footies motioning')\n3.Its METER (dactylic (/ x x) in our example lines, other meters include amphibrachic (x / x), anapestic (x x /), trochaic (/ x), iambic (x /), etc)\n4. Its FOOT (which may be  tetrameter (our examples) or dimeter, pentameter, hexameter, etc)\n5. RHYME SCHEME of entire verse (if 1st/3rd lines & 2nd/4th lines rhyme = ABAB scheme, etc)\n6.Other FEATURES, formal or idiomatic or literary (tone, speaker, theme, imagery, meaning, allusions).\n7. Phonetic/musical character of source lines.\nFinally, devise a natural-sounding English adaptation faithful to source meanings and other features alike. Do not add anything besides the translation. Do not explain process. Present only the translation. Переведи на Английский. Translate the following Russian source text to English.\nRussian: ",
                label="System Prompt",
                render=False,
            ),
            gr.Slider(
                minimum=0,
                maximum=2,
                step=0.01,
                value=0.1,
                label="Temperature",
                render=False,
            ),
            gr.Slider(
                minimum=128,
                maximum=8192,
                step=1,
                value=512,
                label="Max new tokens",
                render=False,
            ),
            gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=0.8,
                label="top_p",
                render=False,
            ),
            gr.Slider(
                minimum=0,
                maximum=50,
                step=1,
                value=20,
                label="top_k",
                render=False,
            ),
            gr.Slider(
                # The penalty must be strictly positive: a value of 0 is
                # rejected by the generation code, so the slider starts at 0.1.
                minimum=0.1,
                maximum=4.0,
                step=0.1,
                value=1.8,
                label="Repetition penalty",
                render=False,
            ),
        ],
        # Each example supplies only the message; the additional inputs keep
        # their default values.
        examples=[
            ["Translate the following song, adapting all formal and poetic characteristics (like meter, tone, mood, rhyme-scheme, and syllable patterns): // На карте кружком обозначено солнце, Пунктирные линии, ветер и соль, Под веками плавают сны на оконце, Спадает волнами зеленая боль, На наших глазах исчезают потери; Душа выпускает скопившийся страх; Я слышу шаги, открываются двери И смерть исчезает на наших глазах; На наших глазах; На наших глазах"],
            ["Translate these song lyrics into English: Иду я на верёвочке вздыхаю на ходу / Доска моя кончается сейчас я упаду / Под ноги под колёса под тяжёлый молоток / Всё с молотка / О, продана смерть моя…"],
            ["Translate: Мы вышли за рамки людских представлений / И даже представить себе не могли / Что выше всех горестей, бед и мучений / Мы будем под слоем промёрзшей земли."],
        ],
        cache_examples=False,  # Examples are run on demand rather than pre-computed
    )

if __name__ == "__main__":
    # Launch settings gathered in one place: public share link, Soft theme,
    # the custom stylesheet defined in this file, and the MCP server enabled.
    launch_options = dict(
        share=True,
        theme=gr.themes.Soft(),
        css=css,
        mcp_server=True,
    )
    # Enable the request queue with no default concurrency limit set, then serve.
    demo.queue(default_concurrency_limit=None)
    demo.launch(**launch_options)