import os
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
from huggingface_hub import login
import gradio as gr
from threading import Thread
# Unused but preserved for iteration
try:
import charactertokenizer
except ImportError:
charactertokenizer = None
# Hugging Face repository id of the model served by this demo.
MODEL = "AlekseyCalvin/Lyrical_Llama31_8B_ru2en_SFT"

# Access token taken from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)
else:
    # login(token=None) falls back to an interactive prompt, which cannot be
    # answered in a headless deployment, so authentication is skipped instead.
    print("HF_TOKEN is not set; skipping Hugging Face login.")

# Header text rendered at the top of the page through gr.HTML.
TITLE = """
LYRICAL Machine Translation Russian2English Model Testing Hall
The model is licensed under apache 2.0
"""

# Text passed to gr.Chatbot as its placeholder.
PLACEHOLDER = """
Prototyping an LLM for song/poem translation fluently adaptive to complexly dimensioned fidelity constraints.
"""

# Custom stylesheet handed to demo.launch(); styles the duplicate button and
# centres h3 headings.
css = """
.duplicate-button {
margin: auto !important;
color: white !important;
background: black !important;
border-radius: 100vh !important;
}
h3 {
text-align: center;
}
"""
# Tokenizer for the served checkpoint (also provides the chat template used
# by format_chat).
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

# bitsandbytes 4-bit settings. Note the quantization type is NF4, even though
# the log line below calls it "FP4".
nf4_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute in BF16
    bnb_4bit_use_double_quant=True,  # double quantization for memory savings
    bnb_4bit_quant_type="nf4",  # NF4 = Normal Float 4
    load_in_4bit=True,
)

# Load the quantized model straight onto the GPU, using the Kernel Hub
# flash-attention implementation named below (no AOT compilation step).
print("Loading model with FP4 quantization and Flash Attention 3...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    attn_implementation="kernels-community/vllm-flash-attn3",
    use_kernels=True,
    trust_remote_code=True,
    device_map="cuda",
    quantization_config=nf4_config,
)
print(f"Model loaded on {model.device}, dtype: {model.dtype}")
# ============================================================
# CHAT LOGIC
# ============================================================
def _content_to_text(content):
    """Collapse a chat message ``content`` value into a plain string.

    Strings pass through unchanged. ``None`` becomes an empty string. A list
    (or tuple) of content blocks is reduced to the text parts it contains,
    joined with newlines; blocks without text (e.g. file attachments) are
    dropped. A single dict carrying a ``"text"`` string yields that string.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, (list, tuple)):
        parts = []
        for part in content:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                parts.append(part["text"])
        return "\n".join(parts)
    return str(content)


def format_chat(system_prompt, history, message):
    """Render a conversation into a prompt string via the tokenizer's chat template.

    Args:
        system_prompt: Text for the leading ``system`` message.
        history: Earlier turns, either as message dicts with ``role`` and
            ``content`` keys or as legacy ``(user, assistant)`` pairs. May be
            empty or ``None``.
        message: The new user message to append after the history.

    Returns:
        The untokenized prompt produced by ``tokenizer.apply_chat_template``
        with the generation prompt appended.
    """
    messages = [{"role": "system", "content": system_prompt}]
    for turn in history or []:
        if isinstance(turn, dict):
            # Forward only role + plain-text content: UI-side extras (metadata,
            # structured content blocks) would otherwise be stringified into
            # the prompt by a text-only template.
            messages.append(
                {"role": turn["role"], "content": _content_to_text(turn.get("content"))}
            )
        else:
            # Legacy (user, assistant) pair; either side may be None for an
            # unfinished exchange, in which case it is skipped.
            user_msg, assistant_msg = turn
            if user_msg is not None:
                messages.append({"role": "user", "content": _content_to_text(user_msg)})
            if assistant_msg is not None:
                messages.append(
                    {"role": "assistant", "content": _content_to_text(assistant_msg)}
                )
    messages.append({"role": "user", "content": _content_to_text(message)})
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
@spaces.GPU
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    temperature: float = 0.1,
    max_new_tokens: int = 512,
    top_p: float = 0.8,
    top_k: int = 20,
    repetition_penalty: float = 1.8,
):
    """Generate a reply for ``message`` and stream it back incrementally.

    Args:
        message: The new user message.
        history: Previous conversation turns, forwarded to ``format_chat``.
        system_prompt: System instruction placed at the start of the prompt.
        temperature: Sampling temperature; ``0`` selects greedy decoding.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling threshold (used only when sampling).
        top_k: Top-k cutoff (used only when sampling); ``0`` or less is
            passed as ``0`` to switch the filter off.
        repetition_penalty: Repetition penalty; values ``<= 0`` are not
            forwarded, leaving the model's own default in effect.

    Yields:
        The accumulated response text after each streamed chunk.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once the stream has been drained.
    """
    print(f"Message: {message}")
    print(f"History: {history}")

    prompt = format_chat(system_prompt, history, message)

    # If the chat template already wrote the BOS token into the prompt text,
    # letting the tokenizer add special tokens again would duplicate it.
    bos_token = getattr(tokenizer, "bos_token", None)
    template_has_bos = bool(bos_token) and prompt.startswith(bos_token)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=not template_has_bos,
    ).to(model.device)

    # Streamer hands decoded text chunks from the generation thread to this one.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    sampling = temperature > 0
    generate_kwargs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        # UI sliders may deliver whole numbers as floats.
        "max_new_tokens": int(max_new_tokens),
        "do_sample": sampling,
        "streamer": streamer,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "use_cache": True,
    }
    if sampling:
        # Sampling knobs are meaningless for greedy decoding, so only send
        # them when sampling. top_k=0 is forwarded as-is (filter disabled)
        # instead of being dropped in favour of the model's default.
        generate_kwargs["temperature"] = temperature
        generate_kwargs["top_p"] = top_p
        generate_kwargs["top_k"] = max(int(top_k), 0)
    if repetition_penalty > 0:
        # Non-positive penalties are rejected by generate(); skip them.
        generate_kwargs["repetition_penalty"] = repetition_penalty
    # Drop unset values (e.g. a tokenizer without a pad token).
    generate_kwargs = {k: v for k, v in generate_kwargs.items() if v is not None}

    errors = []

    def _generate():
        # Runs in the worker thread. On failure, close the stream so the
        # consumer loop below terminates instead of blocking forever.
        try:
            model.generate(**generate_kwargs)
        except Exception as exc:
            errors.append(exc)
            streamer.end()

    # Generation must run in a background thread so chunks can be consumed
    # here while it is still in progress.
    thread = Thread(target=_generate)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        # Stop early if the end marker leaked into the decoded text.
        if "<|endoftext|>" in buffer:
            yield buffer.split("<|endoftext|>")[0].strip()
            return
        yield buffer

    thread.join()
    if errors:
        raise errors[0]
# ============================================================
# GRADIO INTERFACE
# ============================================================
# Default text of the "System Prompt" box.
_SYSTEM_PROMPT = "You are a virtuosic multilingual poet, versatile literary translator, computational linguist, philologist, and singer-songwriter. Translate the below Russian poem, song lyric, or fragment by rigorously discerning its most holistically representative English adaptation possible, faithfully reproducing every source line's lyrical, semantic, formal, poetic, symbolic, cultural, rhythmic, idiomatic, & phonetic features. Fully conveying source meanings, results must also exhibit full fidelity to its meter, imagery, rhyme scheme, tone, and nuances of style and wordplay. \nCrucially, to translate Russian syllabotonic verse, start by analyzing source scansion, rhythm, rhyme, and phonetic patterns to map the adaptation's formal constraints.\nThus, before translating, silently discern:\n1. Each source line's SYLLABLE COUNT\n2. Each line's likeliest syllable STRESS PATTERN (For ex.: if (/) = stressed, (x) = unstressed, then '/ x x / x x / x x / x x' is the stress pattern for the line 'Двигаю ручками, двигаю ножками' and its adaptation 'Moving my handy-arms, leg-footies motioning')\n3.Its METER (dactylic (/ x x) in our example lines, other meters include amphibrachic (x / x), anapestic (x x /), trochaic (/ x), iambic (x /), etc)\n4. Its FOOT (which may be tetrameter (our examples) or dimeter, pentameter, hexameter, etc)\n5. RHYME SCHEME of entire verse (if 1st/3rd lines & 2nd/4th lines rhyme = ABAB scheme, etc)\n6.Other FEATURES, formal or idiomatic or literary (tone, speaker, theme, imagery, meaning, allusions).\n7. Phonetic/musical character of source lines.\nFinally, devise a natural-sounding English adaptation faithful to source meanings and other features alike. Do not add anything besides the translation. Do not explain process. Present only the translation. Переведи на Английский. Translate the following Russian source text to English.\nRussian: "

# Clickable example prompts offered by the chat interface.
_EXAMPLES = [
    ["Translate the following song, adapting all formal and poetic characteristics (like meter, tone, mood, rhyme-scheme, and syllable patterns): // На карте кружком обозначено солнце, Пунктирные линии, ветер и соль, Под веками плавают сны на оконце, Спадает волнами зеленая боль, На наших глазах исчезают потери; Душа выпускает скопившийся страх; Я слышу шаги, открываются двери И смерть исчезает на наших глазах; На наших глазах; На наших глазах"],
    ["Translate these song lyrics into English: Иду я на верёвочке вздыхаю на ходу / Доска моя кончается сейчас я упаду / Под ноги под колёса под тяжёлый молоток / Всё с молотка / О, продана смерть моя…"],
    ["Translate: Мы вышли за рамки людских представлений / И даже представить себе не могли / Что выше всех горестей, бед и мучений / Мы будем под слоем промёрзшей земли."],
]

# The chatbot widget is created up front and handed to ChatInterface below.
chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)

with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_classes="duplicate-button",
    )
    # The accordion is constructed before the input widgets, matching the
    # order in which these components were originally created. The slider
    # defaults mirror stream_chat's keyword defaults.
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        cache_examples=False,
        examples=_EXAMPLES,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Dials", open=True),
        additional_inputs=[
            gr.Textbox(value=_SYSTEM_PROMPT, label="System Prompt"),
            gr.Slider(minimum=0, maximum=2, step=0.01, value=0.1, label="Temperature"),
            gr.Slider(minimum=128, maximum=8192, step=1, value=512, label="Max new tokens"),
            gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.8, label="top_p"),
            gr.Slider(minimum=0, maximum=50, step=1, value=20, label="top_k"),
            gr.Slider(minimum=0.0, maximum=4.0, step=0.1, value=1.8, label="Repetition penalty"),
        ],
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.queue(default_concurrency_limit=None)
    demo.launch(
        share=True,
        css=css,
        theme=gr.themes.Glass(),
        mcp_server=True,
    )