# Worldish_Poet / app5.py
# AlekseyCalvin's picture
# Rename app.py to app5.py
# 1876d6f verified
import os
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
from huggingface_hub import login
import gradio as gr
from threading import Thread
# Hub repository id of the checkpoint served by this app.
MODEL = "AlekseyCalvin/Lyrical_Llama31_8B_ru2en_SFT"

# Optional dependency: nothing in this module uses it yet, but the name is
# kept bound (to None when the package is missing) for future iteration.
try:
    import charactertokenizer
except ImportError:
    charactertokenizer = None

# Log in to the Hugging Face Hub only when a non-empty token is present in
# the environment; otherwise no login call is made.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)
# Static markup used by the interface. Each constant is assembled from its
# individual lines; the leading and trailing empty entries reproduce the
# newline that opens and closes each block of text.

# Page header HTML.
TITLE = "\n".join(
    (
        "",
        "<h1><center>LYRICAL Machine Translation Russian2English Model Testing Hall </center></h1>",
        "<center>",
        "<p>The model is licensed under apache 2.0</p>",
        "</center>",
        "",
    )
)

# HTML passed to the chatbot as its placeholder.
PLACEHOLDER = "\n".join(
    (
        "",
        "<center>",
        "<p>Prototyping an LLM for song/poem translation fluently adaptive to complexly dimensioned fidelity constraints.</p>",
        "</center>",
        "",
    )
)

# Stylesheet: duplicate-button appearance and centred h3 headings.
css = "\n".join(
    (
        "",
        ".duplicate-button {",
        "margin: auto !important;",
        "color: white !important;",
        "background: black !important;",
        "border-radius: 100vh !important;",
        "}",
        "h3 {",
        "text-align: center;",
        "}",
        "",
    )
)
# Load the tokenizer published with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

# bitsandbytes 4-bit quantization settings. The quantization data type
# selected below is "nf4" (NormalFloat4) with double quantization -- not
# FP4 / NVFP4 -- and matmuls are computed in bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized model onto the GPU. The attention implementation is
# the Hub kernel repository "kernels-community/flash-attn3" (a
# FlashAttention-3 kernel, not Sage Attention); the log line below names
# what is actually configured.
print("Loading model with NF4 4-bit quantization and flash-attn3 attention kernels...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    quantization_config=nf4_config,
    device_map="cuda",
    trust_remote_code=True,
    use_kernels=True,
    attn_implementation="kernels-community/flash-attn3",  # Hub kernel repo id
)
print(f"Model loaded on {model.device}, dtype: {model.dtype}")
# ============================================================
# CHAT LOGIC
# ============================================================
def _message_text(content):
    """Flatten one chat message's ``content`` into plain text.

    Plain strings pass through unchanged. Structured content (a list of
    parts, or a single part dict carrying a ``"text"`` entry) is reduced to
    its text parts joined by newlines; parts without text are dropped.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, (list, tuple)):
        pieces = (_message_text(part) for part in content)
        return "\n".join(piece for piece in pieces if piece)
    return str(content)


def _run_generation(generate_kwargs, streamer, failures):
    """Run ``model.generate`` in the worker thread.

    If generation raises, the exception is recorded in ``failures`` and the
    streamer is ended so the consumer loop stops instead of waiting forever.
    """
    try:
        model.generate(**generate_kwargs)
    except Exception as exc:  # boundary of the worker thread; re-raised by the caller
        failures.append(exc)
        streamer.end()


@spaces.GPU
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    temperature: float = 0.1,
    max_new_tokens: int = 512,
    top_p: float = 0.8,
    top_k: int = 20,
    repetition_penalty: float = 1.8,
):
    """Stream the model's reply to ``message`` as a growing string.

    Args:
        message: The new user message.
        history: Earlier turns as a list of ``{"role", "content"}`` dicts
            (extra keys are ignored). ``None`` or an empty list means no
            earlier turns.
        system_prompt: Text of the system message placed first.
        temperature: Sampling temperature; ``0`` switches to greedy decoding.
        max_new_tokens: Upper bound on generated tokens.
        top_p: Nucleus-sampling threshold (only forwarded when sampling).
        top_k: Top-k cutoff (only forwarded when sampling and ``> 0``).
        repetition_penalty: Repetition penalty forwarded to ``generate``.

    Yields:
        The reply accumulated so far, once per streamed text chunk.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here after streaming stops.
    """
    print(f"Message: {message}")

    # System prompt first, then earlier turns, then the new user message.
    # Only role and text are forwarded to the template: history entries may
    # carry extra keys or structured (list-of-parts) content depending on
    # the Gradio version, which a text-only chat template cannot render.
    conversation = [{"role": "system", "content": system_prompt}]
    for turn in history or []:
        conversation.append(
            {"role": turn["role"], "content": _message_text(turn.get("content"))}
        )
    conversation.append({"role": "user", "content": message})

    # Render with whatever chat template the tokenizer ships.
    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # The rendered prompt usually already starts with the BOS token; letting
    # the tokenizer add special tokens again would then duplicate it.
    bos_token = tokenizer.bos_token
    prompt_has_bos = bool(bos_token) and prompt.startswith(bos_token)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=not prompt_has_bos,
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    # Fall back to EOS for padding when the tokenizer defines no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    sampling = temperature > 0
    generate_kwargs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": int(max_new_tokens),
        "do_sample": sampling,
        "repetition_penalty": repetition_penalty,
        "streamer": streamer,
        "pad_token_id": pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "use_cache": True,
    }
    if sampling:
        # Sampling-only knobs are meaningless for greedy decoding, so they
        # are forwarded only when sampling is enabled.
        generate_kwargs["temperature"] = temperature
        generate_kwargs["top_p"] = top_p
        # NOTE(review): as before, top_k == 0 is omitted rather than passed,
        # so generate() uses the model's generation-config value -- confirm
        # whether 0 was meant to disable top-k filtering.
        if top_k > 0:
            generate_kwargs["top_k"] = int(top_k)
    # Drop unset values (e.g. a missing EOS id) instead of passing None.
    generate_kwargs = {k: v for k, v in generate_kwargs.items() if v is not None}

    failures = []
    worker = Thread(
        target=_run_generation,
        args=(generate_kwargs, streamer, failures),
    )
    worker.start()

    buffer = ""
    try:
        for new_text in streamer:
            buffer += new_text
            # skip_special_tokens removes most markers; cut at a literal
            # end-of-text marker should one still appear in the text.
            head, marker, _ = buffer.partition("<|endoftext|>")
            if marker:
                buffer = head.strip()
                yield buffer
                break
            yield buffer
    finally:
        # Do not return while generation is still running on the model.
        worker.join()

    if failures:
        raise failures[0]
# ============================================================
# GRADIO INTERFACE
# ============================================================
# Chat display component; PLACEHOLDER is passed as its placeholder content.
chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
# The Blocks container is created without css or theme arguments; both are
# passed to demo.launch() in the __main__ guard instead.
with gr.Blocks() as demo:
    # Page header followed by the duplicate button (styled via the
    # "duplicate-button" class defined in the css constant).
    gr.HTML(TITLE)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
    # Chat UI driven by stream_chat. The additional inputs are listed in the
    # same order as stream_chat's parameters after (message, history):
    # system_prompt, temperature, max_new_tokens, top_p, top_k,
    # repetition_penalty. Their initial values match that function's defaults.
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Dials", open=True),
        additional_inputs=[
            # System prompt: editable translation instructions.
            gr.Textbox(
                value="You are a virtuosic multilingual poet, versatile literary translator, computational linguist, philologist, and singer-songwriter. Translate the below Russian poem, song lyric, or fragment by rigorously discerning its most holistically representative English adaptation possible, faithfully reproducing every source line's lyrical, semantic, formal, poetic, symbolic, cultural, rhythmic, idiomatic, & phonetic features. Fully conveying source meanings, results must also exhibit full fidelity to its meter, imagery, rhyme scheme, tone, and nuances of style and wordplay. \nCrucially, to translate Russian syllabotonic verse, start by analyzing source scansion, rhythm, rhyme, and phonetic patterns to map the adaptation's formal constraints.\nThus, before translating, silently discern:\n1. Each source line's SYLLABLE COUNT\n2. Each line's likeliest syllable STRESS PATTERN (For ex.: if (/) = stressed, (x) = unstressed, then '/ x x / x x / x x / x x' is the stress pattern for the line 'Двигаю ручками, двигаю ножками' and its adaptation 'Moving my handy-arms, leg-footies motioning')\n3.Its METER (dactylic (/ x x) in our example lines, other meters include amphibrachic (x / x), anapestic (x x /), trochaic (/ x), iambic (x /), etc)\n4. Its FOOT (which may be tetrameter (our examples) or dimeter, pentameter, hexameter, etc)\n5. RHYME SCHEME of entire verse (if 1st/3rd lines & 2nd/4th lines rhyme = ABAB scheme, etc)\n6.Other FEATURES, formal or idiomatic or literary (tone, speaker, theme, imagery, meaning, allusions).\n7. Phonetic/musical character of source lines.\nFinally, devise a natural-sounding English adaptation faithful to source meanings and other features alike. Do not add anything besides the translation. Do not explain process. Present only the translation. Переведи на Английский. Translate the following Russian source text to English.\nRussian: ",
                label="System Prompt",
            ),
            # Temperature: 0 makes stream_chat use greedy decoding.
            gr.Slider(minimum=0, maximum=2, step=0.01, value=0.1, label="Temperature"),
            gr.Slider(minimum=128, maximum=8192, step=1, value=512, label="Max new tokens"),
            gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.8, label="top_p"),
            # top_k: stream_chat does not forward a value of 0 to generate().
            gr.Slider(minimum=0, maximum=50, step=1, value=20, label="top_k"),
            gr.Slider(minimum=0.0, maximum=4.0, step=0.1, value=1.8, label="Repetition penalty"),
        ],
        # Sample Russian inputs; each example supplies only the message text.
        examples=[
            ["Translate the following song, adapting all formal and poetic characteristics (like meter, tone, mood, rhyme-scheme, and syllable patterns): // На карте кружком обозначено солнце, Пунктирные линии, ветер и соль, Под веками плавают сны на оконце, Спадает волнами зеленая боль, На наших глазах исчезают потери; Душа выпускает скопившийся страх; Я слышу шаги, открываются двери И смерть исчезает на наших глазах; На наших глазах; На наших глазах"],
            ["Translate these song lyrics into English: Иду я на верёвочке вздыхаю на ходу / Доска моя кончается сейчас я упаду / Под ноги под колёса под тяжёлый молоток / Всё с молотка / О, продана смерть моя…"],
            ["Translate: Мы вышли за рамки людских представлений / И даже представить себе не могли / Что выше всех горестей, бед и мучений / Мы будем под слоем промёрзшей земли."],
        ],
        cache_examples=False,
    )
if __name__ == "__main__":
    # Enable the request queue; None is passed explicitly as the default
    # per-event concurrency limit.
    demo.queue(default_concurrency_limit=None)
    # Styling (css, theme) is supplied at launch time rather than on the
    # Blocks constructor; the launch parameter set is kept exactly as given.
    launch_options = {
        "share": True,
        "css": css,
        "theme": gr.themes.Glass(),
        "mcp_server": True,
    }
    demo.launch(**launch_options)