Spaces:

AlekseyCalvin
/

Worldish_Poet

Sleeping

App Files Files Community

Worldish_Poet / app5.py

AlekseyCalvin

Rename app.py to app5.py

1876d6f verified 4 months ago

raw

history blame contribute delete

8.77 kB

	import os
	import spaces
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
	from huggingface_hub import login
	import gradio as gr
	from threading import Thread

	# Optional dependency: nothing below uses it yet, but the name is kept
	# defined (module or None) so later iterations can feature-test it.
	try:
	    import charactertokenizer
	except ImportError:
	    charactertokenizer = None

	# Hub repository id of the fine-tuned checkpoint served by this app.
	MODEL = "AlekseyCalvin/Lyrical_Llama31_8B_ru2en_SFT"

	# Log in to the Hugging Face Hub only when a token is present in the
	# environment; without one the login call is skipped entirely.
	HF_TOKEN = os.getenv("HF_TOKEN")
	if HF_TOKEN:
	    login(token=HF_TOKEN)

	# HTML banner shown at the top of the page (rendered with gr.HTML(TITLE)).
	TITLE = """
	<h1><center>LYRICAL Machine Translation Russian2English Model Testing Hall </center></h1>
	<center>
	<p>The model is licensed under apache 2.0</p>
	</center>
	"""

	# HTML passed as the `placeholder` of the gr.Chatbot component.
	PLACEHOLDER = """
	<center>
	<p>Prototyping an LLM for song/poem translation fluently adaptive to complexly dimensioned fidelity constraints.</p>
	</center>
	"""

	# Custom stylesheet handed to demo.launch(css=...). The `.duplicate-button`
	# rule matches the elem_classes value set on the gr.DuplicateButton; the
	# `h3` rule centers third-level headings.
	css = """
	.duplicate-button {
	margin: auto !important;
	color: white !important;
	background: black !important;
	border-radius: 100vh !important;
	}
	h3 {
	text-align: center;
	}
	"""

	# Load the tokenizer published alongside the checkpoint.
	tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

	# bitsandbytes 4-bit quantization using the NF4 data type (this is NF4,
	# not FP4: see bnb_4bit_quant_type), with double quantization enabled and
	# bfloat16 as the compute dtype.
	nf4_config = BitsAndBytesConfig(
	    load_in_4bit=True,
	    bnb_4bit_quant_type="nf4",
	    bnb_4bit_use_double_quant=True,
	    bnb_4bit_compute_dtype=torch.bfloat16,
	)

	# Load the quantized model on the GPU. The attention implementation is
	# the "kernels-community/flash-attn3" Hub kernel (FlashAttention-3); the
	# earlier comments and log line called this "Sage Attention", which is a
	# different kernel from the one configured here.
	print("Loading model with NF4 quantization and FlashAttention-3 kernels...")
	model = AutoModelForCausalLM.from_pretrained(
	    MODEL,
	    quantization_config=nf4_config,
	    device_map="cuda",
	    trust_remote_code=True,
	    use_kernels=True,
	    attn_implementation="kernels-community/flash-attn3",
	)

	print(f"Model loaded on {model.device}, dtype: {model.dtype}")

	# ============================================================
	# CHAT LOGIC
	# ============================================================

	def _content_to_text(content):
	    """Return the plain text of one chat message's content, or None.

	    Accepts a plain string, or a list of parts where each part is either a
	    string or a dict carrying a string under "text". Parts without text
	    (for example file attachments) are ignored.
	    """
	    if isinstance(content, str):
	        return content
	    if isinstance(content, (list, tuple)):
	        parts = []
	        for part in content:
	            if isinstance(part, str):
	                parts.append(part)
	            elif isinstance(part, dict) and isinstance(part.get("text"), str):
	                parts.append(part["text"])
	        return "\n".join(parts) if parts else None
	    return None


	def _history_to_messages(history):
	    """Reduce chat history to a list of {"role", "content"} text messages.

	    Handles dict messages (string content or a list of content parts) and
	    legacy (user, assistant) pairs. Extra keys are dropped and entries
	    without any text are skipped.
	    """
	    messages = []
	    for turn in history or []:
	        if isinstance(turn, dict):
	            text = _content_to_text(turn.get("content"))
	            if turn.get("role") and text is not None:
	                messages.append({"role": turn["role"], "content": text})
	        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
	            for role, content in zip(("user", "assistant"), turn):
	                text = _content_to_text(content)
	                if text is not None:
	                    messages.append({"role": role, "content": text})
	    return messages


	@spaces.GPU
	def stream_chat(
	    message: str,
	    history: list,
	    system_prompt: str,
	    temperature: float = 0.1,
	    max_new_tokens: int = 512,
	    top_p: float = 0.8,
	    top_k: int = 20,
	    repetition_penalty: float = 1.8,
	):
	    """Stream the model's reply to ``message`` as it is generated.

	    Args:
	        message: The current user message.
	        history: Earlier turns of the conversation as supplied by Gradio.
	        system_prompt: Text used as the leading system message.
	        temperature: Sampling temperature; 0 selects greedy decoding.
	        max_new_tokens: Upper bound on the number of generated tokens.
	        top_p: Nucleus-sampling threshold (sampling mode only).
	        top_k: Top-k cutoff (sampling mode only); 0 leaves the key unset.
	        repetition_penalty: Repetition penalty; non-positive values leave
	            the key unset instead of being forwarded to ``generate``.

	    Yields:
	        The cumulative reply text produced so far.
	    """
	    print(f"Message: {message}")

	    # System prompt first, then prior turns, then the new user message.
	    # NOTE(review): newer Gradio releases appear to deliver message content
	    # as a list of typed parts rather than a string; the helper flattens
	    # both shapes to plain text so the chat template can render them.
	    conversation = [{"role": "system", "content": system_prompt}]
	    conversation.extend(_history_to_messages(history))
	    conversation.append({"role": "user", "content": message})

	    # Render the conversation with the tokenizer's own chat template.
	    prompt = tokenizer.apply_chat_template(
	        conversation,
	        tokenize=False,
	        add_generation_prompt=True,
	    )

	    # If the rendered prompt already starts with BOS, tokenizing with the
	    # default special tokens would prepend a second one.
	    bos_token = getattr(tokenizer, "bos_token", None)
	    prompt_has_bos = bool(bos_token) and prompt.startswith(bos_token)
	    inputs = tokenizer(
	        prompt,
	        return_tensors="pt",
	        add_special_tokens=not prompt_has_bos,
	    ).to(model.device)

	    streamer = TextIteratorStreamer(
	        tokenizer,
	        skip_prompt=True,
	        skip_special_tokens=True,
	    )

	    sampling = temperature > 0
	    generate_kwargs = {
	        "input_ids": inputs["input_ids"],
	        "attention_mask": inputs["attention_mask"],
	        "max_new_tokens": int(max_new_tokens),
	        "do_sample": sampling,
	        "streamer": streamer,
	        "pad_token_id": tokenizer.pad_token_id,
	        "eos_token_id": tokenizer.eos_token_id,
	        "use_cache": True,
	    }
	    # Sampling-only knobs are passed only when sampling; in greedy mode
	    # they have no effect and only trigger warnings.
	    if sampling:
	        generate_kwargs["temperature"] = temperature
	        generate_kwargs["top_p"] = top_p
	        if top_k > 0:
	            generate_kwargs["top_k"] = int(top_k)
	    # transformers rejects a non-positive (or non-float) repetition penalty.
	    # That error would be raised inside the worker thread and leave the
	    # streamer loop below waiting forever, so such values are not forwarded.
	    if repetition_penalty > 0:
	        generate_kwargs["repetition_penalty"] = float(repetition_penalty)

	    # Drop unset entries (e.g. a missing pad token id) to avoid warnings.
	    generate_kwargs = {k: v for k, v in generate_kwargs.items() if v is not None}

	    # generate() blocks, so it runs in a worker thread while this
	    # generator drains the streamer.
	    thread = Thread(target=model.generate, kwargs=generate_kwargs)
	    thread.start()

	    # The marker previously contained literal backslashes ("<\|...\|>")
	    # and therefore could never match the text it was meant to cut.
	    end_marker = "<|endoftext|>"
	    buffer = ""
	    for new_text in streamer:
	        buffer += new_text
	        head, found, _ = buffer.partition(end_marker)
	        if found:
	            # Emit everything before the marker once, then stop streaming.
	            yield head.strip()
	            return
	        yield buffer

	    # The streamer is exhausted, so generation is over; reap the worker.
	    thread.join()

	# ============================================================
	# GRADIO INTERFACE
	# ============================================================

	# Chat display component, created up front and handed to ChatInterface.
	chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)

	# gr.Blocks() is created without css/theme arguments; both are passed to
	# demo.launch() at the bottom instead.
	with gr.Blocks() as demo:
	    gr.HTML(TITLE)
	    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")

	    # The additional inputs are passed to stream_chat positionally after
	    # (message, history), so their order must match its signature:
	    # system_prompt, temperature, max_new_tokens, top_p, top_k,
	    # repetition_penalty.
	    gr.ChatInterface(
	        fn=stream_chat,
	        chatbot=chatbot,
	        fill_height=True,
	        additional_inputs_accordion=gr.Accordion(label="⚙️ Dials", open=True),
	        additional_inputs=[
	            gr.Textbox(
	                value="You are a virtuosic multilingual poet, versatile literary translator, computational linguist, philologist, and singer-songwriter. Translate the below Russian poem, song lyric, or fragment by rigorously discerning its most holistically representative English adaptation possible, faithfully reproducing every source line's lyrical, semantic, formal, poetic, symbolic, cultural, rhythmic, idiomatic, & phonetic features. Fully conveying source meanings, results must also exhibit full fidelity to its meter, imagery, rhyme scheme, tone, and nuances of style and wordplay. \nCrucially, to translate Russian syllabotonic verse, start by analyzing source scansion, rhythm, rhyme, and phonetic patterns to map the adaptation's formal constraints.\nThus, before translating, silently discern:\n1. Each source line's SYLLABLE COUNT\n2. Each line's likeliest syllable STRESS PATTERN (For ex.: if (/) = stressed, (x) = unstressed, then '/ x x / x x / x x / x x' is the stress pattern for the line 'Двигаю ручками, двигаю ножками' and its adaptation 'Moving my handy-arms, leg-footies motioning')\n3.Its METER (dactylic (/ x x) in our example lines, other meters include amphibrachic (x / x), anapestic (x x /), trochaic (/ x), iambic (x /), etc)\n4. Its FOOT (which may be tetrameter (our examples) or dimeter, pentameter, hexameter, etc)\n5. RHYME SCHEME of entire verse (if 1st/3rd lines & 2nd/4th lines rhyme = ABAB scheme, etc)\n6.Other FEATURES, formal or idiomatic or literary (tone, speaker, theme, imagery, meaning, allusions).\n7. Phonetic/musical character of source lines.\nFinally, devise a natural-sounding English adaptation faithful to source meanings and other features alike. Do not add anything besides the translation. Do not explain process. Present only the translation. Переведи на Английский. Translate the following Russian source text to English.\nRussian: ",
	                label="System Prompt",
	            ),
	            gr.Slider(minimum=0, maximum=2, step=0.01, value=0.1, label="Temperature"),
	            gr.Slider(minimum=128, maximum=8192, step=1, value=512, label="Max new tokens"),
	            gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.8, label="top_p"),
	            gr.Slider(minimum=0, maximum=50, step=1, value=20, label="top_k"),
	            # Minimum is 0.1 rather than 0.0: transformers requires a
	            # strictly positive repetition penalty, so 0.0 was a selectable
	            # value that could only make generation fail.
	            gr.Slider(minimum=0.1, maximum=4.0, step=0.1, value=1.8, label="Repetition penalty"),
	        ],
	        # Each example supplies only the message; the additional inputs
	        # are not given per-example values.
	        examples=[
	            ["Translate the following song, adapting all formal and poetic characteristics (like meter, tone, mood, rhyme-scheme, and syllable patterns): // На карте кружком обозначено солнце, Пунктирные линии, ветер и соль, Под веками плавают сны на оконце, Спадает волнами зеленая боль, На наших глазах исчезают потери; Душа выпускает скопившийся страх; Я слышу шаги, открываются двери И смерть исчезает на наших глазах; На наших глазах; На наших глазах"],
	            ["Translate these song lyrics into English: Иду я на верёвочке вздыхаю на ходу / Доска моя кончается сейчас я упаду / Под ноги под колёса под тяжёлый молоток / Всё с молотка / О, продана смерть моя…"],
	            ["Translate: Мы вышли за рамки людских представлений / И даже представить себе не могли / Что выше всех горестей, бед и мучений / Мы будем под слоем промёрзшей земли."],
	        ],
	        cache_examples=False,
	    )

	if __name__ == "__main__":
	    # NOTE(review): default_concurrency_limit=None presumably lifts the
	    # default per-event concurrency cap - confirm against the Gradio docs.
	    demo.queue(default_concurrency_limit=None)
	    # Styling (css, theme) is applied here at launch time, alongside the
	    # share link and MCP server flags.
	    demo.launch(share=True, css=css, theme=gr.themes.Glass(), mcp_server=True)