import os
import time
import warnings
import re
import gc
warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import gradio as gr
import psutil
# Print system resources for debugging
def print_system_resources():
    memory = psutil.virtual_memory()
    cpu_percent = psutil.cpu_percent(interval=1)
    # Get container memory limit (for Docker)
    try:
        with open('/sys/fs/cgroup/memory/memory.limit_in_bytes', 'r') as f:
            mem_limit = min(int(f.read().strip()) / 1e9, 16.0)  # Cap at 16 GB for the Hugging Face Spaces free tier
    except Exception:
        mem_limit = 16.0  # Fallback for the Hugging Face Spaces free tier
    print(f"Total physical memory (psutil): {memory.total/1e9:.2f} GB")
    print(f"Container memory limit: {mem_limit:.2f} GB")
    print(f"CPU usage: {cpu_percent}%")
    print(f"Memory usage: {min(memory.used / (mem_limit * 1e9) * 100, 100):.1f}% ({memory.used/1e9:.2f}/{mem_limit:.2f} GB)")
    print(f"Active processes: {len(psutil.pids())}")
# Print Gradio version for debugging
print(f"Gradio version: {gr.__version__}")
# Load model and tokenizer
model_id = "NlpHUST/gpt2-vietnamese"
try:
    tokenizer = GPT2Tokenizer.from_pretrained(model_id)
    model = GPT2LMHeadModel.from_pretrained(model_id)
except Exception as e:
    print(f"Error loading model: {e}")
    raise
# Set pad_token_id to eos_token_id if not set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.eos_token_id
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
# Apply dynamic int8 quantization to the Linear layers to reduce memory use and speed up inference
model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
print(f"Model quantized: {model.__class__.__name__}")
# Print device and memory info for debugging
print(f"Device: {device}")
print(f"Memory allocated: {torch.cuda.memory_allocated(device)/1e9:.2f} GB" if torch.cuda.is_available() else "CPU only")
print_system_resources()
def clean_text(text):
"""Normalize text by removing invalid characters and extra spaces."""
text = re.sub(r'[^\w\s.,!?àáâãèéêìíòóôõùúýăđĩũơưạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹ]', '', text)
text = re.sub(r'\s+', ' ', text).strip()
return text
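# Illustrative example of the cleaning above (hypothetical input):
#   clean_text("Xin chào!!!   @#$") -> "Xin chào!!!"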
def generate_text(prompt, temperature=0.5, max_new_tokens=30):
    try:
        start_time = time.time()
        print_system_resources()
        # Fixed parameters
        max_length = 50
        top_k = 20
        repetition_penalty = 1.2
        # Log parameters
        print(f"Parameters: max_length={max_length}, temperature={temperature}, max_new_tokens={max_new_tokens}, top_k={top_k}, repetition_penalty={repetition_penalty}")
        # Encode input with attention mask
        encode_time = time.time()
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length
        ).to(device)
        print(f"Encoding time: {time.time() - encode_time:.2f} seconds")
        print(f"Input tokens: {len(inputs['input_ids'][0])}")
        # Define EOS token IDs for '.', '!', '?'
        eos_token_ids = [tokenizer.encode(s)[0] for s in ['.', '!', '?']]
        print(f"EOS token IDs: {eos_token_ids}")
        # Generate text
        gen_time = time.time()
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=int(max_new_tokens),
            min_length=3,
            do_sample=True,
            top_k=int(top_k),
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=eos_token_ids
        )
print(f"Generation time: {time.time() - gen_time:.2f} seconds")
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Raw output: {generated_text}")
print(f"Generated token count: {len(outputs[0])}")
cleaned_text = clean_text(generated_text)
print(f"Cleaned output: {cleaned_text}")
elapsed_time = time.time() - start_time
print(f"Total time: {elapsed_time:.2f} seconds")
# Clear memory cache
gc.collect()
return cleaned_text
except Exception as e:
return f"Error generating text: {e}"
# Gradio interface
demo = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(
            label="Nhập văn bản đầu vào",  # "Enter input text"
            placeholder="Viết gì đó bằng tiếng Việt...",  # "Write something in Vietnamese..."
            value="Hôm nay là một ngày đẹp trời"  # Default text: "Today is a beautiful day"
        ),
        # Label: "Temperature, 0.3-0.5 for faster generation, 0.6-0.7 for more variety"
        gr.Slider(0.3, 0.7, value=0.5, step=0.1, label="Nhiệt độ (Temperature, 0.3-0.5 cho tốc độ nhanh, 0.6-0.7 cho đa dạng hơn)"),
        # Label: "Maximum new tokens, 20-30 for faster generation, 40-50 for longer sentences"
        gr.Slider(20, 50, value=30, step=5, label="Số token mới tối đa (max_new_tokens, 20-30 cho tốc độ nhanh, 40-50 cho câu dài hơn)")
    ],
    outputs="text",
    title="Sinh văn bản tiếng Việt",  # "Vietnamese text generation"
    # Description: "Uses the GPT-2 Vietnamese model from NlpHUST to generate Vietnamese text.
    # Choose temperature 0.3-0.5 and max_new_tokens 20-30 to stay under ~2 seconds; use
    # temperature 0.6-0.7 and max_new_tokens 40-50 for longer, more varied sentences."
    description="Dùng mô hình GPT-2 Vietnamese từ NlpHUST để sinh văn bản tiếng Việt. Chọn temperature 0.3-0.5 và max_new_tokens 20-30 để đạt thời gian <2 giây. Dùng temperature 0.6-0.7 và max_new_tokens 40-50 cho câu dài và đa dạng hơn.",
    allow_flagging="never"
)
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
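# To try this locally (assuming the file is saved as app.py, the usual Spaces convention):
#   pip install torch transformers gradio psutil
#   python app.py   # the UI is then served at http://localhost:7860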