File size: 5,656 Bytes
6e82314
4f711b0
0673a12
2cbdbe8
937d274
e3ea79a
0673a12
7fcb72d
 
4bf4a35
4f711b0
3f4ce15
2cbdbe8
 
 
d78035c
8d2936e
 
 
937d274
8d2936e
937d274
8d2936e
 
2cbdbe8
abe4cc3
b638223
 
 
 
2cbdbe8
9f79fe4
2454010
9f79fe4
 
 
 
 
 
3f4ce15
6c4916c
 
 
 
 
9f79fe4
7fcb72d
 
 
4bf4a35
d78035c
 
 
 
c0af0d6
d78035c
0673a12
 
 
4f711b0
 
2cbdbe8
6c05290
d78035c
 
6fc92b9
2cbdbe8
e3ea79a
9f79fe4
4f711b0
d78035c
e3ea79a
 
 
 
6fc92b9
e3ea79a
6c4916c
e3ea79a
6c4916c
 
 
 
 
 
 
e3ea79a
937d274
e3ea79a
 
 
6c4916c
e3ea79a
9f79fe4
6c4916c
 
6fc92b9
c0af0d6
8d2936e
6fc92b9
b638223
e3ea79a
6c05290
e3ea79a
9f79fe4
e3ea79a
4f711b0
d78035c
abe4cc3
2cbdbe8
6fc92b9
4f711b0
e3ea79a
937d274
 
2cbdbe8
9f79fe4
 
7fcb72d
9f79fe4
1cf6170
7fcb72d
1cf6170
a539a2e
 
 
d78035c
a539a2e
e3ea79a
 
1cf6170
 
7fcb72d
e3ea79a
7fcb72d
1cf6170
3f4ce15
99c05e6
8d2936e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os
import time
import warnings
import re
import gc

# Silence noisy torch._utils UserWarnings before the torch-dependent imports.
# BUG FIX: the API is warnings.filterwarnings (all lowercase); the original
# warnings.filterWarnings raised AttributeError at import time.
warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import gradio as gr
import psutil

# Print system resources for debugging
def print_system_resources():
    """Log CPU/memory usage and the container memory limit to stdout.

    Reads the cgroup memory limit (v1 path first, then v2) so the numbers
    are meaningful inside Docker; falls back to 16 GB (the Hugging Face
    Spaces free-tier cap used throughout this script) when no limit can
    be read.
    """
    memory = psutil.virtual_memory()
    cpu_percent = psutil.cpu_percent(interval=1)  # 1-second sampling window
    mem_limit = 16.0  # Fallback for HFS free
    # cgroup v1 and v2 expose the limit at different paths; on v2 an
    # unlimited cgroup reads "max", which fails int() and falls through.
    for limit_path in ('/sys/fs/cgroup/memory/memory.limit_in_bytes',
                       '/sys/fs/cgroup/memory.max'):
        try:
            with open(limit_path, 'r') as f:
                mem_limit = min(int(f.read().strip()) / 1e9, 16.0)  # Cap at 16GB for HFS free
            break
        except (OSError, ValueError):
            # Missing path or non-numeric content ("max"); try the next one.
            continue
    print(f"Total physical memory (psutil): {memory.total/1e9:.2f} GB")
    print(f"Container memory limit: {mem_limit:.2f} GB")
    print(f"CPU usage: {cpu_percent}%")
    print(f"Memory usage: {min(memory.used / (mem_limit * 1e9) * 100, 100):.1f}% ({memory.used/1e9:.2f}/{mem_limit:.2f} GB)")
    print(f"Active processes: {len(psutil.pids())}")

# Print Gradio version for debugging
print(f"Gradio version: {gr.__version__}")

# Load model and tokenizer
model_id = "NlpHUST/gpt2-vietnamese"
try:
    tokenizer = GPT2Tokenizer.from_pretrained(model_id)
    model = GPT2LMHeadModel.from_pretrained(model_id)
except Exception as e:
    print(f"Error loading model: {e}")
    # Bare raise preserves the original traceback; `raise e` rewrote it.
    raise

# Set pad_token_id to eos_token_id if not set
# (GPT-2 ships with no pad token; batched/padded generation needs one)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.eos_token_id

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference mode: disables dropout etc.

# Apply quantization to reduce memory and speed up
# NOTE(review): dynamic qint8 quantization targets CPU execution; if a CUDA
# device was selected above, the quantized Linear layers may not actually
# run on the GPU — confirm on a GPU host.
model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
print(f"Model quantized: {model.__class__.__name__}")

# Print device and memory info for debugging
print(f"Device: {device}")
print(f"Memory allocated: {torch.cuda.memory_allocated(device)/1e9:.2f} GB" if torch.cuda.is_available() else "CPU only")
print_system_resources()

def clean_text(text):
    """Strip disallowed characters and collapse whitespace runs.

    Keeps word characters, whitespace, basic punctuation (.,!?) and
    Vietnamese diacritic letters; everything else is removed. Runs of
    whitespace become a single space and the result is trimmed.
    """
    disallowed = r'[^\w\s.,!?àáâãèéêìíòóôõùúýăđĩũơưạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹ]'
    stripped = re.sub(disallowed, '', text)
    return re.sub(r'\s+', ' ', stripped).strip()

def generate_text(prompt, temperature=0.5, max_new_tokens=30):
    """Generate Vietnamese text from *prompt* with the module-level GPT-2 model.

    Args:
        prompt: Input text to continue (truncated to 50 tokens on encode).
        temperature: Sampling temperature (the UI exposes 0.3-0.7).
        max_new_tokens: Upper bound on newly generated tokens.

    Returns:
        The cleaned generated text, or an error-description string so the
        Gradio UI shows the failure instead of crashing.
    """
    try:
        start_time = time.time()
        print_system_resources()
        # Fixed parameters
        max_length = 50
        top_k = 20
        repetition_penalty = 1.2
        # Log parameters
        print(f"Parameters: max_length={max_length}, temperature={temperature}, max_new_tokens={max_new_tokens}, top_k={top_k}, repetition_penalty={repetition_penalty}")
        # Encode input with attention mask
        encode_time = time.time()
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length
        ).to(device)
        print(f"Encoding time: {time.time() - encode_time:.2f} seconds")
        print(f"Input tokens: {len(inputs['input_ids'][0])}")
        # Define EOS token IDs for '.', '!', '?'
        # NOTE(review): assumes each punctuation mark encodes with its token
        # first in the sequence for this tokenizer — confirm if the tokenizer
        # is ever swapped.
        eos_token_ids = [tokenizer.encode(s)[0] for s in ['.', '!', '?']]
        print(f"EOS token IDs: {eos_token_ids}")
        # Generate text
        gen_time = time.time()
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=int(max_new_tokens),
            min_length=3,
            do_sample=True,
            top_k=int(top_k),
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=eos_token_ids
        )
        print(f"Generation time: {time.time() - gen_time:.2f} seconds")
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Raw output: {generated_text}")
        print(f"Generated token count: {len(outputs[0])}")
        cleaned_text = clean_text(generated_text)
        print(f"Cleaned output: {cleaned_text}")
        elapsed_time = time.time() - start_time
        print(f"Total time: {elapsed_time:.2f} seconds")
        return cleaned_text
    except Exception as e:
        # Surface the failure in the UI; also log it server-side.
        print(f"Error generating text: {e}")
        return f"Error generating text: {e}"
    finally:
        # Clear memory cache on both success and failure paths
        # (the original only collected on success).
        gc.collect()

# Gradio interface
# Wires generate_text to a simple web UI: one text box plus sliders for the
# two tunable generation knobs (temperature, max_new_tokens).
demo = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(
            label="Nhập văn bản đầu vào",
            placeholder="Viết gì đó bằng tiếng Việt...",
            value="Hôm nay là một ngày đẹp trời"  # Default text
        ),
        # Slider ranges mirror the documented speed/diversity trade-off.
        gr.Slider(0.3, 0.7, value=0.5, step=0.1, label="Nhiệt độ (Temperature, 0.3-0.5 cho tốc độ nhanh, 0.6-0.7 cho đa dạng hơn)"),
        gr.Slider(20, 50, value=30, step=5, label="Số token mới tối đa (max_new_tokens, 20-30 cho tốc độ nhanh, 40-50 cho câu dài hơn)")
    ],
    outputs="text",
    title="Sinh văn bản tiếng Việt",
    description="Dùng mô hình GPT-2 Vietnamese từ NlpHUST để sinh văn bản tiếng Việt. Chọn temperature 0.3-0.5 và max_new_tokens 20-30 để đạt thời gian <2 giây. Dùng temperature 0.6-0.7 và max_new_tokens 40-50 cho câu dài và đa dạng hơn.",
    allow_flagging="never"
)

if __name__ == "__main__":
    # 0.0.0.0 makes the server reachable from outside the container;
    # 7860 is the conventional Hugging Face Spaces port.
    demo.launch(server_name="0.0.0.0", server_port=7860)