VietCat committed
Commit 4f711b0 · Parent: 6c4916c

fix slow response

Files changed (1): app.py (+23 -6)
app.py CHANGED
@@ -1,10 +1,12 @@
 import os
+import time
 import warnings
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
 
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 import torch
 import gradio as gr
+import psutil
 
 # Load model and tokenizer
 model_id = "NlpHUST/gpt2-vietnamese"
@@ -29,8 +31,18 @@ model.eval()
 print(f"Device: {device}")
 print(f"Memory allocated: {torch.cuda.memory_allocated(device)/1e9:.2f} GB" if torch.cuda.is_available() else "CPU only")
 
-def generate_text(prompt, max_length=100, temperature=1.0):
+def print_system_resources():
+    cpu_percent = psutil.cpu_percent(interval=1)
+    memory = psutil.virtual_memory()
+    print(f"CPU usage: {cpu_percent}%")
+    print(f"Memory usage: {memory.percent}% ({memory.used/1e9:.2f}/{memory.total/1e9:.2f} GB)")
+
+# Call before generation
+print_system_resources()
+
+def generate_text(prompt, max_length=50, temperature=1.0):
     try:
+        start_time = time.time()
         # Encode input with attention mask
         inputs = tokenizer(
             prompt,
@@ -44,13 +56,18 @@ def generate_text(prompt, max_length=100, temperature=1.0):
         outputs = model.generate(
             input_ids=inputs["input_ids"],
             attention_mask=inputs["attention_mask"],
-            max_length=max_length,
+            max_new_tokens=30,  # Limit new tokens to reduce computation
             temperature=temperature,
             do_sample=True,
-            num_beams=1,
-            pad_token_id=tokenizer.pad_token_id
+            num_beams=3,  # Use beam search for faster generation
+            no_repeat_ngram_size=2,  # Prevent repetitive phrases
+            pad_token_id=tokenizer.pad_token_id,
+            early_stopping=True  # Stop when generation is complete
         )
-        return tokenizer.decode(outputs[0], skip_special_tokens=True)
+        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        elapsed_time = time.time() - start_time
+        print(f"Generation time: {elapsed_time:.2f} seconds")
+        return generated_text
     except Exception as e:
         return f"Error generating text: {e}"
 
@@ -59,7 +76,7 @@ demo = gr.Interface(
     fn=generate_text,
     inputs=[
         gr.Textbox(label="Nhập văn bản đầu vào", placeholder="Viết gì đó bằng tiếng Việt..."),
-        gr.Slider(20, 300, value=100, step=10, label="Độ dài tối đa"),
+        gr.Slider(20, 100, value=50, step=10, label="Độ dài tối đa"),
         gr.Slider(0.5, 1.5, value=1.0, step=0.1, label="Nhiệt độ (Temperature)")
     ],
     outputs="text",
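Two caveats on these generate() settings for anyone reading the diff: max_new_tokens is hardcoded to 30, so the "Độ dài tối đa" slider (passed in as max_length) no longer affects the output length, and num_beams=3 decodes three candidate sequences per step, so despite the inline comment it increases compute per token rather than reducing it; any speed-up here comes mostly from the lower token cap. A minimal sketch of an alternative, assuming the tokenizer and model objects already loaded in app.py, that keeps the cap tied to the slider and stays on single-beam sampling:

import torch

# Sketch only, not the committed code: honor the UI slider for the length
# cap and keep single-beam sampling (one forward pass per new token).
def generate_text(prompt, max_length=50, temperature=1.0):
    try:
        inputs = tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():  # no gradients needed at inference time
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=int(max_length),  # slider value, not a fixed 30
                temperature=temperature,
                do_sample=True,
                num_beams=1,  # single beam: cheapest decoding loop
                no_repeat_ngram_size=2,  # keep the anti-repetition guard
                pad_token_id=tokenizer.pad_token_id,
            )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error generating text: {e}"

early_stopping is dropped in this sketch because it only applies to beam search (num_beams > 1).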
 
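One more latency detail: psutil.cpu_percent(interval=1) blocks for a full second to take its sample, and as committed print_system_resources() runs once at import time, not before each generation as its comment suggests. If the intent is a per-request snapshot without adding a second of latency, a non-blocking variant (a sketch, not the committed behavior) would look like:

import psutil

# Prime the CPU counter once at startup; later non-blocking calls then report
# usage since the previous call instead of sleeping for a sampling interval.
psutil.cpu_percent(interval=None)

def print_system_resources():
    cpu_percent = psutil.cpu_percent(interval=None)  # non-blocking
    memory = psutil.virtual_memory()
    print(f"CPU usage: {cpu_percent}%")
    print(f"Memory usage: {memory.percent}% ({memory.used/1e9:.2f}/{memory.total/1e9:.2f} GB)")

Calling print_system_resources() at the top of generate_text would then log resources once per request rather than once at startup.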