feliksius committed on
Commit
a50bc7d
·
verified ·
1 Parent(s): 66a22e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +231 -62
app.py CHANGED
@@ -1,4 +1,6 @@
 
1
  from fastapi import FastAPI, HTTPException
 
2
  from transformers import pipeline
3
  import langdetect
4
  import logging
@@ -7,39 +9,32 @@ from typing import Optional
7
  import re
8
  from functools import lru_cache
9
  import asyncio
 
 
 
 
 
 
10
 
11
  # Set environment variables for Hugging Face cache
12
- os.environ["HF_HOME"] = "/app/cache"
13
- os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
14
 
15
  # Environment configuration
16
- USE_8BIT = False
17
- try:
18
- import bitsandbytes # hanya untuk memastikan modul tersedia
19
- USE_8BIT = os.getenv("USE_QUANTIZATION", "0") == "1"
20
- except ImportError:
21
- USE_8BIT = False
22
-
23
- DEVICE = int(os.getenv("DEVICE", "-1")) # -1 for CPU, 0+ for GPU
24
  MAX_TEXT_LENGTH = int(os.getenv("MAX_TEXT_LENGTH", "5000"))
25
 
26
- app = FastAPI()
27
-
28
- # Configure logging with timestamp and level
29
  logging.basicConfig(
30
  format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
31
- level=logging.INFO,
32
- handlers=[
33
- logging.handlers.RotatingFileHandler("/app/logs/app.log", maxBytes=1000000, backupCount=1),
34
- logging.StreamHandler()
35
- ]
36
  )
37
  logger = logging.getLogger(__name__)
38
 
39
  # Map of supported language models
40
  MODEL_MAP = {
41
  "th": "Helsinki-NLP/opus-mt-th-en",
42
- "ja": "Helsinki-NLP/opus-mt-ja-en",
43
  "zh": "Helsinki-NLP/opus-mt-zh-en",
44
  "vi": "Helsinki-NLP/opus-mt-vi-en",
45
  }
@@ -47,19 +42,30 @@ MODEL_MAP = {
47
  # List of terms to protect from translation
48
  PROTECTED_TERMS = ["2030 Aspirations", "Griffith"]
49
 
50
- # Cache for translators to avoid reloading models unnecessarily
51
  translators = {}
52
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def get_translator(lang: str):
54
  """Load or retrieve cached translator for the given language."""
55
  if lang not in translators:
56
- logger.info(f"Loading model for {lang} from {MODEL_MAP[lang]}...")
57
  try:
58
  translators[lang] = pipeline(
59
  "translation",
60
  model=MODEL_MAP[lang],
61
- device=DEVICE,
62
- model_kwargs={"load_in_8bit": USE_8BIT}
63
  )
64
  logger.info(f"Model for {lang} loaded successfully.")
65
  except Exception as e:
@@ -69,30 +75,24 @@ def get_translator(lang: str):
69
 
70
  @lru_cache(maxsize=100)
71
  def detect_language(text: str) -> str:
72
- """Cached language detection to reduce overhead for repeated inputs."""
73
  try:
74
  detected_lang = langdetect.detect(text)
75
- logger.debug(f"langdetect raw result: '{detected_lang}' for text: '{text[:50]}...'")
76
  if detected_lang.startswith('zh'):
77
- logger.debug(f"Normalizing '{detected_lang}' to 'zh' for Mandarin.")
78
  return 'zh'
79
- final_lang = detected_lang if detected_lang in MODEL_MAP else "en"
80
- logger.debug(f"Final determined language: '{final_lang}'. (Based on raw detected: '{detected_lang}')")
81
- return final_lang
82
  except Exception as e:
83
- logger.warning(f"Language detection FAILED for text: '{text[:50]}...'. Error: {str(e)}. Defaulting to English.")
84
  return "en"
85
 
86
  def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
87
- """Replace protected terms with placeholders using regex for efficiency."""
88
  modified_text = text
89
  replacements = {}
90
  for i, term in enumerate(protected_terms):
91
  placeholder = f"__PROTECTED_{i}__"
92
  replacements[placeholder] = term
93
- modified_text = re.sub(r'\b' + re.escape(term) + r'\b', placeholder, modified_text)
94
- if replacements:
95
- logger.debug(f"Protected terms replaced: {replacements}")
96
  return modified_text, replacements
97
 
98
  def restore_terms(text: str, replacements: dict) -> str:
@@ -102,13 +102,24 @@ def restore_terms(text: str, replacements: dict) -> str:
102
  restored_text = restored_text.replace(placeholder, term)
103
  return restored_text
104
 
105
- @app.post("/translate")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  async def translate(text: str, source_lang_override: Optional[str] = None):
107
- """
108
- Translate text to English, preserving protected terms like '2030 Aspirations'.
109
- Automatically detects source language or uses override.
110
- """
111
- if not text:
112
  raise HTTPException(status_code=400, detail="Text input is required.")
113
 
114
  if len(text) > MAX_TEXT_LENGTH:
@@ -121,42 +132,200 @@ async def translate(text: str, source_lang_override: Optional[str] = None):
121
  # Determine source language
122
  if source_lang_override and source_lang_override in MODEL_MAP:
123
  source_lang = source_lang_override
124
- logger.debug(f"Source language overridden by user to: '{source_lang_override}'.")
125
  else:
126
- source_lang = await asyncio.to_thread(detect_language, text)
127
- logger.debug(f"Determined source language for translation: '{source_lang}'.")
128
 
129
  # If source language is English, return original text
130
  if source_lang == "en":
131
- logger.debug("Source language is English or unrecognized, returning original text.")
132
- return {"translated_text": text}
 
 
133
 
134
- # Get translator (lazy-loaded)
135
  translator = get_translator(source_lang)
136
- if not translator:
137
- logger.error(f"No translator found for language: '{source_lang}'.")
138
- raise HTTPException(
139
- status_code=400,
140
- detail=f"Translation not supported for language: {source_lang}."
141
- )
142
 
143
  # Protect terms before translation
144
  modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
145
- logger.debug(f"Text after protecting terms: '{modified_text[:50]}...'")
146
 
147
- # Perform translation in a thread to avoid blocking the event loop
148
- logger.debug(f"Translating text from '{source_lang}' to English...")
149
- result = await asyncio.to_thread(translator, modified_text, max_length=512, num_beams=4)
150
  translated_text = result[0]["translation_text"]
151
- logger.debug(f"Translation successful. Original: '{modified_text[:50]}...', Translated: '{translated_text[:50]}...'")
152
 
153
  # Restore protected terms
154
  final_text = restore_terms(translated_text, replacements)
155
- logger.debug(f"Final translated text with restored terms: '{final_text[:50]}...'")
156
 
157
- return {"translated_text": final_text}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  except HTTPException as e:
159
- raise e
160
  except Exception as e:
161
- logger.error(f"An unexpected error occurred during processing: {str(e)}", exc_info=True)
162
- raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
  from fastapi import FastAPI, HTTPException
3
+ from pydantic import BaseModel
4
  from transformers import pipeline
5
  import langdetect
6
  import logging
 
9
  import re
10
  from functools import lru_cache
11
  import asyncio
12
+ import threading
13
+ import time
14
+
15
+ # Create necessary directories
16
+ os.makedirs("./cache", exist_ok=True)
17
+ os.makedirs("./logs", exist_ok=True)
18
 
19
  # Set environment variables for Hugging Face cache
20
+ os.environ["HF_HOME"] = "./cache"
21
+ os.environ["TRANSFORMERS_CACHE"] = "./cache"
22
 
23
  # Environment configuration
24
+ DEVICE = -1 # Always use CPU for HF Spaces
 
 
 
 
 
 
 
25
  MAX_TEXT_LENGTH = int(os.getenv("MAX_TEXT_LENGTH", "5000"))
26
 
27
+ # Configure logging
 
 
28
  logging.basicConfig(
29
  format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
30
+ level=logging.INFO
 
 
 
 
31
  )
32
  logger = logging.getLogger(__name__)
33
 
34
  # Map of supported language models
35
  MODEL_MAP = {
36
  "th": "Helsinki-NLP/opus-mt-th-en",
37
+ "ja": "Helsinki-NLP/opus-mt-ja-en",
38
  "zh": "Helsinki-NLP/opus-mt-zh-en",
39
  "vi": "Helsinki-NLP/opus-mt-vi-en",
40
  }
 
42
  # List of terms to protect from translation
43
  PROTECTED_TERMS = ["2030 Aspirations", "Griffith"]
44
 
45
+ # Cache for translators
46
  translators = {}
47
 
48
+ # Pydantic models
49
class TranslationRequest(BaseModel):
    # JSON body accepted by the POST /translate endpoint.
    # (Documented with comments rather than a docstring so the generated
    # API schema is left untouched.)

    # Text to translate; forwarded unchanged to translate().
    text: str
    # Optional source-language code. translate() honours it only when it is
    # a key of MODEL_MAP; otherwise the language is auto-detected.
    source_lang_override: Optional[str] = None
52
+
53
class TranslationResponse(BaseModel):
    # Result returned by translate() and by the POST /translate endpoint.
    # (Documented with comments rather than a docstring so the generated
    # API schema is left untouched.)

    # English output; for text treated as English this is the input itself.
    translated_text: str
    # Language code that was used as the source (override or detected).
    source_language: Optional[str] = None
56
+
57
+ # FastAPI app
58
+ app = FastAPI(title="Translation Service API")
59
+
60
  def get_translator(lang: str):
61
  """Load or retrieve cached translator for the given language."""
62
  if lang not in translators:
63
+ logger.info(f"Loading model for {lang}...")
64
  try:
65
  translators[lang] = pipeline(
66
  "translation",
67
  model=MODEL_MAP[lang],
68
+ device=-1
 
69
  )
70
  logger.info(f"Model for {lang} loaded successfully.")
71
  except Exception as e:
 
75
 
76
@lru_cache(maxsize=100)
def detect_language(text: str) -> str:
    """Return the model-map language code for *text*, memoised per input.

    Any detected code beginning with ``zh`` is collapsed to ``"zh"``.
    Codes that are not keys of ``MODEL_MAP`` are reported as ``"en"``,
    and so is any input for which detection raises.
    """
    try:
        code = langdetect.detect(text)
        # Collapse every Chinese variant onto the single "zh" model key.
        if code.startswith('zh'):
            return 'zh'
        if code in MODEL_MAP:
            return code
        # Unsupported languages are treated like English (no translation).
        return "en"
    except Exception as err:
        # Best effort: a detection failure must not fail the request.
        logger.warning(f"Language detection failed: {str(err)}")
        return "en"
87
 
88
def protect_terms(text: str, protected_terms: list) -> tuple[str, dict]:
    """Swap each protected term in *text* for a placeholder token.

    Matching is case-insensitive. A term is only replaced when it is not
    glued to an ASCII letter, digit or underscore on either side, so
    ``Griffiths`` is left alone while ``Griffith大学`` is protected.

    Args:
        text: Source-language text about to be translated.
        protected_terms: Terms to shield. The term at index ``i`` maps to
            the placeholder ``__PROTECTED_{i}__``; empty terms are ignored.

    Returns:
        ``(modified_text, replacements)`` where ``replacements`` maps every
        placeholder to its canonical term (registered whether or not the
        term occurred in *text*), suitable for ``restore_terms``.
    """
    modified_text = text
    replacements = {}
    for i, term in enumerate(protected_terms):
        if not term:
            # An empty pattern would match at every position in the text.
            continue
        placeholder = f"__PROTECTED_{i}__"
        replacements[placeholder] = term
        # Why not r'\b': for str patterns \b is Unicode-aware, so CJK/Thai
        # characters count as word characters and a term written directly
        # next to them (the normal case in unspaced scripts) never matched.
        # ASCII-only lookarounds keep the "whole word" rule for Latin text
        # while treating neighbouring non-ASCII characters as boundaries.
        pattern = r'(?<![A-Za-z0-9_])' + re.escape(term) + r'(?![A-Za-z0-9_])'
        modified_text = re.sub(pattern, placeholder, modified_text, flags=re.IGNORECASE)
    return modified_text, replacements
97
 
98
  def restore_terms(text: str, replacements: dict) -> str:
 
102
  restored_text = restored_text.replace(placeholder, term)
103
  return restored_text
104
 
105
+ # FastAPI endpoints
106
@app.get("/")
async def root():
    # Static banner confirming that the API process is up.
    banner = {"message": "Translation Service API is running"}
    return banner
109
+
110
@app.get("/health")
async def health_check():
    # Report a fixed "healthy" status plus the language codes that have a
    # model configured in MODEL_MAP.
    languages = [code for code in MODEL_MAP]
    return {"status": "healthy", "supported_languages": languages}
113
+
114
@app.post("/translate", response_model=TranslationResponse)
async def translate_api(request: TranslationRequest):
    """API endpoint for translation."""
    # Thin HTTP adapter: unpack the request body and delegate to the shared
    # translate() coroutine, returning its result unchanged.
    outcome = await translate(
        text=request.text,
        source_lang_override=request.source_lang_override,
    )
    return outcome
118
+
119
+ # Core translation function
120
  async def translate(text: str, source_lang_override: Optional[str] = None):
121
+ """Core translation function used by both API and Gradio."""
122
+ if not text or not text.strip():
 
 
 
123
  raise HTTPException(status_code=400, detail="Text input is required.")
124
 
125
  if len(text) > MAX_TEXT_LENGTH:
 
132
  # Determine source language
133
  if source_lang_override and source_lang_override in MODEL_MAP:
134
  source_lang = source_lang_override
 
135
  else:
136
+ source_lang = detect_language(text)
 
137
 
138
  # If source language is English, return original text
139
  if source_lang == "en":
140
+ return TranslationResponse(
141
+ translated_text=text,
142
+ source_language=source_lang
143
+ )
144
 
145
+ # Get translator
146
  translator = get_translator(source_lang)
 
 
 
 
 
 
147
 
148
  # Protect terms before translation
149
  modified_text, replacements = protect_terms(text, PROTECTED_TERMS)
 
150
 
151
+ # Perform translation
152
+ result = translator(modified_text, max_length=512, num_beams=4)
 
153
  translated_text = result[0]["translation_text"]
 
154
 
155
  # Restore protected terms
156
  final_text = restore_terms(translated_text, replacements)
 
157
 
158
+ return TranslationResponse(
159
+ translated_text=final_text,
160
+ source_language=source_lang
161
+ )
162
+
163
+ except Exception as e:
164
+ logger.error(f"Translation error: {str(e)}")
165
+ raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
166
+
167
+ # Gradio interface functions
168
def translate_gradio(text: str, source_lang: str = "auto"):
    """Synchronous Gradio handler around the async ``translate`` coroutine.

    Args:
        text: Text typed into the UI; ``None``, empty or whitespace-only
            input yields a prompt message instead of a translation.
        source_lang: Language code chosen in the dropdown, or ``"auto"``
            to let ``translate`` detect the language.

    Returns:
        A ``(output_text, language_label)`` pair for the two output boxes.
        Failures are reported as ``("Error: ...", "Error")`` rather than
        raised, so the UI always receives displayable strings.
    """
    if not text or not text.strip():
        return "Please enter some text to translate.", "N/A"

    try:
        # "auto" means "no override": translate() then detects the language.
        source_lang_param = source_lang if source_lang != "auto" else None

        # asyncio.run() creates a fresh event loop, runs the coroutine and
        # always closes the loop afterwards. The previous hand-rolled
        # new_event_loop()/set_event_loop() pair never closed its loop and
        # left it installed as the calling thread's current loop, leaking
        # one loop per request. Like run_until_complete(), this must not be
        # called from a thread that already has a running loop.
        result = asyncio.run(translate(text, source_lang_param))

        return result.translated_text, result.source_language or "Unknown"

    except HTTPException as e:
        # Validation/translation errors raised by translate(); show detail.
        return f"Error: {e.detail}", "Error"
    except Exception as e:
        return f"Error: {str(e)}", "Error"
189
+
190
+ # Create Gradio interface
191
+ def create_gradio_interface():
192
+ with gr.Blocks(
193
+ title="Multi-Language Translation Service",
194
+ theme=gr.themes.Soft(),
195
+ css="""
196
+ .gradio-container {
197
+ max-width: 1200px !important;
198
+ }
199
+ """
200
+ ) as interface:
201
+
202
+ gr.Markdown("""
203
+ # 🌐 Multi-Language Translation Service
204
+
205
+ Translate text from **Thai**, **Japanese**, **Chinese**, or **Vietnamese** to **English**
206
+
207
+ ✨ Features: Automatic language detection • Protected terms preservation • Fast Helsinki-NLP models
208
+ """)
209
+
210
+ with gr.Row():
211
+ with gr.Column(scale=1):
212
+ text_input = gr.Textbox(
213
+ label="📝 Input Text",
214
+ placeholder="Enter text to translate...",
215
+ lines=6,
216
+ max_lines=10
217
+ )
218
+
219
+ with gr.Row():
220
+ lang_dropdown = gr.Dropdown(
221
+ choices=[
222
+ ("🔍 Auto-detect", "auto"),
223
+ ("🇹🇭 Thai", "th"),
224
+ ("🇯🇵 Japanese", "ja"),
225
+ ("🇨🇳 Chinese", "zh"),
226
+ ("🇻🇳 Vietnamese", "vi")
227
+ ],
228
+ value="auto",
229
+ label="Source Language"
230
+ )
231
+
232
+ translate_btn = gr.Button(
233
+ "🚀 Translate",
234
+ variant="primary",
235
+ size="lg"
236
+ )
237
+
238
+ with gr.Column(scale=1):
239
+ output_text = gr.Textbox(
240
+ label="🎯 Translation Result",
241
+ lines=6,
242
+ max_lines=10,
243
+ interactive=False
244
+ )
245
+
246
+ detected_lang = gr.Textbox(
247
+ label="🔍 Detected Language",
248
+ interactive=False,
249
+ max_lines=1
250
+ )
251
+
252
+ # Examples section
253
+ with gr.Row():
254
+ gr.Examples(
255
+ examples=[
256
+ ["สวัสดีครับ ยินดีที่ได้รู้จัก การพัฒนา 2030 Aspirations เป็นเป้าหมายสำคัญ", "th"],
257
+ ["こんにちは、はじめまして。Griffith大学での研究が進んでいます。", "ja"],
258
+ ["你好,很高兴认识你。我们正在为2030 Aspirations制定计划。", "zh"],
259
+ ["Xin chào, rất vui được gặp bạn. Griffith là trường đại học tuyệt vời.", "vi"],
260
+ ],
261
+ inputs=[text_input, lang_dropdown],
262
+ outputs=[output_text, detected_lang],
263
+ fn=translate_gradio,
264
+ cache_examples=False,
265
+ label="📋 Try these examples:"
266
+ )
267
+
268
+ # Event handlers
269
+ translate_btn.click(
270
+ fn=translate_gradio,
271
+ inputs=[text_input, lang_dropdown],
272
+ outputs=[output_text, detected_lang]
273
+ )
274
+
275
+ text_input.submit(
276
+ fn=translate_gradio,
277
+ inputs=[text_input, lang_dropdown],
278
+ outputs=[output_text, detected_lang]
279
+ )
280
+
281
+ # Information accordion
282
+ with gr.Accordion("ℹ️ About this service", open=False):
283
+ gr.Markdown("""
284
+ ### 🎯 Supported Languages:
285
+ - **Thai (th)** → English
286
+ - **Japanese (ja)** → English
287
+ - **Chinese (zh)** → English
288
+ - **Vietnamese (vi)** → English
289
+
290
+ ### 🛡️ Special Features:
291
+ - **Protected Terms**: Certain terms like "2030 Aspirations" and "Griffith" are preserved during translation
292
+ - **Auto Detection**: Automatically detects the source language if not specified
293
+ - **Fast Processing**: Uses optimized Helsinki-NLP translation models
294
+
295
+ ### 🚀 How to use:
296
+ 1. Paste or type your text in the input box
297
+ 2. Choose source language or leave as 'Auto-detect'
298
+ 3. Click 'Translate' or press Enter
299
+ 4. Get your English translation instantly!
300
+
301
+ ### 🔧 API Access:
302
+ This service also provides REST API endpoints:
303
+ - `GET /health` - Check service status
304
+ - `POST /translate` - Translate text (JSON payload required)
305
+ """)
306
+
307
+ return interface
308
+
309
+ # Start FastAPI in a separate thread
310
def start_fastapi():
    """Serve the FastAPI ``app`` with uvicorn on 0.0.0.0:7860 (blocking)."""
    # Imported lazily so uvicorn is only needed when the server is started.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=7860, log_level="info")
313
+
314
+ # Main execution
315
if __name__ == "__main__":
    # Run the REST API (port 7860) in a daemon thread so it stops together
    # with the main thread, which is taken over by the Gradio server below.
    fastapi_thread = threading.Thread(target=start_fastapi, daemon=True)
    fastapi_thread.start()

    # Fixed grace period for the API server to come up before the UI starts.
    time.sleep(2)

    # Build the UI, cap the request queue at 10 pending jobs, and serve it.
    # NOTE(review): the UI listens on 7861 while the API holds 7860 — confirm
    # that the hosting environment actually exposes both ports.
    demo = create_gradio_interface()
    demo.queue(max_size=10)
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7861,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)