mayacou committed
Commit 07ea3d5 · verified · 1 Parent(s): be484c1

Update app.py

added chunking service references

Files changed (1): app.py (+21 -3)
app.py CHANGED

@@ -2,6 +2,9 @@ from fastapi import FastAPI, Request
 from transformers import MarianMTModel, MarianTokenizer
 import torch
 
+# import chunking service
+from chunking import get_max_word_length, chunk_text
+
 app = FastAPI()
 
 # Map target languages to Hugging Face model IDs
@@ -60,14 +63,29 @@ async def translate(request: Request):
     if not model_id:
         return {"error": f"No model found for target language '{target_lang}'"}
 
+    # Facebook/mbart placeholder check
     if model_id.startswith("facebook/"):
         return {"translation": f"[{target_lang}] uses model '{model_id}', which is not supported in this Space yet."}
 
     try:
+        # 1. figure out your safe word limit for this language
+        safe_limit = get_max_word_length([target_lang])
+
+        # 2. break the input up into chunks
+        chunks = chunk_text(text, safe_limit)
+
+        # 3. translate each chunk and collect results
         tokenizer, model = load_model(model_id)
-        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)
-        outputs = model.generate(**inputs, num_beams=5, length_penalty=1.2, early_stopping=True)
-        return {"translation": tokenizer.decode(outputs[0], skip_special_tokens=True)}
+        full_translation = []
+        for chunk in chunks:
+            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True).to(model.device)
+            outputs = model.generate(**inputs, num_beams=5, length_penalty=1.2, early_stopping=True)
+            full_translation.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
+
+        # 4. re-join the translated pieces
+        joined = " ".join(full_translation)
+        return {"translation": joined}
+
     except Exception as e:
         return {"error": f"Translation failed: {str(e)}"}
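The chunking helpers imported above are not part of this diff. As a rough sketch of what chunking.py could look like, assuming get_max_word_length returns a conservative word budget for the given target languages and chunk_text splits on word boundaries (the two function names come from the import; the bodies below are assumptions, not the Space's actual code):

# chunking.py: hypothetical sketch; the real module is not shown in this commit.

_DEFAULT_MAX_WORDS = 200  # assumed conservative budget, not a value from the repo

def get_max_word_length(target_langs):
    """Return a safe per-request word budget covering all target languages."""
    # A real implementation might keep per-language limits; we assume one default.
    return _DEFAULT_MAX_WORDS

def chunk_text(text, max_words):
    """Split text into chunks of at most max_words words, on word boundaries."""
    words = text.split()
    return [" ".join(words[i:i + max_words]) for i in range(0, len(words), max_words)]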
 
 
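Once the Space is running, the chunked path can be smoke-tested end to end. The snippet below assumes the handler is mounted at POST /translate and reads text and target_lang from the JSON body; the route decorator and request parsing sit outside this hunk, so both are inferred.

# Hypothetical smoke test; route path, port, and field names are assumptions.
import requests

resp = requests.post(
    "http://localhost:7860/translate",  # 7860 is the usual Hugging Face Space port
    json={
        "text": "This is a long passage. " * 100,  # long enough to trigger chunking
        "target_lang": "de",
    },
)
print(resp.json())  # expected: {"translation": "..."} or {"error": "..."}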