File size: 3,824 Bytes
efb198f 07ea3d5 efb198f 6df8ecd efb198f 6df8ecd be484c1 6df8ecd efb198f 6df8ecd be484c1 6df8ecd be484c1 6df8ecd be484c1 6df8ecd be484c1 efb198f 6df8ecd efb198f 6df8ecd efb198f 6df8ecd efb198f 6df8ecd efb198f 6df8ecd 07ea3d5 6df8ecd 07ea3d5 6df8ecd 07ea3d5 6df8ecd efb198f 6df8ecd efb198f 6df8ecd efb198f 6df8ecd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
from fastapi import FastAPI, Request
from transformers import MarianMTModel, MarianTokenizer
import torch
# import chunking service
from chunking import get_max_word_length, chunk_text
# ASGI application object; the @app.post / @app.get decorators in this file
# register their route handlers on it.
app = FastAPI()
# Checkpoints shared by several target languages, named once so the table
# below stays readable.
_MBART_MANY_TO_MANY = "facebook/mbart-large-50-many-to-many-mmt"
_SMALL100 = "alirezamsh/small100"

# Target language code -> Hugging Face model ID.
# Insertion order matters: it is the order reported by GET /languages.
# Entries that point at a "facebook/" checkpoint are rejected by the
# /translate handler with a "not supported in this Space yet" message.
MODEL_MAP = {
    "bg": "Helsinki-NLP/opus-mt-tc-big-en-bg",
    "cs": "Helsinki-NLP/opus-mt-en-cs",
    "da": "Helsinki-NLP/opus-mt-en-da",
    "de": "Helsinki-NLP/opus-mt-en-de",
    "el": "Helsinki-NLP/opus-mt-tc-big-en-el",
    "es": "Helsinki-NLP/opus-mt-tc-big-en-es",
    "et": "Helsinki-NLP/opus-mt-tc-big-en-et",
    "fi": "Helsinki-NLP/opus-mt-tc-big-en-fi",
    "fr": "Helsinki-NLP/opus-mt-en-fr",
    "hr": _MBART_MANY_TO_MANY,
    "hu": "Helsinki-NLP/opus-mt-tc-big-en-hu",
    "is": "Helsinki-NLP/opus-mt-tc-big-en-gmq",
    "it": "Helsinki-NLP/opus-mt-tc-big-en-it",
    "lt": "Helsinki-NLP/opus-mt-tc-big-en-lt",
    "lv": _MBART_MANY_TO_MANY,
    "mk": "Helsinki-NLP/opus-mt-en-mk",
    "nb": _MBART_MANY_TO_MANY,  # placeholder
    "nl": _MBART_MANY_TO_MANY,
    "no": _MBART_MANY_TO_MANY,  # placeholder
    "pl": "Helsinki-NLP/opus-mt-en-sla",
    "pt": _MBART_MANY_TO_MANY,
    "ro": _MBART_MANY_TO_MANY,
    "sk": "Helsinki-NLP/opus-mt-en-sk",
    "sl": _SMALL100,
    "sq": _SMALL100,
    "sv": "Helsinki-NLP/opus-mt-en-sv",
    "tr": "Helsinki-NLP/opus-mt-tc-big-en-tr",
}

# model_id -> (tokenizer, model); filled lazily by load_model().
MODEL_CACHE = {}
# Load Hugging Face model (Helsinki or Small100)
def load_model(model_id):
    """Return the ``(tokenizer, model)`` pair for *model_id*, loading it once.

    The pair is stored in the module-level ``MODEL_CACHE`` dict, so later
    calls with the same ID return the same objects without reloading.
    The model is placed on the CPU.

    NOTE(review): every ID is loaded through the Marian tokenizer/model
    classes, but MODEL_MAP also lists non-Helsinki checkpoints (e.g.
    "alirezamsh/small100") -- confirm those actually load this way.
    """
    try:
        return MODEL_CACHE[model_id]
    except KeyError:
        pass

    # Cache miss: load the tokenizer first, then the model. Nothing is
    # cached unless both loads succeed.
    marian_tokenizer = MarianTokenizer.from_pretrained(model_id)
    marian_model = MarianMTModel.from_pretrained(model_id).to("cpu")
    entry = (marian_tokenizer, marian_model)
    MODEL_CACHE[model_id] = entry
    return entry
# POST /translate
@app.post("/translate")
async def translate(request: Request):
    """Translate the submitted text into the requested target language.

    Expects a JSON object body with two keys:
      - "text": the text to translate.
      - "target_lang": a language code that is a key of MODEL_MAP.

    Returns:
        A dict with a "translation" key on success, or an "error" key
        describing the problem. Validation and translation failures are
        reported in the response body rather than raised.
    """
    # A malformed body would otherwise escape as an unhandled exception;
    # report it the same way as every other input problem.
    try:
        data = await request.json()
    except ValueError:
        return {"error": "Request body must be valid JSON"}
    if not isinstance(data, dict):
        # Valid JSON that is not an object (list, string, number) has no keys.
        return {"error": "Request body must be a JSON object"}

    text = data.get("text")
    target_lang = data.get("target_lang")
    if not text or not target_lang:
        return {"error": "Missing 'text' or 'target_lang'"}

    # Only strings can be language codes; this also keeps unhashable JSON
    # values (lists, objects) from crashing the dict lookup.
    model_id = MODEL_MAP.get(target_lang) if isinstance(target_lang, str) else None
    if not model_id:
        return {"error": f"No model found for target language '{target_lang}'"}

    # Facebook/mbart placeholder check
    if model_id.startswith("facebook/"):
        return {"translation": f"[{target_lang}] uses model '{model_id}', which is not supported in this Space yet."}

    # NOTE(review): generation runs synchronously inside this coroutine, so
    # other requests wait while a translation is in progress; consider
    # offloading it to a worker thread.
    try:
        # 1. Work out the safe word limit for this language.
        safe_limit = get_max_word_length([target_lang])

        # 2. Break the input up into chunks.
        chunks = chunk_text(text, safe_limit)

        # 3. Translate each chunk and collect the results.
        tokenizer, model = load_model(model_id)
        full_translation = []
        for chunk in chunks:
            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True).to(model.device)
            outputs = model.generate(**inputs, num_beams=5, length_penalty=1.2, early_stopping=True)
            full_translation.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

        # 4. Re-join the translated pieces.
        return {"translation": " ".join(full_translation)}
    except Exception as e:
        # Boundary handler: any chunking, loading or generation failure is
        # surfaced to the client as an error payload.
        return {"error": f"Translation failed: {e}"}
# GET /languages
@app.get("/languages")
def list_languages():
    """Return every language code that has an entry in MODEL_MAP.

    The codes are listed in MODEL_MAP's insertion order. This includes
    languages mapped to "facebook/" checkpoints, which the /translate
    handler currently answers with a "not supported" message.
    """
    codes = list(MODEL_MAP)
    return {"supported_languages": codes}
# GET /health
@app.get("/health")
def health():
    """Report that the service is up; always returns ``{"status": "ok"}``."""
    return dict(status="ok")
# Uvicorn startup (required by Hugging Face)
import uvicorn
if __name__ == "__main__":
    # When run as a script, serve on all interfaces at port 7860.
    # "app:app" is an import string: the `app` object in the module `app`.
    uvicorn.run("app:app", host="0.0.0.0", port=7860)