File size: 3,824 Bytes
efb198f
 
 
 
07ea3d5
 
 
efb198f
 
6df8ecd
efb198f
6df8ecd
 
 
 
 
be484c1
6df8ecd
 
efb198f
6df8ecd
 
be484c1
 
6df8ecd
 
be484c1
6df8ecd
 
 
be484c1
6df8ecd
 
 
 
 
 
be484c1
efb198f
 
6df8ecd
efb198f
 
6df8ecd
efb198f
 
 
 
 
 
 
6df8ecd
efb198f
 
 
 
 
 
 
6df8ecd
efb198f
 
 
6df8ecd
 
07ea3d5
6df8ecd
 
 
 
07ea3d5
 
 
 
 
 
 
6df8ecd
07ea3d5
 
 
 
 
 
 
 
 
 
6df8ecd
 
 
 
 
 
 
efb198f
6df8ecd
 
 
 
efb198f
6df8ecd
efb198f
 
6df8ecd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from fastapi import FastAPI, Request
from transformers import MarianMTModel, MarianTokenizer
import torch

# import chunking service
from chunking import get_max_word_length, chunk_text

# ASGI application instance; the @app.post / @app.get decorators further down
# register their route handlers on this object.
app = FastAPI()

# Checkpoints shared by several target languages, named once so the table
# below stays readable.
_MBART50_ID = "facebook/mbart-large-50-many-to-many-mmt"
_SMALL100_ID = "alirezamsh/small100"

# Target-language code -> Hugging Face model ID used to translate into it.
# Insertion order is the order reported by GET /languages.
# NOTE: "nb" and "no" are placeholders pending a dedicated model choice.
MODEL_MAP = {
    "bg": "Helsinki-NLP/opus-mt-tc-big-en-bg",
    "cs": "Helsinki-NLP/opus-mt-en-cs",
    "da": "Helsinki-NLP/opus-mt-en-da",
    "de": "Helsinki-NLP/opus-mt-en-de",
    "el": "Helsinki-NLP/opus-mt-tc-big-en-el",
    "es": "Helsinki-NLP/opus-mt-tc-big-en-es",
    "et": "Helsinki-NLP/opus-mt-tc-big-en-et",
    "fi": "Helsinki-NLP/opus-mt-tc-big-en-fi",
    "fr": "Helsinki-NLP/opus-mt-en-fr",
    "hr": _MBART50_ID,
    "hu": "Helsinki-NLP/opus-mt-tc-big-en-hu",
    "is": "Helsinki-NLP/opus-mt-tc-big-en-gmq",
    "it": "Helsinki-NLP/opus-mt-tc-big-en-it",
    "lt": "Helsinki-NLP/opus-mt-tc-big-en-lt",
    "lv": _MBART50_ID,
    "mk": "Helsinki-NLP/opus-mt-en-mk",
    "nb": _MBART50_ID,  # placeholder
    "nl": _MBART50_ID,
    "no": _MBART50_ID,  # placeholder
    "pl": "Helsinki-NLP/opus-mt-en-sla",
    "pt": _MBART50_ID,
    "ro": _MBART50_ID,
    "sk": "Helsinki-NLP/opus-mt-en-sk",
    "sl": _SMALL100_ID,
    "sq": _SMALL100_ID,
    "sv": "Helsinki-NLP/opus-mt-en-sv",
    "tr": "Helsinki-NLP/opus-mt-tc-big-en-tr",
}


# Process-wide cache: model ID -> (tokenizer, model). Filled lazily by
# load_model so each checkpoint is loaded at most once per cache entry.
MODEL_CACHE = {}


def load_model(model_id):
    """Return the ``(tokenizer, model)`` pair for *model_id*, loading on first use.

    On a cache miss the tokenizer and model are loaded with the Marian classes,
    the model is placed on the CPU, and the pair is stored in ``MODEL_CACHE``.
    Later calls with the same ID return the cached pair unchanged.

    NOTE(review): every ID goes through ``MarianTokenizer`` / ``MarianMTModel``,
    including non-Helsinki IDs such as ``alirezamsh/small100`` -- confirm that
    checkpoint can actually be loaded this way.
    """
    if model_id in MODEL_CACHE:
        return MODEL_CACHE[model_id]

    marian_tokenizer = MarianTokenizer.from_pretrained(model_id)
    marian_model = MarianMTModel.from_pretrained(model_id).to("cpu")
    MODEL_CACHE[model_id] = (marian_tokenizer, marian_model)
    return MODEL_CACHE[model_id]

# ✅ POST /translate
@app.post("/translate")
async def translate(request: Request):
    """Translate the posted text into the requested target language.

    The request body must be a JSON object with two keys:

    * ``text`` -- the text to translate (source language is presumably
      English, judging by the ``en-*`` model IDs in ``MODEL_MAP``).
    * ``target_lang`` -- a language code that is a key of ``MODEL_MAP``.

    Returns:
        ``{"translation": <str>}`` on success (also used for the
        "not supported yet" placeholder message of ``facebook/`` models), or
        ``{"error": <str>}`` when the body is malformed, a field is missing,
        the language is unknown, or translation itself fails.

    NOTE(review): tokenization and generation run synchronously inside this
    coroutine, so the event loop is occupied for the whole translation.
    """
    # A malformed or non-object body used to escape as an unhandled exception
    # (HTTP 500); report it in the same {"error": ...} shape as other failures.
    try:
        data = await request.json()
    except ValueError:  # JSONDecodeError and UnicodeDecodeError both derive from it
        return {"error": "Request body must be valid JSON"}
    if not isinstance(data, dict):
        return {"error": "Request body must be a JSON object"}

    text = data.get("text")
    target_lang = data.get("target_lang")

    if not text or not target_lang:
        return {"error": "Missing 'text' or 'target_lang'"}

    # Only string codes can be keys of MODEL_MAP; guarding here also keeps an
    # unhashable value (e.g. a JSON list) from crashing the dict lookup.
    model_id = MODEL_MAP.get(target_lang) if isinstance(target_lang, str) else None
    if not model_id:
        return {"error": f"No model found for target language '{target_lang}'"}

    # Facebook/mbart placeholder check
    if model_id.startswith("facebook/"):
        return {"translation": f"[{target_lang}] uses model '{model_id}', which is not supported in this Space yet."}

    # Multi-target OPUS-MT checkpoints pick the output language from a
    # sentence-initial ">>xxx<<" token; without it the output language is not
    # pinned. Single-target checkpoints need no token.
    multilingual_tokens = {
        "Helsinki-NLP/opus-mt-tc-big-en-gmq": {"is": ">>isl<<"},
        "Helsinki-NLP/opus-mt-en-sla": {"pl": ">>pol<<"},
    }
    lang_token = multilingual_tokens.get(model_id, {}).get(target_lang)

    try:
        # 1. Ask the chunking service for the safe word limit for this language.
        safe_limit = get_max_word_length([target_lang])

        # 2. Break the input up into chunks no longer than that limit.
        chunks = chunk_text(text, safe_limit)

        # 3. Translate each chunk independently and collect the results.
        tokenizer, model = load_model(model_id)
        full_translation = []
        for chunk in chunks:
            # Every chunk is a separate model input, so each one needs its own
            # language token when the checkpoint is multilingual.
            source = f"{lang_token} {chunk}" if lang_token else chunk
            inputs = tokenizer(source, return_tensors="pt", padding=True, truncation=True).to(model.device)
            outputs = model.generate(**inputs, num_beams=5, length_penalty=1.2, early_stopping=True)
            full_translation.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

        # 4. Re-join the translated pieces with single spaces.
        joined = " ".join(full_translation)
        return {"translation": joined}

    except Exception as e:
        # Top-level boundary for this endpoint: report any chunking, loading or
        # generation failure to the client instead of returning HTTP 500.
        return {"error": f"Translation failed: {str(e)}"}

# ✅ GET /languages
@app.get("/languages")
def list_languages():
    """Return every target-language code configured in ``MODEL_MAP``.

    The codes are listed in ``MODEL_MAP`` insertion order. The list includes
    codes whose model ID starts with ``facebook/``, for which POST /translate
    currently answers with a "not supported" message.
    """
    language_codes = list(MODEL_MAP)
    return {"supported_languages": language_codes}

# ✅ GET /health
@app.get("/health")
def health():
    """Liveness probe: always reports ``{"status": "ok"}``."""
    payload = {"status": "ok"}
    return payload

# ✅ Uvicorn startup (required by Hugging Face)
# Imported next to its only use (the script entry point below) rather than at
# the top of the file.
import uvicorn
if __name__ == "__main__":
    # Runs only when this file is executed as a script. Serves the object named
    # "app" from the module named "app" on all interfaces, port 7860.
    uvicorn.run("app:app", host="0.0.0.0", port=7860)