Create chunking.py
added chunking service
- chunking.py +67 -0
chunking.py
ADDED
@@ -0,0 +1,67 @@
+import re
+
+def get_max_word_length(target_languages: list[str]) -> int:
+    """Return the smallest safe per-chunk word count across the target languages."""
+    helsinki_word_limits = {
+        "el": 50,
+        "et": 50,
+        "fi": 50,
+        "fr": 40,
+        "sv": 140,
+        "hu": 50,
+        "lt": 50,
+        "sk": 140,
+        "bg": 50,
+        "cs": 140,
+        "da": 140,
+        "de": 150,
+    }
+
+    max_word_length = 700  # Default for non-Helsinki languages
+
+    for lang in target_languages:
+        if lang in helsinki_word_limits:
+            max_word_length = min(max_word_length, helsinki_word_limits[lang])
+
+    return max_word_length
+
+def chunk_text(text: str, safe_word_limit: int) -> list[str]:
+    """Split text into chunks of at most safe_word_limit words, preferring sentence boundaries."""
+    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
+
+    chunks = []
+    current_chunk = []
+    current_word_count = 0
+
+    for sentence in sentences:
+        sentence = sentence.strip()
+        if not sentence:
+            continue
+
+        word_count = len(sentence.split())
+
+        # If the sentence alone exceeds the safe word limit, flush the
+        # current chunk and hard-split the sentence on word boundaries
+        if word_count > safe_word_limit:
+            if current_chunk:
+                chunks.append(' '.join(current_chunk))
+                current_chunk = []
+                current_word_count = 0
+            words = sentence.split()
+            for i in range(0, len(words), safe_word_limit):
+                chunks.append(' '.join(words[i:i + safe_word_limit]))
+            continue
+
+        # Otherwise, see if it fits in the current chunk
+        if current_word_count + word_count <= safe_word_limit:
+            current_chunk.append(sentence)
+            current_word_count += word_count
+        else:
+            # Start a new chunk
+            chunks.append(' '.join(current_chunk))
+            current_chunk = [sentence]
+            current_word_count = word_count
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    return chunks
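For context, a minimal usage sketch, not part of the commit: the import path, language list, and sample text below are assumptions chosen for illustration. It shows the intended flow of the two helpers, picking the tightest word limit for the requested target languages and then chunking the source text with it.

```python
# Hypothetical usage of the chunking service; assumes chunking.py is importable.
from chunking import get_max_word_length, chunk_text

target_languages = ["fr", "de", "sv"]  # example codes; "fr" has the tightest limit (40)
safe_word_limit = get_max_word_length(target_languages)  # -> 40

text = (
    "First sentence of the source document. "
    "A second, somewhat longer sentence follows it! "
    "Does a third sentence still fit in the same chunk?"
)

for chunk in chunk_text(text, safe_word_limit):
    print(len(chunk.split()), chunk)
```

Because the per-language limits are combined with `min()`, a single low-limit language (here "fr") caps the chunk size for the whole batch, and any sentence longer than the limit is hard-split on word boundaries, so no chunk ever exceeds `safe_word_limit` words.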