mayacou committed on
Commit
36ed2b7
·
verified ·
1 Parent(s): 07ea3d5

Create chunking.py

Browse files

added chunking service

Files changed (1) hide show
  1. chunking.py +67 -0
chunking.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
+
4
def get_max_word_length(target_languages: list[str]) -> int:
    """Return the safe per-chunk word limit for a set of target languages.

    Helsinki-NLP MT models have per-language input limits; when translating
    into several languages at once we must chunk to the *strictest* limit
    among them. Languages not in the table are assumed to tolerate the
    default of 700 words.

    Args:
        target_languages: ISO 639-1 codes of the translation targets.

    Returns:
        The smallest word limit among the requested Helsinki languages,
        or 700 if none of them has a specific limit (including an empty list).
    """
    # Empirically chosen safe input sizes for the Helsinki models, in words.
    helsinki_word_limits = {
        "el": 50,
        "et": 50,
        "fi": 50,
        "fr": 40,
        "sv": 140,
        "hu": 50,
        "lt": 50,
        "sk": 140,
        "bg": 50,
        "cs": 140,
        "da": 140,
        "de": 150,
    }

    # min() with default replaces the manual "track the smallest" loop;
    # 700 is the fallback for non-Helsinki languages.
    return min(
        (helsinki_word_limits[lang] for lang in target_languages if lang in helsinki_word_limits),
        default=700,
    )
28
+
29
def chunk_text(text: str, safe_word_limit: int) -> list[str]:
    """Split *text* into chunks of at most *safe_word_limit* words.

    Splitting prefers sentence boundaries (after ``.``, ``!`` or ``?``);
    sentences are packed greedily into chunks. A single sentence longer
    than the limit is hard-split on word boundaries into its own chunks.

    Args:
        text: The input text; surrounding whitespace is ignored.
        safe_word_limit: Maximum number of words per chunk (assumed > 0).

    Returns:
        List of chunk strings; empty list for empty/whitespace-only input.
    """
    pieces: list[str] = []
    buffer: list[str] = []      # sentences accumulated for the current chunk
    buffered_words = 0          # running word count of `buffer`

    def flush() -> None:
        # Emit the buffered sentences as one chunk, if any.
        nonlocal buffered_words
        if buffer:
            pieces.append(' '.join(buffer))
            buffer.clear()
            buffered_words = 0

    for raw in re.split(r'(?<=[.!?])\s+', text.strip()):
        sentence = raw.strip()
        if not sentence:
            continue

        words = sentence.split()
        count = len(words)

        if count > safe_word_limit:
            # Oversized sentence: close the current chunk, then hard-split
            # the sentence into fixed-size word windows.
            flush()
            pieces.extend(
                ' '.join(words[start:start + safe_word_limit])
                for start in range(0, count, safe_word_limit)
            )
        elif buffered_words + count <= safe_word_limit:
            # Sentence fits into the chunk being built.
            buffer.append(sentence)
            buffered_words += count
        else:
            # Chunk is full: emit it and start a new one with this sentence.
            flush()
            buffer.append(sentence)
            buffered_words = count

    flush()
    return pieces