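"""
Ingestion pipeline: load raw .txt documents, split them into chunks,
transliterate each chunk, write the transliterated text to disk, embed the
transliterated chunks, and index the embeddings with their metadata in the
vector store.
"""
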
import uuid
from pathlib import Path
from src.utils.config import DOC_INPUT_DIR, TRANS_OUTPUT_DIR

from src.utils.data_utils import chunk_text
from src.db.vector_store import VectorStore
from src.modelling.embed import DalaEmbedder
from src.modelling.transliterate import DalaTransliterator


def load_documents(input_dir: Path) -> list[tuple[str, str]]:
    """
    Load all .txt documents from input_dir.

    Returns a list of (doc_id, content) tuples, where doc_id is the
    filename without its extension.
    """
    docs = []

    for file in input_dir.glob("*.txt"):
        text = file.read_text(encoding="utf-8")
        docs.append((file.stem, text))

    return docs


def process_documents() -> None:
    """
    Chunk, transliterate, embed, and index every document in DOC_INPUT_DIR.
    """
    # Components
    transliterator = DalaTransliterator()
    embedder = DalaEmbedder()
    vector_store = VectorStore()

    # Make sure the output directory for transliterated text exists.
    TRANS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    docs = load_documents(DOC_INPUT_DIR)
    all_chunks = []
    all_transliterated = []
    all_metadata = []

    for doc_id, text in docs:
        # Chunk the data
        chunks = chunk_text(text)

        all_chunks.extend(chunks)

        # Transliterate chunks
        translit_chunks = transliterator.batch_transliterate(chunks)

        all_transliterated.extend(translit_chunks)

        # Save transliterated version
        output_path = TRANS_OUTPUT_DIR / f"{doc_id}_transliterated.txt"

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("\n\n".join(translit_chunks))

        # Create metadata entries: each id combines the document stem,
        # the chunk index, and a short random suffix for uniqueness.
        for i, chunk in enumerate(translit_chunks):
            meta = {
                "id": f"{doc_id}_{i}_{uuid.uuid4().hex[:6]}",
                "text": chunk
            }

            all_metadata.append(meta)

    # Embed the transliterated chunks in a single batch
    embeddings = embedder.embed_batch(all_transliterated)

    # Add to vector DB
    vector_store.add(embeddings, all_metadata)

    print(f"[INFO] Successfully ingested {len(all_chunks)} chunks.")


if __name__ == "__main__":
    process_documents()