import uuid
from pathlib import Path

from src.utils.config import DOC_INPUT_DIR, TRANS_OUTPUT_DIR
from src.utils.data_utils import chunk_text
from src.db.vector_store import VectorStore
from src.modelling.embed import DalaEmbedder
from src.modelling.transliterate import DalaTransliterator


def load_documents(input_dir: Path) -> list[tuple[str, str]]:
    """
    Load all .txt documents from input_dir.
    Returns a list of (filename, content) tuples.
    """
    docs = []
    for file in input_dir.glob("*.txt"):
        with open(file, "r", encoding="utf-8") as f:
            text = f.read()
        docs.append((file.stem, text))
    return docs


def process_documents() -> None:
    """
    Main processing procedure: chunk, transliterate, embed, and index all input documents.
    """
    # Components
    transliterator = DalaTransliterator()
    embedder = DalaEmbedder()
    vector_store = VectorStore()

    docs = load_documents(DOC_INPUT_DIR)

    all_chunks = []
    all_transliterated = []
    all_metadata = []

    for doc_id, text in docs:
        # Chunk the data
        chunks = chunk_text(text)
        all_chunks.extend(chunks)

        # Transliterate chunks
        translit_chunks = transliterator.batch_transliterate(chunks)
        all_transliterated.extend(translit_chunks)

        # Save transliterated version
        output_path = TRANS_OUTPUT_DIR / f"{doc_id}_transliterated.txt"
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n\n".join(translit_chunks))

        # Create metadata entries
        for i, chunk in enumerate(translit_chunks):
            meta = {
                "id": f"{doc_id}_{i}_{uuid.uuid4().hex[:6]}",
                "text": chunk,
            }
            all_metadata.append(meta)

    # Embed all chunks
    embeddings = embedder.embed_batch(all_transliterated)

    # Add to vector DB
    vector_store.add(embeddings, all_metadata)

    print(f"[INFO] Successfully ingested {len(all_chunks)} chunks.")


if __name__ == "__main__":
    process_documents()