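"""
Document ingestion pipeline.

Loads .txt documents from DOC_INPUT_DIR, chunks and transliterates them,
saves the transliterated text to TRANS_OUTPUT_DIR, embeds the
transliterated chunks, and adds the embeddings with per-chunk metadata
to the vector store. Run this file directly to ingest every document.
"""
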
import uuid
from pathlib import Path

from src.utils.config import DOC_INPUT_DIR, TRANS_OUTPUT_DIR
from src.utils.data_utils import chunk_text
from src.db.vector_store import VectorStore
from src.modelling.embed import DalaEmbedder
from src.modelling.transliterate import DalaTransliterator


def load_documents(input_dir: Path) -> list[tuple[str, str]]:
    """
    Loads all .txt documents from input_dir. Returns a list of
    tuples: (filename, content).
    """
    docs = []
    for file in input_dir.glob("*.txt"):
        with open(file, "r", encoding="utf-8") as f:
            text = f.read()
        docs.append((file.stem, text))
    return docs


def process_documents() -> None:
    """
    Main processing procedure: chunk, transliterate, embed, and index
    all input documents.
    """
    # Components
    transliterator = DalaTransliterator()
    embedder = DalaEmbedder()
    vector_store = VectorStore()

    docs = load_documents(DOC_INPUT_DIR)
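
    # Ensure the output directory exists before any transliterated files are written.
    TRANS_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    if not docs:
        # Defensive early exit: no input files means nothing to chunk or embed.
        print("[INFO] No .txt documents found; nothing to ingest.")
        return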

    all_chunks = []
    all_transliterated = []
    all_metadata = []

    for doc_id, text in docs:
        # Chunk the data
        chunks = chunk_text(text)
        all_chunks.extend(chunks)

        # Transliterate chunks
        translit_chunks = transliterator.batch_transliterate(chunks)
        all_transliterated.extend(translit_chunks)

        # Save transliterated version
        output_path = TRANS_OUTPUT_DIR / f"{doc_id}_transliterated.txt"
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n\n".join(translit_chunks))

        # Create metadata entries
        for i, chunk in enumerate(translit_chunks):
            meta = {
                "id": f"{doc_id}_{i}_{uuid.uuid4().hex[:6]}",
                "text": chunk,
            }
            all_metadata.append(meta)

    # Embed all chunks
    embeddings = embedder.embed_batch(all_transliterated)

    # Add to vector DB
    vector_store.add(embeddings, all_metadata)

    print(f"[INFO] Successfully ingested {len(all_chunks)} chunks.")


if __name__ == "__main__":
    process_documents()