# semanticdala / src/utils/ingest.py
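"""
Document ingestion script for semanticdala.

Loads .txt documents from DOC_INPUT_DIR, chunks them, transliterates each
chunk, writes the transliterated text to TRANS_OUTPUT_DIR, then embeds the
transliterated chunks and indexes them (with per-chunk metadata) in the
vector store.
"""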
import uuid
from pathlib import Path

from src.utils.config import DOC_INPUT_DIR, TRANS_OUTPUT_DIR
from src.utils.data_utils import chunk_text
from src.db.vector_store import VectorStore
from src.modelling.embed import DalaEmbedder
from src.modelling.transliterate import DalaTransliterator


def load_documents(input_dir: Path) -> list[tuple[str, str]]:
    """
    Load all .txt documents from input_dir.

    Returns a list of (filename, content) tuples, where filename is the
    file's stem (no extension).
    """
    docs = []

    for file in input_dir.glob("*.txt"):
        with open(file, "r", encoding="utf-8") as f:
            text = f.read()
        docs.append((file.stem, text))

    return docs
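

# The pipeline below relies on project-internal helpers defined elsewhere in
# the repo (src.modelling and src.db). The Protocol classes here are only an
# illustrative sketch of the call shapes this script assumes
# (batch_transliterate, embed_batch, add); the real classes may expose richer
# interfaces, and the embedding type in particular is an assumption.
from typing import Protocol


class TransliteratorLike(Protocol):
    def batch_transliterate(self, chunks: list[str]) -> list[str]:
        """Transliterate a batch of text chunks."""
        ...


class EmbedderLike(Protocol):
    def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts; assumed to return one vector per text."""
        ...


class VectorStoreLike(Protocol):
    def add(self, embeddings: list[list[float]], metadata: list[dict]) -> None:
        """Index embeddings together with their metadata entries."""
        ...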


def process_documents() -> None:
    """
    Main processing procedure: chunk, transliterate, embed, and index every
    document found in DOC_INPUT_DIR.
    """
    # Pipeline components
    transliterator = DalaTransliterator()
    embedder = DalaEmbedder()
    vector_store = VectorStore()

    docs = load_documents(DOC_INPUT_DIR)

    all_chunks = []
    all_transliterated = []
    all_metadata = []

    for doc_id, text in docs:
        # Chunk the data
        chunks = chunk_text(text)
        all_chunks.extend(chunks)

        # Transliterate chunks
        translit_chunks = transliterator.batch_transliterate(chunks)
        all_transliterated.extend(translit_chunks)

        # Save the transliterated version to TRANS_OUTPUT_DIR
        output_path = TRANS_OUTPUT_DIR / f"{doc_id}_transliterated.txt"
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n\n".join(translit_chunks))

        # Create one metadata entry per transliterated chunk
        for i, chunk in enumerate(translit_chunks):
            meta = {
                "id": f"{doc_id}_{i}_{uuid.uuid4().hex[:6]}",
                "text": chunk
            }
            all_metadata.append(meta)

    # Embed all transliterated chunks in one batch
    embeddings = embedder.embed_batch(all_transliterated)

    # Add embeddings and their metadata to the vector DB
    vector_store.add(embeddings, all_metadata)

    print(f"[INFO] Successfully ingested {len(all_chunks)} chunks.")


if __name__ == "__main__":
    process_documents()
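
# Assumed invocation (not stated in the repo): run from the project root so
# the `src.` package imports resolve, e.g.
#
#   python -m src.utils.ingest
#
# DOC_INPUT_DIR and TRANS_OUTPUT_DIR come from src/utils/config.py; note that
# TRANS_OUTPUT_DIR must already exist, since this script does not create it.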