import os
import glob

import faiss
import numpy as np
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from transformers import RagTokenizer
from unstructured.partition.pdf import partition_pdf


def ingest_and_push(
    dataset_name="DurgaDeepak/meal_plans",
    index_path="mealplan.index",
):
    # 1) Tokenizer for chunking. RagTokenizer wraps two tokenizers; use the
    #    question-encoder tokenizer for both encoding and decoding so chunks
    #    round-trip through a single vocabulary.
    rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
    chunk_tokenizer = rag_tokenizer.question_encoder

    # 2) Embedder for FAISS
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

    texts, sources, pages = [], [], []

    # 3) Chunk each PDF. partition_pdf returns document elements, each of
    #    which carries its page number in element.metadata.
    for pdf_path in glob.glob("meal_plans/*.pdf"):
        book = os.path.basename(pdf_path)
        elements = partition_pdf(filename=pdf_path)
        for element in elements:
            if not element.text or not element.text.strip():
                continue
            pg_num = element.metadata.page_number
            enc = chunk_tokenizer(
                element.text,
                max_length=800,
                truncation=True,
                return_overflowing_tokens=True,
                stride=50,
            )
            for token_ids in enc["input_ids"]:
                chunk = chunk_tokenizer.decode(token_ids, skip_special_tokens=True)
                texts.append(chunk)
                sources.append(book)
                pages.append(pg_num)

    # 4) Build the HF Dataset and push it (requires a Hugging Face login,
    #    e.g. `huggingface-cli login` or the HF_TOKEN environment variable).
    ds = Dataset.from_dict({
        "text": texts,
        "source": sources,
        "page": pages,
    })
    ds.push_to_hub(dataset_name)

    # 5) Build the FAISS index. Row i of the dataset corresponds to vector i
    #    in the index, so the two can be joined at query time.
    embeddings = embedder.encode(texts, convert_to_numpy=True)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)  # exact L2 search, CPU index
    index.add(np.asarray(embeddings, dtype="float32"))
    faiss.write_index(index, index_path)


if __name__ == "__main__":
    ingest_and_push()
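

# --- Query-side sketch (illustrative, not part of the ingest pipeline) ---
# A minimal example of how the pushed dataset and the local FAISS index could
# be used together at retrieval time, assuming the default names above and
# that push_to_hub created the usual "train" split. The `search` helper and
# the parameter `k` are hypothetical, not defined elsewhere in this script.
def search(query, index_path="mealplan.index",
           dataset_name="DurgaDeepak/meal_plans", k=5):
    from datasets import load_dataset

    # Embed the query with the same model used for the chunks.
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    index = faiss.read_index(index_path)
    ds = load_dataset(dataset_name, split="train")

    q = embedder.encode([query], convert_to_numpy=True)
    distances, ids = index.search(np.asarray(q, dtype="float32"), k)

    # Vector i in the index maps back to row i of the dataset.
    return [
        {
            "text": ds[int(i)]["text"],
            "source": ds[int(i)]["source"],
            "page": ds[int(i)]["page"],
            "distance": float(d),
        }
        for i, d in zip(ids[0], distances[0])
    ]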