from embeddings.agents.interface import Ingest
from haystack.components.converters import PyPDFToDocument
from haystack.components.routers import FileTypeRouter
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack import Pipeline
from haystack_integrations.document_stores.weaviate.document_store import WeaviateDocumentStore
from haystack.components.writers import DocumentWriter
import timeit
import box
import yaml
from rich import print

# Load config vars
with open('config.yml', 'r', encoding='utf8') as ymlfile:
    cfg = box.Box(yaml.safe_load(ymlfile))


class HaystackIngest(Ingest):
    def run_ingest(self, payload: str, file_path: str, index_name: str) -> None:
        print(f"\nRunning embeddings with {payload}\n")
        file_list = [file_path]
        start = timeit.default_timer()

        # Weaviate document store; documents are written to the collection named by index_name
        document_store = WeaviateDocumentStore(
            url=cfg.WEAVIATE_URL,
            collection_settings={"class": index_name}
        )

        # Pipeline components: route PDFs, convert, clean, split, embed, write
        file_type_router = FileTypeRouter(mime_types=["application/pdf"])
        pdf_converter = PyPDFToDocument()
        document_cleaner = DocumentCleaner()
        document_splitter = DocumentSplitter(
            split_by="word",
            split_length=cfg.SPLIT_LENGTH_HAYSTACK,
            split_overlap=cfg.SPLIT_OVERLAP_HAYSTACK
        )
        document_embedder = SentenceTransformersDocumentEmbedder(
            model="sentence-transformers/all-MiniLM-L6-v2"
        )
        document_writer = DocumentWriter(document_store)

        # Assemble the preprocessing pipeline
        preprocessing_pipeline = Pipeline()
        preprocessing_pipeline.add_component(instance=file_type_router, name="file_type_router")
        preprocessing_pipeline.add_component(instance=pdf_converter, name="pypdf_converter")
        preprocessing_pipeline.add_component(instance=document_cleaner, name="document_cleaner")
        preprocessing_pipeline.add_component(instance=document_splitter, name="document_splitter")
        preprocessing_pipeline.add_component(instance=document_embedder, name="document_embedder")
        preprocessing_pipeline.add_component(instance=document_writer, name="document_writer")

        preprocessing_pipeline.connect("file_type_router.application/pdf", "pypdf_converter.sources")
        preprocessing_pipeline.connect("pypdf_converter", "document_cleaner")
        preprocessing_pipeline.connect("document_cleaner", "document_splitter")
        preprocessing_pipeline.connect("document_splitter", "document_embedder")
        preprocessing_pipeline.connect("document_embedder", "document_writer")

        # preprocessing_pipeline.draw("pipeline.png")

        preprocessing_pipeline.run({
            "file_type_router": {"sources": file_list}
        })

        print(f"Number of documents in document store: {document_store.count_documents()}")
        end = timeit.default_timer()
        print(f"Time to ingest data: {end - start} seconds")
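

# Example usage: a minimal, hypothetical sketch. The payload label, PDF path, and
# index name below are illustrative assumptions, and it presumes config.yml defines
# WEAVIATE_URL, SPLIT_LENGTH_HAYSTACK, and SPLIT_OVERLAP_HAYSTACK, with a Weaviate
# instance reachable at that URL.
if __name__ == "__main__":
    ingest = HaystackIngest()
    ingest.run_ingest(payload="haystack", file_path="data/sample.pdf", index_name="Sample_index")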