# LISA-demo / documents.py
import os
import shutil
from langchain.document_loaders import (
PyMuPDFLoader,
)
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.text_splitter import (
RecursiveCharacterTextSplitter,
SpacyTextSplitter,
)
def load_pdf_as_docs(pdf_path, loader_module=None, load_kwargs=None):
"""Load and parse pdf file(s)."""
if pdf_path.endswith('.pdf'): # single file
pdf_docs = [pdf_path]
else: # a directory
pdf_docs = [os.path.join(pdf_path, f) for f in os.listdir(pdf_path) if f.endswith('.pdf')]
if load_kwargs is None:
load_kwargs = {}
docs = []
if loader_module is None: # set pdf loader
loader_module = PyMuPDFLoader
for pdf in pdf_docs:
loader = loader_module(pdf, **load_kwargs)
doc = loader.load()
docs.extend(doc)
return docs
def load_xml_as_docs(xml_path, loader_module=None, load_kwargs=None):
"""Load and parse xml file(s)."""
from bs4 import BeautifulSoup
from unstructured.cleaners.core import group_broken_paragraphs
if xml_path.endswith('.xml'): # single file
xml_docs = [xml_path]
else: # a directory
xml_docs = [os.path.join(xml_path, f) for f in os.listdir(xml_path) if f.endswith('.xml')]
if load_kwargs is None:
load_kwargs = {}
docs = []
for xml_file in xml_docs:
# print("now reading file...")
with open(xml_file) as fp:
            soup = BeautifulSoup(fp, features="xml")  # parse the XML document
            pageText = soup.find_all(string=True)  # collect all text nodes
            parsed_text = '\n'.join(pageText)  # joining with "\n" or " " gives similar results
            # Clean text: merge paragraphs that were broken across lines
            parsed_text_grouped = group_broken_paragraphs(parsed_text)
# get metadata
try:
from lxml import etree as ET
tree = ET.parse(xml_file)
# Define namespace
ns = {"tei": "http://www.tei-c.org/ns/1.0"}
# Read Author personal names as an example
pers_name_elements = tree.xpath("tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/tei:persName", namespaces=ns)
first_per = pers_name_elements[0].text
author_info = first_per + " et al"
title_elements = tree.xpath("tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title", namespaces=ns)
title = title_elements[0].text
# Combine source info
source_info = "_".join([author_info, title])
            except Exception:
                source_info = "unknown"
# maybe even better TODO: discuss with Jens
# first_author = soup.find("author")
# publication_year = soup.find("date", attrs={'type': 'published'})
# title = soup.find("title")
# source_info = [first_author, publication_year, title]
# source_info_str = "_".join([info.text.strip() if info is not None else "unknown" for info in source_info])
            doc = [Document(page_content=parsed_text_grouped, metadata={"source": source_info})]
docs.extend(doc)
return docs
def get_doc_chunks(docs, splitter=None):
"""Split docs into chunks."""
if splitter is None:
# splitter = RecursiveCharacterTextSplitter(
# # separators=["\n\n", "\n"], chunk_size=1024, chunk_overlap=256
# separators=["\n\n", "\n"], chunk_size=256, chunk_overlap=128
# )
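        # Note: SpacyTextSplitter requires the spacy package (and, by default, its
        # "en_core_web_sm" pipeline); from_tiktoken_encoder additionally needs tiktoken.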
splitter = SpacyTextSplitter.from_tiktoken_encoder(
chunk_size=512,
chunk_overlap=128,
)
chunks = splitter.split_documents(docs)
return chunks
def persist_vectorstore(document_chunks, embeddings, persist_directory="db", overwrite=False):
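    """Create a Chroma vectorstore from document chunks and persist it to persist_directory."""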
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
if overwrite:
shutil.rmtree(persist_directory) # Empty and reset db
db = Chroma.from_documents(documents=document_chunks, embedding=embeddings, persist_directory=persist_directory)
# db.delete_collection()
db.persist()
# db = None
# db = Chroma(persist_directory="db", embedding_function = embeddings, client_settings=CHROMA_SETTINGS)
# vectorstore = FAISS.from_documents(documents=document_chunks, embedding=embeddings)
return db
class VectorstoreManager:
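    """Helper for creating, loading and persisting Chroma vectorstores."""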
def __init__(self):
self.vectorstore_class = Chroma
def create_db(self, embeddings):
db = self.vectorstore_class(embedding_function=embeddings)
self.db = db
return db
def load_db(self, persist_directory, embeddings):
"""Load local vectorestore."""
db = self.vectorstore_class(persist_directory=persist_directory, embedding_function=embeddings)
self.db = db
return db
def create_db_from_documents(self, document_chunks, embeddings, persist_directory="db", overwrite=False):
"""Create db from documents."""
if overwrite:
shutil.rmtree(persist_directory) # Empty and reset db
db = self.vectorstore_class.from_documents(documents=document_chunks, embedding=embeddings, persist_directory=persist_directory)
self.db = db
return db
def persist_db(self, persist_directory="db"):
"""Persist db."""
        assert self.db
        self.db.persist()  # Chroma-specific persistence call
class RetrieverManager:
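    """Helper for building retrievers on top of a vectorstore."""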
    # For other retriever options, see "Using Advanced Retrievers in LangChain": https://www.comet.com/site/blog/using-advanced-retrievers-in-langchain/
def __init__(self, vectorstore, k=10):
self.vectorstore = vectorstore
        self.retriever = vectorstore.as_retriever(search_kwargs={"k": k})
def get_rerank_retriver(self, base_retriever=None):
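        """Wrap the base retriever with a BGE reranker via ContextualCompressionRetriever."""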
if base_retriever is None:
base_retriever = self.retriever
# with rerank
from rerank import BgeRerank
from langchain.retrievers import ContextualCompressionRetriever
compressor = BgeRerank()
compression_retriever = ContextualCompressionRetriever(
base_compressor=compressor, base_retriever=base_retriever
)
return compression_retriever
def get_parent_doc_retriver(self, documents, store_file="./store_location"):
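        """Build a ParentDocumentRetriever: small child chunks are indexed for search, larger parent chunks are returned."""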
# TODO need better design
# Ref: explain how it works: https://clusteredbytes.pages.dev/posts/2023/langchain-parent-document-retriever/
from langchain.storage.file_system import LocalFileStore
from langchain.storage import InMemoryStore
from langchain.storage._lc_store import create_kv_docstore
from langchain.retrievers import ParentDocumentRetriever
# Ref: https://stackoverflow.com/questions/77385587/persist-parentdocumentretriever-of-langchain
# fs = LocalFileStore("./store_location")
# store = create_kv_docstore(fs)
docstore = InMemoryStore()
# TODO: how to better set this?
parent_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=1024, chunk_overlap=256)
child_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=256, chunk_overlap=128)
retriever = ParentDocumentRetriever(
vectorstore=self.vectorstore,
docstore=docstore,
child_splitter=child_splitter,
parent_splitter=parent_splitter,
search_kwargs={"k":10} # Better settings?
)
        retriever.add_documents(documents)
return retriever
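

if __name__ == "__main__":
    # Minimal usage sketch of the helpers above (illustration only): the "pdfs"
    # folder, the "db" directory and the embedding model name are assumptions,
    # not fixed by this module.
    from langchain.embeddings import HuggingFaceEmbeddings

    docs = load_pdf_as_docs("pdfs")  # or load_xml_as_docs(...) for TEI XML
    chunks = get_doc_chunks(docs)

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    vs_manager = VectorstoreManager()
    db = vs_manager.create_db_from_documents(chunks, embeddings, persist_directory="db")
    vs_manager.persist_db()

    retriever_manager = RetrieverManager(db, k=10)
    retriever = retriever_manager.retriever  # or get_rerank_retriver() if the local rerank module is available
    print(retriever.get_relevant_documents("example query"))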