"""Gradio app that OCRs an image-based PDF and summarizes it with a hosted LLM.

Flow: the uploaded PDF is converted to text with a docTR OCR model, the text
is split into chunks, and a LangChain summarize chain backed by a
HuggingFace Hub model produces the summary shown in the UI.
"""
import contextlib
import logging
import os
import re
from typing import List

import gradio as gr
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from langchain.chains.summarize import load_summarize_chain
from langchain.embeddings.base import Embeddings
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms.huggingface_hub import HuggingFaceHub
from sentence_transformers import SentenceTransformer

from read_photodocument import convert_PDF_to_Text

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S",
)
logger = logging.getLogger(__name__)

DEVICE = 'cpu'
FILE_EXT = ['pdf', 'jpg', 'jpeg']
DEFAULT_SYSTEM_PROMPT = "As an intelligent AI your task is to extract text from the pdf containing image and create a summary and higlight vital point within it ."
MAX_NEW_TOKENS = 2048
DEFAULT_TEMPERATURE = 0.1
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = 2048

embedding_modelPath = 'multi-qa-mpnet-base-dot-v1'  # "sentence-transformers/all-MiniLM-l6-v2"
local_embeddings = HuggingFaceEmbeddings(
    model_name=embedding_modelPath,
    model_kwargs={'device': DEVICE},
    encode_kwargs={'normalize_embeddings': False},
)

# Build the OCR predictor once at import time; stdout is redirected to None
# while it is constructed so anything it prints is discarded.
with contextlib.redirect_stdout(None):
    ocr_model = ocr_predictor(
        "db_resnet50",
        "crnn_mobilenet_v3_large",
        pretrained=True,
        assume_straight_pages=True,
    )

# Custom prompt texts for the summarize chain. They are NOT currently passed
# to `load_summarize_chain` (see `summarize_data`); they are kept so they can
# be wired in later via `question_prompt=` / `refine_prompt=`.
_SUMMARY_PROMPT_TEMPLATE = """
Write a concise summary of the following pointwise avoid repetion:
{text}
CONCISE SUMMARY:
"""
_REFINE_PROMPT_TEMPLATE = (
    "Your job is to produce a final summary in points.\n"
    "Existing summary up to a certain point: {existing_answer}\n"
    "write the details of summary pointwise and avoid repetion."
)

# Captures everything after the "CONCISE SUMMARY:" marker. With DOTALL the
# greedy group runs to the end of the text, so at most one match exists.
_SUMMARY_MARKER_RE = re.compile(r"CONCISE SUMMARY:(.*)", re.DOTALL)


def loading_file():
    """Return the placeholder status text shown while a file is loading."""
    return "Loading..."


def summarize_data(docs, llm_model, chain_type='refine') -> List[str]:
    """Summarize *docs* with a LangChain summarize chain and return the lines.

    Args:
        docs: Documents passed to the chain as ``input_documents``.
        llm_model: LLM handed to ``load_summarize_chain``.
        chain_type: Chain type forwarded to ``load_summarize_chain``.

    Returns:
        The summary split on newlines. When the chain output contains a
        "CONCISE SUMMARY:" marker, only the text after it is returned;
        otherwise the whole (stripped) output is used.
    """
    chain = load_summarize_chain(
        llm=llm_model,
        chain_type=chain_type,
        # question_prompt / refine_prompt are intentionally left at the
        # library defaults; see _SUMMARY_PROMPT_TEMPLATE above.
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
    summary = chain({"input_documents": docs}, return_only_outputs=True)
    output_text = summary["output_text"].strip()

    # Fall back to the full output when the marker is absent; previously this
    # path hit an unbound local and raised UnboundLocalError.
    match = _SUMMARY_MARKER_RE.search(output_text)
    summary_text = match.group(1).strip() if match else output_text
    return summary_text.split("\n")


def process_documents(texts, data_chunk=1000, chunk_overlap=10) -> List[Document]:
    """Split raw text into chunked ``Document`` objects.

    Args:
        texts: The full text to split (a single string).
        data_chunk: Chunk size passed to the splitter, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        One ``Document`` per chunk produced by splitting on newlines.
    """
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=data_chunk,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return [Document(page_content=chunk) for chunk in text_splitter.split_text(texts)]


def get_hugging_face_model(model_id='tiiuae/falcon-7b-instruct', temperature=0.01, max_tokens=4096, API_key=None):
    """Create a ``HuggingFaceHub`` LLM client for *model_id*.

    Args:
        model_id: Hub repository id of the model.
        temperature: Sampling temperature sent in ``model_kwargs``.
        max_tokens: Sent as ``max_new_tokens`` in ``model_kwargs``.
        API_key: HuggingFace Hub API token.

    Returns:
        The configured ``HuggingFaceHub`` instance.
    """
    return HuggingFaceHub(
        huggingfacehub_api_token=API_key,
        repo_id=model_id,
        model_kwargs={"temperature": temperature, "max_new_tokens": max_tokens},
    )


def document_loader(temperature, max_tokens, api_key, model_name, file_path) -> str:
    """Gradio callback: OCR the uploaded PDF and return its summary as text.

    Args:
        temperature: Sampling temperature for the LLM.
        max_tokens: Maximum number of new tokens for the LLM.
        api_key: HuggingFace Hub API token.
        model_name: Hub repository id of the LLM to use.
        file_path: The uploaded file, either a path string or an object
            exposing the path as ``.name``.

    Returns:
        The summary lines joined with newlines, or an error message when no
        file was given, the file is not a PDF, or no text could be extracted.
    """
    error_message = "Error in Processsing document "
    if file_path is None:
        return error_message

    # Accept both a plain path and a tempfile-like wrapper with a `.name`.
    path = str(getattr(file_path, "name", file_path))

    converted_txt = None
    if path.lower().endswith('.pdf'):
        conversion_stats = convert_PDF_to_Text(document_file=path, ocr_model=ocr_model)
        converted_txt = conversion_stats["converted_text"]
        logger.info(
            "Converted %s page(s); truncated=%s",
            conversion_stats["num_pages"],
            conversion_stats["truncated"],
        )
        logger.debug("Converted text %s", converted_txt)

    if not converted_txt:
        return error_message

    logger.info("Document Processed ..")
    # Create the LLM client only once there is text to summarize.
    model = get_hugging_face_model(
        model_id=model_name,
        API_key=api_key,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    docs = process_documents(texts=converted_txt)
    lines = summarize_data(docs=docs, llm_model=model)
    # The output component is plain text, so return a string rather than a list.
    return "\n".join(lines)


iface = gr.Interface(
    fn=document_loader,
    inputs=[
        gr.Slider(0.01, 0.1, value=0.01, step=0.01, label="temperature", info="Choose between 0.01 to 0.1"),
        gr.Slider(512, MAX_INPUT_TOKEN_LENGTH, value=1024, step=512, label="max new tokens", info='Max new tokens'),
        gr.Textbox(label="Add API key", type="password"),
        gr.Dropdown(['tiiuae/falcon-7b-instruct', 'mistralai/Mistral-7B-v0.1'], label='Large Language Model', info='LLM Service'),
        "file",
    ],
    outputs="text",
    description="Summarize your PDF Document having Image • HuggingFace",
)

iface.launch()

# Disabled draft of a gr.Blocks-based UI, kept for reference.
# with gr.Blocks(css=css) as demo:
#     with gr.Column(elem_id="col-container"):
#         gr.HTML(title)
#         with gr.Group():
#             chatbot = gr.Chatbot(height=300)
#         with gr.Row():
#             sumarize_btn = gr.Button(value="Summarize", variant="primary", scale = 1)
#             clean_chat_btn = gr.Button("Delete Chat")
#         with gr.Column():
#             LLM_option = gr.Dropdown(['tiiuae/falcon-7b-instruct','mistralai/Mistral-7B-v0.1'],label='Large Language Model Selection',info='LLM Service')
#         with gr.Column():
#             with gr.Box():
#                 file_extension = gr.Dropdown(FILE_EXT, label="File Extensions", info="Select type of file to upload !")
#                 pdf_doc = gr.File(label="Upload File", file_types=FILE_EXT, type="file")
#             with gr.Accordion(label='Advanced options', open=False):
#                 max_new_tokens = gr.Slider(
#                     label='Max new tokens',
#                     minimum=512,
#                     maximum=MAX_NEW_TOKENS,
#                     step=1024,
#                     value=DEFAULT_MAX_NEW_TOKENS,
#                 )
#                 temperature = gr.Slider(
#                     label='Temperature',
#                     minimum=0.01,
#                     maximum=1.0,
#                     step=0.05,
#                     value=DEFAULT_TEMPERATURE,
#                 )
#         with gr.Row():
#             langchain_status = gr.Textbox(label="Status", placeholder="", interactive = False)
#             load_pdf = gr.Button("Upload File & Generate Embeddings",).style(full_width = False)
#         # chatbot = gr.Chatbot()
#         # question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter")
#         # submit_button = gr.Button("Send Message")
#         if pdf_doc:
#             load_pdf.click(loading_file, None, langchain_status, queue=False)
#             load_pdf.click(document_loader, inputs=[pdf_doc,file_extension,temperature,max_new_tokens], outputs=[langchain_status], queue=False)
#         #question.submit(add_text, inputs=[chatbot, question], outputs=[chatbot, question]).then(bot, chatbot, chatbot)
#         #submit_btn.click(add_text, inputs=[chatbot, question], outputs=[chatbot, question]).then(bot, chatbot, chatbot)
#         sumarize_btn.click()
#         # submit_btn.then(chatf.highlight_found_text, [chatbot, sources], [sources])
#         clean_chat_btn.click(clear_chat, [], chatbot)
# demo.launch()