Spaces:
Sleeping
Sleeping
| from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader | |
| from langchain.text_splitter import CharacterTextSplitter | |
| from langchain.document_loaders import PDFMinerLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain import HuggingFaceHub | |
| from langchain.chains.summarize import load_summarize_chain | |
| from langchain.chains.llm_summarization_checker.base import LLMSummarizationCheckerChain | |
| from langchain.prompts import PromptTemplate | |
| import os | |
| import gradio as gr | |
| import shutil | |
| import re | |
| import tempfile | |
| from pathlib import Path | |
# Hugging Face Inference API token, read from the `api` environment variable.
# A missing variable raises KeyError at import time.
api_token = os.environ['api']
# LangChain's Hugging Face integrations look the token up under
# HUGGINGFACEHUB_API_TOKEN, so export it under exactly that name.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
# Scratch directory path; nothing in the visible code reads it.
temp_dir = "/content/sample_data"
| # file_path_dummy = "/content/2401.10231.pdf" | |
| # if file_path_dummy.lower().endswith(".pdf") : | |
| # loader = TextLoader(file_path_dummy) | |
| # document= loader.load() | |
| # print(document) | |
def data_ingestion(file_path):
    """Load a PDF, TXT or DOCX file and split it into chunks for summarization.

    The loader is picked from the (case-insensitive) file extension: ``.pdf``
    uses ``PDFMinerLoader``, ``.txt`` uses ``TextLoader`` and every other
    extension is handed to ``Docx2txtLoader``.  The loaded text is split with
    ``RecursiveCharacterTextSplitter`` into chunks of about 3% of the first
    loaded document's character count, with no overlap.

    Args:
        file_path: Path of the file to ingest, as a string.

    Returns:
        The list of chunk documents produced by ``split_documents``, or an
        empty list when the loader yields no documents.

    Raises:
        ValueError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise ValueError(f"File path {file_path} does not exist.")

    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        loader = PDFMinerLoader(file_path)
    elif lowered_path.endswith(".txt"):
        loader = TextLoader(file_path)
    else:
        loader = Docx2txtLoader(file_path)

    document = loader.load()
    if not document:
        # Nothing was extracted, so there is nothing to split.
        return []

    length = len(document[0].page_content)
    # The splitter expects a positive integer size; clamp so that empty or
    # very short documents do not produce a zero-sized chunk.
    chunk_size = max(1, int(0.03 * length))
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0,
    )
    return text_splitter.split_documents(document)
| # text_splitter = CharacterTextSplitter.from_tiktoken_encoder( | |
| # chunk_size=2000, chunk_overlap=0 | |
| # ) | |
| # split_docs = text_splitter.split_documents(document) | |
| # documents=split_text_into_batches(str(document),400) | |
| # len(documents) | |
| # documents[0] | |
| # # | |
| # from langchain.text_splitter import CharacterTextSplitter | |
| # text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0) | |
| # documents = text_splitter.split_documents(document) | |
| # Embeddings | |
| # from langchain.chains.question_answering import load_qa_chain | |
| ########## CHAIN 1 norm text | |
def chain1():
    """Build the summarization chain for general study material.

    Returns:
        A LangChain "refine" summarize chain backed by the hosted
        ``mistralai/Mixtral-8x7B-Instruct-v0.1`` model.  It reads its
        documents from the ``input_documents`` key and writes the summary to
        ``output_text``.
    """
    prompt_template = """Please provide a summary of the given study material. Summarize the key concepts, findings, and important details.
Pay special attention to any definitions, theories, or conclusions presented in the text.
Your summary should be concise yet comprehensive, capturing the main points of the study material.
Your job is to write a summary of the document such that every summary of the text is of 2 sentences.
here is the content of the section:
"{text}"
SUMMARY:"""
    question_prompt = PromptTemplate.from_template(prompt_template)

    llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 1, "max_length": 10000},
        huggingfacehub_api_token=api_token,
    )

    # Only the per-chunk question prompt is customised; no refine_prompt is
    # passed, so the library's default refine prompt applies.
    return load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=question_prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
| # result = chain({"input_documents":split_docs}, return_only_outputs=True) | |
| ########## CHAIN 2 Legal Document | |
def chain2():
    """Build the summarization chain for legal documents.

    Returns:
        A LangChain "refine" summarize chain backed by the hosted
        ``mistralai/Mixtral-8x7B-Instruct-v0.1`` model.  It reads its
        documents from the ``input_documents`` key and writes the summary to
        ``output_text``.
    """
    prompt_template = """Summarize the provided legal document. Highlight key legal arguments, decisions, and any crucial precedents mentioned.
Include a concise overview of the case or legal matter, focusing on the most significant details.
Ensure the summary captures the essential legal aspects and implications of the document.
Your job is to write a summary of the document such that every summary of the text is of 2 sentences.
here is the content of the section:
"{text}"
SUMMARY:"""
    question_prompt = PromptTemplate.from_template(prompt_template)

    llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 1, "max_length": 10000},
        huggingfacehub_api_token=api_token,
    )

    # Only the per-chunk question prompt is customised; no refine_prompt is
    # passed, so the library's default refine prompt applies.
    return load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=question_prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
| # result = chain({"input_documents":split_docs}, return_only_outputs=True) | |
| ########## CHAIN 3 arxiv_paper_1 | |
def chain3():
    """Build the summarization chain for arXiv-style research papers.

    Returns:
        A LangChain "refine" summarize chain backed by the hosted
        ``mistralai/Mixtral-8x7B-Instruct-v0.1`` model.  It reads its
        documents from the ``input_documents`` key and writes the summary to
        ``output_text``.
    """
    prompt_template = """You are being given a markdown document with headers, this is part of a larger arxiv paper.
Provide a summary of the given research paper. Capture the main research question, methodology, key findings, and any novel contributions made by the paper.
Emphasize the significance of the research and its potential implications.
The summary should be succinct while conveying the essential information presented in the research paper.
Your job is to write a summary of the document such that every summary of the text is of 2 sentences.
here is the content of the section:
"{text}"
SUMMARY:"""
    question_prompt = PromptTemplate.from_template(prompt_template)

    llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs={"temperature": 1, "max_length": 10000},
        huggingfacehub_api_token=api_token,
    )

    # Only the per-chunk question prompt is customised; no refine_prompt is
    # passed, so the library's default refine prompt applies.
    return load_summarize_chain(
        llm=llm,
        chain_type="refine",
        question_prompt=question_prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
| # result = chain({"input_documents":split_docs}, return_only_outputs=True) | |
| # chain.run(document) | |
| # print(result["output_text"]) | |
def chain_function(checkbox_values):
    """Pick the summarization chain matching the selected document type.

    When several types are ticked the first match wins, checked in the order
    "Research Paper", "Legal Document", "General Text".

    Args:
        checkbox_values: Collection of the labels currently selected.

    Returns:
        The chain built by ``chain3``, ``chain2`` or ``chain1``; if none of
        the known labels is selected, a message string asking the user to
        choose a type.
    """
    if "Research Paper" in checkbox_values:
        return chain3()
    if "Legal Document" in checkbox_values:
        return chain2()
    if "General Text" in checkbox_values:
        return chain1()
    return "Please select a document type to run."
def result(chain, split_docs):
    """Summarize every chunk on its own and glue the summaries together.

    Args:
        chain: Callable that accepts ``{"input_documents": [doc]}`` and
            returns a mapping containing an ``"output_text"`` entry.
        split_docs: Iterable of chunk documents, processed in order.

    Returns:
        The per-chunk summaries concatenated in order with no separator;
        an empty string when there are no chunks.
    """
    return "".join(
        chain({"input_documents": [chunk]})["output_text"]
        for chunk in split_docs
    )
# HTML heading shown at the top of the page (passed to gr.Markdown).
title = (
    '<p style="font-family:Century Gothic; text-align:center; font-size: 100px">'
    "S I M P L I F Y</p>"
)
| # description = r"""<p style="font-family: Century Gothic; text-align:center; font-size: 100px">S I M P L I F Y</p> | |
| # """ | |
| # article = r""" | |
| # If PhotoMaker is helpful, please help to β the <a href='https://github.com/TencentARC/PhotoMaker' target='_blank'>Github Repo</a>. Thanks! | |
| # [](https://github.com/TencentARC/PhotoMaker) | |
| # --- | |
| # π **Citation** | |
| # <br> | |
| # If our work is useful for your research, please consider citing: | |
| # ```bibtex | |
| # @article{li2023photomaker, | |
| # title={PhotoMaker: Customizing Realistic Human Photos via Stacked ID Embedding}, | |
| # author={Li, Zhen and Cao, Mingdeng and Wang, Xintao and Qi, Zhongang and Cheng, Ming-Ming and Shan, Ying}, | |
| # booktitle={arXiv preprint arxiv:2312.04461}, | |
| # year={2023} | |
| # } | |
| # ``` | |
| # π **License** | |
| # <br> | |
| # Apache-2.0 LICENSE. Please refer to the [LICENSE file](https://huggingface.co/TencentARC/PhotoMaker/blob/main/LICENSE) for details. | |
| # π§ **Contact** | |
| # <br> | |
| # If you have any questions, please feel free to reach me out at <b>zhenli1031@gmail.com</b>. | |
| # """ | |
| # tips = r""" | |
| # ### Usage tips of PhotoMaker | |
| # 1. Upload more photos of the person to be customized to **improve ID fidelty**. If the input is Asian face(s), maybe consider adding 'asian' before the class word, e.g., `asian woman img` | |
| # 2. When stylizing, does the generated face look too realistic? Adjust the **Style strength** to 30-50, the larger the number, the less ID fidelty, but the stylization ability will be better. | |
| # 3. If you want to generate realistic photos, you could try switching to our other gradio application [PhotoMaker](https://huggingface.co/spaces/TencentARC/PhotoMaker). | |
| # 4. For **faster** speed, reduce the number of generated images and sampling steps. However, please note that reducing the sampling steps may compromise the ID fidelity. | |
| # """ | |
| # def process_file(file_obj): | |
| # destination_path = "/content/sample_data" # Replace with your desired path | |
| # shutil.copy(file_obj, destination_path) # Save file to specified path | |
| # return os.path.join(destination_path, file_obj) | |
def process_file(list_file_obj):
    """Return the path of the first uploaded file.

    Args:
        list_file_obj: Sequence of uploaded file objects, each exposing its
            path through a ``name`` attribute.

    Returns:
        The ``name`` of the first element; any further uploads are ignored.
    """
    first_upload = list_file_obj[0]
    return first_upload.name
def inference(checkbox_values, uploaded_file):
    """Summarize the first uploaded file with the chain for the chosen type.

    Args:
        checkbox_values: Labels ticked in the document-type checkbox group.
        uploaded_file: Sequence of uploaded file objects; may be ``None`` or
            empty when nothing has been uploaded.

    Returns:
        The concatenated summary text, or a short message asking the user to
        upload a file or select a document type.
    """
    if not uploaded_file:
        return "Please upload a file to summarize."

    # Resolve the chain before loading the file so a missing selection is
    # reported without paying for ingestion.
    chain = chain_function(checkbox_values)
    if isinstance(chain, str):
        # chain_function returns its prompt message (a str) instead of a
        # chain when no known document type is selected; show it as-is.
        return chain

    file_path = process_file(uploaded_file)
    split_docs = data_ingestion(file_path)
    return result(chain, split_docs)
# Gradio UI: document-type selector, file upload and submit button on the
# left, summary text box on the right.
# NOTE(review): the nesting below is reconstructed -- confirm that the button
# belongs in the left column rather than directly under the row.
with gr.Blocks(theme="monochrome") as demo:
    gr.Markdown(title)
    with gr.Row():
        with gr.Column():
            checkbox_values = gr.CheckboxGroup(
                ["Research Paper", "Legal Document", "General Text"],
                label="Choose the document type",
            )
            uploaded_file = gr.Files(
                height=100,
                file_count="multiple",
                # Entries are either a category name ("text") or a dotted
                # extension, hence ".pdf" rather than "pdf".
                file_types=["text", ".docx", ".pdf"],
                interactive=True,
                label="Upload your File.",
            )
            btn = gr.Button("Submit")
        with gr.Column():
            txt = gr.Textbox(show_label=False)
    # Event listeners must be registered inside the Blocks context.
    btn.click(
        fn=inference,
        inputs=[checkbox_values, uploaded_file],
        outputs=[txt],
        queue=False,
    )

if __name__ == "__main__":
    demo.queue()
    demo.launch(debug=True)