anishde committed
Commit d341c58 · verified · 1 Parent(s): 5722a5d

Delete app1.py

Files changed (1):
  app1.py +0 -288
app1.py DELETED
@@ -1,288 +0,0 @@
from langchain.document_loaders import TextLoader, Docx2txtLoader, PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import HuggingFaceHub
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
import os
import gradio as gr
from pathlib import Path

# The Hugging Face token is read from an environment variable (a Space secret) named "api".
api_token = os.environ['api']
os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token

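# If running outside a Space, the same variable can be exported in the shell
# before launch (placeholder token value, a sketch only):
#
#     export api=hf_xxx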
def data_ingestion(file_path):
    """Load a document from disk and split it into chunks for summarization."""
    if not os.path.exists(file_path):
        raise ValueError(f"File path {file_path} does not exist.")

    file_ext = Path(file_path).suffix.lower()

    # Pick a loader based on the file extension, matching the types the UI accepts.
    if file_ext == ".pdf":
        loader = PDFMinerLoader(file_path)
    elif file_ext in {".docx", ".doc"}:
        loader = Docx2txtLoader(file_path)
    elif file_ext == ".txt":
        loader = TextLoader(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_ext}")
    document = loader.load()

    # RecursiveCharacterTextSplitter keeps each chunk within the model's context budget.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    split_docs = text_splitter.split_documents(document)

    return split_docs

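# A minimal usage sketch (hypothetical file name), assuming a PDF sits next to the app:
#
#     split_docs = data_ingestion("sample.pdf")
#     print(len(split_docs), split_docs[0].page_content[:80])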
########## CHAIN 1: plain concise summary (used for "Study Material")

def chain1():
    prompt_template = """Write a concise summary of the following:
{text}
SUMMARY:"""
    prompt = PromptTemplate.from_template(prompt_template)

    refine_template = (
        "Your job is to produce a final summary.\n"
        "We have the opportunity to refine the existing summary "
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary in English. "
        "If the context isn't useful, return the original summary."
    )
    refine_prompt = PromptTemplate.from_template(refine_template)

    chain1 = load_summarize_chain(
        llm=HuggingFaceHub(
            repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
            model_kwargs={"temperature": 1, "max_length": 10000},
            huggingfacehub_api_token=api_token,
        ),
        chain_type="refine",
        question_prompt=prompt,
        # refine_prompt=refine_prompt,  # the chain's default refine prompt is used instead
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
    return chain1

########## CHAIN 2: research-paper style prompt (mapped to "Legal Document" below)

def chain2():
    prompt_template = """This is a research paper. Your job is to summarise the text portion without any symbols or special characters; skip the mathematical equations for now:
{text}
SUMMARY:"""
    prompt = PromptTemplate.from_template(prompt_template)

    refine_template = (
        "Your job is to produce a final summary.\n"
        "We have the opportunity to refine the existing summary "
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{text}\n"
        "------------\n"
        "Given the new context, refine the original summary in English. "
        "If the context isn't useful, return the original summary."
    )
    refine_prompt = PromptTemplate.from_template(refine_template)

    chain2 = load_summarize_chain(
        llm=HuggingFaceHub(
            repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
            model_kwargs={"temperature": 1, "max_length": 10000},
            huggingfacehub_api_token=api_token,
        ),
        chain_type="refine",
        question_prompt=prompt,
        # refine_prompt=refine_prompt,  # the chain's default refine prompt is used instead
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
    return chain2

########## CHAIN 3: arXiv paper summary (used for "Research Paper")

def chain3():
    prompt_template = """You are being given a markdown document with headers; this is part of a larger arXiv paper. Your job is to write a summary of the document.
Here is the content of the section:
"{text}"

SUMMARY:"""
    prompt = PromptTemplate.from_template(prompt_template)

    refine_template = ("""You are presented with a collection of text snippets. Each snippet is a summary of a specific section from an academic paper published on arXiv. Your objective is to synthesize these snippets into a coherent, concise summary of the entire paper.

DOCUMENT SNIPPETS:
"{text}"

INSTRUCTIONS: Craft a concise summary below, capturing the essence of the paper based on the provided snippets.
It is also important that you highlight the key contributions of the paper, and 3 key takeaways from the paper.
Lastly, you should provide a list of 5 questions that you would ask the author of the paper if you had the chance. Remove all the newline characters.
SUMMARY:
"""
    )
    refine_prompt = PromptTemplate.from_template(refine_template)

    chain3 = load_summarize_chain(
        llm=HuggingFaceHub(
            repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
            model_kwargs={"temperature": 1, "max_length": 10000},
            huggingfacehub_api_token=api_token,
        ),
        chain_type="refine",
        question_prompt=prompt,
        # refine_prompt=refine_prompt,  # the chain's default refine prompt is used instead
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
    return chain3

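# How any of these refine chains can be run directly, following the original
# commented-out usage (a sketch; `split_docs` comes from data_ingestion above):
#
#     chain = chain3()
#     out = chain({"input_documents": split_docs}, return_only_outputs=True)
#     print(out["output_text"])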
def chain_function(checkbox_values):
    """Pick the summarization chain matching the selected document type."""
    if "Research Paper" in checkbox_values:
        return chain3()
    elif "Legal Document" in checkbox_values:
        return chain2()
    elif "Study Material" in checkbox_values:
        return chain1()
    else:
        # Surface a proper error in the UI instead of returning a bare string,
        # which downstream code would try to call as a chain.
        raise gr.Error("Please select a document type to run.")

def result(chain, split_docs):
    """Summarize each chunk independently and concatenate the partial summaries."""
    summaries = []
    for doc in split_docs:
        output = chain({"input_documents": [doc]})
        summaries.append(output["output_text"])
    return "".join(summaries)

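# End-to-end sketch of the same pipeline inference() wires up below
# (hypothetical file name):
#
#     docs = data_ingestion("paper.pdf")
#     summary = result(chain_function(["Research Paper"]), docs)
#     print(summary)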
title = """<p style="font-family:Century Gothic; text-align:center; font-size: 100px">S I M P L I F Y</p>"""

def process_file(list_file_obj):
    # gr.Files yields a list of uploaded files; summarize the first one.
    return list_file_obj[0].name

def inference(checkbox_values, uploaded_file):
    file_path = process_file(uploaded_file)
    split_docs = data_ingestion(file_path)
    chain = chain_function(checkbox_values)
    summary = result(chain, split_docs)
    return summary

with gr.Blocks(theme="monochrome") as demo:
    gr.Markdown(title)

    with gr.Row():
        with gr.Column():
            checkbox_values = gr.CheckboxGroup(
                ["Research Paper", "Legal Document", "Study Material"],
                label="Choose the document type",
            )
            uploaded_file = gr.Files(
                height=100,
                file_count="multiple",
                file_types=["text", ".docx", ".pdf"],
                interactive=True,
                label="Upload your file.",
            )
            btn = gr.Button("Submit")
        with gr.Column():
            txt = gr.Textbox(show_label=False, scale=2)

    btn.click(
        fn=inference,
        inputs=[checkbox_values, uploaded_file],
        outputs=[txt],
        queue=False,
    )

demo.launch(debug=True)