anishde committed on
Commit
25885e2
·
verified ·
1 Parent(s): 3eb1372

Delete app1.py

Browse files
Files changed (1) hide show
  1. app1.py +0 -311
app1.py DELETED
@@ -1,311 +0,0 @@
1
- from langchain.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
2
- from langchain.text_splitter import CharacterTextSplitter
3
- from langchain.document_loaders import PDFMinerLoader
4
- from langchain.text_splitter import RecursiveCharacterTextSplitter
5
- from langchain.embeddings import HuggingFaceEmbeddings
6
- from langchain import HuggingFaceHub
7
- from langchain.chains.summarize import load_summarize_chain
8
- from langchain.chains.llm_summarization_checker.base import LLMSummarizationCheckerChain
9
- from langchain.prompts import PromptTemplate
10
- import os
11
- import gradio as gr
12
- import shutil
13
- import re
14
- import tempfile
15
- import cache
16
- from pathlib import Path
17
-
18
-
19
# Hugging Face API token, read from the ``api`` environment variable.
# A missing variable raises KeyError at import time.
api_token = os.environ['api']
# Export the token under the variable name the Hugging Face client libraries
# read (the previous spelling, "HUGGINFACEHUB_API_TOKEN", was never picked up).
os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token

# Notebook alternative kept for reference:
# api=userdata.get('api')
# api_token=api
# os.environ["HUGGINGFACEHUB_API_TOKEN"]=api_token

# Scratch directory path; nothing in the visible code reads it.
temp_dir = "/content/sample_data"
28
-
29
- # file_path_dummy = "/content/2401.10231.pdf"
30
- # if file_path_dummy.lower().endswith(".pdf") :
31
- # loader = TextLoader(file_path_dummy)
32
- # document= loader.load()
33
- # print(document)
34
-
35
def data_ingestion(file_path):
    """Load a document from disk and split it into chunks for summarization.

    The loader is chosen from the file extension (case-insensitive):
    ``.pdf`` uses ``PDFMinerLoader``, ``.txt`` uses ``TextLoader`` and every
    other extension is handed to ``Docx2txtLoader``.  The chunk size is 3% of
    the character count of the first loaded document, with no overlap.

    Args:
        file_path: Path of the file to ingest.

    Returns:
        The list of chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``, or an empty list
        when the loader yields no documents.

    Raises:
        ValueError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise ValueError(f"File path {file_path} does not exist.")

    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        loader = PDFMinerLoader(file_path)
    elif lowered_path.endswith(".txt"):
        loader = TextLoader(file_path)
    else:
        loader = Docx2txtLoader(file_path)

    document = loader.load()
    if not document:
        # Nothing was loaded, so there is nothing to size chunks from.
        return []

    length = len(document[0].page_content)
    # The splitter expects a whole number of characters; clamp to 1 so very
    # short documents do not end up with a zero chunk size.
    chunk_size = max(1, int(0.03 * length))

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)

    # The embedding model and LLM that used to be instantiated here were never
    # used by this function, so they are no longer built on every call.
    return text_splitter.split_documents(document)
73
-
74
- # text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
75
- # chunk_size=2000, chunk_overlap=0
76
- # )
77
- # split_docs = text_splitter.split_documents(document)
78
-
79
- # documents=split_text_into_batches(str(document),400)
80
- # len(documents)
81
- # documents[0]
82
- # #
83
- # from langchain.text_splitter import CharacterTextSplitter
84
- # text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)
85
- # documents = text_splitter.split_documents(document)
86
- # Embeddings
87
-
88
- # from langchain.chains.question_answering import load_qa_chain
89
-
90
- ########## CHAIN 1 norm text
91
-
92
# Question prompt shared by the generic summarizers (chain1 and chain2).
_SECTION_SUMMARY_TEMPLATE = """Your job is to write a summary of the document such that every summary of the text is of 2 sentences
here is the content of the section:
"{text}"

SUMMARY:"""

# Question prompt for sections of an arXiv paper (chain3).
_ARXIV_SECTION_SUMMARY_TEMPLATE = """You are being given a markdown document with headers, this is part of a larger arxiv paper. Your job is to write a summary of the document such that every summary of the text is of 2 sentences
here is the content of the section:
"{text}"

SUMMARY:"""


def _build_refine_summarizer(prompt_template):
    """Build a "refine" summarize chain whose question prompt is *prompt_template*.

    The chain reads its documents from the ``input_documents`` key and writes
    the summary to ``output_text``.  Only the question prompt is customised;
    no custom refine prompt is passed, matching the previous behaviour where
    the refine prompt was constructed but never handed to the chain.
    """
    return load_summarize_chain(
        llm=HuggingFaceHub(
            repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
            model_kwargs={"temperature": 1, "max_length": 10000},
            huggingfacehub_api_token=api_token,
        ),
        chain_type="refine",
        question_prompt=PromptTemplate.from_template(prompt_template),
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )


########## CHAIN 1 norm text

def chain1():
    """Return the summarize chain for plain text, using the generic section prompt."""
    return _build_refine_summarizer(_SECTION_SUMMARY_TEMPLATE)


########## CHAIN 2 research paper

def chain2():
    """Return a summarize chain configured identically to ``chain1``."""
    return _build_refine_summarizer(_SECTION_SUMMARY_TEMPLATE)


########## CHAIN 3 arxiv_paper_1

def chain3():
    """Return the summarize chain whose prompt frames the text as an arXiv paper section."""
    return _build_refine_summarizer(_ARXIV_SECTION_SUMMARY_TEMPLATE)
199
- # result = chain({"input_documents":split_docs}, return_only_outputs=True)
200
- # chain.run(document)
201
- # print(result["output_text"])
202
-
203
def chain_function(checkbox_values):
    """Pick the summarize chain matching the selected document type.

    Args:
        checkbox_values: Collection of selected document-type labels.

    Returns:
        The chain built for the first matching label, checked in the order
        "Research Paper", "Legal Document", "Study Material"; or the string
        "Please select a document type to run." when none of them is selected.
    """
    # Ordered by precedence: the first selected label wins.  The lambdas defer
    # building a chain until its label is actually matched.
    builders = (
        ("Research Paper", lambda: chain3()),
        ("Legal Document", lambda: chain2()),
        ("Study Material", lambda: chain1()),
    )
    for label, build in builders:
        if label in checkbox_values:
            return build()
    return "Please select a document type to run."
213
-
214
def result(chain, split_docs):
    """Summarize every chunk with *chain* and concatenate the summaries.

    Args:
        chain: Callable taking ``{"input_documents": [doc]}`` and returning a
            mapping with an ``"output_text"`` entry.  ``chain_function``
            returns a plain message string when no document type is selected;
            such a string is passed through unchanged instead of being called.
        split_docs: Iterable of document chunks, summarized one at a time.

    Returns:
        The per-chunk summaries joined in order with no separator, or the
        message string when *chain* is a string.
    """
    if isinstance(chain, str):
        # Not a chain but a user-facing message: surface it as the output.
        return chain

    return "".join(
        chain({"input_documents": [doc]})["output_text"] for doc in split_docs
    )
225
-
226
# HTML heading shown at the top of the interface (rendered via gr.Markdown).
title = (
    '<p style="font-family:Century Gothic; text-align:center; font-size: 100px">'
    'S I M P L I F Y</p>'
)
227
-
228
- # description = r"""<p style="font-family: Century Gothic; text-align:center; font-size: 100px">S I M P L I F Y</p>
229
- # """
230
-
231
- # article = r"""
232
- # If PhotoMaker is helpful, please help to ⭐ the <a href='https://github.com/TencentARC/PhotoMaker' target='_blank'>Github Repo</a>. Thanks!
233
- # [![GitHub Stars](https://img.shields.io/github/stars/TencentARC/PhotoMaker?style=social)](https://github.com/TencentARC/PhotoMaker)
234
- # ---
235
- # 📝 **Citation**
236
- # <br>
237
- # If our work is useful for your research, please consider citing:
238
- # ```bibtex
239
- # @article{li2023photomaker,
240
- # title={PhotoMaker: Customizing Realistic Human Photos via Stacked ID Embedding},
241
- # author={Li, Zhen and Cao, Mingdeng and Wang, Xintao and Qi, Zhongang and Cheng, Ming-Ming and Shan, Ying},
242
- # booktitle={arXiv preprint arxiv:2312.04461},
243
- # year={2023}
244
- # }
245
- # ```
246
- # 📋 **License**
247
- # <br>
248
- # Apache-2.0 LICENSE. Please refer to the [LICENSE file](https://huggingface.co/TencentARC/PhotoMaker/blob/main/LICENSE) for details.
249
- # 📧 **Contact**
250
- # <br>
251
- # If you have any questions, please feel free to reach me out at <b>zhenli1031@gmail.com</b>.
252
- # """
253
-
254
- # tips = r"""
255
- # ### Usage tips of PhotoMaker
256
- # 1. Upload more photos of the person to be customized to **improve ID fidelty**. If the input is Asian face(s), maybe consider adding 'asian' before the class word, e.g., `asian woman img`
257
- # 2. When stylizing, does the generated face look too realistic? Adjust the **Style strength** to 30-50, the larger the number, the less ID fidelty, but the stylization ability will be better.
258
- # 3. If you want to generate realistic photos, you could try switching to our other gradio application [PhotoMaker](https://huggingface.co/spaces/TencentARC/PhotoMaker).
259
- # 4. For **faster** speed, reduce the number of generated images and sampling steps. However, please note that reducing the sampling steps may compromise the ID fidelity.
260
- # """
261
-
262
- # def process_file(file_obj):
263
- # destination_path = "/content/sample_data" # Replace with your desired path
264
- # shutil.copy(file_obj, destination_path) # Save file to specified path
265
- # return os.path.join(destination_path, file_obj)
266
def process_file(list_file_obj):
    """Return the filesystem path of the first uploaded file.

    Args:
        list_file_obj: List of uploads.  Each entry is either an object with
            a ``name`` attribute holding the path, or the path itself;
            ``None`` entries are ignored.

    Returns:
        The path of the first usable upload.  Any further uploads are ignored.

    Raises:
        ValueError: If nothing was uploaded.
    """
    uploads = [item for item in (list_file_obj or []) if item is not None]
    if not uploads:
        raise ValueError("No file was uploaded.")

    first = uploads[0]
    # File wrappers expose the path as ``.name``; plain path strings have no
    # such attribute and are returned as they are.
    return getattr(first, "name", first)
273
-
274
def inference(checkbox_values, uploaded_file):
    """Summarize the uploaded file according to the selected document type.

    Args:
        checkbox_values: Selected document-type labels from the checkbox group.
        uploaded_file: List of uploads from the file component.

    Returns:
        The concatenated summary text, or a short message asking the user to
        upload a file or select a document type when either is missing.
    """
    if not uploaded_file:
        return "Please upload a file to summarize."

    # Resolve the chain first so a missing selection is reported before any
    # file loading or model call is attempted.
    chain = chain_function(checkbox_values or [])
    if isinstance(chain, str):
        # chain_function returns its "please select" message as a string.
        return chain

    file_path = process_file(uploaded_file)
    split_docs = data_ingestion(file_path)
    return result(chain, split_docs)
280
-
281
def main():
    """Build the Gradio interface and launch it.

    The page shows the title, a checkbox group for the document type, a file
    upload and a submit button, plus a textbox that receives the output of
    ``inference`` when the button is clicked.
    """
    # NOTE(review): the nesting below is reconstructed -- the source this was
    # recovered from had lost its indentation.  Confirm that the button
    # belongs in the left column and the textbox in the right one.
    with gr.Blocks(theme="monochrome") as demo:
        gr.Markdown(title)

        with gr.Row():
            with gr.Column():
                checkbox_values = gr.CheckboxGroup(
                    ["Research Paper", "Legal Document", "Study Material"],
                    label="Choose the document type",
                )
                uploaded_file = gr.Files(
                    height=100,
                    file_count="multiple",
                    # Extensions need a leading dot; a bare "pdf" is not
                    # treated as an extension filter.
                    file_types=["text", ".docx", ".pdf"],
                    interactive=True,
                    label="Upload your File.",
                )
                btn = gr.Button("Submit")
            with gr.Column():
                txt = gr.Textbox(
                    show_label=False,
                    # placeholder="Simplify."
                )

        btn.click(
            fn=inference,
            inputs=[checkbox_values, uploaded_file],
            outputs=[txt],
            queue=True,
        )

    demo.queue()
    demo.launch()


if __name__ == "__main__":
    main()
311
-