Mojo3 committed on
Commit d1e1da5 · verified · 1 Parent(s): 2a1b8e8

Update app.py

Files changed (1)
  1. app.py +14 -203
app.py CHANGED
@@ -1,208 +1,19 @@
  import streamlit as st
- from docx import Document
- import os
- from langchain_core.prompts import PromptTemplate
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import torch
- import time
- from sentence_transformers import SentenceTransformer
- from langchain.vectorstores import Chroma
- from langchain.docstore.document import Document as Document2
- from langchain_community.embeddings import HuggingFaceEmbeddings
+ from transformers import pipeline

- from huggingface_hub import HfFolder
+ @st.cache_resource
+ def load_model():
+     return pipeline("text-generation", model="gpt2")

- # Load token from environment variable
- token = os.getenv("HF_TOKEN")
+ model = load_model()

- print("my token is ", token)
- # Save the token to Hugging Face's system directory
- HfFolder.save_token(token)
+ st.title("Simple Text Generator")
+ user_input = st.text_input("Enter your prompt here:")

- docs_folder = "./converted_docs"
-
-
- # Function to load .docx files from Google Drive folder
- def load_docx_files_from_drive(drive_folder):
-     docx_files = [f for f in os.listdir(drive_folder) if f.endswith(".docx")]
-     documents = []
-
-     for file_name in docx_files:
-         file_path = os.path.join(drive_folder, file_name)
-         doc = Document(file_path)
-         content = "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
-         documents.append(content)
-
-     return documents
-
-
- # Load .docx files from Google Drive folder
- documents = load_docx_files_from_drive(docs_folder)
-
-
- def split_extracted_text_into_chunks(documents):
-     print("Splitting text into chunks")
-     # List to hold all chunks
-     chunks = []
-
-     for doc_text in documents:
-         # Split the document text into lines
-         lines = doc_text.splitlines()
-
-         # Initialize variables for splitting
-         current_chunk = []
-         for line in lines:
-             # Check if the line starts with "File Name:"
-             if line.startswith("File Name:"):
-                 # If there's a current chunk, save it before starting a new one
-                 if current_chunk:
-                     chunks.append("\n".join(current_chunk))
-                     current_chunk = []  # Reset the current chunk
-
-             # Add the line to the current chunk
-             current_chunk.append(line)
-
-         # Add the last chunk for the current document
-         if current_chunk:
-             chunks.append("\n".join(current_chunk))
-
-     return chunks
-
-
- # Split the extracted documents into chunks
- chunks = split_extracted_text_into_chunks(documents)
-
-
- def save_chunks_to_file(chunks, output_file_path):
-     print("Saving chunks to file")
-     # Open the file in write mode
-     with open(output_file_path, "w", encoding="utf-8") as file:
-         for i, chunk in enumerate(chunks, start=1):
-             # Write each chunk with a header for easy identification
-             file.write(f"Chunk {i}:\n")
-             file.write(chunk)
-             file.write("\n" + "=" * 50 + "\n")
-
-
- # Path to save the chunks file
- output_file_path = "./chunks_output.txt"
-
- # Split the extracted documents into chunks
- chunks = split_extracted_text_into_chunks(documents)
-
- # Save the chunks to the file
- save_chunks_to_file(chunks, output_file_path)
-
-
- # Step 1: Load the model through LangChain's wrapper
- embedding_model = HuggingFaceEmbeddings(
-     model_name="Omartificial-Intelligence-Space/Arabic-Triplet-Matryoshka-V2"
- )
-
-
- # Step 2: Embed the chunks (now simplified)
- def embed_chunks(chunks):
-     print("Embedding the chunks")
-     return [
-         {"chunk": chunk, "embedding": embedding_model.embed_query(chunk)}
-         for chunk in chunks
-     ]
-
-
- embeddings = embed_chunks(chunks)
-
-
- # Step 3: Prepare documents (unchanged)
- def prepare_documents_for_chroma(embeddings):
-     print("Preparing documents for chroma")
-     return [
-         Document2(page_content=entry["chunk"], metadata={"chunk_index": i})
-         for i, entry in enumerate(embeddings, start=1)
-     ]
-
-
- documents = prepare_documents_for_chroma(embeddings)
-
- # Step 4: Create Chroma store (fixed)
- vectorstore = Chroma.from_documents(
-     documents=documents,
-     embedding=embedding_model,  # Proper embedding object
-     persist_directory="./chroma_db",  # Optional persistence
- )
-
-
- import cohere
- from langchain_core.prompts import PromptTemplate
-
-
- class RAGPipeline:
-     def __init__(self, vectorstore, api_key, model_name="c4ai-aya-expanse-8b", k=3):
-         self.vectorstore = vectorstore
-         self.model_name = model_name
-         self.k = k
-         self.api_key = api_key
-         self.client = cohere.Client(api_key)  # Initialize the Cohere client
-         self.retriever = self.vectorstore.as_retriever(
-             search_type="mmr", search_kwargs={"k": 3}
-         )
-         self.prompt_template = PromptTemplate.from_template(self._get_template())
-
-     def _get_template(self):
-         return """<s>[INST] <<SYS>>
- أنت مساعد مفيد يقدم إجابات باللغة العربية بناءً على السياق المقدم.
- أجب فقط باللغة العربية
- إذا لم تجد إجابة في السياق، قل أنك لا تعرف
- كن دقيقاً وواضحاً في إجاباتك
- جاوب من السياق حصريا
- <</SYS>>
-
- السياق: {context}
-
- السؤال: {question}
- الإجابة: [/INST]\
-
- """
-
-     def generate_response(self, question):
-         retrieved_docs = self._retrieve_documents(question)
-         prompt = self._create_prompt(retrieved_docs, question)
-         response = self._generate_response_cohere(prompt)
-         return response
-
-     def _retrieve_documents(self, question):
-         retrieved_docs = self.retriever.invoke(question)
-         # print("\n=== المستندات المسترجعة ===")
-         # for i, doc in enumerate(retrieved_docs):
-         #     print(f"المستند {i+1}: {doc.page_content}")
-         # print("==========================\n")
-
-         # Merge the retrieved texts into a single context
-         return " ".join([doc.page_content for doc in retrieved_docs])
-
-     def _create_prompt(self, docs, question):
-         return self.prompt_template.format(context=docs, question=question)
-
-     def _generate_response_cohere(self, prompt):
-         # Call Cohere's generate API
-         response = self.client.generate(
-             model=self.model_name,
-             prompt=prompt,
-             max_tokens=2000,  # Adjust token limit based on requirements
-             temperature=0.3,  # Control creativity
-             stop_sequences=None,
-         )
-
-         if response.generations:
-             return response.generations[0].text.strip()
-         else:
-             raise Exception("No response generated by Cohere API.")
-
- api_key = os.getenv("API_KEY")
- rag_pipeline = RAGPipeline(vectorstore=vectorstore, api_key=api_key)
-
- question = st.text_area("أدخل سؤالك هنا")
- if st.button("Generate Answer"):
-     response = rag_pipeline.generate_response(question)
-     st.write(response)
-     print("Question: ", question)
-     print("Response: ", response)
+ if st.button("Generate"):
+     if user_input:
+         with st.spinner("Generating..."):
+             output = model(user_input, max_length=100, do_sample=True)[0]['generated_text']
+         st.write(output)
+     else:
+         st.warning("Please enter a prompt")
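
A minimal standalone sketch of the text-generation call the new app.py relies on, assuming the transformers package and the gpt2 weights are available locally; the [0]['generated_text'] indexing in the new code works because a text-generation pipeline returns a list with one dict per generated sequence.

from transformers import pipeline

# Load the same model the Streamlit app caches with @st.cache_resource.
generator = pipeline("text-generation", model="gpt2")

# The pipeline returns a list of dicts; each dict holds the prompt plus the
# model's continuation under the 'generated_text' key.
result = generator("Once upon a time", max_length=100, do_sample=True)
print(result[0]["generated_text"])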