from datasets import load_dataset

# Load dataset from Hugging Face
dataset = load_dataset("MedRAG/textbooks")

# Preview dataset
print(dataset)

import pandas as pd

# Convert to Pandas DataFrame
df = pd.DataFrame(dataset["train"])

# Display first rows
print(df.head())

# Check file format
print(df.dtypes)

import nltk

# Reset any existing NLTK resources
nltk.data.path.append('/root/nltk_data')  # Add the nltk_data path to the search paths
nltk.data.clear_cache()  # Clear the cached NLTK data

# Reinstall the NLTK resources (includes 'punkt')
nltk.download('all')


import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK components
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")

# Load stopwords and lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Step 1: Preprocessing Function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^\w\s]", "", text)  # Remove special characters
    words = word_tokenize(text)  # Tokenization
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]  # Lemmatization & stopword removal
    return " ".join(words)

# Apply preprocessing before chunking
dataset = dataset.map(lambda row: {"cleaned_content": preprocess_text(row["content"])})

# Step 2: Chunking Function
def chunk_text(text, chunk_size=3):
    sentences = sent_tokenize(text)  # Split text into sentences
    return [" ".join(sentences[i:i+chunk_size]) for i in range(0, len(sentences), chunk_size)]

# Apply chunking on the cleaned text
dataset = dataset.map(lambda row: {"chunks": chunk_text(row["cleaned_content"])})

from sentence_transformers import SentenceTransformer

# Load MiniLM (a lightweight Sentence Transformer) for fast embedding
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def generate_embedding(row):
    embedding = embed_model.encode(row["chunks"], convert_to_tensor=False).tolist()

    # Fix: Ensure embedding is a flat list, not nested
    row["embedding"] = embedding[0] if isinstance(embedding, list) and len(embedding) == 1 else embedding
    return row

dataset = dataset.map(generate_embedding)
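
# Performance note (a sketch, commented out): map() above calls encode() once
# per row. sentence-transformers can also batch-encode a flat list of strings,
# which is usually much faster on GPU; the variable names are illustrative.
#
#   all_chunks = [chunk for row in dataset["train"]["chunks"] for chunk in row]
#   all_embeddings = embed_model.encode(all_chunks, batch_size=64, show_progress_bar=True)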

import numpy as np

# Flatten embeddings (convert [[...]] → [...])
valid_embeddings = [
    np.array(row["embedding"]).flatten().tolist()  # Ensure each embedding is 1D
    for row in dataset["train"]
    if isinstance(row["embedding"], list) and len(row["embedding"]) == 384
]

# Convert to NumPy array
embeddings_np = np.array(valid_embeddings, dtype=np.float32)

# Check shape
print("βœ… Fixed Embeddings Shape:", embeddings_np.shape)  # Expected: (num_samples, 384)

import faiss

# Check if embeddings are 2D
if len(embeddings_np.shape) == 1:
    embeddings_np = embeddings_np.reshape(1, -1)  # Ensure it's (num_samples, embedding_dim)

# Check final shape
print("Fixed Embeddings Shape:", embeddings_np.shape)

# Create FAISS index
index = faiss.IndexFlatL2(embeddings_np.shape[1])
index.add(embeddings_np)  # Add all embeddings

print("βœ… Embeddings successfully stored in FAISS!")
print("Total embeddings in FAISS:", index.ntotal)

FAISS_INDEX_PATH = "/content/faiss_medical.index"  # Save in Colab's file system

# Save the FAISS index
faiss.write_index(index, FAISS_INDEX_PATH)

print(f"βœ… FAISS index successfully saved at: {FAISS_INDEX_PATH}")

# Load FAISS index from file
index = faiss.read_index(FAISS_INDEX_PATH)

print(f"βœ… FAISS index loaded from: {FAISS_INDEX_PATH}")
print(f"Total embeddings stored: {index.ntotal}")

print("πŸ” Available columns:", dataset.column_names)  # Should include "chunks"

medical_texts = dataset["train"]["chunks"]  # βœ… Correct way to access chunks
 # Use the same text that will be encoded

print("πŸ” Dataset structure:", dataset)
print("πŸ” Available columns in train:", dataset["train"].column_names)
print("βœ… First 3 chunked texts:", dataset["train"]["chunks"][:3])

import json
id_to_text = {idx: text for idx, text in enumerate(medical_texts)}

with open("id_to_text.json", "w") as f:
    json.dump(id_to_text, f)

import os

# ✅ Check if the file exists
if os.path.exists("id_to_text.json"):
    print("✅ `id_to_text.json` exists!")

    # ✅ Load the JSON file
    with open("id_to_text.json", "r") as f:
        id_to_text = json.load(f)

    # ✅ Compare the number of records
    print(f"📊 Records in `id_to_text.json`: {len(id_to_text)}")
    print(f"📊 Records in `medical_texts`: {len(medical_texts)}")

    if len(id_to_text) == len(medical_texts):
        print("✅ JSON file contains the correct number of records!")
    else:
        print("❌ Mismatch! FAISS ID mapping and dataset size are different.")

else:
    print("❌ `id_to_text.json` was not found! Make sure it was saved correctly.")

import random

# ✅ Pick 3 random FAISS IDs
sample_ids = random.sample(list(id_to_text.keys()), 3)

# ✅ Print their corresponding texts
for faiss_id in sample_ids:
    print(f"FAISS ID {faiss_id} → Text: {id_to_text[faiss_id][:100]}...")  # Show only first 100 chars

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# ✅ Load FAISS
FAISS_INDEX_PATH = "/content/faiss_medical.index"
index = faiss.read_index(FAISS_INDEX_PATH)

# ✅ Load Sentence Transformer model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# ✅ Test a retrieval query
query = "What are the symptoms of pneumonia?"
query_embedding = embed_model.encode([query])

# ✅ Perform FAISS search
D, I = index.search(np.array(query_embedding).astype("float32"), 3)  # Retrieve top 3 matches

# ✅ Print the FAISS results & compare with JSON mapping
print("🔍 FAISS Search Results:", I[0])
print("📏 FAISS Distances:", D[0])

# ✅ Load `id_to_text.json`
with open("id_to_text.json", "r") as f:
    id_to_text = json.load(f)

id_to_text = {int(k): v for k, v in id_to_text.items()}  # Ensure keys are integers

# ✅ Print the matching texts
for faiss_id in I[0]:
    print(f"FAISS ID {faiss_id} → Text: {id_to_text[faiss_id][:100]}...")  # Show first 100 characters

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import json

# ✅ Load FAISS index
FAISS_INDEX_PATH = "/content/faiss_medical.index"
index = faiss.read_index(FAISS_INDEX_PATH)

# ✅ Load embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# ✅ Load FAISS ID → Text mapping
with open("id_to_text.json", "r") as f:
    id_to_text = json.load(f)

# ✅ Convert JSON keys to integers (FAISS returns int IDs)
id_to_text = {int(k): v for k, v in id_to_text.items()}

def retrieve_medical_summary(query, k=3):
    """

    Retrieve the most relevant medical literature from FAISS.



    Args:

        query (str): The medical question.

        k (int, optional): Number of closest documents to retrieve. Defaults to 3.



    Returns:

        str: The most relevant retrieved medical documents.

    """
    # Convert query to embedding
    query_embedding = embed_model.encode([query])

    # Perform FAISS search
    D, I = index.search(np.array(query_embedding).astype("float32"), k)

    # Retrieve the closest matching text using FAISS index IDs
    retrieved_docs = [id_to_text.get(int(idx), "No relevant data found.") for idx in I[0]]

    # ✅ Ensure all retrieved texts are strings (flatten lists if needed)
    retrieved_docs = [doc if isinstance(doc, str) else " ".join(doc) for doc in retrieved_docs]

    # ✅ Join multiple retrieved documents into one response
    return "\n\n---\n\n".join(retrieved_docs) if retrieved_docs else "No relevant data found."


# ✅ Example Test
query = "What are the symptoms of pneumonia?"
retrieved_summary = retrieve_medical_summary(query, k=3)

print("📖 Retrieved Medical Summary:\n", retrieved_summary)



import os
from groq import Groq

# ✅ Store the API key in an environment variable (never commit a real key to source)
os.environ["GROQ_API_KEY"] = "YOUR_GROQ_API_KEY"  # Replace with your actual key

# ✅ Initialize the Groq client, retrieving the API key from the environment
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

def generate_medical_answer_groq(query, model="llama-3.3-70b-versatile", max_tokens=500, temperature=0.3):
    """

    Generates a medical response using Groq's API with LLaMA 3.3-70B, after retrieving relevant literature from FAISS.



    Args:

        query (str): The patient's medical question.

        model (str, optional): The model to use. Defaults to "llama-3.3-70b-versatile".

        max_tokens (int, optional): Max number of tokens to generate. Defaults to 200.

        temperature (float, optional): Sampling temperature (higher = more creative). Defaults to 0.7.



    Returns:

        str: The AI-generated medical advice.

    """

    # ✅ Retrieve relevant medical literature from FAISS
    retrieved_summary = retrieve_medical_summary(query)
    print("\n🔍 Retrieved Medical Text for Query:", query)
    print(retrieved_summary, "\n")

    if not retrieved_summary or retrieved_summary == "No relevant data found.":
        return "No relevant medical data found. Please consult a healthcare professional."

    try:
        # ✅ Send request to Groq API
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert AI specializing in medical knowledge."},
                {"role": "user", "content": f"Summarize the following medical literature and provide a structured medical answer:\n\n### Medical Literature ###\n{retrieved_summary}\n\n### Patient Question ###\n{query}\n\n### Medical Advice ###"}
            ],
            max_tokens=max_tokens,
            temperature=temperature
        )

        return response.choices[0].message.content.strip()  # Ensure clean output

    except Exception as e:
        return f"Error generating response: {str(e)}"

# ✅ Example Usage
query = "What are the symptoms of pneumonia?"
print("🩺 AI-Generated Response:", generate_medical_answer_groq(query))

import gradio as gr

# Gradio interface callback
def ask_medical_question(question):
    return generate_medical_answer_groq(question)

# Create the Gradio interface
iface = gr.Interface(
    fn=ask_medical_question,
    inputs=gr.Textbox(lines=2, placeholder="Enter your medical question here..."),
    outputs=gr.Textbox(lines=10, placeholder="AI-generated medical advice will appear here..."),
    title="Medical Question Answering System",
    description="Ask any medical question, and the AI will provide an answer based on medical literature."
)

# Launch the Gradio app
iface.launch()
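
# Note: inside Colab, a temporary public URL can be requested with
# iface.launch(share=True); a plain launch() serves the app locally.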