AryanRajSaxena committed
Commit f4dcafb · 1 Parent(s): 0c56589

second iteration

Files changed (5)
  1. app.py +51 -19
  2. chatbot.py +13 -0
  3. qa_engine.py +7 -7
  4. suggestions.py +8 -0
  5. summarizer.py +22 -6
app.py CHANGED
@@ -2,29 +2,61 @@ import gradio as gr
 from utils.pdf_parser import extract_text_from_pdf
 from summarizer import Summarizer
 from qa_engine import QABot
+from chatbot import ask_model
+from suggestions import suggest_questions
 
+# Initialize summarizer and global variables
 summarizer = Summarizer()
+qa_bot = None
+summary = ""
+text_chunks = []
 
-def summarize_and_answer(pdf_file, question):
-    text = extract_text_from_pdf(pdf_file.name)
+# Gradio chat history
+chat_history = []
+
+def process_pdf(file):
+    global summary, qa_bot, text_chunks, chat_history
+    text = extract_text_from_pdf(file.name)
     summary = summarizer.summarize(text)
-    bot = QABot(text.split("\n\n"))
-    context = bot.retrieve_context(question)
-    return summary, context
-
-iface = gr.Interface(
-    fn=summarize_and_answer,
-    inputs=[
-        gr.File(label="Upload Biotech Research PDF"),
-        gr.Textbox(label="Ask a Question")
-    ],
-    outputs=[
-        gr.Textbox(label="Summary"),
-        gr.Textbox(label="Context")
-    ],
-    title="🧬 BioSummarize.ai",
-    description="Summarize biotech papers and ask questions using AI"
-)
+    text_chunks = text.split("\n\n")
+    qa_bot = QABot(text_chunks)
+    chat_history.clear()
+    return summary, "PDF processed. You can now ask questions."
+
+def chat_with_doc(question):
+    if not qa_bot:
+        return chat_history, "Please upload and summarize a document first."
+
+    context = qa_bot.retrieve(question)
+    response = ask_model(context, question)
+
+    chat_history.append((question, response))
+    suggestions = suggest_questions(summary)
+    suggestions_block = "💡 You can also ask:\n" + "\n".join([f"• {q}" for q in suggestions])
+
+    return chat_history, suggestions_block
+
+# UI layout
+with gr.Blocks(title="BioSummarize.ai") as iface:
+    gr.Markdown("# 🧬 BioSummarize.ai")
+    gr.Markdown("Upload a biotech research paper, generate its summary, and chat with it using an AI-powered assistant.")
+
+    with gr.Row():
+        file_input = gr.File(label="Upload Biotech Research PDF")
+        summarize_btn = gr.Button("Summarize + Start Chat")
+
+    summary_box = gr.Textbox(label="📘 Summary", lines=6)
+    summary_status = gr.Textbox(label="Status / Info", lines=2)
+
+    chat_input = gr.Textbox(label="💬 Ask a Question", placeholder="What is the main finding?")
+    chatbot = gr.Chatbot(label="🧠 BioResearch Chatbot")
+
+    suggestions_box = gr.Textbox(label="💡 Follow-up Suggestions", interactive=False)
+
+    # Bind actions
+    summarize_btn.click(fn=process_pdf, inputs=file_input, outputs=[summary_box, summary_status])
+    chat_input.submit(fn=chat_with_doc, inputs=chat_input, outputs=[chatbot, suggestions_box])
 
+# Launch the app
 if __name__ == "__main__":
     iface.launch()
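One caveat about this layout: `qa_bot`, `summary`, `text_chunks`, and `chat_history` are module-level globals, so every visitor to a shared Gradio app reads and writes the same document and chat. A minimal per-session sketch (not part of this commit; it assumes the same imports and the `Summarizer`/`QABot` objects above) threads the state through `gr.State` instead:

# Sketch: per-session state via gr.State instead of module-level globals.
def process_pdf(file, state):
    text = extract_text_from_pdf(file.name)
    state = {"summary": summarizer.summarize(text),
             "qa_bot": QABot(text.split("\n\n")),
             "history": []}
    return state["summary"], "PDF processed. You can now ask questions.", state

def chat_with_doc(question, state):
    if not state:
        return [], "Please upload and summarize a document first.", state
    context = state["qa_bot"].retrieve(question)
    state["history"].append((question, ask_model(context, question)))
    return state["history"], "", state

with gr.Blocks(title="BioSummarize.ai") as iface:
    session = gr.State(None)  # holds one dict per browser session
    file_input = gr.File(label="Upload Biotech Research PDF")
    summarize_btn = gr.Button("Summarize + Start Chat")
    summary_box, status = gr.Textbox(lines=6), gr.Textbox(lines=2)
    chat_input, chatbot = gr.Textbox(), gr.Chatbot()
    summarize_btn.click(process_pdf, [file_input, session], [summary_box, status, session])
    chat_input.submit(chat_with_doc, [chat_input, session], [chatbot, status, session])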
chatbot.py ADDED
@@ -0,0 +1,13 @@
+import openai  # or use requests against the OpenRouter API
+import os
+
+openai.api_base = "https://openrouter.ai/api/v1"  # OpenRouter's OpenAI-compatible endpoint
+openai.api_key = os.getenv("OPENROUTER_API_KEY")
+
+def ask_model(context, query, model="mistralai/mistral-7b-instruct"):
+    prompt = f"""Context: {context}\n\nUser: {query}\nAI:"""
+    response = openai.ChatCompletion.create(
+        model=model,
+        messages=[{"role": "user", "content": prompt}]
+    )
+    return response['choices'][0]['message']['content']
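The import comment mentions a `requests` fallback. For reference, a minimal sketch of that variant against the same OpenRouter chat-completions endpoint (the function name and default model id are illustrative assumptions; any model available on OpenRouter works):

import os
import requests

def ask_model_via_requests(context, query, model="mistralai/mistral-7b-instruct"):
    # Hypothetical requests-based fallback hitting OpenRouter directly
    resp = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {os.getenv('OPENROUTER_API_KEY')}"},
        json={"model": model,
              "messages": [{"role": "user",
                            "content": f"Context: {context}\n\nUser: {query}\nAI:"}]},
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]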
qa_engine.py CHANGED
@@ -4,13 +4,13 @@ import numpy as np
 
 class QABot:
     def __init__(self, chunks):
-        self.embedder = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
+        self.model = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
         self.chunks = chunks
-        self.index = faiss.IndexFlatL2(768)
-        self.embeddings = self.embedder.encode(chunks)
+        self.embeddings = self.model.encode(chunks)
+        self.index = faiss.IndexFlatL2(self.embeddings.shape[1])
         self.index.add(np.array(self.embeddings))
 
-    def retrieve_context(self, query, k=3):
-        q_embed = self.embedder.encode([query])
-        D, I = self.index.search(np.array(q_embed), k)
-        return "\n".join([self.chunks[i] for i in I[0]])
+    def retrieve(self, query, k=3):
+        query_vec = self.model.encode([query])
+        D, I = self.index.search(np.array(query_vec), k)
+        return "\n\n".join([self.chunks[i] for i in I[0]])
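Two details worth noting in this hunk: the index dimension now follows the model's output (`self.embeddings.shape[1]`) instead of a hardcoded 768, and `IndexFlatL2` ranks chunks by Euclidean distance. With sentence embeddings, cosine similarity is the more common metric; a sketch of that variant (assuming the same BioBERT model) normalizes the vectors and switches to an inner-product index:

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Sketch: cosine-similarity retrieval. After L2 normalization,
# inner product equals cosine similarity.
model = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
vecs = np.asarray(model.encode(["chunk one", "chunk two"]), dtype="float32")
faiss.normalize_L2(vecs)                  # in-place row normalization
index = faiss.IndexFlatIP(vecs.shape[1])  # inner-product index
index.add(vecs)

query = np.asarray(model.encode(["example query"]), dtype="float32")
faiss.normalize_L2(query)
scores, ids = index.search(query, 2)      # highest score = most similar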
suggestions.py ADDED
@@ -0,0 +1,8 @@
+def suggest_questions(summary):
+    return [
+        "What is the objective of this research?",
+        "What methods were used in the study?",
+        "What are the key results?",
+        "What limitations or future work are mentioned?",
+        "What is the significance of this study?"
+    ]
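As committed, `suggest_questions` ignores its `summary` argument and always returns the same five questions. A summary-aware variant could reuse `ask_model` from chatbot.py; a sketch (the function name, prompt wording, and line-based parsing are illustrative, not part of this commit):

from chatbot import ask_model

def suggest_questions_from_summary(summary, n=5):
    # Ask the chat model for follow-ups grounded in the actual summary
    reply = ask_model(summary, f"Suggest {n} short follow-up questions about this paper, one per line.")
    questions = [line.strip("•- ").strip() for line in reply.splitlines() if line.strip()]
    return questions[:n] or [reply]  # fall back to the raw reply if parsing fails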
summarizer.py CHANGED
@@ -1,11 +1,27 @@
 from transformers import AutoTokenizer, BartForConditionalGeneration
+import torch
+import math
 
 class Summarizer:
     def __init__(self):
-        self.tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
-        self.model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+        self.tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
+        self.model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
 
-    def summarize(self, text, max_tokens=200):
-        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
-        summary_ids = self.model.generate(inputs["input_ids"], max_new_tokens=max_tokens)
-        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+    def split_text(self, text, max_tokens=1024):
+        words = text.split()
+        chunks = [' '.join(words[i:i+max_tokens]) for i in range(0, len(words), max_tokens)]
+        return chunks
+
+    def summarize(self, text):
+        chunks = self.split_text(text)
+        partial_summaries = []
+
+        for chunk in chunks:
+            inputs = self.tokenizer(chunk, return_tensors="pt", truncation=True, max_length=1024)
+            summary_ids = self.model.generate(inputs["input_ids"], max_new_tokens=200)
+            summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+            partial_summaries.append(summary)
+
+        # Final merged summary
+        full_summary = " ".join(partial_summaries)
+        return full_summary
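Two caveats on the new chunked summarizer. First, `split_text` counts whitespace-separated words while the tokenizer's `max_length=1024` counts subword tokens, so BART will still silently truncate most 1024-word chunks; splitting at roughly 700 words leaves headroom. Second, joining the partial summaries of a long paper can itself run to many paragraphs. A sketch of a second compression pass (map-reduce style, reusing the same class; the helper name is illustrative):

# Sketch: re-summarize the merged partial summaries until the result
# fits in a single chunk. Terminates because each pass compresses
# every chunk down to at most 200 generated tokens.
def summarize_hierarchical(summarizer, text, max_words=700):
    summary = summarizer.summarize(text)
    while len(summary.split()) > max_words:
        summary = summarizer.summarize(summary)
    return summary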