AryanRajSaxena committed
Commit 7c114b1 · 1 Parent(s): 10e986a

initial commit

.gitignore ADDED
@@ -0,0 +1 @@
+ venv
README.md CHANGED
@@ -1,13 +0,0 @@
- ---
- title: Biosummarize Ai
- emoji: 📚
- colorFrom: blue
- colorTo: pink
- sdk: gradio
- sdk_version: 5.33.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,30 @@
+ import gradio as gr
+ from utils.pdf_parser import extract_text_from_pdf
+ from summarizer import Summarizer
+ from qa_engine import QABot
+
+ summarizer = Summarizer()
+
+ def summarize_and_answer(pdf_file, question):
+     text = extract_text_from_pdf(pdf_file.name)
+     summary = summarizer.summarize(text)
+     bot = QABot(text.split("\n\n"))
+     context = bot.retrieve_context(question)
+     return summary, context
+
+ iface = gr.Interface(
+     fn=summarize_and_answer,
+     inputs=[
+         gr.File(label="Upload Biotech Research PDF"),
+         gr.Textbox(label="Ask a Question")
+     ],
+     outputs=[
+         gr.Textbox(label="Summary"),
+         gr.Textbox(label="Context")
+     ],
+     title="🧬 BioSummarize.ai",
+     description="Summarize biotech papers and ask questions using AI"
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
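A rough sketch of exercising the handler outside the Gradio UI, useful for quick local testing: "paper.pdf" is a hypothetical local file, and SimpleNamespace stands in for the uploaded-file object so that pdf_file.name resolves the same way it does inside the interface.

    # Sketch only: drive summarize_and_answer directly; "paper.pdf" is hypothetical.
    from types import SimpleNamespace
    from app import summarize_and_answer

    fake_upload = SimpleNamespace(name="paper.pdf")
    summary, context = summarize_and_answer(fake_upload, "Which model organism was used?")
    print(summary)
    print(context)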
qa_engine.py ADDED
@@ -0,0 +1,16 @@
+ from sentence_transformers import SentenceTransformer
+ import faiss
+ import numpy as np
+
+ class QABot:
+     def __init__(self, chunks):
+         self.embedder = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
+         self.chunks = chunks
+         self.index = faiss.IndexFlatL2(768)
+         self.embeddings = self.embedder.encode(chunks)
+         self.index.add(np.array(self.embeddings))
+
+     def retrieve_context(self, query, k=3):
+         q_embed = self.embedder.encode([query])
+         D, I = self.index.search(np.array(q_embed), k)
+         return "\n".join([self.chunks[i] for i in I[0]])
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio
+ transformers
+ sentence-transformers
+ torch
+ faiss-cpu
+ PyMuPDF
summarizer.py ADDED
@@ -0,0 +1,11 @@
+ from transformers import AutoTokenizer, BartForConditionalGeneration
+
+ class Summarizer:
+     def __init__(self):
+         self.tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
+         self.model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+
+     def summarize(self, text, max_tokens=200):
+         inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
+         summary_ids = self.model.generate(inputs["input_ids"], max_new_tokens=max_tokens)
+         return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
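A minimal sketch of the Summarizer in isolation, assuming the facebook/bart-large-cnn weights are available; the input text is a placeholder abstract. Note that the tokenizer truncates at 1024 tokens, so very long papers are summarized from their opening pages only.

    # Sketch only: the input text is a placeholder.
    from summarizer import Summarizer

    s = Summarizer()
    text = (
        "We engineered a yeast strain to produce artemisinic acid at high titer "
        "by rewiring the mevalonate pathway and optimizing fermentation conditions."
    )
    print(s.summarize(text, max_tokens=60))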
utils/__pycache__/pdf_parser.cpython-312.pyc ADDED
Binary file (480 Bytes).
 
utils/pdf_parser.py ADDED
@@ -0,0 +1,8 @@
+ import fitz  # PyMuPDF
+
+ def extract_text_from_pdf(pdf_path):
+     text = ""
+     doc = fitz.open(pdf_path)
+     for page in doc:
+         text += page.get_text()
+     return text
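A minimal sketch of the parser on its own; "paper.pdf" is a hypothetical local file. extract_text_from_pdf simply concatenates the plain text of every page via PyMuPDF.

    # Sketch only: "paper.pdf" is a hypothetical path.
    from utils.pdf_parser import extract_text_from_pdf

    text = extract_text_from_pdf("paper.pdf")
    print(text[:500])  # preview the first 500 characters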