Spaces: Build error

Commit 7c114b1
Parent(s): 10e986a

initial commit

Files changed:
- .gitignore +1 -0
- README.md +0 -13
- app.py +30 -0
- qa_engine.py +16 -0
- requirements.txt +6 -0
- summarizer.py +11 -0
- utils/__pycache__/pdf_parser.cpython-312.pyc +0 -0
- utils/pdf_parser.py +8 -0
.gitignore ADDED
@@ -0,0 +1 @@
+venv
README.md CHANGED
@@ -1,13 +0,0 @@
----
-title: Biosummarize Ai
-emoji: 📚
-colorFrom: blue
-colorTo: pink
-sdk: gradio
-sdk_version: 5.33.0
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,30 @@
+import gradio as gr
+from utils.pdf_parser import extract_text_from_pdf
+from summarizer import Summarizer
+from qa_engine import QABot
+
+summarizer = Summarizer()
+
+def summarize_and_answer(pdf_file, question):
+    text = extract_text_from_pdf(pdf_file.name)
+    summary = summarizer.summarize(text)
+    bot = QABot(text.split("\n\n"))
+    context = bot.retrieve_context(question)
+    return summary, context
+
+iface = gr.Interface(
+    fn=summarize_and_answer,
+    inputs=[
+        gr.File(label="Upload Biotech Research PDF"),
+        gr.Textbox(label="Ask a Question")
+    ],
+    outputs=[
+        gr.Textbox(label="Summary"),
+        gr.Textbox(label="Context")
+    ],
+    title="🧬 BioSummarize.ai",
+    description="Summarize biotech papers and ask questions using AI"
+)
+
+if __name__ == "__main__":
+    iface.launch()
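For context, here is a minimal headless sketch of the same pipeline that app.py wires into Gradio. It assumes the package layout from this commit and a hypothetical local PDF at "paper.pdf"; it bypasses the UI and the uploaded-file object entirely, so it is a sanity-check sketch rather than the app's own entry point.

from utils.pdf_parser import extract_text_from_pdf
from summarizer import Summarizer
from qa_engine import QABot

# Hypothetical local path; app.py instead receives an uploaded file from gr.File.
text = extract_text_from_pdf("paper.pdf")
summary = Summarizer().summarize(text)

# Same naive paragraph chunking app.py uses before building the FAISS index.
bot = QABot(text.split("\n\n"))
context = bot.retrieve_context("What is the main finding of the paper?")

print(summary)
print(context)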
qa_engine.py ADDED
@@ -0,0 +1,16 @@
+from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+
+class QABot:
+    def __init__(self, chunks):
+        self.embedder = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
+        self.chunks = chunks
+        self.index = faiss.IndexFlatL2(768)
+        self.embeddings = self.embedder.encode(chunks)
+        self.index.add(np.array(self.embeddings))
+
+    def retrieve_context(self, query, k=3):
+        q_embed = self.embedder.encode([query])
+        D, I = self.index.search(np.array(q_embed), k)
+        return "\n".join([self.chunks[i] for i in I[0]])
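A standalone sketch of the retrieval step with a few toy chunks. It assumes this BioBERT sentence-transformer emits 768-dimensional embeddings, which the hardcoded IndexFlatL2(768) relies on, and checks that assumption against the loaded model at runtime.

from qa_engine import QABot

chunks = [
    "CRISPR-Cas9 enables targeted genome editing.",
    "Monoclonal antibodies bind a specific antigen.",
    "PCR amplifies DNA sequences exponentially.",
]
bot = QABot(chunks)

# Sanity-check that the embedder's output size matches the FAISS index dimension.
assert bot.embedder.get_sentence_embedding_dimension() == bot.index.d

print(bot.retrieve_context("How does genome editing work?", k=2))

Deriving the index dimension from the model rather than hardcoding 768 would also guard against swapping in an embedder with a different output size.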
requirements.txt ADDED
@@ -0,0 +1,6 @@
+gradio
+transformers
+sentence-transformers
+torch
+faiss-cpu
+PyMuPDF
summarizer.py ADDED
@@ -0,0 +1,11 @@
+from transformers import AutoTokenizer, BartForConditionalGeneration
+
+class Summarizer:
+    def __init__(self):
+        self.tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
+        self.model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+
+    def summarize(self, text, max_tokens=200):
+        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
+        summary_ids = self.model.generate(inputs["input_ids"], max_new_tokens=max_tokens)
+        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
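A small usage sketch for the summarizer on plain text, using a made-up abstract for illustration: max_tokens caps the generated summary length, while max_length=1024 truncates the BART input.

from summarizer import Summarizer

summarizer = Summarizer()
abstract = (
    "We describe a high-throughput screen for antibody candidates that "
    "combines phage display with next-generation sequencing."
)
print(summarizer.summarize(abstract, max_tokens=60))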
utils/__pycache__/pdf_parser.cpython-312.pyc ADDED
Binary file (480 Bytes).
utils/pdf_parser.py ADDED
@@ -0,0 +1,8 @@
+import fitz  # PyMuPDF
+
+def extract_text_from_pdf(pdf_path):
+    text = ""
+    doc = fitz.open(pdf_path)
+    for page in doc:
+        text += page.get_text()
+    return text
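Finally, a quick sketch of the extraction helper on its own, again with a hypothetical local path. Printing a short prefix is a cheap way to confirm the PDF has a selectable text layer: PyMuPDF's get_text() returns an empty string for image-only (scanned) pages.

from utils.pdf_parser import extract_text_from_pdf

text = extract_text_from_pdf("paper.pdf")  # hypothetical path
print(len(text), "characters extracted")
print(text[:300])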