# biosummarize-ai / summarizer.py
from transformers import AutoTokenizer, BartForConditionalGeneration
import torch  # backend for the "pt" tensors returned by the tokenizer


class Summarizer:
    def __init__(self):
        # Distilled BART checkpoint fine-tuned on CNN/DailyMail for abstractive summarization.
        self.tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
        self.model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")

    def split_text(self, text, max_tokens=1024):
        # Split on whitespace; the word count is only a rough proxy for the token count,
        # and the tokenizer's truncation in summarize() guards against any overflow.
        words = text.split()
        chunks = [' '.join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]
        return chunks
    def summarize(self, text):
        chunks = self.split_text(text)
        partial_summaries = []
        for chunk in chunks:
            # Tokenize each chunk, truncating to the model's 1024-token input limit.
            inputs = self.tokenizer(chunk, return_tensors="pt", truncation=True, max_length=1024)
            summary_ids = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=200,
            )
            summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            partial_summaries.append(summary)
        # Merge the per-chunk summaries into the final summary.
        full_summary = " ".join(partial_summaries)
        return full_summary
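

# A minimal usage sketch, not part of the original module: it assumes the file is
# run directly and uses a made-up sample passage to exercise Summarizer end to end.
if __name__ == "__main__":
    sample_text = (
        "Mitochondria are membrane-bound organelles found in most eukaryotic cells. "
        "They generate the bulk of the cell's supply of adenosine triphosphate (ATP), "
        "which is used as a source of chemical energy."
    )
    summarizer = Summarizer()
    print(summarizer.summarize(sample_text))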