from transformers import AutoTokenizer, BartForConditionalGeneration


class Summarizer:
    def __init__(self):
        # DistilBART fine-tuned on CNN/DailyMail; its encoder accepts at most 1,024 tokens.
        self.tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
        self.model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")

    def split_text(self, text, max_words=800):
        # Split on whitespace into chunks of at most `max_words` words.
        # 800 words is a conservative heuristic so that each chunk stays within the
        # model's 1,024-token limit after subword tokenization; chunks of 1,024 words
        # would otherwise be silently truncated by the tokenizer.
        words = text.split()
        chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), max_words)]
        return chunks

    def summarize(self, text):
        chunks = self.split_text(text)
        partial_summaries = []

        for chunk in chunks:
            inputs = self.tokenizer(chunk, return_tensors="pt", truncation=True, max_length=1024)
            # Pass the attention mask along with the input ids so padding (if any) is ignored.
            summary_ids = self.model.generate(**inputs, max_new_tokens=200)
            summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            partial_summaries.append(summary)

        # Join the per-chunk summaries into one merged summary.
        full_summary = " ".join(partial_summaries)
        return full_summary
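

# Example usage: a minimal sketch of how the class above might be called.
# The sample text is a placeholder, not part of the original file.
if __name__ == "__main__":
    summarizer = Summarizer()
    long_text = (
        "Replace this placeholder with the document to summarize. Texts longer than "
        "the model's 1,024-token window are split into chunks, each chunk is "
        "summarized independently, and the partial summaries are joined."
    )
    print(summarizer.summarize(long_text))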