import numpy as np  # noqa: F401 - not used directly; imported before torch to control import order
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import time
import os
import re
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class SummariserService:
    def __init__(self):
        # Status tracking
        self.model_loading_status = {
            "is_loading": False,
            "step": "",
            "progress": 0
        }

        # Candidate models for different kinds of content
        model_options = {
            "general": "facebook/bart-large-cnn",
            "news": "facebook/bart-large-xsum",
            "long_form": "google/pegasus-large",
            "literary": "t5-large"
        }

        # BART (fine-tuned on CNN/DailyMail) handles general web content well
        model_name = model_options["general"]

        # Update loading status
        self.model_loading_status["is_loading"] = True
        self.model_loading_status["step"] = "Initializing tokenizer"

        # Ensure cache directory exists and is writable
        cache_dir = os.environ.get("TRANSFORMERS_CACHE", "/tmp/huggingface_cache")
        os.makedirs(cache_dir, exist_ok=True)

        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                cache_dir=cache_dir,
                local_files_only=False
            )

            self.model_loading_status["step"] = "Loading model"
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
                cache_dir=cache_dir,
                force_download=False,
                local_files_only=False
            )

            # Move to GPU if available
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.model.to(self.device)

        except Exception as e:
            # Fall back to a smaller model if the main one fails to load
            logger.error(f"Error loading model {model_name}: {str(e)}")
            logger.warning("Falling back to smaller model...")

            fallback_model = "sshleifer/distilbart-cnn-6-6"  # Much smaller model

            self.tokenizer = AutoTokenizer.from_pretrained(
                fallback_model,
                cache_dir=cache_dir,
                local_files_only=False
            )

            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                fallback_model,
                cache_dir=cache_dir,
                force_download=False,
                local_files_only=False
            )

            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.model.to(self.device)

            # Update model name for metadata
            model_name = fallback_model

        self.model_loading_status["is_loading"] = False
        self.model_loading_status["progress"] = 100

        # Store the actual model name used
        self.model_name = model_name

        # Track current processing job
        self.current_job = {
            "in_progress": False,
            "start_time": None,
            "input_word_count": 0,
            "estimated_time": 0,
            "stage": "",
            "progress": 0
        }

    def clean_summary(self, summary):
        """Clean and format the summary text."""
        # Remove any leading punctuation or spaces
        summary = re.sub(r'^[,.\s]+', '', summary)

        # Capitalize the first letter
        if summary:
            summary = summary[0].upper() + summary[1:]

        # Ensure the summary ends with sentence-final punctuation: trim any
        # trailing fragment back to the last complete sentence, or add a period
        if summary and not summary.endswith(('.', '!', '?')):
            last_sentence_end = max(
                summary.rfind('.'),
                summary.rfind('!'),
                summary.rfind('?')
            )
            if last_sentence_end > 0:
                summary = summary[:last_sentence_end + 1]
            else:
                summary = summary + '.'

        return summary

    def get_status(self):
        """Return the current status of the summarizer service"""
        status = {
            "model_loading": self.model_loading_status,
            "device": self.device,
            "current_job": self.current_job
        }

        # Update estimated time remaining if job in progress
        if self.current_job["in_progress"] and self.current_job["start_time"]:
            elapsed = time.time() - self.current_job["start_time"]
            estimated = self.current_job["estimated_time"]
            remaining = max(0, estimated - elapsed)
            status["current_job"]["time_remaining"] = round(remaining, 1)

            # Update progress based on time
            if estimated > 0:
                progress = min(95, (elapsed / estimated) * 100)
                status["current_job"]["progress"] = round(progress, 0)

        return status

    def summarise(self, text, max_length=250, min_length=100, do_sample=True, temperature=1.2):
        """
        Summarise the given text using the loaded model.

        Args:
            text (str): The text to summarise
            max_length (int): Maximum length of the summary in tokens
            min_length (int): Minimum length of the summary in tokens
            do_sample (bool): Whether to use sampling during generation
            temperature (float): Sampling temperature (higher = more random;
                only applied when do_sample is True)

        Returns:
            dict: The generated summary and processing metadata

        A usage example is included at the bottom of this module.
        """
        logger.info(f"Starting summarization of text with {len(text)} characters")

        # Reset and start job tracking
        self.current_job = {
            "in_progress": True,
            "start_time": time.time(),
            "input_word_count": len(text.split()),
            "estimated_time": max(1, min(30, len(text.split()) / 500)),  # Rough estimate
            "stage": "Tokenizing input text",
            "progress": 5
        }

        result = {
            "summary": "",
            "metadata": {
                "input_word_count": self.current_job["input_word_count"],
                "estimated_time_seconds": self.current_job["estimated_time"],
                "model_used": self.model_name,
                "processing_device": self.device
            }
        }

        try:
            # Preprocess the text to focus on main content
            text = self.preprocess_text(text)
            logger.info(f"After preprocessing: {len(text)} characters")

            # Tokenization step
            inputs = self.tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
            input_ids = inputs.input_ids.to(self.device)

            # Update metadata with token info
            result["metadata"]["input_token_count"] = len(input_ids[0])
            result["metadata"]["truncated"] = len(input_ids[0]) == 1024

            # Update job status
            self.current_job["stage"] = "Generating summary"
            self.current_job["progress"] = 30

            # Generation parameters tuned for web-content summarization;
            # top_k and top_p only take effect when do_sample is True
            summary_ids = self.model.generate(
                input_ids,
                max_length=max_length,
                min_length=min_length,
                do_sample=do_sample,
                temperature=temperature,
                num_beams=5,  # Beam search width
                early_stopping=True,
                no_repeat_ngram_size=3,  # Block repeated trigrams
                length_penalty=2.0,  # Favour longer, more complete summaries
                top_k=50,  # Sample only from the 50 most likely tokens
                top_p=0.95,  # Nucleus sampling threshold
            )

            # Update job status
            self.current_job["stage"] = "Post-processing summary"
            self.current_job["progress"] = 90

            summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

            # Clean and format the summary
            summary = self.clean_summary(summary)

            result["summary"] = summary
            result["metadata"]["output_word_count"] = len(summary.split())
            result["metadata"]["compression_ratio"] = round(len(summary.split()) / self.current_job["input_word_count"] * 100, 1)

            logger.info(f"Generated summary with {len(summary)} characters")

        except Exception as e:
            logger.error(f"Error during summarization: {str(e)}")
            result["summary"] = "An error occurred during summarization. Please try again with a shorter text or different parameters."
            result["error"] = str(e)
        finally:
            # Complete job
            self.current_job["in_progress"] = False
            self.current_job["stage"] = "Complete"
            self.current_job["progress"] = 100

        return result

    def preprocess_text(self, text):
        """Preprocess text to improve summarization quality."""
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove common web page boilerplate text
        text = re.sub(r'Skip to (content|main).*?»', '', text)
        text = re.sub(r'Search for:.*?Search', '', text)
        text = re.sub(r'Menu.*?Resources', '', text, flags=re.DOTALL)

        # Remove comment sections (often start with phrases like "X responses to")
        text = re.sub(r'\d+ responses to.*?$', '', text, flags=re.DOTALL)

        # Remove form fields and subscription prompts
        text = re.sub(r'(Your email address will not be published|Required fields are marked).*?$', '', text, flags=re.DOTALL)

        # Focus on the first part of very long texts (likely the main content)
        if len(text) > 10000:
            text = text[:10000]

        return text
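

# A minimal usage sketch (assumption: the module is run directly as a script;
# the sample_text below is hypothetical and only illustrates the call pattern).
# Instantiating the service downloads model weights on first run, which can
# take several minutes on a slow connection.
if __name__ == "__main__":
    service = SummariserService()

    sample_text = (
        "The James Webb Space Telescope is a space telescope designed to "
        "conduct infrared astronomy. As the largest optical telescope in "
        "space, it can view objects too old, distant, or faint for the "
        "Hubble Space Telescope. "
    ) * 20  # Repeat to approximate a longer article

    result = service.summarise(sample_text, max_length=150, min_length=50)
    print(result["summary"])
    print(result["metadata"])

    # get_status() is designed to be polled (e.g. from another thread or an
    # HTTP endpoint) while summarise() is running
    print(service.get_status())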