Dan Walsh committed
Commit · 9cf5fee
Parent(s): 61f18ef

Initial deployment of AI Content Summariser API
- Dockerfile +16 -0
- README.md +14 -10
- __pycache__/main.cpython-311.pyc +0 -0
- app/api/__pycache__/routes.cpython-311.pyc +0 -0
- app/api/routes.py +81 -0
- app/check_transformers.py +22 -0
- app/services/__pycache__/summariser.cpython-311.pyc +0 -0
- app/services/__pycache__/summarizer.cpython-311.pyc +0 -0
- app/services/__pycache__/url_extractor.cpython-311.pyc +0 -0
- app/services/summariser.py +78 -0
- app/services/url_extractor.py +43 -0
- main.py +35 -0
- requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
FROM python:3.9-slim

WORKDIR /app

# Copy requirements first for better caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . .

# Expose the port
EXPOSE 7860

# Command to run the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,14 @@
# AI Content Summariser API

This is the backend API for the AI Content Summariser, a tool that automatically generates concise summaries of articles, documents, and web content using natural language processing.

## API Endpoints

- POST /api/summarise - Summarise text content
- POST /api/summarise-url - Extract and summarise content from a URL

## Features

- Text summarisation using state-of-the-art NLP models
- URL content extraction and summarisation
- Adjustable parameters for summary length and style
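
For reference, a minimal client call against the summarise endpoint might look like the sketch below (hypothetical usage, not part of this commit; it assumes the API is running locally on port 8000, as main.py's __main__ block starts it):

# Hypothetical client call; assumes the server is running at localhost:8000.
import httpx

response = httpx.post(
    "http://localhost:8000/api/summarise",
    json={
        "text": "Paste a long article here; the API requires at least 10 characters.",
        "max_length": 150,
        "min_length": 50,
    },
    timeout=120.0,  # generous: the first request waits for the model to load
)
print(response.json())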
__pycache__/main.cpython-311.pyc ADDED
Binary file (1.75 kB)

app/api/__pycache__/routes.cpython-311.pyc ADDED
Binary file (5.34 kB)
app/api/routes.py ADDED
@@ -0,0 +1,81 @@
from fastapi import APIRouter, HTTPException, Depends
from pydantic import BaseModel, Field, HttpUrl
from typing import Optional, Union
from app.services.summariser import SummariserService
from app.services.url_extractor import URLExtractorService

router = APIRouter()

class TextSummaryRequest(BaseModel):
    text: str = Field(..., min_length=10, description="The text to summarise")
    max_length: Optional[int] = Field(150, ge=30, le=500, description="Maximum length of the summary")
    min_length: Optional[int] = Field(50, ge=10, le=200, description="Minimum length of the summary")
    do_sample: Optional[bool] = Field(False, description="Whether to use sampling for generation")
    temperature: Optional[float] = Field(1.0, ge=0.7, le=2.0, description="Sampling temperature")

class URLSummaryRequest(BaseModel):
    url: HttpUrl = Field(..., description="The URL to extract content from and summarise")
    max_length: Optional[int] = Field(150, ge=30, le=500, description="Maximum length of the summary")
    min_length: Optional[int] = Field(50, ge=10, le=200, description="Minimum length of the summary")
    do_sample: Optional[bool] = Field(False, description="Whether to use sampling for generation")
    temperature: Optional[float] = Field(1.0, ge=0.7, le=2.0, description="Sampling temperature")

class SummaryResponse(BaseModel):
    original_text_length: int
    summary: str
    summary_length: int
    source_type: str = "text"  # "text" or "url"
    source_url: Optional[str] = None

@router.post("/summarise", response_model=SummaryResponse)
async def summarise_text(request: TextSummaryRequest):
    try:
        summariser = SummariserService()
        summary = summariser.summarise(
            text=request.text,
            max_length=request.max_length,
            min_length=request.min_length,
            do_sample=request.do_sample,
            temperature=request.temperature
        )

        return {
            "original_text_length": len(request.text),
            "summary": summary,
            "summary_length": len(summary),
            "source_type": "text"
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@router.post("/summarise-url", response_model=SummaryResponse)
async def summarise_url(request: URLSummaryRequest):
    try:
        # Extract content from URL
        url_extractor = URLExtractorService()
        content = await url_extractor.extract_content(str(request.url))

        if not content or len(content) < 100:
            raise HTTPException(status_code=422, detail="Could not extract sufficient content from the URL")

        # Summarise the extracted content
        summariser = SummariserService()
        summary = summariser.summarise(
            text=content,
            max_length=request.max_length,
            min_length=request.min_length,
            do_sample=request.do_sample,
            temperature=request.temperature
        )

        return {
            "original_text_length": len(content),
            "summary": summary,
            "summary_length": len(summary),
            "source_type": "url",
            "source_url": str(request.url)
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
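
For illustration, here is what a request body for /api/summarise-url and the resulting SummaryResponse could look like (values are hypothetical, shown as Python literals):

# Hypothetical request/response shapes for the /api/summarise-url route above.
request_body = {
    "url": "https://example.com/article",
    "max_length": 200,
    "min_length": 50,
    "do_sample": False,
    "temperature": 1.0,
}
expected_response = {
    "original_text_length": 4210,  # characters of extracted page text
    "summary": "A short machine-generated summary of the page.",
    "summary_length": 46,          # characters, as computed by len(summary)
    "source_type": "url",
    "source_url": "https://example.com/article",
}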
app/check_transformers.py ADDED
@@ -0,0 +1,22 @@
# Simple script to check if transformers is installed correctly
try:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    print("Transformers library is installed correctly!")

    # Try loading a small model to verify functionality
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
    print("Model loaded successfully!")

    # Test a simple summarization
    text = "This is a test sentence to check if the summarisation model works correctly. It should be able to process this text and generate a summary."
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs["input_ids"], max_length=50, min_length=10, num_beams=4)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print(f"Test summary: {summary}")

except ImportError as e:
    print(f"Error importing transformers: {e}")
    print("Please try reinstalling with: pip install transformers torch")
except Exception as e:
    print(f"Error during model loading or inference: {e}")
app/services/__pycache__/summariser.cpython-311.pyc ADDED
Binary file (3.52 kB)

app/services/__pycache__/summarizer.cpython-311.pyc ADDED
Binary file (1.77 kB)

app/services/__pycache__/url_extractor.cpython-311.pyc ADDED
Binary file (3.23 kB)
app/services/summariser.py ADDED
@@ -0,0 +1,78 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

class SummariserService:
    def __init__(self):
        # Initialize with a smaller model for faster loading
        model_name = "facebook/bart-large-cnn"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        # Move to GPU if available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def summarise(self, text, max_length=150, min_length=50, do_sample=False, temperature=1.0):
        """
        Summarise the given text using the loaded model.

        Args:
            text (str): The text to summarise
            max_length (int): Maximum length of the summary
            min_length (int): Minimum length of the summary
            do_sample (bool): Whether to use sampling for generation
            temperature (float): Sampling temperature (higher = more random)

        Returns:
            str: The generated summary
        """
        # Ensure text is within model's max token limit
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
        inputs = inputs.to(self.device)

        # Set generation parameters
        generation_params = {
            "max_length": max_length,
            "min_length": min_length,
            "num_beams": 4,
            "length_penalty": 2.0,
            "early_stopping": True,
        }

        # Handle sampling and temperature
        if do_sample:
            try:
                # First attempt: try with the requested temperature
                generation_params["do_sample"] = True
                generation_params["temperature"] = temperature
                summary_ids = self.model.generate(
                    inputs["input_ids"],
                    **generation_params
                )
            except Exception as e:
                # If that fails, try with default temperature (1.0)
                print(f"Error with temperature {temperature}, falling back to default: {str(e)}")
                generation_params["temperature"] = 1.0
                try:
                    summary_ids = self.model.generate(
                        inputs["input_ids"],
                        **generation_params
                    )
                except Exception:
                    # If sampling still fails, fall back to beam search without sampling
                    print("Sampling failed, falling back to beam search")
                    generation_params.pop("do_sample", None)
                    generation_params.pop("temperature", None)
                    summary_ids = self.model.generate(
                        inputs["input_ids"],
                        **generation_params
                    )
        else:
            # Standard beam search without sampling
            summary_ids = self.model.generate(
                inputs["input_ids"],
                **generation_params
            )

        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary
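
Outside the API, the service can be exercised directly; a minimal sketch (the first instantiation downloads facebook/bart-large-cnn, which is on the order of 1.6 GB):

# Minimal direct-usage sketch of SummariserService, outside FastAPI.
from app.services.summariser import SummariserService

summariser = SummariserService()  # loads facebook/bart-large-cnn on construction
summary = summariser.summarise(
    "Replace this with a passage of at least a few sentences to condense.",
    max_length=100,
    min_length=30,
)
print(summary)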
app/services/url_extractor.py ADDED
@@ -0,0 +1,43 @@
import httpx
from bs4 import BeautifulSoup
import re

class URLExtractorService:
    def __init__(self):
        self.client = httpx.AsyncClient(timeout=30.0)

    async def extract_content(self, url):
        """
        Extract the main content from a URL.

        Args:
            url (str): The URL to extract content from

        Returns:
            str: The extracted text content
        """
        try:
            response = await self.client.get(url)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove script and style elements
            for script in soup(["script", "style", "header", "footer", "nav"]):
                script.extract()

            # Get text and clean it
            text = soup.get_text()

            # Break into lines and remove leading/trailing space
            lines = (line.strip() for line in text.splitlines())
            # Break multi-headlines into a line each (split on double spaces)
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            # Remove blank lines
            text = '\n'.join(chunk for chunk in chunks if chunk)

            return text
        except Exception as e:
            raise Exception(f"Failed to extract content from URL: {str(e)}")
        finally:
            # The client is closed after a single extraction, so each request
            # needs a fresh URLExtractorService instance (as routes.py creates)
            await self.client.aclose()
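
Since extract_content is a coroutine and the client is closed in the finally block, a standalone test needs an event loop and a fresh service per call; a sketch:

# Standalone sketch: one URLExtractorService per extraction (the client
# is closed after a single call), driven by asyncio.
import asyncio
from app.services.url_extractor import URLExtractorService

async def main():
    extractor = URLExtractorService()
    text = await extractor.extract_content("https://example.com")
    print(text[:500])  # preview the cleaned text

asyncio.run(main())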
main.py ADDED
@@ -0,0 +1,35 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import os

app = FastAPI(
    title="AI Content Summariser API",
    description="API for summarising text content using NLP models",
    version="0.1.0"
)

# Configure CORS
origins = os.getenv("CORS_ORIGINS", "http://localhost:3000").split(",")
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/")
async def root():
    return {"message": "Welcome to the AI Content Summariser API"}

@app.get("/health")
async def health_check():
    return {"status": "healthy"}

# Import and include API routes
from app.api.routes import router as api_router
app.include_router(api_router, prefix="/api")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
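
The two lightweight endpoints can be smoke-tested without loading the model, for example with FastAPI's TestClient (a sketch; importing main pulls in transformers and torch, but no model is constructed until a summarise request arrives):

# Hypothetical smoke test for the model-free endpoints.
from fastapi.testclient import TestClient
from main import app

client = TestClient(app)
assert client.get("/health").json() == {"status": "healthy"}
assert "Welcome" in client.get("/").json()["message"]
print("Smoke test passed")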
requirements.txt ADDED
@@ -0,0 +1,10 @@
fastapi==0.100.1
uvicorn==0.23.2
pydantic==2.1.1
transformers==4.31.0
torch==2.0.1
sentencepiece==0.1.99
python-dotenv==1.0.0
httpx==0.24.1
accelerate==0.21.0
beautifulsoup4==4.12.2