Dan Walsh committed on
Commit 9cf5fee · 1 Parent(s): 61f18ef

Initial deployment of AI Content Summariser API

Dockerfile ADDED
@@ -0,0 +1,16 @@
+ FROM python:3.9-slim
+
+ WORKDIR /app
+
+ # Copy requirements first for better caching
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the rest of the application
+ COPY . .
+
+ # Expose the port
+ EXPOSE 7860
+
+ # Command to run the application
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
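A quick local smoke test of this image, assuming Docker is installed and the tag name is arbitrary: build with docker build -t summariser-api . and run with docker run -p 7860:7860 summariser-api, after which the API is served on the exposed port 7860.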
README.md CHANGED
@@ -1,10 +1,14 @@
- ---
- title: Ai Content Summariser Api
- emoji: 🚀
- colorFrom: red
- colorTo: pink
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # AI Content Summariser API
+
+ This is the backend API for the AI Content Summariser, a tool that automatically generates concise summaries of articles, documents, and web content using natural language processing.
+
+ ## API Endpoints
+
+ - POST /api/summarise - Summarise text content
+ - POST /api/summarise-url - Extract and summarise content from a URL
+
+ ## Features
+
+ - Text summarisation using state-of-the-art NLP models
+ - URL content extraction and summarisation
+ - Adjustable parameters for summary length and style
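As a usage sketch for the endpoints listed above (not part of the commit), the summarise endpoint can be exercised with a small httpx client; the base URL is an assumption for a local run on the port the Dockerfile exposes, and the sample text is placeholder content.

import httpx

BASE_URL = "http://localhost:7860"  # assumed local deployment; substitute the hosted Space URL if needed

payload = {
    "text": "Paste the article or document to be summarised here. " * 5,
    "max_length": 120,
    "min_length": 40,
}

resp = httpx.post(f"{BASE_URL}/api/summarise", json=payload, timeout=120.0)
resp.raise_for_status()
print(resp.json()["summary"])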
__pycache__/main.cpython-311.pyc ADDED
Binary file (1.75 kB)
 
app/api/__pycache__/routes.cpython-311.pyc ADDED
Binary file (5.34 kB)
 
app/api/routes.py ADDED
@@ -0,0 +1,81 @@
+ from fastapi import APIRouter, HTTPException, Depends
+ from pydantic import BaseModel, Field, HttpUrl
+ from typing import Optional, Union
+ from app.services.summariser import SummariserService
+ from app.services.url_extractor import URLExtractorService
+
+ router = APIRouter()
+
+ class TextSummaryRequest(BaseModel):
+     text: str = Field(..., min_length=10, description="The text to summarise")
+     max_length: Optional[int] = Field(150, ge=30, le=500, description="Maximum length of the summary")
+     min_length: Optional[int] = Field(50, ge=10, le=200, description="Minimum length of the summary")
+     do_sample: Optional[bool] = Field(False, description="Whether to use sampling for generation")
+     temperature: Optional[float] = Field(1.0, ge=0.7, le=2.0, description="Sampling temperature")
+
+ class URLSummaryRequest(BaseModel):
+     url: HttpUrl = Field(..., description="The URL to extract content from and summarise")
+     max_length: Optional[int] = Field(150, ge=30, le=500, description="Maximum length of the summary")
+     min_length: Optional[int] = Field(50, ge=10, le=200, description="Minimum length of the summary")
+     do_sample: Optional[bool] = Field(False, description="Whether to use sampling for generation")
+     temperature: Optional[float] = Field(1.0, ge=0.7, le=2.0, description="Sampling temperature")
+
+ class SummaryResponse(BaseModel):
+     original_text_length: int
+     summary: str
+     summary_length: int
+     source_type: str = "text"  # "text" or "url"
+     source_url: Optional[str] = None
+
+ @router.post("/summarise", response_model=SummaryResponse)
+ async def summarise_text(request: TextSummaryRequest):
+     try:
+         summariser = SummariserService()
+         summary = summariser.summarise(
+             text=request.text,
+             max_length=request.max_length,
+             min_length=request.min_length,
+             do_sample=request.do_sample,
+             temperature=request.temperature
+         )
+
+         return {
+             "original_text_length": len(request.text),
+             "summary": summary,
+             "summary_length": len(summary),
+             "source_type": "text"
+         }
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @router.post("/summarise-url", response_model=SummaryResponse)
+ async def summarise_url(request: URLSummaryRequest):
+     try:
+         # Extract content from URL
+         url_extractor = URLExtractorService()
+         content = await url_extractor.extract_content(str(request.url))
+
+         if not content or len(content) < 100:
+             raise HTTPException(status_code=422, detail="Could not extract sufficient content from the URL")
+
+         # Summarise the extracted content
+         summariser = SummariserService()
+         summary = summariser.summarise(
+             text=content,
+             max_length=request.max_length,
+             min_length=request.min_length,
+             do_sample=request.do_sample,
+             temperature=request.temperature
+         )
+
+         return {
+             "original_text_length": len(content),
+             "summary": summary,
+             "summary_length": len(summary),
+             "source_type": "url",
+             "source_url": str(request.url)
+         }
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
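A minimal client sketch for the second route above; the base URL and target URL are placeholders, the page is assumed to yield at least 100 characters of extractable text, and the generous timeout allows for the first-time model download.

import httpx

resp = httpx.post(
    "http://localhost:7860/api/summarise-url",
    json={"url": "https://example.com/article", "max_length": 150, "min_length": 50},
    timeout=300.0,
)
resp.raise_for_status()
data = resp.json()  # fields follow the SummaryResponse model above
print(data["source_url"], data["original_text_length"], data["summary_length"])
print(data["summary"])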
app/check_transformers.py ADDED
@@ -0,0 +1,22 @@
+ # Simple script to check if transformers is installed correctly
+ try:
+     from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+     print("Transformers library is installed correctly!")
+
+     # Try loading the summarisation model to verify functionality
+     tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
+     model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
+     print("Model loaded successfully!")
+
+     # Test a simple summarisation
+     text = "This is a test sentence to check if the summarisation model works correctly. It should be able to process this text and generate a summary."
+     inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
+     summary_ids = model.generate(inputs["input_ids"], max_length=50, min_length=10, num_beams=4)
+     summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+     print(f"Test summary: {summary}")
+
+ except ImportError as e:
+     print(f"Error importing transformers: {e}")
+     print("Please try reinstalling with: pip install transformers torch")
+ except Exception as e:
+     print(f"Error during model loading or inference: {e}")
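This check script can be run standalone from the repository root (for example, python app/check_transformers.py); note that it downloads the full facebook/bart-large-cnn weights on first run, so it needs network access and a sizeable model cache.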
app/services/__pycache__/summariser.cpython-311.pyc ADDED
Binary file (3.52 kB)
 
app/services/__pycache__/summarizer.cpython-311.pyc ADDED
Binary file (1.77 kB)
 
app/services/__pycache__/url_extractor.cpython-311.pyc ADDED
Binary file (3.23 kB)
 
app/services/summariser.py ADDED
@@ -0,0 +1,78 @@
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ import torch
+
+ class SummariserService:
+     def __init__(self):
+         # Initialise with a pre-trained summarisation model
+         model_name = "facebook/bart-large-cnn"
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+         # Move to GPU if available
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.model.to(self.device)
+
+     def summarise(self, text, max_length=150, min_length=50, do_sample=False, temperature=1.0):
+         """
+         Summarise the given text using the loaded model.
+
+         Args:
+             text (str): The text to summarise
+             max_length (int): Maximum length of the summary
+             min_length (int): Minimum length of the summary
+             do_sample (bool): Whether to use sampling for generation
+             temperature (float): Sampling temperature (higher = more random)
+
+         Returns:
+             str: The generated summary
+         """
+         # Ensure text is within model's max token limit
+         inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
+         inputs = inputs.to(self.device)
+
+         # Set generation parameters
+         generation_params = {
+             "max_length": max_length,
+             "min_length": min_length,
+             "num_beams": 4,
+             "length_penalty": 2.0,
+             "early_stopping": True,
+         }
+
+         # Handle sampling and temperature
+         if do_sample:
+             try:
+                 # First attempt: try with the requested temperature
+                 generation_params["do_sample"] = True
+                 generation_params["temperature"] = temperature
+                 summary_ids = self.model.generate(
+                     inputs["input_ids"],
+                     **generation_params
+                 )
+             except Exception as e:
+                 # If that fails, try with default temperature (1.0)
+                 print(f"Error with temperature {temperature}, falling back to default: {str(e)}")
+                 generation_params["temperature"] = 1.0
+                 try:
+                     summary_ids = self.model.generate(
+                         inputs["input_ids"],
+                         **generation_params
+                     )
+                 except Exception:
+                     # If sampling still fails, fall back to beam search without sampling
+                     print("Sampling failed, falling back to beam search")
+                     generation_params.pop("do_sample", None)
+                     generation_params.pop("temperature", None)
+                     summary_ids = self.model.generate(
+                         inputs["input_ids"],
+                         **generation_params
+                     )
+         else:
+             # Standard beam search without sampling
+             summary_ids = self.model.generate(
+                 inputs["input_ids"],
+                 **generation_params
+             )
+
+         summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+         return summary
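A minimal usage sketch of the service above, assuming it is run from the repository root so the app package is importable; the sample text is illustrative and the first call downloads the model weights.

from app.services.summariser import SummariserService

service = SummariserService()
article = (
    "The service wraps facebook/bart-large-cnn, truncates input to 1024 tokens, "
    "and generates with beam search by default, optionally switching to sampling "
    "with a caller-supplied temperature."
)
print(service.summarise(article, max_length=60, min_length=20))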
app/services/url_extractor.py ADDED
@@ -0,0 +1,43 @@
+ import httpx
+ from bs4 import BeautifulSoup
+ import re
+
+ class URLExtractorService:
+     def __init__(self):
+         self.client = httpx.AsyncClient(timeout=30.0)
+
+     async def extract_content(self, url):
+         """
+         Extract the main content from a URL.
+
+         Args:
+             url (str): The URL to extract content from
+
+         Returns:
+             str: The extracted text content
+         """
+         try:
+             response = await self.client.get(url)
+             response.raise_for_status()
+
+             soup = BeautifulSoup(response.text, 'html.parser')
+
+             # Remove script and style elements
+             for script in soup(["script", "style", "header", "footer", "nav"]):
+                 script.extract()
+
+             # Get text and clean it
+             text = soup.get_text()
+
+             # Break into lines and remove leading/trailing space
+             lines = (line.strip() for line in text.splitlines())
+             # Break multi-headlines into a line each
+             chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+             # Remove blank lines
+             text = '\n'.join(chunk for chunk in chunks if chunk)
+
+             return text
+         except Exception as e:
+             raise Exception(f"Failed to extract content from URL: {str(e)}")
+         finally:
+             await self.client.aclose()
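A usage sketch for the extractor, with a placeholder URL; because the shared AsyncClient is closed in the finally block, a fresh service instance is created per call, which matches how the route handler uses it.

import asyncio

from app.services.url_extractor import URLExtractorService

async def demo():
    extractor = URLExtractorService()
    text = await extractor.extract_content("https://example.com")
    print(text[:300])

asyncio.run(demo())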
main.py ADDED
@@ -0,0 +1,35 @@
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+ import os
+
+ app = FastAPI(
+     title="AI Content Summariser API",
+     description="API for summarising text content using NLP models",
+     version="0.1.0"
+ )
+
+ # Configure CORS
+ origins = os.getenv("CORS_ORIGINS", "http://localhost:3000").split(",")
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=origins,
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ @app.get("/")
+ async def root():
+     return {"message": "Welcome to the AI Content Summariser API"}
+
+ @app.get("/health")
+ async def health_check():
+     return {"status": "healthy"}
+
+ # Import and include API routes
+ from app.api.routes import router as api_router
+ app.include_router(api_router, prefix="/api")
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
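For local development outside Docker, the app can be started with python main.py (which uses the reloading uvicorn call above on port 8000) or with an explicit uvicorn main:app --host 0.0.0.0 --port 7860; allowed browser origins come from the comma-separated CORS_ORIGINS environment variable, defaulting to http://localhost:3000.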
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ fastapi==0.100.1
+ uvicorn==0.23.2
+ pydantic==2.1.1
+ transformers==4.31.0
+ torch==2.0.1
+ sentencepiece==0.1.99
+ python-dotenv==1.0.0
+ httpx==0.24.1
+ accelerate==0.21.0
+ beautifulsoup4==4.12.2