Jordi Catafal committed · Commit 0a6cb95 · Parent(s): 8c3e1fb

Add Jina v3 and Legal-BERT models - total 4 models

Files changed:
- Dockerfile +10 -1
- README.md +110 -25
- app.py +25 -7
- models/__init__.py +1 -0
- models/schemas.py +5 -5
- requirements.txt +2 -1
- utils/__init__.py +7 -0
- utils/helpers.py +70 -12
Dockerfile
CHANGED

@@ -5,6 +5,14 @@ ENV PYTHONUNBUFFERED=1
 ENV TRANSFORMERS_CACHE=/app/cache
 ENV HF_HOME=/app/cache
 ENV PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.6,max_split_size_mb:128
+# Add this to handle the larger models
+ENV TRANSFORMERS_OFFLINE=0
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+
+# Install system dependencies for better performance
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
 
 # Create non-root user
 RUN useradd -m -u 1000 user
@@ -18,7 +26,8 @@ WORKDIR /app
 # Copy requirements and install dependencies
 COPY --chown=user requirements.txt .
 RUN pip install --no-cache-dir --upgrade pip && \
-    pip install --no-cache-dir -r requirements.txt
+    pip install --no-cache-dir -r requirements.txt && \
+    pip install --no-cache-dir hf_transfer
 
 # Copy application code
 COPY --chown=user . .
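The new ENV lines enable hf_transfer-accelerated downloads into the cache directory that TRANSFORMERS_CACHE and HF_HOME point at. As a rough illustration of what those settings are used for, here is a minimal cache-warming sketch using huggingface_hub (pinned in requirements.txt); it is not part of this commit, and the RoBERTalex repository id does not appear in this diff, so it is an assumption:

```python
# Hypothetical cache-warming helper; illustrative only, not part of this commit.
import os
from huggingface_hub import snapshot_download

os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # matches the Dockerfile ENV

MODEL_REPOS = [
    "jinaai/jina-embeddings-v2-base-es",
    "PlanTL-GOB-ES/RoBERTalex",          # assumed id; the robertalex repo is not shown in this diff
    "jinaai/jina-embeddings-v3",
    "nlpaueb/legal-bert-base-uncased",
]

for repo in MODEL_REPOS:
    # Downloads into the cache dir the Dockerfile points at (/app/cache)
    snapshot_download(repo_id=repo, cache_dir="/app/cache")
```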
README.md
CHANGED

@@ -11,9 +11,9 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 
 --------------------------------
 
-# Spanish Embeddings API
+# Spanish & Legal Embeddings API
 
-A high-performance API for generating embeddings from Spanish text using state-of-the-art models. This API provides access to
+A high-performance API for generating embeddings from Spanish, English, and multilingual text using state-of-the-art models. This API provides access to four specialized models optimized for different use cases and languages.
 
 ## 🚀 Quick Start
 
@@ -26,7 +26,9 @@ A high-performance API for generating embeddings from Spanish text using state-o
 | Model | Max Tokens | Languages | Dimensions | Best Use Case |
 |-------|------------|-----------|------------|---------------|
 | **jina** | 8,192 | Spanish, English | 768 | General purpose, long documents, cross-lingual tasks |
-| **robertalex** | 512 | Spanish | 768 |
+| **robertalex** | 512 | Spanish | 768 | Spanish legal documents, formal Spanish |
+| **jina-v3** | 8,192 | Multilingual (30+ languages) | 1,024 | Superior multilingual embeddings, long context |
+| **legal-bert** | 512 | English | 768 | English legal documents, contracts, law texts |
 
 ## 🔗 API Endpoints
 
@@ -64,7 +66,7 @@ import numpy as np
 
 API_URL = "https://aurasystems-spanish-embeddings-api.hf.space"
 
-# Example 1: Basic usage
+# Example 1: Basic usage with Jina v2 Spanish
 response = requests.post(
     f"{API_URL}/embed",
     json={
@@ -78,13 +80,24 @@ result = response.json()
 embeddings = result["embeddings"]
 print(f"Generated {len(embeddings)} embeddings of {result['dimensions']} dimensions")
 
-# Example 2: Using
-
-
-
+# Example 2: Using Jina v3 for multilingual texts
+multilingual_response = requests.post(
+    f"{API_URL}/embed",
+    json={
+        "texts": [
+            "Hello world",        # English
+            "Hola mundo",         # Spanish
+            "Bonjour le monde",   # French
+            "Hallo Welt"          # German
+        ],
+        "model": "jina-v3",
+        "normalize": True
+    }
+)
+print(f"Jina v3 dimensions: {multilingual_response.json()['dimensions']}")  # 1024 dims
 
-# Example 3: Legal text with RoBERTalex
-
+# Example 3: Legal text with RoBERTalex (Spanish)
+spanish_legal_response = requests.post(
     f"{API_URL}/embed",
     json={
         "texts": [
@@ -95,12 +108,38 @@ legal_response = requests.post(
         "normalize": True
     }
 )
+
+# Example 4: Legal text with Legal-BERT (English)
+english_legal_response = requests.post(
+    f"{API_URL}/embed",
+    json={
+        "texts": [
+            "The contract shall be valid from the date of signature",
+            "This agreement is governed by the laws of the state"
+        ],
+        "model": "legal-bert",
+        "normalize": True
+    }
+)
+
+# Example 5: Compare similarity across models
+text = "artificial intelligence and law"
+models_comparison = {}
+
+for model in ["jina", "jina-v3", "legal-bert"]:
+    resp = requests.post(
+        f"{API_URL}/embed",
+        json={"texts": [text], "model": model, "normalize": True}
+    )
+    models_comparison[model] = resp.json()["dimensions"]
+
+print("Embedding dimensions by model:", models_comparison)
 ```
 
 ### cURL
 
 ```bash
-# Basic embedding generation
+# Basic embedding generation with Jina v2 Spanish
 curl -X POST "https://aurasystems-spanish-embeddings-api.hf.space/embed" \
   -H "Content-Type: application/json" \
   -d '{
@@ -109,17 +148,35 @@ curl -X POST "https://aurasystems-spanish-embeddings-api.hf.space/embed" \
     "normalize": true
   }'
 
-#
+# Using Jina v3 for multilingual embeddings
 curl -X POST "https://aurasystems-spanish-embeddings-api.hf.space/embed" \
   -H "Content-Type: application/json" \
   -d '{
-    "texts": ["
-    "model": "jina",
+    "texts": ["Hello world", "Hola mundo", "Bonjour le monde"],
+    "model": "jina-v3",
+    "normalize": true
+  }'
+
+# English legal text with Legal-BERT
+curl -X POST "https://aurasystems-spanish-embeddings-api.hf.space/embed" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "texts": ["This agreement is legally binding"],
+    "model": "legal-bert",
+    "normalize": true
+  }'
+
+# Spanish legal text with RoBERTalex
+curl -X POST "https://aurasystems-spanish-embeddings-api.hf.space/embed" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "texts": ["Artículo primero de la constitución"],
+    "model": "robertalex",
     "normalize": true,
-    "max_length":
+    "max_length": 512
   }'
 
-# Get model information
+# Get all model information
 curl "https://aurasystems-spanish-embeddings-api.hf.space/models"
 ```
 
@@ -169,10 +226,16 @@ from langchain.embeddings.base import Embeddings
 from typing import List
 import requests
 
-class
-    """Custom LangChain embeddings class for
+class MultilingualEmbeddings(Embeddings):
+    """Custom LangChain embeddings class for multilingual text"""
 
-    def __init__(self, model: str = "jina"):
+    def __init__(self, model: str = "jina-v3"):
+        """
+        Initialize embeddings
+
+        Args:
+            model: One of "jina", "robertalex", "jina-v3", "legal-bert"
+        """
         self.api_url = "https://aurasystems-spanish-embeddings-api.hf.space/embed"
         self.model = model
 
@@ -191,13 +254,35 @@ class SpanishEmbeddings(Embeddings):
     def embed_query(self, text: str) -> List[float]:
         return self.embed_documents([text])[0]
 
-# Usage with
-
-
-
-    "
+# Usage examples with different models
+# Spanish embeddings
+spanish_embeddings = MultilingualEmbeddings(model="jina")
+spanish_docs = spanish_embeddings.embed_documents([
+    "Primer documento en español",
+    "Segundo documento en español"
+])
+
+# Multilingual embeddings with Jina v3
+multilingual_embeddings = MultilingualEmbeddings(model="jina-v3")
+mixed_docs = multilingual_embeddings.embed_documents([
+    "English document",
+    "Documento en español",
+    "Document en français"
+])
+
+# Legal embeddings for English
+legal_embeddings = MultilingualEmbeddings(model="legal-bert")
+legal_docs = legal_embeddings.embed_documents([
+    "This contract is governed by English law",
+    "The party shall indemnify and hold harmless"
+])
+
+# Spanish legal embeddings
+spanish_legal_embeddings = MultilingualEmbeddings(model="robertalex")
+spanish_legal_docs = spanish_legal_embeddings.embed_documents([
+    "Artículo 1: De los derechos fundamentales",
+    "La presente ley entrará en vigor"
 ])
-query_embedding = embeddings.embed_query("consulta de búsqueda")
 ```
 
 ## 📋 Request/Response Formats
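Example 5 in the updated README only compares embedding dimensions; since jina, jina-v3, and legal-bert produce vectors of different sizes, similarity is only meaningful within one model. A small follow-up sketch (illustrative, not part of the README) that computes cosine similarity from a single model's /embed response:

```python
# Illustrative only: cosine similarity between two texts embedded by one model.
import numpy as np
import requests

API_URL = "https://aurasystems-spanish-embeddings-api.hf.space"

resp = requests.post(
    f"{API_URL}/embed",
    json={
        "texts": ["contrato de arrendamiento", "acuerdo de alquiler"],
        "model": "jina",
        "normalize": True,
    },
)
a, b = (np.array(v) for v in resp.json()["embeddings"])

# With normalize=True the vectors are unit length, so the dot product is the cosine.
print(f"cosine similarity: {float(np.dot(a, b)):.3f}")
```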
app.py
CHANGED

@@ -9,9 +9,9 @@ from models.schemas import EmbeddingRequest, EmbeddingResponse, ModelInfo
 from utils.helpers import load_models, get_embeddings, cleanup_memory
 
 app = FastAPI(
-    title="Spanish Embedding API",
-    description="
-    version="
+    title="Spanish & Legal Embedding API",
+    description="Multi-model embedding API for Spanish and Legal texts",
+    version="2.0.0"
 )
 
 # Global model cache
@@ -22,13 +22,13 @@ async def startup_event():
     """Load models on startup"""
     global models_cache
     models_cache = load_models()
-    print("
+    print("All models loaded successfully!")
 
 @app.get("/")
 async def root():
     return {
-        "message": "Spanish Embedding API",
-        "models": ["jina", "robertalex"],
+        "message": "Spanish & Legal Embedding API",
+        "models": ["jina", "robertalex", "jina-v3", "legal-bert"],
         "status": "running",
         "docs": "/docs"
     }
@@ -88,6 +88,24 @@ async def list_models():
             languages=["Spanish"],
             model_type="legal domain",
             description="Spanish legal domain specialized embeddings"
+        ),
+        ModelInfo(
+            model_id="jina-v3",
+            name="jinaai/jina-embeddings-v3",
+            dimensions=1024,
+            max_sequence_length=8192,
+            languages=["Multilingual"],
+            model_type="multilingual",
+            description="Latest Jina v3 with superior multilingual performance"
+        ),
+        ModelInfo(
+            model_id="legal-bert",
+            name="nlpaueb/legal-bert-base-uncased",
+            dimensions=768,
+            max_sequence_length=512,
+            languages=["English"],
+            model_type="legal domain",
+            description="English legal domain BERT model"
         )
     ]
 
@@ -96,7 +114,7 @@ async def health_check():
     """Health check endpoint"""
     return {
         "status": "healthy",
-        "models_loaded": len(models_cache) ==
+        "models_loaded": len(models_cache) == 4,
         "available_models": list(models_cache.keys())
     }
 
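With four ModelInfo entries returned by /models and the health check now expecting len(models_cache) == 4, a quick smoke test can confirm the new models are registered. A minimal sketch, assuming /models returns the ModelInfo list directly as built above:

```python
# Illustrative smoke test; field names follow the ModelInfo construction in this diff.
import requests

BASE = "https://aurasystems-spanish-embeddings-api.hf.space"

health = requests.get(f"{BASE}/health").json()
print(health["models_loaded"])          # True only once all four models are cached
print(health["available_models"])       # e.g. ['jina', 'robertalex', 'jina-v3', 'legal-bert']

models = requests.get(f"{BASE}/models").json()
print([m["model_id"] for m in models])  # ['jina', 'robertalex', 'jina-v3', 'legal-bert']
```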
models/__init__.py
CHANGED

@@ -1,3 +1,4 @@
+
 # models/__init__.py
 """Models package for embedding API schemas and configurations"""
 
models/schemas.py
CHANGED

@@ -11,7 +11,7 @@ class EmbeddingRequest(BaseModel):
         description="List of texts to embed",
         example=["Hola mundo", "¿Cómo estás?"]
     )
-    model: Literal["jina", "robertalex"] = Field(
+    model: Literal["jina", "robertalex", "jina-v3", "legal-bert"] = Field(
         default="jina",
         description="Model to use for embeddings"
     )
@@ -39,10 +39,10 @@ class EmbeddingRequest(BaseModel):
     def validate_max_length(cls, v, values):
         if v is not None:
             model = values.get('model', 'jina')
-            if model
-                raise ValueError("Max length for
-            elif model
-                raise ValueError("Max length for
+            if model in ['jina', 'jina-v3'] and v > 8192:
+                raise ValueError(f"Max length for {model} model is 8192")
+            elif model in ['robertalex', 'legal-bert'] and v > 512:
+                raise ValueError(f"Max length for {model} model is 512")
             if v < 1:
                 raise ValueError("Max length must be positive")
         return v
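The widened Literal and the updated validator mean a request can only name one of the four models, and max_length is capped per model family. A minimal sketch of the validator's behaviour, assuming models.schemas is importable as defined in this diff:

```python
# Illustrative only; exercises the validation rules added in this diff.
from pydantic import ValidationError
from models.schemas import EmbeddingRequest

# Accepted: jina-v3 allows sequences up to 8192 tokens.
ok = EmbeddingRequest(texts=["Hola mundo"], model="jina-v3", max_length=8192)

# Rejected: legal-bert is capped at 512 tokens, so this raises.
try:
    EmbeddingRequest(texts=["Hola mundo"], model="legal-bert", max_length=1024)
except ValidationError as exc:
    print(exc)  # includes "Max length for legal-bert model is 512"
```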
requirements.txt
CHANGED

@@ -7,4 +7,5 @@ numpy<2.0.0
 scikit-learn==1.3.2
 pydantic==2.5.0
 huggingface-hub==0.19.4
-python-multipart==0.0.6
+python-multipart==0.0.6
+protobuf>=3.20.0
utils/__init__.py
ADDED

@@ -0,0 +1,7 @@
+
+# utils/__init__.py
+"""Utils package for helper functions"""
+
+from .helpers import load_models, get_embeddings, cleanup_memory, validate_input_texts, get_model_info
+
+__all__ = ['load_models', 'get_embeddings', 'cleanup_memory', 'validate_input_texts', 'get_model_info']
utils/helpers.py
CHANGED

@@ -3,14 +3,18 @@
 
 import torch
 import torch.nn.functional as F
-from transformers import
+from transformers import (
+    AutoTokenizer, AutoModel,
+    RobertaTokenizer, RobertaModel,
+    BertTokenizer, BertModel
+)
 from typing import List, Dict, Optional
 import gc
 import os
 
 def load_models() -> Dict:
     """
-    Load
+    Load all embedding models with memory optimization
 
     Returns:
         Dict containing loaded models and tokenizers
@@ -21,8 +25,8 @@ def load_models() -> Dict:
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
     try:
-        # Load Jina model
-        print("Loading Jina embeddings model...")
+        # Load Jina v2 Spanish model
+        print("Loading Jina embeddings v2 Spanish model...")
         jina_tokenizer = AutoTokenizer.from_pretrained(
             'jinaai/jina-embeddings-v2-base-es',
             trust_remote_code=True
@@ -43,16 +47,52 @@ def load_models() -> Dict:
         ).to(device)
         robertalex_model.eval()
 
+        # Load Jina v3 model
+        print("Loading Jina embeddings v3 model...")
+        jina_v3_tokenizer = AutoTokenizer.from_pretrained(
+            'jinaai/jina-embeddings-v3',
+            trust_remote_code=True
+        )
+        jina_v3_model = AutoModel.from_pretrained(
+            'jinaai/jina-embeddings-v3',
+            trust_remote_code=True,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+        ).to(device)
+        jina_v3_model.eval()
+
+        # Load Legal BERT model
+        print("Loading Legal BERT model...")
+        legal_bert_tokenizer = BertTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')
+        legal_bert_model = BertModel.from_pretrained(
+            'nlpaueb/legal-bert-base-uncased',
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+        ).to(device)
+        legal_bert_model.eval()
+
         models_cache = {
             'jina': {
                 'tokenizer': jina_tokenizer,
                 'model': jina_model,
-                'device': device
+                'device': device,
+                'pooling': 'mean'
             },
             'robertalex': {
                 'tokenizer': robertalex_tokenizer,
                 'model': robertalex_model,
-                'device': device
+                'device': device,
+                'pooling': 'cls'
+            },
+            'jina-v3': {
+                'tokenizer': jina_v3_tokenizer,
+                'model': jina_v3_model,
+                'device': device,
+                'pooling': 'mean'
+            },
+            'legal-bert': {
+                'tokenizer': legal_bert_tokenizer,
+                'model': legal_bert_model,
+                'device': device,
+                'pooling': 'cls'
             }
         }
 
@@ -92,7 +132,7 @@ def get_embeddings(
 
     Args:
         texts: List of texts to embed
-        model_name: Name of model to use
+        model_name: Name of model to use
        models_cache: Dictionary containing loaded models
         normalize: Whether to normalize embeddings
         max_length: Maximum sequence length
@@ -101,15 +141,19 @@ def get_embeddings(
         List of embedding vectors
     """
     if model_name not in models_cache:
-        raise ValueError(f"Model {model_name} not available. Choose
+        raise ValueError(f"Model {model_name} not available. Choose from: {list(models_cache.keys())}")
 
     tokenizer = models_cache[model_name]['tokenizer']
     model = models_cache[model_name]['model']
     device = models_cache[model_name]['device']
+    pooling_strategy = models_cache[model_name]['pooling']
 
     # Set max length based on model capabilities
    if max_length is None:
-
+        if model_name in ['jina', 'jina-v3']:
+            max_length = 8192
+        else:  # robertalex, legal-bert
+            max_length = 512
 
     # Process in batches for memory efficiency
     batch_size = 8 if len(texts) > 8 else len(texts)
@@ -131,11 +175,11 @@ def get_embeddings(
         with torch.no_grad():
             model_output = model(**encoded_input)
 
-            if
-                #
+            if pooling_strategy == 'mean':
+                # Mean pooling for Jina models
                 embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
             else:
-                #
+                # CLS token for BERT-based models
                 embeddings = model_output.last_hidden_state[:, 0, :]
 
             # Normalize if requested
@@ -201,6 +245,20 @@ def get_model_info(model_name: str) -> Dict:
             'max_length': 512,
             'pooling': 'cls',
             'languages': ['Spanish']
+        },
+        'jina-v3': {
+            'full_name': 'jinaai/jina-embeddings-v3',
+            'dimensions': 1024,
+            'max_length': 8192,
+            'pooling': 'mean',
+            'languages': ['Multilingual']
+        },
+        'legal-bert': {
+            'full_name': 'nlpaueb/legal-bert-base-uncased',
+            'dimensions': 768,
+            'max_length': 512,
+            'pooling': 'cls',
+            'languages': ['English']
        }
    }
 
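The pooling branch above calls mean_pooling(model_output, attention_mask), whose definition sits outside the diff hunks. For reference, a standard attention-mask-weighted mean pooling looks like the following sketch (illustrative, not necessarily the exact helper in utils/helpers.py):

```python
# Illustrative mean pooling consistent with the call site shown above.
import torch

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state            # (batch, seq_len, hidden)
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, dim=1)           # mask-weighted sum over tokens
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)              # avoid division by zero on empty masks
    return summed / counts
```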