File size: 2,872 Bytes

from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer, Pipeline
import torch.nn.functional as F
import torch
from typing import Any, Dict, List

# copied from the model card
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings (assumed (batch, seq, dim) -- the code reduces
            over axis 1).
        attention_mask: Mask tensor that, after a trailing axis is added, can
            be expanded to the shape of the token embeddings; zero entries
            are excluded from the average.

    Returns:
        Tensor of the mask-weighted mean of the token embeddings along axis 1.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension so it can weight each token.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * weights).sum(1)
    # Clamp the token count so a fully masked row does not divide by zero.
    token_counts = weights.sum(1).clamp(min=1e-9)
    return weighted_total / token_counts


class SentenceEmbeddingPipeline(Pipeline):
    """Pipeline producing mean-pooled, L2-normalized sentence embeddings."""

    def _sanitize_parameters(self, **kwargs):
        """Return the (preprocess, forward, postprocess) keyword arguments.

        No stage takes extra parameters, so any caller-supplied keyword
        arguments are accepted but not forwarded.
        """
        return {}, {}, {}

    def preprocess(self, inputs):
        """Tokenize ``inputs`` with padding and truncation enabled, as PyTorch tensors."""
        return self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')

    def _forward(self, model_inputs):
        """Run the model and keep the attention mask alongside its output.

        The mask is carried forward because pooling in ``postprocess`` needs it.
        """
        model_result = self.model(**model_inputs)
        return {
            "outputs": model_result,
            "attention_mask": model_inputs["attention_mask"],
        }

    def postprocess(self, model_outputs):
        """Mean-pool the token embeddings, then L2-normalize along dim 1."""
        pooled = mean_pooling(model_outputs["outputs"], model_outputs['attention_mask'])
        return F.normalize(pooled, p=2, dim=1)


class EndpointHandler:
    """Request handler that serves sentence embeddings from an optimized ONNX model."""

    def __init__(self, path="./"):
        """Load the model and tokenizer from ``path`` and build the pipeline.

        Args:
            path: Directory holding ``model_optimized.onnx`` and the tokenizer files.
        """
        # load the optimized model
        model = ORTModelForFeatureExtraction.from_pretrained(
            path,
            file_name="model_optimized.onnx",
            # provider="CPUExecutionProvider",
        )
        tokenizer = AutoTokenizer.from_pretrained(path)
        # create inference pipeline
        self.pl = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)

    def __call__(self, data: Any) -> Dict[str, List[List[float]]]:
        """Embed the text found in a request payload.

        Args:
            data: Request payload (a dict). The value under ``"inputs"`` is
                embedded; when that key is absent the payload itself is passed
                to the pipeline. An optional ``"parameters"`` dict is forwarded
                to the pipeline as keyword arguments. Both keys are popped
                from ``data``.

        Returns:
            ``{"embeddings": rows}`` where ``rows`` is a list with one
            embedding (a list of floats) per row of the pipeline output.
        """
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)

        # pass inputs with all kwargs in data
        pipeline_kwargs = {} if parameters is None else parameters
        prediction = self.pl(inputs, **pipeline_kwargs)

        # A pipeline called with a list of inputs may return a list of
        # per-item tensors rather than a single tensor; a list has no
        # ``.cpu()``, so flatten it into one row per sentence.
        if isinstance(prediction, (list, tuple)):
            rows = [row for item in prediction for row in item.cpu().tolist()]
        else:
            rows = prediction.cpu().tolist()
        return {"embeddings": rows}