File size: 1,440 Bytes
2fafc94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646f8c2
2fafc94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

import torch
from langchain.embeddings import HuggingFaceEmbeddings


def get_hf_embeddings(model_name=None):
    """Build a HuggingFace embedding wrapper.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.
            When ``None``, ``"BAAI/bge-large-en-v1.5"`` is used.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    # Author's notes on other candidates:
    #   - "BAAI/bge-m3": good, though large and slow
    #   - "BAAI/bge-base-en-v1.5": seems not that good with current settings
    #   - "sentence-transformers/all-mpnet-base-v2",
    #     "maidalun1020/bce-embedding-base_v1",
    #     "intfloat/multilingual-e5-large"
    # Ref: https://huggingface.co/spaces/mteb/leaderboard,
    #      https://huggingface.co/maidalun1020/bce-embedding-base_v1
    chosen_model = "BAAI/bge-large-en-v1.5" if model_name is None else model_name
    return HuggingFaceEmbeddings(model_name=chosen_model)


def get_jinaai_embeddings(model_name="jinaai/jina-embeddings-v2-base-en", device="auto"):
    """Build a HuggingFace embedding wrapper for a Jina AI model.

    Args:
        model_name: Model identifier to load.
        device: Device string placed in ``model_kwargs`` (e.g. ``"cpu"`` or
            ``"cuda"``). ``"auto"`` selects ``"cuda"`` when
            ``torch.cuda.is_available()`` is true, otherwise ``"cpu"``.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    if device == "auto":
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Jina models ship custom modelling code, so they are first loaded once
    # with trust_remote_code=True.
    # Ref: https://github.com/langchain-ai/langchain/issues/6080
    # The result is intentionally not bound to a name: keeping it alive would
    # hold a second copy of the model in memory while the embeddings wrapper
    # loads its own.
    # TODO: the original author noted this path "will yield error, need bug
    # fixing" -- the root cause is still to be confirmed.
    from transformers import AutoModel
    AutoModel.from_pretrained(model_name, trust_remote_code=True)

    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device, "trust_remote_code": True},
    )