# models_config.py
CANONICAL_MODELS = {
"all-MiniLM-L6-v2": {
"name": "sentence-transformers/all-MiniLM-L6-v2",
"dimension": 384,
"requires_remote_code": False,
"max_tokens": 512,
},
"gte-multilingual-base": {
"name": "Alibaba-NLP/gte-multilingual-base",
"dimension": 768,
"requires_remote_code": True,
"max_tokens": 8192,
},
"nomic-embed-text-v1.5": {
"name": "nomic-ai/nomic-embed-text-v1.5",
"dimension": 768,
"requires_remote_code": True,
"max_tokens": 8192,
"instruction_prefix_required": True,
"default_instruction_prefix": "search_document:",
"known_instruction_prefixes": [
"search_document:",
"search_query:",
"clustering:",
"classification:",
],
},
"all-mpnet-base-v2": {
"name": "sentence-transformers/all-mpnet-base-v2",
"dimension": 768,
"requires_remote_code": False,
"max_tokens": 384,
},
}
# Mapping of aliases to their canonical model names
MODEL_ALIASES = {
"all-minilm": "all-MiniLM-L6-v2",
"text-embedding-3-small": "all-MiniLM-L6-v2",
"text-embedding-3-large": "gte-multilingual-base",
"nomic-embed-text": "nomic-embed-text-v1.5",
}
# This global MODELS dictionary will be used for listing available models and validation.
# It combines canonical names and aliases for easy lookup.
MODELS = {
    **CANONICAL_MODELS,
    **{alias: CANONICAL_MODELS[canonical] for alias, canonical in MODEL_ALIASES.items()},
}
def get_model_config(requested_model_name: str) -> dict:
"""
Resolves a requested model name (which might be an alias) to its canonical
configuration. Raises ValueError if the model is not found.
"""
canonical_name = MODEL_ALIASES.get(requested_model_name, requested_model_name)
if canonical_name not in CANONICAL_MODELS:
raise ValueError(f"Model '{requested_model_name}' (canonical: '{canonical_name}') is not a recognized model.")
return CANONICAL_MODELS[canonical_name] |
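

# Minimal usage sketch (illustrative only, not part of the original module's API surface):
# it shows alias resolution via get_model_config and how the instruction-prefix fields
# could be consumed by a caller. The example texts and the prefix-joining convention
# below are assumptions for demonstration purposes.
if __name__ == "__main__":
    # An alias resolves to its canonical configuration.
    config = get_model_config("nomic-embed-text")
    print(config["name"], config["dimension"])  # nomic-ai/nomic-embed-text-v1.5 768

    # Canonical names pass through unchanged.
    print(get_model_config("all-MiniLM-L6-v2")["max_tokens"])  # 512

    # If the model declares a required instruction prefix, a caller might prepend it.
    if config.get("instruction_prefix_required"):
        text = f"{config['default_instruction_prefix']} example passage to embed"
        print(text)  # search_document: example passage to embed

    # Unknown names raise ValueError.
    try:
        get_model_config("does-not-exist")
    except ValueError as exc:
        print(exc)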