---
datasets:
- DIBT/10k_prompts_ranked
- NickyNicky/10k_prompts_ranked_all_chatml_json_gemma
- NickyNicky/10k_prompts_ranked_all
model:
- NickyNicky/gemma-2b-it_oasst2_chatML_Cluster_2_V1
language:
- en
library_name: transformers
widget:
- text: |
    <bos><start_of_turn>system
    You are a prompt evaluator response format json.
    ngrams_length: "8" | cluster_length: "15".
    lista de codigos linguisticos disponibles: ["en", "en"].<end_of_turn>
    <start_of_turn>user
    ### |detect_prompt|:
    What were the main contributions of Eratosthenes to the development of mathematics in ancient Greece?<end_of_turn>
    <start_of_turn>model
license: apache-2.0
---

![image/png](https://cdn-uploads.huggingface.co/production/uploads/641b435ba5f876fe30c5ae0a/WlTKZRULBsCGcZ8L71ViI.png)

```
reference data model:

  datasets:
    link: https://huggingface.co/datasets/NickyNicky/oasst2_clusters

  model:
  - google/gemma-2b-it
    Link: https://huggingface.co/google/gemma-2b-it

  base fine tune: NickyNicky/gemma-2b-it_oasst2_chatML_Cluster_2_V1

  Epoch: 2

  future experts: test

  Eval model:
  - link: soon
```

## train/loss 0.5407

![image/png](https://cdn-uploads.huggingface.co/production/uploads/641b435ba5f876fe30c5ae0a/n3HVaz58rb-nOR0Bc64LO.png)

## install

```Python
!python -m pip install --upgrade pip
!pip install "torch>=2.1.1" -U
!pip install torchaudio==2.2.0
!pip install -q datasets trl peft bitsandbytes sentencepiece wandb
!pip install -q accelerate safetensors deepspeed
!pip install -q scipy ninja -U
!pip install -q -U transformers==4.38.0
!pip install flash-attn==2.5.5 --no-build-isolation
```

## Version

```py
import torch
torch.__version__
# OUTPUT: '2.2.0+cu121'
```

## How to use

```py
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    GenerationConfig,
    TextIteratorStreamer,
)
from transformers import StoppingCriteria, StoppingCriteriaList
import torch

# model_id = 'NickyNicky/gemma-2b-it_oasst2_chatML_Cluster2_aya_multilingual'
model_id = "NickyNicky/gemma-2b-it_oasst2_chatML_Cluster2_aya_multilingual_10k_prompts_ranked_all_json_V1"

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             device_map="auto",
                                             trust_remote_code=True,
                                             torch_dtype=torch.bfloat16,
                                             attn_implementation="flash_attention_2",
                                             # load_in_4bit=True,
                                             # low_cpu_mem_usage=True,
                                             )

max_length = 2100
print("max_length", max_length)

tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          # use_fast=False,
                                          max_length=max_length,)


class ListOfTokensStoppingCriteria(StoppingCriteria):
    """
    Stopping criterion based on a list of specific stop tokens.
    """

    def __init__(self, tokenizer, stop_tokens):
        self.tokenizer = tokenizer
        # Encode each stop token and keep its token IDs in a list
        self.stop_token_ids_list = [tokenizer.encode(stop_token, add_special_tokens=False) for stop_token in stop_tokens]

    def __call__(self, input_ids, scores, **kwargs):
        # Check whether the most recently generated tokens match any of the stop-token sequences
        for stop_token_ids in self.stop_token_ids_list:
            len_stop_tokens = len(stop_token_ids)
            if len(input_ids[0]) >= len_stop_tokens:
                if input_ids[0, -len_stop_tokens:].tolist() == stop_token_ids:
                    return True
        return False


# Use the custom stopping criterion
stop_tokens = ["<end_of_turn>"]  # list of stop tokens

# Initialize the stopping criterion with the tokenizer and the list of stop tokens
stopping_criteria = ListOfTokensStoppingCriteria(tokenizer, stop_tokens)

# Add the stopping criterion to a StoppingCriteriaList
stopping_criteria_list = StoppingCriteriaList([stopping_criteria])


prompt = """What were the main contributions of Eratosthenes to the development of mathematics in ancient Greece?"""

# EXAMPLE #1
input_text = f'''<bos><start_of_turn>system
You are a prompt evaluator response format json.
ngrams_length: "8" | cluster_length: "15".
lista de codigos linguisticos disponibles: ["en", "en"].<end_of_turn>
<start_of_turn>user
### |detect_prompt|:
{prompt}<end_of_turn>
<start_of_turn>model
'''

### OUTPUT EXAMPLE
'''
{
  "ngrams_length": "8",
  "ngrams": ["main", "contribution", "eratosthenes", "development", "mathematic", "ancient", "greece", "ancient greece"],
  "cluster_length": "15",
  "cluster": ["quantum", "magnetic", "star", "metal", "planet", "gravity", "force", "universe", "distance", "compound", "gravitational", "quantum computing", "solar", "sun", "earth"],
  "cluster_desc": ["Astrophysics", "Quantum Computing"],
  "avg_rating": "5.0",
  "kind": "synthetic"
}
'''

inputs = tokenizer.encode(input_text,
                          return_tensors="pt",
                          add_special_tokens=False).to("cuda:0")

max_new_tokens = 700
generation_config = GenerationConfig(
    max_new_tokens=max_new_tokens,
    temperature=0.32,
    # top_p=0.9,
    top_k=45,
    repetition_penalty=1.,  # 1.1
    do_sample=True,
)

outputs = model.generate(generation_config=generation_config,
                         input_ids=inputs,
                         stopping_criteria=stopping_criteria_list,)

tokenizer.decode(outputs[0], skip_special_tokens=False)  # set skip_special_tokens=True to drop special tokens
```

## code

```
https://colab.research.google.com/drive/1z26uLnTZWZ994G_dgyghNzh4hF2eEA6Z?usp=sharing
```

## generated dataset

Dataset generated with this model: NickyNicky/prompts_ranked_808.

```
https://huggingface.co/datasets/NickyNicky/prompts_ranked_808
```