---
library_name: transformers
license: mit
datasets:
- Zhengping/UNLI
- Zhengping/UNLI-style-synthetic
language:
- en
metrics:
- pearsonr
- spearmanr
- accuracy
base_model:
- Qwen/Qwen2.5-14B-Instruct
---

# Model Card for Zhengping/conditional-probability-regression

## Model Details

### Model Description

This is a 🤗 Transformers model for fine-grained conditional probability estimation. Given a premise and a hypothesis, it decodes a discrete probability-level token (`<|label_level_k|>`), which is then mapped to a scalar estimate of the probability that the hypothesis is true given the premise.

- **Developed by:** Liaoyaqi Wang, Zhengping Jiang, Anqi Liu, Benjamin Van Durme
- **Model type:** Decoding-based Regression Model (Classification)
- **Language(s) (NLP):** `en`
- **License:** MIT
- **Finetuned from model:** Qwen/Qwen2.5-14B-Instruct

### Model Sources

- **Repository:** [Decoding-based Regression](https://github.com/zipJiang/decoding-based-regression.git)
- **Paper:** [Always Tell Me The Odds: Fine-grained Conditional Probability Estimation](https://arxiv.org/pdf/2505.01595)

## Uses

### Direct Use

The snippet below registers a custom `level-to-score` text-generation pipeline: the model greedily decodes a label-level token, and the first-step logits over all label-level tokens are converted into a probability score by taking a softmax-weighted expectation of the corresponding bin midpoints.

```python
import re
import torch
import transformers
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    PreTrainedTokenizer,
    TextGenerationPipeline,
)
from transformers.pipelines import PIPELINE_REGISTRY
from transformers.pipelines.text_generation import Chat, ReturnType
from typing import Any, Callable, Dict, List, Text, Tuple


class LevelToScorePipeline(TextGenerationPipeline):
    """Text-generation pipeline that converts decoded probability-level tokens into scalar scores."""

    def __init__(
        self,
        level_to_score_func: Callable[
            [Tuple[torch.FloatTensor], PreTrainedTokenizer],
            Tuple[List[float], List[List[float]]],
        ],
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self._level_to_score_func = level_to_score_func

    def preprocess(
        self,
        prompt_text,
        prefix="",
        handle_long_generation=None,
        add_special_tokens=None,
        truncation=None,
        padding=None,
        max_length=None,
        continue_final_message=None,
        **generate_kwargs,
    ):
        # Only set non-None tokenizer kwargs, so as to rely on the tokenizer's defaults
        tokenizer_kwargs = {
            "add_special_tokens": add_special_tokens,
            "truncation": truncation,
            "padding": padding,
            "max_length": max_length,
        }
        tokenizer_kwargs = {key: value for key, value in tokenizer_kwargs.items() if value is not None}

        if isinstance(prompt_text, Chat):
            tokenizer_kwargs.pop("add_special_tokens", None)  # ignore add_special_tokens on chats
            # If the user passes a chat that ends in an assistant message, we treat it as a prefill by
            # default, because very few models support multiple separate, consecutive assistant messages.
            if continue_final_message is None:
                continue_final_message = prompt_text.messages[-1]["role"] == "assistant"
            inputs = self.tokenizer.apply_chat_template(
                prompt_text.messages,
                add_generation_prompt=not continue_final_message,
                continue_final_message=continue_final_message,
                return_dict=True,
                return_tensors=self.framework,
                **tokenizer_kwargs,
            )
        else:
            inputs = self.tokenizer(prefix + prompt_text, return_tensors=self.framework, **tokenizer_kwargs)

        inputs["prompt_text"] = prompt_text

        if handle_long_generation == "hole":
            cur_len = inputs["input_ids"].shape[-1]
            if "max_new_tokens" in generate_kwargs:
                new_tokens = generate_kwargs["max_new_tokens"]
            else:
                new_tokens = generate_kwargs.get("max_length", self.generation_config.max_length) - cur_len
                if new_tokens < 0:
                    raise ValueError("We cannot infer how many new tokens are expected")
            if cur_len + new_tokens > self.tokenizer.model_max_length:
                keep_length = self.tokenizer.model_max_length - new_tokens
                if keep_length <= 0:
                    raise ValueError(
                        "We cannot use `hole` to handle this generation: the number of desired tokens exceeds"
                        " the model's max length"
                    )
inputs["input_ids"][:, -keep_length:] if "attention_mask" in inputs: inputs["attention_mask"] = inputs["attention_mask"][:, -keep_length:] return inputs def _forward(self, model_inputs, **generate_kwargs): input_ids = model_inputs["input_ids"] attention_mask = model_inputs.get("attention_mask", None) # Allow empty prompts if input_ids.shape[1] == 0: input_ids = None attention_mask = None in_b = 1 else: in_b = input_ids.shape[0] prompt_text = model_inputs.pop("prompt_text") # If there is a prefix, we may need to adjust the generation length. Do so without permanently modifying # generate_kwargs, as some of the parameterization may come from the initialization of the pipeline. prefix_length = generate_kwargs.pop("prefix_length", 0) if prefix_length > 0: has_max_new_tokens = "max_new_tokens" in generate_kwargs or ( "generation_config" in generate_kwargs and generate_kwargs["generation_config"].max_new_tokens is not None ) if not has_max_new_tokens: generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.generation_config.max_length generate_kwargs["max_length"] += prefix_length has_min_new_tokens = "min_new_tokens" in generate_kwargs or ( "generation_config" in generate_kwargs and generate_kwargs["generation_config"].min_new_tokens is not None ) if not has_min_new_tokens and "min_length" in generate_kwargs: generate_kwargs["min_length"] += prefix_length # User-defined `generation_config` passed to the pipeline call take precedence if "generation_config" not in generate_kwargs: generate_kwargs["generation_config"] = self.generation_config generate_kwargs["output_scores"] = not generate_kwargs.get("do_sample", False) generate_kwargs["return_dict_in_generate"] = True generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs) logits = None # TODO: check good default if generate_kwargs.get("return_scores", True): assert not generate_kwargs.get("do_sample", False), "return_logits=True is only supported for do_sample=False" # Proceed to process logits and convert to score average. 
            # `generated_sequence.scores` is a tuple with one [batch_size, vocab_size] tensor per generated step.
            logits = generated_sequence.scores

        out_b = generated_sequence.sequences.shape[0]
        if self.framework == "pt":
            generated_sequence = generated_sequence.sequences.reshape(
                in_b, out_b // in_b, *generated_sequence.sequences.shape[1:]
            )
        # elif self.framework == "tf":
        #     generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))

        return {
            "generated_sequence": generated_sequence,
            "input_ids": input_ids,
            "prompt_text": prompt_text,
            "logits": logits,
        }

    def postprocess(
        self,
        model_outputs,
        return_type=ReturnType.FULL_TEXT,
        clean_up_tokenization_spaces=True,
        continue_final_message=None,
    ):
        generated_sequence = model_outputs["generated_sequence"][0]
        input_ids = model_outputs["input_ids"]
        prompt_text = model_outputs["prompt_text"]
        logits = model_outputs["logits"]
        # TODO: this currently makes many assumptions about how the logits are ordered;
        # think about how to make this explicit.
        scores, selective_logits = self._level_to_score_func(logits, self.tokenizer)
        generated_sequence = generated_sequence.numpy().tolist()
        records = []
        for sequence in generated_sequence:
            if return_type == ReturnType.TENSORS:
                record = {"generated_token_ids": sequence}
            elif return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}:
                # Decode text
                text = self.tokenizer.decode(
                    sequence,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                )
                # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used
                if input_ids is None:
                    prompt_length = 0
                else:
                    prompt_length = len(
                        self.tokenizer.decode(
                            input_ids[0],
                            skip_special_tokens=True,
                            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                        )
                    )
                all_text = text[prompt_length:]
                if return_type == ReturnType.FULL_TEXT:
                    if isinstance(prompt_text, str):
                        all_text = prompt_text + all_text
                    elif isinstance(prompt_text, Chat):
                        if continue_final_message is None:
                            # If the user passes a chat ending in an assistant message, we treat it as a prefill
                            # by default, because very few models support multiple separate, consecutive
                            # assistant messages.
                            continue_final_message = prompt_text.messages[-1]["role"] == "assistant"
                        if continue_final_message:
                            # With assistant prefill, concat onto the end of the last message
                            all_text = list(prompt_text.messages)[:-1] + [
                                {
                                    "role": prompt_text.messages[-1]["role"],
                                    "content": prompt_text.messages[-1]["content"] + all_text,
                                }
                            ]
                        else:
                            # When we're not starting from a prefill, the output is a new assistant message
                            all_text = list(prompt_text.messages) + [{"role": "assistant", "content": all_text}]
                record = {
                    "generated_text": all_text,
                    "score": scores[0],
                    "selective_logits": selective_logits[0],
                }
            records.append(record)
        return records


class SingleLabelRankDict:
    """Maps `<|label_level_k|>` tokens to the scalar probability levels they represent."""

    def __init__(self, rank_dict: Dict[Text, Any]):
        self._rank_dict = rank_dict

    def __len__(self) -> int:
        return len(self._rank_dict)

    def get_rank_dict(self, tokenizer: PreTrainedTokenizer) -> Dict[int, Any]:
        return {tokenizer.convert_tokens_to_ids([token])[0]: value for token, value in self._rank_dict.items()}

    def to_tokenizer(self, tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
        """Augment the tokenizer vocabulary with the tokens in `rank_dict` IN-PLACE."""
""" vocabs: List[Text] = self._rank_dict.keys() new_vocab = [vocab for vocab in vocabs if vocab not in tokenizer.get_vocab()] tokenizer.add_tokens(new_vocab) return tokenizer def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> "SingleLabelRankDict": vocab = tokenizer.get_vocab() rank_dict = {} pattern = re.compile(r" <\|label_level_(\d+)\|>") for token in vocab.keys(): match = pattern.match(token) if match: value = int(match.group(1)) # normalized_value = value / (len(vocab) - 1) rank_dict[token] = value # normalize rank_values num_levels = max(rank_dict.values()) + 1 for token in rank_dict.keys(): rank_dict[token] = 1. / num_levels * (rank_dict[token] + 0.5) return cls(rank_dict=rank_dict) model = transformers.AutoModelForCausalLM.from_pretrained( "Zhengping/conditional-probability-regression", torch_dtype="auto", attn_implementation="flash_attention_2", ) tokenizer = transformers.AutoTokenizer.from_pretrained( "Zhengping/conditional-probability-regression", ) rank_dict = SingleLabelRankDict.from_tokenizer(tokenizer) PIPELINE_REGISTRY.register_pipeline( "level-to-score", pipeline_class=LevelToScorePipeline, pt_model=AutoModelForCausalLM ) # This allows fine-grained labeling, the greedy decoding gives a coarse score, # one can also attach their own level-to-score function to the pipeline, e.g. using UNLI # label transformation to get it more binarized def _level_to_score_func( logits: Tuple[torch.FloatTensor], tokenizer: PreTrainedTokenizer ) -> Tuple[List[float], List[float]]: """ """ logits = logits[0] num_labels = len(rank_dict) considering_ids = tokenizer.convert_tokens_to_ids([f" <|label_level_{i}|>" for i in range(num_labels)]) selective_logits = torch.index_select(logits, 1, torch.tensor(considering_ids, device=logits.device)) step_size = 1 / num_labels expectation = torch.tensor([[i * step_size + 1 / 2 * step_size for i in range(num_labels)]], device=selective_logits.device) scores = torch.softmax(selective_logits, dim=-1) @ expectation.T scores = scores.squeeze(-1).tolist() return scores, selective_logits.tolist() pipe = pipeline( "level-to-score", model=model, max_new_tokens=2, tokenizer=tokenizer, device=0, level_to_score_func=_level_to_score_func, torch_dtype=torch.bfloat16, ) template = UNLITemplate() premise = "Sam is sleeping." hypothesis = "Sam is awake." inputs = [ { "role": "user", "content": "### Question: Given the premise \"{premise}\", how likely is it that the hypothesis \"{hypothesis}\" is true?\n\n".format( premise=premise, hypothesis=hypothesis ) }, { "role": "assitant", "content": "### Answer:" } ] result = pipe(inputs) print(result) ``` ## Use with vLLM `TODO` #### Summary LLM-based Fine-grained Conditional Probability Estimation ## Citation [optional] ```bibtex @article{wang2025always, title={Always Tell Me The Odds: Fine-grained Conditional Probability Estimation}, author={Wang, Liaoyaqi and Jiang, Zhengping and Liu, Anqi and Van Durme, Benjamin}, journal={arXiv preprint arXiv:2505.01595}, year={2025} } ```