Feng Luo committed
Commit · e73d95b
Parent(s): 2464065
update example usage

Files changed:
- README.md +74 -1
- examples/__pycache__/prefixLLM.cpython-310.pyc +0 -0
- examples/__pycache__/template.cpython-310.pyc +0 -0
- examples/inference.py +20 -0
- examples/prefixLLM.py +150 -0
- examples/template.py +26 -0
README.md
CHANGED
@@ -1,3 +1,76 @@
---
license: apache-2.0
---

# AutoL2S-7B

This is the official model repository for **AutoL2S-7B**, a model fine-tuned for efficient reasoning, based on [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/tree/main).

## 💡 Overview

AutoL2S automatically switches between short and long reasoning paths based on the complexity of the input. This repository contains:

- Model weights
- Configuration files
- The necessary scripts in the `examples/` directory

---

## 🧩 Dependencies

We recommend using the model with [vLLM](https://github.com/vllm-project/vllm).
The code has been tested with:

```
vLLM == 0.6.2
```
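
If vLLM is not already installed, the tested version can be installed from PyPI (a minimal sketch; a CUDA-enabled Python 3.10 environment is assumed):

```bash
pip install vllm==0.6.2
```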

---

## 🚀 How to Use

Run the inference example:

```bash
cd examples
python inference.py
```
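
`PrefixLLM` subclasses `vllm.LLM`, so standard vLLM engine arguments can be passed through when constructing it if your hardware requires different settings (a minimal sketch; the values shown are illustrative, not recommendations):

```python
from prefixLLM import PrefixLLM

llm = PrefixLLM(
    model="amandaa/AutoL2S-7b",
    tensor_parallel_size=1,       # number of GPUs to shard the model across
    gpu_memory_utilization=0.90,  # fraction of GPU memory vLLM may reserve
    max_model_len=32768,          # must cover the prompt plus max_tokens
)
```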

Alternatively, **download `examples/prefixLLM.py` and `examples/template.py` from this repository and place them in your working directory**, then run:

```python
from vllm import SamplingParams

from prefixLLM import PrefixLLM
from template import SYSTEM_PROMPT, SHORT_TRIGGER

llm = PrefixLLM(model="amandaa/AutoL2S-7b")
max_tokens, temp = 32768, 0.7
sampling_params_route = SamplingParams(
    max_tokens=max_tokens,
    temperature=temp,
    stop=["<specialLong>"],
    include_stop_str_in_output=True,
)
sampling_params_force_think = SamplingParams(max_tokens=max_tokens, temperature=temp)

question = "Convert the point $(0,3)$ in rectangular coordinates to polar coordinates. Enter your answer in the form $(r,\\theta),$ where $r > 0$ and $0 \\le \\theta < 2 \\pi.$"
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": question},
]
responses = llm.route_chat(
    messages=messages,
    sampling_params_route=sampling_params_route,
    sampling_params_force_think=sampling_params_force_think,
    use_tqdm=True,
    trigger_word=SHORT_TRIGGER,
)

print(SHORT_TRIGGER + responses[0].outputs[0].text)
```
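
`route_chat` runs two passes: it first generates with `sampling_params_route`, stopping at the `<specialLong>` marker, and any request whose output contains that marker near the beginning is re-generated with the `SHORT_TRIGGER` prefix inserted at the end of the prompt, using `sampling_params_force_think`. In both cases the answer is read with `SHORT_TRIGGER` prepended, as in the `print` call above, so the solution should end up wrapped in the `<|begin_of_solution|>` / `<|end_of_solution|>` markers from the system prompt. A minimal post-processing sketch that continues the example above (the `extract_solution` helper is illustrative, not part of this repository):

```python
import re

def extract_solution(text: str) -> str:
    """Return the text between the solution markers, or the full text if absent."""
    match = re.search(r"<\|begin_of_solution\|>(.*?)<\|end_of_solution\|>", text, re.DOTALL)
    return match.group(1).strip() if match else text.strip()

full_output = SHORT_TRIGGER + responses[0].outputs[0].text
print(extract_solution(full_output))
```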

---

## 🔍 Citation

If you use this model in your work, please consider citing:

```bibtex
@misc{autol2s2025,
  title   = {AutoL2S: Auto Long-Short Reasoning for Efficient Large Language Models},
  author  = {Luo, Feng* and Chuang, Yu-Neng* and Wang, Guanchu* and Le, Duy and Zhong, Shaochen and Liu, Hongyi and Yuan, Jiayi and Sui, Yang and Braverman, Vladimir and Chaudhary, Vipin and Hu, Xia},
  journal = {arXiv preprint},
  year    = {2025}
}
```
examples/__pycache__/prefixLLM.cpython-310.pyc
ADDED
Binary file (3.98 kB)

examples/__pycache__/template.cpython-310.pyc
ADDED
Binary file (2.01 kB)
examples/inference.py
ADDED
@@ -0,0 +1,20 @@
from vllm import SamplingParams

from prefixLLM import PrefixLLM
from template import SYSTEM_PROMPT, SHORT_TRIGGER


if __name__ == "__main__":
    llm = PrefixLLM(model="amandaa/AutoL2S-7b")
    max_tokens, temp = 32768, 0.7
    # First pass: stop generation as soon as the <specialLong> marker appears.
    sampling_params_route = SamplingParams(
        max_tokens=max_tokens,
        temperature=temp,
        stop=["<specialLong>"],
        include_stop_str_in_output=True,
    )
    # Second pass, used for requests whose first pass emitted <specialLong>.
    sampling_params_force_think = SamplingParams(max_tokens=max_tokens, temperature=temp)

    question = "Convert the point $(0,3)$ in rectangular coordinates to polar coordinates. Enter your answer in the form $(r,\\theta),$ where $r > 0$ and $0 \\le \\theta < 2 \\pi.$"
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question},
    ]
    responses = llm.route_chat(
        messages=messages,
        sampling_params_route=sampling_params_route,
        sampling_params_force_think=sampling_params_force_think,
        use_tqdm=True,
        trigger_word=SHORT_TRIGGER,
    )

    print(SHORT_TRIGGER + responses[0].outputs[0].text)
examples/prefixLLM.py
ADDED
@@ -0,0 +1,150 @@
import copy
import re
from typing import Any, Dict, List, Optional, Union

from vllm import LLM, SamplingParams
from vllm.entrypoints.chat_utils import (
    ChatCompletionMessageParam,
    apply_hf_chat_template,
    apply_mistral_chat_template,
    parse_chat_messages,
)
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.transformers_utils.tokenizer import MistralTokenizer
from vllm.utils import is_list_of


_TAIL_WS_RE = re.compile(r"(?:\r?\n|\s)+$")


def needs_newline(text: str) -> bool:
    """Return True when *text* does NOT already end with whitespace/newline."""
    return _TAIL_WS_RE.search(text[-8:]) is None  # inspect last few chars


def add_prefix(prompt: str, prefix: str, eos_token: str) -> str:
    """Insert *prefix* before the first generated token.

    Keeps the EOS token at the very end if the template already appended it.
    """
    if prompt.endswith(eos_token):
        return prompt[:-len(eos_token)] + prefix + eos_token
    return prompt + prefix


class PrefixLLM(LLM):
    """vLLM ``LLM`` subclass that conditionally prepends *trigger_word*."""

    def route_chat(
        self,
        messages: Union[
            List[ChatCompletionMessageParam],
            List[List[ChatCompletionMessageParam]],
        ],
        sampling_params_route: Optional[Union[SamplingParams, List[SamplingParams]]] = None,
        sampling_params_force_think: Optional[Union[SamplingParams, List[SamplingParams]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
        chat_template: Optional[str] = None,
        add_generation_prompt: bool = True,
        tools: Optional[List[Dict[str, Any]]] = None,
        *,
        trigger_word: Optional[str] = None,
    ) -> List[RequestOutput]:
        """Drop-in replacement for ``LLM.chat`` with one extra keyword.

        Parameters
        ----------
        trigger_word : str | None, default None
            The prefix to inject. If ``None``, a ``ValueError`` is raised.
        """
        tokenizer = self.get_tokenizer()
        model_config = self.llm_engine.get_model_config()
        eos_token = tokenizer.eos_token

        orig_prompts: List[Union[TokensPrompt, TextPrompt]] = []
        pref_prompts: List[Union[TokensPrompt, TextPrompt]] = []

        # Handle both a single conversation and a batch of conversations.
        list_of_messages: List[List[ChatCompletionMessageParam]]
        if is_list_of(messages, list):
            # messages is List[List[...]]
            list_of_messages = messages
        else:
            # messages is List[...]
            list_of_messages = [messages]

        for msgs in list_of_messages:
            # ---- render the chat template exactly once ----
            if isinstance(tokenizer, MistralTokenizer):
                prompt_data: Union[str, List[int]] = apply_mistral_chat_template(
                    tokenizer,
                    messages=msgs,
                    chat_template=chat_template,
                    add_generation_prompt=add_generation_prompt,
                    tools=tools,
                )
                mm_data = None  # the mistral util returns already embedded image tokens
            else:
                conversation, mm_data = parse_chat_messages(msgs, model_config, tokenizer)
                prompt_data = apply_hf_chat_template(
                    tokenizer,
                    conversation=conversation,
                    chat_template=chat_template,
                    add_generation_prompt=add_generation_prompt,
                    tools=tools,
                )

            if is_list_of(prompt_data, int):
                raise NotImplementedError  # token-id prompts are not supported here
            orig_prompt = TextPrompt(prompt=prompt_data)

            if trigger_word is None:
                raise ValueError("trigger_word must be provided when using force_think logic")

            # Build the prefixed variant of the same prompt.
            need_nl = needs_newline(prompt_data)
            prefix = trigger_word + ("\n" if need_nl else "")
            pref_txt = add_prefix(prompt_data, prefix, eos_token)
            pref_prompt = TextPrompt(prompt=pref_txt)

            if mm_data is not None:
                orig_prompt["multi_modal_data"] = mm_data
                pref_prompt["multi_modal_data"] = copy.deepcopy(mm_data)

            orig_prompts.append(orig_prompt)
            pref_prompts.append(pref_prompt)

        # First pass: generate from the unprefixed prompts, stopping at <specialLong>.
        results = self.generate(
            orig_prompts,
            sampling_params=sampling_params_route,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
        )

        # Requests whose output contains <specialLong> near the start are re-generated
        # from the prefixed prompts with the force-think sampling parameters.
        need_force = [
            i for i, out in enumerate(results) if "<specialLong>" in out.outputs[0].text[:100]
        ]

        if len(need_force) == 0:
            return results  # early exit, nothing to redo

        prompts_force = [pref_prompts[i] for i in need_force]

        results_force = self.generate(
            prompts_force,
            sampling_params=sampling_params_force_think,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
        )

        for idx, new_out in zip(need_force, results_force):
            results[idx] = new_out

        return results
examples/template.py
ADDED
@@ -0,0 +1,26 @@
# System prompt describing the two answer formats: either a direct short solution, or a long
# "Thought + Solution" trace wrapped in the <|begin_of_thought|> / <|begin_of_solution|> markers.
SYSTEM_PROMPT = "Your role as an assistant is to solve problems using one of two ways. \
First, think step-by-step and give the answer. The solution should be formatted as follows: \
<|begin_of_solution|> \
{precise and concise solution} \
<|end_of_solution|> \
Second, your output thoroughly exploring questions through a systematic long \
thinking process before providing the final precise and accurate solutions. This requires \
engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, \
backtracing, and iteration to develop well-considered thinking process. \
Please structure your response into two main sections: Thought and Solution. \
In the Thought section, detail your reasoning process using the specified format: \
<|begin_of_thought|> {thought with steps separated with '\n\n'} \
<|end_of_thought|> \
Each step should include detailed considerations such as analisying questions, summarizing \
relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining \
any errors, and revisiting previous steps. \
In the Solution section, based on various attempts, explorations, and reflections from the Thought \
section, systematically present the final solution that you deem correct. The solution should \
remain a logical, accurate, concise expression style and detail necessary step needed to reach the \
conclusion, formatted as follows: \
<|begin_of_solution|> \
{final formatted, precise, and clear solution} \
<|end_of_solution|> \
Now, try to solve the following question through the above guidelines:"

# Trigger prefix passed to PrefixLLM.route_chat as `trigger_word`; it is appended to the prompt for
# the second-pass generations and prepended to the generated text when printing the final output.
SHORT_TRIGGER = "<|begin_of_solution|>\n\nThis is a trigger to ensure the model’s upcoming output <short>."