Mettaton

Running

App Files Files Community

DragonProgrammer committed 19 days ago

Commit

619ec30

verified ·

1 Parent(s): 451d4d7

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -20

app.py CHANGED Viewed

@@ -66,8 +66,7 @@ class LangChainAgentWrapper:
     def __init__(self):
         print("Initializing LangChainAgentWrapper...")
-        # We will keep using the gemma-2b-it model, but load it in 4-bit
-        model_id = "google/gemma-2b-it"
         try:
             hf_auth_token = os.getenv("HF_TOKEN")
@@ -76,34 +75,48 @@ class LangChainAgentWrapper:
             else:
                 print("HF_TOKEN secret found.")
-            # --- NEW: 4-Bit Quantization Configuration ---
-            # Create a configuration for loading the model in 4-bit precision.
-            # This makes the model faster and use less memory.
             print("Creating 4-bit quantization config...")
             quantization_config = transformers.BitsAndBytesConfig(
                 load_in_4bit=True,
                 bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype="bfloat16" # Use bfloat16 for faster computation
             )
             print("Quantization config created.")
-            # --- END NEW ---
-            # Create the Hugging Face pipeline
-            print(f"Loading model pipeline for: {model_id} with quantization")
             llm_pipeline = transformers.pipeline(
                 "text-generation",
-                model=model_id,
-                model_kwargs={"torch_dtype": "auto"},
-                device_map="auto",
-                token=hf_auth_token,
-                quantization_config=quantization_config # <<< --- PASS THE NEW CONFIG HERE
             )
-            print("Model pipeline loaded successfully.")
             # Wrap the pipeline in a LangChain LLM object
             self.llm = HuggingFacePipeline(pipeline=llm_pipeline)
-            # Define the list of LangChain tools (this part is unchanged)
             self.tools = [
                 Tool(
                     name="get_current_time_in_timezone",
@@ -119,7 +132,7 @@ class LangChainAgentWrapper:
             ]
             print(f"Tools prepared for agent: {[tool.name for tool in self.tools]}")
-            # Create the ReAct agent prompt from a template (this part is unchanged)
             react_prompt = PromptTemplate.from_template(
                 """
                 You are a helpful assistant. Answer the following questions as best you can.
@@ -145,10 +158,8 @@ class LangChainAgentWrapper:
                 """
             )
-            # Create the agent (this part is unchanged)
             agent = create_react_agent(self.llm, self.tools, react_prompt)
-            # Create the agent executor (this part is unchanged)
             self.agent_executor = AgentExecutor(agent=agent, tools=self.tools, verbose=True, handle_parsing_errors=True)
             print("LangChain agent created successfully.")

     def __init__(self):
         print("Initializing LangChainAgentWrapper...")
+        model_id = "google/gemma-2b-it"
         try:
             hf_auth_token = os.getenv("HF_TOKEN")
             else:
                 print("HF_TOKEN secret found.")
+            # --- CORRECTED MODEL LOADING ---
+            # 1. Create the 4-bit quantization configuration
             print("Creating 4-bit quantization config...")
             quantization_config = transformers.BitsAndBytesConfig(
                 load_in_4bit=True,
                 bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype="bfloat16"
             )
             print("Quantization config created.")
+            # 2. Load the tokenizer
+            print(f"Loading tokenizer for: {model_id}")
+            tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, token=hf_auth_token)
+            print("Tokenizer loaded successfully.")
+            # 3. Load the model with the quantization config
+            print(f"Loading model '{model_id}' with quantization...")
+            model = transformers.AutoModelForCausalLM.from_pretrained(
+                model_id,
+                quantization_config=quantization_config,
+                device_map="auto", # Automatically maps model to available hardware (CPU/GPU)
+                token=hf_auth_token
+            )
+            print("Model loaded successfully.")
+            # 4. Create the Hugging Face pipeline with the pre-loaded model and tokenizer
+            print("Creating text-generation pipeline...")
             llm_pipeline = transformers.pipeline(
                 "text-generation",
+                model=model,
+                tokenizer=tokenizer,
+                # No need to pass quantization_config here anymore
             )
+            print("Model pipeline created successfully.")
+            # --- END CORRECTION ---
             # Wrap the pipeline in a LangChain LLM object
             self.llm = HuggingFacePipeline(pipeline=llm_pipeline)
+            # Define the list of LangChain tools (this part is unchanged and correct)
             self.tools = [
                 Tool(
                     name="get_current_time_in_timezone",
             ]
             print(f"Tools prepared for agent: {[tool.name for tool in self.tools]}")
+            # Create the ReAct agent prompt (this part is unchanged and correct)
             react_prompt = PromptTemplate.from_template(
                 """
                 You are a helpful assistant. Answer the following questions as best you can.
                 """
             )
+            # Create the agent and executor (this part is unchanged and correct)
             agent = create_react_agent(self.llm, self.tools, react_prompt)
             self.agent_executor = AgentExecutor(agent=agent, tools=self.tools, verbose=True, handle_parsing_errors=True)
             print("LangChain agent created successfully.")