fixes to incorporate new metrics
healthcare_standards_raft.py  (+81 -48)
CHANGED
@@ -9,67 +9,55 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from llama_index.core import VectorStoreIndex, load_index_from_storage
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from peft import PeftModel, PeftConfig
 
 class HealthcareStandardsRAFT:
     """
     Healthcare Standards RAFT system that combines RAG and LoRA fine-tuning.
     """
 
     def __init__(self, model_path=None, device="cuda" if torch.cuda.is_available() else "cpu"):
         """
         Initialize the Healthcare Standards RAFT system.
 
         Args:
             model_path: Path to model directory or Hugging Face repo name
             device: Device to use for inference (cuda/cpu)
         """
-        #
+        # Handle local fallback if no path is provided
         if model_path is None:
-            model_path = "
-
-        self.device = device
+            model_path = "./healthcare-standards-raft"
+
         self.model_path = model_path
-
+        self.adapter_dir = os.path.join(self.model_path, "model")
+        self.device = device
+
         # Load tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained(
-
-        # Load base model and apply
+        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-4-mini-instruct")
+
+        # Load base model and apply LoRA adapter
         self._load_model()
-
+
         # Load vector index for RAG
         self._load_vector_index()
-
+
     def _load_model(self):
-        """Load base model and apply LoRA weights."""
-
+        """Load base model and apply LoRA weights using PEFT."""
+
+        print("Loading base Phi-4-mini model...")
         self.model = AutoModelForCausalLM.from_pretrained(
             "microsoft/phi-4-mini-instruct",
             torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
             device_map="auto" if self.device == "cuda" else None
         )
 
-        if os.path.isdir(self.model_path):
-            # Local directory
-            adapter_path = os.path.join(self.model_path, "model", "adapter_model.bin")
-        else:
-            # Download adapter weights from Hugging Face
-            from huggingface_hub import hf_hub_download
-            adapter_path = hf_hub_download(
-                repo_id=self.model_path,
-                filename="model/adapter_model.bin"
-            )
-
-        # Load LoRA weights
-        if os.path.exists(adapter_path):
-            # Load the weights using PEFT or direct state dict loading
-            # Implementation depends on your specific LoRA setup
-            weights = torch.load(adapter_path, map_location="cpu")
-            # This is a simplified example - you'll need to adapt this
-            # to your specific LoRA implementation
-            self.model.load_state_dict(weights, strict=False)
-        else:
-            print(f"Warning: Adapter weights not found at {adapter_path}")
+        adapter_dir = os.path.join(self.model_path, "model")
+        print(f"Applying LoRA adapter from: {adapter_dir}")
+        self.model = PeftModel.from_pretrained(self.model, adapter_dir)
 
     def _load_vector_index(self):
         """Load the vector index for RAG."""
@@ -78,10 +66,8 @@ class HealthcareStandardsRAFT:
         if os.path.isdir(self.model_path):
             index_path = os.path.join(self.model_path, "vector_index")
         else:
-            index_path = "vector_index"  # Local cached path
-            # You would need to download the index files here
+            index_path = "healthcare-standards-raft/vector_index"  # Local cached path
 
         # Create embedding model
         embed_model = HuggingFaceEmbedding(
@@ -163,24 +149,71 @@ Question: {question}
 Answer:"""
 
         # Generate response
-        if self.
-
+        # Ensure pad token is set
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        # Tokenize
+        inputs = self.tokenizer(prompt, return_tensors="pt", padding=True)
+
+        # Create attention mask
+        inputs["attention_mask"] = (inputs["input_ids"] != self.tokenizer.pad_token_id).long()
+
+        # Move to device
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
         # Generate
         with torch.no_grad():
             outputs = self.model.generate(
-                inputs
+                inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
                 max_new_tokens=max_tokens,
                 temperature=temperature,
                 top_p=0.9,
-                do_sample=temperature > 0
+                do_sample=temperature > 0,
+                pad_token_id=self.tokenizer.pad_token_id  # good practice
             )
 
         # Decode response
         response = self.tokenizer.decode(
-            outputs[0][inputs
+            outputs[0][inputs["input_ids"].shape[1]:],
             skip_special_tokens=True
         )
 
-        return response
+        return response
+
+    def get_retrieved_contexts(self, question):
+        """
+        Get the contexts retrieved for a specific question.
+
+        Args:
+            question (str): The question to retrieve contexts for
+
+        Returns:
+            list: List of retrieved context strings
+        """
+        try:
+            if hasattr(self, 'index') and self.index is not None:
+                # Use the retriever to get relevant documents
+                retriever = self.index.as_retriever(similarity_top_k=3)
+                nodes = retriever.retrieve(question)
+
+                # Extract text from the retrieved nodes
+                contexts = []
+                for node in nodes:
+                    if hasattr(node, 'node') and hasattr(node.node, 'get_content'):
+                        contexts.append(node.node.get_content())
+                    elif hasattr(node, 'get_content'):
+                        contexts.append(node.get_content())
+                    elif hasattr(node, 'text'):
+                        contexts.append(node.text)
+                    else:
+                        contexts.append(str(node))
+
+                return contexts
+            else:
+                return ["Vector index not available"]
+        except Exception as e:
+            print(f"Error retrieving contexts: {e}")
+            return [f"Error retrieving contexts: {e}"]
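
For orientation, a minimal usage sketch of the class after this change (not part of the commit). It assumes the file is importable as healthcare_standards_raft and that a local ./healthcare-standards-raft directory holds the model/ LoRA adapter and vector_index/ store that __init__ and _load_vector_index expect; the question string is a made-up example.

from healthcare_standards_raft import HealthcareStandardsRAFT  # assumed module name

# With model_path=None the constructor now falls back to "./healthcare-standards-raft".
raft = HealthcareStandardsRAFT(model_path="./healthcare-standards-raft")

# New helper added in this commit: inspect what the retriever returns for a question
# before any answer is generated.
question = "Which HL7 FHIR resource carries patient demographics?"  # hypothetical example
for i, context in enumerate(raft.get_retrieved_contexts(question), start=1):
    print(f"--- context {i} ---")
    print(context[:300])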