import torch

from sparrow_parse.vllm.inference_base import ModelInference


class LocalGPUInference(ModelInference):
    """Run a locally held model on a chosen torch device.

    The model is moved to ``device`` once at construction time; each call to
    :meth:`inference` moves the input to the same device, runs a forward
    pass without gradient tracking, and returns the result as a NumPy array.
    """

    def __init__(self, model, device='cuda'):
        """Store the model and move it to ``device``.

        Args:
            model: Object exposing ``.to(device)``, ``.eval()`` and
                ``__call__`` -- presumably a ``torch.nn.Module``; confirm
                against callers.
            device: Device passed to ``.to()`` for both the model and the
                inputs. Defaults to ``'cuda'``.
        """
        self.model = model
        self.device = device
        self.model.to(self.device)

    def inference(self, input_data, mode=None):
        """Run one forward pass and return the output as a NumPy array.

        Args:
            input_data: Either a ``torch.Tensor`` or any data accepted by
                ``torch.tensor`` (nested lists, NumPy arrays, scalars).
            mode: Accepted for interface compatibility; not used here.

        Returns:
            The model output copied to host memory and converted with
            ``.numpy()``. This assumes the model returns a single tensor --
            TODO confirm for models that return tuples or dicts.
        """
        # Re-assert evaluation mode on every call in case the model was
        # switched back to training mode between calls.
        self.model.eval()
        with torch.no_grad():  # Forward pass only; no gradients needed.
            if isinstance(input_data, torch.Tensor):
                # torch.tensor() on an existing tensor emits a UserWarning
                # and forces an extra copy; detach and move it instead.
                input_tensor = input_data.detach().to(self.device)
            else:
                input_tensor = torch.tensor(input_data, device=self.device)
            output = self.model(input_tensor)
        return output.cpu().numpy()