Update vtoonify_model.py
vtoonify_model.py  +71 -62
CHANGED
@@ -14,10 +14,14 @@ from model.vtoonify import VToonify
 from model.bisenet.model import BiSeNet
 import torch.nn.functional as F
 from torchvision import transforms
-from model.encoder.align_all_parallel import align_face
 import gc
 import huggingface_hub
 import os
+import logging
+import PIL.Image  # needed by the new align_face() below
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
 
 MODEL_REPO = 'PKUWilliamYang/VToonify'
 
@@ -68,7 +72,7 @@ class Model():
     def _create_insightface_detector(self):
         # Initialize InsightFace
         app = insightface.app.FaceAnalysis()
-        app.prepare(ctx_id=0, det_size=(640, 640))
+        app.prepare(ctx_id=0 if self.device == 'cuda' else -1, det_size=(640, 640))
         return app
 
     def _create_parsing_model(self):
@@ -94,66 +98,7 @@ class Model():
         exstyle = vtoonify.zplus2wplus(exstyle)
         return vtoonify, exstyle
 
-    def detect_and_align(self, frame, top, bottom, left, right, return_para=False):
-        message = 'Error: no face detected! Please retry or change the photo.'
-        instyle = None
-        # Use InsightFace for face detection
-        faces = self.face_detector.get(frame)
-        if len(faces) > 0:
-            face = faces[0]
-            bbox = face.bbox.astype(int)
-            x, y, w, h = bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]
-            top, bottom, left, right = y, y + h, x, x + w
-            scale = 1.0  # Adjust scale as needed
-            h, w = frame.shape[:2]
-            H, W = int(bottom-top), int(right-left)
-            # for HR image, we apply gaussian blur to it to avoid over-sharp stylization results
-            kernel_1d = np.array([[0.125], [0.375], [0.375], [0.125]])
-            if scale <= 0.75:
-                frame = cv2.sepFilter2D(frame, -1, kernel_1d, kernel_1d)
-            if scale <= 0.375:
-                frame = cv2.sepFilter2D(frame, -1, kernel_1d, kernel_1d)
-            frame = cv2.resize(frame, (w, h))[top:bottom, left:right]
-            with torch.no_grad():
-                I = align_face(frame, self.face_detector)
-                if I is not None:
-                    I = self.transform(I).unsqueeze(dim=0).to(self.device)
-                    instyle = self.pspencoder(I)
-                    instyle = self.vtoonify.zplus2wplus(instyle)
-                    message = 'Successfully rescale the frame to (%d, %d)' % (bottom-top, right-left)
-                else:
-                    frame = np.zeros((256, 256, 3), np.uint8)
-        else:
-            frame = np.zeros((256, 256, 3), np.uint8)
-        if return_para:
-            return frame, instyle, message, w, h, top, bottom, left, right, scale
-        return frame, instyle, message
-
-    # Other methods remain unchanged
-    def _create_parsing_model(self):
-        parsingpredictor = BiSeNet(n_classes=19)
-        parsingpredictor.load_state_dict(torch.load(huggingface_hub.hf_hub_download(MODEL_REPO, 'models/faceparsing.pth'),
-                                                    map_location=lambda storage, loc: storage))
-        parsingpredictor.to(self.device).eval()
-        return parsingpredictor
-
-    def _load_encoder(self) -> nn.Module:
-        style_encoder_path = huggingface_hub.hf_hub_download(MODEL_REPO, 'models/encoder.pt')
-        return load_psp_standalone(style_encoder_path, self.device)
-
-    def _load_default_model(self) -> tuple:
-        vtoonify = VToonify(backbone='dualstylegan')
-        vtoonify.load_state_dict(torch.load(huggingface_hub.hf_hub_download(MODEL_REPO,
-                                            'models/vtoonify_d_cartoon/vtoonify_s026_d0.5.pt'),
-                                            map_location=lambda storage, loc: storage)['g_ema'])
-        vtoonify.to(self.device)
-        tmp = np.load(huggingface_hub.hf_hub_download(MODEL_REPO, 'models/vtoonify_d_cartoon/exstyle_code.npy'), allow_pickle=True).item()
-        exstyle = torch.tensor(tmp[list(tmp.keys())[26]]).to(self.device)
-        with torch.no_grad():
-            exstyle = vtoonify.zplus2wplus(exstyle)
-        return vtoonify, exstyle
-
-    def load_model(self, style_type: str) -> tuple:
+    def load_model(self, style_type: str) -> tuple[torch.Tensor, str]:
         if 'illustration' in style_type:
             self.color_transfer = True
         else:
@@ -170,7 +115,71 @@ class Model():
         with torch.no_grad():
             exstyle = self.vtoonify.zplus2wplus(exstyle)
         return exstyle, 'Model of %s loaded.' % (style_type)
+
+    def detect_and_align(self, frame, top, bottom, left, right, return_para=False):
+        message = 'Error: no face detected! Please retry or change the photo.'
+        instyle = None
+        h, w, scale = 0, 0, 0
+
+        # Use InsightFace for face detection
+        faces = self.face_detector.get(frame)
+        if len(faces) > 0:
+            logging.info(f"Detected {len(faces)} face(s).")
+            face = faces[0]
+            bbox = face.bbox.astype(int)
+            # 5-point keypoints: left eye, right eye, nose, left/right mouth corner
+            kps = face.kps
+
+            # Align face based on the detected keypoints
+            aligned_face = self.align_face(frame, kps)
+            if aligned_face is not None:
+                with torch.no_grad():
+                    I = self.transform(aligned_face).unsqueeze(dim=0).to(self.device)
+                    instyle = self.pspencoder(I)
+                    instyle = self.vtoonify.zplus2wplus(instyle)
+                message = 'Successfully aligned the face.'
+            else:
+                frame = np.zeros((256, 256, 3), np.uint8)
+        else:
+            logging.warning("No face detected.")
+            frame = np.zeros((256, 256, 3), np.uint8)
+
+        if return_para:
+            return frame, instyle, message, h, w, top, bottom, left, right, scale
+        return frame, instyle, message
+
+    def align_face(self, image, kps):
+        # Keypoints from InsightFace: left eye, right eye, nose, mouth corners
+        eye_left = kps[0]
+        eye_right = kps[1]
+        mouth_left = kps[3]
+        mouth_right = kps[4]
+
+        # Calculate transformation parameters
+        eye_center = (eye_left + eye_right) / 2
+        mouth_center = (mouth_left + mouth_right) / 2
+        eye_to_eye = eye_right - eye_left
+        eye_to_mouth = mouth_center - eye_center
+
+        # Define the oriented crop rectangle (FFHQ-style alignment)
+        x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
+        x /= np.hypot(*x)
+        x *= np.hypot(*eye_to_eye) * 2.0
+        y = np.flipud(x) * [-1, 1]
+        c = eye_center + eye_to_mouth * 0.1
+        quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
+
+        # Transform and crop the image
+        transform_size = 256
+        output_size = 256
+        img = PIL.Image.fromarray(image)
+        img = img.transform((transform_size, transform_size), PIL.Image.QUAD, (quad + 0.5).flatten(), PIL.Image.BILINEAR)
+        if output_size < transform_size:
+            img = img.resize((output_size, output_size), PIL.Image.LANCZOS)
+        return np.array(img)
+
+    # Other methods remain unchanged
 
     def detect_and_align_image(self, frame_rgb: np.ndarray, top: int, bottom: int, left: int, right: int) -> tuple:
         if frame_rgb is None:
            return np.zeros((256, 256, 3), np.uint8), None, 'Error: fail to load the image.'
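
Reviewer note: a minimal smoke test of the new InsightFace detection/alignment path, sketched under assumptions — the `Model(device=...)` constructor is taken from the surrounding space code, and the image path is illustrative, not part of this diff.

    import cv2
    import torch
    from vtoonify_model import Model

    # Hypothetical usage; assumes Model accepts a device string as in app.py.
    model = Model(device='cuda' if torch.cuda.is_available() else 'cpu')
    frame = cv2.imread('input.jpg')  # illustrative path; cv2 loads BGR
    # top/bottom/left/right are passed through unchanged by the new code.
    aligned, instyle, message = model.detect_and_align(frame, 0, 0, 0, 0)
    print(message)  # expect 'Successfully aligned the face.' when a face is found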