Spaces:

tttoaster
/

SEED-X-17B

Build error

App Files Files Community

tttoaster committed on May 14, 2024

Commit

281a32c

verified ·

1 Parent(s): 8c7cb04

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -14

app.py CHANGED Viewed

@@ -187,7 +187,7 @@ class LLMService:
         model_id_or_path = "stablediffusionapi/realistic-vision-v51"
         self.vae_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, safety_checker=None, torch_dtype=torch.float16)
-        self.vae_pipe = self.vae_pipe.to(self.vit_sd_device)
         self.boi_token_id = self.tokenizer.encode(BOI_TOKEN, add_special_tokens=False)[0]
         self.eoi_token_id = self.tokenizer.encode(EOI_TOKEN, add_special_tokens=False)[0]
@@ -207,7 +207,7 @@ class LLMService:
 service = LLMService(args)
 @spaces.GPU
-def generate(text_list, image_list, max_new_tokens, force_boi, force_bbox):
   with torch.no_grad():
     text_list = text_list.split(IMG_FLAG)
     top_p = 0.5
@@ -360,14 +360,21 @@ def generate(text_list, image_list, max_new_tokens, force_boi, force_bbox):
             img_feat = img_gen_feat[img_idx:img_idx + 1]
             generated_image = service.sd_adapter.generate(image_embeds=img_feat, num_inference_steps=50)[0]
-            init_image = generated_image.resize((1024, 1024))
-            prompt = ""
-            images = service.vae_pipe(prompt=prompt, image=init_image,
-                          num_inference_steps=50, guidance_scale=8.0, strength=0.38).images
-            generated_image = images[0]
-            image_base64 = encode_image(generated_image)
-            gen_imgs_base64_list.append(image_base64)
         # print('loading visual encoder and llm to GPU, and sd to CPU')
         # a = time.time()
@@ -387,7 +394,7 @@ def generate(text_list, image_list, max_new_tokens, force_boi, force_bbox):
     print(input_text + generated_text)
     return {'text': generated_text, 'images': gen_imgs_base64_list, 'error_msg': error_msg}
-def http_bot(dialog_state, input_state, max_new_tokens, max_turns, force_image_gen, force_bbox,
              request: gr.Request):
     print('input_state:', input_state)
@@ -409,7 +416,7 @@ def http_bot(dialog_state, input_state, max_new_tokens, max_turns, force_image_g
     force_boi = force_image_gen
     force_bbox = force_bbox
-    results = generate(text, images, max_new_tokens, force_boi, force_bbox)
     print('response: ', {'text': results['text'], 'error_msg': results['error_msg']})
     output_state = init_input_state()
@@ -652,6 +659,8 @@ If you want to experience the normal model inference speed, you can run [[Infere
 * You can click "Force Image Generation" to compel the model to produce images when necessary. For example, our model might struggle to generate images when there is an excessive amount of text-only context.
 * You can click "Force Bounding Box" to compel the model to produce bounding box for object detection.
 * SEED-X was trained with English-only data. It may process with other languages due to the inherent capabilities from LLaMA, but might not stable.
@@ -755,6 +764,7 @@ if __name__ == '__main__':
                                           label="Max History Rounds")
                     force_img_gen = gr.Radio(choices=[True, False], value=False, label='Force Image Generation')
                     force_bbox = gr.Radio(choices=[True, False], value=False, label='Force Bounding Box')
             with gr.Column(scale=7):
                 chatbot = gr.Chatbot(elem_id='chatbot', label="SEED-X-I", height=700)
@@ -776,7 +786,7 @@ if __name__ == '__main__':
         downvote_btn.click(downvote_last_response, [dialog_state], [upvote_btn, downvote_btn])
         regenerate_btn.click(regenerate, [dialog_state], [dialog_state, chatbot] + btn_list).then(
-            http_bot, [dialog_state, input_state, max_new_tokens, max_turns, force_img_gen, force_bbox],
             [dialog_state, input_state, chatbot] + btn_list)
         add_image_btn.click(add_image, [dialog_state, input_state, image],
                             [dialog_state, input_state, image, chatbot] + btn_list)
@@ -789,7 +799,7 @@ if __name__ == '__main__':
             add_text, [dialog_state, input_state, text],
             [dialog_state, input_state, text, chatbot, upvote_btn, downvote_btn, regenerate_btn, clear_btn]).then(
             http_bot,
-            [dialog_state, input_state, max_new_tokens, max_turns, force_img_gen, force_bbox],
             [dialog_state, input_state, chatbot] + btn_list)
         clear_btn.click(clear_history, None, [dialog_state, input_state, chatbot] + btn_list)

         model_id_or_path = "stablediffusionapi/realistic-vision-v51"
         self.vae_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, safety_checker=None, torch_dtype=torch.float16)
+        self.vae_pipe = self.vae_pipe.cpu()
         self.boi_token_id = self.tokenizer.encode(BOI_TOKEN, add_special_tokens=False)[0]
         self.eoi_token_id = self.tokenizer.encode(EOI_TOKEN, add_special_tokens=False)[0]
 service = LLMService(args)
 @spaces.GPU
+def generate(text_list, image_list, max_new_tokens, force_boi, force_bbox, force_polish):
   with torch.no_grad():
     text_list = text_list.split(IMG_FLAG)
     top_p = 0.5
             img_feat = img_gen_feat[img_idx:img_idx + 1]
             generated_image = service.sd_adapter.generate(image_embeds=img_feat, num_inference_steps=50)[0]
+            if force_polish:
+                service.sd_adapter = service.sd_adapter.cpu()
+                service.vae_pipe = service.vae_pipe.to(service.vit_sd_device, dtype=service.dtype)
+                init_image = generated_image.resize((1024, 1024))
+                prompt = ""
+                images = service.vae_pipe(prompt=prompt, image=init_image,
+                              num_inference_steps=50, guidance_scale=8.0, strength=0.38).images
+                generated_image = images[0]
+                image_base64 = encode_image(generated_image)
+                gen_imgs_base64_list.append(image_base64)
+                service.sd_adapter = service.sd_adapter.to(service.vit_sd_device, dtype=service.dtype)
+                service.vae_pipe = service.vae_pipe.cpu()
         # print('loading visual encoder and llm to GPU, and sd to CPU')
         # a = time.time()
     print(input_text + generated_text)
     return {'text': generated_text, 'images': gen_imgs_base64_list, 'error_msg': error_msg}
+def http_bot(dialog_state, input_state, max_new_tokens, max_turns, force_image_gen, force_bbox, force_polish,
              request: gr.Request):
     print('input_state:', input_state)
     force_boi = force_image_gen
     force_bbox = force_bbox
+    results = generate(text, images, max_new_tokens, force_boi, force_bbox, force_polish)
     print('response: ', {'text': results['text'], 'error_msg': results['error_msg']})
     output_state = init_input_state()
 * You can click "Force Image Generation" to compel the model to produce images when necessary. For example, our model might struggle to generate images when there is an excessive amount of text-only context.
 * You can click "Force Bounding Box" to compel the model to produce bounding box for object detection.
+* You can click "Force Polishing Generated Image" to compel the model to polish the generated image with image post-processing.
 * SEED-X was trained with English-only data. It may process with other languages due to the inherent capabilities from LLaMA, but might not stable.
                                           label="Max History Rounds")
                     force_img_gen = gr.Radio(choices=[True, False], value=False, label='Force Image Generation')
                     force_bbox = gr.Radio(choices=[True, False], value=False, label='Force Bounding Box')
+                    force_polish = gr.Radio(choices=[True, False], value=True, label='Force Polishing Generated Image')
             with gr.Column(scale=7):
                 chatbot = gr.Chatbot(elem_id='chatbot', label="SEED-X-I", height=700)
         downvote_btn.click(downvote_last_response, [dialog_state], [upvote_btn, downvote_btn])
         regenerate_btn.click(regenerate, [dialog_state], [dialog_state, chatbot] + btn_list).then(
+            http_bot, [dialog_state, input_state, max_new_tokens, max_turns, force_img_gen, force_bbox, force_polish],
             [dialog_state, input_state, chatbot] + btn_list)
         add_image_btn.click(add_image, [dialog_state, input_state, image],
                             [dialog_state, input_state, image, chatbot] + btn_list)
             add_text, [dialog_state, input_state, text],
             [dialog_state, input_state, text, chatbot, upvote_btn, downvote_btn, regenerate_btn, clear_btn]).then(
             http_bot,
+            [dialog_state, input_state, max_new_tokens, max_turns, force_img_gen, force_bbox, force_polish],
             [dialog_state, input_state, chatbot] + btn_list)
         clear_btn.click(clear_history, None, [dialog_state, input_state, chatbot] + btn_list)