RCaz committed
Commit 7e4124b · 1 Parent(s): 6a052d6

added transcription

app.py CHANGED
@@ -15,9 +15,17 @@ def answer_video_question(query : str, url : str, file : bytes) -> dict:
         temp_vid.write(file)
         temp_video_path = temp_vid.name
 
-        # Output frame folder
+
+
+        # Output frames Documents()
         all_frames_data = extract_nfps_frames(temp_video_path)
         langchain_documents = provide_video_RAG(all_frames_data)
+
+
+        langchain_transcripts = get_langchain_Document_for_rag(temp_video_path)
+
+
+        os.unlink(temp_video_path)  # clean up extracted file
         return {"status_vid_frame_from_file":all_frames_data}
 
     elif url:
@@ -44,4 +52,4 @@ demo = gr.Interface(
 
 # Launch the interface and MCP server
 if __name__ == "__main__":
-    demo.launch(mcp_server=True)
+    demo.launch(mcp_server=True, server_port=7776)
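After this change answer_video_question builds both frame Documents (langchain_documents) and transcript Documents (langchain_transcripts), though it still returns only the frame data. A minimal sketch of how the two corpora could be searched together; naive_retrieve is a hypothetical helper (not part of this commit) that only counts query keywords in page_content rather than doing embedding-based retrieval:

# Hypothetical helper: rank LangChain Documents by keyword overlap with the query,
# so frame descriptions and transcript chunks can be queried as one corpus.
def naive_retrieve(query: str, docs: list, k: int = 3) -> list:
    terms = query.lower().split()
    scored = [
        (sum(doc.page_content.lower().count(t) for t in terms), doc)
        for doc in docs
    ]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [doc for score, doc in scored[:k] if score > 0]

# usage inside answer_video_question, before the return:
# hits = naive_retrieve(query, langchain_documents + langchain_transcripts)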
get_transcripts_with_openai.py ADDED
@@ -0,0 +1,66 @@
+import os
+import math
+import tempfile
+from pydub import AudioSegment
+from langchain.schema import Document
+from openai import OpenAI
+import moviepy
+
+def get_langchain_Document_for_rag(video_path):
+    # Extract audio from video file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio_file:
+        video_clip = moviepy.editor.VideoFileClip(video_path)
+        video_clip.audio.write_audiofile(temp_audio_file.name, logger=None)
+        temp_audio_path = temp_audio_file.name
+        video_clip.close()
+
+    # Instantiate llm client
+    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+    # Load extracted audio
+    audio = AudioSegment.from_file(temp_audio_path)
+
+    # Chunk audio for translation
+    translations = []
+    chunk_duration_ms = 5 * 60 * 1000  # 5 minutes
+    num_chunks = math.ceil(len(audio) / chunk_duration_ms)
+    for i in range(num_chunks):
+        start = i * chunk_duration_ms
+        end = min((i + 1) * chunk_duration_ms, len(audio))
+        chunk = audio[start:end]
+
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as chunk_file:
+            chunk.export(chunk_file.name, format="mp3")
+            chunk_file.seek(0)
+            with open(chunk_file.name, "rb") as f:
+                translation = client.audio.translations.create(
+                    model="whisper-1",  # or use your preferred model
+                    file=f,
+                )
+
+            translations.append({
+                'chunk_id': i,
+                'start_time': start,
+                'end_time': end,
+                'transcript': translation.text,
+            })
+
+        os.unlink(chunk_file.name)  # clean up chunk file
+
+    os.unlink(temp_audio_path)  # clean up extracted audio file
+
+    # Create LangChain documents
+    langchain_documents = []
+    for data in translations:
+        content = f"Transcript: {data['transcript']}"
+        doc = Document(
+            page_content=content,
+            metadata={
+                "start_time": data['start_time'],
+                "end_time": data['end_time'],
+                "chunk_id": data['chunk_id']
+            }
+        )
+        langchain_documents.append(doc)
+
+    return langchain_documents
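A usage sketch for the new helper. It assumes OPENAI_API_KEY is set, ffmpeg is available for moviepy/pydub, and that moviepy.editor resolves with a bare "import moviepy" (on some moviepy versions "from moviepy.editor import VideoFileClip" is needed instead); the video path is made up. The 5-minute chunking keeps each upload below the OpenAI audio endpoint's 25 MB file limit.

# Hypothetical usage; metadata start/end times are in milliseconds (pydub units).
from get_transcripts_with_openai import get_langchain_Document_for_rag

docs = get_langchain_Document_for_rag("sample_video.mp4")   # made-up path
for d in docs:
    start_s = d.metadata["start_time"] / 1000
    end_s = d.metadata["end_time"] / 1000
    print(f"chunk {d.metadata['chunk_id']}: {start_s:.0f}s-{end_s:.0f}s")
    print(d.page_content[:120])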
load_vision_model_locally.py CHANGED
@@ -4,7 +4,7 @@ import io
 from PIL import Image
 import torch
 from transformers import BlipProcessor, BlipForConditionalGeneration, DetrImageProcessor, DetrForObjectDetection
-
+from collections import Counter
 
 
 class VideoAnalyzer:
@@ -46,7 +46,7 @@ class VideoAnalyzer:
                 object_name = self.detection_model.config.id2label[label.item()]
                 detected_objects.append(object_name)
 
-            return list(set(detected_objects))
+            return dict(Counter(detected_objects))
         except Exception as e:
             print(f"Error detecting objects: {e}")
             return []
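detect_objects now returns per-label counts instead of a de-duplicated list. An illustrative example with made-up labels:

from collections import Counter

detected_objects = ["car", "person", "car", "dog"]   # made-up example labels

# before this commit: unique labels only, order not guaranteed
print(list(set(detected_objects)))       # e.g. ['dog', 'person', 'car']

# after this commit: each label with its detection count
print(dict(Counter(detected_objects)))   # {'car': 2, 'person': 1, 'dog': 1}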
requirements.txt CHANGED
@@ -6,4 +6,5 @@ scikit-image
 langchain
 transformers
 torch
-pillow
+pillow
+timm
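timm is presumably added because the DETR object-detection model loaded in load_vision_model_locally.py needs it as a backbone dependency. A quick smoke test, assuming the commonly used facebook/detr-resnet-50 checkpoint (the repo's actual checkpoint is not shown in this diff); without timm installed, transformers raises an error asking for it:

from transformers import DetrForObjectDetection

# Assumed checkpoint; loading it exercises the timm-backed backbone.
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
print(type(model).__name__)   # DetrForObjectDetection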
utils.py CHANGED
@@ -126,7 +126,7 @@ def extract_keyframes(video_path, diff_threshold=0.4):
     return "success"
 
 
-def extract_nfps_frames(video_path, nfps=5,diff_threshold=0.4):
+def extract_nfps_frames(video_path, nfps=30,diff_threshold=0.4):
     """Extract 1 frame per second from a video.
     Args:
         video_path (str): Path to the input video file.
@@ -145,9 +145,13 @@ def extract_nfps_frames(video_path, nfps=5,diff_threshold=0.4):
     frame_id = 0
     saved_id = 0
     success, prev_frame = cap.read()
-
+    all_frames_data=[]
     while True:
         success, frame = cap.read()
+
+        from load_vision_model_locally import VideoAnalyzer
+        analyser = VideoAnalyzer()
+
         if not success:
             break
 
@@ -160,7 +164,8 @@ def extract_nfps_frames(video_path, nfps=5,diff_threshold=0.4):
             # append to a list that will constitute RAG Docuement
             timestamp_ms = cap.get(cv2.CAP_PROP_POS_MSEC)
             timestamp_sec = timestamp_ms / 1000.0
-            description, objects = get_frame_infos(filename)
+            description = analyser.describe_frame(filename)
+            objects = analyser.detect_objects(filename)
             frame_data = {
                 "frame_id": saved_id,
                 "timestamp_sec": timestamp_sec,
@@ -175,19 +180,6 @@ def extract_nfps_frames(video_path, nfps=5,diff_threshold=0.4):
     print(f"Extracted {saved_id} frames (1 per second).")
     return all_frames_data
 
-def get_frame_infos(filename:str) -> dict:
-    from load_vision_model_locally import VideoAnalyser
-    analyser = VideoAnalyser()
-
-    description = analyser.describe_frame(filename)
-    objects = analyser.detect_objects(filename)
-
-    print("description",type(description),description)
-    print("detection",type(detection),detection)
-
-    return (descrition, objects)
-
-
 
 from langchain.docstore.document import Document
 
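Note that the new extract_nfps_frames instantiates VideoAnalyzer inside the while loop, so the BLIP and DETR weights would be reloaded on every frame. A possible refactor sketch, not part of this commit and assuming VideoAnalyzer loads its models in __init__: build the analyzer once and reuse it for each saved frame (the function name and frame paths below are hypothetical).

from load_vision_model_locally import VideoAnalyzer

def annotate_frames(frame_paths):
    analyser = VideoAnalyzer()          # models load once, not once per frame
    annotated = []
    for path in frame_paths:
        annotated.append({
            "filename": path,
            "description": analyser.describe_frame(path),
            "objects": analyser.detect_objects(path),   # now a {label: count} dict
        })
    return annotated

# usage (frame filenames are made up):
# annotate_frames(["frame_0001.jpg", "frame_0002.jpg"])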