added transcription

Files changed:
- app.py +10 -2
- get_transcripts_with_openai.py +66 -0
- load_vision_model_locally.py +2 -2
- requirements.txt +2 -1
- utils.py +8 -16
app.py
CHANGED
@@ -15,9 +15,17 @@ def answer_video_question(query : str, url : str, file : bytes) -> dict:
         temp_vid.write(file)
         temp_video_path = temp_vid.name
 
-
+
+
+        # Output frames Documents()
         all_frames_data = extract_nfps_frames(temp_video_path)
         langchain_documents = provide_video_RAG(all_frames_data)
+
+
+        langchain_transcripts = get_langchain_Document_for_rag(temp_video_path)
+
+
+        os.unlink(temp_video_path)  # clean up extracted file
         return {"status_vid_frame_from_file":all_frames_data}
 
     elif url:
@@ -44,4 +52,4 @@ demo = gr.Interface(
 
 # Launch the interface and MCP server
 if __name__ == "__main__":
-    demo.launch(mcp_server=True)
+    demo.launch(mcp_server=True, server_port=7776)
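Note: the hunks above add calls to get_langchain_Document_for_rag (defined in the new get_transcripts_with_openai.py) and os.unlink, but no matching import appears in the displayed diff. If app.py does not already import these elsewhere, it would need something like the following minimal sketch (assumed, not shown in the commit):

import os
from get_transcripts_with_openai import get_langchain_Document_for_rag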
get_transcripts_with_openai.py
ADDED
@@ -0,0 +1,66 @@
+import os
+import math
+import tempfile
+from pydub import AudioSegment
+from langchain.schema import Document
+from openai import OpenAI
+from moviepy.editor import VideoFileClip
+
+def get_langchain_Document_for_rag(video_path):
+    # Extract audio from video file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio_file:
+        video_clip = VideoFileClip(video_path)
+        video_clip.audio.write_audiofile(temp_audio_file.name, logger=None)
+        temp_audio_path = temp_audio_file.name
+        video_clip.close()
+
+    # Instantiate llm client
+    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+    # Load extracted audio
+    audio = AudioSegment.from_file(temp_audio_path)
+
+    # Chunk audio for translation
+    translations = []
+    chunk_duration_ms = 5 * 60 * 1000  # 5 minutes
+    num_chunks = math.ceil(len(audio) / chunk_duration_ms)
+    for i in range(num_chunks):
+        start = i * chunk_duration_ms
+        end = min((i + 1) * chunk_duration_ms, len(audio))
+        chunk = audio[start:end]
+
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as chunk_file:
+            chunk.export(chunk_file.name, format="mp3")
+            chunk_file.seek(0)
+            with open(chunk_file.name, "rb") as f:
+                translation = client.audio.translations.create(
+                    model="whisper-1",  # or use your preferred model
+                    file=f,
+                )
+
+            translations.append({
+                'chunk_id': i,
+                'start_time': start,
+                'end_time': end,
+                'transcript': translation.text,
+            })
+
+        os.unlink(chunk_file.name)  # clean up chunk file
+
+    os.unlink(temp_audio_path)  # clean up extracted audio file
+
+    # Create LangChain documents
+    langchain_documents = []
+    for data in translations:
+        content = f"Transcript: {data['transcript']}"
+        doc = Document(
+            page_content=content,
+            metadata={
+                "start_time": data['start_time'],
+                "end_time": data['end_time'],
+                "chunk_id": data['chunk_id']
+            }
+        )
+        langchain_documents.append(doc)
+
+    return langchain_documents
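A minimal usage sketch for the new helper, assuming OPENAI_API_KEY is set and a local video at the hypothetical path sample.mp4 (start_time/end_time in the metadata are chunk offsets in milliseconds, as stored by the chunking loop):

from get_transcripts_with_openai import get_langchain_Document_for_rag

docs = get_langchain_Document_for_rag("sample.mp4")  # hypothetical video path
for doc in docs:
    # each Document covers one <=5-minute audio chunk
    print(doc.metadata["chunk_id"], doc.metadata["start_time"], doc.page_content[:80])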
load_vision_model_locally.py
CHANGED
@@ -4,7 +4,7 @@ import io
 from PIL import Image
 import torch
 from transformers import BlipProcessor, BlipForConditionalGeneration, DetrImageProcessor, DetrForObjectDetection
-
+from collections import Counter
 
 
 class VideoAnalyzer:
@@ -46,7 +46,7 @@ class VideoAnalyzer:
                 object_name = self.detection_model.config.id2label[label.item()]
                 detected_objects.append(object_name)
 
-            return
+            return dict(Counter(detected_objects))
         except Exception as e:
             print(f"Error detecting objects: {e}")
             return []
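With this fix, detect_objects returns a label-to-count mapping instead of None. A quick illustration of the Counter idiom with made-up labels (note the except path still returns a list, so callers see different types on success and failure):

from collections import Counter

detected_objects = ["person", "person", "dog"]  # example labels, not real model output
print(dict(Counter(detected_objects)))          # {'person': 2, 'dog': 1}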
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ scikit-image
 langchain
 transformers
 torch
-pillow
+pillow
+timm
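timm is presumably added because the DETR object-detection models loaded in load_vision_model_locally.py (DetrForObjectDetection) require it as a backbone dependency.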
utils.py
CHANGED
@@ -126,7 +126,7 @@ def extract_keyframes(video_path, diff_threshold=0.4):
     return "success"
 
 
-def extract_nfps_frames(video_path, nfps=5,diff_threshold=0.4):
+def extract_nfps_frames(video_path, nfps=30,diff_threshold=0.4):
     """Extract 1 frame per second from a video.
     Args:
         video_path (str): Path to the input video file.
@@ -145,9 +145,13 @@ def extract_nfps_frames(video_path, nfps=5,diff_threshold=0.4):
     frame_id = 0
     saved_id = 0
     success, prev_frame = cap.read()
-
+    all_frames_data=[]
     while True:
         success, frame = cap.read()
+
+        from load_vision_model_locally import VideoAnalyzer
+        analyser = VideoAnalyzer()
+
         if not success:
             break
 
@@ -160,7 +164,8 @@ def extract_nfps_frames(video_path, nfps=5,diff_threshold=0.4):
         # append to a list that will constitute RAG Document
         timestamp_ms = cap.get(cv2.CAP_PROP_POS_MSEC)
         timestamp_sec = timestamp_ms / 1000.0
-        description
+        description = analyser.describe_frame(filename)
+        objects = analyser.detect_objects(filename)
         frame_data = {
             "frame_id": saved_id,
             "timestamp_sec": timestamp_sec,
@@ -175,19 +180,6 @@ def extract_nfps_frames(video_path, nfps=5,diff_threshold=0.4):
     print(f"Extracted {saved_id} frames (1 per second).")
     return all_frames_data
 
-def get_frame_infos(filename:str) -> dict:
-    from load_vision_model_locally import VideoAnalyser
-    analyser = VideoAnalyser()
-
-    description = analyser.describe_frame(filename)
-    objects = analyser.detect_objects(filename)
-
-    print("description",type(description),description)
-    print("detection",type(detection),detection)
-
-    return (descrition, objects)
-
-
 
 from langchain.docstore.document import Document
 
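The updated loop re-imports load_vision_model_locally and constructs a fresh VideoAnalyzer on every iteration, which reloads both the BLIP and DETR models once per frame. Below is a sketch of the same flow with the analyzer hoisted out of the loop; the every-nfps-th-frame sampling and the frame filenames are assumptions (the committed version also uses diff_threshold, elided here):

import cv2
from load_vision_model_locally import VideoAnalyzer

def extract_nfps_frames_hoisted(video_path, nfps=30):
    analyser = VideoAnalyzer()  # load BLIP and DETR once, not per frame
    cap = cv2.VideoCapture(video_path)
    all_frames_data = []
    frame_id = 0
    saved_id = 0
    while True:
        success, frame = cap.read()
        if not success:
            break
        if frame_id % nfps == 0:  # assumption: keep every nfps-th frame
            filename = f"frame_{saved_id}.jpg"
            cv2.imwrite(filename, frame)
            all_frames_data.append({
                "frame_id": saved_id,
                "timestamp_sec": cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0,
                "description": analyser.describe_frame(filename),
                "objects": analyser.detect_objects(filename),
            })
            saved_id += 1
        frame_id += 1
    cap.release()
    return all_frames_data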