RCaz committed
Commit 7e4124b · 1 Parent(s): 6a052d6

added transcription

app.py CHANGED
@@ -15,9 +15,17 @@ def answer_video_question(query : str, url : str, file : bytes) -> dict:
         temp_vid.write(file)
         temp_video_path = temp_vid.name
 
-        # Output frame folder
+
+
+        # Output frames Documents()
         all_frames_data = extract_nfps_frames(temp_video_path)
         langchain_documents = provide_video_RAG(all_frames_data)
+
+
+        langchain_transcripts = get_langchain_Document_for_rag(temp_video_path)
+
+
+        os.unlink(temp_video_path)  # clean up extracted file
         return {"status_vid_frame_from_file":all_frames_data}
 
     elif url:
@@ -44,4 +52,4 @@ demo = gr.Interface(
 
 # Launch the interface and MCP server
 if __name__ == "__main__":
-    demo.launch(mcp_server=True)
+    demo.launch(mcp_server=True, server_port=7776)
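After this change answer_video_question builds both frame Documents (langchain_documents) and transcript Documents (langchain_transcripts), though it still returns only the frame data. A minimal sketch of how the two corpora could be searched together; naive_retrieve is a hypothetical helper (not part of this commit) that only counts query keywords in page_content rather than doing embedding-based retrieval:

# Hypothetical helper: rank LangChain Documents by keyword overlap with the query,
# so frame descriptions and transcript chunks can be queried as one corpus.
def naive_retrieve(query: str, docs: list, k: int = 3) -> list:
    terms = query.lower().split()
    scored = [
        (sum(doc.page_content.lower().count(t) for t in terms), doc)
        for doc in docs
    ]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [doc for score, doc in scored[:k] if score > 0]

# usage inside answer_video_question, before the return:
# hits = naive_retrieve(query, langchain_documents + langchain_transcripts)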
get_transcripts_with_openai.py ADDED
@@ -0,0 +1,66 @@
+import os
+import math
+import tempfile
+from pydub import AudioSegment
+from langchain.schema import Document
+from openai import OpenAI
+import moviepy
+
+def get_langchain_Document_for_rag(video_path):
+    # Extract audio from video file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio_file:
+        video_clip = moviepy.editor.VideoFileClip(video_path)
+        video_clip.audio.write_audiofile(temp_audio_file.name, logger=None)
+        temp_audio_path = temp_audio_file.name
+        video_clip.close()
+
+    # Instantiate llm client
+    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+    # Load extracted audio
+    audio = AudioSegment.from_file(temp_audio_path)
+
+    # Chunk audio for translation
+    translations = []
+    chunk_duration_ms = 5 * 60 * 1000  # 5 minutes
+    num_chunks = math.ceil(len(audio) / chunk_duration_ms)
+    for i in range(num_chunks):
+        start = i * chunk_duration_ms
+        end = min((i + 1) * chunk_duration_ms, len(audio))
+        chunk = audio[start:end]
+
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as chunk_file:
+            chunk.export(chunk_file.name, format="mp3")
+            chunk_file.seek(0)
+            with open(chunk_file.name, "rb") as f:
+                translation = client.audio.translations.create(
+                    model="whisper-1",  # or use your preferred model
+                    file=f,
+                )
+
+            translations.append({
+                'chunk_id': i,
+                'start_time': start,
+                'end_time': end,
+                'transcript': translation.text,
+            })
+
+        os.unlink(chunk_file.name)  # clean up chunk file
+
+    os.unlink(temp_audio_path)  # clean up extracted audio file
+
+    # Create LangChain documents
+    langchain_documents = []
+    for data in translations:
+        content = f"Transcript: {data['transcript']}"
+        doc = Document(
+            page_content=content,
+            metadata={
+                "start_time": data['start_time'],
+                "end_time": data['end_time'],
+                "chunk_id": data['chunk_id']
+            }
+        )
+        langchain_documents.append(doc)
+
+    return langchain_documents
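A usage sketch for the new helper. It assumes OPENAI_API_KEY is set, ffmpeg is available for moviepy/pydub, and that moviepy.editor resolves with a bare "import moviepy" (on some moviepy versions "from moviepy.editor import VideoFileClip" is needed instead); the video path is made up. The 5-minute chunking keeps each upload below the OpenAI audio endpoint's 25 MB file limit.

# Hypothetical usage; metadata start/end times are in milliseconds (pydub units).
from get_transcripts_with_openai import get_langchain_Document_for_rag

docs = get_langchain_Document_for_rag("sample_video.mp4")   # made-up path
for d in docs:
    start_s = d.metadata["start_time"] / 1000
    end_s = d.metadata["end_time"] / 1000
    print(f"chunk {d.metadata['chunk_id']}: {start_s:.0f}s-{end_s:.0f}s")
    print(d.page_content[:120])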
load_vision_model_locally.py CHANGED
@@ -4,7 +4,7 @@ import io
 from PIL import Image
 import torch
 from transformers import BlipProcessor, BlipForConditionalGeneration, DetrImageProcessor, DetrForObjectDetection
-
+from collections import Counter
 
 
 class VideoAnalyzer:
@@ -46,7 +46,7 @@ class VideoAnalyzer:
                 object_name = self.detection_model.config.id2label[label.item()]
                 detected_objects.append(object_name)
 
-            return list(set(detected_objects))
+            return dict(Counter(detected_objects))
         except Exception as e:
             print(f"Error detecting objects: {e}")
             return []
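detect_objects now returns per-label counts instead of a de-duplicated list. An illustrative example with made-up labels:

from collections import Counter

detected_objects = ["car", "person", "car", "dog"]   # made-up example labels

# before this commit: unique labels only, order not guaranteed
print(list(set(detected_objects)))       # e.g. ['dog', 'person', 'car']

# after this commit: each label with its detection count
print(dict(Counter(detected_objects)))   # {'car': 2, 'person': 1, 'dog': 1}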
requirements.txt CHANGED
@@ -6,4 +6,5 @@ scikit-image
 langchain
 transformers
 torch
-pillow
+pillow
+timm
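timm is presumably added because the DETR object-detection model loaded in load_vision_model_locally.py needs it as a backbone dependency. A quick smoke test, assuming the commonly used facebook/detr-resnet-50 checkpoint (the repo's actual checkpoint is not shown in this diff); without timm installed, transformers raises an error asking for it:

from transformers import DetrForObjectDetection

# Assumed checkpoint; loading it exercises the timm-backed backbone.
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
print(type(model).__name__)   # DetrForObjectDetection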
utils.py CHANGED
@@ -126,7 +126,7 @@ def extract_keyframes(video_path, diff_threshold=0.4):
     return "success"
 
 
-def extract_nfps_frames(video_path, nfps=5,diff_threshold=0.4):
+def extract_nfps_frames(video_path, nfps=30,diff_threshold=0.4):
     """Extract 1 frame per second from a video.
     Args:
         video_path (str): Path to the input video file.
@@ -145,9 +145,13 @@ def extract_nfps_frames(video_path, nfps=5,diff_threshold=0.4):
     frame_id = 0
     saved_id = 0
     success, prev_frame = cap.read()
-
+    all_frames_data=[]
     while True:
         success, frame = cap.read()
+
+        from load_vision_model_locally import VideoAnalyzer
+        analyser = VideoAnalyzer()
+
         if not success:
             break
 
@@ -160,7 +164,8 @@ def extract_nfps_frames(video_path, nfps=5,diff_threshold=0.4):
             # append to a list that will constitute RAG Docuement
             timestamp_ms = cap.get(cv2.CAP_PROP_POS_MSEC)
             timestamp_sec = timestamp_ms / 1000.0
-            description, objects = get_frame_infos(filename)
+            description = analyser.describe_frame(filename)
+            objects = analyser.detect_objects(filename)
             frame_data = {
                 "frame_id": saved_id,
                 "timestamp_sec": timestamp_sec,
@@ -175,19 +180,6 @@ def extract_nfps_frames(video_path, nfps=5,diff_threshold=0.4):
     print(f"Extracted {saved_id} frames (1 per second).")
     return all_frames_data
 
-def get_frame_infos(filename:str) -> dict:
-    from load_vision_model_locally import VideoAnalyser
-    analyser = VideoAnalyser()
-
-    description = analyser.describe_frame(filename)
-    objects = analyser.detect_objects(filename)
-
-    print("description",type(description),description)
-    print("detection",type(detection),detection)
-
-    return (descrition, objects)
-
-
 
 from langchain.docstore.document import Document
 
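Note that the new extract_nfps_frames instantiates VideoAnalyzer inside the while loop, so the BLIP and DETR weights would be reloaded on every frame. A possible refactor sketch, not part of this commit and assuming VideoAnalyzer loads its models in __init__: build the analyzer once and reuse it for each saved frame (the function name and frame paths below are hypothetical).

from load_vision_model_locally import VideoAnalyzer

def annotate_frames(frame_paths):
    analyser = VideoAnalyzer()          # models load once, not once per frame
    annotated = []
    for path in frame_paths:
        annotated.append({
            "filename": path,
            "description": analyser.describe_frame(path),
            "objects": analyser.detect_objects(path),   # now a {label: count} dict
        })
    return annotated

# usage (frame filenames are made up):
# annotate_frames(["frame_0001.jpg", "frame_0002.jpg"])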