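# Gradio app: classify the topic of an uploaded video.
# Pipeline: video -> audio (ffmpeg) -> transcript (Whisper) -> summary (BART)
# -> multi-label topic scores from a quantized DistilRoBERTa ONNX classifier.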
import gradio as gr
import onnxruntime
import torch
import subprocess
from transformers import AutoTokenizer, pipeline

# Tokenizer must match the checkpoint the ONNX classifier was exported from
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')

# Quantized ONNX export of the classifier; grab its input/output tensor names
inf_session = onnxruntime.InferenceSession('classifier1-quantized.onnx')
input_name = inf_session.get_inputs()[0].name
output_name = inf_session.get_outputs()[0].name

# Label set, in the order the classifier head emits its logits
classes = ['Art', 'Astrology', 'Biology', 'Chemistry', 'Economics', 'History', 'Literature', 'Philosophy', 'Physics', 'Politics', 'Psychology', 'Sociology']

### --- Audio/Video to text --- ###
device = "cuda:0" if torch.cuda.is_available() else "cpu"
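# chunk_length_s=30 enables chunked long-form transcription, so audio longer
# than Whisper's 30-second window is handled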
pipe = pipeline("automatic-speech-recognition", 
                    model="openai/whisper-base.en",
                    chunk_length_s=30, device=device)

### --- Text Summary --- ###
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
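# bart-large-cnn accepts at most 1024 input tokens, which is why summary()
# below splits the transcript into ~500-word chunks first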
     

def transcribe(audio_path):
    # Run Whisper ASR over the extracted audio and return the full transcript
    transcription = pipe(audio_path)["text"]
    return transcription

def summary(text):
    # Greedily pack sentences into ~500-word chunks so each chunk fits within
    # the summarizer's input limit, then summarize every chunk.
    sentences = text.split('.')
    max_chunk = 500
    current_chunk = 0
    chunks = []

    for sentence in sentences:
        words = sentence.split(' ')
        if len(chunks) == current_chunk + 1:
            if len(chunks[current_chunk]) + len(words) <= max_chunk:
                chunks[current_chunk].extend(words)
            else:
                current_chunk += 1
                chunks.append(words)
        else:
            chunks.append(words)

    # Re-join each word list into a string before summarization
    chunks = [' '.join(chunk) for chunk in chunks]

    summ = summarizer(chunks, max_length=100)
    return summ

def classify(vid):
    # Strip the extension (assumes a three-character one like .mp4) and
    # extract the audio track with ffmpeg (-y overwrites an existing .wav)
    filename = vid[:-4]
    subprocess.call(['ffmpeg', '-y', '-i', vid, f'{filename}.wav'])
    full_text = transcribe(f'{filename}.wav')
    # Join the per-chunk summaries into a single summary string
    summary_text = ' '.join(s['summary_text'] for s in summary(full_text))

    # Tokenize the summary and truncate to the classifier's 512-token limit
    input_ids = tokenizer(summary_text)['input_ids'][:512]
    logits = inf_session.run([output_name], {input_name: [input_ids]})[0]
    logits = torch.FloatTensor(logits)
    # Sigmoid per label: independent (multi-label) probabilities
    probs = torch.sigmoid(logits)[0]
    return full_text, summary_text, dict(zip(classes, map(float, probs)))

text1 = gr.Textbox(label="Text")
text2 =  gr.Textbox(label="Summary")

    
iface = gr.Interface(fn=classify,
                     inputs=gr.inputs.Video(source="upload", type="filepath"),
                     outputs = [text1,text2,gr.outputs.Label(num_top_classes=3)])
iface.launch(inline=False)
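
# inline=False only affects notebook embedding; run as a script and the app
# is served locally (http://127.0.0.1:7860 by default).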