# Source: Hugging Face Spaces - angeloqq/MARIEL_PROJECT
# Space status at capture time: Runtime error
# File size: 15,957 Bytes
# Revision: 806c931

from flask import Flask, request, jsonify, render_template, send_from_directory
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TFCLIPModel,
    CLIPProcessor,
    pipeline,
    BertTokenizer,
    BertForSequenceClassification
)
import cv2
import os
import subprocess
import torch
from PIL import Image
import numpy as np
import base64
import uuid
from ultralytics import YOLO
import tensorflow as tf
import logging

# Configure root logging at INFO level and create this module's logger,
# which the route handlers and helpers below use for status and error messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Flask application object; the @app.route decorators below register on it.
app = Flask(__name__)

# Make sure every working directory the app reads from or writes to exists
# before any request is handled; directories that already exist are left as-is.
for _working_dir in ('save', 'temp', 'unsafe_frames', 'audio', 'logs', 'text_output'):
    os.makedirs(_working_dir, exist_ok=True)
del _working_dir

# Load every model once at import time so the request handlers can reuse them.
# Progress is reported through the module logger (like the rest of this file)
# instead of print(). Any failure is logged with its traceback and re-raised,
# which aborts start-up: the handlers below cannot work without these objects.
logger.info("Loading models...")
try:
    # Object detector used by analyze_batch for the nudity score (local weights file).
    nudity_model = YOLO("Models/nudenet/320n.pt")

    # BERT tokenizer/model referenced by process_audio's transcript post-processing.
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

    # Text classifier used for the "profanity" score.
    profanity_model = AutoModelForSequenceClassification.from_pretrained("unitary/toxic-bert")
    profanity_tokenizer = AutoTokenizer.from_pretrained("unitary/toxic-bert")

    # Text classifier used for the "hate_speech" score.
    hate_speech_model = AutoModelForSequenceClassification.from_pretrained("Hate-speech-CNERG/dehatebert-mono-english")
    hate_speech_tokenizer = AutoTokenizer.from_pretrained("Hate-speech-CNERG/dehatebert-mono-english")

    # TensorFlow CLIP model + processor used by analyze_batch for the "harm" score.
    clip_model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    # Speech-to-text pipeline used by process_audio.
    whisper_model = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

    logger.info("All models loaded successfully")
except Exception as e:
    logger.exception(f"Error loading models: {str(e)}")
    raise

@app.route("/")
def home():
    """Serve the landing page by rendering the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page

@app.route("/extract_text", methods=["POST"])
def extract_text():
    """Transcribe a previously extracted audio file and analyze its text.

    Expects a form field ``audio_file`` holding the bare name of a file inside
    the ``audio`` directory.

    Returns:
        A JSON response. On success it contains ``success``, ``text``,
        ``text_file`` (name of the transcript saved under ``text_output``),
        ``confidence`` and ``analysis``. On failure it contains ``error`` with
        status 400 (missing or invalid name), 404 (file not found) or 500
        (transcription failure or unexpected exception).
    """
    try:
        audio_file = request.form.get('audio_file')
        if not audio_file:
            return jsonify({"error": "No audio file specified"}), 400

        # The name comes straight from the client. Accept only a bare file name
        # so that values such as "../app.py" or an absolute path cannot make us
        # read files outside the 'audio' directory.
        normalized_name = audio_file.replace('\\', '/')
        if os.path.basename(normalized_name) != audio_file or audio_file in ('.', '..'):
            return jsonify({"error": "Invalid audio file name"}), 400

        audio_path = os.path.join('audio', audio_file)
        if not os.path.exists(audio_path):
            return jsonify({"error": "Audio file not found"}), 404

        # Process audio and get text
        audio_result = process_audio(audio_path)

        if not audio_result['success']:
            return jsonify({"error": audio_result['error']}), 500

        # Save extracted text under a unique name so requests never overwrite
        # each other's transcripts.
        text_filename = f"text_{uuid.uuid4().hex}.txt"
        text_path = os.path.join('text_output', text_filename)

        with open(text_path, 'w', encoding='utf-8') as f:
            f.write(audio_result['text'])

        # Analyze text content
        text_analysis = analyze_text_content(audio_result['text'])

        return jsonify({
            "success": True,
            "text": audio_result['text'],
            "text_file": text_filename,
            "confidence": audio_result.get('confidence', 0),
            "analysis": text_analysis
        })

    except Exception as e:
        logger.exception(f"Error extracting text: {str(e)}")
        return jsonify({"error": str(e)}), 500

@app.route('/audio/<path:filename>')
def serve_audio(filename):
    """Send the requested file from the ``audio`` directory to the client."""
    audio_directory = 'audio'
    return send_from_directory(audio_directory, filename)

def _discard_file(path):
    """Delete ``path``; a file that is already gone is not treated as an error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def _dispose_batch_frames(batch_frames, batch_results):
    """Keep flagged frames in 'unsafe_frames' and delete all other temp images.

    ``batch_results`` is what analyze_batch returned for ``batch_frames`` (one
    result per frame, in the same order) or None when the analysis failed; in
    the latter case every frame of the batch is simply deleted so that nothing
    is left behind in the temp directory.
    """
    for index, frame_data in enumerate(batch_frames):
        verdict = None
        if batch_results and index < len(batch_results):
            verdict = batch_results[index]

        # The flags live in the analysis result, not in the frame dict.
        flagged = verdict is not None and (
            verdict['nudity']['is_inappropriate'] or verdict['harm']['is_harmful']
        )

        if flagged:
            # NOTE(review): the extracted frame appears to be a JPEG file but is
            # stored with a .png name, as before - confirm whether any consumer
            # depends on the extension before changing it.
            unique_filename = f'unsafe_{uuid.uuid4().hex}.png'
            unsafe_frame_path = os.path.join('unsafe_frames', unique_filename)
            os.rename(frame_data['frame'], unsafe_frame_path)
        else:
            _discard_file(frame_data['frame'])
        _discard_file(frame_data['thumbnail'])


@app.route("/upload", methods=["POST"])
def upload_file():
    """Analyze an uploaded video and return per-frame and overall assessments.

    Expects a multipart upload in the ``file`` field. The video is stored under
    ``save``, sampled into frames, its audio track is transcribed, and frames
    are analyzed in batches together with the transcript. Temporary frames and
    the uploaded video are removed afterwards; frames flagged as inappropriate
    or harmful are moved to ``unsafe_frames``.

    Returns:
        A JSON response with ``success``, ``results``, ``audio_path``,
        ``audio_text``, ``text_file``, ``text_analysis`` and
        ``overall_assessment``; or ``{"error": ...}`` with status 400 for a
        missing/invalid upload and 500 for any processing failure.
    """
    try:
        if 'file' not in request.files:
            return jsonify({"error": "No file uploaded"}), 400

        video = request.files['file']
        if video.filename == '':
            return jsonify({"error": "No file selected"}), 400

        # The file name is client-controlled: strip any directory part so the
        # upload cannot be written outside 'save', and add a unique prefix so
        # two uploads with the same name do not overwrite each other.
        safe_name = os.path.basename(video.filename.replace('\\', '/'))
        if safe_name in ('', '.', '..'):
            return jsonify({"error": "Invalid file name"}), 400

        video_path = os.path.join('save', f"{uuid.uuid4().hex}_{safe_name}")
        video.save(video_path)

        frames = []
        try:
            frames = extract_frames(video_path)
            results = []

            audio_filename = f"audio_{uuid.uuid4().hex}.wav"
            audio_path = os.path.join('audio', audio_filename)

            text_content = ''
            text_filename = None
            text_analysis = None

            if extract_audio(video_path, audio_path):
                audio_text = process_audio(audio_path)
                # On failure process_audio puts an error message in 'text';
                # only a successful transcription counts as transcript.
                if audio_text.get('success'):
                    text_content = audio_text.get('text', '')

                # Save extracted text
                if text_content:
                    text_filename = f"text_{uuid.uuid4().hex}.txt"
                    text_path = os.path.join('text_output', text_filename)

                    with open(text_path, 'w', encoding='utf-8') as f:
                        f.write(text_content)

                    text_analysis = analyze_text_content(text_content)

            batch_size = 15
            for start in range(0, len(frames), batch_size):
                batch_frames = frames[start:start + batch_size]
                batch_results = analyze_batch(batch_frames, text_content)

                if batch_results:
                    results.extend(batch_results)

                # Clean up even when the batch analysis failed, otherwise its
                # frames would accumulate in the temp directory.
                _dispose_batch_frames(batch_frames, batch_results)

            _discard_file(video_path)

            if results:
                total_meta_score = sum(r['meta_standards']['score'] for r in results) / len(results)
                overall_assessment = {
                    "total_score": total_meta_score,
                    "risk_level": "High" if total_meta_score > 35 else "Medium" if total_meta_score > 30 else "Low",
                    "recommendation": get_recommendation(total_meta_score)
                }
            else:
                overall_assessment = {
                    "total_score": 0,
                    "risk_level": "Low",
                    "recommendation": "No issues detected"
                }

            return jsonify({
                "success": True,
                "results": results,
                "audio_path": audio_filename,
                "audio_text": text_content,
                "text_file": text_filename,
                "text_analysis": text_analysis,
                "overall_assessment": overall_assessment
            })

        except Exception as e:
            # Best-effort cleanup of whatever is still on disk for this upload.
            for frame_data in frames:
                _discard_file(frame_data['frame'])
                _discard_file(frame_data['thumbnail'])
            _discard_file(video_path)
            logger.exception(f"Error in content analysis: {str(e)}")
            return jsonify({"error": str(e)}), 500

    except Exception as e:
        logger.exception(f"Error in upload: {str(e)}")
        return jsonify({"error": str(e)}), 500

def extract_frames(video_path, fallback_fps=30):
    """Sample roughly one frame per second of video and write it to ``temp``.

    For every sampled frame two JPEG files are written: the frame itself and a
    648x648 thumbnail. File names carry a per-call unique id so that two videos
    processed at the same time cannot overwrite each other's frames.

    Args:
        video_path: path of the video file to read.
        fallback_fps: sampling interval (in frames) used when the container
            does not report a usable frame rate.

    Returns:
        A list of dicts with the keys 'frame' (path), 'thumbnail' (path) and
        'timestamp' (whole seconds from the start, based on the frame rate used).

    Raises:
        Exception: if the video file cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise Exception("Error opening video file")

    run_id = uuid.uuid4().hex
    frames = []
    frame_count = 0

    try:
        # Some files report 0 (or NaN) frames per second; using that value
        # directly would make the modulo below divide by zero.
        reported_fps = cap.get(cv2.CAP_PROP_FPS)
        fps = int(reported_fps) if reported_fps >= 1 else int(fallback_fps)
        fps = max(fps, 1)

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % fps == 0:
                frame_path = os.path.join('temp', f'frame_{run_id}_{frame_count}.jpg')
                thumbnail_path = os.path.join('temp', f'thumb_{run_id}_{frame_count}.jpg')

                cv2.imwrite(frame_path, frame)
                thumbnail = cv2.resize(frame, (648, 648))
                cv2.imwrite(thumbnail_path, thumbnail)

                frames.append({
                    'frame': frame_path,
                    'thumbnail': thumbnail_path,
                    'timestamp': frame_count // fps
                })
            frame_count += 1
    finally:
        # Release the capture even if reading or writing a frame raised.
        cap.release()

    return frames

def extract_audio(video_path, output_path):
    """Extract the audio track of a video into a 16 kHz mono PCM WAV file.

    Runs ``ffmpeg`` as a subprocess (argument list, no shell).

    Args:
        video_path: path of the input video.
        output_path: path of the WAV file to create (overwritten if present).

    Returns:
        ``output_path`` when a non-empty file was produced, otherwise None.
        This function never raises; every failure is logged and reported as None.
    """
    command = [
        'ffmpeg',
        '-i', video_path,
        '-vn',
        '-acodec', 'pcm_s16le',
        '-ar', '16000',
        '-ac', '1',
        '-y',
        output_path
    ]

    try:
        subprocess.run(
            command,
            check=True,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE
        )

        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            logger.info(f"Audio extracted successfully: {output_path}")
            return output_path

        raise Exception("Audio extraction failed - empty or missing file")

    except subprocess.CalledProcessError as e:
        # ffmpeg explains the failure on stderr (e.g. no audio stream); log the
        # tail of it instead of throwing the captured output away.
        ffmpeg_stderr = (e.stderr or b'').decode('utf-8', errors='replace').strip()
        logger.error(f"Audio extraction error: {str(e)}; ffmpeg stderr: {ffmpeg_stderr[-1000:]}")
    except Exception as e:
        logger.error(f"Audio extraction error: {str(e)}")

    return None

def process_audio(audio_path):
    """Transcribe an audio file with Whisper and normalize the transcript.

    Args:
        audio_path: path of the audio file to transcribe.

    Returns:
        A dict. On success: ``{'success': True, 'text': <transcript>,
        'confidence': <value>}``. On failure: ``{'success': False,
        'text': <message>, 'error': <reason>}``. This function never raises.
    """
    try:
        if not os.path.exists(audio_path):
            logger.error(f"Audio file not found: {audio_path}")
            return {
                'success': False,
                'text': "Audio file not found",
                'error': "File not found"
            }

        logger.info(f"Processing audio file: {audio_path}")

        # First pass with Whisper. Chunking lets the pipeline handle recordings
        # longer than Whisper's 30-second input window.
        whisper_result = whisper_model(audio_path, chunk_length_s=30)

        logger.info(f"Whisper result: {whisper_result}")

        if not whisper_result.get('text'):
            logger.error("Whisper failed to extract text")
            return {
                'success': False,
                'text': "Whisper failed to extract text",
                'error': "No text found in Whisper output"
            }

        text = whisper_result['text']

        # Second pass: round-trip the transcript through the BERT tokenizer.
        # The text is split on word boundaries into pieces of at most ~512
        # characters; cutting at a fixed character offset would break a word in
        # two and leave a stray space inside it after the pieces are joined.
        # NOTE(review): this pass only normalizes the text via tokenizer
        # encode/decode; the BERT model's output was never used, so the forward
        # pass has been dropped. Confirm the normalization itself is wanted.
        chunks = []
        current_chunk = ""
        for word in text.split():
            candidate = f"{current_chunk} {word}" if current_chunk else word
            if current_chunk and len(candidate) > 512:
                chunks.append(current_chunk)
                current_chunk = word
            else:
                current_chunk = candidate
        if current_chunk:
            chunks.append(current_chunk)

        processed_chunks = []
        for chunk in chunks:
            input_ids = bert_tokenizer(chunk, truncation=True, max_length=512)['input_ids']
            processed_chunks.append(
                bert_tokenizer.decode(input_ids, skip_special_tokens=True)
            )

        final_text = " ".join(processed_chunks)

        return {
            'success': True,
            'text': final_text,
            # Falls back to 0 when the pipeline output has no 'confidence' entry.
            'confidence': whisper_result.get('confidence', 0)
        }

    except Exception as e:
        logger.error(f"Audio processing error: {str(e)}")
        return {
            'success': False,
            'text': "Audio processing failed",
            'error': str(e)
        }

def analyze_text_content(text):
    """Score a text for profanity and hate speech.

    Each classifier's logits are turned into probabilities with a softmax and
    the probability at label index 1 is reported.

    Returns:
        A dict with 'profanity' (score in percent, 'is_offensive') and
        'hate_speech' (score in percent, 'is_hateful'), where the boolean is
        true when the probability exceeds 0.5; or None if anything raised.
    """
    def label_one_probability(tokenizer, model):
        # NOTE(review): assumes label index 1 is the positive class of the
        # given model - verify against each model's label mapping.
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            logits = model(**encoded).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
        return float(probabilities[0][1])

    try:
        profanity_probability = label_one_probability(profanity_tokenizer, profanity_model)
        hate_probability = label_one_probability(hate_speech_tokenizer, hate_speech_model)

        profanity_report = {
            "score": profanity_probability * 100,
            "is_offensive": profanity_probability > 0.5
        }
        hate_report = {
            "score": hate_probability * 100,
            "is_hateful": hate_probability > 0.5
        }
        return {"profanity": profanity_report, "hate_speech": hate_report}
    except Exception as e:
        logger.error(f"Error analyzing text: {str(e)}")
        return None

def analyze_batch(batch_frames, text):
    """Score a batch of extracted frames, together with the transcript.

    Args:
        batch_frames: list of dicts as produced by extract_frames; each must
            provide the 'frame' and 'thumbnail' image paths and a 'timestamp'.
        text: transcript of the video's audio; may be empty.

    Returns:
        A list with one dict per frame holding the 'nudity', 'profanity',
        'hate_speech', 'harm' and 'meta_standards' assessments, the base64
        encoded 'thumbnail' and the 'timestamp'. An empty batch yields an
        empty list. Returns None if any step of the analysis raised.
    """
    try:
        if not batch_frames:
            return []

        # Load the frames. The context manager closes each file right away so
        # the caller can move or delete it afterwards; converting to RGB
        # guarantees three channels for the tensor layout below.
        images = []
        for frame_data in batch_frames:
            with Image.open(frame_data['frame']) as source_image:
                images.append(source_image.convert('RGB').resize((128, 128)))

        # Prepare image data: scale to [0, 1] and reorder to channels-first.
        image_arrays = np.array([np.array(img) / 255.0 for img in images])
        image_tensors = torch.tensor(image_arrays).permute(0, 3, 1, 2).float()

        # Run analyses
        with torch.no_grad():
            nudity_results = nudity_model(image_tensors)
            nudity_predictions = [result.boxes for result in nudity_results]

        if text:
            profanity_inputs = profanity_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
            with torch.no_grad():
                profanity_outputs = profanity_model(**profanity_inputs)
                profanity_scores = torch.nn.functional.softmax(profanity_outputs.logits, dim=-1)

            hate_speech_inputs = hate_speech_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
            with torch.no_grad():
                hate_speech_outputs = hate_speech_model(**hate_speech_inputs)
                hate_speech_scores = torch.nn.functional.softmax(hate_speech_outputs.logits, dim=-1)
        else:
            profanity_scores = torch.zeros(1, 2)
            hate_speech_scores = torch.zeros(1, 2)

        # The text scores are the same for every frame of the batch.
        # NOTE(review): assumes label index 1 is the positive class of both
        # text classifiers - verify against each model's label mapping.
        profanity_score = float(profanity_scores[0][1]) * 99
        hate_speech_score = float(hate_speech_scores[0][1]) * 99

        # truncation=True keeps a long transcript within CLIP's maximum text
        # length; without it the model call fails and the whole batch is lost.
        clip_inputs = clip_processor(
            text=[text if text else ""] * len(images),
            images=images,
            return_tensors="tf",
            padding=True,
            truncation=True
        )
        clip_outputs = clip_model(**clip_inputs)
        # NOTE(review): every text entry passed to CLIP is the same string, so
        # this softmax presumably compares identical prompts - confirm that the
        # resulting "harm" score is what is intended.
        clip_scores = tf.nn.softmax(clip_outputs.logits_per_image, axis=-1).numpy()

        results = []
        for frame_data, boxes, clip_row in zip(batch_frames, nudity_predictions, clip_scores):
            with open(frame_data['thumbnail'], 'rb') as img_file:
                thumbnail_base64 = base64.b64encode(img_file.read()).decode('utf-8')

            # Confidence of the first detection, or 0 when nothing was detected.
            nudity_score = float(boxes.conf[0]) * 99 if boxes else 0.0
            harm_score = float(clip_row[1]) * 99 if clip_row.size > 1 else 0.0

            # Weighted combination of the individual scores (weights as given).
            meta_standards_score = (
                (nudity_score * 0.6) +
                (hate_speech_score * 0.1) +
                (harm_score * 0.1) +
                (profanity_score * 0.1)
            )

            results.append({
                "nudity": {
                    "score": nudity_score,
                    "is_inappropriate": nudity_score > 65
                },
                "profanity": {
                    "score": profanity_score,
                    "is_offensive": profanity_score > 65
                },
                "hate_speech": {
                    "score": hate_speech_score,
                    "is_hateful": hate_speech_score > 40
                },
                "harm": {
                    "score": harm_score,
                    "is_harmful": harm_score > 40
                },
                "meta_standards": {
                    "score": meta_standards_score,
                    "is_violating": meta_standards_score > 30,
                    "risk_level": "High" if meta_standards_score > 60 else "Medium" if meta_standards_score > 25 else "Low",
                    "recommendation": get_recommendation(meta_standards_score)
                },
                "thumbnail": thumbnail_base64,
                "timestamp": frame_data['timestamp']
            })

        return results
    except Exception as e:
        logger.error(f"Error in batch analysis: {str(e)}")
        return None

def get_recommendation(score):
    """Return the advisory text for a score.

    Scores above 70 get the "likely violates" message, scores above 30 the
    "may need modifications" message, and everything else the "likely
    complies" message.
    """
    # Ordered from the highest threshold down; the first one exceeded wins.
    tiers = (
        (70, "Content likely violates Meta Community Standards. Major modifications needed."),
        (30, "Content may need modifications to comply with Meta Community Standards."),
    )
    for threshold, message in tiers:
        if score > threshold:
            return message
    return "Content likely complies with Meta Community Standards."

if __name__ == "__main__":
    # Flask's debug mode enables the interactive debugger, which lets anyone
    # who can reach the server run arbitrary code. The server listens on all
    # interfaces, so debug mode is off unless explicitly requested with
    # FLASK_DEBUG=1 (or true/yes/on) for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes", "on")
    app.run(host="0.0.0.0", port=5000, debug=debug_enabled)