from byaldi import RAGMultiModalModel
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import torch
from qwen_vl_utils import process_vision_info
from PIL import Image
import re
import gradio as gr

# ColPali retriever (initialized here; not used by the OCR pipeline below)
rag = RAGMultiModalModel.from_pretrained("vidore/colpali")

# Qwen2-VL vision-language model used for text extraction
vlm = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    torch_dtype=torch.float32,
    trust_remote_code=True,
    device_map="auto",
)

# Processor must come from the same checkpoint as the model
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", trust_remote_code=True)


def extract_text(image, query):
    """Run one chat-style query over the image with the VLM and return the generated text."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": query},
            ],
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
    inputs = inputs.to(vlm.device)  # keep inputs on the same device as the model
    with torch.no_grad():
        # temperature/top_p only take effect when do_sample=True; decoding is greedy by default
        generated_ids = vlm.generate(**inputs, max_new_tokens=200, temperature=0.7, top_p=0.9)
    # Strip the prompt tokens so only the newly generated text is decoded
    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]


def post_process_text(text):
    # Split the text on sentence boundaries
    lines = text.split('. ')
    processed_lines = []
    for line in lines:
        # Separate Hindi (Devanagari) and English text at a "label:" boundary
        parts = re.split(r'([^\u0900-\u097F\s]+:)', line, 1)
        if len(parts) > 1:
            processed_lines.append(f"{parts[0]}{parts[1]}\n {parts[2]}")
        else:
            processed_lines.append(line)
    # Join the lines with double line breaks
    text = '\n\n'.join(processed_lines)
    # Remove repeated phrases while preserving order
    unique_phrases = list(dict.fromkeys(text.split('\n\n')))
    text = '\n\n'.join(unique_phrases)
    return text


def ocr(image):
    queries = [
        "Extract and transcribe all the text visible in the image, including any small or partially visible text.",
        # "Look closely at the image and list any text you see, no matter how small or unclear.",
        # "What text can you identify in this image? Include everything, even if it's partially obscured or in the background."
    ]
    all_extracted_text = []
    for query in queries:
        extracted_text = extract_text(image, query)
        all_extracted_text.append(extracted_text)
    # Combine and deduplicate the results
    final_text = "\n".join(set(all_extracted_text))
    final_text = post_process_text(final_text)
    return final_text


def main_fun(image, keyword):
    ext_text = ocr(image)
    if keyword:
        # Wrap matches in <mark> so they are highlighted in the HTML output
        highlight_text = re.sub(f'({re.escape(keyword)})', r'<mark>\1</mark>', ext_text, flags=re.IGNORECASE)
    else:
        highlight_text = ext_text
    # Preserve line breaks when rendering as HTML
    highlight_text = highlight_text.replace('\n', '<br>')
    return ext_text, highlight_text


iface = gr.Interface(
    fn=main_fun,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        gr.Textbox(label="Enter search term", placeholder="Search"),
    ],
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.HTML(label="Search Results"),
    ],
    title="Document Search using OCR (English/Hindi)",
)

iface.launch()