"""
Author: Khanh Phan
Date: 2024-12-04
"""

import warnings
from typing import Optional

import numpy as np
import pandas as pd
from pandas import DataFrame
from sentence_transformers import util

from src.application.config import (
    DEVICE,
    MAX_CHAR_SIZE,
    PARAPHRASE_MODEL,
    PARAPHRASE_THRESHOLD,
    PARAPHRASE_THRESHOLD_HUMAN,
    PARAPHRASE_THRESHOLD_MACHINE,
    TOP_URLS_PER_SEARCH,
)
from src.application.text.helper import split_into_sentences
from src.application.text.search import (
    generate_search_phrases,
    search_by_google,
)
from src.application.url_reader import URLReader

# Ignore every FutureWarning; this filter is installed as a side effect of
# importing this module.
warnings.simplefilter(action="ignore", category=FutureWarning)


def _copy_alignment_fields(
    sentences_df: DataFrame,
    row,
    alignment: dict,
    columns: list,
) -> None:
    """
    Copies selected alignment fields into one row of the sentences DF.

    Args:
        sentences_df (pd.DataFrame): The DF to update in place.
        row: The index label of the row to update.
        alignment (dict): The alignment returned by check_paraphrase.
        columns (list): Names of the alignment fields to copy.
            Names that are not columns of sentences_df are skipped.
    """
    for column in columns:
        if column in sentences_df.columns:
            sentences_df.loc[row, column] = alignment[column]


def find_sentence_source(
    text: list,
    text_index: int,
    sentences_df: DataFrame,
) -> tuple[DataFrame, list]:
    """
    Finds the source URL for a given sentence by searching Google
    and checking for paraphrases.

    For every search phrase generated from the sentence, the first
    TOP_URLS_PER_SEARCH result URLs are read and compared with the sentence.
    The first readable page decides the outcome: if the sentence is not a
    paraphrase of it, the search stops; otherwise the row is updated and
    every other row of the DF is re-checked against the same page.

    Args:
        text (list): A list of sentences.
        text_index (int): The index of the sentence to find the source for.
        sentences_df (pd.DataFrame): A DF to store sentence information.
            It is modified in place and must have a "similarity" column.
            Its index labels are used to index into `text`.

    Returns:
        tuple: A tuple of the updated sentences_df and a list of image URLs.
            If a source is found, the DF is updated with source information
            and the images of the matched page are returned.
            If no source is found, the DF is updated with the original input
            and the list is empty.
    """
    # Keep track of visited URLs to avoid redundant checks
    checked_urls = set()
    searched_phrases = generate_search_phrases(text[text_index])

    for candidate in searched_phrases:
        # Search Google for the generated phrase
        search_results = search_by_google(candidate)

        # Extract URLs from search results
        urls = [item["link"] for item in search_results.get("items", [])]

        # Check only the first TOP_URLS_PER_SEARCH URLs of each search
        for url in urls[:TOP_URLS_PER_SEARCH]:
            if url in checked_urls:  # Skip already checked URLs
                continue
            if "bbc.com" not in url:  # TODO: remove when releasing
                continue

            checked_urls.add(url)
            print(f"\t\tChecking URL: {url}")

            content = URLReader(url)
            if content.is_extracted is not True:
                continue
            if content.title is None or content.text is None:
                print("\t\t\t↑↑↑ Title or text not found")
                continue

            source_text = content.title + "\n" + content.text
            if len(source_text) > MAX_CHAR_SIZE:
                print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
                continue
            print(f"\t\t\t↑↑↑ Title: {content.title}")

            aligned_sentence = check_paraphrase(
                text[text_index],
                source_text,
                url,
            )
            if not aligned_sentence:
                # check_paraphrase returns {} when either text yields no
                # sentences; nothing can be compared, so try the next URL.
                continue

            # NOTE(review): the first readable page that is not a paraphrase
            # ends the whole search; remaining URLs are never tried.
            # Confirm this early exit is intended.
            if aligned_sentence["paraphrase"] is False:
                sentences_df.loc[text_index, "input"] = aligned_sentence[
                    "input"
                ]
                sentences_df.loc[text_index, "paraphrase"] = (
                    aligned_sentence["paraphrase"]
                )
                return sentences_df, []

            # Source details are only recorded above PARAPHRASE_THRESHOLD
            if aligned_sentence["similarity"] > PARAPHRASE_THRESHOLD:
                columns = [
                    "input",
                    "source",
                    "label",
                    "similarity",
                    "paraphrase",
                    "url",
                ]
            else:
                columns = ["input", "label", "paraphrase"]
            _copy_alignment_fields(
                sentences_df,
                text_index,
                aligned_sentence,
                columns,
            )

            # Check other sentences for better matches in the same source
            for idx in sentences_df.index:
                similarity = sentences_df.loc[idx, "similarity"]
                # A missing score may be stored as None or as NaN
                has_score = not pd.isna(similarity)
                if has_score and similarity > PARAPHRASE_THRESHOLD_MACHINE:
                    continue

                other_alignment = check_paraphrase(
                    text[idx],
                    source_text,
                    url,
                )
                if not other_alignment:
                    # Nothing comparable for this row; leave it untouched
                    continue

                new_similarity = other_alignment["similarity"]
                if has_score and not new_similarity > similarity:
                    # The stored match is at least as good; keep it
                    continue

                if new_similarity > PARAPHRASE_THRESHOLD:
                    columns = [
                        "input",
                        "source",
                        "label",
                        "similarity",
                        "url",
                    ]
                else:
                    columns = ["input", "label"]
                _copy_alignment_fields(
                    sentences_df,
                    idx,
                    other_alignment,
                    columns,
                )
            return sentences_df, content.images

    # If no source is found, update the DF with the original input
    sentences_df.loc[text_index, "input"] = text[text_index]
    return sentences_df, []


def check_paraphrase(input_text: str, source_text: str, url: str) -> dict:
    """
    Checks if the input text is a paraphrase of the source text
        by comparing sentence-level similarities.

    Every input sentence is matched with the source sentence that has the
    highest cosine similarity. The label and the paraphrase flag are derived
    from the average of those best scores, i.e. from the same value that is
    returned as "similarity".

    Args:
        input_text (str): The text to be checked for paraphrasing.
        source_text (str): The source text to compare against.
        url (str): The URL of the source text (for storing in the result).

    Returns:
        dict: An empty dict when source_text is empty or when either text
            yields no sentences. Otherwise a dictionary containing
            the alignment information, including:
                - "input": Concatenated input sentences.
                - "source": Concatenated best-matched source sentences.
                - "similarity": Average cosine similarity score.
                - "label": Label determined based on similarity.
                - "paraphrase": Boolean indicating if it's a paraphrase.
                - "url": The source URL.
    """
    # Extract sentences from input text and web page
    input_sentences = split_into_sentences(input_text)

    if not source_text:
        return {}
    source_sentences = split_into_sentences(source_text)

    if not input_sentences or not source_sentences:
        return {}

    # Handle external references in source sentences
    # This is specified for bbc news articles
    # The cleaned copies are added next to the originals. A list (not a
    # generator) is built first so the extension does not iterate over
    # the list while it grows.
    source_sentences.extend(
        [
            sentence.replace(", external", "")
            for sentence in source_sentences
            if ", external" in sentence
        ],
    )

    # Encode sentences into embeddings using the PARAPHRASE_MODEL
    embeddings1 = PARAPHRASE_MODEL.encode(
        input_sentences,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        source_sentences,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )

    # Compute cosine similarity matrix
    # (one row per input sentence, one column per source sentence)
    similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()

    # Find sentence alignments: best source sentence for each input sentence
    best_indices = np.argmax(similarity_matrix, axis=1)
    similarities = [
        similarity_matrix[row][column]
        for row, column in enumerate(best_indices)
    ]

    # Each sentence keeps its trailing space, as in the stored DF values
    inputs = "".join(sentence + " " for sentence in input_sentences)
    sources = "".join(
        source_sentences[column] + " " for column in best_indices
    )

    # Calculate average similarity and determine paraphrase label.
    # The label must come from the average, not from the score of the last
    # input sentence, so that it agrees with the reported similarity.
    similarity = sum(similarities) / len(similarities)
    label, is_paraphrased = determine_label(similarity)

    # Create the alignment dictionary
    alignment = {
        "input": inputs,
        "source": sources,
        "similarity": similarity,
        "label": label,
        "paraphrase": is_paraphrased,
        "url": url,
    }

    print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')

    return alignment


def determine_label(similarity: float) -> tuple[Optional[str], bool]:
    """
    Maps a similarity score to a paraphrase label and flag.

    The score is compared with PARAPHRASE_THRESHOLD_HUMAN first and
    with PARAPHRASE_THRESHOLD_MACHINE second; the first threshold
    that is reached decides the label.

    Args:
        similarity (float): The similarity score between two texts.

    Returns:
        tuple: ("HUMAN", True) or ("MACHINE", True) when the matching
                threshold is reached, otherwise (None, False).
    """
    # Ordered tiers: earlier entries take precedence over later ones
    tiers = (
        (PARAPHRASE_THRESHOLD_HUMAN, "HUMAN"),
        (PARAPHRASE_THRESHOLD_MACHINE, "MACHINE"),
    )
    for threshold, label in tiers:
        if similarity >= threshold:
            return label, True

    # No threshold reached: not a paraphrase
    return None, False


if __name__ == "__main__":
    # No-op: running this module directly does nothing.
    pass