Spaces:

pmkhanh7890
/

news_verification

Sleeping

File size: 15,314 Bytes

"""
Author: Khanh Phan
Date: 2024-12-04
"""

import re
import string
from collections import Counter
from difflib import SequenceMatcher

from nltk.tokenize import (
    sent_tokenize,
    word_tokenize,
)
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer

from src.application.config import PREFIX


def clean_text(text: str, max_words: int = 18) -> str:
    """
    Cleans and preprocesses a given text string.

    The text is lowercased, stripped of most punctuation, whitespace is
    collapsed, every "£" is replaced by a standalone "*" token and the
    result is truncated to the first `max_words` words.

    Args:
        text (str): The input text to be cleaned.
        max_words (int, optional): Maximum number of words to keep
            (default is 18).

    Returns:
        str: The cleaned text, containing at most `max_words` words
            separated by single spaces.
    """
    # Punctuation characters to drop. Comma and period are deliberately
    # kept because they occur inside numbers.
    punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""

    # Lowercase, then delete the unwanted punctuation in a single pass.
    text = text.lower().translate(str.maketrans("", "", punctuations))

    # Collapse every run of whitespace (including newlines) into one space.
    text = re.sub(r"\s+", " ", text)

    # Replace £ with * because Google search doesn't recognize £.
    # str.replace returns a new string, so the result must be re-assigned.
    # This happens after punctuation removal on purpose: "*" is itself in
    # the removal set and would otherwise be stripped again.
    text = text.replace("£", " * ")

    # split() also absorbs the extra spaces introduced around "*".
    words = text.split()

    # Keep only the leading words.
    return " ".join(words[:max_words])


def remove_punctuation(text: str) -> str:
    """
    Strips every ASCII punctuation character except the period (.).

    Args:
        text (str): The input string.

    Returns:
        str: A copy of the input in which all characters listed in
            `string.punctuation`, apart from ".", have been dropped.
    """
    # Everything in string.punctuation is unwanted, with "." as the
    # single exception.
    unwanted = set(string.punctuation) - {"."}

    # Keep each character that is not in the unwanted set.
    return "".join(char for char in text if char not in unwanted)


def get_keywords(text, num_keywords=5):
    """
    Returns the highest-scoring terms of a text according to TF-IDF.

    Args:
        text (str): The document to extract keywords from.
        num_keywords (int, optional): How many keywords to return
            (default is 5).

    Returns:
        list: Up to `num_keywords` terms, ordered from the highest to the
            lowest TF-IDF score.
    """
    # The vectorizer is fitted on this one document only, with English
    # stop words removed.
    # NOTE(review): with a single-document corpus the IDF component
    # presumably cannot discriminate between terms - confirm.
    tfidf = TfidfVectorizer(stop_words="english")

    # Row 0 of the dense matrix holds the scores of the only document.
    scores = tfidf.fit_transform([text]).toarray()[0]

    # Vocabulary terms, aligned with the score columns.
    vocabulary = tfidf.get_feature_names_out()

    # Rank (term, score) pairs by score, best first.
    ranked = sorted(
        zip(vocabulary, scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Keep only the terms of the leading pairs.
    return [term for term, _ in ranked[:num_keywords]]


def get_important_sentences(
    sentence: str,
    keywords: list[str],
    num_sentences: int = 3,
) -> list[str]:
    """
    Picks the sentences of a text that mention the given keywords most.

    Args:
        sentence (str): The input text; it is split into sentences after
            ".", "!" or "?" followed by whitespace.
        keywords (list[str]): List of important keywords.
        num_sentences (int): Number of sentences to return (default is 3).

    Returns:
        list: Up to `num_sentences` original (uncleaned) sentences, ordered
            by descending keyword score.
    """
    # Break the text after sentence-ending punctuation, dropping empties.
    candidates = [
        part for part in re.split(r"(?<=[.!?])\s+", sentence) if part
    ]

    scored = []
    for candidate in candidates:
        # Word frequencies of the cleaned sentence. Note that clean_text
        # also truncates the sentence, so only its leading words count.
        frequencies = Counter(clean_text(candidate).lower().split())

        # A Counter yields 0 for missing words, so absent keywords simply
        # add nothing to the score.
        total = sum(frequencies[keyword.lower()] for keyword in keywords)

        scored.append((candidate, total))

    # Highest score first; ties keep their original order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    # Hand back only the sentence text of the best entries.
    return [candidate for candidate, _ in ranked[:num_sentences]]


def extract_important_phrases(
    text: str,
    keywords: list[str],
    phrase_length: int = 5,
) -> list[str]:
    """
    Extracts keyword-bearing word windows (phrases) from a text.

    The window size is derived from the text length, and a window is only
    kept when it starts far enough from the previously kept one, so that
    kept phrases overlap only slightly.

    Args:
        text (str): The input text.
        keywords (list[str]): List of important keywords.
        phrase_length (int): Accepted for compatibility; the value passed
            in is overwritten by the automatically derived window size.

    Returns:
        list: The kept phrases, each passed through `clean_text`.
    """
    # Lowercase and tokenize the text into words.
    tokens = word_tokenize(text.lower())

    # Window size: one tenth of the token count, clamped to 5..7 words.
    phrase_length = max(5, min(len(tokens) // 10, 7))

    # A new phrase must start at least this many positions after the
    # last kept one (80% of the window size).
    min_gap = phrase_length * 0.8

    # Lowercase the keywords once instead of once per window.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    selected = []
    last_kept = None

    for position, window in enumerate(ngrams(tokens, phrase_length)):
        # Ignore windows that contain none of the keywords.
        if not any(keyword in window for keyword in lowered_keywords):
            continue

        # Positions only grow, so the most recently kept window is the
        # closest one; checking it alone covers all earlier windows.
        if last_kept is not None and position - last_kept < min_gap:
            continue

        selected.append(clean_text(" ".join(window)))
        last_kept = position

    return selected


def extract_equal_text(
    text1: str,
    text2: str,
) -> tuple[list[dict[str, int]], list[dict[str, int]]]:
    """
    Extracts the indices of equal text segments between two strings.

    Both texts are lowercased, stripped of punctuation and split on
    whitespace; the resulting word lists are compared with
    `difflib.SequenceMatcher`.

    Args:
        text1 (str): The first input string.
        text2 (str): The second input string.

    Returns:
        tuple[
            list[dict{"start": int, "end": int}],
            list[dict{"start": int, "end": int}]
            ]
            - list: the start and end indices of equal segments in text1.
            - list: the start and end indices of equal segments in text2.
            Indices are word positions in the cleaned word lists and "end"
            is exclusive. They match positions in `text.split()` of the
            original text only if no whitespace-separated token consists
            solely of punctuation (such tokens vanish during cleanup).
    """
    # Translation table that deletes every ASCII punctuation character.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    def cleanup(text: str) -> str:
        """
        Cleans up a text string by converting to lowercase
            and removing punctuation.

        Args:
            text (str): The input text.

        Returns:
            str: The cleaned text.
        """
        return text.lower().translate(strip_punctuation)

    # Clean and split the input texts into lists of words.
    words1 = cleanup(text1).split()
    words2 = cleanup(text2).split()

    # Compare the cleaned word lists (no junk heuristic function).
    matcher = SequenceMatcher(None, words1, words2)

    equal_idx_1 = []
    equal_idx_2 = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        # Only identical runs are of interest; replace/insert/delete
        # opcodes are skipped.
        if tag == "equal":
            equal_idx_1.append({"start": i1, "end": i2})
            equal_idx_2.append({"start": j1, "end": j2})

    return equal_idx_1, equal_idx_2


def connect_consecutive_indexes(nums: list[int]) -> list[list[int, int]]:
    """
    Collapses runs of consecutive integers into [first, last] pairs.

    Args:
        nums (list): A list of integers.

    Returns:
        list: A list of two-element lists, one per run of values that
            each exceed their predecessor by exactly 1.
            For example: [1, 2, 3, 5, 6] becomes [[1, 3], [5, 6]].
            An empty input gives an empty list.
    """
    # Nothing to group.
    if not nums:
        return []

    ranges = []
    for value in nums:
        if ranges and value == ranges[-1][1] + 1:
            # The value continues the open run: move its upper bound.
            ranges[-1][1] = value
        else:
            # Otherwise the value opens a new single-element run.
            ranges.append([value, value])

    return ranges


def postprocess_label(labels: list[str]) -> str:
    """
    Joins labels into one human-readable string.

    Every occurrence of PREFIX is removed from each label, duplicates are
    dropped while preserving the order of first appearance, and the
    remaining labels are joined as "a", "a and b" or "a, b, and c".
    The input list is not modified.

    Args:
        labels: A list of strings representing labels.

    Returns:
        A string with the formatted label; an empty string if `labels`
        is empty.
    """
    # Strip the prefix into a new list so the caller's list stays intact.
    # str.replace is a no-op for labels that do not contain PREFIX.
    stripped = [label.replace(PREFIX, "") for label in labels]

    # dict.fromkeys removes duplicates and, unlike set(), keeps the order
    # of first appearance, which makes the output deterministic.
    unique = list(dict.fromkeys(stripped))

    if not unique:
        return ""
    if len(unique) == 1:
        return unique[0]
    if len(unique) == 2:
        return f"{unique[0]} and {unique[1]}"

    # Three or more labels: comma-separated list with a final ", and".
    combination = ", ".join(unique[:-1])
    return f"{combination}, and {unique[-1]}"


def split_into_sentences(input_text: str) -> list[str]:
    """
    Breaks a text into sentences, one line at a time.

    The text is first split at line boundaries; every non-blank line is
    then tokenized into sentences.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list: All sentences in order of appearance.
            Returns an empty list if input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    collected = []
    for raw_line in input_text.splitlines(keepends=True):
        # Drop surrounding whitespace, including the kept line ending.
        trimmed = raw_line.strip()

        # Lines that are blank after trimming contribute nothing.
        if not trimmed:
            continue

        # Tokenize the line into sentences.
        collected.extend(sent_tokenize(trimmed))

    return collected


def split_into_paragraphs(input_text: str) -> list[str]:
    """
    Breaks a text into paragraphs at line boundaries.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list: The lines of the text with their line endings kept and
            without any surrounding whitespace removed. Lines that are
            exactly "\\n" are left out.
            Returns an empty list if input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    # Keep each line (line ending included) unless it is a bare newline.
    return [
        line
        for line in input_text.splitlines(keepends=True)
        if line and line != "\n"
    ]


def extract_starts_ends(
    colored_idx: list[dict],
) -> tuple[list[int], list[int]]:
    """
    Separates a list of {'start', 'end'} dictionaries into two lists.

    Args:
        colored_idx (list[dict]): A list of dictionaries,
            where each dictionary has 'start' and 'end' keys.

    Returns:
        tuple: A tuple containing two lists:
            - starts (list[int]): The 'start' value of every entry.
            - ends (list[int]): The 'end' value of every entry.
    """
    # Read both keys of every entry in one pass.
    pairs = [(entry["start"], entry["end"]) for entry in colored_idx]

    # Split the pairs into the two parallel result lists.
    starts = [first for first, _ in pairs]
    ends = [last for _, last in pairs]
    return starts, ends


def filter_indices(
    starts: list[int],
    ends: list[int],
    ignore_indices: list[int],
) -> tuple[list[int], list[int]]:
    """
    Splits start/end ranges around the indices listed in ignore_indices.

    Every (start, end) pair is handed to `extract_new_startend`, and the
    sub-ranges it returns are concatenated in input order.

    Args:
        starts (list[int]): A list of starting indices.
        ends (list[int]): A list of ending indices.
            Must be the same length as starts.
        ignore_indices (list[int]): A list of indices to exclude.

    Returns:
        A tuple of two lists of integers:
            - filtered_starts
            - filtered_ends
        Returns empty lists if the input is invalid (different lengths,
            or an end smaller than its start); in that case an error
            message is printed.

    Examples:
        starts = [0, 5, 10]
        ends = [3, 7, 12]  # words at the end will not be colored.
        ignore_indices = [1, 2, 12, 17]

        # Output (adjacent ignored indices leave sub-ranges
        # whose start equals their end):
            starts = [0, 2, 3, 5, 10]
            ends = [1, 2, 3, 7, 12]

    """

    if len(starts) != len(ends):
        print(
            "Error: The 'starts' & 'ends' lists must have the same length.",
        )
        return [], []

    filtered_starts = []
    filtered_ends = []

    for i, (start, end) in enumerate(zip(starts, ends)):
        # A reversed range invalidates the whole input, including the
        # ranges that were already processed.
        if end < start:
            print(
                f"Error: End index {end} < start index {start} at position {i}.",  # noqa: E501
            )
            return [], []

        # Split this range around the ignored indices.
        new_start, new_end = extract_new_startend(
            start,
            end,
            ignore_indices,
        )
        filtered_starts.extend(new_start)
        filtered_ends.extend(new_end)

    return filtered_starts, filtered_ends


def replace_leading_spaces(text: str) -> str:
    """
    Turns each space at the start of a string into '&nbsp;'.

    Args:
        text: The input string.

    Returns:
        The string with every leading space character replaced by
        '&nbsp;'. The input is returned unchanged when it has no leading
        space, and None is returned for None.
    """

    if text is None:
        return None

    # Position of the first character that is not a space; the full
    # length when the string consists of spaces only (or is empty).
    prefix_len = next(
        (pos for pos, char in enumerate(text) if char != " "),
        len(text),
    )

    if prefix_len == 0:
        return text

    # One entity per leading space, followed by the rest of the string.
    return "&nbsp;" * prefix_len + text[prefix_len:]


def extract_new_startend(
    start: int,
    end: int,
    ignore_indices: list[int],
) -> tuple[list[int], list[int]]:
    """
    Extracts new start and end indices by splitting a range based on
        ignored indices.

    Each ignored index inside [start, end) closes the current sub-range
    at that index and opens the next one directly after it.

    Args:
        start (int): The starting index of the range.
        end (int): The ending index of the range (exclusive).
        ignore_indices (list): indices to ignore within the range.
            Duplicates and indices outside the range are skipped;
            None is treated like an empty list.

    Returns:
        tuple: A tuple containing two lists:
            - new_starts (list): Starting indices for the sub-ranges.
            - new_ends (list): Ending indices for the sub-ranges.
        At least one sub-range is always returned. A sub-range may be
        empty (start == end), e.g. when two ignored indices are adjacent.
        NOTE(review): confirm that callers tolerate empty sub-ranges.
    """
    # Without anything to ignore the range is returned unchanged below.
    if ignore_indices is None:
        ignore_indices = ()

    new_starts = []
    new_ends = []
    new_start = start

    # Walk the distinct ignored indices in ascending order.
    for index in sorted(set(ignore_indices)):
        # Skip indices that are outside the range [start, end).
        if index < start or index >= end:
            continue

        # Close the current sub-range right before the ignored index ...
        new_starts.append(new_start)
        new_ends.append(index)

        # ... and resume immediately after it.
        new_start = index + 1

    # The remainder up to the original end forms the last sub-range.
    new_starts.append(new_start)
    new_ends.append(end)

    return new_starts, new_ends