""" Author: Khanh Phan Date: 2024-12-04 """ import re import string from collections import Counter from difflib import SequenceMatcher from nltk.tokenize import ( sent_tokenize, word_tokenize, ) from nltk.util import ngrams from sklearn.feature_extraction.text import TfidfVectorizer from src.application.config import PREFIX def clean_text(text: str) -> str: """ Cleans and preprocesses a given text string. Args: text (str): The input text to be cleaned. Returns: str: The cleaned and preprocessed text, containing the first 18 words. """ # Define a set of punctuation characters to exclude, # exclude comma and period due to numbers punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~""" # Lowering text text = text.lower() # Removing punctuation text = "".join([c for c in text if c not in punctuations]) # Removing whitespace and newlines text = re.sub(r"\s+", " ", text) # Replace £ with * because Google search doesn't recognize £ text.replace("£", " * ") # Split the text into a list of words. words = text.split() # Join the first 18 words back into a string text = " ".join(words[:18]) # TODO: consider another number return text def remove_punctuation(text: str) -> str: """ Removes all punctuation characters from a string, except for periods (.). Args: text (str): The input string. Returns: str: The string with all punctuation characters removed, except for periods. """ # Create a string containing all punctuation characters, # except for periods. punctuation_without_dot = string.punctuation.replace(".", "") # Create a translation table to remove the specified punctuation chars. translator = str.maketrans("", "", punctuation_without_dot) # Apply the translation table to the input text and return the result. return text.translate(translator) def get_keywords(text, num_keywords=5): """ Extracts the top k keywords from a document using the TF-IDF method. Args: text (str): The input text from which to extract keywords. num_keywords (int, optional): The number of top keywords to return. Returns: list: A list of the top keywords extracted from the text. """ # Create a TF-IDF Vectorizer vectorizer = TfidfVectorizer(stop_words="english") # Fit and transform the text tfidf_matrix = vectorizer.fit_transform([text]) # Get feature names (words) feature_names = vectorizer.get_feature_names_out() # Get TF-IDF scores tfidf_scores = tfidf_matrix.toarray()[0] # Sort words by TF-IDF score word_scores = list(zip(feature_names, tfidf_scores)) word_scores.sort(key=lambda x: x[1], reverse=True) # Return top keywords return [word for word, score in word_scores[:num_keywords]] def get_important_sentences( sentence: str, keywords: list[str], num_sentences: int = 3, ) -> list[str]: """ Selects important sentences based on a list of keywords. Args: sentence (str): The input sentence. keywords (list[str]): List of important keywords. num_sentences (int): Number of sentences to return (default is 3). Returns: list: A list of important sentences. 
""" # Clean and split the sentence into sentences sentences = [s for s in re.split(r"(?<=[.!?])\s+", sentence) if s] # Calculate the importance score for each sentence sentence_scores = [] for sentence in sentences: processed_sentence = clean_text(sentence) score = 0 words = processed_sentence.lower().split() word_count = Counter(words) for keyword in keywords: if keyword.lower() in word_count: score += word_count[keyword.lower()] sentence_scores.append((sentence, score)) # Sort sentences by their scores in descending order sentence_scores.sort(key=lambda x: x[1], reverse=True) # Return the top N sentences return [sentence for sentence, score in sentence_scores[:num_sentences]] def extract_important_phrases( text: str, keywords: list[str], phrase_length: int = 5, ) -> list[str]: """ Extracts important phrases based on a list of keywords. Phrase length is auto-determined, and overlapped parts are less than 20%. Args: text (str): The input text. keywords (list[str]): List of important keywords. phrase_length (int): Length of phrases to extract (default: 5 words). Returns: list: A list of important phrases. """ # Tokenize the text into words words = word_tokenize(text.lower()) # Determine phrase length (between 3 and 7 words) phrase_length = min(max(len(words) // 10, 5), 7) # Generate n-grams (phrases) from the text phrases = list(ngrams(words, phrase_length)) important_phrases = [] used_indices = set() for i, phrase in enumerate(phrases): # Check if the phrase contains any keyword if any(keyword.lower() in phrase for keyword in keywords): # Check overlap with previously selected phrases if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices): important_phrases.append(clean_text(" ".join(phrase))) used_indices.add(i) return important_phrases def extract_equal_text(text1: str, text2: str) -> tuple[list[int], list[int]]: """ Extracts the indices of equal text segments between two strings. Args: text1 (str): The first input string. text2 (str): The second input string. Returns: tuple[ list[dict{"start": int, "end": int}], list[dict{"start": int, "end": int}] ] - list: the start and end indices of equal segments in text1. - list: the start and end indices of equal segments in text2. """ def cleanup(text: str) -> str: """ Cleans up a text string by converting to lowercase and removing punctuation. Args: text (str): The input text. Returns: str: The cleaned text. """ text = text.lower() text = text.translate(str.maketrans("", "", string.punctuation)) return text # Clean and split the input texts into lists of words. splited_text1 = cleanup(text1).split() splited_text2 = cleanup(text2).split() # Create a SequenceMatcher object to compare the cleaned word lists. s = SequenceMatcher(None, splited_text1, splited_text2) equal_idx_1 = [] equal_idx_2 = [] # Split the original texts into lists of words (without cleaning). text1 = text1.split() text2 = text2.split() for tag, i1, i2, j1, j2 in s.get_opcodes(): if tag == "equal": # Append the start and end indices of the equal segment # to the respective lists. equal_idx_1.append({"start": i1, "end": i2}) equal_idx_2.append({"start": j1, "end": j2}) # subtext_1 = " ".join(text1[i1:i2]) # subtext_2 = " ".join(text2[j1:j2]) # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] ' # f'{subtext_1!r:>55} --> {subtext_2!r}') return equal_idx_1, equal_idx_2 def connect_consecutive_indexes(nums: list[int]) -> list[list[int, int]]: """ Connects consecutive integers in a list. Args: nums (list): A list of integers. 
def connect_consecutive_indexes(nums: list[int]) -> list[list[int]]:
    """
    Connects consecutive integers in a list.

    Args:
        nums (list): A list of integers.

    Returns:
        list: A list of lists, where each inner list represents
            a consecutive range.
            For example: [1, 2, 3, 5, 6] becomes [[1, 3], [5, 6]].
    """
    if not nums:  # Handle empty input
        return []

    result = []
    start = nums[0]
    end = nums[0]

    for i in range(1, len(nums)):
        # Check if the current number is consecutive to the previous end.
        if nums[i] == end + 1:
            end = nums[i]  # Extend the current range.
        else:
            # Add the current range to the result and start a new range.
            result.append([start, end])
            start = nums[i]
            end = nums[i]

    # Add the last range to the result.
    result.append([start, end])
    return result


def postprocess_label(labels: list[str]) -> str:
    """
    Creates a label string with the format
        "Partially generated by [label1] and [label2] and ...".
    Removes duplicate labels while preserving the original order.

    Args:
        labels: A list of strings representing labels.

    Returns:
        A string with the formatted label.
    """
    for index, label in enumerate(labels):
        # if label.startswith(PREFIX):
        #     labels[index] = label[len(PREFIX) :]
        if PREFIX in label:
            labels[index] = label.replace(PREFIX, "")

    # Remove duplicates while preserving the original order.
    labels = list(dict.fromkeys(labels))

    label = ""
    if len(labels) == 1:
        label += labels[0]
    elif len(labels) == 2:
        label += f"{labels[0]} and {labels[1]}"
    else:
        combination = ", ".join(labels[0 : len(labels) - 1])
        label += f"{combination}, and {labels[-1]}"

    return label


def split_into_sentences(input_text: str) -> list[str]:
    """
    Splits input text into paragraphs by newlines
        and then tokenizes each paragraph into sentences.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list: A list of sentences.
            Returns an empty list if input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    # Split the input text into paragraphs based on newline characters,
    # keeping the newline characters.
    paragraphs = input_text.splitlines(keepends=True)

    sentences = []
    for paragraph in paragraphs:
        # Remove leading/trailing whitespace.
        paragraph = paragraph.strip()
        if paragraph and paragraph != "\n":
            # Tokenize the paragraph into sentences.
            sentences.extend(sent_tokenize(paragraph))

    return sentences


def split_into_paragraphs(input_text: str) -> list[str]:
    """
    Splits input text into paragraphs based on newline characters.

    Args:
        input_text (str): The input text as a string.

    Returns:
        list: A list of paragraphs.
            Returns an empty list if input is not a string.
    """
    if not isinstance(input_text, str):
        return []

    # Split the input text into paragraphs based on newline characters,
    # keeping the newline characters.
    paragraphs = input_text.splitlines(keepends=True)

    out_paragraphs = []
    for paragraph in paragraphs:
        # Keep whitespace as-is; stripping is intentionally disabled.
        # paragraph = paragraph.strip()
        if paragraph and paragraph != "\n":
            # Append the paragraph to the output list.
            out_paragraphs.append(paragraph)

    return out_paragraphs


def extract_starts_ends(
    colored_idx: list[dict],
) -> tuple[list[int], list[int]]:
    """
    Extracts start and end indices from a list of dictionaries.

    Args:
        colored_idx (list[dict]): A list of dictionaries,
            where each dictionary has 'start' and 'end' keys.

    Returns:
        tuple: A tuple containing two lists:
            - starts (list[int]): A list of start indices.
            - ends (list[int]): A list of end indices.
    """
    starts = []
    ends = []
    for index in colored_idx:
        starts.append(index["start"])
        ends.append(index["end"])

    return starts, ends
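
# Illustrative usage of extract_starts_ends and connect_consecutive_indexes
# (made-up index data):
#
#   extract_starts_ends([{"start": 0, "end": 4}, {"start": 9, "end": 12}])
#   -> ([0, 9], [4, 12])
#
#   connect_consecutive_indexes([1, 2, 3, 5, 6])
#   -> [[1, 3], [5, 6]]
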
def filter_indices(
    starts: list[int],
    ends: list[int],
    ignore_indices: list[int],
) -> tuple[list[int], list[int]]:
    """
    Filters start and end indices to exclude any indices
        present in the ignore_indices list.

    Args:
        starts (list[int]): A list of starting indices.
        ends (list[int]): A list of ending indices.
            Must be the same length as starts.
        ignore_indices (list[int]): A list of indices to exclude.

    Returns:
        A tuple of two lists of integers:
            - filtered_starts
            - filtered_ends
        Returns empty lists if the input is invalid
            or if all ranges are filtered out.

    Examples:
        starts = [0, 5, 10]
        ends = [3, 7, 12]  # words at the end will not be colored.
        ignore_indices = [1, 2, 12, 17]
        # Output:
        starts = [0, 3, 5, 10]
        ends = [1, 4, 7, 12]
    """
    if len(starts) != len(ends):
        print(
            "Error: The 'starts' & 'ends' lists must have the same length.",
        )
        return [], []

    filtered_starts = []
    filtered_ends = []

    for i in range(len(starts)):
        start = starts[i]
        end = ends[i]

        if end < start:
            print(
                f"Error: End index {end} < start index {start} at position {i}.",  # noqa: E501
            )
            return [], []

        # Split the current range around the ignored indices.
        new_start, new_end = extract_new_startend(
            start,
            end,
            ignore_indices,
        )
        filtered_starts.extend(new_start)
        filtered_ends.extend(new_end)

    return filtered_starts, filtered_ends


def replace_leading_spaces(text: str) -> str:
    """
    Replaces leading spaces in a string with '&nbsp;'.

    Args:
        text: The input string.

    Returns:
        The string with leading spaces replaced by '&nbsp;'.
    """
    if text is None:
        return None

    leading_spaces = 0
    for char in text:
        if char == " ":
            leading_spaces += 1
        else:
            break

    if leading_spaces > 0:
        # Use HTML non-breaking spaces so the leading indentation
        # survives HTML rendering.
        return "&nbsp;" * leading_spaces + text[leading_spaces:]
    else:
        return text


def extract_new_startend(
    start: int,
    end: int,
    ignore_indices: list[int],
) -> tuple[list[int], list[int]]:
    """
    Extracts new start and end indices
        by splitting a range based on ignored indices.

    Args:
        start (int): The starting index of the range.
        end (int): The ending index of the range (exclusive).
        ignore_indices (list): indices to ignore within the range.

    Returns:
        tuple: A tuple containing two lists:
            - new_starts (list): Starting indices for the sub-ranges.
            - new_ends (list): Ending indices for the sub-ranges.
    """
    # Sort the set of ignore_indices in ascending order.
    indexes = list(set(ignore_indices))
    indexes.sort()

    new_starts = []
    new_ends = []
    new_start = start

    # If there are no indices to ignore, return the original range.
    if indexes is None or len(indexes) < 1:
        new_starts.append(start)
        new_ends.append(end)
        return new_starts, new_ends

    for index in indexes:
        # Skip indices that are outside the range [start, end).
        if index < start:
            continue
        elif index >= end:
            continue

        new_starts.append(new_start)
        new_ends.append(index)
        new_start = index + 1

    new_starts.append(new_start)
    new_ends.append(end)

    return new_starts, new_ends
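
# Illustrative usage of extract_new_startend (made-up indices):
#
#   extract_new_startend(0, 10, [5])
#   -> ([0, 6], [5, 10])
#
# The single ignored index 5 splits the range into two sub-ranges,
# one ending at the ignored index and one starting just after it.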