""" Author: Khanh Phan Date: 2024-12-04 """ import warnings from typing import Optional import numpy as np from pandas import DataFrame from sentence_transformers import util from src.application.config import ( DEVICE, MAX_CHAR_SIZE, PARAPHRASE_MODEL, PARAPHRASE_THRESHOLD, PARAPHRASE_THRESHOLD_HUMAN, PARAPHRASE_THRESHOLD_MACHINE, TOP_URLS_PER_SEARCH, ) from src.application.text.helper import split_into_sentences from src.application.text.search import ( generate_search_phrases, search_by_google, ) from src.application.url_reader import URLReader warnings.simplefilter(action="ignore", category=FutureWarning) def find_sentence_source( text: list, text_index: str, sentences_df: DataFrame, ) -> tuple[DataFrame, list]: """ Finds the source URL for a given sentence by searching Google and checking for paraphrases. Args: text (list): A list of sentences. text_index (int): The index of the sentence to find the source for. sentences_df (pd.DataFrame): A DF to store sentence information. Returns: tuple: A tuple of the updated sentences_df and a list of image URLs. If a source is found, the DF is updated with source information. If no source is found, the DF is updated with the original input. """ checked_urls = ( set() ) # Keep track of visited URLs to avoid redundant checks searched_phrases = generate_search_phrases(text[text_index]) for candidate in searched_phrases: # Search Google for the generated phrase search_results = search_by_google(candidate) # Extract URLs from search results urls = [item["link"] for item in search_results.get("items", [])] # Check the top 3 URLs from the search results for url in urls[:TOP_URLS_PER_SEARCH]: if url in checked_urls: # Skip already checked URLs continue if "bbc.com" not in url: # TODO: remove when releasing continue checked_urls.add(url) print(f"\t\tChecking URL: {url}") content = URLReader(url) if content.is_extracted is True: if content.title is None or content.text is None: print("\t\t\t↑↑↑ Title or text not found") continue source_text = content.title + "\n" + content.text if len(source_text) > MAX_CHAR_SIZE: print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters") continue print(f"\t\t\t↑↑↑ Title: {content.title}") aligned_sentence = check_paraphrase( text[text_index], source_text, url, ) if aligned_sentence["paraphrase"] is False: sentences_df.loc[text_index, "input"] = aligned_sentence[ "input" ] sentences_df.loc[text_index, "paraphrase"] = ( aligned_sentence["paraphrase"] ) return sentences_df, [] if aligned_sentence["similarity"] > PARAPHRASE_THRESHOLD: columns = [ "input", "source", "label", "similarity", "paraphrase", "url", ] else: columns = [ "input", "label", "paraphrase", ] for c in columns: if c in sentences_df.columns: sentences_df.loc[text_index, c] = aligned_sentence[c] # Check other sentences for better matches in the same source for idx, _ in sentences_df.iterrows(): similarity = sentences_df.loc[idx, "similarity"] if similarity is not None: if similarity > PARAPHRASE_THRESHOLD_MACHINE: continue aligned_sentence = check_paraphrase( text[idx], source_text, url, ) if ( similarity is None or aligned_sentence["similarity"] > similarity ): if ( aligned_sentence["similarity"] > PARAPHRASE_THRESHOLD ): columns = [ "input", "source", "label", "similarity", "url", ] else: columns = [ "input", "label", ] for c in columns: if c in sentences_df.columns: sentences_df.loc[idx, c] = aligned_sentence[c] return sentences_df, content.images # If no source is found, update the DF with the original input sentences_df.loc[text_index, "input"] = text[text_index] return sentences_df, [] def check_paraphrase(input_text: str, source_text: str, url: str) -> dict: """ Checks if the input text is a paraphrase of the source text by comparing sentence-level similarities. Args: input_text (str): The text to be checked for paraphrasing. source_text (str): The source text to compare against. url (str): The URL of the source text (for storing in the result). Returns: dict: A dictionary containing the alignment information, including: - "input": Concatenated input sentences. - "source": Concatenated best-matched source sentences. - "similarity": Average cosine similarity score. - "label": Label determined based on similarity. - "paraphrase": Boolean indicating if it's a paraphrase. - "url": The source URL. """ # Extract sentences from input text and web page input_sentences = split_into_sentences(input_text) if not source_text: return {} source_sentences = split_into_sentences(source_text) if not input_sentences or not source_sentences: return {} # Handle external references in source sentences # This is specified for bbc news articles additional_sentences = [] for sentence in source_sentences: if ", external" in sentence: additional_sentences.append(sentence.replace(", external", "")) source_sentences.extend(additional_sentences) # Encode sentences into embeddings using the PARAPHASE_MODEL embeddings1 = PARAPHRASE_MODEL.encode( input_sentences, convert_to_tensor=True, device=DEVICE, show_progress_bar=False, ) embeddings2 = PARAPHRASE_MODEL.encode( source_sentences, convert_to_tensor=True, device=DEVICE, show_progress_bar=False, ) # Compute cosine similarity matrix similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy() # Find sentence alignments inputs = "" sources = "" similarities = [] for i, sentence in enumerate(input_sentences): max_sim_index = np.argmax(similarity_matrix[i]) max_similarity = similarity_matrix[i][max_sim_index] best_matched_sentence = source_sentences[max_sim_index] inputs += sentence + " " sources += best_matched_sentence + " " similarities.append(max_similarity) # Calculate average similarity and determine paraphrase label similarity = sum(similarities) / len(similarities) label, is_paraphrased = determine_label(max_similarity) # Create the alignment dictionary alignment = { "input": inputs, "source": sources, "similarity": similarity, "label": label, "paraphrase": is_paraphrased, "url": url, } print(f'Result: [{alignment["similarity"]}] {alignment["source"]}') return alignment def determine_label(similarity: float) -> tuple[Optional[str], bool]: """ Determines a label and paraphrase status based on the similarity score. Args: similarity (float): The similarity score between two texts. Returns: tuple: A tuple containing the label (str or None) and a boolean indicating if it's a paraphrase. """ if similarity >= PARAPHRASE_THRESHOLD_HUMAN: return "HUMAN", True # Human paraphrase elif similarity >= PARAPHRASE_THRESHOLD_MACHINE: return "MACHINE", True # Machine paraphrase else: return None, False # Not a paraphrase if __name__ == "__main__": pass