""" | |
Author: Khanh Phan | |
Date: 2024-12-04 | |
""" | |
import warnings
from typing import Optional

import numpy as np
from pandas import DataFrame
from sentence_transformers import util

from src.application.config import (
    DEVICE,
    MAX_CHAR_SIZE,
    PARAPHRASE_MODEL,
    PARAPHRASE_THRESHOLD,
    PARAPHRASE_THRESHOLD_HUMAN,
    PARAPHRASE_THRESHOLD_MACHINE,
    TOP_URLS_PER_SEARCH,
)
from src.application.text.helper import split_into_sentences
from src.application.text.search import (
    generate_search_phrases,
    search_by_google,
)
from src.application.url_reader import URLReader

warnings.simplefilter(action="ignore", category=FutureWarning)


def find_sentence_source(
    text: list,
    text_index: int,
    sentences_df: DataFrame,
) -> tuple[DataFrame, list]:
    """
    Finds the source URL for a given sentence by searching Google
    and checking for paraphrases.

    Args:
        text (list): A list of sentences.
        text_index (int): The index of the sentence to find the source for.
        sentences_df (pd.DataFrame): A DataFrame storing sentence information.

    Returns:
        tuple: A tuple of the updated sentences_df and a list of image URLs.
            If a source is found, the DataFrame is updated with source
            information; otherwise it is updated with the original input.
    """
    # Keep track of visited URLs to avoid redundant checks.
    checked_urls = set()

    searched_phrases = generate_search_phrases(text[text_index])

    for candidate in searched_phrases:
        # Search Google for the generated phrase.
        search_results = search_by_google(candidate)

        # Extract URLs from the search results.
        urls = [item["link"] for item in search_results.get("items", [])]

        # Check the top TOP_URLS_PER_SEARCH URLs from the search results.
        for url in urls[:TOP_URLS_PER_SEARCH]:
            if url in checked_urls:  # Skip already-checked URLs.
                continue
            if "bbc.com" not in url:  # TODO: remove when releasing
                continue

            checked_urls.add(url)
            print(f"\t\tChecking URL: {url}")

            content = URLReader(url)
            if content.is_extracted:
                if content.title is None or content.text is None:
                    print("\t\t\t├── Title or text not found")
                    continue

                source_text = content.title + "\n" + content.text
                if len(source_text) > MAX_CHAR_SIZE:
                    print(f"\t\t\t├── More than {MAX_CHAR_SIZE} characters")
                    continue
                print(f"\t\t\t├── Title: {content.title}")

                aligned_sentence = check_paraphrase(
                    text[text_index],
                    source_text,
                    url,
                )

                if aligned_sentence["paraphrase"] is False:
                    sentences_df.loc[text_index, "input"] = (
                        aligned_sentence["input"]
                    )
                    sentences_df.loc[text_index, "paraphrase"] = (
                        aligned_sentence["paraphrase"]
                    )
                    return sentences_df, []

                if aligned_sentence["similarity"] > PARAPHRASE_THRESHOLD:
                    columns = [
                        "input",
                        "source",
                        "label",
                        "similarity",
                        "paraphrase",
                        "url",
                    ]
                else:
                    columns = [
                        "input",
                        "label",
                        "paraphrase",
                    ]

                for c in columns:
                    if c in sentences_df.columns:
                        sentences_df.loc[text_index, c] = aligned_sentence[c]

                # Check other sentences for better matches in the same source.
                for idx, _ in sentences_df.iterrows():
                    similarity = sentences_df.loc[idx, "similarity"]
                    if similarity is not None:
                        if similarity > PARAPHRASE_THRESHOLD_MACHINE:
                            continue

                    aligned_sentence = check_paraphrase(
                        text[idx],
                        source_text,
                        url,
                    )

                    if (
                        similarity is None
                        or aligned_sentence["similarity"] > similarity
                    ):
                        if aligned_sentence["similarity"] > PARAPHRASE_THRESHOLD:
                            columns = [
                                "input",
                                "source",
                                "label",
                                "similarity",
                                "url",
                            ]
                        else:
                            columns = [
                                "input",
                                "label",
                            ]
                        for c in columns:
                            if c in sentences_df.columns:
                                sentences_df.loc[idx, c] = aligned_sentence[c]

                return sentences_df, content.images

    # If no source is found, update the DataFrame with the original input.
    sentences_df.loc[text_index, "input"] = text[text_index]
    return sentences_df, []
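

# Illustrative usage sketch (not part of the original pipeline): driving
# find_sentence_source over a whole input text. The column set below is an
# assumption inferred from the columns this module reads and writes; the
# real DataFrame is constructed elsewhere in the application.
def _example_find_sentence_source() -> None:
    text = split_into_sentences(
        "Some input text to verify. Another input sentence.",
    )
    sentences_df = DataFrame(
        {
            "input": [None] * len(text),
            "source": [None] * len(text),
            "label": [None] * len(text),
            "similarity": [None] * len(text),
            "paraphrase": [None] * len(text),
            "url": [None] * len(text),
        },
    )
    images: list = []
    for index in range(len(text)):
        sentences_df, image_urls = find_sentence_source(
            text,
            index,
            sentences_df,
        )
        images.extend(image_urls)
    print(sentences_df)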


def check_paraphrase(input_text: str, source_text: str, url: str) -> dict:
    """
    Checks if the input text is a paraphrase of the source text
    by comparing sentence-level similarities.

    Args:
        input_text (str): The text to be checked for paraphrasing.
        source_text (str): The source text to compare against.
        url (str): The URL of the source text (stored in the result).

    Returns:
        dict: A dictionary containing the alignment information, including:
            - "input": Concatenated input sentences.
            - "source": Concatenated best-matched source sentences.
            - "similarity": Average cosine similarity score.
            - "label": Label determined based on the similarity.
            - "paraphrase": Boolean indicating if it's a paraphrase.
            - "url": The source URL.
    """
    # Extract sentences from the input text and the web page.
    input_sentences = split_into_sentences(input_text)
    if not source_text:
        return {}

    source_sentences = split_into_sentences(source_text)
    if not input_sentences or not source_sentences:
        return {}

    # Handle external references in source sentences.
    # This is specific to BBC news articles.
    additional_sentences = []
    for sentence in source_sentences:
        if ", external" in sentence:
            additional_sentences.append(sentence.replace(", external", ""))
    source_sentences.extend(additional_sentences)
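    # Both the original sentence and its cleaned copy are kept, so either
    # form can be matched against the input sentences.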

    # Encode sentences into embeddings using the PARAPHRASE_MODEL.
    embeddings1 = PARAPHRASE_MODEL.encode(
        input_sentences,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        source_sentences,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )

    # Compute the cosine similarity matrix.
    similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
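    # similarity_matrix has shape (len(input_sentences), len(source_sentences));
    # entry [i][j] is the cosine similarity between input sentence i and
    # source sentence j.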

    # Find sentence alignments.
    inputs = ""
    sources = ""
    similarities = []
    for i, sentence in enumerate(input_sentences):
        max_sim_index = np.argmax(similarity_matrix[i])
        max_similarity = similarity_matrix[i][max_sim_index]
        best_matched_sentence = source_sentences[max_sim_index]

        inputs += sentence + " "
        sources += best_matched_sentence + " "
        similarities.append(max_similarity)

    # Calculate the average similarity and determine the paraphrase label.
    # Note: the label is derived from the average similarity, not from the
    # last sentence's score.
    similarity = sum(similarities) / len(similarities)
    label, is_paraphrased = determine_label(similarity)

    # Create the alignment dictionary.
    alignment = {
        "input": inputs,
        "source": sources,
        "similarity": similarity,
        "label": label,
        "paraphrase": is_paraphrased,
        "url": url,
    }
    print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')
    return alignment
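

# Illustrative usage sketch (not part of the original pipeline): calling
# check_paraphrase directly against a known source text. The strings and
# URL below are made up; in the pipeline, find_sentence_source supplies
# the fetched page text and URL.
def _example_check_paraphrase() -> None:
    input_text = "The company reported record profits this quarter."
    source_text = (
        "Quarterly results\n"
        "The firm announced record profits for the quarter. "
        "Analysts had expected a weaker result."
    )
    alignment = check_paraphrase(
        input_text,
        source_text,
        "https://example.com/article",
    )
    # The alignment dict holds the aligned input/source text, the average
    # similarity, the HUMAN/MACHINE label, and the paraphrase flag.
    print(alignment["similarity"], alignment["label"], alignment["paraphrase"])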


def determine_label(similarity: float) -> tuple[Optional[str], bool]:
    """
    Determines a label and paraphrase status based on the similarity score.

    Args:
        similarity (float): The similarity score between two texts.

    Returns:
        tuple: A tuple containing the label (str or None)
            and a boolean indicating if it's a paraphrase.
    """
    if similarity >= PARAPHRASE_THRESHOLD_HUMAN:
        return "HUMAN", True  # Human paraphrase
    elif similarity >= PARAPHRASE_THRESHOLD_MACHINE:
        return "MACHINE", True  # Machine paraphrase
    else:
        return None, False  # Not a paraphrase
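

# Illustrative sketch (not part of the original pipeline): determine_label's
# two-threshold mapping. The scores below are arbitrary; the real thresholds
# live in src.application.config.
def _example_determine_label() -> None:
    for score in (0.99, 0.85, 0.30):
        label, is_paraphrased = determine_label(score)
        print(f"score={score} -> label={label}, paraphrase={is_paraphrased}")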


if __name__ == "__main__":
    pass