news_verification / src /application /text /search_detection.py
pmkhanh7890's picture
refactor code + fix bug of label after grouping url
00b1038
raw
history blame
9.25 kB
"""
Author: Khanh Phan
Date: 2024-12-04
"""
import warnings
from typing import Optional
import numpy as np
from pandas import DataFrame
from sentence_transformers import util
from src.application.config import (
DEVICE,
MAX_CHAR_SIZE,
PARAPHRASE_MODEL,
PARAPHRASE_THRESHOLD,
PARAPHRASE_THRESHOLD_HUMAN,
PARAPHRASE_THRESHOLD_MACHINE,
TOP_URLS_PER_SEARCH,
)
from src.application.text.helper import split_into_sentences
from src.application.text.search import (
generate_search_phrases,
search_by_google,
)
from src.application.url_reader import URLReader
warnings.simplefilter(action="ignore", category=FutureWarning)
def find_sentence_source(
    text: list,
    text_index: int,
    sentences_df: DataFrame,
) -> tuple[DataFrame, list]:
    """
    Finds the source URL for a given sentence by searching Google
    and checking for paraphrases.

    Args:
        text (list): A list of sentences.
        text_index (int): The index of the sentence to find the source for.
        sentences_df (DataFrame): A DF to store sentence information
            (rows indexed like ``text``; columns include "input", "source",
            "label", "similarity", "paraphrase", "url").

    Returns:
        tuple: A tuple of the updated sentences_df and a list of image URLs.
            If a source is found, the DF is updated with source information
            and the source page's images are returned.
            If no source is found, the DF is updated with the original input
            and an empty image list is returned.
    """
    checked_urls = (
        set()
    )  # Keep track of visited URLs to avoid redundant checks
    # Build several candidate search phrases from the target sentence.
    searched_phrases = generate_search_phrases(text[text_index])
    for candidate in searched_phrases:
        # Search Google for the generated phrase
        search_results = search_by_google(candidate)
        # Extract URLs from search results
        urls = [item["link"] for item in search_results.get("items", [])]
        # Check only the first TOP_URLS_PER_SEARCH URLs of each result page
        for url in urls[:TOP_URLS_PER_SEARCH]:
            if url in checked_urls:  # Skip already checked URLs
                continue
            if "bbc.com" not in url:  # TODO: remove when releasing
                continue
            checked_urls.add(url)
            print(f"\t\tChecking URL: {url}")
            # Download and extract the page title/body/images.
            content = URLReader(url)
            if content.is_extracted is True:
                if content.title is None or content.text is None:
                    print("\t\t\t↑↑↑ Title or text not found")
                    continue
                source_text = content.title + "\n" + content.text
                # Skip pages too large for the paraphrase model to handle.
                if len(source_text) > MAX_CHAR_SIZE:
                    print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
                    continue
                print(f"\t\t\t↑↑↑ Title: {content.title}")
                # Align the target sentence against this source page.
                # NOTE(review): check_paraphrase may return {} for empty
                # inputs; the key accesses below would raise KeyError in
                # that case — confirm inputs are always non-empty here.
                aligned_sentence = check_paraphrase(
                    text[text_index],
                    source_text,
                    url,
                )
                # Explicit non-paraphrase verdict: record it and stop.
                if aligned_sentence["paraphrase"] is False:
                    sentences_df.loc[text_index, "input"] = aligned_sentence[
                        "input"
                    ]
                    sentences_df.loc[text_index, "paraphrase"] = (
                        aligned_sentence["paraphrase"]
                    )
                    return sentences_df, []
                # Above the threshold we also persist the matched source
                # text, score and URL; below it only input/label/paraphrase.
                if aligned_sentence["similarity"] > PARAPHRASE_THRESHOLD:
                    columns = [
                        "input",
                        "source",
                        "label",
                        "similarity",
                        "paraphrase",
                        "url",
                    ]
                else:
                    columns = [
                        "input",
                        "label",
                        "paraphrase",
                    ]
                for c in columns:
                    if c in sentences_df.columns:
                        sentences_df.loc[text_index, c] = aligned_sentence[c]
                # Check other sentences for better matches in the same source
                for idx, _ in sentences_df.iterrows():
                    similarity = sentences_df.loc[idx, "similarity"]
                    if similarity is not None:
                        # Already a strong (machine-level) match: keep it.
                        if similarity > PARAPHRASE_THRESHOLD_MACHINE:
                            continue
                    aligned_sentence = check_paraphrase(
                        text[idx],
                        source_text,
                        url,
                    )
                    # Only overwrite when this source scores better than
                    # whatever was previously stored for that row.
                    if (
                        similarity is None
                        or aligned_sentence["similarity"] > similarity
                    ):
                        if (
                            aligned_sentence["similarity"]
                            > PARAPHRASE_THRESHOLD
                        ):
                            columns = [
                                "input",
                                "source",
                                "label",
                                "similarity",
                                "url",
                            ]
                        else:
                            columns = [
                                "input",
                                "label",
                            ]
                        for c in columns:
                            if c in sentences_df.columns:
                                sentences_df.loc[idx, c] = aligned_sentence[c]
                return sentences_df, content.images
    # If no source is found, update the DF with the original input
    sentences_df.loc[text_index, "input"] = text[text_index]
    return sentences_df, []
def check_paraphrase(input_text: str, source_text: str, url: str) -> dict:
    """
    Checks if the input text is a paraphrase of the source text
    by comparing sentence-level similarities.

    Args:
        input_text (str): The text to be checked for paraphrasing.
        source_text (str): The source text to compare against.
        url (str): The URL of the source text (for storing in the result).

    Returns:
        dict: A dictionary containing the alignment information, including:
            - "input": Concatenated input sentences.
            - "source": Concatenated best-matched source sentences.
            - "similarity": Average cosine similarity score.
            - "label": Label determined based on the average similarity.
            - "paraphrase": Boolean indicating if it's a paraphrase.
            - "url": The source URL.
        NOTE(review): returns an empty dict when either text yields no
        sentences — callers indexing into the result should guard this.
    """
    # Extract sentences from input text and web page; bail out early
    # when there is nothing to compare on either side.
    input_sentences = split_into_sentences(input_text)
    if not source_text:
        return {}
    source_sentences = split_into_sentences(source_text)
    if not input_sentences or not source_sentences:
        return {}

    # Handle external references in source sentences.
    # BBC articles tag external links with ", external"; add a cleaned
    # copy of each such sentence so it can also be matched.
    additional_sentences = []
    for sentence in source_sentences:
        if ", external" in sentence:
            additional_sentences.append(sentence.replace(", external", ""))
    source_sentences.extend(additional_sentences)

    # Encode sentences into embeddings using the paraphrase model.
    embeddings1 = PARAPHRASE_MODEL.encode(
        input_sentences,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )
    embeddings2 = PARAPHRASE_MODEL.encode(
        source_sentences,
        convert_to_tensor=True,
        device=DEVICE,
        show_progress_bar=False,
    )

    # Compute cosine similarity matrix (rows: input, cols: source).
    similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()

    # Align each input sentence with its most similar source sentence.
    inputs = ""
    sources = ""
    similarities = []
    for i, sentence in enumerate(input_sentences):
        max_sim_index = np.argmax(similarity_matrix[i])
        similarities.append(similarity_matrix[i][max_sim_index])
        inputs += sentence + " "
        sources += source_sentences[max_sim_index] + " "

    # Calculate average similarity and determine paraphrase label.
    similarity = sum(similarities) / len(similarities)
    # BUG FIX: the label was previously derived from the max similarity of
    # the *last* input sentence only (stale loop variable). Use the average
    # similarity, as the docstring and the comment above describe.
    label, is_paraphrased = determine_label(similarity)

    # Create the alignment dictionary
    alignment = {
        "input": inputs,
        "source": sources,
        "similarity": similarity,
        "label": label,
        "paraphrase": is_paraphrased,
        "url": url,
    }
    print(f'Result: [{alignment["similarity"]}] {alignment["source"]}')
    return alignment
def determine_label(similarity: float) -> tuple[Optional[str], bool]:
    """
    Determines a label and paraphrase status based on the similarity score.

    Args:
        similarity (float): The similarity score between two texts.

    Returns:
        tuple: A tuple containing the label (str or None)
            and a boolean indicating if it's a paraphrase.
    """
    # Walk the threshold ladder from strictest to loosest; the first
    # threshold the score clears decides the label.
    thresholds = (
        (PARAPHRASE_THRESHOLD_HUMAN, "HUMAN"),  # Human paraphrase
        (PARAPHRASE_THRESHOLD_MACHINE, "MACHINE"),  # Machine paraphrase
    )
    for cutoff, label in thresholds:
        if similarity >= cutoff:
            return label, True
    return None, False  # Not a paraphrase
if __name__ == "__main__":
pass