news_verification / src /application /content_detection.py
pmkhanh7890's picture
grouping input news
a6b0abd
raw
history blame
16.6 kB
import difflib
import html
from difflib import SequenceMatcher

from src.application.highlight_text import generate_color
from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
from src.application.text.model_detection import detect_text_by_ai_model
from src.application.text.preprocessing import split_into_sentences
from src.application.text.search_detection import check_human, detect_text_by_relative_search
class NewsVerification:
    """Collect text/image origin predictions for one news item and render them.

    The visible call order appears to be: ``load_news()`` to store the inputs,
    ``generate_analysis_report()`` to run the text and image checks, then
    ``analyze_details()`` to build the HTML comparison table.
    """

    def __init__(self):
        # Raw inputs; populated by load_news().
        self.news_text = ""
        self.news_title = ""
        self.news_content = ""
        self.news_image = ""

        # NOTE(review): nothing in this class assigns these three text
        # attributes, yet determine_news_origin() reads label/score as scalars.
        self.text_prediction_label: list[str] = []
        self.text_prediction_score: list[float] = []
        self.text_referent_url: list[str] = []

        # detect_image_origin() overwrites these with scalars
        # (label string, numeric score, URL string or None).
        self.image_prediction_label: list[str] = []
        self.image_prediction_score: list[float] = []
        self.image_referent_url: list[str] = []

        self.news_prediction_label = ""
        self.news_prediction_score = -1

        # NOTE(review): the list is seeded with a hard-coded image URL, so that
        # image is always a candidate in detect_image_origin() — confirm this
        # is intended and not a debugging leftover.
        self.found_img_url: list[str] = ["https://ichef.bbci.co.uk/ace/standard/819/cpsprodpb/8acc/live/86282470-defb-11ef-ba00-65100a906e68.jpg"]
        self.aligned_sentences: list[dict] = []
        self.is_paraphrased: list[bool] = []
        self.analyzed_table: list[list] = []

    def load_news(self, news_title, news_content, news_image):
        """Store the news item; the analysed text is title + blank line + content."""
        self.news_text = news_title + "\n\n" + news_content
        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

    @staticmethod
    def _new_ai_sentence():
        """Return an empty accumulator for input text that matched no source."""
        return {
            "input_sentence": "",
            "matched_sentence": "",
            "label": "",
            "similarity": None,
            "paraphrase": False,
            "url": "",
        }

    def _flush_ai_sentence(self, ai_sentence):
        """Label the accumulated unmatched text with the AI model and record it."""
        label, score = detect_text_by_ai_model(ai_sentence["input_sentence"])
        ai_sentence["label"] = label
        ai_sentence["similarity"] = score
        self.aligned_sentences.append(ai_sentence)

    def determine_text_origin(self):
        """Align every input sentence with a found source or with the AI model.

        Takes no arguments and returns nothing; it works on ``self.news_text``
        and appends to ``self.aligned_sentences`` and ``self.found_img_url``.

        Each sentence is looked up with ``detect_text_by_relative_search``.
        Runs of sentences without a match are concatenated into one entry and
        labelled by ``detect_text_by_ai_model``; matched entries are labelled
        "HUMAN" or "MACHINE" according to ``check_human``.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        input_sentences = split_into_sentences(self.news_text)

        # Start below 0 so that the first sentence is not skipped by the
        # "already covered" guard.
        # Assumes detect_text_by_relative_search returns the index of the last
        # sentence it consumed — TODO confirm against that helper.
        current_index = -1
        ai_sentence = self._new_ai_sentence()

        for index, sentence in enumerate(input_sentences):
            if current_index >= index:
                # Already covered by an earlier search result.
                continue
            print(f"-------index = {index}-------")
            (
                paraphrase,
                _text_url,
                searched_sentences,
                img_urls,
                current_index,
            ) = detect_text_by_relative_search(input_sentences, index)

            if paraphrase is False:
                # No source found: keep accumulating until a match or the end.
                ai_sentence["input_sentence"] += sentence
                continue

            # A source was found: first record the unmatched run preceding it.
            if ai_sentence["input_sentence"] != "":
                self._flush_ai_sentence(ai_sentence)
                # A fresh dict is required; the old one now lives in
                # self.aligned_sentences.
                ai_sentence = self._new_ai_sentence()

            if searched_sentences["input_sentence"] != "":
                self.found_img_url.extend(img_urls)
                if check_human(searched_sentences):
                    searched_sentences["label"] = "HUMAN"
                else:
                    searched_sentences["label"] = "MACHINE"
                self.aligned_sentences.append(searched_sentences)

        # Record a trailing unmatched run even when the final index was skipped.
        if ai_sentence["input_sentence"] != "":
            self._flush_ai_sentence(ai_sentence)

    def _set_image_result(self, label, score, url):
        """Store the image verdict in the three image_* attributes."""
        self.image_prediction_label = label
        self.image_prediction_score = score
        self.image_referent_url = url

    def detect_image_origin(self):
        """Decide the image's origin, trying the strategies in a fixed order.

        Order: images found during the text search, then reverse image search,
        then the AI-image model. A match in either of the first two is labelled
        "HUMAN". With no input image the result is "UNKNOWN" with score 0.0;
        when every strategy fails it is "UNKNOWN" with score 50.
        """
        print("CHECK IMAGE:")
        if self.news_image is None:
            self._set_image_result("UNKNOWN", 0.0, None)
            return

        for image in self.found_img_url:
            print(f"\tfound_img_url: {image}")

        matched_url, similarity = detect_image_from_news_image(self.news_image, self.found_img_url)
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self._set_image_result("HUMAN", similarity, matched_url)
            return

        matched_url, similarity = detect_image_by_reverse_search(self.news_image)
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self._set_image_result("HUMAN", similarity, matched_url)
            return

        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            print(f"detected_label: {detected_label} ({score})")
            self._set_image_result(detected_label, score, None)
            return

        self._set_image_result("UNKNOWN", 50, None)

    @staticmethod
    def _human_score(label, score):
        """Convert a (label, score) pair into a 0-100 'human' score."""
        if label == "MACHINE":
            return 100 - score
        if label == "UNKNOWN":
            return 50
        return score

    def determine_news_origin(self):
        """Average the text and image scores into an overall label and score.

        A "MACHINE" score is mirrored (100 - score) and "UNKNOWN" counts as 50,
        so both inputs are on a 'human' scale before averaging. An average
        above 50 yields "HUMAN"; otherwise "MACHINE" with the mirrored score.

        NOTE(review): requires the text/image label and score attributes to
        hold scalars; the text ones are not assigned anywhere in this class.
        """
        text_score = self._human_score(self.text_prediction_label, self.text_prediction_score)
        image_score = self._human_score(self.image_prediction_label, self.image_prediction_score)
        news_prediction_score = (text_score + image_score) / 2
        if news_prediction_score > 50:
            self.news_prediction_score = news_prediction_score
            self.news_prediction_label = "HUMAN"
        else:
            self.news_prediction_score = 100 - news_prediction_score
            self.news_prediction_label = "MACHINE"

    def generate_analysis_report(self):
        """Run the text check followed by the image check."""
        self.determine_text_origin()
        self.detect_image_origin()

    def analyze_details(self):
        """Rebuild ``self.analyzed_table`` and return the HTML comparison table.

        Returns an empty string when there is no aligned sentence to show.
        """
        self.analyzed_table = []
        for pair in self.aligned_sentences:
            # NOTE(review): format_text_row() indexes self.aligned_sentences by
            # table position, so a pair skipped here would shift later rows.
            if "input_sentence" not in pair:
                continue
            self.analyzed_table.append(
                self.highlight_overlap_by_word_to_list(
                    pair["input_sentence"],
                    pair["matched_sentence"],
                ),
            )
        if not self.analyzed_table:
            return ""
        return self.create_table()

    def highlight_overlap_by_word_to_list(self, text1, text2):
        """Find the words shared by two texts, compared as word sequences.

        Returns a 4-tuple:
        - list of words in text1
        - list of words in text2
        - indexes of the words of text1 that also occur in a matching block
        - indexes of the words of text2 that also occur in a matching block
        """
        # Split on whitespace so that matching happens word by word.
        words1 = text1.split()
        words2 = text2.split()
        index1 = []
        index2 = []
        matcher = SequenceMatcher(None, words1, words2)
        # The final block always has length 0 and contributes nothing.
        for start1, start2, length in matcher.get_matching_blocks():
            index1.extend(range(start1, start1 + length))
            index2.extend(range(start2, start2 + length))
        return words1, words2, index1, index2

    def get_text_urls(self):
        """Return the distinct source URLs recorded for the text."""
        return set(self.text_referent_url)

    def generate_colors_list(self, set_urls):
        """Map each position 0..len(set_urls)-1 to a colour from generate_color."""
        num_urls = len(set_urls)
        return {i: generate_color(i, num_urls) for i in range(num_urls)}

    def analyze_details_2(self):
        """Character-level variant of analyze_details (appears unfinished).

        Returns ``(html_text, html_table)``. As written, nothing fills
        ``html_text`` or ``self.analyzed_table``, so both come back empty.
        """
        html_text = ""
        self.analyzed_table = []
        color_dict = self.generate_colors_list(self.get_text_urls())
        # generate_colors_list() uses integer keys; .get() also tolerates the
        # case where no URL (and therefore no colour) is available.
        color = color_dict.get(0)  # TODO: set color
        # Running character offset of the current sentence in the input text.
        position = 0
        for pair in self.aligned_sentences:
            if "input_sentence" not in pair:
                continue
            _common_phrases, position = self.compare_sentences(
                pair["input_sentence"],
                pair["matched_sentence"],
                position,
                color,
            )
        html_table = self.create_table() if self.analyzed_table else ""
        return html_text, html_table

    def compare_sentences(self, sentence_1, sentence_2, position, color):
        """Find the character runs shared by two sentences.

        Args:
            sentence_1: The input sentence.
            sentence_2: The sentence to compare against.
            position: Offset of sentence_1 inside the whole input text.
            color: Value stored unchanged in every returned phrase.

        Returns:
            A tuple ``(common_phrases, new_position)``. Each phrase is a dict
            with "phrase", "start_1"/"end_1" (shifted by ``position``),
            "start_2"/"end_2" (relative to sentence_2) and "color".
            ``new_position`` is ``position`` plus the length of sentence_1.
            If either sentence is empty the phrase list is empty, but the
            position is still advanced so later offsets stay correct.
        """
        new_position = position + len(sentence_1 or "")
        if not sentence_1 or not sentence_2:
            return [], new_position

        matcher = SequenceMatcher(None, sentence_1, sentence_2)
        common_phrases = [
            {
                "phrase": sentence_1[block.a:block.a + block.size],
                "start_1": block.a + position,
                "end_1": block.a + block.size + position,
                "start_2": block.b,
                "end_2": block.b + block.size,
                "color": color,
            }
            for block in matcher.get_matching_blocks()
            if block.size > 0  # the terminating block has size 0
        ]
        return common_phrases, new_position

    def create_table(self):
        """Render the image row plus one row per analysed sentence as HTML."""
        max_length = 30  # TODO: put this in configuration
        rows = [self.format_image_row(max_length)]
        rows.extend(
            self.format_text_row(row, index, max_length)
            for index, row in enumerate(self.analyzed_table)
        )
        table = "\n".join(rows)
        # The markup ends at </table>; no dangling <style> tag is emitted.
        return (
            "\n"
            "<h5>Comparison between input news and source news</h5>\n"
            '<table border="1" style="width:100%; text-align:left; border-collapse:collapse;">\n'
            "<thead>\n"
            "<tr>\n"
            "<th>Input news</th>\n"
            "<th>Source (URL provided in Originality column correspondingly)</th>\n"
            "<th>Forensic</th>\n"
            "<th>Originality</th>\n"
            "</tr>\n"
            "</thead>\n"
            "<tbody>\n"
            f"{table}\n"
            "</tbody>\n"
            "</table>\n"
        )

    def format_text_row(self, row, index=0, max_length=30):
        """Render one sentence pair as a table row.

        Args:
            row: ``(input_words, source_words, input_indexes, source_indexes)``.
            index: Position of the matching entry in ``self.aligned_sentences``.
            max_length: Maximum number of URL characters shown as link text.
        """
        input_sentence = self.highlight_text(row[0], row[2])
        source_sentence = self.highlight_text(row[1], row[3])

        aligned = self.aligned_sentences[index]
        url = aligned["url"]
        if url:
            href = html.escape(url, quote=True)
            short_url = html.escape(self.shorten_url(url, max_length))
            source_text_url = f"""<a href="{href}">{short_url}</a>"""
        else:
            source_text_url = ""

        label = aligned["label"]
        score = aligned["similarity"]
        # "similarity" starts out as None; do not crash on it.
        score_text = "N/A" if score is None else f"{score*100:.2f}%"
        return f"""<tr><td>{input_sentence}</td><td>{source_sentence}</td><td>{label}<br>({score_text})</td><td>{source_text_url}</td></tr>"""

    def format_image_row(self, max_length=30):
        """Render the image verdict as a table row.

        Shows the reference image and a link when a reference URL is set,
        otherwise the text "Image not found" and an empty link cell.
        """
        url = self.image_referent_url
        if url:  # None, "" and the initial empty list all mean "no reference"
            href = html.escape(url, quote=True)
            short_url = html.escape(self.shorten_url(url, max_length))
            source_image = f"""<img src="{href}" width="200" height="150">"""
            source_image_url = f"""<a href="{href}">{short_url}</a>"""
        else:
            source_image = "Image not found"
            source_image_url = ""
        return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""

    def shorten_url(self, url, max_length=30):
        """Return ``url`` cut to ``max_length`` characters plus "..." if longer.

        ``None`` yields an empty string.
        """
        if url is None:
            return ""
        if len(url) > max_length:
            return url[:max_length] + "..."
        return url

    def highlight_text(self, words, indexes):
        """Join ``words`` with spaces, wrapping the given positions in a span.

        Every word is HTML-escaped because it comes from the input news or a
        fetched page. ``words`` itself is left unmodified so the same row can
        be rendered more than once.
        """
        highlighted = set(indexes)
        rendered = []
        for position, word in enumerate(words):
            safe_word = html.escape(word)
            if position in highlighted:
                safe_word = (
                    f"<span style='color:#00FF00; font-weight:bold;'>{safe_word}</span>"
                )
            rendered.append(safe_word)
        return " ".join(rendered)