from difflib import SequenceMatcher

from src.application.image.image_detection import (
    detect_image_by_ai_model,
    detect_image_by_reverse_search,
    detect_image_from_news_image,
)
from src.application.text.model_detection import detect_text_by_ai_model
from src.application.text.search_detection import (
    check_human,
    detect_text_by_relative_search,
)


class NewsVerification:
    """Run text and image origin checks on one news item and render reports.

    Typical use: ``load_news(...)`` followed by ``generate_analysis_report()``
    and, for the sentence-level comparison, ``analyze_details()``.
    """

    # Maximum number of URL characters shown in the report.
    _URL_DISPLAY_LENGTH = 40

    def __init__(self):
        """Initialise every field to its "nothing analysed yet" value."""
        # Raw input.
        self.news_text = ""
        self.news_title = ""
        self.news_content = ""
        self.news_image = ""

        # Text verdict; the score stays at -1 until a check has run.
        self.text_prediction_label = ""
        self.text_prediction_score = -1
        self.text_referent_url = None

        # Image verdict; the score stays at -1 until a check has run.
        self.image_prediction_label = ""
        self.image_prediction_score = -1
        self.image_referent_url = None

        # Combined verdict for the whole news item.
        self.news_prediction_label = ""
        self.news_prediction_score = -1

        # Results filled in by the text search step.
        self.found_img_url = []
        self.aligned_sentences = []
        self.is_paraphrased = False

    def load_news(self, news_title, news_content, news_image):
        """Store the news item to analyse.

        Args:
            news_title: Title of the news item.
            news_content: Body of the news item.
            news_image: Image reference; ``None`` skips the image checks.
        """
        self.news_text = news_title + "\n\n" + news_content
        self.news_title = news_title
        self.news_content = news_content
        self.news_image = news_image

    def determine_text_origin(self):
        """Classify ``self.news_text`` and store the verdict on the instance.

        The search-based check runs first.  When it reports a paraphrase the
        label becomes "HUMAN" or "MACHINE" (decided by ``check_human``) with
        a score of 100.  Otherwise the label is "UNKNOWN" and the AI model
        supplies both label and score.

        Updates ``is_paraphrased``, ``text_referent_url``,
        ``aligned_sentences``, ``found_img_url``, ``text_prediction_label``
        and ``text_prediction_score``.  Returns nothing.
        """
        print("CHECK TEXT:")
        print("\tFrom search engine:")
        # Classify by search engine
        (
            self.is_paraphrased,
            self.text_referent_url,
            self.aligned_sentences,
            self.found_img_url,
        ) = detect_text_by_relative_search(self.news_text)

        if self.is_paraphrased is False:
            self.text_prediction_label = "UNKNOWN"
        else:
            self.text_prediction_score = 100
            if check_human(self.aligned_sentences):
                self.text_prediction_label = "HUMAN"
            else:
                self.text_prediction_label = "MACHINE"

        # Classify text by AI model
        print("\tFrom AI model:")
        if self.text_prediction_label == "UNKNOWN":
            (
                self.text_prediction_label,
                self.text_prediction_score,
            ) = detect_text_by_ai_model(self.news_text)
            # Scale to the 0-100 range used by the search-based branch.
            # NOTE(review): scaling is applied only to the model's score;
            # confirm this matches the intended nesting.
            self.text_prediction_score *= 100

    def _set_image_result(self, label, score, referent_url):
        """Store one image verdict (label, score, referent URL)."""
        self.image_prediction_label = label
        self.image_prediction_score = score
        self.image_referent_url = referent_url

    def detect_image_origin(self):
        """Classify ``self.news_image`` and store the verdict on the instance.

        Checks are tried in order and the first one that yields a result
        wins: images found by the text search, reverse image search, then the
        AI model.  A match from either search is labelled "HUMAN".  With no
        image the verdict is "UNKNOWN" with score 0.0; with no result from
        any check it is "UNKNOWN" with score 50.
        """
        print("CHECK IMAGE:")
        if self.news_image is None:
            self._set_image_result("UNKNOWN", 0.0, None)
            return

        print(f"\t: Img path: {self.news_image}")

        matched_url, similarity = detect_image_from_news_image(
            self.news_image,
            self.found_img_url,
        )
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self._set_image_result("HUMAN", similarity, matched_url)
            return

        matched_url, similarity = detect_image_by_reverse_search(self.news_image)
        if matched_url is not None:
            print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
            self._set_image_result("HUMAN", similarity, matched_url)
            return

        detected_label, score = detect_image_by_ai_model(self.news_image)
        if detected_label:
            self._set_image_result(detected_label, score, None)
            return

        self._set_image_result("UNKNOWN", 50, None)

    @staticmethod
    def _human_score(label, score):
        """Convert a (label, score) verdict into a "how human" score.

        "MACHINE" verdicts are mirrored (``100 - score``), "UNKNOWN" counts
        as 50, and any other label keeps its score unchanged.
        """
        if label == "MACHINE":
            return 100 - score
        if label == "UNKNOWN":
            return 50
        return score

    def determine_news_origin(self):
        """Combine the text and image verdicts into the overall verdict.

        The two "how human" scores are averaged.  An average above 50 yields
        "HUMAN" with that average as the score; otherwise the label is
        "MACHINE" and the score is ``100 - average``.
        """
        text_prediction_score = self._human_score(
            self.text_prediction_label,
            self.text_prediction_score,
        )
        image_prediction_score = self._human_score(
            self.image_prediction_label,
            self.image_prediction_score,
        )

        news_prediction_score = (text_prediction_score + image_prediction_score) / 2
        if news_prediction_score > 50:
            self.news_prediction_score = news_prediction_score
            self.news_prediction_label = "HUMAN"
        else:
            self.news_prediction_score = 100 - news_prediction_score
            self.news_prediction_label = "MACHINE"

    def _format_referent(self, url):
        """Return the report entry for a referent URL (``None`` = no referent).

        The URL is cut to ``_URL_DISPLAY_LENGTH`` characters of *its own*
        text and followed by "...".
        """
        if url is None:
            return "\n  • No referent information\n  • "
        shown = url[: self._URL_DISPLAY_LENGTH]
        return f"\n  • Referred news: {shown}...\n  • "

    def generate_analysis_report(self):
        """Run all three origin checks and return the report string.

        Returns:
            str: The rendered report, including the overall verdict and its
            confidence formatted with two decimals.
        """
        self.determine_text_origin()
        self.detect_image_origin()
        self.determine_news_origin()

        # NOTE(review): several values computed below are not referenced by
        # the template as it stands (the per-text/per-image sentences, the
        # referent entries, the misinformation placeholders and the
        # description).  They are kept on purpose - confirm whether template
        # markup that used them has been lost.

        # Forensic analysis
        if self.text_prediction_label == "MACHINE":
            text_prediction_label = "The text is modified by GPT-4o (AI)"
        else:
            text_prediction_label = "The text is written by HUMAN"

        if self.image_prediction_label == "MACHINE":
            image_prediction_label = "The image is generated by Dall-e (AI)"
        else:
            image_prediction_label = "The image is generated by HUMAN"

        if self.news_prediction_label == "MACHINE":
            news_prediction_label = "The whole news generated by AI"
        else:
            news_prediction_label = "The whole news written by HUMAN"

        # Misinformation analysis (hard-coded placeholder values)
        out_of_context_results = "cohesive"
        if out_of_context_results == "cohesive":
            out_of_context_results = (
                "The input news is cohesive (non-out-of-context)"
            )
        else:
            out_of_context_results = "The input news is out-of-context"
        out_of_context_prediction_score = 96.7

        # Description
        description = (
            "The description should be concise, clear, and aimed at helping "
            "general readers understand the case."
        )

        # Each URL is shortened using its own length.  The image entry used
        # to read the *text* URL's length, which raised TypeError whenever
        # the text had no referent.
        # NOTE(review): the image entry also carries the "Referred news: "
        # label; left unchanged because it is user-facing text.
        referred_news = self._format_referent(self.text_referent_url)
        referred_image = self._format_referent(self.image_referent_url)

        html_template = (
            "\n"
            "\n"
            "    Originality:\n"
            "\n"
            "    Forensic:\n"
            "\n"
            f"    {news_prediction_label} "
            f"(confidence = {self.news_prediction_score:.2f}%)\n"
            "\n"
            "    Misinformation (placeholder):\n"
            "\n"
            "    Description (optional, placeholder):\n"
            "\n"
            "    "
        )
        return html_template

    def analyze_details(self):
        """Build the comparison table for ``self.aligned_sentences``.

        Each aligned pair must provide the keys "input_sentence" and
        "matched_sentence".

        Returns:
            str: The table produced by ``create_table``, or "" when there are
            no aligned sentences.
        """
        final_table = [
            self.highlight_overlap_by_word_to_list(
                pair["input_sentence"],
                pair["matched_sentence"],
            )
            for pair in self.aligned_sentences
        ]
        if not final_table:
            return ""
        return self.create_table(final_table)

    def highlight_overlap_by_word_to_list(self, text1, text2):
        """Find the words that two texts have in common.

        Both texts are split on whitespace and compared word by word with
        ``SequenceMatcher``.

        Returns:
            tuple: ``(words1, words2, index1, index2)`` where ``words1`` and
            ``words2`` are the word lists of ``text1`` and ``text2``, and
            ``index1`` / ``index2`` are the positions of the matching words
            in each list.
        """
        # Split the strings into words on whitespace.
        words1 = text1.split()
        words2 = text2.split()

        index1 = []
        index2 = []

        # Use SequenceMatcher to find the runs shared by both word lists.
        matcher = SequenceMatcher(None, words1, words2)
        for start1, start2, length in matcher.get_matching_blocks():
            # The terminating block has length 0 and contributes nothing.
            index1.extend(range(start1, start1 + length))
            index2.extend(range(start2, start2 + length))

        return words1, words2, index1, index2

    def create_table(self, data):
        """Render the comparison table.

        Args:
            data: Iterable of ``(input_words, source_words, input_indexes,
                source_indexes)`` tuples, one per sentence pair.

        Returns:
            str: The table text with one ``format_pair`` row per tuple.
        """
        table_rows = "\n".join(self.format_pair(pair) for pair in data)
        return (
            "\n"
            "    Comparison between input news and source news\n"
            f"    {table_rows}\n"
            "    Input sentence Source sentence\n"
            "    "
        )

    def format_pair(self, pair):
        """Render one table row from a ``(words, words, idx, idx)`` tuple."""
        input_sentence = self.highlight_text(pair[0], pair[2])
        source_sentence = self.highlight_text(pair[1], pair[3])
        return f"{input_sentence}{source_sentence}"

    def highlight_text(self, words, indexes):
        """Join ``words`` into a sentence, rewriting the words at ``indexes``.

        Works on a copy so the caller's list is never modified.

        NOTE(review): the replacement currently equals the word itself;
        presumably highlight markup belongs around it - confirm.
        """
        final_words = list(words)
        for index in indexes:
            final_words[index] = f"{words[index]}"
        return " ".join(final_words)