pmkhanh7890 committed on
Commit
b73a4fc
·
1 Parent(s): 0827f9d

refactor code and fix bugs

Browse files
application.py CHANGED
@@ -50,8 +50,8 @@ def generate_analysis_report(
50
  ):
51
  news_analysis = NewsVerification()
52
  news_analysis.load_news(news_title, news_content, news_image)
53
- news_analysis.generate_analysis_report()
54
- return news_analysis.analyze_details()
55
 
56
 
57
  # Define the GUI
 
50
  ):
51
  news_analysis = NewsVerification()
52
  news_analysis.load_news(news_title, news_content, news_image)
53
+ news_analysis.determine_origin()
54
+ return news_analysis.generate_report()
55
 
56
 
57
  # Define the GUI
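
For orientation, a minimal sketch (not part of the commit) of how the refactored callback is now wired — the import path and GUI binding are assumptions based on the file layout shown in this diff:

from src.application.content_detection import NewsVerification

def generate_analysis_report(news_title, news_content, news_image):
    news_analysis = NewsVerification()
    news_analysis.load_news(news_title, news_content, news_image)
    news_analysis.determine_origin()        # replaces the old generate_analysis_report()
    return news_analysis.generate_report()  # replaces analyze_details(); returns three HTML tables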
src/application/config.py CHANGED
@@ -1,8 +1,8 @@
1
- # Download necessary NLTK data files
2
  """
3
  Author: Khanh Phan
4
  Date: 2024-12-04
5
  """
 
6
  import os
7
 
8
  import nltk
@@ -22,6 +22,7 @@ AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
22
  # GPT Model
23
  GPT_ENTITY_MODEL = "o1-mini" # "gpt-4o-mini" or "o1-mini"
24
  GPT_PARAPHRASE_MODELS = ["gpt-4o", "gpt-4o-mini"]
 
25
  AZUREOPENAI_CLIENT = openai.AzureOpenAI(
26
  api_version=AZURE_OPENAI_API_VERSION, # AZURE_OPENAI_API_VERSION,
27
  api_key=AZURE_OPENAI_API_KEY,
@@ -54,6 +55,7 @@ MAX_CHAR_SIZE = 30000
54
 
55
  # Number of top URLs per search
56
  TOP_URLS_PER_SEARCH = 3
 
57
 
58
  # Search parameters
59
  GOOGLE_ENDPOINT_URL = "https://www.googleapis.com/customsearch/v1"
@@ -80,5 +82,10 @@ factor = 0: Black.
80
  """
81
  ENTITY_LIGHTEN_COLOR = 2.2
82
  ENTITY_DARKEN_COLOR = 0.7
 
83
  ENTITY_SATURATION = 0.65 # Saturation: color's intensity (vividness).
84
  ENTITY_BRIGHTNESS = 0.75 # color's brightness.
 
 
 
 
 
 
1
  """
2
  Author: Khanh Phan
3
  Date: 2024-12-04
4
  """
5
+
6
  import os
7
 
8
  import nltk
 
22
  # GPT Model
23
  GPT_ENTITY_MODEL = "o1-mini" # "gpt-4o-mini" or "o1-mini"
24
  GPT_PARAPHRASE_MODELS = ["gpt-4o", "gpt-4o-mini"]
25
+ GPT_IMAGE_MODEL = "dall-e-3"
26
  AZUREOPENAI_CLIENT = openai.AzureOpenAI(
27
  api_version=AZURE_OPENAI_API_VERSION, # AZURE_OPENAI_API_VERSION,
28
  api_key=AZURE_OPENAI_API_KEY,
 
55
 
56
  # Number of top URLs per search
57
  TOP_URLS_PER_SEARCH = 3
58
+ MAX_URL_SIZE = 2 * 1024 * 1024 # ~2 MB
59
 
60
  # Search parameters
61
  GOOGLE_ENDPOINT_URL = "https://www.googleapis.com/customsearch/v1"
 
82
  """
83
  ENTITY_LIGHTEN_COLOR = 2.2
84
  ENTITY_DARKEN_COLOR = 0.7
85
+
86
  ENTITY_SATURATION = 0.65 # Saturation: color's intensity (vividness).
87
  ENTITY_BRIGHTNESS = 0.75 # color's brightness.
88
+
89
+
90
+ # HTML formatting
91
+ WORD_BREAK = "word-break: break-all;"
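
Illustration only — the call sites below are hypothetical and not part of this commit; they merely show how the new constants would typically be consumed:

from src.application.config import GPT_IMAGE_MODEL, MAX_URL_SIZE, WORD_BREAK

def is_downloadable(content_length: int) -> bool:
    # Hypothetical guard: skip pages whose payload exceeds ~2 MB.
    return content_length <= MAX_URL_SIZE

image_model = GPT_IMAGE_MODEL  # "dall-e-3", used by generate_fake_image() later in this commit
cell = f'<td style="{WORD_BREAK}">https://example.com/a/very/long/url</td>'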
src/application/content_detection.py CHANGED
@@ -1,7 +1,12 @@
1
- from difflib import SequenceMatcher
 
 
 
2
 
3
  import pandas as pd
4
 
 
 
5
  from src.application.image.image_detection import (
6
  detect_image_by_ai_model,
7
  detect_image_by_reverse_search,
@@ -11,24 +16,27 @@ from src.application.text.entity import (
11
  apply_highlight,
12
  highlight_entities,
13
  )
14
- from src.application.text.helper import extract_equal_text
 
 
 
 
15
  from src.application.text.model_detection import (
16
  detect_text_by_ai_model,
17
  predict_generation_model,
18
  )
19
- from src.application.text.preprocessing import split_into_paragraphs
20
- from src.application.text.search_detection import (
21
- PARAPHRASE_THRESHOLD_MACHINE,
22
- find_sentence_source,
23
- )
24
 
25
 
26
  class NewsVerification:
27
  def __init__(self):
28
- self.news_text = ""
29
- self.news_title = ""
30
- self.news_content = ""
31
- self.news_image = ""
 
 
 
32
 
33
  self.text_prediction_label: list[str] = ["UNKNOWN"]
34
  self.text_prediction_score: list[float] = [0.0]
@@ -37,8 +45,8 @@ class NewsVerification:
37
  self.image_prediction_score: list[str] = [0.0]
38
  self.image_referent_url: list[str] = []
39
 
40
- self.news_prediction_label = ""
41
- self.news_prediction_score = -1
42
 
43
  # news' urls to find img
44
  self.found_img_url: list[str] = []
@@ -52,8 +60,7 @@ class NewsVerification:
52
  "similarity",
53
  "paraphrase",
54
  "url",
55
- "group",
56
- "entities",
57
  ],
58
  )
59
  self.grouped_url_df: pd.DataFrame = pd.DataFrame()
@@ -63,95 +70,165 @@ class NewsVerification:
63
  self.fact_checker_table: list = []
64
  self.governor_table: list = []
65
 
66
- def load_news(self, news_title, news_content, news_image):
67
  self.news_text = (news_title + "\n\n" + news_content).strip()
68
  self.news_title = news_title
69
  self.news_content = news_content
70
  self.news_image = news_image
71
 
72
- def determine_text_origin(self):
73
- self.find_text_source()
 
 
 
74
 
75
- # Group inout and source by url
76
  def concat_text(series):
 
 
 
77
  return " ".join(
78
  series.astype(str).tolist(),
79
  ) # Handle mixed data types and NaNs
80
 
81
- self.grouped_url_df = self.aligned_sentences_df.groupby("url").agg(
82
- {
83
- "input": concat_text,
84
- "source": concat_text,
85
- },
86
- )
87
- self.grouped_url_df = self.grouped_url_df.reset_index()
 
 
 
 
 
88
  # Add new columns for label and score
89
  self.grouped_url_df["label"] = None
90
  self.grouped_url_df["score"] = None
91
 
92
  print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")
93
 
 
 
 
 
94
  for index, row in self.grouped_url_df.iterrows():
 
95
  label, score = self.verify_text(row["url"])
 
 
96
  if label == "UNKNOWN":
97
- # Concatenate text from "input" in sentence_df
98
  text = " ".join(row["input"])
99
 
100
- # detect by baseline model
101
  label, score = detect_text_by_ai_model(text)
102
 
103
  self.grouped_url_df.at[index, "label"] = label
104
  self.grouped_url_df.at[index, "score"] = score
105
 
106
- # Overall label or score for the whole input text
107
- if len(self.grouped_url_df) > 0:
108
  machine_label = self.grouped_url_df[
109
  self.grouped_url_df["label"].str.contains(
110
- "MACHINE",
111
  case=False,
112
  na=False,
113
  )
114
  ]
115
 
116
- if len(machine_label) > 0:
117
- label = " ".join(machine_label["label"].tolist())
 
 
 
 
 
118
  self.text_prediction_label[0] = label
119
  self.text_prediction_score[0] = machine_label["score"].mean()
120
  else:
 
121
  machine_label = self.aligned_sentences_df[
122
  self.aligned_sentences_df["label"] == "HUMAN"
123
  ]
124
  self.text_prediction_label[0] = "HUMAN"
125
  self.text_prediction_score[0] = machine_label["score"].mean()
126
- else: # no source found in the input text
 
127
  print("No source found in the input text")
128
  text = " ".join(self.aligned_sentences_df["input"].tolist())
129
- # detect by baseline model
 
130
  label, score = detect_text_by_ai_model(text)
131
  self.text_prediction_label[0] = label
132
  self.text_prediction_score[0] = score
133
 
134
  def find_text_source(self):
135
  """
136
- Determines the origin of the given text based on paraphrasing detection
137
- and human authorship analysis.
138
-
139
- Args:
140
- text: The input text to be analyzed.
141
 
142
- Returns:
143
- str: The predicted origin of the text:
144
- - "HUMAN": If the text is likely written by a human.
145
- - "MACHINE": If the text is likely generated by a machine.
146
  """
147
  print("CHECK TEXT:")
148
  print("\tFrom search engine:")
149
- # Classify by search engine
150
- # input_sentences = split_into_sentences(self.news_text)
151
  input_paragraphs = split_into_paragraphs(self.news_text)
152
 
153
- # Setup df for input_sentences
154
-
155
  for _ in range(len(input_paragraphs)):
156
  self.aligned_sentences_df = pd.concat(
157
  [
@@ -173,7 +250,7 @@ class NewsVerification:
173
  ignore_index=True,
174
  )
175
 
176
- # find a source for each sentence
177
  for index, _ in enumerate(input_paragraphs):
178
  similarity = self.aligned_sentences_df.loc[index, "similarity"]
179
  if similarity is not None:
@@ -188,23 +265,47 @@ class NewsVerification:
188
  index,
189
  self.aligned_sentences_df,
190
  )
191
-
 
 
 
192
  self.found_img_url.extend(img_urls)
193
 
194
- # determine if the whole source is from a news or not
195
-
196
  def verify_text(self, url):
197
  label = "UNKNOWN"
198
  score = 0
 
199
  # calculate the average similarity when the similarity score
200
  # in each row of sentences_df is higher than 0.8
 
 
201
  filtered_by_url = self.aligned_sentences_df[
202
  self.aligned_sentences_df["url"] == url
203
  ]
 
 
204
  filtered_by_similarity = filtered_by_url[
205
- filtered_by_url["similarity"] > 0.8
206
  ]
207
- if len(filtered_by_similarity) / len(self.aligned_sentences_df) > 0.5:
 
 
208
  # check if "MACHINE" is in self.aligned_sentences_df["label"]:
209
  contains_machine = (
210
  filtered_by_similarity["label"]
@@ -215,8 +316,10 @@ class NewsVerification:
215
  )
216
  .any()
217
  )
 
 
218
  if contains_machine:
219
- label = "MACHINE"
220
  machine_rows = filtered_by_similarity[
221
  filtered_by_similarity["label"].str.contains(
222
  "MACHINE",
@@ -225,9 +328,10 @@ class NewsVerification:
225
  )
226
  ]
227
  generated_model, _ = predict_generation_model(self.news_text)
228
- label += f"<br>({generated_model})"
229
  score = machine_rows["similarity"].mean()
230
  else:
 
231
  label = "HUMAN"
232
  human_rows = filtered_by_similarity[
233
  filtered_by_similarity["label"].str.contains(
@@ -241,13 +345,26 @@ class NewsVerification:
241
  return label, score
242
 
243
  def determine_image_origin(self):
244
  print("CHECK IMAGE:")
 
 
245
  if self.news_image is None:
246
  self.image_prediction_label = "UNKNOWN"
247
  self.image_prediction_score = 0.0
248
  self.image_referent_url = None
249
  return
250
 
 
 
251
  matched_url, similarity = detect_image_from_news_image(
252
  self.news_image,
253
  self.found_img_url,
@@ -259,6 +376,8 @@ class NewsVerification:
259
  self.image_referent_url = matched_url
260
  return
261
 
 
 
262
  matched_url, similarity = detect_image_by_reverse_search(
263
  self.news_image,
264
  )
@@ -269,6 +388,8 @@ class NewsVerification:
269
  self.image_referent_url = matched_url
270
  return
271
 
 
 
272
  detected_label, score = detect_image_by_ai_model(self.news_image)
273
  if detected_label:
274
  print(f"detected_label: {detected_label} ({score})")
@@ -277,18 +398,34 @@ class NewsVerification:
277
  self.image_referent_url = None
278
  return
279
 
 
280
  self.image_prediction_label = "UNKNOWN"
281
  self.image_prediction_score = 50
282
  self.image_referent_url = None
283
 
284
- def generate_analysis_report(self):
 
 
 
285
  if self.news_text != "":
286
  self.determine_text_origin()
287
  if self.news_image != "":
288
  self.determine_image_origin()
289
-
290
- def analyze_details(self):
291
  self.handle_entities()
292
  ordinary_user_table = self.create_ordinary_user_table()
293
  fact_checker_table = self.create_fact_checker_table()
294
  governor_table = self.create_governor_table()
@@ -296,6 +433,16 @@ class NewsVerification:
296
  return ordinary_user_table, fact_checker_table, governor_table
297
 
298
  def handle_entities(self):
299
  entities_with_colors = []
300
  for index, row in self.grouped_url_df.iterrows():
301
  # Get entity-words (in pair) with colors
@@ -304,51 +451,23 @@ class NewsVerification:
304
  row["source"],
305
  )
306
 
 
 
307
  for index, sentence in self.aligned_sentences_df.iterrows():
308
  if sentence["url"] == row["url"]:
 
309
  self.aligned_sentences_df.at[index, "entities"] = (
310
- entities_with_colors # must use at
311
  )
312
 
313
- def get_text_urls(self):
314
- return set(self.text_referent_url)
315
-
316
- def compare_sentences(self, sentence_1, sentence_2, position, color):
317
  """
318
- Compares two sentences and identifies common phrases,
319
- outputting their start and end positions.
320
 
 
 
321
  """
322
-
323
- if not sentence_1 or not sentence_2: # Handle empty strings
324
- return []
325
-
326
- s = SequenceMatcher(None, sentence_1, sentence_2)
327
- common_phrases = []
328
-
329
- for block in s.get_matching_blocks():
330
- if block.size > 0: # Ignore zero-length matches
331
- start_1 = block.a
332
- end_1 = block.a + block.size
333
- start_2 = block.b
334
- end_2 = block.b + block.size
335
-
336
- phrase = sentence_1[
337
- start_1:end_1
338
- ] # Or sentence_2[start_2:end_2], they are the same
339
-
340
- common_phrases.append(
341
- {
342
- "phrase": phrase,
343
- "start_1": start_1 + position,
344
- "end_1": end_1 + position,
345
- "start_2": start_2,
346
- "end_2": end_2,
347
- "color": color,
348
- },
349
- )
350
- position += len(sentence_1)
351
- return common_phrases, position
352
 
353
  def create_fact_checker_table(self):
354
  rows = []
@@ -387,7 +506,7 @@ class NewsVerification:
387
  if index == 0 or current_url != previous_url:
388
  first_url_row = True
389
  previous_url = current_url
390
- # Increase counter "span_row" when the next url is the same
391
  while (
392
  index + span_row < len(self.fact_checker_table)
393
  and self.fact_checker_table[index + span_row][4]
@@ -432,7 +551,7 @@ class NewsVerification:
432
  </table>
433
 
434
  <style>
435
- """
436
 
437
  def format_text_fact_checker_row(
438
  self,
@@ -467,12 +586,12 @@ class NewsVerification:
467
  entity_count = len(row[3])
468
 
469
  # Color overlapping words
470
- input_sentence = self.color_text(
471
  input_sentence,
472
  row[1],
473
  highlight_idx_input,
474
  ) # text, index of highlight words
475
- source_sentence = self.color_text(
476
  source_sentence,
477
  row[2],
478
  highlight_idx_source,
@@ -493,6 +612,7 @@ class NewsVerification:
493
  source_sentence = row[0]["source"]
494
 
495
  url = row[0]["url"]
 
496
  # Displayed label and score by url
497
  filterby_url = self.grouped_url_df[self.grouped_url_df["url"] == url]
498
  if len(filterby_url) > 0:
@@ -506,7 +626,7 @@ class NewsVerification:
506
  source_text_url = f"""<a href="{url}">{url}</a>"""
507
 
508
  # Format displayed entity count
509
- entity_count_text = self.get_entity_count_text(entity_count)
510
 
511
  border_top = "border-top: 1px solid transparent;"
512
  border_bottom = "border-bottom: 1px solid transparent;"
@@ -600,7 +720,7 @@ class NewsVerification:
600
  <style>
601
  """
602
 
603
- def format_text_ordinary_user_row(self, max_length=30):
604
  input_sentences = ""
605
  source_text_urls = ""
606
  urls = []
@@ -623,7 +743,7 @@ class NewsVerification:
623
  </tr>
624
  """
625
 
626
- def format_image_ordinary_user_row(self, max_length=30):
627
 
628
  if (
629
  self.image_referent_url is not None
@@ -720,12 +840,12 @@ class NewsVerification:
720
  )
721
 
722
  # Color overlapping words
723
- input_sentence = self.color_text(
724
  input_sentence,
725
  row[1],
726
  highlight_idx_input,
727
  ) # text, index of highlight words
728
- source_sentence = self.color_text(
729
  source_sentence,
730
  row[2],
731
  highlight_idx_source,
@@ -759,7 +879,7 @@ class NewsVerification:
759
  if row[3] is not None:
760
  entity_count.append(len(row[3]))
761
 
762
- entity_count_text = self.get_entity_count_text(sum(entity_count))
763
  word_break = "word-break: break-all;"
764
  return f"""
765
  <tr>
@@ -785,155 +905,3 @@ class NewsVerification:
785
 
786
  word_break = "word-break: break-all;"
787
  return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td style="{word_break}";>{source_image_url}</td></tr>""" # noqa: E501
788
-
789
- def get_entity_count_text(self, entity_count):
790
- if entity_count <= 0:
791
- entity_count_text = ""
792
- elif entity_count == 1:
793
- entity_count_text = "with 1 altered entity"
794
- else:
795
- entity_count_text = "with altered entities"
796
- return entity_count_text
797
-
798
- def color_text(self, text, colored_idx, highlighted_idx):
799
- sentence = ""
800
- words = text.split()
801
-
802
- starts, ends = self.extract_starts_ends(colored_idx)
803
- starts, ends = self.filter_indices(starts, ends, highlighted_idx)
804
-
805
- previous_end = 0
806
- for start, end in zip(starts, ends):
807
- sentence += " ".join(words[previous_end:start])
808
-
809
- equal_words = " ".join(words[start:end])
810
- sentence += f" <span style='color:#00FF00;'>{equal_words}</span> "
811
-
812
- previous_end = end
813
-
814
- sentence += " ".join(words[previous_end:])
815
-
816
- return sentence
817
-
818
- def extract_starts_ends(self, colored_idx):
819
- starts = []
820
- ends = []
821
- for index in colored_idx:
822
- starts.append(index["start"])
823
- ends.append(index["end"])
824
- return starts, ends
825
-
826
- def filter_indices(self, starts, ends, ignore_indices):
827
- """
828
- Filters start and end indices to exclude any indices present in the
829
- ignore_indices list.
830
-
831
- Args:
832
- starts: A list of starting indices.
833
- ends: A list of ending indices. Must be the same length as starts.
834
- ignore_indices: A list of indices to exclude.
835
-
836
- Returns:
837
- A tuple of two lists: filtered_starts and filtered_ends.
838
- Returns empty lists if the input is invalid
839
- or if all ranges are filtered out.
840
- Prints error messages for invalid input.
841
-
842
- Examples:
843
- starts = [0, 5, 10]
844
- ends = [3, 7, 12] # words at the end will not be colored.
845
- ignore_indices = [1, 2, 12, 17]
846
-
847
- # Output:
848
- starts = [0, 3, 5, 10]
849
- ends = [1, 4, 7, 12]
850
-
851
- """
852
-
853
- if len(starts) != len(ends):
854
- print(
855
- "Error: The 'starts' and 'ends' lists must have the same length.", # noqa: E501
856
- )
857
- return [], []
858
-
859
- filtered_starts = []
860
- filtered_ends = []
861
-
862
- for i in range(len(starts)):
863
- start = starts[i]
864
- end = ends[i]
865
-
866
- if end < start:
867
- print(
868
- f"Error: End index {end} is less than start index {start} at position {i}.", # noqa: E501
869
- )
870
- return [], []
871
-
872
- start_end = list(range(start, end + 1, 1))
873
- start_end = list(set(start_end) - set(ignore_indices))
874
- # new_start, new_end = self.extract_sequences(start_end)
875
- new_start, new_end = self.extract_new_startend(
876
- start,
877
- end,
878
- ignore_indices,
879
- )
880
- filtered_starts.extend(new_start)
881
- filtered_ends.extend(new_end)
882
-
883
- return filtered_starts, filtered_ends
884
-
885
- def extract_new_startend(self, start, end, ignore_indices):
886
- # sort a set of ignore_indices
887
- indexes = list(set(ignore_indices))
888
- indexes.sort()
889
-
890
- new_starts = []
891
- new_ends = []
892
- new_start = start
893
- if indexes is None or len(indexes) < 1:
894
- new_starts.append(start)
895
- new_ends.append(end)
896
- return new_starts, new_ends
897
-
898
- for index in indexes:
899
- if index < start:
900
- continue
901
- elif index >= end:
902
- continue
903
-
904
- new_starts.append(new_start)
905
- new_ends.append(index)
906
-
907
- new_start = index + 1
908
-
909
- new_starts.append(new_start)
910
- new_ends.append(end)
911
-
912
- return new_starts, new_ends
913
-
914
- def extract_sequences(self, numbers):
915
- if len(numbers) == 1:
916
- return [numbers[0]], [numbers[0]]
917
-
918
- numbers.sort()
919
- starts = []
920
- ends = []
921
- for i, number in enumerate(numbers):
922
- if i == 0:
923
- start = number
924
- end = number
925
- continue
926
-
927
- if number - 1 == numbers[i - 1]:
928
- end = number
929
- else:
930
- starts.append(start)
931
- ends.append(end)
932
- start = number
933
- end = number
934
-
935
- if i == len(numbers) - 1:
936
- starts.append(start)
937
- ends.append(end)
938
-
939
- return starts, ends
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
 
6
  import pandas as pd
7
 
8
+ from src.application.config import MIN_RATIO_PARAPHRASE_NUM, PARAPHRASE_THRESHOLD, PARAPHRASE_THRESHOLD_MACHINE
9
+ from src.application.formatting import color_text, format_entity_count
10
  from src.application.image.image_detection import (
11
  detect_image_by_ai_model,
12
  detect_image_by_reverse_search,
 
16
  apply_highlight,
17
  highlight_entities,
18
  )
19
+ from src.application.text.helper import (
20
+ extract_equal_text,
21
+ postprocess_label,
22
+ split_into_paragraphs,
23
+ )
24
  from src.application.text.model_detection import (
25
  detect_text_by_ai_model,
26
  predict_generation_model,
27
  )
28
+ from src.application.text.search_detection import find_sentence_source
 
 
 
 
29
 
30
 
31
  class NewsVerification:
32
  def __init__(self):
33
+ """
34
+ Initializes the NewsVerification object.
35
+ """
36
+ self.news_text: str = ""
37
+ self.news_title: str = ""
38
+ self.news_content: str = ""
39
+ self.news_image: str = ""
40
 
41
  self.text_prediction_label: list[str] = ["UNKNOWN"]
42
  self.text_prediction_score: list[float] = [0.0]
 
45
  self.image_prediction_score: list[str] = [0.0]
46
  self.image_referent_url: list[str] = []
47
 
48
+ self.news_prediction_label: str = ""
49
+ self.news_prediction_score: float = -1
50
 
51
  # news' urls to find img
52
  self.found_img_url: list[str] = []
 
60
  "similarity",
61
  "paraphrase",
62
  "url",
63
+ # "entities",
 
64
  ],
65
  )
66
  self.grouped_url_df: pd.DataFrame = pd.DataFrame()
 
70
  self.fact_checker_table: list = []
71
  self.governor_table: list = []
72
 
73
+ def load_news(self, news_title: str, news_content: str, news_image: str):
74
+ """
75
+ Loads news data into the object's attributes.
76
+
77
+ Args:
78
+ news_title (str): The title of the news article.
79
+ news_content (str): The content of the news article.
80
+ news_image (str): The URL of the image in the news article.
81
+ """
82
+ # Combine title and content for a full text representation.
83
+ # .strip() removes leading/trailing whitespace for cleaner text.
84
  self.news_text = (news_title + "\n\n" + news_content).strip()
85
+
86
+ # if not isinstance(news_title, str) or not isinstance(
87
+ # news_content,
88
+ # str,
89
+ # ):
90
+ # raise TypeError("News title and content must be strings.")
91
+
92
+ # if not isinstance(news_image, str) or news_image is not None:
93
+ # Warning("News image must be a string.")
94
+
95
  self.news_title = news_title
96
  self.news_content = news_content
97
  self.news_image = news_image
98
 
99
+ def group_by_url(self):
100
+ """
101
+ Groups aligned sentences by URL
102
+ Then concatenates the 'input' and 'source' text for each group.
103
+ """
104
 
 
105
  def concat_text(series):
106
+ """
107
+ Concatenates the elements of a pd.Series into a single string.
108
+ """
109
  return " ".join(
110
  series.astype(str).tolist(),
111
  ) # Handle mixed data types and NaNs
112
 
113
+ # Group sentences by URL and concatenate 'input' and 'source' text.
114
+ self.grouped_url_df = (
115
+ self.aligned_sentences_df.groupby("url")
116
+ .agg(
117
+ {
118
+ "input": concat_text,
119
+ "source": concat_text,
120
+ },
121
+ )
122
+ .reset_index()
123
+ ) # Reset index to make 'url' a regular column
124
+
125
  # Add new columns for label and score
126
  self.grouped_url_df["label"] = None
127
  self.grouped_url_df["score"] = None
128
 
129
  print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")
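
For readers unfamiliar with the pandas idiom used in group_by_url, here is the same groupby/agg pattern on a toy DataFrame (synthetic data, not from the application):

import pandas as pd

toy = pd.DataFrame({
    "url": ["a.com", "a.com", "b.com"],
    "input": ["sent 1", "sent 2", "sent 3"],
    "source": ["src 1", "src 2", "src 3"],
})
joiner = lambda s: " ".join(s.astype(str).tolist())
grouped = toy.groupby("url").agg({"input": joiner, "source": joiner}).reset_index()
# grouped["input"] -> ["sent 1 sent 2", "sent 3"]; one row per URL.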
130
 
131
+ def determine_text_origin_by_url(self):
132
+ """
133
+ Determines the text origin for each URL group.
134
+ """
135
  for index, row in self.grouped_url_df.iterrows():
136
+ # Verify text origin using URL-based verification.
137
  label, score = self.verify_text(row["url"])
138
+
139
+ # If URL-based verification returns 'UNKNOWN', use AI detection
140
  if label == "UNKNOWN":
141
+ # Concatenate text from "input" column in sentence_df
142
  text = " ".join(row["input"])
143
 
144
+ # Detect text origin using an AI model.
145
  label, score = detect_text_by_ai_model(text)
146
 
147
  self.grouped_url_df.at[index, "label"] = label
148
  self.grouped_url_df.at[index, "score"] = score
149
 
150
+ def determine_text_origin(self):
151
+ """
152
+ Determines the origin of the input text by analyzing
153
+ its sources and applying AI detection models.
154
+
155
+ This method groups sentences by their source URLs,
156
+ applies verification and AI detection, and then determines
157
+ an overall label and score for the input text.
158
+ """
159
+ # Find the text URLs associated with the input text
160
+ self.find_text_source()
161
+
162
+ # Group sentences by URL and concatenate 'input' and 'source' text.
163
+ self.group_by_url()
164
+
165
+ # Determine the text origin for each URL group
166
+ self.determine_text_origin_by_url()
167
+
168
+ # Determine the overall label and score for the entire input text.
169
+ if not self.grouped_url_df.empty:
170
+ # Check for 'gpt-4o' labels in the grouped URLs.
171
  machine_label = self.grouped_url_df[
172
  self.grouped_url_df["label"].str.contains(
173
+ "gpt-4o",
174
  case=False,
175
  na=False,
176
  )
177
  ]
178
 
179
+ if not machine_label.empty:
180
+ # If 'gpt-4o' labels are found, post-process and assign.
181
+ labels = machine_label["label"].tolist()
182
+ label = postprocess_label(labels)
183
+
184
+ # labels = " and ".join(machine_label["label"].tolist())
185
+ # label = remove_duplicate_words(label)
186
  self.text_prediction_label[0] = label
187
  self.text_prediction_score[0] = machine_label["score"].mean()
188
  else:
189
+ # If no 'gpt-4o' labels, fall back to the 'HUMAN' label.
190
  machine_label = self.aligned_sentences_df[
191
  self.aligned_sentences_df["label"] == "HUMAN"
192
  ]
193
  self.text_prediction_label[0] = "HUMAN"
194
  self.text_prediction_score[0] = machine_label["score"].mean()
195
+ else:
196
+ # If no URLs were found, use AI detection on the entire input text.
197
  print("No source found in the input text")
198
  text = " ".join(self.aligned_sentences_df["input"].tolist())
199
+
200
+ # Detect text origin using an AI model.
201
  label, score = detect_text_by_ai_model(text)
202
  self.text_prediction_label[0] = label
203
  self.text_prediction_score[0] = score
204
 
205
  def find_text_source(self):
206
  """
207
+ Determines the origin of the given text based on paraphrasing
208
+ detection and human authorship analysis.
 
 
 
209
 
210
+ 1. Splits the input news text into paragraphs,
211
+ 2. Searches for sources for each sentence
212
+ 3. Updates the aligned_sentences_df with the found sources.
 
213
  """
214
  print("CHECK TEXT:")
215
  print("\tFrom search engine:")
216
+
 
217
  input_paragraphs = split_into_paragraphs(self.news_text)
218
+
219
+ # Initialize an empty DataFrame if it doesn't exist, otherwise extend it.
220
+ if not hasattr(self, 'aligned_sentences_df') or self.aligned_sentences_df is None:
221
+ self.aligned_sentences_df = pd.DataFrame(columns=[
222
+ "input",
223
+ "source",
224
+ "label",
225
+ "similarity",
226
+ "paraphrase",
227
+ "url",
228
+ "entities",
229
+ ])
230
 
231
+ # Setup DataFrame for input_sentences
 
232
  for _ in range(len(input_paragraphs)):
233
  self.aligned_sentences_df = pd.concat(
234
  [
 
250
  ignore_index=True,
251
  )
252
 
253
+ # Find a source for each sentence
254
  for index, _ in enumerate(input_paragraphs):
255
  similarity = self.aligned_sentences_df.loc[index, "similarity"]
256
  if similarity is not None:
 
265
  index,
266
  self.aligned_sentences_df,
267
  )
268
+
269
+ # Initialize found_img_url if it does not exist.
270
+ if not hasattr(self, 'found_img_url'):
271
+ self.found_img_url = []
272
  self.found_img_url.extend(img_urls)
273
 
 
 
274
  def verify_text(self, url):
275
+ """
276
+ Verifies the text origin based on similarity scores and labels
277
+ associated with a given URL.
278
+
279
+ 1. Filters sentences by URL and similarity score,
280
+ 2. Determines if the text is likely generated by a machine or a human.
281
+ 3. Calculates an average similarity score.
282
+
283
+ Args:
284
+ url (str): The URL to filter sentences by.
285
+
286
+ Returns:
287
+ tuple: A tuple containing:
288
+ - Label ("MACHINE", "HUMAN", or "UNKNOWN")
289
+ - Score
290
+ """
291
  label = "UNKNOWN"
292
  score = 0
293
+
294
  # calculate the average similarity when the similarity score
295
  # in each row of sentences_df is higher than 0.8
296
+
297
+ # Filter sentences by URL.
298
  filtered_by_url = self.aligned_sentences_df[
299
  self.aligned_sentences_df["url"] == url
300
  ]
301
+
302
+ # Filter sentences by similarity score (> PARAPHRASE_THRESHOLD).
303
  filtered_by_similarity = filtered_by_url[
304
+ filtered_by_url["similarity"] > PARAPHRASE_THRESHOLD
305
  ]
306
+
307
+ # Check whether the ratio of remaining filtered sentences exceeds MIN_RATIO_PARAPHRASE_NUM.
308
+ if len(filtered_by_similarity) / len(self.aligned_sentences_df) > MIN_RATIO_PARAPHRASE_NUM:
309
  # check if "MACHINE" is in self.aligned_sentences_df["label"]:
310
  contains_machine = (
311
  filtered_by_similarity["label"]
 
316
  )
317
  .any()
318
  )
319
+
320
+ # TODO: integrate with determine_text_origin
321
  if contains_machine:
322
+ # If "MACHINE" label is present, set label and calculate score.
323
  machine_rows = filtered_by_similarity[
324
  filtered_by_similarity["label"].str.contains(
325
  "MACHINE",
 
328
  )
329
  ]
330
  generated_model, _ = predict_generation_model(self.news_text)
331
+ label = f"Partially generated by {generated_model}"
332
  score = machine_rows["similarity"].mean()
333
  else:
334
+ # If no "MACHINE" label, assign "HUMAN" label and calculate score.
335
  label = "HUMAN"
336
  human_rows = filtered_by_similarity[
337
  filtered_by_similarity["label"].str.contains(
 
345
  return label, score
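
For intuition, a worked instance of the ratio check: assuming PARAPHRASE_THRESHOLD = 0.8 and MIN_RATIO_PARAPHRASE_NUM = 0.5 (the config values are not shown in this diff), a URL that matches 6 of 10 aligned sentences with similarity above 0.8 gives 6/10 = 0.6 > 0.5, so the label and mean similarity are taken from those rows; below that ratio, verify_text returns ("UNKNOWN", 0) and detect_text_by_ai_model is used instead.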
346
 
347
  def determine_image_origin(self):
348
+ """
349
+ Determines the origin of the news image using various detection methods.
350
+
351
+ 1. Matching against previously found image URLs.
352
+ 2. Reverse image search.
353
+ 3. AI-based image detection.
354
+
355
+ If none of these methods succeed, the image origin is marked as "UNKNOWN".
356
+ """
357
  print("CHECK IMAGE:")
358
+
359
+ # Handle the case where no image is provided.
360
  if self.news_image is None:
361
  self.image_prediction_label = "UNKNOWN"
362
  self.image_prediction_score = 0.0
363
  self.image_referent_url = None
364
  return
365
 
366
+ # Attempt to match the image against previously found image URLs.
367
+ print("\tFrom found image URLs...")
368
  matched_url, similarity = detect_image_from_news_image(
369
  self.news_image,
370
  self.found_img_url,
 
376
  self.image_referent_url = matched_url
377
  return
378
 
379
+ # Attempt to find the image origin using reverse image search.
380
+ print("\tFrom reverse image search...")
381
  matched_url, similarity = detect_image_by_reverse_search(
382
  self.news_image,
383
  )
 
388
  self.image_referent_url = matched_url
389
  return
390
 
391
+ # Attempt to detect the image origin using an AI model.
392
+ print("\tFrom an AI model...")
393
  detected_label, score = detect_image_by_ai_model(self.news_image)
394
  if detected_label:
395
  print(f"detected_label: {detected_label} ({score})")
 
398
  self.image_referent_url = None
399
  return
400
 
401
+ # If all detection methods fail, mark the image origin as "UNKNOWN".
402
  self.image_prediction_label = "UNKNOWN"
403
  self.image_prediction_score = 50
404
  self.image_referent_url = None
405
 
406
+ def determine_origin(self):
407
+ """
408
+ Determines the origin of the news by analyzing its text and image.
409
+ """
410
  if self.news_text != "":
411
  self.determine_text_origin()
412
  if self.news_image != "":
413
  self.determine_image_origin()
414
+
415
+ # Handle entity recognition and processing.
416
  self.handle_entities()
417
+
418
+ def generate_report(self) -> tuple[str, str, str]:
419
+ """
420
+ Generates reports tailored for different user roles
421
+ (ordinary users, fact checkers, governors).
422
+
423
+ Returns:
424
+ tuple: A tuple containing three HTML-formatted reports:
425
+ - ordinary_user_table: Report for ordinary users.
426
+ - fact_checker_table: Report for fact checkers.
427
+ - governor_table: Report for governors.
428
+ """
429
  ordinary_user_table = self.create_ordinary_user_table()
430
  fact_checker_table = self.create_fact_checker_table()
431
  governor_table = self.create_governor_table()
 
433
  return ordinary_user_table, fact_checker_table, governor_table
434
 
435
  def handle_entities(self):
436
+ """
437
+ Highlights and assigns entities with colors to aligned sentences
438
+ based on grouped URLs.
439
+
440
+ For each grouped URL:
441
+ 1. Highlights entities in the input and source text
442
+ 2. Then assigns these highlighted entities to the corresponding
443
+ sentences in the aligned sentences DataFrame.
444
+ """
445
+
446
  entities_with_colors = []
447
  for index, row in self.grouped_url_df.iterrows():
448
  # Get entity-words (in pair) with colors
 
451
  row["source"],
452
  )
453
 
454
+ # Assign the highlighted entities to the corresponding sentences
455
+ # in aligned_sentences_df.
456
  for index, sentence in self.aligned_sentences_df.iterrows():
457
  if sentence["url"] == row["url"]:
458
+ # Use .at to modify the DataFrame efficiently.
459
  self.aligned_sentences_df.at[index, "entities"] = (
460
+ entities_with_colors
461
  )
462
 
463
+ def get_text_urls(self) -> set:
 
 
 
464
  """
465
+ Returns a set of unique URLs referenced in the text analysis.
 
466
 
467
+ Returns:
468
+ set: A set containing the unique URLs referenced in the text.
469
  """
470
+ return set(self.text_referent_url)
471
 
472
  def create_fact_checker_table(self):
473
  rows = []
 
506
  if index == 0 or current_url != previous_url:
507
  first_url_row = True
508
  previous_url = current_url
509
+ # Increase counter "span_row" when the next url is the same
510
  while (
511
  index + span_row < len(self.fact_checker_table)
512
  and self.fact_checker_table[index + span_row][4]
 
551
  </table>
552
 
553
  <style>
554
+ """
555
 
556
  def format_text_fact_checker_row(
557
  self,
 
586
  entity_count = len(row[3])
587
 
588
  # Color overlapping words
589
+ input_sentence = color_text(
590
  input_sentence,
591
  row[1],
592
  highlight_idx_input,
593
  ) # text, index of highlight words
594
+ source_sentence = color_text(
595
  source_sentence,
596
  row[2],
597
  highlight_idx_source,
 
612
  source_sentence = row[0]["source"]
613
 
614
  url = row[0]["url"]
615
+
616
  # Displayed label and score by url
617
  filterby_url = self.grouped_url_df[self.grouped_url_df["url"] == url]
618
  if len(filterby_url) > 0:
 
626
  source_text_url = f"""<a href="{url}">{url}</a>"""
627
 
628
  # Format displayed entity count
629
+ entity_count_text = format_entity_count(entity_count)
630
 
631
  border_top = "border-top: 1px solid transparent;"
632
  border_bottom = "border-bottom: 1px solid transparent;"
 
720
  <style>
721
  """
722
 
723
+ def format_text_ordinary_user_row(self):
724
  input_sentences = ""
725
  source_text_urls = ""
726
  urls = []
 
743
  </tr>
744
  """
745
 
746
+ def format_image_ordinary_user_row(self):
747
 
748
  if (
749
  self.image_referent_url is not None
 
840
  )
841
 
842
  # Color overlapping words
843
+ input_sentence = color_text(
844
  input_sentence,
845
  row[1],
846
  highlight_idx_input,
847
  ) # text, index of highlight words
848
+ source_sentence = color_text(
849
  source_sentence,
850
  row[2],
851
  highlight_idx_source,
 
879
  if row[3] is not None:
880
  entity_count.append(len(row[3]))
881
 
882
+ entity_count_text = format_entity_count(sum(entity_count))
883
  word_break = "word-break: break-all;"
884
  return f"""
885
  <tr>
 
905
 
906
  word_break = "word-break: break-all;"
907
  return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td style="{word_break}";>{source_image_url}</td></tr>""" # noqa: E501
 
src/application/content_generation.py CHANGED
@@ -1,30 +1,39 @@
1
  import json
2
- import os
3
 
4
  import openai
5
- from dotenv import load_dotenv
6
 
7
- load_dotenv()
8
- AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
9
- AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
10
- AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
11
-
12
- client = openai.AzureOpenAI(
13
- api_version=AZURE_OPENAI_API_VERSION,
14
- api_key=AZURE_OPENAI_API_KEY,
15
- azure_endpoint=AZURE_OPENAI_ENDPOINT,
16
  )
17
 
18
 
19
- def generate_fake_text(text_generation_model, title, content):
20
  # Generate text using the selected models
21
  prompt = """Generate a random fake news title in this format:
22
- ---
23
- # Title: [Fake Title]
24
- # Content:
25
- [Fake Content]
26
- ---
27
- """
28
  if title and content:
29
  prompt += """based on the following context:
30
  # Title: {news_title}:\n# Content: {news_content}"""
@@ -38,7 +47,7 @@ def generate_fake_text(text_generation_model, title, content):
38
  # Generate text using the text generation model
39
  # Generate text using the selected model
40
  try:
41
- response = client.chat.completions.create(
42
  model=text_generation_model,
43
  messages=[{"role": "system", "content": prompt}],
44
  )
@@ -58,67 +67,92 @@ def generate_fake_text(text_generation_model, title, content):
58
  return fake_title, fake_content
59
 
60
 
61
- def extract_title_content(fake_news):
62
  """
63
- Extracts the title and content from the generated fake news string.
64
-
65
- This function parses a string containing fake news, which is expected
66
- to have a specific format with a title and content section marked by
67
- '# Title:' and '# Content:' respectively.
68
 
69
  Args:
70
- fake_news (str): A string containing the generated fake news.
71
 
72
  Returns:
73
- tuple: A tuple containing two elements:
74
- - title (str): The extracted title of the fake news.
75
- - content (str): The extracted content of the fake news.
76
-
77
- Note:
78
- The function assumes that the input string follows the expected format.
79
- If the format is not as expected, it may return unexpected results.
80
  """
81
- # Extract the title and content from the generated fake news
82
- title_start_index = fake_news.find("# Title: ") + len("# Title: ")
83
- title_end_index = fake_news.find("\n", title_start_index)
84
- title = fake_news[title_start_index:title_end_index].strip()
85
 
86
- content_start_index = fake_news.find("\n# Content: ") + len(
87
- "\n# Content: ",
88
- )
89
- content = fake_news[content_start_index:].strip()
90
 
91
  return title, content
92
 
93
 
94
- def generate_fake_image(model, title):
95
- if len(title) > 0:
96
- IMAGE_PROMPT = f"Generate a random image about {title}"
97
- else:
98
- IMAGE_PROMPT = "Generate a random image"
99
- result = client.images.generate(
100
- model="dall-e-3", # the name of your DALL-E 3 deployment
101
- prompt=IMAGE_PROMPT,
102
- n=1,
103
- )
104
- image_url = json.loads(result.model_dump_json())["data"][0]["url"]
105
- return image_url
106
 
 
 
 
107
 
108
- def replace_text(news_title, news_content, replace_df):
 
109
  """
110
111
 
112
  Args:
113
- text: The input text.
114
- replace_df: A DF with 2 columns: "find_what" & "replace_with".
 
 
115
 
116
  Returns:
117
- The text after all replacements have been made.
118
  """
119
  for _, row in replace_df.iterrows():
120
  find_what = row["Find what:"]
121
  replace_with = row["Replace with:"]
122
  news_content = news_content.replace(find_what, replace_with)
123
  news_title = news_title.replace(find_what, replace_with)
 
124
  return news_title, news_content
 
1
  import json
 
2
 
3
  import openai
4
+ import pandas as pd
5
 
6
+ from src.application.config import (
7
+ AZUREOPENAI_CLIENT,
8
+ GPT_IMAGE_MODEL,
 
 
 
 
 
 
9
  )
10
 
11
 
12
+ def generate_fake_text(
13
+ text_generation_model: str,
14
+ title: str = None,
15
+ content: str = None,
16
+ ) -> tuple[str, str]:
17
+ """
18
+ Generates fake news title and content using an Azure OpenAI model.
19
+
20
+ Args:
21
+ text_generation_model: The name of the Azure OpenAI model to use.
22
+ title: Optional title to use as context for fake text generation.
23
+ content: Optional content to use as context for fake text generation.
24
+
25
+ Returns:
26
+ A tuple containing the generated fake title and content (both strings).
27
+ Returns empty strings if generation fails.
28
+ """
29
  # Generate text using the selected models
30
  prompt = """Generate a random fake news title in this format:
31
+ ---
32
+ # Title: [Fake Title]
33
+ # Content:
34
+ [Fake Content]
35
+ ---
36
+ """
37
  if title and content:
38
  prompt += """based on the following context:
39
  # Title: {news_title}:\n# Content: {news_content}"""
 
47
  # Generate text using the text generation model
48
  # Generate text using the selected model
49
  try:
50
+ response = AZUREOPENAI_CLIENT.chat.completions.create(
51
  model=text_generation_model,
52
  messages=[{"role": "system", "content": prompt}],
53
  )
 
67
  return fake_title, fake_content
68
 
69
 
70
+ def extract_title_content(fake_news: str) -> tuple[str, str]:
71
  """
72
+ Extracts the title and content from the generated fake text.
 
 
 
 
73
 
74
  Args:
75
+ fake_news: The generated fake text string.
76
 
77
  Returns:
78
+ A tuple containing the extracted title and content.
 
 
 
 
 
 
79
  """
80
+ title = ""
81
+ content = ""
 
 
82
 
83
+ try:
84
+ # Extract the title and content from the generated fake news
85
+ title_start = fake_news.find("# Title: ") + len("# Title: ")
86
+ title_end = fake_news.find("\n", title_start)
87
+ if title_start != -1 and title_end != -1:
88
+ title = fake_news[title_start:title_end].strip()
89
+
90
+ title_start = fake_news.find("\n# Content: ") + len(
91
+ "\n# Content: ",
92
+ )
93
+ content = fake_news[title_start:].strip()
94
+ except Exception as e:
95
+ print(f"Error extracting title and content: {e}")
96
 
97
  return title, content
98
 
99
 
100
+ def generate_fake_image(
101
+ title: str,
102
+ model: str = GPT_IMAGE_MODEL,
103
+ ) -> str | None:
104
+ """
105
+ Generates a fake image URL using Azure OpenAI's image generation API.
 
 
 
 
 
 
106
 
107
+ Args:
108
+ title: The title to use as a prompt for image generation.
109
+ model: The name of the Azure OpenAI image generation model to use.
110
 
111
+ Returns:
112
+ The URL of the generated image, or None if an error occurs.
113
  """
114
+ try:
115
+ if title:
116
+ image_prompt = f"Generate a random image about {title}"
117
+ else:
118
+ image_prompt = "Generate a random image"
119
+
120
+ result = AZUREOPENAI_CLIENT.images.generate(
121
+ model=model,
122
+ prompt=image_prompt,
123
+ n=1,
124
+ )
125
+
126
+ image_url = json.loads(result.model_dump_json())["data"][0]["url"]
127
+ return image_url
128
+
129
+ except Exception as e:
130
+ print(f"Error generating fake image: {e}")
131
+ return None # Return None if an error occurs
132
+
133
+
134
+ def replace_text(
135
+ news_title: str,
136
+ news_content: str,
137
+ replace_df: pd.DataFrame,
138
+ ) -> tuple[str, str]:
139
+ """
140
+ Replaces occurrences in the input title and content
141
+ based on the provided DataFrame.
142
 
143
  Args:
144
+ news_title: The input news title.
145
+ news_content: The input news content.
146
+ replace_df: A DataFrame with two columns:
147
+ "Find what:" and "Replace with:".
148
 
149
  Returns:
150
+ A tuple containing the modified news title and content.
151
  """
152
  for _, row in replace_df.iterrows():
153
  find_what = row["Find what:"]
154
  replace_with = row["Replace with:"]
155
  news_content = news_content.replace(find_what, replace_with)
156
  news_title = news_title.replace(find_what, replace_with)
157
+
158
  return news_title, news_content
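
A minimal usage sketch for the retyped replace_text (the "Find what:" / "Replace with:" column labels come from the function body above; the sample row is invented):

import pandas as pd
from src.application.content_generation import replace_text

replace_df = pd.DataFrame({"Find what:": ["Paris"], "Replace with:": ["Lyon"]})
title, content = replace_text("Fire in Paris", "Paris saw a large fire.", replace_df)
# title == "Fire in Lyon"; content == "Lyon saw a large fire."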
src/application/formatting.py ADDED
@@ -0,0 +1,67 @@
1
+ from src.application.text.helper import extract_starts_ends, filter_indices
2
+
3
+
4
+ def color_text(text: str, colored_idx: list[dict], highlighted_idx: list[int]) -> str:
5
+ """
6
+ Colors specific words in a text based on provided indices.
7
+
8
+ This function takes a text, a list of indices to color, and a list of indices to exclude.
9
+ It splits the text into words, filters the indices, and then wraps the words within
10
+ the specified ranges with a green span tag for coloring.
11
+
12
+ Args:
13
+ text (str): The input text.
14
+ colored_idx (list): A list of dictionaries, where each dictionary contains
15
+ 'start' and 'end' keys representing indices of words to color.
16
+ highlighted_idx (list): A list of indices to exclude from coloring.
17
+
18
+ Returns:
19
+ str: The text with colored words.
20
+ """
21
+ sentence = ""
22
+ words = text.split()
23
+
24
+ # Extract start and end indices from colored_idx.
25
+ starts, ends = extract_starts_ends(colored_idx)
26
+
27
+ # Filter the start and end indices to exclude highlighted_idx.
28
+ starts, ends = filter_indices(starts, ends, highlighted_idx)
29
+
30
+ previous_end = 0
31
+ for start, end in zip(starts, ends):
32
+ # Add the words before the current colored range to the sentence.
33
+ sentence += " ".join(words[previous_end:start])
34
+
35
+ # Add the colored range to the sentence.
36
+ equal_words = " ".join(words[start:end])
37
+ sentence += f" <span style='color:#00FF00;'>{equal_words}</span> "
38
+
39
+ # Update the previous end index.
40
+ previous_end = end
41
+
42
+ # Add the remaining words after the last colored range to the sentence.
43
+ sentence += " ".join(words[previous_end:])
44
+
45
+ return sentence
46
+
47
+
48
+ def format_entity_count(entity_count: int) -> str:
49
+ """
50
+ Generates a text description based on the number of altered entities.
51
+
52
+ Args:
53
+ entity_count (int): The number of altered entities.
54
+
55
+ Returns:
56
+ str: A text description of the entity count.
57
+ - "" if entity_count is 0 or negative.
58
+ - "with 1 altered entity" if entity_count is 1.
59
+ - "with altered entities" if entity_count is greater than 1.
60
+ """
61
+ if entity_count <= 0:
62
+ entity_count_text = ""
63
+ elif entity_count == 1:
64
+ entity_count_text = "with 1 altered entity"
65
+ else:
66
+ entity_count_text = "with altered entities"
67
+ return entity_count_text
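
A quick sketch of the two helpers in the new module (example inputs are invented; indices are word positions, end-exclusive, as used by the calling code):

from src.application.formatting import color_text, format_entity_count

format_entity_count(0)  # ""
format_entity_count(1)  # "with 1 altered entity"
format_entity_count(3)  # "with altered entities"

# Wrap words 1-2 of the sentence in a green <span>, excluding nothing.
color_text("the quick brown fox", [{"start": 1, "end": 3}], [])
# -> "the <span style='color:#00FF00;'>quick brown</span> fox" (modulo spacing)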
src/application/formatting_ordinary_user.py ADDED
@@ -0,0 +1,87 @@
1
+ from src.application.config import WORD_BREAK
2
+
3
+
4
+ def create_ordinary_user_table(self):
5
+ rows = []
6
+ rows.append(self.format_image_ordinary_user_row())
7
+ rows.append(self.format_text_ordinary_user_row())
8
+ table = "\n".join(rows)
9
+
10
+ return f"""
11
+ <h5>Comparison between input news and source news:</h5>
12
+ <table border="1" style="width:100%; text-align:left;">
13
+ <col style="width: 340px;">
14
+ <col style="width: 30px;">
15
+ <col style="width: 75px;">
16
+ <thead>
17
+ <tr>
18
+ <th>Input news</th>
19
+ <th>Forensic</th>
20
+ <th>Originality</th>
21
+ </tr>
22
+ </thead>
23
+ <tbody>
24
+ {table}
25
+ </tbody>
26
+ </table>
27
+
28
+ <style>
29
+ """
30
+
31
+ def format_text_ordinary_user_row(self):
32
+ input_sentences = ""
33
+ source_text_urls = ""
34
+ urls = []
35
+ for _, row in self.aligned_sentences_df.iterrows():
36
+ if row["input"] is None:
37
+ continue
38
+
39
+ input_sentences += row["input"] + "<br><br>"
40
+ url = row["url"]
41
+ if url not in urls:
42
+ urls.append(url)
43
+ source_text_urls += f"""<a href="{url}">{url}</a><br>"""
44
+
45
+ return f"""
46
+ <tr>
47
+ <td>{input_sentences}</td>
48
+ <td>{self.text_prediction_label[0]}<br>
49
+ ({self.text_prediction_score[0] * 100:.2f}%)</td>
50
+ <td style="{WORD_BREAK}";>{source_text_urls}</td>
51
+ </tr>
52
+ """
53
+
54
+ def format_image_ordinary_user_row(
55
+ image_referent_url: str,
56
+ image_prediction_label: str,
57
+ image_prediction_score: float,
58
+ ):
59
+ """
60
+ Formats an HTML table row for ordinary users,
61
+ displaying image analysis results.
62
+
63
+ Args:
64
+ image_referent_url (str): The URL of the referenced image.
65
+ image_prediction_label (str): The predicted label for the image.
66
+ image_prediction_score (float): The prediction score for the image.
67
+
68
+ Returns:
69
+ str: An HTML table row string containing the image analysis results.
70
+ """
71
+
72
+ # Put image, label, and score into html tag
73
+ if (
74
+ image_referent_url is not None
75
+ or image_referent_url != ""
76
+ ):
77
+ source_image_url = f"""<a href="{image_referent_url}">{image_referent_url}</a>""" # noqa: E501
78
+ else:
79
+ source_image_url = ""
80
+
81
+ return f"""
82
+ <tr>
83
+ <td>input image</td>
84
+ <td>{image_prediction_label}<br>({image_prediction_score:.2f}%)</td>
85
+ <td style="{WORD_BREAK}";>{source_image_url}</td>
86
+ </tr>
87
+ """
src/application/text/entity.py CHANGED
@@ -44,6 +44,7 @@ def extract_entities_gpt(
44
  """
45
 
46
  # Construct the prompt for the GPT model.
 
47
  prompt = f"""
48
  Compare the ORIGINAL TEXT and the COMPARED TEXT.
49
  Find entity pairs with significantly different meanings after paraphrasing.
 
44
  """
45
 
46
  # Construct the prompt for the GPT model.
47
+ # TODO: Move to config or prompt file
48
  prompt = f"""
49
  Compare the ORIGINAL TEXT and the COMPARED TEXT.
50
  Find entity pairs with significantly different meanings after paraphrasing.
src/application/text/helper.py CHANGED
@@ -8,7 +8,10 @@ import string
8
  from collections import Counter
9
  from difflib import SequenceMatcher
10
 
11
- from nltk.tokenize import word_tokenize
 
 
 
12
  from nltk.util import ngrams
13
  from sklearn.feature_extraction.text import TfidfVectorizer
14
 
@@ -276,3 +279,223 @@ def connect_consecutive_indexes(nums: list[int]) -> list[list[int, int]]:
276
  # Add the last range to the result.
277
  result.append([start, end])
278
  return result
 
8
  from collections import Counter
9
  from difflib import SequenceMatcher
10
 
11
+ from nltk.tokenize import (
12
+ sent_tokenize,
13
+ word_tokenize,
14
+ )
15
  from nltk.util import ngrams
16
  from sklearn.feature_extraction.text import TfidfVectorizer
17
 
 
279
  # Add the last range to the result.
280
  result.append([start, end])
281
  return result
282
+
283
+
284
+ def postprocess_label(labels: list[str]) -> str:
285
+ """
286
+ Creates a label string with the format
287
+ "Partially generated by [label1] and [label2] and ...".
288
+ Removes duplicate labels while preserving the original order.
289
+
290
+ Args:
291
+ labels: A list of strings representing labels.
292
+
293
+ Returns:
294
+ A string with the formatted label.
295
+ """
296
+ prefix = "Partially generated by "
297
+ for index, label in enumerate(labels):
298
+ if label.startswith(prefix):
299
+ labels[index] = label[len(prefix):]
300
+
301
+ labels = list(set(labels))
302
+ label = prefix
303
+
304
+ if len(labels) == 1:
305
+ label += labels[0]
306
+ elif len(labels) == 2:
307
+ label += f"{labels[0]} and {labels[1]}"
308
+ else:
309
+ combination = ", ".join(labels[0 : len(labels) - 1])
310
+ label += f"{combination}, and {labels[-1]}"
311
+ return label
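
For example (duplicates are removed via a set, so the ordering of distinct model names is not guaranteed):

postprocess_label(["Partially generated by gpt-4o", "Partially generated by gpt-4o"])
# -> "Partially generated by gpt-4o"

postprocess_label(["Partially generated by gpt-4o", "gpt-4o-mini"])
# -> e.g. "Partially generated by gpt-4o and gpt-4o-mini" (order may vary)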
312
+
313
+
314
+ def split_into_sentences(input_text: str) -> list[str]:
315
+ """
316
+ Splits input text into sentences by newlines
317
+ and then tokenizes each paragraph into sentences.
318
+
319
+ Args:
320
+ input_text (str): The input text as a string.
321
+
322
+ Returns:
323
+ list: A list of sentences.
324
+ Returns an empty list if input is not a string.
325
+ """
326
+ if not isinstance(input_text, str):
327
+ return []
328
+
329
+ # Split the input text into paragraphs based on newline characters,
330
+ # keeping the newline characters.
331
+ paragraphs = input_text.splitlines(keepends=True)
332
+ sentences = []
333
+ for paragraph in paragraphs:
334
+ # Remove leading/trailing whitespace
335
+ paragraph = paragraph.strip()
336
+
337
+ if paragraph and paragraph != "\n":
338
+ # Tokenize the paragraph into sentences
339
+ sentences.extend(sent_tokenize(paragraph))
340
+
341
+ return sentences
342
+
343
+
344
+ def split_into_paragraphs(input_text: str) -> list[str]:
345
+ """
346
+ Splits input text into paragraphs based on newline characters.
347
+
348
+ Args:
349
+ input_text (str): The input text as a string.
350
+
351
+ Returns:
352
+ list: A list of paragraphs.
353
+ Returns an empty list if input is not a string.
354
+ """
355
+ if not isinstance(input_text, str):
356
+ return []
357
+
358
+ # Split the input text into paragraphs based on newline characters,
359
+ # keeping the newline characters.
360
+ paragraphs = input_text.splitlines(keepends=True)
361
+ out_paragraphs = []
362
+
363
+ for paragraph in paragraphs:
364
+ # Remove leading/trailing whitespace
365
+ paragraph = paragraph.strip()
366
+
367
+ if paragraph and paragraph != "\n":
368
+ # Append the cleaned paragraph to the output list.
369
+ out_paragraphs.append(paragraph)
370
+
371
+ return out_paragraphs
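
Example behaviour of the splitters moved here from the now-deleted preprocessing.py (assumes NLTK's punkt tokenizer data is available):

text = "First paragraph.\n\nSecond one. It has two sentences.\n"
split_into_paragraphs(text)
# -> ['First paragraph.', 'Second one. It has two sentences.']
split_into_sentences(text)
# -> ['First paragraph.', 'Second one.', 'It has two sentences.']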
372
+
373
+
374
+ def extract_starts_ends(colored_idx: list[dict]) -> tuple[list[int], list[int]]:
375
+ """
376
+ Extracts start and end indices from a list of dictionaries.
377
+
378
+ Args:
379
+ colored_idx (list[dict]): A list of dictionaries,
380
+ where each dictionary has 'start' and 'end' keys.
381
+
382
+ Returns:
383
+ tuple: A tuple containing two lists:
384
+ - starts (list[int]): A list of start indices.
385
+ - ends (list[int]): A list of end indices.
386
+ """
387
+ starts = []
388
+ ends = []
389
+ for index in colored_idx:
390
+ starts.append(index["start"])
391
+ ends.append(index["end"])
392
+ return starts, ends
393
+
394
+
395
+ def filter_indices(starts: list[int], ends: list[int], ignore_indices: list[int]):
396
+ """
397
+ Filters start and end indices to exclude any indices present in the
398
+ ignore_indices list.
399
+
400
+ Args:
401
+ starts (list[int]): A list of starting indices.
402
+ ends (list[int]): A list of ending indices.
403
+ Must be the same length as starts.
404
+ ignore_indices (list[int]): A list of indices to exclude.
405
+
406
+ Returns:
407
+ A tuple of two lists of integers:
408
+ - filtered_starts
409
+ - filtered_ends
410
+ Returns empty lists if the input is invalid
411
+ or if all ranges are filtered out.
412
+
413
+ Examples:
414
+ starts = [0, 5, 10]
415
+ ends = [3, 7, 12] # words at the end will not be colored.
416
+ ignore_indices = [1, 2, 12, 17]
417
+
418
+ # Output:
419
+ starts = [0, 3, 5, 10]
420
+ ends = [1, 4, 7, 12]
421
+
422
+ """
423
+
424
+ if len(starts) != len(ends):
425
+ print(
426
+ "Error: The 'starts' & 'ends' lists must have the same length.",
427
+ )
428
+ return [], []
429
+
430
+ filtered_starts = []
431
+ filtered_ends = []
432
+
433
+ for i in range(len(starts)):
434
+ start = starts[i]
435
+ end = ends[i]
436
+
437
+ if end < start:
438
+ print(
439
+ f"Error: End index {end} < start index {start} at position {i}.", # noqa: E501
440
+ )
441
+ return [], []
442
+
443
+ start_end = list(range(start, end + 1, 1))
444
+ start_end = list(set(start_end) - set(ignore_indices))
445
+ # new_start, new_end = self.extract_sequences(start_end)
446
+ new_start, new_end = extract_new_startend(
447
+ start,
448
+ end,
449
+ ignore_indices,
450
+ )
451
+ filtered_starts.extend(new_start)
452
+ filtered_ends.extend(new_end)
453
+
454
+ return filtered_starts, filtered_ends
455
+
456
+
457
+ def extract_new_startend(start: int, end: int, ignore_indices: list[int]) -> tuple[list[int], list[int]]:
458
+ """
459
+ Extracts new start and end indices by splitting a range based on
460
+ ignored indices.
461
+
462
+ Args:
463
+ start (int): The starting index of the range.
464
+ end (int): The ending index of the range (exclusive).
465
+ ignore_indices (list): indices to ignore within the range.
466
+
467
+ Returns:
468
+ tuple: A tuple containing two lists:
469
+ - new_starts (list): Starting indices for the sub-ranges.
470
+ - new_ends (list): Ending indices for the sub-ranges.
471
+ """
472
+ # Sort the set of ignore_indices in ascending order.
473
+ indexes = list(set(ignore_indices))
474
+ indexes.sort()
475
+
476
+ new_starts = []
477
+ new_ends = []
478
+ new_start = start
479
+
480
+ # If no indices to ignore, return the original range.
481
+ if indexes is None or len(indexes) < 1:
482
+ new_starts.append(start)
483
+ new_ends.append(end)
484
+ return new_starts, new_ends
485
+
486
+ for index in indexes:
487
+ # Skip indices that are outside the range [start, end).
488
+ if index < start:
489
+ continue
490
+ elif index >= end:
491
+ continue
492
+
493
+ new_starts.append(new_start)
494
+ new_ends.append(index)
495
+
496
+ new_start = index + 1
497
+
498
+ new_starts.append(new_start)
499
+ new_ends.append(end)
500
+
501
+ return new_starts, new_ends
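
Below is a minimal usage sketch of the new range helpers (assuming they are exported from src.application.text.helper; the values are illustrative and not taken from the repository's tests):

    from src.application.text.helper import extract_new_startend, filter_indices

    # Split the range from 3 to 9 around the ignored indices 5 and 6.
    starts, ends = extract_new_startend(3, 9, [5, 6])
    print(starts, ends)  # [3, 6, 7] [5, 6, 9]

    # Filter two ranges at once, dropping the ignored indices 2 and 12.
    starts, ends = filter_indices([0, 10], [4, 14], [2, 12])
    print(starts, ends)  # [0, 3, 10, 13] [2, 4, 12, 14]
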
src/application/text/model_detection.py CHANGED
@@ -13,7 +13,6 @@ from src.application.config import (
     DEVICE,
     GPT_PARAPHRASE_MODELS,
     HUMAN,
-    MACHINE,
     MODEL_HUMAN_LABEL,
     PARAPHRASE_MODEL,
     UNKNOWN,
@@ -62,9 +61,9 @@ def detect_text_by_ai_model(
     if result["label"] == MODEL_HUMAN_LABEL[model]:
         label = HUMAN
     else:
-        label = MACHINE
+        # label = MACHINE
         generated_model, _ = predict_generation_model(input_text)
-        label += f"<br>({generated_model})"
+        label = f"Partially generated by {generated_model}"
 
     return label, confidence_score
 
src/application/text/preprocessing.py DELETED
@@ -1,67 +0,0 @@
-"""
-Author: Khanh Phan
-Date: 2024-12-04
-"""
-
-from nltk.tokenize import sent_tokenize
-
-
-# TODO: consider moving to helpers
-def split_into_sentences(input_text: str) -> list[str]:
-    """
-    Splits input text into sentences by newlines
-    and then tokenizes each paragraph into sentences.
-
-    Args:
-        input_text (str): The input text as a string.
-
-    Returns:
-        list: A list of sentences.
-            Returns an empty list if input is not a string.
-    """
-    if not isinstance(input_text, str):
-        return []
-
-    # Split the input text into paragraphs based on newline characters,
-    # keeping the newline characters.
-    paragraphs = input_text.splitlines(keepends=True)
-    sentences = []
-    for paragraph in paragraphs:
-        # Remove leading/trailing whitespace
-        paragraph = paragraph.strip()
-
-        if paragraph and paragraph != "\n":
-            # Tokenize the paragraph into sentences
-            sentences.extend(sent_tokenize(paragraph))
-
-    return sentences
-
-
-def split_into_paragraphs(input_text: str) -> list[str]:
-    """
-    Splits input text into paragraphs based on newline characters.
-
-    Args:
-        input_text (str): The input text as a string.
-
-    Returns:
-        list: A list of paragraphs.
-            Returns an empty list if input is not a string.
-    """
-    if not isinstance(input_text, str):
-        return []
-
-    # Split the input text into paragraphs based on newline characters,
-    # keeping the newline characters.
-    paragraphs = input_text.splitlines(keepends=True)
-    out_paragraphs = []
-
-    for paragraph in paragraphs:
-        # Remove leading/trailing whitespace
-        paragraph = paragraph.strip()
-
-        if paragraph and paragraph != "\n":
-            # Append the cleaned paragraph to the output list.
-            out_paragraphs.append(paragraph)
-
-    return out_paragraphs
src/application/text/search_detection.py CHANGED
@@ -17,7 +17,7 @@ from src.application.config import (
     PARAPHRASE_THRESHOLD_MACHINE,
     TOP_URLS_PER_SEARCH,
 )
-from src.application.text.preprocessing import split_into_sentences
+from src.application.text.helper import split_into_sentences
 from src.application.text.search import (
     generate_search_phrases,
     search_by_google,
src/application/url_reader.py CHANGED
@@ -8,18 +8,29 @@ from newspaper import (
     article,
 )
 
-# TODO: move this to a config file
-MAX_URL_SIZE = 2000000  # ~2MB
+from src.application.config import MAX_URL_SIZE
 
 
 class URLReader:
+    """
+    A class to extract content (title, text, images) from a given URL.
+    Supports two extraction methods: newspaper4k and BeautifulSoup.
+    """
+
     def __init__(self, url: string, newspaper: bool = True):
-        self.url = url
-        self.text = None  # string
-        self.title = None  # string
-        self.images = None  # list of Image objects
-        self.top_image = None  # Image object
-        self.is_extracted = False
+        """
+        Initializes the URLReader object.
+
+        Args:
+            url: The URL to extract content from.
+            newspaper: True to use newspaper4k, False to use BeautifulSoup.
+        """
+        self.url: str = url
+        self.text: str = None  # Extracted text content
+        self.title: str = None  # Extracted title
+        self.images: list[str] = None  # list of image URLs
+        self.top_image: str = None  # URL of the top image
+        self.is_extracted: bool = False  # Indicating successful extraction
 
         url_size = self.get_size()
         if url_size is None or url_size > MAX_URL_SIZE:
@@ -27,9 +38,7 @@ class URLReader:
         else:
             self.is_extracted = True
 
-        self.newspaper = (
-            newspaper  # True if using newspaper4k, False if using BS
-        )
+        self.newspaper = newspaper
         if self.newspaper is True:
            self.extract_content_newspaper()
         else:
@@ -37,81 +46,70 @@ class URLReader:
 
     def extract_content_newspaper(self):
         """
-        Use newspaper4k to extracts content from a URL
-
-        Args:
-            url: The URL of the web page.
-
-        Returns:
-            The extracted content (title, text, images)
+        Extracts content from a URL using the newspaper4k library.
         """
-
         try:
             response = requests.get(self.url)
-            response.raise_for_status()
+            response.raise_for_status()  # Raise HTTPError for bad responses
+
+            news = article(url=self.url, fetch_images=True)
+
+            self.title = news.title
+            self.text = news.text
+            self.images = list(set(news.images))  # Remove duplicates
+            self.top_image = news.top_image
+
         except requests.exceptions.RequestException as e:
             print(f"Error fetching URL: {e}")
             return None
-
-        try:
-            news = article(url=self.url, fetch_images=True)
         except (ArticleException, ArticleBinaryDataException) as e:
             print(f"\t\t↑↑↑ Error downloading article: {e}")
             return None
 
-        self.title = news.title
-        self.text = news.text
-        self.images = list(set(news.images))  # Remove duplicates
-        self.top_image = news.top_image
-
     def extract_content_bs(self):
         """
-        Use BS and process content
+        Extracts content from a URL using BeautifulSoup.
         """
-        response = requests.get(self.url)
-        response.raise_for_status()
+        try:
+            response = requests.get(self.url)
+            response.raise_for_status()
 
-        response.encoding = response.apparent_encoding
+            response.encoding = response.apparent_encoding  # Detect encoding
 
-        try:
             soup = BeautifulSoup(response.content, "html.parser")
-        except Exception as e:
-            print(f"Error parsing HTML content from {self.url}: {e}")
-            return None
 
-        self.title = soup.title.string.strip() if soup.title else None
+            self.title = soup.title.string.strip() if soup.title else None
 
-        image_urls = [img["src"] for img in soup.find_all("img")]
-        self.images = image_urls
-        self.top_image = self.images[0]
+            image_urls = [img["src"] for img in soup.find_all("img")]
+            self.images = image_urls
+            self.top_image = self.images[0]
 
-        # Exclude text within specific elements
-        for element in soup(["img", "figcaption", "table", "script", "style"]):
-            element.extract()
-        # text = soup.get_text(separator="\n")
-        paragraphs = soup.find_all("p")
-        text = " ".join([p.get_text() for p in paragraphs])
+            # Remove unwanted elements from the HTML
+            for element in soup(
+                ["img", "figcaption", "table", "script", "style"],
+            ):
+                element.extract()
 
-        self.text = text
+            paragraphs = soup.find_all("p")
+            self.text = " ".join([p.get_text() for p in paragraphs])
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching URL: {e}")
+            return None
+        except Exception as e:
+            print(f"Error parsing HTML content from {self.url}: {e}")
+            return None
 
     def get_size(self):
         """
         Retrieves the size of a URL's content using a HEAD request.
-
-        Args:
-            url: The URL to check.
-
-        Returns:
-            The size of the content in bytes,
-            or None if the size cannot be determined
-            (e.g., due to network errors or missing Content-Length header).
         """
         try:
             response = requests.head(
                 self.url,
                 allow_redirects=True,
                 timeout=5,
-            )  # Add timeout
+            )
             response.raise_for_status()  # Raise HTTPError for bad responses
 
             content_length = response.headers.get("Content-Length")
@@ -123,7 +121,7 @@ class URLReader:
 
         except requests.exceptions.RequestException as e:
            print(f"\t\t↑↑↑ Error getting URL size: {e}")
-            return None
+            return None
 
 
 if __name__ == "__main__":
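
A minimal sketch of how the refactored reader might be exercised (the URL below is a placeholder, not taken from the commit):

    from src.application.url_reader import URLReader

    reader = URLReader("https://example.com/some-article", newspaper=True)
    if reader.is_extracted:
        print(reader.title)
        print(len(reader.images or []), "image URLs found")
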
test.py CHANGED
@@ -1,3 +1,27 @@
-a = [1, 2]
-a.append(None)
-print(a)
+def postprocess_label(labels: list[str]) -> str:
+    """
+    Creates a label string with the format
+    "Partially generated by [label1] and [label2] and ...".
+    Removes duplicate labels while preserving the original order.
+
+    Args:
+        labels: A list of strings representing labels.
+
+    Returns:
+        A string with the formatted label.
+    """
+    labels = list(dict.fromkeys(labels))  # Deduplicate, keep original order
+    label = "Partially generated by "
+    if len(labels) == 1:
+        label += labels[0]
+    elif len(labels) == 2:
+        label += f"{labels[0]} and {labels[1]}"
+    else:
+        combination = ", ".join(labels[0 : len(labels) - 1])
+        label += f"{combination}, and {labels[-1]}"
+    return label
+
+
+labels = ["gpt-4o", "gpt-4o-mini", "gpt-4o-l"]
+postprocessed_label = postprocess_label(labels)
+print(postprocessed_label)
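
For reference, the example call above is expected to print the following (expected output, assuming the labels list keeps its original order; not captured from an actual run):

    Partially generated by gpt-4o, gpt-4o-mini, and gpt-4o-l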