Commit · b73a4fc
Parent(s): 0827f9d
refactor code and fix bugs
- application.py +2 -2
- src/application/config.py +8 -1
- src/application/content_detection.py +226 -258
- src/application/content_generation.py +92 -58
- src/application/formatting.py +67 -0
- src/application/formatting_ordinary_user.py +87 -0
- src/application/text/entity.py +1 -0
- src/application/text/helper.py +224 -1
- src/application/text/model_detection.py +2 -3
- src/application/text/preprocessing.py +0 -67
- src/application/text/search_detection.py +1 -1
- src/application/url_reader.py +55 -57
- test.py +27 -3
application.py
CHANGED
@@ -50,8 +50,8 @@ def generate_analysis_report(
 ):
     news_analysis = NewsVerification()
     news_analysis.load_news(news_title, news_content, news_image)
-    news_analysis.
-    return news_analysis.
+    news_analysis.determine_origin()
+    return news_analysis.generate_report()


 # Define the GUI
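For reference, a minimal usage sketch of the refactored entry point above (not part of the commit; the import path and the placeholder inputs are assumptions):

    from src.application.content_detection import NewsVerification

    news_analysis = NewsVerification()
    news_analysis.load_news("Sample title", "Sample content.", news_image=None)
    news_analysis.determine_origin()
    # generate_report() returns three HTML tables: ordinary user, fact checker, governor.
    ordinary_html, fact_checker_html, governor_html = news_analysis.generate_report()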
src/application/config.py
CHANGED
@@ -1,8 +1,8 @@
-# Download necessary NLTK data files
 """
 Author: Khanh Phan
 Date: 2024-12-04
 """
+
 import os

 import nltk
@@ -22,6 +22,7 @@ AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
 # GPT Model
 GPT_ENTITY_MODEL = "o1-mini"  # "gpt-4o-mini" or "o1-mini"
 GPT_PARAPHRASE_MODELS = ["gpt-4o", "gpt-4o-mini"]
+GPT_IMAGE_MODEL = "dall-e-3"
 AZUREOPENAI_CLIENT = openai.AzureOpenAI(
     api_version=AZURE_OPENAI_API_VERSION,  # AZURE_OPENAI_API_VERSION,
     api_key=AZURE_OPENAI_API_KEY,
@@ -54,6 +55,7 @@ MAX_CHAR_SIZE = 30000

 # Number of top URLs per search
 TOP_URLS_PER_SEARCH = 3
+MAX_URL_SIZE = 2 * 1024 * 1024  # ~2 MB

 # Search parameters
 GOOGLE_ENDPOINT_URL = "https://www.googleapis.com/customsearch/v1"
@@ -80,5 +82,10 @@ factor = 0: Black.
 """
 ENTITY_LIGHTEN_COLOR = 2.2
 ENTITY_DARKEN_COLOR = 0.7
+
 ENTITY_SATURATION = 0.65  # Saturation: color's intensity (vividness).
 ENTITY_BRIGHTNESS = 0.75  # color's brightness.
+
+
+# HTML formatting
+WORD_BREAK = "word-break: break-all;"
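A hypothetical sketch of how the new MAX_URL_SIZE constant could be used to skip oversized pages; the actual check lives in url_reader.py, which is not shown in this excerpt:

    import requests

    from src.application.config import MAX_URL_SIZE

    def is_downloadable(url: str) -> bool:
        # Reject responses larger than ~2 MB before fetching the body.
        response = requests.head(url, allow_redirects=True, timeout=10)
        content_length = int(response.headers.get("Content-Length", 0))
        return 0 < content_length <= MAX_URL_SIZE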
src/application/content_detection.py
CHANGED
@@ -1,7 +1,12 @@
-
+"""
+Author: Khanh Phan
+Date: 2024-12-04
+"""

 import pandas as pd

+from src.application.config import MIN_RATIO_PARAPHRASE_NUM, PARAPHRASE_THRESHOLD, PARAPHRASE_THRESHOLD_MACHINE
+from src.application.formatting import color_text, format_entity_count
 from src.application.image.image_detection import (
     detect_image_by_ai_model,
     detect_image_by_reverse_search,
@@ -11,24 +16,27 @@ from src.application.text.entity import (
     apply_highlight,
     highlight_entities,
 )
-from src.application.text.helper import
+from src.application.text.helper import (
+    extract_equal_text,
+    postprocess_label,
+    split_into_paragraphs,
+)
 from src.application.text.model_detection import (
     detect_text_by_ai_model,
     predict_generation_model,
 )
-from src.application.text.
-from src.application.text.search_detection import (
-    PARAPHRASE_THRESHOLD_MACHINE,
-    find_sentence_source,
-)
+from src.application.text.search_detection import find_sentence_source


 class NewsVerification:
     def __init__(self):
-
-
-
-        self.
+        """
+        Initializes the NewsVerification object.
+        """
+        self.news_text: str = ""
+        self.news_title: str = ""
+        self.news_content: str = ""
+        self.news_image: str = ""

         self.text_prediction_label: list[str] = ["UNKNOWN"]
         self.text_prediction_score: list[float] = [0.0]
@@ -37,8 +45,8 @@ class NewsVerification:
         self.image_prediction_score: list[str] = [0.0]
         self.image_referent_url: list[str] = []

-        self.news_prediction_label = ""
-        self.news_prediction_score = -1
+        self.news_prediction_label: str = ""
+        self.news_prediction_score: float = -1

         # news' urls to find img
         self.found_img_url: list[str] = []
@@ -52,8 +60,7 @@ class NewsVerification:
                 "similarity",
                 "paraphrase",
                 "url",
-                "
-                "entities",
+                # "entities",
             ],
         )
         self.grouped_url_df: pd.DataFrame = pd.DataFrame()
@@ -63,95 +70,165 @@ class NewsVerification:
         self.fact_checker_table: list = []
         self.governor_table: list = []

-    def load_news(self, news_title, news_content, news_image):
+    def load_news(self, news_title: str, news_content: str, news_image: str):
+        """
+        Loads news data into the object's attributes.
+
+        Args:
+            news_title (str): The title of the news article.
+            news_content (str): The content of the news article.
+            news_image (str): The url of image in news article.
+        """
+        # Combine title and content for a full text representation.
+        # .strip() removes leading/trailing whitespace for cleaner text.
         self.news_text = (news_title + "\n\n" + news_content).strip()
+
+        # if not isinstance(news_title, str) or not isinstance(
+        #     news_content,
+        #     str,
+        # ):
+        #     raise TypeError("News title and content must be strings.")
+
+        # if not isinstance(news_image, str) or news_image is not None:
+        #     Warning("News image must be a string.")
+
         self.news_title = news_title
         self.news_content = news_content
         self.news_image = news_image

-    def
-
+    def group_by_url(self):
+        """
+        Groups aligned sentences by URL
+        Then, concatenates text the 'input' and 'source' text for each group.
+        """

-        # Group inout and source by url
         def concat_text(series):
+            """
+            Concatenates the elements of a pd.Series into a single string.
+            """
             return " ".join(
                 series.astype(str).tolist(),
             )  # Handle mixed data types and NaNs

-
-
-
-
-
-
-
+        # Group sentences by URL and concatenate 'input' and 'source' text.
+        self.grouped_url_df = (
+            self.aligned_sentences_df.groupby("url")
+            .agg(
+                {
+                    "input": concat_text,
+                    "source": concat_text,
+                },
+            )
+            .reset_index()
+        )  # Reset index to make 'url' a regular column
+
         # Add new columns for label and score
         self.grouped_url_df["label"] = None
         self.grouped_url_df["score"] = None

         print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")

+    def determine_text_origin_by_url(self):
+        """
+        Determines the text origin for each URL group.
+        """
         for index, row in self.grouped_url_df.iterrows():
+            # Verify text origin using URL-based verification.
             label, score = self.verify_text(row["url"])
+
+            # If URL-based verification returns 'UNKNOWN', use AI detection
             if label == "UNKNOWN":
-                # Concatenate text from "input" in sentence_df
+                # Concatenate text from "input" column in sentence_df
                 text = " ".join(row["input"])

-                #
+                # Detect text origin using an AI model.
                 label, score = detect_text_by_ai_model(text)

             self.grouped_url_df.at[index, "label"] = label
             self.grouped_url_df.at[index, "score"] = score

-
-
+    def determine_text_origin(self):
+        """
+        Determines the origin of the input text by analyzing
+        its sources and applying AI detection models.
+
+        This method groups sentences by their source URLs,
+        applies verification and AI detection, and then determines
+        an overall label and score for the input text.
+        """
+        # Find the text URLs associated with the input text
+        self.find_text_source()
+
+        # Group sentences by URL and concatenate 'input' and 'source' text.
+        self.group_by_url()
+
+        # Determine the text origin for each URL group
+        self.determine_text_origin_by_url()
+
+        # Determine the overall label and score for the entire input text.
+        if not self.grouped_url_df.empty:
+            # Check for 'gpt-4o' labels in the grouped URLs.
             machine_label = self.grouped_url_df[
                 self.grouped_url_df["label"].str.contains(
-                    "
+                    "gpt-4o",
                     case=False,
                     na=False,
                 )
             ]

-            if
-
+            if not machine_label.empty:
+                # If 'gpt-4o' labels are found, post-process and assign.
+                labels = machine_label["label"].tolist()
+                label = postprocess_label(labels)
+
+                # labels = " and ".join(machine_label["label"].tolist())
+                # label = remove_duplicate_words(label)
                 self.text_prediction_label[0] = label
                 self.text_prediction_score[0] = machine_label["score"].mean()
             else:
+                # If no 'gpt-4o' labels, assign for 'HUMAN' labels.
                 machine_label = self.aligned_sentences_df[
                     self.aligned_sentences_df["label"] == "HUMAN"
                 ]
                 self.text_prediction_label[0] = "HUMAN"
                 self.text_prediction_score[0] = machine_label["score"].mean()
-        else:
+        else:
+            # If no found URLs, use AI detection on the entire input text.
             print("No source found in the input text")
             text = " ".join(self.aligned_sentences_df["input"].tolist())
-
+
+            # Detect text origin using an AI model.
             label, score = detect_text_by_ai_model(text)
             self.text_prediction_label[0] = label
             self.text_prediction_score[0] = score

     def find_text_source(self):
         """
-        Determines the origin of the given text based on paraphrasing
-        and human authorship analysis.
-
-        Args:
-            text: The input text to be analyzed.
-
-
-
-
-            - "MACHINE": If the text is likely generated by a machine.
+        Determines the origin of the given text based on paraphrasing
+        detection and human authorship analysis.
+
+        1. Splits the input news text into sentences,
+        2. Searches for sources for each sentence
+        3. Updates the aligned_sentences_df with the found sources.
         """
         print("CHECK TEXT:")
         print("\tFrom search engine:")
-
-        # input_sentences = split_into_sentences(self.news_text)
+
         input_paragraphs = split_into_paragraphs(self.news_text)
+
+        # Initialize an empty DataFrame if it doesn't exist, otherwise extend it.
+        if not hasattr(self, 'aligned_sentences_df') or self.aligned_sentences_df is None:
+            self.aligned_sentences_df = pd.DataFrame(columns=[
+                "input",
+                "source",
+                "label",
+                "similarity",
+                "paraphrase",
+                "url",
+                "entities",
+            ])

-        # Setup
-
+        # Setup DataFrame for input_sentences
         for _ in range(len(input_paragraphs)):
             self.aligned_sentences_df = pd.concat(
                 [
@@ -173,7 +250,7 @@ class NewsVerification:
                 ignore_index=True,
             )

-        #
+        # Find a source for each sentence
         for index, _ in enumerate(input_paragraphs):
             similarity = self.aligned_sentences_df.loc[index, "similarity"]
             if similarity is not None:
@@ -188,23 +265,47 @@ class NewsVerification:
                 index,
                 self.aligned_sentences_df,
             )
-
+
+            # Initialize found_img_url if it does not exist.
+            if not hasattr(self, 'found_img_url'):
+                self.found_img_url = []
             self.found_img_url.extend(img_urls)

-    # determine if the whole source is from a news or not
-
     def verify_text(self, url):
+        """
+        Verifies the text origin based on similarity scores and labels
+        associated with a given URL.
+
+        1. Filters sentences by URL and similarity score,
+        2. Determines if the text is likely generated by a machine or a human.
+        3. Calculates an average similarity score.
+
+        Args:
+            url (str): The URL to filter sentences by.
+
+        Returns:
+            tuple: A
+                - Label ("MACHINE", "HUMAN", or "UNKNOWN")
+                - Score
+        """
         label = "UNKNOWN"
         score = 0
+
         # calculate the average similarity when the similary score
         # in each row of sentences_df is higher than 0.8
+
+        # Filter sentences by URL.
         filtered_by_url = self.aligned_sentences_df[
             self.aligned_sentences_df["url"] == url
         ]
+
+        # Filter sentences by similarity score (> PARAPHRASE_THRESHOLD).
         filtered_by_similarity = filtered_by_url[
-            filtered_by_url["similarity"] >
+            filtered_by_url["similarity"] > PARAPHRASE_THRESHOLD
        ]
-
+
+        # Check if a ratio of remaining filtering-sentences is more than 50%.
+        if len(filtered_by_similarity) / len(self.aligned_sentences_df) > MIN_RATIO_PARAPHRASE_NUM:
         # check if "MACHINE" is in self.aligned_sentences_df["label"]:
         contains_machine = (
             filtered_by_similarity["label"]
@@ -215,8 +316,10 @@ class NewsVerification:
             )
             .any()
         )
+
+        # TODO: integrate with determine_text_origin
         if contains_machine:
-
+            # If "MACHINE" label is present, set label and calculate score.
             machine_rows = filtered_by_similarity[
                 filtered_by_similarity["label"].str.contains(
                     "MACHINE",
@@ -225,9 +328,10 @@ class NewsVerification:
                 )
             ]
             generated_model, _ = predict_generation_model(self.news_text)
-            label
+            label = f"Partially generated by {generated_model}"
             score = machine_rows["similarity"].mean()
         else:
+            # If no "MACHINE" label, assign "HUMAN" label and calculate score.
             label = "HUMAN"
             human_rows = filtered_by_similarity[
                 filtered_by_similarity["label"].str.contains(
@@ -241,13 +345,26 @@ class NewsVerification:
         return label, score

     def determine_image_origin(self):
+        """
+        Determines the origin of the news image using various detection methods.
+
+        1. Matching against previously found image URLs.
+        2. Reverse image search.
+        3. AI-based image detection.
+
+        If none of these methods succeed, the image origin is marked as "UNKNOWN".
+        """
         print("CHECK IMAGE:")
+
+        # Handle the case where no image is provided.
         if self.news_image is None:
             self.image_prediction_label = "UNKNOWN"
             self.image_prediction_score = 0.0
             self.image_referent_url = None
             return

+        # Attempt to match the image against previously found image URLs.
+        print("\tFrom found image URLs...")
         matched_url, similarity = detect_image_from_news_image(
             self.news_image,
             self.found_img_url,
@@ -259,6 +376,8 @@ class NewsVerification:
         self.image_referent_url = matched_url
         return

+        # Attempt to find the image origin using reverse image search.
+        print("\tFrom reverse image search...")
         matched_url, similarity = detect_image_by_reverse_search(
             self.news_image,
         )
@@ -269,6 +388,8 @@ class NewsVerification:
         self.image_referent_url = matched_url
         return

+        # Attempt to detect the image origin using an AI model.
+        print("\tFrom an AI model...")
         detected_label, score = detect_image_by_ai_model(self.news_image)
         if detected_label:
             print(f"detected_label: {detected_label} ({score})")
@@ -277,18 +398,34 @@ class NewsVerification:
         self.image_referent_url = None
         return

+        # If all detection methods fail, mark the image origin as "UNKNOWN".
         self.image_prediction_label = "UNKNOWN"
         self.image_prediction_score = 50
         self.image_referent_url = None

-    def
+    def determine_origin(self):
+        """
+        Determine origins by analyzing the news text and image.
+        """
         if self.news_text != "":
             self.determine_text_origin()
         if self.news_image != "":
             self.determine_image_origin()
-
-
+
+        # Handle entity recognition and processing.
         self.handle_entities()
+
+    def generate_report(self) -> tuple[str, str, str]:
+        """
+        Generates reports tailored for different user roles
+        (ordinary users, fact checkers, governors).
+
+        Returns:
+            tuple: A tuple containing three html-formatted reports:
+                - ordinary_user_table: Report for ordinary users.
+                - fact_checker_table: Report for fact checkers.
+                - governor_table: Report for governors.
+        """
         ordinary_user_table = self.create_ordinary_user_table()
         fact_checker_table = self.create_fact_checker_table()
         governor_table = self.create_governor_table()
@@ -296,6 +433,16 @@ class NewsVerification:
         return ordinary_user_table, fact_checker_table, governor_table

     def handle_entities(self):
+        """
+        Highlights and assigns entities with colors to aligned sentences
+        based on grouped URLs.
+
+        For each grouped URL:
+        1. Highlights entities in the input and source text
+        2. Then assigns these highlighted entities to the corresponding
+            sentences in the aligned sentences DataFrame.
+        """
+
         entities_with_colors = []
         for index, row in self.grouped_url_df.iterrows():
             # Get entity-words (in pair) with colors
@@ -304,51 +451,23 @@ class NewsVerification:
                 row["source"],
             )

+            # Assign the highlighted entities to the corresponding sentences
+            # in aligned_sentences_df.
             for index, sentence in self.aligned_sentences_df.iterrows():
                 if sentence["url"] == row["url"]:
+                    # Use .at to modify the DataFrame efficiently.
                     self.aligned_sentences_df.at[index, "entities"] = (
                         entities_with_colors
                     )

-    def get_text_urls(self):
-        return set(self.text_referent_url)
-
-    def compare_sentences(self, sentence_1, sentence_2, position, color):
+    def get_text_urls(self) -> set:
         """
-
-        outputting their start and end positions.
-
+        Returns a set of unique URLs referenced in the text analysis.
+
+        Returns:
+            set: A set containing the unique URLs referenced in the text.
         """
-
-        if not sentence_1 or not sentence_2:  # Handle empty strings
-            return []
-
-        s = SequenceMatcher(None, sentence_1, sentence_2)
-        common_phrases = []
-
-        for block in s.get_matching_blocks():
-            if block.size > 0:  # Ignore zero-length matches
-                start_1 = block.a
-                end_1 = block.a + block.size
-                start_2 = block.b
-                end_2 = block.b + block.size
-
-                phrase = sentence_1[
-                    start_1:end_1
-                ]  # Or sentence_2[start_2:end_2], they are the same
-
-                common_phrases.append(
-                    {
-                        "phrase": phrase,
-                        "start_1": start_1 + position,
-                        "end_1": end_1 + position,
-                        "start_2": start_2,
-                        "end_2": end_2,
-                        "color": color,
-                    },
-                )
-        position += len(sentence_1)
-        return common_phrases, position
+        return set(self.text_referent_url)

     def create_fact_checker_table(self):
         rows = []
@@ -387,7 +506,7 @@ class NewsVerification:
         if index == 0 or current_url != previous_url:
             first_url_row = True
             previous_url = current_url
-            # Increase counter
+            # Increase counter "span_row" when the next url is the same
             while (
                 index + span_row < len(self.fact_checker_table)
                 and self.fact_checker_table[index + span_row][4]
@@ -432,7 +551,7 @@ class NewsVerification:
         </table>

         <style>
-
+        """

     def format_text_fact_checker_row(
         self,
@@ -467,12 +586,12 @@ class NewsVerification:
         entity_count = len(row[3])

         # Color overlapping words
-        input_sentence =
+        input_sentence = color_text(
             input_sentence,
             row[1],
             highlight_idx_input,
         )  # text, index of highlight words
-        source_sentence =
+        source_sentence = color_text(
             source_sentence,
             row[2],
             highlight_idx_source,
@@ -493,6 +612,7 @@ class NewsVerification:
         source_sentence = row[0]["source"]

         url = row[0]["url"]
+
         # Displayed label and score by url
         filterby_url = self.grouped_url_df[self.grouped_url_df["url"] == url]
         if len(filterby_url) > 0:
@@ -506,7 +626,7 @@ class NewsVerification:
         source_text_url = f"""<a href="{url}">{url}</a>"""

         # Format displayed entity count
-        entity_count_text =
+        entity_count_text = format_entity_count(entity_count)

         border_top = "border-top: 1px solid transparent;"
         border_bottom = "border-bottom: 1px solid transparent;"
@@ -600,7 +720,7 @@ class NewsVerification:
         <style>
         """

-    def format_text_ordinary_user_row(self
+    def format_text_ordinary_user_row(self):
         input_sentences = ""
         source_text_urls = ""
         urls = []
@@ -623,7 +743,7 @@ class NewsVerification:
         </tr>
         """

-    def format_image_ordinary_user_row(self
+    def format_image_ordinary_user_row(self):

         if (
             self.image_referent_url is not None
@@ -720,12 +840,12 @@ class NewsVerification:
         )

         # Color overlapping words
-        input_sentence =
+        input_sentence = color_text(
             input_sentence,
             row[1],
             highlight_idx_input,
         )  # text, index of highlight words
-        source_sentence =
+        source_sentence = color_text(
             source_sentence,
             row[2],
             highlight_idx_source,
@@ -759,7 +879,7 @@ class NewsVerification:
         if row[3] is not None:
             entity_count.append(len(row[3]))

-        entity_count_text =
+        entity_count_text = format_entity_count(sum(entity_count))
         word_break = "word-break: break-all;"
         return f"""
         <tr>
@@ -785,155 +905,3 @@ class NewsVerification:

         word_break = "word-break: break-all;"
         return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td style="{word_break}";>{source_image_url}</td></tr>"""  # noqa: E501
-
-    def get_entity_count_text(self, entity_count):
-        if entity_count <= 0:
-            entity_count_text = ""
-        elif entity_count == 1:
-            entity_count_text = "with 1 altered entity"
-        else:
-            entity_count_text = "with altered entities"
-        return entity_count_text
-
-    def color_text(self, text, colored_idx, highlighted_idx):
-        sentence = ""
-        words = text.split()
-
-        starts, ends = self.extract_starts_ends(colored_idx)
-        starts, ends = self.filter_indices(starts, ends, highlighted_idx)
-
-        previous_end = 0
-        for start, end in zip(starts, ends):
-            sentence += " ".join(words[previous_end:start])
-
-            equal_words = " ".join(words[start:end])
-            sentence += f" <span style='color:#00FF00;'>{equal_words}</span> "
-
-            previous_end = end
-
-        sentence += " ".join(words[previous_end:])
-
-        return sentence
-
-    def extract_starts_ends(self, colored_idx):
-        starts = []
-        ends = []
-        for index in colored_idx:
-            starts.append(index["start"])
-            ends.append(index["end"])
-        return starts, ends
-
-    def filter_indices(self, starts, ends, ignore_indices):
-        """
-        Filters start and end indices to exclude any indices present in the
-        ignore_indices list.
-
-        Args:
-            starts: A list of starting indices.
-            ends: A list of ending indices. Must be the same length as starts.
-            ignore_indices: A list of indices to exclude.
-
-        Returns:
-            A tuple of two lists: filtered_starts and filtered_ends.
-            Returns empty lists if the input is invalid
-            or if all ranges are filtered out.
-            Prints error messages for invalid input.
-
-        Examples:
-            starts = [0, 5, 10]
-            ends = [3, 7, 12]  # words at the end will not be colored.
-            ignore_indices = [1, 2, 12, 17]
-
-            # Output:
-            starts = [0, 3, 5, 10]
-            ends = [1, 4, 7, 12]
-
-        """
-
-        if len(starts) != len(ends):
-            print(
-                "Error: The 'starts' and 'ends' lists must have the same length.",  # noqa: E501
-            )
-            return [], []
-
-        filtered_starts = []
-        filtered_ends = []
-
-        for i in range(len(starts)):
-            start = starts[i]
-            end = ends[i]
-
-            if end < start:
-                print(
-                    f"Error: End index {end} is less than start index {start} at position {i}.",  # noqa: E501
-                )
-                return [], []
-
-            start_end = list(range(start, end + 1, 1))
-            start_end = list(set(start_end) - set(ignore_indices))
-            # new_start, new_end = self.extract_sequences(start_end)
-            new_start, new_end = self.extract_new_startend(
-                start,
-                end,
-                ignore_indices,
-            )
-            filtered_starts.extend(new_start)
-            filtered_ends.extend(new_end)
-
-        return filtered_starts, filtered_ends
-
-    def extract_new_startend(self, start, end, ignore_indices):
-        # sort a set of ignore_indices
-        indexes = list(set(ignore_indices))
-        indexes.sort()
-
-        new_starts = []
-        new_ends = []
-        new_start = start
-        if indexes is None or len(indexes) < 1:
-            new_starts.append(start)
-            new_ends.append(end)
-            return new_starts, new_ends
-
-        for index in indexes:
-            if index < start:
-                continue
-            elif index >= end:
-                continue
-
-            new_starts.append(new_start)
-            new_ends.append(index)
-
-            new_start = index + 1
-
-        new_starts.append(new_start)
-        new_ends.append(end)
-
-        return new_starts, new_ends
-
-    def extract_sequences(self, numbers):
-        if len(numbers) == 1:
-            return [numbers[0]], [numbers[0]]
-
-        numbers.sort()
-        starts = []
-        ends = []
-        for i, number in enumerate(numbers):
-            if i == 0:
-                start = number
-                end = number
-                continue
-
-            if number - 1 == numbers[i - 1]:
-                end = number
-            else:
-                starts.append(start)
-                ends.append(end)
-                start = number
-                end = number
-
-            if i == len(numbers) - 1:
-                starts.append(start)
-                ends.append(end)
-
-        return starts, ends
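To illustrate the new group_by_url() step above, a toy example of the same groupby/agg pattern (all data here is made up):

    import pandas as pd

    aligned_sentences_df = pd.DataFrame(
        {
            "url": ["https://a.com", "https://a.com", "https://b.com"],
            "input": ["sentence 1", "sentence 2", "sentence 3"],
            "source": ["match 1", "match 2", "match 3"],
        }
    )

    def concat_text(series):
        return " ".join(series.astype(str).tolist())

    grouped_url_df = (
        aligned_sentences_df.groupby("url")
        .agg({"input": concat_text, "source": concat_text})
        .reset_index()
    )
    # grouped_url_df now has one row per URL with the concatenated
    # "input" and "source" text, ready for verify_text() / AI detection.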
src/application/content_generation.py
CHANGED
@@ -1,30 +1,39 @@
 import json
-import os

 import openai
-
+import pandas as pd

-
-
-
-AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
-
-client = openai.AzureOpenAI(
-    api_version=AZURE_OPENAI_API_VERSION,
-    api_key=AZURE_OPENAI_API_KEY,
-    azure_endpoint=AZURE_OPENAI_ENDPOINT,
+from src.application.config import (
+    AZUREOPENAI_CLIENT,
+    GPT_IMAGE_MODEL,
 )


-def generate_fake_text(
+def generate_fake_text(
+    text_generation_model: str,
+    title: str = None,
+    content: str = None,
+) -> tuple[str, str]:
+    """
+    Generates fake news title and content using an Azure OpenAI model.
+
+    Args:
+        text_generation_model: The name of the Azure OpenAI model to use.
+        title: Optional title to use as context for fake text generation.
+        content: Optional content to use as context for fake text generation.
+
+    Returns:
+        A tuple containing the generated fake title and content (both strings).
+        Returns empty strings if generation fails.
+    """
     # Generate text using the selected models
     prompt = """Generate a random fake news tittle in this format:
-
-
-
-
-
-
+---
+# Title: [Fake Title]
+# Content:
+[Fake Content]
+---
+"""
     if title and content:
         prompt += """base on the following context:
         # Title: {news_title}:\n# Content: {news_content}"""
@@ -38,7 +47,7 @@ def generate_fake_text(text_generation_model, title, content):
     # Generate text using the text generation model
     # Generate text using the selected model
     try:
-        response =
+        response = AZUREOPENAI_CLIENT.chat.completions.create(
             model=text_generation_model,
             messages=[{"role": "system", "content": prompt}],
         )
@@ -58,67 +67,92 @@ def generate_fake_text(text_generation_model, title, content):
     return fake_title, fake_content


-def extract_title_content(fake_news):
+def extract_title_content(fake_news: str) -> tuple[str, str]:
     """
-    Extracts the title and content from the generated fake
-
-    This function parses a string containing fake news, which is expected
-    to have a specific format with a title and content section marked by
-    '# Title:' and '# Content:' respectively.
+    Extracts the title and content from the generated fake text.

     Args:
-        fake_news
+        fake_news: The generated fake text string.

     Returns:
-
-        - title (str): The extracted title of the fake news.
-        - content (str): The extracted content of the fake news.
-
-    Note:
-        The function assumes that the input string follows the expected format.
-        If the format is not as expected, it may return unexpected results.
+        A tuple containing the extracted title and content.
     """
-
-
-    title_end_index = fake_news.find("\n", title_start_index)
-    title = fake_news[title_start_index:title_end_index].strip()
+    title = ""
+    content = ""

-
-
-
-
+    try:
+        # Extract the title and content from the generated fake news
+        title_start = fake_news.find("# Title: ") + len("# Title: ")
+        title_end = fake_news.find("\n", title_start)
+        if title_start != -1 and title_end != -1:
+            title = fake_news[title_start:title_end].strip()
+
+        title_start = fake_news.find("\n# Content: ") + len(
+            "\n# Content: ",
+        )
+        content = fake_news[title_start:].strip()
+    except Exception as e:
+        print(f"Error extracting title and content: {e}")

     return title, content


-def generate_fake_image(
-
-
-
-
-
-        model="dall-e-3",  # the name of your DALL-E 3 deployment
-        prompt=IMAGE_PROMPT,
-        n=1,
-    )
-    image_url = json.loads(result.model_dump_json())["data"][0]["url"]
-    return image_url
+def generate_fake_image(
+    title: str,
+    model: str = GPT_IMAGE_MODEL,
+) -> str | None:
+    """
+    Generates a fake image URL using Azure OpenAI's image generation API.
+
+    Args:
+        title: The title to use as a prompt for image generation.
+        model: The name of the Azure OpenAI image generation model to use.
+
+    Returns:
+        The URL of the generated image, or None if an error occurs.
+    """
+    try:
+        if title:
+            image_prompt = f"Generate a random image about {title}"
+        else:
+            image_prompt = "Generate a random image"
+
+        result = AZUREOPENAI_CLIENT.images.generate(
+            model=model,
+            prompt=image_prompt,
+            n=1,
+        )
+
+        image_url = json.loads(result.model_dump_json())["data"][0]["url"]
+        return image_url
+
+    except Exception as e:
+        print(f"Error generating fake image: {e}")
+        return None  # Return None if an error occurs


-
+def replace_text(
+    news_title: str,
+    news_content: str,
+    replace_df: pd.DataFrame,
+) -> tuple[str, str]:
     """
-
+    Replaces occurrences in the input title and content
+    based on the provided DataFrame.

     Args:
-
-
+        news_title: The input news title.
+        news_content: The input news content.
+        replace_df: A DataFrame with two columns:
+            "Find what:" and "Replace with:".

     Returns:
-
+        A tuple containing the modified news title and content.
     """
     for _, row in replace_df.iterrows():
         find_what = row["Find what:"]
         replace_with = row["Replace with:"]
         news_content = news_content.replace(find_what, replace_with)
         news_title = news_title.replace(find_what, replace_with)
+
     return news_title, news_content
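A small example of the format extract_title_content() expects (the sample string is made up):

    sample = "# Title: Sample headline\n# Content: Sample body text."
    title, content = extract_title_content(sample)
    # title   -> "Sample headline"
    # content -> "Sample body text."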
src/application/formatting.py
ADDED
@@ -0,0 +1,67 @@
from src.application.text.helper import extract_starts_ends, filter_indices


def color_text(text: str, colored_idx: list[dict], highlighted_idx: list[int]) -> str:
    """
    Colors specific words in a text based on provided indices.

    This method takes a text, a list of indices to color, and a list of indices to exclude.
    It splits the text into words, filters the indices, and then wraps the words within
    the specified ranges with a green span tag for coloring.

    Args:
        text (str): The input text.
        colored_idx (list): A list of dictionaries, where each dictionary contains
            'start' and 'end' keys representing indices of words to color.
        highlighted_idx (list): A list of indices to exclude from coloring.

    Returns:
        str: The text with colored words.
    """
    sentence = ""
    words = text.split()

    # Extract start and end indices from colored_idx.
    starts, ends = extract_starts_ends(colored_idx)

    # Filter the start and end indices to exclude highlighted_idx.
    starts, ends = filter_indices(starts, ends, highlighted_idx)

    previous_end = 0
    for start, end in zip(starts, ends):
        # Add the words before the current colored range to the sentence.
        sentence += " ".join(words[previous_end:start])

        # Add the colored range to the sentence.
        equal_words = " ".join(words[start:end])
        sentence += f" <span style='color:#00FF00;'>{equal_words}</span> "

        # Update the previous end index.
        previous_end = end

    # Add the remaining words after the last colored range to the sentence.
    sentence += " ".join(words[previous_end:])

    return sentence


def format_entity_count(entity_count: int) -> str:
    """
    Generates a text description based on the number of altered entities.

    Args:
        entity_count (int): The number of altered entities.

    Returns:
        str: A text description of the entity count.
            - "" if entity_count is 0 or negative.
            - "with 1 altered entity" if entity_count is 1.
            - "with altered entities" if entity_count is greater than 1.
    """
    if entity_count <= 0:
        entity_count_text = ""
    elif entity_count == 1:
        entity_count_text = "with 1 altered entity"
    else:
        entity_count_text = "with altered entities"
    return entity_count_text
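A quick illustration of color_text() from the new module (the word indices are made up):

    text = "the quick brown fox jumps"
    colored_idx = [{"start": 1, "end": 3}]   # words 1..2 ("quick brown")
    highlighted_idx = []                     # nothing excluded

    html = color_text(text, colored_idx, highlighted_idx)
    # roughly: "the <span style='color:#00FF00;'>quick brown</span> fox jumps"
    # (up to whitespace around the span)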
src/application/formatting_ordinary_user.py
ADDED
@@ -0,0 +1,87 @@
from src.application.config import WORD_BREAK


def create_ordinary_user_table(self):
    rows = []
    rows.append(self.format_image_ordinary_user_row())
    rows.append(self.format_text_ordinary_user_row())
    table = "\n".join(rows)

    return f"""
    <h5>Comparison between input news and source news:</h5>
    <table border="1" style="width:100%; text-align:left;">
        <col style="width: 340px;">
        <col style="width: 30px;">
        <col style="width: 75px;">
        <thead>
            <tr>
                <th>Input news</th>
                <th>Forensic</th>
                <th>Originality</th>
            </tr>
        </thead>
        <tbody>
            {table}
        </tbody>
    </table>

    <style>
    """


def format_text_ordinary_user_row(self):
    input_sentences = ""
    source_text_urls = ""
    urls = []
    for _, row in self.aligned_sentences_df.iterrows():
        if row["input"] is None:
            continue

        input_sentences += row["input"] + "<br><br>"
        url = row["url"]
        if url not in urls:
            urls.append(url)
            source_text_urls += f"""<a href="{url}">{url}</a><br>"""

    return f"""
    <tr>
        <td>{input_sentences}</td>
        <td>{self.text_prediction_label[0]}<br>
            ({self.text_prediction_score[0] * 100:.2f}%)</td>
        <td style="{WORD_BREAK}";>{source_text_urls}</td>
    </tr>
    """


def format_image_ordinary_user_row(
    image_referent_url: str,
    image_prediction_label: str,
    image_prediction_score: float,
):
    """
    Formats an HTML table row for ordinary users,
    displaying image analysis results.

    Args:
        image_referent_url (str): The URL of the referenced image.
        image_prediction_label (str): The predicted label for the image.
        image_prediction_score (float): The prediction score for the image.

    Returns:
        str: An HTML table row string containing the image analysis results.
    """

    # Put image, label, and score into html tag
    if (
        image_referent_url is not None
        or image_referent_url != ""
    ):
        source_image_url = f"""<a href="{image_referent_url}">{image_referent_url}</a>"""  # noqa: E501
    else:
        source_image_url = ""

    return f"""
    <tr>
        <td>input image</td>
        <td>{image_prediction_label}<br>({image_prediction_score:.2f}%)</td>
        <td style="{WORD_BREAK}";>{source_image_url}</td>
    </tr>
    """
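For example, the module-level image row helper can be called directly (the values below are made up):

    row_html = format_image_ordinary_user_row(
        image_referent_url="https://example.com/photo.jpg",
        image_prediction_label="HUMAN",
        image_prediction_score=97.5,
    )
    # row_html is a <tr>...</tr> fragment used inside create_ordinary_user_table().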
src/application/text/entity.py
CHANGED
@@ -44,6 +44,7 @@ def extract_entities_gpt(
     """

     # Construct the prompt for the GPT model.
+    # TODO: Move to config or prompt file
     prompt = f"""
     Compare the ORIGINAL TEXT and the COMPARED TEXT.
     Find entity pairs with significantly different meanings after paraphrasing.
src/application/text/helper.py
CHANGED
@@ -8,7 +8,10 @@ import string
|
|
8 |
from collections import Counter
|
9 |
from difflib import SequenceMatcher
|
10 |
|
11 |
-
from nltk.tokenize import
|
|
|
|
|
|
|
12 |
from nltk.util import ngrams
|
13 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
14 |
|
@@ -276,3 +279,223 @@ def connect_consecutive_indexes(nums: list[int]) -> list[list[int, int]]:
     # Add the last range to the result.
     result.append([start, end])
     return result
+
+
+def postprocess_label(labels: list[str]) -> str:
+    """
+    Creates a label string with the format
+    "Partially generated by [label1] and [label2] and ...".
+    Removes duplicate labels while preserving the original order.
+
+    Args:
+        labels: A list of strings representing labels.
+
+    Returns:
+        A string with the formatted label.
+    """
+    prefix = "Partially generated by "
+    for index, label in enumerate(labels):
+        if label.startswith(prefix):
+            labels[index] = label[len(prefix):]
+
+    labels = list(set(labels))
+    label = prefix
+
+    if len(labels) == 1:
+        label += labels[0]
+    elif len(labels) == 2:
+        label += f"{labels[0]} and {labels[1]}"
+    else:
+        combination = ", ".join(labels[0 : len(labels) - 1])
+        label += f"{combination}, and {labels[-1]}"
+    return label
+
+
+def split_into_sentences(input_text: str) -> list[str]:
+    """
+    Splits input text into sentences by newlines
+    and then tokenizes each paragraph into sentences.
+
+    Args:
+        input_text (str): The input text as a string.
+
+    Returns:
+        list: A list of sentences.
+            Returns an empty list if input is not a string.
+    """
+    if not isinstance(input_text, str):
+        return []
+
+    # Split the input text into paragraphs based on newline characters,
+    # keeping the newline characters.
+    paragraphs = input_text.splitlines(keepends=True)
+    sentences = []
+    for paragraph in paragraphs:
+        # Remove leading/trailing whitespace
+        paragraph = paragraph.strip()
+
+        if paragraph and paragraph != "\n":
+            # Tokenize the paragraph into sentences
+            sentences.extend(sent_tokenize(paragraph))
+
+    return sentences
+
+
+def split_into_paragraphs(input_text: str) -> list[str]:
+    """
+    Splits input text into paragraphs based on newline characters.
+
+    Args:
+        input_text (str): The input text as a string.
+
+    Returns:
+        list: A list of paragraphs.
+            Returns an empty list if input is not a string.
+    """
+    if not isinstance(input_text, str):
+        return []
+
+    # Split the input text into paragraphs based on newline characters,
+    # keeping the newline characters.
+    paragraphs = input_text.splitlines(keepends=True)
+    out_paragraphs = []
+
+    for paragraph in paragraphs:
+        # Remove leading/trailing whitespace
+        paragraph = paragraph.strip()
+
+        if paragraph and paragraph != "\n":
+            # Append the cleaned paragraph to the output list.
+            out_paragraphs.append(paragraph)
+
+    return out_paragraphs
+
+
+def extract_starts_ends(colored_idx: list[dict]) -> tuple[list[int], list[int]]:
+    """
+    Extracts start and end indices from a list of dictionaries.
+
+    Args:
+        colored_idx (list[dict]): A list of dictionaries,
+            where each dictionary has 'start' and 'end' keys.
+
+    Returns:
+        tuple: A tuple containing two lists:
+            - starts (list[int]): A list of start indices.
+            - ends (list[int]): A list of end indices.
+    """
+    starts = []
+    ends = []
+    for index in colored_idx:
+        starts.append(index["start"])
+        ends.append(index["end"])
+    return starts, ends
+
+
+def filter_indices(starts: list[int], ends: list[int], ignore_indices: list[int]):
+    """
+    Filters start and end indices to exclude any indices present in the
+    ignore_indices list.
+
+    Args:
+        starts (list[int]): A list of starting indices.
+        ends (list[int]): A list of ending indices.
+            Must be the same length as starts.
+        ignore_indices (list[int]): A list of indices to exclude.
+
+    Returns:
+        A tuple of two lists of integers:
+            - filtered_starts
+            - filtered_ends
+        Returns empty lists if the input is invalid
+            or if all ranges are filtered out.
+
+    Examples:
+        starts = [0, 5, 10]
+        ends = [3, 7, 12]  # words at the end will not be colored.
+        ignore_indices = [1, 2, 12, 17]
+
+        # Output:
+        starts = [0, 3, 5, 10]
+        ends = [1, 4, 7, 12]
+
+    """
+
+    if len(starts) != len(ends):
+        print(
+            "Error: The 'starts' & 'ends' lists must have the same length.",
+        )
+        return [], []
+
+    filtered_starts = []
+    filtered_ends = []
+
+    for i in range(len(starts)):
+        start = starts[i]
+        end = ends[i]
+
+        if end < start:
+            print(
+                f"Error: End index {end} < start index {start} at position {i}.",  # noqa: E501
+            )
+            return [], []
+
+        start_end = list(range(start, end + 1, 1))
+        start_end = list(set(start_end) - set(ignore_indices))
+        # new_start, new_end = self.extract_sequences(start_end)
+        new_start, new_end = extract_new_startend(
+            start,
+            end,
+            ignore_indices,
+        )
+        filtered_starts.extend(new_start)
+        filtered_ends.extend(new_end)
+
+    return filtered_starts, filtered_ends
+
+
+def extract_new_startend(start: int, end: int, ignore_indices: list[int]) -> tuple[list[int], list[int]]:
+    """
+    Extracts new start and end indices by splitting a range based on
+    ignored indices.
+
+    Args:
+        start (int): The starting index of the range.
+        end (int): The ending index of the range (exclusive).
+        ignore_indices (list): indices to ignore within the range.
+
+    Returns:
+        tuple: A tuple containing two lists:
+            - new_starts (list): Starting indices for the sub-ranges.
+            - new_ends (list): Ending indices for the sub-ranges.
+    """
+    # Sort the set of ignore_indices in ascending order.
+    indexes = list(set(ignore_indices))
+    indexes.sort()
+
+    new_starts = []
+    new_ends = []
+    new_start = start
+
+    # If no indices to ignore, return the original range.
+    if indexes is None or len(indexes) < 1:
+        new_starts.append(start)
+        new_ends.append(end)
+        return new_starts, new_ends
+
+    for index in indexes:
+        # Skip indices that are outside the range [start, end).
+        if index < start:
+            continue
+        elif index >= end:
+            continue
+
+        new_starts.append(new_start)
+        new_ends.append(index)
+
+        new_start = index + 1
+
+    new_starts.append(new_start)
+    new_ends.append(end)
+
+    return new_starts, new_ends
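A quick sketch of how the new helpers might be exercised from a Python shell. The inputs are made up, sentence splitting assumes the NLTK tokenizer data is available, and exact outputs can vary because postprocess_label de-duplicates via set():

# Hypothetical usage of the helpers added in this commit
from src.application.text.helper import (
    filter_indices,
    postprocess_label,
    split_into_sentences,
)

print(postprocess_label(["gpt-4o", "Partially generated by gpt-4o-mini"]))
# e.g. "Partially generated by gpt-4o and gpt-4o-mini" (order not guaranteed)

print(split_into_sentences("First sentence. Second sentence.\nNew paragraph."))
# ['First sentence.', 'Second sentence.', 'New paragraph.']

# Drop ignored word indexes from highlight ranges, splitting each range as needed.
starts, ends = filter_indices([0, 5], [3, 7], ignore_indices=[1, 6])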
src/application/text/model_detection.py
CHANGED
@@ -13,7 +13,6 @@ from src.application.config import (
     DEVICE,
     GPT_PARAPHRASE_MODELS,
     HUMAN,
-    MACHINE,
     MODEL_HUMAN_LABEL,
     PARAPHRASE_MODEL,
     UNKNOWN,
@@ -62,9 +61,9 @@ def detect_text_by_ai_model(
     if result["label"] == MODEL_HUMAN_LABEL[model]:
         label = HUMAN
     else:
-        label = MACHINE
+        # label = MACHINE
         generated_model, _ = predict_generation_model(input_text)
-        label
+        label = f"Partially generated by {generated_model}"
 
     return label, confidence_score
 
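With this change, detect_text_by_ai_model no longer returns the bare MACHINE constant; machine-generated text is labelled "Partially generated by <model>". A minimal sketch of downstream handling under that assumption (the describe helper is hypothetical, not part of this commit):

# Hypothetical downstream handling of the new label format
from src.application.config import HUMAN

def describe(label: str, confidence_score: float) -> str:
    # label and confidence_score as returned by detect_text_by_ai_model
    if label == HUMAN:
        return f"Human-written (confidence {confidence_score:.2f})"
    if label.startswith("Partially generated by "):
        model_name = label[len("Partially generated by "):]
        return f"Generated by {model_name} (confidence {confidence_score:.2f})"
    return f"Unknown origin (confidence {confidence_score:.2f})"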
src/application/text/preprocessing.py
DELETED
@@ -1,67 +0,0 @@
-"""
-Author: Khanh Phan
-Date: 2024-12-04
-"""
-
-from nltk.tokenize import sent_tokenize
-
-
-# TODO: consider moving to helpers
-def split_into_sentences(input_text: str) -> list[str]:
-    """
-    Splits input text into sentences by newlines
-    and then tokenizes each paragraph into sentences.
-
-    Args:
-        input_text (str): The input text as a string.
-
-    Returns:
-        list: A list of sentences.
-            Returns an empty list if input is not a string.
-    """
-    if not isinstance(input_text, str):
-        return []
-
-    # Split the input text into paragraphs based on newline characters,
-    # keeping the newline characters.
-    paragraphs = input_text.splitlines(keepends=True)
-    sentences = []
-    for paragraph in paragraphs:
-        # Remove leading/trailing whitespace
-        paragraph = paragraph.strip()
-
-        if paragraph and paragraph != "\n":
-            # Tokenize the paragraph into sentences
-            sentences.extend(sent_tokenize(paragraph))
-
-    return sentences
-
-
-def split_into_paragraphs(input_text: str) -> list[str]:
-    """
-    Splits input text into paragraphs based on newline characters.
-
-    Args:
-        input_text (str): The input text as a string.
-
-    Returns:
-        list: A list of paragraphs.
-            Returns an empty list if input is not a string.
-    """
-    if not isinstance(input_text, str):
-        return []
-
-    # Split the input text into paragraphs based on newline characters,
-    # keeping the newline characters.
-    paragraphs = input_text.splitlines(keepends=True)
-    out_paragraphs = []
-
-    for paragraph in paragraphs:
-        # Remove leading/trailing whitespace
-        paragraph = paragraph.strip()
-
-        if paragraph and paragraph != "\n":
-            # Append the cleaned paragraph to the output list.
-            out_paragraphs.append(paragraph)
-
-    return out_paragraphs
src/application/text/search_detection.py
CHANGED
@@ -17,7 +17,7 @@ from src.application.config import (
     PARAPHRASE_THRESHOLD_MACHINE,
     TOP_URLS_PER_SEARCH,
 )
-from src.application.text.
+from src.application.text.helper import split_into_sentences
 from src.application.text.search import (
     generate_search_phrases,
     search_by_google,
src/application/url_reader.py
CHANGED
@@ -8,18 +8,29 @@ from newspaper import (
 article,
 )
 
-
-MAX_URL_SIZE = 2000000 # ~2MB
+from src.application.config import MAX_URL_SIZE
 
 
 class URLReader:
+    """
+    A class to extract content (title, text, images) from a given URL.
+    Supports two extraction methods: newspaper4k and BeautifulSoup.
+    """
+
     def __init__(self, url: string, newspaper: bool = True):
+        """
+        Initializes the URLReader object.
+
+        Args:
+            url: The URL to extract content from.
+            newspaper: True to use newspaper4k, False to use BeautifulSoup.
+        """
+        self.url: str = url
+        self.text: str = None  # Extracted text content
+        self.title: str = None  # Extracted title
+        self.images: list[str] = None  # list of image URLs
+        self.top_image: str = None  # URL of the top image
+        self.is_extracted: bool = False  # Indicating successful extraction
 
         url_size = self.get_size()
         if url_size is None or url_size > MAX_URL_SIZE:
@@ -27,9 +38,7 @@ class URLReader:
         else:
             self.is_extracted = True
 
-        self.newspaper =
-            newspaper  # True if using newspaper4k, False if using BS
-        )
+        self.newspaper = newspaper
         if self.newspaper is True:
             self.extract_content_newspaper()
         else:
@@ -37,81 +46,70 @@ class URLReader:
 
     def extract_content_newspaper(self):
         """
-        Args:
-            url: The URL of the web page.
-
-        Returns:
-            The extracted content (title, text, images)
+        Extracts content from a URL using the newspaper4k library.
         """
-
         try:
             response = requests.get(self.url)
-            response.raise_for_status()
+            response.raise_for_status()  # Raise HTTPError for bad responses
+
+            news = article(url=self.url, fetch_images=True)
+
+            self.title = news.title
+            self.text = news.text
+            self.images = list(set(news.images))  # Remove duplicates
+            self.top_image = news.top_image
+
         except requests.exceptions.RequestException as e:
             print(f"Error fetching URL: {e}")
             return None
-
-        try:
-            news = article(url=self.url, fetch_images=True)
         except (ArticleException, ArticleBinaryDataException) as e:
             print(f"\t\tβββ Error downloading article: {e}")
             return None
 
-        self.title = news.title
-        self.text = news.text
-        self.images = list(set(news.images))  # Remove duplicates
-        self.top_image = news.top_image
-
     def extract_content_bs(self):
         """
+        Extracts content from a URL using BeautifulSoup.
         """
+        try:
+            response = requests.get(self.url)
+            response.raise_for_status()
 
+            response.encoding = response.apparent_encoding  # Detect encoding
 
-        try:
             soup = BeautifulSoup(response.content, "html.parser")
-        except Exception as e:
-            print(f"Error parsing HTML content from {self.url}: {e}")
-            return None
 
+            self.title = soup.title.string.strip() if soup.title else None
 
+            image_urls = [img["src"] for img in soup.find_all("img")]
+            self.images = image_urls
+            self.top_image = self.images[0]
 
+            # Remove unwanted elements from the HTML
+            for element in soup(
+                ["img", "figcaption", "table", "script", "style"],
+            ):
+                element.extract()
 
-            text = " ".join([p.get_text() for p in paragraphs])
+            paragraphs = soup.find_all("p")
+            self.text = " ".join([p.get_text() for p in paragraphs])
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching URL: {e}")
+            return None
+        except Exception as e:
+            print(f"Error parsing HTML content from {self.url}: {e}")
+            return None
 
     def get_size(self):
         """
         Retrieves the size of a URL's content using a HEAD request.
-
-        Args:
-            url: The URL to check.
-
-        Returns:
-            The size of the content in bytes,
-            or None if the size cannot be determined
-            (e.g., due to network errors or missing Content-Length header).
         """
         try:
             response = requests.head(
                 self.url,
                 allow_redirects=True,
                 timeout=5,
             )
             response.raise_for_status()  # Raise HTTPError for bad responses
 
             content_length = response.headers.get("Content-Length")
@@ -123,7 +121,7 @@ class URLReader:
 
         except requests.exceptions.RequestException as e:
             print(f"\t\tβββ Error getting URL size: {e}")
+            return None
 
 
 if __name__ == "__main__":
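A minimal usage sketch of the refactored class; the example URL and printed fields are illustrative only:

# Hypothetical usage of the refactored URLReader
from src.application.url_reader import URLReader

reader = URLReader("https://example.com/some-article", newspaper=True)
if reader.is_extracted:
    print(reader.title)      # extracted headline
    print(len(reader.text))  # size of the extracted body text
    print(reader.top_image)  # URL of the lead image
else:
    print("Skipped: content size unknown or larger than MAX_URL_SIZE.")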
test.py
CHANGED
@@ -1,3 +1,27 @@
+def postprocess_label(labels: list[str]) -> str:
+    """
+    Creates a label string with the format
+    "Partially generated by [label1] and [label2] and ...".
+    Removes duplicate labels while preserving the original order.
+
+    Args:
+        labels: A list of strings representing labels.
+
+    Returns:
+        A string with the formatted label.
+    """
+    labels = list(set(labels))
+    label = "Partially generated by "
+    if len(labels) == 1:
+        label += labels[0]
+    elif len(labels) == 2:
+        label += f"{labels[0]} and {labels[1]}"
+    else:
+        combination = ", ".join(labels[0 : len(labels) - 1])
+        label += f"{combination}, and {labels[-1]}"
+    return label
+
+
+labels = ["gpt-4o", "gpt-4o-mini", "gpt-4o-l"]
+postprocessed_label = postprocess_label(labels)
+print(postprocessed_label)