pmkhanh7890 committed on
Commit
b73a4fc
·
1 Parent(s): 0827f9d

refactor code and fix bugs

Browse files
application.py CHANGED
@@ -50,8 +50,8 @@ def generate_analysis_report(
50
  ):
51
  news_analysis = NewsVerification()
52
  news_analysis.load_news(news_title, news_content, news_image)
53
- news_analysis.generate_analysis_report()
54
- return news_analysis.analyze_details()
55
 
56
 
57
  # Define the GUI
 
50
  ):
51
  news_analysis = NewsVerification()
52
  news_analysis.load_news(news_title, news_content, news_image)
53
+ news_analysis.determine_origin()
54
+ return news_analysis.generate_report()
55
 
56
 
57
  # Define the GUI
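
For orientation, a minimal sketch (not part of the commit) of how the refactored callback is now wired — the import path and GUI binding are assumptions based on the file layout shown in this diff:

from src.application.content_detection import NewsVerification

def generate_analysis_report(news_title, news_content, news_image):
    news_analysis = NewsVerification()
    news_analysis.load_news(news_title, news_content, news_image)
    news_analysis.determine_origin()        # replaces the old generate_analysis_report()
    return news_analysis.generate_report()  # replaces analyze_details(); returns three HTML tables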
src/application/config.py CHANGED
@@ -1,8 +1,8 @@
1
- # Download necessary NLTK data files
2
  """
3
  Author: Khanh Phan
4
  Date: 2024-12-04
5
  """
 
6
  import os
7
 
8
  import nltk
@@ -22,6 +22,7 @@ AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
22
  # GPT Model
23
  GPT_ENTITY_MODEL = "o1-mini" # "gpt-4o-mini" or "o1-mini"
24
  GPT_PARAPHRASE_MODELS = ["gpt-4o", "gpt-4o-mini"]
 
25
  AZUREOPENAI_CLIENT = openai.AzureOpenAI(
26
  api_version=AZURE_OPENAI_API_VERSION, # AZURE_OPENAI_API_VERSION,
27
  api_key=AZURE_OPENAI_API_KEY,
@@ -54,6 +55,7 @@ MAX_CHAR_SIZE = 30000
54
 
55
  # Number of top URLs per search
56
  TOP_URLS_PER_SEARCH = 3
 
57
 
58
  # Search parameters
59
  GOOGLE_ENDPOINT_URL = "https://www.googleapis.com/customsearch/v1"
@@ -80,5 +82,10 @@ factor = 0: Black.
80
  """
81
  ENTITY_LIGHTEN_COLOR = 2.2
82
  ENTITY_DARKEN_COLOR = 0.7
 
83
  ENTITY_SATURATION = 0.65 # Saturation: color's intensity (vividness).
84
  ENTITY_BRIGHTNESS = 0.75 # color's brightness.
 
 
 
 
 
 
1
  """
2
  Author: Khanh Phan
3
  Date: 2024-12-04
4
  """
5
+
6
  import os
7
 
8
  import nltk
 
22
  # GPT Model
23
  GPT_ENTITY_MODEL = "o1-mini" # "gpt-4o-mini" or "o1-mini"
24
  GPT_PARAPHRASE_MODELS = ["gpt-4o", "gpt-4o-mini"]
25
+ GPT_IMAGE_MODEL = "dall-e-3"
26
  AZUREOPENAI_CLIENT = openai.AzureOpenAI(
27
  api_version=AZURE_OPENAI_API_VERSION, # AZURE_OPENAI_API_VERSION,
28
  api_key=AZURE_OPENAI_API_KEY,
 
55
 
56
  # Number of top URLs per search
57
  TOP_URLS_PER_SEARCH = 3
58
+ MAX_URL_SIZE = 2 * 1024 * 1024 # ~2 MB
59
 
60
  # Search parameters
61
  GOOGLE_ENDPOINT_URL = "https://www.googleapis.com/customsearch/v1"
 
82
  """
83
  ENTITY_LIGHTEN_COLOR = 2.2
84
  ENTITY_DARKEN_COLOR = 0.7
85
+
86
  ENTITY_SATURATION = 0.65 # Saturation: color's intensity (vividness).
87
  ENTITY_BRIGHTNESS = 0.75 # color's brightness.
88
+
89
+
90
+ # HTML formatting
91
+ WORD_BREAK = "word-break: break-all;"
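
Illustration only — the call sites below are hypothetical and not part of this commit; they merely show how the new constants would typically be consumed:

from src.application.config import GPT_IMAGE_MODEL, MAX_URL_SIZE, WORD_BREAK

def is_downloadable(content_length: int) -> bool:
    # Hypothetical guard: skip pages whose payload exceeds ~2 MB.
    return content_length <= MAX_URL_SIZE

image_model = GPT_IMAGE_MODEL  # "dall-e-3", used by generate_fake_image() later in this commit
cell = f'<td style="{WORD_BREAK}">https://example.com/a/very/long/url</td>'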
src/application/content_detection.py CHANGED
@@ -1,7 +1,12 @@
1
- from difflib import SequenceMatcher
 
 
 
2
 
3
  import pandas as pd
4
 
 
 
5
  from src.application.image.image_detection import (
6
  detect_image_by_ai_model,
7
  detect_image_by_reverse_search,
@@ -11,24 +16,27 @@ from src.application.text.entity import (
11
  apply_highlight,
12
  highlight_entities,
13
  )
14
- from src.application.text.helper import extract_equal_text
 
 
 
 
15
  from src.application.text.model_detection import (
16
  detect_text_by_ai_model,
17
  predict_generation_model,
18
  )
19
- from src.application.text.preprocessing import split_into_paragraphs
20
- from src.application.text.search_detection import (
21
- PARAPHRASE_THRESHOLD_MACHINE,
22
- find_sentence_source,
23
- )
24
 
25
 
26
  class NewsVerification:
27
  def __init__(self):
28
- self.news_text = ""
29
- self.news_title = ""
30
- self.news_content = ""
31
- self.news_image = ""
 
 
 
32
 
33
  self.text_prediction_label: list[str] = ["UNKNOWN"]
34
  self.text_prediction_score: list[float] = [0.0]
@@ -37,8 +45,8 @@ class NewsVerification:
37
  self.image_prediction_score: list[str] = [0.0]
38
  self.image_referent_url: list[str] = []
39
 
40
- self.news_prediction_label = ""
41
- self.news_prediction_score = -1
42
 
43
  # news' urls to find img
44
  self.found_img_url: list[str] = []
@@ -52,8 +60,7 @@ class NewsVerification:
52
  "similarity",
53
  "paraphrase",
54
  "url",
55
- "group",
56
- "entities",
57
  ],
58
  )
59
  self.grouped_url_df: pd.DataFrame = pd.DataFrame()
@@ -63,95 +70,165 @@ class NewsVerification:
63
  self.fact_checker_table: list = []
64
  self.governor_table: list = []
65
 
66
- def load_news(self, news_title, news_content, news_image):
67
  self.news_text = (news_title + "\n\n" + news_content).strip()
68
  self.news_title = news_title
69
  self.news_content = news_content
70
  self.news_image = news_image
71
 
72
- def determine_text_origin(self):
73
- self.find_text_source()
 
 
 
74
 
75
- # Group inout and source by url
76
  def concat_text(series):
 
 
 
77
  return " ".join(
78
  series.astype(str).tolist(),
79
  ) # Handle mixed data types and NaNs
80
 
81
- self.grouped_url_df = self.aligned_sentences_df.groupby("url").agg(
82
- {
83
- "input": concat_text,
84
- "source": concat_text,
85
- },
86
- )
87
- self.grouped_url_df = self.grouped_url_df.reset_index()
 
 
 
 
 
88
  # Add new columns for label and score
89
  self.grouped_url_df["label"] = None
90
  self.grouped_url_df["score"] = None
91
 
92
  print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")
93
 
 
 
 
 
94
  for index, row in self.grouped_url_df.iterrows():
 
95
  label, score = self.verify_text(row["url"])
 
 
96
  if label == "UNKNOWN":
97
- # Concatenate text from "input" in sentence_df
98
  text = " ".join(row["input"])
99
 
100
- # detect by baseline model
101
  label, score = detect_text_by_ai_model(text)
102
 
103
  self.grouped_url_df.at[index, "label"] = label
104
  self.grouped_url_df.at[index, "score"] = score
105
 
106
- # Overall label or score for the whole input text
107
- if len(self.grouped_url_df) > 0:
108
  machine_label = self.grouped_url_df[
109
  self.grouped_url_df["label"].str.contains(
110
- "MACHINE",
111
  case=False,
112
  na=False,
113
  )
114
  ]
115
 
116
- if len(machine_label) > 0:
117
- label = " ".join(machine_label["label"].tolist())
 
 
 
 
 
118
  self.text_prediction_label[0] = label
119
  self.text_prediction_score[0] = machine_label["score"].mean()
120
  else:
 
121
  machine_label = self.aligned_sentences_df[
122
  self.aligned_sentences_df["label"] == "HUMAN"
123
  ]
124
  self.text_prediction_label[0] = "HUMAN"
125
  self.text_prediction_score[0] = machine_label["score"].mean()
126
- else: # no source found in the input text
 
127
  print("No source found in the input text")
128
  text = " ".join(self.aligned_sentences_df["input"].tolist())
129
- # detect by baseline model
 
130
  label, score = detect_text_by_ai_model(text)
131
  self.text_prediction_label[0] = label
132
  self.text_prediction_score[0] = score
133
 
134
  def find_text_source(self):
135
  """
136
- Determines the origin of the given text based on paraphrasing detection
137
- and human authorship analysis.
138
-
139
- Args:
140
- text: The input text to be analyzed.
141
 
142
- Returns:
143
- str: The predicted origin of the text:
144
- - "HUMAN": If the text is likely written by a human.
145
- - "MACHINE": If the text is likely generated by a machine.
146
  """
147
  print("CHECK TEXT:")
148
  print("\tFrom search engine:")
149
- # Classify by search engine
150
- # input_sentences = split_into_sentences(self.news_text)
151
  input_paragraphs = split_into_paragraphs(self.news_text)
152
 
153
- # Setup df for input_sentences
154
-
155
  for _ in range(len(input_paragraphs)):
156
  self.aligned_sentences_df = pd.concat(
157
  [
@@ -173,7 +250,7 @@ class NewsVerification:
173
  ignore_index=True,
174
  )
175
 
176
- # find a source for each sentence
177
  for index, _ in enumerate(input_paragraphs):
178
  similarity = self.aligned_sentences_df.loc[index, "similarity"]
179
  if similarity is not None:
@@ -188,23 +265,47 @@ class NewsVerification:
188
  index,
189
  self.aligned_sentences_df,
190
  )
191
-
 
 
 
192
  self.found_img_url.extend(img_urls)
193
 
194
- # determine if the whole source is from a news or not
195
-
196
  def verify_text(self, url):
197
  label = "UNKNOWN"
198
  score = 0
 
199
  # calculate the average similarity when the similarity score
200
  # in each row of sentences_df is higher than 0.8
 
 
201
  filtered_by_url = self.aligned_sentences_df[
202
  self.aligned_sentences_df["url"] == url
203
  ]
 
 
204
  filtered_by_similarity = filtered_by_url[
205
- filtered_by_url["similarity"] > 0.8
206
  ]
207
- if len(filtered_by_similarity) / len(self.aligned_sentences_df) > 0.5:
 
 
208
  # check if "MACHINE" is in self.aligned_sentences_df["label"]:
209
  contains_machine = (
210
  filtered_by_similarity["label"]
@@ -215,8 +316,10 @@ class NewsVerification:
215
  )
216
  .any()
217
  )
 
 
218
  if contains_machine:
219
- label = "MACHINE"
220
  machine_rows = filtered_by_similarity[
221
  filtered_by_similarity["label"].str.contains(
222
  "MACHINE",
@@ -225,9 +328,10 @@ class NewsVerification:
225
  )
226
  ]
227
  generated_model, _ = predict_generation_model(self.news_text)
228
- label += f"<br>({generated_model})"
229
  score = machine_rows["similarity"].mean()
230
  else:
 
231
  label = "HUMAN"
232
  human_rows = filtered_by_similarity[
233
  filtered_by_similarity["label"].str.contains(
@@ -241,13 +345,26 @@ class NewsVerification:
241
  return label, score
242
 
243
  def determine_image_origin(self):
244
  print("CHECK IMAGE:")
 
 
245
  if self.news_image is None:
246
  self.image_prediction_label = "UNKNOWN"
247
  self.image_prediction_score = 0.0
248
  self.image_referent_url = None
249
  return
250
 
 
 
251
  matched_url, similarity = detect_image_from_news_image(
252
  self.news_image,
253
  self.found_img_url,
@@ -259,6 +376,8 @@ class NewsVerification:
259
  self.image_referent_url = matched_url
260
  return
261
 
 
 
262
  matched_url, similarity = detect_image_by_reverse_search(
263
  self.news_image,
264
  )
@@ -269,6 +388,8 @@ class NewsVerification:
269
  self.image_referent_url = matched_url
270
  return
271
 
 
 
272
  detected_label, score = detect_image_by_ai_model(self.news_image)
273
  if detected_label:
274
  print(f"detected_label: {detected_label} ({score})")
@@ -277,18 +398,34 @@ class NewsVerification:
277
  self.image_referent_url = None
278
  return
279
 
 
280
  self.image_prediction_label = "UNKNOWN"
281
  self.image_prediction_score = 50
282
  self.image_referent_url = None
283
 
284
- def generate_analysis_report(self):
 
 
 
285
  if self.news_text != "":
286
  self.determine_text_origin()
287
  if self.news_image != "":
288
  self.determine_image_origin()
289
-
290
- def analyze_details(self):
291
  self.handle_entities()
292
  ordinary_user_table = self.create_ordinary_user_table()
293
  fact_checker_table = self.create_fact_checker_table()
294
  governor_table = self.create_governor_table()
@@ -296,6 +433,16 @@ class NewsVerification:
296
  return ordinary_user_table, fact_checker_table, governor_table
297
 
298
  def handle_entities(self):
299
  entities_with_colors = []
300
  for index, row in self.grouped_url_df.iterrows():
301
  # Get entity-words (in pair) with colors
@@ -304,51 +451,23 @@ class NewsVerification:
304
  row["source"],
305
  )
306
 
 
 
307
  for index, sentence in self.aligned_sentences_df.iterrows():
308
  if sentence["url"] == row["url"]:
 
309
  self.aligned_sentences_df.at[index, "entities"] = (
310
- entities_with_colors # must use at
311
  )
312
 
313
- def get_text_urls(self):
314
- return set(self.text_referent_url)
315
-
316
- def compare_sentences(self, sentence_1, sentence_2, position, color):
317
  """
318
- Compares two sentences and identifies common phrases,
319
- outputting their start and end positions.
320
 
 
 
321
  """
322
-
323
- if not sentence_1 or not sentence_2: # Handle empty strings
324
- return []
325
-
326
- s = SequenceMatcher(None, sentence_1, sentence_2)
327
- common_phrases = []
328
-
329
- for block in s.get_matching_blocks():
330
- if block.size > 0: # Ignore zero-length matches
331
- start_1 = block.a
332
- end_1 = block.a + block.size
333
- start_2 = block.b
334
- end_2 = block.b + block.size
335
-
336
- phrase = sentence_1[
337
- start_1:end_1
338
- ] # Or sentence_2[start_2:end_2], they are the same
339
-
340
- common_phrases.append(
341
- {
342
- "phrase": phrase,
343
- "start_1": start_1 + position,
344
- "end_1": end_1 + position,
345
- "start_2": start_2,
346
- "end_2": end_2,
347
- "color": color,
348
- },
349
- )
350
- position += len(sentence_1)
351
- return common_phrases, position
352
 
353
  def create_fact_checker_table(self):
354
  rows = []
@@ -387,7 +506,7 @@ class NewsVerification:
387
  if index == 0 or current_url != previous_url:
388
  first_url_row = True
389
  previous_url = current_url
390
- # Increase counter "span_row" when the next url is the same
391
  while (
392
  index + span_row < len(self.fact_checker_table)
393
  and self.fact_checker_table[index + span_row][4]
@@ -432,7 +551,7 @@ class NewsVerification:
432
  </table>
433
 
434
  <style>
435
- """
436
 
437
  def format_text_fact_checker_row(
438
  self,
@@ -467,12 +586,12 @@ class NewsVerification:
467
  entity_count = len(row[3])
468
 
469
  # Color overlapping words
470
- input_sentence = self.color_text(
471
  input_sentence,
472
  row[1],
473
  highlight_idx_input,
474
  ) # text, index of highlight words
475
- source_sentence = self.color_text(
476
  source_sentence,
477
  row[2],
478
  highlight_idx_source,
@@ -493,6 +612,7 @@ class NewsVerification:
493
  source_sentence = row[0]["source"]
494
 
495
  url = row[0]["url"]
 
496
  # Displayed label and score by url
497
  filterby_url = self.grouped_url_df[self.grouped_url_df["url"] == url]
498
  if len(filterby_url) > 0:
@@ -506,7 +626,7 @@ class NewsVerification:
506
  source_text_url = f"""<a href="{url}">{url}</a>"""
507
 
508
  # Format displayed entity count
509
- entity_count_text = self.get_entity_count_text(entity_count)
510
 
511
  border_top = "border-top: 1px solid transparent;"
512
  border_bottom = "border-bottom: 1px solid transparent;"
@@ -600,7 +720,7 @@ class NewsVerification:
600
  <style>
601
  """
602
 
603
- def format_text_ordinary_user_row(self, max_length=30):
604
  input_sentences = ""
605
  source_text_urls = ""
606
  urls = []
@@ -623,7 +743,7 @@ class NewsVerification:
623
  </tr>
624
  """
625
 
626
- def format_image_ordinary_user_row(self, max_length=30):
627
 
628
  if (
629
  self.image_referent_url is not None
@@ -720,12 +840,12 @@ class NewsVerification:
720
  )
721
 
722
  # Color overlapping words
723
- input_sentence = self.color_text(
724
  input_sentence,
725
  row[1],
726
  highlight_idx_input,
727
  ) # text, index of highlight words
728
- source_sentence = self.color_text(
729
  source_sentence,
730
  row[2],
731
  highlight_idx_source,
@@ -759,7 +879,7 @@ class NewsVerification:
759
  if row[3] is not None:
760
  entity_count.append(len(row[3]))
761
 
762
- entity_count_text = self.get_entity_count_text(sum(entity_count))
763
  word_break = "word-break: break-all;"
764
  return f"""
765
  <tr>
@@ -785,155 +905,3 @@ class NewsVerification:
785
 
786
  word_break = "word-break: break-all;"
787
  return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td style="{word_break}";>{source_image_url}</td></tr>""" # noqa: E501
788
-
789
- def get_entity_count_text(self, entity_count):
790
- if entity_count <= 0:
791
- entity_count_text = ""
792
- elif entity_count == 1:
793
- entity_count_text = "with 1 altered entity"
794
- else:
795
- entity_count_text = "with altered entities"
796
- return entity_count_text
797
-
798
- def color_text(self, text, colored_idx, highlighted_idx):
799
- sentence = ""
800
- words = text.split()
801
-
802
- starts, ends = self.extract_starts_ends(colored_idx)
803
- starts, ends = self.filter_indices(starts, ends, highlighted_idx)
804
-
805
- previous_end = 0
806
- for start, end in zip(starts, ends):
807
- sentence += " ".join(words[previous_end:start])
808
-
809
- equal_words = " ".join(words[start:end])
810
- sentence += f" <span style='color:#00FF00;'>{equal_words}</span> "
811
-
812
- previous_end = end
813
-
814
- sentence += " ".join(words[previous_end:])
815
-
816
- return sentence
817
-
818
- def extract_starts_ends(self, colored_idx):
819
- starts = []
820
- ends = []
821
- for index in colored_idx:
822
- starts.append(index["start"])
823
- ends.append(index["end"])
824
- return starts, ends
825
-
826
- def filter_indices(self, starts, ends, ignore_indices):
827
- """
828
- Filters start and end indices to exclude any indices present in the
829
- ignore_indices list.
830
-
831
- Args:
832
- starts: A list of starting indices.
833
- ends: A list of ending indices. Must be the same length as starts.
834
- ignore_indices: A list of indices to exclude.
835
-
836
- Returns:
837
- A tuple of two lists: filtered_starts and filtered_ends.
838
- Returns empty lists if the input is invalid
839
- or if all ranges are filtered out.
840
- Prints error messages for invalid input.
841
-
842
- Examples:
843
- starts = [0, 5, 10]
844
- ends = [3, 7, 12] # words at the end will not be colored.
845
- ignore_indices = [1, 2, 12, 17]
846
-
847
- # Output:
848
- starts = [0, 3, 5, 10]
849
- ends = [1, 4, 7, 12]
850
-
851
- """
852
-
853
- if len(starts) != len(ends):
854
- print(
855
- "Error: The 'starts' and 'ends' lists must have the same length.", # noqa: E501
856
- )
857
- return [], []
858
-
859
- filtered_starts = []
860
- filtered_ends = []
861
-
862
- for i in range(len(starts)):
863
- start = starts[i]
864
- end = ends[i]
865
-
866
- if end < start:
867
- print(
868
- f"Error: End index {end} is less than start index {start} at position {i}.", # noqa: E501
869
- )
870
- return [], []
871
-
872
- start_end = list(range(start, end + 1, 1))
873
- start_end = list(set(start_end) - set(ignore_indices))
874
- # new_start, new_end = self.extract_sequences(start_end)
875
- new_start, new_end = self.extract_new_startend(
876
- start,
877
- end,
878
- ignore_indices,
879
- )
880
- filtered_starts.extend(new_start)
881
- filtered_ends.extend(new_end)
882
-
883
- return filtered_starts, filtered_ends
884
-
885
- def extract_new_startend(self, start, end, ignore_indices):
886
- # sort a set of ignore_indices
887
- indexes = list(set(ignore_indices))
888
- indexes.sort()
889
-
890
- new_starts = []
891
- new_ends = []
892
- new_start = start
893
- if indexes is None or len(indexes) < 1:
894
- new_starts.append(start)
895
- new_ends.append(end)
896
- return new_starts, new_ends
897
-
898
- for index in indexes:
899
- if index < start:
900
- continue
901
- elif index >= end:
902
- continue
903
-
904
- new_starts.append(new_start)
905
- new_ends.append(index)
906
-
907
- new_start = index + 1
908
-
909
- new_starts.append(new_start)
910
- new_ends.append(end)
911
-
912
- return new_starts, new_ends
913
-
914
- def extract_sequences(self, numbers):
915
- if len(numbers) == 1:
916
- return [numbers[0]], [numbers[0]]
917
-
918
- numbers.sort()
919
- starts = []
920
- ends = []
921
- for i, number in enumerate(numbers):
922
- if i == 0:
923
- start = number
924
- end = number
925
- continue
926
-
927
- if number - 1 == numbers[i - 1]:
928
- end = number
929
- else:
930
- starts.append(start)
931
- ends.append(end)
932
- start = number
933
- end = number
934
-
935
- if i == len(numbers) - 1:
936
- starts.append(start)
937
- ends.append(end)
938
-
939
- return starts, ends
 
1
+ """
2
+ Author: Khanh Phan
3
+ Date: 2024-12-04
4
+ """
5
 
6
  import pandas as pd
7
 
8
+ from src.application.config import MIN_RATIO_PARAPHRASE_NUM, PARAPHRASE_THRESHOLD, PARAPHRASE_THRESHOLD_MACHINE
9
+ from src.application.formatting import color_text, format_entity_count
10
  from src.application.image.image_detection import (
11
  detect_image_by_ai_model,
12
  detect_image_by_reverse_search,
 
16
  apply_highlight,
17
  highlight_entities,
18
  )
19
+ from src.application.text.helper import (
20
+ extract_equal_text,
21
+ postprocess_label,
22
+ split_into_paragraphs,
23
+ )
24
  from src.application.text.model_detection import (
25
  detect_text_by_ai_model,
26
  predict_generation_model,
27
  )
28
+ from src.application.text.search_detection import find_sentence_source
 
 
 
 
29
 
30
 
31
  class NewsVerification:
32
  def __init__(self):
33
+ """
34
+ Initializes the NewsVerification object.
35
+ """
36
+ self.news_text: str = ""
37
+ self.news_title: str = ""
38
+ self.news_content: str = ""
39
+ self.news_image: str = ""
40
 
41
  self.text_prediction_label: list[str] = ["UNKNOWN"]
42
  self.text_prediction_score: list[float] = [0.0]
 
45
  self.image_prediction_score: list[str] = [0.0]
46
  self.image_referent_url: list[str] = []
47
 
48
+ self.news_prediction_label: str = ""
49
+ self.news_prediction_score: float = -1
50
 
51
  # news' urls to find img
52
  self.found_img_url: list[str] = []
 
60
  "similarity",
61
  "paraphrase",
62
  "url",
63
+ # "entities",
 
64
  ],
65
  )
66
  self.grouped_url_df: pd.DataFrame = pd.DataFrame()
 
70
  self.fact_checker_table: list = []
71
  self.governor_table: list = []
72
 
73
+ def load_news(self, news_title: str, news_content: str, news_image: str):
74
+ """
75
+ Loads news data into the object's attributes.
76
+
77
+ Args:
78
+ news_title (str): The title of the news article.
79
+ news_content (str): The content of the news article.
80
+ news_image (str): The URL of the image in the news article.
81
+ """
82
+ # Combine title and content for a full text representation.
83
+ # .strip() removes leading/trailing whitespace for cleaner text.
84
  self.news_text = (news_title + "\n\n" + news_content).strip()
85
+
86
+ # if not isinstance(news_title, str) or not isinstance(
87
+ # news_content,
88
+ # str,
89
+ # ):
90
+ # raise TypeError("News title and content must be strings.")
91
+
92
+ # if not isinstance(news_image, str) or news_image is not None:
93
+ # Warning("News image must be a string.")
94
+
95
  self.news_title = news_title
96
  self.news_content = news_content
97
  self.news_image = news_image
98
 
99
+ def group_by_url(self):
100
+ """
101
+ Groups aligned sentences by URL
102
+ Then concatenates the 'input' and 'source' text for each group.
103
+ """
104
 
 
105
  def concat_text(series):
106
+ """
107
+ Concatenates the elements of a pd.Series into a single string.
108
+ """
109
  return " ".join(
110
  series.astype(str).tolist(),
111
  ) # Handle mixed data types and NaNs
112
 
113
+ # Group sentences by URL and concatenate 'input' and 'source' text.
114
+ self.grouped_url_df = (
115
+ self.aligned_sentences_df.groupby("url")
116
+ .agg(
117
+ {
118
+ "input": concat_text,
119
+ "source": concat_text,
120
+ },
121
+ )
122
+ .reset_index()
123
+ ) # Reset index to make 'url' a regular column
124
+
125
  # Add new columns for label and score
126
  self.grouped_url_df["label"] = None
127
  self.grouped_url_df["score"] = None
128
 
129
  print(f"aligned_sentences_df:\n {self.aligned_sentences_df}")
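
For readers unfamiliar with the pandas idiom used in group_by_url, here is the same groupby/agg pattern on a toy DataFrame (synthetic data, not from the application):

import pandas as pd

toy = pd.DataFrame({
    "url": ["a.com", "a.com", "b.com"],
    "input": ["sent 1", "sent 2", "sent 3"],
    "source": ["src 1", "src 2", "src 3"],
})
joiner = lambda s: " ".join(s.astype(str).tolist())
grouped = toy.groupby("url").agg({"input": joiner, "source": joiner}).reset_index()
# grouped["input"] -> ["sent 1 sent 2", "sent 3"]; one row per URL.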
130
 
131
+ def determine_text_origin_by_url(self):
132
+ """
133
+ Determines the text origin for each URL group.
134
+ """
135
  for index, row in self.grouped_url_df.iterrows():
136
+ # Verify text origin using URL-based verification.
137
  label, score = self.verify_text(row["url"])
138
+
139
+ # If URL-based verification returns 'UNKNOWN', use AI detection
140
  if label == "UNKNOWN":
141
+ # Concatenate text from "input" column in sentence_df
142
  text = " ".join(row["input"])
143
 
144
+ # Detect text origin using an AI model.
145
  label, score = detect_text_by_ai_model(text)
146
 
147
  self.grouped_url_df.at[index, "label"] = label
148
  self.grouped_url_df.at[index, "score"] = score
149
 
150
+ def determine_text_origin(self):
151
+ """
152
+ Determines the origin of the input text by analyzing
153
+ its sources and applying AI detection models.
154
+
155
+ This method groups sentences by their source URLs,
156
+ applies verification and AI detection, and then determines
157
+ an overall label and score for the input text.
158
+ """
159
+ # Find the text URLs associated with the input text
160
+ self.find_text_source()
161
+
162
+ # Group sentences by URL and concatenate 'input' and 'source' text.
163
+ self.group_by_url()
164
+
165
+ # Determine the text origin for each URL group
166
+ self.determine_text_origin_by_url()
167
+
168
+ # Determine the overall label and score for the entire input text.
169
+ if not self.grouped_url_df.empty:
170
+ # Check for 'gpt-4o' labels in the grouped URLs.
171
  machine_label = self.grouped_url_df[
172
  self.grouped_url_df["label"].str.contains(
173
+ "gpt-4o",
174
  case=False,
175
  na=False,
176
  )
177
  ]
178
 
179
+ if not machine_label.empty:
180
+ # If 'gpt-4o' labels are found, post-process and assign.
181
+ labels = machine_label["label"].tolist()
182
+ label = postprocess_label(labels)
183
+
184
+ # labels = " and ".join(machine_label["label"].tolist())
185
+ # label = remove_duplicate_words(label)
186
  self.text_prediction_label[0] = label
187
  self.text_prediction_score[0] = machine_label["score"].mean()
188
  else:
189
+ # If no 'gpt-4o' labels, fall back to the 'HUMAN' label.
190
  machine_label = self.aligned_sentences_df[
191
  self.aligned_sentences_df["label"] == "HUMAN"
192
  ]
193
  self.text_prediction_label[0] = "HUMAN"
194
  self.text_prediction_score[0] = machine_label["score"].mean()
195
+ else:
196
+ # If no URLs were found, use AI detection on the entire input text.
197
  print("No source found in the input text")
198
  text = " ".join(self.aligned_sentences_df["input"].tolist())
199
+
200
+ # Detect text origin using an AI model.
201
  label, score = detect_text_by_ai_model(text)
202
  self.text_prediction_label[0] = label
203
  self.text_prediction_score[0] = score
204
 
205
  def find_text_source(self):
206
  """
207
+ Determines the origin of the given text based on paraphrasing
208
+ detection and human authorship analysis.
 
 
 
209
 
210
+ 1. Splits the input news text into paragraphs,
211
+ 2. Searches for sources for each sentence
212
+ 3. Updates the aligned_sentences_df with the found sources.
 
213
  """
214
  print("CHECK TEXT:")
215
  print("\tFrom search engine:")
216
+
 
217
  input_paragraphs = split_into_paragraphs(self.news_text)
218
+
219
+ # Initialize an empty DataFrame if it doesn't exist, otherwise extend it.
220
+ if not hasattr(self, 'aligned_sentences_df') or self.aligned_sentences_df is None:
221
+ self.aligned_sentences_df = pd.DataFrame(columns=[
222
+ "input",
223
+ "source",
224
+ "label",
225
+ "similarity",
226
+ "paraphrase",
227
+ "url",
228
+ "entities",
229
+ ])
230
 
231
+ # Setup DataFrame for input_sentences
 
232
  for _ in range(len(input_paragraphs)):
233
  self.aligned_sentences_df = pd.concat(
234
  [
 
250
  ignore_index=True,
251
  )
252
 
253
+ # Find a source for each sentence
254
  for index, _ in enumerate(input_paragraphs):
255
  similarity = self.aligned_sentences_df.loc[index, "similarity"]
256
  if similarity is not None:
 
265
  index,
266
  self.aligned_sentences_df,
267
  )
268
+
269
+ # Initialize found_img_url if it does not exist.
270
+ if not hasattr(self, 'found_img_url'):
271
+ self.found_img_url = []
272
  self.found_img_url.extend(img_urls)
273
 
 
 
274
  def verify_text(self, url):
275
+ """
276
+ Verifies the text origin based on similarity scores and labels
277
+ associated with a given URL.
278
+
279
+ 1. Filters sentences by URL and similarity score,
280
+ 2. Determines if the text is likely generated by a machine or a human.
281
+ 3. Calculates an average similarity score.
282
+
283
+ Args:
284
+ url (str): The URL to filter sentences by.
285
+
286
+ Returns:
287
+ tuple: A tuple containing:
288
+ - Label ("MACHINE", "HUMAN", or "UNKNOWN")
289
+ - Score
290
+ """
291
  label = "UNKNOWN"
292
  score = 0
293
+
294
  # calculate the average similarity when the similarity score
295
  # in each row of sentences_df is higher than 0.8
296
+
297
+ # Filter sentences by URL.
298
  filtered_by_url = self.aligned_sentences_df[
299
  self.aligned_sentences_df["url"] == url
300
  ]
301
+
302
+ # Filter sentences by similarity score (> PARAPHRASE_THRESHOLD).
303
  filtered_by_similarity = filtered_by_url[
304
+ filtered_by_url["similarity"] > PARAPHRASE_THRESHOLD
305
  ]
306
+
307
+ # Check whether the ratio of remaining filtered sentences exceeds MIN_RATIO_PARAPHRASE_NUM.
308
+ if len(filtered_by_similarity) / len(self.aligned_sentences_df) > MIN_RATIO_PARAPHRASE_NUM:
309
  # check if "MACHINE" is in self.aligned_sentences_df["label"]:
310
  contains_machine = (
311
  filtered_by_similarity["label"]
 
316
  )
317
  .any()
318
  )
319
+
320
+ # TODO: integrate with determine_text_origin
321
  if contains_machine:
322
+ # If "MACHINE" label is present, set label and calculate score.
323
  machine_rows = filtered_by_similarity[
324
  filtered_by_similarity["label"].str.contains(
325
  "MACHINE",
 
328
  )
329
  ]
330
  generated_model, _ = predict_generation_model(self.news_text)
331
+ label = f"Partially generated by {generated_model}"
332
  score = machine_rows["similarity"].mean()
333
  else:
334
+ # If no "MACHINE" label, assign "HUMAN" label and calculate score.
335
  label = "HUMAN"
336
  human_rows = filtered_by_similarity[
337
  filtered_by_similarity["label"].str.contains(
 
345
  return label, score
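
For intuition, a worked instance of the ratio check: assuming PARAPHRASE_THRESHOLD = 0.8 and MIN_RATIO_PARAPHRASE_NUM = 0.5 (the config values are not shown in this diff), a URL that matches 6 of 10 aligned sentences with similarity above 0.8 gives 6/10 = 0.6 > 0.5, so the label and mean similarity are taken from those rows; below that ratio, verify_text returns ("UNKNOWN", 0) and detect_text_by_ai_model is used instead.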
346
 
347
  def determine_image_origin(self):
348
+ """
349
+ Determines the origin of the news image using various detection methods.
350
+
351
+ 1. Matching against previously found image URLs.
352
+ 2. Reverse image search.
353
+ 3. AI-based image detection.
354
+
355
+ If none of these methods succeed, the image origin is marked as "UNKNOWN".
356
+ """
357
  print("CHECK IMAGE:")
358
+
359
+ # Handle the case where no image is provided.
360
  if self.news_image is None:
361
  self.image_prediction_label = "UNKNOWN"
362
  self.image_prediction_score = 0.0
363
  self.image_referent_url = None
364
  return
365
 
366
+ # Attempt to match the image against previously found image URLs.
367
+ print("\tFrom found image URLs...")
368
  matched_url, similarity = detect_image_from_news_image(
369
  self.news_image,
370
  self.found_img_url,
 
376
  self.image_referent_url = matched_url
377
  return
378
 
379
+ # Attempt to find the image origin using reverse image search.
380
+ print("\tFrom reverse image search...")
381
  matched_url, similarity = detect_image_by_reverse_search(
382
  self.news_image,
383
  )
 
388
  self.image_referent_url = matched_url
389
  return
390
 
391
+ # Attempt to detect the image origin using an AI model.
392
+ print("\tFrom an AI model...")
393
  detected_label, score = detect_image_by_ai_model(self.news_image)
394
  if detected_label:
395
  print(f"detected_label: {detected_label} ({score})")
 
398
  self.image_referent_url = None
399
  return
400
 
401
+ # If all detection methods fail, mark the image origin as "UNKNOWN".
402
  self.image_prediction_label = "UNKNOWN"
403
  self.image_prediction_score = 50
404
  self.image_referent_url = None
405
 
406
+ def determine_origin(self):
407
+ """
408
+ Determines the origin of the news by analyzing its text and image.
409
+ """
410
  if self.news_text != "":
411
  self.determine_text_origin()
412
  if self.news_image != "":
413
  self.determine_image_origin()
414
+
415
+ # Handle entity recognition and processing.
416
  self.handle_entities()
417
+
418
+ def generate_report(self) -> tuple[str, str, str]:
419
+ """
420
+ Generates reports tailored for different user roles
421
+ (ordinary users, fact checkers, governors).
422
+
423
+ Returns:
424
+ tuple: A tuple containing three HTML-formatted reports:
425
+ - ordinary_user_table: Report for ordinary users.
426
+ - fact_checker_table: Report for fact checkers.
427
+ - governor_table: Report for governors.
428
+ """
429
  ordinary_user_table = self.create_ordinary_user_table()
430
  fact_checker_table = self.create_fact_checker_table()
431
  governor_table = self.create_governor_table()
 
433
  return ordinary_user_table, fact_checker_table, governor_table
434
 
435
  def handle_entities(self):
436
+ """
437
+ Highlights and assigns entities with colors to aligned sentences
438
+ based on grouped URLs.
439
+
440
+ For each grouped URL:
441
+ 1. Highlights entities in the input and source text
442
+ 2. Then assigns these highlighted entities to the corresponding
443
+ sentences in the aligned sentences DataFrame.
444
+ """
445
+
446
  entities_with_colors = []
447
  for index, row in self.grouped_url_df.iterrows():
448
  # Get entity-words (in pair) with colors
 
451
  row["source"],
452
  )
453
 
454
+ # Assign the highlighted entities to the corresponding sentences
455
+ # in aligned_sentences_df.
456
  for index, sentence in self.aligned_sentences_df.iterrows():
457
  if sentence["url"] == row["url"]:
458
+ # Use .at to modify the DataFrame efficiently.
459
  self.aligned_sentences_df.at[index, "entities"] = (
460
+ entities_with_colors
461
  )
462
 
463
+ def get_text_urls(self) -> set:
 
 
 
464
  """
465
+ Returns a set of unique URLs referenced in the text analysis.
 
466
 
467
+ Returns:
468
+ set: A set containing the unique URLs referenced in the text.
469
  """
470
+ return set(self.text_referent_url)
471
 
472
  def create_fact_checker_table(self):
473
  rows = []
 
506
  if index == 0 or current_url != previous_url:
507
  first_url_row = True
508
  previous_url = current_url
509
+ # Increase counter "span_row" when the next url is the same
510
  while (
511
  index + span_row < len(self.fact_checker_table)
512
  and self.fact_checker_table[index + span_row][4]
 
551
  </table>
552
 
553
  <style>
554
+ """
555
 
556
  def format_text_fact_checker_row(
557
  self,
 
586
  entity_count = len(row[3])
587
 
588
  # Color overlapping words
589
+ input_sentence = color_text(
590
  input_sentence,
591
  row[1],
592
  highlight_idx_input,
593
  ) # text, index of highlight words
594
+ source_sentence = color_text(
595
  source_sentence,
596
  row[2],
597
  highlight_idx_source,
 
612
  source_sentence = row[0]["source"]
613
 
614
  url = row[0]["url"]
615
+
616
  # Displayed label and score by url
617
  filterby_url = self.grouped_url_df[self.grouped_url_df["url"] == url]
618
  if len(filterby_url) > 0:
 
626
  source_text_url = f"""<a href="{url}">{url}</a>"""
627
 
628
  # Format displayed entity count
629
+ entity_count_text = format_entity_count(entity_count)
630
 
631
  border_top = "border-top: 1px solid transparent;"
632
  border_bottom = "border-bottom: 1px solid transparent;"
 
720
  <style>
721
  """
722
 
723
+ def format_text_ordinary_user_row(self):
724
  input_sentences = ""
725
  source_text_urls = ""
726
  urls = []
 
743
  </tr>
744
  """
745
 
746
+ def format_image_ordinary_user_row(self):
747
 
748
  if (
749
  self.image_referent_url is not None
 
840
  )
841
 
842
  # Color overlapping words
843
+ input_sentence = color_text(
844
  input_sentence,
845
  row[1],
846
  highlight_idx_input,
847
  ) # text, index of highlight words
848
+ source_sentence = color_text(
849
  source_sentence,
850
  row[2],
851
  highlight_idx_source,
 
879
  if row[3] is not None:
880
  entity_count.append(len(row[3]))
881
 
882
+ entity_count_text = format_entity_count(sum(entity_count))
883
  word_break = "word-break: break-all;"
884
  return f"""
885
  <tr>
 
905
 
906
  word_break = "word-break: break-all;"
907
  return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td style="{word_break}";>{source_image_url}</td></tr>""" # noqa: E501
 
src/application/content_generation.py CHANGED
@@ -1,30 +1,39 @@
1
  import json
2
- import os
3
 
4
  import openai
5
- from dotenv import load_dotenv
6
 
7
- load_dotenv()
8
- AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
9
- AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
10
- AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
11
-
12
- client = openai.AzureOpenAI(
13
- api_version=AZURE_OPENAI_API_VERSION,
14
- api_key=AZURE_OPENAI_API_KEY,
15
- azure_endpoint=AZURE_OPENAI_ENDPOINT,
16
  )
17
 
18
 
19
- def generate_fake_text(text_generation_model, title, content):
20
  # Generate text using the selected models
21
  prompt = """Generate a random fake news title in this format:
22
- ---
23
- # Title: [Fake Title]
24
- # Content:
25
- [Fake Content]
26
- ---
27
- """
28
  if title and content:
29
  prompt += """based on the following context:
30
  # Title: {news_title}:\n# Content: {news_content}"""
@@ -38,7 +47,7 @@ def generate_fake_text(text_generation_model, title, content):
38
  # Generate text using the text generation model
39
  # Generate text using the selected model
40
  try:
41
- response = client.chat.completions.create(
42
  model=text_generation_model,
43
  messages=[{"role": "system", "content": prompt}],
44
  )
@@ -58,67 +67,92 @@ def generate_fake_text(text_generation_model, title, content):
58
  return fake_title, fake_content
59
 
60
 
61
- def extract_title_content(fake_news):
62
  """
63
- Extracts the title and content from the generated fake news string.
64
-
65
- This function parses a string containing fake news, which is expected
66
- to have a specific format with a title and content section marked by
67
- '# Title:' and '# Content:' respectively.
68
 
69
  Args:
70
- fake_news (str): A string containing the generated fake news.
71
 
72
  Returns:
73
- tuple: A tuple containing two elements:
74
- - title (str): The extracted title of the fake news.
75
- - content (str): The extracted content of the fake news.
76
-
77
- Note:
78
- The function assumes that the input string follows the expected format.
79
- If the format is not as expected, it may return unexpected results.
80
  """
81
- # Extract the title and content from the generated fake news
82
- title_start_index = fake_news.find("# Title: ") + len("# Title: ")
83
- title_end_index = fake_news.find("\n", title_start_index)
84
- title = fake_news[title_start_index:title_end_index].strip()
85
 
86
- content_start_index = fake_news.find("\n# Content: ") + len(
87
- "\n# Content: ",
88
- )
89
- content = fake_news[content_start_index:].strip()
90
 
91
  return title, content
92
 
93
 
94
- def generate_fake_image(model, title):
95
- if len(title) > 0:
96
- IMAGE_PROMPT = f"Generate a random image about {title}"
97
- else:
98
- IMAGE_PROMPT = "Generate a random image"
99
- result = client.images.generate(
100
- model="dall-e-3", # the name of your DALL-E 3 deployment
101
- prompt=IMAGE_PROMPT,
102
- n=1,
103
- )
104
- image_url = json.loads(result.model_dump_json())["data"][0]["url"]
105
- return image_url
106
 
 
 
 
107
 
108
- def replace_text(news_title, news_content, replace_df):
 
109
  """
110
111
 
112
  Args:
113
- text: The input text.
114
- replace_df: A DF with 2 columns: "find_what" & "replace_with".
 
 
115
 
116
  Returns:
117
- The text after all replacements have been made.
118
  """
119
  for _, row in replace_df.iterrows():
120
  find_what = row["Find what:"]
121
  replace_with = row["Replace with:"]
122
  news_content = news_content.replace(find_what, replace_with)
123
  news_title = news_title.replace(find_what, replace_with)
 
124
  return news_title, news_content
 
1
  import json
 
2
 
3
  import openai
4
+ import pandas as pd
5
 
6
+ from src.application.config import (
7
+ AZUREOPENAI_CLIENT,
8
+ GPT_IMAGE_MODEL,
 
 
 
 
 
 
9
  )
10
 
11
 
12
+ def generate_fake_text(
13
+ text_generation_model: str,
14
+ title: str = None,
15
+ content: str = None,
16
+ ) -> tuple[str, str]:
17
+ """
18
+ Generates fake news title and content using an Azure OpenAI model.
19
+
20
+ Args:
21
+ text_generation_model: The name of the Azure OpenAI model to use.
22
+ title: Optional title to use as context for fake text generation.
23
+ content: Optional content to use as context for fake text generation.
24
+
25
+ Returns:
26
+ A tuple containing the generated fake title and content (both strings).
27
+ Returns empty strings if generation fails.
28
+ """
29
  # Generate text using the selected models
30
  prompt = """Generate a random fake news title in this format:
31
+ ---
32
+ # Title: [Fake Title]
33
+ # Content:
34
+ [Fake Content]
35
+ ---
36
+ """
37
  if title and content:
38
  prompt += """based on the following context:
39
  # Title: {news_title}:\n# Content: {news_content}"""
 
47
  # Generate text using the text generation model
48
  # Generate text using the selected model
49
  try:
50
+ response = AZUREOPENAI_CLIENT.chat.completions.create(
51
  model=text_generation_model,
52
  messages=[{"role": "system", "content": prompt}],
53
  )
 
67
  return fake_title, fake_content
68
 
69
 
70
+ def extract_title_content(fake_news: str) -> tuple[str, str]:
71
  """
72
+ Extracts the title and content from the generated fake text.
 
 
 
 
73
 
74
  Args:
75
+ fake_news: The generated fake text string.
76
 
77
  Returns:
78
+ A tuple containing the extracted title and content.
 
 
 
 
 
 
79
  """
80
+ title = ""
81
+ content = ""
 
 
82
 
83
+ try:
84
+ # Extract the title and content from the generated fake news
85
+ title_start = fake_news.find("# Title: ") + len("# Title: ")
86
+ title_end = fake_news.find("\n", title_start)
87
+ if title_start != -1 and title_end != -1:
88
+ title = fake_news[title_start:title_end].strip()
89
+
90
+ title_start = fake_news.find("\n# Content: ") + len(
91
+ "\n# Content: ",
92
+ )
93
+ content = fake_news[title_start:].strip()
94
+ except Exception as e:
95
+ print(f"Error extracting title and content: {e}")
96
 
97
  return title, content
98
 
99
 
100
+ def generate_fake_image(
101
+ title: str,
102
+ model: str = GPT_IMAGE_MODEL,
103
+ ) -> str | None:
104
+ """
105
+ Generates a fake image URL using Azure OpenAI's image generation API.
 
 
 
 
 
 
106
 
107
+ Args:
108
+ title: The title to use as a prompt for image generation.
109
+ model: The name of the Azure OpenAI image generation model to use.
110
 
111
+ Returns:
112
+ The URL of the generated image, or None if an error occurs.
113
  """
114
+ try:
115
+ if title:
116
+ image_prompt = f"Generate a random image about {title}"
117
+ else:
118
+ image_prompt = "Generate a random image"
119
+
120
+ result = AZUREOPENAI_CLIENT.images.generate(
121
+ model=model,
122
+ prompt=image_prompt,
123
+ n=1,
124
+ )
125
+
126
+ image_url = json.loads(result.model_dump_json())["data"][0]["url"]
127
+ return image_url
128
+
129
+ except Exception as e:
130
+ print(f"Error generating fake image: {e}")
131
+ return None # Return None if an error occurs
132
+
133
+
134
+ def replace_text(
135
+ news_title: str,
136
+ news_content: str,
137
+ replace_df: pd.DataFrame,
138
+ ) -> tuple[str, str]:
139
+ """
140
+ Replaces occurrences in the input title and content
141
+ based on the provided DataFrame.
142
 
143
  Args:
144
+ news_title: The input news title.
145
+ news_content: The input news content.
146
+ replace_df: A DataFrame with two columns:
147
+ "Find what:" and "Replace with:".
148
 
149
  Returns:
150
+ A tuple containing the modified news title and content.
151
  """
152
  for _, row in replace_df.iterrows():
153
  find_what = row["Find what:"]
154
  replace_with = row["Replace with:"]
155
  news_content = news_content.replace(find_what, replace_with)
156
  news_title = news_title.replace(find_what, replace_with)
157
+
158
  return news_title, news_content
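
A minimal usage sketch for the retyped replace_text (the "Find what:" / "Replace with:" column labels come from the function body above; the sample row is invented):

import pandas as pd
from src.application.content_generation import replace_text

replace_df = pd.DataFrame({"Find what:": ["Paris"], "Replace with:": ["Lyon"]})
title, content = replace_text("Fire in Paris", "Paris saw a large fire.", replace_df)
# title == "Fire in Lyon"; content == "Lyon saw a large fire."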
src/application/formatting.py ADDED
@@ -0,0 +1,67 @@
1
+ from src.application.text.helper import extract_starts_ends, filter_indices
2
+
3
+
4
+ def color_text(text: str, colored_idx: list[dict], highlighted_idx: list[int]) -> str:
5
+ """
6
+ Colors specific words in a text based on provided indices.
7
+
8
+ This function takes a text, a list of indices to color, and a list of indices to exclude.
9
+ It splits the text into words, filters the indices, and then wraps the words within
10
+ the specified ranges with a green span tag for coloring.
11
+
12
+ Args:
13
+ text (str): The input text.
14
+ colored_idx (list): A list of dictionaries, where each dictionary contains
15
+ 'start' and 'end' keys representing indices of words to color.
16
+ highlighted_idx (list): A list of indices to exclude from coloring.
17
+
18
+ Returns:
19
+ str: The text with colored words.
20
+ """
21
+ sentence = ""
22
+ words = text.split()
23
+
24
+ # Extract start and end indices from colored_idx.
25
+ starts, ends = extract_starts_ends(colored_idx)
26
+
27
+ # Filter the start and end indices to exclude highlighted_idx.
28
+ starts, ends = filter_indices(starts, ends, highlighted_idx)
29
+
30
+ previous_end = 0
31
+ for start, end in zip(starts, ends):
32
+ # Add the words before the current colored range to the sentence.
33
+ sentence += " ".join(words[previous_end:start])
34
+
35
+ # Add the colored range to the sentence.
36
+ equal_words = " ".join(words[start:end])
37
+ sentence += f" <span style='color:#00FF00;'>{equal_words}</span> "
38
+
39
+ # Update the previous end index.
40
+ previous_end = end
41
+
42
+ # Add the remaining words after the last colored range to the sentence.
43
+ sentence += " ".join(words[previous_end:])
44
+
45
+ return sentence
46
+
47
+
48
+ def format_entity_count(entity_count: int) -> str:
49
+ """
50
+ Generates a text description based on the number of altered entities.
51
+
52
+ Args:
53
+ entity_count (int): The number of altered entities.
54
+
55
+ Returns:
56
+ str: A text description of the entity count.
57
+ - "" if entity_count is 0 or negative.
58
+ - "with 1 altered entity" if entity_count is 1.
59
+ - "with altered entities" if entity_count is greater than 1.
60
+ """
61
+ if entity_count <= 0:
62
+ entity_count_text = ""
63
+ elif entity_count == 1:
64
+ entity_count_text = "with 1 altered entity"
65
+ else:
66
+ entity_count_text = "with altered entities"
67
+ return entity_count_text
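
A quick sketch of the two helpers in the new module (example inputs are invented; indices are word positions, end-exclusive, as used by the calling code):

from src.application.formatting import color_text, format_entity_count

format_entity_count(0)  # ""
format_entity_count(1)  # "with 1 altered entity"
format_entity_count(3)  # "with altered entities"

# Wrap words 1-2 of the sentence in a green <span>, excluding nothing.
color_text("the quick brown fox", [{"start": 1, "end": 3}], [])
# -> "the <span style='color:#00FF00;'>quick brown</span> fox" (modulo spacing)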
src/application/formatting_ordinary_user.py ADDED
@@ -0,0 +1,87 @@
1
+ from src.application.config import WORD_BREAK
2
+
3
+
4
+ def create_ordinary_user_table(self):
5
+ rows = []
6
+ rows.append(self.format_image_ordinary_user_row())
7
+ rows.append(self.format_text_ordinary_user_row())
8
+ table = "\n".join(rows)
9
+
10
+ return f"""
11
+ <h5>Comparison between input news and source news:</h5>
12
+ <table border="1" style="width:100%; text-align:left;">
13
+ <col style="width: 340px;">
14
+ <col style="width: 30px;">
15
+ <col style="width: 75px;">
16
+ <thead>
17
+ <tr>
18
+ <th>Input news</th>
19
+ <th>Forensic</th>
20
+ <th>Originality</th>
21
+ </tr>
22
+ </thead>
23
+ <tbody>
24
+ {table}
25
+ </tbody>
26
+ </table>
27
+
28
+ <style>
29
+ """
30
+
31
+ def format_text_ordinary_user_row(self):
32
+ input_sentences = ""
33
+ source_text_urls = ""
34
+ urls = []
35
+ for _, row in self.aligned_sentences_df.iterrows():
36
+ if row["input"] is None:
37
+ continue
38
+
39
+ input_sentences += row["input"] + "<br><br>"
40
+ url = row["url"]
41
+ if url not in urls:
42
+ urls.append(url)
43
+ source_text_urls += f"""<a href="{url}">{url}</a><br>"""
44
+
45
+ return f"""
46
+ <tr>
47
+ <td>{input_sentences}</td>
48
+ <td>{self.text_prediction_label[0]}<br>
49
+ ({self.text_prediction_score[0] * 100:.2f}%)</td>
50
+ <td style="{WORD_BREAK}";>{source_text_urls}</td>
51
+ </tr>
52
+ """
53
+
54
+ def format_image_ordinary_user_row(
55
+ image_referent_url: str,
56
+ image_prediction_label: str,
57
+ image_prediction_score: float,
58
+ ):
59
+ """
60
+ Formats an HTML table row for ordinary users,
61
+ displaying image analysis results.
62
+
63
+ Args:
64
+ image_referent_url (str): The URL of the referenced image.
65
+ image_prediction_label (str): The predicted label for the image.
66
+ image_prediction_score (float): The prediction score for the image.
67
+
68
+ Returns:
69
+ str: An HTML table row string containing the image analysis results.
70
+ """
71
+
72
+ # Put image, label, and score into html tag
73
+ if (
74
+ image_referent_url is not None
75
+ or image_referent_url != ""
76
+ ):
77
+ source_image_url = f"""<a href="{image_referent_url}">{image_referent_url}</a>""" # noqa: E501
78
+ else:
79
+ source_image_url = ""
80
+
81
+ return f"""
82
+ <tr>
83
+ <td>input image</td>
84
+ <td>{image_prediction_label}<br>({image_prediction_score:.2f}%)</td>
85
+ <td style="{WORD_BREAK}";>{source_image_url}</td>
86
+ </tr>
87
+ """
src/application/text/entity.py CHANGED
@@ -44,6 +44,7 @@ def extract_entities_gpt(
44
  """
45
 
46
  # Construct the prompt for the GPT model.
 
47
  prompt = f"""
48
  Compare the ORIGINAL TEXT and the COMPARED TEXT.
49
  Find entity pairs with significantly different meanings after paraphrasing.
 
44
  """
45
 
46
  # Construct the prompt for the GPT model.
47
+ # TODO: Move to config or prompt file
48
  prompt = f"""
49
  Compare the ORIGINAL TEXT and the COMPARED TEXT.
50
  Find entity pairs with significantly different meanings after paraphrasing.
src/application/text/helper.py CHANGED
@@ -8,7 +8,10 @@ import string
8
  from collections import Counter
9
  from difflib import SequenceMatcher
10
 
11
- from nltk.tokenize import word_tokenize
 
 
 
12
  from nltk.util import ngrams
13
  from sklearn.feature_extraction.text import TfidfVectorizer
14
 
@@ -276,3 +279,223 @@ def connect_consecutive_indexes(nums: list[int]) -> list[list[int, int]]:
276
  # Add the last range to the result.
277
  result.append([start, end])
278
  return result
 
8
  from collections import Counter
9
  from difflib import SequenceMatcher
10
 
11
+ from nltk.tokenize import (
12
+ sent_tokenize,
13
+ word_tokenize,
14
+ )
15
  from nltk.util import ngrams
16
  from sklearn.feature_extraction.text import TfidfVectorizer
17
 
 
279
  # Add the last range to the result.
280
  result.append([start, end])
281
  return result
282
+
283
+
284
+ def postprocess_label(labels: list[str]) -> str:
285
+ """
286
+ Creates a label string with the format
287
+ "Partially generated by [label1] and [label2] and ...".
288
+ Removes duplicate labels while preserving the original order.
289
+
290
+ Args:
291
+ labels: A list of strings representing labels.
292
+
293
+ Returns:
294
+ A string with the formatted label.
295
+ """
296
+ prefix = "Partially generated by "
297
+ for index, label in enumerate(labels):
298
+ if label.startswith(prefix):
299
+ labels[index] = label[len(prefix):]
300
+
301
+ labels = list(set(labels))
302
+ label = prefix
303
+
304
+ if len(labels) == 1:
305
+ label += labels[0]
306
+ elif len(labels) == 2:
307
+ label += f"{labels[0]} and {labels[1]}"
308
+ else:
309
+ combination = ", ".join(labels[0 : len(labels) - 1])
310
+ label += f"{combination}, and {labels[-1]}"
311
+ return label
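
For example (duplicates are removed via a set, so the ordering of distinct model names is not guaranteed):

postprocess_label(["Partially generated by gpt-4o", "Partially generated by gpt-4o"])
# -> "Partially generated by gpt-4o"

postprocess_label(["Partially generated by gpt-4o", "gpt-4o-mini"])
# -> e.g. "Partially generated by gpt-4o and gpt-4o-mini" (order may vary)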
312
+
313
+
314
+ def split_into_sentences(input_text: str) -> list[str]:
315
+ """
316
+ Splits input text into sentences by newlines
317
+ and then tokenizes each paragraph into sentences.
318
+
319
+ Args:
320
+ input_text (str): The input text as a string.
321
+
322
+ Returns:
323
+ list: A list of sentences.
324
+ Returns an empty list if input is not a string.
325
+ """
326
+ if not isinstance(input_text, str):
327
+ return []
328
+
329
+ # Split the input text into paragraphs based on newline characters,
330
+ # keeping the newline characters.
331
+ paragraphs = input_text.splitlines(keepends=True)
332
+ sentences = []
333
+ for paragraph in paragraphs:
334
+ # Remove leading/trailing whitespace
335
+ paragraph = paragraph.strip()
336
+
337
+ if paragraph and paragraph != "\n":
338
+ # Tokenize the paragraph into sentences
339
+ sentences.extend(sent_tokenize(paragraph))
340
+
341
+ return sentences
342
+
343
+
344
+ def split_into_paragraphs(input_text: str) -> list[str]:
345
+ """
346
+ Splits input text into paragraphs based on newline characters.
347
+
348
+ Args:
349
+ input_text (str): The input text as a string.
350
+
351
+ Returns:
352
+ list: A list of paragraphs.
353
+ Returns an empty list if input is not a string.
354
+ """
355
+ if not isinstance(input_text, str):
356
+ return []
357
+
358
+ # Split the input text into paragraphs based on newline characters,
359
+ # keeping the newline characters.
360
+ paragraphs = input_text.splitlines(keepends=True)
361
+ out_paragraphs = []
362
+
363
+ for paragraph in paragraphs:
364
+ # Remove leading/trailing whitespace
365
+ paragraph = paragraph.strip()
366
+
367
+ if paragraph and paragraph != "\n":
368
+ # Append the cleaned paragraph to the output list.
369
+ out_paragraphs.append(paragraph)
370
+
371
+ return out_paragraphs
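
Example behaviour of the splitters moved here from the now-deleted preprocessing.py (assumes NLTK's punkt tokenizer data is available):

text = "First paragraph.\n\nSecond one. It has two sentences.\n"
split_into_paragraphs(text)
# -> ['First paragraph.', 'Second one. It has two sentences.']
split_into_sentences(text)
# -> ['First paragraph.', 'Second one.', 'It has two sentences.']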
372
+
373
+
374
+ def extract_starts_ends(colored_idx: list[dict]) -> tuple[list[int], list[int]]:
375
+ """
376
+ Extracts start and end indices from a list of dictionaries.
377
+
378
+ Args:
379
+ colored_idx (list[dict]): A list of dictionaries,
380
+ where each dictionary has 'start' and 'end' keys.
381
+
382
+ Returns:
383
+ tuple: A tuple containing two lists:
384
+ - starts (list[int]): A list of start indices.
385
+ - ends (list[int]): A list of end indices.
386
+ """
387
+ starts = []
388
+ ends = []
389
+ for index in colored_idx:
390
+ starts.append(index["start"])
391
+ ends.append(index["end"])
392
+ return starts, ends
393
+
394
+
395
+ def filter_indices(starts: list[int], ends: list[int], ignore_indices: list[int]):
396
+ """
397
+ Filters start and end indices to exclude any indices present in the
398
+ ignore_indices list.
399
+
400
+ Args:
401
+ starts (list[int]): A list of starting indices.
402
+ ends (list[int]): A list of ending indices.
403
+ Must be the same length as starts.
404
+ ignore_indices (list[int]): A list of indices to exclude.
405
+
406
+ Returns:
407
+ A tuple of two lists of integers:
408
+ - filtered_starts
409
+ - filtered_ends
410
+ Returns empty lists if the input is invalid
411
+ or if all ranges are filtered out.
412
+
413
+ Examples:
414
+ starts = [0, 5, 10]
415
+ ends = [3, 7, 12] # words at the end will not be colored.
416
+ ignore_indices = [1, 2, 12, 17]
417
+
418
+ # Output:
419
+ starts = [0, 3, 5, 10]
420
+ ends = [1, 4, 7, 12]
421
+
422
+ """
423
+
424
+ if len(starts) != len(ends):
425
+ print(
426
+ "Error: The 'starts' & 'ends' lists must have the same length.",
427
+ )
428
+ return [], []
429
+
430
+ filtered_starts = []
431
+ filtered_ends = []
432
+
433
+ for i in range(len(starts)):
434
+ start = starts[i]
435
+ end = ends[i]
436
+
437
+ if end < start:
438
+ print(
439
+ f"Error: End index {end} < start index {start} at position {i}.", # noqa: E501
440
+ )
441
+ return [], []
442
+
443
+ start_end = list(range(start, end + 1, 1))
444
+ start_end = list(set(start_end) - set(ignore_indices))
445
+ # new_start, new_end = self.extract_sequences(start_end)
446
+ new_start, new_end = extract_new_startend(
447
+ start,
448
+ end,
449
+ ignore_indices,
450
+ )
451
+ filtered_starts.extend(new_start)
452
+ filtered_ends.extend(new_end)
453
+
454
+ return filtered_starts, filtered_ends
455
+
456
+
457
+ def extract_new_startend(start: int, end: int, ignore_indices: list[int]) -> tuple[list[int], list[int]]:
458
+ """
459
+ Extracts new start and end indices by splitting a range based on
460
+ ignored indices.
461
+
462
+ Args:
463
+ start (int): The starting index of the range.
464
+ end (int): The ending index of the range (exclusive).
465
+ ignore_indices (list): indices to ignore within the range.
466
+
467
+ Returns:
468
+ tuple: A tuple containing two lists:
469
+ - new_starts (list): Starting indices for the sub-ranges.
470
+ - new_ends (list): Ending indices for the sub-ranges.
471
+ """
472
+ # Sort the set of ignore_indices in ascending order.
473
+ indexes = list(set(ignore_indices))
474
+ indexes.sort()
475
+
476
+ new_starts = []
477
+ new_ends = []
478
+ new_start = start
479
+
480
+ # If no indices to ignore, return the original range.
481
+ if indexes is None or len(indexes) < 1:
482
+ new_starts.append(start)
483
+ new_ends.append(end)
484
+ return new_starts, new_ends
485
+
486
+ for index in indexes:
487
+ # Skip indices that are outside the range [start, end).
488
+ if index < start:
489
+ continue
490
+ elif index >= end:
491
+ continue
492
+
493
+ new_starts.append(new_start)
494
+ new_ends.append(index)
495
+
496
+ new_start = index + 1
497
+
498
+ new_starts.append(new_start)
499
+ new_ends.append(end)
500
+
501
+ return new_starts, new_ends
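
Below is a minimal usage sketch of the new range helpers (assuming they are exported from src.application.text.helper; the values are illustrative and not taken from the repository's tests):

    from src.application.text.helper import extract_new_startend, filter_indices

    # Split the range from 3 to 9 around the ignored indices 5 and 6.
    starts, ends = extract_new_startend(3, 9, [5, 6])
    print(starts, ends)  # [3, 6, 7] [5, 6, 9]

    # Filter two ranges at once, dropping the ignored indices 2 and 12.
    starts, ends = filter_indices([0, 10], [4, 14], [2, 12])
    print(starts, ends)  # [0, 3, 10, 13] [2, 4, 12, 14]
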
src/application/text/model_detection.py CHANGED
@@ -13,7 +13,6 @@ from src.application.config import (
     DEVICE,
     GPT_PARAPHRASE_MODELS,
     HUMAN,
-    MACHINE,
     MODEL_HUMAN_LABEL,
     PARAPHRASE_MODEL,
     UNKNOWN,
@@ -62,9 +61,9 @@ def detect_text_by_ai_model(
     if result["label"] == MODEL_HUMAN_LABEL[model]:
         label = HUMAN
     else:
-        label = MACHINE
+        # label = MACHINE
         generated_model, _ = predict_generation_model(input_text)
-        label += f"<br>({generated_model})"
+        label = f"Partially generated by {generated_model}"
 
     return label, confidence_score
 
src/application/text/preprocessing.py DELETED
@@ -1,67 +0,0 @@
-"""
-Author: Khanh Phan
-Date: 2024-12-04
-"""
-
-from nltk.tokenize import sent_tokenize
-
-
-# TODO: consider moving to helpers
-def split_into_sentences(input_text: str) -> list[str]:
-    """
-    Splits input text into sentences by newlines
-    and then tokenizes each paragraph into sentences.
-
-    Args:
-        input_text (str): The input text as a string.
-
-    Returns:
-        list: A list of sentences.
-            Returns an empty list if input is not a string.
-    """
-    if not isinstance(input_text, str):
-        return []
-
-    # Split the input text into paragraphs based on newline characters,
-    # keeping the newline characters.
-    paragraphs = input_text.splitlines(keepends=True)
-    sentences = []
-    for paragraph in paragraphs:
-        # Remove leading/trailing whitespace
-        paragraph = paragraph.strip()
-
-        if paragraph and paragraph != "\n":
-            # Tokenize the paragraph into sentences
-            sentences.extend(sent_tokenize(paragraph))
-
-    return sentences
-
-
-def split_into_paragraphs(input_text: str) -> list[str]:
-    """
-    Splits input text into paragraphs based on newline characters.
-
-    Args:
-        input_text (str): The input text as a string.
-
-    Returns:
-        list: A list of paragraphs.
-            Returns an empty list if input is not a string.
-    """
-    if not isinstance(input_text, str):
-        return []
-
-    # Split the input text into paragraphs based on newline characters,
-    # keeping the newline characters.
-    paragraphs = input_text.splitlines(keepends=True)
-    out_paragraphs = []
-
-    for paragraph in paragraphs:
-        # Remove leading/trailing whitespace
-        paragraph = paragraph.strip()
-
-        if paragraph and paragraph != "\n":
-            # Append the cleaned paragraph to the output list.
-            out_paragraphs.append(paragraph)
-
-    return out_paragraphs
src/application/text/search_detection.py CHANGED
@@ -17,7 +17,7 @@ from src.application.config import (
     PARAPHRASE_THRESHOLD_MACHINE,
     TOP_URLS_PER_SEARCH,
 )
-from src.application.text.preprocessing import split_into_sentences
+from src.application.text.helper import split_into_sentences
 from src.application.text.search import (
     generate_search_phrases,
     search_by_google,
src/application/url_reader.py CHANGED
@@ -8,18 +8,29 @@ from newspaper import (
     article,
 )
 
-# TODO: move this to a config file
-MAX_URL_SIZE = 2000000  # ~2MB
+from src.application.config import MAX_URL_SIZE
 
 
 class URLReader:
+    """
+    A class to extract content (title, text, images) from a given URL.
+    Supports two extraction methods: newspaper4k and BeautifulSoup.
+    """
+
     def __init__(self, url: string, newspaper: bool = True):
-        self.url = url
-        self.text = None  # string
-        self.title = None  # string
-        self.images = None  # list of Image objects
-        self.top_image = None  # Image object
-        self.is_extracted = False
+        """
+        Initializes the URLReader object.
+
+        Args:
+            url: The URL to extract content from.
+            newspaper: True to use newspaper4k, False to use BeautifulSoup.
+        """
+        self.url: str = url
+        self.text: str = None  # Extracted text content
+        self.title: str = None  # Extracted title
+        self.images: list[str] = None  # list of image URLs
+        self.top_image: str = None  # URL of the top image
+        self.is_extracted: bool = False  # Indicating successful extraction
 
         url_size = self.get_size()
         if url_size is None or url_size > MAX_URL_SIZE:
@@ -27,9 +38,7 @@ class URLReader:
         else:
             self.is_extracted = True
 
-        self.newspaper = (
-            newspaper  # True if using newspaper4k, False if using BS
-        )
+        self.newspaper = newspaper
         if self.newspaper is True:
            self.extract_content_newspaper()
         else:
@@ -37,81 +46,70 @@ class URLReader:
 
     def extract_content_newspaper(self):
         """
-        Use newspaper4k to extracts content from a URL
-
-        Args:
-            url: The URL of the web page.
-
-        Returns:
-            The extracted content (title, text, images)
+        Extracts content from a URL using the newspaper4k library.
         """
-
         try:
             response = requests.get(self.url)
-            response.raise_for_status()
+            response.raise_for_status()  # Raise HTTPError for bad responses
+
+            news = article(url=self.url, fetch_images=True)
+
+            self.title = news.title
+            self.text = news.text
+            self.images = list(set(news.images))  # Remove duplicates
+            self.top_image = news.top_image
+
         except requests.exceptions.RequestException as e:
             print(f"Error fetching URL: {e}")
             return None
-
-        try:
-            news = article(url=self.url, fetch_images=True)
         except (ArticleException, ArticleBinaryDataException) as e:
             print(f"\t\t↑↑↑ Error downloading article: {e}")
             return None
 
-        self.title = news.title
-        self.text = news.text
-        self.images = list(set(news.images))  # Remove duplicates
-        self.top_image = news.top_image
-
     def extract_content_bs(self):
         """
-        Use BS and process content
+        Extracts content from a URL using BeautifulSoup.
         """
-        response = requests.get(self.url)
-        response.raise_for_status()
+        try:
+            response = requests.get(self.url)
+            response.raise_for_status()
 
-        response.encoding = response.apparent_encoding
+            response.encoding = response.apparent_encoding  # Detect encoding
 
-        try:
             soup = BeautifulSoup(response.content, "html.parser")
-        except Exception as e:
-            print(f"Error parsing HTML content from {self.url}: {e}")
-            return None
 
-        self.title = soup.title.string.strip() if soup.title else None
+            self.title = soup.title.string.strip() if soup.title else None
 
-        image_urls = [img["src"] for img in soup.find_all("img")]
-        self.images = image_urls
-        self.top_image = self.images[0]
+            image_urls = [img["src"] for img in soup.find_all("img")]
+            self.images = image_urls
+            self.top_image = self.images[0]
 
-        # Exclude text within specific elements
-        for element in soup(["img", "figcaption", "table", "script", "style"]):
-            element.extract()
-        # text = soup.get_text(separator="\n")
-        paragraphs = soup.find_all("p")
-        text = " ".join([p.get_text() for p in paragraphs])
+            # Remove unwanted elements from the HTML
+            for element in soup(
+                ["img", "figcaption", "table", "script", "style"],
+            ):
+                element.extract()
 
-        self.text = text
+            paragraphs = soup.find_all("p")
+            self.text = " ".join([p.get_text() for p in paragraphs])
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error fetching URL: {e}")
+            return None
+        except Exception as e:
+            print(f"Error parsing HTML content from {self.url}: {e}")
+            return None
 
     def get_size(self):
         """
         Retrieves the size of a URL's content using a HEAD request.
-
-        Args:
-            url: The URL to check.
-
-        Returns:
-            The size of the content in bytes,
-            or None if the size cannot be determined
-            (e.g., due to network errors or missing Content-Length header).
         """
         try:
             response = requests.head(
                 self.url,
                 allow_redirects=True,
                 timeout=5,
-            )  # Add timeout
+            )
             response.raise_for_status()  # Raise HTTPError for bad responses
 
             content_length = response.headers.get("Content-Length")
@@ -123,7 +121,7 @@ class URLReader:
 
         except requests.exceptions.RequestException as e:
            print(f"\t\t↑↑↑ Error getting URL size: {e}")
-            return None
+            return None
 
 
 if __name__ == "__main__":
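
A minimal sketch of how the refactored reader might be exercised (the URL below is a placeholder, not taken from the commit):

    from src.application.url_reader import URLReader

    reader = URLReader("https://example.com/some-article", newspaper=True)
    if reader.is_extracted:
        print(reader.title)
        print(len(reader.images or []), "image URLs found")
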
test.py CHANGED
@@ -1,3 +1,27 @@
-a = [1, 2]
-a.append(None)
-print(a)
+def postprocess_label(labels: list[str]) -> str:
+    """
+    Creates a label string with the format
+    "Partially generated by [label1] and [label2] and ...".
+    Removes duplicate labels while preserving the original order.
+
+    Args:
+        labels: A list of strings representing labels.
+
+    Returns:
+        A string with the formatted label.
+    """
+    labels = list(dict.fromkeys(labels))  # Deduplicate, keep original order
+    label = "Partially generated by "
+    if len(labels) == 1:
+        label += labels[0]
+    elif len(labels) == 2:
+        label += f"{labels[0]} and {labels[1]}"
+    else:
+        combination = ", ".join(labels[0 : len(labels) - 1])
+        label += f"{combination}, and {labels[-1]}"
+    return label
+
+
+labels = ["gpt-4o", "gpt-4o-mini", "gpt-4o-l"]
+postprocessed_label = postprocess_label(labels)
+print(postprocessed_label)
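
For reference, the example call above is expected to print the following (expected output, assuming the labels list keeps its original order; not captured from an actual run):

    Partially generated by gpt-4o, gpt-4o-mini, and gpt-4o-l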