Spaces:
Running
Running
File size: 5,752 Bytes
e58707f 5842223 e58707f 00b1038 e58707f 00b1038 e58707f 00b1038 e58707f 00b1038 e58707f 00b1038 e58707f 00b1038 e58707f 5842223 00b1038 e58707f 5842223 e58707f 5842223 e58707f 5842223 e58707f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
from pandas import DataFrame
from src.application.config import WORD_BREAK
from src.application.formatting import (
color_text,
format_entity_count,
)
from src.application.image.helper import encode_image
from src.application.image.image import ImageDetector
from src.application.text.entity import apply_highlight
from src.application.text.helper import (
extract_equal_text,
replace_leading_spaces,
)
from src.application.text.text import TextDetector
def create_governor_table(
    aligned_sentences_df: DataFrame,
    text: TextDetector,
    image: ImageDetector,
) -> str:
    """Build the HTML comparison table between the input news and its sources.

    The table has at most two body rows: one for the image (when
    ``image.input`` is set) followed by one for the text (when ``text.input``
    is set).

    Args:
        aligned_sentences_df: Frame iterated row by row; each row is read via
            the ``"input"``, ``"source"`` and ``"entities"`` keys here.
        text: Text detector. Its ``governor_table`` list is appended to
            (one entry per aligned sentence) before the text row is rendered.
        image: Image detector, rendered through ``format_image_governor_row``.

    Returns:
        An HTML fragment containing a heading and the comparison table.
    """
    rows = []

    if image.input is not None:
        rows.append(format_image_governor_row(image))

    if text.input is not None:
        for _, row in aligned_sentences_df.iterrows():
            if row["input"] is None:
                continue

            if row["source"] is None:
                # Two distinct lists: a chained `a = b = []` would make both
                # names share one object, so mutating one would alter both.
                equal_idx_1, equal_idx_2 = [], []
            else:
                # Get index of equal phrases in input and source sentences
                equal_idx_1, equal_idx_2 = extract_equal_text(
                    row["input"],
                    row["source"],
                )

            # NOTE(review): entries accumulate on `text.governor_table`; this
            # function never clears it, so repeated calls with the same
            # detector would duplicate rows - confirm the caller resets it.
            text.governor_table.append(
                [
                    row,
                    equal_idx_1,
                    equal_idx_2,
                    row["entities"],
                ],
            )

        # All sentences are rendered together as a single table row.
        rows.append(format_text_governor_row(text))

    table = "\n".join(rows)

    # The template previously ended with a dangling, never-closed <style> tag;
    # it carried no rules, so it is dropped to keep the markup well-formed.
    return f"""
    <h5>Comparison between input news and source news:</h5>
    <table border="1" style="width:100%; text-align:left;">
        <col style="width: 170px;">
        <col style="width: 170px;">
        <col style="width: 30px;">
        <col style="width: 75px;">
        <thead>
            <tr>
                <th>Input news</th>
                <th>Source (URL in Originality)</th>
                <th>Forensic</th>
                <th>Originality</th>
            </tr>
        </thead>
        <tbody>
            {table}
        </tbody>
    </table>
    """
def _restore_span_markup(sentence):
    """Turn the underscore placeholders back into real HTML span markup."""
    # NOTE(review): the placeholders presumably protect the markup from being
    # split on whitespace by `color_text` - confirm against that helper.
    return sentence.replace("span_style", "span style").replace(
        "1px_4px",
        "1px 4px",
    )


def format_text_governor_row(text) -> str:
    """Render the text comparison as a single HTML table row.

    Every entry of ``text.governor_table`` is a 4-item sequence:
    ``[aligned_row, equal_idx_input, equal_idx_source, entities]``.
    Sentences are joined with ``<br>``; each distinct source URL is linked
    once in the last column.

    Args:
        text: Object exposing ``governor_table``, ``prediction_label`` and
            ``prediction_score`` (the first element of the latter two is
            displayed; the score is multiplied by 100 for display).

    Returns:
        An HTML ``<tr>`` fragment with four cells: input sentences, source
        sentences, prediction summary with entity count, and source links.
    """
    from html import escape  # local import keeps this block self-contained

    input_parts = []
    source_parts = []
    url_links = []
    urls = []
    # Two leading zeros so that `entity_count[-2]` is valid from the first row.
    # NOTE(review): `[-2]` is the entity count of the row *before the previous
    # one*, not a running total - kept as-is; confirm this is the offset
    # `apply_highlight` expects.
    entity_count = [0, 0]

    for row in text.governor_table:
        aligned = row[0]
        entities = row[3]  # entities_with_colors

        if aligned["input"] is None:
            continue

        if aligned["source"] is not None:
            # Highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                aligned["input"],
                entities,
                "input",  # key
                entity_count[-2],
            )
            source_sentence, highlight_idx_source = apply_highlight(
                aligned["source"],
                entities,
                "source",  # key
                entity_count[-2],
            )

            # Color overlapping words (text, equal-phrase idx, highlight idx)
            input_sentence = color_text(
                input_sentence,
                row[1],
                highlight_idx_input,
            )
            source_sentence = color_text(
                source_sentence,
                row[2],
                highlight_idx_source,
            )

            input_sentence = _restore_span_markup(input_sentence)
            source_sentence = _restore_span_markup(source_sentence)
        else:
            # No matching source: show the raw input next to an empty cell.
            input_sentence = aligned["input"]
            source_sentence = ""

        input_parts.append(replace_leading_spaces(input_sentence) + "<br>")
        source_parts.append(replace_leading_spaces(source_sentence) + "<br>")

        url = aligned["url"]
        # Skip missing URLs so no `<a href="None">None</a>` link is emitted.
        if url is not None and url != "" and url not in urls:
            urls.append(url)
            safe_url = escape(str(url))
            url_links.append(f"""<a href="{safe_url}">{safe_url}</a><br><br>""")

        if entities is not None:
            entity_count.append(len(entities))

    input_sentences = "".join(input_parts)
    source_sentences = "".join(source_parts)
    source_text_urls = "".join(url_links)
    entity_count_text = format_entity_count(sum(entity_count))

    return f"""
    <tr>
        <td>{input_sentences}</td>
        <td>{source_sentences}</td>
        <td>{text.prediction_label[0]}<br>
        ({text.prediction_score[0] * 100:.2f}%)<br><br>
        {entity_count_text}</td>
        <td style="{WORD_BREAK}">{source_text_urls}</td>
    </tr>
    """
def format_image_governor_row(image: ImageDetector) -> str:
    """Render the image comparison as a single HTML table row.

    Args:
        image: Object exposing ``input``, ``referent_url``,
            ``prediction_label`` and ``prediction_score``. An ``input``
            containing ``"http"`` is shown as a link; any other value is
            passed to ``encode_image`` and embedded as a base64 JPEG.

    Returns:
        An empty string when ``image.input`` is ``None``; otherwise an HTML
        ``<tr>`` fragment. When ``referent_url`` is ``None`` or empty the
        source cell reads ``"Image not found"`` and the link cell is empty.
    """
    from html import escape  # local import keeps this block self-contained

    if image.input is None:
        return ""

    # The input cell is built unconditionally: it does not depend on whether
    # a source image was found, and the final template always needs it.
    if "http" in image.input:
        input_url = escape(image.input)
        input_image = f"""<a href="{input_url}">{input_url}</a>"""
    else:
        base64_image = encode_image(image.input)
        input_image = f"""<img src="data:image/jpeg;base64,{base64_image}" width="100" height="150">"""  # noqa: E501

    # The previous test (`is not None or != ""`) was true for every value,
    # which made the "Image not found" branch unreachable.
    if image.referent_url:
        referent_url = escape(str(image.referent_url))
        source_image_url = f"""<a href="{referent_url}">{referent_url}</a>"""
        source_image = (
            f"""<img src="{referent_url}" width="100" height="150">"""
        )
    else:
        source_image = "Image not found"
        source_image_url = ""

    return f"""
    <tr>
        <td>{input_image}</td>
        <td>{source_image}</td>
        <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
        <td style="{WORD_BREAK}">{source_image_url}</td>
    </tr>"""
|