File size: 5,752 Bytes
e58707f
 
 
 
 
 
 
5842223
e58707f
 
00b1038
 
 
 
e58707f
 
 
 
 
 
 
 
 
00b1038
 
e58707f
00b1038
 
 
 
e58707f
00b1038
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e58707f
 
00b1038
 
e58707f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00b1038
 
 
 
 
e58707f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5842223
00b1038
 
 
e58707f
5842223
 
 
 
 
 
 
e58707f
5842223
e58707f
 
 
 
 
 
5842223
e58707f
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
from pandas import DataFrame

from src.application.config import WORD_BREAK
from src.application.formatting import (
    color_text,
    format_entity_count,
)
from src.application.image.helper import encode_image
from src.application.image.image import ImageDetector
from src.application.text.entity import apply_highlight
from src.application.text.helper import (
    extract_equal_text,
    replace_leading_spaces,
)
from src.application.text.text import TextDetector


def create_governor_table(
    aligned_sentences_df: DataFrame,
    text: TextDetector,
    image: ImageDetector,
):
    """
    Build the HTML table comparing the input news with its source news.

    An image row is added when ``image.input`` is set, followed by a single
    text row (covering every aligned sentence) when ``text.input`` is set.

    Args:
        aligned_sentences_df: One row per aligned sentence pair. The columns
            ``input``, ``source`` and ``entities`` are read here.
        text: Text detector. Its ``governor_table`` list is extended in place
            with ``[row, equal_idx_input, equal_idx_source, entities]``
            entries before the text row is rendered.
        image: Image detector; only ``image.input`` is inspected here, the
            rest is handled by ``format_image_governor_row``.

    Returns:
        str: An HTML fragment containing a heading and the comparison table.
    """
    rows = []
    if image.input is not None:
        rows.append(format_image_governor_row(image))

    if text.input is not None:
        # NOTE(review): entries are appended to ``text.governor_table`` without
        # clearing it first, so calling this twice on the same detector would
        # accumulate rows - confirm the caller resets it.
        for _, row in aligned_sentences_df.iterrows():
            if row["input"] is None:
                continue

            if row["source"] is None:
                # Two distinct lists: mutating one must not affect the other.
                equal_idx_1, equal_idx_2 = [], []
            else:
                # Get index of equal phrases in input and source sentences
                equal_idx_1, equal_idx_2 = extract_equal_text(
                    row["input"],
                    row["source"],
                )

            text.governor_table.append(
                [
                    row,
                    equal_idx_1,
                    equal_idx_2,
                    row["entities"],
                ],
            )

        rows.append(format_text_governor_row(text))

    table = "\n".join(rows)
    # The fragment used to end with a dangling, never-closed ``<style>`` tag;
    # it carried no CSS and made the markup malformed, so it is omitted.
    return f"""
<h5>Comparison between input news and source news:</h5>
<table border="1" style="width:100%; text-align:left;">
<col style="width: 170px;">
<col style="width: 170px;">
<col style="width: 30px;">
<col style="width: 75px;">
    <thead>
        <tr>
            <th>Input news</th>
            <th>Source (URL in Originality)</th>
            <th>Forensic</th>
            <th>Originality</th>
        </tr>
    </thead>
    <tbody>
        {table}
    </tbody>
</table>
"""


def _restore_markup_spaces(sentence):
    """Turn the ``span_style`` / ``1px_4px`` placeholders back into spaced HTML."""
    # presumably the underscores keep the markup intact while the sentence is
    # processed word by word - confirm against apply_highlight / color_text.
    return sentence.replace("span_style", "span style").replace(
        "1px_4px",
        "1px 4px",
    )


def format_text_governor_row(text):
    """
    Render every entry of ``text.governor_table`` as one HTML table row.

    Each entry is ``[row, equal_idx_input, equal_idx_source, entities]``.
    Sentences that have a source get their entities highlighted and their
    overlapping words colored; sentences without a source are shown as-is
    next to an empty source cell. Each distinct ``row["url"]`` is listed once
    in the last column.

    Args:
        text: Object exposing ``governor_table``, ``prediction_label`` and
            ``prediction_score`` (the first element of the latter two is
            displayed, the score multiplied by 100).

    Returns:
        str: A ``<tr>`` element with four ``<td>`` cells: input sentences,
        source sentences, prediction with entity count, and source URLs.
    """
    input_parts = []
    source_parts = []
    url_parts = []
    urls = []
    # Seeded with two zeros so that ``entity_count[-2]`` exists on the first
    # iteration; the last element is the one for the current counting.
    entity_count = [0, 0]

    for row in text.governor_table:
        aligned = row[0]
        entities = row[3]
        if aligned["input"] is None:
            continue

        if aligned["source"] is not None:  # source is not empty
            # highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                aligned["input"],
                entities,  # entities_with_colors
                "input",  # key
                entity_count[-2],
            )
            source_sentence, highlight_idx_source = apply_highlight(
                aligned["source"],
                entities,  # entities_with_colors
                "source",  # key
                entity_count[-2],
            )

            # Color overlapping words: text, equal indices, highlight indices
            input_sentence = color_text(
                input_sentence,
                row[1],
                highlight_idx_input,
            )
            source_sentence = color_text(
                source_sentence,
                row[2],
                highlight_idx_source,
            )

            input_sentence = _restore_markup_spaces(input_sentence)
            source_sentence = _restore_markup_spaces(source_sentence)
        else:
            # No source sentence was found: plain input, empty source cell.
            input_sentence = aligned["input"]
            source_sentence = ""

        input_parts.append(replace_leading_spaces(input_sentence) + "<br>")
        source_parts.append(replace_leading_spaces(source_sentence) + "<br>")

        url = aligned["url"]
        if url not in urls:
            urls.append(url)
            url_parts.append(f"""<a href="{url}">{url}</a><br><br>""")
            # Entities are only counted the first time a URL is seen.
            if entities is not None:
                entity_count.append(len(entities))

    input_sentences = "".join(input_parts)
    source_sentences = "".join(source_parts)
    source_text_urls = "".join(url_parts)
    entity_count_text = format_entity_count(sum(entity_count))
    return f"""
<tr>
    <td>{input_sentences}</td>
    <td>{source_sentences}</td>
    <td>{text.prediction_label[0]}<br>
        ({text.prediction_score[0] * 100:.2f}%)<br><br>
        {entity_count_text}</td>
    <td style="{WORD_BREAK}">{source_text_urls}</td>
</tr>
"""


def format_image_governor_row(image: ImageDetector):
    """
    Render the image comparison as one HTML table row.

    Args:
        image: Object exposing ``input``, ``referent_url``,
            ``prediction_label`` and ``prediction_score``.

    Returns:
        str: An empty string when ``image.input`` is None; otherwise a
        ``<tr>`` element with the input image, the source image (or
        "Image not found"), the prediction, and the source URL.
    """
    if image.input is None:
        return ""

    # The input cell does not depend on whether a source image was found, so
    # it is built first; otherwise it would be undefined in the
    # "Image not found" case.
    if "http" in image.input:
        input_image = f"""<a href="{image.input}">{image.input}</a>"""
    else:
        base64_image = encode_image(image.input)
        input_image = f"""<img src="data:image/jpeg;base64,{base64_image}" width="100" height="150">"""  # noqa: E501

    # Both conditions must hold. The previous ``or`` was always true, so a
    # missing URL was rendered as a link/image pointing at "None".
    if image.referent_url is not None and image.referent_url != "":
        source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
        source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
    else:
        source_image = "Image not found"
        source_image_url = ""

    # NOTE(review): unlike the text row, the score is not multiplied by 100
    # here - confirm ``prediction_score`` is already a percentage.
    return f"""
<tr>
    <td>{input_image}</td>
    <td>{source_image}</td>
    <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
    <td style="{WORD_BREAK}">{source_image_url}</td>
</tr>"""