|
import re |
|
import html |
|
|
|
from collections import namedtuple |
|
from gram2vec.feature_locator import find_feature_spans |
|
from functools import lru_cache |
|
|
|
from utils.llm_feat_utils import generate_feature_spans_cached |
|
import pandas as pd |
|
# Lightweight character range; this module builds it as (start, start + length),
# so `end_char` is exclusive and the pair can be used directly as slice bounds.
Span = namedtuple("Span", ("start_char", "end_char"))
|
|
|
from gram2vec import vectorizer |
|
|
|
|
|
# Human-readable feature category -> shorthand prefix used when composing
# "prefix:code" feature identifiers (see get_shorthand / get_fullform).
FEATURE_HANDLERS = {
    "Part-of-Speech Unigram": "pos_unigrams",
    "Part-of-Speech Bigram": "pos_bigrams",
    "Function Word": "func_words",
    "Punctuation": "punctuation",
    "Letter": "letters",
    "Dependency Label": "dep_labels",
    "Morphology Tag": "morph_tags",
    "Sentence Type": "sentences",
    "Emoji": "emojis",
    "Number of Tokens": "num_tokens",
}
|
|
|
@lru_cache(maxsize=1)
def load_code_map(txt_path: str = "utils/augmented_human_readable.txt") -> dict:
    """Load the human-readable-name -> feature-code mapping from a text file.

    Each non-blank line is expected to look like ``Human Readable: CODE``.
    The line is split on the first colon only, so a code may itself contain
    colons; both sides are stripped of surrounding whitespace. Lines without
    a colon are skipped instead of aborting the whole load.

    The result is memoised for the most recently used ``txt_path``; the same
    dict object is handed to every caller, so treat it as read-only.

    Args:
        txt_path: Path of the mapping file (read as UTF-8).

    Returns:
        Dict mapping each human-readable name to its code. If a name occurs
        more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
    """
    code_map = {}
    with open(txt_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            human, sep, code = raw_line.strip().partition(":")
            if not sep:
                # Blank line, or a line with no "name: code" separator.
                continue
            code_map[human.strip()] = code.strip()
    return code_map
|
|
|
def get_shorthand(feature_str: str) -> str:
    """Translate 'Category:Human-Readable' into its 'prefix:code' shorthand.

    The input is split on the first colon and both halves are stripped.
    For example a part-of-speech unigram feature becomes 'pos_unigrams:ADJ'.

    Returns None (despite the ``str`` annotation) when the input has no
    colon, when the category is not a key of FEATURE_HANDLERS, or when the
    human-readable name is absent from the map returned by load_code_map().
    """
    left, colon, right = feature_str.partition(":")
    if not colon:
        # No separator at all: not a 'Category:Name' string.
        return None

    category = left.strip()
    if category not in FEATURE_HANDLERS:
        return None

    # The code map is only consulted once the category is known to be valid.
    code = load_code_map().get(right.strip())
    if code is None:
        return None

    prefix = FEATURE_HANDLERS[category]
    return f"{prefix}:{code}"
|
|
|
def get_fullform(shorthand: str) -> str:
    """Translate 'prefix:code' back into 'Category:Human-Readable'.

    For example 'pos_unigrams:ADJ' maps to a string of the form
    'Part-of-Speech Unigram:<human-readable name>'. The input is split on the
    first colon; neither half is stripped.

    Returns None (despite the ``str`` annotation) when the input has no
    colon, when the prefix is not a value of FEATURE_HANDLERS, or when the
    code is not a value in the map returned by load_code_map(). If several
    human-readable names share a code, the one listed last in the map wins.
    """
    prefix, colon, code = shorthand.partition(":")
    if not colon:
        return None

    # Invert prefix -> category; later entries overwrite earlier duplicates.
    prefix_to_category = dict(zip(FEATURE_HANDLERS.values(), FEATURE_HANDLERS.keys()))
    category = prefix_to_category.get(prefix)
    if category is None:
        return None

    # Invert the code map the same way (last duplicate wins).
    forward = load_code_map()
    code_to_human = dict(zip(forward.values(), forward.keys()))
    human = code_to_human.get(code)
    if human is None:
        return None

    return f"{category}:{human}"
|
|
|
def highlight_both_spans(text, llm_spans, gram_spans):
    """Render `text` as HTML with LLM and Gram2Vec spans wrapped in <mark> tags.

    The text is walked once, left to right. At every span boundary the set of
    active highlight classes is recomputed and <mark> tags are closed and
    reopened as needed, so the emitted markup is always properly nested even
    when spans of different classes partially overlap or touch.

    Args:
        text: Plain text to render; it is HTML-escaped piece by piece.
        llm_spans: Iterable of objects with `start_char` / `end_char`
            (end exclusive); rendered with class ``mark-llm``. May be None.
        gram_spans: Same shape as `llm_spans`; rendered with class
            ``mark-gram``. May be None.

    Returns:
        A <style> block followed by the highlighted HTML, with newlines in
        the text converted to ``<br>``.

    Spans with a missing or negative start (e.g. built from a failed
    ``str.find``) and empty spans are ignored; ends beyond the text are
    clamped to its length. Overlapping spans of the same class merge into a
    single highlight.
    """
    style = """
    <style>
      .mark-llm  { background-color: #fff176; }
      .mark-gram { background-color: #90caf9; }
    </style>
    """

    class_order = ("llm", "gram")
    text_len = len(text)

    # position -> {class: net change in the number of spans covering it}
    deltas = {}
    for cls, spans in zip(class_order, (llm_spans, gram_spans)):
        for span in spans or ():
            start, end = span.start_char, span.end_char
            if start is None or end is None or start < 0:
                continue  # invalid / "not found" span
            end = min(end, text_len)
            if start >= end:
                continue  # empty or fully out-of-range span
            for pos, change in ((start, 1), (end, -1)):
                at_pos = deltas.setdefault(pos, {})
                at_pos[cls] = at_pos.get(cls, 0) + change

    depth = dict.fromkeys(class_order, 0)  # spans of each class covering the cursor
    open_stack = []                        # classes with an open <mark>, outermost first
    out = []
    cursor = 0

    for pos in sorted(deltas):
        out.append(html.escape(text[cursor:pos]))
        cursor = pos

        for cls, change in deltas[pos].items():
            depth[cls] += change

        # Close every tag from the outermost ending class inwards, then reopen
        # the classes that are still active so nesting stays well-formed.
        ending = [cls for cls in open_stack if depth[cls] == 0]
        if ending:
            cut = min(open_stack.index(cls) for cls in ending)
            survivors = [cls for cls in open_stack[cut:] if depth[cls] > 0]
            out.append("</mark>" * (len(open_stack) - cut))
            del open_stack[cut:]
            for cls in survivors:
                out.append(f'<mark class="mark-{cls}">')
                open_stack.append(cls)

        # Open classes that just became active (llm first when both start here).
        for cls in class_order:
            if depth[cls] > 0 and cls not in open_stack:
                out.append(f'<mark class="mark-{cls}">')
                open_stack.append(cls)

    out.append(html.escape(text[cursor:]))
    highlighted = "".join(out)

    # Tags and escaped text contain no raw newlines of their own, so this only
    # converts line breaks that came from `text`.
    highlighted = highlighted.replace('\n', '<br>')

    return style + highlighted
|
|
|
|
|
def show_combined_spans_all(selected_feature_llm, selected_feature_g2v,
                            llm_style_feats_analysis, background_authors_embeddings_df,
                            task_authors_embeddings_df, visible_authors,
                            predicted_author=None, ground_truth_author=None,
                            max_num_authors=4):
    """Build highlighted HTML for the task authors and the background authors.

    Steps:
      1. Keep only background rows whose ``authorID`` is in `visible_authors`,
         stack them under the task-author rows and keep the first
         `max_num_authors` rows.
      2. Locate the LLM feature's quoted snippets in each author's text
         (first verbatim occurrence only).
      3. Locate Gram2Vec spans with ``find_feature_spans``.
      4. Render both span sets per author with ``create_html``.

    Args:
        selected_feature_llm: LLM feature name; falsy or the string "None"
            means no LLM feature is selected.
        selected_feature_g2v: Gram2Vec feature as 'Category:Human-Readable';
            falsy or "None" means no Gram2Vec feature is selected.
        llm_style_feats_analysis: Mapping whose ``'spans'`` entry is a dict;
            its values are indexed positionally per author and then by
            feature name. NOTE(review): this assumes the values are ordered
            like the displayed authors - confirm against the producer.
        background_authors_embeddings_df: DataFrame with ``authorID`` and
            ``fullText`` columns.
        task_authors_embeddings_df: DataFrame with the same columns.
        visible_authors: authorIDs of background authors to keep.
        predicted_author: Forwarded to ``create_html`` for labelling.
        ground_truth_author: Forwarded to ``create_html`` for labelling.
        max_num_authors: Maximum number of rows (task + background) rendered.

    Returns:
        ``(combined_html, background_html)``: HTML for the first four authors
        and for any remaining authors, each wrapped in a ``<div>``.
    """
    print(f"\n\n\n\n\nShowing combined spans for LLM feature '{selected_feature_llm}' and Gram2Vec feature '{selected_feature_g2v}'")
    print(f"predicted_author: {predicted_author}, ground_truth_author: {ground_truth_author}")
    print(f" keys = {background_authors_embeddings_df.keys()}")

    background_authors_embeddings_df = background_authors_embeddings_df[
        background_authors_embeddings_df.authorID.isin(visible_authors)
    ]
    background_and_task_authors = pd.concat([task_authors_embeddings_df, background_authors_embeddings_df])
    shown_authors = background_and_task_authors[:max_num_authors]

    # An author's text may be a list of documents; join them with a visible separator.
    authors_texts = [
        '\n\n =========== \n\n'.join(x) if isinstance(x, list) else x
        for x in shown_authors['fullText'].tolist()
    ]
    authors_names = shown_authors['authorID'].tolist()
    print(f"Number of authors to show: {len(authors_texts)}")
    print(f"Authors names: {authors_names}")
    texts = list(zip(authors_names, authors_texts))

    llm_selected = bool(selected_feature_llm) and selected_feature_llm != "None"
    g2v_selected = bool(selected_feature_g2v) and selected_feature_g2v != "None"

    if llm_selected:
        author_list = list(llm_style_feats_analysis['spans'].values())
        llm_spans_list = []
        for i, (_, txt) in enumerate(texts):
            try:
                quoted_snippets = author_list[i][selected_feature_llm]
            except (IndexError, KeyError):
                # No analysis entry for this author / feature: nothing to highlight.
                quoted_snippets = []
            author_spans_list = []
            for txt_span in quoted_snippets or []:
                if not txt_span:
                    continue
                start = txt.find(txt_span)
                if start < 0:
                    # Snippet is not present verbatim in the text; skip it rather
                    # than emit a bogus negative offset.
                    continue
                author_spans_list.append(Span(start, start + len(txt_span)))
            llm_spans_list.append(author_spans_list)
    else:
        print("Skipping LLM span extraction: feature is None")
        llm_spans_list = [[] for _ in texts]

    # Always bound: it is passed to create_html even when no Gram2Vec feature is selected.
    short = None
    if g2v_selected:
        gram_spans_list = []
        print(f"Selected Gram2Vec feature: {selected_feature_g2v}")
        short = get_shorthand(selected_feature_g2v)
        print(f"short hand: {short}")
        for role, txt in texts:
            spans = []
            if short is not None:
                try:
                    print(f"Finding spans for {short} {role}")
                    spans = find_feature_spans(txt, short)
                except Exception as exc:
                    # Best effort: one failing text must not break the whole view.
                    print(f"Error finding spans for {short} {role}: {exc}")
                    spans = []
            gram_spans_list.append(spans)
    else:
        print("Skipping Gram2Vec span extraction: feature is None")
        gram_spans_list = [[] for _ in texts]

    print(f" ----> Number of authors: {len(texts)}")

    # create_html recognises the literal string "None" as "nothing selected".
    llm_feature_label = selected_feature_llm if llm_selected else "None"
    g2v_feature_label = selected_feature_g2v if g2v_selected else "None"

    # The first four rows are the task authors (mystery + 3 candidates); the rest
    # are background authors. Span lists are sliced identically so that
    # create_html's per-row indices line up with the right author.
    num_task_authors = 4

    html_task_authors = create_html(
        texts[:num_task_authors],
        llm_spans_list[:num_task_authors],
        gram_spans_list[:num_task_authors],
        llm_feature_label,
        g2v_feature_label,
        short,
        background=False,
        predicted_author=predicted_author,
        ground_truth_author=ground_truth_author,
    )
    combined_html = "<div>" + "\n<hr>\n".join(html_task_authors) + "</div>"

    html_background_authors = create_html(
        texts[num_task_authors:],
        llm_spans_list[num_task_authors:],
        gram_spans_list[num_task_authors:],
        llm_feature_label,
        g2v_feature_label,
        short,
        background=True,
        predicted_author=predicted_author,
        ground_truth_author=ground_truth_author,
    )
    background_html = "<div>" + "\n<hr>\n".join(html_background_authors) + "</div>"
    return combined_html, background_html
|
|
|
def get_label(label: str, predicted_author=None, ground_truth_author=None, bg_id: int=0) -> str:
    """Return a human-readable display label for an author identifier.

    Args:
        label: Raw author identifier. Non-string values (e.g. numeric IDs)
            are converted with ``str`` first.
        predicted_author: Index of the predicted candidate, or None.
        ground_truth_author: Index of the true candidate, or None.
        bg_id: Zero-based position used to number background authors.

    Returns:
        - "Mystery Author" for labels starting with "Mystery" or "Q_author".
        - "Candidate N" for labels starting with "a0_author", "a1_author",
          "a2_author" or "Candidate", where N is the digit after "a" or the
          first number found in a "Candidate ..." label. When both
          `predicted_author` and `ground_truth_author` are given, a
          "(Predicted)", "(Ground Truth)" or "(Predicted & Ground Truth)"
          suffix is added if N matches.
        - The label unchanged if it starts with "Candidate" but contains no
          number.
        - "Background Author <bg_id + 1>" for anything else.
    """
    print(f"get_label called with label: {label}, predicted_author: {predicted_author}, ground_truth_author: {ground_truth_author}, bg_id: {bg_id}")

    # Author IDs coming from a DataFrame column are not guaranteed to be strings.
    label = str(label)

    if label.startswith(("Mystery", "Q_author")):
        return "Mystery Author"

    if label.startswith("Candidate"):
        number = re.search(r"\d+", label)
        if number is None:
            # Nothing to index on; the raw label is the best we can show.
            return label
        candidate_id = int(number.group())
    elif label.startswith(("a0_author", "a1_author", "a2_author")):
        # "a1_author..." -> "a1" -> 1
        candidate_id = int(label.split("_")[0][-1])
    else:
        return f"Background Author {bg_id+1}"

    base = f"Candidate {candidate_id}"
    if predicted_author is None or ground_truth_author is None:
        return base

    is_predicted = candidate_id == predicted_author
    is_ground_truth = candidate_id == ground_truth_author
    if is_predicted and is_ground_truth:
        return f"{base} (Predicted & Ground Truth)"
    if is_predicted:
        return f"{base} (Predicted)"
    if is_ground_truth:
        return f"{base} (Ground Truth)"
    return base
|
|
|
def _notice_box(message, is_error):
    """Return a small boxed notice; red when `is_error`, grey otherwise."""
    colours = (
        "background:#fee; border:1px solid #f00;"
        if is_error
        else "background:#eee; border:1px solid #aaa;"
    )
    return f"""
    <div style="padding:8px; {colours}">
      <em>{message}</em>
    </div>
    """


def create_html(texts, llm_spans_list, gram_spans_list, selected_feature_llm, selected_feature_g2v, short=None, background = False, predicted_author=None, ground_truth_author=None):
    """Build one HTML fragment per author: heading, notices, highlighted text.

    Args:
        texts: Sequence of ``(label, text)`` pairs.
        llm_spans_list: Per-author LLM spans, indexed like `texts`.
        gram_spans_list: Per-author Gram2Vec spans, indexed like `texts`.
        selected_feature_llm: LLM feature name; falsy or "None" means no
            feature is selected.
        selected_feature_g2v: Gram2Vec feature name; falsy or "None" means
            no feature is selected.
        short: Shorthand of the Gram2Vec feature; falsy means the feature
            could not be mapped.
        background: When True, authors are numbered by position via the
            ``bg_id`` argument of ``get_label``.
        predicted_author: Forwarded to ``get_label``.
        ground_truth_author: Forwarded to ``get_label``.

    Returns:
        List of HTML strings, one per entry of `texts`. Feature names and
        labels are HTML-escaped before being placed in the markup.
    """
    llm_unselected = not selected_feature_llm or selected_feature_llm == "None"
    g2v_unselected = not selected_feature_g2v or selected_feature_g2v == "None"
    # Feature names can contain arbitrary characters; escape them for display.
    llm_name = html.escape(str(selected_feature_llm))
    g2v_name = html.escape(str(selected_feature_g2v))

    fragments = []
    for i, (label, txt) in enumerate(texts):
        bg_id = i if background else 0
        label = get_label(label, predicted_author, ground_truth_author, bg_id)

        # Tolerate span lists shorter than `texts` by treating the gap as "no spans".
        llm_spans = llm_spans_list[i] if i < len(llm_spans_list) else []
        gram_spans = gram_spans_list[i] if i < len(gram_spans_list) else []
        combined = highlight_both_spans(txt, llm_spans, gram_spans)

        notice = ""
        if llm_unselected:
            notice += _notice_box("No LLM feature selected.", is_error=False)
        elif not llm_spans:
            notice += _notice_box(f'No spans found for LLM feature "{llm_name}".', is_error=True)

        if g2v_unselected:
            notice += _notice_box("No Gram2Vec feature selected.", is_error=False)
        elif not short:
            notice += _notice_box(f'Invalid or unmapped feature: "{g2v_name}".', is_error=True)
        elif not gram_spans:
            notice += _notice_box(f'No spans found for Gram2Vec feature "{g2v_name}".', is_error=True)

        fragments.append(f"""
        <h3>{html.escape(label)}</h3>
        {notice}
        <div style="border:1px solid #ccc; padding:8px; margin-bottom:1em;">
          {combined}
        </div>
        """)
    return fragments