tags at the correct offsets,
so that nested or overlapping highlights never stomp on each other.
"""
    # Inline CSS: mark.llm is highlighted in yellow, mark.gram in blue
    style = """
    <style>
    mark.llm { background-color: yellow; }
    mark.gram { background-color: lightblue; }
    </style>
    """
    # Turn each span into two "events": open and close
    events = []
    for s in llm_spans:
        events.append((s.start_char, 'open', 'llm'))
        events.append((s.end_char, 'close', 'llm'))
    for s in gram_spans:
        events.append((s.start_char, 'open', 'gram'))
        events.append((s.end_char, 'close', 'gram'))
    # Sort by position; at the same offset, close tags before opening new ones
    # so that adjacent highlights do not end up nested inside each other.
    events.sort(key=lambda e: (e[0], 0 if e[1] == 'close' else 1))
    out = []
    last_idx = 0
    for idx, typ, cls in events:
        # escape the raw text between the previous event and this one
        out.append(html.escape(text[last_idx:idx]))
        if typ == 'open':
            out.append(f'<mark class="{cls}">')
        else:
            out.append('</mark>')
        last_idx = idx
    out.append(html.escape(text[last_idx:]))
    highlighted = "".join(out)
    highlighted = highlighted.replace('\n', '<br>')
    return style + highlighted
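# A minimal usage sketch for highlight_both_spans (illustrative only: the text and
# character offsets below are made up, and Span is assumed to take start_char and
# end_char positionally, as in the calls elsewhere in this module).
def _demo_highlight_both_spans():
    demo_text = "The quick brown fox\njumps over the lazy dog."
    llm = [Span(4, 15)]    # "quick brown"
    gram = [Span(10, 19)]  # "brown fox" (overlaps the LLM span)
    return highlight_both_spans(demo_text, llm, gram)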
def show_combined_spans_all(selected_feature_llm, selected_feature_g2v, llm_style_feats_analysis,
                            background_authors_embeddings_df, task_authors_embeddings_df, visible_authors,
                            predicted_author=None, ground_truth_author=None, max_num_authors=4):
"""
For mystery + 3 candidates:
1. get llm spans via your existing cache+API
2. get gram2vec spans via find_feature_spans
3. merge and highlight both
"""
print(f"\n\n\n\n\nShowing combined spans for LLM feature '{selected_feature_llm}' and Gram2Vec feature '{selected_feature_g2v}'")
print(f"predicted_author: {predicted_author}, ground_truth_author: {ground_truth_author}")
print(f" keys = {background_authors_embeddings_df.keys()}")
# background_and_task_authors = pd.concat([task_authors_embeddings_df, background_authors_embeddings_df])
# background_and_task_authors = background_and_task_authors[background_and_task_authors.authorID.isin(visible_authors)]
#get the visible background authors
background_authors_embeddings_df = background_authors_embeddings_df[background_authors_embeddings_df.authorID.isin(visible_authors)]
background_and_task_authors = pd.concat([task_authors_embeddings_df, background_authors_embeddings_df])
authors_texts = ['\n\n =========== \n\n'.join(x) if type(x) == list else x for x in background_and_task_authors[:max_num_authors]['fullText'].tolist()]
authors_names = background_and_task_authors[:max_num_authors]['authorID'].tolist()
print(f"Number of authors to show: {len(authors_texts)}")
print(f"Authors names: {authors_names}")
texts = list(zip(authors_names, authors_texts))
    if selected_feature_llm and selected_feature_llm != "None":
        author_list = list(llm_style_feats_analysis['spans'].values())
        llm_spans_list = []
        for i, (_, txt) in enumerate(texts):
            author_spans_list = []
            for txt_span in author_list[i].get(selected_feature_llm, []):
                start = txt.find(txt_span)
                if start != -1:  # skip spans that no longer match the text verbatim
                    author_spans_list.append(Span(start, start + len(txt_span)))
            llm_spans_list.append(author_spans_list)
    else:
        print("Skipping LLM span extraction: feature is None")
        llm_spans_list = [[] for _ in texts]
    short = None  # shorthand is only resolved when a Gram2Vec feature is selected
    if selected_feature_g2v and selected_feature_g2v != "None":
        # get gram2vec spans
        gram_spans_list = []
        print(f"Selected Gram2Vec feature: {selected_feature_g2v}")
        short = get_shorthand(selected_feature_g2v)
        print(f"shorthand: {short}")
        for role, txt in texts:
            try:
                print(f"Finding spans for {short} {role}")
                spans = find_feature_spans(txt, short)
            except Exception as e:
                print(f"Error finding spans for {short} {role}: {e}")
                spans = []
            gram_spans_list.append(spans)
    else:
        print("Skipping Gram2Vec span extraction: feature is None")
        gram_spans_list = [[] for _ in texts]
    # build HTML blocks (span lists are aligned with texts, so slice them together)
    print(f" ----> Number of authors: {len(texts)}")
    html_task_authors = create_html(
        texts[:4],  # first 4 are the task authors (mystery + candidates)
        llm_spans_list[:4],
        gram_spans_list[:4],
        selected_feature_llm,
        selected_feature_g2v,
        short,
        background=False,
        predicted_author=predicted_author,
        ground_truth_author=ground_truth_author
    )
combined_html = "" + "\n
\n".join(html_task_authors) + ""
    html_background_authors = create_html(
        texts[4:],  # remaining authors are background
        llm_spans_list[4:],
        gram_spans_list[4:],
        selected_feature_llm,
        selected_feature_g2v,
        short,
        background=True,
        predicted_author=predicted_author,
        ground_truth_author=ground_truth_author
    )
background_html = "" + "\n
\n".join(html_background_authors) + ""
    return combined_html, background_html
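# Expected call shape (all argument values below are hypothetical); the two returned
# HTML strings are ready to be rendered by whatever UI layer hosts this module:
#   task_html, background_html = show_combined_spans_all(
#       "formal tone", "pos_unigrams:NOUN",
#       llm_style_feats_analysis, background_df, task_df,
#       visible_authors=["bg_author_03", "bg_author_07"],
#       predicted_author=1, ground_truth_author=0)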
def get_label(label: str, predicted_author=None, ground_truth_author=None, bg_id: int = 0) -> str:
    """
    Returns a human-readable label for the author.
    """
    print(f"get_label called with label: {label}, predicted_author: {predicted_author}, "
          f"ground_truth_author: {ground_truth_author}, bg_id: {bg_id}")
    if label.startswith("Mystery") or label.startswith("Q_author"):
        return "Mystery Author"
    elif label.startswith(("a0_author", "a1_author", "a2_author", "Candidate")):
        if label.startswith("Candidate"):
            candidate_id = int(label.split(" ")[2])  # the number after 'Candidate Author'
        else:
            candidate_id = int(label.split("_")[0][-1])  # last character of the first part (a0, a1, a2)
        if predicted_author is not None and ground_truth_author is not None:
            if candidate_id == predicted_author and candidate_id == ground_truth_author:
                return f"Candidate {candidate_id} (Predicted & Ground Truth)"
            elif candidate_id == predicted_author:
                return f"Candidate {candidate_id} (Predicted)"
            elif candidate_id == ground_truth_author:
                return f"Candidate {candidate_id} (Ground Truth)"
            else:
                return f"Candidate {candidate_id}"
        else:
            return f"Candidate {candidate_id}"
    else:
        return f"Background Author {bg_id + 1}"
def create_html(texts, llm_spans_list, gram_spans_list, selected_feature_llm,
                selected_feature_g2v, short=None, background=False,
                predicted_author=None, ground_truth_author=None):
    blocks = []
    for i, (label, txt) in enumerate(texts):
        label = (get_label(label, predicted_author, ground_truth_author, i)
                 if background else get_label(label, predicted_author, ground_truth_author))
        combined = highlight_both_spans(txt, llm_spans_list[i], gram_spans_list[i])
        notice = ""
        if not selected_feature_llm or selected_feature_llm == "None":
            notice += """
            <p style="color:gray;">No LLM feature selected.</p>
            """
        elif not llm_spans_list[i]:
            notice += f"""
            <p style="color:gray;">No spans found for LLM feature "{selected_feature_llm}".</p>
            """
        if not selected_feature_g2v or selected_feature_g2v == "None":
            notice += """
            <p style="color:gray;">No Gram2Vec feature selected.</p>
            """
        elif not short:
            notice += f"""
            <p style="color:gray;">Invalid or unmapped feature: "{selected_feature_g2v}".</p>
            """
        elif not gram_spans_list[i]:
            notice += f"""
            <p style="color:gray;">No spans found for Gram2Vec feature "{selected_feature_g2v}".</p>
            """
html.append(f"""
{label}
{notice}
{combined}
""")
return html
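# A hedged end-to-end sketch of create_html on toy inputs (everything here is
# hypothetical demo data; it bypasses show_combined_spans_all and the dataframes,
# and assumes Span takes start_char and end_char positionally).
def _demo_create_html():
    demo_texts = [
        ("Q_author_demo", "The quick brown fox\njumps over the lazy dog."),
        ("a0_author_demo", "A second, hypothetical candidate document."),
    ]
    demo_llm_spans = [[Span(4, 15)], []]    # "quick brown" in the first text
    demo_gram_spans = [[Span(10, 19)], []]  # "brown fox" overlaps the LLM span
    return create_html(demo_texts, demo_llm_spans, demo_gram_spans,
                       selected_feature_llm="demo LLM feature",
                       selected_feature_g2v="None",
                       short=None)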