import re import html from collections import namedtuple from gram2vec.feature_locator import find_feature_spans from functools import lru_cache from utils.llm_feat_utils import generate_feature_spans_cached import pandas as pd Span = namedtuple('Span', ['start_char', 'end_char']) from gram2vec import vectorizer # ── the FEATURE_HANDLERS & loader ──────────── FEATURE_HANDLERS = { "Part-of-Speech Unigram": "pos_unigrams", "Part-of-Speech Bigram": "pos_bigrams", "Function Word": "func_words", "Punctuation": "punctuation", "Letter": "letters", "Dependency Label": "dep_labels", "Morphology Tag": "morph_tags", "Sentence Type": "sentences", "Emoji": "emojis", "Number of Tokens": "num_tokens" } @lru_cache(maxsize=1) def load_code_map(txt_path: str = "utils/augmented_human_readable.txt") -> dict: code_map = {} with open(txt_path, "r", encoding="utf-8") as f: for line in f: line = line.strip() if not line: continue human, code = [p.strip() for p in line.split(":", 1)] code_map[human] = code return code_map def get_shorthand(feature_str: str) -> str: """ Expects 'Category:Human-Readable', returns e.g. 'pos_unigrams:ADJ' or None. """ try: category, human = [p.strip() for p in feature_str.split(":", 1)] # print(f"Category: {category}, Human: {human}") except ValueError: # print("Invalid format for feature string:", feature_str) return None if category not in FEATURE_HANDLERS: return None code = load_code_map().get(human) if code is None: # print(f"Warning: No code found for human-readable feature '{human}'") return None # fallback to the human-readable name return f"{FEATURE_HANDLERS[category]}:{code}" def get_fullform(shorthand: str) -> str: """ Expects 'prefix:code' (e.g., 'pos_unigrams:ADJ'), returns 'Category:Human-Readable' (e.g., 'Part-of-Speech Unigram:Adjective'), or None if invalid. """ try: prefix, code = shorthand.split(":", 1) except ValueError: return None # Reverse FEATURE_HANDLERS reverse_handlers = {v: k for k, v in FEATURE_HANDLERS.items()} category = reverse_handlers.get(prefix) if category is None: return None # Reverse code map code_map = load_code_map() reverse_code_map = {v: k for k, v in code_map.items()} human = reverse_code_map.get(code) if human is None: return None return f"{category}:{human}" def highlight_both_spans(text, llm_spans, gram_spans): """ Walk the original `text` once, injecting tags at the correct offsets, so that nested or overlapping highlights never stomp on each other. """ # Inline CSS : mark-llm is in yellow, mark-gram in blue style = """ """ # Turn each span into two “events”: open and close events = [] for s in llm_spans: events.append((s.start_char, 'open', 'llm')) events.append((s.end_char, 'close', 'llm')) for s in gram_spans: events.append((s.start_char, 'open', 'gram')) events.append((s.end_char, 'close', 'gram')) # Sort by position; events.sort(key=lambda e: (e[0], 0 if e[1]=='open' else 1)) out = [] last_idx = 0 for idx, typ, cls in events: # escape the slice between last index and this event out.append(html.escape(text[last_idx:idx])) if typ == 'open': out.append(f'') else: out.append('') last_idx = idx out.append(html.escape(text[last_idx:])) highlighted = "".join(out) highlighted = highlighted.replace('\n', '
') return style + highlighted def show_combined_spans_all(selected_feature_llm, selected_feature_g2v, llm_style_feats_analysis, background_authors_embeddings_df, task_authors_embeddings_df, visible_authors, predicted_author=None, ground_truth_author=None, max_num_authors=4): """ For mystery + 3 candidates: 1. get llm spans via your existing cache+API 2. get gram2vec spans via find_feature_spans 3. merge and highlight both """ print(f"\n\n\n\n\nShowing combined spans for LLM feature '{selected_feature_llm}' and Gram2Vec feature '{selected_feature_g2v}'") print(f"predicted_author: {predicted_author}, ground_truth_author: {ground_truth_author}") print(f" keys = {background_authors_embeddings_df.keys()}") # background_and_task_authors = pd.concat([task_authors_embeddings_df, background_authors_embeddings_df]) # background_and_task_authors = background_and_task_authors[background_and_task_authors.authorID.isin(visible_authors)] #get the visible background authors background_authors_embeddings_df = background_authors_embeddings_df[background_authors_embeddings_df.authorID.isin(visible_authors)] background_and_task_authors = pd.concat([task_authors_embeddings_df, background_authors_embeddings_df]) authors_texts = ['\n\n =========== \n\n'.join(x) if type(x) == list else x for x in background_and_task_authors[:max_num_authors]['fullText'].tolist()] authors_names = background_and_task_authors[:max_num_authors]['authorID'].tolist() print(f"Number of authors to show: {len(authors_texts)}") print(f"Authors names: {authors_names}") texts = list(zip(authors_names, authors_texts)) if selected_feature_llm and selected_feature_llm != "None": # print(llm_style_feats_analysis) author_list = list(llm_style_feats_analysis['spans'].values()) llm_spans_list = [] for i, (_, txt) in enumerate(texts): author_spans_list = [] for txt_span in author_list[i][selected_feature_llm]: author_spans_list.append(Span(txt.find(txt_span), txt.find(txt_span) + len(txt_span))) llm_spans_list.append(author_spans_list) else: print("Skipping LLM span extraction: feature is None") llm_spans_list = [[] for _ in texts] if selected_feature_g2v and selected_feature_g2v != "None": # get gram2vec spans gram_spans_list = [] print(f"Selected Gram2Vec feature: {selected_feature_g2v}") short = get_shorthand(selected_feature_g2v) print(f"short hand: {short}") for role, txt in texts: try: print(f"Finding spans for {short} {role}") spans = find_feature_spans(txt, short) # spans = [Span(fs.start_char, fs.end_char) for fs in raw_spans] except: print(f"Error finding spans for {short} {role}") spans = [] gram_spans_list.append(spans) else: print("Skipping Gram2Vec span extraction: feature is None") gram_spans_list = [[] for _ in texts] # build HTML blocks print(f" ----> Number of authors: {len(texts)}") html_task_authors = create_html( texts[:4], #first 4 are task llm_spans_list, gram_spans_list, selected_feature_llm, selected_feature_g2v, short, background = False, predicted_author=predicted_author, ground_truth_author=ground_truth_author ) combined_html = "
" + "\n
\n".join(html_task_authors) + "
" html_background_authors = create_html( texts[4:], #last three are background llm_spans_list, gram_spans_list, selected_feature_llm, selected_feature_g2v, short, background = True, predicted_author=predicted_author, ground_truth_author=ground_truth_author ) background_html = "
" + "\n
\n".join(html_background_authors) + "
" return combined_html, background_html def get_label(label: str, predicted_author=None, ground_truth_author=None, bg_id: int=0) -> str: """ Returns a human-readable label for the author. """ print(f"get_label called with label: {label}, predicted_author: {predicted_author}, ground_truth_author: {ground_truth_author}, bg_id: {bg_id}") if label.startswith("Mystery") or label.startswith("Q_author"): return "Mystery Author" elif label.startswith("a0_author") or label.startswith("a1_author") or label.startswith("a2_author") or label.startswith("Candidate"): if label.startswith("Candidate"): id = int(label.split(" ")[2]) # Get the number after 'Candidate Author' else: id = label.split("_")[0][-1] # Get the last character of the first part (a0, a1, a2) if predicted_author is not None and ground_truth_author is not None: if int(id) == predicted_author and int(id) == ground_truth_author: return f"Candidate {int(id)} (Predicted & Ground Truth)" elif int(id) == predicted_author: return f"Candidate {int(id)} (Predicted)" elif int(id) == ground_truth_author: return f"Candidate {int(id)} (Ground Truth)" else: return f"Candidate {int(id)}" else: return f"Candidate {int(id)}" else: return f"Background Author {bg_id+1}" def create_html(texts, llm_spans_list, gram_spans_list, selected_feature_llm, selected_feature_g2v, short=None, background = False, predicted_author=None, ground_truth_author=None): html = [] for i, (label, txt) in enumerate(texts): label = get_label(label, predicted_author, ground_truth_author, i) if background else get_label(label, predicted_author, ground_truth_author) combined = highlight_both_spans(txt, llm_spans_list[i], gram_spans_list[i]) notice = "" if selected_feature_llm == "None": notice += f"""
No LLM feature selected.
""" elif not llm_spans_list[i]: notice += f"""
No spans found for LLM feature "{selected_feature_llm}".
""" if selected_feature_g2v == "None": notice += f"""
No Gram2Vec feature selected.
""" elif not short: notice += f"""
Invalid or unmapped feature: "{selected_feature_g2v}".
""" elif not gram_spans_list[i]: notice += f"""
No spans found for Gram2Vec feature "{selected_feature_g2v}".
""" html.append(f"""

{label}

{notice}
{combined}
""") return html