Milad Alshomary committed on
Commit
3d73c8d
Β·
1 Parent(s): b96061f
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

# gradio==5.30 (pinned in requirements.txt) requires Python >= 3.10,
# so a python:3.9 base image would fail during `pip install`.
FROM python:3.10

# Run as a non-root user (HF Spaces convention: uid 1000).
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy requirements first so dependency install is cached across code changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
CMD ["python", "app.py"]
add_hf_env_to_hf_space.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
"""One-off helper: copy the local OPENAI_API_KEY into the HF Space's variables."""
from huggingface_hub import HfApi
import os

repo_id = "miladalsh/explaining_authorship_attribution_models"
api = HfApi()

# Fail early with a clear message instead of a bare KeyError when the
# variable is missing from the local environment.
openai_key = os.environ.get("OPENAI_API_KEY")
if not openai_key:
    raise RuntimeError("OPENAI_API_KEY is not set in the local environment.")

api.add_space_variable(repo_id=repo_id, key="OPENAI_API_KEY", value=openai_key)
app.py ADDED
@@ -0,0 +1,516 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import json

import os

# Gradio stages uploaded/temporary files here; create the directory up front
# so the first upload cannot fail on a missing path.
os.environ["GRADIO_TEMP_DIR"] = "./datasets/temp"  # Set a custom temp directory for Gradio
os.makedirs(os.environ["GRADIO_TEMP_DIR"], exist_ok=True)

import yaml
import argparse
import urllib.request
from tqdm import tqdm

from dotenv import load_dotenv
from openai import OpenAI
from utils.file_download import download_file_override
20
def load_config(path="config/config.yaml"):
    """Load the YAML configuration file.

    Args:
        path: Path to the YAML config file (defaults to config/config.yaml).

    Returns:
        The parsed configuration as a dict (None if the file is empty).
    """
    # Explicit encoding avoids platform-dependent defaults (e.g. cp1252 on Windows).
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
23
+
24
# Load the config once at import time; every helper below reads from it.
cfg = load_config()


# Fetch the data artifacts before importing utils.* — presumably the utils
# modules read these files at import time (TODO confirm), so order matters.
download_file_override(cfg.get('interp_space_url'), cfg.get('interp_space_path'))
download_file_override(cfg.get('instances_to_explain_url'), cfg.get('instances_to_explain_path'))
download_file_override(cfg.get('gram2vec_feats_url'), cfg.get('gram2vec_feats_path'))

# NOTE(review): star imports make it hard to tell where names like
# get_instances / styled_block / toggle_task come from — consider explicit imports.
from utils.visualizations import *
from utils.llm_feat_utils import *
from utils.gram2vec_feat_utils import *
from utils.interp_space_utils import *
from utils.ui import *

# Read OPENAI_API_KEY from a local .env (if present) and build the client.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# ── load once at startup ────────────────────────────────────────
GRAM2VEC_SHORTHAND = load_code_map()
43
+
44
def validate_ground_truth(gt1, gt2, gt3):
    """Check that exactly one candidate is flagged as the ground-truth author.

    Args:
        gt1, gt2, gt3: Checkbox states for candidates 1-3.

    Returns:
        (index, message): the 0-based index of the chosen candidate and a
        status message, or (None, message) when zero or several are chosen.
    """
    chosen = [i for i, flag in enumerate((gt1, gt2, gt3)) if flag]

    # Guard clauses: reject ambiguous or empty selections.
    if len(chosen) > 1:
        return None, "Please select only one ground truth author."
    if not chosen:
        return None, "No ground truth author selected."

    idx = chosen[0]
    return idx, f"Candidate {idx+1} is marked as the ground truth author."
55
+
56
+
57
def app(share=False, use_cluster_feats=False):
    """Build and launch the Gradio UI of the AA explainability tool.

    Args:
        share: Forwarded to ``demo.launch`` (creates a public link when True).
        use_cluster_feats: When True, style features are chosen per
            precomputed cluster via a dropdown; otherwise they are derived
            from the plot region the user zooms into.
    """
    instances, instance_ids = get_instances(cfg['instances_to_explain_path'])

    interp = load_interp_space(cfg)
    # Cap the background corpus to keep embedding/plotting latency manageable.
    clustered_authors_df = interp['clustered_authors_df'][:1000]
    clustered_authors_df['fullText'] = clustered_authors_df['fullText'].map(lambda l: l[:3])  # Take at most 3 texts per author

    with gr.Blocks(title="Author Attribution Explainability Tool") as demo:
        # ── Big Centered Title ──────────────────────────────────────────
        gr.HTML(styled_block("""
            <h1 style="
                text-align:center;
                font-size:3em;    /* About 48px */
                margin-bottom:0.3em;
                font-weight:700;
            ">
                Author Attribution (AA) Explainability Tool
            </h1>
        """))

        gr.HTML(styled_block("""
            <div style="
                text-align:center;
                margin: 1em auto 2em auto;
                max-width:900px;
            ">
                <p style="font-size:1.3em; line-height:1.4;">
                    This demo helps you <strong>see inside</strong> a deep AA model's latent style space.
                </p>
                <p style="font-size:0.9em; line-height:1.4;">
                    Currently you are inspecting <a href="https://huggingface.co/rrivera1849/LUAR-MUD">LUAR</a> with pre-defined AA tasks from the <a href="https://www.iarpa.gov/images/research-programs/HIATUS/IARPA_HIATUS_Phase_1_HRS_Data.to_DAC_20240610.pdf">HRS dataset </a>
                </p>
                <div style="
                    display:flex;
                    justify-content:center;
                    gap:3em;
                    margin-top:1em;
                ">
                    <!-- Visualize -->
                    <div style="max-width:200px;">
                        <div style="font-size:2em;">🔍</div>
                        <h4 style="margin:0.2em 0;">Visualize</h4>
                        <p style="margin:0; font-size:1em; line-height:1.3;">
                            Place your AA task with respect to other background authors.
                        </p>
                    </div>
                    <!-- GENERATE -->
                    <div style="max-width:200px;">
                        <div style="font-size:2em;">✏️</div>
                        <h4 style="margin:0.2em 0;">Generate</h4>
                        <p style="margin:0; font-size:1em; line-height:1.3;">
                            Describe your investigated authors' writing style via human-readable LLM-generated style features.
                        </p>
                    </div>
                    <!-- COMPARE -->
                    <div style="max-width:200px;">
                        <div style="font-size:2em;">⚖️</div>
                        <h4 style="margin:0.2em 0;">Compare</h4>
                        <p style="margin:0; font-size:1em; line-height:1.3;">
                            Contrast with <a href="https://github.com/eric-sclafani/gram2vec">Gram2Vec</a> stylometric features.
                        </p>
                    </div>
                </div>
            </div>
        """))

        # ── Step-by-Step Guided Panel ──
        with gr.Accordion("📝 How to Use", open=True):
            gr.Markdown("""
                1. **Select** a model and a task source (pre-defined or custom)
                2. Click **Load Task & Generate Embeddings** to load the task and generate embeddings
                3. **Run Visualization** to see the mystery author and candidates in the AA model's latent space
                4. **Zoom** into the visualization to select a cluster of background authors
                5. Pick an **LLM feature** to highlight in yellow
                6. Pick a **Gram2Vec feature** to highlight in blue
                7. Click **Show Combined Spans** to compare side-by-side
                """
            )

        # ── Model Selection ─────────────────────────────────
        model_radio = gr.Radio(
            choices=[
                'gabrielloiseau/LUAR-MUD-sentence-transformers',
                'gabrielloiseau/LUAR-CRUD-sentence-transformers',
                'miladalsh/light-luar',
                'AnnaWegmann/Style-Embedding',
                'Other'
            ],
            value='gabrielloiseau/LUAR-MUD-sentence-transformers',
            label='Choose a Model to inspect'
        )
        print(f"Model choices: {model_radio.choices}")
        print(f"Model default: {model_radio.value}")
        custom_model = gr.Textbox(
            label='Custom Model ID',
            placeholder='Enter your Hugging Face Model ID here',
            visible=False,
            interactive=True
        )
        # Show the textbox when 'Other' is selected
        model_radio.change(
            fn=toggle_custom_model,
            inputs=[model_radio],
            outputs=[custom_model]
        )

        # ── Task Source Selection ─────────────────────────────────
        task_mode = gr.Radio(
            choices=["Predefined HRS Task", "Upload Your Own Task"],
            value="Predefined HRS Task",
            label="Select Task Source"
        )

        ground_truth_author = gr.State()  # To store the index of the ground truth author

        with gr.Column():
            with gr.Column(visible=True) as predefined_container:
                gr.HTML("""
                    <div style="
                        font-size: 1.3em;
                        font-weight: 600;
                        margin-bottom: 0.5em;
                    ">
                        Pick a pre-defined task to investigate (a mystery text and its three candidate authors)
                    </div>
                """)
                task_dropdown = gr.Dropdown(
                    choices=[f"Task {i}" for i in instance_ids],
                    value=f"Task {instance_ids[0]}",
                    label="Choose which mystery document to explain",
                )
            with gr.Column(visible=False) as custom_container:
                gr.HTML("""
                    <div style="
                        font-size: 1.3em;
                        font-weight: 600;
                        margin-bottom: 0.5em;
                    ">
                        Upload your own task
                    </div>
                """)
                mystery_input = gr.File(label="Mystery (.txt)", file_types=['.txt'])
                with gr.Row():
                    candidate1 = gr.File(label="Candidate 1 (.txt)", file_types=['.txt'])
                    gt1_checkbox = gr.Checkbox(label="Ground Truth?", value=False)

                with gr.Row():
                    candidate2 = gr.File(label="Candidate 2 (.txt)", file_types=['.txt'])
                    gt2_checkbox = gr.Checkbox(label="Ground Truth?", value=False)

                with gr.Row():
                    candidate3 = gr.File(label="Candidate 3 (.txt)", file_types=['.txt'])
                    gt3_checkbox = gr.Checkbox(label="Ground Truth?", value=False)

                validation_msg = gr.Textbox(label="Validation Result", interactive=False)

                # Any checkbox toggle re-validates that exactly one GT author is picked.
                for checkbox in [gt1_checkbox, gt2_checkbox, gt3_checkbox]:
                    checkbox.change(
                        fn=validate_ground_truth,
                        inputs=[gt1_checkbox, gt2_checkbox, gt3_checkbox],
                        outputs=[ground_truth_author, validation_msg]
                    )

        # ── Load Task Button ─────────────────────────────────────
        gr.HTML(instruction_callout("Click the button below to load the tasks and generate embeddings using selected model."))
        load_button = gr.Button("Load Task & Generate Embeddings")

        # ── HTML outputs for author texts ───────────────────────────
        default_outputs = load_instance(0, instances)
        # dont need defaults since they are loaded only on click of the load button
        header = gr.HTML()
        mystery = gr.HTML()
        mystery_state = gr.State()  # Store unformatted mystery text for later use
        with gr.Row():
            c0 = gr.HTML()
            c1 = gr.HTML()
            c2 = gr.HTML()
        c0_state = gr.State()  # Store unformatted candidate 1 text for later use
        c1_state = gr.State()  # Store unformatted candidate 2 text for later use
        c2_state = gr.State()  # Store unformatted candidate 3 text for later use
        # ── State to hold embeddings DataFrame ─────────────────────
        task_authors_embeddings_df = gr.State()  # Store embeddings of task authors
        background_authors_embeddings_df = gr.State()  # Store background authors DataFrame
        task_mode.change(
            fn=toggle_task,
            inputs=[task_mode],
            outputs=[predefined_container, custom_container]
        )
        # ── Wire call to load task and generate embeddings once load button is clicked ───────────────────
        predicted_author = gr.State()  # Store predicted author from the embeddings
        load_button.click(
            # Disable the button while work is in flight, run, then re-enable.
            fn=lambda: gr.update(value="⏳ Loading... Please wait", interactive=False),
            inputs=[],
            outputs=[load_button]
        ).then(
            fn=lambda mode, dropdown, mystery, c1, c2, c3, ground_truth_author, model_radio, custom_model_input:
                update_task_display(
                    mode,
                    dropdown,
                    instances,  # closed over
                    clustered_authors_df,
                    mystery,
                    c1,
                    c2,
                    c3,
                    ground_truth_author,  # true_author placeholder
                    model_radio,
                    custom_model_input
                ),
            inputs=[task_mode, task_dropdown, mystery_input, candidate1, candidate2, candidate3, ground_truth_author, model_radio, custom_model],
            outputs=[header, mystery, c0, c1, c2, mystery_state, c0_state, c1_state, c2_state, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author]  # embeddings_df is a placeholder for now
        ).then(
            fn=lambda: gr.update(value="Load Task & Generate Embeddings", interactive=True),
            inputs=[],
            outputs=[load_button]
        )

        # ── Visualization for features ─────────────────────────────
        gr.HTML(instruction_callout("Run visualization to see which author is similar to the mystery document."))
        run_btn = gr.Button("Run visualization")
        bg_proj_state = gr.State()
        bg_lbls_state = gr.State()
        bg_authors_df = gr.State()  # Holds the background authors DataFrame
        with gr.Row():
            with gr.Column(scale=3):
                # Hidden textbox: the JS zoom listener below writes axis ranges here,
                # which triggers the axis_ranges.change handler further down.
                axis_ranges = gr.Textbox(visible=False, elem_id="axis-ranges")
                plot = gr.Plot(
                    label="Visualization",
                    elem_id="feature-plot",
                )
                plot.change(
                    fn=None,
                    inputs=[plot],
                    outputs=[axis_ranges],
                    js="""
                    function(){
                        console.log("------------>[JS] plot.change() triggered<------------");

                        let attempts = 0;
                        const maxAttempts = 50;

                        const tryAttach = () => {
                            const gd = document.querySelector('#feature-plot .js-plotly-plot');
                            if (!gd) {
                                if (++attempts < maxAttempts) {
                                    requestAnimationFrame(tryAttach);
                                } else {
                                    console.error(" ------------>Could not find .js-plotly-plot after multiple attempts.<------------");
                                }
                                return;
                            }

                            if (gd.__zoomListenerAttached) {
                                console.log("------------>Zoom listener already attached.<------------");
                                return;
                            }

                            gd.__zoomListenerAttached = true;
                            console.log("------------>Zoom listener attached!<------------");

                            gd.on('plotly_relayout', (ev) => {
                                if (
                                    ev['xaxis.range[0]'] === undefined ||
                                    ev['xaxis.range[1]'] === undefined ||
                                    ev['yaxis.range[0]'] === undefined ||
                                    ev['yaxis.range[1]'] === undefined
                                ) return;

                                const payload = {
                                    xaxis: [ev['xaxis.range[0]'], ev['xaxis.range[1]']],
                                    yaxis: [ev['yaxis.range[0]'], ev['yaxis.range[1]']]
                                };

                                const txtbox = document.querySelector('#axis-ranges textarea');
                                if (txtbox) {
                                    txtbox.value = JSON.stringify(payload);
                                    txtbox.dispatchEvent(new Event('input', { bubbles: true }));
                                    console.log("------------> Zoom payload dispatched:<------------", payload);
                                } else {
                                    console.warn("------------> No hidden textbox found to write zoom payload.<------------");
                                }
                            });
                        };

                        requestAnimationFrame(tryAttach);
                        return '';
                    }
                    """
                )

            with gr.Column(scale=1):
                expl_html = """
                    <h4>What am I looking at?</h4>
                    <p>
                        This plot shows the mystery author (★) and three candidate authors (◆)
                        in the AA model's latent space.<br>
                        The grey ● symbols represent the background corpus—real authors with diverse writing styles.
                        You can zoom in on any region of the plot. The system will analyze the visible authors
                        in that area and list the most representative writing style features for the zoomed-in region.<br>
                        Use this to compare your mystery text's position against nearby writing styles and
                        investigate which features distinguish it from others.
                    </p>
                """
                gr.HTML(styled_html(expl_html))

        cluster_dropdown = gr.Dropdown(choices=[], label="Select Cluster to Inspect", visible=False)
        style_map_state = gr.State()
        llm_style_feats_analysis = gr.State()
        visible_zoomed_authors = gr.State()

        if use_cluster_feats:
            # ── Dynamic Cluster Choice dropdown ──────────────────────────────────
            gr.HTML(instruction_callout("Choose a cluster from the dropdown below to inspect whether its features appear in the mystery author's text."))
            cluster_dropdown.visible = True
        else:
            gr.HTML(instruction_callout("Zoom in on the plot to select a set of background authors and see the presence of the top features from this set in candidate and mystery authors."))

        with gr.Row():
            # ── LLM Features Column ──────────────────────────────────
            with gr.Column(scale=1, min_width=0):
                # gr.Markdown("**Features from the cluster closest to the Mystery Author**")
                gr.HTML("""
                    <div style="
                        font-size: 1.3em;
                        font-weight: 600;
                        margin-bottom: 0.5em;
                    ">
                        LLM-derived style features prominent in the zoomed-in region
                    </div>
                """)
                features_rb = gr.Radio(choices=[], label="LLM-derived style features for this zoomed-in region")#, label="Features from the cluster closest to the Mystery Author", info="LLM-derived style features for this cluster")
                feature_list_state = gr.State()

            # ── Gram2Vec Features Column ─────────────────────────────
            with gr.Column(scale=1, min_width=0):
                # gr.Markdown("**Top-10 Gram2Vec Features most likely to occur in Mystery Author**")
                gr.HTML("""
                    <div style="
                        font-size: 1.3em;
                        font-weight: 600;
                        margin-bottom: 0.5em;
                    ">
                        Gram2Vec Features prominent in the zoomed-in region
                    </div>
                """)
                gram2vec_rb = gr.Radio(choices=[], label="Gram2Vec features for this zoomed-in region")#, label="Top-10 Gram2Vec Features most likely to occur in Mystery Author", info="Most prominent Gram2vec features in the mystery text")
                gram2vec_state = gr.State()

        # ── Visualization button click ───────────────────────────────
        run_btn.click(
            fn=lambda iid, model_radio, custom_model_input, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author: visualize_clusters_plotly(
                int(iid.replace('Task ','')), cfg, instances, model_radio,
                custom_model_input, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author
            ),
            inputs=[task_dropdown, model_radio, custom_model, task_authors_embeddings_df, background_authors_embeddings_df, predicted_author, ground_truth_author],
            outputs=[plot, style_map_state, bg_proj_state, bg_lbls_state, bg_authors_df]
        )

        # Populate feature list based on selection.
        if use_cluster_feats:
            # Use cluster-based flow
            cluster_dropdown.change(
                fn=on_cluster_change,
                inputs=[cluster_dropdown, style_map_state],
                outputs=[features_rb, gram2vec_rb , feature_list_state]
                # adding feature_list_state to persisit all llm features in the app state
            )
        else:
            axis_ranges.change(
                fn=handle_zoom_with_retries,
                inputs=[axis_ranges, bg_proj_state, bg_lbls_state, bg_authors_df, task_authors_embeddings_df],
                outputs=[features_rb, gram2vec_rb , llm_style_feats_analysis, feature_list_state, visible_zoomed_authors]
            )

        # ── Show combined feature-span highlights ──
        # combined callout + legend in one HTML block
        gr.HTML(
            instruction_callout(
                "Click \"Show Combined Spans\" to highlight the LLM (yellow) & Gram2Vec (blue) feature spans in the texts"
            )
            + """
            <div style="
                display: flex;
                align-items: center;
                justify-content: center;
                gap: 2em;
                margin-top: 0.5em;
                font-size: 0.9em;
            ">
                <div style="display: flex; align-items: center; gap: 0.5em; font-weight: 600; font-size: 1.5em;">
                    <span style="
                        display: inline-block;
                        width: 1.5em; height: 1.5em;
                        background: #FFEB3B;  /* bright yellow */
                        border: 1px solid #666;
                        vertical-align: middle;
                    "></span>
                    LLM feature
                </div>
                <div style="display: flex; align-items: center; gap: 0.5em; font-weight: 600; font-size: 1.5em;">
                    <span style="
                        display: inline-block;
                        width: 1.5em; height: 1.5em;
                        background: #5CB3FF;  /* clearer blue */
                        border: 1px solid #666;
                        vertical-align: middle;
                    "></span>
                    Gram2Vec feature
                </div>
            </div>
            """
        )

        combined_btn = gr.Button("Show Combined Spans")
        combined_html = gr.HTML()
        show_background_checkbox = gr.Checkbox(label="Show spans in background authors", value=False)
        background_html = gr.HTML(visible=False)
        # print(f"in app: all_feats={feature_list_state.value}")
        # print(f"in app: sel_feat_llm={features_rb.value}")

        combined_btn.click(
            fn=show_combined_spans_all,
            inputs=[features_rb,
                    gram2vec_rb,
                    llm_style_feats_analysis,
                    background_authors_embeddings_df,
                    task_authors_embeddings_df,
                    visible_zoomed_authors,
                    predicted_author,
                    ground_truth_author],
            outputs=[combined_html, background_html]
        )
        # mapping -->
        # iid = task_dropdown.value
        # sel_feat_llm = features_rb.value
        # all_feats = feature_list_state.value
        # sel_feat_g2v = gram2vec_rb.value
        # combined_html -> spans/html for task authors
        # background_html -> spans/html for background authors

        show_background_checkbox.change(
            fn=lambda show: gr.update(visible=show),
            inputs=[show_background_checkbox],
            outputs=[background_html]
        )

    demo.launch(share=share)
511
+
512
if __name__ == "__main__":
    # CLI entry point: optionally switch to the cluster-based feature flow.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--use_cluster_feats", action="store_true", help="Use cluster-based selection for features")
    cli_args = arg_parser.parse_args()
    app(share=True, use_cluster_feats=cli_args.use_cluster_feats)
config/config.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
# config.yaml
# Each *_path entry is a local cache location; its *_url sibling is the
# Hugging Face dataset URL it is downloaded from at startup (see app.py).
instances_to_explain_path: "./datasets/hrs_explanations.json"
instances_to_explain_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/raw/main/hrs_explanations.json?download"
interp_space_path: "./datasets/luar_interp_space_cluster_19/"
interp_space_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/luar_interp_space_cluster.zip?download=true"
gram2vec_feats_path: "./datasets/gram2vec_feats.csv"
gram2vec_feats_url: "https://huggingface.co/datasets/miladalsh/explanation_tool_files/resolve/main/gram2vec_feats.csv?download=true"

# Column holding the style-feature weights, and how many features to surface.
style_feat_clm: "llm_tfidf_weights"
top_k: 10
# Feature-source toggles — presumably restrict the display to one feature
# family when set; verify against the utils that read them.
only_llm_feats: false
only_gram2vec_feats: false
datasets/placeholder.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ test
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
nltk
spacy
scikit-learn
openai
python-dotenv
# NOTE(review): gradio 5.x requires Python >= 3.10 — keep the Docker base image in sync.
gradio==5.30
pyyaml
plotly
sentence_transformers
# Fork of gram2vec used for the stylometric feature comparison
git+https://github.com/MiladAlshomary/gram2vec
utils/augmented_human_readable.txt ADDED
@@ -0,0 +1,617 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Adjective:ADJ
2
+ Adposition:ADP
3
+ Adverb:ADV
4
+ Auxiliary verb:AUX
5
+ Coordinating conjunction:CCONJ
6
+ Determiner:DET
7
+ Interjection:INTJ
8
+ Noun:NOUN
9
+ Numeral:NUM
10
+ Particle:PART
11
+ Pronoun:PRON
12
+ Proper noun:PROPN
13
+ Punctuation:PUNCT
14
+ Subordinating conjunction:SCONJ
15
+ Symbol:SYM
16
+ Verb:VERB
17
+ Other:X
18
+ Space:SPACE
19
+ Other (foreign words, typos, abbreviations):X
20
+ ADP: Adposition (preposition or postposition)
21
+
22
+ Perfect aspect:Aspect=Perf
23
+ Progressive aspect:Aspect=Prog
24
+ Accusative case:Case=Acc
25
+ Nominative case:Case=Nom
26
+ Definite article:Definite=Def
27
+ Indefinite article:Definite=Ind
28
+ Comparative degree:Degree=Cmp
29
+ Positive degree:Degree=Pos
30
+ Superlative degree:Degree=Sup
31
+ Feminine gender:Gender=Fem
32
+ Masculine gender:Gender=Masc
33
+ Indicative mood:Mood=Ind
34
+ Plural number:Number=Plur
35
+ Singular number:Number=Sing
36
+ First person:Person=1
37
+ Second person:Person=2
38
+ Third person:Person=3
39
+ Past tense:Tense=Past
40
+ Present tense:Tense=Pres
41
+ Finite verb form:VerbForm=Fin
42
+ Infinitive verb form:VerbForm=Inf
43
+
44
+ Adjective followed by Adjective:ADJ ADJ
45
+ Adjective followed by Adposition:ADJ ADP
46
+ Adjective followed by Adverb:ADJ ADV
47
+ Adjective followed by Auxiliary verb:ADJ AUX
48
+ Adjective followed by Coordinating conjunction:ADJ CCONJ
49
+ Adjective followed by Determiner:ADJ DET
50
+ Adjective followed by Interjection:ADJ INTJ
51
+ Adjective followed by Noun:ADJ NOUN
52
+ Adjective followed by Numeral:ADJ NUM
53
+ Adjective followed by Other:ADJ X
54
+ Adjective followed by Particle:ADJ PART
55
+ Adjective followed by Pronoun:ADJ PRON
56
+ Adjective followed by Proper noun:ADJ PROPN
57
+ Adjective followed by Punctuation:ADJ PUNCT
58
+ Adjective followed by Subordinating conjunction:ADJ SCONJ
59
+ Adjective followed by Symbol:ADJ SYM
60
+ Adjective followed by Verb:ADJ VERB
61
+ Adposition followed by Adjective:ADP ADJ
62
+ Adposition followed by Adposition:ADP ADP
63
+ Adposition followed by Adverb:ADP ADV
64
+ Adposition followed by Auxiliary verb:ADP AUX
65
+ Adposition followed by Coordinating conjunction:ADP CCONJ
66
+ Adposition followed by Determiner:ADP DET
67
+ Adposition followed by Interjection:ADP INTJ
68
+ Adposition followed by Noun:ADP NOUN
69
+ Adposition followed by Numeral:ADP NUM
70
+ Adposition followed by Other:ADP X
71
+ Adposition followed by Particle:ADP PART
72
+ Adposition followed by Pronoun:ADP PRON
73
+ Adposition followed by Proper noun:ADP PROPN
74
+ Adposition followed by Punctuation:ADP PUNCT
75
+ Adposition followed by Subordinating conjunction:ADP SCONJ
76
+ Adposition followed by Symbol:ADP SYM
77
+ Adposition followed by Verb:ADP VERB
78
+ Adverb followed by Adjective:ADV ADJ
79
+ Adverb followed by Adposition:ADV ADP
80
+ Adverb followed by Adverb:ADV ADV
81
+ Adverb followed by Auxiliary verb:ADV AUX
82
+ Adverb followed by Coordinating conjunction:ADV CCONJ
83
+ Adverb followed by Determiner:ADV DET
84
+ Adverb followed by Interjection:ADV INTJ
85
+ Adverb followed by Noun:ADV NOUN
86
+ Adverb followed by Numeral:ADV NUM
87
+ Adverb followed by Other:ADV X
88
+ Adverb followed by Particle:ADV PART
89
+ Adverb followed by Pronoun:ADV PRON
90
+ Adverb followed by Proper noun:ADV PROPN
91
+ Adverb followed by Punctuation:ADV PUNCT
92
+ Adverb followed by Subordinating conjunction:ADV SCONJ
93
+ Adverb followed by Symbol:ADV SYM
94
+ Adverb followed by Verb:ADV VERB
95
+ Auxiliary verb followed by Adjective:AUX ADJ
96
+ Auxiliary verb followed by Adposition:AUX ADP
97
+ Auxiliary verb followed by Adverb:AUX ADV
98
+ Auxiliary verb followed by Auxiliary verb:AUX AUX
99
+ Auxiliary verb followed by Coordinating conjunction:AUX CCONJ
100
+ Auxiliary verb followed by Determiner:AUX DET
101
+ Auxiliary verb followed by Interjection:AUX INTJ
102
+ Auxiliary verb followed by Noun:AUX NOUN
103
+ Auxiliary verb followed by Numeral:AUX NUM
104
+ Auxiliary verb followed by Other:AUX X
105
+ Auxiliary verb followed by Particle:AUX PART
106
+ Auxiliary verb followed by Pronoun:AUX PRON
107
+ Auxiliary verb followed by Proper noun:AUX PROPN
108
+ Auxiliary verb followed by Punctuation:AUX PUNCT
109
+ Auxiliary verb followed by Subordinating conjunction:AUX SCONJ
110
+ Auxiliary verb followed by Symbol:AUX SYM
111
+ Auxiliary verb followed by Verb:AUX VERB
112
+ Coordinating conjunction followed by Adjective:CCONJ ADJ
113
+ Coordinating conjunction followed by Adposition:CCONJ ADP
114
+ Coordinating conjunction followed by Adverb:CCONJ ADV
115
+ Coordinating conjunction followed by Auxiliary verb:CCONJ AUX
116
+ Coordinating conjunction followed by Coordinating conjunction:CCONJ CCONJ
117
+ Coordinating conjunction followed by Determiner:CCONJ DET
118
+ Coordinating conjunction followed by Interjection:CCONJ INTJ
119
+ Coordinating conjunction followed by Noun:CCONJ NOUN
120
+ Coordinating conjunction followed by Numeral:CCONJ NUM
121
+ Coordinating conjunction followed by Other:CCONJ X
122
+ Coordinating conjunction followed by Particle:CCONJ PART
123
+ Coordinating conjunction followed by Pronoun:CCONJ PRON
124
+ Coordinating conjunction followed by Proper noun:CCONJ PROPN
125
+ Coordinating conjunction followed by Punctuation:CCONJ PUNCT
126
+ Coordinating conjunction followed by Subordinating conjunction:CCONJ SCONJ
127
+ Coordinating conjunction followed by Symbol:CCONJ SYM
128
+ Coordinating conjunction followed by Verb:CCONJ VERB
129
+ Determiner followed by Adjective:DET ADJ
130
+ Determiner followed by Adposition:DET ADP
131
+ Determiner followed by Adverb:DET ADV
132
+ Determiner followed by Auxiliary verb:DET AUX
133
+ Determiner followed by Coordinating conjunction:DET CCONJ
134
+ Determiner followed by Determiner:DET DET
135
+ Determiner followed by Interjection:DET INTJ
136
+ Determiner followed by Noun:DET NOUN
137
+ Determiner followed by Numeral:DET NUM
138
+ Determiner followed by Other:DET X
139
+ Determiner followed by Particle:DET PART
140
+ Determiner followed by Pronoun:DET PRON
141
+ Determiner followed by Proper noun:DET PROPN
142
+ Determiner followed by Punctuation:DET PUNCT
143
+ Determiner followed by Subordinating conjunction:DET SCONJ
144
+ Determiner followed by Symbol:DET SYM
145
+ Determiner followed by Verb:DET VERB
146
+ Interjection followed by Adjective:INTJ ADJ
147
+ Interjection followed by Adposition:INTJ ADP
148
+ Interjection followed by Adverb:INTJ ADV
149
+ Interjection followed by Auxiliary verb:INTJ AUX
150
+ Interjection followed by Coordinating conjunction:INTJ CCONJ
151
+ Interjection followed by Determiner:INTJ DET
152
+ Interjection followed by Interjection:INTJ INTJ
153
+ Interjection followed by Noun:INTJ NOUN
154
+ Interjection followed by Numeral:INTJ NUM
155
+ Interjection followed by Other:INTJ X
156
+ Interjection followed by Particle:INTJ PART
157
+ Interjection followed by Pronoun:INTJ PRON
158
+ Interjection followed by Proper noun:INTJ PROPN
159
+ Interjection followed by Punctuation:INTJ PUNCT
160
+ Interjection followed by Subordinating conjunction:INTJ SCONJ
161
+ Interjection followed by Symbol:INTJ SYM
162
+ Interjection followed by Verb:INTJ VERB
163
+ Noun followed by Adjective:NOUN ADJ
164
+ Noun followed by Adposition:NOUN ADP
165
+ Noun followed by Adverb:NOUN ADV
166
+ Noun followed by Auxiliary verb:NOUN AUX
167
+ Noun followed by Coordinating conjunction:NOUN CCONJ
168
+ Noun followed by Determiner:NOUN DET
169
+ Noun followed by Interjection:NOUN INTJ
170
+ Noun followed by Noun:NOUN NOUN
171
+ Noun followed by Numeral:NOUN NUM
172
+ Noun followed by Other:NOUN X
173
+ Noun followed by Particle:NOUN PART
174
+ Noun followed by Pronoun:NOUN PRON
175
+ Noun followed by Proper noun:NOUN PROPN
176
+ Noun followed by Punctuation:NOUN PUNCT
177
+ Noun followed by Subordinating conjunction:NOUN SCONJ
178
+ Noun followed by Symbol:NOUN SYM
179
+ Noun followed by Verb:NOUN VERB
180
+ Numeral followed by Adjective:NUM ADJ
181
+ Numeral followed by Adposition:NUM ADP
182
+ Numeral followed by Adverb:NUM ADV
183
+ Numeral followed by Auxiliary verb:NUM AUX
184
+ Numeral followed by Coordinating conjunction:NUM CCONJ
185
+ Numeral followed by Determiner:NUM DET
186
+ Numeral followed by Interjection:NUM INTJ
187
+ Numeral followed by Noun:NUM NOUN
188
+ Numeral followed by Numeral:NUM NUM
189
+ Numeral followed by Other:NUM X
190
+ Numeral followed by Particle:NUM PART
191
+ Numeral followed by Pronoun:NUM PRON
192
+ Numeral followed by Proper noun:NUM PROPN
193
+ Numeral followed by Punctuation:NUM PUNCT
194
+ Numeral followed by Subordinating conjunction:NUM SCONJ
195
+ Numeral followed by Symbol:NUM SYM
196
+ Numeral followed by Verb:NUM VERB
197
+ Other followed by Adjective:X ADJ
198
+ Other followed by Adposition:X ADP
199
+ Other followed by Adverb:X ADV
200
+ Other followed by Auxiliary verb:X AUX
201
+ Other followed by Coordinating conjunction:X CCONJ
202
+ Other followed by Determiner:X DET
203
+ Other followed by Interjection:X INTJ
204
+ Other followed by Noun:X NOUN
205
+ Other followed by Numeral:X NUM
206
+ Other followed by Other:X X
207
+ Other followed by Particle:X PART
208
+ Other followed by Pronoun:X PRON
209
+ Other followed by Proper noun:X PROPN
210
+ Other followed by Punctuation:X PUNCT
211
+ Other followed by Subordinating conjunction:X SCONJ
212
+ Other followed by Symbol:X SYM
213
+ Other followed by Verb:X VERB
214
+ Particle followed by Adjective:PART ADJ
215
+ Particle followed by Adposition:PART ADP
216
+ Particle followed by Adverb:PART ADV
217
+ Particle followed by Auxiliary verb:PART AUX
218
+ Particle followed by Coordinating conjunction:PART CCONJ
219
+ Particle followed by Determiner:PART DET
220
+ Particle followed by Interjection:PART INTJ
221
+ Particle followed by Noun:PART NOUN
222
+ Particle followed by Numeral:PART NUM
223
+ Particle followed by Other:PART X
224
+ Particle followed by Particle:PART PART
225
+ Particle followed by Pronoun:PART PRON
226
+ Particle followed by Proper noun:PART PROPN
227
+ Particle followed by Punctuation:PART PUNCT
228
+ Particle followed by Subordinating conjunction:PART SCONJ
229
+ Particle followed by Symbol:PART SYM
230
+ Particle followed by Verb:PART VERB
231
+ Pronoun followed by Adjective:PRON ADJ
232
+ Pronoun followed by Adposition:PRON ADP
233
+ Pronoun followed by Adverb:PRON ADV
234
+ Pronoun followed by Auxiliary verb:PRON AUX
235
+ Pronoun followed by Coordinating conjunction:PRON CCONJ
236
+ Pronoun followed by Determiner:PRON DET
237
+ Pronoun followed by Interjection:PRON INTJ
238
+ Pronoun followed by Noun:PRON NOUN
239
+ Pronoun followed by Numeral:PRON NUM
240
+ Pronoun followed by Other:PRON X
241
+ Pronoun followed by Particle:PRON PART
242
+ Pronoun followed by Pronoun:PRON PRON
243
+ Pronoun followed by Proper noun:PRON PROPN
244
+ Pronoun followed by Punctuation:PRON PUNCT
245
+ Pronoun followed by Subordinating conjunction:PRON SCONJ
246
+ Pronoun followed by Symbol:PRON SYM
247
+ Pronoun followed by Verb:PRON VERB
248
+ Proper noun followed by Adjective:PROPN ADJ
249
+ Proper noun followed by Adposition:PROPN ADP
250
+ Proper noun followed by Adverb:PROPN ADV
251
+ Proper noun followed by Auxiliary verb:PROPN AUX
252
+ Proper noun followed by Coordinating conjunction:PROPN CCONJ
253
+ Proper noun followed by Determiner:PROPN DET
254
+ Proper noun followed by Interjection:PROPN INTJ
255
+ Proper noun followed by Noun:PROPN NOUN
256
+ Proper noun followed by Numeral:PROPN NUM
257
+ Proper noun followed by Other:PROPN X
258
+ Proper noun followed by Particle:PROPN PART
259
+ Proper noun followed by Pronoun:PROPN PRON
260
+ Proper noun followed by Proper noun:PROPN PROPN
261
+ Proper noun followed by Punctuation:PROPN PUNCT
262
+ Proper noun followed by Subordinating conjunction:PROPN SCONJ
263
+ Proper noun followed by Symbol:PROPN SYM
264
+ Proper noun followed by Verb:PROPN VERB
265
+ Punctuation followed by Adjective:PUNCT ADJ
266
+ Punctuation followed by Adposition:PUNCT ADP
267
+ Punctuation followed by Adverb:PUNCT ADV
268
+ Punctuation followed by Auxiliary verb:PUNCT AUX
269
+ Punctuation followed by Coordinating conjunction:PUNCT CCONJ
270
+ Punctuation followed by Determiner:PUNCT DET
271
+ Punctuation followed by Interjection:PUNCT INTJ
272
+ Punctuation followed by Noun:PUNCT NOUN
273
+ Punctuation followed by Numeral:PUNCT NUM
274
+ Punctuation followed by Other:PUNCT X
275
+ Punctuation followed by Particle:PUNCT PART
276
+ Punctuation followed by Pronoun:PUNCT PRON
277
+ Punctuation followed by Proper noun:PUNCT PROPN
278
+ Punctuation followed by Punctuation:PUNCT PUNCT
279
+ Punctuation followed by Subordinating conjunction:PUNCT SCONJ
280
+ Punctuation followed by Symbol:PUNCT SYM
281
+ Punctuation followed by Verb:PUNCT VERB
282
+ Subordinating conjunction followed by Adjective:SCONJ ADJ
283
+ Subordinating conjunction followed by Adposition:SCONJ ADP
284
+ Subordinating conjunction followed by Adverb:SCONJ ADV
285
+ Subordinating conjunction followed by Auxiliary verb:SCONJ AUX
286
+ Subordinating conjunction followed by Coordinating conjunction:SCONJ CCONJ
287
+ Subordinating conjunction followed by Determiner:SCONJ DET
288
+ Subordinating conjunction followed by Interjection:SCONJ INTJ
289
+ Subordinating conjunction followed by Noun:SCONJ NOUN
290
+ Subordinating conjunction followed by Numeral:SCONJ NUM
291
+ Subordinating conjunction followed by Other:SCONJ X
292
+ Subordinating conjunction followed by Particle:SCONJ PART
293
+ Subordinating conjunction followed by Pronoun:SCONJ PRON
294
+ Subordinating conjunction followed by Proper noun:SCONJ PROPN
295
+ Subordinating conjunction followed by Punctuation:SCONJ PUNCT
296
+ Subordinating conjunction followed by Subordinating conjunction:SCONJ SCONJ
297
+ Subordinating conjunction followed by Symbol:SCONJ SYM
298
+ Subordinating conjunction followed by Verb:SCONJ VERB
299
+ Symbol followed by Adjective:SYM ADJ
300
+ Symbol followed by Adposition:SYM ADP
301
+ Symbol followed by Adverb:SYM ADV
302
+ Symbol followed by Auxiliary verb:SYM AUX
303
+ Symbol followed by Coordinating conjunction:SYM CCONJ
304
+ Symbol followed by Determiner:SYM DET
305
+ Symbol followed by Interjection:SYM INTJ
306
+ Symbol followed by Noun:SYM NOUN
307
+ Symbol followed by Numeral:SYM NUM
308
+ Symbol followed by Other:SYM X
309
+ Symbol followed by Particle:SYM PART
310
+ Symbol followed by Pronoun:SYM PRON
311
+ Symbol followed by Proper noun:SYM PROPN
312
+ Symbol followed by Punctuation:SYM PUNCT
313
+ Symbol followed by Subordinating conjunction:SYM SCONJ
314
+ Symbol followed by Symbol:SYM SYM
315
+ Symbol followed by Verb:SYM VERB
316
+ Verb followed by Adjective:VERB ADJ
317
+ Verb followed by Adposition:VERB ADP
318
+ Verb followed by Adverb:VERB ADV
319
+ Verb followed by Auxiliary verb:VERB AUX
320
+ Verb followed by Coordinating conjunction:VERB CCONJ
321
+ Verb followed by Determiner:VERB DET
322
+ Verb followed by Interjection:VERB INTJ
323
+ Verb followed by Noun:VERB NOUN
324
+ Verb followed by Numeral:VERB NUM
325
+ Verb followed by Other:VERB X
326
+ Verb followed by Particle:VERB PART
327
+ Verb followed by Pronoun:VERB PRON
328
+ Verb followed by Proper noun:VERB PROPN
329
+ Verb followed by Punctuation:VERB PUNCT
330
+ Verb followed by Subordinating conjunction:VERB SCONJ
331
+ Verb followed by Symbol:VERB SYM
332
+ Verb followed by Verb:VERB VERB
333
+
334
+ Accusative case:Case=Acc
335
+ Comparative degree:Degree=Cmp
336
+ Definite article:Definite=Def
337
+ Feminine gender:Gender=Fem
338
+ Finite verb form:VerbForm=Fin
339
+ First person:Person=1
340
+ Indefinite article:Definite=Ind
341
+ Indicative mood:Mood=Ind
342
+ Infinitive verb form:VerbForm=Inf
343
+ Masculine gender:Gender=Masc
344
+ Nominative case:Case=Nom
345
+ Past tense:Tense=Past
346
+ Perfect aspect:Aspect=Perf
347
+ Plural number:Number=Plur
348
+ Positive degree:Degree=Pos
349
+ Present tense:Tense=Pres
350
+ Progressive aspect:Aspect=Prog
351
+ Second person:Person=2
352
+ Singular number:Number=Sing
353
+ Superlative degree:Degree=Sup
354
+ Third person:Person=3
355
+ Number of Tokens:num_tokens
356
+
357
+ Adjectival clause:acl
358
+ Adjectival complement:acomp
359
+ Adjectival modifier:amod
360
+ Adverbial clause modifier:advcl
361
+ Adverbial modifier:advmod
362
+ Agent (in passive voice):agent
363
+ Appositional modifier:appos
364
+ Attribute:attr
365
+ Case marking:case
366
+ Clausal complement:ccomp
367
+ Clausal subject:csubj
368
+ Clausal subject (passive):csubjpass
369
+ Complement of preposition:pcomp
370
+ Compound word:compound
371
+ Conjunct:conj
372
+ Coordinating conjunction:cc
373
+ Adposition (preposition or postposition):ADP
374
+ Dative:dative
375
+ Direct object:dobj
376
+ Expletive:expl
377
+ Marker (introducing adverbial clause):mark
378
+ Meta modifier:meta
379
+ Negation modifier:neg
380
+ Nominal modifier:nmod
381
+ Nominal subject (passive):nsubjpass
382
+ Noun phrase as adverbial modifier:npadvmod
383
+ Numeric modifier:nummod
384
+ Object of preposition:pobj
385
+ Object predicate:oprd
386
+ Open clausal complement:xcomp
387
+ Parataxis:parataxis
388
+ Passive auxiliary:auxpass
389
+ Possession modifier:poss
390
+ Pre-correlative conjunction:preconj
391
+ Predeterminer:predet
392
+ Prepositional modifier:prep
393
+ Quantifier modifier:quantmod
394
+ Relative clause modifier:relcl
395
+ Root of the sentence:ROOT
396
+ Unspecified dependency:dep
397
+
398
+ Article pronoun type:PronType=Art
399
+ Bracket punctuation type:PunctType=Brck
400
+ Cardinal number:NumType=Card
401
+ Comma punctuation type:PunctType=Comm
402
+ Comparative conjunction type:ConjType=Cmp
403
+ Dash punctuation type:PunctType=Dash
404
+ Demonstrative pronoun type:PronType=Dem
405
+ Final punctuation:PunctSide=Fin
406
+ Foreign word:Foreign=Yes
407
+ Gerund verb form:VerbForm=Ger
408
+ Hyphenated:Hyph=Yes
409
+ Indefinite pronoun type:PronType=Ind
410
+ Initial punctuation:PunctSide=Ini
411
+ Modal verb type:VerbType=Mod
412
+ Multiplicative number:NumType=Mult
413
+ Negative polarity:Polarity=Neg
414
+ Neuter gender:Gender=Neut
415
+ Ordinal number:NumType=Ord
416
+ Participle verb form:VerbForm=Part
417
+ Period punctuation type:PunctType=Peri
418
+ Possessive:Poss=Yes
419
+ Quotation punctuation type:PunctType=Quot
420
+ Reflexive:Reflex=Yes
421
+ Relative pronoun type:PronType=Rel
422
+
423
+ ❀️:❀️
424
+ πŸ‘:πŸ‘
425
+ πŸ˜‚:πŸ˜‚
426
+ 😍:😍
427
+
428
+ !:!
429
+ ":\\"
430
+ %:%
431
+ &:&
432
+ ':'
433
+ (:(
434
+ ):\)
435
+ *:*
436
+ ,:,
437
+ -:-
438
+ .:.
439
+ ;:;
440
+ ?:?
441
+ _:_
442
+ `:`
443
+ –:–
444
+ ':'
445
+ ':'
446
+
447
+ all-cleft:all-cleft
448
+ coordinate-clause:coordinate-clause
449
+ if-because-cleft:if-because-cleft
450
+ it-cleft:it-cleft
451
+ obj-relcl:obj-relcl
452
+ passive:passive
453
+ pseudo-cleft:pseudo-cleft
454
+ subj-relcl:subj-relcl
455
+ tag-question:tag-question
456
+ there-cleft:there-cleft
457
+
458
+ punctuation:punctuation
459
+
460
+ Articles:DET
461
+ Auxiliary Verbs:AUX
462
+ Conjunctions:CCONJ
463
+ Prepositions:ADP
464
+
465
+ Personal Pronouns:category:Personal Pronouns
466
+ Demonstrative Pronouns:category:Demonstrative Pronouns
467
+ Interrogative Pronouns:category:Interrogative Pronouns
468
+ Modal Verbs:category:Modal Verbs
469
+ Contractions:category:Contractions
470
+ Adverbs:category:Adverbs
471
+ Other:category:Other
472
+
473
+ i:i
474
+ me:me
475
+ my:my
476
+ myself:myself
477
+ we:we
478
+ our:our
479
+ ours:ours
480
+ ourselves:ourselves
481
+ you:you
482
+ 're:'re
483
+ 've:'ve
484
+ 'll:'ll
485
+ 'd:'d
486
+ 's:'s
487
+ 't:'t
488
+ your:your
489
+ yours:yours
490
+ yourself:yourself
491
+ yourselves:yourselves
492
+ he:he
493
+ him:him
494
+ his:his
495
+ himself:himself
496
+ she:she
497
+ her:her
498
+ hers:hers
499
+ herself:herself
500
+ it:it
501
+ its:its
502
+ itself:itself
503
+ they:they
504
+ them:them
505
+ their:their
506
+ theirs:theirs
507
+ themselves:themselves
508
+ what:what
509
+ which:which
510
+ who:who
511
+ this:this
512
+ that:that
513
+ these:these
514
+ those:those
515
+ am:am
516
+ is:is
517
+ are:are
518
+ was:was
519
+ were:were
520
+ be:be
521
+ been:been
522
+ being:being
523
+ have:have
524
+ has:has
525
+ had:had
526
+ having:having
527
+ do:do
528
+ does:does
529
+ did:did
530
+ doing:doing
531
+ a:a
532
+ an:an
533
+ the:the
534
+ and:and
535
+ but:but
536
+ if:if
537
+ or:or
538
+ because:because
539
+ as:as
540
+ until:until
541
+ while:while
542
+ of:of
543
+ at:at
544
+ by:by
545
+ for:for
546
+ with:with
547
+ about:about
548
+ against:against
549
+ between:between
550
+ into:into
551
+ through:through
552
+ during:during
553
+ before:before
554
+ after:after
555
+ above:above
556
+ below:below
557
+ to:to
558
+ from:from
559
+ up:up
560
+ down:down
561
+ in:in
562
+ out:out
563
+ on:on
564
+ off:off
565
+ over:over
566
+ under:under
567
+ again:again
568
+ further:further
569
+ then:then
570
+ once:once
571
+ here:here
572
+ there:there
573
+ when:when
574
+ where:where
575
+ why:why
576
+ how:how
577
+ all:all
578
+ any:any
579
+ both:both
580
+ each:each
581
+ few:few
582
+ more:more
583
+ most:most
584
+ other:other
585
+ some:some
586
+ such:such
587
+ no:no
588
+ nor:nor
589
+ not:not
590
+ only:only
591
+ own:own
592
+ same:same
593
+ so:so
594
+ than:than
595
+ too:too
596
+ very:very
597
+ can:can
598
+ will:will
599
+ just:just
600
+ don:don
601
+ should:should
602
+ now:now
603
+ ain:ain
604
+ aren:aren
605
+ couldn:couldn
606
+ didn:didn
607
+ doesn:doesn
608
+ hadn:hadn
609
+ hasn:hasn
610
+ haven:haven
611
+ isn:isn
612
+ ma:ma
613
+ shouldn:shouldn
614
+ wasn:wasn
615
+ weren:weren
616
+ won:won
617
+ wouldn:wouldn
utils/clustering_utils.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Required for clustering_author function:
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.cluster import DBSCAN
5
+ from sklearn.metrics import silhouette_score
6
+ # Required for analyze_space_distance_preservation
7
+ from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
8
+ from scipy.stats import pearsonr
9
+ from typing import List, Dict, Any
10
+
11
def _find_best_dbscan_eps(X: np.ndarray,
                          eps_values: List[float],
                          min_samples: int,
                          metric: str) -> "tuple[float | None, np.ndarray | None, float]":
    """
    Iterate through ``eps_values`` for DBSCAN and return the parameters
    that yield the highest silhouette score.

    Args:
        X (np.ndarray): The input data (embeddings), shape (n_samples, n_features).
        eps_values (List[float]): Candidate eps values; non-positive values are skipped.
        min_samples (int): DBSCAN ``min_samples`` parameter.
        metric (str): Distance metric for DBSCAN and silhouette score.

    Returns:
        tuple:
            - best_eps: eps value with the best score, or None if no suitable clustering.
            - best_labels: cluster labels from the best DBSCAN run, or None.
            - best_score: highest silhouette score achieved.
    """
    # NOTE: the return annotation is quoted because PEP 604 unions (`X | None`)
    # are evaluated at function-definition time and raise TypeError on
    # Python 3.9 — the version pinned by this project's Dockerfile. String
    # annotations are never evaluated at runtime.
    best_score = -1.001  # silhouette score lies in [-1, 1]; start just below
    best_labels = None
    best_eps = None

    for eps in eps_values:
        if eps <= 1e-9:  # eps must be strictly positive for DBSCAN
            continue
        db = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
        labels = db.fit_predict(X)

        unique_labels_set = set(labels)
        # Label -1 marks noise points; they do not count as a cluster.
        n_clusters_ = len(unique_labels_set) - (1 if -1 in unique_labels_set else 0)

        if n_clusters_ > 1:
            clustered_mask = (labels != -1)
            if np.sum(clustered_mask) >= 2:  # silhouette needs >= 2 non-noise points
                X_clustered = X[clustered_mask]
                labels_clustered = labels[clustered_mask]
                try:
                    score = silhouette_score(X_clustered, labels_clustered, metric=metric)
                    if score > best_score:
                        best_score = score
                        best_labels = labels.copy()
                        best_eps = eps
                except ValueError:  # silhouette_score can reject degenerate inputs
                    pass
        elif n_clusters_ == 1 and best_labels is None:  # fallback: single cluster only
            if not all(l == -1 for l in labels):
                # Nominal score below any real silhouette so a multi-cluster
                # result from a later eps will still win.
                current_score_for_single_cluster = -0.5
                if current_score_for_single_cluster > best_score:
                    best_score = current_score_for_single_cluster
                    best_labels = labels.copy()
                    best_eps = eps
    return best_eps, best_labels, best_score
65
+
66
def clustering_author(background_corpus_df: pd.DataFrame,
                      embedding_clm: str = 'style_embedding',
                      eps_values: "List[float] | None" = None,
                      min_samples: int = 5,
                      metric: str = 'cosine') -> pd.DataFrame:
    """
    Performs DBSCAN clustering on embeddings in a DataFrame.

    Experiments with different `eps` parameters to find a clustering
    that maximizes the silhouette score, indicating well-separated clusters.
    Note: the input DataFrame is mutated in place (a 'cluster_label' column
    is added) and also returned.

    Args:
        background_corpus_df (pd.DataFrame): DataFrame with an embedding column.
        embedding_clm (str): Name of the column containing embeddings.
                             Each embedding should be a list or NumPy array.
        eps_values (List[float], optional): Specific `eps` values to test.
                                            If None, a default range is used.
                                            For 'cosine' metric, eps is typically in [0, 2].
                                            For 'euclidean', scale depends on embedding magnitudes.
        min_samples (int): DBSCAN `min_samples` parameter. Minimum number of
                           samples in a neighborhood for a point to be a core point.
        metric (str): The distance metric to use for DBSCAN and silhouette score
                      (e.g., 'cosine', 'euclidean').

    Returns:
        pd.DataFrame: The input DataFrame with a new 'cluster_label' column.
                      Labels are from the DBSCAN run with the highest silhouette score.
                      If no suitable clustering is found, labels might be all -1 (noise).

    Raises:
        ValueError: If `embedding_clm` is not a column of the DataFrame.
    """
    # The eps_values annotation is quoted: the original `List[float] = None`
    # mis-declared an optional parameter, and an unquoted `| None` union would
    # raise at import time on Python 3.9 (the Dockerfile's pinned version).
    if embedding_clm not in background_corpus_df.columns:
        raise ValueError(f"Embedding column '{embedding_clm}' not found in DataFrame.")

    embeddings_list = background_corpus_df[embedding_clm].tolist()

    X_list = []
    original_indices = []  # To map results back to the original DataFrame's positions

    for i, emb_val in enumerate(embeddings_list):
        if emb_val is not None:
            try:
                e = np.asarray(emb_val, dtype=float)
                if e.ndim == 1 and e.size > 0:  # Standard 1D vector
                    X_list.append(e)
                    original_indices.append(i)
                elif e.ndim == 0 and e.size == 1:  # Scalar value, treat as 1D vector of size 1
                    X_list.append(np.array([e.item()]))
                    original_indices.append(i)
                # Silently skip empty arrays or improperly shaped arrays
            except (TypeError, ValueError):
                # Silently skip if conversion to float array fails
                pass

    # Initialize labels for all rows in the original DataFrame to -1 (noise/unprocessed)
    final_labels_for_df = pd.Series(-1, index=background_corpus_df.index, dtype=int)

    if not X_list:
        print(f"No valid embeddings found in column '{embedding_clm}'. Assigning all 'cluster_label' as -1.")
        background_corpus_df['cluster_label'] = final_labels_for_df
        return background_corpus_df

    X = np.array(X_list)  # Creates a 2D array from the list of 1D arrays

    if X.shape[0] == 1:
        print("Only one valid embedding found. Assigning cluster label 0 to it.")
        if original_indices:  # Should always be true if X.shape[0]==1 from X_list
            final_labels_for_df.iloc[original_indices[0]] = 0
        background_corpus_df['cluster_label'] = final_labels_for_df
        return background_corpus_df

    if X.shape[0] < min_samples:
        print(f"Number of valid embeddings ({X.shape[0]}) is less than min_samples ({min_samples}). "
              f"All valid embeddings will be marked as noise (-1).")
        for original_idx in original_indices:
            final_labels_for_df.iloc[original_idx] = -1
        background_corpus_df['cluster_label'] = final_labels_for_df
        return background_corpus_df

    if eps_values is None:
        if metric == 'cosine':
            # Cosine distance lives in [0, 2], so a fixed grid is reasonable.
            eps_values = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
        else:
            # For other metrics, derive candidates from the data spread.
            if X.shape[0] > 1:
                data_spread = np.std(X)
                eps_values = [round(data_spread * f, 2) for f in [0.25, 0.5, 1.0]]
                eps_values = [e for e in eps_values if e > 1e-6]
            if not eps_values or X.shape[0] <= 1:
                eps_values = [0.5, 1.0, 1.5]
        print(f"Warning: `eps_values` not provided. Using default range for metric '{metric}': {eps_values}. "
              f"It's recommended to supply `eps_values` tuned to your data.")

    print(f"Performing DBSCAN clustering (min_samples={min_samples}, metric='{metric}') with eps values: "
          f"{[f'{e:.2f}' for e in eps_values]}")

    best_eps, best_labels, best_score = _find_best_dbscan_eps(X, eps_values, min_samples, metric)

    if best_labels is not None:
        num_found_clusters = len(set(best_labels) - {-1})
        print(f"Best clustering found: eps={best_eps:.2f}, Silhouette Score={best_score:.4f} ({num_found_clusters} clusters).")
        for i, label in enumerate(best_labels):
            original_df_idx = original_indices[i]
            final_labels_for_df.iloc[original_df_idx] = label
    else:
        print("No suitable DBSCAN clustering found meeting criteria. All processed embeddings marked as noise (-1).")

    background_corpus_df['cluster_label'] = final_labels_for_df
    return background_corpus_df
172
+
173
+
174
+ def _safe_embeddings_to_matrix(embeddings_column: pd.Series) -> np.ndarray:
175
+ """
176
+ Converts a pandas Series of embeddings (expected to be lists of floats or 1D np.arrays)
177
+ into a 2D NumPy matrix. Handles None values and attempts to stack consistently.
178
+ Returns an empty 2D array (e.g., shape (0,0) or (0,D)) if conversion fails or no valid data.
179
+ """
180
+ embeddings_list = embeddings_column.tolist()
181
+
182
+ processed_1d_arrays = []
183
+ for emb in embeddings_list:
184
+ if emb is not None:
185
+ if hasattr(emb, '__iter__') and not isinstance(emb, (str, bytes)):
186
+ try:
187
+ arr = np.asarray(emb, dtype=float)
188
+ if arr.ndim == 1 and arr.size > 0:
189
+ processed_1d_arrays.append(arr)
190
+ except (TypeError, ValueError):
191
+ pass # Ignore embeddings that cannot be converted
192
+
193
+ if not processed_1d_arrays:
194
+ return np.empty((0,0))
195
+
196
+ # Check for consistent dimensionality before vstacking
197
+ first_len = processed_1d_arrays[0].shape[0]
198
+ consistent_embeddings = [arr for arr in processed_1d_arrays if arr.shape[0] == first_len]
199
+
200
+ if not consistent_embeddings:
201
+ return np.empty((0, first_len if processed_1d_arrays else 0)) # (0,D) or (0,0)
202
+
203
+ try:
204
+ return np.vstack(consistent_embeddings)
205
+ except ValueError:
206
+ # Should not happen if lengths are consistent
207
+ return np.empty((0, first_len))
208
+
209
+
210
def _compute_cluster_centroids(
    df_clustered_items: pd.DataFrame,  # DataFrame already filtered for non-noise items
    embedding_clm: str,
    cluster_label_clm: str
) -> Dict[Any, np.ndarray]:
    """Return {cluster_id: mean embedding vector} for each cluster in the frame."""
    if df_clustered_items.empty:
        return {}

    centroids: Dict[Any, np.ndarray] = {}
    for cluster_id, group in df_clustered_items.groupby(cluster_label_clm):
        matrix = _safe_embeddings_to_matrix(group[embedding_clm])

        # Only average when a non-degenerate 2D matrix could be built.
        if matrix.ndim == 2 and matrix.shape[0] > 0 and matrix.shape[1] > 0:
            centroids[cluster_id] = matrix.mean(axis=0)
    return centroids
226
+
227
+
228
def _project_to_centroid_space(
    original_embeddings_matrix: np.ndarray,   # (n_items, n_original_features)
    centroids_map: Dict[Any, np.ndarray]      # {cluster_id: centroid (n_original_features,)}
) -> np.ndarray:
    """
    Represent each embedding by its cosine similarity to every cluster centroid.

    Centroid columns are ordered by sorted cluster id; centroids whose shape
    does not match the embedding dimensionality are ignored. Returns an
    (n_items, 0) array when no projection is possible.
    """
    unusable_input = (
        not centroids_map
        or original_embeddings_matrix.ndim != 2
        or original_embeddings_matrix.shape[0] == 0
        or original_embeddings_matrix.shape[1] == 0
    )
    if unusable_input:
        return np.empty((original_embeddings_matrix.shape[0], 0))

    expected_dim = original_embeddings_matrix.shape[1]
    usable_centroids = []
    for cid in sorted(centroids_map.keys()):
        vec = centroids_map[cid]
        if isinstance(vec, np.ndarray) and vec.ndim == 1 and vec.size == expected_dim:
            usable_centroids.append(vec)

    if not usable_centroids:
        return np.empty((original_embeddings_matrix.shape[0], 0))

    centroid_matrix = np.vstack(usable_centroids)  # (n_valid_centroids, n_features)

    # Each row becomes a similarity profile: (n_items, n_valid_centroids).
    return cosine_similarity(original_embeddings_matrix, centroid_matrix)
254
+
255
+
256
def _get_pairwise_cosine_distances(embeddings_matrix: np.ndarray) -> np.ndarray:
    """Return the unique pairwise cosine distances (flattened upper triangle)."""
    usable = (
        isinstance(embeddings_matrix, np.ndarray)
        and embeddings_matrix.ndim == 2
        and embeddings_matrix.shape[0] >= 2
        and embeddings_matrix.shape[1] > 0
    )
    if not usable:
        return np.array([])  # fewer than two samples, or no features

    full_matrix = cosine_distances(embeddings_matrix)
    # Upper triangle above the diagonal: each unordered pair exactly once.
    upper = np.triu_indices(full_matrix.shape[0], k=1)
    return full_matrix[upper]
265
+
266
+
267
def analyze_space_distance_preservation(
    df: pd.DataFrame,
    embedding_clm: str = 'style_embedding',
    cluster_label_clm: str = 'cluster_label'
) -> "float | None":
    """
    Analyzes how well a new space, defined by cluster centroids, preserves
    the cosine distance relationships from the original embedding space.

    Args:
        df (pd.DataFrame): DataFrame with original embeddings and cluster labels.
        embedding_clm (str): Column name for original embeddings.
        cluster_label_clm (str): Column name for cluster labels.

    Returns:
        float | None: Pearson correlation coefficient. Returns None if analysis
                      cannot be performed (e.g., <2 clusters, <2 items), or 0.0
                      if correlation is NaN (e.g. due to zero variance in distances).
    """
    # NOTE: the return annotation is quoted because an unquoted `float | None`
    # is evaluated at definition time and raises TypeError on Python 3.9,
    # the version pinned by the project's Dockerfile.
    df_valid_items = df[df[cluster_label_clm] != -1].copy()

    if df_valid_items.shape[0] < 2:
        return None  # Need at least 2 items for pairwise distances

    original_embeddings_matrix = _safe_embeddings_to_matrix(df_valid_items[embedding_clm])
    if original_embeddings_matrix.ndim != 2 or original_embeddings_matrix.shape[0] < 2 or \
       original_embeddings_matrix.shape[1] == 0:
        return None  # Valid matrix from original embeddings could not be formed

    centroids = _compute_cluster_centroids(df_valid_items, embedding_clm, cluster_label_clm)
    if len(centroids) < 2:  # Need at least 2 centroids for a multi-dimensional new space
        return None

    projected_embeddings_matrix = _project_to_centroid_space(original_embeddings_matrix, centroids)
    if projected_embeddings_matrix.ndim != 2 or projected_embeddings_matrix.shape[0] < 2 or \
       projected_embeddings_matrix.shape[1] < 2:  # New space needs at least 2 dimensions (centroids)
        return None

    distances_original_space = _get_pairwise_cosine_distances(original_embeddings_matrix)
    distances_new_space = _get_pairwise_cosine_distances(projected_embeddings_matrix)

    if distances_original_space.size == 0 or distances_new_space.size == 0 or \
       distances_original_space.size != distances_new_space.size:
        return None  # Mismatch or empty distances

    # Handle cases where variance is zero in one of the distance arrays
    # (Pearson correlation would be NaN for a constant input).
    if np.all(distances_new_space == distances_new_space[0]) or \
       np.all(distances_original_space == distances_original_space[0]):
        return 0.0  # Correlation is undefined or 0 if one variable is constant

    try:
        correlation, _ = pearsonr(distances_original_space, distances_new_space)
    except ValueError:  # Should be caught by variance checks, but as a safeguard
        return None

    if np.isnan(correlation):
        return 0.0  # Default for NaN correlation

    return correlation
utils/file_download.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import urllib.request
4
+ import zipfile
5
+ import tempfile
6
+ from tqdm import tqdm
7
+ from urllib.parse import urlparse
8
+
9
class TqdmUpTo(tqdm):
    """tqdm subclass whose ``update_to`` matches urlretrieve's reporthook signature."""

    def update_to(self, b=1, bsize=1, tsize=None):
        # b: blocks transferred so far; bsize: size of each block; tsize: total size.
        if tsize is not None:
            self.total = tsize
        downloaded = b * bsize
        self.update(downloaded - self.n)  # advance only by the delta since last call
14
+
15
def download_file_override(url: str, dest_path: str):
    """
    Download a file from a URL and always overwrite the target.

    If the download is a zip archive, its contents are extracted directly into
    ``dest_path`` (treated as a directory, with no extra folder level).
    Otherwise the file is saved directly to ``dest_path``. Any pre-existing
    target is removed first.

    Args:
        url (str): Source URL.
        dest_path (str): Target file path, or target directory for zip archives.
    """
    # Ensure the parent directory exists (dest_path itself when it ends in a separator).
    dest_dir = dest_path if dest_path.endswith(('/', '\\')) else os.path.dirname(dest_path)
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    # Download into a temp file first so a failed transfer never clobbers dest_path.
    tmp_fd, tmp_path = tempfile.mkstemp()
    os.close(tmp_fd)

    filename = os.path.basename(urlparse(url).path) or "downloaded.file"
    # Fix: the original printed the literal placeholder "(unknown)" and ignored
    # the filename it had just computed.
    print(f"Downloading {filename}...")

    try:
        with TqdmUpTo(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, desc=filename) as t:
            urllib.request.urlretrieve(url, filename=tmp_path, reporthook=t.update_to)

        if zipfile.is_zipfile(tmp_path):
            # Replace dest_path wholesale with the archive contents.
            if os.path.exists(dest_path):
                shutil.rmtree(dest_path)
            os.makedirs(dest_path, exist_ok=True)

            # Extract into a temp folder first, then move the *contents* up,
            # so the archive does not introduce an extra nesting level.
            with tempfile.TemporaryDirectory() as tmp_extract_dir:
                with zipfile.ZipFile(tmp_path, 'r') as z:
                    z.extractall(tmp_extract_dir)

                for item in os.listdir(tmp_extract_dir):
                    # shutil.move handles both files and directories, so the
                    # original's identical if/else branches are collapsed.
                    shutil.move(os.path.join(tmp_extract_dir, item),
                                os.path.join(dest_path, item))

            print(f"Extracted zip contents into '{dest_path}'.")
        else:
            # Ensure parent dir exists, then overwrite the target file.
            os.makedirs(os.path.dirname(dest_path) or ".", exist_ok=True)
            if os.path.exists(dest_path):
                os.remove(dest_path)
            shutil.move(tmp_path, dest_path)
            tmp_path = None  # moved into place; nothing left to clean up
            print(f"Saved file to '{dest_path}'.")

    finally:
        # Remove the temp download unless it was already moved into place.
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)
utils/generate_augmented_mapping.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import re
3
+
4
def load_original_map_and_extract_morph(path="human_readable.txt"):
    """
    Parse the code:human mapping file.

    Returns:
        (human_to_code, morph_entries) where ``human_to_code`` maps
        human-readable names to codes (POS tags etc.) and ``morph_entries``
        is a list of (human, code) pairs for morphological tags, i.e. keys
        containing '='.
    """
    human_to_code = {}
    morph_entries = []

    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            entry = raw.strip()
            # Skip blanks, comments and lines without a separator.
            if not entry or ":" not in entry or entry.startswith("#"):
                continue
            code, human = (part.strip() for part in entry.split(":", 1))

            if "=" in code:
                # Morphological tag such as Aspect=Perf → keep (human, code).
                morph_entries.append((human, code))
            else:
                human_to_code[human] = code

    return human_to_code, morph_entries
22
+
23
def extract_bigrams_from_csv(csv_path="../datasets/gram2vec_feats.csv"):
    """
    Collect human-readable POS-bigram descriptions from the feature CSV.

    Only rows whose 'gram2vec_feats' value starts with
    'Part-of-Speech Bigram:' and whose description contains 'followed by'
    are kept.
    """
    found = set()
    with open(csv_path, "r", encoding="utf-8") as fh:
        for record in csv.DictReader(fh):
            value = record["gram2vec_feats"]
            if not value.startswith("Part-of-Speech Bigram:"):
                continue
            description = value.split(":", 1)[1].strip()
            if "followed by" in description:
                found.add(description)
    return found
34
+
35
def generate_bigram_code_map(human_to_code, bigrams):
    """
    Translate human-readable bigrams ('X followed by Y') into code pairs.

    Returns a dict mapping each bigram description to 'CODE_X CODE_Y'.
    Descriptions that cannot be parsed or fully mapped are reported on
    stdout and skipped.
    """
    bigram_re = re.compile(r"(.+?) followed by (.+)")
    code_map = {}

    for description in bigrams:
        m = bigram_re.match(description)
        if not m:
            print(f"Not matched: {description}")
            continue
        first = m.group(1).strip()
        second = m.group(2).strip()
        first_code = human_to_code.get(first)
        second_code = human_to_code.get(second)
        if first_code and second_code:
            code_map[description] = f"{first_code} {second_code}"
        else:
            print(f"Could not map: {description} → {first_code}, {second_code}")
    return code_map
53
+
54
def write_augmented_map(pos_bigram_map, morph_entries, original_path="human_readable.txt", output_path="augmented_human_readable.txt"):
    """
    Write the augmented human-readable map.

    The original file's 'code: human' lines are flipped to 'human:code',
    then a section of POS-bigram mappings and a section of morphological
    tag mappings are appended (each preceded by a blank line, sorted for
    reproducibility).
    """
    with open(output_path, "w", encoding="utf-8") as out:
        # Flip original lines: write human-readable:code instead of code:human.
        with open(original_path, "r", encoding="utf-8") as source:
            for raw in source:
                entry = raw.strip()
                if not entry or entry.startswith("#"):
                    out.write(entry + "\n")
                    continue
                if ":" not in entry:
                    continue
                code, human = (part.strip() for part in entry.split(":", 1))
                out.write(f"{human}:{code}\n")

        # New section for POS bigrams.
        out.write("\n")
        for human, code in sorted(pos_bigram_map.items()):
            out.write(f"{human}:{code}\n")

        # Re-add morph tag mappings.
        out.write("\n")
        for human, code in sorted(morph_entries):
            out.write(f"{human}:{code}\n")

    print(f"Augmented map written to {output_path}")
81
+
82
# Run all
# Script-style entry point: builds the augmented human-readable mapping file
# by flipping the original map, adding POS-bigram code pairs extracted from
# the feature CSV, and re-appending the morphological tag mappings.
# NOTE: these statements run at import time (no __main__ guard), so importing
# this module reads the CSV and rewrites the output file.
human_to_code, morph_entries = load_original_map_and_extract_morph()
bigrams = extract_bigrams_from_csv()
pos_bigram_map = generate_bigram_code_map(human_to_code, bigrams)
write_augmented_map(pos_bigram_map, morph_entries)
utils/gram2vec_feat_utils.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import html
3
+
4
+ from collections import namedtuple
5
+ from gram2vec.feature_locator import find_feature_spans
6
+ from functools import lru_cache
7
+
8
+ from utils.llm_feat_utils import generate_feature_spans_cached
9
+ import pandas as pd
10
+ Span = namedtuple('Span', ['start_char', 'end_char'])
11
+
12
+ from gram2vec import vectorizer
13
+
14
# ── the FEATURE_HANDLERS & loader ────────────
# Maps each human-readable gram2vec feature category (the text before the
# ':' in a feature string) to the short register name used by gram2vec
# (see get_shorthand / get_fullform below).
FEATURE_HANDLERS = {
    "Part-of-Speech Unigram": "pos_unigrams",
    "Part-of-Speech Bigram": "pos_bigrams",
    "Function Word": "func_words",
    "Punctuation": "punctuation",
    "Letter": "letters",
    "Dependency Label": "dep_labels",
    "Morphology Tag": "morph_tags",
    "Sentence Type": "sentences",
    "Emoji": "emojis",
    "Number of Tokens": "num_tokens"
}
27
+
28
@lru_cache(maxsize=1)
def load_code_map(txt_path: str = "utils/augmented_human_readable.txt") -> dict:
    """
    Load the human-readable → code mapping file.

    Each data line has the form 'Human Readable Name:CODE'. Blank lines,
    comment lines ('#...') and malformed lines without a ':' are skipped —
    previously any non-empty line lacking a ':' raised ValueError on unpack.

    Returns:
        dict mapping human-readable feature names to their short codes.
    """
    code_map = {}
    with open(txt_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Skip blanks, comments, and separator-less lines.
            if not line or line.startswith("#") or ":" not in line:
                continue
            human, code = [p.strip() for p in line.split(":", 1)]
            code_map[human] = code
    return code_map
39
+
40
def get_shorthand(feature_str: str) -> str:
    """
    Convert 'Category:Human-Readable' to the gram2vec shorthand
    'register:CODE', e.g. 'Part-of-Speech Unigram:Adjective' →
    'pos_unigrams:ADJ'.

    Returns None when the string has no ':', the category is unknown to
    FEATURE_HANDLERS, or the human-readable name has no code in the map.
    """
    try:
        category, human = [p.strip() for p in feature_str.split(":", 1)]
        # print(f"Category: {category}, Human: {human}")
    except ValueError:
        # No ':' separator → not a valid feature string.
        # print("Invalid format for feature string:", feature_str)
        return None
    if category not in FEATURE_HANDLERS:
        return None
    code = load_code_map().get(human)
    if code is None:
        # print(f"Warning: No code found for human-readable feature '{human}'")
        return None  # no known code for this human-readable name
    return f"{FEATURE_HANDLERS[category]}:{code}"
57
+
58
def get_fullform(shorthand: str) -> str:
    """
    Inverse of get_shorthand: 'prefix:code' (e.g. 'pos_unigrams:ADJ') →
    'Category:Human-Readable' (e.g. 'Part-of-Speech Unigram:Adjective'),
    or None when either part cannot be resolved.
    """
    # A shorthand without a ':' separator is invalid.
    if ":" not in shorthand:
        return None
    prefix, code = shorthand.split(":", 1)

    # Invert FEATURE_HANDLERS to look up the category from the register name.
    category = {reg: cat for cat, reg in FEATURE_HANDLERS.items()}.get(prefix)
    if category is None:
        return None

    # Invert the code map to recover the human-readable feature name.
    human = {c: h for h, c in load_code_map().items()}.get(code)
    if human is None:
        return None

    return f"{category}:{human}"
82
+
83
def highlight_both_spans(text, llm_spans, gram_spans):
    """
    Render `text` as HTML with both span sets highlighted.

    The string is walked exactly once, injecting <mark> tags at the right
    character offsets, so nested or overlapping highlights never stomp on
    each other.
    """
    # Inline CSS : mark-llm is in yellow, mark-gram in blue
    style = """
    <style>
    .mark-llm { background-color: #fff176; }
    .mark-gram { background-color: #90caf9; }
    </style>
    """

    # Each span contributes an open and a close event.
    events = [(s.start_char, 'open', 'llm') for s in llm_spans]
    events += [(s.end_char, 'close', 'llm') for s in llm_spans]
    events += [(s.start_char, 'open', 'gram') for s in gram_spans]
    events += [(s.end_char, 'close', 'gram') for s in gram_spans]

    # Stable sort by position; opens come before closes at the same offset.
    events.sort(key=lambda ev: (ev[0], 0 if ev[1] == 'open' else 1))

    pieces = []
    cursor = 0
    for pos, kind, cls in events:
        # Escape the raw text between the previous event and this one.
        pieces.append(html.escape(text[cursor:pos]))
        pieces.append(f'<mark class="mark-{cls}">' if kind == 'open' else '</mark>')
        cursor = pos

    pieces.append(html.escape(text[cursor:]))
    rendered = "".join(pieces).replace('\n', '<br>')

    return style + rendered
126
+
127
+
128
def show_combined_spans_all(selected_feature_llm, selected_feature_g2v,
    llm_style_feats_analysis, background_authors_embeddings_df, task_authors_embeddings_df, visible_authors, predicted_author=None, ground_truth_author=None, max_num_authors=7):
    """
    For mystery + 3 candidates (plus visible background authors):
    1. get LLM spans from the cached style analysis
    2. get gram2vec spans via find_feature_spans
    3. merge both span sets and render highlighted HTML

    Returns:
        (task_html, background_html): rendered sections for the first four
        (task) authors and the remaining background authors.
    """
    print(f"\n\n\n\n\nShowing combined spans for LLM feature '{selected_feature_llm}' and Gram2Vec feature '{selected_feature_g2v}'")
    print(f"predicted_author: {predicted_author}, ground_truth_author: {ground_truth_author}")
    print(f" keys = {background_authors_embeddings_df.keys()}")

    # Keep only the visible background authors, then put task authors first
    # so indices 0-3 are mystery + candidates.
    background_authors_embeddings_df = background_authors_embeddings_df[background_authors_embeddings_df.authorID.isin(visible_authors)]
    background_and_task_authors = pd.concat([task_authors_embeddings_df, background_authors_embeddings_df])

    authors_texts = ['\n\n =========== \n\n'.join(x) if type(x) == list else x for x in background_and_task_authors[:max_num_authors]['fullText'].tolist()]
    authors_names = background_and_task_authors[:max_num_authors]['authorID'].tolist()
    print(f"Number of authors to show: {len(authors_texts)}")
    print(f"Authors names: {authors_names}")
    texts = list(zip(authors_names, authors_texts))

    # Bug fix: `short` was only assigned inside the Gram2Vec branch below but
    # is passed to create_html unconditionally, so selecting no Gram2Vec
    # feature raised NameError. Initialise it up front.
    short = None

    if selected_feature_llm and selected_feature_llm != "None":
        author_list = list(llm_style_feats_analysis['spans'].values())
        llm_spans_list = []
        for i, (_, txt) in enumerate(texts):
            author_spans_list = []
            for txt_span in author_list[i][selected_feature_llm]:
                # Locate each cached span text inside the author's document.
                author_spans_list.append(Span(txt.find(txt_span), txt.find(txt_span) + len(txt_span)))
            llm_spans_list.append(author_spans_list)
    else:
        print("Skipping LLM span extraction: feature is None")
        llm_spans_list = [[] for _ in texts]

    if selected_feature_g2v and selected_feature_g2v != "None":
        # get gram2vec spans
        gram_spans_list = []
        print(f"Selected Gram2Vec feature: {selected_feature_g2v}")
        short = get_shorthand(selected_feature_g2v)
        print(f"short hand: {short}")
        for role, txt in texts:
            try:
                print(f"Finding spans for {short} {role}")
                spans = find_feature_spans(txt, short)
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt etc.
                # still propagate.
                print(f"Error finding spans for {short} {role}")
                spans = []
            gram_spans_list.append(spans)
    else:
        print("Skipping Gram2Vec span extraction: feature is None")
        gram_spans_list = [[] for _ in texts]

    # build HTML blocks
    print(f" ----> Number of authors: {len(texts)}")

    html_task_authors = create_html(
        texts[:4], #first 4 are task
        llm_spans_list,
        gram_spans_list,
        selected_feature_llm,
        selected_feature_g2v,
        short,
        background = False,
        predicted_author=predicted_author,
        ground_truth_author=ground_truth_author
    )
    combined_html = "<div>" + "\n<hr>\n".join(html_task_authors) + "</div>"

    html_background_authors = create_html(
        texts[4:], #remaining are background
        # Bug fix: span lists must be sliced in step with texts[4:] —
        # previously background authors were rendered with the task
        # authors' spans (indices 0..).
        llm_spans_list[4:],
        gram_spans_list[4:],
        selected_feature_llm,
        selected_feature_g2v,
        short,
        background = True,
        predicted_author=predicted_author,
        ground_truth_author=ground_truth_author
    )
    background_html = "<div>" + "\n<hr>\n".join(html_background_authors) + "</div>"
    return combined_html, background_html
214
+
215
def get_label(label: str, predicted_author=None, ground_truth_author=None, bg_id: int=0) -> str:
    """
    Map an internal author identifier to a display label, annotating the
    predicted / ground-truth candidate when both indices are provided.
    """
    print(f"get_label called with label: {label}, predicted_author: {predicted_author}, ground_truth_author: {ground_truth_author}, bg_id: {bg_id}")

    if label.startswith(("Mystery", "Q_author")):
        return "Mystery Author"

    if label.startswith(("a0_author", "a1_author", "a2_author", "Candidate")):
        if label.startswith("Candidate"):
            idx = int(label.split(" ")[2])  # number after 'Candidate Author'
        else:
            idx = int(label.split("_")[0][-1])  # digit of a0/a1/a2
        display = f"Candidate {idx + 1}"
        # Annotations only apply when both indices are known.
        if predicted_author is None or ground_truth_author is None:
            return display
        if idx == predicted_author and idx == ground_truth_author:
            return f"{display} (Predicted & Ground Truth)"
        if idx == predicted_author:
            return f"{display} (Predicted)"
        if idx == ground_truth_author:
            return f"{display} (Ground Truth)"
        return display

    # Anything else is a background author, numbered by position.
    return f"Background Author {bg_id + 1}"
240
+
241
def create_html(texts, llm_spans_list, gram_spans_list, selected_feature_llm, selected_feature_g2v, short=None, background = False, predicted_author=None, ground_truth_author=None):
    """
    Build one HTML section per (label, text) pair, combining LLM and
    Gram2Vec highlights and prepending notice boxes when a feature is
    unselected, unmapped, or produced no spans for that author.
    """
    sections = []  # renamed from `html` to avoid shadowing the html module
    for i, (label, txt) in enumerate(texts):
        # Background authors are numbered by their position in `texts`.
        if background:
            display_label = get_label(label, predicted_author, ground_truth_author, i)
        else:
            display_label = get_label(label, predicted_author, ground_truth_author)

        combined = highlight_both_spans(txt, llm_spans_list[i], gram_spans_list[i])

        notice = ""
        if selected_feature_llm == "None":
            notice += """
            <div style="padding:8px; background:#eee; border:1px solid #aaa;">
                <em>No LLM feature selected.</em>
            </div>
            """
        elif not llm_spans_list[i]:
            notice += f"""
            <div style="padding:8px; background:#fee; border:1px solid #f00;">
                <em>No spans found for LLM feature "{selected_feature_llm}".</em>
            </div>
            """
        if selected_feature_g2v == "None":
            notice += """
            <div style="padding:8px; background:#eee; border:1px solid #aaa;">
                <em>No Gram2Vec feature selected.</em>
            </div>
            """
        elif not short:
            notice += f"""
            <div style="padding:8px; background:#fee; border:1px solid #f00;">
                <em>Invalid or unmapped feature: "{selected_feature_g2v}".</em>
            </div>
            """
        elif not gram_spans_list[i]:
            notice += f"""
            <div style="padding:8px; background:#fee; border:1px solid #f00;">
                <em>No spans found for Gram2Vec feature "{selected_feature_g2v}".</em>
            </div>
            """

        sections.append(f"""
        <h3>{display_label}</h3>
        {notice}
        <div style="border:1px solid #ccc; padding:8px; margin-bottom:1em;">
            {combined}
        </div>
        """)
    return sections
utils/human_readable.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ADJ: Adjective
2
+ ADP: Adposition
3
+ ADV: Adverb
4
+ AUX: Auxiliary verb
5
+ CCONJ: Coordinating conjunction
6
+ DET: Determiner
7
+ INTJ: Interjection
8
+ NOUN: Noun
9
+ NUM: Numeral
10
+ PART: Particle
11
+ PRON: Pronoun
12
+ PROPN: Proper noun
13
+ PUNCT: Punctuation
14
+ SCONJ: Subordinating conjunction
15
+ SYM: Symbol
16
+ VERB: Verb
17
+ X: Other
18
+ SPACE: Space
19
+
20
+ Aspect=Perf: Perfect aspect
21
+ Aspect=Prog: Progressive aspect
22
+ Case=Acc: Accusative case
23
+ Case=Nom: Nominative case
24
+ Definite=Def: Definite article
25
+ Definite=Ind: Indefinite article
26
+ Degree=Cmp: Comparative degree
27
+ Degree=Pos: Positive degree
28
+ Degree=Sup: Superlative degree
29
+ Gender=Fem: Feminine gender
30
+ Gender=Masc: Masculine gender
31
+ Mood=Ind: Indicative mood
32
+ Number=Plur: Plural number
33
+ Number=Sing: Singular number
34
+ Person=1: First person
35
+ Person=2: Second person
36
+ Person=3: Third person
37
+ Tense=Past: Past tense
38
+ Tense=Pres: Present tense
39
+ VerbForm=Fin: Finite verb form
40
+ VerbForm=Inf: Infinitive verb form
utils/interp_space_utils.py ADDED
@@ -0,0 +1,638 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ import math
6
+ from collections import Counter, defaultdict
7
+ from typing import List, Any
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ import os
10
+ import pickle
11
+ import hashlib
12
+ import json
13
+ from gram2vec import vectorizer
14
+ from openai import OpenAI
15
+ from openai.lib._pydantic import to_strict_json_schema
16
+ from pydantic import BaseModel
17
+ from pydantic import ValidationError
18
+ import time
19
+ from utils.llm_feat_utils import generate_feature_spans_cached
20
+ from collections import Counter
21
+ import numpy as np
22
+ from sklearn.metrics.pairwise import cosine_similarity
23
+
24
# On-disk cache for embedding / feature computations, keyed by MD5 digests
# computed in the functions below.
CACHE_DIR = "datasets/embeddings_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
# Bump this whenever there is a change etc...
# NOTE(review): CACHE_VERSION is not folded into any cache digest in this
# module — confirm that bumping it actually invalidates old cache entries.
CACHE_VERSION = 1
28
+
29
class style_analysis_schema(BaseModel):
    # Structured-output schema for the combined LLM style analysis:
    # `features` lists feature names; `spans` maps author name →
    # feature → list of supporting text spans.
    features: list[str]
    spans: dict[str, dict[str, list[str]]]
32
+
33
class FeatureIdentificationSchema(BaseModel):
    # Structured-output schema for the feature-identification step only.
    features: list[str]
35
+
36
class SpanExtractionSchema(BaseModel):
    # Structured-output schema for the span-extraction step.
    spans: dict[str, dict[str, list[str]]] # {author_name: {feature: [spans]}}
38
+
39
+
40
+
41
def compute_g2v_features(clustered_authors_df: pd.DataFrame, task_authors_df: pd.DataFrame=None, text_clm='fullText') -> pd.DataFrame:
    """
    Computes gram2vec feature vectors for each author and adds them to the DataFrame.
    This effectively creates a mapping from each author to their vector.

    Args:
        clustered_authors_df: Background-corpus authors; `fullText` must hold
            a list of documents per author.
        task_authors_df: Optional task authors, vectorized together with the
            background corpus (prepended before vectorization, split off again
            before returning).
        text_clm: Column name used in the cache key.

    Returns:
        (background_vectors, task_vectors): lists of {feature_name: z-score}
        dicts; task_vectors is None when `task_authors_df` is not given.
    """
    if task_authors_df is not None:
        print (f"concatenating task authors and background corpus authors")
        print(f"Number of task authors: {len(task_authors_df)}")
        print(f"task authors author_ids: {task_authors_df.authorID.tolist()}")
        print(f"task authors -->")
        print(task_authors_df)
        print(f"Number of background corpus authors: {len(clustered_authors_df)}")
        clustered_authors_df = pd.concat([task_authors_df, clustered_authors_df])
        print(f"Number of authors after concatenation: {len(clustered_authors_df)}")

    # Join each author's documents into one blob for vectorization.
    author_texts = ['\n\n'.join(x) for x in clustered_authors_df.fullText.tolist()]

    print(f"Number of author_texts: {len(author_texts)}")

    # Create a reproducible JSON serialization of the texts
    serialized = json.dumps({
        "col": text_clm,
        "texts": author_texts
    }, sort_keys=True, ensure_ascii=False)

    # Compute MD5 hash
    digest = hashlib.md5(serialized.encode("utf-8")).hexdigest()
    cache_path = os.path.join(CACHE_DIR, f"{digest}.pkl")

    # If cache hit, load and return
    if os.path.exists(cache_path):
        print(f"Cache hit...")
        with open(cache_path, "rb") as f:
            clustered_authors_df = pickle.load(f)

    else: # Else compute and cache
        g2v_feats_df = vectorizer.from_documents(author_texts, batch_size=16)

        print(f"Number of g2v features: {len(g2v_feats_df)}")
        print(f"Number of clustered_authors_df.authorID.tolist(): {len(clustered_authors_df.authorID.tolist())}")
        print(f"Number of g2v_feats_df.to_numpy().tolist(): {len(g2v_feats_df.to_numpy().tolist())}")

        ids = clustered_authors_df.authorID.tolist()
        counter = Counter(ids)
        duplicates = [k for k, v in counter.items() if v > 1]

        print(f"Duplicate authorIDs: {duplicates}")
        print(f"Number of duplicates: {len(ids) - len(set(ids))}")

        # NOTE(review): duplicate authorIDs collapse to a single dict entry
        # here, which would misalign vectors with rows below — confirm IDs
        # are unique upstream.
        author_to_g2v_feats = {x[0]: x[1] for x in zip(clustered_authors_df.authorID.tolist(), g2v_feats_df.to_numpy().tolist())}

        print(f"Number of authors with g2v features: {len(author_to_g2v_feats)}")

        # Z-normalize each feature across authors; zero-variance features
        # get std forced to 1 to avoid division by zero.
        vector_std = np.std(list(author_to_g2v_feats.values()), axis=0)
        vector_mean = np.mean(list(author_to_g2v_feats.values()), axis=0)
        vector_std[vector_std == 0] = 1.0
        author_to_g2v_feats_z_normalized = {x[0]: (x[1] - vector_mean) / vector_std for x in author_to_g2v_feats.items()}

        print(f"Number of authors with g2v features normalized: {len(author_to_g2v_feats_z_normalized)}")
        print(f" len of clustered authors df: {len(clustered_authors_df)}")

        # Add the vectors as a new column of the DataFrame
        # (one {feature_name: z-score} dict per author).
        clustered_authors_df['g2v_vector'] = [{x[1]: x[0] for x in zip(val, g2v_feats_df.columns.tolist())}
                                              for val in author_to_g2v_feats_z_normalized.values()]

        with open(cache_path, "wb") as f:
            pickle.dump(clustered_authors_df, f)

    if task_authors_df is not None:
        # Split the task authors back out of the combined frame.
        task_authors_df = clustered_authors_df[clustered_authors_df.authorID.isin(task_authors_df.authorID.tolist())]
        clustered_authors_df = clustered_authors_df[~clustered_authors_df.authorID.isin(task_authors_df.authorID.tolist())]
        return clustered_authors_df['g2v_vector'].tolist(), task_authors_df['g2v_vector'].tolist()

    # Bug fix: this path previously subscripted task_authors_df (None) and
    # raised TypeError whenever no task authors were supplied.
    return clustered_authors_df['g2v_vector'].tolist(), None
119
+
120
+
121
def get_task_authors_from_background_df(background_df):
    """Return only the task-author rows (mystery + three candidates)."""
    task_ids = ["Q_author", "a0_author", "a1_author", "a2_author"]
    return background_df[background_df.authorID.isin(task_ids)]
124
+
125
def instance_to_df(instance, predicted_author=None, ground_truth_author=None):
    """
    Build a 4-row DataFrame (mystery + 3 candidates) from a task instance.

    `predicted` / `ground_truth` flags are set per candidate from the given
    0-based indices; the mystery row carries None for both.
    """
    names = ['Mystery author', 'Candidate Author 1', 'Candidate Author 2', 'Candidate Author 3']
    keys = ['Q_fullText', 'a0_fullText', 'a1_fullText', 'a2_fullText']

    rows = []
    for position, (name, key) in enumerate(zip(names, keys)):
        candidate_idx = position - 1  # -1 for the mystery author
        rows.append({
            'authorID': name,
            'fullText': instance[key],
            'predicted': None if position == 0 else predicted_author == candidate_idx,
            'ground_truth': None if position == 0 else ground_truth_author == candidate_idx,
        })
    task_authors_df = pd.DataFrame(rows)

    # When each author has a list of documents, collapse rows per author.
    if type(instance['Q_fullText']) == list:
        task_authors_df = task_authors_df.groupby('authorID').agg({'fullText': lambda x: list(x)}).reset_index()

    return task_authors_df
139
+
140
+
141
def generate_style_embedding(background_corpus_df: pd.DataFrame, text_clm: str, model_name: str) -> pd.DataFrame:
    """
    Generates style embeddings for documents in a background corpus using a specified model.
    If a row in `text_clm` contains a list of strings, the final embedding for that row
    is the average of the embeddings of all strings in the list.

    Args:
        background_corpus_df (pd.DataFrame): DataFrame containing the corpus.
        text_clm (str): Name of the column containing the text data (either string or list of strings).
        model_name (str): Name of the model to use for generating embeddings.

    Returns:
        pd.DataFrame: The input DataFrame with a new column for style embeddings,
            named '<model short name>_style_embedding'. If `model_name` is not in
            the supported list, the DataFrame is returned unchanged.
    """
    # Imported lazily so the module can be loaded without these heavy deps.
    from sentence_transformers import SentenceTransformer
    import torch

    # Only these checkpoints are supported; anything else is a no-op.
    if model_name not in [
        'gabrielloiseau/LUAR-MUD-sentence-transformers',
        'gabrielloiseau/LUAR-CRUD-sentence-transformers',
        'miladalsh/light-luar',
        'AnnaWegmann/Style-Embedding',

    ]:
        print('Model is not supported')
        return background_corpus_df

    print(f"Generating style embeddings using {model_name} on column '{text_clm}'...")

    model = SentenceTransformer(model_name)
    embedding_dim = model.get_sentence_embedding_dimension()

    # Heuristic to check if the column contains lists of strings by checking the first valid item.
    # This assumes the column is homogenous.
    is_list_column = False
    if not background_corpus_df.empty:
        # Get the first non-NaN value to inspect its type
        series_no_na = background_corpus_df[text_clm].dropna()
        if not series_no_na.empty:
            first_valid_item = series_no_na.iloc[0]
            if isinstance(first_valid_item, list):
                is_list_column = True

    if is_list_column:
        # Flatten all texts into a single list for batch processing
        texts_to_encode = []
        # Number of texts contributed by each row, in row order; used below
        # to slice the flat embedding matrix back into per-row groups.
        row_lengths = []
        for text_list in background_corpus_df[text_clm]:
            # Ensure we handle None, empty lists, or other non-list types gracefully
            if isinstance(text_list, list) and text_list:
                texts_to_encode.extend(text_list)
                row_lengths.append(len(text_list))
            else:
                row_lengths.append(0)

        if texts_to_encode:
            all_embeddings = model.encode(texts_to_encode, convert_to_tensor=True, show_progress_bar=True)
        else:
            # No usable text anywhere: empty (0, dim) tensor keeps slicing valid.
            all_embeddings = torch.empty((0, embedding_dim), device=model.device)

        # Reconstruct and average embeddings for each row
        final_embeddings = []
        current_pos = 0
        for length in row_lengths:
            if length > 0:
                row_embeddings = all_embeddings[current_pos:current_pos + length]
                avg_embedding = torch.mean(row_embeddings, dim=0)
                final_embeddings.append(avg_embedding.cpu().numpy())
                current_pos += length
            else:
                # Rows with no usable text get a zero vector of matching size.
                final_embeddings.append(np.zeros(embedding_dim))
    else:
        # Column contains single strings
        texts = background_corpus_df[text_clm].fillna("").tolist()
        # convert_to_tensor=False is faster if we just need numpy arrays
        embeddings = model.encode(texts, show_progress_bar=True)
        final_embeddings = list(embeddings)

    # Create a clean column name from the model name
    col_name = f'{model_name.split("/")[-1]}_style_embedding'
    background_corpus_df[col_name] = final_embeddings

    return background_corpus_df
224
+
225
# ── wrapper with caching ───────────────────────────────────────
def cached_generate_style_embedding(background_corpus_df: pd.DataFrame,
                                    text_clm: str,
                                    model_name: str) -> pd.DataFrame:
    """
    Caching wrapper around `generate_style_embedding`.

    Results are stored as pickles keyed by an MD5 of (model_name, column
    name, texts); on a cache hit the pickle is loaded instead of recomputing.
    """
    # Gather the input texts (preserves list-of-strings if any).
    texts = background_corpus_df[text_clm].fillna("").tolist()

    # Deterministic cache key over model + column + contents.
    payload = json.dumps({
        "model": model_name,
        "col": text_clm,
        "texts": texts
    }, sort_keys=True, ensure_ascii=False)
    cache_file = os.path.join(
        CACHE_DIR,
        hashlib.md5(payload.encode("utf-8")).hexdigest() + ".pkl"
    )

    if os.path.exists(cache_file):
        print(f"Cache hit for {model_name} on column '{text_clm}'")
        print(cache_file)
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    # Cache miss: compute, persist, return.
    df_with_emb = generate_style_embedding(background_corpus_df, text_clm, model_name)
    print(f"Computing embeddings for {model_name} on column '{text_clm}', saving to {cache_file}")
    with open(cache_file, "wb") as f:
        pickle.dump(df_with_emb, f)
    return df_with_emb
262
+
263
def get_style_feats_distribution(documentIDs, style_feats_dict):
    """
    Build a tf-idf-like weight vector over style features for a document set.

    For each feature key in `style_feats_dict`, the output element is
    (count of that feature across the selected documents) * (the dict's
    value for that feature), in the dict's iteration order.

    NOTE(review): this reads the global `document_to_style_feats`, which is
    not defined anywhere in this module — confirm it is injected elsewhere
    before calling, otherwise this raises NameError.
    """
    style_feats = []
    for documentId in documentIDs:
        # Documents with no recorded features are silently skipped.
        if documentId not in document_to_style_feats:
            #print(documentId)
            continue

        style_feats+= document_to_style_feats[documentId]

    # Frequency of each feature weighted by its per-feature value.
    tfidf = [style_feats.count(key) * val for key, val in style_feats_dict.items()]

    return tfidf
275
+
276
def get_cluster_top_feats(style_feats_distribution, style_feats_list, top_k=5):
    """Return the names of the top-k features with strictly positive weight,
    highest weight first."""
    ranked = np.argsort(style_feats_distribution)[::-1][:top_k]
    return [style_feats_list[idx] for idx in ranked if style_feats_distribution[idx] > 0]
280
+
281
def compute_clusters_style_representation(
    background_corpus_df: pd.DataFrame,
    cluster_ids: List[Any],
    other_cluster_ids: List[Any],
    features_clm_name: str,
    cluster_label_clm_name: str = 'cluster_label',
    top_n: int = 10
) -> List[str]:
    """
    Rank the features most characteristic of the target clusters.

    TF-IDF is computed over the whole corpus, treating each document's
    stored feature list as already-tokenised input. A feature's score is
    its summed TF-IDF across documents in `cluster_ids`, minus the summed
    TF-IDF across documents in `other_cluster_ids` (when given), so
    features prominent in the contrast clusters are down-weighted.

    Parameters:
    - background_corpus_df: must contain `cluster_label_clm_name` and
      `features_clm_name`; the latter holds lists of feature strings.
    - cluster_ids: target clusters whose representative features are wanted.
    - other_cluster_ids: contrast clusters; pass [] or None to skip.
    - features_clm_name: column with each document's feature list.
    - cluster_label_clm_name: column with the cluster labels.
    - top_n: maximum number of features to return.

    Returns:
    - Up to `top_n` feature names with a strictly positive adjusted score,
      best first. Empty list when no document matches `cluster_ids`.
    """
    assert background_corpus_df[features_clm_name].apply(
        lambda x: isinstance(x, list) and all(isinstance(feat, str) for feat in x)
    ).all(), f"Column '{features_clm_name}' must contain lists of strings."

    # Identity tokenizer/preprocessor: feature lists are the tokens.
    # (Local name also avoids shadowing the module-level gram2vec vectorizer.)
    tfidf_vec = TfidfVectorizer(
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        token_pattern=None
    )
    tfidf_matrix = tfidf_vec.fit_transform(background_corpus_df[features_clm_name])
    feature_names = tfidf_vec.get_feature_names_out()

    # Rows belonging to the target clusters.
    target_mask = background_corpus_df[cluster_label_clm_name].isin(cluster_ids).to_numpy()
    if not target_mask.any():
        return []

    # Per-feature TF-IDF mass inside the target clusters.
    scores = tfidf_matrix[target_mask].sum(axis=0).A1.copy()

    # Penalise features that are also prominent in the contrast clusters.
    if other_cluster_ids:
        contrast_mask = background_corpus_df[cluster_label_clm_name].isin(other_cluster_ids).to_numpy()
        if contrast_mask.any():
            scores -= tfidf_matrix[contrast_mask].sum(axis=0).A1

    ranked = sorted(zip(feature_names, scores), key=lambda item: item[1], reverse=True)
    return [name for name, score in ranked if score > 0][:top_n]
364
+
365
def compute_clusters_style_representation_2(
    background_corpus_df: pd.DataFrame,
    cluster_ids: List[Any],
    cluster_label_clm_name: str = 'cluster_label',
    max_num_feats: int = 5,
    max_num_documents_per_author=3,
    max_num_authors=5):
    """
    Call OpenAI to analyze the common writing style features of the texts of
    authors whose cluster label is in `cluster_ids`.

    Responses are cached on disk under CACHE_DIR, keyed by an MD5 of the exact
    prompt, so repeated calls with identical inputs skip the API.

    Returns the parsed JSON response shaped by `style_analysis_schema`.
    """
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # FIX: work on a copy — the original overwrote 'fullText' on the caller's
    # DataFrame in place, silently corrupting the corpus for later callers.
    background_corpus_df = background_corpus_df.copy()
    # Collapse each author's document list into one blob (capped per author).
    background_corpus_df['fullText'] = background_corpus_df['fullText'].map(
        lambda x: '\n\n'.join(x[:max_num_documents_per_author]) if isinstance(x, list) else x)
    background_corpus_df = background_corpus_df[background_corpus_df[cluster_label_clm_name].isin(cluster_ids)]

    author_texts = background_corpus_df['fullText'].tolist()[:max_num_authors]
    author_texts = "\n\n".join(["""Author {}:\n""".format(i+1) + text for i, text in enumerate(author_texts)])
    author_names = background_corpus_df[cluster_label_clm_name].tolist()[:max_num_authors]
    print(f"Number of authors: {len(background_corpus_df)}")
    print(author_names)

    prompt = f"""First identify a list of {max_num_feats} writing style features that are common between the given texts. Second for every author text and style feature, extract all spans that represent the feature. Output for every author all style features with their spans.
Author Texts:
\"\"\"{author_texts}\"\"\"
"""

    # Cache key: MD5 of the exact prompt text (so any prompt change busts it).
    digest = hashlib.md5(prompt.encode("utf-8")).hexdigest()
    cache_path = os.path.join(CACHE_DIR, f"{digest}.pkl")

    if os.path.exists(cache_path):
        print(f"Loading authors writing style from cache ...")
        with open(cache_path, "rb") as f:
            parsed_response = pickle.load(f)
    else:
        # FIX: persona message uses role "system"; the original sent it with
        # role "assistant", which the API treats as prior model output.
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a forensic linguistic who knows how to analyze similarites in writing styles."},
                {"role": "user", "content": prompt}],
            response_format={"type": "json_schema", "json_schema": {"name": "style_analysis_schema", "schema": to_strict_json_schema(style_analysis_schema)}}
        )
        parsed_response = json.loads(response.choices[0].message.content)
        with open(cache_path, "wb") as f:
            pickle.dump(parsed_response, f)

    return parsed_response
420
+
421
def identify_style_features(author_texts: list[str], max_num_feats: int = 5) -> list[str]:
    """
    Ask OpenAI to name `max_num_feats` writing style features shared across
    the given texts (names only, no span extraction).

    Args:
        author_texts: one text per author; joined with newlines into the prompt.
        max_num_feats: number of feature names requested.

    Returns:
        The validated list of feature names (via `retry_call` and
        `FeatureIdentificationSchema`).
    """
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    prompt = f"""Identify {max_num_feats} writing style features that are commonly found across the following texts. Do not extract spans. Just return the feature names as a list.
Author Texts:
\"\"\"{chr(10).join(author_texts)}\"\"\"
"""

    def _make_call():
        # One API attempt; retry/validation policy lives in retry_call.
        # FIX: persona message uses role "system" (was "assistant", which the
        # API interprets as prior model output rather than an instruction).
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a forensic linguist specializing in writing styles."},
                {"role": "user", "content": prompt}
            ],
            response_format={
                "type": "json_schema",
                "json_schema": {
                    "name": "FeatureIdentificationSchema",
                    "schema": to_strict_json_schema(FeatureIdentificationSchema)
                }
            }
        )
        return json.loads(response.choices[0].message.content)

    return retry_call(_make_call, FeatureIdentificationSchema).features
446
+
447
def retry_call(call_fn, schema_class, max_attempts=3, wait_sec=2):
    """
    Run `call_fn`, validate its dict result by constructing `schema_class`
    (a pydantic model), and return the validated instance.

    Retries up to `max_attempts` times on validation/JSON/key errors,
    sleeping `wait_sec` seconds between attempts.

    Raises:
        RuntimeError: chained to the last underlying error, when all
        attempts fail.
    """
    last_err = None
    for attempt in range(max_attempts):
        try:
            result = call_fn()
            return schema_class(**result)
        except (ValidationError, KeyError, json.JSONDecodeError) as e:
            last_err = e
            print(f"Attempt {attempt + 1} failed with error: {e}")
            # FIX: do not sleep after the final attempt (the original always
            # slept, adding a pointless delay before raising).
            if attempt < max_attempts - 1:
                time.sleep(wait_sec)
    # FIX: chain the cause so callers can see why every attempt failed.
    raise RuntimeError("All retry attempts failed for OpenAI call.") from last_err
458
+
459
def extract_all_spans(authors_df: pd.DataFrame, features: list[str], cluster_label_clm_name: str = 'authorID') -> dict[str, dict[str, list[str]]]:
    """
    Collect, per author, the text spans illustrating each style feature.

    Delegates every row of `authors_df` to `generate_feature_spans_cached`
    and assembles the results as {author_name: {feature: [spans]}}.
    """
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    spans_by_author: dict[str, dict[str, list[str]]] = {}
    for _, author_row in authors_df.iterrows():
        author_name = str(author_row[cluster_label_clm_name])
        print(author_name)
        role = f"{author_name}"
        spans_by_author[author_name] = generate_feature_spans_cached(
            client, author_row['fullText'], features, role)

    return spans_by_author
477
+
478
def compute_clusters_style_representation_3(
    background_corpus_df: pd.DataFrame,
    cluster_ids: List[Any],
    cluster_label_clm_name: str = 'authorID',
    max_num_feats: int = 5,
    max_num_documents_per_author=3,
    max_num_authors=5
):
    """
    Two-step style analysis:
      1. identify up to `max_num_feats` shared style features from at most
         `max_num_authors` authors whose label is in `cluster_ids`;
      2. extract the spans exhibiting those features for the first 7 authors
         of the corpus.

    Returns:
        {"features": [...], "spans": {author: {feature: [spans]}}}
    """
    print(f"Computing style representation for visible clusters: {len(cluster_ids)}")

    # FIX: work on a copy — the original overwrote 'fullText' on the caller's
    # DataFrame in place.
    background_corpus_df = background_corpus_df.copy()
    background_corpus_df['fullText'] = background_corpus_df['fullText'].map(
        lambda x: '\n\n'.join(x[:max_num_documents_per_author]) if isinstance(x, list) else x)

    # STEP 1: identify features on the visible authors.
    feat_df = background_corpus_df[background_corpus_df[cluster_label_clm_name].isin(cluster_ids)]
    author_texts = feat_df['fullText'].tolist()[:max_num_authors]
    # FIX: keep per-author texts as a *list* — identify_style_features expects
    # list[str]; the original passed one pre-joined string, which its
    # chr(10).join(...) then split character-by-character in the prompt.
    author_texts = ["""Author {}:\n""".format(i+1) + text for i, text in enumerate(author_texts)]
    author_names = feat_df[cluster_label_clm_name].tolist()[:max_num_authors]
    print(f"Number of authors: {len(author_names)}")
    print(author_names)
    features = identify_style_features(author_texts, max_num_feats=max_num_feats)

    # STEP 2: span extraction on a small author pool (first 7 rows of the corpus).
    span_df = background_corpus_df.iloc[:7]
    print(f"Number of authors for span detection : {len(span_df)}")
    print(span_df[cluster_label_clm_name].tolist()[:7])
    spans_by_author = extract_all_spans(span_df, features, cluster_label_clm_name)

    return {
        "features": features,
        "spans": spans_by_author
    }
514
+
515
+
516
def compute_clusters_g2v_representation(
    background_corpus_df: pd.DataFrame,
    author_ids: List[Any],
    other_author_ids: List[Any],
    features_clm_name: str,
    top_n: int = 10
) -> List[str]:
    """
    Rank Gram2Vec features that score high, on average, for `author_ids`
    relative to the rest of the corpus.

    Each entry of `background_corpus_df[features_clm_name]` must be a dict
    {feature_name: value}. A feature's score is
    mean(value over selected authors) - mean(value over all other authors);
    the names of the `top_n` highest-scoring features are returned
    (empty list when no rows match `author_ids`).

    NOTE(review): `other_author_ids` is accepted but unused — the contrast
    set is *everything* outside `author_ids`. Kept as-is for interface
    compatibility; confirm whether it should restrict the contrast set.
    """
    selected_mask = background_corpus_df['authorID'].isin(author_ids).to_numpy()
    if not selected_mask.any():
        return []  # no documents for the requested authors

    selected_dicts = background_corpus_df[selected_mask][features_clm_name].tolist()
    feature_names = list(selected_dicts[0].keys())
    # FIX: index every dict by feature name so rows whose dicts have a
    # different key order still align (the original zipped raw .values()).
    selected_mean = np.array([[d[f] for f in feature_names] for d in selected_dicts]).mean(axis=0)

    other_dicts = background_corpus_df[~selected_mask][features_clm_name].tolist()
    other_mean = np.array([[d[f] for f in feature_names] for d in other_dicts]).mean(axis=0)

    scores = selected_mean - other_mean

    ranked = sorted(zip(feature_names, scores), key=lambda x: -x[1])
    print(ranked[:top_n])

    return [name for name, _ in ranked[:top_n]]
547
+
548
+
549
def generate_interpretable_space_representation(interp_space_path, styles_df_path, feat_clm, output_clm, num_feats=5):
    """
    Build a per-cluster TF-IDF representation over style features.

    Reads a CSV of (feature, documentID) rows and a pickled clustering
    DataFrame, maps clusters -> documents -> features, then scores each
    feature per cluster with log(1+count) * IDF and keeps the top
    `num_feats` names.

    Returns the cluster DataFrame with two added columns:
    `output_clm + '_dist'` (feature -> TF-IDF score dict) and
    `output_clm` (top-k feature names).
    """
    styles_df = pd.read_csv(styles_df_path)[[feat_clm, "documentID"]]

    # A dictionary of style features and their IDF
    # (document_freq = number of rows mentioning the feature; IDF = log(N/df)).
    style_feats_agg_df = styles_df.groupby(feat_clm).agg({'documentID': lambda x : len(list(x))}).reset_index()
    style_feats_agg_df['document_freq'] = style_feats_agg_df.documentID
    style_to_feats_dfreq = {x[0]: math.log(styles_df.documentID.nunique()/x[1]) for x in zip(style_feats_agg_df[feat_clm].tolist(), style_feats_agg_df.document_freq.tolist())}

    # A list of style features we work with
    style_feats_list = style_feats_agg_df[feat_clm].tolist()
    print('Number of style feats ', len(style_feats_list))

    # A list of documents and what list of style features each has
    doc_style_agg_df = styles_df.groupby('documentID').agg({feat_clm: lambda x : list(x)}).reset_index()
    document_to_feats_dict = {x[0]: x[1] for x in zip(doc_style_agg_df.documentID.tolist(), doc_style_agg_df[feat_clm].tolist())}

    # Load the clustering information (-1 = noise cluster, dropped).
    df = pd.read_pickle(interp_space_path)
    df = df[df.cluster_label != -1]
    # A cluster to list of documents (each row's documentID is itself a list).
    clusterd_df = df.groupby('cluster_label').agg({
        'documentID': lambda x: [d_id for doc_ids in x for d_id in doc_ids]
    }).reset_index()

    # Filter-in only documents that has a style description
    clusterd_df['documentID'] = clusterd_df.documentID.apply(lambda documentIDs: [documentID for documentID in documentIDs if documentID in document_to_feats_dict])
    # Map from cluster label to list of features through the document information
    clusterd_df[feat_clm] = clusterd_df.documentID.apply(lambda doc_ids: [f for d_id in doc_ids for f in document_to_feats_dict[d_id]])

    def compute_tfidf(row):
        # Feature -> log(1+count) * IDF; unseen features score 0.
        style_counts = Counter(row[feat_clm])
        total_num_styles = sum(style_counts.values())
        style_distribution = {
            style: math.log(1+count) * style_to_feats_dfreq[style] if style in style_to_feats_dfreq else 0 for style, count in style_counts.items()
        } #TF-IDF
        return style_distribution

    def create_tfidf_rep(tfidf_dist, num_feats):
        # Top-k feature names by score, skipping NaN feature names.
        style_feats = sorted(tfidf_dist.items(), key=lambda x: -x[1])
        top_k_feats = [x[0] for x in style_feats[:num_feats] if str(x[0]) != 'nan']
        return top_k_feats

    clusterd_df[output_clm +'_dist'] = clusterd_df.apply(lambda row: compute_tfidf(row), axis=1)
    clusterd_df[output_clm] = clusterd_df[output_clm +'_dist'].apply(lambda dist: create_tfidf_rep(dist, num_feats))

    return clusterd_df
601
+
602
def compute_predicted_author(task_authors_df: pd.DataFrame, col_name: str) -> int:
    """
    Predict which candidate author wrote the mystery text.

    Row 0 of `task_authors_df` holds the mystery author's embedding in
    column `col_name`; every following row is a candidate.

    Returns:
        The 0-based index of the candidate whose embedding has the highest
        cosine similarity to the mystery embedding.
    """
    print("Computing predicted author using LUAR-MUD-style embeddings...")

    mystery_embedding = np.asarray(task_authors_df.iloc[0][col_name], dtype=float).ravel()
    # Generalized: use every row after the mystery row as a candidate
    # (the original hard-coded exactly rows 1..3).
    candidate_embeddings = np.array(
        [np.asarray(v, dtype=float).ravel() for v in task_authors_df[col_name].iloc[1:]])

    # Cosine similarity in plain numpy (sklearn not needed for one query);
    # zero-norm vectors get similarity 0 instead of a divide-by-zero.
    denom = np.linalg.norm(mystery_embedding) * np.linalg.norm(candidate_embeddings, axis=1)
    similarities = candidate_embeddings @ mystery_embedding / np.where(denom == 0, 1.0, denom)

    predicted_author = int(np.argmax(similarities))
    print(f"Predicted author is Candidate {predicted_author + 1}")

    return predicted_author
622
+
623
+
624
if __name__ == "__main__":
    # Smoke test: load the clustered background corpus and inspect it, then
    # generate style embeddings for its texts.
    # NOTE(review): `generate_style_embedding` is defined elsewhere in this
    # module — presumably it adds an embedding column in place; confirm.
    background_corpus = pd.read_pickle('../datasets/luar_interp_space_cluster_19/train_authors.pkl')
    print(background_corpus.columns)
    print(background_corpus[['authorID', 'fullText', 'cluster_label']].head())
    # # Example: Find features for clusters [2,3,4] that are NOT prominent in cluster [1]
    # feats = compute_clusters_style_representation(
    #     background_corpus_df=background_corpus,
    #     cluster_ids=['00005a5c-5c06-3a36-37f9-53c6422a31d8',],
    #     other_cluster_ids=[],  # Pass the contrastive cluster IDs here
    #     cluster_label_clm_name='authorID',
    #     features_clm_name='final_attribute_name'
    # )
    # print(feats)
    generate_style_embedding(background_corpus, 'fullText', 'AnnaWegmann/Style-Embedding')
    print(background_corpus.columns)
utils/llm_feat_utils.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
import hashlib
import time
from json import JSONDecodeError

# On-disk cache for per-feature span extractions
# (used by generate_feature_spans_cached below).
CACHE_DIR = "datasets/feature_spans_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
# NOTE(review): mid-file import — conventionally belongs at the top of the file.
import pandas as pd

# read and create the Gram2Vec feature set once, at import time
_g2v_df = pd.read_csv("datasets/gram2vec_feats.csv")
GRAM2VEC_SET = set(_g2v_df['gram2vec_feats'].unique())
# Retry policy for the OpenAI span-extraction calls.
MAX_ATTEMPTS = 3
WAIT_SECONDS = 2

# Bump this whenever there is a change prompt, feature space, etc...
CACHE_VERSION = 2
19
+
20
def _feat_hash(feature: str, text: str) -> str:
    """
    Stable MD5 cache key for one (feature, text) pair.

    CACHE_VERSION is folded into the key so bumping it invalidates all
    existing entries.
    """
    # BUG FIX: the original hashed sorted(feature) — i.e. the *characters* of
    # the feature name, sorted — so any two feature names that are anagrams
    # of each other collided on the same cache entry. Hash the feature string
    # itself. (Bump CACHE_VERSION when deploying so stale entries keyed by
    # the old scheme are not reused.)
    blob = json.dumps({
        "version": CACHE_VERSION,
        "text": text,
        "feature": feature
    }, sort_keys=True).encode()
    return hashlib.md5(blob).hexdigest()
27
+
28
+
29
def generate_feature_spans(client, text: str, features: list[str]) -> str:
    """
    Single OpenAI call that extracts, for each feature, the exact text spans
    demonstrating it. Returns the raw response content (a JSON string).
    """
    # FIX: docstring moved above the print — in the original the print came
    # first, so the triple-quoted string below it was a stray expression, not
    # a docstring.
    print("Calling OpenAI to extract spans")
    prompt = f"""You are a linguistic specialist. Given a writing sample and a list of descriptive features, identify the exact text spans that demonstrate each feature.

Important:
- The headers like "Document 1:" etc are NOT part of the original text — ignore them.
- For each feature, even if there is no match, return an empty list.
- Only return exact phrases from the text.

Respond in JSON format like:
{{
  "feature1": ["span1", "span2"],
  "feature2": [],
  …
}}

Text:
\"\"\"{text}\"\"\"

Style Features:
{features}
"""
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role":"user","content":prompt}],
        temperature=0.3,
    )
    return response.choices[0].message.content
60
+
61
def generate_feature_spans_with_retries(client, text: str, features: list[str]) -> dict:
    """
    Call `generate_feature_spans` with exponential-backoff retries.

    Returns the parsed JSON dict mapping feature -> list of spans; raises
    RuntimeError when every attempt fails to produce valid JSON.
    """
    attempt = 0
    while attempt < MAX_ATTEMPTS:
        try:
            raw = generate_feature_spans(client, text, features)
            return json.loads(raw)
        except (JSONDecodeError, ValueError) as err:
            print(f"Attempt {attempt+1} failed: {err}")
            if attempt < MAX_ATTEMPTS - 1:
                wait_sec = WAIT_SECONDS * (2 ** attempt)
                print(f"Retrying after {wait_sec} seconds...")
                time.sleep(wait_sec)
        attempt += 1
    raise RuntimeError("All retry attempts failed for OpenAI call.")
78
+
79
+
80
def generate_feature_spans_cached(client, text: str, features: list[str], role: str = "mystery" ) -> dict:
    """
    Per-feature cached span extraction.

    Loads a per-`role` JSON cache keyed by `_feat_hash(feature, text)`;
    features missing from the cache are fetched in one API call, merged in,
    and the cache is written back. Returns {feature: [spans]}.
    """
    print(f"Generating spans for ({role})")
    # Sanitize the role so it is a safe file name.
    role = role.replace(" ", "_").replace("/", "_").replace("-", "_")
    print(f"Cache dir: {CACHE_DIR}")
    os.makedirs(CACHE_DIR, exist_ok=True)
    cache_path = os.path.join(CACHE_DIR, f"{role}.json")
    if os.path.exists(cache_path):
        with open(cache_path) as f:
            cache: dict[str, dict] = json.load(f)
    else:
        cache = {}
    result: dict[str, list[str]] = {}
    missing_feats: list[str] = []

    for feat in features:
        if feat == "None":
            # Sentinel feature name — nothing to extract.
            result[feat] = []
            continue

        h = _feat_hash(feat, text)
        if h in cache:
            result[feat] = cache[h]["spans"]
        else:
            missing_feats.append(feat)

    if missing_feats:
        mapping = generate_feature_spans_with_retries(client, text, missing_feats)
        # Update cache & result for each missing feature.
        for feat in missing_feats:
            h = _feat_hash(feat, text)
            # BUG FIX: default to [] — the original used mapping.get(feat),
            # which cached and returned None whenever the model omitted a
            # feature, breaking the "feature -> list of spans" contract.
            spans = mapping.get(feat) or []
            cache[h] = {
                "feature": feat,
                "spans": spans
            }
            result[feat] = spans

        # Write back the combined cache (only when something changed).
        with open(cache_path, "w") as f:
            json.dump(cache, f, indent=2)
    return result
128
+
129
+
130
def split_features(all_feats):
    """
    Partition a mixed feature list into two lists:
      - llm_feats: features NOT present in the Gram2Vec CSV
      - g2v_feats: features present in the CSV
    """
    llm_feats, g2v_feats = [], []
    for feat in all_feats:
        target = g2v_feats if feat in GRAM2VEC_SET else llm_feats
        target.append(feat)
    return llm_feats, g2v_feats
utils/ui.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from utils.visualizations import load_instance, get_instances, clean_text
4
+ from utils.interp_space_utils import cached_generate_style_embedding, instance_to_df, compute_g2v_features, compute_predicted_author
5
+
6
+
7
# ── Global CSS to be prepended to every block ─────────────────────────────────
# Injected ahead of each HTML blob by styled_block(); styles field labels,
# radio-option labels, and HTML output borders inside the gradio container.
GLOBAL_CSS = """
<style>
/* Bold only the top‐level field labels (not every label) */
.gradio-container .input_label {
  font-weight: 600 !important;
  font-size: 1.1em !important;
}

/* Reset radio‐option labels to normal weight/size */
.gradio-container .radio-container .radio-option-label {
  font-weight: normal !important;
  font-size: 1em !important;
}
/* Give HTML output blocks a stronger border and padding */
.gradio-container .output-html {
  border: 2px solid #888 !important;
  border-radius: 4px !important;
  padding: 0.5em !important;
  margin-bottom: 1em !important;
  font-size: 1em !important;
  line-height: 1.4 !important;
}
</style>
"""
32
+
33
def styled_block(content: str) -> str:
    """
    Prepend the shared GLOBAL_CSS to `content`.

    Returns one HTML blob that can be passed straight to gr.HTML().
    """
    return "\n".join([GLOBAL_CSS, content])
39
+
40
def styled_html(html_content: str) -> str:
    """Wrap raw HTML content with the global CSS; hand the result to gr.HTML()."""
    styled = styled_block(html_content)
    return styled
45
+
46
def instruction_callout(text: str) -> str:
    """
    Returns a full HTML string (with global CSS) rendering `text`
    as a bold, full-width callout box.

    Usage:
        gr.HTML(instruction_callout(
            "Run visualization to see which author cluster contains the mystery document."
        ))
    """
    # Light-blue box with a bold left accent stripe; styled_html prepends
    # the shared GLOBAL_CSS.
    callout = f"""
    <div style="
        background: #e3f2fd;           /* light blue background */
        border-left: 5px solid #2196f3; /* bold accent stripe */
        padding: 12px 16px;
        margin-bottom: 12px;
        font-weight: 600;
        font-size: 1.1em;
    ">
      {text}
    </div>
    """
    return styled_html(callout)
69
+
70
def read_txt(f):
    """
    Read an uploaded text file (a gradio file object or a plain path string).

    Returns "" when no file was supplied, the stripped file contents on
    success, or a placeholder string when the file cannot be read.
    """
    if not f:
        return ""
    # gradio file objects expose the on-disk path via .name
    path = getattr(f, 'name', f)
    try:
        with open(path, 'r', encoding='utf-8') as fh:
            content = fh.read()
    except Exception:
        # Best effort: show a readable placeholder instead of raising.
        return "(Could not read file)"
    return content.strip()
79
+
80
+ # Toggle which input UI is visible
81
+ def toggle_task(mode):
82
+ print(mode)
83
+ return (
84
+ gr.update(visible=(mode == "Predefined HRS Task")),
85
+ gr.update(visible=(mode == "Upload Your Own Task"))
86
+ )
87
+
88
+ # Update displayed texts based on mode
89
def update_task_display(mode, iid, instances, background_df, mystery_file, cand1_file, cand2_file, cand3_file, true_author, model_radio, custom_model_input):
    """
    Rebuild the full task view after the user changes the task or model.

    Depending on `mode`, either loads a predefined instance from `instances`
    or reads the four uploaded files; then generates style embeddings and
    Gram2Vec features for the task authors and the background corpus,
    predicts the author, and renders the HTML panels.

    Returns a list: header HTML, mystery HTML, three candidate HTMLs, the
    four raw texts, the task-authors DataFrame, the re-embedded background
    DataFrame, the predicted author index, and the ground-truth author index.
    """
    # "Other" routes to the free-text model-name box.
    model_name = model_radio if model_radio != "Other" else custom_model_input
    if mode == "Predefined HRS Task":
        # Dropdown labels look like "Task 3" — strip the prefix to index.
        iid = int(iid.replace('Task ', ''))
        data = instances[iid]
        # NOTE(review): this value is overwritten below by
        # compute_predicted_author — confirm which prediction should win.
        predicted_author = data['latent_rank'][0]
        ground_truth_author = data['gt_idx']
        mystery_txt = data['Q_fullText']
        c1_txt = data['a0_fullText']
        c2_txt = data['a1_fullText']
        c3_txt = data['a2_fullText']
        candidate_texts = [c1_txt, c2_txt, c3_txt]

        #create a dataframe of the task authors
        task_authors_df = instance_to_df(instances[iid])
        print(f"\n\n\n ----> Loaded task {iid} with {len(task_authors_df)} authors\n\n\n")
        print(task_authors_df)
    else:
        # NOTE(review): this header is overwritten by task_HTML below.
        header_html = "<h3>Custom Uploaded Task</h3>"
        mystery_txt = read_txt(mystery_file)
        c1_txt = read_txt(cand1_file)
        c2_txt = read_txt(cand2_file)
        c3_txt = read_txt(cand3_file)
        candidate_texts = [c1_txt, c2_txt, c3_txt]
        ground_truth_author = true_author
        print(f"Ground truth author: {ground_truth_author} ; {true_author}")
        custom_task_instance = {
            'Q_fullText': mystery_txt,
            'a0_fullText': c1_txt,
            'a1_fullText': c2_txt,
            'a2_fullText': c3_txt
        }
        task_authors_df = instance_to_df(custom_task_instance)
        print(task_authors_df)

    # Embed the task authors with the chosen model (disk-cached).
    print(f"Generating embeddings for {model_name} on task authors")
    task_authors_df = cached_generate_style_embedding(task_authors_df, 'fullText', model_name)
    print("Task authors after embedding generation:")
    print(task_authors_df)
    # Generate the new embedding of all the background_df authors
    print(f"Generating embeddings for {model_name} on background corpus")
    background_df = cached_generate_style_embedding(background_df, 'fullText', model_name)
    print(f"Generated embeddings for {len(background_df)} texts using model '{model_name}'")

    # computing g2v features
    print("Generating g2v features for on background corpus")
    background_g2v, task_authors_g2v = compute_g2v_features(background_df, task_authors_df)
    background_df['g2v_vector'] = background_g2v
    task_authors_df['g2v_vector'] = task_authors_g2v
    print(f"Gram2Vec feature generation complete")

    print(background_df.columns)

    # Computing predicted author by checking pairwise cosine similarity over luar embeddings
    col_name = f'{model_name.split("/")[-1]}_style_embedding'
    predicted_author = compute_predicted_author(task_authors_df, col_name)

    #generating html for the task
    header_html, mystery_html, candidate_htmls = task_HTML(mystery_txt, candidate_texts, predicted_author, ground_truth_author)

    return [
        header_html,
        mystery_html,
        candidate_htmls[0],
        candidate_htmls[1],
        candidate_htmls[2],
        mystery_txt,
        c1_txt,
        c2_txt,
        c3_txt,
        task_authors_df,
        background_df,
        predicted_author,
        ground_truth_author
    ]
164
+
165
def task_HTML(mystery_text, candidate_texts, predicted_author, ground_truth_author):
    """
    Render the task panels as HTML.

    Returns (header_html, mystery_html, candidate_htmls): the predicted
    candidate gets a green highlight; the true author gets an orange
    highlight only when the prediction is wrong.
    """
    header_html = f"""
    <div style="border:1px solid #ccc; padding:10px; margin-bottom:10px;">
        <h3>Here’s the mystery passage alongside three candidate texts—look for the green highlight to see the predicted author.</h3>
    </div>
    """
    # mystery_text = clean_text(mystery_text)
    mystery_html = f"""
    <div style="
        border: 2px solid #ff5722;   /* accent border */
        background: #fff3e0;         /* very light matching wash */
        border-radius: 6px;
        padding: 1em;
        margin-bottom: 1em;
    ">
      <h3 style="margin-top:0; color:#bf360c;">Mystery Author</h3>
      <p>{clean_text(mystery_text)}</p>
    </div>
    """

    print(f"Predicted author: {predicted_author}, Ground truth author: {ground_truth_author}")

    # Candidate boxes
    candidate_htmls = []
    for i in range(3):
        text = candidate_texts[i]
        title = f"Candidate {i+1}"
        extra_style = ""

        if ground_truth_author == i:
            if ground_truth_author != predicted_author: # highlight the true author only if its different than the predictd one
                title += " (True Author)"
                extra_style = (
                    "border: 2px solid #ff5722; "
                    "background: #fff3e0; "
                    "padding:10px; "
                )


        if predicted_author == i:
            if predicted_author == ground_truth_author:
                title += " (Predicted and True Author)"
            else:
                title += " (Predicted Author)"
            extra_style = (
                "border:2px solid #228B22; "       # dark green border
                "background-color: #e6ffe6; "      # light green fill
                "padding:10px; "
            )


        candidate_htmls.append(f"""
        <div style="border:1px solid #ccc; padding:10px; {extra_style}">
          <h4>{title}</h4>
          <p>{clean_text(text)}</p>
        </div>
        """)
    return header_html, mystery_html, candidate_htmls
223
+
224
def toggle_custom_model(choice):
    """Show the free-text model-name input only when 'Other' is selected."""
    show_custom = choice == "Other"
    return gr.update(visible=show_custom)
utils/visualizations.py ADDED
@@ -0,0 +1,564 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import numpy as np
4
+ from sklearn.manifold import TSNE
5
+ import pickle as pkl
6
+ import os
7
+ import hashlib
8
+ import pandas as pd
9
+ import plotly.graph_objects as go
10
+ from plotly.colors import sample_colorscale
11
+ from gradio import update
12
+ import re
13
+ from utils.interp_space_utils import compute_clusters_style_representation_3, compute_clusters_g2v_representation
14
+ from utils.llm_feat_utils import split_features
15
+ from utils.gram2vec_feat_utils import get_shorthand, get_fullform
16
+
17
+ import plotly.io as pio
18
+
19
def clean_text(text: str) -> str:
    """
    Escape angle brackets for safe HTML display and turn newlines into <br>.
    """
    # Order matters: escape < and > first, then introduce the <br> tags.
    for old, new in (('<', '&lt;'), ('>', '&gt;'), ('\n', '<br>')):
        text = text.replace(old, new)
    return text
24
+
25
def get_instances(instances_to_explain_path: str = 'datasets/instances_to_explain.json'):
    """
    Loads the JSON and returns:
      - instances_to_explain: the raw dict/list of instances
      - instance_ids: list of keys (if dict) or indices (if list)
    """
    # FIX: use a context manager — the original json.load(open(...)) left the
    # file handle open until garbage collection.
    with open(instances_to_explain_path) as f:
        instances_to_explain = json.load(f)
    if isinstance(instances_to_explain, dict):
        instance_ids = list(instances_to_explain.keys())
    else:
        instance_ids = list(range(len(instances_to_explain)))
    return instances_to_explain, instance_ids
37
+
38
def load_instance(instance_id, instances_to_explain: dict):
    """
    Given a selected instance_id and the loaded data, build the display HTML.

    Returns (header_html, mystery_html, c0_html, c1_html, c2_html). The
    predicted candidate is highlighted green; the true author is highlighted
    orange only when the prediction is wrong.
    """
    # normalize instance_id (the dropdown may hand us a string)
    try:
        iid = int(instance_id)
    except ValueError:
        iid = instance_id
    data = instances_to_explain[iid]

    predicted_author = data['latent_rank'][0]
    ground_truth_author = data['gt_idx']

    header_html = f"""
    <div style="border:1px solid #ccc; padding:10px; margin-bottom:10px;">
        <h3>Here’s the mystery passage alongside three candidate texts—look for the green highlight to see the predicted author.</h3>
    </div>
    """
    # BUG FIX: the original cleaned the text here AND again inside the
    # f-string below; the second pass re-escaped the first pass's <br> tags
    # into a literal "&lt;br&gt;" shown to the user. Clean exactly once.
    mystery_text = data['Q_fullText']
    mystery_html = f"""
    <div style="
        border: 2px solid #ff5722;   /* accent border */
        background: #fff3e0;         /* very light matching wash */
        border-radius: 6px;
        padding: 1em;
        margin-bottom: 1em;
    ">
      <h3 style="margin-top:0; color:#bf360c;">Mystery Author</h3>
      <p>{clean_text(mystery_text)}</p>
    </div>
    """

    # Candidate boxes
    candidate_htmls = []
    for i in range(3):
        text = data[f'a{i}_fullText']
        title = f"Candidate {i+1}"
        extra_style = ""

        if ground_truth_author == i:
            if ground_truth_author != predicted_author: # highlight the true author only if its different than the predictd one
                title += " (True Author)"
                extra_style = (
                    "border: 2px solid #ff5722; "
                    "background: #fff3e0; "
                    "padding:10px; "
                )


        if predicted_author == i:
            if predicted_author == ground_truth_author:
                title += " (Predicted and True Author)"
            else:
                title += " (Predicted Author)"
            extra_style = (
                "border:2px solid #228B22; "       # dark green border
                "background-color: #e6ffe6; "      # light green fill
                "padding:10px; "
            )


        candidate_htmls.append(f"""
        <div style="border:1px solid #ccc; padding:10px; {extra_style}">
          <h4>{title}</h4>
          <p>{clean_text(text)}</p>
        </div>
        """)

    return header_html, mystery_html, candidate_htmls[0], candidate_htmls[1], candidate_htmls[2]
109
+
110
def compute_tsne_with_cache(embeddings: np.ndarray, cache_path: str = 'datasets/tsne_cache.pkl') -> np.ndarray:
    """
    Compute a 2-D t-SNE projection, memoising results on disk.

    The cache key is the MD5 digest of the raw embedding bytes, so the
    expensive fit is skipped whenever the exact same matrix is seen again.

    Args:
        embeddings (np.ndarray): The input embeddings to compute t-SNE on.
        cache_path (str): Path to the pickle file holding the cache dict.

    Returns:
        np.ndarray: The t-SNE transformed embeddings.
    """
    # Hash of the raw matrix bytes serves as the cache key.
    hash_key = hashlib.md5(embeddings.tobytes()).hexdigest()

    cache = {}
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            cache = pkl.load(f)

    if hash_key in cache:
        return cache[hash_key]

    print("Computing t-SNE")
    tsne_result = TSNE(n_components=2, learning_rate='auto',
                       init='random', perplexity=3).fit_transform(embeddings)
    cache[hash_key] = tsne_result

    # Fix: create the cache directory if it is missing; otherwise the first
    # run on a fresh checkout dies with FileNotFoundError when dumping.
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, 'wb') as f:
        pkl.dump(cache, f)
    return tsne_result
140
+
141
def load_interp_space(cfg):
    """
    Load the interpretable style space and the clustered background authors.

    Args:
        cfg (dict): Expects 'interp_space_path' (directory containing the
            pickled space, its JSON representations and train_authors.pkl),
            plus 'style_feat_clm', 'only_llm_feats', 'only_gram2vec_feats'
            and 'top_k'.

    Returns:
        dict with keys:
            'dimension_to_latent'  – cluster label -> latent centroid vector
            'dimension_to_style'   – cluster label -> ranked style features
            'author_embedding'     – background author embeddings (list)
            'author_labels'        – their cluster labels
            'author_ids'           – their author IDs
            'clustered_authors_df' – filtered background-author DataFrame
    """
    interp_space_path = cfg['interp_space_path'] + 'interpretable_space.pkl'
    interp_space_rep_path = cfg['interp_space_path'] + 'interpretable_space_representations.json'
    gram2vec_feats_path = cfg['interp_space_path'] + '/../gram2vec_feats.csv'
    clustered_authors_path = cfg['interp_space_path'] + 'train_authors.pkl'

    # Load authors embeddings and their cluster labels
    clustered_authors_df = pd.read_pickle(clustered_authors_path)
    # Drop DBSCAN outliers (cluster label -1).
    clustered_authors_df = clustered_authors_df[clustered_authors_df.cluster_label != -1]
    author_embedding = clustered_authors_df.author_embedding.tolist()
    author_labels = clustered_authors_df.cluster_label.tolist()
    author_ids = clustered_authors_df.authorID.tolist()

    # filter out gram2vec features that doesn't have representation
    clustered_authors_df['gram2vec_feats'] = clustered_authors_df.gram2vec_feats.apply(lambda feats: [feat for feat in feats if get_shorthand(feat) is not None])

    # Load a list of gram2vec features --> we use it to distinguish the cluster representations whether they come from gram2vec or llms
    gram2vec_df = pd.read_csv(gram2vec_feats_path)
    gram2vec_feats = gram2vec_df.gram2vec_feats.unique().tolist()

    # Load interpretable space embeddings and the representation of each dimension
    interpretable_space = pkl.load(open(interp_space_path, 'rb'))
    del interpretable_space[-1] #DBSCAN generate a cluster -1 of all outliers. We don't want this cluster
    dimension_to_latent = {key: interpretable_space[key][0] for key in interpretable_space}

    interpretable_space_rep_df = pd.read_json(interp_space_rep_path)
    #dimension_to_style = {x[0]: x[1] for x in zip(interpretable_space_rep_df.cluster_label.tolist(), interpretable_space_rep_df[style_feat_clm].tolist())}
    # Rank each dimension's features by descending weight and keep names only.
    dimension_to_style = {x[0]: [feat[0] for feat in sorted(x[1].items(), key=lambda feat_w:-feat_w[1])] for x in zip(interpretable_space_rep_df.cluster_label.tolist(), interpretable_space_rep_df[cfg['style_feat_clm']].tolist())}

    if cfg['only_llm_feats']:
        #print('only llm feats')
        dimension_to_style = {dim[0]:[feat for feat in dim[1] if feat not in gram2vec_feats] for dim in dimension_to_style.items()}

    if cfg['only_gram2vec_feats']:
        #print('only gra2vec feats')
        dimension_to_style = {dim[0]:[feat for feat in dim[1] if feat in gram2vec_feats] for dim in dimension_to_style.items()}

    # Take top features from g2v and llm
    def take_to_k_llm_and_g2v_feats(feats_list, top_k):
        # Keeps up to top_k Gram2Vec features plus up to top_k LLM features.
        g2v_feats = [x for x in feats_list if x in gram2vec_feats][:top_k]
        llm_feats = [x for x in feats_list if x not in gram2vec_feats][:top_k]
        return g2v_feats + llm_feats
    dimension_to_style = {dim[0]: take_to_k_llm_and_g2v_feats(dim[1], cfg['top_k']) for dim in dimension_to_style.items()}


    return {
        'dimension_to_latent': dimension_to_latent,
        'dimension_to_style' : dimension_to_style,
        'author_embedding' : author_embedding,
        'author_labels' : author_labels,
        'author_ids' : author_ids,
        'clustered_authors_df' : clustered_authors_df

    }
195
+
196
#function to handle zoom events
def handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors_df):
    """
    React to a Plotly zoom event by recomputing the style features that
    characterise the authors visible inside the zoomed region.

    event_json           – stringified JSON from the JS listener; expected to
                           contain "xaxis" and "yaxis" (min, max) pairs
    bg_proj              – (N,2) numpy array with 2D coordinates
    bg_lbls              – list of N author IDs aligned with bg_proj rows
    clustered_authors_df – pd.DataFrame containing authorID and final_attribute_name
    task_authors_df      – pd.DataFrame of the task (mystery + candidate) authors

    Returns a 5-tuple:
        (LLM-feature radio update, Gram2Vec radio update,
         raw style-analysis response, list of LLM features,
         list of visible author ids)
    """
    print("[INFO] Handling zoom event")

    # No payload at all -> clear both radio groups.
    if not event_json:
        return gr.update(value=""), gr.update(value=""), None, None, None

    try:
        ranges = json.loads(event_json)
        (x_min, x_max) = ranges["xaxis"]
        (y_min, y_max) = ranges["yaxis"]
    except (json.JSONDecodeError, KeyError, ValueError):
        # Malformed payload is treated the same as no payload.
        return gr.update(value=""), gr.update(value=""), None, None, None

    # Find points within the zoomed region
    mask = (
        (bg_proj[:, 0] >= x_min) & (bg_proj[:, 0] <= x_max) &
        (bg_proj[:, 1] >= y_min) & (bg_proj[:, 1] <= y_max)
    )

    visible_authors = [lbl for lbl, keep in zip(bg_lbls, mask) if keep]

    print(f"[INFO] Zoomed region includes {len(visible_authors)} authors:{visible_authors}")

    print(f"Task authors: {len(task_authors_df)}, Clustered authors: {len(clustered_authors_df)}")
    # Fix: build the merged frame once — it was previously concatenated twice.
    merged_authors_df = pd.concat([task_authors_df, clustered_authors_df])
    print(f"Merged authors DataFrame:\n{len(merged_authors_df)}")

    # LLM-derived style features for the visible authors.
    style_analysis_response = compute_clusters_style_representation_3(
        background_corpus_df=merged_authors_df,
        cluster_ids=visible_authors,
        cluster_label_clm_name='authorID',
    )

    llm_feats = ['None'] + style_analysis_response['features']

    # Gram2Vec features for the same region.
    g2v_feats = compute_clusters_g2v_representation(
        background_corpus_df=merged_authors_df,
        author_ids=visible_authors,
        other_author_ids=[],
        features_clm_name='g2v_vector'
    )

    # Gram2vec features are already in shorthand. convert to human readable for display
    HR_g2v_list = []
    for feat in g2v_feats:
        HR_g2v = get_fullform(feat)
        print(f"\n\n feat: {feat} ---> Human Readable: {HR_g2v}")
        if HR_g2v is None:
            print(f"Skipping Gram2Vec feature without human readable form: {feat}")
        else:
            HR_g2v_list.append(HR_g2v)

    HR_g2v_list = ["None"] + HR_g2v_list

    print(f"[INFO] Found {len(llm_feats)} LLM features and {len(g2v_feats)} Gram2Vec features in the zoomed region.")
    print(f"[INFO] unfiltered g2v features: {g2v_feats}")

    print(f"[INFO] LLM features: {llm_feats}")
    print(f"[INFO] Gram2Vec features: {HR_g2v_list}")

    return (
        gr.update(choices=llm_feats, value=llm_feats[0]),
        gr.update(choices=HR_g2v_list, value=HR_g2v_list[0]),
        style_analysis_response,
        llm_feats,
        visible_authors
    )
280
+
281
def handle_zoom_with_retries(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors_df):
    """
    Wrapper around handle_zoom() that retries up to three times before
    giving up.

    event_json           – stringified JSON from JS listener
    bg_proj              – (N,2) numpy array with 2D coordinates
    bg_lbls              – list of N author IDs
    clustered_authors_df – pd.DataFrame containing authorID and final_attribute_name
    task_authors_df      – pd.DataFrame containing authorID and final_attribute_name

    Returns handle_zoom()'s 5-tuple, or (None, None, None, None, None) once
    all three attempts have failed.
    """
    print("[INFO] Handling zoom event with retries")

    for attempt in range(3):
        try:
            return handle_zoom(event_json, bg_proj, bg_lbls, clustered_authors_df, task_authors_df)
        except Exception as e:
            print(f"[ERROR] Attempt {attempt + 1} failed: {e}")
            if attempt < 2:
                print("[INFO] Retrying...")

    # Fix: the failure fallback now runs only after *all* attempts fail;
    # previously it was reachable from inside the except region, which
    # could cut the advertised retries short.
    return (
        None,
        None,
        None,
        None,
        None
    )
305
+
306
+
307
def visualize_clusters_plotly(iid, cfg, instances, model_radio, custom_model_input, task_authors_df, background_authors_embeddings_df, pred_idx=None, gt_idx=None):
    """
    Build the 2-D t-SNE scatter plot of the mystery author, the three
    candidate authors and the background authors, with arrowed labels.

    Args:
        iid: Instance id; coerced to int below.
        cfg: Config dict forwarded to load_interp_space().
        instances: Not read by the current implementation (the old
            `inst = instances[iid]` code path is commented out below);
            kept for interface compatibility.
        model_radio: Chosen model name, or "Other" to use custom_model_input.
        custom_model_input: Model name used when model_radio == "Other".
        task_authors_df: DataFrame whose row 0 holds the mystery author and
            the following rows the candidates; must expose the
            `<model>_style_embedding` column and `authorID`.
        background_authors_embeddings_df: Background authors with the same
            embedding column and `authorID`.
        pred_idx: Index (0-2) of the candidate the model predicted, or None.
        gt_idx: Index (0-2) of the ground-truth candidate, or None.

    Returns:
        (plotly Figure,
         dict mapping cluster label -> ranked style feature names,
         2-D projection array of all plotted points,
         list of author ids (task authors first, then background),
         background_authors_embeddings_df passed through for zoom handling)
    """
    model_name = model_radio if model_radio != "Other" else custom_model_input
    # Column name convention: "<last path segment of model name>_style_embedding".
    embedding_col_name = f'{model_name.split("/")[-1]}_style_embedding'
    print(background_authors_embeddings_df.columns)
    print("Generating cluster visualization")
    iid = int(iid)
    interp = load_interp_space(cfg)
    # dim2lat = interp['dimension_to_latent']
    style_names = interp['dimension_to_style']
    # bg_emb = np.array(interp['author_embedding'])
    # print(f"bg_emb shape: {bg_emb.shape}")
    #replace with cached embedddings
    bg_emb = np.array(background_authors_embeddings_df[embedding_col_name].tolist()) #placeholder for background embeddings
    print(f"bg_emb shape: {bg_emb.shape}")
    # print("interp.keys():", interp.keys())
    #bg_lbls = interp['author_labels']
    #bg_ids = interp['author_ids']
    # NOTE(review): bg_ids covers task + background authors, matching the full
    # projection below — assumes task_authors_df rows align with [query, candidates].
    bg_ids = task_authors_df['authorID'].tolist() + background_authors_embeddings_df['authorID'].tolist()
    # inst = instances[iid]
    # print("inst.keys():", inst.keys())
    # q_lat = np.array(inst['author_latents'][:1])
    # print(f"q_lat shape: {q_lat.shape}")
    # c_lat = np.array(inst['author_latents'][1:])
    # print(f"c_lat shape: {c_lat.shape}")
    # pred_idx = inst['latent_rank'][0]
    # gt_idx = inst['gt_idx']
    q_lat = np.array(task_authors_df[embedding_col_name].iloc[0]).reshape(1, -1)  # Mystery author latent
    print(f"q_lat shape: {q_lat.shape}")
    c_lat = np.array(task_authors_df[embedding_col_name].iloc[1:].tolist())  # Candidate authors latents
    print(f"c_lat shape: {c_lat.shape}")

    # cent_emb = np.array([v for _,v in dim2lat.items()])
    # cent_lbl = np.array([k for k,_ in dim2lat.items()])

    # all_emb = np.vstack([q_lat, c_lat, bg_emb, cent_emb])
    # Project query + candidates + background together so they share one space.
    all_emb = np.vstack([q_lat, c_lat, bg_emb])
    proj = compute_tsne_with_cache(all_emb)

    # split
    q_proj = proj[0]
    c_proj = proj[1:4]
    #bg_proj = proj[4:4+len(bg_lbls)]
    # NOTE(review): bg_proj is the *entire* projection (it includes the query
    # and candidate rows too) — the grey "background" trace below therefore
    # also plots those points; confirm this is intentional.
    bg_proj = proj

    # cent_proj = proj[4+len(bg_lbls):]


    # find nearest centroid
    # dists = np.linalg.norm(cent_proj - q_proj, axis=1)
    # idx = int(np.argmin(dists))
    # cluster_label_query = cent_lbl[idx]
    # features of the nearest centroid to display
    # feature_list = style_names[cluster_label_query]

    # cluster_labels_per_candidate = [
    #     cent_lbl[int(np.argmin(np.linalg.norm(cent_proj - c_proj[i], axis=1)))]
    #     for i in range(c_proj.shape[0])
    # ]

    # prepare colorscale
    # n_cent = len(cent_lbl)
    # cent_colors = sample_colorscale("algae", [i/(n_cent-1) for i in range(n_cent)])
    # map each cluster label to its color
    # color_map = { label: cent_colors[i] for i, label in enumerate(cent_lbl) }

    # uncomment the following line to show background authors
    ## background author colors pulled from their cluster label
    # bg_colors = [ color_map[label] for label in bg_lbls ]

    # 2) build Plotly figure
    fig = go.Figure()

    fig.update_layout(
        template='plotly_white',
        margin=dict(l=40,r=40,t=60,b=40),
        autosize=True,
        hovermode='closest',
        # Enable zoom events
        dragmode='zoom'
    )

    # fig.update_layout(
    #     template='plotly_white',
    #     margin=dict(l=40,r=40,t=60,b=40),
    #     autosize=True,
    #     hovermode='closest')


    # uncomment the following line to show background authors
    ## background authors (light grey dots)
    fig.add_trace(go.Scattergl(
        x=bg_proj[:,0], y=bg_proj[:,1],
        mode='markers',
        marker=dict(size=6, color="#d3d3d3"),# color=bg_colors
        name='Background authors',
        hoverinfo='skip'
    ))

    # centroids (rainbow colors + hovertext of your top-k features)
    # hover_texts = [
    #     f"Cluster {lbl}<br>" + "<br>".join(style_names[lbl])
    #     for lbl in cent_lbl
    # ]
    # fig.add_trace(go.Scattergl(
    #     x=cent_proj[:,0], y=cent_proj[:,1],
    #     mode='markers',
    #     marker=dict(symbol='triangle-up', size=10, color="#d3d3d3"),#color=cent_colors
    #     name='Cluster centroids',
    #     hovertext=hover_texts,
    #     hoverinfo='text'
    # ))

    # three candidates
    marker_syms = ['diamond','pentagon','x']
    for i in range(3):
        # label = f"Candidate {i+1}" + (" (predicted)" if i==pred_idx else "")
        base = f"Candidate {i+1}"
        # pick the right suffix
        if i == pred_idx and i == gt_idx:
            suffix = " (Predicted & Ground Truth)"
        elif i == pred_idx:
            suffix = " (Predicted)"
        elif i == gt_idx:
            suffix = "(Ground Truth)"
        else:
            suffix = ""

        label = base + suffix
        fig.add_trace(go.Scattergl(
            x=[c_proj[i,0]], y=[c_proj[i,1]],
            mode='markers',
            marker=dict(symbol=marker_syms[i], size=12, color='darkblue'),
            name=label,
            hoverinfo='skip'
        ))

    # query author
    fig.add_trace(go.Scattergl(
        x=[q_proj[0]], y=[q_proj[1]],
        mode='markers',
        marker=dict(symbol='star', size=14, color='red'),
        name='Mystery author',
        hoverinfo='skip'
    ))

    # ── Arrowed annotations for mystery + candidates ──────────────────────────
    # Mystery author (red star)
    fig.add_annotation(
        x=q_proj[0], y=q_proj[1],
        xref='x', yref='y',
        text="Mystery",
        showarrow=True,
        arrowhead=2,
        arrowsize=1,
        arrowwidth=1.5,
        ax=40,   # tail offset in pixels: moves the label 40px to the right
        ay=-40,  # moves the label 40px up
        font=dict(color='red', size=12)
    )

    # Candidate authors (dark blue ◆)
    offsets = [(-40, -30), (40, -30), (0, 40)]  # [(ax,ay) for Cand1, Cand2, Cand3]
    for i in range(3):
        # build the right label
        if i == pred_idx and i == gt_idx:
            label = f"Candidate {i+1} (Predicted & Ground Truth)"
        elif i == pred_idx:
            label = f"Candidate {i+1} (Predicted)"
        elif i == gt_idx:
            label = f"Candidate {i+1} (Ground Truth)"
        else:
            label = f"Candidate {i+1}"

        fig.add_annotation(
            x=c_proj[i,0], y=c_proj[i,1],
            xref='x', yref='y',
            text= label,
            showarrow=True,
            arrowhead=2,
            arrowsize=1,
            arrowwidth=1.5,
            ax=offsets[i][0],
            ay=offsets[i][1],
            font=dict(color='darkblue', size=12)
        )

    print('Done processing....')
    # Prepare outputs for the new cluster‐dropdown UI
    # all_clusters = sorted(style_names.keys())
    # --- build display names for the dropdown ---
    # sorted_labels = sorted([int(lbl) for lbl in cent_lbl])
    # display_clusters = []
    # for lbl in sorted_labels:
    #     name = f"Cluster {lbl}"
    #     if lbl == cluster_label_query:
    #         name += " (closest to mystery author)"
    #     matching_indices = [i + 1 for i, val in enumerate(cluster_labels_per_candidate) if int(val) == lbl]
    #     if matching_indices:
    #         if len(matching_indices) == 1:
    #             name += f" (closest to Candidate {matching_indices[0]} author)"
    #         else:
    #             candidate_str = ", ".join(f"Candidate {i}" for i in matching_indices)
    #             name += f" (closest to {candidate_str} authors)"
    #     display_clusters.append(name)
    # print(f"All clusters: {all_clusters}")
    # return: figure, dropdown payload, full style_map
    return (
        fig,
        # update(choices=display_clusters, value=display_clusters[cluster_label_query]),
        style_names,
        bg_proj,  # Return background points
        bg_ids,   # Return background labels
        background_authors_embeddings_df,  # Return the DataFrame for zoom handling

    )
    # return fig, update(choices=feature_list, value=feature_list[0]),feature_list
523
+
524
+
525
def extract_cluster_key(display_label: str) -> int:
    """
    Pull the numeric cluster id out of a dropdown display label.

    Example:
        "Cluster 5 (closest to mystery author; closest to Candidate 1 author)" -> 5

    Raises:
        ValueError: if the label does not start with "Cluster <number>".
    """
    matched = re.match(r"Cluster\s+(\d+)", display_label)
    if matched is None:
        raise ValueError(f"Unrecognized cluster label: {display_label}")
    return int(matched.group(1))
535
+
536
+
537
+
538
# Cluster selection handler: refreshes both feature radio groups.
def on_cluster_change(selected_cluster, style_map):
    """
    Populate the LLM and Gram2Vec feature radios for the chosen cluster.

    Splits the cluster's features into LLM- and Gram2Vec-derived lists,
    drops Gram2Vec features lacking a shorthand, and prepends a "None"
    default option to each list.

    Returns:
        (gr.update for the LLM radio, gr.update for the Gram2Vec radio,
         the full LLM option list including "None")
    """
    key = extract_cluster_key(selected_cluster)
    llm_feats, g2v_feats = split_features(style_map[key])
    # print(f"Selected cluster: {selected_cluster} ({key})")
    # print(f"LLM features: {llm_feats}")

    # "None" is always offered as the default, unselected choice.
    llm_options = ["None"] + llm_feats

    # Gram2Vec features without a shorthand cannot be rendered — skip them.
    g2v_options = ["None"]
    for feat in g2v_feats:
        if get_shorthand(feat) is not None:
            g2v_options.append(feat)
        else:
            print(f"Skipping Gram2Vec feature without shorthand: {feat}")

    return (
        gr.update(choices=llm_options, value=llm_options[0]),
        gr.update(choices=g2v_options, value=g2v_options[0]),
        llm_options
    )