"""
LLM Service for Tibetan Text Metrics
This module provides a unified interface for analyzing text similarity metrics
using both LLM-based and rule-based approaches.
"""
import json
import logging
import os
import re
from typing import Optional

import pandas as pd
import requests
# Set up logging
logger = logging.getLogger(__name__)
# Try to load environment variables
ENV_LOADED = False
try:
from dotenv import load_dotenv
load_dotenv()
ENV_LOADED = True
except ImportError:
logger.warning("python-dotenv not installed. Using system environment variables.")
# Constants
DEFAULT_MAX_TOKENS = 4000
DEFAULT_MODEL = "mistralai/mistral-7b-instruct"
DEFAULT_TEMPERATURE = 0.3
DEFAULT_TOP_P = 0.9
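# Defaults favor focused, low-variance output; the model and sampling settings
# can be overridden on an LLMService instance after construction.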
class LLMService:
"""
Service for analyzing text similarity metrics using LLMs and rule-based methods.
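    Typical usage (illustrative; results_df is a similarity-metrics DataFrame
    produced elsewhere in the pipeline):

        service = LLMService()  # key read from OPENROUTER_API_KEY if omitted
        report = service.analyze_similarity(results_df)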
"""
    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the LLM service.

        Args:
            api_key: Optional API key for OpenRouter. If not provided, the
                service reads OPENROUTER_API_KEY from the environment.
        """
self.api_key = api_key or os.getenv('OPENROUTER_API_KEY')
self.model = DEFAULT_MODEL
self.temperature = DEFAULT_TEMPERATURE
self.top_p = DEFAULT_TOP_P
def analyze_similarity(
self,
results_df: pd.DataFrame,
use_llm: bool = True,
) -> str:
"""
Analyze similarity metrics using either LLM or rule-based approach.
Args:
results_df: DataFrame containing similarity metrics
            use_llm: Whether to attempt LLM-based analysis. When False, or
                when the LLM call fails, rule-based analysis is used instead.
Returns:
            str: Analysis of the metrics in markdown format, with a note about
                the fallback prepended if the LLM call failed.
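        Example:
            An illustrative rule-based run, assuming results_df comes from
            the metrics pipeline:

                report = service.analyze_similarity(results_df, use_llm=False)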
"""
        # Clean and filter the metrics before any analysis
        df = self._prepare_dataframe(results_df)
        # If LLM is disabled, use rule-based analysis
        if not use_llm:
            logger.info("LLM analysis disabled. Using rule-based analysis.")
            return self._analyze_with_rules(df)
# Try LLM analysis if enabled
try:
if not self.api_key:
raise ValueError("No OpenRouter API key provided. Please set the OPENROUTER_API_KEY environment variable.")
logger.info("Attempting LLM-based analysis...")
            return self._analyze_with_llm(df, max_tokens=DEFAULT_MAX_TOKENS)
except Exception as e:
error_msg = str(e)
logger.error(f"Error in LLM analysis: {error_msg}")
# Create a user-friendly error message
if "payment" in error_msg.lower() or "402" in error_msg:
error_note = "OpenRouter API payment required. Falling back to rule-based analysis."
elif "invalid" in error_msg.lower() or "401" in error_msg:
error_note = "Invalid OpenRouter API key. Falling back to rule-based analysis."
elif "rate limit" in error_msg.lower() or "429" in error_msg:
error_note = "API rate limit exceeded. Falling back to rule-based analysis."
else:
error_note = f"LLM analysis failed: {error_msg[:200]}. Falling back to rule-based analysis."
# Get rule-based analysis
            rule_based_analysis = self._analyze_with_rules(df)
# Combine the error message with the rule-based analysis
return f"## Analysis of Tibetan Text Similarity Metrics\n\n*Note: {error_note}*\n\n{rule_based_analysis}"
def _prepare_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Prepare the DataFrame for analysis.
Args:
df: Input DataFrame with similarity metrics
Returns:
pd.DataFrame: Cleaned and prepared DataFrame
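        Note:
            Rows where all three metrics are simultaneously perfect (Jaccard
            100%, LCS 1.0, TF-IDF 1.0) are dropped, as such rows typically
            come from comparing empty segments rather than genuine matches.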
"""
# Make a copy to avoid modifying the original
df = df.copy()
# Clean text columns
text_cols = ['Text A', 'Text B']
for col in text_cols:
if col in df.columns:
df[col] = df[col].fillna('Unknown').astype(str)
                df[col] = df[col].str.replace(r'\.txt$', '', regex=True)
# Filter out perfect matches (likely empty cells)
metrics_cols = ['Jaccard Similarity (%)', 'Normalized LCS', 'TF-IDF Cosine Sim']
if all(col in df.columns for col in metrics_cols):
mask = ~((df['Jaccard Similarity (%)'] == 100.0) &
(df['Normalized LCS'] == 1.0) &
(df['TF-IDF Cosine Sim'] == 1.0))
df = df[mask].copy()
return df
def _analyze_with_llm(self, df: pd.DataFrame, max_tokens: int) -> str:
"""
Analyze metrics using an LLM via OpenRouter API.
Args:
df: Prepared DataFrame with metrics
max_tokens: Maximum tokens for the response
Returns:
str: LLM analysis in markdown format
"""
# Prepare the prompt with data and instructions
prompt = self._create_llm_prompt(df)
try:
# Call the LLM API
response = self._call_openrouter_api(
prompt=prompt,
system_message=self._get_system_prompt(),
max_tokens=max_tokens,
temperature=self.temperature,
top_p=self.top_p
)
# Process and format the response
return self._format_llm_response(response, df)
except Exception as e:
logger.error(f"Error in LLM analysis: {str(e)}")
raise
def _analyze_with_rules(self, df: pd.DataFrame) -> str:
"""
Analyze metrics using rule-based approach.
Args:
df: Prepared DataFrame with metrics
Returns:
str: Rule-based analysis in markdown format
"""
analysis = ["## Tibetan Text Similarity Analysis (Rule-Based)"]
# Basic stats
text_a_col = 'Text A' if 'Text A' in df.columns else None
text_b_col = 'Text B' if 'Text B' in df.columns else None
if text_a_col and text_b_col:
unique_texts = set(df[text_a_col].unique()) | set(df[text_b_col].unique())
analysis.append(f"- **Texts analyzed:** {', '.join(sorted(unique_texts))}")
# Analyze each metric
metric_analyses = []
if 'Jaccard Similarity (%)' in df.columns:
jaccard_analysis = self._analyze_jaccard(df)
metric_analyses.append(jaccard_analysis)
if 'Normalized LCS' in df.columns:
lcs_analysis = self._analyze_lcs(df)
metric_analyses.append(lcs_analysis)
if 'TF-IDF Cosine Sim' in df.columns:
tfidf_analysis = self._analyze_tfidf(df)
metric_analyses.append(tfidf_analysis)
# Add all metric analyses
if metric_analyses:
analysis.extend(metric_analyses)
# Add overall interpretation
analysis.append("\n## Overall Interpretation")
analysis.append(self._generate_overall_interpretation(df))
return "\n\n".join(analysis)
def _analyze_jaccard(self, df: pd.DataFrame) -> str:
"""Analyze Jaccard similarity scores."""
jaccard = df['Jaccard Similarity (%)'].dropna()
if jaccard.empty:
return ""
mean_jaccard = jaccard.mean()
max_jaccard = jaccard.max()
min_jaccard = jaccard.min()
analysis = [
"### Jaccard Similarity Analysis",
f"- **Range:** {min_jaccard:.1f}% to {max_jaccard:.1f}% (mean: {mean_jaccard:.1f}%)"
]
# Interpret the scores
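        # The 60% / 30% cut-offs below are heuristic, not calibrated values;
        # the LCS and TF-IDF analyzers use analogous heuristic bands.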
if mean_jaccard > 60:
analysis.append("- **High vocabulary overlap** suggests texts share significant content or are from the same tradition.")
elif mean_jaccard > 30:
analysis.append("- **Moderate vocabulary overlap** indicates some shared content or themes.")
else:
analysis.append("- **Low vocabulary overlap** suggests texts are on different topics or from different traditions.")
# Add top pairs
top_pairs = df.nlargest(3, 'Jaccard Similarity (%)')
if not top_pairs.empty:
analysis.append("\n**Most similar pairs:**")
for _, row in top_pairs.iterrows():
text_a = row.get('Text A', 'Text 1')
text_b = row.get('Text B', 'Text 2')
score = row['Jaccard Similarity (%)']
analysis.append(f"- {text_a} ↔ {text_b}: {score:.1f}%")
return "\n".join(analysis)
def _analyze_lcs(self, df: pd.DataFrame) -> str:
"""Analyze Longest Common Subsequence scores."""
lcs = df['Normalized LCS'].dropna()
if lcs.empty:
return ""
mean_lcs = lcs.mean()
max_lcs = lcs.max()
min_lcs = lcs.min()
analysis = [
"### Structural Similarity (LCS) Analysis",
f"- **Range:** {min_lcs:.2f} to {max_lcs:.2f} (mean: {mean_lcs:.2f})"
]
# Interpret the scores
if mean_lcs > 0.7:
analysis.append("- **High structural similarity** suggests texts follow similar organizational patterns.")
elif mean_lcs > 0.4:
analysis.append("- **Moderate structural similarity** indicates some shared organizational elements.")
else:
analysis.append("- **Low structural similarity** suggests different organizational approaches.")
# Add top pairs
top_pairs = df.nlargest(3, 'Normalized LCS')
if not top_pairs.empty:
analysis.append("\n**Most structurally similar pairs:**")
for _, row in top_pairs.iterrows():
text_a = row.get('Text A', 'Text 1')
text_b = row.get('Text B', 'Text 2')
score = row['Normalized LCS']
analysis.append(f"- {text_a} ↔ {text_b}: {score:.2f}")
return "\n".join(analysis)
def _analyze_tfidf(self, df: pd.DataFrame) -> str:
"""Analyze TF-IDF cosine similarity scores."""
tfidf = df['TF-IDF Cosine Sim'].dropna()
if tfidf.empty:
return ""
mean_tfidf = tfidf.mean()
max_tfidf = tfidf.max()
min_tfidf = tfidf.min()
analysis = [
"### Thematic Similarity (TF-IDF) Analysis",
f"- **Range:** {min_tfidf:.2f} to {max_tfidf:.2f} (mean: {mean_tfidf:.2f})"
]
# Interpret the scores
if mean_tfidf > 0.8:
analysis.append("- **High thematic similarity** suggests texts share distinctive terms and concepts.")
elif mean_tfidf > 0.5:
analysis.append("- **Moderate thematic similarity** indicates some shared distinctive terms.")
else:
analysis.append("- **Low thematic similarity** suggests different conceptual focuses.")
# Add top pairs
top_pairs = df.nlargest(3, 'TF-IDF Cosine Sim')
if not top_pairs.empty:
analysis.append("\n**Most thematically similar pairs:**")
for _, row in top_pairs.iterrows():
text_a = row.get('Text A', 'Text 1')
text_b = row.get('Text B', 'Text 2')
score = row['TF-IDF Cosine Sim']
analysis.append(f"- {text_a} ↔ {text_b}: {score:.2f}")
return "\n".join(analysis)
def _generate_overall_interpretation(self, df: pd.DataFrame) -> str:
"""Generate an overall interpretation of the metrics."""
interpretations = []
# Get metrics if they exist
has_jaccard = 'Jaccard Similarity (%)' in df.columns
has_lcs = 'Normalized LCS' in df.columns
has_tfidf = 'TF-IDF Cosine Sim' in df.columns
# Calculate means for available metrics
metrics = {}
if has_jaccard:
metrics['jaccard'] = df['Jaccard Similarity (%)'].mean()
if has_lcs:
metrics['lcs'] = df['Normalized LCS'].mean()
if has_tfidf:
metrics['tfidf'] = df['TF-IDF Cosine Sim'].mean()
# Generate interpretation based on metrics
if metrics:
interpretations.append("Based on the analysis of similarity metrics:")
if has_jaccard and metrics['jaccard'] > 60:
interpretations.append("- The high Jaccard similarity indicates significant vocabulary overlap between texts, "
"suggesting they may share common sources or be part of the same textual tradition.")
if has_lcs and metrics['lcs'] > 0.7:
interpretations.append("- The high LCS score indicates strong structural similarity, "
"suggesting the texts may follow similar organizational patterns or share common structural elements.")
if has_tfidf and metrics['tfidf'] > 0.8:
interpretations.append("- The high TF-IDF similarity suggests the texts share distinctive terms and concepts, "
"indicating they may cover similar topics or themes.")
# Add cross-metric interpretations
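            # Agreement between the lexical (Jaccard) and structural (LCS)
            # signals is stronger evidence of a shared source than either
            # metric alone, so combined patterns are reported separately.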
if has_jaccard and has_lcs and metrics['jaccard'] > 60 and metrics['lcs'] > 0.7:
interpretations.append("\nThe combination of high Jaccard and LCS similarities strongly suggests "
"that these texts are closely related, possibly being different versions or "
"transmissions of the same work or sharing a common source.")
if has_tfidf and has_jaccard and metrics['tfidf'] < 0.5 and metrics['jaccard'] > 60:
interpretations.append("\nThe high Jaccard but lower TF-IDF similarity suggests that while the texts "
"share many common words, they may use them in different contexts or with different "
"meanings, possibly indicating different interpretations of similar material.")
        # Fall back to general guidance when no specific pattern was found;
        # the header line appended above does not count as a finding.
        if len(interpretations) <= 1:
            interpretations = [
                "The analysis did not reveal strong patterns in the "
                "similarity metrics, suggesting at most moderate overlap "
                "across the measured dimensions."
            ]
return "\n\n".join(interpretations)
def _create_llm_prompt(self, df: pd.DataFrame) -> str:
"""
Create a prompt for the LLM based on the DataFrame.
Args:
df: Prepared DataFrame with metrics
Returns:
str: Formatted prompt for the LLM
"""
# Format the CSV data for the prompt
csv_data = df.to_csv(index=False)
# Create the prompt using the user's template
prompt = """You are a specialized text analysis interpreter with expertise in Tibetan textual studies. Your task is to analyze text similarity data from a CSV file and create a clear, narrative explanation for scholars who may not have technical expertise.