from motor.motor_asyncio import AsyncIOMotorClient
import pandas as pd
import numpy as np
import re
import json
import umap
import plotly.io as pio
import hdbscan
from bertopic import BERTopic
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from skopt import gp_minimize
from sentence_transformers import SentenceTransformer
import torch
import random
import multiprocessing
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
import optuna
import dash
from dash import dcc, html, Input, Output, State
import plotly.graph_objects as go
import plotly.express as px
import dash_bootstrap_components as dbc
from fastapi import HTTPException, APIRouter, Request
from pydantic import BaseModel
from typing import Optional
import threading
import time
import webbrowser
import asyncio
# Set seed for reproducibility
def set_seed(seed=42):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
if __name__ == "__main__":
set_seed(42)
multiprocessing.freeze_support()
TitleName = "Dashboard"
router = APIRouter()
class TrendAnalysisRequest(BaseModel):
userId: str
topic: str
    year: Optional[str] = None
page: int = 0
async def fetch_papers_with_pagination(request: Request, userId: str, topic: str, year: Optional[str] = None, page: int = 0):
# Build the query filter
query_filter = {"userId": userId, "topic": topic}
if year:
query_filter["year"] = year
# Count total matching documents
count_pipeline = [
{"$match": query_filter},
{"$unwind": "$papers"},
{"$count": "total_papers"}
]
collection = request.app.state.collection
count_result = await collection.aggregate(count_pipeline).to_list(length=1)
total_papers = count_result[0]['total_papers'] if count_result else 0
print(f"Total papers matching criteria: {total_papers}")
# If no papers found, return empty result
if total_papers == 0:
return pd.DataFrame(), 0, 0, 0, 0
# Define pagination constants
papers_per_page = 200
min_papers_last_page = 50
# Calculate basic pagination
if total_papers <= papers_per_page:
# Simple case: all papers fit in one page
total_pages = 1
else:
# Multiple pages case
full_pages = total_papers // papers_per_page
remaining = total_papers % papers_per_page
if remaining >= min_papers_last_page:
# If remaining papers meet minimum threshold, create a separate page
total_pages = full_pages + 1
else:
# Otherwise, we'll have exactly 'full_pages' pages
# The remaining papers will be added to the last page
total_pages = full_pages
# Ensure page is within valid range
if page >= total_pages:
return pd.DataFrame(), 0, total_pages, 0, total_papers
# Calculate skip and limit based on page number
if total_pages == 1:
# Only one page - return all papers
skip = 0
limit = total_papers
elif page < total_pages - 1:
# Regular full page
skip = page * papers_per_page
limit = papers_per_page
else:
# Last page - might include remaining papers
remaining = total_papers % papers_per_page
if remaining >= min_papers_last_page or remaining == 0:
# Last page with either enough remaining papers or perfectly divided
skip = page * papers_per_page
limit = remaining if remaining > 0 else papers_per_page
else:
# Last page with remaining papers that don't meet minimum threshold
# We distribute by adding them to the last page
skip = (total_pages - 1) * papers_per_page
limit = papers_per_page + remaining
print(f"Pagination: Page {page + 1} of {total_pages}, Skip {skip}, Limit {limit}")
# MongoDB aggregation pipeline
pipeline = [
{"$match": query_filter},
{"$unwind": "$papers"},
{"$replaceRoot": {"newRoot": "$papers"}},
{"$project": {
"_id": 0,
"paperId": 1,
"url": 1,
"title": 1,
"abstract": 1,
"citationCount": 1,
"influentialCitationCount": 1,
"embedding": 1,
"publicationDate": 1,
"authors": 1
}},
{"$sort": {"publicationDate": 1}},
{"$skip": skip},
{"$limit": limit}
]
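    # After $unwind + $replaceRoot, each pipeline result is a single paper document
    # containing only the projected fields, e.g. (illustrative):
    # {"paperId": "...", "url": "...", "title": "...", "abstract": "...",
    #  "citationCount": 12, "influentialCitationCount": 3,
    #  "embedding": {"vector": [...]}, "publicationDate": "2020-01-01", "authors": [...]}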
# Execute the aggregation pipeline
cursor = collection.aggregate(pipeline)
papers = await cursor.to_list(None)
papers_count = len(papers)
print(f"Papers Retrieved: {papers_count}")
# Convert to DataFrame
df = pd.DataFrame(papers)
df = df.sort_values(by="publicationDate")
print(df[["paperId", "publicationDate"]].head(10))
return df, page, total_pages, papers_count, total_papers
# Preprocessing function
def clean_text(text):
text = str(text).lower()
text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
return ' '.join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])
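# Illustrative example (using sklearn's ENGLISH_STOP_WORDS):
#   clean_text("Graph Neural Networks (GNNs) for Drug Discovery!")
#   -> "graph neural networks gnns drug discovery"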
# Adaptive clustering and topic modeling
def perform_trend_analysis(df):
# Convert embeddings
    def convert_embedding(embedding):
        if isinstance(embedding, dict) and "vector" in embedding:
            return np.array(embedding["vector"], dtype=np.float64)
        return None
df["embedding"] = df["embedding"].apply(convert_embedding)
df = df.dropna(subset=["embedding"])
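    # Each "embedding" is expected to be a dict with a "vector" key (e.g. a SPECTER
    # embedding payload); rows without a usable vector were dropped above.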
if df.empty:
return df, {}
df["clean_text"] = (df["abstract"].fillna("")).apply(clean_text)
def objective(trial):
umap_n_components = trial.suggest_int("umap_n_components", 1, 12)
umap_min_dist = trial.suggest_float("umap_min_dist", 0.1, 0.8)
umap_n_neighbors = trial.suggest_int("umap_n_neighbors", 2, 12)
hdbscan_min_cluster_size = trial.suggest_int("hdbscan_min_cluster_size", 2, 10)
hdbscan_min_samples = trial.suggest_int("hdbscan_min_samples", 1, 10)
hdbscan_cluster_selection_epsilon = trial.suggest_float("hdbscan_cluster_selection_epsilon", 0.2, 0.8)
        hdbscan_cluster_selection_method = trial.suggest_categorical(
            "hdbscan_cluster_selection_method", ["eom", "leaf"]
        )
reducer_high_dim = umap.UMAP(
n_components=umap_n_components,
random_state=42,
min_dist=umap_min_dist,
n_neighbors=umap_n_neighbors,
metric="cosine"
)
        reduced_embeddings_high_dim = reducer_high_dim.fit_transform(
            np.vstack(df["embedding"].values)
        ).astype(np.float64)
clusterer = hdbscan.HDBSCAN(
min_cluster_size=hdbscan_min_cluster_size,
min_samples=hdbscan_min_samples,
cluster_selection_epsilon=hdbscan_cluster_selection_epsilon,
cluster_selection_method=hdbscan_cluster_selection_method,
prediction_data=True,
core_dist_n_jobs=1
)
labels = clusterer.fit_predict(reduced_embeddings_high_dim)
if len(set(labels)) > 1:
dbcv_score = hdbscan.validity.validity_index(reduced_embeddings_high_dim, labels)
else:
dbcv_score = -np.inf
return dbcv_score
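    # The objective maximizes the DBCV validity index (roughly -1 to 1, higher means
    # denser, better-separated clusters); -inf penalizes trials that collapse to a
    # single cluster so Optuna discards those parameter sets.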
study = optuna.create_study(
direction="maximize",
sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)
best_params = study.best_params
umap_model = umap.UMAP(
n_components=best_params["umap_n_components"],
random_state=42,
min_dist=best_params["umap_min_dist"],
n_neighbors=best_params["umap_n_neighbors"],
metric="cosine"
)
hdbscan_model = hdbscan.HDBSCAN(
min_cluster_size=best_params["hdbscan_min_cluster_size"],
min_samples=best_params["hdbscan_min_samples"],
cluster_selection_epsilon=best_params["hdbscan_cluster_selection_epsilon"],
cluster_selection_method=best_params["hdbscan_cluster_selection_method"],
prediction_data=True,
core_dist_n_jobs=1
)
vectorizer = CountVectorizer(
stop_words=list(ENGLISH_STOP_WORDS),
ngram_range=(2, 3)
)
representation_model = KeyBERTInspired()
embedding_model = SentenceTransformer("allenai/specter")
topic_model = BERTopic(
vectorizer_model=vectorizer,
umap_model=umap_model,
hdbscan_model=hdbscan_model,
embedding_model=embedding_model,
nr_topics='auto',
top_n_words=8,
representation_model=representation_model,
ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=False, bm25_weighting=True)
)
topics, _ = topic_model.fit_transform(df["clean_text"], np.vstack(df["embedding"].values))
df["topic"] = topics
topic_labels = {t: " | ".join([word for word, _ in topic_model.get_topic(t)][:8]) for t in set(topics)}
    reduced_embeddings_2d = umap.UMAP(n_components=2, random_state=42).fit_transform(
        np.vstack(df["embedding"].values)
    ).astype(np.float64)
df["x"] = reduced_embeddings_2d[:, 0]
df["y"] = reduced_embeddings_2d[:, 1]
df["topic_label"] = df["topic"].map(topic_labels)
return df, topic_labels
def build_dashboard(df, titleNm, topic_year, existing_app=None):
    global dash_app, TitleName
    TitleName = titleNm + "_" + topic_year
color_palette = px.colors.qualitative.Vivid
unique_topics = sorted(df["topic"].unique())
color_map = {topic: color_palette[i % len(color_palette)] for i, topic in enumerate(unique_topics)}
# Map colors to topics
df["color"] = df["topic"].map(color_map)
# Calculate the number of papers in each cluster
cluster_sizes = df.groupby("topic").size().reset_index(name="paper_count")
df = df.merge(cluster_sizes, on="topic", how="left")
app = existing_app if existing_app else dash.Dash(__name__, external_stylesheets=[dbc.themes.DARKLY])
# Improved marker scaling with a better range
min_size = 50
max_size = 140
df["marker_size"] = ((df["paper_count"] - df["paper_count"].min()) /
(df["paper_count"].max() - df["paper_count"].min())) * (max_size - min_size) + min_size
# Add log-transformed citation and influence columns
df["log_citation"] = np.log1p(df["citationCount"])
df["log_influence"] = np.log1p(df["influentialCitationCount"])
# Bayesian shrinkage for citations and influence
global_median_citation = df["log_citation"].median()
global_median_influence = df["log_influence"].median()
C = 10 # Shrinkage constant
def bayesian_shrinkage(group, global_median, C):
return (group.sum() + C * global_median) / (len(group) + C)
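    # Worked example (illustrative): with C = 10 and a global median of 2.0, a 5-paper
    # cluster whose log-citations sum to 20 gets (20 + 10 * 2.0) / (5 + 10) ≈ 2.67,
    # i.e. small clusters are pulled toward the global median.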
adjusted_citations = df.groupby("topic")["log_citation"].apply(
lambda x: bayesian_shrinkage(x, global_median_citation, C))
adjusted_influence = df.groupby("topic")["log_influence"].apply(
lambda x: bayesian_shrinkage(x, global_median_influence, C))
# Merge adjusted metrics back into the dataframe
df = df.merge(adjusted_citations.rename("adjusted_citation"), on="topic")
df = df.merge(adjusted_influence.rename("adjusted_influence"), on="topic")
# Calculate global percentiles for thresholds
citation_25th = df["adjusted_citation"].quantile(0.25)
citation_75th = df["adjusted_citation"].quantile(0.75)
influence_25th = df["adjusted_influence"].quantile(0.25)
influence_75th = df["adjusted_influence"].quantile(0.75)
# Enhanced theme classification with more distinct emojis
def classify_theme(row):
if row["adjusted_citation"] >= citation_75th and row["adjusted_influence"] >= influence_75th:
return "🔥 Hot Topic"
elif row["adjusted_citation"] <= citation_25th and row["adjusted_influence"] >= influence_75th:
return "💎 Gap Opportunity"
elif row["adjusted_citation"] >= citation_75th and row["adjusted_influence"] <= influence_25th:
return "⚠️ Risky Theme"
else:
return "🔄 Neutral"
df["theme"] = df.apply(classify_theme, axis=1)
# Create a more visually appealing figure
fig = go.Figure()
# Add subtle grid lines for reference
fig.update_xaxes(
showgrid=True,
gridwidth=0.1,
gridcolor='rgba(255, 255, 255, 0.05)',
zeroline=False
)
fig.update_yaxes(
showgrid=True,
gridwidth=0.1,
gridcolor='rgba(255, 255, 255, 0.05)',
zeroline=False
)
for topic in unique_topics:
topic_data = df[df["topic"] == topic]
# Get cluster center
center_x = topic_data["x"].mean()
center_y = topic_data["y"].mean()
# Get label
        full_topic_formatted = (
            topic_data["topic_label"].iloc[0]
            if "topic_label" in topic_data.columns
            else f"Cluster {topic}"
        )
# Add a subtle glow effect with a larger outer circle
fig.add_trace(
go.Scatter(
x=[center_x],
y=[center_y],
mode="markers",
marker=dict(
color=color_map[topic],
size=topic_data["marker_size"].iloc[0] * 1.2, # Slightly larger for glow effect
opacity=0.3,
line=dict(width=0),
symbol="circle",
),
showlegend=False,
hoverinfo="none",
)
)
# Add main cluster circle with enhanced styling
fig.add_trace(
go.Scatter(
x=[center_x],
y=[center_y],
mode="markers+text",
marker=dict(
color=color_map[topic],
size=topic_data["marker_size"].iloc[0],
opacity=0.85,
line=dict(width=2, color="white"),
symbol="circle",
),
text=[f"{topic}"],
textposition="middle center",
textfont=dict(
family="Arial Black",
size=16,
color="white"
),
name=f"{topic}",
                hovertemplate=(
                    "Cluster ID: %{text}<br>" +
                    "Name:<br>" + full_topic_formatted + "<br>" +
                    "Papers: " + str(topic_data["paper_count"].iloc[0]) + "<br>" +
                    "Popularity: " +
                    ("🔼 High" if topic_data["adjusted_citation"].iloc[0] >= citation_75th else "🔽 Low") +
                    f" (Adjusted Citation: {topic_data['adjusted_citation'].iloc[0]:.2f})<br>" +
                    "Impactfulness: " +
                    ("🔼 High" if topic_data["adjusted_influence"].iloc[0] >= influence_75th else "🔽 Low") +
                    f" (Adjusted Influence: {topic_data['adjusted_influence'].iloc[0]:.2f})<br>" +
                    "Theme: " + topic_data["theme"].iloc[0] +
                    "<br>"