import json
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from time import sleep

import requests
import spacy
from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel
from tqdm import tqdm

# scispaCy model used for keyword extraction; it must be installed separately.
nlp = spacy.load("en_core_sci_sm")

templates = Jinja2Templates(directory="templates")

# OpenAlex API settings.
OPENALEX_API_URL = "https://api.openalex.org/works"
PER_PAGE = 100        # results per page when fetching topic papers
REQUEST_DELAY = 0.5   # seconds to wait between paginated requests
MAX_RESULTS = 1000    # cap on the number of topic papers collected

router = APIRouter()


class CitationAnalysisRequest(BaseModel):
    """Request body shared by the citation-analysis endpoints."""

    userId: str
    topic: str
    year: int


def extract_keywords(text):
    """Extract candidate keywords from free text using noun chunks and content tokens."""
    doc = nlp(text.lower())
    noun_chunks = [chunk.text for chunk in doc.noun_chunks]
    individual_tokens = [token.text for token in doc if token.pos_ in ["NOUN", "VERB"] and not token.is_stop]
    keywords = set(noun_chunks + individual_tokens)

    # Keep only the most specific phrases: drop any keyword that is a strict
    # substring of a larger noun chunk.
    cleaned_keywords = set()
    for keyword in keywords:
        if not any(keyword in chunk for chunk in noun_chunks if keyword != chunk):
            cleaned_keywords.add(keyword)
    return sorted(cleaned_keywords)


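# Illustrative example only (hypothetical output; the actual keywords depend on
# the "en_core_sci_sm" model and its tokenization):
#   extract_keywords("deep learning for protein structure prediction")
#   -> ["deep learning", "protein structure prediction"]

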
def fetch_citing_papers_for_batch(paper_ids):
    """For each paper ID, collect the IDs of OpenAlex works that cite it."""
    citing_map = {pid: [] for pid in paper_ids}
    batch_size = 100  # number of paper IDs bundled into a single "cites" filter

    def fetch_batch(batch_ids):
        # "cites:" with "|"-joined IDs matches works that cite any paper in the batch.
        filter_query = "|".join(batch_ids)
        cursor = "*"
        local_citing_map = {pid: [] for pid in batch_ids}

        while True:
            params = {
                "filter": f"cites:{filter_query}",
                "per_page": 200,
                "cursor": cursor
            }

            try:
                response = requests.get(OPENALEX_API_URL, params=params)
                if response.status_code != 200:
                    print(f"❌ Error while fetching citing papers: {response.status_code}")
                    break

                data = response.json()
                for citing_work in data.get("results", []):
                    refs = citing_work.get("referenced_works", [])
                    for ref_id in refs:
                        if ref_id in local_citing_map:
                            local_citing_map[ref_id].append(citing_work["id"])

                # Cursor-based pagination: stop when OpenAlex returns no next_cursor.
                next_cursor = data.get("meta", {}).get("next_cursor")
                if not next_cursor:
                    break
                cursor = next_cursor
                sleep(REQUEST_DELAY)

            except Exception as e:
                print(f"❌ Exception: {e}")
                break

        return local_citing_map

    print("\n🔍 Fetching citing papers for all topic papers...")
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for i in range(0, len(paper_ids), batch_size):
            batch_ids = paper_ids[i:i + batch_size]
            futures.append(executor.submit(fetch_batch, batch_ids))

        for future in tqdm(futures, desc="ParallelGroup"):
            local_citing_map = future.result()
            citing_map.update(local_citing_map)

    return citing_map


def fetch_papers_with_citations(keywords, year, concept_threshold=0.8, max_results=MAX_RESULTS):
    """Fetch papers matching the keywords and year, then attach citing-paper IDs."""
    query = " + ".join(keywords)
    print(f"\n🔍 Final OpenAlex Query: {query}")
    params = {
        "filter": f"title_and_abstract.search:{query},publication_year:{year}",
        "per_page": PER_PAGE,
        "cursor": "*"
    }

    all_papers = []
    paper_id_map = {}
    print("\n🔍 Fetching topic papers from OpenAlex...")

    while len(all_papers) < max_results:
        response = requests.get(OPENALEX_API_URL, params=params)
        if response.status_code != 200:
            print(f"❌ Error: {response.status_code}")
            break

        data = response.json()
        results = data.get("results", [])

        for paper in tqdm(results, desc="📄 Collecting papers"):
            paper_id = paper.get("id", "")
            # Keep only concept tags above the confidence threshold.
            concepts = [
                c["display_name"]
                for c in paper.get("concepts", [])
                if c.get("score", 0) >= concept_threshold
            ]

            paper_data = {
                "id": paper_id,
                "title": paper.get("title", "No title"),
                "cited_by_count": paper.get("cited_by_count", 0),
                "publication_date": paper.get("publication_date", ""),
                "referenced_works": paper.get("referenced_works", []),
                "concepts": concepts,
                "cited_by_ids": []
            }
            paper_id_map[paper_id] = paper_data
            all_papers.append(paper_data)

        next_cursor = data.get("meta", {}).get("next_cursor")
        if not next_cursor:
            break

        params["cursor"] = next_cursor
        sleep(REQUEST_DELAY)

    # Attach the IDs of the papers that cite each collected paper.
    all_ids = list(paper_id_map.keys())
    citing_map = fetch_citing_papers_for_batch(all_ids)

    for pid, citing_ids in citing_map.items():
        paper_id_map[pid]["cited_by_ids"] = citing_ids

    # Drop papers with neither outgoing references nor incoming citations,
    # since they contribute nothing to the citation graph.
    cleaned_papers = [
        paper for paper in all_papers
        if paper.get("referenced_works") or paper.get("cited_by_ids")
    ]

    print(f"\n🧹 Removed {len(all_papers) - len(cleaned_papers)} papers without references or citations.")
    return cleaned_papers[:max_results]


async def save_to_mongodb(userId, topic, year, papers, request: Request):
    """Persist the scraped papers, plus query metadata, to MongoDB."""
    metadata = {
        "userId": userId,
        "topic": topic,
        "year": year,
        "scraped_on": datetime.utcnow().isoformat() + "Z",
        "papers": papers
    }
    collection = request.app.state.collection1
    result = await collection.insert_one(metadata)
    print(f"\n✅ Saved metadata to MongoDB with ID: {result.inserted_id}")
    return str(result.inserted_id)


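# Example request for the endpoint below (illustrative values only):
#   POST /save
#   {"userId": "user-123", "topic": "graph neural networks for drug discovery", "year": 2023}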
@router.post("/save")
|
|
async def save_data(data_request: CitationAnalysisRequest, saveRequest: Request):
|
|
userId = data_request.userId
|
|
topic = data_request.topic
|
|
year = data_request.year
|
|
|
|
keywords = extract_keywords(topic)
|
|
print("\nπ Extracted Keywords:")
|
|
print(keywords)
|
|
|
|
if not keywords:
|
|
raise HTTPException(status_code=400, detail="No keywords extracted. Please provide a valid topic.")
|
|
|
|
papers = fetch_papers_with_citations(keywords, year)
|
|
if not papers:
|
|
raise HTTPException(status_code=404, detail="No papers retrieved for the given topic and year.")
|
|
|
|
document_id = await save_to_mongodb(userId, topic, year, papers, saveRequest)
|
|
return {"message": "Data saved successfully!", "document_id": document_id}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
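# Example request for the endpoint below (illustrative values only):
#   GET /citation-data?userId=user-123&topic=graph%20neural%20networks%20for%20drug%20discovery&year=2023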
@router.get("/citation-data", response_class=HTMLResponse)
|
|
async def get_citation_data(request: Request):
|
|
|
|
user_id = request.query_params.get("userId")
|
|
topic = request.query_params.get("topic")
|
|
year = request.query_params.get("year")
|
|
|
|
if not user_id or not topic or not year:
|
|
raise HTTPException(status_code=400, detail="Missing required query parameters.")
|
|
|
|
|
|
query = {"userId": user_id, "topic": topic, "year": int(year)}
|
|
|
|
collection = request.app.state.collection1
|
|
result = await collection.find_one(
|
|
query,
|
|
sort=[("scraped_on", -1)]
|
|
)
|
|
|
|
if not result:
|
|
raise HTTPException(status_code=404, detail="No data found for the specified criteria")
|
|
|
|
|
|
result["_id"] = str(result["_id"])
|
|
|
|
data = result["papers"]
|
|
if not data:
|
|
raise HTTPException(status_code=404, detail="No papers found in the database.")
|
|
|
|
data_json = json.dumps(data)
|
|
|
|
return templates.TemplateResponse("gra.html", {"request": request, "data": data_json})
|
|
|
|
|
|
@router.post("/check-data-exists-citation/")
|
|
async def check_data_exists(data_request: CitationAnalysisRequest, saveRequest: Request):
|
|
|
|
query = {
|
|
"userId": data_request.userId,
|
|
"topic": data_request.topic,
|
|
"year": data_request.year
|
|
}
|
|
|
|
|
|
collection = saveRequest.app.state.collection1
|
|
|
|
|
|
document = await collection.find_one(query)
|
|
|
|
|
|
if document:
|
|
return {
|
|
"exists": True,
|
|
"message": "Data found for the given userId, topic, and year."
|
|
}
|
|
else:
|
|
return {
|
|
"exists": False,
|
|
"message": "No data found for the given userId, topic, and year."
|
|
}
|
|
|
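# Minimal wiring sketch (not part of this module): the router above expects an
# async MongoDB collection on `app.state.collection1`. Assuming Motor as the
# async driver and placeholder names ("citation_db", "citation_data", and the
# connection URI), the application might be assembled like this:
#
#   from fastapi import FastAPI
#   from motor.motor_asyncio import AsyncIOMotorClient
#
#   app = FastAPI()
#   client = AsyncIOMotorClient("mongodb://localhost:27017")
#   app.state.collection1 = client["citation_db"]["citation_data"]
#   app.include_router(router)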