from fastapi import HTTPException, APIRouter, Request
from pydantic import BaseModel
import requests
from time import sleep
import spacy

# Load the scispaCy model once at module import so every request reuses it.
nlp = spacy.load("en_core_sci_sm")

router = APIRouter()


class SearchRequest(BaseModel):
    topic: str
    year: int
    userId: str


def extract_keywords(text):
    """Extract candidate keywords from free text using noun chunks plus content tokens."""
    doc = nlp(text.lower())
    noun_chunks = [chunk.text.strip() for chunk in doc.noun_chunks]
    individual_tokens = [
        token.text.strip()
        for token in doc
        if token.pos_ in ["NOUN", "VERB"] and not token.is_stop
    ]
    keywords = set(noun_chunks + individual_tokens)

    # Drop tokens that are strictly contained in a longer noun chunk.
    cleaned_keywords = set()
    for keyword in keywords:
        if not any(keyword in chunk and keyword != chunk for chunk in noun_chunks):
            cleaned_keywords.add(keyword)
    return sorted(cleaned_keywords)
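
# Illustrative usage (output depends on the loaded scispaCy model; values are hypothetical):
#   extract_keywords("graph neural networks for drug discovery")
#   -> ["drug discovery", "graph neural networks"]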


def fetch_works(refined_query, year, per_page=200):
    """Fetch all OpenAlex works matching the query and year, following cursor pagination."""
    OPENALEX_API_URL = "https://api.openalex.org/works"
    params = {
        "filter": f"title_and_abstract.search:{refined_query},publication_year:{year}",
        "per_page": per_page,
        "cursor": "*",
    }
    all_results = []

    while True:
        response = requests.get(OPENALEX_API_URL, params=params)
        if response.status_code != 200:
            raise HTTPException(status_code=response.status_code, detail="Error fetching data from OpenAlex")

        data = response.json()
        all_results.extend(data.get("results", []))

        # Advance the cursor; OpenAlex omits next_cursor on the last page.
        next_cursor = data.get("meta", {}).get("next_cursor")
        if next_cursor:
            params["cursor"] = next_cursor
        else:
            break

        # Brief pause between pages to stay well under OpenAlex rate limits.
        sleep(0.2)

    return all_results
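
# Illustrative call (query string and year are hypothetical):
#   works = fetch_works("graph+neural+networks", 2023)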


def batch_fetch_host_org_details(source_ids):
    """Look up host organization name and ID for each OpenAlex source ID, 100 IDs per request."""
    host_org_map = {}
    batch_size = 100
    for i in range(0, len(source_ids), batch_size):
        batch = source_ids[i:i + batch_size]
        # OpenAlex IDs are full URLs; the filter expects only the trailing identifier.
        openalex_ids = [src_id.split("/")[-1] for src_id in batch]
        filter_query = "|".join(openalex_ids)
        url = f"https://api.openalex.org/sources?filter=openalex_id:{filter_query}"
        response = requests.get(url)
        if response.status_code == 200:
            sources = response.json().get("results", [])
            for source in sources:
                source_id = source.get("id")

                host_org_id = source.get("host_organization", "unknown")
                host_org_name = source.get("host_organization_name", "unknown")

                host_org_map[source_id] = {
                    "host_organization_name": host_org_name,
                    "host_organization_id": host_org_id
                }
            # Brief pause between batch requests.
            sleep(0.1)
        else:
            raise HTTPException(status_code=response.status_code, detail="Error fetching host organization details")
    return host_org_map
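
# Illustrative shape of the returned map (the source ID is hypothetical):
#   {"https://openalex.org/S123456789": {"host_organization_name": "...", "host_organization_id": "..."}}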


def extract_metadata(works):
    """Flatten each OpenAlex work into the fields we store, resolving host organizations in batch."""
    # Collect distinct source IDs so host organizations can be fetched in one pass.
    source_ids = list({
        work["primary_location"]["source"]["id"]
        for work in works
        if work.get("primary_location") and work["primary_location"].get("source")
    })

    host_org_map = batch_fetch_host_org_details(source_ids)

    metadata = []
    for work in works:
        primary_location = work.get("primary_location", {}) or {}
        source = primary_location.get("source", {}) or {}

        source_id = source.get("id")
        host_org_details = host_org_map.get(
            source_id,
            {"host_organization_name": "Unknown", "host_organization_id": "Unknown"},
        )

        # Prefer the work-level type, then the location type, then the source type.
        work_type = (
            work.get("type") or
            primary_location.get("type") or
            source.get("type") or
            "unknown"
        )

        metadata.append({
            "id": work.get("id"),
            "is_oa": work.get("open_access", {}).get("is_oa", False),
            "oa_status": work.get("open_access", {}).get("oa_status", "unknown"),
            "venue": source.get("display_name", "unknown"),
            "venue_id": source_id or "unknown",
            "host_organization_name": host_org_details["host_organization_name"],
            "host_organization_id": host_org_details["host_organization_id"],
            "type": work_type,
            "publication_date": work.get("publication_date", "unknown"),
        })
    return metadata


def clean_metadata(metadata):
    """Drop entries with neither a known venue nor a known host organization."""
    initial_count = len(metadata)
    cleaned_metadata = [
        entry for entry in metadata
        if not (entry["venue"] == "unknown" and entry["host_organization_name"] == "Unknown")
    ]
    final_count = len(cleaned_metadata)
    print(f"Total papers before cleaning: {initial_count}")
    print(f"Total papers after cleaning: {final_count}")
    return cleaned_metadata


async def save_to_mongodb(userId, topic, year, metadata, request: Request):
    """Upsert the cleaned metadata keyed by (userId, topic, year)."""
    document = {
        "userId": userId,
        "topic": topic,
        "year": year,
        "metadata": metadata
    }
    collection = request.app.state.collection2
    await collection.update_one(
        {"userId": userId, "topic": topic, "year": year},
        {"$set": document},
        upsert=True
    )
    print(f"Data saved to MongoDB for userId: {userId}, topic: {topic}, year: {year}")


@router.post("/search/")
async def search(data: SearchRequest, request: Request):
    userId = data.userId
    topic = data.topic
    year = data.year

    # Turn the free-text topic into a keyword query for OpenAlex.
    keywords = extract_keywords(topic)
    refined_query = "+".join(keywords)
    print(f"Using refined search query: {refined_query}")

    works = fetch_works(refined_query, year)
    if not works:
        raise HTTPException(status_code=404, detail="No works found for the given query")

    metadata = extract_metadata(works)
    cleaned_metadata = clean_metadata(metadata)

    await save_to_mongodb(userId, topic, year, cleaned_metadata, request)

    return {
        "message": f"Data saved to MongoDB for userId: {userId}, topic: {topic}, year: {year}",
        "metadata": cleaned_metadata,
    }


@router.post("/check-data-exists-venue/")
async def check_data_exists(request_data: SearchRequest, request: Request):
    query = {
        "userId": request_data.userId,
        "topic": request_data.topic
    }

    if request_data.year:
        query["year"] = request_data.year

    collection = request.app.state.collection2
    document = await collection.find_one(query)

    return {
        "exists": document is not None,
        "message": "Data found" if document else "Data not found"
    }
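
# Illustrative request body for POST /search/ and /check-data-exists-venue/ (values are hypothetical):
#   {"topic": "protein folding", "year": 2022, "userId": "u123"}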