# Paperlens / venuedata.py
from fastapi import HTTPException, APIRouter, Request
from pydantic import BaseModel
import requests
from time import sleep
import spacy

# Load the SciSpacy model used for keyword extraction
nlp = spacy.load("en_core_sci_sm")

router = APIRouter()
# Pydantic model for the request body
class SearchRequest(BaseModel):
    topic: str
    year: int
    userId: str  # identifies which user's results to store
# Extract keywords from the user's topic
def extract_keywords(text):
    doc = nlp(text.lower())
    noun_chunks = [chunk.text.strip() for chunk in doc.noun_chunks]
    individual_tokens = [
        token.text.strip() for token in doc
        if token.pos_ in ["NOUN", "VERB"] and not token.is_stop
    ]
    keywords = set(noun_chunks + individual_tokens)
    # Drop any keyword that is a proper substring of a noun chunk, so e.g.
    # "networks" is removed when "neural networks" is already present
    cleaned_keywords = set()
    for keyword in keywords:
        if not any(keyword in chunk and keyword != chunk for chunk in noun_chunks):
            cleaned_keywords.add(keyword)
    return sorted(cleaned_keywords)
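
# Illustrative example of the extraction step (the actual output depends on
# the installed en_core_sci_sm model version, so treat this as a sketch
# rather than a guaranteed result):
#
#     extract_keywords("graph neural networks for drug discovery")
#     # -> ["drug discovery", "graph neural networks"]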
# Fetch works from OpenAlex for the refined topic, following cursor pagination
def fetch_works(refined_query, year, per_page=200):
    OPENALEX_API_URL = "https://api.openalex.org/works"
    params = {
        "filter": f"title_and_abstract.search:{refined_query},publication_year:{year}",
        "per_page": per_page,
        "cursor": "*"  # "*" starts cursor-based pagination
    }
    all_results = []
    while True:
        response = requests.get(OPENALEX_API_URL, params=params)
        if response.status_code != 200:
            raise HTTPException(status_code=response.status_code, detail="Error fetching data from OpenAlex")
        data = response.json()
        all_results.extend(data.get("results", []))
        # OpenAlex returns meta.next_cursor until the result set is exhausted
        next_cursor = data.get("meta", {}).get("next_cursor")
        if next_cursor:
            params["cursor"] = next_cursor
        else:
            break
        sleep(0.2)  # stay polite to the API between pages
    return all_results
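
# For reference, the first page of the paginated query above resolves to a
# request like this (query values illustrative):
#
#     GET https://api.openalex.org/works?filter=title_and_abstract.search:graph+neural+networks,publication_year:2022&per_page=200&cursor=*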
# Batch-fetch host organization details for a list of source IDs
def batch_fetch_host_org_details(source_ids):
    host_org_map = {}
    batch_size = 100
    for i in range(0, len(source_ids), batch_size):
        batch = source_ids[i:i + batch_size]
        openalex_ids = [src_id.split("/")[-1] for src_id in batch]
        filter_query = "|".join(openalex_ids)
        # per_page must cover the whole batch; the API's default page size (25)
        # would silently drop sources from batches larger than one page
        url = f"https://api.openalex.org/sources?filter=openalex_id:{filter_query}&per_page={batch_size}"
        response = requests.get(url)
        if response.status_code == 200:
            sources = response.json().get("results", [])
            for source in sources:
                source_id = source.get("id")
                # host_organization is a plain string (an OpenAlex URL), not an object
                host_org_id = source.get("host_organization", "unknown")
                host_org_name = source.get("host_organization_name", "unknown")
                host_org_map[source_id] = {
                    "host_organization_name": host_org_name,
                    "host_organization_id": host_org_id
                }
            sleep(0.1)
        else:
            raise HTTPException(status_code=response.status_code, detail="Error fetching host organization details")
    return host_org_map
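
# The returned map is keyed by the full source URL; its shape looks like this
# (values illustrative):
#
#     {
#         "https://openalex.org/S1983995261": {
#             "host_organization_name": "PeerJ, Inc.",
#             "host_organization_id": "https://openalex.org/P4310320104",
#         },
#         ...
#     }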
# Extract metadata from works
def extract_metadata(works):
    source_ids = list({
        work["primary_location"]["source"]["id"]
        for work in works
        if work.get("primary_location") and work["primary_location"].get("source")
    })
    # Fetch host organization details (name and ID) in batches
    host_org_map = batch_fetch_host_org_details(source_ids)
    metadata = []
    for work in works:
        primary_location = work.get("primary_location", {}) or {}
        source = primary_location.get("source", {}) or {}
        source_id = source.get("id")
        host_org_details = host_org_map.get(
            source_id,
            {"host_organization_name": "unknown", "host_organization_id": "unknown"}
        )
        # Take the first "type" field available, from most to least specific
        work_type = (
            work.get("type")
            or primary_location.get("type")
            or source.get("type")
            or "unknown"
        )
        metadata.append({
            "id": work.get("id"),
            "is_oa": work.get("open_access", {}).get("is_oa", False),
            "oa_status": work.get("open_access", {}).get("oa_status", "unknown"),
            "venue": source.get("display_name", "unknown"),
            "venue_id": source_id or "unknown",
            "host_organization_name": host_org_details["host_organization_name"],
            "host_organization_id": host_org_details["host_organization_id"],
            "type": work_type,
            "publication_date": work.get("publication_date", "unknown"),
        })
    return metadata
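
# A single metadata entry ends up shaped like this (values illustrative):
#
#     {
#         "id": "https://openalex.org/W2741809807",
#         "is_oa": True,
#         "oa_status": "gold",
#         "venue": "PeerJ",
#         "venue_id": "https://openalex.org/S1983995261",
#         "host_organization_name": "PeerJ, Inc.",
#         "host_organization_id": "https://openalex.org/P4310320104",
#         "type": "article",
#         "publication_date": "2017-08-29",
#     }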
# Drop entries where both the venue and the host organization are unknown
def clean_metadata(metadata):
    initial_count = len(metadata)
    cleaned_metadata = [
        entry for entry in metadata
        if not (entry["venue"] == "unknown" and entry["host_organization_name"] == "unknown")
    ]
    final_count = len(cleaned_metadata)
    print(f"Total papers before cleaning: {initial_count}")
    print(f"Total papers after cleaning: {final_count}")
    return cleaned_metadata
# Save metadata to MongoDB (one document per userId/topic/year combination)
async def save_to_mongodb(userId, topic, year, metadata, request: Request):
    document = {
        "userId": userId,
        "topic": topic,
        "year": year,
        "metadata": metadata
    }
    # app.state.collection2 must be an async collection (e.g. Motor),
    # since update_one is awaited here
    collection = request.app.state.collection2
    await collection.update_one(
        {"userId": userId, "topic": topic, "year": year},
        {"$set": document},
        upsert=True
    )
    print(f"Data saved to MongoDB for userId: {userId}, topic: {topic}, year: {year}")
# FastAPI endpoint: search OpenAlex, extract and clean metadata, persist it
@router.post("/search/")
async def search(data: SearchRequest, request: Request):
    userId = data.userId
    topic = data.topic
    year = data.year
    # Extract keywords and refine the query
    keywords = extract_keywords(topic)
    refined_query = "+".join(keywords)
    print(f"Using refined search query: {refined_query}")
    # Fetch works from OpenAlex
    works = fetch_works(refined_query, year)
    if not works:
        raise HTTPException(status_code=404, detail="No works found for the given query")
    # Extract and clean metadata
    metadata = extract_metadata(works)
    cleaned_metadata = clean_metadata(metadata)
    # Save metadata to MongoDB
    await save_to_mongodb(userId, topic, year, cleaned_metadata, request)
    # Return metadata as a JSON response
    return {
        "message": f"Data saved to MongoDB for userId: {userId}, topic: {topic}, year: {year}",
        "metadata": cleaned_metadata
    }
# FastAPI endpoint: check whether results are already cached for this query
@router.post("/check-data-exists-venue/")
async def check_data_exists(request_data: SearchRequest, request: Request):
    # Build a query matching the saved document
    query = {
        "userId": request_data.userId,
        "topic": request_data.topic
    }
    # Add the year to the query if it was provided
    if request_data.year:
        query["year"] = request_data.year
    collection = request.app.state.collection2
    # Check if a document matching the query exists
    document = await collection.find_one(query)
    # Return the result
    return {
        "exists": document is not None,
        "message": "Data found" if document else "Data not found"
    }
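
# A quick local smoke test of the keyword extraction step, runnable without
# the API or database (output is illustrative and depends on the installed
# en_core_sci_sm model version):
if __name__ == "__main__":
    print(extract_keywords("graph neural networks for drug discovery"))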