from fastapi import HTTPException, APIRouter, Request
from pydantic import BaseModel
import requests
from time import sleep
import spacy

# Load SciSpacy model
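# (en_core_sci_sm is a SciSpacy model and is not bundled with spaCy itself;
#  it must be installed separately before spacy.load() will succeed.)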
nlp = spacy.load("en_core_sci_sm")

router = APIRouter()

# Pydantic model for request body
class SearchRequest(BaseModel):
    topic: str
    year: int
    userId: str  # ID of the user making the request
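
# Example request body (illustrative values):
#   {"topic": "machine learning in healthcare", "year": 2022, "userId": "user-123"}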



# Function to extract keywords from user topic
def extract_keywords(text):
    doc = nlp(text.lower())
    noun_chunks = [chunk.text.strip() for chunk in doc.noun_chunks]
    individual_tokens = [
        token.text.strip() for token in doc 
        if token.pos_ in ["NOUN", "VERB"] and not token.is_stop
    ]
    keywords = set(noun_chunks + individual_tokens)

    cleaned_keywords = set()
    for keyword in keywords:
        if not any(keyword in chunk and keyword != chunk for chunk in noun_chunks):
            cleaned_keywords.add(keyword)
    return sorted(cleaned_keywords)
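
# Illustrative example (exact output depends on the SciSpacy model):
#   extract_keywords("machine learning in healthcare")
#   -> ["healthcare", "machine learning"]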

# Fetch works from OpenAlex based on refined topic
def fetch_works(refined_query, year, per_page=200):
    OPENALEX_API_URL = "https://api.openalex.org/works"
    params = {
        "filter": f"title_and_abstract.search:{refined_query},publication_year:{year}",
        "per_page": per_page,
        "cursor": "*"
    }
    all_results = []

    # Page through all results with OpenAlex cursor pagination; stop once the
    # API no longer returns a next_cursor.
    while True:
        response = requests.get(OPENALEX_API_URL, params=params)
        if response.status_code != 200:
            raise HTTPException(status_code=response.status_code, detail="Error fetching data from OpenAlex")
        
        data = response.json()
        all_results.extend(data.get("results", []))

        next_cursor = data.get("meta", {}).get("next_cursor")
        if next_cursor:
            params["cursor"] = next_cursor
        else:
            break

        sleep(0.2)

    return all_results
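
# Note (optional, not part of the original code): OpenAlex supports a "mailto"
# query parameter for identifying your client and joining the "polite pool",
# e.g. params["mailto"] = "you@example.org" (placeholder address).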

# Batch fetch host organization details for sources
def batch_fetch_host_org_details(source_ids):
    host_org_map = {}
    batch_size = 100
    for i in range(0, len(source_ids), batch_size):
        batch = source_ids[i:i + batch_size]
        openalex_ids = [src_id.split("/")[-1] for src_id in batch]
        filter_query = "|".join(openalex_ids)
        url = f"https://api.openalex.org/sources?filter=openalex_id:{filter_query}"
        response = requests.get(url)
        if response.status_code == 200:
            sources = response.json().get("results", [])
            for source in sources:
                source_id = source.get("id")
                # host_organization is an OpenAlex ID/URL string (or absent);
                # the human-readable name is in host_organization_name.
                host_org_id = source.get("host_organization", "unknown")
                host_org_name = source.get("host_organization_name", "unknown")
                
                host_org_map[source_id] = {
                    "host_organization_name": host_org_name,
                    "host_organization_id": host_org_id
                }
            sleep(0.1)
        else:
            raise HTTPException(status_code=response.status_code, detail="Error fetching host organization details")
    return host_org_map

# Extract metadata from works
def extract_metadata(works):
    source_ids = list({
        work["primary_location"]["source"]["id"]
        for work in works 
        if work.get("primary_location") and work["primary_location"].get("source")
    })
    
    # Fetch host organization details (name and ID)
    host_org_map = batch_fetch_host_org_details(source_ids)
    
    metadata = []
    for work in works:
        primary_location = work.get("primary_location", {}) or {}
        source = primary_location.get("source", {}) or {}

        source_id = source.get("id")
        host_org_details = host_org_map.get(source_id, {"host_organization_name": "Unknown", "host_organization_id": "Unknown"})

        # Extract type field
        work_type = (
            work.get("type") or  # Check the top-level "type" field
            primary_location.get("type") or  # Check "type" in primary_location
            source.get("type") or  # Check "type" in source
            "unknown"  # Fallback value
        )

        metadata.append({
            "id": work.get("id"),
            "is_oa": work.get("open_access", {}).get("is_oa", False),
            "oa_status": work.get("open_access", {}).get("oa_status", "unknown"),
            "venue": source.get("display_name", "unknown"),
            "venue_id": source_id or "unknown",
            "host_organization_name": host_org_details["host_organization_name"],
            "host_organization_id": host_org_details["host_organization_id"],
            "type": work_type,
            "publication_date": work.get("publication_date", "unknown"),
        })
    return metadata
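
# Each metadata entry has roughly this shape (illustrative values):
#   {"id": "https://openalex.org/W...", "is_oa": True, "oa_status": "gold",
#    "venue": "Some Journal", "venue_id": "https://openalex.org/S...",
#    "host_organization_name": "Some Publisher",
#    "host_organization_id": "https://openalex.org/P...",
#    "type": "article", "publication_date": "2022-01-01"}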

# Clean metadata by removing entries with unknown venue and host organization name
def clean_metadata(metadata):
    initial_count = len(metadata)
    cleaned_metadata = [
        entry for entry in metadata
        if not (entry["venue"] == "unknown" and entry["host_organization_name"] == "Unknown")
    ]
    final_count = len(cleaned_metadata)
    print(f"Total papers before cleaning: {initial_count}")
    print(f"Total papers after cleaning: {final_count}")
    return cleaned_metadata

# Save metadata to MongoDB
async def save_to_mongodb(userId, topic, year, metadata, request: Request):
    document = {
        "userId": userId,
        "topic": topic,
        "year": year,
        "metadata": metadata
    }
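    # collection2 is assumed to be attached to app.state at startup as an
    # async (e.g. Motor) collection, since update_one is awaited below.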
    collection = request.app.state.collection2
    await collection.update_one(
        {"userId": userId, "topic": topic, "year": year},
        {"$set": document},
        upsert=True
    )
    print(f"Data saved to MongoDB for userId: {userId}, topic: {topic}, year: {year}")

# FastAPI endpoint
@router.post("/search/")
async def search(data: SearchRequest, request: Request):
    userId = data.userId
    topic = data.topic
    year = data.year

    # Extract keywords and refine query
    keywords = extract_keywords(topic)
    refined_query = "+".join(keywords)
    print(f"Using refined search query: {refined_query}")

    # Fetch works from OpenAlex
    works = fetch_works(refined_query, year)
    if not works:
        raise HTTPException(status_code=404, detail="No works found for the given query")

    # Extract metadata
    metadata = extract_metadata(works)

    # Clean metadata
    cleaned_metadata = clean_metadata(metadata)

    # Save metadata to MongoDB
    await save_to_mongodb(userId, topic, year, cleaned_metadata, request)

    # Return metadata as JSON response
    return {"message": f"Data saved to MongoDB for userId: {userId}, topic: {topic}, year: {year}", "metadata": cleaned_metadata}

@router.post("/check-data-exists-venue/")
async def check_data_exists(request_data: SearchRequest, request: Request):
    # Create a query to check if the data exists
    query = {
        "userId": request_data.userId,
        "topic": request_data.topic
    }
    
    # Add year to query if it's provided
    if request_data.year:
        query["year"] = request_data.year
    
    collection = request.app.state.collection2
    # Check if a document matching the query exists
    document = await collection.find_one(query)  # Await the async operation
    
    # Return result
    return {
        "exists": document is not None,
        "message": "Data found" if document else "Data not found"
    }
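
# Example usage (assuming the router is mounted at the application root and the
# server runs on localhost:8000; both are assumptions, adjust to your setup):
#   curl -X POST http://localhost:8000/search/ \
#        -H "Content-Type: application/json" \
#        -d '{"topic": "machine learning in healthcare", "year": 2022, "userId": "user-123"}'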