from fastapi import HTTPException, APIRouter, Request
from pydantic import BaseModel
import requests
from time import sleep
import spacy

# Load SciSpacy model
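# (en_core_sci_sm is a SciSpacy model and is not bundled with spaCy itself;
#  it must be installed separately before spacy.load() will succeed.)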
nlp = spacy.load("en_core_sci_sm")

router = APIRouter()

# Pydantic model for request body
class SearchRequest(BaseModel):
    topic: str
    year: int
    userId: str  # ID of the user making the request
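
# Example request body (illustrative values):
#   {"topic": "machine learning in healthcare", "year": 2022, "userId": "user-123"}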



# Function to extract keywords from user topic
def extract_keywords(text):
    doc = nlp(text.lower())
    noun_chunks = [chunk.text.strip() for chunk in doc.noun_chunks]
    individual_tokens = [
        token.text.strip() for token in doc 
        if token.pos_ in ["NOUN", "VERB"] and not token.is_stop
    ]
    keywords = set(noun_chunks + individual_tokens)

    cleaned_keywords = set()
    for keyword in keywords:
        if not any(keyword in chunk and keyword != chunk for chunk in noun_chunks):
            cleaned_keywords.add(keyword)
    return sorted(cleaned_keywords)
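
# Illustrative example (exact output depends on the SciSpacy model):
#   extract_keywords("machine learning in healthcare")
#   -> ["healthcare", "machine learning"]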

# Fetch works from OpenAlex based on refined topic
def fetch_works(refined_query, year, per_page=200):
    OPENALEX_API_URL = "https://api.openalex.org/works"
    params = {
        "filter": f"title_and_abstract.search:{refined_query},publication_year:{year}",
        "per_page": per_page,
        "cursor": "*"
    }
    all_results = []

    # Page through all results with OpenAlex cursor pagination; stop once the
    # API no longer returns a next_cursor.
    while True:
        response = requests.get(OPENALEX_API_URL, params=params)
        if response.status_code != 200:
            raise HTTPException(status_code=response.status_code, detail="Error fetching data from OpenAlex")
        
        data = response.json()
        all_results.extend(data.get("results", []))

        next_cursor = data.get("meta", {}).get("next_cursor")
        if next_cursor:
            params["cursor"] = next_cursor
        else:
            break

        sleep(0.2)

    return all_results
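
# Note (optional, not part of the original code): OpenAlex supports a "mailto"
# query parameter for identifying your client and joining the "polite pool",
# e.g. params["mailto"] = "you@example.org" (placeholder address).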

# Batch fetch host organization details for sources
def batch_fetch_host_org_details(source_ids):
    host_org_map = {}
    batch_size = 100
    for i in range(0, len(source_ids), batch_size):
        batch = source_ids[i:i + batch_size]
        openalex_ids = [src_id.split("/")[-1] for src_id in batch]
        filter_query = "|".join(openalex_ids)
        url = f"https://api.openalex.org/sources?filter=openalex_id:{filter_query}"
        response = requests.get(url)
        if response.status_code == 200:
            sources = response.json().get("results", [])
            for source in sources:
                source_id = source.get("id")
                # host_organization is an OpenAlex ID/URL string (or absent);
                # the human-readable name is in host_organization_name.
                host_org_id = source.get("host_organization", "unknown")
                host_org_name = source.get("host_organization_name", "unknown")
                
                host_org_map[source_id] = {
                    "host_organization_name": host_org_name,
                    "host_organization_id": host_org_id
                }
            sleep(0.1)
        else:
            raise HTTPException(status_code=response.status_code, detail="Error fetching host organization details")
    return host_org_map

# Extract metadata from works
def extract_metadata(works):
    source_ids = list({
        work["primary_location"]["source"]["id"]
        for work in works 
        if work.get("primary_location") and work["primary_location"].get("source")
    })
    
    # Fetch host organization details (name and ID)
    host_org_map = batch_fetch_host_org_details(source_ids)
    
    metadata = []
    for work in works:
        primary_location = work.get("primary_location", {}) or {}
        source = primary_location.get("source", {}) or {}

        source_id = source.get("id")
        host_org_details = host_org_map.get(source_id, {"host_organization_name": "Unknown", "host_organization_id": "Unknown"})

        # Extract type field
        work_type = (
            work.get("type") or  # Check the top-level "type" field
            primary_location.get("type") or  # Check "type" in primary_location
            source.get("type") or  # Check "type" in source
            "unknown"  # Fallback value
        )

        metadata.append({
            "id": work.get("id"),
            "is_oa": work.get("open_access", {}).get("is_oa", False),
            "oa_status": work.get("open_access", {}).get("oa_status", "unknown"),
            "venue": source.get("display_name", "unknown"),
            "venue_id": source_id or "unknown",
            "host_organization_name": host_org_details["host_organization_name"],
            "host_organization_id": host_org_details["host_organization_id"],
            "type": work_type,
            "publication_date": work.get("publication_date", "unknown"),
        })
    return metadata
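
# Each metadata entry has roughly this shape (illustrative values):
#   {"id": "https://openalex.org/W...", "is_oa": True, "oa_status": "gold",
#    "venue": "Some Journal", "venue_id": "https://openalex.org/S...",
#    "host_organization_name": "Some Publisher",
#    "host_organization_id": "https://openalex.org/P...",
#    "type": "article", "publication_date": "2022-01-01"}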

# Clean metadata by removing entries with unknown venue and host organization name
def clean_metadata(metadata):
    initial_count = len(metadata)
    cleaned_metadata = [
        entry for entry in metadata
        if not (entry["venue"] == "unknown" and entry["host_organization_name"] == "Unknown")
    ]
    final_count = len(cleaned_metadata)
    print(f"Total papers before cleaning: {initial_count}")
    print(f"Total papers after cleaning: {final_count}")
    return cleaned_metadata

# Save metadata to MongoDB
async def save_to_mongodb(userId, topic, year, metadata, request: Request):
    document = {
        "userId": userId,
        "topic": topic,
        "year": year,
        "metadata": metadata
    }
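    # collection2 is assumed to be attached to app.state at startup as an
    # async (e.g. Motor) collection, since update_one is awaited below.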
    collection = request.app.state.collection2
    await collection.update_one(
        {"userId": userId, "topic": topic, "year": year},
        {"$set": document},
        upsert=True
    )
    print(f"Data saved to MongoDB for userId: {userId}, topic: {topic}, year: {year}")

# FastAPI endpoint
@router.post("/search/")
async def search(data: SearchRequest, request: Request):
    userId = data.userId
    topic = data.topic
    year = data.year

    # Extract keywords and refine query
    keywords = extract_keywords(topic)
    refined_query = "+".join(keywords)
    print(f"Using refined search query: {refined_query}")

    # Fetch works from OpenAlex
    works = fetch_works(refined_query, year)
    if not works:
        raise HTTPException(status_code=404, detail="No works found for the given query")

    # Extract metadata
    metadata = extract_metadata(works)

    # Clean metadata
    cleaned_metadata = clean_metadata(metadata)

    # Save metadata to MongoDB
    await save_to_mongodb(userId, topic, year, cleaned_metadata, request)

    # Return metadata as JSON response
    return {"message": f"Data saved to MongoDB for userId: {userId}, topic: {topic}, year: {year}", "metadata": cleaned_metadata}

@router.post("/check-data-exists-venue/")
async def check_data_exists(request_data: SearchRequest, request: Request):
    # Create a query to check if the data exists
    query = {
        "userId": request_data.userId,
        "topic": request_data.topic
    }
    
    # Add year to query if it's provided
    if request_data.year:
        query["year"] = request_data.year
    
    collection = request.app.state.collection2
    # Check if a document matching the query exists
    document = await collection.find_one(query)  # Await the async operation
    
    # Return result
    return {
        "exists": document is not None,
        "message": "Data found" if document else "Data not found"
    }
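
# Example usage (assuming the router is mounted at the application root and the
# server runs on localhost:8000; both are assumptions, adjust to your setup):
#   curl -X POST http://localhost:8000/search/ \
#        -H "Content-Type: application/json" \
#        -d '{"topic": "machine learning in healthcare", "year": 2022, "userId": "user-123"}'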