File size: 2,195 Bytes
79899c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from typing import List, Optional

import pandas as pd

from bio_requests.rag_request import RagRequest
from dto.bio_document import BaseBioDocument
from utils.bio_logger import bio_logger as logger

# Read the spreadsheet and keep just the three columns used for scoring:
# 'ISSN', '5年IF' and 'EISSN'. Cells in '5年IF' that cannot be parsed as a
# number are replaced by the fallback score 0.01.
df = pd.read_excel("config/2023JCR(完整).xlsx")[["ISSN", "5年IF", "EISSN"]].assign(
    **{
        "5年IF": lambda frame: pd.to_numeric(frame["5年IF"], errors="coerce").fillna(
            0.01
        )
    }
)


class RerankService:
    """Re-rank documents by the '5年IF' value of the journal they appear in.

    Scores are read from the module-level ``df`` table, which must provide
    the columns ``ISSN``, ``EISSN`` and ``5年IF``.
    """

    def __init__(self):
        # Keep the table reachable as an attribute for existing callers.
        self.df = df
        # Build the lookup dictionaries up front so the first rerank call
        # does not pay for it.
        self._score_tables()

    def _build_lookup(self, column: str) -> dict:
        """Return a dict mapping each value of ``column`` to its '5年IF'.

        Rows with a missing identifier are dropped (they could never match
        a query) and, for repeated identifiers, the first row wins.
        """
        table = (
            self.df[[column, "5年IF"]]
            .dropna(subset=[column])
            .drop_duplicates(subset=column, keep="first")
        )
        return dict(zip(table[column].tolist(), table["5年IF"].tolist()))

    def _score_tables(self):
        """Return the ``(ISSN -> IF, EISSN -> IF)`` lookup dictionaries.

        The dictionaries are cached and rebuilt only when ``self.df`` has
        been rebound to a different DataFrame object since the last build.
        """
        if getattr(self, "_tables_source", None) is not self.df:
            self._tables = (
                self._build_lookup("ISSN"),
                self._build_lookup("EISSN"),
            )
            self._tables_source = self.df
        return self._tables

    @staticmethod
    def _extract_issn(document):
        """Return ``document.journal["issn"]``, or ``None`` when unavailable."""
        journal = getattr(document, "journal", None)
        try:
            return journal["issn"]
        except (KeyError, TypeError):
            # No journal, or a journal without an ISSN entry: the document
            # simply has no impact factor instead of aborting the rerank.
            return None

    @staticmethod
    def _lookup(table: dict, key):
        """Return ``table[key]`` or ``None``; unhashable keys never match."""
        try:
            return table.get(key)
        except TypeError:
            return None

    async def rerank(
        self,
        rag_request: RagRequest,
        documents: Optional[List[BaseBioDocument]] = None,
    ) -> List[BaseBioDocument]:
        """Sort ``documents`` by impact factor, highest first.

        Args:
            rag_request: The request being served. Re-ranking only happens
                when its ``data_source`` contains ``"pubmed"``.
            documents: Documents to re-rank. ``None`` is treated as an
                empty list.

        Returns:
            The input list unchanged when the data source is not pubmed.
            Otherwise a new list, de-duplicated by ``bio_id`` and sorted by
            ``if_score`` in descending order; documents without a score are
            ranked as if their score were 0.01.

        Side effects:
            When re-ranking happens, ``if_score`` is set on every input
            document: the matching '5年IF' value, or ``None`` if the ISSN is
            found in neither the 'ISSN' nor the 'EISSN' column.
        """
        # A fresh list per call; a shared ``[]`` default would be handed
        # out to (and could be mutated by) every caller.
        if documents is None:
            documents = []

        if not rag_request.data_source or "pubmed" not in rag_request.data_source:
            logger.info("RerankService: data_source is not pubmed, skip rerank")
            return documents
        logger.info("RerankService: start rerank")

        if_by_issn, if_by_eissn = self._score_tables()

        # Step 1: attach the impact factor to each document. The 'ISSN'
        # column takes precedence; 'EISSN' is only consulted as a fallback.
        for document in documents:
            issn = self._extract_issn(document)
            score = self._lookup(if_by_issn, issn)
            if score is None:
                score = self._lookup(if_by_eissn, issn)
            document.if_score = score

        # Step 2: de-duplicate by bio_id. For a repeated id the last
        # occurrence is kept, at the position of the first occurrence.
        documents = list({doc.bio_id: doc for doc in documents}.values())

        # Step 3: stable sort by impact factor, highest first.
        sorted_documents = sorted(
            documents,
            key=lambda x: x.if_score if x.if_score is not None else 0.01,
            reverse=True,
        )

        return sorted_documents