from typing import List from bio_requests.rag_request import RagRequest from dto.bio_document import BaseBioDocument from utils.bio_logger import bio_logger as logger import pandas as pd # Load the Excel file df = pd.read_excel("config/2023JCR(完整).xlsx") # Select only the 'ISSN' and '5年IF' columns df = df[["ISSN", "5年IF", "EISSN"]] # Convert '5年IF' to float, setting invalid values to 0.01 df["5年IF"] = pd.to_numeric(df["5年IF"], errors="coerce").fillna(0.01) class RerankService: def __init__(self): # Select only the 'ISSN' and '5年IF' columns self.df = df async def rerank( self, rag_request: RagRequest, documents: List[BaseBioDocument] = [] ) -> List[BaseBioDocument]: if not rag_request.data_source or "pubmed" not in rag_request.data_source: logger.info("RerankService: data_source is not pubmed, skip rerank") return documents logger.info("RerankService: start rerank") # Now sorted_documents contains the documents sorted by "5-year IF" from high to low # Step 1: Extract ISSN and query the DataFrame for "5-year IF" for document in documents: issn = document.journal["issn"] # Check if ISSN exists in the 'ISSN' column if_5_year = self.df.loc[self.df["ISSN"] == issn, "5年IF"].values if if_5_year.size > 0: document.if_score = if_5_year[0] else: # If not found in 'ISSN', check the 'EISSN' column if_5_year = self.df.loc[self.df["EISSN"] == issn, "5年IF"].values if if_5_year.size > 0: document.if_score = if_5_year[0] else: document.if_score = None # Step 2: De-duplicate the ID of each document in the documents list documents = list({doc.bio_id: doc for doc in documents}.values()) # Step 3: Sort documents by "5-year IF" in descending order sorted_documents = sorted( documents, key=lambda x: x.if_score if x.if_score is not None else 0.01, reverse=True, ) return sorted_documents