Spaces:

jackkuo
/

streamlit-macp-agents

Sleeping

App Files Files Community

streamlit-macp-agents / python-services /Retrieve /service /rerank.py

jackkuo

add QA

79899c0 12 days ago

raw

history blame contribute delete

2.2 kB

	from typing import List
	from typing import Optional

	import pandas as pd

	from bio_requests.rag_request import RagRequest
	from dto.bio_document import BaseBioDocument
	from utils.bio_logger import bio_logger as logger

	# Load the journal table once, at import time. The path is relative, so it
	# is resolved against the current working directory of the process.
	df = pd.read_excel("config/2023JCR（完整）.xlsx")

	# Keep only the three columns used for impact-factor lookup: 'ISSN',
	# 'EISSN' and '5年IF'. The explicit .copy() makes the subset an independent
	# frame, so the column assignment below cannot raise SettingWithCopyWarning
	# on pandas versions without copy-on-write.
	df = df[["ISSN", "5年IF", "EISSN"]].copy()

	# Convert '5年IF' to float; values that cannot be parsed become NaN and are
	# then replaced with 0.01.
	df["5年IF"] = pd.to_numeric(df["5年IF"], errors="coerce").fillna(0.01)


	class RerankService:
	    """Re-rank documents by the 5-year impact factor ('5年IF') of their journal.

	    The impact factor is looked up in the module-level ``df`` table, first by
	    the 'ISSN' column and then by the 'EISSN' column.
	    """

	    # Score used for ordering when a document has no known impact factor.
	    _FALLBACK_IF_SCORE = 0.01

	    def __init__(self):
	        # Journal table loaded at module import time.
	        self.df = df

	        # Build the lookup dictionaries once so that each document costs two
	        # dictionary probes instead of two full scans of the table. They are
	        # snapshots of ``self.df`` taken at construction time.
	        self._if_by_issn = self._build_lookup("ISSN")
	        self._if_by_eissn = self._build_lookup("EISSN")

	    def _build_lookup(self, key_column: str) -> dict:
	        """Map every non-null value of ``key_column`` to its row's '5年IF'."""
	        lookup = {}
	        for key, score in zip(self.df[key_column], self.df["5年IF"]):
	            if pd.notna(key):
	                # setdefault keeps the first row for a repeated key, which is
	                # the row a boolean-mask lookup followed by ``[0]`` returns.
	                lookup.setdefault(key, float(score))
	        return lookup

	    def _lookup_if_score(self, document: BaseBioDocument) -> Optional[float]:
	        """Return the 5-year IF for ``document``'s journal, or None if unknown."""
	        try:
	            issn = document.journal["issn"]
	        except (KeyError, TypeError):
	            # The journal has no "issn" entry or is not subscriptable (for
	            # example None): treat the document as unscored, not as an error.
	            return None
	        if issn is None:
	            return None

	        score = self._if_by_issn.get(issn)
	        if score is None:
	            # Not found in 'ISSN'; fall back to the 'EISSN' column.
	            score = self._if_by_eissn.get(issn)
	        return score

	    async def rerank(
	        self,
	        rag_request: RagRequest,
	        documents: Optional[List[BaseBioDocument]] = None,
	    ) -> List[BaseBioDocument]:
	        """De-duplicate ``documents`` and sort them by 5-year IF, highest first.

	        Args:
	            rag_request: Request whose ``data_source`` decides whether to
	                re-rank; re-ranking only runs when it contains "pubmed".
	            documents: Documents to re-rank. Each one gets its ``if_score``
	                attribute set (None when no impact factor is found).
	                Omitting the argument is the same as passing an empty list.

	        Returns:
	            ``documents`` unchanged when ``data_source`` is empty or does not
	            contain "pubmed". Otherwise a new list with one document per
	            ``bio_id``, ordered by ``if_score`` descending; documents without
	            a score are ordered as if their score were 0.01.
	        """
	        # A fresh list per call: a shared mutable default would be handed back
	        # to the caller on the skip path and could then be mutated.
	        if documents is None:
	            documents = []

	        if not rag_request.data_source or "pubmed" not in rag_request.data_source:
	            logger.info("RerankService: data_source is not pubmed, skip rerank")
	            return documents
	        logger.info("RerankService: start rerank")

	        # Step 1: attach the 5-year IF of each document's journal.
	        for document in documents:
	            document.if_score = self._lookup_if_score(document)

	        # Step 2: keep one document per bio_id. A later duplicate replaces the
	        # earlier one but takes the position of the first occurrence.
	        unique_documents = list({doc.bio_id: doc for doc in documents}.values())

	        # Step 3: sort by 5-year IF in descending order (stable for ties).
	        fallback = self._FALLBACK_IF_SCORE
	        return sorted(
	            unique_documents,
	            key=lambda doc: doc.if_score if doc.if_score is not None else fallback,
	            reverse=True,
	        )