import os
import time
import shutil
import pandas as pd
import traceback
import sys
from pathlib import Path

from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
from fastapi.responses import FileResponse

from custom_auth import get_current_user_from_token
from services.sentence_transformer_service import (
    SentenceTransformerService,
    sentence_transformer_service,
)

# Add the path to import modules from meisai-check-ai
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "meisai-check-ai"))

from mapping_lib.standard_subject_data_mapper import StandardSubjectDataMapper
from mapping_lib.subject_similarity_mapper import SubjectSimilarityMapper
from mapping_lib.sub_subject_similarity_mapper import SubSubjectSimilarityMapper
from mapping_lib.name_similarity_mapper import NameSimilarityMapper
from mapping_lib.sub_subject_and_name_data_mapper import SubSubjectAndNameDataMapper
from mapping_lib.abstract_similarity_mapper import AbstractSimilarityMapper
from mapping_lib.name_and_abstract_mapper import NameAndAbstractDataMapper
from mapping_lib.unit_similarity_mapper import UnitSimilarityMapper
from mapping_lib.standard_name_mapper import StandardNameMapper

from config import UPLOAD_DIR, OUTPUT_DIR
from models import (
    EmbeddingRequest,
    PredictRawRequest,
    PredictRawResponse,
    PredictResult,
)

router = APIRouter()


@router.post("/predict")
async def predict(
    current_user=Depends(get_current_user_from_token),
    file: UploadFile = File(...),
    sentence_service: SentenceTransformerService = Depends(
        lambda: sentence_transformer_service
    ),
):
    """
    Process an input CSV file and return standardized names (requires authentication)
    """
    if not file.filename.endswith(".csv"):
        raise HTTPException(status_code=400, detail="Only CSV files are supported")

    # Save the uploaded file under a per-user, timestamped name
    timestamp = int(time.time())
    input_file_path = os.path.join(
        UPLOAD_DIR, f"input_{timestamp}_{current_user.username}.csv"
    )
    output_file_path = os.path.join(
        OUTPUT_DIR, f"output_{timestamp}_{current_user.username}.csv"
    )
    try:
        with open(input_file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
    finally:
        file.file.close()

    try:
        # Load input data
        start_time = time.time()
        df_input_data = pd.read_csv(input_file_path)

        # Ensure basic columns exist with default values
        basic_columns = {
            "シート名": "",
            "行": "",
            "科目": "",
            "中科目": "",
            "分類": "",
            "名称": "",
            "単位": "",
            "摘要": "",
            "備考": "",
        }
        for col, default_value in basic_columns.items():
            if col not in df_input_data.columns:
                df_input_data[col] = default_value

        # Process data using the new mapping system, similar to predict.py
        try:
            # Subject mapping
            if sentence_service.df_subject_map_data is not None:
                subject_similarity_mapper = SubjectSimilarityMapper(
                    cached_embedding_helper=sentence_service.subject_cached_embedding_helper,
                    df_map_data=sentence_service.df_subject_map_data,
                )
                list_input_subject = df_input_data["科目"].unique()
                df_subject_data = pd.DataFrame({"科目": list_input_subject})
                subject_similarity_mapper.predict_input(df_input_data=df_subject_data)
                output_subject_map = dict(
                    zip(df_subject_data["科目"], df_subject_data["出力_科目"])
                )
                df_input_data["標準科目"] = df_input_data["科目"].map(
                    output_subject_map
                )
                df_input_data["出力_科目"] = df_input_data["科目"].map(
                    output_subject_map
                )
        except Exception as e:
            print(f"Error processing SubjectSimilarityMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))
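        # Illustrative sketch of the step above, with hypothetical values (the
        # real pairs come from df_subject_map_data): if the 科目 column contains
        # ["土工事", "土工事", "コンクリート工事"], only the two unique values are
        # embedded and predicted, 出力_科目 might come back as
        # ["土工", "コンクリート"], and output_subject_map becomes
        # {"土工事": "土工", "コンクリート工事": "コンクリート"}, which .map()
        # then applies to every row. Deduplicating first keeps the number of
        # embedding lookups small when the same 科目 repeats across many rows.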
        try:
            # Standard subject mapping
            if sentence_service.df_standard_subject_map_data is not None:
                standard_subject_data_mapper = StandardSubjectDataMapper(
                    df_map_data=sentence_service.df_standard_subject_map_data
                )
                df_output_data = standard_subject_data_mapper.map_data(
                    df_input_data=df_input_data,
                    input_key_columns=["出力_科目"],
                    in_place=True,
                )
            else:
                df_output_data = df_input_data.copy()
        except Exception as e:
            print(f"Error processing StandardSubjectDataMapper: {e}")
            # Continue with the original data if standard subject mapping fails
            df_output_data = df_input_data.copy()

        try:
            # Sub subject mapping
            if sentence_service.df_sub_subject_map_data is not None:
                sub_subject_similarity_mapper = SubSubjectSimilarityMapper(
                    cached_embedding_helper=sentence_service.sub_subject_cached_embedding_helper,
                    df_map_data=sentence_service.df_sub_subject_map_data,
                )
                sub_subject_similarity_mapper.predict_input(
                    df_input_data=df_output_data
                )
                df_output_data = df_output_data.fillna("")
        except Exception as e:
            print(f"Error processing SubSubjectSimilarityMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))

        try:
            # Name mapping
            if sentence_service.df_name_map_data is not None:
                name_sentence_mapper = NameSimilarityMapper(
                    cached_embedding_helper=sentence_service.name_cached_embedding_helper,
                    df_map_data=sentence_service.df_name_map_data,
                )
                name_sentence_mapper.predict_input(df_input_data=df_output_data)
        except Exception as e:
            print(f"Error processing NameSimilarityMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))

        try:
            # Sub subject and name mapping
            if sentence_service.df_sub_subject_and_name_map_data is not None:
                sub_subject_and_name_mapper = SubSubjectAndNameDataMapper(
                    df_map_data=sentence_service.df_sub_subject_and_name_map_data
                )
                sub_subject_and_name_mapper.map_data(df_input_data=df_output_data)
        except Exception as e:
            print(f"Error processing SubSubjectAndNameDataMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))

        try:
            # Abstract mapping
            if sentence_service.df_abstract_map_data is not None:
                # Ensure required columns exist before AbstractSimilarityMapper
                required_columns_for_abstract = {
                    "標準科目": "",
                    "摘要グループ": "",
                    "確定": "未確定",
                    "摘要": "",
                    "備考": "",
                }
                # Add missing columns with appropriate defaults
                for col, default_val in required_columns_for_abstract.items():
                    if col not in df_output_data.columns:
                        df_output_data[col] = default_val
                        print(
                            f"DEBUG: Added missing column '{col}' with default value '{default_val}'"
                        )

                # Ensure data types are correct (convert to string to avoid type issues)
                for col in ["標準科目", "摘要グループ", "確定", "摘要", "備考"]:
                    if col in df_output_data.columns:
                        df_output_data[col] = (
                            df_output_data[col].astype(str).fillna("")
                        )

                abstract_similarity_mapper = AbstractSimilarityMapper(
                    cached_embedding_helper=sentence_service.abstract_cached_embedding_helper,
                    df_map_data=sentence_service.df_abstract_map_data,
                )
                abstract_similarity_mapper.predict_input(df_input_data=df_output_data)
                print("DEBUG: AbstractSimilarityMapper completed successfully")
        except Exception as e:
            print(f"Error processing AbstractSimilarityMapper: {e}")
            print("DEBUG: Full error traceback:")
            traceback.print_exc()
            # Don't raise the exception, continue processing
            print("DEBUG: Continuing without AbstractSimilarityMapper...")

        try:
            # Name and abstract mapping
            if sentence_service.df_name_and_subject_map_data is not None:
                name_and_abstract_mapper = NameAndAbstractDataMapper(
                    df_map_data=sentence_service.df_name_and_subject_map_data
                )
                df_output_data = name_and_abstract_mapper.map_data(df_output_data)
        except Exception as e:
            print(f"Error processing NameAndAbstractDataMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))
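        # Note on error handling, as implemented above: AbstractSimilarityMapper
        # failures are logged with a full traceback and then swallowed, and a
        # StandardSubjectDataMapper failure falls back to the unmapped input,
        # so neither aborts the request; every other mapper in this pipeline
        # fails hard with an HTTP 500.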
        try:
            # Unit mapping
            if sentence_service.df_unit_map_data is not None:
                unit_mapper = UnitSimilarityMapper(
                    cached_embedding_helper=sentence_service.unit_cached_embedding_helper,
                    df_map_data=sentence_service.df_unit_map_data,
                )
                unit_mapper.predict_input(df_input_data=df_output_data)
        except Exception as e:
            print(f"Error processing UnitSimilarityMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))

        try:
            # Standard name mapping
            if sentence_service.df_standard_name_map_data is not None:
                standard_name_mapper = StandardNameMapper(
                    df_map_data=sentence_service.df_standard_name_map_data
                )
                df_output_data = standard_name_mapper.map_data(df_output_data)
        except Exception as e:
            print(f"Error processing StandardNameMapper: {e}")
            raise HTTPException(status_code=500, detail=str(e))

        # Create output columns and ensure they have proper values.
        # Add an ID column if it does not exist
        if "ID" not in df_output_data.columns:
            df_output_data.reset_index(drop=False, inplace=True)
            df_output_data.rename(columns={"index": "ID"}, inplace=True)
            df_output_data["ID"] = df_output_data["ID"] + 1  # Start from 1

        # Ensure required columns exist with default values
        required_columns = {
            "シート名": "",
            "行": "",
            "科目": "",
            "中科目": "",
            "分類": "",
            "名称": "",
            "単位": "",
            "摘要": "",
            "備考": "",
            "出力_科目": "",
            "出力_中科目": "",
            "出力_項目名": "",
            "出力_標準単位": "",
            "出力_集計用単位": "",
            "出力_確率度": 0.0,
        }
        for col, default_value in required_columns.items():
            if col not in df_output_data.columns:
                df_output_data[col] = default_value

        # Map output columns to match the Excel structure.
        # 出力_中科目: use the standard sub-subject from the sub-subject mapper
        if "出力_基準中科目" in df_output_data.columns:
            df_output_data["出力_中科目"] = df_output_data["出力_基準中科目"]
        elif "標準中科目" in df_output_data.columns:
            df_output_data["出力_中科目"] = df_output_data["標準中科目"]

        # 出力_項目名: use the final item name from the name-and-abstract mapper
        if (
            "出力_項目名" in df_output_data.columns
            and not df_output_data["出力_項目名"].isna().all()
        ):
            # Keep the existing 出力_項目名 if it exists and has values
            pass
        elif "出力_標準名称" in df_output_data.columns:
            df_output_data["出力_項目名"] = df_output_data["出力_標準名称"]
        elif "出力_基準名称" in df_output_data.columns:
            df_output_data["出力_項目名"] = df_output_data["出力_基準名称"]

        # 出力_標準単位 and 出力_集計用単位 are taken as-is from the unit mapper

        # 出力_確率度: use the name similarity as the main probability,
        # falling back to the other similarity scores in order
        if "出力_名称類似度" in df_output_data.columns:
            df_output_data["出力_確率度"] = df_output_data["出力_名称類似度"]
        elif "出力_中科目類似度" in df_output_data.columns:
            df_output_data["出力_確率度"] = df_output_data["出力_中科目類似度"]
        elif "出力_摘要類似度" in df_output_data.columns:
            df_output_data["出力_確率度"] = df_output_data["出力_摘要類似度"]
        elif "出力_単位類似度" in df_output_data.columns:
            df_output_data["出力_確率度"] = df_output_data["出力_単位類似度"]
        else:
            df_output_data["出力_確率度"] = 0.0

        # Fill NaN values so all output columns have proper values
        df_output_data = df_output_data.fillna("")

        # Debug: print available columns to see what we have
        print(f"Available columns after processing: {list(df_output_data.columns)}")
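        # The fallbacks below rely on pandas' DataFrame.get, which behaves like
        # dict.get over columns: it returns the column if present and otherwise
        # the supplied default (here the scalar "", which broadcasts across all
        # rows on assignment).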
df_output_data["出力_単位"].eq("").all() ): df_output_data["出力_単位"] = df_output_data.get("単位", "") if "出力_確率度" not in df_output_data.columns: df_output_data["出力_確率度"] = 0 # Default confidence score # Define output columns in exact order as shown in Excel output_columns = [ "ID", "シート名", "行", "科目", "中科目", "分類", "名称", "単位", "摘要", "備考", "出力_科目", "出力_中科目", "出力_項目名", "出力_確率度", "出力_標準単位", "出力_集計用単位", ] # Save with utf_8_sig encoding for Japanese Excel compatibility df_output_data[output_columns].to_csv( output_file_path, index=False, encoding="utf_8_sig" ) # Save all caches sentence_service.save_all_caches() end_time = time.time() execution_time = end_time - start_time print(f"Execution time: {execution_time} seconds") return FileResponse( path=output_file_path, filename=f"output_{Path(file.filename).stem}.csv", media_type="text/csv", headers={ "Content-Disposition": f'attachment; filename="output_{Path(file.filename).stem}.csv"', "Content-Type": "application/x-www-form-urlencoded", }, ) except Exception as e: print(f"Error processing file: {e}") raise HTTPException(status_code=500, detail=str(e)) @router.post("/embeddings") async def create_embeddings( request: EmbeddingRequest, current_user=Depends(get_current_user_from_token), sentence_service: SentenceTransformerService = Depends( lambda: sentence_transformer_service ), ): """ Create embeddings for a list of input sentences (requires authentication) """ try: start_time = time.time() embeddings = sentence_service.sentenceTransformerHelper.create_embeddings( request.sentences ) end_time = time.time() execution_time = end_time - start_time print(f"Execution time: {execution_time} seconds") # Convert numpy array to list for JSON serialization embeddings_list = embeddings.tolist() return {"embeddings": embeddings_list} except Exception as e: print(f"Error creating embeddings: {e}") raise HTTPException(status_code=500, detail=str(e)) @router.post("/predict-raw", response_model=PredictRawResponse) async def predict_raw( request: PredictRawRequest, current_user=Depends(get_current_user_from_token), sentence_service: SentenceTransformerService = Depends( lambda: sentence_transformer_service ), ): """ Process raw input records and return standardized names (requires authentication) """ try: # Convert input records to DataFrame records_dict = { "科目": [], "中科目": [], "分類": [], "名称": [], "単位": [], "摘要": [], "備考": [], "シート名": [], # Required by BaseNameData but not used "行": [], # Required by BaseNameData but not used } for record in request.records: records_dict["科目"].append(record.subject) records_dict["中科目"].append(record.sub_subject) records_dict["分類"].append(record.name_category) records_dict["名称"].append(record.name) records_dict["単位"].append("") # Default empty records_dict["摘要"].append(record.abstract or "") records_dict["備考"].append(record.memo or "") records_dict["シート名"].append("") # Placeholder records_dict["行"].append("") # Placeholder df_input_data = pd.DataFrame(records_dict) # Process data similar to the main predict function try: # Subject mapping if sentence_service.df_subject_map_data is not None: subject_similarity_mapper = SubjectSimilarityMapper( cached_embedding_helper=sentence_service.subject_cached_embedding_helper, df_map_data=sentence_service.df_subject_map_data, ) list_input_subject = df_input_data["科目"].unique() df_subject_data = pd.DataFrame({"科目": list_input_subject}) subject_similarity_mapper.predict_input(df_input_data=df_subject_data) output_subject_map = dict( zip(df_subject_data["科目"], df_subject_data["出力_科目"]) ) 
df_input_data["標準科目"] = df_input_data["科目"].map( output_subject_map ) df_input_data["出力_科目"] = df_input_data["科目"].map( output_subject_map ) else: df_input_data["標準科目"] = df_input_data["科目"] df_input_data["出力_科目"] = df_input_data["科目"] except Exception as e: print(f"Error processing SubjectSimilarityMapper: {e}") raise HTTPException(status_code=500, detail=str(e)) try: # Name mapping (simplified for raw predict) if sentence_service.df_name_map_data is not None: name_sentence_mapper = NameSimilarityMapper( cached_embedding_helper=sentence_service.name_cached_embedding_helper, df_map_data=sentence_service.df_name_map_data, ) name_sentence_mapper.predict_input(df_input_data=df_input_data) except Exception as e: print(f"Error processing NameSimilarityMapper: {e}") raise HTTPException(status_code=500, detail=str(e)) try: # Unit mapping if sentence_service.df_unit_map_data is not None: unit_mapper = UnitSimilarityMapper( cached_embedding_helper=sentence_service.unit_cached_embedding_helper, df_map_data=sentence_service.df_unit_map_data, ) unit_mapper.predict_input(df_input_data=df_input_data) except Exception as e: print(f"Error processing UnitSimilarityMapper: {e}") raise HTTPException(status_code=500, detail=str(e)) # Ensure required columns exist for col in [ "確定", "出力_標準名称", "出力_名称類似度", "出力_標準単位", "出力_単位類似度", ]: if col not in df_input_data.columns: if col in ["出力_名称類似度", "出力_単位類似度"]: df_input_data[col] = 0.0 else: df_input_data[col] = "" # Convert results to response format results = [] for _, row in df_input_data.iterrows(): result = PredictResult( subject=row["科目"], sub_subject=row["中科目"], name_category=row["分類"], name=row["名称"], abstract=row["摘要"], memo=row["備考"], confirmed=row.get("確定", ""), standard_subject=row.get("出力_科目", row["科目"]), standard_name=row.get("出力_標準名称", ""), similarity_score=float(row.get("出力_名称類似度", 0.0)), ) results.append(result) # Save all caches sentence_service.save_all_caches() return PredictRawResponse(results=results) except Exception as e: print(f"Error processing records: {e}") raise HTTPException(status_code=500, detail=str(e))