"""Mirror a Supabase storage bucket into a local folder and load its documents."""

import os

from langchain_community.document_loaders import (
    Docx2txtLoader,
    PyPDFLoader,
    TextLoader,
)

from app.db import supabase

# Bucket mirrored by get_data() when the caller does not name one.
_DEFAULT_BUCKET = "pnp-bot-storage"


def _data_dir():
    """Return the absolute path of the ``data`` folder next to this module."""
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


def list_all_files(bucket_name, limit_per_page=1000, raise_on_error=False):
    """List every entry at the root of a storage bucket, page by page.

    Args:
        bucket_name: Name of the Supabase storage bucket to list.
        limit_per_page: Maximum number of entries requested per call.
        raise_on_error: When False (the default), a failed request is
            printed and whatever was collected so far is returned. When
            True, the error is printed and then re-raised so the caller
            can tell an incomplete listing from a genuinely empty bucket.

    Returns:
        A list with the entries returned by the storage API, in the order
        they were received.

    Raises:
        Exception: Whatever the storage client raised, only when
            ``raise_on_error`` is True.
    """
    all_files = []
    offset = 0
    while True:
        try:
            files = supabase.storage.from_(bucket_name).list(
                "",
                {"limit": limit_per_page, "offset": offset},
            )
            if not files:
                break
            all_files.extend(files)
            # Advance by what was actually received, so nothing is skipped
            # if the server returns fewer entries than requested.
            offset += len(files)
        except Exception as e:
            print(f"Error fetching files with offset {offset}: {e}")
            if raise_on_error:
                raise
            break
    return all_files


def get_data(bucket_name=_DEFAULT_BUCKET):
    """Synchronize the local ``data`` folder with a storage bucket.

    Local files whose names are not in the bucket listing are removed, and
    bucket entries that are missing locally are downloaded. Comparison is
    by file name only. Every failure is printed; nothing is raised for
    filesystem or storage errors.

    Args:
        bucket_name: Bucket to mirror. Defaults to the project bucket.

    Returns:
        None.
    """
    data_dir = _data_dir()

    try:
        os.makedirs(data_dir, exist_ok=True)
        local_files = [
            f
            for f in os.listdir(data_dir)
            if os.path.isfile(os.path.join(data_dir, f))
        ]
    except Exception as e:
        print(f"Error accessing local files: {e}")
        return

    try:
        # An incomplete listing must abort the sync: treating it as the
        # full bucket content would delete local files that still exist
        # remotely.
        remote_files_info = list_all_files(bucket_name, raise_on_error=True)
    except Exception as e:
        print(f"Error fetching files from Supabase: {e}")
        return

    remote_files = {f["name"] for f in remote_files_info}
    local_names = set(local_files)

    # Synchronize: remove local files that no longer exist in storage.
    for filename in sorted(local_names - remote_files):
        try:
            os.remove(os.path.join(data_dir, filename))
            print(f"Removed: {filename}")
        except Exception as e:
            print(f"Error removing {filename}: {e}")

    for filename in sorted(remote_files - local_names):
        try:
            file_path = os.path.join(data_dir, filename)
            res = supabase.storage.from_(bucket_name).download(filename)
            # Write to a temporary name and rename afterwards, so a failed
            # write never leaves a truncated file under the final name
            # (which a later sync would consider already downloaded).
            tmp_path = file_path + ".part"
            with open(tmp_path, "wb") as f:
                f.write(res)
            os.replace(tmp_path, file_path)
            print(f"Downloaded: {filename}")
        except Exception as e:
            print(f"Error downloading {filename}: {e}")


def load_docs():
    """Load every supported document found in the local ``data`` folder.

    Files are picked by extension (case-insensitive): ``.pdf`` uses
    ``PyPDFLoader``, ``.docx``/``.doc`` use ``Docx2txtLoader`` and ``.txt``
    uses ``TextLoader``. Other files are ignored. A file that fails to load
    is reported and skipped.

    Returns:
        A list with everything the loaders returned. Empty when the folder
        did not exist (it is created in that case) or cannot be listed.
    """
    data_dir = _data_dir()

    if not os.path.exists(data_dir):
        print(f"Directory not found: {data_dir}")
        os.makedirs(data_dir)
        print(f"Created directory: {data_dir}")
        return []

    try:
        files = os.listdir(data_dir)
    except PermissionError:
        print(f"Permission denied: {data_dir}")
        return []
    except OSError as e:
        # For example, ``data`` exists but is not a directory.
        print(f"Error accessing {data_dir}: {e}")
        return []

    documents = []
    for file in files:
        file_path = os.path.join(data_dir, file)
        # Match on the lowercased name so "REPORT.PDF" is not skipped.
        lowered = file.lower()

        if lowered.endswith(".pdf"):
            try:
                loader = PyPDFLoader(file_path)
                documents.extend(loader.load())
            except Exception as e:
                print(f"Error loading PDF file {file}: {e}")
        elif lowered.endswith((".docx", ".doc")):
            # NOTE(review): legacy .doc files may not be readable by
            # Docx2txtLoader; a failure here is reported and skipped.
            try:
                loader = Docx2txtLoader(file_path)
                documents.extend(loader.load())
            except Exception as e:
                print(f"Error loading DOCX/DOC file {file}: {e}")
        elif lowered.endswith(".txt"):
            try:
                loader = TextLoader(
                    file_path,
                    encoding="utf-8",
                    autodetect_encoding=True,
                )
                documents.extend(loader.load())
            except Exception as e:
                print(f"Error loading TXT file {file}: {e}")

    return documents