Spaces:
Sleeping
Sleeping
import os | |
from app.db import supabase | |
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader | |
def list_all_files(bucket_name, limit_per_page=1000):
    """Return every entry found at the root of a Supabase storage bucket.

    The bucket is read page by page through
    ``supabase.storage.from_(bucket_name).list`` until an empty page is
    returned.

    Args:
        bucket_name: Name of the storage bucket to list.
        limit_per_page: Maximum number of entries requested per call.

    Returns:
        A list of the entries from all pages fetched, in the order they were
        returned. If a request raises, the error is printed and the entries
        collected before the failure are returned, so the result may be
        incomplete (or empty).
    """
    all_files = []
    offset = 0
    while True:
        try:
            page = supabase.storage.from_(bucket_name).list(
                "",
                {
                    "limit": limit_per_page,
                    "offset": offset,
                },
            )
            if not page:
                break
            all_files.extend(page)
            # Advance by what was actually received rather than by what was
            # requested: if the backend hands back a shorter (but non-empty)
            # page, stepping by ``limit_per_page`` would skip entries.
            offset += len(page)
        except Exception as e:
            # Best effort: report the failure and return what we have so far.
            print(f"Error fetching files with offset {offset}: {e}")
            break
    return all_files
def get_data(bucket_name="pnp-bot-storage"):
    """Mirror a Supabase storage bucket into the local ``data`` directory.

    The ``data`` directory lives next to this module and is created when
    missing. Files are compared by name only:

    * local files whose name is not in the bucket listing are removed;
    * bucket entries whose name is not present locally are downloaded.

    A file that already exists locally is never re-downloaded, even if its
    remote content changed.

    NOTE: ``list_all_files`` prints and swallows request errors and returns
    whatever it collected, so a failed listing can look like a smaller (or
    empty) bucket and cause local files to be removed.

    Args:
        bucket_name: Storage bucket to synchronise from. Defaults to the
            bucket this function originally had hard-coded.

    Returns:
        None. Progress and errors are reported with ``print``; a failure on
        one file does not stop the others from being processed.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(base_dir, 'data')
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(data_dir, exist_ok=True)

    try:
        local_files = {
            name
            for name in os.listdir(data_dir)
            if os.path.isfile(os.path.join(data_dir, name))
        }
    except Exception as e:
        print(f"Error accessing local files: {e}")
        return

    try:
        remote_files_info = list_all_files(bucket_name)
    except Exception as e:
        print(f"Error fetching files from Supabase: {e}")
        return

    remote_files = {entry["name"] for entry in remote_files_info}

    # Synchronisation, step 1: remove local files that are no longer in storage.
    for filename in sorted(local_files - remote_files):
        try:
            os.remove(os.path.join(data_dir, filename))
            print(f"Removed: {filename}")
        except Exception as e:
            print(f"Error removing {filename}: {e}")

    # Synchronisation, step 2: fetch files that exist in storage but not locally.
    for filename in sorted(remote_files - local_files):
        file_path = os.path.join(data_dir, filename)
        # Write to a side file first and move it into place only once it is
        # complete. Writing straight to file_path would leave a truncated file
        # behind on failure, and later runs would treat it as already synced.
        partial_path = file_path + ".part"
        try:
            res = supabase.storage.from_(bucket_name).download(filename)
            with open(partial_path, "wb") as f:
                f.write(res)
            os.replace(partial_path, file_path)
            print(f"Downloaded: {filename}")
        except Exception as e:
            print(f"Error downloading {filename}: {e}")
            # Drop any half-written side file; ignore cleanup failures.
            try:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            except OSError:
                pass
def load_docs():
    """Load every supported document from the local ``data`` directory.

    The ``data`` directory is resolved next to this module. Files are picked
    by extension (matched case-insensitively): ``.pdf`` goes to
    ``PyPDFLoader``, ``.docx``/``.doc`` to ``Docx2txtLoader`` and ``.txt`` to
    ``TextLoader``. Files with any other extension are ignored.

    Returns:
        A list with the documents produced by the loaders, in sorted file
        name order. An empty list is returned when the directory had to be
        created or cannot be listed due to a permission error. A file that
        fails to load is reported with ``print`` and skipped.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(base_dir, 'data')

    if not os.path.exists(data_dir):
        print(f"Directory not found: {data_dir}")
        os.makedirs(data_dir)
        print(f"Created directory: {data_dir}")
        return []

    try:
        # Sorted so the order of the returned documents is deterministic;
        # os.listdir() makes no ordering guarantee.
        files = sorted(os.listdir(data_dir))
    except PermissionError:
        print(f"Permission denied: {data_dir}")
        return []

    # (extensions, label used in the error message, loader factory).
    # Factories are lambdas so a loader class is only looked up when a
    # matching file is actually found.
    # NOTE(review): legacy .doc files are routed to Docx2txtLoader exactly as
    # before; confirm that loader can really read them.
    loaders = (
        ((".pdf",), "PDF", lambda path: PyPDFLoader(path)),
        ((".docx", ".doc"), "DOCX/DOC", lambda path: Docx2txtLoader(path)),
        (
            (".txt",),
            "TXT",
            lambda path: TextLoader(path, encoding='utf-8', autodetect_encoding=True),
        ),
    )

    documents = []
    for file in files:
        file_path = os.path.join(data_dir, file)
        # Compare in lower case so "REPORT.PDF" or "notes.Txt" are not skipped.
        lowered = file.lower()
        for extensions, label, make_loader in loaders:
            if not lowered.endswith(extensions):
                continue
            try:
                documents.extend(make_loader(file_path).load())
            except Exception as e:
                print(f"Error loading {label} file {file}: {e}")
            break
    return documents