Spaces:
Sleeping
Sleeping
File size: 3,474 Bytes
ea1ba01 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import os
from app.db import supabase
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
def list_all_files(bucket_name, limit_per_page=1000):
    """List every entry at the root of a Supabase storage bucket.

    Repeatedly calls ``supabase.storage.from_(bucket_name).list("", ...)``
    with an increasing offset until an empty page is returned.

    Args:
        bucket_name: Name of the storage bucket to list.
        limit_per_page: Page size requested on each call.

    Returns:
        A list with the entries of every page, in the order received. If a
        request fails, the error is printed and whatever was collected so far
        is returned (possibly an empty list); no exception is raised, so a
        caller cannot distinguish a failed listing from a short or empty one.
    """
    all_files = []
    offset = 0
    while True:
        try:
            page = supabase.storage.from_(bucket_name).list("", {
                "limit": limit_per_page,
                "offset": offset,
            })
            if not page:
                break
            all_files.extend(page)
            # Advance by the number of entries actually received rather than
            # by the requested page size: if the backend returns a shorter
            # (but non-empty) page, stepping by `limit_per_page` would skip
            # the entries in between.
            offset += len(page)
        except Exception as e:
            # Best-effort: report the failure and return the partial result.
            print(f"Error fetching files with offset {offset}: {e}")
            break
    return all_files
def _save_atomically(dest_path, payload):
    """Write ``payload`` (bytes) to ``dest_path`` via a temp file and rename."""
    tmp_path = dest_path + ".part"
    try:
        with open(tmp_path, "wb") as fh:
            fh.write(payload)
        # os.replace swaps the finished file into place in one step, so a
        # failed write never leaves a truncated file under the final name.
        os.replace(tmp_path, dest_path)
    finally:
        # Only still present if the write or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def get_data(bucket_name="pnp-bot-storage"):
    """Mirror a Supabase storage bucket into the local ``data`` directory.

    The ``data`` directory lives next to this module and is created when
    missing. Local files whose names are not in the remote listing are
    removed, and remote names that are missing locally are downloaded.
    Only names are compared: a remote file whose content changed but whose
    name did not is not downloaded again.

    Args:
        bucket_name: Storage bucket to mirror. Defaults to the bucket the
            function was previously hard-wired to.

    Returns:
        None. Progress and errors are reported with ``print``; a failure on
        one file does not stop the remaining files from being processed.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(base_dir, 'data')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(data_dir, exist_ok=True)

    try:
        local_files = {
            name for name in os.listdir(data_dir)
            if os.path.isfile(os.path.join(data_dir, name))
        }
    except Exception as e:
        print(f"Error accessing local files: {e}")
        return

    try:
        remote_files_info = list_all_files(bucket_name)
    except Exception as e:
        print(f"Error fetching files from Supabase: {e}")
        return

    # NOTE(review): list_all_files prints and returns a partial (possibly
    # empty) list when a request fails, so a failed listing looks like a
    # smaller bucket here and local files may be removed - confirm whether
    # that is acceptable.
    remote_files = set()
    for entry in remote_files_info:
        try:
            name = entry["name"]
        except (KeyError, TypeError):
            # Skip malformed entries instead of aborting the whole sync.
            continue
        if name:
            remote_files.add(name)

    # Synchronisation: remove local files that are no longer in storage.
    # Sorting only makes the processing (and log) order deterministic.
    for filename in sorted(local_files - remote_files):
        try:
            os.remove(os.path.join(data_dir, filename))
            print(f"Removed: {filename}")
        except Exception as e:
            print(f"Error removing {filename}: {e}")

    # Download files that exist in storage but not locally.
    for filename in sorted(remote_files - local_files):
        try:
            payload = supabase.storage.from_(bucket_name).download(filename)
            _save_atomically(os.path.join(data_dir, filename), payload)
            print(f"Downloaded: {filename}")
        except Exception as e:
            print(f"Error downloading {filename}: {e}")
def load_docs():
    """Load every supported document found in the local ``data`` directory.

    The ``data`` directory lives next to this module. Files are dispatched by
    extension, matched case-insensitively:

    * ``.pdf`` -> ``PyPDFLoader``
    * ``.docx`` / ``.doc`` -> ``Docx2txtLoader``
    * ``.txt`` -> ``TextLoader`` (UTF-8 with encoding auto-detection enabled)

    Files with any other extension are ignored. A file that fails to load is
    reported with ``print`` and skipped; the remaining files are still
    processed.

    Returns:
        A list with the items returned by each loader's ``load()``, in sorted
        file-name order. An empty list is returned when the directory had to
        be created or cannot be listed because of a permission error.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(base_dir, 'data')

    if not os.path.exists(data_dir):
        print(f"Directory not found: {data_dir}")
        os.makedirs(data_dir, exist_ok=True)
        print(f"Created directory: {data_dir}")
        return []

    try:
        files = os.listdir(data_dir)
    except PermissionError:
        print(f"Permission denied: {data_dir}")
        return []

    documents = []
    # os.listdir order is arbitrary; sort so the result order is stable.
    for file in sorted(files):
        # Compare against the lower-cased name so "REPORT.PDF" or "Notes.TXT"
        # are picked up as well.
        lowered = file.lower()
        if lowered.endswith(".pdf"):
            label = "PDF"
        elif lowered.endswith((".docx", ".doc")):
            label = "DOCX/DOC"
        elif lowered.endswith(".txt"):
            label = "TXT"
        else:
            continue

        file_path = os.path.join(data_dir, file)
        try:
            if label == "PDF":
                loader = PyPDFLoader(file_path)
            elif label == "DOCX/DOC":
                loader = Docx2txtLoader(file_path)
            else:
                loader = TextLoader(file_path, encoding='utf-8', autodetect_encoding=True)
            documents.extend(loader.load())
        except Exception as e:
            # Best-effort: one unreadable file must not abort the whole load.
            print(f"Error loading {label} file {file}: {e}")
    return documents