pnp-chatbot-v1 / app /data_loader.py
FauziIsyrinApridal
Initial commit without binary files
ea1ba01
import os
from app.db import supabase
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
def list_all_files(bucket_name, limit_per_page=1000):
    """Return every entry listed at the root of a Supabase storage bucket.

    The listing is fetched page by page with ``limit``/``offset`` options
    until the storage client returns an empty page.

    Args:
        bucket_name: Name of the storage bucket to list.
        limit_per_page: Maximum number of entries requested per call.

    Returns:
        A list containing the entries of all pages, in the order received.

    Raises:
        Exception: Any error raised while fetching a page is logged and then
            re-raised. Returning the pages collected so far would make a
            partial (or empty) listing look like the complete bucket
            contents, and callers that mirror the bucket locally would
            delete files that still exist remotely.
    """
    all_files = []
    offset = 0
    while True:
        try:
            files = supabase.storage.from_(bucket_name).list("", {
                "limit": limit_per_page,
                "offset": offset,
            })
        except Exception as e:
            print(f"Error fetching files with offset {offset}: {e}")
            raise
        if not files:
            break
        all_files.extend(files)
        # Advance by what was actually received rather than by the requested
        # page size, so no entries are skipped if a page comes back shorter
        # than requested.
        offset += len(files)
    return all_files
def get_data(bucket_name="pnp-bot-storage"):
    """Mirror a Supabase storage bucket into the local ``data`` directory.

    The ``data`` directory lives next to this module and is created when
    missing. Files that exist locally but are not listed in the bucket are
    removed; files listed in the bucket but missing locally are downloaded.
    Files are compared by name only, so a remote file whose content changed
    under the same name is not re-downloaded.

    Args:
        bucket_name: Storage bucket to mirror. Defaults to the bucket this
            module has always used.

    Returns:
        None. Progress and errors are reported with ``print``. If the local
        directory cannot be prepared or read, or the remote listing raises,
        the function returns early without removing or downloading anything.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(base_dir, 'data')

    try:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(data_dir, exist_ok=True)
        local_files = {
            f for f in os.listdir(data_dir)
            if os.path.isfile(os.path.join(data_dir, f))
        }
    except Exception as e:
        print(f"Error accessing local files: {e}")
        return

    try:
        remote_files_info = list_all_files(bucket_name)
    except Exception as e:
        print(f"Error fetching files from Supabase: {e}")
        return

    remote_files = {f["name"] for f in remote_files_info}

    # Synchronization: delete local files that are not in storage.
    for filename in local_files - remote_files:
        try:
            os.remove(os.path.join(data_dir, filename))
            print(f"Removed: {filename}")
        except Exception as e:
            print(f"Error removing {filename}: {e}")

    # Download files that are in storage but missing locally.
    for filename in remote_files - local_files:
        file_path = os.path.join(data_dir, filename)
        # Write to a temporary name first and rename on success. A download
        # interrupted mid-write then never leaves a truncated file under the
        # final name, which later syncs would treat as already present.
        tmp_path = file_path + ".part"
        try:
            res = supabase.storage.from_(bucket_name).download(filename)
            with open(tmp_path, "wb") as f:
                f.write(res)
            os.replace(tmp_path, file_path)
            print(f"Downloaded: {filename}")
        except Exception as e:
            print(f"Error downloading {filename}: {e}")
            try:
                os.remove(tmp_path)
            except OSError:
                # Nothing to clean up: the temporary file was never created
                # or is already gone.
                pass
def load_docs():
    """Load every supported document found in the local ``data`` directory.

    The ``data`` directory lives next to this module. Files are selected by
    extension, matched case-insensitively so that names such as
    ``REPORT.PDF`` are not silently skipped:

    * ``.pdf`` is loaded with ``PyPDFLoader``
    * ``.docx`` / ``.doc`` are loaded with ``Docx2txtLoader``
    * ``.txt`` is loaded with ``TextLoader`` (UTF-8, with encoding
      autodetection enabled)

    Entries with any other extension are ignored. A file that fails to load
    is reported with ``print`` and skipped; it does not abort the run.

    Returns:
        A list with the documents produced by the loaders, processed in
        sorted filename order. An empty list is returned when the directory
        did not exist (it is created in that case), when it cannot be listed
        because of a permission error, or when it holds no loadable files.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(base_dir, 'data')

    if not os.path.exists(data_dir):
        print(f"Directory not found: {data_dir}")
        os.makedirs(data_dir)
        print(f"Created directory: {data_dir}")
        return []

    try:
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # order of the returned documents stable between runs.
        files = sorted(os.listdir(data_dir))
    except PermissionError:
        print(f"Permission denied: {data_dir}")
        return []

    documents = []
    for file in files:
        file_path = os.path.join(data_dir, file)
        lowered = file.lower()

        if lowered.endswith(".pdf"):
            try:
                documents.extend(PyPDFLoader(file_path).load())
            except Exception as e:
                print(f"Error loading PDF file {file}: {e}")
        elif lowered.endswith((".docx", ".doc")):
            # NOTE(review): legacy binary .doc files are routed to the same
            # loader as .docx; confirm that it can actually parse them.
            try:
                documents.extend(Docx2txtLoader(file_path).load())
            except Exception as e:
                print(f"Error loading DOCX/DOC file {file}: {e}")
        elif lowered.endswith(".txt"):
            try:
                loader = TextLoader(
                    file_path, encoding='utf-8', autodetect_encoding=True
                )
                documents.extend(loader.load())
            except Exception as e:
                print(f"Error loading TXT file {file}: {e}")

    return documents