File size: 3,474 Bytes
ea1ba01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
from app.db import supabase
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader

def list_all_files(bucket_name, limit_per_page=1000):
    """Return every entry listed at the root of a Supabase storage bucket.

    The bucket is read page by page through
    ``supabase.storage.from_(bucket_name).list("")`` until an empty page
    comes back.

    Args:
        bucket_name: Name of the storage bucket to list.
        limit_per_page: Page size requested on each call.

    Returns:
        A list of the entries exactly as the client returned them. If any
        request raises, the error is printed and whatever was collected up
        to that point is returned, so the result can be partial or empty.
    """
    all_files = []

    while True:
        # Resume right after the entries actually received. Advancing by the
        # requested page size instead would skip entries whenever the backend
        # hands back fewer items than asked for on a non-final page.
        offset = len(all_files)
        try:
            page = supabase.storage.from_(bucket_name).list("", {
                "limit": limit_per_page,
                "offset": offset,
            })
            if not page:
                break
            all_files.extend(page)
        except Exception as e:
            # Best effort: report the failure and return what we have so far.
            print(f"Error fetching files with offset {offset}: {e}")
            break

    return all_files


def get_data(bucket_name="pnp-bot-storage"):
    """Sync the local ``data`` directory with a Supabase storage bucket.

    The ``data`` directory lives next to this module and is created when
    missing. Local files whose names are not in the bucket listing are
    deleted, and listed names that are missing locally are downloaded.
    Files are compared by name only, so a file that already exists locally
    is never downloaded again.

    Every failure is reported with ``print``; a failure while reading the
    local directory or the remote listing aborts the sync, while a failure
    on a single file only skips that file.

    Args:
        bucket_name: Storage bucket to mirror. Defaults to the bucket this
            function previously had hard-coded.

    Returns:
        None.
    """
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(BASE_DIR, 'data')

    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(data_dir, exist_ok=True)
        local_names = {
            entry for entry in os.listdir(data_dir)
            if os.path.isfile(os.path.join(data_dir, entry))
        }
    except Exception as e:
        print(f"Error accessing local files: {e}")
        return

    # NOTE(review): list_all_files in this module currently prints and
    # swallows its own fetch errors, so a failed listing reaches this point
    # as an empty or partial list and the loop below will delete local files
    # accordingly. Confirm that this is acceptable.
    try:
        remote_files_info = list_all_files(bucket_name)
    except Exception as e:
        print(f"Error fetching files from Supabase: {e}")
        return

    remote_names = {info["name"] for info in remote_files_info}

    # Synchronization: delete local files that no longer exist in storage.
    for filename in sorted(local_names - remote_names):
        try:
            os.remove(os.path.join(data_dir, filename))
            print(f"Removed: {filename}")
        except Exception as e:
            print(f"Error removing {filename}: {e}")

    # Fetch whatever the bucket lists that is not present locally yet.
    for filename in sorted(remote_names - local_names):
        file_path = os.path.join(data_dir, filename)
        # Write to a side file first and move it into place afterwards, so
        # a failed write never leaves a truncated file under the real name
        # (which later syncs would treat as already downloaded).
        partial_path = file_path + ".part"
        try:
            res = supabase.storage.from_(bucket_name).download(filename)
            with open(partial_path, "wb") as f:
                f.write(res)
            os.replace(partial_path, file_path)
            print(f"Downloaded: {filename}")
        except Exception as e:
            print(f"Error downloading {filename}: {e}")
            try:
                os.remove(partial_path)
            except OSError:
                # Nothing to clean up (or cleanup impossible); a leftover
                # ".part" file is removed by the next sync's delete pass.
                pass


def load_docs():
    """Load every supported document found in the local ``data`` directory.

    The directory sits next to this module. Files are selected by extension,
    compared case-insensitively: ``.pdf`` goes to ``PyPDFLoader``,
    ``.docx``/``.doc`` to ``Docx2txtLoader`` and ``.txt`` to ``TextLoader``.
    Any other file is ignored. A file that fails to load is reported with
    ``print`` and skipped.

    Returns:
        A list holding the combined results of each loader's ``load()``
        call, processed in file-name order. The list is empty when the
        directory had to be created or could not be listed.
    """
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(BASE_DIR, 'data')

    if not os.path.exists(data_dir):
        print(f"Directory not found: {data_dir}")
        os.makedirs(data_dir)
        print(f"Created directory: {data_dir}")
        return []

    try:
        files = os.listdir(data_dir)
    except PermissionError:
        print(f"Permission denied: {data_dir}")
        return []
    except OSError as e:
        # e.g. the path exists but is not a directory.
        print(f"Error accessing directory {data_dir}: {e}")
        return []

    # (suffixes, label used in error messages, loader factory).
    # Factories are lambdas so a loader is only constructed inside the
    # per-file try block below.
    # NOTE(review): legacy ".doc" files are handed to Docx2txtLoader, as
    # before; confirm that loader can actually read them.
    loader_table = (
        ((".pdf",), "PDF", lambda path: PyPDFLoader(path)),
        ((".docx", ".doc"), "DOCX/DOC", lambda path: Docx2txtLoader(path)),
        (
            (".txt",),
            "TXT",
            lambda path: TextLoader(path, encoding='utf-8', autodetect_encoding=True),
        ),
    )

    documents = []

    # Sorted so the order of the returned documents does not depend on the
    # arbitrary order of os.listdir().
    for file in sorted(files):
        lowered = file.lower()  # match "REPORT.PDF" as well as "report.pdf"
        for suffixes, label, make_loader in loader_table:
            if lowered.endswith(suffixes):
                break
        else:
            continue  # unsupported extension

        file_path = os.path.join(data_dir, file)
        try:
            documents.extend(make_loader(file_path).load())
        except Exception as e:
            print(f"Error loading {label} file {file}: {e}")

    return documents