SysModeler mdShakinRobofication committed on
Commit
922d194
·
verified ·
1 Parent(s): fc3a249

Upload vectorDB_upload_script.py (#2)

Browse files

- Upload vectorDB_upload_script.py (607d4b0104b28e646036b609e4fd4fdeb06f1b4f)


Co-authored-by: Md Shakin <mdShakinRobofication@users.noreply.huggingface.co>

Files changed (1) hide show
  1. vectorDB_upload_script.py +208 -0
vectorDB_upload_script.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script to store FAISS and pickle files in Azure Cosmos DB MongoDB API
3
+ This script should be run only once to upload the vector database files.
4
+
5
+ run this at root of the project:
6
+ python -m app.services.chatbot.vectorDB_upload_script
7
+ """
8
+
9
import asyncio
import logging
import os
from datetime import datetime, timezone

from dotenv import load_dotenv
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorGridFSBucket
15
+
16
# Configure logging: INFO level so per-file upload progress is visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # module-level logger used by all code below

# Load environment variables from .env (expected to supply MONGODB_KEY — see config below)
load_dotenv()
22
+
23
# Database configuration
MONGO_URI = os.getenv("MONGODB_KEY")  # connection string, supplied via .env / environment
DB_NAME = "SysmodelerDB"
BUCKET_NAME = "vector_db"

# File paths — overridable via environment variables so the script is not tied
# to one machine's download directory; the defaults preserve the original paths.
FAISS_PATH = os.getenv("FAISS_PATH", r"c:\Users\User\Downloads\faiss_index_sysml\index.faiss")
PICKLE_PATH = os.getenv("PICKLE_PATH", r"c:\Users\User\Downloads\faiss_index_sysml\index.pkl")
31
+
32
class VectorDBUploader:
    """Uploads the FAISS index and pickle sidecar into a GridFS bucket on
    Azure Cosmos DB (MongoDB API).

    Lifecycle: ``await connect()`` first, then ``await upload_vector_files()``,
    and finally ``await close()``. All methods log failures instead of raising,
    and report success via their boolean return values.
    """

    def __init__(self):
        # All handles are populated by connect(); None until then.
        self.client = None  # AsyncIOMotorClient
        self.db = None      # database handle (client[DB_NAME])
        self.fs = None      # AsyncIOMotorGridFSBucket on BUCKET_NAME

    async def connect(self) -> bool:
        """Connect to MongoDB and open the GridFS bucket.

        Returns:
            True if the connection was established and verified, else False.
        """
        try:
            self.client = AsyncIOMotorClient(MONGO_URI)
            self.db = self.client[DB_NAME]
            self.fs = AsyncIOMotorGridFSBucket(self.db, bucket_name=BUCKET_NAME)

            # Cheap round-trip to verify the connection string actually works.
            await self.client.admin.command('ping')
            logger.info("Successfully connected to MongoDB")
            return True
        except Exception as e:
            logger.error(f"Failed to connect to MongoDB: {e}")
            return False

    async def file_exists(self, filename: str) -> bool:
        """Return True if a file named *filename* already exists in GridFS.

        Errors are logged and reported as False (treated as "not found").
        """
        try:
            async for _file_doc in self.fs.find({"filename": filename}):
                return True
            return False
        except Exception as e:
            logger.error(f"Error checking file existence: {e}")
            return False

    async def upload_file(self, file_path: str, filename: str) -> bool:
        """Upload a local file to GridFS, overwriting any existing files with
        the same name.

        Args:
            file_path: Local filesystem path of the file to upload.
            filename: Name to store the file under in GridFS.

        Returns:
            True on success, False on any failure (logged, not raised).
        """
        try:
            # Check if file exists locally before touching the database.
            if not os.path.exists(file_path):
                logger.error(f"Local file not found: {file_path}")
                return False

            # Remove any existing file with the same name so GridFS does not
            # accumulate multiple revisions of the same logical file.
            async for file_doc in self.fs.find({"filename": filename}):
                await self.fs.delete(file_doc._id)
                logger.info(f"Deleted existing file: {filename} (ID: {file_doc._id})")

            # Get file size for logging
            file_size = os.path.getsize(file_path)
            logger.info(f"Uploading {filename} ({file_size} bytes)...")

            # Stream the file into GridFS with provenance metadata.
            with open(file_path, 'rb') as f:
                file_id = await self.fs.upload_from_stream(
                    filename,
                    f,
                    metadata={
                        # datetime.utcnow() is deprecated; use an aware UTC timestamp.
                        "uploaded_at": datetime.now(timezone.utc),
                        "original_path": file_path,
                        "file_size": file_size,
                        "description": f"Vector database file: {filename}"
                    }
                )

            logger.info(f"Successfully uploaded {filename} with ID: {file_id}")
            return True

        except Exception as e:
            logger.error(f"Failed to upload {filename}: {e}")
            return False

    async def list_files(self):
        """Log every file currently stored in the GridFS bucket."""
        try:
            logger.info(f"Files in {BUCKET_NAME} bucket:")
            async for file_doc in self.fs.find():
                logger.info(f"- {file_doc.filename} (ID: {file_doc._id}, Size: {file_doc.length} bytes)")
        except Exception as e:
            logger.error(f"Failed to list files: {e}")

    async def upload_vector_files(self) -> bool:
        """Upload both the FAISS index and its pickle sidecar.

        Returns:
            True only if every file uploaded successfully.
        """
        files_to_upload = [
            (FAISS_PATH, "index.faiss"),
            (PICKLE_PATH, "index.pkl")
        ]

        success_count = 0
        for file_path, filename in files_to_upload:
            if await self.upload_file(file_path, filename):
                success_count += 1
            else:
                logger.error(f"Failed to upload {filename}")

        logger.info(f"Upload completed: {success_count}/{len(files_to_upload)} files uploaded successfully")
        return success_count == len(files_to_upload)

    async def close(self):
        """Close the MongoDB client, if one was ever opened."""
        if self.client:
            self.client.close()
            logger.info("Database connection closed")
170
+
171
async def main():
    """Connect, upload the vector database files, then report bucket contents."""
    db_uploader = VectorDBUploader()

    try:
        # Bail out early when the database is unreachable.
        connected = await db_uploader.connect()
        if not connected:
            logger.error("Failed to connect to database. Exiting...")
            return

        logger.info("Starting vector database files upload...")
        if await db_uploader.upload_vector_files():
            logger.info("All files uploaded successfully!")
        else:
            logger.error("Some files failed to upload")

        # Show the bucket contents as a final sanity check.
        await db_uploader.list_files()

    except Exception as e:
        logger.error(f"Unexpected error: {e}")
    finally:
        await db_uploader.close()
197
+
198
if __name__ == "__main__":
    # Print a configuration banner before starting the upload.
    separator = "=" * 50
    banner = (
        "Vector Database File Uploader",
        separator,
        f"Database: {DB_NAME}",
        f"Bucket: {BUCKET_NAME}",
        f"FAISS file: {FAISS_PATH}",
        f"Pickle file: {PICKLE_PATH}",
        separator,
    )
    for banner_line in banner:
        print(banner_line)

    # Run the one-shot upload process.
    asyncio.run(main())