Futuresony commited on
Commit
f3f0d14
·
verified ·
1 Parent(s): 029dfaa

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -285
app.py DELETED
@@ -1,285 +0,0 @@
1
- import psycopg2
2
- import os
3
- import pickle
4
- import traceback
5
- import numpy as np
6
- import json
7
- import base64
8
- import time
9
-
10
- # Assuming gspread and SentenceTransformer are installed
11
- try:
12
- import gspread
13
- from oauth2client.service_account import ServiceAccountCredentials
14
- from sentence_transformers import SentenceTransformer
15
- print("gspread and SentenceTransformer imported successfully.")
16
- except ImportError:
17
- print("Error: Required libraries (gspread, oauth2client, sentence_transformers) not found.")
18
- print("Please install them: pip install psycopg2-binary gspread oauth2client sentence-transformers numpy")
19
- # Exit or handle the error appropriately if libraries are missing
20
- exit() # Exiting for demonstration if imports fail
21
-
22
# --- Database connection configuration ---
# SECURITY FIX: the previous revision hard-coded a live password and host in
# source control. All credentials are now read from the environment, matching
# the commented-out os.getenv() calls the original author left in place.
DB_HOST = os.getenv("DB_HOST", "wziqfkzaqorzthpoxhjh.supabase.co")
DB_NAME = os.getenv("DB_NAME", "postgres")
DB_USER = os.getenv("DB_USER", "postgres")
DB_PASSWORD = os.getenv("DB_PASSWORD")  # required; deliberately no default
DB_PORT = os.getenv("DB_PORT", "5432")  # default PostgreSQL port

# --- Google Sheets configuration ---
GOOGLE_BASE64_CREDENTIALS = os.getenv("GOOGLE_BASE64_CREDENTIALS")
# Sheet ID may be overridden via the environment; falls back to the project sheet.
SHEET_ID = os.getenv("SHEET_ID", "19ipxC2vHYhpXCefpxpIkpeYdI43a1Ku2kYwecgUULIw")

# --- Table names used throughout the migration ---
BUSINESS_DATA_TABLE = "business_data"
CONVERSATION_HISTORY_TABLE = "conversation_history"

# Embedding dimension; must match the Sentence Transformer model loaded in
# __main__ (paraphrase-MiniLM-L6-v2 produces 384-dim vectors).
EMBEDDING_DIM = 384
45
-
46
- # --- Database Functions ---
47
def connect_db():
    """Establish and return a psycopg2 connection, or None on failure.

    Reads the module-level DB_* settings. Because psycopg2 expects a bare
    hostname rather than a URL, an accidental ``http(s)://`` scheme is
    stripped from the host before connecting.

    Returns:
        psycopg2 connection object, or None if credentials are missing or
        the connection attempt raises.
    """
    print("Attempting to connect to the database...")
    # Snapshot the module-level settings locally.
    db_host = DB_HOST
    db_name = DB_NAME
    db_user = DB_USER
    db_password = DB_PASSWORD
    db_port = DB_PORT

    if not all([db_host, db_name, db_user, db_password]):
        print("Error: Database credentials (DB_HOST, DB_NAME, DB_USER, DB_PASSWORD) are not fully set as environment variables.")
        return None

    # BUG FIX: the old code used str.replace(), which removes EVERY
    # occurrence of the substring, not just a leading scheme. Strip the
    # prefix only.
    for scheme in ("https://", "http://"):
        if db_host.startswith(scheme):
            db_host = db_host[len(scheme):]
            break

    try:
        conn = psycopg2.connect(
            host=db_host,
            database=db_name,
            user=db_user,
            password=db_password,
            port=db_port,
        )
        print("Database connection successful.")
        return conn
    except Exception as e:
        print(f"Error connecting to the database: {e}")
        print(traceback.format_exc())
        return None
84
-
85
def setup_db_schema(conn):
    """Enable the pgvector extension and create the application tables.

    Runs each DDL statement in order, commits once all succeed, and rolls
    back on any failure.

    Args:
        conn: an open psycopg2 connection.

    Returns:
        True when the schema is in place, False after a rollback.
    """
    print("Setting up database schema...")
    # Each step pairs a DDL statement with the progress message to emit
    # once it has executed.
    ddl_steps = [
        (
            "CREATE EXTENSION IF NOT EXISTS vector;",
            "pgvector extension enabled (if not already).",
        ),
        (
            f"""
                CREATE TABLE IF NOT EXISTS {BUSINESS_DATA_TABLE} (
                    id SERIAL PRIMARY KEY,
                    service TEXT NOT NULL,
                    description TEXT NOT NULL,
                    embedding vector({EMBEDDING_DIM}) -- Assuming EMBEDDING_DIM is defined globally
                );
            """,
            f"Table '{BUSINESS_DATA_TABLE}' created (if not already).",
        ),
        (
            f"""
                CREATE TABLE IF NOT EXISTS {CONVERSATION_HISTORY_TABLE} (
                    id SERIAL PRIMARY KEY,
                    timestamp TIMESTAMP WITH TIME ZONE NOT NULL,
                    user_id TEXT,
                    user_query TEXT,
                    model_response TEXT,
                    tool_details JSONB,
                    model_used TEXT
                );
            """,
            f"Table '{CONVERSATION_HISTORY_TABLE}' created (if not already).",
        ),
    ]
    try:
        with conn.cursor() as cur:
            for statement, done_message in ddl_steps:
                cur.execute(statement)
                print(done_message)
        conn.commit()
        print("Database schema setup complete.")
        return True
    except Exception as e:
        print(f"Error setting up database schema: {e}")
        print(traceback.format_exc())
        conn.rollback()
        return False
127
-
128
- # --- Google Sheets Authentication and Data Retrieval ---
129
def authenticate_google_sheets():
    """Build an authorized gspread client from base64-encoded credentials.

    Decodes the GOOGLE_BASE64_CREDENTIALS environment value into a service
    account JSON dict and authorizes a gspread client with it.

    Returns:
        An authorized gspread client, or None when credentials are absent
        or authentication fails.
    """
    print("Authenticating Google Account for Sheets access...")
    if not GOOGLE_BASE64_CREDENTIALS:
        print("Error: GOOGLE_BASE64_CREDENTIALS environment variable not set. Google Sheets access will fail.")
        return None
    try:
        decoded_json = base64.b64decode(GOOGLE_BASE64_CREDENTIALS).decode('utf-8')
        creds_dict = json.loads(decoded_json)
        # Both Sheets and Drive scopes are needed to open a sheet by key.
        scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
        service_creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
        client = gspread.authorize(service_creds)
        print("Google Sheets authentication successful.")
        return client
    except Exception as e:
        print(f"Google Sheets authentication failed: {e}")
        print(traceback.format_exc())
        print("Please ensure your GOOGLE_BASE64_CREDENTIALS environment variable is correctly set and contains valid service account credentials.")
        return None
149
-
150
- # --- Data Migration Function ---
151
def migrate_google_sheet_data_to_db(conn, gc_client, embedder_model):
    """Copy rows from the Google Sheet into the business-data table.

    Reads all records from the first worksheet, keeps rows that have both a
    'Service' and a 'Description', embeds them in batches, and inserts each
    row with its embedding. Skips the whole migration (returning True) when
    the table already contains data.

    Args:
        conn: open psycopg2 connection.
        gc_client: authorized gspread client, or None.
        embedder_model: SentenceTransformer instance, or None.

    Returns:
        True on success (or when data is already present), False otherwise.
    """
    print("Migrating data from Google Sheet to database...")
    if gc_client is None or SHEET_ID is None:
        print("Skipping Google Sheet migration: Google Sheets client or Sheet ID not available.")
        return False
    if embedder_model is None:
        print("Skipping Google Sheet migration: Embedder not available.")
        return False
    if EMBEDDING_DIM is None:
        print("Skipping Google Sheet migration: EMBEDDING_DIM not defined.")
        return False

    try:
        # Idempotence guard: a populated table means a prior run succeeded.
        with conn.cursor() as cur:
            cur.execute(f"SELECT COUNT(*) FROM {BUSINESS_DATA_TABLE};")
            count = cur.fetchone()[0]
            if count > 0:
                print(f"Table '{BUSINESS_DATA_TABLE}' already contains {count} records. Skipping migration.")
                return True

        sheet = gc_client.open_by_key(SHEET_ID).sheet1
        print(f"Successfully opened Google Sheet with ID: {SHEET_ID}")
        data_records = sheet.get_all_records()

        if not data_records:
            print("No data records found in Google Sheet.")
            return False

        filtered_data = [row for row in data_records if row.get('Service') and row.get('Description')]
        if not filtered_data:
            print("Filtered data is empty after checking for 'Service' and 'Description'.")
            return False

        print(f"Processing {len(filtered_data)} records for migration.")
        descriptions_for_embedding = [
            f"Service: {row['Service'].strip()}. Description: {row['Description'].strip()}"
            for row in filtered_data
        ]

        # Encode in batches so large sheets do not exhaust memory.
        batch_size = 64
        # BUG FIX: the old `int(total/batch_size) + 1` over-counted the number
        # of batches whenever total was an exact multiple of batch_size;
        # ceiling division gives the true count.
        total_batches = -(-len(descriptions_for_embedding) // batch_size)
        embeddings_list = []
        for i in range(0, len(descriptions_for_embedding), batch_size):
            batch_descriptions = descriptions_for_embedding[i:i + batch_size]
            print(f"Encoding batch {i // batch_size + 1} of {total_batches}...")
            batch_embeddings = embedder_model.encode(batch_descriptions, convert_to_tensor=False)
            embeddings_list.extend(batch_embeddings.tolist())  # numpy array -> plain lists

        insert_count = 0
        with conn.cursor() as cur:
            # filtered_data and embeddings_list are parallel by construction.
            for row, embedding in zip(filtered_data, embeddings_list):
                service = row.get('Service', '').strip()
                description = row.get('Description', '').strip()
                # '%s::vector' lets pgvector parse the Python list literal.
                cur.execute(f"""
                    INSERT INTO {BUSINESS_DATA_TABLE} (service, description, embedding)
                    VALUES (%s, %s, %s::vector);
                """, (service, description, embedding))
                insert_count += 1
                if insert_count % 100 == 0:
                    conn.commit()  # commit periodically to bound transaction size
                    print(f"Inserted {insert_count} records...")

        conn.commit()  # commit any remainder below the periodic threshold
        print(f"Migration complete. Inserted {insert_count} records into '{BUSINESS_DATA_TABLE}'.")
        return True

    except Exception as e:
        print(f"Error during Google Sheet data migration: {e}")
        print(traceback.format_exc())
        conn.rollback()
        return False
225
-
226
# --- Main Migration Execution ---
if __name__ == "__main__":
    print("Starting RAG data migration script...")

    # Step 1: Google Sheets client — abort early if authentication fails.
    gc = authenticate_google_sheets()
    if gc is None:
        print("Google Sheets authentication failed. Cannot migrate data from Sheets.")
        exit()

    # Step 2: embedding model — its output dimension must match the
    # vector column width declared by EMBEDDING_DIM.
    embedder = None
    try:
        print(f"Loading Sentence Transformer model for embeddings (dimension: {EMBEDDING_DIM})...")
        embedder = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
        actual_dim = embedder.get_sentence_embedding_dimension()
        if actual_dim != EMBEDDING_DIM:
            print(f"Error: Loaded embedder dimension ({actual_dim}) does not match expected EMBEDDING_DIM ({EMBEDDING_DIM}).")
            print("Please check the model or update EMBEDDING_DIM.")
            embedder = None  # refuse to migrate with a mismatched dimension
        else:
            print("Embedder model loaded successfully.")
    except Exception as e:
        print(f"Error loading Sentence Transformer model: {e}")
        print(traceback.format_exc())
        embedder = None

    if embedder is None:
        print("Embedder model not available. Cannot generate embeddings for migration.")
        exit()

    # Step 3: database connection.
    db_conn = connect_db()
    if db_conn is None:
        print("Database connection failed. Cannot migrate data.")
        exit()

    try:
        # Steps 4-5: ensure schema exists, then migrate the sheet data.
        if not setup_db_schema(db_conn):
            print("\nDatabase schema setup failed. Data migration skipped.")
        elif migrate_google_sheet_data_to_db(db_conn, gc, embedder):
            print("\nRAG Data Migration to PostgreSQL completed successfully.")
        else:
            print("\nRAG Data Migration to PostgreSQL failed.")
    finally:
        # Step 6: always release the connection, even after an exception.
        if db_conn:
            db_conn.close()
            print("Database connection closed.")

    print("\nMigration script finished.")