Roxanne-WANG committed on
Commit
305d669
·
1 Parent(s): 749f953
Files changed (2) hide show
  1. app.py +0 -66
  2. schema_item_filter.py +23 -12
app.py CHANGED
@@ -1,69 +1,3 @@
1
- # import streamlit as st
2
- # from text2sql import ChatBot
3
- # from langdetect import detect
4
- # from utils.translate_utils import translate_zh_to_en
5
- # from utils.db_utils import add_a_record
6
- # from langdetect.lang_detect_exception import LangDetectException
7
-
8
- # # Initialize chatbot and other variables
9
- # text2sql_bot = ChatBot()
10
- # baidu_api_token = None
11
-
12
- # # Define database schemas for demonstration
13
- # db_schemas = {
14
- # "singer": """
15
- # CREATE TABLE "singer" (
16
- # "Singer_ID" int,
17
- # "Name" text,
18
- # "Birth_Year" real,
19
- # "Net_Worth_Millions" real,
20
- # "Citizenship" text,
21
- # PRIMARY KEY ("Singer_ID")
22
- # );
23
-
24
- # CREATE TABLE "song" (
25
- # "Song_ID" int,
26
- # "Title" text,
27
- # "Singer_ID" int,
28
- # "Sales" real,
29
- # "Highest_Position" real,
30
- # PRIMARY KEY ("Song_ID"),
31
- # FOREIGN KEY ("Singer_ID") REFERENCES "singer"("Singer_ID")
32
- # );
33
- # """,
34
- # # Add other schemas as needed
35
- # }
36
-
37
- # # Streamlit UI
38
- # st.title("Text-to-SQL Chatbot")
39
- # st.sidebar.header("Select a Database")
40
-
41
- # # Sidebar for selecting a database
42
- # selected_db = st.sidebar.selectbox("Choose a database:", list(db_schemas.keys()))
43
-
44
- # # Display the selected schema
45
- # st.sidebar.text_area("Database Schema", db_schemas[selected_db], height=600)
46
-
47
- # # User input section
48
- # question = st.text_input("Enter your question:")
49
- # db_id = selected_db # Use selected database for DB ID
50
-
51
- # if question:
52
- # add_a_record(question, db_id)
53
-
54
- # try:
55
- # if baidu_api_token is not None and detect(question) != "en":
56
- # print("Before translation:", question)
57
- # question = translate_zh_to_en(question, baidu_api_token)
58
- # print("After translation:", question)
59
- # except LangDetectException as e:
60
- # print("Language detection error:", str(e))
61
-
62
- # predicted_sql = text2sql_bot.get_response(question, db_id)
63
- # st.write(f"**Database:** {db_id}")
64
- # st.write(f"**Predicted SQL query:** {predicted_sql}")
65
-
66
-
67
  import streamlit as st
68
  from text2sql import ChatBot
69
  from transformers import (
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  from text2sql import ChatBot
3
  from transformers import (
schema_item_filter.py CHANGED
@@ -238,15 +238,25 @@ def lista_contains_listb(lista, listb):
238
  class SchemaItemClassifierInference():
239
  def __init__(self, model_save_path):
240
  set_seed(42)
241
- # load tokenizer from Hugging Face
242
- self.tokenizer = AutoTokenizer.from_pretrained(model_save_path, add_prefix_space = True)
243
- # load model directly from Hugging Face
 
 
244
  self.model = SchemaItemClassifier(model_save_path, "test")
245
  self.model.eval()
 
 
 
 
246
 
247
  def predict_one(self, sample):
248
- encoder_input_ids, encoder_input_attention_mask, column_name_token_indices,\
249
- table_name_token_indices, column_num_in_each_table = prepare_inputs_and_labels(sample, self.tokenizer)
 
 
 
 
250
 
251
  with torch.no_grad():
252
  model_outputs = self.model(
@@ -258,15 +268,16 @@ class SchemaItemClassifierInference():
258
  )
259
 
260
  table_logits = model_outputs["batch_table_name_cls_logits"][0]
261
- table_pred_probs = torch.nn.functional.softmax(table_logits, dim = 1)[:, 1].cpu().tolist()
262
-
263
  column_logits = model_outputs["batch_column_info_cls_logits"][0]
264
- column_pred_probs = torch.nn.functional.softmax(column_logits, dim = 1)[:, 1].cpu().tolist()
265
 
266
  splitted_column_pred_probs = []
267
- # split predicted column probs into each table
268
  for table_id, column_num in enumerate(column_num_in_each_table):
269
- splitted_column_pred_probs.append(column_pred_probs[sum(column_num_in_each_table[:table_id]): sum(column_num_in_each_table[:table_id]) + column_num])
 
270
  column_pred_probs = splitted_column_pred_probs
271
 
272
  result_dict = dict()
@@ -329,9 +340,9 @@ class SchemaItemClassifierInference():
329
  print(column_coverage_results)
330
 
331
  if __name__ == "__main__":
332
- dataset_name = "bird_with_evidence"
333
  # dataset_name = "bird"
334
- # dataset_name = "spider"
335
  sic = SchemaItemClassifierInference("sic_ckpts/sic_{}".format(dataset_name))
336
  import json
337
  dataset = json.load(open("./data/sft_eval_{}_text2sql.json".format(dataset_name)))
 
238
  class SchemaItemClassifierInference():
239
  def __init__(self, model_save_path):
240
  set_seed(42)
241
+
242
+ # Load tokenizer from Hugging Face
243
+ self.tokenizer = AutoTokenizer.from_pretrained(model_save_path, add_prefix_space=True)
244
+
245
+ # Load the model from Hugging Face or local path
246
  self.model = SchemaItemClassifier(model_save_path, "test")
247
  self.model.eval()
248
+
249
+ # Move model to GPU if available, otherwise stay on CPU
250
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
251
+ self.model = self.model.to(self.device) # Move model to the selected device
252
 
253
  def predict_one(self, sample):
254
+ encoder_input_ids, encoder_input_attention_mask, column_name_token_indices, \
255
+ table_name_token_indices, column_num_in_each_table = prepare_inputs_and_labels(sample, self.tokenizer)
256
+
257
+ # Ensure all tensors are moved to the same device as the model
258
+ encoder_input_ids = encoder_input_ids.to(self.device)
259
+ encoder_input_attention_mask = encoder_input_attention_mask.to(self.device)
260
 
261
  with torch.no_grad():
262
  model_outputs = self.model(
 
268
  )
269
 
270
  table_logits = model_outputs["batch_table_name_cls_logits"][0]
271
+ table_pred_probs = torch.nn.functional.softmax(table_logits, dim=1)[:, 1].cpu().tolist()
272
+
273
  column_logits = model_outputs["batch_column_info_cls_logits"][0]
274
+ column_pred_probs = torch.nn.functional.softmax(column_logits, dim=1)[:, 1].cpu().tolist()
275
 
276
  splitted_column_pred_probs = []
277
+ # Split predicted column probs into each table
278
  for table_id, column_num in enumerate(column_num_in_each_table):
279
+ splitted_column_pred_probs.append(column_pred_probs[sum(column_num_in_each_table[:table_id]):
280
+ sum(column_num_in_each_table[:table_id]) + column_num])
281
  column_pred_probs = splitted_column_pred_probs
282
 
283
  result_dict = dict()
 
340
  print(column_coverage_results)
341
 
342
  if __name__ == "__main__":
343
+ # dataset_name = "bird_with_evidence"
344
  # dataset_name = "bird"
345
+ dataset_name = "spider"
346
  sic = SchemaItemClassifierInference("sic_ckpts/sic_{}".format(dataset_name))
347
  import json
348
  dataset = json.load(open("./data/sft_eval_{}_text2sql.json".format(dataset_name)))