Spaces:
Paused
Paused
Commit
·
305d669
1
Parent(s):
749f953
update
Browse files- app.py +0 -66
- schema_item_filter.py +23 -12
app.py
CHANGED
@@ -1,69 +1,3 @@
|
|
1 |
-
# import streamlit as st
|
2 |
-
# from text2sql import ChatBot
|
3 |
-
# from langdetect import detect
|
4 |
-
# from utils.translate_utils import translate_zh_to_en
|
5 |
-
# from utils.db_utils import add_a_record
|
6 |
-
# from langdetect.lang_detect_exception import LangDetectException
|
7 |
-
|
8 |
-
# # Initialize chatbot and other variables
|
9 |
-
# text2sql_bot = ChatBot()
|
10 |
-
# baidu_api_token = None
|
11 |
-
|
12 |
-
# # Define database schemas for demonstration
|
13 |
-
# db_schemas = {
|
14 |
-
# "singer": """
|
15 |
-
# CREATE TABLE "singer" (
|
16 |
-
# "Singer_ID" int,
|
17 |
-
# "Name" text,
|
18 |
-
# "Birth_Year" real,
|
19 |
-
# "Net_Worth_Millions" real,
|
20 |
-
# "Citizenship" text,
|
21 |
-
# PRIMARY KEY ("Singer_ID")
|
22 |
-
# );
|
23 |
-
|
24 |
-
# CREATE TABLE "song" (
|
25 |
-
# "Song_ID" int,
|
26 |
-
# "Title" text,
|
27 |
-
# "Singer_ID" int,
|
28 |
-
# "Sales" real,
|
29 |
-
# "Highest_Position" real,
|
30 |
-
# PRIMARY KEY ("Song_ID"),
|
31 |
-
# FOREIGN KEY ("Singer_ID") REFERENCES "singer"("Singer_ID")
|
32 |
-
# );
|
33 |
-
# """,
|
34 |
-
# # Add other schemas as needed
|
35 |
-
# }
|
36 |
-
|
37 |
-
# # Streamlit UI
|
38 |
-
# st.title("Text-to-SQL Chatbot")
|
39 |
-
# st.sidebar.header("Select a Database")
|
40 |
-
|
41 |
-
# # Sidebar for selecting a database
|
42 |
-
# selected_db = st.sidebar.selectbox("Choose a database:", list(db_schemas.keys()))
|
43 |
-
|
44 |
-
# # Display the selected schema
|
45 |
-
# st.sidebar.text_area("Database Schema", db_schemas[selected_db], height=600)
|
46 |
-
|
47 |
-
# # User input section
|
48 |
-
# question = st.text_input("Enter your question:")
|
49 |
-
# db_id = selected_db # Use selected database for DB ID
|
50 |
-
|
51 |
-
# if question:
|
52 |
-
# add_a_record(question, db_id)
|
53 |
-
|
54 |
-
# try:
|
55 |
-
# if baidu_api_token is not None and detect(question) != "en":
|
56 |
-
# print("Before translation:", question)
|
57 |
-
# question = translate_zh_to_en(question, baidu_api_token)
|
58 |
-
# print("After translation:", question)
|
59 |
-
# except LangDetectException as e:
|
60 |
-
# print("Language detection error:", str(e))
|
61 |
-
|
62 |
-
# predicted_sql = text2sql_bot.get_response(question, db_id)
|
63 |
-
# st.write(f"**Database:** {db_id}")
|
64 |
-
# st.write(f"**Predicted SQL query:** {predicted_sql}")
|
65 |
-
|
66 |
-
|
67 |
import streamlit as st
|
68 |
from text2sql import ChatBot
|
69 |
from transformers import (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
from text2sql import ChatBot
|
3 |
from transformers import (
|
schema_item_filter.py
CHANGED
@@ -238,15 +238,25 @@ def lista_contains_listb(lista, listb):
|
|
238 |
class SchemaItemClassifierInference():
|
239 |
def __init__(self, model_save_path):
|
240 |
set_seed(42)
|
241 |
-
|
242 |
-
|
243 |
-
|
|
|
|
|
244 |
self.model = SchemaItemClassifier(model_save_path, "test")
|
245 |
self.model.eval()
|
|
|
|
|
|
|
|
|
246 |
|
247 |
def predict_one(self, sample):
|
248 |
-
encoder_input_ids, encoder_input_attention_mask, column_name_token_indices
|
249 |
-
|
|
|
|
|
|
|
|
|
250 |
|
251 |
with torch.no_grad():
|
252 |
model_outputs = self.model(
|
@@ -258,15 +268,16 @@ class SchemaItemClassifierInference():
|
|
258 |
)
|
259 |
|
260 |
table_logits = model_outputs["batch_table_name_cls_logits"][0]
|
261 |
-
table_pred_probs = torch.nn.functional.softmax(table_logits, dim
|
262 |
-
|
263 |
column_logits = model_outputs["batch_column_info_cls_logits"][0]
|
264 |
-
column_pred_probs = torch.nn.functional.softmax(column_logits, dim
|
265 |
|
266 |
splitted_column_pred_probs = []
|
267 |
-
#
|
268 |
for table_id, column_num in enumerate(column_num_in_each_table):
|
269 |
-
splitted_column_pred_probs.append(column_pred_probs[sum(column_num_in_each_table[:table_id]):
|
|
|
270 |
column_pred_probs = splitted_column_pred_probs
|
271 |
|
272 |
result_dict = dict()
|
@@ -329,9 +340,9 @@ class SchemaItemClassifierInference():
|
|
329 |
print(column_coverage_results)
|
330 |
|
331 |
if __name__ == "__main__":
|
332 |
-
dataset_name = "bird_with_evidence"
|
333 |
# dataset_name = "bird"
|
334 |
-
|
335 |
sic = SchemaItemClassifierInference("sic_ckpts/sic_{}".format(dataset_name))
|
336 |
import json
|
337 |
dataset = json.load(open("./data/sft_eval_{}_text2sql.json".format(dataset_name)))
|
|
|
238 |
class SchemaItemClassifierInference():
|
239 |
def __init__(self, model_save_path):
|
240 |
set_seed(42)
|
241 |
+
|
242 |
+
# Load tokenizer from Hugging Face
|
243 |
+
self.tokenizer = AutoTokenizer.from_pretrained(model_save_path, add_prefix_space=True)
|
244 |
+
|
245 |
+
# Load the model from Hugging Face or local path
|
246 |
self.model = SchemaItemClassifier(model_save_path, "test")
|
247 |
self.model.eval()
|
248 |
+
|
249 |
+
# Move model to GPU if available, otherwise stay on CPU
|
250 |
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
251 |
+
self.model = self.model.to(self.device) # Move model to the selected device
|
252 |
|
253 |
def predict_one(self, sample):
|
254 |
+
encoder_input_ids, encoder_input_attention_mask, column_name_token_indices, \
|
255 |
+
table_name_token_indices, column_num_in_each_table = prepare_inputs_and_labels(sample, self.tokenizer)
|
256 |
+
|
257 |
+
# Ensure all tensors are moved to the same device as the model
|
258 |
+
encoder_input_ids = encoder_input_ids.to(self.device)
|
259 |
+
encoder_input_attention_mask = encoder_input_attention_mask.to(self.device)
|
260 |
|
261 |
with torch.no_grad():
|
262 |
model_outputs = self.model(
|
|
|
268 |
)
|
269 |
|
270 |
table_logits = model_outputs["batch_table_name_cls_logits"][0]
|
271 |
+
table_pred_probs = torch.nn.functional.softmax(table_logits, dim=1)[:, 1].cpu().tolist()
|
272 |
+
|
273 |
column_logits = model_outputs["batch_column_info_cls_logits"][0]
|
274 |
+
column_pred_probs = torch.nn.functional.softmax(column_logits, dim=1)[:, 1].cpu().tolist()
|
275 |
|
276 |
splitted_column_pred_probs = []
|
277 |
+
# Split predicted column probs into each table
|
278 |
for table_id, column_num in enumerate(column_num_in_each_table):
|
279 |
+
splitted_column_pred_probs.append(column_pred_probs[sum(column_num_in_each_table[:table_id]):
|
280 |
+
sum(column_num_in_each_table[:table_id]) + column_num])
|
281 |
column_pred_probs = splitted_column_pred_probs
|
282 |
|
283 |
result_dict = dict()
|
|
|
340 |
print(column_coverage_results)
|
341 |
|
342 |
if __name__ == "__main__":
|
343 |
+
# dataset_name = "bird_with_evidence"
|
344 |
# dataset_name = "bird"
|
345 |
+
dataset_name = "spider"
|
346 |
sic = SchemaItemClassifierInference("sic_ckpts/sic_{}".format(dataset_name))
|
347 |
import json
|
348 |
dataset = json.load(open("./data/sft_eval_{}_text2sql.json".format(dataset_name)))
|