Commit b759b87
Parent(s): b423caf
update webpage
Files changed:
- Dockerfile +45 -0
- README.md +2 -2
- app.py +64 -0
- build_contents_index.py +75 -0
- data/.DS_Store +0 -0
- data/history/.DS_Store +0 -0
- data/tables.json +0 -0
- databases/.DS_Store +0 -0
- databases/singer/schema.sql +39 -0
- databases/singer/singer.sqlite +0 -0
- requirements.txt +18 -0
- schema_item_filter.py +343 -0
- text2sql.py +182 -0
- utils/bridge_content_encoder.py +261 -0
- utils/classifier_model.py +186 -0
- utils/db_utils.py +255 -0
- utils/translate_utils.py +23 -0
Dockerfile
ADDED
@@ -0,0 +1,45 @@
# Use NVIDIA CUDA base image for GPU support
FROM nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04

# Set the working directory
WORKDIR /app

# Update and install system dependencies (including Java and other tools)
RUN apt-get update && \
    apt-get install -y openjdk-11-jdk git rsync make build-essential libssl-dev zlib1g-dev \
    libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm \
    libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev \
    libffi-dev liblzma-dev git-lfs ffmpeg libsm6 libxext6 cmake \
    libgl1-mesa-glx && rm -rf /var/lib/apt/lists/* && git lfs install

# Set JAVA_HOME environment variable
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
ENV PATH="${JAVA_HOME}/bin:${PATH}"

# Install Python version manager (pyenv) and Python 3.10
RUN curl https://pyenv.run | bash
RUN pyenv install 3.10 && pyenv global 3.10 && pyenv rehash

# Install pip and other dependencies
RUN pip install --no-cache-dir --upgrade pip
RUN pip install --no-cache-dir datasets transformers langdetect streamlit

# Install PyTorch and CUDA dependencies
RUN pip install --no-cache-dir torch==1.13.1+cu117 torchvision==0.14.1 torchaudio==0.13.1

# Copy requirements.txt and install dependencies
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code to the container
COPY . /app/

# Expose the port the app will run on
EXPOSE 8501

# Set the environment variable for streamlit
ENV STREAMLIT_SERVER_PORT=8501
ENV STREAMLIT_SERVER_HEADLESS=true

# Command to run the application
CMD ["streamlit", "run", "app.py"]
README.md
CHANGED
@@ -1,7 +1,7 @@
 ---
 title: LangSQL
-emoji:
-colorFrom:
+emoji: 🦕
+colorFrom: blue
 colorTo: gray
 sdk: streamlit
 sdk_version: 1.44.1
app.py
ADDED
@@ -0,0 +1,64 @@
import streamlit as st
from text2sql import ChatBot
from langdetect import detect
from utils.translate_utils import translate_zh_to_en
from utils.db_utils import add_a_record
from langdetect.lang_detect_exception import LangDetectException

# Initialize chatbot and other variables
text2sql_bot = ChatBot()
baidu_api_token = None

# Define database schemas for demonstration
db_schemas = {
    "singer": """
CREATE TABLE "singer" (
    "Singer_ID" int,
    "Name" text,
    "Birth_Year" real,
    "Net_Worth_Millions" real,
    "Citizenship" text,
    PRIMARY KEY ("Singer_ID")
);

CREATE TABLE "song" (
    "Song_ID" int,
    "Title" text,
    "Singer_ID" int,
    "Sales" real,
    "Highest_Position" real,
    PRIMARY KEY ("Song_ID"),
    FOREIGN KEY ("Singer_ID") REFERENCES "singer"("Singer_ID")
);
""",
    # Add other schemas as needed
}

# Streamlit UI
st.title("Text-to-SQL Chatbot")
st.sidebar.header("Select a Database")

# Sidebar for selecting a database
selected_db = st.sidebar.selectbox("Choose a database:", list(db_schemas.keys()))

# Display the selected schema
st.sidebar.text_area("Database Schema", db_schemas[selected_db], height=600)

# User input section
question = st.text_input("Enter your question:")
db_id = selected_db  # Use selected database for DB ID

if question:
    add_a_record(question, db_id)

    try:
        if baidu_api_token is not None and detect(question) != "en":
            print("Before translation:", question)
            question = translate_zh_to_en(question, baidu_api_token)
            print("After translation:", question)
    except LangDetectException as e:
        print("Language detection error:", str(e))

    predicted_sql = text2sql_bot.get_response(question, db_id)
    st.write(f"**Database:** {db_id}")
    st.write(f"**Predicted SQL query:** {predicted_sql}")
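For reference, a minimal sketch (not part of this commit) of calling the bot directly, without the Streamlit UI. It assumes the model weights, the `sic_ckpts` classifier checkpoint, and the bundled `databases/singer/singer.sqlite` are available locally; the question string is only an example.

# Hypothetical usage sketch: query the text-to-SQL bot from a plain Python shell.
from text2sql import ChatBot

bot = ChatBot()  # loads seeklhy/codes-7b-merged and the schema-item classifier
predicted_sql = bot.get_response("List the names of all singers from France.", "singer")
print(predicted_sql)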
build_contents_index.py
ADDED
@@ -0,0 +1,75 @@
from utils.db_utils import get_cursor_from_path, execute_sql_long_time_limitation
import json
import os, shutil

def remove_contents_of_a_folder(index_path):
    # if index_path does not exist, then create it
    os.makedirs(index_path, exist_ok = True)
    # remove files in index_path
    for filename in os.listdir(index_path):
        file_path = os.path.join(index_path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))

def build_content_index(db_path, index_path):
    '''
    Create a BM25 index for all contents in a database
    '''
    cursor = get_cursor_from_path(db_path)
    results = execute_sql_long_time_limitation(cursor, "SELECT name FROM sqlite_master WHERE type='table';")
    table_names = [result[0] for result in results]

    all_column_contents = []
    for table_name in table_names:
        # skip SQLite system table: sqlite_sequence
        if table_name == "sqlite_sequence":
            continue
        results = execute_sql_long_time_limitation(cursor, "SELECT name FROM PRAGMA_TABLE_INFO('{}')".format(table_name))
        column_names_in_one_table = [result[0] for result in results]
        for column_name in column_names_in_one_table:
            try:
                print("SELECT DISTINCT `{}` FROM `{}` WHERE `{}` IS NOT NULL;".format(column_name, table_name, column_name))
                results = execute_sql_long_time_limitation(cursor, "SELECT DISTINCT `{}` FROM `{}` WHERE `{}` IS NOT NULL;".format(column_name, table_name, column_name))
                column_contents = [str(result[0]).strip() for result in results]

                for c_id, column_content in enumerate(column_contents):
                    # remove empty and extremely-long contents
                    if len(column_content) != 0 and len(column_content) <= 25:
                        all_column_contents.append(
                            {
                                "id": "{}-**-{}-**-{}".format(table_name, column_name, c_id).lower(),
                                "contents": column_content
                            }
                        )
            except Exception as e:
                print(str(e))

    with open("./data/temp_db_index/contents.json", "w") as f:
        f.write(json.dumps(all_column_contents, indent = 2, ensure_ascii = True))

    # Building a BM25 Index (Direct Java Implementation), see https://github.com/castorini/pyserini/blob/master/docs/usage-index.md
    cmd = "python -m pyserini.index.lucene --collection JsonCollection --input ./data/temp_db_index --index {} --generator DefaultLuceneDocumentGenerator --threads 16 --storePositions --storeDocvectors --storeRaw".format(index_path)

    d = os.system(cmd)
    print(d)
    os.remove("./data/temp_db_index/contents.json")

if __name__ == "__main__":
    os.makedirs('./data/temp_db_index', exist_ok = True)

    print("build content index for databases...")
    remove_contents_of_a_folder("db_contents_index")
    # build content index for Bank_Financials's training set databases
    for db_id in os.listdir("databases"):
        print(db_id)
        build_content_index(
            os.path.join("databases", db_id, db_id + ".sqlite"),
            os.path.join("db_contents_index", db_id)
        )

    os.rmdir('./data/temp_db_index')
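As a rough illustration (not part of the commit), the resulting Lucene index could be queried with pyserini's BM25 searcher; the query string and `k` are arbitrary, and this assumes the `singer` index was built by the script above.

# Hypothetical sketch: BM25 lookup against the index built by build_contents_index.py.
from pyserini.search.lucene import LuceneSearcher

searcher = LuceneSearcher("db_contents_index/singer")
hits = searcher.search("France", k=5)
for hit in hits:
    # doc ids follow the "table-**-column-**-rowid" pattern used above
    print(hit.docid, hit.score)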
data/.DS_Store
ADDED
Binary file (6.15 kB)
data/history/.DS_Store
ADDED
Binary file (6.15 kB)
data/tables.json
ADDED
The diff for this file is too large to render.
databases/.DS_Store
ADDED
Binary file (6.15 kB)
databases/singer/schema.sql
ADDED
@@ -0,0 +1,39 @@
PRAGMA foreign_keys = ON;

CREATE TABLE "singer" (
    "Singer_ID" int,
    "Name" text,
    "Birth_Year" real,
    "Net_Worth_Millions" real,
    "Citizenship" text,
    PRIMARY KEY ("Singer_ID")
);

CREATE TABLE "song" (
    "Song_ID" int,
    "Title" text,
    "Singer_ID" int,
    "Sales" real,
    "Highest_Position" real,
    PRIMARY KEY ("Song_ID"),
    FOREIGN KEY ("Singer_ID") REFERENCES `singer`("Singer_ID")
);

INSERT INTO "singer" VALUES (1,"Liliane Bettencourt","1944","30.0","France");
INSERT INTO "singer" VALUES (2,"Christy Walton","1948","28.8","United States");
INSERT INTO "singer" VALUES (3,"Alice Walton","1949","26.3","United States");
INSERT INTO "singer" VALUES (4,"Iris Fontbona","1942","17.4","Chile");
INSERT INTO "singer" VALUES (5,"Jacqueline Mars","1940","17.8","United States");
INSERT INTO "singer" VALUES (6,"Gina Rinehart","1953","17","Australia");
INSERT INTO "singer" VALUES (7,"Susanne Klatten","1962","14.3","Germany");
INSERT INTO "singer" VALUES (8,"Abigail Johnson","1961","12.7","United States");

INSERT INTO "song" VALUES ("1","Do They Know It's Christmas",1,"1094000","1");
INSERT INTO "song" VALUES ("2","F**k It (I Don't Want You Back)",1,"552407","1");
INSERT INTO "song" VALUES ("3","Cha Cha Slide",2,"351421","1");
INSERT INTO "song" VALUES ("4","Call on Me",4,"335000","1");
INSERT INTO "song" VALUES ("5","Yeah",2,"300000","1");
INSERT INTO "song" VALUES ("6","All This Time",6,"292000","1");
INSERT INTO "song" VALUES ("7","Left Outside Alone",5,"275000","3");
INSERT INTO "song" VALUES ("8","Mysterious Girl",7,"261000","1");
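A small sanity-check sketch (not part of the commit): loading this dump into an in-memory SQLite database and joining the two tables; the file path assumes the repository root as the working directory.

# Hypothetical check: count songs per singer from the dump above.
import sqlite3

conn = sqlite3.connect(":memory:")
with open("databases/singer/schema.sql") as f:
    conn.executescript(f.read())

rows = conn.execute(
    'SELECT s."Name", COUNT(*) AS n_songs '
    'FROM "singer" s JOIN "song" so ON s."Singer_ID" = so."Singer_ID" '
    'GROUP BY s."Singer_ID" ORDER BY n_songs DESC'
).fetchall()
print(rows)  # e.g. [('Liliane Bettencourt', 2), ...]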
databases/singer/singer.sqlite
ADDED
Binary file (20.5 kB)
requirements.txt
ADDED
@@ -0,0 +1,18 @@
streamlit
text2sql
langdetect
faiss-cpu
func_timeout
nltk
numpy
pandas
rapidfuzz
tqdm
transformers
chardet
sqlparse
accelerate
bitsandbytes
sql_metadata
datasets
whoosh
schema_item_filter.py
ADDED
@@ -0,0 +1,343 @@
import numpy as np
import random
import torch

from tqdm import tqdm
from transformers import AutoTokenizer
from utils.classifier_model import SchemaItemClassifier
from transformers.trainer_utils import set_seed

def prepare_inputs_and_labels(sample, tokenizer):
    table_names = [table["table_name"] for table in sample["schema"]["schema_items"]]
    column_names = [table["column_names"] for table in sample["schema"]["schema_items"]]
    column_num_in_each_table = [len(table["column_names"]) for table in sample["schema"]["schema_items"]]

    # `column_name_word_indices` and `table_name_word_indices` record the word indices of each column and table in `input_words`, whose element is an integer
    column_name_word_indices, table_name_word_indices = [], []

    input_words = [sample["text"]]
    for table_id, table_name in enumerate(table_names):
        input_words.append("|")
        input_words.append(table_name)
        table_name_word_indices.append(len(input_words) - 1)
        input_words.append(":")

        for column_name in column_names[table_id]:
            input_words.append(column_name)
            column_name_word_indices.append(len(input_words) - 1)
            input_words.append(",")

    # remove the last ","
    input_words = input_words[:-1]

    tokenized_inputs = tokenizer(
        input_words,
        return_tensors="pt",
        is_split_into_words = True,
        padding = "max_length",
        max_length = 512,
        truncation = True
    )

    # after tokenizing, one table name or column name may be splitted into multiple tokens (i.e., sub-words)
    # `column_name_token_indices` and `table_name_token_indices` records the token indices of each column and table in `input_ids`, whose element is a list of integer
    column_name_token_indices, table_name_token_indices = [], []
    word_indices = tokenized_inputs.word_ids(batch_index = 0)

    # obtain token indices of each column in `input_ids`
    for column_name_word_index in column_name_word_indices:
        column_name_token_indices.append([token_id for token_id, word_index in enumerate(word_indices) if column_name_word_index == word_index])

    # obtain token indices of each table in `input_ids`
    for table_name_word_index in table_name_word_indices:
        table_name_token_indices.append([token_id for token_id, word_index in enumerate(word_indices) if table_name_word_index == word_index])

    encoder_input_ids = tokenized_inputs["input_ids"]
    encoder_input_attention_mask = tokenized_inputs["attention_mask"]

    # print("\n".join(tokenizer.batch_decode(encoder_input_ids, skip_special_tokens = True)))

    if torch.cuda.is_available():
        encoder_input_ids = encoder_input_ids.cuda()
        encoder_input_attention_mask = encoder_input_attention_mask.cuda()

    return encoder_input_ids, encoder_input_attention_mask, \
        column_name_token_indices, table_name_token_indices, column_num_in_each_table

def get_schema(tables_and_columns):
    schema_items = []
    table_names = list(dict.fromkeys([t for t, c in tables_and_columns]))
    for table_name in table_names:
        schema_items.append(
            {
                "table_name": table_name,
                "column_names": [c for t, c in tables_and_columns if t == table_name]
            }
        )

    return {"schema_items": schema_items}

def get_sequence_length(text, tables_and_columns, tokenizer):
    table_names = [t for t, c in tables_and_columns]
    # deduplicate `table_names` while preserving order
    table_names = list(dict.fromkeys(table_names))

    column_names = []
    for table_name in table_names:
        column_names.append([c for t, c in tables_and_columns if t == table_name])

    input_words = [text]
    for table_id, table_name in enumerate(table_names):
        input_words.append("|")
        input_words.append(table_name)
        input_words.append(":")
        for column_name in column_names[table_id]:
            input_words.append(column_name)
            input_words.append(",")
    # remove the last ","
    input_words = input_words[:-1]

    tokenized_inputs = tokenizer(input_words, is_split_into_words = True)

    return len(tokenized_inputs["input_ids"])

# handle extremely long schema sequences
def split_sample(sample, tokenizer):
    text = sample["text"]

    table_names = []
    column_names = []
    for table in sample["schema"]["schema_items"]:
        table_names.append(table["table_name"] + " ( " + table["table_comment"] + " ) " \
            if table["table_comment"] != "" else table["table_name"])
        column_names.append([column_name + " ( " + column_comment + " ) " \
            if column_comment != "" else column_name \
                for column_name, column_comment in zip(table["column_names"], table["column_comments"])])

    splitted_samples = []
    recorded_tables_and_columns = []

    for table_idx, table_name in enumerate(table_names):
        for column_name in column_names[table_idx]:
            if get_sequence_length(text, recorded_tables_and_columns + [[table_name, column_name]], tokenizer) < 500:
                recorded_tables_and_columns.append([table_name, column_name])
            else:
                splitted_samples.append(
                    {
                        "text": text,
                        "schema": get_schema(recorded_tables_and_columns)
                    }
                )
                recorded_tables_and_columns = [[table_name, column_name]]

    splitted_samples.append(
        {
            "text": text,
            "schema": get_schema(recorded_tables_and_columns)
        }
    )

    return splitted_samples

def merge_pred_results(sample, pred_results):
    # table_names = [table["table_name"] for table in sample["schema"]["schema_items"]]
    # column_names = [table["column_names"] for table in sample["schema"]["schema_items"]]
    table_names = []
    column_names = []
    for table in sample["schema"]["schema_items"]:
        table_names.append(table["table_name"] + " ( " + table["table_comment"] + " ) " \
            if table["table_comment"] != "" else table["table_name"])
        column_names.append([column_name + " ( " + column_comment + " ) " \
            if column_comment != "" else column_name \
                for column_name, column_comment in zip(table["column_names"], table["column_comments"])])

    merged_results = []
    for table_id, table_name in enumerate(table_names):
        table_prob = 0
        column_probs = []
        for result_dict in pred_results:
            if table_name in result_dict:
                if table_prob < result_dict[table_name]["table_prob"]:
                    table_prob = result_dict[table_name]["table_prob"]
                column_probs += result_dict[table_name]["column_probs"]

        merged_results.append(
            {
                "table_name": table_name,
                "table_prob": table_prob,
                "column_names": column_names[table_id],
                "column_probs": column_probs
            }
        )

    return merged_results

def filter_schema(data, sic, num_top_k_tables = 5, num_top_k_columns = 5):
    filtered_schema = dict()
    filtered_matched_contents = dict()
    filtered_schema["schema_items"] = []
    filtered_schema["foreign_keys"] = []

    table_names = [table["table_name"] for table in data["schema"]["schema_items"]]
    table_comments = [table["table_comment"] for table in data["schema"]["schema_items"]]
    column_names = [table["column_names"] for table in data["schema"]["schema_items"]]
    column_types = [table["column_types"] for table in data["schema"]["schema_items"]]
    column_comments = [table["column_comments"] for table in data["schema"]["schema_items"]]
    column_contents = [table["column_contents"] for table in data["schema"]["schema_items"]]
    pk_indicators = [table["pk_indicators"] for table in data["schema"]["schema_items"]]

    # predict scores for each table and column
    pred_results = sic.predict(data)
    # retain top_k1 tables for each database and top_k2 columns for each retained table
    table_probs = [pred_result["table_prob"] for pred_result in pred_results]
    table_indices = np.argsort(-np.array(table_probs), kind="stable")[:num_top_k_tables].tolist()

    for table_idx in table_indices:
        column_probs = pred_results[table_idx]["column_probs"]
        column_indices = np.argsort(-np.array(column_probs), kind="stable")[:num_top_k_columns].tolist()

        filtered_schema["schema_items"].append(
            {
                "table_name": table_names[table_idx],
                "table_comment": table_comments[table_idx],
                "column_names": [column_names[table_idx][column_idx] for column_idx in column_indices],
                "column_types": [column_types[table_idx][column_idx] for column_idx in column_indices],
                "column_comments": [column_comments[table_idx][column_idx] for column_idx in column_indices],
                "column_contents": [column_contents[table_idx][column_idx] for column_idx in column_indices],
                "pk_indicators": [pk_indicators[table_idx][column_idx] for column_idx in column_indices]
            }
        )

        # extract matched contents of retained columns
        for column_name in [column_names[table_idx][column_idx] for column_idx in column_indices]:
            tc_name = "{}.{}".format(table_names[table_idx], column_name)
            if tc_name in data["matched_contents"]:
                filtered_matched_contents[tc_name] = data["matched_contents"][tc_name]

    # extract foreign keys among retained tables
    filtered_table_names = [table_names[table_idx] for table_idx in table_indices]
    for foreign_key in data["schema"]["foreign_keys"]:
        source_table, source_column, target_table, target_column = foreign_key
        if source_table in filtered_table_names and target_table in filtered_table_names:
            filtered_schema["foreign_keys"].append(foreign_key)

    # replace the old schema with the filtered schema
    data["schema"] = filtered_schema
    # replace the old matched contents with the filtered matched contents
    data["matched_contents"] = filtered_matched_contents

    return data

def lista_contains_listb(lista, listb):
    for b in listb:
        if b not in lista:
            return 0

    return 1

class SchemaItemClassifierInference():
    def __init__(self, model_save_path):
        set_seed(42)
        # load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_save_path, add_prefix_space = True)
        # initialize model
        self.model = SchemaItemClassifier(model_save_path, "test")
        # load fine-tuned params
        self.model.load_state_dict(torch.load(model_save_path + "/dense_classifier.pt", map_location=torch.device('cpu')), strict=False)
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        self.model.eval()

    def predict_one(self, sample):
        encoder_input_ids, encoder_input_attention_mask, column_name_token_indices,\
            table_name_token_indices, column_num_in_each_table = prepare_inputs_and_labels(sample, self.tokenizer)

        with torch.no_grad():
            model_outputs = self.model(
                encoder_input_ids,
                encoder_input_attention_mask,
                [column_name_token_indices],
                [table_name_token_indices],
                [column_num_in_each_table]
            )

        table_logits = model_outputs["batch_table_name_cls_logits"][0]
        table_pred_probs = torch.nn.functional.softmax(table_logits, dim = 1)[:, 1].cpu().tolist()

        column_logits = model_outputs["batch_column_info_cls_logits"][0]
        column_pred_probs = torch.nn.functional.softmax(column_logits, dim = 1)[:, 1].cpu().tolist()

        splitted_column_pred_probs = []
        # split predicted column probs into each table
        for table_id, column_num in enumerate(column_num_in_each_table):
            splitted_column_pred_probs.append(column_pred_probs[sum(column_num_in_each_table[:table_id]): sum(column_num_in_each_table[:table_id]) + column_num])
        column_pred_probs = splitted_column_pred_probs

        result_dict = dict()
        for table_idx, table in enumerate(sample["schema"]["schema_items"]):
            result_dict[table["table_name"]] = {
                "table_name": table["table_name"],
                "table_prob": table_pred_probs[table_idx],
                "column_names": table["column_names"],
                "column_probs": column_pred_probs[table_idx],
            }

        return result_dict

    def predict(self, test_sample):
        splitted_samples = split_sample(test_sample, self.tokenizer)
        pred_results = []
        for splitted_sample in splitted_samples:
            pred_results.append(self.predict_one(splitted_sample))

        return merge_pred_results(test_sample, pred_results)

    def evaluate_coverage(self, dataset):
        max_k = 100
        total_num_for_table_coverage, total_num_for_column_coverage = 0, 0
        table_coverage_results = [0]*max_k
        column_coverage_results = [0]*max_k

        for data in dataset:
            indices_of_used_tables = [idx for idx, label in enumerate(data["table_labels"]) if label == 1]
            pred_results = sic.predict(data)
            # print(pred_results)
            table_probs = [res["table_prob"] for res in pred_results]
            for k in range(max_k):
                indices_of_top_k_tables = np.argsort(-np.array(table_probs), kind="stable")[:k+1].tolist()
                if lista_contains_listb(indices_of_top_k_tables, indices_of_used_tables):
                    table_coverage_results[k] += 1
            total_num_for_table_coverage += 1

            for table_idx in range(len(data["table_labels"])):
                indices_of_used_columns = [idx for idx, label in enumerate(data["column_labels"][table_idx]) if label == 1]
                if len(indices_of_used_columns) == 0:
                    continue
                column_probs = pred_results[table_idx]["column_probs"]
                for k in range(max_k):
                    indices_of_top_k_columns = np.argsort(-np.array(column_probs), kind="stable")[:k+1].tolist()
                    if lista_contains_listb(indices_of_top_k_columns, indices_of_used_columns):
                        column_coverage_results[k] += 1

                total_num_for_column_coverage += 1

                indices_of_top_10_columns = np.argsort(-np.array(column_probs), kind="stable")[:10].tolist()
                if lista_contains_listb(indices_of_top_10_columns, indices_of_used_columns) == 0:
                    print(pred_results[table_idx])
                    print(data["column_labels"][table_idx])
                    print(data["question"])

        print(total_num_for_table_coverage)
        print(table_coverage_results)
        print(total_num_for_column_coverage)
        print(column_coverage_results)

if __name__ == "__main__":
    dataset_name = "bird_with_evidence"
    # dataset_name = "bird"
    # dataset_name = "spider"
    sic = SchemaItemClassifierInference("sic_ckpts/sic_{}".format(dataset_name))
    import json
    dataset = json.load(open("./data/sft_eval_{}_text2sql.json".format(dataset_name)))

    sic.evaluate_coverage(dataset)
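For orientation, a sketch (inferred from the field accesses above, not defined anywhere in the commit) of the sample dictionary that `filter_schema` and `SchemaItemClassifierInference.predict` expect; the concrete values are illustrative only.

# Hypothetical input shape for filter_schema(data, sic, ...).
data = {
    "text": "List the names of all singers from France.",
    "schema": {
        "schema_items": [
            {
                "table_name": "singer",
                "table_comment": "",
                "column_names": ["singer_id", "name", "birth_year", "net_worth_millions", "citizenship"],
                "column_types": ["int", "text", "real", "real", "text"],
                "column_comments": ["", "", "", "", ""],
                "column_contents": [[], ["Liliane Bettencourt"], [], [], ["France"]],
                "pk_indicators": [1, 0, 0, 0, 0],
            }
        ],
        # each foreign key is [source_table, source_column, target_table, target_column]
        "foreign_keys": [],
    },
    "matched_contents": {"singer.citizenship": ["France"]},
}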
text2sql.py
ADDED
@@ -0,0 +1,182 @@
import os
import json
import torch
import copy
import re
import sqlparse
import sqlite3

from tqdm import tqdm
from utils.db_utils import get_db_schema
from transformers import AutoModelForCausalLM, AutoTokenizer
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from utils.db_utils import check_sql_executability, get_matched_contents, get_db_schema_sequence, get_matched_content_sequence
from schema_item_filter import SchemaItemClassifierInference, filter_schema

def remove_similar_comments(names, comments):
    '''
    Remove table (or column) comments that have a high degree of similarity with their names
    '''
    new_comments = []
    for name, comment in zip(names, comments):
        if name.replace("_", "").replace(" ", "") == comment.replace("_", "").replace(" ", ""):
            new_comments.append("")
        else:
            new_comments.append(comment)

    return new_comments

def load_db_comments(table_json_path):
    additional_db_info = json.load(open(table_json_path))
    db_comments = dict()
    for db_info in additional_db_info:
        comment_dict = dict()

        column_names = [column_name.lower() for _, column_name in db_info["column_names_original"]]
        table_idx_of_each_column = [t_idx for t_idx, _ in db_info["column_names_original"]]
        column_comments = [column_comment.lower() for _, column_comment in db_info["column_names"]]

        assert len(column_names) == len(column_comments)
        column_comments = remove_similar_comments(column_names, column_comments)

        table_names = [table_name.lower() for table_name in db_info["table_names_original"]]
        table_comments = [table_comment.lower() for table_comment in db_info["table_names"]]

        assert len(table_names) == len(table_comments)
        table_comments = remove_similar_comments(table_names, table_comments)

        for table_idx, (table_name, table_comment) in enumerate(zip(table_names, table_comments)):
            comment_dict[table_name] = {
                "table_comment": table_comment,
                "column_comments": dict()
            }
            for t_idx, column_name, column_comment in zip(table_idx_of_each_column, column_names, column_comments):
                if t_idx == table_idx:
                    comment_dict[table_name]["column_comments"][column_name] = column_comment

        db_comments[db_info["db_id"]] = comment_dict

    return db_comments

def get_db_id2schema(db_path, tables_json):
    db_comments = load_db_comments(tables_json)
    db_id2schema = dict()

    for db_id in tqdm(os.listdir(db_path)):
        db_id2schema[db_id] = get_db_schema(os.path.join(db_path, db_id, db_id + ".sqlite"), db_comments, db_id)

    return db_id2schema

def get_db_id2ddl(db_path):
    db_ids = os.listdir(db_path)
    db_id2ddl = dict()

    for db_id in db_ids:
        conn = sqlite3.connect(os.path.join(db_path, db_id, db_id + ".sqlite"))
        cursor = conn.cursor()
        cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
        ddl = []

        for table in tables:
            table_name = table[0]
            table_ddl = table[1]
            table_ddl.replace("\t", " ")
            while "  " in table_ddl:
                table_ddl = table_ddl.replace("  ", " ")

            table_ddl = re.sub(r'--.*', '', table_ddl)
            table_ddl = sqlparse.format(table_ddl, keyword_case = "upper", identifier_case = "lower", reindent_aligned = True)
            table_ddl = table_ddl.replace(", ", ",\n    ")

            if table_ddl.endswith(";"):
                table_ddl = table_ddl[:-1]
            table_ddl = table_ddl[:-1] + "\n);"
            table_ddl = re.sub(r"(CREATE TABLE.*?)\(", r"\1(\n    ", table_ddl)

            ddl.append(table_ddl)
        db_id2ddl[db_id] = "\n\n".join(ddl)

    return db_id2ddl

class ChatBot():
    def __init__(self) -> None:
        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
        model_name = "seeklhy/codes-7b-merged"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name, device_map = "auto", torch_dtype = torch.float16)
        self.max_length = 4096
        self.max_new_tokens = 256
        self.max_prefix_length = self.max_length - self.max_new_tokens

        self.sic = SchemaItemClassifierInference("sic_ckpts/sic_bird")

        self.db_id2content_searcher = dict()
        for db_id in os.listdir("db_contents_index"):
            schema = Schema(content=TEXT(stored=True))
            index_dir = os.path.join("db_contents_index", db_id)
            if not os.path.exists(index_dir):
                os.makedirs(index_dir)
            ix = create_in(index_dir, schema)
            writer = ix.writer()
            with open(os.path.join(index_dir, f"{db_id}.json"), "r") as file:
                data = json.load(file)
                for item in data:
                    writer.add_document(content=item['content'])
            writer.commit()
            self.db_id2content_searcher[db_id] = ix

        self.db_ids = sorted(os.listdir("databases"))
        self.db_id2schema = get_db_id2schema("databases", "data/tables.json")
        self.db_id2ddl = get_db_id2ddl("databases")

    def get_response(self, question, db_id):
        data = {
            "text": question,
            "schema": copy.deepcopy(self.db_id2schema[db_id]),
            "matched_contents": get_matched_contents(question, self.db_id2content_searcher[db_id])
        }
        data = filter_schema(data, self.sic, 6, 10)
        data["schema_sequence"] = get_db_schema_sequence(data["schema"])
        data["content_sequence"] = get_matched_content_sequence(data["matched_contents"])

        prefix_seq = data["schema_sequence"] + "\n" + data["content_sequence"] + "\n" + data["text"] + "\n"
        print(prefix_seq)

        input_ids = [self.tokenizer.bos_token_id] + self.tokenizer(prefix_seq , truncation = False)["input_ids"]
        if len(input_ids) > self.max_prefix_length:
            print("the current input sequence exceeds the max_tokens, we will truncate it.")
            input_ids = [self.tokenizer.bos_token_id] + input_ids[-(self.max_prefix_length-1):]
        attention_mask = [1] * len(input_ids)

        inputs = {
            "input_ids": torch.tensor([input_ids], dtype = torch.int64).to(self.model.device),
            "attention_mask": torch.tensor([attention_mask], dtype = torch.int64).to(self.model.device)
        }
        input_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            generate_ids = self.model.generate(
                **inputs,
                max_new_tokens = self.max_new_tokens,
                num_beams = 4,
                num_return_sequences = 4
            )

        generated_sqls = self.tokenizer.batch_decode(generate_ids[:, input_length:], skip_special_tokens = True, clean_up_tokenization_spaces = False)
        final_generated_sql = None
        for generated_sql in generated_sqls:
            execution_error = check_sql_executability(generated_sql, os.path.join("databases", db_id, db_id + ".sqlite"))
            if execution_error is None:
                final_generated_sql = generated_sql
                break

        if final_generated_sql is None:
            if generated_sqls[0].strip() != "":
                final_generated_sql = generated_sqls[0].strip()
            else:
                final_generated_sql = "Sorry, I can not generate a suitable SQL query for your question."

        return final_generated_sql.replace("\n", " ")
utils/bridge_content_encoder.py
ADDED
@@ -0,0 +1,261 @@
"""
Copyright (c) 2020, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

Encode DB content.
"""

import difflib
from typing import List, Optional, Tuple
from rapidfuzz import fuzz
import sqlite3
import functools

# fmt: off
_stopwords = {'who', 'ourselves', 'down', 'only', 'were', 'him', 'at', "weren't", 'has', 'few', "it's", 'm', 'again',
              'd', 'haven', 'been', 'other', 'we', 'an', 'own', 'doing', 'ma', 'hers', 'all', "haven't", 'in', 'but',
              "shouldn't", 'does', 'out', 'aren', 'you', "you'd", 'himself', "isn't", 'most', 'y', 'below', 'is',
              "wasn't", 'hasn', 'them', 'wouldn', 'against', 'this', 'about', 'there', 'don', "that'll", 'a', 'being',
              'with', 'your', 'theirs', 'its', 'any', 'why', 'now', 'during', 'weren', 'if', 'should', 'those', 'be',
              'they', 'o', 't', 'of', 'or', 'me', 'i', 'some', 'her', 'do', 'will', 'yours', 'for', 'mightn', 'nor',
              'needn', 'the', 'until', "couldn't", 'he', 'which', 'yourself', 'to', "needn't", "you're", 'because',
              'their', 'where', 'it', "didn't", 've', 'whom', "should've", 'can', "shan't", 'on', 'had', 'have',
              'myself', 'am', "don't", 'under', 'was', "won't", 'these', 'so', 'as', 'after', 'above', 'each', 'ours',
              'hadn', 'having', 'wasn', 's', 'doesn', "hadn't", 'than', 'by', 'that', 'both', 'herself', 'his',
              "wouldn't", 'into', "doesn't", 'before', 'my', 'won', 'more', 'are', 'through', 'same', 'how', 'what',
              'over', 'll', 'yourselves', 'up', 'mustn', "mustn't", "she's", 're', 'such', 'didn', "you'll", 'shan',
              'when', "you've", 'themselves', "mightn't", 'she', 'from', 'isn', 'ain', 'between', 'once', 'here',
              'shouldn', 'our', 'and', 'not', 'too', 'very', 'further', 'while', 'off', 'couldn', "hasn't", 'itself',
              'then', 'did', 'just', "aren't"}
# fmt: on

_commonwords = {"no", "yes", "many"}


def is_number(s: str) -> bool:
    try:
        float(s.replace(",", ""))
        return True
    except:
        return False


def is_stopword(s: str) -> bool:
    return s.strip() in _stopwords


def is_commonword(s: str) -> bool:
    return s.strip() in _commonwords


def is_common_db_term(s: str) -> bool:
    return s.strip() in ["id"]


class Match(object):
    def __init__(self, start: int, size: int) -> None:
        self.start = start
        self.size = size


def is_span_separator(c: str) -> bool:
    return c in "'\"()`,.?! "


def split(s: str) -> List[str]:
    return [c.lower() for c in s.strip()]


def prefix_match(s1: str, s2: str) -> bool:
    i, j = 0, 0
    for i in range(len(s1)):
        if not is_span_separator(s1[i]):
            break
    for j in range(len(s2)):
        if not is_span_separator(s2[j]):
            break
    if i < len(s1) and j < len(s2):
        return s1[i] == s2[j]
    elif i >= len(s1) and j >= len(s2):
        return True
    else:
        return False


def get_effective_match_source(s: str, start: int, end: int) -> Match:
    _start = -1

    for i in range(start, start - 2, -1):
        if i < 0:
            _start = i + 1
            break
        if is_span_separator(s[i]):
            _start = i
            break

    if _start < 0:
        return None

    _end = -1
    for i in range(end - 1, end + 3):
        if i >= len(s):
            _end = i - 1
            break
        if is_span_separator(s[i]):
            _end = i
            break

    if _end < 0:
        return None

    while _start < len(s) and is_span_separator(s[_start]):
        _start += 1
    while _end >= 0 and is_span_separator(s[_end]):
        _end -= 1

    return Match(_start, _end - _start + 1)


def get_matched_entries(
    s: str, field_values: List[str], m_theta: float = 0.85, s_theta: float = 0.85
) -> Optional[List[Tuple[str, Tuple[str, str, float, float, int]]]]:
    if not field_values:
        return None

    if isinstance(s, str):
        n_grams = split(s)
    else:
        n_grams = s

    matched = dict()
    for field_value in field_values:
        if not isinstance(field_value, str):
            continue
        fv_tokens = split(field_value)
        sm = difflib.SequenceMatcher(None, n_grams, fv_tokens)
        match = sm.find_longest_match(0, len(n_grams), 0, len(fv_tokens))
        if match.size > 0:
            source_match = get_effective_match_source(
                n_grams, match.a, match.a + match.size
            )
            if source_match:  # and source_match.size > 1
                match_str = field_value[match.b : match.b + match.size]
                source_match_str = s[
                    source_match.start : source_match.start + source_match.size
                ]
                c_match_str = match_str.lower().strip()
                c_source_match_str = source_match_str.lower().strip()
                c_field_value = field_value.lower().strip()
                if c_match_str and not is_common_db_term(c_match_str):  # and not is_number(c_match_str)
                    if (
                        is_stopword(c_match_str)
                        or is_stopword(c_source_match_str)
                        or is_stopword(c_field_value)
                    ):
                        continue
                    if c_source_match_str.endswith(c_match_str + "'s"):
                        match_score = 1.0
                    else:
                        if prefix_match(c_field_value, c_source_match_str):
                            match_score = fuzz.ratio(c_field_value, c_source_match_str) / 100
                        else:
                            match_score = 0
                    if (
                        is_commonword(c_match_str)
                        or is_commonword(c_source_match_str)
                        or is_commonword(c_field_value)
                    ) and match_score < 1:
                        continue
                    s_match_score = match_score
                    if match_score >= m_theta and s_match_score >= s_theta:
                        if field_value.isupper() and match_score * s_match_score < 1:
                            continue
                        matched[match_str] = (
                            field_value,
                            source_match_str,
                            match_score,
                            s_match_score,
                            match.size,
                        )

    if not matched:
        return None
    else:
        return sorted(
            matched.items(),
            key=lambda x: (1e16 * x[1][2] + 1e8 * x[1][3] + x[1][4]),
            reverse=True,
        )


@functools.lru_cache(maxsize=1000, typed=False)
def get_column_picklist(table_name: str, column_name: str, db_path: str) -> list:
    fetch_sql = "SELECT DISTINCT `{}` FROM `{}`".format(column_name, table_name)
    try:
        conn = sqlite3.connect(db_path)
        conn.text_factory = bytes
        c = conn.cursor()
        c.execute(fetch_sql)
        picklist = set()
        for x in c.fetchall():
            if isinstance(x[0], str):
                picklist.add(x[0].encode("utf-8"))
            elif isinstance(x[0], bytes):
                try:
                    picklist.add(x[0].decode("utf-8"))
                except UnicodeDecodeError:
                    picklist.add(x[0].decode("latin-1"))
            else:
                picklist.add(x[0])
        picklist = list(picklist)
    except Exception as e:
        picklist = []
    finally:
        conn.close()
    return picklist


def get_database_matches(
    question: str,
    table_name: str,
    column_name: str,
    db_path: str,
    top_k_matches: int = 2,
    match_threshold: float = 0.85,
) -> List[str]:
    picklist = get_column_picklist(
        table_name=table_name, column_name=column_name, db_path=db_path
    )
    # only maintain data in ``str'' type
    picklist = [ele.strip() for ele in picklist if isinstance(ele, str)]
    # picklist is unordered, we sort it to ensure the reproduction stability
    picklist = sorted(picklist)

    matches = []
    if picklist and isinstance(picklist[0], str):
        matched_entries = get_matched_entries(
            s=question,
            field_values=picklist,
            m_theta=match_threshold,
            s_theta=match_threshold,
        )

        if matched_entries:
            num_values_inserted = 0
            for _match_str, (
                field_value,
                _s_match_str,
                match_score,
                s_match_score,
                _match_size,
            ) in matched_entries:
                if "name" in column_name and match_score * s_match_score < 1:
                    continue
                if table_name != "sqlite_sequence":  # Spider database artifact
                    matches.append(field_value.strip())
                    num_values_inserted += 1
                    if num_values_inserted >= top_k_matches:
                        break
    return matches
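A minimal usage sketch (not part of the commit), matching a question against the stored values of one column of the bundled singer database; the question text is only an example and the path assumes the repository root as the working directory.

# Hypothetical call: fuzzy-match question tokens against the singer.Citizenship column.
from utils.bridge_content_encoder import get_database_matches

matches = get_database_matches(
    question="How many singers hold France citizenship?",
    table_name="singer",
    column_name="Citizenship",
    db_path="databases/singer/singer.sqlite",
)
print(matches)  # e.g. ["France"] when the fuzzy score clears the 0.85 threshold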
utils/classifier_model.py
ADDED
@@ -0,0 +1,186 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
|
4 |
+
from transformers import AutoConfig, RobertaModel
|
5 |
+
|
6 |
+
class SchemaItemClassifier(nn.Module):
|
7 |
+
def __init__(self, model_name_or_path, mode):
|
8 |
+
super(SchemaItemClassifier, self).__init__()
|
9 |
+
if mode in ["eval", "test"]:
|
10 |
+
# load config
|
11 |
+
config = AutoConfig.from_pretrained(model_name_or_path)
|
12 |
+
# randomly initialize model's parameters according to the config
|
13 |
+
self.plm_encoder = RobertaModel(config)
|
14 |
+
elif mode == "train":
|
15 |
+
self.plm_encoder = RobertaModel.from_pretrained(model_name_or_path)
|
16 |
+
else:
|
17 |
+
raise ValueError()
|
18 |
+
|
19 |
+
self.plm_hidden_size = self.plm_encoder.config.hidden_size
|
20 |
+
|
21 |
+
# column cls head
|
22 |
+
self.column_info_cls_head_linear1 = nn.Linear(self.plm_hidden_size, 256)
|
23 |
+
self.column_info_cls_head_linear2 = nn.Linear(256, 2)
|
24 |
+
|
25 |
+
# column bi-lstm layer
|
26 |
+
self.column_info_bilstm = nn.LSTM(
|
27 |
+
input_size = self.plm_hidden_size,
|
28 |
+
hidden_size = int(self.plm_hidden_size/2),
|
29 |
+
num_layers = 2,
|
30 |
+
dropout = 0,
|
31 |
+
bidirectional = True
|
32 |
+
)
|
33 |
+
|
34 |
+
# linear layer after column bi-lstm layer
|
35 |
+
self.column_info_linear_after_pooling = nn.Linear(self.plm_hidden_size, self.plm_hidden_size)
|
36 |
+
|
37 |
+
# table cls head
|
38 |
+
self.table_name_cls_head_linear1 = nn.Linear(self.plm_hidden_size, 256)
|
39 |
+
self.table_name_cls_head_linear2 = nn.Linear(256, 2)
|
40 |
+
|
41 |
+
# table bi-lstm pooling layer
|
42 |
+
self.table_name_bilstm = nn.LSTM(
|
43 |
+
input_size = self.plm_hidden_size,
|
44 |
+
hidden_size = int(self.plm_hidden_size/2),
|
45 |
+
num_layers = 2,
|
46 |
+
dropout = 0,
|
47 |
+
bidirectional = True
|
48 |
+
)
|
49 |
+
# linear layer after table bi-lstm layer
|
50 |
+
self.table_name_linear_after_pooling = nn.Linear(self.plm_hidden_size, self.plm_hidden_size)
|
51 |
+
|
52 |
+
# activation function
|
53 |
+
self.leakyrelu = nn.LeakyReLU()
|
54 |
+
self.tanh = nn.Tanh()
|
55 |
+
|
56 |
+
# table-column cross-attention layer
|
57 |
+
self.table_column_cross_attention_layer = nn.MultiheadAttention(embed_dim = self.plm_hidden_size, num_heads = 8)
|
58 |
+
|
59 |
+
# dropout function, p=0.2 means randomly set 20% neurons to 0
|
60 |
+
self.dropout = nn.Dropout(p = 0.2)
|
61 |
+
|
62 |
+
def table_column_cross_attention(
|
63 |
+
self,
|
64 |
+
table_name_embeddings_in_one_db,
|
65 |
+
column_info_embeddings_in_one_db,
|
66 |
+
column_number_in_each_table
|
67 |
+
):
|
68 |
+
table_num = table_name_embeddings_in_one_db.shape[0]
|
69 |
+
table_name_embedding_attn_list = []
|
70 |
+
for table_id in range(table_num):
|
71 |
+
table_name_embedding = table_name_embeddings_in_one_db[[table_id], :]
|
72 |
+
column_info_embeddings_in_one_table = column_info_embeddings_in_one_db[
|
73 |
+
sum(column_number_in_each_table[:table_id]) : sum(column_number_in_each_table[:table_id+1]), :]
|
74 |
+
|
75 |
+
table_name_embedding_attn, _ = self.table_column_cross_attention_layer(
|
76 |
+
table_name_embedding,
|
77 |
                column_info_embeddings_in_one_table,
                column_info_embeddings_in_one_table
            )

            table_name_embedding_attn_list.append(table_name_embedding_attn)

        # residual connection
        table_name_embeddings_in_one_db = table_name_embeddings_in_one_db + torch.cat(table_name_embedding_attn_list, dim = 0)
        # row-wise L2 norm
        table_name_embeddings_in_one_db = torch.nn.functional.normalize(table_name_embeddings_in_one_db, p=2.0, dim=1)

        return table_name_embeddings_in_one_db

    def table_column_cls(
        self,
        encoder_input_ids,
        encoder_input_attention_mask,
        batch_aligned_column_info_ids,
        batch_aligned_table_name_ids,
        batch_column_number_in_each_table
    ):
        batch_size = encoder_input_ids.shape[0]

        encoder_output = self.plm_encoder(
            input_ids = encoder_input_ids,
            attention_mask = encoder_input_attention_mask,
            return_dict = True
        )  # encoder_output["last_hidden_state"].shape = (batch_size x seq_length x hidden_size)

        batch_table_name_cls_logits, batch_column_info_cls_logits = [], []

        # handle each data in current batch
        for batch_id in range(batch_size):
            column_number_in_each_table = batch_column_number_in_each_table[batch_id]
            sequence_embeddings = encoder_output["last_hidden_state"][batch_id, :, :]  # (seq_length x hidden_size)

            # obtain table ids for each table
            aligned_table_name_ids = batch_aligned_table_name_ids[batch_id]
            # obtain column ids for each column
            aligned_column_info_ids = batch_aligned_column_info_ids[batch_id]

            table_name_embedding_list, column_info_embedding_list = [], []

            # obtain table embedding via bi-lstm pooling + a non-linear layer
            for table_name_ids in aligned_table_name_ids:
                table_name_embeddings = sequence_embeddings[table_name_ids, :]

                # BiLSTM pooling
                output_t, (hidden_state_t, cell_state_t) = self.table_name_bilstm(table_name_embeddings)
                table_name_embedding = hidden_state_t[-2:, :].view(1, self.plm_hidden_size)
                table_name_embedding_list.append(table_name_embedding)
            table_name_embeddings_in_one_db = torch.cat(table_name_embedding_list, dim = 0)
            # non-linear mlp layer
            table_name_embeddings_in_one_db = self.leakyrelu(self.table_name_linear_after_pooling(table_name_embeddings_in_one_db))

            # obtain column embedding via bi-lstm pooling + a non-linear layer
            for column_info_ids in aligned_column_info_ids:
                column_info_embeddings = sequence_embeddings[column_info_ids, :]

                # BiLSTM pooling
                output_c, (hidden_state_c, cell_state_c) = self.column_info_bilstm(column_info_embeddings)
                column_info_embedding = hidden_state_c[-2:, :].view(1, self.plm_hidden_size)
                column_info_embedding_list.append(column_info_embedding)
            column_info_embeddings_in_one_db = torch.cat(column_info_embedding_list, dim = 0)
            # non-linear mlp layer
            column_info_embeddings_in_one_db = self.leakyrelu(self.column_info_linear_after_pooling(column_info_embeddings_in_one_db))

            # table-column (tc) cross-attention
            table_name_embeddings_in_one_db = self.table_column_cross_attention(
                table_name_embeddings_in_one_db,
                column_info_embeddings_in_one_db,
                column_number_in_each_table
            )

            # calculate table 0-1 logits
            table_name_embeddings_in_one_db = self.table_name_cls_head_linear1(table_name_embeddings_in_one_db)
            table_name_embeddings_in_one_db = self.dropout(self.leakyrelu(table_name_embeddings_in_one_db))
            table_name_cls_logits = self.table_name_cls_head_linear2(table_name_embeddings_in_one_db)

            # calculate column 0-1 logits
            column_info_embeddings_in_one_db = self.column_info_cls_head_linear1(column_info_embeddings_in_one_db)
            column_info_embeddings_in_one_db = self.dropout(self.leakyrelu(column_info_embeddings_in_one_db))
            column_info_cls_logits = self.column_info_cls_head_linear2(column_info_embeddings_in_one_db)

            batch_table_name_cls_logits.append(table_name_cls_logits)
            batch_column_info_cls_logits.append(column_info_cls_logits)

        return batch_table_name_cls_logits, batch_column_info_cls_logits

    def forward(
        self,
        encoder_input_ids,
        encoder_attention_mask,
        batch_aligned_column_info_ids,
        batch_aligned_table_name_ids,
        batch_column_number_in_each_table,
    ):
        batch_table_name_cls_logits, batch_column_info_cls_logits \
            = self.table_column_cls(
                encoder_input_ids,
                encoder_attention_mask,
                batch_aligned_column_info_ids,
                batch_aligned_table_name_ids,
                batch_column_number_in_each_table
            )

        return {
            "batch_table_name_cls_logits": batch_table_name_cls_logits,
            "batch_column_info_cls_logits": batch_column_info_cls_logits
        }
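A rough, non-authoritative sketch of how the returned 0-1 logits could be turned into per-table and per-column relevance probabilities; this helper is not part of the commit, and it assumes each per-example logits tensor has two columns (irrelevant / relevant), as the "0-1 logits" comments above suggest.

import torch

def cls_logits_to_probabilities(batch_cls_logits):
    # Sketch only: softmax over the two classes, keep the probability of the
    # "relevant" class (index 1) for every table / column in each example.
    return [torch.softmax(logits, dim=-1)[:, 1].tolist() for logits in batch_cls_logits]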
utils/db_utils.py
ADDED
@@ -0,0 +1,255 @@
import os
import json
import sqlite3

from func_timeout import func_set_timeout, FunctionTimedOut
from utils.bridge_content_encoder import get_matched_entries
from nltk.tokenize import word_tokenize
from nltk import ngrams

def add_a_record(question, db_id):
    conn = sqlite3.connect('data/history/history.sqlite')
    cursor = conn.cursor()
    cursor.execute("INSERT INTO record (question, db_id) VALUES (?, ?)", (question, db_id))

    conn.commit()
    conn.close()

def obtain_n_grams(sequence, max_n):
    tokens = word_tokenize(sequence)
    all_grams = []
    for n in range(1, max_n + 1):
        all_grams.extend([" ".join(gram) for gram in ngrams(tokens, n)])

    return all_grams

+
# get the database cursor for a sqlite database path
|
27 |
+
def get_cursor_from_path(sqlite_path):
|
28 |
+
try:
|
29 |
+
if not os.path.exists(sqlite_path):
|
30 |
+
print("Openning a new connection %s" % sqlite_path)
|
31 |
+
connection = sqlite3.connect(sqlite_path, check_same_thread = False)
|
32 |
+
except Exception as e:
|
33 |
+
print(sqlite_path)
|
34 |
+
raise e
|
35 |
+
connection.text_factory = lambda b: b.decode(errors="ignore")
|
36 |
+
cursor = connection.cursor()
|
37 |
+
return cursor
|
38 |
+
|
39 |
+
# execute predicted sql with a time limitation
|
40 |
+
@func_set_timeout(15)
|
41 |
+
def execute_sql(cursor, sql):
|
42 |
+
cursor.execute(sql)
|
43 |
+
|
44 |
+
return cursor.fetchall()
|
45 |
+
|
46 |
+
# execute predicted sql with a long time limitation (for buiding content index)
|
47 |
+
@func_set_timeout(2000)
|
48 |
+
def execute_sql_long_time_limitation(cursor, sql):
|
49 |
+
cursor.execute(sql)
|
50 |
+
|
51 |
+
return cursor.fetchall()
|
52 |
+
|
53 |
+
def check_sql_executability(generated_sql, db):
|
54 |
+
if generated_sql.strip() == "":
|
55 |
+
return "Error: empty string"
|
56 |
+
try:
|
57 |
+
cursor = get_cursor_from_path(db)
|
58 |
+
execute_sql(cursor, generated_sql)
|
59 |
+
execution_error = None
|
60 |
+
except FunctionTimedOut as fto:
|
61 |
+
print("SQL execution time out error: {}.".format(fto))
|
62 |
+
execution_error = "SQL execution times out."
|
63 |
+
except Exception as e:
|
64 |
+
print("SQL execution runtime error: {}.".format(e))
|
65 |
+
execution_error = str(e)
|
66 |
+
|
67 |
+
return execution_error
|
68 |
+
|
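A small usage sketch for check_sql_executability: a None return means the query ran within the 15-second budget, otherwise the error text comes back. The database path refers to the file bundled with this Space; the table name is an assumption about that database's contents.

error = check_sql_executability("SELECT count(*) FROM singer;", "databases/singer/singer.sqlite")
print("executable" if error is None else "failed: " + error)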
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def detect_special_char(name):
    for special_char in ['(', '-', ')', ' ', '/']:
        if special_char in name:
            return True

    return False

def add_quotation_mark(s):
    return "`" + s + "`"

def get_column_contents(column_name, table_name, cursor):
    select_column_sql = "SELECT DISTINCT `{}` FROM `{}` WHERE `{}` IS NOT NULL LIMIT 2;".format(column_name, table_name, column_name)
    results = execute_sql_long_time_limitation(cursor, select_column_sql)
    column_contents = [str(result[0]).strip() for result in results]
    # remove empty and extremely-long contents
    column_contents = [content for content in column_contents if len(content) != 0 and len(content) <= 25]

    return column_contents

def get_matched_contents(question, searcher):
    # coarse-grained matching between the input text and all contents in database
    grams = obtain_n_grams(question, 4)
    hits = []
    for query in grams:
        hits.extend(searcher.search(query, k = 10))

    coarse_matched_contents = dict()
    for i in range(len(hits)):
        matched_result = json.loads(hits[i].raw)
        # `tc_name` refers to column names like `table_name.column_name`, e.g., document_drafts.document_id
        tc_name = ".".join(matched_result["id"].split("-**-")[:2])
        if tc_name in coarse_matched_contents.keys():
            if matched_result["contents"] not in coarse_matched_contents[tc_name]:
                coarse_matched_contents[tc_name].append(matched_result["contents"])
        else:
            coarse_matched_contents[tc_name] = [matched_result["contents"]]

    fine_matched_contents = dict()
    for tc_name, contents in coarse_matched_contents.items():
        # fine-grained matching between the question and coarse matched contents
        fm_contents = get_matched_entries(question, contents)

        if fm_contents is None:
            continue
        for _match_str, (field_value, _s_match_str, match_score, s_match_score, _match_size,) in fm_contents:
            if match_score < 0.9:
                continue
            if tc_name in fine_matched_contents.keys():
                if len(fine_matched_contents[tc_name]) < 25:
                    fine_matched_contents[tc_name].append(field_value.strip())
            else:
                fine_matched_contents[tc_name] = [field_value.strip()]

    return fine_matched_contents

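The return value of get_matched_contents maps "table_name.column_name" keys to database values that closely match the question. Purely illustrative (the keys and values below are invented, not read from the bundled index):

# Sketch of the return shape of get_matched_contents; entries are hypothetical.
fine_matched_contents = {
    "singer.name": ["Tribal King"],
    "singer.country": ["France"],
}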
def get_db_schema_sequence(schema):
    schema_sequence = "database schema :\n"
    for table in schema["schema_items"]:
        table_name, table_comment = table["table_name"], table["table_comment"]
        if detect_special_char(table_name):
            table_name = add_quotation_mark(table_name)

        # if table_comment != "":
        #     table_name += " ( comment : " + table_comment + " )"

        column_info_list = []
        for column_name, column_type, column_comment, column_content, pk_indicator in \
            zip(table["column_names"], table["column_types"], table["column_comments"], table["column_contents"], table["pk_indicators"]):
            if detect_special_char(column_name):
                column_name = add_quotation_mark(column_name)
            additional_column_info = []
            # column type
            additional_column_info.append(column_type)
            # pk indicator
            if pk_indicator != 0:
                additional_column_info.append("primary key")
            # column comment
            if column_comment != "":
                additional_column_info.append("comment : " + column_comment)
            # representative column values
            if len(column_content) != 0:
                additional_column_info.append("values : " + " , ".join(column_content))

            column_info_list.append(table_name + "." + column_name + " ( " + " | ".join(additional_column_info) + " )")

        schema_sequence += "table " + table_name + " , columns = [ " + " , ".join(column_info_list) + " ]\n"

    if len(schema["foreign_keys"]) != 0:
        schema_sequence += "foreign keys :\n"
        for foreign_key in schema["foreign_keys"]:
            for i in range(len(foreign_key)):
                if detect_special_char(foreign_key[i]):
                    foreign_key[i] = add_quotation_mark(foreign_key[i])
            schema_sequence += "{}.{} = {}.{}\n".format(foreign_key[0], foreign_key[1], foreign_key[2], foreign_key[3])
    else:
        schema_sequence += "foreign keys : None\n"

    return schema_sequence.strip()

def get_matched_content_sequence(matched_contents):
    content_sequence = ""
    if len(matched_contents) != 0:
        content_sequence += "matched contents :\n"
        for tc_name, contents in matched_contents.items():
            table_name = tc_name.split(".")[0]
            column_name = tc_name.split(".")[1]
            if detect_special_char(table_name):
                table_name = add_quotation_mark(table_name)
            if detect_special_char(column_name):
                column_name = add_quotation_mark(column_name)

            content_sequence += table_name + "." + column_name + " ( " + " , ".join(contents) + " )\n"
    else:
        content_sequence = "matched contents : None"

    return content_sequence.strip()

def get_db_schema(db_path, db_comments, db_id):
    if db_id in db_comments:
        db_comment = db_comments[db_id]
    else:
        db_comment = None

    cursor = get_cursor_from_path(db_path)

    # obtain table names
    results = execute_sql(cursor, "SELECT name FROM sqlite_master WHERE type='table';")
    table_names = [result[0].lower() for result in results]

    schema = dict()
    schema["schema_items"] = []
    foreign_keys = []
    # for each table
    for table_name in table_names:
        # skip SQLite system table: sqlite_sequence
        if table_name == "sqlite_sequence":
            continue
        # obtain column names in the current table
        results = execute_sql(cursor, "SELECT name, type, pk FROM PRAGMA_TABLE_INFO('{}')".format(table_name))
        column_names_in_one_table = [result[0].lower() for result in results]
        column_types_in_one_table = [result[1].lower() for result in results]
        pk_indicators_in_one_table = [result[2] for result in results]

        column_contents = []
        for column_name in column_names_in_one_table:
            column_contents.append(get_column_contents(column_name, table_name, cursor))

        # obtain foreign keys in the current table
        results = execute_sql(cursor, "SELECT * FROM pragma_foreign_key_list('{}');".format(table_name))
        for result in results:
            if None not in [result[3], result[2], result[4]]:
                foreign_keys.append([table_name.lower(), result[3].lower(), result[2].lower(), result[4].lower()])

        # obtain comments for each schema item
        if db_comment is not None:
            if table_name in db_comment: # record comments for tables and columns
                table_comment = db_comment[table_name]["table_comment"]
                column_comments = [db_comment[table_name]["column_comments"][column_name] \
                    if column_name in db_comment[table_name]["column_comments"] else "" \
                    for column_name in column_names_in_one_table]
            else: # current database has comment information, but the current table does not
                table_comment = ""
                column_comments = ["" for _ in column_names_in_one_table]
        else: # current database has no comment information
            table_comment = ""
            column_comments = ["" for _ in column_names_in_one_table]

        schema["schema_items"].append({
            "table_name": table_name,
            "table_comment": table_comment,
            "column_names": column_names_in_one_table,
            "column_types": column_types_in_one_table,
            "column_comments": column_comments,
            "column_contents": column_contents,
            "pk_indicators": pk_indicators_in_one_table
        })

    schema["foreign_keys"] = foreign_keys

    return schema
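Taken together, get_db_schema and get_db_schema_sequence turn a SQLite file into the schema text consumed by the text-to-SQL pipeline. A minimal sketch using the database bundled with this Space; passing an empty comment dictionary is an assumption (comments are simply omitted when none are provided):

from utils.db_utils import get_db_schema, get_db_schema_sequence

schema = get_db_schema("databases/singer/singer.sqlite", db_comments={}, db_id="singer")
print(get_db_schema_sequence(schema))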
utils/translate_utils.py
ADDED
@@ -0,0 +1,23 @@
import requests
import random
import json

def translate_zh_to_en(question, token):
    url = 'https://aip.baidubce.com/rpc/2.0/mt/texttrans/v1?access_token=' + token

    from_lang = 'auto'
    to_lang = 'en'
    term_ids = ''

    # Build request
    headers = {'Content-Type': 'application/json'}
    payload = {'q': question, 'from': from_lang, 'to': to_lang, 'termIds': term_ids}

    # Send request
    r = requests.post(url, params=payload, headers=headers)
    result = r.json()

    return result["result"]["trans_result"][0]["dst"]

if __name__ == "__main__":
    # a valid Baidu access token is required; the value below is a placeholder
    print(translate_zh_to_en("你好啊!", "YOUR_ACCESS_TOKEN"))
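Because the Baidu endpoint returns an error payload (with no "result" field) when the token is invalid or the quota is exhausted, a defensive wrapper such as the sketch below may be useful. It is not part of the commit, and falling back to the untranslated question is only one possible policy.

import requests

from utils.translate_utils import translate_zh_to_en

def safe_translate(question, token):
    # Sketch only: keep the original question if translation fails for any reason.
    try:
        return translate_zh_to_en(question, token)
    except (KeyError, requests.RequestException):
        return question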