Roxanne-WANG committed on
Commit
abb320a
·
1 Parent(s): 1c20d2c

update model weight

Browse files
build_whoosh_index.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sqlite3
3
+ from whoosh import index
4
+ from whoosh.fields import Schema, ID, TEXT
5
+
6
+
7
def extract_contents_from_db(db_path, max_len=25):
    """
    Extract short, distinct text values from every table and column of a
    SQLite database.

    Each non-null cell value is converted with ``str()`` and stripped of
    surrounding whitespace; values that end up empty or longer than
    ``max_len`` characters are skipped. Values are de-duplicated per column
    *after* stripping, so ``"bob"`` and ``"bob "`` produce a single entry.

    Args:
        db_path: Path to the SQLite database file.
        max_len: Maximum length (in characters) of a value to keep.

    Returns:
        List of tuples [(doc_id, text), ...] where ``doc_id`` has the form
        ``"<table>-<column>-<sha1 hex digest of text>"``.
    """
    # Local import so this function does not depend on the module's import
    # block. The built-in hash() is salted per process for str, so it cannot
    # produce document IDs that are stable across runs; a SHA-1 digest can.
    import hashlib

    def quote_ident(name):
        # Standard SQL identifier quoting: wrap in double quotes and double
        # any embedded double quote, so unusual table/column names are safe.
        return '"' + name.replace('"', '""') + '"'

    docs = []
    conn = sqlite3.connect(db_path)
    try:
        # Materialize the table list before issuing further queries.
        # Re-executing on a cursor that is still being iterated resets its
        # result set, which would end the table loop after the first table.
        # conn.execute() creates a fresh cursor for every statement.
        table_names = [
            name
            for (name,) in conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table'"
            )
            if name != "sqlite_sequence"
        ]

        for table_name in table_names:
            table_sql = quote_ident(table_name)

            # PRAGMA table_info returns rows like (cid, name, type, ...);
            # the column name is at index 1.
            cols = [
                row[1]
                for row in conn.execute(f"PRAGMA table_info({table_sql})")
            ]

            for col in cols:
                col_sql = quote_ident(col)
                seen = set()  # stripped texts already emitted for this column
                query = (
                    f"SELECT DISTINCT {col_sql} FROM {table_sql} "
                    f"WHERE {col_sql} IS NOT NULL"
                )
                for (val,) in conn.execute(query):
                    text = str(val).strip()
                    if not 0 < len(text) <= max_len or text in seen:
                        continue
                    seen.add(text)
                    digest = hashlib.sha1(text.encode("utf-8")).hexdigest()
                    docs.append((f"{table_name}-{col}-{digest}", text))
    finally:
        # Release the database handle even if a query fails.
        conn.close()

    return docs
43
+
44
+
45
def build_index_for_db(db_id, db_path, index_root="db_contents_index"):
    """
    Build (or open) a Whoosh index for a single database.

    - If an index already exists in index_root/db_id, it is opened and
      returned unchanged.
    - Otherwise, the contents are extracted from the SQLite file and a new
      index is created and populated with them.

    Args:
        db_id: Identifier of the database; used as the index sub-directory.
        db_path: Path to the SQLite file to extract contents from.
        index_root: Directory under which per-database indexes are stored.

    Returns:
        The opened or newly created Whoosh index object.
    """
    index_dir = os.path.join(index_root, db_id)

    # Open existing index if present
    if index.exists_in(index_dir):
        return index.open_dir(index_dir)

    # Extract before touching the index directory: if extraction fails, no
    # empty index is left behind to be mistaken for a finished one by the
    # "already exists" check on the next call.
    docs = extract_contents_from_db(db_path)

    # Define the schema: unique ID + stored text field
    schema = Schema(
        id=ID(stored=True, unique=True),
        content=TEXT(stored=True),
    )

    os.makedirs(index_dir, exist_ok=True)
    ix = index.create_in(index_dir, schema)

    # As a context manager the writer commits on success and cancels
    # (releasing the write lock) if adding a document raises.
    with ix.writer() as writer:
        for doc_id, text in docs:
            writer.add_document(id=doc_id, content=text)
    return ix
73
+
74
+
75
if __name__ == "__main__":
    DATABASE_ROOT = "databases"
    INDEX_ROOT = "db_contents_index"

    # Start from a clean slate: drop any previous index tree, then recreate
    # the (empty) root directory.
    if os.path.isdir(INDEX_ROOT):
        import shutil
        shutil.rmtree(INDEX_ROOT)
    os.makedirs(INDEX_ROOT, exist_ok=True)

    # Every entry under databases/ is a candidate; it is indexed only when
    # it contains a SQLite file named after the entry itself.
    for db_id in os.listdir(DATABASE_ROOT):
        db_file = os.path.join(DATABASE_ROOT, db_id, f"{db_id}.sqlite")
        if not os.path.isfile(db_file):
            continue
        print(f"Building Whoosh index for {db_id}...")
        build_index_for_db(db_id, db_file, INDEX_ROOT)

    print("All indexes built successfully.")
db_contents_index/singer/{write.lock → MAIN_WRITELOCK} RENAMED
File without changes
db_contents_index/singer/MAIN_qq60yoh2am2v4iv7.seg ADDED
Binary file (19.9 kB). View file
 
db_contents_index/singer/_0.fdm DELETED
Binary file (157 Bytes)
 
db_contents_index/singer/_0.fdt DELETED
Binary file (1.96 kB)
 
db_contents_index/singer/_0.fdx DELETED
Binary file (64 Bytes)
 
db_contents_index/singer/_0.fnm DELETED
Binary file (343 Bytes)
 
db_contents_index/singer/_0.nvd DELETED
Binary file (126 Bytes)
 
db_contents_index/singer/_0.nvm DELETED
Binary file (103 Bytes)
 
db_contents_index/singer/_0.si DELETED
Binary file (520 Bytes)
 
db_contents_index/singer/_0.tvd DELETED
Binary file (518 Bytes)
 
db_contents_index/singer/_0.tvm DELETED
Binary file (162 Bytes)
 
db_contents_index/singer/_0.tvx DELETED
Binary file (69 Bytes)
 
db_contents_index/singer/_0_Lucene90_0.doc DELETED
Binary file (101 Bytes)
 
db_contents_index/singer/_0_Lucene90_0.dvd DELETED
Binary file (1.68 kB)
 
db_contents_index/singer/_0_Lucene90_0.dvm DELETED
Binary file (171 Bytes)
 
db_contents_index/singer/_0_Lucene90_0.pos DELETED
Binary file (160 Bytes)
 
db_contents_index/singer/_0_Lucene90_0.tim DELETED
Binary file (1.22 kB)
 
db_contents_index/singer/_0_Lucene90_0.tip DELETED
Binary file (107 Bytes)
 
db_contents_index/singer/_0_Lucene90_0.tmd DELETED
Binary file (269 Bytes)
 
db_contents_index/singer/_MAIN_1.toc ADDED
Binary file (1.63 kB). View file
 
db_contents_index/singer/segments_1 DELETED
Binary file (154 Bytes)
 
text2sql.py CHANGED
@@ -9,6 +9,7 @@ import sqlite3
9
  from tqdm import tqdm
10
  from utils.db_utils import get_db_schema
11
  from transformers import AutoModelForCausalLM, AutoTokenizer
 
12
  from whoosh.index import create_in
13
  from whoosh.fields import Schema, TEXT
14
  from whoosh.qparser import QueryParser
@@ -115,18 +116,15 @@ class ChatBot():
115
  self.sic = SchemaItemClassifierInference("Roxanne-WANG/LangSQL")
116
  self.db_id2content_searcher = dict()
117
  for db_id in os.listdir("db_contents_index"):
118
- schema = Schema(content=TEXT(stored=True))
119
  index_dir = os.path.join("db_contents_index", db_id)
120
- if not os.path.exists(index_dir):
121
- os.makedirs(index_dir)
122
- ix = create_in(index_dir, schema)
123
- writer = ix.writer()
124
- with open(os.path.join(index_dir, f"{db_id}.json"), "r") as file:
125
- data = json.load(file)
126
- for item in data:
127
- writer.add_document(content=item['content'])
128
- writer.commit()
129
- self.db_id2content_searcher[db_id] = ix
130
 
131
  self.db_ids = sorted(os.listdir("databases"))
132
  self.db_id2schema = get_db_id2schema("databases", "data/tables.json")
 
9
  from tqdm import tqdm
10
  from utils.db_utils import get_db_schema
11
  from transformers import AutoModelForCausalLM, AutoTokenizer
12
+ from whoosh import index
13
  from whoosh.index import create_in
14
  from whoosh.fields import Schema, TEXT
15
  from whoosh.qparser import QueryParser
 
116
  self.sic = SchemaItemClassifierInference("Roxanne-WANG/LangSQL")
117
  self.db_id2content_searcher = dict()
118
  for db_id in os.listdir("db_contents_index"):
 
119
  index_dir = os.path.join("db_contents_index", db_id)
120
+
121
+ # Open existing Whoosh index directory
122
+ if index.exists_in(index_dir):
123
+ ix = index.open_dir(index_dir)
124
+ # keep a searcher around for querying
125
+ self.db_id2content_searcher[db_id] = ix.searcher()
126
+ else:
127
+ raise ValueError(f"No Whoosh index found for '{db_id}' at '{index_dir}'")
 
 
128
 
129
  self.db_ids = sorted(os.listdir("databases"))
130
  self.db_id2schema = get_db_id2schema("databases", "data/tables.json")