om4r932 committed
Commit 7fcb613 · Parent: c1729b7

V3: Add ETSI source + added scripts

bm25_maker.py → 3gpp_bm25_maker.py RENAMED
File without changes
spec_indexer.py → 3gpp_spec_indexer.py RENAMED
File without changes
tdoc_indexer.py → 3gpp_tdoc_indexer.py RENAMED
File without changes
app.py CHANGED
@@ -8,9 +8,12 @@ import gradio as gr
 load_dotenv()
 hf_token = os.environ["HF_TOKEN"]
 
-SCRIPT_DOC = "tdoc_indexer.py"
-SCRIPT_SPEC = "spec_indexer.py"
-SCRIPT_BM25 = "bm25_maker.py"
+SCRIPT_DOC_3GPP = "3gpp_tdoc_indexer.py"
+SCRIPT_SPEC_3GPP = "3gpp_spec_indexer.py"
+SCRIPT_BM25_3GPP = "3gpp_bm25_maker.py"
+
+SCRIPT_SPEC_ETSI = "etsi_spec_indexer.py"
+SCRIPT_BM25_ETSI = "etsi_bm25_maker.py"
 
 def get_script_output(script_path, current_log=""):
     accumulated_output = current_log
@@ -35,26 +38,40 @@ def get_script_output(script_path, current_log=""):
 def index_tdocs():
     log_output = "⏳ Indexation en cours...\n"
-    yield gr.update(interactive=False), gr.update(interactive=False), log_output
-    for log in get_script_output(SCRIPT_DOC, log_output):
-        yield gr.update(interactive=False), gr.update(interactive=False), log
+    yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log_output
+    for log in get_script_output(SCRIPT_DOC_3GPP, log_output):
+        yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
         log_output = log
 
     log_output += "✅ Terminé.\n"
-    yield gr.update(interactive=True), gr.update(interactive=True), log_output
+    yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
 
-def index_specifications():
+def index_3gpp_specifications():
     log_output = "⏳ Indexation en cours...\n"
-    yield gr.update(interactive=False), gr.update(interactive=False), log_output
-    for log in get_script_output(SCRIPT_SPEC, log_output):
-        yield gr.update(interactive=False), gr.update(interactive=False), log
+    yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log_output
+    for log in get_script_output(SCRIPT_SPEC_3GPP, log_output):
+        yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
+        log_output = log
+
+    for log in get_script_output(SCRIPT_BM25_3GPP, log_output):
+        yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
+        log_output = log
+
+    log_output += "✅ Terminé.\n"
+    yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
+
+def index_etsi_specifications():
+    log_output = "⏳ Indexation en cours...\n"
+    yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log_output
+    for log in get_script_output(SCRIPT_SPEC_ETSI, log_output):
+        yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
         log_output = log
 
-    for log in get_script_output(SCRIPT_BM25, log_output):
-        yield gr.update(interactive=False), gr.update(interactive=False), log
+    for log in get_script_output(SCRIPT_BM25_ETSI, log_output):
+        yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
         log_output = log
 
     log_output += "✅ Terminé.\n"
-    yield gr.update(interactive=True), gr.update(interactive=True), log_output
+    yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📄 3GPP Indexer Main Menu")
@@ -63,11 +80,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Column():
         tdocs_btn = gr.Button("Re-index TDocs", variant="primary")
     with gr.Column():
-        spec_btn = gr.Button("Re-index Specifications", variant="primary")
-
+        spec_btn_3gpp = gr.Button("Re-index 3GPP Specifications", variant="primary")
+        spec_btn_etsi = gr.Button("Re-index ETSI Specifications", variant="primary")
     out = gr.Textbox(label="Output", lines=25, autoscroll=True, interactive=False)
 
-    tdocs_btn.click(index_tdocs, outputs=[tdocs_btn, spec_btn, out])
-    spec_btn.click(index_specifications, outputs=[tdocs_btn, spec_btn, out])
+    tdocs_btn.click(index_tdocs, outputs=[tdocs_btn, spec_btn_3gpp, spec_btn_etsi, out])
+    spec_btn_3gpp.click(index_3gpp_specifications, outputs=[tdocs_btn, spec_btn_3gpp, spec_btn_etsi, out])
+    spec_btn_etsi.click(index_etsi_specifications, outputs=[tdocs_btn, spec_btn_3gpp, spec_btn_etsi, out])
 
 demo.queue().launch()
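
Note: the diff hides the body of get_script_output (app.py lines 19-37). From its signature and call sites it is a generator that runs one of the indexer scripts and yields the accumulated log after each output line. A minimal sketch, assuming subprocess-based streaming; the committed body may differ:

import subprocess
import sys

def get_script_output(script_path, current_log=""):
    # Run the indexer script and stream its stdout line by line,
    # yielding the full accumulated log after each new line.
    accumulated_output = current_log
    process = subprocess.Popen(
        [sys.executable, script_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # assumption: stderr folded into the same log
        text=True,
    )
    for line in process.stdout:
        accumulated_output += line
        yield accumulated_output
    process.wait()

Yielding the whole accumulated log (rather than each line) matches how the handlers above pass the last yielded value back in as log_output.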
etsi_bm25_maker.py ADDED
@@ -0,0 +1,67 @@
+from typing import Optional
+import os, warnings
+os.environ["CURL_CA_BUNDLE"] = ''
+from dotenv import load_dotenv
+warnings.filterwarnings("ignore")
+load_dotenv()
+import bm25s
+from bm25s.hf import BM25HF
+from datasets import load_dataset
+unique_specs = set()
+
+dataset_text = load_dataset("OrganizedProgrammers/ETSISpecContent", token=os.environ.get("HF_TOKEN"))
+dataset_metadata = load_dataset("OrganizedProgrammers/ETSISpecMetadata", token=os.environ.get("HF_TOKEN"))
+
+dataset_text = dataset_text["train"].to_list()
+dataset_metadata = dataset_metadata["train"].to_list()
+
+corpus_json = []
+
+def get_document(spec_id: str, spec_title: Optional[str]):
+    text = [f"{spec_id} - {spec_title}\n" if spec_title else f"{spec_id}\n"]
+    for section in dataset_text:
+        if spec_id == section["doc_id"]:
+            text.extend([f"{section['section']}\n\n{section['content']}"])
+    return text
+
+for specification in dataset_metadata:
+    if specification['id'] in unique_specs: continue
+    for section in dataset_text:
+        if specification['id'] == section['doc_id']:
+            corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
+                "id": specification['id'],
+                "title": specification['title'],
+                "section_title": section['section'],
+                "version": specification['version'],
+                "type": specification['type'],
+                "url": specification['url'],
+                "scope": specification['scope']
+            }})
+    unique_specs.add(specification['id'])
+
+corpus_text = [doc["text"] for doc in corpus_json]
+corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
+
+retriever = BM25HF(corpus=corpus_json)
+retriever.index(corpus_tokens)
+
+retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSections", token=os.environ.get("HF_TOKEN"))
+
+unique_specs = set()
+corpus_json = []
+
+for specification in dataset_metadata:
+    if specification['id'] in unique_specs: continue
+    text_list = get_document(specification['id'], specification['title'])
+    text = "\n".join(text_list)
+    if len(text_list) == 1: continue
+    corpus_json.append({"text": text, "metadata": specification})
+    unique_specs.add(specification['id'])
+
+corpus_text = [doc["text"] for doc in corpus_json]
+corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
+
+retriever = BM25HF(corpus=corpus_json)
+retriever.index(corpus_tokens)
+
+retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSingle", token=os.environ.get("HF_TOKEN"))
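
Note: the script builds two indexes, one per section (ETSIBM25IndexSections) and one per whole document (ETSIBM25IndexSingle). Once pushed, either can be queried with the same bm25s API used above; a minimal sketch (Hub access via HF_TOKEN / huggingface-cli login is assumed, and the query string is illustrative):

import bm25s
from bm25s.hf import BM25HF

# Load the per-section index together with its stored corpus and metadata
retriever = BM25HF.load_from_hub("OrganizedProgrammers/ETSIBM25IndexSections", load_corpus=True)

query_tokens = bm25s.tokenize("lawful interception requirements", stopwords="en")
results, scores = retriever.retrieve(query_tokens, k=5)
for doc, score in zip(results[0], scores[0]):
    print(f"{score:.2f}", doc["metadata"]["id"], doc["metadata"]["section_title"])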
etsi_spec_indexer.py ADDED
@@ -0,0 +1,260 @@
+import os
+import time
+import warnings
+from dotenv import load_dotenv
+import numpy as np
+import pandas as pd
+
+warnings.filterwarnings("ignore")
+os.environ["CURL_CA_BUNDLE"] = ""
+load_dotenv()
+
+from datasets import load_dataset, Dataset
+from datasets.data_files import EmptyDatasetError
+import threading
+import zipfile
+import sys
+import requests
+import fitz
+import re
+import json
+import traceback
+import io
+import concurrent.futures
+import hashlib
+
+CHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
+DICT_LOCK = threading.Lock()
+DOCUMENT_LOCK = threading.Lock()
+STOP_EVENT = threading.Event()
+
+documents_by_spec_num = {}
+
+try:
+    spec_contents = load_dataset("OrganizedProgrammers/ETSISpecContent", token=os.environ.get("HF_TOKEN"))
+    spec_contents = spec_contents["train"].to_list()
+
+    for section in spec_contents:
+        if section["doc_id"] not in documents_by_spec_num:
+            documents_by_spec_num[section["doc_id"]] = {"content": {section["section"]: section["content"]}, "hash": section["hash"]}
+        else:
+            documents_by_spec_num[section["doc_id"]]["content"][section["section"]] = section["content"]
+except EmptyDatasetError:
+    print("Base de données vide !")
+
+indexed_specifications = {}
+specifications_passed = set()
+processed_count = 0
+total_count = 0
+
+session = requests.Session()
+req = session.post("https://portal.etsi.org/ETSIPages/LoginEOL.ashx", verify=False, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"}, data=json.dumps({"username": os.environ.get("EOL_USER"), "password": os.environ.get("EOL_PASSWORD")}))
+print("Récupération des spécifications depuis ETSI...", req.status_code)
+
+url_ts = "https://www.etsi.org/?option=com_standardssearch&view=data&format=csv&includeScope=1&page=1&search=&title=1&etsiNumber=1&content=0&version=0&onApproval=0&published=1&withdrawn=0&historical=0&isCurrent=1&superseded=0&harmonized=0&keyword=&TB=&stdType=TS&frequency=&mandate=&collection=&sort=1"
+url_tr = url_ts.replace("stdType=TS", "stdType=TR")
+
+data_ts = requests.get(url_ts, verify=False).content
+data_tr = requests.get(url_tr, verify=False).content
+
+df_ts = pd.read_csv(io.StringIO(data_ts.decode('utf-8')), sep=";", skiprows=1, index_col=False)
+df_tr = pd.read_csv(io.StringIO(data_tr.decode('utf-8')), sep=";", skiprows=1, index_col=False)
+
+backup_ts = df_ts["ETSI deliverable"]
+backup_tr = df_tr["ETSI deliverable"]
+
+df_ts["ETSI deliverable"] = df_ts["ETSI deliverable"].str.extract(r"\s*ETSI TS (\d+ \d+(?:-\d+(?:-\d+)?)?)")
+df_tr["ETSI deliverable"] = df_tr["ETSI deliverable"].str.extract(r"\s*ETSI TR (\d+ \d+(?:-\d+(?:-\d+)?)?)")
+
+version1 = backup_ts.str.extract(r"\s*ETSI TS \d+ \d+(?:-\d+(?:-\d+)?)? V(\d+\.\d+\.\d+)")
+version2 = backup_tr.str.extract(r"\s*ETSI TR \d+ \d+(?:-\d+(?:-\d+)?)? V(\d+\.\d+\.\d+)")
+
+df_ts["Version"] = version1[0]
+df_tr["Version"] = version2[0]
+
+def ver_tuple(v):
+    return tuple(map(int, v.split(".")))
+
+df_ts["temp"] = df_ts["Version"].apply(ver_tuple)
+df_tr["temp"] = df_tr["Version"].apply(ver_tuple)
+
+df_ts["Type"] = "TS"
+df_tr["Type"] = "TR"
+
+df = pd.concat([df_ts, df_tr])
+
+unique_df = df.loc[df.groupby("ETSI deliverable")["temp"].idxmax()]
+unique_df = unique_df.drop(columns="temp")
+unique_df = unique_df[(~unique_df["title"].str.contains("3GPP", case=True, na=False))]
+
+df = df.drop(columns="temp")
+df = df[(~df["title"].str.contains("3GPP", case=True, na=False))]
+
+def get_text(specification: str):
+    if STOP_EVENT.is_set():
+        return None, []
+
+    print(f"\n[INFO] Tentative de récupération de la spécification {specification}", flush=True)
+    response = session.get(
+        unique_df[unique_df["ETSI deliverable"] == specification].iloc[0]["PDF link"],
+        verify=False,
+        headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
+    )
+
+    if response.status_code != 200:
+        print(f"\n[ERREUR] Échec du téléchargement du PDF pour {specification}. {response.status_code}", flush=True)
+        return None, []
+
+    pdf = fitz.open(stream=response.content, filetype="pdf")
+    return pdf, pdf.get_toc()
+
+def get_spec_content(specification: str):
+    def extract_sections(text, titles):
+        sections = {}
+        # Sort the titles by their position in the text
+        sorted_titles = sorted(titles, key=lambda t: text.find(t))
+        for i, title in enumerate(sorted_titles):
+            start = text.find(title)
+            if i + 1 < len(sorted_titles):
+                end = text.find(sorted_titles[i + 1])
+                sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:end].replace(title, "").strip())
+            else:
+                sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:].replace(title, "").strip())
+        return sections
+
+    if STOP_EVENT.is_set():
+        return {}
+    print("\n[INFO] Tentative de récupération du texte", flush=True)
+    pdf, doc_toc = get_text(specification)
+    if pdf is None or not doc_toc:
+        print("\n[ERREUR] Pas de texte/table of contents trouvé !")
+        return {}
+    text = []
+    first = 0
+    for level, title, page in doc_toc:
+        first = page - 1
+        break
+    for page in pdf[first:]:
+        text.append("\n".join([line.strip() for line in page.get_text().splitlines()]))
+    text = "\n".join(text)
+
+    if not text or STOP_EVENT.is_set():
+        print("\n[ERREUR] Pas de texte/table of contents trouvé !")
+        return {}
+    print(f"\n[INFO] Texte {specification} récupéré", flush=True)
+    titles = []
+    for level, title, page in doc_toc:
+        if STOP_EVENT.is_set():
+            return {}
+        if title[0].isnumeric() and '\n'.join(title.strip().split(" ", 1)) in text:
+            titles.append('\n'.join(title.strip().split(" ", 1)))
+
+    return extract_sections(text, titles)
+
+def hasher(specification: str, version: str):
+    return hashlib.md5(f"{specification}{version}".encode()).hexdigest()
+
+def get_scope(content):
+    for title, text in content.items():
+        if title.lower().endswith("scope"):
+            return text
+    return ""
+
+def process_specification(spec):
+    global processed_count, indexed_specifications, documents_by_spec_num
+    if STOP_EVENT.is_set():
+        return
+    try:
+        version = spec.get('Version')
+        if not version: return
+        doc_id = str(spec.get("ETSI deliverable"))
+        document = None
+        with DOCUMENT_LOCK:
+            if doc_id in documents_by_spec_num and documents_by_spec_num[doc_id]["hash"] == hasher(doc_id, version) and doc_id not in specifications_passed:
+                document = documents_by_spec_num[doc_id]
+                specifications_passed.add(doc_id)
+                print(f"\n[INFO] Document déjà présent pour {doc_id} (version {spec['Version']})", flush=True)
+            elif doc_id in specifications_passed:
+                document = documents_by_spec_num[doc_id]
+                print(f"\n[INFO] Document déjà présent pour {doc_id} [dernière version présente]")
+            else:
+                print(f"\n[INFO] Tentative de récupération du document {doc_id} (version {spec['Version']})", flush=True)
+                document = get_spec_content(doc_id)
+                if document:
+                    documents_by_spec_num[doc_id] = {"content": document, "hash": hasher(doc_id, version)}
+                    document = {"content": document, "hash": hasher(doc_id, version)}
+                    specifications_passed.add(doc_id)
+                    print(f"\n[INFO] Document extrait pour {doc_id} (version {spec['Version']})", flush=True)
+
+        string_key = f"{doc_id}+-+{spec['title']}+-+{spec['Type']}+-+{spec['Version']}"
+        metadata = {
+            "id": str(doc_id),
+            "title": spec["title"],
+            "type": spec["Type"],
+            "version": version,
+            "url": spec["PDF link"],
+            "scope": "" if not document else get_scope(document["content"])
+        }
+        with DICT_LOCK:
+            indexed_specifications[string_key] = metadata
+            processed_count += 1
+            sys.stdout.write(f"\rTraitement: {processed_count}/{total_count} spécifications...")
+            sys.stdout.flush()
+    except Exception as e:
+        traceback.print_exception(e)
+        print(f"\n[ERREUR] Échec du traitement de {doc_id} {version}: {e}", flush=True)
+
+def sauvegarder(indexed_specifications, documents_by_spec_num):
+    print("\nSauvegarde en cours...", flush=True)
+
+    flat_metadata = [metadata for _, metadata in indexed_specifications.items()]
+    flat_docs = []
+    for doc_id, data in documents_by_spec_num.items():
+        for title, content in data["content"].items():
+            flat_docs.append({"hash": data["hash"], "doc_id": doc_id, "section": title, "content": content})
+
+    push_spec_content = Dataset.from_list(flat_docs)
+    push_spec_metadata = Dataset.from_list(flat_metadata)
+    push_spec_content.push_to_hub("OrganizedProgrammers/ETSISpecContent", token=os.environ["HF_TOKEN"])
+    push_spec_metadata.push_to_hub("OrganizedProgrammers/ETSISpecMetadata", token=os.environ["HF_TOKEN"])
+    print("Sauvegarde terminée.", flush=True)
+
+def main():
+    global total_count
+    start_time = time.time()
+
+    specifications = df.to_dict(orient="records")
+    total_count = len(specifications)
+    print(f"Traitement de {total_count} spécifications avec multithreading...")
+    try:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
+            futures = [executor.submit(process_specification, spec) for spec in specifications]
+            while True:
+                if all(f.done() for f in futures):
+                    break
+                if STOP_EVENT.is_set():
+                    break
+                time.sleep(0.35)
+    except Exception as e:
+        print(f"\nErreur inattendue dans le ThreadPool : {e}", flush=True)
+    print("\nSauvegarde des résultats...", flush=True)
+    sauvegarder(indexed_specifications, documents_by_spec_num)
+    elapsed_time = time.time() - start_time
+    print(f"\nTraitement terminé en {elapsed_time:.2f} secondes.", flush=True)
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        print("\nInterruption détectée (Ctrl+C). Arrêt des tâches en cours...", flush=True)
+        STOP_EVENT.set()
+        time.sleep(2)
+        sauvegarder(indexed_specifications, documents_by_spec_num)
+        print("Arrêt propre du script.", flush=True)
+        sys.exit(0)
+    except Exception as e:
+        print(f"\nErreur inattendue : {e}", flush=True)
+        sauvegarder(indexed_specifications, documents_by_spec_num)
+        sys.exit(1)
+
+# print(get_spec_content("188 005-1"))
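
Note: re-indexing is incremental; process_specification skips the PDF download whenever the stored hash for a spec still matches hasher(doc_id, version). A small illustration of the scheme (the spec ID and versions are made up):

import hashlib

def hasher(specification: str, version: str) -> str:
    # Same scheme as the script: md5 over the concatenated id and version
    return hashlib.md5(f"{specification}{version}".encode()).hexdigest()

stored = {"hash": hasher("188 005-1", "1.1.1")}
print(stored["hash"] == hasher("188 005-1", "1.1.1"))  # True  -> reuse cached sections
print(stored["hash"] == hasher("188 005-1", "1.2.1"))  # False -> re-download and re-extract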
requirements.txt CHANGED
@@ -6,4 +6,5 @@ numpy
 python-dotenv
 gradio
 bm25s[full]
-lxml
+lxml
+pymupdf  # provides the "fitz" module imported by etsi_spec_indexer.py