om4r932 committed
Commit 22ee398 · 1 Parent(s): ebe17cc

V2 (changed data storage method + rework)
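In short, V2 stops committing index artifacts (indexed_docs.json, indexed_specifications.json, indexed_docs_content.zip, bm25s.zip) back into the Spaces and instead pushes everything to Hugging Face datasets. A minimal sketch (not part of the commit) of reading those datasets back, assuming the repo IDs used in the scripts below and an HF_TOKEN available in the environment:

import os
from datasets import load_dataset

token = os.environ.get("HF_TOKEN")
spec_content = load_dataset("OrganizedProgrammers/3GPPSpecContent", token=token)["train"]
spec_metadata = load_dataset("OrganizedProgrammers/3GPPSpecMetadata", token=token)["train"]
tdoc_locations = load_dataset("OrganizedProgrammers/3GPPTDocLocation", token=token)["train"]
# Each 3GPPSpecContent row carries: hash, doc_id, section, content (see sauvegarder() in spec_indexer.py)
print(spec_content[0]["doc_id"], spec_content[0]["section"])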
app.py CHANGED
@@ -1,264 +1,71 @@
1
- from datetime import datetime
2
- import os
3
- import warnings
4
- import traceback
5
- import gradio as gr
6
  import subprocess
7
- from huggingface_hub import Repository
8
- from git import Repo
9
- import requests
10
-
11
- warnings.filterwarnings('ignore')
12
-
13
- DOC_INDEXER = "indexer_multi.py"
14
- SPEC_INDEXER = "spec_indexer_multi.py"
15
- SPEC_DOC_INDEXER = "spec_doc_indexer_multi.py"
16
- BM25_INDEXER = "bm25_maker.py"
17
-
18
- DOC_INDEX_FILE = "indexed_docs.json"
19
- SPEC_INDEX_FILE = "indexed_specifications.json"
20
- SPEC_DOC_INDEX_FILE = "indexed_docs_content.zip"
21
- BM25_INDEX_FILE = "bm25s.zip"
22
-
23
- HF_SEARCH_REPO = "OrganizedProgrammers/3GPPDocFinder"
24
- REPO_DIR = os.path.dirname(os.path.abspath(__file__))
25
-
26
- def get_docs_stats():
27
- if os.path.exists(DOC_INDEX_FILE):
28
- import json
29
- with open(DOC_INDEX_FILE, 'r', encoding='utf-8') as f:
30
- data = json.load(f)
31
- return len(data["docs"])
32
- return 0
33
-
34
- def get_specs_stats():
35
- if os.path.exists(SPEC_INDEX_FILE):
36
- import json
37
- with open(SPEC_INDEX_FILE, 'r', encoding='utf-8') as f:
38
- data = json.load(f)
39
- return len(data["specs"])
40
- return 0
41
-
42
- def get_scopes_stats():
43
- if os.path.exists(SPEC_INDEX_FILE):
44
- import json
45
- with open(SPEC_INDEX_FILE, 'r', encoding="utf-8") as f:
46
- data = json.load(f)
47
- return len(data['scopes'])
48
- return 0
49
-
50
- def check_permissions(user: str, token: str):
51
- try:
52
- req = requests.get("https://huggingface.co/api/whoami-v2", verify=False, headers={"Accept": "application/json", "Authorization": f"Bearer {token}"})
53
- if req.status_code != 200:
54
- return False
55
- reqJson: dict = req.json()
56
- if not reqJson.get("name") or reqJson['name'] != user:
57
- return False
58
- if not reqJson.get("orgs") or len(reqJson['orgs']) == 0:
59
- return False
60
- for org in reqJson['orgs']:
61
- if "645cfa1b5ebf379fd6d8a339" == org['id']:
62
- return True
63
- if not reqJson.get('auth') or reqJson['auth'] == {}:
64
- return False
65
- if reqJson['auth']['accessToken']['role'] != "fineGrained":
66
- return False
67
- for scope in reqJson['auth']['accessToken']['fineGrained']['scoped']:
68
- if scope['entity']['type'] == "org" and scope['entity']['_id'] == "645cfa1b5ebf379fd6d8a339" and all(perm in scope['permissions'] for perm in ['repo.write', 'repo.content.read']):
69
- return True
70
- return False
71
- except Exception as e:
72
- traceback.print_exception(e)
73
- return False
74
-
75
- def update_logged(user: str, token: str):
76
- if check_permissions(user, token):
77
- return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
78
- else:
79
- return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
80
-
81
- def commit_and_push_3gppindexers(user, token, files, message, current_log=""):
82
- log = current_log + "\n"
83
- repo = Repo(REPO_DIR)
84
- origin = repo.remotes.origin
85
- repo.config_writer().set_value("user", "name", "3GPP Indexer Automatic Git Tool").release()
86
- repo.config_writer().set_value("user", "email", "example@mail.org").release()
87
- origin.pull()
88
- log += "Git pull succeed !\n"
89
- yield log
90
-
91
- repo.git.add(files)
92
- repo.index.commit(message)
93
-
94
- try:
95
- repo.git.push(f"https://{user}:{token}@huggingface.co/spaces/OrganizedProgrammers/3GPPIndexers")
96
- log += "Git push succeed !\n"
97
- yield log
98
- log += "Wait for Huggingface to restart the Space\n"
99
- yield log
100
- except Exception as e:
101
- log += f"Git push failed: {e}\n"
102
- yield log
103
 
104
- def commit_and_push_3gppdocfinder(token, files, message, current_log=""):
105
- log = current_log + "\n"
106
- if not token:
107
- log += "No token provided. Skipping HuggingFace push.\n"
108
- yield log
109
- return
110
-
111
- hf_repo_dir = os.path.join(REPO_DIR, "hf_spaces")
112
- repo = None
113
-
114
- if not os.path.exists(hf_repo_dir):
115
- repo = Repository(
116
- local_dir=hf_repo_dir,
117
- repo_type="space",
118
- clone_from=HF_SEARCH_REPO,
119
- git_user="3GPP Indexer Automatic Git Tool",
120
- git_email="example@mail.org",
121
- token=token,
122
- skip_lfs_files=True
123
- )
124
- else:
125
- repo = Repository(
126
- local_dir=hf_repo_dir,
127
- repo_type="space",
128
- git_user="3GPP Indexer Automatic Git Tool",
129
- git_email="example@mail.org",
130
- token=token,
131
- skip_lfs_files=True
132
- )
133
-
134
- repo.git_pull()
135
-
136
- # Copy artifact files to huggingface space
137
- for f in files:
138
- import shutil
139
- shutil.copy2(f, os.path.join(hf_repo_dir, f))
140
-
141
- repo.git_add(auto_lfs_track=True)
142
- repo.git_commit(message)
143
- repo.git_push()
144
-
145
- log += "Pushed to HuggingFace.\n"
146
- yield log
147
 
148
- def refresh_stats():
149
- return str(get_docs_stats()), str(get_specs_stats()), str(get_scopes_stats())
 
150
 
151
- def stream_script_output(script_path, current_log=""):
152
  accumulated_output = current_log
153
-
154
  process = subprocess.Popen(
155
  ["python", script_path],
156
  stdout=subprocess.PIPE,
157
  stderr=subprocess.STDOUT,
158
  bufsize=1,
159
- universal_newlines=True,
160
  )
161
-
162
  for line in process.stdout:
163
  accumulated_output += line
164
  yield accumulated_output
165
 
166
  process.stdout.close()
167
  process.wait()
168
-
169
  yield accumulated_output
170
 
171
- def index_documents(user, token):
172
  log_output = "⏳ Indexation en cours...\n"
173
- # Désactiver tous les boutons
174
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log_output
175
-
176
- # Lancer l'indexation
177
- if not check_permissions(user, token):
178
- log_output += "❌ Identifiants invalides\n"
179
- yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
180
- return
181
-
182
- for log in stream_script_output(DOC_INDEXER, log_output):
183
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
184
- log_output = log
185
-
186
- d = datetime.today().strftime("%d/%m/%Y-%H:%M:%S")
187
-
188
- for log in commit_and_push_3gppdocfinder(token, [DOC_INDEX_FILE], f"Update documents indexer via Indexer: {d}", log_output):
189
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
190
- log_output = log
191
-
192
- for log in commit_and_push_3gppindexers(user, token, [DOC_INDEX_FILE], f"Update documents indexer via Indexer: {d}", log_output):
193
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
194
  log_output = log
195
-
196
- # Réactiver les boutons à la fin
197
  log_output += "✅ Terminé.\n"
198
- yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
199
 
200
- def index_specifications(user, token):
201
  log_output = "⏳ Indexation en cours...\n"
202
- # Désactiver tous les boutons
203
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log_output
204
-
205
- # Lancer l'indexation
206
- if not check_permissions(user, token):
207
- log_output += "❌ Identifiants invalides\n"
208
- yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
209
- return
210
-
211
- for log in stream_script_output(SPEC_INDEXER, log_output):
212
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
213
- log_output = log
214
-
215
- for log in stream_script_output(SPEC_DOC_INDEXER, log_output):
216
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
217
- log_output = log
218
-
219
- for log in stream_script_output(BM25_INDEXER, log_output):
220
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
221
- log_output = log
222
-
223
- d = datetime.today().strftime("%d/%m/%Y-%H:%M:%S")
224
-
225
- for log in commit_and_push_3gppdocfinder(token, [SPEC_DOC_INDEX_FILE, BM25_INDEX_FILE, SPEC_INDEX_FILE], f"Update specifications indexer via Indexer: {d}", log_output):
226
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
227
  log_output = log
228
 
229
- for log in commit_and_push_3gppindexers(user, token, [SPEC_DOC_INDEX_FILE, BM25_INDEX_FILE, SPEC_INDEX_FILE], f"Update specifications indexer via Indexer: {d}", log_output):
230
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
231
  log_output = log
232
 
233
- # Réactiver les boutons à la fin
234
  log_output += "✅ Terminé.\n"
235
- yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
236
 
237
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
238
- gr.Markdown("## 📄 3GPP Indexers")
239
-
240
  with gr.Row() as r1:
241
  with gr.Column():
242
- git_user = gr.Textbox(label="Git user (for push/pull indexes)")
243
- git_pass = gr.Textbox(label="Git Token", type="password")
244
- btn_login = gr.Button("Login", variant="primary")
245
-
246
- with gr.Row(visible=False) as r2:
247
- with gr.Column():
248
- doc_count = gr.Textbox(label="Docs Indexed", value=str(get_docs_stats()), interactive=False)
249
- btn_docs = gr.Button("Re-index Documents", variant="primary")
250
  with gr.Column():
251
- spec_count = gr.Textbox(label="Specs Indexed", value=str(get_specs_stats()), interactive=False)
252
- btn_specs = gr.Button("Re-index Specifications", variant="primary")
253
- with gr.Column():
254
- scope_count = gr.Textbox(label="Scopes Indexed", value=str(get_scopes_stats()), interactive=False)
255
-
256
- out = gr.Textbox(label="Output/Log", lines=13, autoscroll=True, visible=False)
257
- refresh = gr.Button(value="🔄 Refresh Stats", visible=False)
258
-
259
- btn_login.click(update_logged, inputs=[git_user, git_pass], outputs=[r1, r2, out, refresh])
260
- btn_docs.click(index_documents, inputs=[git_user, git_pass], outputs=[btn_docs, btn_specs, refresh, out])
261
- btn_specs.click(index_specifications, inputs=[git_user, git_pass], outputs=[btn_docs, btn_specs, refresh, out])
262
- refresh.click(refresh_stats, outputs=[doc_count, spec_count, scope_count])
263
 
264
- demo.launch()
 
1
  import subprocess
2
+ import warnings, os
3
+ warnings.filterwarnings("ignore")
4
+ os.environ["CURL_CA_BUNDLE"] = ""
5
+ from dotenv import load_dotenv
6
+ import gradio as gr
7
 
8
+ load_dotenv()
9
+ hf_token = os.environ["HF_TOKEN"]
10
 
11
+ SCRIPT_DOC = "tdoc_indexer.py"
12
+ SCRIPT_SPEC = "spec_indexer.py"
13
+ SCRIPT_BM25 = "bm25_maker.py"
14
 
15
+ def get_script_output(script_path, current_log=""):
16
  accumulated_output = current_log
17
+
18
  process = subprocess.Popen(
19
  ["python", script_path],
20
  stdout=subprocess.PIPE,
21
  stderr=subprocess.STDOUT,
22
  bufsize=1,
23
+ universal_newlines=True
24
  )
25
+
26
  for line in process.stdout:
27
  accumulated_output += line
28
  yield accumulated_output
29
 
30
  process.stdout.close()
31
  process.wait()
32
+
33
  yield accumulated_output
34
 
35
+ def index_tdocs():
36
  log_output = "⏳ Indexation en cours...\n"
37
+ for log in get_script_output(SCRIPT_DOC):
38
+ yield log
39
  log_output = log
40
+
 
41
  log_output += "✅ Terminé.\n"
42
+ yield log_output
43
 
44
+ def index_specifications():
45
  log_output = "⏳ Indexation en cours...\n"
46
+ for log in get_script_output(SCRIPT_SPEC):
47
+ yield log
48
  log_output = log
49
 
50
+ for log in get_script_output(SCRIPT_BM25):
51
+ yield log
52
  log_output = log
53
 
 
54
  log_output += "✅ Terminé.\n"
55
+ yield log_output
56
 
57
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
58
+ gr.Markdown("# 📄 3GPP Indexer Main Menu")
59
+
60
  with gr.Row() as r1:
61
  with gr.Column():
62
+ tdocs_btn = gr.Button("Re-index TDocs", variant="primary")
63
  with gr.Column():
64
+ spec_btn = gr.Button("Re-index Specifications", variant="primary")
65
+
66
+ out = gr.Textbox(label="Output", lines=25, autoscroll=True, interactive=False)
67
+
68
+ tdocs_btn.click(index_tdocs, outputs=[out])
69
+ spec_btn.click(index_specifications, outputs=[out])
70
 
71
+ demo.queue().launch()
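The reworked app no longer asks for Git credentials in the UI; the indexer scripts authenticate with an HF_TOKEN read through python-dotenv. A hypothetical .env for running locally (placeholder value; on a Space this would normally be a repository secret exposed as an environment variable):

# .env (never commit a real token)
HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxx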
bm25_maker.py CHANGED
@@ -1,59 +1,67 @@
1
- import shutil
2
- import zipfile
3
- import json
4
  import bm25s
5
 
6
- import nltk
7
- from nltk.stem import WordNetLemmatizer
8
 
9
- nltk.download("wordnet")
10
- lemmatizer = WordNetLemmatizer()
11
- indexer_id = "3gpp_bm25_docs"
12
- unique_specs = set()
13
 
14
- with open("indexed_specifications.json", "r") as f:
15
- spec_data = json.load(f)["specs"]
16
- with zipfile.ZipFile(open("indexed_docs_content.zip", "rb")) as zf:
17
- for file_name in zf.namelist():
18
- if file_name.endswith(".json"):
19
- doc_bytes = zf.read(file_name)
20
- try:
21
- doc_data = json.loads(doc_bytes.decode("utf-8"))
22
- print("Documents loaded successfully !")
23
- except json.JSONDecodeError as e:
24
- print(f"Error while decoding the JSON file {file_name}: {e}")
25
 
26
  corpus_json = []
27
 
28
- for _, specification in spec_data.items():
29
- full_text = f"{specification['id']} - {specification['title']}\n\n\n"
30
- if specification['id'] in unique_specs:
31
- continue
32
- document = doc_data.get(specification['id'], None)
33
- if document is None: continue
34
- if not isinstance(document, str):
35
- full_text += "\n".join([f"{title}\n\n{document[title]}" for title in document.keys()])
36
- corpus_json.append({"text": lemmatizer.lemmatize(full_text), "metadata": {
37
- "id": specification['id'],
38
- "title": specification['title'],
39
- "version": specification['version'],
40
- "release": specification['release'],
41
- "type": specification['type'],
42
- "working_group": specification['working_group'],
43
- "url": specification['url'],
44
- "scope": specification['scope']
45
- }})
46
- unique_specs.add(specification['id'])
47
- else:
48
- print(f"Skipping {specification['id']}")
49
- unique_specs.add(specification['id'])
50
 
51
  corpus_text = [doc["text"] for doc in corpus_json]
52
  corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
53
 
54
- retriever = bm25s.BM25(corpus=corpus_json)
55
  retriever.index(corpus_tokens)
56
 
57
- retriever.save(indexer_id)
58
-
59
- shutil.make_archive("bm25s", 'zip', '.', indexer_id)
 
1
+ import os, warnings
2
+ os.environ["CURL_CA_BUNDLE"] = ''
3
+ from dotenv import load_dotenv
4
+ warnings.filterwarnings("ignore")
5
+ load_dotenv()
6
  import bm25s
7
+ from bm25s.hf import BM25HF
8
+ from datasets import load_dataset
9
+ unique_specs = set()
10
 
11
+ dataset_text = load_dataset("OrganizedProgrammers/3GPPSpecContent", token=os.environ.get("HF_TOKEN"))
12
+ dataset_metadata = load_dataset("OrganizedProgrammers/3GPPSpecMetadata", token=os.environ.get("HF_TOKEN"))
13
 
14
+ dataset_text = dataset_text["train"].to_list()
15
+ dataset_metadata = dataset_metadata["train"].to_list()
16
+
17
+ corpus_json = []
18
 
19
+ def get_document(spec_id: str, spec_title: str):
20
+ text = [f"{spec_id} - {spec_title}\n"]
21
+ for section in dataset_text:
22
+ if spec_id == section["doc_id"]:
23
+ text.extend([f"{section['section']}\n\n{section['content']}"])
24
+ return text
25
 
26
+ for specification in dataset_metadata:
27
+ if specification['id'] in unique_specs: continue
28
+ for section in dataset_text:
29
+ if specification['id'] == section['doc_id']:
30
+ corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
31
+ "id": specification['id'],
32
+ "title": specification['title'],
33
+ "section_title": section['section'],
34
+ "version": specification['version'],
35
+ "type": specification['type'],
36
+ "working_group": specification['working_group'],
37
+ "url": specification['url'],
38
+ "scope": specification['scope']
39
+ }})
40
+ unique_specs.add(specification['id'])
41
+
42
+ corpus_text = [doc["text"] for doc in corpus_json]
43
+ corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
44
+
45
+ retriever = BM25HF(corpus=corpus_json)
46
+ retriever.index(corpus_tokens)
47
+
48
+ retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSections", token=os.environ.get("HF_TOKEN"))
49
+
50
+ unique_specs = set()
51
  corpus_json = []
52
 
53
+ for specification in dataset_metadata:
54
+ if specification['id'] in unique_specs: continue
55
+ text_list = get_document(specification['id'], specification['title'])
56
+ text = "\n".join(text_list)
57
+ if len(text_list) == 1: continue
58
+ corpus_json.append({"text": text, "metadata": specification})
59
+ unique_specs.add(specification['id'])
60
 
61
  corpus_text = [doc["text"] for doc in corpus_json]
62
  corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
63
 
64
+ retriever = BM25HF(corpus=corpus_json)
65
  retriever.index(corpus_tokens)
66
 
67
+ retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSingle", token=os.environ.get("HF_TOKEN"))
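A hedged sketch of querying the two indexes this script pushes, using the bm25s Hub helper (repo IDs from above; the load_from_hub arguments and the query string are assumptions, not part of the commit):

import os
import bm25s
from bm25s.hf import BM25HF

token = os.environ.get("HF_TOKEN")
# Per-section index; 3GPPBM25IndexSingle works the same way with one entry per specification
retriever = BM25HF.load_from_hub("OrganizedProgrammers/3GPPBM25IndexSections", token=token, load_corpus=True)
query_tokens = bm25s.tokenize("NAS security mode command procedure", stopwords="en")
results, scores = retriever.retrieve(query_tokens, k=5)
for doc, score in zip(results[0], scores[0]):
    print(round(float(score), 2), doc["metadata"]["id"], doc["metadata"].get("section_title", ""))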
requirements.txt CHANGED
@@ -1,9 +1,8 @@
1
- gradio
2
  requests
3
  beautifulsoup4
4
- gitpython
5
- huggingface_hub
6
- lxml
7
- scikit-learn
8
- bm25s[full]
9
- nltk
1
  requests
2
  beautifulsoup4
3
+ datasets
4
+ pandas
5
+ numpy
6
+ python-dotenv
7
+ gradio
8
+ bm25s[full]
spec_doc_indexer_multi.py → scripts/old/spec_doc_indexer_multi.py RENAMED
File without changes
spec_indexer_multi.py → scripts/old/spec_indexer_multi.py RENAMED
File without changes
spec_indexer.py ADDED
@@ -0,0 +1,301 @@
1
+ import os
2
+ import time
3
+ import warnings
4
+ from dotenv import load_dotenv
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ warnings.filterwarnings("ignore")
9
+ os.environ["CURL_CA_BUNDLE"] = ""
10
+ load_dotenv()
11
+
12
+ from bm25s.hf import BM25HF
13
+ from datasets import load_dataset, Dataset
14
+ import bm25s
15
+
16
+ import threading
17
+ import zipfile
18
+ import sys
19
+ import subprocess
20
+ import requests
21
+ import re
22
+ import traceback
23
+ import io
24
+ import concurrent.futures
25
+ import hashlib
26
+
27
+ CHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
28
+ DICT_LOCK = threading.Lock()
29
+ DOCUMENT_LOCK = threading.Lock()
30
+ STOP_EVENT = threading.Event()
31
+
32
+ spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent", token=os.environ["HF_TOKEN"])
33
+ spec_contents = spec_contents["train"].to_list()
34
+ documents_by_spec_num = {}
35
+ for section in spec_contents:
36
+ if section["doc_id"] not in documents_by_spec_num.keys():
37
+ documents_by_spec_num[section["doc_id"]] = {"content": {section["section"]: section["content"]}, "hash": section["hash"]}
38
+ else:
39
+ documents_by_spec_num[section["doc_id"]]["content"][section["section"]] = section["content"]
40
+
41
+ indexed_specifications = {}
42
+ specifications_passed = set()
43
+ processed_count = 0
44
+ total_count = 0
45
+
46
+ def get_text(specification: str, version: str, second: bool = False):
47
+ """Récupère les bytes du PDF à partir d'une spécification et d'une version."""
48
+ if STOP_EVENT.is_set():
49
+ return []
50
+ doc_id = specification
51
+ series = doc_id.split(".")[0]
52
+ content = []
53
+
54
+ print(f"\n[INFO] Tentative de récupération de la spécification {doc_id} version {version}", flush=True)
55
+ response = requests.get(
56
+ f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version}.zip",
57
+ verify=False,
58
+ headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
59
+ )
60
+
61
+ if response.status_code != 200:
62
+ print(f"\n[ERREUR] Echec du téléchargement du ZIP pour {specification}-{version}. Tentative avec dernière version disponible", flush=True)
63
+ last_possible_version = requests.post('https://organizedprogrammers-3gppdocfinder.hf.space/find', verify=False, headers={"Content-Type": "application/json"}, json={"doc_id": specification})
64
+ if last_possible_version.status_code != 200:
65
+ print(f"\n[ERREUR] Echec du 2e téléchargement du ZIP pour {specification}-{version}. {last_possible_version.status_code}", flush=True)
66
+ return []
67
+ data = last_possible_version.json()
68
+ return get_text(specification, data['version'], True)
69
+
70
+ zip_bytes = io.BytesIO(response.content)
71
+ zip_file = zipfile.ZipFile(zip_bytes)
72
+
73
+ def extract_text(zipfile: zipfile.ZipFile, filename: str):
74
+ if (filename.endswith(".doc") or filename.endswith(".docx")) and ("cover" not in filename.lower() and "annex" not in filename.lower()):
75
+ doc_bytes = zipfile.read(filename)
76
+ input_path = f"/tmp/{filename}"
77
+ output_path = "/tmp"
78
+ changed_ext_filename = re.sub(r".docx?$", ".txt", filename)
79
+ output_file = f"/tmp/{changed_ext_filename}"
80
+
81
+ with open(input_path, "wb") as f:
82
+ f.write(doc_bytes)
83
+ try:
84
+ print(f"\n[INFO] Tentative de conversion DOC/DOCX -> TXT", flush=True)
85
+ try:
86
+ subprocess.run(
87
+ ["libreoffice", "--headless", "--convert-to", "txt:Text", "--outdir", output_path, input_path],
88
+ check=True,
89
+ capture_output=True
90
+ )
91
+ except subprocess.CalledProcessError as e:
92
+ print(f"\n[ERREUR] LibreOffice a échoué : {e}", flush=True)
93
+ return []
94
+ if os.path.exists(output_file):
95
+ with open(output_file, "r", encoding="utf-8") as f:
96
+ return [line.strip() for line in f if line.strip()]
97
+ finally:
98
+ if os.path.exists(input_path):
99
+ os.remove(input_path)
100
+ if os.path.exists(output_file):
101
+ os.remove(output_file)
102
+ return []
103
+
104
+ for fileinfo in zip_file.infolist():
105
+ if STOP_EVENT.is_set():
106
+ return []
107
+ if fileinfo.filename.endswith(".zip") and len(zip_file.namelist()) == 1:
108
+ nested_zip_bytes = io.BytesIO(zip_file.read(fileinfo.filename))
109
+ zip_file = zipfile.ZipFile(nested_zip_bytes)
110
+ break
111
+
112
+ for filename in zip_file.namelist():
113
+ if STOP_EVENT.is_set():
114
+ return []
115
+ content.extend(extract_text(zip_file, filename))
116
+
117
+ if content:
118
+ print("\n[INFO] Conversion terminé", flush=True)
119
+ else:
120
+ print(f"\n[ERREUR] Pas réussi", flush=True)
121
+ return content
122
+
123
+ def get_spec_content(specification: str, version:str):
124
+ if STOP_EVENT.is_set():
125
+ return {}
126
+ print("\n[INFO] Tentative de récupération du texte", flush=True)
127
+ text = get_text(specification, version)
128
+ if not text or STOP_EVENT.is_set():
129
+ return {}
130
+ print(f"\n[INFO] Texte {specification}-{version} récupéré", flush=True)
131
+ chapters = []
132
+ chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+[^\.]$") # 3.5.2.1 Introduction
133
+
134
+ for i, line in enumerate(text):
135
+ if STOP_EVENT.is_set():
136
+ return {}
137
+ if chapter_regex.fullmatch(line):
138
+ chapters.append((i, line))
139
+ document = {}
140
+ for i in range(len(chapters)):
141
+ if STOP_EVENT.is_set():
142
+ return {}
143
+ start_index, chapter_title = chapters[i]
144
+ end_index = chapters[i+1][0] if i + 1 < len(chapters) else len(text)
145
+ content_lines = text[start_index + 1:end_index]
146
+ document[chapter_title.replace("\t", " ")] = "\n".join(content_lines)
147
+ print(f"\n[INFO] Document fini", flush=True)
148
+ return document
149
+
150
+ def version_to_code(version_str):
151
+ parts = version_str.split('.')
152
+ if len(parts) != 3: return None
153
+ try:
154
+ x, y, z = [int(p) for p in parts]
155
+ except ValueError:
156
+ return None
157
+ if x < 36 and y < 36 and z < 36:
158
+ return f"{CHARS[x]}{CHARS[y]}{CHARS[z]}"
159
+ else:
160
+ return f"{str(x).zfill(2)}{str(y).zfill(2)}{str(z).zfill(2)}"
161
+
162
+ def hasher(specification: str, version_code: str):
163
+ return hashlib.md5(f"{specification}{version_code}".encode()).hexdigest()
164
+
165
+ def get_scope(content):
166
+ for title, text in content.items():
167
+ if title.lower().endswith("scope"):
168
+ return text
169
+ return ""
170
+
171
+ def process_specification(spec):
172
+ global processed_count, indexed_specifications, documents_by_spec_num
173
+ if STOP_EVENT.is_set():
174
+ return
175
+ try:
176
+ if not spec.get('vers'): return
177
+ doc_id = str(spec['spec_num'])
178
+ document = None
179
+ version_code = version_to_code(str(spec['vers']))
180
+ if not version_code: return
181
+ with DOCUMENT_LOCK:
182
+ if doc_id in documents_by_spec_num and documents_by_spec_num[doc_id]["hash"] == hasher(doc_id, version_code) and not doc_id in specifications_passed:
183
+ document = documents_by_spec_num[doc_id]
184
+ specifications_passed.add(doc_id)
185
+ print(f"\n[INFO] Document déjà présent pour {doc_id} (version {spec['vers']})", flush=True)
186
+ elif doc_id in specifications_passed:
187
+ print(f"\n[INFO] Document déjà présent pour {doc_id} [dernière version présent]")
188
+ else:
189
+ print(f"\n[INFO] Tentative de récupération du document {doc_id} (version {spec['vers']})", flush=True)
190
+ document = get_spec_content(doc_id, version_code)
191
+ if document:
192
+ documents_by_spec_num[doc_id] = {"content": document, "hash": hasher(doc_id, version_code)}
193
+ specifications_passed.add(doc_id)
194
+ print(f"\n[INFO] Document extrait pour {doc_id} (version {spec['vers']})", flush=True)
195
+ series = doc_id.split(".")[0]
196
+ url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"
197
+ string_key = f"{spec['spec_num']}+-+{spec['title']}+-+{spec['type']}+-+{spec['vers']}+-+{spec['WG']}"
198
+ metadata = {
199
+ "id": doc_id,
200
+ "title": spec["title"],
201
+ "type": spec["type"],
202
+ "version": str(spec["vers"]),
203
+ "working_group": spec["WG"],
204
+ "url": url,
205
+ "scope": "" if not document else get_scope(document["content"])
206
+ }
207
+ with DICT_LOCK:
208
+ indexed_specifications[string_key] = metadata
209
+ processed_count += 1
210
+ sys.stdout.write(f"\rTraitement: {processed_count}/{total_count} spécifications...")
211
+ sys.stdout.flush()
212
+ except Exception as e:
213
+ traceback.print_exception(e)
214
+ print(f"\n[ERREUR] Échec du traitement de {spec.get('spec_num', 'inconnu')} v{spec.get('vers')}: {e}", flush=True)
215
+
216
+ def sauvegarder(indexed_specifications, documents_by_spec_num):
217
+ print("\nSauvegarde en cours...", flush=True)
218
+
219
+ flat_metadata = [metadata for _, metadata in indexed_specifications.items()]
220
+ flat_docs = []
221
+ for doc_id, data in documents_by_spec_num.items():
222
+ for title, content in data["content"].items():
223
+ flat_docs.append({"hash": data["hash"], "doc_id": doc_id, "section": title, "content": content})
224
+
225
+ push_spec_content = Dataset.from_list(flat_docs)
226
+ push_spec_metadata = Dataset.from_list(flat_metadata)
227
+ push_spec_content.push_to_hub("OrganizedProgrammers/3GPPSpecContent", token=os.environ["HF_TOKEN"])
228
+ push_spec_metadata.push_to_hub("OrganizedProgrammers/3GPPSpecMetadata", token=os.environ["HF_TOKEN"])
229
+ print("Sauvegarde terminée.", flush=True)
230
+
231
+ def main():
232
+ global total_count
233
+ start_time = time.time()
234
+
235
+ # Récupération des spécifications depuis le site 3GPP
236
+ print("Récupération des spécifications depuis 3GPP...")
237
+ response = requests.get(
238
+ f'https://www.3gpp.org/dynareport?code=status-report.htm',
239
+ headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'},
240
+ verify=False
241
+ )
242
+
243
+ # Analyse des tableaux HTML
244
+ dfs = pd.read_html(
245
+ io.StringIO(response.text),
246
+ storage_options={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'},
247
+ encoding="utf-8"
248
+ )
249
+
250
+ for x in range(len(dfs)):
251
+ dfs[x] = dfs[x].replace({np.nan: None})
252
+
253
+ # Extraction des colonnes nécessaires
254
+ columns_needed = [0, 1, 2, 3, 4]
255
+ extracted_dfs = [df.iloc[:, columns_needed] for df in dfs]
256
+ columns = [x.replace("\xa0", "_") for x in extracted_dfs[0].columns]
257
+
258
+ # Préparation des spécifications
259
+ specifications = []
260
+ for df in extracted_dfs:
261
+ for index, row in df.iterrows():
262
+ doc = row.to_list()
263
+ doc_dict = dict(zip(columns, doc))
264
+ specifications.append(doc_dict)
265
+
266
+ total_count = len(specifications)
267
+ print(f"Traitement de {total_count} spécifications avec multithreading...")
268
+ if os.path.exists("indexed_docs_content.zip"):
269
+ print(f"Chargement de {len(documents_by_spec_num)} documents depuis le cache.")
270
+ try:
271
+ with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
272
+ futures = [executor.submit(process_specification, spec) for spec in specifications]
273
+ while True:
274
+ if all(f.done() for f in futures):
275
+ break
276
+ if STOP_EVENT.is_set():
277
+ break
278
+ time.sleep(0.35)
279
+ except Exception as e:
280
+ print(f"\nErreur inattendue dans le ThreadPool : {e}", flush=True)
281
+ print("\nSauvegarde des résultats...", flush=True)
282
+ sauvegarder(indexed_specifications, documents_by_spec_num)
283
+ elapsed_time = time.time() - start_time
284
+ print(f"\nTraitement terminé en {elapsed_time:.2f} secondes.", flush=True)
285
+ print(f"Métadonnées poussées vers le dataset 'OrganizedProgrammers/3GPPSpecMetadata'.", flush=True)
286
+ print(f"Contenu des documents poussé vers le dataset 'OrganizedProgrammers/3GPPSpecContent'.", flush=True)
287
+
288
+ if __name__ == "__main__":
289
+ try:
290
+ main()
291
+ except KeyboardInterrupt:
292
+ print("\nInterruption détectée (Ctrl+C). Arrêt des tâches en cours...", flush=True)
293
+ STOP_EVENT.set()
294
+ time.sleep(2)
295
+ sauvegarder(indexed_specifications, documents_by_spec_num)
296
+ print("Arrêt propre du script.", flush=True)
297
+ sys.exit(0)
298
+ except Exception as e:
299
+ print(f"\nErreur inattendue : {e}", flush=True)
300
+ sauvegarder(indexed_specifications, documents_by_spec_num)
301
+ sys.exit(1)
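As a quick sanity check of the helpers above: version_to_code() maps each dotted component to a single base-36 character when all three parts are below 36 (zero-padded decimal otherwise), and that code is what builds both the archive URL and the cache hash. A small worked example (spec number and version are illustrative):

CHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
# version_to_code("18.4.0") -> CHARS[18] + CHARS[4] + CHARS[0] == "i40"
# version_to_code("17.40.1") -> a part >= 36 falls back to zero-padding == "174001"
spec, code = "24.301", "i40"
series = spec.split(".")[0]
url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{spec}/{spec.replace('.', '')}-{code}.zip"
# -> https://www.3gpp.org/ftp/Specs/archive/24_series/24.301/24301-i40.zip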
indexer_multi.py → tdoc_indexer.py RENAMED
@@ -1,6 +1,7 @@
1
  from datetime import datetime
2
  import requests
3
  from bs4 import BeautifulSoup
 
4
  import json
5
  import os
6
  import time
@@ -14,9 +15,9 @@ warnings.filterwarnings("ignore")
14
 
15
  class TsgDocIndexer:
16
  def __init__(self, max_workers=10):
17
- self.main_ftp_url = "https://www.3gpp.org/ftp"
18
- self.indexer_file = "indexed_docs.json"
19
- self.indexer, self.latest_date = self.load_indexer()
20
  self.valid_doc_pattern = re.compile(r'^(S[1-6P]|C[1-6P]|R[1-6P])-\d+', flags=re.IGNORECASE)
21
  self.max_workers = max_workers
22
 
@@ -31,19 +32,21 @@ class TsgDocIndexer:
31
 
32
  def load_indexer(self):
33
  """Load existing index if available"""
34
- if os.path.exists(self.indexer_file):
35
- with open(self.indexer_file, "r", encoding="utf-8") as f:
36
- x = json.load(f)
37
- return x["docs"], x["last_indexed_date"]
38
- return {}, None
 
39
 
40
  def save_indexer(self):
41
  """Save the updated index"""
42
- with open(self.indexer_file, "w", encoding="utf-8") as f:
43
- today = datetime.today()
44
- self.latest_date = today.strftime("%d/%m/%Y-%H:%M:%S")
45
- output = {"docs": self.indexer, "last_indexed_date": self.latest_date}
46
- json.dump(output, f, indent=4, ensure_ascii=False)
 
47
 
48
  def get_docs_from_url(self, url):
49
  """Récupérer la liste des documents/répertoires depuis une URL"""
@@ -197,10 +200,6 @@ class TsgDocIndexer:
197
 
198
  # Attendre que toutes les tâches soient terminées
199
  concurrent.futures.wait(futures)
200
-
201
- # Sauvegarder après chaque groupe de travail
202
- with self.indexer_lock:
203
- self.save_indexer()
204
 
205
  def index_all_tdocs(self):
206
  """Indexer tous les documents ZIP dans la structure FTP 3GPP avec multithreading"""
@@ -254,10 +253,6 @@ class TsgDocIndexer:
254
  for meeting in meeting_folders if meeting not in ['./', '../']]
255
  concurrent.futures.wait(futures)
256
 
257
- # Sauvegarder après chaque groupe de travail
258
- with self.indexer_lock:
259
- self.save_indexer()
260
-
261
  docs_count_after = len(self.indexer)
262
  new_docs_count = docs_count_after - docs_count_before
263
 
 
1
  from datetime import datetime
2
  import requests
3
  from bs4 import BeautifulSoup
4
+ from datasets import load_dataset, Dataset
5
  import json
6
  import os
7
  import time
 
15
 
16
  class TsgDocIndexer:
17
  def __init__(self, max_workers=10):
18
+ self.indexer = self.load_indexer()
19
+ self.main_ftp_url = "https://3gpp.org/ftp"
20
+ self.dataset = load_dataset("OrganizedProgrammers/3GPPTDocLocation")
21
  self.valid_doc_pattern = re.compile(r'^(S[1-6P]|C[1-6P]|R[1-6P])-\d+', flags=re.IGNORECASE)
22
  self.max_workers = max_workers
23
 
 
32
 
33
  def load_indexer(self):
34
  """Load existing index if available"""
35
+ all_docs = {}
36
+ tdoc_locations = load_dataset("OrganizedProgrammers/3GPPTDocLocation", token=os.environ["HF_TOKEN"])
37
+ tdoc_locations = tdoc_locations["train"].to_list()
38
+ for doc in tdoc_locations:
39
+ all_docs[doc["doc_id"]] = doc["url"]
40
+ return all_docs
41
 
42
  def save_indexer(self):
43
  """Save the updated index"""
44
+ data = []
45
+ for doc_id, url in self.indexer.items():
46
+ data.append({"doc_id": doc_id, "url": url})
47
+
48
+ dataset = Dataset.from_list(data)
49
+ dataset.push_to_hub("OrganizedProgrammers/3GPPTDocLocation", token=os.environ["HF_TOKEN"])
50
 
51
  def get_docs_from_url(self, url):
52
  """Récupérer la liste des documents/répertoires depuis une URL"""
 
200
 
201
  # Attendre que toutes les tâches soient terminées
202
  concurrent.futures.wait(futures)
203
 
204
  def index_all_tdocs(self):
205
  """Indexer tous les documents ZIP dans la structure FTP 3GPP avec multithreading"""
 
253
  for meeting in meeting_folders if meeting not in ['./', '../']]
254
  concurrent.futures.wait(futures)
255
 
256
  docs_count_after = len(self.indexer)
257
  new_docs_count = docs_count_after - docs_count_before
258