V3: Add ETSI source + added scripts

Files changed:
- bm25_maker.py → 3gpp_bm25_maker.py (+0 -0)
- spec_indexer.py → 3gpp_spec_indexer.py (+0 -0)
- tdoc_indexer.py → 3gpp_tdoc_indexer.py (+0 -0)
- app.py (+35 -17)
- etsi_bm25_maker.py (+67 -0)
- etsi_spec_indexer.py (+255 -0)
- requirements.txt (+2 -1)
bm25_maker.py → 3gpp_bm25_maker.py
RENAMED (file without changes)

spec_indexer.py → 3gpp_spec_indexer.py
RENAMED (file without changes)

tdoc_indexer.py → 3gpp_tdoc_indexer.py
RENAMED (file without changes)
app.py
CHANGED

@@ -8,9 +8,12 @@ import gradio as gr
 load_dotenv()
 hf_token = os.environ["HF_TOKEN"]
 
-…
-…
-…
+SCRIPT_DOC_3GPP = "3gpp_tdoc_indexer.py"
+SCRIPT_SPEC_3GPP = "3gpp_spec_indexer.py"
+SCRIPT_BM25_3GPP = "3gpp_bm25_maker.py"
+
+SCRIPT_SPEC_ETSI = "etsi_spec_indexer.py"
+SCRIPT_BM25_ETSI = "etsi_bm25_maker.py"
 
 def get_script_output(script_path, current_log=""):
     accumulated_output = current_log
@@ -35,26 +38,40 @@ def get_script_output(script_path, current_log=""):
 def index_tdocs():
     log_output = "⏳ Indexation en cours...\n"
     yield gr.update(interactive=False), gr.update(interactive=False), log_output
-    for log in get_script_output(…
-        yield gr.update(interactive=False), gr.update(interactive=False), log
+    for log in get_script_output(SCRIPT_DOC_3GPP, log_output):
+        yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
         log_output = log
 
     log_output += "✅ Terminé.\n"
-    yield gr.update(interactive=True), gr.update(interactive=True), log_output
+    yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
 
-def …
+def index_3gpp_specifications():
     log_output = "⏳ Indexation en cours...\n"
-    yield gr.update(interactive=False), gr.update(interactive=False), log_output
-    for log in get_script_output(…
-        yield gr.update(interactive=False), gr.update(interactive=False), log
+    yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log_output
+    for log in get_script_output(SCRIPT_SPEC_3GPP, log_output):
+        yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
+        log_output = log
+
+    for log in get_script_output(SCRIPT_BM25_3GPP, log_output):
+        yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
+        log_output = log
+
+    log_output += "✅ Terminé.\n"
+    yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
+
+def index_etsi_specifications():
+    log_output = "⏳ Indexation en cours...\n"
+    yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log_output
+    for log in get_script_output(SCRIPT_SPEC_ETSI, log_output):
+        yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
         log_output = log
 
-    for log in get_script_output(…
-        yield gr.update(interactive=False), gr.update(interactive=False), log
+    for log in get_script_output(SCRIPT_BM25_ETSI, log_output):
+        yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
         log_output = log
 
     log_output += "✅ Terminé.\n"
-    yield gr.update(interactive=True), gr.update(interactive=True), log_output
+    yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📄 3GPP Indexer Main Menu")
@@ -63,11 +80,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Column():
         tdocs_btn = gr.Button("Re-index TDocs", variant="primary")
     with gr.Column():
-        …
-        …
+        spec_btn_3gpp = gr.Button("Re-index 3GPP Specifications", variant="primary")
+        spec_btn_etsi = gr.Button("Re-index ETSI Specifications", variant="primary")
     out = gr.Textbox(label="Output", lines=25, autoscroll=True, interactive=False)
 
-    tdocs_btn.click(index_tdocs, outputs=[tdocs_btn, …
-    …
+    tdocs_btn.click(index_tdocs, outputs=[tdocs_btn, spec_btn_3gpp, spec_btn_etsi, out])
+    spec_btn_3gpp.click(index_3gpp_specifications, outputs=[tdocs_btn, spec_btn_3gpp, spec_btn_etsi, out])
+    spec_btn_etsi.click(index_etsi_specifications, outputs=[tdocs_btn, spec_btn_3gpp, spec_btn_etsi, out])
 
 demo.queue().launch()
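All three handlers follow the same Gradio pattern: a generator used as a click callback whose outputs list is [tdocs_btn, spec_btn_3gpp, spec_btn_etsi, out], so each yield supplies one gr.update(...) per button plus the accumulated log; yielding interactive=False keeps every button disabled while a script streams, and the final yield re-enables them. (One quirk visible in the diff: the first yield in index_tdocs is an unchanged line that still emits only two gr.update values for the four outputs, apparently a leftover from the two-button version.) Below is a minimal self-contained sketch of the pattern, not part of the commit; the single button, the script name and the stream_script helper are illustrative stand-ins for app.py's get_script_output:

import subprocess
import sys
import gradio as gr

def stream_script(script_path, log=""):
    # Rough stand-in for app.py's get_script_output: run the script and
    # yield the growing log after each line of stdout.
    proc = subprocess.Popen([sys.executable, script_path],
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    for line in proc.stdout:
        log += line
        yield log

def run_job():
    log = "⏳ running...\n"
    yield gr.update(interactive=False), log           # one gr.update per button in outputs
    for log in stream_script("some_script.py", log):  # hypothetical script name
        yield gr.update(interactive=False), log
    yield gr.update(interactive=True), log + "✅ done\n"

with gr.Blocks() as demo:
    btn = gr.Button("Run")
    out = gr.Textbox(label="Output", lines=10)
    btn.click(run_job, outputs=[btn, out])

demo.queue().launch()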
etsi_bm25_maker.py
ADDED

from typing import Optional
import os, warnings
os.environ["CURL_CA_BUNDLE"] = ''
from dotenv import load_dotenv
warnings.filterwarnings("ignore")
load_dotenv()
import bm25s
from bm25s.hf import BM25HF
from datasets import load_dataset
unique_specs = set()

dataset_text = load_dataset("OrganizedProgrammers/ETSISpecContent", token=os.environ.get("HF_TOKEN"))
dataset_metadata = load_dataset("OrganizedProgrammers/ETSISpecMetadata", token=os.environ.get("HF_TOKEN"))

dataset_text = dataset_text["train"].to_list()
dataset_metadata = dataset_metadata["train"].to_list()

corpus_json = []

def get_document(spec_id: str, spec_title: Optional[str]):
    text = [f"{spec_id} - {spec_title}\n" if spec_title else f"{spec_id}\n"]
    for section in dataset_text:
        if spec_id == section["doc_id"]:
            text.extend([f"{section['section']}\n\n{section['content']}"])
    return text

for specification in dataset_metadata:
    if specification['id'] in unique_specs: continue
    for section in dataset_text:
        if specification['id'] == section['doc_id']:
            corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
                "id": specification['id'],
                "title": specification['title'],
                "section_title": section['section'],
                "version": specification['version'],
                "type": specification['type'],
                "url": specification['url'],
                "scope": specification['scope']
            }})
    unique_specs.add(specification['id'])

corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

retriever = BM25HF(corpus=corpus_json)
retriever.index(corpus_tokens)

retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSections", token=os.environ.get("HF_TOKEN"))

unique_specs = set()
corpus_json = []

for specification in dataset_metadata:
    if specification['id'] in unique_specs: continue
    text_list = get_document(specification['id'], specification['title'])
    text = "\n".join(text_list)
    if len(text_list) == 1: continue
    corpus_json.append({"text": text, "metadata": specification})
    unique_specs.add(specification['id'])

corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")

retriever = BM25HF(corpus=corpus_json)
retriever.index(corpus_tokens)

retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSingle", token=os.environ.get("HF_TOKEN"))
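The script therefore publishes two indexes built from the same datasets: ETSIBM25IndexSections, with one entry per (specification, section) pair, and ETSIBM25IndexSingle, with one entry per whole specification assembled by get_document. A hedged sketch of how a consumer could query the section index later, using the standard bm25s/BM25HF API (the query string and k are illustrative):

import os
import bm25s
from bm25s.hf import BM25HF

# Load the index and its stored corpus back from the Hub.
retriever = BM25HF.load_from_hub(
    "OrganizedProgrammers/ETSIBM25IndexSections",
    load_corpus=True,
    token=os.environ.get("HF_TOKEN"),
)

# Tokenize the query the same way the corpus was tokenized.
query_tokens = bm25s.tokenize("lawful interception architecture", stopwords="en")
results, scores = retriever.retrieve(query_tokens, k=5)

for i in range(results.shape[1]):
    doc = results[0, i]  # each hit is one corpus_json entry: {"text", "metadata"}
    print(f"{scores[0, i]:.2f}", doc["metadata"]["id"], doc["metadata"]["section_title"])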
etsi_spec_indexer.py
ADDED

import os
import time
import warnings
from dotenv import load_dotenv
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")
os.environ["CURL_CA_BUNDLE"] = ""
load_dotenv()

from datasets import load_dataset, Dataset
from datasets.data_files import EmptyDatasetError
import threading
import zipfile
import sys
import requests
import fitz
import re
import json
import traceback
import io
import concurrent.futures
import hashlib

CHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
DICT_LOCK = threading.Lock()
DOCUMENT_LOCK = threading.Lock()
STOP_EVENT = threading.Event()

documents_by_spec_num = {}

try:
    spec_contents = load_dataset("OrganizedProgrammers/ETSISpecContent", token=os.environ.get("HF_TOKEN"))
    spec_contents = spec_contents["train"].to_list()

    for section in spec_contents:
        if section["doc_id"] not in documents_by_spec_num.keys():
            documents_by_spec_num[section["doc_id"]] = {"content": {section["section"]: section["content"]}, "hash": section["hash"]}
        else:
            documents_by_spec_num[section["doc_id"]]["content"][section["section"]] = section["content"]
except EmptyDatasetError as e:
    print("Base de données vide !")
indexed_specifications = {}
specifications_passed = set()
processed_count = 0
total_count = 0

session = requests.Session()
req = session.post("https://portal.etsi.org/ETSIPages/LoginEOL.ashx", verify=False, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"}, data=json.dumps({"username": os.environ.get("EOL_USER"), "password": os.environ.get("EOL_PASSWORD")}))
print("Récupération des spécifications depuis ETSI...", req.status_code)

url_ts = "https://www.etsi.org/?option=com_standardssearch&view=data&format=csv&includeScope=1&page=1&search=&title=1&etsiNumber=1&content=0&version=0&onApproval=0&published=1&withdrawn=0&historical=0&isCurrent=1&superseded=0&harmonized=0&keyword=&TB=&stdType=TS&frequency=&mandate=&collection=&sort=1"
url_tr = url_ts.replace("stdType=TS", "stdType=TR")

data_ts = requests.get(url_ts, verify=False).content
data_tr = requests.get(url_tr, verify=False).content

df_ts = pd.read_csv(io.StringIO(data_ts.decode('utf-8')), sep=";", skiprows=1, index_col=False)
df_tr = pd.read_csv(io.StringIO(data_tr.decode('utf-8')), sep=";", skiprows=1, index_col=False)

backup_ts = df_ts["ETSI deliverable"]
backup_tr = df_tr["ETSI deliverable"]

df_ts["ETSI deliverable"] = df_ts["ETSI deliverable"].str.extract(r"\s*ETSI TS (\d+ \d+(?:-\d+(?:-\d+)?)?)")
df_tr["ETSI deliverable"] = df_tr["ETSI deliverable"].str.extract(r"\s*ETSI TR (\d+ \d+(?:-\d+(?:-\d+)?)?)")

version1 = backup_ts.str.extract(r"\s*ETSI TS \d+ \d+(?:-\d+(?:-\d+)?)? V(\d+\.\d+\.\d+)")
version2 = backup_tr.str.extract(r"\s*ETSI TR \d+ \d+(?:-\d+(?:-\d+)?)? V(\d+\.\d+\.\d+)")

df_ts["Version"] = version1[0]
df_tr["Version"] = version2[0]

def ver_tuple(v):
    return tuple(map(int, v.split(".")))

df_ts["temp"] = df_ts["Version"].apply(ver_tuple)
df_tr["temp"] = df_tr["Version"].apply(ver_tuple)

df_ts["Type"] = "TS"
df_tr["Type"] = "TR"

df = pd.concat([df_ts, df_tr])

unique_df = df.loc[df.groupby("ETSI deliverable")["temp"].idxmax()]
unique_df = unique_df.drop(columns="temp")
unique_df = unique_df[(~unique_df["title"].str.contains("3GPP", case=True, na=False))]

df = df.drop(columns="temp")
df = df[(~df["title"].str.contains("3GPP", case=True, na=False))]

def get_text(specification: str):
    if STOP_EVENT.is_set():
        return None, []

    print(f"\n[INFO] Tentative de récupération de la spécification {specification}", flush=True)
    response = session.get(
        unique_df[unique_df["ETSI deliverable"] == specification].iloc[0]["PDF link"],
        verify=False,
        headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    )

    if response.status_code != 200:
        print(f"\n[ERREUR] Echec du téléchargement du PDF pour {specification}. {req.status_code}", flush=True)
        return None, []

    pdf = fitz.open(stream=response.content, filetype="pdf")
    return pdf, pdf.get_toc()

def get_spec_content(specification: str):
    def extract_sections(text, titles):
        sections = {}
        # On trie les titres selon leur position dans le texte
        sorted_titles = sorted(titles, key=lambda t: text.find(t))
        for i, title in enumerate(sorted_titles):
            start = text.find(title)
            if i + 1 < len(sorted_titles):
                end = text.find(sorted_titles[i + 1])
                sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:end].replace(title, "").strip().rstrip())
            else:
                sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:].replace(title, "").strip().rstrip())
        return sections
    if STOP_EVENT.is_set():
        return {}
    print("\n[INFO] Tentative de récupération du texte", flush=True)
    pdf, doc_toc = get_text(specification)
    text = []
    first = 0
    for level, title, page in doc_toc:
        first = page - 1
        break
    for page in pdf[first:]:
        text.append("\n".join([line.strip() for line in page.get_text().splitlines()]))
    text = "\n".join(text)

    if not text or STOP_EVENT.is_set() or not doc_toc:
        print("\n[ERREUR] Pas de texte/table of contents trouvé !")
        return {}
    print(f"\n[INFO] Texte {specification} récupéré", flush=True)
    titles = []
    for level, title, page in doc_toc:
        if STOP_EVENT.is_set():
            return {}
        if title[0].isnumeric() and '\n'.join(title.strip().split(" ", 1)) in text:
            titles.append('\n'.join(title.strip().split(" ", 1)))

    return extract_sections(text, titles)

def hasher(specification: str, version: str):
    return hashlib.md5(f"{specification}{version}".encode()).hexdigest()

def get_scope(content):
    for title, text in content.items():
        if title.lower().endswith("scope"):
            return text
    return ""

def process_specification(spec):
    global processed_count, indexed_specifications, documents_by_spec_num
    if STOP_EVENT.is_set():
        return
    try:
        version = spec.get('Version')
        if not version: return
        doc_id = str(spec.get("ETSI deliverable"))
        document = None
        with DOCUMENT_LOCK:
            if doc_id in documents_by_spec_num and documents_by_spec_num[doc_id]["hash"] == hasher(doc_id, version) and not doc_id in specifications_passed:
                document = documents_by_spec_num[doc_id]
                specifications_passed.add(doc_id)
                print(f"\n[INFO] Document déjà présent pour {doc_id} (version {spec['Version']})", flush=True)
            elif doc_id in specifications_passed:
                document = documents_by_spec_num[doc_id]
                print(f"\n[INFO] Document déjà présent pour {doc_id} [dernière version présent]")
            else:
                print(f"\n[INFO] Tentative de récupération du document {doc_id} (version {spec['Version']})", flush=True)
                document = get_spec_content(doc_id)
                if document:
                    documents_by_spec_num[doc_id] = {"content": document, "hash": hasher(doc_id, version)}
                    document = {"content": document, "hash": hasher(doc_id, version)}
                    specifications_passed.add(doc_id)
                    print(f"\n[INFO] Document extrait pour {doc_id} (version {spec['Version']})", flush=True)

        string_key = f"{doc_id}+-+{spec['title']}+-+{spec['Type']}+-+{spec['Version']}"
        metadata = {
            "id": str(doc_id),
            "title": spec["title"],
            "type": spec["Type"],
            "version": version,
            "url": spec["PDF link"],
            "scope": "" if not document else get_scope(document["content"])
        }
        with DICT_LOCK:
            indexed_specifications[string_key] = metadata
            processed_count += 1
            sys.stdout.write(f"\rTraitement: {processed_count}/{total_count} spécifications...")
            sys.stdout.flush()
    except Exception as e:
        traceback.print_exception(e)
        print(f"\n[ERREUR] Échec du traitement de {doc_id} {version}: {e}", flush=True)

def sauvegarder(indexed_specifications, documents_by_spec_num):
    print("\nSauvegarde en cours...", flush=True)

    flat_metadata = [metadata for _, metadata in indexed_specifications.items()]
    flat_docs = []
    for doc_id, data in documents_by_spec_num.items():
        for title, content in data["content"].items():
            flat_docs.append({"hash": data["hash"], "doc_id": doc_id, "section": title, "content": content})

    push_spec_content = Dataset.from_list(flat_docs)
    push_spec_metadata = Dataset.from_list(flat_metadata)
    push_spec_content.push_to_hub("OrganizedProgrammers/ETSISpecContent", token=os.environ["HF_TOKEN"])
    push_spec_metadata.push_to_hub("OrganizedProgrammers/ETSISpecMetadata", token=os.environ["HF_TOKEN"])
    print("Sauvegarde terminée.", flush=True)

def main():
    global total_count
    start_time = time.time()

    specifications = df.to_dict(orient="records")
    total_count = len(specifications)
    print(f"Traitement de {total_count} spécifications avec multithreading...")
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
            futures = [executor.submit(process_specification, spec) for spec in specifications]
            while True:
                if all(f.done() for f in futures):
                    break
                if STOP_EVENT.is_set():
                    break
                time.sleep(0.35)
    except Exception as e:
        print(f"\nErreur inattendue dans le ThreadPool : {e}", flush=True)
    print("\nSauvegarde des résultats...", flush=True)
    sauvegarder(indexed_specifications, documents_by_spec_num)
    elapsed_time = time.time() - start_time
    print(f"\nTraitement terminé en {elapsed_time:.2f} secondes.", flush=True)

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nInterruption détectée (Ctrl+C). Arrêt des tâches en cours...", flush=True)
        STOP_EVENT.set()
        time.sleep(2)
        sauvegarder(indexed_specifications, documents_by_spec_num)
        print("Arrêt propre du script.", flush=True)
        sys.exit(0)
    except Exception as e:
        print(f"\nErreur inattendue : {e}", flush=True)
        sauvegarder(indexed_specifications, documents_by_spec_num)
        sys.exit(1)

# print(get_spec_content("188 005-1"))
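Two mechanisms in the indexer are worth a small illustration. Version deduplication parses "major.minor.patch" strings into integer tuples (ver_tuple), so "1.10.1" correctly sorts above "1.9.1", then keeps the row with the highest tuple per deliverable via groupby(...)["temp"].idxmax(). A self-contained sketch of that step, with made-up deliverable numbers, mirroring the script's own technique:

import pandas as pd

df = pd.DataFrame({
    "ETSI deliverable": ["103 097", "103 097", "102 940"],
    "Version": ["1.9.1", "1.10.1", "2.1.1"],
})
# Compare versions as integer tuples, then keep the index of the
# maximum per deliverable, exactly as etsi_spec_indexer.py does.
df["temp"] = df["Version"].apply(lambda v: tuple(map(int, v.split("."))))
latest = df.loc[df.groupby("ETSI deliverable")["temp"].idxmax()].drop(columns="temp")
print(latest)  # one row per deliverable; 1.10.1 beats 1.9.1

The second mechanism is change detection: hasher(doc_id, version) stores an MD5 of the id+version string next to each extracted document, so a later run can skip re-downloading and re-parsing any specification whose latest version has not changed.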
requirements.txt
CHANGED

@@ -6,4 +6,5 @@ numpy
 python-dotenv
 gradio
 bm25s[full]
-lxml
+lxml
+fitz