om4r932 committed
Commit 22ee398 · 1 Parent(s): ebe17cc

V2 (changed data storage method + rework)
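In short, V2 stops committing index artifacts (indexed_docs.json, indexed_specifications.json, indexed_docs_content.zip, bm25s.zip) back into the Spaces and instead pushes everything to Hugging Face datasets. A minimal sketch (not part of the commit) of reading those datasets back, assuming the repo IDs used in the scripts below and an HF_TOKEN available in the environment:

import os
from datasets import load_dataset

token = os.environ.get("HF_TOKEN")
spec_content = load_dataset("OrganizedProgrammers/3GPPSpecContent", token=token)["train"]
spec_metadata = load_dataset("OrganizedProgrammers/3GPPSpecMetadata", token=token)["train"]
tdoc_locations = load_dataset("OrganizedProgrammers/3GPPTDocLocation", token=token)["train"]
# Each 3GPPSpecContent row carries: hash, doc_id, section, content (see sauvegarder() in spec_indexer.py)
print(spec_content[0]["doc_id"], spec_content[0]["section"])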
app.py CHANGED
@@ -1,264 +1,71 @@
1
- from datetime import datetime
2
- import os
3
- import warnings
4
- import traceback
5
- import gradio as gr
6
  import subprocess
7
- from huggingface_hub import Repository
8
- from git import Repo
9
- import requests
10
-
11
- warnings.filterwarnings('ignore')
12
-
13
- DOC_INDEXER = "indexer_multi.py"
14
- SPEC_INDEXER = "spec_indexer_multi.py"
15
- SPEC_DOC_INDEXER = "spec_doc_indexer_multi.py"
16
- BM25_INDEXER = "bm25_maker.py"
17
-
18
- DOC_INDEX_FILE = "indexed_docs.json"
19
- SPEC_INDEX_FILE = "indexed_specifications.json"
20
- SPEC_DOC_INDEX_FILE = "indexed_docs_content.zip"
21
- BM25_INDEX_FILE = "bm25s.zip"
22
-
23
- HF_SEARCH_REPO = "OrganizedProgrammers/3GPPDocFinder"
24
- REPO_DIR = os.path.dirname(os.path.abspath(__file__))
25
-
26
- def get_docs_stats():
27
- if os.path.exists(DOC_INDEX_FILE):
28
- import json
29
- with open(DOC_INDEX_FILE, 'r', encoding='utf-8') as f:
30
- data = json.load(f)
31
- return len(data["docs"])
32
- return 0
33
-
34
- def get_specs_stats():
35
- if os.path.exists(SPEC_INDEX_FILE):
36
- import json
37
- with open(SPEC_INDEX_FILE, 'r', encoding='utf-8') as f:
38
- data = json.load(f)
39
- return len(data["specs"])
40
- return 0
41
-
42
- def get_scopes_stats():
43
- if os.path.exists(SPEC_INDEX_FILE):
44
- import json
45
- with open(SPEC_INDEX_FILE, 'r', encoding="utf-8") as f:
46
- data = json.load(f)
47
- return len(data['scopes'])
48
- return 0
49
-
50
- def check_permissions(user: str, token: str):
51
- try:
52
- req = requests.get("https://huggingface.co/api/whoami-v2", verify=False, headers={"Accept": "application/json", "Authorization": f"Bearer {token}"})
53
- if req.status_code != 200:
54
- return False
55
- reqJson: dict = req.json()
56
- if not reqJson.get("name") or reqJson['name'] != user:
57
- return False
58
- if not reqJson.get("orgs") or len(reqJson['orgs']) == 0:
59
- return False
60
- for org in reqJson['orgs']:
61
- if "645cfa1b5ebf379fd6d8a339" == org['id']:
62
- return True
63
- if not reqJson.get('auth') or reqJson['auth'] == {}:
64
- return False
65
- if reqJson['auth']['accessToken']['role'] != "fineGrained":
66
- return False
67
- for scope in reqJson['auth']['accessToken']['fineGrained']['scoped']:
68
- if scope['entity']['type'] == "org" and scope['entity']['_id'] == "645cfa1b5ebf379fd6d8a339" and all(perm in scope['permissions'] for perm in ['repo.write', 'repo.content.read']):
69
- return True
70
- return False
71
- except Exception as e:
72
- traceback.print_exception(e)
73
- return False
74
-
75
- def update_logged(user: str, token: str):
76
- if check_permissions(user, token):
77
- return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
78
- else:
79
- return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
80
-
81
- def commit_and_push_3gppindexers(user, token, files, message, current_log=""):
82
- log = current_log + "\n"
83
- repo = Repo(REPO_DIR)
84
- origin = repo.remotes.origin
85
- repo.config_writer().set_value("user", "name", "3GPP Indexer Automatic Git Tool").release()
86
- repo.config_writer().set_value("user", "email", "example@mail.org").release()
87
- origin.pull()
88
- log += "Git pull succeed !\n"
89
- yield log
90
-
91
- repo.git.add(files)
92
- repo.index.commit(message)
93
-
94
- try:
95
- repo.git.push(f"https://{user}:{token}@huggingface.co/spaces/OrganizedProgrammers/3GPPIndexers")
96
- log += "Git push succeed !\n"
97
- yield log
98
- log += "Wait for Huggingface to restart the Space\n"
99
- yield log
100
- except Exception as e:
101
- log += f"Git push failed: {e}\n"
102
- yield log
103
 
104
- def commit_and_push_3gppdocfinder(token, files, message, current_log=""):
105
- log = current_log + "\n"
106
- if not token:
107
- log += "No token provided. Skipping HuggingFace push.\n"
108
- yield log
109
- return
110
-
111
- hf_repo_dir = os.path.join(REPO_DIR, "hf_spaces")
112
- repo = None
113
-
114
- if not os.path.exists(hf_repo_dir):
115
- repo = Repository(
116
- local_dir=hf_repo_dir,
117
- repo_type="space",
118
- clone_from=HF_SEARCH_REPO,
119
- git_user="3GPP Indexer Automatic Git Tool",
120
- git_email="example@mail.org",
121
- token=token,
122
- skip_lfs_files=True
123
- )
124
- else:
125
- repo = Repository(
126
- local_dir=hf_repo_dir,
127
- repo_type="space",
128
- git_user="3GPP Indexer Automatic Git Tool",
129
- git_email="example@mail.org",
130
- token=token,
131
- skip_lfs_files=True
132
- )
133
-
134
- repo.git_pull()
135
-
136
- # Copy artifact files to huggingface space
137
- for f in files:
138
- import shutil
139
- shutil.copy2(f, os.path.join(hf_repo_dir, f))
140
-
141
- repo.git_add(auto_lfs_track=True)
142
- repo.git_commit(message)
143
- repo.git_push()
144
-
145
- log += "Pushed to HuggingFace.\n"
146
- yield log
147
 
148
- def refresh_stats():
149
- return str(get_docs_stats()), str(get_specs_stats()), str(get_scopes_stats())
 
150
 
151
- def stream_script_output(script_path, current_log=""):
152
  accumulated_output = current_log
153
-
154
  process = subprocess.Popen(
155
  ["python", script_path],
156
  stdout=subprocess.PIPE,
157
  stderr=subprocess.STDOUT,
158
  bufsize=1,
159
- universal_newlines=True,
160
  )
161
-
162
  for line in process.stdout:
163
  accumulated_output += line
164
  yield accumulated_output
165
 
166
  process.stdout.close()
167
  process.wait()
168
-
169
  yield accumulated_output
170
 
171
- def index_documents(user, token):
172
  log_output = "⏳ Indexation en cours...\n"
173
- # Désactiver tous les boutons
174
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log_output
175
-
176
- # Lancer l'indexation
177
- if not check_permissions(user, token):
178
- log_output += "❌ Identifiants invalides\n"
179
- yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
180
- return
181
-
182
- for log in stream_script_output(DOC_INDEXER, log_output):
183
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
184
- log_output = log
185
-
186
- d = datetime.today().strftime("%d/%m/%Y-%H:%M:%S")
187
-
188
- for log in commit_and_push_3gppdocfinder(token, [DOC_INDEX_FILE], f"Update documents indexer via Indexer: {d}", log_output):
189
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
190
- log_output = log
191
-
192
- for log in commit_and_push_3gppindexers(user, token, [DOC_INDEX_FILE], f"Update documents indexer via Indexer: {d}", log_output):
193
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
194
  log_output = log
195
-
196
- # Réactiver les boutons à la fin
197
  log_output += "✅ Terminé.\n"
198
- yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
199
 
200
- def index_specifications(user, token):
201
  log_output = "⏳ Indexation en cours...\n"
202
- # Désactiver tous les boutons
203
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log_output
204
-
205
- # Lancer l'indexation
206
- if not check_permissions(user, token):
207
- log_output += "❌ Identifiants invalides\n"
208
- yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
209
- return
210
-
211
- for log in stream_script_output(SPEC_INDEXER, log_output):
212
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
213
- log_output = log
214
-
215
- for log in stream_script_output(SPEC_DOC_INDEXER, log_output):
216
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
217
- log_output = log
218
-
219
- for log in stream_script_output(BM25_INDEXER, log_output):
220
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
221
- log_output = log
222
-
223
- d = datetime.today().strftime("%d/%m/%Y-%H:%M:%S")
224
-
225
- for log in commit_and_push_3gppdocfinder(token, [SPEC_DOC_INDEX_FILE, BM25_INDEX_FILE, SPEC_INDEX_FILE], f"Update specifications indexer via Indexer: {d}", log_output):
226
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
227
  log_output = log
228
 
229
- for log in commit_and_push_3gppindexers(user, token, [SPEC_DOC_INDEX_FILE, BM25_INDEX_FILE, SPEC_INDEX_FILE], f"Update specifications indexer via Indexer: {d}", log_output):
230
- yield gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False), log
231
  log_output = log
232
 
233
- # Réactiver les boutons à la fin
234
  log_output += "✅ Terminé.\n"
235
- yield gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True), log_output
236
 
237
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
238
- gr.Markdown("## 📄 3GPP Indexers")
239
-
240
  with gr.Row() as r1:
241
  with gr.Column():
242
- git_user = gr.Textbox(label="Git user (for push/pull indexes)")
243
- git_pass = gr.Textbox(label="Git Token", type="password")
244
- btn_login = gr.Button("Login", variant="primary")
245
-
246
- with gr.Row(visible=False) as r2:
247
- with gr.Column():
248
- doc_count = gr.Textbox(label="Docs Indexed", value=str(get_docs_stats()), interactive=False)
249
- btn_docs = gr.Button("Re-index Documents", variant="primary")
250
  with gr.Column():
251
- spec_count = gr.Textbox(label="Specs Indexed", value=str(get_specs_stats()), interactive=False)
252
- btn_specs = gr.Button("Re-index Specifications", variant="primary")
253
- with gr.Column():
254
- scope_count = gr.Textbox(label="Scopes Indexed", value=str(get_scopes_stats()), interactive=False)
255
-
256
- out = gr.Textbox(label="Output/Log", lines=13, autoscroll=True, visible=False)
257
- refresh = gr.Button(value="🔄 Refresh Stats", visible=False)
258
-
259
- btn_login.click(update_logged, inputs=[git_user, git_pass], outputs=[r1, r2, out, refresh])
260
- btn_docs.click(index_documents, inputs=[git_user, git_pass], outputs=[btn_docs, btn_specs, refresh, out])
261
- btn_specs.click(index_specifications, inputs=[git_user, git_pass], outputs=[btn_docs, btn_specs, refresh, out])
262
- refresh.click(refresh_stats, outputs=[doc_count, spec_count, scope_count])
263
 
264
- demo.launch()
 
1
  import subprocess
2
+ import warnings, os
3
+ warnings.filterwarnings("ignore")
4
+ os.environ["CURL_CA_BUNDLE"] = ""
5
+ from dotenv import load_dotenv
6
+ import gradio as gr
7
 
8
+ load_dotenv()
9
+ hf_token = os.environ["HF_TOKEN"]
10
 
11
+ SCRIPT_DOC = "tdoc_indexer.py"
12
+ SCRIPT_SPEC = "spec_indexer.py"
13
+ SCRIPT_BM25 = "bm25_maker.py"
14
 
15
+ def get_script_output(script_path, current_log=""):
16
  accumulated_output = current_log
17
+
18
  process = subprocess.Popen(
19
  ["python", script_path],
20
  stdout=subprocess.PIPE,
21
  stderr=subprocess.STDOUT,
22
  bufsize=1,
23
+ universal_newlines=True
24
  )
25
+
26
  for line in process.stdout:
27
  accumulated_output += line
28
  yield accumulated_output
29
 
30
  process.stdout.close()
31
  process.wait()
32
+
33
  yield accumulated_output
34
 
35
+ def index_tdocs():
36
  log_output = "⏳ Indexation en cours...\n"
37
+ for log in get_script_output(SCRIPT_DOC):
38
+ yield log
39
  log_output = log
40
+
 
41
  log_output += "✅ Terminé.\n"
42
+ yield log_output
43
 
44
+ def index_specifications():
45
  log_output = "⏳ Indexation en cours...\n"
46
+ for log in get_script_output(SCRIPT_SPEC):
47
+ yield log
48
  log_output = log
49
 
50
+ for log in get_script_output(SCRIPT_BM25):
51
+ yield log
52
  log_output = log
53
 
 
54
  log_output += "✅ Terminé.\n"
55
+ yield log_output
56
 
57
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
58
+ gr.Markdown("# 📄 3GPP Indexer Main Menu")
59
+
60
  with gr.Row() as r1:
61
  with gr.Column():
62
+ tdocs_btn = gr.Button("Re-index TDocs", variant="primary")
63
  with gr.Column():
64
+ spec_btn = gr.Button("Re-index Specifications", variant="primary")
65
+
66
+ out = gr.Textbox(label="Output", lines=25, autoscroll=True, interactive=False)
67
+
68
+ tdocs_btn.click(index_tdocs, outputs=[out])
69
+ spec_btn.click(index_specifications, outputs=[out])
70
 
71
+ demo.queue().launch()
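The reworked app no longer asks for Git credentials in the UI; the indexer scripts authenticate with an HF_TOKEN read through python-dotenv. A hypothetical .env for running locally (placeholder value; on a Space this would normally be a repository secret exposed as an environment variable):

# .env (never commit a real token)
HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxx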
bm25_maker.py CHANGED
@@ -1,59 +1,67 @@
1
- import shutil
2
- import zipfile
3
- import json
4
  import bm25s
5
 
6
- import nltk
7
- from nltk.stem import WordNetLemmatizer
8
 
9
- nltk.download("wordnet")
10
- lemmatizer = WordNetLemmatizer()
11
- indexer_id = "3gpp_bm25_docs"
12
- unique_specs = set()
13
 
14
- with open("indexed_specifications.json", "r") as f:
15
- spec_data = json.load(f)["specs"]
16
- with zipfile.ZipFile(open("indexed_docs_content.zip", "rb")) as zf:
17
- for file_name in zf.namelist():
18
- if file_name.endswith(".json"):
19
- doc_bytes = zf.read(file_name)
20
- try:
21
- doc_data = json.loads(doc_bytes.decode("utf-8"))
22
- print("Documents loaded successfully !")
23
- except json.JSONDecodeError as e:
24
- print(f"Error while decoding the JSON file {file_name}: {e}")
25
 
26
  corpus_json = []
27
 
28
- for _, specification in spec_data.items():
29
- full_text = f"{specification['id']} - {specification['title']}\n\n\n"
30
- if specification['id'] in unique_specs:
31
- continue
32
- document = doc_data.get(specification['id'], None)
33
- if document is None: continue
34
- if not isinstance(document, str):
35
- full_text += "\n".join([f"{title}\n\n{document[title]}" for title in document.keys()])
36
- corpus_json.append({"text": lemmatizer.lemmatize(full_text), "metadata": {
37
- "id": specification['id'],
38
- "title": specification['title'],
39
- "version": specification['version'],
40
- "release": specification['release'],
41
- "type": specification['type'],
42
- "working_group": specification['working_group'],
43
- "url": specification['url'],
44
- "scope": specification['scope']
45
- }})
46
- unique_specs.add(specification['id'])
47
- else:
48
- print(f"Skipping {specification['id']}")
49
- unique_specs.add(specification['id'])
50
 
51
  corpus_text = [doc["text"] for doc in corpus_json]
52
  corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
53
 
54
- retriever = bm25s.BM25(corpus=corpus_json)
55
  retriever.index(corpus_tokens)
56
 
57
- retriever.save(indexer_id)
58
-
59
- shutil.make_archive("bm25s", 'zip', '.', indexer_id)
 
1
+ import os, warnings
2
+ os.environ["CURL_CA_BUNDLE"] = ''
3
+ from dotenv import load_dotenv
4
+ warnings.filterwarnings("ignore")
5
+ load_dotenv()
6
  import bm25s
7
+ from bm25s.hf import BM25HF
8
+ from datasets import load_dataset
9
+ unique_specs = set()
10
 
11
+ dataset_text = load_dataset("OrganizedProgrammers/3GPPSpecContent", token=os.environ.get("HF_TOKEN"))
12
+ dataset_metadata = load_dataset("OrganizedProgrammers/3GPPSpecMetadata", token=os.environ.get("HF_TOKEN"))
13
 
14
+ dataset_text = dataset_text["train"].to_list()
15
+ dataset_metadata = dataset_metadata["train"].to_list()
16
+
17
+ corpus_json = []
18
 
19
+ def get_document(spec_id: str, spec_title: str):
20
+ text = [f"{spec_id} - {spec_title}\n"]
21
+ for section in dataset_text:
22
+ if spec_id == section["doc_id"]:
23
+ text.extend([f"{section['section']}\n\n{section['content']}"])
24
+ return text
25
 
26
+ for specification in dataset_metadata:
27
+ if specification['id'] in unique_specs: continue
28
+ for section in dataset_text:
29
+ if specification['id'] == section['doc_id']:
30
+ corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
31
+ "id": specification['id'],
32
+ "title": specification['title'],
33
+ "section_title": section['section'],
34
+ "version": specification['version'],
35
+ "type": specification['type'],
36
+ "working_group": specification['working_group'],
37
+ "url": specification['url'],
38
+ "scope": specification['scope']
39
+ }})
40
+ unique_specs.add(specification['id'])
41
+
42
+ corpus_text = [doc["text"] for doc in corpus_json]
43
+ corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
44
+
45
+ retriever = BM25HF(corpus=corpus_json)
46
+ retriever.index(corpus_tokens)
47
+
48
+ retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSections", token=os.environ.get("HF_TOKEN"))
49
+
50
+ unique_specs = set()
51
  corpus_json = []
52
 
53
+ for specification in dataset_metadata:
54
+ if specification['id'] in unique_specs: continue
55
+ text_list = get_document(specification['id'], specification['title'])
56
+ text = "\n".join(text_list)
57
+ if len(text_list) == 1: continue
58
+ corpus_json.append({"text": text, "metadata": specification})
59
+ unique_specs.add(specification['id'])
60
 
61
  corpus_text = [doc["text"] for doc in corpus_json]
62
  corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
63
 
64
+ retriever = BM25HF(corpus=corpus_json)
65
  retriever.index(corpus_tokens)
66
 
67
+ retriever.save_to_hub("OrganizedProgrammers/3GPPBM25IndexSingle", token=os.environ.get("HF_TOKEN"))
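A hedged sketch of querying the two indexes this script pushes, using the bm25s Hub helper (repo IDs from above; the load_from_hub arguments and the query string are assumptions, not part of the commit):

import os
import bm25s
from bm25s.hf import BM25HF

token = os.environ.get("HF_TOKEN")
# Per-section index; 3GPPBM25IndexSingle works the same way with one entry per specification
retriever = BM25HF.load_from_hub("OrganizedProgrammers/3GPPBM25IndexSections", token=token, load_corpus=True)
query_tokens = bm25s.tokenize("NAS security mode command procedure", stopwords="en")
results, scores = retriever.retrieve(query_tokens, k=5)
for doc, score in zip(results[0], scores[0]):
    print(round(float(score), 2), doc["metadata"]["id"], doc["metadata"].get("section_title", ""))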
requirements.txt CHANGED
@@ -1,9 +1,8 @@
1
- gradio
2
  requests
3
  beautifulsoup4
4
- gitpython
5
- huggingface_hub
6
- lxml
7
- scikit-learn
8
- bm25s[full]
9
- nltk
1
  requests
2
  beautifulsoup4
3
+ datasets
4
+ pandas
5
+ numpy
6
+ python-dotenv
7
+ gradio
8
+ bm25s[full]
spec_doc_indexer_multi.py → scripts/old/spec_doc_indexer_multi.py RENAMED
File without changes
spec_indexer_multi.py → scripts/old/spec_indexer_multi.py RENAMED
File without changes
spec_indexer.py ADDED
@@ -0,0 +1,301 @@
1
+ import os
2
+ import time
3
+ import warnings
4
+ from dotenv import load_dotenv
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ warnings.filterwarnings("ignore")
9
+ os.environ["CURL_CA_BUNDLE"] = ""
10
+ load_dotenv()
11
+
12
+ from bm25s.hf import BM25HF
13
+ from datasets import load_dataset, Dataset
14
+ import bm25s
15
+
16
+ import threading
17
+ import zipfile
18
+ import sys
19
+ import subprocess
20
+ import requests
21
+ import re
22
+ import traceback
23
+ import io
24
+ import concurrent.futures
25
+ import hashlib
26
+
27
+ CHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
28
+ DICT_LOCK = threading.Lock()
29
+ DOCUMENT_LOCK = threading.Lock()
30
+ STOP_EVENT = threading.Event()
31
+
32
+ spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent", token=os.environ["HF_TOKEN"])
33
+ spec_contents = spec_contents["train"].to_list()
34
+ documents_by_spec_num = {}
35
+ for section in spec_contents:
36
+ if section["doc_id"] not in documents_by_spec_num.keys():
37
+ documents_by_spec_num[section["doc_id"]] = {"content": {section["section"]: section["content"]}, "hash": section["hash"]}
38
+ else:
39
+ documents_by_spec_num[section["doc_id"]]["content"][section["section"]] = section["content"]
40
+
41
+ indexed_specifications = {}
42
+ specifications_passed = set()
43
+ processed_count = 0
44
+ total_count = 0
45
+
46
+ def get_text(specification: str, version: str, second: bool = False):
47
+ """Récupère les bytes du PDF à partir d'une spécification et d'une version."""
48
+ if STOP_EVENT.is_set():
49
+ return []
50
+ doc_id = specification
51
+ series = doc_id.split(".")[0]
52
+ content = []
53
+
54
+ print(f"\n[INFO] Tentative de récupération de la spécification {doc_id} version {version}", flush=True)
55
+ response = requests.get(
56
+ f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version}.zip",
57
+ verify=False,
58
+ headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
59
+ )
60
+
61
+ if response.status_code != 200:
62
+ print(f"\n[ERREUR] Echec du téléchargement du ZIP pour {specification}-{version}. Tentative avec dernière version disponible", flush=True)
63
+ last_possible_version = requests.post('https://organizedprogrammers-3gppdocfinder.hf.space/find', verify=False, headers={"Content-Type": "application/json"}, json={"doc_id": specification})
64
+ if last_possible_version.status_code != 200:
65
+ print(f"\n[ERREUR] Echec du 2e téléchargement du ZIP pour {specification}-{version}. {last_possible_version.status_code}", flush=True)
66
+ return []
67
+ data = last_possible_version.json()
68
+ return get_text(specification, data['version'], True)
69
+
70
+ zip_bytes = io.BytesIO(response.content)
71
+ zip_file = zipfile.ZipFile(zip_bytes)
72
+
73
+ def extract_text(zipfile: zipfile.ZipFile, filename: str):
74
+ if (filename.endswith(".doc") or filename.endswith(".docx")) and ("cover" not in filename.lower() and "annex" not in filename.lower()):
75
+ doc_bytes = zipfile.read(filename)
76
+ input_path = f"/tmp/{filename}"
77
+ output_path = "/tmp"
78
+ changed_ext_filename = re.sub(r".docx?$", ".txt", filename)
79
+ output_file = f"/tmp/{changed_ext_filename}"
80
+
81
+ with open(input_path, "wb") as f:
82
+ f.write(doc_bytes)
83
+ try:
84
+ print(f"\n[INFO] Tentative de conversion DOC/DOCX -> TXT", flush=True)
85
+ try:
86
+ subprocess.run(
87
+ ["libreoffice", "--headless", "--convert-to", "txt:Text", "--outdir", output_path, input_path],
88
+ check=True,
89
+ capture_output=True
90
+ )
91
+ except subprocess.CalledProcessError as e:
92
+ print(f"\n[ERREUR] LibreOffice a échoué : {e}", flush=True)
93
+ return []
94
+ if os.path.exists(output_file):
95
+ with open(output_file, "r", encoding="utf-8") as f:
96
+ return [line.strip() for line in f if line.strip()]
97
+ finally:
98
+ if os.path.exists(input_path):
99
+ os.remove(input_path)
100
+ if os.path.exists(output_file):
101
+ os.remove(output_file)
102
+ return []
103
+
104
+ for fileinfo in zip_file.infolist():
105
+ if STOP_EVENT.is_set():
106
+ return []
107
+ if fileinfo.filename.endswith(".zip") and len(zip_file.namelist()) == 1:
108
+ nested_zip_bytes = io.BytesIO(zip_file.read(fileinfo.filename))
109
+ zip_file = zipfile.ZipFile(nested_zip_bytes)
110
+ break
111
+
112
+ for filename in zip_file.namelist():
113
+ if STOP_EVENT.is_set():
114
+ return []
115
+ content.extend(extract_text(zip_file, filename))
116
+
117
+ if content:
118
+ print("\n[INFO] Conversion terminé", flush=True)
119
+ else:
120
+ print(f"\n[ERREUR] Pas réussi", flush=True)
121
+ return content
122
+
123
+ def get_spec_content(specification: str, version:str):
124
+ if STOP_EVENT.is_set():
125
+ return {}
126
+ print("\n[INFO] Tentative de récupération du texte", flush=True)
127
+ text = get_text(specification, version)
128
+ if not text or STOP_EVENT.is_set():
129
+ return {}
130
+ print(f"\n[INFO] Texte {specification}-{version} récupéré", flush=True)
131
+ chapters = []
132
+ chapter_regex = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+[^\.]$") # 3.5.2.1 Introduction
133
+
134
+ for i, line in enumerate(text):
135
+ if STOP_EVENT.is_set():
136
+ return {}
137
+ if chapter_regex.fullmatch(line):
138
+ chapters.append((i, line))
139
+ document = {}
140
+ for i in range(len(chapters)):
141
+ if STOP_EVENT.is_set():
142
+ return {}
143
+ start_index, chapter_title = chapters[i]
144
+ end_index = chapters[i+1][0] if i + 1 < len(chapters) else len(text)
145
+ content_lines = text[start_index + 1:end_index]
146
+ document[chapter_title.replace("\t", " ")] = "\n".join(content_lines)
147
+ print(f"\n[INFO] Document fini", flush=True)
148
+ return document
149
+
150
+ def version_to_code(version_str):
151
+ parts = version_str.split('.')
152
+ if len(parts) != 3: return None
153
+ try:
154
+ x, y, z = [int(p) for p in parts]
155
+ except ValueError:
156
+ return None
157
+ if x < 36 and y < 36 and z < 36:
158
+ return f"{CHARS[x]}{CHARS[y]}{CHARS[z]}"
159
+ else:
160
+ return f"{str(x).zfill(2)}{str(y).zfill(2)}{str(z).zfill(2)}"
161
+
162
+ def hasher(specification: str, version_code: str):
163
+ return hashlib.md5(f"{specification}{version_code}".encode()).hexdigest()
164
+
165
+ def get_scope(content):
166
+ for title, text in content.items():
167
+ if title.lower().endswith("scope"):
168
+ return text
169
+ return ""
170
+
171
+ def process_specification(spec):
172
+ global processed_count, indexed_specifications, documents_by_spec_num
173
+ if STOP_EVENT.is_set():
174
+ return
175
+ try:
176
+ if not spec.get('vers'): return
177
+ doc_id = str(spec['spec_num'])
178
+ document = None
179
+ version_code = version_to_code(str(spec['vers']))
180
+ if not version_code: return
181
+ with DOCUMENT_LOCK:
182
+ if doc_id in documents_by_spec_num and documents_by_spec_num[doc_id]["hash"] == hasher(doc_id, version_code) and not doc_id in specifications_passed:
183
+ document = documents_by_spec_num[doc_id]
184
+ specifications_passed.add(doc_id)
185
+ print(f"\n[INFO] Document déjà présent pour {doc_id} (version {spec['vers']})", flush=True)
186
+ elif doc_id in specifications_passed:
187
+ print(f"\n[INFO] Document déjà présent pour {doc_id} [dernière version présent]")
188
+ else:
189
+ print(f"\n[INFO] Tentative de récupération du document {doc_id} (version {spec['vers']})", flush=True)
190
+ document = get_spec_content(doc_id, version_code)
191
+ if document:
192
+ documents_by_spec_num[doc_id] = {"content": document, "hash": hasher(doc_id, version_code)}
193
+ specifications_passed.add(doc_id)
194
+ print(f"\n[INFO] Document extrait pour {doc_id} (version {spec['vers']})", flush=True)
195
+ series = doc_id.split(".")[0]
196
+ url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{doc_id}/{doc_id.replace('.', '')}-{version_code}.zip"
197
+ string_key = f"{spec['spec_num']}+-+{spec['title']}+-+{spec['type']}+-+{spec['vers']}+-+{spec['WG']}"
198
+ metadata = {
199
+ "id": doc_id,
200
+ "title": spec["title"],
201
+ "type": spec["type"],
202
+ "version": str(spec["vers"]),
203
+ "working_group": spec["WG"],
204
+ "url": url,
205
+ "scope": "" if not document else get_scope(document["content"])
206
+ }
207
+ with DICT_LOCK:
208
+ indexed_specifications[string_key] = metadata
209
+ processed_count += 1
210
+ sys.stdout.write(f"\rTraitement: {processed_count}/{total_count} spécifications...")
211
+ sys.stdout.flush()
212
+ except Exception as e:
213
+ traceback.print_exception(e)
214
+ print(f"\n[ERREUR] Échec du traitement de {spec.get('spec_num', 'inconnu')} v{spec.get('vers')}: {e}", flush=True)
215
+
216
+ def sauvegarder(indexed_specifications, documents_by_spec_num):
217
+ print("\nSauvegarde en cours...", flush=True)
218
+
219
+ flat_metadata = [metadata for _, metadata in indexed_specifications.items()]
220
+ flat_docs = []
221
+ for doc_id, data in documents_by_spec_num.items():
222
+ for title, content in data["content"].items():
223
+ flat_docs.append({"hash": data["hash"], "doc_id": doc_id, "section": title, "content": content})
224
+
225
+ push_spec_content = Dataset.from_list(flat_docs)
226
+ push_spec_metadata = Dataset.from_list(flat_metadata)
227
+ push_spec_content.push_to_hub("OrganizedProgrammers/3GPPSpecContent", token=os.environ["HF_TOKEN"])
228
+ push_spec_metadata.push_to_hub("OrganizedProgrammers/3GPPSpecMetadata", token=os.environ["HF_TOKEN"])
229
+ print("Sauvegarde terminée.", flush=True)
230
+
231
+ def main():
232
+ global total_count
233
+ start_time = time.time()
234
+
235
+ # Récupération des spécifications depuis le site 3GPP
236
+ print("Récupération des spécifications depuis 3GPP...")
237
+ response = requests.get(
238
+ f'https://www.3gpp.org/dynareport?code=status-report.htm',
239
+ headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'},
240
+ verify=False
241
+ )
242
+
243
+ # Analyse des tableaux HTML
244
+ dfs = pd.read_html(
245
+ io.StringIO(response.text),
246
+ storage_options={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'},
247
+ encoding="utf-8"
248
+ )
249
+
250
+ for x in range(len(dfs)):
251
+ dfs[x] = dfs[x].replace({np.nan: None})
252
+
253
+ # Extraction des colonnes nécessaires
254
+ columns_needed = [0, 1, 2, 3, 4]
255
+ extracted_dfs = [df.iloc[:, columns_needed] for df in dfs]
256
+ columns = [x.replace("\xa0", "_") for x in extracted_dfs[0].columns]
257
+
258
+ # Préparation des spécifications
259
+ specifications = []
260
+ for df in extracted_dfs:
261
+ for index, row in df.iterrows():
262
+ doc = row.to_list()
263
+ doc_dict = dict(zip(columns, doc))
264
+ specifications.append(doc_dict)
265
+
266
+ total_count = len(specifications)
267
+ print(f"Traitement de {total_count} spécifications avec multithreading...")
268
+ if os.path.exists("indexed_docs_content.zip"):
269
+ print(f"Chargement de {len(documents_by_spec_num)} documents depuis le cache.")
270
+ try:
271
+ with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
272
+ futures = [executor.submit(process_specification, spec) for spec in specifications]
273
+ while True:
274
+ if all(f.done() for f in futures):
275
+ break
276
+ if STOP_EVENT.is_set():
277
+ break
278
+ time.sleep(0.35)
279
+ except Exception as e:
280
+ print(f"\nErreur inattendue dans le ThreadPool : {e}", flush=True)
281
+ print("\nSauvegarde des résultats...", flush=True)
282
+ sauvegarder(indexed_specifications, documents_by_spec_num)
283
+ elapsed_time = time.time() - start_time
284
+ print(f"\nTraitement terminé en {elapsed_time:.2f} secondes.", flush=True)
285
+ print(f"Métadonnées poussées vers le dataset 'OrganizedProgrammers/3GPPSpecMetadata'.", flush=True)
286
+ print(f"Contenu des documents poussé vers le dataset 'OrganizedProgrammers/3GPPSpecContent'.", flush=True)
287
+
288
+ if __name__ == "__main__":
289
+ try:
290
+ main()
291
+ except KeyboardInterrupt:
292
+ print("\nInterruption détectée (Ctrl+C). Arrêt des tâches en cours...", flush=True)
293
+ STOP_EVENT.set()
294
+ time.sleep(2)
295
+ sauvegarder(indexed_specifications, documents_by_spec_num)
296
+ print("Arrêt propre du script.", flush=True)
297
+ sys.exit(0)
298
+ except Exception as e:
299
+ print(f"\nErreur inattendue : {e}", flush=True)
300
+ sauvegarder(indexed_specifications, documents_by_spec_num)
301
+ sys.exit(1)
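As a quick sanity check of the helpers above: version_to_code() maps each dotted component to a single base-36 character when all three parts are below 36 (zero-padded decimal otherwise), and that code is what builds both the archive URL and the cache hash. A small worked example (spec number and version are illustrative):

CHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
# version_to_code("18.4.0") -> CHARS[18] + CHARS[4] + CHARS[0] == "i40"
# version_to_code("17.40.1") -> a part >= 36 falls back to zero-padding == "174001"
spec, code = "24.301", "i40"
series = spec.split(".")[0]
url = f"https://www.3gpp.org/ftp/Specs/archive/{series}_series/{spec}/{spec.replace('.', '')}-{code}.zip"
# -> https://www.3gpp.org/ftp/Specs/archive/24_series/24.301/24301-i40.zip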
indexer_multi.py → tdoc_indexer.py RENAMED
@@ -1,6 +1,7 @@
1
  from datetime import datetime
2
  import requests
3
  from bs4 import BeautifulSoup
 
4
  import json
5
  import os
6
  import time
@@ -14,9 +15,9 @@ warnings.filterwarnings("ignore")
14
 
15
  class TsgDocIndexer:
16
  def __init__(self, max_workers=10):
17
- self.main_ftp_url = "https://www.3gpp.org/ftp"
18
- self.indexer_file = "indexed_docs.json"
19
- self.indexer, self.latest_date = self.load_indexer()
20
  self.valid_doc_pattern = re.compile(r'^(S[1-6P]|C[1-6P]|R[1-6P])-\d+', flags=re.IGNORECASE)
21
  self.max_workers = max_workers
22
 
@@ -31,19 +32,21 @@ class TsgDocIndexer:
31
 
32
  def load_indexer(self):
33
  """Load existing index if available"""
34
- if os.path.exists(self.indexer_file):
35
- with open(self.indexer_file, "r", encoding="utf-8") as f:
36
- x = json.load(f)
37
- return x["docs"], x["last_indexed_date"]
38
- return {}, None
 
39
 
40
  def save_indexer(self):
41
  """Save the updated index"""
42
- with open(self.indexer_file, "w", encoding="utf-8") as f:
43
- today = datetime.today()
44
- self.latest_date = today.strftime("%d/%m/%Y-%H:%M:%S")
45
- output = {"docs": self.indexer, "last_indexed_date": self.latest_date}
46
- json.dump(output, f, indent=4, ensure_ascii=False)
 
47
 
48
  def get_docs_from_url(self, url):
49
  """Récupérer la liste des documents/répertoires depuis une URL"""
@@ -197,10 +200,6 @@ class TsgDocIndexer:
197
 
198
  # Attendre que toutes les tâches soient terminées
199
  concurrent.futures.wait(futures)
200
-
201
- # Sauvegarder après chaque groupe de travail
202
- with self.indexer_lock:
203
- self.save_indexer()
204
 
205
  def index_all_tdocs(self):
206
  """Indexer tous les documents ZIP dans la structure FTP 3GPP avec multithreading"""
@@ -254,10 +253,6 @@ class TsgDocIndexer:
254
  for meeting in meeting_folders if meeting not in ['./', '../']]
255
  concurrent.futures.wait(futures)
256
 
257
- # Sauvegarder après chaque groupe de travail
258
- with self.indexer_lock:
259
- self.save_indexer()
260
-
261
  docs_count_after = len(self.indexer)
262
  new_docs_count = docs_count_after - docs_count_before
263
 
 
1
  from datetime import datetime
2
  import requests
3
  from bs4 import BeautifulSoup
4
+ from datasets import load_dataset, Dataset
5
  import json
6
  import os
7
  import time
 
15
 
16
  class TsgDocIndexer:
17
  def __init__(self, max_workers=10):
18
+ self.indexer = self.load_indexer()
19
+ self.main_ftp_url = "https://3gpp.org/ftp"
20
+ self.dataset = load_dataset("OrganizedProgrammers/3GPPTDocLocation")
21
  self.valid_doc_pattern = re.compile(r'^(S[1-6P]|C[1-6P]|R[1-6P])-\d+', flags=re.IGNORECASE)
22
  self.max_workers = max_workers
23
 
 
32
 
33
  def load_indexer(self):
34
  """Load existing index if available"""
35
+ all_docs = {}
36
+ tdoc_locations = load_dataset("OrganizedProgrammers/3GPPTDocLocation", token=os.environ["HF_TOKEN"])
37
+ tdoc_locations = tdoc_locations["train"].to_list()
38
+ for doc in tdoc_locations:
39
+ all_docs[doc["doc_id"]] = doc["url"]
40
+ return all_docs
41
 
42
  def save_indexer(self):
43
  """Save the updated index"""
44
+ data = []
45
+ for doc_id, url in self.indexer.items():
46
+ data.append({"doc_id": doc_id, "url": url})
47
+
48
+ dataset = Dataset.from_list(data)
49
+ dataset.push_to_hub("OrganizedProgrammers/3GPPTDocLocation", token=os.environ["HF_TOKEN"])
50
 
51
  def get_docs_from_url(self, url):
52
  """Récupérer la liste des documents/répertoires depuis une URL"""
 
200
 
201
  # Attendre que toutes les tâches soient terminées
202
  concurrent.futures.wait(futures)
203
 
204
  def index_all_tdocs(self):
205
  """Indexer tous les documents ZIP dans la structure FTP 3GPP avec multithreading"""
 
253
  for meeting in meeting_folders if meeting not in ['./', '../']]
254
  concurrent.futures.wait(futures)
255
 
256
  docs_count_after = len(self.indexer)
257
  new_docs_count = docs_count_after - docs_count_before
258