"""Build two BM25 indexes over 3GPP specifications and push them to the HF Hub:

1. ...BM25IndexSections — one BM25 document per spec *section*.
2. ...BM25IndexSingle   — one BM25 document per whole *specification*.
"""
import os
import warnings
from collections import defaultdict

import requests

# WARNING(security): TLS certificate verification is disabled globally below
# (empty CURL_CA_BUNDLE + verify=False session). This is presumably a
# corporate-proxy workaround, but it exposes all traffic to MITM — confirm it
# is really required in the deployment environment.
os.environ["CURL_CA_BUNDLE"] = ''

from dotenv import load_dotenv
from huggingface_hub import configure_http_backend


def backend_factory() -> requests.Session:
    """Return a requests Session with TLS verification disabled (see warning above)."""
    session = requests.Session()
    session.verify = False
    return session


configure_http_backend(backend_factory=backend_factory)
warnings.filterwarnings("ignore")
load_dotenv()  # brings HF_TOKEN into os.environ, used by save_to_hub below

import bm25s
from bm25s.hf import BM25HF
from datasets import load_dataset

# Materialize both datasets as plain lists of dicts.
dataset_text = load_dataset("OrganizedProgrammers/3GPPSpecContent")["train"].to_list()
dataset_metadata = load_dataset("OrganizedProgrammers/3GPPSpecMetadata")["train"].to_list()

# Group sections by doc_id once (O(sections)) instead of re-scanning the whole
# section list for every specification (was O(specs x sections)).
sections_by_doc = defaultdict(list)
for section in dataset_text:
    sections_by_doc[section["doc_id"]].append(section)


def get_document(spec_id: str, spec_title: str):
    """Return the text chunks of one spec: a title line followed by one
    section-plus-content entry per section whose doc_id matches spec_id."""
    text = [f"{spec_id} - {spec_title}\n"]
    for section in sections_by_doc.get(spec_id, []):
        text.append(f"{section['section']}\n\n{section['content']}")
    return text


def _build_and_push(corpus_json, repo_id):
    """Tokenize the corpus, build a BM25 index over it and push it to repo_id."""
    corpus_text = [doc["text"] for doc in corpus_json]
    corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
    retriever = BM25HF(corpus=corpus_json)
    retriever.index(corpus_tokens)
    retriever.save_to_hub(repo_id, token=os.environ.get("HF_TOKEN"))


# --- Corpus 1: one document per spec section -------------------------------
unique_specs = set()
corpus_json = []
for specification in dataset_metadata:
    if specification['id'] in unique_specs:
        continue  # metadata may list the same spec id more than once
    for section in sections_by_doc.get(specification['id'], []):
        corpus_json.append({"text": f"{section['section']}\n{section['content']}", "metadata": {
            "id": specification['id'],
            "title": specification['title'],
            "section_title": section['section'],
            "version": specification['version'],
            "type": specification['type'],
            "working_group": specification['working_group'],
            "url": specification['url'],
            "scope": specification['scope']
        }})
    unique_specs.add(specification['id'])

_build_and_push(corpus_json, "OrganizedProgrammers/3GPPBM25IndexSections")

# --- Corpus 2: one document per whole specification ------------------------
unique_specs = set()
corpus_json = []
for specification in dataset_metadata:
    if specification['id'] in unique_specs:
        continue
    text_list = get_document(specification['id'], specification['title'])
    # Only the title line came back -> the spec has no section text; skip it.
    # (Check before joining: the original joined first and threw the work away.)
    if len(text_list) == 1:
        continue
    corpus_json.append({"text": "\n".join(text_list), "metadata": specification})
    unique_specs.add(specification['id'])

_build_and_push(corpus_json, "OrganizedProgrammers/3GPPBM25IndexSingle")