from typing import Optional

import os
import warnings

# Clearing CURL_CA_BUNDLE effectively disables SSL certificate verification for
# requests-based downloads (a common workaround behind intercepting proxies).
os.environ["CURL_CA_BUNDLE"] = ""

from dotenv import load_dotenv
warnings.filterwarnings("ignore")
load_dotenv()

import bm25s
from bm25s.hf import BM25HF
from datasets import load_dataset
unique_specs = set()

# Load the ETSI specification content and metadata datasets from the Hugging Face Hub.
dataset_text = load_dataset("OrganizedProgrammers/ETSISpecContent", token=os.environ.get("HF_TOKEN"))
dataset_metadata = load_dataset("OrganizedProgrammers/ETSISpecMetadata", token=os.environ.get("HF_TOKEN"))

# Work with plain lists of rows rather than Dataset objects.
dataset_text = dataset_text["train"].to_list()
dataset_metadata = dataset_metadata["train"].to_list()
corpus_json = []

def get_document(spec_id: str, spec_title: Optional[str]) -> list:
    """Return a title line followed by every section of the given specification."""
    text = [f"{spec_id} - {spec_title}\n" if spec_title else f"{spec_id}\n"]
    for section in dataset_text:
        if spec_id == section["doc_id"]:
            text.append(f"{section['section']}\n\n{section['content']}")
    return text
# Section-level corpus: one BM25 document per specification section.
for specification in dataset_metadata:
    if specification['id'] in unique_specs:
        continue
    for section in dataset_text:
        if specification['id'] == section['doc_id']:
            corpus_json.append({
                "text": f"{section['section']}\n{section['content']}",
                "metadata": {
                    "id": specification['id'],
                    "title": specification['title'],
                    "section_title": section['section'],
                    "version": specification['version'],
                    "type": specification['type'],
                    "url": specification['url'],
                    "scope": specification['scope'],
                },
            })
    unique_specs.add(specification['id'])
# Tokenize, index, and push the section-level BM25 index to the Hub.
corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
retriever = BM25HF(corpus=corpus_json)
retriever.index(corpus_tokens)
retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSections", token=os.environ.get("HF_TOKEN"))
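
# Optional sanity check on the freshly built section-level index; the query string is
# only an illustrative example, and this block can be removed without affecting the build.
sample_results, sample_scores = retriever.retrieve(
    bm25s.tokenize("security architecture", stopwords="en"), k=3
)
print(sample_results, sample_scores)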
# Document-level corpus: one BM25 document per full specification.
unique_specs = set()
corpus_json = []

for specification in dataset_metadata:
    if specification['id'] in unique_specs:
        continue
    text_list = get_document(specification['id'], specification['title'])
    text = "\n".join(text_list)
    # Only the title line came back, i.e. the specification has no text content: skip it.
    if len(text_list) == 1:
        continue
    corpus_json.append({"text": text, "metadata": specification})
    unique_specs.add(specification['id'])
# Tokenize, index, and push the document-level BM25 index to the Hub.
corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
retriever = BM25HF(corpus=corpus_json)
retriever.index(corpus_tokens)
retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSingle", token=os.environ.get("HF_TOKEN"))
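
# Usage sketch: the pushed indexes can be loaded back from the Hub and queried with
# bm25s. The query string is only an illustrative example, and this assumes the
# repositories are public or that a Hugging Face token is already configured in the
# environment; note that load_from_hub re-downloads the index just pushed above.
loaded = BM25HF.load_from_hub("OrganizedProgrammers/ETSIBM25IndexSingle", load_corpus=True)
docs, scores = loaded.retrieve(bm25s.tokenize("lawful interception", stopwords="en"), k=5)
print(docs, scores)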