import os
import warnings
from typing import Optional

import requests

# Disable TLS certificate verification for all Hub traffic (workaround for
# proxied environments) and silence the resulting InsecureRequestWarning noise.
os.environ["CURL_CA_BUNDLE"] = ""

from huggingface_hub import configure_http_backend

def backend_factory() -> requests.Session:
    session = requests.Session()
    session.verify = False
    return session

configure_http_backend(backend_factory=backend_factory)

from dotenv import load_dotenv

warnings.filterwarnings("ignore")
load_dotenv()
import bm25s
from bm25s.hf import BM25HF
from datasets import load_dataset

# Section-level text and per-specification metadata of the ETSI corpus.
dataset_text = load_dataset("OrganizedProgrammers/ETSISpecContent")["train"].to_list()
dataset_metadata = load_dataset("OrganizedProgrammers/ETSISpecMetadata")["train"].to_list()

unique_specs = set()
corpus_json = []
def get_document(spec_id: str, spec_title: Optional[str]) -> list[str]:
    """Return a header line followed by every section of one specification."""
    text = [f"{spec_id} - {spec_title}\n" if spec_title else f"{spec_id}\n"]
    for section in dataset_text:
        if spec_id == section["doc_id"]:
            text.append(f"{section['section']}\n\n{section['content']}")
    return text
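# Hypothetical illustration (the spec ID and titles are made up): for a spec
# with two matching sections, get_document returns something like
#   ["TS 100 001 - Example title\n", "1 Scope\n\n...", "2 References\n\n..."]
# so a length-1 result means no section text exists for that spec (see the
# document-level loop below).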
# Section-level corpus: one BM25 document per specification section.
for specification in dataset_metadata:
    if specification["id"] in unique_specs:
        continue
    for section in dataset_text:
        if specification["id"] == section["doc_id"]:
            corpus_json.append({
                "text": f"{section['section']}\n{section['content']}",
                "metadata": {
                    "id": specification["id"],
                    "title": specification["title"],
                    "section_title": section["section"],
                    "version": specification["version"],
                    "type": specification["type"],
                    "url": specification["url"],
                },
            })
    unique_specs.add(specification["id"])
# Tokenise, index, and publish the section-level index.
corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
retriever = BM25HF(corpus=corpus_json)
retriever.index(corpus_tokens)
retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSections", token=os.environ.get("HF_TOKEN"))
# Document-level corpus: one BM25 document per whole specification.
unique_specs = set()
corpus_json = []
for specification in dataset_metadata:
    if specification["id"] in unique_specs:
        continue
    text_list = get_document(specification["id"], specification["title"])
    if len(text_list) == 1:  # header only: no section content for this spec
        continue
    corpus_json.append({"text": "\n".join(text_list), "metadata": specification})
    unique_specs.add(specification["id"])

# Tokenise, index, and publish the document-level index.
corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
retriever = BM25HF(corpus=corpus_json)
retriever.index(corpus_tokens)
retriever.save_to_hub("OrganizedProgrammers/ETSIBM25IndexSingle", token=os.environ.get("HF_TOKEN"))
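
# Usage sketch (not part of the build script): a minimal example of loading
# one of the published indexes back and querying it. load_from_hub and
# retrieve are standard bm25s APIs; the query string and k are illustrative.
#
# from bm25s.hf import BM25HF
# import bm25s
#
# retriever = BM25HF.load_from_hub(
#     "OrganizedProgrammers/ETSIBM25IndexSections", load_corpus=True
# )
# query_tokens = bm25s.tokenize("network slicing requirements", stopwords="en")
# results, scores = retriever.retrieve(query_tokens, k=5)
# for doc, score in zip(results[0], scores[0]):
#     print(doc["metadata"]["id"], doc["metadata"]["section_title"], score)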