# "Spaces: Sleeping Sleeping" — Hugging Face Space status banner captured by the
# page scrape; not part of the program.
"""Build and archive a BM25 index over 3GPP specification documents.

Reads spec metadata from ``indexed_specifications.json`` and per-spec document
content from ``indexed_docs_content.zip`` (JSON members), assembles one text
blob per unique spec, indexes the corpus with ``bm25s``, saves the index under
``3gpp_bm25_docs`` and packages it as ``bm25s.zip``.
"""
import shutil
import zipfile
import json

import bm25s
import nltk
from nltk.stem import WordNetLemmatizer

# WordNet corpus is required by WordNetLemmatizer.
nltk.download("wordnet")

lemmatizer = WordNetLemmatizer()
indexer_id = "3gpp_bm25_docs"
unique_specs = set()

# spec_data: mapping of spec keys -> spec metadata dicts (id, title, version, ...).
with open("indexed_specifications.json", "r") as f:
    spec_data = json.load(f)["specs"]

# Merge every JSON member of the archive into a single doc_data mapping.
# BUGFIX: the original rebound doc_data on each member, keeping only the last
# JSON file's contents, and left doc_data undefined (NameError below) when the
# zip contained no .json member at all.
doc_data = {}
with zipfile.ZipFile("indexed_docs_content.zip") as zf:  # ZipFile owns/closes the file
    for file_name in zf.namelist():
        if not file_name.endswith(".json"):
            continue
        doc_bytes = zf.read(file_name)
        try:
            doc_data.update(json.loads(doc_bytes.decode("utf-8")))
            print("Documents loaded successfully !")
        except json.JSONDecodeError as e:
            print(f"Error while decoding the JSON file {file_name}: {e}")

corpus_json = []
for _, specification in spec_data.items():
    # Guard clauses: skip duplicates, specs without content, and string-only
    # content (a bare string marks a doc whose sections were not extracted).
    if specification['id'] in unique_specs:
        continue
    document = doc_data.get(specification['id'], None)
    if document is None:
        continue
    if isinstance(document, str):
        print(f"Skipping {specification['id']}")
        unique_specs.add(specification['id'])
        continue

    # "<id> - <title>" header followed by each section title and its body.
    full_text = f"{specification['id']} - {specification['title']}\n\n\n"
    full_text += "\n".join(f"{title}\n\n{document[title]}" for title in document.keys())

    # NOTE(review): WordNetLemmatizer.lemmatize expects a single word; applied
    # to a whole document it is effectively a no-op. Kept unchanged so the
    # produced index matches previous builds — consider per-token
    # lemmatization if lemmatized search terms are actually wanted.
    corpus_json.append({"text": lemmatizer.lemmatize(full_text), "metadata": {
        "id": specification['id'],
        "title": specification['title'],
        "version": specification['version'],
        "release": specification['release'],
        "type": specification['type'],
        "working_group": specification['working_group'],
        "url": specification['url'],
        "scope": specification['scope'],
    }})
    unique_specs.add(specification['id'])

# Tokenize (English stopwords removed), index, and persist.
corpus_text = [doc["text"] for doc in corpus_json]
corpus_tokens = bm25s.tokenize(corpus_text, stopwords="en")
retriever = bm25s.BM25(corpus=corpus_json)
retriever.index(corpus_tokens)
retriever.save(indexer_id)

# Package the saved index directory as bm25s.zip for distribution.
shutil.make_archive("bm25s", 'zip', '.', indexer_id)