om4r932 committed on
Commit
a49b92f
·
1 Parent(s): 1d09a74
3gpp_bm25_maker.py CHANGED
@@ -1,6 +1,14 @@
1
- import os, warnings
2
  os.environ["CURL_CA_BUNDLE"] = ''
3
  from dotenv import load_dotenv
 
 
 
 
 
 
 
 
4
  warnings.filterwarnings("ignore")
5
  load_dotenv()
6
  import bm25s
@@ -8,8 +16,8 @@ from bm25s.hf import BM25HF
8
  from datasets import load_dataset
9
  unique_specs = set()
10
 
11
- dataset_text = load_dataset("OrganizedProgrammers/3GPPSpecContent", token=os.environ.get("HF_TOKEN"))
12
- dataset_metadata = load_dataset("OrganizedProgrammers/3GPPSpecMetadata", token=os.environ.get("HF_TOKEN"))
13
 
14
  dataset_text = dataset_text["train"].to_list()
15
  dataset_metadata = dataset_metadata["train"].to_list()
 
1
+ import os, warnings, requests
2
  os.environ["CURL_CA_BUNDLE"] = ''
3
  from dotenv import load_dotenv
4
+ from huggingface_hub import configure_http_backend
5
+ def backend_factory() -> requests.Session:
6
+ session = requests.Session()
7
+ session.verify = False
8
+ return session
9
+
10
+ configure_http_backend(backend_factory=backend_factory)
11
+
12
  warnings.filterwarnings("ignore")
13
  load_dotenv()
14
  import bm25s
 
16
  from datasets import load_dataset
17
  unique_specs = set()
18
 
19
+ dataset_text = load_dataset("OrganizedProgrammers/3GPPSpecContent")
20
+ dataset_metadata = load_dataset("OrganizedProgrammers/3GPPSpecMetadata")
21
 
22
  dataset_text = dataset_text["train"].to_list()
23
  dataset_metadata = dataset_metadata["train"].to_list()
3gpp_spec_indexer.py CHANGED
@@ -1,10 +1,20 @@
1
  import os
2
  import time
3
  import warnings
 
4
  from dotenv import load_dotenv
5
  import numpy as np
6
  import pandas as pd
7
 
 
 
 
 
 
 
 
 
 
8
  warnings.filterwarnings("ignore")
9
  os.environ["CURL_CA_BUNDLE"] = ""
10
  load_dotenv()
@@ -14,7 +24,6 @@ import threading
14
  import zipfile
15
  import sys
16
  import subprocess
17
- import requests
18
  import re
19
  import traceback
20
  import io
@@ -26,7 +35,7 @@ DICT_LOCK = threading.Lock()
26
  DOCUMENT_LOCK = threading.Lock()
27
  STOP_EVENT = threading.Event()
28
 
29
- spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent", token=os.environ["HF_TOKEN"])
30
  spec_contents = spec_contents["train"].to_list()
31
  documents_by_spec_num = {}
32
  for section in spec_contents:
 
1
  import os
2
  import time
3
  import warnings
4
+ import requests
5
  from dotenv import load_dotenv
6
  import numpy as np
7
  import pandas as pd
8
 
9
+ from huggingface_hub import configure_http_backend
10
+ def backend_factory() -> requests.Session:
11
+ session = requests.Session()
12
+ session.verify = False
13
+ return session
14
+
15
+ configure_http_backend(backend_factory=backend_factory)
16
+
17
+
18
  warnings.filterwarnings("ignore")
19
  os.environ["CURL_CA_BUNDLE"] = ""
20
  load_dotenv()
 
24
  import zipfile
25
  import sys
26
  import subprocess
 
27
  import re
28
  import traceback
29
  import io
 
35
  DOCUMENT_LOCK = threading.Lock()
36
  STOP_EVENT = threading.Event()
37
 
38
+ spec_contents = load_dataset("OrganizedProgrammers/3GPPSpecContent")
39
  spec_contents = spec_contents["train"].to_list()
40
  documents_by_spec_num = {}
41
  for section in spec_contents:
3gpp_tdoc_indexer.py CHANGED
@@ -10,6 +10,13 @@ import re
10
  import concurrent.futures
11
  import threading
12
  from typing import List, Dict, Any
 
 
 
 
 
 
 
13
 
14
  warnings.filterwarnings("ignore")
15
 
 
10
  import concurrent.futures
11
  import threading
12
  from typing import List, Dict, Any
13
+ from huggingface_hub import configure_http_backend
14
+ def backend_factory() -> requests.Session:
15
+ session = requests.Session()
16
+ session.verify = False
17
+ return session
18
+
19
+ configure_http_backend(backend_factory=backend_factory)
20
 
21
  warnings.filterwarnings("ignore")
22
 
etsi_bm25_maker.py CHANGED
@@ -1,6 +1,13 @@
1
  from typing import Optional
2
- import os, warnings
3
  os.environ["CURL_CA_BUNDLE"] = ''
 
 
 
 
 
 
 
4
  from dotenv import load_dotenv
5
  warnings.filterwarnings("ignore")
6
  load_dotenv()
@@ -9,8 +16,8 @@ from bm25s.hf import BM25HF
9
  from datasets import load_dataset
10
  unique_specs = set()
11
 
12
- dataset_text = load_dataset("OrganizedProgrammers/ETSISpecContent", token=os.environ.get("HF_TOKEN"))
13
- dataset_metadata = load_dataset("OrganizedProgrammers/ETSISpecMetadata", token=os.environ.get("HF_TOKEN"))
14
 
15
  dataset_text = dataset_text["train"].to_list()
16
  dataset_metadata = dataset_metadata["train"].to_list()
 
1
  from typing import Optional
2
+ import os, warnings, requests
3
  os.environ["CURL_CA_BUNDLE"] = ''
4
+ from huggingface_hub import configure_http_backend
5
+ def backend_factory() -> requests.Session:
6
+ session = requests.Session()
7
+ session.verify = False
8
+ return session
9
+
10
+ configure_http_backend(backend_factory=backend_factory)
11
  from dotenv import load_dotenv
12
  warnings.filterwarnings("ignore")
13
  load_dotenv()
 
16
  from datasets import load_dataset
17
  unique_specs = set()
18
 
19
+ dataset_text = load_dataset("OrganizedProgrammers/ETSISpecContent")
20
+ dataset_metadata = load_dataset("OrganizedProgrammers/ETSISpecMetadata")
21
 
22
  dataset_text = dataset_text["train"].to_list()
23
  dataset_metadata = dataset_metadata["train"].to_list()
etsi_spec_indexer.py CHANGED
@@ -3,18 +3,25 @@ import time
3
  import warnings
4
  from dotenv import load_dotenv
5
  import numpy as np
 
6
  import pandas as pd
7
 
8
  warnings.filterwarnings("ignore")
9
  os.environ["CURL_CA_BUNDLE"] = ""
10
  load_dotenv()
 
 
 
 
 
 
 
11
 
12
  from datasets import load_dataset, Dataset
13
  from datasets.data_files import EmptyDatasetError
14
  import threading
15
  import zipfile
16
  import sys
17
- import requests
18
  import fitz
19
  import re
20
  import json
@@ -23,6 +30,7 @@ import io
23
  import concurrent.futures
24
  import hashlib
25
 
 
26
  CHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
27
  DICT_LOCK = threading.Lock()
28
  DOCUMENT_LOCK = threading.Lock()
 
3
  import warnings
4
  from dotenv import load_dotenv
5
  import numpy as np
6
+ import requests
7
  import pandas as pd
8
 
9
  warnings.filterwarnings("ignore")
10
  os.environ["CURL_CA_BUNDLE"] = ""
11
  load_dotenv()
12
+ from huggingface_hub import configure_http_backend
13
+ def backend_factory() -> requests.Session:
14
+ session = requests.Session()
15
+ session.verify = False
16
+ return session
17
+
18
+ configure_http_backend(backend_factory=backend_factory)
19
 
20
  from datasets import load_dataset, Dataset
21
  from datasets.data_files import EmptyDatasetError
22
  import threading
23
  import zipfile
24
  import sys
 
25
  import fitz
26
  import re
27
  import json
 
30
  import concurrent.futures
31
  import hashlib
32
 
33
+
34
  CHARS = "0123456789abcdefghijklmnopqrstuvwxyz"
35
  DICT_LOCK = threading.Lock()
36
  DOCUMENT_LOCK = threading.Lock()