Commit
·
8822f57
1
Parent(s):
dcced35
Add data download
Browse files
Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>
- services.py +18 -2
services.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import json
|
|
|
2 |
import pandas as pd
|
3 |
import torch
|
4 |
import httpx
|
@@ -8,6 +9,7 @@ from typing import Optional, Any
|
|
8 |
from sentence_transformers import SentenceTransformer
|
9 |
|
10 |
from pydantic import BaseModel, Field
|
|
|
11 |
|
12 |
|
13 |
def get_best_torch_device():
|
@@ -26,9 +28,23 @@ device = get_best_torch_device()
|
|
26 |
|
27 |
|
28 |
# Load the basic WDI metadata and vectors.
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
31 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
df = pd.read_json(wdi_data_vec_fpath)
|
33 |
|
34 |
# Make it easy to index based on the idno
|
|
|
1 |
import json
|
2 |
+
import os
|
3 |
import pandas as pd
|
4 |
import torch
|
5 |
import httpx
|
|
|
9 |
from sentence_transformers import SentenceTransformer
|
10 |
|
11 |
from pydantic import BaseModel, Field
|
12 |
+
from urllib.request import urlretrieve
|
13 |
|
14 |
|
15 |
def get_best_torch_device():
|
|
|
28 |
|
29 |
|
30 |
# Load the basic WDI metadata and vectors.
|
31 |
+
|
32 |
+
EMBEDDING_FNAME = "avsolatorio__GIST-small-Embedding-v0__005__indicator_embeddings.json"
|
33 |
+
EMBEDDING_SOURCE = (
|
34 |
+
f"https://raw.githubusercontent.com/"
|
35 |
+
f"avsolatorio/ai-for-data-blog/refs/heads/main/semantic-search/data/{EMBEDDING_FNAME}"
|
36 |
)
|
37 |
+
wdi_data_vec_fpath = os.path.join("data", EMBEDDING_FNAME)
|
38 |
+
|
39 |
+
os.makedirs(os.path.dirname(wdi_data_vec_fpath), exist_ok=True)
|
40 |
+
|
41 |
+
if not os.path.exists(wdi_data_vec_fpath):
|
42 |
+
print(f"Downloading {EMBEDDING_FNAME} to {wdi_data_vec_fpath}...")
|
43 |
+
urlretrieve(EMBEDDING_SOURCE, wdi_data_vec_fpath)
|
44 |
+
print("Download complete.")
|
45 |
+
else:
|
46 |
+
print(f"File already exists at {wdi_data_vec_fpath}.")
|
47 |
+
|
48 |
df = pd.read_json(wdi_data_vec_fpath)
|
49 |
|
50 |
# Make it easy to index based on the idno
|