avsolatorio committed on
Commit
8822f57
·
1 Parent(s): dcced35

Add data download

Browse files

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

Files changed (1) hide show
  1. services.py +18 -2
services.py CHANGED
@@ -1,4 +1,5 @@
1
  import json
 
2
  import pandas as pd
3
  import torch
4
  import httpx
@@ -8,6 +9,7 @@ from typing import Optional, Any
8
  from sentence_transformers import SentenceTransformer
9
 
10
  from pydantic import BaseModel, Field
 
11
 
12
 
13
  def get_best_torch_device():
@@ -26,9 +28,23 @@ device = get_best_torch_device()
26
 
27
 
28
  # Load the basic WDI metadata and vectors.
29
- wdi_data_vec_fpath = (
30
- "./data/avsolatorio__GIST-small-Embedding-v0__005__indicator_embeddings.json"
 
 
 
31
  )
 
 
 
 
 
 
 
 
 
 
 
32
  df = pd.read_json(wdi_data_vec_fpath)
33
 
34
  # Make it easy to index based on the idno
 
1
  import json
2
+ import os
3
  import pandas as pd
4
  import torch
5
  import httpx
 
9
  from sentence_transformers import SentenceTransformer
10
 
11
  from pydantic import BaseModel, Field
12
+ from urllib.request import urlretrieve
13
 
14
 
15
  def get_best_torch_device():
 
28
 
29
 
30
  # Load the basic WDI metadata and vectors.
31
+
32
+ EMBEDDING_FNAME = "avsolatorio__GIST-small-Embedding-v0__005__indicator_embeddings.json"
33
+ EMBEDDING_SOURCE = (
34
+ f"https://raw.githubusercontent.com/"
35
+ f"avsolatorio/ai-for-data-blog/refs/heads/main/semantic-search/data/{EMBEDDING_FNAME}"
36
  )
37
+ wdi_data_vec_fpath = os.path.join("data", EMBEDDING_FNAME)
38
+
39
+ os.makedirs(os.path.dirname(wdi_data_vec_fpath), exist_ok=True)
40
+
41
+ if not os.path.exists(wdi_data_vec_fpath):
42
+ print(f"Downloading {EMBEDDING_FNAME} to {wdi_data_vec_fpath}...")
43
+ urlretrieve(EMBEDDING_SOURCE, wdi_data_vec_fpath)
44
+ print("Download complete.")
45
+ else:
46
+ print(f"File already exists at {wdi_data_vec_fpath}.")
47
+
48
  df = pd.read_json(wdi_data_vec_fpath)
49
 
50
  # Make it easy to index based on the idno