Spaces:
Sleeping
Sleeping
import json | |
import os | |
from opensearchpy import OpenSearch | |
from sentence_transformers import SentenceTransformer | |
# Sentence-embedding model; its output vectors are stored alongside each
# listing's description when documents are built further down in this script.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Connection settings. The defaults match the previously hard-coded values;
# each one can be overridden through the environment.
host = os.getenv("OPENSEARCH_HOST", "localhost")
port = int(os.getenv("OPENSEARCH_PORT", "9200"))

# NOTE(review): the fallback password below is a credential committed to
# source control. It is kept only so existing local setups keep working —
# set OPENSEARCH_ADMIN_PASSWORD explicitly and rotate this value.
OPENSEARCH_ADMIN_PASSWORD = os.getenv("OPENSEARCH_ADMIN_PASSWORD", "yw7L5u9nLs3a")
auth = ("admin", OPENSEARCH_ADMIN_PASSWORD)

# Create the client with SSL/TLS enabled. Certificate verification AND
# hostname verification are both disabled (and the resulting warning is
# silenced), so traffic is encrypted but the server's identity is not checked.
# NOTE(review): presumably intended for a local/dev cluster with a
# self-signed certificate — confirm before pointing this at a shared host.
client = OpenSearch(
    hosts=[{"host": host, "port": port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=auth,
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)
INDEX_NAME = "datafiniti_props"


def _build_document(record):
    """Map one raw listing record (a parsed JSON object) to the indexed document.

    Raises KeyError, IndexError, TypeError or ValueError when a required field
    is missing, empty, or of an unexpected type; the caller treats any of
    those as "skip this record".
    """
    address = ", ".join(
        [
            record["address"],
            record["city"],
            record["province"],
            record["postalCode"][:5],  # keep only the first five characters
        ]
    )
    # Pick the description with the greatest "dateSeen". max() returns the
    # first maximal element, which is the same element a stable descending
    # sort would place at index 0.
    latest = max(record["descriptions"], key=lambda entry: entry["dateSeen"])
    description = latest["value"]
    return {
        "bathrooms": int(record["numBathroom"]),
        "bedrooms": record["numBedroom"],
        "listingPrice": record["mostRecentPriceAmount"],
        "squareFootage": record["floorSizeValue"],
        "address": address,
        "publicDescription": description,
        "publicDescriptionKnn": model.encode(description).tolist(),
    }


# Read the JSON-lines export and accumulate alternating action/document
# entries for a single bulk request.
bulk_body = []
skipped = 0
with open(
    "datafiniti_properties_sunnyvale_400.json", "r", encoding="utf-8"
) as f:
    for line_number, line in enumerate(f, start=1):
        if not line.strip():
            # A blank (e.g. trailing) line is not a record; json.loads would
            # raise on it and abort the whole run.
            continue
        record = json.loads(line)
        try:
            document = _build_document(record)
        except (KeyError, IndexError, TypeError, ValueError) as exc:
            # Best-effort ingestion: incomplete records are skipped, but the
            # reason is reported instead of being silently swallowed.
            skipped += 1
            print(f"skipping line {line_number}: {exc!r}")
            continue
        print(f'indexing {record["address"]}')
        # "create" fails for an _id that already exists rather than
        # overwriting it; the address doubles as the document id.
        bulk_body.append(
            {"create": {"_index": INDEX_NAME, "_id": document["address"]}}
        )
        bulk_body.append(document)

if skipped:
    print(f"skipped {skipped} record(s) with missing or malformed fields")

if bulk_body:
    response = client.bulk(
        index=INDEX_NAME,
        body=bulk_body,
    )
    # A bulk request can succeed as a whole while individual items fail;
    # the response's "errors" flag signals that case.
    if response.get("errors"):
        failed = [
            item["create"]
            for item in response.get("items", [])
            if "error" in item.get("create", {})
        ]
        print(f"{len(failed)} of {len(bulk_body) // 2} document(s) were rejected")
        for item in failed:
            print(f'  {item.get("_id")}: {item.get("error")}')
else:
    # Sending an empty bulk body is rejected by the server, so skip the call.
    print("no valid records found; nothing to index")