Spaces:
Sleeping
Sleeping
File size: 2,131 Bytes
fcd327a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import json
import os
from opensearchpy import OpenSearch
from sentence_transformers import SentenceTransformer
# Sentence-embedding model, loaded once when the script starts.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Address of the OpenSearch node this script talks to.
host = "localhost"
port = 9200

# The admin password is taken from the environment when set.
# NOTE(review): the fallback value is a credential committed to source;
# confirm it is a throwaway development password and rotate it otherwise.
OPENSEARCH_ADMIN_PASSWORD = os.environ.get(
    "OPENSEARCH_ADMIN_PASSWORD", "yw7L5u9nLs3a"
)
auth = ("admin", OPENSEARCH_ADMIN_PASSWORD)

# The connection uses TLS, but both certificate verification and hostname
# checking are switched off and the resulting warning is silenced.
# NOTE(review): presumably intended for a local node with a self-signed
# certificate -- confirm before pointing this at a remote cluster.
client = OpenSearch(
    hosts=[dict(host=host, port=port)],
    http_auth=auth,
    http_compress=True,  # gzip-compress request bodies
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)
def _build_document(record):
    """Convert one parsed property record into an index document.

    Args:
        record: The object decoded from one line of the input file.

    Returns:
        A ``(document_id, document)`` tuple. The id is the formatted
        address; the document carries the listing fields plus the embedding
        of the most recently seen description.

    Raises:
        KeyError, IndexError, TypeError, ValueError: When a field the
            document needs is missing, empty, or of an unexpected type.
    """
    print(f'indexing {record["address"]}')
    bathrooms = int(record["numBathroom"])
    beds = record["numBedroom"]
    price = record["mostRecentPriceAmount"]
    size = record["floorSizeValue"]
    address = ", ".join(
        [
            record["address"],
            record["city"],
            record["province"],
            record["postalCode"][:5],  # keep only the first five characters
        ]
    )
    # Pick the description with the greatest "dateSeen". max() returns the
    # first maximal entry, the same element a stable descending sort would
    # place at index 0, without sorting the whole list.
    latest = max(record["descriptions"], key=lambda entry: entry["dateSeen"])
    description = latest["value"]
    document = {
        "bathrooms": bathrooms,
        "bedrooms": beds,
        "listingPrice": price,
        "squareFootage": size,
        "address": address,
        "publicDescription": description,
        "publicDescriptionKnn": model.encode(description).tolist(),
    }
    return address, document


# Alternating action / document entries, as the bulk endpoint expects.
bulk_body = []
skipped = 0

with open(
    "datafiniti_properties_sunnyvale_400.json", "r", encoding="utf-8"
) as f:
    for line_number, line in enumerate(f, start=1):
        if not line.strip():
            # A blank line (e.g. a trailing newline) is not a record.
            continue
        try:
            # json.JSONDecodeError subclasses ValueError, so a malformed
            # line is skipped like any other unusable record instead of
            # aborting the whole run.
            doc_id, row = _build_document(json.loads(line))
        except (KeyError, IndexError, TypeError, ValueError) as exc:
            # Best effort: incomplete records are skipped, but visibly.
            skipped += 1
            print(f"skipping line {line_number}: {exc!r}")
            continue
        bulk_body.append(
            {"create": {"_index": "datafiniti_props", "_id": doc_id}}
        )
        bulk_body.append(row)

if skipped:
    print(f"skipped {skipped} unusable record(s)")

if bulk_body:
    response = client.bulk(
        index="datafiniti_props",
        body=bulk_body,
    )
    # The bulk call reports per-item failures in its response body rather
    # than raising, so surface them here instead of discarding the response.
    if isinstance(response, dict) and response.get("errors"):
        failed = [
            item
            for item in response.get("items", [])
            if "error" in item.get("create", {})
        ]
        print(f"bulk request completed with {len(failed)} failed item(s)")
        for item in failed:
            outcome = item["create"]
            print(
                f'  {outcome.get("_id")}: '
                f'{outcome.get("status")} {outcome.get("error")}'
            )
else:
    # Sending an empty bulk body is pointless and is rejected by the server.
    print("no documents to index")
|