buyer_agent / ingest_datafiniti_data.py
mincomp's picture
Upload folder using huggingface_hub
fcd327a verified
import json
import os
from opensearchpy import OpenSearch
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used to vectorize listing descriptions.
model = SentenceTransformer("all-MiniLM-L6-v2")

# OpenSearch endpoint.
host, port = "localhost", 9200

# Admin password is read from the environment, with a literal fallback.
# NOTE(review): the fallback is a credential committed to source control;
# consider requiring the environment variable instead.
OPENSEARCH_ADMIN_PASSWORD = os.environ.get(
    "OPENSEARCH_ADMIN_PASSWORD", "yw7L5u9nLs3a"
)
auth = ("admin", OPENSEARCH_ADMIN_PASSWORD)

# TLS is enabled, but certificate verification, hostname assertion, and the
# related warnings are all switched off.
client = OpenSearch(
    hosts=[dict(host=host, port=port)],
    http_auth=auth,
    http_compress=True,  # gzip-compress request bodies
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)
# Target index and the newline-delimited JSON export to ingest.
_INDEX_NAME = "datafiniti_props"
_SOURCE_PATH = "datafiniti_properties_sunnyvale_400.json"


def _build_row(record):
    """Turn one raw property record into an ``(address, document)`` pair.

    Args:
        record: Dict parsed from one line of the export file.

    Returns:
        A tuple of the formatted address (also used as the document ``_id``)
        and the document body to index.

    Raises:
        KeyError, IndexError, TypeError, ValueError: If a required field is
            missing, empty, or has an unexpected type.
    """
    address = ", ".join(
        [
            record["address"],
            record["city"],
            record["province"],
            record["postalCode"][:5],  # keep only the first five characters
        ]
    )
    # Use the description with the greatest "dateSeen" value.
    latest = max(record["descriptions"], key=lambda d: d["dateSeen"])
    description = latest["value"]
    row = {
        "bathrooms": int(record["numBathroom"]),
        "bedrooms": record["numBedroom"],
        "listingPrice": record["mostRecentPriceAmount"],
        "squareFootage": record["floorSizeValue"],
        "address": address,
        "publicDescription": description,
        "publicDescriptionKnn": model.encode(description).tolist(),
    }
    return address, row


# Build the bulk payload: one action line followed by one document per record.
bulk_body = []
with open(_SOURCE_PATH, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            # Tolerate blank/trailing lines instead of failing in json.loads.
            continue
        try:
            record = json.loads(line)
            print(f'indexing {record["address"]}')
            address, row = _build_row(record)
        except (KeyError, IndexError, TypeError, ValueError) as exc:
            # Incomplete or malformed records are skipped, but visibly so.
            print(f"skipping line {line_no}: {exc!r}")
            continue
        bulk_body.append({"create": {"_index": _INDEX_NAME, "_id": address}})
        bulk_body.append(row)

if bulk_body:
    response = client.bulk(index=_INDEX_NAME, body=bulk_body)
    # A bulk request can succeed overall while individual items fail
    # (for example "create" on an _id that already exists).
    if response.get("errors"):
        for item in response.get("items", []):
            result = item.get("create", {})
            if "error" in result:
                print(f'failed to index {result.get("_id")}: {result["error"]}')
else:
    # Sending an empty bulk body is rejected by the server; skip the call.
    print("no valid records found; nothing to index")