Spaces:
Sleeping
Sleeping
Genesis
Browse files- app.py +212 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np # linear algebra
|
2 |
+
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
|
3 |
+
from huggingface_hub import snapshot_download
|
4 |
+
from datasets import load_dataset
|
5 |
+
from gensim.models import FastText
|
6 |
+
from s2sphere import CellId, Cell, LatLng
|
7 |
+
from collections import defaultdict
|
8 |
+
import folium
|
9 |
+
from folium import Map
|
10 |
+
import gradio as gr
|
11 |
+
from gradio_folium import Folium
|
12 |
+
from sklearn.cluster import KMeans
|
13 |
+
|
14 |
+
def extract_restaurant_embeddings(model, processed_df):
    """
    Collect one embedding vector per distinct restaurant cell.

    Every unique value in the ``res_cell_id`` column is stringified and
    looked up in ``model.wv``. Ids whose token raises ``KeyError`` are
    reported on stdout and omitted from the result.

    Returns a dict keyed by the original (un-stringified) id.
    """
    found = {}
    for cell in processed_df['res_cell_id'].unique():
        # The vocabulary key is the bare cell id, without any prefix.
        key = str(cell)
        try:
            vector = model.wv[key]
        except KeyError:
            print(f"Warning: Restaurant {cell} not found in vocabulary")
        else:
            found[cell] = vector
    return found
|
30 |
+
|
31 |
+
def cluster_embeddings(restaurant_embeddings, algo):
    """
    Assign a cluster label to every restaurant embedding.

    ``restaurant_embeddings`` maps a restaurant id to its vector; ``algo`` is
    any object exposing ``fit_predict(matrix)``. The vectors are stacked
    row-wise in dict order, clustered, and the resulting labels are paired
    back with their ids.

    Returns a dict mapping restaurant id -> label.
    """
    ids = list(restaurant_embeddings)
    # dict.values() iterates in the same order as the keys collected above.
    matrix = np.array(list(restaurant_embeddings.values()))
    assignments = algo.fit_predict(matrix)
    return {res_id: label for res_id, label in zip(ids, assignments)}
|
37 |
+
|
38 |
+
def s2_cell_to_geojson(cell_id_token_or_int):
    """
    Build a GeoJSON ``Feature`` outlining a single S2 cell.

    Args:
        cell_id_token_or_int: Either an S2 cell token (``str``) or the
            integer cell id. Integer-like values such as numpy integer
            scalars are converted to a plain ``int`` before use.

    Returns:
        A dict with ``"type": "Feature"``, a closed ``Polygon`` geometry
        built from the cell's four vertices as ``[lng, lat]`` in degrees,
        and ``properties`` holding ``str(cell_id)`` and the cell level.
    """
    if isinstance(cell_id_token_or_int, str):
        cell_id = CellId.from_token(str(cell_id_token_or_int))
    else:
        # Ids read back from pandas/numpy are fixed-width integer scalars.
        # Coerce to a Python int so the 64-bit id arithmetic inside CellId is
        # done with arbitrary-precision ints rather than a numpy dtype.
        cell_id = CellId(int(cell_id_token_or_int))

    cell = Cell(cell_id)

    # GeoJSON positions are [lng, lat]; a polygon ring must end where it starts.
    ring = []
    for corner in range(4):
        latlng = LatLng.from_point(cell.get_vertex(corner))
        ring.append([latlng.lng().degrees, latlng.lat().degrees])
    ring.append(ring[0])

    return {
        "type": "Feature",
        "geometry": {
            "type": "Polygon",
            "coordinates": [ring],
        },
        "properties": {
            "cell_id": str(cell_id),
            "level": cell_id.level(),
        },
    }
|
65 |
+
|
66 |
+
def map_cluster_to_restaurants(restaurant_clusters):
    """
    Invert a restaurant -> cluster mapping.

    Returns a ``defaultdict(list)`` whose keys are cluster ids and whose
    values list the restaurant ids assigned to that cluster, in the order
    they appear in ``restaurant_clusters``.
    """
    members_by_cluster = defaultdict(list)
    for restaurant in restaurant_clusters:
        label = restaurant_clusters[restaurant]
        members_by_cluster[label].append(restaurant)
    return members_by_cluster
|
72 |
+
|
73 |
+
def get_cluster_jsons(cluster_to_restaurants):
    """
    Turn each cluster's cell ids into a GeoJSON ``FeatureCollection``.

    ``cluster_to_restaurants`` maps a cluster id to the list of cell ids in
    it. One collection is produced per cluster, in mapping order. A cell that
    fails to convert is reported on stdout and skipped.

    Returns a list of FeatureCollection dicts.
    """
    def _collect(cell_ids):
        # Best-effort conversion: report the failing cell and carry on.
        converted = []
        for cell_id in cell_ids:
            try:
                converted.append(s2_cell_to_geojson(cell_id))
            except Exception as e:
                print(f"Error converting {cell_id}: {e}")
        return converted

    return [
        {"type": "FeatureCollection", "features": _collect(res_ids)}
        for res_ids in cluster_to_restaurants.values()
    ]
|
91 |
+
|
92 |
+
def visualise_on_map(jsons):
    """
    Draw each cluster's GeoJSON on a folium map as its own layer.

    ``jsons`` is a sequence of GeoJSON objects, one per cluster. Layer ``i``
    is named and tooltipped "Cluster i" and coloured from its index. A
    cluster that cannot be added is reported on stdout and skipped. A layer
    control is added so individual clusters can be toggled.

    Returns the populated ``folium.Map``.
    """
    # Fixed starting view; the map is not re-centred on the features.
    cluster_map = folium.Map(location=[12.935656, 77.543204], zoom_start=12)

    for index, collection in enumerate(jsons):
        label = f"Cluster {index}"
        shade = f"#{index*123456%0xFFFFFF:06x}"

        # Bind the colour as a default argument so each layer keeps its own.
        def _style(feature, color=shade):
            return {
                "fillColor": color,
                "color": color,
                "weight": 1,
                "fillOpacity": 0.4,
            }

        try:
            layer = folium.GeoJson(
                collection,
                name=label,
                tooltip=label,
                style_function=_style,
            )
            layer.add_to(cluster_map)
        except Exception as e:
            print(f"Failed to add cluster {index}: {e}")

    folium.LayerControl().add_to(cluster_map)
    return cluster_map
|
117 |
+
|
118 |
+
# Hugging Face Hub repository the FastText model files are downloaded from.
REPO_ID = "ankush-003/fastCell"

# Load the dataset from the Hub and materialise its 'train' split as a
# pandas DataFrame. This runs at import time.
dataset = load_dataset("ankush-003/Cells_Data")
df = dataset['train'].to_pandas()

# Download a snapshot of REPO_ID into /model so the model can be loaded from
# a local path below.
snapshot_download(repo_id=REPO_ID, local_dir="/model")

# NOTE(review): "embedddings" is spelled with three d's — presumably this
# matches the file name stored in the model repo; confirm before changing it.
model = FastText.load(
    "/model/cell_embedddings_model"
)

# Computed once at import time; run_clustering() reads this module-level dict
# on every call instead of re-extracting the vectors.
restaurant_embeddings = extract_restaurant_embeddings(model, df)
|
130 |
+
|
131 |
+
def run_clustering(num_clusters, clusters_to_display):
    """
    Cluster the restaurant embeddings with K-means and summarise the result.

    Fits ``KMeans(n_clusters=num_clusters)`` with a fixed seed on the
    module-level ``restaurant_embeddings``, writes the labels into the
    ``cluster`` column of the module-level ``df``, and builds a Markdown
    summary of the per-cluster row counts. At most ``clusters_to_display``
    clusters are drawn on the map.

    Returns a ``(markdown_text, folium_map)`` tuple.
    """
    estimator = KMeans(n_clusters=num_clusters, random_state=42)
    restaurant_clusters = cluster_embeddings(restaurant_embeddings, estimator)
    df['cluster'] = df['res_cell_id'].map(restaurant_clusters)

    # Rows per cluster, ordered by cluster label.
    cluster_sizes = df['cluster'].value_counts().sort_index()
    avg_size, min_size, max_size = (
        cluster_sizes.mean(),
        cluster_sizes.min(),
        cluster_sizes.max(),
    )

    analysis = f"""
## Clustering Analysis (K={num_clusters})

- Total restaurants: {len(df)}
- Number of clusters: {num_clusters}
- Average restaurants per cluster: {avg_size:.1f}
- Smallest cluster size: {min_size}
- Largest cluster size: {max_size}
- Empty clusters: {num_clusters - len(cluster_sizes)}
"""

    clusters_jsons = get_cluster_jsons(
        map_cluster_to_restaurants(restaurant_clusters)
    )
    # Never ask for more clusters than were actually produced.
    clusters_to_display = min(clusters_to_display, len(clusters_jsons))
    m = visualise_on_map(clusters_jsons[:clusters_to_display])

    return analysis, m
|
161 |
+
|
162 |
+
# Create Gradio interface
# Layout, top to bottom: title text, the two sliders, the run button, the
# Markdown summary, the map, and a static "About" section.
with gr.Blocks(title="Restaurant Clustering Tool") as app:
    gr.Markdown("# Restaurant K-Means Clustering Analysis")
    gr.Markdown("Analyze restaurant data by adjusting the number of clusters")

    with gr.Row():
        with gr.Column(scale=1):
            # K for KMeans; passed to run_clustering as `num_clusters`.
            # NOTE(review): maximum=3460 is presumably the number of distinct
            # restaurants in the dataset — confirm against the data.
            num_clusters_input = gr.Slider(
                minimum=2,
                maximum=3460,
                value=300,
                step=1,
                label="Total Number of Clusters (K)"
            )

            # Upper bound on how many clusters are drawn; passed to
            # run_clustering as `clusters_to_display`, which caps it at the
            # number of clusters actually produced.
            display_clusters_input = gr.Slider(
                minimum=1,
                maximum=3460,
                value=10,
                step=1,
                label="Number of Clusters to Display"
            )

    with gr.Row():
        cluster_btn = gr.Button("Run Clustering")

    with gr.Row():
        # Receives the Markdown summary returned by run_clustering.
        output_text = gr.Markdown()

    with gr.Row():
        # Starts as an empty map at the same location and zoom that
        # visualise_on_map uses, until the first clustering run replaces it.
        output_plot = Folium(value=Map(location=[12.935656, 77.543204], zoom_start=12), height=400)

    # Input order must match run_clustering's parameter order; output order
    # must match its (analysis, map) return tuple.
    cluster_btn.click(
        fn=run_clustering,
        inputs=[num_clusters_input, display_clusters_input],
        outputs=[output_text, output_plot]
    )

    gr.Markdown("""
## About this app

This app demonstrates K-means clustering on restaurant data. The algorithm groups similar restaurants together based on their descriptions and other features.

### How to use:
1. Adjust the number of clusters using the slider
2. Click "Run Clustering" to see the results
3. Analyze the visualization and metrics
""")

# Start the Gradio server only when the file is executed as a script.
if __name__ == "__main__":
    app.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
numpy
|
2 |
+
pandas
|
3 |
+
scikit-learn
|
4 |
+
gradio_folium
|
5 |
+
folium
|
6 |
+
gensim
|
7 |
+
gradio
|
8 |
+
s2sphere
|