ankush-003 commited on
Commit
edba165
·
verified ·
1 Parent(s): 896f4f0
Files changed (2) hide show
  1. app.py +212 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np # linear algebra
2
+ import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
3
+ from huggingface_hub import snapshot_download
4
+ from datasets import load_dataset
5
+ from gensim.models import FastText
6
+ from s2sphere import CellId, Cell, LatLng
7
+ from collections import defaultdict
8
+ import folium
9
+ from folium import Map
10
+ import gradio as gr
11
+ from gradio_folium import Folium
12
+ from sklearn.cluster import KMeans
13
+
14
def extract_restaurant_embeddings(model, processed_df):
    """
    Collect one embedding vector per distinct restaurant cell.

    Each unique value of ``processed_df['res_cell_id']`` is converted to a
    string and looked up in ``model.wv``. IDs whose lookup raises
    ``KeyError`` are reported on stdout and omitted from the result.

    Returns:
        dict: original (un-stringified) cell ID -> embedding vector, in the
        order the IDs first appear in the column.
    """
    embeddings_by_id = {}
    for cell in processed_df['res_cell_id'].unique():
        # The vocabulary token is the bare cell ID, with no prefix added.
        try:
            embeddings_by_id[cell] = model.wv[str(cell)]
        except KeyError:
            print(f"Warning: Restaurant {cell} not found in vocabulary")
    return embeddings_by_id
30
+
31
def cluster_embeddings(restaurant_embeddings, algo):
    """
    Assign a cluster label to every restaurant embedding.

    Args:
        restaurant_embeddings: dict of restaurant ID -> embedding vector.
        algo: any object exposing ``fit_predict(matrix)`` that returns one
            label per row of the matrix.

    Returns:
        dict: restaurant ID -> label, in the input dict's iteration order.
    """
    ids = list(restaurant_embeddings)
    # Rows are stacked in the same order as ``ids`` so labels line up by index.
    vectors = np.array([restaurant_embeddings[rid] for rid in ids])
    assignments = algo.fit_predict(vectors)
    return {rid: label for rid, label in zip(ids, assignments)}
37
+
38
def s2_cell_to_geojson(cell_id_token_or_int):
    """
    Build a GeoJSON ``Feature`` outlining a single S2 cell.

    Args:
        cell_id_token_or_int: either an S2 token string (parsed with
            ``CellId.from_token``) or an integer cell ID. Integer-like
            scalars such as ``numpy.int64`` are accepted and converted to a
            plain ``int`` before being handed to ``CellId``.

    Returns:
        dict: a GeoJSON Feature whose geometry is a closed Polygon ring of
        the cell's four vertices as ``[lng, lat]`` pairs, and whose
        properties hold ``"cell_id"`` (``str()`` of the ``CellId``) and
        ``"level"`` (``cell_id.level()``).
    """
    if isinstance(cell_id_token_or_int, str):
        cell_id = CellId.from_token(cell_id_token_or_int)
    else:
        # IDs read out of a pandas column arrive as fixed-width numpy
        # integers. NOTE(review): s2sphere is assumed to expect a plain
        # Python int for its 64-bit ID arithmetic, so normalise here.
        cell_id = CellId(int(cell_id_token_or_int))

    cell = Cell(cell_id)

    # Collect the four cell corners; GeoJSON orders coordinates as [lng, lat].
    ring = []
    for k in range(4):
        corner = LatLng.from_point(cell.get_vertex(k))
        ring.append([corner.lng().degrees, corner.lat().degrees])
    ring.append(ring[0])  # Repeat the first vertex to close the polygon

    return {
        "type": "Feature",
        "geometry": {
            "type": "Polygon",
            "coordinates": [ring],
        },
        "properties": {
            "cell_id": str(cell_id),
            "level": cell_id.level(),
        },
    }
65
+
66
def map_cluster_to_restaurants(restaurant_clusters):
    """
    Invert a restaurant -> cluster mapping into cluster -> [restaurants].

    Returns:
        defaultdict(list): cluster ID -> restaurant IDs, each list in the
        order the restaurants appear in ``restaurant_clusters``.
    """
    members_by_cluster = defaultdict(list)
    for restaurant in restaurant_clusters:
        members_by_cluster[restaurant_clusters[restaurant]].append(restaurant)
    return members_by_cluster
72
+
73
def get_cluster_jsons(cluster_to_restaurants):
    """
    Turn each cluster's cell IDs into one GeoJSON FeatureCollection.

    Cells that fail to convert are reported on stdout and skipped, so one
    bad ID does not discard the rest of its cluster.

    Returns:
        list: one FeatureCollection dict per cluster, in the mapping's
        iteration order (the cluster ID itself is not stored in the output).
    """
    feature_collections = []
    for member_cells in cluster_to_restaurants.values():
        cell_features = []
        for cell in member_cells:
            try:
                cell_features.append(s2_cell_to_geojson(cell))
            except Exception as e:
                print(f"Error converting {cell}: {e}")

        # Wrap this cluster's features in a GeoJSON FeatureCollection
        feature_collections.append(
            {"type": "FeatureCollection", "features": cell_features}
        )
    return feature_collections
91
+
92
def visualise_on_map(jsons, center=(12.935656, 77.543204), zoom_start=12):
    """
    Draw each GeoJSON object as its own coloured, toggleable map layer.

    Args:
        jsons: iterable of GeoJSON dicts, one per cluster. Layers are
            labelled "Cluster <position>" using the position in this
            iterable.
        center: (lat, lng) the map is initially centred on. Defaults to the
            location that was previously hard-coded.
        zoom_start: initial zoom level passed to ``folium.Map``.

    Returns:
        folium.Map: the map with one GeoJson layer per successfully added
        cluster plus a layer control. A layer that fails to build is
        reported on stdout and skipped.
    """
    m = folium.Map(location=list(center), zoom_start=zoom_start)

    # Loop through all cluster GeoJSONs and add them to the map
    for i, geojson in enumerate(jsons):
        # Deterministic colour derived from the layer index.
        color = f"#{i * 123456 % 0xFFFFFF:06x}"
        try:
            folium.GeoJson(
                geojson,
                name=f"Cluster {i}",
                tooltip=f"Cluster {i}",
                # Bind the colour as a default argument so every layer keeps
                # its own value instead of the last one from the loop.
                style_function=lambda feature, color=color: {
                    "fillColor": color,
                    "color": color,
                    "weight": 1,
                    "fillOpacity": 0.4,
                },
            ).add_to(m)
        except Exception as e:
            print(f"Failed to add cluster {i}: {e}")

    # Layer control lets the user toggle individual clusters
    folium.LayerControl().add_to(m)

    return m
117
+
118
# Hugging Face Hub repository the FastText model files are downloaded from.
REPO_ID = "ankush-003/fastCell"
# Local directory the repository snapshot is written to.
MODEL_DIR = "/model"

# Load the cells dataset and keep its 'train' split as a DataFrame.
dataset = load_dataset("ankush-003/Cells_Data")
df = dataset['train'].to_pandas()

# Fetch the model repository, then load the FastText model from it.
snapshot_download(repo_id=REPO_ID, local_dir=MODEL_DIR)
model = FastText.load(f"{MODEL_DIR}/cell_embedddings_model")

# Computed once at import time and reused by every clustering request.
restaurant_embeddings = extract_restaurant_embeddings(model, df)
130
+
131
def run_clustering(num_clusters, clusters_to_display):
    """
    Cluster the restaurant embeddings with K-Means and render the result.

    Args:
        num_clusters: requested number of clusters (K). Coerced to ``int``
            and capped at the number of available embeddings, because
            KMeans rejects a non-integer K and a K larger than the number
            of samples.
        clusters_to_display: how many cluster layers to draw on the map;
            coerced to ``int``. Values beyond the number of clusters simply
            show all of them.

    Returns:
        tuple: (Markdown summary string, folium map of the shown clusters).

    Side effects:
        Overwrites the ``'cluster'`` column of the module-level ``df``.
    """
    # UI sliders may deliver floats; KMeans and list slicing need integers.
    num_clusters = min(int(num_clusters), len(restaurant_embeddings))
    clusters_to_display = int(clusters_to_display)

    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    restaurant_clusters = cluster_embeddings(restaurant_embeddings, kmeans)
    df['cluster'] = df['res_cell_id'].map(restaurant_clusters)

    # Sizes are row counts of df per cluster label, ordered by label.
    cluster_sizes = df['cluster'].value_counts().sort_index()
    avg_size = cluster_sizes.mean()
    min_size = cluster_sizes.min()
    max_size = cluster_sizes.max()

    analysis = f"""
    ## Clustering Analysis (K={num_clusters})

    - Total restaurants: {len(df)}
    - Number of clusters: {num_clusters}
    - Average restaurants per cluster: {avg_size:.1f}
    - Smallest cluster size: {min_size}
    - Largest cluster size: {max_size}
    - Empty clusters: {num_clusters - len(cluster_sizes)}
    """

    c_to_r = map_cluster_to_restaurants(restaurant_clusters)
    clusters_jsons = get_cluster_jsons(c_to_r)

    # Slicing already stops at the end of the list, so no explicit clamp.
    m = visualise_on_map(clusters_jsons[:clusters_to_display])

    return analysis, m
161
+
162
# Create Gradio interface
with gr.Blocks(title="Restaurant Clustering Tool") as app:
    gr.Markdown("# Restaurant K-Means Clustering Analysis")
    gr.Markdown("Analyze restaurant data by adjusting the number of clusters")

    # Controls: total K, and how many of the resulting clusters to draw.
    with gr.Row():
        with gr.Column(scale=1):
            num_clusters_input = gr.Slider(
                label="Total Number of Clusters (K)",
                minimum=2,
                maximum=3460,
                step=1,
                value=300,
            )

            display_clusters_input = gr.Slider(
                label="Number of Clusters to Display",
                minimum=1,
                maximum=3460,
                step=1,
                value=10,
            )

    with gr.Row():
        cluster_btn = gr.Button("Run Clustering")

    # Outputs: the Markdown summary, then the map.
    with gr.Row():
        output_text = gr.Markdown()

    with gr.Row():
        # Empty map shown before the first clustering run.
        initial_map = Map(location=[12.935656, 77.543204], zoom_start=12)
        output_plot = Folium(value=initial_map, height=400)

    # Both slider values go to run_clustering; its two results fill the outputs.
    cluster_btn.click(
        fn=run_clustering,
        inputs=[num_clusters_input, display_clusters_input],
        outputs=[output_text, output_plot],
    )

    gr.Markdown("""
    ## About this app

    This app demonstrates K-means clustering on restaurant data. The algorithm groups similar restaurants together based on their descriptions and other features.

    ### How to use:
    1. Adjust the number of clusters using the slider
    2. Click "Run Clustering" to see the results
    3. Analyze the visualization and metrics
    """)

if __name__ == "__main__":
    app.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ scikit-learn
4
+ gradio_folium
5
+ folium
6
+ gensim
7
+ gradio
8
+ s2sphere