import time
from multiprocessing import Pool
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import load_npz
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
# Which embedding set to work on.
embed_type = 'SBERT'  # Change this to 'MLFPA' or 'BERT' as needed

# File-name stem shared by the parquet paths below ('sbert' for 'SBERT').
_embed_stem = embed_type.lower().replace("-", "")

# Fast path: read the cached parquet. If it cannot be opened, build it from
# the .npz archive, save it as parquet and load that file.
try:
    embeddings_df = pd.read_parquet(f'{embed_type} embeddings/{_embed_stem}_embeddings.parquet')
except OSError:
    # Only a missing/unreadable file (FileNotFoundError is an OSError) triggers
    # the rebuild; other errors, including Ctrl-C, now propagate instead of
    # being silently swallowed by a bare ``except``.
    #
    # NOTE(review): the fallback reads and writes under 'BERT embeddings/'
    # while the fast path looks under f'{embed_type} embeddings/', and the
    # .npz file name and key are hard-coded to SBERT. With embed_type='SBERT'
    # the parquet written here is therefore not the file the fast path opens
    # on the next run. Confirm the intended directory layout.
    with np.load('BERT embeddings/sbert_embedding.npz') as npz_file:
        # Indexing materialises the array, so it stays valid after the
        # archive is closed.
        embeddings_df = npz_file['sbert_embedding']
    print(embeddings_df.shape)  # Check the shape of the embeddings
    print(type(embeddings_df))  # Check the type of the embeddings
    # Wrap the array in a DataFrame so it can be written as parquet.
    embeddings_df = pd.DataFrame(embeddings_df)
    embeddings_df.to_parquet(f'BERT embeddings/{_embed_stem}_embeddings.parquet')
    # Re-read the file just written so the fallback ends with a frame that
    # was loaded from parquet, like the fast path.
    embeddings_df = pd.read_parquet(f'BERT embeddings/{_embed_stem}_embeddings.parquet')
# Scale the embeddings and reduce them with PCA before clustering
def scale_and_pca(embeddings_df, n_components=3, random_state=None):
    """Standardize the embeddings and project them onto principal components.

    Args:
        embeddings_df: 2-D array-like (one row per sample) accepted by
            ``StandardScaler.fit_transform``.
        n_components: Number of principal components to keep. Defaults to 3,
            the value that was previously hard-coded.
        random_state: Forwarded unchanged to ``PCA``. ``None`` (the default)
            keeps the original unseeded behaviour; pass an int to make the
            projection reproducible when PCA picks a randomized solver.

    Returns:
        The array returned by ``PCA.fit_transform``: one row per input row
        and ``n_components`` columns.
    """
    # Standardize first; the PCA below is fitted on the scaled values.
    embeddings_scaled = StandardScaler().fit_transform(embeddings_df)
    pca = PCA(n_components=n_components, random_state=random_state)
    return pca.fit_transform(embeddings_scaled)
# Project the embeddings with the default settings of scale_and_pca.
embeddings_pca = scale_and_pca(embeddings_df)
# Drop the reference to the full embeddings frame so its memory can be
# reclaimed; the statements below only use the projection.
del embeddings_df
# Create a 3D scatter plot of the PCA results | |
def plot_3d_scatter(embeddings_pca, title='3D PCA of BERT Embeddings'):
    """Show a 3-D scatter plot of the first three columns of ``embeddings_pca``.

    Args:
        embeddings_pca: 2-D array indexable as ``embeddings_pca[:, i]`` with
            at least three columns (columns 0, 1 and 2 are plotted).
        title: Plot title. Defaults to the previously hard-coded text, so
            existing calls are unchanged; pass another string when plotting
            a different embedding type.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    # s=1 keeps the markers small.
    ax.scatter(embeddings_pca[:, 0], embeddings_pca[:, 1], embeddings_pca[:, 2], s=1)
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    ax.set_zlabel('PC 3')
    plt.title(title)
    plt.show()
# plot_3d_scatter(embeddings_pca) | |
# def compute_silhouette(n_clusters, data): | |
# kmeans = KMeans(n_clusters=n_clusters, random_state=420) | |
# labels = kmeans.fit_predict(data) | |
# silhouette_avg = silhouette_score(data, labels) | |
# print(f"For n_clusters = {n_clusters}, the silhouette score is: {silhouette_avg}") | |
# return silhouette_avg | |
# silhouette_scores = [] | |
# for i in range(2, 10): | |
# start_time = time.time() | |
# silhouette_scores.append(compute_silhouette(i, embeddings_pca)) | |
# end_time = time.time() | |
# print(f"Time taken for n_clusters = {i}: {end_time - start_time} seconds") | |
# # Plot silhouette scores | |
# plt.figure(figsize=(10, 6)) | |
# plt.plot(range(2, 10), silhouette_scores, marker='o') | |
# plt.title('Silhouette Scores for Different Cluster Sizes') | |
# plt.xlabel('Number of Clusters') | |
# plt.ylabel('Silhouette Score') | |
# plt.xticks(range(2, 10)) | |
# plt.grid() | |
# plt.show() | |
# # Save silhouette scores to CSV | |
# silhouette_df = pd.DataFrame({'n_clusters': range(2, 10), 'silhouette_score': silhouette_scores}) | |
# silhouette_df.to_csv('MLFPA_project-main/Raf_scores/silhouette_scores.csv', index=False) | |
# Save the cluster labels for n_clusters = 5
def save_cluster_labels(n_clusters, data, output_path='raf_clusters/cluster_labels_sbert.csv'):
    """Cluster ``data`` with K-means and write the labels to a CSV file.

    Args:
        n_clusters: Number of clusters passed to ``KMeans``.
        data: Samples passed to ``KMeans.fit_predict``.
        output_path: Destination CSV file. Defaults to the path that was
            previously hard-coded, so existing calls write to the same place.

    Returns:
        A DataFrame with a single ``cluster_label`` column, one row per
        sample; the same frame that is written to ``output_path``.
    """
    # Fixed seed, as in the original script.
    kmeans = KMeans(n_clusters=n_clusters, random_state=420)
    labels = kmeans.fit_predict(data)
    labels_df = pd.DataFrame(labels, columns=['cluster_label'])
    # to_csv does not create missing directories; make sure the target
    # folder exists so the write does not fail on a fresh checkout.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    labels_df.to_csv(output_path, index=False)
    return labels_df
# Cluster the PCA projection into 5 groups and write the labels to disk.
save_cluster_labels(5, embeddings_pca)
# plot_3d_scatter(embeddings_pca) | |