Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 9046d6ce authored by NADAL Morgane's avatar NADAL Morgane
Browse files

PCA and KMeans functional

parent 8fedfebd
No related branches found
No related tags found
No related merge requests found
......@@ -42,22 +42,79 @@ import glob
from typing import List
def PCAOnDF(df: pd_.DataFrame,
            target: str,
            targets: List[str],
            colors: List[str],
            save_name: str = None,
            title: str = ""
            ) -> np_.ndarray:
    '''
    Perform a 2D PCA on the CHO-DIO dataframe.
    Print the explained variance ratio and plot the two principal components.

    df: dataframe holding the feature columns plus the column named by `target`.
    target: name of the column holding the label of each row; it is removed before the PCA.
    targets: label values to plot, one scatter series per value.
    colors: one matplotlib color per entry of `targets` (paired with zip, so extra entries are ignored).
    save_name: if not None, the figure is saved as "PCA_{save_name}.png" in a hard-coded results folder.
    title: suffix appended to the plot title.

    Returns the explained variance ratio of the two components (pca.explained_variance_ratio_).
    '''
    # Separating the features from their labels. The labels get a fresh positional index so that
    # they line up row by row with the PCA output (which also has a positional index).
    labels = df[target].reset_index(drop=True)
    features = df.drop([target], axis=1)

    # Standardize the data
    stand_features = StandardScaler().fit_transform(features)

    # Create the PCA and fit the data
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(stand_features)
    print(f"PCA explained variance ratio ({save_name}): ", pca.explained_variance_ratio_)
    principal_df = pd_.DataFrame(data=principal_components,
                                 columns=['principal component 1', 'principal component 2'])

    # Final df containing the principal components and the label of each row
    final_df = pd_.concat([principal_df, labels], axis=1)

    # Plot
    fig = pl_.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title(f'2 component PCA{title}', fontsize=20)
    for tgt, color in zip(targets, colors):
        idx = final_df[target] == tgt
        ax.scatter(final_df.loc[idx, 'principal component 1'],
                   final_df.loc[idx, 'principal component 2'],
                   c=color,
                   s=30)
    ax.legend(targets)
    ax.grid()

    if save_name is not None:
        # NOTE(review): absolute, machine-specific output folder -- confirm it exists before running elsewhere
        pl_.savefig(f"D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis\\PCA_{save_name}.png")

    return pca.explained_variance_ratio_
def KmeansOnDF(df: pd_.DataFrame(),
nb_clusters: tuple,
representation: bool = False,
target: str,
plot_bar: bool = True,
rep_on_image: bool = False,
labeled_somas=None,
elbow: bool = False,
intracluster_var: bool = True,
save_name: str = None,
title: str = "",
) -> KMeans:
'''
Perform kmeans on the pandas dataframe. Can find the best number of cluster with elbow method,
find the intracluster variance, and represent the result on the initial images.
Returns kmeans.
'''
# Separating the features from their conditions and durations
all_target = pd_.DataFrame(df.loc[:, [target]].values, columns=[target])
df = df.drop([target], axis=1)
# Data standardization
scaler = StandardScaler()
scaler.fit(df)
stand_df = scaler.transform(df)
stand_df = scaler.fit_transform(df)
# Best number of clusters using Elbow method
if elbow:
......@@ -77,14 +134,29 @@ def KmeansOnDF(df: pd_.DataFrame(),
# Kmeans with x clusters
for nb_cluster in nb_clusters:
kmeans = KMeans(n_clusters=nb_cluster, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(stand_df)
kmeans.fit_predict(stand_df)
label_df = pd_.DataFrame(data=kmeans.labels_, columns=['label'])
lab_cond_df = pd_.concat([label_df, all_target[target]], axis=1)
# Intracluster variance
if intracluster_var:
var = IntraClusterVariance(df, kmeans, nb_cluster)
var_df = pd_.DataFrame(stand_df)
var = IntraClusterVariance(var_df, kmeans, nb_cluster)
# Barplot
if plot_bar:
fig = pl_.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
sb_.countplot(x="condition", hue="label", data=lab_cond_df, palette=sb_.color_palette("deep", n_colors=2))
ax.set_title(f'Distribution of the clustering labels according to conditions{title}', fontsize=11)
ax.grid()
if save_name is not None:
pl_.savefig(f"D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis\\Hist_Clustering_{save_name}.png")
# pl_.show(block=True)
# pl_.close()
# Representation on the image
if representation:
if rep_on_image:
RepresentationOnImages(labeled_somas, kmeans, nb_cluster)
return kmeans
......@@ -96,10 +168,9 @@ def IntraClusterVariance(df: pd_.DataFrame(), kmeans: KMeans(), nb_cluster: int)
'''
var = []
for cluster in range(nb_cluster):
soma_cluster = [indx + 1 for indx, value in enumerate(kmeans.labels_) if value == cluster]
mean_cluster = np_.average([df.loc[f"soma {row}", :] for row in soma_cluster], axis=0)
variance = sum([np_.linalg.norm(df.loc[f"soma {row}", :] - mean_cluster) ** 2 for row in soma_cluster]) / len(
soma_cluster)
soma_cluster = [indx for indx, value in enumerate(kmeans.labels_) if value == cluster]
mean_cluster = np_.average([df.iloc[row, :] for row in soma_cluster], axis=0)
variance = sum([np_.linalg.norm(df.iloc[row, :] - mean_cluster) ** 2 for row in soma_cluster]) / (len(soma_cluster) - 1)
var.append(variance)
print(f"Intracluster variance for {nb_cluster} clusters :", var)
......@@ -144,56 +215,6 @@ def FeaturesStatistics(df):
pl_.close()
def PCAOnDF(df: pd_.DataFrame,
            target: str,
            targets: List[str],
            colors: List[str],
            save_name: str = None,
            title: str = ""
            ) -> np_.ndarray:
    '''
    Perform a 2D PCA on the CHO-DIO dataframe.
    Print the explained variance ratio and plot the two principal components.

    df: dataframe holding the feature columns plus the column named by `target`.
    target: name of the column holding the label of each row; it is removed before the PCA.
    targets: label values to plot, one scatter series per value.
    colors: one matplotlib color per entry of `targets` (paired with zip, so extra entries are ignored).
    save_name: if not None, the figure is saved as "PCA_{save_name}.png" in a hard-coded results folder.
    title: suffix appended to the plot title.

    Returns the explained variance ratio of the two components (pca.explained_variance_ratio_).
    '''
    # Separating the features from their labels.
    # pd_.concat(axis=1) aligns rows on index labels: the PCA output below has a fresh positional
    # index, so the labels must be re-indexed positionally too. Concatenating df[target] directly
    # would misalign (and fill with NaN) whenever df does not carry a default 0..n-1 index.
    labels = df[target].reset_index(drop=True)
    features = df.drop([target], axis=1)

    # Standardize the data
    stand_features = StandardScaler().fit_transform(features)

    # Create the PCA and fit the data
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(stand_features)
    print(f"PCA explained variance ratio ({save_name}): ", pca.explained_variance_ratio_)
    principal_df = pd_.DataFrame(data=principal_components,
                                 columns=['principal component 1', 'principal component 2'])

    # Final df containing the principal components and the label of each row
    final_df = pd_.concat([principal_df, labels], axis=1)

    # Plot
    fig = pl_.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title(f'2 component PCA{title}', fontsize=20)
    for tgt, color in zip(targets, colors):
        idx = final_df[target] == tgt
        ax.scatter(final_df.loc[idx, 'principal component 1'],
                   final_df.loc[idx, 'principal component 2'],
                   c=color,
                   s=30)
    ax.legend(targets)
    ax.grid()

    if save_name is not None:
        # NOTE(review): absolute, machine-specific output folder -- confirm it exists before running elsewhere
        pl_.savefig(f"D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis\\PCA_{save_name}.png")

    return pca.explained_variance_ratio_
if __name__ == "__main__":
#
# os.chdir("D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion")
......@@ -224,71 +245,105 @@ if __name__ == "__main__":
axis=1)
df = df.dropna(axis=0, how="any")
# KMeansIntraImage(df, nb_clusters=(2,))
# FeatureDistribution(df)
# -- PCA with all the features
# Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
df_all = df.drop(["duration"], axis=1)
all_pca = PCAOnDF(df_all, target="condition", targets=["CHO", "DIO"], colors=["b", "r"], save_name="all_features")
# Between the two conditions, for each duration (2 conditions, 3 durations)
groupby_duration = df.groupby("duration")
duration_pca = []
for duration, values in groupby_duration:
# print(duration, values.shape)
# groupby_condition = values.groupby("condition")
# for cond, val in groupby_condition:
# print(cond, val.shape)
## duration: str, values: pd_.DataFrame()
duration_df = values.drop(["duration"], axis=1)
pca = PCAOnDF(duration_df,
target="condition",
targets=["CHO", "DIO"],
colors=["b", "r"],
save_name=f"{duration}_features",
title=f" - {duration} Sample")
duration_pca.append(pca)
## -- K-means with all the features (2 conditions)
## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
# kmeans = fa_.KmeansOnDF(concatenated_features_df, nb_clusters=(2,), elbow=True, intracluster_var=True)
## Between the two conditions, for each duration (2 conditions, 3 durations)
# # -- PCA with all the features
# # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
# df_all = df.drop(["duration"], axis=1)
# all_pca = PCAOnDF(df_all, target="condition", targets=["CHO", "DIO"], colors=["b", "r"], save_name="all_features")
#
# # Between the two conditions, for each duration (2 conditions, 3 durations)
# groupby_duration = df.groupby("duration")
# duration_pca = []
# for duration, values in groupby_duration:
# kmeans = fa_.KmeansOnDF(values, nb_clusters=(2,), elbow=True, intracluster_var=True)
## -- Select Discriminant features by statistical analysis
# TODO filtered_df = SelectFeatures(concatenated_features_df)
## -- PCA with all the features with the cluster as label
## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
# TODO pca = fa_.PCAOnDF(concatenated_features_df)
## Between the two conditions, for each duration (2 conditions, 3 durations)
# # print(duration, values.shape)
# # groupby_condition = values.groupby("condition")
# # for cond, val in groupby_condition:
# # print(cond, val.shape)
# ## duration: str, values: pd_.DataFrame()
# duration_df = values.drop(["duration"], axis=1)
# pca = PCAOnDF(duration_df,
# target="condition",
# targets=["CHO", "DIO"],
# colors=["b", "r"],
# save_name=f"{duration}_features",
# title=f" - {duration} Sample")
# duration_pca.append(pca)
# -- K-means with all the features (2 conditions)
# # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
# df_all = df.drop(["duration"], axis=1)
# kmeans = KmeansOnDF(df_all,
# target="condition",
# nb_clusters=(2,),
# elbow=False,
# intracluster_var=True,
# plot_bar=True,
# save_name="all features"
# )
#
# # Between the two conditions, for each duration (2 conditions, 3 durations)
# groupby_duration = df.groupby("duration")
# for duration, values in groupby_duration:
# pca = fa_.PCAOnDF(values)
# duration_df = values.drop(["duration"], axis=1)
# kmeans = KmeansOnDF(duration_df,
# target="condition",
# nb_clusters=(2,),
# elbow=False,
# intracluster_var=True,
# plot_bar=True,
# save_name=f"{duration}_features",
# title=f" - {duration} Sample",
# )
## -- Select Discriminant features by statistical analysis
# TODO filtered_df = SelectFeatures(concatenated_features_df)
## -- PCA with selected features
## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
# TODO pca = fa_.PCAOnDF(filtered_df)
## Between the two conditions, for each duration (2 conditions, 3 durations)
# filtered_groupby_duration = filtered_df.groupby("Duration")
# for duration, values in filtered_groupby_duration:
# pca = fa_.PCAOnDF(values)
## -- K-means with selected features
## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
# filtered_kmeans = fa_.KmeansOnDF(filtered_df, nb_clusters=(2,), elbow=True, intracluster_var=True)
## Between the two conditions, for each duration (2 conditions, 3 durations)
# for duration, values in filtered_groupby_duration:
# filtered_kmeans = fa_.KmeansOnDF(values, nb_clusters=(2,), elbow=True, intracluster_var=True)
\ No newline at end of file
# # -- PCA with selected features
# # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
# df_all = df.drop(["duration"], axis=1)
# all_pca = PCAOnDF(df_all, target="condition", targets=["CHO", "DIO"], colors=["b", "r"], save_name="all_features")
#
# # Between the two conditions, for each duration (2 conditions, 3 durations)
# groupby_duration = df.groupby("duration")
# duration_pca = []
# for duration, values in groupby_duration:
# # print(duration, values.shape)
# # groupby_condition = values.groupby("condition")
# # for cond, val in groupby_condition:
# # print(cond, val.shape)
# ## duration: str, values: pd_.DataFrame()
# duration_df = values.drop(["duration"], axis=1)
# pca = PCAOnDF(duration_df,
# target="condition",
# targets=["CHO", "DIO"],
# colors=["b", "r"],
# save_name=f"{duration}_features",
# title=f" - {duration} Sample")
# duration_pca.append(pca)
# -- K-means with selected features (2 conditions)
# # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
# df_all = df.drop(["duration"], axis=1)
# kmeans = KmeansOnDF(df_all,
# target="condition",
# nb_clusters=(2,),
# elbow=False,
# intracluster_var=True,
# plot_bar=True,
# save_name="all features"
# )
#
# # Between the two conditions, for each duration (2 conditions, 3 durations)
# groupby_duration = df.groupby("duration")
# for duration, values in groupby_duration:
# duration_df = values.drop(["duration"], axis=1)
# kmeans = KmeansOnDF(duration_df,
# target="condition",
# nb_clusters=(2,),
# elbow=False,
# intracluster_var=True,
# plot_bar=True,
# save_name=f"{duration}_features",
# title=f" - {duration} Sample",
# )
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment