PCA and KMeans functional

9046d6ce · NADAL Morgane · 8fedfebd · 9046d6ce
Commit 9046d6ce authored 4 years ago by NADAL Morgane
--- a/features_analysis.py
+++ b/features_analysis.py
@@ -42,22 +42,79 @@ import glob
 from typing import List


+def PCAOnDF(df: pd_.DataFrame,
+            target: str,
+            targets: List[str],
+            colors: List[str],
+            save_name: str = None,
+            title: str = ""
+            ) -> np_.ndarray:
+    '''
+    Standardize all columns of df except `target`, project them on 2 principal components and scatter-plot
+    the rows, one color per entry of `targets`. Prints and returns pca.explained_variance_ratio_.
+    '''
+    # Rebuild the target column from its raw values so that it gets the same 0..n-1 index as the PCA output
+    all_target = pd_.DataFrame(df.loc[:, [target]].values, columns=[target])
+    df_all = df.drop([target], axis=1)
+
+    # Standardize the data (fit and transform in a single call, as KmeansOnDF does);
+    # every column left in df_all is fed to the scaler
+    scaler = StandardScaler()
+    stand_df = scaler.fit_transform(df_all)
+
+    # Create the PCA and fit the data
+    pca = PCA(n_components=2)
+    principal_components = pca.fit_transform(stand_df)
+    print(f"PCA explained variance ratio ({save_name}): ", pca.explained_variance_ratio_)
+
+    principal_df = pd_.DataFrame(data=principal_components, columns=['principal component 1', 'principal component 2'])
+
+    # Give the final df containing the principal components and the target value of each row
+    final_df = pd_.concat([principal_df, all_target[target]], axis=1)
+
+    # Plot: one scatter call per target value, so the legend entries follow the order of `targets`
+    fig = pl_.figure(figsize=(8, 8))
+    ax = fig.add_subplot(1, 1, 1)
+    ax.set_xlabel('Principal Component 1', fontsize=15)
+    ax.set_ylabel('Principal Component 2', fontsize=15)
+    ax.set_title(f'2 component PCA{title}', fontsize=20)
+    for tgt, color in zip(targets, colors):
+        idx = final_df[target] == tgt
+        ax.scatter(final_df.loc[idx, 'principal component 1']
+                   , final_df.loc[idx, 'principal component 2']
+                   , c=color
+                   , s=30)
+    ax.legend(targets)
+    ax.grid()
+    if save_name is not None:
+        fig.savefig(f"D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis\\PCA_{save_name}.png")
+
+    return pca.explained_variance_ratio_
+
+
 def KmeansOnDF(df: pd_.DataFrame(),
               nb_clusters: tuple,
-               representation: bool = False,
+               target: str,
+               plot_bar: bool = True,
+               rep_on_image: bool = False,
               labeled_somas=None,
               elbow: bool = False,
               intracluster_var: bool = True,
+               save_name: str = None,
+               title: str = "",
               ) -> KMeans:
    '''
    Perform kmeans on the pandas dataframe. Can find the best number of cluster with elbow method,
    find the intracluster variance, and represent the result on the initial images.
    Returns kmeans.
    '''
+    # Separating the features from their conditions and durations
+    all_target = pd_.DataFrame(df.loc[:, [target]].values, columns=[target])
+    df = df.drop([target], axis=1)
+
    # Data standardization
    scaler = StandardScaler()
-    scaler.fit(df)
-    stand_df = scaler.transform(df)
+    stand_df = scaler.fit_transform(df)

    # Best number of clusters using Elbow method
    if elbow:
@@ -77,14 +134,29 @@ def KmeansOnDF(df: pd_.DataFrame(),
    # Kmeans with x clusters
    for nb_cluster in nb_clusters:
        kmeans = KMeans(n_clusters=nb_cluster, init='k-means++', max_iter=300, n_init=10, random_state=0)
-        pred_y = kmeans.fit_predict(stand_df)
+        kmeans.fit_predict(stand_df)
+        label_df = pd_.DataFrame(data=kmeans.labels_, columns=['label'])
+        lab_cond_df = pd_.concat([label_df, all_target[target]], axis=1)

        # Intracluster variance
        if intracluster_var:
-            var = IntraClusterVariance(df, kmeans, nb_cluster)
+            var_df = pd_.DataFrame(stand_df)
+            var = IntraClusterVariance(var_df, kmeans, nb_cluster)
+
+        # Barplot: number of rows of each target value that fall in each cluster label
+        if plot_bar:
+            fig = pl_.figure(figsize=(8, 8))
+            ax = fig.add_subplot(1, 1, 1)
+            sb_.countplot(x=target, hue="label", data=lab_cond_df, palette=sb_.color_palette("deep", n_colors=nb_cluster), ax=ax)
+            ax.set_title(f'Distribution of the clustering labels according to conditions{title}', fontsize=11)
+            ax.grid()
+            if save_name is not None:
+                fig.savefig(f"D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis\\Hist_Clustering_{save_name}.png")
+                # NOTE: the figure is neither shown nor closed here; the caller has to call
+                # pl_.show(block=True) / pl_.close() itself if that is needed

        # Representation on the image
-        if representation:
+        if rep_on_image:
            RepresentationOnImages(labeled_somas, kmeans, nb_cluster)

    return kmeans
@@ -96,10 +168,9 @@ def IntraClusterVariance(df: pd_.DataFrame(), kmeans: KMeans(), nb_cluster: int)
    '''
    var = []
    for cluster in range(nb_cluster):
-        soma_cluster = [indx + 1 for indx, value in enumerate(kmeans.labels_) if value == cluster]
-        mean_cluster = np_.average([df.loc[f"soma {row}", :] for row in soma_cluster], axis=0)
-        variance = sum([np_.linalg.norm(df.loc[f"soma {row}", :] - mean_cluster) ** 2 for row in soma_cluster]) / len(
-            soma_cluster)
+        soma_cluster = np_.flatnonzero(kmeans.labels_ == cluster)  # row positions of the samples assigned to this cluster
+        mean_cluster = np_.average(df.iloc[soma_cluster, :], axis=0)  # per-column mean of the cluster
+        variance = np_.sum((df.iloc[soma_cluster, :].to_numpy() - mean_cluster) ** 2) / max(len(soma_cluster) - 1, 1)  # n - 1 denominator; a single-sample cluster gives 0 instead of a division by zero
        var.append(variance)

    print(f"Intracluster variance for {nb_cluster} clusters :", var)
@@ -144,56 +215,6 @@ def FeaturesStatistics(df):
        pl_.close()


-def PCAOnDF(df: pd_.DataFrame(),
-            target: str,
-            targets: List[str],
-            colors: List[str],
-            save_name: str = None,
-            title: str = ""
-            ) -> list:
-    '''
-    Perform 2D PCA on the CHO-DIO dataframe.
-    Print ratio variance and plot the PCA.
-    '''
-    # Separating the features from their conditions and durations
-    # all_target = df.loc[:, [target]].values
-    df_all = df.drop([target], axis=1)
-
-    # Standardize the data
-    scaler = StandardScaler()
-    scaler.fit(df_all)
-    stand_df = scaler.transform(df_all)
-
-    # Create the PCA and fit the data
-    pca = PCA(n_components=2)
-    principal_components = pca.fit_transform(stand_df)
-    print(f"PCA explained variance ratio ({save_name}): ", pca.explained_variance_ratio_)
-
-    principal_df = pd_.DataFrame(data=principal_components, columns=['principal component 1', 'principal component 2'])
-
-    # Give the final df containing the principal component and their condition
-    final_df = pd_.concat([principal_df, df[target]], axis=1)
-
-    # Plot
-    fig = pl_.figure(figsize=(8, 8))
-    ax = fig.add_subplot(1, 1, 1)
-    ax.set_xlabel('Principal Component 1', fontsize=15)
-    ax.set_ylabel('Principal Component 2', fontsize=15)
-    ax.set_title(f'2 component PCA{title}', fontsize=20)
-    for tgt, color in zip(targets, colors):
-        idx = final_df[target] == tgt
-        ax.scatter(final_df.loc[idx, 'principal component 1']
-                   , final_df.loc[idx, 'principal component 2']
-                   , c=color
-                   , s=30)
-    ax.legend(targets)
-    ax.grid()
-    if save_name is not None:
-        pl_.savefig(f"D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis\\PCA_{save_name}.png")
-
-    return pca.explained_variance_ratio_
-
-
 if __name__ == "__main__":
    #
    # os.chdir("D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion")
@@ -224,71 +245,105 @@ if __name__ == "__main__":
                 axis=1)
    df = df.dropna(axis=0, how="any")

-    # KMeansIntraImage(df, nb_clusters=(2,))
-    # FeatureDistribution(df)
-
-    # -- PCA with all the features
-    # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-    df_all = df.drop(["duration"], axis=1)
-    all_pca = PCAOnDF(df_all, target="condition", targets=["CHO", "DIO"], colors=["b", "r"], save_name="all_features")
-
-    # Between the two conditions, for each duration (2 conditions, 3 durations)
-    groupby_duration = df.groupby("duration")
-    duration_pca = []
-    for duration, values in groupby_duration:
-        # print(duration, values.shape)
-        # groupby_condition = values.groupby("condition")
-        # for cond, val in groupby_condition:
-        #     print(cond, val.shape)
-        ## duration: str, values: pd_.DataFrame()
-        duration_df = values.drop(["duration"], axis=1)
-        pca = PCAOnDF(duration_df,
-                      target="condition",
-                      targets=["CHO", "DIO"],
-                      colors=["b", "r"],
-                      save_name=f"{duration}_features",
-                      title=f" - {duration} Sample")
-        duration_pca.append(pca)
-
-    ## -- K-means with all the features (2 conditions)
-
-    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-    # kmeans = fa_.KmeansOnDF(concatenated_features_df, nb_clusters=(2,), elbow=True, intracluster_var=True)
-
-    ## Between the two conditions, for each duration (2 conditions, 3 durations)
+    # # -- PCA with all the features
+    # # Between the two conditions, regardless of the duration of the experiment (2 conditions, all durations)
+    # df_all = df.drop(["duration"], axis=1)
+    # all_pca = PCAOnDF(df_all, target="condition", targets=["CHO", "DIO"], colors=["b", "r"], save_name="all_features")
+    #
+    # # Between the two conditions, for each duration (2 conditions, 3 durations)
+    # groupby_duration = df.groupby("duration")
+    # duration_pca = []
    # for duration, values in groupby_duration:
-    # kmeans = fa_.KmeansOnDF(values, nb_clusters=(2,), elbow=True, intracluster_var=True)
-
-    ## -- Select Discriminant features by statistical analysis
-    # TODO filtered_df = SelectFeatures(concatenated_features_df)
-
-    ## -- PCA with all the features with the cluster as label
-
-    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-    # TODO pca = fa_.PCAOnDF(concatenated_features_df)
-
-    ## Between the two conditions, for each duration (2 conditions, 3 durations)
+    #     # print(duration, values.shape)
+    #     # groupby_condition = values.groupby("condition")
+    #     # for cond, val in groupby_condition:
+    #     #     print(cond, val.shape)
+    #     ## duration: str, values: pd_.DataFrame()
+    #     duration_df = values.drop(["duration"], axis=1)
+    #     pca = PCAOnDF(duration_df,
+    #                   target="condition",
+    #                   targets=["CHO", "DIO"],
+    #                   colors=["b", "r"],
+    #                   save_name=f"{duration}_features",
+    #                   title=f" - {duration} Sample")
+    #     duration_pca.append(pca)
+
+    # -- K-means with all the features (2 conditions)
+
+    # # Between the two conditions, regardless of the duration of the experiment (2 conditions, all durations)
+    # df_all = df.drop(["duration"], axis=1)
+    # kmeans = KmeansOnDF(df_all,
+    #                     target="condition",
+    #                     nb_clusters=(2,),
+    #                     elbow=False,
+    #                     intracluster_var=True,
+    #                     plot_bar=True,
+    #                     save_name="all features"
+    #                     )
+    #
+    # # Between the two conditions, for each duration (2 conditions, 3 durations)
+    # groupby_duration = df.groupby("duration")
    # for duration, values in groupby_duration:
-    # pca = fa_.PCAOnDF(values)
+    #     duration_df = values.drop(["duration"], axis=1)
+    #     kmeans = KmeansOnDF(duration_df,
+    #                         target="condition",
+    #                         nb_clusters=(2,),
+    #                         elbow=False,
+    #                         intracluster_var=True,
+    #                         plot_bar=True,
+    #                         save_name=f"{duration}_features",
+    #                         title=f" - {duration} Sample",
+    #                         )

    ## -- Select Discriminant features by statistical analysis
    # TODO filtered_df = SelectFeatures(concatenated_features_df)

-    ## -- PCA with selected features
-
-    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-    # TODO pca = fa_.PCAOnDF(filtered_df)
-
-    ## Between the two conditions, for each duration (2 conditions, 3 durations)
-    # filtered_groupby_duration = filtered_df.groupby("Duration")
-    # for duration, values in filtered_groupby_duration:
-    # pca = fa_.PCAOnDF(values)
-
-    ## -- K-means with selected features
-
-    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-    # filtered_kmeans = fa_.KmeansOnDF(filtered_df, nb_clusters=(2,), elbow=True, intracluster_var=True)
-
-    ## Between the two conditions, for each duration (2 conditions, 3 durations)
-    # for duration, values in filtered_groupby_duration:
-    # filtered_kmeans = fa_.KmeansOnDF(values, nb_clusters=(2,), elbow=True, intracluster_var=True)
\ No newline at end of file
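+    # # -- PCA with selected features (TODO: the commented code below still uses the unfiltered df and the "all_features" save name)
+    # # Between the two conditions, regardless of the duration of the experiment (2 conditions, all durations)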
+    # df_all = df.drop(["duration"], axis=1)
+    # all_pca = PCAOnDF(df_all, target="condition", targets=["CHO", "DIO"], colors=["b", "r"], save_name="all_features")
+    #
+    # # Between the two conditions, for each duration (2 conditions, 3 durations)
+    # groupby_duration = df.groupby("duration")
+    # duration_pca = []
+    # for duration, values in groupby_duration:
+    #     # print(duration, values.shape)
+    #     # groupby_condition = values.groupby("condition")
+    #     # for cond, val in groupby_condition:
+    #     #     print(cond, val.shape)
+    #     ## duration: str, values: pd_.DataFrame()
+    #     duration_df = values.drop(["duration"], axis=1)
+    #     pca = PCAOnDF(duration_df,
+    #                   target="condition",
+    #                   targets=["CHO", "DIO"],
+    #                   colors=["b", "r"],
+    #                   save_name=f"{duration}_features",
+    #                   title=f" - {duration} Sample")
+    #     duration_pca.append(pca)
+
+    # -- K-means with selected features (2 conditions) (TODO: the commented code below still uses the unfiltered df and the "all features" save name)
+
+    # # Between the two conditions, regardless of the duration of the experiment (2 conditions, all durations)
+    # df_all = df.drop(["duration"], axis=1)
+    # kmeans = KmeansOnDF(df_all,
+    #                     target="condition",
+    #                     nb_clusters=(2,),
+    #                     elbow=False,
+    #                     intracluster_var=True,
+    #                     plot_bar=True,
+    #                     save_name="all features"
+    #                     )
+    #
+    # # Between the two conditions, for each duration (2 conditions, 3 durations)
+    # groupby_duration = df.groupby("duration")
+    # for duration, values in groupby_duration:
+    #     duration_df = values.drop(["duration"], axis=1)
+    #     kmeans = KmeansOnDF(duration_df,
+    #                         target="condition",
+    #                         nb_clusters=(2,),
+    #                         elbow=False,
+    #                         intracluster_var=True,
+    #                         plot_bar=True,
+    #                         save_name=f"{duration}_features",
+    #                         title=f" - {duration} Sample",
+    #                         )
\ No newline at end of file