From 7b24a67690eba358c7662c6665975b53e9526f74 Mon Sep 17 00:00:00 2001
From: monadal <morgane.nadal@inria.fr>
Date: Mon, 6 Jul 2020 18:18:25 +0200
Subject: [PATCH] Improve features_analysis.py and add a new parameter to
 nutrimorph.py

---
 features_analysis.py | 582 ++++++++++++++++++++++++++-----------------
 nutrimorph.py        |   3 +-
 parameters.ini       |   1 +
 search_parameters.py |   1 +
 4 files changed, 359 insertions(+), 228 deletions(-)

diff --git a/features_analysis.py b/features_analysis.py
index ef7ef96..6c19640 100644
--- a/features_analysis.py
+++ b/features_analysis.py
@@ -51,10 +51,12 @@ def PCAOnDF(df: pd_.DataFrame(),
             save_name: str = None,
             title: str = "",
             plot_duration: bool = True,
+            save_in: str = None,
+            three_D: bool = False,
             ) -> list:
     '''
-    Perform 2D PCA on the CHO-DIO dataframe.
-    Print ratio variance and plot the PCA.
+    Perform 2D or 3D PCA on the CHO-DIO dataframe.
+    Save the explained variance ratios as CSV, plot and save the PCA figures, and return the 2D explained variance ratios.
     '''
     # Separating the features from their conditions and durations
     all_target = pd_.DataFrame(df.loc[:, [target]].values, columns=[target])
@@ -68,18 +70,23 @@ def PCAOnDF(df: pd_.DataFrame(),
     # Create the PCA and fit the data
     pca = PCA(n_components=2)
     principal_components = pca.fit_transform(stand_df)
+
+    # Printing and saving the explained variance ratio
     print(f"PCA explained variance ratio ({save_name}): ", pca.explained_variance_ratio_)
+    pca_exp_var_df = pd_.DataFrame(pca.explained_variance_ratio_, columns=["pca explained variance ratio"])
+    pca_exp_var_df = pca_exp_var_df.rename(index={0: "principal component 1", 1: "principal component 2"})
+    pca_exp_var_df.to_csv(f"{save_in}\\pca_explained_variance_ratio_{save_name}.csv")
 
+    # Creating the principal component df
     principal_df = pd_.DataFrame(data=principal_components, columns=['principal component 1', 'principal component 2'])
-
     # Give the final df containing the principal component and their condition
     final_df = pd_.concat([principal_df, all_target[target]], axis=1)
 
-    # Plot
+    # Plot 2D
     fig = pl_.figure(figsize=(8, 8))
     ax = fig.add_subplot(1, 1, 1)
-    ax.set_xlabel('Principal Component 1', fontsize=15)
-    ax.set_ylabel('Principal Component 2', fontsize=15)
+    ax.set_xlabel(f'Principal Component 1 - ratio = {round(pca.explained_variance_ratio_[0], 2)}', fontsize=15)
+    ax.set_ylabel(f'Principal Component 2 - ratio = {round(pca.explained_variance_ratio_[1], 2)}', fontsize=15)
     ax.set_title(f'2 component PCA{title}', fontsize=20)
     for tgt, color in zip(targets, colors):
         idx = final_df[target] == tgt
@@ -89,30 +96,104 @@ def PCAOnDF(df: pd_.DataFrame(),
                    , s=30)
     ax.legend(targets)
     ax.grid()
+
     if save_name is not None:
-        pl_.savefig(f"D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis\\PCA_{save_name}.png")
+        pl_.savefig(f"{save_in}\\PCA_{save_name}.png")
 
     if plot_duration:
+        # Make separated plots for each duration of the experiments
         fig = pl_.figure(figsize=(8, 8))
         ax = fig.add_subplot(1, 1, 1)
-        ax.set_xlabel('Principal Component 1', fontsize=15)
-        ax.set_ylabel('Principal Component 2', fontsize=15)
+
+        ax.set_xlabel(f'Principal Component 1 - ratio = {round(pca.explained_variance_ratio_[0], 2)}', fontsize=15)
+        ax.set_ylabel(f'Principal Component 2 - ratio = {round(pca.explained_variance_ratio_[1], 2)}', fontsize=15)
         ax.set_title(f'2 component PCA{title}', fontsize=20)
-        targets = [["CHO", "1H"], ["CHO", "3H"], ["CHO", "6H"], ["DIO", "1H"], ["DIO", "3H"], ["DIO", "6H"]]
+
+        new_tgts = [["CHO", "1H"], ["CHO", "3H"], ["CHO", "6H"], ["DIO", "1H"], ["DIO", "3H"], ["DIO", "6H"]]
         final_df = pd_.concat([df[["condition", "duration"]], principal_df], axis=1)
-        for tgt, color in zip(targets, ["lavender", "royalblue", "navy", "lightcoral", "firebrick", "red"]):
+
+        for tgt, color in zip(new_tgts, ["lavender", "royalblue", "navy", "lightcoral", "firebrick", "red"]):
             new_df = final_df.loc[(final_df["condition"] == tgt[0]) & (final_df["duration"] == tgt[1])]
             new_df = new_df.drop(["condition", "duration"], axis=1)
+
             ax.scatter(new_df['principal component 1']
                        , new_df['principal component 2']
                        , c=color
                        , s=30)
+
+        ax.legend(new_tgts)
+        ax.grid()
+
+        if save_name is not None:
+            pl_.savefig(f"{save_in}\\PCA_duration_{save_name}.png")
+
+    if three_D:
+        # Create the 3D PCA and fit the data
+        pca3d = PCA(n_components=3)
+        principal_components3d = pca3d.fit_transform(stand_df)
+
+        # Print and Save the explained variance ratio
+        print(f"3D PCA explained variance ratio ({save_name}): ", pca3d.explained_variance_ratio_)
+        pca3d_exp_var_df = pd_.DataFrame(pca3d.explained_variance_ratio_, columns=["3d pca explained variance ratio"])
+        pca3d_exp_var_df = pca3d_exp_var_df.rename(index={0: "principal component 1", 1: "principal component 2", 2: "principal component 3"})
+        pca3d_exp_var_df.to_csv(f"{save_in}\\three_d_pca_explained_variance_ratio_{save_name}.csv")
+
+        # Creating the principal component df
+        principal3d_df = pd_.DataFrame(data=principal_components3d,
+                                       columns=['principal component 1', 'principal component 2', 'principal component 3'])
+        # Give the final df containing the principal component and their condition
+        final3d_df = pd_.concat([principal3d_df, all_target[target]], axis=1)
+
+        # Plot
+        fig = pl_.figure(figsize=(8, 8))
+        ax = fig.add_subplot(111, projection='3d')
+
+        ax.set_xlabel(f'Principal Component 1 - ratio = {round(pca3d.explained_variance_ratio_[0], 2)}', fontsize=15)
+        ax.set_ylabel(f'Principal Component 2 - ratio = {round(pca3d.explained_variance_ratio_[1], 2)}', fontsize=15)
+        ax.set_zlabel(f'Principal Component 3 - ratio = {round(pca3d.explained_variance_ratio_[2], 2)}', fontsize=15)
+        ax.set_title(f'3 component PCA{title}', fontsize=20)
+
+        for tgt, color in zip(targets, colors):
+            idx = final3d_df[target] == tgt
+
+            ax.scatter(final3d_df.loc[idx, 'principal component 1']
+                       , final3d_df.loc[idx, 'principal component 2']
+                       , final3d_df.loc[idx, 'principal component 3']
+                       , c=color
+                       , s=30)
+
         ax.legend(targets)
         ax.grid()
+
         if save_name is not None:
-            pl_.savefig(f"D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis\\PCA_duration_{save_name}.png")
+            pl_.savefig(f"{save_in}\\three_d_PCA_{save_name}.png")
+
+        if plot_duration:
+            fig = pl_.figure(figsize=(8, 8))
+            ax = fig.add_subplot(111, projection="3d")
 
-    return pca.explained_variance_ratio_
+            ax.set_xlabel(f'Principal Component 1 - ratio = {round(pca3d.explained_variance_ratio_[0], 2)}', fontsize=15)
+            ax.set_ylabel(f'Principal Component 2 - ratio = {round(pca3d.explained_variance_ratio_[1], 2)}', fontsize=15)
+            ax.set_zlabel(f'Principal Component 3 - ratio = {round(pca3d.explained_variance_ratio_[2], 2)}', fontsize=15)
+            ax.set_title(f'3 component PCA{title}', fontsize=20)
+
+            final3d_df = pd_.concat([df[["condition", "duration"]], principal3d_df], axis=1)
+
+            for tgt, color in zip(new_tgts, ["lavender", "royalblue", "navy", "lightcoral", "firebrick", "red"]):
+                new3d_df = final3d_df.loc[(final3d_df["condition"] == tgt[0]) & (final3d_df["duration"] == tgt[1])]
+                new3d_df = new3d_df.drop(["condition", "duration"], axis=1)
+
+                ax.scatter(new3d_df['principal component 1']
+                           , new3d_df['principal component 2']
+                           , new3d_df['principal component 3']
+                           , c=color
+                           , s=30)
+
+            ax.legend(new_tgts)
+            ax.grid()
+
+            if save_name is not None:
+                pl_.savefig(f"{save_in}\\three_d_PCA_duration_{save_name}.png")
+
+    return pca.explained_variance_ratio_
 
 
 def KmeansOnDF(df: pd_.DataFrame(),
@@ -131,7 +212,7 @@ def KmeansOnDF(df: pd_.DataFrame(),
     '''
     Perform kmeans on the pandas dataframe. Can find the best number of cluster with elbow method,
     find the intracluster variance, and represent the result on the initial images.
-    Returns kmeans.
+    Plot bar plots showing the percentage of each label for each condition.
     '''
     # Separating the features from their conditions and durations
     all_target = pd_.DataFrame(df.loc[:, [target]].values, columns=[target])
@@ -148,6 +229,7 @@ def KmeansOnDF(df: pd_.DataFrame(),
             kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
             kmeans.fit(stand_df)
             wcss.append(kmeans.inertia_)
+        pl_.figure()
         pl_.plot(range(1, 24), wcss)
         pl_.plot(range(1, 24), wcss, 'bo')
         pl_.title('Elbow Method')
@@ -167,10 +249,10 @@ def KmeansOnDF(df: pd_.DataFrame(),
         if intracluster_var:
             var_df = pd_.DataFrame(stand_df)
             var = IntraClusterVariance(var_df, kmeans, nb_cluster, save_name)
-            # TODO save csv
-            var_df = pd_.DataFrame(var, columns=["PCA explained variance ratio"])
+            var_df = pd_.DataFrame(var, columns=["intracluster variance"])
+            var_df = var_df.rename({cluster: f"label {cluster}" for cluster in range(nb_cluster)})
             if save_in:
-                var_df.to_csv(f"{save_in}\\pca_var_explained_ratio_{save_name}.csv")
+                var_df.to_csv(f"{save_in}\\intracluster_variance_kmeans_{save_name}_k={nb_cluster}.csv")
 
         # TODO Intersection of ellipsoids of the covariance matrices
         #  - ellipsoid equation: (x-mean)(1/Mcov)(x-mean).T <= 1
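+        # Hypothetical sketch for the TODO above (commented out, not wired in): a sample x
+        # lies inside the 1-sigma ellipsoid of a cluster with mean mu and covariance cov iff
+        # (x - mu) @ inv(cov) @ (x - mu) <= 1.
+        # def InsideEllipsoid(x: np_.ndarray, mu: np_.ndarray, cov: np_.ndarray) -> bool:
+        #     diff = x - mu
+        #     return float(diff @ np_.linalg.inv(cov) @ diff) <= 1.0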
@@ -182,13 +264,12 @@ def KmeansOnDF(df: pd_.DataFrame(),
                 ax = fig.add_subplot(1, 1, 1)
                 (lab_cond_df.groupby("condition")["label"].value_counts(normalize=True).mul(100).rename(
                     "Percentage (%)").reset_index().pipe((sb_.barplot, "data"), x="condition", y="Percentage (%)",
-                                                         hue="label", palette=sb_.color_palette("deep", n_colors=2)))
-                # sb_.countplot(x="condition", hue="label", data=lab_cond_df, palette=sb_.color_palette("deep", n_colors=2))
+                                                         hue="label", palette=sb_.color_palette("deep", n_colors=nb_cluster)))
                 ax.set_ylim(0, 100)
-                ax.set_title(f'Distribution of the clustering labels according to conditions{title}', fontsize=11)
+                ax.set_title(f'Distribution of the clustering labels according to conditions{title} - k={nb_cluster}', fontsize=11)
                 ax.grid()
                 if save_name is not None:
-                    pl_.savefig(f"{save_in}\\Hist_Clustering_{save_name}.png")
+                    pl_.savefig(f"{save_in}\\Hist_Clustering_{save_name}_k={nb_cluster}.png")
                     # pl_.show(block=True)
                     # pl_.close()
             else:
@@ -196,14 +277,12 @@ def KmeansOnDF(df: pd_.DataFrame(),
                 ax= fig.add_subplot(1, 1, 1)
                 (lab_cond_df.groupby("condition")["label"].value_counts(normalize=True).mul(100).rename(
                     "Percentage (%)").reset_index().pipe((sb_.barplot, "data"), x="condition", y="Percentage (%)",
-                                                         hue="label", palette=sb_.color_palette("deep", n_colors=2)))
-
-                # sb_.countplot(x="condition", hue="label", data=lab_cond_df, palette=sb_.color_palette("deep", n_colors=2))
+                                                         hue="label", palette=sb_.color_palette("deep", n_colors=nb_cluster)))
                 ax.set_ylim(0, 100)
                 ax.grid()
-                ax.set_title(f'Distribution of the clustering labels according to conditions{title}', fontsize=11)
+                ax.set_title(f'Distribution of the clustering labels according to conditions{title} - k={nb_cluster}', fontsize=11)
                 if save_name is not None:
-                    pl_.savefig(f"{save_in}\\Hist_Clustering_{save_name}.png")
+                    pl_.savefig(f"{save_in}\\Hist_Clustering_{save_name}_k={nb_cluster}.png")
                     # pl_.show(block=True)
                     # pl_.close()
                 lab_cond_df2 = pd_.concat((lab_cond_df, df[["duration"]]), axis=1)
@@ -213,18 +292,17 @@ def KmeansOnDF(df: pd_.DataFrame(),
                     ax = fig.add_subplot(1, 1, 1)
                     (dur_df.groupby("condition")["label"].value_counts(normalize=True).mul(100).rename(
                         "Percentage (%)").reset_index().pipe((sb_.barplot, "data"), x="condition", y="Percentage (%)",
-                                                             hue="label", palette=sb_.color_palette("deep", n_colors=2)))
+                                                             hue="label", palette=sb_.color_palette("deep", n_colors=nb_cluster)))
                     ax.set_ylim(0, 100)
                     ax.grid()
-                    ax.set_title(f'Distribution of the clustering labels according to conditions{title} - duration = {dur}', fontsize=11)
+                    ax.set_title(f'Distribution of the clustering labels according to conditions{title} - k={nb_cluster} - duration = {dur}', fontsize=11)
                     if save_name is not None:
-                        pl_.savefig(f"{save_in}\\Hist_Clustering_{save_name}_dur_{dur}.png")
+                        pl_.savefig(f"{save_in}\\Hist_Clustering_{save_name}_dur_{dur}_k={nb_cluster}.png")
                         # pl_.show(block=True)
                         # pl_.close()
 
-                # sb_.catplot("condition", col="duration", kind="count", hue="label", data=pd_.concat((lab_cond_df, df[["duration"]]), axis=1), palette=sb_.color_palette("deep", n_colors=2))
                 if save_name is not None:
-                    pl_.savefig(f"{save_in}\\Hist_duration_Clustering_{save_name}.png")
+                    pl_.savefig(f"{save_in}\\Hist_duration_Clustering_{save_name}_k={nb_cluster}.png")
                     print(f"Saved in {save_in}")
                     # pl_.show(block=True)
                     # pl_.close()
@@ -282,6 +360,9 @@ def FeaturesStatistics(df: pd_.DataFrame(),
                        ):
     '''
     Return the statistics allowing the user to choose the most relevant features to feed ML algorithms for ex.
+    Statistical description; correlation heatmap; dropping of features that are highly correlated or differ little between the two conditions;
+    plots of the feature distributions and boxplots; Kolmogorov-Smirnov and Wilcoxon rank-sum two-sample tests.
+    Can also draw a decision tree.
     '''
     #
     # Overview of the basic stats on each columns of df
@@ -290,16 +371,10 @@ def FeaturesStatistics(df: pd_.DataFrame(),
         if save_in is not None:
             description.to_csv(f"{save_in}\\df_stat_description.csv")
 
+    # Data formatting
     df_scalar = df.drop(["duration", "condition"], axis=1)
     df_cond = df.drop(["duration"], axis=1)
-    # df_groupby_cond = df.drop("duration", axis=1).groupby("condition")
     df_groupby_dur = df.groupby("duration")
-    # condition = pd_.DataFrame(df.loc[:, ["condition"]].values, columns=["condition"])
-
-    # Overview of features distribution and correlation
-
-    ## /!\ if too many features, error in seaborn display
-    ## Even by splitting the data : too intense!
 
     # Compute the correlation matrix
     corr_matrix = df_scalar.corr().abs()
@@ -320,68 +395,18 @@ def FeaturesStatistics(df: pd_.DataFrame(),
             pl_.savefig(f"{save_in}\\Features correlation heat map{title}.png")
             pl_.close()
 
-    if drop_feat:
-        # Drop highly correlated features
-        print("Drop highly correlated features")
-        # Select upper triangle of correlation matrix
-        upper = corr_matrix.where(np_.triu(np_.ones(corr_matrix.shape), k=1).astype(np_.bool))
-        # Find index of feature columns with correlation greater than 0.9
-        to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
-        # Drop features
-        drop_HCF_df = df_scalar.drop(df[to_drop], axis=1)
-
-        # Drop column with null variance
-        drop_HCF_df = drop_HCF_df.loc[:, drop_HCF_df.var() != 0.0]
-
-        # ADD the condition and duration
-        drop_HCF_df = pd_.concat((df[["condition", "duration"]], drop_HCF_df), axis=1)
-
-        if save_in:
-            drop_HCF_df.to_csv(f"{save_in}\\df_drop_highly_corr_feat.csv")
-            print(f"Selection of low correlated features in: {save_in}\\df_drop_highly_corr_feat.csv")
-
-    # Statistics for each features
-
-    if distribution:
-        for column in df_scalar.columns:
-            cond_col_df = pd_.concat((df_scalar[column], df_cond["condition"]), axis=1)
-
-            fig = pl_.figure(constrained_layout=False)
-            gs = fig.add_gridspec(ncols=2, nrows=1)
-            ax1 = fig.add_subplot(gs[0, 0])
-            ax2 = fig.add_subplot(gs[0, 1])
-
-            # Plot a histogram and kernel density estimate
-            print(f"Plot a histogram and kernel density estimate for feature {column}")
-            CHO = cond_col_df.loc[cond_col_df['condition'] == "CHO"]
-            DIO = cond_col_df.loc[cond_col_df['condition'] == "DIO"]
-
-            sb_.distplot(CHO[[column]], color="b", ax=ax1)
-            sb_.distplot(DIO[[column]], color="r", ax=ax1)
-
-            # Draw a boxplot
-            print(f"Plot a boxplot for feature {column}")
-            sb_.boxplot(data=df_cond, x="condition", y=column, hue="condition", palette=["b", "r"], ax=ax2)
-            ax1.set_title(f'{column} distribution{title}', fontsize=11)
-            ax2.set_title(f'{column} boxplot{title}', fontsize=11)
-
-            pl_.tight_layout()
-
-            if save_in is not None:
-                pl_.savefig(f"{save_in}\\feat_distrib_{column}.png")
-                pl_.close()
-
     if stat_test:
-
+        # Create dictionaries to store the statistics and p-values
         dict_ks = {}
         dict_wx = {}
         dict_ks_dur = {}
         dict_wx_dur = {}
 
         for column in df_scalar.columns:
-
+            # For each feature, test whether its distribution or median differs between the CHO and DIO conditions
             cond_col_df = pd_.concat((df_scalar[column], df_cond["condition"]), axis=1)
 
+            # Separate the conditions
             CHO = cond_col_df.loc[cond_col_df['condition'] == "CHO"]
             CHO = np_.asarray(CHO[column])
             DIO = cond_col_df.loc[cond_col_df['condition'] == "DIO"]
@@ -395,7 +420,25 @@ def FeaturesStatistics(df: pd_.DataFrame(),
             wx = si_.stats.ranksums(CHO, DIO)
             dict_wx[column] = wx
 
-            # For each duration between conditions
+            # Plot the p-values
+            fig = pl_.figure(constrained_layout=False, figsize=(15, 8))
+            gs = fig.add_gridspec(ncols=2, nrows=1)
+            ax1 = fig.add_subplot(gs[0, 0])
+            ax2 = fig.add_subplot(gs[0, 1])
+            #
+            ax1.set_title(f"Kolmogorov-Smirnov p-value btw conditions - {column}", fontsize=13, pad=20)
+            ax1.grid()
+            ax1.set_xlabel("Duration")
+            ax1.set_yscale("log")
+            ax1.set_ylabel("p-value")
+            #
+            ax2.set_title(f"Wilcoxon signed-rank p-value btw conditions - {column}", fontsize=13, pad=20)
+            ax2.grid()
+            ax2.set_xlabel("Duration")
+            ax2.set_yscale("log")
+            ax2.set_ylabel("p-value")
+
+            # Same tests as above, but computed separately for each duration
             for duration, values in df_groupby_dur:
                 if duration not in dict_ks_dur:
                     dict_ks_dur[duration] = {}
@@ -417,6 +460,18 @@ def FeaturesStatistics(df: pd_.DataFrame(),
                 wx2 = si_.stats.ranksums(CHO_, DIO_)
                 dict_wx_dur[duration][column] = wx2
 
+            durations = [f"{duration}" for duration, _ in df_groupby_dur]
+            ax1.plot(durations, [dict_ks_dur[duration][column][1] for duration, _ in df_groupby_dur], "g")
+            ax1.plot(durations, [dict_ks_dur[duration][column][1] for duration, _ in df_groupby_dur], "ko")
+            ax2.plot(durations, [dict_wx_dur[duration][column][1] for duration, _ in df_groupby_dur], "y")
+            ax2.plot(durations, [dict_wx_dur[duration][column][1] for duration, _ in df_groupby_dur], "ko")
+
+            pl_.tight_layout()
+
+            if save_in:
+                pl_.savefig(f"{save_in}\\p_values_duration_{column}")
+                pl_.close()
+
+        # Reformat the data to save it as csv
         df_ks = pd_.DataFrame.from_dict(data=dict_ks)
         df_ks = df_ks.rename(index={0: "Kolmogorov-Smirnov statistic", 1: "Kolmogorov-Smirnov p-value"})
         df_wx = pd_.DataFrame.from_dict(data=dict_wx)
@@ -430,13 +485,80 @@ def FeaturesStatistics(df: pd_.DataFrame(),
             df_wx_dur = pd_.concat((df_wx_dur, pd_.DataFrame.from_dict(data=dict_wx_dur[key])))
             df_wx_dur = df_wx_dur.rename(index={0: f"{key} - Wilcoxon statistic", 1: f"{key} - Wilcoxon p-value"})
 
-        stat_tests_df1 = pd_.concat((df_ks, df_wx))
-        stat_tests_df2 = pd_.concat((df_ks_dur, df_wx_dur))
+        stat_tests_df1 = pd_.concat((df_ks, df_wx))  # KS and Wx all durations
+        stat_tests_df2 = pd_.concat((df_ks_dur, df_wx_dur))  # KS and Wx for each duration
 
         if save_in:
             stat_tests_df1.to_csv(f"{save_in}\\stat_test_KS_wilcoxon_all.csv")
             stat_tests_df2.to_csv(f"{save_in}\\stat_test_KS_wilcoxon_for_each_duration.csv")
 
+    if drop_feat:
+        # Drop highly correlated features
+        print("Drop highly correlated features")
+        # Select upper triangle of correlation matrix
+        upper = corr_matrix.where(np_.triu(np_.ones(corr_matrix.shape), k=1).astype(bool))
+        # Find the feature columns with a correlation greater than 0.9
+        to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
+        # Drop these features
+        drop_HCF_df = df_scalar.drop(to_drop, axis=1)
+
+        # Drop column with null variance
+        drop_HCF_df = drop_HCF_df.loc[:, drop_HCF_df.var() != 0.0]
+
+        if stat_test:
+            # Drop non-significant features (distribution and median not significantly different btw CHO and DIO)
+            drop_HCF_df_6H = drop_HCF_df.copy()
+            # Based on all durations
+            to_drop = [column for column in drop_HCF_df.columns if (stat_tests_df1.loc["Kolmogorov-Smirnov p-value", column] > 1.0e-2) and (stat_tests_df1.loc["Wilcoxon p-value", column] > 1.0e-2)]
+            drop_HCF_df = drop_HCF_df.drop(to_drop, axis=1)
+
+            # Based on the 6H duration only
+            to_drop_6H = [column for column in drop_HCF_df_6H.columns if (stat_tests_df2.loc["6H - Kolmogorov-Smirnov p-value", column] > 1.0e-3) and (stat_tests_df2.loc["6H - Wilcoxon p-value", column] > 1.0e-3)]
+            drop_HCF_df_6H = drop_HCF_df_6H.drop(to_drop_6H, axis=1)
+
+            drop_HCF_df_6H = pd_.concat((df[["condition", "duration"]], drop_HCF_df_6H), axis=1)
+
+            if save_in:
+                drop_HCF_df_6H.to_csv(f"{save_in}\\df_drop_highly_corr_feat_6H.csv")
+                print(f"Selection of features with distribution and median of CHO vs DIO different in: {save_in}")
+
+        # Add the condition and duration
+        drop_HCF_df = pd_.concat((df[["condition", "duration"]], drop_HCF_df), axis=1)
+
+        if save_in:
+            drop_HCF_df.to_csv(f"{save_in}\\df_drop_highly_corr_feat.csv")
+            print(f"Selection of low correlated features in: {save_in}")
+
+    # Statistics for each feature
+    if distribution:
+        for column in df_scalar.columns:
+            cond_col_df = pd_.concat((df_scalar[column], df_cond["condition"]), axis=1)
+
+            fig = pl_.figure(constrained_layout=False)
+            gs = fig.add_gridspec(ncols=2, nrows=1)
+            ax1 = fig.add_subplot(gs[0, 0])
+            ax2 = fig.add_subplot(gs[0, 1])
+
+            # Plot a histogram and kernel density estimate
+            print(f"Plot a histogram and kernel density estimate for feature {column}")
+            CHO = cond_col_df.loc[cond_col_df['condition'] == "CHO"]
+            DIO = cond_col_df.loc[cond_col_df['condition'] == "DIO"]
+
+            sb_.distplot(CHO[[column]], color="b", ax=ax1)
+            sb_.distplot(DIO[[column]], color="r", ax=ax1)
+
+            # Draw a boxplot
+            print(f"Plot a boxplot for feature {column}")
+            sb_.boxplot(data=df_cond, x="condition", y=column, hue="condition", palette=["b", "r"], ax=ax2)
+            ax1.set_title(f'{column} distribution{title}', fontsize=11)
+            ax2.set_title(f'{column} boxplot{title}', fontsize=11)
+
+            pl_.tight_layout()
+
+            if save_in is not None:
+                pl_.savefig(f"{save_in}\\feat_distrib_{column}.png")
+                pl_.close()
+
     # Decision tree
     if decision_tree:
         # Test btw CHO and DIO
@@ -478,16 +600,17 @@ if __name__ == "__main__":
     # labeled_somas = np_.load("path.npy")
     # df = pd_.read_csv(".\combined_features.csv", index_col=0)
 
-    # path = sy_.argv[1]
-    path = "D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion"
+    ## Parameters
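+    # Expected usage (assumed): features_analysis.py <dir_containing_features.csv> <dir_where_to_save_outputs>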
+    path = sy_.argv[1]
+    save_in = sy_.argv[2]
 
-    df0 = pd_.read_csv(f"{path}\\all_features.csv",
+    ## DF cleaning
+    df0 = pd_.read_csv(f"{path}\\features.csv",
                       # index_col=0,
                       )
     df = df0.drop(["Unnamed: 0"], axis=1)
 
-    # Statistical analysis
-
+    ## Statistical analysis
     # For the moment drop the columns with non scalar values, and un-useful values
     # - TO BE CHANGED TODO (use distance metrics such as bhattacharyya coef, etc)
     df = df.drop(["soma uid",
@@ -500,133 +623,138 @@ if __name__ == "__main__":
     # -- PCA with all the features
     print("\nALL FEATURES\n")
     # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-    # all_pca = PCAOnDF(df,
-    #                   target="condition",
-    #                   targets=["CHO", "DIO"],
-    #                   colors=["b", "r"],
-    #                   save_name="all_features",
-    #                   )
-    #
-    # # Between the two conditions, for each duration (2 conditions, 3 durations)
-    # groupby_duration = df.groupby("duration")
-    # duration_pca = []
-    # for duration, values in groupby_duration:
-    #     ## duration: str, values: pd_.DataFrame()
-    #     pca = PCAOnDF(values,
-    #                   target="condition",
-    #                   targets=["CHO", "DIO"],
-    #                   colors=["b", "r"],
-    #                   save_name=f"{duration}_features",
-    #                   title=f" - {duration} Sample",
-    #                   plot_duration=False)
-    #     duration_pca.append(pca)
-    #
-    ## -- K-means with all the features (2 conditions)
-    #
-    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-    ## df_all = df.drop(["duration"], axis=1)
-    # kmeans = KmeansOnDF(df,
-    #                     target="condition",
-    #                     nb_clusters=(2,),
-    #                     elbow=False,
-    #                     intracluster_var=True,
-    #                     plot_bar=True,
-    #                     save_name="all_features",
-    #                     save_in="D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis",
-    #                     )
+    PCAOnDF(df,
+            target="condition",
+            targets=["CHO", "DIO"],
+            colors=["b", "r"],
+            save_name="all_features",
+            save_in=save_in,
+            three_D=True,
+            )
+
+    # Between the two conditions, for each duration (2 conditions, 3 durations)
+    groupby_duration = df.groupby("duration")
+    for duration, values in groupby_duration:
+        ## duration: str, values: pd_.DataFrame()
+        PCAOnDF(values,
+                target="condition",
+                targets=["CHO", "DIO"],
+                colors=["b", "r"],
+                save_name=f"{duration}_features",
+                save_in=save_in,
+                title=f" - {duration} Sample",
+                plot_duration=False,
+                three_D=True,
+                )
+
+    # -- K-means with all the features (2 conditions)
 
     # Test for multiple glial populations
-    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-    # kmeans = KmeansOnDF(df,
-    #                     target="condition",
-    #                     nb_clusters=(2,3,4,5),
-    #                     elbow=False,
-    #                     intracluster_var=True,# TODO store var
-    #                     plot_bar=True,
-    #                     save_name="all_features",
-    #                     save_in="D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis",
-    #                     )
-
-    # # Between the two conditions, for each duration (2 conditions, 3 durations)
-    # groupby_duration = df.groupby("duration")
-    # for duration, values in groupby_duration:
-    #     kmeans = KmeansOnDF(values,
-    #                         target="condition",
-    #                         nb_clusters=(2,),
-    #                         elbow=False,
-    #                         intracluster_var=True,
-    #                         plot_bar=True,
-    #                         save_name=f"{duration}_features",
-    #                         title=f" - {duration} Sample",
-    #                         duration=True,
-    #                         save_in="D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis",
-    #                         )
-
-    # ## -- Various plots to analyse the data and find discriminant features by statistical analysis
-    # FeaturesStatistics(df,
-    #                    save_in="D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis",
-    #                    describe=True,
-    #                    heatmap=True,
-    #                    distribution=True,
-    #                    drop_feat=True,
-    #                    stat_test=True,
-    #                    decision_tree=False,
-    #                    )
-    # ## TODO: Enter selected features here
-    # # selected_features = []
-    # # selected_df = df[selected_features]
-    # ## TODO Or use the csv with dropped features
-    # selected_df = pd_.read_csv(f"{path}\\Features_analysis\\df_drop_highly_corr_feat.csv")
-    # # if other columns need to be dropped:
-    # to_drop = ["Unnamed: 0", "min_curvature"]
-    # selected_df = selected_df.drop(to_drop, axis=1)
-    #
-    # # # -- PCA with all the features
-    # print("\nSELECTED FEATURES\n")
-    # # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-    # all_pca = PCAOnDF(df,
-    #                   target="condition",
-    #                   targets=["CHO", "DIO"],
-    #                   colors=["b", "r"],
-    #                   save_name="all_selected_features",
-    #                   )
-    #
-    # # # Between the two conditions, for each duration (2 conditions, 3 durations)
-    # groupby_duration = df.groupby("duration")
-    # duration_pca = []
-    # for duration, values in groupby_duration:
-    #     ## duration: str, values: pd_.DataFrame()
-    #     pca = PCAOnDF(values,
-    #                   target="condition",
-    #                   targets=["CHO", "DIO"],
-    #                   colors=["b", "r"],
-    #                   save_name=f"{duration}_selected_features",
-    #                   title=f" - {duration} Sample - selected features",
-    #                   plot_duration=False)
-    #     duration_pca.append(pca)
-    #
-    # # # -- K-means with all the features (2 conditions)
-    # #
-    # # # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-    # kmeans = KmeansOnDF(df,
-    #                     target="condition",
-    #                     nb_clusters=(2,),
-    #                     elbow=False,
-    #                     intracluster_var=True,
-    #                     plot_bar=True,
-    #                     save_name="all_selected_features"
-    #                     )
-    #
-    # # # Between the two conditions, for each duration (2 conditions, 3 durations)
-    # groupby_duration = df.groupby("duration")
-    # for duration, values in groupby_duration:
-    #     kmeans = KmeansOnDF(values,
-    #                         target="condition",
-    #                         nb_clusters=(2,),
-    #                         elbow=False,
-    #                         intracluster_var=True,
-    #                         plot_bar=True,
-    #                         save_name=f"{duration}_selected_features",
-    #                         title=f" - {duration} Sample - selected features",
-    #                         duration=True,
-    #                         )
+    # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
+    kmeans = KmeansOnDF(df,
+                        target="condition",
+                        nb_clusters=(2, 3, 4, 5),
+                        elbow=True,
+                        intracluster_var=True,
+                        plot_bar=True,
+                        save_name="all_features_multiple_pop",
+                        save_in=save_in,
+                        )
+
+    # Between the two conditions, for each duration (2 conditions, 3 durations)
+    groupby_duration = df.groupby("duration")
+    for duration, values in groupby_duration:
+        kmeans = KmeansOnDF(values,
+                            target="condition",
+                            nb_clusters=(2, 3, 4, 5),
+                            elbow=False,
+                            intracluster_var=True,
+                            plot_bar=True,
+                            save_name=f"{duration}_features_multiple_pop",
+                            title=f" - {duration} Sample",
+                            duration=True,
+                            save_in=save_in,
+                            )
+
+    # -- Various plots to analyse the data and find discriminant features by statistical analysis
+    print("\nFEATURE SELECTION\n")
+    FeaturesStatistics(df,
+                       save_in=save_in,
+                       describe=True,
+                       heatmap=True,
+                       distribution=True,
+                       stat_test=True,
+                       drop_feat=True,
+                       decision_tree=False,
+                       )
+    ## TODO: Enter selected features here
+    # selected_features = []
+    # selected_df = df[selected_features]
+    ## TODO Or use the csv with dropped features
+    selected_df = pd_.read_csv(f"{save_in}\\df_drop_highly_corr_feat.csv")
+    # selected_df = pd_.read_csv(f"{save_in}\\df_drop_highly_corr_feat_6H.csv")
+    ## If an error is raised, run only the part up to and including FeaturesStatistics, then run the remaining part.
+
+    # if other columns need to be dropped:
+    try:
+        to_drop = ["Unnamed: 0", "min_curvature"]
+        selected_df = selected_df.drop(to_drop, axis=1)
+    except KeyError:
+        selected_df = selected_df.drop(["Unnamed: 0"], axis=1)
+
+    # -- PCA with all the features
+    print("\nSELECTED FEATURES\n")
+    # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
+    PCAOnDF(selected_df,
+            target="condition",
+            targets=["CHO", "DIO"],
+            colors=["b", "r"],
+            save_name="all_selected_features",
+            save_in=save_in,
+            three_D=True,
+            )
+
+    # Between the two conditions, for each duration (2 conditions, 3 durations)
+    groupby_duration = selected_df.groupby("duration")
+    for duration, values in groupby_duration:
+        # duration: str, values: pd_.DataFrame()
+        PCAOnDF(values,
+                target="condition",
+                targets=["CHO", "DIO"],
+                colors=["b", "r"],
+                save_name=f"{duration}_selected_features",
+                save_in=save_in,
+                title=f" - {duration} Sample - selected features",
+                plot_duration=False,
+                three_D=True,
+                )
+
+    # -- K-means with all the features (2 conditions)
+
+    # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
+    kmeans = KmeansOnDF(selected_df,
+                        target="condition",
+                        nb_clusters=(2, 3, 4, 5),
+                        elbow=False,
+                        intracluster_var=True,
+                        plot_bar=True,
+                        save_name="all_selected_features",
+                        save_in=save_in,
+                        )
+
+    # Between the two conditions, for each duration (2 conditions, 3 durations)
+    groupby_duration = selected_df.groupby("duration")
+    for duration, values in groupby_duration:
+        kmeans = KmeansOnDF(values,
+                            target="condition",
+                            nb_clusters=(2, 3, 4, 5),
+                            elbow=False,
+                            intracluster_var=True,
+                            plot_bar=True,
+                            save_name=f"{duration}_selected_features",
+                            save_in=save_in,
+                            title=f" - {duration} Sample - selected features",
+                            duration=True,
+                            )
+
+    ## TODO: Random forests?
diff --git a/nutrimorph.py b/nutrimorph.py
index 5ed03a8..b90aa89 100644
--- a/nutrimorph.py
+++ b/nutrimorph.py
@@ -79,6 +79,7 @@ data_path = None
 channel = None
 save_images = None
 save_csv = None
+save_features_analysis = None
 dilatation_erosion = None
 size_voxel_in_micron = None
 statistical_analysis = None
@@ -662,7 +663,7 @@ if __name__ == '__main__':
 
         # Clustering with this df and module features_analysis.py
         if statistical_analysis:
-            os_.system(f"feature_analysis.py {save_csv}\\features.csv")
+            os_.system(f"features_analysis.py {save_csv} {save_features_analysis}")
 
     else:
         raise ImportError("Not a valid data path!")
diff --git a/parameters.ini b/parameters.ini
index 0c8b6bc..e8e1f17 100644
--- a/parameters.ini
+++ b/parameters.ini
@@ -62,6 +62,7 @@ save_images                :    \\path_where_to_save_MIP_and_graphs
 ; if None, no saving. Otherwise, path where to save images (MIP & graphs)
 save_csv                   :    \\path_where_to_save_features_csv
 ; where to save the csv. if None, by default it is in the .tif directory
+save_features_analysis     :    \\path_where_to_save_features_analysis_graphs_and_csv
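+; where to save the features analysis graphs and csv files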
 dilatation_erosion         :    True
 ; Choose whether to perform erosion/dilation on the extensions during the connexion process.
 ; Better results are achieved with it, and doing dilatation_erosion is theoretically more rigorous
diff --git a/search_parameters.py b/search_parameters.py
index 7fd20c0..be8df30 100644
--- a/search_parameters.py
+++ b/search_parameters.py
@@ -55,6 +55,7 @@ size_voxel_in_micron = IfNotFloat('Input', 'size_voxel_in_micron')
 dilatation_erosion = parameters.getboolean('Input', 'dilatation_erosion')
 save_images = parameters['Input']['save_images']
 save_csv = parameters['Input']['save_csv']
+save_features_analysis = parameters['Input']['save_features_analysis']
 statistical_analysis = parameters.getboolean('Input', 'statistical_analysis')
 
 # [Somas]
-- 
GitLab