Development of an algorithm to help with feature selection

e1cccbf8 · NADAL Morgane · 9046d6ce · e1cccbf8 · e1cccbf8
Commit e1cccbf8 authored 4 years ago by NADAL Morgane
--- a/features_analysis.py
+++ b/features_analysis.py
@@ -32,13 +32,14 @@
 import pandas as pd_
 import numpy as np_
 import matplotlib.pyplot as pl_
+import matplotlib.gridspec as gs_
 from sklearn.cluster import KMeans
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 import scipy as si_
 import seaborn as sb_
-import os
-import glob
+
+import sys as sy_
 from typing import List


@@ -195,29 +196,133 @@ def RepresentationOnImages(labeled_somas, kmeans, nb_cluster):
    pl_.close()


-def FeaturesStatistics(df):
+def FeaturesStatistics(df: pd_.DataFrame(),
+                       save_in: str = None,
+                       title: str = "",
+                       ):
    '''
    Return the statistics allowing the user to choose the most relevant features to feed ML algorithms for ex.
    '''
+    #
    # Overview of the basic stats on each columns of df
-    df.describe()
+    description = df.describe()
+    if save_in is not None:
+        description.to_csv(f"{save_in}\\df_stat_description.csv")
+
+    df_scalar = df.drop(["duration", "condition"], axis=1)
+    df_cond = df.drop(["duration"], axis=1)
+    df_groupby_cond = df.drop("duration", axis=1).groupby("condition")
+    df_groupby_dur = df.groupby("duration")
+    condition = pd_.DataFrame(df.loc[:, ["condition"]].values, columns=["condition"])

    # Overview of features distribution and correlation
-    sb_.pairplot(df, kind='reg')
-    sb_.pairplot(df, kind='reg', hue='Condition')

-    # Statistics for each features
-    for column in df.columns:
-        print(column)
-        hist = df[column].hist(bins=20)
-        pl_.title(f"{column}")
-        pl_.savefig(f"D:\\MorganeNadal\\M2 report\\kmeans24\\feat_distrib_{column}.png")
+    ## Warning: with too many features, seaborn raises an error when displaying the plots.
+    ## Even after splitting the data, it remains too computationally heavy.
+
+    # Plot a heat map of the correlation matrix between features
+    # Compute the correlation matrix
+    print("Heat map with correlation matrix btw features")
+    corr_matrix = df_scalar.corr().abs()
+    # Generate a mask for the upper triangle
+    mask = np_.triu(np_.ones_like(corr_matrix, dtype=np_.bool))
+    # Set up the figure
+    fig, ax = pl_.subplots(figsize=(13, 9))
+    # Generate a custom diverging colormap
+    cmap = sb_.diverging_palette(220, 10, as_cmap=True)
+    # Draw the heatmap with the mask and correct aspect ratio
+    sb_.heatmap(corr_matrix, mask=mask, cmap=cmap, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5}, xticklabels=False)
+    ax.set_title(f'Features correlation heat map{title}', fontsize=20)
+    if save_in is not None:
+        pl_.savefig(f"{save_in}\\Features correlation heat map{title}.png")
        pl_.close()

+    # Drop highly correlated features
+    print("Drop highly correlated features")
+    # Select upper triangle of correlation matrix
+    upper = corr_matrix.where(np_.triu(np_.ones(corr_matrix.shape), k=1).astype(np_.bool))
+    # Find index of feature columns with correlation greater than 0.9
+    to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
+    # Drop features
+    drop_HCF_df = df_scalar.drop(df[to_drop], axis=1)
+    if save_in:
+        drop_HCF_df.to_csv(f"{save_in}\\df_drop_highly_corr_feat.csv")
+        print(f"Selection of low correlated features in: {save_in}\\df_drop_highly_corr_feat.csv")
+
+    # Statistics for each feature
+    dict_ks = {}
+    dict_wx = {}
+    dict_ks_dur = {}
+    dict_wx_dur = {}
+
+    for column in df_scalar.columns:
+        cond_col_df = pd_.concat((df_scalar[column], df_cond["condition"]), axis=1)
+
+        fig = pl_.figure(constrained_layout=False)
+        gs = fig.add_gridspec(ncols=2, nrows=1)
+        ax1 = fig.add_subplot(gs[0, 0])
+        ax2 = fig.add_subplot(gs[0, 1])
+
+        # Plot a histogram and kernel density estimate
+        print(f"Plot a histogram and kernel density estimate for feature {column}")
+        CHO = cond_col_df.loc[cond_col_df['condition'] == "CHO"]
+        DIO = cond_col_df.loc[cond_col_df['condition'] == "DIO"]
+
+        sb_.distplot(CHO[[column]], color="b", ax=ax1)
+        sb_.distplot(DIO[[column]], color="r", ax=ax1)
+
+        # Draw a boxplot
+        print(f"Plot a boxplot for feature {column}")
+        sb_.boxplot(data=df_cond, x="condition", y=column, hue="condition", palette=["b", "r"], ax=ax2)
+        ax1.set_title(f'{column} distribution{title}', fontsize=11)
+        ax2.set_title(f'{column} boxplot{title}', fontsize=11)
+
+        pl_.tight_layout()
+
+        if save_in is not None:
+            pl_.savefig(f"{save_in}\\feat_distrib_{column}.png")
+            pl_.close()
+
+        # Compare distribution between conditions (goodness of fit)
+        print(f"Kolmogorov-Smirnov between conditions")
+        ks = si_.stats.kstest(CHO[[column]], DIO[[column]])
+        dict_ks[column] = ks
+
+        # Compare median between conditions
+        print(f"Wilcoxon signed-rank test between conditions")
+        wx = si_.stats.wilcoxon(CHO[[column]], DIO[[column]])
+        dict_wx[column] = wx
+
+        # Repeat the comparison between conditions for each duration
+        for duration, values in df_groupby_dur:
+            dict_ks_dur[duration] = {}
+            dict_wx_dur[duration] = {}
+
+            duration_df = values.drop(["duration"])
+            CHO_ = duration_df.loc[duration_df['condition'] == "CHO"]
+            DIO_ = duration_df.loc[duration_df['condition'] == "DIO"]
+
+            # Compare distribution
+            print(f"Kolmogorov-Smirnov between conditions")
+            ks2 = si_.stats.kstest(CHO[[column]], DIO[[column]])
+            dict_ks_dur[duration][column] = ks2
+
+            # Compare median
+            print(f"Wilcoxon signed-rank test between conditions")
+            wx2 = si_.stats.kstest(CHO[[column]], DIO[[column]])
+            dict_wx_dur[duration][column] = wx2
+
+        df_ks = pd_.DataFrame.from_dict()
+        df_wx = pd_.DataFrame.from_dict()
+        df_ks_dur = pd_.DataFrame.from_dict()
+        df_wx_dur = pd_.DataFrame.from_dict()
+
+        stat_tests_df = pd_.concatenate((df_ks, df_wx, df_ks_dur, df_wx_dur))
+

 if __name__ == "__main__":
    #
-    # os.chdir("D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion")
+    # os.chdir("path")

    ## If need to concatenate files:
    # all_filenames = [i for i in glob.glob('*.{}'.format("csv"))]
@@ -226,10 +331,13 @@ if __name__ == "__main__":
    # df.to_csv(".\combined_features.csv")

    ## If use labeled somas:
-    # labeled_somas = np_.load("D:\\MorganeNadal\\Results\\labeled_somas.npy")
+    # labeled_somas = np_.load("path.npy")
    # df = pd_.read_csv(".\combined_features.csv", index_col=0)

-    df0 = pd_.read_csv("D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\all_features.csv",
+    # path = sy_.argv[1]
+    path = "D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion"
+
+    df0 = pd_.read_csv(path,
                      # index_col=0,
                      )
    df = df0.drop(["Unnamed: 0"], axis=1)
@@ -295,9 +403,15 @@ if __name__ == "__main__":
    #                         title=f" - {duration} Sample",
    #                         )

-    ## -- Select Discriminant features by statistical analysis
-    # TODO filtered_df = SelectFeatures(concatenated_features_df)
+    ## -- Various plots to analyse the data and find discriminant features by statistical analysis
+    FeaturesStatistics(df,
+                       save_in="D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis",
+                       )
+    ## TODO: Enter selected features here
+    # selected_features = []
+    # selected_df = df[selected_features]

+    #
    # # -- PCA with selected features
    # # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
    # df_all = df.drop(["duration"], axis=1)

--- a/nutrimorph.py
+++ b/nutrimorph.py
@@ -660,79 +660,9 @@ if __name__ == '__main__':
        else:
            concatenated_features_df.to_csv(f"{save_csv}\\features.csv")

-        # --- TODO Clustering with this df and module features_analysis.py
-        # if statistical_analysis:
-        ## For the moment drop the columns with non scalar values, and un-useful values
-        ## - TO BE CHANGED (use distance metrics such as bhattacharyya coef, etc)
-        # df = concatenated_features_df.drop(["soma uid",
-        #                                     "spherical_angles_eva",
-        #                                     "spherical_angles_evb",
-        #                                     "hist_lengths",
-        #                                     "hist_lengths_P",
-        #                                     "hist_lengths_S",
-        #                                     "hist_curvature",
-        #                                     "hist_curvature_P",
-        #                                     "hist_curvature_S"],
-        #                                    axis=1)
-
-        ## -- PCA with all the features
-        ## Separating the features from their conditions and durations
-        # all_target = concatenated_features_df.loc[:, ['Condition']].values
-        # all_features_df = concatenated_features_df.drop(["Duration"], axis=1)
-
-        ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-        # TODO pca = fa_.PCAOnDF(concatenated_features_df)
-
-        ## Between the two conditions, for each duration (2 conditions, 3 durations)
-        # groupby_duration = concatenated_features_df.groupby("Duration")
-        # for duration, values in groupby_duration:
-        ##TODO find the condition to print it on PCA
-        # print(duration)
-        # duration_features_df = concatenated_features_df.drop(["Duration"], axis=1)
-        # pca = fa_.PCAOnDF(values)
-
-        ## -- K-means with all the features (2 conditions)
-
-        ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-        # kmeans = fa_.KmeansOnDF(concatenated_features_df, nb_clusters=(2,), elbow=True, intracluster_var=True)
-
-        ## Between the two conditions, for each duration (2 conditions, 3 durations)
-        # for duration, values in groupby_duration:
-        # kmeans = fa_.KmeansOnDF(values, nb_clusters=(2,), elbow=True, intracluster_var=True)
-
-        ## -- Select Discriminant features by statistical analysis
-        # TODO filtered_df = SelectFeatures(concatenated_features_df)
-
-        ## -- PCA with all the features with the cluster as label
-
-        ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-        # TODO pca = fa_.PCAOnDF(concatenated_features_df)
-
-        ## Between the two conditions, for each duration (2 conditions, 3 durations)
-        # for duration, values in groupby_duration:
-        # pca = fa_.PCAOnDF(values)
-
-        ## -- Select Discriminant features by statistical analysis
-        # TODO filtered_df = SelectFeatures(concatenated_features_df)
-
-        ## -- PCA with selected features
-
-        ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-        # TODO pca = fa_.PCAOnDF(filtered_df)
-
-        ## Between the two conditions, for each duration (2 conditions, 3 durations)
-        # filtered_groupby_duration = filtered_df.groupby("Duration")
-        # for duration, values in filtered_groupby_duration:
-        # pca = fa_.PCAOnDF(values)
-
-        ## -- K-means with selected features
-
-        ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
-        # filtered_kmeans = fa_.KmeansOnDF(filtered_df, nb_clusters=(2,), elbow=True, intracluster_var=True)
-
-        ## Between the two conditions, for each duration (2 conditions, 3 durations)
-        # for duration, values in filtered_groupby_duration:
-        # filtered_kmeans = fa_.KmeansOnDF(values, nb_clusters=(2,), elbow=True, intracluster_var=True)
+        # Clustering with this df and module features_analysis.py
+        if statistical_analysis:
+            os_.system(f"feature_analysis.py {save_csv}\\features.csv")

    else:
        raise ImportError("Not a valid data path!")