From 7b24a67690eba358c7662c6665975b53e9526f74 Mon Sep 17 00:00:00 2001 From: monadal <morgane.nadal@inria.fr> Date: Mon, 6 Jul 2020 18:18:25 +0200 Subject: [PATCH] improved features_analysis.py and add new parameter to nutrimorph.py --- features_analysis.py | 582 ++++++++++++++++++++++++++----------------- nutrimorph.py | 3 +- parameters.ini | 1 + search_parameters.py | 1 + 4 files changed, 359 insertions(+), 228 deletions(-) diff --git a/features_analysis.py b/features_analysis.py index ef7ef96..6c19640 100644 --- a/features_analysis.py +++ b/features_analysis.py @@ -51,10 +51,12 @@ def PCAOnDF(df: pd_.DataFrame(), save_name: str = None, title: str = "", plot_duration: bool = True, + save_in: str = None, + three_D: bool = False, ) -> list: ''' - Perform 2D PCA on the CHO-DIO dataframe. - Print ratio variance and plot the PCA. + Perform 2D or 3D PCA on the CHO-DIO dataframe. + Save explained variance ratio as csv, plot and save the PCAs. ''' # Separating the features from their conditions and durations all_target = pd_.DataFrame(df.loc[:, [target]].values, columns=[target]) @@ -68,18 +70,23 @@ def PCAOnDF(df: pd_.DataFrame(), # Create the PCA and fit the data pca = PCA(n_components=2) principal_components = pca.fit_transform(stand_df) + + # Printing and saving the explained variance ratio print(f"PCA explained variance ratio ({save_name}): ", pca.explained_variance_ratio_) + pca_exp_var_df = pd_.DataFrame(pca.explained_variance_ratio_, columns=["pca explained variance ration"]) + pca_exp_var_df = pca_exp_var_df.rename(index={0: "principal component 1", 1: "principal component 2"}) + pca_exp_var_df.to_csv(f"{save_in}\\pca_explained_variance_ratio_{save_name}.csv") + # Creating the principal component df principal_df = pd_.DataFrame(data=principal_components, columns=['principal component 1', 'principal component 2']) - # Give the final df containing the principal component and their condition final_df = pd_.concat([principal_df, all_target[target]], axis=1) - # Plot 
+ # Plot 2D fig = pl_.figure(figsize=(8, 8)) ax = fig.add_subplot(1, 1, 1) - ax.set_xlabel('Principal Component 1', fontsize=15) - ax.set_ylabel('Principal Component 2', fontsize=15) + ax.set_xlabel(f'Principal Component 1 - ratio = {round(pca.explained_variance_ratio_[0],2)}', fontsize=15) + ax.set_ylabel(f'Principal Component 2 - ratio = {round(pca.explained_variance_ratio_[1],2)}', fontsize=15) ax.set_title(f'2 component PCA{title}', fontsize=20) for tgt, color in zip(targets, colors): idx = final_df[target] == tgt @@ -89,30 +96,104 @@ def PCAOnDF(df: pd_.DataFrame(), , s=30) ax.legend(targets) ax.grid() + if save_name is not None: - pl_.savefig(f"D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis\\PCA_{save_name}.png") + pl_.savefig(f"{save_in}\\PCA_{save_name}.png") if plot_duration: + # Make separated plots for each duration of the experiments fig = pl_.figure(figsize=(8, 8)) ax = fig.add_subplot(1, 1, 1) - ax.set_xlabel('Principal Component 1', fontsize=15) - ax.set_ylabel('Principal Component 2', fontsize=15) + + ax.set_xlabel(f'Principal Component 1 - ratio = {round(pca.explained_variance_ratio_[0],2)}', fontsize=15) + ax.set_ylabel(f'Principal Component 2 - ratio = {round(pca.explained_variance_ratio_[1],2)}', fontsize=15) ax.set_title(f'2 component PCA{title}', fontsize=20) - targets = [["CHO", "1H"], ["CHO", "3H"], ["CHO", "6H"], ["DIO", "1H"], ["DIO", "3H"], ["DIO", "6H"]] + + new_tgts = [["CHO", "1H"], ["CHO", "3H"], ["CHO", "6H"], ["DIO", "1H"], ["DIO", "3H"], ["DIO", "6H"]] final_df = pd_.concat([df[["condition", "duration"]], principal_df], axis=1) - for tgt, color in zip(targets, ["lavender", "royalblue", "navy", "lightcoral", "firebrick", "red"]): + + for tgt, color in zip(new_tgts, ["lavender", "royalblue", "navy", "lightcoral", "firebrick", "red"]): new_df = final_df.loc[(final_df["condition"] == tgt[0]) & (final_df["duration"] == tgt[1])] new_df = new_df.drop(["condition", "duration"], axis=1) + ax.scatter(new_df['principal 
component 1'] , new_df['principal component 2'] , c=color , s=30) + + ax.legend(new_tgts) + ax.grid() + + if save_name is not None: + pl_.savefig(f"{save_in}\\PCA_duration_{save_name}.png") + + if three_D: + # Create the 3D PCA and fit the data + pca3d = PCA(n_components=3) + principal_components3d = pca3d.fit_transform(stand_df) + + # Print and Save the explained variance ratio + print(f"3D PCA explained variance ratio ({save_name}): ", pca3d.explained_variance_ratio_) + pca3d_exp_var_df = pd_.DataFrame(pca3d.explained_variance_ratio_, columns=["3d pca explained variance ration"]) + pca3d_exp_var_df = pca3d_exp_var_df.rename(index={0: "principal component 1", 1: "principal component 2", 2: "principal component 3"}) + pca3d_exp_var_df.to_csv(f"{save_in}\\three_d_pca_explained_variance_ratio_{save_name}.csv") + + # Creating the principal component df + principal3d_df = pd_.DataFrame(data=principal_components3d, + columns=['principal component 1', 'principal component 2', 'principal component 3']) + # Give the final df containing the principal component and their condition + final3d_df = pd_.concat([principal3d_df, all_target[target]], axis=1) + + # Plot + fig = pl_.figure(figsize=(8, 8)) + ax = fig.add_subplot(111, projection='3d') + + ax.set_xlabel(f'Principal Component 1 - ratio = {round(pca3d.explained_variance_ratio_[0],2)}', fontsize=15) + ax.set_ylabel(f'Principal Component 2 - ratio = {round(pca3d.explained_variance_ratio_[1],2)}', fontsize=15) + ax.set_zlabel(f'Principal Component 3 - ratio = {round(pca3d.explained_variance_ratio_[2],2)}', fontsize=15) + ax.set_title(f'3 component PCA{title}', fontsize=20) + + for tgt, color in zip(targets, colors): + idx = final3d_df[target] == tgt + + ax.scatter(final3d_df.loc[idx, 'principal component 1'] + , final3d_df.loc[idx, 'principal component 2'] + , final3d_df.loc[idx, 'principal component 3'] + , c=color + , s=30) + ax.legend(targets) ax.grid() + if save_name is not None: - 
pl_.savefig(f"D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis\\PCA_duration_{save_name}.png") + pl_.savefig(f"{save_in}\\three_d_PCA_{save_name}.png") + + if plot_duration: + fig = pl_.figure(figsize=(8, 8)) + ax = fig.add_subplot(111, projection="3d") - return pca.explained_variance_ratio_ + ax.set_xlabel(f'Principal Component 1 - ratio = {round(pca3d.explained_variance_ratio_[0], 2)}', fontsize=15) + ax.set_ylabel(f'Principal Component 2 - ratio = {round(pca3d.explained_variance_ratio_[1],2)}', fontsize=15) + ax.set_zlabel(f'Principal Component 3 - ratio = {round(pca3d.explained_variance_ratio_[2],2)}', fontsize=15) + ax.set_title(f'3 component PCA{title}', fontsize=20) + + final3d_df = pd_.concat([df[["condition", "duration"]], principal3d_df], axis=1) + + for tgt, color in zip(new_tgts, ["lavender", "royalblue", "navy", "lightcoral", "firebrick", "red"]): + new3d_df = final3d_df.loc[(final_df["condition"] == tgt[0]) & (final3d_df["duration"] == tgt[1])] + new3d_df = new3d_df.drop(["condition", "duration"], axis=1) + + ax.scatter(new3d_df['principal component 1'] + , new3d_df['principal component 2'] + , new3d_df['principal component 3'] + , c=color + , s=30) + + ax.legend(new_tgts) + ax.grid() + + if save_name is not None: + pl_.savefig(f"{save_in}\\three_d_PCA_duration_{save_name}.png") def KmeansOnDF(df: pd_.DataFrame(), @@ -131,7 +212,7 @@ def KmeansOnDF(df: pd_.DataFrame(), ''' Perform kmeans on the pandas dataframe. Can find the best number of cluster with elbow method, find the intracluster variance, and represent the result on the initial images. - Returns kmeans. + Plot barplots with percentage of each label for each condition. 
''' # Separating the features from their conditions and durations all_target = pd_.DataFrame(df.loc[:, [target]].values, columns=[target]) @@ -148,6 +229,7 @@ def KmeansOnDF(df: pd_.DataFrame(), kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0) kmeans.fit(stand_df) wcss.append(kmeans.inertia_) + pl_.figure() pl_.plot(range(1, 24), wcss) pl_.plot(range(1, 24), wcss, 'bo') pl_.title('Elbow Method') @@ -167,10 +249,10 @@ def KmeansOnDF(df: pd_.DataFrame(), if intracluster_var: var_df = pd_.DataFrame(stand_df) var = IntraClusterVariance(var_df, kmeans, nb_cluster, save_name) - # TODO save csv - var_df = pd_.DataFrame(var, columns=["PCA explained variance ratio"]) + var_df = pd_.DataFrame(var, columns=["intracluster variance"]) + var_df = var_df.rename({cluster: f"label {cluster}" for cluster in range(nb_cluster)}) if save_in: - var_df.to_csv(f"{save_in}\\pca_var_explained_ratio_{save_name}.csv") + var_df.to_csv(f"{save_in}\\intracluster_variance_kmeans_{save_name}_k={nb_cluster}.csv") # TODO Intersection of ellipsoids of the covariance matrices # - ellipsoid equation: (x-mean)(1/Mcov)(x-mean).T <= 1 @@ -182,13 +264,12 @@ def KmeansOnDF(df: pd_.DataFrame(), ax = fig.add_subplot(1, 1, 1) (lab_cond_df.groupby("condition")["label"].value_counts(normalize=True).mul(100).rename( "Percentage (%)").reset_index().pipe((sb_.barplot, "data"), x="condition", y="Percentage (%)", - hue="label", palette=sb_.color_palette("deep", n_colors=2))) - # sb_.countplot(x="condition", hue="label", data=lab_cond_df, palette=sb_.color_palette("deep", n_colors=2)) + hue="label", palette=sb_.color_palette("deep", n_colors=nb_cluster))) ax.set_ylim(0, 100) - ax.set_title(f'Distribution of the clustering labels according to conditions{title}', fontsize=11) + ax.set_title(f'Distribution of the clustering labels according to conditions{title}_k={nb_cluster}', fontsize=11) ax.grid() if save_name is not None: - 
pl_.savefig(f"{save_in}\\Hist_Clustering_{save_name}.png") + pl_.savefig(f"{save_in}\\Hist_Clustering_{save_name}_k={nb_cluster}.png") # pl_.show(block=True) # pl_.close() else: @@ -196,14 +277,12 @@ def KmeansOnDF(df: pd_.DataFrame(), ax= fig.add_subplot(1, 1, 1) (lab_cond_df.groupby("condition")["label"].value_counts(normalize=True).mul(100).rename( "Percentage (%)").reset_index().pipe((sb_.barplot, "data"), x="condition", y="Percentage (%)", - hue="label", palette=sb_.color_palette("deep", n_colors=2))) - - # sb_.countplot(x="condition", hue="label", data=lab_cond_df, palette=sb_.color_palette("deep", n_colors=2)) + hue="label", palette=sb_.color_palette("deep", n_colors=nb_cluster))) ax.set_ylim(0, 100) ax.grid() - ax.set_title(f'Distribution of the clustering labels according to conditions{title}', fontsize=11) + ax.set_title(f'Distribution of the clustering labels according to conditions{title}_k={nb_cluster}', fontsize=11) if save_name is not None: - pl_.savefig(f"{save_in}\\Hist_Clustering_{save_name}.png") + pl_.savefig(f"{save_in}\\Hist_Clustering_{save_name}_k={nb_cluster}.png") # pl_.show(block=True) # pl_.close() lab_cond_df2 = pd_.concat((lab_cond_df, df[["duration"]]), axis=1) @@ -213,18 +292,17 @@ def KmeansOnDF(df: pd_.DataFrame(), ax = fig.add_subplot(1, 1, 1) (dur_df.groupby("condition")["label"].value_counts(normalize=True).mul(100).rename( "Percentage (%)").reset_index().pipe((sb_.barplot, "data"), x="condition", y="Percentage (%)", - hue="label", palette=sb_.color_palette("deep", n_colors=2))) + hue="label", palette=sb_.color_palette("deep", n_colors=nb_cluster))) ax.set_ylim(0, 100) ax.grid() - ax.set_title(f'Distribution of the clustering labels according to conditions{title} - duration = {dur}', fontsize=11) + ax.set_title(f'Distribution of the clustering labels according to conditions{title}_k={nb_cluster} - duration = {dur}', fontsize=11) if save_name is not None: - pl_.savefig(f"{save_in}\\Hist_Clustering_{save_name}_dur_{dur}.png") + 
pl_.savefig(f"{save_in}\\Hist_Clustering_{save_name}_dur_{dur}_k={nb_cluster}.png") # pl_.show(block=True) # pl_.close() - # sb_.catplot("condition", col="duration", kind="count", hue="label", data=pd_.concat((lab_cond_df, df[["duration"]]), axis=1), palette=sb_.color_palette("deep", n_colors=2)) if save_name is not None: - pl_.savefig(f"{save_in}\\Hist_duration_Clustering_{save_name}.png") + pl_.savefig(f"{save_in}\\Hist_duration_Clustering_{save_name}_k={nb_cluster}.png") print(f"Saved in {save_in}") # pl_.show(block=True) # pl_.close() @@ -282,6 +360,9 @@ def FeaturesStatistics(df: pd_.DataFrame(), ): ''' Return the statistics allowing the user to choose the most relevant features to feed ML algorithms for ex. + Statistical description, correlation heatmap, dropping correlated features or features with little difference between the two conditions, + Plotting features distribution and boxplots, performing Kolmogorov-Smirnov two-sample test and Wilcoxon signed-rank two-sample test. + Can draw a decision tree. ''' # # Overview of the basic stats on each columns of df @@ -290,16 +371,10 @@ def FeaturesStatistics(df: pd_.DataFrame(), if save_in is not None: description.to_csv(f"{save_in}\\df_stat_description.csv") + # Data formatting df_scalar = df.drop(["duration", "condition"], axis=1) df_cond = df.drop(["duration"], axis=1) - # df_groupby_cond = df.drop("duration", axis=1).groupby("condition") df_groupby_dur = df.groupby("duration") - # condition = pd_.DataFrame(df.loc[:, ["condition"]].values, columns=["condition"]) - - # Overview of features distribution and correlation - - ## /!\ if too many features, error in seaborn display - ## Even by splitting the data : too intense! 
# Compute the correlation matrix corr_matrix = df_scalar.corr().abs() @@ -320,68 +395,18 @@ def FeaturesStatistics(df: pd_.DataFrame(), pl_.savefig(f"{save_in}\\Features correlation heat map{title}.png") pl_.close() - if drop_feat: - # Drop highly correlated features - print("Drop highly correlated features") - # Select upper triangle of correlation matrix - upper = corr_matrix.where(np_.triu(np_.ones(corr_matrix.shape), k=1).astype(np_.bool)) - # Find index of feature columns with correlation greater than 0.9 - to_drop = [column for column in upper.columns if any(upper[column] > 0.9)] - # Drop features - drop_HCF_df = df_scalar.drop(df[to_drop], axis=1) - - # Drop column with null variance - drop_HCF_df = drop_HCF_df.loc[:, drop_HCF_df.var() != 0.0] - - # ADD the condition and duration - drop_HCF_df = pd_.concat((df[["condition", "duration"]], drop_HCF_df), axis=1) - - if save_in: - drop_HCF_df.to_csv(f"{save_in}\\df_drop_highly_corr_feat.csv") - print(f"Selection of low correlated features in: {save_in}\\df_drop_highly_corr_feat.csv") - - # Statistics for each features - - if distribution: - for column in df_scalar.columns: - cond_col_df = pd_.concat((df_scalar[column], df_cond["condition"]), axis=1) - - fig = pl_.figure(constrained_layout=False) - gs = fig.add_gridspec(ncols=2, nrows=1) - ax1 = fig.add_subplot(gs[0, 0]) - ax2 = fig.add_subplot(gs[0, 1]) - - # Plot a histogram and kernel density estimate - print(f"Plot a histogram and kernel density estimate for feature {column}") - CHO = cond_col_df.loc[cond_col_df['condition'] == "CHO"] - DIO = cond_col_df.loc[cond_col_df['condition'] == "DIO"] - - sb_.distplot(CHO[[column]], color="b", ax=ax1) - sb_.distplot(DIO[[column]], color="r", ax=ax1) - - # Draw a boxplot - print(f"Plot a boxplot for feature {column}") - sb_.boxplot(data=df_cond, x="condition", y=column, hue="condition", palette=["b", "r"], ax=ax2) - ax1.set_title(f'{column} distribution{title}', fontsize=11) - ax2.set_title(f'{column} boxplot{title}', 
fontsize=11) - - pl_.tight_layout() - - if save_in is not None: - pl_.savefig(f"{save_in}\\feat_distrib_{column}.png") - pl_.close() - if stat_test: - + # Create dictionaries to store the statistics and p-values dict_ks = {} dict_wx = {} dict_ks_dur = {} dict_wx_dur = {} for column in df_scalar.columns: - + # For each feature, perform a statistical test to know if the feature distribution or median is different btw CHO and DIO conditions cond_col_df = pd_.concat((df_scalar[column], df_cond["condition"]), axis=1) + # Separate the conditions CHO = cond_col_df.loc[cond_col_df['condition'] == "CHO"] CHO = np_.asarray(CHO[column]) DIO = cond_col_df.loc[cond_col_df['condition'] == "DIO"] @@ -395,7 +420,25 @@ def FeaturesStatistics(df: pd_.DataFrame(), wx = si_.stats.ranksums(CHO, DIO) dict_wx[column] = wx - # For each duration between conditions + # Plot the p-values + fig = pl_.figure(constrained_layout=False, figsize=(15,8)) + gs = fig.add_gridspec(ncols=2, nrows=1) + ax1 = fig.add_subplot(gs[0, 0]) + ax2 = fig.add_subplot(gs[0, 1]) + # + ax1.set_title(f"Kolmogorov-Smirnov p-value btw conditions - {column}", fontsize=13, pad=20) + ax1.grid() + ax1.set_xlabel("Duration") + ax1.set_yscale("log") + ax1.set_ylabel("p-value") + # + ax2.set_title(f"Wilcoxon signed-rank p-value btw conditions - {column}", fontsize=13, pad=20) + ax2.grid() + ax2.set_xlabel("Duration") + ax2.set_yscale("log") + ax2.set_ylabel("p-value") + + # Do the same thing than above but separate durations between conditions for duration, values in df_groupby_dur: if duration not in dict_ks_dur: dict_ks_dur[duration] = {} @@ -417,6 +460,18 @@ def FeaturesStatistics(df: pd_.DataFrame(), wx2 = si_.stats.ranksums(CHO_, DIO_) dict_wx_dur[duration][column] = wx2 + ax1.plot([f"{duration}" for duration, _ in df_groupby_dur], [dict_ks_dur[duration][column][1] for duration, _ in df_groupby_dur], "g") + ax1.plot([f"{duration}" for duration, _ in df_groupby_dur], [dict_ks_dur[duration][column][1] for duration, _ in 
df_groupby_dur], 'ko') + ax2.plot([f"{duration}" for duration, _ in df_groupby_dur], [dict_wx_dur[duration][column][1] for duration, _ in df_groupby_dur], "y") + ax2.plot([f"{duration}" for duration, _ in df_groupby_dur], [dict_wx_dur[duration][column][1] for duration, _ in df_groupby_dur], 'ko') + + pl_.tight_layout() + + if save_in: + pl_.savefig(f"{save_in}\\p_values_duration_{column}") + pl_.close() + + # Reformat the data to save it as csv df_ks = pd_.DataFrame.from_dict(data=dict_ks) df_ks = df_ks.rename(index={0: "Kolmogorov-Smirnov statistic", 1: "Kolmogorov-Smirnov p-value"}) df_wx = pd_.DataFrame.from_dict(data=dict_wx) @@ -430,13 +485,80 @@ def FeaturesStatistics(df: pd_.DataFrame(), df_wx_dur = pd_.concat((df_wx_dur, pd_.DataFrame.from_dict(data=dict_wx_dur[key]))) df_wx_dur = df_wx_dur.rename(index={0: f"{key} - Wilcoxon statistic", 1: f"{key} - Wilcoxon p-value"}) - stat_tests_df1 = pd_.concat((df_ks, df_wx)) - stat_tests_df2 = pd_.concat((df_ks_dur, df_wx_dur)) + stat_tests_df1 = pd_.concat((df_ks, df_wx)) # KS and Wx all durations + stat_tests_df2 = pd_.concat((df_ks_dur, df_wx_dur)) # KS and Wx for each duration if save_in: stat_tests_df1.to_csv(f"{save_in}\\stat_test_KS_wilcoxon_all.csv") stat_tests_df2.to_csv(f"{save_in}\\stat_test_KS_wilcoxon_for_each_duration.csv") + if drop_feat: + # Drop highly correlated features + print("Drop highly correlated features") + # Select upper triangle of correlation matrix + upper = corr_matrix.where(np_.triu(np_.ones(corr_matrix.shape), k=1).astype(np_.bool)) + # Find index of feature columns with correlation greater than 0.9 + to_drop = [column for column in upper.columns if any(upper[column] > 0.9)] + # Drop features + drop_HCF_df = df_scalar.drop(df[to_drop], axis=1) + + # Drop column with null variance + drop_HCF_df = drop_HCF_df.loc[:, drop_HCF_df.var() != 0.0] + + if stat_test: + # drop non significant features (distribution and mean not different btw CHO and DIO) + drop_HCF_df_6H = drop_HCF_df.copy() + # 
based on all durations; + to_drop =[column for column in drop_HCF_df.columns if (stat_tests_df1.loc["Kolmogorov-Smirnov p-value", column] > 1.0e-2) and (stat_tests_df1.loc["Wilcoxon p-value", column] > 1.0e-2)] + drop_HCF_df = drop_HCF_df.drop(drop_HCF_df[to_drop], axis=1) + + # only based on 6H duration; + to_drop_6H =[column for column in drop_HCF_df_6H.columns if (stat_tests_df2.loc["6H - Kolmogorov-Smirnov p-value", column] > 1.0e-3) and (stat_tests_df2.loc["6H - Wilcoxon p-value", column] > 1.0e-3)] + drop_HCF_df_6H = drop_HCF_df_6H.drop(drop_HCF_df_6H[to_drop_6H], axis=1) + + drop_HCF_df_6H = pd_.concat((df[["condition", "duration"]], drop_HCF_df_6H), axis=1) + + if save_in: + drop_HCF_df_6H.to_csv(f"{save_in}\\df_drop_highly_corr_feat_6H.csv") + print(f"Selection of features with distribution and median of CHO vs DIO different in: {save_in}") + + # Add the condition and duration + drop_HCF_df = pd_.concat((df[["condition", "duration"]], drop_HCF_df), axis=1) + + if save_in: + drop_HCF_df.to_csv(f"{save_in}\\df_drop_highly_corr_feat.csv") + print(f"Selection of low correlated features in: {save_in}") + + # Statistics for each features + if distribution: + for column in df_scalar.columns: + cond_col_df = pd_.concat((df_scalar[column], df_cond["condition"]), axis=1) + + fig = pl_.figure(constrained_layout=False) + gs = fig.add_gridspec(ncols=2, nrows=1) + ax1 = fig.add_subplot(gs[0, 0]) + ax2 = fig.add_subplot(gs[0, 1]) + + # Plot a histogram and kernel density estimate + print(f"Plot a histogram and kernel density estimate for feature {column}") + CHO = cond_col_df.loc[cond_col_df['condition'] == "CHO"] + DIO = cond_col_df.loc[cond_col_df['condition'] == "DIO"] + + sb_.distplot(CHO[[column]], color="b", ax=ax1) + sb_.distplot(DIO[[column]], color="r", ax=ax1) + + # Draw a boxplot + print(f"Plot a boxplot for feature {column}") + sb_.boxplot(data=df_cond, x="condition", y=column, hue="condition", palette=["b", "r"], ax=ax2) + ax1.set_title(f'{column} 
distribution{title}', fontsize=11) + ax2.set_title(f'{column} boxplot{title}', fontsize=11) + + pl_.tight_layout() + + if save_in is not None: + pl_.savefig(f"{save_in}\\feat_distrib_{column}.png") + pl_.close() + # Decision tree if decision_tree: # Test btw CHO and DIO @@ -478,16 +600,17 @@ if __name__ == "__main__": # labeled_somas = np_.load("path.npy") # df = pd_.read_csv(".\combined_features.csv", index_col=0) - # path = sy_.argv[1] - path = "D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion" + ## Parameters + path = sy_.argv[1] + save_in = sy_.argv[2] - df0 = pd_.read_csv(f"{path}\\all_features.csv", + ## DF cleaning + df0 = pd_.read_csv(f"{path}\\features.csv", # index_col=0, ) df = df0.drop(["Unnamed: 0"], axis=1) - # Statistical analysis - + ## Statistical analysis # For the moment drop the columns with non scalar values, and un-useful values # - TO BE CHANGED TODO (use distance metrics such as bhattacharyya coef, etc) df = df.drop(["soma uid", @@ -500,133 +623,138 @@ if __name__ == "__main__": # -- PCA with all the features print("\nALL FEATURES\n") # Between the two conditions, regardless the duration of experiment (2 conditions, all durations) - # all_pca = PCAOnDF(df, - # target="condition", - # targets=["CHO", "DIO"], - # colors=["b", "r"], - # save_name="all_features", - # ) - # - # # Between the two conditions, for each duration (2 conditions, 3 durations) - # groupby_duration = df.groupby("duration") - # duration_pca = [] - # for duration, values in groupby_duration: - # ## duration: str, values: pd_.DataFrame() - # pca = PCAOnDF(values, - # target="condition", - # targets=["CHO", "DIO"], - # colors=["b", "r"], - # save_name=f"{duration}_features", - # title=f" - {duration} Sample", - # plot_duration=False) - # duration_pca.append(pca) - # - ## -- K-means with all the features (2 conditions) - # - ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations) - ## df_all = df.drop(["duration"], axis=1) - # kmeans = 
KmeansOnDF(df, - # target="condition", - # nb_clusters=(2,), - # elbow=False, - # intracluster_var=True, - # plot_bar=True, - # save_name="all_features", - # save_in="D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis", - # ) + PCAOnDF(df, + target="condition", + targets=["CHO", "DIO"], + colors=["b", "r"], + save_name="all_features", + save_in=save_in, + three_D=True, + ) + + # Between the two conditions, for each duration (2 conditions, 3 durations) + groupby_duration = df.groupby("duration") + for duration, values in groupby_duration: + ## duration: str, values: pd_.DataFrame() + PCAOnDF(values, + target="condition", + targets=["CHO", "DIO"], + colors=["b", "r"], + save_name=f"{duration}_features", + save_in=save_in, + title=f" - {duration} Sample", + plot_duration=False, + three_D=True, + ) + + # -- K-means with all the features (2 conditions) # Test for multiple glial populations - ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations) - # kmeans = KmeansOnDF(df, - # target="condition", - # nb_clusters=(2,3,4,5), - # elbow=False, - # intracluster_var=True,# TODO store var - # plot_bar=True, - # save_name="all_features", - # save_in="D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis", - # ) - - # # Between the two conditions, for each duration (2 conditions, 3 durations) - # groupby_duration = df.groupby("duration") - # for duration, values in groupby_duration: - # kmeans = KmeansOnDF(values, - # target="condition", - # nb_clusters=(2,), - # elbow=False, - # intracluster_var=True, - # plot_bar=True, - # save_name=f"{duration}_features", - # title=f" - {duration} Sample", - # duration=True, - # save_in="D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis", - # ) - - # ## -- Various plots to analyse the data and find discriminant features by statistical analysis - # FeaturesStatistics(df, - # save_in="D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis", - # 
describe=True, - # heatmap=True, - # distribution=True, - # drop_feat=True, - # stat_test=True, - # decision_tree=False, - # ) - # ## TODO: Enter selected features here - # # selected_features = [] - # # selected_df = df[selected_features] - # ## TODO Or use the csv with dropped features - # selected_df = pd_.read_csv(f"{path}\\Features_analysis\\df_drop_highly_corr_feat.csv") - # # if other columns need to be dropped: - # to_drop = ["Unnamed: 0", "min_curvature"] - # selected_df = selected_df.drop(to_drop, axis=1) - # - # # # -- PCA with all the features - # print("\nSELECTED FEATURES\n") - # # Between the two conditions, regardless the duration of experiment (2 conditions, all durations) - # all_pca = PCAOnDF(df, - # target="condition", - # targets=["CHO", "DIO"], - # colors=["b", "r"], - # save_name="all_selected_features", - # ) - # - # # # Between the two conditions, for each duration (2 conditions, 3 durations) - # groupby_duration = df.groupby("duration") - # duration_pca = [] - # for duration, values in groupby_duration: - # ## duration: str, values: pd_.DataFrame() - # pca = PCAOnDF(values, - # target="condition", - # targets=["CHO", "DIO"], - # colors=["b", "r"], - # save_name=f"{duration}_selected_features", - # title=f" - {duration} Sample - selected features", - # plot_duration=False) - # duration_pca.append(pca) - # - # # # -- K-means with all the features (2 conditions) - # # - # # # Between the two conditions, regardless the duration of experiment (2 conditions, all durations) - # kmeans = KmeansOnDF(df, - # target="condition", - # nb_clusters=(2,), - # elbow=False, - # intracluster_var=True, - # plot_bar=True, - # save_name="all_selected_features" - # ) - # - # # # Between the two conditions, for each duration (2 conditions, 3 durations) - # groupby_duration = df.groupby("duration") - # for duration, values in groupby_duration: - # kmeans = KmeansOnDF(values, - # target="condition", - # nb_clusters=(2,), - # elbow=False, - # intracluster_var=True, 
- # plot_bar=True, - # save_name=f"{duration}_selected_features", - # title=f" - {duration} Sample - selected features", - # duration=True, - # ) + # Between the two conditions, regardless the duration of experiment (2 conditions, all durations) + kmeans = KmeansOnDF(df, + target="condition", + nb_clusters=(2, 3, 4, 5), + elbow=True, + intracluster_var=True, + plot_bar=True, + save_name="all_features_multiple_pop", + save_in=save_in, + ) + + # Between the two conditions, for each duration (2 conditions, 3 durations) + groupby_duration = df.groupby("duration") + for duration, values in groupby_duration: + kmeans = KmeansOnDF(values, + target="condition", + nb_clusters=(2, 3, 4, 5), + elbow=False, + intracluster_var=True, + plot_bar=True, + save_name=f"{duration}_features_multiple_pop", + title=f" - {duration} Sample", + duration=True, + save_in=save_in, + ) + + # -- Various plots to analyse the data and find discriminant features by statistical analysis + print("\nFEATURE SELECTION\n") + FeaturesStatistics(df, + save_in=save_in, + describe=True, + heatmap=True, + distribution=True, + stat_test=True, + drop_feat=True, + decision_tree=False, + ) + ## TODO: Enter selected features here + # selected_features = [] + # selected_df = df[selected_features] + ## TODO Or use the csv with dropped features + selected_df = pd_.read_csv(f"{save_in}\\df_drop_highly_corr_feat.csv") + # selected_df = pd_.read_csv(f"{save_in}\\df_drop_highly_corr_feat_6H.csv") + ## If an error raises, only run the part until FeaturesStatistics included, and then run the last part. 
+ + # if other columns need to be dropped: + try: + to_drop = ["Unnamed: 0", "min_curvature"] + selected_df = selected_df.drop(to_drop, axis=1) + except: + selected_df = selected_df.drop(["Unnamed: 0"], axis=1) + + # -- PCA with all the features + print("\nSELECTED FEATURES\n") + # Between the two conditions, regardless the duration of experiment (2 conditions, all durations) + PCAOnDF(df, + target="condition", + targets=["CHO", "DIO"], + colors=["b", "r"], + save_name="all_selected_features", + save_in=save_in, + three_D=True, + ) + + # Between the two conditions, for each duration (2 conditions, 3 durations) + groupby_duration = df.groupby("duration") + for duration, values in groupby_duration: + # duration: str, values: pd_.DataFrame() + PCAOnDF(values, + target="condition", + targets=["CHO", "DIO"], + colors=["b", "r"], + save_name=f"{duration}_selected_features", + save_in=save_in, + title=f" - {duration} Sample - selected features", + plot_duration=False, + three_D=True, + ) + + # -- K-means with all the features (2 conditions) + + # Between the two conditions, regardless the duration of experiment (2 conditions, all durations) + kmeans = KmeansOnDF(df, + target="condition", + nb_clusters=(2,3,4,5), + elbow=False, + intracluster_var=True, + plot_bar=True, + save_name="all_selected_features", + save_in=save_in, + ) + + # Between the two conditions, for each duration (2 conditions, 3 durations) + groupby_duration = df.groupby("duration") + for duration, values in groupby_duration: + kmeans = KmeansOnDF(values, + target="condition", + nb_clusters=(2,3,4,5), + elbow=False, + intracluster_var=True, + plot_bar=True, + save_name=f"{duration}_selected_features", + save_in=save_in, + title=f" - {duration} Sample - selected features", + duration=True, + ) + + ## TODO: Random forests ? 
diff --git a/nutrimorph.py b/nutrimorph.py index 5ed03a8..b90aa89 100644 --- a/nutrimorph.py +++ b/nutrimorph.py @@ -79,6 +79,7 @@ data_path = None channel = None save_images = None save_csv = None +save_features_analysis = None dilatation_erosion = None size_voxel_in_micron = None statistical_analysis = None @@ -662,7 +663,7 @@ if __name__ == '__main__': # Clustering with this df and module features_analysis.py if statistical_analysis: - os_.system(f"feature_analysis.py {save_csv}\\features.csv") + os_.system(f"features_analysis.py {save_csv}\\features.csv {save_features_analysis}") else: raise ImportError("Not a valid data path!") diff --git a/parameters.ini b/parameters.ini index 0c8b6bc..e8e1f17 100644 --- a/parameters.ini +++ b/parameters.ini @@ -62,6 +62,7 @@ save_images : \\path_where_to_save_MIP_and_graphs ; if None, no saving. Otherwise, path where to save images (MIP & graphs) save_csv : \\path_where_to_save_features_csv ; where to save the csv. if None, by default it is in the .tif directory +save_features_analysis : \\path_where_to_save_features_analysis_graphs_and_csv dilatation_erosion : True ; Choose whether to perform erosion/dilation on the extensions during the connexion process. ; Better results are achieved with it, and doing dilatation_erosion is theoretically more rigorous diff --git a/search_parameters.py b/search_parameters.py index 7fd20c0..be8df30 100644 --- a/search_parameters.py +++ b/search_parameters.py @@ -55,6 +55,7 @@ size_voxel_in_micron = IfNotFloat('Input', 'size_voxel_in_micron') dilatation_erosion = parameters.getboolean('Input', 'dilatation_erosion') save_images = parameters['Input']['save_images'] save_csv = parameters['Input']['save_csv'] +save_features_analysis = parameters['Input']['save_features_analysis'] statistical_analysis = parameters.getboolean('Input', 'statistical_analysis') # [Somas] -- GitLab