Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 8fedfebd authored by NADAL Morgane's avatar NADAL Morgane
Browse files

dvpt pca

parent 98cd4399
No related branches found
No related tags found
No related merge requests found
...@@ -39,10 +39,21 @@ import scipy as si_ ...@@ -39,10 +39,21 @@ import scipy as si_
import seaborn as sb_ import seaborn as sb_
import os import os
import glob import glob
import pandas as pd from typing import List
def KmeansOnDF(df: pd_.DataFrame(), nb_clusters: tuple, representation: bool = False, labeled_somas=None, elbow: bool = False, intracluster_var : bool = True): def KmeansOnDF(df: pd_.DataFrame(),
nb_clusters: tuple,
representation: bool = False,
labeled_somas=None,
elbow: bool = False,
intracluster_var: bool = True,
) -> KMeans:
'''
Perform kmeans on the pandas dataframe. Can find the best number of cluster with elbow method,
find the intracluster variance, and represent the result on the initial images.
Returns kmeans.
'''
# Data standardization # Data standardization
scaler = StandardScaler() scaler = StandardScaler()
scaler.fit(df) scaler.fit(df)
...@@ -97,6 +108,9 @@ def IntraClusterVariance(df: pd_.DataFrame(), kmeans: KMeans(), nb_cluster: int) ...@@ -97,6 +108,9 @@ def IntraClusterVariance(df: pd_.DataFrame(), kmeans: KMeans(), nb_cluster: int)
def RepresentationOnImages(labeled_somas, kmeans, nb_cluster): def RepresentationOnImages(labeled_somas, kmeans, nb_cluster):
'''
Represent the result of kmeans on labeled image. IN DVPT. Only available for a kmean intra-image.
'''
clustered_somas = labeled_somas.copy() clustered_somas = labeled_somas.copy()
clustered_somas = np_.amax(clustered_somas, axis=0) clustered_somas = np_.amax(clustered_somas, axis=0)
for indx, value in enumerate(kmeans.labels_): for indx, value in enumerate(kmeans.labels_):
...@@ -130,26 +144,151 @@ def FeaturesStatistics(df): ...@@ -130,26 +144,151 @@ def FeaturesStatistics(df):
pl_.close() pl_.close()
def PCAOnDF(df: pd_.DataFrame,
            target: str,
            targets: List[str],
            colors: List[str],
            save_name: str = None,
            title: str = "",
            ) -> list:
    '''
    Perform a 2-component PCA on the CHO-DIO dataframe.

    Prints the explained variance ratio and plots the two principal
    components, colored by class.

    Parameters
    ----------
    df : features dataframe that also holds the label column named `target`.
    target : name of the label column (e.g. "condition").
    targets : label values to plot (e.g. ["CHO", "DIO"]).
    colors : one matplotlib color per entry of `targets`.
    save_name : when not None, the figure is saved as PCA_<save_name>.png.
    title : suffix appended to the plot title.

    Returns
    -------
    The explained variance ratio of the two components.
    NOTE: this is a numpy array of length 2; the `-> list` annotation is
    kept as-is for backward compatibility with existing callers.
    '''
    # Separate the features from their condition/duration label.
    # (The label column must not take part in the PCA.)
    features_df = df.drop([target], axis=1)

    # Standardize the data — PCA is sensitive to per-feature scale.
    scaler = StandardScaler()
    scaler.fit(features_df)
    stand_df = scaler.transform(features_df)

    # Create the PCA and fit the standardized data.
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(stand_df)
    print(f"PCA explained variance ratio ({save_name}): ", pca.explained_variance_ratio_)
    principal_df = pd_.DataFrame(data=principal_components, columns=['principal component 1', 'principal component 2'])

    # Final df containing the principal components and their condition.
    # NOTE(review): pd_.concat aligns on the index — assumes `df` has a
    # default RangeIndex so rows match `principal_df`; verify upstream.
    final_df = pd_.concat([principal_df, df[target]], axis=1)

    # Scatter plot of the two components, one color per class.
    fig = pl_.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title(f'2 component PCA{title}', fontsize=20)
    for tgt, color in zip(targets, colors):
        idx = final_df[target] == tgt
        ax.scatter(final_df.loc[idx, 'principal component 1'],
                   final_df.loc[idx, 'principal component 2'],
                   c=color,
                   s=30)
    ax.legend(targets)
    ax.grid()

    if save_name is not None:
        # TODO(review): hard-coded absolute output directory — consider
        # making the destination a parameter.
        pl_.savefig(f"D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis\\PCA_{save_name}.png")

    return pca.explained_variance_ratio_
if __name__ == "__main__":
    #
    # os.chdir("D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion")

    ## If need to concatenate files:
    # all_filenames = [i for i in glob.glob('*.{}'.format("csv"))]
    # print(all_filenames)
    # df = pd_.concat([pd_.read_csv(f, index_col=0) for f in all_filenames])
    # df.to_csv(".\combined_features.csv")

    ## If use labeled somas:
    # labeled_somas = np_.load("D:\\MorganeNadal\\Results\\labeled_somas.npy")
    # df = pd_.read_csv(".\combined_features.csv", index_col=0)

    # Load the pre-computed feature table.
    raw_df = pd_.read_csv("D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\all_features.csv",
                          # index_col=0,
                          )
    df = raw_df.drop(["Unnamed: 0"], axis=1)

    # Statistical analysis.
    # For the moment drop the columns with non scalar values, and un-useful values
    # - TO BE CHANGED (use distance metrics such as bhattacharyya coef, etc)
    df = df.drop(["soma uid",
                  "spherical_angles_eva", "spherical_angles_evb",
                  "hist_lengths", "hist_lengths_P", "hist_lengths_S",
                  "hist_curvature", "hist_curvature_P", "hist_curvature_S"],
                 axis=1)
    df = df.dropna(axis=0, how="any")

    # KMeansIntraImage(df, nb_clusters=(2,))
    # FeatureDistribution(df)

    # -- PCA with all the features.
    # Between the two conditions, regardless of the duration of experiment
    # (2 conditions, all durations).
    no_duration_df = df.drop(["duration"], axis=1)
    all_pca = PCAOnDF(no_duration_df, target="condition", targets=["CHO", "DIO"], colors=["b", "r"], save_name="all_features")

    # Between the two conditions, for each duration (2 conditions, 3 durations).
    by_duration = df.groupby("duration")
    duration_pca = []
    for duration, values in by_duration:
        # print(duration, values.shape)
        # groupby_condition = values.groupby("condition")
        # for cond, val in groupby_condition:
        #     print(cond, val.shape)
        ## duration: str, values: pd_.DataFrame()
        per_duration_df = values.drop(["duration"], axis=1)
        pca = PCAOnDF(per_duration_df,
                      target="condition",
                      targets=["CHO", "DIO"],
                      colors=["b", "r"],
                      save_name=f"{duration}_features",
                      title=f" - {duration} Sample")
        duration_pca.append(pca)

    ## -- K-means with all the features (2 conditions)
    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
    # kmeans = fa_.KmeansOnDF(concatenated_features_df, nb_clusters=(2,), elbow=True, intracluster_var=True)
    ## Between the two conditions, for each duration (2 conditions, 3 durations)
    # for duration, values in groupby_duration:
    #     kmeans = fa_.KmeansOnDF(values, nb_clusters=(2,), elbow=True, intracluster_var=True)

    ## -- Select Discriminant features by statistical analysis
    # TODO filtered_df = SelectFeatures(concatenated_features_df)

    ## -- PCA with all the features with the cluster as label
    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
    # TODO pca = fa_.PCAOnDF(concatenated_features_df)
    ## Between the two conditions, for each duration (2 conditions, 3 durations)
    # for duration, values in groupby_duration:
    #     pca = fa_.PCAOnDF(values)

    ## -- PCA with selected features
    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
    # TODO pca = fa_.PCAOnDF(filtered_df)
    ## Between the two conditions, for each duration (2 conditions, 3 durations)
    # filtered_groupby_duration = filtered_df.groupby("Duration")
    # for duration, values in filtered_groupby_duration:
    #     pca = fa_.PCAOnDF(values)

    ## -- K-means with selected features
    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
    # filtered_kmeans = fa_.KmeansOnDF(filtered_df, nb_clusters=(2,), elbow=True, intracluster_var=True)
    ## Between the two conditions, for each duration (2 conditions, 3 durations)
    # for duration, values in filtered_groupby_duration:
    #     filtered_kmeans = fa_.KmeansOnDF(values, nb_clusters=(2,), elbow=True, intracluster_var=True)
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment