Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 8fedfebd authored by NADAL Morgane's avatar NADAL Morgane
Browse files

dvpt pca

parent 98cd4399
No related branches found
No related tags found
No related merge requests found
...@@ -39,10 +39,21 @@ import scipy as si_ ...@@ -39,10 +39,21 @@ import scipy as si_
import seaborn as sb_ import seaborn as sb_
import os import os
import glob import glob
import pandas as pd from typing import List
def KmeansOnDF(df: pd_.DataFrame(), nb_clusters: tuple, representation: bool = False, labeled_somas=None, elbow: bool = False, intracluster_var : bool = True): def KmeansOnDF(df: pd_.DataFrame(),
nb_clusters: tuple,
representation: bool = False,
labeled_somas=None,
elbow: bool = False,
intracluster_var: bool = True,
) -> KMeans:
'''
Perform kmeans on the pandas dataframe. Can find the best number of cluster with elbow method,
find the intracluster variance, and represent the result on the initial images.
Returns kmeans.
'''
# Data standardization # Data standardization
scaler = StandardScaler() scaler = StandardScaler()
scaler.fit(df) scaler.fit(df)
...@@ -97,6 +108,9 @@ def IntraClusterVariance(df: pd_.DataFrame(), kmeans: KMeans(), nb_cluster: int) ...@@ -97,6 +108,9 @@ def IntraClusterVariance(df: pd_.DataFrame(), kmeans: KMeans(), nb_cluster: int)
def RepresentationOnImages(labeled_somas, kmeans, nb_cluster): def RepresentationOnImages(labeled_somas, kmeans, nb_cluster):
'''
Represent the result of kmeans on labeled image. IN DVPT. Only available for a kmean intra-image.
'''
clustered_somas = labeled_somas.copy() clustered_somas = labeled_somas.copy()
clustered_somas = np_.amax(clustered_somas, axis=0) clustered_somas = np_.amax(clustered_somas, axis=0)
for indx, value in enumerate(kmeans.labels_): for indx, value in enumerate(kmeans.labels_):
...@@ -130,26 +144,151 @@ def FeaturesStatistics(df): ...@@ -130,26 +144,151 @@ def FeaturesStatistics(df):
pl_.close() pl_.close()
def PCAOnDF(df: pd_.DataFrame,
            target: str,
            targets: List[str],
            colors: List[str],
            save_name: str = None,
            title: str = "",
            ) -> list:
    '''
    Perform a 2-component PCA on the CHO-DIO dataframe.

    Prints the explained variance ratio and plots the two principal
    components, colored by class.

    Parameters
    ----------
    df : features dataframe that also holds the label column named `target`.
    target : name of the label column (e.g. "condition").
    targets : label values to plot (e.g. ["CHO", "DIO"]).
    colors : one matplotlib color per entry of `targets`.
    save_name : when not None, the figure is saved as PCA_<save_name>.png.
    title : suffix appended to the plot title.

    Returns
    -------
    The explained variance ratio of the two components.
    NOTE: this is a numpy array of length 2; the `-> list` annotation is
    kept as-is for backward compatibility with existing callers.
    '''
    # Separate the features from their condition/duration label.
    # (The label column must not take part in the PCA.)
    features_df = df.drop([target], axis=1)

    # Standardize the data — PCA is sensitive to per-feature scale.
    scaler = StandardScaler()
    scaler.fit(features_df)
    stand_df = scaler.transform(features_df)

    # Create the PCA and fit the standardized data.
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(stand_df)
    print(f"PCA explained variance ratio ({save_name}): ", pca.explained_variance_ratio_)
    principal_df = pd_.DataFrame(data=principal_components, columns=['principal component 1', 'principal component 2'])

    # Final df containing the principal components and their condition.
    # NOTE(review): pd_.concat aligns on the index — assumes `df` has a
    # default RangeIndex so rows match `principal_df`; verify upstream.
    final_df = pd_.concat([principal_df, df[target]], axis=1)

    # Scatter plot of the two components, one color per class.
    fig = pl_.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title(f'2 component PCA{title}', fontsize=20)
    for tgt, color in zip(targets, colors):
        idx = final_df[target] == tgt
        ax.scatter(final_df.loc[idx, 'principal component 1'],
                   final_df.loc[idx, 'principal component 2'],
                   c=color,
                   s=30)
    ax.legend(targets)
    ax.grid()

    if save_name is not None:
        # TODO(review): hard-coded absolute output directory — consider
        # making the destination a parameter.
        pl_.savefig(f"D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\Features_analysis\\PCA_{save_name}.png")

    return pca.explained_variance_ratio_
if __name__ == "__main__":
    #
    # os.chdir("D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion")

    ## If need to concatenate files:
    # all_filenames = [i for i in glob.glob('*.{}'.format("csv"))]
    # print(all_filenames)
    # df = pd_.concat([pd_.read_csv(f, index_col=0) for f in all_filenames])
    # df.to_csv(".\combined_features.csv")

    ## If use labeled somas:
    # labeled_somas = np_.load("D:\\MorganeNadal\\Results\\labeled_somas.npy")
    # df = pd_.read_csv(".\combined_features.csv", index_col=0)

    # Load the pre-computed feature table.
    raw_df = pd_.read_csv("D:\\MorganeNadal\\2_RESULTS\\Results_wo_erosion\\all_features.csv",
                          # index_col=0,
                          )
    df = raw_df.drop(["Unnamed: 0"], axis=1)

    # Statistical analysis.
    # For the moment drop the columns with non scalar values, and un-useful values
    # - TO BE CHANGED (use distance metrics such as bhattacharyya coef, etc)
    df = df.drop(["soma uid",
                  "spherical_angles_eva", "spherical_angles_evb",
                  "hist_lengths", "hist_lengths_P", "hist_lengths_S",
                  "hist_curvature", "hist_curvature_P", "hist_curvature_S"],
                 axis=1)
    df = df.dropna(axis=0, how="any")

    # KMeansIntraImage(df, nb_clusters=(2,))
    # FeatureDistribution(df)

    # -- PCA with all the features.
    # Between the two conditions, regardless of the duration of experiment
    # (2 conditions, all durations).
    no_duration_df = df.drop(["duration"], axis=1)
    all_pca = PCAOnDF(no_duration_df, target="condition", targets=["CHO", "DIO"], colors=["b", "r"], save_name="all_features")

    # Between the two conditions, for each duration (2 conditions, 3 durations).
    by_duration = df.groupby("duration")
    duration_pca = []
    for duration, values in by_duration:
        # print(duration, values.shape)
        # groupby_condition = values.groupby("condition")
        # for cond, val in groupby_condition:
        #     print(cond, val.shape)
        ## duration: str, values: pd_.DataFrame()
        per_duration_df = values.drop(["duration"], axis=1)
        pca = PCAOnDF(per_duration_df,
                      target="condition",
                      targets=["CHO", "DIO"],
                      colors=["b", "r"],
                      save_name=f"{duration}_features",
                      title=f" - {duration} Sample")
        duration_pca.append(pca)

    ## -- K-means with all the features (2 conditions)
    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
    # kmeans = fa_.KmeansOnDF(concatenated_features_df, nb_clusters=(2,), elbow=True, intracluster_var=True)
    ## Between the two conditions, for each duration (2 conditions, 3 durations)
    # for duration, values in groupby_duration:
    #     kmeans = fa_.KmeansOnDF(values, nb_clusters=(2,), elbow=True, intracluster_var=True)

    ## -- Select Discriminant features by statistical analysis
    # TODO filtered_df = SelectFeatures(concatenated_features_df)

    ## -- PCA with all the features with the cluster as label
    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
    # TODO pca = fa_.PCAOnDF(concatenated_features_df)
    ## Between the two conditions, for each duration (2 conditions, 3 durations)
    # for duration, values in groupby_duration:
    #     pca = fa_.PCAOnDF(values)

    ## -- PCA with selected features
    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
    # TODO pca = fa_.PCAOnDF(filtered_df)
    ## Between the two conditions, for each duration (2 conditions, 3 durations)
    # filtered_groupby_duration = filtered_df.groupby("Duration")
    # for duration, values in filtered_groupby_duration:
    #     pca = fa_.PCAOnDF(values)

    ## -- K-means with selected features
    ## Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
    # filtered_kmeans = fa_.KmeansOnDF(filtered_df, nb_clusters=(2,), elbow=True, intracluster_var=True)
    ## Between the two conditions, for each duration (2 conditions, 3 durations)
    # for duration, values in filtered_groupby_duration:
    #     filtered_kmeans = fa_.KmeansOnDF(values, nb_clusters=(2,), elbow=True, intracluster_var=True)
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment