print(f"Selection of low correlated features in: {save_in}")
# Statistics for each features
if distribution and (all_conditions.__len__() == 2):
for column in df_scalar.columns:
cond_col_df = pd_.concat((df_scalar[column], df_cond["condition"]), axis=1)
fig = pl_.figure(constrained_layout=False)
gs = fig.add_gridspec(ncols=2, nrows=1)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])
# Plot a histogram and kernel density estimate
print(f"Plot a histogram and kernel density estimate for feature {column}")
CHO = cond_col_df.loc[cond_col_df["condition"] == all_conditions[0]]
DIO = cond_col_df.loc[cond_col_df["condition"] == all_conditions[1]]
sb_.histplot(CHO[[column]], color="b", ax=ax1)
sb_.histplot(DIO[[column]], color="r", ax=ax1)
# Draw a boxplot
print(f"Plot a boxplot for feature {column}")
sb_.boxplot(
data=df_cond,
x="condition",
y=column,
hue="condition",
palette=["b", "r"],
ax=ax2,
)
ax1.set_title(f"{column} distribution{title}", fontsize=11)
ax2.set_title(f"{column} boxplot{title}", fontsize=11)
pl_.tight_layout()
if save_in is not None:
pl_.savefig(str(path_t(save_in) / f"feat_distrib_{column}.png"))
pl_.close()
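    # A possible numeric complement to these per-feature plots (hypothetical
    # sketch, not wired in): a Mann-Whitney U test inside the loop above to
    # quantify how well the two conditions are separated. Assumes scipy is
    # available; `p_value` is an illustrative name.
    # from scipy.stats import mannwhitneyu
    # _, p_value = mannwhitneyu(CHO[column], DIO[column])
    # print(f"Mann-Whitney U p-value for {column}: {p_value}")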
    # Decision tree
    if decision_tree:
        # Test between CHO and DIO
        dt_df = df.drop(["condition", "duration"], axis=1)
        clf = tree.DecisionTreeClassifier(max_depth=4)
        clf = clf.fit(dt_df, df["condition"])
        fig = pl_.figure(figsize=(150, 65))
        tree.plot_tree(
            clf,
            feature_names=dt_df.columns.tolist(),
            class_names=list(all_conditions),
            filled=True,
            rounded=True,
            fontsize=60,
        )
        fig.suptitle("Decision tree, all durations", fontsize=120)
        if save_in:
            pl_.savefig(
                str(path_t(save_in) / f"Decision_tree_all_durations_{title}.png")
            )
        pl_.close()
        # Test between CHO and DIO depending on duration
        for duration, values in df_groupby_dur:
            duration_df = values.drop(["duration", "condition"], axis=1)
            clf = tree.DecisionTreeClassifier(max_depth=3)
            clf = clf.fit(duration_df, values["condition"])
            fig = pl_.figure(figsize=(30, 16))
            tree.plot_tree(
                clf,
                feature_names=duration_df.columns.tolist(),
                class_names=list(all_conditions),
                filled=True,
                rounded=True,
                fontsize=8,
            )
            fig.suptitle(f"Decision tree {duration}", fontsize=16)
            if save_in:
                pl_.savefig(
                    str(path_t(save_in) / f"Decision_tree_{duration}_{title}.png")
                )
            pl_.close()
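        # For a quick textual view of a fitted tree (hypothetical sketch, not
        # wired in), sklearn's export_text could be used inside the loop above
        # alongside plot_tree:
        # print(tree.export_text(clf, feature_names=duration_df.columns.tolist()))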
def BhattacharyyaSimilarity(h1, h2):
    return -nmpy.log(nmpy.sum(nmpy.sqrt(nmpy.multiply(Normalize(h1), Normalize(h2)))))

def Normalize(h):
    return h / nmpy.sum(h)
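# Hypothetical usage sketch (not wired in) for the TODO in Main about distance
# metrics on histogram features: compare two of the hist_lengths / hist_curvature
# histograms, which Main currently drops, with the Bhattacharyya-based distance.
# Row indices 0 and 1 are illustrative.
# hist_a = nmpy.asarray(df0.loc[0, "hist_lengths"])
# hist_b = nmpy.asarray(df0.loc[1, "hist_lengths"])
# print(BhattacharyyaSimilarity(hist_a, hist_b))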
def Main(path, save_in):
    # TODO: clean, reduce and optimize the code (many duplicates)
    #
    # os.chdir("path")
    ## If files need to be concatenated:
    # all_filenames = [i for i in glob.glob('*.{}'.format("csv"))]
    # print(all_filenames)
    # df = pd_.concat([pd_.read_csv(f, index_col=0) for f in all_filenames])
    # df.to_csv("./combined_features.csv")
    ## If labeled somas are used:
    # labeled_somas = nmpy.load("path.npy")
    # df = pd_.read_csv("./combined_features.csv", index_col=0)
    ## DF cleaning
    df0 = pd_.read_csv(
        str(path),
        # index_col=0,
    )
    LOGGER.info(f"Read: {path}: {df0.shape}")
    # "Unnamed: 0" = column of index labels
    df = df0.drop(["Unnamed: 0"], axis=1)
    LOGGER.info(f"After dropping Unnamed: {df.shape}")
    ## Statistical analysis
    # For the moment, drop the columns with non-scalar or non-informative values
    # - TO BE CHANGED TODO (use distance metrics such as the Bhattacharyya coefficient, etc.)
    df = df.drop(
        [
            "soma uid",
            "spherical_angles_eva",
            "spherical_angles_evb",
            "hist_lengths",
            "hist_lengths_P",
            "hist_lengths_S",
            "hist_curvature",
            "hist_curvature_P",
            "hist_curvature_S",
        ],
        axis=1,
    )
    LOGGER.info(f"After dropping non-scalar: {df.shape}")
    df = df.dropna()
    LOGGER.info(f"After dropping NaN: {df.shape}")
    # -- PCA with all the features
    print("\nALL FEATURES\n")
    # Between the two conditions, regardless of the experiment duration (2 conditions, all durations)
    PCAOnDF(
        df,
        target="condition",
        targets=_Conditions(df),
        colors=["b", "r"],
        save_name="all_features",
        save_in=save_in,
        three_D=True,
    )
    # # Between the two conditions, for each duration (2 conditions, 3 durations)
    # groupby_duration = df.groupby("duration")
    # for duration, values in groupby_duration:
    #     ## duration: str, values: pd_.DataFrame()
    #     PCAOnDF(values,
    #             target="condition",
    #             targets=["CHO", "DIO"],
    #             colors=["b", "r"],
    #             save_name=f"{duration}_features",
    #             save_in=save_in,
    #             title=f" - {duration} Sample",
    #             plot_duration=False,
    #             three_D=True,
    #             )
    # -- K-means with all the features (2 conditions)
    # Test for multiple glial populations
    # Between the two conditions, regardless of the experiment duration (2 conditions, all durations)
    # kmeans = KmeansOnDF(df,
    #                     target="condition",
    #                     nb_clusters=(2, 3, 4, 5),
    #                     elbow=True,
    #                     save_name="all_features_multiple_pop",
    #                     save_in=save_in,
    #                     features_distribution=False,
    #                     )
    # # Between the two conditions, for each duration (2 conditions, 3 durations)
    # groupby_duration = df.groupby("duration")
    # for duration, values in groupby_duration:
    #     kmeans = KmeansOnDF(values,
    #                         target="condition",
    #                         nb_clusters=(2, 3, 4, 5),
    #                         elbow=False,
    #                         intracluster_var=True,
    #                         plot_bar=True,
    #                         save_name=f"{duration}_features_multiple_pop",
    #                         title=f" - {duration} Sample",
    #                         duration=True,
    #                         save_in=save_in,
    #                         )
    # -- Various plots to analyse the data and find discriminant features by statistical analysis
    print("\nFEATURE SELECTION\n")
    FeaturesStatistics(
        df,
        save_in=save_in,
    )
    ## TODO: Enter selected features here
    # selected_features = []
    # selected_df = df[selected_features]
    ## TODO: Or use the csv with dropped features
    try:
        selected_df = pd_.read_csv(str(path_t(save_in) / "df_drop_highly_corr_feat.csv"))
        LOGGER.info(
            f"Read: {path_t(save_in) / 'df_drop_highly_corr_feat.csv'}: {selected_df.shape}"
        )
        # selected_df = pd_.read_csv(f"{save_in}\\df_drop_highly_corr_feat_6H.csv")
    except FileNotFoundError:
        raise RuntimeError(
            "Run the pipeline up to and including FeaturesStatistics first to generate "
            "df_drop_highly_corr_feat.csv, then run this last part."
        )
    # If other columns need to be dropped:
    try:
        to_drop = ["Unnamed: 0", "min_curvature"]
        selected_df = selected_df.drop(to_drop, axis=1)
        LOGGER.info(f"After dropping Unnamed and min_curvature: {selected_df.shape}")
    except KeyError:
        selected_df = selected_df.drop(["Unnamed: 0"], axis=1)
        LOGGER.info(f"After dropping Unnamed: {selected_df.shape}")
    # -- PCA with the selected features
    print("\nSELECTED FEATURES\n")
    # Between the two conditions, regardless of the experiment duration (2 conditions, all durations)
    if selected_df.columns.size == 2:
        LOGGER.warning("All features are highly correlated; no selected-feature analysis.")
    else:
        PCAOnDF(
            selected_df,
            target="condition",
            targets=_Conditions(selected_df),
            colors=["b", "r"],
            save_name="all_selected_features",
            save_in=save_in,
            three_D=True,
        )
    # # Between the two conditions, for each duration (2 conditions, 3 durations)
    # groupby_duration = selected_df.groupby("duration")
    # for duration, values in groupby_duration:
    #     # duration: str, values: pd_.DataFrame()
    #     PCAOnDF(values,
    #             target="condition",
    #             targets=["CHO", "DIO"],
    #             colors=["b", "r"],
    #             save_name=f"{duration}_selected_features",
    #             save_in=save_in,
    #             title=f" - {duration} Sample - selected features",
    #             plot_duration=False,
    #             three_D=True,
    #             )
    # -- K-means with the selected features (2 conditions)
    # Between the two conditions, regardless of the experiment duration (2 conditions, all durations)
    # kmeans = KmeansOnDF(selected_df,
    #                     target="condition",
    #                     nb_clusters=(2, 3, 4, 5),
    #                     intracluster_var=False,
    #                     save_name="all_selected_features",
    #                     save_in=save_in,
    #                     )
    # # Between the two conditions, for each duration (2 conditions, 3 durations)
    # groupby_duration = selected_df.groupby("duration")
    # for duration, values in groupby_duration:
    #     kmeans = KmeansOnDF(values,
    #                         target="condition",
    #                         nb_clusters=(2, 3, 4, 5),
    #                         elbow=False,
    #                         intracluster_var=True,
    #                         plot_bar=True,
    #                         save_name=f"{duration}_selected_features",
    #                         save_in=save_in,
    #                         title=f" - {duration} Sample - selected features",
    #                         duration=True,
    #                         )
    ## TODO: Random forests?
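    # Hypothetical sketch for the random-forest TODO above (not wired in);
    # assumes selected_df still holds the "condition" and "duration" columns:
    # from sklearn.ensemble import RandomForestClassifier
    # features = selected_df.drop(["condition", "duration"], axis=1)
    # rf = RandomForestClassifier(n_estimators=100, random_state=0)
    # rf.fit(features, selected_df["condition"])
    # print(sorted(zip(rf.feature_importances_, features.columns), reverse=True))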
if __name__ == "__main__":
    #
    Main(sy_.argv[1], sy_.argv[2])