print(f"Selection of low correlated features in: {save_in}")
# Statistics for each features
if distribution and (all_conditions.__len__() == 2):
for column in df_scalar.columns:
cond_col_df = pd_.concat((df_scalar[column], df_cond["condition"]), axis=1)
fig = pl_.figure(constrained_layout=False)
gs = fig.add_gridspec(ncols=2, nrows=1)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])
# Plot a histogram and kernel density estimate
print(f"Plot a histogram and kernel density estimate for feature {column}")
CHO = cond_col_df.loc[cond_col_df["condition"] == all_conditions[0]]
DIO = cond_col_df.loc[cond_col_df["condition"] == all_conditions[1]]
sb_.histplot(CHO[[column]], color="b", ax=ax1)
sb_.histplot(DIO[[column]], color="r", ax=ax1)
# Draw a boxplot
print(f"Plot a boxplot for feature {column}")
sb_.boxplot(
data=df_cond,
x="condition",
y=column,
hue="condition",
palette=["b", "r"],
ax=ax2,
)
ax1.set_title(f"{column} distribution{title}", fontsize=11)
ax2.set_title(f"{column} boxplot{title}", fontsize=11)
pl_.tight_layout()
if save_in is not None:
pl_.savefig(str(path_t(save_in) / f"feat_distrib_{column}.png"))
pl_.close()
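    # A possible numeric complement to these per-feature plots (hypothetical
    # sketch, not wired in): a Mann-Whitney U test inside the loop above to
    # quantify how well the two conditions are separated. Assumes scipy is
    # available; `p_value` is an illustrative name.
    # from scipy.stats import mannwhitneyu
    # _, p_value = mannwhitneyu(CHO[column], DIO[column])
    # print(f"Mann-Whitney U p-value for {column}: {p_value}")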
    # Decision tree
    if decision_tree:
        # Test between CHO and DIO
        dt_df = df.drop(["condition", "duration"], axis=1)
        clf = tree.DecisionTreeClassifier(max_depth=4)
        clf = clf.fit(dt_df, df["condition"])
        fig = pl_.figure(figsize=(150, 65))
        tree.plot_tree(
            clf,
            feature_names=dt_df.columns.tolist(),
            class_names=list(all_conditions),
            filled=True,
            rounded=True,
            fontsize=60,
        )
        fig.suptitle("Decision tree, all durations", fontsize=120)
        if save_in:
            pl_.savefig(
                str(path_t(save_in) / f"Decision_tree_all_durations_{title}.png")
            )
        pl_.close()
        # Test between CHO and DIO depending on duration
        for duration, values in df_groupby_dur:
            duration_df = values.drop(["duration", "condition"], axis=1)
            clf = tree.DecisionTreeClassifier(max_depth=3)
            clf = clf.fit(duration_df, values["condition"])
            fig = pl_.figure(figsize=(30, 16))
            tree.plot_tree(
                clf,
                feature_names=duration_df.columns.tolist(),
                class_names=list(all_conditions),
                filled=True,
                rounded=True,
                fontsize=8,
            )
            fig.suptitle(f"Decision tree {duration}", fontsize=16)
            if save_in:
                pl_.savefig(
                    str(path_t(save_in) / f"Decision_tree_{duration}_{title}.png")
                )
            pl_.close()
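        # For a quick textual view of a fitted tree (hypothetical sketch, not
        # wired in), sklearn's export_text could be used inside the loop above
        # alongside plot_tree:
        # print(tree.export_text(clf, feature_names=duration_df.columns.tolist()))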
def BhattacharyyaSimilarity(h1, h2):
    return -nmpy.log(nmpy.sum(nmpy.sqrt(nmpy.multiply(Normalize(h1), Normalize(h2)))))

def Normalize(h):
    return h / nmpy.sum(h)
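# Hypothetical usage sketch (not wired in) for the TODO in Main about distance
# metrics on histogram features: compare two of the hist_lengths / hist_curvature
# histograms, which Main currently drops, with the Bhattacharyya-based distance.
# Row indices 0 and 1 are illustrative.
# hist_a = nmpy.asarray(df0.loc[0, "hist_lengths"])
# hist_b = nmpy.asarray(df0.loc[1, "hist_lengths"])
# print(BhattacharyyaSimilarity(hist_a, hist_b))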
def Main(path, save_in):
    # TODO: clean, reduce and optimize the code (many duplicates)
    #
    # os.chdir("path")
    ## If files need to be concatenated:
    # all_filenames = [i for i in glob.glob('*.{}'.format("csv"))]
    # print(all_filenames)
    # df = pd_.concat([pd_.read_csv(f, index_col=0) for f in all_filenames])
    # df.to_csv("./combined_features.csv")
    ## If labeled somas are used:
    # labeled_somas = nmpy.load("path.npy")
    # df = pd_.read_csv("./combined_features.csv", index_col=0)
    ## DF cleaning
    df0 = pd_.read_csv(
        str(path),
        # index_col=0,
    )
    LOGGER.info(f"Read: {path}: {df0.shape}")
    # "Unnamed: 0" = column of index labels
    df = df0.drop(["Unnamed: 0"], axis=1)
    LOGGER.info(f"After dropping Unnamed: {df.shape}")
    ## Statistical analysis
    # For the moment, drop the columns with non-scalar or non-informative values
    # - TO BE CHANGED TODO (use distance metrics such as the Bhattacharyya coefficient, etc.)
    df = df.drop(
        [
            "soma uid",
            "spherical_angles_eva",
            "spherical_angles_evb",
            "hist_lengths",
            "hist_lengths_P",
            "hist_lengths_S",
            "hist_curvature",
            "hist_curvature_P",
            "hist_curvature_S",
        ],
        axis=1,
    )
    LOGGER.info(f"After dropping non-scalar: {df.shape}")
    df = df.dropna()
    LOGGER.info(f"After dropping NaN: {df.shape}")
    # -- PCA with all the features
    print("\nALL FEATURES\n")
    # Between the two conditions, regardless of the experiment duration (2 conditions, all durations)
    PCAOnDF(
        df,
        target="condition",
        targets=_Conditions(df),
        colors=["b", "r"],
        save_name="all_features",
        save_in=save_in,
        three_D=True,
    )
    # # Between the two conditions, for each duration (2 conditions, 3 durations)
    # groupby_duration = df.groupby("duration")
    # for duration, values in groupby_duration:
    #     ## duration: str, values: pd_.DataFrame()
    #     PCAOnDF(values,
    #             target="condition",
    #             targets=["CHO", "DIO"],
    #             colors=["b", "r"],
    #             save_name=f"{duration}_features",
    #             save_in=save_in,
    #             title=f" - {duration} Sample",
    #             plot_duration=False,
    #             three_D=True,
    #             )
    # -- K-means with all the features (2 conditions)
    # Test for multiple glial populations
    # Between the two conditions, regardless of the experiment duration (2 conditions, all durations)
    # kmeans = KmeansOnDF(df,
    #                     target="condition",
    #                     nb_clusters=(2, 3, 4, 5),
    #                     elbow=True,
    #                     save_name="all_features_multiple_pop",
    #                     save_in=save_in,
    #                     features_distribution=False,
    #                     )
    # # Between the two conditions, for each duration (2 conditions, 3 durations)
    # groupby_duration = df.groupby("duration")
    # for duration, values in groupby_duration:
    #     kmeans = KmeansOnDF(values,
    #                         target="condition",
    #                         nb_clusters=(2, 3, 4, 5),
    #                         elbow=False,
    #                         intracluster_var=True,
    #                         plot_bar=True,
    #                         save_name=f"{duration}_features_multiple_pop",
    #                         title=f" - {duration} Sample",
    #                         duration=True,
    #                         save_in=save_in,
    #                         )
    # -- Various plots to analyse the data and find discriminant features by statistical analysis
    print("\nFEATURE SELECTION\n")
    FeaturesStatistics(
        df,
        save_in=save_in,
    )
    ## TODO: Enter selected features here
    # selected_features = []
    # selected_df = df[selected_features]
    ## TODO: Or use the csv with dropped features
    try:
        selected_df = pd_.read_csv(str(path_t(save_in) / "df_drop_highly_corr_feat.csv"))
        LOGGER.info(
            f"Read: {path_t(save_in) / 'df_drop_highly_corr_feat.csv'}: {selected_df.shape}"
        )
        # selected_df = pd_.read_csv(f"{save_in}\\df_drop_highly_corr_feat_6H.csv")
    except FileNotFoundError:
        raise RuntimeError(
            "Run the pipeline up to and including FeaturesStatistics first to generate "
            "df_drop_highly_corr_feat.csv, then run this last part."
        )
    # If other columns need to be dropped:
    try:
        to_drop = ["Unnamed: 0", "min_curvature"]
        selected_df = selected_df.drop(to_drop, axis=1)
        LOGGER.info(f"After dropping Unnamed and min_curvature: {selected_df.shape}")
    except KeyError:
        selected_df = selected_df.drop(["Unnamed: 0"], axis=1)
        LOGGER.info(f"After dropping Unnamed: {selected_df.shape}")
    # -- PCA with the selected features
    print("\nSELECTED FEATURES\n")
    # Between the two conditions, regardless of the experiment duration (2 conditions, all durations)
    if selected_df.columns.size == 2:
        LOGGER.warning("All features are highly correlated; no selected-feature analysis.")
    else:
        PCAOnDF(
            selected_df,
            target="condition",
            targets=_Conditions(selected_df),
            colors=["b", "r"],
            save_name="all_selected_features",
            save_in=save_in,
            three_D=True,
        )
    # # Between the two conditions, for each duration (2 conditions, 3 durations)
    # groupby_duration = selected_df.groupby("duration")
    # for duration, values in groupby_duration:
    #     # duration: str, values: pd_.DataFrame()
    #     PCAOnDF(values,
    #             target="condition",
    #             targets=["CHO", "DIO"],
    #             colors=["b", "r"],
    #             save_name=f"{duration}_selected_features",
    #             save_in=save_in,
    #             title=f" - {duration} Sample - selected features",
    #             plot_duration=False,
    #             three_D=True,
    #             )
    # -- K-means with the selected features (2 conditions)
    # Between the two conditions, regardless of the experiment duration (2 conditions, all durations)
    # kmeans = KmeansOnDF(selected_df,
    #                     target="condition",
    #                     nb_clusters=(2, 3, 4, 5),
    #                     intracluster_var=False,
    #                     save_name="all_selected_features",
    #                     save_in=save_in,
    #                     )
    # # Between the two conditions, for each duration (2 conditions, 3 durations)
    # groupby_duration = selected_df.groupby("duration")
    # for duration, values in groupby_duration:
    #     kmeans = KmeansOnDF(values,
    #                         target="condition",
    #                         nb_clusters=(2, 3, 4, 5),
    #                         elbow=False,
    #                         intracluster_var=True,
    #                         plot_bar=True,
    #                         save_name=f"{duration}_selected_features",
    #                         save_in=save_in,
    #                         title=f" - {duration} Sample - selected features",
    #                         duration=True,
    #                         )
    ## TODO: Random forests?
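    # Hypothetical sketch for the random-forest TODO above (not wired in);
    # assumes selected_df still holds the "condition" and "duration" columns:
    # from sklearn.ensemble import RandomForestClassifier
    # features = selected_df.drop(["condition", "duration"], axis=1)
    # rf = RandomForestClassifier(n_estimators=100, random_state=0)
    # rf.fit(features, selected_df["condition"])
    # print(sorted(zip(rf.feature_importances_, features.columns), reverse=True))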
if __name__ == "__main__":
    #
    Main(sy_.argv[1], sy_.argv[2])