analysis.py

            DIO = cond_col_df.loc[cond_col_df["condition"] == "DIO"]

            sb_.distplot(CHO[[column]], color="b", ax=ax1)
            sb_.distplot(DIO[[column]], color="r", ax=ax1)

            # Draw a boxplot
            print(f"Plot a boxplot for feature {column}")
            sb_.boxplot(
                data=df_cond,
                x="condition",
                y=column,
                hue="condition",
                palette=["b", "r"],
                ax=ax2,
            )
            ax1.set_title(f"{column} distribution{title}", fontsize=11)
            ax2.set_title(f"{column} boxplot{title}", fontsize=11)

            pl_.tight_layout()

            if save_in is not None:
                pl_.savefig(str(path_t(save_in) / f"feat_distrib_{column}.png"))
                pl_.close()

    # Decision tree
    if decision_tree:
        # Test btw CHO and DIO
        dt_df = df.drop(["condition", "duration"], axis=1)
        clf = tree.DecisionTreeClassifier(max_depth=4)
        clf = clf.fit(dt_df, df["condition"])
        fig = pl_.figure(figsize=(150, 65))
        tree.plot_tree(
            clf,
            feature_names=df.columns,
            class_names=["CHO", "DIO"],
            filled=True,
            rounded=True,
            fontsize=60,
        )
        fig.suptitle(f"Decision tree all durations", fontsize=120)
        if save_in:
            pl_.savefig(
                str(path_t(save_in) / f"Decision_tree_all_durations_{title}.png")
            )
            pl_.close()

        # Test btw CHO and DIO depending on duration
        for duration, values in df_groupby_dur:
            duration_df = values.drop(["duration", "condition"], axis=1)
            clf = tree.DecisionTreeClassifier(max_depth=3)
            clf = clf.fit(duration_df, values["condition"])

            fig = pl_.figure(figsize=(30, 16))
            tree.plot_tree(
                clf,
                feature_names=df.columns,
                class_names=["CHO", "DIO"],
                filled=True,
                rounded=True,
                fontsize=8,
            )
            fig.suptitle(f"Decision tree {duration}", fontsize=16)
            if save_in:
                pl_.savefig(
                    str(path_t(save_in) / f"Decision_tree_{duration}_{title}.png")
                )
                pl_.close()


def BhattacharyyaSimilarity(h1, h2):
    """Return the negative log of the Bhattacharyya coefficient of two histograms.

    Both inputs are first rescaled with ``Normalize`` so that each sums to 1.
    The coefficient is the sum of the element-wise square roots of the
    product of the two normalized histograms.

    NOTE: despite the function name, the returned value behaves like a
    distance: it is 0 when the normalized histograms are identical and it
    grows as their overlap shrinks (no overlap at all yields ``inf``).

    h1, h2: histograms to compare; presumably array-likes with the same
        number of bins (to be confirmed against the callers).
    """
    normalized_1 = Normalize(h1)
    normalized_2 = Normalize(h2)

    overlap = nmpy.sqrt(nmpy.multiply(normalized_1, normalized_2))
    coefficient = nmpy.sum(overlap)

    return -nmpy.log(coefficient)


def Normalize(h):
    """Return *h* divided by the sum of its elements.

    The result sums to 1 whenever the sum of *h* is non-zero. No special
    handling is done for a zero sum: the division is left to numpy.
    """
    total = nmpy.sum(h)
    return h / total


def Main():
    """Run the feature analysis pipeline from the command line.

    Command-line arguments:
        1. Folder containing ``features.csv``.
        2. Folder where the results are saved; it is also where
           ``df_drop_highly_corr_feat.csv`` is read back from.

    Steps:
        - Load the features, drop the non-scalar/unused columns and any row
          containing a missing value.
        - Run a PCA on all the remaining features (CHO vs DIO).
        - Run ``FeaturesStatistics`` on the cleaned features.
        - Load ``df_drop_highly_corr_feat.csv`` and run a PCA on these
          selected features.

    Raises:
        SystemExit: if the two folder arguments are not provided.
        RuntimeError: if ``df_drop_highly_corr_feat.csv`` cannot be found in
            the output folder.
    """
    # TODO: clean, reduce and optimize the code (many duplicates)
    #
    # os.chdir("path")

    ## If need to concatenate files:
    # all_filenames = [i for i in glob.glob('*.{}'.format("csv"))]
    # print(all_filenames)
    # df = pd_.concat([pd_.read_csv(f, index_col=0) for f in all_filenames])
    # df.to_csv(".\combined_features.csv")

    ## If use labeled somas:
    # labeled_somas = nmpy.load("path.npy")
    # df = pd_.read_csv(".\combined_features.csv", index_col=0)

    ## Parameters
    # Fail with an explicit usage message instead of a bare IndexError.
    if len(sy_.argv) < 3:
        raise SystemExit(f"Usage: {sy_.argv[0]} <features_folder> <output_folder>")
    path = sy_.argv[1]
    save_in = sy_.argv[2]

    ## DF cleaning
    # Build the path with path_t (as done for the saved figures) instead of a
    # hard-coded "\\" separator, so that it also works outside Windows.
    df0 = pd_.read_csv(
        path_t(path) / "features.csv",
        # index_col=0,
    )
    df = df0.drop(["Unnamed: 0"], axis=1)

    ## Statistical analysis
    # For the moment drop the columns with non scalar values, and useless values
    # - TO BE CHANGED TODO (use distance metrics such as bhattacharyya coef, etc)
    df = df.drop(
        [
            "soma uid",
            "spherical_angles_eva",
            "spherical_angles_evb",
            "hist_lengths",
            "hist_lengths_P",
            "hist_lengths_S",
            "hist_curvature",
            "hist_curvature_P",
            "hist_curvature_S",
        ],
        axis=1,
    )
    df = df.dropna(axis=0, how="any")

    # -- PCA with all the features
    print("\nALL FEATURES\n")
    # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
    PCAOnDF(
        df,
        target="condition",
        targets=["CHO", "DIO"],
        colors=["b", "r"],
        save_name="all_features",
        save_in=save_in,
        three_D=True,
    )

    # # Between the two conditions, for each duration (2 conditions, 3 durations)
    # groupby_duration = df.groupby("duration")
    # for duration, values in groupby_duration:
    #     ## duration: str, values: pd_.DataFrame()
    #     PCAOnDF(values,
    #             target="condition",
    #             targets=["CHO", "DIO"],
    #             colors=["b", "r"],
    #             save_name=f"{duration}_features",
    #             save_in=save_in,
    #             title=f" - {duration} Sample",
    #             plot_duration=False,
    #             three_D=True,
    #             )

    # -- K-means with all the features (2 conditions)

    # Test for multiple glial populations
    # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
    # kmeans = KmeansOnDF(df,
    #                     target="condition",
    #                     nb_clusters=(2, 3, 4, 5),
    #                     elbow=True,
    #                     save_name="all_features_multiple_pop",
    #                     save_in=save_in,
    #                     features_distribution=False,
    #                     )

    # # Between the two conditions, for each duration (2 conditions, 3 durations)
    # groupby_duration = df.groupby("duration")
    # for duration, values in groupby_duration:
    #     kmeans = KmeansOnDF(values,
    #                         target="condition",
    #                         nb_clusters=(2, 3, 4, 5),
    #                         elbow=False,
    #                         intracluster_var=True,
    #                         plot_bar=True,
    #                         save_name=f"{duration}_features_multiple_pop",
    #                         title=f" - {duration} Sample",
    #                         duration=True,
    #                         save_in=save_in,
    #                         )

    # -- Various plots to analyse the data and find discriminant features by statistical analysis
    print("\nFEATURE SELECTION\n")
    FeaturesStatistics(
        df,
        save_in=save_in,
    )
    ## TODO: Enter selected features here
    # selected_features = []
    # selected_df = df[selected_features]
    ## TODO Or use the csv with dropped features
    selected_features_csv = path_t(save_in) / "df_drop_highly_corr_feat.csv"
    # selected_features_csv = path_t(save_in) / "df_drop_highly_corr_feat_6H.csv"
    try:
        selected_df = pd_.read_csv(selected_features_csv)
    except FileNotFoundError as error:
        # Only the "file is missing" case is translated; the original cause is
        # chained so that the offending path remains visible in the traceback.
        raise RuntimeError(
            "Only run the part until FeaturesStatistics included to generate df_drop_highly_corr_feat.csv, and then run the last part."
        ) from error

    # If other columns need to be dropped, add them here. Columns absent from
    # the CSV are skipped instead of raising.
    to_drop = ["Unnamed: 0", "min_curvature"]
    selected_df = selected_df.drop(columns=to_drop, errors="ignore")

    # -- PCA with the selected features
    print("\nSELECTED FEATURES\n")
    # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
    PCAOnDF(
        selected_df,
        target="condition",
        targets=["CHO", "DIO"],
        colors=["b", "r"],
        save_name="all_selected_features",
        save_in=save_in,
        three_D=True,
    )

    # # Between the two conditions, for each duration (2 conditions, 3 durations)
    # groupby_duration = selected_df.groupby("duration")
    # for duration, values in groupby_duration:
    #     # duration: str, values: pd_.DataFrame()
    #     PCAOnDF(values,
    #             target="condition",
    #             targets=["CHO", "DIO"],
    #             colors=["b", "r"],
    #             save_name=f"{duration}_selected_features",
    #             save_in=save_in,
    #             title=f" - {duration} Sample - selected features",
    #             plot_duration=False,
    #             three_D=True,
    #             )

    # -- K-means with the selected features (2 conditions)

    # Between the two conditions, regardless the duration of experiment (2 conditions, all durations)
    # kmeans = KmeansOnDF(selected_df,
    #                     target="condition",
    #                     nb_clusters=(2, 3, 4, 5),
    #                     intracluster_var=False,
    #                     save_name="all_selected_features",
    #                     save_in=save_in,
    #                     )

    # # Between the two conditions, for each duration (2 conditions, 3 durations)
    # groupby_duration = selected_df.groupby("duration")
    # for duration, values in groupby_duration:
    #     kmeans = KmeansOnDF(values,
    #                         target="condition",
    #                         nb_clusters=(2,3,4,5),
    #                         elbow=False,
    #                         intracluster_var=True,
    #                         plot_bar=True,
    #                         save_name=f"{duration}_selected_features",
    #                         save_in=save_in,
    #                         title=f" - {duration} Sample - selected features",
    #                         duration=True,
    #                         )

    ## TODO: Random forests ?


if __name__ == "__main__":
    # Run the analysis only when this file is executed as a script, not when
    # it is imported as a module.
    Main()