# cell_clustering.py

# Copyright CNRS/Inria/UNS
# Contributor(s): Eric Debreuve (since 2019), Morgane Nadal (2020)
#
# eric.debreuve@cnrs.fr
#
# This software is governed by the CeCILL  license under French law and
# abiding by the rules of distribution of free software.  You can  use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and,  more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import pandas as pd_
import numpy as np_
import matplotlib.pyplot as pl_
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import os
import glob
import pandas as pd


def KMeansIntraImage(
    df,
    nb_clusters: tuple,
    representation=False,
    labeled_somas=None,
    elbow_max_clusters: int = 23,
):
    """Cluster the rows of *df* with k-means and report intracluster variances.

    The features are standardized, an elbow plot (WCSS versus number of
    clusters) is displayed, then one k-means is run for each value in
    *nb_clusters*. For each run, the per-cluster variance is printed.

    Args:
        df: Feature table, one row per sample (a DataFrame or any 2-D
            array-like accepted by ``StandardScaler``).
        nb_clusters: Numbers of clusters to run k-means with.
        representation: If True, display the clusters on the soma image
            after each run.
        labeled_somas: Labeled soma image passed to
            ``RepresentationOnImages``; required when *representation* is
            True.
        elbow_max_clusters: Largest number of clusters tried for the elbow
            plot. It is capped to the number of samples, since k-means
            cannot produce more clusters than there are samples.

    Raises:
        ValueError: If *representation* is True but *labeled_somas* is None.
    """
    # Fail before any (blocking) plot is shown rather than after the clustering.
    if representation and labeled_somas is None:
        raise ValueError("labeled_somas is required when representation is True")

    # Data standardization
    stand_df = StandardScaler().fit_transform(df)
    n_samples = stand_df.shape[0]
    # Positional view of the original (non-standardized) features, so that the
    # variance computation does not depend on the index labels of df.
    features = np_.asarray(df, dtype=float)

    # Best number of clusters using Elbow method
    elbow_range = range(1, min(elbow_max_clusters, n_samples) + 1)
    wcss = []  # within cluster sum of errors(wcss)
    for n_clusters in elbow_range:
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
        kmeans.fit(stand_df)
        wcss.append(kmeans.inertia_)
    pl_.plot(elbow_range, wcss)
    pl_.plot(elbow_range, wcss, 'bo')
    pl_.title('Elbow Method')
    pl_.xlabel('Number of clusters')
    pl_.ylabel('WCSS')
    pl_.show(block=True)
    pl_.close()

    # Kmeans with x clusters
    for nb_cluster in nb_clusters:
        kmeans = KMeans(n_clusters=nb_cluster, init='k-means++', max_iter=300, n_init=10, random_state=0)
        labels = kmeans.fit_predict(stand_df)

        # Intracluster variance: mean squared Euclidean distance of the
        # members to their centroid, measured on the original features.
        var = []
        for cluster in range(nb_cluster):
            members = features[labels == cluster]
            if members.shape[0] == 0:
                # An empty cluster has no defined variance.
                var.append(float("nan"))
                continue
            centroid = members.mean(axis=0)
            variance = np_.sum((members - centroid) ** 2) / members.shape[0]
            var.append(variance)
        print(f"Intracluster variance for {nb_cluster} clusters :", var)

        # Representation on the image
        if representation:
            RepresentationOnImages(labeled_somas, kmeans, nb_cluster)

def RepresentationOnImages(labeled_somas, kmeans, nb_cluster):
    """Display the soma label image recolored by k-means cluster.

    The labeled image is max-projected along axis 0, then every pixel whose
    value is ``i + 1`` receives ``kmeans.labels_[i] + 1``. Pixels whose value
    matches no soma (e.g. the background, 0) are left unchanged.

    Args:
        labeled_somas: Labeled soma array; presumably a 3-D stack, since it
            is projected along axis 0 into the displayed image.
        kmeans: Fitted estimator exposing ``labels_``, where entry ``i`` is
            the cluster of the soma labeled ``i + 1``.
        nb_cluster: Number of clusters, only used in the figure title.
    """
    projection = np_.amax(labeled_somas, axis=0)
    # Write into a copy while testing against the untouched projection:
    # relabeling in place would let an already assigned cluster id be mistaken
    # for the label of a later soma and be overwritten.
    clustered_somas = projection.copy()
    for soma_index, cluster in enumerate(kmeans.labels_):
        clustered_somas[projection == soma_index + 1] = cluster + 1
    pl_.imshow(clustered_somas, cmap="tab20")
    pl_.title(f"n cluster = {nb_cluster}")
    pl_.show(block=True)
    pl_.close()


def FeatureDistribution(df, output_dir="D:\\MorganeNadal\\M2 report\\kmeans24"):
    """Save a 20-bin histogram of every column of *df* as a PNG file.

    Each column name is printed, then its histogram is written to
    ``<output_dir>/feat_distrib_<column>.png``.

    Args:
        df: DataFrame whose columns are the features to plot.
        output_dir: Directory receiving the images. Defaults to the location
            historically hard-coded in this function.
    """
    for column in df.columns:
        print(column)
        df[column].hist(bins=20)
        pl_.title(f"{column}")
        pl_.savefig(os.path.join(output_dir, f"feat_distrib_{column}.png"))
        # Close the figure so that the next histogram starts from a blank one.
        pl_.close()


if __name__ == "__main__":
    # Raw string: "\M", "\R" and "\F" are not valid escape sequences, which
    # recent Python versions warn about in a regular string literal.
    os.chdir(r"D:\MorganeNadal\Results\Features")
    # Sorted so that the row order of the combined table is reproducible.
    all_filenames = sorted(glob.glob("*.csv"))
    print(all_filenames)
    if not all_filenames:
        raise FileNotFoundError(f"No CSV feature file found in {os.getcwd()}")
    df = pd_.concat([pd_.read_csv(f, index_col=0) for f in all_filenames])
    # Concatenating several files can repeat index labels; renumber the rows
    # as "soma 1".."soma N", the labeling KMeansIntraImage looks rows up by.
    df.index = [f"soma {position}" for position in range(1, len(df) + 1)]
    # df.to_csv("D:\MorganeNadal\Results\combined_features.csv")
    # labeled_somas = np_.load("D:\\MorganeNadal\\Results\\labeled_somas.npy")
    # df = pd_.read_csv("D:\\MorganeNadal\\M2 report\\Results\\features_all_images_DIO_CHO_.csv", index_col=0)
    df = df.drop(["spherical_angles_eva", "spherical_angles_evb", "hist_lengths", "hist_lengths_P", "hist_lengths_S",
                  "hist_curvature", "hist_curvature_P", "hist_curvature_S"],
                 axis=1)
    KMeansIntraImage(df, nb_clusters=(2,))
    # FeatureDistribution(df)