remove old files

d4ae31ed · CREMONESI Francesco · ff68684b · ff68684b · ff68684b · ff68684b
Commit d4ae31ed authored 1 year ago by CREMONESI Francesco
--- a/federated_learning/FedAvg_FedProx_MNIST_iid_and_noniid.ipynb
+++ b/federated_learning/FedAvg_FedProx_MNIST_iid_and_noniid.ipynb
--- a/federated_learning/create_MNIST_datasets.py
+++ b/federated_learning/create_MNIST_datasets.py
-import torch
-from torchvision import datasets
-from torchvision import transforms
-import matplotlib.pyplot as plt
-
-def non_iid_split(dataset, nb_nodes, n_samples_per_node, batch_size, shuffle, shuffle_digits=False):
-    """Split a labelled digit dataset into per-node loaders holding disjoint digits.
-
-    The ten digit classes are partitioned as evenly as possible between the
-    nodes. One batch of ``nb_nodes * n_samples_per_node`` samples is drawn from
-    ``dataset`` and each node receives the samples of that batch whose label
-    belongs to its digits. Nodes may therefore end up with different numbers
-    of samples.
-
-    Args:
-        dataset: dataset yielding ``(image, label)`` pairs.
-        nb_nodes: number of nodes, between 1 and 10 inclusive.
-        n_samples_per_node: nominal number of samples per node; only used to
-            size the single batch drawn from ``dataset``.
-        batch_size: batch size of the returned loaders.
-        shuffle: whether to shuffle both the initial draw and the returned loaders.
-        shuffle_digits: if true, assign digits to nodes following a fixed
-            (seed 0) random permutation instead of the natural order.
-
-    Returns:
-        A list of ``nb_nodes`` ``DataLoader`` objects, one per node.
-    """
-    assert 0 < nb_nodes <= 10, "nb_nodes must be between 1 and 10"
-
-    if shuffle_digits:
-        digits = torch.randperm(10, generator=torch.Generator().manual_seed(0))
-    else:
-        digits = torch.arange(10)
-
-    # Split the digits fairly: each node takes an equal share of what is left.
-    digits_split = []
-    start = 0
-    for remaining_nodes in range(nb_nodes, 0, -1):
-        count = (10 - start) // remaining_nodes
-        digits_split.append(digits[start:start + count])
-        start += count
-
-    # Load (and optionally shuffle) nb_nodes*n_samples_per_node samples at once.
-    loader = torch.utils.data.DataLoader(dataset,
-                                         batch_size=nb_nodes * n_samples_per_node,
-                                         shuffle=shuffle)
-    # Built-in next(): loader iterators no longer expose a .next() method.
-    images_train_mnist, labels_train_mnist = next(iter(loader))
-
-    data_splitted = []
-    for node_digits in digits_split:
-        # Boolean mask of the samples whose label is one of this node's digits.
-        idx = torch.stack([labels_train_mnist == digit for digit in node_digits]).any(dim=0)
-        node_dataset = torch.utils.data.TensorDataset(images_train_mnist[idx], labels_train_mnist[idx])
-        data_splitted.append(torch.utils.data.DataLoader(node_dataset, batch_size=batch_size, shuffle=shuffle))
-
-    return data_splitted
-
-
-
-def iid_split(dataset, nb_nodes, n_samples_per_node, batch_size, shuffle):
-    """Split a dataset into ``nb_nodes`` loaders of ``n_samples_per_node`` samples each.
-
-    Successive batches of ``n_samples_per_node`` samples are drawn from
-    ``dataset``; each batch becomes the local dataset of one node.
-
-    Args:
-        dataset: dataset yielding ``(image, label)`` pairs.
-        nb_nodes: number of nodes (i.e. number of batches drawn).
-        n_samples_per_node: number of samples given to each node.
-        batch_size: batch size of the returned loaders.
-        shuffle: whether to shuffle both the initial draw and the returned loaders.
-
-    Returns:
-        A list of ``nb_nodes`` ``DataLoader`` objects, one per node.
-
-    Raises:
-        StopIteration: if ``dataset`` holds fewer than ``nb_nodes`` batches.
-    """
-    # Load (and optionally shuffle) n_samples_per_node samples per batch.
-    loader = torch.utils.data.DataLoader(dataset,
-                                         batch_size=n_samples_per_node,
-                                         shuffle=shuffle)
-    dataiter = iter(loader)
-
-    data_splitted = []
-    for _ in range(nb_nodes):
-        # Built-in next(): loader iterators no longer expose a .next() method.
-        node_dataset = torch.utils.data.TensorDataset(*next(dataiter))
-        data_splitted.append(torch.utils.data.DataLoader(node_dataset, batch_size=batch_size, shuffle=shuffle))
-
-    return data_splitted
-
-
-def get_MNIST(type="iid", n_samples_train=200, n_samples_test=100, n_clients=3, batch_size=25, shuffle=True):
-    """Return per-client train and test loaders built from MNIST.
-
-    Both MNIST splits are fetched into ``./data`` (downloading them if needed)
-    and then divided between ``n_clients`` clients, using ``iid_split`` when
-    ``type`` is ``"iid"`` and ``non_iid_split`` when it is ``"non_iid"``.
-    Any other ``type`` yields two empty lists.
-
-    Returns:
-        A ``(train, test)`` pair of lists of loaders.
-    """
-    def _load_split(is_train):
-        return datasets.MNIST(
-            root="./data",
-            train=is_train,
-            download=True,
-            transform=transforms.ToTensor(),
-        )
-
-    train_source = _load_split(True)
-    test_source = _load_split(False)
-
-    if type == "iid":
-        splitter = iid_split
-    elif type == "non_iid":
-        splitter = non_iid_split
-    else:
-        # Unknown split type: nothing to hand out.
-        return [], []
-
-    train = splitter(train_source, n_clients, n_samples_train, batch_size, shuffle)
-    test = splitter(test_source, n_clients, n_samples_test, batch_size, shuffle)
-    return train, test
-
-
-    
-def plot_samples(data, channel:int, title=None, plot_name="", n_examples =20):
-    """Plot the first ``n_examples`` images of a batch on a grid of 5 columns.
-
-    Args:
-        data: ``(X, y)`` pair; only ``X`` is used. Each ``X[idx, channel]``
-            must be viewable as a 28x28 image.
-        channel: index of the channel to display.
-        title: optional figure title.
-        plot_name: if non-empty, the figure is saved as ``plots/<plot_name>.png``.
-        n_examples: number of images to draw.
-    """
-    # Ceiling division so a partial last row still gets a grid line
-    # (at least one row, otherwise the figure would have a zero size).
-    n_rows = max(1, -(-n_examples // 5))
-    plt.figure(figsize=(1 * n_rows, 1 * n_rows))
-    if title:
-        plt.suptitle(title)
-    X, _ = data
-    for idx in range(n_examples):
-        ax = plt.subplot(n_rows, 5, idx + 1)
-
-        # Invert the grey levels before display.
-        image = 255 - X[idx, channel].view((28, 28))
-        ax.imshow(image, cmap='gist_gray')
-        ax.axis("off")
-
-    # Apply the layout before saving so the saved file matches the shown figure.
-    plt.tight_layout()
-
-    if plot_name != "":
-        plt.savefig("plots/" + plot_name + ".png")
-   
\ No newline at end of file
--- a/federated_learning/create_synthetic_MNIST_datasets.py
+++ b/federated_learning/create_synthetic_MNIST_datasets.py
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-"""Create a custom MNIST dataset.
-
-This code was made possible thanks to
-https://github.com/LaRiffle/collateral-learning .
-
-Note that, besides the changes made to the build_dataset function for this
-application, rgba_to_rgb also had to be changed: the original function worked
-as desired in Jupyter but not in Spyder, for reasons that were not identified.
-"""
-
-
-
-import matplotlib.pyplot as plt
-import numpy as np
-from scipy.ndimage.interpolation import map_coordinates
-from scipy.ndimage.filters import gaussian_filter
-import pickle
-import torch
-import math
-import os
-
-
-import torchvision.datasets as datasets
-import torchvision.transforms as transforms
-from torch.utils.data import Dataset,DataLoader
-
-
-"""PLOT FUNCTIONS TO VISUALIZE THE FONTS AND DATASETS"""
-def show_original_font(family:str):
-    """Render the digits '1234567890' in font `family` and save the figure.
-
-    The image is written to ``plots/<family>_original.png``.
-    """
-    output_path = f"plots/{family}_original.png"
-    text_style = {"size": 50, "family": family}
-
-    plt.figure()
-    plt.title(family)
-    plt.text(0, 0.4, '1234567890', **text_style)
-    plt.axis("off")
-    plt.tight_layout()
-    plt.savefig(output_path)
-    
-    
-def convert_to_rgb(data):
-    """Drop the first entry along axis 2 of `data` and return the rest as a new array.
-
-    The caller in this module passes a canvas buffer read with
-    ``tostring_argb``, so the dropped entry is the alpha value and the three
-    remaining ones are the colour channels.
-
-    Args:
-        data: array with at least three dimensions; axis 2 holds the channels.
-
-    Returns:
-        A copy of ``data`` without index 0 of axis 2.
-    """
-    # A plain slice replaces one Python call per pixel; the copy keeps the
-    # result independent of the (possibly read-only) input buffer.
-    return data[:, :, 1:].copy()
-
-
-
-def elastic_transform(image, alpha, sigma, random_state=None):
-    """Apply an elastic deformation to a 28x28x3 image, following [Simard2003]_.
-
-    Two random displacement fields are drawn uniformly in [-1, 1), smoothed
-    with a Gaussian filter of width `sigma` and scaled by `alpha`; the image
-    is then resampled (linear interpolation, reflected borders) at the
-    displaced coordinates. The channel coordinate is left untouched.
-
-    If `random_state` is None, a fresh ``np.random.RandomState`` is used.
-
-    .. [Simard2003] Simard, Steinkraus and Platt, "Best Practices for
-       Convolutional Neural Networks applied to Visual Document Analysis", in
-       Proc. of the International Conference on Document Analysis and
-       Recognition, 2003.
-    """
-    rng = np.random.RandomState(None) if random_state is None else random_state
-    shape = np.array([28, 28, 3], dtype=int)
-
-    def _displacement_field():
-        noise = rng.rand(*shape) * 2 - 1
-        return gaussian_filter(noise, sigma, mode="constant", cval=0) * alpha
-
-    # Draw order matters for reproducibility: dx first, then dy.
-    dx = _displacement_field()
-    dy = _displacement_field()
-
-    x, y, z = np.meshgrid(np.arange(shape[0]), np.arange(shape[1]), np.arange(shape[2]))
-    displaced = (y + dy, x + dx, z)
-    indices = tuple(np.reshape(grid, (-1, 1)) for grid in displaced)
-
-    warped = map_coordinates(image, indices, order=1, mode='reflect')
-    return warped.reshape(shape)
-
-
-
-def center(data):
-    """Crop a 3-D image to a half-size window centred on its dark content.
-
-    The image is inverted (255 - value) and normalised into weights; the
-    weighted mean position along the first two axes gives the centre. The
-    returned window spans a quarter of the axis length on each side of it.
-
-    Raises:
-        AssertionError: if the centre is not strictly inside the middle half
-            of the image along either axis.
-    """
-    # Inverse black and white, then normalise so the weights sum to one.
-    inverted = np.ones(data.shape) * 255 - data
-    weights = inverted / np.sum(inverted)
-
-    X, Y, _ = weights.shape
-
-    # Expected position along each of the first two axes.
-    cx = np.sum(np.sum(weights, (1, 2)) * np.arange(X))
-    cy = np.sum(np.sum(weights, (0, 2)) * np.arange(Y))
-
-    # Check bounds
-    assert X/4 < cx < 3 * X/4, f"ERROR: {cx} > {X/4} and {cx} < {3 * X/4}"
-    assert Y/4 < cy < 3 * Y/4, f"ERROR: {cy} > {Y/4} and {cy} < {3 * Y/4}"
-
-    def _window(mid, length):
-        return slice(int(round(mid - length/4)), int(round(mid + length/4)))
-
-    return data[_window(cx, X), _window(cy, Y), :]
-   
-
-
-def create_transformed_digit(digit:int, size:float, rotation:float, family:str):
-    """Render one digit with matplotlib and return it centred and deformed.
-
-    The digit is drawn as text on a 2x2 inch figure at 28 dpi, read back from
-    the canvas as ARGB bytes, reduced to three channels, cropped around its
-    centre with ``center`` and finally warped with ``elastic_transform``.
-
-    Args:
-        digit: digit to draw.
-        size: font size of the text.
-        rotation: rotation of the text, passed to matplotlib.
-        family: font family of the text.
-
-    Returns:
-        The array produced by ``elastic_transform``.
-    """
-    fig = plt.figure(figsize=(2,2), dpi=28)
-    try:
-        fig.text(0.4, 0.4, str(digit), size=size, rotation=rotation, family=family)
-
-        # Remove the axes, draw, and read the canvas back as an ARGB array.
-        plt.axis('off')
-        fig.canvas.draw()
-        data = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
-        data = data.reshape(fig.canvas.get_width_height()[::-1] + (4,))
-    finally:
-        # Always free the figure, even if rendering fails; the pixel data
-        # lives in its own bytes buffer, so it is safe to close here.
-        plt.close(fig)
-
-    # Convert to rgb
-    data = convert_to_rgb(data)
-
-    # Center the data
-    data = center(data)
-
-    # Apply an elastic deformation
-    return elastic_transform(data, alpha=991, sigma=9)
-
-    
-
-def save_dataset(dataset_name:str, array_X:np.array, array_y:np.array):
-    """Pickle the pair ``(array_X, array_y)`` into ``<dataset_name>.pkl``."""
-    target_path = f'{dataset_name}.pkl'
-    payload = (array_X, array_y)
-
-    with open(target_path, 'wb') as handle:
-        pickle.dump(payload, handle)
-        
-        
-        
-def build_dataset(C:dict, std_size=2.5):
-    """Build a synthetic digit dataset described by `C`, or load it from cache.
-
-    Keys read from `C`:
-        'font': font family used to draw the digits.
-        'numbers': digits that may appear in the dataset.
-        'n_samples': number of samples to generate.
-        'tilt': sequence of base rotations, one per output channel.
-        'std_tilt': standard deviation of the noise added to each rotation.
-        'seed': seed for ``np.random``; ``None`` (or ``False``) leaves the
-            generator unseeded.
-
-    The result is cached in ``<font>_<numbers>_<n_samples>_<tilt>_<seed>.pkl``.
-    NOTE(review): 'std_tilt' and `std_size` are not part of that name, so a
-    cached file is reused even when they change.
-
-    Args:
-        C: configuration dictionary (see above).
-        std_size: standard deviation of the noise added to the font size (60).
-
-    Returns:
-        A pair ``(X, y)``; when freshly built, ``X`` has shape
-        ``(n_samples, 3, 28, 28)`` and ``y`` has shape ``(n_samples, 1)``.
-    """
-    numbers_str = "".join(str(n) for n in C['numbers'])
-    file_name = f"{C['font']}_{numbers_str}_{C['n_samples']}_{C['tilt']}_{C['seed']}"
-    cache_path = f"{file_name}.pkl"
-
-    if os.path.isfile(cache_path):
-        # NOTE(review): unpickling runs arbitrary code; only load trusted caches.
-        with open(cache_path, "rb") as cache_file:
-            return pickle.load(cache_file)
-
-    # `is not None` so that seed 0 is honoured; False is still read as "no seed".
-    seed = C['seed']
-    if seed is not None and seed is not False:
-        np.random.seed(seed)
-
-    list_X = []
-    list_y = []
-
-    for i in range(C['n_samples']):
-
-        # Progress report every ten samples.
-        if i % 10 == 0:
-            print(round(i / C['n_samples'] * 100), '%')
-
-        X = np.zeros((3, 28, 28))
-        # Choose the digit for this sample.
-        digit = C["numbers"][np.random.randint(len(C["numbers"]))]
-
-        # One independently rendered image per tilt; channel j of the sample
-        # takes channel j of the j-th rendering.
-        for j, tilt in enumerate(C['tilt']):
-            rotation = tilt + np.random.normal(0, C['std_tilt'])
-            size = 60 + np.random.normal(0, std_size)
-
-            X_tilt = create_transformed_digit(digit, size, rotation, C['font'])
-
-            X[j] = X_tilt[:, :, j]
-
-        # Append data to the datasets
-        list_X.append(X)
-        list_y.append([digit])
-
-    # Save the dataset so the next call with the same configuration is a cache hit.
-    dataset = (np.array(list_X), np.array(list_y))
-    with open(cache_path, 'wb') as cache_file:
-        pickle.dump(dataset, cache_file)
-
-    return dataset
-
- 
-class Ds_MNIST_modified(Dataset):
-    """Dataset over feature/label sequences, used to build the clients' dataloaders."""
-
-    def __init__(self, features, labels):
-        """Store the features and labels; both are indexed by sample."""
-        self.features = features
-        self.labels = labels
-
-    def __len__(self):
-        """Return the number of samples."""
-        return len(self.features)
-
-    def __getitem__(self, idx):
-        """Return sample `idx` as ``(features as a torch tensor, label)``."""
-        # torch.Tensor(...) converts to the default tensor type
-        # (float32 unless the default was changed).
-        sample_x = torch.Tensor(self.features[idx])
-        sample_y = self.labels[idx]
-
-        return sample_x, sample_y
-
-    def plot_samples(self, channel:int, title=None, plot_name="",
-        n_examples =20):
-        """Plot channel `channel` of the first `n_examples` samples, 5 per row.
-
-        Each sample is viewed as a stack of 28x28 images. If `plot_name` is
-        non-empty, the figure is saved as ``plots/<plot_name>.png``.
-        """
-        # Ceiling division so a partial last row still gets a grid line
-        # (at least one row, otherwise the figure would have a zero size).
-        n_rows = max(1, -(-n_examples // 5))
-        plt.figure(figsize=(1 * n_rows, 1 * n_rows))
-        if title:
-            plt.suptitle(title)
-
-        for idx in range(n_examples):
-
-            X, _ = self[idx]
-
-            ax = plt.subplot(n_rows, 5, idx + 1)
-
-            # Invert the grey levels before display.
-            image = 255 - X.view((-1, 28, 28))[channel]
-            ax.imshow(image, cmap='gist_gray')
-            ax.axis("off")
-
-        # Apply the layout before saving so the saved file matches the shown figure.
-        plt.tight_layout()
-
-        if plot_name != "":
-            plt.savefig("plots/" + plot_name + ".png")
-
-    
-    
-
-def get_synth_MNIST(clients, batch_size:int, shuffle=True):
-    """Build one training and one testing dataloader per client.
-
-    For each client configuration, the dataset returned by ``build_dataset``
-    is inverted and rescaled with ``(255 - X) / 255``; the first
-    ``n_samples_train`` samples feed the training loader and the remaining
-    ones feed the testing loader.
-
-    Returns:
-        A ``(train loaders, test loaders)`` pair of lists, in client order.
-    """
-    def _as_loader(features, labels):
-        return DataLoader(Ds_MNIST_modified(features, labels), batch_size = batch_size, shuffle = shuffle)
-
-    train_loaders = []
-    test_loaders = []
-
-    for client in clients:
-        pixels, targets = build_dataset(client)
-        pixels = (255 - pixels) / 255
-
-        cut = client['n_samples_train']
-        train_loaders.append(_as_loader(pixels[:cut], targets[:cut]))
-        test_loaders.append(_as_loader(pixels[cut:], targets[cut:]))
-
-    return train_loaders, test_loaders
-    
\ No newline at end of file
--- a/federated_learning/federated_mcvae.ipynb
+++ b/federated_learning/federated_mcvae.ipynb
--- a/federated_learning/federated_mcvae_adni.ipynb
+++ b/federated_learning/federated_mcvae_adni.ipynb
--- a/federated_learning/fl-graph.png
+++ b/federated_learning/fl-graph.png
--- a/federated_learning/introduction.md
+++ b/federated_learning/introduction.md
-# Introduction
-
-Standard machine learning approaches require a centralized dataset in order to train a model. In certain scenarios, such as in the biomedical field, this is not straightforward for several reasons, including:
-
-* Privacy concerns:
-  * General Data Protection Regulation (GDPR): [General Data Protection Regulation (GDPR) – Official Legal Text](https://gdpr-info.eu/)
-  * Californian Consumer Privacy Act (CCPA): [California Consumer Privacy Act (CCPA) | State of California - Department of Justice - Office of the Attorney General](https://oag.ca.gov/privacy/ccpa)
-* Ethical committee approval
-* Transferring data to a centralized location
-
-This slows down research in healthcare and limits the generalization of certain models.
-
-## Federated Learning
-
-Federated learning (FL) is a machine learning procedure whose goal is to train a model without having data centralized. The goal of FL is to train higher quality models by having access to more data than centralized approaches, as well as to keep data securely decentralized. 
-
-### Infrastructure of a federated learning setting in healthcare
-
-A common scenario of federated learning in healthcare is shown as follows:
-
-![](./fl-graph.png)
-
-Hospitals (a.k.a. clients) across several geographical locations hold data of interest for a researcher. These data can be "made available" for local training, but only the model is authorized to be shared with a trusted third party (e.g. a research center). Once all the models are gathered, different techniques have been proposed for **aggregating** them into a single global model. Then, the **aggregated model** can be used for its intended purpose (e.g. training a neural network for segmentation).
-
-### Theoretical background
-
-One of the critical points in FL is knowing how to aggregate the models submitted by the clients. The main problem lies in finding the best set of **parameters** that define the model as a function of the submissions made by the clients.
-
-In a canonical form:
-
-$$
-\min_w F(w), \quad \textrm{where } F(w) := \sum_{k=1}^{m} p_k F_k(w)
-$$
-
-Here, $m$ is the total number of clients, $p_k \geq 0$ with $\sum_k p_k = 1$, and $F_k$ is the local objective function for the $k$-th client. The impact (contribution) of each client to the aggregation of the global model is given by $p_k$.
-
-One of the first methodologies proposed in FL for model aggregation was **Federated Averaging (`FedAvg`)** by McMahan _et al._ (2016). The idea behind it is to define the contribution of each client as $p_k=\frac{n_k}{n}$, where $n_k$ is the number of data points held by client $k$ and $n$ is the total number of observations studied.
-
-### Challenges in federated learning
-
-The main challenges in FL are associated with:
-
- **Communication efficiency:** number of iterations between clients and central location to train an optimal model.
-
- **Data heterogeneity:** how to build generalized models with heterogeneous data?
-
- **Security:** adversarial attacks and data leakage.
-
-
-## Links
-
-[Presentation material](https://ecaad164-c957-4008-a451-5e1098ff8953.filesusr.com/ugd/68a50d_a3d074241b3a4342be2fef2413ee61c7.pdf)
-
-[Colab notebook - part 1](https://colab.research.google.com/drive/1_uemRwNuok1wop6wP2Aiokn0KQgcwfr1?usp=sharing)
-
-[Colab notebook - part 2](https://colab.research.google.com/drive/1PiUee4n8T7pIhDV5zDEqhsK5jXvDYHpO?usp=sharing)
-
-[Colab notebook - part 3](https://colab.research.google.com/drive/1kIbrUtNH_WIPQX5vLyzRjs5CTgKA2CMT?usp=sharing)
-
-[Colab notebook - part 4](https://colab.research.google.com/drive/10wEN9eqdE9Z7CtvhRFgsL3gAzunZGlee?usp=sharing)
-
-
---
-
-## References
-
-1. **Konečný, J., McMahan, et al. (2016).** *Federated learning: Strategies for improving communication efficiency*. arXiv preprint arXiv:1610.05492.
-
-2. **Li, T., Sahu, et al. (2018).** *Federated optimization in heterogeneous networks.* arXiv preprint arXiv:1812.06127.
-
-3. **Li, T., Sahu, A. K., Talwalkar, A., & Smith, V. (2020).** *Federated learning: Challenges, methods, and future directions*. IEEE Signal Processing Magazine, 37(3), 50-60.
-
-
-
--- a/federated_learning/mcvae_rotated_mnist.ipynb
+++ b/federated_learning/mcvae_rotated_mnist.ipynb
--- a/heterogeneous_data/heterogeneous_data.ipynb
+++ b/heterogeneous_data/heterogeneous_data.ipynb
--- a/heterogeneous_data/img/mcvae.svg
+++ b/heterogeneous_data/img/mcvae.svg
--- a/heterogeneous_data/img/sparse_mcvae_3ch.svg
+++ b/heterogeneous_data/img/sparse_mcvae_3ch.svg
--- a/heterogeneous_data/img/sparse_vae.svg
+++ b/heterogeneous_data/img/sparse_vae.svg
--- a/heterogeneous_data/img/vae.svg
+++ b/heterogeneous_data/img/vae.svg
--- a/heterogeneous_data/introduction.md
+++ b/heterogeneous_data/introduction.md
-# Introduction 
-
-This lecture aims at covering the statistical background required to perform association analysis in typical studies of heterogeneous information. We will introduce the notion of statistical association, and highlight the standard analysis paradigm in univariate modeling. We will then explore multivariate association models, generalizing to high-dimensional data the notion of statistical association. In particular, we will focus on standard paradigms such as Canonical Correlation Analysis (CCA), Partial Least Squares (PLS), and Reduced Rank Regression (RRR). We will finally introduce more advanced analysis frameworks, such as Bayesian and deep association methods. Within this context we will present the Multi-Channel Variational Autoencoder, recently developed by our group. 
-
-## Links:
-
- [Presentation material](https://marcolorenzi.github.io/material/AI4Health_winter_school_part1.pdf).   
-
- [Colab Notebook](https://colab.research.google.com/drive/1GifcqjQ0OB8JdrnooWZ137nmuxAE4T-z?usp=sharing).
-
- [The (hitchhiker‘s) guide to Imaging-Genetics](https://marcolorenzi.github.io/material/winter_school/Imaging_Genetics_Book_Chapter.pdf).
-This chapter introduces the basics of statistical association models of heterogeneous high-dimensional data, with a specific focus on data analysis in imaging-genetics.
-
-
-
-
-
--- a/heterogeneous_data/model.pt
+++ b/heterogeneous_data/model.pt
--- a/heterogeneous_data/model_adni.pt
+++ b/heterogeneous_data/model_adni.pt
--- a/heterogeneous_data/model_multi.pt
+++ b/heterogeneous_data/model_multi.pt
--- a/heterogeneous_data/model_sparse.pt
+++ b/heterogeneous_data/model_sparse.pt
--- a/heterogeneous_data/model_sparse1.pt
+++ b/heterogeneous_data/model_sparse1.pt
--- a/heterogeneous_data/pseudo_adni.csv
+++ b/heterogeneous_data/pseudo_adni.csv