Mentions légales du service

Skip to content
Snippets Groups Projects
Commit d4ae31ed authored by CREMONESI Francesco's avatar CREMONESI Francesco
Browse files

remove old files

parent ff68684b
No related branches found
No related tags found
No related merge requests found
Showing
with 0 additions and 13394 deletions
This diff is collapsed.
import torch
from torchvision import datasets
from torchvision import transforms
import matplotlib.pyplot as plt
def non_iid_split(dataset, nb_nodes, n_samples_per_node, batch_size, shuffle, shuffle_digits=False):
    """Split a digit dataset into per-node loaders holding disjoint digit classes.

    One batch of ``nb_nodes * n_samples_per_node`` samples is drawn from
    ``dataset``.  The ten digit labels are partitioned into ``nb_nodes``
    contiguous groups and each node receives the drawn samples whose label
    belongs to its group, so the number of samples per node depends on the
    labels present in the drawn batch.

    Args:
        dataset: dataset of ``(image, label)`` pairs accepted by
            ``torch.utils.data.DataLoader``.
        nb_nodes: number of nodes, between 1 and 10 inclusive.
        n_samples_per_node: the sampling batch holds
            ``nb_nodes * n_samples_per_node`` samples in total.
        batch_size: batch size of every returned loader.
        shuffle: forwarded to the sampling loader and to the returned loaders.
        shuffle_digits: when true, the digit order is permuted (with a
            generator seeded with 0) before being partitioned.

    Returns:
        A list of ``nb_nodes`` ``DataLoader`` objects, one per node.

    Raises:
        AssertionError: if ``nb_nodes`` is not in ``[1, 10]``.
    """
    assert 0 < nb_nodes <= 10, "nb_nodes must be between 1 and 10"

    if shuffle_digits:
        digits = torch.randperm(10, generator=torch.Generator().manual_seed(0))
    else:
        digits = torch.arange(10)

    # Split the digits in a fair way: each node takes an equal share of the
    # digits that are still unassigned, so group sizes differ by at most one.
    digits_split = []
    start = 0
    for remaining_nodes in range(nb_nodes, 0, -1):
        count = (10 - start) // remaining_nodes
        digits_split.append(digits[start:start + count])
        start += count

    # Load (and optionally shuffle) nb_nodes * n_samples_per_node samples.
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=nb_nodes * n_samples_per_node,
        shuffle=shuffle,
    )
    # Built-in next(): DataLoader iterators no longer expose a .next() method.
    images, labels = next(iter(loader))

    data_splitted = []
    for node_digits in digits_split:
        # Boolean mask of the samples whose label is one of this node's digits.
        mask = torch.stack([digit == labels for digit in node_digits]).any(dim=0)
        node_dataset = torch.utils.data.TensorDataset(images[mask], labels[mask])
        data_splitted.append(
            torch.utils.data.DataLoader(node_dataset, batch_size=batch_size, shuffle=shuffle)
        )
    return data_splitted
def iid_split(dataset, nb_nodes, n_samples_per_node, batch_size, shuffle):
    """Give each node one consecutive batch of ``n_samples_per_node`` samples.

    Args:
        dataset: dataset accepted by ``torch.utils.data.DataLoader`` whose
            collated batches are tuples of tensors.
        nb_nodes: number of nodes, i.e. number of batches drawn.
        n_samples_per_node: number of samples drawn for each node.
        batch_size: batch size of every returned loader.
        shuffle: forwarded to the sampling loader and to the returned loaders.

    Returns:
        A list of ``nb_nodes`` ``DataLoader`` objects, one per node.

    Raises:
        StopIteration: if ``dataset`` yields fewer than ``nb_nodes`` batches.
    """
    # Load (and optionally shuffle) n_samples_per_node samples at a time.
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=n_samples_per_node,
        shuffle=shuffle,
    )
    dataiter = iter(loader)

    data_splitted = []
    for _ in range(nb_nodes):
        # Built-in next(): DataLoader iterators no longer expose a .next() method.
        node_tensors = next(dataiter)
        node_dataset = torch.utils.data.TensorDataset(*node_tensors)
        data_splitted.append(
            torch.utils.data.DataLoader(node_dataset, batch_size=batch_size, shuffle=shuffle)
        )
    return data_splitted
def get_MNIST(type="iid", n_samples_train=200, n_samples_test=100, n_clients=3, batch_size=25, shuffle=True):
    """Download MNIST and split it into per-client train and test loaders.

    Both MNIST splits are fetched into ``./data`` (downloading if needed)
    and converted with ``transforms.ToTensor()``.

    Args:
        type: ``"iid"`` to split with ``iid_split``, ``"non_iid"`` to split
            with ``non_iid_split``.
        n_samples_train: per-client sample count passed to the splitter for
            the training set.
        n_samples_test: per-client sample count passed to the splitter for
            the test set.
        n_clients: number of clients (nodes).
        batch_size: batch size of the returned loaders.
        shuffle: forwarded to the splitter.

    Returns:
        ``(train, test)``: two lists of loaders, or two empty lists when
        ``type`` is neither ``"iid"`` nor ``"non_iid"``.
    """
    def load_mnist(is_train):
        # Same arguments for both splits; only the `train` flag differs.
        return datasets.MNIST(
            root="./data",
            train=is_train,
            download=True,
            transform=transforms.ToTensor(),
        )

    dataset_loaded_train = load_mnist(True)
    dataset_loaded_test = load_mnist(False)

    if type == "iid":
        splitter = iid_split
    elif type == "non_iid":
        splitter = non_iid_split
    else:
        # Unknown split type: nothing to distribute.
        return [], []

    train = splitter(dataset_loaded_train, n_clients, n_samples_train, batch_size, shuffle)
    test = splitter(dataset_loaded_test, n_clients, n_samples_test, batch_size, shuffle)
    return train, test
def plot_samples(data, channel:int, title=None, plot_name="", n_examples =20):
    """Plot the first ``n_examples`` images of a batch on a 5-column grid.

    Args:
        data: a pair ``(X, y)``; only ``X`` is used.  ``X[idx, channel]`` is
            reshaped with ``.view((28, 28))``, so it is presumably a torch
            tensor holding 28x28 images -- confirm against the caller.
        channel: index of the channel to display.
        title: optional figure title.
        plot_name: when non-empty, the figure is saved to
            ``plots/<plot_name>.png`` (the directory is not created here).
        n_examples: number of images to draw.
    """
    n_cols = 5
    # Round up so a partially filled last row still gets a grid slot; the
    # previous int(n_examples / 5) made subplot() fail for counts that are
    # not a multiple of 5 (and for counts below 5).
    n_rows = max(1, (n_examples + n_cols - 1) // n_cols)

    plt.figure(figsize=(1 * n_rows, 1 * n_rows))
    if title:
        plt.suptitle(title)

    X, _ = data
    for idx in range(n_examples):
        ax = plt.subplot(n_rows, n_cols, idx + 1)
        image = 255 - X[idx, channel].view((28, 28))
        ax.imshow(image, cmap='gist_gray')
        ax.axis("off")

    # Lay out before saving so the saved file matches the displayed figure.
    plt.tight_layout()
    if plot_name != "":
        plt.savefig("plots/" + plot_name + ".png")
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""This code to create a custom MNIST dataset was made possible thanks to
https://github.com/LaRiffle/collateral-learning .
Important to know that aside the tampering I did on the build_dataset function
for my own application, I also had to change rgba_to_rgb. Indeed, the function
was working as desired on Jupyter but not on Spyder. Do not ask me why !
"""
import matplotlib.pyplot as plt
import numpy as np
from scipy.ndimage.interpolation import map_coordinates
from scipy.ndimage.filters import gaussian_filter
import pickle
import torch
import math
import os
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import Dataset,DataLoader
"""PLOT FUNCTIONS TO VISUALIZE THE FONTS AND DATASETS"""
def show_original_font(family:str):
    """Render the ten digits in the font ``family`` and save the figure.

    The image is written to ``plots/<family>_original.png``; the ``plots``
    directory is not created by this function.
    """
    sample_digits = '1234567890'
    output_path = f"plots/{family}_original.png"

    plt.figure()
    plt.title(family)
    plt.text(0, 0.4, sample_digits, size=50, family=family)
    plt.axis("off")
    plt.tight_layout()
    plt.savefig(output_path)
def convert_to_rgb(data):
    """Drop the first channel along axis 2 and return the rest as a new array.

    In this file the input comes from ``canvas.tostring_argb()``, so the
    discarded first channel is the alpha channel and the remaining three
    are R, G, B.

    Args:
        data: array-like with at least three dimensions; axis 2 holds the
            channels.

    Returns:
        A freshly allocated, writable array equal to ``data[:, :, 1:]``.
    """
    pixels = np.asanyarray(data)
    # One vectorized slice instead of a Python callback per pixel; the copy
    # keeps the result independent of (possibly read-only) input buffers.
    return pixels[:, :, 1:].copy()
def elastic_transform(image, alpha, sigma, random_state=None):
    """Apply a random elastic deformation to ``image`` (see [Simard2003]_).

    Two uniform noise fields in [-1, 1) are smoothed with a Gaussian filter
    of width ``sigma`` and scaled by ``alpha``; they displace the sampling
    positions along the first two axes.  The image is then resampled with
    first-order interpolation and ``reflect`` boundary handling.

    The sampling grid is fixed at 28 x 28 x 3 regardless of ``image.shape``.

    Args:
        image: array to deform; indexed with three coordinates per sample.
        alpha: scale factor of the displacement fields.
        sigma: standard deviation of the Gaussian smoothing.
        random_state: optional ``np.random.RandomState``; a fresh one is
            created when ``None``.

    Returns:
        The deformed image as an array of shape ``(28, 28, 3)``.

    .. [Simard2003] Simard, Steinkraus and Platt, "Best Practices for
       Convolutional Neural Networks applied to Visual Document Analysis", in
       Proc. of the International Conference on Document Analysis and
       Recognition, 2003.
    """
    rng = np.random.RandomState(None) if random_state is None else random_state
    grid_shape = (28, 28, 3)

    def smoothed_displacement():
        # Uniform noise in [-1, 1), blurred and scaled to pixel units.
        noise = rng.rand(*grid_shape) * 2 - 1
        return gaussian_filter(noise, sigma, mode="constant", cval=0) * alpha

    # Draw order matters for reproducibility: first field, then second.
    shift_x = smoothed_displacement()
    shift_y = smoothed_displacement()

    # Default 'xy' meshgrid indexing: `cols` varies along axis 1 and `rows`
    # along axis 0, hence the (rows, cols, chans) order of the coordinates.
    cols, rows, chans = np.meshgrid(*(np.arange(extent) for extent in grid_shape))
    sample_coords = (
        (rows + shift_y).reshape(-1, 1),
        (cols + shift_x).reshape(-1, 1),
        chans.reshape(-1, 1),
    )

    deformed = map_coordinates(image, sample_coords, order=1, mode='reflect')
    return deformed.reshape(grid_shape)
def center(data):
    """Crop ``data`` to a half-size window centred on its dark-pixel centroid.

    Pixel values are inverted (``255 - value``) so that dark strokes carry
    the weight, and the weighted mean position along the first two axes is
    used as the window centre.

    Args:
        data: 3-D array indexed as ``[x, y, channel]`` with values on a
            0-255 scale.

    Returns:
        ``data[x_min:x_max, y_min:y_max, :]`` where the window spans a
        quarter of the extent on each side of the centroid.

    Raises:
        AssertionError: if the centroid is not strictly inside the central
            half of either axis.
    """
    # Invert black and white so strokes weigh more than the background.
    inverted = np.full(data.shape, 255.0) - data
    # Normalise to a probability mass over all entries.
    mass = inverted / inverted.sum()

    # Marginal distributions along the two spatial axes.
    mass_x = mass.sum(axis=(1, 2))
    mass_y = mass.sum(axis=(0, 2))

    # Expected positions.
    extent_x, extent_y, _ = mass.shape
    cx = (mass_x * np.arange(extent_x)).sum()
    cy = (mass_y * np.arange(extent_y)).sum()

    # The window must fit inside the array.
    assert extent_x / 4 < cx < 3 * extent_x / 4, f"ERROR: {cx} > {extent_x/4} and {cx} < {3 * extent_x/4}"
    assert extent_y / 4 < cy < 3 * extent_y / 4, f"ERROR: {cy} > {extent_y/4} and {cy} < {3 * extent_y/4}"

    half_x = extent_x / 4
    half_y = extent_y / 4
    x_min, x_max = int(round(cx - half_x)), int(round(cx + half_x))
    y_min, y_max = int(round(cy - half_y)), int(round(cy + half_y))
    return data[x_min:x_max, y_min:y_max, :]
def create_transformed_digit(digit:int, size:float, rotation:float, family:str):
    """Render one digit with matplotlib and return it as a deformed image array.

    The digit is drawn as text on a small figure, the canvas is read back as
    ARGB bytes, the alpha channel is dropped, the image is cropped around
    the glyph with ``center`` and finally warped with ``elastic_transform``.

    Args:
        digit: the digit to draw (rendered via ``str(digit)``).
        size: font size of the text.
        rotation: rotation passed to ``fig.text``.
        family: font family passed to ``fig.text``.

    Returns:
        The array returned by ``elastic_transform`` (shape ``(28, 28, 3)``).

    Raises:
        AssertionError: propagated from ``center`` when the glyph's centroid
            is too far from the middle of the canvas.
    """
    fig = plt.figure(figsize=(2, 2), dpi=28)
    try:
        fig.text(0.4, 0.4, str(digit), size=size, rotation=rotation, family=family)
        # Remove axes, draw, and read back the ARGB pixels of the canvas.
        plt.axis('off')
        fig.canvas.draw()
        data = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
        # get_width_height() is (width, height); reverse it to (rows, cols).
        data = data.reshape(fig.canvas.get_width_height()[::-1] + (4,))
        # Convert to RGB by dropping the leading alpha channel.
        data = convert_to_rgb(data)
        # Centre the glyph (crops to half the canvas extent).
        data = center(data)
        # Apply an elastic deformation.
        return elastic_transform(data, alpha=991, sigma=9)
    finally:
        # Always release the figure, even when a step above raises;
        # otherwise failed renders accumulate open figures.
        plt.close(fig)
def save_dataset(dataset_name:str, array_X:np.array, array_y:np.array):
    """Pickle the pair ``(array_X, array_y)`` to ``<dataset_name>.pkl``.

    Args:
        dataset_name: path of the output file without the ``.pkl`` suffix.
        array_X: first element of the stored tuple.
        array_y: second element of the stored tuple.
    """
    target_path = f'{dataset_name}.pkl'
    with open(target_path, 'wb') as sink:
        payload = (array_X, array_y)
        pickle.dump(payload, sink)
def build_dataset(C:dict, std_size=2.5):
    """Build, or load from an on-disk cache, the synthetic digit dataset for ``C``.

    The cache file is ``<font>_<numbers>_<n_samples>_<tilt>_<seed>.pkl`` in
    the current working directory.  If it exists it is unpickled and
    returned as-is; otherwise the dataset is generated, pickled to that
    path and returned.

    Each sample is a ``(3, 28, 28)`` array: for the ``j``-th entry of
    ``C['tilt']`` a digit image is rendered with that tilt (plus Gaussian
    jitter) and its ``j``-th colour channel is stored in ``X[j]``.

    Args:
        C: configuration dict with the keys ``'numbers'`` (digits to sample
            from), ``'font'``, ``'n_samples'``, ``'tilt'`` (iterable of base
            rotations), ``'std_tilt'`` and ``'seed'``.
        std_size: standard deviation of the font-size jitter around 60.

    Returns:
        A tuple ``(X, y)`` of arrays with shapes ``(n_samples, 3, 28, 28)``
        and ``(n_samples, 1)`` when freshly generated; whatever the cache
        file contains otherwise.
    """
    numbers_str = "".join(str(n) for n in C['numbers'])
    file_name = f"{C['font']}_{numbers_str}_{C['n_samples']}_{C['tilt']}_{C['seed']}"
    cache_path = f"{file_name}.pkl"

    if os.path.isfile(cache_path):
        # NOTE(review): unpickling executes arbitrary code; this is only safe
        # for cache files written by this function.
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    # NOTE(review): a seed of 0 (or None) is falsy and is not applied, so such
    # datasets are not reproducible even though the seed is part of the cache
    # name.  `std_tilt` and `std_size` are not part of the cache name either.
    if C['seed']:
        np.random.seed(C['seed'])

    list_X = []
    list_y = []
    n_samples = C['n_samples']
    for i in range(n_samples):
        if i % 10 == 0:
            print(round(i / n_samples * 100), '%')

        X = np.zeros((3, 28, 28))
        # Choose the digit once per sample; every tilt renders the same digit.
        digit = C["numbers"][np.random.randint(len(C["numbers"]))]
        for j, tilt in enumerate(C['tilt']):
            rotation = tilt + np.random.normal(0, C['std_tilt'])
            size = 60 + np.random.normal(0, std_size)
            X_tilt = create_transformed_digit(digit, size, rotation, C['font'])
            # X has three slots, so at most three tilts are supported.
            X[j] = X_tilt[:, :, j]

        list_X.append(X)
        list_y.append([digit])

    # Build the arrays once, then both cache and return them.
    dataset = (np.array(list_X), np.array(list_y))
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(dataset, cache_file)
    return dataset
class Ds_MNIST_modified(Dataset):
    """Dataset wrapping feature/label sequences for the clients' dataloaders.

    ``features`` and ``labels`` are stored as given and indexed in parallel.
    """

    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(features[idx] as a torch.Tensor, labels[idx])``."""
        # torch.Tensor(...) converts the stored sample to the default float
        # tensor type; the label is returned untouched.
        sample_x = torch.Tensor(self.features[idx])
        sample_y = self.labels[idx]
        return sample_x, sample_y

    def plot_samples(self, channel:int, title=None, plot_name="",
                     n_examples =20):
        """Plot the first ``n_examples`` samples on a 5-column grid.

        Args:
            channel: index of the 28x28 plane to display after the sample
                is viewed as ``(-1, 28, 28)``.
            title: optional figure title.
            plot_name: when non-empty, the figure is saved to
                ``plots/<plot_name>.png`` (the directory is not created here).
            n_examples: number of samples to draw.
        """
        n_cols = 5
        # Round up so a partially filled last row still gets a grid slot; the
        # previous int(n_examples / 5) made subplot() fail for counts that
        # are not a multiple of 5 (and for counts below 5).
        n_rows = max(1, (n_examples + n_cols - 1) // n_cols)

        plt.figure(figsize=(1 * n_rows, 1 * n_rows))
        if title:
            plt.suptitle(title)

        for idx in range(n_examples):
            X, _ = self[idx]
            ax = plt.subplot(n_rows, n_cols, idx + 1)
            image = 255 - X.view((-1, 28, 28))[channel]
            ax.imshow(image, cmap='gist_gray')
            ax.axis("off")

        # Lay out before saving so the saved file matches the displayed figure.
        plt.tight_layout()
        if plot_name != "":
            plt.savefig("plots/" + plot_name + ".png")
def get_synth_MNIST(clients, batch_size:int, shuffle=True):
    """Return per-client training and testing dataloaders.

    For every client configuration the dataset is obtained from
    ``build_dataset``, rescaled with ``(255 - X) / 255`` and split at
    ``C['n_samples_train']``: samples before that index form the training
    set, the rest the test set.

    Args:
        clients: iterable of configuration dicts accepted by
            ``build_dataset``; each must also hold ``'n_samples_train'``.
        batch_size: batch size of every loader.
        shuffle: forwarded to both the training and the testing loaders.

    Returns:
        ``(list_train, list_test)``: two lists of ``DataLoader`` objects,
        one entry per client.
    """
    def as_loader(features, labels):
        return DataLoader(
            Ds_MNIST_modified(features, labels),
            batch_size=batch_size,
            shuffle=shuffle,
        )

    list_train, list_test = [], []
    for client in clients:
        features, labels = build_dataset(client)
        features = (255 - features) / 255

        cut = client['n_samples_train']
        list_train.append(as_loader(features[:cut], labels[:cut]))
        list_test.append(as_loader(features[cut:], labels[cut:]))

    return list_train, list_test
\ No newline at end of file
source diff could not be displayed: it is too large. Options to address this: view the blob.
This diff is collapsed.
federated_learning/fl-graph.png

74.9 KiB

# Introduction
Standard machine learning approaches require a centralized dataset in order to train a model. In certain scenarios, such as in the biomedical field, this is not straightforward for several reasons, including:
* Privacy concerns:
* General Data Protection Regulation (GDPR): [General Data Protection Regulation (GDPR) – Official Legal Text](https://gdpr-info.eu/)
* Californian Consumer Privacy Act (CCPA): [California Consumer Privacy Act (CCPA) | State of California - Department of Justice - Office of the Attorney General](https://oag.ca.gov/privacy/ccpa)
* Ethical committee approval
* Transferring data to a centralized location
This slows down research in healthcare and limits the generalization of certain models.
## Federated Learning
Federated learning (FL) is a machine learning procedure whose goal is to train a model without having data centralized. The goal of FL is to train higher quality models by having access to more data than centralized approaches, as well as to keep data securely decentralized.
### Infrastructure of a federated learning setting in healthcare
A common scenario of federated learning in healthcare is shown as follows:
![](./fl-graph.png)
Hospitals (a.k.a. clients) across several geographical locations hold data of interest for a researcher. These data can be "made available" for local training, but only the model is authorized to be shared with a trusted third party (e.g. a research center). Once all the models are gathered, different techniques are proposed for **aggregating** them into a single global model. Then, the **aggregated model** can be used as intended (e.g. training a neural network for segmentation).
### Theoretical background
One of the critical points in FL is knowing how to aggregate the models submitted by the clients. The main problem lies in finding the best set of **parameters** that define your model as a function of the submissions made by the clients.
In a canonical form:
$$
\min_w F(w), \quad \textrm{where } F(w) := \sum_{k=1}^{m} p_k F_k(w)
$$
where $m$ is the total number of clients, $p_k \geq 0$ and $\sum_k p_k = 1$, and $F_k$ is the local objective function for the $k$-th client. The impact (contribution) of each client to the aggregation of the global model is given by $p_k$.
One of the first methodologies proposed in FL for model aggregation was **Federated Averaging (`FedAVG`)** by (McMahan _et al._, 2016). The idea behind it is to define the contribution of each client as $p_k=\frac{n_k}{n}$, where $n_k$ is the number of datapoints held by client $k$ and $n$ is the total number of observations studied.
### Challenges in federated learning
The main challenges in FL are associated with:
- **Communication efficiency:** number of iterations between clients and central location to train an optimal model.
- **Data heterogeneity:** how to build generalized models with heterogeneous data?
- **Security:** adversarial attacks and data leakage.
## Links
[Presentation material](https://ecaad164-c957-4008-a451-5e1098ff8953.filesusr.com/ugd/68a50d_a3d074241b3a4342be2fef2413ee61c7.pdf)
[Colab notebook - part 1](https://colab.research.google.com/drive/1_uemRwNuok1wop6wP2Aiokn0KQgcwfr1?usp=sharing)
[Colab notebook - part 2](https://colab.research.google.com/drive/1PiUee4n8T7pIhDV5zDEqhsK5jXvDYHpO?usp=sharing)
[Colab notebook - part 3](https://colab.research.google.com/drive/1kIbrUtNH_WIPQX5vLyzRjs5CTgKA2CMT?usp=sharing)
[Colab notebook - part 4](https://colab.research.google.com/drive/10wEN9eqdE9Z7CtvhRFgsL3gAzunZGlee?usp=sharing)
---
## References
1. **Konečný, J., McMahan, et al. (2016).** *Federated learning: Strategies for improving communication efficiency*. arXiv preprint arXiv:1610.05492.
2. **Li, T., Sahu, et al. (2018).** *Federated optimization in heterogeneous networks.* arXiv preprint arXiv:1812.06127.
3. **Li, T., Sahu, A. K., Talwalkar, A., & Smith, V. (2020).** *Federated learning: Challenges, methods, and future directions*. IEEE Signal Processing Magazine, 37(3), 50-60.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
# Introduction
This lecture aims at covering the statistical background required to perform association analysis in typical studies of heterogeneous information. We will introduce the notion of statistical association, and highlight the standard analysis paradigm in univariate modeling. We will then explore multivariate association models, generalizing to high-dimensional data the notion of statistical association. In particular, we will focus on standard paradigms such as Canonical Correlation Analysis (CCA), Partial Least Squares (PLS), and Reduced Rank Regression (RRR). We will finally introduce more advanced analysis frameworks, such as Bayesian and deep association methods. Within this context we will present the Multi-Channel Variational Autoencoder, recently developed by our group.
## Links:
- [Presentation material](https://marcolorenzi.github.io/material/AI4Health_winter_school_part1.pdf).
- [Colab Notebook](https://colab.research.google.com/drive/1GifcqjQ0OB8JdrnooWZ137nmuxAE4T-z?usp=sharing).
- [The (hitchhiker’s) guide to Imaging-Genetics](https://marcolorenzi.github.io/material/winter_school/Imaging_Genetics_Book_Chapter.pdf).
This chapter introduces the basics of statistical association models of heterogeneous high-dimensional data, with a specific focus on data analysis in imaging-genetics.
File deleted
File deleted
File deleted
File deleted
File deleted
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment