diff --git a/.gitignore b/.gitignore
index 1d8b5edc2412b4fdf9d847a2ca8fa2daea58bf63..d27a390eecba5bce4c93eda6deca91092b5b14b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,9 @@
 *.pyc
 data/MOT17-3T
 data/synthetic_trajectories
-results/*
\ No newline at end of file
+results/*
+output
+.remi
+container*
+requirements*
+script.sh
\ No newline at end of file
diff --git a/config/cfg_dvae_single.ini b/config/cfg_dvae_single.ini
index 3a89473ea3ba99ec8ca4e7618ef63c6d75e4a4d9..b9a52ef5cb48c7ce3d4e4f004bc806c6078ced9f 100644
--- a/config/cfg_dvae_single.ini
+++ b/config/cfg_dvae_single.ini
@@ -5,7 +5,7 @@
 # 1: file model
 logger_type = 1
 print_model = True
-save_root = results/dvae_pretraining
+save_root = output/dvae_pretraining
 
 train_data_dir = data/synthetic_trajectories/train_data
 val_data_dir = data/synthetic_trajectories/val_data
@@ -34,10 +34,11 @@ n_epochs = 500
 early_stop_patience = 50
 continue_train = False
 which_epoch = latest
-print_frequency = 1024
+print_frequency = 10
 save_latest_freq = 1000
-save_epoch_freq = 10
+save_epoch_freq = 50
 validation = True
+random_seed = 64
 
 
 [DataFrame]
diff --git a/container.def b/container.def
new file mode 100644
index 0000000000000000000000000000000000000000..c1643be00842c64a832d56b94adc22b7bde40b21
--- /dev/null
+++ b/container.def
@@ -0,0 +1,129 @@
+Bootstrap: docker
+From: ubuntu:latest
+
+# vim: ft=sh
+
+# Description of this container.
+%help
+    This is a basic Singularity image for general Python execution with a focus on deep learning.
+    To build this container from your workstation at Inria, use:
+        sudo singularity build container.sif container.def
+
+
+# Metadata for this container
+%labels
+    Author gaetan.lepage@inria.fr
+    Version v0.1.0
+
+
+# List of host files to be copied inside the container.
+%files
+    # You can specify a single path.
+    # In this case, the file will be copied to the same path from / in the container.
+    # Example:
+    #     foo/bar/hello.txt
+    # will copy `foo/bar/hello.txt` to `/foo/bar/hello.txt` in the container.
+
+    # You may also specify the location in the container.
+    # Example:
+    #     foo/bar/hello.txt /bonjour
+    # will copy `foo/bar/hello.txt` to `/bonjour/hello.txt` in the container.
+    requirements.txt
+
+
+# The `%environment` section allows you to define environment variables that will be set at runtime.
+# Note that these variables are not made available at build time by their inclusion in the
+# `%environment` section.
+# This means that if you need the same variables during the build process, you should also define
+# them in your `%post` section.
+%environment
+
+
+# This section is where we download files from the internet with tools like git and wget, install
+# new software and libraries, write configuration files, create new directories, etc.
+%post
+    export DEBIAN_FRONTEND="noninteractive"
+    export TZ="Europe/Paris"
+
+    ################
+    # APT packages #
+    ################
+
+    # Update Ubuntu apt repository
+    apt-get update
+
+    # Upgrade packages
+    apt-get upgrade -y
+
+    # Helper for installing apt packages
+    alias apt_install="apt-get install -y --no-install-recommends"
+
+    ##########
+    # PYTHON #######################################################################################
+    ##########
+
+    # Specify the version of python you want to install
+    PYTHON_VERSION='3.9'
+
+    PYTHON=python${PYTHON_VERSION}
+    VERSION_NUMBER=$(echo $PYTHON_VERSION | cut -d '.' -f 2)
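+    # e.g. PYTHON_VERSION='3.9' gives PYTHON='python3.9' and VERSION_NUMBER='9'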
+
+    # Needed for older versions of python (<3.10)
+    if [ $VERSION_NUMBER -lt 10 ]; then
+        apt_install software-properties-common gpg-agent
+        add-apt-repository ppa:deadsnakes/ppa
+        apt update -y
+    fi
+
+
+    # Python
+    apt_install curl                 # To download `get-pip.py` from the internet
+    apt_install ca-certificates      # Required by `curl`
+    apt_install $PYTHON              # The specified version of Python
+    apt_install ${PYTHON}-dev        # Header files, a static library and development tools for
+                                     # building Python modules and more.
+    apt_install ${PYTHON}-distutils  # Support for building and installing additional Python
+                                     # modules
+
+    # Set the default python
+    ln -sf /usr/bin/$PYTHON /usr/bin/python3
+    ln -sf /usr/bin/python3 /usr/bin/python
+
+
+    # >>> Here you can install your apt packages
+    # For example,
+    # apt_install cmake
+    # apt_install nvidia-cuda-toolkit
+    # apt_install sox time gcc
+
+    apt_install libsndfile1
+    apt_install gcc g++
+    apt_install git
+
+
+    ##########
+    # Python #
+    ##########
+
+
+
+    # Install pip
+    curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py
+    python /tmp/get-pip.py
+
+    # Helper for installing pip packages
+    alias pip_install="python -m pip install"
+
+    pip_install -U pip setuptools
+
+    # >>> Here you can install your python packages
+
+    # Global project requirements
+    pip_install -r /requirements.txt
+
+    # pypesq compiles against numpy, so we install it last, once the numpy
+    # version has stabilized at the pinned 1.22.0. Otherwise, if put in
+    # requirements.txt, numpy 1.23 gets installed, pypesq is compiled against
+    # it, and it crashes at runtime because numpy ends up at 1.22.0 again.
+    pip_install https://github.com/vBaiCai/python-pesq/archive/master.zip
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..190241226976aaf3295f73e5f6817b421e6bce90
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,18 @@
+numpy==1.22.0
+
+Pillow
+
+scikit-learn
+pandas
+scipy
+SoundFile
+librosa
+matplotlib
+
+tensorboard
+
+torch
+torchaudio
+torchvision
+
+motmetrics
diff --git a/script.sh b/script.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5de3dbf861fa7a16bc3912708975937bee7e4171
--- /dev/null
+++ b/script.sh
@@ -0,0 +1 @@
+python train_dvae_single.py config/cfg_dvae_single.ini
\ No newline at end of file
diff --git a/train_dvae_single.py b/train_dvae_single.py
index 06aeff8197c7053ba3cfc7663f7765e99f09214b..30e5e3eb3b89a411c55af748d0123ff029f9417d 100644
--- a/train_dvae_single.py
+++ b/train_dvae_single.py
@@ -22,6 +22,7 @@
 import datetime
 import os
+from random import random
 import shutil
 import sys
 from configparser import ConfigParser
 
@@ -89,8 +90,13 @@ def train(cfg_file):
         print('%s' % info)
 
     # Initialize training parameters
-    n_epochs, early_stop_patience, print_frequency, \
-    batch_size, print_frequency, total_steps, start_epoch, epoch_iter, iter_file_path = init_training_params(cfg, save_dir, train_data_loader)
+    n_epochs, early_stop_patience, \
+    total_steps, start_epoch, epoch_iter, iter_file_path, random_seed = init_training_params(cfg, save_dir, train_data_loader)
+
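+    # Seed the PyTorch and NumPy RNGs so training runs are reproducible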
+    torch.manual_seed(random_seed)
+    np.random.seed(random_seed)
+
     # Start training
     print('Start training...')
     if validation:
@@ -104,38 +109,36 @@
         training_total_loss = 0
         training_recon_loss = 0
         training_KLD_loss = 0
+        epoch_iter_number = 0
         for idx, data in enumerate(train_data_loader, start=epoch_iter):
+            batch_size = data.shape[0]
             total_steps += batch_size
             epoch_iter += batch_size
+            epoch_iter_number += 1
             data = data.to(device)
             recon_data_mean, recon_data_logvar = model(data, compute_loss=True)
-            batch_size = data.shape[0]
             loss_dict = model.loss
 
             optimizer.zero_grad()
             loss_dict['loss_tot'].backward()
             optimizer.step()
 
-            # Display current loss
-            if total_steps % print_frequency == 0:
-                t = (datetime.datetime.now() - epoch_start_time).seconds / 60
-                loss_info = get_loss_info(epoch, epoch_iter, t, loss_dict)
-                save_log.print_info(loss_info)
-                for info in loss_info:
-                    print('%s' % info)
-                save_log.plot_current_training_loss(loss_dict, total_steps)
-
-            training_total_loss += loss_dict['loss_tot']
-            training_recon_loss += loss_dict['loss_recon']
-            training_KLD_loss += loss_dict['loss_KLD']
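+            # Weight each batch loss by its size so the epoch average below is a
+            # true per-sample mean even when the last batch is smaller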
+            training_total_loss += loss_dict['loss_tot'] * batch_size
+            training_recon_loss += loss_dict['loss_recon'] * batch_size
+            training_KLD_loss += loss_dict['loss_KLD'] * batch_size
 
             # Save latest model
             save_log.save_model(epoch, epoch_iter, total_steps, model.state_dict(), iter_file_path, end_of_epoch=False, save_best=False)
 
-        training_total_loss = training_total_loss * batch_size / train_data_size
-        training_recon_loss = training_recon_loss * batch_size / train_data_size
-        training_KLD_loss = training_KLD_loss * batch_size / train_data_size
+        training_total_loss = training_total_loss / train_data_size
+        training_recon_loss = training_recon_loss / train_data_size
+        training_KLD_loss = training_KLD_loss / train_data_size
+
+        # Display training loss
+        save_log.plot_current_training_loss(loss_dict, total_steps)
 
 
         #Validation
         if validation:
@@ -144,17 +145,18 @@
            val_recon_loss = 0
            val_KLD_loss = 0
        with torch.no_grad():
            for idx, val_data in enumerate(val_data_loader):
+                batch_size = val_data.shape[0]
                val_data = val_data.to(device)
                val_data = torch.autograd.Variable(val_data)
                recon_data_mean, recon_data_logvar = model(val_data, compute_loss=True)
                loss_dict_val = model.loss
-                val_total_loss += loss_dict_val['loss_tot']
-                val_recon_loss += loss_dict_val['loss_recon']
-                val_KLD_loss += loss_dict_val['loss_KLD']
-            val_total_loss = val_total_loss * batch_size/val_data_size
-            val_recon_loss = val_recon_loss * batch_size/val_data_size
-            val_KLD_loss = val_KLD_loss * batch_size/val_data_size
+                val_total_loss += loss_dict_val['loss_tot'] * batch_size
+                val_recon_loss += loss_dict_val['loss_recon'] * batch_size
+                val_KLD_loss += loss_dict_val['loss_KLD'] * batch_size
+            val_total_loss = val_total_loss / val_data_size
+            val_recon_loss = val_recon_loss / val_data_size
+            val_KLD_loss = val_KLD_loss / val_data_size
            avg_val_loss_dict = {'loss_tot': val_total_loss, 'loss_recon': val_recon_loss, 'loss_KLD': val_KLD_loss}
            save_log.plot_current_val_loss(avg_val_loss_dict, total_steps)
            torch.cuda.empty_cache()
diff --git a/utils.py b/utils.py
index beaa0f0d70c9fed069702f0cb91a6692bfc23308..dbcfb03d273281ba1d93a753c824266b8d5382a0 100644
--- a/utils.py
+++ b/utils.py
@@ -23,6 +23,7 @@
 import datetime
 import math
 import os
+from random import random
 import socket
 import motmetrics as mm
 import numpy as np
@@ -87,9 +88,7 @@ def create_dvae_model(cfg, device, save_dir):
 
 def init_training_params(cfg, save_dir, train_data_loader):
     n_epochs = cfg.getint('Training', 'n_epochs')
     early_stop_patience = cfg.getint('Training', 'early_stop_patience')
-    print_frequency = cfg.getint('Training', 'print_frequency')
-    batch_size = cfg.getint('DataFrame', 'batch_size')
-    # ss_step = cfg.getint('Training', 'ss_step')
+    random_seed = cfg.getint('Training', 'random_seed')
     iter_file_path = os.path.join(save_dir, 'iter.txt')
     start_epoch, epoch_iter = 1, 0
@@ -99,12 +98,10 @@ def init_training_params(cfg, save_dir, train_data_loader):
         start_epoch, epoch_iter = np.loadtxt(iter_file_path , delimiter=',', dtype=int)
         print('Resuming from epoch %d at iteration %d' % (start_epoch, epoch_iter))
 
-    print_frequency = abs(print_frequency * batch_size)/math.gcd(print_frequency, batch_size)
     total_steps = (start_epoch - 1) * len(train_data_loader) + epoch_iter
-    total_steps = total_steps // print_frequency * print_frequency
 
-    return n_epochs, early_stop_patience, print_frequency, \
-    batch_size, print_frequency, total_steps, start_epoch, epoch_iter, iter_file_path
+    return n_epochs, early_stop_patience, \
+    total_steps, start_epoch, epoch_iter, iter_file_path, random_seed
 
 def tracking_evaluation_onebatch(gt_seq, normalize_range, acc_list, eta_iter, x_mean_vem_iter):
     total_iter = eta_iter.shape[0]