Mentions légales du service

Skip to content
Snippets Groups Projects
Commit ef6304ce authored by E Madison Bray's avatar E Madison Bray
Browse files

Merge branch 'embray/issue-109' into 'master'

[bug] get rid of all default seeds in different configuration sources

See merge request !123
parents de5f7c6e 23eb4340
No related branches found
No related tags found
1 merge request: !123 — [bug] get rid of all default seeds in different configuration sources
Pipeline #274126 failed
......@@ -233,14 +233,17 @@ Open `my_model/my_model_simulation_config.yml` in your favorite text editor.
By default we see that `n_scenarios` is `20000` with `n_replicates` of 100
per scenario. This means 2 million simulations which is a good number for
training a real model, but will take a very long time for a simple demo.
Change these to some lower numbers like `20` and `2`. The resulting file
(with none of the other settings changed) should look like:
Change these to some lower numbers like `20` and `2`. You may also set the
`seed` option to seed the random number generator for reproducible results.
The resulting file (with none of the other settings changed) should look
like:
```yaml
# my_model/my_model_simulation_config.yml
data_root: .
n_scenarios: 20
n_replicates: 2
seed: 2
...
```
......
......@@ -13,5 +13,4 @@ dataset_splits:
preprocessing:
min_snp: 1
min_indiv: 1
seed: 0
n_workers: 2
......@@ -22,4 +22,3 @@ batch_size: 1
loader_num_workers: 1
use_cuda: true
cuda_device: null
seed: 0
......@@ -25,7 +25,6 @@ ONE_EVENT_CONFIG_SCHEMA = {
'n_scenarios': {'default': 20000}, # noqa: E241
'n_samples': {'default': 50}, # noqa: E241
'n_replicates': {'default': 100},
'seed': {'default': 2}, # noqa: E241
'scenario_params_path': {
'default': 'one_event_params.csv'
},
......@@ -89,8 +88,7 @@ DEFAULT_ONE_EVENT_CONFIG = Config({
'recombination_rate': 1e-8,
'mutation_rate': 1e-8,
'n_min': np.log10(5000),
'n_max': np.log10(50000),
'seed': 2
'n_max': np.log10(50000)
}, Config.from_default('dataset'))
......@@ -132,7 +130,6 @@ DEFAULT_ONE_EVENT_TRAINING_CONFIG = Config({
'n_epochs': 5,
'batch_size': 20,
'evaluation_interval': 10,
'seed': 2,
'loader_num_workers': 4
})
......
......@@ -32,11 +32,12 @@ TEST_SIMULATION_CACHE_KEY = 'dnadna/tests/test_simulation'
# to just a few:
# TODO: Perhaps make these arguments to cached_simulation so different
# simulations of different sizes can be used for different tests
TEST_SIMULATION_CONFIG = one_event.DEFAULT_ONE_EVENT_CONFIG.copy()
TEST_SIMULATION_CONFIG = one_event.DEFAULT_ONE_EVENT_CONFIG.copy(True)
TEST_SIMULATION_CONFIG.update({
'n_scenarios': 15,
'n_replicates': 10,
'segment_length': 1e5
'segment_length': 1e5,
'seed': 2 # For reproducibility between tests
})
......@@ -119,7 +120,7 @@ def cached_simulation(cache, monkeypatch, change_test_dir):
@pytest.fixture
def cached_preprocessor(cached_simulation, monkeypatch):
def cached_preprocessor(cached_simulation):
"""
Companion to `cached_simulation`.
......@@ -139,13 +140,17 @@ def cached_preprocessor(cached_simulation, monkeypatch):
'validation': 1.0 / 3
}
preprocessing_config['preprocessing'].update({
'min_snp': 100
'min_snp': 100,
'seed': 0 # for reproducibility of tests
})
monkeypatch.setitem(training_config, 'dataset', simulation_config)
monkeypatch.setitem(training_config['dataset_transforms'][0]['crop'],
'max_snp', 100)
monkeypatch.setitem(training_config, 'n_epochs', 1)
training_config['dataset'] = simulation_config
training_config['dataset_transforms'][0]['crop']['max_snp'] = 100
training_config['n_epochs'] = 1
# Set the seed so it is consistent between test runs, for historical
# reasons it is set to 2 (see
# https://gitlab.inria.fr/ml_genetics/private/dnadna/-/issues/109)
training_config['seed'] = 2
source = DictSNPSource(scenarios)
simulation = Simulation(simulation_config, source=source,
......
......@@ -57,6 +57,9 @@ def test_simulation_main(tmp_path, monkeypatch):
monkeypatch.setitem(config, 'n_scenarios', n_scenarios)
monkeypatch.setitem(config, 'n_replicates', n_replicates)
# Set the seed so that the test is reproducible
monkeypatch.setitem(config, 'seed', 2)
# Initialize a new simulation configuration and data directory
argv = ['--debug', dataset_name, 'one_event', str(tmp_path)]
assert SimulationInitCommand.main(argv) is None
......
......@@ -49,6 +49,7 @@ def test_random_seed(cached_preprocessor, network, use_cuda):
if net_params is not None:
processed_config['network']['params'] = net_params
processed_config['use_cuda'] = use_cuda
processed_config['seed'] = 0 # for reproducibility of tests
def train():
source = preprocessor.dataset.source
......@@ -289,6 +290,10 @@ class TestLossComputations:
training_config['network']['name'] = 'MLP'
training_config['learned_params'] = param_config
training_config['use_cuda'] = use_cuda
# Set the seed so it is consistent between test runs, for historical
# reasons it is set to 2 (see
# https://gitlab.inria.fr/ml_genetics/private/dnadna/-/issues/109)
training_config['seed'] = 2
preprocessor = DataPreprocessor(training_config,
dataset=dummy_dataset)
......
0% — Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment