Mentions légales du service

Skip to content
Snippets Groups Projects
Commit ef6304ce authored by E Madison Bray's avatar E Madison Bray
Browse files

Merge branch 'embray/issue-109' into 'master'

[bug] get rid of all default seeds in different configuration sources

See merge request !123
parents de5f7c6e 23eb4340
No related branches found
No related tags found
1 merge request: !123 — [bug] get rid of all default seeds in different configuration sources
Pipeline #274126 failed
......@@ -233,14 +233,17 @@ Open `my_model/my_model_simulation_config.yml` in your favorite text editor.
By default we see that `n_scenarios` is `20000` with `n_replicates` of 100
per scenario. This means 2 million simulations which is a good number for
training a real model, but will take a very long time for a simple demo.
Change these to some lower numbers like `20` and `2`. The resulting file
(with none of the other settings changed) should look like:
Change these to some lower numbers like `20` and `2`. You may also set the
`seed` option to seed the random number generator for reproducible results.
The resulting file (with none of the other settings changed) should look
like:
```yaml
# my_model/my_model_simulation_config.yml
data_root: .
n_scenarios: 20
n_replicates: 2
seed: 2
...
```
......
......@@ -13,5 +13,4 @@ dataset_splits:
preprocessing:
min_snp: 1
min_indiv: 1
seed: 0
n_workers: 2
......@@ -22,4 +22,3 @@ batch_size: 1
loader_num_workers: 1
use_cuda: true
cuda_device: null
seed: 0
......@@ -25,7 +25,6 @@ ONE_EVENT_CONFIG_SCHEMA = {
'n_scenarios': {'default': 20000}, # noqa: E241
'n_samples': {'default': 50}, # noqa: E241
'n_replicates': {'default': 100},
'seed': {'default': 2}, # noqa: E241
'scenario_params_path': {
'default': 'one_event_params.csv'
},
......@@ -89,8 +88,7 @@ DEFAULT_ONE_EVENT_CONFIG = Config({
'recombination_rate': 1e-8,
'mutation_rate': 1e-8,
'n_min': np.log10(5000),
'n_max': np.log10(50000),
'seed': 2
'n_max': np.log10(50000)
}, Config.from_default('dataset'))
......@@ -132,7 +130,6 @@ DEFAULT_ONE_EVENT_TRAINING_CONFIG = Config({
'n_epochs': 5,
'batch_size': 20,
'evaluation_interval': 10,
'seed': 2,
'loader_num_workers': 4
})
......
......@@ -32,11 +32,12 @@ TEST_SIMULATION_CACHE_KEY = 'dnadna/tests/test_simulation'
# to just a few:
# TODO: Perhaps make these arguments to cached_simulation so different
# simulations of different sizes can be used for different tests
TEST_SIMULATION_CONFIG = one_event.DEFAULT_ONE_EVENT_CONFIG.copy()
TEST_SIMULATION_CONFIG = one_event.DEFAULT_ONE_EVENT_CONFIG.copy(True)
TEST_SIMULATION_CONFIG.update({
'n_scenarios': 15,
'n_replicates': 10,
'segment_length': 1e5
'segment_length': 1e5,
'seed': 2 # For reproducibility between tests
})
......@@ -119,7 +120,7 @@ def cached_simulation(cache, monkeypatch, change_test_dir):
@pytest.fixture
def cached_preprocessor(cached_simulation, monkeypatch):
def cached_preprocessor(cached_simulation):
"""
Companion to `cached_simulation`.
......@@ -139,13 +140,17 @@ def cached_preprocessor(cached_simulation, monkeypatch):
'validation': 1.0 / 3
}
preprocessing_config['preprocessing'].update({
'min_snp': 100
'min_snp': 100,
'seed': 0 # for reproducibility of tests
})
monkeypatch.setitem(training_config, 'dataset', simulation_config)
monkeypatch.setitem(training_config['dataset_transforms'][0]['crop'],
'max_snp', 100)
monkeypatch.setitem(training_config, 'n_epochs', 1)
training_config['dataset'] = simulation_config
training_config['dataset_transforms'][0]['crop']['max_snp'] = 100
training_config['n_epochs'] = 1
# Set the seed so it is consistent between test runs, for historical
# reasons it is set to 2 (see
# https://gitlab.inria.fr/ml_genetics/private/dnadna/-/issues/109)
training_config['seed'] = 2
source = DictSNPSource(scenarios)
simulation = Simulation(simulation_config, source=source,
......
......@@ -57,6 +57,9 @@ def test_simulation_main(tmp_path, monkeypatch):
monkeypatch.setitem(config, 'n_scenarios', n_scenarios)
monkeypatch.setitem(config, 'n_replicates', n_replicates)
# Set the seed so that the test is reproducible
monkeypatch.setitem(config, 'seed', 2)
# Initialize a new simulation configuration and data directory
argv = ['--debug', dataset_name, 'one_event', str(tmp_path)]
assert SimulationInitCommand.main(argv) is None
......
......@@ -49,6 +49,7 @@ def test_random_seed(cached_preprocessor, network, use_cuda):
if net_params is not None:
processed_config['network']['params'] = net_params
processed_config['use_cuda'] = use_cuda
processed_config['seed'] = 0 # for reproducibility of tests
def train():
source = preprocessor.dataset.source
......@@ -289,6 +290,10 @@ class TestLossComputations:
training_config['network']['name'] = 'MLP'
training_config['learned_params'] = param_config
training_config['use_cuda'] = use_cuda
# Set the seed so it is consistent between test runs, for historical
# reasons it is set to 2 (see
# https://gitlab.inria.fr/ml_genetics/private/dnadna/-/issues/109)
training_config['seed'] = 2
preprocessor = DataPreprocessor(training_config,
dataset=dummy_dataset)
......
0% — Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment