diff --git a/dnadna/data_loader.py b/dnadna/data_loader.py
index f16a74dcadc83685c46eaadb67c5c2828cce87df..9f9a8588e2752a3d9945ce47b3529c4e421cc03f 100644
--- a/dnadna/data_loader.py
+++ b/dnadna/data_loader.py
@@ -164,6 +164,196 @@ def generate_loaders(model_name, param_2_learn,
                      num_simulations, filename_kwargs, num_indiv,
                      subsampling, rotate, concat, **kwargs):
+    """
+    Returns a training data `DataLoader` and a validation `DataLoader` for
+    data from a single model dataset.
+
+    Parameters
+    ----------
+    model_name : str
+        The name of the model; this is used only for generating sample
+        filenames.
+    param_2_learn : dict
+        Maps parameter types (keys) to lists of parameter names (values),
+        according to the task they are used for ("regression" or
+        "classification").
+    preprocessed_scenario_params_path : str
+        Filename of the pre-processed scenario parameters CSV file; this is
+        read into a DataFrame.
+    simu_data_path : str
+        Directory name of the root of the simulated data for the model in
+        the DNADNA directory structure.
+    use_cuda : bool
+        Whether or not we are using CUDA. For this function this just
+        determines whether or not the returned `DataLoader`s should use
+        pinned memory buffers for tensors.
+    loader_num_workers : int
+        Passed directly to the ``num_workers`` parameter of the
+        `DataLoader`s.
+    SNP_max : int
+        Used to limit the number of SNPs to load from each sample. This is
+        a parameter of `DNADataset`.
+    num_replicates : int
+        Not used.
+    real_num_scenario : int
+        The original number of scenarios in the model dataset (before
+        pre-processing).
+    num_simulations : int
+        The total number of expected simulated SNP samples (used just as a
+        sanity check).
+    filename_kwargs : dict
+        Additional keyword arguments to pass to ``generate_filename()``.
+    num_indiv : int
+        Not used.
+    subsampling : int, tuple, or None
+        If not None, used as an argument for a `Subsample` transformation
+        on the SNP matrix. See `Subsample` for more details on this
+        argument.
+    rotate : None or arbitrary
+        If not None, apply random rotations along the SNP axis. See
+        `Rotate`.
+    concat : bool
+        Parameter for `DNADataset`: If True, SNP positions are concatenated
+        with the SNP matrix into a single tensor. If False, the SNP matrix
+        is instead multiplied by the position array.
+
+    Additional keyword arguments are ignored.
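+
+    For example, ``param_2_learn`` might look something like the following
+    (the parameter names here are purely illustrative, not taken from a
+    real model)::
+
+        param_2_learn = {
+            "regression": ["recombination_rate", "mutation_rate"],
+            "classification": ["scenario_type"],
+        }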
+
+    TODO
+    ----
+    * The ``model_name`` parameter can be removed once dataset loading is
+      better generalized; that, or it can be read from the simulation
+      parameters.
+
+    * ``param_2_learn`` is maybe not a great name; instead let's call it
+      ``learned_params`` or something similar (at the very least it should
+      not be singular).
+
+    * ``preprocessed_scenario_params_path`` could be generalized to just
+      take an existing DataFrame; how we load the scenario params from the
+      filesystem should be abstracted to a higher-level interface.
+
+    * For that matter, for the purpose of the datasets, we only take a
+      slice of the scenario params using the list of parameters we want to
+      learn, taken from ``param_2_learn``. Since this is the *only* use of
+      ``param_2_learn`` in the function, that part could be abstracted out
+      too, and the function should assume that the given DataFrame contains
+      only the parameters of interest already.
+
+    * ``simu_data_path`` is just passed directly down to the initialization
+      of the `DNADataset` instances. Thus, along with ``model_name``, it
+      can be abstracted away as we work toward better abstraction of
+      Dataset serialization.
+
+    * ``use_cuda`` could probably be made optional, and even abstracted
+      away into a dict of parameters for passing through to `DataLoader`
+      instantiation, since that's all it's used for. The same goes for
+      ``batch_size`` and ``loader_num_workers`` (though the former is
+      admittedly a bit more "fundamental" to the training model, given that
+      it is a hyperparameter of interest). Note also that
+      ``loader_num_workers`` is prefixed with ``loader_`` to indicate that
+      it is a `DataLoader` parameter, but the others are not; this is
+      inconsistent if nothing else. They should also be made optional,
+      since `DataLoader` already has reasonable defaults, so there is no
+      reason to insist that these be provided (we could, however, make the
+      defaults different from those of `DataLoader` itself if desired).
+
+    * ``SNP_max`` appears to be broken in this version of the code--the
+      inference module, for example, does not actually pass a value for
+      this parameter when calling this function. Apparently it was at
+      least at one point assumed to be one of the inference parameters, so
+      the code will break if there is not an explicit ``"SNP_max"``
+      property in the given inference parameters file (it's something you'd
+      expect a default to be given for, at least in the pre-processed
+      inference parameters). That said, if used at all, it should probably
+      have a default of `None`. In the "wip_hyperopt" branch this appears
+      to have been replaced with a parameter named ``SNP_min`` which is
+      used in the same way, which to me would seem to make it a misnomer
+      (as it represents the *maximum* number of SNP locations to use from
+      each sample...).
+
+    * ``num_replicates`` can obviously go away (in the version from the
+      "wip_hyperopt" branch it already has), as "num_replicates" is now a
+      column in the scenario params, and in principle each scenario can
+      have a different number of replicates (especially after
+      pre-processing, which could in principle remove some replicates; I
+      don't think it does this currently, but it might in the future?).
+
+    * ``real_num_scenario`` is superfluous--it's used only by the
+      ``generate_filename()`` utility for determining the correct amount
+      of zero-padding for the scenario index in sample filenames. Smarter
+      sample file loading will render it unnecessary.
+
+    * ``num_simulations`` is superfluous--it's passed as a sanity check to
+      make sure the number of simulations loaded (after looping over all
+      replicates for all pre-processed scenarios) matches a
+      ``"num_simulations"`` property in the pre-processed inference params.
+      But this is redundant because (num_scenarios x num_replicates)
+      already gives the number of simulations, unless the pre-processed
+      scenario params were corrupted or tampered with in some way, in
+      which case the only result is that there would be fewer actual
+      simulations. Rather than keep a value that can fall out of sync with
+      the inference parameters metadata, it's probably fine to just not
+      have it at all. It's certainly not needed in this function, as this
+      is a sanity check that could just as easily be performed externally
+      (in fact, this parameter has already been removed from this function
+      in the "wip_hyperopt" branch).
+
+    * ``filename_kwargs`` can go away once we add smarter sample file
+      loading; at most this might be replaced with a function, or an
+      instance of a class, that implements the right filename scheme for
+      samples.
+
+    * ``num_indiv`` is not used and should obviously be removed (it
+      already has been in the "wip_hyperopt" branch). It appears that it
+      might be one of the simulation parameters, but even that is not
+      clear...
+
+    * ``subsampling`` should have a ``None`` default, and could perhaps
+      even be an instance of a callable (for a custom sub-sampling
+      implementation replacing the default `Subsample`).
+
+    * ``rotate`` should obviously just be a boolean, unless we also want
+      to allow it to be replaced with a different callable. In fact it's
+      not clear whether we shouldn't also allow other arbitrary transforms
+      to be passed through, but that might be a case of YAGNI for now.
+
+    * ``concat`` might be worth keeping, but could have a default value
+      (`DNADataset` uses `True` by default). ``"concat"`` should be one of
+      the standard inference parameters, but it is not yet explicitly
+      documented anywhere that I can find. It should be noted that it does
+      not exist in the "wip_hyperopt" branch, so either it was determined
+      to be not useful, or this is a disconnect between the branches that
+      must be resolved (it looks like it is a feature added quite recently
+      by Jean).
+
+    * After further cleanup the unused ``**kwargs`` should also be
+      removed; currently the way this function is used in practice is that
+      all inference and simulation params are just dumped into it, and any
+      unrecognized parameters are ignored. That can be changed once we
+      clean this up.
+
+    TODO Summary
+    ^^^^^^^^^^^^
+
+    After additional refactoring to how data is loaded, the only existing
+    arguments to this function that are likely to remain useful are:
+
+    * ``preprocessed_scenario_params_path`` (but replaced with just a
+      ``scenario_params`` argument that would likely be an already-loaded
+      DataFrame with the desired parameters picked out)
+
+    * ``batch_size``, with a default value of 1, though it's important
+      enough that it's probably worth keeping as a separate parameter.
+
+    * *Possibly* ``use_cuda`` and ``loader_num_workers``, but possibly
+      renamed or bundled together into a collection of keyword arguments
+      to pass through to the `DataLoader`s.
+
+    * ``subsampling``, ``rotate``, and ``concat``, but reworked as
+      described above.
+
+    * A new argument representing a generalization of a data source. This
+      might take the form of a `Dataset` class to use, or even something a
+      little more abstracted from that, which generalizes away the scheme
+      for mapping a ``(scenario, replicate)`` pair to a data source for a
+      simulation.
+
+    * Possibly additional arguments preserved from the "wip_hyperopt"
+      branch, though it's not clear yet how specialized they are and
+      whether or not they belong directly in this function.
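+
+    Taken together, a hypothetical post-refactoring signature (purely
+    illustrative; none of these names are a committed API) might look
+    something like::
+
+        def generate_loaders(scenario_params, dataset, batch_size=1,
+                             subsampling=None, rotate=False, concat=True,
+                             **loader_kwargs):
+            ...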
+    """
     scenario_params_preprocessed = pd.read_csv(
         preprocessed_scenario_params_path, index_col="scenario_idx")
     num_scenario = len(scenario_params_preprocessed)
@@ -173,8 +363,19 @@ def generate_loaders(model_name, param_2_learn,
     validation_indices = set()
     i = 0
     for scenario in scenario_params_preprocessed.itertuples():
+        # Iterate over all pre-processed scenario parameters (each
+        # iteration is a single scenario).
         if (i % 5000) == 0:
+            # Debug log output every 5000 scenarios.
+            # TODO: This message could be made clearer; we could also
+            # accept a callback to update a progress bar when running as a
+            # script or in a notebook.
             logging.debug(scenario)
+
+        # Iterate through all replicates of the scenario. Thus the index
+        # "i" just runs across all replicates of all scenarios (this
+        # double-loop could be squashed into a single sample iterator,
+        # since apparently we don't care about distinguishing between
+        # scenarios).
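+        # A possible sketch of such an iterator (untested; assumes all we
+        # need is a flat stream of (scenario_id, replicate_index) pairs):
+        #
+        #     samples = ((s.Index, k)
+        #                for s in scenario_params_preprocessed.itertuples()
+        #                for k in range(s.num_replicates))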
         for k in range(scenario.num_replicates):
             indices[i] = (scenario.Index,