diff --git a/declearn/model/torch/_model.py b/declearn/model/torch/_model.py
index f8a5dd6f1396d6d6909edbf070ee616e6cc3a6ec..8eefac9cdb0b111f488c29de373bf3f9b7b911dc 100644
--- a/declearn/model/torch/_model.py
+++ b/declearn/model/torch/_model.py
@@ -17,25 +17,21 @@
 
 """Model subclass to wrap PyTorch models."""
 
-import functools
 import io
+import functools
 import warnings
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple
-
-import functorch  # type: ignore
+from typing import Any, Dict, List, Optional, Set, Tuple
 
-try:
-    import functorch.compile  # type: ignore
-except ModuleNotFoundError:
-    COMPILE_AVAILABLE = False
-else:
-    COMPILE_AVAILABLE = True
 import numpy as np
 import torch
 from typing_extensions import Self  # future: import from typing (py >=3.11)
 
 from declearn.model.api import Model
 from declearn.model.torch.utils import AutoDeviceModule, select_device
+from declearn.model.torch._samplewise import (
+    GetGradientsFunction,
+    build_samplewise_grads_fn,
+)
 from declearn.model.torch._vector import TorchVector
 from declearn.model._utils import raise_on_stringsets_mismatch
 from declearn.typing import Batch
@@ -108,8 +104,6 @@ class TorchModel(Model):
             raise TypeError("'loss' should be a torch.nn.Module instance.")
         loss.reduction = "none"  # type: ignore
         self._loss_fn = AutoDeviceModule(loss, device=device)
-        # Compute and assign a functional version of the model.
-        self._func_model, _ = functorch.make_functional(self._model)
 
     @property
     def device_policy(
@@ -281,8 +275,8 @@
         max_norm: float,
     ) -> TorchVector:
         """Compute and return batch-averaged sample-wise-clipped gradients."""
-        # Compute sample-wise clipped gradients, using functorch.
-        grads = self._compute_samplewise_gradients(batch, max_norm)
+        # Compute sample-wise clipped gradients, using functional torch.
+        grads = self._compute_samplewise_gradients(batch, clip=max_norm)
         # Batch-average the resulting sample-wise gradients.
         return TorchVector(
             {name: tensor.mean(dim=0) for name, tensor in grads.coefs.items()}
@@ -291,92 +285,48 @@
     def _compute_samplewise_gradients(
         self,
         batch: Batch,
-        max_norm: Optional[float],
+        clip: Optional[float],
     ) -> TorchVector:
         """Compute and return stacked sample-wise gradients over a batch."""
-        # Unpack the inputs, gather parameters and list gradients to compute.
         inputs, y_true, s_wght = self._unpack_batch(batch)
-        params = []  # type: List[torch.nn.Parameter]
-        idxgrd = []  # type: List[int]
-        pnames = []  # type: List[str]
-        for index, (name, param) in enumerate(self._model.named_parameters()):
-            params.append(param)
-            if param.requires_grad:
-                idxgrd.append(index + 3)
-                pnames.append(name)
-        # Gather or build the sample-wise clipped gradients computing function.
        grads_fn = self._build_samplewise_grads_fn(
-            idxgrd=tuple(idxgrd),
             inputs=len(inputs),
             y_true=(y_true is not None),
             s_wght=(s_wght is not None),
         )
-        # Call it on the current inputs, with optional clipping.
         with torch.no_grad():
-            grads = grads_fn(inputs, y_true, s_wght, *params, clip=max_norm)
-        # Wrap the results into a TorchVector and return it.
-        return TorchVector(dict(zip(pnames, grads)))
+            grads = grads_fn(inputs, y_true, s_wght, clip=clip)  # type: ignore
+        return TorchVector(grads)
 
     @functools.lru_cache
     def _build_samplewise_grads_fn(
         self,
-        idxgrd: Tuple[int, ...],
         inputs: int,
         y_true: bool,
         s_wght: bool,
-    ) -> Callable[..., List[torch.Tensor]]:
-        """Build a functorch-based sample-wise gradients-computation function.
+    ) -> GetGradientsFunction:
+        """Build an optimized sample-wise gradients-computation function.
 
         This function is cached, i.e. repeated calls with the same parameters
         will return the same object - enabling to reduce runtime costs due to
         building and (when available) compiling the output function.
 
-        Parameters
-        ----------
-        idxgrd: tuple of int
-            Pre-incremented indices of the parameters that require gradients.
-        inputs: int
-            Number of input tensors.
-        y_true: bool
-            Whether a true labels tensor is provided.
-        s_wght: bool
-            Whether a sample weights tensor is provided.
-
         Returns
         -------
-        grads_fn: callable[inputs, y_true, s_wght, *params, /, clip]
-            Functorch-optimized function to efficiently compute sample-
-            wise gradients based on batched inputs, and optionally clip
-            them based on a maximum l2-norm value `clip`.
+        grads_fn: callable[[inputs, y_true, s_wght, clip], grads]
+            Function to efficiently compute and return sample-wise gradients
+            wrt trainable model parameters based on a batch of inputs, with
+            opt. clipping based on a maximum l2-norm value `clip`.
+
+        Note
+        ----
+        The underlying backend code depends on your Torch version, so as to
+        enable optimizing operations using either `functorch` for torch 1.1X
+        or `torch.func` for torch 2.X.
         """
-
-        def forward(inputs, y_true, s_wght, *params):
-            """Conduct the forward pass in a functional way."""
-            y_pred = self._func_model(params, *inputs)
-            return self._compute_loss(y_pred, y_true, s_wght)
-
-        def grads_fn(inputs, y_true, s_wght, *params, clip=None):
-            """Compute gradients and optionally clip them."""
-            gfunc = functorch.grad(forward, argnums=idxgrd)
-            grads = gfunc(inputs, y_true, None, *params)
-            if clip:
-                for grad in grads:
-                    # future: use torch.linalg.norm when supported by functorch
-                    norm = torch.norm(grad, p=2, keepdim=True)
-                    # false-positive; pylint: disable=no-member
-                    grad.mul_(torch.clamp(clip / norm, max=1))
-                    if s_wght is not None:
-                        grad.mul_(s_wght.to(grad.device))
-            return grads
-
-        # Wrap the former function to compute and clip sample-wise gradients.
-        in_axes = [[0] * inputs, 0 if y_true else None, 0 if s_wght else None]
-        in_axes.extend([None] * sum(1 for _ in self._model.parameters()))
-        grads_fn = functorch.vmap(grads_fn, tuple(in_axes))
-        # Compile the resulting function to decrease runtime costs.
-        if not COMPILE_AVAILABLE:
-            return grads_fn
-        return functorch.compile.aot_function(grads_fn, functorch.compile.nop)
+        return build_samplewise_grads_fn(
+            self._model, self._loss_fn, inputs, y_true, s_wght
+        )
 
     def apply_updates(
         self,
diff --git a/declearn/model/torch/_samplewise/__init__.py b/declearn/model/torch/_samplewise/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1448a18bb210279d99343f8637cc8d7587cf12a5
--- /dev/null
+++ b/declearn/model/torch/_samplewise/__init__.py
@@ -0,0 +1,78 @@
+# coding: utf-8
+
+# Copyright 2023 Inria (Institut National de Recherche en Informatique
+# et Automatique)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""Torch-version-dependent code to compute sample-wise gradients.""" + +from typing import Callable, Dict, List, Optional + +import torch + +from .shared import GetGradientsFunction + +if torch.__version__.startswith("2."): + from .torchfunc import build_samplewise_grads_fn_backend +elif torch.__version__.startswith("1.1"): + from .functorch import build_samplewise_grads_fn_backend +else: + # pragma: no cover + raise ImportError(f"Unsupported Torch version: {torch.__version__}") + + +__all__ = [ + "GetGradientsFunction", + "build_samplewise_grads_fn", +] + + +def build_samplewise_grads_fn( + model: torch.nn.Module, + loss_fn: torch.nn.Module, + inputs: int, + y_true: bool, + s_wght: bool, +) -> GetGradientsFunction: + """Build a torch-specific sample-wise gradients-computation function. + + Parameters + ---------- + model: torch.nn.Module + Model that is to be trained. + loss_fn: torch.nn.Module + Loss-computing module, returning sample-wise loss values. + inputs: int + Number of input tensors. + y_true: bool + Whether a true labels tensor is provided. + s_wght: bool + Whether a sample weights tensor is provided. + + Returns + ------- + grads_fn: callable[[inputs, y_true, s_wght, clip], grads] + Function that efficiently computes and returns sample-wise gradients + wrt trainable model parameters based on a batch of inputs, with opt. + clipping based on a maximum l2-norm value `clip`. + + Note + ---- + The underlying backend code depends on your Torch version, so as to + enable optimizing operations using either `functorch` for torch 1.1X + or `torch.func` for torch 2.X. + """ + return build_samplewise_grads_fn_backend( + model, loss_fn, inputs, y_true, s_wght + ) diff --git a/declearn/model/torch/_samplewise/functorch.py b/declearn/model/torch/_samplewise/functorch.py new file mode 100644 index 0000000000000000000000000000000000000000..fc8e613b77a3e925676001ac05df5bc347e0e1fd --- /dev/null +++ b/declearn/model/torch/_samplewise/functorch.py @@ -0,0 +1,93 @@ +# coding: utf-8 + +# Copyright 2023 Inria (Institut National de Recherche en Informatique +# et Automatique) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Implementation of `build_samplewise_grads_fn` for Torch 2.0.""" + +from typing import List, Tuple + +# fmt: off +import functorch # type: ignore +try: + import functorch.compile # type: ignore + COMPILE_AVAILABLE = True +except ModuleNotFoundError: + # pragma: no cover + COMPILE_AVAILABLE = False +import torch +# fmt: on + +from declearn.model.torch._samplewise.shared import ( + GetGradientsFunction, + clip_and_scale_grads_inplace, +) + +__all__ = [ + "build_samplewise_grads_fn_backend", +] + + +def build_samplewise_grads_fn_backend( + model: torch.nn.Module, + loss_fn: torch.nn.Module, + inputs: int, + y_true: bool, + s_wght: bool, +) -> GetGradientsFunction: + """Implementation of `build_samplewise_grads_fn` for Torch 1.1X.""" + + func_model, _ = functorch.make_functional(model) + + def run_forward(inputs, y_true, s_wght, *params): + """Run the forward pass in a functional way.""" + y_pred = func_model(params, *inputs) + s_loss = loss_fn(y_pred, y_true) + if s_wght is not None: + s_loss.mul_(s_wght.to(s_loss.device)) + return s_loss.mean() + + def grads_fn(inputs, y_true, s_wght, clip=None): + """Compute gradients and optionally clip them.""" + params, idxgrd, pnames = get_params(model) + gfunc = functorch.grad(run_forward, argnums=tuple(idxgrd)) + grads = gfunc(inputs, y_true, (None if clip else s_wght), *params) + if clip: + clip_and_scale_grads_inplace(grads, clip, s_wght) + return dict(zip(pnames, grads)) + + # Wrap the former function to compute and clip sample-wise gradients. + in_dims = ([0] * inputs, 0 if y_true else None, 0 if s_wght else None) + grads_fn = functorch.vmap(grads_fn, in_dims) + # Compile the resulting function to decrease runtime costs. + if not COMPILE_AVAILABLE: + # pragma: no cover + return grads_fn + return functorch.compile.aot_function(grads_fn, functorch.compile.nop) + + +def get_params( + model: torch.nn.Module, +) -> Tuple[List[torch.nn.Parameter], List[int], List[str]]: + """Return a model's parameters and the index and name of trainable ones.""" + params = [] # type: List[torch.nn.Parameter] + idxgrd = [] # type: List[int] + pnames = [] # type: List[str] + for idx, (name, param) in enumerate(model.named_parameters()): + params.append(param) + if param.requires_grad: + idxgrd.append(idx + 3) + pnames.append(name) + return params, idxgrd, pnames diff --git a/declearn/model/torch/_samplewise/shared.py b/declearn/model/torch/_samplewise/shared.py new file mode 100644 index 0000000000000000000000000000000000000000..451ae7c11998c3d337bffe988df2f6e131f5e539 --- /dev/null +++ b/declearn/model/torch/_samplewise/shared.py @@ -0,0 +1,56 @@ +# coding: utf-8 + +# Copyright 2023 Inria (Institut National de Recherche en Informatique +# et Automatique) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Shared code for torch-version-dependent backend code.""" + +from typing import Callable, Dict, Iterable, List, Optional + +import torch + +__all__ = [ + "GetGradientsFunction", + "clip_and_scale_grads_inplace", +] + + +GetGradientsFunction = Callable[ + [ + List[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[float], + ], + Dict[str, torch.Tensor], +] +"""Signature for sample-wise gradients computation functions.""" + + +def clip_and_scale_grads_inplace( + grads: Iterable[torch.Tensor], + clip: float, + wght: Optional[torch.Tensor] = None, +) -> None: + """Clip a collection of tensors in-place, based on their euclidean norm. + + Also apply an optional weight tensor to scale the clipped gradients. + """ + for grad in grads: + norm = torch.norm(grad, p=2, keepdim=True) + # false-positive; pylint: disable=no-member + grad.mul_(torch.clamp(clip / norm, max=1)) + if wght is not None: + grad.mul_(wght.to(grad.device)) diff --git a/declearn/model/torch/_samplewise/torchfunc.py b/declearn/model/torch/_samplewise/torchfunc.py new file mode 100644 index 0000000000000000000000000000000000000000..d330d4f9f59647a48198b80b231ac5612ce5a6f0 --- /dev/null +++ b/declearn/model/torch/_samplewise/torchfunc.py @@ -0,0 +1,76 @@ +# coding: utf-8 + +# Copyright 2023 Inria (Institut National de Recherche en Informatique +# et Automatique) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implementation of `build_samplewise_grads_fn` for Torch 2.0.""" + +from typing import Dict, Tuple + +import torch + +from declearn.model.torch._samplewise.shared import ( + GetGradientsFunction, + clip_and_scale_grads_inplace, +) + +__all__ = [ + "build_samplewise_grads_fn_backend", +] + + +def build_samplewise_grads_fn_backend( + model: torch.nn.Module, + loss_fn: torch.nn.Module, + inputs: int, + y_true: bool, + s_wght: bool, +) -> GetGradientsFunction: + """Implementation of `build_samplewise_grads_fn` for Torch 2.0.""" + + def run_forward(params, frozen, inputs, y_true, s_wght): + """Run the forward pass in a functional way.""" + y_pred = torch.func.functional_call(model, [params, frozen], *inputs) + s_loss = loss_fn(y_pred, y_true) + if s_wght is not None: + s_loss.mul_(s_wght.to(s_loss.device)) + return s_loss.mean() + + get_grads = torch.func.grad(run_forward, argnums=0) + + def get_clipped_grads(inputs, y_true, s_wght, clip=None): + """Compute gradients and optionally clip them.""" + params, frozen = get_params(model) + grads = get_grads( + params, frozen, inputs, y_true, None if clip else s_wght + ) + if clip: + clip_and_scale_grads_inplace(grads.values(), clip, s_wght) + return grads + + # Wrap the former function to compute and clip sample-wise gradients. 
+    in_dims = ([0] * inputs, 0 if y_true else None, 0 if s_wght else None)
+    return torch.func.vmap(get_clipped_grads, in_dims)
+
+
+def get_params(
+    model: torch.nn.Module,
+) -> Tuple[Dict[str, torch.nn.Parameter], Dict[str, torch.nn.Parameter]]:
+    """Return a model's parameters, split between trainable and frozen ones."""
+    params = {}  # type: Dict[str, torch.nn.Parameter]
+    frozen = {}  # type: Dict[str, torch.nn.Parameter]
+    for name, param in model.named_parameters():
+        (params if param.requires_grad else frozen)[name] = param
+    return params, frozen
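
For reviewers who want to exercise the new helper outside of `TorchModel`, here is a minimal, illustrative sketch (not part of the patch). It builds a sample-wise gradients function for a made-up single-input linear model and computes per-sample clipped gradients, mirroring the call pattern used in `TorchModel._compute_samplewise_gradients`. The model, loss and shapes are invented for the example; the backend (functorch on torch 1.1X, `torch.func` on torch 2.X) is selected automatically at import time.

```python
import torch

# Private module introduced by this patch; TorchModel is the intended entry point.
from declearn.model.torch._samplewise import build_samplewise_grads_fn

# Toy regression model and sample-wise loss (TorchModel enforces reduction="none").
model = torch.nn.Linear(4, 1)
loss_fn = torch.nn.MSELoss(reduction="none")

# Build the gradients function for one input tensor, with labels, no sample weights.
grads_fn = build_samplewise_grads_fn(
    model, loss_fn, inputs=1, y_true=True, s_wght=False
)

# Compute per-sample gradients over a batch of 8 samples, clipped to a max l2-norm of 1.
x = torch.randn(8, 4)
y = torch.randn(8, 1)
with torch.no_grad():
    grads = grads_fn([x], y, None, clip=1.0)

for name, grad in grads.items():
    # Each entry stacks one gradient slice per sample (leading dimension of 8).
    print(name, tuple(grad.shape))
```

Each returned entry stacks one gradient slice per sample; the wrapping `TorchModel` code then batch-averages these stacked gradients, as shown in the `_model.py` hunks above.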