Pytorch：torch.optim详解_代码007(未授权)

本文介绍: 本篇笔记主要介绍 torch.optim 模块，并记录学习过程。在深度学习中，我们通常会使用优化算法来调整神经网络的权重和偏置，以便模型能够更好地拟合训练数据。torch.optim 是 PyTorch 中的一个模块，它提供了各种优化算法的实现，用于自动化地优化神经网络的参数。换句话说，torch.optim 可以帮助我们让模型更好地学习，从而提高性能。

本篇笔记主要介绍 torch.optim 模块，记录学习过程

在深度学习中，我们通常会使用优化算法来调整神经网络的权重和偏差，以便模型能够更好地拟合训练数据。torch.optim是PyTorch中的一个模块，它提供了各种优化算法的实现，用于自动化地优化神经网络的参数。换句话说，torch.optim可以帮助我们让模型更好地学习，从而提高性能。

简单使用示例如下所示：

import torch
import numpy as np
import warnings

# Silence every warning for the duration of this demo.
warnings.filterwarnings('ignore')

# Data: 2000 evenly spaced samples on [-pi, pi] and their sine values.
x = torch.linspace(-np.pi, np.pi, 2000)
y = torch.sin(x)

# Features: x raised to the powers 1, 2 and 3 (one column per exponent).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# Model: a single linear layer over the three features; Flatten(0, 1) merges
# the first two dimensions of the layer's output.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1),
)

# Loss: summed squared error between prediction and target.
loss_fn = torch.nn.MSELoss(reduction='sum')

# The learning rate controls the step size of every parameter update.
learning_rate = 1e-3

# Optimizer: RMSprop over all parameters of the model.
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

# Training loop: 1000 iterations, reporting the loss every 100th one.
for t in range(1, 1001):
    y_pred = model(xx)
    loss = loss_fn(y_pred, y)
    if not t % 100:
        print('No.{: 5d}, loss: {:.6f}'.format(t, loss.item()))

    optimizer.zero_grad()  # clear gradients left over from the last iteration
    loss.backward()        # back-propagate to compute fresh gradients
    optimizer.step()       # let the optimizer update the parameters

所有优化器都继承自父类 Optimizer。PyTorch 提供的优化器包括 SGD、Adam、AdamW、RMSprop、Adagrad、Adadelta、Adamax、ASGD、LBFGS、Rprop、SparseAdam 等:
(具体用法和优缺点，后续更新。。。。。。。。。。。)

Optimizer 是所有优化器的父类，它主要有如下公共方法:

add_param_group(param_group): 添加模型可学习参数组
step(closure): 进行一次参数更新
zero_grad(): 清空上次迭代记录的梯度信息
state_dict(): 返回 dict 结构的参数状态
load_state_dict(state_dict): 加载 dict 结构的参数状态

class Optimizer(object):
    """Base class that stores hyper-parameter defaults and parameter groups.

    The constructor normalises ``params`` into a list of group dicts and
    registers each one through ``self.add_param_group``, which must be
    available on the instance (it is not defined in this class body).
    """

    def __init__(self, params, defaults):
        """Validate ``params`` and register every parameter group.

        Arguments:
            params (iterable): either an iterable of Tensors or an iterable of
                dicts, each holding a ``'params'`` entry.
            defaults (dict): default hyper-parameters supplied by the subclass
                for all parameter groups.

        Raises:
            TypeError: if ``params`` is a single Tensor rather than an iterable.
            ValueError: if ``params`` yields no elements.
        """
        # Imported locally so this block does not depend on a file-level import.
        from collections import defaultdict

        # Dict supplied by the subclass: default hyper-parameters for all groups.
        self.defaults = defaults

        # A bare Tensor is rejected; an iterable of Tensors or dicts is required.
        if isinstance(params, torch.Tensor):
            raise TypeError("params argument given to the optimizer should be "
                            "an iterable of Tensors or dicts, but got " +
                            torch.typename(params))

        # Per-parameter optimizer state; step() and state_dict() read
        # self.state, so it has to exist before either is called.
        self.state = defaultdict(dict)
        self.param_groups = []

        # list() drains generators such as model.parameters().
        param_groups = list(params)
        # Fail with a clear message instead of an IndexError on param_groups[0].
        if len(param_groups) == 0:
            raise ValueError("optimizer got an empty parameter list")
        # A flat list of Tensors becomes a single parameter group.
        if not isinstance(param_groups[0], dict):
            param_groups = [{'params': param_groups}]

        for param_group in param_groups:
            # Register the group of learnable parameters.
            self.add_param_group(param_group)

# One form of `params`: pass the network's parameter iterator directly.
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)

注意 __init__ 方法中的 param_groups = list(params) 得到的是一个新的局部变量 param_groups；而 self.param_groups 是优化器实例的属性，初始为空列表，之后由 add_param_group 把校验并补全超参后的参数组逐个追加进去。

param_groups = list(params)，list 可以把生成器的元素都取出来，所以，很明显，param_groups就是一个Parameter类对象的列表，里面的元素是每个网络层的参数weight和bias（如果有）。

# If the first element is not a dict, the caller passed a flat list of
# parameters; wrap that list into a single parameter-group dict.
if not isinstance(param_groups[0], dict):
   param_groups = [{'params': param_groups}]

# Hand every group to add_param_group, one at a time.
for param_group in param_groups:
    self.add_param_group(param_group)

将 param_groups 中的每个元素依次传给 self.add_param_group 方法，由该方法把它追加到 self.param_groups 列表中。此时的 param_groups 里只有一个元素 {"params": [参数]}。

该方法在初始化函数中用到，主要用来向 self.param_groups添加不同分组的模型参数

def add_param_group(self, param_group):
    r"""Validate a parameter group and append it to ``self.param_groups``.

    Arguments:
        param_group (dict): must contain ``'params'`` (a Tensor or an ordered
            iterable of Tensors) and may contain group-specific options.

    Raises:
        AssertionError: if ``param_group`` is not a dict.
        TypeError: if ``'params'`` is a set or contains a non-Tensor.
        ValueError: if a Tensor is not a leaf, a required option is missing,
            or a Tensor already belongs to a registered group.
    """
    assert isinstance(param_group, dict), "param group must be a dict"

    # Normalise 'params' into a list; sets are refused because their order
    # is not stable between runs.
    raw = param_group['params']
    if isinstance(raw, set):
        raise TypeError('optimizer parameters need to be organized in ordered collections, but '
                        'the ordering of tensors in sets will change between runs. Please use a list instead.')
    param_group['params'] = [raw] if isinstance(raw, torch.Tensor) else list(raw)

    # Every entry has to be a leaf Tensor.
    for candidate in param_group['params']:
        if not isinstance(candidate, torch.Tensor):
            raise TypeError("optimizer can only optimize Tensors, "
                            "but one of the params is " + torch.typename(candidate))
        if not candidate.is_leaf:
            raise ValueError("can't optimize a non-leaf Tensor")

    # Fill in hyper-parameters the group did not set from self.defaults;
    # a default equal to the `required` sentinel must be given explicitly.
    for key, fallback in self.defaults.items():
        if fallback is required and key not in param_group:
            raise ValueError("parameter group didn't specify a value of required optimization parameter "+key)
        param_group.setdefault(key, fallback)

    # Duplicates inside one group only trigger a warning.
    members = param_group['params']
    if len(set(members)) != len(members):
        warnings.warn("optimizer contains a parameter group with duplicate parameters; "
                      "in future, this will cause an error; "
                      "see github.com/pytorch/pytorch/issues/40967 for more information", stacklevel=3)

    # A Tensor shared with an already registered group is an error.
    registered = {tensor for group in self.param_groups for tensor in group['params']}
    if registered.intersection(members):
        raise ValueError("some parameters appear in more than one parameter group")

    self.param_groups.append(param_group)

# Record the identity (id) of every parameter in the final fc layer so the
# remaining parameters can be selected by exclusion.
fcParamsId = list(map(id, resnet18_ft.fc.parameters()))
features_params = filter(lambda p: id(p) not in fcParamsId, resnet18_ft.parameters())

# Another form of `params`: a list of group dicts. The first group overrides
# the learning rate; the second falls back to the keyword defaults below.
# Defaults must be passed as keyword arguments (`lr=LR`); the dict-style
# `'lr': LR` is a syntax error inside a call.
optimizer = optim.SGD(
    [
        {'params': features_params, 'lr': LR * 0.1},
        {'params': resnet18_ft.fc.parameters()},
    ],
    lr=LR,
    momentum=0.9,
)

def step(self, closure):
    r"""Apply one parameter update; this base version is a placeholder.

    Arguments:
        closure (callable): re-evaluates the model and returns the loss.
            Optional for most optimizers.

    .. note::
        Unless stated otherwise, implementations should leave the ``.grad``
        field of the parameters unmodified.

    Raises:
        NotImplementedError: always; this version performs no update.
    """
    raise NotImplementedError

	@torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()
                
		#对参数进行遍历
        for group in self.param_groups: 
        	params_with_grad = []  #有梯度的网路参数收集列表
            d_p_list = [] #收集网络参数的梯度列表
            momentum_buffer_list = []
            
            # #以下为一些超参数的收集
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']
            lr = group['lr']

			# 对而网络参数进行逐个遍历更新
            for p in group['params']: 
                if p.grad is None:
                    continue
                d_p = p.grad
                if weight_decay != 0:
                    d_p = d_p.add(p, alpha=weight_decay)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    if nesterov:
                        d_p = d_p.add(buf, alpha=momentum)
                    else:
                        d_p = buf

                p.add_(d_p, alpha=-group['lr'])

        return loss

from torch.nn import CrossEntropyLoss

dummy_model = DummyModel().cuda()

optimizer = SGD(dummy_model.parameters(), lr=1e-2, momentum=0.9, weight_decay=1e-4)
# Loss function.
loss_fn = CrossEntropyLoss()
# Synthetic data: 64 random inputs of shape 3 x 64 x 128 with random labels in [0, 10).
batch_size = 2
data = torch.randn(64, 3, 64, 128).cuda()
data_label = torch.randint(0, 10, size=(64,), dtype=torch.long).cuda()

for batch_index in range(10):
    # Slice bounds of the current mini-batch.
    start = batch_index * batch_size
    stop = start + batch_size
    batch_data = data[start:stop]
    batch_label = data_label[start:stop]

    def closure():
        # Clear gradients, run forward and backward, then return the loss
        # to the optimizer.
        optimizer.zero_grad()
        output = dummy_model(batch_data)
        loss = loss_fn(output, batch_label)
        loss.backward()
        print('No.{: 2d} loss: {:.6f}'.format(batch_index, loss.item()))
        return loss

    # The optimizer calls the closure itself, then updates the parameters.
    optimizer.step(closure=closure)

def zero_grad(self, set_to_none: bool = False):
    r"""Reset the gradient of every optimized :class:`torch.Tensor`.

    Arguments:
        set_to_none (bool): when true, gradients are set to ``None`` instead
            of being filled with zeros. This generally lowers the memory
            footprint and can modestly improve performance, but it changes
            some behaviour:
            1. Manual operations on a gradient behave differently for
            ``None`` and for a Tensor full of zeros.
            2. After ``zero_grad(set_to_none=True)`` and a backward pass,
            ``.grad`` is guaranteed to be ``None`` for parameters that did
            not receive a gradient.
            3. ``torch.optim`` optimizers do a step with a zero gradient but
            skip the step altogether when the gradient is ``None``.
    """
    for group in self.param_groups:
        for p in group['params']:
            grad = p.grad
            if grad is None:
                continue
            if set_to_none:
                p.grad = None
                continue
            # Cut the gradient loose from any graph before zeroing in place.
            if grad.grad_fn is None:
                grad.requires_grad_(False)
            else:
                grad.detach_()
            grad.zero_()

def state_dict(self):
    r"""Return the optimizer state as a :class:`dict` with two entries.

    * state - the current optimization state, re-keyed so that Tensor keys
        become integer indices; non-Tensor keys are kept as they are.
    * param_groups - a list of the parameter groups, with each group's
        ``'params'`` replaced by the integer indices of its Tensors.
    """
    # Tensors are replaced by their running position across all groups.
    index_of = {}
    next_index = 0
    packed_groups = []

    for group in self.param_groups:
        entry = {key: value for key, value in group.items() if key != 'params'}

        # Collect the new ids of this group first and merge them afterwards,
        # so ids seen in earlier groups keep their index.
        fresh = {}
        for position, tensor in enumerate(group['params'], next_index):
            if id(tensor) not in index_of:
                fresh[id(tensor)] = position
        index_of.update(fresh)

        entry['params'] = [index_of[id(tensor)] for tensor in group['params']]
        next_index += len(entry['params'])
        packed_groups.append(entry)

    # Re-key the state: Tensor keys become indices, other keys pass through.
    packed_state = {}
    for key, value in self.state.items():
        if isinstance(key, torch.Tensor):
            key = index_of[id(key)]
        packed_state[key] = value

    return {
        'state': packed_state,
        'param_groups': packed_groups,
    }

所有的学习率调整策略类的父类是torch.optim.lr_scheduler._LRScheduler，基类 _LRScheduler 定义了如下方法:

scheduler.step()

初始化函数内部的 with_counter 函数主要是为了确保 lr_scheduler.step() 在 optimizer.step() 之后调用（该调用顺序自 PyTorch 1.1 起发生变化）。注意 __init__ 函数的最后一步调用了 self.step()，即 _LRScheduler 在初始化时已经调用过一次 step() 方法。

class _LRScheduler(object):
    """Learning-rate scheduler base: binds to an optimizer and tracks step counts."""

    def __init__(self, optimizer, last_epoch=-1, verbose=False):
        """Attach to ``optimizer``, record base learning rates and do a first step.

        Arguments:
            optimizer: must be an ``Optimizer`` instance.
            last_epoch: ``-1`` seeds each group's ``'initial_lr'`` from its
                ``'lr'``; any other value requires ``'initial_lr'`` to be
                present in every group already.
            verbose: stored on the instance as ``self.verbose``.

        Raises:
            TypeError: if ``optimizer`` is not an ``Optimizer``.
            KeyError: if ``last_epoch != -1`` and a group lacks ``'initial_lr'``.
        """

        # Attach optimizer
        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        # Initialize epoch and base learning rates
        if last_epoch == -1:
            # setdefault leaves an existing 'initial_lr' untouched.
            for group in optimizer.param_groups:
                group.setdefault('initial_lr', group['lr'])
        else:
            for i, group in enumerate(optimizer.param_groups):
                if 'initial_lr' not in group:
                    raise KeyError("param 'initial_lr' is not specified "
                                   "in param_groups[{}] when resuming an optimizer".format(i))
        self.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
        self.last_epoch = last_epoch

        # Following https://github.com/pytorch/pytorch/issues/20124
        # We would like to ensure that `lr_scheduler.step()` is called after
        # `optimizer.step()`
        def with_counter(method):
            """Wrap a bound method so each call increments the instance's ``_step_count``."""
            if getattr(method, '_with_counter', False):
                # `optimizer.step()` has already been replaced, return.
                return method

            # Keep a weak reference to the optimizer instance to prevent
            # cyclic references.
            instance_ref = weakref.ref(method.__self__)
            # Get the unbound method for the same purpose.
            func = method.__func__
            cls = instance_ref().__class__
            del method

            @wraps(func)
            def wrapper(*args, **kwargs):
                instance = instance_ref()
                instance._step_count += 1
                # Re-bind the plain function to the instance before calling it.
                wrapped = func.__get__(instance, cls)
                return wrapped(*args, **kwargs)

            # Note that the returned function here is no longer a bound method,
            # so attributes like `__func__` and `__self__` no longer exist.
            wrapper._with_counter = True
            return wrapper

        # Replace optimizer.step with the counting wrapper, then reset both counters.
        self.optimizer.step = with_counter(self.optimizer.step)
        self.optimizer._step_count = 0
        self._step_count = 0
        self.verbose = verbose

        # The scheduler steps once during construction.
        self.step()

def step(self, epoch=None):
    """Advance the scheduler and write new learning rates into the optimizer.

    Arguments:
        epoch (optional): when given, a deprecation warning is issued,
            ``last_epoch`` is set to this value, and ``_get_closed_form_lr``
            is preferred over ``get_lr`` if the instance has it. When
            omitted, ``last_epoch`` is incremented and ``get_lr`` is used.
    """
    # On the second call, warn if the old calling pattern is detected
    # (https://github.com/pytorch/pytorch/issues/20124).
    if self._step_count == 1:
        if not hasattr(self.optimizer.step, "_with_counter"):
            warnings.warn("...")
        # Two lr_scheduler.step() calls happened before any optimizer.step().
        elif self.optimizer._step_count < 1:
            warnings.warn("...")
    self._step_count += 1

    # The flag is raised only while the learning rates are being computed
    # and is lowered again even if that computation raises.
    self._get_lr_called_within_step = True
    try:
        if epoch is not None:
            warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
            self.last_epoch = epoch
            compute = self._get_closed_form_lr if hasattr(self, "_get_closed_form_lr") else self.get_lr
            values = compute()
        else:
            self.last_epoch += 1
            values = self.get_lr()
    finally:
        self._get_lr_called_within_step = False

    # Push each new rate into its parameter group and report it.
    for index, (param_group, lr) in enumerate(zip(self.optimizer.param_groups, values)):
        param_group['lr'] = lr
        self.print_lr(self.verbose, index, lr, epoch)

    self._last_lr = [entry['lr'] for entry in self.optimizer.param_groups]

def get_last_lr(self):
    """Return the value stored in ``self._last_lr``, i.e. the learning
    rates most recently recorded by this scheduler.
    """
    return self._last_lr

    def get_lr(self):
        """Compute the learning rates using the scheduler's chainable form.

        Raises:
            NotImplementedError: always; this version computes nothing.
        """
        raise NotImplementedError

    def print_lr(self, is_verbose, group, lr, epoch=None):
        """Print the learning rate of one parameter group when verbose.

        Arguments:
            is_verbose: nothing is printed when this is falsy.
            group: identifier of the parameter group, shown as-is.
            lr: learning rate, shown in scientific notation with 4 decimals.
            epoch (optional): when given, the message is prefixed with it.
        """
        if not is_verbose:
            return
        if epoch is not None:
            message = ('Epoch {:5d}: adjusting learning rate'
                       ' of group {} to {:.4e}.'.format(epoch, group, lr))
        else:
            message = ('Adjusting learning rate'
                       ' of group {} to {:.4e}.'.format(group, lr))
        print(message)

def state_dict(self):
    """Return the scheduler state as a :class:`dict`.

    The result is a copy of ``self.__dict__`` holding every entry except
    the one stored under the key ``'optimizer'``.
    """
    snapshot = dict(self.__dict__)
    snapshot.pop('optimizer', None)
    return snapshot

    def load_state_dict(self, state_dict):
        """Restore scheduler state by merging ``state_dict`` into the instance.

        Arguments:
            state_dict (dict): scheduler state, normally the object returned
                by a call to :meth:`state_dict`. Its entries overwrite the
                instance attributes of the same name.
        """
        vars(self).update(state_dict)