Source code for giagrad.optim.Adamax

import numpy as np

from giagrad.tensor import Tensor
from giagrad.optim.optimizer import Optimizer

class Adamax(Optimizer):
    r"""Implements the Adamax algorithm (a variant of Adam based on the infinity norm).

    Based on `PyTorch Adamax`_.

    .. math::
        \begin{aligned}
            &\rule{110mm}{0.4pt}                                                      \\
            &\textbf{input} : \gamma \text{ (lr)}, \: \beta_1, \: \beta_2
                \text{ (betas)}, \: \theta_0 \text{ (params)}, \: f(\theta)
                \text{ (objective)}, \: \lambda \text{ (weight decay)},               \\
            &\hspace{13mm} \:\epsilon \text{ (epsilon)}                         \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                      \\
            &\textbf{initialize} : m_0 \leftarrow 0 \: \text{(first moment)},
                \: u_0 \leftarrow 0 \: \text{(infinity norm)}                   \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                      \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}              \\
            &\hspace{5mm}g_t \leftarrow \nabla_{\theta} f_t (\theta_{t-1})            \\
            &\hspace{5mm}\textbf{if} \: \lambda \neq 0                                \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                  \\
            &\hspace{5mm}m_t \leftarrow \beta_1 m_{t-1} + (1-\beta_1) g_t             \\
            &\hspace{5mm}u_t \leftarrow \max(\beta_2 u_{t-1}, |g_t| + \epsilon)       \\
            &\hspace{5mm}\theta_t \leftarrow \theta_{t-1}
                - \frac{\gamma m_t}{(1-\beta_1^t) \, u_t}                             \\
            &\rule{110mm}{0.4pt}                                                \\[-1.ex]
            &\textbf{return} \: \theta_t                                              \\
            &\rule{110mm}{0.4pt}                                                \\[-1.ex]
        \end{aligned}

    .. _PyTorch Adamax:
        https://pytorch.org/docs/stable/generated/torch.optim.Adamax.html#torch.optim.Adamax

    Attributes
    ----------
    params: iterable of Tensor
        Iterable of parameters to optimize.
    lr: float, default: 0.001
        Learning rate.
    betas: Tuple[float, float], default: (0.9, 0.999)
        Coefficients used for the running average of the gradient and the
        exponentially weighted infinity norm.
    eps: float, default: 1e-8
        Term added to :math:`|g_t|` to improve numerical stability.
    weight_decay: float, default: 0
        Weight decay (L2 penalty).
    maximize: bool, default: False
        Maximize the params based on the objective, instead of minimizing.

    Examples
    --------
    >>> optimizer = giagrad.optim.Adamax(model.parameters())
    >>> model.zero_grad()
    >>> loss_fn(model(input), target).backward()
    >>> optimizer.step()
    """
    def __init__(
        self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
        weight_decay=0., maximize=False
    ):
        super().__init__(params)
        self.lr, self.eps, self.weight_decay = lr, eps, weight_decay
        self.beta1, self.beta2 = betas
        self.maximize = maximize
        # Per-parameter state: first moment m and exponentially weighted infinity norm u.
        self.m = [np.zeros(p.shape) for p in self.params]
        self.u = [np.zeros(p.shape) for p in self.params]
    def step(self):
        for t, m, u in zip(self.params, self.m, self.u):
            g = t.grad.copy()
            if self.maximize:
                # Gradient ascent: negate the gradient to maximize the objective.
                g = -g
            if self.weight_decay != 0:
                g += self.weight_decay * t.data
            # m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
            m[:] = self.beta1 * m + (1 - self.beta1) * g
            # u_t = max(beta2 * u_{t-1}, |g_t| + eps); eps is added outside the
            # absolute value so the denominator never becomes zero.
            u[:] = np.maximum(self.beta2 * u, np.abs(g) + self.eps)
            # theta_t = theta_{t-1} - lr * m_t / ((1 - beta1**t) * u_t)
            t.data -= (self.lr * m) / ((1 - self.beta1**self.ite) * u)
        self.ite += 1
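

# A minimal, self-contained sketch of a single Adamax update in plain NumPy,
# mirroring the update rule used in step() above. The parameter and gradient
# values below are made up purely for illustration; they do not use giagrad
# Tensors or a real model.
if __name__ == "__main__":
    lr, beta1, beta2, eps = 1e-3, 0.9, 0.999, 1e-8
    theta = np.array([0.5, -1.0])                    # parameters theta_{t-1}
    g = np.array([0.2, -0.4])                        # gradient g_t
    m = np.zeros_like(theta)                         # first moment m_0
    u = np.zeros_like(theta)                         # infinity norm u_0

    t = 1                                            # first step
    m = beta1 * m + (1 - beta1) * g                  # m_t
    u = np.maximum(beta2 * u, np.abs(g) + eps)       # u_t
    theta = theta - lr * m / ((1 - beta1**t) * u)    # theta_t
    print(theta)                                     # parameters after one step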