Comparison of Neural Network Optimization Methods (Code Walkthrough)
Many optimization methods have been proposed for efficiently finding the parameters that minimize the value of the loss function. The principles behind them are explained at http://ruder.io/optimizing-gradient-descent/ . The code below, taken from a classic textbook, compares four of these optimizers (SGD, Momentum, AdaGrad, and Adam): it traces 30 update steps on the test function f(x, y) = x**2 / 20 + y**2, starting from (-7, 2), and plots each trajectory over the function's contour lines.
# coding: utf-8
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict


class SGD:
    """Stochastic Gradient Descent"""
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        for key in params.keys():
            params[key] -= self.lr * grads[key]


class Momentum:
    """Momentum SGD"""
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def update(self, params, grads):
        if self.v is None:
            self.v = {}
            for key, val in params.items():
                self.v[key] = np.zeros_like(val)

        for key in params.keys():
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            params[key] += self.v[key]


class Nesterov:
    """Nesterov's Accelerated Gradient (http://arxiv.org/abs/1212.0901)"""
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def update(self, params, grads):
        if self.v is None:
            self.v = {}
            for key, val in params.items():
                self.v[key] = np.zeros_like(val)

        for key in params.keys():
            self.v[key] *= self.momentum
            self.v[key] -= self.lr * grads[key]
            params[key] += self.momentum * self.momentum * self.v[key]
            params[key] -= (1 + self.momentum) * self.lr * grads[key]


class AdaGrad:
    """AdaGrad"""
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h = None

    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)

        for key in params.keys():
            self.h[key] += grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)


class RMSprop:
    """RMSprop"""
    def __init__(self, lr=0.01, decay_rate=0.99):
        self.lr = lr
        self.decay_rate = decay_rate
        self.h = None

    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)

        for key in params.keys():
            self.h[key] *= self.decay_rate
            self.h[key] += (1 - self.decay_rate) * grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)


class Adam:
    """Adam (http://arxiv.org/abs/1412.6980v8)"""
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None

    def update(self, params, grads):
        if self.m is None:
            self.m, self.v = {}, {}
            for key, val in params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)

        self.iter += 1
        lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter)

        for key in params.keys():
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key]**2 - self.v[key])
            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)


# test function: an elongated quadratic bowl with its minimum at the origin
def f(x, y):
    return x**2 / 20.0 + y**2


# analytic gradient of f
def df(x, y):
    return x / 10.0, 2.0 * y


init_pos = (-7.0, 2.0)
params = {}
params['x'], params['y'] = init_pos[0], init_pos[1]
grads = {}
grads['x'], grads['y'] = 0, 0

# Nesterov and RMSprop are defined above but left out of the 2x2 comparison plot
optimizers = OrderedDict()
optimizers["SGD"] = SGD(lr=0.95)
optimizers["Momentum"] = Momentum(lr=0.1)
optimizers["AdaGrad"] = AdaGrad(lr=1.5)
optimizers["Adam"] = Adam(lr=0.3)

idx = 1

for key in optimizers:
    optimizer = optimizers[key]
    x_history = []
    y_history = []
    params['x'], params['y'] = init_pos[0], init_pos[1]

    # run 30 update steps from the initial position and record the trajectory
    for i in range(30):
        x_history.append(params['x'])
        y_history.append(params['y'])

        grads['x'], grads['y'] = df(params['x'], params['y'])
        optimizer.update(params, grads)

    x = np.arange(-10, 10, 0.01)
    y = np.arange(-5, 5, 0.01)

    X, Y = np.meshgrid(x, y)
    Z = f(X, Y)

    # for simple contour lines
    mask = Z > 7
    Z[mask] = 0

    # plot the trajectory of this optimizer in its own subplot
    plt.subplot(2, 2, idx)
    idx += 1
    plt.plot(x_history, y_history, 'o-', color="red")
    plt.contour(X, Y, Z)  # draw the contour lines
    plt.ylim(-10, 10)
    plt.xlim(-10, 10)
    plt.plot(0, 0, '+')
    plt.title(key)
    plt.xlabel("x")
    plt.ylabel("y")

plt.subplots_adjust(wspace=0, hspace=0)  # adjust subplot spacing
plt.show()
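Because every optimizer above exposes the same update(params, grads) interface over plain dictionaries, any of them can drive an ordinary training loop, not just this 2D visualization. Below is a minimal usage sketch (not part of the original script) that fits a one-parameter linear model with the Adam class defined above; the synthetic data and the parameter name 'w' are illustrative assumptions.

# Minimal usage sketch: fit y = 2x with the Adam optimizer defined above.
# Assumes the optimizer classes from the script above are already in scope.
import numpy as np

np.random.seed(0)
x_data = np.random.randn(100)      # synthetic inputs (illustrative)
y_data = 2.0 * x_data              # targets generated with a "true" weight of 2

params = {'w': np.array(0.0)}      # single trainable parameter
grads = {}
optimizer = Adam(lr=0.1)

for step in range(100):
    pred = params['w'] * x_data
    # gradient of the loss 0.5 * mean((pred - y_data)**2) with respect to w
    grads['w'] = np.mean((pred - y_data) * x_data)
    optimizer.update(params, grads)

print(params['w'])                 # should end up close to 2.0

Swapping in SGD, Momentum, AdaGrad, or RMSprop only requires changing the line that constructs the optimizer, which is exactly the property the comparison script exploits.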
Summary

The script traces 30 update steps of SGD, Momentum, AdaGrad, and Adam on the test function f(x, y) = x**2 / 20 + y**2 and plots each trajectory over the contour lines, making it easy to compare how each optimizer's update rule shapes its path toward the minimum at the origin.