【强化学习】多臂老虎机——E_greedy、UCB、Gradient Bandit 算法 代码实现
生活随笔收集整理的這篇文章主要介紹了【强化学习】多臂老虎机——E_greedy、UCB、Gradient Bandit 算法 代码实现，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。
多臂老虎機
from typing import Optional

import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:  # Plotting is optional; the bandit algorithms only need NumPy.
    plt = None


class E_greedy:
    """Multi-armed bandit testbed with three action-selection strategies.

    Each arm has a fixed mean drawn uniformly from [0, 1) at construction
    time; pulling an arm returns that mean plus unit-variance Gaussian
    noise.  The same instance can be driven by ``e_greedy``, ``ucb`` or
    ``gradient``; call ``reset`` between runs to clear the learned state
    while keeping the same arms.

    Attributes:
        arm_num: Number of arms.
        epsilon: Exploration probability used by ``e_greedy``.
        arms: True mean reward of every arm.
        T: Number of pulls performed by each strategy run.
        Q: Sample-average value estimate per arm.
        NA: Number of times each arm has been pulled.
        R: Running average of all rewards received in the current run.
        R_list: ``(step, running_average)`` tuples, one per pull.
        HA: Action preferences used by ``gradient``.
        HAP: Soft-max probabilities derived from ``HA``.
        R_: Per-arm average reward tracked during ``gradient``.
    """

    def __init__(self, arm_num: int = 10, epsilon: float = 0.5,
                 T: int = 100000) -> None:
        """Create the bandit and draw its arm means.

        Args:
            arm_num: Number of arms.
            epsilon: Default exploration probability for ``e_greedy``.
            T: Number of pulls per strategy run.
        """
        self.arm_num = arm_num
        self.epsilon = epsilon
        self.arms = np.random.uniform(0, 1, self.arm_num)
        self.T = T
        self.reset()

    def reset(self) -> None:
        """Clear everything learned so far; the arm means are kept."""
        self.Q = np.zeros(self.arm_num)
        self.NA = np.zeros(self.arm_num)
        self.R = 0
        self.R_list = []
        # The gradient-bandit state must be cleared too, otherwise a second
        # gradient run would start from the previous run's preferences.
        self.HA = np.zeros(self.arm_num)
        self.HAP = np.zeros(self.arm_num)
        self.R_ = np.zeros(self.arm_num)

    def get_reward(self, arm_index):
        """Return a noisy reward for pulling ``arm_index``."""
        return self.arms[arm_index] + np.random.normal(0, 1)

    def update_Q_NA(self, arm_index, reward) -> None:
        """Count the pull and move ``Q[arm_index]`` toward ``reward``."""
        self.NA[arm_index] += 1
        # Incremental sample mean: Q <- Q + (reward - Q) / n.
        self.Q[arm_index] += (reward - self.Q[arm_index]) / self.NA[arm_index]

    def update_NA(self, arm_index) -> None:
        """Count one pull of ``arm_index`` without touching ``Q``."""
        self.NA[arm_index] += 1

    def _record_reward(self, step: int, reward) -> None:
        """Fold ``reward`` into the running average and log it for plotting."""
        self.R += (reward - self.R) / step
        self.R_list.append((step, self.R))

    def e_greedy(self, epsilon: Optional[float] = None) -> None:
        """Run ``T`` pulls of epsilon-greedy action selection.

        Args:
            epsilon: Exploration probability for this and later runs.  When
                ``None`` the current ``self.epsilon`` is kept.  ``0`` is a
                valid value and selects purely greedy behaviour.
        """
        # Compare against None so that an explicit epsilon of 0 is honoured.
        if epsilon is not None:
            self.epsilon = epsilon
        for step in range(1, self.T + 1):
            explore = np.random.random() <= self.epsilon
            # The very first pull is always random, even when exploiting.
            if explore or step == 1:
                arm_index = np.random.randint(0, self.arm_num)
            else:
                arm_index = np.argmax(self.Q)
            reward = self.get_reward(arm_index)
            self._record_reward(step, reward)
            self.update_Q_NA(arm_index, reward)

    def ucb(self, c: float) -> None:
        """Run ``T` pulls of upper-confidence-bound action selection.

        Args:
            c: Weight of the exploration bonus.
        """
        for step in range(1, self.T + 1):
            # The small constant avoids division by zero and gives arms that
            # have never been pulled a very large bonus.
            bonus = c * np.sqrt(np.log(step) / (self.NA + 1e-8))
            arm_index = np.argmax(self.Q + bonus)
            reward = self.get_reward(arm_index)
            self._record_reward(step, reward)
            self.update_Q_NA(arm_index, reward)

    def get_HA(self) -> None:
        """Recompute ``HAP`` as the soft-max of the preferences ``HA``."""
        # Subtracting the maximum leaves the result unchanged mathematically
        # but keeps exp() from overflowing for large preferences.
        weights = np.exp(self.HA - np.max(self.HA))
        self.HAP = weights / np.sum(weights)

    def gradient(self, a: float) -> None:
        """Run ``T`` pulls of the gradient-bandit algorithm.

        Preferences are updated by stochastic gradient ascent using the
        running average of all rewards as the baseline.

        Args:
            a: Step size of the preference update.
        """
        for step in range(1, self.T + 1):
            self.get_HA()
            arm_index = np.random.choice(self.arm_num, p=self.HAP)
            reward = self.get_reward(arm_index)

            # Per-arm statistics are still tracked for inspection.
            self.update_NA(arm_index)
            self.R_[arm_index] += (
                (reward - self.R_[arm_index]) / self.NA[arm_index]
            )

            # The baseline has to be the average over ALL rewards.  A per-arm
            # baseline makes the expected advantage zero for every arm, so
            # the preferences would only random-walk and never learn.
            baseline = self.R
            self._record_reward(step, reward)

            # H(chosen) += a * adv * (1 - pi); H(other) -= a * adv * pi.
            chosen = np.zeros(self.arm_num)
            chosen[arm_index] = 1.0
            self.HA += a * (reward - baseline) * (chosen - self.HAP)

    def plot_R(self, c, skip: int = 100) -> None:
        """Plot the running average reward and the best arm's mean.

        Args:
            c: Matplotlib colour for the running-average curve.
            skip: Number of initial pulls left out of the plot.

        Raises:
            ImportError: If matplotlib is not installed.
        """
        if plt is None:
            raise ImportError("matplotlib is required to plot results")
        history = self.R_list[skip:]
        steps = [step for step, _ in history]
        averages = [average for _, average in history]
        plt.plot(steps, averages, c=c)
        # Red reference line: the highest true arm mean.
        plt.plot(steps, [np.max(self.arms)] * len(history), c='r')


def main():
    """Run UCB, epsilon-greedy and gradient bandit on one set of arms and plot."""
    e = E_greedy()
    e.ucb(2)
    e.plot_R(c='g')
    e.reset()
    e.e_greedy(0.1)
    e.plot_R(c='b')
    e.reset()
    e.gradient(0.25)
    e.plot_R(c='k')
    plt.show()


if __name__ == '__main__':
    main()

# Summary (original article heading: 總結)
以上是生活随笔為你收集整理的【强化学习】多臂老虎机——E_greedy、UCB、Gradient Bandit 算法 代码实现的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 统计学习方法第十五章作业:SVD矩阵分解
- 下一篇: 统计学习方法第十六章作业:PCA主成分分析