Here is the link to the full blog post on the K-Armed Bandit Problem:
https://dystillvision.com/writing/engineering/multi_k_armed_bandit_problem_in_reinforcement_learning
Python Program for the K-Armed Bandit Problem
import numpy as np
class EpsilonGreedy:
    """Epsilon-greedy agent for the k-armed bandit problem.

    On every pull the agent explores (picks a uniformly random arm) with
    probability ``epsilon`` and otherwise exploits the arm whose estimated
    value is currently highest. Each arm's estimate is the sample average
    of the rewards observed for that arm so far.
    """

    def __init__(self, k_arms, epsilon, *, verbose=True):
        """Create an agent with zeroed pull counts and value estimates.

        Args:
            k_arms: Number of arms; arms are indexed ``0 .. k_arms - 1``.
            epsilon: Exploration probability, in ``[0, 1]``.
            verbose: When true (the default), ``select_arm`` prints which
                branch it took on every call.

        Raises:
            ValueError: If ``k_arms`` is less than 1 or ``epsilon`` lies
                outside ``[0, 1]``.
        """
        # Fail fast here: otherwise bad values only surface later as an
        # obscure numpy error (empty argmax / empty randint range) or, for
        # epsilon, never surface at all.
        if k_arms < 1:
            raise ValueError(f"k_arms must be at least 1, got {k_arms!r}")
        if not 0.0 <= epsilon <= 1.0:
            raise ValueError(f"epsilon must be in [0, 1], got {epsilon!r}")

        self.k_arms = k_arms
        self.epsilon = epsilon
        self.verbose = verbose
        self.counts = np.zeros(k_arms)  # Number of times each arm was pulled
        self.values = np.zeros(k_arms)  # Estimated value (mean reward) per arm

    def select_arm(self):
        """Return the index of the arm to pull next.

        Explores with probability ``epsilon``; otherwise returns the arm
        with the largest current estimate (``np.argmax`` resolves ties in
        favour of the lowest index).
        """
        if np.random.rand() < self.epsilon:
            if self.verbose:
                print("Selecting 1 random Arm between 0 and k_arms - 1")
            # randint's upper bound is exclusive, so this yields 0..k_arms-1.
            return np.random.randint(0, self.k_arms)

        best_arm = np.argmax(self.values)
        if self.verbose:
            print("Selecting Max Value Arm", best_arm)
        return best_arm

    def update(self, chosen_arm, reward):
        """Fold ``reward`` into the running average for ``chosen_arm``.

        Args:
            chosen_arm: Index of the arm that was pulled.
            reward: Reward observed for that pull.
        """
        self.counts[chosen_arm] += 1
        pulls = self.counts[chosen_arm]
        # Incremental sample mean: new = old + (reward - old) / n, which is
        # algebraically the same as ((n - 1) / n) * old + (1 / n) * reward.
        self.values[chosen_arm] += (reward - self.values[chosen_arm]) / pulls
# Demo: run one epsilon-greedy agent on a randomly generated bandit task and
# report the total reward it collects.
k_arms = 10  # Ten weapon options
epsilon = 0.1  # Random weapon for 10% of trials
n_trials = 1000

# Each arm gets its own true mean reward, and every pull returns that mean
# plus unit-variance Gaussian noise. Drawing all rewards from one shared
# N(0, 1) would make the arms indistinguishable, so there would be no best
# arm for the agent to discover.
true_means = np.random.randn(k_arms)
rewards = true_means[:, np.newaxis] + np.random.randn(k_arms, n_trials)

agent = EpsilonGreedy(k_arms, epsilon)
total_reward = 0.0
for t in range(n_trials):
    arm = agent.select_arm()
    print(arm)
    reward = rewards[arm, t]  # Pre-drawn reward for this arm at this step
    agent.update(arm, reward)
    total_reward += reward

print("Total Reward ", total_reward)