import itertools

import numpy as np

from env import GridWorld

# Discount applied to future rewards in the TD target (gamma).
DISCOUNT_FACTOR = 0.9


class Agent:
    """Tabular Q-learning agent for a discrete environment.

    The environment is expected to expose `nS` (number of states),
    `nA` (number of actions), `reset()` -> state, and
    `step(action)` -> (next_state, reward, done, info) — the classic
    Gym-style interface (presumably; confirm against env.GridWorld).
    """

    def __init__(self, env):
        self.env = env

    def policy(self, Q, state, epsilon=0.01):
        """Sample an action epsilon-greedily with respect to Q[state].

        With probability ``1 - epsilon`` (plus the uniform epsilon mass on
        that action) the greedy action is chosen; otherwise a uniformly
        random action.

        Args:
            Q: (nS, nA) array of action-value estimates.
            state: current state index into Q.
            epsilon: exploration rate in [0, 1].

        Returns:
            int: the sampled action index.
        """
        # Uniform exploration mass epsilon/nA on every action...
        action_probs = np.ones(self.env.nA, dtype=float) * epsilon / self.env.nA
        # ...plus the remaining (1 - epsilon) on the current greedy action.
        action_probs[np.argmax(Q[state])] += 1.0 - epsilon
        return np.random.choice(self.env.nA, p=action_probs)

    def q_learning(self, num_episodes=1000, alpha=0.5):
        """Run off-policy TD control (Q-learning) and return the Q table.

        Args:
            num_episodes: number of episodes to train for.
            alpha: TD step size (learning rate).

        Returns:
            np.ndarray of shape (nS, nA): the learned action-value table.
        """
        Q = np.zeros((self.env.nS, self.env.nA))
        for ith in range(1, num_episodes + 1):
            # Periodic progress report.
            if ith % 100 == 0:
                print("\rEpisode {}/{}.".format(ith, num_episodes))
                print(Q)
            # BUG FIX: the original read the bare global `env` here and in
            # step() below; the environment is held on the instance.
            state = self.env.reset()
            for t in itertools.count():
                # Behavior policy: epsilon-greedy w.r.t. current Q.
                action = self.policy(Q, state)
                next_state, reward, done, _ = self.env.step(action)
                # Target policy is greedy (off-policy max over next actions).
                best_action = np.argmax(Q[next_state])
                td_target = reward + DISCOUNT_FACTOR * Q[next_state][best_action]
                Q[state][action] += alpha * (td_target - Q[state][action])
                if done:
                    break
                state = next_state
        return Q