diff --git a/.gitignore b/.gitignore index ab5a28a0..695bdda2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ *.pydevproject .idea/ .DS_Store -__pycache__ \ No newline at end of file +__pycache__ +./Code 2. Cartpole/6. A3C/Cartpole_A3C.pgy \ No newline at end of file diff --git a/1-grid-world/1-policy-iteration/environment.py b/1-grid-world/1-policy-iteration/environment.py new file mode 100644 index 00000000..910d4ba8 --- /dev/null +++ b/1-grid-world/1-policy-iteration/environment.py @@ -0,0 +1,245 @@ +import tkinter as tk +from tkinter import Button +import time +import numpy as np +from PIL import ImageTk, Image + +PhotoImage = ImageTk.PhotoImage +UNIT = 100 # pixels +HEIGHT = 5 # grid height +WIDTH = 5 # grid width +TRANSITION_PROB = 1 +POSSIBLE_ACTIONS = [0, 1, 2, 3] # up, down, left, right +ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # actions in coordinates +REWARDS = [] + + +class GraphicDisplay(tk.Tk): + def __init__(self, agent): + super(GraphicDisplay, self).__init__() + self.title('Policy Iteration') + self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) + self.texts = [] + self.arrows = [] + self.env = Env() + self.agent = agent + self.evaluation_count = 0 + self.improvement_count = 0 + self.is_moving = 0 + (self.up, self.down, self.left, self.right), self.shapes = self.load_images() + self.canvas = self._build_canvas() + self.text_reward(2, 2, "R : 1.0") + self.text_reward(1, 2, "R : -1.0") + self.text_reward(2, 1, "R : -1.0") + + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) + # buttons + iteration_button = Button(self, text="Evaluate", + command=self.evaluate_policy) + iteration_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.13, HEIGHT * UNIT + 10, + window=iteration_button) + policy_button = Button(self, text="Improve", + command=self.improve_policy) + policy_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.37, HEIGHT * UNIT + 10, + window=policy_button) + policy_button = Button(self, text="move", command=self.move_by_policy) + policy_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.62, HEIGHT * UNIT + 10, + window=policy_button) + policy_button = Button(self, text="reset", command=self.reset) + policy_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.87, HEIGHT * UNIT + 10, + window=policy_button) + + # create grids + for col in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT + canvas.create_line(x0, y0, x1, y1) + for row in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row + canvas.create_line(x0, y0, x1, y1) + + # add img to canvas + self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) + canvas.create_image(250, 150, image=self.shapes[1]) + canvas.create_image(150, 250, image=self.shapes[1]) + canvas.create_image(250, 250, image=self.shapes[2]) + + # pack all + canvas.pack() + + return canvas + + def load_images(self): + up = PhotoImage(Image.open("../img/up.png").resize((13, 13))) + right = PhotoImage(Image.open("../img/right.png").resize((13, 13))) + left = PhotoImage(Image.open("../img/left.png").resize((13, 13))) + down = PhotoImage(Image.open("../img/down.png").resize((13, 13))) + rectangle = PhotoImage(Image.open("../img/rectangle.png").resize((65, 65))) + triangle = 
PhotoImage(Image.open("../img/triangle.png").resize((65, 65))) + circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65))) + return (up, down, left, right), (rectangle, triangle, circle) + + def reset(self): + if self.is_moving == 0: + self.evaluation_count = 0 + self.improvement_count = 0 + for i in self.texts: + self.canvas.delete(i) + + for i in self.arrows: + self.canvas.delete(i) + self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)] + self.agent.policy_table = ([[[0.25, 0.25, 0.25, 0.25]] * WIDTH + for _ in range(HEIGHT)]) + self.agent.policy_table[2][2] = [] + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + + def text_value(self, row, col, contents, font='Helvetica', size=10, + style='normal', anchor="nw"): + origin_x, origin_y = 85, 70 + x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) + font = (font, str(size), style) + text = self.canvas.create_text(x, y, fill="black", text=contents, + font=font, anchor=anchor) + return self.texts.append(text) + + def text_reward(self, row, col, contents, font='Helvetica', size=10, + style='normal', anchor="nw"): + origin_x, origin_y = 5, 5 + x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) + font = (font, str(size), style) + text = self.canvas.create_text(x, y, fill="black", text=contents, + font=font, anchor=anchor) + return self.texts.append(text) + + def rectangle_move(self, action): + base_action = np.array([0, 0]) + location = self.find_rectangle() + self.render() + if action == 0 and location[0] > 0: # up + base_action[1] -= UNIT + elif action == 1 and location[0] < HEIGHT - 1: # down + base_action[1] += UNIT + elif action == 2 and location[1] > 0: # left + base_action[0] -= UNIT + elif action == 3 and location[1] < WIDTH - 1: # right + base_action[0] += UNIT + # move agent + self.canvas.move(self.rectangle, base_action[0], base_action[1]) + + def find_rectangle(self): + temp = self.canvas.coords(self.rectangle) + x = (temp[0] / 100) - 0.5 + y = (temp[1] / 100) - 0.5 + return int(y), int(x) + + def move_by_policy(self): + if self.improvement_count != 0 and self.is_moving != 1: + self.is_moving = 1 + + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + + x, y = self.find_rectangle() + while len(self.agent.policy_table[x][y]) != 0: + self.after(100, + self.rectangle_move(self.agent.get_action([x, y]))) + x, y = self.find_rectangle() + self.is_moving = 0 + + def draw_one_arrow(self, col, row, policy): + if col == 2 and row == 2: + return + + if policy[0] > 0: # up + origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.up)) + if policy[1] > 0: # down + origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.down)) + if policy[2] > 0: # left + origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.left)) + if policy[3] > 0: # right + origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.right)) + + def draw_from_policy(self, policy_table): + for i in range(HEIGHT): + for j in range(WIDTH): + self.draw_one_arrow(i, j, policy_table[i][j]) + + def print_value_table(self, value_table): + for i in range(WIDTH): + for j in range(HEIGHT): + self.text_value(i, j, 
value_table[i][j]) + + def render(self): + time.sleep(0.1) + self.canvas.tag_raise(self.rectangle) + self.update() + + def evaluate_policy(self): + self.evaluation_count += 1 + for i in self.texts: + self.canvas.delete(i) + self.agent.policy_evaluation() + self.print_value_table(self.agent.value_table) + + def improve_policy(self): + self.improvement_count += 1 + for i in self.arrows: + self.canvas.delete(i) + self.agent.policy_improvement() + self.draw_from_policy(self.agent.policy_table) + + +class Env: + def __init__(self): + self.transition_probability = TRANSITION_PROB + self.width = WIDTH + self.height = HEIGHT + self.reward = [[0] * WIDTH for _ in range(HEIGHT)] + self.possible_actions = POSSIBLE_ACTIONS + self.reward[2][2] = 1 # reward 1 for circle + self.reward[1][2] = -1 # reward -1 for triangle + self.reward[2][1] = -1 # reward -1 for triangle + self.all_state = [] + + for x in range(WIDTH): + for y in range(HEIGHT): + state = [x, y] + self.all_state.append(state) + + def get_reward(self, state, action): + next_state = self.state_after_action(state, action) + return self.reward[next_state[0]][next_state[1]] + + def state_after_action(self, state, action_index): + action = ACTIONS[action_index] + return self.check_boundary([state[0] + action[0], state[1] + action[1]]) + + @staticmethod + def check_boundary(state): + state[0] = (0 if state[0] < 0 else WIDTH - 1 + if state[0] > WIDTH - 1 else state[0]) + state[1] = (0 if state[1] < 0 else HEIGHT - 1 + if state[1] > HEIGHT - 1 else state[1]) + return state + + def get_transition_prob(self, state, action): + return self.transition_probability + + def get_all_states(self): + return self.all_state diff --git a/1-grid-world/1-policy-iteration/policy_iteration.py b/1-grid-world/1-policy-iteration/policy_iteration.py new file mode 100644 index 00000000..d6dc414e --- /dev/null +++ b/1-grid-world/1-policy-iteration/policy_iteration.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +import random +from environment import GraphicDisplay, Env + + +class PolicyIteration: + def __init__(self, env): + self.env = env + # 2-d list for the value function + self.value_table = [[0.0] * env.width for _ in range(env.height)] + # list of random policy (same probability of up, down, left, right) + self.policy_table = [[[0.25, 0.25, 0.25, 0.25]] * env.width + for _ in range(env.height)] + # setting terminal state + self.policy_table[2][2] = [] + self.discount_factor = 0.9 + + def policy_evaluation(self): + next_value_table = [[0.00] * self.env.width + for _ in range(self.env.height)] + + # Bellman Expectation Equation for the every states + for state in self.env.get_all_states(): + value = 0.0 + # keep the value function of terminal states as 0 + if state == [2, 2]: + next_value_table[state[0]][state[1]] = value + continue + + for action in self.env.possible_actions: + next_state = self.env.state_after_action(state, action) + reward = self.env.get_reward(state, action) + next_value = self.get_value(next_state) + value += (self.get_policy(state)[action] * + (reward + self.discount_factor * next_value)) + + next_value_table[state[0]][state[1]] = round(value, 2) + + self.value_table = next_value_table + + def policy_improvement(self): + next_policy = self.policy_table + for state in self.env.get_all_states(): + if state == [2, 2]: + continue + value = -99999 + max_index = [] + result = [0.0, 0.0, 0.0, 0.0] # initialize the policy + + # for every actions, calculate + # [reward + (discount factor) * (next state value function)] + for index, action in 
enumerate(self.env.possible_actions): + next_state = self.env.state_after_action(state, action) + reward = self.env.get_reward(state, action) + next_value = self.get_value(next_state) + temp = reward + self.discount_factor * next_value + + # We normally can't pick multiple actions in greedy policy. + # but here we allow multiple actions with same max values + if temp == value: + max_index.append(index) + elif temp > value: + value = temp + max_index.clear() + max_index.append(index) + + # probability of action + prob = 1 / len(max_index) + + for index in max_index: + result[index] = prob + + next_policy[state[0]][state[1]] = result + + self.policy_table = next_policy + + # get action according to the current policy + def get_action(self, state): + random_pick = random.randrange(100) / 100 + + policy = self.get_policy(state) + policy_sum = 0.0 + # return the action in the index + for index, value in enumerate(policy): + policy_sum += value + if random_pick < policy_sum: + return index + + # get policy of specific state + def get_policy(self, state): + if state == [2, 2]: + return 0.0 + return self.policy_table[state[0]][state[1]] + + def get_value(self, state): + return round(self.value_table[state[0]][state[1]], 2) + +if __name__ == "__main__": + env = Env() + policy_iteration = PolicyIteration(env) + grid_world = GraphicDisplay(policy_iteration) + grid_world.mainloop() diff --git a/1-grid-world/2-value-iteration/environment.py b/1-grid-world/2-value-iteration/environment.py new file mode 100644 index 00000000..81af3dc5 --- /dev/null +++ b/1-grid-world/2-value-iteration/environment.py @@ -0,0 +1,261 @@ +import tkinter as tk +import time +import numpy as np +import random +from PIL import ImageTk, Image + +PhotoImage = ImageTk.PhotoImage +UNIT = 100 # pixels +HEIGHT = 5 # grid height +WIDTH = 5 # grid width +TRANSITION_PROB = 1 +POSSIBLE_ACTIONS = [0, 1, 2, 3] # up, down, left, right +ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # actions in coordinates +REWARDS = [] + + +class GraphicDisplay(tk.Tk): + def __init__(self, value_iteration): + super(GraphicDisplay, self).__init__() + self.title('Value Iteration') + self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) + self.texts = [] + self.arrows = [] + self.env = Env() + self.agent = value_iteration + self.iteration_count = 0 + self.improvement_count = 0 + self.is_moving = 0 + (self.up, self.down, self.left, + self.right), self.shapes = self.load_images() + self.canvas = self._build_canvas() + self.text_reward(2, 2, "R : 1.0") + self.text_reward(1, 2, "R : -1.0") + self.text_reward(2, 1, "R : -1.0") + + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) + # buttons + iteration_button = tk.Button(self, text="Calculate", + command=self.calculate_value) + iteration_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10, + window=iteration_button) + + policy_button = tk.Button(self, text="Print Policy", + command=self.print_optimal_policy) + policy_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10, + window=policy_button) + + policy_button = tk.Button(self, text="Move", + command=self.move_by_policy) + policy_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10, + window=policy_button) + + policy_button = tk.Button(self, text="Clear", command=self.clear) + 
policy_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10, + window=policy_button) + + # create grids + for col in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT + canvas.create_line(x0, y0, x1, y1) + for row in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row + canvas.create_line(x0, y0, x1, y1) + + # add img to canvas + self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) + canvas.create_image(250, 150, image=self.shapes[1]) + canvas.create_image(150, 250, image=self.shapes[1]) + canvas.create_image(250, 250, image=self.shapes[2]) + + # pack all + canvas.pack() + + return canvas + + def load_images(self): + PhotoImage = ImageTk.PhotoImage + up = PhotoImage(Image.open("../img/up.png").resize((13, 13))) + right = PhotoImage(Image.open("../img/right.png").resize((13, 13))) + left = PhotoImage(Image.open("../img/left.png").resize((13, 13))) + down = PhotoImage(Image.open("../img/down.png").resize((13, 13))) + rectangle = PhotoImage( + Image.open("../img/rectangle.png").resize((65, 65))) + triangle = PhotoImage( + Image.open("../img/triangle.png").resize((65, 65))) + circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65))) + return (up, down, left, right), (rectangle, triangle, circle) + + def clear(self): + + if self.is_moving == 0: + self.iteration_count = 0 + self.improvement_count = 0 + for i in self.texts: + self.canvas.delete(i) + + for i in self.arrows: + self.canvas.delete(i) + + self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)] + + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + + def reset(self): + self.update() + time.sleep(0.5) + self.canvas.delete(self.rectangle) + return self.canvas.coords(self.rectangle) + + def text_value(self, row, col, contents, font='Helvetica', size=12, + style='normal', anchor="nw"): + origin_x, origin_y = 85, 70 + x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) + font = (font, str(size), style) + text = self.canvas.create_text(x, y, fill="black", text=contents, + font=font, anchor=anchor) + return self.texts.append(text) + + def text_reward(self, row, col, contents, font='Helvetica', size=12, + style='normal', anchor="nw"): + origin_x, origin_y = 5, 5 + x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) + font = (font, str(size), style) + text = self.canvas.create_text(x, y, fill="black", text=contents, + font=font, anchor=anchor) + return self.texts.append(text) + + def rectangle_move(self, action): + base_action = np.array([0, 0]) + location = self.find_rectangle() + self.render() + if action == 0 and location[0] > 0: # up + base_action[1] -= UNIT + elif action == 1 and location[0] < HEIGHT - 1: # down + base_action[1] += UNIT + elif action == 2 and location[1] > 0: # left + base_action[0] -= UNIT + elif action == 3 and location[1] < WIDTH - 1: # right + base_action[0] += UNIT + + self.canvas.move(self.rectangle, base_action[0], + base_action[1]) # move agent + + def find_rectangle(self): + temp = self.canvas.coords(self.rectangle) + x = (temp[0] / 100) - 0.5 + y = (temp[1] / 100) - 0.5 + return int(y), int(x) + + def move_by_policy(self): + + if self.improvement_count != 0 and self.is_moving != 1: + self.is_moving = 1 + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + + x, y = self.find_rectangle() + while 
len(self.agent.get_action([x, y])) != 0: + action = random.sample(self.agent.get_action([x, y]), 1)[0] + self.after(100, self.rectangle_move(action)) + x, y = self.find_rectangle() + self.is_moving = 0 + + def draw_one_arrow(self, col, row, action): + if col == 2 and row == 2: + return + if action == 0: # up + origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.up)) + elif action == 1: # down + origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.down)) + elif action == 3: # right + origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.right)) + elif action == 2: # left + origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.left)) + + def draw_from_values(self, state, action_list): + i = state[0] + j = state[1] + for action in action_list: + self.draw_one_arrow(i, j, action) + + def print_values(self, values): + for i in range(WIDTH): + for j in range(HEIGHT): + self.text_value(i, j, values[i][j]) + + def render(self): + time.sleep(0.1) + self.canvas.tag_raise(self.rectangle) + self.update() + + def calculate_value(self): + self.iteration_count += 1 + for i in self.texts: + self.canvas.delete(i) + self.agent.value_iteration() + self.print_values(self.agent.value_table) + + def print_optimal_policy(self): + self.improvement_count += 1 + for i in self.arrows: + self.canvas.delete(i) + for state in self.env.get_all_states(): + action = self.agent.get_action(state) + self.draw_from_values(state, action) + + +class Env: + def __init__(self): + self.transition_probability = TRANSITION_PROB + self.width = WIDTH # Width of Grid World + self.height = HEIGHT # Height of GridWorld + self.reward = [[0] * WIDTH for _ in range(HEIGHT)] + self.possible_actions = POSSIBLE_ACTIONS + self.reward[2][2] = 1 # reward 1 for circle + self.reward[1][2] = -1 # reward -1 for triangle + self.reward[2][1] = -1 # reward -1 for triangle + self.all_state = [] + + for x in range(WIDTH): + for y in range(HEIGHT): + state = [x, y] + self.all_state.append(state) + + def get_reward(self, state, action): + next_state = self.state_after_action(state, action) + return self.reward[next_state[0]][next_state[1]] + + def state_after_action(self, state, action_index): + action = ACTIONS[action_index] + return self.check_boundary([state[0] + action[0], state[1] + action[1]]) + + @staticmethod + def check_boundary(state): + state[0] = (0 if state[0] < 0 else WIDTH - 1 + if state[0] > WIDTH - 1 else state[0]) + state[1] = (0 if state[1] < 0 else HEIGHT - 1 + if state[1] > HEIGHT - 1 else state[1]) + return state + + def get_transition_prob(self, state, action): + return self.transition_probability + + def get_all_states(self): + return self.all_state diff --git a/1-grid-world/2-value-iteration/value_iteration.py b/1-grid-world/2-value-iteration/value_iteration.py new file mode 100644 index 00000000..8dff7281 --- /dev/null +++ b/1-grid-world/2-value-iteration/value_iteration.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +from environment import GraphicDisplay, Env + +class ValueIteration: + def __init__(self, env): + self.env = env + # 2-d list for the value function + self.value_table = [[0.0] * env.width for _ in range(env.height)] + self.discount_factor = 0.9 + + # get next value function table from the 
current value function table + def value_iteration(self): + next_value_table = [[0.0] * self.env.width + for _ in range(self.env.height)] + for state in self.env.get_all_states(): + if state == [2, 2]: + next_value_table[state[0]][state[1]] = 0.0 + continue + value_list = [] + + for action in self.env.possible_actions: + next_state = self.env.state_after_action(state, action) + reward = self.env.get_reward(state, action) + next_value = self.get_value(next_state) + value_list.append((reward + self.discount_factor * next_value)) + # return the maximum value(it is the optimality equation!!) + next_value_table[state[0]][state[1]] = round(max(value_list), 2) + self.value_table = next_value_table + + # get action according to the current value function table + def get_action(self, state): + action_list = [] + max_value = -99999 + + if state == [2, 2]: + return [] + + # calculating q values for the all actions and + # append the action to action list which has maximum q value + for action in self.env.possible_actions: + + next_state = self.env.state_after_action(state, action) + reward = self.env.get_reward(state, action) + next_value = self.get_value(next_state) + value = (reward + self.discount_factor * next_value) + + if value > max_value: + action_list.clear() + action_list.append(action) + max_value = value + elif value == max_value: + action_list.append(action) + + return action_list + + def get_value(self, state): + return round(self.value_table[state[0]][state[1]], 2) + +if __name__ == "__main__": + env = Env() + value_iteration = ValueIteration(env) + grid_world = GraphicDisplay(value_iteration) + grid_world.mainloop() diff --git a/1-grid-world/3-monte-carlo/environment.py b/1-grid-world/3-monte-carlo/environment.py new file mode 100644 index 00000000..d885107d --- /dev/null +++ b/1-grid-world/3-monte-carlo/environment.py @@ -0,0 +1,113 @@ +import time +import numpy as np +import tkinter as tk +from PIL import ImageTk, Image + +np.random.seed(1) +PhotoImage = ImageTk.PhotoImage +UNIT = 100 # pixels +HEIGHT = 5 # grid height +WIDTH = 5 # grid width + + +class Env(tk.Tk): + def __init__(self): + super(Env, self).__init__() + self.action_space = ['u', 'd', 'l', 'r'] + self.n_actions = len(self.action_space) + self.title('monte carlo') + self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) + self.shapes = self.load_images() + self.canvas = self._build_canvas() + self.texts = [] + + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) + # create grids + for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT + canvas.create_line(x0, y0, x1, y1) + for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r + canvas.create_line(x0, y0, x1, y1) + + # add img to canvas + self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) + self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) + self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) + self.circle = canvas.create_image(250, 250, image=self.shapes[2]) + + # pack all + canvas.pack() + + return canvas + + def load_images(self): + rectangle = PhotoImage( + Image.open("../img/rectangle.png").resize((65, 65))) + triangle = PhotoImage( + Image.open("../img/triangle.png").resize((65, 65))) + circle = PhotoImage( + Image.open("../img/circle.png").resize((65, 65))) + + return rectangle, triangle, circle + + @staticmethod + def coords_to_state(coords): + x = int((coords[0] - 
50) / 100) + y = int((coords[1] - 50) / 100) + return [x, y] + + def reset(self): + self.update() + time.sleep(0.5) + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + # return observation + return self.coords_to_state(self.canvas.coords(self.rectangle)) + + def step(self, action): + state = self.canvas.coords(self.rectangle) + base_action = np.array([0, 0]) + self.render() + + if action == 0: # up + if state[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if state[1] < (HEIGHT - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # left + if state[0] > UNIT: + base_action[0] -= UNIT + elif action == 3: # right + if state[0] < (WIDTH - 1) * UNIT: + base_action[0] += UNIT + # move agent + self.canvas.move(self.rectangle, base_action[0], base_action[1]) + # move rectangle to top level of canvas + self.canvas.tag_raise(self.rectangle) + + next_state = self.canvas.coords(self.rectangle) + + # reward function + if next_state == self.canvas.coords(self.circle): + reward = 100 + done = True + elif next_state in [self.canvas.coords(self.triangle1), + self.canvas.coords(self.triangle2)]: + reward = -100 + done = True + else: + reward = 0 + done = False + + next_state = self.coords_to_state(next_state) + + return next_state, reward, done + + def render(self): + time.sleep(0.03) + self.update() diff --git a/1-grid-world/3-monte-carlo/mc_agent.py b/1-grid-world/3-monte-carlo/mc_agent.py new file mode 100644 index 00000000..682b59b9 --- /dev/null +++ b/1-grid-world/3-monte-carlo/mc_agent.py @@ -0,0 +1,111 @@ +import numpy as np +import random +from collections import defaultdict +from environment import Env + + +# Monte Carlo Agent which learns every episodes from the sample +class MCAgent: + def __init__(self, actions): + self.width = 5 + self.height = 5 + self.actions = actions + self.learning_rate = 0.01 + self.discount_factor = 0.9 + self.epsilon = 0.1 + self.samples = [] + self.value_table = defaultdict(float) + + # append sample to memory(state, reward, done) + def save_sample(self, state, reward, done): + self.samples.append([state, reward, done]) + + # for every episode, agent updates q function of visited states + def update(self): + G_t = 0 + visit_state = [] + for reward in reversed(self.samples): + state = str(reward[0]) + if state not in visit_state: + visit_state.append(state) + G_t = self.discount_factor * (reward[1] + G_t) + value = self.value_table[state] + self.value_table[state] = (value + + self.learning_rate * (G_t - value)) + + # get action for the state according to the q function table + # agent pick action of epsilon-greedy policy + def get_action(self, state): + if np.random.rand() < self.epsilon: + # take random action + action = np.random.choice(self.actions) + else: + # take action according to the q function table + next_state = self.possible_next_state(state) + action = self.arg_max(next_state) + return int(action) + + # compute arg_max if multiple candidates exit, pick one randomly + @staticmethod + def arg_max(next_state): + max_index_list = [] + max_value = next_state[0] + for index, value in enumerate(next_state): + if value > max_value: + max_index_list.clear() + max_value = value + max_index_list.append(index) + elif value == max_value: + max_index_list.append(index) + return random.choice(max_index_list) + + # get the possible next states + def possible_next_state(self, state): + col, row = state + next_state = [0.0] * 4 + + if row != 0: + next_state[0] = self.value_table[str([col, row - 
1])] + else: + next_state[0] = self.value_table[str(state)] + if row != self.height - 1: + next_state[1] = self.value_table[str([col, row + 1])] + else: + next_state[1] = self.value_table[str(state)] + if col != 0: + next_state[2] = self.value_table[str([col - 1, row])] + else: + next_state[2] = self.value_table[str(state)] + if col != self.width - 1: + next_state[3] = self.value_table[str([col + 1, row])] + else: + next_state[3] = self.value_table[str(state)] + + return next_state + + +# main loop +if __name__ == "__main__": + env = Env() + agent = MCAgent(actions=list(range(env.n_actions))) + + for episode in range(1000): + state = env.reset() + action = agent.get_action(state) + + while True: + env.render() + + # forward to next state. reward is number and done is boolean + next_state, reward, done = env.step(action) + agent.save_sample(next_state, reward, done) + + # get next action + action = agent.get_action(next_state) + + # at the end of each episode, update the q function table + if done: + print("episode : ", episode) + agent.update() + agent.samples.clear() + break diff --git a/Code 1. Grid World/4. SARSA/.python-version b/1-grid-world/4-sarsa/.python-version similarity index 100% rename from Code 1. Grid World/4. SARSA/.python-version rename to 1-grid-world/4-sarsa/.python-version diff --git a/Code 1. Grid World/3. Monte-Carlo/environment.py b/1-grid-world/4-sarsa/environment.py similarity index 55% rename from Code 1. Grid World/3. Monte-Carlo/environment.py rename to 1-grid-world/4-sarsa/environment.py index 30074db3..acf6d819 100644 --- a/Code 1. Grid World/3. Monte-Carlo/environment.py +++ b/1-grid-world/4-sarsa/environment.py @@ -4,7 +4,7 @@ from PIL import ImageTk, Image np.random.seed(1) - +PhotoImage = ImageTk.PhotoImage UNIT = 100 # pixels HEIGHT = 5 # grid height WIDTH = 5 # grid width @@ -15,41 +15,47 @@ def __init__(self): super(Env, self).__init__() self.action_space = ['u', 'd', 'l', 'r'] self.n_actions = len(self.action_space) - self.title('monte carlo') + self.title('SARSA') self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) - self.buildGraphic() + self.shapes = self.load_images() + self.canvas = self._build_canvas() self.texts = [] - def buildGraphic(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) # create grids for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) + canvas.create_line(x0, y0, x1, y1) for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r - self.canvas.create_line(x0, y0, x1, y1) - - # image_load - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS)) - self.triange_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((65, 65))) - self.circle_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((65, 65))) + canvas.create_line(x0, y0, x1, y1) - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.triangle1 = self.canvas.create_image(250, 150, image=self.triange_image) - self.triangle2 = self.canvas.create_image(150, 250, image=self.triange_image) - self.circle = self.canvas.create_image(250, 250, image=self.circle_image) + # add img to canvas + self.rectangle = canvas.create_image(50, 50, 
image=self.shapes[0]) + self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) + self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) + self.circle = canvas.create_image(250, 250, image=self.shapes[2]) # pack all - self.canvas.pack() + canvas.pack() + + return canvas + + def load_images(self): + rectangle = PhotoImage( + Image.open("../img/rectangle.png").resize((65, 65))) + triangle = PhotoImage( + Image.open("../img/triangle.png").resize((65, 65))) + circle = PhotoImage( + Image.open("../img/circle.png").resize((65, 65))) - def text_value(self, row, col, contents, action, font='Helvetica', size=10, style='normal', anchor="nw"): + return rectangle, triangle, circle + def text_value(self, row, col, contents, action, font='Helvetica', size=10, + style='normal', anchor="nw"): if action == 0: origin_x, origin_y = 7, 42 elif action == 1: @@ -61,36 +67,33 @@ def text_value(self, row, col, contents, action, font='Helvetica', size=10, styl x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) font = (font, str(size), style) - return self.texts.append(self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor)) + text = self.canvas.create_text(x, y, fill="black", text=contents, + font=font, anchor=anchor) + return self.texts.append(text) def print_value_all(self, q_table): for i in self.texts: self.canvas.delete(i) self.texts.clear() - for i in range(HEIGHT): - for j in range(WIDTH): + for x in range(HEIGHT): + for y in range(WIDTH): for action in range(0, 4): - state = [i, j] - if str(state) in q_table.index: - temp = q_table.ix[str(state), action] - self.text_value(j, i, round(temp, 2), action) + state = [x, y] + if str(state) in q_table.keys(): + temp = q_table[str(state)][action] + self.text_value(y, x, round(temp, 2), action) def coords_to_state(self, coords): x = int((coords[0] - 50) / 100) y = int((coords[1] - 50) / 100) return [x, y] - def state_to_coords(self, state): - x = int(state[0] * 100 + 50) - y = int(state[1] * 100 + 50) - return [x, y] - def reset(self): self.update() time.sleep(0.5) - self.canvas.delete(self.rectangle) - origin = np.array([UNIT / 2, UNIT / 2]) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + self.render() # return observation return self.coords_to_state(self.canvas.coords(self.rectangle)) @@ -112,15 +115,18 @@ def step(self, action): if state[0] < (WIDTH - 1) * UNIT: base_action[0] += UNIT - self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move agent - - next_state = self.canvas.coords(self.rectangle) # next state + # move agent + self.canvas.move(self.rectangle, base_action[0], base_action[1]) + # move rectangle to top level of canvas + self.canvas.tag_raise(self.rectangle) + next_state = self.canvas.coords(self.rectangle) # reward function if next_state == self.canvas.coords(self.circle): reward = 100 done = True - elif next_state in [self.canvas.coords(self.triangle1), self.canvas.coords(self.triangle2)]: + elif next_state in [self.canvas.coords(self.triangle1), + self.canvas.coords(self.triangle2)]: reward = -100 done = True else: @@ -132,5 +138,5 @@ def step(self, action): return next_state, reward, done def render(self): - time.sleep(0.05) + time.sleep(0.03) self.update() diff --git a/1-grid-world/4-sarsa/sarsa_agent.py b/1-grid-world/4-sarsa/sarsa_agent.py new file mode 100644 index 00000000..8a8cf9ef --- /dev/null +++ 
b/1-grid-world/4-sarsa/sarsa_agent.py @@ -0,0 +1,79 @@ +import numpy as np +import random +from collections import defaultdict +from environment import Env + + +# SARSA agent learns every time step from the sample +class SARSAgent: + def __init__(self, actions): + self.actions = actions + self.learning_rate = 0.01 + self.discount_factor = 0.9 + self.epsilon = 0.1 + self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) + + # with sample , learns new q function + def learn(self, state, action, reward, next_state, next_action): + current_q = self.q_table[state][action] + next_state_q = self.q_table[next_state][next_action] + new_q = (current_q + self.learning_rate * + (reward + self.discount_factor * next_state_q - current_q)) + self.q_table[state][action] = new_q + + # get action for the state according to the q function table + # agent pick action of epsilon-greedy policy + def get_action(self, state): + if np.random.rand() < self.epsilon: + # take random action + action = np.random.choice(self.actions) + else: + # take action according to the q function table + state_action = self.q_table[state] + action = self.arg_max(state_action) + return action + + @staticmethod + def arg_max(state_action): + max_index_list = [] + max_value = state_action[0] + for index, value in enumerate(state_action): + if value > max_value: + max_index_list.clear() + max_value = value + max_index_list.append(index) + elif value == max_value: + max_index_list.append(index) + return random.choice(max_index_list) + +if __name__ == "__main__": + env = Env() + agent = SARSAgent(actions=list(range(env.n_actions))) + + for episode in range(1000): + # reset environment and initialize state + + state = env.reset() + # get action of state from agent + action = agent.get_action(str(state)) + + while True: + env.render() + + # take action and proceed one step in the environment + next_state, reward, done = env.step(action) + next_action = agent.get_action(str(next_state)) + + # with sample , agent learns new q function + agent.learn(str(state), action, reward, str(next_state), next_action) + + state = next_state + action = next_action + + # print q function of all states at screen + env.print_value_all(agent.q_table) + + # if episode ends, then break + if done: + break + diff --git a/Code 1. Grid World/5. Q Learning/.python-version b/1-grid-world/5-q-learning/.python-version similarity index 100% rename from Code 1. Grid World/5. Q Learning/.python-version rename to 1-grid-world/5-q-learning/.python-version diff --git a/Code 1. Grid World/5. Q Learning/environment.py b/1-grid-world/5-q-learning/environment.py similarity index 60% rename from Code 1. Grid World/5. Q Learning/environment.py rename to 1-grid-world/5-q-learning/environment.py index 30074db3..e724e5ac 100644 --- a/Code 1. Grid World/5. 
Q Learning/environment.py +++ b/1-grid-world/5-q-learning/environment.py @@ -4,7 +4,7 @@ from PIL import ImageTk, Image np.random.seed(1) - +PhotoImage = ImageTk.PhotoImage UNIT = 100 # pixels HEIGHT = 5 # grid height WIDTH = 5 # grid width @@ -15,40 +15,47 @@ def __init__(self): super(Env, self).__init__() self.action_space = ['u', 'd', 'l', 'r'] self.n_actions = len(self.action_space) - self.title('monte carlo') + self.title('Q Learning') self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) - self.buildGraphic() + self.shapes = self.load_images() + self.canvas = self._build_canvas() self.texts = [] - def buildGraphic(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) # create grids for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) + canvas.create_line(x0, y0, x1, y1) for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r - self.canvas.create_line(x0, y0, x1, y1) - - # image_load - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS)) - self.triange_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((65, 65))) - self.circle_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((65, 65))) + canvas.create_line(x0, y0, x1, y1) - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.triangle1 = self.canvas.create_image(250, 150, image=self.triange_image) - self.triangle2 = self.canvas.create_image(150, 250, image=self.triange_image) - self.circle = self.canvas.create_image(250, 250, image=self.circle_image) + # add img to canvas + self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) + self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) + self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) + self.circle = canvas.create_image(250, 250, image=self.shapes[2]) # pack all - self.canvas.pack() + canvas.pack() + + return canvas + + def load_images(self): + rectangle = PhotoImage( + Image.open("../img/rectangle.png").resize((65, 65))) + triangle = PhotoImage( + Image.open("../img/triangle.png").resize((65, 65))) + circle = PhotoImage( + Image.open("../img/circle.png").resize((65, 65))) - def text_value(self, row, col, contents, action, font='Helvetica', size=10, style='normal', anchor="nw"): + return rectangle, triangle, circle + + def text_value(self, row, col, contents, action, font='Helvetica', size=10, + style='normal', anchor="nw"): if action == 0: origin_x, origin_y = 7, 42 @@ -61,7 +68,9 @@ def text_value(self, row, col, contents, action, font='Helvetica', size=10, styl x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) font = (font, str(size), style) - return self.texts.append(self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor)) + text = self.canvas.create_text(x, y, fill="black", text=contents, + font=font, anchor=anchor) + return self.texts.append(text) def print_value_all(self, q_table): for i in self.texts: @@ -71,8 +80,8 @@ def print_value_all(self, q_table): for j in range(WIDTH): for action in range(0, 4): state = [i, j] - if str(state) in q_table.index: - temp = q_table.ix[str(state), action] + if str(state) in q_table.keys(): + temp = 
q_table[str(state)][action] self.text_value(j, i, round(temp, 2), action) def coords_to_state(self, coords): @@ -88,12 +97,13 @@ def state_to_coords(self, state): def reset(self): self.update() time.sleep(0.5) - self.canvas.delete(self.rectangle) - origin = np.array([UNIT / 2, UNIT / 2]) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + self.render() # return observation return self.coords_to_state(self.canvas.coords(self.rectangle)) + def step(self, action): state = self.canvas.coords(self.rectangle) base_action = np.array([0, 0]) @@ -112,15 +122,18 @@ def step(self, action): if state[0] < (WIDTH - 1) * UNIT: base_action[0] += UNIT - self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move agent - - next_state = self.canvas.coords(self.rectangle) # next state + # move agent + self.canvas.move(self.rectangle, base_action[0], base_action[1]) + # move rectangle to top level of canvas + self.canvas.tag_raise(self.rectangle) + next_state = self.canvas.coords(self.rectangle) # reward function if next_state == self.canvas.coords(self.circle): reward = 100 done = True - elif next_state in [self.canvas.coords(self.triangle1), self.canvas.coords(self.triangle2)]: + elif next_state in [self.canvas.coords(self.triangle1), + self.canvas.coords(self.triangle2)]: reward = -100 done = True else: @@ -128,9 +141,8 @@ def step(self, action): done = False next_state = self.coords_to_state(next_state) - return next_state, reward, done def render(self): - time.sleep(0.05) + time.sleep(0.03) self.update() diff --git a/1-grid-world/5-q-learning/q_learning_agent.py b/1-grid-world/5-q-learning/q_learning_agent.py new file mode 100644 index 00000000..029c2f36 --- /dev/null +++ b/1-grid-world/5-q-learning/q_learning_agent.py @@ -0,0 +1,69 @@ +import numpy as np +import random +from environment import Env +from collections import defaultdict + +class QLearningAgent: + def __init__(self, actions): + # actions = [0, 1, 2, 3] + self.actions = actions + self.learning_rate = 0.01 + self.discount_factor = 0.9 + self.epsilon = 0.1 + self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) + + # update q function with sample + def learn(self, state, action, reward, next_state): + current_q = self.q_table[state][action] + # using Bellman Optimality Equation to update q function + new_q = reward + self.discount_factor * max(self.q_table[next_state]) + self.q_table[state][action] += self.learning_rate * (new_q - current_q) + + # get action for the state according to the q function table + # agent pick action of epsilon-greedy policy + def get_action(self, state): + if np.random.rand() < self.epsilon: + # take random action + action = np.random.choice(self.actions) + else: + # take action according to the q function table + state_action = self.q_table[state] + action = self.arg_max(state_action) + return action + + @staticmethod + def arg_max(state_action): + max_index_list = [] + max_value = state_action[0] + for index, value in enumerate(state_action): + if value > max_value: + max_index_list.clear() + max_value = value + max_index_list.append(index) + elif value == max_value: + max_index_list.append(index) + return random.choice(max_index_list) + +if __name__ == "__main__": + env = Env() + agent = QLearningAgent(actions=list(range(env.n_actions))) + + for episode in range(1000): + state = env.reset() + + while True: + env.render() + + # take action and proceed one step in 
the environment + action = agent.get_action(str(state)) + next_state, reward, done = env.step(action) + + # with sample , agent learns new q function + agent.learn(str(state), action, reward, str(next_state)) + + state = next_state + env.print_value_all(agent.q_table) + + # if episode ends, then break + if done: + break diff --git a/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py b/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py new file mode 100755 index 00000000..a1b1c23b --- /dev/null +++ b/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py @@ -0,0 +1,117 @@ +import copy +import pylab +import random +import numpy as np +from environment import Env +from keras.layers import Dense +from keras.optimizers import Adam +from keras.models import Sequential + +EPISODES = 1000 + + +# this is DeepSARSA Agent for the GridWorld +# Utilize Neural Network as q function approximator +class DeepSARSAgent: + def __init__(self): + self.load_model = False + # actions which agent can do + self.action_space = [0, 1, 2, 3, 4] + # get size of state and action + self.action_size = len(self.action_space) + self.state_size = 15 + self.discount_factor = 0.99 + self.learning_rate = 0.001 + + self.epsilon = 1. # exploration + self.epsilon_decay = .9999 + self.epsilon_min = 0.01 + self.model = self.build_model() + + if self.load_model: + self.epsilon = 0.05 + self.model.load_weights('./save_model/deep_sarsa_trained.h5') + + # approximate Q function using Neural Network + # state is input and Q Value of each action is output of network + def build_model(self): + model = Sequential() + model.add(Dense(30, input_dim=self.state_size, activation='relu')) + model.add(Dense(30, activation='relu')) + model.add(Dense(self.action_size, activation='linear')) + model.summary() + model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) + return model + + # get action from model using epsilon-greedy policy + def get_action(self, state): + if np.random.rand() <= self.epsilon: + # The agent acts randomly + return random.randrange(self.action_size) + else: + # Predict the reward value based on the given state + state = np.float32(state) + q_values = self.model.predict(state) + return np.argmax(q_values[0]) + + def train_model(self, state, action, reward, next_state, next_action, done): + if self.epsilon > self.epsilon_min: + self.epsilon *= self.epsilon_decay + + state = np.float32(state) + next_state = np.float32(next_state) + target = self.model.predict(state)[0] + # like Q Learning, get maximum Q value at s' + # But from target model + if done: + target[action] = reward + else: + target[action] = (reward + self.discount_factor * + self.model.predict(next_state)[0][next_action]) + + target = np.reshape(target, [1, 5]) + # make minibatch which includes target q value and predicted q value + # and do the model fit! 
+ self.model.fit(state, target, epochs=1, verbose=0) + + +if __name__ == "__main__": + env = Env() + agent = DeepSARSAgent() + + global_step = 0 + scores, episodes = [], [] + + for e in range(EPISODES): + done = False + score = 0 + state = env.reset() + state = np.reshape(state, [1, 15]) + + while not done: + # fresh env + global_step += 1 + + # get action for the current state and go one step in environment + action = agent.get_action(state) + next_state, reward, done = env.step(action) + next_state = np.reshape(next_state, [1, 15]) + next_action = agent.get_action(next_state) + agent.train_model(state, action, reward, next_state, next_action, + done) + state = next_state + # every time step we do training + score += reward + + state = copy.deepcopy(next_state) + + if done: + scores.append(score) + episodes.append(e) + pylab.plot(episodes, scores, 'b') + pylab.savefig("./save_graph/deep_sarsa_.png") + print("episode:", e, " score:", score, "global_step", + global_step, " epsilon:", agent.epsilon) + + if e % 100 == 0: + agent.model.save_weights("./save_model/deep_sarsa.h5") diff --git a/Code 1. Grid World/6. DQN/environment.py b/1-grid-world/6-deep-sarsa/environment.py old mode 100644 new mode 100755 similarity index 52% rename from Code 1. Grid World/6. DQN/environment.py rename to 1-grid-world/6-deep-sarsa/environment.py index a30093a1..c390de8b --- a/Code 1. Grid World/6. DQN/environment.py +++ b/1-grid-world/6-deep-sarsa/environment.py @@ -3,9 +3,10 @@ import tkinter as tk from PIL import ImageTk, Image +PhotoImage = ImageTk.PhotoImage UNIT = 50 # pixels -HEIGHT = 10 # grid height -WIDTH = 10 # grid width +HEIGHT = 5 # grid height +WIDTH = 5 # grid width np.random.seed(1) @@ -15,54 +16,52 @@ def __init__(self): super(Env, self).__init__() self.action_space = ['u', 'd', 'l', 'r'] self.action_size = len(self.action_space) - self.title('DeepQNetwork') + self.title('DeepSARSA') self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) - self.build_graphic() + self.shapes = self.load_images() + self.canvas = self._build_canvas() self.counter = 0 + self.rewards = [] + self.goal = [] + # obstacle + self.set_reward([0, 1], -1) + self.set_reward([1, 2], -1) + self.set_reward([2, 3], -1) + # #goal + self.set_reward([4, 4], 1) - def build_graphic(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) # create grids for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) + canvas.create_line(x0, y0, x1, y1) for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r - self.canvas.create_line(x0, y0, x1, y1) + canvas.create_line(x0, y0, x1, y1) - # image_load - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((30, 30), Image.ANTIALIAS)) - self.fire_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((30, 30))) - self.fish_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((30, 30))) + self.rewards = [] + self.goal = [] + # add image to canvas + x, y = UNIT/2, UNIT/2 + self.rectangle = canvas.create_image(x, y, image=self.shapes[0]) - self.rewards = list() - self.goal = list() + # pack all` + canvas.pack() - # obstacle - self.set_reward([2, 7], -1) - self.set_reward([3, 2], -1) - self.set_reward([2, 5], -1) - self.set_reward([4, 9], -1) - 
self.set_reward([5, 7], -1) - self.set_reward([6, 4], -1) - self.set_reward([7, 8], -1) - self.set_reward([8, 3], -1) - self.set_reward([9, 1], -1) - - # - # - # #goal - self.set_reward([9, 9], 5) + return canvas - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) + def load_images(self): + rectangle = PhotoImage( + Image.open("../img/rectangle.png").resize((30, 30))) + triangle = PhotoImage( + Image.open("../img/triangle.png").resize((30, 30))) + circle = PhotoImage( + Image.open("../img/circle.png").resize((30, 30))) - # pack all` - self.canvas.pack() + return rectangle, triangle, circle def reset_reward(self): @@ -71,36 +70,33 @@ def reset_reward(self): self.rewards.clear() self.goal.clear() - # obstacle - self.set_reward([2, 7], -1) - self.set_reward([3, 2], -1) - self.set_reward([2, 5], -1) - self.set_reward([4, 9], -1) - self.set_reward([5, 7], -1) - self.set_reward([6, 4], -1) - self.set_reward([7, 8], -1) - self.set_reward([8, 3], -1) - self.set_reward([9, 1], -1) - - # - # + self.set_reward([0, 1], -1) + self.set_reward([1, 2], -1) + self.set_reward([2, 3], -1) + # #goal - self.set_reward([9, 9], 5) + self.set_reward([4, 4], 1) def set_reward(self, state, reward): state = [int(state[0]), int(state[1])] + x = int(state[0]) + y = int(state[1]) temp = {} if reward > 0: temp['reward'] = reward - temp['figure'] = self.canvas.create_image((UNIT * state[0]) + UNIT/2, (UNIT * state[1]) + UNIT/2, - image=self.fish_image) + temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, + (UNIT * y) + UNIT / 2, + image=self.shapes[2]) + self.goal.append(temp['figure']) elif reward < 0: + temp['direction'] = -1 temp['reward'] = reward - temp['figure'] = self.canvas.create_image((UNIT * state[0]) + UNIT/2, (UNIT * state[1]) + UNIT/2, - image=self.fire_image) + temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, + (UNIT * y) + UNIT / 2, + image=self.shapes[1]) temp['coords'] = self.canvas.coords(temp['figure']) temp['state'] = state @@ -112,28 +108,28 @@ def check_if_reward(self, state): check_list = dict() check_list['if_goal'] = False rewards = 0 + for reward in self.rewards: if reward['state'] == state: rewards += reward['reward'] - if reward['reward'] == 5: + if reward['reward'] == 1: check_list['if_goal'] = True + check_list['rewards'] = rewards return check_list def coords_to_state(self, coords): - x = int((coords[0] - 50) / 100) - y = int((coords[1] - 50) / 100) + x = int((coords[0] - UNIT / 2) / UNIT) + y = int((coords[1] - UNIT / 2) / UNIT) return [x, y] def reset(self): self.update() time.sleep(0.5) - self.canvas.delete(self.rectangle) - origin = np.array([UNIT / 2, UNIT / 2]) - self.rectangle = self.canvas.create_image(UNIT/2, UNIT/2, image=self.rectangle_image) + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) # return observation - self.reset_reward() return self.get_state() @@ -141,66 +137,77 @@ def step(self, action): self.counter += 1 self.render() - next_coords = self.move(self.rectangle, action) - if self.counter % 2 == 1: self.rewards = self.move_rewards() + next_coords = self.move(self.rectangle, action) check = self.check_if_reward(self.coords_to_state(next_coords)) done = check['if_goal'] reward = check['rewards'] + self.canvas.tag_raise(self.rectangle) + s_ = self.get_state() return s_, reward, done def get_state(self): - agent_location = self.coords_to_state(self.canvas.coords(self.rectangle)) - agent_x = agent_location[0] - agent_y = 
agent_location[1] + location = self.coords_to_state(self.canvas.coords(self.rectangle)) + agent_x = location[0] + agent_y = location[1] - locations = list() + states = list() - locations.append(agent_x) - locations.append(agent_y) + # locations.append(agent_x) + # locations.append(agent_y) for reward in self.rewards: reward_location = reward['state'] - locations.append(agent_x - reward_location[0]) - locations.append(agent_y - reward_location[1]) + states.append(reward_location[0] - agent_x) + states.append(reward_location[1] - agent_y) + if reward['reward'] < 0: + states.append(-1) + states.append(reward['direction']) + else: + states.append(1) - return locations + return states def move_rewards(self): new_rewards = [] for temp in self.rewards: - if temp['reward'] == 10: + if temp['reward'] == 1: new_rewards.append(temp) continue - temp['coords'] = self.move_const(temp['figure']) + temp['coords'] = self.move_const(temp) temp['state'] = self.coords_to_state(temp['coords']) new_rewards.append(temp) return new_rewards def move_const(self, target): - s = self.canvas.coords(target) + + s = self.canvas.coords(target['figure']) base_action = np.array([0, 0]) - if s[0] < (WIDTH - 1) * UNIT: - base_action[0] += UNIT - else: - base_action[0] = -(WIDTH - 1) * UNIT + if s[0] == (WIDTH - 1) * UNIT + UNIT / 2: + target['direction'] = 1 + elif s[0] == UNIT / 2: + target['direction'] = -1 - # if action == 4 # move _none + if target['direction'] == -1: + base_action[0] += UNIT + elif target['direction'] == 1: + base_action[0] -= UNIT - if target is not self.rectangle and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]: + if (target['figure'] is not self.rectangle + and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]): base_action = np.array([0, 0]) - self.canvas.move(target, base_action[0], base_action[1]) + self.canvas.move(target['figure'], base_action[0], base_action[1]) - s_ = self.canvas.coords(target) + s_ = self.canvas.coords(target['figure']) return s_ @@ -222,11 +229,6 @@ def move(self, target, action): if s[0] > UNIT: base_action[0] -= UNIT - # if action == 4 # move _none - - if target is not self.rectangle and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]: - base_action = np.array([0, 0]) - self.canvas.move(target, base_action[0], base_action[1]) s_ = self.canvas.coords(target) @@ -234,5 +236,5 @@ def move(self, target, action): return s_ def render(self): - time.sleep(0.1) + time.sleep(0.07) self.update() diff --git a/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png b/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png new file mode 100644 index 00000000..8dec1d06 Binary files /dev/null and b/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png differ diff --git a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN.h5 b/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5 similarity index 55% rename from Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN.h5 rename to 1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5 index fe4933cb..23ba39c9 100644 Binary files a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN.h5 and b/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5 differ diff --git a/Code 1. Grid World/7. Policy Gradient/environment.py b/1-grid-world/7-reinforce/environment.py similarity index 51% rename from Code 1. Grid World/7. Policy Gradient/environment.py rename to 1-grid-world/7-reinforce/environment.py index 620a30b3..c8283baa 100644 --- a/Code 1. Grid World/7. 
Policy Gradient/environment.py +++ b/1-grid-world/7-reinforce/environment.py @@ -3,11 +3,12 @@ import tkinter as tk from PIL import ImageTk, Image +PhotoImage = ImageTk.PhotoImage UNIT = 50 # pixels -HEIGHT = 10 # grid height -WIDTH = 10 # grid width +HEIGHT = 5 # grid height +WIDTH = 5 # grid width -# np.random.seed(1) +np.random.seed(1) class Env(tk.Tk): @@ -15,54 +16,52 @@ def __init__(self): super(Env, self).__init__() self.action_space = ['u', 'd', 'l', 'r'] self.action_size = len(self.action_space) - self.title('Policy Gradient') + self.title('Reinforce') self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) - self.build_graphic() + self.shapes = self.load_images() + self.canvas = self._build_canvas() self.counter = 0 + self.rewards = [] + self.goal = [] + # obstacle + self.set_reward([0, 1], -1) + self.set_reward([1, 2], -1) + self.set_reward([2, 3], -1) + # #goal + self.set_reward([4, 4], 1) - def build_graphic(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) # create grids for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) + canvas.create_line(x0, y0, x1, y1) for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r - self.canvas.create_line(x0, y0, x1, y1) + canvas.create_line(x0, y0, x1, y1) - # image_load - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((30, 30), Image.ANTIALIAS)) - self.fire_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((30, 30))) - self.fish_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((30, 30))) + self.rewards = [] + self.goal = [] + # add image to canvas + x, y = UNIT/2, UNIT/2 + self.rectangle = canvas.create_image(x, y, image=self.shapes[0]) - self.rewards = list() - self.goal = list() + # pack all` + canvas.pack() - # obstacle - self.set_reward([2, 7], -1) - self.set_reward([3, 2], -1) - self.set_reward([2, 5], -1) - self.set_reward([4, 9], -1) - self.set_reward([5, 7], -1) - self.set_reward([6, 4], -1) - self.set_reward([7, 8], -1) - self.set_reward([8, 3], -1) - self.set_reward([9, 1], -1) - - # - # - # #goal - self.set_reward([9, 9], 5) + return canvas - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) + def load_images(self): + rectangle = PhotoImage( + Image.open("../img/rectangle.png").resize((30, 30))) + triangle = PhotoImage( + Image.open("../img/triangle.png").resize((30, 30))) + circle = PhotoImage( + Image.open("../img/circle.png").resize((30, 30))) - # pack all` - self.canvas.pack() + return rectangle, triangle, circle def reset_reward(self): @@ -71,36 +70,33 @@ def reset_reward(self): self.rewards.clear() self.goal.clear() - # obstacle - self.set_reward([2, 7], -1) - self.set_reward([3, 2], -1) - self.set_reward([2, 5], -1) - self.set_reward([4, 9], -1) - self.set_reward([5, 7], -1) - self.set_reward([6, 4], -1) - self.set_reward([7, 8], -1) - self.set_reward([8, 3], -1) - self.set_reward([9, 1], -1) - - # - # + self.set_reward([0, 1], -1) + self.set_reward([1, 2], -1) + self.set_reward([2, 3], -1) + # #goal - self.set_reward([9, 9], 5) + self.set_reward([4, 4], 1) def set_reward(self, state, reward): state = [int(state[0]), int(state[1])] + x = int(state[0]) + y = int(state[1]) temp = {} if reward > 0: 
temp['reward'] = reward - temp['figure'] = self.canvas.create_image((UNIT * state[0]) + UNIT/2, (UNIT * state[1]) + UNIT/2, - image=self.fish_image) + temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, + (UNIT * y) + UNIT / 2, + image=self.shapes[2]) + self.goal.append(temp['figure']) elif reward < 0: + temp['direction'] = -1 temp['reward'] = reward - temp['figure'] = self.canvas.create_image((UNIT * state[0]) + UNIT/2, (UNIT * state[1]) + UNIT/2, - image=self.fire_image) + temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, + (UNIT * y) + UNIT / 2, + image=self.shapes[1]) temp['coords'] = self.canvas.coords(temp['figure']) temp['state'] = state @@ -112,28 +108,27 @@ def check_if_reward(self, state): check_list = dict() check_list['if_goal'] = False rewards = 0 + for reward in self.rewards: if reward['state'] == state: rewards += reward['reward'] - if reward['reward'] == 5: + if reward['reward'] > 0: check_list['if_goal'] = True + check_list['rewards'] = rewards return check_list def coords_to_state(self, coords): - x = int((coords[0] - 50) / 100) - y = int((coords[1] - 50) / 100) + x = int((coords[0] - UNIT / 2) / UNIT) + y = int((coords[1] - UNIT / 2) / UNIT) return [x, y] def reset(self): self.update() - time.sleep(0.5) - self.canvas.delete(self.rectangle) - origin = np.array([UNIT / 2, UNIT / 2]) - self.rectangle = self.canvas.create_image(UNIT/2, UNIT/2, image=self.rectangle_image) + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) # return observation - self.reset_reward() return self.get_state() @@ -141,14 +136,15 @@ def step(self, action): self.counter += 1 self.render() - next_coords = self.move(self.rectangle, action) - if self.counter % 2 == 1: self.rewards = self.move_rewards() + next_coords = self.move(self.rectangle, action) check = self.check_if_reward(self.coords_to_state(next_coords)) done = check['if_goal'] reward = check['rewards'] + reward -= 0.1 + self.canvas.tag_raise(self.rectangle) s_ = self.get_state() @@ -156,51 +152,61 @@ def step(self, action): def get_state(self): - agent_location = self.coords_to_state(self.canvas.coords(self.rectangle)) - agent_x = agent_location[0] - agent_y = agent_location[1] + location = self.coords_to_state(self.canvas.coords(self.rectangle)) + agent_x = location[0] + agent_y = location[1] - locations = list() + states = list() - locations.append(agent_x) - locations.append(agent_y) + # locations.append(agent_x) + # locations.append(agent_y) for reward in self.rewards: reward_location = reward['state'] - locations.append(agent_x - reward_location[0]) - locations.append(agent_y - reward_location[1]) + states.append(reward_location[0] - agent_x) + states.append(reward_location[1] - agent_y) + if reward['reward'] < 0: + states.append(-1) + states.append(reward['direction']) + else: + states.append(1) - return locations + return states def move_rewards(self): new_rewards = [] for temp in self.rewards: - if temp['reward'] == 10: + if temp['reward'] > 0: new_rewards.append(temp) continue - temp['coords'] = self.move_const(temp['figure']) + temp['coords'] = self.move_const(temp) temp['state'] = self.coords_to_state(temp['coords']) new_rewards.append(temp) return new_rewards def move_const(self, target): - s = self.canvas.coords(target) + + s = self.canvas.coords(target['figure']) base_action = np.array([0, 0]) - if s[0] < (WIDTH - 1) * UNIT: - base_action[0] += UNIT - else: - base_action[0] = -(WIDTH - 1) * UNIT + if s[0] == (WIDTH - 1) * UNIT + UNIT / 2: + 
target['direction'] = 1 + elif s[0] == UNIT / 2: + target['direction'] = -1 - # if action == 4 # move _none + if target['direction'] == -1: + base_action[0] += UNIT + elif target['direction'] == 1: + base_action[0] -= UNIT - if target is not self.rectangle and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]: + if (target['figure'] is not self.rectangle + and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]): base_action = np.array([0, 0]) - self.canvas.move(target, base_action[0], base_action[1]) + self.canvas.move(target['figure'], base_action[0], base_action[1]) - s_ = self.canvas.coords(target) + s_ = self.canvas.coords(target['figure']) return s_ @@ -222,11 +228,6 @@ def move(self, target, action): if s[0] > UNIT: base_action[0] -= UNIT - # if action == 4 # move _none - - if target is not self.rectangle and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]: - base_action = np.array([0, 0]) - self.canvas.move(target, base_action[0], base_action[1]) s_ = self.canvas.coords(target) @@ -234,5 +235,5 @@ def move(self, target, action): return s_ def render(self): - time.sleep(0.1) + time.sleep(0.07) self.update() diff --git a/Code 1. Grid World/7. Policy Gradient/Gridworld_PG.py b/1-grid-world/7-reinforce/reinforce_agent.py similarity index 57% rename from Code 1. Grid World/7. Policy Gradient/Gridworld_PG.py rename to 1-grid-world/7-reinforce/reinforce_agent.py index 5bf590fd..2a37c851 100644 --- a/Code 1. Grid World/7. Policy Gradient/Gridworld_PG.py +++ b/1-grid-world/7-reinforce/reinforce_agent.py @@ -7,53 +7,62 @@ from keras.models import Sequential from keras import backend as K -EPISODES = 1000 +EPISODES = 2500 -class PGAgent: +# this is REINFORCE Agent for GridWorld +class ReinforceAgent: def __init__(self): - self.render = False - + self.load_model = True + # actions which agent can do self.action_space = [0, 1, 2, 3, 4] + # get size of state and action self.action_size = len(self.action_space) - self.state_size = 22 - self.discount_factor = 0.99 # decay rate + self.state_size = 15 + self.discount_factor = 0.99 self.learning_rate = 0.001 self.model = self.build_model() self.optimizer = self.optimizer() self.states, self.actions, self.rewards = [], [], [] + if self.load_model: + self.model.load_weights('./save_model/reinforce_trained.h5') + + # state is input and probability of each action(policy) is output of network def build_model(self): model = Sequential() - model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')) - model.add(Dense(24, activation='relu', kernel_initializer='glorot_uniform')) - # 마지막 softmax 계층으로 각 행동에 대한 확률을 만드는 모델을 생성 - model.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')) + model.add(Dense(24, input_dim=self.state_size, activation='relu')) + model.add(Dense(24, activation='relu')) + model.add(Dense(self.action_size, activation='softmax')) model.summary() - return model + # create error function and training function to update policy network def optimizer(self): action = K.placeholder(shape=[None, 5]) discounted_rewards = K.placeholder(shape=[None, ]) - # Policy Gradient 의 핵심 - # log(정책) * return 의 gradient 를 구해서 최대화시킴 - good_prob = K.sum(action * self.model.output, axis=1) - eligibility = K.log(good_prob) * discounted_rewards - loss = -K.sum(eligibility) + # Calculate cross entropy error function + action_prob = K.sum(action * self.model.output, axis=1) + cross_entropy = K.log(action_prob) * discounted_rewards + loss = -K.sum(cross_entropy) + # create training function optimizer = 
Adam(lr=self.learning_rate) - updates = optimizer.get_updates(self.model.trainable_weights, [], loss) - train = K.function([self.model.input, action, discounted_rewards], [], updates=updates) + updates = optimizer.get_updates(self.model.trainable_weights, [], + loss) + train = K.function([self.model.input, action, discounted_rewards], [], + updates=updates) return train + # get action from policy network def get_action(self, state): - policy = self.model.predict(state, batch_size=1).flatten() + policy = self.model.predict(state)[0] return np.random.choice(self.action_size, 1, p=policy)[0] + # calculate discounted rewards def discount_rewards(self, rewards): discounted_rewards = np.zeros_like(rewards) running_add = 0 @@ -62,14 +71,16 @@ def discount_rewards(self, rewards): discounted_rewards[t] = running_add return discounted_rewards - def memory(self, state, action, reward): + # save states, actions and rewards for an episode + def append_sample(self, state, action, reward): self.states.append(state[0]) self.rewards.append(reward) act = np.zeros(self.action_size) act[action] = 1 self.actions.append(act) - def train_episodes(self): + # update policy neural network + def train_model(self): discounted_rewards = np.float32(self.discount_rewards(self.rewards)) discounted_rewards -= np.mean(discounted_rewards) discounted_rewards /= np.std(discounted_rewards) @@ -77,59 +88,42 @@ def train_episodes(self): self.optimizer([self.states, self.actions, discounted_rewards]) self.states, self.actions, self.rewards = [], [], [] - def load_model(self, name): - self.model.load_weights(name) - - def save_model(self, name): - self.model.save_weights(name) - if __name__ == "__main__": - # maze game - # env = Maze() env = Env() - agent = PGAgent() + agent = ReinforceAgent() global_step = 0 - # agent.load("same_vel_episode2 : 1000") scores, episodes = [], [] for e in range(EPISODES): done = False score = 0 + # fresh env state = env.reset() - state = np.reshape(state, [1, 22]) + state = np.reshape(state, [1, 15]) while not done: - # fresh env - if agent.render: - env.render() global_step += 1 - - # RL choose action based on observation and go one step + # get action for the current state and go one step in environment action = agent.get_action(state) next_state, reward, done = env.step(action) - next_state = np.reshape(next_state, [1, 22]) + next_state = np.reshape(next_state, [1, 15]) - agent.memory(state, action, reward) - # every time step we do train from the replay memory + agent.append_sample(state, action, reward) score += reward - # swap observation state = copy.deepcopy(next_state) if done: - agent.train_episodes() - + # update policy neural network for each episode + agent.train_model() scores.append(score) episodes.append(e) - pylab.plot(episodes, scores, 'b') - pylab.savefig("./save_graph/10by10.png") - print("episode:", e, " score:", score, " time_step:", global_step) + score = round(score, 2) + print("episode:", e, " score:", score, " time_step:", + global_step) if e % 100 == 0: - pass - agent.save_model("./save_model/10by10") - - # end of game - print('game over') - env.destroy() \ No newline at end of file + pylab.plot(episodes, scores, 'b') + pylab.savefig("./save_graph/reinforce.png") + agent.model.save_weights("./save_model/reinforce.h5") diff --git a/1-grid-world/7-reinforce/save_graph/reinforce_trained.png b/1-grid-world/7-reinforce/save_graph/reinforce_trained.png new file mode 100644 index 00000000..3be9edb7 Binary files /dev/null and 
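
As a quick, self-contained illustration of what train_model() above does with the stored rewards, here is the same discounted-return computation and standardization in plain numpy. This sketch is not part of the patch, and the reward sequence is made up for illustration only.

import numpy as np

def discount_rewards(rewards, gamma=0.99):
    # same backward recursion as ReinforceAgent.discount_rewards()
    discounted = np.zeros(len(rewards))
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        discounted[t] = running_add
    return discounted

rewards = [-0.1, -0.1, -1.0, -0.1, 1.0]        # hypothetical per-step rewards
returns = discount_rewards(rewards)
# standardize, as train_model() does, so the policy update is better conditioned
returns = (returns - np.mean(returns)) / np.std(returns)
print(returns)
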
b/1-grid-world/7-reinforce/save_graph/reinforce_trained.png differ diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN13.h5 b/1-grid-world/7-reinforce/save_model/reinforce_trained.h5 similarity index 64% rename from Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN13.h5 rename to 1-grid-world/7-reinforce/save_model/reinforce_trained.h5 index c63a4dc6..cb206f51 100644 Binary files a/Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN13.h5 and b/1-grid-world/7-reinforce/save_model/reinforce_trained.h5 differ diff --git a/Code 1. Grid World/README.md b/1-grid-world/README.md similarity index 100% rename from Code 1. Grid World/README.md rename to 1-grid-world/README.md diff --git a/Code 1. Grid World/gridworld.png b/1-grid-world/gridworld.png similarity index 100% rename from Code 1. Grid World/gridworld.png rename to 1-grid-world/gridworld.png diff --git a/Code 1. Grid World/gridworld_changing.png b/1-grid-world/gridworld_changing.png similarity index 100% rename from Code 1. Grid World/gridworld_changing.png rename to 1-grid-world/gridworld_changing.png diff --git a/Code 1. Grid World/resources/circle.png b/1-grid-world/img/circle.png similarity index 100% rename from Code 1. Grid World/resources/circle.png rename to 1-grid-world/img/circle.png diff --git a/Code 1. Grid World/resources/down.png b/1-grid-world/img/down.png similarity index 100% rename from Code 1. Grid World/resources/down.png rename to 1-grid-world/img/down.png diff --git a/Code 1. Grid World/resources/left.png b/1-grid-world/img/left.png similarity index 100% rename from Code 1. Grid World/resources/left.png rename to 1-grid-world/img/left.png diff --git a/Code 1. Grid World/resources/rectangle.png b/1-grid-world/img/rectangle.png similarity index 100% rename from Code 1. Grid World/resources/rectangle.png rename to 1-grid-world/img/rectangle.png diff --git a/Code 1. Grid World/resources/right.png b/1-grid-world/img/right.png similarity index 100% rename from Code 1. Grid World/resources/right.png rename to 1-grid-world/img/right.png diff --git a/Code 1. Grid World/resources/triangle.png b/1-grid-world/img/triangle.png similarity index 100% rename from Code 1. Grid World/resources/triangle.png rename to 1-grid-world/img/triangle.png diff --git a/Code 1. Grid World/resources/up.png b/1-grid-world/img/up.png similarity index 100% rename from Code 1. 
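
The get_state() methods of the grid-world environments above encode every reward object relative to the agent: its offset from the agent, a type flag, and, for obstacles, the current travel direction. A minimal sketch of that layout follows; it is not part of the patch, the encode_state() helper and the positions are hypothetical, and three obstacles plus one goal give the 15 values expected by state_size = 15.

# Sketch only: mirrors the layout produced by get_state(), with made-up positions.
def encode_state(agent_xy, rewards):
    ax, ay = agent_xy
    state = []
    for r in rewards:
        rx, ry = r['state']
        state.append(rx - ax)              # relative x of the reward object
        state.append(ry - ay)              # relative y of the reward object
        if r['reward'] < 0:
            state.append(-1)               # obstacle marker
            state.append(r['direction'])   # obstacle's current direction
        else:
            state.append(1)                # goal marker
    return state

rewards = [{'state': [0, 1], 'reward': -1, 'direction': -1},
           {'state': [1, 2], 'reward': -1, 'direction': -1},
           {'state': [2, 3], 'reward': -1, 'direction': -1},
           {'state': [4, 4], 'reward': 1}]
print(encode_state([0, 0], rewards))   # 3 obstacles * 4 + 1 goal * 3 = 15 values
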
Grid World/resources/up.png rename to 1-grid-world/img/up.png diff --git a/2-cartpole/1-dqn/SumTree.py b/2-cartpole/1-dqn/SumTree.py new file mode 100644 index 00000000..1b72e9ea --- /dev/null +++ b/2-cartpole/1-dqn/SumTree.py @@ -0,0 +1,55 @@ +import numpy + + +class SumTree: + write = 0 + + def __init__(self, capacity): + self.capacity = capacity + self.tree = numpy.zeros(2 * capacity - 1) + self.data = numpy.zeros(capacity, dtype=object) + + def _propagate(self, idx, change): + parent = (idx - 1) // 2 + + self.tree[parent] += change + + if parent != 0: + self._propagate(parent, change) + + def _retrieve(self, idx, s): + left = 2 * idx + 1 + right = left + 1 + + if left >= len(self.tree): + return idx + + if s <= self.tree[left]: + return self._retrieve(left, s) + else: + return self._retrieve(right, s - self.tree[left]) + + def total(self): + return self.tree[0] + + def add(self, p, data): + idx = self.write + self.capacity - 1 + + self.data[self.write] = data + self.update(idx, p) + + self.write += 1 + if self.write >= self.capacity: + self.write = 0 + + def update(self, idx, p): + change = p - self.tree[idx] + + self.tree[idx] = p + self._propagate(idx, change) + + def get(self, s): + idx = self._retrieve(0, s) + dataIdx = idx - self.capacity + 1 + + return (idx, self.tree[idx], self.data[dataIdx]) diff --git a/Code 2. Cartpole/1. DQN/Cartpole_DQN.py b/2-cartpole/1-dqn/cartpole_dqn.py similarity index 68% rename from Code 2. Cartpole/1. DQN/Cartpole_DQN.py rename to 2-cartpole/1-dqn/cartpole_dqn.py index 84802b76..8b2baaf0 100644 --- a/Code 2. Cartpole/1. DQN/Cartpole_DQN.py +++ b/2-cartpole/1-dqn/cartpole_dqn.py @@ -11,19 +11,20 @@ EPISODES = 300 -# this is DQN Agent for the Cartpole +# DQN Agent for the Cartpole # it uses Neural Network to approximate q function # and replay memory & target q network class DQNAgent: def __init__(self, state_size, action_size): # if you want to see Cartpole learning, then change to True self.render = False + self.load_model = False # get size of state and action self.state_size = state_size self.action_size = action_size - # these is hyper parameters for the DQN + # These are hyper parameters for the DQN self.discount_factor = 0.99 self.learning_rate = 0.001 self.epsilon = 1.0 @@ -37,17 +38,23 @@ def __init__(self, state_size, action_size): # create main model and target model self.model = self.build_model() self.target_model = self.build_model() - # copy the model to target model - # --> initialize the target model so that the parameters of model & target model to be same + + # initialize target model self.update_target_model() + if self.load_model: + self.model.load_weights("./save_model/cartpole_dqn.h5") + # approximate Q function using Neural Network # state is input and Q Value of each action is output of network def build_model(self): model = Sequential() - model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform')) - model.add(Dense(24, activation='relu', kernel_initializer='he_uniform')) - model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform')) + model.add(Dense(24, input_dim=self.state_size, activation='relu', + kernel_initializer='he_uniform')) + model.add(Dense(24, activation='relu', + kernel_initializer='he_uniform')) + model.add(Dense(self.action_size, activation='linear', + kernel_initializer='he_uniform')) model.summary() model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) return model @@ -65,50 +72,47 @@ def get_action(self, state): return 
np.argmax(q_value[0]) # save sample to the replay memory - def replay_memory(self, state, action, reward, next_state, done): + def append_sample(self, state, action, reward, next_state, done): self.memory.append((state, action, reward, next_state, done)) if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay # pick samples randomly from replay memory (with batch_size) - def train_replay(self): + def train_model(self): if len(self.memory) < self.train_start: return batch_size = min(self.batch_size, len(self.memory)) mini_batch = random.sample(self.memory, batch_size) update_input = np.zeros((batch_size, self.state_size)) - update_target = np.zeros((batch_size, self.action_size)) - - for i in range(batch_size): - state, action, reward, next_state, done = mini_batch[i] - target = self.model.predict(state)[0] - - # like Q Learning, get maximum Q value at s' - # But from target model - if done: - target[action] = reward + update_target = np.zeros((batch_size, self.state_size)) + action, reward, done = [], [], [] + + for i in range(self.batch_size): + update_input[i] = mini_batch[i][0] + action.append(mini_batch[i][1]) + reward.append(mini_batch[i][2]) + update_target[i] = mini_batch[i][3] + done.append(mini_batch[i][4]) + + target = self.model.predict(update_input) + target_val = self.target_model.predict(update_target) + + for i in range(self.batch_size): + # Q Learning: get maximum Q value at s' from target model + if done[i]: + target[i][action[i]] = reward[i] else: - target[action] = reward + self.discount_factor * \ - np.amax(self.target_model.predict(next_state)[0]) - update_input[i] = state - update_target[i] = target + target[i][action[i]] = reward[i] + self.discount_factor * ( + np.amax(target_val[i])) - # make minibatch which includes target q value and predicted q value # and do the model fit! 
- self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)
-
- # load the saved model
- def load_model(self, name):
- self.model.load_weights(name)
-
- # save the model which is under training
- def save_model(self, name):
- self.model.save_weights(name)
+ self.model.fit(update_input, target, batch_size=self.batch_size,
+ epochs=1, verbose=0)

if __name__ == "__main__":
- # in case of CartPole-v1, you can play until 500 time step
+ # In case of CartPole-v1, maximum length of episode is 500
env = gym.make('CartPole-v1')
# get size of state and action from environment
state_size = env.observation_space.shape[0]
@@ -123,7 +127,6 @@ def save_model(self, name):
score = 0
state = env.reset()
state = np.reshape(state, [1, state_size])
- # agent.load_model("./save_model/cartpole-master.h5")

while not done:
if agent.render:
@@ -137,14 +140,13 @@ def save_model(self, name):
reward = reward if not done or score == 499 else -100

# save the sample to the replay memory
- agent.replay_memory(state, action, reward, next_state, done)
+ agent.append_sample(state, action, reward, next_state, done)
# every time step do the training
- agent.train_replay()
+ agent.train_model()
score += reward
state = next_state

if done:
- env.reset()
# every episode update the target model to be same with model
agent.update_target_model()
@@ -153,9 +155,9 @@ def save_model(self, name):
scores.append(score)
episodes.append(e)
pylab.plot(episodes, scores, 'b')
- pylab.savefig("./save_graph/Cartpole_DQN14.png")
- print("episode:", e, " score:", score, " memory length:", len(agent.memory),
- " epsilon:", agent.epsilon)
+ pylab.savefig("./save_graph/cartpole_dqn.png")
+ print("episode:", e, " score:", score, " memory length:",
+ len(agent.memory), " epsilon:", agent.epsilon)

# if the mean of scores of last 10 episode is bigger than 490
# stop training
@@ -164,4 +166,4 @@ def save_model(self, name):
# save the model
if e % 50 == 0:
- agent.save_model("./save_model/Cartpole_DQN14.h5") \ No newline at end of file
+ agent.model.save_weights("./save_model/cartpole_dqn.h5")
diff --git a/2-cartpole/1-dqn/cartpole_only_per.py b/2-cartpole/1-dqn/cartpole_only_per.py
new file mode 100644
index 00000000..1a66d86b
--- /dev/null
+++ b/2-cartpole/1-dqn/cartpole_only_per.py
@@ -0,0 +1,224 @@
+import sys
+import gym
+import pylab
+import random
+import numpy as np
+from SumTree import SumTree
+from collections import deque
+from keras.layers import Dense
+from keras.optimizers import Adam
+from keras.models import Sequential
+
+EPISODES = 300
+
+
+# DQN agent for the CartPole example
+class DQNAgent:
+ def __init__(self, state_size, action_size):
+ self.render = False
+ self.load_model = False
+
+ # define the size of state and action
+ self.state_size = state_size
+ self.action_size = action_size
+
+ # DQN hyperparameters
+ self.discount_factor = 0.99
+ self.learning_rate = 0.001
+ self.epsilon = 1.0
+ self.epsilon_decay = 0.999
+ self.epsilon_min = 0.01
+ self.batch_size = 64
+ self.train_start = 2000
+ self.memory_size = 2000
+
+ # replay memory, maximum size 2000
+ self.memory = Memory(self.memory_size)
+
+ # create main model and target model
+ self.model = self.build_model()
+ self.target_model = self.build_model()
+
+ # initialize target model
+ self.update_target_model()
+
+ if self.load_model:
+ self.model.load_weights("./save_model/cartpole_dqn_trained.h5")
+
+ # neural network with the state as input and the Q-values as output
+ def build_model(self):
+ model = Sequential()
+ model.add(Dense(24, input_dim=self.state_size, activation='relu',
+ kernel_initializer='he_uniform'))
+ model.add(Dense(24, activation='relu',
+ kernel_initializer='he_uniform'))
+ model.add(Dense(self.action_size, activation='linear',
+ kernel_initializer='he_uniform'))
+ model.summary()
+ model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
+ return model
+
+ # update the target model with the weights of the main model
+ def update_target_model(self):
+ self.target_model.set_weights(self.model.get_weights())
+
+ # select an action with the epsilon-greedy policy
+ def get_action(self, state):
+ if np.random.rand() <= self.epsilon:
+ return random.randrange(self.action_size)
+ else:
+ q_value = self.model.predict(state)
+ return np.argmax(q_value[0])
+
+ # save the sample (s, a, r, s') to the replay memory
+ def append_sample(self, state, action, reward, next_state, done):
+ if self.epsilon == 1:
+ done = True
+
+ # compute the TD-error and store it in the memory together with the sample
+ target = self.model.predict([state])
+ old_val = target[0][action]
+ target_val = self.target_model.predict([next_state])
+ if done:
+ target[0][action] = reward
+ else:
+ target[0][action] = reward + self.discount_factor * (
+ np.amax(target_val[0]))
+ error = abs(old_val - target[0][action])
+
+ self.memory.add(error, (state, action, reward, next_state, done))
+
+ # train the model with batches sampled from the replay memory
+ def train_model(self):
+ if self.epsilon > self.epsilon_min:
+ self.epsilon *= self.epsilon_decay
+
+ # sample a batch from the memory according to priority
+ mini_batch = self.memory.sample(self.batch_size)
+
+ errors = np.zeros(self.batch_size)
+ states = np.zeros((self.batch_size, self.state_size))
+ next_states = np.zeros((self.batch_size, self.state_size))
+ actions, rewards, dones = [], [], []
+
+ for i in range(self.batch_size):
+ states[i] = mini_batch[i][1][0]
+ actions.append(mini_batch[i][1][1])
+ rewards.append(mini_batch[i][1][2])
+ next_states[i] = mini_batch[i][1][3]
+ dones.append(mini_batch[i][1][4])
+
+ # Q-values of the current states from the model
+ # Q-values of the next states from the target model
+ target = self.model.predict(states)
+ target_val = self.target_model.predict(next_states)
+
+ # update targets using the Bellman optimality equation
+ for i in range(self.batch_size):
+ old_val = target[i][actions[i]]
+ if dones[i]:
+ target[i][actions[i]] = rewards[i]
+ else:
+ target[i][actions[i]] = rewards[i] + self.discount_factor * (
+ np.amax(target_val[i]))
+ # store the TD-error
+ errors[i] = abs(old_val - target[i][actions[i]])
+
+ # update the priorities with the TD-errors
+ for i in range(self.batch_size):
+ idx = mini_batch[i][0]
+ self.memory.update(idx, errors[i])
+
+ self.model.fit(states, target, batch_size=self.batch_size,
+ epochs=1, verbose=0)
+
+
+class Memory: # stored as ( s, a, r, s_ ) in SumTree
+ e = 0.01
+ a = 0.6
+
+ def __init__(self, capacity):
+ self.tree = SumTree(capacity)
+
+ def _getPriority(self, error):
+ return (error + self.e) ** self.a
+
+ def add(self, error, sample):
+ p = self._getPriority(error)
+ self.tree.add(p, sample)
+
+ def sample(self, n):
+ batch = []
+ segment = self.tree.total() / n
+
+ for i in range(n):
+ a = segment * i
+ b = segment * (i + 1)
+
+ s = random.uniform(a, b)
+ (idx, p, data) = self.tree.get(s)
+ batch.append((idx, data))
+
+ return batch
+
+ def update(self, idx, error):
+ p = self._getPriority(error)
+ self.tree.update(idx, p)
+
+
+if __name__ == "__main__":
+ # CartPole-v1 environment, maximum number of timesteps is 500
+ env = gym.make('CartPole-v1')
+ state_size = env.observation_space.shape[0]
+ action_size = env.action_space.n
+
+ # create the DQN agent
+ agent = DQNAgent(state_size, action_size)
+
+ scores, episodes = [], []
+
+ step = 0
+ for e in range(EPISODES):
+ done = False
+ score = 0
+ # initialize the env
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ if agent.render:
+ env.render()
+ step += 1
+ # select an action for the current state
+ action = agent.get_action(state)
+ # take one timestep in the environment with the selected action
+ next_state, reward, done, info = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+ # -10 penalty if the episode ends prematurely
+ r = reward if not done or score+reward == 500 else -10
+ # save the sample to the replay memory
+ agent.append_sample(state, action, r, next_state, done)
+ # train every timestep
+ if step >= agent.train_start:
+ agent.train_model()
+
+ score += reward
+ state = next_state
+
+ if done:
+ # update the target model with the model's weights every episode
+ agent.update_target_model()
+
+# score = score if score == 500 else score + 100
+ # print the training result for each episode
+ scores.append(score)
+ episodes.append(e)
+ pylab.plot(episodes, scores, 'b')
+ pylab.savefig("./save_graph/cartpole_dqn.png")
+ print("episode:", e, " score:", score, " memory length:",
+ step if step <= agent.memory_size else agent.memory_size, " epsilon:", agent.epsilon)
+
+ # stop training if the average score of the last 10 episodes is above 490
+ if np.mean(scores[-min(10, len(scores)):]) > 490:
+ agent.model.save_weights("./save_model/cartpole_dqn.h5")
+ sys.exit()
diff --git a/Code 2. Cartpole/1. DQN/save_graph/Cartpole_DQN.png b/2-cartpole/1-dqn/save_graph/Cartpole_DQN.png
similarity index 100%
rename from Code 2. Cartpole/1. DQN/save_graph/Cartpole_DQN.png
rename to 2-cartpole/1-dqn/save_graph/Cartpole_DQN.png
diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN1.h5 b/2-cartpole/1-dqn/save_model/cartpole_dqn.h5
similarity index 100%
rename from Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN1.h5
rename to 2-cartpole/1-dqn/save_model/cartpole_dqn.h5
diff --git a/Code 2. Cartpole/2. Double DQN/Cartpole_DoubleDQN.py b/2-cartpole/2-double-dqn/cartpole_ddqn.py
similarity index 71%
rename from Code 2. Cartpole/2. Double DQN/Cartpole_DoubleDQN.py
rename to 2-cartpole/2-double-dqn/cartpole_ddqn.py
index b5feb608..73c51140 100644
--- a/Code 2. Cartpole/2.
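
In cartpole_only_per.py above, a transition's priority is (|TD error| + e) ** a, and minibatches are drawn by cutting the SumTree's total priority mass into equal segments. A short sketch of that sampling scheme follows; it is not part of the patch, it reuses the SumTree class added earlier (assuming it is importable as a module), and the TD errors are made up.

import random
from SumTree import SumTree   # the helper added in 2-cartpole/1-dqn/SumTree.py

e, a = 0.01, 0.6                        # same constants as Memory.e / Memory.a
tree = SumTree(4)

# hypothetical TD errors for four stored transitions
for td_error, transition in [(0.1, 't1'), (0.5, 't2'), (1.0, 't3'), (2.0, 't4')]:
    priority = (abs(td_error) + e) ** a
    tree.add(priority, transition)

# draw one sample per equal-probability segment, as Memory.sample() does
batch_size = 2
segment = tree.total() / batch_size
for i in range(batch_size):
    s = random.uniform(segment * i, segment * (i + 1))
    idx, priority, data = tree.get(s)
    print(idx, round(priority, 3), data)   # higher-error transitions are sampled more often
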
Double DQN/Cartpole_DoubleDQN.py +++ b/2-cartpole/2-double-dqn/cartpole_ddqn.py @@ -11,14 +11,14 @@ EPISODES = 300 -# this is Double DQN Agent for the Cartpole +# Double DQN Agent for the Cartpole # it uses Neural Network to approximate q function # and replay memory & target q network class DoubleDQNAgent: def __init__(self, state_size, action_size): # if you want to see Cartpole learning, then change to True self.render = False - + self.load_model = False # get size of state and action self.state_size = state_size self.action_size = action_size @@ -37,17 +37,23 @@ def __init__(self, state_size, action_size): # create main model and target model self.model = self.build_model() self.target_model = self.build_model() - # copy the model to target model - # --> initialize the target model so that the parameters of model & target model to be same + + # initialize target model self.update_target_model() + if self.load_model: + self.model.load_weights("./save_model/cartpole_ddqn.h5") + # approximate Q function using Neural Network # state is input and Q Value of each action is output of network def build_model(self): model = Sequential() - model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform')) - model.add(Dense(24, activation='relu', kernel_initializer='he_uniform')) - model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform')) + model.add(Dense(24, input_dim=self.state_size, activation='relu', + kernel_initializer='he_uniform')) + model.add(Dense(24, activation='relu', + kernel_initializer='he_uniform')) + model.add(Dense(self.action_size, activation='linear', + kernel_initializer='he_uniform')) model.summary() model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) return model @@ -65,54 +71,54 @@ def get_action(self, state): return np.argmax(q_value[0]) # save sample to the replay memory - def replay_memory(self, state, action, reward, next_state, done): + def append_sample(self, state, action, reward, next_state, done): self.memory.append((state, action, reward, next_state, done)) if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay # pick samples randomly from replay memory (with batch_size) - def train_replay(self): + def train_model(self): if len(self.memory) < self.train_start: return batch_size = min(self.batch_size, len(self.memory)) mini_batch = random.sample(self.memory, batch_size) update_input = np.zeros((batch_size, self.state_size)) - update_target = np.zeros((batch_size, self.action_size)) + update_target = np.zeros((batch_size, self.state_size)) + action, reward, done = [], [], [] for i in range(batch_size): - state, action, reward, next_state, done = mini_batch[i] - target = self.model.predict(state)[0] + update_input[i] = mini_batch[i][0] + action.append(mini_batch[i][1]) + reward.append(mini_batch[i][2]) + update_target[i] = mini_batch[i][3] + done.append(mini_batch[i][4]) + + target = self.model.predict(update_input) + target_next = self.model.predict(update_target) + target_val = self.target_model.predict(update_target) + for i in range(self.batch_size): # like Q Learning, get maximum Q value at s' # But from target model - if done: - target[action] = reward + if done[i]: + target[i][action[i]] = reward[i] else: # the key point of Double DQN # selection of action is from model # update is from target model - a = np.argmax(self.model.predict(next_state)[0]) - target[action] = reward + self.discount_factor * \ - (self.target_model.predict(next_state)[0][a]) - update_input[i] 
= state - update_target[i] = target + a = np.argmax(target_next[i]) + target[i][action[i]] = reward[i] + self.discount_factor * ( + target_val[i][a]) # make minibatch which includes target q value and predicted q value # and do the model fit! - self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0) - - # load the saved model - def load_model(self, name): - self.model.load_weights(name) - - # save the model which is under training - def save_model(self, name): - self.model.save_weights(name) + self.model.fit(update_input, target, batch_size=self.batch_size, + epochs=1, verbose=0) if __name__ == "__main__": - # in case of CartPole-v1, you can play until 500 time step + # In case of CartPole-v1, you can play until 500 time step env = gym.make('CartPole-v1') # get size of state and action from environment state_size = env.observation_space.shape[0] @@ -127,7 +133,6 @@ def save_model(self, name): score = 0 state = env.reset() state = np.reshape(state, [1, state_size]) - # agent.load_model("./save_model/cartpole-master.h5") while not done: if agent.render: @@ -141,14 +146,13 @@ def save_model(self, name): reward = reward if not done or score == 499 else -100 # save the sample to the replay memory - agent.replay_memory(state, action, reward, next_state, done) + agent.append_sample(state, action, reward, next_state, done) # every time step do the training - agent.train_replay() + agent.train_model() score += reward state = next_state if done: - env.reset() # every episode update the target model to be same with model agent.update_target_model() @@ -157,9 +161,9 @@ def save_model(self, name): scores.append(score) episodes.append(e) pylab.plot(episodes, scores, 'b') - pylab.savefig("./save_graph/Cartpole_DoubleDQN.png") - print("episode:", e, " score:", score, " memory length:", len(agent.memory), - " epsilon:", agent.epsilon) + pylab.savefig("./save_graph/cartpole_ddqn.png") + print("episode:", e, " score:", score, " memory length:", + len(agent.memory), " epsilon:", agent.epsilon) # if the mean of scores of last 10 episode is bigger than 490 # stop training @@ -168,4 +172,4 @@ def save_model(self, name): # save the model if e % 50 == 0: - agent.save_model("./save_model/Cartpole_DoubleDQN.h5") \ No newline at end of file + agent.model.save_weights("./save_model/cartpole_ddqn.h5") diff --git a/2-cartpole/2-double-dqn/save_graph/cartpole_ddqn.png b/2-cartpole/2-double-dqn/save_graph/cartpole_ddqn.png new file mode 100644 index 00000000..26c4fed0 Binary files /dev/null and b/2-cartpole/2-double-dqn/save_graph/cartpole_ddqn.png differ diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN14.h5 b/2-cartpole/2-double-dqn/save_model/cartpole_ddqn.h5 similarity index 73% rename from Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN14.h5 rename to 2-cartpole/2-double-dqn/save_model/cartpole_ddqn.h5 index d4d4bcd5..c54c9886 100644 Binary files a/Code 2. Cartpole/1. 
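
The Double DQN update above selects the bootstrap action with the online network but evaluates it with the target network. A small numpy illustration of how the two targets can differ, with invented Q-values (not part of the patch):

import numpy as np

gamma = 0.99
reward = 1.0
q_online_next = np.array([1.4, 0.9])    # hypothetical Q(s', .) from the online model
q_target_next = np.array([1.1, 1.2])    # hypothetical Q(s', .) from the target model

# vanilla DQN target: max over the target network's own estimates
dqn_target = reward + gamma * np.max(q_target_next)

# Double DQN target: argmax from the online network, value from the target network
a = np.argmax(q_online_next)
double_dqn_target = reward + gamma * q_target_next[a]

print(dqn_target, double_dqn_target)    # 2.188 vs. 2.089 with these numbers
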
DQN/save_model/Cartpole_DQN14.h5 and b/2-cartpole/2-double-dqn/save_model/cartpole_ddqn.h5 differ diff --git a/2-cartpole/3-reinforce/cartpole_reinforce.py b/2-cartpole/3-reinforce/cartpole_reinforce.py new file mode 100644 index 00000000..040234d1 --- /dev/null +++ b/2-cartpole/3-reinforce/cartpole_reinforce.py @@ -0,0 +1,146 @@ +import sys +import gym +import pylab +import numpy as np +from keras.layers import Dense +from keras.models import Sequential +from keras.optimizers import Adam + +EPISODES = 1000 + + +# This is Policy Gradient agent for the Cartpole +# In this example, we use REINFORCE algorithm which uses monte-carlo update rule +class REINFORCEAgent: + def __init__(self, state_size, action_size): + # if you want to see Cartpole learning, then change to True + self.render = False + self.load_model = False + # get size of state and action + self.state_size = state_size + self.action_size = action_size + + # These are hyper parameters for the Policy Gradient + self.discount_factor = 0.99 + self.learning_rate = 0.001 + self.hidden1, self.hidden2 = 24, 24 + + # create model for policy network + self.model = self.build_model() + + # lists for the states, actions and rewards + self.states, self.actions, self.rewards = [], [], [] + + if self.load_model: + self.model.load_weights("./save_model/cartpole_reinforce.h5") + + # approximate policy using Neural Network + # state is input and probability of each action is output of network + def build_model(self): + model = Sequential() + model.add(Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')) + model.add(Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform')) + model.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')) + model.summary() + # Using categorical crossentropy as a loss is a trick to easily + # implement the policy gradient. Categorical cross entropy is defined + # H(p, q) = sum(p_i * log(q_i)). For the action taken, a, you set + # p_a = advantage. q_a is the output of the policy network, which is + # the probability of taking the action a, i.e. policy(s, a). + # All other p_i are zero, thus we have H(p, q) = A * log(policy(s, a)) + model.compile(loss="categorical_crossentropy", optimizer=Adam(lr=self.learning_rate)) + return model + + # using the output of policy network, pick action stochastically + def get_action(self, state): + policy = self.model.predict(state, batch_size=1).flatten() + return np.random.choice(self.action_size, 1, p=policy)[0] + + # In Policy Gradient, Q function is not available. 
+ # Instead agent uses sample returns for evaluating policy + def discount_rewards(self, rewards): + discounted_rewards = np.zeros_like(rewards) + running_add = 0 + for t in reversed(range(0, len(rewards))): + running_add = running_add * self.discount_factor + rewards[t] + discounted_rewards[t] = running_add + return discounted_rewards + + # save of each step + def append_sample(self, state, action, reward): + self.states.append(state) + self.rewards.append(reward) + self.actions.append(action) + + # update policy network every episode + def train_model(self): + episode_length = len(self.states) + + discounted_rewards = self.discount_rewards(self.rewards) + discounted_rewards -= np.mean(discounted_rewards) + discounted_rewards /= np.std(discounted_rewards) + + update_inputs = np.zeros((episode_length, self.state_size)) + advantages = np.zeros((episode_length, self.action_size)) + + for i in range(episode_length): + update_inputs[i] = self.states[i] + advantages[i][self.actions[i]] = discounted_rewards[i] + + self.model.fit(update_inputs, advantages, epochs=1, verbose=0) + self.states, self.actions, self.rewards = [], [], [] + +if __name__ == "__main__": + # In case of CartPole-v1, you can play until 500 time step + env = gym.make('CartPole-v1') + # get size of state and action from environment + state_size = env.observation_space.shape[0] + action_size = env.action_space.n + + # make REINFORCE agent + agent = REINFORCEAgent(state_size, action_size) + + scores, episodes = [], [] + + for e in range(EPISODES): + done = False + score = 0 + state = env.reset() + state = np.reshape(state, [1, state_size]) + + while not done: + if agent.render: + env.render() + + # get action for the current state and go one step in environment + action = agent.get_action(state) + next_state, reward, done, info = env.step(action) + next_state = np.reshape(next_state, [1, state_size]) + reward = reward if not done or score == 499 else -100 + + # save the sample to the memory + agent.append_sample(state, action, reward) + + score += reward + state = next_state + + if done: + # every episode, agent learns from sample returns + agent.train_model() + + # every episode, plot the play time + score = score if score == 500 else score + 100 + scores.append(score) + episodes.append(e) + pylab.plot(episodes, scores, 'b') + pylab.savefig("./save_graph/cartpole_reinforce.png") + print("episode:", e, " score:", score) + + # if the mean of scores of last 10 episode is bigger than 490 + # stop training + if np.mean(scores[-min(10, len(scores)):]) > 490: + sys.exit() + + # save the model + if e % 50 == 0: + agent.model.save_weights("./save_model/cartpole_reinforce.h5") diff --git a/2-cartpole/3-reinforce/save_graph/cartpole_reinforce.png b/2-cartpole/3-reinforce/save_graph/cartpole_reinforce.png new file mode 100644 index 00000000..dce280f2 Binary files /dev/null and b/2-cartpole/3-reinforce/save_graph/cartpole_reinforce.png differ diff --git a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DQN14.h5 b/2-cartpole/3-reinforce/save_model/cartpole_reinforce.h5 similarity index 67% rename from Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DQN14.h5 rename to 2-cartpole/3-reinforce/save_model/cartpole_reinforce.h5 index 1fc158bf..18fb216b 100644 Binary files a/Code 2. Cartpole/2. 
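
The comment in build_model() of cartpole_reinforce.py explains that fitting with categorical crossentropy against a one-hot label scaled by the advantage reproduces the REINFORCE term -A * log(pi(a|s)). A quick numeric check of that equivalence, not part of the patch; the policy vector and advantage below are invented:

import numpy as np

policy = np.array([0.2, 0.5, 0.3])      # hypothetical softmax output pi(.|s)
action = 1                              # action actually taken
advantage = 2.0                         # hypothetical discounted return / advantage

# label used by train_model(): one-hot on the taken action, scaled by the advantage
label = np.zeros_like(policy)
label[action] = advantage

# categorical crossentropy H(p, q) = -sum(p_i * log(q_i)) with p = label
crossentropy = -np.sum(label * np.log(policy))

# REINFORCE objective term for this sample
reinforce_loss = -advantage * np.log(policy[action])

print(crossentropy, reinforce_loss)     # identical, so model.fit() performs the PG update
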
Double DQN/save_model/Cartpole_DQN14.h5 and b/2-cartpole/3-reinforce/save_model/cartpole_reinforce.h5 differ diff --git a/2-cartpole/4-actor-critic/cartpole_a2c.py b/2-cartpole/4-actor-critic/cartpole_a2c.py new file mode 100644 index 00000000..fa6310a3 --- /dev/null +++ b/2-cartpole/4-actor-critic/cartpole_a2c.py @@ -0,0 +1,135 @@ +import sys +import gym +import pylab +import numpy as np +from keras.layers import Dense +from keras.models import Sequential +from keras.optimizers import Adam + +EPISODES = 1000 + + +# A2C(Advantage Actor-Critic) agent for the Cartpole +class A2CAgent: + def __init__(self, state_size, action_size): + # if you want to see Cartpole learning, then change to True + self.render = False + self.load_model = False + # get size of state and action + self.state_size = state_size + self.action_size = action_size + self.value_size = 1 + + # These are hyper parameters for the Policy Gradient + self.discount_factor = 0.99 + self.actor_lr = 0.001 + self.critic_lr = 0.005 + + # create model for policy network + self.actor = self.build_actor() + self.critic = self.build_critic() + + if self.load_model: + self.actor.load_weights("./save_model/cartpole_actor.h5") + self.critic.load_weights("./save_model/cartpole_critic.h5") + + # approximate policy and value using Neural Network + # actor: state is input and probability of each action is output of model + def build_actor(self): + actor = Sequential() + actor.add(Dense(24, input_dim=self.state_size, activation='relu', + kernel_initializer='he_uniform')) + actor.add(Dense(self.action_size, activation='softmax', + kernel_initializer='he_uniform')) + actor.summary() + # See note regarding crossentropy in cartpole_reinforce.py + actor.compile(loss='categorical_crossentropy', + optimizer=Adam(lr=self.actor_lr)) + return actor + + # critic: state is input and value of state is output of model + def build_critic(self): + critic = Sequential() + critic.add(Dense(24, input_dim=self.state_size, activation='relu', + kernel_initializer='he_uniform')) + critic.add(Dense(self.value_size, activation='linear', + kernel_initializer='he_uniform')) + critic.summary() + critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr)) + return critic + + # using the output of policy network, pick action stochastically + def get_action(self, state): + policy = self.actor.predict(state, batch_size=1).flatten() + return np.random.choice(self.action_size, 1, p=policy)[0] + + # update policy network every episode + def train_model(self, state, action, reward, next_state, done): + target = np.zeros((1, self.value_size)) + advantages = np.zeros((1, self.action_size)) + + value = self.critic.predict(state)[0] + next_value = self.critic.predict(next_state)[0] + + if done: + advantages[0][action] = reward - value + target[0][0] = reward + else: + advantages[0][action] = reward + self.discount_factor * (next_value) - value + target[0][0] = reward + self.discount_factor * next_value + + self.actor.fit(state, advantages, epochs=1, verbose=0) + self.critic.fit(state, target, epochs=1, verbose=0) + + +if __name__ == "__main__": + # In case of CartPole-v1, maximum length of episode is 500 + env = gym.make('CartPole-v1') + # get size of state and action from environment + state_size = env.observation_space.shape[0] + action_size = env.action_space.n + + # make A2C agent + agent = A2CAgent(state_size, action_size) + + scores, episodes = [], [] + + for e in range(EPISODES): + done = False + score = 0 + state = env.reset() + state = np.reshape(state, [1, state_size]) + + 
while not done: + if agent.render: + env.render() + + action = agent.get_action(state) + next_state, reward, done, info = env.step(action) + next_state = np.reshape(next_state, [1, state_size]) + # if an action make the episode end, then gives penalty of -100 + reward = reward if not done or score == 499 else -100 + + agent.train_model(state, action, reward, next_state, done) + + score += reward + state = next_state + + if done: + # every episode, plot the play time + score = score if score == 500.0 else score + 100 + scores.append(score) + episodes.append(e) + pylab.plot(episodes, scores, 'b') + pylab.savefig("./save_graph/cartpole_a2c.png") + print("episode:", e, " score:", score) + + # if the mean of scores of last 10 episode is bigger than 490 + # stop training + if np.mean(scores[-min(10, len(scores)):]) > 490: + sys.exit() + + # save the model + if e % 50 == 0: + agent.actor.save_weights("./save_model/cartpole_actor.h5") + agent.critic.save_weights("./save_model/cartpole_critic.h5") diff --git a/2-cartpole/4-actor-critic/save_graph/cartpole_a2c.png b/2-cartpole/4-actor-critic/save_graph/cartpole_a2c.png new file mode 100644 index 00000000..aedc6c4c Binary files /dev/null and b/2-cartpole/4-actor-critic/save_graph/cartpole_a2c.png differ diff --git a/Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_PG.h5 b/2-cartpole/4-actor-critic/save_model/cartpole_actor.h5 similarity index 59% rename from Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_PG.h5 rename to 2-cartpole/4-actor-critic/save_model/cartpole_actor.h5 index 24f6b0cf..38b40bba 100644 Binary files a/Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_PG.h5 and b/2-cartpole/4-actor-critic/save_model/cartpole_actor.h5 differ diff --git a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_ActorCritic.h5 b/2-cartpole/4-actor-critic/save_model/cartpole_critic.h5 similarity index 56% rename from Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_ActorCritic.h5 rename to 2-cartpole/4-actor-critic/save_model/cartpole_critic.h5 index 1146b18e..4cea5ef1 100644 Binary files a/Code 2. Cartpole/5. 
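
train_model() in cartpole_a2c.py builds a one-step TD advantage for the actor and a bootstrapped value target for the critic. The same arithmetic written out with hypothetical numbers (a sketch, not part of the patch):

gamma = 0.99
reward = 1.0
value = 0.8          # hypothetical critic estimate V(s)
next_value = 1.1     # hypothetical critic estimate V(s')
done = False

if done:
    advantage = reward - value                        # terminal step: no bootstrap
    critic_target = reward
else:
    advantage = reward + gamma * next_value - value   # one-step TD advantage
    critic_target = reward + gamma * next_value

# the actor is fit toward a one-hot vector scaled by this advantage,
# the critic is fit toward critic_target with an MSE loss
print(round(advantage, 3), round(critic_target, 3))   # 1.289 and 2.089
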
Actor-Critic/save_model/Cartpole_ActorCritic.h5 and b/2-cartpole/4-actor-critic/save_model/cartpole_critic.h5 differ diff --git a/2-cartpole/5-a3c/cartpole_a3c.py b/2-cartpole/5-a3c/cartpole_a3c.py new file mode 100644 index 00000000..f2721849 --- /dev/null +++ b/2-cartpole/5-a3c/cartpole_a3c.py @@ -0,0 +1,223 @@ +import threading +import numpy as np +import tensorflow as tf +import pylab +import time +import gym +from keras.layers import Dense, Input +from keras.models import Model +from keras.optimizers import Adam +from keras import backend as K + + +# global variables for threading +episode = 0 +scores = [] + +EPISODES = 2000 + +# This is A3C(Asynchronous Advantage Actor Critic) agent(global) for the Cartpole +# In this example, we use A3C algorithm +class A3CAgent: + def __init__(self, state_size, action_size, env_name): + # get size of state and action + self.state_size = state_size + self.action_size = action_size + + # get gym environment name + self.env_name = env_name + + # these are hyper parameters for the A3C + self.actor_lr = 0.001 + self.critic_lr = 0.001 + self.discount_factor = .99 + self.hidden1, self.hidden2 = 24, 24 + self.threads = 8 + + # create model for actor and critic network + self.actor, self.critic = self.build_model() + + # method for training actor and critic network + self.optimizer = [self.actor_optimizer(), self.critic_optimizer()] + + self.sess = tf.InteractiveSession() + K.set_session(self.sess) + self.sess.run(tf.global_variables_initializer()) + + # approximate policy and value using Neural Network + # actor -> state is input and probability of each action is output of network + # critic -> state is input and value of state is output of network + # actor and critic network share first hidden layer + def build_model(self): + state = Input(batch_shape=(None, self.state_size)) + shared = Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')(state) + + actor_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform')(shared) + action_prob = Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')(actor_hidden) + + value_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='he_uniform')(shared) + state_value = Dense(1, activation='linear', kernel_initializer='he_uniform')(value_hidden) + + actor = Model(inputs=state, outputs=action_prob) + critic = Model(inputs=state, outputs=state_value) + + actor._make_predict_function() + critic._make_predict_function() + + actor.summary() + critic.summary() + + return actor, critic + + # make loss function for Policy Gradient + # [log(action probability) * advantages] will be input for the back prop + # we add entropy of action probability to loss + def actor_optimizer(self): + action = K.placeholder(shape=(None, self.action_size)) + advantages = K.placeholder(shape=(None, )) + + policy = self.actor.output + + good_prob = K.sum(action * policy, axis=1) + eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages) + loss = -K.sum(eligibility) + + entropy = K.sum(policy * K.log(policy + 1e-10), axis=1) + + actor_loss = loss + 0.01*entropy + + optimizer = Adam(lr=self.actor_lr) + updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss) + train = K.function([self.actor.input, action, advantages], [], updates=updates) + return train + + # make loss function for Value approximation + def critic_optimizer(self): + discounted_reward = K.placeholder(shape=(None, )) + + value = 
self.critic.output + + loss = K.mean(K.square(discounted_reward - value)) + + optimizer = Adam(lr=self.critic_lr) + updates = optimizer.get_updates(self.critic.trainable_weights, [], loss) + train = K.function([self.critic.input, discounted_reward], [], updates=updates) + return train + + # make agents(local) and start training + def train(self): + # self.load_model('./save_model/cartpole_a3c.h5') + agents = [Agent(i, self.actor, self.critic, self.optimizer, self.env_name, self.discount_factor, + self.action_size, self.state_size) for i in range(self.threads)] + + for agent in agents: + agent.start() + + while True: + time.sleep(20) + + plot = scores[:] + pylab.plot(range(len(plot)), plot, 'b') + pylab.savefig("./save_graph/cartpole_a3c.png") + + self.save_model('./save_model/cartpole_a3c.h5') + + def save_model(self, name): + self.actor.save_weights(name + "_actor.h5") + self.critic.save_weights(name + "_critic.h5") + + def load_model(self, name): + self.actor.load_weights(name + "_actor.h5") + self.critic.load_weights(name + "_critic.h5") + +# This is Agent(local) class for threading +class Agent(threading.Thread): + def __init__(self, index, actor, critic, optimizer, env_name, discount_factor, action_size, state_size): + threading.Thread.__init__(self) + + self.states = [] + self.rewards = [] + self.actions = [] + + self.index = index + self.actor = actor + self.critic = critic + self.optimizer = optimizer + self.env_name = env_name + self.discount_factor = discount_factor + self.action_size = action_size + self.state_size = state_size + + # Thread interactive with environment + def run(self): + global episode + env = gym.make(self.env_name) + while episode < EPISODES: + state = env.reset() + score = 0 + while True: + action = self.get_action(state) + next_state, reward, done, _ = env.step(action) + score += reward + + self.memory(state, action, reward) + + state = next_state + + if done: + episode += 1 + print("episode: ", episode, "/ score : ", score) + scores.append(score) + self.train_episode(score != 500) + break + + # In Policy Gradient, Q function is not available. 
+ # Instead agent uses sample returns for evaluating policy + def discount_rewards(self, rewards, done=True): + discounted_rewards = np.zeros_like(rewards) + running_add = 0 + if not done: + running_add = self.critic.predict(np.reshape(self.states[-1], (1, self.state_size)))[0] + for t in reversed(range(0, len(rewards))): + running_add = running_add * self.discount_factor + rewards[t] + discounted_rewards[t] = running_add + return discounted_rewards + + # save of each step + # this is used for calculating discounted rewards + def memory(self, state, action, reward): + self.states.append(state) + act = np.zeros(self.action_size) + act[action] = 1 + self.actions.append(act) + self.rewards.append(reward) + + # update policy network and value network every episode + def train_episode(self, done): + discounted_rewards = self.discount_rewards(self.rewards, done) + + values = self.critic.predict(np.array(self.states)) + values = np.reshape(values, len(values)) + + advantages = discounted_rewards - values + + self.optimizer[0]([self.states, self.actions, advantages]) + self.optimizer[1]([self.states, discounted_rewards]) + self.states, self.actions, self.rewards = [], [], [] + + def get_action(self, state): + policy = self.actor.predict(np.reshape(state, [1, self.state_size]))[0] + return np.random.choice(self.action_size, 1, p=policy)[0] + + +if __name__ == "__main__": + env_name = 'CartPole-v1' + env = gym.make(env_name) + + state_size = env.observation_space.shape[0] + action_size = env.action_space.n + + env.close() + + global_agent = A3CAgent(state_size, action_size, env_name) + global_agent.train() diff --git a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN2.h5 b/2-cartpole/5-a3c/save_model/Cartpole_A3C_actor.h5 similarity index 58% rename from Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN2.h5 rename to 2-cartpole/5-a3c/save_model/Cartpole_A3C_actor.h5 index c72bcd68..33ab03a5 100644 Binary files a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN2.h5 and b/2-cartpole/5-a3c/save_model/Cartpole_A3C_actor.h5 differ diff --git a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Critic.h5 b/2-cartpole/5-a3c/save_model/Cartpole_A3C_critic.h5 similarity index 57% rename from Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Critic.h5 rename to 2-cartpole/5-a3c/save_model/Cartpole_A3C_critic.h5 index 6ef1da98..5db01072 100644 Binary files a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Critic.h5 and b/2-cartpole/5-a3c/save_model/Cartpole_A3C_critic.h5 differ diff --git a/Code 2. Cartpole/LICENSE b/2-cartpole/LICENSE similarity index 100% rename from Code 2. Cartpole/LICENSE rename to 2-cartpole/LICENSE diff --git a/Code 2. Cartpole/README.md b/2-cartpole/README.md similarity index 65% rename from Code 2. Cartpole/README.md rename to 2-cartpole/README.md index 6882e016..1d8d8701 100644 --- a/Code 2. Cartpole/README.md +++ b/2-cartpole/README.md @@ -15,15 +15,10 @@ This is graph of Double DQN algorithm

-[image: Double DQN training graph]
-
-This is graph of Dueling DQN algorithm (This doesn't work at all...)
-
-[image: Dueling DQN training graph]
-
 This is graph of Policy Gradient algorithm
-[image: Policy Gradient training graph, old path]
+[image: Policy Gradient training graph, new path]
 
 This is graph of Actor Critic algorithm
-[image: Actor Critic training graph, old path]
\ No newline at end of file
+[image: Actor Critic training graph, new path]

\ No newline at end of file diff --git a/Code 2. Cartpole/cartpole.png b/2-cartpole/cartpole.png similarity index 100% rename from Code 2. Cartpole/cartpole.png rename to 2-cartpole/cartpole.png diff --git a/3-atari/1-breakout/breakout_a3c.py b/3-atari/1-breakout/breakout_a3c.py new file mode 100644 index 00000000..be339e8e --- /dev/null +++ b/3-atari/1-breakout/breakout_a3c.py @@ -0,0 +1,351 @@ +import gym +import time +import random +import threading +import numpy as np +import tensorflow as tf +from skimage.color import rgb2gray +from skimage.transform import resize +from keras.models import Model +from keras.optimizers import RMSprop +from keras.layers import Dense, Flatten, Input +from keras.layers.convolutional import Conv2D +from keras import backend as K + +# global variables for A3C +global episode +episode = 0 +EPISODES = 8000000 +# In case of BreakoutDeterministic-v3, always skip 4 frames +# Deterministic-v4 version use 4 actions +env_name = "BreakoutDeterministic-v4" + +# This is A3C(Asynchronous Advantage Actor Critic) agent(global) for the Cartpole +# In this example, we use A3C algorithm +class A3CAgent: + def __init__(self, action_size): + # environment settings + self.state_size = (84, 84, 4) + self.action_size = action_size + + self.discount_factor = 0.99 + self.no_op_steps = 30 + + # optimizer parameters + self.actor_lr = 2.5e-4 + self.critic_lr = 2.5e-4 + self.threads = 8 + + # create model for actor and critic network + self.actor, self.critic = self.build_model() + + # method for training actor and critic network + self.optimizer = [self.actor_optimizer(), self.critic_optimizer()] + + self.sess = tf.InteractiveSession() + K.set_session(self.sess) + self.sess.run(tf.global_variables_initializer()) + + self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary() + self.summary_writer = tf.summary.FileWriter('summary/breakout_a3c', self.sess.graph) + + def train(self): + # self.load_model("./save_model/breakout_a3c") + agents = [Agent(self.action_size, self.state_size, [self.actor, self.critic], self.sess, self.optimizer, + self.discount_factor, [self.summary_op, self.summary_placeholders, + self.update_ops, self.summary_writer]) for _ in range(self.threads)] + + for agent in agents: + time.sleep(1) + agent.start() + + while True: + time.sleep(60*10) + self.save_model("./save_model/breakout_a3c") + + # approximate policy and value using Neural Network + # actor -> state is input and probability of each action is output of network + # critic -> state is input and value of state is output of network + # actor and critic network share first hidden layer + def build_model(self): + input = Input(shape=self.state_size) + conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input) + conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv) + conv = Flatten()(conv) + fc = Dense(256, activation='relu')(conv) + policy = Dense(self.action_size, activation='softmax')(fc) + value = Dense(1, activation='linear')(fc) + + actor = Model(inputs=input, outputs=policy) + critic = Model(inputs=input, outputs=value) + + actor._make_predict_function() + critic._make_predict_function() + + actor.summary() + critic.summary() + + return actor, critic + + # make loss function for Policy Gradient + # [log(action probability) * advantages] will be input for the back prop + # we add entropy of action probability to loss + def actor_optimizer(self): + action = K.placeholder(shape=[None, self.action_size]) + advantages = K.placeholder(shape=[None, ]) + + policy = 
self.actor.output + + good_prob = K.sum(action * policy, axis=1) + eligibility = K.log(good_prob + 1e-10) * advantages + actor_loss = -K.sum(eligibility) + + entropy = K.sum(policy * K.log(policy + 1e-10), axis=1) + entropy = K.sum(entropy) + + loss = actor_loss + 0.01*entropy + optimizer = RMSprop(lr=self.actor_lr, rho=0.99, epsilon=0.01) + updates = optimizer.get_updates(self.actor.trainable_weights, [], loss) + train = K.function([self.actor.input, action, advantages], [loss], updates=updates) + + return train + + # make loss function for Value approximation + def critic_optimizer(self): + discounted_reward = K.placeholder(shape=(None, )) + + value = self.critic.output + + loss = K.mean(K.square(discounted_reward - value)) + + optimizer = RMSprop(lr=self.critic_lr, rho=0.99, epsilon=0.01) + updates = optimizer.get_updates(self.critic.trainable_weights, [], loss) + train = K.function([self.critic.input, discounted_reward], [loss], updates=updates) + return train + + def load_model(self, name): + self.actor.load_weights(name + "_actor.h5") + self.critic.load_weights(name + "_critic.h5") + + def save_model(self, name): + self.actor.save_weights(name + "_actor.h5") + self.critic.save_weights(name + '_critic.h5') + + # make summary operators for tensorboard + def setup_summary(self): + episode_total_reward = tf.Variable(0.) + episode_avg_max_q = tf.Variable(0.) + episode_duration = tf.Variable(0.) + + tf.summary.scalar('Total Reward/Episode', episode_total_reward) + tf.summary.scalar('Average Max Prob/Episode', episode_avg_max_q) + tf.summary.scalar('Duration/Episode', episode_duration) + + summary_vars = [episode_total_reward, episode_avg_max_q, episode_duration] + summary_placeholders = [tf.placeholder(tf.float32) for _ in range(len(summary_vars))] + update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))] + summary_op = tf.summary.merge_all() + return summary_placeholders, update_ops, summary_op + +# make agents(local) and start training +class Agent(threading.Thread): + def __init__(self, action_size, state_size, model, sess, optimizer, discount_factor, summary_ops): + threading.Thread.__init__(self) + + self.action_size = action_size + self.state_size = state_size + self.actor, self.critic = model + self.sess = sess + self.optimizer = optimizer + self.discount_factor = discount_factor + self.summary_op, self.summary_placeholders, self.update_ops, self.summary_writer = summary_ops + + self.states, self.actions, self.rewards = [],[],[] + + self.local_actor, self.local_critic = self.build_localmodel() + + self.avg_p_max = 0 + self.avg_loss = 0 + + # t_max -> max batch size for training + self.t_max = 20 + self.t = 0 + + # Thread interactive with environment + def run(self): + # self.load_model('./save_model/breakout_a3c') + global episode + + env = gym.make(env_name) + + step = 0 + + while episode < EPISODES: + done = False + dead = False + # 1 episode = 5 lives + score, start_life = 0, 5 + observe = env.reset() + next_observe = observe + + # this is one of DeepMind's idea. + # just do nothing at the start of episode to avoid sub-optimal + for _ in range(random.randint(1, 30)): + observe = next_observe + next_observe, _, _, _ = env.step(1) + + # At start of episode, there is no preceding frame. 
So just copy initial states to make history + state = pre_processing(next_observe, observe) + history = np.stack((state, state, state, state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + step += 1 + self.t += 1 + observe = next_observe + # get action for the current history and go one step in environment + action, policy = self.get_action(history) + # change action to real_action + if action == 0: real_action = 1 + elif action == 1: real_action = 2 + else: real_action = 3 + + if dead: + action = 0 + real_action = 1 + dead = False + + next_observe, reward, done, info = env.step(real_action) + # pre-process the observation --> history + next_state = pre_processing(next_observe, observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + self.avg_p_max += np.amax(self.actor.predict(np.float32(history / 255.))) + + # if the ball is fall, then the agent is dead --> episode is not over + if start_life > info['ale.lives']: + dead = True + start_life = info['ale.lives'] + + score += reward + reward = np.clip(reward, -1., 1.) + + # save the sample to the replay memory + self.memory(history, action, reward) + + # if agent is dead, then reset the history + if dead: + history = np.stack((next_state, next_state, next_state, next_state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + else: + history = next_history + + # + if self.t >= self.t_max or done: + self.train_model(done) + self.update_localmodel() + self.t = 0 + + # if done, plot the score over episodes + if done: + episode += 1 + print("episode:", episode, " score:", score, " step:", step) + + stats = [score, self.avg_p_max / float(step), + step] + for i in range(len(stats)): + self.sess.run(self.update_ops[i], feed_dict={ + self.summary_placeholders[i]: float(stats[i]) + }) + summary_str = self.sess.run(self.summary_op) + self.summary_writer.add_summary(summary_str, episode + 1) + self.avg_p_max = 0 + self.avg_loss = 0 + step = 0 + + # In Policy Gradient, Q function is not available. + # Instead agent uses sample returns for evaluating policy + def discount_rewards(self, rewards, done): + discounted_rewards = np.zeros_like(rewards) + running_add = 0 + if not done: + running_add = self.critic.predict(np.float32(self.states[-1] / 255.))[0] + for t in reversed(range(0, len(rewards))): + running_add = running_add * self.discount_factor + rewards[t] + discounted_rewards[t] = running_add + return discounted_rewards + + # update policy network and value network every episode + def train_model(self, done): + discounted_rewards = self.discount_rewards(self.rewards, done) + + states = np.zeros((len(self.states), 84, 84, 4)) + for i in range(len(self.states)): + states[i] = self.states[i] + + states = np.float32(states / 255.) 
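+        # What follows is the core A3C update: the critic estimates V(s) for every
+        # state of the rollout, the (bootstrapped) n-step returns R come from
+        # discount_rewards() above, and the advantage A = R - V(s) weights the
+        # policy-gradient step applied by optimizer[0], while optimizer[1]
+        # regresses the critic's value estimates toward R.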
+ + values = self.critic.predict(states) + values = np.reshape(values, len(values)) + + advantages = discounted_rewards - values + + self.optimizer[0]([states, self.actions, advantages]) + self.optimizer[1]([states, discounted_rewards]) + self.states, self.actions, self.rewards = [], [], [] + + def build_localmodel(self): + input = Input(shape=self.state_size) + conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input) + conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv) + conv = Flatten()(conv) + fc = Dense(256, activation='relu')(conv) + policy = Dense(self.action_size, activation='softmax')(fc) + value = Dense(1, activation='linear')(fc) + + actor = Model(inputs=input, outputs=policy) + critic = Model(inputs=input, outputs=value) + + actor._make_predict_function() + critic._make_predict_function() + + actor.set_weights(self.actor.get_weights()) + critic.set_weights(self.critic.get_weights()) + + actor.summary() + critic.summary() + + return actor, critic + + def update_localmodel(self): + self.local_actor.set_weights(self.actor.get_weights()) + self.local_critic.set_weights(self.critic.get_weights()) + + def get_action(self, history): + history = np.float32(history / 255.) + policy = self.local_actor.predict(history)[0] + action_index = np.random.choice(self.action_size, 1, p=policy)[0] + return action_index, policy + + # save of each step + # this is used for calculating discounted rewards + def memory(self, history, action, reward): + self.states.append(history) + act = np.zeros(self.action_size) + act[action] = 1 + self.actions.append(act) + self.rewards.append(reward) + + +# 210*160*3(color) --> 84*84(mono) +# float --> integer (to reduce the size of replay memory) +def pre_processing(next_observe, observe): + processed_observe = np.maximum(next_observe, observe) + processed_observe = np.uint8(resize(rgb2gray(processed_observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + global_agent = A3CAgent(action_size=3) + global_agent.train() diff --git a/3-atari/1-breakout/breakout_ddqn.py b/3-atari/1-breakout/breakout_ddqn.py new file mode 100644 index 00000000..f9f0a5ed --- /dev/null +++ b/3-atari/1-breakout/breakout_ddqn.py @@ -0,0 +1,274 @@ +import gym +import random +import numpy as np +import tensorflow as tf +from collections import deque +from skimage.color import rgb2gray +from skimage.transform import resize +from keras.models import Sequential +from keras.optimizers import RMSprop +from keras.layers import Dense, Flatten +from keras.layers.convolutional import Conv2D +from keras import backend as K + +EPISODES = 50000 + + +class DDQNAgent: + def __init__(self, action_size): + self.render = False + self.load_model = False + # environment settings + self.state_size = (84, 84, 4) + self.action_size = action_size + # parameters about epsilon + self.epsilon = 1. + self.epsilon_start, self.epsilon_end = 1.0, 0.1 + self.exploration_steps = 1000000. 
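+        # epsilon is annealed linearly from epsilon_start to epsilon_end: each
+        # training step (train_replay) subtracts (1.0 - 0.1) / 1e6 = 9e-7 from
+        # epsilon, so exploration decays to 0.1 over roughly the first million updates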
+ self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \ + / self.exploration_steps + # parameters about training + self.batch_size = 32 + self.train_start = 50000 + self.update_target_rate = 10000 + self.discount_factor = 0.99 + self.memory = deque(maxlen=400000) + self.no_op_steps = 30 + # build + self.model = self.build_model() + self.target_model = self.build_model() + self.update_target_model() + + self.optimizer = self.optimizer() + + self.sess = tf.InteractiveSession() + K.set_session(self.sess) + + self.avg_q_max, self.avg_loss = 0, 0 + self.summary_placeholders, self.update_ops, self.summary_op = \ + self.setup_summary() + self.summary_writer = tf.summary.FileWriter( + 'summary/breakout_ddqn', self.sess.graph) + self.sess.run(tf.global_variables_initializer()) + + if self.load_model: + self.model.load_weights("./save_model/breakout_ddqn.h5") + + # if the error is in [-1, 1], then the cost is quadratic to the error + # But outside the interval, the cost is linear to the error + def optimizer(self): + a = K.placeholder(shape=(None, ), dtype='int32') + y = K.placeholder(shape=(None, ), dtype='float32') + + py_x = self.model.output + + a_one_hot = K.one_hot(a, self.action_size) + q_value = K.sum(py_x * a_one_hot, axis=1) + error = K.abs(y - q_value) + + quadratic_part = K.clip(error, 0.0, 1.0) + linear_part = error - quadratic_part + loss = K.mean(0.5 * K.square(quadratic_part) + linear_part) + + optimizer = RMSprop(lr=0.00025, epsilon=0.01) + updates = optimizer.get_updates(self.model.trainable_weights, [], loss) + train = K.function([self.model.input, a, y], [loss], updates=updates) + + return train + + # approximate Q function using Convolution Neural Network + # state is input and Q Value of each action is output of network + def build_model(self): + model = Sequential() + model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', + input_shape=self.state_size)) + model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu')) + model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu')) + model.add(Flatten()) + model.add(Dense(512, activation='relu')) + model.add(Dense(self.action_size)) + model.summary() + + return model + + # after some time interval update the target model to be same with model + def update_target_model(self): + self.target_model.set_weights(self.model.get_weights()) + + # get action from model using epsilon-greedy policy + def get_action(self, history): + history = np.float32(history / 255.0) + if np.random.rand() <= self.epsilon: + return random.randrange(self.action_size) + else: + q_value = self.model.predict(history) + return np.argmax(q_value[0]) + + # save sample to the replay memory + def replay_memory(self, history, action, reward, next_history, dead): + self.memory.append((history, action, reward, next_history, dead)) + + # pick samples randomly from replay memory (with batch_size) + def train_replay(self): + if len(self.memory) < self.train_start: + return + if self.epsilon > self.epsilon_end: + self.epsilon -= self.epsilon_decay_step + + mini_batch = random.sample(self.memory, self.batch_size) + + history = np.zeros((self.batch_size, self.state_size[0], + self.state_size[1], self.state_size[2])) + next_history = np.zeros((self.batch_size, self.state_size[0], + self.state_size[1], self.state_size[2])) + target = np.zeros((self.batch_size, )) + action, reward, dead = [], [], [] + + for i in range(self.batch_size): + history[i] = np.float32(mini_batch[i][0] / 255.) + next_history[i] = np.float32(mini_batch[i][3] / 255.) 
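+            # each replay sample is a tuple (history, action, reward, next_history, dead),
+            # so indices 1, 2 and 4 below are the action, the clipped reward and the
+            # terminal ("dead") flag of that transition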
+ action.append(mini_batch[i][1]) + reward.append(mini_batch[i][2]) + dead.append(mini_batch[i][4]) + + value = self.model.predict(next_history) + target_value = self.target_model.predict(next_history) + + # like Q Learning, get maximum Q value at s' + # But from target model + for i in range(self.batch_size): + if dead[i]: + target[i] = reward[i] + else: + # the key point of Double DQN + # selection of action is from model + # update is from target model + target[i] = reward[i] + self.discount_factor * \ + target_value[i][np.argmax(value[i])] + + loss = self.optimizer([history, action, target]) + self.avg_loss += loss[0] + + # make summary operators for tensorboard + def setup_summary(self): + episode_total_reward = tf.Variable(0.) + episode_avg_max_q = tf.Variable(0.) + episode_duration = tf.Variable(0.) + episode_avg_loss = tf.Variable(0.) + + tf.summary.scalar('Total Reward/Episode', episode_total_reward) + tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q) + tf.summary.scalar('Duration/Episode', episode_duration) + tf.summary.scalar('Average Loss/Episode', episode_avg_loss) + + summary_vars = [episode_total_reward, episode_avg_max_q, + episode_duration, episode_avg_loss] + summary_placeholders = [tf.placeholder(tf.float32) for _ in + range(len(summary_vars))] + update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in + range(len(summary_vars))] + summary_op = tf.summary.merge_all() + return summary_placeholders, update_ops, summary_op + + +# 210*160*3(color) --> 84*84(mono) +# float --> integer (to reduce the size of replay memory) +def pre_processing(observe): + processed_observe = np.uint8( + resize(rgb2gray(observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + # In case of BreakoutDeterministic-v4, always skip 4 frames + # Deterministic-v4 version use 4 actions + env = gym.make('BreakoutDeterministic-v4') + agent = DDQNAgent(action_size=3) + + scores, episodes, global_step = [], [], 0 + + for e in range(EPISODES): + done = False + dead = False + # 1 episode = 5 lives + step, score, start_life = 0, 0, 5 + observe = env.reset() + + # this is one of DeepMind's idea. + # just do nothing at the start of episode to avoid sub-optimal + for _ in range(random.randint(1, agent.no_op_steps)): + observe, _, _, _ = env.step(1) + + # At start of episode, there is no preceding frame. + # So just copy initial states to make history + state = pre_processing(observe) + history = np.stack((state, state, state, state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + if agent.render: + env.render() + global_step += 1 + step += 1 + + # get action for the current history and go one step in environment + action = agent.get_action(history) + # change action to real_action + if action == 0: real_action = 1 + elif action == 1: real_action = 2 + else: real_action = 3 + + observe, reward, done, info = env.step(real_action) + # pre-process the observation --> history + next_state = pre_processing(observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + agent.avg_q_max += np.amax( + agent.model.predict(np.float32(history / 255.))[0]) + + # if the agent missed ball, agent is dead --> episode is not over + if start_life > info['ale.lives']: + dead = True + start_life = info['ale.lives'] + + reward = np.clip(reward, -1., 1.) 
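+            # clipping rewards to [-1, 1] is the standard DQN trick: it keeps the scale
+            # of the TD error, and therefore of the gradients, comparable across games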
+ + # save the sample to the replay memory + agent.replay_memory(history, action, reward, next_history, dead) + # every some time interval, train model + agent.train_replay() + # update the target model with model + if global_step % agent.update_target_rate == 0: + agent.update_target_model() + + score += reward + + # if agent is dead, then reset the history + if dead: + dead = False + else: + history = next_history + + # if done, plot the score over episodes + if done: + if global_step > agent.train_start: + stats = [score, agent.avg_q_max / float(step), step, + agent.avg_loss / float(step)] + for i in range(len(stats)): + agent.sess.run(agent.update_ops[i], feed_dict={ + agent.summary_placeholders[i]: float(stats[i]) + }) + summary_str = agent.sess.run(agent.summary_op) + agent.summary_writer.add_summary(summary_str, e + 1) + + print("episode:", e, " score:", score, " memory length:", + len(agent.memory), " epsilon:", agent.epsilon, + " global_step:", global_step, " average_q:", + agent.avg_q_max/float(step), " average loss:", + agent.avg_loss/float(step)) + + agent.avg_q_max, agent.avg_loss = 0, 0 + + if e % 1000 == 0: + agent.model.save_weights("./save_model/breakout_ddqn.h5") diff --git a/3-atari/1-breakout/breakout_dqn.py b/3-atari/1-breakout/breakout_dqn.py new file mode 100644 index 00000000..b6229a04 --- /dev/null +++ b/3-atari/1-breakout/breakout_dqn.py @@ -0,0 +1,275 @@ +import gym +import random +import numpy as np +import tensorflow as tf +from collections import deque +from skimage.color import rgb2gray +from skimage.transform import resize +from keras.models import Sequential +from keras.optimizers import RMSprop +from keras.layers import Dense, Flatten +from keras.layers.convolutional import Conv2D +from keras import backend as K + +EPISODES = 50000 + + +class DQNAgent: + def __init__(self, action_size): + self.render = False + self.load_model = False + # environment settings + self.state_size = (84, 84, 4) + self.action_size = action_size + # parameters about epsilon + self.epsilon = 1. + self.epsilon_start, self.epsilon_end = 1.0, 0.1 + self.exploration_steps = 1000000. 
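+        # note: every replay sample stores two uint8 histories of shape (1, 84, 84, 4)
+        # (~28 KB each), so a full 400,000-sample memory can easily take 10 GB or more
+        # of RAM; lower maxlen if that much memory is not available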
+ self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \ + / self.exploration_steps + # parameters about training + self.batch_size = 32 + self.train_start = 50000 + self.update_target_rate = 10000 + self.discount_factor = 0.99 + self.memory = deque(maxlen=400000) + self.no_op_steps = 30 + # build model + self.model = self.build_model() + self.target_model = self.build_model() + self.update_target_model() + + self.optimizer = self.optimizer() + + self.sess = tf.InteractiveSession() + K.set_session(self.sess) + + self.avg_q_max, self.avg_loss = 0, 0 + self.summary_placeholders, self.update_ops, self.summary_op = \ + self.setup_summary() + self.summary_writer = tf.summary.FileWriter( + 'summary/breakout_dqn', self.sess.graph) + self.sess.run(tf.global_variables_initializer()) + + if self.load_model: + self.model.load_weights("./save_model/breakout_dqn.h5") + + # if the error is in [-1, 1], then the cost is quadratic to the error + # But outside the interval, the cost is linear to the error + def optimizer(self): + a = K.placeholder(shape=(None,), dtype='int32') + y = K.placeholder(shape=(None,), dtype='float32') + + py_x = self.model.output + + a_one_hot = K.one_hot(a, self.action_size) + q_value = K.sum(py_x * a_one_hot, axis=1) + error = K.abs(y - q_value) + + quadratic_part = K.clip(error, 0.0, 1.0) + linear_part = error - quadratic_part + loss = K.mean(0.5 * K.square(quadratic_part) + linear_part) + + optimizer = RMSprop(lr=0.00025, epsilon=0.01) + updates = optimizer.get_updates(self.model.trainable_weights, [], loss) + train = K.function([self.model.input, a, y], [loss], updates=updates) + + return train + + # approximate Q function using Convolution Neural Network + # state is input and Q Value of each action is output of network + def build_model(self): + model = Sequential() + model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', + input_shape=self.state_size)) + model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu')) + model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu')) + model.add(Flatten()) + model.add(Dense(512, activation='relu')) + model.add(Dense(self.action_size)) + model.summary() + return model + + # after some time interval update the target model to be same with model + def update_target_model(self): + self.target_model.set_weights(self.model.get_weights()) + + # get action from model using epsilon-greedy policy + def get_action(self, history): + history = np.float32(history / 255.0) + if np.random.rand() <= self.epsilon: + return random.randrange(self.action_size) + else: + q_value = self.model.predict(history) + return np.argmax(q_value[0]) + + # save sample to the replay memory + def replay_memory(self, history, action, reward, next_history, dead): + self.memory.append((history, action, reward, next_history, dead)) + + # pick samples randomly from replay memory (with batch_size) + def train_replay(self): + if len(self.memory) < self.train_start: + return + if self.epsilon > self.epsilon_end: + self.epsilon -= self.epsilon_decay_step + + mini_batch = random.sample(self.memory, self.batch_size) + + history = np.zeros((self.batch_size, self.state_size[0], + self.state_size[1], self.state_size[2])) + next_history = np.zeros((self.batch_size, self.state_size[0], + self.state_size[1], self.state_size[2])) + target = np.zeros((self.batch_size,)) + action, reward, dead = [], [], [] + + for i in range(self.batch_size): + history[i] = np.float32(mini_batch[i][0] / 255.) + next_history[i] = np.float32(mini_batch[i][3] / 255.) 
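+            # below, the target is r + gamma * max_a Q_target(s', a); unlike
+            # breakout_ddqn.py, the target network both selects and evaluates the
+            # greedy next action, which is the overestimation issue Double DQN fixes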
+ action.append(mini_batch[i][1]) + reward.append(mini_batch[i][2]) + dead.append(mini_batch[i][4]) + + target_value = self.target_model.predict(next_history) + + # like Q Learning, get maximum Q value at s' + # But from target model + for i in range(self.batch_size): + if dead[i]: + target[i] = reward[i] + else: + target[i] = reward[i] + self.discount_factor * \ + np.amax(target_value[i]) + + loss = self.optimizer([history, action, target]) + self.avg_loss += loss[0] + + def save_model(self, name): + self.model.save_weights(name) + + # make summary operators for tensorboard + def setup_summary(self): + episode_total_reward = tf.Variable(0.) + episode_avg_max_q = tf.Variable(0.) + episode_duration = tf.Variable(0.) + episode_avg_loss = tf.Variable(0.) + + tf.summary.scalar('Total Reward/Episode', episode_total_reward) + tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q) + tf.summary.scalar('Duration/Episode', episode_duration) + tf.summary.scalar('Average Loss/Episode', episode_avg_loss) + + summary_vars = [episode_total_reward, episode_avg_max_q, + episode_duration, episode_avg_loss] + summary_placeholders = [tf.placeholder(tf.float32) for _ in + range(len(summary_vars))] + update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in + range(len(summary_vars))] + summary_op = tf.summary.merge_all() + return summary_placeholders, update_ops, summary_op + + +# 210*160*3(color) --> 84*84(mono) +# float --> integer (to reduce the size of replay memory) +def pre_processing(observe): + processed_observe = np.uint8( + resize(rgb2gray(observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + # In case of BreakoutDeterministic-v3, always skip 4 frames + # Deterministic-v4 version use 4 actions + env = gym.make('BreakoutDeterministic-v4') + agent = DQNAgent(action_size=3) + + scores, episodes, global_step = [], [], 0 + + for e in range(EPISODES): + done = False + dead = False + # 1 episode = 5 lives + step, score, start_life = 0, 0, 5 + observe = env.reset() + + # this is one of DeepMind's idea. + # just do nothing at the start of episode to avoid sub-optimal + for _ in range(random.randint(1, agent.no_op_steps)): + observe, _, _, _ = env.step(1) + + # At start of episode, there is no preceding frame + # So just copy initial states to make history + state = pre_processing(observe) + history = np.stack((state, state, state, state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + if agent.render: + env.render() + global_step += 1 + step += 1 + + # get action for the current history and go one step in environment + action = agent.get_action(history) + # change action to real_action + if action == 0: + real_action = 1 + elif action == 1: + real_action = 2 + else: + real_action = 3 + + observe, reward, done, info = env.step(real_action) + # pre-process the observation --> history + next_state = pre_processing(observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + agent.avg_q_max += np.amax( + agent.model.predict(np.float32(history / 255.))[0]) + + # if the agent missed ball, agent is dead --> episode is not over + if start_life > info['ale.lives']: + dead = True + start_life = info['ale.lives'] + + reward = np.clip(reward, -1., 1.) 
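+            # losing a life only sets dead=True: train_replay() then treats the stored
+            # transition as terminal (no bootstrapping across lives), although the gym
+            # episode itself continues until all five lives are used up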
+ + # save the sample to the replay memory + agent.replay_memory(history, action, reward, next_history, dead) + # every some time interval, train model + agent.train_replay() + # update the target model with model + if global_step % agent.update_target_rate == 0: + agent.update_target_model() + + score += reward + + # if agent is dead, then reset the history + if dead: + dead = False + else: + history = next_history + + # if done, plot the score over episodes + if done: + if global_step > agent.train_start: + stats = [score, agent.avg_q_max / float(step), step, + agent.avg_loss / float(step)] + for i in range(len(stats)): + agent.sess.run(agent.update_ops[i], feed_dict={ + agent.summary_placeholders[i]: float(stats[i]) + }) + summary_str = agent.sess.run(agent.summary_op) + agent.summary_writer.add_summary(summary_str, e + 1) + + print("episode:", e, " score:", score, " memory length:", + len(agent.memory), " epsilon:", agent.epsilon, + " global_step:", global_step, " average_q:", + agent.avg_q_max / float(step), " average loss:", + agent.avg_loss / float(step)) + + agent.avg_q_max, agent.avg_loss = 0, 0 + + if e % 1000 == 0: + agent.model.save_weights("./save_model/breakout_dqn.h5") diff --git a/3-atari/1-breakout/breakout_dueling_ddqn.py b/3-atari/1-breakout/breakout_dueling_ddqn.py new file mode 100644 index 00000000..496b1e05 --- /dev/null +++ b/3-atari/1-breakout/breakout_dueling_ddqn.py @@ -0,0 +1,286 @@ +import gym +import random +import numpy as np +import tensorflow as tf +from collections import deque +from skimage.color import rgb2gray +from skimage.transform import resize +from keras.models import Model +from keras.optimizers import RMSprop +from keras.layers import Input, Dense, Flatten, Lambda, merge +from keras.layers.convolutional import Conv2D +from keras import backend as K + +EPISODES = 50000 + + +class DuelingDDQNAgent: + def __init__(self, action_size): + self.render = False + self.load_model = False + # environment settings + self.state_size = (84, 84, 4) + self.action_size = action_size + # parameters about epsilon + self.epsilon = 1. + self.epsilon_start, self.epsilon_end = 1.0, 0.1 + self.exploration_steps = 1000000. 
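+        # the hyperparameters here match breakout_ddqn.py; the main difference is
+        # build_model() below, which splits the network into separate state-value and
+        # advantage streams that are recombined into the Q-values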
+ self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \ + / self.exploration_steps + # parameters about training + self.batch_size = 32 + self.train_start = 50000 + self.update_target_rate = 10000 + self.discount_factor = 0.99 + self.memory = deque(maxlen=400000) + self.no_op_steps = 30 + # build + self.model = self.build_model() + self.target_model = self.build_model() + self.update_target_model() + + self.optimizer = self.optimizer() + + self.sess = tf.InteractiveSession() + K.set_session(self.sess) + + self.avg_q_max, self.avg_loss = 0, 0 + self.summary_placeholders, self.update_ops, self.summary_op = \ + self.setup_summary() + self.summary_writer = tf.summary.FileWriter( + 'summary/breakout_dueling_ddqn', self.sess.graph) + self.sess.run(tf.global_variables_initializer()) + + if self.load_model: + self.model.load_weights("./save_model/breakout_dueling_ddqb.h5") + + # if the error is in [-1, 1], then the cost is quadratic to the error + # But outside the interval, the cost is linear to the error + def optimizer(self): + a = K.placeholder(shape=(None, ), dtype='int32') + y = K.placeholder(shape=(None, ), dtype='float32') + + py_x = self.model.output + + a_one_hot = K.one_hot(a, self.action_size) + q_value = K.sum(py_x * a_one_hot, axis=1) + error = K.abs(y - q_value) + + quadratic_part = K.clip(error, 0.0, 1.0) + linear_part = error - quadratic_part + loss = K.mean(0.5 * K.square(quadratic_part) + linear_part) + + optimizer = RMSprop(lr=0.00025, epsilon=0.01) + updates = optimizer.get_updates(self.model.trainable_weights, [], loss) + train = K.function([self.model.input, a, y], [loss], updates=updates) + + return train + + # approximate Q function using Convolution Neural Network + # state is input and Q Value of each action is output of network + # dueling network's Q Value is sum of advantages and state value + def build_model(self): + input = Input(shape=self.state_size) + shared = Conv2D(32, (8, 8), strides=(4, 4), activation='relu')(input) + shared = Conv2D(64, (4, 4), strides=(2, 2), activation='relu')(shared) + shared = Conv2D(64, (3, 3), strides=(1, 1), activation='relu')(shared) + flatten = Flatten()(shared) + + # network separate state value and advantages + advantage_fc = Dense(512, activation='relu')(flatten) + advantage = Dense(self.action_size)(advantage_fc) + advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), + output_shape=(self.action_size,))(advantage) + + value_fc = Dense(512, activation='relu')(flatten) + value = Dense(1)(value_fc) + value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), + output_shape=(self.action_size,))(value) + + # network merged and make Q Value + q_value = merge([value, advantage], mode='sum') + model = Model(inputs=input, outputs=q_value) + model.summary() + + return model + + # after some time interval update the target model to be same with model + def update_target_model(self): + self.target_model.set_weights(self.model.get_weights()) + + # get action from model using epsilon-greedy policy + def get_action(self, history): + history = np.float32(history / 255.0) + if np.random.rand() <= self.epsilon: + return random.randrange(self.action_size) + else: + q_value = self.model.predict(history) + return np.argmax(q_value[0]) + + # save sample to the replay memory + def replay_memory(self, history, action, reward, next_history, dead): + self.memory.append((history, action, reward, next_history, dead)) + + # pick samples randomly from replay memory (with batch_size) + def train_replay(self): + if len(self.memory) < 
self.train_start: + return + if self.epsilon > self.epsilon_end: + self.epsilon -= self.epsilon_decay_step + + mini_batch = random.sample(self.memory, self.batch_size) + + history = np.zeros((self.batch_size, self.state_size[0], + self.state_size[1], self.state_size[2])) + next_history = np.zeros((self.batch_size, self.state_size[0], + self.state_size[1], self.state_size[2])) + target = np.zeros((self.batch_size, )) + action, reward, dead = [], [], [] + + for i in range(self.batch_size): + history[i] = np.float32(mini_batch[i][0] / 255.) + next_history[i] = np.float32(mini_batch[i][3] / 255.) + action.append(mini_batch[i][1]) + reward.append(mini_batch[i][2]) + dead.append(mini_batch[i][4]) + + value = self.model.predict(history) + target_value = self.target_model.predict(next_history) + + # like Q Learning, get maximum Q value at s' + # But from target model + for i in range(self.batch_size): + if dead[i]: + target[i] = reward[i] + else: + # the key point of Double DQN + # selection of action is from model + # update is from target model + target[i] = reward[i] + self.discount_factor * \ + target_value[i][np.argmax(value[i])] + + loss = self.optimizer([history, action, target]) + self.avg_loss += loss[0] + + def setup_summary(self): + episode_total_reward = tf.Variable(0.) + episode_avg_max_q = tf.Variable(0.) + episode_duration = tf.Variable(0.) + episode_avg_loss = tf.Variable(0.) + + tf.summary.scalar('Total Reward/Episode', episode_total_reward) + tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q) + tf.summary.scalar('Duration/Episode', episode_duration) + tf.summary.scalar('Average Loss/Episode', episode_avg_loss) + + summary_vars = [episode_total_reward, episode_avg_max_q, + episode_duration, episode_avg_loss] + summary_placeholders = [tf.placeholder(tf.float32) for _ in + range(len(summary_vars))] + update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in + range(len(summary_vars))] + summary_op = tf.summary.merge_all() + return summary_placeholders, update_ops, summary_op + + +# 210*160*3(color) --> 84*84(mono) +# float --> integer (to reduce the size of replay memory) +def pre_processing(observe): + processed_observe = np.uint8( + resize(rgb2gray(observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + # In case of BreakoutDeterministic-v3, always skip 4 frames + # Deterministic-v4 version use 4 actions + env = gym.make('BreakoutDeterministic-v4') + agent = DuelingDDQNAgent(action_size=3) + + scores, episodes, global_step = [], [], 0 + + for e in range(EPISODES): + done = False + dead = False + # 1 episode = 5 lives + step, score, start_life = 0, 0, 5 + observe = env.reset() + + # this is one of DeepMind's idea. + # just do nothing at the start of episode to avoid sub-optimal + for _ in range(random.randint(1, agent.no_op_steps)): + observe, _, _, _ = env.step(1) + + # At start of episode, there is no preceding frame. 
+ # So just copy initial states to make history + state = pre_processing(observe) + history = np.stack((state, state, state, state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + if agent.render: + env.render() + global_step += 1 + step += 1 + + # get action for the current history and go one step in environment + action = agent.get_action(history) + # change action to real_action + if action == 0: real_action = 1 + elif action == 1: real_action = 2 + else: real_action = 3 + + observe, reward, done, info = env.step(real_action) + # pre-process the observation --> history + next_state = pre_processing(observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + agent.avg_q_max += np.amax( + agent.model.predict(np.float32(history / 255.))[0]) + + # if the agent missed ball, agent is dead --> episode is not over + if start_life > info['ale.lives']: + dead = True + start_life = info['ale.lives'] + + reward = np.clip(reward, -1., 1.) + + # save the sample to the replay memory + agent.replay_memory(history, action, reward, next_history, dead) + # every some time interval, train model + agent.train_replay() + # update the target model with model + if global_step % agent.update_target_rate == 0: + agent.update_target_model() + + score += reward + + # if agent is dead, then reset the history + if dead: + dead = False + else: + history = next_history + + # if done, plot the score over episodes + if done: + if global_step > agent.train_start: + stats = [score, agent.avg_q_max / float(step), step, + agent.avg_loss / float(step)] + for i in range(len(stats)): + agent.sess.run(agent.update_ops[i], feed_dict={ + agent.summary_placeholders[i]: float(stats[i]) + }) + summary_str = agent.sess.run(agent.summary_op) + agent.summary_writer.add_summary(summary_str, e + 1) + + print("episode:", e, " score:", score, " memory length:", + len(agent.memory), " epsilon:", agent.epsilon, + " global_step:", global_step, " average_q:", + agent.avg_q_max/float(step), " average loss:", + agent.avg_loss/float(step)) + + agent.avg_q_max, agent.avg_loss = 0, 0 + + if e % 1000 == 0: + agent.model.save_weights("./save_model/breakout_dueling_ddqn.h5") diff --git a/3-atari/1-breakout/play_a3c_model.py b/3-atari/1-breakout/play_a3c_model.py new file mode 100644 index 00000000..c6a32c83 --- /dev/null +++ b/3-atari/1-breakout/play_a3c_model.py @@ -0,0 +1,125 @@ +import gym +import random +import numpy as np +from skimage.color import rgb2gray +from skimage.transform import resize +from keras.models import Model +from keras.layers import Dense, Flatten, Input +from keras.layers.convolutional import Conv2D + +global episode +episode = 0 +EPISODES = 8000000 +env_name = "BreakoutDeterministic-v4" + +class TestAgent: + def __init__(self, action_size): + self.state_size = (84, 84, 4) + self.action_size = action_size + + self.discount_factor = 0.99 + self.no_op_steps = 30 + + self.actor, self.critic = self.build_model() + + def build_model(self): + input = Input(shape=self.state_size) + conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input) + conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv) + conv = Flatten()(conv) + fc = Dense(256, activation='relu')(conv) + policy = Dense(self.action_size, activation='softmax')(fc) + value = Dense(1, activation='linear')(fc) + + actor = Model(inputs=input, outputs=policy) + critic = Model(inputs=input, outputs=value) + + actor.summary() + critic.summary() + + 
return actor, critic + + def get_action(self, history): + history = np.float32(history / 255.) + policy = self.actor.predict(history)[0] + + action_index = np.argmax(policy) + return action_index + + def load_model(self, name): + self.actor.load_weights(name) + +def pre_processing(next_observe, observe): + processed_observe = np.maximum(next_observe, observe) + processed_observe = np.uint8( + resize(rgb2gray(processed_observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + env = gym.make(env_name) + agent = TestAgent(action_size=3) + agent.load_model("save_model/breakout_a3c_5_actor.h5") + + step = 0 + + while episode < EPISODES: + done = False + dead = False + + score, start_life = 0, 5 + observe = env.reset() + next_observe = observe + + for _ in range(random.randint(1, 20)): + observe = next_observe + next_observe, _, _, _ = env.step(1) + + state = pre_processing(next_observe, observe) + history = np.stack((state, state, state, state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + env.render() + step += 1 + observe = next_observe + + action = agent.get_action(history) + + if action == 1: + fake_action = 2 + elif action == 2: + fake_action = 3 + else: + fake_action = 1 + + if dead: + fake_action = 1 + dead = False + + next_observe, reward, done, info = env.step(fake_action) + + next_state = pre_processing(next_observe, observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + if start_life > info['ale.lives']: + dead = True + reward = -1 + start_life = info['ale.lives'] + + score += reward + + # if agent is dead, then reset the history + if dead: + history = np.stack( + (next_state, next_state, next_state, next_state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + else: + history = next_history + + # if done, plot the score over episodes + if done: + episode += 1 + print("episode:", episode, " score:", score, " step:", step) + step = 0 \ No newline at end of file diff --git a/3-atari/1-breakout/play_dqn_model.py b/3-atari/1-breakout/play_dqn_model.py new file mode 100644 index 00000000..45662c78 --- /dev/null +++ b/3-atari/1-breakout/play_dqn_model.py @@ -0,0 +1,110 @@ +import gym +import random +import numpy as np +import tensorflow as tf +from skimage.color import rgb2gray +from skimage.transform import resize +from keras.models import Sequential +from keras.layers import Dense, Flatten +from keras.layers.convolutional import Conv2D +from keras import backend as K + +EPISODES = 50000 + + +class TestAgent: + def __init__(self, action_size): + self.state_size = (84, 84, 4) + self.action_size = action_size + self.no_op_steps = 20 + + self.model = self.build_model() + + self.sess = tf.InteractiveSession() + K.set_session(self.sess) + + self.avg_q_max, self.avg_loss = 0, 0 + self.sess.run(tf.global_variables_initializer()) + + def build_model(self): + model = Sequential() + model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', + input_shape=self.state_size)) + model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu')) + model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu')) + model.add(Flatten()) + model.add(Dense(512, activation='relu')) + model.add(Dense(self.action_size)) + model.summary() + + return model + + def get_action(self, history): + if np.random.random() < 0.01: + return random.randrange(3) + history = np.float32(history / 255.0) + q_value = self.model.predict(history) + return 
np.argmax(q_value[0]) + + def load_model(self, filename): + self.model.load_weights(filename) + +def pre_processing(observe): + processed_observe = np.uint8( + resize(rgb2gray(observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + env = gym.make('BreakoutDeterministic-v4') + agent = TestAgent(action_size=3) + agent.load_model("./save_model/breakout_dqn_5.h5") + + for e in range(EPISODES): + done = False + dead = False + + step, score, start_life = 0, 0, 5 + observe = env.reset() + + for _ in range(random.randint(1, agent.no_op_steps)): + observe, _, _, _ = env.step(1) + + state = pre_processing(observe) + history = np.stack((state, state, state, state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + env.render() + step += 1 + + action = agent.get_action(history) + + if action == 0: + real_action = 1 + elif action == 1: + real_action = 2 + else: + real_action = 3 + + if dead: + real_action = 1 + dead = False + + observe, reward, done, info = env.step(real_action) + + next_state = pre_processing(observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + if start_life > info['ale.lives']: + dead = True + start_life = info['ale.lives'] + + score += reward + + history = next_history + + if done: + print("episode:", e, " score:", score) + diff --git a/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5 new file mode 100644 index 00000000..37a6a1ac Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5 new file mode 100644 index 00000000..3d3394ae Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5 new file mode 100644 index 00000000..21207c0f Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5 new file mode 100644 index 00000000..a26f7d8a Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5 new file mode 100644 index 00000000..a27e766e Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5 new file mode 100644 index 00000000..62236fc7 Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5 new file mode 100644 index 00000000..4fc3b773 Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5 new file mode 100644 index 00000000..f65494da Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5 differ diff --git 
a/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5 new file mode 100644 index 00000000..db855b24 Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5 new file mode 100644 index 00000000..3636d02d Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn.h5 b/3-atari/1-breakout/save_model/breakout_dqn.h5 new file mode 100644 index 00000000..fec05377 Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_dqn.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_1.h5 b/3-atari/1-breakout/save_model/breakout_dqn_1.h5 new file mode 100644 index 00000000..bb219b8a Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_dqn_1.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_2.h5 b/3-atari/1-breakout/save_model/breakout_dqn_2.h5 new file mode 100644 index 00000000..f316b4bc Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_dqn_2.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_3.h5 b/3-atari/1-breakout/save_model/breakout_dqn_3.h5 new file mode 100644 index 00000000..3e9ab26d Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_dqn_3.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_4.h5 b/3-atari/1-breakout/save_model/breakout_dqn_4.h5 new file mode 100644 index 00000000..2c952d42 Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_dqn_4.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_5.h5 b/3-atari/1-breakout/save_model/breakout_dqn_5.h5 new file mode 100644 index 00000000..eae4c99b Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_dqn_5.h5 differ diff --git a/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638 b/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638 new file mode 100644 index 00000000..1eb4343a Binary files /dev/null and b/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638 differ diff --git a/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name b/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name new file mode 100644 index 00000000..2e394adf Binary files /dev/null and b/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name differ diff --git a/Code 3. Atari Game/2. Pong/README.md b/3-atari/2-pong/README.md similarity index 100% rename from Code 3. Atari Game/2. Pong/README.md rename to 3-atari/2-pong/README.md diff --git a/Code 3. Atari Game/2. Pong/assets/pg.gif b/3-atari/2-pong/assets/pg.gif similarity index 100% rename from Code 3. Atari Game/2. Pong/assets/pg.gif rename to 3-atari/2-pong/assets/pg.gif diff --git a/Code 3. Atari Game/2. Pong/assets/score.png b/3-atari/2-pong/assets/score.png similarity index 100% rename from Code 3. Atari Game/2. Pong/assets/score.png rename to 3-atari/2-pong/assets/score.png diff --git a/Code 3. Atari Game/3. A3C/Breakout_A3C.py b/3-atari/2-pong/pong_a3c.py similarity index 100% rename from Code 3. Atari Game/3. A3C/Breakout_A3C.py rename to 3-atari/2-pong/pong_a3c.py diff --git a/Code 3. Atari Game/2. 
Pong/pg.py b/3-atari/2-pong/pong_reinforce.py
similarity index 95%
rename from Code 3. Atari Game/2. Pong/pg.py
rename to 3-atari/2-pong/pong_reinforce.py
index b6977145..ce346a78 100644
--- a/Code 3. Atari Game/2. Pong/pg.py
+++ b/3-atari/2-pong/pong_reinforce.py
@@ -29,6 +29,7 @@ def _build_model(self):
         model.add(Dense(32, activation='relu', init='he_uniform'))
         model.add(Dense(self.action_size, activation='softmax'))
         opt = Adam(lr=self.learning_rate)
+        # See note regarding crossentropy in cartpole_reinforce.py
         model.compile(loss='categorical_crossentropy', optimizer=opt)
         return model
@@ -92,7 +93,7 @@ def preprocess(I):
 state_size = 80 * 80
 action_size = env.action_space.n
 agent = PGAgent(state_size, action_size)
-agent.load('pong.h5')
+agent.load('./save_model/pong_reinforce.h5')
 while True:
     env.render()
@@ -113,4 +114,4 @@ def preprocess(I):
         state = env.reset()
         prev_x = None
         if episode > 1 and episode % 10 == 0:
-            agent.save('pong.h5')
+            agent.save('./save_model/pong_reinforce.h5')
diff --git a/Code 3. Atari Game/2. Pong/pong.h5 b/3-atari/2-pong/save_model/pong_reinforce.h5
similarity index 100%
rename from Code 3. Atari Game/2. Pong/pong.h5
rename to 3-atari/2-pong/save_model/pong_reinforce.h5
diff --git a/Code 3. Atari Game/LICENSE b/3-atari/LICENSE
similarity index 100%
rename from Code 3. Atari Game/LICENSE
rename to 3-atari/LICENSE
diff --git a/4-gym/1-mountaincar/mountaincar_dqn.py b/4-gym/1-mountaincar/mountaincar_dqn.py
new file mode 100644
index 00000000..932ba6b0
--- /dev/null
+++ b/4-gym/1-mountaincar/mountaincar_dqn.py
@@ -0,0 +1,175 @@
+import gym
+import pylab
+import random
+import numpy as np
+from collections import deque
+from keras.layers import Dense
+from keras.optimizers import Adam
+from keras.models import Sequential
+
+EPISODES = 4000
+
+
+class DQNAgent:
+    def __init__(self, state_size, action_size):
+        # change this to "True" to watch the agent learn
+        self.render = True
+
+        # the sizes of the state and action spaces, used to build the model
+        self.state_size = state_size
+        self.action_size = action_size
+
+        # hyperparameters of DQN training
+        # the replay memory is created with a deque
+        self.discount_factor = 0.99
+        self.learning_rate = 0.001
+        self.epsilon = 1.0
+        self.epsilon_min = 0.005
+        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 50000
+        self.batch_size = 64
+        self.train_start = 1000
+        self.memory = deque(maxlen=10000)
+
+        # create the model to be trained and the target model
+        self.model = self.build_model()
+        self.target_model = self.build_model()
+        # copy the training model into the target model
+        # --> initializes the target model (both must start with the same weights)
+        self.update_target_model()
+
+    # approximate the Q function with a deep neural network:
+    # the state is the input and the Q value of each action is the output
+    def build_model(self):
+        model = Sequential()
+        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
+        model.add(Dense(16, activation='relu', kernel_initializer='he_uniform'))
+        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
+        model.summary()
+        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
+        return model
+
+    # at regular intervals, update the target model with the model currently being trained
+    def update_target_model(self):
+        self.target_model.set_weights(self.model.get_weights())
+
+    # actions are chosen with an epsilon-greedy policy over the current network
+    def get_action(self, state):
+        if np.random.rand() <= self.epsilon:
+            return random.randrange(self.action_size)
+        else:
+            q_value = self.model.predict(state)
+            return np.argmax(q_value[0])
+
+    # save <state, action, reward, next_state, done> to the replay memory
+    def replay_memory(self, state, action, reward, next_state, done):
+        # the network only has two outputs, so env action 2 (push right) is stored as index 1
+        if action == 2:
+            action = 1
+        self.memory.append((state, action, reward, next_state, done))
+        if self.epsilon > self.epsilon_min:
+            self.epsilon -= self.epsilon_decay
+        # print(len(self.memory))
+
+    # draw a random minibatch of batch_size samples from the replay memory and train on it
+    def train_replay(self):
+        if len(self.memory) < self.train_start:
+            return
+        batch_size = min(self.batch_size, len(self.memory))
+        mini_batch = random.sample(self.memory, batch_size)
+
+        update_input = np.zeros((batch_size, self.state_size))
+        update_target = np.zeros((batch_size, self.action_size))
+
+        for i in range(batch_size):
+            state, action, reward, next_state, done = mini_batch[i]
+            target = self.model.predict(state)[0]
+
+            # as in Q-learning, take the maximum Q value at s', but from the target model
+            if done:
+                target[action] = reward
+            else:
+                target[action] = reward + self.discount_factor * \
+                                          np.amax(self.target_model.predict(next_state)[0])
+            update_input[i] = state
+            update_target[i] = target
+
+        # build one minibatch of targets (the "answers") and current predictions, then update the model in a single call
+        self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)
+
+    # load a saved model
+    def load_model(self, name):
+        self.model.load_weights(name)
+
+    # save the trained model
+    def save_model(self, name):
+        self.model.save_weights(name)
+
+
+if __name__ == "__main__":
+    # (in the case of CartPole-v1, an episode can run for up to 500 time steps)
+    env = gym.make('MountainCar-v0')
+    # get the sizes of the state and action spaces from the environment
+    state_size = env.observation_space.shape[0]
+    #action_size = env.action_space.n
+    action_size = 2
+    # create the DQN agent
+    agent = DQNAgent(state_size, action_size)
+    agent.load_model("./save_model/MountainCar_DQN.h5")
+    scores, episodes = [], []
+
+    for e in range(EPISODES):
+        done = False
+        score = 0
+        state = env.reset()
+        state = np.reshape(state, [1, state_size])
+        print(state)
+
+        # env actions are 0 (left), 1 (do nothing), 2 (right);
+        # fake_action is declared so that the do-nothing action is never taken
+        fake_action = 0
+
+        # counter used to repeat the same action 4 times
+        action_count = 0
+
+        while not done:
+            if agent.render:
+                env.render()
+
+            # choose an action for the current state and advance one step
+            action_count = action_count + 1
+
+            if action_count == 4:
+                action = agent.get_action(state)
+                action_count = 0
+
+            # map the network's two outputs back to env actions: 0 -> push left, 1 -> push right
+            if action == 0:
+                fake_action = 0
+            elif action == 1:
+                fake_action = 2
+
+            # take one step in the environment with the chosen action
+            next_state, reward, done, info = env.step(fake_action)
+            next_state = np.reshape(next_state, [1, state_size])
+            # give a -100 penalty for the action that ended the episode
+            #reward = reward if not done else -100
+
+            # save <s, a, r, s', done> to the replay memory
+            agent.replay_memory(state, fake_action, reward, next_state, done)
+            # train at every time step
+            agent.train_replay()
+            score += reward
+            state = next_state
+
+            if done:
+                env.reset()
+                # after every episode, copy the model being trained into the target model
+                agent.update_target_model()
+
+                # record the score of each episode for plotting
+                scores.append(score)
+                episodes.append(e)
+                #pylab.plot(episodes, scores, 'b')
+                #pylab.savefig("./save_graph/MountainCar_DQN.png")
+                print("episode:", e, " score:", score, " memory length:", len(agent.memory),
+                      " epsilon:", agent.epsilon)
+
+        # save the training model every 50 episodes
+        if e % 50 == 0:
+            agent.save_model("./save_model/MountainCar_DQN.h5")
diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole.h5 b/4-gym/1-mountaincar/save_model/MountainCar_DQN.h5
similarity index 67%
rename from Code 2. Cartpole/1. DQN/save_model/Cartpole.h5
rename to 4-gym/1-mountaincar/save_model/MountainCar_DQN.h5
index 25268bae..7f17c818 100644
Binary files a/Code 2. Cartpole/1. DQN/save_model/Cartpole.h5 and b/4-gym/1-mountaincar/save_model/MountainCar_DQN.h5 differ
diff --git a/Code 1. Grid World/1. Policy Iteration/__pycache__/agent.cpython-35.pyc b/Code 1. Grid World/1.
Policy Iteration/__pycache__/agent.cpython-35.pyc deleted file mode 100644 index c0fe58b3..00000000 Binary files a/Code 1. Grid World/1. Policy Iteration/__pycache__/agent.cpython-35.pyc and /dev/null differ diff --git a/Code 1. Grid World/1. Policy Iteration/environment.py b/Code 1. Grid World/1. Policy Iteration/environment.py deleted file mode 100644 index ab6e428d..00000000 --- a/Code 1. Grid World/1. Policy Iteration/environment.py +++ /dev/null @@ -1,214 +0,0 @@ -import tkinter as tk -import time -import numpy as np -from PIL import ImageTk, Image -from policy_iteration import PolicyIteration - -UNIT = 100 # pixels -HEIGHT = 5 # grid height -WIDTH = 5 # grid width -TRANSITION_PROB = 1 -POSSIBLE_ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # 가능한 모든 행동 -REWARDS = [] - - -class GraphicDisplay(tk.Tk): - - def __init__(self): - super(GraphicDisplay, self).__init__() - self.title('Policy Iteration') - self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) - self.texts = [] - self.arrows = [] - self.util = Util() - self.agent = PolicyIteration(self.util) - self._build_env() - - def _build_env(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - - # Buttons - iteration_button = tk.Button(self, text="Evaluation", command=self.policy_evaluation) - iteration_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10, window=iteration_button) - - policy_button = tk.Button(self, text="Improvement", command=self.policy_improvement) - policy_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10, window=policy_button) - - policy_button = tk.Button(self, text="move", command=self.move_by_policy) - policy_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10, window=policy_button) - - policy_button = tk.Button(self, text="clear", command=self.clear) - policy_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10, window=policy_button) - - # create grids - for col in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) - for row in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row - self.canvas.create_line(x0, y0, x1, y1) - - # image_load - self.up_image = ImageTk.PhotoImage(Image.open("../resources/up.png").resize((13, 13))) - self.right_image = ImageTk.PhotoImage(Image.open("../resources/right.png").resize((13, 13))) - self.left_image = ImageTk.PhotoImage(Image.open("../resources/left.png").resize((13, 13))) - self.down_image = ImageTk.PhotoImage(Image.open("../resources/down.png").resize((13, 13))) - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS)) - self.triangle_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((65, 65))) - self.circle_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((65, 65))) - - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.triangle1 = self.canvas.create_image(250, 150, image=self.triangle_image) - self.triangle2 = self.canvas.create_image(150, 250, image=self.triangle_image) - self.circle = self.canvas.create_image(250, 250, image=self.circle_image) - 
- # add reward text - self.text_reward(2, 2, "R : 1.0") - self.text_reward(1, 2, "R : -1.0") - self.text_reward(2, 1, "R : -1.0") - - # pack all - self.canvas.pack() - - def clear(self): - for i in self.texts: - self.canvas.delete(i) - - for i in self.arrows: - self.canvas.delete(i) - - self.canvas.delete(self.rectangle) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.agent = PolicyIteration(self.util) - - def text_value(self, row, col, contents,font='Helvetica', size=10, style='normal', anchor="nw"): - origin_x, origin_y = 85, 70 - x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) - font = (font, str(size), style) - return self.texts.append(self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor)) - - def text_reward(self, row, col, contents, font='Helvetica', size=10, style='normal', anchor="nw"): - origin_x, origin_y = 5, 5 - x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) - font = (font, str(size), style) - return self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor) - - def rectangle_move(self, action): - base_action = np.array([0, 0]) - self.render() - if action[0] == 1: # down - base_action[1] += UNIT - elif action[0] == -1: # up - base_action[1] -= UNIT - elif action[1] == 1: # right - base_action[0] += UNIT - elif action[1] == -1: # left - base_action[0] -= UNIT - - self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move agent - - def rectangle_location(self): - temp = self.canvas.coords(self.rectangle) - x = (temp[0] / 100) - 0.5 - y = (temp[1] / 100) - 0.5 - return int(y), int(x) - - def move_by_policy(self): - self.canvas.delete(self.rectangle) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - while len(self.agent.get_policy_table()[self.rectangle_location()[0]][self.rectangle_location()[1]]) != 0: - self.after(100, self.rectangle_move( - self.agent.get_action([self.rectangle_location()[0], self.rectangle_location()[1]]))) - - def draw_one_arrow(self, col, row, action): - - if col == 2 and row == 2: - return - - if action[0] > 0: # up - origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.up_image)) - - if action[1] > 0: # down - origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.down_image)) - - if action[2] > 0: # left - origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.left_image)) - - if action[3] > 0: # right - origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.right_image)) - - def draw_from_policy(self, policy_table): - for i in range(HEIGHT): - for j in range(WIDTH): - self.draw_one_arrow(i, j, policy_table[i][j]) - - def print_value_table(self, value_table): - for i in range(WIDTH): - for j in range(HEIGHT): - self.text_value(i, j, value_table[i][j]) - - def render(self): - time.sleep(0.1) - self.canvas.tag_raise(self.rectangle) - self.update() - - def policy_evaluation(self): - for i in self.texts: - self.canvas.delete(i) - self.agent.policy_evaluation() - self.print_value_table(self.agent.get_value_table()) - - def policy_improvement(self): - for i in self.arrows: - self.canvas.delete(i) - self.agent.policy_improvement() - self.draw_from_policy(self.agent.get_policy_table()) - - 
-class Util: - def __init__(self): - self.transition_probability = TRANSITION_PROB # 상태 변환 확률 - self.width = WIDTH # 그리드월드의 가로 길이 - self.height = HEIGHT # 그리드 월드의 세로 길이 - self.reward = [[0] * WIDTH for _ in range(HEIGHT)] - self.possible_actions = POSSIBLE_ACTIONS - self.reward[2][2] = 1 # 물고기 자리에 보상 1 - self.reward[1][2] = -1 # 불 자리에 보상 -1 - self.reward[2][1] = -1 # 불 자리에 보상 -1 - self.all_state = [] - - for x in range(WIDTH): - for y in range(HEIGHT): - state = [x, y] - self.all_state.append(state) - - def get_reward(self, state, action): - next_state = self.state_after_action(state, action) - return self.reward[next_state[0]][next_state[1]] - - def state_after_action(self, state, action): - return self.check_boundary([state[0] + action[0], state[1] + action[1]]) - - @staticmethod - def check_boundary(state): - state[0] = 0 if state[0] < 0 else WIDTH - 1 if state[0] > WIDTH - 1 else state[0] - state[1] = 0 if state[1] < 0 else HEIGHT - 1 if state[1] > HEIGHT - 1 else state[1] - return state - - def get_transition_prob(self, state, action): - return self.transition_probability - - def get_all_states(self): - return self.all_state diff --git a/Code 1. Grid World/1. Policy Iteration/policy_iteration.py b/Code 1. Grid World/1. Policy Iteration/policy_iteration.py deleted file mode 100644 index b3976ea5..00000000 --- a/Code 1. Grid World/1. Policy Iteration/policy_iteration.py +++ /dev/null @@ -1,123 +0,0 @@ -# -*- coding: utf-8 -*- -import copy -import random - -DISCOUNT_FACTOR = 0.9 - - -class PolicyIteration: - def __init__(self, env): - # environment object - self.env = env - # creaking 2 dimension list for the value function - self.value_table = [[0.00] * env.width for _ in range(env.height)] - # creating list for the policy - # this is random policy which has same probability for doing up, down, left, right - self.policy_table = [[[0.25, 0.25, 0.25, 0.25]] * env.width for _ in range(env.height)] - # setting terminal state - self.policy_table[2][2] = [] - - # method for the policy evaluation - # use Bellman Expectation Equation for calculating next value function table - def policy_evaluation(self): - next_value_table = copy.deepcopy(self.value_table) - - # Bellman Expectation Equation for the every states - for state in self.env.get_all_states(): - next_value_table[state[0]][state[1]] = round(self.calculate_value(state), 2) - - self.value_table = copy.deepcopy(next_value_table) - - # calculating new value function using Bellman Expectation Equation - def calculate_value(self, state): - value = 0 - - for action in self.env.possible_actions: - next_state = self.env.state_after_action(state, action) - reward = self.env.get_reward(state, action) - next_value = self.get_value(next_state) - value += self.get_policy(state, action) * (reward + DISCOUNT_FACTOR * next_value) - - # keep the value function of terminal states as 0 - if state == [2, 2]: - return 0.0 - - return value - - # This is Greedy Policy which always selects action of maximum value - def greedy_policy(self, state): - - value = -99999 - max_index = [] - # initialize the policy - result = [0.0, 0.0, 0.0, 0.0] - - # for every actions, calculating [reward + (discount factor) * (next state value function)] - for index, action in enumerate(self.env.possible_actions): - next_state = self.env.state_after_action(state, action) - reward = self.env.get_reward(state, action) - next_value = self.get_value(next_state) - temp = reward + DISCOUNT_FACTOR * next_value - - # For the greedy policy, originally we can't pick multiple actions - # but 
in this example, we allow to pick multiple actions which have same maximum values - if temp == value: - max_index.append(index) - elif temp > value: - value = temp - max_index.clear() - max_index.append(index) - - # probability of action - prob = 1 / len(max_index) - - for index in max_index: - result[index] = prob - - return result - - # using the greedy policy method, do the policy improvement - # under the current value function table - def policy_improvement(self): - next_policy = self.get_policy_table() - for state in self.env.get_all_states(): - - if state == [2, 2]: - continue - - next_policy[state[0]][state[1]] = self.greedy_policy(state) - self.policy_table = next_policy - - # get action according to the current policy - def get_action(self, state): - random_pick = random.randrange(100) / 100 - - policy = self.get_policy(state) - policy_sum = 0.0 - # return the action in the index - for index, value in enumerate(policy): - policy_sum += value - if random_pick < policy_sum: - return self.env.possible_actions[index] - - # get the policy table for the all states - def get_policy_table(self): - return copy.deepcopy(self.policy_table) - - # get policy of specific state and action - def get_policy(self, state, action=None): - # if there is no action given, then return the probabilities of all actions - if action is None: - return self.policy_table[state[0]][state[1]] - - if state == [2, 2]: - return 0.0 - - return self.policy_table[state[0]][state[1]][self.env.possible_actions.index(action)] - - def get_value_table(self): - return copy.deepcopy(self.value_table) - - def get_value(self, state): - return round(self.value_table[state[0]][state[1]], 2) - diff --git a/Code 1. Grid World/1. Policy Iteration/run.py b/Code 1. Grid World/1. Policy Iteration/run.py deleted file mode 100644 index 1c1a6f35..00000000 --- a/Code 1. Grid World/1. Policy Iteration/run.py +++ /dev/null @@ -1,5 +0,0 @@ -from environment import GraphicDisplay - -if __name__ == "__main__": - grid_world = GraphicDisplay() - grid_world.mainloop() diff --git a/Code 1. Grid World/2. Value Iteration/environment.py b/Code 1. Grid World/2. Value Iteration/environment.py deleted file mode 100644 index ee3deaa2..00000000 --- a/Code 1. Grid World/2. 
Value Iteration/environment.py +++ /dev/null @@ -1,256 +0,0 @@ -import tkinter as tk -import time -import numpy as np -from PIL import ImageTk, Image -from value_iteration import ValueIteration - -UNIT = 100 # pixels -HEIGHT = 5 # grid height -WIDTH = 5 # grid width -TRANSITION_PROB = 1 -POSSIBLE_ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # 가능한 모든 행동 -REWARDS = [] - - -class GraphicDisplay(tk.Tk): - def __init__(self): - super(GraphicDisplay, self).__init__() - self.title('Value Iteration') - self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) - self.texts = [] - self.arrows = [] - self.util = Util() - self.agent = ValueIteration(self.util) - self._build_env() - - def _build_env(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - - # Buttons - iteration_button = tk.Button(self, text="Calculate", command=self.calculate_value) - iteration_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10, window=iteration_button) - - policy_button = tk.Button(self, text="Print Policy", command=self.print_optimal_policy) - policy_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10, window=policy_button) - - policy_button = tk.Button(self, text="Move", command=self.move_by_policy) - policy_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10, window=policy_button) - - policy_button = tk.Button(self, text="Clear", command=self.clear) - policy_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10, window=policy_button) - - # create grids - for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) - for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r - self.canvas.create_line(x0, y0, x1, y1) - - # image_load - self.up_image = ImageTk.PhotoImage(Image.open("../resources/up.png").resize((13, 13))) - self.right_image = ImageTk.PhotoImage(Image.open("../resources/right.png").resize((13, 13))) - self.left_image = ImageTk.PhotoImage(Image.open("../resources/left.png").resize((13, 13))) - self.down_image = ImageTk.PhotoImage(Image.open("../resources/down.png").resize((13, 13))) - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS)) - self.triangle_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((65, 65))) - self.circle_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((65, 65))) - - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.hell1 = self.canvas.create_image(250, 150, image=self.triangle_image) - self.hell2 = self.canvas.create_image(150, 250, image=self.triangle_image) - self.circle = self.canvas.create_image(250, 250, image=self.circle_image) - - # add reward text - self.text_reward(2, 2, "R : 1.0") - self.text_reward(1, 2, "R : -1.0") - self.text_reward(2, 1, "R : -1.0") - - # pack all - self.canvas.pack() - - def clear(self): - for i in self.texts: - self.canvas.delete(i) - - for i in self.arrows: - self.canvas.delete(i) - - self.canvas.delete(self.rectangle) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.agent = 
ValueIteration(self.util) - - def reset(self): - self.update() - time.sleep(0.5) - self.canvas.delete(self.rectangle) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - # return observation - return self.canvas.coords(self.rectangle) - - def text_value(self, row, col, contents, font='Helvetica', size=12, style='normal', anchor="nw"): - origin_x, origin_y = 85, 70 - x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) - font = (font, str(size), style) - return self.texts.append(self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor)) - - def text_reward(self, row, col, contents, font='Helvetica', size=12, style='normal', anchor="nw"): - origin_x, origin_y = 5, 5 - x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) - font = (font, str(size), style) - return self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor) - - def step(self, action): - s = self.canvas.coords(self.rectangle) - - base_action = np.array([0, 0]) - if action == 0: # up - if s[1] > UNIT: - base_action[1] -= UNIT - elif action == 1: # down - if s[1] < (HEIGHT - 1) * UNIT: - base_action[1] += UNIT - elif action == 2: # right - if s[0] < (WIDTH - 1) * UNIT: - base_action[0] += UNIT - elif action == 3: # left - if s[0] > UNIT: - base_action[0] -= UNIT - - self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move agent - s_ = self.canvas.coords(self.rectangle) # next state - # reward function - if s_ == self.canvas.coords(self.circle): - reward = 1 - done = True - elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]: - reward = -1 - done = True - else: - reward = 0 - done = False - - return s_, reward, done - - def rectangle_move(self, action): - - base_action = np.array([0, 0]) - self.render() - - if action[0] == 1: # down - base_action[1] += UNIT - elif action[0] == -1: # up - base_action[1] -= UNIT - elif action[1] == 1: # right - base_action[0] += UNIT - elif action[1] == -1: # left - base_action[0] -= UNIT - - self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move agent - - def rectangle_location(self): - temp = self.canvas.coords(self.rectangle) - x = (temp[0] / 100) - 0.5 - y = (temp[1] / 100) - 0.5 - return int(y), int(x) - - def move_by_policy(self): - self.canvas.delete(self.rectangle) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - agent_state = [self.rectangle_location()[0], self.rectangle_location()[1]] - while len(self.agent.get_action(agent_state, False)) != 0: - agent_state = [self.rectangle_location()[0], self.rectangle_location()[1]] - self.after(100, self.rectangle_move(self.agent.get_action(agent_state, True))) - - def draw_one_arrow(self, col, row, action): - if action[0] == 1: # down - origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.down_image)) - - elif action[0] == -1: # up - origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.up_image)) - - elif action[1] == 1: # right - origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.right_image)) - - elif action[1] == -1: # left - origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.left_image)) - - def draw_from_values(self, state, action_list): - - i = 
state[0] - j = state[1] - - for action in action_list: - self.draw_one_arrow(i, j, action) - - def print_values(self, values): - for i in range(WIDTH): - for j in range(HEIGHT): - self.text_value(i, j, values[i][j]) - - def render(self): - time.sleep(0.1) - self.canvas.tag_raise(self.rectangle) - self.update() - - def calculate_value(self): - for i in self.texts: - self.canvas.delete(i) - self.agent.iteration() - print(self.agent.get_value_table) - self.print_values(self.agent.get_value_table()) - - def print_optimal_policy(self): - for i in self.arrows: - self.canvas.delete(i) - for state in self.util.get_all_states(): - action = self.agent.get_action(state, False) - self.draw_from_values(state, action) - - -class Util: - def __init__(self): - self.transition_probability = TRANSITION_PROB # 상태 변환 확률 - self.width = WIDTH # 그리드월드의 가로 길이 - self.height = HEIGHT # 그리드 월드의 세로 길이 - self.reward = [[0] * WIDTH for _ in range(HEIGHT)] - self.possible_actions = POSSIBLE_ACTIONS - self.reward[2][2] = 1 # 물고기 자리에 보상 1 - self.reward[1][2] = -1 # 불 자리에 보상 -1 - self.reward[2][1] = -1 # 불 자리에 보상 -1 - self.all_state = [] - - for x in range(WIDTH): - for y in range(HEIGHT): - state = [x, y] - self.all_state.append(state) - - def get_reward(self, state, action): - next_state = self.state_after_action(state,action) - return self.reward[next_state[0]][next_state[1]] - - def state_after_action(self, state, action): - return self.check_boundary([state[0] + action[0], state[1] + action[1]]) - - def check_boundary(self, state): - state[0] = 0 if state[0] < 0 else WIDTH - 1 if state[0] > WIDTH - 1 else state[0] - state[1] = 0 if state[1] < 0 else HEIGHT - 1 if state[1] > HEIGHT - 1 else state[1] - return state - - def get_transition_prob(self, state, action): - return self.transition_probability - - def get_all_states(self): - return self.all_state \ No newline at end of file diff --git a/Code 1. Grid World/2. Value Iteration/run.py b/Code 1. Grid World/2. Value Iteration/run.py deleted file mode 100644 index 1c1a6f35..00000000 --- a/Code 1. Grid World/2. Value Iteration/run.py +++ /dev/null @@ -1,5 +0,0 @@ -from environment import GraphicDisplay - -if __name__ == "__main__": - grid_world = GraphicDisplay() - grid_world.mainloop() diff --git a/Code 1. Grid World/2. Value Iteration/value_iteration.py b/Code 1. Grid World/2. Value Iteration/value_iteration.py deleted file mode 100644 index bfa9a671..00000000 --- a/Code 1. Grid World/2. 
Value Iteration/value_iteration.py +++ /dev/null @@ -1,79 +0,0 @@ -# -*- coding: utf-8 -*- -import copy -import random - -DISCOUNT_FACTOR = 0.9 - - -class ValueIteration: - def __init__(self, env): - # environment object - self.env = env - # creaking 2 dimension list for the value function - self.value_table = [[0.00] * env.width for _ in range(env.height)] - - # get next value function table from the current value function table - def iteration(self): - value_table_copy = copy.deepcopy(self.value_table) - for state in self.env.get_all_states(): - value_table_copy[state[0]][state[1]] = round(self.calculate_max_value(state), 2) - self.value_table = copy.deepcopy(value_table_copy) - print("value_table : " , self.value_table) - - # calculate next value function using Bellman Optimality Equation - def calculate_max_value(self, state): - - if state == [2, 2]: - return 0.0 - - # empty list for the value function - value_list = [] - - # do the calculation for the all possible actions - for action in self.env.possible_actions: - next_state = self.env.state_after_action(state, action) - reward = self.env.get_reward(state, action) - next_value = self.get_value(next_state) - value_list.append((reward + DISCOUNT_FACTOR * next_value)) - - print("value _ list : " , value_list) - - # return the maximum value(it is optimality equation!!) - return max(value_list) - - # get action according to the current value function table - def get_action(self, state, random_pick=True): - - action_list = [] - max_value = -99999 - - if state == [2, 2]: - return [] - - # calculating q values for the all actions and - # append the action to action list which has maximum q value - for action in self.env.possible_actions: - - next_state = self.env.state_after_action(state, action) - reward = self.env.get_reward(state, action) - next_value = self.get_value(next_state) - value = (reward + DISCOUNT_FACTOR * next_value) - - if value > max_value: - action_list.clear() - action_list.append(action) - max_value = value - elif value == max_value: - action_list.append(action) - - # pick one action from action_list which has same q value - if random_pick is True: - return random.sample(action_list, 1)[0] - - return action_list - - def get_value_table(self): - return copy.deepcopy(self.value_table) - - def get_value(self, state): - return round(self.value_table[state[0]][state[1]], 2) diff --git a/Code 1. Grid World/3. Monte-Carlo/MC_agent.py b/Code 1. Grid World/3. Monte-Carlo/MC_agent.py deleted file mode 100644 index c874f06e..00000000 --- a/Code 1. Grid World/3. 
Monte-Carlo/MC_agent.py +++ /dev/null @@ -1,89 +0,0 @@ -import numpy as np -import pandas as pd - - -# this is Monte-Carlo agent for the grid world -# it learns every episodes from the sample(which is the difference with dynamic programming) -class MCAgent: - def __init__(self, actions): - # actions = [0, 1, 2, 3] - self.width = 5 - self.height = 5 - self.actions = actions - self.learning_rate = 0.01 - self.discount_factor = 0.9 - self.epsilon = 0.9 - self.samples = [] - self.value_table = pd.DataFrame(columns=['value']) - - # check whether the state was visited - # if this is first visitation, then initialize the q function of the state - def check_state_exist(self, state): - if str(state) not in self.value_table.index: - self.value_table = self.value_table.append( - pd.Series( - [0] * len(self.value_table.columns), - index=self.value_table.columns, - name=str(state) - ) - ) - - # append sample to memory(state, reward, done) - def save_sample(self, state, reward, done): - self.samples.append([state, reward, done]) - - # for every episode, agent updates q function of visited states - def update(self): - G_t = 0 - visit_state = [] - for reward in reversed(self.samples): - state = str(reward[0]) - if state not in visit_state: - visit_state.append(state) - G_t = self.discount_factor * (reward[1] + G_t) - self.check_state_exist(state) - value = self.value_table.ix[state, 'value'] - self.value_table.ix[state, 'value'] = value + self.learning_rate * (G_t - value) - print("state : ", state, " G : ", G_t, " update : ", value + self.learning_rate * (G_t - value)) - print("values : ", self.value_table) - - # get action for the state according to the q function table - # agent pick action of epsilon-greedy policy - def get_action(self, state): - self.check_state_exist(state) - - if np.random.rand() > self.epsilon: - # take random action - action = np.random.choice(self.actions) - else: - # take action according to the q function table - next_state = self.possible_next_state(state) - next_state = next_state.reindex(np.random.permutation(next_state.index)) - action = next_state.argmax() - - return int(action) - - # get the possible next states - def possible_next_state(self, state): - state_col = state[0] - state_row = state[1] - - next_state = pd.Series( - [0] * len(self.actions), - index=self.actions, - ) - - if state_row != 0: - self.check_state_exist(str([state_col, state_row - 1])) - next_state.set_value(0, self.value_table.ix[str([state_col, state_row - 1]), 'value']) # up - if state_row != self.height - 1: - self.check_state_exist(str([state_col, state_row + 1])) - next_state.set_value(1, self.value_table.ix[str([state_col, state_row + 1]), 'value']) # down - if state_col != 0: - self.check_state_exist(str([state_col - 1, state_row])) - next_state.set_value(2, self.value_table.ix[str([state_col - 1, state_row]), 'value']) # left - if state_col != self.width - 1: - self.check_state_exist(str([state_col + 1, state_row])) - next_state.set_value(3, self.value_table.ix[str([state_col + 1, state_row]), 'value']) # right - - return next_state diff --git a/Code 1. Grid World/3. Monte-Carlo/run.py b/Code 1. Grid World/3. Monte-Carlo/run.py deleted file mode 100644 index a5efec22..00000000 --- a/Code 1. Grid World/3. 
Monte-Carlo/run.py +++ /dev/null @@ -1,30 +0,0 @@ -from environment import Env -from MC_agent import MCAgent - -# main loop -if __name__ == "__main__": - env = Env() - agent = MCAgent(actions=list(range(env.n_actions))) - - for episode in range(1000): - # reset environment and initialize state - state = env.reset() - - while True: - env.render() - - # take action and doing one step in the environment - # environment return next state, immediate reward and - # information about terminal of episode - action = agent.get_action(state) - next_state, reward, done = env.step(action) - - agent.save_sample(next_state, reward, done) - - # at the end of episode, update the q function table - if done: - print("episode : ", episode) - print("returns : ", agent.returns) - agent.update() - agent.returns.clear() - break \ No newline at end of file diff --git a/Code 1. Grid World/4. SARSA/SARSA_agent.py b/Code 1. Grid World/4. SARSA/SARSA_agent.py deleted file mode 100644 index 7e1298d3..00000000 --- a/Code 1. Grid World/4. SARSA/SARSA_agent.py +++ /dev/null @@ -1,50 +0,0 @@ -import numpy as np -import pandas as pd - - -# this is SARSA agent for the grid world -# it learns every time step from the sample -class SARSAgent: - def __init__(self, actions): - # actions = [0, 1, 2, 3] - self.actions = actions - self.learning_rate = 0.01 - self.discount_factor = 0.9 - self.epsilon = 0.9 - self.q_table = pd.DataFrame(columns=self.actions) - - # check whether the state was visited - # if this is first visitation, then initialize the q function of the state - def check_state_exist(self, state): - if state not in self.q_table.index: - self.q_table = self.q_table.append( - pd.Series( - [0] * len(self.actions), - index=self.q_table.columns, - name=state, - ) - ) - - # with sample , learns new q function - def learn(self, state, action, reward, next_state, next_action): - self.check_state_exist(next_state) - self.q_table.ix[state, action] = \ - self.q_table.ix[state, action] + self.learning_rate * \ - (reward + self.discount_factor * - self.q_table.ix[next_state, next_action - self.q_table.ix[state, action]]) - - # get action for the state according to the q function table - # agent pick action of epsilon-greedy policy - def get_action(self, state): - self.check_state_exist(state) - - if np.random.rand() > self.epsilon: - # take random action - action = np.random.choice(self.actions) - else: - # take action according to the q function table - state_action = self.q_table.ix[state, :] - state_action = state_action.reindex(np.random.permutation(state_action.index)) - action = state_action.argmax() - - return action diff --git a/Code 1. Grid World/4. SARSA/environment.py b/Code 1. Grid World/4. SARSA/environment.py deleted file mode 100644 index 30074db3..00000000 --- a/Code 1. Grid World/4. 
SARSA/environment.py +++ /dev/null @@ -1,136 +0,0 @@ -import time -import numpy as np -import tkinter as tk -from PIL import ImageTk, Image - -np.random.seed(1) - -UNIT = 100 # pixels -HEIGHT = 5 # grid height -WIDTH = 5 # grid width - - -class Env(tk.Tk): - def __init__(self): - super(Env, self).__init__() - self.action_space = ['u', 'd', 'l', 'r'] - self.n_actions = len(self.action_space) - self.title('monte carlo') - self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) - self.buildGraphic() - self.texts = [] - - def buildGraphic(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - - # create grids - for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) - for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r - self.canvas.create_line(x0, y0, x1, y1) - - # image_load - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS)) - self.triange_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((65, 65))) - self.circle_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((65, 65))) - - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.triangle1 = self.canvas.create_image(250, 150, image=self.triange_image) - self.triangle2 = self.canvas.create_image(150, 250, image=self.triange_image) - self.circle = self.canvas.create_image(250, 250, image=self.circle_image) - - # pack all - self.canvas.pack() - - def text_value(self, row, col, contents, action, font='Helvetica', size=10, style='normal', anchor="nw"): - - if action == 0: - origin_x, origin_y = 7, 42 - elif action == 1: - origin_x, origin_y = 85, 42 - elif action == 2: - origin_x, origin_y = 42, 5 - else: - origin_x, origin_y = 42, 77 - - x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) - font = (font, str(size), style) - return self.texts.append(self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor)) - - def print_value_all(self, q_table): - for i in self.texts: - self.canvas.delete(i) - self.texts.clear() - for i in range(HEIGHT): - for j in range(WIDTH): - for action in range(0, 4): - state = [i, j] - if str(state) in q_table.index: - temp = q_table.ix[str(state), action] - self.text_value(j, i, round(temp, 2), action) - - def coords_to_state(self, coords): - x = int((coords[0] - 50) / 100) - y = int((coords[1] - 50) / 100) - return [x, y] - - def state_to_coords(self, state): - x = int(state[0] * 100 + 50) - y = int(state[1] * 100 + 50) - return [x, y] - - def reset(self): - self.update() - time.sleep(0.5) - self.canvas.delete(self.rectangle) - origin = np.array([UNIT / 2, UNIT / 2]) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - # return observation - return self.coords_to_state(self.canvas.coords(self.rectangle)) - - def step(self, action): - state = self.canvas.coords(self.rectangle) - base_action = np.array([0, 0]) - self.render() - - if action == 0: # up - if state[1] > UNIT: - base_action[1] -= UNIT - elif action == 1: # down - if state[1] < (HEIGHT - 1) * UNIT: - base_action[1] += UNIT - elif action == 2: # left - if state[0] > UNIT: - base_action[0] -= UNIT - elif action == 3: # right - if state[0] < (WIDTH - 1) * UNIT: - base_action[0] += UNIT - - self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move 
agent - - next_state = self.canvas.coords(self.rectangle) # next state - - # reward function - if next_state == self.canvas.coords(self.circle): - reward = 100 - done = True - elif next_state in [self.canvas.coords(self.triangle1), self.canvas.coords(self.triangle2)]: - reward = -100 - done = True - else: - reward = 0 - done = False - - next_state = self.coords_to_state(next_state) - - return next_state, reward, done - - def render(self): - time.sleep(0.05) - self.update() diff --git a/Code 1. Grid World/4. SARSA/run.py b/Code 1. Grid World/4. SARSA/run.py deleted file mode 100644 index ed41c87d..00000000 --- a/Code 1. Grid World/4. SARSA/run.py +++ /dev/null @@ -1,36 +0,0 @@ -from environment import Env -from SARSA_agent import SARSAgent - -if __name__ == "__main__": - env = Env() - agent = SARSAgent(actions=list(range(env.n_actions))) - - for episode in range(1000): - # reset environment and initialize state - state = env.reset() - # get action of state from agent - action = agent.get_action(str(state)) - - while True: - env.render() - - # take action and doing one step in the environment - # environment return next state, immediate reward and - # information about terminal of episode - next_state, reward, done = env.step(action) - - # get action of state from agent - next_action = agent.get_action(str(next_state)) - - # with sample , agent learns new q function - agent.learn(str(state), action, reward, str(next_state), next_action) - - state = next_state - action = next_action - - # print q function of all states at screen - env.print_value_all(agent.q_table) - - # if episode ends, then break - if done: - break \ No newline at end of file diff --git a/Code 1. Grid World/5. Q Learning/QLearning_agent.py b/Code 1. Grid World/5. Q Learning/QLearning_agent.py deleted file mode 100644 index eae23556..00000000 --- a/Code 1. Grid World/5. Q Learning/QLearning_agent.py +++ /dev/null @@ -1,49 +0,0 @@ -import numpy as np -import pandas as pd - - -class QLearningAgent: - def __init__(self, actions): - # actions = [0, 1, 2, 3] - self.actions = actions - self.learning_rate = 0.01 - self.discount_factor = 0.9 - self.epsilon = 0.9 - self.q_table = pd.DataFrame(columns=self.actions) - - # check whether the state was visited - # if this is first visitation, then initialize the q function of the state - - def check_state_exist(self, state): - if state not in self.q_table.index: - self.q_table = self.q_table.append( - pd.Series( - [0] * len(self.actions), - index=self.q_table.columns, - name=state, - ) - ) - - # update q function with sample - def learn(self, state, action, reward, next_state): - self.check_state_exist(next_state) - q_1 = self.q_table.ix[state, action] - # using Bellman Optimality Equation to update q function - q_2 = reward + self.discount_factor * self.q_table.ix[next_state, :].max() - self.q_table.ix[state, action] += self.learning_rate * (q_2 - q_1) - - # get action for the state according to the q function table - # agent pick action of epsilon-greedy policy - def get_action(self, state): - self.check_state_exist(state) - - if np.random.rand() > self.epsilon: - # take random action - action = np.random.choice(self.actions) - else: - # take action according to the q function table - state_action = self.q_table.ix[state, :] - state_action = state_action.reindex(np.random.permutation(state_action.index)) - action = state_action.argmax() - - return action diff --git a/Code 1. Grid World/5. Q Learning/run.py b/Code 1. Grid World/5. 
Q Learning/run.py deleted file mode 100644 index 82c6dc5c..00000000 --- a/Code 1. Grid World/5. Q Learning/run.py +++ /dev/null @@ -1,34 +0,0 @@ -from environment import Env -from QLearning_agent import QLearningAgent - - -if __name__ == "__main__": - env = Env() - agent = QLearningAgent(actions=list(range(env.n_actions))) - - for episode in range(1000): - # reset environment and initialize state - state = env.reset() - - while True: - env.render() - - # get action of state from agent - action = agent.get_action(str(state)) - - # take action and doing one step in the environment - # environment return next state, immediate reward and - # information about terminal of episode - next_state, reward, done = env.step(action) - - # with sample , agent learns new q function - agent.learn(str(state), action, reward, str(next_state)) - - state = next_state - - # print q function of all states at screen - env.print_value_all(agent.q_table) - - # if episode ends, then break - if done: - break diff --git a/Code 1. Grid World/6. DQN/Gridworld_DQN.py b/Code 1. Grid World/6. DQN/Gridworld_DQN.py deleted file mode 100644 index e9d5fac1..00000000 --- a/Code 1. Grid World/6. DQN/Gridworld_DQN.py +++ /dev/null @@ -1,174 +0,0 @@ -import copy -import pylab -import random -import numpy as np -from environment import Env -from collections import deque -from keras.layers import Dense -from keras.optimizers import Adam -from keras.models import Sequential - -EPISODES = 1000 - - -# this is DQN Agent for the Cartpole -# it uses Neural Network to approximate q function -# and replay memory & target q network -class DQNAgent: - def __init__(self): - # if you want to see Cartpole learning, then change to True - self.render = False - - # actions which agent can do - self.action_space = [0, 1, 2, 3, 4] - # get size of state and action - self.action_size = len(self.action_space) - self.state_size = 22 - self.discount_factor = 0.99 - self.learning_rate = 0.001 - - self.epsilon = 1. 
# exploration - self.epsilon_decay = .9999 - self.epsilon_min = 0.01 - self.batch_size = 32 - self.train_start = 100 - - # create replay memory using deque - self.memory = deque(maxlen=10000) - self.model = self.build_model() - self.target_model = self.build_model() - # copy the model to target model - # --> initialize the target model so that the parameters of model & target model to be same - self.update_target_model() - - # approximate Q function using Neural Network - # state is input and Q Value of each action is output of network - def build_model(self): - model = Sequential() - model.add(Dense(20, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform')) - model.add(Dense(20, activation='relu', kernel_initializer='he_uniform')) - model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform')) - model.summary() - model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) - return model - - # after some time interval update the target model to be same with model - def update_target_model(self): - self.target_model.set_weights(self.model.get_weights()) - - # get action from model using epsilon-greedy policy - def get_action(self, state): - if np.random.rand() <= self.epsilon: - # The agent acts randomly - return random.randrange(self.action_size) - else: - # Predict the reward value based on the given state - state = np.float32(state) - q_values = self.model.predict(state) - return np.argmax(q_values[0]) - - # save sample to the replay memory - def replay_memory(self, state, action, reward, next_state, done): - self.memory.append((state, action, reward, next_state, done)) - if self.epsilon > self.epsilon_min: - self.epsilon *= self.epsilon_decay - - # pick samples randomly from replay memory (with batch_size) - def train_replay(self): - if len(self.memory) < self.train_start: - return - batch_size = min(self.batch_size, len(self.memory)) - mini_batch = random.sample(self.memory, batch_size) - - update_input = np.zeros((batch_size, self.state_size)) - update_target = np.zeros((batch_size, self.action_size)) - - for i in range(batch_size): - state, action, reward, next_state, done = mini_batch[i] - reward = np.float32(reward) - state = np.float32(state) - next_state = np.float32(next_state) - target = self.model.predict(state)[0] - - # like Q Learning, get maximum Q value at s' - # But from target model - if done: - target[action] = reward - else: - target = reward + self.discount_factor * \ - np.amax(self.model.predict(next_state)[0]) - - update_input[i] = state - update_target[i] = target - - # make minibatch which includes target q value and predicted q value - # and do the model fit! 
- self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0) - - # load the saved model - def load_model(self, name): - self.model.load_weights(name) - - # save the model which is under training - def save_model(self, name): - self.model.save_weights(name) - - -if __name__ == "__main__": - # maze game - # env = Maze() - env = Env() - agent = DQNAgent() - - global_step = 0 - # agent.load("same_vel_episode2 : 1000") - scores, episodes = [], [] - - for e in range(EPISODES): - done = False - score = 0 - state = env.reset() - state = np.reshape(state, [1, 22]) - - while not done: - # fresh env - if agent.render: - env.render() - global_step += 1 - - # get action for the current state and go one step in environment - action = agent.get_action(state) - next_state, reward, done = env.step(action) - next_state = np.reshape(next_state, [1, 22]) - - agent.replay_memory(state, action, reward, next_state, done) - # every time step we do training - agent.train_replay() - score += reward - - state = copy.deepcopy(next_state) - print("reward:", reward, " done:", done, " time_step:", global_step, " epsilon:", agent.epsilon) - - # every 100 time steps update the target model to be same with model - if global_step % 100 == 0: - agent.update_target_model() - - if done: - scores.append(score) - episodes.append(e) - pylab.plot(episodes, scores, 'b') - pylab.savefig("./save_graph/10by10.png") - print("episode:", e, " score:", score, " memory length:", len(agent.memory), - " epsilon:", agent.epsilon) - - if e % 100 == 0: - pass - agent.save_model("./save_model/10by10") - - # end of game - print('game over') - env.destroy() - - - - diff --git a/Code 1. Grid World/6. DQN/save_graph/10by10.png b/Code 1. Grid World/6. DQN/save_graph/10by10.png deleted file mode 100644 index b2fdee85..00000000 Binary files a/Code 1. Grid World/6. DQN/save_graph/10by10.png and /dev/null differ diff --git a/Code 1. Grid World/6. DQN/save_model/10by10 b/Code 1. Grid World/6. DQN/save_model/10by10 deleted file mode 100644 index d1b416c4..00000000 Binary files a/Code 1. Grid World/6. DQN/save_model/10by10 and /dev/null differ diff --git a/Code 1. Grid World/7. Policy Gradient/save_graph/10by10.png b/Code 1. Grid World/7. Policy Gradient/save_graph/10by10.png deleted file mode 100644 index dc314d66..00000000 Binary files a/Code 1. Grid World/7. Policy Gradient/save_graph/10by10.png and /dev/null differ diff --git a/Code 1. Grid World/7. Policy Gradient/save_model/10by10 b/Code 1. Grid World/7. Policy Gradient/save_model/10by10 deleted file mode 100644 index abba078c..00000000 Binary files a/Code 1. Grid World/7. Policy Gradient/save_model/10by10 and /dev/null differ diff --git a/Code 2. Cartpole/1. DQN/save_graph/Cartpole_DQN14.png b/Code 2. Cartpole/1. DQN/save_graph/Cartpole_DQN14.png deleted file mode 100644 index 5c0f54d3..00000000 Binary files a/Code 2. Cartpole/1. DQN/save_graph/Cartpole_DQN14.png and /dev/null differ diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole10.h5 b/Code 2. Cartpole/1. DQN/save_model/Cartpole10.h5 deleted file mode 100644 index dc6f1d69..00000000 Binary files a/Code 2. Cartpole/1. DQN/save_model/Cartpole10.h5 and /dev/null differ diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole8.h5 b/Code 2. Cartpole/1. DQN/save_model/Cartpole8.h5 deleted file mode 100644 index 02094b4e..00000000 Binary files a/Code 2. Cartpole/1. DQN/save_model/Cartpole8.h5 and /dev/null differ diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole9.h5 b/Code 2. Cartpole/1. 
DQN/save_model/Cartpole9.h5 deleted file mode 100644 index ce883e7d..00000000 Binary files a/Code 2. Cartpole/1. DQN/save_model/Cartpole9.h5 and /dev/null differ diff --git a/Code 2. Cartpole/2. Double DQN/save_graph/Cartpole_DoubleDQN.png b/Code 2. Cartpole/2. Double DQN/save_graph/Cartpole_DoubleDQN.png deleted file mode 100644 index b77a86eb..00000000 Binary files a/Code 2. Cartpole/2. Double DQN/save_graph/Cartpole_DoubleDQN.png and /dev/null differ diff --git a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN1.h5 b/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN1.h5 deleted file mode 100644 index da416bfe..00000000 Binary files a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN1.h5 and /dev/null differ diff --git a/Code 2. Cartpole/3. Dueling DQN/Cartpole_DuelingDQN.py b/Code 2. Cartpole/3. Dueling DQN/Cartpole_DuelingDQN.py deleted file mode 100644 index a51bf180..00000000 --- a/Code 2. Cartpole/3. Dueling DQN/Cartpole_DuelingDQN.py +++ /dev/null @@ -1,180 +0,0 @@ -import sys -import gym -import pylab -import random -import numpy as np -from collections import deque -from keras import backend as k -from keras.models import Model -from keras.optimizers import Adam -from keras.layers import Dense, Lambda, merge, Input - -EPISODES = 300 - - -# this is Dueling DQN Agent for the Cartpole -# it uses Neural Network to approximate q function -# and replay memory & target q network -class DuelingDQNAgent: - def __init__(self, state_size, action_size): - # if you want to see Cartpole learning, then change to True - self.render = False - - # get size of state and action - self.state_size = state_size - self.action_size = action_size - - # these is hyper parameters for the Dueling DQN - self.discount_factor = 0.99 - self.learning_rate = 0.001 - self.epsilon = 1.0 - self.epsilon_decay = 0.999 - self.epsilon_min = 0.01 - self.batch_size = 12 - self.train_start = 1000 - # create replay memory using deque - self.memory = deque(maxlen=2000) - - # create main model and target model - self.model = self.build_model() - self.target_model = self.build_model() - - # copy the model to target model - # --> initialize the target model so that the parameters of model & target model to be same - self.update_target_model() - - # the key point of Dueling network - # the network devided into two streams, 1. value function 2. 
advantaget function - # at the end of network, two streams are merged into one output stream which is Q function - def build_model(self): - input = Input(shape=(self.state_size,)) - x = Dense(32, input_shape=(self.state_size,), activation='relu', kernel_initializer='he_uniform')(input) - x = Dense(16, activation='relu', kernel_initializer='he_uniform')(x) - - state_value = Dense(1, kernel_initializer='he_uniform')(x) - state_value = Lambda(lambda s: k.expand_dims(s[:, 0], -1), output_shape=(self.action_size,))(state_value) - - action_advantage = Dense(self.action_size, kernel_initializer='he_uniform')(x) - action_advantage = Lambda(lambda a: a[:, :] - k.mean(a[:, :], keepdims=True), - output_shape=(self.action_size,))(action_advantage) - - q_value = merge([state_value, action_advantage], mode='sum') - model = Model(input=input, output=q_value) - model.summary() - model.compile(loss='mse', optimizer=Adam(self.learning_rate)) - return model - - # after some time interval update the target model to be same with model - def update_target_model(self): - self.target_model.set_weights(self.model.get_weights()) - - # get action from model using epsilon-greedy policy - def get_action(self, state): - if np.random.rand() <= self.epsilon: - return random.randrange(self.action_size) - else: - q_value = self.model.predict(state) - return np.argmax(q_value[0]) - - # save sample to the replay memory - def replay_memory(self, state, action, reward, next_state, done): - self.memory.append((state, action, reward, next_state, done)) - if self.epsilon > self.epsilon_min: - self.epsilon *= self.epsilon_decay - - # pick samples randomly from replay memory (with batch_size) - def train_replay(self): - if len(self.memory) < self.train_start: - return - batch_size = min(self.batch_size, len(self.memory)) - mini_batch = random.sample(self.memory, batch_size) - - update_input = np.zeros((batch_size, self.state_size)) - update_target = np.zeros((batch_size, self.action_size)) - - for i in range(batch_size): - state, action, reward, next_state, done = mini_batch[i] - target = self.model.predict(state)[0] - - # like Q Learning, get maximum Q value at s' - # But from target model - if done: - target[action] = reward - else: - target[action] = reward + self.discount_factor * \ - np.amax(self.target_model.predict(next_state)[0]) - - update_input[i] = state - update_target[i] = target - - # make minibatch which includes target q value and predicted q value - # and do the model fit! 
- self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0) - - # load the saved model - def load_model(self, name): - self.model.load_weights(name) - - # save the model which is under training - def save_model(self, name): - self.model.save_weights(name) - - -if __name__ == "__main__": - # in case of CartPole-v1, you can play until 500 time step - env = gym.make('CartPole-v1') - # get size of state and action from environment - state_size = env.observation_space.shape[0] - action_size = env.action_space.n - - agent = DuelingDQNAgent(state_size, action_size) - - scores, episodes = [], [] - - for e in range(EPISODES): - done = False - score = 0 - state = env.reset() - state = np.reshape(state, [1, state_size]) - # agent.load_model("./save_model/cartpole-master.h5") - - while not done: - if agent.render: - env.render() - - # get action for the current state and go one step in environment - action = agent.get_action(state) - next_state, reward, done, info = env.step(action) - next_state = np.reshape(next_state, [1, state_size]) - # if an action make the episode end, then gives penalty of -100 - reward = reward if not done or score == 499 else -100 - - # save the sample to the replay memory - agent.replay_memory(state, action, reward, next_state, done) - # every time step do the training - agent.train_replay() - score += reward - state = next_state - - if done: - env.reset() - # every episode update the target model to be same with model - - agent.update_target_model() - # every episode, plot the play time - score = score if score == 499 else score + 100 - scores.append(score) - episodes.append(e) - pylab.plot(episodes, scores, 'b') - pylab.savefig("./save_graph/Cartpole_Dueling_DQN.png") - print("episode:", e, " score:", score, " memory length:", len(agent.memory), - " epsilon:", agent.epsilon) - - # if the mean of scores of last 10 episode is bigger than 490 - # stop training - if np.mean(scores[-min(10, len(scores)):]) > 490: - sys.exit() - - # save the model - if e % 50 == 0: - agent.save_model("./save_model/Cartpole_DQN.h5") diff --git a/Code 2. Cartpole/3. Dueling DQN/save_graph/Cartpole_Dueling_DQN.png b/Code 2. Cartpole/3. Dueling DQN/save_graph/Cartpole_Dueling_DQN.png deleted file mode 100644 index 99ad0b00..00000000 Binary files a/Code 2. Cartpole/3. Dueling DQN/save_graph/Cartpole_Dueling_DQN.png and /dev/null differ diff --git a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN.h5 b/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN.h5 deleted file mode 100644 index 72105e16..00000000 Binary files a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN.h5 and /dev/null differ diff --git a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN1.h5 b/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN1.h5 deleted file mode 100644 index eac02b32..00000000 Binary files a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN1.h5 and /dev/null differ diff --git a/Code 2. Cartpole/4. Policy Gradient/Cartpole_PolicyGradient.py b/Code 2. Cartpole/4. Policy Gradient/Cartpole_PolicyGradient.py deleted file mode 100644 index 7a0ac089..00000000 --- a/Code 2. Cartpole/4. 
Policy Gradient/Cartpole_PolicyGradient.py +++ /dev/null @@ -1,163 +0,0 @@ -import sys -import gym -import pylab -import numpy as np -from keras.layers import Dense -from keras.models import Sequential -from keras.optimizers import Adam -from keras import backend as K - -EPISODES = 1000 - - -class PGAgent: - def __init__(self, state_size, action_size): - # Cartpole이 학습하는 것을 보려면 True로 바꿀 것 - self.render = True - - # agent를 학습시키지 않으려면 False로 바꿀 것 - self.is_train = True - - # state와 action의 크기를 가져와서 모델을 생성하는데 사용함 - self.state_size = state_size - self.action_size = action_size - - # Cartpole REINFORCE 학습의 Hyper parameter 들 - self.discount_factor = 0.99 - self.learning_rate = 0.001 - - # 학습할 모델을 생성 - self.model = self.build_model() - - # Policy Gradient 네트워크 학습하는 함수를 만듬 - self.optimizer = self.optimizer() - - # 상태, 행동, 보상을 기억하기 위한 리스트 생성 - self.states, self.actions, self.rewards = [], [], [] - - # Deep Neural Network 를 통해서 정책을 근사 - # 상태가 입력, 각 행동에 대한 확률이 출력인 모델을 생성 - def build_model(self): - model = Sequential() - model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')) - model.add(Dense(24, activation='relu', kernel_initializer='glorot_uniform')) - # 마지막 softmax 계층으로 각 행동에 대한 확률을 만드는 모델을 생성 - model.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')) - model.summary() - - return model - - def optimizer(self): - action = K.placeholder(shape=[None, self.action_size]) - discounted_rewards = K.placeholder(shape=[None, ]) - - # Policy Gradient 의 핵심 - # log(정책) * return 의 gradient 를 구해서 최대화시킴 - good_prob = K.sum(action * self.model.output, axis=1) - eligibility = K.log(good_prob) * discounted_rewards - loss = -K.sum(eligibility) - - optimizer = Adam(lr=self.learning_rate) - updates = optimizer.get_updates(self.model.trainable_weights, [], loss) - train = K.function([self.model.input, action, discounted_rewards], [], updates=updates) - - return train - - # 행동의 선택은 현재 네트워크에 대해서 각 행동에 대한 확률로 정책을 사용 - def get_action(self, state): - policy = self.model.predict(state, batch_size=1).flatten() - return np.random.choice(self.action_size, 1, p=policy)[0] - - # 에피소드가 끝나면 해당 에피소드의 보상를 이용해 return을 계산 - def discount_rewards(self, rewards): - discounted_rewards = np.zeros_like(rewards) - running_add = 0 - for t in reversed(range(0, len(rewards))): - running_add = running_add * self.discount_factor + rewards[t] - discounted_rewards[t] = running_add - return discounted_rewards - - # 각 스텝의 을 저장하는 함수 - def memory(self, state, action, reward): - self.states.append(state[0]) - self.rewards.append(reward) - act = np.zeros(self.action_size) - act[action] = 1 - self.actions.append(act) - - # 에피소드가 끝나면 모아진 메모리로 학습 - def train_episodes(self): - discounted_rewards = self.discount_rewards(self.rewards) - discounted_rewards -= np.mean(discounted_rewards) - discounted_rewards /= np.std(discounted_rewards) - - self.optimizer([self.states, self.actions, discounted_rewards]) - self.states, self.actions, self.rewards = [], [], [] - - # 저장한 모델을 불러옴 - def load_model(self, name): - self.model.load_weights(name) - - # 학습된 모델을 저장함 - def save_model(self, name): - self.model.save_weights(name) - - -if __name__ == "__main__": - # CartPole-v1의 경우 500 타임스텝까지 플레이가능 - env = gym.make('CartPole-v1') - - # 환경으로부터 상태와 행동의 크기를 가져옴 - state_size = env.observation_space.shape[0] - action_size = env.action_space.n - - # PG 에이전트의 생성 - agent = PGAgent(state_size, action_size) - - scores, episodes = [], [] - - for e in range(EPISODES): - done = False - score = 0 - state = 
-
-
-if __name__ == "__main__":
-    # in case of CartPole-v1, you can play up to 500 time steps
-    env = gym.make('CartPole-v1')
-
-    # get the sizes of state and action from the environment
-    state_size = env.observation_space.shape[0]
-    action_size = env.action_space.n
-
-    # create the PG agent
-    agent = PGAgent(state_size, action_size)
-
-    scores, episodes = [], []
-
-    for e in range(EPISODES):
-        done = False
-        score = 0
-        state = env.reset()
-        state = np.reshape(state, [1, state_size])
-        # agent.load_model("./save_model/cartpole-master.h5")
-
-        while not done:
-            if agent.render:
-                env.render()
-
-            # select an action for the current state and advance one step
-            action = agent.get_action(state)
-            next_state, reward, done, info = env.step(action)
-            next_state = np.reshape(next_state, [1, state_size])
-            reward = reward if not done or score == 499 else -100
-
-            # save the <state, action, reward> sample to memory
-            if agent.is_train:
-                agent.memory(state, action, reward)
-
-            score += reward
-            state = next_state
-
-            if done:
-                env.reset()
-                # at the end of every episode, train on the collected samples
-                if agent.is_train:
-                    agent.train_episodes()
-
-                # plot the score per episode
-                score = score if score == 500 else score + 100
-                scores.append(score)
-                episodes.append(e)
-                pylab.plot(episodes, scores, 'b')
-                pylab.savefig("./save_graph/Cartpole_PG.png")
-                print("episode:", e, " score:", score)
-
-                # stop training if the mean score of the last 10 episodes is above 490
-                if np.mean(scores[-min(10, len(scores)):]) > 490:
-                    sys.exit()
-
-        # save the model every 50 episodes
-        if e % 50 == 0:
-            agent.save_model("./save_model/Cartpole_PG.h5")
diff --git a/Code 2. Cartpole/4. Policy Gradient/save_graph/Cartpole_PG.png b/Code 2. Cartpole/4. Policy Gradient/save_graph/Cartpole_PG.png deleted file mode 100644 index 796da0d4..00000000 Binary files a/Code 2. Cartpole/4. Policy Gradient/save_graph/Cartpole_PG.png and /dev/null differ
diff --git a/Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_DQN1.h5 b/Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_DQN1.h5 deleted file mode 100644 index 2ff204a3..00000000 Binary files a/Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_DQN1.h5 and /dev/null differ
diff --git a/Code 2. Cartpole/5. Actor-Critic/Cartpole_ActorCritic.py b/Code 2. Cartpole/5. Actor-Critic/Cartpole_ActorCritic.py deleted file mode 100644 index 2b0ea5eb..00000000
--- a/Code 2. Cartpole/5. Actor-Critic/Cartpole_ActorCritic.py
+++ /dev/null
@@ -1,184 +0,0 @@
-import sys
-import gym
-import pylab
-import random
-import numpy as np
-from collections import deque
-from keras.layers import Dense
-from keras.optimizers import Adam
-from keras.models import Sequential
-from keras import backend as K
-
-EPISODES = 300
-
-
-class ACAgent:
-    def __init__(self, state_size, action_size):
-        # set True to watch CartPole while it is learning
-        self.render = False
-
-        # get the sizes of state and action to build the models
-        self.state_size = state_size
-        self.action_size = action_size
-
-        # hyperparameters for CartPole actor-critic
-        self.discount_factor = 0.99
-        self.actor_lr = 0.001
-        self.critic_lr = 0.01
-        self.batch_size = 32
-        self.train_start = 1000
-        self.memory = deque(maxlen=10000)
-
-        # create the actor and critic networks needed for actor-critic
-        self.actor, self.critic = self.build_model()
-
-        # build the optimizer that trains the actor network
-        self.actor_optimizer = self.actor_optimizer()
-
-    # approximate the policy and the value with deep neural networks:
-    # actor  -> input is the state, output is the probability of each action
-    # critic -> input is the state, output is the value of the state
-    def build_model(self):
-        # actor network
-        actor = Sequential()
-        actor.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform'))
-        actor.add(Dense(24, activation='relu', kernel_initializer='glorot_uniform'))
-        actor.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform'))
-
-        # critic network
-        critic = Sequential()
-        critic.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer="he_uniform"))
-        critic.add(Dense(24, activation='relu', kernel_initializer='he_uniform'))
-        critic.add(Dense(1, activation='linear', kernel_initializer='he_uniform'))
-        critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr))
-
-        actor.summary()
-        critic.summary()
-
-        return actor, critic
-
-    def actor_optimizer(self):
-        action = K.placeholder(shape=[None, self.action_size])
-        advantages = K.placeholder(shape=[None, ])
-
-        # the core of policy gradient:
-        # take the gradient of log(policy) * advantage and maximize it
-        good_prob = K.sum(action * self.actor.output, axis=1)
-        eligibility = K.log(good_prob + 1e-10) * advantages
-        loss = -K.sum(eligibility)
-
-        optimizer = Adam(lr=self.actor_lr)
-        updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
-        train = K.function([self.actor.input, action, advantages], [], updates=updates)
-
-        return train
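To make the quantities in `actor_optimizer` and the critic update below concrete, here is one transition traced through the target, the advantage, and the actor loss, with made-up numbers:

```python
import numpy as np

# one transition, with illustrative critic outputs
reward, discount_factor = 1.0, 0.99
value = 10.0        # critic's estimate V(s)
next_value = 10.5   # critic's estimate V(s')

target = reward + discount_factor * next_value   # 11.395, the critic's regression target
advantage = target - value                       # 1.395, how much better than expected

# actor loss for the chosen action (cf. actor_optimizer above)
probs = np.array([0.7, 0.3])          # actor output pi(.|s)
action_onehot = np.array([1.0, 0.0])  # the action that was taken
good_prob = np.sum(action_onehot * probs)
loss = -np.log(good_prob + 1e-10) * advantage    # positive advantage -> raise pi(a|s)
print(target, advantage, loss)
```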
-    # sample batch_size samples at random from the replay memory and train on them
-    def train_replay(self):
-        if len(self.memory) < self.train_start:
-            return
-        mini_batch = random.sample(self.memory, self.batch_size)
-
-        update_input = np.zeros((self.batch_size, self.state_size))
-        update_action = np.zeros((self.batch_size, self.action_size))
-        update_target = np.zeros((self.batch_size, 1))
-        advantages = np.zeros((self.batch_size,))
-
-        for i in range(self.batch_size):
-            state, action, reward, next_state, done = mini_batch[i]
-            value = self.critic.predict(state)[0]
-
-            # use the state value of s' to build the critic's update target
-            if done:
-                target = reward
-            else:
-                target = reward + self.discount_factor * \
-                                  self.critic.predict(next_state)[0]
-            update_input[i] = state
-            update_action[i] = action
-            update_target[i] = target
-            advantages[i] = target - value
-
-        # build a minibatch of targets and current values, then update the critic model in one call
-        self.critic.fit(update_input, update_target, batch_size=self.batch_size, epochs=1, verbose=0)
-
-        # train the actor network with the states, actions and the resulting advantages (target - value)
-        self.actor_optimizer([update_input, update_action, advantages])
-
-    # sample an action stochastically, using the actor's output probabilities as the policy
-    def get_action(self, state):
-        policy = self.actor.predict(state, batch_size=1).flatten()
-        return np.random.choice(self.action_size, 1, p=policy)[0]
-
-    # save the <state, action, reward, next state, done> sample of each step
-    def replay_memory(self, state, action, reward, next_state, done):
-        act = np.zeros(self.action_size)
-        act[action] = 1
-        self.memory.append((state, act, reward, next_state, done))
-
-    # load the saved models (two files, matching save_model and the call below)
-    def load_model(self, name1, name2):
-        self.actor.load_weights(name1)
-        self.critic.load_weights(name2)
-
-    # save the trained models
-    def save_model(self, name1, name2):
-        self.actor.save_weights(name1)
-        self.critic.save_weights(name2)
-
-
-if __name__ == "__main__":
-    # in case of CartPole-v1, you can play up to 500 time steps
-    env = gym.make('CartPole-v1')
-
-    # get the sizes of state and action from the environment
-    state_size = env.observation_space.shape[0]
-    action_size = env.action_space.n
-
-    agent = ACAgent(state_size, action_size)
-    scores, episodes = [], []
-
-    for e in range(EPISODES):
-        done = False
-        score = 0
-        state = env.reset()
-        state = np.reshape(state, [1, state_size])
-        # agent.load_model("./save_model/Cartpole-Actor.h5", "./save_model/Cartpole-Critic.h5")
-
-        while not done:
-            if agent.render:
-                env.render()
-
-            # select an action for the current state and advance one step
-            action = agent.get_action(state)
-            next_state, reward, done, info = env.step(action)
-            next_state = np.reshape(next_state, [1, state_size])
-            # give a penalty of -100 to the action that ended the episode
-            reward = reward if not done or score == 499 else -100
-
-            # save the <s, a, r, s', done> sample to the replay memory
-            agent.replay_memory(state, action, reward, next_state, done)
-            # train on every time step
-            agent.train_replay()
-
-            score += reward
-            state = next_state
-
-            if done:
-                env.reset()
-
-                # plot how many time steps the cartpole stayed up in each episode
-                score = score if score == 500 else score + 100
-                scores.append(score)
-                episodes.append(e)
-                pylab.plot(episodes, scores, 'b')
-                pylab.savefig("./save_graph/Cartpole_ActorCritc.png")
-                print("episode:", e, " score:", score, " memory length:", len(agent.memory))
-
-                # stop training if the mean score of the last 10 episodes is above 490
-                if np.mean(scores[-min(10, len(scores)):]) > 490:
-                    sys.exit()
-
-        # save the models every 50 episodes
-        if e % 50 == 0:
-            agent.save_model("./save_model/Cartpole_Actor.h5", "./save_model/Cartpole_Critic.h5")
diff --git a/Code 2. Cartpole/5. Actor-Critic/save_graph/Cartpole_ActorCritc.png b/Code 2. Cartpole/5. Actor-Critic/save_graph/Cartpole_ActorCritc.png deleted file mode 100644 index da9f6ed2..00000000 Binary files a/Code 2. Cartpole/5. Actor-Critic/save_graph/Cartpole_ActorCritc.png and /dev/null differ
diff --git a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Actor.h5 b/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Actor.h5 deleted file mode 100644 index df132fc2..00000000 Binary files a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Actor.h5 and /dev/null differ
diff --git a/Code 3. Atari Game/1. Breakout/Breakout_DQN.py b/Code 3. Atari Game/1. Breakout/Breakout_DQN.py deleted file mode 100644 index b3c26253..00000000 --- a/Code 3. Atari Game/1. 
Breakout/Breakout_DQN.py +++ /dev/null @@ -1,174 +0,0 @@ -import gym -import pylab -import random -import numpy as np -from collections import deque -from skimage.color import rgb2gray -from skimage.transform import resize - -from keras.models import Sequential -from keras.optimizers import RMSprop -from keras.layers import Dense, Flatten -from keras.layers.convolutional import Conv2D - -EPISODES = 5000 - - -class DQNAgent: - def __init__(self): - self.render = True - - self.state_size = (84, 84, 4) - self.action_size = 6 - - self.epsilon = 1.0 - self.epsilon_start = 1.0 - self.epsilon_end = 0.1 - self.epsilon_decay = 1000000. - self.epsilon_decay_step = \ - (self.epsilon_start - self.epsilon_end) / self.epsilon_decay - - self.batch_size = 32 - self.train_start = 20000 - self.update_target_rate = 10000 - self.discount_factor = 0.99 - self.memory = deque(maxlen=400000) - self.no_op_steps = 30 - self.learning_rate = 0.00025 - self.momentum = 0.95 - self.min_gradient = 0.01 - - self.model = self.build_model() - self.target_model = self.build_model() - self.update_target_model() - - def build_model(self): - model = Sequential() - model.add(Conv2D(32, (8, 8), input_shape=self.state_size, activation='relu', strides=(4, 4), - kernel_initializer='glorot_uniform')) - model.add(Conv2D(64, (4, 4), activation='relu', strides=(2, 2), - kernel_initializer='glorot_uniform')) - model.add(Conv2D(64, (3, 3), activation='relu', strides=(1, 1), - kernel_initializer='glorot_uniform')) - model.add(Flatten()) - model.add(Dense(512, activation='relu', kernel_initializer='glorot_uniform')) - model.add(Dense(self.action_size)) - model.summary() - model.compile(loss='mse', optimizer=RMSprop( - lr=self.learning_rate, rho=self.momentum, epsilon=self.min_gradient)) - return model - - def update_target_model(self): - self.target_model.set_weights(self.model.get_weights()) - - def get_action(self, history): - history = np.float32(history/255.0) - if np.random.rand() <= self.epsilon: - return random.randrange(self.action_size) - else: - q_value = self.model.predict(history) - return np.argmax(q_value[0]) - - def replay_memory(self, history, action, reward, history1, done): - self.memory.append((history, action, reward, history1, done)) - if self.epsilon > self.epsilon_end: - self.epsilon -= self.epsilon_decay_step - - def train_replay(self): - if len(self.memory) < self.train_start: - return - batch_size = min(self.batch_size, len(self.memory)) - mini_batch = random.sample(self.memory, batch_size) - - update_input = np.zeros((batch_size, self.state_size[0], self.state_size[1], self.state_size[2])) - update_target = np.zeros((batch_size, self.action_size)) - - for i in range(batch_size): - history, action, reward, history1, done = mini_batch[i] - history = np.float32(history/255.) - history1 = np.float32(history1/255.) 
-            target = self.model.predict(history)[0]
-
-            if done:
-                target[action] = reward
-            else:
-                target[action] = reward + self.discount_factor * np.amax(self.target_model.predict(history1)[0])
-            update_target[i] = target
-            update_input[i] = history
-
-        self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)
-
-    def load_model(self, name):
-        self.model.load_weights(name)
-
-    def save_model(self, name):
-        self.model.save_weights(name)
-
-
-def pre_processing(next_observe, observe):
-    # take the pixel-wise maximum of two consecutive frames to remove flickering,
-    # then convert to an 84x84 grayscale image
-    processed_observe = np.maximum(next_observe, observe)
-    processed_observe = np.uint8(resize(rgb2gray(processed_observe), (84, 84), mode='constant')*255)
-    return processed_observe
-
-
-if __name__ == "__main__":
-    env = gym.make('BreakoutDeterministic-v3')
-    agent = DQNAgent()
-
-    scores, episodes, global_step = [], [], 0
-
-    for e in range(EPISODES):
-        done = False
-        dead = False
-        score, start_live = 0, 5
-        observe = env.reset()
-        next_observe = observe
-        # do nothing for a random number of steps at the start of an episode
-        for _ in range(random.randint(1, agent.no_op_steps)):
-            observe = next_observe
-            next_observe, _, _, _ = env.step(1)
-
-        state = pre_processing(next_observe, observe)
-        history = np.stack((state, state, state, state), axis=2)
-        history = history.reshape(1, history.shape[0], history.shape[1], history.shape[2])
-
-        while not done:
-            if agent.render:
-                env.render()
-            observe = next_observe
-            action = agent.get_action(history)
-            next_observe, reward, done, info = env.step(action)
-            next_state = pre_processing(next_observe, observe)
-            next_state = np.reshape([next_state], (1, 84, 84, 1))
-            history1 = np.append(next_state, history[:, :, :, :3], axis=3)
-
-            if start_live > info['ale.lives']:
-                dead = True
-                start_live = info['ale.lives']
-
-            agent.replay_memory(history, action, reward, history1, done)
-            agent.train_replay()
-
-            score += reward
-            # count global steps; without this the target network is never synced on schedule
-            global_step += 1
-
-            if dead:
-                history = np.stack((next_state, next_state, next_state, next_state), axis=2)
-                history = np.reshape([history], (1, 84, 84, 4))
-                dead = False
-            else:
-                history = history1
-
-            if global_step % agent.update_target_rate == 0:
-                agent.update_target_model()
-
-            if done:
-                env.reset()
-                scores.append(score)
-                episodes.append(e)
-                pylab.plot(episodes, scores, 'b')
-                pylab.savefig("./save_graph/Breakout_DQN.png")
-                print("episode:", e, " score:", score, " memory length:", len(agent.memory),
-                      " epsilon:", agent.epsilon)
-
-        # save the model every 1,000 episodes
-        if e % 1000 == 0:
-            agent.save_model("./save_model/Breakout_DQN.h5")
\ No newline at end of file
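The history bookkeeping in the loop above (`np.append(next_state, history[:, :, :, :3], axis=3)`) maintains a rolling stack of the four most recent processed frames. A shape-level sketch with dummy arrays:

```python
import numpy as np

# the agent's state is a stack of the four most recent 84x84 frames
history = np.zeros((1, 84, 84, 4), dtype=np.uint8)    # frames t-3 .. t
next_state = np.ones((1, 84, 84, 1), dtype=np.uint8)  # newly processed frame t+1
# prepend the new frame and drop the oldest, exactly as in the loop above
history = np.append(next_state, history[:, :, :, :3], axis=3)
print(history.shape)  # (1, 84, 84, 4)
```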
diff --git a/Code 3. Atari Game/1. Breakout/Breakout_PG.py b/Code 3. Atari Game/1. Breakout/Breakout_PG.py deleted file mode 100644 index 2ab553bf..00000000
--- a/Code 3. Atari Game/1. Breakout/Breakout_PG.py
+++ /dev/null
@@ -1,211 +0,0 @@
-import gym
-import random
-import tensorflow as tf
-import numpy as np
-
-DIM = 105*80*2
-gamma = 0.99
-batch_size = 10
-
-def weight_variable(shape):
-    #bound = 1 / np.sqrt(np.sum(shape))
-    initial = tf.truncated_normal(shape, stddev=0.05)
-    return tf.Variable(initial)
-    #return tf.Variable(tf.random_uniform(shape, minval=0, maxval=bound))
-
-def bias_variable(shape):
-    initial = tf.constant(0.1, shape=shape)
-    return tf.Variable(initial)
-
-def prepro(state):
-    # convert the 210x160x3 frame to grayscale (vectorized; the original
-    # per-pixel double loop computed the same means very slowly)
-    gray_state = np.mean(state, axis=2)
-
-    # get rid of background noise and downsample by 2 to 105x80
-    gray_state[gray_state == 142] = 0
-    gray_state = gray_state[::2, ::2]
-    return gray_state
-
-def discountRewards(rewards):
-    discounted_r = np.zeros_like(rewards)
-    running_add = 0
-    for t in reversed(range(0, len(rewards))):
-        running_add = running_add * gamma + rewards[t]
-        discounted_r[t] = running_add
-    return discounted_r
-
-class AGENT():
-    def __init__(self, learning_rate=1e-4):
-        self.learning_rate = learning_rate
-
-        self.conv_W1 = weight_variable([5, 5, 2, 32])
-        self.conv_b1 = bias_variable([32])
-        self.conv_W2 = weight_variable([4, 4, 32, 64])
-        self.conv_b2 = bias_variable([64])
-        self.conv_W3 = weight_variable([3, 3, 64, 64])
-        self.conv_b3 = bias_variable([64])
-
-        self.fc_W1 = weight_variable([13*10*64, 512])
-        self.fc_b1 = bias_variable([512])
-        self.fc_W2 = weight_variable([512, 2])
-        self.fc_b2 = bias_variable([2])
-
-        self.v_conv_W1 = weight_variable([5, 5, 2, 16])
-        self.v_conv_b1 = bias_variable([16])
-        self.v_conv_W2 = weight_variable([4, 4, 16, 32])
-        self.v_conv_b2 = bias_variable([32])
-        self.v_conv_W3 = weight_variable([3, 3, 32, 32])
-        self.v_conv_b3 = bias_variable([32])
-
-        self.v_fc_W1 = weight_variable([13*10*32, 512])
-        self.v_fc_b1 = bias_variable([512])
-        self.v_fc_W2 = weight_variable([512, 1])
-        self.v_fc_b2 = bias_variable([1])
-
-        self.state, self.prob, self.conv_drop, self.fc_drop = self.getPolicy()
-        self.act, self.adv, self.train = self.policyOptimizer()
-        self.v_state, self.value = self.getValue()
-        self.v_n_state, self.n_value = self.getValue()
-        self.rwd, self.v_train = self.valueOptimizer()
-
-        # initialize variables only after the optimizers are built, so their
-        # slot variables are initialized as well
-        self.sess = tf.InteractiveSession()
-        self.sess.run(tf.initialize_all_variables())
-
-    def getAction(self, get_action_state):
-        # sample from the policy network (the original stub ignored the
-        # network and picked 4 or 5 uniformly at random)
-        action_prob = self.getActionProb(get_action_state)
-        action = 4 if random.random() < action_prob[0][0] else 5
-        return action
-
-    def policyOptimizer(self):
-        act = tf.placeholder(tf.float32, [None, 2])
-        adv = tf.placeholder(tf.float32, [None, 1])
-
-        # keep_dims so the log-probabilities stay [N, 1] and multiply
-        # elementwise with adv instead of broadcasting to [N, N]
-        good_probabilities = tf.reduce_sum(tf.mul(self.prob, act),
-                                           reduction_indices=[1], keep_dims=True)
-        log_probabilities = tf.log(tf.clip_by_value(good_probabilities, 1e-10, 1e+8)) * adv
-        loss = -tf.reduce_sum(log_probabilities)
-        optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(loss)
-
-        return act, adv, optimizer
-
-    def valueOptimizer(self):
-        rwd = tf.placeholder(tf.float32, [None, 1])
-
-        value1 = self.value
-        value2 = rwd + self.n_value*gamma
-        v_loss = tf.reduce_mean(tf.square(value2 - value1), reduction_indices=[1])
-        v_optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(v_loss)
-
-        return rwd, v_optimizer
-    def getPolicy(self):
-        state = tf.placeholder(tf.float32, [None, 105, 80, 2])
-        conv_drop = tf.placeholder(tf.float32)
-        fc_drop = tf.placeholder(tf.float32)
-        state_image = tf.reshape(state, [-1, 105, 80, 2])
-
-        conv_h1_out = tf.nn.conv2d(state_image, self.conv_W1, strides=[1, 2, 2, 1], padding="SAME")
-        conv_h1 = tf.nn.relu(conv_h1_out + self.conv_b1)
-        conv_h1_drop = tf.nn.dropout(conv_h1, conv_drop)
-        conv_h2_out = tf.nn.conv2d(conv_h1_drop, self.conv_W2, strides=[1, 2, 2, 1], padding="SAME")
-        conv_h2 = tf.nn.relu(conv_h2_out + self.conv_b2)
-        conv_h2_drop = tf.nn.dropout(conv_h2, conv_drop)
-        conv_h3_out = tf.nn.conv2d(conv_h2_drop, self.conv_W3, strides=[1, 2, 2, 1], padding="SAME")
-        conv_h3 = tf.nn.relu(conv_h3_out + self.conv_b3)
-
-        conv_h3_flat = tf.reshape(conv_h3, [-1, 13*10*64])
-
-        fc_h1 = tf.nn.relu(tf.matmul(conv_h3_flat, self.fc_W1) + self.fc_b1)
-        fc_h1_drop = tf.nn.dropout(fc_h1, fc_drop)
-        prob = tf.nn.softmax(tf.matmul(fc_h1_drop, self.fc_W2) + self.fc_b2)
-
-        return state, prob, conv_drop, fc_drop
-
-    def getValue(self):
-        v_state = tf.placeholder(tf.float32, [None, 105, 80, 2])
-        v_state_image = tf.reshape(v_state, [-1, 105, 80, 2])
-
-        v_conv_h1_out = tf.nn.conv2d(v_state_image, self.v_conv_W1, strides=[1, 2, 2, 1], padding="SAME")
-        v_conv_h1 = tf.nn.relu(v_conv_h1_out + self.v_conv_b1)
-        v_conv_h1_drop = tf.nn.dropout(v_conv_h1, self.conv_drop)
-        v_conv_h2_out = tf.nn.conv2d(v_conv_h1_drop, self.v_conv_W2, strides=[1, 2, 2, 1], padding="SAME")
-        v_conv_h2 = tf.nn.relu(v_conv_h2_out + self.v_conv_b2)
-        v_conv_h2_drop = tf.nn.dropout(v_conv_h2, self.conv_drop)
-        v_conv_h3_out = tf.nn.conv2d(v_conv_h2_drop, self.v_conv_W3, strides=[1, 2, 2, 1], padding="SAME")
-        v_conv_h3 = tf.nn.relu(v_conv_h3_out + self.v_conv_b3)
-
-        v_conv_h3_flat = tf.reshape(v_conv_h3, [-1, 13*10*32])
-
-        v_fc_h1 = tf.nn.relu(tf.matmul(v_conv_h3_flat, self.v_fc_W1) + self.v_fc_b1)
-        v_fc_h1_drop = tf.nn.dropout(v_fc_h1, self.fc_drop)
-        value = tf.matmul(v_fc_h1_drop, self.v_fc_W2) + self.v_fc_b2
-
-        return v_state, value
-
-    def getActionProb(self, get_action_prob_state):
-        action_prob = self.sess.run(self.prob, feed_dict={self.state: get_action_prob_state, self.conv_drop: 1.0, self.fc_drop: 1.0})
-        return action_prob
-
-
-env = gym.make("Breakout-v0")
-agent = AGENT()
-obs = env.reset()
-prev_x = None
-states, rewards, actions, next_states = [], [], [], []
-running_reward = None
-reward_sum, step = 0, 0
-episode_number = 0
-
-while True:
-    env.render()
-    # preprocess the raw frame and stack it with the previous frame as a 2-channel state
-    cur_x = prepro(obs)
-    x = np.stack([cur_x, prev_x], axis=-1) if prev_x is not None else np.zeros([105, 80, 2])
-    x = np.reshape(x, [-1, 105, 80, 2])
-    states.append(x)
-    prev_x = cur_x
-    if step != 0:
-        next_states.append(x)
-
-    action = agent.getAction(x)
-    actions.append([0, 1] if action == 5 else [1, 0])
-    obs, reward, done, info = env.step(action)
-    rewards.append(reward)
-    reward_sum += reward
-    step += 1
-
-    if done:
-        next_states.append(np.reshape(np.zeros(DIM), [-1, 105, 80, 2]))
-        episode_number += 1
-
-        discounted_epi_reward = discountRewards(rewards)
-
-        #discounted_epi_reward -= np.mean(discounted_epi_reward)
-        #discounted_epi_reward /= np.std(discounted_epi_reward)
-
-        epi_state = np.vstack(states)
-        epi_reward = np.vstack(discounted_epi_reward)
-        epi_action = np.vstack(actions)
-        epi_n_state = np.vstack(next_states)
-
-        states, rewards, actions, next_states = [], [], [], []
-
-        if episode_number % batch_size == 0:
-            agent.sess.run(agent.v_train, feed_dict={agent.v_state: epi_state, agent.rwd: epi_reward,
-                                                     agent.v_n_state: epi_n_state, agent.conv_drop: 0.4, agent.fc_drop: 0.5})
-
-            # the advantage is the discounted return minus the critic's value estimate
-            epi_advantage = epi_reward - agent.sess.run(agent.value, feed_dict={agent.v_state: epi_state})
-
-            agent.sess.run(agent.train, feed_dict={agent.state: epi_state, agent.act: epi_action,
-                                                   agent.adv: epi_advantage, agent.conv_drop: 0.4, agent.fc_drop: 0.5})
-        reward_sum, step = 0, 0
-        obs = env.reset()  # reset env
-        prev_x = None
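This script targets the TensorFlow 0.x API. Under TensorFlow 1.x the deprecated calls it uses map to newer names; a small sketch of the equivalents (assuming TF 1.x):

```python
import tensorflow as tf

# TF 0.x                               ->  TF 1.x
# tf.mul(a, b)                         ->  tf.multiply(a, b)
# tf.reduce_sum(x, reduction_indices)  ->  tf.reduce_sum(x, axis=...)
# tf.initialize_all_variables()        ->  tf.global_variables_initializer()
prob = tf.constant([[0.7, 0.3]])
act = tf.constant([[1.0, 0.0]])
good_prob = tf.reduce_sum(tf.multiply(prob, act), axis=1)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    print(sess.run(good_prob))  # [0.7]
```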
diff --git a/Code 3. Atari Game/1. Breakout/save_graph/Breakout_DQN1.png b/Code 3. Atari Game/1. Breakout/save_graph/Breakout_DQN1.png deleted file mode 100644 index 7bcd6443..00000000 Binary files a/Code 3. Atari Game/1. Breakout/save_graph/Breakout_DQN1.png and /dev/null differ
diff --git a/Code 3. Atari Game/3. A3C/Pong_A3C.py b/Code 3. Atari Game/3. A3C/Pong_A3C.py deleted file mode 100644 index e69de29b..00000000
diff --git a/Code 3. Atari Game/Understanding DQN.pptx b/Code 3. Atari Game/Understanding DQN.pptx deleted file mode 100644 index 80dfbae9..00000000 Binary files a/Code 3. Atari Game/Understanding DQN.pptx and /dev/null differ
diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..ac035a7b
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 RLCode
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README-kr.md b/README-kr.md deleted file mode 100644 index 74f738b1..00000000
--- a/README-kr.md
+++ /dev/null
@@ -1,47 +0,0 @@
-

- --------------------------------------------------------------------------------
-
-> A repo of reinforcement learning examples built by the [RLCode](https://rlcode.github.io) team. [English](./README.md)
->
-> Maintainers - [이웅원](https://github.com/dnddnjs), [이영무](https://github.com/zzing0907), [양혁렬](https://github.com/Hyeokreal), [이의령](https://github.com/wooridle), [김건우](https://github.com/keon)
-
-[Pull Requests](https://github.com/rlcode/reinforcement-learning/pulls) are always welcome.
-If you run into a problem or a bug, or have a question, please open an [issue](https://github.com/rlcode/reinforcement-learning/issues).
-
-
-## Dependencies
-1. Python 3.5
-2. Tensorflow 1.0.0
-3. Keras
-4. numpy
-5. pandas
-6. pillow
-7. matplotlib
-8. scikit-image
-9. h5py
-
-## Table of Contents
-
-**Code 1** - Building up the basics of reinforcement learning in Grid World, a relatively simple environment
-
-- [Policy Iteration](./Code%201.%20Grid%20World/1.%20Policy%20Iteration)
-- [Value Iteration](./Code%201.%20Grid%20World/2.%20Value%20Iteration)
-- [Monte Carlo](./Code%201.%20Grid%20World/3.%20Monte-Carlo)
-- [SARSA](./Code%201.%20Grid%20World/4.%20SARSA)
-- [Q-Learning](./Code%201.%20Grid%20World/5.%20Q%20Learning)
-- [Deep Q Network](./Code%201.%20Grid%20World/6.%20DQN)
-- [Policy Gradient](./Code%201.%20Grid%20World/7.%20Policy%20Gradient)
-
-**Code 2** - Applying deep-learning-based reinforcement learning algorithms to the CartPole example
-
-- [Deep Q Network](./Code%202.%20Cartpole/1.%20DQN)
-- [Double Deep Q Network](./Code%202.%20Cartpole/2.%20Double%20DQN)
-- [Dueling Deep Q Network](./Code%202.%20Cartpole/3.%20Dueling%20DQN)
-- [Policy Gradient](./Code%202.%20Cartpole/4.%20Policy%20Gradient)
-- [Actor Critic](./Code%202.%20Cartpole/5.%20Actor-Critic)
-
-**Code 3** - Building agents that master the more complex Atari games with deep learning
-
-- [Breakout](./Code%203.%20Atari%20Game/1.%20Breakout)
-- [Pong](./Code%203.%20Atari%20Game/2.%20Pong)
diff --git a/README.md b/README.md index 440b96cd..870e686b 100644
--- a/README.md
+++ b/README.md
@@ -2,17 +2,17 @@

--------------------------------------------------------------------------------

-> Minimal and clean examples of reinforcement learning algorithms presented by [RLCode](https://rlcode.github.io) team. [[한국어]](./README-kr.md)
+> Minimal and clean examples of reinforcement learning algorithms presented by [RLCode](https://rlcode.github.io) team. [[한국어]](https://github.com/rlcode/reinforcement-learning-kr)
>
> Maintainers - [Woongwon](https://github.com/dnddnjs), [Youngmoo](https://github.com/zzing0907), [Hyeokreal](https://github.com/Hyeokreal), [Uiryeong](https://github.com/wooridle), [Keon](https://github.com/keon)

-From the most basic algorithms to the more recent ones categorized as 'deep reinforcement learning', the examples are easy to read with comments.
+From the basics to deep reinforcement learning, this repo provides easy-to-read code examples. One file for each algorithm.
Please feel free to create a [Pull Request](https://github.com/rlcode/reinforcement-learning/pulls), or open an [issue](https://github.com/rlcode/reinforcement-learning/issues)!

## Dependencies
1. Python 3.5
2. Tensorflow 1.0.0
-3. Keras
+3. Keras
4. numpy
5. pandas
6. 
matplotlib
@@ -27,26 +27,29 @@
pip install -r requirements.txt

## Table of Contents

-**Code 1** - Mastering the basics of reinforcement learning in the simplified world called "Grid World"
+**Grid World** - Mastering the basics of reinforcement learning in the simplified world called "Grid World"

-- [Policy Iteration](./Code%201.%20Grid%20World/1.%20Policy%20Iteration)
-- [Value Iteration](./Code%201.%20Grid%20World/2.%20Value%20Iteration)
-- [Monte Carlo](./Code%201.%20Grid%20World/3.%20Monte-Carlo)
-- [SARSA](./Code%201.%20Grid%20World/4.%20SARSA)
-- [Q-Learning](./Code%201.%20Grid%20World/5.%20Q%20Learning)
-- [Deep Q Network](./Code%201.%20Grid%20World/6.%20DQN)
-- [Policy Gradient](./Code%201.%20Grid%20World/7.%20Policy%20Gradient)
+- [Policy Iteration](./1-grid-world/1-policy-iteration)
+- [Value Iteration](./1-grid-world/2-value-iteration)
+- [Monte Carlo](./1-grid-world/3-monte-carlo)
+- [SARSA](./1-grid-world/4-sarsa)
+- [Q-Learning](./1-grid-world/5-q-learning)
+- [Deep SARSA](./1-grid-world/6-deep-sarsa)
+- [REINFORCE](./1-grid-world/7-reinforce)

-**Code 2** - Applying deep reinforcement learning on basic Cartpole game.
+**CartPole** - Applying deep reinforcement learning to the basic CartPole game.

-- [Deep Q Network](./Code%202.%20Cartpole/1.%20DQN)
-- [Double Deep Q Network](./Code%202.%20Cartpole/2.%20Double%20DQN)
-- [Dueling Deep Q Network](./Code%202.%20Cartpole/3.%20Dueling%20DQN)
-- [Policy Gradient](./Code%202.%20Cartpole/4.%20Policy%20Gradient)
-- [Actor Critic](./Code%202.%20Cartpole/5.%20Actor-Critic)
-- Asynchronous Advantage Actor Critic (A3C) - WIP
+- [Deep Q Network](./2-cartpole/1-dqn)
+- [Double Deep Q Network](./2-cartpole/2-double-dqn)
+- [Policy Gradient](./2-cartpole/3-reinforce)
+- [Actor Critic (A2C)](./2-cartpole/4-actor-critic)
+- [Asynchronous Advantage Actor Critic (A3C)](./2-cartpole/5-a3c)

-**Code 3** - Mastering Atari games with Deep Reinforcement Learning
+**Atari** - Mastering Atari games with Deep Reinforcement Learning

-- [Breakout](./Code%203.%20Atari%20Game/1.%20Breakout) - DQN, PG, A3C
-- [Pong](./Code%203.%20Atari%20Game/2.%20Pong) - DQN, PG, A3C
+- **Breakout** - [DQN](./3-atari/1-breakout/breakout_dqn.py), [DDQN](./3-atari/1-breakout/breakout_ddqn.py), [Dueling DDQN](./3-atari/1-breakout/breakout_ddqn.py), [A3C](./3-atari/1-breakout/breakout_a3c.py)
+- **Pong** - [Policy Gradient](./3-atari/2-pong/pong_reinforce.py)
+
+**OpenAI GYM** - [WIP]
+
+- Mountain Car - [DQN](./4-gym/1-mountaincar)
diff --git a/wiki/install_guide_osx+ubuntu.md b/wiki/install_guide_osx+ubuntu.md new file mode 100644 index 00000000..58770933
--- /dev/null
+++ b/wiki/install_guide_osx+ubuntu.md
@@ -0,0 +1,341 @@
+## Development Environment Setup 1: Linux (Ubuntu)
+
+Linux is the best-known open-source operating system. Because all of its source code is public, it exists in a great many variants, and among them Ubuntu has the largest user base. The Ubuntu Foundation releases a new version in the first and second half of every year; this book uses Ubuntu 14.04, released in the first half of 2014. The environment setup that follows assumes Ubuntu 14.04 is already installed.
+
+
+
+### 2.1.1 Checking the Python version on Ubuntu
+
+One convenience of Linux is that Python comes preinstalled. Python exists in a 2.x line and a 3.x line; this book uses `Python 3.5`. Press `Ctrl+Alt+T` on the desktop to open a terminal, type the following command, and press Enter to see the installed Python version.
+
+```shell
+$ python -V
+```
+
+Ubuntu 14.04 ships with both `Python 2.7` and a `Python 3` interpreter preinstalled.
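If you want to confirm which interpreter you are actually running from inside Python itself (a quick check of ours, not part of the book's text), the `sys` module reports both the version and the binary's path:

```python
import sys
print(sys.version)     # e.g. a 3.5.x string for the interpreter this book targets
print(sys.executable)  # filesystem path of the running interpreter
```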
+### 2.1.2 Installing and configuring PyCharm Community
+
+We will build reinforcement learning agents and train them in simulated environments, which calls for a place to write and edit code: an IDE (Integrated Development Environment). There are many IDEs, but this book uses PyCharm as its Python IDE.
+
+PyCharm can be installed from its official homepage[[1\]](#_ftn1), which offers Windows, Linux, and macOS downloads. PyCharm comes in a paid edition, `PyCharm Professional Edition`, and a free edition, `PyCharm Community Edition`. We will use `PyCharm Community` to develop the agents, so the installation steps below cover the Community edition.
+
+**Installation proceeds in the following order.**
+
+1. Download PyCharm Community from the official download page.
+
+   Link: [https://www.jetbrains.com/pycharm/download/#section=linux](https://www.jetbrains.com/pycharm/download/#section=linux)
+
+2. Go to the download directory and extract the archive:
+
+   ```shell
+   $ tar xfz pycharm-community-2016.3.2.tar.gz
+   ```
+
+3. Move into the bin folder of the extracted directory:
+
+   ```shell
+   $ cd ~/pycharm-community-2016.3.2/bin
+   ```
+
+4. Launch PyCharm with the following command:
+
+   ```shell
+   $ sh pycharm.sh
+   ```
+
+5. Running the command starts the installation.
+
+6. When installation finishes, the initial configuration screen appears. In the IDE theme option, Intellij is the light theme and Darcula is the dark one; this book uses Intellij.
+
+7. After the initial configuration, create a new project.
+
+8. Next comes the screen for setting the project path and the interpreter. Create a PycharmProjects folder in your home directory and create the project under it, naming it however you like. We create a project named "rlcode_book" and set its interpreter to Python 3.5, the language version this project uses.
+
+9. Once the rlcode_book project is created, the main window appears.
+
+10. To confirm that PyCharm is installed correctly, create a Python script file. For the simplest check, `"Hello World"`, create a file named hello_world.py.
+
+11. Right-click the new file; among the menu items, "Run 'hello_world'" runs hello_world.py.
+
+12. Put the following code into hello_world.py:
+
+    ```python
+    print("hello world")
+    ```
+
+13. Running hello_world.py prints "hello world" in the run window, which confirms that PyCharm is installed correctly.
+
+### Using VirtualEnv :happy:
+
+That covers the basic PyCharm configuration. When several projects live on one machine, each may need a different development environment, and juggling them by hand is a real inconvenience. Keeping a separate environment per project is therefore a significant advantage, and that is exactly what VirtualEnv provides: it lets you create a virtual development environment dedicated to this book's project.
+
+PyCharm supports VirtualEnv, so we explain how to use VirtualEnv through PyCharm. There are several ways to install and use VirtualEnv, but PyCharm provides a GUI for it and can also manage the external Python libraries installed inside the virtual environment.
+
+**To use VirtualEnv from PyCharm:**
+
+1. Click "Settings" in the "File" menu.
+
+2. In the list on the left of Settings, open "Project: <project name>" and click Project Interpreter. Then, to the right of the Project Interpreter tab, click "Create VirtualEnv".
+
+3. Enter a name for the virtual environment; a /home/brian/rlcode_book directory is created to hold it.
+
+4. If the terminal prompt shows (rlcode_book) as below, the virtual environment named rlcode_book is active. We will use it as the book's environment from here on.
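If you prefer the terminal to PyCharm's GUI, a comparable environment can be created with the `virtualenv` package directly. This sketch is an addition of ours, and the path name is illustrative:

```shell
$ pip3 install virtualenv
$ virtualenv -p python3 ~/rlcode_book_env
$ source ~/rlcode_book_env/bin/activate
(rlcode_book_env) $ python -V   # now points at the Python 3 inside the environment
```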
+### 2.1.3 Installing and testing OpenAI Gym
+
+OpenAI is a company founded in late 2015 with the goal of opening AI technology to the world, building safer artificial intelligence, and bringing AI to more fields. OpenAI Gym is an environment built by OpenAI in which you can test all kinds of AI algorithms.
+
+All of OpenAI Gym's code is available on OpenAI's GitHub[[2\]](#_ftn2).
+
+Installing OpenAI Gym is described on its official page. Before installing it you need Git, a version control tool used whenever a development process needs version management. OpenAI Gym is open source and published on GitHub, a platform that serves as a remote store for version-controlled source code.
+
+Install Git with the following command:
+
+```shell
+$ sudo apt-get install git
+```
+
+After installing Git, install OpenAI Gym. In a terminal, move to the directory where you want it installed and run:
+
+```shell
+$ git clone https://github.com/openai/gym
+$ cd gym
+$ pip3 install -e .
+```
+
+OpenAI Gym can be installed with several different options; `pip3 install -e .` installs only the essentials. Later, if you want every Gym environment, including the Atari games, run the following instead of `pip3 install -e .`:
+
+```shell
+$ pip3 install -e .[all]
+```
+
+To verify that OpenAI Gym installed correctly, run a simple example. The simplest is CartPole: a cart with a pole attached, where the goal is to move the cart so that its momentum swings the pole upright. For this test we give the cart no meaningful input at all and only confirm that OpenAI Gym runs.
+
+Create a file `CartPole.py` and enter the code in Code 2.1:
+
+```python
+import gym
+env = gym.make('CartPole-v0')
+env.reset()
+for _ in range(1000):
+    env.render()
+    env.step(env.action_space.sample())  # take a random action
+```
+
+Code 2.1 Running the CartPole example
+
+Running this code brings up a CartPole that takes no purposeful actions. OpenAI Gym provides many such problems; you can apply your own learning algorithms to them, and share your algorithms or check results on the OpenAI Gym site.
+
+
+
+## 2.2 Development Environment Setup 2: macOS
+
+macOS ships with Python 2.7 by default, so Python 3.5 must be installed separately.
+
+### 2.2.1 Installing and configuring Python 3.5
+
+Open the Python download page[[3\]](#_ftn3) and the downloads screen appears.
+
+1. Choose the file matching your macOS version and download it. Run the downloaded file and follow the prompts to finish the installation.
+
+2. To confirm that Python installed correctly, open a terminal. If typing 'python3' produces the interpreter prompt shown below, the installation succeeded.
+
+### 2.2.2 Installing and configuring PyCharm Community
+
+Install and configure PyCharm in the following order:
+
+1. Go to the PyCharm homepage and download the Community edition.
+
+2. Run the downloaded file and drag the PyCharm CE icon on the left onto the folder icon on the right to finish the installation.
+
+3. The first time PyCharm runs, a configuration screen lets you change the IDE's colors and style through the IDE theme option. Default is the Intellij theme seen in the Ubuntu setup; this book uses Default.
+
+4. After the initial configuration, click the Create New Project button.
+
+5. Clicking Create New Project brings up the project screen. Location sets the path and folder name of the project; choose them however you like. Interpreter selects which Python interpreter the project uses. As on Ubuntu, we create a virtual environment with VirtualEnv and use it as the interpreter: click the Create VirtualEnv button.
+
+6. The next screen creates the VirtualEnv. Pick any Name and Location, select the freshly installed Python 3.5 as the Base Interpreter, and click OK to create the environment.
+
+7. Back on the New Project screen, select the VirtualEnv you just created as the Interpreter, then click Create to finish creating the project.
+
+8. Once the project is created, the workspace appears. Right-click the top-level folder and choose New -> Python File to create a new Python file.
+
+9. To check that PyCharm works, run the hello world example; the steps are the same as on Ubuntu, so we omit them.
+
+### 2.2.3 Installing and testing OpenAI Gym
+
+Installing OpenAI Gym and running CartPole are the same as on Ubuntu, so this section is omitted.
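As a final sanity check on either OS (an addition of ours, not part of the book's text), you can inspect CartPole's state and action spaces and take one step from Python:

```python
import gym

env = gym.make('CartPole-v0')
print(env.observation_space)  # Box(4,): cart position/velocity, pole angle/velocity
print(env.action_space)       # Discrete(2): push the cart left or right

obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
print(obs.shape, reward, done)  # (4,) 1.0 False, typically
```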
+ + + +------ + +[[1\]](#_ftnref1) https://www.jetbrains.com/pycharm/ + +[[2\]](#_ftnref2) https://github.com/openai/gym + +[[3\]](#_ftnref3) https://www.python.org/downloads/release/python-350/ diff --git a/wiki/rlcode_image/cartpole_exam.png b/wiki/rlcode_image/cartpole_exam.png new file mode 100644 index 00000000..3b8674b6 Binary files /dev/null and b/wiki/rlcode_image/cartpole_exam.png differ diff --git a/wiki/rlcode_image/console_hello_world.png b/wiki/rlcode_image/console_hello_world.png new file mode 100644 index 00000000..e14c6a01 Binary files /dev/null and b/wiki/rlcode_image/console_hello_world.png differ diff --git a/wiki/rlcode_image/default_config.png b/wiki/rlcode_image/default_config.png new file mode 100644 index 00000000..7f9e4794 Binary files /dev/null and b/wiki/rlcode_image/default_config.png differ diff --git a/wiki/rlcode_image/file_setting.png b/wiki/rlcode_image/file_setting.png new file mode 100644 index 00000000..264bb279 Binary files /dev/null and b/wiki/rlcode_image/file_setting.png differ diff --git a/wiki/rlcode_image/hello_world_ubuntu.png b/wiki/rlcode_image/hello_world_ubuntu.png new file mode 100644 index 00000000..ede75b89 Binary files /dev/null and b/wiki/rlcode_image/hello_world_ubuntu.png differ diff --git a/wiki/rlcode_image/openai_github.png b/wiki/rlcode_image/openai_github.png new file mode 100644 index 00000000..e5422484 Binary files /dev/null and b/wiki/rlcode_image/openai_github.png differ diff --git a/wiki/rlcode_image/project_interpreter.png b/wiki/rlcode_image/project_interpreter.png new file mode 100644 index 00000000..b22f24cc Binary files /dev/null and b/wiki/rlcode_image/project_interpreter.png differ diff --git a/wiki/rlcode_image/pycham_new_project.png b/wiki/rlcode_image/pycham_new_project.png new file mode 100644 index 00000000..bfd309eb Binary files /dev/null and b/wiki/rlcode_image/pycham_new_project.png differ diff --git a/wiki/rlcode_image/pycharm_community.png b/wiki/rlcode_image/pycharm_community.png new file mode 100644 index 00000000..3e4f1967 Binary files /dev/null and b/wiki/rlcode_image/pycharm_community.png differ diff --git a/wiki/rlcode_image/pycharm_drag.png b/wiki/rlcode_image/pycharm_drag.png new file mode 100644 index 00000000..3fd2faa2 Binary files /dev/null and b/wiki/rlcode_image/pycharm_drag.png differ diff --git a/wiki/rlcode_image/pycharm_init.png b/wiki/rlcode_image/pycharm_init.png new file mode 100644 index 00000000..b2fa23c7 Binary files /dev/null and b/wiki/rlcode_image/pycharm_init.png differ diff --git a/wiki/rlcode_image/python3_terminal.jpg b/wiki/rlcode_image/python3_terminal.jpg new file mode 100644 index 00000000..38fe67ac Binary files /dev/null and b/wiki/rlcode_image/python3_terminal.jpg differ diff --git a/wiki/rlcode_image/python_download.png b/wiki/rlcode_image/python_download.png new file mode 100644 index 00000000..24922c44 Binary files /dev/null and b/wiki/rlcode_image/python_download.png differ diff --git a/wiki/rlcode_image/python_installed.png b/wiki/rlcode_image/python_installed.png new file mode 100644 index 00000000..a6dae073 Binary files /dev/null and b/wiki/rlcode_image/python_installed.png differ diff --git a/wiki/rlcode_image/python_intalled.png b/wiki/rlcode_image/python_intalled.png new file mode 100644 index 00000000..a6dae073 Binary files /dev/null and b/wiki/rlcode_image/python_intalled.png differ diff --git a/wiki/rlcode_image/rl_book_hello_world.png b/wiki/rlcode_image/rl_book_hello_world.png new file mode 100644 index 00000000..5588e095 Binary files /dev/null and 
b/wiki/rlcode_image/rl_book_hello_world.png differ diff --git a/wiki/rlcode_image/rl_book_project.png b/wiki/rlcode_image/rl_book_project.png new file mode 100644 index 00000000..b1603305 Binary files /dev/null and b/wiki/rlcode_image/rl_book_project.png differ diff --git a/wiki/rlcode_image/rl_book_venv.png b/wiki/rlcode_image/rl_book_venv.png new file mode 100644 index 00000000..c86d7d94 Binary files /dev/null and b/wiki/rlcode_image/rl_book_venv.png differ diff --git a/wiki/rlcode_image/rl_book_virtualenv.png b/wiki/rlcode_image/rl_book_virtualenv.png new file mode 100644 index 00000000..dc783044 Binary files /dev/null and b/wiki/rlcode_image/rl_book_virtualenv.png differ diff --git a/wiki/rlcode_image/rlcode_book_directory.png b/wiki/rlcode_image/rlcode_book_directory.png new file mode 100644 index 00000000..f1c13cd3 Binary files /dev/null and b/wiki/rlcode_image/rlcode_book_directory.png differ diff --git a/wiki/rlcode_image/rlcode_project.png b/wiki/rlcode_image/rlcode_project.png new file mode 100644 index 00000000..d9c89be6 Binary files /dev/null and b/wiki/rlcode_image/rlcode_project.png differ diff --git a/wiki/rlcode_image/run_hello_world.png b/wiki/rlcode_image/run_hello_world.png new file mode 100644 index 00000000..570e979d Binary files /dev/null and b/wiki/rlcode_image/run_hello_world.png differ diff --git a/wiki/rlcode_image/sh_pycharm.sh.png b/wiki/rlcode_image/sh_pycharm.sh.png new file mode 100644 index 00000000..19708444 Binary files /dev/null and b/wiki/rlcode_image/sh_pycharm.sh.png differ diff --git a/wiki/rlcode_image/terminal_rlcode_book.png b/wiki/rlcode_image/terminal_rlcode_book.png new file mode 100644 index 00000000..38279352 Binary files /dev/null and b/wiki/rlcode_image/terminal_rlcode_book.png differ