diff --git a/.gitignore b/.gitignore
index ab5a28a0..695bdda2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,5 @@
*.pydevproject
.idea/
.DS_Store
-__pycache__
\ No newline at end of file
+__pycache__
+./Code 2. Cartpole/6. A3C/Cartpole_A3C.pgy
\ No newline at end of file
diff --git a/1-grid-world/1-policy-iteration/environment.py b/1-grid-world/1-policy-iteration/environment.py
new file mode 100644
index 00000000..910d4ba8
--- /dev/null
+++ b/1-grid-world/1-policy-iteration/environment.py
@@ -0,0 +1,245 @@
+import tkinter as tk
+from tkinter import Button
+import time
+import numpy as np
+from PIL import ImageTk, Image
+
+PhotoImage = ImageTk.PhotoImage
+UNIT = 100 # pixels
+HEIGHT = 5 # grid height
+WIDTH = 5 # grid width
+TRANSITION_PROB = 1
+POSSIBLE_ACTIONS = [0, 1, 2, 3] # up, down, left, right
+ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # actions in coordinates
+REWARDS = []
+
+
+class GraphicDisplay(tk.Tk):
+ def __init__(self, agent):
+ super(GraphicDisplay, self).__init__()
+ self.title('Policy Iteration')
+        self.geometry('{0}x{1}'.format(WIDTH * UNIT, HEIGHT * UNIT + 50))
+ self.texts = []
+ self.arrows = []
+ self.env = Env()
+ self.agent = agent
+ self.evaluation_count = 0
+ self.improvement_count = 0
+ self.is_moving = 0
+ (self.up, self.down, self.left, self.right), self.shapes = self.load_images()
+ self.canvas = self._build_canvas()
+ self.text_reward(2, 2, "R : 1.0")
+ self.text_reward(1, 2, "R : -1.0")
+ self.text_reward(2, 1, "R : -1.0")
+
+ def _build_canvas(self):
+ canvas = tk.Canvas(self, bg='white',
+ height=HEIGHT * UNIT,
+ width=WIDTH * UNIT)
+ # buttons
+ iteration_button = Button(self, text="Evaluate",
+ command=self.evaluate_policy)
+ iteration_button.configure(width=10, activebackground="#33B5E5")
+ canvas.create_window(WIDTH * UNIT * 0.13, HEIGHT * UNIT + 10,
+ window=iteration_button)
+ policy_button = Button(self, text="Improve",
+ command=self.improve_policy)
+ policy_button.configure(width=10, activebackground="#33B5E5")
+ canvas.create_window(WIDTH * UNIT * 0.37, HEIGHT * UNIT + 10,
+ window=policy_button)
+ policy_button = Button(self, text="move", command=self.move_by_policy)
+ policy_button.configure(width=10, activebackground="#33B5E5")
+ canvas.create_window(WIDTH * UNIT * 0.62, HEIGHT * UNIT + 10,
+ window=policy_button)
+ policy_button = Button(self, text="reset", command=self.reset)
+ policy_button.configure(width=10, activebackground="#33B5E5")
+ canvas.create_window(WIDTH * UNIT * 0.87, HEIGHT * UNIT + 10,
+ window=policy_button)
+
+ # create grids
+        for col in range(0, WIDTH * UNIT, UNIT):  # 0~400 by 100
+            x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT
+            canvas.create_line(x0, y0, x1, y1)
+        for row in range(0, HEIGHT * UNIT, UNIT):  # 0~400 by 100
+            x0, y0, x1, y1 = 0, row, WIDTH * UNIT, row
+            canvas.create_line(x0, y0, x1, y1)
+
+ # add img to canvas
+ self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
+ canvas.create_image(250, 150, image=self.shapes[1])
+ canvas.create_image(150, 250, image=self.shapes[1])
+ canvas.create_image(250, 250, image=self.shapes[2])
+
+ # pack all
+ canvas.pack()
+
+ return canvas
+
+ def load_images(self):
+ up = PhotoImage(Image.open("../img/up.png").resize((13, 13)))
+ right = PhotoImage(Image.open("../img/right.png").resize((13, 13)))
+ left = PhotoImage(Image.open("../img/left.png").resize((13, 13)))
+ down = PhotoImage(Image.open("../img/down.png").resize((13, 13)))
+ rectangle = PhotoImage(Image.open("../img/rectangle.png").resize((65, 65)))
+ triangle = PhotoImage(Image.open("../img/triangle.png").resize((65, 65)))
+ circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65)))
+ return (up, down, left, right), (rectangle, triangle, circle)
+
+ def reset(self):
+ if self.is_moving == 0:
+ self.evaluation_count = 0
+ self.improvement_count = 0
+ for i in self.texts:
+ self.canvas.delete(i)
+
+ for i in self.arrows:
+ self.canvas.delete(i)
+ self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)]
+ self.agent.policy_table = ([[[0.25, 0.25, 0.25, 0.25]] * WIDTH
+ for _ in range(HEIGHT)])
+ self.agent.policy_table[2][2] = []
+ x, y = self.canvas.coords(self.rectangle)
+ self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
+
+ def text_value(self, row, col, contents, font='Helvetica', size=10,
+ style='normal', anchor="nw"):
+        origin_x, origin_y = 70, 85
+        x, y = origin_x + (UNIT * col), origin_y + (UNIT * row)
+ font = (font, str(size), style)
+ text = self.canvas.create_text(x, y, fill="black", text=contents,
+ font=font, anchor=anchor)
+        self.texts.append(text)
+
+ def text_reward(self, row, col, contents, font='Helvetica', size=10,
+ style='normal', anchor="nw"):
+ origin_x, origin_y = 5, 5
+        x, y = origin_x + (UNIT * col), origin_y + (UNIT * row)
+ font = (font, str(size), style)
+ text = self.canvas.create_text(x, y, fill="black", text=contents,
+ font=font, anchor=anchor)
+        self.texts.append(text)
+
+ def rectangle_move(self, action):
+ base_action = np.array([0, 0])
+ location = self.find_rectangle()
+ self.render()
+ if action == 0 and location[0] > 0: # up
+ base_action[1] -= UNIT
+ elif action == 1 and location[0] < HEIGHT - 1: # down
+ base_action[1] += UNIT
+ elif action == 2 and location[1] > 0: # left
+ base_action[0] -= UNIT
+ elif action == 3 and location[1] < WIDTH - 1: # right
+ base_action[0] += UNIT
+ # move agent
+ self.canvas.move(self.rectangle, base_action[0], base_action[1])
+
+ def find_rectangle(self):
+ temp = self.canvas.coords(self.rectangle)
+        x = (temp[0] / UNIT) - 0.5
+        y = (temp[1] / UNIT) - 0.5
+ return int(y), int(x)
+
+ def move_by_policy(self):
+ if self.improvement_count != 0 and self.is_moving != 1:
+ self.is_moving = 1
+
+ x, y = self.canvas.coords(self.rectangle)
+ self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
+
+ x, y = self.find_rectangle()
+ while len(self.agent.policy_table[x][y]) != 0:
+ self.after(100,
+ self.rectangle_move(self.agent.get_action([x, y])))
+ x, y = self.find_rectangle()
+ self.is_moving = 0
+
+ def draw_one_arrow(self, col, row, policy):
+ if col == 2 and row == 2:
+ return
+
+ if policy[0] > 0: # up
+ origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col)
+ self.arrows.append(self.canvas.create_image(origin_x, origin_y,
+ image=self.up))
+ if policy[1] > 0: # down
+ origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col)
+ self.arrows.append(self.canvas.create_image(origin_x, origin_y,
+ image=self.down))
+ if policy[2] > 0: # left
+ origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col)
+ self.arrows.append(self.canvas.create_image(origin_x, origin_y,
+ image=self.left))
+ if policy[3] > 0: # right
+ origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col)
+ self.arrows.append(self.canvas.create_image(origin_x, origin_y,
+ image=self.right))
+
+ def draw_from_policy(self, policy_table):
+ for i in range(HEIGHT):
+ for j in range(WIDTH):
+ self.draw_one_arrow(i, j, policy_table[i][j])
+
+ def print_value_table(self, value_table):
+ for i in range(WIDTH):
+ for j in range(HEIGHT):
+ self.text_value(i, j, value_table[i][j])
+
+ def render(self):
+ time.sleep(0.1)
+ self.canvas.tag_raise(self.rectangle)
+ self.update()
+
+ def evaluate_policy(self):
+ self.evaluation_count += 1
+ for i in self.texts:
+ self.canvas.delete(i)
+ self.agent.policy_evaluation()
+ self.print_value_table(self.agent.value_table)
+
+ def improve_policy(self):
+ self.improvement_count += 1
+ for i in self.arrows:
+ self.canvas.delete(i)
+ self.agent.policy_improvement()
+ self.draw_from_policy(self.agent.policy_table)
+
+
+class Env:
+ def __init__(self):
+ self.transition_probability = TRANSITION_PROB
+ self.width = WIDTH
+ self.height = HEIGHT
+ self.reward = [[0] * WIDTH for _ in range(HEIGHT)]
+ self.possible_actions = POSSIBLE_ACTIONS
+ self.reward[2][2] = 1 # reward 1 for circle
+ self.reward[1][2] = -1 # reward -1 for triangle
+ self.reward[2][1] = -1 # reward -1 for triangle
+ self.all_state = []
+
+ for x in range(WIDTH):
+ for y in range(HEIGHT):
+ state = [x, y]
+ self.all_state.append(state)
+
+ def get_reward(self, state, action):
+ next_state = self.state_after_action(state, action)
+ return self.reward[next_state[0]][next_state[1]]
+
+ def state_after_action(self, state, action_index):
+ action = ACTIONS[action_index]
+ return self.check_boundary([state[0] + action[0], state[1] + action[1]])
+
+ @staticmethod
+ def check_boundary(state):
+ state[0] = (0 if state[0] < 0 else WIDTH - 1
+ if state[0] > WIDTH - 1 else state[0])
+ state[1] = (0 if state[1] < 0 else HEIGHT - 1
+ if state[1] > HEIGHT - 1 else state[1])
+ return state
+
+ def get_transition_prob(self, state, action):
+ return self.transition_probability
+
+ def get_all_states(self):
+ return self.all_state
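
A quick way to sanity-check the `Env` dynamics above (a minimal sketch, assuming this directory's `environment.py` is on the import path and PIL is installed; `Env` here is a plain class, so no window opens). Note also that `move_by_policy` calls `self.after(100, self.rectangle_move(...))`: `rectangle_move` executes immediately, and `after` then merely pauses Tk for 100 ms between moves.

```python
# illustrative checks, not part of the commit
from environment import Env

env = Env()
# actions 0-3 are up, down, left, right; check_boundary clamps to the grid
assert env.state_after_action([0, 0], 0) == [0, 0]  # up at the edge: clamped
assert env.state_after_action([0, 0], 3) == [0, 1]  # right: one cell over
assert env.get_reward([2, 0], 3) == -1              # stepping onto a triangle
assert env.get_reward([2, 1], 3) == 1               # stepping onto the circle
```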
diff --git a/1-grid-world/1-policy-iteration/policy_iteration.py b/1-grid-world/1-policy-iteration/policy_iteration.py
new file mode 100644
index 00000000..d6dc414e
--- /dev/null
+++ b/1-grid-world/1-policy-iteration/policy_iteration.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+import random
+from environment import GraphicDisplay, Env
+
+
+class PolicyIteration:
+ def __init__(self, env):
+ self.env = env
+ # 2-d list for the value function
+ self.value_table = [[0.0] * env.width for _ in range(env.height)]
+ # list of random policy (same probability of up, down, left, right)
+ self.policy_table = [[[0.25, 0.25, 0.25, 0.25]] * env.width
+ for _ in range(env.height)]
+ # setting terminal state
+ self.policy_table[2][2] = []
+ self.discount_factor = 0.9
+
+ def policy_evaluation(self):
+ next_value_table = [[0.00] * self.env.width
+ for _ in range(self.env.height)]
+
+        # Bellman expectation equation for every state
+ for state in self.env.get_all_states():
+ value = 0.0
+ # keep the value function of terminal states as 0
+ if state == [2, 2]:
+ next_value_table[state[0]][state[1]] = value
+ continue
+
+ for action in self.env.possible_actions:
+ next_state = self.env.state_after_action(state, action)
+ reward = self.env.get_reward(state, action)
+ next_value = self.get_value(next_state)
+ value += (self.get_policy(state)[action] *
+ (reward + self.discount_factor * next_value))
+
+ next_value_table[state[0]][state[1]] = round(value, 2)
+
+ self.value_table = next_value_table
+
+ def policy_improvement(self):
+ next_policy = self.policy_table
+ for state in self.env.get_all_states():
+ if state == [2, 2]:
+ continue
+ value = -99999
+ max_index = []
+ result = [0.0, 0.0, 0.0, 0.0] # initialize the policy
+
+            # for every action, calculate
+            # [reward + (discount factor) * (next state value function)]
+ for index, action in enumerate(self.env.possible_actions):
+ next_state = self.env.state_after_action(state, action)
+ reward = self.env.get_reward(state, action)
+ next_value = self.get_value(next_state)
+ temp = reward + self.discount_factor * next_value
+
+                # a greedy policy normally picks a single action, but here
+                # we keep every action that ties for the maximum value
+ if temp == value:
+ max_index.append(index)
+ elif temp > value:
+ value = temp
+ max_index.clear()
+ max_index.append(index)
+
+            # split the probability evenly among the tied actions
+ prob = 1 / len(max_index)
+
+ for index in max_index:
+ result[index] = prob
+
+ next_policy[state[0]][state[1]] = result
+
+ self.policy_table = next_policy
+
+ # get action according to the current policy
+ def get_action(self, state):
+ random_pick = random.randrange(100) / 100
+
+ policy = self.get_policy(state)
+ policy_sum = 0.0
+        # return the first action whose cumulative probability exceeds the pick
+ for index, value in enumerate(policy):
+ policy_sum += value
+ if random_pick < policy_sum:
+ return index
+
+ # get policy of specific state
+ def get_policy(self, state):
+ if state == [2, 2]:
+ return 0.0
+ return self.policy_table[state[0]][state[1]]
+
+ def get_value(self, state):
+ return round(self.value_table[state[0]][state[1]], 2)
+
+if __name__ == "__main__":
+ env = Env()
+ policy_iteration = PolicyIteration(env)
+ grid_world = GraphicDisplay(policy_iteration)
+ grid_world.mainloop()
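
The inner loop of `policy_evaluation` is one Bellman expectation backup per state, v(s) = sum_a pi(a|s) * (r + gamma * v(s')). A standalone sketch with made-up numbers for a state that can fall into a -1.0 cell:

```python
# one Bellman expectation backup (illustrative numbers, not commit code)
gamma = 0.9
pi = [0.25, 0.25, 0.25, 0.25]        # the initial uniform random policy
rewards = [0.0, -1.0, 0.0, 0.0]      # r(s, a) for up, down, left, right
next_values = [0.0, 0.0, 0.0, 0.0]   # v_k(s') from the previous sweep

value = sum(p * (r + gamma * v)
            for p, r, v in zip(pi, rewards, next_values))
print(round(value, 2))  # -0.25: a quarter of the -1 reward flows back
```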
diff --git a/1-grid-world/2-value-iteration/environment.py b/1-grid-world/2-value-iteration/environment.py
new file mode 100644
index 00000000..81af3dc5
--- /dev/null
+++ b/1-grid-world/2-value-iteration/environment.py
@@ -0,0 +1,261 @@
+import tkinter as tk
+import time
+import numpy as np
+import random
+from PIL import ImageTk, Image
+
+PhotoImage = ImageTk.PhotoImage
+UNIT = 100 # pixels
+HEIGHT = 5 # grid height
+WIDTH = 5 # grid width
+TRANSITION_PROB = 1
+POSSIBLE_ACTIONS = [0, 1, 2, 3] # up, down, left, right
+ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # actions in coordinates
+REWARDS = []
+
+
+class GraphicDisplay(tk.Tk):
+ def __init__(self, value_iteration):
+ super(GraphicDisplay, self).__init__()
+ self.title('Value Iteration')
+        self.geometry('{0}x{1}'.format(WIDTH * UNIT, HEIGHT * UNIT + 50))
+ self.texts = []
+ self.arrows = []
+ self.env = Env()
+ self.agent = value_iteration
+ self.iteration_count = 0
+ self.improvement_count = 0
+ self.is_moving = 0
+ (self.up, self.down, self.left,
+ self.right), self.shapes = self.load_images()
+ self.canvas = self._build_canvas()
+ self.text_reward(2, 2, "R : 1.0")
+ self.text_reward(1, 2, "R : -1.0")
+ self.text_reward(2, 1, "R : -1.0")
+
+ def _build_canvas(self):
+ canvas = tk.Canvas(self, bg='white',
+ height=HEIGHT * UNIT,
+ width=WIDTH * UNIT)
+ # buttons
+ iteration_button = tk.Button(self, text="Calculate",
+ command=self.calculate_value)
+ iteration_button.configure(width=10, activebackground="#33B5E5")
+ canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10,
+ window=iteration_button)
+
+ policy_button = tk.Button(self, text="Print Policy",
+ command=self.print_optimal_policy)
+ policy_button.configure(width=10, activebackground="#33B5E5")
+ canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10,
+ window=policy_button)
+
+ policy_button = tk.Button(self, text="Move",
+ command=self.move_by_policy)
+ policy_button.configure(width=10, activebackground="#33B5E5")
+ canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10,
+ window=policy_button)
+
+ policy_button = tk.Button(self, text="Clear", command=self.clear)
+ policy_button.configure(width=10, activebackground="#33B5E5")
+ canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10,
+ window=policy_button)
+
+ # create grids
+        for col in range(0, WIDTH * UNIT, UNIT):  # 0~400 by 100
+            x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT
+            canvas.create_line(x0, y0, x1, y1)
+        for row in range(0, HEIGHT * UNIT, UNIT):  # 0~400 by 100
+            x0, y0, x1, y1 = 0, row, WIDTH * UNIT, row
+            canvas.create_line(x0, y0, x1, y1)
+
+ # add img to canvas
+ self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
+ canvas.create_image(250, 150, image=self.shapes[1])
+ canvas.create_image(150, 250, image=self.shapes[1])
+ canvas.create_image(250, 250, image=self.shapes[2])
+
+ # pack all
+ canvas.pack()
+
+ return canvas
+
+ def load_images(self):
+ PhotoImage = ImageTk.PhotoImage
+ up = PhotoImage(Image.open("../img/up.png").resize((13, 13)))
+ right = PhotoImage(Image.open("../img/right.png").resize((13, 13)))
+ left = PhotoImage(Image.open("../img/left.png").resize((13, 13)))
+ down = PhotoImage(Image.open("../img/down.png").resize((13, 13)))
+ rectangle = PhotoImage(
+ Image.open("../img/rectangle.png").resize((65, 65)))
+ triangle = PhotoImage(
+ Image.open("../img/triangle.png").resize((65, 65)))
+ circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65)))
+ return (up, down, left, right), (rectangle, triangle, circle)
+
+ def clear(self):
+
+ if self.is_moving == 0:
+ self.iteration_count = 0
+ self.improvement_count = 0
+ for i in self.texts:
+ self.canvas.delete(i)
+
+ for i in self.arrows:
+ self.canvas.delete(i)
+
+ self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)]
+
+ x, y = self.canvas.coords(self.rectangle)
+ self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
+
+    def reset(self):
+        self.update()
+        time.sleep(0.5)
+        x, y = self.canvas.coords(self.rectangle)
+        self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
+
+ def text_value(self, row, col, contents, font='Helvetica', size=12,
+ style='normal', anchor="nw"):
+        origin_x, origin_y = 70, 85
+        x, y = origin_x + (UNIT * col), origin_y + (UNIT * row)
+ font = (font, str(size), style)
+ text = self.canvas.create_text(x, y, fill="black", text=contents,
+ font=font, anchor=anchor)
+        self.texts.append(text)
+
+ def text_reward(self, row, col, contents, font='Helvetica', size=12,
+ style='normal', anchor="nw"):
+ origin_x, origin_y = 5, 5
+        x, y = origin_x + (UNIT * col), origin_y + (UNIT * row)
+ font = (font, str(size), style)
+ text = self.canvas.create_text(x, y, fill="black", text=contents,
+ font=font, anchor=anchor)
+        self.texts.append(text)
+
+ def rectangle_move(self, action):
+ base_action = np.array([0, 0])
+ location = self.find_rectangle()
+ self.render()
+ if action == 0 and location[0] > 0: # up
+ base_action[1] -= UNIT
+ elif action == 1 and location[0] < HEIGHT - 1: # down
+ base_action[1] += UNIT
+ elif action == 2 and location[1] > 0: # left
+ base_action[0] -= UNIT
+ elif action == 3 and location[1] < WIDTH - 1: # right
+ base_action[0] += UNIT
+
+ self.canvas.move(self.rectangle, base_action[0],
+ base_action[1]) # move agent
+
+ def find_rectangle(self):
+ temp = self.canvas.coords(self.rectangle)
+        x = (temp[0] / UNIT) - 0.5
+        y = (temp[1] / UNIT) - 0.5
+ return int(y), int(x)
+
+ def move_by_policy(self):
+
+ if self.improvement_count != 0 and self.is_moving != 1:
+ self.is_moving = 1
+ x, y = self.canvas.coords(self.rectangle)
+ self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
+
+ x, y = self.find_rectangle()
+ while len(self.agent.get_action([x, y])) != 0:
+ action = random.sample(self.agent.get_action([x, y]), 1)[0]
+ self.after(100, self.rectangle_move(action))
+ x, y = self.find_rectangle()
+ self.is_moving = 0
+
+ def draw_one_arrow(self, col, row, action):
+ if col == 2 and row == 2:
+ return
+ if action == 0: # up
+ origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col)
+ self.arrows.append(self.canvas.create_image(origin_x, origin_y,
+ image=self.up))
+ elif action == 1: # down
+ origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col)
+ self.arrows.append(self.canvas.create_image(origin_x, origin_y,
+ image=self.down))
+ elif action == 3: # right
+ origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col)
+ self.arrows.append(self.canvas.create_image(origin_x, origin_y,
+ image=self.right))
+ elif action == 2: # left
+ origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col)
+ self.arrows.append(self.canvas.create_image(origin_x, origin_y,
+ image=self.left))
+
+ def draw_from_values(self, state, action_list):
+ i = state[0]
+ j = state[1]
+ for action in action_list:
+ self.draw_one_arrow(i, j, action)
+
+ def print_values(self, values):
+ for i in range(WIDTH):
+ for j in range(HEIGHT):
+ self.text_value(i, j, values[i][j])
+
+ def render(self):
+ time.sleep(0.1)
+ self.canvas.tag_raise(self.rectangle)
+ self.update()
+
+ def calculate_value(self):
+ self.iteration_count += 1
+ for i in self.texts:
+ self.canvas.delete(i)
+ self.agent.value_iteration()
+ self.print_values(self.agent.value_table)
+
+ def print_optimal_policy(self):
+ self.improvement_count += 1
+ for i in self.arrows:
+ self.canvas.delete(i)
+ for state in self.env.get_all_states():
+ action = self.agent.get_action(state)
+ self.draw_from_values(state, action)
+
+
+class Env:
+ def __init__(self):
+ self.transition_probability = TRANSITION_PROB
+ self.width = WIDTH # Width of Grid World
+ self.height = HEIGHT # Height of GridWorld
+ self.reward = [[0] * WIDTH for _ in range(HEIGHT)]
+ self.possible_actions = POSSIBLE_ACTIONS
+ self.reward[2][2] = 1 # reward 1 for circle
+ self.reward[1][2] = -1 # reward -1 for triangle
+ self.reward[2][1] = -1 # reward -1 for triangle
+ self.all_state = []
+
+ for x in range(WIDTH):
+ for y in range(HEIGHT):
+ state = [x, y]
+ self.all_state.append(state)
+
+ def get_reward(self, state, action):
+ next_state = self.state_after_action(state, action)
+ return self.reward[next_state[0]][next_state[1]]
+
+ def state_after_action(self, state, action_index):
+ action = ACTIONS[action_index]
+ return self.check_boundary([state[0] + action[0], state[1] + action[1]])
+
+ @staticmethod
+ def check_boundary(state):
+ state[0] = (0 if state[0] < 0 else WIDTH - 1
+ if state[0] > WIDTH - 1 else state[0])
+ state[1] = (0 if state[1] < 0 else HEIGHT - 1
+ if state[1] > HEIGHT - 1 else state[1])
+ return state
+
+ def get_transition_prob(self, state, action):
+ return self.transition_probability
+
+ def get_all_states(self):
+ return self.all_state
diff --git a/1-grid-world/2-value-iteration/value_iteration.py b/1-grid-world/2-value-iteration/value_iteration.py
new file mode 100644
index 00000000..8dff7281
--- /dev/null
+++ b/1-grid-world/2-value-iteration/value_iteration.py
@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+from environment import GraphicDisplay, Env
+
+class ValueIteration:
+ def __init__(self, env):
+ self.env = env
+ # 2-d list for the value function
+ self.value_table = [[0.0] * env.width for _ in range(env.height)]
+ self.discount_factor = 0.9
+
+ # get next value function table from the current value function table
+ def value_iteration(self):
+ next_value_table = [[0.0] * self.env.width
+ for _ in range(self.env.height)]
+ for state in self.env.get_all_states():
+ if state == [2, 2]:
+ next_value_table[state[0]][state[1]] = 0.0
+ continue
+ value_list = []
+
+ for action in self.env.possible_actions:
+ next_state = self.env.state_after_action(state, action)
+ reward = self.env.get_reward(state, action)
+ next_value = self.get_value(next_state)
+ value_list.append((reward + self.discount_factor * next_value))
+            # store the maximum value (the Bellman optimality equation)
+ next_value_table[state[0]][state[1]] = round(max(value_list), 2)
+ self.value_table = next_value_table
+
+ # get action according to the current value function table
+ def get_action(self, state):
+ action_list = []
+ max_value = -99999
+
+ if state == [2, 2]:
+ return []
+
+        # calculate the q value for every action and collect the
+        # actions that share the maximum q value
+ for action in self.env.possible_actions:
+
+ next_state = self.env.state_after_action(state, action)
+ reward = self.env.get_reward(state, action)
+ next_value = self.get_value(next_state)
+ value = (reward + self.discount_factor * next_value)
+
+ if value > max_value:
+ action_list.clear()
+ action_list.append(action)
+ max_value = value
+ elif value == max_value:
+ action_list.append(action)
+
+ return action_list
+
+ def get_value(self, state):
+ return round(self.value_table[state[0]][state[1]], 2)
+
+if __name__ == "__main__":
+ env = Env()
+ value_iteration = ValueIteration(env)
+ grid_world = GraphicDisplay(value_iteration)
+ grid_world.mainloop()
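
`value_iteration` swaps the expectation for a max, the Bellman optimality backup v(s) = max_a (r + gamma * v(s')). The same kind of sketch with assumed numbers:

```python
# one Bellman optimality backup (illustrative numbers)
gamma = 0.9
rewards = [0.0, 1.0, 0.0, -1.0]      # r(s, a) for up, down, left, right
next_values = [0.0, 0.0, 0.5, 0.0]   # v_k(s') from the previous sweep

q_values = [r + gamma * v for r, v in zip(rewards, next_values)]
print(round(max(q_values), 2))  # 1.0: only the best action's value survives
```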
diff --git a/1-grid-world/3-monte-carlo/environment.py b/1-grid-world/3-monte-carlo/environment.py
new file mode 100644
index 00000000..d885107d
--- /dev/null
+++ b/1-grid-world/3-monte-carlo/environment.py
@@ -0,0 +1,113 @@
+import time
+import numpy as np
+import tkinter as tk
+from PIL import ImageTk, Image
+
+np.random.seed(1)
+PhotoImage = ImageTk.PhotoImage
+UNIT = 100 # pixels
+HEIGHT = 5 # grid height
+WIDTH = 5 # grid width
+
+
+class Env(tk.Tk):
+ def __init__(self):
+ super(Env, self).__init__()
+ self.action_space = ['u', 'd', 'l', 'r']
+ self.n_actions = len(self.action_space)
+ self.title('monte carlo')
+        self.geometry('{0}x{1}'.format(WIDTH * UNIT, HEIGHT * UNIT))
+ self.shapes = self.load_images()
+ self.canvas = self._build_canvas()
+ self.texts = []
+
+ def _build_canvas(self):
+ canvas = tk.Canvas(self, bg='white',
+ height=HEIGHT * UNIT,
+ width=WIDTH * UNIT)
+ # create grids
+        for c in range(0, WIDTH * UNIT, UNIT):  # 0~400 by 100
+            x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
+            canvas.create_line(x0, y0, x1, y1)
+        for r in range(0, HEIGHT * UNIT, UNIT):  # 0~400 by 100
+            x0, y0, x1, y1 = 0, r, WIDTH * UNIT, r
+            canvas.create_line(x0, y0, x1, y1)
+
+ # add img to canvas
+ self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
+ self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1])
+ self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1])
+ self.circle = canvas.create_image(250, 250, image=self.shapes[2])
+
+ # pack all
+ canvas.pack()
+
+ return canvas
+
+ def load_images(self):
+ rectangle = PhotoImage(
+ Image.open("../img/rectangle.png").resize((65, 65)))
+ triangle = PhotoImage(
+ Image.open("../img/triangle.png").resize((65, 65)))
+ circle = PhotoImage(
+ Image.open("../img/circle.png").resize((65, 65)))
+
+ return rectangle, triangle, circle
+
+ @staticmethod
+ def coords_to_state(coords):
+ x = int((coords[0] - 50) / 100)
+ y = int((coords[1] - 50) / 100)
+ return [x, y]
+
+ def reset(self):
+ self.update()
+ time.sleep(0.5)
+ x, y = self.canvas.coords(self.rectangle)
+ self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
+ # return observation
+ return self.coords_to_state(self.canvas.coords(self.rectangle))
+
+ def step(self, action):
+ state = self.canvas.coords(self.rectangle)
+ base_action = np.array([0, 0])
+ self.render()
+
+ if action == 0: # up
+ if state[1] > UNIT:
+ base_action[1] -= UNIT
+ elif action == 1: # down
+ if state[1] < (HEIGHT - 1) * UNIT:
+ base_action[1] += UNIT
+ elif action == 2: # left
+ if state[0] > UNIT:
+ base_action[0] -= UNIT
+ elif action == 3: # right
+ if state[0] < (WIDTH - 1) * UNIT:
+ base_action[0] += UNIT
+ # move agent
+ self.canvas.move(self.rectangle, base_action[0], base_action[1])
+ # move rectangle to top level of canvas
+ self.canvas.tag_raise(self.rectangle)
+
+ next_state = self.canvas.coords(self.rectangle)
+
+ # reward function
+ if next_state == self.canvas.coords(self.circle):
+ reward = 100
+ done = True
+ elif next_state in [self.canvas.coords(self.triangle1),
+ self.canvas.coords(self.triangle2)]:
+ reward = -100
+ done = True
+ else:
+ reward = 0
+ done = False
+
+ next_state = self.coords_to_state(next_state)
+
+ return next_state, reward, done
+
+ def render(self):
+ time.sleep(0.03)
+ self.update()
diff --git a/1-grid-world/3-monte-carlo/mc_agent.py b/1-grid-world/3-monte-carlo/mc_agent.py
new file mode 100644
index 00000000..682b59b9
--- /dev/null
+++ b/1-grid-world/3-monte-carlo/mc_agent.py
@@ -0,0 +1,111 @@
+import numpy as np
+import random
+from collections import defaultdict
+from environment import Env
+
+
+# Monte Carlo agent that learns from complete sample episodes
+class MCAgent:
+ def __init__(self, actions):
+ self.width = 5
+ self.height = 5
+ self.actions = actions
+ self.learning_rate = 0.01
+ self.discount_factor = 0.9
+ self.epsilon = 0.1
+ self.samples = []
+ self.value_table = defaultdict(float)
+
+    # append a (state, reward, done) sample to memory
+ def save_sample(self, state, reward, done):
+ self.samples.append([state, reward, done])
+
+    # at the end of every episode, update the values of visited states
+ def update(self):
+ G_t = 0
+ visit_state = []
+        for sample in reversed(self.samples):
+            state = str(sample[0])
+            if state not in visit_state:
+                visit_state.append(state)
+                G_t = self.discount_factor * (sample[1] + G_t)
+ value = self.value_table[state]
+ self.value_table[state] = (value +
+ self.learning_rate * (G_t - value))
+
+    # choose an action for the state with an epsilon-greedy policy
+    # over the values of the possible next states
+ def get_action(self, state):
+ if np.random.rand() < self.epsilon:
+ # take random action
+ action = np.random.choice(self.actions)
+ else:
+            # take the greedy action over the possible next states
+ next_state = self.possible_next_state(state)
+ action = self.arg_max(next_state)
+ return int(action)
+
+    # compute argmax; if multiple candidates exist, pick one randomly
+ @staticmethod
+ def arg_max(next_state):
+ max_index_list = []
+ max_value = next_state[0]
+ for index, value in enumerate(next_state):
+ if value > max_value:
+ max_index_list.clear()
+ max_value = value
+ max_index_list.append(index)
+ elif value == max_value:
+ max_index_list.append(index)
+ return random.choice(max_index_list)
+
+ # get the possible next states
+ def possible_next_state(self, state):
+ col, row = state
+ next_state = [0.0] * 4
+
+ if row != 0:
+ next_state[0] = self.value_table[str([col, row - 1])]
+ else:
+ next_state[0] = self.value_table[str(state)]
+ if row != self.height - 1:
+ next_state[1] = self.value_table[str([col, row + 1])]
+ else:
+ next_state[1] = self.value_table[str(state)]
+ if col != 0:
+ next_state[2] = self.value_table[str([col - 1, row])]
+ else:
+ next_state[2] = self.value_table[str(state)]
+ if col != self.width - 1:
+ next_state[3] = self.value_table[str([col + 1, row])]
+ else:
+ next_state[3] = self.value_table[str(state)]
+
+ return next_state
+
+
+# main loop
+if __name__ == "__main__":
+ env = Env()
+ agent = MCAgent(actions=list(range(env.n_actions)))
+
+ for episode in range(1000):
+ state = env.reset()
+ action = agent.get_action(state)
+
+ while True:
+ env.render()
+
+            # step to the next state; reward is a number, done is a boolean
+ next_state, reward, done = env.step(action)
+ agent.save_sample(next_state, reward, done)
+
+ # get next action
+ action = agent.get_action(next_state)
+
+            # at the end of each episode, update the value table
+ if done:
+ print("episode : ", episode)
+ agent.update()
+ agent.samples.clear()
+ break
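
`update` walks the episode backwards, discounting as it goes; note the recurrence is G = gamma * (r + G), so the immediate reward is discounted once as well. A standalone trace with made-up rewards:

```python
# backward return computation as in MCAgent.update() (illustrative episode)
gamma = 0.9
samples = [([0, 0], 0, False), ([1, 0], 0, False), ([2, 0], 100, True)]

G = 0
for state, reward, done in reversed(samples):
    G = gamma * (reward + G)
    print(state, round(G, 1))
# [2, 0] 90.0
# [1, 0] 81.0
# [0, 0] 72.9
```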
diff --git a/Code 1. Grid World/4. SARSA/.python-version b/1-grid-world/4-sarsa/.python-version
similarity index 100%
rename from Code 1. Grid World/4. SARSA/.python-version
rename to 1-grid-world/4-sarsa/.python-version
diff --git a/Code 1. Grid World/3. Monte-Carlo/environment.py b/1-grid-world/4-sarsa/environment.py
similarity index 55%
rename from Code 1. Grid World/3. Monte-Carlo/environment.py
rename to 1-grid-world/4-sarsa/environment.py
index 30074db3..acf6d819 100644
--- a/Code 1. Grid World/3. Monte-Carlo/environment.py
+++ b/1-grid-world/4-sarsa/environment.py
@@ -4,7 +4,7 @@
from PIL import ImageTk, Image
np.random.seed(1)
-
+PhotoImage = ImageTk.PhotoImage
UNIT = 100 # pixels
HEIGHT = 5 # grid height
WIDTH = 5 # grid width
@@ -15,41 +15,47 @@ def __init__(self):
super(Env, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.n_actions = len(self.action_space)
- self.title('monte carlo')
+ self.title('SARSA')
self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
- self.buildGraphic()
+ self.shapes = self.load_images()
+ self.canvas = self._build_canvas()
self.texts = []
- def buildGraphic(self):
- self.canvas = tk.Canvas(self, bg='white',
- height=HEIGHT * UNIT,
- width=WIDTH * UNIT)
-
+ def _build_canvas(self):
+ canvas = tk.Canvas(self, bg='white',
+ height=HEIGHT * UNIT,
+ width=WIDTH * UNIT)
# create grids
for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80
x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
- self.canvas.create_line(x0, y0, x1, y1)
+ canvas.create_line(x0, y0, x1, y1)
for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80
x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
- self.canvas.create_line(x0, y0, x1, y1)
-
- # image_load
- self.rectangle_image = ImageTk.PhotoImage(
- Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS))
- self.triange_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((65, 65)))
- self.circle_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((65, 65)))
+ canvas.create_line(x0, y0, x1, y1)
- # add image to canvas
- self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
- self.triangle1 = self.canvas.create_image(250, 150, image=self.triange_image)
- self.triangle2 = self.canvas.create_image(150, 250, image=self.triange_image)
- self.circle = self.canvas.create_image(250, 250, image=self.circle_image)
+ # add img to canvas
+ self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
+ self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1])
+ self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1])
+ self.circle = canvas.create_image(250, 250, image=self.shapes[2])
# pack all
- self.canvas.pack()
+ canvas.pack()
+
+ return canvas
+
+ def load_images(self):
+ rectangle = PhotoImage(
+ Image.open("../img/rectangle.png").resize((65, 65)))
+ triangle = PhotoImage(
+ Image.open("../img/triangle.png").resize((65, 65)))
+ circle = PhotoImage(
+ Image.open("../img/circle.png").resize((65, 65)))
- def text_value(self, row, col, contents, action, font='Helvetica', size=10, style='normal', anchor="nw"):
+ return rectangle, triangle, circle
+ def text_value(self, row, col, contents, action, font='Helvetica', size=10,
+ style='normal', anchor="nw"):
if action == 0:
origin_x, origin_y = 7, 42
elif action == 1:
@@ -61,36 +67,33 @@ def text_value(self, row, col, contents, action, font='Helvetica', size=10, styl
x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
font = (font, str(size), style)
- return self.texts.append(self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor))
+ text = self.canvas.create_text(x, y, fill="black", text=contents,
+ font=font, anchor=anchor)
+        self.texts.append(text)
def print_value_all(self, q_table):
for i in self.texts:
self.canvas.delete(i)
self.texts.clear()
- for i in range(HEIGHT):
- for j in range(WIDTH):
+ for x in range(HEIGHT):
+ for y in range(WIDTH):
for action in range(0, 4):
- state = [i, j]
- if str(state) in q_table.index:
- temp = q_table.ix[str(state), action]
- self.text_value(j, i, round(temp, 2), action)
+ state = [x, y]
+ if str(state) in q_table.keys():
+ temp = q_table[str(state)][action]
+ self.text_value(y, x, round(temp, 2), action)
def coords_to_state(self, coords):
x = int((coords[0] - 50) / 100)
y = int((coords[1] - 50) / 100)
return [x, y]
- def state_to_coords(self, state):
- x = int(state[0] * 100 + 50)
- y = int(state[1] * 100 + 50)
- return [x, y]
-
def reset(self):
self.update()
time.sleep(0.5)
- self.canvas.delete(self.rectangle)
- origin = np.array([UNIT / 2, UNIT / 2])
- self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
+ x, y = self.canvas.coords(self.rectangle)
+ self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
+ self.render()
# return observation
return self.coords_to_state(self.canvas.coords(self.rectangle))
@@ -112,15 +115,18 @@ def step(self, action):
if state[0] < (WIDTH - 1) * UNIT:
base_action[0] += UNIT
- self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move agent
-
- next_state = self.canvas.coords(self.rectangle) # next state
+ # move agent
+ self.canvas.move(self.rectangle, base_action[0], base_action[1])
+ # move rectangle to top level of canvas
+ self.canvas.tag_raise(self.rectangle)
+ next_state = self.canvas.coords(self.rectangle)
# reward function
if next_state == self.canvas.coords(self.circle):
reward = 100
done = True
- elif next_state in [self.canvas.coords(self.triangle1), self.canvas.coords(self.triangle2)]:
+ elif next_state in [self.canvas.coords(self.triangle1),
+ self.canvas.coords(self.triangle2)]:
reward = -100
done = True
else:
@@ -132,5 +138,5 @@ def step(self, action):
return next_state, reward, done
def render(self):
- time.sleep(0.05)
+ time.sleep(0.03)
self.update()
diff --git a/1-grid-world/4-sarsa/sarsa_agent.py b/1-grid-world/4-sarsa/sarsa_agent.py
new file mode 100644
index 00000000..8a8cf9ef
--- /dev/null
+++ b/1-grid-world/4-sarsa/sarsa_agent.py
@@ -0,0 +1,79 @@
+import numpy as np
+import random
+from collections import defaultdict
+from environment import Env
+
+
+# SARSA agent that learns from a sample at every time step
+class SARSAgent:
+ def __init__(self, actions):
+ self.actions = actions
+ self.learning_rate = 0.01
+ self.discount_factor = 0.9
+ self.epsilon = 0.1
+ self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])
+
+    # update the q function with the sample <s, a, r, s', a'>
+ def learn(self, state, action, reward, next_state, next_action):
+ current_q = self.q_table[state][action]
+ next_state_q = self.q_table[next_state][next_action]
+ new_q = (current_q + self.learning_rate *
+ (reward + self.discount_factor * next_state_q - current_q))
+ self.q_table[state][action] = new_q
+
+    # get the action for a state, following an epsilon-greedy
+    # policy over the q function table
+ def get_action(self, state):
+ if np.random.rand() < self.epsilon:
+ # take random action
+ action = np.random.choice(self.actions)
+ else:
+ # take action according to the q function table
+ state_action = self.q_table[state]
+ action = self.arg_max(state_action)
+ return action
+
+ @staticmethod
+ def arg_max(state_action):
+ max_index_list = []
+ max_value = state_action[0]
+ for index, value in enumerate(state_action):
+ if value > max_value:
+ max_index_list.clear()
+ max_value = value
+ max_index_list.append(index)
+ elif value == max_value:
+ max_index_list.append(index)
+ return random.choice(max_index_list)
+
+if __name__ == "__main__":
+ env = Env()
+ agent = SARSAgent(actions=list(range(env.n_actions)))
+
+ for episode in range(1000):
+        # reset environment and initialize state
+        state = env.reset()
+
+ # get action of state from agent
+ action = agent.get_action(str(state))
+
+ while True:
+ env.render()
+
+ # take action and proceed one step in the environment
+ next_state, reward, done = env.step(action)
+ next_action = agent.get_action(str(next_state))
+
+            # the agent updates its q function with the sample
+ agent.learn(str(state), action, reward, str(next_state), next_action)
+
+ state = next_state
+ action = next_action
+
+ # print q function of all states at screen
+ env.print_value_all(agent.q_table)
+
+ # if episode ends, then break
+ if done:
+ break
+
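
Each `learn` call nudges Q(s, a) toward the on-policy TD target r + gamma * Q(s', a'). With the hyperparameters above and assumed Q values:

```python
# a single SARSA backup, mirroring SARSAgent.learn() (assumed values)
alpha, gamma = 0.01, 0.9
current_q = 0.0  # Q(s, a)
next_q = 0.5     # Q(s', a') for the action actually chosen next
reward = 0.0

new_q = current_q + alpha * (reward + gamma * next_q - current_q)
print(round(new_q, 6))  # 0.0045
```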
diff --git a/Code 1. Grid World/5. Q Learning/.python-version b/1-grid-world/5-q-learning/.python-version
similarity index 100%
rename from Code 1. Grid World/5. Q Learning/.python-version
rename to 1-grid-world/5-q-learning/.python-version
diff --git a/Code 1. Grid World/5. Q Learning/environment.py b/1-grid-world/5-q-learning/environment.py
similarity index 60%
rename from Code 1. Grid World/5. Q Learning/environment.py
rename to 1-grid-world/5-q-learning/environment.py
index 30074db3..e724e5ac 100644
--- a/Code 1. Grid World/5. Q Learning/environment.py
+++ b/1-grid-world/5-q-learning/environment.py
@@ -4,7 +4,7 @@
from PIL import ImageTk, Image
np.random.seed(1)
-
+PhotoImage = ImageTk.PhotoImage
UNIT = 100 # pixels
HEIGHT = 5 # grid height
WIDTH = 5 # grid width
@@ -15,40 +15,47 @@ def __init__(self):
super(Env, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.n_actions = len(self.action_space)
- self.title('monte carlo')
+ self.title('Q Learning')
self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
- self.buildGraphic()
+ self.shapes = self.load_images()
+ self.canvas = self._build_canvas()
self.texts = []
- def buildGraphic(self):
- self.canvas = tk.Canvas(self, bg='white',
- height=HEIGHT * UNIT,
- width=WIDTH * UNIT)
-
+ def _build_canvas(self):
+ canvas = tk.Canvas(self, bg='white',
+ height=HEIGHT * UNIT,
+ width=WIDTH * UNIT)
# create grids
for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80
x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
- self.canvas.create_line(x0, y0, x1, y1)
+ canvas.create_line(x0, y0, x1, y1)
for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80
x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
- self.canvas.create_line(x0, y0, x1, y1)
-
- # image_load
- self.rectangle_image = ImageTk.PhotoImage(
- Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS))
- self.triange_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((65, 65)))
- self.circle_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((65, 65)))
+ canvas.create_line(x0, y0, x1, y1)
- # add image to canvas
- self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
- self.triangle1 = self.canvas.create_image(250, 150, image=self.triange_image)
- self.triangle2 = self.canvas.create_image(150, 250, image=self.triange_image)
- self.circle = self.canvas.create_image(250, 250, image=self.circle_image)
+ # add img to canvas
+ self.rectangle = canvas.create_image(50, 50, image=self.shapes[0])
+ self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1])
+ self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1])
+ self.circle = canvas.create_image(250, 250, image=self.shapes[2])
# pack all
- self.canvas.pack()
+ canvas.pack()
+
+ return canvas
+
+ def load_images(self):
+ rectangle = PhotoImage(
+ Image.open("../img/rectangle.png").resize((65, 65)))
+ triangle = PhotoImage(
+ Image.open("../img/triangle.png").resize((65, 65)))
+ circle = PhotoImage(
+ Image.open("../img/circle.png").resize((65, 65)))
- def text_value(self, row, col, contents, action, font='Helvetica', size=10, style='normal', anchor="nw"):
+ return rectangle, triangle, circle
+
+ def text_value(self, row, col, contents, action, font='Helvetica', size=10,
+ style='normal', anchor="nw"):
if action == 0:
origin_x, origin_y = 7, 42
@@ -61,7 +68,9 @@ def text_value(self, row, col, contents, action, font='Helvetica', size=10, styl
x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
font = (font, str(size), style)
- return self.texts.append(self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor))
+ text = self.canvas.create_text(x, y, fill="black", text=contents,
+ font=font, anchor=anchor)
+        self.texts.append(text)
def print_value_all(self, q_table):
for i in self.texts:
@@ -71,8 +80,8 @@ def print_value_all(self, q_table):
for j in range(WIDTH):
for action in range(0, 4):
state = [i, j]
- if str(state) in q_table.index:
- temp = q_table.ix[str(state), action]
+ if str(state) in q_table.keys():
+ temp = q_table[str(state)][action]
self.text_value(j, i, round(temp, 2), action)
def coords_to_state(self, coords):
@@ -88,12 +97,13 @@ def state_to_coords(self, state):
def reset(self):
self.update()
time.sleep(0.5)
- self.canvas.delete(self.rectangle)
- origin = np.array([UNIT / 2, UNIT / 2])
- self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
+ x, y = self.canvas.coords(self.rectangle)
+ self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
+ self.render()
# return observation
return self.coords_to_state(self.canvas.coords(self.rectangle))
+
def step(self, action):
state = self.canvas.coords(self.rectangle)
base_action = np.array([0, 0])
@@ -112,15 +122,18 @@ def step(self, action):
if state[0] < (WIDTH - 1) * UNIT:
base_action[0] += UNIT
- self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move agent
-
- next_state = self.canvas.coords(self.rectangle) # next state
+ # move agent
+ self.canvas.move(self.rectangle, base_action[0], base_action[1])
+ # move rectangle to top level of canvas
+ self.canvas.tag_raise(self.rectangle)
+ next_state = self.canvas.coords(self.rectangle)
# reward function
if next_state == self.canvas.coords(self.circle):
reward = 100
done = True
- elif next_state in [self.canvas.coords(self.triangle1), self.canvas.coords(self.triangle2)]:
+ elif next_state in [self.canvas.coords(self.triangle1),
+ self.canvas.coords(self.triangle2)]:
reward = -100
done = True
else:
@@ -128,9 +141,8 @@ def step(self, action):
done = False
next_state = self.coords_to_state(next_state)
-
return next_state, reward, done
def render(self):
- time.sleep(0.05)
+ time.sleep(0.03)
self.update()
diff --git a/1-grid-world/5-q-learning/q_learning_agent.py b/1-grid-world/5-q-learning/q_learning_agent.py
new file mode 100644
index 00000000..029c2f36
--- /dev/null
+++ b/1-grid-world/5-q-learning/q_learning_agent.py
@@ -0,0 +1,69 @@
+import numpy as np
+import random
+from environment import Env
+from collections import defaultdict
+
+class QLearningAgent:
+ def __init__(self, actions):
+ # actions = [0, 1, 2, 3]
+ self.actions = actions
+ self.learning_rate = 0.01
+ self.discount_factor = 0.9
+ self.epsilon = 0.1
+ self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])
+
+ # update q function with sample
+ def learn(self, state, action, reward, next_state):
+ current_q = self.q_table[state][action]
+ # using Bellman Optimality Equation to update q function
+ new_q = reward + self.discount_factor * max(self.q_table[next_state])
+ self.q_table[state][action] += self.learning_rate * (new_q - current_q)
+
+    # get the action for a state, following an epsilon-greedy
+    # policy over the q function table
+ def get_action(self, state):
+ if np.random.rand() < self.epsilon:
+ # take random action
+ action = np.random.choice(self.actions)
+ else:
+ # take action according to the q function table
+ state_action = self.q_table[state]
+ action = self.arg_max(state_action)
+ return action
+
+ @staticmethod
+ def arg_max(state_action):
+ max_index_list = []
+ max_value = state_action[0]
+ for index, value in enumerate(state_action):
+ if value > max_value:
+ max_index_list.clear()
+ max_value = value
+ max_index_list.append(index)
+ elif value == max_value:
+ max_index_list.append(index)
+ return random.choice(max_index_list)
+
+if __name__ == "__main__":
+ env = Env()
+ agent = QLearningAgent(actions=list(range(env.n_actions)))
+
+ for episode in range(1000):
+ state = env.reset()
+
+ while True:
+ env.render()
+
+ # take action and proceed one step in the environment
+ action = agent.get_action(str(state))
+ next_state, reward, done = env.step(action)
+
+            # the agent updates its q function with the sample
+ agent.learn(str(state), action, reward, str(next_state))
+
+ state = next_state
+ env.print_value_all(agent.q_table)
+
+ # if episode ends, then break
+ if done:
+ break
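
Q-learning differs from SARSA only in the target: it bootstraps off max_a' Q(s', a'), regardless of which action the epsilon-greedy behaviour policy takes next. With assumed numbers:

```python
# a single off-policy Q-learning backup, as in QLearningAgent.learn()
alpha, gamma = 0.01, 0.9
q_next = [0.0, 0.2, 0.5, 0.1]  # Q(s', a) for the four actions
current_q, reward = 0.0, 0.0

target = reward + gamma * max(q_next)      # max over next actions
current_q += alpha * (target - current_q)
print(round(current_q, 6))  # 0.0045
```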
diff --git a/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py b/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py
new file mode 100755
index 00000000..a1b1c23b
--- /dev/null
+++ b/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py
@@ -0,0 +1,117 @@
+import copy
+import pylab
+import random
+import numpy as np
+from environment import Env
+from keras.layers import Dense
+from keras.optimizers import Adam
+from keras.models import Sequential
+
+EPISODES = 1000
+
+
+# deep SARSA agent for the grid world
+# uses a neural network as the q function approximator
+class DeepSARSAgent:
+ def __init__(self):
+ self.load_model = False
+ # actions which agent can do
+ self.action_space = [0, 1, 2, 3, 4]
+ # get size of state and action
+ self.action_size = len(self.action_space)
+ self.state_size = 15
+ self.discount_factor = 0.99
+ self.learning_rate = 0.001
+
+ self.epsilon = 1. # exploration
+ self.epsilon_decay = .9999
+ self.epsilon_min = 0.01
+ self.model = self.build_model()
+
+ if self.load_model:
+ self.epsilon = 0.05
+ self.model.load_weights('./save_model/deep_sarsa_trained.h5')
+
+    # approximate the Q function with a neural network:
+    # the state is the input and the Q value of each action is the output
+ def build_model(self):
+ model = Sequential()
+ model.add(Dense(30, input_dim=self.state_size, activation='relu'))
+ model.add(Dense(30, activation='relu'))
+ model.add(Dense(self.action_size, activation='linear'))
+ model.summary()
+ model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
+ return model
+
+ # get action from model using epsilon-greedy policy
+ def get_action(self, state):
+ if np.random.rand() <= self.epsilon:
+ # The agent acts randomly
+ return random.randrange(self.action_size)
+ else:
+            # predict the Q values for the given state
+ state = np.float32(state)
+ q_values = self.model.predict(state)
+ return np.argmax(q_values[0])
+
+ def train_model(self, state, action, reward, next_state, next_action, done):
+ if self.epsilon > self.epsilon_min:
+ self.epsilon *= self.epsilon_decay
+
+ state = np.float32(state)
+ next_state = np.float32(next_state)
+ target = self.model.predict(state)[0]
+        # unlike Q-learning, bootstrap from Q(s', a') for the next action
+        # actually selected (this is the on-policy SARSA target)
+ if done:
+ target[action] = reward
+ else:
+ target[action] = (reward + self.discount_factor *
+ self.model.predict(next_state)[0][next_action])
+
+ target = np.reshape(target, [1, 5])
+        # fit the model on a single-sample batch that pairs the state
+        # with its corrected q-value target
+ self.model.fit(state, target, epochs=1, verbose=0)
+
+
+if __name__ == "__main__":
+ env = Env()
+ agent = DeepSARSAgent()
+
+ global_step = 0
+ scores, episodes = [], []
+
+ for e in range(EPISODES):
+ done = False
+ score = 0
+ state = env.reset()
+ state = np.reshape(state, [1, 15])
+
+ while not done:
+            # count the global time step
+ global_step += 1
+
+ # get action for the current state and go one step in environment
+ action = agent.get_action(state)
+ next_state, reward, done = env.step(action)
+ next_state = np.reshape(next_state, [1, 15])
+ next_action = agent.get_action(next_state)
+            # train the model at every time step
+            agent.train_model(state, action, reward, next_state, next_action,
+                              done)
+            score += reward
+
+            # carry a copy of the observation over to the next step
+            state = copy.deepcopy(next_state)
+
+ if done:
+ scores.append(score)
+ episodes.append(e)
+ pylab.plot(episodes, scores, 'b')
+ pylab.savefig("./save_graph/deep_sarsa_.png")
+ print("episode:", e, " score:", score, "global_step",
+ global_step, " epsilon:", agent.epsilon)
+
+ if e % 100 == 0:
+ agent.model.save_weights("./save_model/deep_sarsa.h5")
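
`train_model` fits the network on a one-sample batch whose target vector equals the current prediction everywhere except the taken action. A sketch of that target construction with dummy numbers (only numpy needed):

```python
import numpy as np

# SARSA target as built in train_model() (dummy values)
gamma = 0.99
q_pred = np.array([0.1, 0.3, -0.2, 0.0, 0.05])  # model.predict(state)[0]
next_q = 0.2      # Q(s', a') for the next action the agent will take
action, reward, done = 1, 0.0, False

target = q_pred.copy()
target[action] = reward if done else reward + gamma * next_q
target = target.reshape(1, 5)  # one-sample batch for model.fit
print(round(float(target[0, action]), 3))  # 0.198
```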
diff --git a/Code 1. Grid World/6. DQN/environment.py b/1-grid-world/6-deep-sarsa/environment.py
old mode 100644
new mode 100755
similarity index 52%
rename from Code 1. Grid World/6. DQN/environment.py
rename to 1-grid-world/6-deep-sarsa/environment.py
index a30093a1..c390de8b
--- a/Code 1. Grid World/6. DQN/environment.py
+++ b/1-grid-world/6-deep-sarsa/environment.py
@@ -3,9 +3,10 @@
import tkinter as tk
from PIL import ImageTk, Image
+PhotoImage = ImageTk.PhotoImage
UNIT = 50 # pixels
-HEIGHT = 10 # grid height
-WIDTH = 10 # grid width
+HEIGHT = 5 # grid height
+WIDTH = 5 # grid width
np.random.seed(1)
@@ -15,54 +16,52 @@ def __init__(self):
super(Env, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.action_size = len(self.action_space)
- self.title('DeepQNetwork')
+ self.title('DeepSARSA')
self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
- self.build_graphic()
+ self.shapes = self.load_images()
+ self.canvas = self._build_canvas()
self.counter = 0
+ self.rewards = []
+ self.goal = []
+ # obstacle
+ self.set_reward([0, 1], -1)
+ self.set_reward([1, 2], -1)
+ self.set_reward([2, 3], -1)
+        # goal
+ self.set_reward([4, 4], 1)
- def build_graphic(self):
- self.canvas = tk.Canvas(self, bg='white',
- height=HEIGHT * UNIT,
- width=WIDTH * UNIT)
-
+ def _build_canvas(self):
+ canvas = tk.Canvas(self, bg='white',
+ height=HEIGHT * UNIT,
+ width=WIDTH * UNIT)
# create grids
for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80
x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
- self.canvas.create_line(x0, y0, x1, y1)
+ canvas.create_line(x0, y0, x1, y1)
for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80
x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
- self.canvas.create_line(x0, y0, x1, y1)
+ canvas.create_line(x0, y0, x1, y1)
- # image_load
- self.rectangle_image = ImageTk.PhotoImage(
- Image.open("../resources/rectangle.png").resize((30, 30), Image.ANTIALIAS))
- self.fire_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((30, 30)))
- self.fish_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((30, 30)))
+ self.rewards = []
+ self.goal = []
+ # add image to canvas
+ x, y = UNIT/2, UNIT/2
+ self.rectangle = canvas.create_image(x, y, image=self.shapes[0])
- self.rewards = list()
- self.goal = list()
+        # pack all
+ canvas.pack()
- # obstacle
- self.set_reward([2, 7], -1)
- self.set_reward([3, 2], -1)
- self.set_reward([2, 5], -1)
- self.set_reward([4, 9], -1)
- self.set_reward([5, 7], -1)
- self.set_reward([6, 4], -1)
- self.set_reward([7, 8], -1)
- self.set_reward([8, 3], -1)
- self.set_reward([9, 1], -1)
-
- #
- #
- # #goal
- self.set_reward([9, 9], 5)
+ return canvas
- # add image to canvas
- self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
+ def load_images(self):
+ rectangle = PhotoImage(
+ Image.open("../img/rectangle.png").resize((30, 30)))
+ triangle = PhotoImage(
+ Image.open("../img/triangle.png").resize((30, 30)))
+ circle = PhotoImage(
+ Image.open("../img/circle.png").resize((30, 30)))
- # pack all`
- self.canvas.pack()
+ return rectangle, triangle, circle
def reset_reward(self):
@@ -71,36 +70,33 @@ def reset_reward(self):
self.rewards.clear()
self.goal.clear()
- # obstacle
- self.set_reward([2, 7], -1)
- self.set_reward([3, 2], -1)
- self.set_reward([2, 5], -1)
- self.set_reward([4, 9], -1)
- self.set_reward([5, 7], -1)
- self.set_reward([6, 4], -1)
- self.set_reward([7, 8], -1)
- self.set_reward([8, 3], -1)
- self.set_reward([9, 1], -1)
-
- #
- #
+ self.set_reward([0, 1], -1)
+ self.set_reward([1, 2], -1)
+ self.set_reward([2, 3], -1)
+
# #goal
- self.set_reward([9, 9], 5)
+ self.set_reward([4, 4], 1)
def set_reward(self, state, reward):
state = [int(state[0]), int(state[1])]
+ x = int(state[0])
+ y = int(state[1])
temp = {}
if reward > 0:
temp['reward'] = reward
- temp['figure'] = self.canvas.create_image((UNIT * state[0]) + UNIT/2, (UNIT * state[1]) + UNIT/2,
- image=self.fish_image)
+ temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
+ (UNIT * y) + UNIT / 2,
+ image=self.shapes[2])
+
self.goal.append(temp['figure'])
elif reward < 0:
+ temp['direction'] = -1
temp['reward'] = reward
- temp['figure'] = self.canvas.create_image((UNIT * state[0]) + UNIT/2, (UNIT * state[1]) + UNIT/2,
- image=self.fire_image)
+ temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
+ (UNIT * y) + UNIT / 2,
+ image=self.shapes[1])
temp['coords'] = self.canvas.coords(temp['figure'])
temp['state'] = state
@@ -112,28 +108,28 @@ def check_if_reward(self, state):
check_list = dict()
check_list['if_goal'] = False
rewards = 0
+
for reward in self.rewards:
if reward['state'] == state:
rewards += reward['reward']
- if reward['reward'] == 5:
+ if reward['reward'] == 1:
check_list['if_goal'] = True
+
check_list['rewards'] = rewards
return check_list
def coords_to_state(self, coords):
- x = int((coords[0] - 50) / 100)
- y = int((coords[1] - 50) / 100)
+ x = int((coords[0] - UNIT / 2) / UNIT)
+ y = int((coords[1] - UNIT / 2) / UNIT)
return [x, y]
def reset(self):
self.update()
time.sleep(0.5)
- self.canvas.delete(self.rectangle)
- origin = np.array([UNIT / 2, UNIT / 2])
- self.rectangle = self.canvas.create_image(UNIT/2, UNIT/2, image=self.rectangle_image)
+ x, y = self.canvas.coords(self.rectangle)
+ self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
# return observation
-
self.reset_reward()
return self.get_state()
@@ -141,66 +137,77 @@ def step(self, action):
self.counter += 1
self.render()
- next_coords = self.move(self.rectangle, action)
-
if self.counter % 2 == 1:
self.rewards = self.move_rewards()
+ next_coords = self.move(self.rectangle, action)
check = self.check_if_reward(self.coords_to_state(next_coords))
done = check['if_goal']
reward = check['rewards']
+ self.canvas.tag_raise(self.rectangle)
+
s_ = self.get_state()
return s_, reward, done
def get_state(self):
- agent_location = self.coords_to_state(self.canvas.coords(self.rectangle))
- agent_x = agent_location[0]
- agent_y = agent_location[1]
+ location = self.coords_to_state(self.canvas.coords(self.rectangle))
+ agent_x = location[0]
+ agent_y = location[1]
- locations = list()
+ states = list()
- locations.append(agent_x)
- locations.append(agent_y)
+ # locations.append(agent_x)
+ # locations.append(agent_y)
for reward in self.rewards:
reward_location = reward['state']
- locations.append(agent_x - reward_location[0])
- locations.append(agent_y - reward_location[1])
+ states.append(reward_location[0] - agent_x)
+ states.append(reward_location[1] - agent_y)
+ if reward['reward'] < 0:
+ states.append(-1)
+ states.append(reward['direction'])
+ else:
+ states.append(1)
- return locations
+ return states
def move_rewards(self):
new_rewards = []
for temp in self.rewards:
- if temp['reward'] == 10:
+ if temp['reward'] == 1:
new_rewards.append(temp)
continue
- temp['coords'] = self.move_const(temp['figure'])
+ temp['coords'] = self.move_const(temp)
temp['state'] = self.coords_to_state(temp['coords'])
new_rewards.append(temp)
return new_rewards
def move_const(self, target):
- s = self.canvas.coords(target)
+
+ s = self.canvas.coords(target['figure'])
base_action = np.array([0, 0])
- if s[0] < (WIDTH - 1) * UNIT:
- base_action[0] += UNIT
- else:
- base_action[0] = -(WIDTH - 1) * UNIT
+ if s[0] == (WIDTH - 1) * UNIT + UNIT / 2:
+ target['direction'] = 1
+ elif s[0] == UNIT / 2:
+ target['direction'] = -1
- # if action == 4 # move _none
+ if target['direction'] == -1:
+ base_action[0] += UNIT
+ elif target['direction'] == 1:
+ base_action[0] -= UNIT
- if target is not self.rectangle and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]:
+ if (target['figure'] is not self.rectangle
+ and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]):
base_action = np.array([0, 0])
- self.canvas.move(target, base_action[0], base_action[1])
+ self.canvas.move(target['figure'], base_action[0], base_action[1])
- s_ = self.canvas.coords(target)
+ s_ = self.canvas.coords(target['figure'])
return s_
@@ -222,11 +229,6 @@ def move(self, target, action):
if s[0] > UNIT:
base_action[0] -= UNIT
- # if action == 4 # move _none
-
- if target is not self.rectangle and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]:
- base_action = np.array([0, 0])
-
self.canvas.move(target, base_action[0], base_action[1])
s_ = self.canvas.coords(target)
@@ -234,5 +236,5 @@ def move(self, target, action):
return s_
def render(self):
- time.sleep(0.1)
+ time.sleep(0.07)
self.update()
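For reference, a minimal standalone sketch of the obstacle bounce rule that the new `move_const` implements: an obstacle sweeps horizontally and flips its `direction` flag at either wall. The constants and driver loop here are illustrative assumptions, not part of the diff.

```python
# Illustrative reproduction of the move_const bounce logic
# (UNIT and WIDTH are assumed values for the sketch).
UNIT = 50   # pixels per cell
WIDTH = 5   # grid width

def next_x(x, direction):
    # flip the direction flag when the figure reaches either wall
    if x == (WIDTH - 1) * UNIT + UNIT / 2:
        direction = 1    # start moving left
    elif x == UNIT / 2:
        direction = -1   # start moving right
    # direction == -1 moves right (+UNIT), direction == 1 moves left (-UNIT)
    x += UNIT if direction == -1 else -UNIT
    return x, direction

x, d = UNIT / 2, -1
for _ in range(8):
    x, d = next_x(x, d)
    print(x, d)   # climbs to the right wall, then walks back down
```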
diff --git a/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png b/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png
new file mode 100644
index 00000000..8dec1d06
Binary files /dev/null and b/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png differ
diff --git a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN.h5 b/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5
similarity index 55%
rename from Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN.h5
rename to 1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5
index fe4933cb..23ba39c9 100644
Binary files a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN.h5 and b/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5 differ
diff --git a/Code 1. Grid World/7. Policy Gradient/environment.py b/1-grid-world/7-reinforce/environment.py
similarity index 51%
rename from Code 1. Grid World/7. Policy Gradient/environment.py
rename to 1-grid-world/7-reinforce/environment.py
index 620a30b3..c8283baa 100644
--- a/Code 1. Grid World/7. Policy Gradient/environment.py
+++ b/1-grid-world/7-reinforce/environment.py
@@ -3,11 +3,12 @@
import tkinter as tk
from PIL import ImageTk, Image
+PhotoImage = ImageTk.PhotoImage
UNIT = 50 # pixels
-HEIGHT = 10 # grid height
-WIDTH = 10 # grid width
+HEIGHT = 5 # grid height
+WIDTH = 5 # grid width
-# np.random.seed(1)
+np.random.seed(1)
class Env(tk.Tk):
@@ -15,54 +16,52 @@ def __init__(self):
super(Env, self).__init__()
self.action_space = ['u', 'd', 'l', 'r']
self.action_size = len(self.action_space)
- self.title('Policy Gradient')
+ self.title('Reinforce')
self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT))
- self.build_graphic()
+ self.shapes = self.load_images()
+ self.canvas = self._build_canvas()
self.counter = 0
+ self.rewards = []
+ self.goal = []
+ # obstacle
+ self.set_reward([0, 1], -1)
+ self.set_reward([1, 2], -1)
+ self.set_reward([2, 3], -1)
+ # #goal
+ self.set_reward([4, 4], 1)
- def build_graphic(self):
- self.canvas = tk.Canvas(self, bg='white',
- height=HEIGHT * UNIT,
- width=WIDTH * UNIT)
-
+ def _build_canvas(self):
+ canvas = tk.Canvas(self, bg='white',
+ height=HEIGHT * UNIT,
+ width=WIDTH * UNIT)
# create grids
for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80
x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
- self.canvas.create_line(x0, y0, x1, y1)
+ canvas.create_line(x0, y0, x1, y1)
for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80
x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
- self.canvas.create_line(x0, y0, x1, y1)
+ canvas.create_line(x0, y0, x1, y1)
- # image_load
- self.rectangle_image = ImageTk.PhotoImage(
- Image.open("../resources/rectangle.png").resize((30, 30), Image.ANTIALIAS))
- self.fire_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((30, 30)))
- self.fish_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((30, 30)))
+ self.rewards = []
+ self.goal = []
+ # add image to canvas
+ x, y = UNIT/2, UNIT/2
+ self.rectangle = canvas.create_image(x, y, image=self.shapes[0])
- self.rewards = list()
- self.goal = list()
+ # pack all
+ canvas.pack()
- # obstacle
- self.set_reward([2, 7], -1)
- self.set_reward([3, 2], -1)
- self.set_reward([2, 5], -1)
- self.set_reward([4, 9], -1)
- self.set_reward([5, 7], -1)
- self.set_reward([6, 4], -1)
- self.set_reward([7, 8], -1)
- self.set_reward([8, 3], -1)
- self.set_reward([9, 1], -1)
-
- #
- #
- # #goal
- self.set_reward([9, 9], 5)
+ return canvas
- # add image to canvas
- self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
+ def load_images(self):
+ rectangle = PhotoImage(
+ Image.open("../img/rectangle.png").resize((30, 30)))
+ triangle = PhotoImage(
+ Image.open("../img/triangle.png").resize((30, 30)))
+ circle = PhotoImage(
+ Image.open("../img/circle.png").resize((30, 30)))
- # pack all`
- self.canvas.pack()
+ return rectangle, triangle, circle
def reset_reward(self):
@@ -71,36 +70,33 @@ def reset_reward(self):
self.rewards.clear()
self.goal.clear()
- # obstacle
- self.set_reward([2, 7], -1)
- self.set_reward([3, 2], -1)
- self.set_reward([2, 5], -1)
- self.set_reward([4, 9], -1)
- self.set_reward([5, 7], -1)
- self.set_reward([6, 4], -1)
- self.set_reward([7, 8], -1)
- self.set_reward([8, 3], -1)
- self.set_reward([9, 1], -1)
-
- #
- #
+ self.set_reward([0, 1], -1)
+ self.set_reward([1, 2], -1)
+ self.set_reward([2, 3], -1)
+
# #goal
- self.set_reward([9, 9], 5)
+ self.set_reward([4, 4], 1)
def set_reward(self, state, reward):
state = [int(state[0]), int(state[1])]
+ x = int(state[0])
+ y = int(state[1])
temp = {}
if reward > 0:
temp['reward'] = reward
- temp['figure'] = self.canvas.create_image((UNIT * state[0]) + UNIT/2, (UNIT * state[1]) + UNIT/2,
- image=self.fish_image)
+ temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
+ (UNIT * y) + UNIT / 2,
+ image=self.shapes[2])
+
self.goal.append(temp['figure'])
elif reward < 0:
+ temp['direction'] = -1
temp['reward'] = reward
- temp['figure'] = self.canvas.create_image((UNIT * state[0]) + UNIT/2, (UNIT * state[1]) + UNIT/2,
- image=self.fire_image)
+ temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2,
+ (UNIT * y) + UNIT / 2,
+ image=self.shapes[1])
temp['coords'] = self.canvas.coords(temp['figure'])
temp['state'] = state
@@ -112,28 +108,27 @@ def check_if_reward(self, state):
check_list = dict()
check_list['if_goal'] = False
rewards = 0
+
for reward in self.rewards:
if reward['state'] == state:
rewards += reward['reward']
- if reward['reward'] == 5:
+ if reward['reward'] > 0:
check_list['if_goal'] = True
+
check_list['rewards'] = rewards
return check_list
def coords_to_state(self, coords):
- x = int((coords[0] - 50) / 100)
- y = int((coords[1] - 50) / 100)
+ x = int((coords[0] - UNIT / 2) / UNIT)
+ y = int((coords[1] - UNIT / 2) / UNIT)
return [x, y]
def reset(self):
self.update()
- time.sleep(0.5)
- self.canvas.delete(self.rectangle)
- origin = np.array([UNIT / 2, UNIT / 2])
- self.rectangle = self.canvas.create_image(UNIT/2, UNIT/2, image=self.rectangle_image)
+ x, y = self.canvas.coords(self.rectangle)
+ self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y)
# return observation
-
self.reset_reward()
return self.get_state()
@@ -141,14 +136,15 @@ def step(self, action):
self.counter += 1
self.render()
- next_coords = self.move(self.rectangle, action)
-
if self.counter % 2 == 1:
self.rewards = self.move_rewards()
+ next_coords = self.move(self.rectangle, action)
check = self.check_if_reward(self.coords_to_state(next_coords))
done = check['if_goal']
reward = check['rewards']
+ reward -= 0.1
+ self.canvas.tag_raise(self.rectangle)
s_ = self.get_state()
@@ -156,51 +152,61 @@ def step(self, action):
def get_state(self):
- agent_location = self.coords_to_state(self.canvas.coords(self.rectangle))
- agent_x = agent_location[0]
- agent_y = agent_location[1]
+ location = self.coords_to_state(self.canvas.coords(self.rectangle))
+ agent_x = location[0]
+ agent_y = location[1]
- locations = list()
+ states = list()
- locations.append(agent_x)
- locations.append(agent_y)
+ # locations.append(agent_x)
+ # locations.append(agent_y)
for reward in self.rewards:
reward_location = reward['state']
- locations.append(agent_x - reward_location[0])
- locations.append(agent_y - reward_location[1])
+ states.append(reward_location[0] - agent_x)
+ states.append(reward_location[1] - agent_y)
+ if reward['reward'] < 0:
+ states.append(-1)
+ states.append(reward['direction'])
+ else:
+ states.append(1)
- return locations
+ return states
def move_rewards(self):
new_rewards = []
for temp in self.rewards:
- if temp['reward'] == 10:
+ if temp['reward'] > 0:
new_rewards.append(temp)
continue
- temp['coords'] = self.move_const(temp['figure'])
+ temp['coords'] = self.move_const(temp)
temp['state'] = self.coords_to_state(temp['coords'])
new_rewards.append(temp)
return new_rewards
def move_const(self, target):
- s = self.canvas.coords(target)
+
+ s = self.canvas.coords(target['figure'])
base_action = np.array([0, 0])
- if s[0] < (WIDTH - 1) * UNIT:
- base_action[0] += UNIT
- else:
- base_action[0] = -(WIDTH - 1) * UNIT
+ if s[0] == (WIDTH - 1) * UNIT + UNIT / 2:
+ target['direction'] = 1
+ elif s[0] == UNIT / 2:
+ target['direction'] = -1
- # if action == 4 # move _none
+ if target['direction'] == -1:
+ base_action[0] += UNIT
+ elif target['direction'] == 1:
+ base_action[0] -= UNIT
- if target is not self.rectangle and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]:
+ if (target['figure'] is not self.rectangle
+ and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]):
base_action = np.array([0, 0])
- self.canvas.move(target, base_action[0], base_action[1])
+ self.canvas.move(target['figure'], base_action[0], base_action[1])
- s_ = self.canvas.coords(target)
+ s_ = self.canvas.coords(target['figure'])
return s_
@@ -222,11 +228,6 @@ def move(self, target, action):
if s[0] > UNIT:
base_action[0] -= UNIT
- # if action == 4 # move _none
-
- if target is not self.rectangle and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]:
- base_action = np.array([0, 0])
-
self.canvas.move(target, base_action[0], base_action[1])
s_ = self.canvas.coords(target)
@@ -234,5 +235,5 @@ def move(self, target, action):
return s_
def render(self):
- time.sleep(0.1)
+ time.sleep(0.07)
self.update()
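A quick sanity check of the new observation layout that `get_state` returns, which explains the `state_size = 15` used by the agent below: each of the three obstacles contributes its relative position plus a -1 marker and its sweep direction, and the goal contributes its relative position plus a +1 marker. Standalone sketch, values taken from the `set_reward` calls above, agent assumed at the origin.

```python
# Standalone check of the 15-dim state vector built by get_state.
agent_x, agent_y = 0, 0
rewards = [
    {'state': [0, 1], 'reward': -1, 'direction': -1},
    {'state': [1, 2], 'reward': -1, 'direction': -1},
    {'state': [2, 3], 'reward': -1, 'direction': -1},
    {'state': [4, 4], 'reward': 1},
]

states = []
for reward in rewards:
    states.append(reward['state'][0] - agent_x)
    states.append(reward['state'][1] - agent_y)
    if reward['reward'] < 0:
        states.append(-1)                    # obstacle marker
        states.append(reward['direction'])   # which way it is sweeping
    else:
        states.append(1)                     # goal marker

assert len(states) == 15   # 3 obstacles * 4 values + 1 goal * 3 values
```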
diff --git a/Code 1. Grid World/7. Policy Gradient/Gridworld_PG.py b/1-grid-world/7-reinforce/reinforce_agent.py
similarity index 57%
rename from Code 1. Grid World/7. Policy Gradient/Gridworld_PG.py
rename to 1-grid-world/7-reinforce/reinforce_agent.py
index 5bf590fd..2a37c851 100644
--- a/Code 1. Grid World/7. Policy Gradient/Gridworld_PG.py
+++ b/1-grid-world/7-reinforce/reinforce_agent.py
@@ -7,53 +7,62 @@
from keras.models import Sequential
from keras import backend as K
-EPISODES = 1000
+EPISODES = 2500
-class PGAgent:
+# This is the REINFORCE agent for GridWorld
+class ReinforceAgent:
def __init__(self):
- self.render = False
-
+ self.load_model = True
+ # actions the agent can take
self.action_space = [0, 1, 2, 3, 4]
+ # get size of state and action
self.action_size = len(self.action_space)
- self.state_size = 22
- self.discount_factor = 0.99 # decay rate
+ self.state_size = 15
+ self.discount_factor = 0.99
self.learning_rate = 0.001
self.model = self.build_model()
self.optimizer = self.optimizer()
self.states, self.actions, self.rewards = [], [], []
+ if self.load_model:
+ self.model.load_weights('./save_model/reinforce_trained.h5')
+
+ # state is input and probability of each action (policy) is output of network
def build_model(self):
model = Sequential()
- model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform'))
- model.add(Dense(24, activation='relu', kernel_initializer='glorot_uniform'))
- # the final softmax layer makes the model output a probability for each action
- model.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform'))
+ model.add(Dense(24, input_dim=self.state_size, activation='relu'))
+ model.add(Dense(24, activation='relu'))
+ model.add(Dense(self.action_size, activation='softmax'))
model.summary()
-
return model
+ # create error function and training function to update policy network
def optimizer(self):
action = K.placeholder(shape=[None, 5])
discounted_rewards = K.placeholder(shape=[None, ])
- # the core of Policy Gradient:
- # compute the gradient of log(policy) * return and maximize it
- good_prob = K.sum(action * self.model.output, axis=1)
- eligibility = K.log(good_prob) * discounted_rewards
- loss = -K.sum(eligibility)
+ # Calculate cross entropy error function
+ action_prob = K.sum(action * self.model.output, axis=1)
+ cross_entropy = K.log(action_prob) * discounted_rewards
+ loss = -K.sum(cross_entropy)
+ # create training function
optimizer = Adam(lr=self.learning_rate)
- updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
- train = K.function([self.model.input, action, discounted_rewards], [], updates=updates)
+ updates = optimizer.get_updates(self.model.trainable_weights, [],
+ loss)
+ train = K.function([self.model.input, action, discounted_rewards], [],
+ updates=updates)
return train
+ # get action from policy network
def get_action(self, state):
- policy = self.model.predict(state, batch_size=1).flatten()
+ policy = self.model.predict(state)[0]
return np.random.choice(self.action_size, 1, p=policy)[0]
+ # calculate discounted rewards
def discount_rewards(self, rewards):
discounted_rewards = np.zeros_like(rewards)
running_add = 0
@@ -62,14 +71,16 @@ def discount_rewards(self, rewards):
discounted_rewards[t] = running_add
return discounted_rewards
- def memory(self, state, action, reward):
+ # save states, actions and rewards for an episode
+ def append_sample(self, state, action, reward):
self.states.append(state[0])
self.rewards.append(reward)
act = np.zeros(self.action_size)
act[action] = 1
self.actions.append(act)
- def train_episodes(self):
+ # update policy neural network
+ def train_model(self):
discounted_rewards = np.float32(self.discount_rewards(self.rewards))
discounted_rewards -= np.mean(discounted_rewards)
discounted_rewards /= np.std(discounted_rewards)
@@ -77,59 +88,42 @@ def train_episodes(self):
self.optimizer([self.states, self.actions, discounted_rewards])
self.states, self.actions, self.rewards = [], [], []
- def load_model(self, name):
- self.model.load_weights(name)
-
- def save_model(self, name):
- self.model.save_weights(name)
-
if __name__ == "__main__":
- # maze game
- # env = Maze()
env = Env()
- agent = PGAgent()
+ agent = ReinforceAgent()
global_step = 0
- # agent.load("same_vel_episode2 : 1000")
scores, episodes = [], []
for e in range(EPISODES):
done = False
score = 0
+ # fresh env
state = env.reset()
- state = np.reshape(state, [1, 22])
+ state = np.reshape(state, [1, 15])
while not done:
- # fresh env
- if agent.render:
- env.render()
global_step += 1
-
- # RL choose action based on observation and go one step
+ # get action for the current state and go one step in environment
action = agent.get_action(state)
next_state, reward, done = env.step(action)
- next_state = np.reshape(next_state, [1, 22])
+ next_state = np.reshape(next_state, [1, 15])
- agent.memory(state, action, reward)
- # every time step we do train from the replay memory
+ agent.append_sample(state, action, reward)
score += reward
- # swap observation
state = copy.deepcopy(next_state)
if done:
- agent.train_episodes()
-
+ # update policy neural network for each episode
+ agent.train_model()
scores.append(score)
episodes.append(e)
- pylab.plot(episodes, scores, 'b')
- pylab.savefig("./save_graph/10by10.png")
- print("episode:", e, " score:", score, " time_step:", global_step)
+ score = round(score, 2)
+ print("episode:", e, " score:", score, " time_step:",
+ global_step)
if e % 100 == 0:
- pass
- agent.save_model("./save_model/10by10")
-
- # end of game
- print('game over')
- env.destroy()
\ No newline at end of file
+ pylab.plot(episodes, scores, 'b')
+ pylab.savefig("./save_graph/reinforce.png")
+ agent.model.save_weights("./save_model/reinforce.h5")
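To make the return computation above concrete, here is a standalone numpy sketch of `discount_rewards` followed by the standardization that `train_model` applies. Toy rewards and the default gamma are assumptions for illustration.

```python
import numpy as np

def discount_rewards(rewards, gamma=0.99):
    # backwards pass: G_t = r_t + gamma * G_{t+1}
    discounted = np.zeros(len(rewards))
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        discounted[t] = running_add
    return discounted

g = discount_rewards([-0.1, -0.1, 1.0])
print(g)                            # [0.7811 0.89   1.    ]
g = (g - np.mean(g)) / np.std(g)    # the standardization done in train_model
print(g)
```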
diff --git a/1-grid-world/7-reinforce/save_graph/reinforce_trained.png b/1-grid-world/7-reinforce/save_graph/reinforce_trained.png
new file mode 100644
index 00000000..3be9edb7
Binary files /dev/null and b/1-grid-world/7-reinforce/save_graph/reinforce_trained.png differ
diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN13.h5 b/1-grid-world/7-reinforce/save_model/reinforce_trained.h5
similarity index 64%
rename from Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN13.h5
rename to 1-grid-world/7-reinforce/save_model/reinforce_trained.h5
index c63a4dc6..cb206f51 100644
Binary files a/Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN13.h5 and b/1-grid-world/7-reinforce/save_model/reinforce_trained.h5 differ
diff --git a/Code 1. Grid World/README.md b/1-grid-world/README.md
similarity index 100%
rename from Code 1. Grid World/README.md
rename to 1-grid-world/README.md
diff --git a/Code 1. Grid World/gridworld.png b/1-grid-world/gridworld.png
similarity index 100%
rename from Code 1. Grid World/gridworld.png
rename to 1-grid-world/gridworld.png
diff --git a/Code 1. Grid World/gridworld_changing.png b/1-grid-world/gridworld_changing.png
similarity index 100%
rename from Code 1. Grid World/gridworld_changing.png
rename to 1-grid-world/gridworld_changing.png
diff --git a/Code 1. Grid World/resources/circle.png b/1-grid-world/img/circle.png
similarity index 100%
rename from Code 1. Grid World/resources/circle.png
rename to 1-grid-world/img/circle.png
diff --git a/Code 1. Grid World/resources/down.png b/1-grid-world/img/down.png
similarity index 100%
rename from Code 1. Grid World/resources/down.png
rename to 1-grid-world/img/down.png
diff --git a/Code 1. Grid World/resources/left.png b/1-grid-world/img/left.png
similarity index 100%
rename from Code 1. Grid World/resources/left.png
rename to 1-grid-world/img/left.png
diff --git a/Code 1. Grid World/resources/rectangle.png b/1-grid-world/img/rectangle.png
similarity index 100%
rename from Code 1. Grid World/resources/rectangle.png
rename to 1-grid-world/img/rectangle.png
diff --git a/Code 1. Grid World/resources/right.png b/1-grid-world/img/right.png
similarity index 100%
rename from Code 1. Grid World/resources/right.png
rename to 1-grid-world/img/right.png
diff --git a/Code 1. Grid World/resources/triangle.png b/1-grid-world/img/triangle.png
similarity index 100%
rename from Code 1. Grid World/resources/triangle.png
rename to 1-grid-world/img/triangle.png
diff --git a/Code 1. Grid World/resources/up.png b/1-grid-world/img/up.png
similarity index 100%
rename from Code 1. Grid World/resources/up.png
rename to 1-grid-world/img/up.png
diff --git a/2-cartpole/1-dqn/SumTree.py b/2-cartpole/1-dqn/SumTree.py
new file mode 100644
index 00000000..1b72e9ea
--- /dev/null
+++ b/2-cartpole/1-dqn/SumTree.py
@@ -0,0 +1,55 @@
+import numpy
+
+
+class SumTree:
+ write = 0
+
+ def __init__(self, capacity):
+ self.capacity = capacity
+ self.tree = numpy.zeros(2 * capacity - 1)
+ self.data = numpy.zeros(capacity, dtype=object)
+
+ def _propagate(self, idx, change):
+ parent = (idx - 1) // 2
+
+ self.tree[parent] += change
+
+ if parent != 0:
+ self._propagate(parent, change)
+
+ def _retrieve(self, idx, s):
+ left = 2 * idx + 1
+ right = left + 1
+
+ if left >= len(self.tree):
+ return idx
+
+ if s <= self.tree[left]:
+ return self._retrieve(left, s)
+ else:
+ return self._retrieve(right, s - self.tree[left])
+
+ def total(self):
+ return self.tree[0]
+
+ def add(self, p, data):
+ idx = self.write + self.capacity - 1
+
+ self.data[self.write] = data
+ self.update(idx, p)
+
+ self.write += 1
+ if self.write >= self.capacity:
+ self.write = 0
+
+ def update(self, idx, p):
+ change = p - self.tree[idx]
+
+ self.tree[idx] = p
+ self._propagate(idx, change)
+
+ def get(self, s):
+ idx = self._retrieve(0, s)
+ dataIdx = idx - self.capacity + 1
+
+ return (idx, self.tree[idx], self.data[dataIdx])
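A usage sketch for the SumTree above, assuming `SumTree.py` is on the path: priorities define cumulative slices of `total()`, and `get(s)` walks down to the leaf whose slice contains `s`, so higher-priority items are sampled proportionally more often.

```python
from SumTree import SumTree

tree = SumTree(4)
for p, name in [(1.0, 'a'), (2.0, 'b'), (3.0, 'c'), (4.0, 'd')]:
    tree.add(p, name)

print(tree.total())               # 10.0
idx, priority, data = tree.get(6.5)
print(priority, data)             # 4.0 d  (slices: a:[0,1) b:[1,3) c:[3,6) d:[6,10))
tree.update(idx, 0.5)             # changing one leaf propagates to the root sum
print(tree.total())               # 6.5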
diff --git a/Code 2. Cartpole/1. DQN/Cartpole_DQN.py b/2-cartpole/1-dqn/cartpole_dqn.py
similarity index 68%
rename from Code 2. Cartpole/1. DQN/Cartpole_DQN.py
rename to 2-cartpole/1-dqn/cartpole_dqn.py
index 84802b76..8b2baaf0 100644
--- a/Code 2. Cartpole/1. DQN/Cartpole_DQN.py
+++ b/2-cartpole/1-dqn/cartpole_dqn.py
@@ -11,19 +11,20 @@
EPISODES = 300
-# this is DQN Agent for the Cartpole
+# DQN Agent for the Cartpole
# it uses Neural Network to approximate q function
# and replay memory & target q network
class DQNAgent:
def __init__(self, state_size, action_size):
# if you want to see Cartpole learning, then change to True
self.render = False
+ self.load_model = False
# get size of state and action
self.state_size = state_size
self.action_size = action_size
- # these is hyper parameters for the DQN
+ # These are hyperparameters for the DQN
self.discount_factor = 0.99
self.learning_rate = 0.001
self.epsilon = 1.0
@@ -37,17 +38,23 @@ def __init__(self, state_size, action_size):
# create main model and target model
self.model = self.build_model()
self.target_model = self.build_model()
- # copy the model to target model
- # --> initialize the target model so that the parameters of model & target model to be same
+
+ # initialize target model
self.update_target_model()
+ if self.load_model:
+ self.model.load_weights("./save_model/cartpole_dqn.h5")
+
# approximate Q function using Neural Network
# state is input and Q Value of each action is output of network
def build_model(self):
model = Sequential()
- model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
- model.add(Dense(24, activation='relu', kernel_initializer='he_uniform'))
- model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
+ model.add(Dense(24, input_dim=self.state_size, activation='relu',
+ kernel_initializer='he_uniform'))
+ model.add(Dense(24, activation='relu',
+ kernel_initializer='he_uniform'))
+ model.add(Dense(self.action_size, activation='linear',
+ kernel_initializer='he_uniform'))
model.summary()
model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
return model
@@ -65,50 +72,47 @@ def get_action(self, state):
return np.argmax(q_value[0])
# save sample to the replay memory
- def replay_memory(self, state, action, reward, next_state, done):
+ def append_sample(self, state, action, reward, next_state, done):
self.memory.append((state, action, reward, next_state, done))
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
# pick samples randomly from replay memory (with batch_size)
- def train_replay(self):
+ def train_model(self):
if len(self.memory) < self.train_start:
return
batch_size = min(self.batch_size, len(self.memory))
mini_batch = random.sample(self.memory, batch_size)
update_input = np.zeros((batch_size, self.state_size))
- update_target = np.zeros((batch_size, self.action_size))
-
- for i in range(batch_size):
- state, action, reward, next_state, done = mini_batch[i]
- target = self.model.predict(state)[0]
-
- # like Q Learning, get maximum Q value at s'
- # But from target model
- if done:
- target[action] = reward
+ update_target = np.zeros((batch_size, self.state_size))
+ action, reward, done = [], [], []
+
+ for i in range(self.batch_size):
+ update_input[i] = mini_batch[i][0]
+ action.append(mini_batch[i][1])
+ reward.append(mini_batch[i][2])
+ update_target[i] = mini_batch[i][3]
+ done.append(mini_batch[i][4])
+
+ target = self.model.predict(update_input)
+ target_val = self.target_model.predict(update_target)
+
+ for i in range(self.batch_size):
+ # Q Learning: get maximum Q value at s' from target model
+ if done[i]:
+ target[i][action[i]] = reward[i]
else:
- target[action] = reward + self.discount_factor * \
- np.amax(self.target_model.predict(next_state)[0])
- update_input[i] = state
- update_target[i] = target
+ target[i][action[i]] = reward[i] + self.discount_factor * (
+ np.amax(target_val[i]))
- # make minibatch which includes target q value and predicted q value
# and do the model fit!
- self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)
-
- # load the saved model
- def load_model(self, name):
- self.model.load_weights(name)
-
- # save the model which is under training
- def save_model(self, name):
- self.model.save_weights(name)
+ self.model.fit(update_input, target, batch_size=self.batch_size,
+ epochs=1, verbose=0)
if __name__ == "__main__":
- # in case of CartPole-v1, you can play until 500 time step
+ # In the case of CartPole-v1, the maximum episode length is 500
env = gym.make('CartPole-v1')
# get size of state and action from environment
state_size = env.observation_space.shape[0]
@@ -123,7 +127,6 @@ def save_model(self, name):
score = 0
state = env.reset()
state = np.reshape(state, [1, state_size])
- # agent.load_model("./save_model/cartpole-master.h5")
while not done:
if agent.render:
@@ -137,14 +140,13 @@ def save_model(self, name):
reward = reward if not done or score == 499 else -100
# save the sample to the replay memory
- agent.replay_memory(state, action, reward, next_state, done)
+ agent.append_sample(state, action, reward, next_state, done)
# every time step do the training
- agent.train_replay()
+ agent.train_model()
score += reward
state = next_state
if done:
- env.reset()
# every episode update the target model to be same with model
agent.update_target_model()
@@ -153,9 +155,9 @@ def save_model(self, name):
scores.append(score)
episodes.append(e)
pylab.plot(episodes, scores, 'b')
- pylab.savefig("./save_graph/Cartpole_DQN14.png")
- print("episode:", e, " score:", score, " memory length:", len(agent.memory),
- " epsilon:", agent.epsilon)
+ pylab.savefig("./save_graph/cartpole_dqn.png")
+ print("episode:", e, " score:", score, " memory length:",
+ len(agent.memory), " epsilon:", agent.epsilon)
# if the mean score of the last 10 episodes is bigger than 490
# stop training
@@ -164,4 +166,4 @@ def save_model(self, name):
# save the model
if e % 50 == 0:
- agent.save_model("./save_model/Cartpole_DQN14.h5")
\ No newline at end of file
+ agent.model.save_weights("./save_model/cartpole_dqn.h5")
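The refactored `train_model` replaces per-sample `predict` calls with two batched predictions. A numpy sketch of the same target construction, with stand-in Q-values (toy numbers, 2 samples, 2 actions, assumed gamma):

```python
import numpy as np

gamma = 0.99
target = np.array([[0.5, 0.2],       # stand-in for model.predict(update_input)
                   [0.1, 0.4]])
target_val = np.array([[0.3, 0.6],   # stand-in for target_model.predict(update_target)
                       [0.7, 0.2]])
action = [0, 1]
reward = [1.0, -100.0]
done = [False, True]

for i in range(2):
    if done[i]:
        target[i][action[i]] = reward[i]
    else:
        target[i][action[i]] = reward[i] + gamma * np.amax(target_val[i])

print(target)   # row 0: [1.594, 0.2]; row 1: [0.1, -100.0]
```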
diff --git a/2-cartpole/1-dqn/cartpole_only_per.py b/2-cartpole/1-dqn/cartpole_only_per.py
new file mode 100644
index 00000000..1a66d86b
--- /dev/null
+++ b/2-cartpole/1-dqn/cartpole_only_per.py
@@ -0,0 +1,224 @@
+import sys
+import gym
+import pylab
+import random
+import numpy as np
+from SumTree import SumTree
+from collections import deque
+from keras.layers import Dense
+from keras.optimizers import Adam
+from keras.models import Sequential
+
+EPISODES = 300
+
+
+# DQN agent for the CartPole example (with prioritized experience replay)
+class DQNAgent:
+ def __init__(self, state_size, action_size):
+ self.render = False
+ self.load_model = False
+
+ # define the sizes of state and action
+ self.state_size = state_size
+ self.action_size = action_size
+
+ # DQN hyperparameters
+ self.discount_factor = 0.99
+ self.learning_rate = 0.001
+ self.epsilon = 1.0
+ self.epsilon_decay = 0.999
+ self.epsilon_min = 0.01
+ self.batch_size = 64
+ self.train_start = 2000
+ self.memory_size = 2000
+
+ # replay memory, maximum size 2000
+ self.memory = Memory(self.memory_size)
+
+ # create the model and the target model
+ self.model = self.build_model()
+ self.target_model = self.build_model()
+
+ # initialize the target model
+ self.update_target_model()
+
+ if self.load_model:
+ self.model.load_weights("./save_model/cartpole_dqn_trained.h5")
+
+ # build a neural network with the state as input and the Q-function as output
+ def build_model(self):
+ model = Sequential()
+ model.add(Dense(24, input_dim=self.state_size, activation='relu',
+ kernel_initializer='he_uniform'))
+ model.add(Dense(24, activation='relu',
+ kernel_initializer='he_uniform'))
+ model.add(Dense(self.action_size, activation='linear',
+ kernel_initializer='he_uniform'))
+ model.summary()
+ model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
+ return model
+
+ # update the target model with the model's weights
+ def update_target_model(self):
+ self.target_model.set_weights(self.model.get_weights())
+
+ # choose an action with an epsilon-greedy policy
+ def get_action(self, state):
+ if np.random.rand() <= self.epsilon:
+ return random.randrange(self.action_size)
+ else:
+ q_value = self.model.predict(state)
+ return np.argmax(q_value[0])
+
+ # save a sample to the replay memory
+ def append_sample(self, state, action, reward, next_state, done):
+ if self.epsilon == 1:
+ done = True
+
+ # compute the TD error and store it in memory together with the sample
+ target = self.model.predict([state])
+ old_val = target[0][action]
+ target_val = self.target_model.predict([next_state])
+ if done:
+ target[0][action] = reward
+ else:
+ target[0][action] = reward + self.discount_factor * (
+ np.amax(target_val[0]))
+ error = abs(old_val - target[0][action])
+
+ self.memory.add(error, (state, action, reward, next_state, done))
+
+ # train the model with a batch randomly drawn from the replay memory
+ def train_model(self):
+ if self.epsilon > self.epsilon_min:
+ self.epsilon *= self.epsilon_decay
+
+ # draw batch_size samples at random from memory
+ mini_batch = self.memory.sample(self.batch_size)
+
+ errors = np.zeros(self.batch_size)
+ states = np.zeros((self.batch_size, self.state_size))
+ next_states = np.zeros((self.batch_size, self.state_size))
+ actions, rewards, dones = [], [], []
+
+ for i in range(self.batch_size):
+ states[i] = mini_batch[i][1][0]
+ actions.append(mini_batch[i][1][1])
+ rewards.append(mini_batch[i][1][2])
+ next_states[i] = mini_batch[i][1][3]
+ dones.append(mini_batch[i][1][4])
+
+ # the model's Q-function for the current states
+ # the target model's Q-function for the next states
+ target = self.model.predict(states)
+ target_val = self.target_model.predict(next_states)
+
+ # update targets from the Bellman optimality equation
+ for i in range(self.batch_size):
+ old_val = target[i][actions[i]]
+ if dones[i]:
+ target[i][actions[i]] = rewards[i]
+ else:
+ target[i][actions[i]] = rewards[i] + self.discount_factor * (
+ np.amax(target_val[i]))
+ # store the TD error
+ errors[i] = abs(old_val - target[i][actions[i]])
+
+ # update the priorities with the TD errors
+ for i in range(self.batch_size):
+ idx = mini_batch[i][0]
+ self.memory.update(idx, errors[i])
+
+ self.model.fit(states, target, batch_size=self.batch_size,
+ epochs=1, verbose=0)
+
+
+class Memory: # stored as ( s, a, r, s_ ) in SumTree
+ e = 0.01
+ a = 0.6
+
+ def __init__(self, capacity):
+ self.tree = SumTree(capacity)
+
+ def _getPriority(self, error):
+ return (error + self.e) ** self.a
+
+ def add(self, error, sample):
+ p = self._getPriority(error)
+ self.tree.add(p, sample)
+
+ def sample(self, n):
+ batch = []
+ segment = self.tree.total() / n
+
+ for i in range(n):
+ a = segment * i
+ b = segment * (i + 1)
+
+ s = random.uniform(a, b)
+ (idx, p, data) = self.tree.get(s)
+ batch.append((idx, data))
+
+ return batch
+
+ def update(self, idx, error):
+ p = self._getPriority(error)
+ self.tree.update(idx, p)
+
+
+if __name__ == "__main__":
+ # CartPole-v1 environment, maximum of 500 time steps
+ env = gym.make('CartPole-v1')
+ state_size = env.observation_space.shape[0]
+ action_size = env.action_space.n
+
+ # create the DQN agent
+ agent = DQNAgent(state_size, action_size)
+
+ scores, episodes = [], []
+
+ step = 0
+ for e in range(EPISODES):
+ done = False
+ score = 0
+ # initialize the env
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ if agent.render:
+ env.render()
+ step += 1
+ # choose an action for the current state
+ action = agent.get_action(state)
+ # take one time step in the environment with the chosen action
+ next_state, reward, done, info = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+ # give a -10 reward if the episode ends prematurely
+ r = reward if not done or score+reward == 500 else -10
+ # save the sample to the replay memory
+ agent.append_sample(state, action, r, next_state, done)
+ # train every time step
+ if step >= agent.train_start:
+ agent.train_model()
+
+ score += reward
+ state = next_state
+
+ if done:
+ # every episode, update the target model with the model's weights
+ agent.update_target_model()
+
+ # score = score if score == 500 else score + 100
+ # log the training results every episode
+ scores.append(score)
+ episodes.append(e)
+ pylab.plot(episodes, scores, 'b')
+ pylab.savefig("./save_graph/cartpole_dqn.png")
+ print("episode:", e, " score:", score, " memory length:",
+ step if step <= agent.memory_size else agent.memory_size, " epsilon:", agent.epsilon)
+
+ # stop training if the mean score of the last 10 episodes is above 490
+ if np.mean(scores[-min(10, len(scores)):]) > 490:
+ agent.model.save_weights("./save_model/cartpole_dqn.h5")
+ sys.exit()
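Two ingredients of the prioritized replay above, restated as a standalone sketch: the priority shaping `p = (|TD error| + e) ** a` from `Memory._getPriority`, and the stratified sampling in `Memory.sample`, which draws one point from each equal segment of the total priority mass. The total mass here is a toy value.

```python
import random

e, a = 0.01, 0.6   # the constants defined on the Memory class

def get_priority(td_error):
    # the small epsilon keeps zero-error samples visitable; a < 1 flattens the skew
    return (abs(td_error) + e) ** a

total, n = 10.0, 4                 # stand-ins for tree.total() and the batch size
segment = total / n
draws = [random.uniform(segment * i, segment * (i + 1)) for i in range(n)]
# one draw per segment: every batch covers the whole priority range instead of
# repeatedly hitting only the highest-priority leaves
print(get_priority(2.0), draws)
```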
diff --git a/Code 2. Cartpole/1. DQN/save_graph/Cartpole_DQN.png b/2-cartpole/1-dqn/save_graph/Cartpole_DQN.png
similarity index 100%
rename from Code 2. Cartpole/1. DQN/save_graph/Cartpole_DQN.png
rename to 2-cartpole/1-dqn/save_graph/Cartpole_DQN.png
diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN1.h5 b/2-cartpole/1-dqn/save_model/cartpole_dqn.h5
similarity index 100%
rename from Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN1.h5
rename to 2-cartpole/1-dqn/save_model/cartpole_dqn.h5
diff --git a/Code 2. Cartpole/2. Double DQN/Cartpole_DoubleDQN.py b/2-cartpole/2-double-dqn/cartpole_ddqn.py
similarity index 71%
rename from Code 2. Cartpole/2. Double DQN/Cartpole_DoubleDQN.py
rename to 2-cartpole/2-double-dqn/cartpole_ddqn.py
index b5feb608..73c51140 100644
--- a/Code 2. Cartpole/2. Double DQN/Cartpole_DoubleDQN.py
+++ b/2-cartpole/2-double-dqn/cartpole_ddqn.py
@@ -11,14 +11,14 @@
EPISODES = 300
-# this is Double DQN Agent for the Cartpole
+# Double DQN Agent for the Cartpole
# it uses Neural Network to approximate q function
# and replay memory & target q network
class DoubleDQNAgent:
def __init__(self, state_size, action_size):
# if you want to see Cartpole learning, then change to True
self.render = False
-
+ self.load_model = False
# get size of state and action
self.state_size = state_size
self.action_size = action_size
@@ -37,17 +37,23 @@ def __init__(self, state_size, action_size):
# create main model and target model
self.model = self.build_model()
self.target_model = self.build_model()
- # copy the model to target model
- # --> initialize the target model so that the parameters of model & target model to be same
+
+ # initialize target model
self.update_target_model()
+ if self.load_model:
+ self.model.load_weights("./save_model/cartpole_ddqn.h5")
+
# approximate Q function using Neural Network
# state is input and Q Value of each action is output of network
def build_model(self):
model = Sequential()
- model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
- model.add(Dense(24, activation='relu', kernel_initializer='he_uniform'))
- model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
+ model.add(Dense(24, input_dim=self.state_size, activation='relu',
+ kernel_initializer='he_uniform'))
+ model.add(Dense(24, activation='relu',
+ kernel_initializer='he_uniform'))
+ model.add(Dense(self.action_size, activation='linear',
+ kernel_initializer='he_uniform'))
model.summary()
model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
return model
@@ -65,54 +71,54 @@ def get_action(self, state):
return np.argmax(q_value[0])
# save sample to the replay memory
- def replay_memory(self, state, action, reward, next_state, done):
+ def append_sample(self, state, action, reward, next_state, done):
self.memory.append((state, action, reward, next_state, done))
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
# pick samples randomly from replay memory (with batch_size)
- def train_replay(self):
+ def train_model(self):
if len(self.memory) < self.train_start:
return
batch_size = min(self.batch_size, len(self.memory))
mini_batch = random.sample(self.memory, batch_size)
update_input = np.zeros((batch_size, self.state_size))
- update_target = np.zeros((batch_size, self.action_size))
+ update_target = np.zeros((batch_size, self.state_size))
+ action, reward, done = [], [], []
for i in range(batch_size):
- state, action, reward, next_state, done = mini_batch[i]
- target = self.model.predict(state)[0]
+ update_input[i] = mini_batch[i][0]
+ action.append(mini_batch[i][1])
+ reward.append(mini_batch[i][2])
+ update_target[i] = mini_batch[i][3]
+ done.append(mini_batch[i][4])
+
+ target = self.model.predict(update_input)
+ target_next = self.model.predict(update_target)
+ target_val = self.target_model.predict(update_target)
+ for i in range(self.batch_size):
# like Q Learning, get maximum Q value at s'
# But from target model
- if done:
- target[action] = reward
+ if done[i]:
+ target[i][action[i]] = reward[i]
else:
# the key point of Double DQN
# selection of action is from model
# update is from target model
- a = np.argmax(self.model.predict(next_state)[0])
- target[action] = reward + self.discount_factor * \
- (self.target_model.predict(next_state)[0][a])
- update_input[i] = state
- update_target[i] = target
+ a = np.argmax(target_next[i])
+ target[i][action[i]] = reward[i] + self.discount_factor * (
+ target_val[i][a])
# make minibatch which includes target q value and predicted q value
# and do the model fit!
- self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)
-
- # load the saved model
- def load_model(self, name):
- self.model.load_weights(name)
-
- # save the model which is under training
- def save_model(self, name):
- self.model.save_weights(name)
+ self.model.fit(update_input, target, batch_size=self.batch_size,
+ epochs=1, verbose=0)
if __name__ == "__main__":
- # in case of CartPole-v1, you can play until 500 time step
+ # In the case of CartPole-v1, you can play for up to 500 time steps
env = gym.make('CartPole-v1')
# get size of state and action from environment
state_size = env.observation_space.shape[0]
@@ -127,7 +133,6 @@ def save_model(self, name):
score = 0
state = env.reset()
state = np.reshape(state, [1, state_size])
- # agent.load_model("./save_model/cartpole-master.h5")
while not done:
if agent.render:
@@ -141,14 +146,13 @@ def save_model(self, name):
reward = reward if not done or score == 499 else -100
# save the sample to the replay memory
- agent.replay_memory(state, action, reward, next_state, done)
+ agent.append_sample(state, action, reward, next_state, done)
# every time step do the training
- agent.train_replay()
+ agent.train_model()
score += reward
state = next_state
if done:
- env.reset()
# every episode update the target model to be same with model
agent.update_target_model()
@@ -157,9 +161,9 @@ def save_model(self, name):
scores.append(score)
episodes.append(e)
pylab.plot(episodes, scores, 'b')
- pylab.savefig("./save_graph/Cartpole_DoubleDQN.png")
- print("episode:", e, " score:", score, " memory length:", len(agent.memory),
- " epsilon:", agent.epsilon)
+ pylab.savefig("./save_graph/cartpole_ddqn.png")
+ print("episode:", e, " score:", score, " memory length:",
+ len(agent.memory), " epsilon:", agent.epsilon)
# if the mean score of the last 10 episodes is bigger than 490
# stop training
@@ -168,4 +172,4 @@ def save_model(self, name):
# save the model
if e % 50 == 0:
- agent.save_model("./save_model/Cartpole_DoubleDQN.h5")
\ No newline at end of file
+ agent.model.save_weights("./save_model/cartpole_ddqn.h5")
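The decoupling that the comment in `train_model` calls the key point of Double DQN, as a numpy sketch with stand-in predictions: the online model selects the next action, and the target model evaluates it. Numbers are illustrative.

```python
import numpy as np

gamma = 0.99
reward = 1.0
target_next = np.array([0.9, 0.1])  # stand-in for model.predict(next_state): selection
target_val = np.array([0.2, 0.8])   # stand-in for target_model.predict(next_state): evaluation

a = np.argmax(target_next)                          # a = 0, chosen by the online model
ddqn_target = reward + gamma * target_val[a]        # 1.0 + 0.99 * 0.2 = 1.198
dqn_target = reward + gamma * np.amax(target_val)   # 1.0 + 0.99 * 0.8 = 1.792
# the decoupled estimate avoids always taking the target net's own maximum,
# which is what drives vanilla DQN's overestimation bias
```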
diff --git a/2-cartpole/2-double-dqn/save_graph/cartpole_ddqn.png b/2-cartpole/2-double-dqn/save_graph/cartpole_ddqn.png
new file mode 100644
index 00000000..26c4fed0
Binary files /dev/null and b/2-cartpole/2-double-dqn/save_graph/cartpole_ddqn.png differ
diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN14.h5 b/2-cartpole/2-double-dqn/save_model/cartpole_ddqn.h5
similarity index 73%
rename from Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN14.h5
rename to 2-cartpole/2-double-dqn/save_model/cartpole_ddqn.h5
index d4d4bcd5..c54c9886 100644
Binary files a/Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN14.h5 and b/2-cartpole/2-double-dqn/save_model/cartpole_ddqn.h5 differ
diff --git a/2-cartpole/3-reinforce/cartpole_reinforce.py b/2-cartpole/3-reinforce/cartpole_reinforce.py
new file mode 100644
index 00000000..040234d1
--- /dev/null
+++ b/2-cartpole/3-reinforce/cartpole_reinforce.py
@@ -0,0 +1,146 @@
+import sys
+import gym
+import pylab
+import numpy as np
+from keras.layers import Dense
+from keras.models import Sequential
+from keras.optimizers import Adam
+
+EPISODES = 1000
+
+
+# This is the Policy Gradient agent for the Cartpole
+# In this example, we use the REINFORCE algorithm, which uses a Monte Carlo update rule
+class REINFORCEAgent:
+ def __init__(self, state_size, action_size):
+ # if you want to see Cartpole learning, then change to True
+ self.render = False
+ self.load_model = False
+ # get size of state and action
+ self.state_size = state_size
+ self.action_size = action_size
+
+ # These are hyperparameters for the Policy Gradient
+ self.discount_factor = 0.99
+ self.learning_rate = 0.001
+ self.hidden1, self.hidden2 = 24, 24
+
+ # create model for policy network
+ self.model = self.build_model()
+
+ # lists for the states, actions and rewards
+ self.states, self.actions, self.rewards = [], [], []
+
+ if self.load_model:
+ self.model.load_weights("./save_model/cartpole_reinforce.h5")
+
+ # approximate policy using Neural Network
+ # state is input and probability of each action is output of network
+ def build_model(self):
+ model = Sequential()
+ model.add(Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform'))
+ model.add(Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform'))
+ model.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform'))
+ model.summary()
+ # Using categorical crossentropy as a loss is a trick to easily
+ # implement the policy gradient. Categorical cross entropy is defined
+ # H(p, q) = -sum(p_i * log(q_i)). For the action taken, a, you set
+ # p_a = advantage. q_a is the output of the policy network, which is
+ # the probability of taking the action a, i.e. policy(s, a).
+ # All other p_i are zero, thus we have H(p, q) = -A * log(policy(s, a))
+ model.compile(loss="categorical_crossentropy", optimizer=Adam(lr=self.learning_rate))
+ return model
+
+ # using the output of policy network, pick action stochastically
+ def get_action(self, state):
+ policy = self.model.predict(state, batch_size=1).flatten()
+ return np.random.choice(self.action_size, 1, p=policy)[0]
+
+ # In Policy Gradient, the Q function is not available.
+ # Instead, the agent uses sample returns to evaluate the policy
+ def discount_rewards(self, rewards):
+ discounted_rewards = np.zeros_like(rewards)
+ running_add = 0
+ for t in reversed(range(0, len(rewards))):
+ running_add = running_add * self.discount_factor + rewards[t]
+ discounted_rewards[t] = running_add
+ return discounted_rewards
+
+ # save the (state, action, reward) of each step
+ def append_sample(self, state, action, reward):
+ self.states.append(state)
+ self.rewards.append(reward)
+ self.actions.append(action)
+
+ # update policy network every episode
+ def train_model(self):
+ episode_length = len(self.states)
+
+ discounted_rewards = self.discount_rewards(self.rewards)
+ discounted_rewards -= np.mean(discounted_rewards)
+ discounted_rewards /= np.std(discounted_rewards)
+
+ update_inputs = np.zeros((episode_length, self.state_size))
+ advantages = np.zeros((episode_length, self.action_size))
+
+ for i in range(episode_length):
+ update_inputs[i] = self.states[i]
+ advantages[i][self.actions[i]] = discounted_rewards[i]
+
+ self.model.fit(update_inputs, advantages, epochs=1, verbose=0)
+ self.states, self.actions, self.rewards = [], [], []
+
+if __name__ == "__main__":
+ # In the case of CartPole-v1, you can play for up to 500 time steps
+ env = gym.make('CartPole-v1')
+ # get size of state and action from environment
+ state_size = env.observation_space.shape[0]
+ action_size = env.action_space.n
+
+ # make REINFORCE agent
+ agent = REINFORCEAgent(state_size, action_size)
+
+ scores, episodes = [], []
+
+ for e in range(EPISODES):
+ done = False
+ score = 0
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ if agent.render:
+ env.render()
+
+ # get action for the current state and go one step in environment
+ action = agent.get_action(state)
+ next_state, reward, done, info = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+ reward = reward if not done or score == 499 else -100
+
+ # save the sample to the memory
+ agent.append_sample(state, action, reward)
+
+ score += reward
+ state = next_state
+
+ if done:
+ # every episode, agent learns from sample returns
+ agent.train_model()
+
+ # every episode, plot the play time
+ score = score if score == 500 else score + 100
+ scores.append(score)
+ episodes.append(e)
+ pylab.plot(episodes, scores, 'b')
+ pylab.savefig("./save_graph/cartpole_reinforce.png")
+ print("episode:", e, " score:", score)
+
+ # if the mean score of the last 10 episodes is bigger than 490
+ # stop training
+ if np.mean(scores[-min(10, len(scores)):]) > 490:
+ sys.exit()
+
+ # save the model
+ if e % 50 == 0:
+ agent.model.save_weights("./save_model/cartpole_reinforce.h5")
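A numeric check (standalone, toy numbers) of the crossentropy trick described in `build_model`: with the label vector set to the return G at the taken action and zero elsewhere, categorical crossentropy reduces exactly to the REINFORCE loss -G * log(pi(a|s)).

```python
import numpy as np

policy = np.array([0.7, 0.2, 0.1])   # network output pi(.|s)
G = 1.59                              # discounted return credited to action 1
p = np.array([0.0, G, 0.0])          # the "advantages" label passed to model.fit

crossentropy = -np.sum(p * np.log(policy))   # what categorical_crossentropy computes
reinforce_loss = -G * np.log(policy[1])      # the REINFORCE objective for action 1
assert np.isclose(crossentropy, reinforce_loss)
```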
diff --git a/2-cartpole/3-reinforce/save_graph/cartpole_reinforce.png b/2-cartpole/3-reinforce/save_graph/cartpole_reinforce.png
new file mode 100644
index 00000000..dce280f2
Binary files /dev/null and b/2-cartpole/3-reinforce/save_graph/cartpole_reinforce.png differ
diff --git a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DQN14.h5 b/2-cartpole/3-reinforce/save_model/cartpole_reinforce.h5
similarity index 67%
rename from Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DQN14.h5
rename to 2-cartpole/3-reinforce/save_model/cartpole_reinforce.h5
index 1fc158bf..18fb216b 100644
Binary files a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DQN14.h5 and b/2-cartpole/3-reinforce/save_model/cartpole_reinforce.h5 differ
diff --git a/2-cartpole/4-actor-critic/cartpole_a2c.py b/2-cartpole/4-actor-critic/cartpole_a2c.py
new file mode 100644
index 00000000..fa6310a3
--- /dev/null
+++ b/2-cartpole/4-actor-critic/cartpole_a2c.py
@@ -0,0 +1,135 @@
+import sys
+import gym
+import pylab
+import numpy as np
+from keras.layers import Dense
+from keras.models import Sequential
+from keras.optimizers import Adam
+
+EPISODES = 1000
+
+
+# A2C (Advantage Actor-Critic) agent for the Cartpole
+class A2CAgent:
+ def __init__(self, state_size, action_size):
+ # if you want to see Cartpole learning, then change to True
+ self.render = False
+ self.load_model = False
+ # get size of state and action
+ self.state_size = state_size
+ self.action_size = action_size
+ self.value_size = 1
+
+ # These are hyperparameters for the Policy Gradient
+ self.discount_factor = 0.99
+ self.actor_lr = 0.001
+ self.critic_lr = 0.005
+
+ # create models for the actor and the critic
+ self.actor = self.build_actor()
+ self.critic = self.build_critic()
+
+ if self.load_model:
+ self.actor.load_weights("./save_model/cartpole_actor.h5")
+ self.critic.load_weights("./save_model/cartpole_critic.h5")
+
+ # approximate policy and value using Neural Network
+ # actor: state is input and probability of each action is output of model
+ def build_actor(self):
+ actor = Sequential()
+ actor.add(Dense(24, input_dim=self.state_size, activation='relu',
+ kernel_initializer='he_uniform'))
+ actor.add(Dense(self.action_size, activation='softmax',
+ kernel_initializer='he_uniform'))
+ actor.summary()
+ # See note regarding crossentropy in cartpole_reinforce.py
+ actor.compile(loss='categorical_crossentropy',
+ optimizer=Adam(lr=self.actor_lr))
+ return actor
+
+ # critic: state is input and value of state is output of model
+ def build_critic(self):
+ critic = Sequential()
+ critic.add(Dense(24, input_dim=self.state_size, activation='relu',
+ kernel_initializer='he_uniform'))
+ critic.add(Dense(self.value_size, activation='linear',
+ kernel_initializer='he_uniform'))
+ critic.summary()
+ critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr))
+ return critic
+
+ # using the output of policy network, pick action stochastically
+ def get_action(self, state):
+ policy = self.actor.predict(state, batch_size=1).flatten()
+ return np.random.choice(self.action_size, 1, p=policy)[0]
+
+ # update the actor and critic networks every time step
+ def train_model(self, state, action, reward, next_state, done):
+ target = np.zeros((1, self.value_size))
+ advantages = np.zeros((1, self.action_size))
+
+ value = self.critic.predict(state)[0]
+ next_value = self.critic.predict(next_state)[0]
+
+ if done:
+ advantages[0][action] = reward - value
+ target[0][0] = reward
+ else:
+ advantages[0][action] = reward + self.discount_factor * (next_value) - value
+ target[0][0] = reward + self.discount_factor * next_value
+
+ self.actor.fit(state, advantages, epochs=1, verbose=0)
+ self.critic.fit(state, target, epochs=1, verbose=0)
+
+
+if __name__ == "__main__":
+ # In the case of CartPole-v1, the maximum episode length is 500
+ env = gym.make('CartPole-v1')
+ # get size of state and action from environment
+ state_size = env.observation_space.shape[0]
+ action_size = env.action_space.n
+
+ # make A2C agent
+ agent = A2CAgent(state_size, action_size)
+
+ scores, episodes = [], []
+
+ for e in range(EPISODES):
+ done = False
+ score = 0
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ if agent.render:
+ env.render()
+
+ action = agent.get_action(state)
+ next_state, reward, done, info = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+ # if an action makes the episode end, then give a penalty of -100
+ reward = reward if not done or score == 499 else -100
+
+ agent.train_model(state, action, reward, next_state, done)
+
+ score += reward
+ state = next_state
+
+ if done:
+ # every episode, plot the play time
+ score = score if score == 500.0 else score + 100
+ scores.append(score)
+ episodes.append(e)
+ pylab.plot(episodes, scores, 'b')
+ pylab.savefig("./save_graph/cartpole_a2c.png")
+ print("episode:", e, " score:", score)
+
+ # if the mean score of the last 10 episodes is bigger than 490
+ # stop training
+ if np.mean(scores[-min(10, len(scores)):]) > 490:
+ sys.exit()
+
+ # save the model
+ if e % 50 == 0:
+ agent.actor.save_weights("./save_model/cartpole_actor.h5")
+ agent.critic.save_weights("./save_model/cartpole_critic.h5")
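The targets that `train_model` computes, reduced to one line each with toy numbers: the critic regresses to the TD(0) return, and the actor's advantage is the TD error. A sketch of the non-terminal branch only.

```python
# Sketch of the non-terminal branch of A2CAgent.train_model (toy numbers).
gamma = 0.99
reward, value, next_value = 1.0, 2.0, 2.5   # stand-ins for the critic.predict outputs

critic_target = reward + gamma * next_value   # 3.475, the critic's regression target
advantage = critic_target - value             # 1.475, scales the actor's update
# a positive advantage pushes the policy toward the taken action, a negative one away
```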
diff --git a/2-cartpole/4-actor-critic/save_graph/cartpole_a2c.png b/2-cartpole/4-actor-critic/save_graph/cartpole_a2c.png
new file mode 100644
index 00000000..aedc6c4c
Binary files /dev/null and b/2-cartpole/4-actor-critic/save_graph/cartpole_a2c.png differ
diff --git a/Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_PG.h5 b/2-cartpole/4-actor-critic/save_model/cartpole_actor.h5
similarity index 59%
rename from Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_PG.h5
rename to 2-cartpole/4-actor-critic/save_model/cartpole_actor.h5
index 24f6b0cf..38b40bba 100644
Binary files a/Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_PG.h5 and b/2-cartpole/4-actor-critic/save_model/cartpole_actor.h5 differ
diff --git a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_ActorCritic.h5 b/2-cartpole/4-actor-critic/save_model/cartpole_critic.h5
similarity index 56%
rename from Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_ActorCritic.h5
rename to 2-cartpole/4-actor-critic/save_model/cartpole_critic.h5
index 1146b18e..4cea5ef1 100644
Binary files a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_ActorCritic.h5 and b/2-cartpole/4-actor-critic/save_model/cartpole_critic.h5 differ
diff --git a/2-cartpole/5-a3c/cartpole_a3c.py b/2-cartpole/5-a3c/cartpole_a3c.py
new file mode 100644
index 00000000..f2721849
--- /dev/null
+++ b/2-cartpole/5-a3c/cartpole_a3c.py
@@ -0,0 +1,223 @@
+import threading
+import numpy as np
+import tensorflow as tf
+import pylab
+import time
+import gym
+from keras.layers import Dense, Input
+from keras.models import Model
+from keras.optimizers import Adam
+from keras import backend as K
+
+
+# global variables for threading
+episode = 0
+scores = []
+
+EPISODES = 2000
+
+# This is the A3C (Asynchronous Advantage Actor-Critic) global agent for the Cartpole
+# In this example, we use the A3C algorithm
+class A3CAgent:
+ def __init__(self, state_size, action_size, env_name):
+ # get size of state and action
+ self.state_size = state_size
+ self.action_size = action_size
+
+ # get gym environment name
+ self.env_name = env_name
+
+ # these are hyperparameters for the A3C
+ self.actor_lr = 0.001
+ self.critic_lr = 0.001
+ self.discount_factor = .99
+ self.hidden1, self.hidden2 = 24, 24
+ self.threads = 8
+
+ # create model for actor and critic network
+ self.actor, self.critic = self.build_model()
+
+ # method for training actor and critic network
+ self.optimizer = [self.actor_optimizer(), self.critic_optimizer()]
+
+ self.sess = tf.InteractiveSession()
+ K.set_session(self.sess)
+ self.sess.run(tf.global_variables_initializer())
+
+ # approximate policy and value using Neural Network
+ # actor -> state is input and probability of each action is output of network
+ # critic -> state is input and value of state is output of network
+ # actor and critic network share first hidden layer
+ def build_model(self):
+ state = Input(batch_shape=(None, self.state_size))
+ shared = Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')(state)
+
+ actor_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform')(shared)
+ action_prob = Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')(actor_hidden)
+
+ value_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='he_uniform')(shared)
+ state_value = Dense(1, activation='linear', kernel_initializer='he_uniform')(value_hidden)
+
+ actor = Model(inputs=state, outputs=action_prob)
+ critic = Model(inputs=state, outputs=state_value)
+
+ actor._make_predict_function()
+ critic._make_predict_function()
+
+ actor.summary()
+ critic.summary()
+
+ return actor, critic
+
+ # make loss function for Policy Gradient
+ # [log(action probability) * advantages] will be input for the back prop
+ # we add entropy of action probability to loss
+ def actor_optimizer(self):
+ action = K.placeholder(shape=(None, self.action_size))
+ advantages = K.placeholder(shape=(None, ))
+
+ policy = self.actor.output
+
+ good_prob = K.sum(action * policy, axis=1)
+ eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
+ loss = -K.sum(eligibility)
+
+ entropy = K.sum(policy * K.log(policy + 1e-10), axis=1)
+
+ actor_loss = loss + 0.01*entropy
+
+ optimizer = Adam(lr=self.actor_lr)
+ updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
+ train = K.function([self.actor.input, action, advantages], [], updates=updates)
+ return train
+
+ # make loss function for Value approximation
+ def critic_optimizer(self):
+ discounted_reward = K.placeholder(shape=(None, ))
+
+ value = self.critic.output
+
+ loss = K.mean(K.square(discounted_reward - value))
+
+ optimizer = Adam(lr=self.critic_lr)
+ updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
+ train = K.function([self.critic.input, discounted_reward], [], updates=updates)
+ return train
+
+ # make (local) agents and start training
+ def train(self):
+ # self.load_model('./save_model/cartpole_a3c.h5')
+ agents = [Agent(i, self.actor, self.critic, self.optimizer, self.env_name, self.discount_factor,
+ self.action_size, self.state_size) for i in range(self.threads)]
+
+ for agent in agents:
+ agent.start()
+
+ while True:
+ time.sleep(20)
+
+ plot = scores[:]
+ pylab.plot(range(len(plot)), plot, 'b')
+ pylab.savefig("./save_graph/cartpole_a3c.png")
+
+ self.save_model('./save_model/cartpole_a3c.h5')
+
+ def save_model(self, name):
+ self.actor.save_weights(name + "_actor.h5")
+ self.critic.save_weights(name + "_critic.h5")
+
+ def load_model(self, name):
+ self.actor.load_weights(name + "_actor.h5")
+ self.critic.load_weights(name + "_critic.h5")
+
+# This is the (local) Agent class used for threading
+class Agent(threading.Thread):
+ def __init__(self, index, actor, critic, optimizer, env_name, discount_factor, action_size, state_size):
+ threading.Thread.__init__(self)
+
+ self.states = []
+ self.rewards = []
+ self.actions = []
+
+ self.index = index
+ self.actor = actor
+ self.critic = critic
+ self.optimizer = optimizer
+ self.env_name = env_name
+ self.discount_factor = discount_factor
+ self.action_size = action_size
+ self.state_size = state_size
+
+ # thread interacting with the environment
+ def run(self):
+ global episode
+ env = gym.make(self.env_name)
+ while episode < EPISODES:
+ state = env.reset()
+ score = 0
+ while True:
+ action = self.get_action(state)
+ next_state, reward, done, _ = env.step(action)
+ score += reward
+
+ self.memory(state, action, reward)
+
+ state = next_state
+
+ if done:
+ episode += 1
+ print("episode: ", episode, "/ score : ", score)
+ scores.append(score)
+ self.train_episode(score != 500)
+ break
+
+ # In Policy Gradient, the Q function is not available.
+ # Instead, the agent uses sample returns to evaluate the policy
+ def discount_rewards(self, rewards, done=True):
+ discounted_rewards = np.zeros_like(rewards)
+ running_add = 0
+ if not done:
+ running_add = self.critic.predict(np.reshape(self.states[-1], (1, self.state_size)))[0]
+ for t in reversed(range(0, len(rewards))):
+ running_add = running_add * self.discount_factor + rewards[t]
+ discounted_rewards[t] = running_add
+ return discounted_rewards
+
+ # save the (state, action, reward) of each step
+ # this is used for calculating discounted rewards
+ def memory(self, state, action, reward):
+ self.states.append(state)
+ act = np.zeros(self.action_size)
+ act[action] = 1
+ self.actions.append(act)
+ self.rewards.append(reward)
+
+ # update policy network and value network every episode
+ def train_episode(self, done):
+ discounted_rewards = self.discount_rewards(self.rewards, done)
+
+ values = self.critic.predict(np.array(self.states))
+ values = np.reshape(values, len(values))
+
+ advantages = discounted_rewards - values
+
+ self.optimizer[0]([self.states, self.actions, advantages])
+ self.optimizer[1]([self.states, discounted_rewards])
+ self.states, self.actions, self.rewards = [], [], []
+
+ def get_action(self, state):
+ policy = self.actor.predict(np.reshape(state, [1, self.state_size]))[0]
+ return np.random.choice(self.action_size, 1, p=policy)[0]
+
+
+if __name__ == "__main__":
+ env_name = 'CartPole-v1'
+ env = gym.make(env_name)
+
+ state_size = env.observation_space.shape[0]
+ action_size = env.action_space.n
+
+ env.close()
+
+ global_agent = A3CAgent(state_size, action_size, env_name)
+ global_agent.train()
diff --git a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN2.h5 b/2-cartpole/5-a3c/save_model/Cartpole_A3C_actor.h5
similarity index 58%
rename from Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN2.h5
rename to 2-cartpole/5-a3c/save_model/Cartpole_A3C_actor.h5
index c72bcd68..33ab03a5 100644
Binary files a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN2.h5 and b/2-cartpole/5-a3c/save_model/Cartpole_A3C_actor.h5 differ
diff --git a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Critic.h5 b/2-cartpole/5-a3c/save_model/Cartpole_A3C_critic.h5
similarity index 57%
rename from Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Critic.h5
rename to 2-cartpole/5-a3c/save_model/Cartpole_A3C_critic.h5
index 6ef1da98..5db01072 100644
Binary files a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Critic.h5 and b/2-cartpole/5-a3c/save_model/Cartpole_A3C_critic.h5 differ
diff --git a/Code 2. Cartpole/LICENSE b/2-cartpole/LICENSE
similarity index 100%
rename from Code 2. Cartpole/LICENSE
rename to 2-cartpole/LICENSE
diff --git a/Code 2. Cartpole/README.md b/2-cartpole/README.md
similarity index 65%
rename from Code 2. Cartpole/README.md
rename to 2-cartpole/README.md
index 6882e016..1d8d8701 100644
--- a/Code 2. Cartpole/README.md
+++ b/2-cartpole/README.md
@@ -15,15 +15,10 @@ This is graph of Double DQN algorithm







+
+
+
+2. Go into the download directory and extract the archive with the following command:
+
+   ```shell
+   $ tar xfz pycharm-community-2016.3.2.tar.gz
+   ```
+
+3. After extracting, move into the bin folder:
+
+   ```shell
+   $ cd ~/pycharm-community-2016.3.2/bin
+   ```
+
+4. Launch PyCharm with the following command:
+
+   ```shell
+   $ sh pycharm.sh
+   ```
+
+5. Once the command runs, the installation begins.
+
+6. When the installation completes, you will see the initial configuration screen shown below.
+
+   Under IDE theme, Intellij is a theme with a white background and Darcula is a theme with a black background. This book uses the Intellij theme.
+
+7. This is the screen after the initial setup is complete. From here, let's create a project.
+
+8. This screen sets the project's path and Interpreter. Create a PycharmProjects folder in the Home directory and create the project under it. You may name the project as you like; here we create it as "rlcode_book" and then set the Interpreter. The Interpreter is the Python this project will use; set it to python 3.5.
+
+9. Once the rlcode_book project is created, you will see the screen below.
+
+10. To check that PyCharm installed correctly, create a Python script file. To run the simplest example, `"Hello World"`, create a hello_world.py file as follows.
+
+11. Right-click the newly created file and a menu appears; choosing "Run 'hello_world'" runs the hello_world.py file.
+
+12. Enter the following code in the hello_world.py file:
+    ```python
+    print("hello world")
+    ```
+
+13. When you run hello_world.py, "hello world" appears in the run window as shown below. This confirms that PyCharm is installed correctly.
+
+### How to use Virtualenv (virtual environments) :happy:
+
+This completes the basic PyCharm setup. When you work on several projects on one computer, each project may need a different development environment, and juggling differing environments can be quite inconvenient. Managing a separate development environment per project therefore has real advantages, and VirtualEnv provides exactly that: with VirtualEnv you can create a virtual development environment dedicated to this book's projects.
+
+Because PyCharm supports VirtualEnv, we will explain how to use VirtualEnv from within PyCharm. There are several ways to install and use VirtualEnv, but with the PyCharm installation above you can use VirtualEnv through a GUI (graphical user interface). PyCharm also provides tools for managing the external Python libraries installed in the virtual environment.
+
+**Using VirtualEnv in PyCharm works as follows.**
+
+1. Click "Settings" in the "File" menu.
+
+2. In the list on the left side of Settings, click Project Interpreter under "Project: <project name>". Then, on the right side of the Project Interpreter tab, click "Create VirtualEnv".
+
+3. Enter a name for the virtual environment; the /home/brian/rlcode_book directory is created and with it the virtual environment.
+
+4. If the terminal prompt shows (rlcode_book) as below, a virtual environment named rlcode_book has been created. From now on we will use it as the virtual environment for this book.
+
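+   As a quick sanity check (a minimal sketch; this step is optional and not part of the book's instructions), you can confirm from inside Python which interpreter is active, assuming the virtual environment was created at /home/brian/rlcode_book as in step 3:
+
+   ```python
+   import sys
+
+   # inside a virtualenv, sys.prefix points at the environment's directory,
+   # e.g. /home/brian/rlcode_book
+   print(sys.prefix)
+   ```
+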
+### 2.1.3 Installing and testing OpenAI Gym
+
+OpenAI was founded in 2016. The company's goal is to open artificial intelligence technology to the world, making AI safer and bringing it to more fields. OpenAI Gym is an environment built by OpenAI in which you can test a variety of AI algorithms.
+
+All of OpenAI Gym's code is uploaded to OpenAI's GitHub[[2\]](#_ftn2).
+
+Installing OpenAI Gym is explained on its official homepage. Before installing OpenAI Gym you first need to install Git, a version-control tool used whenever a development process needs version management. OpenAI Gym is open source, published on GitHub, a platform that acts as a remote repository for version-controlled source code.
+
+Install Git with the following command:
+
+```shell
+$ sudo apt-get install git
+```
+
+After installing Git, install OpenAI Gym. In a terminal window, move to the directory where you want to install OpenAI Gym and run the following commands:
+
+```shell
+$ git clone https://github.com/openai/gym
+$ cd gym
+$ pip3 install -e .
+```
+
+OpenAI Gym can be installed in several different configurations; `pip3 install -e .` installs only the core components. To use every environment OpenAI Gym offers later on, including the Atari games, run the following instead of `pip3 install -e .`:
+
+```shell
+$ pip3 install -e .[all]
+```
+
+To check that OpenAI Gym installed correctly, run a simple example. The simplest OpenAI Gym example is CartPole, a cart with a pole attached to it; the goal of the problem is to move the cart so that the pole swings upright. For this test we will give the CartPole no input of our own and simply check that OpenAI Gym runs properly.
+
+Create a `CartPole.py` file and enter the contents of Code 2.1:
+
+```python
+import gym
+env = gym.make('CartPole-v0')
+env.reset()
+for _ in range(1000):
+ env.render()
+ env.step(env.action_space.sample()) # take a random action
+```
+
+ Code 2.1 Code that runs the CartPole example
+
+
+Running this code opens a CartPole that takes no meaningful actions. OpenAI Gym provides many problems like this one, and users can apply their own learning algorithms to them. You can also share your algorithm on the OpenAI Gym site and check your results there.
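+
+Beyond simply rendering, every algorithm in this book builds on the same interaction loop: reset the environment, choose an action, step, and collect the reward until the episode ends. The sketch below (a minimal illustration, with a random policy standing in for a learning algorithm) shows that loop:
+
+```python
+import gym
+
+env = gym.make('CartPole-v0')
+
+for episode in range(5):
+    state = env.reset()
+    score, done = 0, False
+    while not done:
+        env.render()
+        # a random policy stands in for a learning algorithm
+        action = env.action_space.sample()
+        next_state, reward, done, info = env.step(action)
+        score += reward
+        state = next_state
+    print("episode:", episode, "/ score:", score)
+
+env.close()
+```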
+
+## 2.2 Development environment setup 2: macOS
+
+macOS ships with Python 2.7 by default, so you need to install version 3.5 yourself.
+
+### 2.2.1 Installing and configuring Python 3.5
+
+When you open the Python download page[[3\]](#_ftn3), you will see the following screen.
+
+1. Select and download the file that matches your macOS version. Run the downloaded file and follow the installer's instructions to complete the installation.
+
+2. To confirm that Python installed correctly, open a terminal. If entering the `python3` command produces output like the screen below, the installation succeeded.
+
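+   As an optional extra check (a minimal sketch, not part of the book's steps), you can also confirm the version from inside the interpreter:
+
+   ```python
+   import sys
+
+   # expect major=3, minor=5 if the new 3.5 installation is the one being run
+   print(sys.version_info)
+   ```
+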
+### 2.2.2 Installing and configuring PyCharm Community
+
+Install and configure PyCharm in the following order:
+
+1. Go to the PyCharm homepage and download the Community edition.
+
+2. Run the downloaded file and drag the PyCharm CE icon on the left onto the folder icon on the right, as shown below, to complete the installation.
+
+3. The first time you run PyCharm, a configuration screen appears; the IDE theme option changes the IDE's colors and style. Default is the Intellij theme we saw in the Ubuntu setup. This book uses Default.
+
+4. Finish the initial setup and click the Create New Project button.
+
+5. Clicking Create New Project opens the screen below. Location sets the path where the project is created and the name of the project folder; you may choose any name and path you like.
+
+   Interpreter selects which Python interpreter the project will use. As on Ubuntu, we will create a virtual environment with VirtualEnv and use it as the interpreter. Click the Create VirtualEnv button.
+
+6. The screen below creates the VirtualEnv. Choose any Name and Location you like. For Base Interpreter, select the newly installed python3.5. Click OK and the VirtualEnv is created.
+
+7. Back on the New Project screen, select the VirtualEnv you just created as the Interpreter. Then click the Create button and the project is created.
+
+8. After creating the project you will see the workspace below. Right-click the top-level folder and choose New -> Python File to create a new Python file.
+
+9. To check that PyCharm installed correctly, run the hello world example. It is the same as on Ubuntu, so we omit it here.
+
+### 2.2.3 Installing and testing OpenAI Gym
+
+Installing OpenAI Gym and running the CartPole example are the same as on Ubuntu, so we omit them here.
+
+------
+
+[[1\]](#_ftnref1) https://www.jetbrains.com/pycharm/
+
+[[2\]](#_ftnref2) https://github.com/openai/gym
+
+[[3\]](#_ftnref3) https://www.python.org/downloads/release/python-350/
diff --git a/wiki/rlcode_image/cartpole_exam.png b/wiki/rlcode_image/cartpole_exam.png
new file mode 100644
index 00000000..3b8674b6
Binary files /dev/null and b/wiki/rlcode_image/cartpole_exam.png differ
diff --git a/wiki/rlcode_image/console_hello_world.png b/wiki/rlcode_image/console_hello_world.png
new file mode 100644
index 00000000..e14c6a01
Binary files /dev/null and b/wiki/rlcode_image/console_hello_world.png differ
diff --git a/wiki/rlcode_image/default_config.png b/wiki/rlcode_image/default_config.png
new file mode 100644
index 00000000..7f9e4794
Binary files /dev/null and b/wiki/rlcode_image/default_config.png differ
diff --git a/wiki/rlcode_image/file_setting.png b/wiki/rlcode_image/file_setting.png
new file mode 100644
index 00000000..264bb279
Binary files /dev/null and b/wiki/rlcode_image/file_setting.png differ
diff --git a/wiki/rlcode_image/hello_world_ubuntu.png b/wiki/rlcode_image/hello_world_ubuntu.png
new file mode 100644
index 00000000..ede75b89
Binary files /dev/null and b/wiki/rlcode_image/hello_world_ubuntu.png differ
diff --git a/wiki/rlcode_image/openai_github.png b/wiki/rlcode_image/openai_github.png
new file mode 100644
index 00000000..e5422484
Binary files /dev/null and b/wiki/rlcode_image/openai_github.png differ
diff --git a/wiki/rlcode_image/project_interpreter.png b/wiki/rlcode_image/project_interpreter.png
new file mode 100644
index 00000000..b22f24cc
Binary files /dev/null and b/wiki/rlcode_image/project_interpreter.png differ
diff --git a/wiki/rlcode_image/pycham_new_project.png b/wiki/rlcode_image/pycham_new_project.png
new file mode 100644
index 00000000..bfd309eb
Binary files /dev/null and b/wiki/rlcode_image/pycham_new_project.png differ
diff --git a/wiki/rlcode_image/pycharm_community.png b/wiki/rlcode_image/pycharm_community.png
new file mode 100644
index 00000000..3e4f1967
Binary files /dev/null and b/wiki/rlcode_image/pycharm_community.png differ
diff --git a/wiki/rlcode_image/pycharm_drag.png b/wiki/rlcode_image/pycharm_drag.png
new file mode 100644
index 00000000..3fd2faa2
Binary files /dev/null and b/wiki/rlcode_image/pycharm_drag.png differ
diff --git a/wiki/rlcode_image/pycharm_init.png b/wiki/rlcode_image/pycharm_init.png
new file mode 100644
index 00000000..b2fa23c7
Binary files /dev/null and b/wiki/rlcode_image/pycharm_init.png differ
diff --git a/wiki/rlcode_image/python3_terminal.jpg b/wiki/rlcode_image/python3_terminal.jpg
new file mode 100644
index 00000000..38fe67ac
Binary files /dev/null and b/wiki/rlcode_image/python3_terminal.jpg differ
diff --git a/wiki/rlcode_image/python_download.png b/wiki/rlcode_image/python_download.png
new file mode 100644
index 00000000..24922c44
Binary files /dev/null and b/wiki/rlcode_image/python_download.png differ
diff --git a/wiki/rlcode_image/python_installed.png b/wiki/rlcode_image/python_installed.png
new file mode 100644
index 00000000..a6dae073
Binary files /dev/null and b/wiki/rlcode_image/python_installed.png differ
diff --git a/wiki/rlcode_image/python_intalled.png b/wiki/rlcode_image/python_intalled.png
new file mode 100644
index 00000000..a6dae073
Binary files /dev/null and b/wiki/rlcode_image/python_intalled.png differ
diff --git a/wiki/rlcode_image/rl_book_hello_world.png b/wiki/rlcode_image/rl_book_hello_world.png
new file mode 100644
index 00000000..5588e095
Binary files /dev/null and b/wiki/rlcode_image/rl_book_hello_world.png differ
diff --git a/wiki/rlcode_image/rl_book_project.png b/wiki/rlcode_image/rl_book_project.png
new file mode 100644
index 00000000..b1603305
Binary files /dev/null and b/wiki/rlcode_image/rl_book_project.png differ
diff --git a/wiki/rlcode_image/rl_book_venv.png b/wiki/rlcode_image/rl_book_venv.png
new file mode 100644
index 00000000..c86d7d94
Binary files /dev/null and b/wiki/rlcode_image/rl_book_venv.png differ
diff --git a/wiki/rlcode_image/rl_book_virtualenv.png b/wiki/rlcode_image/rl_book_virtualenv.png
new file mode 100644
index 00000000..dc783044
Binary files /dev/null and b/wiki/rlcode_image/rl_book_virtualenv.png differ
diff --git a/wiki/rlcode_image/rlcode_book_directory.png b/wiki/rlcode_image/rlcode_book_directory.png
new file mode 100644
index 00000000..f1c13cd3
Binary files /dev/null and b/wiki/rlcode_image/rlcode_book_directory.png differ
diff --git a/wiki/rlcode_image/rlcode_project.png b/wiki/rlcode_image/rlcode_project.png
new file mode 100644
index 00000000..d9c89be6
Binary files /dev/null and b/wiki/rlcode_image/rlcode_project.png differ
diff --git a/wiki/rlcode_image/run_hello_world.png b/wiki/rlcode_image/run_hello_world.png
new file mode 100644
index 00000000..570e979d
Binary files /dev/null and b/wiki/rlcode_image/run_hello_world.png differ
diff --git a/wiki/rlcode_image/sh_pycharm.sh.png b/wiki/rlcode_image/sh_pycharm.sh.png
new file mode 100644
index 00000000..19708444
Binary files /dev/null and b/wiki/rlcode_image/sh_pycharm.sh.png differ
diff --git a/wiki/rlcode_image/terminal_rlcode_book.png b/wiki/rlcode_image/terminal_rlcode_book.png
new file mode 100644
index 00000000..38279352
Binary files /dev/null and b/wiki/rlcode_image/terminal_rlcode_book.png differ