diff --git a/.gitignore b/.gitignore index ab5a28a0..695bdda2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ *.pydevproject .idea/ .DS_Store -__pycache__ \ No newline at end of file +__pycache__ +./Code 2. Cartpole/6. A3C/Cartpole_A3C.pgy \ No newline at end of file diff --git a/1-grid-world/1-policy-iteration/environment.py b/1-grid-world/1-policy-iteration/environment.py new file mode 100644 index 00000000..910d4ba8 --- /dev/null +++ b/1-grid-world/1-policy-iteration/environment.py @@ -0,0 +1,245 @@ +import tkinter as tk +from tkinter import Button +import time +import numpy as np +from PIL import ImageTk, Image + +PhotoImage = ImageTk.PhotoImage +UNIT = 100 # pixels +HEIGHT = 5 # grid height +WIDTH = 5 # grid width +TRANSITION_PROB = 1 +POSSIBLE_ACTIONS = [0, 1, 2, 3] # up, down, left, right +ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # actions in coordinates +REWARDS = [] + + +class GraphicDisplay(tk.Tk): + def __init__(self, agent): + super(GraphicDisplay, self).__init__() + self.title('Policy Iteration') + self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) + self.texts = [] + self.arrows = [] + self.env = Env() + self.agent = agent + self.evaluation_count = 0 + self.improvement_count = 0 + self.is_moving = 0 + (self.up, self.down, self.left, self.right), self.shapes = self.load_images() + self.canvas = self._build_canvas() + self.text_reward(2, 2, "R : 1.0") + self.text_reward(1, 2, "R : -1.0") + self.text_reward(2, 1, "R : -1.0") + + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) + # buttons + iteration_button = Button(self, text="Evaluate", + command=self.evaluate_policy) + iteration_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.13, HEIGHT * UNIT + 10, + window=iteration_button) + policy_button = Button(self, text="Improve", + command=self.improve_policy) + policy_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.37, HEIGHT * UNIT + 10, + window=policy_button) + policy_button = Button(self, text="move", command=self.move_by_policy) + policy_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.62, HEIGHT * UNIT + 10, + window=policy_button) + policy_button = Button(self, text="reset", command=self.reset) + policy_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.87, HEIGHT * UNIT + 10, + window=policy_button) + + # create grids + for col in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT + canvas.create_line(x0, y0, x1, y1) + for row in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row + canvas.create_line(x0, y0, x1, y1) + + # add img to canvas + self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) + canvas.create_image(250, 150, image=self.shapes[1]) + canvas.create_image(150, 250, image=self.shapes[1]) + canvas.create_image(250, 250, image=self.shapes[2]) + + # pack all + canvas.pack() + + return canvas + + def load_images(self): + up = PhotoImage(Image.open("../img/up.png").resize((13, 13))) + right = PhotoImage(Image.open("../img/right.png").resize((13, 13))) + left = PhotoImage(Image.open("../img/left.png").resize((13, 13))) + down = PhotoImage(Image.open("../img/down.png").resize((13, 13))) + rectangle = PhotoImage(Image.open("../img/rectangle.png").resize((65, 65))) + triangle = 
PhotoImage(Image.open("../img/triangle.png").resize((65, 65))) + circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65))) + return (up, down, left, right), (rectangle, triangle, circle) + + def reset(self): + if self.is_moving == 0: + self.evaluation_count = 0 + self.improvement_count = 0 + for i in self.texts: + self.canvas.delete(i) + + for i in self.arrows: + self.canvas.delete(i) + self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)] + self.agent.policy_table = ([[[0.25, 0.25, 0.25, 0.25]] * WIDTH + for _ in range(HEIGHT)]) + self.agent.policy_table[2][2] = [] + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + + def text_value(self, row, col, contents, font='Helvetica', size=10, + style='normal', anchor="nw"): + origin_x, origin_y = 85, 70 + x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) + font = (font, str(size), style) + text = self.canvas.create_text(x, y, fill="black", text=contents, + font=font, anchor=anchor) + return self.texts.append(text) + + def text_reward(self, row, col, contents, font='Helvetica', size=10, + style='normal', anchor="nw"): + origin_x, origin_y = 5, 5 + x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) + font = (font, str(size), style) + text = self.canvas.create_text(x, y, fill="black", text=contents, + font=font, anchor=anchor) + return self.texts.append(text) + + def rectangle_move(self, action): + base_action = np.array([0, 0]) + location = self.find_rectangle() + self.render() + if action == 0 and location[0] > 0: # up + base_action[1] -= UNIT + elif action == 1 and location[0] < HEIGHT - 1: # down + base_action[1] += UNIT + elif action == 2 and location[1] > 0: # left + base_action[0] -= UNIT + elif action == 3 and location[1] < WIDTH - 1: # right + base_action[0] += UNIT + # move agent + self.canvas.move(self.rectangle, base_action[0], base_action[1]) + + def find_rectangle(self): + temp = self.canvas.coords(self.rectangle) + x = (temp[0] / 100) - 0.5 + y = (temp[1] / 100) - 0.5 + return int(y), int(x) + + def move_by_policy(self): + if self.improvement_count != 0 and self.is_moving != 1: + self.is_moving = 1 + + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + + x, y = self.find_rectangle() + while len(self.agent.policy_table[x][y]) != 0: + self.after(100, + self.rectangle_move(self.agent.get_action([x, y]))) + x, y = self.find_rectangle() + self.is_moving = 0 + + def draw_one_arrow(self, col, row, policy): + if col == 2 and row == 2: + return + + if policy[0] > 0: # up + origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.up)) + if policy[1] > 0: # down + origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.down)) + if policy[2] > 0: # left + origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.left)) + if policy[3] > 0: # right + origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.right)) + + def draw_from_policy(self, policy_table): + for i in range(HEIGHT): + for j in range(WIDTH): + self.draw_one_arrow(i, j, policy_table[i][j]) + + def print_value_table(self, value_table): + for i in range(WIDTH): + for j in range(HEIGHT): + self.text_value(i, j, 
value_table[i][j]) + + def render(self): + time.sleep(0.1) + self.canvas.tag_raise(self.rectangle) + self.update() + + def evaluate_policy(self): + self.evaluation_count += 1 + for i in self.texts: + self.canvas.delete(i) + self.agent.policy_evaluation() + self.print_value_table(self.agent.value_table) + + def improve_policy(self): + self.improvement_count += 1 + for i in self.arrows: + self.canvas.delete(i) + self.agent.policy_improvement() + self.draw_from_policy(self.agent.policy_table) + + +class Env: + def __init__(self): + self.transition_probability = TRANSITION_PROB + self.width = WIDTH + self.height = HEIGHT + self.reward = [[0] * WIDTH for _ in range(HEIGHT)] + self.possible_actions = POSSIBLE_ACTIONS + self.reward[2][2] = 1 # reward 1 for circle + self.reward[1][2] = -1 # reward -1 for triangle + self.reward[2][1] = -1 # reward -1 for triangle + self.all_state = [] + + for x in range(WIDTH): + for y in range(HEIGHT): + state = [x, y] + self.all_state.append(state) + + def get_reward(self, state, action): + next_state = self.state_after_action(state, action) + return self.reward[next_state[0]][next_state[1]] + + def state_after_action(self, state, action_index): + action = ACTIONS[action_index] + return self.check_boundary([state[0] + action[0], state[1] + action[1]]) + + @staticmethod + def check_boundary(state): + state[0] = (0 if state[0] < 0 else WIDTH - 1 + if state[0] > WIDTH - 1 else state[0]) + state[1] = (0 if state[1] < 0 else HEIGHT - 1 + if state[1] > HEIGHT - 1 else state[1]) + return state + + def get_transition_prob(self, state, action): + return self.transition_probability + + def get_all_states(self): + return self.all_state diff --git a/1-grid-world/1-policy-iteration/policy_iteration.py b/1-grid-world/1-policy-iteration/policy_iteration.py new file mode 100644 index 00000000..d6dc414e --- /dev/null +++ b/1-grid-world/1-policy-iteration/policy_iteration.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +import random +from environment import GraphicDisplay, Env + + +class PolicyIteration: + def __init__(self, env): + self.env = env + # 2-d list for the value function + self.value_table = [[0.0] * env.width for _ in range(env.height)] + # list of random policy (same probability of up, down, left, right) + self.policy_table = [[[0.25, 0.25, 0.25, 0.25]] * env.width + for _ in range(env.height)] + # setting terminal state + self.policy_table[2][2] = [] + self.discount_factor = 0.9 + + def policy_evaluation(self): + next_value_table = [[0.00] * self.env.width + for _ in range(self.env.height)] + + # Bellman Expectation Equation for the every states + for state in self.env.get_all_states(): + value = 0.0 + # keep the value function of terminal states as 0 + if state == [2, 2]: + next_value_table[state[0]][state[1]] = value + continue + + for action in self.env.possible_actions: + next_state = self.env.state_after_action(state, action) + reward = self.env.get_reward(state, action) + next_value = self.get_value(next_state) + value += (self.get_policy(state)[action] * + (reward + self.discount_factor * next_value)) + + next_value_table[state[0]][state[1]] = round(value, 2) + + self.value_table = next_value_table + + def policy_improvement(self): + next_policy = self.policy_table + for state in self.env.get_all_states(): + if state == [2, 2]: + continue + value = -99999 + max_index = [] + result = [0.0, 0.0, 0.0, 0.0] # initialize the policy + + # for every actions, calculate + # [reward + (discount factor) * (next state value function)] + for index, action in 
enumerate(self.env.possible_actions): + next_state = self.env.state_after_action(state, action) + reward = self.env.get_reward(state, action) + next_value = self.get_value(next_state) + temp = reward + self.discount_factor * next_value + + # We normally can't pick multiple actions in greedy policy. + # but here we allow multiple actions with same max values + if temp == value: + max_index.append(index) + elif temp > value: + value = temp + max_index.clear() + max_index.append(index) + + # probability of action + prob = 1 / len(max_index) + + for index in max_index: + result[index] = prob + + next_policy[state[0]][state[1]] = result + + self.policy_table = next_policy + + # get action according to the current policy + def get_action(self, state): + random_pick = random.randrange(100) / 100 + + policy = self.get_policy(state) + policy_sum = 0.0 + # return the action in the index + for index, value in enumerate(policy): + policy_sum += value + if random_pick < policy_sum: + return index + + # get policy of specific state + def get_policy(self, state): + if state == [2, 2]: + return 0.0 + return self.policy_table[state[0]][state[1]] + + def get_value(self, state): + return round(self.value_table[state[0]][state[1]], 2) + +if __name__ == "__main__": + env = Env() + policy_iteration = PolicyIteration(env) + grid_world = GraphicDisplay(policy_iteration) + grid_world.mainloop() diff --git a/1-grid-world/2-value-iteration/environment.py b/1-grid-world/2-value-iteration/environment.py new file mode 100644 index 00000000..81af3dc5 --- /dev/null +++ b/1-grid-world/2-value-iteration/environment.py @@ -0,0 +1,261 @@ +import tkinter as tk +import time +import numpy as np +import random +from PIL import ImageTk, Image + +PhotoImage = ImageTk.PhotoImage +UNIT = 100 # pixels +HEIGHT = 5 # grid height +WIDTH = 5 # grid width +TRANSITION_PROB = 1 +POSSIBLE_ACTIONS = [0, 1, 2, 3] # up, down, left, right +ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # actions in coordinates +REWARDS = [] + + +class GraphicDisplay(tk.Tk): + def __init__(self, value_iteration): + super(GraphicDisplay, self).__init__() + self.title('Value Iteration') + self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) + self.texts = [] + self.arrows = [] + self.env = Env() + self.agent = value_iteration + self.iteration_count = 0 + self.improvement_count = 0 + self.is_moving = 0 + (self.up, self.down, self.left, + self.right), self.shapes = self.load_images() + self.canvas = self._build_canvas() + self.text_reward(2, 2, "R : 1.0") + self.text_reward(1, 2, "R : -1.0") + self.text_reward(2, 1, "R : -1.0") + + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) + # buttons + iteration_button = tk.Button(self, text="Calculate", + command=self.calculate_value) + iteration_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10, + window=iteration_button) + + policy_button = tk.Button(self, text="Print Policy", + command=self.print_optimal_policy) + policy_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10, + window=policy_button) + + policy_button = tk.Button(self, text="Move", + command=self.move_by_policy) + policy_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10, + window=policy_button) + + policy_button = tk.Button(self, text="Clear", command=self.clear) + 
policy_button.configure(width=10, activebackground="#33B5E5") + canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10, + window=policy_button) + + # create grids + for col in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT + canvas.create_line(x0, y0, x1, y1) + for row in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row + canvas.create_line(x0, y0, x1, y1) + + # add img to canvas + self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) + canvas.create_image(250, 150, image=self.shapes[1]) + canvas.create_image(150, 250, image=self.shapes[1]) + canvas.create_image(250, 250, image=self.shapes[2]) + + # pack all + canvas.pack() + + return canvas + + def load_images(self): + PhotoImage = ImageTk.PhotoImage + up = PhotoImage(Image.open("../img/up.png").resize((13, 13))) + right = PhotoImage(Image.open("../img/right.png").resize((13, 13))) + left = PhotoImage(Image.open("../img/left.png").resize((13, 13))) + down = PhotoImage(Image.open("../img/down.png").resize((13, 13))) + rectangle = PhotoImage( + Image.open("../img/rectangle.png").resize((65, 65))) + triangle = PhotoImage( + Image.open("../img/triangle.png").resize((65, 65))) + circle = PhotoImage(Image.open("../img/circle.png").resize((65, 65))) + return (up, down, left, right), (rectangle, triangle, circle) + + def clear(self): + + if self.is_moving == 0: + self.iteration_count = 0 + self.improvement_count = 0 + for i in self.texts: + self.canvas.delete(i) + + for i in self.arrows: + self.canvas.delete(i) + + self.agent.value_table = [[0.0] * WIDTH for _ in range(HEIGHT)] + + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + + def reset(self): + self.update() + time.sleep(0.5) + self.canvas.delete(self.rectangle) + return self.canvas.coords(self.rectangle) + + def text_value(self, row, col, contents, font='Helvetica', size=12, + style='normal', anchor="nw"): + origin_x, origin_y = 85, 70 + x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) + font = (font, str(size), style) + text = self.canvas.create_text(x, y, fill="black", text=contents, + font=font, anchor=anchor) + return self.texts.append(text) + + def text_reward(self, row, col, contents, font='Helvetica', size=12, + style='normal', anchor="nw"): + origin_x, origin_y = 5, 5 + x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) + font = (font, str(size), style) + text = self.canvas.create_text(x, y, fill="black", text=contents, + font=font, anchor=anchor) + return self.texts.append(text) + + def rectangle_move(self, action): + base_action = np.array([0, 0]) + location = self.find_rectangle() + self.render() + if action == 0 and location[0] > 0: # up + base_action[1] -= UNIT + elif action == 1 and location[0] < HEIGHT - 1: # down + base_action[1] += UNIT + elif action == 2 and location[1] > 0: # left + base_action[0] -= UNIT + elif action == 3 and location[1] < WIDTH - 1: # right + base_action[0] += UNIT + + self.canvas.move(self.rectangle, base_action[0], + base_action[1]) # move agent + + def find_rectangle(self): + temp = self.canvas.coords(self.rectangle) + x = (temp[0] / 100) - 0.5 + y = (temp[1] / 100) - 0.5 + return int(y), int(x) + + def move_by_policy(self): + + if self.improvement_count != 0 and self.is_moving != 1: + self.is_moving = 1 + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + + x, y = self.find_rectangle() + while 
len(self.agent.get_action([x, y])) != 0: + action = random.sample(self.agent.get_action([x, y]), 1)[0] + self.after(100, self.rectangle_move(action)) + x, y = self.find_rectangle() + self.is_moving = 0 + + def draw_one_arrow(self, col, row, action): + if col == 2 and row == 2: + return + if action == 0: # up + origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.up)) + elif action == 1: # down + origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.down)) + elif action == 3: # right + origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.right)) + elif action == 2: # left + origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col) + self.arrows.append(self.canvas.create_image(origin_x, origin_y, + image=self.left)) + + def draw_from_values(self, state, action_list): + i = state[0] + j = state[1] + for action in action_list: + self.draw_one_arrow(i, j, action) + + def print_values(self, values): + for i in range(WIDTH): + for j in range(HEIGHT): + self.text_value(i, j, values[i][j]) + + def render(self): + time.sleep(0.1) + self.canvas.tag_raise(self.rectangle) + self.update() + + def calculate_value(self): + self.iteration_count += 1 + for i in self.texts: + self.canvas.delete(i) + self.agent.value_iteration() + self.print_values(self.agent.value_table) + + def print_optimal_policy(self): + self.improvement_count += 1 + for i in self.arrows: + self.canvas.delete(i) + for state in self.env.get_all_states(): + action = self.agent.get_action(state) + self.draw_from_values(state, action) + + +class Env: + def __init__(self): + self.transition_probability = TRANSITION_PROB + self.width = WIDTH # Width of Grid World + self.height = HEIGHT # Height of GridWorld + self.reward = [[0] * WIDTH for _ in range(HEIGHT)] + self.possible_actions = POSSIBLE_ACTIONS + self.reward[2][2] = 1 # reward 1 for circle + self.reward[1][2] = -1 # reward -1 for triangle + self.reward[2][1] = -1 # reward -1 for triangle + self.all_state = [] + + for x in range(WIDTH): + for y in range(HEIGHT): + state = [x, y] + self.all_state.append(state) + + def get_reward(self, state, action): + next_state = self.state_after_action(state, action) + return self.reward[next_state[0]][next_state[1]] + + def state_after_action(self, state, action_index): + action = ACTIONS[action_index] + return self.check_boundary([state[0] + action[0], state[1] + action[1]]) + + @staticmethod + def check_boundary(state): + state[0] = (0 if state[0] < 0 else WIDTH - 1 + if state[0] > WIDTH - 1 else state[0]) + state[1] = (0 if state[1] < 0 else HEIGHT - 1 + if state[1] > HEIGHT - 1 else state[1]) + return state + + def get_transition_prob(self, state, action): + return self.transition_probability + + def get_all_states(self): + return self.all_state diff --git a/1-grid-world/2-value-iteration/value_iteration.py b/1-grid-world/2-value-iteration/value_iteration.py new file mode 100644 index 00000000..8dff7281 --- /dev/null +++ b/1-grid-world/2-value-iteration/value_iteration.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +from environment import GraphicDisplay, Env + +class ValueIteration: + def __init__(self, env): + self.env = env + # 2-d list for the value function + self.value_table = [[0.0] * env.width for _ in range(env.height)] + self.discount_factor = 0.9 + + # get next value function table from the 
current value function table + def value_iteration(self): + next_value_table = [[0.0] * self.env.width + for _ in range(self.env.height)] + for state in self.env.get_all_states(): + if state == [2, 2]: + next_value_table[state[0]][state[1]] = 0.0 + continue + value_list = [] + + for action in self.env.possible_actions: + next_state = self.env.state_after_action(state, action) + reward = self.env.get_reward(state, action) + next_value = self.get_value(next_state) + value_list.append((reward + self.discount_factor * next_value)) + # return the maximum value(it is the optimality equation!!) + next_value_table[state[0]][state[1]] = round(max(value_list), 2) + self.value_table = next_value_table + + # get action according to the current value function table + def get_action(self, state): + action_list = [] + max_value = -99999 + + if state == [2, 2]: + return [] + + # calculating q values for the all actions and + # append the action to action list which has maximum q value + for action in self.env.possible_actions: + + next_state = self.env.state_after_action(state, action) + reward = self.env.get_reward(state, action) + next_value = self.get_value(next_state) + value = (reward + self.discount_factor * next_value) + + if value > max_value: + action_list.clear() + action_list.append(action) + max_value = value + elif value == max_value: + action_list.append(action) + + return action_list + + def get_value(self, state): + return round(self.value_table[state[0]][state[1]], 2) + +if __name__ == "__main__": + env = Env() + value_iteration = ValueIteration(env) + grid_world = GraphicDisplay(value_iteration) + grid_world.mainloop() diff --git a/1-grid-world/3-monte-carlo/environment.py b/1-grid-world/3-monte-carlo/environment.py new file mode 100644 index 00000000..d885107d --- /dev/null +++ b/1-grid-world/3-monte-carlo/environment.py @@ -0,0 +1,113 @@ +import time +import numpy as np +import tkinter as tk +from PIL import ImageTk, Image + +np.random.seed(1) +PhotoImage = ImageTk.PhotoImage +UNIT = 100 # pixels +HEIGHT = 5 # grid height +WIDTH = 5 # grid width + + +class Env(tk.Tk): + def __init__(self): + super(Env, self).__init__() + self.action_space = ['u', 'd', 'l', 'r'] + self.n_actions = len(self.action_space) + self.title('monte carlo') + self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) + self.shapes = self.load_images() + self.canvas = self._build_canvas() + self.texts = [] + + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) + # create grids + for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT + canvas.create_line(x0, y0, x1, y1) + for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 + x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r + canvas.create_line(x0, y0, x1, y1) + + # add img to canvas + self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) + self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) + self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) + self.circle = canvas.create_image(250, 250, image=self.shapes[2]) + + # pack all + canvas.pack() + + return canvas + + def load_images(self): + rectangle = PhotoImage( + Image.open("../img/rectangle.png").resize((65, 65))) + triangle = PhotoImage( + Image.open("../img/triangle.png").resize((65, 65))) + circle = PhotoImage( + Image.open("../img/circle.png").resize((65, 65))) + + return rectangle, triangle, circle + + @staticmethod + def coords_to_state(coords): + x = int((coords[0] - 
50) / 100) + y = int((coords[1] - 50) / 100) + return [x, y] + + def reset(self): + self.update() + time.sleep(0.5) + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + # return observation + return self.coords_to_state(self.canvas.coords(self.rectangle)) + + def step(self, action): + state = self.canvas.coords(self.rectangle) + base_action = np.array([0, 0]) + self.render() + + if action == 0: # up + if state[1] > UNIT: + base_action[1] -= UNIT + elif action == 1: # down + if state[1] < (HEIGHT - 1) * UNIT: + base_action[1] += UNIT + elif action == 2: # left + if state[0] > UNIT: + base_action[0] -= UNIT + elif action == 3: # right + if state[0] < (WIDTH - 1) * UNIT: + base_action[0] += UNIT + # move agent + self.canvas.move(self.rectangle, base_action[0], base_action[1]) + # move rectangle to top level of canvas + self.canvas.tag_raise(self.rectangle) + + next_state = self.canvas.coords(self.rectangle) + + # reward function + if next_state == self.canvas.coords(self.circle): + reward = 100 + done = True + elif next_state in [self.canvas.coords(self.triangle1), + self.canvas.coords(self.triangle2)]: + reward = -100 + done = True + else: + reward = 0 + done = False + + next_state = self.coords_to_state(next_state) + + return next_state, reward, done + + def render(self): + time.sleep(0.03) + self.update() diff --git a/1-grid-world/3-monte-carlo/mc_agent.py b/1-grid-world/3-monte-carlo/mc_agent.py new file mode 100644 index 00000000..682b59b9 --- /dev/null +++ b/1-grid-world/3-monte-carlo/mc_agent.py @@ -0,0 +1,111 @@ +import numpy as np +import random +from collections import defaultdict +from environment import Env + + +# Monte Carlo Agent which learns every episodes from the sample +class MCAgent: + def __init__(self, actions): + self.width = 5 + self.height = 5 + self.actions = actions + self.learning_rate = 0.01 + self.discount_factor = 0.9 + self.epsilon = 0.1 + self.samples = [] + self.value_table = defaultdict(float) + + # append sample to memory(state, reward, done) + def save_sample(self, state, reward, done): + self.samples.append([state, reward, done]) + + # for every episode, agent updates q function of visited states + def update(self): + G_t = 0 + visit_state = [] + for reward in reversed(self.samples): + state = str(reward[0]) + if state not in visit_state: + visit_state.append(state) + G_t = self.discount_factor * (reward[1] + G_t) + value = self.value_table[state] + self.value_table[state] = (value + + self.learning_rate * (G_t - value)) + + # get action for the state according to the q function table + # agent pick action of epsilon-greedy policy + def get_action(self, state): + if np.random.rand() < self.epsilon: + # take random action + action = np.random.choice(self.actions) + else: + # take action according to the q function table + next_state = self.possible_next_state(state) + action = self.arg_max(next_state) + return int(action) + + # compute arg_max if multiple candidates exit, pick one randomly + @staticmethod + def arg_max(next_state): + max_index_list = [] + max_value = next_state[0] + for index, value in enumerate(next_state): + if value > max_value: + max_index_list.clear() + max_value = value + max_index_list.append(index) + elif value == max_value: + max_index_list.append(index) + return random.choice(max_index_list) + + # get the possible next states + def possible_next_state(self, state): + col, row = state + next_state = [0.0] * 4 + + if row != 0: + next_state[0] = self.value_table[str([col, row - 
1])] + else: + next_state[0] = self.value_table[str(state)] + if row != self.height - 1: + next_state[1] = self.value_table[str([col, row + 1])] + else: + next_state[1] = self.value_table[str(state)] + if col != 0: + next_state[2] = self.value_table[str([col - 1, row])] + else: + next_state[2] = self.value_table[str(state)] + if col != self.width - 1: + next_state[3] = self.value_table[str([col + 1, row])] + else: + next_state[3] = self.value_table[str(state)] + + return next_state + + +# main loop +if __name__ == "__main__": + env = Env() + agent = MCAgent(actions=list(range(env.n_actions))) + + for episode in range(1000): + state = env.reset() + action = agent.get_action(state) + + while True: + env.render() + + # forward to next state. reward is number and done is boolean + next_state, reward, done = env.step(action) + agent.save_sample(next_state, reward, done) + + # get next action + action = agent.get_action(next_state) + + # at the end of each episode, update the q function table + if done: + print("episode : ", episode) + agent.update() + agent.samples.clear() + break diff --git a/Code 1. Grid World/4. SARSA/.python-version b/1-grid-world/4-sarsa/.python-version similarity index 100% rename from Code 1. Grid World/4. SARSA/.python-version rename to 1-grid-world/4-sarsa/.python-version diff --git a/Code 1. Grid World/3. Monte-Carlo/environment.py b/1-grid-world/4-sarsa/environment.py similarity index 55% rename from Code 1. Grid World/3. Monte-Carlo/environment.py rename to 1-grid-world/4-sarsa/environment.py index 30074db3..acf6d819 100644 --- a/Code 1. Grid World/3. Monte-Carlo/environment.py +++ b/1-grid-world/4-sarsa/environment.py @@ -4,7 +4,7 @@ from PIL import ImageTk, Image np.random.seed(1) - +PhotoImage = ImageTk.PhotoImage UNIT = 100 # pixels HEIGHT = 5 # grid height WIDTH = 5 # grid width @@ -15,41 +15,47 @@ def __init__(self): super(Env, self).__init__() self.action_space = ['u', 'd', 'l', 'r'] self.n_actions = len(self.action_space) - self.title('monte carlo') + self.title('SARSA') self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) - self.buildGraphic() + self.shapes = self.load_images() + self.canvas = self._build_canvas() self.texts = [] - def buildGraphic(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) # create grids for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) + canvas.create_line(x0, y0, x1, y1) for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r - self.canvas.create_line(x0, y0, x1, y1) - - # image_load - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS)) - self.triange_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((65, 65))) - self.circle_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((65, 65))) + canvas.create_line(x0, y0, x1, y1) - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.triangle1 = self.canvas.create_image(250, 150, image=self.triange_image) - self.triangle2 = self.canvas.create_image(150, 250, image=self.triange_image) - self.circle = self.canvas.create_image(250, 250, image=self.circle_image) + # add img to canvas + self.rectangle = canvas.create_image(50, 50, 
image=self.shapes[0]) + self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) + self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) + self.circle = canvas.create_image(250, 250, image=self.shapes[2]) # pack all - self.canvas.pack() + canvas.pack() + + return canvas + + def load_images(self): + rectangle = PhotoImage( + Image.open("../img/rectangle.png").resize((65, 65))) + triangle = PhotoImage( + Image.open("../img/triangle.png").resize((65, 65))) + circle = PhotoImage( + Image.open("../img/circle.png").resize((65, 65))) - def text_value(self, row, col, contents, action, font='Helvetica', size=10, style='normal', anchor="nw"): + return rectangle, triangle, circle + def text_value(self, row, col, contents, action, font='Helvetica', size=10, + style='normal', anchor="nw"): if action == 0: origin_x, origin_y = 7, 42 elif action == 1: @@ -61,36 +67,33 @@ def text_value(self, row, col, contents, action, font='Helvetica', size=10, styl x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) font = (font, str(size), style) - return self.texts.append(self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor)) + text = self.canvas.create_text(x, y, fill="black", text=contents, + font=font, anchor=anchor) + return self.texts.append(text) def print_value_all(self, q_table): for i in self.texts: self.canvas.delete(i) self.texts.clear() - for i in range(HEIGHT): - for j in range(WIDTH): + for x in range(HEIGHT): + for y in range(WIDTH): for action in range(0, 4): - state = [i, j] - if str(state) in q_table.index: - temp = q_table.ix[str(state), action] - self.text_value(j, i, round(temp, 2), action) + state = [x, y] + if str(state) in q_table.keys(): + temp = q_table[str(state)][action] + self.text_value(y, x, round(temp, 2), action) def coords_to_state(self, coords): x = int((coords[0] - 50) / 100) y = int((coords[1] - 50) / 100) return [x, y] - def state_to_coords(self, state): - x = int(state[0] * 100 + 50) - y = int(state[1] * 100 + 50) - return [x, y] - def reset(self): self.update() time.sleep(0.5) - self.canvas.delete(self.rectangle) - origin = np.array([UNIT / 2, UNIT / 2]) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + self.render() # return observation return self.coords_to_state(self.canvas.coords(self.rectangle)) @@ -112,15 +115,18 @@ def step(self, action): if state[0] < (WIDTH - 1) * UNIT: base_action[0] += UNIT - self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move agent - - next_state = self.canvas.coords(self.rectangle) # next state + # move agent + self.canvas.move(self.rectangle, base_action[0], base_action[1]) + # move rectangle to top level of canvas + self.canvas.tag_raise(self.rectangle) + next_state = self.canvas.coords(self.rectangle) # reward function if next_state == self.canvas.coords(self.circle): reward = 100 done = True - elif next_state in [self.canvas.coords(self.triangle1), self.canvas.coords(self.triangle2)]: + elif next_state in [self.canvas.coords(self.triangle1), + self.canvas.coords(self.triangle2)]: reward = -100 done = True else: @@ -132,5 +138,5 @@ def step(self, action): return next_state, reward, done def render(self): - time.sleep(0.05) + time.sleep(0.03) self.update() diff --git a/1-grid-world/4-sarsa/sarsa_agent.py b/1-grid-world/4-sarsa/sarsa_agent.py new file mode 100644 index 00000000..8a8cf9ef --- /dev/null +++ 
b/1-grid-world/4-sarsa/sarsa_agent.py @@ -0,0 +1,79 @@ +import numpy as np +import random +from collections import defaultdict +from environment import Env + + +# SARSA agent learns every time step from the sample +class SARSAgent: + def __init__(self, actions): + self.actions = actions + self.learning_rate = 0.01 + self.discount_factor = 0.9 + self.epsilon = 0.1 + self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) + + # with sample , learns new q function + def learn(self, state, action, reward, next_state, next_action): + current_q = self.q_table[state][action] + next_state_q = self.q_table[next_state][next_action] + new_q = (current_q + self.learning_rate * + (reward + self.discount_factor * next_state_q - current_q)) + self.q_table[state][action] = new_q + + # get action for the state according to the q function table + # agent pick action of epsilon-greedy policy + def get_action(self, state): + if np.random.rand() < self.epsilon: + # take random action + action = np.random.choice(self.actions) + else: + # take action according to the q function table + state_action = self.q_table[state] + action = self.arg_max(state_action) + return action + + @staticmethod + def arg_max(state_action): + max_index_list = [] + max_value = state_action[0] + for index, value in enumerate(state_action): + if value > max_value: + max_index_list.clear() + max_value = value + max_index_list.append(index) + elif value == max_value: + max_index_list.append(index) + return random.choice(max_index_list) + +if __name__ == "__main__": + env = Env() + agent = SARSAgent(actions=list(range(env.n_actions))) + + for episode in range(1000): + # reset environment and initialize state + + state = env.reset() + # get action of state from agent + action = agent.get_action(str(state)) + + while True: + env.render() + + # take action and proceed one step in the environment + next_state, reward, done = env.step(action) + next_action = agent.get_action(str(next_state)) + + # with sample , agent learns new q function + agent.learn(str(state), action, reward, str(next_state), next_action) + + state = next_state + action = next_action + + # print q function of all states at screen + env.print_value_all(agent.q_table) + + # if episode ends, then break + if done: + break + diff --git a/Code 1. Grid World/5. Q Learning/.python-version b/1-grid-world/5-q-learning/.python-version similarity index 100% rename from Code 1. Grid World/5. Q Learning/.python-version rename to 1-grid-world/5-q-learning/.python-version diff --git a/Code 1. Grid World/5. Q Learning/environment.py b/1-grid-world/5-q-learning/environment.py similarity index 60% rename from Code 1. Grid World/5. Q Learning/environment.py rename to 1-grid-world/5-q-learning/environment.py index 30074db3..e724e5ac 100644 --- a/Code 1. Grid World/5. 
Q Learning/environment.py +++ b/1-grid-world/5-q-learning/environment.py @@ -4,7 +4,7 @@ from PIL import ImageTk, Image np.random.seed(1) - +PhotoImage = ImageTk.PhotoImage UNIT = 100 # pixels HEIGHT = 5 # grid height WIDTH = 5 # grid width @@ -15,40 +15,47 @@ def __init__(self): super(Env, self).__init__() self.action_space = ['u', 'd', 'l', 'r'] self.n_actions = len(self.action_space) - self.title('monte carlo') + self.title('Q Learning') self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) - self.buildGraphic() + self.shapes = self.load_images() + self.canvas = self._build_canvas() self.texts = [] - def buildGraphic(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) # create grids for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) + canvas.create_line(x0, y0, x1, y1) for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r - self.canvas.create_line(x0, y0, x1, y1) - - # image_load - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS)) - self.triange_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((65, 65))) - self.circle_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((65, 65))) + canvas.create_line(x0, y0, x1, y1) - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.triangle1 = self.canvas.create_image(250, 150, image=self.triange_image) - self.triangle2 = self.canvas.create_image(150, 250, image=self.triange_image) - self.circle = self.canvas.create_image(250, 250, image=self.circle_image) + # add img to canvas + self.rectangle = canvas.create_image(50, 50, image=self.shapes[0]) + self.triangle1 = canvas.create_image(250, 150, image=self.shapes[1]) + self.triangle2 = canvas.create_image(150, 250, image=self.shapes[1]) + self.circle = canvas.create_image(250, 250, image=self.shapes[2]) # pack all - self.canvas.pack() + canvas.pack() + + return canvas + + def load_images(self): + rectangle = PhotoImage( + Image.open("../img/rectangle.png").resize((65, 65))) + triangle = PhotoImage( + Image.open("../img/triangle.png").resize((65, 65))) + circle = PhotoImage( + Image.open("../img/circle.png").resize((65, 65))) - def text_value(self, row, col, contents, action, font='Helvetica', size=10, style='normal', anchor="nw"): + return rectangle, triangle, circle + + def text_value(self, row, col, contents, action, font='Helvetica', size=10, + style='normal', anchor="nw"): if action == 0: origin_x, origin_y = 7, 42 @@ -61,7 +68,9 @@ def text_value(self, row, col, contents, action, font='Helvetica', size=10, styl x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) font = (font, str(size), style) - return self.texts.append(self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor)) + text = self.canvas.create_text(x, y, fill="black", text=contents, + font=font, anchor=anchor) + return self.texts.append(text) def print_value_all(self, q_table): for i in self.texts: @@ -71,8 +80,8 @@ def print_value_all(self, q_table): for j in range(WIDTH): for action in range(0, 4): state = [i, j] - if str(state) in q_table.index: - temp = q_table.ix[str(state), action] + if str(state) in q_table.keys(): + temp = 
q_table[str(state)][action] self.text_value(j, i, round(temp, 2), action) def coords_to_state(self, coords): @@ -88,12 +97,13 @@ def state_to_coords(self, state): def reset(self): self.update() time.sleep(0.5) - self.canvas.delete(self.rectangle) - origin = np.array([UNIT / 2, UNIT / 2]) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) + self.render() # return observation return self.coords_to_state(self.canvas.coords(self.rectangle)) + def step(self, action): state = self.canvas.coords(self.rectangle) base_action = np.array([0, 0]) @@ -112,15 +122,18 @@ def step(self, action): if state[0] < (WIDTH - 1) * UNIT: base_action[0] += UNIT - self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move agent - - next_state = self.canvas.coords(self.rectangle) # next state + # move agent + self.canvas.move(self.rectangle, base_action[0], base_action[1]) + # move rectangle to top level of canvas + self.canvas.tag_raise(self.rectangle) + next_state = self.canvas.coords(self.rectangle) # reward function if next_state == self.canvas.coords(self.circle): reward = 100 done = True - elif next_state in [self.canvas.coords(self.triangle1), self.canvas.coords(self.triangle2)]: + elif next_state in [self.canvas.coords(self.triangle1), + self.canvas.coords(self.triangle2)]: reward = -100 done = True else: @@ -128,9 +141,8 @@ def step(self, action): done = False next_state = self.coords_to_state(next_state) - return next_state, reward, done def render(self): - time.sleep(0.05) + time.sleep(0.03) self.update() diff --git a/1-grid-world/5-q-learning/q_learning_agent.py b/1-grid-world/5-q-learning/q_learning_agent.py new file mode 100644 index 00000000..029c2f36 --- /dev/null +++ b/1-grid-world/5-q-learning/q_learning_agent.py @@ -0,0 +1,69 @@ +import numpy as np +import random +from environment import Env +from collections import defaultdict + +class QLearningAgent: + def __init__(self, actions): + # actions = [0, 1, 2, 3] + self.actions = actions + self.learning_rate = 0.01 + self.discount_factor = 0.9 + self.epsilon = 0.1 + self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0]) + + # update q function with sample + def learn(self, state, action, reward, next_state): + current_q = self.q_table[state][action] + # using Bellman Optimality Equation to update q function + new_q = reward + self.discount_factor * max(self.q_table[next_state]) + self.q_table[state][action] += self.learning_rate * (new_q - current_q) + + # get action for the state according to the q function table + # agent pick action of epsilon-greedy policy + def get_action(self, state): + if np.random.rand() < self.epsilon: + # take random action + action = np.random.choice(self.actions) + else: + # take action according to the q function table + state_action = self.q_table[state] + action = self.arg_max(state_action) + return action + + @staticmethod + def arg_max(state_action): + max_index_list = [] + max_value = state_action[0] + for index, value in enumerate(state_action): + if value > max_value: + max_index_list.clear() + max_value = value + max_index_list.append(index) + elif value == max_value: + max_index_list.append(index) + return random.choice(max_index_list) + +if __name__ == "__main__": + env = Env() + agent = QLearningAgent(actions=list(range(env.n_actions))) + + for episode in range(1000): + state = env.reset() + + while True: + env.render() + + # take action and proceed one step in 
the environment + action = agent.get_action(str(state)) + next_state, reward, done = env.step(action) + + # with sample , agent learns new q function + agent.learn(str(state), action, reward, str(next_state)) + + state = next_state + env.print_value_all(agent.q_table) + + # if episode ends, then break + if done: + break diff --git a/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py b/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py new file mode 100755 index 00000000..a1b1c23b --- /dev/null +++ b/1-grid-world/6-deep-sarsa/deep_sarsa_agent.py @@ -0,0 +1,117 @@ +import copy +import pylab +import random +import numpy as np +from environment import Env +from keras.layers import Dense +from keras.optimizers import Adam +from keras.models import Sequential + +EPISODES = 1000 + + +# this is DeepSARSA Agent for the GridWorld +# Utilize Neural Network as q function approximator +class DeepSARSAgent: + def __init__(self): + self.load_model = False + # actions which agent can do + self.action_space = [0, 1, 2, 3, 4] + # get size of state and action + self.action_size = len(self.action_space) + self.state_size = 15 + self.discount_factor = 0.99 + self.learning_rate = 0.001 + + self.epsilon = 1. # exploration + self.epsilon_decay = .9999 + self.epsilon_min = 0.01 + self.model = self.build_model() + + if self.load_model: + self.epsilon = 0.05 + self.model.load_weights('./save_model/deep_sarsa_trained.h5') + + # approximate Q function using Neural Network + # state is input and Q Value of each action is output of network + def build_model(self): + model = Sequential() + model.add(Dense(30, input_dim=self.state_size, activation='relu')) + model.add(Dense(30, activation='relu')) + model.add(Dense(self.action_size, activation='linear')) + model.summary() + model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) + return model + + # get action from model using epsilon-greedy policy + def get_action(self, state): + if np.random.rand() <= self.epsilon: + # The agent acts randomly + return random.randrange(self.action_size) + else: + # Predict the reward value based on the given state + state = np.float32(state) + q_values = self.model.predict(state) + return np.argmax(q_values[0]) + + def train_model(self, state, action, reward, next_state, next_action, done): + if self.epsilon > self.epsilon_min: + self.epsilon *= self.epsilon_decay + + state = np.float32(state) + next_state = np.float32(next_state) + target = self.model.predict(state)[0] + # like Q Learning, get maximum Q value at s' + # But from target model + if done: + target[action] = reward + else: + target[action] = (reward + self.discount_factor * + self.model.predict(next_state)[0][next_action]) + + target = np.reshape(target, [1, 5]) + # make minibatch which includes target q value and predicted q value + # and do the model fit! 
+ self.model.fit(state, target, epochs=1, verbose=0) + + +if __name__ == "__main__": + env = Env() + agent = DeepSARSAgent() + + global_step = 0 + scores, episodes = [], [] + + for e in range(EPISODES): + done = False + score = 0 + state = env.reset() + state = np.reshape(state, [1, 15]) + + while not done: + # fresh env + global_step += 1 + + # get action for the current state and go one step in environment + action = agent.get_action(state) + next_state, reward, done = env.step(action) + next_state = np.reshape(next_state, [1, 15]) + next_action = agent.get_action(next_state) + agent.train_model(state, action, reward, next_state, next_action, + done) + state = next_state + # every time step we do training + score += reward + + state = copy.deepcopy(next_state) + + if done: + scores.append(score) + episodes.append(e) + pylab.plot(episodes, scores, 'b') + pylab.savefig("./save_graph/deep_sarsa_.png") + print("episode:", e, " score:", score, "global_step", + global_step, " epsilon:", agent.epsilon) + + if e % 100 == 0: + agent.model.save_weights("./save_model/deep_sarsa.h5") diff --git a/Code 1. Grid World/6. DQN/environment.py b/1-grid-world/6-deep-sarsa/environment.py old mode 100644 new mode 100755 similarity index 52% rename from Code 1. Grid World/6. DQN/environment.py rename to 1-grid-world/6-deep-sarsa/environment.py index a30093a1..c390de8b --- a/Code 1. Grid World/6. DQN/environment.py +++ b/1-grid-world/6-deep-sarsa/environment.py @@ -3,9 +3,10 @@ import tkinter as tk from PIL import ImageTk, Image +PhotoImage = ImageTk.PhotoImage UNIT = 50 # pixels -HEIGHT = 10 # grid height -WIDTH = 10 # grid width +HEIGHT = 5 # grid height +WIDTH = 5 # grid width np.random.seed(1) @@ -15,54 +16,52 @@ def __init__(self): super(Env, self).__init__() self.action_space = ['u', 'd', 'l', 'r'] self.action_size = len(self.action_space) - self.title('DeepQNetwork') + self.title('DeepSARSA') self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) - self.build_graphic() + self.shapes = self.load_images() + self.canvas = self._build_canvas() self.counter = 0 + self.rewards = [] + self.goal = [] + # obstacle + self.set_reward([0, 1], -1) + self.set_reward([1, 2], -1) + self.set_reward([2, 3], -1) + # #goal + self.set_reward([4, 4], 1) - def build_graphic(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) # create grids for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) + canvas.create_line(x0, y0, x1, y1) for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r - self.canvas.create_line(x0, y0, x1, y1) + canvas.create_line(x0, y0, x1, y1) - # image_load - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((30, 30), Image.ANTIALIAS)) - self.fire_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((30, 30))) - self.fish_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((30, 30))) + self.rewards = [] + self.goal = [] + # add image to canvas + x, y = UNIT/2, UNIT/2 + self.rectangle = canvas.create_image(x, y, image=self.shapes[0]) - self.rewards = list() - self.goal = list() + # pack all` + canvas.pack() - # obstacle - self.set_reward([2, 7], -1) - self.set_reward([3, 2], -1) - self.set_reward([2, 5], -1) - self.set_reward([4, 9], -1) - 
self.set_reward([5, 7], -1) - self.set_reward([6, 4], -1) - self.set_reward([7, 8], -1) - self.set_reward([8, 3], -1) - self.set_reward([9, 1], -1) - - # - # - # #goal - self.set_reward([9, 9], 5) + return canvas - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) + def load_images(self): + rectangle = PhotoImage( + Image.open("../img/rectangle.png").resize((30, 30))) + triangle = PhotoImage( + Image.open("../img/triangle.png").resize((30, 30))) + circle = PhotoImage( + Image.open("../img/circle.png").resize((30, 30))) - # pack all` - self.canvas.pack() + return rectangle, triangle, circle def reset_reward(self): @@ -71,36 +70,33 @@ def reset_reward(self): self.rewards.clear() self.goal.clear() - # obstacle - self.set_reward([2, 7], -1) - self.set_reward([3, 2], -1) - self.set_reward([2, 5], -1) - self.set_reward([4, 9], -1) - self.set_reward([5, 7], -1) - self.set_reward([6, 4], -1) - self.set_reward([7, 8], -1) - self.set_reward([8, 3], -1) - self.set_reward([9, 1], -1) - - # - # + self.set_reward([0, 1], -1) + self.set_reward([1, 2], -1) + self.set_reward([2, 3], -1) + # #goal - self.set_reward([9, 9], 5) + self.set_reward([4, 4], 1) def set_reward(self, state, reward): state = [int(state[0]), int(state[1])] + x = int(state[0]) + y = int(state[1]) temp = {} if reward > 0: temp['reward'] = reward - temp['figure'] = self.canvas.create_image((UNIT * state[0]) + UNIT/2, (UNIT * state[1]) + UNIT/2, - image=self.fish_image) + temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, + (UNIT * y) + UNIT / 2, + image=self.shapes[2]) + self.goal.append(temp['figure']) elif reward < 0: + temp['direction'] = -1 temp['reward'] = reward - temp['figure'] = self.canvas.create_image((UNIT * state[0]) + UNIT/2, (UNIT * state[1]) + UNIT/2, - image=self.fire_image) + temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, + (UNIT * y) + UNIT / 2, + image=self.shapes[1]) temp['coords'] = self.canvas.coords(temp['figure']) temp['state'] = state @@ -112,28 +108,28 @@ def check_if_reward(self, state): check_list = dict() check_list['if_goal'] = False rewards = 0 + for reward in self.rewards: if reward['state'] == state: rewards += reward['reward'] - if reward['reward'] == 5: + if reward['reward'] == 1: check_list['if_goal'] = True + check_list['rewards'] = rewards return check_list def coords_to_state(self, coords): - x = int((coords[0] - 50) / 100) - y = int((coords[1] - 50) / 100) + x = int((coords[0] - UNIT / 2) / UNIT) + y = int((coords[1] - UNIT / 2) / UNIT) return [x, y] def reset(self): self.update() time.sleep(0.5) - self.canvas.delete(self.rectangle) - origin = np.array([UNIT / 2, UNIT / 2]) - self.rectangle = self.canvas.create_image(UNIT/2, UNIT/2, image=self.rectangle_image) + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) # return observation - self.reset_reward() return self.get_state() @@ -141,66 +137,77 @@ def step(self, action): self.counter += 1 self.render() - next_coords = self.move(self.rectangle, action) - if self.counter % 2 == 1: self.rewards = self.move_rewards() + next_coords = self.move(self.rectangle, action) check = self.check_if_reward(self.coords_to_state(next_coords)) done = check['if_goal'] reward = check['rewards'] + self.canvas.tag_raise(self.rectangle) + s_ = self.get_state() return s_, reward, done def get_state(self): - agent_location = self.coords_to_state(self.canvas.coords(self.rectangle)) - agent_x = agent_location[0] - agent_y = 
agent_location[1] + location = self.coords_to_state(self.canvas.coords(self.rectangle)) + agent_x = location[0] + agent_y = location[1] - locations = list() + states = list() - locations.append(agent_x) - locations.append(agent_y) + # locations.append(agent_x) + # locations.append(agent_y) for reward in self.rewards: reward_location = reward['state'] - locations.append(agent_x - reward_location[0]) - locations.append(agent_y - reward_location[1]) + states.append(reward_location[0] - agent_x) + states.append(reward_location[1] - agent_y) + if reward['reward'] < 0: + states.append(-1) + states.append(reward['direction']) + else: + states.append(1) - return locations + return states def move_rewards(self): new_rewards = [] for temp in self.rewards: - if temp['reward'] == 10: + if temp['reward'] == 1: new_rewards.append(temp) continue - temp['coords'] = self.move_const(temp['figure']) + temp['coords'] = self.move_const(temp) temp['state'] = self.coords_to_state(temp['coords']) new_rewards.append(temp) return new_rewards def move_const(self, target): - s = self.canvas.coords(target) + + s = self.canvas.coords(target['figure']) base_action = np.array([0, 0]) - if s[0] < (WIDTH - 1) * UNIT: - base_action[0] += UNIT - else: - base_action[0] = -(WIDTH - 1) * UNIT + if s[0] == (WIDTH - 1) * UNIT + UNIT / 2: + target['direction'] = 1 + elif s[0] == UNIT / 2: + target['direction'] = -1 - # if action == 4 # move _none + if target['direction'] == -1: + base_action[0] += UNIT + elif target['direction'] == 1: + base_action[0] -= UNIT - if target is not self.rectangle and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]: + if (target['figure'] is not self.rectangle + and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]): base_action = np.array([0, 0]) - self.canvas.move(target, base_action[0], base_action[1]) + self.canvas.move(target['figure'], base_action[0], base_action[1]) - s_ = self.canvas.coords(target) + s_ = self.canvas.coords(target['figure']) return s_ @@ -222,11 +229,6 @@ def move(self, target, action): if s[0] > UNIT: base_action[0] -= UNIT - # if action == 4 # move _none - - if target is not self.rectangle and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]: - base_action = np.array([0, 0]) - self.canvas.move(target, base_action[0], base_action[1]) s_ = self.canvas.coords(target) @@ -234,5 +236,5 @@ def move(self, target, action): return s_ def render(self): - time.sleep(0.1) + time.sleep(0.07) self.update() diff --git a/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png b/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png new file mode 100644 index 00000000..8dec1d06 Binary files /dev/null and b/1-grid-world/6-deep-sarsa/save_graph/deep_sarsa_trained.png differ diff --git a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN.h5 b/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5 similarity index 55% rename from Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN.h5 rename to 1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5 index fe4933cb..23ba39c9 100644 Binary files a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN.h5 and b/1-grid-world/6-deep-sarsa/save_model/deep_sarsa_trained.h5 differ diff --git a/Code 1. Grid World/7. Policy Gradient/environment.py b/1-grid-world/7-reinforce/environment.py similarity index 51% rename from Code 1. Grid World/7. Policy Gradient/environment.py rename to 1-grid-world/7-reinforce/environment.py index 620a30b3..c8283baa 100644 --- a/Code 1. Grid World/7. 
Policy Gradient/environment.py +++ b/1-grid-world/7-reinforce/environment.py @@ -3,11 +3,12 @@ import tkinter as tk from PIL import ImageTk, Image +PhotoImage = ImageTk.PhotoImage UNIT = 50 # pixels -HEIGHT = 10 # grid height -WIDTH = 10 # grid width +HEIGHT = 5 # grid height +WIDTH = 5 # grid width -# np.random.seed(1) +np.random.seed(1) class Env(tk.Tk): @@ -15,54 +16,52 @@ def __init__(self): super(Env, self).__init__() self.action_space = ['u', 'd', 'l', 'r'] self.action_size = len(self.action_space) - self.title('Policy Gradient') + self.title('Reinforce') self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) - self.build_graphic() + self.shapes = self.load_images() + self.canvas = self._build_canvas() self.counter = 0 + self.rewards = [] + self.goal = [] + # obstacle + self.set_reward([0, 1], -1) + self.set_reward([1, 2], -1) + self.set_reward([2, 3], -1) + # #goal + self.set_reward([4, 4], 1) - def build_graphic(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - + def _build_canvas(self): + canvas = tk.Canvas(self, bg='white', + height=HEIGHT * UNIT, + width=WIDTH * UNIT) # create grids for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) + canvas.create_line(x0, y0, x1, y1) for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r - self.canvas.create_line(x0, y0, x1, y1) + canvas.create_line(x0, y0, x1, y1) - # image_load - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((30, 30), Image.ANTIALIAS)) - self.fire_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((30, 30))) - self.fish_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((30, 30))) + self.rewards = [] + self.goal = [] + # add image to canvas + x, y = UNIT/2, UNIT/2 + self.rectangle = canvas.create_image(x, y, image=self.shapes[0]) - self.rewards = list() - self.goal = list() + # pack all` + canvas.pack() - # obstacle - self.set_reward([2, 7], -1) - self.set_reward([3, 2], -1) - self.set_reward([2, 5], -1) - self.set_reward([4, 9], -1) - self.set_reward([5, 7], -1) - self.set_reward([6, 4], -1) - self.set_reward([7, 8], -1) - self.set_reward([8, 3], -1) - self.set_reward([9, 1], -1) - - # - # - # #goal - self.set_reward([9, 9], 5) + return canvas - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) + def load_images(self): + rectangle = PhotoImage( + Image.open("../img/rectangle.png").resize((30, 30))) + triangle = PhotoImage( + Image.open("../img/triangle.png").resize((30, 30))) + circle = PhotoImage( + Image.open("../img/circle.png").resize((30, 30))) - # pack all` - self.canvas.pack() + return rectangle, triangle, circle def reset_reward(self): @@ -71,36 +70,33 @@ def reset_reward(self): self.rewards.clear() self.goal.clear() - # obstacle - self.set_reward([2, 7], -1) - self.set_reward([3, 2], -1) - self.set_reward([2, 5], -1) - self.set_reward([4, 9], -1) - self.set_reward([5, 7], -1) - self.set_reward([6, 4], -1) - self.set_reward([7, 8], -1) - self.set_reward([8, 3], -1) - self.set_reward([9, 1], -1) - - # - # + self.set_reward([0, 1], -1) + self.set_reward([1, 2], -1) + self.set_reward([2, 3], -1) + # #goal - self.set_reward([9, 9], 5) + self.set_reward([4, 4], 1) def set_reward(self, state, reward): state = [int(state[0]), int(state[1])] + x = int(state[0]) + y = int(state[1]) temp = {} if reward > 0: 
temp['reward'] = reward - temp['figure'] = self.canvas.create_image((UNIT * state[0]) + UNIT/2, (UNIT * state[1]) + UNIT/2, - image=self.fish_image) + temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, + (UNIT * y) + UNIT / 2, + image=self.shapes[2]) + self.goal.append(temp['figure']) elif reward < 0: + temp['direction'] = -1 temp['reward'] = reward - temp['figure'] = self.canvas.create_image((UNIT * state[0]) + UNIT/2, (UNIT * state[1]) + UNIT/2, - image=self.fire_image) + temp['figure'] = self.canvas.create_image((UNIT * x) + UNIT / 2, + (UNIT * y) + UNIT / 2, + image=self.shapes[1]) temp['coords'] = self.canvas.coords(temp['figure']) temp['state'] = state @@ -112,28 +108,27 @@ def check_if_reward(self, state): check_list = dict() check_list['if_goal'] = False rewards = 0 + for reward in self.rewards: if reward['state'] == state: rewards += reward['reward'] - if reward['reward'] == 5: + if reward['reward'] > 0: check_list['if_goal'] = True + check_list['rewards'] = rewards return check_list def coords_to_state(self, coords): - x = int((coords[0] - 50) / 100) - y = int((coords[1] - 50) / 100) + x = int((coords[0] - UNIT / 2) / UNIT) + y = int((coords[1] - UNIT / 2) / UNIT) return [x, y] def reset(self): self.update() - time.sleep(0.5) - self.canvas.delete(self.rectangle) - origin = np.array([UNIT / 2, UNIT / 2]) - self.rectangle = self.canvas.create_image(UNIT/2, UNIT/2, image=self.rectangle_image) + x, y = self.canvas.coords(self.rectangle) + self.canvas.move(self.rectangle, UNIT / 2 - x, UNIT / 2 - y) # return observation - self.reset_reward() return self.get_state() @@ -141,14 +136,15 @@ def step(self, action): self.counter += 1 self.render() - next_coords = self.move(self.rectangle, action) - if self.counter % 2 == 1: self.rewards = self.move_rewards() + next_coords = self.move(self.rectangle, action) check = self.check_if_reward(self.coords_to_state(next_coords)) done = check['if_goal'] reward = check['rewards'] + reward -= 0.1 + self.canvas.tag_raise(self.rectangle) s_ = self.get_state() @@ -156,51 +152,61 @@ def step(self, action): def get_state(self): - agent_location = self.coords_to_state(self.canvas.coords(self.rectangle)) - agent_x = agent_location[0] - agent_y = agent_location[1] + location = self.coords_to_state(self.canvas.coords(self.rectangle)) + agent_x = location[0] + agent_y = location[1] - locations = list() + states = list() - locations.append(agent_x) - locations.append(agent_y) + # locations.append(agent_x) + # locations.append(agent_y) for reward in self.rewards: reward_location = reward['state'] - locations.append(agent_x - reward_location[0]) - locations.append(agent_y - reward_location[1]) + states.append(reward_location[0] - agent_x) + states.append(reward_location[1] - agent_y) + if reward['reward'] < 0: + states.append(-1) + states.append(reward['direction']) + else: + states.append(1) - return locations + return states def move_rewards(self): new_rewards = [] for temp in self.rewards: - if temp['reward'] == 10: + if temp['reward'] > 0: new_rewards.append(temp) continue - temp['coords'] = self.move_const(temp['figure']) + temp['coords'] = self.move_const(temp) temp['state'] = self.coords_to_state(temp['coords']) new_rewards.append(temp) return new_rewards def move_const(self, target): - s = self.canvas.coords(target) + + s = self.canvas.coords(target['figure']) base_action = np.array([0, 0]) - if s[0] < (WIDTH - 1) * UNIT: - base_action[0] += UNIT - else: - base_action[0] = -(WIDTH - 1) * UNIT + if s[0] == (WIDTH - 1) * UNIT + UNIT / 2: + 
target['direction'] = 1 + elif s[0] == UNIT / 2: + target['direction'] = -1 - # if action == 4 # move _none + if target['direction'] == -1: + base_action[0] += UNIT + elif target['direction'] == 1: + base_action[0] -= UNIT - if target is not self.rectangle and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]: + if (target['figure'] is not self.rectangle + and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]): base_action = np.array([0, 0]) - self.canvas.move(target, base_action[0], base_action[1]) + self.canvas.move(target['figure'], base_action[0], base_action[1]) - s_ = self.canvas.coords(target) + s_ = self.canvas.coords(target['figure']) return s_ @@ -222,11 +228,6 @@ def move(self, target, action): if s[0] > UNIT: base_action[0] -= UNIT - # if action == 4 # move _none - - if target is not self.rectangle and s == [(WIDTH - 1) * UNIT, (HEIGHT - 1) * UNIT]: - base_action = np.array([0, 0]) - self.canvas.move(target, base_action[0], base_action[1]) s_ = self.canvas.coords(target) @@ -234,5 +235,5 @@ def move(self, target, action): return s_ def render(self): - time.sleep(0.1) + time.sleep(0.07) self.update() diff --git a/Code 1. Grid World/7. Policy Gradient/Gridworld_PG.py b/1-grid-world/7-reinforce/reinforce_agent.py similarity index 57% rename from Code 1. Grid World/7. Policy Gradient/Gridworld_PG.py rename to 1-grid-world/7-reinforce/reinforce_agent.py index 5bf590fd..2a37c851 100644 --- a/Code 1. Grid World/7. Policy Gradient/Gridworld_PG.py +++ b/1-grid-world/7-reinforce/reinforce_agent.py @@ -7,53 +7,62 @@ from keras.models import Sequential from keras import backend as K -EPISODES = 1000 +EPISODES = 2500 -class PGAgent: +# this is REINFORCE Agent for GridWorld +class ReinforceAgent: def __init__(self): - self.render = False - + self.load_model = True + # actions which agent can do self.action_space = [0, 1, 2, 3, 4] + # get size of state and action self.action_size = len(self.action_space) - self.state_size = 22 - self.discount_factor = 0.99 # decay rate + self.state_size = 15 + self.discount_factor = 0.99 self.learning_rate = 0.001 self.model = self.build_model() self.optimizer = self.optimizer() self.states, self.actions, self.rewards = [], [], [] + if self.load_model: + self.model.load_weights('./save_model/reinforce_trained.h5') + + # state is input and probability of each action(policy) is output of network def build_model(self): model = Sequential() - model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')) - model.add(Dense(24, activation='relu', kernel_initializer='glorot_uniform')) - # 마지막 softmax 계층으로 각 행동에 대한 확률을 만드는 모델을 생성 - model.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')) + model.add(Dense(24, input_dim=self.state_size, activation='relu')) + model.add(Dense(24, activation='relu')) + model.add(Dense(self.action_size, activation='softmax')) model.summary() - return model + # create error function and training function to update policy network def optimizer(self): action = K.placeholder(shape=[None, 5]) discounted_rewards = K.placeholder(shape=[None, ]) - # Policy Gradient 의 핵심 - # log(정책) * return 의 gradient 를 구해서 최대화시킴 - good_prob = K.sum(action * self.model.output, axis=1) - eligibility = K.log(good_prob) * discounted_rewards - loss = -K.sum(eligibility) + # Calculate cross entropy error function + action_prob = K.sum(action * self.model.output, axis=1) + cross_entropy = K.log(action_prob) * discounted_rewards + loss = -K.sum(cross_entropy) + # create training function optimizer = 
Adam(lr=self.learning_rate) - updates = optimizer.get_updates(self.model.trainable_weights, [], loss) - train = K.function([self.model.input, action, discounted_rewards], [], updates=updates) + updates = optimizer.get_updates(self.model.trainable_weights, [], + loss) + train = K.function([self.model.input, action, discounted_rewards], [], + updates=updates) return train + # get action from policy network def get_action(self, state): - policy = self.model.predict(state, batch_size=1).flatten() + policy = self.model.predict(state)[0] return np.random.choice(self.action_size, 1, p=policy)[0] + # calculate discounted rewards def discount_rewards(self, rewards): discounted_rewards = np.zeros_like(rewards) running_add = 0 @@ -62,14 +71,16 @@ def discount_rewards(self, rewards): discounted_rewards[t] = running_add return discounted_rewards - def memory(self, state, action, reward): + # save states, actions and rewards for an episode + def append_sample(self, state, action, reward): self.states.append(state[0]) self.rewards.append(reward) act = np.zeros(self.action_size) act[action] = 1 self.actions.append(act) - def train_episodes(self): + # update policy neural network + def train_model(self): discounted_rewards = np.float32(self.discount_rewards(self.rewards)) discounted_rewards -= np.mean(discounted_rewards) discounted_rewards /= np.std(discounted_rewards) @@ -77,59 +88,42 @@ def train_episodes(self): self.optimizer([self.states, self.actions, discounted_rewards]) self.states, self.actions, self.rewards = [], [], [] - def load_model(self, name): - self.model.load_weights(name) - - def save_model(self, name): - self.model.save_weights(name) - if __name__ == "__main__": - # maze game - # env = Maze() env = Env() - agent = PGAgent() + agent = ReinforceAgent() global_step = 0 - # agent.load("same_vel_episode2 : 1000") scores, episodes = [], [] for e in range(EPISODES): done = False score = 0 + # fresh env state = env.reset() - state = np.reshape(state, [1, 22]) + state = np.reshape(state, [1, 15]) while not done: - # fresh env - if agent.render: - env.render() global_step += 1 - - # RL choose action based on observation and go one step + # get action for the current state and go one step in environment action = agent.get_action(state) next_state, reward, done = env.step(action) - next_state = np.reshape(next_state, [1, 22]) + next_state = np.reshape(next_state, [1, 15]) - agent.memory(state, action, reward) - # every time step we do train from the replay memory + agent.append_sample(state, action, reward) score += reward - # swap observation state = copy.deepcopy(next_state) if done: - agent.train_episodes() - + # update policy neural network for each episode + agent.train_model() scores.append(score) episodes.append(e) - pylab.plot(episodes, scores, 'b') - pylab.savefig("./save_graph/10by10.png") - print("episode:", e, " score:", score, " time_step:", global_step) + score = round(score, 2) + print("episode:", e, " score:", score, " time_step:", + global_step) if e % 100 == 0: - pass - agent.save_model("./save_model/10by10") - - # end of game - print('game over') - env.destroy() \ No newline at end of file + pylab.plot(episodes, scores, 'b') + pylab.savefig("./save_graph/reinforce.png") + agent.model.save_weights("./save_model/reinforce.h5") diff --git a/1-grid-world/7-reinforce/save_graph/reinforce_trained.png b/1-grid-world/7-reinforce/save_graph/reinforce_trained.png new file mode 100644 index 00000000..3be9edb7 Binary files /dev/null and 
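
As a quick, self-contained illustration of what train_model() above does with the stored rewards, here is the same discounted-return computation and standardization in plain numpy. This sketch is not part of the patch, and the reward sequence is made up for illustration only.

import numpy as np

def discount_rewards(rewards, gamma=0.99):
    # same backward recursion as ReinforceAgent.discount_rewards()
    discounted = np.zeros(len(rewards))
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        discounted[t] = running_add
    return discounted

rewards = [-0.1, -0.1, -1.0, -0.1, 1.0]        # hypothetical per-step rewards
returns = discount_rewards(rewards)
# standardize, as train_model() does, so the policy update is better conditioned
returns = (returns - np.mean(returns)) / np.std(returns)
print(returns)
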
b/1-grid-world/7-reinforce/save_graph/reinforce_trained.png differ diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN13.h5 b/1-grid-world/7-reinforce/save_model/reinforce_trained.h5 similarity index 64% rename from Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN13.h5 rename to 1-grid-world/7-reinforce/save_model/reinforce_trained.h5 index c63a4dc6..cb206f51 100644 Binary files a/Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN13.h5 and b/1-grid-world/7-reinforce/save_model/reinforce_trained.h5 differ diff --git a/Code 1. Grid World/README.md b/1-grid-world/README.md similarity index 100% rename from Code 1. Grid World/README.md rename to 1-grid-world/README.md diff --git a/Code 1. Grid World/gridworld.png b/1-grid-world/gridworld.png similarity index 100% rename from Code 1. Grid World/gridworld.png rename to 1-grid-world/gridworld.png diff --git a/Code 1. Grid World/gridworld_changing.png b/1-grid-world/gridworld_changing.png similarity index 100% rename from Code 1. Grid World/gridworld_changing.png rename to 1-grid-world/gridworld_changing.png diff --git a/Code 1. Grid World/resources/circle.png b/1-grid-world/img/circle.png similarity index 100% rename from Code 1. Grid World/resources/circle.png rename to 1-grid-world/img/circle.png diff --git a/Code 1. Grid World/resources/down.png b/1-grid-world/img/down.png similarity index 100% rename from Code 1. Grid World/resources/down.png rename to 1-grid-world/img/down.png diff --git a/Code 1. Grid World/resources/left.png b/1-grid-world/img/left.png similarity index 100% rename from Code 1. Grid World/resources/left.png rename to 1-grid-world/img/left.png diff --git a/Code 1. Grid World/resources/rectangle.png b/1-grid-world/img/rectangle.png similarity index 100% rename from Code 1. Grid World/resources/rectangle.png rename to 1-grid-world/img/rectangle.png diff --git a/Code 1. Grid World/resources/right.png b/1-grid-world/img/right.png similarity index 100% rename from Code 1. Grid World/resources/right.png rename to 1-grid-world/img/right.png diff --git a/Code 1. Grid World/resources/triangle.png b/1-grid-world/img/triangle.png similarity index 100% rename from Code 1. Grid World/resources/triangle.png rename to 1-grid-world/img/triangle.png diff --git a/Code 1. Grid World/resources/up.png b/1-grid-world/img/up.png similarity index 100% rename from Code 1. 
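
The get_state() methods of the grid-world environments above encode every reward object relative to the agent: its offset from the agent, a type flag, and, for obstacles, the current travel direction. A minimal sketch of that layout follows; it is not part of the patch, the encode_state() helper and the positions are hypothetical, and three obstacles plus one goal give the 15 values expected by state_size = 15.

# Sketch only: mirrors the layout produced by get_state(), with made-up positions.
def encode_state(agent_xy, rewards):
    ax, ay = agent_xy
    state = []
    for r in rewards:
        rx, ry = r['state']
        state.append(rx - ax)              # relative x of the reward object
        state.append(ry - ay)              # relative y of the reward object
        if r['reward'] < 0:
            state.append(-1)               # obstacle marker
            state.append(r['direction'])   # obstacle's current direction
        else:
            state.append(1)                # goal marker
    return state

rewards = [{'state': [0, 1], 'reward': -1, 'direction': -1},
           {'state': [1, 2], 'reward': -1, 'direction': -1},
           {'state': [2, 3], 'reward': -1, 'direction': -1},
           {'state': [4, 4], 'reward': 1}]
print(encode_state([0, 0], rewards))   # 3 obstacles * 4 + 1 goal * 3 = 15 values
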
Grid World/resources/up.png rename to 1-grid-world/img/up.png diff --git a/2-cartpole/1-dqn/SumTree.py b/2-cartpole/1-dqn/SumTree.py new file mode 100644 index 00000000..1b72e9ea --- /dev/null +++ b/2-cartpole/1-dqn/SumTree.py @@ -0,0 +1,55 @@ +import numpy + + +class SumTree: + write = 0 + + def __init__(self, capacity): + self.capacity = capacity + self.tree = numpy.zeros(2 * capacity - 1) + self.data = numpy.zeros(capacity, dtype=object) + + def _propagate(self, idx, change): + parent = (idx - 1) // 2 + + self.tree[parent] += change + + if parent != 0: + self._propagate(parent, change) + + def _retrieve(self, idx, s): + left = 2 * idx + 1 + right = left + 1 + + if left >= len(self.tree): + return idx + + if s <= self.tree[left]: + return self._retrieve(left, s) + else: + return self._retrieve(right, s - self.tree[left]) + + def total(self): + return self.tree[0] + + def add(self, p, data): + idx = self.write + self.capacity - 1 + + self.data[self.write] = data + self.update(idx, p) + + self.write += 1 + if self.write >= self.capacity: + self.write = 0 + + def update(self, idx, p): + change = p - self.tree[idx] + + self.tree[idx] = p + self._propagate(idx, change) + + def get(self, s): + idx = self._retrieve(0, s) + dataIdx = idx - self.capacity + 1 + + return (idx, self.tree[idx], self.data[dataIdx]) diff --git a/Code 2. Cartpole/1. DQN/Cartpole_DQN.py b/2-cartpole/1-dqn/cartpole_dqn.py similarity index 68% rename from Code 2. Cartpole/1. DQN/Cartpole_DQN.py rename to 2-cartpole/1-dqn/cartpole_dqn.py index 84802b76..8b2baaf0 100644 --- a/Code 2. Cartpole/1. DQN/Cartpole_DQN.py +++ b/2-cartpole/1-dqn/cartpole_dqn.py @@ -11,19 +11,20 @@ EPISODES = 300 -# this is DQN Agent for the Cartpole +# DQN Agent for the Cartpole # it uses Neural Network to approximate q function # and replay memory & target q network class DQNAgent: def __init__(self, state_size, action_size): # if you want to see Cartpole learning, then change to True self.render = False + self.load_model = False # get size of state and action self.state_size = state_size self.action_size = action_size - # these is hyper parameters for the DQN + # These are hyper parameters for the DQN self.discount_factor = 0.99 self.learning_rate = 0.001 self.epsilon = 1.0 @@ -37,17 +38,23 @@ def __init__(self, state_size, action_size): # create main model and target model self.model = self.build_model() self.target_model = self.build_model() - # copy the model to target model - # --> initialize the target model so that the parameters of model & target model to be same + + # initialize target model self.update_target_model() + if self.load_model: + self.model.load_weights("./save_model/cartpole_dqn.h5") + # approximate Q function using Neural Network # state is input and Q Value of each action is output of network def build_model(self): model = Sequential() - model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform')) - model.add(Dense(24, activation='relu', kernel_initializer='he_uniform')) - model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform')) + model.add(Dense(24, input_dim=self.state_size, activation='relu', + kernel_initializer='he_uniform')) + model.add(Dense(24, activation='relu', + kernel_initializer='he_uniform')) + model.add(Dense(self.action_size, activation='linear', + kernel_initializer='he_uniform')) model.summary() model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) return model @@ -65,50 +72,47 @@ def get_action(self, state): return 
np.argmax(q_value[0]) # save sample to the replay memory - def replay_memory(self, state, action, reward, next_state, done): + def append_sample(self, state, action, reward, next_state, done): self.memory.append((state, action, reward, next_state, done)) if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay # pick samples randomly from replay memory (with batch_size) - def train_replay(self): + def train_model(self): if len(self.memory) < self.train_start: return batch_size = min(self.batch_size, len(self.memory)) mini_batch = random.sample(self.memory, batch_size) update_input = np.zeros((batch_size, self.state_size)) - update_target = np.zeros((batch_size, self.action_size)) - - for i in range(batch_size): - state, action, reward, next_state, done = mini_batch[i] - target = self.model.predict(state)[0] - - # like Q Learning, get maximum Q value at s' - # But from target model - if done: - target[action] = reward + update_target = np.zeros((batch_size, self.state_size)) + action, reward, done = [], [], [] + + for i in range(self.batch_size): + update_input[i] = mini_batch[i][0] + action.append(mini_batch[i][1]) + reward.append(mini_batch[i][2]) + update_target[i] = mini_batch[i][3] + done.append(mini_batch[i][4]) + + target = self.model.predict(update_input) + target_val = self.target_model.predict(update_target) + + for i in range(self.batch_size): + # Q Learning: get maximum Q value at s' from target model + if done[i]: + target[i][action[i]] = reward[i] else: - target[action] = reward + self.discount_factor * \ - np.amax(self.target_model.predict(next_state)[0]) - update_input[i] = state - update_target[i] = target + target[i][action[i]] = reward[i] + self.discount_factor * ( + np.amax(target_val[i])) - # make minibatch which includes target q value and predicted q value # and do the model fit! 
- self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)
-
- # load the saved model
- def load_model(self, name):
- self.model.load_weights(name)
-
- # save the model which is under training
- def save_model(self, name):
- self.model.save_weights(name)
+ self.model.fit(update_input, target, batch_size=self.batch_size,
+ epochs=1, verbose=0)

if __name__ == "__main__":
- # in case of CartPole-v1, you can play until 500 time step
+ # In case of CartPole-v1, maximum length of episode is 500
env = gym.make('CartPole-v1')
# get size of state and action from environment
state_size = env.observation_space.shape[0]
@@ -123,7 +127,6 @@ def save_model(self, name):
score = 0
state = env.reset()
state = np.reshape(state, [1, state_size])
- # agent.load_model("./save_model/cartpole-master.h5")

while not done:
if agent.render:
@@ -137,14 +140,13 @@ def save_model(self, name):
reward = reward if not done or score == 499 else -100

# save the sample to the replay memory
- agent.replay_memory(state, action, reward, next_state, done)
+ agent.append_sample(state, action, reward, next_state, done)
# every time step do the training
- agent.train_replay()
+ agent.train_model()
score += reward
state = next_state

if done:
- env.reset()
# every episode update the target model to be same with model
agent.update_target_model()
@@ -153,9 +155,9 @@ def save_model(self, name):
scores.append(score)
episodes.append(e)
pylab.plot(episodes, scores, 'b')
- pylab.savefig("./save_graph/Cartpole_DQN14.png")
- print("episode:", e, " score:", score, " memory length:", len(agent.memory),
- " epsilon:", agent.epsilon)
+ pylab.savefig("./save_graph/cartpole_dqn.png")
+ print("episode:", e, " score:", score, " memory length:",
+ len(agent.memory), " epsilon:", agent.epsilon)

# if the mean of scores of last 10 episode is bigger than 490
# stop training
@@ -164,4 +166,4 @@ def save_model(self, name):
# save the model
if e % 50 == 0:
- agent.save_model("./save_model/Cartpole_DQN14.h5") \ No newline at end of file
+ agent.model.save_weights("./save_model/cartpole_dqn.h5")
diff --git a/2-cartpole/1-dqn/cartpole_only_per.py b/2-cartpole/1-dqn/cartpole_only_per.py
new file mode 100644
index 00000000..1a66d86b
--- /dev/null
+++ b/2-cartpole/1-dqn/cartpole_only_per.py
@@ -0,0 +1,224 @@
+import sys
+import gym
+import pylab
+import random
+import numpy as np
+from SumTree import SumTree
+from collections import deque
+from keras.layers import Dense
+from keras.optimizers import Adam
+from keras.models import Sequential
+
+EPISODES = 300
+
+
+# DQN agent for the CartPole example
+class DQNAgent:
+ def __init__(self, state_size, action_size):
+ self.render = False
+ self.load_model = False
+
+ # define the size of state and action
+ self.state_size = state_size
+ self.action_size = action_size
+
+ # DQN hyperparameters
+ self.discount_factor = 0.99
+ self.learning_rate = 0.001
+ self.epsilon = 1.0
+ self.epsilon_decay = 0.999
+ self.epsilon_min = 0.01
+ self.batch_size = 64
+ self.train_start = 2000
+ self.memory_size = 2000
+
+ # replay memory, maximum size 2000
+ self.memory = Memory(self.memory_size)
+
+ # create main model and target model
+ self.model = self.build_model()
+ self.target_model = self.build_model()
+
+ # initialize target model
+ self.update_target_model()
+
+ if self.load_model:
+ self.model.load_weights("./save_model/cartpole_dqn_trained.h5")
+
+ # neural network with the state as input and the Q-values as output
+ def build_model(self):
+ model = Sequential()
+ model.add(Dense(24, input_dim=self.state_size, activation='relu',
+ kernel_initializer='he_uniform'))
+ model.add(Dense(24, activation='relu',
+ kernel_initializer='he_uniform'))
+ model.add(Dense(self.action_size, activation='linear',
+ kernel_initializer='he_uniform'))
+ model.summary()
+ model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
+ return model
+
+ # update the target model with the weights of the main model
+ def update_target_model(self):
+ self.target_model.set_weights(self.model.get_weights())
+
+ # select an action with the epsilon-greedy policy
+ def get_action(self, state):
+ if np.random.rand() <= self.epsilon:
+ return random.randrange(self.action_size)
+ else:
+ q_value = self.model.predict(state)
+ return np.argmax(q_value[0])
+
+ # save the sample (s, a, r, s') to the replay memory
+ def append_sample(self, state, action, reward, next_state, done):
+ if self.epsilon == 1:
+ done = True
+
+ # compute the TD-error and store it in the memory together with the sample
+ target = self.model.predict([state])
+ old_val = target[0][action]
+ target_val = self.target_model.predict([next_state])
+ if done:
+ target[0][action] = reward
+ else:
+ target[0][action] = reward + self.discount_factor * (
+ np.amax(target_val[0]))
+ error = abs(old_val - target[0][action])
+
+ self.memory.add(error, (state, action, reward, next_state, done))
+
+ # train the model with batches sampled from the replay memory
+ def train_model(self):
+ if self.epsilon > self.epsilon_min:
+ self.epsilon *= self.epsilon_decay
+
+ # sample a batch from the memory according to priority
+ mini_batch = self.memory.sample(self.batch_size)
+
+ errors = np.zeros(self.batch_size)
+ states = np.zeros((self.batch_size, self.state_size))
+ next_states = np.zeros((self.batch_size, self.state_size))
+ actions, rewards, dones = [], [], []
+
+ for i in range(self.batch_size):
+ states[i] = mini_batch[i][1][0]
+ actions.append(mini_batch[i][1][1])
+ rewards.append(mini_batch[i][1][2])
+ next_states[i] = mini_batch[i][1][3]
+ dones.append(mini_batch[i][1][4])
+
+ # Q-values of the current states from the model
+ # Q-values of the next states from the target model
+ target = self.model.predict(states)
+ target_val = self.target_model.predict(next_states)
+
+ # update targets using the Bellman optimality equation
+ for i in range(self.batch_size):
+ old_val = target[i][actions[i]]
+ if dones[i]:
+ target[i][actions[i]] = rewards[i]
+ else:
+ target[i][actions[i]] = rewards[i] + self.discount_factor * (
+ np.amax(target_val[i]))
+ # store the TD-error
+ errors[i] = abs(old_val - target[i][actions[i]])
+
+ # update the priorities with the TD-errors
+ for i in range(self.batch_size):
+ idx = mini_batch[i][0]
+ self.memory.update(idx, errors[i])
+
+ self.model.fit(states, target, batch_size=self.batch_size,
+ epochs=1, verbose=0)
+
+
+class Memory: # stored as ( s, a, r, s_ ) in SumTree
+ e = 0.01
+ a = 0.6
+
+ def __init__(self, capacity):
+ self.tree = SumTree(capacity)
+
+ def _getPriority(self, error):
+ return (error + self.e) ** self.a
+
+ def add(self, error, sample):
+ p = self._getPriority(error)
+ self.tree.add(p, sample)
+
+ def sample(self, n):
+ batch = []
+ segment = self.tree.total() / n
+
+ for i in range(n):
+ a = segment * i
+ b = segment * (i + 1)
+
+ s = random.uniform(a, b)
+ (idx, p, data) = self.tree.get(s)
+ batch.append((idx, data))
+
+ return batch
+
+ def update(self, idx, error):
+ p = self._getPriority(error)
+ self.tree.update(idx, p)
+
+
+if __name__ == "__main__":
+ # CartPole-v1 environment, maximum number of timesteps is 500
+ env = gym.make('CartPole-v1')
+ state_size = env.observation_space.shape[0]
+ action_size = env.action_space.n
+
+ # create the DQN agent
+ agent = DQNAgent(state_size, action_size)
+
+ scores, episodes = [], []
+
+ step = 0
+ for e in range(EPISODES):
+ done = False
+ score = 0
+ # initialize the env
+ state = env.reset()
+ state = np.reshape(state, [1, state_size])
+
+ while not done:
+ if agent.render:
+ env.render()
+ step += 1
+ # select an action for the current state
+ action = agent.get_action(state)
+ # take one timestep in the environment with the selected action
+ next_state, reward, done, info = env.step(action)
+ next_state = np.reshape(next_state, [1, state_size])
+ # -10 penalty if the episode ends prematurely
+ r = reward if not done or score+reward == 500 else -10
+ # save the sample to the replay memory
+ agent.append_sample(state, action, r, next_state, done)
+ # train every timestep
+ if step >= agent.train_start:
+ agent.train_model()
+
+ score += reward
+ state = next_state
+
+ if done:
+ # update the target model with the model's weights every episode
+ agent.update_target_model()
+
+# score = score if score == 500 else score + 100
+ # print the training result for each episode
+ scores.append(score)
+ episodes.append(e)
+ pylab.plot(episodes, scores, 'b')
+ pylab.savefig("./save_graph/cartpole_dqn.png")
+ print("episode:", e, " score:", score, " memory length:",
+ step if step <= agent.memory_size else agent.memory_size, " epsilon:", agent.epsilon)
+
+ # stop training if the average score of the last 10 episodes is above 490
+ if np.mean(scores[-min(10, len(scores)):]) > 490:
+ agent.model.save_weights("./save_model/cartpole_dqn.h5")
+ sys.exit()
diff --git a/Code 2. Cartpole/1. DQN/save_graph/Cartpole_DQN.png b/2-cartpole/1-dqn/save_graph/Cartpole_DQN.png
similarity index 100%
rename from Code 2. Cartpole/1. DQN/save_graph/Cartpole_DQN.png
rename to 2-cartpole/1-dqn/save_graph/Cartpole_DQN.png
diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN1.h5 b/2-cartpole/1-dqn/save_model/cartpole_dqn.h5
similarity index 100%
rename from Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN1.h5
rename to 2-cartpole/1-dqn/save_model/cartpole_dqn.h5
diff --git a/Code 2. Cartpole/2. Double DQN/Cartpole_DoubleDQN.py b/2-cartpole/2-double-dqn/cartpole_ddqn.py
similarity index 71%
rename from Code 2. Cartpole/2. Double DQN/Cartpole_DoubleDQN.py
rename to 2-cartpole/2-double-dqn/cartpole_ddqn.py
index b5feb608..73c51140 100644
--- a/Code 2. Cartpole/2.
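
In cartpole_only_per.py above, a transition's priority is (|TD error| + e) ** a, and minibatches are drawn by cutting the SumTree's total priority mass into equal segments. A short sketch of that sampling scheme follows; it is not part of the patch, it reuses the SumTree class added earlier (assuming it is importable as a module), and the TD errors are made up.

import random
from SumTree import SumTree   # the helper added in 2-cartpole/1-dqn/SumTree.py

e, a = 0.01, 0.6                        # same constants as Memory.e / Memory.a
tree = SumTree(4)

# hypothetical TD errors for four stored transitions
for td_error, transition in [(0.1, 't1'), (0.5, 't2'), (1.0, 't3'), (2.0, 't4')]:
    priority = (abs(td_error) + e) ** a
    tree.add(priority, transition)

# draw one sample per equal-probability segment, as Memory.sample() does
batch_size = 2
segment = tree.total() / batch_size
for i in range(batch_size):
    s = random.uniform(segment * i, segment * (i + 1))
    idx, priority, data = tree.get(s)
    print(idx, round(priority, 3), data)   # higher-error transitions are sampled more often
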
Double DQN/Cartpole_DoubleDQN.py +++ b/2-cartpole/2-double-dqn/cartpole_ddqn.py @@ -11,14 +11,14 @@ EPISODES = 300 -# this is Double DQN Agent for the Cartpole +# Double DQN Agent for the Cartpole # it uses Neural Network to approximate q function # and replay memory & target q network class DoubleDQNAgent: def __init__(self, state_size, action_size): # if you want to see Cartpole learning, then change to True self.render = False - + self.load_model = False # get size of state and action self.state_size = state_size self.action_size = action_size @@ -37,17 +37,23 @@ def __init__(self, state_size, action_size): # create main model and target model self.model = self.build_model() self.target_model = self.build_model() - # copy the model to target model - # --> initialize the target model so that the parameters of model & target model to be same + + # initialize target model self.update_target_model() + if self.load_model: + self.model.load_weights("./save_model/cartpole_ddqn.h5") + # approximate Q function using Neural Network # state is input and Q Value of each action is output of network def build_model(self): model = Sequential() - model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform')) - model.add(Dense(24, activation='relu', kernel_initializer='he_uniform')) - model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform')) + model.add(Dense(24, input_dim=self.state_size, activation='relu', + kernel_initializer='he_uniform')) + model.add(Dense(24, activation='relu', + kernel_initializer='he_uniform')) + model.add(Dense(self.action_size, activation='linear', + kernel_initializer='he_uniform')) model.summary() model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) return model @@ -65,54 +71,54 @@ def get_action(self, state): return np.argmax(q_value[0]) # save sample to the replay memory - def replay_memory(self, state, action, reward, next_state, done): + def append_sample(self, state, action, reward, next_state, done): self.memory.append((state, action, reward, next_state, done)) if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay # pick samples randomly from replay memory (with batch_size) - def train_replay(self): + def train_model(self): if len(self.memory) < self.train_start: return batch_size = min(self.batch_size, len(self.memory)) mini_batch = random.sample(self.memory, batch_size) update_input = np.zeros((batch_size, self.state_size)) - update_target = np.zeros((batch_size, self.action_size)) + update_target = np.zeros((batch_size, self.state_size)) + action, reward, done = [], [], [] for i in range(batch_size): - state, action, reward, next_state, done = mini_batch[i] - target = self.model.predict(state)[0] + update_input[i] = mini_batch[i][0] + action.append(mini_batch[i][1]) + reward.append(mini_batch[i][2]) + update_target[i] = mini_batch[i][3] + done.append(mini_batch[i][4]) + + target = self.model.predict(update_input) + target_next = self.model.predict(update_target) + target_val = self.target_model.predict(update_target) + for i in range(self.batch_size): # like Q Learning, get maximum Q value at s' # But from target model - if done: - target[action] = reward + if done[i]: + target[i][action[i]] = reward[i] else: # the key point of Double DQN # selection of action is from model # update is from target model - a = np.argmax(self.model.predict(next_state)[0]) - target[action] = reward + self.discount_factor * \ - (self.target_model.predict(next_state)[0][a]) - update_input[i] 
= state - update_target[i] = target + a = np.argmax(target_next[i]) + target[i][action[i]] = reward[i] + self.discount_factor * ( + target_val[i][a]) # make minibatch which includes target q value and predicted q value # and do the model fit! - self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0) - - # load the saved model - def load_model(self, name): - self.model.load_weights(name) - - # save the model which is under training - def save_model(self, name): - self.model.save_weights(name) + self.model.fit(update_input, target, batch_size=self.batch_size, + epochs=1, verbose=0) if __name__ == "__main__": - # in case of CartPole-v1, you can play until 500 time step + # In case of CartPole-v1, you can play until 500 time step env = gym.make('CartPole-v1') # get size of state and action from environment state_size = env.observation_space.shape[0] @@ -127,7 +133,6 @@ def save_model(self, name): score = 0 state = env.reset() state = np.reshape(state, [1, state_size]) - # agent.load_model("./save_model/cartpole-master.h5") while not done: if agent.render: @@ -141,14 +146,13 @@ def save_model(self, name): reward = reward if not done or score == 499 else -100 # save the sample to the replay memory - agent.replay_memory(state, action, reward, next_state, done) + agent.append_sample(state, action, reward, next_state, done) # every time step do the training - agent.train_replay() + agent.train_model() score += reward state = next_state if done: - env.reset() # every episode update the target model to be same with model agent.update_target_model() @@ -157,9 +161,9 @@ def save_model(self, name): scores.append(score) episodes.append(e) pylab.plot(episodes, scores, 'b') - pylab.savefig("./save_graph/Cartpole_DoubleDQN.png") - print("episode:", e, " score:", score, " memory length:", len(agent.memory), - " epsilon:", agent.epsilon) + pylab.savefig("./save_graph/cartpole_ddqn.png") + print("episode:", e, " score:", score, " memory length:", + len(agent.memory), " epsilon:", agent.epsilon) # if the mean of scores of last 10 episode is bigger than 490 # stop training @@ -168,4 +172,4 @@ def save_model(self, name): # save the model if e % 50 == 0: - agent.save_model("./save_model/Cartpole_DoubleDQN.h5") \ No newline at end of file + agent.model.save_weights("./save_model/cartpole_ddqn.h5") diff --git a/2-cartpole/2-double-dqn/save_graph/cartpole_ddqn.png b/2-cartpole/2-double-dqn/save_graph/cartpole_ddqn.png new file mode 100644 index 00000000..26c4fed0 Binary files /dev/null and b/2-cartpole/2-double-dqn/save_graph/cartpole_ddqn.png differ diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN14.h5 b/2-cartpole/2-double-dqn/save_model/cartpole_ddqn.h5 similarity index 73% rename from Code 2. Cartpole/1. DQN/save_model/Cartpole_DQN14.h5 rename to 2-cartpole/2-double-dqn/save_model/cartpole_ddqn.h5 index d4d4bcd5..c54c9886 100644 Binary files a/Code 2. Cartpole/1. 
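
The Double DQN update above selects the bootstrap action with the online network but evaluates it with the target network. A small numpy illustration of how the two targets can differ, with invented Q-values (not part of the patch):

import numpy as np

gamma = 0.99
reward = 1.0
q_online_next = np.array([1.4, 0.9])    # hypothetical Q(s', .) from the online model
q_target_next = np.array([1.1, 1.2])    # hypothetical Q(s', .) from the target model

# vanilla DQN target: max over the target network's own estimates
dqn_target = reward + gamma * np.max(q_target_next)

# Double DQN target: argmax from the online network, value from the target network
a = np.argmax(q_online_next)
double_dqn_target = reward + gamma * q_target_next[a]

print(dqn_target, double_dqn_target)    # 2.188 vs. 2.089 with these numbers
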
DQN/save_model/Cartpole_DQN14.h5 and b/2-cartpole/2-double-dqn/save_model/cartpole_ddqn.h5 differ diff --git a/2-cartpole/3-reinforce/cartpole_reinforce.py b/2-cartpole/3-reinforce/cartpole_reinforce.py new file mode 100644 index 00000000..040234d1 --- /dev/null +++ b/2-cartpole/3-reinforce/cartpole_reinforce.py @@ -0,0 +1,146 @@ +import sys +import gym +import pylab +import numpy as np +from keras.layers import Dense +from keras.models import Sequential +from keras.optimizers import Adam + +EPISODES = 1000 + + +# This is Policy Gradient agent for the Cartpole +# In this example, we use REINFORCE algorithm which uses monte-carlo update rule +class REINFORCEAgent: + def __init__(self, state_size, action_size): + # if you want to see Cartpole learning, then change to True + self.render = False + self.load_model = False + # get size of state and action + self.state_size = state_size + self.action_size = action_size + + # These are hyper parameters for the Policy Gradient + self.discount_factor = 0.99 + self.learning_rate = 0.001 + self.hidden1, self.hidden2 = 24, 24 + + # create model for policy network + self.model = self.build_model() + + # lists for the states, actions and rewards + self.states, self.actions, self.rewards = [], [], [] + + if self.load_model: + self.model.load_weights("./save_model/cartpole_reinforce.h5") + + # approximate policy using Neural Network + # state is input and probability of each action is output of network + def build_model(self): + model = Sequential() + model.add(Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')) + model.add(Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform')) + model.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')) + model.summary() + # Using categorical crossentropy as a loss is a trick to easily + # implement the policy gradient. Categorical cross entropy is defined + # H(p, q) = sum(p_i * log(q_i)). For the action taken, a, you set + # p_a = advantage. q_a is the output of the policy network, which is + # the probability of taking the action a, i.e. policy(s, a). + # All other p_i are zero, thus we have H(p, q) = A * log(policy(s, a)) + model.compile(loss="categorical_crossentropy", optimizer=Adam(lr=self.learning_rate)) + return model + + # using the output of policy network, pick action stochastically + def get_action(self, state): + policy = self.model.predict(state, batch_size=1).flatten() + return np.random.choice(self.action_size, 1, p=policy)[0] + + # In Policy Gradient, Q function is not available. 
+ # Instead agent uses sample returns for evaluating policy + def discount_rewards(self, rewards): + discounted_rewards = np.zeros_like(rewards) + running_add = 0 + for t in reversed(range(0, len(rewards))): + running_add = running_add * self.discount_factor + rewards[t] + discounted_rewards[t] = running_add + return discounted_rewards + + # save of each step + def append_sample(self, state, action, reward): + self.states.append(state) + self.rewards.append(reward) + self.actions.append(action) + + # update policy network every episode + def train_model(self): + episode_length = len(self.states) + + discounted_rewards = self.discount_rewards(self.rewards) + discounted_rewards -= np.mean(discounted_rewards) + discounted_rewards /= np.std(discounted_rewards) + + update_inputs = np.zeros((episode_length, self.state_size)) + advantages = np.zeros((episode_length, self.action_size)) + + for i in range(episode_length): + update_inputs[i] = self.states[i] + advantages[i][self.actions[i]] = discounted_rewards[i] + + self.model.fit(update_inputs, advantages, epochs=1, verbose=0) + self.states, self.actions, self.rewards = [], [], [] + +if __name__ == "__main__": + # In case of CartPole-v1, you can play until 500 time step + env = gym.make('CartPole-v1') + # get size of state and action from environment + state_size = env.observation_space.shape[0] + action_size = env.action_space.n + + # make REINFORCE agent + agent = REINFORCEAgent(state_size, action_size) + + scores, episodes = [], [] + + for e in range(EPISODES): + done = False + score = 0 + state = env.reset() + state = np.reshape(state, [1, state_size]) + + while not done: + if agent.render: + env.render() + + # get action for the current state and go one step in environment + action = agent.get_action(state) + next_state, reward, done, info = env.step(action) + next_state = np.reshape(next_state, [1, state_size]) + reward = reward if not done or score == 499 else -100 + + # save the sample to the memory + agent.append_sample(state, action, reward) + + score += reward + state = next_state + + if done: + # every episode, agent learns from sample returns + agent.train_model() + + # every episode, plot the play time + score = score if score == 500 else score + 100 + scores.append(score) + episodes.append(e) + pylab.plot(episodes, scores, 'b') + pylab.savefig("./save_graph/cartpole_reinforce.png") + print("episode:", e, " score:", score) + + # if the mean of scores of last 10 episode is bigger than 490 + # stop training + if np.mean(scores[-min(10, len(scores)):]) > 490: + sys.exit() + + # save the model + if e % 50 == 0: + agent.model.save_weights("./save_model/cartpole_reinforce.h5") diff --git a/2-cartpole/3-reinforce/save_graph/cartpole_reinforce.png b/2-cartpole/3-reinforce/save_graph/cartpole_reinforce.png new file mode 100644 index 00000000..dce280f2 Binary files /dev/null and b/2-cartpole/3-reinforce/save_graph/cartpole_reinforce.png differ diff --git a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DQN14.h5 b/2-cartpole/3-reinforce/save_model/cartpole_reinforce.h5 similarity index 67% rename from Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DQN14.h5 rename to 2-cartpole/3-reinforce/save_model/cartpole_reinforce.h5 index 1fc158bf..18fb216b 100644 Binary files a/Code 2. Cartpole/2. 
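
The comment in build_model() of cartpole_reinforce.py explains that fitting with categorical crossentropy against a one-hot label scaled by the advantage reproduces the REINFORCE term -A * log(pi(a|s)). A quick numeric check of that equivalence, not part of the patch; the policy vector and advantage below are invented:

import numpy as np

policy = np.array([0.2, 0.5, 0.3])      # hypothetical softmax output pi(.|s)
action = 1                              # action actually taken
advantage = 2.0                         # hypothetical discounted return / advantage

# label used by train_model(): one-hot on the taken action, scaled by the advantage
label = np.zeros_like(policy)
label[action] = advantage

# categorical crossentropy H(p, q) = -sum(p_i * log(q_i)) with p = label
crossentropy = -np.sum(label * np.log(policy))

# REINFORCE objective term for this sample
reinforce_loss = -advantage * np.log(policy[action])

print(crossentropy, reinforce_loss)     # identical, so model.fit() performs the PG update
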
Double DQN/save_model/Cartpole_DQN14.h5 and b/2-cartpole/3-reinforce/save_model/cartpole_reinforce.h5 differ diff --git a/2-cartpole/4-actor-critic/cartpole_a2c.py b/2-cartpole/4-actor-critic/cartpole_a2c.py new file mode 100644 index 00000000..fa6310a3 --- /dev/null +++ b/2-cartpole/4-actor-critic/cartpole_a2c.py @@ -0,0 +1,135 @@ +import sys +import gym +import pylab +import numpy as np +from keras.layers import Dense +from keras.models import Sequential +from keras.optimizers import Adam + +EPISODES = 1000 + + +# A2C(Advantage Actor-Critic) agent for the Cartpole +class A2CAgent: + def __init__(self, state_size, action_size): + # if you want to see Cartpole learning, then change to True + self.render = False + self.load_model = False + # get size of state and action + self.state_size = state_size + self.action_size = action_size + self.value_size = 1 + + # These are hyper parameters for the Policy Gradient + self.discount_factor = 0.99 + self.actor_lr = 0.001 + self.critic_lr = 0.005 + + # create model for policy network + self.actor = self.build_actor() + self.critic = self.build_critic() + + if self.load_model: + self.actor.load_weights("./save_model/cartpole_actor.h5") + self.critic.load_weights("./save_model/cartpole_critic.h5") + + # approximate policy and value using Neural Network + # actor: state is input and probability of each action is output of model + def build_actor(self): + actor = Sequential() + actor.add(Dense(24, input_dim=self.state_size, activation='relu', + kernel_initializer='he_uniform')) + actor.add(Dense(self.action_size, activation='softmax', + kernel_initializer='he_uniform')) + actor.summary() + # See note regarding crossentropy in cartpole_reinforce.py + actor.compile(loss='categorical_crossentropy', + optimizer=Adam(lr=self.actor_lr)) + return actor + + # critic: state is input and value of state is output of model + def build_critic(self): + critic = Sequential() + critic.add(Dense(24, input_dim=self.state_size, activation='relu', + kernel_initializer='he_uniform')) + critic.add(Dense(self.value_size, activation='linear', + kernel_initializer='he_uniform')) + critic.summary() + critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr)) + return critic + + # using the output of policy network, pick action stochastically + def get_action(self, state): + policy = self.actor.predict(state, batch_size=1).flatten() + return np.random.choice(self.action_size, 1, p=policy)[0] + + # update policy network every episode + def train_model(self, state, action, reward, next_state, done): + target = np.zeros((1, self.value_size)) + advantages = np.zeros((1, self.action_size)) + + value = self.critic.predict(state)[0] + next_value = self.critic.predict(next_state)[0] + + if done: + advantages[0][action] = reward - value + target[0][0] = reward + else: + advantages[0][action] = reward + self.discount_factor * (next_value) - value + target[0][0] = reward + self.discount_factor * next_value + + self.actor.fit(state, advantages, epochs=1, verbose=0) + self.critic.fit(state, target, epochs=1, verbose=0) + + +if __name__ == "__main__": + # In case of CartPole-v1, maximum length of episode is 500 + env = gym.make('CartPole-v1') + # get size of state and action from environment + state_size = env.observation_space.shape[0] + action_size = env.action_space.n + + # make A2C agent + agent = A2CAgent(state_size, action_size) + + scores, episodes = [], [] + + for e in range(EPISODES): + done = False + score = 0 + state = env.reset() + state = np.reshape(state, [1, state_size]) + + 
while not done: + if agent.render: + env.render() + + action = agent.get_action(state) + next_state, reward, done, info = env.step(action) + next_state = np.reshape(next_state, [1, state_size]) + # if an action make the episode end, then gives penalty of -100 + reward = reward if not done or score == 499 else -100 + + agent.train_model(state, action, reward, next_state, done) + + score += reward + state = next_state + + if done: + # every episode, plot the play time + score = score if score == 500.0 else score + 100 + scores.append(score) + episodes.append(e) + pylab.plot(episodes, scores, 'b') + pylab.savefig("./save_graph/cartpole_a2c.png") + print("episode:", e, " score:", score) + + # if the mean of scores of last 10 episode is bigger than 490 + # stop training + if np.mean(scores[-min(10, len(scores)):]) > 490: + sys.exit() + + # save the model + if e % 50 == 0: + agent.actor.save_weights("./save_model/cartpole_actor.h5") + agent.critic.save_weights("./save_model/cartpole_critic.h5") diff --git a/2-cartpole/4-actor-critic/save_graph/cartpole_a2c.png b/2-cartpole/4-actor-critic/save_graph/cartpole_a2c.png new file mode 100644 index 00000000..aedc6c4c Binary files /dev/null and b/2-cartpole/4-actor-critic/save_graph/cartpole_a2c.png differ diff --git a/Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_PG.h5 b/2-cartpole/4-actor-critic/save_model/cartpole_actor.h5 similarity index 59% rename from Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_PG.h5 rename to 2-cartpole/4-actor-critic/save_model/cartpole_actor.h5 index 24f6b0cf..38b40bba 100644 Binary files a/Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_PG.h5 and b/2-cartpole/4-actor-critic/save_model/cartpole_actor.h5 differ diff --git a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_ActorCritic.h5 b/2-cartpole/4-actor-critic/save_model/cartpole_critic.h5 similarity index 56% rename from Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_ActorCritic.h5 rename to 2-cartpole/4-actor-critic/save_model/cartpole_critic.h5 index 1146b18e..4cea5ef1 100644 Binary files a/Code 2. Cartpole/5. 
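
train_model() in cartpole_a2c.py builds a one-step TD advantage for the actor and a bootstrapped value target for the critic. The same arithmetic written out with hypothetical numbers (a sketch, not part of the patch):

gamma = 0.99
reward = 1.0
value = 0.8          # hypothetical critic estimate V(s)
next_value = 1.1     # hypothetical critic estimate V(s')
done = False

if done:
    advantage = reward - value                        # terminal step: no bootstrap
    critic_target = reward
else:
    advantage = reward + gamma * next_value - value   # one-step TD advantage
    critic_target = reward + gamma * next_value

# the actor is fit toward a one-hot vector scaled by this advantage,
# the critic is fit toward critic_target with an MSE loss
print(round(advantage, 3), round(critic_target, 3))   # 1.289 and 2.089
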
Actor-Critic/save_model/Cartpole_ActorCritic.h5 and b/2-cartpole/4-actor-critic/save_model/cartpole_critic.h5 differ diff --git a/2-cartpole/5-a3c/cartpole_a3c.py b/2-cartpole/5-a3c/cartpole_a3c.py new file mode 100644 index 00000000..f2721849 --- /dev/null +++ b/2-cartpole/5-a3c/cartpole_a3c.py @@ -0,0 +1,223 @@ +import threading +import numpy as np +import tensorflow as tf +import pylab +import time +import gym +from keras.layers import Dense, Input +from keras.models import Model +from keras.optimizers import Adam +from keras import backend as K + + +# global variables for threading +episode = 0 +scores = [] + +EPISODES = 2000 + +# This is A3C(Asynchronous Advantage Actor Critic) agent(global) for the Cartpole +# In this example, we use A3C algorithm +class A3CAgent: + def __init__(self, state_size, action_size, env_name): + # get size of state and action + self.state_size = state_size + self.action_size = action_size + + # get gym environment name + self.env_name = env_name + + # these are hyper parameters for the A3C + self.actor_lr = 0.001 + self.critic_lr = 0.001 + self.discount_factor = .99 + self.hidden1, self.hidden2 = 24, 24 + self.threads = 8 + + # create model for actor and critic network + self.actor, self.critic = self.build_model() + + # method for training actor and critic network + self.optimizer = [self.actor_optimizer(), self.critic_optimizer()] + + self.sess = tf.InteractiveSession() + K.set_session(self.sess) + self.sess.run(tf.global_variables_initializer()) + + # approximate policy and value using Neural Network + # actor -> state is input and probability of each action is output of network + # critic -> state is input and value of state is output of network + # actor and critic network share first hidden layer + def build_model(self): + state = Input(batch_shape=(None, self.state_size)) + shared = Dense(self.hidden1, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')(state) + + actor_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='glorot_uniform')(shared) + action_prob = Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')(actor_hidden) + + value_hidden = Dense(self.hidden2, activation='relu', kernel_initializer='he_uniform')(shared) + state_value = Dense(1, activation='linear', kernel_initializer='he_uniform')(value_hidden) + + actor = Model(inputs=state, outputs=action_prob) + critic = Model(inputs=state, outputs=state_value) + + actor._make_predict_function() + critic._make_predict_function() + + actor.summary() + critic.summary() + + return actor, critic + + # make loss function for Policy Gradient + # [log(action probability) * advantages] will be input for the back prop + # we add entropy of action probability to loss + def actor_optimizer(self): + action = K.placeholder(shape=(None, self.action_size)) + advantages = K.placeholder(shape=(None, )) + + policy = self.actor.output + + good_prob = K.sum(action * policy, axis=1) + eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages) + loss = -K.sum(eligibility) + + entropy = K.sum(policy * K.log(policy + 1e-10), axis=1) + + actor_loss = loss + 0.01*entropy + + optimizer = Adam(lr=self.actor_lr) + updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss) + train = K.function([self.actor.input, action, advantages], [], updates=updates) + return train + + # make loss function for Value approximation + def critic_optimizer(self): + discounted_reward = K.placeholder(shape=(None, )) + + value = 
self.critic.output + + loss = K.mean(K.square(discounted_reward - value)) + + optimizer = Adam(lr=self.critic_lr) + updates = optimizer.get_updates(self.critic.trainable_weights, [], loss) + train = K.function([self.critic.input, discounted_reward], [], updates=updates) + return train + + # make agents(local) and start training + def train(self): + # self.load_model('./save_model/cartpole_a3c.h5') + agents = [Agent(i, self.actor, self.critic, self.optimizer, self.env_name, self.discount_factor, + self.action_size, self.state_size) for i in range(self.threads)] + + for agent in agents: + agent.start() + + while True: + time.sleep(20) + + plot = scores[:] + pylab.plot(range(len(plot)), plot, 'b') + pylab.savefig("./save_graph/cartpole_a3c.png") + + self.save_model('./save_model/cartpole_a3c.h5') + + def save_model(self, name): + self.actor.save_weights(name + "_actor.h5") + self.critic.save_weights(name + "_critic.h5") + + def load_model(self, name): + self.actor.load_weights(name + "_actor.h5") + self.critic.load_weights(name + "_critic.h5") + +# This is Agent(local) class for threading +class Agent(threading.Thread): + def __init__(self, index, actor, critic, optimizer, env_name, discount_factor, action_size, state_size): + threading.Thread.__init__(self) + + self.states = [] + self.rewards = [] + self.actions = [] + + self.index = index + self.actor = actor + self.critic = critic + self.optimizer = optimizer + self.env_name = env_name + self.discount_factor = discount_factor + self.action_size = action_size + self.state_size = state_size + + # Thread interactive with environment + def run(self): + global episode + env = gym.make(self.env_name) + while episode < EPISODES: + state = env.reset() + score = 0 + while True: + action = self.get_action(state) + next_state, reward, done, _ = env.step(action) + score += reward + + self.memory(state, action, reward) + + state = next_state + + if done: + episode += 1 + print("episode: ", episode, "/ score : ", score) + scores.append(score) + self.train_episode(score != 500) + break + + # In Policy Gradient, Q function is not available. 
+ # Instead agent uses sample returns for evaluating policy + def discount_rewards(self, rewards, done=True): + discounted_rewards = np.zeros_like(rewards) + running_add = 0 + if not done: + running_add = self.critic.predict(np.reshape(self.states[-1], (1, self.state_size)))[0] + for t in reversed(range(0, len(rewards))): + running_add = running_add * self.discount_factor + rewards[t] + discounted_rewards[t] = running_add + return discounted_rewards + + # save of each step + # this is used for calculating discounted rewards + def memory(self, state, action, reward): + self.states.append(state) + act = np.zeros(self.action_size) + act[action] = 1 + self.actions.append(act) + self.rewards.append(reward) + + # update policy network and value network every episode + def train_episode(self, done): + discounted_rewards = self.discount_rewards(self.rewards, done) + + values = self.critic.predict(np.array(self.states)) + values = np.reshape(values, len(values)) + + advantages = discounted_rewards - values + + self.optimizer[0]([self.states, self.actions, advantages]) + self.optimizer[1]([self.states, discounted_rewards]) + self.states, self.actions, self.rewards = [], [], [] + + def get_action(self, state): + policy = self.actor.predict(np.reshape(state, [1, self.state_size]))[0] + return np.random.choice(self.action_size, 1, p=policy)[0] + + +if __name__ == "__main__": + env_name = 'CartPole-v1' + env = gym.make(env_name) + + state_size = env.observation_space.shape[0] + action_size = env.action_space.n + + env.close() + + global_agent = A3CAgent(state_size, action_size, env_name) + global_agent.train() diff --git a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN2.h5 b/2-cartpole/5-a3c/save_model/Cartpole_A3C_actor.h5 similarity index 58% rename from Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN2.h5 rename to 2-cartpole/5-a3c/save_model/Cartpole_A3C_actor.h5 index c72bcd68..33ab03a5 100644 Binary files a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN2.h5 and b/2-cartpole/5-a3c/save_model/Cartpole_A3C_actor.h5 differ diff --git a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Critic.h5 b/2-cartpole/5-a3c/save_model/Cartpole_A3C_critic.h5 similarity index 57% rename from Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Critic.h5 rename to 2-cartpole/5-a3c/save_model/Cartpole_A3C_critic.h5 index 6ef1da98..5db01072 100644 Binary files a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Critic.h5 and b/2-cartpole/5-a3c/save_model/Cartpole_A3C_critic.h5 differ diff --git a/Code 2. Cartpole/LICENSE b/2-cartpole/LICENSE similarity index 100% rename from Code 2. Cartpole/LICENSE rename to 2-cartpole/LICENSE diff --git a/Code 2. Cartpole/README.md b/2-cartpole/README.md similarity index 65% rename from Code 2. Cartpole/README.md rename to 2-cartpole/README.md index 6882e016..1d8d8701 100644 --- a/Code 2. Cartpole/README.md +++ b/2-cartpole/README.md @@ -15,15 +15,10 @@ This is graph of Double DQN algorithm

-[image: Double DQN training graph]
-
-This is graph of Dueling DQN algorithm (This doesn't work at all...)
-
-[image: Dueling DQN training graph]
-
 This is graph of Policy Gradient algorithm
-[image: Policy Gradient training graph, old path]
+[image: Policy Gradient training graph, new path]
 
 This is graph of Actor Critic algorithm
-[image: Actor Critic training graph, old path]
\ No newline at end of file
+[image: Actor Critic training graph, new path]

\ No newline at end of file diff --git a/Code 2. Cartpole/cartpole.png b/2-cartpole/cartpole.png similarity index 100% rename from Code 2. Cartpole/cartpole.png rename to 2-cartpole/cartpole.png diff --git a/3-atari/1-breakout/breakout_a3c.py b/3-atari/1-breakout/breakout_a3c.py new file mode 100644 index 00000000..be339e8e --- /dev/null +++ b/3-atari/1-breakout/breakout_a3c.py @@ -0,0 +1,351 @@ +import gym +import time +import random +import threading +import numpy as np +import tensorflow as tf +from skimage.color import rgb2gray +from skimage.transform import resize +from keras.models import Model +from keras.optimizers import RMSprop +from keras.layers import Dense, Flatten, Input +from keras.layers.convolutional import Conv2D +from keras import backend as K + +# global variables for A3C +global episode +episode = 0 +EPISODES = 8000000 +# In case of BreakoutDeterministic-v3, always skip 4 frames +# Deterministic-v4 version use 4 actions +env_name = "BreakoutDeterministic-v4" + +# This is A3C(Asynchronous Advantage Actor Critic) agent(global) for the Cartpole +# In this example, we use A3C algorithm +class A3CAgent: + def __init__(self, action_size): + # environment settings + self.state_size = (84, 84, 4) + self.action_size = action_size + + self.discount_factor = 0.99 + self.no_op_steps = 30 + + # optimizer parameters + self.actor_lr = 2.5e-4 + self.critic_lr = 2.5e-4 + self.threads = 8 + + # create model for actor and critic network + self.actor, self.critic = self.build_model() + + # method for training actor and critic network + self.optimizer = [self.actor_optimizer(), self.critic_optimizer()] + + self.sess = tf.InteractiveSession() + K.set_session(self.sess) + self.sess.run(tf.global_variables_initializer()) + + self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary() + self.summary_writer = tf.summary.FileWriter('summary/breakout_a3c', self.sess.graph) + + def train(self): + # self.load_model("./save_model/breakout_a3c") + agents = [Agent(self.action_size, self.state_size, [self.actor, self.critic], self.sess, self.optimizer, + self.discount_factor, [self.summary_op, self.summary_placeholders, + self.update_ops, self.summary_writer]) for _ in range(self.threads)] + + for agent in agents: + time.sleep(1) + agent.start() + + while True: + time.sleep(60*10) + self.save_model("./save_model/breakout_a3c") + + # approximate policy and value using Neural Network + # actor -> state is input and probability of each action is output of network + # critic -> state is input and value of state is output of network + # actor and critic network share first hidden layer + def build_model(self): + input = Input(shape=self.state_size) + conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input) + conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv) + conv = Flatten()(conv) + fc = Dense(256, activation='relu')(conv) + policy = Dense(self.action_size, activation='softmax')(fc) + value = Dense(1, activation='linear')(fc) + + actor = Model(inputs=input, outputs=policy) + critic = Model(inputs=input, outputs=value) + + actor._make_predict_function() + critic._make_predict_function() + + actor.summary() + critic.summary() + + return actor, critic + + # make loss function for Policy Gradient + # [log(action probability) * advantages] will be input for the back prop + # we add entropy of action probability to loss + def actor_optimizer(self): + action = K.placeholder(shape=[None, self.action_size]) + advantages = K.placeholder(shape=[None, ]) + + policy = 
self.actor.output + + good_prob = K.sum(action * policy, axis=1) + eligibility = K.log(good_prob + 1e-10) * advantages + actor_loss = -K.sum(eligibility) + + entropy = K.sum(policy * K.log(policy + 1e-10), axis=1) + entropy = K.sum(entropy) + + loss = actor_loss + 0.01*entropy + optimizer = RMSprop(lr=self.actor_lr, rho=0.99, epsilon=0.01) + updates = optimizer.get_updates(self.actor.trainable_weights, [], loss) + train = K.function([self.actor.input, action, advantages], [loss], updates=updates) + + return train + + # make loss function for Value approximation + def critic_optimizer(self): + discounted_reward = K.placeholder(shape=(None, )) + + value = self.critic.output + + loss = K.mean(K.square(discounted_reward - value)) + + optimizer = RMSprop(lr=self.critic_lr, rho=0.99, epsilon=0.01) + updates = optimizer.get_updates(self.critic.trainable_weights, [], loss) + train = K.function([self.critic.input, discounted_reward], [loss], updates=updates) + return train + + def load_model(self, name): + self.actor.load_weights(name + "_actor.h5") + self.critic.load_weights(name + "_critic.h5") + + def save_model(self, name): + self.actor.save_weights(name + "_actor.h5") + self.critic.save_weights(name + '_critic.h5') + + # make summary operators for tensorboard + def setup_summary(self): + episode_total_reward = tf.Variable(0.) + episode_avg_max_q = tf.Variable(0.) + episode_duration = tf.Variable(0.) + + tf.summary.scalar('Total Reward/Episode', episode_total_reward) + tf.summary.scalar('Average Max Prob/Episode', episode_avg_max_q) + tf.summary.scalar('Duration/Episode', episode_duration) + + summary_vars = [episode_total_reward, episode_avg_max_q, episode_duration] + summary_placeholders = [tf.placeholder(tf.float32) for _ in range(len(summary_vars))] + update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))] + summary_op = tf.summary.merge_all() + return summary_placeholders, update_ops, summary_op + +# make agents(local) and start training +class Agent(threading.Thread): + def __init__(self, action_size, state_size, model, sess, optimizer, discount_factor, summary_ops): + threading.Thread.__init__(self) + + self.action_size = action_size + self.state_size = state_size + self.actor, self.critic = model + self.sess = sess + self.optimizer = optimizer + self.discount_factor = discount_factor + self.summary_op, self.summary_placeholders, self.update_ops, self.summary_writer = summary_ops + + self.states, self.actions, self.rewards = [],[],[] + + self.local_actor, self.local_critic = self.build_localmodel() + + self.avg_p_max = 0 + self.avg_loss = 0 + + # t_max -> max batch size for training + self.t_max = 20 + self.t = 0 + + # Thread interactive with environment + def run(self): + # self.load_model('./save_model/breakout_a3c') + global episode + + env = gym.make(env_name) + + step = 0 + + while episode < EPISODES: + done = False + dead = False + # 1 episode = 5 lives + score, start_life = 0, 5 + observe = env.reset() + next_observe = observe + + # this is one of DeepMind's idea. + # just do nothing at the start of episode to avoid sub-optimal + for _ in range(random.randint(1, 30)): + observe = next_observe + next_observe, _, _, _ = env.step(1) + + # At start of episode, there is no preceding frame. 
So just copy initial states to make history + state = pre_processing(next_observe, observe) + history = np.stack((state, state, state, state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + step += 1 + self.t += 1 + observe = next_observe + # get action for the current history and go one step in environment + action, policy = self.get_action(history) + # change action to real_action + if action == 0: real_action = 1 + elif action == 1: real_action = 2 + else: real_action = 3 + + if dead: + action = 0 + real_action = 1 + dead = False + + next_observe, reward, done, info = env.step(real_action) + # pre-process the observation --> history + next_state = pre_processing(next_observe, observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + self.avg_p_max += np.amax(self.actor.predict(np.float32(history / 255.))) + + # if the ball is fall, then the agent is dead --> episode is not over + if start_life > info['ale.lives']: + dead = True + start_life = info['ale.lives'] + + score += reward + reward = np.clip(reward, -1., 1.) + + # save the sample to the replay memory + self.memory(history, action, reward) + + # if agent is dead, then reset the history + if dead: + history = np.stack((next_state, next_state, next_state, next_state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + else: + history = next_history + + # + if self.t >= self.t_max or done: + self.train_model(done) + self.update_localmodel() + self.t = 0 + + # if done, plot the score over episodes + if done: + episode += 1 + print("episode:", episode, " score:", score, " step:", step) + + stats = [score, self.avg_p_max / float(step), + step] + for i in range(len(stats)): + self.sess.run(self.update_ops[i], feed_dict={ + self.summary_placeholders[i]: float(stats[i]) + }) + summary_str = self.sess.run(self.summary_op) + self.summary_writer.add_summary(summary_str, episode + 1) + self.avg_p_max = 0 + self.avg_loss = 0 + step = 0 + + # In Policy Gradient, Q function is not available. + # Instead agent uses sample returns for evaluating policy + def discount_rewards(self, rewards, done): + discounted_rewards = np.zeros_like(rewards) + running_add = 0 + if not done: + running_add = self.critic.predict(np.float32(self.states[-1] / 255.))[0] + for t in reversed(range(0, len(rewards))): + running_add = running_add * self.discount_factor + rewards[t] + discounted_rewards[t] = running_add + return discounted_rewards + + # update policy network and value network every episode + def train_model(self, done): + discounted_rewards = self.discount_rewards(self.rewards, done) + + states = np.zeros((len(self.states), 84, 84, 4)) + for i in range(len(self.states)): + states[i] = self.states[i] + + states = np.float32(states / 255.) 
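+        # What follows is the core A3C update: the critic estimates V(s) for every
+        # state of the rollout, the (bootstrapped) n-step returns R come from
+        # discount_rewards() above, and the advantage A = R - V(s) weights the
+        # policy-gradient step applied by optimizer[0], while optimizer[1]
+        # regresses the critic's value estimates toward R.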
+ + values = self.critic.predict(states) + values = np.reshape(values, len(values)) + + advantages = discounted_rewards - values + + self.optimizer[0]([states, self.actions, advantages]) + self.optimizer[1]([states, discounted_rewards]) + self.states, self.actions, self.rewards = [], [], [] + + def build_localmodel(self): + input = Input(shape=self.state_size) + conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input) + conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv) + conv = Flatten()(conv) + fc = Dense(256, activation='relu')(conv) + policy = Dense(self.action_size, activation='softmax')(fc) + value = Dense(1, activation='linear')(fc) + + actor = Model(inputs=input, outputs=policy) + critic = Model(inputs=input, outputs=value) + + actor._make_predict_function() + critic._make_predict_function() + + actor.set_weights(self.actor.get_weights()) + critic.set_weights(self.critic.get_weights()) + + actor.summary() + critic.summary() + + return actor, critic + + def update_localmodel(self): + self.local_actor.set_weights(self.actor.get_weights()) + self.local_critic.set_weights(self.critic.get_weights()) + + def get_action(self, history): + history = np.float32(history / 255.) + policy = self.local_actor.predict(history)[0] + action_index = np.random.choice(self.action_size, 1, p=policy)[0] + return action_index, policy + + # save of each step + # this is used for calculating discounted rewards + def memory(self, history, action, reward): + self.states.append(history) + act = np.zeros(self.action_size) + act[action] = 1 + self.actions.append(act) + self.rewards.append(reward) + + +# 210*160*3(color) --> 84*84(mono) +# float --> integer (to reduce the size of replay memory) +def pre_processing(next_observe, observe): + processed_observe = np.maximum(next_observe, observe) + processed_observe = np.uint8(resize(rgb2gray(processed_observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + global_agent = A3CAgent(action_size=3) + global_agent.train() diff --git a/3-atari/1-breakout/breakout_ddqn.py b/3-atari/1-breakout/breakout_ddqn.py new file mode 100644 index 00000000..f9f0a5ed --- /dev/null +++ b/3-atari/1-breakout/breakout_ddqn.py @@ -0,0 +1,274 @@ +import gym +import random +import numpy as np +import tensorflow as tf +from collections import deque +from skimage.color import rgb2gray +from skimage.transform import resize +from keras.models import Sequential +from keras.optimizers import RMSprop +from keras.layers import Dense, Flatten +from keras.layers.convolutional import Conv2D +from keras import backend as K + +EPISODES = 50000 + + +class DDQNAgent: + def __init__(self, action_size): + self.render = False + self.load_model = False + # environment settings + self.state_size = (84, 84, 4) + self.action_size = action_size + # parameters about epsilon + self.epsilon = 1. + self.epsilon_start, self.epsilon_end = 1.0, 0.1 + self.exploration_steps = 1000000. 
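+        # epsilon is annealed linearly from epsilon_start to epsilon_end: each
+        # training step (train_replay) subtracts (1.0 - 0.1) / 1e6 = 9e-7 from
+        # epsilon, so exploration decays to 0.1 over roughly the first million updates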
+ self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \ + / self.exploration_steps + # parameters about training + self.batch_size = 32 + self.train_start = 50000 + self.update_target_rate = 10000 + self.discount_factor = 0.99 + self.memory = deque(maxlen=400000) + self.no_op_steps = 30 + # build + self.model = self.build_model() + self.target_model = self.build_model() + self.update_target_model() + + self.optimizer = self.optimizer() + + self.sess = tf.InteractiveSession() + K.set_session(self.sess) + + self.avg_q_max, self.avg_loss = 0, 0 + self.summary_placeholders, self.update_ops, self.summary_op = \ + self.setup_summary() + self.summary_writer = tf.summary.FileWriter( + 'summary/breakout_ddqn', self.sess.graph) + self.sess.run(tf.global_variables_initializer()) + + if self.load_model: + self.model.load_weights("./save_model/breakout_ddqn.h5") + + # if the error is in [-1, 1], then the cost is quadratic to the error + # But outside the interval, the cost is linear to the error + def optimizer(self): + a = K.placeholder(shape=(None, ), dtype='int32') + y = K.placeholder(shape=(None, ), dtype='float32') + + py_x = self.model.output + + a_one_hot = K.one_hot(a, self.action_size) + q_value = K.sum(py_x * a_one_hot, axis=1) + error = K.abs(y - q_value) + + quadratic_part = K.clip(error, 0.0, 1.0) + linear_part = error - quadratic_part + loss = K.mean(0.5 * K.square(quadratic_part) + linear_part) + + optimizer = RMSprop(lr=0.00025, epsilon=0.01) + updates = optimizer.get_updates(self.model.trainable_weights, [], loss) + train = K.function([self.model.input, a, y], [loss], updates=updates) + + return train + + # approximate Q function using Convolution Neural Network + # state is input and Q Value of each action is output of network + def build_model(self): + model = Sequential() + model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', + input_shape=self.state_size)) + model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu')) + model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu')) + model.add(Flatten()) + model.add(Dense(512, activation='relu')) + model.add(Dense(self.action_size)) + model.summary() + + return model + + # after some time interval update the target model to be same with model + def update_target_model(self): + self.target_model.set_weights(self.model.get_weights()) + + # get action from model using epsilon-greedy policy + def get_action(self, history): + history = np.float32(history / 255.0) + if np.random.rand() <= self.epsilon: + return random.randrange(self.action_size) + else: + q_value = self.model.predict(history) + return np.argmax(q_value[0]) + + # save sample to the replay memory + def replay_memory(self, history, action, reward, next_history, dead): + self.memory.append((history, action, reward, next_history, dead)) + + # pick samples randomly from replay memory (with batch_size) + def train_replay(self): + if len(self.memory) < self.train_start: + return + if self.epsilon > self.epsilon_end: + self.epsilon -= self.epsilon_decay_step + + mini_batch = random.sample(self.memory, self.batch_size) + + history = np.zeros((self.batch_size, self.state_size[0], + self.state_size[1], self.state_size[2])) + next_history = np.zeros((self.batch_size, self.state_size[0], + self.state_size[1], self.state_size[2])) + target = np.zeros((self.batch_size, )) + action, reward, dead = [], [], [] + + for i in range(self.batch_size): + history[i] = np.float32(mini_batch[i][0] / 255.) + next_history[i] = np.float32(mini_batch[i][3] / 255.) 
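+            # each replay sample is a tuple (history, action, reward, next_history, dead),
+            # so indices 1, 2 and 4 below are the action, the clipped reward and the
+            # terminal ("dead") flag of that transition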
+ action.append(mini_batch[i][1]) + reward.append(mini_batch[i][2]) + dead.append(mini_batch[i][4]) + + value = self.model.predict(next_history) + target_value = self.target_model.predict(next_history) + + # like Q Learning, get maximum Q value at s' + # But from target model + for i in range(self.batch_size): + if dead[i]: + target[i] = reward[i] + else: + # the key point of Double DQN + # selection of action is from model + # update is from target model + target[i] = reward[i] + self.discount_factor * \ + target_value[i][np.argmax(value[i])] + + loss = self.optimizer([history, action, target]) + self.avg_loss += loss[0] + + # make summary operators for tensorboard + def setup_summary(self): + episode_total_reward = tf.Variable(0.) + episode_avg_max_q = tf.Variable(0.) + episode_duration = tf.Variable(0.) + episode_avg_loss = tf.Variable(0.) + + tf.summary.scalar('Total Reward/Episode', episode_total_reward) + tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q) + tf.summary.scalar('Duration/Episode', episode_duration) + tf.summary.scalar('Average Loss/Episode', episode_avg_loss) + + summary_vars = [episode_total_reward, episode_avg_max_q, + episode_duration, episode_avg_loss] + summary_placeholders = [tf.placeholder(tf.float32) for _ in + range(len(summary_vars))] + update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in + range(len(summary_vars))] + summary_op = tf.summary.merge_all() + return summary_placeholders, update_ops, summary_op + + +# 210*160*3(color) --> 84*84(mono) +# float --> integer (to reduce the size of replay memory) +def pre_processing(observe): + processed_observe = np.uint8( + resize(rgb2gray(observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + # In case of BreakoutDeterministic-v4, always skip 4 frames + # Deterministic-v4 version use 4 actions + env = gym.make('BreakoutDeterministic-v4') + agent = DDQNAgent(action_size=3) + + scores, episodes, global_step = [], [], 0 + + for e in range(EPISODES): + done = False + dead = False + # 1 episode = 5 lives + step, score, start_life = 0, 0, 5 + observe = env.reset() + + # this is one of DeepMind's idea. + # just do nothing at the start of episode to avoid sub-optimal + for _ in range(random.randint(1, agent.no_op_steps)): + observe, _, _, _ = env.step(1) + + # At start of episode, there is no preceding frame. + # So just copy initial states to make history + state = pre_processing(observe) + history = np.stack((state, state, state, state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + if agent.render: + env.render() + global_step += 1 + step += 1 + + # get action for the current history and go one step in environment + action = agent.get_action(history) + # change action to real_action + if action == 0: real_action = 1 + elif action == 1: real_action = 2 + else: real_action = 3 + + observe, reward, done, info = env.step(real_action) + # pre-process the observation --> history + next_state = pre_processing(observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + agent.avg_q_max += np.amax( + agent.model.predict(np.float32(history / 255.))[0]) + + # if the agent missed ball, agent is dead --> episode is not over + if start_life > info['ale.lives']: + dead = True + start_life = info['ale.lives'] + + reward = np.clip(reward, -1., 1.) 
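+            # clipping rewards to [-1, 1] is the standard DQN trick: it keeps the scale
+            # of the TD error, and therefore of the gradients, comparable across games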
+ + # save the sample to the replay memory + agent.replay_memory(history, action, reward, next_history, dead) + # every some time interval, train model + agent.train_replay() + # update the target model with model + if global_step % agent.update_target_rate == 0: + agent.update_target_model() + + score += reward + + # if agent is dead, then reset the history + if dead: + dead = False + else: + history = next_history + + # if done, plot the score over episodes + if done: + if global_step > agent.train_start: + stats = [score, agent.avg_q_max / float(step), step, + agent.avg_loss / float(step)] + for i in range(len(stats)): + agent.sess.run(agent.update_ops[i], feed_dict={ + agent.summary_placeholders[i]: float(stats[i]) + }) + summary_str = agent.sess.run(agent.summary_op) + agent.summary_writer.add_summary(summary_str, e + 1) + + print("episode:", e, " score:", score, " memory length:", + len(agent.memory), " epsilon:", agent.epsilon, + " global_step:", global_step, " average_q:", + agent.avg_q_max/float(step), " average loss:", + agent.avg_loss/float(step)) + + agent.avg_q_max, agent.avg_loss = 0, 0 + + if e % 1000 == 0: + agent.model.save_weights("./save_model/breakout_ddqn.h5") diff --git a/3-atari/1-breakout/breakout_dqn.py b/3-atari/1-breakout/breakout_dqn.py new file mode 100644 index 00000000..b6229a04 --- /dev/null +++ b/3-atari/1-breakout/breakout_dqn.py @@ -0,0 +1,275 @@ +import gym +import random +import numpy as np +import tensorflow as tf +from collections import deque +from skimage.color import rgb2gray +from skimage.transform import resize +from keras.models import Sequential +from keras.optimizers import RMSprop +from keras.layers import Dense, Flatten +from keras.layers.convolutional import Conv2D +from keras import backend as K + +EPISODES = 50000 + + +class DQNAgent: + def __init__(self, action_size): + self.render = False + self.load_model = False + # environment settings + self.state_size = (84, 84, 4) + self.action_size = action_size + # parameters about epsilon + self.epsilon = 1. + self.epsilon_start, self.epsilon_end = 1.0, 0.1 + self.exploration_steps = 1000000. 
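+        # note: every replay sample stores two uint8 histories of shape (1, 84, 84, 4)
+        # (~28 KB each), so a full 400,000-sample memory can easily take 10 GB or more
+        # of RAM; lower maxlen if that much memory is not available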
+ self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \ + / self.exploration_steps + # parameters about training + self.batch_size = 32 + self.train_start = 50000 + self.update_target_rate = 10000 + self.discount_factor = 0.99 + self.memory = deque(maxlen=400000) + self.no_op_steps = 30 + # build model + self.model = self.build_model() + self.target_model = self.build_model() + self.update_target_model() + + self.optimizer = self.optimizer() + + self.sess = tf.InteractiveSession() + K.set_session(self.sess) + + self.avg_q_max, self.avg_loss = 0, 0 + self.summary_placeholders, self.update_ops, self.summary_op = \ + self.setup_summary() + self.summary_writer = tf.summary.FileWriter( + 'summary/breakout_dqn', self.sess.graph) + self.sess.run(tf.global_variables_initializer()) + + if self.load_model: + self.model.load_weights("./save_model/breakout_dqn.h5") + + # if the error is in [-1, 1], then the cost is quadratic to the error + # But outside the interval, the cost is linear to the error + def optimizer(self): + a = K.placeholder(shape=(None,), dtype='int32') + y = K.placeholder(shape=(None,), dtype='float32') + + py_x = self.model.output + + a_one_hot = K.one_hot(a, self.action_size) + q_value = K.sum(py_x * a_one_hot, axis=1) + error = K.abs(y - q_value) + + quadratic_part = K.clip(error, 0.0, 1.0) + linear_part = error - quadratic_part + loss = K.mean(0.5 * K.square(quadratic_part) + linear_part) + + optimizer = RMSprop(lr=0.00025, epsilon=0.01) + updates = optimizer.get_updates(self.model.trainable_weights, [], loss) + train = K.function([self.model.input, a, y], [loss], updates=updates) + + return train + + # approximate Q function using Convolution Neural Network + # state is input and Q Value of each action is output of network + def build_model(self): + model = Sequential() + model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', + input_shape=self.state_size)) + model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu')) + model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu')) + model.add(Flatten()) + model.add(Dense(512, activation='relu')) + model.add(Dense(self.action_size)) + model.summary() + return model + + # after some time interval update the target model to be same with model + def update_target_model(self): + self.target_model.set_weights(self.model.get_weights()) + + # get action from model using epsilon-greedy policy + def get_action(self, history): + history = np.float32(history / 255.0) + if np.random.rand() <= self.epsilon: + return random.randrange(self.action_size) + else: + q_value = self.model.predict(history) + return np.argmax(q_value[0]) + + # save sample to the replay memory + def replay_memory(self, history, action, reward, next_history, dead): + self.memory.append((history, action, reward, next_history, dead)) + + # pick samples randomly from replay memory (with batch_size) + def train_replay(self): + if len(self.memory) < self.train_start: + return + if self.epsilon > self.epsilon_end: + self.epsilon -= self.epsilon_decay_step + + mini_batch = random.sample(self.memory, self.batch_size) + + history = np.zeros((self.batch_size, self.state_size[0], + self.state_size[1], self.state_size[2])) + next_history = np.zeros((self.batch_size, self.state_size[0], + self.state_size[1], self.state_size[2])) + target = np.zeros((self.batch_size,)) + action, reward, dead = [], [], [] + + for i in range(self.batch_size): + history[i] = np.float32(mini_batch[i][0] / 255.) + next_history[i] = np.float32(mini_batch[i][3] / 255.) 
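+            # below, the target is r + gamma * max_a Q_target(s', a); unlike
+            # breakout_ddqn.py, the target network both selects and evaluates the
+            # greedy next action, which is the overestimation issue Double DQN fixes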
+ action.append(mini_batch[i][1]) + reward.append(mini_batch[i][2]) + dead.append(mini_batch[i][4]) + + target_value = self.target_model.predict(next_history) + + # like Q Learning, get maximum Q value at s' + # But from target model + for i in range(self.batch_size): + if dead[i]: + target[i] = reward[i] + else: + target[i] = reward[i] + self.discount_factor * \ + np.amax(target_value[i]) + + loss = self.optimizer([history, action, target]) + self.avg_loss += loss[0] + + def save_model(self, name): + self.model.save_weights(name) + + # make summary operators for tensorboard + def setup_summary(self): + episode_total_reward = tf.Variable(0.) + episode_avg_max_q = tf.Variable(0.) + episode_duration = tf.Variable(0.) + episode_avg_loss = tf.Variable(0.) + + tf.summary.scalar('Total Reward/Episode', episode_total_reward) + tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q) + tf.summary.scalar('Duration/Episode', episode_duration) + tf.summary.scalar('Average Loss/Episode', episode_avg_loss) + + summary_vars = [episode_total_reward, episode_avg_max_q, + episode_duration, episode_avg_loss] + summary_placeholders = [tf.placeholder(tf.float32) for _ in + range(len(summary_vars))] + update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in + range(len(summary_vars))] + summary_op = tf.summary.merge_all() + return summary_placeholders, update_ops, summary_op + + +# 210*160*3(color) --> 84*84(mono) +# float --> integer (to reduce the size of replay memory) +def pre_processing(observe): + processed_observe = np.uint8( + resize(rgb2gray(observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + # In case of BreakoutDeterministic-v3, always skip 4 frames + # Deterministic-v4 version use 4 actions + env = gym.make('BreakoutDeterministic-v4') + agent = DQNAgent(action_size=3) + + scores, episodes, global_step = [], [], 0 + + for e in range(EPISODES): + done = False + dead = False + # 1 episode = 5 lives + step, score, start_life = 0, 0, 5 + observe = env.reset() + + # this is one of DeepMind's idea. + # just do nothing at the start of episode to avoid sub-optimal + for _ in range(random.randint(1, agent.no_op_steps)): + observe, _, _, _ = env.step(1) + + # At start of episode, there is no preceding frame + # So just copy initial states to make history + state = pre_processing(observe) + history = np.stack((state, state, state, state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + if agent.render: + env.render() + global_step += 1 + step += 1 + + # get action for the current history and go one step in environment + action = agent.get_action(history) + # change action to real_action + if action == 0: + real_action = 1 + elif action == 1: + real_action = 2 + else: + real_action = 3 + + observe, reward, done, info = env.step(real_action) + # pre-process the observation --> history + next_state = pre_processing(observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + agent.avg_q_max += np.amax( + agent.model.predict(np.float32(history / 255.))[0]) + + # if the agent missed ball, agent is dead --> episode is not over + if start_life > info['ale.lives']: + dead = True + start_life = info['ale.lives'] + + reward = np.clip(reward, -1., 1.) 
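+            # losing a life only sets dead=True: train_replay() then treats the stored
+            # transition as terminal (no bootstrapping across lives), although the gym
+            # episode itself continues until all five lives are used up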
+ + # save the sample to the replay memory + agent.replay_memory(history, action, reward, next_history, dead) + # every some time interval, train model + agent.train_replay() + # update the target model with model + if global_step % agent.update_target_rate == 0: + agent.update_target_model() + + score += reward + + # if agent is dead, then reset the history + if dead: + dead = False + else: + history = next_history + + # if done, plot the score over episodes + if done: + if global_step > agent.train_start: + stats = [score, agent.avg_q_max / float(step), step, + agent.avg_loss / float(step)] + for i in range(len(stats)): + agent.sess.run(agent.update_ops[i], feed_dict={ + agent.summary_placeholders[i]: float(stats[i]) + }) + summary_str = agent.sess.run(agent.summary_op) + agent.summary_writer.add_summary(summary_str, e + 1) + + print("episode:", e, " score:", score, " memory length:", + len(agent.memory), " epsilon:", agent.epsilon, + " global_step:", global_step, " average_q:", + agent.avg_q_max / float(step), " average loss:", + agent.avg_loss / float(step)) + + agent.avg_q_max, agent.avg_loss = 0, 0 + + if e % 1000 == 0: + agent.model.save_weights("./save_model/breakout_dqn.h5") diff --git a/3-atari/1-breakout/breakout_dueling_ddqn.py b/3-atari/1-breakout/breakout_dueling_ddqn.py new file mode 100644 index 00000000..496b1e05 --- /dev/null +++ b/3-atari/1-breakout/breakout_dueling_ddqn.py @@ -0,0 +1,286 @@ +import gym +import random +import numpy as np +import tensorflow as tf +from collections import deque +from skimage.color import rgb2gray +from skimage.transform import resize +from keras.models import Model +from keras.optimizers import RMSprop +from keras.layers import Input, Dense, Flatten, Lambda, merge +from keras.layers.convolutional import Conv2D +from keras import backend as K + +EPISODES = 50000 + + +class DuelingDDQNAgent: + def __init__(self, action_size): + self.render = False + self.load_model = False + # environment settings + self.state_size = (84, 84, 4) + self.action_size = action_size + # parameters about epsilon + self.epsilon = 1. + self.epsilon_start, self.epsilon_end = 1.0, 0.1 + self.exploration_steps = 1000000. 
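+        # the hyperparameters here match breakout_ddqn.py; the main difference is
+        # build_model() below, which splits the network into separate state-value and
+        # advantage streams that are recombined into the Q-values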
+ self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \ + / self.exploration_steps + # parameters about training + self.batch_size = 32 + self.train_start = 50000 + self.update_target_rate = 10000 + self.discount_factor = 0.99 + self.memory = deque(maxlen=400000) + self.no_op_steps = 30 + # build + self.model = self.build_model() + self.target_model = self.build_model() + self.update_target_model() + + self.optimizer = self.optimizer() + + self.sess = tf.InteractiveSession() + K.set_session(self.sess) + + self.avg_q_max, self.avg_loss = 0, 0 + self.summary_placeholders, self.update_ops, self.summary_op = \ + self.setup_summary() + self.summary_writer = tf.summary.FileWriter( + 'summary/breakout_dueling_ddqn', self.sess.graph) + self.sess.run(tf.global_variables_initializer()) + + if self.load_model: + self.model.load_weights("./save_model/breakout_dueling_ddqb.h5") + + # if the error is in [-1, 1], then the cost is quadratic to the error + # But outside the interval, the cost is linear to the error + def optimizer(self): + a = K.placeholder(shape=(None, ), dtype='int32') + y = K.placeholder(shape=(None, ), dtype='float32') + + py_x = self.model.output + + a_one_hot = K.one_hot(a, self.action_size) + q_value = K.sum(py_x * a_one_hot, axis=1) + error = K.abs(y - q_value) + + quadratic_part = K.clip(error, 0.0, 1.0) + linear_part = error - quadratic_part + loss = K.mean(0.5 * K.square(quadratic_part) + linear_part) + + optimizer = RMSprop(lr=0.00025, epsilon=0.01) + updates = optimizer.get_updates(self.model.trainable_weights, [], loss) + train = K.function([self.model.input, a, y], [loss], updates=updates) + + return train + + # approximate Q function using Convolution Neural Network + # state is input and Q Value of each action is output of network + # dueling network's Q Value is sum of advantages and state value + def build_model(self): + input = Input(shape=self.state_size) + shared = Conv2D(32, (8, 8), strides=(4, 4), activation='relu')(input) + shared = Conv2D(64, (4, 4), strides=(2, 2), activation='relu')(shared) + shared = Conv2D(64, (3, 3), strides=(1, 1), activation='relu')(shared) + flatten = Flatten()(shared) + + # network separate state value and advantages + advantage_fc = Dense(512, activation='relu')(flatten) + advantage = Dense(self.action_size)(advantage_fc) + advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), + output_shape=(self.action_size,))(advantage) + + value_fc = Dense(512, activation='relu')(flatten) + value = Dense(1)(value_fc) + value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), + output_shape=(self.action_size,))(value) + + # network merged and make Q Value + q_value = merge([value, advantage], mode='sum') + model = Model(inputs=input, outputs=q_value) + model.summary() + + return model + + # after some time interval update the target model to be same with model + def update_target_model(self): + self.target_model.set_weights(self.model.get_weights()) + + # get action from model using epsilon-greedy policy + def get_action(self, history): + history = np.float32(history / 255.0) + if np.random.rand() <= self.epsilon: + return random.randrange(self.action_size) + else: + q_value = self.model.predict(history) + return np.argmax(q_value[0]) + + # save sample to the replay memory + def replay_memory(self, history, action, reward, next_history, dead): + self.memory.append((history, action, reward, next_history, dead)) + + # pick samples randomly from replay memory (with batch_size) + def train_replay(self): + if len(self.memory) < 
self.train_start: + return + if self.epsilon > self.epsilon_end: + self.epsilon -= self.epsilon_decay_step + + mini_batch = random.sample(self.memory, self.batch_size) + + history = np.zeros((self.batch_size, self.state_size[0], + self.state_size[1], self.state_size[2])) + next_history = np.zeros((self.batch_size, self.state_size[0], + self.state_size[1], self.state_size[2])) + target = np.zeros((self.batch_size, )) + action, reward, dead = [], [], [] + + for i in range(self.batch_size): + history[i] = np.float32(mini_batch[i][0] / 255.) + next_history[i] = np.float32(mini_batch[i][3] / 255.) + action.append(mini_batch[i][1]) + reward.append(mini_batch[i][2]) + dead.append(mini_batch[i][4]) + + value = self.model.predict(history) + target_value = self.target_model.predict(next_history) + + # like Q Learning, get maximum Q value at s' + # But from target model + for i in range(self.batch_size): + if dead[i]: + target[i] = reward[i] + else: + # the key point of Double DQN + # selection of action is from model + # update is from target model + target[i] = reward[i] + self.discount_factor * \ + target_value[i][np.argmax(value[i])] + + loss = self.optimizer([history, action, target]) + self.avg_loss += loss[0] + + def setup_summary(self): + episode_total_reward = tf.Variable(0.) + episode_avg_max_q = tf.Variable(0.) + episode_duration = tf.Variable(0.) + episode_avg_loss = tf.Variable(0.) + + tf.summary.scalar('Total Reward/Episode', episode_total_reward) + tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q) + tf.summary.scalar('Duration/Episode', episode_duration) + tf.summary.scalar('Average Loss/Episode', episode_avg_loss) + + summary_vars = [episode_total_reward, episode_avg_max_q, + episode_duration, episode_avg_loss] + summary_placeholders = [tf.placeholder(tf.float32) for _ in + range(len(summary_vars))] + update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in + range(len(summary_vars))] + summary_op = tf.summary.merge_all() + return summary_placeholders, update_ops, summary_op + + +# 210*160*3(color) --> 84*84(mono) +# float --> integer (to reduce the size of replay memory) +def pre_processing(observe): + processed_observe = np.uint8( + resize(rgb2gray(observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + # In case of BreakoutDeterministic-v3, always skip 4 frames + # Deterministic-v4 version use 4 actions + env = gym.make('BreakoutDeterministic-v4') + agent = DuelingDDQNAgent(action_size=3) + + scores, episodes, global_step = [], [], 0 + + for e in range(EPISODES): + done = False + dead = False + # 1 episode = 5 lives + step, score, start_life = 0, 0, 5 + observe = env.reset() + + # this is one of DeepMind's idea. + # just do nothing at the start of episode to avoid sub-optimal + for _ in range(random.randint(1, agent.no_op_steps)): + observe, _, _, _ = env.step(1) + + # At start of episode, there is no preceding frame. 
+ # So just copy initial states to make history + state = pre_processing(observe) + history = np.stack((state, state, state, state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + if agent.render: + env.render() + global_step += 1 + step += 1 + + # get action for the current history and go one step in environment + action = agent.get_action(history) + # change action to real_action + if action == 0: real_action = 1 + elif action == 1: real_action = 2 + else: real_action = 3 + + observe, reward, done, info = env.step(real_action) + # pre-process the observation --> history + next_state = pre_processing(observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + agent.avg_q_max += np.amax( + agent.model.predict(np.float32(history / 255.))[0]) + + # if the agent missed ball, agent is dead --> episode is not over + if start_life > info['ale.lives']: + dead = True + start_life = info['ale.lives'] + + reward = np.clip(reward, -1., 1.) + + # save the sample to the replay memory + agent.replay_memory(history, action, reward, next_history, dead) + # every some time interval, train model + agent.train_replay() + # update the target model with model + if global_step % agent.update_target_rate == 0: + agent.update_target_model() + + score += reward + + # if agent is dead, then reset the history + if dead: + dead = False + else: + history = next_history + + # if done, plot the score over episodes + if done: + if global_step > agent.train_start: + stats = [score, agent.avg_q_max / float(step), step, + agent.avg_loss / float(step)] + for i in range(len(stats)): + agent.sess.run(agent.update_ops[i], feed_dict={ + agent.summary_placeholders[i]: float(stats[i]) + }) + summary_str = agent.sess.run(agent.summary_op) + agent.summary_writer.add_summary(summary_str, e + 1) + + print("episode:", e, " score:", score, " memory length:", + len(agent.memory), " epsilon:", agent.epsilon, + " global_step:", global_step, " average_q:", + agent.avg_q_max/float(step), " average loss:", + agent.avg_loss/float(step)) + + agent.avg_q_max, agent.avg_loss = 0, 0 + + if e % 1000 == 0: + agent.model.save_weights("./save_model/breakout_dueling_ddqn.h5") diff --git a/3-atari/1-breakout/play_a3c_model.py b/3-atari/1-breakout/play_a3c_model.py new file mode 100644 index 00000000..c6a32c83 --- /dev/null +++ b/3-atari/1-breakout/play_a3c_model.py @@ -0,0 +1,125 @@ +import gym +import random +import numpy as np +from skimage.color import rgb2gray +from skimage.transform import resize +from keras.models import Model +from keras.layers import Dense, Flatten, Input +from keras.layers.convolutional import Conv2D + +global episode +episode = 0 +EPISODES = 8000000 +env_name = "BreakoutDeterministic-v4" + +class TestAgent: + def __init__(self, action_size): + self.state_size = (84, 84, 4) + self.action_size = action_size + + self.discount_factor = 0.99 + self.no_op_steps = 30 + + self.actor, self.critic = self.build_model() + + def build_model(self): + input = Input(shape=self.state_size) + conv = Conv2D(16, (8, 8), strides=(4, 4), activation='relu')(input) + conv = Conv2D(32, (4, 4), strides=(2, 2), activation='relu')(conv) + conv = Flatten()(conv) + fc = Dense(256, activation='relu')(conv) + policy = Dense(self.action_size, activation='softmax')(fc) + value = Dense(1, activation='linear')(fc) + + actor = Model(inputs=input, outputs=policy) + critic = Model(inputs=input, outputs=value) + + actor.summary() + critic.summary() + + 
return actor, critic + + def get_action(self, history): + history = np.float32(history / 255.) + policy = self.actor.predict(history)[0] + + action_index = np.argmax(policy) + return action_index + + def load_model(self, name): + self.actor.load_weights(name) + +def pre_processing(next_observe, observe): + processed_observe = np.maximum(next_observe, observe) + processed_observe = np.uint8( + resize(rgb2gray(processed_observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + env = gym.make(env_name) + agent = TestAgent(action_size=3) + agent.load_model("save_model/breakout_a3c_5_actor.h5") + + step = 0 + + while episode < EPISODES: + done = False + dead = False + + score, start_life = 0, 5 + observe = env.reset() + next_observe = observe + + for _ in range(random.randint(1, 20)): + observe = next_observe + next_observe, _, _, _ = env.step(1) + + state = pre_processing(next_observe, observe) + history = np.stack((state, state, state, state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + env.render() + step += 1 + observe = next_observe + + action = agent.get_action(history) + + if action == 1: + fake_action = 2 + elif action == 2: + fake_action = 3 + else: + fake_action = 1 + + if dead: + fake_action = 1 + dead = False + + next_observe, reward, done, info = env.step(fake_action) + + next_state = pre_processing(next_observe, observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + if start_life > info['ale.lives']: + dead = True + reward = -1 + start_life = info['ale.lives'] + + score += reward + + # if agent is dead, then reset the history + if dead: + history = np.stack( + (next_state, next_state, next_state, next_state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + else: + history = next_history + + # if done, plot the score over episodes + if done: + episode += 1 + print("episode:", episode, " score:", score, " step:", step) + step = 0 \ No newline at end of file diff --git a/3-atari/1-breakout/play_dqn_model.py b/3-atari/1-breakout/play_dqn_model.py new file mode 100644 index 00000000..45662c78 --- /dev/null +++ b/3-atari/1-breakout/play_dqn_model.py @@ -0,0 +1,110 @@ +import gym +import random +import numpy as np +import tensorflow as tf +from skimage.color import rgb2gray +from skimage.transform import resize +from keras.models import Sequential +from keras.layers import Dense, Flatten +from keras.layers.convolutional import Conv2D +from keras import backend as K + +EPISODES = 50000 + + +class TestAgent: + def __init__(self, action_size): + self.state_size = (84, 84, 4) + self.action_size = action_size + self.no_op_steps = 20 + + self.model = self.build_model() + + self.sess = tf.InteractiveSession() + K.set_session(self.sess) + + self.avg_q_max, self.avg_loss = 0, 0 + self.sess.run(tf.global_variables_initializer()) + + def build_model(self): + model = Sequential() + model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', + input_shape=self.state_size)) + model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu')) + model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu')) + model.add(Flatten()) + model.add(Dense(512, activation='relu')) + model.add(Dense(self.action_size)) + model.summary() + + return model + + def get_action(self, history): + if np.random.random() < 0.01: + return random.randrange(3) + history = np.float32(history / 255.0) + q_value = self.model.predict(history) + return 
np.argmax(q_value[0]) + + def load_model(self, filename): + self.model.load_weights(filename) + +def pre_processing(observe): + processed_observe = np.uint8( + resize(rgb2gray(observe), (84, 84), mode='constant') * 255) + return processed_observe + + +if __name__ == "__main__": + env = gym.make('BreakoutDeterministic-v4') + agent = TestAgent(action_size=3) + agent.load_model("./save_model/breakout_dqn_5.h5") + + for e in range(EPISODES): + done = False + dead = False + + step, score, start_life = 0, 0, 5 + observe = env.reset() + + for _ in range(random.randint(1, agent.no_op_steps)): + observe, _, _, _ = env.step(1) + + state = pre_processing(observe) + history = np.stack((state, state, state, state), axis=2) + history = np.reshape([history], (1, 84, 84, 4)) + + while not done: + env.render() + step += 1 + + action = agent.get_action(history) + + if action == 0: + real_action = 1 + elif action == 1: + real_action = 2 + else: + real_action = 3 + + if dead: + real_action = 1 + dead = False + + observe, reward, done, info = env.step(real_action) + + next_state = pre_processing(observe) + next_state = np.reshape([next_state], (1, 84, 84, 1)) + next_history = np.append(next_state, history[:, :, :, :3], axis=3) + + if start_life > info['ale.lives']: + dead = True + start_life = info['ale.lives'] + + score += reward + + history = next_history + + if done: + print("episode:", e, " score:", score) + diff --git a/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5 new file mode 100644 index 00000000..37a6a1ac Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_1_actor.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5 new file mode 100644 index 00000000..3d3394ae Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_1_critic.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5 new file mode 100644 index 00000000..21207c0f Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_2_actor.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5 new file mode 100644 index 00000000..a26f7d8a Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_2_critic.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5 new file mode 100644 index 00000000..a27e766e Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_3_actor.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5 new file mode 100644 index 00000000..62236fc7 Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_3_critic.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5 new file mode 100644 index 00000000..4fc3b773 Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_4_actor.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5 new file mode 100644 index 00000000..f65494da Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_4_critic.h5 differ diff --git 
a/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5 b/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5 new file mode 100644 index 00000000..db855b24 Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_5_actor.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5 b/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5 new file mode 100644 index 00000000..3636d02d Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_a3c_5_critic.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn.h5 b/3-atari/1-breakout/save_model/breakout_dqn.h5 new file mode 100644 index 00000000..fec05377 Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_dqn.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_1.h5 b/3-atari/1-breakout/save_model/breakout_dqn_1.h5 new file mode 100644 index 00000000..bb219b8a Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_dqn_1.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_2.h5 b/3-atari/1-breakout/save_model/breakout_dqn_2.h5 new file mode 100644 index 00000000..f316b4bc Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_dqn_2.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_3.h5 b/3-atari/1-breakout/save_model/breakout_dqn_3.h5 new file mode 100644 index 00000000..3e9ab26d Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_dqn_3.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_4.h5 b/3-atari/1-breakout/save_model/breakout_dqn_4.h5 new file mode 100644 index 00000000..2c952d42 Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_dqn_4.h5 differ diff --git a/3-atari/1-breakout/save_model/breakout_dqn_5.h5 b/3-atari/1-breakout/save_model/breakout_dqn_5.h5 new file mode 100644 index 00000000..eae4c99b Binary files /dev/null and b/3-atari/1-breakout/save_model/breakout_dqn_5.h5 differ diff --git a/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638 b/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638 new file mode 100644 index 00000000..1eb4343a Binary files /dev/null and b/3-atari/1-breakout/summary/breakout_a3c/events.out.tfevents.1497264638 differ diff --git a/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name b/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name new file mode 100644 index 00000000..2e394adf Binary files /dev/null and b/3-atari/1-breakout/summary/breakout_dqn/events.out.tfevents.1496968668.young-System-Product-Name differ diff --git a/Code 3. Atari Game/2. Pong/README.md b/3-atari/2-pong/README.md similarity index 100% rename from Code 3. Atari Game/2. Pong/README.md rename to 3-atari/2-pong/README.md diff --git a/Code 3. Atari Game/2. Pong/assets/pg.gif b/3-atari/2-pong/assets/pg.gif similarity index 100% rename from Code 3. Atari Game/2. Pong/assets/pg.gif rename to 3-atari/2-pong/assets/pg.gif diff --git a/Code 3. Atari Game/2. Pong/assets/score.png b/3-atari/2-pong/assets/score.png similarity index 100% rename from Code 3. Atari Game/2. Pong/assets/score.png rename to 3-atari/2-pong/assets/score.png diff --git a/Code 3. Atari Game/3. A3C/Breakout_A3C.py b/3-atari/2-pong/pong_a3c.py similarity index 100% rename from Code 3. Atari Game/3. A3C/Breakout_A3C.py rename to 3-atari/2-pong/pong_a3c.py diff --git a/Code 3. Atari Game/2. 
Pong/pg.py b/3-atari/2-pong/pong_reinforce.py
similarity index 95%
rename from Code 3. Atari Game/2. Pong/pg.py
rename to 3-atari/2-pong/pong_reinforce.py
index b6977145..ce346a78 100644
--- a/Code 3. Atari Game/2. Pong/pg.py
+++ b/3-atari/2-pong/pong_reinforce.py
@@ -29,6 +29,7 @@ def _build_model(self):
         model.add(Dense(32, activation='relu', init='he_uniform'))
         model.add(Dense(self.action_size, activation='softmax'))
         opt = Adam(lr=self.learning_rate)
+        # See note regarding crossentropy in cartpole_reinforce.py
         model.compile(loss='categorical_crossentropy', optimizer=opt)
         return model
@@ -92,7 +93,7 @@ def preprocess(I):
 state_size = 80 * 80
 action_size = env.action_space.n
 agent = PGAgent(state_size, action_size)
-agent.load('pong.h5')
+agent.load('./save_model/pong_reinforce.h5')
 while True:
     env.render()
@@ -113,4 +114,4 @@ def preprocess(I):
         state = env.reset()
         prev_x = None
         if episode > 1 and episode % 10 == 0:
-            agent.save('pong.h5')
+            agent.save('./save_model/pong_reinforce.h5')
diff --git a/Code 3. Atari Game/2. Pong/pong.h5 b/3-atari/2-pong/save_model/pong_reinforce.h5
similarity index 100%
rename from Code 3. Atari Game/2. Pong/pong.h5
rename to 3-atari/2-pong/save_model/pong_reinforce.h5
diff --git a/Code 3. Atari Game/LICENSE b/3-atari/LICENSE
similarity index 100%
rename from Code 3. Atari Game/LICENSE
rename to 3-atari/LICENSE
diff --git a/4-gym/1-mountaincar/mountaincar_dqn.py b/4-gym/1-mountaincar/mountaincar_dqn.py
new file mode 100644
index 00000000..932ba6b0
--- /dev/null
+++ b/4-gym/1-mountaincar/mountaincar_dqn.py
@@ -0,0 +1,175 @@
+import gym
+import pylab
+import random
+import numpy as np
+from collections import deque
+from keras.layers import Dense
+from keras.optimizers import Adam
+from keras.models import Sequential
+
+EPISODES = 4000
+
+
+class DQNAgent:
+    def __init__(self, state_size, action_size):
+        # change this to "True" to watch the agent learn
+        self.render = True
+
+        # the sizes of the state and action spaces, used to build the model
+        self.state_size = state_size
+        self.action_size = action_size
+
+        # hyperparameters of DQN training
+        # the replay memory is created with a deque
+        self.discount_factor = 0.99
+        self.learning_rate = 0.001
+        self.epsilon = 1.0
+        self.epsilon_min = 0.005
+        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 50000
+        self.batch_size = 64
+        self.train_start = 1000
+        self.memory = deque(maxlen=10000)
+
+        # create the model to be trained and the target model
+        self.model = self.build_model()
+        self.target_model = self.build_model()
+        # copy the training model into the target model
+        # --> initializes the target model (both must start with the same weights)
+        self.update_target_model()
+
+    # approximate the Q function with a deep neural network:
+    # the state is the input and the Q value of each action is the output
+    def build_model(self):
+        model = Sequential()
+        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
+        model.add(Dense(16, activation='relu', kernel_initializer='he_uniform'))
+        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
+        model.summary()
+        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
+        return model
+
+    # at regular intervals, update the target model with the model currently being trained
+    def update_target_model(self):
+        self.target_model.set_weights(self.model.get_weights())
+
+    # actions are chosen with an epsilon-greedy policy over the current network
+    def get_action(self, state):
+        if np.random.rand() <= self.epsilon:
+            return random.randrange(self.action_size)
+        else:
+            q_value = self.model.predict(state)
+            return np.argmax(q_value[0])
+
+    # save <state, action, reward, next_state, done> to the replay memory
+    def replay_memory(self, state, action, reward, next_state, done):
+        # the network only has two outputs, so env action 2 (push right) is stored as index 1
+        if action == 2:
+            action = 1
+        self.memory.append((state, action, reward, next_state, done))
+        if self.epsilon > self.epsilon_min:
+            self.epsilon -= self.epsilon_decay
+        # print(len(self.memory))
+
+    # draw a random minibatch of batch_size samples from the replay memory and train on it
+    def train_replay(self):
+        if len(self.memory) < self.train_start:
+            return
+        batch_size = min(self.batch_size, len(self.memory))
+        mini_batch = random.sample(self.memory, batch_size)
+
+        update_input = np.zeros((batch_size, self.state_size))
+        update_target = np.zeros((batch_size, self.action_size))
+
+        for i in range(batch_size):
+            state, action, reward, next_state, done = mini_batch[i]
+            target = self.model.predict(state)[0]
+
+            # as in Q-learning, take the maximum Q value at s', but from the target model
+            if done:
+                target[action] = reward
+            else:
+                target[action] = reward + self.discount_factor * \
+                                          np.amax(self.target_model.predict(next_state)[0])
+            update_input[i] = state
+            update_target[i] = target
+
+        # build one minibatch of targets (the "answers") and current predictions, then update the model in a single call
+        self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)
+
+    # load a saved model
+    def load_model(self, name):
+        self.model.load_weights(name)
+
+    # save the trained model
+    def save_model(self, name):
+        self.model.save_weights(name)
+
+
+if __name__ == "__main__":
+    # (in the case of CartPole-v1, an episode can run for up to 500 time steps)
+    env = gym.make('MountainCar-v0')
+    # get the sizes of the state and action spaces from the environment
+    state_size = env.observation_space.shape[0]
+    #action_size = env.action_space.n
+    action_size = 2
+    # create the DQN agent
+    agent = DQNAgent(state_size, action_size)
+    agent.load_model("./save_model/MountainCar_DQN.h5")
+    scores, episodes = [], []
+
+    for e in range(EPISODES):
+        done = False
+        score = 0
+        state = env.reset()
+        state = np.reshape(state, [1, state_size])
+        print(state)
+
+        # env actions are 0 (left), 1 (do nothing), 2 (right);
+        # fake_action is declared so that the do-nothing action is never taken
+        fake_action = 0
+
+        # counter used to repeat the same action 4 times
+        action_count = 0
+
+        while not done:
+            if agent.render:
+                env.render()
+
+            # choose an action for the current state and advance one step
+            action_count = action_count + 1
+
+            if action_count == 4:
+                action = agent.get_action(state)
+                action_count = 0
+
+            # map the network's two outputs back to env actions: 0 -> push left, 1 -> push right
+            if action == 0:
+                fake_action = 0
+            elif action == 1:
+                fake_action = 2
+
+            # take one step in the environment with the chosen action
+            next_state, reward, done, info = env.step(fake_action)
+            next_state = np.reshape(next_state, [1, state_size])
+            # give a -100 penalty for the action that ended the episode
+            #reward = reward if not done else -100
+
+            # save <s, a, r, s', done> to the replay memory
+            agent.replay_memory(state, fake_action, reward, next_state, done)
+            # train at every time step
+            agent.train_replay()
+            score += reward
+            state = next_state
+
+            if done:
+                env.reset()
+                # after every episode, copy the model being trained into the target model
+                agent.update_target_model()
+
+                # record the score of each episode for plotting
+                scores.append(score)
+                episodes.append(e)
+                #pylab.plot(episodes, scores, 'b')
+                #pylab.savefig("./save_graph/MountainCar_DQN.png")
+                print("episode:", e, " score:", score, " memory length:", len(agent.memory),
+                      " epsilon:", agent.epsilon)
+
+        # save the training model every 50 episodes
+        if e % 50 == 0:
+            agent.save_model("./save_model/MountainCar_DQN.h5")
diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole.h5 b/4-gym/1-mountaincar/save_model/MountainCar_DQN.h5
similarity index 67%
rename from Code 2. Cartpole/1. DQN/save_model/Cartpole.h5
rename to 4-gym/1-mountaincar/save_model/MountainCar_DQN.h5
index 25268bae..7f17c818 100644
Binary files a/Code 2. Cartpole/1. DQN/save_model/Cartpole.h5 and b/4-gym/1-mountaincar/save_model/MountainCar_DQN.h5 differ
diff --git a/Code 1. Grid World/1. Policy Iteration/__pycache__/agent.cpython-35.pyc b/Code 1. Grid World/1.
Policy Iteration/__pycache__/agent.cpython-35.pyc deleted file mode 100644 index c0fe58b3..00000000 Binary files a/Code 1. Grid World/1. Policy Iteration/__pycache__/agent.cpython-35.pyc and /dev/null differ diff --git a/Code 1. Grid World/1. Policy Iteration/environment.py b/Code 1. Grid World/1. Policy Iteration/environment.py deleted file mode 100644 index ab6e428d..00000000 --- a/Code 1. Grid World/1. Policy Iteration/environment.py +++ /dev/null @@ -1,214 +0,0 @@ -import tkinter as tk -import time -import numpy as np -from PIL import ImageTk, Image -from policy_iteration import PolicyIteration - -UNIT = 100 # pixels -HEIGHT = 5 # grid height -WIDTH = 5 # grid width -TRANSITION_PROB = 1 -POSSIBLE_ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # 가능한 모든 행동 -REWARDS = [] - - -class GraphicDisplay(tk.Tk): - - def __init__(self): - super(GraphicDisplay, self).__init__() - self.title('Policy Iteration') - self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) - self.texts = [] - self.arrows = [] - self.util = Util() - self.agent = PolicyIteration(self.util) - self._build_env() - - def _build_env(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - - # Buttons - iteration_button = tk.Button(self, text="Evaluation", command=self.policy_evaluation) - iteration_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10, window=iteration_button) - - policy_button = tk.Button(self, text="Improvement", command=self.policy_improvement) - policy_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10, window=policy_button) - - policy_button = tk.Button(self, text="move", command=self.move_by_policy) - policy_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10, window=policy_button) - - policy_button = tk.Button(self, text="clear", command=self.clear) - policy_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10, window=policy_button) - - # create grids - for col in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) - for row in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row - self.canvas.create_line(x0, y0, x1, y1) - - # image_load - self.up_image = ImageTk.PhotoImage(Image.open("../resources/up.png").resize((13, 13))) - self.right_image = ImageTk.PhotoImage(Image.open("../resources/right.png").resize((13, 13))) - self.left_image = ImageTk.PhotoImage(Image.open("../resources/left.png").resize((13, 13))) - self.down_image = ImageTk.PhotoImage(Image.open("../resources/down.png").resize((13, 13))) - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS)) - self.triangle_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((65, 65))) - self.circle_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((65, 65))) - - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.triangle1 = self.canvas.create_image(250, 150, image=self.triangle_image) - self.triangle2 = self.canvas.create_image(150, 250, image=self.triangle_image) - self.circle = self.canvas.create_image(250, 250, image=self.circle_image) - 
- # add reward text - self.text_reward(2, 2, "R : 1.0") - self.text_reward(1, 2, "R : -1.0") - self.text_reward(2, 1, "R : -1.0") - - # pack all - self.canvas.pack() - - def clear(self): - for i in self.texts: - self.canvas.delete(i) - - for i in self.arrows: - self.canvas.delete(i) - - self.canvas.delete(self.rectangle) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.agent = PolicyIteration(self.util) - - def text_value(self, row, col, contents,font='Helvetica', size=10, style='normal', anchor="nw"): - origin_x, origin_y = 85, 70 - x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) - font = (font, str(size), style) - return self.texts.append(self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor)) - - def text_reward(self, row, col, contents, font='Helvetica', size=10, style='normal', anchor="nw"): - origin_x, origin_y = 5, 5 - x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) - font = (font, str(size), style) - return self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor) - - def rectangle_move(self, action): - base_action = np.array([0, 0]) - self.render() - if action[0] == 1: # down - base_action[1] += UNIT - elif action[0] == -1: # up - base_action[1] -= UNIT - elif action[1] == 1: # right - base_action[0] += UNIT - elif action[1] == -1: # left - base_action[0] -= UNIT - - self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move agent - - def rectangle_location(self): - temp = self.canvas.coords(self.rectangle) - x = (temp[0] / 100) - 0.5 - y = (temp[1] / 100) - 0.5 - return int(y), int(x) - - def move_by_policy(self): - self.canvas.delete(self.rectangle) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - while len(self.agent.get_policy_table()[self.rectangle_location()[0]][self.rectangle_location()[1]]) != 0: - self.after(100, self.rectangle_move( - self.agent.get_action([self.rectangle_location()[0], self.rectangle_location()[1]]))) - - def draw_one_arrow(self, col, row, action): - - if col == 2 and row == 2: - return - - if action[0] > 0: # up - origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.up_image)) - - if action[1] > 0: # down - origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.down_image)) - - if action[2] > 0: # left - origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.left_image)) - - if action[3] > 0: # right - origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.right_image)) - - def draw_from_policy(self, policy_table): - for i in range(HEIGHT): - for j in range(WIDTH): - self.draw_one_arrow(i, j, policy_table[i][j]) - - def print_value_table(self, value_table): - for i in range(WIDTH): - for j in range(HEIGHT): - self.text_value(i, j, value_table[i][j]) - - def render(self): - time.sleep(0.1) - self.canvas.tag_raise(self.rectangle) - self.update() - - def policy_evaluation(self): - for i in self.texts: - self.canvas.delete(i) - self.agent.policy_evaluation() - self.print_value_table(self.agent.get_value_table()) - - def policy_improvement(self): - for i in self.arrows: - self.canvas.delete(i) - self.agent.policy_improvement() - self.draw_from_policy(self.agent.get_policy_table()) - - 
-class Util: - def __init__(self): - self.transition_probability = TRANSITION_PROB # 상태 변환 확률 - self.width = WIDTH # 그리드월드의 가로 길이 - self.height = HEIGHT # 그리드 월드의 세로 길이 - self.reward = [[0] * WIDTH for _ in range(HEIGHT)] - self.possible_actions = POSSIBLE_ACTIONS - self.reward[2][2] = 1 # 물고기 자리에 보상 1 - self.reward[1][2] = -1 # 불 자리에 보상 -1 - self.reward[2][1] = -1 # 불 자리에 보상 -1 - self.all_state = [] - - for x in range(WIDTH): - for y in range(HEIGHT): - state = [x, y] - self.all_state.append(state) - - def get_reward(self, state, action): - next_state = self.state_after_action(state, action) - return self.reward[next_state[0]][next_state[1]] - - def state_after_action(self, state, action): - return self.check_boundary([state[0] + action[0], state[1] + action[1]]) - - @staticmethod - def check_boundary(state): - state[0] = 0 if state[0] < 0 else WIDTH - 1 if state[0] > WIDTH - 1 else state[0] - state[1] = 0 if state[1] < 0 else HEIGHT - 1 if state[1] > HEIGHT - 1 else state[1] - return state - - def get_transition_prob(self, state, action): - return self.transition_probability - - def get_all_states(self): - return self.all_state diff --git a/Code 1. Grid World/1. Policy Iteration/policy_iteration.py b/Code 1. Grid World/1. Policy Iteration/policy_iteration.py deleted file mode 100644 index b3976ea5..00000000 --- a/Code 1. Grid World/1. Policy Iteration/policy_iteration.py +++ /dev/null @@ -1,123 +0,0 @@ -# -*- coding: utf-8 -*- -import copy -import random - -DISCOUNT_FACTOR = 0.9 - - -class PolicyIteration: - def __init__(self, env): - # environment object - self.env = env - # creaking 2 dimension list for the value function - self.value_table = [[0.00] * env.width for _ in range(env.height)] - # creating list for the policy - # this is random policy which has same probability for doing up, down, left, right - self.policy_table = [[[0.25, 0.25, 0.25, 0.25]] * env.width for _ in range(env.height)] - # setting terminal state - self.policy_table[2][2] = [] - - # method for the policy evaluation - # use Bellman Expectation Equation for calculating next value function table - def policy_evaluation(self): - next_value_table = copy.deepcopy(self.value_table) - - # Bellman Expectation Equation for the every states - for state in self.env.get_all_states(): - next_value_table[state[0]][state[1]] = round(self.calculate_value(state), 2) - - self.value_table = copy.deepcopy(next_value_table) - - # calculating new value function using Bellman Expectation Equation - def calculate_value(self, state): - value = 0 - - for action in self.env.possible_actions: - next_state = self.env.state_after_action(state, action) - reward = self.env.get_reward(state, action) - next_value = self.get_value(next_state) - value += self.get_policy(state, action) * (reward + DISCOUNT_FACTOR * next_value) - - # keep the value function of terminal states as 0 - if state == [2, 2]: - return 0.0 - - return value - - # This is Greedy Policy which always selects action of maximum value - def greedy_policy(self, state): - - value = -99999 - max_index = [] - # initialize the policy - result = [0.0, 0.0, 0.0, 0.0] - - # for every actions, calculating [reward + (discount factor) * (next state value function)] - for index, action in enumerate(self.env.possible_actions): - next_state = self.env.state_after_action(state, action) - reward = self.env.get_reward(state, action) - next_value = self.get_value(next_state) - temp = reward + DISCOUNT_FACTOR * next_value - - # For the greedy policy, originally we can't pick multiple actions - # but 
in this example, we allow to pick multiple actions which have same maximum values - if temp == value: - max_index.append(index) - elif temp > value: - value = temp - max_index.clear() - max_index.append(index) - - # probability of action - prob = 1 / len(max_index) - - for index in max_index: - result[index] = prob - - return result - - # using the greedy policy method, do the policy improvement - # under the current value function table - def policy_improvement(self): - next_policy = self.get_policy_table() - for state in self.env.get_all_states(): - - if state == [2, 2]: - continue - - next_policy[state[0]][state[1]] = self.greedy_policy(state) - self.policy_table = next_policy - - # get action according to the current policy - def get_action(self, state): - random_pick = random.randrange(100) / 100 - - policy = self.get_policy(state) - policy_sum = 0.0 - # return the action in the index - for index, value in enumerate(policy): - policy_sum += value - if random_pick < policy_sum: - return self.env.possible_actions[index] - - # get the policy table for the all states - def get_policy_table(self): - return copy.deepcopy(self.policy_table) - - # get policy of specific state and action - def get_policy(self, state, action=None): - # if there is no action given, then return the probabilities of all actions - if action is None: - return self.policy_table[state[0]][state[1]] - - if state == [2, 2]: - return 0.0 - - return self.policy_table[state[0]][state[1]][self.env.possible_actions.index(action)] - - def get_value_table(self): - return copy.deepcopy(self.value_table) - - def get_value(self, state): - return round(self.value_table[state[0]][state[1]], 2) - diff --git a/Code 1. Grid World/1. Policy Iteration/run.py b/Code 1. Grid World/1. Policy Iteration/run.py deleted file mode 100644 index 1c1a6f35..00000000 --- a/Code 1. Grid World/1. Policy Iteration/run.py +++ /dev/null @@ -1,5 +0,0 @@ -from environment import GraphicDisplay - -if __name__ == "__main__": - grid_world = GraphicDisplay() - grid_world.mainloop() diff --git a/Code 1. Grid World/2. Value Iteration/environment.py b/Code 1. Grid World/2. Value Iteration/environment.py deleted file mode 100644 index ee3deaa2..00000000 --- a/Code 1. Grid World/2. 
Value Iteration/environment.py +++ /dev/null @@ -1,256 +0,0 @@ -import tkinter as tk -import time -import numpy as np -from PIL import ImageTk, Image -from value_iteration import ValueIteration - -UNIT = 100 # pixels -HEIGHT = 5 # grid height -WIDTH = 5 # grid width -TRANSITION_PROB = 1 -POSSIBLE_ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)] # 가능한 모든 행동 -REWARDS = [] - - -class GraphicDisplay(tk.Tk): - def __init__(self): - super(GraphicDisplay, self).__init__() - self.title('Value Iteration') - self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50)) - self.texts = [] - self.arrows = [] - self.util = Util() - self.agent = ValueIteration(self.util) - self._build_env() - - def _build_env(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - - # Buttons - iteration_button = tk.Button(self, text="Calculate", command=self.calculate_value) - iteration_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10, window=iteration_button) - - policy_button = tk.Button(self, text="Print Policy", command=self.print_optimal_policy) - policy_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10, window=policy_button) - - policy_button = tk.Button(self, text="Move", command=self.move_by_policy) - policy_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10, window=policy_button) - - policy_button = tk.Button(self, text="Clear", command=self.clear) - policy_button.configure(width=10, activebackground="#33B5E5") - self.canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10, window=policy_button) - - # create grids - for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) - for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r - self.canvas.create_line(x0, y0, x1, y1) - - # image_load - self.up_image = ImageTk.PhotoImage(Image.open("../resources/up.png").resize((13, 13))) - self.right_image = ImageTk.PhotoImage(Image.open("../resources/right.png").resize((13, 13))) - self.left_image = ImageTk.PhotoImage(Image.open("../resources/left.png").resize((13, 13))) - self.down_image = ImageTk.PhotoImage(Image.open("../resources/down.png").resize((13, 13))) - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS)) - self.triangle_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((65, 65))) - self.circle_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((65, 65))) - - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.hell1 = self.canvas.create_image(250, 150, image=self.triangle_image) - self.hell2 = self.canvas.create_image(150, 250, image=self.triangle_image) - self.circle = self.canvas.create_image(250, 250, image=self.circle_image) - - # add reward text - self.text_reward(2, 2, "R : 1.0") - self.text_reward(1, 2, "R : -1.0") - self.text_reward(2, 1, "R : -1.0") - - # pack all - self.canvas.pack() - - def clear(self): - for i in self.texts: - self.canvas.delete(i) - - for i in self.arrows: - self.canvas.delete(i) - - self.canvas.delete(self.rectangle) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.agent = 
ValueIteration(self.util) - - def reset(self): - self.update() - time.sleep(0.5) - self.canvas.delete(self.rectangle) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - # return observation - return self.canvas.coords(self.rectangle) - - def text_value(self, row, col, contents, font='Helvetica', size=12, style='normal', anchor="nw"): - origin_x, origin_y = 85, 70 - x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) - font = (font, str(size), style) - return self.texts.append(self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor)) - - def text_reward(self, row, col, contents, font='Helvetica', size=12, style='normal', anchor="nw"): - origin_x, origin_y = 5, 5 - x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) - font = (font, str(size), style) - return self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor) - - def step(self, action): - s = self.canvas.coords(self.rectangle) - - base_action = np.array([0, 0]) - if action == 0: # up - if s[1] > UNIT: - base_action[1] -= UNIT - elif action == 1: # down - if s[1] < (HEIGHT - 1) * UNIT: - base_action[1] += UNIT - elif action == 2: # right - if s[0] < (WIDTH - 1) * UNIT: - base_action[0] += UNIT - elif action == 3: # left - if s[0] > UNIT: - base_action[0] -= UNIT - - self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move agent - s_ = self.canvas.coords(self.rectangle) # next state - # reward function - if s_ == self.canvas.coords(self.circle): - reward = 1 - done = True - elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]: - reward = -1 - done = True - else: - reward = 0 - done = False - - return s_, reward, done - - def rectangle_move(self, action): - - base_action = np.array([0, 0]) - self.render() - - if action[0] == 1: # down - base_action[1] += UNIT - elif action[0] == -1: # up - base_action[1] -= UNIT - elif action[1] == 1: # right - base_action[0] += UNIT - elif action[1] == -1: # left - base_action[0] -= UNIT - - self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move agent - - def rectangle_location(self): - temp = self.canvas.coords(self.rectangle) - x = (temp[0] / 100) - 0.5 - y = (temp[1] / 100) - 0.5 - return int(y), int(x) - - def move_by_policy(self): - self.canvas.delete(self.rectangle) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - agent_state = [self.rectangle_location()[0], self.rectangle_location()[1]] - while len(self.agent.get_action(agent_state, False)) != 0: - agent_state = [self.rectangle_location()[0], self.rectangle_location()[1]] - self.after(100, self.rectangle_move(self.agent.get_action(agent_state, True))) - - def draw_one_arrow(self, col, row, action): - if action[0] == 1: # down - origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.down_image)) - - elif action[0] == -1: # up - origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.up_image)) - - elif action[1] == 1: # right - origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.right_image)) - - elif action[1] == -1: # left - origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col) - self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.left_image)) - - def draw_from_values(self, state, action_list): - - i = 
state[0] - j = state[1] - - for action in action_list: - self.draw_one_arrow(i, j, action) - - def print_values(self, values): - for i in range(WIDTH): - for j in range(HEIGHT): - self.text_value(i, j, values[i][j]) - - def render(self): - time.sleep(0.1) - self.canvas.tag_raise(self.rectangle) - self.update() - - def calculate_value(self): - for i in self.texts: - self.canvas.delete(i) - self.agent.iteration() - print(self.agent.get_value_table) - self.print_values(self.agent.get_value_table()) - - def print_optimal_policy(self): - for i in self.arrows: - self.canvas.delete(i) - for state in self.util.get_all_states(): - action = self.agent.get_action(state, False) - self.draw_from_values(state, action) - - -class Util: - def __init__(self): - self.transition_probability = TRANSITION_PROB # 상태 변환 확률 - self.width = WIDTH # 그리드월드의 가로 길이 - self.height = HEIGHT # 그리드 월드의 세로 길이 - self.reward = [[0] * WIDTH for _ in range(HEIGHT)] - self.possible_actions = POSSIBLE_ACTIONS - self.reward[2][2] = 1 # 물고기 자리에 보상 1 - self.reward[1][2] = -1 # 불 자리에 보상 -1 - self.reward[2][1] = -1 # 불 자리에 보상 -1 - self.all_state = [] - - for x in range(WIDTH): - for y in range(HEIGHT): - state = [x, y] - self.all_state.append(state) - - def get_reward(self, state, action): - next_state = self.state_after_action(state,action) - return self.reward[next_state[0]][next_state[1]] - - def state_after_action(self, state, action): - return self.check_boundary([state[0] + action[0], state[1] + action[1]]) - - def check_boundary(self, state): - state[0] = 0 if state[0] < 0 else WIDTH - 1 if state[0] > WIDTH - 1 else state[0] - state[1] = 0 if state[1] < 0 else HEIGHT - 1 if state[1] > HEIGHT - 1 else state[1] - return state - - def get_transition_prob(self, state, action): - return self.transition_probability - - def get_all_states(self): - return self.all_state \ No newline at end of file diff --git a/Code 1. Grid World/2. Value Iteration/run.py b/Code 1. Grid World/2. Value Iteration/run.py deleted file mode 100644 index 1c1a6f35..00000000 --- a/Code 1. Grid World/2. Value Iteration/run.py +++ /dev/null @@ -1,5 +0,0 @@ -from environment import GraphicDisplay - -if __name__ == "__main__": - grid_world = GraphicDisplay() - grid_world.mainloop() diff --git a/Code 1. Grid World/2. Value Iteration/value_iteration.py b/Code 1. Grid World/2. Value Iteration/value_iteration.py deleted file mode 100644 index bfa9a671..00000000 --- a/Code 1. Grid World/2. 
Value Iteration/value_iteration.py +++ /dev/null @@ -1,79 +0,0 @@ -# -*- coding: utf-8 -*- -import copy -import random - -DISCOUNT_FACTOR = 0.9 - - -class ValueIteration: - def __init__(self, env): - # environment object - self.env = env - # creaking 2 dimension list for the value function - self.value_table = [[0.00] * env.width for _ in range(env.height)] - - # get next value function table from the current value function table - def iteration(self): - value_table_copy = copy.deepcopy(self.value_table) - for state in self.env.get_all_states(): - value_table_copy[state[0]][state[1]] = round(self.calculate_max_value(state), 2) - self.value_table = copy.deepcopy(value_table_copy) - print("value_table : " , self.value_table) - - # calculate next value function using Bellman Optimality Equation - def calculate_max_value(self, state): - - if state == [2, 2]: - return 0.0 - - # empty list for the value function - value_list = [] - - # do the calculation for the all possible actions - for action in self.env.possible_actions: - next_state = self.env.state_after_action(state, action) - reward = self.env.get_reward(state, action) - next_value = self.get_value(next_state) - value_list.append((reward + DISCOUNT_FACTOR * next_value)) - - print("value _ list : " , value_list) - - # return the maximum value(it is optimality equation!!) - return max(value_list) - - # get action according to the current value function table - def get_action(self, state, random_pick=True): - - action_list = [] - max_value = -99999 - - if state == [2, 2]: - return [] - - # calculating q values for the all actions and - # append the action to action list which has maximum q value - for action in self.env.possible_actions: - - next_state = self.env.state_after_action(state, action) - reward = self.env.get_reward(state, action) - next_value = self.get_value(next_state) - value = (reward + DISCOUNT_FACTOR * next_value) - - if value > max_value: - action_list.clear() - action_list.append(action) - max_value = value - elif value == max_value: - action_list.append(action) - - # pick one action from action_list which has same q value - if random_pick is True: - return random.sample(action_list, 1)[0] - - return action_list - - def get_value_table(self): - return copy.deepcopy(self.value_table) - - def get_value(self, state): - return round(self.value_table[state[0]][state[1]], 2) diff --git a/Code 1. Grid World/3. Monte-Carlo/MC_agent.py b/Code 1. Grid World/3. Monte-Carlo/MC_agent.py deleted file mode 100644 index c874f06e..00000000 --- a/Code 1. Grid World/3. 
Monte-Carlo/MC_agent.py +++ /dev/null @@ -1,89 +0,0 @@ -import numpy as np -import pandas as pd - - -# this is Monte-Carlo agent for the grid world -# it learns every episodes from the sample(which is the difference with dynamic programming) -class MCAgent: - def __init__(self, actions): - # actions = [0, 1, 2, 3] - self.width = 5 - self.height = 5 - self.actions = actions - self.learning_rate = 0.01 - self.discount_factor = 0.9 - self.epsilon = 0.9 - self.samples = [] - self.value_table = pd.DataFrame(columns=['value']) - - # check whether the state was visited - # if this is first visitation, then initialize the q function of the state - def check_state_exist(self, state): - if str(state) not in self.value_table.index: - self.value_table = self.value_table.append( - pd.Series( - [0] * len(self.value_table.columns), - index=self.value_table.columns, - name=str(state) - ) - ) - - # append sample to memory(state, reward, done) - def save_sample(self, state, reward, done): - self.samples.append([state, reward, done]) - - # for every episode, agent updates q function of visited states - def update(self): - G_t = 0 - visit_state = [] - for reward in reversed(self.samples): - state = str(reward[0]) - if state not in visit_state: - visit_state.append(state) - G_t = self.discount_factor * (reward[1] + G_t) - self.check_state_exist(state) - value = self.value_table.ix[state, 'value'] - self.value_table.ix[state, 'value'] = value + self.learning_rate * (G_t - value) - print("state : ", state, " G : ", G_t, " update : ", value + self.learning_rate * (G_t - value)) - print("values : ", self.value_table) - - # get action for the state according to the q function table - # agent pick action of epsilon-greedy policy - def get_action(self, state): - self.check_state_exist(state) - - if np.random.rand() > self.epsilon: - # take random action - action = np.random.choice(self.actions) - else: - # take action according to the q function table - next_state = self.possible_next_state(state) - next_state = next_state.reindex(np.random.permutation(next_state.index)) - action = next_state.argmax() - - return int(action) - - # get the possible next states - def possible_next_state(self, state): - state_col = state[0] - state_row = state[1] - - next_state = pd.Series( - [0] * len(self.actions), - index=self.actions, - ) - - if state_row != 0: - self.check_state_exist(str([state_col, state_row - 1])) - next_state.set_value(0, self.value_table.ix[str([state_col, state_row - 1]), 'value']) # up - if state_row != self.height - 1: - self.check_state_exist(str([state_col, state_row + 1])) - next_state.set_value(1, self.value_table.ix[str([state_col, state_row + 1]), 'value']) # down - if state_col != 0: - self.check_state_exist(str([state_col - 1, state_row])) - next_state.set_value(2, self.value_table.ix[str([state_col - 1, state_row]), 'value']) # left - if state_col != self.width - 1: - self.check_state_exist(str([state_col + 1, state_row])) - next_state.set_value(3, self.value_table.ix[str([state_col + 1, state_row]), 'value']) # right - - return next_state diff --git a/Code 1. Grid World/3. Monte-Carlo/run.py b/Code 1. Grid World/3. Monte-Carlo/run.py deleted file mode 100644 index a5efec22..00000000 --- a/Code 1. Grid World/3. 
Monte-Carlo/run.py +++ /dev/null @@ -1,30 +0,0 @@ -from environment import Env -from MC_agent import MCAgent - -# main loop -if __name__ == "__main__": - env = Env() - agent = MCAgent(actions=list(range(env.n_actions))) - - for episode in range(1000): - # reset environment and initialize state - state = env.reset() - - while True: - env.render() - - # take action and doing one step in the environment - # environment return next state, immediate reward and - # information about terminal of episode - action = agent.get_action(state) - next_state, reward, done = env.step(action) - - agent.save_sample(next_state, reward, done) - - # at the end of episode, update the q function table - if done: - print("episode : ", episode) - print("returns : ", agent.returns) - agent.update() - agent.returns.clear() - break \ No newline at end of file diff --git a/Code 1. Grid World/4. SARSA/SARSA_agent.py b/Code 1. Grid World/4. SARSA/SARSA_agent.py deleted file mode 100644 index 7e1298d3..00000000 --- a/Code 1. Grid World/4. SARSA/SARSA_agent.py +++ /dev/null @@ -1,50 +0,0 @@ -import numpy as np -import pandas as pd - - -# this is SARSA agent for the grid world -# it learns every time step from the sample -class SARSAgent: - def __init__(self, actions): - # actions = [0, 1, 2, 3] - self.actions = actions - self.learning_rate = 0.01 - self.discount_factor = 0.9 - self.epsilon = 0.9 - self.q_table = pd.DataFrame(columns=self.actions) - - # check whether the state was visited - # if this is first visitation, then initialize the q function of the state - def check_state_exist(self, state): - if state not in self.q_table.index: - self.q_table = self.q_table.append( - pd.Series( - [0] * len(self.actions), - index=self.q_table.columns, - name=state, - ) - ) - - # with sample , learns new q function - def learn(self, state, action, reward, next_state, next_action): - self.check_state_exist(next_state) - self.q_table.ix[state, action] = \ - self.q_table.ix[state, action] + self.learning_rate * \ - (reward + self.discount_factor * - self.q_table.ix[next_state, next_action - self.q_table.ix[state, action]]) - - # get action for the state according to the q function table - # agent pick action of epsilon-greedy policy - def get_action(self, state): - self.check_state_exist(state) - - if np.random.rand() > self.epsilon: - # take random action - action = np.random.choice(self.actions) - else: - # take action according to the q function table - state_action = self.q_table.ix[state, :] - state_action = state_action.reindex(np.random.permutation(state_action.index)) - action = state_action.argmax() - - return action diff --git a/Code 1. Grid World/4. SARSA/environment.py b/Code 1. Grid World/4. SARSA/environment.py deleted file mode 100644 index 30074db3..00000000 --- a/Code 1. Grid World/4. 
SARSA/environment.py +++ /dev/null @@ -1,136 +0,0 @@ -import time -import numpy as np -import tkinter as tk -from PIL import ImageTk, Image - -np.random.seed(1) - -UNIT = 100 # pixels -HEIGHT = 5 # grid height -WIDTH = 5 # grid width - - -class Env(tk.Tk): - def __init__(self): - super(Env, self).__init__() - self.action_space = ['u', 'd', 'l', 'r'] - self.n_actions = len(self.action_space) - self.title('monte carlo') - self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT)) - self.buildGraphic() - self.texts = [] - - def buildGraphic(self): - self.canvas = tk.Canvas(self, bg='white', - height=HEIGHT * UNIT, - width=WIDTH * UNIT) - - # create grids - for c in range(0, WIDTH * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT - self.canvas.create_line(x0, y0, x1, y1) - for r in range(0, HEIGHT * UNIT, UNIT): # 0~400 by 80 - x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r - self.canvas.create_line(x0, y0, x1, y1) - - # image_load - self.rectangle_image = ImageTk.PhotoImage( - Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS)) - self.triange_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((65, 65))) - self.circle_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((65, 65))) - - # add image to canvas - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - self.triangle1 = self.canvas.create_image(250, 150, image=self.triange_image) - self.triangle2 = self.canvas.create_image(150, 250, image=self.triange_image) - self.circle = self.canvas.create_image(250, 250, image=self.circle_image) - - # pack all - self.canvas.pack() - - def text_value(self, row, col, contents, action, font='Helvetica', size=10, style='normal', anchor="nw"): - - if action == 0: - origin_x, origin_y = 7, 42 - elif action == 1: - origin_x, origin_y = 85, 42 - elif action == 2: - origin_x, origin_y = 42, 5 - else: - origin_x, origin_y = 42, 77 - - x, y = origin_y + (UNIT * col), origin_x + (UNIT * row) - font = (font, str(size), style) - return self.texts.append(self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor)) - - def print_value_all(self, q_table): - for i in self.texts: - self.canvas.delete(i) - self.texts.clear() - for i in range(HEIGHT): - for j in range(WIDTH): - for action in range(0, 4): - state = [i, j] - if str(state) in q_table.index: - temp = q_table.ix[str(state), action] - self.text_value(j, i, round(temp, 2), action) - - def coords_to_state(self, coords): - x = int((coords[0] - 50) / 100) - y = int((coords[1] - 50) / 100) - return [x, y] - - def state_to_coords(self, state): - x = int(state[0] * 100 + 50) - y = int(state[1] * 100 + 50) - return [x, y] - - def reset(self): - self.update() - time.sleep(0.5) - self.canvas.delete(self.rectangle) - origin = np.array([UNIT / 2, UNIT / 2]) - self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image) - # return observation - return self.coords_to_state(self.canvas.coords(self.rectangle)) - - def step(self, action): - state = self.canvas.coords(self.rectangle) - base_action = np.array([0, 0]) - self.render() - - if action == 0: # up - if state[1] > UNIT: - base_action[1] -= UNIT - elif action == 1: # down - if state[1] < (HEIGHT - 1) * UNIT: - base_action[1] += UNIT - elif action == 2: # left - if state[0] > UNIT: - base_action[0] -= UNIT - elif action == 3: # right - if state[0] < (WIDTH - 1) * UNIT: - base_action[0] += UNIT - - self.canvas.move(self.rectangle, base_action[0], base_action[1]) # move 
agent - - next_state = self.canvas.coords(self.rectangle) # next state - - # reward function - if next_state == self.canvas.coords(self.circle): - reward = 100 - done = True - elif next_state in [self.canvas.coords(self.triangle1), self.canvas.coords(self.triangle2)]: - reward = -100 - done = True - else: - reward = 0 - done = False - - next_state = self.coords_to_state(next_state) - - return next_state, reward, done - - def render(self): - time.sleep(0.05) - self.update() diff --git a/Code 1. Grid World/4. SARSA/run.py b/Code 1. Grid World/4. SARSA/run.py deleted file mode 100644 index ed41c87d..00000000 --- a/Code 1. Grid World/4. SARSA/run.py +++ /dev/null @@ -1,36 +0,0 @@ -from environment import Env -from SARSA_agent import SARSAgent - -if __name__ == "__main__": - env = Env() - agent = SARSAgent(actions=list(range(env.n_actions))) - - for episode in range(1000): - # reset environment and initialize state - state = env.reset() - # get action of state from agent - action = agent.get_action(str(state)) - - while True: - env.render() - - # take action and doing one step in the environment - # environment return next state, immediate reward and - # information about terminal of episode - next_state, reward, done = env.step(action) - - # get action of state from agent - next_action = agent.get_action(str(next_state)) - - # with sample , agent learns new q function - agent.learn(str(state), action, reward, str(next_state), next_action) - - state = next_state - action = next_action - - # print q function of all states at screen - env.print_value_all(agent.q_table) - - # if episode ends, then break - if done: - break \ No newline at end of file diff --git a/Code 1. Grid World/5. Q Learning/QLearning_agent.py b/Code 1. Grid World/5. Q Learning/QLearning_agent.py deleted file mode 100644 index eae23556..00000000 --- a/Code 1. Grid World/5. Q Learning/QLearning_agent.py +++ /dev/null @@ -1,49 +0,0 @@ -import numpy as np -import pandas as pd - - -class QLearningAgent: - def __init__(self, actions): - # actions = [0, 1, 2, 3] - self.actions = actions - self.learning_rate = 0.01 - self.discount_factor = 0.9 - self.epsilon = 0.9 - self.q_table = pd.DataFrame(columns=self.actions) - - # check whether the state was visited - # if this is first visitation, then initialize the q function of the state - - def check_state_exist(self, state): - if state not in self.q_table.index: - self.q_table = self.q_table.append( - pd.Series( - [0] * len(self.actions), - index=self.q_table.columns, - name=state, - ) - ) - - # update q function with sample - def learn(self, state, action, reward, next_state): - self.check_state_exist(next_state) - q_1 = self.q_table.ix[state, action] - # using Bellman Optimality Equation to update q function - q_2 = reward + self.discount_factor * self.q_table.ix[next_state, :].max() - self.q_table.ix[state, action] += self.learning_rate * (q_2 - q_1) - - # get action for the state according to the q function table - # agent pick action of epsilon-greedy policy - def get_action(self, state): - self.check_state_exist(state) - - if np.random.rand() > self.epsilon: - # take random action - action = np.random.choice(self.actions) - else: - # take action according to the q function table - state_action = self.q_table.ix[state, :] - state_action = state_action.reindex(np.random.permutation(state_action.index)) - action = state_action.argmax() - - return action diff --git a/Code 1. Grid World/5. Q Learning/run.py b/Code 1. Grid World/5. 
Q Learning/run.py deleted file mode 100644 index 82c6dc5c..00000000 --- a/Code 1. Grid World/5. Q Learning/run.py +++ /dev/null @@ -1,34 +0,0 @@ -from environment import Env -from QLearning_agent import QLearningAgent - - -if __name__ == "__main__": - env = Env() - agent = QLearningAgent(actions=list(range(env.n_actions))) - - for episode in range(1000): - # reset environment and initialize state - state = env.reset() - - while True: - env.render() - - # get action of state from agent - action = agent.get_action(str(state)) - - # take action and doing one step in the environment - # environment return next state, immediate reward and - # information about terminal of episode - next_state, reward, done = env.step(action) - - # with sample , agent learns new q function - agent.learn(str(state), action, reward, str(next_state)) - - state = next_state - - # print q function of all states at screen - env.print_value_all(agent.q_table) - - # if episode ends, then break - if done: - break diff --git a/Code 1. Grid World/6. DQN/Gridworld_DQN.py b/Code 1. Grid World/6. DQN/Gridworld_DQN.py deleted file mode 100644 index e9d5fac1..00000000 --- a/Code 1. Grid World/6. DQN/Gridworld_DQN.py +++ /dev/null @@ -1,174 +0,0 @@ -import copy -import pylab -import random -import numpy as np -from environment import Env -from collections import deque -from keras.layers import Dense -from keras.optimizers import Adam -from keras.models import Sequential - -EPISODES = 1000 - - -# this is DQN Agent for the Cartpole -# it uses Neural Network to approximate q function -# and replay memory & target q network -class DQNAgent: - def __init__(self): - # if you want to see Cartpole learning, then change to True - self.render = False - - # actions which agent can do - self.action_space = [0, 1, 2, 3, 4] - # get size of state and action - self.action_size = len(self.action_space) - self.state_size = 22 - self.discount_factor = 0.99 - self.learning_rate = 0.001 - - self.epsilon = 1. 
# exploration - self.epsilon_decay = .9999 - self.epsilon_min = 0.01 - self.batch_size = 32 - self.train_start = 100 - - # create replay memory using deque - self.memory = deque(maxlen=10000) - self.model = self.build_model() - self.target_model = self.build_model() - # copy the model to target model - # --> initialize the target model so that the parameters of model & target model to be same - self.update_target_model() - - # approximate Q function using Neural Network - # state is input and Q Value of each action is output of network - def build_model(self): - model = Sequential() - model.add(Dense(20, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform')) - model.add(Dense(20, activation='relu', kernel_initializer='he_uniform')) - model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform')) - model.summary() - model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) - return model - - # after some time interval update the target model to be same with model - def update_target_model(self): - self.target_model.set_weights(self.model.get_weights()) - - # get action from model using epsilon-greedy policy - def get_action(self, state): - if np.random.rand() <= self.epsilon: - # The agent acts randomly - return random.randrange(self.action_size) - else: - # Predict the reward value based on the given state - state = np.float32(state) - q_values = self.model.predict(state) - return np.argmax(q_values[0]) - - # save sample to the replay memory - def replay_memory(self, state, action, reward, next_state, done): - self.memory.append((state, action, reward, next_state, done)) - if self.epsilon > self.epsilon_min: - self.epsilon *= self.epsilon_decay - - # pick samples randomly from replay memory (with batch_size) - def train_replay(self): - if len(self.memory) < self.train_start: - return - batch_size = min(self.batch_size, len(self.memory)) - mini_batch = random.sample(self.memory, batch_size) - - update_input = np.zeros((batch_size, self.state_size)) - update_target = np.zeros((batch_size, self.action_size)) - - for i in range(batch_size): - state, action, reward, next_state, done = mini_batch[i] - reward = np.float32(reward) - state = np.float32(state) - next_state = np.float32(next_state) - target = self.model.predict(state)[0] - - # like Q Learning, get maximum Q value at s' - # But from target model - if done: - target[action] = reward - else: - target = reward + self.discount_factor * \ - np.amax(self.model.predict(next_state)[0]) - - update_input[i] = state - update_target[i] = target - - # make minibatch which includes target q value and predicted q value - # and do the model fit! 
- self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0) - - # load the saved model - def load_model(self, name): - self.model.load_weights(name) - - # save the model which is under training - def save_model(self, name): - self.model.save_weights(name) - - -if __name__ == "__main__": - # maze game - # env = Maze() - env = Env() - agent = DQNAgent() - - global_step = 0 - # agent.load("same_vel_episode2 : 1000") - scores, episodes = [], [] - - for e in range(EPISODES): - done = False - score = 0 - state = env.reset() - state = np.reshape(state, [1, 22]) - - while not done: - # fresh env - if agent.render: - env.render() - global_step += 1 - - # get action for the current state and go one step in environment - action = agent.get_action(state) - next_state, reward, done = env.step(action) - next_state = np.reshape(next_state, [1, 22]) - - agent.replay_memory(state, action, reward, next_state, done) - # every time step we do training - agent.train_replay() - score += reward - - state = copy.deepcopy(next_state) - print("reward:", reward, " done:", done, " time_step:", global_step, " epsilon:", agent.epsilon) - - # every 100 time steps update the target model to be same with model - if global_step % 100 == 0: - agent.update_target_model() - - if done: - scores.append(score) - episodes.append(e) - pylab.plot(episodes, scores, 'b') - pylab.savefig("./save_graph/10by10.png") - print("episode:", e, " score:", score, " memory length:", len(agent.memory), - " epsilon:", agent.epsilon) - - if e % 100 == 0: - pass - agent.save_model("./save_model/10by10") - - # end of game - print('game over') - env.destroy() - - - - diff --git a/Code 1. Grid World/6. DQN/save_graph/10by10.png b/Code 1. Grid World/6. DQN/save_graph/10by10.png deleted file mode 100644 index b2fdee85..00000000 Binary files a/Code 1. Grid World/6. DQN/save_graph/10by10.png and /dev/null differ diff --git a/Code 1. Grid World/6. DQN/save_model/10by10 b/Code 1. Grid World/6. DQN/save_model/10by10 deleted file mode 100644 index d1b416c4..00000000 Binary files a/Code 1. Grid World/6. DQN/save_model/10by10 and /dev/null differ diff --git a/Code 1. Grid World/7. Policy Gradient/save_graph/10by10.png b/Code 1. Grid World/7. Policy Gradient/save_graph/10by10.png deleted file mode 100644 index dc314d66..00000000 Binary files a/Code 1. Grid World/7. Policy Gradient/save_graph/10by10.png and /dev/null differ diff --git a/Code 1. Grid World/7. Policy Gradient/save_model/10by10 b/Code 1. Grid World/7. Policy Gradient/save_model/10by10 deleted file mode 100644 index abba078c..00000000 Binary files a/Code 1. Grid World/7. Policy Gradient/save_model/10by10 and /dev/null differ diff --git a/Code 2. Cartpole/1. DQN/save_graph/Cartpole_DQN14.png b/Code 2. Cartpole/1. DQN/save_graph/Cartpole_DQN14.png deleted file mode 100644 index 5c0f54d3..00000000 Binary files a/Code 2. Cartpole/1. DQN/save_graph/Cartpole_DQN14.png and /dev/null differ diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole10.h5 b/Code 2. Cartpole/1. DQN/save_model/Cartpole10.h5 deleted file mode 100644 index dc6f1d69..00000000 Binary files a/Code 2. Cartpole/1. DQN/save_model/Cartpole10.h5 and /dev/null differ diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole8.h5 b/Code 2. Cartpole/1. DQN/save_model/Cartpole8.h5 deleted file mode 100644 index 02094b4e..00000000 Binary files a/Code 2. Cartpole/1. DQN/save_model/Cartpole8.h5 and /dev/null differ diff --git a/Code 2. Cartpole/1. DQN/save_model/Cartpole9.h5 b/Code 2. Cartpole/1. 
DQN/save_model/Cartpole9.h5 deleted file mode 100644 index ce883e7d..00000000 Binary files a/Code 2. Cartpole/1. DQN/save_model/Cartpole9.h5 and /dev/null differ diff --git a/Code 2. Cartpole/2. Double DQN/save_graph/Cartpole_DoubleDQN.png b/Code 2. Cartpole/2. Double DQN/save_graph/Cartpole_DoubleDQN.png deleted file mode 100644 index b77a86eb..00000000 Binary files a/Code 2. Cartpole/2. Double DQN/save_graph/Cartpole_DoubleDQN.png and /dev/null differ diff --git a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN1.h5 b/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN1.h5 deleted file mode 100644 index da416bfe..00000000 Binary files a/Code 2. Cartpole/2. Double DQN/save_model/Cartpole_DoubleDQN1.h5 and /dev/null differ diff --git a/Code 2. Cartpole/3. Dueling DQN/Cartpole_DuelingDQN.py b/Code 2. Cartpole/3. Dueling DQN/Cartpole_DuelingDQN.py deleted file mode 100644 index a51bf180..00000000 --- a/Code 2. Cartpole/3. Dueling DQN/Cartpole_DuelingDQN.py +++ /dev/null @@ -1,180 +0,0 @@ -import sys -import gym -import pylab -import random -import numpy as np -from collections import deque -from keras import backend as k -from keras.models import Model -from keras.optimizers import Adam -from keras.layers import Dense, Lambda, merge, Input - -EPISODES = 300 - - -# this is Dueling DQN Agent for the Cartpole -# it uses Neural Network to approximate q function -# and replay memory & target q network -class DuelingDQNAgent: - def __init__(self, state_size, action_size): - # if you want to see Cartpole learning, then change to True - self.render = False - - # get size of state and action - self.state_size = state_size - self.action_size = action_size - - # these is hyper parameters for the Dueling DQN - self.discount_factor = 0.99 - self.learning_rate = 0.001 - self.epsilon = 1.0 - self.epsilon_decay = 0.999 - self.epsilon_min = 0.01 - self.batch_size = 12 - self.train_start = 1000 - # create replay memory using deque - self.memory = deque(maxlen=2000) - - # create main model and target model - self.model = self.build_model() - self.target_model = self.build_model() - - # copy the model to target model - # --> initialize the target model so that the parameters of model & target model to be same - self.update_target_model() - - # the key point of Dueling network - # the network devided into two streams, 1. value function 2. 
advantaget function - # at the end of network, two streams are merged into one output stream which is Q function - def build_model(self): - input = Input(shape=(self.state_size,)) - x = Dense(32, input_shape=(self.state_size,), activation='relu', kernel_initializer='he_uniform')(input) - x = Dense(16, activation='relu', kernel_initializer='he_uniform')(x) - - state_value = Dense(1, kernel_initializer='he_uniform')(x) - state_value = Lambda(lambda s: k.expand_dims(s[:, 0], -1), output_shape=(self.action_size,))(state_value) - - action_advantage = Dense(self.action_size, kernel_initializer='he_uniform')(x) - action_advantage = Lambda(lambda a: a[:, :] - k.mean(a[:, :], keepdims=True), - output_shape=(self.action_size,))(action_advantage) - - q_value = merge([state_value, action_advantage], mode='sum') - model = Model(input=input, output=q_value) - model.summary() - model.compile(loss='mse', optimizer=Adam(self.learning_rate)) - return model - - # after some time interval update the target model to be same with model - def update_target_model(self): - self.target_model.set_weights(self.model.get_weights()) - - # get action from model using epsilon-greedy policy - def get_action(self, state): - if np.random.rand() <= self.epsilon: - return random.randrange(self.action_size) - else: - q_value = self.model.predict(state) - return np.argmax(q_value[0]) - - # save sample to the replay memory - def replay_memory(self, state, action, reward, next_state, done): - self.memory.append((state, action, reward, next_state, done)) - if self.epsilon > self.epsilon_min: - self.epsilon *= self.epsilon_decay - - # pick samples randomly from replay memory (with batch_size) - def train_replay(self): - if len(self.memory) < self.train_start: - return - batch_size = min(self.batch_size, len(self.memory)) - mini_batch = random.sample(self.memory, batch_size) - - update_input = np.zeros((batch_size, self.state_size)) - update_target = np.zeros((batch_size, self.action_size)) - - for i in range(batch_size): - state, action, reward, next_state, done = mini_batch[i] - target = self.model.predict(state)[0] - - # like Q Learning, get maximum Q value at s' - # But from target model - if done: - target[action] = reward - else: - target[action] = reward + self.discount_factor * \ - np.amax(self.target_model.predict(next_state)[0]) - - update_input[i] = state - update_target[i] = target - - # make minibatch which includes target q value and predicted q value - # and do the model fit! 
- self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0) - - # load the saved model - def load_model(self, name): - self.model.load_weights(name) - - # save the model which is under training - def save_model(self, name): - self.model.save_weights(name) - - -if __name__ == "__main__": - # in case of CartPole-v1, you can play until 500 time step - env = gym.make('CartPole-v1') - # get size of state and action from environment - state_size = env.observation_space.shape[0] - action_size = env.action_space.n - - agent = DuelingDQNAgent(state_size, action_size) - - scores, episodes = [], [] - - for e in range(EPISODES): - done = False - score = 0 - state = env.reset() - state = np.reshape(state, [1, state_size]) - # agent.load_model("./save_model/cartpole-master.h5") - - while not done: - if agent.render: - env.render() - - # get action for the current state and go one step in environment - action = agent.get_action(state) - next_state, reward, done, info = env.step(action) - next_state = np.reshape(next_state, [1, state_size]) - # if an action make the episode end, then gives penalty of -100 - reward = reward if not done or score == 499 else -100 - - # save the sample to the replay memory - agent.replay_memory(state, action, reward, next_state, done) - # every time step do the training - agent.train_replay() - score += reward - state = next_state - - if done: - env.reset() - # every episode update the target model to be same with model - - agent.update_target_model() - # every episode, plot the play time - score = score if score == 499 else score + 100 - scores.append(score) - episodes.append(e) - pylab.plot(episodes, scores, 'b') - pylab.savefig("./save_graph/Cartpole_Dueling_DQN.png") - print("episode:", e, " score:", score, " memory length:", len(agent.memory), - " epsilon:", agent.epsilon) - - # if the mean of scores of last 10 episode is bigger than 490 - # stop training - if np.mean(scores[-min(10, len(scores)):]) > 490: - sys.exit() - - # save the model - if e % 50 == 0: - agent.save_model("./save_model/Cartpole_DQN.h5") diff --git a/Code 2. Cartpole/3. Dueling DQN/save_graph/Cartpole_Dueling_DQN.png b/Code 2. Cartpole/3. Dueling DQN/save_graph/Cartpole_Dueling_DQN.png deleted file mode 100644 index 99ad0b00..00000000 Binary files a/Code 2. Cartpole/3. Dueling DQN/save_graph/Cartpole_Dueling_DQN.png and /dev/null differ diff --git a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN.h5 b/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN.h5 deleted file mode 100644 index 72105e16..00000000 Binary files a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN.h5 and /dev/null differ diff --git a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN1.h5 b/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN1.h5 deleted file mode 100644 index eac02b32..00000000 Binary files a/Code 2. Cartpole/3. Dueling DQN/save_model/Cartpole_DQN1.h5 and /dev/null differ diff --git a/Code 2. Cartpole/4. Policy Gradient/Cartpole_PolicyGradient.py b/Code 2. Cartpole/4. Policy Gradient/Cartpole_PolicyGradient.py deleted file mode 100644 index 7a0ac089..00000000 --- a/Code 2. Cartpole/4. 
Policy Gradient/Cartpole_PolicyGradient.py +++ /dev/null @@ -1,163 +0,0 @@ -import sys -import gym -import pylab -import numpy as np -from keras.layers import Dense -from keras.models import Sequential -from keras.optimizers import Adam -from keras import backend as K - -EPISODES = 1000 - - -class PGAgent: - def __init__(self, state_size, action_size): - # Cartpole이 학습하는 것을 보려면 True로 바꿀 것 - self.render = True - - # agent를 학습시키지 않으려면 False로 바꿀 것 - self.is_train = True - - # state와 action의 크기를 가져와서 모델을 생성하는데 사용함 - self.state_size = state_size - self.action_size = action_size - - # Cartpole REINFORCE 학습의 Hyper parameter 들 - self.discount_factor = 0.99 - self.learning_rate = 0.001 - - # 학습할 모델을 생성 - self.model = self.build_model() - - # Policy Gradient 네트워크 학습하는 함수를 만듬 - self.optimizer = self.optimizer() - - # 상태, 행동, 보상을 기억하기 위한 리스트 생성 - self.states, self.actions, self.rewards = [], [], [] - - # Deep Neural Network 를 통해서 정책을 근사 - # 상태가 입력, 각 행동에 대한 확률이 출력인 모델을 생성 - def build_model(self): - model = Sequential() - model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform')) - model.add(Dense(24, activation='relu', kernel_initializer='glorot_uniform')) - # 마지막 softmax 계층으로 각 행동에 대한 확률을 만드는 모델을 생성 - model.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform')) - model.summary() - - return model - - def optimizer(self): - action = K.placeholder(shape=[None, self.action_size]) - discounted_rewards = K.placeholder(shape=[None, ]) - - # Policy Gradient 의 핵심 - # log(정책) * return 의 gradient 를 구해서 최대화시킴 - good_prob = K.sum(action * self.model.output, axis=1) - eligibility = K.log(good_prob) * discounted_rewards - loss = -K.sum(eligibility) - - optimizer = Adam(lr=self.learning_rate) - updates = optimizer.get_updates(self.model.trainable_weights, [], loss) - train = K.function([self.model.input, action, discounted_rewards], [], updates=updates) - - return train - - # 행동의 선택은 현재 네트워크에 대해서 각 행동에 대한 확률로 정책을 사용 - def get_action(self, state): - policy = self.model.predict(state, batch_size=1).flatten() - return np.random.choice(self.action_size, 1, p=policy)[0] - - # 에피소드가 끝나면 해당 에피소드의 보상를 이용해 return을 계산 - def discount_rewards(self, rewards): - discounted_rewards = np.zeros_like(rewards) - running_add = 0 - for t in reversed(range(0, len(rewards))): - running_add = running_add * self.discount_factor + rewards[t] - discounted_rewards[t] = running_add - return discounted_rewards - - # 각 스텝의 을 저장하는 함수 - def memory(self, state, action, reward): - self.states.append(state[0]) - self.rewards.append(reward) - act = np.zeros(self.action_size) - act[action] = 1 - self.actions.append(act) - - # 에피소드가 끝나면 모아진 메모리로 학습 - def train_episodes(self): - discounted_rewards = self.discount_rewards(self.rewards) - discounted_rewards -= np.mean(discounted_rewards) - discounted_rewards /= np.std(discounted_rewards) - - self.optimizer([self.states, self.actions, discounted_rewards]) - self.states, self.actions, self.rewards = [], [], [] - - # 저장한 모델을 불러옴 - def load_model(self, name): - self.model.load_weights(name) - - # 학습된 모델을 저장함 - def save_model(self, name): - self.model.save_weights(name) - - -if __name__ == "__main__": - # CartPole-v1의 경우 500 타임스텝까지 플레이가능 - env = gym.make('CartPole-v1') - - # 환경으로부터 상태와 행동의 크기를 가져옴 - state_size = env.observation_space.shape[0] - action_size = env.action_space.n - - # PG 에이전트의 생성 - agent = PGAgent(state_size, action_size) - - scores, episodes = [], [] - - for e in range(EPISODES): - done = False - score = 0 - state = 
-
-
-if __name__ == "__main__":
-    # in case of CartPole-v1, you can play up to 500 time steps
-    env = gym.make('CartPole-v1')
-
-    # get the sizes of state and action from the environment
-    state_size = env.observation_space.shape[0]
-    action_size = env.action_space.n
-
-    # create the PG agent
-    agent = PGAgent(state_size, action_size)
-
-    scores, episodes = [], []
-
-    for e in range(EPISODES):
-        done = False
-        score = 0
-        state = env.reset()
-        state = np.reshape(state, [1, state_size])
-        # agent.load_model("./save_model/cartpole-master.h5")
-
-        while not done:
-            if agent.render:
-                env.render()
-
-            # select an action for the current state and advance one step
-            action = agent.get_action(state)
-            next_state, reward, done, info = env.step(action)
-            next_state = np.reshape(next_state, [1, state_size])
-            reward = reward if not done or score == 499 else -100
-
-            # save the <state, action, reward> sample to memory
-            if agent.is_train:
-                agent.memory(state, action, reward)
-
-            score += reward
-            state = next_state
-
-            if done:
-                env.reset()
-                # at the end of every episode, train on the collected samples
-                if agent.is_train:
-                    agent.train_episodes()
-
-                # plot the score per episode
-                score = score if score == 500 else score + 100
-                scores.append(score)
-                episodes.append(e)
-                pylab.plot(episodes, scores, 'b')
-                pylab.savefig("./save_graph/Cartpole_PG.png")
-                print("episode:", e, " score:", score)
-
-                # stop training if the mean score of the last 10 episodes is above 490
-                if np.mean(scores[-min(10, len(scores)):]) > 490:
-                    sys.exit()
-
-        # save the model every 50 episodes
-        if e % 50 == 0:
-            agent.save_model("./save_model/Cartpole_PG.h5")
diff --git a/Code 2. Cartpole/4. Policy Gradient/save_graph/Cartpole_PG.png b/Code 2. Cartpole/4. Policy Gradient/save_graph/Cartpole_PG.png deleted file mode 100644 index 796da0d4..00000000 Binary files a/Code 2. Cartpole/4. Policy Gradient/save_graph/Cartpole_PG.png and /dev/null differ
diff --git a/Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_DQN1.h5 b/Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_DQN1.h5 deleted file mode 100644 index 2ff204a3..00000000 Binary files a/Code 2. Cartpole/4. Policy Gradient/save_model/Cartpole_DQN1.h5 and /dev/null differ
diff --git a/Code 2. Cartpole/5. Actor-Critic/Cartpole_ActorCritic.py b/Code 2. Cartpole/5. Actor-Critic/Cartpole_ActorCritic.py deleted file mode 100644 index 2b0ea5eb..00000000
--- a/Code 2. Cartpole/5. Actor-Critic/Cartpole_ActorCritic.py
+++ /dev/null
@@ -1,184 +0,0 @@
-import sys
-import gym
-import pylab
-import random
-import numpy as np
-from collections import deque
-from keras.layers import Dense
-from keras.optimizers import Adam
-from keras.models import Sequential
-from keras import backend as K
-
-EPISODES = 300
-
-
-class ACAgent:
-    def __init__(self, state_size, action_size):
-        # set True to watch CartPole while it is learning
-        self.render = False
-
-        # get the sizes of state and action to build the models
-        self.state_size = state_size
-        self.action_size = action_size
-
-        # hyperparameters for CartPole actor-critic
-        self.discount_factor = 0.99
-        self.actor_lr = 0.001
-        self.critic_lr = 0.01
-        self.batch_size = 32
-        self.train_start = 1000
-        self.memory = deque(maxlen=10000)
-
-        # create the actor and critic networks needed for actor-critic
-        self.actor, self.critic = self.build_model()
-
-        # build the optimizer that trains the actor network
-        self.actor_optimizer = self.actor_optimizer()
-
-    # approximate the policy and the value with deep neural networks:
-    # actor  -> input is the state, output is the probability of each action
-    # critic -> input is the state, output is the value of the state
-    def build_model(self):
-        # actor network
-        actor = Sequential()
-        actor.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_uniform'))
-        actor.add(Dense(24, activation='relu', kernel_initializer='glorot_uniform'))
-        actor.add(Dense(self.action_size, activation='softmax', kernel_initializer='glorot_uniform'))
-
-        # critic network
-        critic = Sequential()
-        critic.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer="he_uniform"))
-        critic.add(Dense(24, activation='relu', kernel_initializer='he_uniform'))
-        critic.add(Dense(1, activation='linear', kernel_initializer='he_uniform'))
-        critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr))
-
-        actor.summary()
-        critic.summary()
-
-        return actor, critic
-
-    def actor_optimizer(self):
-        action = K.placeholder(shape=[None, self.action_size])
-        advantages = K.placeholder(shape=[None, ])
-
-        # the core of policy gradient:
-        # take the gradient of log(policy) * advantage and maximize it
-        good_prob = K.sum(action * self.actor.output, axis=1)
-        eligibility = K.log(good_prob + 1e-10) * advantages
-        loss = -K.sum(eligibility)
-
-        optimizer = Adam(lr=self.actor_lr)
-        updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
-        train = K.function([self.actor.input, action, advantages], [], updates=updates)
-
-        return train
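To make the quantities in `actor_optimizer` and the critic update below concrete, here is one transition traced through the target, the advantage, and the actor loss, with made-up numbers:

```python
import numpy as np

# one transition, with illustrative critic outputs
reward, discount_factor = 1.0, 0.99
value = 10.0        # critic's estimate V(s)
next_value = 10.5   # critic's estimate V(s')

target = reward + discount_factor * next_value   # 11.395, the critic's regression target
advantage = target - value                       # 1.395, how much better than expected

# actor loss for the chosen action (cf. actor_optimizer above)
probs = np.array([0.7, 0.3])          # actor output pi(.|s)
action_onehot = np.array([1.0, 0.0])  # the action that was taken
good_prob = np.sum(action_onehot * probs)
loss = -np.log(good_prob + 1e-10) * advantage    # positive advantage -> raise pi(a|s)
print(target, advantage, loss)
```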
-    # sample batch_size samples at random from the replay memory and train on them
-    def train_replay(self):
-        if len(self.memory) < self.train_start:
-            return
-        mini_batch = random.sample(self.memory, self.batch_size)
-
-        update_input = np.zeros((self.batch_size, self.state_size))
-        update_action = np.zeros((self.batch_size, self.action_size))
-        update_target = np.zeros((self.batch_size, 1))
-        advantages = np.zeros((self.batch_size,))
-
-        for i in range(self.batch_size):
-            state, action, reward, next_state, done = mini_batch[i]
-            value = self.critic.predict(state)[0]
-
-            # use the state value of s' to build the critic's update target
-            if done:
-                target = reward
-            else:
-                target = reward + self.discount_factor * \
-                                  self.critic.predict(next_state)[0]
-            update_input[i] = state
-            update_action[i] = action
-            update_target[i] = target
-            advantages[i] = target - value
-
-        # build a minibatch of targets and current values, then update the critic model in one call
-        self.critic.fit(update_input, update_target, batch_size=self.batch_size, epochs=1, verbose=0)
-
-        # train the actor network with the states, actions and the resulting advantages (target - value)
-        self.actor_optimizer([update_input, update_action, advantages])
-
-    # sample an action stochastically, using the actor's output probabilities as the policy
-    def get_action(self, state):
-        policy = self.actor.predict(state, batch_size=1).flatten()
-        return np.random.choice(self.action_size, 1, p=policy)[0]
-
-    # save the <state, action, reward, next state, done> sample of each step
-    def replay_memory(self, state, action, reward, next_state, done):
-        act = np.zeros(self.action_size)
-        act[action] = 1
-        self.memory.append((state, act, reward, next_state, done))
-
-    # load the saved models (two files, matching save_model and the call below)
-    def load_model(self, name1, name2):
-        self.actor.load_weights(name1)
-        self.critic.load_weights(name2)
-
-    # save the trained models
-    def save_model(self, name1, name2):
-        self.actor.save_weights(name1)
-        self.critic.save_weights(name2)
-
-
-if __name__ == "__main__":
-    # in case of CartPole-v1, you can play up to 500 time steps
-    env = gym.make('CartPole-v1')
-
-    # get the sizes of state and action from the environment
-    state_size = env.observation_space.shape[0]
-    action_size = env.action_space.n
-
-    agent = ACAgent(state_size, action_size)
-    scores, episodes = [], []
-
-    for e in range(EPISODES):
-        done = False
-        score = 0
-        state = env.reset()
-        state = np.reshape(state, [1, state_size])
-        # agent.load_model("./save_model/Cartpole-Actor.h5", "./save_model/Cartpole-Critic.h5")
-
-        while not done:
-            if agent.render:
-                env.render()
-
-            # select an action for the current state and advance one step
-            action = agent.get_action(state)
-            next_state, reward, done, info = env.step(action)
-            next_state = np.reshape(next_state, [1, state_size])
-            # give a penalty of -100 to the action that ended the episode
-            reward = reward if not done or score == 499 else -100
-
-            # save the <s, a, r, s', done> sample to the replay memory
-            agent.replay_memory(state, action, reward, next_state, done)
-            # train on every time step
-            agent.train_replay()
-
-            score += reward
-            state = next_state
-
-            if done:
-                env.reset()
-
-                # plot how many time steps the cartpole stayed up in each episode
-                score = score if score == 500 else score + 100
-                scores.append(score)
-                episodes.append(e)
-                pylab.plot(episodes, scores, 'b')
-                pylab.savefig("./save_graph/Cartpole_ActorCritc.png")
-                print("episode:", e, " score:", score, " memory length:", len(agent.memory))
-
-                # stop training if the mean score of the last 10 episodes is above 490
-                if np.mean(scores[-min(10, len(scores)):]) > 490:
-                    sys.exit()
-
-        # save the models every 50 episodes
-        if e % 50 == 0:
-            agent.save_model("./save_model/Cartpole_Actor.h5", "./save_model/Cartpole_Critic.h5")
diff --git a/Code 2. Cartpole/5. Actor-Critic/save_graph/Cartpole_ActorCritc.png b/Code 2. Cartpole/5. Actor-Critic/save_graph/Cartpole_ActorCritc.png deleted file mode 100644 index da9f6ed2..00000000 Binary files a/Code 2. Cartpole/5. Actor-Critic/save_graph/Cartpole_ActorCritc.png and /dev/null differ
diff --git a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Actor.h5 b/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Actor.h5 deleted file mode 100644 index df132fc2..00000000 Binary files a/Code 2. Cartpole/5. Actor-Critic/save_model/Cartpole_Actor.h5 and /dev/null differ
diff --git a/Code 3. Atari Game/1. Breakout/Breakout_DQN.py b/Code 3. Atari Game/1. Breakout/Breakout_DQN.py deleted file mode 100644 index b3c26253..00000000 --- a/Code 3. Atari Game/1. 
Breakout/Breakout_DQN.py +++ /dev/null @@ -1,174 +0,0 @@ -import gym -import pylab -import random -import numpy as np -from collections import deque -from skimage.color import rgb2gray -from skimage.transform import resize - -from keras.models import Sequential -from keras.optimizers import RMSprop -from keras.layers import Dense, Flatten -from keras.layers.convolutional import Conv2D - -EPISODES = 5000 - - -class DQNAgent: - def __init__(self): - self.render = True - - self.state_size = (84, 84, 4) - self.action_size = 6 - - self.epsilon = 1.0 - self.epsilon_start = 1.0 - self.epsilon_end = 0.1 - self.epsilon_decay = 1000000. - self.epsilon_decay_step = \ - (self.epsilon_start - self.epsilon_end) / self.epsilon_decay - - self.batch_size = 32 - self.train_start = 20000 - self.update_target_rate = 10000 - self.discount_factor = 0.99 - self.memory = deque(maxlen=400000) - self.no_op_steps = 30 - self.learning_rate = 0.00025 - self.momentum = 0.95 - self.min_gradient = 0.01 - - self.model = self.build_model() - self.target_model = self.build_model() - self.update_target_model() - - def build_model(self): - model = Sequential() - model.add(Conv2D(32, (8, 8), input_shape=self.state_size, activation='relu', strides=(4, 4), - kernel_initializer='glorot_uniform')) - model.add(Conv2D(64, (4, 4), activation='relu', strides=(2, 2), - kernel_initializer='glorot_uniform')) - model.add(Conv2D(64, (3, 3), activation='relu', strides=(1, 1), - kernel_initializer='glorot_uniform')) - model.add(Flatten()) - model.add(Dense(512, activation='relu', kernel_initializer='glorot_uniform')) - model.add(Dense(self.action_size)) - model.summary() - model.compile(loss='mse', optimizer=RMSprop( - lr=self.learning_rate, rho=self.momentum, epsilon=self.min_gradient)) - return model - - def update_target_model(self): - self.target_model.set_weights(self.model.get_weights()) - - def get_action(self, history): - history = np.float32(history/255.0) - if np.random.rand() <= self.epsilon: - return random.randrange(self.action_size) - else: - q_value = self.model.predict(history) - return np.argmax(q_value[0]) - - def replay_memory(self, history, action, reward, history1, done): - self.memory.append((history, action, reward, history1, done)) - if self.epsilon > self.epsilon_end: - self.epsilon -= self.epsilon_decay_step - - def train_replay(self): - if len(self.memory) < self.train_start: - return - batch_size = min(self.batch_size, len(self.memory)) - mini_batch = random.sample(self.memory, batch_size) - - update_input = np.zeros((batch_size, self.state_size[0], self.state_size[1], self.state_size[2])) - update_target = np.zeros((batch_size, self.action_size)) - - for i in range(batch_size): - history, action, reward, history1, done = mini_batch[i] - history = np.float32(history/255.) - history1 = np.float32(history1/255.) 
-            target = self.model.predict(history)[0]
-
-            if done:
-                target[action] = reward
-            else:
-                target[action] = reward + self.discount_factor * np.amax(self.target_model.predict(history1)[0])
-            update_target[i] = target
-            update_input[i] = history
-
-        self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)
-
-    def load_model(self, name):
-        self.model.load_weights(name)
-
-    def save_model(self, name):
-        self.model.save_weights(name)
-
-
-def pre_processing(next_observe, observe):
-    # take the pixel-wise maximum of two consecutive frames to remove flickering,
-    # then convert to an 84x84 grayscale image
-    processed_observe = np.maximum(next_observe, observe)
-    processed_observe = np.uint8(resize(rgb2gray(processed_observe), (84, 84), mode='constant')*255)
-    return processed_observe
-
-
-if __name__ == "__main__":
-    env = gym.make('BreakoutDeterministic-v3')
-    agent = DQNAgent()
-
-    scores, episodes, global_step = [], [], 0
-
-    for e in range(EPISODES):
-        done = False
-        dead = False
-        score, start_live = 0, 5
-        observe = env.reset()
-        next_observe = observe
-        # do nothing for a random number of steps at the start of an episode
-        for _ in range(random.randint(1, agent.no_op_steps)):
-            observe = next_observe
-            next_observe, _, _, _ = env.step(1)
-
-        state = pre_processing(next_observe, observe)
-        history = np.stack((state, state, state, state), axis=2)
-        history = history.reshape(1, history.shape[0], history.shape[1], history.shape[2])
-
-        while not done:
-            if agent.render:
-                env.render()
-            observe = next_observe
-            action = agent.get_action(history)
-            next_observe, reward, done, info = env.step(action)
-            next_state = pre_processing(next_observe, observe)
-            next_state = np.reshape([next_state], (1, 84, 84, 1))
-            history1 = np.append(next_state, history[:, :, :, :3], axis=3)
-
-            if start_live > info['ale.lives']:
-                dead = True
-                start_live = info['ale.lives']
-
-            agent.replay_memory(history, action, reward, history1, done)
-            agent.train_replay()
-
-            score += reward
-            # count global steps; without this the target network is never synced on schedule
-            global_step += 1
-
-            if dead:
-                history = np.stack((next_state, next_state, next_state, next_state), axis=2)
-                history = np.reshape([history], (1, 84, 84, 4))
-                dead = False
-            else:
-                history = history1
-
-            if global_step % agent.update_target_rate == 0:
-                agent.update_target_model()
-
-            if done:
-                env.reset()
-                scores.append(score)
-                episodes.append(e)
-                pylab.plot(episodes, scores, 'b')
-                pylab.savefig("./save_graph/Breakout_DQN.png")
-                print("episode:", e, " score:", score, " memory length:", len(agent.memory),
-                      " epsilon:", agent.epsilon)
-
-        # save the model every 1,000 episodes
-        if e % 1000 == 0:
-            agent.save_model("./save_model/Breakout_DQN.h5")
\ No newline at end of file
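The history bookkeeping in the loop above (`np.append(next_state, history[:, :, :, :3], axis=3)`) maintains a rolling stack of the four most recent processed frames. A shape-level sketch with dummy arrays:

```python
import numpy as np

# the agent's state is a stack of the four most recent 84x84 frames
history = np.zeros((1, 84, 84, 4), dtype=np.uint8)    # frames t-3 .. t
next_state = np.ones((1, 84, 84, 1), dtype=np.uint8)  # newly processed frame t+1
# prepend the new frame and drop the oldest, exactly as in the loop above
history = np.append(next_state, history[:, :, :, :3], axis=3)
print(history.shape)  # (1, 84, 84, 4)
```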
diff --git a/Code 3. Atari Game/1. Breakout/Breakout_PG.py b/Code 3. Atari Game/1. Breakout/Breakout_PG.py deleted file mode 100644 index 2ab553bf..00000000
--- a/Code 3. Atari Game/1. Breakout/Breakout_PG.py
+++ /dev/null
@@ -1,211 +0,0 @@
-import gym
-import random
-import tensorflow as tf
-import numpy as np
-
-DIM = 105*80*2
-gamma = 0.99
-batch_size = 10
-
-def weight_variable(shape):
-    #bound = 1 / np.sqrt(np.sum(shape))
-    initial = tf.truncated_normal(shape, stddev=0.05)
-    return tf.Variable(initial)
-    #return tf.Variable(tf.random_uniform(shape, minval=0, maxval=bound))
-
-def bias_variable(shape):
-    initial = tf.constant(0.1, shape=shape)
-    return tf.Variable(initial)
-
-def prepro(state):
-    # convert the 210x160x3 frame to grayscale (vectorized; the original
-    # per-pixel double loop computed the same means very slowly)
-    gray_state = np.mean(state, axis=2)
-
-    # get rid of background noise and downsample by 2 to 105x80
-    gray_state[gray_state == 142] = 0
-    gray_state = gray_state[::2, ::2]
-    return gray_state
-
-def discountRewards(rewards):
-    discounted_r = np.zeros_like(rewards)
-    running_add = 0
-    for t in reversed(range(0, len(rewards))):
-        running_add = running_add * gamma + rewards[t]
-        discounted_r[t] = running_add
-    return discounted_r
-
-class AGENT():
-    def __init__(self, learning_rate=1e-4):
-        self.learning_rate = learning_rate
-
-        self.conv_W1 = weight_variable([5, 5, 2, 32])
-        self.conv_b1 = bias_variable([32])
-        self.conv_W2 = weight_variable([4, 4, 32, 64])
-        self.conv_b2 = bias_variable([64])
-        self.conv_W3 = weight_variable([3, 3, 64, 64])
-        self.conv_b3 = bias_variable([64])
-
-        self.fc_W1 = weight_variable([13*10*64, 512])
-        self.fc_b1 = bias_variable([512])
-        self.fc_W2 = weight_variable([512, 2])
-        self.fc_b2 = bias_variable([2])
-
-        self.v_conv_W1 = weight_variable([5, 5, 2, 16])
-        self.v_conv_b1 = bias_variable([16])
-        self.v_conv_W2 = weight_variable([4, 4, 16, 32])
-        self.v_conv_b2 = bias_variable([32])
-        self.v_conv_W3 = weight_variable([3, 3, 32, 32])
-        self.v_conv_b3 = bias_variable([32])
-
-        self.v_fc_W1 = weight_variable([13*10*32, 512])
-        self.v_fc_b1 = bias_variable([512])
-        self.v_fc_W2 = weight_variable([512, 1])
-        self.v_fc_b2 = bias_variable([1])
-
-        self.state, self.prob, self.conv_drop, self.fc_drop = self.getPolicy()
-        self.act, self.adv, self.train = self.policyOptimizer()
-        self.v_state, self.value = self.getValue()
-        self.v_n_state, self.n_value = self.getValue()
-        self.rwd, self.v_train = self.valueOptimizer()
-
-        # initialize variables only after the optimizers are built, so their
-        # slot variables are initialized as well
-        self.sess = tf.InteractiveSession()
-        self.sess.run(tf.initialize_all_variables())
-
-    def getAction(self, get_action_state):
-        # sample from the policy network (the original stub ignored the
-        # network and picked 4 or 5 uniformly at random)
-        action_prob = self.getActionProb(get_action_state)
-        action = 4 if random.random() < action_prob[0][0] else 5
-        return action
-
-    def policyOptimizer(self):
-        act = tf.placeholder(tf.float32, [None, 2])
-        adv = tf.placeholder(tf.float32, [None, 1])
-
-        # keep_dims so the log-probabilities stay [N, 1] and multiply
-        # elementwise with adv instead of broadcasting to [N, N]
-        good_probabilities = tf.reduce_sum(tf.mul(self.prob, act),
-                                           reduction_indices=[1], keep_dims=True)
-        log_probabilities = tf.log(tf.clip_by_value(good_probabilities, 1e-10, 1e+8)) * adv
-        loss = -tf.reduce_sum(log_probabilities)
-        optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(loss)
-
-        return act, adv, optimizer
-
-    def valueOptimizer(self):
-        rwd = tf.placeholder(tf.float32, [None, 1])
-
-        value1 = self.value
-        value2 = rwd + self.n_value*gamma
-        v_loss = tf.reduce_mean(tf.square(value2 - value1), reduction_indices=[1])
-        v_optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(v_loss)
-
-        return rwd, v_optimizer
-    def getPolicy(self):
-        state = tf.placeholder(tf.float32, [None, 105, 80, 2])
-        conv_drop = tf.placeholder(tf.float32)
-        fc_drop = tf.placeholder(tf.float32)
-        state_image = tf.reshape(state, [-1, 105, 80, 2])
-
-        conv_h1_out = tf.nn.conv2d(state_image, self.conv_W1, strides=[1, 2, 2, 1], padding="SAME")
-        conv_h1 = tf.nn.relu(conv_h1_out + self.conv_b1)
-        conv_h1_drop = tf.nn.dropout(conv_h1, conv_drop)
-        conv_h2_out = tf.nn.conv2d(conv_h1_drop, self.conv_W2, strides=[1, 2, 2, 1], padding="SAME")
-        conv_h2 = tf.nn.relu(conv_h2_out + self.conv_b2)
-        conv_h2_drop = tf.nn.dropout(conv_h2, conv_drop)
-        conv_h3_out = tf.nn.conv2d(conv_h2_drop, self.conv_W3, strides=[1, 2, 2, 1], padding="SAME")
-        conv_h3 = tf.nn.relu(conv_h3_out + self.conv_b3)
-
-        conv_h3_flat = tf.reshape(conv_h3, [-1, 13*10*64])
-
-        fc_h1 = tf.nn.relu(tf.matmul(conv_h3_flat, self.fc_W1) + self.fc_b1)
-        fc_h1_drop = tf.nn.dropout(fc_h1, fc_drop)
-        prob = tf.nn.softmax(tf.matmul(fc_h1_drop, self.fc_W2) + self.fc_b2)
-
-        return state, prob, conv_drop, fc_drop
-
-    def getValue(self):
-        v_state = tf.placeholder(tf.float32, [None, 105, 80, 2])
-        v_state_image = tf.reshape(v_state, [-1, 105, 80, 2])
-
-        v_conv_h1_out = tf.nn.conv2d(v_state_image, self.v_conv_W1, strides=[1, 2, 2, 1], padding="SAME")
-        v_conv_h1 = tf.nn.relu(v_conv_h1_out + self.v_conv_b1)
-        v_conv_h1_drop = tf.nn.dropout(v_conv_h1, self.conv_drop)
-        v_conv_h2_out = tf.nn.conv2d(v_conv_h1_drop, self.v_conv_W2, strides=[1, 2, 2, 1], padding="SAME")
-        v_conv_h2 = tf.nn.relu(v_conv_h2_out + self.v_conv_b2)
-        v_conv_h2_drop = tf.nn.dropout(v_conv_h2, self.conv_drop)
-        v_conv_h3_out = tf.nn.conv2d(v_conv_h2_drop, self.v_conv_W3, strides=[1, 2, 2, 1], padding="SAME")
-        v_conv_h3 = tf.nn.relu(v_conv_h3_out + self.v_conv_b3)
-
-        v_conv_h3_flat = tf.reshape(v_conv_h3, [-1, 13*10*32])
-
-        v_fc_h1 = tf.nn.relu(tf.matmul(v_conv_h3_flat, self.v_fc_W1) + self.v_fc_b1)
-        v_fc_h1_drop = tf.nn.dropout(v_fc_h1, self.fc_drop)
-        value = tf.matmul(v_fc_h1_drop, self.v_fc_W2) + self.v_fc_b2
-
-        return v_state, value
-
-    def getActionProb(self, get_action_prob_state):
-        action_prob = self.sess.run(self.prob, feed_dict={self.state: get_action_prob_state, self.conv_drop: 1.0, self.fc_drop: 1.0})
-        return action_prob
-
-
-env = gym.make("Breakout-v0")
-agent = AGENT()
-obs = env.reset()
-prev_x = None
-states, rewards, actions, next_states = [], [], [], []
-running_reward = None
-reward_sum, step = 0, 0
-episode_number = 0
-
-while True:
-    env.render()
-    # preprocess the raw frame and stack it with the previous frame as a 2-channel state
-    cur_x = prepro(obs)
-    x = np.stack([cur_x, prev_x], axis=-1) if prev_x is not None else np.zeros([105, 80, 2])
-    x = np.reshape(x, [-1, 105, 80, 2])
-    states.append(x)
-    prev_x = cur_x
-    if step != 0:
-        next_states.append(x)
-
-    action = agent.getAction(x)
-    actions.append([0, 1] if action == 5 else [1, 0])
-    obs, reward, done, info = env.step(action)
-    rewards.append(reward)
-    reward_sum += reward
-    step += 1
-
-    if done:
-        next_states.append(np.reshape(np.zeros(DIM), [-1, 105, 80, 2]))
-        episode_number += 1
-
-        discounted_epi_reward = discountRewards(rewards)
-
-        #discounted_epi_reward -= np.mean(discounted_epi_reward)
-        #discounted_epi_reward /= np.std(discounted_epi_reward)
-
-        epi_state = np.vstack(states)
-        epi_reward = np.vstack(discounted_epi_reward)
-        epi_action = np.vstack(actions)
-        epi_n_state = np.vstack(next_states)
-
-        states, rewards, actions, next_states = [], [], [], []
-
-        if episode_number % batch_size == 0:
-            agent.sess.run(agent.v_train, feed_dict={agent.v_state: epi_state, agent.rwd: epi_reward,
-                                                     agent.v_n_state: epi_n_state, agent.conv_drop: 0.4, agent.fc_drop: 0.5})
-
-            # the advantage is the discounted return minus the critic's value estimate
-            epi_advantage = epi_reward - agent.sess.run(agent.value, feed_dict={agent.v_state: epi_state})
-
-            agent.sess.run(agent.train, feed_dict={agent.state: epi_state, agent.act: epi_action,
-                                                   agent.adv: epi_advantage, agent.conv_drop: 0.4, agent.fc_drop: 0.5})
-        reward_sum, step = 0, 0
-        obs = env.reset()  # reset env
-        prev_x = None
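This script targets the TensorFlow 0.x API. Under TensorFlow 1.x the deprecated calls it uses map to newer names; a small sketch of the equivalents (assuming TF 1.x):

```python
import tensorflow as tf

# TF 0.x                               ->  TF 1.x
# tf.mul(a, b)                         ->  tf.multiply(a, b)
# tf.reduce_sum(x, reduction_indices)  ->  tf.reduce_sum(x, axis=...)
# tf.initialize_all_variables()        ->  tf.global_variables_initializer()
prob = tf.constant([[0.7, 0.3]])
act = tf.constant([[1.0, 0.0]])
good_prob = tf.reduce_sum(tf.multiply(prob, act), axis=1)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    print(sess.run(good_prob))  # [0.7]
```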
diff --git a/Code 3. Atari Game/1. Breakout/save_graph/Breakout_DQN1.png b/Code 3. Atari Game/1. Breakout/save_graph/Breakout_DQN1.png deleted file mode 100644 index 7bcd6443..00000000 Binary files a/Code 3. Atari Game/1. Breakout/save_graph/Breakout_DQN1.png and /dev/null differ
diff --git a/Code 3. Atari Game/3. A3C/Pong_A3C.py b/Code 3. Atari Game/3. A3C/Pong_A3C.py deleted file mode 100644 index e69de29b..00000000
diff --git a/Code 3. Atari Game/Understanding DQN.pptx b/Code 3. Atari Game/Understanding DQN.pptx deleted file mode 100644 index 80dfbae9..00000000 Binary files a/Code 3. Atari Game/Understanding DQN.pptx and /dev/null differ
diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..ac035a7b
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 RLCode
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README-kr.md b/README-kr.md deleted file mode 100644 index 74f738b1..00000000
--- a/README-kr.md
+++ /dev/null
@@ -1,47 +0,0 @@
-

- --------------------------------------------------------------------------------
-
-> A repo of reinforcement learning examples built by the [RLCode](https://rlcode.github.io) team. [English](./README.md)
->
-> Maintainers - [이웅원](https://github.com/dnddnjs), [이영무](https://github.com/zzing0907), [양혁렬](https://github.com/Hyeokreal), [이의령](https://github.com/wooridle), [김건우](https://github.com/keon)
-
-[Pull Requests](https://github.com/rlcode/reinforcement-learning/pulls) are always welcome.
-If you run into a problem or a bug, or have a question, please open an [issue](https://github.com/rlcode/reinforcement-learning/issues).
-
-
-## Dependencies
-1. Python 3.5
-2. Tensorflow 1.0.0
-3. Keras
-4. numpy
-5. pandas
-6. pillow
-7. matplotlib
-8. scikit-image
-9. h5py
-
-## Table of Contents
-
-**Code 1** - Building up the basics of reinforcement learning in Grid World, a relatively simple environment
-
-- [Policy Iteration](./Code%201.%20Grid%20World/1.%20Policy%20Iteration)
-- [Value Iteration](./Code%201.%20Grid%20World/2.%20Value%20Iteration)
-- [Monte Carlo](./Code%201.%20Grid%20World/3.%20Monte-Carlo)
-- [SARSA](./Code%201.%20Grid%20World/4.%20SARSA)
-- [Q-Learning](./Code%201.%20Grid%20World/5.%20Q%20Learning)
-- [Deep Q Network](./Code%201.%20Grid%20World/6.%20DQN)
-- [Policy Gradient](./Code%201.%20Grid%20World/7.%20Policy%20Gradient)
-
-**Code 2** - Applying deep-learning-based reinforcement learning algorithms to the CartPole example
-
-- [Deep Q Network](./Code%202.%20Cartpole/1.%20DQN)
-- [Double Deep Q Network](./Code%202.%20Cartpole/2.%20Double%20DQN)
-- [Dueling Deep Q Network](./Code%202.%20Cartpole/3.%20Dueling%20DQN)
-- [Policy Gradient](./Code%202.%20Cartpole/4.%20Policy%20Gradient)
-- [Actor Critic](./Code%202.%20Cartpole/5.%20Actor-Critic)
-
-**Code 3** - Building agents that master the more complex Atari games with deep learning
-
-- [Breakout](./Code%203.%20Atari%20Game/1.%20Breakout)
-- [Pong](./Code%203.%20Atari%20Game/2.%20Pong)
diff --git a/README.md b/README.md index 440b96cd..870e686b 100644
--- a/README.md
+++ b/README.md
@@ -2,17 +2,17 @@

--------------------------------------------------------------------------------

-> Minimal and clean examples of reinforcement learning algorithms presented by [RLCode](https://rlcode.github.io) team. [[한국어]](./README-kr.md)
+> Minimal and clean examples of reinforcement learning algorithms presented by [RLCode](https://rlcode.github.io) team. [[한국어]](https://github.com/rlcode/reinforcement-learning-kr)
>
> Maintainers - [Woongwon](https://github.com/dnddnjs), [Youngmoo](https://github.com/zzing0907), [Hyeokreal](https://github.com/Hyeokreal), [Uiryeong](https://github.com/wooridle), [Keon](https://github.com/keon)

-From the most basic algorithms to the more recent ones categorized as 'deep reinforcement learning', the examples are easy to read with comments.
+From the basics to deep reinforcement learning, this repo provides easy-to-read code examples. One file for each algorithm.
Please feel free to create a [Pull Request](https://github.com/rlcode/reinforcement-learning/pulls), or open an [issue](https://github.com/rlcode/reinforcement-learning/issues)!

## Dependencies
1. Python 3.5
2. Tensorflow 1.0.0
-3. Keras
+3. Keras
4. numpy
5. pandas
6. 
matplotlib
@@ -27,26 +27,29 @@
pip install -r requirements.txt

## Table of Contents

-**Code 1** - Mastering the basics of reinforcement learning in the simplified world called "Grid World"
+**Grid World** - Mastering the basics of reinforcement learning in the simplified world called "Grid World"

-- [Policy Iteration](./Code%201.%20Grid%20World/1.%20Policy%20Iteration)
-- [Value Iteration](./Code%201.%20Grid%20World/2.%20Value%20Iteration)
-- [Monte Carlo](./Code%201.%20Grid%20World/3.%20Monte-Carlo)
-- [SARSA](./Code%201.%20Grid%20World/4.%20SARSA)
-- [Q-Learning](./Code%201.%20Grid%20World/5.%20Q%20Learning)
-- [Deep Q Network](./Code%201.%20Grid%20World/6.%20DQN)
-- [Policy Gradient](./Code%201.%20Grid%20World/7.%20Policy%20Gradient)
+- [Policy Iteration](./1-grid-world/1-policy-iteration)
+- [Value Iteration](./1-grid-world/2-value-iteration)
+- [Monte Carlo](./1-grid-world/3-monte-carlo)
+- [SARSA](./1-grid-world/4-sarsa)
+- [Q-Learning](./1-grid-world/5-q-learning)
+- [Deep SARSA](./1-grid-world/6-deep-sarsa)
+- [REINFORCE](./1-grid-world/7-reinforce)

-**Code 2** - Applying deep reinforcement learning on basic Cartpole game.
+**CartPole** - Applying deep reinforcement learning to the basic CartPole game.

-- [Deep Q Network](./Code%202.%20Cartpole/1.%20DQN)
-- [Double Deep Q Network](./Code%202.%20Cartpole/2.%20Double%20DQN)
-- [Dueling Deep Q Network](./Code%202.%20Cartpole/3.%20Dueling%20DQN)
-- [Policy Gradient](./Code%202.%20Cartpole/4.%20Policy%20Gradient)
-- [Actor Critic](./Code%202.%20Cartpole/5.%20Actor-Critic)
-- Asynchronous Advantage Actor Critic (A3C) - WIP
+- [Deep Q Network](./2-cartpole/1-dqn)
+- [Double Deep Q Network](./2-cartpole/2-double-dqn)
+- [Policy Gradient](./2-cartpole/3-reinforce)
+- [Actor Critic (A2C)](./2-cartpole/4-actor-critic)
+- [Asynchronous Advantage Actor Critic (A3C)](./2-cartpole/5-a3c)

-**Code 3** - Mastering Atari games with Deep Reinforcement Learning
+**Atari** - Mastering Atari games with Deep Reinforcement Learning

-- [Breakout](./Code%203.%20Atari%20Game/1.%20Breakout) - DQN, PG, A3C
-- [Pong](./Code%203.%20Atari%20Game/2.%20Pong) - DQN, PG, A3C
+- **Breakout** - [DQN](./3-atari/1-breakout/breakout_dqn.py), [DDQN](./3-atari/1-breakout/breakout_ddqn.py), [Dueling DDQN](./3-atari/1-breakout/breakout_ddqn.py), [A3C](./3-atari/1-breakout/breakout_a3c.py)
+- **Pong** - [Policy Gradient](./3-atari/2-pong/pong_reinforce.py)
+
+**OpenAI GYM** - [WIP]
+
+- Mountain Car - [DQN](./4-gym/1-mountaincar)
diff --git a/wiki/install_guide_osx+ubuntu.md b/wiki/install_guide_osx+ubuntu.md new file mode 100644 index 00000000..58770933
--- /dev/null
+++ b/wiki/install_guide_osx+ubuntu.md
@@ -0,0 +1,341 @@
+## Development Environment Setup 1: Linux (Ubuntu)
+
+Linux is the best-known open-source operating system. Because all of its source code is public, it exists in a great many variants, and among them Ubuntu has the largest user base. The Ubuntu Foundation releases a new version in the first and second half of every year; this book uses Ubuntu 14.04, released in the first half of 2014. The environment setup that follows assumes Ubuntu 14.04 is already installed.
+
+
+
+### 2.1.1 Checking the Python version on Ubuntu
+
+One convenience of Linux is that Python comes preinstalled. Python exists in a 2.x line and a 3.x line; this book uses `Python 3.5`. Press `Ctrl+Alt+T` on the desktop to open a terminal, type the following command, and press Enter to see the installed Python version.
+
+```shell
+$ python -V
+```
+
+Ubuntu 14.04 ships with both `Python 2.7` and a `Python 3` interpreter preinstalled.
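If you want to confirm which interpreter you are actually running from inside Python itself (a quick check of ours, not part of the book's text), the `sys` module reports both the version and the binary's path:

```python
import sys
print(sys.version)     # e.g. a 3.5.x string for the interpreter this book targets
print(sys.executable)  # filesystem path of the running interpreter
```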
+### 2.1.2 Installing and configuring PyCharm Community
+
+We will build reinforcement learning agents and train them in simulated environments, which calls for a place to write and edit code: an IDE (Integrated Development Environment). There are many IDEs, but this book uses PyCharm as its Python IDE.
+
+PyCharm can be installed from its official homepage[[1\]](#_ftn1), which offers Windows, Linux, and macOS downloads. PyCharm comes in a paid edition, `PyCharm Professional Edition`, and a free edition, `PyCharm Community Edition`. We will use `PyCharm Community` to develop the agents, so the installation steps below cover the Community edition.
+
+**Installation proceeds in the following order.**
+
+1. Download PyCharm Community from the official download page.
+
+   Link: [https://www.jetbrains.com/pycharm/download/#section=linux](https://www.jetbrains.com/pycharm/download/#section=linux)
+
+2. Go to the download directory and extract the archive:
+
+   ```shell
+   $ tar xfz pycharm-community-2016.3.2.tar.gz
+   ```
+
+3. Move into the bin folder of the extracted directory:
+
+   ```shell
+   $ cd ~/pycharm-community-2016.3.2/bin
+   ```
+
+4. Launch PyCharm with the following command:
+
+   ```shell
+   $ sh pycharm.sh
+   ```
+
+5. Running the command starts the installation.
+
+6. When installation finishes, the initial configuration screen appears. In the IDE theme option, Intellij is the light theme and Darcula is the dark one; this book uses Intellij.
+
+7. After the initial configuration, create a new project.
+
+8. Next comes the screen for setting the project path and the interpreter. Create a PycharmProjects folder in your home directory and create the project under it, naming it however you like. We create a project named "rlcode_book" and set its interpreter to Python 3.5, the language version this project uses.
+
+9. Once the rlcode_book project is created, the main window appears.
+
+10. To confirm that PyCharm is installed correctly, create a Python script file. For the simplest check, `"Hello World"`, create a file named hello_world.py.
+
+11. Right-click the new file; among the menu items, "Run 'hello_world'" runs hello_world.py.
+
+12. Put the following code into hello_world.py:
+
+    ```python
+    print("hello world")
+    ```
+
+13. Running hello_world.py prints "hello world" in the run window, which confirms that PyCharm is installed correctly.
+
+### Using VirtualEnv :happy:
+
+That covers the basic PyCharm configuration. When several projects live on one machine, each may need a different development environment, and juggling them by hand is a real inconvenience. Keeping a separate environment per project is therefore a significant advantage, and that is exactly what VirtualEnv provides: it lets you create a virtual development environment dedicated to this book's project.
+
+PyCharm supports VirtualEnv, so we explain how to use VirtualEnv through PyCharm. There are several ways to install and use VirtualEnv, but PyCharm provides a GUI for it and can also manage the external Python libraries installed inside the virtual environment.
+
+**To use VirtualEnv from PyCharm:**
+
+1. Click "Settings" in the "File" menu.
+
+2. In the list on the left of Settings, open "Project: <project name>" and click Project Interpreter. Then, to the right of the Project Interpreter tab, click "Create VirtualEnv".
+
+3. Enter a name for the virtual environment; a /home/brian/rlcode_book directory is created to hold it.
+
+4. If the terminal prompt shows (rlcode_book) as below, the virtual environment named rlcode_book is active. We will use it as the book's environment from here on.
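If you prefer the terminal to PyCharm's GUI, a comparable environment can be created with the `virtualenv` package directly. This sketch is an addition of ours, and the path name is illustrative:

```shell
$ pip3 install virtualenv
$ virtualenv -p python3 ~/rlcode_book_env
$ source ~/rlcode_book_env/bin/activate
(rlcode_book_env) $ python -V   # now points at the Python 3 inside the environment
```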
+### 2.1.3 Installing and testing OpenAI Gym
+
+OpenAI is a company founded in late 2015 with the goal of opening AI technology to the world, building safer artificial intelligence, and bringing AI to more fields. OpenAI Gym is an environment built by OpenAI in which you can test all kinds of AI algorithms.
+
+All of OpenAI Gym's code is available on OpenAI's GitHub[[2\]](#_ftn2).
+
+Installing OpenAI Gym is described on its official page. Before installing it you need Git, a version control tool used whenever a development process needs version management. OpenAI Gym is open source and published on GitHub, a platform that serves as a remote store for version-controlled source code.
+
+Install Git with the following command:
+
+```shell
+$ sudo apt-get install git
+```
+
+After installing Git, install OpenAI Gym. In a terminal, move to the directory where you want it installed and run:
+
+```shell
+$ git clone https://github.com/openai/gym
+$ cd gym
+$ pip3 install -e .
+```
+
+OpenAI Gym can be installed with several different options; `pip3 install -e .` installs only the essentials. Later, if you want every Gym environment, including the Atari games, run the following instead of `pip3 install -e .`:
+
+```shell
+$ pip3 install -e .[all]
+```
+
+To verify that OpenAI Gym installed correctly, run a simple example. The simplest is CartPole: a cart with a pole attached, where the goal is to move the cart so that its momentum swings the pole upright. For this test we give the cart no meaningful input at all and only confirm that OpenAI Gym runs.
+
+Create a file `CartPole.py` and enter the code in Code 2.1:
+
+```python
+import gym
+env = gym.make('CartPole-v0')
+env.reset()
+for _ in range(1000):
+    env.render()
+    env.step(env.action_space.sample())  # take a random action
+```
+
+Code 2.1 Running the CartPole example
+
+Running this code brings up a CartPole that takes no purposeful actions. OpenAI Gym provides many such problems; you can apply your own learning algorithms to them, and share your algorithms or check results on the OpenAI Gym site.
+
+
+
+## 2.2 Development Environment Setup 2: macOS
+
+macOS ships with Python 2.7 by default, so Python 3.5 must be installed separately.
+
+### 2.2.1 Installing and configuring Python 3.5
+
+Open the Python download page[[3\]](#_ftn3) and the downloads screen appears.
+
+1. Choose the file matching your macOS version and download it. Run the downloaded file and follow the prompts to finish the installation.
+
+2. To confirm that Python installed correctly, open a terminal. If typing 'python3' produces the interpreter prompt shown below, the installation succeeded.
+
+### 2.2.2 Installing and configuring PyCharm Community
+
+Install and configure PyCharm in the following order:
+
+1. Go to the PyCharm homepage and download the Community edition.
+
+2. Run the downloaded file and drag the PyCharm CE icon on the left onto the folder icon on the right to finish the installation.
+
+3. The first time PyCharm runs, a configuration screen lets you change the IDE's colors and style through the IDE theme option. Default is the Intellij theme seen in the Ubuntu setup; this book uses Default.
+
+4. After the initial configuration, click the Create New Project button.
+
+5. Clicking Create New Project brings up the project screen. Location sets the path and folder name of the project; choose them however you like. Interpreter selects which Python interpreter the project uses. As on Ubuntu, we create a virtual environment with VirtualEnv and use it as the interpreter: click the Create VirtualEnv button.
+
+6. The next screen creates the VirtualEnv. Pick any Name and Location, select the freshly installed Python 3.5 as the Base Interpreter, and click OK to create the environment.
+
+7. Back on the New Project screen, select the VirtualEnv you just created as the Interpreter, then click Create to finish creating the project.
+
+8. Once the project is created, the workspace appears. Right-click the top-level folder and choose New -> Python File to create a new Python file.
+
+9. To check that PyCharm works, run the hello world example; the steps are the same as on Ubuntu, so we omit them.
+
+### 2.2.3 Installing and testing OpenAI Gym
+
+Installing OpenAI Gym and running CartPole are the same as on Ubuntu, so this section is omitted.
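As a final sanity check on either OS (an addition of ours, not part of the book's text), you can inspect CartPole's state and action spaces and take one step from Python:

```python
import gym

env = gym.make('CartPole-v0')
print(env.observation_space)  # Box(4,): cart position/velocity, pole angle/velocity
print(env.action_space)       # Discrete(2): push the cart left or right

obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
print(obs.shape, reward, done)  # (4,) 1.0 False, typically
```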
+ + + +------ + +[[1\]](#_ftnref1) https://www.jetbrains.com/pycharm/ + +[[2\]](#_ftnref2) https://github.com/openai/gym + +[[3\]](#_ftnref3) https://www.python.org/downloads/release/python-350/ diff --git a/wiki/rlcode_image/cartpole_exam.png b/wiki/rlcode_image/cartpole_exam.png new file mode 100644 index 00000000..3b8674b6 Binary files /dev/null and b/wiki/rlcode_image/cartpole_exam.png differ diff --git a/wiki/rlcode_image/console_hello_world.png b/wiki/rlcode_image/console_hello_world.png new file mode 100644 index 00000000..e14c6a01 Binary files /dev/null and b/wiki/rlcode_image/console_hello_world.png differ diff --git a/wiki/rlcode_image/default_config.png b/wiki/rlcode_image/default_config.png new file mode 100644 index 00000000..7f9e4794 Binary files /dev/null and b/wiki/rlcode_image/default_config.png differ diff --git a/wiki/rlcode_image/file_setting.png b/wiki/rlcode_image/file_setting.png new file mode 100644 index 00000000..264bb279 Binary files /dev/null and b/wiki/rlcode_image/file_setting.png differ diff --git a/wiki/rlcode_image/hello_world_ubuntu.png b/wiki/rlcode_image/hello_world_ubuntu.png new file mode 100644 index 00000000..ede75b89 Binary files /dev/null and b/wiki/rlcode_image/hello_world_ubuntu.png differ diff --git a/wiki/rlcode_image/openai_github.png b/wiki/rlcode_image/openai_github.png new file mode 100644 index 00000000..e5422484 Binary files /dev/null and b/wiki/rlcode_image/openai_github.png differ diff --git a/wiki/rlcode_image/project_interpreter.png b/wiki/rlcode_image/project_interpreter.png new file mode 100644 index 00000000..b22f24cc Binary files /dev/null and b/wiki/rlcode_image/project_interpreter.png differ diff --git a/wiki/rlcode_image/pycham_new_project.png b/wiki/rlcode_image/pycham_new_project.png new file mode 100644 index 00000000..bfd309eb Binary files /dev/null and b/wiki/rlcode_image/pycham_new_project.png differ diff --git a/wiki/rlcode_image/pycharm_community.png b/wiki/rlcode_image/pycharm_community.png new file mode 100644 index 00000000..3e4f1967 Binary files /dev/null and b/wiki/rlcode_image/pycharm_community.png differ diff --git a/wiki/rlcode_image/pycharm_drag.png b/wiki/rlcode_image/pycharm_drag.png new file mode 100644 index 00000000..3fd2faa2 Binary files /dev/null and b/wiki/rlcode_image/pycharm_drag.png differ diff --git a/wiki/rlcode_image/pycharm_init.png b/wiki/rlcode_image/pycharm_init.png new file mode 100644 index 00000000..b2fa23c7 Binary files /dev/null and b/wiki/rlcode_image/pycharm_init.png differ diff --git a/wiki/rlcode_image/python3_terminal.jpg b/wiki/rlcode_image/python3_terminal.jpg new file mode 100644 index 00000000..38fe67ac Binary files /dev/null and b/wiki/rlcode_image/python3_terminal.jpg differ diff --git a/wiki/rlcode_image/python_download.png b/wiki/rlcode_image/python_download.png new file mode 100644 index 00000000..24922c44 Binary files /dev/null and b/wiki/rlcode_image/python_download.png differ diff --git a/wiki/rlcode_image/python_installed.png b/wiki/rlcode_image/python_installed.png new file mode 100644 index 00000000..a6dae073 Binary files /dev/null and b/wiki/rlcode_image/python_installed.png differ diff --git a/wiki/rlcode_image/python_intalled.png b/wiki/rlcode_image/python_intalled.png new file mode 100644 index 00000000..a6dae073 Binary files /dev/null and b/wiki/rlcode_image/python_intalled.png differ diff --git a/wiki/rlcode_image/rl_book_hello_world.png b/wiki/rlcode_image/rl_book_hello_world.png new file mode 100644 index 00000000..5588e095 Binary files /dev/null and 
b/wiki/rlcode_image/rl_book_hello_world.png differ diff --git a/wiki/rlcode_image/rl_book_project.png b/wiki/rlcode_image/rl_book_project.png new file mode 100644 index 00000000..b1603305 Binary files /dev/null and b/wiki/rlcode_image/rl_book_project.png differ diff --git a/wiki/rlcode_image/rl_book_venv.png b/wiki/rlcode_image/rl_book_venv.png new file mode 100644 index 00000000..c86d7d94 Binary files /dev/null and b/wiki/rlcode_image/rl_book_venv.png differ diff --git a/wiki/rlcode_image/rl_book_virtualenv.png b/wiki/rlcode_image/rl_book_virtualenv.png new file mode 100644 index 00000000..dc783044 Binary files /dev/null and b/wiki/rlcode_image/rl_book_virtualenv.png differ diff --git a/wiki/rlcode_image/rlcode_book_directory.png b/wiki/rlcode_image/rlcode_book_directory.png new file mode 100644 index 00000000..f1c13cd3 Binary files /dev/null and b/wiki/rlcode_image/rlcode_book_directory.png differ diff --git a/wiki/rlcode_image/rlcode_project.png b/wiki/rlcode_image/rlcode_project.png new file mode 100644 index 00000000..d9c89be6 Binary files /dev/null and b/wiki/rlcode_image/rlcode_project.png differ diff --git a/wiki/rlcode_image/run_hello_world.png b/wiki/rlcode_image/run_hello_world.png new file mode 100644 index 00000000..570e979d Binary files /dev/null and b/wiki/rlcode_image/run_hello_world.png differ diff --git a/wiki/rlcode_image/sh_pycharm.sh.png b/wiki/rlcode_image/sh_pycharm.sh.png new file mode 100644 index 00000000..19708444 Binary files /dev/null and b/wiki/rlcode_image/sh_pycharm.sh.png differ diff --git a/wiki/rlcode_image/terminal_rlcode_book.png b/wiki/rlcode_image/terminal_rlcode_book.png new file mode 100644 index 00000000..38279352 Binary files /dev/null and b/wiki/rlcode_image/terminal_rlcode_book.png differ