Question

我一直在学习有关AI的知识（主要是通过YouTube和其他在线资源）。我观看了有关强化学习（网格世界问题）的视频。

视频提供的代码是用 Python 编写的，而我对 Python 不太熟悉。我决定修改给出的代码，以实现自己的使用场景。

这是代码：

Learner.py：

import World
import threading
import time

# Weight applied to the best next-state value in the Q update (see run()).
discount = 0.3
actions = World.actions

# Every grid coordinate is a state: outer loop over World.x, inner over World.y.
states = [(i, j) for i in range(World.x) for j in range(World.y)]

# Q[state][action] starts at 0.1 everywhere; the same value is pushed to the
# board so each cell's triangles get their initial colour.
Q = {}
for state in states:
    Q[state] = {}
    for action in actions:
        Q[state][action] = 0.1
        World.set_cell_score(state, action, 0.1)

# Special cells use their own weight `w` as the initial Q-value of every action.
for (i, j, c, w) in World.specials:
    for action in actions:
        Q[(i, j)][action] = w
        World.set_cell_score((i, j), action, w)


def do_action(action):
    """Apply `action` in the World and report the resulting transition.

    The action is matched against ``actions[0..3]`` in order and mapped to the
    (dx, dy) offsets (0, -1), (0, 1), (-1, 0), (1, 0) passed to
    ``World.try_move``.

    Returns a tuple ``(s, action, r, s2)`` where ``s`` / ``s2`` are
    ``World.player`` before / after the move and ``r`` is the change in
    ``World.score``. Returns ``None`` when `action` matches none of the four
    known actions.
    """
    offsets = ((0, -1), (0, 1), (-1, 0), (1, 0))

    before = World.player
    reward = -World.score

    for index, (dx, dy) in enumerate(offsets):
        if action == actions[index]:
            World.try_move(dx, dy)
            break
    else:
        # Unknown action: nothing was attempted.
        return None

    after = World.player
    reward += World.score
    return before, action, reward, after


def max_Q(s):
    """Return ``(action, value)`` for the highest Q-value stored for state `s`.

    When several actions share the highest value, the first one in the
    iteration order of ``Q[s]`` wins. Returns ``(None, None)`` if ``Q[s]`` is
    empty.
    """
    entries = Q[s]
    if not entries:
        return None, None
    # max() keeps the first maximal item, matching a strict ">" scan.
    best_action, best_value = max(entries.items(), key=lambda pair: pair[1])
    return best_action, best_value


def inc_Q(s, a, alpha, inc):
    """Move ``Q[s][a]`` towards `inc` with learning rate `alpha`.

    The new value is ``Q[s][a] * (1 - alpha) + alpha * inc``; it is stored in
    the table and forwarded to ``World.set_cell_score`` so the display follows.
    """
    blended = Q[s][a] * (1 - alpha) + alpha * inc
    Q[s][a] = blended
    World.set_cell_score(s, a, blended)


def run():
    """Endless learning loop: act greedily, update Q, handle restarts.

    Each iteration takes the action with the highest Q-value for the current
    state, updates that Q-value from the observed reward plus the discounted
    best value of the next state, and then recomputes the learning rate as
    ``t ** -0.1`` where ``t`` counts steps since the last restart.
    Never returns.
    """
    time.sleep(1)
    alpha = 1
    t = 1
    while True:
        # Choose and perform the greedy action for the current state.
        greedy_action, _ = max_Q(World.player)
        s, a, r, s2 = do_action(greedy_action)

        # Q update towards reward + discounted best next value.
        _, next_best = max_Q(s2)
        inc_Q(s, a, alpha, r + discount * next_best)

        # Step counter; reset when the World reports a pending restart.
        t += 1.0
        if World.has_restarted():
            World.restart_game()
            time.sleep(0.01)
            t = 1.0

        # Learning rate for the next step.
        alpha = pow(t, -0.1)

        # MODIFY THIS SLEEP IF THE GAME IS GOING TOO FAST.
        time.sleep(0.05)


# Run the learning loop on a background daemon thread (so it does not keep the
# process alive once the main thread ends), then hand the main thread over to
# World.start_game(), which enters the Tk main loop.
t = threading.Thread(target=run)
t.daemon = True
t.start()
World.start_game()

World.py：

from tkinter import *
# Root Tk window that hosts the board canvas.
master = Tk()

# Size of the per-action triangles, as a fraction of a cell (see create_triangle).
triangle_size = 0.1
# Score range mapped onto the red..green colour scale in set_cell_score;
# values outside this range are clamped there.
cell_score_min = -0.2
cell_score_max = 0.2
# Edge length of one grid cell in canvas units.
Width = 50
# Grid dimensions: x columns by y rows.
(x, y) = (16, 16)
actions = ["up", "down", "left", "right"]

board = Canvas(master, width=x*Width, height=y*Width)
# Current player cell as (column, row).
player = (13, 0)
# Initial score. NOTE(review): restart_game() resets score to 1, not 100 — confirm which is intended.
score = 100
# Set to True by try_move when a terminal cell is reached.
restart = False
# Amount added to score on every attempted move (see try_move).
walk_reward = .01

# Cells the player cannot enter, as (column, row) pairs. render_grid paints
# them black and try_move refuses to move onto them. Some coordinates appear
# more than once (e.g. the corners), which only causes them to be drawn twice.
walls = [(0, 0), (0, 1), (0, 2), (0, 3),(0, 4), (0, 5), (0, 6), (0, 7),(0, 8), (0, 9), (0, 10), (0, 11),(0, 12), (0, 13), (0, 14), (0, 15),
         (0, 0), (1, 0), (2, 0), (3, 0), (4, 0),(5, 0), (6, 0), (7, 0), (8, 0),(9, 0), (10, 0), (11, 0), (12, 0), (15, 0),
         (15, 0), (15, 1), (15, 2), (15, 3),(15, 4), (15, 5), (15, 6), (15, 7),(15, 8), (15, 9), (15, 10), (15, 11),(15, 12), (15, 13), (15, 14), (15, 15),
         (0, 15), (1, 15), (2, 15), (3, 15), (4, 15),(5, 15), (6, 15), (7, 15), (8, 15),(9, 15), (10, 15), (11, 15), (12, 15),(13,15),(14,15), (15, 15),
         (2, 3), (2, 4), (2, 5), (3, 3), (3, 4),(3, 5), (4, 3), (4, 4), (4, 5),
         (7, 3), (7, 4), (7, 5), (8, 3), (8, 4),(8, 5), (9, 3), (9, 4), (9, 5),
         (1, 14), (1, 13), (1, 12), (1, 11), (1, 10),(1, 9), (1, 8), 
         (2, 14), (2, 13), (2, 12), (2, 11), (2, 10),(2, 9), (2, 8), 
         (3, 10),(3, 9), (3, 8), 
         (4, 10),(4, 9), (4, 8), 
         (5, 10),(5, 9), (5, 8), 
         (6, 10),(6, 9), (6, 8), 
         (7, 10),(7, 9), (7, 8), 
         (8, 10),(8, 9), (8, 8), 
         (9, 10),(9, 9), (9, 8), 
         (10, 10),(10, 9), (10, 8), 
         (6, 13), (7, 13), (8, 13),
         (6, 12), (7, 12), (8, 12),
         ]
# Terminal cells as (column, row, fill colour, score delta); reaching one ends
# the episode (see try_move).
specials = [(14, 0, "green", -2)]
# "Breadcrumb" cells in the same (column, row, colour, score delta) format.
# Built once from the player's starting cell; nothing in this file appends to
# it afterwards, so it only ever contains that one cell.
crum = [(player[0],player[1],"blue",-.05)]
# Maps (column, row) -> {action: canvas id of that action's triangle};
# filled in by render_grid.
cell_scores = {}


def create_triangle(i, j, action):
    """Draw the small indicator triangle for `action` inside cell (i, j).

    The triangle sits against the cell edge that corresponds to the action
    (top for actions[0], bottom for actions[1], left for actions[2], right for
    actions[3]) with its tip on that edge. Returns the canvas item id, or
    ``None`` if `action` is not one of the four known actions.
    """
    if action == actions[0]:
        points = ((i+0.5-triangle_size)*Width, (j+triangle_size)*Width,
                  (i+0.5+triangle_size)*Width, (j+triangle_size)*Width,
                  (i+0.5)*Width, j*Width)
        outline_width = 1
    elif action == actions[1]:
        points = ((i+0.5-triangle_size)*Width, (j+1-triangle_size)*Width,
                  (i+0.5+triangle_size)*Width, (j+1-triangle_size)*Width,
                  (i+0.5)*Width, (j+1)*Width)
        # NOTE(review): this branch uses width=5 while the other three use 1.
        outline_width = 5
    elif action == actions[2]:
        points = ((i+triangle_size)*Width, (j+0.5-triangle_size)*Width,
                  (i+triangle_size)*Width, (j+0.5+triangle_size)*Width,
                  i*Width, (j+0.5)*Width)
        outline_width = 1
    elif action == actions[3]:
        points = ((i+1-triangle_size)*Width, (j+0.5-triangle_size)*Width,
                  (i+1-triangle_size)*Width, (j+0.5+triangle_size)*Width,
                  (i+1)*Width, (j+0.5)*Width)
        outline_width = 1
    else:
        return None
    return board.create_polygon(*points, fill="white", width=outline_width)


def render_grid():
    """Draw the whole board and register every cell's action triangles.

    Draw order: a white rectangle plus four triangles per cell, then the
    special cells, then the walls, then the breadcrumb cells. The triangle ids
    are stored in ``cell_scores[(i, j)][action]``.
    """
    def paint_cell(col, row, colour):
        # One full-cell rectangle in the given fill colour.
        board.create_rectangle(col*Width, row*Width, (col+1)*Width, (row+1)*Width,
                               fill=colour, width=1)

    for i in range(x):
        for j in range(y):
            paint_cell(i, j, "white")
            cell_scores[(i, j)] = {
                action: create_triangle(i, j, action) for action in actions
            }

    for (i, j, c, w) in specials:
        paint_cell(i, j, c)
    for (i, j) in walls:
        paint_cell(i, j, "black")
    for (m, h, c, w) in crum:
        paint_cell(m, h, c)

# Draw the board once at import time; this also fills cell_scores, which
# set_cell_score needs in order to look up triangle ids.
render_grid()


def set_cell_score(state, action, val):
    """Recolour the triangle of (`state`, `action`) according to `val`.

    `val` is mapped linearly from [cell_score_min, cell_score_max] onto a
    red-to-green scale (clamped at both ends): the minimum gives pure red
    (#ff0000), the maximum pure green (#00ff00).

    Args:
        state: (column, row) key into ``cell_scores``.
        action: action name key into ``cell_scores[state]``.
        val: numeric score to visualise.
    """
    triangle = cell_scores[state][action]
    green_dec = int(min(255, max(0, (val - cell_score_min) * 255.0 / (cell_score_max - cell_score_min))))
    red_dec = 255 - green_dec
    # Each channel must be exactly two hex digits, zero-padded on the LEFT.
    # (Appending the "0" instead would turn e.g. 0x05 into "50".)
    color = "#{:02x}{:02x}00".format(red_dec, green_dec)
    board.itemconfigure(triangle, fill=color)


def try_move(dx, dy):
    """Attempt to move the player by (dx, dy) and update score / restart flag.

    If a restart is pending it is performed first. ``walk_reward`` is added to
    the score for every attempt. The player only moves when the target cell is
    inside the grid and not a wall. If the target coordinates match a cell in
    ``specials`` or ``crum`` (checked in that order, and using the attempted
    target even when the move itself was blocked), the walk reward is replaced
    by that cell's score delta, the outcome is printed, and ``restart`` is set.
    """
    global player, score, restart
    if restart == True:
        restart_game()

    new_x = player[0] + dx
    new_y = player[1] + dy
    score += walk_reward

    in_bounds = (new_x >= 0) and (new_x < x) and (new_y >= 0) and (new_y < y)
    if in_bounds and (new_x, new_y) not in walls:
        inset_near = Width*2/10
        inset_far = Width*8/10
        board.coords(me, new_x*Width+inset_near, new_y*Width+inset_near,
                     new_x*Width+inset_far, new_y*Width+inset_far)
        player = (new_x, new_y)

    # Terminal cells: specials first, then breadcrumbs; the first match wins.
    for terminal_cells in (specials, crum):
        for (cell_x, cell_y, _colour, delta) in terminal_cells:
            if new_x != cell_x or new_y != cell_y:
                continue
            score -= walk_reward
            score += delta
            label = "Success! score: " if score > 0 else "Fail! score: "
            print(label, score)
            restart = True
            return


def call_up(event):
    """Key handler: attempt a move by (0, -1). `event` is unused."""
    try_move(0, -1)


def call_down(event):
    """Key handler: attempt a move by (0, 1). `event` is unused."""
    try_move(0, 1)


def call_left(event):
    """Key handler: attempt a move by (-1, 0). `event` is unused."""
    try_move(-1, 0)


def call_right(event):
    """Key handler: attempt a move by (1, 0). `event` is unused."""
    try_move(1, 0)


def restart_game():
    """Reset the episode: player back to (13, 0), score to 1, flag cleared.

    Also moves the player's rectangle on the canvas back to the start cell.
    """
    global player, score, restart
    player = (13, 0)
    score = 1
    restart = False

    col, row = player
    inset_near = Width*2/10
    inset_far = Width*8/10
    board.coords(me, col*Width+inset_near, row*Width+inset_near,
                 col*Width+inset_far, row*Width+inset_far)

def has_restarted():
    """Return the module-level ``restart`` flag (True once try_move hits a terminal cell)."""
    return restart

# Arrow keys let a human drive the player through the same try_move path.
master.bind("<Up>", call_up)
master.bind("<Down>", call_down)
master.bind("<Right>", call_right)
master.bind("<Left>", call_left)

# Canvas rectangle representing the player, inset within its cell
# (from 2/10 to 8/10 of the cell width on both axes).
me = board.create_rectangle(player[0]*Width+Width*2/10, player[1]*Width+Width*2/10,
                            player[0]*Width+Width*8/10, player[1]*Width+Width*8/10, fill="orange", width=1, tag="me")

# Place the canvas in the root window.
board.grid(row=0, column=0)


def start_game():
    """Enter the Tk main loop on the root window."""
    master.mainloop()

我试图把智能体（agent）已经访问过的白色状态变成蓝色，并给这些状态一个很低的奖励，以阻止智能体重复访问它们，从而促使它离开。但在我修改后的代码中，只有第一个方格（智能体出发的方格）变成了蓝色，之后经过的状态都没有变化。我希望智能体每离开一个状态，就把对应的白色方格变成蓝色。

如果有人可以告诉我我的方法哪里做错了，并为完成我的任务提供建议/帮助，我将不胜感激。`crum` 是我用于跟踪的变量。

强化学习，步步为营

0 个答案: