This is an easy-to-understand demo, about 300 lines of code, which can serve as introductory RL code and help with understanding the basic formulas. These are my own study notes. Like, favorite and comment with your email address and I can send you the fully annotated code file~ If you spot any mistakes, please point them out! It has been quite a while since my last post; my previous RL post was almost a year and a half ago, and I had not worked on this topic for a long time. Having just started my master's, after going around in circles I am back to RL. Discussion is welcome~ Tic-tac-toe rules: essentially three in a row. On a 3 * 3 board, the first player to connect three pieces wins.
Code overview
Class definitions: four classes are defined, State, Judger, Player and HumanPlayer, representing the board state, the referee that runs the game, the AI player and the human player.
State s: every board configuration is one state, identified by a unique hash value. There are at most 3^9 such configurations.
Training process: first, two AI players play against each other to gradually refine the policy (the state-value function). Once the AI players are trained, an AI player plays against the human player.
When training the AI players, the value of winning terminal states is initialized to 1, losing terminal states to 0, and everything else to 0.5. The backup update V(S_t) ← V(S_t) + α[V(S_{t+1}) − V(S_t)] then keeps correcting the values until they gradually converge; α is the step size.
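To make the backup rule concrete before diving into the code, here is a minimal standalone sketch of that update applied to a toy episode; the names V, trajectory and alpha are purely illustrative, while in the actual code the same table lives in Player.estimations and the update happens in Player.backup().

# minimal sketch of the backup V(S_t) <- V(S_t) + alpha * [V(S_{t+1}) - V(S_t)]
alpha = 0.1                              # step size
V = {0: 0.5, 1: 0.5, 2: 1.0}             # toy value table; state 2 is a winning terminal state
trajectory = [0, 1, 2]                   # hashes of the states visited during one game

for t in reversed(range(len(trajectory) - 1)):
    s, s_next = trajectory[t], trajectory[t + 1]
    V[s] += alpha * (V[s_next] - V[s])   # value flows backwards from the terminal state

print(V)                                 # {0: 0.505, 1: 0.55, 2: 1.0}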
Code walkthrough
main function
Let's look at the main function first.
if __name__ == '__main__':
    train(int(1e5))    # 1e5 is a float, so cast it to int; this is the number of epochs. Two AIs play each other to refine the value function
    compete(int(1e3))  # after training, the two AIs play again to measure their win rates
    play()             # human vs. AI
In short: first the AIs play each other, then the human plays against the AI.
State class
Import the packages and define the 3 * 3 board.
import numpy as np
import pickle

BOARD_ROWS = 3
BOARD_COLS = 3
BOARD_SIZE = BOARD_ROWS * BOARD_COLS
Next comes the State class. It contains __init__() for initialization, hash() to compute a hash value of each state used as its index, is_end() to check whether the game is over, next_state() to put a player's symbol on the chosen position and return the resulting state, and print() to print the current 3 * 3 board.
Detailed comments are in the code.
# each State is one entire board configuration; there are at most 3^9 of them
class State:
    def __init__(self):
        # 1: symbol of the player who moves first
        # -1: symbol of the player who moves second
        # 0: empty position
        self.data = np.zeros((BOARD_ROWS, BOARD_COLS))  # the board
        self.winner = None
        self.hash_val = None  # each state is identified by a hash value
        self.end = None

    # compute the hash value of a state; the encoding rule is arbitrary but unique
    def hash(self):
        if self.hash_val is None:
            self.hash_val = 0
            for i in self.data.reshape(BOARD_COLS * BOARD_ROWS):
                if i == -1:
                    i = 2
                self.hash_val = self.hash_val * 3 + i
        return int(self.hash_val)

    # check whether the game has been won or ended in a tie
    def is_end(self):
        if self.end is not None:
            return self.end
        results = []
        # check rows
        for i in range(0, BOARD_ROWS):
            results.append(np.sum(self.data[i, :]))
        # check columns
        for i in range(0, BOARD_COLS):
            results.append(np.sum(self.data[:, i]))
        # check diagonals
        results.append(0)
        for i in range(0, BOARD_ROWS):
            results[-1] += self.data[i, i]
        results.append(0)
        for i in range(0, BOARD_COLS):
            results[-1] += self.data[i, BOARD_ROWS - 1 - i]

        for result in results:
            if result == 3:
                self.end = True
                self.winner = 1
                return self.end
            if result == -3:
                self.end = True
                self.winner = -1
                return self.end

        # check tie
        sum_values = np.sum(np.abs(self.data))
        if sum_values == BOARD_COLS * BOARD_ROWS:
            self.end = True
            self.winner = 0  # tie
            return self.end

        # no winner and no tie: the game goes on
        self.end = False
        return self.end

    # next state:
    # put the player's symbol at board position (i, j)
    def next_state(self, i, j, symbol):
        new_state = State()
        new_state.data = np.copy(self.data)
        new_state.data[i, j] = symbol
        return new_state

    # print the board
    def print(self):
        for i in range(0, BOARD_ROWS):
            print('-----------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.data[i, j] == 1:
                    token = '*'
                if self.data[i, j] == -1:
                    token = 'x'
                if self.data[i, j] == 0:
                    token = '0'
                out += token + ' | '
            print(out)
        print('-----------------')
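For intuition, the hash is simply a base-3 number whose digits are the nine cells read row by row, with -1 remapped to 2, so every board gets a unique index. A quick sanity check, assuming the State class above is defined:

s = State().next_state(0, 0, 1)   # the first player puts a piece in the top-left corner
print(s.hash())                   # 6561 = 1 * 3**8, since the top-left cell is the most significant digit
print(s.is_end())                 # False, the game has only just started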
# starting from current_state, enumerate the states produced by every possible next move
def get_all_states_impl(current_state, current_symbol, all_states):
    for i in range(0, BOARD_ROWS):
        for j in range(0, BOARD_COLS):
            if current_state.data[i][j] == 0:  # scan every currently empty cell
                newState = current_state.next_state(i, j, current_symbol)
                newHash = newState.hash()
                if newHash not in all_states.keys():
                    isEnd = newState.is_end()
                    all_states[newHash] = (newState, isEnd)
                    if not isEnd:  # if the game is not over after player 1's move, player 2 moves
                        get_all_states_impl(newState, -current_symbol, all_states)


def get_all_states():
    current_symbol = 1
    current_state = State()
    all_states = dict()
    all_states[current_state.hash()] = (current_state, current_state.is_end())
    get_all_states_impl(current_state, current_symbol, all_states)
    return all_states


# all_states is a dict: the key is the unique hash of a state, the value is (state, isEnd)
all_states = get_all_states()
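Although there are 3^9 = 19683 raw cell assignments, the recursion above only visits positions reachable in legal play and stops expanding at terminal states, so the dictionary ends up much smaller. A quick check, assuming the code above has already run:

print(len(all_states))                       # number of reachable states, far fewer than 19683
state, is_end = all_states[State().hash()]   # the empty board can be looked up by its hash
print(is_end)                                # False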
Judger class
The Judger class is the referee; it simply lets the two players take turns. It contains __init__() for initialization, reset() to reset both players, alternate() to select the next player in turn, and play() to run one game between the two players.
Detailed comments are in the code.
class Judger:
    def __init__(self, player1, player2):
        self.p1 = player1
        self.p2 = player2
        self.current_player = None
        self.p1_symbol = 1
        self.p2_symbol = -1
        self.p1.set_symbol(self.p1_symbol)
        self.p2.set_symbol(self.p2_symbol)

    def reset(self):
        self.p1.reset()
        self.p2.reset()

    def alternate(self):
        while True:
            yield self.p1
            yield self.p2

    # play() lets the two players take turns; here both players are AIs
    # each player's act() picks the move leading to the state with the highest value
    # print_state: if True, print the board after every move
    def play(self, print_state=False):
        alternator = self.alternate()
        self.reset()
        current_state = State()
        self.p1.set_state(current_state)
        self.p2.set_state(current_state)
        while True:  # loop until the game ends and we return
            player = next(alternator)  # the two players alternate
            if print_state:
                current_state.print()
            i, j, symbol = player.act()  # pick the player's best next move
            next_state_hash = current_state.next_state(i, j, symbol).hash()
            current_state, is_end = all_states[next_state_hash]
            self.p1.set_state(current_state)
            self.p2.set_state(current_state)
            if is_end:
                if print_state:
                    current_state.print()
                return current_state.winner
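The only slightly unusual construct here is alternate(): it is an infinite generator, so each next() call inside play() hands the turn to the other player. A standalone sketch of the idea, with placeholder strings instead of Player objects:

def alternate(p1, p2):
    while True:
        yield p1
        yield p2

turns = alternate('p1', 'p2')
print([next(turns) for _ in range(4)])   # ['p1', 'p2', 'p1', 'p2']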
Player class (AI)
Player is the AI player class. It contains __init__() for initialization, reset() to reset, set_state() to record a state and its greedy/explore flag, set_symbol() to initialize the state values, backup() to update the state values backwards with V(S_t) ← V(S_t) + α[V(S_{t+1}) − V(S_t)], act() to choose the best action in the current state, save_policy() to save the policy (i.e. the estimations values), and load_policy() to load it.
Detailed comments are in the code.
# AI player
# an explanation of the value function (a function of state): https://face2ai.com/RL-RSAB-1-5-An-Extended-Example/
class Player:
    # step_size: the step size alpha used in the backup of the value estimation (see the link above)
    # epsilon: the probability to explore
    def __init__(self, step_size=0.1, epsilon=0.1):
        self.estimations = dict()
        self.step_size = step_size
        self.epsilon = epsilon
        self.states = []
        self.greedy = []

    def reset(self):
        self.states = []
        self.greedy = []

    def set_state(self, state):
        self.states.append(state)
        self.greedy.append(True)  # assume the move is greedy (exploit); act() flips this flag when it explores

    # assign an initial value to every state (estimations: key = hash of a state, value = its estimated value)
    # terminal states: 1 if this player wins, 0.5 for a tie, 0 for a loss
    # non-terminal states: 0.5
    def set_symbol(self, symbol):
        self.symbol = symbol
        for hash_val in all_states.keys():
            (state, is_end) = all_states[hash_val]
            if is_end:
                if state.winner == self.symbol:
                    self.estimations[hash_val] = 1.0
                elif state.winner == 0:
                    self.estimations[hash_val] = 0.5
                else:
                    self.estimations[hash_val] = 0
            else:
                self.estimations[hash_val] = 0.5

    # update the value estimation backwards through the episode
    def backup(self):
        self.states = [state.hash() for state in self.states]
        # backward update: V(S_t) <- V(S_t) + alpha * [V(S_{t+1}) - V(S_t)]
        # see https://face2ai.com/RL-RSAB-1-5-An-Extended-Example/
        for i in reversed(range(len(self.states) - 1)):
            state = self.states[i]
            td_error = self.greedy[i] * (self.estimations[self.states[i + 1]] - self.estimations[state])
            self.estimations[state] += self.step_size * td_error

    # choose the best next action in the current state
    # act() returns the board position and the player's symbol: (i, j, symbol)
    def act(self):
        state = self.states[-1]
        next_states = []
        next_positions = []
        # collect every empty cell of the current state
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                if state.data[i, j] == 0:
                    next_positions.append([i, j])
                    next_states.append(state.next_state(i, j, self.symbol).hash())

        # explore
        if np.random.rand() < self.epsilon:  # np.random.rand() draws from (0, 1)
            action = next_positions[np.random.randint(len(next_positions))]
            action.append(self.symbol)
            self.greedy[-1] = False
            return action

        # otherwise exploit
        values = []
        for hash_val, pos in zip(next_states, next_positions):
            values.append((self.estimations[hash_val], pos))
        np.random.shuffle(values)
        values.sort(key=lambda x: x[0], reverse=True)  # sort by state value, descending
        action = values[0][1]  # take the position whose resulting state has the highest value
        action.append(self.symbol)  # attach the player's symbol to the action
        return action

    def save_policy(self):
        # .bin is a binary file
        with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'wb') as f:
            pickle.dump(self.estimations, f)  # serialize the estimations

    def load_policy(self):
        with open('policy_%s.bin' % ('first' if self.symbol == 1 else 'second'), 'rb') as f:
            self.estimations = pickle.load(f)
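One detail that is easy to miss: when act() explores, it sets self.greedy[-1] = False, and because backup() multiplies each TD error by that flag, a state's value is not updated when the move taken from it was exploratory. A toy illustration with made-up numbers:

alpha = 0.1
values = [0.5, 0.6, 1.0]          # V(S_0), V(S_1), V(S_2) along one episode
greedy = [True, False, True]      # the move out of S_1 was exploratory

for t in reversed(range(len(values) - 1)):
    td_error = greedy[t] * (values[t + 1] - values[t])
    values[t] += alpha * td_error

print(values)                     # approximately [0.51, 0.6, 1.0]; V(S_1) is untouched because its move was exploratory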
HumanPlayer class (human)
HumanPlayer is the human player class. It contains __init__() for initialization, set_state() to set the state, set_symbol() to set the player's symbol, and act(), which lets the human move via the keyboard.
Detailed comments are in the code.
# human interface
# input a number to put a chessman
# | q | w | e |
# | a | s | d |
# | z | x | c |
class HumanPlayer:
    def __init__(self, **kwargs):
        self.symbol = None
        self.keys = ['q', 'w', 'e', 'a', 's', 'd', 'z', 'x', 'c']
        self.state = None
        return

    def reset(self):
        return

    def set_state(self, state):
        self.state = state

    def set_symbol(self, symbol):
        self.symbol = symbol
        return

    def backup(self, _):
        return

    def act(self):
        self.state.print()
        key = input("Input your position: ")  # show the prompt and read the key typed by the user
        # the user is expected to type one of the nine letters on the left of the keyboard (see the diagram above)
        data = self.keys.index(key)  # index of the key
        i = data // int(BOARD_COLS)
        j = data % BOARD_COLS
        return (i, j, self.symbol)
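The keyboard mapping simply flattens the 3 * 3 grid onto the nine left-hand keys q w e / a s d / z x c, and act() recovers the row and column with integer division and modulo. For example:

keys = ['q', 'w', 'e', 'a', 's', 'd', 'z', 'x', 'c']
data = keys.index('s')       # 's' is the centre key, index 4
print(data // 3, data % 3)   # 1 1, i.e. middle row, middle column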
train function
train() simply lets two AI players play against each other, continually refining the value function until it gradually converges. This is how the AI players are trained.
# training: two AI players
# the two AIs play each other, gradually refining the value function (i.e. the estimations)
# sample output near the end of training: Epoch 10000, player 1 win 0.08, player 2 win 0.03
def train(epochs):
    player1 = Player(epsilon=0.01)
    player2 = Player(epsilon=0.01)
    judger = Judger(player1, player2)
    player1_win = 0.0
    player2_win = 0.0
    for i in range(1, epochs + 1):
        winner = judger.play(print_state=False)  # play one game
        if winner == 1:
            player1_win += 1
        if winner == -1:
            player2_win += 1
        print('Epoch %d, player 1 win %.02f, player 2 win %.02f' % (i, player1_win / i, player2_win / i))
        player1.backup()
        player2.backup()
        judger.reset()
    player1.save_policy()
    player2.save_policy()
compete function
This is essentially the test that follows train(). The game rules are so simple that, once trained, the two AIs always play to a tie.
# testing: two AI players
# after training, neither AI can beat the other any more: every game is a tie
# 1000 turns, player 1 win 0.00, player 2 win 0.00
# play `turns` games and compute each AI player's win rate
def compete(turns):
    player1 = Player(epsilon=0)
    player2 = Player(epsilon=0)
    judger = Judger(player1, player2)
    player1.load_policy()
    player2.load_policy()
    player1_win = 0.0
    player2_win = 0.0
    for i in range(0, turns):
        winner = judger.play()
        if winner == 1:
            player1_win += 1
        if winner == -1:
            player2_win += 1
        judger.reset()
    print('%d turns, player 1 win %.02f, player 2 win %.02f' % (turns, player1_win / turns, player2_win / turns))
play function
A human plays against the AI.
# human vs. AI
def play():
    while True:
        player1 = HumanPlayer()
        player2 = Player(epsilon=0)
        judger = Judger(player1, player2)
        player2.load_policy()  # the AI loads the policy (the value function) saved during AI-vs-AI training;
                               # since epsilon is 0 here, it always greedily picks the move leading to the highest-value state
        winner = judger.play()
        if winner == player2.symbol:
            print("You lose!")
        elif winner == player1.symbol:
            print("You win!")
        else:
            print("It is a tie!")