HTML+ CSS + JavaScript 迷宫寻路强化学习 Q Learning算法训练

本文通过一个小例子，演示如何使用强化学习中的 Q Learning 算法训练 AI 自动走迷宫、寻找最佳路径。例子用 JavaScript 代码加 CSS 样式，简单地实现了 AI 的学习和训练过程，非常适合入门学习 Q Learning 算法。HTML + CSS + JavaScript 的代码组合便于直观地理解算法过程：很多人懂得 Q Learning 算法的理论部分，但编码能力不强，无法把它实现成可以运行的代码；本文可以帮助你更好地理解代码部分的运行原理，做到举一反三。同时，前端朋友即使不了解 Q Learning 算法的理论，也能通过阅读 JavaScript 代码逻辑，掌握 Q Learning 算法中迷宫寻路的一些知识。
在这里插入图片描述

Q Learning算法训练

1 获得地图坐标当前的动作
2 获得动作的奖励与惩罚值
3 更新这个坐标动作的Q值
4 开始下个坐标的训练

// Simplified outline of a single training step. The complete version
// (with stop / episode bookkeeping) is the second definition below and
// replaces this one when both are evaluated.
this.learn = function () {
    // 1. Choose the action to try at the current map position.
    var action = this.selectAction(this.pos);
    this.currentAction = action;

    // 2. Get the reward (or penalty) for that action and the resulting position,
    //    e.g. for "bottom" into a wall: [-10, [1, 0]] (wall penalty, position unchanged).
    var rewardAndState = this.getRewardAndNextState(this.pos, action);
    var reward = rewardAndState[0];
    var nextPos = rewardAndState[1];

    // 3. Update the Q-value stored for (position, action).
    this.updateQValue(this.pos, action, reward, nextPos);

    // 4. Move to the next position and schedule the next training step.
    this.pos = nextPos;
    this._doNext();
};

/**
 * Complete training step: handles stop requests, episode resets and the
 * end of training, then performs one select / reward / update / move cycle
 * and schedules the following step through this._doNext().
 */
this.learn = function () {
    // Training was stopped from outside.
    if (this.stopFlg === true) {
        return;
    }

    // Goal reached: count the episode and restart from the start cell.
    if (this.pos[0] === this.goal[0] && this.pos[1] === this.goal[1]) {
        this.iterCount++;
        this.actCount = 0;
        this.pos = [0, 0];
        return this._doNext();
    }

    // All episodes done: show the final Q-values and stop scheduling steps.
    if (this.iterCount >= this.maxIter) {
        return this._finishAction();
    }

    // Too many steps in this episode: give up and start the next episode.
    if (this.actCount >= this.maxAct) {
        this.iterCount++;
        this.actCount = 0;
        this.pos = [0, 0];
        return this._doNext();
    }

    this.actCount++;

    // Choose the action to try at the current map position.
    var action = this.selectAction(this.pos);
    this.currentAction = action;

    // Reward (or penalty) and resulting position for that action,
    // e.g. for "bottom" into a wall: [-10, [1, 0]].
    var rewardAndState = this.getRewardAndNextState(this.pos, action);
    var reward = rewardAndState[0];
    var nextPos = rewardAndState[1];

    // Update the Q-value stored for (position, action).
    this.updateQValue(this.pos, action, reward, nextPos);

    // Move the agent, redraw it, and schedule the next step.
    this.pos = nextPos;
    this.maze.showAgent(this.pos);
    this._doNext();
};

获得地图坐标当前的动作

1 通过训练的数据生成贪婪策略值
2 用贪婪策略值与随机数对比，生成下一步动作，会产生两个分支
3 返回随机生成的一个动作
4 通过 Q 值产生4个动作中Q值最高的动作

/**
 * Epsilon-greedy action selection for the given position.
 *
 * With probability epsilon a random action is explored; otherwise the action
 * with the highest Q-value at `pos` is exploited. Records the branch taken in
 * this.actionType ('non-greedy' or 'greedy').
 *
 * @param {number[]} pos - [x, y] index into this.q
 * @returns {string} one of the entries of this.actions
 */
this.selectAction = function (pos) {
    var q = this.q[pos[0]][pos[1]];      // Q-values of the four actions at pos
    var epsilon = this._getEpsilon();    // current exploration rate

    // Explore: strict "<" so that epsilon === 0 never explores.
    if (Math.random() < epsilon) {
        this.actionType = 'non-greedy';
        return shuffle(this.actions)[0];
    }

    // Exploit: pick the action with the highest Q-value. The actions are
    // shuffled first so that ties are broken randomly. -Infinity (rather than
    // a fixed sentinel such as -999) keeps the comparison correct however
    // negative the Q-values become.
    this.actionType = 'greedy';
    var maxVal = -Infinity;
    var maxAction = "top";
    shuffle(this.actions).forEach(function (action) {
        if (maxVal < q[action]) {
            maxAction = action;
            maxVal = q[action];
        }
    });
    return maxAction;
};

/**
 * Computes the current exploration rate: the configured this.epsilon minus
 * the fraction of the total training budget (maxIter * maxAct steps) already
 * used, clamped at 0. The result is also stored in this.currentEpsilon.
 *
 * @returns {number} exploration rate, never negative
 */
this._getEpsilon = function () {
    // Total number of steps available for training.
    var base = this.maxIter * this.maxAct;
    // Steps used so far; if the budget is not a positive number, treat the
    // training as finished instead of producing NaN.
    var progress = base > 0
        ? (this.iterCount * this.maxAct + this.actCount) / base
        : 1;
    var epsilon = Math.max(0, this.epsilon - progress);
    this.currentEpsilon = epsilon;
    return epsilon;
};

获得动作的奖励与惩罚值

从训练中获得动作的奖励与惩罚值，分为四个动作的判断（上，下，左，右）。例如在坐标 [1,0] 上随机产生了一个向下（bottom）的动作，如果这个动作被判断为撞墙，就会得到撞墙惩罚 -10 作为训练值，并且位置保持不变。返回的训练值数据结构为 [奖惩值, 下一个坐标]，本例即 bottom（向下）返回 [-10,[1,0]]。

例子中分为三种奖罚机制

撞墙惩罚
到达终点
正常道路

/**
 * Applies `action` at `pos` and reports the outcome.
 *
 * @param {number[]} pos    - current [x, y] position
 * @param {string}   action - 'top' | 'bottom' | 'left' | 'right'
 * @returns {Array} [reward, nextPos]:
 *   - [this.penalty, pos]         when the target cell is a wall (agent stays),
 *   - [this.goalReward, nextPos]  when the target cell is the goal,
 *   - [this.stepPenalty, nextPos] for an ordinary path cell,
 *   - [0, pos]                    for an unknown action.
 */
this.getRewardAndNextState = function (pos, action) {
    // Offset applied to [x, y] by each action.
    var moves = {
        top: [0, -1],
        bottom: [0, 1],
        left: [-1, 0],
        right: [1, 0]
    };

    // Unknown action: no reward, no movement.
    if (!Object.prototype.hasOwnProperty.call(moves, action)) {
        return [0, pos];
    }

    var move = moves[action];
    var nextPos = [pos[0] + move[0], pos[1] + move[1]];

    // this.maze.box is indexed with +1 on both axes relative to pos.
    // A value of 0 marks a wall: penalize and stay in place.
    if (this.maze.box[nextPos[0] + 1][nextPos[1] + 1] === 0) {
        return [this.penalty, pos];
    }

    // Goal reached.
    if (nextPos[0] === this.goal[0] && nextPos[1] === this.goal[1]) {
        return [this.goalReward, nextPos];
    }

    // Ordinary path cell.
    return [this.stepPenalty, nextPos];
};

更新坐标动作的Q值

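Q-Learning 是强化学习方法之一。Q-Learning 的核心函数（Q 函数）的更新公式为 $Q(s_t,a_t) \leftarrow (1-\alpha)\,Q(s_t,a_t) + \alpha\left[r_t + \gamma \max_{a} Q(s_{t+1},a)\right]$，其中 $\alpha$ 是学习率，$\gamma$ 是折扣率，$r_t$ 是本次动作得到的奖惩值。用这个公式计算动作的 Q 值，并且把 Q 值保存在坐标对应的动作上。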

更新坐标的 Q 值：$Q(s_t,a_t) \leftarrow (1-\alpha)\,Q(s_t,a_t) + \alpha\left[r_t + \gamma \max_{a} Q(s_{t+1},a)\right]$
计算坐标位置的 Q 值
/**
 * Applies the Q-Learning update to the Q-value of (pos, action):
 *
 *   Q(s_t, a_t) <- (1 - alpha) * Q(s_t, a_t)
 *                  + alpha * [r_t + gamma * max_a Q(s_{t+1}, a)]
 *
 * @param {number[]} pos     - position the action was taken from
 * @param {string}   action  - action that was taken
 * @param {number}   reward  - reward or penalty received for it
 * @param {number[]} nextPos - position reached after the action
 */
this.updateQValue = function (pos, action, reward, nextPos) {
    // Current estimate for this (position, action) pair.
    var qValue = this.q[pos[0]][pos[1]][action];

    // Best Q-value available from the next position,
    // e.g. nextQ = {"top": 0, "left": 0, "bottom": -5, "right": 0}.
    // -Infinity (not a fixed sentinel such as -999) keeps the maximum correct
    // however negative the stored values become.
    var nextQ = this.q[nextPos[0]][nextPos[1]];
    var nextMaxReward = -Infinity;
    for (var act in nextQ) {
        if (Object.prototype.hasOwnProperty.call(nextQ, act) && nextMaxReward < nextQ[act]) {
            nextMaxReward = nextQ[act];
        }
    }

    // Blend the old estimate with the newly observed return.
    var newQValue = (1 - this.alpha) * qValue +
        this.alpha * (reward + this.gampa * nextMaxReward);
    this.q[pos[0]][pos[1]][action] = newQValue;
};

实例代码

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title> 迷宫寻路 强化学习Q Learning算法训练</title>
    <style>
        .maze {}
        .maze:after { width: 0; height: 0; content: ""; clear: both; }
        .w { background-color: blue; width: 10px; height: 10px; padding: 0; margin: 0; float: left; }
        .p { background-color: white; width: 10px; height: 10px; padding: 0; margin: 0; float: left; }
        .s, .e { background-color: yellow; width: 10px; height: 10px; padding: 0; margin: 0; float: left; }
        .a { background-color: red; width: 10px; height: 10px; padding: 0; margin: 0; float: left; }
        #console2 { font-size: 10px; border-collapse: collapse; }
        #console2 td { border: 1px solid #ccc; padding: 5px; }
        #console2 td.current { background-color: rgba(255, 255, 0, .3); }
        .red { color: red; }
        .blue { color: blue; }
    </style>
</head>
<body>
<div id="maze"></div>
<div id="zhtbs"></div>
<div style="padding: 5px 0;font-size: 10px;border-bottom: 1px solid #ccc;margin-bottom: 10px;">
    大小<input id="dtSize" type="text" value="7" style="width:30px">,
    学习率<input id="xxRate" type="text" value="0.5" style="width:30px">,
    折扣率<input id="zkRate" type="text" value="0.9" style="width:30px">,
    贪婪率<input id="epsilon" type="text" value="0.6" style="width:30px">
    <button onclick="onRun()">运行 Q-Learning 算法</button>
    <button onclick="onDo()">执行训练出来的最佳路径</button>
</div>
<div id="console1" style="font-size:10px;"></div>
<table id="console2"></table>
<br>
<script>
/******** Maze generation ********/

/**
 * Maze model and renderer. The maze is a size x size grid stored in this.box
 * (0 = wall, 1 = path) and drawn into the #zhtbs element as 10px cells.
 *
 * @param {number} size - requested side length; even values are bumped to the next odd value
 */
function zht(size) {
    this.pagesum = size;
    this.size = (size % 2 === 0 ? size + 1 : size);
    this.box = [];
    this.$maze = document.querySelector("#zhtbs");
    this.ALGO = { STICK: 1 };

    /** In-place random shuffle; returns the same array. */
    this.shuffle = function (o) {
        return shuffle(o);
    };

    /**
     * Draws the whole grid. Cell priority: agent, start (1,1),
     * end (size-2,size-2), wall, path.
     *
     * @param {number[]|null} agentCell - [column, row] of the agent in grid
     *        coordinates, or null to draw no agent
     */
    this._render = function (agentCell) {
        var snipet = '';
        var last = this.size - 2;
        for (var i = 0; i < this.size; i++) {          // i: row
            for (var j = 0; j < this.size; j++) {      // j: column
                var cls;
                if (agentCell && j === agentCell[0] && i === agentCell[1]) {
                    cls = 'a';      // agent
                } else if (i === 1 && j === 1) {
                    cls = 's';      // start
                } else if (i === last && j === last) {
                    cls = 'e';      // end
                } else if (this.box[j][i] === 0) {
                    cls = 'w';      // wall
                } else {
                    cls = 'p';      // path
                }
                snipet += '<div class="' + cls + '"></div>';
            }
        }
        this.$maze.innerHTML = snipet;
        this.$maze.style.height = (this.size * 10) + 'px';
        this.$maze.style.width = (this.size * 10) + 'px';
    };

    /**
     * Draws the maze with the agent on it.
     * @param {number[]} agentPos - agent [x, y] in field coordinates
     *        (grid coordinates minus the outer wall ring, hence the +1)
     */
    this.showAgent = function (agentPos) {
        this._render([agentPos[0] + 1, agentPos[1] + 1]);
    };

    /** Draws the maze without an agent. */
    this.show = function () {
        this._render(null);
    };

    /**
     * Generates (when options.algorithm === this.ALGO.STICK) and draws the maze.
     * @param {{algorithm: number}} [options]
     */
    this.create = function (options) {
        options = options || {};
        if (options.algorithm === this.ALGO.STICK) {
            this._createByStick();
        }
        this.show();
    };

    /**
     * Fills this.box: the border is wall, cells at (even, even) are wall
     * "pillars", everything else is path. Then, for every inner pillar, one
     * neighbouring path cell in a random direction is turned into wall.
     * Pillars in rows >= 4 never use the 'top' direction.
     */
    this._createByStick = function () {
        var i, j, r;

        // Initial grid.
        this.box = [];
        for (i = 0; i < this.size; i++) {
            var row = [];
            this.box.push(row);
            for (j = 0; j < this.size; j++) {
                if (i === 0 || (i + 1) === this.size) {
                    row.push(0);            // first and last row: wall
                } else if (j === 0 || (j + 1) === this.size) {
                    row.push(0);            // first and last column: wall
                } else if (i % 2 === 1) {
                    row.push(1);            // odd rows: all path
                } else {
                    row.push(j % 2);        // even rows: pillar / path alternating
                }
            }
        }

        // Row/column offset of the neighbour in each direction.
        var offsets = {
            top: [-1, 0],
            bottom: [1, 0],
            left: [0, -1],
            right: [0, 1]
        };

        for (r = 0; r < this.box.length; r++) {
            // Skip the border rows and the odd (pure path) rows.
            if (r === 0 || (r + 1) === this.box.length || r % 2 === 1) {
                continue;
            }

            var direction = ['top', 'bottom', 'left', 'right'];
            if (r >= 4) {
                direction = direction.slice(1);   // drop 'top' after the first pillar row
            }

            var cells = this.box[r];
            for (i = 0; i < cells.length; i++) {
                // Only inner pillars (even columns, not on the border).
                if (i === 0 || (i + 1) === cells.length || i % 2 !== 0) {
                    continue;
                }

                // Try the directions in random order; wall off the first
                // neighbour that is still a path cell.
                direction = this.shuffle(direction);
                for (j = 0; j < direction.length; j++) {
                    var off = offsets[direction[j]];
                    if (this.box[r + off[0]][i + off[1]] === 1) {
                        this.box[r + off[0]][i + off[1]] = 0;
                        break;
                    }
                }
            }
        }
    };
}

/** In-place Fisher-Yates shuffle; returns the same array. */
function shuffle(o) {
    for (var i = o.length; i > 0;) {
        var j = Math.floor(Math.random() * i);
        i--;
        var tmp = o[i];
        o[i] = o[j];
        o[j] = tmp;
    }
    return o;
}

/**
 * Rounds `num` down (Math.floor) to `digit` decimal places.
 * @param {number} num
 * @param {number} [digit=2]
 * @returns {number}
 */
function floorNumber(num, digit) {
    if (digit === undefined) {
        digit = 2;
    }
    var magic = Math.pow(10, digit);
    return Math.floor(num * magic) / magic;
}

/******** Q-Learning training ********/

/**
 * Q-Learning agent that learns a path through a maze.
 *
 * @param {{size: number, maze: zht}} options - size is the side length of the
 *        field (the maze without its outer wall ring); maze is the maze to learn
 */
function QLearn(options) {
    // Offset applied to [x, y] by each action.
    var MOVES = {
        top: [0, -1],
        bottom: [0, 1],
        left: [-1, 0],
        right: [1, 0]
    };

    this.q = [];
    this.size = options.size;
    this.maze = options.maze;
    this.actions = ["top", "left", "bottom", "right"];
    this.displayInterval = 16;   // delay in ms between two training steps

    /** Resets the Q table to all zeros and sets the start and goal positions. */
    this.initialize = function () {
        this.q = [];
        for (var i = 0; i < this.size; i++) {
            var row = [];
            this.q.push(row);
            for (var j = 0; j < this.size; j++) {
                row.push({ top: 0, left: 0, bottom: 0, right: 0 });
            }
        }
        this.pos = [0, 0];                               // start
        this.goal = [this.size - 1, this.size - 1];      // goal
    };

    /**
     * Configures rewards and training parameters and resets the counters.
     * @param {number} learningRate - alpha
     * @param {number} discountRate - gamma
     * @param {number} epsilon      - initial exploration rate
     */
    this.learningSetting = function (learningRate, discountRate, epsilon) {
        // Rewards.
        this.penalty = -10;        // hitting a wall
        this.backPenalty = 0;      // not read anywhere in this file
        this.stepPenalty = -0.1;   // every ordinary step
        this.passReward = 0;       // not read anywhere in this file
        this.goalReward = 100;     // reaching the goal

        // Training parameters.
        this.alpha = learningRate;
        this.gampa = discountRate;
        this.epsilon = epsilon;
        this.maxIter = Math.floor(Math.max(40, Math.pow(this.size, 2)));     // number of episodes
        this.maxAct = Math.floor(Math.max(40, Math.pow(this.size, 2.7)));    // max steps per episode
        this.iterCount = 0;        // episodes done
        this.actCount = 0;         // steps done in the current episode
    };

    /**
     * Current exploration rate: this.epsilon minus the fraction of the total
     * step budget (maxIter * maxAct) already used, clamped at 0. Also stored
     * in this.currentEpsilon.
     */
    this._getEpsilon = function () {
        var base = this.maxIter * this.maxAct;
        // A non-positive or NaN budget counts as "training finished" instead
        // of yielding a NaN epsilon.
        var progress = base > 0
            ? (this.iterCount * this.maxAct + this.actCount) / base
            : 1;
        var epsilon = Math.max(0, this.epsilon - progress);
        this.currentEpsilon = epsilon;
        return epsilon;
    };

    /**
     * One training step: handles stop requests, episode resets and the end of
     * training, then selects an action, updates its Q-value, moves the agent
     * and schedules the following step.
     */
    this.learn = function () {
        // Stopped from outside (e.g. a new run was started).
        if (this.stopFlg === true) {
            return;
        }

        // Goal reached: count the episode and restart from the start cell.
        if (this.pos[0] === this.goal[0] && this.pos[1] === this.goal[1]) {
            this.iterCount++;
            this.actCount = 0;
            this.pos = [0, 0];
            return this._doNext();
        }

        // All episodes done: show the final Q-values.
        if (this.iterCount >= this.maxIter) {
            return this._finishAction();
        }

        // Step limit of this episode reached: start the next episode.
        if (this.actCount >= this.maxAct) {
            this.iterCount++;
            this.actCount = 0;
            this.pos = [0, 0];
            return this._doNext();
        }

        this.actCount++;

        var action = this.selectAction(this.pos);
        this.currentAction = action;

        // e.g. "bottom" into a wall yields [-10, [1, 0]].
        var rewardAndState = this.getRewardAndNextState(this.pos, action);
        var reward = rewardAndState[0];
        var nextPos = rewardAndState[1];

        this.updateQValue(this.pos, action, reward, nextPos);

        this.pos = nextPos;
        this.maze.showAgent(this.pos);
        this._doNext();
    };

    /**
     * Q-Learning update:
     *   Q(s_t, a_t) <- (1 - alpha) Q(s_t, a_t) + alpha [r_t + gamma * max_a Q(s_{t+1}, a)]
     */
    this.updateQValue = function (pos, action, reward, nextPos) {
        var qValue = this.q[pos[0]][pos[1]][action];

        // Best Q-value available from the next position. -Infinity keeps the
        // maximum correct however negative the stored values become.
        var nextQ = this.q[nextPos[0]][nextPos[1]];
        var nextMaxReward = -Infinity;
        for (var act in nextQ) {
            if (Object.prototype.hasOwnProperty.call(nextQ, act) && nextMaxReward < nextQ[act]) {
                nextMaxReward = nextQ[act];
            }
        }

        var newQValue = (1 - this.alpha) * qValue +
            this.alpha * (reward + this.gampa * nextMaxReward);
        this.q[pos[0]][pos[1]][action] = newQValue;
    };

    /**
     * Applies `action` at `pos`.
     * @returns {Array} [reward, nextPos]; on a wall hit the position is
     *          unchanged, an unknown action yields [0, pos]
     */
    this.getRewardAndNextState = function (pos, action) {
        if (!Object.prototype.hasOwnProperty.call(MOVES, action)) {
            return [0, pos];
        }

        var move = MOVES[action];
        var nextPos = [pos[0] + move[0], pos[1] + move[1]];

        // maze.box includes the outer wall ring, hence the +1 on both axes.
        if (this.maze.box[nextPos[0] + 1][nextPos[1] + 1] === 0) {
            return [this.penalty, pos];                 // wall
        }
        if (nextPos[0] === this.goal[0] && nextPos[1] === this.goal[1]) {
            return [this.goalReward, nextPos];          // goal
        }
        return [this.stepPenalty, nextPos];             // ordinary path
    };

    /**
     * Refreshes the Q-value display and schedules the next learn() call.
     * Any step still pending on this learner is cancelled first, so calling
     * learn() again while a run is in progress cannot start a second loop.
     */
    this._doNext = function () {
        this._showQValue();
        var that = this;
        clearTimeout(this._timer);
        this._timer = setTimeout(function () {
            that.learn();
        }, this.displayInterval);
    };

    /**
     * Epsilon-greedy action selection; records the branch taken in
     * this.actionType ('non-greedy' or 'greedy').
     */
    this.selectAction = function (pos) {
        var q = this.q[pos[0]][pos[1]];
        var epsilon = this._getEpsilon();

        // Explore: strict "<" so that epsilon === 0 never explores.
        if (Math.random() < epsilon) {
            this.actionType = 'non-greedy';
            return shuffle(this.actions)[0];
        }

        // Exploit: highest Q-value wins; shuffling breaks ties randomly.
        this.actionType = 'greedy';
        var maxVal = -Infinity;
        var maxAction = "top";
        shuffle(this.actions).forEach(function (action) {
            if (maxVal < q[action]) {
                maxAction = action;
                maxVal = q[action];
            }
        });
        return maxAction;
    };

    /** End of training: show the final Q-values. */
    this._finishAction = function () {
        this._showQValue();
    };

    /** Writes the training status to #console1 and the Q table to #console2. */
    this._showQValue = function () {
        var status = '';
        status += '学习最大次数(maxIter)=' + this.maxIter + ',学习次数iter=' + this.iterCount + ',学习中最大操作次数 maxAct=' + this.maxAct + ', 操作次数act=' + this.actCount + ', 操作动作=' + this.currentAction + '<br>操作类型=' + this.actionType + '<br>贪婪策略值=' + this.currentEpsilon;
        document.getElementById("console1").innerHTML = status;

        var snipet = '';
        var actions = ['top', 'left', 'bottom', 'right'];
        var that = this;
        for (var i = 0; i < this.size; i++) {          // i: table row = y
            snipet += '<tr>';
            for (var j = 0; j < this.size; j++) {      // j: table column = x
                if (this.pos[0] === j && this.pos[1] === i) {
                    snipet += '<td class="current">';
                } else {
                    snipet += '<td>';
                }
                // pos[0] (x) is compared with j above, so j is x and i is y.
                snipet += 'x=' + j + ', y=' + i + '<br>';
                for (var k = 0; k < actions.length; k++) {
                    var action = actions[k];
                    var val = that.q[j][i][action];
                    var cls = '';
                    if (val < 0) {
                        cls = 'red';
                    } else if (val > 0) {
                        cls = 'blue';
                    }
                    snipet += '  ' + action + '\t: q=<span class="' + cls + '">' + floorNumber(val) + '</span><br>';
                }
                snipet += '</td>';
            }
            snipet += '</tr>';
        }
        document.getElementById("console2").innerHTML = snipet;
    };

    /**
     * Switches to pure exploitation of the learned Q-values: no learning
     * (alpha = 0), no exploration (epsilon = 0), a single episode.
     */
    this.greedySetting = function () {
        this.alpha = 0;
        this.gampa = 0;
        this.epsilon = 0;
        this.maxIter = 1;                         // one episode
        // Math.pow needs an exponent: with one argument it returns NaN, which
        // disables the step limit. size^2 is the number of field cells.
        this.maxAct = Math.pow(this.size, 2);     // max steps in that episode
        this.iterCount = 0;
        this.actCount = 0;
        this.pos = [0, 0];
    };
}

/** Reads the form, builds a new maze and starts training on it. */
function doAction() {
    // Stop a learner that may still be running from a previous click.
    if (window.qlearn) {
        window.qlearn.stopFlg = true;
    }
    window.qlearn = null;

    var mazeSize = parseInt(document.getElementById("dtSize").value, 10);
    var learningRate = parseFloat(document.getElementById("xxRate").value);
    var discountRate = parseFloat(document.getElementById("zkRate").value);
    var epsilon = parseFloat(document.getElementById("epsilon").value);

    // Reject unusable input instead of training with NaN.
    if (!(mazeSize >= 3) || isNaN(learningRate) || isNaN(discountRate) || isNaN(epsilon)) {
        document.getElementById("console1").textContent = '参数无效：大小须为不小于 3 的整数，学习率、折扣率、贪婪率须为数字。';
        return;
    }

    if (mazeSize % 2 === 0) {
        mazeSize++;
    }
    var fieldSize = mazeSize - 2;   // field = maze without the outer wall ring

    // Build the maze.
    var z = new zht(mazeSize);
    z.create({ algorithm: z.ALGO.STICK });

    // Run Q-Learning on it.
    window.qlearn = new QLearn({ size: fieldSize, maze: z });
    window.qlearn.initialize();
    window.qlearn.learningSetting(learningRate, discountRate, epsilon);
    window.qlearn.learn();
}

/** "Run" button: start training. */
function onRun() {
    doAction();
}

/** "Best path" button: replay the learned policy greedily. */
function onDo() {
    // Nothing has been trained yet.
    if (!window.qlearn) {
        return;
    }
    window.qlearn.greedySetting();
    window.qlearn.learn();
}
</script>
</body>
</html>

标签： html javascript 算法

本文转载自: https://blog.csdn.net/zhtbs/article/details/125336009
版权归原作者 Zht_bs 所有，如有侵权，请联系我们删除。