rl.c
  int FullRewardTime = 50;
  if (Mem->CurrentTime - Time > FullRewardTime){
    /* not lowered beyond factor of 10 */
    float divisor = 1 + 9.0*(Mem->CurrentTime-Time-FullRewardTime)/(MAX_REWARD_TIME-FullRewardTime);
    reward /= divisor;   /* divisor between 1 and 10 */
  }
 }

#if GOAL_ONLY_REWARDS
  if (Mem->PlayMode != BEFORE_KICK_OFF) return;
  if      ( Mem->MyScore    > MyScore    ) reward =  100;
  else if ( Mem->TheirScore > TheirScore ) reward = -100;
  float divisor = Mem->CurrentTime - Time;
  if (divisor > 0) reward /= divisor;
#endif

#if POS_5_LEARN
  if (Mem->GetMyPosition()==5 && Mem->GetPositionOfMyLocation()==5 && QLastActionFrom==5){
    ; /*printf("%d - %d reward for %d (%.3f) = %.1f\n",
         Mem->MyNumber,Mem->CurrentTime,QLastActionTo,QLastActionVal,reward);*/
  }
  else
    return;
#endif

  RewardQTable->UpdateQTable(QLastActionTo,QLastActionVal,QLastActionConf,
                             QLastFutureVal,reward);

  FILE *dataFile = fopen(RewardQTable->dataFileName,"w");
  while ( dataFile == NULL ){   /* keep retrying until the file opens */
    dataFile = fopen(RewardQTable->dataFileName,"w");
  }
  RewardQTable->Write(dataFile);
  fclose(dataFile);

  QActionTaken = FALSE;
}

void RewardInfo::LookForRewards(){
  if ( !KeepLearning ) return;
  if ( !QActionTaken ) my_error("no action for which to look for rewards");

#if GOAL_ONLY_REWARDS
#else
  /* if it's been 30 seconds, close rewards */
  if ( Mem->CurrentTime - Time >= MAX_REWARD_TIME ) CloseRewards();
#endif

  /* Don't want to divide by 0 -- nothing new has happened */
  if ( Mem->CurrentTime == Time ) return;

  /* Here just tabulate ball positions, weighted by time in each place */
  float previousWeight = AvgBallX * (AvgBallUpdateTime - Time);
  float currentWeight  = Mem->GetBallGlobalX() * (Mem->CurrentTime - AvgBallUpdateTime);
  AvgBallX = (previousWeight + currentWeight)/(Mem->CurrentTime - Time);
  AvgBallUpdateTime = Mem->CurrentTime;
}

/****************************************************************************/

int GetActionNumber(int index, int *actions, int *action_types){
  int action_number;
  if ( action_types[index] == RL_PASS_ACTION )
    action_number = Mem->GetPlayerPosition(actions[index]);
  else  /* RL_KNOCK_ACTION -- knocks stored after passes */
    action_number = (actions[index]-MY_GOAL) + NUM_RL_PASS_ACTIONS;
  return action_number;
}

/****************************************************************************/

int ChooseReceiverRandom(int NumActions, int *actions, int *action_types,
                         float *FeatureValues, float *Confidences){
/* int passes = 0, knocks = 0;
   for (int i=0; i<NumActions; i++){
     if      (action_types[i] == RL_KNOCK_ACTION) knocks++;
     else if (action_types[i] == RL_PASS_ACTION)  passes++;
     else my_error("which type of action?");
   }*/

  int NumOptions=0;
  int Options[NumActions];
  float maxVal = -20000;

  for (int i=0; i<NumActions; i++){
#if ONLY_KNOCKS
    if (action_types[i] == RL_PASS_ACTION) continue;
#endif
#if ONLY_PASSES
    if (action_types[i] == RL_KNOCK_ACTION &&
        Mem->GetMyLocationsPositionType() != FORWARD) continue;
#endif
#if BREAK_TIES_MAXQ
    if (FeatureValues[i] < maxVal) continue;
    /*else if (FeatureValues[i] > maxVal){*/
    else if (FeatureValues[i] > maxVal &&
             Mem->GetMyLocationsPositionType() != FORWARD){
      /* Reset to only consider actions with the highest value */
      if (maxVal < DT_SUCCESS_CUTOFF) NumOptions=0;
      maxVal = MIN(DT_SUCCESS_CUTOFF,FeatureValues[i]);
    }
#endif
    Options[NumOptions++] = i;
  }

  if (!NumOptions) return NumActions-1;   /* the knock to goal */
  else             return Options[int_random(NumOptions)];

/* #if ONLY_KNOCKS
   passes=0;
#endif
#if ONLY_PASSES
   if (Mem->GetMyLocationsPositionType() != FORWARD){
     if (passes) return int_random(NumActions-knocks);
     else        return int_random(knocks)+NumActions-knocks;
   }
#endif */

  /* 50% chance of shooting or passing */
/* if (int_random(2) && passes) return int_random(NumActions-knocks);
   else                         return int_random(knocks)+NumActions-knocks;*/
}

int ChooseReceiverMaxQ(int NumActions, int *actions, int *action_types,
                       float *FeatureValues, float *Confidences){
  QTable *MyQTable = Mem->GetMyQTable();

#if 0
  int action_index = NumActions;   /* me */
  float maxQ = MyQTable->GetQValue(Mem->GetMyPosition(), KNOCK_CONF,DEFAULT_FUTURE);
  /*printf("%d to self = %.1f\n",Mem->MyNumber,maxQ);*/
#endif

  int   action_index = -1;
  float maxQ   = -20000;   /* Smaller than least possible */
  float maxVal = -20000;
  float Q;
  int   action;

  for (int i=0; i<NumActions; i++){
#if BREAK_TIES_MAXQ
    if (FeatureValues[i] < maxVal) continue;
    //else if (FeatureValues[i] > maxVal){
    else if (FeatureValues[i] > maxVal &&
             Mem->GetMyLocationsPositionType() != FORWARD){
      /* Reset to only consider actions with the highest value */
      if (maxVal < DT_SUCCESS_CUTOFF){
        maxQ = -20000;
        action_index = -1;
      }
      maxVal = MIN(DT_SUCCESS_CUTOFF,FeatureValues[i]);
    }
#endif
#if ONLY_KNOCKS
    if (action_types[i] != RL_KNOCK_ACTION) continue;
#endif
#if ONLY_PASSES
    if (Mem->GetMyLocationsPositionType() != FORWARD &&
        action_types[i] != RL_PASS_ACTION) continue;
#endif

    action = GetActionNumber(i,actions,action_types);
    Q = MyQTable->GetQValue(action,FeatureValues[i],DEFAULT_FUTURE);

#if TRUST_CONF
    //Q += QMAX;                      /* So not negative -- done to all of them (0-200) */
    //Q *= (Confidences[i]+1)/2;
    Q *= ((Confidences[i]+1)/2 + 1);  /* so ranges from 1-2 */
    //Q *= ((Confidences[i]+1)/4 + 1);
    /* Result ranges 0-4, so not actual reward, but should correlate */
#endif

    if ( Q >= maxQ ){
      maxQ = Q;
      action_index = i;
    }
    /* printf("%d to %d (conf %.1f, val %.1f) = %.1f (was %.1f) action_index = %d\n",
       MyQTable->Position,action,Confidences[i],FeatureValues[i],Q,
       MyQTable->GetQValue(action,FeatureValues[i],DEFAULT_FUTURE), action_index); */
  }

  if (action_index == -1){
    // my_error("Should have found SOME best action");  /* no passes */
    return ChooseReceiverRandom(NumActions,actions,action_types,
                                FeatureValues,Confidences);
  }

#if BREAK_TIES_MAXQ
  /*printf("%d (%d) at %d options: ",Mem->MyNumber, MyQTable->Position, Mem->CurrentTime);
    for (int i=0; i<NumActions; i++){
      if (FeatureValues[i] == maxVal) printf("%d ",GetActionNumber(i,actions,action_types));
    }
    printf("\n");*/
#endif

  return action_index;
}

int ChooseReceiverRandomized(int NumActions, int *actions, int *action_types,
                             float *FeatureValues, float *Confidences){
  /* p is probability of choosing a random action -- decreases over time:
     linearly to .5 at game 40, to .1 at game 80, to .01 at game 120 */
  float p;
  int game = Mem->CurrentTime/GAME_LENGTH;

  /* game += 120; */
  /* game = game/8; */
/* if (game>=200){
     game=game%200;
     game=game/2;
   }*/

#if POS_5_LEARN
  if (Mem->GetMyPosition()!=5 || Mem->GetPositionOfMyLocation()!=5) game+=160;
#endif
  //game+=160;

  if      ( game<=40  ) p = 1  - .5 *game/40;
  else if ( game<=80  ) p = .5 - .4 *(game-40)/40;
  else if ( game<=120 ) p = .1 - .09*(game-80)/40;
  else                  p = .01;

  if ( range_random(0,1) <= p )
    return ChooseReceiverRandom(NumActions,actions,action_types,FeatureValues,Confidences);
  else
    return ChooseReceiverMaxQ(NumActions,actions,action_types,FeatureValues,Confidences);
}

/****************************************************************************/

float GetHeuristicConfidence(int action, int action_type){
  float destR, destTheta;

  if (action_type == RL_PASS_ACTION){
    destR     = Mem->GetTeammateDistance(action);
    destTheta = Mem->GetTeammateAngle(action);
  }
  else{   /* action_type == RL_KNOCK_ACTION */
    destR     = MIN(40,Mem->GetMarkerDistance(action));
    destTheta = Mem->GetMarkerAngle(action);
  }

  /*printf("(%.1f,%.1f) ",Mem->GetGlobalX(),Mem->GetGlobalY());*/

  float result = 2;
  float angleDiff, distAlongLine, distFromLine;
  for (int i=1; i<=TEAM_SIZE; i++){
    if (!Mem->OpponentValid(i)) continue;

    /* Special case to ignore goalie for shots */
    if (i==1 && Mem->MarkerValid(THEIR_GOAL) &&
        destR     == Mem->GetMarkerDistance(THEIR_GOAL) &&
        destTheta == Mem->GetMarkerAngle(THEIR_GOAL) )
      continue;

    angleDiff = Mem->GetOpponentAngle(i) - destTheta;
    CleanAngle(&angleDiff);
    angleDiff = fabs(deg_to_rad(angleDiff));
    distAlongLine = Mem->GetOpponentDistance(i) * cos(angleDiff);
    distFromLine  = Mem->GetOpponentDistance(i) * sin(angleDiff);

    if (distAlongLine > destR)        continue;
    if (distAlongLine < 0)            continue;
    //if (distAlongLine > 40)         continue;
    if (distFromLine > 30)            continue;
    if (distFromLine > distAlongLine) continue;

    /* printf("%d: %d angleDiff = %.1f (%.1f - %.1f), distAlong = %.1f, distFrom = %.1f\n",
       action,i,rad_to_deg(angleDiff),Mem->GetOpponentAngle(i),destTheta,
       distAlongLine,distFromLine);*/
    /*printf("%d(%.1f,%.1f) ",i,Mem->GetOpponentGlobalX(i),Mem->GetOpponentGlobalY(i));*/

    result *= .8*(distFromLine/(MIN(30,distAlongLine)));
  }

  result-=1;

  /*printf("%d:%d action = %d, result = %.1f\n",Mem->MyNumber, Mem->CurrentTime, action,result);*/
  return result;
}

/****************************************************************************/

int RLtest(int NumActions, int *actions, int *action_types, float *Confidences){
  /* don't alter actions or confidences */
  /* Only alter Q's when you're training (players in fixed positions) */
  /* Start by loading 'em up for your position */
  /* If position switching, load for all */
  /* Take an action, return it, and then start looking for rewards */

  /* code for don't pass: me the receiver, pass success conf = 0 */

  float HConfidences[NumActions];
  for (int i=0; i<NumActions; i++)   /* keep a copy in HConfidences for the debug output below */
    HConfidences[i] = Confidences[i] = GetHeuristicConfidence(actions[i],action_types[i]);

  float FeatureValues[NumActions];
  for (int i=0; i<NumActions; i++)
    FeatureValues[i] = Confidences[i];

#if 0  /* Used for 10-27 values */
  for (int i=0; i<NumActions; i++)
    FeatureValues[i] = MIN_ACTION_VALUE;
#endif

#if 1  /* Only care if it's a success or failure */
  static int greater=0, less=0;
  for (int i=0; i<NumActions; i++){
    if ( FeatureValues[i] >= DT_SUCCESS_CUTOFF ){
      FeatureValues[i] = 0.45;
      greater++;
    }
    else{
      FeatureValues[i] = -0.45;
      less++;
    }
    if (!((greater+less)%30000))
      printf("%d : %d greater: %d, less: %d\n",Mem->MyNumber, Mem->CurrentTime, greater,less);
  }
#endif

#if 0  /* 2 states -- one for right-client, one for left, based on #opps on each side */
  /* Actually 3 -- and didn't correlate very well with opp client type */
  int opp_r=0, opp_l=0;
  for (int i=1; i<=TEAM_SIZE; i++){
    if ( !Mem->OpponentValid(i) ) continue;
    if      (Mem->GetOpponentGlobalY(i) > 0) opp_l++;
    else if (Mem->GetOpponentGlobalY(i) < 0) opp_r++;
  }
  float val = 0;
  if      (opp_l > opp_r) val =  .45;
  else if (opp_r > opp_l) val = -.45;
  for (int i=0; i<NumActions; i++){
    FeatureValues[i] = val;
    /* printf("%.1f ",FeatureValues[i]); */
  }
  /* printf("(%d)\n",Mem->CurrentTime); */
#endif

#if 0  /* 2 states -- one for right-client, one for left, based on avg opponent y position */
  /* Actually 3 -- and didn't correlate very well with opp client type */
  float opp_y=0,x,y;
  for (int i=1; i<=TEAM_SIZE; i++){
    if ( Mem->GetOpponentGlobalSetTime(i) == 0) continue;
    Mem->GetOpponentLastGlobalXY(i,&x,&y);
    opp_y+=y;
  }
  float val = 0;
  if      (opp_y > 0) val =  .45;
  else if (opp_y < 0) val = -.45;
  for (int i=0; i<NumActions; i++){
    FeatureValues[i] = val;
    /* printf("%.1f ",FeatureValues[i]); */
  }
  /* printf("(%d)\n",Mem->CurrentTime); */
#endif

#if 0  /* One DT value for right and left clients -- time cued */
  float val = (Mem->CurrentTime/3000)%2 ? .45 : -.45;
  for (int i=0; i<NumActions; i++){
    FeatureValues[i] = val;
  }
#endif

#if 0  /* 2 DT values randomly distributed */
  for (int i=0; i<NumActions; i++)
    FeatureValues[i] = int_random(2) ? .45 : -.45;
#endif

  int receiver_index = ChooseReceiverRandomized(NumActions,actions,action_types,FeatureValues,Confidences);
  //int receiver_index = ChooseReceiverMaxQ(NumActions,actions,action_types,FeatureValues,Confidences);
  //int receiver_index = ChooseReceiverRandom(NumActions,actions,action_types,FeatureValues,Confidences);

#if 0
  static int last_max_action = -1;
  if (Mem->GetMyPosition()==5 && Mem->GetPositionOfMyLocation()==5){
    int max_index  = ChooseReceiverMaxQ(NumActions,actions,action_types,FeatureValues,Confidences);
    int max_action = GetActionNumber(max_index,actions,action_types);
    printf("%d: %d taking action %d\n",Mem->MyNumber,Mem->CurrentTime,
           GetActionNumber(receiver_index,actions,action_types));
    if (max_action != last_max_action){
      printf("%d: %d max = %d (was %d)\n",Mem->MyNumber,Mem->CurrentTime,max_action,
             last_max_action);
      Mem->GetMyQTable()->Write(stdout);
    }
    last_max_action = max_action;
  }
#endif

#if 0  /* To always shoot */
  if ( int_random(100) ){
    receiver_index = NumActions-1;
    actions[receiver_index]       = THEIR_GOAL;
    action_types[receiver_index]  = RL_KNOCK_ACTION;
    FeatureValues[receiver_index] = MIN_ACTION_VALUE;
  }
  else receiver_index = ChooseReceiverRandom(NumActions,actions,action_types,FeatureValues,Confidences);
#endif

  int   receiver        = actions[receiver_index];
  float feature_val     = FeatureValues[receiver_index];
  float receiver_conf   = Confidences[receiver_index];
  float receiver_future = DEFAULT_FUTURE;

  int action_number;   /* Was receiver_position */
  action_number = GetActionNumber(receiver_index,actions,action_types);
  if (action_number == UNKNOWN_POSITION)
    my_error("Should know position of player I'm passing to");

  int game = Mem->CurrentTime/GAME_LENGTH;

#if 0
  static int printed_game = 0;
  if (game>printed_game && Mem->MyNumber==6){
    printf("After Game %d (time %d ):\n",game,Mem->CurrentTime);
    Mem->GetQTable(Mem->GetCurrentFormationType(),5)->Write(stdout);
    fflush(stdout);
    printed_game = game;
  }
#endif

#if POS_5_LEARN
  if (Mem->GetMyPosition()!=5 || Mem->GetPositionOfMyLocation()!=5) game=161;
#endif
  //game=161;

  if (game<=160 || game>=200) Mem->KeepLearning=TRUE;
  else                        Mem->KeepLearning=FALSE;

  if ( Mem->GetBallDistance() <= KICK_DISTANCE && Mem->KeepLearning){  /* else can't actually act */
    /* printf("%d:%d (action %d)\n",Mem->MyNumber,Mem->CurrentTime,receiver_position);*/
    Mem->SetActionState(action_number,feature_val,receiver_conf,receiver_future);
  }

  if (0 && Mem->MyNumber==11){
    printf("%d %d ( %d ) taking action %d (dt: %.2f heur: %.2f )\n",
           Mem->MyNumber, Mem->CurrentTime, Mem->GetPositionOfMyLocation(),
           action_number,receiver_conf,HConfidences[receiver_index]);
    printf("Action DT Heur\n");
    for (int i=0; i<NumActions; i++){
      printf("%d %f %f\n",GetActionNumber(i,actions,action_types),
             Confidences[i],HConfidences[i]);
    }
    printf("[ Opps seen: ");
    for (int i=1; i<=TEAM_SIZE; i++)
      if (Mem->OpponentValid(i))
        printf("%d (%.1f %.1f) : ",i,Mem->GetOpponentGlobalX(i), Mem->GetOpponentGlobalY(i));
    printf(" ]\n");
    printf("[ Teammates seen: ");
    for (int i=1; i<=TEAM_SIZE; i++)
      if (Mem->TeammateValid(i))
        printf("%d - %d (%.1f %.1f) : ",i, Mem->GetPlayerPosition(i),
               Mem->GetTeammateGlobalX(i), Mem->GetTeammateGlobalY(i));
    printf(" ]\n");
    printf("\n\n");
  }

  return receiver_index;
}

int RLforReceiver(int NumActions, int *actions, int *action_types, float *Confidences){
  return RLtest(NumActions,actions,action_types,Confidences);
}