📄 rl.c
字号:
/* rl.c
 * CMUnited-97 (soccer client for Robocup-97)
 * Peter Stone <pstone@cs.cmu.edu>
 * Computer Science Department
 * Carnegie Mellon University
 * Copyright (C) 1997 Peter Stone
 *
 * CMUnited-97 was created by Peter Stone and Manuela Veloso
 *
 * You may copy and distribute this program freely as long as you retain this notice.
 * If you make any changes or have any comments we would appreciate a message.
 */

/* -*- Mode: C -*- */

#include "global.h"

#define DEFAULT_FUTURE    0
#define ONLY_KNOCKS       1
/* for only passes, need to inhibit knocks in knockorpass.c */
#define ONLY_PASSES       0
#define POS_5_LEARN       0
#define GOAL_ONLY_REWARDS 0
#define TRUST_CONF        0
#define BREAK_TIES_MAXQ   1
#define DT_SUCCESS_CUTOFF .734

/* USE RL on teammates with confidences
   not like real RL--don't get to keep acting after your action
   so it's really MA RL -- put the TEAM in a new state
   reinforcement comes indirectly
   Assumes confidences of other actions are independent of success of this one

   Action (pass to position), conf. ==> value
   value depends on success of action and subsequent actions
   so mixes success, value, safety

   could add: players, w/ DT info, give own estimate of their best value
   ==> 2 dim. state.  Train first dim. first?
*/

/****************************************************************************/
/* FutureValueInterval: one node of a singly linked list of half-open       */
/* [FutureMin, FutureMax) intervals, each carrying a Q value and counters.  */
/****************************************************************************/

/* Builds a single, unlinked interval [min, max) with Q value q.
   A q of exactly 0 is treated as "no sample yet" (num and weight start at 0);
   any other q counts as one sample. */
FutureValueInterval::FutureValueInterval(float min, float max, float q)
{
  FutureMin = min;
  FutureMax = max;
  QValue    = q;
  if (q == 0){
    num    = 0;
    weight = 0;
  }
  else {
    num    = 1;
    weight = 1;
  }
  Next = NULL;
}

/* Releases the rest of the list.
   `delete Next` already runs Next's destructor, which frees its own successor
   in turn.  The destructor must not also be invoked explicitly beforehand:
   doing so destroyed every successor twice and double-deleted the tail. */
FutureValueInterval::~FutureValueInterval()
{
  if (Next != NULL){
    delete Next;
  }
}

/* Splits the first interval (starting from this node) whose upper bound lies
   above `min`: that interval shrinks to end at `min` and a new node covering
   [min, old upper bound) with the same Q value is linked in right after it.
   Returns the new node, or NULL (after reporting through my_error) when no
   interval in the list can take the split. */
FutureValueInterval *FutureValueInterval::Insert(float min)
{
  if (min < FutureMax){
    FutureValueInterval *NewInterval = new FutureValueInterval(min,FutureMax,QValue);
    FutureMax = min;             /* Shrink the current interval */
    NewInterval->Next = Next;
    Next = NewInterval;
    return NewInterval;
  }

  if (Next == NULL){
    my_error("Should be able to insert");
    return NULL;                 /* Only reached if my_error returns */
  }

  return Next->Insert(min);      /* Propagate the result of the recursive search */
}

/* Folds Reward into the Q value of the interval containing FutureValue and
   bumps that interval's sample count.  ActionConf is only used by the
   TRUST_CONF variant, which is compiled out while TRUST_CONF is defined as 0.
   Reports through my_error when no interval contains FutureValue. */
void FutureValueInterval::UpdateQValue(float ActionConf, float FutureValue, float Reward)
{
  /* split interval criterion???? */

  if (FutureValue >= FutureMin && FutureValue < FutureMax){
    /* update Q value in noisy environment -- take into account last rewards
       like Kalman in a way --- can't just be equal to latest reward
       (sum also bad)*/
    //printf("%d reward = %.1f\n",Mem->GetMyPosition(),Reward);

    /* Latest counts for at least 5% of reward -- shifting concept */
    /* Makes first 20 count fo 35% of total after next 20, 12% after 40 */
#if TRUST_CONF
    /*printf("%d:%d %.1f %.1f %.1f %.1f ..... ",Mem->MyNumber,Mem->CurrentTime,
      QValue,Reward,weight,(1+ActionConf)/2);*/
/*    if (weight >= 15)
      QValue = weighted_avg(QValue,Reward,14,
                            (ActionConf - MIN_ACTION_VALUE)/
                            (MAX_ACTION_VALUE - MIN_ACTION_VALUE));
    else
      QValue = weighted_avg(QValue,Reward,weight,(ActionConf - MIN_ACTION_VALUE)/2);*/

    //Reward *= 2/(ActionConf+1);
    //Reward -= QMAX;
    Reward /= QMAX;
    Reward += 1;                                 /* Now ranges from 0-2 */
    float Q = Reward/((ActionConf+1)/2 + 1);     /* so ranges from 1-2  */
    //float Q = Reward/((ActionConf+1)/4 + 1);
    /* Q is now the multiplier that should have been used to get the
       actual reward.  Ranges from 0-2 */

    if (num >= 20)
      QValue = weighted_avg(QValue,Q,19,1);
    else
      QValue = weighted_avg(QValue,Q,num,1);
    /* printf("%d:%d %.1f\n",Mem->MyNumber,Mem->CurrentTime,QValue); */

    weight += (ActionConf - MIN_ACTION_VALUE)/2;
#else
    /* The weight given to the old value is capped at 49, so the newest
       reward always keeps a fixed minimum influence. */
    if (num >= 50)
      QValue = weighted_avg(QValue,Reward,49,1);
    else
      QValue = weighted_avg(QValue,Reward,num,1);
#endif
    num++;
  }
  else if (Next == NULL)
    my_error("Can't fit FutureValue into an interval (UpdateQValue)");
  else
    Next->UpdateQValue(ActionConf,FutureValue,Reward);
}

/* Returns the Q value of the interval containing FutureValue.
   When no interval matches, the problem is reported through my_error and 0 is
   returned (NOTE(review): 0 is the value fresh tables are seeded with in this
   file -- confirm it is the desired fallback if my_error returns). */
float FutureValueInterval::GetQValue(float FutureValue)
{
  if (FutureValue >= FutureMin && FutureValue < FutureMax){
    return QValue;
  }

  if (Next == NULL){
    my_error("Can't fit FutureValue into an interval (GetQValue)");
    return 0;
  }

  /* The original dropped this result, leaving the return value undefined
     for every interval except the head of the list. */
  return Next->GetQValue(FutureValue);
}

/* Writes this interval and all following ones on a single line:
   "{[min, max) : q (num weight)}" entries separated by ", " and closed by ".\n". */
void FutureValueInterval::Print(FILE *oStream)
{
  fprintf(oStream,"{[%.1f, %.1f) : %.3f (%d %.1f)}",FutureMin,FutureMax,QValue,num,weight);
  if (Next != NULL){
    fprintf(oStream,", ");
    Next->Print(oStream);
  }
  else
    fprintf(oStream,".\n");
}

/* Reads one entry in the format produced by Print into this node, splitting
   the node when the stored upper bound differs from the current one, then
   continues with the next entry unless the list terminator '.' follows. */
void FutureValueInterval::Load(FILE *oStream)
{
  /* Start from the node's current contents so that a failed or partial
     fscanf leaves the node unchanged instead of storing uninitialised data. */
  float min = FutureMin, max = FutureMax, q = QValue, w = weight;
  int   n   = num;

  fscanf(oStream,"{[%f, %f) : %f (%d %f)}",&min,&max,&q,&n,&w);
  QValue = q;
  num    = n;
  weight = w;
  if ( max != FutureMax )
    Insert(max);

  int chr1;
  if ( (char)(chr1 = getc(oStream)) != '.' ){
    ungetc(chr1,oStream);
    if (Next == NULL)            /* More data than intervals: avoid a NULL dereference */
      my_error("No interval left to load into (FutureValueInterval::Load)");
    else
      Next->Load(oStream);
  }
  else
    fscanf(oStream,"\n");
}

/****************************************************************************/
/* ActionValueInterval: one node of a singly linked list of                 */
/* [ActionMin, ActionMax) intervals, each owning a FutureValueInterval list.*/
/****************************************************************************/

/* Builds the interval [min, max) with a single future-value interval
   [-QMAX, QMAX) seeded with q.  When the interval spans the complete
   [MIN_ACTION_VALUE, MAX_ACTION_VALUE) range it is immediately subdivided
   in steps of .1. */
ActionValueInterval::ActionValueInterval(float min, float max, float q)
{
  ActionMin = min;
  ActionMax = max;
  FutureValueList = new FutureValueInterval(-QMAX,QMAX,q);
  Next = NULL;

  if (min == MIN_ACTION_VALUE && max == MAX_ACTION_VALUE)
    for (float i=min+.1; i<max; i+=.1)
      Insert(i,q);
}

/* Frees the following intervals and this node's future-value list. */
ActionValueInterval::~ActionValueInterval()
{
  if (Next != NULL){
    delete Next;
  }
  delete FutureValueList;
}

/* Splits the first interval (starting from this node) whose upper bound lies
   above `min`; the new node covers [min, old upper bound) and is seeded with q.
   Returns the new node, or NULL (after reporting through my_error) when no
   interval in the list can take the split. */
ActionValueInterval *ActionValueInterval::Insert(float min, float q)
{
  if (min < ActionMax){
    ActionValueInterval *NewInterval = new ActionValueInterval(min,ActionMax,q);
    ActionMax = min;             /* Shrink the current interval */
    NewInterval->Next = Next;
    Next = NewInterval;
    return NewInterval;
  }

  if (Next == NULL){
    my_error("Should be able to insert");
    return NULL;                 /* Only reached if my_error returns */
  }

  return Next->Insert(min,q);    /* Propagate the result of the recursive search */
}

/* Routes a reward to the interval responsible for ActionValue.
   A value more than .1 above the bottom of its interval splits that interval
   (the new node is seeded with Reward) instead of updating it. */
void ActionValueInterval::UpdateQValue(float ActionValue, float ActionConf,
                                       float FutureValue, float Reward)
{
  /* split interval criterion?  For now, one interval per value */

  if (ActionValue < ActionMin+.1 && ActionValue < ActionMax){
    /*  if (ActionValue == ActionMin){ */
    FutureValueList->UpdateQValue(ActionConf, FutureValue, Reward);
  }
  else if (ActionValue < ActionMax){
    /* Seen values always interval bottoms */
    Insert(ActionValue,Reward);
  }
  else if (Next == NULL){
    /* The last interval also accepts its (otherwise excluded) upper bound */
    if (ActionValue == MAX_ACTION_VALUE)
      FutureValueList->UpdateQValue(ActionConf, FutureValue, Reward);
    else
      my_error("Can't fit ActionValue into an interval (UpdateQValue)");
  }
  else
    Next->UpdateQValue(ActionValue,ActionConf,FutureValue,Reward);
}

/* Returns the Q value stored for (ActionValue, FutureValue).
   The last interval also accepts ActionValue == MAX_ACTION_VALUE.  When no
   interval matches, the problem is reported through my_error and 0 is
   returned (NOTE(review): confirm 0 is the desired fallback if my_error
   returns). */
float ActionValueInterval::GetQValue(float ActionValue, float FutureValue)
{
  if (ActionValue >= ActionMin && ActionValue < ActionMax){
    return FutureValueList->GetQValue(FutureValue);
  }

  if (Next == NULL){
    if (ActionValue == MAX_ACTION_VALUE)
      return FutureValueList->GetQValue(FutureValue);

    /* Bounded formatting: three floating point conversions could overrun
       the buffer when written with plain sprintf. */
    char tmp[100];
    snprintf(tmp,sizeof(tmp),
             "Can't fit ActionValue %.3f into an interval (GetQValue): last (%.1f %.1f)",
             ActionValue,ActionMin,ActionMax);
    my_error(tmp);
    return 0;
  }

  /* The original dropped this result, leaving the return value undefined
     for every interval except the head of the list. */
  return Next->GetQValue(ActionValue,FutureValue);
}

/* Sums the sample counts of all future-value intervals of this node. */
int ActionValueInterval::GetNum()
{
  int num = 0;
  FutureValueInterval *FVL = FutureValueList;
  while (FVL != NULL){
    num += FVL->num;
    FVL = FVL->Next;
  }
  return num;
}

/* Writes every interval that holds at least one sample as
   "[min, max) ::: <future value list>", then ".\n" after the last node. */
void ActionValueInterval::Print(FILE *oStream)
{
  if ( GetNum() ){  /* Don't print empty intervals */
    fprintf(oStream,"[%.3f, %.3f) ::: ",ActionMin,ActionMax);
    FutureValueList->Print(oStream);
  }
  if (Next != NULL){
    Next->Print(oStream);
  }
  else
    fprintf(oStream,".\n");
}

/* Reads the "[min, max) ::: " prefix of the next stored interval and hands
   over to the three-argument Load to locate or create the matching node. */
void ActionValueInterval::Load(FILE *oStream)
{
  /* Defaults keep the bounds defined if the prefix cannot be parsed */
  float min = ActionMin, max = ActionMax;
  fscanf(oStream,"[%f, %f) ::: ",&min,&max);
  Load(oStream,min,max);
}

/* Loads the stored interval [min, max) whose prefix has already been read.
   Walks forward (splitting nodes where needed) until a node with exactly
   these bounds exists, loads its future-value list, and continues with the
   next stored interval unless the list terminator '.' follows. */
void ActionValueInterval::Load(FILE *oStream,float min,float max)
{
  if ( min + .00001 >= ActionMax ){
    /* The stored interval starts at or beyond the end of this node */
    if (Next == NULL){           /* Avoid a NULL dereference at the list tail */
      my_error("No interval left to load into (ActionValueInterval::Load)");
      return;
    }
    Next->Load(oStream,min,max);
    return;
  }
  else if ( min > ActionMin ){
    /* Starts inside this node: split at min; the new node becomes Next */
    Insert(min,0);
    Next->Load(oStream,min,max);
    return;
  }
  else if ( (int)(max*10000) != (int)(ActionMax*10000) )
    Insert(max,0);

  if ( fabs(min-ActionMin)>.00001 || fabs(max-ActionMax)>.00001 ) {
    printf("(2) %f min, %f ActionMin, %f max, %f ActionMax\n",min,ActionMin,max,ActionMax);
    my_error("Mins and maxs should line up at this point");
  }

  FutureValueList->Load(oStream);

  int chr1;
  if ( (char)(chr1 = getc(oStream)) != '.' ){
    ungetc(chr1,oStream);
    if (Next == NULL)            /* More data than intervals: avoid a NULL dereference */
      my_error("No interval left to load into (ActionValueInterval::Load)");
    else
      Next->Load(oStream);
  }
  else
    fscanf(oStream,"\n");
}

/****************************************************************************/
/* QTable: one ActionValueInterval list per action for a given              */
/* (formation, position) pair, backed by a file under rlDat/.               */
/****************************************************************************/

/* Creates an empty table for formation `form` / position `pos` and derives
   the name of its data file. */
QTable::QTable(int form, int pos)
{
  NumActions = NUM_RL_ACTIONS;
  MightExist = TRUE;
  Loaded     = FALSE;

  Formation = form;
  Position  = pos;

  for (int i=0; i<NumActions; i++)
    Head[i] = new ActionValueInterval(MIN_ACTION_VALUE,MAX_ACTION_VALUE,0);

  sprintf(dataFileName,"rlDat/rlDat%d-%d.dat",form,pos);
}

/* Frees the interval list of every action. */
QTable::~QTable()
{
  for (int i=0; i<NumActions; i++)
    delete Head[i];
}

/* Forwards a reward to the interval list of action `position`. */
void QTable::UpdateQTable(int position, float FeatureVal, float DTConf,
                          float FutureValue, float Reward)
{
  Head[position]->UpdateQValue(FeatureVal,DTConf,FutureValue,Reward);
}

/* Looks up the Q value of action `position` for (DTConf, FutureValue). */
float QTable::GetQValue(int position, float DTConf, float FutureValue)
{
  return Head[position]->GetQValue(DTConf,FutureValue);
}

/* Total number of samples recorded for `action`. */
int QTable::GetNum(int action)
{
  int num = 0;
  ActionValueInterval *AVI = Head[action];
  while (AVI != NULL){
    num += AVI->GetNum();
    AVI = AVI->Next;
  }
  return num;
}

/* Writes the table: a "Formation/Position" header, then for every action
   that has samples (and always for the last action) a marker line repeating
   the action number 16 times followed by that action's intervals. */
void QTable::Write(FILE *oStream)
{
  fprintf(oStream,"Formation %d\nPosition %d\n\n",Formation,Position);
  for (int i=0; i<NumActions; i++){
    if ( GetNum(i) || i == NUM_RL_ACTIONS-1 ){
      fprintf(oStream,"%2d %2d %2d %2d %2d %2d %2d %2d ",i,i,i,i,i,i,i,i);
      fprintf(oStream,"%2d %2d %2d %2d %2d %2d %2d %2d\n",i,i,i,i,i,i,i,i);
      Head[i]->Print(oStream);
    }
  }
}

/* Reads a table in the format produced by Write.
   Returns TRUE on success.  Returns FALSE when the header is missing, names
   another formation/position, or an action marker line cannot be read or
   names an action outside the table. */
int QTable::Load(FILE *oStream)
{
  /* -1 keeps the comparison below well defined after a partial header match */
  int form = -1, pos = -1;
  if ( fscanf(oStream,"Formation %d\nPosition %d\n\n",&form,&pos) == EOF ){
    /* printf("."); fflush(stdout); */
    return FALSE;
  }
  if (Formation != form || Position != pos ){
    printf("loading wrong form/pos: got Formation %d Position %d\n",form,pos);
    return FALSE;
  }

  int to=0;
  char junk[100];
  while ( to < NUM_RL_ACTIONS-1 ){
    /* %99 bounds the copy into junk; a failed read must stop the loop
       because `to` would otherwise never advance. */
    if ( fscanf(oStream,"%2d %99[^\n]\n",&to,junk) < 1 )
      return FALSE;
    /* if (to != i) my_error("not loading 'to' correctly"); */

    if ( to < 0 || to >= NumActions ){   /* Never index Head[] with unchecked file data */
      my_error("Action index out of range (QTable::Load)");
      return FALSE;
    }

    int chr1;
    if ( (char)(chr1 = getc(oStream)) != '.' ){
      ungetc(chr1,oStream);
      delete Head[to];
      Head[to] = new ActionValueInterval(MIN_ACTION_VALUE,MAX_ACTION_VALUE,0);
      Head[to]->Load(oStream);
    }
  }
  Loaded = TRUE;
  return TRUE;
}

/****************************************************************************/
/* RewardInfo: owns one QTable per (formation, position) and remembers the  */
/* last action taken so that it can be rewarded later.                      */
/****************************************************************************/

/* Allocates all Q tables and clears the record of the last action. */
RewardInfo::RewardInfo()
{
  for (int i=0; i<NUM_FORMATIONS; i++)
    for (int j=0; j<TEAM_SIZE; j++)
      QTables[i][j] = new QTable(i,j);

  QActionTaken    = FALSE;
  KeepLearning    = FALSE;
  QLastFormation  = UNKNOWN_FORMATION_TYPE;
  QLastActionFrom = UNKNOWN_POSITION;
  QLastActionTo   = UNKNOWN_POSITION;
  QLastActionVal  = 0;
  QLastActionConf = 0;
  QLastFutureVal  = 0;
}

/* Frees all Q tables. */
RewardInfo::~RewardInfo()
{
  for (int i=0; i<NUM_FORMATIONS; i++)
    for (int j=0; j<TEAM_SIZE; j++)
      delete QTables[i][j];
}

/* Q table for the current formation and the position of my location. */
QTable *RewardInfo::GetMyQTable()
{
  return GetQTable(Mem->GetCurrentFormationType(),
                   Mem->GetPositionOfMyLocation()); /* Mem->GetMyPosition()); */
}

/* Returns the Q table for (formation, position), (re)loading it from its
   data file first when learning is on, or when it has not been loaded yet
   and its file has not already been found missing. */
QTable *RewardInfo::GetQTable(int formation, int position)
{
  QTable *TheQTable = QTables[formation][position];

  if ( Mem->KeepLearning || (!TheQTable->IsLoaded() && TheQTable->MightExist) ){
    /* If not loading, shouldn't do this every time */
    FILE *rlFile = fopen(TheQTable->dataFileName,"r");
    if ( rlFile == NULL )
      TheQTable->MightExist=FALSE;
    else {
      /* printf("%d Loading from %s\n",Mem->MyNumber,TheQTable->dataFileName); */
      /* A failed load is retried on a freshly opened file */
      while ( TheQTable->Load(rlFile) == FALSE ){
        fclose(rlFile);
        rlFile = fopen(TheQTable->dataFileName,"r");
        if ( rlFile == NULL ){   /* File vanished between attempts: stop retrying */
          TheQTable->MightExist=FALSE;
          break;
        }
      }
      /* TheQTable->Write(stdout);  exit(0); */

      /* Close only a stream that is actually open; the original also called
         fclose when fopen had returned NULL. */
      if ( rlFile != NULL )
        fclose(rlFile);
    }
  }

  return TheQTable;
}

/* Records the action just taken (target, value, confidence, future value)
   together with the score, ball position and time at which it was taken. */
void RewardInfo::SetActionState(int to, float val, float conf, float future)
{
  if ( QActionTaken && Mem->CurrentTime - Time > 20 )
    /* If I was within 2 seconds, assume it's the same action */
    CloseRewards(); /* Finalize decision on quality of last action */

  QLastFormation  = Mem->GetCurrentFormationType();
  QLastActionFrom = Mem->GetPositionOfMyLocation(); /* Mem->GetMyPosition(); */
  QLastActionTo   = to;
  QLastActionVal  = val;
  QLastActionConf = conf;
  QLastFutureVal  = future;

  MyScore    = Mem->MyScore;
  TheirScore = Mem->TheirScore;
  Mem->GetBallGlobalXY(&BallX,&BallY);
  Time = Mem->CurrentTime;

  AvgBallX = BallX;
  AvgBallUpdateTime = Time;

  QActionTaken = TRUE;
}

/* NOTE(review): only the beginning of this function is visible here; the
   code below is reproduced unchanged.  `reward` stays unassigned when the
   mode is BEFORE_KICK_OFF and neither score has changed, and in the default
   case -- check how the remainder of the function uses it. */
void RewardInfo::CloseRewards()
{
  if ( !KeepLearning )
    return;

  if ( !QActionTaken )
    my_error("no action to reward");

  /* Finalize decision on quality of last action: have things improved? */
  QTable *RewardQTable = GetQTable(QLastFormation,QLastActionFrom);

  float reward;
  switch(Mem->PlayMode){
  case MY_KICK_OFF:       reward = -100; break;
  case THEIR_KICK_OFF:    reward = 100;  break;
  case BEFORE_KICK_OFF:
    if ( Mem->MyScore > MyScore )
      reward = 100;
    else if ( Mem->TheirScore > TheirScore )
      reward = -100;
    break;
  case MY_GOAL_KICK:      reward = -10; break;
  case THEIR_GOAL_KICK:   reward = 10;  break;
  case MY_CORNER_KICK:    reward = 25;  break;
  case THEIR_CORNER_KICK: reward = -25; break;
  case MY_KICK_IN:
    reward = 25*(Mem->GetBallGlobalX() + X0)/(2*X0);
    break;
  case THEIR_KICK_IN:
    reward = -25*(X0 - Mem->GetBallGlobalX())/(2*X0);
    break;
  case PLAY_ON:
    if ( AvgBallX > BallX )
      /* Fraction of availabe positive distance from starting point */
      reward = 10*(AvgBallX-BallX)/(X0-BallX);
    else /* AvgBallX <= BallX */
      /* Fraction of availabe negative distance from starting point */
      reward = -10*(BallX-AvgBallX)/(BallX+X0);
    break;
  default:
    char msg[100];
    sprintf(msg,"What mode for getting reward???? (%d)",Mem->PlayMode);
    my_error(msg);
  }

  if (Mem->PlayMode != PLAY_ON){
    /* Lower reward based on how long it took to get there */
    /* Full reward within the first 5 seconds */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -