📄 rl.c

📁 足球机器人仿真组CMU97的源码
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* rl.c
 * CMUnited-97 (soccer client for Robocup-97)
 * Peter Stone <pstone@cs.cmu.edu>
 * Computer Science Department
 * Carnegie Mellon University
 * Copyright (C) 1997 Peter Stone
 *
 * CMUnited-97 was created by Peter Stone and Manuela Veloso
 *
 * You may copy and distribute this program freely as long as you retain this notice.
 * If you make any changes or have any comments we would appreciate a message.
 */

/* -*- Mode: C -*- */

#include "global.h"

#define DEFAULT_FUTURE 0
#define ONLY_KNOCKS 1
/* for only passes, need to inhibit knocks in knockorpass.c */
#define ONLY_PASSES 0
#define POS_5_LEARN 0
#define GOAL_ONLY_REWARDS 0
#define TRUST_CONF 0
#define BREAK_TIES_MAXQ 1
#define DT_SUCCESS_CUTOFF .734

/*  USE RL on teammates with confidences
    not like real RL--don't get to keep acting after your action
    so it's really MA RL -- put the TEAM in a new state
    reinforcement comes indirectly

    Assumes confidences of other actions are independent of success of this one

    Action (pass to position), conf.  ==> value
    value depends on success of action and subsequent actions
    so mixes success, value, safety

    could add:  players, w/ DT info, give own estimate of their best value
    ==> 2 dim. state.  Train first dim. first?
*/

/****************************************************************************/
/* FutureValueInterval: one node of a singly linked list of half-open       */
/* [FutureMin, FutureMax) intervals, each carrying a Q value plus the       */
/* sample count (num) and accumulated weight behind it.                     */
/****************************************************************************/

/* Build a single unlinked interval [min, max) with initial Q value q.
   A zero q is treated as "no samples yet" (num = weight = 0); any other
   q counts as one sample of weight one. */
FutureValueInterval::FutureValueInterval(float min, float max, float q){
  FutureMin = min;
  FutureMax = max;
  QValue    = q;
  if (q == 0){
    num    = 0;
    weight = 0;
  }
  else {
    num    = 1;
    weight = 1;
  }
  Next = NULL;
}

/* Destroy the rest of the list.
   `delete Next` already runs Next's destructor (which deletes its own
   successor in turn).  The explicit Next->~FutureValueInterval() call that
   used to precede it destroyed the successor twice and double-deleted
   everything after it. */
FutureValueInterval::~FutureValueInterval(){
  if (Next != NULL){
    delete Next;
  }
}

/* Split the interval that contains `min` so that a new interval starts at
   `min`.  The new node inherits the upper bound and Q value of the interval
   it was carved from.
   Returns the new node, or NULL when no interval in the list can hold `min`
   (after reporting through my_error). */
FutureValueInterval *FutureValueInterval::Insert(float min){
  if (min < FutureMax){
    FutureValueInterval *NewInterval = new FutureValueInterval(min,FutureMax,QValue);
    FutureMax = min; /* Shrink the current interval */
    NewInterval->Next = Next;
    Next = NewInterval;
    return NewInterval;
  }
  if (Next == NULL){
    my_error("Should be able to insert");
    return NULL;  /* defined result in case my_error returns */
  }
  return Next->Insert(min);  /* the recursive result used to be dropped */
}

/* Fold Reward into the Q value of the interval containing FutureValue.
   ActionConf is only used when TRUST_CONF is enabled.
   Reports through my_error when FutureValue fits no interval. */
void FutureValueInterval::UpdateQValue(float ActionConf, float FutureValue, float Reward){
  /* split interval criterion???? */
  if (FutureValue >= FutureMin && FutureValue < FutureMax){
    /* update Q value in noisy environment -- take into account last rewards
       like Kalman in a way --- can't just be equal to latest reward (sum also bad) */
    /* printf("%d reward = %.1f\n",Mem->GetMyPosition(),Reward); */

    /* Latest counts for at least 5% of reward -- shifting concept      */
    /* Makes first 20 count for 35% of total after next 20, 12% after 40 */
    /* NOTE(review): the two notes above are the original author's; the
       percentages appear to match the 19:1 cap of the TRUST_CONF branch
       rather than the 49:1 cap of the default branch -- confirm against
       weighted_avg. */
#if TRUST_CONF
    /* Earlier experiments (disabled in the original source):
         - weight the new reward by the normalised ActionConf, capping the
           old weight at 14 once weight >= 15;
         - Reward *= 2/(ActionConf+1);
         - Reward -= QMAX;
         - Q = Reward/((ActionConf+1)/4 + 1);                              */
    Reward /= QMAX;
    Reward += 1; /* Now ranges from 0-2 */
    float Q = Reward/((ActionConf+1)/2 + 1);  /* so ranges from 1-2 */
    /* Q is now the multiplier that should have been used to get the actual
       reward.  Ranges from 0-2 */
    if (num >= 20)
      QValue = weighted_avg(QValue,Q,19,1);
    else
      QValue = weighted_avg(QValue,Q,num,1);
    /* printf("%d:%d %.1f\n",Mem->MyNumber,Mem->CurrentTime,QValue); */
    weight += (ActionConf - MIN_ACTION_VALUE)/2;
#else
    /* The old estimate is weighted by the sample count, capped at 49,
       against a weight of 1 for the new reward. */
    if (num >= 50)
      QValue = weighted_avg(QValue,Reward,49,1);
    else
      QValue = weighted_avg(QValue,Reward,num,1);
#endif
    num++;
  }
  else if (Next == NULL)
    my_error("Can't fit FutureValue into an interval (UpdateQValue)");
  else
    Next->UpdateQValue(ActionConf,FutureValue,Reward);
}

/* Return the Q value of the interval containing FutureValue.
   When no interval matches, reports through my_error and returns 0. */
float FutureValueInterval::GetQValue(float FutureValue){
  if (FutureValue >= FutureMin && FutureValue < FutureMax){
    return QValue;
  }
  if (Next == NULL){
    my_error("Can't fit FutureValue into an interval (GetQValue)");
    return 0;  /* defined result in case my_error returns */
  }
  return Next->GetQValue(FutureValue);  /* the recursive result used to be dropped */
}

/* Write this interval and all following ones on a single line:
   records separated by ", " and terminated by ".\n". */
void FutureValueInterval::Print(FILE *oStream){
  fprintf(oStream,"{[%.1f, %.1f) : %.3f (%d %.1f)}",FutureMin,FutureMax,QValue,num,weight);
  if (Next != NULL){
    fprintf(oStream,", ");
    Next->Print(oStream);
  }
  else
    fprintf(oStream,".\n");
}

/* Read one record in the format written by Print into this node, splitting
   the node when the stored upper bound differs from the current one, then
   continue with the next node unless the record is followed by '.'.
   A malformed record or a missing successor is reported through my_error
   instead of using uninitialised values or dereferencing NULL. */
void FutureValueInterval::Load(FILE *oStream){
  float min, max, q, w;
  int n;
  if ( fscanf(oStream,"{[%f, %f) : %f (%d %f)}",&min,&max,&q,&n,&w) != 5 ){
    my_error("Malformed FutureValueInterval record (Load)");
    return;
  }
  QValue = q;
  num = n;
  weight = w;
  if ( max != FutureMax )
    Insert(max);

  int chr1;
  if ( (char)(chr1 = getc(oStream)) != '.' ){
    ungetc(chr1,oStream);
    if ( Next == NULL ){
      my_error("No FutureValueInterval left to load into (Load)");
      return;
    }
    Next->Load(oStream);
  }
  else
    fscanf(oStream,"\n");
}

/****************************************************************************/
/* ActionValueInterval: one node of a singly linked list of half-open       */
/* [ActionMin, ActionMax) intervals, each owning a FutureValueInterval      */
/* list.                                                                    */
/****************************************************************************/

/* Build an interval [min, max) whose future-value list is the single
   interval [-QMAX, QMAX) with Q value q.  When the interval spans the whole
   action range it is immediately subdivided in steps of .1. */
ActionValueInterval::ActionValueInterval(float min, float max, float q){
  ActionMin = min;
  ActionMax = max;
  FutureValueList = new FutureValueInterval(-QMAX,QMAX,q);
  Next      = NULL;
  if (min == MIN_ACTION_VALUE && max == MAX_ACTION_VALUE)
    for (float i=min+.1; i<max; i+=.1)
      Insert(i,q);
}

/* Release the following intervals and the owned future-value list. */
ActionValueInterval::~ActionValueInterval(){
  if (Next != NULL){
    delete Next;
  }
  delete FutureValueList;
}

/* Split the interval that contains `min` so that a new interval with
   initial Q value q starts at `min`.
   Returns the new node, or NULL when no interval in the list can hold `min`
   (after reporting through my_error). */
ActionValueInterval *ActionValueInterval::Insert(float min, float q){
  if (min < ActionMax){
    ActionValueInterval *NewInterval = new ActionValueInterval(min,ActionMax,q);
    ActionMax = min; /* Shrink the current interval */
    NewInterval->Next = Next;
    Next = NewInterval;
    return NewInterval;
  }
  if (Next == NULL){
    my_error("Should be able to insert");
    return NULL;  /* defined result in case my_error returns */
  }
  return Next->Insert(min,q);  /* the recursive result used to be dropped */
}

/* Route a reward to the interval matching ActionValue.
   - Within .1 of this interval's bottom (and below its top): update the
     owned future-value list.
   - Otherwise inside this interval: split it at ActionValue, seeding the
     new interval with Reward.
   - At the end of the list, only ActionValue == MAX_ACTION_VALUE is
     accepted; anything else is reported through my_error. */
void ActionValueInterval::UpdateQValue(float ActionValue, float ActionConf, float FutureValue, float Reward){
  /* split interval criterion?  For now, one interval per value */
  if (ActionValue < ActionMin+.1 && ActionValue < ActionMax){   /* if (ActionValue == ActionMin){ */
    FutureValueList->UpdateQValue(ActionConf, FutureValue, Reward);
  }
  else if (ActionValue < ActionMax){  /* Seen values always interval bottoms */
    Insert(ActionValue,Reward);
  }
  else if (Next == NULL){
    if (ActionValue == MAX_ACTION_VALUE)
      FutureValueList->UpdateQValue(ActionConf, FutureValue, Reward);
    else
      my_error("Can't fit ActionValue into an interval (UpdateQValue)");
  }
  else
    Next->UpdateQValue(ActionValue,ActionConf,FutureValue,Reward);
}

/* Return the Q value stored for (ActionValue, FutureValue).
   The last interval also accepts ActionValue == MAX_ACTION_VALUE.  When no
   interval matches, reports through my_error and returns 0. */
float ActionValueInterval::GetQValue(float ActionValue, float FutureValue){
  if (ActionValue >= ActionMin && ActionValue < ActionMax){
    return FutureValueList->GetQValue(FutureValue);
  }
  if (Next == NULL){
    if (ActionValue == MAX_ACTION_VALUE)
      return FutureValueList->GetQValue(FutureValue);

    char tmp[100];
    sprintf(tmp,"Can't fit ActionValue %.3f into an interval (GetQValue): last (%.1f %.1f)",
            ActionValue,ActionMin,ActionMax);
    my_error(tmp);
    return 0;  /* defined result in case my_error returns */
  }
  return Next->GetQValue(ActionValue,FutureValue);  /* the recursive result used to be dropped */
}

/* Total number of samples recorded in this interval's future-value list. */
int ActionValueInterval::GetNum(){
  int num = 0;
  FutureValueInterval *FVL = FutureValueList;
  while (FVL != NULL){
    num += FVL->num;
    FVL = FVL->Next;
  }
  return num;
}

/* Write every non-empty interval from here on, one per line, and close the
   list with ".\n". */
void ActionValueInterval::Print(FILE *oStream){
  if ( GetNum() ){  /* Don't print empty intervals */
    fprintf(oStream,"[%.3f, %.3f) ::: ",ActionMin,ActionMax);
    FutureValueList->Print(oStream);
  }
  if (Next != NULL){
    Next->Print(oStream);
  }
  else
    fprintf(oStream,".\n");
}

/* Read the "[min, max) ::: " prefix of the next record and hand the rest to
   the three-argument overload, starting the search at this node. */
void ActionValueInterval::Load(FILE *oStream){
  float min, max;
  if ( fscanf(oStream,"[%f, %f) ::: ",&min,&max) != 2 ){
    my_error("Malformed ActionValueInterval record (Load)");
    return;
  }
  Load(oStream,min,max);
}

/* Find (creating it by splitting if necessary) the interval [min, max) and
   load its future-value list from oStream, then continue with the next
   record unless the list terminator '.' follows. */
void ActionValueInterval::Load(FILE *oStream,float min,float max){
  if ( min + .00001 >= ActionMax ){
    /* The record belongs further down the list. */
    if ( Next == NULL ){
      my_error("No ActionValueInterval left to load into (Load)");
      return;
    }
    Next->Load(oStream,min,max);
    return;
  }
  else if ( min > ActionMin ){
    /* Record starts inside this interval: split here and load into the
       new node.  min < ActionMax holds at this point, so Insert links the
       new node in as Next. */
    Insert(min,0);
    Next->Load(oStream,min,max);
    return;
  }
  else if ( (int)(max*10000) != (int)(ActionMax*10000) )
    Insert(max,0);

  if ( fabs(min-ActionMin)>.00001 || fabs(max-ActionMax)>.00001 ) {
    printf("(2) %f min, %f ActionMin, %f max, %f ActionMax\n",min,ActionMin,max,ActionMax);
    my_error("Mins and maxs should line up at this point");
  }

  FutureValueList->Load(oStream);

  int chr1;
  if ( (char)(chr1 = getc(oStream)) != '.' ){
    ungetc(chr1,oStream);
    if ( Next == NULL ){
      my_error("No ActionValueInterval left to load into (Load)");
      return;
    }
    Next->Load(oStream);
  }
  else
    fscanf(oStream,"\n");
}

/****************************************************************************/
/* QTable: one ActionValueInterval list per RL action for a given           */
/* (formation, position) pair, persisted in rlDat/rlDat<form>-<pos>.dat.    */
/****************************************************************************/

/* Create an empty table for formation `form` and position `pos`. */
QTable::QTable(int form, int pos){
  NumActions = NUM_RL_ACTIONS;
  MightExist = TRUE;
  Loaded     = FALSE;
  Formation  = form;
  Position   = pos;
  for (int i=0; i<NumActions; i++)
    Head[i] = new ActionValueInterval(MIN_ACTION_VALUE,MAX_ACTION_VALUE,0);
  sprintf(dataFileName,"rlDat/rlDat%d-%d.dat",form,pos);
}

/* Release every per-action list. */
QTable::~QTable(){
  for (int i=0; i<NumActions; i++)
    delete Head[i];
}

/* Record Reward for action `position` at the given feature value. */
void QTable::UpdateQTable(int position, float FeatureVal, float DTConf, float FutureValue, float Reward){
  Head[position]->UpdateQValue(FeatureVal,DTConf,FutureValue,Reward);
}

/* Look up the Q value of action `position`; DTConf is passed on as the
   action value. */
float QTable::GetQValue(int position, float DTConf, float FutureValue){
  return Head[position]->GetQValue(DTConf,FutureValue);
}

/* Total number of samples recorded for `action` over all its intervals. */
int QTable::GetNum(int action){
  int num = 0;
  ActionValueInterval *AVI = Head[action];
  while (AVI != NULL){
    num += AVI->GetNum();
    AVI = AVI->Next;
  }
  return num;
}

/* Write the table: a Formation/Position header, then for every action that
   has samples a line repeating the action index sixteen times followed by
   its interval list.  The last action is always written because Load stops
   on seeing its index. */
void QTable::Write(FILE *oStream){
  fprintf(oStream,"Formation %d\nPosition %d\n\n",Formation,Position);
  for (int i=0; i<NumActions; i++){
    if ( GetNum(i) || i == NUM_RL_ACTIONS-1 ){
      fprintf(oStream,"%2d %2d %2d %2d %2d %2d %2d %2d ",i,i,i,i,i,i,i,i);
      fprintf(oStream,"%2d %2d %2d %2d %2d %2d %2d %2d\n",i,i,i,i,i,i,i,i);
      Head[i]->Print(oStream);
    }
  }
}

/* Read a table written by Write.
   Returns TRUE on success.  Returns FALSE when the header cannot be read,
   when it names another formation/position, or when an action header line
   is missing or carries an out-of-range index. */
int QTable::Load(FILE *oStream){
  int form,pos;
  /* Require both fields: a partial match used to leave form/pos
     uninitialised. */
  if ( fscanf(oStream,"Formation %d\nPosition %d\n\n",&form,&pos) != 2 ){
    /* printf(".");
       fflush(stdout); */
    return FALSE;
  }
  if (Formation != form || Position != pos ){
    printf("loading wrong form/pos: got Formation %d Position %d\n",form,pos);
    return FALSE;
  }

  int to=0;
  char junk[100];
  while ( to < NUM_RL_ACTIONS-1 ){
    /* %99[ bounds the read to junk[100].  Bail out on a missing header line
       (otherwise `to` never advances and the loop spins at EOF) and on an
       index that would fall outside Head[]. */
    if ( fscanf(oStream,"%2d %99[^\n]\n",&to,junk) < 1 || to < 0 || to >= NumActions )
      return FALSE;
    /* if (to != i) my_error("not loading 'to' correctly"); */

    int chr1;
    if ( (char)(chr1 = getc(oStream)) != '.' ){
      ungetc(chr1,oStream);
      delete Head[to];
      Head[to] = new ActionValueInterval(MIN_ACTION_VALUE,MAX_ACTION_VALUE,0);
      Head[to]->Load(oStream);
    }
  }
  Loaded = TRUE;
  return TRUE;
}

/****************************************************************************/
/* RewardInfo: owns one QTable per (formation, position) and remembers the  */
/* last action taken so that it can be rewarded later.                      */
/****************************************************************************/

/* Allocate every QTable and clear the record of the last action. */
RewardInfo::RewardInfo(){
  for (int i=0; i<NUM_FORMATIONS; i++)
    for (int j=0; j<TEAM_SIZE; j++)
      QTables[i][j] = new QTable(i,j);

  QActionTaken    = FALSE;
  KeepLearning    = FALSE;

  QLastFormation  = UNKNOWN_FORMATION_TYPE;
  QLastActionFrom = UNKNOWN_POSITION;
  QLastActionTo   = UNKNOWN_POSITION;
  QLastActionVal  = 0;
  QLastActionConf = 0;
  QLastFutureVal  = 0;
}

/* Release every QTable. */
RewardInfo::~RewardInfo(){
  for (int i=0; i<NUM_FORMATIONS; i++)
    for (int j=0; j<TEAM_SIZE; j++)
      delete QTables[i][j];
}

/* Table for the current formation and the position of my location. */
QTable *RewardInfo::GetMyQTable(){
  return GetQTable(Mem->GetCurrentFormationType(),
                   Mem->GetPositionOfMyLocation());
                   /* Mem->GetMyPosition()); */
}

/* Return the table for (formation, position), (re)loading it from its data
   file when learning is on or when it has not been loaded yet and the file
   might exist.  Loading is retried with a freshly opened file until it
   succeeds or the file can no longer be opened. */
QTable *RewardInfo::GetQTable(int formation, int position){
  QTable *TheQTable = QTables[formation][position];

  if ( Mem->KeepLearning || (!TheQTable->IsLoaded() && TheQTable->MightExist) ){
    /* If not loading, shouldn't do this every time */
    FILE *rlFile = fopen(TheQTable->dataFileName,"r");
    if ( rlFile == NULL )
      TheQTable->MightExist=FALSE;
    else {
      /* printf("%d Loading from %s\n",Mem->MyNumber,TheQTable->dataFileName); */
      while ( TheQTable->Load(rlFile) == FALSE ){
        fclose(rlFile);
        rlFile = fopen(TheQTable->dataFileName,"r");
        if ( rlFile == NULL ){
          /* The file disappeared between attempts: stop instead of
             handing a NULL stream to Load. */
          TheQTable->MightExist=FALSE;
          break;
        }
      }
      /* TheQTable->Write(stdout);
         exit(0); */
      /* Only close a stream that is actually open; fclose(NULL) is
         undefined and used to be reached whenever fopen failed. */
      if ( rlFile != NULL )
        fclose(rlFile);
    }
  }
  return TheQTable;
}

/* Remember the action just taken (target, value, confidence, future value)
   together with the score, ball position and time at which it was taken.
   A pending action older than 20 time units is closed out first. */
void RewardInfo::SetActionState(int to, float val, float conf, float future){
  if ( QActionTaken && Mem->CurrentTime - Time > 20 )
    /* If I was within 2 seconds, assume it's the same action */
    CloseRewards();  /* Finalize decision on quality of last action */

  QLastFormation  = Mem->GetCurrentFormationType();
  QLastActionFrom = Mem->GetPositionOfMyLocation(); /* Mem->GetMyPosition(); */
  QLastActionTo   = to;
  QLastActionVal  = val;
  QLastActionConf = conf;
  QLastFutureVal  = future;

  MyScore    = Mem->MyScore;
  TheirScore = Mem->TheirScore;
  Mem->GetBallGlobalXY(&BallX,&BallY);
  Time       = Mem->CurrentTime;

  AvgBallX          = BallX;
  AvgBallUpdateTime = Time;

  QActionTaken         = TRUE;
}

void RewardInfo::CloseRewards(){
  if ( !KeepLearning )
    return;

  if ( !QActionTaken ) my_error("no action to reward");

  /* Finalize decision on quality of last action: have things improved? */
  QTable *RewardQTable = GetQTable(QLastFormation,QLastActionFrom);

  float reward;
  switch(Mem->PlayMode){
  case MY_KICK_OFF:
    reward = -100;
    break;
  case THEIR_KICK_OFF:
    reward = 100;
    break;
  case BEFORE_KICK_OFF:
    if ( Mem->MyScore > MyScore )
      reward = 100;
    else if ( Mem->TheirScore > TheirScore )
      reward = -100;
    break;
  case MY_GOAL_KICK:
    reward = -10;
    break;
  case THEIR_GOAL_KICK:
    reward = 10;
    break;
  case MY_CORNER_KICK:
    reward = 25;
    break;
  case THEIR_CORNER_KICK:
    reward = -25;
    break;
  case MY_KICK_IN:
    reward = 25*(Mem->GetBallGlobalX() + X0)/(2*X0);
    break;
  case THEIR_KICK_IN:
    reward = -25*(X0 - Mem->GetBallGlobalX())/(2*X0);
    break;
  case PLAY_ON:
    if ( AvgBallX > BallX )
      /* Fraction of availabe positive distance from starting point */
      reward = 10*(AvgBallX-BallX)/(X0-BallX);
    else /* AvgBallX <= BallX */
      /* Fraction of availabe negative distance from starting point */
      reward = -10*(BallX-AvgBallX)/(BallX+X0);
    break;
  default:
    char msg[100];
    sprintf(msg,"What mode for getting reward???? (%d)",Mem->PlayMode);
    my_error(msg);
  }

  if (Mem->PlayMode != PLAY_ON){
    /* Lower reward based on how long it took to get there */
    /* Full reward within the first 5 seconds              */
12 下一页
💿 文件大小 199 K
👤 上传用户 SLing2008
📂 所属分类 Linux/Unix编程
🏷️ 相关标签

#CMU #97 #足球机器人 #仿真
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -