vfbn1.c

来自「数据挖掘方面的源码」· C语言代码 · 共 1,931 行 · 第 1/5 页
1,931 行
            /* this is the last one that needs adjusting, set it to sum
                          exactly */
            outputValues[i] += targetPij - sum;
            sum = targetPij;
         }
      }

      if(outputValues[i] > 0) {
         plgp += outputValues[i] * log(outputValues[i]);
      }
   }

   MFreePtr(outputValues);

   DebugMessage(1, 4, "  MaxPlgP for row is: %lf\n", plgp);
   return plgp;
}

/* should work as long as no probabilities are greater than 1/e,
   two important observations are that slope increases as move from 1/e
    and it increases faster as p moves towards 0 than towards 1 */
static double _GetMinPLgPCPTRowSum(BeliefNetNode bnn, double *probs, 
                                  double targetPij, double pijkEpsilon) {
   double *outputValues;
   double sum, plgp, maxDelta, neededDelta;
   int i;
   int numSliding, highestActive;

   outputValues = MNewPtr(sizeof(double) * BNNodeGetNumValues(bnn));

   /* do one pass, set output values to maximums & find the current sum */
   sum = 0;
   for(i = 0 ; i < BNNodeGetNumValues(bnn) ; i++) {
      outputValues[i] = min(probs[i] + pijkEpsilon, kOneOverE);
      sum += outputValues[i];
   }

   /* Kinda subtle here, need to move points back from the high end,
        but moving towards the low end needs to be minimized so
        move the highest back till it hits another then move both back, etc */
   numSliding = 1;
   highestActive = BNNodeGetNumValues(bnn) - 1;

   while(sum > targetPij) {
      if(numSliding - 1 == highestActive) {
         /* all sliding */
         maxDelta = min(outputValues[0], 
              outputValues[0] - (probs[0] - pijkEpsilon));
      } else { 
         maxDelta = min(outputValues[highestActive] - 
                    outputValues[highestActive - numSliding], 
          outputValues[highestActive] - (probs[highestActive] - pijkEpsilon));
      }

      neededDelta = (sum - targetPij) / (double)numSliding;

      if(neededDelta <= maxDelta) {
         for(i = highestActive ; i > highestActive - numSliding ; i--) {
            outputValues[i] -= neededDelta;
         }
         sum = targetPij;
      } else {
         /* the delta isn't enough */
         sum -= maxDelta * (double)numSliding;
         for(i = highestActive ; i > highestActive - numSliding ; i--) {
            outputValues[i] -= maxDelta;
         }

         if(outputValues[highestActive] == 
                                    probs[highestActive] - pijkEpsilon) {
            /* that's all the highest can go */
            highestActive--;
            if(numSliding > 1) {
               numSliding--;
            } /* else only one sliding so move on to the next one*/
         } else {
            /* we musta bumped into the next point down */
            numSliding++;
         }
      }
   }

   plgp = 0;
   for(i = BNNodeGetNumValues(bnn) - 1 ; i >= 0 ; i--) {
      if(outputValues[i] > 0) {
         plgp += outputValues[i] * log(outputValues[i]);
      }
   }

   MFreePtr(outputValues);
   DebugMessage(1, 4, "  MinPlgP for row is: %lf\n", plgp);
   return plgp;
}

static double _GetMaxPLgPCPTRowSumOneAbove(BeliefNetNode bnn, double *probs, 
                                  double targetPij, double pijkEpsilon) {
   double *outputValues;
   double sum, plgp, maxDelta;
   int i;

   outputValues = MNewPtr(sizeof(double) * BNNodeGetNumValues(bnn));

   /* do one pass, set output values to minimum & find the current sum */
   /* special case the big one */
   sum = 0;
   for(i = 0 ; i < BNNodeGetNumValues(bnn) - 1 ; i++) {
      outputValues[i] = max(probs[i] - pijkEpsilon, 0);
      sum += outputValues[i];
   }

   i = BNNodeGetNumValues(bnn) - 1;
   outputValues[i] = min(probs[i] + pijkEpsilon, 1.0);
   sum += outputValues[i];

   /* do another pass, moving the larger probs back up till sum is reached */
   /*   and calculate plgp, special case the biggest value */
   plgp = 0;
   for(i = BNNodeGetNumValues(bnn) - 2 ; i >= 0 ; i--) {
      if(sum < targetPij) {
         maxDelta = (probs[i] + pijkEpsilon) - outputValues[i];
         if(sum + maxDelta < targetPij) {
            outputValues[i] += maxDelta;
            sum += maxDelta;
         } else {
            /* this is the last one that needs adjusting, set it to sum
                          exactly */
            outputValues[i] += targetPij - sum;
            sum = targetPij;
         }
      }

      if(outputValues[i] > 0) {
         plgp += outputValues[i] * log(outputValues[i]);
      }
   }

   i = BNNodeGetNumValues(bnn) - 1;
   if(sum != targetPij) {
      /* something kinda odd, the small onces can't ballance the big one
             so we actually need to move the big one back to reach the
             target */

      outputValues[i] = targetPij - (sum - outputValues[i]);
   }

   if(outputValues[i] > 0) {
      plgp += outputValues[i] * log(outputValues[i]);
   }

   MFreePtr(outputValues);

   DebugMessage(1, 4, "  MaxPlgP for row is: %lf\n", plgp);
   return plgp;
}

static double _GetMinPLgPCPTRowSumOneAbove(BeliefNetNode bnn, double *probs, 
                                  double targetPij, double pijkEpsilon) {
   double *outputValues;
   double sum, plgp, maxDelta, neededDelta;
   int i;
   int numSliding, highestActive;

   outputValues = MNewPtr(sizeof(double) * BNNodeGetNumValues(bnn));

   /* do one pass, set output values to maximums & find the current sum */
   /* special case the last one */
   sum = 0;
   for(i = 0 ; i < BNNodeGetNumValues(bnn) - 1 ; i++) {
      outputValues[i] = probs[i] + pijkEpsilon;
      sum += outputValues[i];
   }

   i = BNNodeGetNumValues(bnn) - 1;
   outputValues[i] = probs[i] - pijkEpsilon;
   sum += outputValues[i];

   /* Kinda subtle here, need to move points back from the high end,
        but moving towards the low end needs to be minimized so
        move the highest back till it hits another then move both back, etc */
   numSliding = 1;
   highestActive = BNNodeGetNumValues(bnn) - 2;

   while(sum > targetPij) {
      if(numSliding - 1 == highestActive) {
         /* all sliding */
         maxDelta = min(outputValues[0], 
              outputValues[0] - (probs[0] - pijkEpsilon));
      } else { 
         maxDelta = min(outputValues[highestActive] - 
                    outputValues[highestActive - numSliding], 
          outputValues[highestActive] - (probs[highestActive] - pijkEpsilon));
      }

      neededDelta = (sum - targetPij) / (double)numSliding;

      if(neededDelta <= maxDelta) {
         for(i = highestActive ; i > highestActive - numSliding ; i--) {
            outputValues[i] -= neededDelta;
         }
         sum = targetPij;
      } else {
         /* the delta isn't enough */
         sum -= maxDelta * (double)numSliding;
         for(i = highestActive ; i > highestActive - numSliding ; i--) {
            outputValues[i] -= maxDelta;
         }

         if(outputValues[highestActive] == 
                                    probs[highestActive] - pijkEpsilon) {
            /* that's all the highest can go */
            highestActive--;
            if(numSliding > 1) {
               numSliding--;
            } /* else only one sliding so move on to the next one*/
         } else {
            /* we musta bumped into the next point down */
            numSliding++;
         }
      }
   }

   plgp = 0;
   for(i = BNNodeGetNumValues(bnn) - 1 ; i >= 0 ; i--) {
      if(outputValues[i] > 0) {
         plgp += outputValues[i] * log(outputValues[i]);
      }
   }

   MFreePtr(outputValues);
   DebugMessage(1, 4, "  MinPlgP for row is: %lf\n", plgp);
   return plgp;
}

void _UpdateNodeAveDataLL(BeliefNetNode bnn) {
   int numCPTRows;
   BNNodeUserData data;
   int j, k;
   double numSamples;
   double prob, logCP;

   data = BNNodeGetUserData(bnn);

   /* if this is the current node or is changed from it */
   if(data->isChangedFromCurrent || (data->current == 0)) {
      data->avgDataLL = 0;
      numCPTRows = BNNodeGetNumCPTRows(bnn);
      numSamples = BNNodeGetNumSamples(bnn);

      for(j = 0 ; j < numCPTRows ; j++) {
         /* HACK for efficiency break BNN ADT */
         for(k = 0 ; k < BNNodeGetNumValues(bnn) ; k++) {
            if(bnn->eventCounts[j][k] > 0) {
               prob = bnn->eventCounts[j][k] / numSamples;

               //logCP = log(bnn->eventCounts[j][k] / bnn->rowCounts[j]);
               logCP = log(_CalculateCP(bnn, bnn->eventCounts[j][k],
                                     bnn->rowCounts[j]));

               if(data->p0 > bnn->eventCounts[j][k] / bnn->rowCounts[j]) {
                  data->p0 = bnn->eventCounts[j][k] / bnn->rowCounts[j];
               }

               if(gObservedP0 > bnn->eventCounts[j][k] / bnn->rowCounts[j]) {
                  gObservedP0 = bnn->eventCounts[j][k] / bnn->rowCounts[j];
               }

               data->avgDataLL += prob * logCP;
            }
         }
      }
   }
}

void _UpdateNodeScore(BeliefNetNode bnn) {
   int numCPTRows;
   BNNodeUserData data;
   int j, k;
   double prob = 0, pij;
   double numSamples;
   double pijkEpsilon, pijEpsilon;
   double *probs;
   int canOptimize, canOptimizeOneAbove;

   data = BNNodeGetUserData(bnn);

   /* if this is the current node or is changed from it */
   if(data->isChangedFromCurrent || (data->current == 0)) {
      data->score = 0;
      data->upperBound = 0;
      data->lowerBound = 0;
      numCPTRows = BNNodeGetNumCPTRows(bnn);
      numSamples = BNNodeGetNumSamples(bnn);

      probs = MNewPtr(sizeof(double) * BNNodeGetNumValues(bnn));

      for(j = 0 ; j < numCPTRows ; j++) {
         /* HACK for efficiency break BNN ADT */

         if(bnn->rowCounts[j] > 0) {
            canOptimize = 1;
            canOptimizeOneAbove = -1;
            pijkEpsilon = StatHoeffdingBoundOne(1.0, gDeltaStar,
                                                     bnn->rowCounts[j]);
            pijEpsilon = StatHoeffdingBoundOne(1.0, gDeltaStar, numSamples);
         } else {
            canOptimize = 0;
            canOptimizeOneAbove = -1;
            pijEpsilon = 0;
            pijkEpsilon = 0;
         }

         for(k = 0 ; k < BNNodeGetNumValues(bnn) ; k++) {
            if(numSamples != 0) {
               prob = bnn->eventCounts[j][k] / numSamples;

               probs[k] = prob;

               if(canOptimize && (prob > kOneOverE)) {
                  canOptimize = 0;

                  if((prob - pijkEpsilon > kOneOverE) && 
                         (canOptimizeOneAbove == -1)) {
                     canOptimizeOneAbove = 1;
                  } else {
                     canOptimizeOneAbove = 0;
                     if(prob < kOneOverE) {
                        DebugMessage(1, 4, " lt 1/e and overlaps\n");
                     } else {
                        DebugMessage(1, 4, " gt 1/e and overlaps\n");
                     }
                  }
               }

               DebugMessage(1, 4,
                 "   i: %d j: %d k: %d prob: %f epsilon: %f samples: %f\n",
                   BNNodeGetID(bnn), j, k, prob, 
                   StatHoeffdingBoundOne(1.0, gDeltaStar, bnn->rowCounts[j]),
                                         bnn->rowCounts[j]);
               if(prob != 0) {
                  DebugMessage(1, 4, 
                        "   p_ijk lower %lf score %lf upper %lf samples %lf\n",
                        _MinPLgP(prob, bnn->rowCounts[j]), prob * log(prob), 
                       _MaxPLgP(prob, bnn->rowCounts[j]), bnn->rowCounts[j]);

                  data->score += prob * log(prob);

                  if(data->p0 > bnn->eventCounts[j][k] / bnn->rowCounts[j]) {
                     data->p0 = bnn->eventCounts[j][k] / bnn->rowCounts[j];
                  }

                  if(gObservedP0 > bnn->eventCounts[j][k] / 
                                                   bnn->rowCounts[j]) {
                     gObservedP0 = bnn->eventCounts[j][k] / bnn->rowCounts[j];
                  }


               }
            }
         }

         /* now do the p_ij part */
         if(numSamples != 0) {
            pij = bnn->rowCounts[j] / numSamples;
            DebugMessage(1, 4,
              "row i: %d j: %d pij: %lf epsilon: %lf samples: %f\n",
                BNNodeGetID(bnn), j, pij,
              StatHoeffdingBoundOne(1.0, gDeltaStar, numSamples), numSamples);
            if(pij != 0) {
               DebugMessage(1, 4, 
                  "   p_ij lower %lf score %lf upper %lf samples %lf\n",
               -_MaxPLgP(pij, numSamples), -pij * log(pij),
                  -_MinPLgP(pij, numSamples), numSamples);

               data->score -= pij * log(pij);
            }
         } else {
            /* no samples, just pick something */
            pij = 0;
         }

         /* need to update the bound even if there were no samples
                 or the probability was zero */
         data->upperBound -= _MinPLgP(pij, numSamples);
         data->lowerBound -= _MaxPLgP(pij, numSamples);

         /* now do another pass to get the lower & upper bound for ther row */
         if(canOptimize) {
vfbn1.c - 源码说明

本页面展示了「数据挖掘方面的源码」中的 vfbn1.c 源码文件，采用 C语言编程语言编写，共 1,931 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与数据挖掘相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?