⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 tabdelim.cpp

📁 orange源码 数据挖掘技术
💻 CPP
📖 第 1 页 / 共 3 页
字号:
       - sets varType to TValue::varType or -1 if the type is not specified and -2 if it's a basket
       - sets classPos/basketPos to the current position, if the attribute is class/basket attribute
         (and reports an error if there is more than one such attribute)
       - to attributeTypes, appends -1 for ordinary atributes, 1 for metas and 0 for ignored or baskets*/
    int varType = -1; // varType, or -1 for unnown, -2 for basket
    attributeTypes->push_back(-1);
    int &attributeType = attributeTypes->back();

    const char *cptr = (*ni).c_str();
    if (*cptr && (cptr[1]=='#')) {
      if (*cptr == 'm')
        attributeType = 1;
      else if (*cptr == 'i')
        attributeType = 0;
      else if (*cptr == 'c') {
        if (classPos>-1)
          ::raiseError("more than one attribute marked as class");
        else
          classPos = ni-varNames.begin();
      }

      else if (*cptr == 'D')
        varType = TValue::INTVAR;
      else if (*cptr == 'C')
        varType = TValue::FLOATVAR;
      else if (*cptr == 'S')
        varType = STRINGVAR;
      else if (*cptr == 'B') {
          varType = -2;
          attributeType = 0;
          if (basketPos > -1)
            ::raiseError("more than one basket attribute");
          else
            basketPos = ni - varNames.begin();
      }
      else
        ::raiseError("unrecognized flags in attribute name '%s'", cptr);

      *ni = string(cptr+2);
    }

    else if (*cptr && cptr[1] && (cptr[2]=='#')) {
      bool beenWarned = false;
      if (*cptr == 'm')
        attributeType = 1;
      else if (*cptr == 'i')
        attributeType = 0;
      else if (*cptr == 'c') {
        if (classPos>-1)
          ::raiseError("more than one attribute marked as class");
        else
          classPos = ni-varNames.begin();
      }
      else
        ::raiseError("unrecognized flags in attribute name '%s'", cptr);

      cptr++;
      if (*cptr == 'D')
        varType = TValue::INTVAR;
      else if (*cptr == 'C')
        varType = TValue::FLOATVAR;
      else if (*cptr == 'S')
        varType = STRINGVAR;
      else if (*cptr == 'B') {
        if (attributeType) { // basket can be ignored, too
          varType = -2;
          attributeType = 0;  // this is ugly, but baskets are a patch anyway: if not ignored by 'i' flag, it's ignored by 'basket'
          if (basketPos > -1)
            ::raiseError("there can only be one basket attribute");
          else
            basketPos = ni - varNames.begin();
        }
      }
      else
        ::raiseError("unrecognized flags in attribute name '%s'", cptr);

      // remove the prefix (we have already increased cptr once)
      *ni = string(cptr+2);
    }

    /* If the attribute is not to be ignored, we attempt to either find its descriptor
       among the known attributes or create a new attribute if the type is given.
       For ordinary attributes, the descriptor (or PVariable()) is pushed to the list of 'variables'.
       For meta attributes, a meta descriptor is pushed to 'metas'. If the attribute was used as
       meta-attribute in some of known domains, the id is reused; otherwise a new id is created.
       If the descriptor was nor found nor created, a warranty is issued.
    */
      
    if ((classPos == ni-varNames.begin())) {
      classType = varType;
    }
    else {
      if (attributeType == 1) {
        metas.push_back(TDomainDepot::TAttributeDescription(*ni, varType));
        if (varType==-1)
          searchWarranties.push_back(TSearchWarranty(ni-varNames.begin(), -(signed int)(metas.size())));
      }
      else if (attributeType) {
        lastRegular = ni-varNames.begin();
        attributeDescriptions.push_back(TDomainDepot::TAttributeDescription(*ni, varType));
        if (varType==-1)
          searchWarranties.push_back(TSearchWarranty(ni-varNames.begin(), attributeType==-2 ? -1 : attributeDescriptions.size()-1));
      }
    }
  }

  if (classPos > -1) {
    attributeDescriptions.push_back(TDomainDepot::TAttributeDescription(varNames[classPos], classType));
    if (classType<0)
      searchWarranties.push_back(TSearchWarranty(classPos, attributeDescriptions.size()-1));
  }
  else {
    if (!noClass)
      classPos = lastRegular;
  }

  if (!searchWarranties.empty()) {
    vector<string> atoms;
    char numTest[64];
    while (!feof(fei.file) && !searchWarranties.empty()) {
      // seek to the next line non-empty non-comment line
      if (readTabAtom(fei, atoms, true, csv) <= 0)
        continue;
    
      for(list<TSearchWarranty>::iterator wi(searchWarranties.begin()), we(searchWarranties.end()); wi!=we; wi++) {
        if ((*wi).posInFile >= atoms.size())
          continue;
//          raiseError("line %i too short", fei.line);

        const string &atom = atoms[(*wi).posInFile];

        // only discrete attributes can have values longer than 63 characters
        if (atom.length()>63) {
          if ((*wi).posInDomain<0)
            metas[-(*wi).posInDomain - 1].varType = TValue::INTVAR;
          else
            attributeDescriptions[(*wi).posInDomain].varType = TValue::INTVAR;
          wi = searchWarranties.erase(wi);
          wi--;
          continue;
        }

        const char *ceni = atom.c_str();
        if (   !*ceni
            || !ceni[1] && ((*ceni=='?') || (*ceni=='.') || (*ceni=='~') || (*ceni=='*') || (*ceni=='-'))
            || (atom == "NA") || (DC && (atom == DC)) || (DK && (atom == DK)))
          continue;

        // we have encountered some value
        if ((*wi).suspectedType == 3) 
          (*wi).suspectedType = 2;

        // If the attribute is a digit, it can be anything
        if ((!ceni[1]) && (*ceni>='0') && (*ceni<='9'))
          continue;

        // If it is longer than one character, it cannot be a coded discrete
        if (ceni[1])
          (*wi).suspectedType = 1;

        // Convert commas into dots
        strcpy(numTest, ceni);
        for(char *sc = numTest; *sc; sc++)
          if (*sc == ',')
            *sc = '.';

        // If the attribute cannot be converted into a number, it is enum
        char *eptr;
        strtod(numTest, &eptr);
        while (*eptr==32)
          eptr++;
        if (*eptr) {
          if ((*wi).posInDomain<0)
            metas[-(*wi).posInDomain - 1].varType = TValue::INTVAR;
          else
            attributeDescriptions[(*wi).posInDomain].varType = TValue::INTVAR;
          wi = searchWarranties.erase(wi);
          wi--;
          continue;
        }
      }
    }


    ITERATE(list<TSearchWarranty>, wi, searchWarranties) {
      const string &name = varNames[(*wi).posInFile];
      if ((*wi).suspectedType == 3)
        raiseWarning("cannot determine type for attribute '%s'; the attribute will be ignored", name.c_str());

      int type = (*wi).suspectedType == 2 && !noCodedDiscrete ? TValue::INTVAR : TValue::FLOATVAR;
      if ((*wi).posInDomain<0)
        metas[-(*wi).posInDomain - 1].varType = type;
      else
        attributeDescriptions[(*wi).posInDomain].varType = type;
    }

    for(int i = 0; i < attributeDescriptions.size(); )
      if (attributeDescriptions[i].varType == -1)
        attributeDescriptions.erase(attributeDescriptions.begin() + i);
      else
        i++;
  }


  if (basketPos >= 0)
    basketFeeder = mlnew TBasketFeeder(sourceDomain, dontCheckStored, false);
    
  if (sourceDomain) {
    if (!domainDepot_txt.checkDomain(sourceDomain.AS(TDomain), &attributeDescriptions, classPos>-1, NULL))
      raiseError("given domain does not match the file");
    else {
      if (basketFeeder)
        basketFeeder->domain = sourceDomain;
      return sourceDomain;
    }
  }

  int *metaIDs = mlnew int[metas.size()];
  PDomain newDomain = domainDepot_txt.prepareDomain(&attributeDescriptions, classPos>-1, &metas, sourceVars, sourceMetas, false, dontCheckStored, NULL, metaIDs);

  int *mid = metaIDs;
  PITERATE(TIntList, ii, attributeTypes)
    if (*ii == 1)
      *ii = *(mid++);

  mldelete metaIDs;

  if (basketFeeder)
    basketFeeder->domain = newDomain;

  return newDomain;
}


/* Builds the domain from a file with an explicit three-line header: attribute
   names, attribute types, and (optional) per-attribute flags.

   Parameters:
     stem            - passed to TFileExampleIteratorData to open the file
     sourceVars,
     sourceMetas     - forwarded unchanged to domainDepot_tab.prepareDomain
     sourceDomain    - if given, the header is only checked against it (via
                       domainDepot_tab.checkDomain) and this domain is returned
     dontCheckStored - forwarded to TBasketFeeder and prepareDomain

   Returns sourceDomain when it is given and matches the header; otherwise the
   domain produced by domainDepot_tab.prepareDomain.

   Members written: classPos, basketPos, headerLines (set to 3), attributeTypes,
   DCs (for "dc" flag options) and, when a basket column exists, basketFeeder.

   attributeTypes gets one entry per column: it starts as -1, is set to 0 for
   columns that are skipped/ignored or hold the basket, and to 1 for meta
   attributes; the 1s are finally replaced by the ids that prepareDomain
   stores in metaIDs.

   Errors are reported through raiseError for: an empty file, a missing type
   line, a names/types length mismatch, more flags than names, more than one
   direct flag on an attribute, more than one class or basket attribute,
   an empty type, and a sourceDomain that does not match the header.
   NOTE(review): the code is written as if raiseError does not return
   (presumably it throws) -- confirm. */
PDomain TTabDelimExampleGenerator::domainWithoutDetection(const string &stem, PVarList sourceVars, TMetaVector *sourceMetas, PDomain sourceDomain, bool dontCheckStored)
{
  TFileExampleIteratorData fei(stem);
  
  vector<string> varNames, varTypes, varFlags;
  
  // Each header line is re-read for as long as readTabAtom reports -1.
  // NOTE(review): -1 presumably marks a line to be skipped (comment/empty) -- confirm against readTabAtom.
  while(!feof(fei.file) && (readTabAtom(fei, varNames, true, csv) == -1));
  if (varNames.empty())
    ::raiseError("empty file");

  while(!feof(fei.file) && (readTabAtom(fei, varTypes, false, csv) == -1));
  if (varTypes.empty())
    ::raiseError("cannot read types of attributes");

  while(!feof(fei.file) && (readTabAtom(fei, varFlags, true, csv) == -1));

  if (varNames.size() != varTypes.size())
    ::raiseError("mismatching number of attributes and their types.");
  if (varNames.size() < varFlags.size())
    ::raiseError("too many flags (third line too long)");
  // The flags line may be shorter than the names line; pad it so that the
  // three header vectors can be walked in lock-step below.
  while (varFlags.size() < varNames.size())
    varFlags.push_back("");

  TDomainDepot::TAttributeDescriptions attributeDescriptions, metas;
  TDomainDepot::TAttributeDescription classDescription("", 0);
  classPos = -1;
  basketPos = -1;
  headerLines = 3;

  attributeTypes = mlnew TIntList(varNames.size(), -1);

  vector<string>::iterator vni(varNames.begin()), vne(varNames.end());
  vector<string>::iterator ti(varTypes.begin());
  vector<string>::iterator fi(varFlags.begin()), fe(varFlags.end());
  TIntList::iterator ati(attributeTypes->begin());
  for(; vni!=vne; fi++, vni++, ti++, ati++) {
    // Description that receives the type parsed below; it points at
    // classDescription for the class attribute, otherwise at the element
    // freshly appended to 'metas' or 'attributeDescriptions'.
    TDomainDepot::TAttributeDescription *attributeDescription = NULL;
    bool ordered = false;

    if (fi!=fe) {
      TProgArguments args("dc: ordered", *fi, false);

      if (args.direct.size()) {
        if (args.direct.size()>1)
          ::raiseError("invalid flags for attribute '%s'", (*vni).c_str());
        string direct = args.direct.front();
        if ((direct=="s") || (direct=="skip") || (direct=="i") || (direct=="ignore"))
          *ati = 0;
        else if ((direct=="c") || (direct=="class")) {
          if (classPos==-1) {
            classPos = vni - varNames.begin();
            classDescription.name = *vni;
            attributeDescription = &classDescription;
          }
          else 
            // name both offenders: the class seen earlier and the current attribute
            ::raiseError("multiple attributes are specified as class attribute ('%s' and '%s')", varNames[classPos].c_str(), (*vni).c_str());
        }
        else if ((direct=="m") || (direct=="meta"))
          *ati = 1;
      }

      if (args.exists("dc")) {
        // Collect every "dc" option value for this column; DCs is grown on
        // demand so that it can be indexed by the column position.
        const int ind = vni-varNames.begin();
        ITERATE(TMultiStringParameters, mi, args.options)
          if ((*mi).first == "dc") {
            while (DCs.size() <= ind)
              DCs.push_back(vector<string>());
            DCs.at(ind).push_back((*mi).second);
          }
      }

      ordered = args.exists("ordered");
    }

    // 0 means the column is skipped/ignored: no description is created for it
    if (!*ati)
      continue;

    if (!strcmp((*ti).c_str(), "basket")) {
      if (basketPos > -1)
        ::raiseError("multiple basket attributes are defined");
      basketPos = vni - varNames.begin();
      // the basket column gets no attribute description of its own
      *ati = 0;
      continue;
    }

    if (!attributeDescription) {// this can only be defined if the attribute is a class attribute
      if (*ati==1) {
        metas.push_back(TDomainDepot::TAttributeDescription(*vni, -1, *ti, ordered));
        attributeDescription = &metas.back();
      }
      else {
        attributeDescriptions.push_back(TDomainDepot::TAttributeDescription(*vni, -1, *ti, ordered));
        attributeDescription = &attributeDescriptions.back();
      }
    }
    else
      attributeDescription->ordered = ordered;

    if (!(*ti).length())
      ::raiseError("type for attribute '%s' is missing", (*vni).c_str());

    // Look the type string up in the typeIdentifiers table (terminated by an
    // entry with a null identifier). A non-zero matchRoot compares only that
    // many leading characters; otherwise the whole string must match.
    const TIdentifierDeclaration *tid = typeIdentifiers;
    for(; tid->identifier; tid++)
      if (!(tid->matchRoot ? strncmp(tid->identifier, (*ti).c_str(), tid->matchRoot)
                           : strcmp(tid->identifier, (*ti).c_str()))) {
        attributeDescription->varType = tid->varType;
        break;
      }
    if (!tid->identifier) {
      // Not a known type name: the string is taken as a space-separated list
      // of values of an INTVAR attribute; "\ " stands for a literal space
      // inside a value.
      attributeDescription->varType = TValue::INTVAR;
      attributeDescription->values = mlnew TStringList;

      string vals;
      ITERATE(string, ci, *ti)
        if (*ci==' ') {
          if (vals.length())
            attributeDescription->values->push_back(vals);
          vals="";
        }
        else {
          // the bounds check keeps a trailing backslash from reading past the end of the string
          if ((*ci=='\\') && (ci+1 != (*ti).end()) && (ci[1]==' ')) {
            vals += ' ';
            ci++;
          }
          else
            vals += *ci;
        }

      if (vals.length())
        attributeDescription->values->push_back(vals);
    }
  }

  // the class description is always the last of the attribute descriptions
  if (classPos > -1)
    attributeDescriptions.push_back(classDescription);

  if (basketPos >= 0)
    basketFeeder = mlnew TBasketFeeder(sourceDomain, dontCheckStored, false);
    
  if (sourceDomain) {
    if (!domainDepot_tab.checkDomain(sourceDomain.AS(TDomain), &attributeDescriptions, classPos >= 0, NULL))
      raiseError("given domain does not match the file");
    else {
      if (basketFeeder)
        basketFeeder->domain = sourceDomain;
      return sourceDomain;
    }
  }

  // Buffer for the meta ids reported by prepareDomain. A vector releases the
  // storage on every exit path (including an error raised by prepareDomain);
  // it holds at least one element so that &metaIDs[0] is a valid, non-null
  // pointer even when there are no meta attributes.
  vector<int> metaIDs(metas.size() ? metas.size() : 1);
  PDomain newDomain = domainDepot_tab.prepareDomain(&attributeDescriptions, classPos>=0, &metas, sourceVars, sourceMetas, false, dontCheckStored, NULL, &metaIDs[0]);

  // Replace the meta markers (1) in attributeTypes with the ids from metaIDs,
  // consumed in column order.
  const int *mid = &metaIDs[0];
  PITERATE(TIntList, ii, attributeTypes)
    if (*ii == 1)
      *ii = *(mid++);

  if (basketFeeder)
    basketFeeder->domain = newDomain;

  return newDomain;
}


bool atomsEmpty(const vector<string> &atoms)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -