📄 tabdelim.cpp

📁 orange源码数据挖掘技术
💻 CPP
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/*
    This file is part of Orange.

    Orange is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Orange is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Orange; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Authors: Janez Demsar, Blaz Zupan, 1996--2002
    Contact: janez.demsar@fri.uni-lj.si
*/


#include <string>
#include <vector>
#include <list>

#include <math.h>
#include "stladdon.hpp"
#include "strings.hpp"
#include "getarg.hpp"

#include "values.hpp"
#include "vars.hpp"
#include "stringvars.hpp"
#include "pythonvars.hpp"
#include "domain.hpp"
#include "examples.hpp"

#include "tabdelim.ppp"

// Forward declarations of the line-level helpers used throughout this file.
//
// readTabAtom fills `atoms` from the file behind `fei`. Callers in this file
// keep calling it while it returns -1 (their comments describe this as
// skipping comment lines) and test for a positive result when looking for a
// data line. NOTE(review): the exact meaning of the return value and of
// `escapeSpaces` is defined with the implementation, which is not visible
// here -- confirm there.
int readTabAtom(TFileExampleIteratorData &fei, vector<string> &atoms, bool escapeSpaces=true, bool csv = false);
// Used by readExample together with readTabAtom when searching for a
// non-empty line; presumably true when `atoms` holds no non-empty atom --
// confirm against the definition.
bool atomsEmpty(const vector<string> &atoms);


// Definitions of the class-level (static) domain depots.
// NOTE(review): judging by the names, one depot serves .tab files and the
// other .txt files; their use is outside the visible part of the file.
TDomainDepot TTabDelimExampleGenerator::domainDepot_tab;
TDomainDepot TTabDelimExampleGenerator::domainDepot_txt;


// Table of type names recognized in the attribute-types line of a file.
// Each entry is {identifier, matchRoot, variable type}. mayBeTabFile compares
// an atom against `identifier` with strncmp over the first `matchRoot`
// characters when matchRoot is non-zero (so "python:" with matchRoot 7 matches
// any atom starting with that prefix), and with strcmp over the whole string
// otherwise. The table is terminated by an entry with a NULL identifier.
const TTabDelimExampleGenerator::TIdentifierDeclaration TTabDelimExampleGenerator::typeIdentifiers[] =
 {{"discrete", 0, TValue::INTVAR},      {"d", 0, TValue::INTVAR},
  {"continuous", 0, TValue::FLOATVAR},  {"c", 0, TValue::FLOATVAR},
  {"string", 0, STRINGVAR},             {"s", 0, STRINGVAR},
  {"python", 0, PYTHONVAR},             {"python:", 7, PYTHONVAR},
  {NULL, 0}};


/* Copy constructor.

   Copies the base-class state, makes a new TIntList with the contents of
   old.attributeTypes, and copies DCs, classPos, headerLines and csv.

   DK and DC are raw, malloc-allocated C strings that the destructor releases
   with free(). They must therefore be duplicated here: leaving them
   uninitialized (as this constructor used to) makes the destructor free an
   indeterminate pointer, and sharing the pointers would free them twice.

   NOTE(review): basketPos and basketFeeder, which readExample uses, are not
   copied here -- confirm whether that is intended. */
TTabDelimExampleGenerator::TTabDelimExampleGenerator(const TTabDelimExampleGenerator &old)
: TFileExampleGenerator(old),
  attributeTypes(mlnew TIntList(old.attributeTypes.getReference())),
  DCs(old.DCs),
  DK(old.DK ? strcpy((char *)malloc(strlen(old.DK)+1), old.DK) : NULL),
  DC(old.DC ? strcpy((char *)malloc(strlen(old.DC)+1), old.DC) : NULL),
  classPos(old.classPos),
  headerLines(old.headerLines),
  csv(old.csv)
{}


/* Constructs a generator for the file `afname`.

   afname        name of the file to read
   autoDetect, sourceVars, sourceMetas, sourceDomain, dontCheckStored,
   dontStore, noCodedDiscrete, noClass
                 forwarded unchanged to readDomain, which builds the domain
   acsv          stored in `csv` and passed to readTabAtom
   aDK, aDC      optional (may be NULL) C strings; private malloc'ed copies
                 are kept in DK and DC and released by the destructor.
                 readExample maps an atom equal to DC to "?" and an atom
                 equal to DK to "~".

   After the domain is read, the file is reopened and `headerLines` lines are
   skipped so that startDataPos/startDataLine mark where the data begins.

   If anything in the body throws, the destructor of this (not yet fully
   constructed) object does not run, so DK and DC are released in the catch
   block before the exception is propagated. */
TTabDelimExampleGenerator::TTabDelimExampleGenerator(const string &afname, bool autoDetect, bool acsv, PVarList sourceVars, TMetaVector *sourceMetas, PDomain sourceDomain, bool dontCheckStored, bool dontStore, const char *aDK, const char *aDC, bool noCodedDiscrete, bool noClass)
: TFileExampleGenerator(afname, PDomain()),
  attributeTypes(mlnew TIntList()),
  DCs(),
  DK(aDK ? strcpy((char *)malloc(strlen(aDK)+1), aDK) : NULL),
  DC(aDC ? strcpy((char *)malloc(strlen(aDC)+1), aDC) : NULL),
  classPos(-1),
  headerLines(0),
  csv(acsv)
{ 
  try {
    // domain needs to be initialized after attributeTypes, DCs, classPos, headerLines
    domain = readDomain(afname, autoDetect, sourceVars, sourceMetas, sourceDomain, dontCheckStored, dontStore, noCodedDiscrete, noClass);

    TFileExampleIteratorData fei(afname);

    vector<string> atoms;
    for (int i = headerLines; !feof(fei.file) && i--; )
      // read one line (not counting comment lines, but counting empty lines)
      while(!feof(fei.file) && (readTabAtom(fei, atoms, true, csv) == -1));

    // remember where the data starts
    startDataPos = ftell(fei.file);
    startDataLine = fei.line;
  }
  catch (...) {
    // free(NULL) is a no-op, so no guards are needed
    free(DK);
    free(DC);
    throw;
  }
}


/* Releases the malloc-allocated DK and DC strings. */
TTabDelimExampleGenerator::~TTabDelimExampleGenerator()
{
  // free() ignores NULL pointers, so both calls are safe when a string was not given
  free(DK);
  free(DC);
}

/* Reads the next non-empty line from `fei` and stores its values into `exam`.

   The atoms of the line are walked in parallel with attributeTypes, whose
   entries are interpreted as follows:
      0      the column is skipped, unless its position equals basketPos, in
             which case the atom is split and each item is passed to
             basketFeeder
     -1      ordinary attribute, or the class if the position equals classPos
     other   meta attribute with that id

   Before conversion an atom is replaced by "?" if it matches one of the
   strings listed for its column in DCs, is empty, is "NA" or ".", or equals
   DC; it is replaced by "~" if it is "*" or equals DK.

   Returns false if no atoms could be read, true otherwise.

   Conversion errors (mlexception) are reported again through raiseError with
   the file name and line number added. An error is also raised if non-empty
   atoms remain after all columns have been consumed. */
bool TTabDelimExampleGenerator::readExample(TFileExampleIteratorData &fei, TExample &exam)
{
  vector<string> atoms;
  // read lines until eof or a non-empty line
  while(!feof(fei.file) && ((readTabAtom(fei, atoms, true, csv)>0) || atomsEmpty(atoms))) {
    // look for the first atom of non-zero length
    vector<string>::iterator ii(atoms.begin()), ie(atoms.end());
    while ((ii!=ie) && !(*ii).length())
      ii++;
    if (ii==ie)
      // only empty atoms: discard the line and read the next one
      atoms.clear();
    else
      break;
  }
  
  if (!atoms.size())
    return false;

  // Add an appropriate number of empty atoms, if needed
  while (atoms.size()<attributeTypes->size())
    atoms.push_back(string(""));
  _ASSERT(exam.domain==domain);

  exam.removeMetas();

  // ei/vi advance only for ordinary attributes; ai/si/pos advance for every column
  TExample::iterator ei(exam.begin());
  TVarList::iterator vi(domain->attributes->begin());
  vector<string>::iterator ai(atoms.begin());
  TIntList::iterator si(attributeTypes->begin()), se(attributeTypes->end());
  vector<vector<string> >::iterator dci(DCs.begin()), dce(DCs.end());
  int pos=0;
  for (; (si!=se); pos++, si++, ai++) {
    if (*si) { // if attribute is not to be skipped and is not a basket
      string valstr;

      // Check for don't care
      valstr = *ai;
      if (dci != dce)
        ITERATE(vector<string>, dcii, *dci)
          if (*dcii == valstr) {
            valstr = '?';
            break;
          }

      // map the remaining special notations to "?" and "~"
      if (!valstr.length() || (valstr == "NA") || (valstr == ".") || (DC && (valstr == DC)))
        valstr = "?";
      else if ((valstr == "*") || (DK && (valstr == DK)))
        valstr = "~";

      try {
        if (*si==-1)
          if (pos==classPos) { // if this is class value
            TValue cval;
            domain->classVar->filestr2val(valstr, cval, exam);
            exam.setClass(cval);
          }
          else { // if this is a normal value
            (*vi++)->filestr2val(valstr, *ei++, exam);
          }
        else { // if this is a meta value
          TMetaDescriptor *md = domain->metas[*si];
          _ASSERT(md!=NULL);
          TValue mval;
          md->variable->filestr2val(valstr, mval, exam);

          exam.setMeta(*si, mval);
        }
      }
      catch (mlexception &err) {
        // report the same problem again, with the file name and line number added
        raiseError("file '%s', line '%i': %s", fei.filename.c_str(), fei.line, err.what());
      }
    }

    // the attribute is marked to be skipped, but may also be a basket
    else { 
      if (pos == basketPos) {
        TSplits splits;
        split(*ai, splits);
        // note: this `si` is the macro's own iterator over splits and
        // shadows the attributeTypes iterator inside this statement
        ITERATE(TSplits, si, splits)
          basketFeeder->addItem(exam, string(si->first, si->second), fei.line);
      }
    }

    // DCs may have fewer entries than there are columns
    if (dci != dce)
      dci++;
  }

  // here pos equals the number of entries in attributeTypes
  if (pos==classPos) // if class is the last value in the line, it is set here
    domain->classVar->filestr2val(ai==atoms.end() ? "?" : *(ai++), exam[domain->variables->size()-1], exam);

  while ((ai!=atoms.end()) && !(*ai).length()) ai++; // line must be empty from now on

  if (ai!=atoms.end()) {
	// join all atoms with spaces to show the offending line in the error message
	vector<string>::iterator ii=atoms.begin();
	string s=*ii;
	while(++ii!=atoms.end()) s+=" "+*ii;
    raiseError("example of invalid length (%s)", s.c_str());
  }

  return true;
}


char *TTabDelimExampleGenerator::mayBeTabFile(const string &stem)
{
  vector<string> varNames, atoms;
  vector<string>::const_iterator vi, ai, ei;

  TFileExampleIteratorData fei(stem);

  // if there is no names line, it is not .tab
  while(!feof(fei.file) && (readTabAtom(fei, varNames, true, csv)==-1));
  if (varNames.empty()) {
    char *res = mlnew char[128];
    res = strcpy(res, "empty file");
    return res;
  }

  // if any name contains the correct hash formatting it is not tab-delim it's more likely .txt
  for(vi = varNames.begin(), ei = varNames.end(); vi!=ei; vi++) {
    const char *c = (*vi).c_str();
    if ((*c=='m') || (*c=='c') || (*c=='i'))
      c++;
    if (   ((*c=='D') || (*c=='C') || (*c=='S'))
        && (c[1]=='#')) {
      char *res= mlnew char[128 + (*vi).size()];
      sprintf(res, "attribute name '%s' looks suspicious", (*vi).c_str());
      return res;
    }
  }

  // if there is no var types line, it is not .tab
  while(!feof(fei.file) && (readTabAtom(fei, atoms, true, csv)==-1));
  if (atoms.empty()) {
    char *res = mlnew char[128];
    res = strcpy(res, "no line with attribute types");
    return res;
  }

  if (atoms.size() != varNames.size())
    raiseError("the number of attribute types does not match the number of attributes");

  // Each atom must be either 'd', 'c' or 's', or contain a space
  for(vi = varNames.begin(), ai = atoms.begin(), ei = atoms.end(); ai != ei; ai++, vi++) {
    const char *c = (*ai).c_str();
    if (!*c) {
      char *res= mlnew char[128 + (*vi).size()];
      sprintf(res, "empty type entry for attribute '%s'", (*vi).c_str());
      return res;
    }

    if (!strcmp("basket", c))
      continue;

    const TIdentifierDeclaration *tid = typeIdentifiers;
    for(; tid->identifier && (tid->matchRoot ? strncmp(tid->identifier, c, tid->matchRoot) : strcmp(tid->identifier, c)); tid++);
    if (tid->identifier)
      continue;

    for(; *c && (*c!=' '); c++);
      if (!*c) {
        char *res= mlnew char[128 + (*vi).size() + (*ai).size()];
        sprintf(res, "attribute '%s' is defined as having only one value ('%s')", (*vi).c_str(), (*ai).c_str());
        return res;
      }
  }

  // if there is no flags line, it is not .tab
  while(!feof(fei.file) && (readTabAtom(fei, atoms, true, csv)==-1));
  if (feof(fei.file)) {
    char *res = mlnew char[128];
    res = strcpy(res, "file has only two lines");
    return res;
  }

  if (atoms.size() > varNames.size())
    raiseError("the number of attribute options is greater than the number of attributes");

  // Check flags
  for(vi = varNames.begin(), ai = atoms.begin(), ei = atoms.end(); ai != ei; ai++, vi++) {
    TProgArguments args("dc: ordered", *ai, false);

    if (args.unrecognized.size()) {
      char *res= mlnew char[128 + (*vi).size()];
      sprintf(res, "unrecognized options at attribute '%s'", (*vi).c_str());
      return res;
    }

    if (args.direct.size()) {
      if (args.direct.size()>1) {
        char *res= mlnew char[128 + (*vi).size()];
        sprintf(res, "too many direct options at attribute '%s'", (*vi).c_str());
        return res;
      }

      static const char *legalDirects[] = {"s", "skip","i", "ignore", "c", "class", "m", "meta", NULL};
      string &direct = args.direct.front();
      const char **lc = legalDirects;
      while(*lc && strcmp(*lc, direct.c_str()))
        lc++;
      if (!*lc) {
        char *res= mlnew char[128 + (*vi).size() + (*ai).size()];
        sprintf(res, "unrecognized option ('%s') at attribute '%s'", (*ai).c_str(), (*vi).c_str());
        return res;
      }
    }
  }

  return NULL;
}

/* Builds the domain for the file `stem`.

   With autoDetect set, the domain is built by domainWithDetection; otherwise
   by domainWithoutDetection. In both cases mayBeTabFile is consulted first
   and a warning is issued when its verdict disagrees with the requested
   mode. The remaining arguments are forwarded to the chosen function
   (dontStore is accepted but not forwarded by this function).

   The reason string returned by mayBeTabFile is allocated with
   mlnew char[...], so it is released here with the array form of mldelete. */
PDomain TTabDelimExampleGenerator::readDomain(const string &stem, const bool autoDetect, PVarList sourceVars, TMetaVector *sourceMetas, PDomain sourceDomain, bool dontCheckStored, bool dontStore, bool noCodedDiscrete, bool noClass)
{ 
  // non-NULL when this cannot be tab file (reason given as result)
  // NULL if this seems a valid tab file
  char *isNotTab = mayBeTabFile(stem);

  if (autoDetect) {
    if (!isNotTab)
      raiseWarning("'%s' is being loaded as .txt, but could be .tab file", stem.c_str());
    else
      mldelete [] isNotTab;

    return domainWithDetection(stem, sourceVars, sourceMetas, sourceDomain, dontCheckStored, noCodedDiscrete, noClass);
  }

  else {
    if (isNotTab) {
      raiseWarning("'%s' is being loaded as .tab, but looks more like .txt file\n(%s)", stem.c_str(), isNotTab);
      mldelete [] isNotTab;
    }

    return domainWithoutDetection(stem, sourceVars, sourceMetas, sourceDomain, dontCheckStored);
  }
}

 

/* These are the rules for determining the attribute types.

   There are three ways to determine a type.

   1. By header prefixes to attribute names.
      The prefix is formed by [cmi][DCS]#
      c, m and i mean class attribute, meta attribute and ignore,
      respectively.
      D, C and S mean discrete, continuous and string attributes.

   2. By knownVars.
      If the type is not determined from header row (either because
      there was no prefix or it only contained c, m or i)
      knownVars is checked for the attribute with the same name.
      If found, the attribute from knownVars will be used.

   3. From the data.
      These attributes can be either continuous or discrete.
      The file is parsed and values for each attribute are checked.
      Values denoting undefined values ('?', '.', '~', '*', 'NA' and
      empty strings) are ignored.
      If all values can be parsed as numbers, the attribute is continuous.
      An exception to this rule are attributes with values 0, 1, 2, ..., 9.
      These are treated as discrete (the assumption is that those numbers
      are just codes for otherwise discrete values).
*/

/* Record kept for an attribute whose type is still to be determined.
   (posInFile / posInDomain are presumably the attribute's column in the file
   and its index in the domain -- confirm at the point of use.) */
class TSearchWarranty
{
public:
  int posInFile;
  int posInDomain;

  // 3 (never seen it yet), 2 (can even be coded discrete), 1 (can be float);
  // if it's found that it cannot be float, it can only be discrete, so the
  // warranty is removed
  int suspectedType;

  // A new warranty always starts with suspectedType 3
  TSearchWarranty(const int &pif, const int &pid)
  : posInFile(pif),
    posInDomain(pid),
    suspectedType(3)
  {}
};

PDomain TTabDelimExampleGenerator::domainWithDetection(const string &stem, PVarList sourceVars, TMetaVector *sourceMetas, PDomain sourceDomain, bool dontCheckStored, bool noCodedDiscrete, bool noClass)
{ 
  headerLines = 1;

  TFileExampleIteratorData fei(stem);
  
  vector<string> varNames;
  // read the next non-comment line
  while(!feof(fei.file) && (readTabAtom(fei, varNames, true, csv)==-1));
  if (varNames.empty())
    ::raiseError("unexpected end of file '%s'", fei.filename.c_str());

  TDomainDepot::TAttributeDescriptions attributeDescriptions, metas;
  classPos = -1;
  basketPos = -1;
  int classType = -1;
  int lastRegular = -1;


  list<TSearchWarranty> searchWarranties;

  /**** Parse the header row */
  
  ITERATE(vector<string>, ni, varNames) {
    /* Parses the header line
       - sets *ni to a real name (without prefix)
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -