📄 tabdelim.cpp
字号:
/*
This file is part of Orange.
Orange is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
Orange is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Orange; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Authors: Janez Demsar, Blaz Zupan, 1996--2002
Contact: janez.demsar@fri.uni-lj.si
*/
#include <string>
#include <vector>
#include <list>
#include <math.h>
#include "stladdon.hpp"
#include "strings.hpp"
#include "getarg.hpp"
#include "values.hpp"
#include "vars.hpp"
#include "stringvars.hpp"
#include "pythonvars.hpp"
#include "domain.hpp"
#include "examples.hpp"
#include "tabdelim.ppp"
// Line tokenizer, defined elsewhere. Callers in this file treat a return value of -1
// as "skip this line and read the next one" (used for comment lines) and test for >0
// when looking for the next data row.
int readTabAtom(TFileExampleIteratorData &fei, vector<string> &atoms, bool escapeSpaces=true, bool csv = false);
// Defined elsewhere; used by readExample together with readTabAtom to skip blank rows.
bool atomsEmpty(const vector<string> &atoms);
// Definitions of the class's static domain depots.
// NOTE(review): going by the names, presumably one depot per file flavour (.tab / .txt) -- confirm.
TDomainDepot TTabDelimExampleGenerator::domainDepot_tab;
TDomainDepot TTabDelimExampleGenerator::domainDepot_txt;
// Table of type identifiers accepted in the attribute-type line.
// Each entry is {identifier, matchRoot, variable-type constant}. mayBeTabFile compares
// only the first matchRoot characters (strncmp) when matchRoot is non-zero, and the
// whole string (strcmp) when it is zero -- so "python:" matches any entry starting with it.
// The table ends with an entry whose identifier is NULL.
const TTabDelimExampleGenerator::TIdentifierDeclaration TTabDelimExampleGenerator::typeIdentifiers[] =
{{"discrete", 0, TValue::INTVAR}, {"d", 0, TValue::INTVAR},
{"continuous", 0, TValue::FLOATVAR}, {"c", 0, TValue::FLOATVAR},
{"string", 0, STRINGVAR}, {"s", 0, STRINGVAR},
{"python", 0, PYTHONVAR}, {"python:", 7, PYTHONVAR},
{NULL, 0}};
/* Copy constructor.
   Copies the attribute type list into a freshly allocated TIntList and takes
   DCs, classPos, headerLines and csv from `old`.

   DK and DC are malloc-ed C strings that the destructor releases with free(),
   so every instance must own its own duplicate (or NULL). Leaving them unset
   would make the destructor free an indeterminate pointer; sharing the
   pointers with `old` would free the same memory twice.

   NOTE(review): basketPos and basketFeeder are read by readExample but are not
   copied here -- confirm whether that is intended or handled elsewhere. */
TTabDelimExampleGenerator::TTabDelimExampleGenerator(const TTabDelimExampleGenerator &old)
: TFileExampleGenerator(old),
  attributeTypes(mlnew TIntList(old.attributeTypes.getReference())),
  DCs(old.DCs),
  DK(old.DK ? strcpy((char *)malloc(strlen(old.DK)+1), old.DK) : NULL),
  DC(old.DC ? strcpy((char *)malloc(strlen(old.DC)+1), old.DC) : NULL),
  classPos(old.classPos),
  headerLines(old.headerLines),
  csv(old.csv)
{}
/* Builds a generator for the file `afname`.

   acsv      - stored in `csv` and handed to readTabAtom on every read
   aDK, aDC  - optional C strings; when non-NULL a malloc-ed copy is kept in DK / DC
               (released by the destructor)
   all other arguments are forwarded unchanged to readDomain.

   After the domain is read, the header lines are skipped on a fresh reader so that
   startDataPos / startDataLine record the file offset and line number of the first
   data row. */
TTabDelimExampleGenerator::TTabDelimExampleGenerator(const string &afname, bool autoDetect, bool acsv, PVarList sourceVars, TMetaVector *sourceMetas, PDomain sourceDomain, bool dontCheckStored, bool dontStore, const char *aDK, const char *aDC, bool noCodedDiscrete, bool noClass)
: TFileExampleGenerator(afname, PDomain()),
  attributeTypes(mlnew TIntList()),
  DCs(),
  DK(aDK ? strcpy((char *)malloc(strlen(aDK)+1), aDK) : NULL),
  DC(aDC ? strcpy((char *)malloc(strlen(aDC)+1), aDC) : NULL),
  classPos(-1),
  headerLines(0),
  csv(acsv)
{
  // The domain is read here rather than in the initializer list: it needs to be
  // initialized after attributeTypes, DCs, classPos and headerLines.
  domain = readDomain(afname, autoDetect, sourceVars, sourceMetas, sourceDomain, dontCheckStored, dontStore, noCodedDiscrete, noClass);

  TFileExampleIteratorData headerReader(afname);
  vector<string> discarded;

  int linesLeft = headerLines;
  while (!feof(headerReader.file) && linesLeft--) {
    // Consume one header line. Lines for which readTabAtom returns -1 (comment
    // lines) are not counted; empty lines are.
    while (!feof(headerReader.file) && (readTabAtom(headerReader, discarded, true, csv) == -1))
      ;
  }

  startDataPos = ftell(headerReader.file);
  startDataLine = headerReader.line;
}
/* Releases the malloc-ed DK and DC strings.
   free() is a no-op for a null pointer, so no explicit guards are needed. */
TTabDelimExampleGenerator::~TTabDelimExampleGenerator()
{
  free(DK);
  free(DC);
}
/* Reads the next data row from `fei` and stores its values in `exam`.

   Returns false when no further non-empty row can be read, true otherwise.
   Calls raiseError when a value cannot be converted (the message carries the file
   name and line number) or when the row has non-empty fields beyond the columns
   described by attributeTypes.

   attributeTypes holds one code per column, interpreted below as:
     0   - column is skipped (or is the basket column when pos == basketPos)
     -1  - ordinary attribute, or the class when pos == classPos
     other values - id of a meta attribute, looked up in domain->metas */
bool TTabDelimExampleGenerator::readExample(TFileExampleIteratorData &fei, TExample &exam)
{
vector<string> atoms;
// read lines until eof or a non-empty line
while(!feof(fei.file) && ((readTabAtom(fei, atoms, true, csv)>0) || atomsEmpty(atoms))) {
// find the first atom that is not an empty string
vector<string>::iterator ii(atoms.begin()), ie(atoms.end());
while ((ii!=ie) && !(*ii).length())
ii++;
// a row consisting solely of empty atoms is discarded and reading continues
if (ii==ie)
atoms.clear();
else
break;
}
if (!atoms.size())
return false;
// Add an appropriate number of empty atoms, if needed
while (atoms.size()<attributeTypes->size())
atoms.push_back(string(""));
_ASSERT(exam.domain==domain);
exam.removeMetas();
// ei/vi walk the example's values and the domain's attributes; they advance only
// for ordinary (non-class, non-meta) columns
TExample::iterator ei(exam.begin());
TVarList::iterator vi(domain->attributes->begin());
vector<string>::iterator ai(atoms.begin());
TIntList::iterator si(attributeTypes->begin()), se(attributeTypes->end());
// dci walks DCs (per-column lists of extra symbols) in step with the columns,
// for as long as DCs has entries
vector<vector<string> >::iterator dci(DCs.begin()), dce(DCs.end());
int pos=0;
for (; (si!=se); pos++, si++, ai++) {
if (*si) { // if attribute is not to be skipped and is not a basket
string valstr;
// Check for don't care
valstr = *ai;
// a value equal to any symbol listed for this column is replaced by "?"
if (dci != dce)
ITERATE(vector<string>, dcii, *dci)
if (*dcii == valstr) {
valstr = '?';
break;
}
// empty, "NA", "." and the DC symbol (if given) become "?";
// "*" and the DK symbol (if given) become "~"
if (!valstr.length() || (valstr == "NA") || (valstr == ".") || (DC && (valstr == DC)))
valstr = "?";
else if ((valstr == "*") || (DK && (valstr == DK)))
valstr = "~";
try {
// note: the first else below pairs with the inner if (pos==classPos),
// the second else with if (*si==-1)
if (*si==-1)
if (pos==classPos) { // if this is class value
TValue cval;
domain->classVar->filestr2val(valstr, cval, exam);
exam.setClass(cval);
}
else { // if this is a normal value
(*vi++)->filestr2val(valstr, *ei++, exam);
}
else { // if this is a meta value
TMetaDescriptor *md = domain->metas[*si];
_ASSERT(md!=NULL);
TValue mval;
md->variable->filestr2val(valstr, mval, exam);
exam.setMeta(*si, mval);
}
}
// re-report conversion failures with the file name and line number prepended
catch (mlexception &err) {
raiseError("file '%s', line '%i': %s", fei.filename.c_str(), fei.line, err.what());
}
}
// the attribute is marked to be skipped, but may also be a basket
else {
if (pos == basketPos) {
// split the basket field and hand every item to basketFeeder
TSplits splits;
split(*ai, splits);
// NOTE: this loop variable `si` shadows the outer column iterator `si`
ITERATE(TSplits, si, splits)
basketFeeder->addItem(exam, string(si->first, si->second), fei.line);
}
}
if (dci != dce)
dci++;
}
if (pos==classPos) // if class is the last value in the line, it is set here
domain->classVar->filestr2val(ai==atoms.end() ? "?" : *(ai++), exam[domain->variables->size()-1], exam);
while ((ai!=atoms.end()) && !(*ai).length()) ai++; // line must be empty from now on
if (ai!=atoms.end()) {
// a non-empty surplus field was found: report the whole row, space-separated
vector<string>::iterator ii=atoms.begin();
string s=*ii;
while(++ii!=atoms.end()) s+=" "+*ii;
raiseError("example of invalid length (%s)", s.c_str());
}
return true;
}
char *TTabDelimExampleGenerator::mayBeTabFile(const string &stem)
{
vector<string> varNames, atoms;
vector<string>::const_iterator vi, ai, ei;
TFileExampleIteratorData fei(stem);
// if there is no names line, it is not .tab
while(!feof(fei.file) && (readTabAtom(fei, varNames, true, csv)==-1));
if (varNames.empty()) {
char *res = mlnew char[128];
res = strcpy(res, "empty file");
return res;
}
// if any name contains the correct hash formatting it is not tab-delim it's more likely .txt
for(vi = varNames.begin(), ei = varNames.end(); vi!=ei; vi++) {
const char *c = (*vi).c_str();
if ((*c=='m') || (*c=='c') || (*c=='i'))
c++;
if ( ((*c=='D') || (*c=='C') || (*c=='S'))
&& (c[1]=='#')) {
char *res= mlnew char[128 + (*vi).size()];
sprintf(res, "attribute name '%s' looks suspicious", (*vi).c_str());
return res;
}
}
// if there is no var types line, it is not .tab
while(!feof(fei.file) && (readTabAtom(fei, atoms, true, csv)==-1));
if (atoms.empty()) {
char *res = mlnew char[128];
res = strcpy(res, "no line with attribute types");
return res;
}
if (atoms.size() != varNames.size())
raiseError("the number of attribute types does not match the number of attributes");
// Each atom must be either 'd', 'c' or 's', or contain a space
for(vi = varNames.begin(), ai = atoms.begin(), ei = atoms.end(); ai != ei; ai++, vi++) {
const char *c = (*ai).c_str();
if (!*c) {
char *res= mlnew char[128 + (*vi).size()];
sprintf(res, "empty type entry for attribute '%s'", (*vi).c_str());
return res;
}
if (!strcmp("basket", c))
continue;
const TIdentifierDeclaration *tid = typeIdentifiers;
for(; tid->identifier && (tid->matchRoot ? strncmp(tid->identifier, c, tid->matchRoot) : strcmp(tid->identifier, c)); tid++);
if (tid->identifier)
continue;
for(; *c && (*c!=' '); c++);
if (!*c) {
char *res= mlnew char[128 + (*vi).size() + (*ai).size()];
sprintf(res, "attribute '%s' is defined as having only one value ('%s')", (*vi).c_str(), (*ai).c_str());
return res;
}
}
// if there is no flags line, it is not .tab
while(!feof(fei.file) && (readTabAtom(fei, atoms, true, csv)==-1));
if (feof(fei.file)) {
char *res = mlnew char[128];
res = strcpy(res, "file has only two lines");
return res;
}
if (atoms.size() > varNames.size())
raiseError("the number of attribute options is greater than the number of attributes");
// Check flags
for(vi = varNames.begin(), ai = atoms.begin(), ei = atoms.end(); ai != ei; ai++, vi++) {
TProgArguments args("dc: ordered", *ai, false);
if (args.unrecognized.size()) {
char *res= mlnew char[128 + (*vi).size()];
sprintf(res, "unrecognized options at attribute '%s'", (*vi).c_str());
return res;
}
if (args.direct.size()) {
if (args.direct.size()>1) {
char *res= mlnew char[128 + (*vi).size()];
sprintf(res, "too many direct options at attribute '%s'", (*vi).c_str());
return res;
}
static const char *legalDirects[] = {"s", "skip","i", "ignore", "c", "class", "m", "meta", NULL};
string &direct = args.direct.front();
const char **lc = legalDirects;
while(*lc && strcmp(*lc, direct.c_str()))
lc++;
if (!*lc) {
char *res= mlnew char[128 + (*vi).size() + (*ai).size()];
sprintf(res, "unrecognized option ('%s') at attribute '%s'", (*ai).c_str(), (*vi).c_str());
return res;
}
}
}
return NULL;
}
/* Reads the domain of file `stem`, either by detecting attribute types from the
   file (autoDetect) or from an explicit three-line header.

   mayBeTabFile is consulted first; a warning is raised when its verdict contradicts
   the requested mode. The reason string it returns is released here.

   noCodedDiscrete and noClass are only forwarded on the detection path; dontStore
   is accepted but not used by this function.

   NOTE(review): the reason string is allocated with mlnew char[] and released with
   plain mldelete -- confirm that this pairing is valid for the project's macros. */
PDomain TTabDelimExampleGenerator::readDomain(const string &stem, const bool autoDetect, PVarList sourceVars, TMetaVector *sourceMetas, PDomain sourceDomain, bool dontCheckStored, bool dontStore, bool noCodedDiscrete, bool noClass)
{
  // NULL if this seems a valid tab file, otherwise the reason why it does not
  char *notTabReason = mayBeTabFile(stem);

  if (!autoDetect) {
    if (notTabReason) {
      raiseWarning("'%s' is being loaded as .tab, but looks more like .txt file\n(%s)", stem.c_str(), notTabReason);
      mldelete notTabReason;
    }
    return domainWithoutDetection(stem, sourceVars, sourceMetas, sourceDomain, dontCheckStored);
  }

  if (notTabReason)
    mldelete notTabReason;
  else
    raiseWarning("'%s' is being loaded as .txt, but could be .tab file", stem.c_str());

  return domainWithDetection(stem, sourceVars, sourceMetas, sourceDomain, dontCheckStored, noCodedDiscrete, noClass);
}
/* These are the rules for determining the attribute types.
There are three ways to determine a type.
1. By header prefixes to attribute names.
The prefix is formed by [cmi][DCS]#
c, m and i mean class attribute, meta attribute and ignore,
respectively.
D, C and S mean discrete, continuous and string attributes.
2. By knownVars.
If the type is not determined from header row (either because
there was no prefix or it only contained c, m or i)
knownVars is checked for the attribute with the same name.
If found, the attribute from knownVars will be used.
3. From the data.
These attributes can be either continuous or discrete.
The file is parsed and values for each attribute are checked.
Values denoting undefined values ('?', '.', '~', '*', 'NA' and
empty strings) are ignored.
If all values can be parsed as numbers, the attribute is continuous.
An exception to this rule are attributes with values 0, 1, 2, ..., 9.
These are treated as discrete (the assumption is that those numbers
are just codes for otherwise discrete values).
*/
/* Record of an attribute whose type is still undecided: two positions plus the
   current guess of its type. */
class TSearchWarranty
{
public:
  int posInFile;
  int posInDomain;

  // 3 = never seen it yet, 2 = can even be coded discrete, 1 = can be float.
  // If it is found that it cannot be float, it can only be discrete, so the
  // warranty is removed.
  int suspectedType;

  // A new warranty always starts at suspectedType 3 (nothing seen yet).
  TSearchWarranty(const int &pif, const int &pid)
  : posInFile(pif),
    posInDomain(pid),
    suspectedType(3)
  {}
};
PDomain TTabDelimExampleGenerator::domainWithDetection(const string &stem, PVarList sourceVars, TMetaVector *sourceMetas, PDomain sourceDomain, bool dontCheckStored, bool noCodedDiscrete, bool noClass)
{
headerLines = 1;
TFileExampleIteratorData fei(stem);
vector<string> varNames;
// read the next non-comment line
while(!feof(fei.file) && (readTabAtom(fei, varNames, true, csv)==-1));
if (varNames.empty())
::raiseError("unexpected end of file '%s'", fei.filename.c_str());
TDomainDepot::TAttributeDescriptions attributeDescriptions, metas;
classPos = -1;
basketPos = -1;
int classType = -1;
int lastRegular = -1;
list<TSearchWarranty> searchWarranties;
/**** Parse the header row */
ITERATE(vector<string>, ni, varNames) {
/* Parses the header line
- sets *ni to a real name (without prefix)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -