📄 tabdelim.cpp
字号:
- sets varType to TValue::varType or -1 if the type is not specified and -2 if it's a basket
- sets classPos/basketPos to the current position, if the attribute is class/basket attribute
(and reports an error if there is more than one such attribute)
- to attributeTypes, appends -1 for ordinary attributes, 1 for metas and 0 for ignored or baskets */
int varType = -1; // varType, or -1 for unnown, -2 for basket
attributeTypes->push_back(-1);
int &attributeType = attributeTypes->back();
const char *cptr = (*ni).c_str();
if (*cptr && (cptr[1]=='#')) {
if (*cptr == 'm')
attributeType = 1;
else if (*cptr == 'i')
attributeType = 0;
else if (*cptr == 'c') {
if (classPos>-1)
::raiseError("more than one attribute marked as class");
else
classPos = ni-varNames.begin();
}
else if (*cptr == 'D')
varType = TValue::INTVAR;
else if (*cptr == 'C')
varType = TValue::FLOATVAR;
else if (*cptr == 'S')
varType = STRINGVAR;
else if (*cptr == 'B') {
varType = -2;
attributeType = 0;
if (basketPos > -1)
::raiseError("more than one basket attribute");
else
basketPos = ni - varNames.begin();
}
else
::raiseError("unrecognized flags in attribute name '%s'", cptr);
*ni = string(cptr+2);
}
else if (*cptr && cptr[1] && (cptr[2]=='#')) {
bool beenWarned = false;
if (*cptr == 'm')
attributeType = 1;
else if (*cptr == 'i')
attributeType = 0;
else if (*cptr == 'c') {
if (classPos>-1)
::raiseError("more than one attribute marked as class");
else
classPos = ni-varNames.begin();
}
else
::raiseError("unrecognized flags in attribute name '%s'", cptr);
cptr++;
if (*cptr == 'D')
varType = TValue::INTVAR;
else if (*cptr == 'C')
varType = TValue::FLOATVAR;
else if (*cptr == 'S')
varType = STRINGVAR;
else if (*cptr == 'B') {
if (attributeType) { // basket can be ignored, too
varType = -2;
attributeType = 0; // this is ugly, but baskets are a patch anyway: if not ignored by 'i' flag, it's ignored by 'basket'
if (basketPos > -1)
::raiseError("there can only be one basket attribute");
else
basketPos = ni - varNames.begin();
}
}
else
::raiseError("unrecognized flags in attribute name '%s'", cptr);
// remove the prefix (we have already increased cptr once)
*ni = string(cptr+2);
}
/* If the attribute is not to be ignored, we attempt to either find its descriptor
among the known attributes or create a new attribute if the type is given.
For ordinary attributes, the descriptor (or PVariable()) is pushed to the list of 'variables'.
For meta attributes, a meta descriptor is pushed to 'metas'. If the attribute was used as
meta-attribute in some of known domains, the id is reused; otherwise a new id is created.
If the descriptor was neither found nor created, a warranty is issued.
*/
if ((classPos == ni-varNames.begin())) {
classType = varType;
}
else {
if (attributeType == 1) {
metas.push_back(TDomainDepot::TAttributeDescription(*ni, varType));
if (varType==-1)
searchWarranties.push_back(TSearchWarranty(ni-varNames.begin(), -(signed int)(metas.size())));
}
else if (attributeType) {
lastRegular = ni-varNames.begin();
attributeDescriptions.push_back(TDomainDepot::TAttributeDescription(*ni, varType));
if (varType==-1)
searchWarranties.push_back(TSearchWarranty(ni-varNames.begin(), attributeType==-2 ? -1 : attributeDescriptions.size()-1));
}
}
}
if (classPos > -1) {
attributeDescriptions.push_back(TDomainDepot::TAttributeDescription(varNames[classPos], classType));
if (classType<0)
searchWarranties.push_back(TSearchWarranty(classPos, attributeDescriptions.size()-1));
}
else {
if (!noClass)
classPos = lastRegular;
}
if (!searchWarranties.empty()) {
vector<string> atoms;
char numTest[64];
while (!feof(fei.file) && !searchWarranties.empty()) {
// seek to the next line non-empty non-comment line
if (readTabAtom(fei, atoms, true, csv) <= 0)
continue;
for(list<TSearchWarranty>::iterator wi(searchWarranties.begin()), we(searchWarranties.end()); wi!=we; wi++) {
if ((*wi).posInFile >= atoms.size())
continue;
// raiseError("line %i too short", fei.line);
const string &atom = atoms[(*wi).posInFile];
// only discrete attributes can have values longer than 63 characters
if (atom.length()>63) {
if ((*wi).posInDomain<0)
metas[-(*wi).posInDomain - 1].varType = TValue::INTVAR;
else
attributeDescriptions[(*wi).posInDomain].varType = TValue::INTVAR;
wi = searchWarranties.erase(wi);
wi--;
continue;
}
const char *ceni = atom.c_str();
if ( !*ceni
|| !ceni[1] && ((*ceni=='?') || (*ceni=='.') || (*ceni=='~') || (*ceni=='*') || (*ceni=='-'))
|| (atom == "NA") || (DC && (atom == DC)) || (DK && (atom == DK)))
continue;
// we have encountered some value
if ((*wi).suspectedType == 3)
(*wi).suspectedType = 2;
// If the attribute is a digit, it can be anything
if ((!ceni[1]) && (*ceni>='0') && (*ceni<='9'))
continue;
// If it is longer than one character, it cannot be a coded discrete
if (ceni[1])
(*wi).suspectedType = 1;
// Convert commas into dots
strcpy(numTest, ceni);
for(char *sc = numTest; *sc; sc++)
if (*sc == ',')
*sc = '.';
// If the attribute cannot be converted into a number, it is enum
char *eptr;
strtod(numTest, &eptr);
while (*eptr==32)
eptr++;
if (*eptr) {
if ((*wi).posInDomain<0)
metas[-(*wi).posInDomain - 1].varType = TValue::INTVAR;
else
attributeDescriptions[(*wi).posInDomain].varType = TValue::INTVAR;
wi = searchWarranties.erase(wi);
wi--;
continue;
}
}
}
ITERATE(list<TSearchWarranty>, wi, searchWarranties) {
const string &name = varNames[(*wi).posInFile];
if ((*wi).suspectedType == 3)
raiseWarning("cannot determine type for attribute '%s'; the attribute will be ignored", name.c_str());
int type = (*wi).suspectedType == 2 && !noCodedDiscrete ? TValue::INTVAR : TValue::FLOATVAR;
if ((*wi).posInDomain<0)
metas[-(*wi).posInDomain - 1].varType = type;
else
attributeDescriptions[(*wi).posInDomain].varType = type;
}
for(int i = 0; i < attributeDescriptions.size(); )
if (attributeDescriptions[i].varType == -1)
attributeDescriptions.erase(attributeDescriptions.begin() + i);
else
i++;
}
if (basketPos >= 0)
basketFeeder = mlnew TBasketFeeder(sourceDomain, dontCheckStored, false);
if (sourceDomain) {
if (!domainDepot_txt.checkDomain(sourceDomain.AS(TDomain), &attributeDescriptions, classPos>-1, NULL))
raiseError("given domain does not match the file");
else {
if (basketFeeder)
basketFeeder->domain = sourceDomain;
return sourceDomain;
}
}
int *metaIDs = mlnew int[metas.size()];
PDomain newDomain = domainDepot_txt.prepareDomain(&attributeDescriptions, classPos>-1, &metas, sourceVars, sourceMetas, false, dontCheckStored, NULL, metaIDs);
int *mid = metaIDs;
PITERATE(TIntList, ii, attributeTypes)
if (*ii == 1)
*ii = *(mid++);
mldelete metaIDs;
if (basketFeeder)
basketFeeder->domain = newDomain;
return newDomain;
}
/* Builds the domain from a file with a three-line header (names, types, flags);
   no type detection is performed.

   - The first header line gives attribute names, the second their types and the
     third (which may be shorter than the first; missing entries are treated as
     empty) the flags.
   - Flags: 's', 'skip', 'i', 'ignore' mark the column as ignored (attributeTypes
     entry 0); 'c', 'class' mark the class attribute (at most one); 'm', 'meta'
     mark a meta attribute (attributeTypes entry 1). Every 'dc' option is
     appended to DCs[column]; the 'ordered' option sets the descriptor's
     'ordered' field.
   - A column whose type is "basket" sets basketPos and is otherwise ignored
     (at most one such column).
   - A type that matches an entry of typeIdentifiers gets that entry's varType;
     any other type string is taken as a space-separated list of values of an
     INTVAR attribute, where "\ " stands for a space inside a value.

   If sourceDomain is given, it is checked against the descriptions and returned
   (an error is raised if it does not match). Otherwise a domain is obtained from
   domainDepot_tab.prepareDomain and the meta entries of attributeTypes are
   replaced by the ids that prepareDomain stored into metaIDs.

   Sets classPos, basketPos, headerLines, attributeTypes and, when there is a
   basket column, basketFeeder. */
PDomain TTabDelimExampleGenerator::domainWithoutDetection(const string &stem, PVarList sourceVars, TMetaVector *sourceMetas, PDomain sourceDomain, bool dontCheckStored)
{
  TFileExampleIteratorData fei(stem);

  vector<string> varNames, varTypes, varFlags;

  // Each header line is re-read for as long as readTabAtom returns -1
  // (presumably a line that is to be skipped -- confirm against readTabAtom).
  while(!feof(fei.file) && (readTabAtom(fei, varNames, true, csv) == -1));
  if (varNames.empty())
    ::raiseError("empty file");

  while(!feof(fei.file) && (readTabAtom(fei, varTypes, false, csv) == -1));
  if (varTypes.empty())
    ::raiseError("cannot read types of attributes");

  while(!feof(fei.file) && (readTabAtom(fei, varFlags, true, csv) == -1));

  if (varNames.size() != varTypes.size())
    ::raiseError("mismatching number of attributes and their types.");
  if (varNames.size() < varFlags.size())
    ::raiseError("too many flags (third line too long)");

  // pad the flags so that all three header vectors can be walked in parallel
  while (varFlags.size() < varNames.size())
    varFlags.push_back("");

  TDomainDepot::TAttributeDescriptions attributeDescriptions, metas;
  TDomainDepot::TAttributeDescription classDescription("", 0);

  classPos = -1;
  basketPos = -1;
  headerLines = 3;

  // every column starts as an ordinary attribute (-1)
  attributeTypes = mlnew TIntList(varNames.size(), -1);

  vector<string>::iterator vni(varNames.begin()), vne(varNames.end());
  vector<string>::iterator ti(varTypes.begin());
  vector<string>::iterator fi(varFlags.begin()), fe(varFlags.end());
  TIntList::iterator ati(attributeTypes->begin());

  for(; vni!=vne; fi++, vni++, ti++, ati++) {
    // Points to the description being filled in this iteration; it is set here
    // only when the column is the class attribute.
    TDomainDepot::TAttributeDescription *attributeDescription = NULL;
    bool ordered = false;

    if (fi!=fe) {
      TProgArguments args("dc: ordered", *fi, false);

      if (args.direct.size()) {
        if (args.direct.size()>1)
          ::raiseError("invalid flags for attribute '%s'", (*vni).c_str());

        string direct = args.direct.front();
        if ((direct=="s") || (direct=="skip") || (direct=="i") || (direct=="ignore")) {
          *ati = 0;
        }
        else if ((direct=="c") || (direct=="class")) {
          if (classPos==-1) {
            classPos = vni - varNames.begin();
            classDescription.name = *vni;
            attributeDescription = &classDescription;
          }
          else {
            // report the previously chosen class attribute and the current one
            ::raiseError("multiple attributes are specified as class attribute ('%s' and '%s')", varNames[classPos].c_str(), (*vni).c_str());
          }
        }
        else if ((direct=="m") || (direct=="meta")) {
          *ati = 1;
        }
        // any other direct flag is left without effect, as before
      }

      if (args.exists("dc")) {
        const int ind = vni-varNames.begin();
        ITERATE(TMultiStringParameters, mi, args.options)
          if ((*mi).first == "dc") {
            // grow DCs so that it has an entry for this column
            while (DCs.size() <= ind)
              DCs.push_back(vector<string>());
            DCs.at(ind).push_back((*mi).second);
          }
      }

      ordered = args.exists("ordered");
    }

    // ignored column: nothing to describe
    if (!*ati)
      continue;

    if (!strcmp((*ti).c_str(), "basket")) {
      if (basketPos > -1)
        ::raiseError("multiple basket attributes are defined");
      basketPos = vni - varNames.begin();
      *ati = 0;
      continue;
    }

    if (!attributeDescription) {
      // not the class attribute: append a new description to metas or to the ordinary attributes
      if (*ati==1) {
        metas.push_back(TDomainDepot::TAttributeDescription(*vni, -1, *ti, ordered));
        attributeDescription = &metas.back();
      }
      else {
        attributeDescriptions.push_back(TDomainDepot::TAttributeDescription(*vni, -1, *ti, ordered));
        attributeDescription = &attributeDescriptions.back();
      }
    }
    else
      attributeDescription->ordered = ordered;

    if (!(*ti).length())
      ::raiseError("type for attribute '%s' is missing", (*vni).c_str());

    // Look the type up among the known identifiers; when matchRoot is non-zero,
    // only the first matchRoot characters are compared.
    const TIdentifierDeclaration *tid = typeIdentifiers;
    for(; tid->identifier; tid++)
      if (!(tid->matchRoot ? strncmp(tid->identifier, (*ti).c_str(), tid->matchRoot)
                           : strcmp(tid->identifier, (*ti).c_str()))) {
        attributeDescription->varType = tid->varType;
        break;
      }

    if (!tid->identifier) {
      // Not a known identifier: the type string is a space-separated list of values.
      attributeDescription->varType = TValue::INTVAR;
      attributeDescription->values = mlnew TStringList;

      const string &valueList = *ti;
      string vals;
      for(string::const_iterator ci(valueList.begin()), ce(valueList.end()); ci != ce; ++ci) {
        if (*ci==' ') {
          if (vals.length())
            attributeDescription->values->push_back(vals);
          vals="";
        }
        else if ((*ci=='\\') && (ci+1 != ce) && (ci[1]==' ')) {
          // "\ " is an escaped space; ci+1 is checked against the end so that a
          // trailing backslash does not read past the string
          vals += ' ';
          ++ci;
        }
        else
          vals += *ci;
      }

      if (vals.length())
        attributeDescription->values->push_back(vals);
    }
  }

  // the class description goes last
  if (classPos > -1)
    attributeDescriptions.push_back(classDescription);

  if (basketPos >= 0)
    basketFeeder = mlnew TBasketFeeder(sourceDomain, dontCheckStored, false);

  if (sourceDomain) {
    if (!domainDepot_tab.checkDomain(sourceDomain.AS(TDomain), &attributeDescriptions, classPos >= 0, NULL))
      raiseError("given domain does not match the file");
    else {
      if (basketFeeder)
        basketFeeder->domain = sourceDomain;
      return sourceDomain;
    }
  }

  // The ids are kept in a vector, so the buffer is released correctly (the
  // original paired an array allocation with a non-array delete) and also when
  // prepareDomain throws. The spare slot keeps &metaIDs[0] valid and non-null
  // even when there are no metas.
  vector<int> metaIDs(metas.size() + 1);
  PDomain newDomain = domainDepot_tab.prepareDomain(&attributeDescriptions, classPos>=0, &metas, sourceVars, sourceMetas, false, dontCheckStored, NULL, &metaIDs[0]);

  // replace the 'meta' markers (1) by the ids of the corresponding meta attributes
  const int *mid = &metaIDs[0];
  PITERATE(TIntList, ii, attributeTypes)
    if (*ii == 1)
      *ii = *(mid++);

  if (basketFeeder)
    basketFeeder->domain = newDomain;

  return newDomain;
}
bool atomsEmpty(const vector<string> &atoms)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -