📄 distance.cpp
字号:
/*
This file is part of Orange.
Orange is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
Orange is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Orange; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Authors: Janez Demsar, Blaz Zupan, 1996--2002
Contact: janez.demsar@fri.uni-lj.si
*/
#include <math.h>
#include "stladdon.hpp"
#include "vars.hpp"
#include "domain.hpp"
#include "examples.hpp"
#include "examplegen.hpp"
#include "distvars.hpp"
#include "contingency.hpp"
#include "basstat.hpp"
#include "orvector.hpp"
#include "distance.ppp"
/* Constructs the base distance constructor.
   ic is copied into ignoreClass; nothing else is initialized here. */
TExamplesDistanceConstructor::TExamplesDistanceConstructor(const bool &ic)
  : ignoreClass(ic)
{
}
/* Default settings for the Hamming distance constructor:
   ignoreClass is set to true and ignoreUnknowns to false. */
TExamplesDistanceConstructor_Hamming::TExamplesDistanceConstructor_Hamming()
  : ignoreClass(true), ignoreUnknowns(false)
{
}
/* Creates a TExamplesDistance_Hamming that uses this constructor's
   ignoreClass and ignoreUnknowns settings.
   None of the four arguments (example generator, weight id, distributions,
   basic statistics) is used by this function. */
PExamplesDistance TExamplesDistanceConstructor_Hamming::operator()(PExampleGenerator, const int &, PDomainDistributions, PDomainBasicAttrStat) const
{
  return mlnew TExamplesDistance_Hamming(ignoreClass, ignoreUnknowns);
}
/* Constructs a Hamming distance object.
   ic is copied into ignoreClass, iu into ignoreUnknowns. */
TExamplesDistance_Hamming::TExamplesDistance_Hamming(const bool &ic, const bool &iu)
  : ignoreClass(ic), ignoreUnknowns(iu)
{
}
/* Hamming distance between two examples: the number of compared positions
   whose values are not compatible.

   The attributes are always compared; the class value is compared as well
   when ignoreClass is false and the domain has a class variable.
   When ignoreUnknowns is true, a position where either value is special
   does not contribute to the distance.

   Raises an error if the two examples do not share the same domain. */
float TExamplesDistance_Hamming::operator()(const TExample &e1, const TExample &e2) const
{
  if (e1.domain != e2.domain)
    raiseError("cannot compare examples from different domains");

  // Number of leading values to compare.
  int remaining = e1.domain->attributes->size();
  if (!ignoreClass && e1.domain->classVar)
    remaining++;

  float mismatches = 0.0;
  TExample::const_iterator left = e1.begin(), right = e2.begin();
  for (; remaining; remaining--, left++, right++) {
    // Skip positions with a special value when asked to ignore them.
    if (ignoreUnknowns && ((*left).isSpecial() || (*right).isSpecial()))
      continue;

    if (!(*left).compatible(*right))
      mismatches += 1.0;
  }

  return mismatches;
}
/* Default settings: the base-class constructor is invoked without arguments,
   normalize is set to true and ignoreUnknowns to false. */
TExamplesDistanceConstructor_Normalized::TExamplesDistanceConstructor_Normalized()
  : TExamplesDistanceConstructor(), normalize(true), ignoreUnknowns(false)
{
}
/* Explicit settings: ic is forwarded to the base-class constructor,
   no is copied into normalize and iu into ignoreUnknowns. */
TExamplesDistanceConstructor_Normalized::TExamplesDistanceConstructor_Normalized(const bool &ic, const bool &no, const bool &iu)
  : TExamplesDistanceConstructor(ic), normalize(no), ignoreUnknowns(iu)
{
}
/* Default constructor: normalizers is initialized from an empty PFloatList,
   domainVersion is -1, normalize is true and ignoreUnknowns is false.
   bases, averages and variances are not listed in the initializer list. */
TExamplesDistance_Normalized::TExamplesDistance_Normalized()
  : normalizers(PFloatList()), domainVersion(-1), normalize(true), ignoreUnknowns(false)
{
}
/* Builds the parallel per-attribute tables normalizers, bases, averages and
   variances. Every table receives exactly one entry per attribute, so the
   same index refers to the same attribute in all four.

   The statistics come from the first source that is available:
     1. bstat together with egen (bstat is computed from egen and weightID
        when neither bstat nor ddist is given),
     2. ddist,
     3. bstat alone; every entry must then be non-null, otherwise an error
        is raised.
   If none of these is available, an error ("no data") is raised.

   Values stored in normalizers:
     > 0   1/(max-min) for a continuous attribute, 1/noOfValues for an
           ordered discrete attribute
     -1.0  unordered discrete attribute
     0.0   nothing usable (no data, max == min, no values, other type)

   ignoreClass  with sources 1 and 3 the last entry (the class) is left out;
                source 2 does not consult it
   no           copied into normalize
   iu           copied into ignoreUnknowns
   egen         supplies the domain; domainVersion is its domain's version,
                or -1 when egen is null
   weightID     passed to TDomainBasicAttrStat when bstat is computed here
   ddist        per-attribute distributions (source 2)
   bstat        per-attribute basic statistics (sources 1 and 3)
*/
TExamplesDistance_Normalized::TExamplesDistance_Normalized(const bool &ignoreClass, const bool &no, const bool &iu, PExampleGenerator egen, const int &weightID, PDomainDistributions ddist, PDomainBasicAttrStat bstat)
: normalizers(mlnew TAttributedFloatList()),
  bases(mlnew TAttributedFloatList()),
  averages(mlnew TAttributedFloatList()),
  variances(mlnew TAttributedFloatList()),
  domainVersion(egen ? egen->domain->version : -1),
  normalize(no),
  ignoreUnknowns(iu)
{
  PVarList varlist;

  // No precomputed statistics at all: compute basic statistics from the examples.
  if (!bstat && !ddist && egen)
    bstat = mlnew TDomainBasicAttrStat(egen, weightID);

  if (bstat && egen) {
    varlist = ignoreClass ? egen->domain->attributes : egen->domain->variables;

    TDomainBasicAttrStat::const_iterator si(bstat->begin()), ei(bstat->end());
    TVarList::const_iterator vi(egen->domain->variables->begin()), evi(egen->domain->variables->end());
    if (ignoreClass && egen->domain->classVar) {
      evi--;
      // Do not step before begin() when the statistics are empty; a length
      // mismatch is reported after the loop.
      if (si != ei)
        ei--;
    }

    for(; (vi!=evi) && (si!=ei); si++, vi++) {
      if ((*vi)->varType==TValue::FLOATVAR) {
        if (*si && ((*si)->n>0)) {
          normalizers->push_back((*si)->max!=(*si)->min ? 1.0/((*si)->max-(*si)->min) : 0.0);
          bases->push_back((*si)->min);
          averages->push_back((*si)->avg);
          variances->push_back((*si)->dev * (*si)->dev);
        }
        else {
          normalizers->push_back(0.0);
          bases->push_back(0.0);
          averages->push_back(0.0);
          variances->push_back(0.0);
        }
      }
      else if ((*vi)->varType==TValue::INTVAR) {
        if ((*vi)->ordered) {
          if ((*vi)->noOfValues()>0)
            normalizers->push_back(1.0/(*vi)->noOfValues());
          else
            normalizers->push_back(0.0);
        }
        else
          normalizers->push_back(-1.0);
        bases->push_back(0.0);
        averages->push_back(0.0);
        variances->push_back(0.0);
      }
      else {
        normalizers->push_back(0.0);
        bases->push_back(0.0);
        averages->push_back(0.0);
        variances->push_back(0.0);
      }
    }

    if ((vi!=evi) || (si!=ei))
      raiseError("lengths of domain and basic attribute statistics do not match");
  }

  else if (ddist) {
    varlist = mlnew TVarList;

    PITERATE(TDomainDistributions, ci, ddist) {
      if (*ci) {
        const PVariable &vi = (*ci)->variable;
        varlist->push_back(vi);

        if (vi->varType==TValue::FLOATVAR) {
          TContDistribution *dcont = (*ci).AS(TContDistribution);
          if (dcont && (dcont->begin() != dcont->end())) {
            const float min = (*dcont->distribution.begin()).first;
            const float dif = (*dcont->distribution.rbegin()).first - min;
            normalizers->push_back(dif > 0.0 ? 1.0/dif : 0.0);
            bases->push_back(min);
            averages->push_back(dcont->average());
            variances->push_back(dcont->var());
          }
          else {
            normalizers->push_back(0.0);
            // bases must get an entry here too, or it falls out of step with
            // the other three tables for all following attributes.
            bases->push_back(0.0);
            averages->push_back(0.0);
            variances->push_back(0.0);
          }
        }
        else if (vi->varType==TValue::INTVAR) {
          if (vi->ordered) {
            const int nval = vi->noOfValues();
            normalizers->push_back(nval ? 1.0/float(nval) : 0.0);
          }
          else
            normalizers->push_back(-1.0);
          bases->push_back(0.0);
          averages->push_back(0.0);
          variances->push_back(0.0);
        }
        else {
          normalizers->push_back(0.0);
          bases->push_back(0.0);
          averages->push_back(0.0);
          variances->push_back(0.0);
        }
      }
      else {
        // Null distribution: zero entries are stored, but no variable is
        // added to varlist for it.
        normalizers->push_back(0.0);
        bases->push_back(0.0);
        averages->push_back(0.0);
        variances->push_back(0.0);
      }
    }
  }

  else if (bstat) {
    varlist = mlnew TVarList;

    TDomainBasicAttrStat::const_iterator si(bstat->begin()), ei(bstat->end());
    // can't check it, but suppose there is a class attribute
    // (guarded so that an empty statistics list is not decremented past begin())
    if (ignoreClass && (si != ei))
      ei--;

    for(; si!=ei; si++) {
      if (!*si)
        raiseError("cannot compute normalizers from BasicAttrStat in presence of non-continuous attributes");

      varlist->push_back((*si)->variable);
      if (((*si)->n>0) && ((*si)->max!=(*si)->min)) {
        normalizers->push_back(1.0/((*si)->max-(*si)->min));
        bases->push_back((*si)->min);
        averages->push_back((*si)->avg);
        variances->push_back((*si)->dev * (*si)->dev);
      }
      else {
        normalizers->push_back(0.0);
        bases->push_back(0.0);
        averages->push_back(0.0);
        variances->push_back(0.0);
      }
    }
  }

  else
    raiseError("no data");

  // All four tables share the same variable list.
  normalizers->attributes = bases->attributes = averages->attributes = variances->attributes = varlist;
}
/* Returns a vector of normalized differences between the two examples.
The quick checks do not guarantee that the examples' domain is really the same as the training domain.
To be on the safe side, we would have to know the training domain and convert both examples to it, but that would be too slow.
*/
void TExamplesDistance_Normalized::getDifs(const TExample &e1, const TExample &e2, vector<float> &difs) const
{ checkProperty(normalizers);
if (e1.domain!=e2.domain)
raiseError("examples are from different domains");
if (domainVersion>=0
? (domainVersion != e1.domain->version)
: ((normalizers->size() > e1.domain->variables->size()) || (normalizers->size()< e1.domain->attributes->size())))
raiseError("examples are from a wrong domain");
difs = vector<float>(normalizers->size(), 0.0);
vector<float>::iterator di(difs.begin());
TExample::const_iterator i1(e1.begin()), i2(e2.begin());
for(TFloatList::const_iterator si(normalizers->begin()), se(normalizers->end()); si!=se; si++, i1++, i2++, di++)
if ((*i1).isSpecial() || (*i2).isSpecial())
*di = ((*si!=0) && !ignoreUnknowns) ? 0.5 : 0.0;
else
if (normalize) {
if (*si>0) {
if ((*i1).varType == TValue::FLOATVAR)
*di = *si * fabs((*i1).floatV - (*i2).floatV);
else if ((*i1).varType == TValue::INTVAR)
*di = *si * fabs(float((*i1).intV - (*i2).intV));
}
else if (*si<0)
*di = (*i1).compatible(*i2) ? 0.0 : 1.0;
}
else {
if ((*i1).varType == TValue::FLOATVAR) {
*di = fabs((*i1).floatV - (*i2).floatV);
}
else
if (*si>0) {
if ((*i1).varType == TValue::INTVAR)
*di = fabs(float((*i1).intV - (*i2).intV));
else
*di = (*i1).compatible(*i2) ? 0.0 : 1.0;
}
}
}
void TExamplesDistance_Normalized::getNormalized(const TExample &e1, vector<float> &normalized) const
{
checkProperty(normalizers);
checkProperty(bases);
if ( domainVersion>=0
? (domainVersion != e1.domain->version)
: ((normalizers->size() > e1.domain->variables->size()) || (normalizers->size()< e1.domain->attributes->size())))
raiseError("example is from a wrong domain");
normalized.clear();
TExample::const_iterator ei(e1.begin());
for(TFloatList::const_iterator normi(normalizers->begin()), norme(normalizers->end()), basi(bases->begin()); normi!=norme; ei++, normi++, basi++) {
// changed by PJ
/*
if ((*ei).isSpecial())
normalized.push_back(numeric_limits<float>::quiet_NaN());
else
if ((*normi>0) && ((*ei).varType == TValue::FLOATVAR))
normalized.push_back(normalize ? ((*ei).floatV - *basi) / *normi : (*ei).floatV);
else
normalized.push_back(-1.0);
*/
if ((*ei).isSpecial() || ei->varType != TValue::FLOATVAR)
normalized.push_back(numeric_limits<float>::signaling_NaN());
else
if (*normi>0 && normalize)
normalized.push_back(((*ei).floatV - *basi) * (*normi));
else
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -