⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 readdata.cpp

📁 orange源码 数据挖掘技术
💻 CPP
字号:
/*
    This file is part of Orange.

    Orange is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    Orange is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Orange; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Authors: Janez Demsar, Blaz Zupan, 1996--2002
    Contact: janez.demsar@fri.uni-lj.si
*/


#include <iostream>
#include <fstream>

#ifdef _MSC_VER
  #include <direct.h>
#else
  #include <unistd.h>
#endif

#include "stladdon.hpp"

#include "vars.hpp"
#include "domain.hpp"
#include "table.hpp"

#include "filegen.hpp"
#include "tabdelim.hpp"
#include "c45inter.hpp"
#include "retisinter.hpp"
#include "assistant.hpp"
#include "basket.hpp"

#include <string.h>


#ifdef INCLUDE_EXCEL
  TExampleTable *readExcelFile(char *filename, char *sheet, PVarList sourceVars, PDomain sourceDomain, bool dontCheckStored, bool dontStore);
#endif

bool fileExists(const string &s) {
  FILE *f = fopen(s.c_str(), "rt");
  if (!f)
    return false;

  fclose(f);
  return true;
}


typedef enum {UNKNOWN, TXT, CSV, BASKET, TAB, TSV, C45, RETIS, ASSISTANT, EXCEL} TFileFormats;

char *fileTypes[][2] = {{"Tab-delimited", "*.tab"}, {"Tab-delimited (simplified)", "*.txt"}, {"Comma-separated", "*.csv"},
                       {"C45", "*.names"}, {"Retis", "*.rda"}, {"Assistant", "*.dat"}, {"Basket", "*.basket"},
                       {NULL, NULL}};
                       
TExampleGenerator *readGenerator(char *filename, PVarList knownVars, TMetaVector *knownMetas, PDomain knownDomain, bool dontCheckStored, bool dontStore, const char *DK, const char *DC, bool noExcOnUnknown = false, bool noCodedDiscrete = false, bool noClass = false)
{ char *ext, *hash;
  if (filename) {
    for(ext = hash = filename + strlen(filename); ext!=filename; ext--) {
      if (*ext == '.')
        break;
      else if ((*ext=='/') || (*ext=='\\') || (*ext==':')) {
        ext = NULL;
        break;
      }
      else if (!*hash && (*ext == '#'))
        hash = ext;
    }
    if (ext==filename)
      ext = NULL;
  }
  else
    ext = NULL;

  // If the extension is given, we simply determine the format and load the files
  if (ext) {
    if (!strcmp(ext, ".txt"))
      return mlnew TTabDelimExampleGenerator(filename, true, false, knownVars, knownMetas, knownDomain, dontCheckStored, dontStore, DK, DC, noCodedDiscrete, noClass);

    if (!strcmp(ext, ".csv"))
      return mlnew TTabDelimExampleGenerator(filename, true, true, knownVars, knownMetas, knownDomain, dontCheckStored, dontStore, DK, DC, noCodedDiscrete, noClass);

    if (!strcmp(ext, ".tab") || !strcmp(ext, ".tsv"))
      return mlnew TTabDelimExampleGenerator(filename, false, false, knownVars, knownMetas, knownDomain, dontCheckStored, dontStore, DK, DC);

    if (!strcmp(ext, ".basket"))
      return mlnew TBasketExampleGenerator(filename, knownDomain, dontCheckStored, dontStore);

    if (!strcmp(ext, ".data") || !strcmp(ext, ".names") || !strcmp(ext, ".test"))
      return mlnew TC45ExampleGenerator(strcmp(ext, ".names") ? filename : string(filename, ext) + ".data",
                                                         string(filename, ext) + ".names",
                                                         knownVars, knownDomain, dontCheckStored, dontStore);

    if (!strcmp(ext, ".rda") || !strcmp(ext, ".rdo"))
      return mlnew TRetisExampleGenerator(string(filename, ext) + ".rda",
                                                           string(filename, ext) + ".rdo",
                                                           knownVars, knownDomain, dontCheckStored, dontStore);

    if (!strcmp(ext, ".dat")) {
      char *stem;
      for(stem = ext; (stem!=filename) && (*stem!=':') && (*stem!='\\'); stem--);
      if (stem!=filename)
        stem++;
      if (!strncmp(stem, "asd", 3) || ( (stem[3]!='o') && (stem[4]!='a') ))
        raiseError("invalid assistant filename (it should start with 'asdo' or 'asda')");

      stem += 3;
      return mlnew TAssistantExampleGenerator(string(filename, stem) + "a" + string(stem+1, ext), 
                                                               string(filename, stem) + "o" + string(stem+1, ext),
                                                               knownVars, knownDomain, dontCheckStored, dontStore);
    }

    #ifdef INCLUDE_EXCEL
    if ((hash-ext==4) && !strncmp(ext, ".xls", 4))
      return readExcelFile(filename, hash, knownVars, knownDomain, dontCheckStored, dontStore);
    #endif
  }

  /* If no filename is given at all, assume that the stem equals the last
     subdirectory name. Eg, the directory c:\data\monk1 is supposed to
     contain a file monk1 in one of the supported formats. */
  char *ep;
  if (!filename) {
    #ifdef _MSC_VER
      char dirName[_MAX_PATH];
      _getcwd(dirName, _MAX_PATH);
      ep = dirName + strlen(dirName);
      for(filename = ep; (*filename != '\\') && (*filename != '/'); filename--);
    #else
      char dirName[256];
      getcwd(dirName, 256);
      ep = dirName + strlen(dirName);
      for(filename = ep; *filename != '/'; filename--);
    #endif

    if ((filename == ep ) || (filename == ep-1))
      raiseError("filename not given and cannot be concluded from the working directory");

    filename++;
    hash = filename + strlen(filename);
  }

  int fileFormat = UNKNOWN;
  // CHECKFF(file extension, format name)
  #define CHECKFF(fext,ff) \
     if (fileExists(string(filename)+fext)) \
       if (fileFormat != UNKNOWN) \
         raiseError("Multiple files with stem '%s' exist; specify the complete file name", filename); \
       else \
         fileFormat = ff;
              
  CHECKFF(".txt", TXT);
  CHECKFF(".csv", CSV);
  CHECKFF(".basket", BASKET);
  CHECKFF(".tab", TAB);
  CHECKFF(".tsv", TSV);
  CHECKFF(".names", C45);
  CHECKFF(".rdo", RETIS);

  #ifdef INCLUDE_EXCEL
    if (*hash) {
      *hash = 0;
      CHECKFF(".xls", EXCEL);
      *hash = '#';
    }
    else
      CHECKFF(".xls", EXCEL);
  #endif

  #undef CHECKFF

  if (fileFormat == UNKNOWN) {
    if (noExcOnUnknown)
      return NULL;
    else
      if (ext)
        raiseError("unknown file format for file '%s' or file not found", filename);    
      else
        raiseError("file '%s' is not found or has unknown extension", filename);
  }


  /* Assistant is annoying: if path+stem is given, asd[ao] must be inserted in between */
  char *stem;
  #ifdef _MSC_VER
  for(stem = filename+strlen(filename); (stem != filename) && (*stem != '\\') && (*stem != ':') && (*stem != '/'); stem--);
  #else
  for(stem = filename+strlen(filename); (stem != filename) && (*stem != '/'); stem--);
  #endif
  if (stem!=filename)
    stem++;
  
  if (fileExists(string(filename, stem) + "asdo" + string(stem)+".dat"))
    if (fileFormat != UNKNOWN)
      raiseError("Multiple files with stem '%s' exist; specify the complete file name", filename);
    else
      fileFormat = ASSISTANT;


  string sfilename(filename);

  switch (fileFormat) {
    case TXT: 
      return mlnew TTabDelimExampleGenerator(sfilename+".txt", true, false, knownVars, knownMetas, knownDomain, dontCheckStored, dontStore, DK, DC, noCodedDiscrete, noClass);

    case CSV:
      return mlnew TTabDelimExampleGenerator(sfilename+".csv", true, true, knownVars, knownMetas, knownDomain, dontCheckStored, dontStore, DK, DC, noCodedDiscrete, noClass);

    case TAB:
      return mlnew TTabDelimExampleGenerator(sfilename+".tab", false, false, knownVars, knownMetas, knownDomain, dontCheckStored, dontStore, DK, DC);

    case TSV:
      return mlnew TTabDelimExampleGenerator(sfilename+".tsv", false, false, knownVars, knownMetas, knownDomain, dontCheckStored, dontStore, DK, DC);

    case BASKET:
      return mlnew TBasketExampleGenerator(sfilename+".basket", knownDomain, dontCheckStored, dontStore);

    case C45:
      return mlnew TC45ExampleGenerator(sfilename + ".data", sfilename + ".names", knownVars, knownDomain, dontCheckStored, dontStore);

    case RETIS:
      return mlnew TRetisExampleGenerator(sfilename + ".rda", sfilename + ".rdo", knownVars, knownDomain, dontCheckStored, dontStore);

    case ASSISTANT: {
      return mlnew TAssistantExampleGenerator(string(filename, stem) + "asda" + string(stem)+".dat",
                                                               string(filename, stem) + "asdo" + string(stem)+".dat",
                                                               knownVars, knownDomain, dontCheckStored, dontStore);
    }

    #ifdef INCLUDE_EXCEL
    case EXCEL:
      return readExcelFile(filename, hash, knownVars, knownDomain, dontCheckStored, dontStore);
    #endif

    default:
      if (noExcOnUnknown)
        return NULL;
      else
        raiseError("unknown file format for file '%s'", filename);
  }
  return NULL;
}


TExampleTable *readTable(char *filename, PVarList knownVars, TMetaVector *knownMetas, PDomain knownDomain, bool dontCheckStored, bool dontStore, const char *DK, const char *DC, bool noExcOnUnknown = false, bool noCodedDiscrete = false, bool noClass = false)
{
  TExampleGenerator *gen = readGenerator(filename, knownVars, knownMetas, knownDomain, dontCheckStored, dontStore, DK, DC, noExcOnUnknown, noCodedDiscrete, noClass);
  if (!gen)
    return NULL;
  TExampleTable *table = dynamic_cast<TExampleTable *>(gen);
  return table ? table : new TExampleTable(gen);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -