⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mergesort.java

📁 采用外部归并排序算法对超大文本文件进行排序去重
💻 JAVA
字号:
package wjw.PreTrans;import java.io.*;import java.util.*;import org.apache.commons.io.*;public class MergeSort {  private static FastQSortAlgorithm<String> QQ = new FastQSortAlgorithm<String>();   private MergeSort() {  }  /**   * 得到指定文件的行数   * @param fileC String   * @return int   * @throws IOException   */  private static int getFileLineSize(String fileC) throws IOException {    Reader fC = null;    try {      fC = new FileReader(fileC);      LineIterator it = IOUtils.lineIterator(fC);      int lineSize = 0;      while(it.hasNext()) {        it.nextLine();        lineSize++;      }      return lineSize;    } finally {      IOUtils.closeQuietly(fC);    }  }  /**   * 得到下一行的内容,如果已到文件末尾返回NULL   * @param iterator LineIterator   * @return String   */  private static String nextLine(LineIterator iterator) {    if(iterator.hasNext()) {      return iterator.nextLine();    } else {      return null;    }  }  /**   * 读指定行数的字符串到缓冲区列表里   * @param iterator LineIterator   * @param bufList List   * @param lines int   */  private static void readLines(LineIterator iterator, List<String> bufList, int lines) {    for(int i = 0; i < lines; i++) {      if(!iterator.hasNext()) {        break;      }      bufList.add(iterator.nextLine());    }  }  /**   * 扫描fileC中的归并段并把它们交替分别送到文件fileA和fileB中,   * 本次归并段的大小为k*blockSize   * @param fileA String   * @param fileB String   * @param fileC String   * @param k int   * @param blockSize int   */  private static void split(String fileA, String fileB, String fileC, int k,                            int blockSize) throws FileNotFoundException, IOException {    boolean useA = true;    int i = 0;    List<String> bufList = new ArrayList<String>(blockSize); //大小为blockSize的缓冲区    Writer fA = null;    Writer fB = null;    Reader fC = null;    try {      fA = new BufferedWriter(new FileWriter(fileA));      fB = new BufferedWriter(new FileWriter(fileB));      fC = new FileReader(fileC);      LineIterator itC = IOUtils.lineIterator(fC);      while(itC.hasNext()) {        //->读入数据块        bufList.clear();        readLines(itC, bufList, blockSize);        //<-读入数据块        if(useA) {          IOUtils.writeLines(bufList, "\n", fA);        } else {          IOUtils.writeLines(bufList, "\n", fB);        }        if(++i == k) {          i = 0;          useA = !useA;        }      }    } finally {      bufList.clear();      IOUtils.closeQuietly(fA);      IOUtils.closeQuietly(fB);      IOUtils.closeQuietly(fC);    }  }  /**   * n为当前归并段大小(k*blockSize);将文件fX的后续归并段拷入到fY,变量currRunPos为当前归并段的索引   * @param fileX String   * @param fileY String   * @param currRunPos int   * @param n int   * @return int 当前归并段的索引   */  private static int copyTail(LineIterator fileX, Writer fileY, int currRunPos,                              int n) throws IOException {    //从当前位置到归并段结束,拷贝每个数据    while(currRunPos <= n) {      //若没有更多的数据项,则文件结束且归并段结束      if(!fileX.hasNext()) {        break;      }      //修改当前归并段位置并将数据项写入fY      currRunPos++;      IOUtils.write(fileX.nextLine() + "\n", fileY);    }    return currRunPos;  }  /**   * 将文件fA和fB中长度为n(k*blockSize)的归并段合并回fC中   * @param fileA String   * @param fileB String   * @param fileC String   * @param n int   * @throws IOException   */  private static void merge(String fileA, String fileB, String fileC, int n) throws      IOException {    //currA和currB表示在当前归并段中的位置    int currA = 1;    int currB = 1;    //分别从fA和fB中读出的数据项    String dataA, dataB;    Reader fA = null;    Reader fB = null;    Writer fC = null;    try {      fA = new FileReader(fileA);      fB = new FileReader(fileB);      fC = new BufferedWriter(new FileWriter(fileC));      LineIterator itA = IOUtils.lineIterator(fA);      LineIterator itB = IOUtils.lineIterator(fB);      dataA = nextLine(itA);      dataB = nextLine(itB);      for(; ; ) {        //若(dataA<=dataB),则将dataA拷贝到fC并修改当前归并段的位置        if(dataA.compareTo(dataB) <= 0) {          IOUtils.write(dataA + "\n", fC);          //从fA中取下一归并段,若不存在,则已到文件尾,应将fB的后续归并段拷入到fC;          //若当前位置>n,则已将所有fA的归并段拷完,应拷贝fB的后续归并段          dataA = nextLine(itA);          currA++;          if(dataA == null || currA > n) {            IOUtils.write(dataB + "\n", fC);            currB++;            currB = copyTail(itB, fC, currB, n);            //fA的大小>=fB的大小;若在fA的文件尾,则结束            if(dataA == null) {              break;            } else { //否则,应在新的归并段中,重置当前位置              currA = 1;            }            //取fB中的下一项.若不存在,则只有fA中剩余的部分要拷贝到fC,            //退出循环前将当前归并段写入fC            dataB = nextLine(itB);            if(dataB == null) {              IOUtils.write(dataA + "\n", fC);              currA = 2;              break;            } else { //否则,重置fB中当前归并段              currB = 1;            }          }        } else { //否则(dataA>dataB)          IOUtils.write(dataB + "\n", fC);          //从fB中取下一归并段,若不存在,则已到文件尾,应将fA的后续归并段拷入到fC;          //若当前位置>n,则已将所有fB的归并段拷完,应拷贝fA的后续归并段          dataB = nextLine(itB);          currB++;          if(dataB == null || currB > n) {            IOUtils.write(dataA + "\n", fC);            currA++;            currA = copyTail(itA, fC, currA, n);            //若fB中没有更多项,则置fA的当前位置,准备拷贝fA中的最后归并段到fC中            if(dataB == null) {              currA = 1;              break;            } else { //否则,置fB的当前位置,并从fA中读入数据              currB = 1;              if((dataA = nextLine(itA)) == null) {                break;              } else {                currA = 1;              }            }          }        }      } //<- end for(; ;)      //将fA中可能存在的剩余的归并段写入fC中(注:fA的长度时>=fB的)      if(dataA != null && dataB == null) {        currA = copyTail(itA, fC, currA, n);      }    } finally {      IOUtils.closeQuietly(fA);      IOUtils.closeQuietly(fB);      IOUtils.closeQuietly(fC);    }  }  /**   * 用指定的blockSize块大小,排序指定的文件fileC   * @param fileC String   * @param blockSize int   * @throws IOException   */  /**   * 用指定的blockSize块大小,排序指定的文件fileSource,排序后的文件是fileOut   * @param fileSource String   * @param fileOut String   * @param blockSize int   * @throws IOException   */  public static void sort(String fileSource,String fileOut, int blockSize) throws IOException {    String fileA = File.createTempFile("wjw", null).getAbsolutePath();    String fileB = File.createTempFile("wjw", null).getAbsolutePath();        int mergeIndex = 1;    int lineSize = getFileLineSize(fileSource);    int k = 1;    int n = k * blockSize;    boolean useA = true;    List<String> list = new ArrayList<String>(blockSize);    Writer fA = null;    Writer fB = null;    Reader fC = null;    try {      fA = new BufferedWriter(new FileWriter(fileA));      fB = new BufferedWriter(new FileWriter(fileB));      fC = new FileReader(fileSource);      LineIterator itC = IOUtils.lineIterator(fC);      if(lineSize <= blockSize) { //对于小文件,从fC读入数据,直接排序后写回文件中        readLines(itC, list, lineSize);        Collections.sort(list);        IOUtils.closeQuietly(fC);        FileUtils.writeLines(new File(fileOut), "GBK", list, "\n");        list.clear();        removeDuple(fileOut);        return;      }      //->第一次分割,合并      System.out.println("第:"+mergeIndex+"分割,合并");      while(itC.hasNext()) {        list.clear();        readLines(itC, list, blockSize);        Collections.sort(list);        if(useA) {          IOUtils.writeLines(list, "\n", fA);        } else {          IOUtils.writeLines(list, "\n", fB);        }        useA = !useA;      }      list.clear();      IOUtils.closeQuietly(fA);      IOUtils.closeQuietly(fB);      IOUtils.closeQuietly(fC);      merge(fileA, fileB, fileOut, blockSize);      mergeIndex++;      //<-第一次分割,合并      //->将当前归并段大小加倍,循环进行      k = k * 2;      n = k * blockSize;      while(n < lineSize) { //当n>=文件大小时,fC仅剩一个已排好序的归并段        System.out.println("第:"+mergeIndex+"分割,合并");        split(fileA, fileB, fileOut, k, blockSize);        merge(fileA, fileB, fileOut, n);        mergeIndex++;        k = k * 2;        n = k * blockSize;      }      //->将当前归并段大小加倍,循环进行    } finally {      IOUtils.closeQuietly(fA);      IOUtils.closeQuietly(fB);      IOUtils.closeQuietly(fC);      (new File(fileA)).delete();      (new File(fileB)).delete();    }    removeDuple(fileOut);  }  /**   * 删除已经排好序的文件中重复的数据   * @param fileC String   * @throws IOException   */  private static void removeDuple(String fileC) throws IOException {    System.out.println("去重");    Reader fC = null;    Writer fTemp = null;    File tempFile = File.createTempFile("wjw", null);    try {      fC = new FileReader(fileC);      fTemp = new BufferedWriter(new FileWriter(tempFile));      String tmpA = "";      String tmpB = "";      LineIterator itC = IOUtils.lineIterator(fC);      while(itC.hasNext()) {        tmpB = itC.nextLine();        if(tmpB.compareTo(tmpA) != 0) {          IOUtils.write(tmpB + "\n", fTemp);          tmpA = tmpB;        }      }    } finally {      IOUtils.closeQuietly(fTemp);      IOUtils.closeQuietly(fC);    }    File cFile = new File(fileC);    if(cFile.delete()) {      if(tempFile.renameTo(cFile)) {        tempFile.delete();      }    }  }  public static String formatSecond(long seconds) {    long h = seconds /(60*60);    StringBuffer sb = new StringBuffer();        sb.append(h+"小时");        seconds = seconds%(60*60);    long c = seconds /60;    sb.append(c+"分");    sb.append(seconds %60+"秒");        return sb.toString();  }  public static void main(String args[]) {    try {      String fileName = "d:/ESort.txt";      int blockSize = 100000;      long c1 = System.currentTimeMillis();      MergeSort.sort(fileName,fileName + "_SORT", blockSize);      long c2 = (System.currentTimeMillis() - c1) / 1000;      System.out.println("耗时:"+formatSecond(c2));    } catch(IOException ex) {      ex.printStackTrace();    }  }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -