Search code examples
javafileiomergesplit

Splitting and Merging large files (size in GB) in Java


Suppose,

  • I am splitting a 2590400 KB (approx. 2.5 GB) file into 30 parts.

  • It will produce 30 files with size of 86347 KB.
    Which seems correct, 2590400/30 = 86346.66666667

  • Now, if I merge all 30 parts again, it produces a 3453873 KB file, but it should be 2590410 KB.

Can anyone help me understand why there is this difference? I am using the code below to split and merge the files.

SplitFile.java

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.io.UncheckedIOException;

/**
 * @author vishal.zanzrukia
 * 
 */
public class SplitFile {

    public static final String INPUT_FILE = "D:\\me\\projects\\input\\file\\path.txt";
    public static final int NUMBER_OF_OUTPUT_FILES = 30;
    public static final String FILE_SUFFIX = ".txt";

    /**
     * split file
     * 
     * @throws Exception
     */
    static void splitFile() throws Exception{

        File inputFile = new File(INPUT_FILE + "_Splits");
        inputFile.mkdir();

        RandomAccessFile raf = new RandomAccessFile(INPUT_FILE, "r");

        long sourceSize = raf.length();
        long bytesPerSplit = sourceSize / NUMBER_OF_OUTPUT_FILES;
        long remainingBytes = sourceSize % NUMBER_OF_OUTPUT_FILES;

        int maxReadBufferSize = 8 * 1024; // 8KB
        for (int destIx = 1; destIx <= NUMBER_OF_OUTPUT_FILES; destIx++) {
            BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\split." + destIx + FILE_SUFFIX));
            if (bytesPerSplit > maxReadBufferSize) {
                long numReads = bytesPerSplit / maxReadBufferSize;
                long numRemainingRead = bytesPerSplit % maxReadBufferSize;
                for (int i = 0; i < numReads; i++) {
                    readWrite(raf, bw, maxReadBufferSize);
                }
                if (numRemainingRead > 0) {
                    readWrite(raf, bw, numRemainingRead);
                }
            } else {
                readWrite(raf, bw, bytesPerSplit);
            }
            bw.close();
        }
        if (remainingBytes > 0) {
            BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream("split." + NUMBER_OF_OUTPUT_FILES + 1));
            readWrite(raf, bw, remainingBytes);
            bw.close();
        }
        raf.close();
    }

    /**
     * join file
     * 
     * @throws Exception
     */
    static void joinFiles() throws Exception{
        int maxReadBufferSize = 8 * 1024; 

        BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\fullJoin" + FILE_SUFFIX));
        File inputFileDir = new File(INPUT_FILE + "_Splits");
        RandomAccessFile raf = null;
        if(inputFileDir.isDirectory()){
            for(File file : inputFileDir.listFiles()){
                raf = new RandomAccessFile(file, "r");
                long numReads = raf.length() / maxReadBufferSize;
                long numRemainingRead = raf.length()  % maxReadBufferSize;
                for (int i = 0; i < numReads; i++) {
                    readWrite(raf, bw, maxReadBufferSize);
                }
                if (numRemainingRead > 0) {
                    readWrite(raf, bw, numRemainingRead);
                }
                raf.close();
            }
        }
        bw.close();
    }

    public static void mergeFiles() {

        File[] files = new File[NUMBER_OF_OUTPUT_FILES];
        for(int i=1;i<=NUMBER_OF_OUTPUT_FILES;i++){
            files[i-1] = new File(INPUT_FILE + "_Splits\\split."+i+FILE_SUFFIX);
        }

        String mergedFilePath = INPUT_FILE + "_Splits\\fullJoin" + FILE_SUFFIX;


        File mergedFile = new File(mergedFilePath);

        mergeFiles(files, mergedFile);
    }

    public static void mergeFiles(File[] files, File mergedFile) {

        FileWriter fstream = null;
        BufferedWriter out = null;
        try {
            fstream = new FileWriter(mergedFile, true);
             out = new BufferedWriter(fstream);
        } catch (IOException e1) {
            e1.printStackTrace();
        }

        for (File f : files) {
            System.out.println("merging: " + f.getName());
            FileInputStream fis;
            try {
                fis = new FileInputStream(f);
                BufferedReader in = new BufferedReader(new InputStreamReader(fis));

                String aLine;
                while ((aLine = in.readLine()) != null) {
                    out.write(aLine);
                    out.newLine();
                }

                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        try {
            out.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    public static void main(String[] args) throws Exception {
//      splitFile();
        mergeFiles();
    }

    static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
        byte[] buf = new byte[(int) numBytes];
        int val = raf.read(buf);
        if (val != -1) {
            bw.write(buf);
        }
    }
}

Solution

  • Use your joinFiles method: don't read a file line by line with a Reader if you want to keep it exactly as it was, because line endings may differ by platform.

    Instead read them as a binary file using an InputStream or RandomAccessFile and write using an OutputStream.

    The only problem in your joinFiles method is that it uses File.listFiles(), which makes no guarantee about the order in which the files are returned.

    I combined your mergeFiles() code with joinFiles() to make this work (remember to invoke joinFiles() instead of mergeFiles() from your main method)

    /**
     * Concatenates the given files, in array order, into
     * {@code INPUT_FILE + "_Splits\\fullJoin" + FILE_SUFFIX}.
     *
     * <p>Each file is copied in chunks of at most 8 KB via {@code readWrite}.
     *
     * @param files the parts to join, in the order they must appear in the output
     * @throws Exception if a part cannot be opened or a read/write fails
     */
    static void joinFiles(File[] files) throws Exception {
        int maxReadBufferSize = 8 * 1024;

        // try-with-resources closes the output and the current part even when a
        // read or write fails part-way through.
        try (BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\fullJoin"
                + FILE_SUFFIX))) {
            for (File file : files) {
                try (RandomAccessFile raf = new RandomAccessFile(file, "r")) {
                    long length = raf.length();
                    long numReads = length / maxReadBufferSize;
                    long numRemainingRead = length % maxReadBufferSize;
                    // long counter: numReads is a long and must not be compared against a wrapping int
                    for (long i = 0; i < numReads; i++) {
                        readWrite(raf, bw, maxReadBufferSize);
                    }
                    if (numRemainingRead > 0) {
                        readWrite(raf, bw, numRemainingRead);
                    }
                }
            }
        }
    }
    
    /**
     * Collects the split parts {@code split.1} .. {@code split.NUMBER_OF_OUTPUT_FILES}
     * in numeric order and hands them to {@code joinFiles(File[])}.
     *
     * @throws Exception if joining the parts fails
     */
    public static void joinFiles() throws Exception {
        final String partPrefix = INPUT_FILE + "_Splits\\split.";

        File[] parts = new File[NUMBER_OF_OUTPUT_FILES];
        for (int index = 0; index < parts.length; index++) {
            // part numbers are 1-based, array slots are 0-based
            parts[index] = new File(partPrefix + (index + 1) + FILE_SUFFIX);
        }

        joinFiles(parts);
    }