Suppose,
I am splitting 2590400 KB (approx 2.5 GB) file in 30 parts.
It will produce 30 files with size of 86347 KB.
Which seems correct, 2590400/30 = 86346.66666667
Now if I merge all the parts (30) again it is producing the file of 3453873 KB file, which should be 2590410 KB.
Can anyone help me why this difference is there? I am using below code for merge and split files.
SplitFile.java
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
/**
* @author vishal.zanzrukia
*
*/
public class SplitFile {
public static final String INPUT_FILE = "D:\\me\\projects\\input\\file\\path.txt";
public static final int NUMBER_OF_OUTPUT_FILES = 30;
public static final String FILE_SUFFIX = ".txt";
/**
* split file
*
* @throws Exception
*/
static void splitFile() throws Exception{
File inputFile = new File(INPUT_FILE + "_Splits");
inputFile.mkdir();
RandomAccessFile raf = new RandomAccessFile(INPUT_FILE, "r");
long sourceSize = raf.length();
long bytesPerSplit = sourceSize / NUMBER_OF_OUTPUT_FILES;
long remainingBytes = sourceSize % NUMBER_OF_OUTPUT_FILES;
int maxReadBufferSize = 8 * 1024; // 8KB
for (int destIx = 1; destIx <= NUMBER_OF_OUTPUT_FILES; destIx++) {
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\split." + destIx + FILE_SUFFIX));
if (bytesPerSplit > maxReadBufferSize) {
long numReads = bytesPerSplit / maxReadBufferSize;
long numRemainingRead = bytesPerSplit % maxReadBufferSize;
for (int i = 0; i < numReads; i++) {
readWrite(raf, bw, maxReadBufferSize);
}
if (numRemainingRead > 0) {
readWrite(raf, bw, numRemainingRead);
}
} else {
readWrite(raf, bw, bytesPerSplit);
}
bw.close();
}
if (remainingBytes > 0) {
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream("split." + NUMBER_OF_OUTPUT_FILES + 1));
readWrite(raf, bw, remainingBytes);
bw.close();
}
raf.close();
}
/**
* join file
*
* @throws Exception
*/
static void joinFiles() throws Exception{
int maxReadBufferSize = 8 * 1024;
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\fullJoin" + FILE_SUFFIX));
File inputFileDir = new File(INPUT_FILE + "_Splits");
RandomAccessFile raf = null;
if(inputFileDir.isDirectory()){
for(File file : inputFileDir.listFiles()){
raf = new RandomAccessFile(file, "r");
long numReads = raf.length() / maxReadBufferSize;
long numRemainingRead = raf.length() % maxReadBufferSize;
for (int i = 0; i < numReads; i++) {
readWrite(raf, bw, maxReadBufferSize);
}
if (numRemainingRead > 0) {
readWrite(raf, bw, numRemainingRead);
}
raf.close();
}
}
bw.close();
}
public static void mergeFiles() {
File[] files = new File[NUMBER_OF_OUTPUT_FILES];
for(int i=1;i<=NUMBER_OF_OUTPUT_FILES;i++){
files[i-1] = new File(INPUT_FILE + "_Splits\\split."+i+FILE_SUFFIX);
}
String mergedFilePath = INPUT_FILE + "_Splits\\fullJoin" + FILE_SUFFIX;
File mergedFile = new File(mergedFilePath);
mergeFiles(files, mergedFile);
}
public static void mergeFiles(File[] files, File mergedFile) {
FileWriter fstream = null;
BufferedWriter out = null;
try {
fstream = new FileWriter(mergedFile, true);
out = new BufferedWriter(fstream);
} catch (IOException e1) {
e1.printStackTrace();
}
for (File f : files) {
System.out.println("merging: " + f.getName());
FileInputStream fis;
try {
fis = new FileInputStream(f);
BufferedReader in = new BufferedReader(new InputStreamReader(fis));
String aLine;
while ((aLine = in.readLine()) != null) {
out.write(aLine);
out.newLine();
}
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
try {
out.close();
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws Exception {
// splitFile();
mergeFiles();
}
static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
byte[] buf = new byte[(int) numBytes];
int val = raf.read(buf);
if (val != -1) {
bw.write(buf);
}
}
}
Use your joinFiles
method: don't try to read a file by line-by-line using a Reader
if you want to keep it exactly like it was, because line endings may differ by platform.
Instead read them as a binary file using an InputStream
or RandomAccessFile
and write using an OutputStream
.
The only problem in your joinFiles
method is that it used File.listFiles()
which makes no guarantees about the order in which the files are returned.
I combined your mergeFiles()
code with joinFiles()
to make this work (remember to invoke joinFiles()
instead of mergeFiles()
from your main
method)
static void joinFiles(File[] files) throws Exception {
int maxReadBufferSize = 8 * 1024;
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\fullJoin"
+ FILE_SUFFIX));
RandomAccessFile raf = null;
for (File file : files) {
raf = new RandomAccessFile(file, "r");
long numReads = raf.length() / maxReadBufferSize;
long numRemainingRead = raf.length() % maxReadBufferSize;
for (int i = 0; i < numReads; i++) {
readWrite(raf, bw, maxReadBufferSize);
}
if (numRemainingRead > 0) {
readWrite(raf, bw, numRemainingRead);
}
raf.close();
}
bw.close();
}
public static void joinFiles() throws Exception {
File[] files = new File[NUMBER_OF_OUTPUT_FILES];
for (int i = 1; i <= NUMBER_OF_OUTPUT_FILES; i++) {
files[i - 1] = new File(INPUT_FILE + "_Splits\\split." + i + FILE_SUFFIX);
}
joinFiles(files);
}