Search code examples
javasortingexternalmergesort

How do i do external merge sort on binary file


This short program makes a binary file called data.bin with a bunch of randomly generated items that are 1024 bytes each. where the first 24 bytes of each item is the key. so how do i read each item from the data.bin file and do an external merge sort on it all?

import java.math.*;
import java.io.*;
import java.util.*;

public class CreateRandomDataFile {

   public static void main(String[] args) throws Exception {

      RandomAccessFile data = new RandomAccessFile("data.bin","rws");
      Random r = new Random(482010);
      Scanner input = new Scanner(System.in);
      int number = 100000;
      byte[] item = new byte[1024];

      System.out.print("How many items (perhaps 800000)\n> ");
      number = input.nextInt();

      for (int i=0; i<number; i++) {
         r.nextBytes(item);
         data.write(item);
      }
      data.close();
        System.out.println("Done");
   }
}  

Solution

  • look at http://www.vogella.com/articles/JavaAlgorithmsMergesort/article.html

    Making this work on any object with a comparator:

        package com.sel2in.sort;
    
        import java.util.ArrayList;
        import java.util.Arrays;
        import java.util.Comparator;
        import java.util.List;
    
        /** based on http://www.vogella.com/articles/JavaAlgorithmsMergesort/article.html */
        public class MergeSort<T> {
    
            private List<T> items;
            private List<T> helper;
    
            private Comparator<T> comprtr;
            private int cnt;
    
            public void sort(T[] values, Comparator<T> comprtr) {
                items = new ArrayList<T>();
                items.addAll(Arrays.asList(values));
    
                cnt = values.length;
                this.helper = new ArrayList<T>(cnt);
                this.comprtr = comprtr;
                mergesort(0, cnt - 1);
            }
    
            public void sort(List<T> values, Comparator<T> comprtr) {
                items = values;
                //items.addAll(Arrays.asList(values));
                cnt = values.size();
                this.helper = new ArrayList<T>(cnt);
                helper.addAll(items);
                this.comprtr = comprtr;
                mergesort(0, cnt - 1);
            }   
    
            private void mergesort(int low, int high) {
                // Check if low is smaller then high, if not then the array is sorted
                if (low < high) {
                    // Get the index of the element which is in the middle
                    int middle = low + (high - low) / 2;
                    // Sort the left side of the array
                    mergesort(low, middle);
                    // Sort the right side of the array
                    mergesort(middle + 1, high);
                    // Combine them both
                    merge(low, middle, high);
                }
            }
    
            private void merge(int low, int middle, int high) {
    
                // Copy both parts into the helper array
                for (int i = low; i <= high; i++) {
                    // helper[i] = items[i];
                    helper.set(i, items.get(i));
                }
    
                int i = low;
                int j = middle + 1;
                int k = low;
                // Copy the smallest values from either the left or the right side back
                // to the original array
                while (i <= middle && j <= high) {
                    int cm = comprtr.compare(helper.get(i), helper.get(j));
                    //(helper[i] <= helper[j]) 
                    if (cm <= 0) {
                        //items[k] = helper[i];
                        items.set(k, helper.get(i));
                        i++;
                    } else {
                        //items[k] = helper[j];
                        items.set(k, helper.get(j));
                        j++;
                    }
                    k++;
                }
                // Copy the rest of the left side of the array into the target array
                while (i <= middle) {
                    //items[k] = helper[i];
                    items.set(k, helper.get(i));
                    k++;
                    i++;
                }
    
            }
    
        }
    

    String Comparator that works on entry type - useful for you as you have key and rest of line, we could have made our own object did not have to use Entry but its already there

        package com.sel2in.sort;
    
        import java.util.Comparator;    
        import java.util.Map.Entry;
    
        /** compare 2 strings, taken from the key of a Map entry - Tushar Kapila */
        public class MapStrKeyComparator implements Comparator<java.util.Map.Entry<String,Object>> {
            @Override
            public int compare(Entry<String,Object> o1, Entry<String,Object> o2) {
                String s = o1.getKey().toString();
                String n = o2.getKey().toString();
                return s.compareTo(n);
    
            }
        }
    

    Test

        package com.sel2in.sort;
    
        import java.util.ArrayList;
        import java.util.Comparator;
        import java.util.HashMap;
        import java.util.List;
        import java.util.Map;
        import java.util.Map.Entry;
        import java.util.Set;
    
        public class TstMege {
    
            /**
             * @param args
             */
            public static void main(String[] args) {
                Map<String, String>mp = new HashMap<String, String>();
    
                //here instead of the test values load from your file and seperate key and value
                mp.put("tt", "nxcn3er3e33");
                mp.put("aa", "gjkrmt454");
                mp.put("zz", "rft56GDD");
                mp.put("zz", "rft56GDD");
                //sort
                Set<Entry<String, String>> entries = mp.entrySet();
                MapStrKeyComparator cpmr = new MapStrKeyComparator();
                MergeSort sorter = new MergeSort();
                //sort. sort(List<T> values, Comparator<T> comprtr)
                ArrayList<Entry<String, String>> lst = new ArrayList<Entry<String, String>>(entries.size());
                lst.addAll(entries);
                System.out.println("before " + lst);
                sorter.sort(lst, cpmr);
                System.out.println("sorted " + lst);
    
            }
    
        }
    

    Output

    before [tt=nxcn3er3e33, zz=rft56GDD, aa=gjkrmt454]
    sorted [aa=gjkrmt454, tt=nxcn3er3e33, zz=rft56GDD]
    

    Our own entry class

    package com.sel2in.sort;
    
    import java.util.Map.Entry;
    
    public class TEntry<K,V> implements Entry<K, V> {
        final K key;
        V value;
    
        TEntry(){
            key = null;
        }
    
        TEntry(K k){
            key = k;
        }    
    
        TEntry(K k, V v) {
            value = v;
            key = k;
        }    
    
    
    
        @Override
        public K getKey() {
            return key;
        }
    
        @Override
        public V getValue() {
            return value;
        }
    
        @Override
        public V setValue(V newValue) {
            V oldValue = value;
            value = newValue;
            return oldValue;
        }
    
        public String toString(){
            return key + ":" + value;
        }
    
        //can copy hashcode , equals but not important now
    
    }
    

    File read binary and sort

        package com.sel2in.sort;
    
        import java.io.RandomAccessFile;
        import java.util.ArrayList;
        import java.util.Arrays;
        import java.util.Map.Entry;
    
            public class TstFileMegeSort {
    
                /**
                 * @param args
                 */
                public static void main(String[] args) {
                    //Map<String, String>mp = new HashMap<String, String>();
                    ArrayList<Entry<String, String>> lst = new ArrayList<Entry<String, String>> (); 
                    TEntry<String, String> en = null;
                    //here instead of the test values load from your file and seperate key and value
                    try {
                        RandomAccessFile data = new RandomAccessFile("data.bin","rws");
                        long l = data.length();
                        long recs = l / 1024;
                        long cnt = 0;
                        byte []b = new byte[1024];
    
                        while(cnt < recs){
                            cnt++;
                            data.readFully(b);
                            byte []key = Arrays.copyOfRange(b, 0, 24);
                            byte []value = Arrays.copyOfRange(b, 24, 1024);
                            en = new TEntry<String, String>(new String(key), new String(value));
                            lst.add(en);
                        }
    
    
                    } catch (Exception e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
    
                    //sort
                    //Set<Entry<String, String>> entries = mp.entrySet();
                    MapStrKeyComparator cpmr = new MapStrKeyComparator();
                    MergeSort sorter = new MergeSort();
                    //sort. sort(List<T> values, Comparator<T> comprtr)
                    //ArrayList<Entry<String, String>> lst = new ArrayList<Entry<String, String>>(entries.size());
                    //lst.addAll(entries);
                    System.out.println("before " + lst);
                    sorter.sort(lst, cpmr);
                    System.out.println("sorted " + lst);
    
                }
    
            }