Search code examples
pythonrandomquicksort

Python histogram of Randomized Quicksort Comparisons per Run


I have to implement the Las Vegas Randomized Quicksort algorithm and count the number of comparisons for each run to sort a random list of integers and create a histogram for the obtained valus with the number of runs being 10^4.

I am having trouble with the histogram, as it shows somethig this:

enter image description here

Instead of a distribution similar to this:

enter image description here

Here is the code I imagined. The number of comparisons is correct.

import random
import numpy as np
import matplotlib.pyplot as plt

def _inPlaceQuickSort(A, start, end):
    count = 0
    if start < end:
        pivot = randint(start, end)
        temp = A[end]
        A[end] = A[pivot]
        A[pivot] = temp

        p, count = _inPlacePartition(A, start, end)
        count += _inPlaceQuickSort(A, start, p - 1)
        count += _inPlaceQuickSort(A, p + 1, end)
    return count


def _inPlacePartition(A, start, end):

    count = 0
    pivot = randint(start, end)
    temp = A[end]
    A[end] = A[pivot]
    A[pivot] = temp
    newPivotIndex = start - 1
    for index in range(start, end):

        count += 1
        if A[index] < A[end]:  # check if current val is less than pivot value
            newPivotIndex = newPivotIndex + 1
            temp = A[newPivotIndex]
            A[newPivotIndex] = A[index]
            A[index] = temp

    temp = A[newPivotIndex + 1]
    A[newPivotIndex + 1] = A[end]
    A[end] = temp
    return newPivotIndex + 1, count 

if __name__ == "__main__": 
    comp = []
    for i in range(10):
        A={}
        for j in range(0, 10000):
            A[j] = random.randint(0, 10000)
        comp.append(_inPlaceQuickSort(A, 0, len(A) - 1)) 
    print(comp[i])

    plt.hist(comp, bins=50)
    plt.gca().set(title='|S|=10^4, Run=10^4', xlabel='Compares', ylabel='Frequency')

Solution

  • As pointed out by @Tom De Coninck and @pjs your problem is the sample size and, as you mentioned in a comment, if you increase your sample size it'll take a lot of time to generate it.

    My idea would be to generate the data with a C++ software (much faster) and then plotting it with Python. With that I can generate and plot 10000 runs in less than 20 seconds.

    Here it's my code (the quicksort algorithm was adapted from C++ Program for QuickSort - GeeksforGeeks)

    The C++ code generate out.txt containing the total number of comparisons for each run separated by a newline. The Python script read the lines and plot them (with various bucket sizes, as the assignment states)

    C++ Generator

    // g++ ./LVQuickSort.cpp -o lvquicksort
    
    #include <iostream>
    #include <fstream>
    #include <cstdlib>
    
    int ARRAY_TO_SORT_SIZE = 10000;
    int RUNS = 10000;
    
    void swap(int *a, int *b)
    {
      int t = *a;
      *a = *b;
      *b = t;
    }
    
    int partition(int arr[], int low, int high, int &comps)
    {
      int pivot = arr[(rand() % (high - low)) + low];
      int i = low - 1;
    
      for (int j = low; j <= high - 1; j++)
      {
        comps++;
        if (arr[j] <= pivot)
        {
          i++;
          swap(&arr[i], &arr[j]);
        }
      }
      swap(&arr[i + 1], &arr[high]);
      return i + 1;
    }
    
    void quickSort(int arr[], int low, int high, int &comps)
    {
      if (low < high)
      {
        int pi = partition(arr, low, high, comps);
    
        quickSort(arr, low, pi - 1, comps);
        quickSort(arr, pi + 1, high, comps);
      }
    }
    
    std::ofstream file;
    
    void write_comps_to_file(int comps)
    {
      file << comps << std::endl;
    }
    
    int main()
    {
      file.open("./out.txt", std::fstream::trunc);
    
      for (size_t i = 0; i < RUNS; i++)
      {
        int *arr = (int *)malloc(sizeof(int) * ARRAY_TO_SORT_SIZE);
        for (int i = 0; i < ARRAY_TO_SORT_SIZE; i++)
          arr[i] = rand() % 1000;
    
        int comps = 0;
    
        if (i % (RUNS / 50) == 0)
          std::cout << i << "/" << RUNS<< std::endl;
    
        quickSort(arr, 0, ARRAY_TO_SORT_SIZE - 1, comps);
        write_comps_to_file(comps);
      }
    
      file.close();
    }
    

    Python plotter

    import matplotlib.pyplot as plt
    
    f = open('out.txt', 'r')
    
    binCounts = [10, 50, 100, 200, 1000, 5000]
    
    for binCount in binCounts:
      vals = []
      f.seek(0)
      for line in f.readlines():
        vals.append(int(line))
    
      plt.hist(vals, bins=binCount)
      plt.gca().set(title='|S|=10^4 | Runs=10^4', xlabel='Comparisons', ylabel='Runs')
      plt.savefig(f'out{binCount}.png')
      plt.close()