Search code examples
pythonpython-3.xnumpyperformance

Efficient creation of a sequence of sets from values and thresholds


Given a shortish sequence of thresholds ordered ascendantly and numerous values (unordered).

Wanted result is a sequence of sets, first one containing all distinct values below the lowest/first threshold; next values not below lowest threshold, but below 2nd threshold, if any; and so on 'til last threshold; finally all values not below highest threshold.

There are similar questions about dicts (pointers to helpful solutions there welcome, too),
suggestions amounting to

from itertools import pairwise

def partition(values, thresholds):
    """ Partition values into a list of sets 
        with values in right-open intervals specified by thresholds.
    """
    return [ { v for v in values if v < thresholds[0] }
       ] + [ { v for v in values if lo <= v < hi }
                                for lo, hi in tuple(pairwise(thresholds))
       ] + [ { v for v in values if thresholds[-1] <= v } ]

This "iterates" values len(thresholds)+1 times.

How to efficiently create a sequence of sets partitioning values according to thresholds?

I failed to find something helpful SciPy/NumPy.
Using numpy.digitize() to index an array of add() members was in the ball park of partition_Kelly3b() for non-trivial values and thresholds.


Solution

  • Test results of various solutions:

    10000 values and 3 thresholds:
      0.83 ± 0.00 ms  partition_Kelly4
      0.83 ± 0.00 ms  partition_Kelly4c
      0.83 ± 0.01 ms  partition_Kelly4a
      1.42 ± 0.02 ms  partition_Kelly3b
      1.55 ± 0.02 ms  partition_Kelly3
      1.76 ± 0.02 ms  partition_Kelly2
      1.93 ± 0.00 ms  partition_Kelly4b
      2.04 ± 0.01 ms  partition_Kelly
      2.55 ± 0.03 ms  partition_original
    
    10000 values and 10 thresholds:
      0.86 ± 0.01 ms  partition_Kelly4a
      0.87 ± 0.01 ms  partition_Kelly4c
      0.88 ± 0.02 ms  partition_Kelly4
      1.98 ± 0.03 ms  partition_Kelly4b
      2.03 ± 0.03 ms  partition_Kelly3b
      2.06 ± 0.05 ms  partition_Kelly2
      2.22 ± 0.05 ms  partition_Kelly3
      2.52 ± 0.02 ms  partition_Kelly
      6.19 ± 0.19 ms  partition_original
    
    10000 values and 100 thresholds:
      0.94 ± 0.02 ms  partition_Kelly4a
      0.97 ± 0.02 ms  partition_Kelly4
      0.99 ± 0.02 ms  partition_Kelly4c
      2.05 ± 0.03 ms  partition_Kelly4b
      2.62 ± 0.17 ms  partition_Kelly2
      3.41 ± 0.05 ms  partition_Kelly
      3.58 ± 0.33 ms  partition_Kelly3b
      3.91 ± 0.25 ms  partition_Kelly3
     60.49 ± 10.98 ms  partition_original
    
    Wrong:
      partition_dankal444
    

    Code (Attempt This Online!):

    from itertools import pairwise
    import random
    from bisect import bisect, bisect_left
    from time import perf_counter as time
    from statistics import mean, stdev
    
    
    def partition_original(values, thresholds):
        """ Partition values into a list of sets 
            with values in right-open intervals specified by thresholds.
        """
        return [ { v for v in values if v < thresholds[0] }
           ] + [ { v for v in values if lo <= v < hi }
                                    for lo, hi in tuple(pairwise(thresholds))
           ] + [ { v for v in values if thresholds[-1] <= v } ]
    
    
    def partition_Kelly(values, thresholds):
        res = [set() for _ in thresholds]
        res.append(set())
        for v in values:
            i = bisect(thresholds, v)
            res[i].add(v)
        return res
    
    
    def partition_Kelly2(values, thresholds):
        res = [set() for _ in thresholds]
        res.append(set())
        for v in set(values):
            i = bisect(thresholds, v)
            res[i].add(v)
        return res
    
    
    def partition_Kelly3(values, thresholds):
        def partition(values, thresholds):
            if not thresholds:
                return [set(values)]
            i = len(thresholds) // 2
            threshold = thresholds[i]
            hi = []
            lo = [x for x in values if x < threshold or hi.append(x)]
            return partition(lo, thresholds[:i]) + partition(hi, thresholds[i+1:])
        return partition(set(values), thresholds)
    
    
    def partition_Kelly3b(values, thresholds):
        def partition(values, thresholds):
            if not thresholds:
                return [set(values)]
            i = len(thresholds) // 2
            threshold = thresholds[i]
            hi = [].append
            lo = [x for x in values if x < threshold or hi(x)]
            return partition(lo, thresholds[:i]) + partition(hi.__self__, thresholds[i+1:])
        return partition(set(values), thresholds)
    
    
    def partition_Kelly4(values, thresholds):
        values = sorted(set(values))
        res = []
        i = 0
        for threshold in thresholds:
            j = bisect_left(values, threshold)
            res.append(set(values[i:j]))
            i = j
        res.append(set(values[i:]))
        return res
    
    
    def partition_Kelly4a(values, thresholds):
        values = sorted(set(values))
        res = []
        i = 0
        for threshold in thresholds:
            j = bisect_left(values, threshold, i)
            res.append(set(values[i:j]))
            i = j
        res.append(set(values[i:]))
        return res
    
    
    def partition_Kelly4b(values, thresholds):
        values = sorted(values)
        res = []
        i = 0
        for threshold in thresholds:
            j = bisect_left(values, threshold)
            res.append(set(values[i:j]))
            i = j
        res.append(set(values[i:]))
        return res
    
    
    def partition_Kelly4c(values, thresholds):
        def partition(start, stop, thresholds):
            if not thresholds:
                return [set(values[start:stop])]
            i = len(thresholds) // 2
            threshold = thresholds[i]
            j = bisect_left(values, threshold, start, stop)
            return partition(start, j, thresholds[:i]) + partition(j, stop, thresholds[i+1:])
        values = sorted(set(values))
        return partition(0, len(values), thresholds)
    
    
    def partition_dankal444(values, thresholds):
        import bisect
        sets = [set() for _ in range(len(thresholds) + 1)]
    
        for value in values:
            set_idx = bisect.bisect_left(thresholds, value)
            sets[set_idx].add(value)
        return sets
    
    
    funcs = [partition_original, partition_Kelly, partition_Kelly2, partition_Kelly3, partition_Kelly3b, partition_Kelly4, partition_Kelly4a, partition_Kelly4b, partition_Kelly4c, partition_dankal444]
    wrong = set()
    
    def test(n, k, repeat):
        print(n, 'values and', k, 'thresholds:')
        t0 = time()
    
        times = {f: [] for f in funcs}
        def stats(f):
            ts = [t * 1e3 for t in sorted(times[f])[:5]]
            return f'{mean(ts):6.2f} ± {stdev(ts):4.2f} ms '
    
        for _ in range(repeat):
            values = random.choices(range(n), k=n)
            thresholds = [int((i+.5)/k * n) for i in range(k)]
            expect = none = object()
            for f in funcs:
                t = time()
                result = f(values, thresholds)
                times[f].append(time() - t)
                if expect is none:
                    expect = result
                elif result != expect:
                    wrong.add(f)
            funcs[:] = [f for f in funcs if f not in wrong]
    
        for f in sorted(funcs, key=stats):
            print(stats(f), f.__name__)
    
        print()#time() - t0)
    
    test(10**4, 3, 100)
    test(10**4, 10, 80)
    test(10**4, 10**2, 25)
    print('Wrong:')
    for f in wrong:
        print(' ', f.__name__)