Search code examples
pythonjsonbinary-search

Binary search for first occurrence in a dictionary list


So I am dealing with large datasets, n>1000000. The data contains order information about an item. There is a boolean within the JSON formatted order called is_buy_order. I would like to split the list of orders into two separate lists depending if the boolean is true or false.

I have come up with an algorithm that is flawed but is faster than iteration.

The algorithm splits the dataset in half by choosing a pivot, then it checks either side to determine which side is closer to the transition point (false -> true). It continues to half until the value either side of the pivot is different or the pivot == 1 which indicates no change.

start = time.time()
orders_file = open("resources/regions/"+x.replace(" ", "")[1:-1]+".json", 'r')
orders = orders_file.readlines()
orders_file.close()


item_buy, item_sell = [], []

pivot_found = False
print(len(orders))

if len(orders) > 1:
    while not pivot_found:
        temp_orders = orders
        pivot = len(temp_orders)//2

        if pivot == 1:
            break

        if json.loads(orders[pivot].replace("\n", ""))["is_buy_order"]:
            orders = orders[:pivot]
            buy_sell_index -= pivot
        else:
            orders = orders[pivot:]

        if json.loads(temp_orders[pivot].replace("\n", ""))["is_buy_order"] != json.loads(temp_orders[pivot-1].replace("\n", ""))["is_buy_order"]:
            pivot_found = True


item_buy, item_sell = temp_orders[:pivot], temp_orders[pivot:]
buy_sell_index = orders.index(item_sell[0])
print(x, time.time()-start, buy_sell_index) 

Below is the contents of a severely reduced dataset:

{"duration":90,"is_buy_order":false,"issued":"2018-06-09T01:52:42Z","location_id":1027547438558,"min_volume":1,"order_id":5180297455,"price":16000.0,"range":"40","system_id":30001811,"type_id":28362,"volume_remain":892,"volume_total":892}
{"duration":90,"is_buy_order":false,"issued":"2018-06-09T01:53:11Z","location_id":1027547438558,"min_volume":1,"order_id":5180297673,"price":100000.0,"range":"40","system_id":30001811,"type_id":28366,"volume_remain":907,"volume_total":907}
{"duration":90,"is_buy_order":false,"issued":"2018-06-09T01:53:42Z","location_id":1027547438558,"min_volume":1,"order_id":5180297903,"price":100000.0,"range":"40","system_id":30001811,"type_id":21815,"volume_remain":906,"volume_total":906}
{"duration":90,"is_buy_order":true,"issued":"2018-08-03T01:50:59Z","location_id":1027954902335,"min_volume":1,"order_id":5191398100,"price":4.0,"range":"5","system_id":30001780,"type_id":34,"volume_remain":10000000,"volume_total":10000000}
{"duration":90,"is_buy_order":true,"issued":"2018-08-05T07:30:18Z","location_id":1028168079013,"min_volume":1,"order_id":5221892906,"price":2250000.0,"range":"4","system_id":30001748,"type_id":25615,"volume_remain":100,"volume_total":100}
{"duration":90,"is_buy_order":true,"issued":"2018-07-21T05:23:37Z","location_id":1022958758740,"min_volume":1,"order_id":5211030090,"price":185.0,"range":"5","system_id":30001786,"type_id":204,"volume_remain":40000,"volume_total":40000}
{"duration":90,"is_buy_order":true,"issued":"2018-08-05T07:31:23Z","location_id":1028168079013,"min_volume":1,"order_id":5221893610,"price":6000.0,"range":"4","system_id":30001748,"type_id":25616,"volume_remain":1000,"volume_total":1000}
{"duration":90,"is_buy_order":true,"issued":"2018-08-05T07:27:50Z","location_id":1028168079013,"min_volume":1,"order_id":5221891669,"price":1150000.0,"range":"4","system_id":30001748,"type_id":25619,"volume_remain":200,"volume_total":200}
{"duration":90,"is_buy_order":true,"issued":"2018-07-22T17:46:06Z","location_id":1022958758740,"min_volume":1,"order_id":5212328909,"price":12.0,"range":"5","system_id":30001786,"type_id":211,"volume_remain":1000000,"volume_total":1000000}
{"duration":30,"is_buy_order":true,"issued":"2018-07-19T22:18:58Z","location_id":1028168079013,"min_volume":1,"order_id":5210158811,"price":2000000.0,"range":"5","system_id":30001748,"type_id":16278,"volume_remain":3,"volume_total":3}
{"duration":90,"is_buy_order":true,"issued":"2018-08-05T07:32:18Z","location_id":1028168079013,"min_volume":1,"order_id":5221894118,"price":65000.0,"range":"4","system_id":30001748,"type_id":25606,"volume_remain":1000,"volume_total":1000}

If the dataset needs new formatting to achieve this is a possibility.


Solution

  • This can indeed be done with a simple binary search.

    def find_first_buy_order(data):
        """
        Performs a binary search on the passed data to find the first buy order.
    
        Parameters
        ----------
        data : array_like
            List of order dictionaries
    
        Raises
        ------
        ValueError
            When the data is unsorted, or no buy order exists in the data
    
        Returns
        -------
        int
            The index in data of the first buy order
        dict
            The first buy order
        """
        low = 0
        high = len(data)
    
        # Check boundary conditions first
        if not data or not data[-1]["is_buy_order"]:
            raise ValueError("There are no buy orders in the data set!")
    
        if data[0]["is_buy_order"]:
            return 0, data[0]
    
        while low != high:
            mid = low + (high - low) // 2
    
            previous = data[mid - 1]["is_buy_order"]
            current = data[mid]["is_buy_order"]
    
            if previous != current:     # current is True, previous is False
                return mid, data[mid]
    
            if previous:                # previous is True, we need to go left
                high = mid
            else:                       # need to go right
                low = mid
    
        raise ValueError("Are you sure the data is sorted?")
    

    For your data set (I took the liberty to convert it into a list of dictionaries), I find,

    >>>idx, value = find_first_buy_order(DATA)
    
    >>>print(idx, value)
    
    <<<3 {'duration': 90, 'min_volume': 1, 'system_id': 30001780, 'type_id': 34, 'location_id': 1027954902335, 'order_id': 5191398100, 'issued': '2018-08-03T01:50:59Z', 'price': 4.0, 'volume_remain': 10000000, 'range': '5', 'is_buy_order': True, 'volume_total': 10000000}