python algorithm binary-search stress-testing

Binary Search with Duplicates producing wrong result at unknown key and list

I am trying to solve a Binary Search problem, where the output is the location of the first appearance of the key in a sorted list.

I have managed to solve the first part, where I find any location where the key appears in the sorted list by using Binary Search. But the second part, where its first appearance is found, is going wrong for an unknown key and sorted list.

The code for binary search with duplicates:

import math
import random

def binary_search(keys, query):
    low = 0
    high = len(keys) - 1


    def searchdef(keys, query, low, high):
        if low > high:
            return -1, 0, 0
        x = math.floor(low + (high-low)/2)
        if query != keys[x]:
            if query > keys[x]:
                for i in range(0, len(keys)-1):
                    return searchdef(keys, query, x+1, high)
            if query < keys[x]:
                return searchdef(keys, query, low, x-1)
        else:
            return x, high, low

    def finddup2(x, low):
        z = x-1
        while query == keys[z] and z != -1:
            return min(z, finddup2(z, low))
        else: return x
    
        

    x, high, low = searchdef(keys, query, low, high)
    ans = finddup2(x, low)
    return ans     



if __name__ == '__main__':
    
    num_keys = int(input())
    input_keys = list(map(int, input().split()))
    assert len(input_keys) == num_keys

    num_queries = int(input())
    input_queries = list(map(int, input().split()))
    assert len(input_queries) == num_queries

    for q in input_queries:
        print(binary_search(input_keys, q), end=' ')

I have tried to stress test the code by using a linear search, but after many checks it produces a recursion error.

Stress Test Code:

import math
import random

def binary_search(keys, query):
    low = 0
    high = len(keys) - 1


    def searchdef(keys, query, low, high):
        if low > high:
            return -1, 0, 0
        x = math.floor(low + (high-low)/2)
        if query != keys[x]:
            if query > keys[x]:
                for i in range(0, len(keys)-1):
                    return searchdef(keys, query, x+1, high)
            if query < keys[x]:
                return searchdef(keys, query, low, x-1)
        else:
            return x, high, low

    
    def finddup2(x, low):
        z = x-1
        while query == keys[z] and z != -1:
            return min(z, finddup2(z-1, low))
        else: return x
    
        

    x, high, low = searchdef(keys, query, low, high)
    ans = finddup2(x, low)
    return ans     


def search(arr, N, x):
 
    for i in range(0, N):
        if (arr[i] == x):
            return i
    return -1

if __name__ == '__main__':

    def random_no():
        num_keys = int(50)
        input_keys = random.sample(range(1, 100), 50)
        input_keys.sort()
        assert len(input_keys) == num_keys

        num_queries = int(1)
        input_queries = [random.choice(input_keys)]
        assert len(input_queries) == num_queries
        return input_keys, input_queries

    input_keys, input_queries = random_no()

    def check(input_keys, input_queries):
        for q in input_queries:
            a = binary_search(input_keys, q)
            b = search(input_keys, len(input_keys), q)
            print(a, end=' ')
            print(b, end=' ')
            while a == b:
                print(a, end=' ')
                print(b, end=' ')
                print("ok")
                input_keys, input_queries = random_no()
                check(input_keys, input_queries)
            else:
                print(input_keys)
                print(q)
                print(a, end=' ')
                print(b, end=' ')               
                print("wrong")
                
    

    check(input_keys, input_queries)

Error I'm getting:

RecursionError: maximum recursion depth exceeded in comparison

I would like to know what has gone wrong with my code, and why the stress test is also not working.

Edit:

My new code after some changes:

import math
import random

def binary_search(keys, query):
    low = 0
    high = len(keys) - 1


    def searchdef(keys, query, low, high):
        if low > high:
            return -1, high, low
        x = math.floor(low + (high-low)/2)
        if query != keys[x]:
            if query > keys[x]:
                return searchdef(keys, query, x+1, high)
            if query < keys[x]:
                return searchdef(keys, query, low, x-1)
        else:
            return x, high, low
    
    def finddup2(x, low):
        z = x-1
        if query == keys[z] and z != -1:
            return min(z, finddup2(z, low))
        else: return x
    
        

    x, high, low = searchdef(keys, query, low, high)
    ans = finddup2(x, low)
    return ans     



if __name__ == '__main__':
    
    num_keys = int(input())
    input_keys = list(map(int, input().split()))
    assert len(input_keys) == num_keys

    num_queries = int(input())
    input_queries = list(map(int, input().split()))
    assert len(input_queries) == num_queries

    for q in input_queries:
        print(binary_search(input_keys, q), end=' ')

It still returns a wrong answer after black box testing. My stress test cannot find what the value causing the wrong answer is. Could you please help me?

Edit 2: I came up with a slight change. Now, the code is returning the correct answer, but is exceeding the time limit. I request help.

import math

def binary_search(keys, query):
    low = 0
    high = len(keys) - 1


    def searchdef(keys, query, low, high):
        if low > high:
            return -1
        if query < keys[low]:
            return -1
        if query > keys[high]:
            return -1
        if low <= high :
            x = math.floor(low + (high-low)/2)
            if query == keys[x]:
                while query == keys[x-1] and x != 0:
                    x -=1
                return x
            if query > keys[x]:
                return searchdef(keys, query, x+1, high)
            if query < keys[x]:
                return searchdef(keys, query, low, x-1)    
        

    x = searchdef(keys, query, low, high)
    return x



if __name__ == '__main__':
    
    num_keys = int(input())
    input_keys = list(map(int, input().split()))
    assert len(input_keys) == num_keys

    num_queries = int(input())
    input_queries = list(map(int, input().split()))
    assert len(input_queries) == num_queries

    for q in input_queries:
        print(binary_search(input_keys, q), end=' ')

Solution

The problem with the binary search is that return min(z, finddup2(z-1, low)) skips an index: z is already one less than the current index, so you should not subtract 1 from it again. It should be return min(z, finddup2(z, low))

Secondly, when the value is not found, and x == -1, the function finddup2 should not be called. So:

    x, high, low = searchdef(keys, query, low, high)
    if x == -1:
        return -1
    ans = finddup2(x, low)
    return ans

There are many other comments to make, like why you have a while loop that returns in its first iteration. It should just be an if. Also a for loop has a return in its first iteration, so that loop should not be there at all. Making finddup2 recursive is not a good idea as this may use a lot of stack space. Make it iterative.

The problem with the stress test code is that it is calling itself recursively and causes a stack overflow.

Here is proposed correction of your testing code:

    def random_no(num_keys=50, num_queries=10):
        input_keys = random.sample(range(1, 100), num_keys)
        input_keys.sort()
        input_queries = random.choices(input_keys, k=num_queries)
        return input_keys, input_queries

    def check(input_keys, input_queries):
        for q in input_queries:
            a = binary_search(input_keys, q)
            b = search(input_keys, len(input_keys), q)
            if a != b:
                print(input_keys)
                print(q)
                print(a, b, "wrong")
                raise ValueError("wrong result")
    

    for _ in range(100):
        input_keys, input_queries = random_no()
        check(input_keys, input_queries)

    print("all done")

Time efficiency

The function finddup2 is inefficient, both in terms of space (stack) and time. In the worst case it would have to scan half of the input list, which makes it O(𝑛) and kills the advantage you would have from the binary search algorithm, which has a time complexity of O(log𝑛). But O(log𝑛) + O(𝑛) = O(𝑛), so it has degraded to the performance of a linear search.

Also using math to convert a floating point number to integer is worse than just using the integer division operator (//).

To make this efficient, you should make sure that the binary search algorithm continues to drill down the tree until it is certain to have found the leftmost occurrence of a number.

Here is an adaptation of your binary_search that doesn't need finddup2. Note that you don't need to pass keys and query as arguments to your inner function, as it has access to those names anyhow:

def binary_search(keys, query):
    low = 0
    high = len(keys) - 1

    def searchdef(low, high):
        if low > high:
            if low >= len(keys) or keys[low] != query:
                return -1
            return low
        x = (low + high) // 2
        if query > keys[x]: 
            return searchdef(x + 1, high)
        else:  # Continue also when equal!
            return searchdef(low, x - 1)
            
    return searchdef(low, high)