Search code examples
python numpy vectorization

Vectorizing a custom parsing function gives a ValueError


I've written a custom number parsing function. Basically, I want to convert app size information as it is given in the Google Play store (5.6M, 3M, 112K) to a standard float number.

To apply this function to a column of data in my data frame, I want to vectorize it using numpy.vectorize. However, when I test it, I get an error.

This is the function:

import numpy as np
import re

def parse_numbers(x, homo=False):
    """Convert an app-size string such as "5.6M" or "112K" to a float.

    Args:
        x: Text to parse. A parseable size is a decimal number followed by
            exactly one of ``M``/``m`` (times 1,000,000) or ``K``/``k``
            (times 1,000).
        homo: ``False`` (default) returns the converted number; ``True``
            returns the marker string ``"parsed_number"`` for parseable
            input instead of the number.

    Returns:
        The size as a float, or ``"parsed_number"`` when ``homo`` is true.
        Input that is not a parseable size is returned unchanged. Any other
        ``homo`` value yields the string
        ``"invalid setting for homo attribute"``.
    """
    # At most one decimal point and at least one digit, so everything that
    # matches is a valid argument for float(); "1.2.3M" or ".M" take the
    # "return the input unchanged" path instead of raising ValueError.
    size_pattern = r"^([0-9]+\.?[0-9]*|\.[0-9]+)([MmKk])$"

    # "==" rather than "is"/truthiness: keeps accepting 0/1 and NumPy booleans
    # and still routes every other value to the final branch.
    if homo == False:  # noqa: E712
        found = re.match(size_pattern, x)
        if found is None:
            return x
        digits, unit = found.groups()
        return float(digits) * (1000000 if unit in "Mm" else 1000)
    elif homo == True:  # noqa: E712
        return "parsed_number" if re.match(size_pattern, x) else x
    else:
        return "invalid setting for homo attribute"

As you can see, if it receives any input that cannot be parsed as a number, it returns the original input.

When I test this manually, it works fine: parse_numbers("3.1M") returns 3100000 and parse_numbers("not a number") returns not a number.

Now I try to vectorize the function, and I test it like this:

# Wrap the scalar parser so it can be called on a whole sequence at once.
vparse_numbers = np.vectorize(parse_numbers)

# Mixed input: two parseable sizes followed by one plain string.
sample_sizes = ["3.1M", "2k", "not a number"]
vparse_numbers(sample_sizes)

I get the following error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_27076/2263610390.py in <module>
     21 
     22 vparse_numbers = np.vectorize(parse_numbers)
---> 23 vparse_numbers(["3.0M", "2k", "not a number"])

c:\programdata\miniconda3\lib\site-packages\numpy\lib\function_base.py in __call__(self, *args, **kwargs)
   2161             vargs.extend([kwargs[_n] for _n in names])
   2162 
-> 2163         return self._vectorize_call(func=func, args=vargs)
   2164 
   2165     def _get_ufunc_and_otypes(self, func, args):

c:\programdata\miniconda3\lib\site-packages\numpy\lib\function_base.py in _vectorize_call(self, func, args)
   2247 
   2248             if ufunc.nout == 1:
-> 2249                 res = asanyarray(outputs, dtype=otypes[0])
   2250             else:
   2251                 res = tuple([asanyarray(x, dtype=t)

ValueError: could not convert string to float: 'not a number'

When I test it using only the parseable numbers, vparse_numbers(["3.1M", "2k"]), it does return an array of the correct numbers.

What am I missing here? Am I not using the numpy.vectorize function correctly?


Solution

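  • np.vectorize returns a numpy.array, which cannot mix different types of data (here strings and floats). It determines the output type from the result for the first element (a float), and then tries to convert every result to float, which fails for the string. You would have to return np.nan (NaN) instead of the unparseable strings.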

    import numpy as np
    import re
    import pandas as pd
    
    def parse_numbers(text, homo=False):
        """Convert an app-size string such as "5.6M" or "112K" to a float.

        Args:
            text: Text to parse. A parseable size is a decimal number followed
                by exactly one of ``M``/``m`` (times 1,000,000) or ``K``/``k``
                (times 1,000).
            homo: ``False`` (default) returns the converted number; ``True``
                returns the marker string ``"parsed_number"`` for parseable
                input instead of the number.

        Returns:
            The size as a float, or ``"parsed_number"`` when ``homo`` is true.
            Unparseable input yields ``np.nan`` so that numeric results stay
            floats.

        Raises:
            ValueError: If ``homo`` equals neither ``False`` nor ``True``.
        """
        # At most one decimal point and at least one digit, so everything that
        # matches is a valid argument for float(); "1.2.3M" or ".M" become NaN
        # instead of raising ValueError.
        size_pattern = r"^([0-9]+\.?[0-9]*|\.[0-9]+)([MmKk])$"

        # "==" rather than "is"/truthiness: keeps accepting 0/1 and NumPy
        # booleans and still rejects every other value below.
        if homo == False:  # noqa: E712
            found = re.match(size_pattern, text)
            if found is None:
                # np.nan, not np.NaN: the capitalised alias was removed in
                # NumPy 2.0.
                return np.nan
            digits, unit = found.groups()
            return float(digits) * (1_000_000 if unit in "Mm" else 1_000)
        elif homo == True:  # noqa: E712
            return "parsed_number" if re.match(size_pattern, text) else np.nan
        else:
            raise ValueError(f"invalid setting for homo attribute: {homo}")
    
    # Wrap the scalar parser so it can be called on a whole sequence at once.
    vparse_numbers = np.vectorize(parse_numbers)

    # Mixed input: two parseable sizes followed by one plain string.
    sample_sizes = ["3.1M", "2k", "not a number"]
    vparse_numbers(sample_sizes)
    

    But if you are working with a DataFrame, you should use .apply() instead, which can return different types of data.

    import numpy as np
    import re
    import pandas as pd
    
    def parse_numbers(text, homo=False):
        """Convert an app-size string such as "5.6M" or "112K" to a float.

        Args:
            text: Text to parse. A parseable size is a decimal number followed
                by exactly one of ``M``/``m`` (times 1,000,000) or ``K``/``k``
                (times 1,000).
            homo: ``False`` (default) returns the converted number; ``True``
                returns the marker string ``"parsed_number"`` for parseable
                input instead of the number.

        Returns:
            The size as a float, or ``"parsed_number"`` when ``homo`` is true.
            Input that is not a parseable size is returned unchanged.

        Raises:
            ValueError: If ``homo`` equals neither ``False`` nor ``True``.
        """
        # At most one decimal point and at least one digit, so everything that
        # matches is a valid argument for float(); "1.2.3M" or ".M" are handed
        # back unchanged instead of raising ValueError.
        size_pattern = r"^([0-9]+\.?[0-9]*|\.[0-9]+)([MmKk])$"

        # "==" rather than "is"/truthiness: keeps accepting 0/1 and NumPy
        # booleans and still rejects every other value below.
        if homo == False:  # noqa: E712
            found = re.match(size_pattern, text)
            if found is None:
                # Leave unparseable input untouched.
                return text
            digits, unit = found.groups()
            return float(digits) * (1_000_000 if unit in "Mm" else 1_000)
        elif homo == True:  # noqa: E712
            return "parsed_number" if re.match(size_pattern, text) else text
        else:
            raise ValueError(f"invalid setting for homo attribute: {homo}")
    
    # One column of raw size strings, including one value that cannot be parsed.
    raw_sizes = ["3.1M", "2k", "not a number"]
    df = pd.DataFrame({"test": raw_sizes})

    # Element-wise conversion of the column with the scalar parser.
    df["result"] = df["test"].apply(parse_numbers)

    print(df)
    

    Result:

               test        result
    0          3.1M     3100000.0
    1            2k        2000.0
    2  not a number  not a number
    

    EDIT:

    I would write parse_numbers a little differently:

    • with re in only one line.
    • making the unit suffix optional ({,1}, i.e. zero or one occurrence) so that plain strings like "123" are also converted to numbers
    • converting to integer at the end
    • using name parse_number without s because it parses only one number.
    import re
    import pandas as pd
    
    
    def parse_number(text, homo=False):
        """Convert a size string such as "3.1M", "2k" or "123" to an integer.

        Args:
            text: Text to parse. A parseable size is a decimal number,
                optionally followed by one of ``M``/``m`` (times 1,000,000)
                or ``K``/``k`` (times 1,000).
            homo: ``False`` (default) returns the converted number; ``True``
                returns the marker string ``"parsed_number"`` for parseable
                input instead of the number.

        Returns:
            The size rounded to the nearest integer, or ``"parsed_number"``
            when ``homo`` is true. Input that is not a parseable size is
            returned unchanged.

        Raises:
            TypeError: If ``homo`` is not a ``bool``.
        """
        if not isinstance(homo, bool):
            raise TypeError(f"invalid setting for homo attribute: {homo}")

        # At most one decimal point and at least one digit, so everything that
        # matches is a valid argument for float(); "1.2.3" or "." are handed
        # back unchanged instead of raising ValueError.
        found = re.match(r"^([0-9]+\.?[0-9]*|\.[0-9]+)([MmKk]?)$", text)
        if found is None:
            # Leave unparseable input untouched.
            return text
        if homo:
            return "parsed_number"

        number, unit = found.groups()
        # An empty suffix means the value is already in base units.
        multiplier = {"m": 1_000_000, "k": 1_000}.get(unit.lower(), 1)
        # round(), not int(): float("1.005") * 1000 is 1004.9999999999999,
        # which int() would truncate to 1004.
        return round(float(number) * multiplier)
    
            
    # Raw size strings: with suffix, without suffix, and one unparseable value.
    raw_sizes = ["3.1M", "2k", "123", "not a number"]
    df = pd.DataFrame({"test": raw_sizes})

    # Numeric conversion of every entry.
    df["result"] = df["test"].apply(parse_number)
    # Same column again, this time only flagging which entries are parseable.
    df["homo"] = df["test"].apply(lambda value: parse_number(value, True))

    print(df)
    

    Result:

               test        result           homo
    0          3.1M       3100000  parsed_number
    1            2k          2000  parsed_number
    2           123           123  parsed_number
    3  not a number  not a number   not a number