I've written a custom number parsing function. Basically, I want to convert app size information as it is given in the Google Play store (5.6M, 3M, 112K) to a standard float number.
To apply this function to a column of data in my data frame, I want to vectorize it using numpy.vectorize. However, when I test it, I get an error.
This is the function:
import numpy as np
import re
def parse_numbers(x, homo=False):
    """Convert a Play-Store style size string such as "5.6M" or "112K" to a float.

    Args:
        x: Text to parse, e.g. ``"3.1M"`` or ``"2k"``.
        homo: When false (the default) return the parsed value; when true
            return the marker string ``"parsed_number"`` for every input
            that can be parsed.

    Returns:
        The size as a float (an ``M``/``m`` suffix scales by 1,000,000 and a
        ``K``/``k`` suffix by 1,000), ``"parsed_number"`` in ``homo`` mode,
        ``x`` itself when it cannot be parsed, or the string
        ``"invalid setting for homo attribute"`` when ``homo`` is neither
        true nor false.
    """
    if homo not in (True, False):
        return "invalid setting for homo attribute"

    # The numeral must be well formed ("3", "3.", "3.1", ".5"). A looser
    # "[0-9.]+" would also accept "1.2.3" or ".", which float() rejects with
    # a ValueError instead of falling through to "return the input".
    match = re.match(r"^([0-9]+(?:\.[0-9]*)?|\.[0-9]+)([MmKk])$", x)
    if match is None:
        return x
    if homo:
        return "parsed_number"

    digits, suffix = match.groups()
    factor = 1_000_000 if suffix in "Mm" else 1_000
    return float(digits) * factor
As you can see, if it receives any input that cannot be parsed as a number, it returns the original input.
When I test this manually, it works fine: parse_numbers("3.1M") returns 3100000.0, and parse_numbers("not a number") returns "not a number".
Now I try to vectorize the function, and I test it like this:
# Wrap the scalar parser so it can be called on a whole sequence at once,
# then exercise it with a mix of parseable and unparseable inputs.
vparse_numbers = np.vectorize(parse_numbers)
sample_sizes = ["3.1M", "2k", "not a number"]
vparse_numbers(sample_sizes)
I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_27076/2263610390.py in <module>
21
22 vparse_numbers = np.vectorize(parse_numbers)
---> 23 vparse_numbers(["3.0M", "2k", "not a number"])
c:\programdata\miniconda3\lib\site-packages\numpy\lib\function_base.py in __call__(self, *args, **kwargs)
2161 vargs.extend([kwargs[_n] for _n in names])
2162
-> 2163 return self._vectorize_call(func=func, args=vargs)
2164
2165 def _get_ufunc_and_otypes(self, func, args):
c:\programdata\miniconda3\lib\site-packages\numpy\lib\function_base.py in _vectorize_call(self, func, args)
2247
2248 if ufunc.nout == 1:
-> 2249 res = asanyarray(outputs, dtype=otypes[0])
2250 else:
2251 res = tuple([asanyarray(x, dtype=t)
ValueError: could not convert string to float: 'not a number'
When I test it using only the parseable numbers, vparse_numbers(["3.1M", "2k"]),
it does return the correct numbers to me.
What am I missing here? Am I not using the numpy.vectorize function correctly?
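np.vectorize infers the output type from the first result (a float here) and creates a numpy.array of that type. Such an array can't mix different types of data (strings and floats), so it tries to convert every result to float and fails on "not a number".
To keep using np.vectorize you would have to return np.nan (spelled np.NaN before NumPy 2.0, which removed that alias) instead of the unparseable strings.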
import numpy as np
import re
import pandas as pd
def parse_numbers(text, homo=False):
    """Convert a size string such as "5.6M" or "112K" to a float, or NaN.

    Unparseable input yields ``np.nan`` rather than the original string, so
    that every result of the default mode is a float and the function can
    be wrapped with ``np.vectorize``.

    Args:
        text: Text to parse, e.g. ``"3.1M"`` or ``"2k"``.
        homo: When false (the default) return the parsed value; when true
            return the marker string ``"parsed_number"`` for every input
            that can be parsed.

    Returns:
        The size as a float (an ``M``/``m`` suffix scales by 1,000,000 and a
        ``K``/``k`` suffix by 1,000), ``"parsed_number"`` in ``homo`` mode,
        or ``np.nan`` when ``text`` cannot be parsed.

    Raises:
        ValueError: If ``homo`` is neither true nor false.
    """
    if homo not in (True, False):
        raise ValueError(f"invalid setting for homo attribute: {homo}")

    # The numeral must be well formed ("3", "3.", "3.1", ".5"). A looser
    # "[0-9.]+" would also accept "1.2.3" or ".", which float() rejects with
    # a ValueError instead of producing NaN.
    match = re.match(r"^([0-9]+(?:\.[0-9]*)?|\.[0-9]+)([MmKk])$", text)
    if match is None:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    if homo:
        return "parsed_number"

    digits, suffix = match.groups()
    factor = 1_000_000 if suffix in "Mm" else 1_000
    return float(digits) * factor
# Vectorise the scalar parser, then run it over a small sample list.
vparse_numbers = np.vectorize(parse_numbers)
sample_sizes = ["3.1M", "2k", "not a number"]
vparse_numbers(sample_sizes)
But if you want to apply the function to a DataFrame column, you should use .apply() instead,
because it can return different types of data.
import numpy as np
import re
import pandas as pd
def parse_numbers(text, homo=False):
    """Convert a size string such as "5.6M" or "112K" to a float.

    Meant for element-wise use (e.g. ``Series.apply``), where the results
    may mix floats and strings: anything that cannot be parsed is returned
    unchanged. Return ``np.nan`` instead if a purely numeric result is
    wanted.

    Args:
        text: Value to parse, e.g. ``"3.1M"`` or ``"2k"``.
        homo: When false (the default) return the parsed value; when true
            return the marker string ``"parsed_number"`` for every input
            that can be parsed.

    Returns:
        The size as a float (an ``M``/``m`` suffix scales by 1,000,000 and a
        ``K``/``k`` suffix by 1,000), ``"parsed_number"`` in ``homo`` mode,
        or ``text`` itself when it cannot be parsed.

    Raises:
        ValueError: If ``homo`` is neither true nor false.
    """
    if homo not in (True, False):
        raise ValueError(f"invalid setting for homo attribute: {homo}")

    # Non-string cells (for example NaN for a missing value) cannot be fed
    # to the regex; pass them through like any other unparseable input.
    if not isinstance(text, str):
        return text

    # The numeral must be well formed ("3", "3.", "3.1", ".5"). A looser
    # "[0-9.]+" would also accept "1.2.3" or ".", which float() rejects with
    # a ValueError instead of falling through to "return the input".
    match = re.match(r"^([0-9]+(?:\.[0-9]*)?|\.[0-9]+)([MmKk])$", text)
    if match is None:
        return text
    if homo:
        return "parsed_number"

    digits, suffix = match.groups()
    factor = 1_000_000 if suffix in "Mm" else 1_000
    return float(digits) * factor
# Small demonstration frame: parse every entry of the "test" column and
# store the outcome next to it.
sample_sizes = ["3.1M", "2k", "not a number"]
df = pd.DataFrame({"test": sample_sizes})
df["result"] = df["test"].apply(parse_numbers)
print(df)
Result:
test result
0 3.1M 3100000.0
1 2k 2000.0
2 not a number not a number
EDIT:
I would write parse_numbers a little differently:
- use re only once, in a single line,
- use {,1} so that plain strings like "123" are also converted to a number,
- convert the result to an integer at the end,
- name it parse_number, without the s, because it parses only one number.
import re
import pandas as pd
def parse_number(text, homo=False):
    """Convert a size string such as "5.6M", "112K" or "123" to an integer.

    Args:
        text: Value to parse. The ``M``/``m`` (x 1,000,000) or ``K``/``k``
            (x 1,000) suffix is optional, so plain numerals are converted
            too.
        homo: When false (the default) return the parsed value; when true
            return the marker string ``"parsed_number"`` for every input
            that can be parsed.

    Returns:
        The size as an ``int`` (any remaining fraction is truncated),
        ``"parsed_number"`` in ``homo`` mode, or ``text`` itself when it
        cannot be parsed.

    Raises:
        ValueError: If ``homo`` is not a ``bool``.
    """
    # Imported locally so the function brings its own dependency with it.
    from decimal import Decimal

    if not isinstance(homo, bool):
        raise ValueError(f"invalid setting for homo attribute: {homo}")

    # Non-string cells (for example NaN for a missing value) cannot be fed
    # to the regex; pass them through like any other unparseable input.
    if not isinstance(text, str):
        return text

    # Well-formed numeral followed by an optional suffix ("?" is the same
    # as "{,1}"). Requiring a proper numeral keeps inputs such as "1.2.3"
    # or "." on the "return the input" path instead of raising.
    match = re.match(r"^([0-9]+(?:\.[0-9]*)?|\.[0-9]+)([MmKk]?)$", text)
    if match is None:
        return text
    if homo:
        return "parsed_number"

    number, suffix = match.groups()
    multiplier = {"m": 1_000_000, "k": 1_000}.get(suffix.lower(), 1)
    # Decimal keeps the scaling exact. With binary floats "1.001K" becomes
    # 1000.9999999999999, which int() would truncate to 1000.
    return int(Decimal(number) * multiplier)
# Demonstration frame: one column with the parsed value and one with the
# marker produced in ``homo`` mode.
sample_sizes = ["3.1M", "2k", "123", "not a number"]
df = pd.DataFrame({"test": sample_sizes})
df["result"] = df["test"].apply(parse_number)
df["homo"] = df["test"].apply(lambda value: parse_number(value, True))
print(df)
Result:
test result homo
0 3.1M 3100000 parsed_number
1 2k 2000 parsed_number
2 123 123 parsed_number
3 not a number not a number not a number