I have a dataframe that contains a name column in the format "last name, first name initial", as shown below. When present, the initial is always the last token and is exactly one letter.
How can I remove the initial? I know `re` may be able to handle this task, but I am not sure what the pattern would look like. Any advice would be great.
import pandas as pd
# Sample data: "last name, first name" with an optional trailing one-letter
# initial. The last entry has a full second name and no initial.
d = {
    'name': [
        'Smith, John',
        'Smith, Johnson W',
        'Smith, Joan D',
        'Smith, Joan X',
        'Smith, Brian Borbinson',  # closing quote was missing in the original
    ],
}
df = pd.DataFrame(data=d)
import pandas as pd
import re
# Example frame with a single 'name' column: some entries end in a
# one-letter initial, others do not.
d = {
    'name': [
        'Smith, John',
        'Smith, Johnson W',
        'Smith, Joan D',
        'Smith, Joan X',
        'Smith, Brian Borbinson',
    ],
}
df = pd.DataFrame(d)
def remove_second_name_initial(string):
    """Return a "Surname, Given names" string without a trailing initial.

    The name is expected in the form ``"Surname, Given [Initial]"``. If the
    last space-separated token of the given-name part is a single character
    and is preceded by at least one other given name, that token is removed.
    Anything else is returned unchanged.

    Args:
        string: The name to clean. Non-string values (e.g. ``None`` or NaN
            from a DataFrame column) are returned as-is.

    Returns:
        The name with the trailing one-letter initial stripped, or the
        original value when there is nothing to strip.
    """
    # Missing cells in a DataFrame column are not strings; pass them through.
    if not isinstance(string, str):
        return string

    # partition keeps everything after the first ', ' together, so extra
    # commas later in the name are not silently dropped.
    surname, separator, given_names = string.partition(', ')
    if not separator:
        # No "Surname, Given" structure: nothing we can safely remove.
        return string

    # Look at the LAST token, since the initial always comes at the end.
    leading_names, _, last_token = given_names.rpartition(' ')
    leading_names = leading_names.rstrip()

    # Require a preceding given name so a lone one-letter first name
    # (e.g. "Smith, J") is kept rather than erased.
    if leading_names and len(last_token) == 1:
        return surname + ', ' + leading_names

    # Full second name (e.g. "Smith, Brian Borbinson") or no second name.
    return string
# Clean each row's 'name' value; the result is a new Series and df is not modified.
df.apply(lambda row: remove_second_name_initial(row['name']), axis='columns')
0 Smith, John
1 Smith, Johnson
2 Smith, Joan
3 Smith, Joan
4 Smith, Brian Borbinson
dtype: object