I'm trying to remove punctuation from every row of a column. All of the rows contain string data. I tried a couple of regular expressions, but none of them worked. Can anyone tell me what is wrong with this code?
# Walk a hard-coded range of 3847 integers and clean one row per iteration.
for i in range(0, 3847):
#Remove punctuation
# NOTE(review): the traceback below passes through Series.__getitem__ and an
# Int64 hash-table lookup, so dataset['abstract1'][i] appears to be resolved
# as an index label rather than a position -- presumably it fails when some
# i is not a label in the index; confirm against the full error message.
# As shown, `text` is rebound on every pass and not written back to dataset.
text = re.sub(r'[^\w\s]','',dataset['abstract1'][i])
This is the error I got:
4 #Remove punctuations
----> 5 text = re.sub('[^\w\s]','',dataset['abstract1'][i])
6
7 #Convert to lowercase
G:\Anaconda3\lib\site-packages\pandas\core\series.py in
__getitem__(self, key)
866 key = com.apply_if_callable(key, self)
867 try:
--> 868 result = self.index.get_value(self, key)
869
870 if not is_scalar(result):
G:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
4373 try:
4374 return self._engine.get_value(s, k,
-> 4375 tz=getattr(series.dtype, 'tz', None))
4376 except KeyError as e1:
4377 if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.Int64HashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.Int64HashTable.get_item()
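If you're working with a pandas.DataFrame object, you can avoid the for-loop altogether. The loop most likely fails because dataset['abstract1'][i] looks rows up by index label, and not every integer in range(0, 3847) is necessarily a label in your index. Instead, use pandas.Series.str.replace to remove the punctuation from the whole column at once.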
# Toy frame: a single text column whose values are littered with punctuation
dataset = pd.DataFrame(
    data={
        'abstract1': [
            'so,me p#nct*!&io* issues',
            '!@#hfd87***}}|',
            't&e%s$t@',
        ],
    },
)
abstract1
0 so,me p#nct*!&io* issues
1 !@#hfd87***}}|
2 t&e%s$t@
# Drop every character that is neither a word character nor whitespace,
# for the whole column in one vectorized call.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace
# defaults to regex=False, which would treat the pattern as a literal
# string and leave all punctuation in place.
dataset['punct_removed'] = dataset['abstract1'].str.replace(r'[^\w\s]', '', regex=True)
abstract1 punct_removed
0 so,me p#nct*!&io* issues some pnctio issues
1 !@#hfd87***}}| hfd87
2 t&e%s$t@ test