I would like to remove Italian stop words using this function, but I don't know how to do it. I have seen several scripts that remove stop words, but always after tokenization. Is it possible to do it before? I mean, I would like the text to have its stop words removed before tokenization. For the stop words I used this library: stop-words
! pip install stop-words
from stop_words import get_stop_words
# Italian stop words as provided by the stop_words package.
stop = get_stop_words('italian')
import re
def processTweet(tweet, stopwords=None):
    """Clean a raw tweet and return the normalized text.

    The steps run in a fixed order: HTML entities, @mentions and $tickers are
    removed, the text is lowercased, then hyperlinks, hashtags, tokens that
    contain digits and basic punctuation are removed, optional stop words are
    dropped, words of one to three characters are dropped, whitespace is
    collapsed and characters outside the Basic Multilingual Plane are removed.

    Args:
        tweet: The raw tweet text (a ``str``).
        stopwords: Optional iterable of words to delete before the text is
            tokenized. Matching is case-insensitive (the tweet is already
            lowercased at that point) and only whole words are removed, so a
            stop word such as ``'a'`` does not eat the letters of ``'casa'``.
            When omitted or empty, no stop-word filtering is done.

    Returns:
        The cleaned, lowercased tweet with no leading or trailing whitespace.
    """
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    # Remove @username mentions
    tweet = re.sub(r'@\S+', '', tweet)
    # Remove tickers (e.g. $TSLA)
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks. The match stops at the first whitespace so that text
    # following a URL (or lying between two URLs) is preserved.
    tweet = re.sub(r'https?://\S+', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # Replace leftover mentions, '#' signs, URLs of any scheme, tokens that
    # contain a digit and the punctuation ,;.?!: with a space, then normalize
    # all whitespace runs to single spaces via split/join.
    tweet = ' '.join(
        re.sub(
            r'(@[A-Za-z0-9]+)|(#)|(\w+:\/\/\S+)|(\S*\d\S*)|([,;.?!:])',
            ' ',
            tweet,
        ).split()
    )
    # Optionally remove stop words before any tokenization takes place.
    if stopwords:
        # Longest first so that a longer stop word wins over a shorter prefix.
        words = sorted(
            {word.lower() for word in stopwords if word},
            key=lambda word: (-len(word), word),
        )
        if words:
            # The lookarounds restrict matches to whole words; re.escape keeps
            # any regex metacharacters in a stop word literal.
            pattern = (
                r'(?<!\w)(?:' + '|'.join(map(re.escape, words)) + r')(?!\w)'
            )
            tweet = re.sub(pattern, '', tweet)
    # Remove words with 3 or fewer characters
    tweet = re.sub(r'\b\w{1,3}\b', '', tweet)
    # Collapse runs of whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Remove whitespace left at either end by the removals above
    tweet = tweet.strip()
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    tweet = ''.join(c for c in tweet if c <= '\uFFFF')
    return tweet
# Run processTweet on every entry of the 'text' column and store the result
# back in the same column.
# NOTE(review): df is defined elsewhere — presumably a pandas DataFrame; confirm.
df['text'] = df['text'].apply(processTweet)
Just use re.sub(), as you have already been doing:
# Build one alternation of all stop words. Each word is escaped so it is
# matched literally, and the \b anchors restrict matches to whole words;
# without them a stop word like 'a' would also be deleted from inside
# longer words such as 'casa'.
exclusions = r'\b(?:' + '|'.join(map(re.escape, stop)) + r')\b'
tweet = re.sub(exclusions, '', tweet)