I use the following class in Python to preprocess a string before passing it to a machine learning classification model for predicting its sentiment.
I use regex for most of the transformations, along with some libraries such as emoji and tweet-preprocessor. The code works fine, but I believe it is slow.
Do you have any suggestions on how to improve its speed?
Example of usage:
string = "I am very happy with @easyjet #happy customer 🙂. Second sentence"
preprocessor = TextPreprocessing()
result = preprocessor.text_preprocessor(string)
The result will be: ["i am very happy with happy smiling face", "second sentence", "i am very happy with happy smiling face second sentence"]
import re
import preprocessor as p  # this is the tweet-preprocessor library
import emoji
import os
import numpy as np
import pandas as pd
from unidecode import unidecode  # needed by _remove_unicode
class TextPreprocessing:
    def __init__(self):
        p.set_options(p.OPT.MENTION, p.OPT.URL)

    # remove punctuation
    def _punctuation(self, val):
        val = re.sub(r'[^\w\s]', ' ', val)
        val = re.sub('_', ' ', val)
        return val

    # remove extra whitespace
    def _whitespace(self, val):
        return " ".join(val.split())

    # remove numbers
    def _removenumbers(self, val):
        val = re.sub('[0-9]+', '', val)
        return val

    # remove unicode
    def _remove_unicode(self, val):
        val = unidecode(val).encode("ascii")
        val = str(val, "ascii")
        return val

    # split string into sentences
    def _split_to_sentences(self, body_text):
        sentences = re.split(
            r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", body_text)
        return sentences

    # cleaning function that combines all of the above functions
    def _clean_text(self, val):
        val = val.lower()
        val = self._removenumbers(val)
        val = p.clean(val)
        val = ' '.join(self._punctuation(emoji.demojize(val)).split())
        val = self._remove_unicode(val)
        val = self._whitespace(val)
        return val
    def text_preprocessor(self, body_text):
        body_text_df = pd.DataFrame({"body_text": body_text}, index=[1])

        sentence_split_df = body_text_df.copy()
        sentence_split_df["body_text"] = sentence_split_df["body_text"].apply(
            self._split_to_sentences)

        lst_col = "body_text"
        sentence_split_df = pd.DataFrame(
            {
                col: np.repeat(
                    sentence_split_df[col].values, sentence_split_df[lst_col].str.len()
                )
                for col in sentence_split_df.columns.drop(lst_col)
            }
        ).assign(**{lst_col: np.concatenate(sentence_split_df[lst_col].values)})[
            sentence_split_df.columns
        ]

        final_df = (
            pd.concat([sentence_split_df, body_text_df])
            .reset_index()
            .drop(columns=["index"])
        )

        final_df["body_text"] = final_df["body_text"].apply(self._clean_text)
        return final_df["body_text"]
This question might be relevant to all those Data Scientists who want to move their NLP models into production.
Since I cannot comment, I will try to answer your question (to some extent):
import timeit
from functools import partial
...

if __name__ == "__main__":
    # http://25.io/toau/audio/sample.txt
    with open("sample.txt") as f:
        text = f.read()

    tp = TextPreprocessing()
    print(min(timeit.Timer(partial(tp.text_preprocessor, text)).repeat(repeat=10, number=1)))
You can also use timeit on specific methods to check for bottlenecks.
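For instance, a minimal sketch for timing only the cleaning step in isolation, assuming tp and text are already set up as in the snippet above:

# time only _clean_text to see how much of the total runtime it accounts for,
# assuming `tp` and `text` are defined as in the previous snippet
print(min(timeit.Timer(partial(tp._clean_text, text)).repeat(repeat=10, number=1)))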
Sadly, I could not run your code sample because np was undefined (in the np.repeat and np.concatenate calls), so I cannot test my assumptions. Also, you did not provide sample data.
Some general thoughts:
- Use re.compile() to compile all of your regular expressions (see the sketch below).
- .copy() operations are expensive, try to get rid of them.
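For the first point, here is a rough sketch of what precompiling could look like; it is only an illustration of the idea, not a drop-in replacement for your class, and the attribute names (_punct_re, _underscore_re, _digits_re) are made up for the sketch. The unidecode and DataFrame parts are omitted.

import re
import emoji
import preprocessor as p  # tweet-preprocessor

class TextPreprocessing:
    def __init__(self):
        p.set_options(p.OPT.MENTION, p.OPT.URL)
        # compile each pattern once instead of passing the raw pattern to re.sub on every call
        self._punct_re = re.compile(r"[^\w\s]")
        self._underscore_re = re.compile("_")
        self._digits_re = re.compile("[0-9]+")

    def _clean_text(self, val):
        val = val.lower()
        val = self._digits_re.sub("", val)        # remove numbers
        val = p.clean(val)                        # strip mentions and URLs
        val = emoji.demojize(val)                 # 🙂 -> :slightly_smiling_face:
        val = self._punct_re.sub(" ", val)        # remove punctuation
        val = self._underscore_re.sub(" ", val)   # split demojized names into words
        return " ".join(val.split())              # collapse repeated whitespace

The second point applies to text_preprocessor: the sentence_split_df = body_text_df.copy() could be avoided by building the exploded DataFrame directly from the list of sentences instead of copying the original frame first.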