python scikit-learn pipeline sentiment-analysis feature-extraction

How to add a feature using a pipeline and FeatureUnion

In the code below I use a Twitter dataset to perform sentiment analysis. I use a pipeline which performs the following steps:

1) performs some basic text preprocessing

2) vectorizes the tweet text

3) adds an extra feature (text length)

4) classification

I would like to add one more feature: the scaled number of followers. I wrote a function that takes the whole dataframe (df) as input and returns a new dataframe with the number of followers scaled. However, I am finding it challenging to add this step to the pipeline, i.e. to combine this feature with the other features using the sklearn pipeline.

Any help or advice on this problem will be much appreciated.

The question and code below are inspired by Ryan's post: pipelines


import re

import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.svm import SVC

def import_data(filename, sep, eng, header=None, skiprows=1):
    """Read a delimited tweet file and label its three columns.

    Args:
        filename: Path or file-like object handed to ``pd.read_csv``.
        sep: Field separator.
        eng: Parser engine name forwarded as ``engine``.
        header: Row to use as header; ``None`` means the file has none.
        skiprows: Rows to skip at the top of the file.

    Returns:
        A dataframe whose columns are renamed to
        ``text``, ``followers`` and ``sentiment``.
    """
    read_options = {
        'sep': sep,
        'engine': eng,
        'header': header,
        'skiprows': skiprows,
    }
    frame = pd.read_csv(filename, **read_options)
    # The file is expected to hold exactly three columns, in this order.
    frame.columns = ['text', 'followers', 'sentiment']
    return frame

# Load the tab-separated tweet file using the 'python' parser engine.
df = import_data('apple_v3.txt','\t','python')
# Only the tweet text is used as model input here; 'followers' is not part of X.
X, y = df.text, df.sentiment
# No test_size / random_state is passed, so the split uses the library defaults.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# NLTK TweetTokenizer configured with preserve_case=False and reduce_len=True.
tokenizer = nltk.casual.TweetTokenizer(preserve_case=False, reduce_len=True)
# Count vectorizer that delegates tokenization to the tokenizer above.
count_vect = CountVectorizer(tokenizer=tokenizer.tokenize) 
# Final estimator used as the last step of the sentiment pipeline.
classifier = LogisticRegression()

def get_scalled_followers(df):
    """Return a copy of *df* whose ``followers`` column is min-max scaled.

    The input dataframe is left untouched: the scaling is applied to a copy,
    so calling this function does not silently rewrite the caller's data.

    Args:
        df: Dataframe containing a numeric ``followers`` column.

    Returns:
        A new dataframe with the same columns as *df*, where ``followers``
        has been cast to float and transformed by a freshly fitted,
        default-configured ``MinMaxScaler``.
    """
    scaled = df.copy()
    scaler = MinMaxScaler()
    # Cast first so the scaler always receives (and the column holds) floats.
    scaled[['followers']] = scaled[['followers']].astype(float)
    scaled[['followers']] = scaler.fit_transform(scaled[['followers']])
    return scaled

def get_tweet_length(text):
    """Return the length of *text* as reported by ``len``."""
    n_items = len(text)
    return n_items
import numpy as np

def genericize_mentions(text):
    """Replace every @-mention in *text* with the token 'thisisanatmention'."""
    mention_pattern = r'@[\w_-]+'
    placeholder = 'thisisanatmention'
    return re.sub(mention_pattern, placeholder, text)

def reshape_a_feature_column(series):
    """Turn a 1-D sequence into a column array of shape ``(len(series), 1)``."""
    n_rows = len(series)
    values = np.asarray(series)
    return values.reshape((n_rows, 1))

def pipelinize_feature(function, active=True):
    """Wrap a per-item feature function as a scikit-learn transformer.

    Args:
        function: Callable applied to each element of the input; its result
            becomes that element's value in a single feature column.
        active: When false, the transformer emits a column of zeros instead
            of calling *function*.

    Returns:
        A ``FunctionTransformer`` that maps a list or series to an array of
        shape ``(len(input), 1)``, usable as a ``FeatureUnion`` member.
    """
    def list_comprehend_a_function(list_or_series, active=True):
        if active:
            processed = [function(i) for i in list_or_series]
            processed = reshape_a_feature_column(processed)
            return processed

        else:
            return reshape_a_feature_column(np.zeros(len(list_or_series)))

    # Without this return the factory yields None and the pipeline step that
    # uses it is not a transformer at all. `active` is forwarded via kw_args
    # because the inner function has its own `active` parameter.
    return FunctionTransformer(
        list_comprehend_a_function,
        validate=False,
        kw_args={'active': active},
    )

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn_helpers import pipelinize, genericize_mentions, train_test_and_evaluate


# Three-stage pipeline: rewrite @-mentions, build the feature matrix, classify.
sentiment_pipeline = Pipeline([
        # NOTE: `pipelinize` and `genericize_mentions` here are the names imported
        # from sklearn_helpers just above; that import rebinds the
        # `genericize_mentions` defined earlier in this file.
        ('genericize_mentions', pipelinize(genericize_mentions, active=True)),
        # Combine the token counts with the extra tweet-length feature.
        ('features', FeatureUnion([
                    ('vectorizer', count_vect),
                    ('post_length', pipelinize_feature(get_tweet_length, active=True))
                ])),
        ('classifier', classifier)
    ])

# train_test_and_evaluate comes from sklearn_helpers; it returns a pipeline and a
# confusion matrix, which rebind the names below.
sentiment_pipeline, confusion_matrix = train_test_and_evaluate(sentiment_pipeline, X_train, y_train, X_test, y_test)

Solution

The best explanation I have found so far is at the following post: pipelines

My data includes heterogeneous features, and the following step-by-step approach works well and is easy to understand:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

# Step 1 - select the input columns from the dataframe and split it into train and test sets.

# Every column except the target is a model input.
features= [c for c in df.columns.values if c  not in ['sentiment']]
# Columns that are neither the raw text nor the target.
# NOTE(review): numeric_features is not referenced again in the code shown here.
numeric_features= [c for c in df.columns.values if c  not in ['text','sentiment']]
target = 'sentiment'

# X_* keep their column names (they are dataframes), which the selector classes rely on.
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.33, random_state=42)

# Step 2 - create a number selector class and a text selector class. These classes select specific columns from the dataframe.

class NumberSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects a single column and keeps the result two-dimensional."""

    def __init__(self, key):
        # Label of the column to pull out of the input.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by a one-element list holding ``key``."""
        # Indexing with a list (rather than a bare label) is what keeps the
        # selection as a one-column table instead of a single column.
        selected_columns = [self.key]
        return X[selected_columns]

class TextSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects a single column by its label."""

    def __init__(self, key):
        # Label of the column to pull out of the input.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the bare ``key`` label."""
        column_label = self.key
        return X[column_label]

#step 3 create one pipeline for the text data and one for the numerical data


# Text branch: pick the tweet text column and turn it into TF-IDF features.
# The dataframe's columns are 'text', 'followers' and 'sentiment', so the
# selector key must be 'text' (a 'content' column does not exist).
text = Pipeline([
                ('selector', TextSelector(key='text')),
                ('tfidf', TfidfVectorizer( stop_words='english'))
            ])

# Sanity check: fit the branch on its own and inspect the output.
text.fit_transform(X_train)

# Numeric branch: pick the followers column (kept 2-D by NumberSelector) and
# rescale it. The step is named 'standard' but the scaler is a MinMaxScaler.
followers =  Pipeline([
                ('selector', NumberSelector(key='followers')),
                ('standard', MinMaxScaler())
            ])

# Sanity check: fit the branch on its own and inspect the output.
followers.fit_transform(X_train)

# Step 4 - feature union: place the outputs of both branches side by side.

# NOTE: the second member is registered under the name 'length' although it is
# the followers pipeline.
feats = FeatureUnion([('text', text), 
                      ('length', followers)])

# Wrap the union in a pipeline and run it once on the training data.
feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

# Step 5 - add the classifier, fit on the training set and predict on the test set.

pipeline = Pipeline([
    ('features',feats),
    # probability=True is set so that predict_proba can be called on the fitted pipeline.
    ('classifier', SVC(kernel = 'linear', probability=True, C=1, class_weight = 'balanced'))
])

pipeline.fit(X_train, y_train)

preds = pipeline.predict(X_test)
# Fraction of test predictions equal to the true label; the value is not stored.
np.mean(preds == y_test)

# Step 6 - use the fitted model to predict new data that is not part of the test set.
# In this example the pipeline expects a dataframe as input, with a column
# called 'text' and a column called 'followers'.

# One new observation: tweet text and follower count.
array = [["@apple is amazing",25000]]
dfObj = pd.DataFrame(array,columns = ['text' , 'followers']) 

# Print the predicted class, e.g. positive or negative sentiment.
print(pipeline.predict(dfObj))

# Print the predicted probability for each class.
print(pipeline.predict_proba(dfObj))