Search code examples
python machine-learning text nlp logistic-regression

Text Classification with Python


Hi, I am new to the Python programming language. Based on various references, I have built a text classification model using logistic regression. Below is the code.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np
import string

import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


# Load the training sheet and the sheet to be classified.
# NOTE(review): both reads open the same workbook ("test.xlsx"); the first
# call passes no sheet_name, so pandas reads the first sheet by default, while
# the second reads the sheet named 'Test' — confirm this is intended.
Train = pd.read_excel("/Desktop/ML Based Text classification/test.xlsx")
real = pd.read_excel("/Desktop/ML Based Text classification/test.xlsx", sheet_name = 'Test')
# Free-text column of each sheet.
Train_data = Train['description']
Test_data = real['description']

# English stop-word list and Porter stemmer used by the text-cleaning helpers.
stop = stopwords.words('english')
porter = PorterStemmer()

def remove_stopwords(text):
    """Lower-case *text* and drop every word found in the ``stop`` list.

    The text is split on whitespace; the surviving words are returned
    joined by single spaces.
    """
    kept = []
    for word in text.split():
        lowered = word.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept)
def stemmer(stem_text):
    """Return *stem_text* with every whitespace-separated word Porter-stemmed."""
    return " ".join(map(porter.stem, stem_text.split()))

def clean_data(data):
    """Normalise a pandas Series of raw text for vectorisation.

    Steps, in order: strip punctuation, strip digit runs, lower-case and
    remove stop words (``remove_stopwords``), stem each word (``stemmer``),
    and cast the result to ``str``.

    Args:
        data: pandas Series of strings.

    Returns:
        A new Series of cleaned strings (same index as *data*).
    """
    # regex=True is required: since pandas 2.0 ``Series.str.replace`` treats
    # the pattern as a literal string by default, which would leave
    # punctuation and digits untouched. Raw strings avoid the invalid
    # escape-sequence warnings the non-raw patterns trigger.
    text_clean = (data.str.replace(r'[^\w\s]', '', regex=True)
                  .str.replace(r'\d+', '', regex=True)
                  .apply(remove_stopwords)
                  .apply(stemmer)
                  .astype(str))
    return text_clean
# Clean the training descriptions before they are vectorised.
Train_data = clean_data(Train_data)

# Number the (up to) 50 most frequent tags 0, 1, 2, ... with the most
# frequent tag first.
counter = Counter(Train['tags'].tolist())
top_10_varieties = {
    tag: position
    for position, (tag, _count) in enumerate(counter.most_common(50))
}
# Tags without an entry in the dict map to NaN.
Train['Mapping'] = Train['tags'].map(top_10_varieties)
# e.g. top_10_varieties = {'Outlook Related Issue': 0, 'Password Reset': 1, 'VPN Issue': 2}


# NOTE(review): these two instances are never used in the visible code — the
# pipeline below builds its own TfidfVectorizer and LogisticRegression.
tfidf_converter = TfidfVectorizer()
model_log = LogisticRegression()

# Features are the description texts; targets are the integer tag codes.
X = Train_data
Y = Train['Mapping']
# NOTE(review): test_size=0.95 holds out 95% of the rows, so the model is
# fitted on only 5% of the data — confirm this is intended (0.05 or 0.2 is
# more typical).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.95, random_state = 0)

# TF-IDF features feeding a logistic-regression classifier. Despite the
# variable name, no support-vector classifier is involved.
svc = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf',LogisticRegression()),
               ])

# Fit the vectorizer and the classifier on the training split.
svc.fit(X_train, y_train)

# Predict on the held-out split.
# NOTE(review): ytest is not used afterwards in the visible code, and this
# y_pred is overwritten below before it is compared with the true labels,
# so the hold-out set is never actually evaluated.
ytest = np.array(y_test)
y_pred = svc.predict(X_test)

# Clean the descriptions from the 'Test' sheet the same way as the training
# text and predict their integer tag codes; from here on y_pred refers to
# these predictions.
Test_data = clean_data(Test_data)
y_pred = svc.predict(Test_data)

This code runs without errors. When I print `y_pred`, I get the following output:

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 2, 1, 2, 0, 2, 2, 2, 1, 0, 1,
       1, 2, 1, 2, 0, 0, 2, 2, 1, 0, 0, 2, 0, 0, 0], dtype=int64)

I am not sure how to convert these numeric codes back to the corresponding tag strings and attach them to my raw data. I want an output like this:

enter image description here


Solution

  • Please try:

    # Invert the tag -> code mapping: position in the frequency ranking -> tag.
    reverse_top_10_varieties = {
        code: tag for code, (tag, _count) in enumerate(counter.most_common(50))
    }
    # Look up the tag string for every predicted code.
    [reverse_top_10_varieties[code] for code in y_pred]
    

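    The first line builds the inverse mapping (numeric code → tag string), and the second looks up the tag for each predicted code. See if this solves your problem.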