I have a working text classification algorithm, but it returns only 1 item per prediction. I would like to know whether it is possible for it to return the 3 most likely items for each new entry and, if so, how.
Train:
def train(texts, marcas):
    """Fit a bag-of-words + TF-IDF + logistic-regression classifier.

    The texts are vectorized, split positionally (first 80% for training,
    remaining 20% for validation, no shuffling), the classifier is fitted,
    its validation accuracy is printed, the fitted model is pickled to
    ``train_data.pkl`` and ``predict`` is invoked with the fitted objects.

    Args:
        texts: iterable of raw text documents.
        marcas: labels aligned with ``texts``; must expose ``.tolist()``.

    Returns:
        None. Results are reported on stdout and the model is written to disk.
    """
    print("Training.....")
    tvect = TfidfTransformer()
    # Builds the vectorization (word positions) used for training and prediction.
    count_vect = CountVectorizer(stop_words=None)
    vect_count_text = count_vect.fit_transform(texts)
    # NOTE(review): both vectorizers are fitted on the full data set, i.e.
    # before the train/validation split below.
    vetoresDeTexto = tvect.fit_transform(vect_count_text)

    # Feature matrix X and label vector Y.
    X = vetoresDeTexto
    Y = np.array(marcas.tolist())

    # Fraction of the samples used for training; the rest is for validation.
    porcentagem_de_treino = 0.8
    tamanho_do_treino = int(porcentagem_de_treino * len(Y))

    # Training features and labels.
    treino_dados = X[0:tamanho_do_treino]
    treino_marcacoes = Y[0:tamanho_do_treino]

    # Validation features and labels.
    validacao_dados = X[tamanho_do_treino:]
    validacao_marcacoes = Y[tamanho_do_treino:]
    print("Validacao Marcacoes: ")
    print(validacao_marcacoes)

    # Author's note: MultinomialNB() reached 62% accuracy; GaussianNB() was
    # also considered.
    clf = LogisticRegression(class_weight=None)
    clf.fit(treino_dados, treino_marcacoes)

    # Mean accuracy on the held-out validation slice.
    accuracy = clf.score(validacao_dados, validacao_marcacoes)

    file_name = 'train_data.pkl'
    # Close the file deterministically so the model is fully flushed to disk
    # before predict() re-opens it below.
    with open(file_name, 'wb') as model_file:
        cPickle.dump(clf, model_file)

    print("Accuracy: ")
    print("%.2f " % round(accuracy * 100) + "%\n")
    print("End of train...")
    predict(file_name, tvect, count_vect, treino_marcacoes, clf, treino_dados)
My predict code:
# to predict
def predict(fit, tvect, count_vect, y_test, clf, treinoX):
    """Predict the label of a sample text and print the 3 most likely labels.

    Args:
        fit: path of the pickled classifier written by ``train``.
        tvect: fitted TF-IDF transformer used during training.
        count_vect: fitted count vectorizer used during training.
        y_test: unused; kept so existing callers keep working.
        clf: unused (the model is loaded from ``fit``); kept for compatibility.
        treinoX: unused; kept so existing callers keep working.

    Returns:
        None. The predicted label and the top-3 labels are printed.
    """
    print("\nPredict......")
    # New text to predict.
    newTextToPredict = ["Just a new text to predict"]
    # The model was fitted on TF-IDF weighted counts, so the new text must go
    # through both steps; feeding raw counts gives features on another scale.
    new = tvect.transform(count_vect.transform(newTextToPredict))

    # Load the trained model. Only unpickle files this program wrote itself:
    # pickle can execute arbitrary code when loading untrusted data.
    with open(fit, 'rb') as model_file:
        loaded_model = cPickle.load(model_file)

    # Predict the single most likely label for the new text.
    result = loaded_model.predict(new)

    # predict_proba returns one row per sample with one column per class, in
    # the order of loaded_model.classes_. Rank the only row in descending
    # order and keep the three most probable classes.
    probs = loaded_model.predict_proba(new)
    top3 = np.argsort(probs[0])[::-1][:3]

    print(result)
    print(loaded_model.classes_[top3])
Current output:
....
End of train...
Predict......
['Z000']
Is it possible to make this output show the 3 most likely results?
Edit:
I tried to use predict_proba, but I did not understand the results. The output follows:
print(probs)
Predict......
['Z000']
[[0.00472141 0.00468681 0.00545111 0.00473597 0.00742972 0.00459905
0.00472848 0.00471651 0.00830986 0.00472729 0.00537823 0.00539556
0.00463566 0.00469166 0.00473597 0.00469889 0.00473122 0.00510944
0.00475248 0.00475248 0.00472681 0.00465737 0.0046238 0.00538928
0.0053852 0.00469701 0.00470745 0.0052977 0.00468655 0.00472517
0.00601271 0.00540062 0.00471387 0.00471311 0.00471592 0.00468392
0.00470526 0.00454069 0.00467939 0.00471795 0.00706113 0.00475248
0.00470356 0.00451991 0.00473597 0.02389303 0.00472151 0.00475248
0.00573423 0.00469125 0.00471707 0.00450935 0.00458729 0.00607249
0.00556578 0.00661622 0.00747174 0.00528275 0.00469896 0.00527276
0.00537725 0.0046918 0.00472592 0.00523041 0.00466061 0.00523704
0.00535152 0.00471286 0.00456425 0.00473597 0.00466597 0.00475248
0.00471198 0.00470039 0.00545111 0.00473597 0.0059082 0.00471645
0.0050765 0.00536772 0.00469146 0.0047054 0.00583113 0.00556937
0.00530836 0.00724415 0.00499861 0.00469217 0.00471454 0.00456743
0.00473241 0.00468181 0.00545604 0.00471984 0.00466745 0.00606397
0.01230014 0.00467241 0.00472609 0.00541621 0.00473499 0.00468064
0.00472712 0.00470356 0.00497979 0.00453495 0.00469214 0.00668041
0.00528025 0.00468329 0.00777699 0.00468618 0.00537916 0.00455798
0.0046802 0.00468039 0.00534045 0.00466915 0.00521349 0.00465117
0.00466947 0.00688886 0.00460614 0.00648024 0.00469368 0.00456555
0.2215044 0.01841092 0.00594679 0.00467938 0.01121442 0.00537937
0.00468134 0.00472712 0.00470844 0.00470639 0.00580538 0.00535144
0.00473597 0.00465237 0.00577107 0.00539569 0.00472306 0.00538426
0.00472506]]
Try printing probs:
# Ask the classifier for its probability estimates for the new input and
# display them.
probs = clf.predict_proba(new)
print(probs)
You will get an array of probabilities, one per class, which sum to 1. You can then pick the indexes of the 3 highest-valued elements from this array.
# probs is 2-D (one row per sample), so index the first row, sort its column
# indexes by descending probability and keep the three highest.
top3_classes = np.argsort(probs[0])[::-1][:3]
These are the indexes of the top 3 class labels you need. You can then do:
# your_classes is a placeholder for the array of class labels; presumably the
# classifier's classes_ attribute (e.g. clf.classes_) - confirm that its order
# matches the columns of probs.
print(your_classes[top3_classes])
You now have the top 3 classes of the prediction.