I finally find myself in need of your guidance and support, as I can't figure out what my error is in the following piece of code.
I understand that "list index out of range" is raised when a counter is initialized improperly relative to the length of df, but what I am attempting is to return the first ten rows of the Descripción column
as a sample (doc) on which to apply the NLTK stopwords analysis.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
pd.set_option('display.max_columns', None)
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('spanish')
from nltk.stem import WordNetLemmatizer
import string
import base64
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import sklearn.feature_extraction.stop_words
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import spacy
spacy.load('es_core_news_sm')
from spacy.lang.es import Spanish
parser = Spanish()
df = pd.read_csv('geografia_empleos_MX.csv')
df.head(2)
del df['Unnamed: 0']
df.head(1)
df.isnull().sum()
df1 = df.copy()
df1['fraudulento'].value_counts()
import spacy
nlp = spacy.load('es_core_news_lg')
stopwords = stopwords.words('spanish')
punctuations = string.punctuation
def limpia_texto(docs, logging=False):
    texts = []
    counter = 1
    for doc in docs:
        if counter % 100 == 0 and logging:
            print('Procesados: {} de {} documentos'.format(counter, len(docs)))
        counter += 1
        doc = nlp(doc, disable=['parser', 'ner'])
        tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
        tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
        tokens = ' '.join(tokens)
        texts.append(tokens)
    return pd.Series(texts)
Falso_1 = [text for text in df1[df1['fraudulento'] == 1]['Descripción']]
Falso_1[10]  # Here is where the index error is raised :(
Falso_1 does not contain an element at index 10 (it has fewer than 11 items), which is why it raises an IndexError. That list comprehension is just collecting the values of the Descripción column from your dataframe:
Falso_1 = [text for text in df1[df1['fraudulento'] == 1]['Descripción']]
You should replace it with the more pandas-like:
Falso_1 = df1.loc[df1['fraudulento'] == 1, 'Descripción'].to_numpy()
Falso_1.shape
will give you the number of elements it contains, so you can check whether index 10 actually exists before accessing it.
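If what you actually want is the first ten descriptions as a sample for the stopword analysis, slicing avoids the IndexError entirely, because a slice simply returns fewer items when not enough are available. A minimal sketch, assuming the df1, 'fraudulento' and 'Descripción' names and the limpia_texto function from your code (muestra is just an illustrative variable name):

# Select the fraudulent descriptions as a NumPy array
Falso_1 = df1.loc[df1['fraudulento'] == 1, 'Descripción'].to_numpy()
print(Falso_1.shape)  # how many descriptions matched the filter

# Take at most the first ten; a slice never goes out of bounds
muestra = Falso_1[:10]

# Clean the sample with your own function before the stopword analysis
doc = limpia_texto(muestra, logging=True)
print(doc)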