Search code examples
pythonpandastwitter

Delete rows that contain no tweet text in pandas


I'm trying to remove the rows whose tweet text column is blank or contains only whitespace. I have tried different approaches, such as counting the rows that contain only whitespace, or counting leading and trailing spaces, but I have not been able to turn any of them into a criterion for eliminating those rows.

ID    tweet           WhiteSpaceCount HaveWhiteSpace
0    this is a text        0              False
1                         0              False
2   Hello im fine         0              False

I want to delete all the rows that don't have any information in the tweet column.

Code here:

def extractAndSave(api, name):
    """Search recent tweets for *name* and return yesterday's non-blank ones.

    Args:
        api: Client object exposing ``search(...)``. When ``None``, the
            module-level ``API_EXTRACTOR`` is used instead.
        name: Search term; ``"-filter:retweets"`` is appended to it.

    Returns:
        A ``pandas.DataFrame`` with one row per tweet created on the previous
        calendar day whose text is not empty/whitespace-only. Columns:
        ``TWEETS, ID, FECHA_CREACION, USUARIO, FUENTE, ME_GUSTA, RETWEETS,
        CITADO, IDIOMA, LOCACION, pass/fail, CONTEO_ESPACIOS,
        HORA_PUBLICACION, DIA_SEMANA``. The original row labels are kept.
    """
    previous_date = date.today() - timedelta(days=1)

    # Use the client handed in by the caller; fall back to the module-level
    # client only when none is given so existing calls keep working.
    client = api if api is not None else API_EXTRACTOR

    # NOTE(review): the query has no space before "-filter:retweets"; the
    # usual form of the operator is "<term> -filter:retweets" - confirm.
    tweets = client.search(q=name + "-filter:retweets", result_type='recent', timeout=999999, count=200,
                           end_time=previous_date, tweet_mode='extended')

    # Build every column in a single pass over the results.
    columns = ['TWEETS', 'ID', 'FECHA_CREACION', 'USUARIO', 'FUENTE',
               'ME_GUSTA', 'RETWEETS', 'CITADO', 'IDIOMA', 'LOCACION']
    records = [
        {
            'TWEETS': tweet.full_text,
            'ID': tweet.id,
            'FECHA_CREACION': tweet.created_at,
            'USUARIO': tweet.user.screen_name,
            'FUENTE': tweet.source,
            'ME_GUSTA': tweet.favorite_count,
            # Previously also labelled 'ME_GUSTA', which produced two
            # identically named columns.
            'RETWEETS': tweet.retweet_count,
            'CITADO': tweet.is_quote_status,
            'IDIOMA': tweet.lang,
            'LOCACION': tweet.user.location,
        }
        for tweet in tweets
    ]
    datos = pd.DataFrame(records, columns=columns)

    # A tweet carries no information when its text is missing, empty, or
    # made only of whitespace.
    texto = datos['TWEETS'].astype(str)
    sin_texto = datos['TWEETS'].isna() | texto.str.fullmatch(r"\s*").astype(bool)
    datos['pass/fail'] = np.where(sin_texto, 'FAIL', 'PASS')

    # Number of fetched tweets with a leading or trailing space; the same
    # total is written to every row.
    datos['CONTEO_ESPACIOS'] = (texto.str.startswith(" ").astype(bool)
                                | texto.str.endswith(" ").astype(bool)).sum()

    # Convert first so the .dt accessor also works on an empty result set.
    datos['FECHA_CREACION'] = pd.to_datetime(datos['FECHA_CREACION'])

    # Publication hour and weekday name.
    datos['HORA_PUBLICACION'] = datos['FECHA_CREACION'].dt.hour
    datos['DIA_SEMANA'] = datos['FECHA_CREACION'].dt.day_name()

    # Keep only tweets from the previous day, and drop every row whose
    # tweet text is blank.
    datos['FECHA_CREACION'] = datos['FECHA_CREACION'].dt.date
    datos = datos[~sin_texto & (datos['FECHA_CREACION'] == previous_date)].copy()

    print(datos)

    return datos

Solution

  • Instead of removing rows that you don't need, keep only the ones you do need:

    df = df[df["tweet"].str.strip().str.len()>0]
    
    >>> df
    
       ID           tweet  WhiteSpaceCount  HaveWhiteSpace
    0   0  this is a text                0           False
    2   2   Hello im fine                0           False