Search code examples
python pandas data-cleaning

How to properly remove links from tweets?


I got this:

def non_ascii(s):
    """Return ``s`` with every character whose code point is 128 or higher removed."""
    kept = [ch for ch in s if ord(ch) < 128]
    return "".join(kept)

def lower(text):
    """Return the lowercase form of ``text`` (delegates to ``text.lower()``)."""
    lowered = text.lower()
    return lowered

def clean_links(text):
    """Remove substrings that match the link pattern below from ``text``.

    The pattern requires ``https://`` followed by exactly one character,
    a dot, exactly one more character, another dot, and then two or three
    word characters (for example ``https://a.b.com``). Each bracketed class
    matches a single character, not a run of characters, so longer host
    names and ``http://`` links are left untouched.

    Returns the text with every match replaced by the empty string.
    """
    # Raw string: in a normal literal "\w" is an invalid escape sequence,
    # which newer Python versions warn about. The pattern value is unchanged.
    link_re = re.compile(r'http[s]{1}://[\w+][.]{1}[\w+][.]{1}[\w]{2,3}')
    return link_re.sub(r'', text)

def clean_html(text):
    """Strip every ``<...>`` span (shortest match up to the next ``>``) from ``text``."""
    return re.sub('<.*?>', r'', text)

def punct(text):
    """Tokenize ``text`` into word-character runs and rejoin them with single spaces.

    The tokens come from ``RegexpTokenizer`` built with the word pattern
    below; whatever the tokenizer does not return is absent from the result.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    return " ".join(tokenizer.tokenize(text))

Then later I call these functions like:

# NOTE(review): every line below reads from data['tweet'] and overwrites
# data['cleaned'], so only the result of the last call (punct) is kept.
data['cleaned'] = data['tweet'].apply(non_ascii)
data['cleaned'] = data['tweet'].apply(lower)
data['cleaned'] = data['tweet'].apply(clean_links)
data['cleaned'] = data['tweet'].apply(clean_html)
data['cleaned'] = data['tweet'].apply(punct)

The problem is that the links are still in the data['cleaned'] column, and I need those pesky links erased!

The original tweets are in data['tweet'].

Please help me fix this, or show your own way of removing links.

Links still in the data look like:

https t co OR1IkVzzgO

Solution

  • The second function (and every function after it) has to be run on data['cleaned'], not on data['tweet']

    # Only the first step reads the raw data['tweet'] column; each later step
    # reads data['cleaned'], i.e. the output of the step before it.
    data['cleaned'] = data['tweet'].apply(non_ascii)
    data['cleaned'] = data['cleaned'].apply(lower)
    data['cleaned'] = data['cleaned'].apply(clean_links)
    data['cleaned'] = data['cleaned'].apply(clean_html)
    data['cleaned'] = data['cleaned'].apply(punct)
    

    OR you should chain it

    # Single expression: each apply() works on the Series returned by the
    # previous apply(), starting from the raw data['tweet'] column.
    data['cleaned'] = (
        data['tweet']
        .apply(non_ascii)
        .apply(lower)
        .apply(clean_links)
        .apply(clean_html)
        .apply(punct)
    )
    

    OR you should put all functions in one function and run apply() only once

    def clean(text):
        """Run ``text`` through all five cleaning steps and return the result.

        Steps, in order: non_ascii, lower, clean_links, clean_html, punct.
        Each step receives the output of the one before it.
        """
        for step in (non_ascii, lower, clean_links, clean_html, punct):
            text = step(text)
        return text
    
    # apply() is called once; clean() performs all of the steps on each value.
    data['cleaned'] = data['tweet'].apply(clean)
    

    EDIT:

    Instead of text.lower() you can use str.lower(text), so you don't have to create your own lower() function.

    Your regex doesn't match the links, so I used something a little better: 'http(s)?://\w+(\.\w+){1,}(/\w+)*'. It may not work with more complex links, though; for those you should use the regex suggested in the comments.

    Stack Overflow doesn't allow t.co links in code, so the example below writes the link as https:// t.co/ OR1IkVzzgO; you have to remove the spaces from the link before running the code.


    Minimal working code with example data

    import re
    import nltk.tokenize
    import pandas as pd
    
    def non_ascii(s):
        """Drop every character of ``s`` whose code point is not below 128."""
        return "".join(filter(lambda ch: ord(ch) < 128, s))
    
    def clean_links(text):
        """Remove http/https links from ``text`` and return the result.

        A link is ``http://`` or ``https://``, then a host made of two or
        more dot-separated runs of word characters, then any number of
        ``/segment`` parts made of word characters. Whatever follows that
        (a trailing slash, a query string, a hyphenated segment) is not part
        of the match and stays in the text.
        """
        # Raw string: in a normal literal "\w" and "\." are invalid escape
        # sequences, which newer Python versions warn about. The pattern
        # value itself is unchanged.
        link_re = re.compile(r'http(s)?://\w+(\.\w+){1,}(/\w+)*')
        return link_re.sub(r'', text)
    
    def clean_html(text):
        """Strip every ``<...>`` span (shortest match up to the next ``>``) from ``text``."""
        return re.sub('<.*?>', r'', text)
    
    def punct(text):
        """Tokenize ``text`` into word-character runs and rejoin them with single spaces.

        The tokens come from ``nltk.tokenize.RegexpTokenizer`` built with the
        word pattern below; whatever the tokenizer does not return is absent
        from the result.
        """
        tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        words = tokenizer.tokenize(text)
        return " ".join(words)
    
    def clean(text):
        """Run ``text`` through the full cleaning pipeline and return the result.

        Steps, in order: non_ascii, str.lower, clean_links, clean_html, punct.
        """
        ascii_only = non_ascii(text)
        lowered = str.lower(ascii_only)
        without_links = clean_links(lowered)
        without_tags = clean_html(without_links)
        return punct(without_tags)
    
    # --- main ---
    
    # One-row example frame. The string must sit on a single line: a
    # single-quoted literal cannot contain a line break.
    # The t.co link is written with spaces on purpose; remove them to get a
    # real link that clean_links() can match.
    data = pd.DataFrame({
        'tweet': ['Example https://stackoverflow.com/ . And <tag>other</tag> line https:// t.co/ OR1IkVzzgO. Any question?']
    })

    # Run the whole cleaning pipeline once per tweet.
    data['cleaned'] = data['tweet'].apply(clean)

    # to_string() prints the frame without truncating the columns.
    print(data.to_string())
    

    EDIT:

    A more universal version, which takes the list of functions as arguments

    def clean(text, *functions):
        """Apply each callable in ``functions`` to ``text``, in order.

        Every callable receives the value returned by the previous one.
        With no callables, ``text`` is returned unchanged.
        """
        result = text
        for transform in functions:
            result = transform(result)
        return result
    
    # apply() forwards `args` as extra positional arguments after each value,
    # so every tweet is processed as clean(tweet, non_ascii, str.lower, ...).
    # Series.apply documents `args` as a tuple, so a tuple is passed here.
    data['cleaned'] = data['tweet'].apply(clean, args=(non_ascii, str.lower, clean_links, clean_html, punct))