How to properly remove links from tweets?

I got this:

def non_ascii(s):
    """Return `s` with every character whose code point is 128 or above removed."""
    kept = [ch for ch in s if ord(ch) < 128]
    return "".join(kept)

def lower(text):
    """Return a lowercased copy of `text` (delegates to `text.lower()`)."""
    lowered = text.lower()
    return lowered

def clean_links(text):
    """Delete every substring of `text` matched by the link pattern below.

    The pattern requires the literal prefix ``https://`` followed by one
    word character (or ``+``), a dot, one more such character, a dot, and
    two or three word characters.
    """
    return re.sub('http[s]{1}://[\w+][.]{1}[\w+][.]{1}[\w]{2,3}', r'', text)

def clean_html(text):
    """Strip everything that looks like an HTML tag from `text`.

    A "tag" is the shortest run that starts at ``<`` and ends at the next
    ``>`` without crossing a line break; the text between tags is kept.
    """
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, r'', text)

def punct(text):
    """Tokenize `text` with the word-character pattern and re-join the tokens with single spaces."""
    words = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(words)

Then later I call these functions like:

# NOTE: every call below reads from data['tweet'], so each assignment
# replaces the previous result and only the last step ends up in 'cleaned'.
data['cleaned'] = data['tweet'].apply(non_ascii)
data['cleaned'] = data['tweet'].apply(lower)
data['cleaned'] = data['tweet'].apply(clean_links)
data['cleaned'] = data['tweet'].apply(clean_html)
data['cleaned'] = data['tweet'].apply(punct)

The problem is that the links are still present in the data['cleaned'] column; I need those pesky links removed!

The original tweets are in data['tweet'].

Please suggest a fix, or show your own way of removing links.

Links still in the data look like:

https t co OR1IkVzzgO

Solution

The second function (and every function after it) has to be run on data['cleaned'], not on data['tweet']:

# The first step reads the raw tweets; every later step reads the column it just wrote.
data['cleaned'] = data['tweet'].apply(non_ascii)
for step in (lower, clean_links, clean_html, punct):
    data['cleaned'] = data['cleaned'].apply(step)

Or you can chain the calls:

# Same pipeline as one expression: the result of each apply() feeds the next.
data['cleaned'] = (
    data['tweet']
    .apply(non_ascii)
    .apply(lower)
    .apply(clean_links)
    .apply(clean_html)
    .apply(punct)
)

Or you can put all the steps in a single function and call apply() only once:

def clean(text):
    """Run the full cleaning pipeline on one tweet and return the result.

    The steps are applied innermost-first: non_ascii, lower, clean_links,
    clean_html, punct.
    """
    return punct(clean_html(clean_links(lower(non_ascii(text)))))

# A single apply() call passes each tweet through clean().
data['cleaned'] = data['tweet'].apply(clean)

EDIT:

Instead of wrapping text.lower() in your own lower() function, you can use str.lower(text) directly.

Your regex doesn't match the links, so I used something a little better: 'http(s)?://\w+(\.\w+){1,}(/\w+)*'. It may still not work with more complex links, in which case you should use the regex suggested in the comments.

Stack Overflow doesn't allow https:// t.co/ OR1IkVzzgO to be used in code, so you have to remove the spaces from the link yourself.

Minimal working code with example data

import re
import nltk.tokenize
import pandas as pd

def non_ascii(s):
    """Return `s` without any character whose code point is 128 or higher."""
    ascii_only = [ch for ch in s if ord(ch) < 128]
    return "".join(ascii_only)

def clean_links(text):
    """Remove simple http/https links from `text`.

    The pattern matches ``http://`` or ``https://``, a host made of at least
    two dot-separated runs of word characters, and any number of
    ``/segment`` path parts made of word characters.  Links outside that
    shape (hyphens in the host, query strings, a trailing slash) are matched
    only partially or not at all.

    Returns the text with every match deleted.
    """
    # Raw string: in a plain literal "\w" and "\." are invalid escape
    # sequences, which newer Python versions warn about at compile time.
    link_pattern = re.compile(r'http(s)?://\w+(\.\w+){1,}(/\w+)*')
    return link_pattern.sub('', text)

def clean_html(text):
    """Remove tag-like spans from `text`, keeping the text between them.

    A span runs from ``<`` to the nearest following ``>`` and cannot cross a
    line break.
    """
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, r'', text)

def punct(text):
    """Tokenize `text` with the word-character pattern and re-join the tokens with single spaces."""
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    return " ".join(tokenizer.tokenize(text))

def clean(text):
    """Run the full cleaning pipeline on one tweet and return the result.

    The steps are applied innermost-first: non_ascii, str.lower,
    clean_links, clean_html, punct.
    """
    return punct(clean_html(clean_links(str.lower(non_ascii(text)))))

# --- main ---

# Example input: a plain link, an HTML tag pair and a t.co short link.
# NOTE: the t.co link is written with spaces here; remove them to get a real link.
# The literal is split into two adjacent strings so that it stays one valid
# single-line-quoted string (a quoted string cannot contain a raw line break).
data = pd.DataFrame({
    'tweet': [
        'Example https://stackoverflow.com/ . And <tag>other</tag> line '
        'https:// t.co/ OR1IkVzzgO. Any question?',
    ]
})

# Pass each tweet through the whole cleaning pipeline.
data['cleaned'] = data['tweet'].apply(clean)

print(data.to_string())

EDIT:

A more universal version that takes the list of functions as arguments:

def clean(text, *functions):
    """Pass `text` through each callable in `functions`, left to right.

    Each callable receives the previous callable's return value; with no
    callables given, `text` is returned unchanged.
    """
    result = text
    for transform in functions:
        result = transform(result)
    return result

# `args` supplies the extra positional arguments for clean(), i.e. the
# cleaning steps in the order they should run.  pandas documents `args` as
# a tuple, so a tuple is passed rather than a list.
data['cleaned'] = data['tweet'].apply(
    clean,
    args=(non_ascii, str.lower, clean_links, clean_html, punct),
)