I got this:
def non_ascii(s):
    """Return ``s`` with every character whose code point is 128 or above removed."""
    kept = [ch for ch in s if ord(ch) < 128]
    return "".join(kept)
def lower(text):
    """Return a lower-cased copy of ``text`` by delegating to ``text.lower()``."""
    lowered = text.lower()
    return lowered
def clean_links(text):
    """Return ``text`` with every substring matched by the link pattern removed."""
    # Raw string literal: the pattern contains "\w", which is an invalid
    # escape sequence in a normal string literal and triggers a warning on
    # recent Python versions. The regex itself is unchanged.
    #
    # NOTE: "[\w+]" is a character class that matches exactly ONE character
    # (a word character or a literal "+"), and "http[s]{1}" makes the "s"
    # mandatory. The pattern therefore only matches links shaped like
    # "https://a.b.com" and leaves ordinary links such as
    # "https://t.co/xyz" untouched.
    link_re = re.compile(r'http[s]{1}://[\w+][.]{1}[\w+][.]{1}[\w]{2,3}')
    return link_re.sub('', text)
def clean_html(text):
    """Strip everything that looks like an HTML tag (``<...>``) from ``text``."""
    # Non-greedy match so each tag is removed on its own and the text
    # between tags is kept.
    return re.sub('<.*?>', r'', text)
def punct(text):
    """Return the word tokens of ``text`` joined by single spaces."""
    # Tokenising on runs of word characters drops punctuation; the tokens
    # are then re-joined with one space between them.
    words = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(words)
Then later I call these functions like:
# Every statement below reads from data['tweet'] again and overwrites
# data['cleaned'], so no step sees the output of the previous one. After the
# last line the column holds only punct() applied to the raw tweets.
data['cleaned'] = data['tweet'].apply(non_ascii)
data['cleaned'] = data['tweet'].apply(lower)
data['cleaned'] = data['tweet'].apply(clean_links)
data['cleaned'] = data['tweet'].apply(clean_html)
data['cleaned'] = data['tweet'].apply(punct)
The problem is that the links are still present in the data['cleaned'] column; I need those pesky links erased!
The original tweets are in data['tweet'].
Please help, or share your own way of removing links.
Links still in the data look like:
https t co OR1IkVzzgO
The second function (and every function after it) has to be run on data['cleaned'], not on data['tweet']; otherwise each call overwrites the result of the previous one:
# Only the first step reads the raw tweets; each later step reads and
# rewrites data['cleaned'], so the transformations accumulate.
data['cleaned'] = data['tweet'].apply(non_ascii)
data['cleaned'] = data['cleaned'].apply(lower)
data['cleaned'] = data['cleaned'].apply(clean_links)
data['cleaned'] = data['cleaned'].apply(clean_html)
data['cleaned'] = data['cleaned'].apply(punct)
OR you should chain it
# Each .apply() works on the result of the previous one, so the steps run
# in order on the same values without intermediate assignments.
data['cleaned'] = data['tweet'].apply(non_ascii).apply(lower).apply(clean_links).apply(clean_html).apply(punct)
OR you can put all the functions into one function and run apply() only once:
def clean(text):
    """Run the whole cleaning pipeline on one text and return the result.

    The steps are applied innermost-first: non_ascii, lower, clean_links,
    clean_html, punct.
    """
    return punct(clean_html(clean_links(lower(non_ascii(text)))))
# A single pass over the column: clean() runs every step for each tweet.
data['cleaned'] = data['tweet'].apply(clean)
EDIT:
Instead of text.lower() you can use str.lower(text), so you don't have to create your own lower() function.
Your regex doesn't match the links, so I used something a little better: 'http(s)?://\w+(\.\w+){1,}(/\w+)*'
- but it may not work with more complex links, and you should use the regex suggested in the comments.
Stack Overflow doesn't allow https:// t.co/ OR1IkVzzgO to be used in code, so the link is written with spaces here; you have to remove the spaces from the link.
Minimal working code with example data
import re
import nltk.tokenize
import pandas as pd
def non_ascii(s):
    """Drop non-ASCII characters: keep only characters with ``ord`` below 128."""
    return "".join(filter(lambda ch: ord(ch) < 128, s))
def clean_links(text):
    """Return ``text`` with http/https links removed.

    A link is matched as ``http://`` or ``https://`` followed by a dotted
    host name and optional ``/segment`` path parts made of word characters.
    """
    # Raw string literal: "\w" and "\." are invalid escape sequences in a
    # normal string literal and trigger a warning on recent Python versions.
    # The regex itself is unchanged.
    #
    # NOTE: only word characters are accepted in host labels and path
    # segments, so a trailing "/", a query string ("?q=1") or anything after
    # a hyphen is left behind in the text.
    link_re = re.compile(r'http(s)?://\w+(\.\w+){1,}(/\w+)*')
    return link_re.sub('', text)
def clean_html(text):
    """Return ``text`` with every ``<...>`` tag removed."""
    # Non-greedy so that "<a>x</a>" loses both tags but keeps "x".
    tag_pattern = re.compile('<.*?>')
    stripped = tag_pattern.sub('', text)
    return stripped
def punct(text):
    """Return the word tokens of ``text`` joined by single spaces."""
    # Only runs of word characters survive tokenisation, so punctuation
    # is dropped.
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    return " ".join(tokenizer.tokenize(text))
def clean(text):
    """Run the whole cleaning pipeline on one text and return the result.

    Steps, in order: non_ascii, str.lower, clean_links, clean_html, punct.
    """
    for step in (non_ascii, str.lower, clean_links, clean_html, punct):
        text = step(text)
    return text
# --- main ---

# Example frame with one tweet that contains two links and an HTML tag.
# The string literal must stay on a single logical line: a single-quoted
# string cannot contain a raw line break, so it is written here as two
# adjacent literals that Python concatenates.
data = pd.DataFrame({
    'tweet': [
        'Example https://stackoverflow.com/ . And <tag>other</tag> line '
        'https:// t.co/ OR1IkVzzgO. Any question?',
    ],
})

# Run the complete pipeline once per tweet and keep the result next to the
# original text.
data['cleaned'] = data['tweet'].apply(clean)

print(data.to_string())
EDIT:
A more universal version, which takes a list of functions:
def clean(text, *functions):
    """Apply each callable in ``functions`` to ``text`` in order.

    The output of one callable is the input of the next. With no callables
    given, ``text`` is returned unchanged.
    """
    result = text
    for step in functions:
        result = step(result)
    return result
# The entries of `args` are passed to clean() as extra positional arguments
# after each tweet, so they arrive as `functions` and run in the listed order.
data['cleaned'] = data['tweet'].apply(clean, args=[non_ascii, str.lower, clean_links, clean_html, punct])