Search code examples
python · nltk · sentiment-analysis · textblob

Python TextBlob translate issue


I am doing a quick sentiment analysis console application with Python, TextBlob and NLTK.

Currently I am using a link to a wiki article in Spanish, so I don't need to translate it and I can use the NLTK Spanish stopword list, but what if I wanted to make this code work for links in different languages?

If I use the line textFinal=textFinal.translate(to="es") below textFinal=TextBlob(texto) (code below), I get an error since it can't translate Spanish into Spanish.

Could I prevent this just by using a try/except? Is there a way to make the code translate to different languages (as well as use different stopword lists) depending on the language of the links I'm feeding to the application?

# Fetch a Spanish Wikipedia article, strip punctuation and stopwords, run
# TextBlob sentiment analysis on the cleaned text, and plot the 20 most
# frequent remaining words with NLTK.
import nltk
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from textblob import TextBlob, Word
import urllib.request
from bs4 import BeautifulSoup

response = urllib.request.urlopen('https://es.wikipedia.org/wiki/Valencia')
html = response.read()

soup = BeautifulSoup(html, 'html5lib')
text = soup.get_text(strip=True)


tokens = word_tokenize(text)
tokens = [w.lower() for w in tokens]

# One C-level pass to delete punctuation, then keep purely alphabetic tokens.
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
words = [word for word in stripped if word.isalpha()]

stop_words = set(stopwords.words('spanish'))

words = [w for w in words if w not in stop_words]

# Explicit UTF-8: Spanish accented words would raise UnicodeEncodeError on
# platforms whose default locale encoding is e.g. cp1252.
with open('palabras.txt', 'w', encoding='utf-8') as f:
    for word in words:
        f.write(" " + word)

with open('palabras.txt', 'r', encoding='utf-8') as myfile:
    texto = myfile.read().replace('\n', '')


textFinal = TextBlob(texto)

print(textFinal.sentiment)

freq = nltk.FreqDist(words)

freq.plot(20, cumulative=False)

Solution

  • Take a look at the package langdetect. You could check the language of the page you are feeding in and skip translation if the page language matches the translation language. Something like the following:

    # Detect the page language with langdetect and only translate when it
    # differs from the target language, avoiding the "can't translate
    # Spanish into Spanish" error from the question.
    import string
    import urllib.request

    import nltk
    from bs4 import BeautifulSoup
    from langdetect import DetectorFactory, detect
    from nltk import word_tokenize
    from nltk.corpus import stopwords
    from textblob import TextBlob, Word
    from textblob.exceptions import NotTranslated

    nltk.download("stopwords")
    # nltk.download("punkt")  # needed once for word_tokenize

    # langdetect is non-deterministic unless seeded; pin it so the same page
    # always yields the same detected language.
    DetectorFactory.seed = 0

    response = urllib.request.urlopen("https://es.wikipedia.org/wiki/Valencia")
    html = response.read()

    soup = BeautifulSoup(html, "html5lib")
    text = soup.get_text(strip=True)
    lang = detect(text)  # ISO 639-1 code, e.g. "es" for a Spanish page

    tokens = word_tokenize(text)
    tokens = [w.lower() for w in tokens]

    # Delete punctuation characters, then keep purely alphabetic tokens.
    table = str.maketrans("", "", string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]

    stop_words = set(stopwords.words("spanish"))

    words = [w for w in words if w not in stop_words]

    with open("palabras.txt", "w", encoding="utf-8") as f:
        for word in words:
            f.write(" " + word)

    with open("palabras.txt", "r", encoding="utf-8") as myfile:
        texto = myfile.read().replace("\n", "")


    textFinal = TextBlob(texto)

    # NOTE(review): TextBlob.translate is deprecated and removed in newer
    # textblob releases — confirm the installed version, or switch to a
    # dedicated translation library.
    translate_to = "es"
    if lang != translate_to:
        try:
            textFinal = textFinal.translate(to=translate_to)
        except NotTranslated:
            # Detection was wrong and the text is already in the target
            # language; keep the untranslated blob.
            pass

    print(textFinal.sentiment)

    freq = nltk.FreqDist(words)

    freq.plot(20, cumulative=False)