I am doing a quick sentiment analysis console application with Python, TextBlob and NLTK.
Currently I am using a link to a Wikipedia article in Spanish, so I don't need to translate it and I can use the NLTK Spanish stopword list — but what if I wanted to make this code work for links in other languages?
If I add the line textFinal=textFinal.translate(to="es")
below textFinal=TextBlob(texto)
(code below), I get an error, since TextBlob cannot translate Spanish into Spanish.
Could I prevent this just by using a try/except? Is there a way to make the code translate to a given target language (and use the matching stopword list) depending on the language of the links I'm feeding to the application?
import nltk
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from textblob import TextBlob, Word
import urllib.request
from bs4 import BeautifulSoup

# Fetch the article and strip the HTML down to its visible text.
response = urllib.request.urlopen('https://es.wikipedia.org/wiki/Valencia')
html = response.read()
soup = BeautifulSoup(html, 'html5lib')
text = soup.get_text(strip=True)

# Tokenize, lowercase, strip punctuation, and keep alphabetic tokens only.
tokens = word_tokenize(text)
tokens = [w.lower() for w in tokens]
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
words = [word for word in stripped if word.isalpha()]

# Remove Spanish stopwords (idiomatic 'not in' instead of 'not w in ...').
stop_words = set(stopwords.words('spanish'))
words = [w for w in words if w not in stop_words]

# Explicit UTF-8 avoids UnicodeEncodeError on platforms whose default
# encoding (e.g. cp1252 on Windows) cannot represent accented Spanish text.
# A single join + write replaces one f.write call per word.
with open('palabras.txt', 'w', encoding='utf-8') as f:
    f.write(''.join(' ' + word for word in words))

with open('palabras.txt', 'r', encoding='utf-8') as myfile:
    texto = myfile.read().replace('\n', '')

# Score the cleaned text and show the 20 most frequent words.
textFinal = TextBlob(texto)
print(textFinal.sentiment)
freq = nltk.FreqDist(words)
freq.plot(20, cumulative=False)
Take a look at the package langdetect. You could check the language of the page you are feeding in and skip translation if the page language matches the translation language. Something like the following:
import string
import urllib.request

import nltk
from bs4 import BeautifulSoup
from langdetect import detect
from nltk import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob, Word

nltk.download("stopwords")
nltk.download("punkt")  # required by word_tokenize

# Target language for sentiment analysis; pages detected in any other
# language get translated to this before scoring.
TRANSLATE_TO = "es"

# langdetect returns ISO 639-1 codes, but NLTK keys its stopword lists by
# full language name — map between them (fall back to Spanish below).
ISO_TO_NLTK = {
    "ar": "arabic", "da": "danish", "de": "german", "en": "english",
    "es": "spanish", "fi": "finnish", "fr": "french", "hu": "hungarian",
    "it": "italian", "nl": "dutch", "no": "norwegian", "pt": "portuguese",
    "ro": "romanian", "ru": "russian", "sv": "swedish", "tr": "turkish",
}

# Fetch the article and strip the HTML down to its visible text.
response = urllib.request.urlopen("https://es.wikipedia.org/wiki/Valencia")
html = response.read()
soup = BeautifulSoup(html, "html5lib")
text = soup.get_text(strip=True)

# Detect the page language once, up front, so BOTH the stopword list and
# the translation decision can depend on it.
lang = detect(text)

# Tokenize, lowercase, strip punctuation, and keep alphabetic tokens only.
tokens = word_tokenize(text)
tokens = [w.lower() for w in tokens]
table = str.maketrans("", "", string.punctuation)
stripped = [w.translate(table) for w in tokens]
words = [word for word in stripped if word.isalpha()]

# Use the stopword list matching the *detected* language instead of a
# hard-coded Spanish one; unmapped codes fall back to Spanish, which
# preserves the old behavior for the original URL.
stop_words = set(stopwords.words(ISO_TO_NLTK.get(lang, "spanish")))
words = [w for w in words if w not in stop_words]

with open("palabras.txt", "w", encoding="utf-8") as f:
    f.write("".join(" " + word for word in words))

with open("palabras.txt", "r", encoding="utf-8") as myfile:
    texto = myfile.read().replace("\n", "")

textFinal = TextBlob(texto)

# Skip translation when the page is already in the target language —
# TextBlob raises NotTranslated on same-language translation requests.
if lang != TRANSLATE_TO:
    textFinal = textFinal.translate(to=TRANSLATE_TO)

print(textFinal.sentiment)
freq = nltk.FreqDist(words)
freq.plot(20, cumulative=False)