I am trying to build a generic crawler for my marketing project and keep track of where the information came from viz blogs, testimonials etc. I am using Python 3.5 and Spyder/pycharm as IDE and I keep getting the following error in using encode - decode. The input to my code is a list of company names and product features in an excel file. I also searched for possible solutions but the recommendations in the community are for typecasting, which I am not sure is the problem. Kindly let me know if some more clarification is required from my side.
from __future__ import division, unicode_literals
import codecs
import re
import os
import xlrd
import requests
from urllib.request import urlopen
from time import sleep
from bs4 import BeautifulSoup
import openpyxl
from collections import Counter
page=0
b=0
n=0
w=0
p=0
o=0
workbook=xlrd.open_workbook("C:\Product.xlsx")
workbook1=xlrd.open_workbook("C:\linkslist.xlsx")
sheet_names = workbook.sheet_names()
sheet_names1 = workbook1.sheet_names()
wb= openpyxl.Workbook() #User Spreadsheet
ws = wb.active
ws.title = "User"
ws['A1'] = 'Feature'
ws['B1'] = 'Customer-Testimonials'
ws['C1'] = 'Case Study'
ws['D1'] = 'Blog'
ws['E1'] = 'Press'
ws['F1'] = 'Total posts'
ws1 = wb.create_sheet(title="Ml")
ws1['A1'] = 'Feature'
ws1['B1'] = 'Phrase'
ws1['C1'] = 'Address'
ws1['D1'] = 'Tag Count'
worksheet = workbook.sheet_by_name(sheet_names[0])
worksheet1 = workbook1.sheet_by_name(sheet_names[0])
for linknumber in range(0,25):
u = worksheet1.cell(linknumber,0).value
url='www.' + u.lower() + '.com'
print (url)
r=''
while r == '':
try:
print ("in loop")
r = requests.get("http://" +url)
except:
sleep(3)#if the code still gives that error then try increasing the sleep time to 5 maybe
print (r)
data = r.text
#print data
soup1 = BeautifulSoup(data, "html.parser")
#print soup1
num=3 #starting row number and keep the column same.
word = ''
word = worksheet.cell(num,3).value
while not word == 'end':
print (num)
#print word
tag_list=[]
phrase= []
counts=[]
address=[]
counts = Counter(tag_list)
for link in soup1.find_all('a'):
#print link
add = link.encode("ascii", "ignore")
print (add)
if not'Log In' in add:
#print link.get('href')
i=0
content = ''
for i in range(1,5):
if content=='':
try:
print (link.get('href'))
i+=1
req = urllib.request.Request(link.get('href'))
with urllib.request.urlopen(req) as response:
content = response.read()
except:
sleep(3)
#if the code still gives that error then try increasing the sleep time to 5 maybe
continue
soup = BeautifulSoup(content, "html.parser")
s=soup(text=re.compile(word))
if s:
print ("TRUE")
add = link.encode('ascii','ignore')
print (type(add))
if 'customer-testimonial' in add :
b+=1
elif 'case-study' in add :
n+=1
elif 'blog' in add :
w+=1
elif 'press' in add :
p+=1
else :
o+=1
#phrase_type=["Customer testimonials","news","ads","twitter","facebook","instagram"]
#print(os.path.join(root, name))
print (add)
for tag in s:
parent_html = tag.parent.name
print (parent_html)
tag_list.append(parent_html)
phrase.append(s)
address.append(add)
#print str(phrase)
counts = Counter(tag_list)
page +=1
else:
counts = Counter(tag_list)
no =num-1
print(counts)
print (word)
ws['A%d'%no] = word.encode('utf-8' , 'ignore')
ws1['A%d'%no] = word.encode('utf-8' , 'ignore')
print ("Number of pages is %d" %page)
print ("Number of Customer testimonials posts is %d" %b)
ws['B%d'%no] = b
print ("Number of Case Studies posts is %d" %n)
ws['C%d'%no] = n
print ("Number of blog posts is %d" %w)
ws['D%d'%no] = w
print ("Number of press posts is %d" %p)
ws['E%d'%no] = p
print ("Number of posts is %d" %page)
ws['F%d'%no] = page
ws1['B%d'%no] = phrase.encode('utf-8' , 'ignore')
ws1['C%d'%no] = address.encode('utf-8' , 'ignore')
ws1['D%d'%no] = counts.encode('utf-8' , 'ignore')
counts.clear()
num += 1
word = worksheet.cell(num,3).value
#print word
page=0
b=0
n=0
w=0
p=0
o=0
phrase=[]
address=[]
tag_list=[]
wb.save('%s.xlsx'%worksheet1.cell(linknumber,0).value)
I get the following output and error while running the code:
www.amobee.com
in loop
<Response [200]>
3
Traceback (most recent call last):
File "C:/project_web_parser.py", line 69, in <module>
add = link.encode("ascii", "ignore")
File "C:\ProgramData\Ana3\lib\site-packages\bs4\element.py", line 1094, in encode
u = self.decode(indent_level, encoding, formatter)
File "C:\ProgramData\Ana3\lib\site-packages\bs4\element.py", line 1159, in decode
indent_space = (' ' * (indent_level - 1))
TypeError: unsupported operand type(s) for -: 'str' and 'int'
Process finished with exit code 1
Traceback shows error in line 69 where you try to encode link. To fix it, just change that line to:
add = link.encode("ascii", errors="ignore")
Why does it happen?
Your link
variable is type of bs4.element.Tag
>>>type(link)
<class 'bs4.element.Tag'>
.encode()
method for tags takes more arguments then .encode()
method for strings.
In source code of bs4 in file \bs4\element.py
on line 1089 you can find definition of it:
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
indent_level=None, formatter="minimal",
errors="xmlcharrefreplace"):
First argument is encoding, second is indent_level (int or None) and errors handling is forth.
Error
unsupported operand type(s) for -: 'str' and 'int'
means that you tried to subtract 'ignore' - 1
.