Search code examples
python-3.x pycharm typeerror spyder operand

Python 3.x unsupported operand type in using encode decode


I am trying to build a generic crawler for my marketing project that keeps track of where each piece of information came from (blogs, testimonials, etc.). I am using Python 3.5 with Spyder/PyCharm as my IDE, and I keep getting the following error when using encode/decode. The input to my code is a list of company names and product features in an Excel file. I searched for possible solutions, but the recommendations in the community are about typecasting, which I am not sure is the problem here. Kindly let me know if any further clarification is required from my side.

from __future__ import division, unicode_literals 
import codecs
import re
import os
import xlrd
import requests
from urllib.request import urlopen
from time import sleep
from bs4 import BeautifulSoup
import openpyxl
from collections import Counter

page=0  # number of links whose fetched page matched the current word (reset per word)
b=0  # matching links whose markup contains 'customer-testimonial'
n=0  # matching links whose markup contains 'case-study'
w=0  # matching links whose markup contains 'blog'
p=0  # matching links whose markup contains 'press'
o=0  # matching links that fall into none of the categories above
workbook=xlrd.open_workbook("C:\Product.xlsx")  # input workbook the search words are read from
workbook1=xlrd.open_workbook("C:\linkslist.xlsx")  # input workbook the site names are read from
sheet_names = workbook.sheet_names()
sheet_names1 = workbook1.sheet_names()
wb= openpyxl.Workbook() #User Spreadsheet
ws = wb.active  # output sheet "User": per-word category counts
ws.title = "User"
ws['A1'] = 'Feature'
ws['B1'] = 'Customer-Testimonials'
ws['C1'] = 'Case Study'
ws['D1'] = 'Blog'
ws['E1'] = 'Press'
ws['F1'] = 'Total posts'
ws1 = wb.create_sheet(title="Ml")  # output sheet "Ml": per-word phrases, addresses and tag counts
ws1['A1'] = 'Feature'
ws1['B1'] = 'Phrase'
ws1['C1'] = 'Address'
ws1['D1'] = 'Tag Count'
worksheet = workbook.sheet_by_name(sheet_names[0])  # first sheet of the words workbook
worksheet1 = workbook1.sheet_by_name(sheet_names[0])  # NOTE(review): looks up workbook1 by a name taken from sheet_names, not sheet_names1 - confirm intended
for linknumber in range(0,25):  # processes rows 0..24 of the links sheet
    u = worksheet1.cell(linknumber,0).value
    url='www.' + u.lower() + '.com'  # host name is built from the cell value in column 0
    print (url)
    r=''
    while r == '':  # retry until requests.get returns without raising; no upper bound on retries
        try:
            print ("in loop")
            r  = requests.get("http://" +url)
        except:
            sleep(3)#if the code still gives that error then try increasing the sleep time to 5 maybe
    print (r)
    data = r.text
    #print data
    soup1 = BeautifulSoup(data, "html.parser")  # parsed landing page; its <a> tags are scanned below
    #print soup1
    num=3 #starting row number and keep the column same.
    word = ''
    word = worksheet.cell(num,3).value  # search word comes from row index num, column index 3

    while not word == 'end':  # the cell value 'end' terminates the word list
        print (num)
        #print word
        tag_list=[]  # parent tag names of every text node that matched the word
        phrase= []  # matched text-node result sets, one entry per matching link
        counts=[]
        address=[]        # encoded markup of each matching link
        counts = Counter(tag_list)
        for link in soup1.find_all('a'):
            #print link
            add = link.encode("ascii", "ignore")  # link is a bs4 Tag; the TypeError in the traceback is raised by this call
            print (add) 
            if not'Log In' in add:  # NOTE(review): add is expected to be bytes here while 'Log In' is str - confirm this membership test works on Python 3
                #print link.get('href')
                i=0
                content = ''
                for i in range(1,5):  # up to four fetch attempts; stops trying once content is non-empty
                    if content=='':
                        try:
                            print (link.get('href'))
                            i+=1  # no effect on the iteration: range() rebinds i on the next pass
                            req = urllib.request.Request(link.get('href'))  # NOTE(review): the visible imports bind only urlopen, not the name urllib - a NameError here would be swallowed by the bare except
                            with urllib.request.urlopen(req) as response:
                                content = response.read()    
                        except:
                            sleep(3)
                            #if the code still gives that error then try increasing the sleep time to 5 maybe
                            continue
                soup = BeautifulSoup(content, "html.parser") 
                s=soup(text=re.compile(word))  # text nodes of the fetched page matching word, which is used as a regex pattern
                if s:
                    print ("TRUE")
                    add = link.encode('ascii','ignore')  # same call shape as above: 'ignore' is passed as the second positional argument
                    print (type(add))
                    if 'customer-testimonial' in add :  # first matching category wins; exactly one counter is incremented
                        b+=1
                    elif 'case-study' in add :
                        n+=1
                    elif 'blog' in add :
                        w+=1  
                    elif 'press' in add :
                        p+=1
                    else :
                        o+=1
                    #phrase_type=["Customer testimonials","news","ads","twitter","facebook","instagram"]
                    #print(os.path.join(root, name))
                    print (add)
                    for tag in s:
                        parent_html = tag.parent.name 
                        print (parent_html)
                        tag_list.append(parent_html)
                    phrase.append(s)
                    address.append(add)
                    #print str(phrase)
                    counts = Counter(tag_list)
                    page +=1
                else:
                    counts = Counter(tag_list)
        no =num-1  # output row: num starts at 3, so results start at row 2, below the header row
        print(counts)
        print (word)
        ws['A%d'%no] = word.encode('utf-8' , 'ignore')  # stores the encoded (bytes) form of the word in the cell
        ws1['A%d'%no] = word.encode('utf-8' , 'ignore')
        print ("Number of pages is %d" %page)
        print ("Number of Customer testimonials posts is %d" %b)
        ws['B%d'%no] = b
        print ("Number of Case Studies posts is %d" %n)
        ws['C%d'%no] = n
        print ("Number of blog posts is %d" %w)
        ws['D%d'%no] = w
        print ("Number of press posts is %d" %p)
        ws['E%d'%no] = p
        print ("Number of posts is %d" %page)
        ws['F%d'%no] = page
        ws1['B%d'%no] = phrase.encode('utf-8' , 'ignore')  # NOTE(review): phrase is a list; list has no encode method - verify
        ws1['C%d'%no] = address.encode('utf-8' , 'ignore')  # NOTE(review): address is a list as well - verify
        ws1['D%d'%no] = counts.encode('utf-8' , 'ignore')  # NOTE(review): counts is a Counter; Counter has no encode method - verify
        counts.clear()
        num += 1
        word = worksheet.cell(num,3).value  # advance to the next word in the same column
        #print word
        page=0  # reset all per-word counters and accumulators before the next word
        b=0
        n=0
        w=0
        p=0
        o=0
        phrase=[]
        address=[]
        tag_list=[]
wb.save('%s.xlsx'%worksheet1.cell(linknumber,0).value)  # runs once, after the for loop; file name comes from the last row processed

I get the following output and error while running the code:

www.amobee.com
in loop
<Response [200]>
3
Traceback (most recent call last):
  File "C:/project_web_parser.py", line 69, in <module>
    add = link.encode("ascii", "ignore")
  File "C:\ProgramData\Ana3\lib\site-packages\bs4\element.py", line 1094, in encode
    u = self.decode(indent_level, encoding, formatter)
  File "C:\ProgramData\Ana3\lib\site-packages\bs4\element.py", line 1159, in decode
    indent_space = (' ' * (indent_level - 1))
TypeError: unsupported operand type(s) for -: 'str' and 'int'

Process finished with exit code 1

Solution

  • Traceback shows error in line 69 where you try to encode link. To fix it, just change that line to:

    add = link.encode("ascii", errors="ignore") 
    

    Why does it happen?

    Your link variable is of type bs4.element.Tag:

    >>>type(link)
    <class 'bs4.element.Tag'>
    

    The .encode() method for tags takes more arguments than the .encode() method for strings. In the bs4 source code, in the file \bs4\element.py on line 1089, you can find its definition:

    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
               indent_level=None, formatter="minimal",
               errors="xmlcharrefreplace"):
    

    The first argument is encoding, the second is indent_level (an int or None), and errors is the fourth, so it has to be passed by keyword.

    Error

    unsupported operand type(s) for -: 'str' and 'int'
    

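    means that Python tried to evaluate 'ignore' - 1: the string 'ignore' was passed positionally as indent_level, and bs4 then computed indent_level - 1.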