For research purposes, I need to build a set of benign programs. First, I need to get these programs from http://downloads.informer.com. To do so, I have written a Python script that iterates over each download page and extracts the download links into a list. The script then uses these links to download the programs (exe, msi, or zip files). Unfortunately, at the download step, the script fails with this error: AttributeError: 'Request' object has no attribute 'decode'.
The following is a version of the script that works on a single page and retrieves a single program (for simplicity):
import wget
import urllib.request
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'http://sweet-home-3d.informer.com/download'

req = urllib.request.Request(
    my_url,
    data=None,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
    }
)
uClient = uReq(req)
page_html = uClient.read()
page_soup = soup(page_html, 'lxml')
cont01 = page_soup.findAll('a', {'class': 'download_button'})
conts = cont01[1]
ref = conts['href']
addr = urllib.request.Request(
    ref,
    data=None,
    headers={
        'User-Agent': 'Mozilla/5.0'
    }
)
wget.download(addr)
The error I get is the following:
AttributeError Traceback (most recent call last)
<ipython-input-1-93c4caaa1777> in <module>()
31 }
32 )
---> 33 wget.download(addr)
C:\Users\bander\Anaconda3\lib\site-packages\wget.py in download(url, out, bar)
503
504 # get filename for temp file in current directory
--> 505 prefix = detect_filename(url, out)
506 (fd, tmpfile) = tempfile.mkstemp(".tmp", prefix=prefix, dir=".")
507 os.close(fd)
C:\Users\bander\Anaconda3\lib\site-packages\wget.py in detect_filename(url, out, headers, default)
482 names["out"] = out or ''
483 if url:
--> 484 names["url"] = filename_from_url(url) or ''
485 if headers:
486 names["headers"] = filename_from_headers(headers) or ''
C:\Users\bander\Anaconda3\lib\site-packages\wget.py in filename_from_url(url)
228 """:return: detected filename as unicode or None"""
229 # [ ] test urlparse behavior with unicode url
--> 230 fname = os.path.basename(urlparse.urlparse(url).path)
231 if len(fname.strip(" \n\t.")) == 0:
232 return None
C:\Users\bander\Anaconda3\lib\urllib\parse.py in urlparse(url, scheme, allow_fragments)
292 Note that we don't break the components up in smaller bits
293 (e.g. netloc is a single string) and we don't expand % escapes."""
--> 294 url, scheme, _coerce_result = _coerce_args(url, scheme)
295 splitresult = urlsplit(url, scheme, allow_fragments)
296 scheme, netloc, url, query, fragment = splitresult
C:\Users\bander\Anaconda3\lib\urllib\parse.py in _coerce_args(*args)
112 if str_input:
113 return args + (_noop,)
--> 114 return _decode_args(args) + (_encode_result,)
115
116 # Result objects are more helpful than simple tuples
C:\Users\bander\Anaconda3\lib\urllib\parse.py in _decode_args(args, encoding, errors)
96 def _decode_args(args, encoding=_implicit_encoding,
97 errors=_implicit_errors):
---> 98 return tuple(x.decode(encoding, errors) if x else '' for x in args)
99
100 def _coerce_args(*args):
C:\Users\bander\Anaconda3\lib\urllib\parse.py in <genexpr>(.0)
96 def _decode_args(args, encoding=_implicit_encoding,
97 errors=_implicit_errors):
---> 98 return tuple(x.decode(encoding, errors) if x else '' for x in args)
99
100 def _coerce_args(*args):
AttributeError: 'Request' object has no attribute 'decode'
I would be grateful if someone could help me fix this. Thanks in advance.
Actually, you don't need Selenium for this. It's a cookie issue. (The immediate AttributeError, by the way, comes from passing a urllib Request object to wget.download(), which expects a plain URL string.) I'm sure you can handle cookies somehow with urllib too, but that's not my area of expertise.
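If you want to stay with urllib, a cookie-aware opener built on http.cookiejar should do the same job. This is only an untested sketch, though; the requests version below is what I actually ran:

import urllib.request
from http.cookiejar import CookieJar

# share one cookie jar across every request made through this opener
jar = CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
with opener.open('http://sweet-home-3d.informer.com/download/') as resp:
    page_html = resp.read()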
If you do the job with requests instead - without a browser and without wget - you can grab the files like so:
import requests
from bs4 import BeautifulSoup as bs

# you need headers or the site won't let you grab the data
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3181.0 Safari/537.36"
}
url = 'http://sweet-home-3d.informer.com/download/'

# you need a cookie to download, so create a persistent session
s = requests.Session()
r = s.get(url, headers=headers)
soup = bs(r.text, "html.parser")

# all download options live in a div with class "table"
links_table = soup.find('div', {'class': 'table'})
file_name = links_table.find('div', {'class': 'table-cell file_name'})['title']
download_link = links_table.find('a', {'class': 'download_button'})['href']

# for some reason the download page itself doesn't set the cookie you need;
# the subpages do, so we fetch one of them before we call download_link
cookie_link = links_table.a['href']
r = s.get(cookie_link, headers=headers)

# now, with the cookie set, we can download the file
r = s.get(download_link, headers=headers)
with open(file_name, 'wb') as f:
    f.write(r.content)
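Once the single-page version works, scaling it up to your list of download pages is just a loop around the same steps, reusing the session so the cookie persists. A sketch, assuming page_urls is the list of informer.com download pages your script already collects (the download_program helper name is mine):

def download_program(s, page_url, headers):
    # scrape one informer.com download page and fetch the file it links to
    r = s.get(page_url, headers=headers)
    soup = bs(r.text, "html.parser")
    links_table = soup.find('div', {'class': 'table'})
    file_name = links_table.find('div', {'class': 'table-cell file_name'})['title']
    download_link = links_table.find('a', {'class': 'download_button'})['href']
    # hit a subpage first so the session picks up its cookie
    s.get(links_table.a['href'], headers=headers)
    r = s.get(download_link, headers=headers)
    with open(file_name, 'wb') as f:
        f.write(r.content)

# page_urls is assumed to come from your link-collection step
for page_url in page_urls:
    download_program(s, page_url, headers)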