I'm a beginner at web spiders and I've been so confused these days while using aiohttp. Here is my code:
import os
import asyncio

import aiofiles
import aiohttp
from bs4 import BeautifulSoup

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
          'Referer': 'https://www.mzitu.com/',
          'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
          'Accept-Encoding': 'gzip',
          }


class MZiTu(object):
    def __init__(self):
        self.timeout = 5
        self.file_path = 'D:\mzitu'
        self.common_page_url = 'https://www.mzitu.com/page/'
        self.total_page_num = 0
        self.end_album_num = 0
        self.session = None

    async def start(self):
        async with aiohttp.ClientSession(headers=header) as mzt.session:
            for page in range(1, self.total_page_num+1):
                await self.crawlAlbum(self.common_page_url, page)

    async def crawlAlbum(self, common_url, page_num):
        page_url = self.common_page_url + str(page_num)
        async with self.session.get(page_url, timeout=self.timeout) as resp:
            html = await resp.text()
            bsop = BeautifulSoup(html, 'lxml')
            album_items = bsop.find('ul', {'id': 'pins'}).findAll('li')
            for item in album_items:
                try:
                    album_title = item.find('img').attrs['alt']
                    album_url = item.find('a').attrs['href']
                    if not os.path.exists(os.path.join(self.file_path, album_title)):
                        os.mkdir(os.path.join(self.file_path, album_title))
                    os.chdir(os.path.join(self.file_path, album_title))
                    await self.crawlImgs(album_url)
                except:
                    continue

    async def crawlImgs(self, album_url):
        self.end_album_num = await self.getAlbumTotalNum(album_url)
        for i in range(1, self.end_album_num+1):
            img_page_url = album_url + str(i)
            async with self.session.get(img_page_url, timeout=self.timeout) as resq:
                html = await resq.text()
                bsop = BeautifulSoup(html, 'lxml')
                try:
                    img_url = bsop.find('div', {'class': 'main-image'}).find('img').attrs['src']
                    await self.downloadImg(i, img_url)
                except:
                    continue

    async def getAlbumTotalNum(self, album_url):
        async with self.session.get(album_url, timeout=self.timeout) as resq:
            html = await resq.text()
            bsop = BeautifulSoup(html, 'lxml')
            total_num = int(bsop.find('div', {'class': 'nav-links'}).findAll('a', {'class': 'page-numbers'})[-2].text)
            return total_num

    async def downloadImg(self, index, img_url):
        async with self.session.get(img_url, timeout=self.timeout) as resq:
            content = await resq.read()
            async with aiofiles.open(str(index)+'.jpg', 'wb') as f:
                await f.write(content)


if __name__ == "__main__":
    mzt = MZiTu()
    mzt.total_page_num = 2
    loop = asyncio.get_event_loop()
    to_do = [mzt.start()]
    wait_future = asyncio.wait(to_do)
    loop.run_until_complete(wait_future)
    loop.close()
My code returns directly at the first line of the method below. Why? I'm so confused:
async def getAlbumTotalNum(self, album_url):
    async with self.session.get(album_url, timeout=self.timeout) as resq:
        html = await resq.text()
        bsop = BeautifulSoup(html, 'lxml')
        total_num = int(bsop.find('div', {'class': 'nav-links'}).findAll('a', {'class': 'page-numbers'})[-2].text)
        return total_num
I can't find any errors in my program, and I'm so confused. If there are any learning materials about aiohttp and asyncio, please point me to them; I'm finding this difficult.
The first issue is that you are using Pokémon exception handling: you really don't want to catch them all. Catch specific exceptions only, or at least catch only Exception, and make sure to re-raise asyncio.CancelledError (you don't want to block task cancellations), and log or print the exceptions that are raised so you can further clean up your handler. As a quick fix, I replaced your try: ... except: continue blocks with:
try:
    # ...
except asyncio.CancelledError:
    raise
except Exception:
    traceback.print_exc()
    continue
and added import traceback at the top. When you then run your code, you see why it is failing:
Traceback (most recent call last):
  File "test.py", line 43, in crawlAlbum
    await self.crawlImgs(album_url)
  File "test.py", line 51, in crawlImgs
    self.end_album_num = await self.getAlbumTotalNum(album_url)
  File "test.py", line 72, in getAlbumTotalNum
    total_num = int(bsop.find('div', {'class': 'nav-links'}).findAll('a', {'class': 'page-numbers'})[-2].text)
AttributeError: 'NoneType' object has no attribute 'findAll'
Either the way the site marks up its links has changed, or the site uses JavaScript to alter the DOM in the browser after loading the HTML. Either way, using a blanket except: clause without logging the error hides such issues from you and makes them really hard to debug. I'd at least add some logging to record what URL the code was trying to parse when exceptions occur, so you can replicate the issue in an interactive, non-asyncio setup and try out different approaches to parse the pages.
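For example, here is a sketch of crawlAlbum with that kind of logging wired in (the logging setup is my addition, not part of your original code):

import logging

logger = logging.getLogger(__name__)  # module-level logger

async def crawlAlbum(self, common_url, page_num):
    page_url = self.common_page_url + str(page_num)
    async with self.session.get(page_url, timeout=self.timeout) as resp:
        html = await resp.text()
        bsop = BeautifulSoup(html, 'lxml')
        album_items = bsop.find('ul', {'id': 'pins'}).findAll('li')
        for item in album_items:
            try:
                album_title = item.find('img').attrs['alt']
                album_url = item.find('a').attrs['href']
                if not os.path.exists(os.path.join(self.file_path, album_title)):
                    os.mkdir(os.path.join(self.file_path, album_title))
                os.chdir(os.path.join(self.file_path, album_title))
                await self.crawlImgs(album_url)
            except asyncio.CancelledError:
                raise
            except Exception:
                # logger.exception() records the full traceback plus the URL that failed
                logger.exception('Failed to process an album on %s', page_url)
                continue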
Rather than use .find() and .findAll() calls, use a CSS selector to find the correct elements:
links = bsop.select(f'div.pagenavi a[href^="{album_url}"] span')
return 1 if len(links) < 3 else int(links[-2].string)
The above uses the current URL to limit the search to the specific span elements with an a element parent whose href attribute value at least starts with the current page URL.
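Put together, that method could look something like this (a sketch built around the two lines above; the early return of 1 covers albums where fewer than 3 pagination links match):

async def getAlbumTotalNum(self, album_url):
    async with self.session.get(album_url, timeout=self.timeout) as resp:
        html = await resp.text()
        bsop = BeautifulSoup(html, 'lxml')
        # span elements inside pagination links that point back at this album
        links = bsop.select(f'div.pagenavi a[href^="{album_url}"] span')
        # with fewer than 3 matching links there is no "last page" number to read
        return 1 if len(links) < 3 else int(links[-2].string)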
Note that the above is not the only problem, however; when that one is fixed, the next exception is:
Traceback (most recent call last):
  File "test.py", line 59, in crawlImgs
    img_url = bsop.find('div', {'class': 'main-image'}).find('img').attrs['src']
AttributeError: 'NoneType' object has no attribute 'find'
This one is actually caused by your incorrect URL handling for albums, which assumes that they'll always end in /. Correct this:
async def crawlImgs(self, album_url):
    end_album_num = await self.getAlbumTotalNum(album_url)
    if album_url[-1] != '/':
        album_url += '/'
    for i in range(1, end_album_num + 1):
        img_page_url = album_url + str(i)
        # ...
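If you apply the same CSS-selector approach to the image lookup, a sketch of the full method could look like this (the div.main-image img selector mirrors what your .find() chain was looking for; selecting into a list avoids the NoneType error):

async def crawlImgs(self, album_url):
    end_album_num = await self.getAlbumTotalNum(album_url)
    if album_url[-1] != '/':
        album_url += '/'
    for i in range(1, end_album_num + 1):
        img_page_url = album_url + str(i)
        async with self.session.get(img_page_url, timeout=self.timeout) as resp:
            html = await resp.text()
            bsop = BeautifulSoup(html, 'lxml')
            # select() returns a list, so a missing element is an empty list rather than None
            images = bsop.select('div.main-image img')
            if not images:
                continue
            await self.downloadImg(i, images[0]['src'])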
You do not want to set end_album_num as an attribute on self, however! Class instance state is shared between tasks; while you don't actually create multiple tasks in your code (it is all one sequential task at the moment), you want to avoid altering shared state.
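To make that shared-state point concrete, here is a minimal illustration (a toy example, not your scraper) of how an instance attribute gets clobbered once several tasks run concurrently, while a local variable stays private to each task:

import asyncio

class Counter:
    def __init__(self):
        self.current = 0          # shared between all tasks using this instance

    async def shared_state(self, n):
        self.current = n          # every running task overwrites the same attribute
        await asyncio.sleep(0)    # give the other tasks a chance to run
        return self.current       # may now hold another task's value

    async def local_state(self, n):
        current = n               # local variable, private to this task
        await asyncio.sleep(0)
        return current            # always the value this task was given

async def main():
    c = Counter()
    print(await asyncio.gather(*(c.shared_state(i) for i in range(3))))  # e.g. [2, 2, 2]
    print(await asyncio.gather(*(c.local_state(i) for i in range(3))))   # [0, 1, 2]

asyncio.run(main())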