I would like to download dynamically generated images from a website. The site uses JavaScript, with buttons that switch to the previous and the next image. I inspected the HTTP requests and responses in Chrome: the requests are almost identical except for the image name, which increases numerically (e.g. 000001.jpg, 000002.jpg).
So far I can access the first image and save it to disk by using a QWebView subclass together with a customized QNetworkAccessManager, in which I override the createRequest function:
import sys,urllib,time,os
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
from PyQt4.QtNetwork import *
from PIL import Image
class NetworkAccessManager(QNetworkAccessManager):
    """Network access manager that saves page images as the web view loads them.

    Requests whose URL contains ``zoom=`` and ``ss2jpg`` (but not ``pi=2``)
    are treated as image requests: the reply body is accumulated and written
    to a file named after the ``jid=`` parameter.  Requests whose URL
    contains ``uf=ssr`` are remembered (URL string, request object and the
    six-digit number in front of ``?``) so that a caller can build a
    follow-up request.  Everything else is forwarded unchanged.

    Emits the old-style signal ``nextPage()`` after an image has been saved.
    """

    def __init__(self, old_manager):
        """Create the manager, reusing the settings of ``old_manager``.

        :param old_manager: the manager previously installed on the page;
            its cache, cookie jar, proxy and proxy factory are reused.
        """
        QNetworkAccessManager.__init__(self)
        self.old_manager = old_manager
        self.setCache(old_manager.cache())
        self.setCookieJar(old_manager.cookieJar())
        self.setProxy(old_manager.proxy())
        self.setProxyFactory(old_manager.proxyFactory())
        self.imreply = None        # reply of the most recent image request
        self.reqstr = None         # URL string of the most recent "uf=ssr" request
        self.otherreply = None     # reply of the most recent "uf=ssr" request
        self.current_req = None    # request object of the most recent "uf=ssr" request
        self.cnt = 0               # number of follow-up requests issued so far
        self.jpgcnt = 0            # number parsed from the most recent matching URL
        self.jpgName = "test.jpg"  # file name used for the next saved image
        self.first = True
        self.ba = QByteArray()     # bytes received so far for the current image

    @staticmethod
    def _parse_image_request(strreq):
        """Return ``(file_name, number)`` parsed from an image URL, or ``None``.

        The name is the text between ``jid=`` (plus one further character)
        and the end of ``.jpg``; the number is that text without the
        extension.  ``None`` is returned when either marker is missing or
        the text in between is not an integer.
        """
        left = strreq.find("jid=")
        right = strreq.find(".jpg&a")
        if left < 0 or right < 0:
            return None
        try:
            number = int(strreq[left + 5:right])
        except ValueError:
            return None
        return strreq[left + 5:right + 4], number

    def createRequest(self, operation, request, data):
        """Forward the request to Qt, recording image and page requests.

        A URL that matches a pattern but cannot be parsed is forwarded like
        any other request instead of raising from inside this Qt override.

        :returns: the reply created by ``QNetworkAccessManager.createRequest``.
        """
        req = request.url().toString()
        is_image = (req.contains(QString("zoom="))
                    and req.contains(QString("ss2jpg"))
                    and not req.contains(QString("pi=2")))
        if is_image:
            parsed = self._parse_image_request(str(req))
            if parsed is not None:
                self.jpgName, self.jpgcnt = parsed
                print("%s %s" % (self.jpgName, self.jpgcnt))
                # Start from an empty buffer so bytes left over from an
                # unfinished earlier download cannot leak into this image.
                self.ba.clear()
                self.imreply = QNetworkAccessManager.createRequest(
                    self, operation, request, data)
                self.connect(self.imreply, SIGNAL("readyRead()"), self.saveImage)
                return self.imreply
        elif req.contains(QString("uf=ssr")):
            strreq = str(req)
            mark = strreq.find("?")
            number = None
            if mark >= 6:
                try:
                    number = int(strreq[mark - 6:mark])
                except ValueError:
                    number = None
            if number is not None:
                # Update the three fields together so they always describe
                # the same request.
                self.reqstr = strreq
                # Keep our own copy rather than the argument object itself.
                self.current_req = QNetworkRequest(request)
                self.jpgcnt = number
            self.otherreply = QNetworkAccessManager.createRequest(
                self, operation, request, data)
            return self.otherreply
        return QNetworkAccessManager.createRequest(self, operation, request, data)

    def saveImage(self):
        """Collect newly arrived image bytes and save the image once complete.

        Connected to ``readyRead()`` of the current image reply.  The image
        is considered complete when the number of buffered bytes equals the
        reply's ``Content-Length``; it is then written to ``self.jpgName``
        and ``nextPage()`` is emitted.

        NOTE(review): ``readAll()`` drains the reply's read buffer -- confirm
        whether the page itself still needs these bytes to display the image.
        """
        content_type = self.imreply.header(QNetworkRequest.ContentTypeHeader).toString()
        if not (content_type.contains(QString("image/jpeg"))
                or content_type.contains(QString("image/png"))):
            return
        contentLen, flag = QString(self.imreply.rawHeader("Content-Length")).toInt()
        self.ba.append(self.imreply.readAll())
        # Without a usable Content-Length there is nothing to compare against.
        if not flag or self.ba.size() != contentLen:
            return
        im = QImage.fromData(self.ba)
        if not im.save(self.jpgName):
            # Undecodable data: drop it instead of failing later in PIL.
            print("could not decode image %s" % self.jpgName)
            self.ba.clear()
            return
        # The file is re-opened and re-written through PIL, as before.
        im = Image.open(self.jpgName)
        print("saving image %s %s" % (contentLen, self.jpgName))
        im.save(self.jpgName)
        self.ba.clear()
        self.emit(SIGNAL("nextPage()"))
class dxWebView(QWebView):
    """Web view that issues the request for the following page on demand."""

    # Width of the zero-padded page number that precedes "?" in the page URL.
    _PAGE_DIGITS = 6

    def __init__(self):
        """Create the view with default ``QWebView`` settings."""
        QWebView.__init__(self)

    @staticmethod
    def _next_url(url, current):
        """Return ``url`` with its page number replaced by ``current + 1``.

        Only the zero-padded digit field directly in front of ``?`` is
        rewritten, and the padding is kept, so other digits in the URL are
        left alone.  If no such field exists, fall back to replacing the
        textual form of ``current`` throughout the URL.
        """
        width = dxWebView._PAGE_DIGITS
        mark = url.find("?")
        field = url[mark - width:mark] if mark >= width else ""
        if field.isdigit():
            return url[:mark - width] + "%0*d" % (width, current + 1) + url[mark:]
        return url.replace(str(current), str(current + 1))

    def clickNext(self):
        """Request the page after the one the network manager saw last.

        Does nothing once 50 follow-up requests have been issued, or when
        the manager has not recorded a page request yet.
        """
        manager = self.page().networkAccessManager()
        if manager.cnt >= 50:
            return
        if manager.current_req is None or manager.reqstr is None:
            return
        nexturl = self._next_url(manager.reqstr, manager.jpgcnt)
        print("next url %s" % nexturl)
        # Work on a copy so the request stored on the manager is not modified.
        nextreq = QNetworkRequest(manager.current_req)
        nextreq.setUrl(QUrl(nexturl))
        manager.get(nextreq)
        manager.cnt = manager.cnt + 1
def main():
    """Open the start page in a web view wired to the image-saving manager.

    Installs a ``NetworkAccessManager`` on the view's page, connects its
    ``nextPage()`` signal to the view's ``clickNext`` slot, loads the start
    URL and runs the Qt event loop until the application exits.
    """
    application = QApplication(sys.argv)
    QWebSettings.globalSettings().setAttribute(QWebSettings.PluginsEnabled, True)

    web_view = dxWebView()
    page = web_view.page()
    capture_manager = NetworkAccessManager(page.networkAccessManager())
    page.setNetworkAccessManager(capture_manager)
    QObject.connect(capture_manager, SIGNAL("nextPage()"), web_view.clickNext)

    start_url = "http://www.yishuleia.cn/DrsPath.do?kid=686A67696A6F6A673134343438303337&username=gdnz2&spagenum=201&pages=50&fid=14813857&a=3fc3e380601ced0f08749c964294120e&btime=2013-04-03&etime=2013-04-23&template=bookdsr1&firstdrs=http%3A%2F%2Fbook.duxiu.com%2FbookDetail.jsp%3FdxNumber%3D000008299393%26d%3D592DC22226A893A958A6578E7D039A43"
    web_view.load(QUrl(start_url))
    web_view.show()
    sys.exit(application.exec_())


# Run the viewer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
When the first image is saved, clickNext is triggered and the QNetworkAccessManager should send the next request. But I found that manager.get(nextreq) did not work: the HTTP analyzer did not sniff any HTTP request or response. Is something wrong in my clickNext function? How should I do this? Thanks!
The QNetworkAccessManager is part of the QWebPage object, and its createRequest() method is invoked whenever the rendered HTML (or any JavaScript it contains) requests a resource. As far as I understand, the clickNext() function won't really have access to the actual DOM of the web page in the manner you require.
If your aim is to build an application that can download all of these pictures, you can run some simple JavaScript on the page (for example via view.page().mainFrame().evaluateJavaScript(...)) that automatically clicks through to the 'Next' image. Then, as you have already done, you watch for the image requests in your overridden createRequest() function.