Search code examples
python scrapy splash-screen scrapyjs

Using scrapyjs to crawl onclick pages with Splash


I am trying to get the URL from pages that navigate using JavaScript, like this:

<span onclick="go1()">click here </span>
<script>
// Click handler for the <span>: sends the browser to the inner page whose
// file name is taken from the global variable `myname` (defined elsewhere).
function go1() {
    var target = "../innerpages/" + myname + ".php";
    window.location = target;
}
</script>

This is my code using scrapyjs with Splash:

def start_requests(self):
    """Yield one Splash-rendered request for every URL in ``start_urls``.

    Each request targets the ``render.html`` endpoint and passes a
    ``js_source`` snippet that clicks the first ``<span>`` on the page.
    """
    click_first_span = 'document.getElementsByTagName("span")[0].click()'
    for start_url in self.start_urls:
        # Build fresh dicts per request so no request shares its meta
        # with another one.
        render_args = {
            'wait': 4,
            'html': 1,
            'png': 1,
            'render_all': 1,
            'js_source': click_first_span,
        }
        splash_meta = {'endpoint': 'render.html', 'args': render_args}
        yield Request(start_url, self.parse, meta={'splash': splash_meta})

If I write

'js_source': 'document.title="hello world"'

it works.

It seems like I can manipulate text inside the page, but I cannot get the URL from go1().

What should I do if I want to get the URL inside go1()?

Thanks!


Solution

  • You can use the /execute endpoint:

    class MySpider(scrapy.Spider):
        """Spider that follows an ``onclick`` navigation through Splash.

        Every start URL is sent to Splash's ``execute`` endpoint together with
        a Lua script that loads the page, clicks the first ``<span>``, waits,
        and returns the resulting URL and HTML as a JSON object.
        """
        ...
    
        def start_requests(self):
            """Yield one Splash ``execute`` request per entry in ``start_urls``."""
            script = """
            function main(splash)
                local url = splash.args.url
                assert(splash:go(url))
                assert(splash:wait(1))
    
                assert(splash:runjs('document.getElementsByTagName("span")[0].click()'))
                assert(splash:wait(1))
    
                -- return result as a JSON object; "url" is the address the
                -- browser is on after the onclick handler has run
                return {
                    url = splash:url(),
                    html = splash:html()
                }
            end
            """
            for url in self.start_urls:
                yield scrapy.Request(url, self.parse_result, meta={
                    'splash': {
                        'args': {'lua_source': script},
                        'endpoint': 'execute',
                    }
                })
    
        def parse_result(self, response):
            """Decode the JSON produced by the Lua script and build a selector.

            ``response`` is the reply from the Splash ``execute`` endpoint; its
            body is the JSON-encoded table returned by ``main`` in the script.
            """
    
            # fetch base URL because response url is the Splash endpoint
            baseurl = response.meta["_splash_processed"]["args"]["url"]
    
            # decode JSON response
            # NOTE(review): newer Scrapy releases expose the decoded body as
            # ``response.text`` and deprecate ``body_as_unicode()`` -- confirm
            # against the installed version before switching.
            splash_json = json.loads(response.body_as_unicode())
    
            # URL the browser was on after the click, i.e. where go1() navigated to
            target_url = splash_json["url"]
    
            # and build a new selector from the response "html" key from that object
            selector = scrapy.Selector(text=splash_json["html"], type="html")
    
            ...