Search code examples
python css web-scraping scrapy css-selectors

How to specify a search with selectors when web scraping in Python


My objective is to get data from this site: https://pokemondb.net/pokedex/all

I'm struggling to get the abilities; they should look like this:

URL Name Description of effect

But some of this information is on another page. Any tips on how I can get it?

I need to access the link for each skill and gather the information:

enter image description here

My code currently looks like this:

import scrapy

class PokeSpider(scrapy.Spider):
    """Spider that scrapes the details of one Pokemon from pokemondb.net.

    The crawl starts at the full Pokedex table, follows the link found in
    the first table row, and yields a single item for the page it reaches.
    """

    name = 'pokespider'
    start_urls = ['https://pokemondb.net/pokedex/all']

    def parse(self, response):
        """Follow the link in the second cell of the first Pokedex table row.

        Yields a request handled by ``parser_pokemon``; yields nothing when
        the selector does not match a link.
        """
        linha = response.css('table#pokedex > tbody > tr:first-child')
        link = linha.css("td:nth-child(2) > a::attr(href)").get()
        # response.follow() cannot build a request from a missing href, so
        # skip the request instead of failing when nothing matched.
        if link is not None:
            yield response.follow(link, self.parser_pokemon)

    def parser_pokemon(self, response):
        """Extract the fields of a single Pokemon page and yield them as a dict.

        The item has the keys ``nome``, ``id``, ``tamanho``, ``peso``,
        ``url_pokemon``, ``tipos`` and ``evolucoes``. Scalar fields are
        ``None`` when their selector matches nothing.
        """
        nome = response.css('h1::text').get()
        # Named pokemon_id so the builtin id() is not shadowed; the item key
        # is still "id".
        pokemon_id = response.css('table.vitals-table > tbody > tr:nth-child(1) > td > strong::text').get()
        tamanho = response.css('table.vitals-table > tbody > tr:nth-child(4) > td::text').get()
        peso = response.css('table.vitals-table > tbody > tr:nth-child(5) > td::text').get()
        url_pokemon = response.url
        # Only the first two matched type names are kept.
        tipos = response.css('table.vitals-table tbody tr:nth-child(2) td a::text').getall()[:2]
        evolucoes = []
        evolucoes_possiveis = response.css('#main div.infocard-list-evo div span.infocard-lg-data.text-muted')

        for evolucao in evolucoes_possiveis:
            nome_evolucao = evolucao.css('a::text').get()
            id_evolucao = evolucao.css('small:nth-child(1)::text').get()
            url_evolucao = evolucao.css('a::attr(href)').get()
            # The href is appended to the site root to form the full URL.
            url_evolucao_completinha = f'https://pokemondb.net{url_evolucao}'

            evolucoes.append({
                "nome_evolucao": nome_evolucao,
                "id_evolucao": id_evolucao,
                "url_evolucao": url_evolucao_completinha,
            })

        yield {
            "nome": nome,
            "id": pokemon_id,
            "tamanho": tamanho,
            "peso": peso,
            "url_pokemon": url_pokemon,
            "tipos": tipos,
            "evolucoes": evolucoes,
        }

Solution

  • I advise you to read the Scrapy documentation on cb_kwargs (https://docs.scrapy.org/en/latest/topics/debug.html?highlight=cb_kwargs) and on Scrapy items (https://docs.scrapy.org/en/latest/topics/items.html).

    You can make the next request and pass information to the next callback through the meta argument, like this:

        def parser_pokemon(self, response):
            """Collect the fields of a Pokemon page and pass them to the next callback.

            The fields gathered here are not yielded as an item. They are
            attached to a follow-up request under ``meta['pokemon_attribs']``
            so that ``parse_attributes`` can add to them and yield the final
            item. Exactly one request is yielded per page, after every
            evolution has been collected.
            """
            nome = response.css('h1::text').get()
            # Named pokemon_id so the builtin id() is not shadowed; the dict
            # key is still "id".
            pokemon_id = response.css('table.vitals-table > tbody > tr:nth-child(1) > td > strong::text').get()
            tamanho = response.css('table.vitals-table > tbody > tr:nth-child(4) > td::text').get()
            peso = response.css('table.vitals-table > tbody > tr:nth-child(5) > td::text').get()
            url_pokemon = response.url
            # Only the first two matched type names are kept.
            tipos = response.css('table.vitals-table tbody tr:nth-child(2) td a::text').getall()[:2]
            evolucoes = []
            evolucoes_possiveis = response.css('#main div.infocard-list-evo div span.infocard-lg-data.text-muted')

            for evolucao in evolucoes_possiveis:
                nome_evolucao = evolucao.css('a::text').get()
                id_evolucao = evolucao.css('small:nth-child(1)::text').get()
                url_evolucao = evolucao.css('a::attr(href)').get()
                url_evolucao_completinha = f'https://pokemondb.net{url_evolucao}'

                evolucoes.append(
                    {
                        "nome_evolucao": nome_evolucao,
                        "id_evolucao": id_evolucao,
                        "url_evolucao": url_evolucao_completinha,
                    }
                )

            # VVVVVVVVVVVV next code is updated VVVVVVVVVVVV
            # The request is issued once, after the loop: inside the loop it
            # would be sent per evolution with a partial list, and never for
            # a Pokemon without evolutions. response.follow() builds the
            # request, so no separate Request import is needed. The URL is a
            # placeholder for the page holding the extra data.
            yield response.follow(
                'https://example.com/next_page_path',
                callback=self.parse_attributes,
                meta={
                    'pokemon_attribs': {
                        "nome": nome,
                        "id": pokemon_id,
                        "tamanho": tamanho,
                        "peso": peso,
                        "url_pokemon": url_pokemon,
                        "tipos": tipos,
                        "evolucoes": evolucoes,
                    },
                },
            )
    
    def parse_attributes(self, response):
        """Finish the item started by the previous callback and yield it.

        Reads the dict stored under ``response.meta['pokemon_attribs']``,
        stores the text of the first ``a`` element matched on this page
        under ``pokemon_lastname`` (``None`` when nothing matches), and
        yields that same dict.
        """
        item = response.meta['pokemon_attribs']
        item['pokemon_lastname'] = response.css('a::text').get()
        yield item