The "next page" button doesn't change the URL when it's pressed, so I'm having trouble paginating with Scrapy.
'''
import scrapy
# Spider as posted in the question: it submits the "Find a Post" search form,
# follows every result to its detail page, and tries to page through the
# results by re-submitting the form. This is the version that keeps receiving
# the same results page; the code is left exactly as posted.
class LegonSpider(scrapy.Spider):
name = "legon"
# Fetch the search page so its form (and hidden fields) can be submitted.
def start_requests(self):
yield scrapy.Request(
url="https://mylegion.org/PersonifyEbusiness/Find-a-Post",
callback=self.parse
)
# Submit the search form pre-filled from the response, overriding the
# distance, country and department fields.
def parse(self, response):
# Select distance and country
yield scrapy.FormRequest.from_response(
response,
formid='aspnetForm',
formdata={'dnn$ctr2802$DNNWebControlContainer$ctl00$DistanceList': '100',
'@IP_COUNTRY': 'USA',
'@IP_DEPARTMENT': '00000000001L'},
callback=self.parse_post_page
)
# Handle one page of search results: queue a detail request per result,
# then attempt to request the next page.
def parse_post_page(self, response):
# Extract and yield requests for post detail pages
post_elements = response.xpath("//div[@class='membership-dir-result-item']")
for post_element in post_elements:
# NOTE(review): .get() returns None when nothing matches, in which case
# .strip() raises AttributeError.
post_num = post_element.xpath(".//div[contains(@class,'POST_NAME')]/text()").get().strip()
post_link = post_element.xpath("./a/@href").get()
# The post number is carried to the detail callback through request meta.
yield response.follow(post_link, callback=self.parse_post_detail, meta={'post_num': post_num})
# NOTE(review): this XPath begins with a single "/", so it only matches an
# <input> that is the document's root element; the working spider later in
# this file uses "//input[...]" instead.
next_page_button = response.xpath("/input[@id='dnn_ctr2802_DNNWebControlContainer_ctl00_Next']")
if next_page_button:
# Extract form data for next page submission
# NOTE(review): only __EVENTTARGET/__EVENTARGUMENT are set here; the working
# spider later in this file instead posts the button's own name/value pair
# ('dnn$ctr2802$DNNWebControlContainer$ctl00$Next': 'Next').
formdata = {
'__EVENTTARGET': 'dnn$ctr2802$DNNWebControlContainer$ctl00$Next',
'__EVENTARGUMENT': ''
}
yield scrapy.FormRequest.from_response(response, formdata=formdata, callback=self.parse_post_page)
# Extract the leadership, address and type fields from a post detail page
# and emit them as one item together with the post number from request meta.
def parse_post_detail(self,response):
# First and second text nodes found under the "Leadership" block.
leader1 = response.xpath("(//div[contains(@class,'Leadership')]/div[2]/text())[1]").get()
leader2 = response.xpath("(//div[contains(@class,'Leadership')]/div[2]/text())[2]").get()
address = response.xpath("//div[contains(@class,'Address')]/div[2]/text()").get()
typ = response.xpath("//div[contains(@class,'Type')]/div[2]/text()").get()
yield {
"post_num": response.meta['post_num'],
"leader1": leader1,
"leader2": leader2,
"address": address,
"type" : typ
}
I think Scrapy never actually reaches the next page: it keeps requesting the base URL, which does not change at all when I press "next page" or use the "new search" option.
When I checked the responses I saw that I get the same page over and over.
If we use BurpSuite to inspect the requests and compare them we can see this part:
You can see on the right-hand side the value "Next", but if we inspect the form data in the response we can see that the value is missing. We just need to add it:
import scrapy
class LegonSpider(scrapy.Spider):
    """Scrape post details from the mylegion.org "Find a Post" search.

    Flow: load the search page, submit the search form, follow every result
    to its detail page, and page through the results by re-submitting the
    form with the "Next" button's own name/value pair included.
    """

    name = "legon"

    def start_requests(self):
        """Request the search page so its form can be submitted."""
        yield scrapy.Request(
            url="https://mylegion.org/PersonifyEbusiness/Find-a-Post",
            callback=self.parse,
        )

    def parse(self, response):
        """Submit the search form with distance, country and department set."""
        yield scrapy.FormRequest.from_response(
            response,
            formid='aspnetForm',
            formdata={
                'dnn$ctr2802$DNNWebControlContainer$ctl00$DistanceList': '100',
                '@IP_COUNTRY': 'USA',
                '@IP_DEPARTMENT': '00000000001L',
            },
            callback=self.parse_post_page,
        )

    def parse_post_page(self, response):
        """Queue a detail request per search result, then request the next page.

        Yields one request per result that has a link, followed by a form
        submission for the next results page when a "Next" button is present.
        """
        for post_element in response.xpath("//div[@class='membership-dir-result-item']"):
            # default="" keeps a result without a POST_NAME text node from
            # crashing the whole page on None.strip().
            post_num = post_element.xpath(
                ".//div[contains(@class,'POST_NAME')]/text()"
            ).get(default="").strip()
            post_link = post_element.xpath("./a/@href").get()
            if post_link is None:
                # response.follow() cannot build a request without a URL.
                self.logger.warning(
                    "Skipping result %r on %s: no detail link found",
                    post_num, response.url,
                )
                continue
            yield response.follow(
                post_link,
                callback=self.parse_post_detail,
                meta={'post_num': post_num},
            )

        next_page_button = response.xpath(
            "//input[@id='dnn_ctr2802_DNNWebControlContainer_ctl00_Next']"
        )
        if next_page_button:
            # The button's own name/value pair must be part of the posted
            # form data; without it the same results page comes back.
            form_data = {'dnn$ctr2802$DNNWebControlContainer$ctl00$Next': 'Next'}
            yield scrapy.FormRequest.from_response(
                response,
                # Same form as the initial search request in parse().
                formid='aspnetForm',
                formdata=form_data,
                callback=self.parse_post_page,
            )

    def parse_post_detail(self, response):
        """Yield one item with the post's leadership, address and type fields.

        The post number is read from ``response.meta['post_num']``, set by
        ``parse_post_page``. Fields missing from the page are emitted as None.
        """
        # First and second text nodes found under the "Leadership" block.
        leader1 = response.xpath("(//div[contains(@class,'Leadership')]/div[2]/text())[1]").get()
        leader2 = response.xpath("(//div[contains(@class,'Leadership')]/div[2]/text())[2]").get()
        address = response.xpath("//div[contains(@class,'Address')]/div[2]/text()").get()
        typ = response.xpath("//div[contains(@class,'Type')]/div[2]/text()").get()
        yield {
            "post_num": response.meta['post_num'],
            "leader1": leader1,
            "leader2": leader2,
            "address": address,
            "type": typ,
        }
See the differences between my form data and yours.
BTW, you missed a `/` in the `next_page_button` selector: it should start with `//input`, not `/input`.