I've been at this for days trying to scrape this website: "https://careers.ispor.org/jobseeker/search/results/". I've got everything covered, from the script that extracts the information from every job page to the script that counts how many pages there are. But in order to get the individual links of every job posting, I need to loop through the result pages, which contain 25 listings each — and THAT is the problem:
Here's my code so far, I got the first page's data successfully using this:
# First-page fetch routed through the ZenRows scraping API.
from zenrows import ZenRowsClient
client = ZenRowsClient("the_api_key_i_got_from_the_trial_account")
url = "https://careers.ispor.org/jobseeker/search/results"
# js_render / premium_proxy are ZenRows API options, not parameters of the target site.
params = {"js_render":"true","premium_proxy":"true"}
response = client.get(url, params=params)
# NOTE(review): BeautifulSoup is used here but not imported in this snippet.
soup = BeautifulSoup(response.content, 'html.parser')
After that I tried to replicate the AJAX request I inspected when clicking the "next page" button, but the response only pretends to have advanced to the next page — the job postings in the returned result are still those of the first page:
data = {
'page': '2',
'pos_flt': '0',
'location_autocomplete': 'true',
'radius': '320',
'ajaxRequest': '1',
'user_latlong': 'lat=33.874698638916$long=10.102299690247',
"js_render": "true",
"premium_proxy": "true"
}
cookies = {
'AWSALB': "e6+c5w9IR/N4+ERov3onMB85zlZbl+mughxR4zfjLRLMoq9SJwBHTesVwdSAoTLuK88spU0tbqTVZ8jI7NGHLxMo/7Q+DefZBboxMZDGRMLBY60+HRQaBnKOYDhJ",
'AWSALBCORS': "e6+c5w9IR/N4+ERov3onMB85zlZbl+mughxR4zfjLRLMoq9SJwBHTesVwdSAoTLuK88spU0tbqTVZ8jI7NGHLxMo/7Q+DefZBboxMZDGRMLBY60+HRQaBnKOYDhJ",
'JTSUBREF': "careers.ispor.org",
'datadome': "pn970laC_lalBETD5NWHB~pVKYYLrP2fg9_1JlfW1POc~Ny5Usr37BfuNP1UiAl3kCxoOA7z0Pvlwo69rK5WBre5T9znj0U3p55vC_mMGn1w56eqcSU1eWpla3DYLyJb"
}
headers = {
'authority': 'careers.ispor.org',
'method': 'GET',
'path': '/c/@search_results/controller/includes/search_jobs.cfm?page=2&pos_flt=0&location_autocomplete=true&radius=320&ajaxRequest=1&user_latlong=lat%3D33.874698638916%24long%3D10.102299690247',
'scheme': 'https'
}
response = client.get(url, params=params, headers=headers, cookies=cookies, data=data)
soup2 = BeautifulSoup(response.content, 'html.parser')
You don't need ZenRows for this; you can simply replicate the AJAX requests yourself. Here's a quick example:
import requests
from bs4 import BeautifulSoup
import json
def get_jobs_list(page_number):
    """Fetch one page of job listings from the ISPOR careers AJAX endpoint.

    Parameters
    ----------
    page_number : int
        1-based index of the results page (the site serves 25 listings per page).

    Returns
    -------
    list[dict]
        One dict per job tile with keys 'job_id', 'job_Position',
        'job_company', 'job_source', 'job_Location'.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status.
    """
    params = {
        'page': page_number,
        'pos_flt': 0,
        # Send the literal string the site itself uses; the Python bool True
        # would be serialized by requests as "True".
        'location_autocomplete': 'true',
        'radius': 320,
        'ajaxRequest': 1,
    }
    url = 'https://careers.ispor.org/c/@search_results/controller/includes/search_jobs.cfm'
    # A timeout prevents the call from hanging indefinitely on a stalled connection.
    response = requests.get(url, params=params, headers=headers, timeout=30)
    # Fail loudly on HTTP errors instead of trying to JSON-decode an error page.
    response.raise_for_status()
    data = response.json()
    soup = BeautifulSoup(data['search_results'], 'html.parser')
    # Each job tile is a div[role=button]; skip the promotional tile.
    divs = soup.select('div.job-result-tiles > div[role=button]:not(.candidate-products-promotion-tile)')
    names = ['job_id', 'job_Position', 'job_company', 'job_source', 'job_Location']
    return [{n: d.select_one(f'input[name="{n}"]').get('value') for n in names} for d in divs]
def get_job_details(job_id, job_source=None):
    """Fetch the description and apply link for a single job posting.

    Parameters
    ----------
    job_id : str | int
        The job's id, as returned by ``get_jobs_list``.
    job_source : str | None
        The job's source identifier; when None, requests omits the parameter.

    Returns
    -------
    dict
        {'description': str, 'apply_link': str}

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status.
    """
    params = {
        'job_id': job_id,
        'job_source': job_source,  # None-valued params are dropped by requests
        'site_id': 23051,
        'ajaxRequest': 1,
    }
    url = 'https://careers.ispor.org/c/@search_results/controller/includes/search_job.cfm'
    # Timeout + status check: don't hang forever, don't parse error pages as JSON.
    response = requests.get(url, params=params, headers=headers, timeout=30)
    response.raise_for_status()
    data = response.json()
    soup = BeautifulSoup(data['job_view'], 'html.parser')
    desc = soup.select_one('div.job-description-parent-container')
    apply_json = json.loads(soup.select_one('input#applyButtonJson').get('value'))
    return {
        # Some postings may lack the description container; return '' rather
        # than crash with AttributeError on None.
        'description': desc.get_text(separator='\n').strip() if desc else '',
        # 'link' is truthy when the posting has a direct external apply link;
        # otherwise the apply action opens an email modal.
        'apply_link': apply_json['linkAttr']['href'] if apply_json['link'] else apply_json['modalAttr']['href_email'],
    }
# Minimal browser-like request headers — presumably needed so the endpoint
# treats the client as a regular browser; confirm if a plain request works.
headers = {
'accept-language': 'en-US,en;q=0.9',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
}
# Example: fetch page 3 of the listings.
jobs = get_jobs_list(page_number=3)
print(jobs)
# get the details of a job:
# job_id, job_source = jobs[0]['job_id'], jobs[0]['job_source']
# details = get_job_details(job_id, job_source)
# print(details)