Search code examples
python · ajax · selenium-webdriver · web-scraping · beautifulsoup

How do I scrape a website that serves multiple pages under the same URL, where the page jump is an AJAX request?


I've been at this for days. I'm trying to scrape this website: "https://careers.ispor.org/jobseeker/search/results/". I've got everything covered, from the script that extracts the information on every job page to the script that counts how many pages there are. However, in order to get the individual links of every job posting, I need to loop through the pages containing 25 listings each, and THAT is the problem:

  1. it's all under the same URL
  2. forget about Selenium, it gets detected and blocked after 2 tries
  3. the data loads with javascript so the regular requests method doesn't work, the only working method I found is by using Zenrows' free trial to make the request

Here's my code so far, I got the first page's data successfully using this:

from zenrows import ZenRowsClient
# Fix: BeautifulSoup was used below but never imported, which raises
# NameError at the `soup = BeautifulSoup(...)` line.
from bs4 import BeautifulSoup

client = ZenRowsClient("the_api_key_i_got_from_the_trial_account")
url = "https://careers.ispor.org/jobseeker/search/results"
# ZenRows options: render JavaScript and route through a premium proxy.
params = {"js_render": "true", "premium_proxy": "true"}

# Fetch page 1 of the search results (the only page this approach can reach).
response = client.get(url, params=params)
soup = BeautifulSoup(response.content, 'html.parser')

After that, I tried to replicate the AJAX request that I inspected when clicking the "next page" button, but the result I keep getting only mimics having gone to the next page — the job postings in the returned result are still those of the first page:

# Paging payload copied from DevTools for the "next page" AJAX call.
# NOTE(review): the browser sent these as a GET query string, but here they
# are passed as a request body (`data=`) — presumably why the server ignores
# them and returns page 1 again. TODO confirm against the Network tab.
data = {
    'page': '2',
    'pos_flt': '0',
    'location_autocomplete': 'true',
    'radius': '320',
    'ajaxRequest': '1',
    'user_latlong': 'lat=33.874698638916$long=10.102299690247',
    "js_render": "true",
    "premium_proxy": "true"
}
# Cookies captured from one browser session. AWSALB/AWSALBCORS (load-balancer
# stickiness) and datadome (anti-bot) look like short-lived, session-bound
# tokens, so replaying them from a different client is unlikely to keep
# working — TODO confirm.
cookies = {
    'AWSALB': "e6+c5w9IR/N4+ERov3onMB85zlZbl+mughxR4zfjLRLMoq9SJwBHTesVwdSAoTLuK88spU0tbqTVZ8jI7NGHLxMo/7Q+DefZBboxMZDGRMLBY60+HRQaBnKOYDhJ",
    'AWSALBCORS': "e6+c5w9IR/N4+ERov3onMB85zlZbl+mughxR4zfjLRLMoq9SJwBHTesVwdSAoTLuK88spU0tbqTVZ8jI7NGHLxMo/7Q+DefZBboxMZDGRMLBY60+HRQaBnKOYDhJ",
    'JTSUBREF': "careers.ispor.org",
    'datadome': "pn970laC_lalBETD5NWHB~pVKYYLrP2fg9_1JlfW1POc~Ny5Usr37BfuNP1UiAl3kCxoOA7z0Pvlwo69rK5WBre5T9znj0U3p55vC_mMGn1w56eqcSU1eWpla3DYLyJb"
}
# NOTE(review): 'authority'/'method'/'path'/'scheme' are HTTP/2 pseudo-headers
# as displayed by DevTools, not real request headers — sending them as headers
# has no effect on which resource is fetched. The actual AJAX endpoint is the
# URL shown in 'path', not the page URL passed to client.get() below.
headers = {
    'authority': 'careers.ispor.org',
    'method': 'GET',
    'path': '/c/@search_results/controller/includes/search_jobs.cfm?page=2&pos_flt=0&location_autocomplete=true&radius=320&ajaxRequest=1&user_latlong=lat%3D33.874698638916%24long%3D10.102299690247',
    'scheme': 'https'
}
# Still requests the original page URL, so the response is page 1's HTML.
response = client.get(url, params=params, headers=headers, cookies=cookies, data=data)
soup2 = BeautifulSoup(response.content, 'html.parser')

Solution

  • You don't need ZenRows for this; you can simply replicate the AJAX requests directly. Here's a quick example:

    import requests
    from bs4 import BeautifulSoup
    import json
    
    def get_jobs_list(page_number):
        """Fetch one page of job listings via the site's AJAX endpoint.

        Parameters
        ----------
        page_number : int
            1-based page of search results (25 listings per page).

        Returns
        -------
        list[dict]
            One dict per listing with keys 'job_id', 'job_Position',
            'job_company', 'job_source', 'job_Location'.

        Raises
        ------
        requests.HTTPError
            If the endpoint responds with an error status.
        """
        params = {
            'page': page_number,
            'pos_flt': 0,
            'location_autocomplete': True,
            'radius': 320,
            'ajaxRequest': 1
        }

        url = 'https://careers.ispor.org/c/@search_results/controller/includes/search_jobs.cfm'
        # timeout avoids hanging forever on a stalled connection (requests has
        # no default timeout); raise_for_status surfaces HTTP errors here
        # instead of as a confusing JSON decode failure below.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()

        # The JSON payload's 'search_results' field is an HTML fragment.
        soup = BeautifulSoup(data['search_results'], 'html.parser')
        # Select the listing tiles, skipping the interleaved promotional tile.
        divs = soup.select('div.job-result-tiles > div[role=button]:not(.candidate-products-promotion-tile)')

        # Each tile carries its metadata in hidden <input> fields.
        names = ['job_id', 'job_Position', 'job_company', 'job_source', 'job_Location']
        jobs = [{ n: d.select_one(f'input[name="{n}"]').get('value') for n in names} for d in divs]

        return jobs
    
    
    def get_job_details(job_id, job_source=None):
        """Fetch the full description and apply link for a single job posting.

        Parameters
        ----------
        job_id : str
            The posting's id as returned by ``get_jobs_list``.
        job_source : str, optional
            The posting's source as returned by ``get_jobs_list``.

        Returns
        -------
        dict
            ``{'description': str, 'apply_link': str}``.

        Raises
        ------
        requests.HTTPError
            If the endpoint responds with an error status.
        """
        params = {
            'job_id': job_id,
            'job_source': job_source,
            'site_id': 23051,
            'ajaxRequest': 1,
        }

        url = 'https://careers.ispor.org/c/@search_results/controller/includes/search_job.cfm'
        # timeout avoids hanging forever (requests has no default timeout);
        # raise_for_status fails fast on HTTP errors before JSON decoding.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        # The JSON payload's 'job_view' field is an HTML fragment.
        soup = BeautifulSoup(data['job_view'], 'html.parser')

        desc = soup.select_one('div.job-description-parent-container')
        # The apply button's configuration is embedded as JSON in a hidden input.
        apply_json = json.loads(soup.select_one('input#applyButtonJson').get('value'))

        details = {
            'description': desc.get_text(separator='\n').strip(),
            # 'link' True => direct external href; otherwise only an email
            # address from the apply-by-email modal is available.
            'apply_link': apply_json['linkAttr']['href'] if apply_json['link'] else apply_json['modalAttr']['href_email']
        }

        return details
    
    
    # Minimal browser-like headers shared by both helper functions above;
    # no cookies or anti-bot tokens appear to be required for these endpoints.
    headers = {
        'accept-language': 'en-US,en;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    }


    # Example: fetch all listings on results page 3.
    jobs = get_jobs_list(page_number=3)
    print(jobs)
    
    # get the details of a job:
    # job_id, job_source = jobs[0]['job_id'], jobs[0]['job_source']
    # details = get_job_details(job_id, job_source)
    # print(details)