Search code examples
python-3.x selenium-webdriver web-scraping

Scrape all data of a dynamic website using Selenium in Python


I am trying to scrape data from the URL below:

url=https://www.usaspending.gov/search/?hash=7e5e5a79e871a86ff6b69395e47ab41e

But the page also contains a scroller, so not all of the values get scraped.

The code I tried is shown below. I would appreciate your help.

import requests
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains

# Start a Chrome browser session driven by Selenium and load the search page.
driver=webdriver.Chrome()

url = "https://www.usaspending.gov/search/?hash=7e5e5a79e871a86ff6b69395e47ab41e"
driver.get(url)

# NOTE: `action` is created here but never used in the rest of this snippet.
action = ActionChains(driver)

from selenium.webdriver.common.by import By
# Fixed 7-second pause before querying the DOM (presumably to let the page render).
time.sleep(7)

# Each lookup runs once, right after the pause, and returns a list of matches.
headings_row = driver.find_elements(By.CLASS_NAME, 'award-result-header-cell')
award_link = driver.find_elements(By.PARTIAL_LINK_TEXT, 'FA')
link="/recipient/"
# The XPath compares @href for equality with `link`, so only anchors whose
# href attribute is exactly "/recipient/" are matched.
recp_link = driver.find_elements(By.XPATH,'//a[@href="'+link+'"]')


list_headings_row = []
award_id = []
reciepient = []

# Copy the text of the first 15 entries of each list; indexing raises
# IndexError if any of the three lists holds fewer than 15 elements.
for i in range(15):
    list_headings_row.append(headings_row[i].text)
    award_id.append(award_link[i].text)
    reciepient.append(recp_link[i].text)

# Only the header texts are printed; award_id and reciepient are filled but not shown.
print(list_headings_row)

Below is the output I get from printing list_headings_row:

> ['Award ID',  'Recipient Name',  'Start Date\n(Period of
> Performance)',  'End Date\n(Period of Performance)',  '',  '',  '', 
> '',  '',  '',  '',  '',  '',  '',  '']

Solution

  • As mentioned in the comment, the data can be obtained with the official API.

    Full docs on the spending by award API endpoint.

    Here's how to do this:

    import json
    import time
    
    import requests
    
    # Endpoint that every search request is POSTed to.
    api_url = "https://api.usaspending.gov/api/v2/search/spending_by_award/"

    # Headers sent with every request: a desktop-browser User-Agent string
    # (split across three adjacent literals) and an XMLHttpRequest marker.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.200"
        ),
        "X-Requested-With": "XMLHttpRequest",
    }
    
    # JSON body POSTed to the search endpoint: filters, requested columns,
    # paging and sort settings.
    payload = {
        "filters": {
            # Three consecutive 1 October - 30 September windows, newest first.
            "time_period": [
                {"start_date": "2022-10-01", "end_date": "2023-09-30"},
                {"start_date": "2021-10-01", "end_date": "2022-09-30"},
                {"start_date": "2020-10-01", "end_date": "2021-09-30"},
            ],
            "award_type_codes": ["A", "B", "C", "D"],
            "agencies": [
                {
                    "type": "awarding",
                    "tier": "subtier",
                    "name": "Department of the Air Force",
                    "toptier_name": "Department of Defense",
                },
            ],
            "award_amounts": [{"upper_bound": 75000000}],
        },
        # Columns requested for every award in the response.
        "fields": [
            "Award ID", "Recipient Name", "Start Date", "End Date",
            "Award Amount", "Total Outlays", "Description", "def_codes",
            "COVID-19 Obligations", "COVID-19 Outlays",
            "Infrastructure Obligations", "Infrastructure Outlays",
            "Awarding Agency", "Awarding Sub Agency", "Contract Award Type",
            "recipient_id", "prime_award_recipient_id",
        ],
        "page": 1,  # first page requested
        "limit": 60,
        "sort": "Award Amount",
        "order": "desc",
        "subawards": False,
    }
    
    
    def wait_a_bit(wait_for: int = 1, message: bool = False) -> None:
        """Block the caller for ``wait_for`` seconds.

        Args:
            wait_for: Duration handed to ``time.sleep``.
            message: When truthy, print a notice naming the duration
                before the pause starts.
        """
        if message:
            notice = f"Waiting for {wait_for} seconds..."
            print(notice)
        time.sleep(wait_for)
    
    
    def get_spending_data():
        """Page through the award search results, printing a sample of each page.

        POSTs a copy of the module-level ``payload`` to ``api_url`` one page at
        a time, prints the first award of every non-empty page as indented
        JSON, and stops once the response's ``page_metadata["hasNext"]`` flag
        is falsy.

        Raises:
            requests.HTTPError: If a response carries an error status
                (raised by ``raise_for_status``).
            requests.Timeout: If the server does not answer within the
                request timeout.
        """
        # Work on a shallow copy so advancing the page counter does not leak
        # into the shared module-level payload; only the top-level "page" key
        # is modified, so a shallow copy is sufficient.
        request_body = dict(payload)
        with requests.Session() as session:
            while True:
                response = session.post(
                    api_url, headers=headers, json=request_body, timeout=30
                )
                response.raise_for_status()
                spending_data = response.json()
                awards = spending_data["results"]
                # This shows only the first award in the list; dump `awards`
                # itself to see all. The guard avoids an IndexError when a
                # page comes back with no results.
                if awards:
                    print(json.dumps(awards[0], indent=4))
                if not spending_data["page_metadata"]["hasNext"]:
                    break
                request_body["page"] += 1
                # Pause only when another request is actually going to follow.
                wait_a_bit(wait_for=1, message=True)
    
    
    # Run the download only when this file is executed as a script, not on import.
    if __name__ == "__main__":
        get_spending_data()
    

    This should print the first "row" of the data as a dict for each "scroll" (page of the table).

    {
        "internal_id": 90310986,
        "Award ID": "FA875019C1518",
        "Recipient Name": "INTERNATIONAL BUSINESS MACHINES CORP",
        "Start Date": "2019-08-16",
        "End Date": "2023-08-16",
        "Award Amount": 74999951.0,
        "Total Outlays": 5984934.86,
        "Description": "IBM NORTHPOLE NEURAL INFERENCE MACHINE: ARCHITECTURE, SOFT INTELLECTUAL PROPERTY (IP) CORE TECHNOLOGY, SOFTWARE ECOSYSTEM, PROTOTYPE CHIP&BOARD PHASE 2",
        "def_codes": [
            "N",
            "Q"
        ],
        "COVID-19 Obligations": -3175836.46,
        "COVID-19 Outlays": 3175836.46,
        "Infrastructure Obligations": null,
        "Infrastructure Outlays": null,
        "Awarding Agency": "Department of Defense",
        "Awarding Sub Agency": "Department of the Air Force",
        "Contract Award Type": "DEFINITIVE CONTRACT",
        "recipient_id": "d1776a20-1dbc-351a-8f2e-e20d504a1d3f-C",
        "prime_award_recipient_id": null,
        "awarding_agency_id": 1173,
        "agency_slug": "department-of-defense",
        "generated_internal_id": "CONT_AWD_FA875019C1518_9700_-NONE-_-NONE-"
    }
    Waiting for 1 seconds...
    {
        "internal_id": 15069464,
        "Award ID": "FA867217C0010",
        "Recipient Name": "RAYTHEON COMPANY",
        "Start Date": "2019-06-30",
        "End Date": "2023-06-30",
        "Award Amount": 70866143.0,
        "Total Outlays": 2579829.0,
        "Description": "SMALL DIAMETER BOMB II - LOT 3 PRODUCTION",
        "def_codes": [
            "Q"
        ],
        "COVID-19 Obligations": null,
        "COVID-19 Outlays": null,
        "Infrastructure Obligations": null,
        "Infrastructure Outlays": null,
        "Awarding Agency": "Department of Defense",
        "Awarding Sub Agency": "Department of the Air Force",
        "Contract Award Type": "DEFINITIVE CONTRACT",
        "recipient_id": "01c4a3a3-b4c5-ce4e-822b-d17f09985001-C",
        "prime_award_recipient_id": null,
        "awarding_agency_id": 1173,
        "agency_slug": "department-of-defense",
        "generated_internal_id": "CONT_AWD_FA867217C0010_9700_-NONE-_-NONE-"
    }
    Waiting for 1 seconds...
    {
        "internal_id": 15058192,
        "Award ID": "FA862215F8112",
        "Recipient Name": "HX5 LLC",
        "Start Date": "2015-08-14",
        "End Date": "2020-08-31",
        "Award Amount": 66839178.32,
        "Total Outlays": 0.0,
        "Description": "IGF::CL::IGF SCATI ENGINEERING PROFESSIONAL AND ADMINISTRATIVE SUPPORT SERVICES (EPASS) ADVISORY AND ASSISTANCE SERVICES (A&AS) SUPPORT IN SUPPORT OF AIR FORCE PROGRAM EXECUTIVE OFFICER, AGILE COMBAT SUPPORT (AFPEO/ACS) AIR FORCE LIFE CYCLE MANAGEMENT CENTER (AFLCMC) AGILE COMBAT SUPPORT DIRECTORATE (AFLCMC/WN) WRIGHT-PATTERSON AFB",
        "def_codes": [
            "Q"
        ],
        "COVID-19 Obligations": null,
        "COVID-19 Outlays": null,
        "Infrastructure Obligations": null,
        "Infrastructure Outlays": null,
        "Awarding Agency": "Department of Defense",
        "Awarding Sub Agency": "Department of the Air Force",
        "Contract Award Type": "DELIVERY ORDER",
        "recipient_id": "385dd1df-55cb-ae3f-a24c-0b7430d4ae02-C",
        "prime_award_recipient_id": null,
        "awarding_agency_id": 1173,
        "agency_slug": "department-of-defense",
        "generated_internal_id": "CONT_AWD_FA862215F8112_9700_GS00Q14OADS712_4732"
    }
    Waiting for 1 seconds...