I am trying to scrape data from the URL below:
url=https://www.usaspending.gov/search/?hash=7e5e5a79e871a86ff6b69395e47ab41e
However, the page also contains a scroller, so not all of the values get scraped.
The code I tried is shown below. Could you please help?
import requests
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# Upper bound on how many matched elements are read from each result set
# (the original script read exactly 15 of each).
MAX_ITEMS = 15


def _element_text(browser, element):
    """Return the text of *element*, even if it is scrolled out of view.

    ``WebElement.text`` only reports text that is rendered as visible, so
    cells hidden behind the table's scroller come back as ``''``. The element
    is therefore scrolled into view first; if its text is still empty, the raw
    DOM ``textContent`` is used as a fallback.
    """
    browser.execute_script(
        "arguments[0].scrollIntoView({block: 'nearest', inline: 'nearest'});",
        element,
    )
    visible_text = element.text
    if visible_text:
        return visible_text
    return (element.get_attribute("textContent") or "").strip()


driver = webdriver.Chrome()
url = "https://www.usaspending.gov/search/?hash=7e5e5a79e871a86ff6b69395e47ab41e"
driver.get(url)
action = ActionChains(driver)  # not used anywhere in the visible script

# Give the JavaScript-rendered results table time to load.
time.sleep(7)

headings_row = driver.find_elements(By.CLASS_NAME, 'award-result-header-cell')
award_link = driver.find_elements(By.PARTIAL_LINK_TEXT, 'FA')
link = "/recipient/"
# NOTE(review): this matches only anchors whose href is exactly "/recipient/";
# if recipient links carry an id after that prefix, contains(@href, ...) is
# probably what was intended - confirm against the page markup.
recp_link = driver.find_elements(By.XPATH, '//a[@href="' + link + '"]')

# Each list is built independently and sliced rather than indexed, so a result
# set with fewer than MAX_ITEMS matches no longer raises IndexError.
list_headings_row = [_element_text(driver, cell) for cell in headings_row[:MAX_ITEMS]]
award_id = [_element_text(driver, anchor) for anchor in award_link[:MAX_ITEMS]]
reciepient = [_element_text(driver, anchor) for anchor in recp_link[:MAX_ITEMS]]

print(list_headings_row)
Below is the output I get when printing list_headings_row:
> ['Award ID', 'Recipient Name', 'Start Date\n(Period of
> Performance)', 'End Date\n(Period of Performance)', '', '', '',
> '', '', '', '', '', '', '', '']
As mentioned in the comment, the data can be obtained with the official API.
See the full docs on the spending by award API endpoint for details.
Here's how to do this:
import json
import time
import requests
# URL that the search request is POSTed to.
api_url = "https://api.usaspending.gov/api/v2/search/spending_by_award/"

_user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.200"
)

# HTTP headers sent along with the search request.
headers = {
    "User-Agent": _user_agent,
    "X-Requested-With": "XMLHttpRequest",
}

# (start_date, end_date) pairs for the "time_period" filter, newest first.
_TIME_WINDOWS = (
    ("2022-10-01", "2023-09-30"),
    ("2021-10-01", "2022-09-30"),
    ("2020-10-01", "2021-09-30"),
)

# The "filters" section of the request body.
_filters = {
    "time_period": [
        {"start_date": start, "end_date": end} for start, end in _TIME_WINDOWS
    ],
    "award_type_codes": ["A", "B", "C", "D"],
    "agencies": [
        {
            "type": "awarding",
            "tier": "subtier",
            "name": "Department of the Air Force",
            "toptier_name": "Department of Defense",
        },
    ],
    "award_amounts": [{"upper_bound": 75000000}],
}

# Names of the columns requested for every award.
_fields = [
    "Award ID",
    "Recipient Name",
    "Start Date",
    "End Date",
    "Award Amount",
    "Total Outlays",
    "Description",
    "def_codes",
    "COVID-19 Obligations",
    "COVID-19 Outlays",
    "Infrastructure Obligations",
    "Infrastructure Outlays",
    "Awarding Agency",
    "Awarding Sub Agency",
    "Contract Award Type",
    "recipient_id",
    "prime_award_recipient_id",
]

# Complete JSON request body: filters, requested columns, paging and sorting.
payload = {
    "filters": _filters,
    "fields": _fields,
    "page": 1,
    "limit": 60,
    "sort": "Award Amount",
    "order": "desc",
    "subawards": False,
}
def wait_a_bit(wait_for: int = 1, message: bool = False) -> None:
    """Sleep for ``wait_for`` seconds, optionally printing a notice first.

    Args:
        wait_for: Number of seconds passed to ``time.sleep``.
        message: When true, print a "Waiting for ..." line before sleeping.
    """
    notice = f"Waiting for {wait_for} seconds..." if message else None
    if notice is not None:
        print(notice)
    time.sleep(wait_for)
def get_spending_data():
    """Page through the award search results and print one award per page.

    POSTs the module-level ``payload`` to ``api_url`` and prints the first
    award of each page as indented JSON. The page number is then advanced
    until the response's ``page_metadata.hasNext`` flag is false, with a
    one-second pause between requests.

    Raises:
        requests.HTTPError: If a response has an error status code.
        requests.Timeout: If the server does not answer within the timeout.
    """
    request_timeout_seconds = 30

    # Work on a copy: advancing the page must not alter the module-level
    # payload, otherwise a second call would resume where the first one ended.
    # A shallow copy is enough because only the top-level "page" key changes.
    request_body = dict(payload)

    with requests.Session() as session:
        while True:
            response = session.post(
                api_url,
                headers=headers,
                json=request_body,
                timeout=request_timeout_seconds,
            )
            response.raise_for_status()
            spending_data = response.json()
            awards = spending_data["results"]

            # This shows only the first award in the list; print `awards`
            # instead to see all. An empty page is skipped rather than
            # raising IndexError.
            if awards:
                print(json.dumps(awards[0], indent=4))

            # Stop before sleeping so the last page costs no extra wait.
            if not spending_data["page_metadata"]["hasNext"]:
                break

            request_body["page"] += 1
            wait_a_bit(wait_for=1, message=True)


if __name__ == "__main__":
    get_spending_data()
This should print the first "row" of the data as a dict for each "scroll" (page of the table).
{
"internal_id": 90310986,
"Award ID": "FA875019C1518",
"Recipient Name": "INTERNATIONAL BUSINESS MACHINES CORP",
"Start Date": "2019-08-16",
"End Date": "2023-08-16",
"Award Amount": 74999951.0,
"Total Outlays": 5984934.86,
"Description": "IBM NORTHPOLE NEURAL INFERENCE MACHINE: ARCHITECTURE, SOFT INTELLECTUAL PROPERTY (IP) CORE TECHNOLOGY, SOFTWARE ECOSYSTEM, PROTOTYPE CHIP&BOARD PHASE 2",
"def_codes": [
"N",
"Q"
],
"COVID-19 Obligations": -3175836.46,
"COVID-19 Outlays": 3175836.46,
"Infrastructure Obligations": null,
"Infrastructure Outlays": null,
"Awarding Agency": "Department of Defense",
"Awarding Sub Agency": "Department of the Air Force",
"Contract Award Type": "DEFINITIVE CONTRACT",
"recipient_id": "d1776a20-1dbc-351a-8f2e-e20d504a1d3f-C",
"prime_award_recipient_id": null,
"awarding_agency_id": 1173,
"agency_slug": "department-of-defense",
"generated_internal_id": "CONT_AWD_FA875019C1518_9700_-NONE-_-NONE-"
}
Waiting for 1 seconds...
{
"internal_id": 15069464,
"Award ID": "FA867217C0010",
"Recipient Name": "RAYTHEON COMPANY",
"Start Date": "2019-06-30",
"End Date": "2023-06-30",
"Award Amount": 70866143.0,
"Total Outlays": 2579829.0,
"Description": "SMALL DIAMETER BOMB II - LOT 3 PRODUCTION",
"def_codes": [
"Q"
],
"COVID-19 Obligations": null,
"COVID-19 Outlays": null,
"Infrastructure Obligations": null,
"Infrastructure Outlays": null,
"Awarding Agency": "Department of Defense",
"Awarding Sub Agency": "Department of the Air Force",
"Contract Award Type": "DEFINITIVE CONTRACT",
"recipient_id": "01c4a3a3-b4c5-ce4e-822b-d17f09985001-C",
"prime_award_recipient_id": null,
"awarding_agency_id": 1173,
"agency_slug": "department-of-defense",
"generated_internal_id": "CONT_AWD_FA867217C0010_9700_-NONE-_-NONE-"
}
Waiting for 1 seconds...
{
"internal_id": 15058192,
"Award ID": "FA862215F8112",
"Recipient Name": "HX5 LLC",
"Start Date": "2015-08-14",
"End Date": "2020-08-31",
"Award Amount": 66839178.32,
"Total Outlays": 0.0,
"Description": "IGF::CL::IGF SCATI ENGINEERING PROFESSIONAL AND ADMINISTRATIVE SUPPORT SERVICES (EPASS) ADVISORY AND ASSISTANCE SERVICES (A&AS) SUPPORT IN SUPPORT OF AIR FORCE PROGRAM EXECUTIVE OFFICER, AGILE COMBAT SUPPORT (AFPEO/ACS) AIR FORCE LIFE CYCLE MANAGEMENT CENTER (AFLCMC) AGILE COMBAT SUPPORT DIRECTORATE (AFLCMC/WN) WRIGHT-PATTERSON AFB",
"def_codes": [
"Q"
],
"COVID-19 Obligations": null,
"COVID-19 Outlays": null,
"Infrastructure Obligations": null,
"Infrastructure Outlays": null,
"Awarding Agency": "Department of Defense",
"Awarding Sub Agency": "Department of the Air Force",
"Contract Award Type": "DELIVERY ORDER",
"recipient_id": "385dd1df-55cb-ae3f-a24c-0b7430d4ae02-C",
"prime_award_recipient_id": null,
"awarding_agency_id": 1173,
"agency_slug": "department-of-defense",
"generated_internal_id": "CONT_AWD_FA862215F8112_9700_GS00Q14OADS712_4732"
}
Waiting for 1 seconds...