Following this tutorial, I am trying to extract basic property information from zillow.com. More specifically, I want to extract the information pertinent to property cards displayed on the website.
The following code extracts information for only 3 properties, even though many more property cards are shown on the first page. Can someone please explain why the code is skipping the remaining properties?
import ast
import json

import requests
from bs4 import BeautifulSoup
# Request settings for the Zillow search page: target URL, browser-like
# request headers and the query parameters handed to requests.get().
url = 'https://www.zillow.com/homes/for_sale/house,multifamily,townhouse_type/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-106.43826441618356%2C%22east%22%3A-103.36483912321481%2C%22south%22%3A38.903882034738686%2C%22north%22%3A40.52008627183672%7D%2C%22mapZoom%22%3A9%2C%22customRegionId%22%3A%22fcac4612c1X1-CR9xde3hldsvpa_v24ah%22%2C%22isMapVisible%22%3Afalse%2C%22filterState%22%3A%7B%22hoa%22%3A%7B%22max%22%3A200%7D%2C%22con%22%3A%7B%22value%22%3Afalse%7D%2C%22apa%22%3A%7B%22value%22%3Afalse%7D%2C%22sch%22%3A%7B%22value%22%3Atrue%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22land%22%3A%7B%22value%22%3Afalse%7D%2C%22schu%22%3A%7B%22value%22%3Afalse%7D%2C%22manu%22%3A%7B%22value%22%3Afalse%7D%2C%22schr%22%3A%7B%22value%22%3Afalse%7D%2C%22apco%22%3A%7B%22value%22%3Afalse%7D%2C%22basf%22%3A%7B%22value%22%3Atrue%7D%2C%22schc%22%3A%7B%22value%22%3Afalse%7D%2C%22schb%22%3A%7B%22min%22%3A%227%22%7D%7D%2C%22isListVisible%22%3Atrue%7D'

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # NOTE(review): advertising 'br' invites a Brotli-compressed reply, which
    # requests presumably only decodes when a brotli package is installed -
    # confirm, or drop 'br' if the response body looks garbled.
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    # SECURITY NOTE(review): this is a captured browser session cookie, i.e. a
    # credential. It should not live in source control; load it from the
    # environment or a local config file instead.
    # The value is written as two adjacent literals (joined by Python at
    # compile time) because the original single literal contained a raw line
    # break, which is a syntax error.
    'cookie': (
        'zguid=23|%24ca6368b9-7b92-4d51-ab67-c2be89065efd; _ga=GA1.2.1460486079.1621047110; _pxvid=7fa13d96-b528-11eb-9860-0242ac120012; _gcl_au=1.1.2025797213.1621047113; __gads=ID=66253ab863481044:T=1621047113:S=ALNI_MZr3mehwm2Wjo7NOrmalVtEcJSXag; __pdst=50987f626deb4767a53b5d8ca2ea406a; _fbp=fb.1.1621047115574.1019382068; _pin_unauth=dWlkPU5EVm1PRGRpTVRBdE5UTTFaUzAwWlRBNExUZzJZall0TWpZMU1HWTBNV0ppWlRkbA; G_ENABLED_IDPS=google; userid=X|3|231a9d744e104379%7C3%7CiEt8bkUx9hWaFeyCeAwN9tHl_T0d0Cq-kynGuEvNYr4%3D; loginmemento=1|c2274ba4a4ad76bbe89263d30695c182e9177b9c40a2691f3054987d66a944be; zjs_user_id=%22X1-ZU158jhpb2klds9_4wzn7%22; zgcus_lbut=; zgcus_aeut=189997416; zgcus_ludi=b44a961b-c7ef-11eb-a48f-96824e7eff50-18999; optimizelyEndUserId=oeu1623111792776r0.8778663892923859; _cs_c=1; WRUIDAWS=3326630244368428; visitor_id701843=248614376; visitor_id701843-hash=4be116fbd77089f953bfb6eaf5996ef92662a6ef7d237d3c49f154ffaf4eaa9295c64fb254b106bdff234e183c94498c01af2aab; __stripe_mid=80125db1-17d1-4fc5-ae37-86b12a68709cf3da6d; g_state={"i_p":1627697570928,"i_l":4}; zjs_anonymous_id=%22ca6368b9-7b92-4d51-ab67-c2be89065efd%22; _gac_UA-21174015-56=1.1626042638.Cj0KCQjwraqHBhDsARIsAKuGZeH8gi095UkXfohW-WWvyLosdmTdL8cfJwgAabYF9hS2XU6JlXqpWLcaAq5SEALw_wcB; _gcl_aw=GCL.1626042640.Cj0KCQjwraqHBhDsARIsAKuGZeH8gi095UkXfohW-WWvyLosdmTdL8cfJwgAabYF9hS2XU6JlXqpWLcaAq5SEALw_wcB; zgsession=1|1edd82e6-372a-4546-bc8b-c2bbadfd29b4; DoubleClickSession=true; fbc=fb.1.1626412984774.IwAR2QM6bzrTskAWN5Sk8UnmPlAxb1HRy1h1GRch888QqXfczHZZWb2vDZfIw; _fbc=fb.1.1626413249162.IwAR2QM6bzrTskAWN5Sk8UnmPlAxb1HRy1h1GRch888QqXfczHZZWb2vDZfIw; _csrf=lV2BBFim7Vy2gFTn--PUt0VA; _gaexp=GAX1.2.w27igyYtRQaAa8XQM3MjDw.18837.2!VDVoDKTnRcyv8f4FAcJ8PA.18915.2!Khnq27RoQmSe5DEusmh5xA.18913.3; _gid=GA1.2.705011419.1630004829; FSsampler=707279376; __CT_Data=gpv=26&ckp=tld&dm=zillow.com&apv_82_www33=26&cpv_82_www33=26&rpv_82_www33=13; '
        'OptanonConsent=isIABGlobal=false&datestamp=Fri+Aug+27+2021+12%3A39%3A52+GMT-0600+(Mountain+Daylight+Time)&version=5.11.0&landingPath=NotLandingPage&groups=1%3A1%2C3%3A1%2C4%3A1&AwaitingReconsent=false; _cs_id=41cbdc9c-bb0b-aad9-9521-b1328a65ff77.1623111795.22.1630089665.1630089591.1.1657275795752; utag_main=v_id:01796deff9e3001a59964343177e03079002907100838$_sn:41$_se:2$_ss:0$_st:1630255637884$dc_visit:38$ses_id:1630253822479%3Bexp-session$_pn:1%3Bexp-session$dcsyncran:1%3Bexp-session$tdsyncran:1%3Bexp-session$dc_event:2%3Bexp-session$dc_region:us-east-1%3Bexp-session$ttd_uuid:7b8796ca-44dd-45c9-97d9-bcb642d04cd1%3Bexp-session; JSESSIONID=6CB8C410E0FE216644E8C3A0D0851618; ZILLOW_SID=1|AAAAAVVbFRIBVVsVEklf443J474nftKzJe5PKLD80sujgHvySB7tGcqZunX3BDDH9VwceMqGMTPC54%2F0q4CH%2BfmwsC6P; KruxPixel=true; _derived_epik=dj0yJnU9ai1PSUp1eHZ2Y3J3d0c2NVU1N3BBOFlHbnRBOGFzT0smbj1vLWRISDFwdUNoblN5MjQ4cTVyN213Jm09MSZ0PUFBQUFBR0VzRjRVJnJtPTEmcnQ9QUFBQUFHRXNGNFU; KruxAddition=true; search=6|1632872450375%7Crect%3D40.241821806991595%252C-103.77545313688668%252C39.18758562803622%252C-106.02765040251168%26disp%3Dmap%26mdm%3Dauto%26type%3Dhouse%252Cmultifamily%252Ctownhouse%26fs%3D1%26fr%3D0%26mmm%3D1%26rs%3D0%26ah%3D0%09%0911093%09%09%09%09%09%09; _uetsid=d5e0465006a011ecbe3bd1a0f1c47d01; _uetvid=987e1c70c40a11ebaed8859af36f82fb; _px3=ba45c3df5d5d63d4d9780a102253cd60b21ab52b04778344e332e05474011c21:oCvapPXE6jD0rCXhSf4UjtEC2U956148EDyiWwRFOF8z5vwK63/hC8OWsk09O61g1spnZw64iXApZu1wOmKpyA==:1000:68UzJ5+ar5XwNm61bm41bhSHp8Zp1PfQQlL/5tcqdUIJ3RmA106//vvYGewCCwmln6acqbDAVKgqfB8Th05yX0Cw0TBW7dhfNdeNRjp9bxeLvKqZ56yuW+aVoYYp/zj6MNKv9c16vKlP771xSdCgUTvZ0CDmh7Ng55sHugOHt/jj+2Zmp2WLnuYR4rf7SEndqWBbAyQhhG4BKeyrZyEMpA==; AWSALB=3BIj2fUDeYgoAcLKaZdMkcyTzWSof62v91DQuCssJMyknlpZWcRcVnUU5Me29AcnFcjg1k9H2ehS6N0rSwxo4w8lmEvFCy6hgQfKm1HH8oVoWtpICS36NoLMMxmZ; AWSALBCORS=3BIj2fUDeYgoAcLKaZdMkcyTzWSof62v91DQuCssJMyknlpZWcRcVnUU5Me29AcnFcjg1k9H2ehS6N0rSwxo4w8lmEvFCy6hgQfKm1HH8oVoWtpICS36NoLMMxmZ'
    ),
    'referer': 'https://www.google.com/',
    'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
    'sec-ch-ua-mobile': '?1',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36',
}

# NOTE(review): `url` above already carries a searchQueryState query string
# (with different mapBounds); passing these params as well presumably sends a
# second searchQueryState value - confirm which one is intended.
params = {
    'searchQueryState': '{"mapBounds":{"west":-106.02765040251168,"east":-103.77545313688668,"south":39.18758562803622,"north":40.241821806991595},"isMapVisible":true,"filterState":{"sort":{"value":"globalrelevanceex"},"ah":{"value":true},"con":{"value":false},"apco":{"value":false},"land":{"value":false},"apa":{"value":false},"manu":{"value":false},"basf":{"value":true},"hoa":{"max":200},"sch":{"value":true},"schb":{"min":"7"},"schc":{"value":false},"schr":{"value":false},"schu":{"value":false}},"isListVisible":true,"mapZoom":9,"customRegionId":"fcac4612c1X1-CR9xde3hldsvpa_v24ah","pagination":{}}',
}
class ZillowScraper:
    """Fetch a Zillow search page and extract the JSON-LD data of its cards.

    The scraper requests ``url`` with the given ``headers`` and query
    ``params``, locates the ``<ul>`` that holds the property cards and decodes
    the first ``application/ld+json`` script found in each card.
    """

    # Seconds to wait for the server before giving up; without a timeout
    # requests.get() can block indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, headers, params):
        """Store the request settings used by :meth:`fetch`."""
        self.headers = headers
        self.url = url
        self.params = params

    def fetch(self):
        """Perform the GET request and return the ``requests`` response."""
        return requests.get(
            url=self.url,
            headers=self.headers,
            params=self.params,
            timeout=self.REQUEST_TIMEOUT,
        )

    @staticmethod
    def _load_card_json(raw):
        """Decode the text of one ``ld+json`` script into Python objects.

        The payload is JSON, not a Python literal, so it must go through
        ``json.loads``: ``ast.literal_eval`` rejects ``true``/``false``/``null``.

        Raises:
            json.JSONDecodeError: (a ``ValueError``) if ``raw`` is not valid JSON.
        """
        return json.loads(raw)

    def get_cards_info(self, deck_text):
        """Print and return the JSON-LD record of every card in the deck.

        Args:
            deck_text: the BeautifulSoup tag wrapping the property cards, or
                ``None`` when that element was not found in the page.

        Returns:
            A list with one decoded JSON-LD object per card that has an
            ``application/ld+json`` script; empty when there is no deck.
        """
        cards_info = []
        if deck_text is None:
            return cards_info

        # Walk only the direct child *tags*: `.contents` also yields text
        # nodes, which have no tag-style find().
        for card in deck_text.find_all(True, recursive=False):
            script = card.find('script', {'type': 'application/ld+json'})
            if not script or not script.contents:
                continue
            script_json = self._load_card_json(str(script.contents[0]))
            print(script_json)
            cards_info.append(script_json)
        return cards_info

    def parse(self, response_text):
        """Parse the page HTML and return the list of card records."""
        content = BeautifulSoup(response_text, features="html.parser")
        deck_text = content.find('ul', {'class': 'photo-cards photo-cards_wow photo-cards_short photo-cards_extra-attribution'})
        return self.get_cards_info(deck_text)

    def run(self):
        """Fetch the page, parse it and return the extracted card records."""
        response = self.fetch()
        return self.parse(response.text)
if __name__ == "__main__":
    # Script entry point: build the scraper from the module-level request
    # settings and execute a single fetch-and-parse cycle.
    scraper = ZillowScraper(url=url, headers=headers, params=params)
    scraper.run()
OUTPUT
{'@type': 'SingleFamilyResidence', '@context': 'http://schema.org', 'name': '11615 River Run Cir, Henderson, CO 80640', 'floorSize': {'@type': 'QuantitativeValue', '@context': 'http://schema.org', 'value': '2,001'}, 'address': {'@type': 'PostalAddress', '@context': 'http://schema.org', 'streetAddress': '11615 River Run Cir', 'addressLocality': 'Henderson', 'addressRegion': 'CO', 'postalCode': '80640'}, 'geo': {'@type': 'GeoCoordinates', '@context': 'http://schema.org', 'latitude': 39.908753, 'longitude': -104.851576}, 'url': 'https://www.zillow.com/homedetails/11615-River-Run-Cir-Henderson-CO-80640/49457209_zpid/'}
{'@type': 'SingleFamilyResidence', '@context': 'http://schema.org', 'name': '5089 Enid Way, Denver, CO 80239', 'floorSize': {'@type': 'QuantitativeValue', '@context': 'http://schema.org', 'value': '1,852'}, 'address': {'@type': 'PostalAddress', '@context': 'http://schema.org', 'streetAddress': '5089 Enid Way', 'addressLocality': 'Denver', 'addressRegion': 'CO', 'postalCode': '80239'}, 'geo': {'@type': 'GeoCoordinates', '@context': 'http://schema.org', 'latitude': 39.784449, 'longitude': -104.815903}, 'url': 'https://www.zillow.com/homedetails/5089-Enid-Way-Denver-CO-80239/13271929_zpid/'}
{'@type': 'SingleFamilyResidence', '@context': 'http://schema.org', 'name': '6088 S Pierson Ct, Littleton, CO 80127', 'floorSize': {'@type': 'QuantitativeValue', '@context': 'http://schema.org', 'value': '1,810'}, 'address': {'@type': 'PostalAddress', '@context': 'http://schema.org', 'streetAddress': '6088 S Pierson Ct', 'addressLocality': 'Littleton', 'addressRegion': 'CO', 'postalCode': '80127'}, 'geo': {'@type': 'GeoCoordinates', '@context': 'http://schema.org', 'latitude': 39.605764, 'longitude': -105.123466}, 'url': 'https://www.zillow.com/homedetails/6088-S-Pierson-Ct-Littleton-CO-80127/13818492_zpid/'}
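The full result list is stored as JSON inside a `<script>` element in the page (the one carrying the `data-zrr-shared-data-key` attribute); presumably only the first few cards also get their own `ld+json` markup in the initial HTML, which is why the original code finds just 3.
To parse all of the results, you can use the following example: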
import json
import requests
from bs4 import BeautifulSoup
# Search-results page to scrape; the query string carries the search filters.
url = "https://www.zillow.com/homes/for_sale/house,multifamily,townhouse_type/?searchQueryState={%22pagination%22%3A{}%2C%22mapBounds%22%3A{%22west%22%3A-106.97384791227731%2C%22east%22%3A-102.82925562712106%2C%22south%22%3A39.18758562803622%2C%22north%22%3A40.241821806991595}%2C%22customRegionId%22%3A%22fcac4612c1X1-CR9xde3hldsvpa_v24ah%22%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A{%22hoa%22%3A{%22max%22%3A200}%2C%22con%22%3A{%22value%22%3Afalse}%2C%22apa%22%3A{%22value%22%3Afalse}%2C%22sch%22%3A{%22value%22%3Atrue}%2C%22ah%22%3A{%22value%22%3Atrue}%2C%22sort%22%3A{%22value%22%3A%22globalrelevanceex%22}%2C%22land%22%3A{%22value%22%3Afalse}%2C%22schu%22%3A{%22value%22%3Afalse}%2C%22manu%22%3A{%22value%22%3Afalse}%2C%22schr%22%3A{%22value%22%3Afalse}%2C%22apco%22%3A{%22value%22%3Afalse}%2C%22basf%22%3A{%22value%22%3Atrue}%2C%22schc%22%3A{%22value%22%3Afalse}%2C%22schb%22%3A{%22min%22%3A%227%22}}%2C%22isListVisible%22%3Atrue}"

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"
}

# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(response.content, "html.parser")

# The search results are embedded in this <script> element as JSON text.
script_tag = soup.select_one("script[data-zrr-shared-data-key]")
if script_tag is None or not script_tag.contents:
    # select_one() returns None when nothing matches; fail with a clear
    # message instead of an AttributeError/IndexError further down.
    raise SystemExit(
        "Could not find the embedded search data "
        "(script[data-zrr-shared-data-key]) in the response."
    )

# strip() drops any leading/trailing '!', '<', '>' and '-' characters -
# presumably an HTML comment wrapper (<!-- ... -->) around the JSON payload.
data = json.loads(script_tag.contents[0].strip("!<>-"))

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

# One line per listing: status, address and price in fixed-width columns.
for result in data["cat1"]["searchResults"]["listResults"]:
    print(
        "{:<15} {:<50} {:<15}".format(
            result["statusText"], result["address"], result["price"]
        )
    )
Prints:
House for sale 6092 S Marshall Dr, Littleton, CO 80123 $680,000
House for sale 3050 S Roslyn St, Denver, CO 80231 $774,900
House for sale 15538 Greenstone Cir, Parker, CO 80134 $590,000
House for sale 7141 Fenton Cir, Arvada, CO 80003 $549,500
House for sale 7823 S Logan Dr, Littleton, CO 80122 $665,000
House for sale 1825 Clermont St, Denver, CO 80220 $599,900
House for sale 408 S Locust St, Denver, CO 80224 $550,000
House for sale 8660 De Soto St, Denver, CO 80229 $450,000
House for sale 1811 S Humboldt St, Denver, CO 80210 $675,000
House for sale 7329 E Easter Ave, Centennial, CO 80112 $699,900
House for sale 13638 W Montana Pl, Lakewood, CO 80228 $600,000
House for sale 8296 E Hinsdale Dr, Centennial, CO 80112 $699,900
House for sale 10325 Ravenswood Ln, Highlands Ranch, CO 80130 $660,000
House for sale 2833 E 90th Pl, Denver, CO 80229 $445,000
House for sale 5756 W 8th Ave, Lakewood, CO 80214 $600,000
House for sale 6088 S Pierson Ct, Littleton, CO 80127 $509,000
House for sale 2829 S Lowell Blvd, Denver, CO 80236 $475,000
House for sale 604 Eldridge St, Golden, CO 80401 $650,000
House for sale 7171 McIntyre Ct, Arvada, CO 80007 $850,000
House for sale 1301 S Blackhawk Way, Aurora, CO 80012 $500,000
House for sale 215 S Julian St, Denver, CO 80219 $350,000
House for sale 7095 E 67th Ave, Commerce City, CO 80022 $440,000
House for sale 8248 S Yukon St, Littleton, CO 80128 $695,000
House for sale 2846 S Macon Ct, Aurora, CO 80014 $520,000
House for sale 9340 Burgundy Cir, Littleton, CO 80126 $799,000
House for sale 2072 S Cathay Way, Aurora, CO 80013 $560,000
House for sale 1317 W 85th Ave, Federal Heights, CO 80260 $405,000
House for sale 6701 Eagle Shadow Ave, Brighton, CO 80602 $1,145,000
House for sale 2900 Webster St, Wheat Ridge, CO 80033 $660,000
House for sale 3943 S Allison Ct, Lakewood, CO 80235 $799,950
House for sale 511 E Irwin Ave, Littleton, CO 80122 $624,500
House for sale 4700 E Montana Pl, Denver, CO 80222 $600,000
House for sale 2344 S Gray Dr, Lakewood, CO 80227 $585,000
House for sale 5546 E 130th Dr, Thornton, CO 80241 $490,000
House for sale 2270 S Joyce St, Lakewood, CO 80228 $1,340,000
House for sale 12171 W Dakota Dr, Lakewood, CO 80228 $600,000
House for sale 6641 Miller St, Arvada, CO 80004 $625,000
House for sale 3220 W Nevada Pl, Denver, CO 80219 $510,000
House for sale 8630 W 64th Pl, Arvada, CO 80004 $447,000
House for sale 5890 Wood Sorrel Dr, Littleton, CO 80123 $975,000