I'm scraping real estate data from Bayut using Scrapy, but I can't extract the green tick (DLD-validated info).
I found the API request in the network tab and replicated it exactly, but still get a 401 error. Could the website be using additional security measures like session-based authentication or IP restrictions?
What I Tried:
How can I access this data efficiently? Any insights would be appreciated.
Attaching the code for requesting the POST API:
import requests
import base64
# Elasticsearch multi-search endpoint, as observed in the browser's network tab.
url = "https://fenix-data-es2.bayut.com/_msearch"

# HTTP Basic auth expects "user:password" encoded as Base64.
auth_string = "bayut_read_user_es2:10yNmg5+6K"
auth_encoded = base64.b64encode(auth_string.encode()).decode()

# Request headers copied from the browser request.
# "accept-encoding" is intentionally NOT hard-coded: requests advertises only
# the encodings it can actually decode, whereas forcing "br"/"zstd" can yield
# a body that cannot be decompressed when those optional codecs are missing.
headers = {
    "Authorization": f"Basic {auth_encoded}",
    "accept": "*/*",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "no-cache",
    "content-type": "application/x-ndjson",
    "origin": "https://www.bayut.com",
    "pragma": "no-cache",
    "priority": "u=1, i",
    "referer": "https://www.bayut.com/",
    "sec-ch-ua": "\"Not(A:Brand\";v=\"99\", \"Google Chrome\";v=\"133\", \"Chromium\";v=\"133\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-site",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
}

# Query-string parameter restricting which parts of the response are returned.
params = {
    "filter_path": "took,*.took,*.suggest.*.options.text,*.suggest.*.options._source.*,*.hits.total.*,*.hits.hits._source.*,*.hits.hits._score,*.hits.hits.highlight.*,*.error,*.aggregations.*.buckets.key,*.aggregations.*.buckets.doc_count,*.aggregations.*.buckets.complex_value.hits.hits._source,*.aggregations.*.filtered_agg.facet.buckets.key,*.aggregations.*.filtered_agg.facet.buckets.doc_count,*.aggregations.*.filtered_agg.facet.buckets.complex_value.hits.hits._source"
}

# NDJSON body: one header line naming the index, one line with the search,
# each terminated by a newline (the trailing newline is required).
post_data = """{"index":"dld_matched_property_details_prod_alias"}
{"from":0,"size":5,"track_total_hits":10000,"query":{"bool":{"must":[{"term":{"external_id":"10228377"}}]}}}
"""

# Always pass a timeout so a stalled connection cannot hang the script forever.
# NOTE(review): no cookies are sent here; the accompanying answer states that an
# "hb-session-id" cookie is also required, so a 401 is expected without it.
response = requests.post(
    url,
    headers=headers,
    params=params,
    data=post_data,
    timeout=30,
)

# Report the outcome: parsed JSON on success, raw body otherwise.
if response.status_code == 200:
    print("Request Successful!")
    print(response.json())  # Print the response in JSON format
else:
    print(f"Request failed with status code: {response.status_code}")
    print(response.text)  # Print the error message if any
You need the `hb-session-id` cookie. You can get it from the `/.humbucker/challenge/js/validate`
POST request, which requires an `x-hb-co` header and the correct POST data
(the fingerprint values, in a specific order).
Here's how to do all of that:
import requests
import re
# Static browser-fingerprint values, keyed by fingerprint name.
# get_postdata_and_secret() looks each name found in the challenge script up in
# this mapping (falling back to '' for unknown names) to build the POST payload,
# so the top-level keys must match the names the challenge script references.
# NOTE(review): the concrete values look like a capture from a desktop Chrome
# session on Windows - confirm they are still accepted before relying on them.
fingerprints = {
"screenProperties": {
"window": {
"innerHeight": 1080,
"outerHeight": 1080,
"innerWidth": 1920,
"outerWidth": 1920,
"screenX": 0,
"screenY": 0,
"pageXOffset": 0,
"pageYOffset": 0,
# NOTE(review): key is spelled "devicePixelRation" (not "...Ratio");
# presumably this mirrors what the server expects - verify before renaming.
"devicePixelRation": 2,
},
"client": {"width": 1920, "height": 1080},
"screen": {
"width": 1920,
"height": 1080,
"availWidth": 1920,
"availHeight": 1080,
"colorDepth": 24,
"pixelDepth": 24,
},
},
"screenDesc": "function get width() { [native code] }",
"headlessProperties": {
"__nightmare": False,
"callPhantom": False,
"_phantom": False,
"phantom": False,
"webdriver": False,
"_Selenium_IDE_Recorder": False,
"callSelenium": False,
"_selenium": False,
"__webdriver_script_fn": False,
"__driver_evaluate": False,
"__webdriver_evaluate": False,
"__selenium_evaluate": False,
"__fxdriver_evaluate": False,
"__driver_unwrapped": False,
"__webdriver_unwrapped": False,
"__selenium_unwrapped": False,
"__fxdriver_unwrapped": False,
"__webdriver_script_func": False,
"documentSelenium": False,
"documentWebdriver": False,
"documentDriver": False,
},
"audioCodecs": {
"ogg": "probably",
"mp3": "probably",
"wav": "probably",
"m4a": "maybe",
"aac": "probably",
},
"notificationPermissions": {"name": "notifications", "state": "prompt"},
"videoCodecs": {"ogg": "", "h264": "probably", "webm": "probably"},
"userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
"touchScreen": {
"maxTouchPoints": 0,
"onTouchStart": False,
"canHandleTouchEvents": False,
},
"webdriver": False,
"multimediaDevices": [
{"deviceId": "", "kind": "audiooutput", "label": "", "groupId": ""}
],
"platform": "Win32",
"mimeTypes": [],
"navigatorPrototype": {
"vendorSub": "function get vendorSub() { [native code] }",
"productSub": "function get productSub() { [native code] }",
"vendor": "function get vendor() { [native code] }",
"maxTouchPoints": "function get maxTouchPoints() { [native code] }",
"scheduling": "function get scheduling() { [native code] }",
"userActivation": "function get userActivation() { [native code] }",
"doNotTrack": "function get doNotTrack() { [native code] }",
"geolocation": "function get geolocation() { [native code] }",
"connection": "function get connection() { [native code] }",
"plugins": "function get plugins() { [native code] }",
"mimeTypes": "function get mimeTypes() { [native code] }",
"pdfViewerEnabled": "function get pdfViewerEnabled() { [native code] }",
"webkitTemporaryStorage": "function get webkitTemporaryStorage() { [native code] }",
"webkitPersistentStorage": "function get webkitPersistentStorage() { [native code] }",
"windowControlsOverlay": "function get windowControlsOverlay() { [native code] }",
"hardwareConcurrency": "function get hardwareConcurrency() { [native code] }",
"cookieEnabled": "function get cookieEnabled() { [native code] }",
"appCodeName": "function get appCodeName() { [native code] }",
"appName": "function get appName() { [native code] }",
"appVersion": "function get appVersion() { [native code] }",
"platform": "function get platform() { [native code] }",
"product": "function get product() { [native code] }",
"userAgent": "function get userAgent() { [native code] }",
"language": "function get language() { [native code] }",
"languages": "function get languages() { [native code] }",
"onLine": "function get onLine() { [native code] }",
"webdriver": "function get webdriver() { [native code] }",
"getGamepads": "function getGamepads() { [native code] }",
"javaEnabled": "function javaEnabled() { [native code] }",
"sendBeacon": "function sendBeacon() { [native code] }",
"vibrate": "function vibrate() { [native code] }",
"constructor": "function Navigator() { [native code] }",
"deprecatedRunAdAuctionEnforcesKAnonymity": "function get deprecatedRunAdAuctionEnforcesKAnonymity() { [native code] }",
"protectedAudience": "function get protectedAudience() { [native code] }",
"bluetooth": "function get bluetooth() { [native code] }",
"storageBuckets": "function get storageBuckets() { [native code] }",
"clipboard": "function get clipboard() { [native code] }",
"credentials": "function get credentials() { [native code] }",
"keyboard": "function get keyboard() { [native code] }",
"managed": "function get managed() { [native code] }",
"mediaDevices": "function get mediaDevices() { [native code] }",
"storage": "function get storage() { [native code] }",
"serviceWorker": "function get serviceWorker() { [native code] }",
"virtualKeyboard": "function get virtualKeyboard() { [native code] }",
"wakeLock": "function get wakeLock() { [native code] }",
"deviceMemory": "function get deviceMemory() { [native code] }",
"userAgentData": "function get userAgentData() { [native code] }",
"login": "function get login() { [native code] }",
"ink": "function get ink() { [native code] }",
"mediaCapabilities": "function get mediaCapabilities() { [native code] }",
"devicePosture": "function get devicePosture() { [native code] }",
"hid": "function get hid() { [native code] }",
"locks": "function get locks() { [native code] }",
"gpu": "function get gpu() { [native code] }",
"mediaSession": "function get mediaSession() { [native code] }",
"permissions": "function get permissions() { [native code] }",
"presentation": "function get presentation() { [native code] }",
"usb": "function get usb() { [native code] }",
"xr": "function get xr() { [native code] }",
"serial": "function get serial() { [native code] }",
"adAuctionComponents": "function adAuctionComponents() { [native code] }",
"runAdAuction": "function runAdAuction() { [native code] }",
"canLoadAdAuctionFencedFrame": "function canLoadAdAuctionFencedFrame() { [native code] }",
"canShare": "function canShare() { [native code] }",
"share": "function share() { [native code] }",
"clearAppBadge": "function clearAppBadge() { [native code] }",
"getBattery": "function getBattery() { [native code] }",
"getUserMedia": "function getUserMedia() { [native code] }",
"requestMIDIAccess": "function requestMIDIAccess() { [native code] }",
"requestMediaKeySystemAccess": "function requestMediaKeySystemAccess() { [native code] }",
"setAppBadge": "function setAppBadge() { [native code] }",
"webkitGetUserMedia": "function webkitGetUserMedia() { [native code] }",
"clearOriginJoinedAdInterestGroups": "function clearOriginJoinedAdInterestGroups() { [native code] }",
"createAuctionNonce": "function createAuctionNonce() { [native code] }",
"joinAdInterestGroup": "function joinAdInterestGroup() { [native code] }",
"leaveAdInterestGroup": "function leaveAdInterestGroup() { [native code] }",
"updateAdInterestGroups": "function updateAdInterestGroups() { [native code] }",
"deprecatedReplaceInURN": "function deprecatedReplaceInURN() { [native code] }",
"deprecatedURNToURL": "function deprecatedURNToURL() { [native code] }",
"getInstalledRelatedApps": "function getInstalledRelatedApps() { [native code] }",
"getInterestGroupAdAuctionData": "function getInterestGroupAdAuctionData() { [native code] }",
"registerProtocolHandler": "function registerProtocolHandler() { [native code] }",
"unregisterProtocolHandler": "function unregisterProtocolHandler() { [native code] }",
},
"videoCard": {
"vendor": "NVIDIA",
"renderer": "ANGLE",
},
"languages": ["en-US"],
"evalToString": 33,
"deviceMemory": 8,
"canvasFingerprint": "1078009449",
}
def get_postdata_and_secret():
    """Fetch the challenge script and derive the validation payload and secret.

    The script is scanned for ``Fingerprint.<name>()`` calls; the matching
    entries of the module-level ``fingerprints`` mapping are collected in the
    order the calls appear (unknown names map to ``''``).

    Returns:
        tuple: ``(post_data, secret)`` where ``post_data`` is the ordered list
        of fingerprint values and ``secret`` is the string captured from the
        script's ``secret: "..."`` entry.

    Raises:
        requests.HTTPError: If the script request returns an error status.
        ValueError: If the script contains no ``secret: "..."`` entry.
    """
    response = requests.get(
        'https://www.bayut.com/.humbucker/challenge/js/generate/script',
        timeout=30,  # never block forever on a stalled connection
    )
    # Fail here rather than trying to parse an error page as the script.
    response.raise_for_status()
    script = response.text

    functions = re.findall(r'Fingerprint\.(\w+)\(\)', script)

    secret_match = re.search(r'secret: \"(.+)\",', script)
    if secret_match is None:
        # Without this guard the failure would be an opaque AttributeError
        # on ``None.group``.
        raise ValueError('secret not found in challenge script')

    # Order matters: the payload must follow the order used by the script.
    post_data = [fingerprints.get(func, '') for func in functions]
    return post_data, secret_match.group(1)
def get_session_cookie():
    """Submit the fingerprint payload and return the ``hb-session-id`` cookie.

    The payload and secret come from ``get_postdata_and_secret()``; the secret
    is sent in the ``x-hb-co`` header and the payload as the JSON body.

    Returns:
        str: Value of the ``hb-session-id`` cookie set by the validate endpoint.

    Raises:
        requests.HTTPError: If the validate request returns an error status.
        RuntimeError: If the response does not set an ``hb-session-id`` cookie.
    """
    post_data, secret = get_postdata_and_secret()
    headers = {'x-hb-co': secret}
    url = 'https://www.bayut.com/.humbucker/challenge/js/validate'

    response = requests.post(url, headers=headers, json=post_data, timeout=30)
    response.raise_for_status()

    session_id = response.cookies.get('hb-session-id')
    if session_id is None:
        # Returning None would only surface later as a confusing failure of
        # the follow-up request, so report the missing cookie here.
        raise RuntimeError("validate response did not set 'hb-session-id'")
    return session_id
def get_validation_data(external_id="9060114", timeout=30):
    """Query the ``_msearch`` endpoint for one listing's matched details.

    Args:
        external_id: Listing identifier used in the ``external_id`` term
            filter. Defaults to the id the original script hard-coded.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The decoded JSON body of the ``_msearch`` response.

    Raises:
        requests.HTTPError: If the endpoint returns an error status (for
            example 401 when the session cookie is not accepted).
    """
    import json  # local import: keeps the function self-contained

    cookies = {
        'hb-session-id': get_session_cookie()
    }
    headers = {
        'authorization': 'Basic YmF5dXRfcmVhZF91c2VyX2VzMjoxMHlObWc1KzZL',
        'content-type': 'application/x-ndjson',
    }
    params = {
        "filter_path": "took,*.took,*.suggest.*.options.text,*.suggest.*.options._source.*,*.hits.total.*,*.hits.hits._source.*,*.hits.hits._score,*.hits.hits.highlight.*,*.error,*.aggregations.*.buckets.key,*.aggregations.*.buckets.doc_count,*.aggregations.*.buckets.complex_value.hits.hits._source,*.aggregations.*.filtered_agg.facet.buckets.key,*.aggregations.*.filtered_agg.facet.buckets.doc_count,*.aggregations.*.filtered_agg.facet.buckets.complex_value.hits.hits._source"
    }

    # NDJSON body: a header line naming the index, then the search itself.
    # Compact separators reproduce the exact wire format of the original
    # literal; json.dumps also escapes the id safely.
    search_header = {"index": "dld_matched_property_details_prod_alias"}
    search_body = {
        "from": 0,
        "size": 5,
        "track_total_hits": 10000,
        "query": {
            "bool": {"must": [{"term": {"external_id": str(external_id)}}]}
        },
    }
    # Every NDJSON line, including the last, must end with a newline.
    data = "".join(
        json.dumps(line, separators=(",", ":")) + "\n"
        for line in (search_header, search_body)
    )

    url = 'https://fenix-data-es2.bayut.com/_msearch'
    response = requests.post(
        url,
        params=params,
        headers=headers,
        cookies=cookies,
        data=data,
        timeout=timeout,
    )
    # Surface HTTP-level failures instead of returning an error payload
    # as if it were search results.
    response.raise_for_status()
    return response.json()
# Script entry point: fetch the validation data and print the decoded JSON.
print(get_validation_data())