Search code examples
python, web-scraping, post, postman, basic-authentication

How to Scrape Data from Bayut (DLD-Validated Properties) Without Getting 401 Error?


I'm scraping real estate data from Bayut using Scrapy but can't extract the green tick (DLD-validated info).

  • The information is fetched via a POST API with basic authentication.
  • Calling the API in Postman/Python with the same headers, payload, and params returns 401 Unauthorized. [Image: Authenticated POST API]
  • Using Selenium works but is too slow for large-scale scraping (~210K properties/week).

I found the API request in the network tab and replicated it exactly, but still get a 401 error. Could the website be using additional security measures like session-based authentication or IP restrictions?

What I Tried:

  • Scrapy (fails to fetch validation info).
  • Postman & Python requests (401 error).
  • Selenium (works but too slow).

How can I access this data efficiently? Any insights would be appreciated.

Attaching the code for requesting the POST API:

import requests
import base64

# Reproduction script: replays the `_msearch` POST request seen in the browser's
# network tab using `requests`, with HTTP Basic auth and the browser's headers.

# Define the URL
url = "https://fenix-data-es2.bayut.com/_msearch"

# Encode credentials manually (decoded: "bayut_read_user_es2:10yNmg5+6K")
auth_string = "bayut_read_user_es2:10yNmg5+6K"
auth_encoded = base64.b64encode(auth_string.encode()).decode()  # Convert to Base64

# Headers with Authorization
# Apart from `Authorization`, these mirror the headers the browser sent
# (client hints, fetch metadata, origin/referer, user agent).
headers = {
    "Authorization": f"Basic {auth_encoded}",
    "accept": "*/*",
    "accept-encoding": "gzip, deflate, br, zstd",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "no-cache",
    "content-type": "application/x-ndjson",
    "origin": "https://www.bayut.com",
    "pragma": "no-cache",
    "priority": "u=1, i",
    "referer": "https://www.bayut.com/",
    "sec-ch-ua": "\"Not(A:Brand\";v=\"99\", \"Google Chrome\";v=\"133\", \"Chromium\";v=\"133\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-site",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
}

# Query parameters (filter_path)
# `filter_path` is sent as a URL query parameter, not in the request body.
params = {
    "filter_path": "took,*.took,*.suggest.*.options.text,*.suggest.*.options._source.*,*.hits.total.*,*.hits.hits._source.*,*.hits.hits._score,*.hits.hits.highlight.*,*.error,*.aggregations.*.buckets.key,*.aggregations.*.buckets.doc_count,*.aggregations.*.buckets.complex_value.hits.hits._source,*.aggregations.*.filtered_agg.facet.buckets.key,*.aggregations.*.filtered_agg.facet.buckets.doc_count,*.aggregations.*.filtered_agg.facet.buckets.complex_value.hits.hits._source"
}

# POST data (formatted in NDJSON format)
# Two newline-terminated JSON lines: the first names the index, the second is
# the query (a `term` match on a single `external_id`).
post_data = """{"index":"dld_matched_property_details_prod_alias"}
{"from":0,"size":5,"track_total_hits":10000,"query":{"bool":{"must":[{"term":{"external_id":"10228377"}}]}}}
"""

# Sending the POST request
# NOTE: no cookies are passed here; only the headers, params and body above.
response = requests.post(url, headers=headers, params=params, data=post_data)

# Check if the request was successful
if response.status_code == 200:
    print("Request Successful!")
    print(response.json())  # Print the response in JSON format
else:
    print(f"Request failed with status code: {response.status_code}")
    print(response.text)  # Print the error message if any
 

Solution

  • You need the hb-session-id cookie. You can get it from a POST request to /.humbucker/challenge/js/validate, which requires an x-hb-co header and the correct POST data (the fingerprints, in a specific order).

    Here's how to do all of that:

    import requests
    import re
    
    # Canned browser fingerprint values used to answer the anti-bot challenge.
    # Each top-level key is looked up by the name captured from a
    # `Fingerprint.<name>()` call in the challenge script (see
    # get_postdata_and_secret); names with no entry here are sent as ''.
    # NOTE(review): the values appear to be captured from a desktop Chrome on
    # Windows session — confirm before changing any of them.
    # NOTE(review): odd spellings such as "devicePixelRation" are left as-is;
    # presumably they mirror what the challenge expects — verify before "fixing".
    fingerprints = {
        "screenProperties": {
            "window": {
                "innerHeight": 1080,
                "outerHeight": 1080,
                "innerWidth": 1920,
                "outerWidth": 1920,
                "screenX": 0,
                "screenY": 0,
                "pageXOffset": 0,
                "pageYOffset": 0,
                "devicePixelRation": 2,
            },
            "client": {"width": 1920, "height": 1080},
            "screen": {
                "width": 1920,
                "height": 1080,
                "availWidth": 1920,
                "availHeight": 1080,
                "colorDepth": 24,
                "pixelDepth": 24,
            },
        },
        "screenDesc": "function get width() { [native code] }",
        # Automation-tool markers; all reported as absent.
        "headlessProperties": {
            "__nightmare": False,
            "callPhantom": False,
            "_phantom": False,
            "phantom": False,
            "webdriver": False,
            "_Selenium_IDE_Recorder": False,
            "callSelenium": False,
            "_selenium": False,
            "__webdriver_script_fn": False,
            "__driver_evaluate": False,
            "__webdriver_evaluate": False,
            "__selenium_evaluate": False,
            "__fxdriver_evaluate": False,
            "__driver_unwrapped": False,
            "__webdriver_unwrapped": False,
            "__selenium_unwrapped": False,
            "__fxdriver_unwrapped": False,
            "__webdriver_script_func": False,
            "documentSelenium": False,
            "documentWebdriver": False,
            "documentDriver": False,
        },
        "audioCodecs": {
            "ogg": "probably",
            "mp3": "probably",
            "wav": "probably",
            "m4a": "maybe",
            "aac": "probably",
        },
        "notificationPermissions": {"name": "notifications", "state": "prompt"},
        "videoCodecs": {"ogg": "", "h264": "probably", "webm": "probably"},
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
        "touchScreen": {
            "maxTouchPoints": 0,
            "onTouchStart": False,
            "canHandleTouchEvents": False,
        },
        "webdriver": False,
        "multimediaDevices": [
            {"deviceId": "", "kind": "audiooutput", "label": "", "groupId": ""}
        ],
        "platform": "Win32",
        "mimeTypes": [],
        # Stringified forms of the Navigator prototype's getters and methods.
        "navigatorPrototype": {
            "vendorSub": "function get vendorSub() { [native code] }",
            "productSub": "function get productSub() { [native code] }",
            "vendor": "function get vendor() { [native code] }",
            "maxTouchPoints": "function get maxTouchPoints() { [native code] }",
            "scheduling": "function get scheduling() { [native code] }",
            "userActivation": "function get userActivation() { [native code] }",
            "doNotTrack": "function get doNotTrack() { [native code] }",
            "geolocation": "function get geolocation() { [native code] }",
            "connection": "function get connection() { [native code] }",
            "plugins": "function get plugins() { [native code] }",
            "mimeTypes": "function get mimeTypes() { [native code] }",
            "pdfViewerEnabled": "function get pdfViewerEnabled() { [native code] }",
            "webkitTemporaryStorage": "function get webkitTemporaryStorage() { [native code] }",
            "webkitPersistentStorage": "function get webkitPersistentStorage() { [native code] }",
            "windowControlsOverlay": "function get windowControlsOverlay() { [native code] }",
            "hardwareConcurrency": "function get hardwareConcurrency() { [native code] }",
            "cookieEnabled": "function get cookieEnabled() { [native code] }",
            "appCodeName": "function get appCodeName() { [native code] }",
            "appName": "function get appName() { [native code] }",
            "appVersion": "function get appVersion() { [native code] }",
            "platform": "function get platform() { [native code] }",
            "product": "function get product() { [native code] }",
            "userAgent": "function get userAgent() { [native code] }",
            "language": "function get language() { [native code] }",
            "languages": "function get languages() { [native code] }",
            "onLine": "function get onLine() { [native code] }",
            "webdriver": "function get webdriver() { [native code] }",
            "getGamepads": "function getGamepads() { [native code] }",
            "javaEnabled": "function javaEnabled() { [native code] }",
            "sendBeacon": "function sendBeacon() { [native code] }",
            "vibrate": "function vibrate() { [native code] }",
            "constructor": "function Navigator() { [native code] }",
            "deprecatedRunAdAuctionEnforcesKAnonymity": "function get deprecatedRunAdAuctionEnforcesKAnonymity() { [native code] }",
            "protectedAudience": "function get protectedAudience() { [native code] }",
            "bluetooth": "function get bluetooth() { [native code] }",
            "storageBuckets": "function get storageBuckets() { [native code] }",
            "clipboard": "function get clipboard() { [native code] }",
            "credentials": "function get credentials() { [native code] }",
            "keyboard": "function get keyboard() { [native code] }",
            "managed": "function get managed() { [native code] }",
            "mediaDevices": "function get mediaDevices() { [native code] }",
            "storage": "function get storage() { [native code] }",
            "serviceWorker": "function get serviceWorker() { [native code] }",
            "virtualKeyboard": "function get virtualKeyboard() { [native code] }",
            "wakeLock": "function get wakeLock() { [native code] }",
            "deviceMemory": "function get deviceMemory() { [native code] }",
            "userAgentData": "function get userAgentData() { [native code] }",
            "login": "function get login() { [native code] }",
            "ink": "function get ink() { [native code] }",
            "mediaCapabilities": "function get mediaCapabilities() { [native code] }",
            "devicePosture": "function get devicePosture() { [native code] }",
            "hid": "function get hid() { [native code] }",
            "locks": "function get locks() { [native code] }",
            "gpu": "function get gpu() { [native code] }",
            "mediaSession": "function get mediaSession() { [native code] }",
            "permissions": "function get permissions() { [native code] }",
            "presentation": "function get presentation() { [native code] }",
            "usb": "function get usb() { [native code] }",
            "xr": "function get xr() { [native code] }",
            "serial": "function get serial() { [native code] }",
            "adAuctionComponents": "function adAuctionComponents() { [native code] }",
            "runAdAuction": "function runAdAuction() { [native code] }",
            "canLoadAdAuctionFencedFrame": "function canLoadAdAuctionFencedFrame() { [native code] }",
            "canShare": "function canShare() { [native code] }",
            "share": "function share() { [native code] }",
            "clearAppBadge": "function clearAppBadge() { [native code] }",
            "getBattery": "function getBattery() { [native code] }",
            "getUserMedia": "function getUserMedia() { [native code] }",
            "requestMIDIAccess": "function requestMIDIAccess() { [native code] }",
            "requestMediaKeySystemAccess": "function requestMediaKeySystemAccess() { [native code] }",
            "setAppBadge": "function setAppBadge() { [native code] }",
            "webkitGetUserMedia": "function webkitGetUserMedia() { [native code] }",
            "clearOriginJoinedAdInterestGroups": "function clearOriginJoinedAdInterestGroups() { [native code] }",
            "createAuctionNonce": "function createAuctionNonce() { [native code] }",
            "joinAdInterestGroup": "function joinAdInterestGroup() { [native code] }",
            "leaveAdInterestGroup": "function leaveAdInterestGroup() { [native code] }",
            "updateAdInterestGroups": "function updateAdInterestGroups() { [native code] }",
            "deprecatedReplaceInURN": "function deprecatedReplaceInURN() { [native code] }",
            "deprecatedURNToURL": "function deprecatedURNToURL() { [native code] }",
            "getInstalledRelatedApps": "function getInstalledRelatedApps() { [native code] }",
            "getInterestGroupAdAuctionData": "function getInterestGroupAdAuctionData() { [native code] }",
            "registerProtocolHandler": "function registerProtocolHandler() { [native code] }",
            "unregisterProtocolHandler": "function unregisterProtocolHandler() { [native code] }",
        },
        "videoCard": {
            "vendor": "NVIDIA",
            "renderer": "ANGLE",
        },
        "languages": ["en-US"],
        "evalToString": 33,
        "deviceMemory": 8,
        "canvasFingerprint": "1078009449",
    }
    
    def get_postdata_and_secret(timeout=30):
        """Fetch the challenge script and derive the validation payload and secret.

        Downloads the challenge script, collects every ``Fingerprint.<name>()``
        call in the order it appears, and maps each name to its canned value in
        the module-level ``fingerprints`` dict (``''`` when the name is unknown).
        The order matters: the payload is a positional list, not a mapping.

        Args:
            timeout: Seconds to wait for the HTTP request before giving up.

        Returns:
            A ``(post_data, secret)`` tuple: the ordered list of fingerprint
            values and the ``secret`` string extracted from the script.

        Raises:
            ValueError: If no ``secret`` could be found in the response body.
            requests.RequestException: On connection failure or timeout.
        """
        response = requests.get(
            'https://www.bayut.com/.humbucker/challenge/js/generate/script',
            timeout=timeout,  # without a timeout, requests may block forever
        )
        script = response.text

        functions = re.findall(r'Fingerprint\.(\w+)\(\)', script)

        secret_match = re.search(r'secret: \"(.+)\",', script)
        if secret_match is None:
            # Fail with a clear message instead of AttributeError on None.group.
            raise ValueError(
                'Could not find the challenge secret in the script response '
                f'(HTTP {response.status_code})'
            )

        post_data = [fingerprints.get(func, '') for func in functions]

        return post_data, secret_match.group(1)
    
    
    def get_session_cookie(timeout=30):
        """Solve the challenge and return the ``hb-session-id`` cookie value.

        Posts the ordered fingerprint list as JSON to the validate endpoint,
        passing the script's secret in the ``x-hb-co`` header.

        Args:
            timeout: Seconds to wait for the validate request before giving up.

        Returns:
            The ``hb-session-id`` cookie value from the response, or ``None``
            if the response did not set that cookie.

        Raises:
            requests.RequestException: On connection failure or timeout.
        """
        post_data, secret = get_postdata_and_secret()

        response = requests.post(
            'https://www.bayut.com/.humbucker/challenge/js/validate',
            headers={'x-hb-co': secret},
            json=post_data,
            timeout=timeout,  # without a timeout, requests may block forever
        )

        return response.cookies.get('hb-session-id')
    
    
    def get_validation_data(external_id="9060114", timeout=30):
        """Query the DLD-matched property index for one listing.

        Obtains a fresh ``hb-session-id`` cookie, then sends a two-line NDJSON
        ``_msearch`` request (index header line + query line) with HTTP Basic
        auth and returns the decoded JSON response.

        Args:
            external_id: Listing id to match with a ``term`` query. Converted
                to ``str`` before being sent. Defaults to the original example id.
            timeout: Seconds to wait for the search request before giving up.

        Returns:
            The parsed JSON body of the search response.

        Raises:
            RuntimeError: If no session cookie could be obtained.
            requests.HTTPError: If the search endpoint answers with a 4xx/5xx.
            requests.RequestException: On connection failure or timeout.
        """
        import json  # local import: builds the NDJSON body safely

        session_id = get_session_cookie()
        if session_id is None:
            # Sending a valueless cookie would only produce an opaque 401 later.
            raise RuntimeError('Challenge did not return an hb-session-id cookie')

        cookies = {
            'hb-session-id': session_id
        }

        headers = {
            'authorization': 'Basic YmF5dXRfcmVhZF91c2VyX2VzMjoxMHlObWc1KzZL',
            'content-type': 'application/x-ndjson',
        }

        params = {
            "filter_path": "took,*.took,*.suggest.*.options.text,*.suggest.*.options._source.*,*.hits.total.*,*.hits.hits._source.*,*.hits.hits._score,*.hits.hits.highlight.*,*.error,*.aggregations.*.buckets.key,*.aggregations.*.buckets.doc_count,*.aggregations.*.buckets.complex_value.hits.hits._source,*.aggregations.*.filtered_agg.facet.buckets.key,*.aggregations.*.filtered_agg.facet.buckets.doc_count,*.aggregations.*.filtered_agg.facet.buckets.complex_value.hits.hits._source"
        }

        # NDJSON: one compact JSON document per line, each newline-terminated.
        ndjson_lines = (
            {"index": "dld_matched_property_details_prod_alias"},
            {
                "from": 0,
                "size": 5,
                "track_total_hits": 10000,
                "query": {
                    "bool": {
                        "must": [{"term": {"external_id": str(external_id)}}]
                    }
                },
            },
        )
        data = "".join(
            json.dumps(line, separators=(",", ":")) + "\n" for line in ndjson_lines
        )

        url = 'https://fenix-data-es2.bayut.com/_msearch'
        response = requests.post(
            url,
            params=params,
            headers=headers,
            cookies=cookies,
            data=data,
            timeout=timeout,  # without a timeout, requests may block forever
        )
        # Surface 401/403 etc. as HTTPError rather than a JSON decode failure.
        response.raise_for_status()

        return response.json()
    
    
    # Run the example only when executed as a script, so that importing this
    # module does not trigger network requests.
    if __name__ == "__main__":
        print(get_validation_data())