Search code examples
Tags: python-3.x, web-scraping, post, python-requests, httpx

Handling POST Request in expedia.com


I came across this post; the answer to that question shows how to handle a POST request for hotels.com.

Here is the code, written by αԋɱҽԃ αмєяιcαη:

import trio
import httpx
import pandas as pd


async def main():
    """Fetch one page of hotels.com guest reviews over GraphQL and print them.

    Sends the ``reviewsQuery`` operation for hotel ``344560`` to
    ``https://fr.hotels.com/kes/graphql``, flattens every ``items`` list found
    under ``reviews.hermes.groups`` into one list, and prints the result as a
    pandas DataFrame.

    Only a single request is made: no ``paginationURL`` variable is sent, and
    the ``pagination.nextURL`` field that the query asks for is not followed.
    NOTE(review): presumably that is why only the first batch of reviews comes
    back -- confirm by passing ``nextURL`` as ``paginationURL`` in a follow-up
    request.

    Raises:
        httpx.HTTPStatusError: if the server answers with a 4xx/5xx status.
    """
    # timeout=None disables httpx's default timeouts for this client.
    async with httpx.AsyncClient(timeout=None) as client:
        data = {
            "operationName": "reviewsQuery",
            # One logical string. It is written as adjacent literals because a
            # plain "..." literal may not span physical lines (SyntaxError).
            "query": (
                "query reviewsQuery($hotelId: String!, $reviewType: String, $reviewOrder: String, $tripTypeFilter: String, $paginationURL: String) {\n  reviews(\n    hotelId: $hotelId\n    reviewType: $reviewType\n    reviewOrder: $reviewOrder\n    tripTypeFilter: $tripTypeFilter\n    paginationURL: $paginationURL\n  ) {\n    body {\n      reviewContent {\n        filters {\n          type\n          name\n          count\n          url\n          __typename\n        }\n        overall {\n          selectedFilterType\n          rating\n          badgeText\n          total\n          scores {\n            score\n            count\n            url\n            __typename\n          }\n          ratingAspects {\n            cleanliness\n            service\n            comfort\n            condition\n            neighbourhood\n            __typename\n          }\n          whatGuestsSay {\n            type\n            text\n            __typename\n          }\n          topRated {\n            category\n            explanation\n            __typename\n          }\n          __typename\n        }\n        sort {\n          url\n          options {\n            value\n            label\n            __typename\n          }\n          __typename\n        }\n        reviews {\n          hermes {\n            groups {\n              separatorText\n              items {\n                itineraryId\n                brand\n                googleTranslateEnabled\n                reviewDbDate\n                ...GuestReviewsFragment\n                __typename\n              }\n              __typename\n            }\n            __typename\n          }\n          __typename\n        }\n        pagination {\n          currentPage\n          nextURL\n          totalPages\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n\nfragment GuestReviewsFragment on ReviewsItem {\n  genuineMsg\n  tripType\n  tripTypeText\n  "
                "reviewDate\n  reviewSubmitDate\n  rating\n  reviewer {\n    name\n    locality\n    locale\n    __typename\n  }\n  badge\n  summary\n  description\n  __typename\n}\n"
            ),
            "variables": {
                "hotelId": "344560",
                "reviewOrder": "date_newest_first",
                "reviewType": "brand",
                "tripTypeFilter": "all",
            },
        }
        r = await client.post('https://fr.hotels.com/kes/graphql', json=data)
        # Surface HTTP errors directly instead of a confusing KeyError or
        # JSON decoding error further down.
        r.raise_for_status()
        groups = r.json()['data']['reviews']['body']['reviewContent']['reviews']['hermes']['groups']
        # Each group carries its own list of review items; flatten them.
        allin = [item for group in groups for item in group['items']]
        df = pd.DataFrame(allin)
        print(df)


if __name__ == "__main__":
    trio.run(main)

I have two questions related to this:

  1. How does the above code work? It is not able to extract all the reviews (it extracts only 50), so I am trying to understand how to read the POST query here.

  2. How can I modify the code for expedia.com? I tried the above method for expedia.com, but it did not work. Here is my attempt:

import trio
import httpx
import pandas as pd


async def main():
    """Attempt the hotels.com ``reviewsQuery`` request against an Expedia host.

    Posts the same GraphQL payload (hotel id ``344560``) to
    ``https://fr.expedia.com/kes/graphql``, flattens every ``items`` list found
    under ``reviews.hermes.groups`` and prints the result as a pandas DataFrame.

    NOTE(review): the post reports this request failing with
    ``ConnectError: [Errno -2] Name or service not known``, i.e. the host name
    did not resolve. The URL is intentionally left unchanged so that the
    reported error stays reproducible.

    Raises:
        httpx.ConnectError: if the connection cannot be established.
        httpx.HTTPStatusError: if the server answers with a 4xx/5xx status.
    """
    # timeout=None disables httpx's default timeouts for this client.
    async with httpx.AsyncClient(timeout=None) as client:
        data = {
            "operationName": "reviewsQuery",
            # One logical string. It is written as adjacent literals because a
            # plain "..." literal may not span physical lines (SyntaxError).
            "query": (
                "query reviewsQuery($hotelId: String!, $reviewType: String, $reviewOrder: String, $tripTypeFilter: String, $paginationURL: String) {\n  reviews(\n    hotelId: $hotelId\n    reviewType: $reviewType\n    reviewOrder: $reviewOrder\n    tripTypeFilter: $tripTypeFilter\n    paginationURL: $paginationURL\n  ) {\n    body {\n      reviewContent {\n        filters {\n          type\n          name\n          count\n          url\n          __typename\n        }\n        overall {\n          selectedFilterType\n          rating\n          badgeText\n          total\n          scores {\n            score\n            count\n            url\n            __typename\n          }\n          ratingAspects {\n            cleanliness\n            service\n            comfort\n            condition\n            neighbourhood\n            __typename\n          }\n          whatGuestsSay {\n            type\n            text\n            __typename\n          }\n          topRated {\n            category\n            explanation\n            __typename\n          }\n          __typename\n        }\n        sort {\n          url\n          options {\n            value\n            label\n            __typename\n          }\n          __typename\n        }\n        reviews {\n          hermes {\n            groups {\n              separatorText\n              items {\n                itineraryId\n                brand\n                googleTranslateEnabled\n                reviewDbDate\n                ...GuestReviewsFragment\n                __typename\n              }\n              __typename\n            }\n            __typename\n          }\n          __typename\n        }\n        pagination {\n          currentPage\n          nextURL\n          totalPages\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n\nfragment GuestReviewsFragment on ReviewsItem {\n  genuineMsg\n  tripType\n  tripTypeText\n  "
                "reviewDate\n  reviewSubmitDate\n  rating\n  reviewer {\n    name\n    locality\n    locale\n    __typename\n  }\n  badge\n  summary\n  description\n  __typename\n}\n"
            ),
            "variables": {
                "hotelId": "344560",
                "reviewOrder": "date_newest_first",
                "reviewType": "brand",
                "tripTypeFilter": "all",
            },
        }
        r = await client.post('https://fr.expedia.com/kes/graphql', json=data)
        # Surface HTTP errors directly instead of a confusing KeyError or
        # JSON decoding error further down.
        r.raise_for_status()
        groups = r.json()['data']['reviews']['body']['reviewContent']['reviews']['hermes']['groups']
        # Each group carries its own list of review items; flatten them.
        allin = [item for group in groups for item in group['items']]
        df = pd.DataFrame(allin)
        print(df)


if __name__ == "__main__":
    trio.run(main)

Here is the error:

gaierror Traceback (most recent call last)

/usr/local/lib/python3.10/dist-packages/httpcore/_exceptions.py in map_exceptions(map) 9 try: ---> 10 yield 11 except Exception as exc: # noqa: PIE786

31 frames

gaierror: [Errno -2] Name or service not known

The above exception was the direct cause of the following exception:

ConnectError Traceback (most recent call last)

ConnectError: [Errno -2] Name or service not known

The above exception was the direct cause of the following exception:

ConnectError Traceback (most recent call last)

[... skipping hidden 1 frame]

/usr/local/lib/python3.10/dist-packages/httpx/_transports/default.py in map_httpcore_exceptions() 75 76 message = str(exc) ---> 77 raise mapped_exc(message) from exc 78 79

ConnectError: [Errno -2] Name or service not known

Here is another piece of code I tried; this one raised requests.exceptions.HTTPError: 429 Client Error: Too Many Requests for url: https://www.expedia.com/graphql

import requests
import pandas as pd

def main():
    """POST an abridged ``PropertyFilteredReviewsQuery`` to Expedia and print the rows.

    The request body is a one-element list holding a single GraphQL operation
    for property ``24625``; the first element of the JSON reply is read back.
    The query text ends with a literal ``... (rest of the GraphQL query) ...``
    placeholder, exactly as posted.

    Raises:
        requests.exceptions.HTTPError: via ``raise_for_status()`` on a 4xx/5xx
            reply (the post reports a 429 for this request).
    """
    # Anonymous visitor identity sent inside the GraphQL context.
    identity = {
        "duaid": "-1",
        "expUserId": "832921361",
        "tuid": "-1",
        "authState": "ANONYMOUS"
    }
    context = {
        "siteId": 1,
        "locale": "en_US",
        "eapid": 0,
        "currency": "USD",
        "device": {"type": "DESKTOP"},
        "identity": identity,
        "privacyTrackingState": "CAN_TRACK",
        "debugContext": {"abacusOverrides": [], "alterMode": "RELEASED"}
    }

    # Paging ("startIndex"/"size"), sort order and review-inclusion flags.
    secondary = {
        "booleans": [
            {"id": "includeRecentReviews", "value": True},
            {"id": "includeRatingsOnlyReviews", "value": True},
            {"id": "overrideEmbargoForIndividualReviews", "value": True}
        ],
        "counts": [
            {"id": "startIndex", "value": 0},
            {"id": "size", "value": 10}
        ],
        "selections": [
            {"id": "sortBy", "value": "NEWEST_TO_OLDEST_BY_LANGUAGE"},
            {"id": "searchTerm", "value": ""}
        ]
    }
    search_criteria = {
        "primary": {
            "dateRange": None,
            "rooms": [{"adults": 2}],
            "destination": {"regionId": "178305"}
        },
        "secondary": secondary
    }

    operation = {
        "operationName": "PropertyFilteredReviewsQuery",
        "variables": {
            "context": context,
            "propertyId": "24625",
            "searchCriteria": search_criteria
        },
        "query": "query PropertyFilteredReviewsQuery($context: ContextInput!, $propertyId: String!, $searchCriteria: PropertySearchCriteriaInput!) {\n  propertyReviewSummaries(\n    context: $context\n    propertyIds: [$propertyId]\n    searchCriteria: $searchCriteria\n  ) {\n    ...__PropertyReviewSummaryFragment\n    __typename\n  }\n  propertyInfo(context: $context, propertyId: $propertyId) {\n    id\n    reviewInfo(searchCriteria: $searchCriteria) {\n      ...__PropertyReviewsListFragment\n      sortAndFilter {\n        ...TravelerTypeFragment\n        ...SortTypeFragment\n        ...SearchTextFragment\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n\n... (rest of the GraphQL query) ..."
    }
    payload = [operation]

    request_headers = {
        "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0"
    }

    reply = requests.post("https://www.expedia.com/graphql", json=payload, headers=request_headers)
    reply.raise_for_status()

    # The reply is indexed like the request: element 0 answers operation 0.
    review_info = reply.json()[0]['data']['propertyInfo']['reviewInfo']
    rows = list(review_info['reviews']['content']['reviews'])
    frame = pd.DataFrame(rows)
    print(frame)

if __name__ == "__main__":
    main()


Solution

  • This works for me but I did not look into how client-info and duaid are generated.

    import requests
    import pandas as pd
    
    def main():
        """Fetch one page of Expedia reviews for property ``24625`` and print them.

        Posts the full ``PropertyFilteredReviewsQuery`` GraphQL operation (as a
        one-element list) to ``https://www.expedia.com/graphql`` and prints a
        DataFrame with the ``title``, ``superlative`` and ``text`` of each
        review in the reply.

        The ``Client-Info`` header and the ``duaid`` identity value were copied
        from browser dev tools; how they are generated is not known here.
        ``startIndex`` and ``size`` in ``counts`` are sent as ``0`` and ``10``.

        Raises:
            requests.exceptions.HTTPError: via ``raise_for_status()`` on a
                4xx/5xx reply.
        """
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
            # Origin unknown: copied from browser dev tools.
            'Client-Info': "blossom-flex-ui,9382ef788e9311fcea3ce7a7b749cd68c4059a45,us-west-2",
        }

        data = [
            {
                "operationName": "PropertyFilteredReviewsQuery",
                "variables": {
                    "context": {
                        "siteId": 1,
                        "locale": "en_US",
                        "eapid": 0,
                        "currency": "USD",
                        "device": {
                            "type": "DESKTOP"
                        },
                        "identity": {
                            # Origin unknown: copied from browser dev tools.
                            "duaid": "1003be2b-6834-4cf8-bb66-66a49107b76c",
                            "expUserId": "-1",
                            "tuid": "-1",
                            "authState": "ANONYMOUS"
                        },
                        "privacyTrackingState": "CAN_TRACK",
                        "debugContext": {
                            "abacusOverrides": [],
                            "alterMode": "RELEASED"
                        }
                    },
                    "propertyId": "24625",
                    "searchCriteria": {
                        "primary": {
                            "dateRange": None,
                            "rooms": [
                                {
                                    "adults": 2
                                }
                            ],
                            "destination": {
                                "regionId": "178305"
                            }
                        },
                        "secondary": {
                            "booleans": [
                                {
                                    "id": "includeRecentReviews",
                                    "value": True
                                },
                                {
                                    "id": "includeRatingsOnlyReviews",
                                    "value": True
                                },
                                {
                                    "id": "overrideEmbargoForIndividualReviews",
                                    "value": True
                                }
                            ],
                            "counts": [
                                {
                                    "id": "startIndex",
                                    "value": 0
                                },
                                {
                                    "id": "size",
                                    "value": 10
                                }
                            ],
                            "selections": [
                                {
                                    "id": "sortBy",
                                    "value": "NEWEST_TO_OLDEST_BY_LANGUAGE"
                                },
                                {
                                    "id": "searchTerm",
                                    "value": ""
                                }
                            ]
                        }
                    }
                },
                # One logical string. It is written as adjacent literals
                # because a plain "..." literal may not span physical lines
                # (SyntaxError). The trailing space of each piece is part of
                # the query text.
                "query": (
                    "query PropertyFilteredReviewsQuery($context: ContextInput!, $propertyId: String!, $searchCriteria: PropertySearchCriteriaInput!) {\n  propertyReviewSummaries(\n    context: $context\n    propertyIds: [$propertyId]\n    searchCriteria: $searchCriteria\n  ) {\n    ...__PropertyReviewSummaryFragment\n    __typename\n  }\n  propertyInfo(context: $context, propertyId: $propertyId) {\n    id\n    reviewInfo(searchCriteria: $searchCriteria) {\n      ...__PropertyReviewsListFragment\n      sortAndFilter {\n        ...TravelerTypeFragment\n        ...SortTypeFragment\n        ...SearchTextFragment\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n\nfragment __PropertyReviewSummaryFragment on PropertyReviewSummary {\n  accessibilityLabel\n  overallScoreWithDescriptionA11y {\n    ...LodgingEnrichedMessageFragment\n    __typename\n  }\n  propertyReviewCountDetails {\n    fullDescription\n    __typename\n  }\n  ...ReviewDisclaimerFragment\n  reviewSummaryDetails {\n    label\n    ratingPercentage\n    formattedRatingOutOfMax\n    __typename\n  }\n  totalCount {\n    raw\n    __typename\n  }\n  __typename\n}\n\nfragment ReviewDisclaimerFragment on PropertyReviewSummary {\n  reviewDisclaimer\n  reviewDisclaimerHeading\n  strategy\n  reviewDisclaimerValues {\n    text\n    __typename\n  }\n  reviewDisclaimerLabel\n  reviewDisclaimerAnalytics {\n    referrerId\n    linkName\n    __typename\n  }\n  reviewDisclaimerUrl {\n    value\n    accessibilityLabel\n    link {\n      url\n      __typename\n    }\n    __typename\n  }\n  reviewDisclaimerAccessibilityLabel\n  __typename\n}\n\nfragment LodgingEnrichedMessageFragment on LodgingEnrichedMessage {\n  __typename\n  subText\n  value\n  theme\n  state\n  accessibilityLabel\n  icon {\n    id\n    size\n    theme\n    __typename\n  }\n  mark {\n    id\n    __typename\n  }\n  egdsMark {\n    url {\n      value\n      __typename\n    }\n    __typename\n  }\n}\n\nfragment "
                    "__PropertyReviewsListFragment on PropertyReviews {\n  summary {\n    paginateAction {\n      text\n      analytics {\n        referrerId\n        linkName\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n  reviews {\n    contentDirectFeedbackPromptId\n    ...ReviewParentFragment\n    managementResponses {\n      ...ReviewChildFragment\n      __typename\n    }\n    reviewInteractionSections {\n      primaryDisplayString\n      reviewInteractionType\n      __typename\n    }\n    __typename\n  }\n  ...NoResultsMessageFragment\n  __typename\n}\n\nfragment ReviewParentFragment on PropertyReview {\n  id\n  superlative\n  locale\n  title\n  brandType\n  reviewScoreWithDescription {\n    label\n    value\n    __typename\n  }\n  text\n  seeMoreAnalytics {\n    linkName\n    referrerId\n    __typename\n  }\n  submissionTime {\n    longDateFormat\n    __typename\n  }\n  impressionAnalytics {\n    event\n    referrerId\n    __typename\n  }\n  themes {\n    ...ReviewThemeFragment\n    __typename\n  }\n  reviewFooter {\n    ...PropertyReviewFooterSectionFragment\n    __typename\n  }\n  ...FeedbackIndicatorFragment\n  ...AuthorFragment\n  ...PhotosFragment\n  ...TravelersFragment\n  ...ReviewTranslationInfoFragment\n  ...PropertyReviewSourceFragment\n  ...PropertyReviewRegionFragment\n  __typename\n}\n\nfragment AuthorFragment on PropertyReview {\n  reviewAuthorAttribution {\n    text\n    __typename\n  }\n  __typename\n}\n\nfragment PhotosFragment on PropertyReview {\n  id\n  photoSection {\n    imageClickAnalytics {\n      referrerId\n      linkName\n      __typename\n    }\n    exitAnalytics {\n      referrerId\n      linkName\n      __typename\n    }\n    navClickAnalytics {\n      referrerId\n      linkName\n      __typename\n    }\n    __typename\n  }\n  photos {\n    description\n    url\n    __typename\n  }\n  __typename\n}\n\nfragment TravelersFragment on PropertyReview {\n  travelers\n  __typename\n}\n\nfragment ReviewThemeFragment on "
                    "ReviewThemes {\n  icon {\n    id\n    __typename\n  }\n  label\n  __typename\n}\n\nfragment FeedbackIndicatorFragment on PropertyReview {\n  reviewInteractionSections {\n    primaryDisplayString\n    accessibilityLabel\n    reviewInteractionType\n    feedbackAnalytics {\n      linkName\n      referrerId\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\nfragment ReviewTranslationInfoFragment on PropertyReview {\n  translationInfo {\n    loadingTranslationText\n    targetLocale\n    translatedBy {\n      description\n      __typename\n    }\n    translationCallToActionLabel\n    seeOriginalText\n    __typename\n  }\n  __typename\n}\n\nfragment PropertyReviewSourceFragment on PropertyReview {\n  propertyReviewSource {\n    accessibilityLabel\n    graphic {\n      description\n      id\n      size\n      token\n      url {\n        value\n        __typename\n      }\n      __typename\n    }\n    text {\n      value\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\nfragment PropertyReviewRegionFragment on PropertyReview {\n  reviewRegion {\n    id\n    __typename\n  }\n  __typename\n}\n\nfragment PropertyReviewFooterSectionFragment on PropertyReviewFooterSection {\n  messages {\n    seoStructuredData {\n      itemscope\n      itemprop\n      itemtype\n      content\n      __typename\n    }\n    text {\n      ... on EGDSPlainText {\n        text\n        __typename\n      }\n      ... on EGDSGraphicText {\n        text\n        graphic {\n          ... on Mark {\n            description\n            id\n            size\n            url {\n              ... "
                    "on HttpURI {\n                relativePath\n                value\n                __typename\n              }\n              __typename\n            }\n            __typename\n          }\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\nfragment ReviewChildFragment on ManagementResponse {\n  id\n  header {\n    text\n    __typename\n  }\n  response\n  __typename\n}\n\nfragment NoResultsMessageFragment on PropertyReviews {\n  noResultsMessage {\n    __typename\n    ...MessagingCardFragment\n    ...EmptyStateFragment\n  }\n  __typename\n}\n\nfragment MessagingCardFragment on UIMessagingCard {\n  graphic {\n    __typename\n    ... on Icon {\n      id\n      description\n      __typename\n    }\n  }\n  primary\n  secondaries\n  __typename\n}\n\nfragment EmptyStateFragment on UIEmptyState {\n  heading\n  body\n  __typename\n}\n\nfragment TravelerTypeFragment on SortAndFilterViewModel {\n  sortAndFilter {\n    name\n    label\n    options {\n      label\n      isSelected\n      optionValue\n      description\n      clickAnalytics {\n        linkName\n        referrerId\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\nfragment SortTypeFragment on SortAndFilterViewModel {\n  sortAndFilter {\n    name\n    label\n    clickAnalytics {\n      linkName\n      referrerId\n      __typename\n    }\n    options {\n      label\n      isSelected\n      optionValue\n      description\n      clickAnalytics {\n        linkName\n        referrerId\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\nfragment SearchTextFragment on SortAndFilterViewModel {\n  sortAndFilter {\n    name\n    label\n    graphic {\n      ... on Icon {\n        description\n        id\n        token\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n"
                ),
            }
        ]

        response = requests.post("https://www.expedia.com/graphql", json=data, headers=headers)
        response.raise_for_status()

        reviews = response.json()[0]['data']['propertyInfo']['reviewInfo']['reviews']
        # Collect plain dicts and build the frame once: pd.concat inside the
        # loop copies the frame on every iteration and left every row with
        # index 0.
        rows = [
            {
                'title': x['title'],
                'superlative': x['superlative'],
                'text': x['text'],
            }
            for x in reviews
        ]
        allin = pd.DataFrame(rows, columns=['title', 'superlative', 'text'])
        print(allin)

    if __name__ == "__main__":
        main()
    

    I noticed that the duaid comes from the endpoint https://www.expedia.com/api/ucs/shortlist//fetch/?clientId=flex

    The request to that endpoint requires a client-token, which is in a tag when the page loads. I haven't checked whether it can be obtained using requests, since it is presumably set by JavaScript. But this should be a start in the right direction: you need the right duaid and Client-Info token to automate making these calls.