Search code examples
python html json scrapy

How to extract data from a precise JSON subnode in HTML


I am trying to extract product data (name, price, URL), as well as the available sizes and colours, from a website that embeds its data as JSON inside a script tag (https://www.bergzeit.de/marken/salewa/). I cannot find a way to get to the 'elementsList' where I can see the data encoded. I used this code to get to the correct script on the page:

scripts = response.xpath('//script/text()').getall()
the_script = [s for s in scripts if 'elementsList' in s] 
data = json.loads(the_script[0])

I get JSONDecodeError: Expecting value: line 2 column 1 (char 1)

This is the HTML

<script>
window.__initialAppState = {
  global: {
    gtmId: 'GTM-NKB4VN4',
    cookiePresent: false,
    user: {
      loggedIn: false,
      b2b: false,
      b2bPrefs: {},
      b2bAdmin: false
    },

  modules: {
    checkout: {

      removePromoCodeUrl: '/summary/removevoucher/'
    },
    accountPage: {

    },
    loyaltyPage: {
      urls: {
        register: '/login/?registerAccountUrl=/register/loyalty/',
        overview: '/club'
      },

    },
    productsListPage: {

          headline: JSON.parse("\"Salewa\""),

        resultCount: 1754,
        elementsList: [{"data":{"brand":{"name":"Salewa"},"description":"Wetterfest und ultrawarm für anspruchsvolle Herren: 550 cuin Daunenjacke mit wind- und wasserabweisender Außenschicht","features":[],"googleClick":{"event":"eec.productClick","ecommerce":{"click":{"actionField":{"list":"brand"},"products":[{"brand":"Salewa","category":"Marken/Salewa","id":"1080071-051","name":"Herren Brenta Jacke","position":"1","price":"167.18","variant":"dark olive|S"}]}}},"googleImpression":{"event":"eec.productImpression","ecommerce":{"impressions":[{"brand":"Salewa","category":"Marken/Salewa","id":"1080071-051","list":"brand","name":"Herren Brenta Jacke","position":"1","price":"167.18","variant":"dark olive|S"}]}},"id":"1080071","images":[{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular/1080071-051_pic1.jpg","title":"Herren Brenta Jacke"},{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular/1080071-051_pic3.jpg","title":"Herren Brenta Jacke"},{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular/1080071-051_pic4.jpg","title":"Herren Brenta Jacke"},{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular/1080071-051_pic5.jpg","title":"Herren Brenta Jacke"},{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular/1080071-051_pic6.jpg","title":"Herren Brenta Jacke"},{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular/1080071-051_pic7.jpg","title":"Herren Brenta Jacke"}],"isNew":true,"itemId":"1080071-051","labels":["sale"],"manyAvailableSizes":true,"name":"Herren Brenta Jacke","price":{"savedAmount":"51,00 €","savedPercentage":20,"priceForSchemaOrgOffer":"198.95","previous":"249,95 €","old":"249,95 €","current":"198,95 €","conf":"189,00 €","isFromPrice":false,"isOldPrice":true,"isRetailPrice":false,"isSignificantlyReduced":true,"bestSavedAmount":"51,00 
€"},"productId":"1080071","ratings":4.571428571428571,"ratingsCount":14,"sellingPoints":[],"sizedImages":[{"mobile":{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular_mobile/1080071-051_pic1.jpg","title":"Herren Brenta Jacke"},"desktop":{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular/1080071-051_pic1.jpg","title":"Herren Brenta Jacke"}},{"mobile":{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular_mobile/1080071-051_pic3.jpg","title":"Herren Brenta Jacke"},"desktop":{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular/1080071-051_pic3.jpg","title":"Herren Brenta Jacke"}},{"mobile":{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular_mobile/1080071-051_pic4.jpg","title":"Herren Brenta Jacke"},"desktop":{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular/1080071-051_pic4.jpg","title":"Herren Brenta Jacke"}},{"mobile":{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular_mobile/1080071-051_pic5.jpg","title":"Herren Brenta Jacke"},"desktop":{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular/1080071-051_pic5.jpg","title":"Herren Brenta Jacke"}},{"mobile":{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular_mobile/1080071-051_pic6.jpg","title":"Herren Brenta Jacke"},"desktop":{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular/1080071-051_pic6.jpg","title":"Herren Brenta Jacke"}},{"mobile":{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular_mobile/1080071-051_pic7.jpg","title":"Herren Brenta Jacke"},"desktop":{"alt":"Herren Brenta Jacke","src":"https://static.bergzeit.com/product_tile_regular/1080071-051_pic7.jpg","title":"Herren Brenta 
Jacke"}}],"sustainable":true,"url":"/p/salewa-herren-brenta-jacke/1080071/#itemId=1080071-051","variations":[{"attributeName":"import:style","colorVariation":true,"display_string":"Herstellerfarbe","name":"var[import:style]","options":[{"isAnyAvailable":true,"isAvailable":false,"label":"autumnal","stock":0,"value":"4171-autumnal","valueThumb":"orange"},{"isAnyAvailable":true,"isAvailable":false,"label":"black out","stock":0,"value":"0910-black_out","valueThumb":"schwarz"},{"isAnyAvailable":true,"isAvailable":false,"label":"dark olive","stock":0,"value":"5281-dark_olive","valueThumb":"oliv"},{"isAnyAvailable":true,"isAvailable":false,"label":"electric","stock":0,"value":"8621-electric","valueThumb":"blau"},{"isAnyAvailable":true,"isAvailable":false,"label":"flame","stock":0,"value":"1501-flame","valueThumb":"rot"},{"isAnyAvailable":true,"isAvailable":false,"label":"gold","stock":0,"value":"2191-gold","valueThumb":"gelb"},{"isAnyAvailable":true,"isAvailable":false,"label":"syrah","stock":0,"value":"1571-syrah","valueThumb":"rot"},{"isAnyAvailable":true,"isAvailable":false,"label":"thyme","stock":0,"value":"5561-thyme","valueThumb":"gruen"}],"selectedOption":"5281-dark_olive"},{"attributeName":"import:manufacturer_size","colorVariation":false,"display_string":"Herstellergröße","name":"var[import:manufacturer_size]","options":[{"isAnyAvailable":true,"isAvailable":false,"label":"S","sameDiscount":true,"value":"S"},{"isAnyAvailable":true,"isAvailable":false,"label":"M","sameDiscount":true,"value":"M"},{"isAnyAvailable":true,"isAvailable":false,"label":"L","sameDiscount":true,"value":"L"},{"isAnyAvailable":true,"isAvailable":false,"label":"XL","sameDiscount":true,"value":"XL"},{"isAnyAvailable":true,"isAvailable":false,"label":"XXL","sameDiscount":true,"value":"XXL"}],"selectedOption":"S"}]},"type":"product"}],
        oneTileTeaserInsertions: [],
        salesTeaserInsertion: "",
        highlightedArticlesIds: [],
        selectableVariations: ["import:Var_Article"],

          currentCategory: {"id":"-s-2831-salewa","name":"Salewa","path":"marken-salewa","url":"/marken/salewa/?show=all","root":false},
          parentCategory: {"id":"-s-2831","name":"Marken","path":"marken","url":"/marken/?show=all","root":false},
        subCategoryTeasers: [],
        search: JSON.parse("null"),
        ms: null,
        isBot: false,
      
    },
    productPage: {

          productData: {},

    },
    cartPage:
    {}
    ,
    wishlistPage: {

    }
  }
};
</script>

Solution

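  • You're getting an error because window.__initialAppState is not JSON but a JavaScript object literal (it has unquoted keys, single-quoted strings and JSON.parse(...) calls), so json.loads cannot parse the script text as a whole.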

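    One way to work around this is to treat the script as a string and extract only the value of elementsList, which is valid JSON on its own. You can use a regular expression such as elementsList: (.*) for that. Assuming the array sits on a single line in the page source, the capture runs to the end of that line and therefore includes the trailing comma after the closing bracket, so strip it (for example with .strip().rstrip(',')) before parsing. Then you can load the resulting string with json.loads from the Python json module.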

    Hope this helps :)