Search code examples
javascript, json, web-scraping

Scrape specific properties from Json into a new Json


I have a WordPress website with an API that exports posts as JSON. I'm building a new website that uses these news posts, but I have to take this JSON file and somehow reformat it into a new JSON file that uses my own property names. I also want to discard some properties from it. Here is a quick example of the JSON I get for a post:

{
  "status": "ok",
  "post": {
    "id": 2335,
    "type": "post",
    "slug": "litoral-awards14-no-jornal-diario-porto-canal",
    "url": "https:\/\/litoralmagazine.com\/litoral-awards14-no-jornal-diario-porto-canal\/",
    "status": "publish",
    "title": "Litoral Awards’14 no Jornal Di\u00e1rio do Porto Canal",
    "title_plain": "Litoral Awards’14 no Jornal Di\u00e1rio do Porto Canal",
    "content": "<div id=\"tps_slideContainer_2335\" class=\"theiaPostSlider_slides\"><div>\n\n\n\n<\/div><\/div><div class=\"theiaPostSlider_footer _footer\"><\/div><p><!-- END THEIA POST SLIDER --><\/p>\n\n                <script type='text\/javascript'>\n                    \n                var tpsInstance;\n                var tpsOptions = {\"slideContainer\":\"#tps_slideContainer_2335\",\"nav\":[\".theiaPostSlider_nav\"],\"navText\":\"%{currentSlide} de %{totalSlides}\",\"helperText\":\"\",\"defaultSlide\":0,\"transitionEffect\":\"slide\",\"transitionSpeed\":400,\"keyboardShortcuts\":true,\"scrollAfterRefresh\":true,\"numberOfSlides\":1,\"slides\":[],\"useSlideSources\":true,\"themeType\":\"font\",\"prevText\":\"Anterior\",\"nextText\":\"Seguinte\",\"buttonWidth\":\"0\",\"buttonWidth_post\":\"0\",\"postUrl\":\"https:\\\/\\\/litoralmagazine.com\\\/litoral-awards14-no-jornal-diario-porto-canal\\\/\",\"postId\":2335,\"refreshAds\":false,\"refreshAdsEveryNSlides\":\"1\",\"adRefreshingMechanism\":\"javascript\",\"siteUrl\":\"https:\\\/\\\/litoralmagazine.com\",\"loopSlides\":false,\"scrollTopOffset\":\"0\",\"prevFontIcon\":\"<span aria-hidden=\\\"true\\\" class=\\\"tps-icon-chevron-circle-left\\\"><\\\/span>\",\"nextFontIcon\":\"<span aria-hidden=\\\"true\\\" class=\\\"tps-icon-chevron-circle-right\\\"><\\\/span>\"};\n            \n                    (function ($) {\n                        $(document).ready(function () {\n                            \n                            tpsInstance = new tps.createSlideshow(tpsOptions);\n                        });\n                    }(jQuery));\n                <\/script>\n            ",
    "excerpt": "",
    "date": "2014-12-13 12:02:05",
    "modified": "2016-05-18 09:31:00",
    "categories": [
      {
        "id": 299,
        "slug": "litoral-awards",
        "title": "Litoral Awards",
        "description": "",
        "parent": 0,
        "post_count": 91
      },
      {
        "id": 342,
        "slug": "clipping-2014",
        "title": "clipping-2014",
        "description": "",
        "parent": 0,
        "post_count": 3
      },
      {
        "id": 573,
        "slug": "litoral-awards-2014",
        "title": "Litoral Awards 2014",
        "description": "",
        "parent": 0,
        "post_count": 21
      }
    ],
    "tags": [
      {
        "id": 82,
        "slug": "featured2",
        "title": "Featured2",
        "description": "",
        "post_count": 10
      },
      {
        "id": 312,
        "slug": "litoral-awards-2015",
        "title": "litoral awards 2015",
        "description": "",
        "post_count": 19
      }
    ],
    "author": {
      "id": 4,
      "slug": "litoral-magazine",
      "name": "Litoral Magazine",
      "first_name": "Litoral",
      "last_name": "Magazine",
      "nickname": "Litoral Magazine",
      "url": "https:\/\/litoralmagazine.com",
      "description": ""
    },
    "comments": [],
    "attachments": [
      {
        "id": 2336,
        "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014.jpg",
        "slug": "porto-canal-jornal-diario-litoral-awards-2014",
        "title": "porto-canal-jornal-diario-litoral-awards-2014",
        "description": "",
        "caption": "",
        "parent": 2335,
        "mime_type": "image\/jpeg",
        "images": {
          "full": {
            "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014.jpg",
            "width": 1000,
            "height": 600
          },
          "thumbnail": {
            "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014-150x150.jpg",
            "width": 150,
            "height": 150
          },
          "medium": {
            "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014-300x180.jpg",
            "width": 300,
            "height": 180
          },
          "medium_large": {
            "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014.jpg",
            "width": 1000,
            "height": 600
          },
          "post-thumbnail": {
            "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014.jpg",
            "width": 1000,
            "height": 600
          },
          "post-thumb": {
            "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014.jpg",
            "width": 1000,
            "height": 600
          },
          "medium-thumb": {
            "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014-400x240.jpg",
            "width": 400,
            "height": 240
          },
          "small-thumb": {
            "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014-95x60.jpg",
            "width": 95,
            "height": 60
          }
        }
      }
    ],
    "comment_count": 0,
    "comment_status": "closed",
    "thumbnail": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014.jpg",
    "custom_fields": {
      "tps_options": [
        "a:1:{s:7:\"enabled\";s:6:\"global\";}",
        "a:1:{s:7:\"enabled\";s:6:\"global\";}",
        "a:1:{s:7:\"enabled\";s:6:\"global\";}"
      ],
      "mvp_photo_credit": [
        "Nulla pariatur excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia. Photo by Shutterstock."
      ],
      "mvp_post_template": [
        "def-wide"
      ],
      "mvp_featured_image": [
        "show"
      ],
      "post_views_count": [
        "998"
      ],
      "mvp_video_embed": [
        "<iframe width=\"560\" height=\"315\" src=\"https:\/\/www.youtube.com\/embed\/BG5RVursEGQ?list=PLtUECMbIwb1X1NJEIU9pAbWkzBWMmg6Hj\" frameborder=\"0\" allowfullscreen><\/iframe>"
      ],
      "dpsp_networks_shares": [
        "a:0:{}"
      ]
    },
    "thumbnail_size": "post-thumbnail",
    "thumbnail_images": {
      "full": {
        "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014.jpg",
        "width": 1000,
        "height": 600
      },
      "thumbnail": {
        "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014-150x150.jpg",
        "width": 150,
        "height": 150
      },
      "medium": {
        "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014-300x180.jpg",
        "width": 300,
        "height": 180
      },
      "medium_large": {
        "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014.jpg",
        "width": 1000,
        "height": 600
      },
      "post-thumbnail": {
        "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014.jpg",
        "width": 1000,
        "height": 600
      },
      "post-thumb": {
        "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014.jpg",
        "width": 1000,
        "height": 600
      },
      "medium-thumb": {
        "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014-400x240.jpg",
        "width": 400,
        "height": 240
      },
      "small-thumb": {
        "url": "https:\/\/litoralmagazine.com\/wp-content\/uploads\/2016\/01\/porto-canal-jornal-diario-litoral-awards-2014-95x60.jpg",
        "width": 95,
        "height": 60
      }
    }
  },
  "previous_url": "https:\/\/litoralmagazine.com\/litoral-awards14-no-canal-central\/",
  "next_url": "https:\/\/litoralmagazine.com\/premio-musica-sons-em-transito\/"
}

How can I write a script that, for example, outputs this JSON in the following format:

{
  "status": "ok",
  "slug": "litoral-com-teste-noticia-url",
  "title": "Titulo de teste",
  "type": "post",
  "content": "Lorem",
  "published": 1,
  "excerpt": "",
  "categories": [1, 2, 4],
  "tags": [1, 2, 4],
  "author": 1,
  "comments": {
    "user_comment": 1,
    "body_comment": "comentario de teste"
  },
  "images": {
    "thumbnail_image": "http://wwww.imagem.com.jpeg",
    "featured_image": "http://wwww.imagem.com.jpeg"
  }
}

For example, I want to say: originalJson.status = newJson.status, originalJson.author.id = newJson.author, and originalJson.thumbnail.url = newJson.images.thumbnail_image. I hope you understand my point. You can also see in the JSON I get that originalJson.post.content contains a lot of HTML, like this:

<div id=\"tps_nav_upper_1785\" class=\"theiaPostSlider_nav _right fontTheme _upper\"><div class=\"_buttons\"><span class=\"_button _prev _another_post _disabled\"><span class=\"_1\"></span><span class=\"_2\" ><span aria-hidden=\"true\" class=\"tps-icon-chevron-circle-left\"></span></span><span class=\"_3\">Anterior</span></span><span class=\"_text\">1 de 15</span><a href=\"https://litoralmagazine.com/testemunhos-litoral-awards/2/\" class=\"_button _next\"><span class=\"_1\">Seguinte</span><span class=\"_2\" ><span aria-hidden=\"true\" class=\"tps-icon-chevron-circle-right\"></span></span><span class=\"_3\"></span></a></div><div class=\"_title\"><span class=\"_helper\"></span></div></div><div id=\"tps_slideContainer_1785\" class=\"theiaPostSlider_slides\"><div>\n\n<div id=\"attachment_1958\" style=\"width: 210px\" class=\"wp-caption alignleft\"><img class=\"wp-image-1958 size-medium\" src=\"https://litoralmagazine.com/wp-content/uploads/2014/11/ribau-esteves-200x300.jpg\" alt=\"ribau-esteves\" width=\"200\" height=\"300\" srcset=\"https://litoralmagazine.com/wp-content/uploads/2014/11/ribau-esteves-200x300.jpg 200w, https://litoralmagazine.com/wp-content/uploads/2014/11/ribau-esteves-600x900.jpg 600w, https://litoralmagazine.com/wp-content/uploads/2014/11/ribau-esteves.jpg 601w\" sizes=\"(max-width: 200px) 100vw, 200px\" /><p class=\"wp-caption-text\">Ribau Esteves &#8211; Presidente da Câmara Municipal de Aveiro</p></div>\n<p style=\"text-align: left;\">A edição de 2015 da Gala Litoral Awards marcou a agenda do município e da região de Aveiro, num exercício elegante e justo que reconhece e motiva todos para fazermos mais e melhor pela vida, pelas empresas, pelos municípios, pela região e pelas pessoas. Fazer muito e bem tem de ser, cada vez mais, uma tarefa importante e permanente. 
Reconhecer e premiar o mérito são atos nobres e necessários, para que juntos possamos continuar a crescer.</p>\n<h6 style=\"text-align: left;\">\n\n</div></div><div id=\"tps_nav_lower_1785\" class=\"theiaPostSlider_nav _right fontTheme _lower\"><div class=\"_buttons\"><span class=\"_button _prev _another_post _disabled\"><span class=\"_1\"></span><span class=\"_2\" ><span aria-hidden=\"true\" class=\"tps-icon-chevron-circle-left\"></span></span><span class=\"_3\">Anterior</span></span><span class=\"_text\">1 de 15</span><a href=\"https://litoralmagazine.com/testemunhos-litoral-awards/2/\" class=\"_button _next\"><span class=\"_1\">Seguinte</span><span class=\"_2\" ><span aria-hidden=\"true\" class=\"tps-icon-chevron-circle-right\"></span></span><span class=\"_3\"></span></a></div><div class=\"_title\"><span class=\"_helper\"></span></div></div><div class=\"theiaPostSlider_footer _footer\"></div><p><!-- END THEIA POST SLIDER --></p>\n\n                <script type='text/javascript'>\n                    \n                var tpsInstance;\n                var tpsOptions = {\"slideContainer\":\"#tps_slideContainer_1785\",\"nav\":[\".theiaPostSlider_nav\"],\"navText\":\"%{currentSlide} de %{totalSlides}\",\"helperText\":\"\",\"defaultSlide\":0,\"transitionEffect\":\"slide\",\"transitionSpeed\":400,\"keyboardShortcuts\":true,\"scrollAfterRefresh\":true,\"numberOfSlides\":15,\"slides\":[],\"useSlideSources\":true,\"themeType\":\"font\",\"prevText\":\"Anterior\",\"nextText\":\"Seguinte\",\"buttonWidth\":\"0\",\"buttonWidth_post\":\"0\",\"postUrl\":\"https:\\/\\/litoralmagazine.com\\/testemunhos-litoral-awards\\/\",\"postId\":1785,\"refreshAds\":false,\"refreshAdsEveryNSlides\":\"1\",\"adRefreshingMechanism\":\"javascript\",\"siteUrl\":\"https:\\/\\/litoralmagazine.com\",\"loopSlides\":false,\"scrollTopOffset\":\"0\",\"prevFontIcon\":\"<span aria-hidden=\\\"true\\\" class=\\\"tps-icon-chevron-circle-left\\\"><\\/span>\",\"nextFontIcon\":\"<span aria-hidden=\\\"true\\\" 
class=\\\"tps-icon-chevron-circle-right\\\"><\\/span>\"};\n            \n                    (function ($) {\n                        $(document).ready(function () {\n                            \n                            tpsInstance = new tps.createSlideshow(tpsOptions);\n                        });\n                    }(jQuery));\n                </script>\n 

What's the best way to strip out all the divs when passing this info to the new JSON? I only want what's inside the divs. How can I achieve this?

I tried searching for this but couldn't find anything. Basically, how can I write a JS script that takes the old JSON file as input and outputs a new JSON file containing only the properties I want, with most of the properties renamed as I explained?

Thanks for any help guys!


Solution

  • If you have a valid html string you can use this function to extract text:

    /**
     * Extracts the text of the first element matching `selector` in an HTML string.
     *
     * The markup is parsed inside a <template> element, whose content is inert:
     * unlike assigning `innerHTML` on a regular element of the live document,
     * this does not fetch images or run inline event handlers found in `html`.
     *
     * @param {string} selector - CSS selector of the element whose text is wanted.
     * @param {string} html - HTML markup to search.
     * @returns {string} Text content of the first matching element.
     * @throws {TypeError} If no element in `html` matches `selector`.
     */
    function extractText(selector, html) {
      const template = document.createElement('template');
      template.innerHTML = html;

      const match = template.content.querySelector(selector);
      if (match === null) {
        throw new TypeError(
          `extractText: no element matches selector "${selector}"`,
        );
      }

      // For nodes that are not rendered, innerText yields the same value as
      // textContent; textContent additionally needs no layout support.
      return match.textContent;
    }
    

    You can use it like this:

    // The HTML argument is abbreviated here ("..."); pass the complete HTML string
    // (for example the post's `content` value) in real use.
    const caption = extractText('p.wp-caption-text', "<div id=\"tps_nav_u...");
    // With the full HTML from the question, caption will be
    // "Ribau Esteves – Presidente da Câmara Municipal de Aveiro"
    

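    Mind you, this function will throw a TypeError if the selector does not match any element in the HTML (because querySelector returns null in that case). You will have to handle that accordingly, for example with a try/catch around the call.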