Search code examples
rjsonrjson

Convert a list to a dataframe


I read the data from a json file using this option

library("rjson")
result <- fromJSON(file = "input.json")

And the result is a list.

I would like to convert this list into a dataframe with colum names.

However, the list has many nested options. How is it possible to convert them into a dataframe?

Here an example of the first json element:

{ "id": 6119227, "node_id": "MDEwOlJlcG9zaXRvcnk2MTE5MjI3", "name": "sentiment", "full_name": "thisandagain/sentiment", "private": false, "owner": { "login": "thisandagain", "id": 747641, "node_id": "MDQ6VXNlcjc0NzY0MQ==", "avatar_url": "https://avatars.githubusercontent.com/u/747641?v=4", "gravatar_id": "", "url": "https://api.github.com/users/thisandagain", "html_url": "https://github.com/thisandagain", "followers_url": "https://api.github.com/users/thisandagain/followers", "following_url": "https://api.github.com/users/thisandagain/following{/other_user}", "gists_url": "https://api.github.com/users/thisandagain/gists{/gist_id}", "starred_url": "https://api.github.com/users/thisandagain/starred{/owner}{/repo}", "subscriptions_url": "https://api.github.com/users/thisandagain/subscriptions", "organizations_url": "https://api.github.com/users/thisandagain/orgs", "repos_url": "https://api.github.com/users/thisandagain/repos", "events_url": "https://api.github.com/users/thisandagain/events{/privacy}", "received_events_url": "https://api.github.com/users/thisandagain/received_events", "type": "User", "site_admin": false }, "html_url": "https://github.com/thisandagain/sentiment", "description": "AFINN-based sentiment analysis for Node.js.", "fork": false, "url": "https://api.github.com/repos/thisandagain/sentiment", "forks_url": "https://api.github.com/repos/thisandagain/sentiment/forks", "keys_url": "https://api.github.com/repos/thisandagain/sentiment/keys{/key_id}", "collaborators_url": "https://api.github.com/repos/thisandagain/sentiment/collaborators{/collaborator}", "teams_url": "https://api.github.com/repos/thisandagain/sentiment/teams", "hooks_url": "https://api.github.com/repos/thisandagain/sentiment/hooks", "issue_events_url": "https://api.github.com/repos/thisandagain/sentiment/issues/events{/number}", "events_url": "https://api.github.com/repos/thisandagain/sentiment/events", "assignees_url": "https://api.github.com/repos/thisandagain/sentiment/assignees{/user}", "branches_url": "https://api.github.com/repos/thisandagain/sentiment/branches{/branch}", "tags_url": "https://api.github.com/repos/thisandagain/sentiment/tags", "blobs_url": "https://api.github.com/repos/thisandagain/sentiment/git/blobs{/sha}", "git_tags_url": "https://api.github.com/repos/thisandagain/sentiment/git/tags{/sha}", "git_refs_url": "https://api.github.com/repos/thisandagain/sentiment/git/refs{/sha}", "trees_url": "https://api.github.com/repos/thisandagain/sentiment/git/trees{/sha}", "statuses_url": "https://api.github.com/repos/thisandagain/sentiment/statuses/{sha}", "languages_url": "https://api.github.com/repos/thisandagain/sentiment/languages", "stargazers_url": "https://api.github.com/repos/thisandagain/sentiment/stargazers", "contributors_url": "https://api.github.com/repos/thisandagain/sentiment/contributors", "subscribers_url": "https://api.github.com/repos/thisandagain/sentiment/subscribers", "subscription_url": "https://api.github.com/repos/thisandagain/sentiment/subscription", "commits_url": "https://api.github.com/repos/thisandagain/sentiment/commits{/sha}", "git_commits_url": "https://api.github.com/repos/thisandagain/sentiment/git/commits{/sha}", "comments_url": "https://api.github.com/repos/thisandagain/sentiment/comments{/number}", "issue_comment_url": "https://api.github.com/repos/thisandagain/sentiment/issues/comments{/number}", "contents_url": "https://api.github.com/repos/thisandagain/sentiment/contents/{+path}", "compare_url": "https://api.github.com/repos/thisandagain/sentiment/compare/{base}...{head}", "merges_url": "https://api.github.com/repos/thisandagain/sentiment/merges", "archive_url": "https://api.github.com/repos/thisandagain/sentiment/{archive_format}{/ref}", "downloads_url": "https://api.github.com/repos/thisandagain/sentiment/downloads", "issues_url": "https://api.github.com/repos/thisandagain/sentiment/issues{/number}", "pulls_url": "https://api.github.com/repos/thisandagain/sentiment/pulls{/number}", "milestones_url": "https://api.github.com/repos/thisandagain/sentiment/milestones{/number}", "notifications_url": "https://api.github.com/repos/thisandagain/sentiment/notifications{?since,all,participating}", "labels_url": "https://api.github.com/repos/thisandagain/sentiment/labels{/name}", "releases_url": "https://api.github.com/repos/thisandagain/sentiment/releases{/id}", "deployments_url": "https://api.github.com/repos/thisandagain/sentiment/deployments", "created_at": "2012-10-08T03:45:22Z", "updated_at": "2023-01-20T12:34:09Z", "pushed_at": "2020-05-18T13:55:58Z", "git_url": "git://github.com/thisandagain/sentiment.git", "ssh_url": "[email protected]:thisandagain/sentiment.git", "clone_url": "https://github.com/thisandagain/sentiment.git", "svn_url": "https://github.com/thisandagain/sentiment", "homepage": "", "size": 416, "stargazers_count": 2560, "watchers_count": 2560, "language": "JavaScript", "has_issues": true, "has_projects": true, "has_downloads": true, "has_wiki": false, "has_pages": false, "has_discussions": false, "forks_count": 319, "mirror_url": null, "archived": false, "disabled": false, "open_issues_count": 16, "license": { "key": "mit", "name": "MIT License", "spdx_id": "MIT", "url": "https://api.github.com/licenses/mit", "node_id": "MDc6TGljZW5zZTEz" }, "allow_forking": true, "is_template": false, "web_commit_signoff_required": false, "topics": [ "afinn", "analysis", "javascript", "nlp", "sentiment", "sentiment-analysis" ], "visibility": "public", "forks": 319, "open_issues": 16, "watchers": 2560, "default_branch": "develop", "score": 1.0 }

Solution

  • I have saved your_json-element_ as input.json. With the package RJSONIO this file is read in as json. Your example is deeply nested json.

    This list may consist of lists within a list. For such cases the list is flattened with str_c() and transformed into tibble, if its length is 1 it is directly transformed into a tibble.

    With purrr 's list_cbind() this is all bind into one DF with - in this case - one row and 80 columns.

    library(tidyverse)
    library(RJSONIO)
    
    json <- fromJSON("input.json", nullValue = NA)
    
    dat <- lapply(json, function(j) {
      if (length(j) > 1) {
        j |>
          stringr::str_c(collapse = ",") |>
          as_tibble()
      } else {
        as_tibble(j)
      }
    })
    
    purrr::list_cbind(dat)
    
    # A tibble: 1 × 80
      id$value node_id…¹ name$…² full_…³ priva…⁴ owner…⁵ html_…⁶ descr…⁷
         <dbl> <chr>     <chr>   <chr>   <lgl>   <chr>   <chr>   <chr>  
    1  6119227 MDEwOlJl… sentim… thisan… FALSE   thisan… https:… AFINN-…
    # … with 72 more variables: fork <tibble[,1]>, url <tibble[,1]>,
    #   forks_url <tibble[,1]>, keys_url <tibble[,1]>,
    #   collaborators_url <tibble[,1]>, teams_url <tibble[,1]>,
    #   hooks_url <tibble[,1]>, issue_events_url <tibble[,1]>,
    #   events_url <tibble[,1]>, assignees_url <tibble[,1]>,
    #   branches_url <tibble[,1]>, tags_url <tibble[,1]>,
    #   blobs_url <tibble[,1]>, git_tags_url <tibble[,1]>, …
    # ℹ Use `colnames()` to see all variable names
    

    NEW EDIT based on comment

    Here is another approach, which I have adopted from themockup

    library(tidyverse)
    library(jsonlite)
    library(httr)
    
    url_json <- "https://api.github.com/search/commits?q=sentiment&per_page=100&page=8"
    
    raw_json <- url_json |>
      httr::GET() |>
      httr::content()
    
    raw_json$items |> str(max.level = 1)
    #> List of 100
    
    rj <- raw_json |>
      discard(is_empty) |>
      map_if(is.data.frame, list) |>
      as_tibble() 
    
    df <- rj |>
      mutate(no = seq_along(rj$items)) |>
      unnest_wider(items) 
    
    df
    #> # A tibble: 100 × 14
    #>    total…¹ incom…² url   sha   node_id html_…³ comme…⁴ commit       author      
    #>      <int> <lgl>   <chr> <chr> <chr>   <chr>   <chr>   <list>       <list>      
    #>  1   44460 TRUE    http… 5233… MDY6Q2… https:… https:… <named list> <named list>
    #>  2   44460 TRUE    http… c27c… MDY6Q2… https:… https:… <named list> <named list>
    #>  3   44460 TRUE    http… 9779… MDY6Q2… https:… https:… <named list> <named list>
    #>  4   44460 TRUE    http… 5533… MDY6Q2… https:… https:… <named list> <named list>
    #>  5   44460 TRUE    http… d815… MDY6Q2… https:… https:… <named list> <named list>
    #>  6   44460 TRUE    http… eb12… MDY6Q2… https:… https:… <named list> <named list>
    #>  7   44460 TRUE    http… 70a3… MDY6Q2… https:… https:… <named list> <named list>
    #>  8   44460 TRUE    http… 8a72… MDY6Q2… https:… https:… <named list> <NULL>      
    #>  9   44460 TRUE    http… 4ae3… MDY6Q2… https:… https:… <named list> <named list>
    #> 10   44460 TRUE    http… b324… MDY6Q2… https:… https:… <named list> <NULL>      
    #> # … with 90 more rows, 5 more variables: committer <list>, parents <list>,
    #> #   repository <list>, score <dbl>, no <int>, and abbreviated variable names
    #> #   ¹​total_count, ²​incomplete_results, ³​html_url, ⁴​comments_url
    
    glimpse(df)
    #> Rows: 100
    #> Columns: 14
    #> $ total_count        <int> 44460, 44460, 44460, 44460, 44460, 44460, 44460, 44…
    #> $ incomplete_results <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…
    #> $ url                <chr> "https://api.github.com/repos/esforexco/esforexco.g…
    #> $ sha                <chr> "523339a9eef676382f86caa7cb4019840089eda1", "c27ce7…
    #> $ node_id            <chr> "MDY6Q29tbWl0MjI1MzU4MjMwOjUyMzMzOWE5ZWVmNjc2MzgyZj…
    #> $ html_url           <chr> "https://github.com/esforexco/esforexco.github.io/c…
    #> $ comments_url       <chr> "https://api.github.com/repos/esforexco/esforexco.g…
    #> $ commit             <list> ["https://api.github.com/repos/esforexco/esforexco…
    #> $ author             <list> ["justprodev", 58386042, "MDQ6VXNlcjU4Mzg2MDQy", "…
    #> $ committer          <list> ["justprodev", 58386042, "MDQ6VXNlcjU4Mzg2MDQy", "…
    #> $ parents            <list> [["https://api.github.com/repos/esforexco/esforexc…
    #> $ repository         <list> [225358230, "MDEwOlJlcG9zaXRvcnkyMjUzNTgyMzA=", "e…
    #> $ score              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
    #> $ no                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …