Skip to contents

In this vignette we will ingest project metadata from a CSV, restructure the data, and then create a json object.

library(wddsWizard)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(stringr)

Load in the CSV and clean it up

Our example data requires some light cleaning to make transforming it into JSON easier.

project_metadata <- read.csv(
  "../inst/extdata/example_data/example_project_metadata.csv"
)

## turn empty strings into NAs in the Group field so they can be filled below
project_metadata <- project_metadata |>
  dplyr::mutate(Group = dplyr::case_when(
    Group != "" ~ Group,
    # a typed NA keeps every branch character, whatever the dplyr version
    TRUE ~ NA_character_
  ))

## use `fill` to complete the Group column: each missing value takes the last
## non-missing Group label above it

project_metadata_filled <- tidyr::fill(data = project_metadata, Group)

Restructure data

The validation schema is expecting JSON, so we have to restructure the data into a list that can be converted to JSON.

For Creators, Resources, and Funding References, it's possible to have multiple entities in each group. In our example data, there are two creators and three funding references. So we need to pull out the entity_ids for the creators and funding references, then clean up the Group field so it can be used as a general category for Creators, Resources, and Funding References.


# get ids for components of a group.
project_metadata_ids <- project_metadata_filled |>
  dplyr::mutate(
    # match the whole run of digits so that ids of 10 and above stay intact
    entity_id = stringr::str_extract(string = Group, pattern = "[0-9]+"),
    # rows whose Group has no number get the default id "1", so that there are
    # no NA entity IDs
    entity_id = dplyr::coalesce(entity_id, "1")
  ) |>
  # drop entity ids from group field and convert to camel case
  dplyr::mutate(
    Group = stringr::str_replace_all(
      string = Group,
      pattern = " [0-9]+",
      replacement = ""
    ),
    Group = snakecase::to_lower_camel_case(Group)
  )

## split dataframe by Group for further processing

project_metadata_list <- split(project_metadata_ids, project_metadata_ids$Group)


# The `get_entity` function creates standard entities that will be easier to
# transform into json

project_metadata_list_entities <- purrr::map(project_metadata_list, function(x) {
  # when every row carries id "1" the group is passed to `get_entity` as a whole
  if (all(x$entity_id == "1")) {
    return(get_entity(x))
  }

  # order the entities numerically: splitting on the raw character ids would
  # sort "10" ahead of "2"
  entity_ids <- unique(x$entity_id)
  entity_levels <- entity_ids[order(as.integer(entity_ids))]
  x_list <- split(x, factor(x$entity_id, levels = entity_levels))

  # drop the entity-id names so the result is an unnamed list of entities
  purrr::map(unname(x_list), get_entity)
})

Make the json!

In a simpler world - you could just run the following code and it would work.

## the naive route: serialise the entity list exactly as it stands
project_metadata_list_entities |>
  jsonlite::toJSON(pretty = TRUE, dataframe = "columns")

BUT because datacite’s structures are more complex, we need to do some prep. Luckily, there are a host of prep functions that already exist in this package! These mostly tag list items with jsonlite::unbox and/or wrap things in lists so that when converted to json, they match the data standard’s expected formats.

## run the package's prep step on the entity list, then serialise it
project_metadata_list_entities |>
  prep_for_json() |>
  jsonlite::toJSON(pretty = TRUE)
#> {
#>   "creators": [
#>     {
#>         "name": "Daniel J. Becker",
#>         "givenName": "Daniel J.",
#>         "familyName": "Becker",
#>         "affiliation": [
#>           {
#>             "name": "Department of Biology, University of Oklahoma, Norman, OK, USA",
#>             "affiliationIdentifier": "https://ror.org/02aqsxs83"
#>           }
#>         ],
#>         "nameIdentifiers": [
#>           {
#>             "nameIdentifier": "https://orcid.org/0000-0003-4315-8628",
#>             "nameIdentifierScheme": "ORCID"
#>           }
#>         ]
#>       },
#>     {
#>         "name": "Guang-Sheng Lei",
#>         "givenName": "Guang-Sheng",
#>         "familyName": "Lei",
#>         "affiliation": [
#>           {
#>             "name": "Department of Pathology and Laboratory Medicine, Indiana University School of Medicine, Indianapolis, IN, USA",
#>             "affiliationIdentifier": "https://ror.org/02ets8c94"
#>           }
#>         ]
#>       }
#>   ],
#>   "descriptions": [
#>     {
#>         "description": "Bats can harbor many pathogens without showing disease…",
#>         "descriptionType": "Abstract"
#>       }
#>   ],
#>   "fundingReferences": [
#>     {
#>         "funderName": "National Geographic Society",
#>         "funderIdentifier": "http://dx.doi.org/10.13039/100006733",
#>         "awardNumber": "NGS-55503R-19"
#>       },
#>     {
#>         "funderName": "Indiana University",
#>         "funderIdentifier": "http://dx.doi.org/10.13039/100006733"
#>       },
#>     {
#>         "funderName": "College of Charleston",
#>         "funderIdentifier": "http://dx.doi.org/10.13039/100009789"
#>       }
#>   ],
#>   "identifiers": [
#>     {
#>         "identifier": "https://www.ebi.ac.uk/pride/archive/projects/PXD031075"
#>       }
#>   ],
#>   "language": "en",
#>   "methodology": {
#>     "eventBased": false,
#>     "archival": false
#>   },
#>   "publicationYear": "2022",
#>   "rights": [
#>     {
#>         "rights": "CC0"
#>       }
#>   ],
#>   "subjects": [
#>     {
#>         "subject": "Proteomics"
#>       },
#>     {
#>         "subject": "Immune Response"
#>       }
#>   ],
#>   "titles": [
#>     {
#>       "title": "Serum proteomics of coronavirus shedding in vampire bats (Desmodus rotundus)"
#>     }
#>   ]
#> }