Project Metadata
In this vignette we will ingest project metadata from a CSV, restructure the data, and then create a json object.
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> filter, lag
#> The following objects are masked from 'package:base':
#> intersect, setdiff, setequal, union
Load in the CSV and clean it up
Our example data requires some light cleaning to make transforming it into JSON easier.
# Read the example project metadata shipped with the package.
project_metadata <- read.csv("../inst/extdata/example_data/example_project_metadata.csv")

## turn empty strings into NAs in the group field
project_metadata <- project_metadata |>
  dplyr::mutate(
    Group = dplyr::case_when(
      Group != "" ~ Group,
      # anything else (empty string or already missing) becomes NA so that
      # `tidyr::fill` below treats it as a gap to be filled
      TRUE ~ NA_character_
    )
  )

## use `fill` to carry each Group label down onto the rows beneath it that
## were left blank, so every row ends up with a group
project_metadata_filled <- tidyr::fill(data = project_metadata, Group)
Restructure data
The validation schema is expecting JSON, so we have to restructure the data into a list that can be converted to JSON.
For Creators, Resources, and Funding References, it's possible to have
multiple entities in each group. In our example data, there are two
creators and three funding references. So we need to pull out the
`entity_id`s for the creators and funding references, then
clean up the `Group` field so it can be used as a general
category for Creators, Resources, and Funding References.
# get ids for components of a group.
# A Group label may carry a trailing entity number; that number becomes
# `entity_id`, and labels without a number are treated as entity "1".
project_metadata_ids <- project_metadata_filled |>
  dplyr::mutate(
    # `[0-9]+` (not `[0-9]`) so that ids of 10 and above are captured whole
    entity_id = stringr::str_extract(string = Group, pattern = "[0-9]+"),
    # make sure that there are no NA entity IDs
    entity_id = dplyr::case_when(
      is.na(entity_id) ~ "1",
      TRUE ~ entity_id
    )
  ) |>
  # drop entity ids from group field and convert to camel case
  dplyr::mutate(
    Group = stringr::str_replace_all(
      string = Group,
      pattern = " [0-9]+",
      replacement = ""
    ),
    Group = snakecase::to_lower_camel_case(Group)
  )

## split dataframe by Group for further processing
project_metadata_list <- split(project_metadata_ids, project_metadata_ids$Group)
# The `get_entity` function creates standard entities that will be easier to
# transform to json.
# Each element of `project_metadata_list` is the data frame for one Group.
project_metadata_list_entities <- purrr::map(project_metadata_list, function(x) {
  if (all(x$entity_id == "1")) {
    # only one entity in this group: build it directly
    out <- get_entity(x)
  } else {
    # several entities: build one per entity_id. Names are dropped so the
    # result is an unnamed list (presumably so it serialises as a JSON array
    # rather than an object keyed by id -- confirm against `prep_for_json`)
    x_list <- split(x, x$entity_id)
    names(x_list) <- NULL
    out <- purrr::map(x_list, get_entity)
  }
  out
})
Make the json!
In a simpler world - you could just run the following code and it would work.
## the naive attempt: hand the entity list straight to jsonlite
jsonlite::toJSON(
  project_metadata_list_entities,
  pretty = TRUE,
  dataframe = "columns"
)
BUT because DataCite’s structures are more complex, we need to do
some prep. Luckily, there are a host of prep functions that already
exist in this package! These mostly tag list items
and/or wrap things in lists so that, when
converted to JSON, they match the data standard’s expected formats.
# run the package's prep step on the entities, then serialise to pretty JSON
project_metadata_list_entities |>
  prep_for_json() |>
  jsonlite::toJSON(pretty = TRUE)
#> {
#> "creators": [
#> {
#> "name": "Daniel J. Becker",
#> "givenName": "Daniel J.",
#> "familyName": "Becker",
#> "affiliation": [
#> {
#> "name": "Department of Biology, University of Oklahoma, Norman, OK, USA",
#> "affiliationIdentifier": ""
#> }
#> ],
#> "nameIdentifiers": [
#> {
#> "nameIdentifier": "",
#> "nameIdentifierScheme": "ORCID"
#> }
#> ]
#> },
#> {
#> "name": "Guang-Sheng Lei",
#> "givenName": "Guang-Sheng",
#> "familyName": "Lei",
#> "affiliation": [
#> {
#> "name": "Department of Pathology and Laboratory Medicine, Indiana University School of Medicine, Indianapolis, IN, USA",
#> "affiliationIdentifier": ""
#> }
#> ]
#> }
#> ],
#> "descriptions": [
#> {
#> "description": "Bats can harbor many pathogens without showing disease…",
#> "descriptionType": "Abstract"
#> }
#> ],
#> "fundingReferences": [
#> {
#> "funderName": "National Geographic Society",
#> "funderIdentifier": "",
#> "awardNumber": "NGS-55503R-19"
#> },
#> {
#> "funderName": "Indiana University",
#> "funderIdentifier": ""
#> },
#> {
#> "funderName": "College of Charleston",
#> "funderIdentifier": ""
#> }
#> ],
#> "identifiers": [
#> {
#> "identifier": ""
#> }
#> ],
#> "language": "en",
#> "methodology": {
#> "eventBased": false,
#> "archival": false
#> },
#> "publicationYear": "2022",
#> "rights": [
#> {
#> "rights": "CC0"
#> }
#> ],
#> "subjects": [
#> {
#> "subject": "Proteomics"
#> },
#> {
#> "subject": "Immune Response"
#> }
#> ],
#> "titles": [
#> {
#> "title": "Serum proteomics of coronavirus shedding in vampire bats (Desmodus rotundus)"
#> }
#> ]
#> }