Analyze_Translations_CrossR.../load_data.R

49 lines
2.1 KiB
R

load_data <- function(translationFileDirName, callstrTranslation) {
  # Load records from the CrossRef database that have been marked as either
  # having or being a translation. To avoid querying the database on every
  # run, the combined result is cached in the file `translationFileDirName`;
  # delete that file to force a fresh download.
  #
  # CrossRef hands out larger datasets in multiple pages. The cursor returned
  # with each page (`next-cursor`) selects the following page; the first
  # request uses the special cursor "*". Details on the API:
  # https://github.com/CrossRef/rest-api-doc/blob/master/api_format.md
  #
  # Args:
  #   translationFileDirName: path of the cache file (written with save(),
  #     read with load(); it stores the R object `translationData`).
  #   callstrTranslation: base API query URL; "&cursor=<cursor>" is appended
  #     for each page request.
  #
  # Returns:
  #   The combined data from all pages (one row per CrossRef record).
  if (!file.exists(translationFileDirName)) {
    # No cache yet: download the full dataset from CrossRef page by page.
    translationDataPages <- list()
    noValues <- +Inf   # Total number of records; updated from the first page.
    nextCursor <- "*"  # "*" requests the first page.
    i <- 1             # Counts the number of pages the database gives out.
    iVal <- 0          # Counts the number of records fetched so far.
    # BUG FIX: the condition previously read `NoValues` (capital N), an
    # undefined object in case-sensitive R, so the download loop errored out.
    while (iVal < noValues) {
      APIstr <- URLencode(paste0(callstrTranslation, "&cursor=", nextCursor))
      translationData <- fromJSON(APIstr, flatten = TRUE)
      # Sanity-check the response envelope before touching the payload.
      if (translationData[[1]] != "ok") {
        stop("Data transfer failed.")
      }
      if (translationData[[2]] != "work-list") {
        stop("Wrong kind of data.")
      }
      if (translationData[[3]] != "1.0.0") {
        stop("Data format version unknown")
      }
      # The 4th element holds the actual payload; unwrap it to simplify
      # data addressing below.
      translationData <- translationData[[4]]
      nextCursor <- translationData$`next-cursor`
      noValues <- translationData$`total-results`
      noValPerPage <- translationData$`items-per-page`
      # The records themselves live in the `items` element.
      translationDataPages[[i]] <- translationData$items
      iVal <- iVal + noValPerPage
      print(iVal)  # Progress indicator.
      i <- i + 1
    }
    # Combine data from all pages into one object and cache it.
    translationData <- rbind_pages(translationDataPages)
    save(translationData, file = translationFileDirName)
  } else {
    load(translationFileDirName)  # Restores `translationData` into this scope.
  }
  return(translationData)
}