# Code used to analyze the information in the CrossRef database on works that are translations or that have translations.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

303 lines
14 KiB

clean_data <- function(crossRefData) {
  # Clean the data extracted from the CrossRef database: remove test data, entries
  # whose data transfer failed, superfluous relation IDs and double DOIs; repair
  # metadata that was put in the wrong variables; replace empty values by ""; and
  # remove HTML code, line breaks and tabs from the titles (via improve_formatting).
  #
  # Args:
  #   crossRefData: data frame with the columns publisher, source, DOI, title,
  #     container-title, short-container-title, relation.is-translation-of and
  #     relation.has-translation. The title columns and the relation columns are
  #     accessed per entry with [[i]]; the relation entries are either NULL or a
  #     table with the columns id-type and id.
  #
  # Returns:
  #   The cleaned data frame. The three title columns are flattened with unlist()
  #   and the DOIs are lower case. When no entries are left after filtering, the
  #   (empty) data frame is returned immediately, without flattening.

  # Initialize
  verbose = FALSE

  # Helper: the single value to store for a title-like field. Gives "" when there
  # is no value, the values joined by a space when there are several, and the
  # value itself otherwise.
  collapse_values = function(values) {
    if ( length(values) == 0 ) {
      return("")
    }
    if ( length(values) > 1 ) {
      return(paste(values, collapse=" "))
    }
    values
  }

  # There are test entries that need to be removed.
  crossRefData = subset(crossRefData, crossRefData$publisher != "Test accounts")
  crossRefData = subset(crossRefData, crossRefData$publisher != "Society of Psychoceramics")
  # Some DOIs mentioned in has-translation and is-translation-of are not in the CrossRef database and should be removed.
  crossRefData = subset(crossRefData, crossRefData$source != "API - Data transfer failed.")
  noVal = nrow(crossRefData)
  if ( noVal == 0 ) {
    return(crossRefData) # Nothing left to clean
  }

  # Someone accidentally put the wrong information in the journal title for two entries.
  # isTranslationData$'container-title'[[1941]]
  # [1] "Vol. 54, Issue 1 (Obituaries, News &amp; Commentaries, Community Reports)"
  # [2] "Bulletin of the AAS"
  # and the entry whose journal title is a full book reference (second constant below).
  wrongTitleAAS = "Vol. 54, Issue 1 (Obituaries, News &amp; Commentaries, Community Reports)"
  wrongTitleSintaxa = "Frieda Edelstein, Carmen Fenechiu & Dana LaCourse Munteanu, Sintaxă latină. I. Sintaxa cazurilor, Presa Universitară Clujeană, Cluj-Napoca, 2012, 166 p.; II. Sintaxa modurilor, Presa Universitară Clujeană, Cluj-Napoca, 2014, 218 p."
  for (i in seq_len(noVal)) {
    if ( !is.null(crossRefData$'container-title'[[i]]) ) {
      # isTRUE() keeps an empty or NA journal title from breaking the if statement.
      if ( isTRUE(crossRefData$'container-title'[[i]][1] == wrongTitleAAS) ) {
        crossRefData$'container-title'[[i]] = "Bulletin of the AAS"
      }
      # The title of the next entry points to two articles. I hope I have summarized this in a useful way. Could also make it two entries pointing to the same translation.
      if ( isTRUE(crossRefData$'container-title'[[i]][1] == wrongTitleSintaxa) ) {
        crossRefData$'container-title'[[i]] = "Sintaxă latină. I. Sintaxa cazurilor II. Sintaxa modurilor"
        print(crossRefData$'container-title'[[i]])
      }
    }
  }

  # Set NULL entries to an empty string.
  # The columns are only flattened with unlist() further below, after entries with
  # multiple values have been combined; flattening the titles here would fail for
  # an entry with more than one title.
  for (i in seq_len(noVal)) {
    if ( is.null(crossRefData$title[[i]]) ) {
      crossRefData$title[[i]] = ""
    }
    if ( is.null(crossRefData$`container-title`[[i]]) ) {
      crossRefData$`container-title`[[i]] = ""
    }
    if ( is.null(crossRefData$`short-container-title`[[i]]) ) {
      crossRefData$`short-container-title`[[i]] = ""
    }
  }

  # Remove hard line breaks, tabs and HTML from titles (and maybe other items)
  for (i in seq_len(noVal)) {
    crossRefData$title[[i]] = improve_formatting(crossRefData$title[[i]])
    crossRefData$'container-title'[[i]] = improve_formatting(crossRefData$'container-title'[[i]])
    crossRefData$'short-container-title'[[i]] = improve_formatting(crossRefData$'short-container-title'[[i]])
  }

  # Remove cases where is-translation-of items have multiple entries, which nearly always are identical DOIs.
  for (i in seq_len(noVal)) {
    # Remove multiple DOIs
    isDoi = crossRefData$'relation.is-translation-of'[[i]]$'id-type' == "doi"
    noDOIs = sum(isDoi)
    if ( noDOIs > 1 ) {
      if ( verbose == TRUE ) {
        print(paste("Before1: ", crossRefData$'relation.is-translation-of'[[i]]))
      }
      # Pick the first DOI to be the only DOI. The mask has one value per ID (not
      # per DOI), so it is not recycled when DOIs and other IDs are mixed.
      keepRow = !isDoi
      keepRow[which(isDoi)[1]] = TRUE
      crossRefData$'relation.is-translation-of'[[i]] = subset(crossRefData$'relation.is-translation-of'[[i]], keepRow)
      if ( verbose == TRUE ) {
        print(paste("After1: ", crossRefData$'relation.is-translation-of'[[i]]))
      }
      # There are a few cases where the two DOIs do make sense (an article and a
      # preprint). We could salvage those with more complicated code: identical
      # DOIs are simply doubles, while for two different DOIs an additional entry
      # would be needed for every item. That is quite a lot of work for a few more DOIs.
    }
    # Remove multiple entries when there are not multiple DOIs (but other IDs)
    # For example a case with a DOI and a URI
    # 1645 PubPub 10.21428/671d579e.5a08c432 Postface - The CFRP, from Archeology to Futurology https://cfrp.pubpub.org/pub/cioahqzh 10.21428/671d579e.ad4280ba https://cfrp.pubpub.org/pub/cioahqzh 10.21428/671d579e.ad4280ba
    noItems = length(crossRefData$'relation.is-translation-of'[[i]]$'id-type')
    if ( noItems > 1 ) {
      keepRow = crossRefData$'relation.is-translation-of'[[i]]$'id-type' == "doi" # If there is a DOI, pick that entry
      if ( sum(keepRow) == 0 ) {
        keepRow[1] = TRUE # Otherwise pick the first ID
      }
      if ( verbose == TRUE ) {
        print(paste("Before2: ", crossRefData$'relation.is-translation-of'[[i]]))
      }
      crossRefData$'relation.is-translation-of'[[i]] = subset(crossRefData$'relation.is-translation-of'[[i]], keepRow)
      if ( verbose == TRUE ) {
        print(paste("After2: ", crossRefData$'relation.is-translation-of'[[i]]))
      }
    }
  }

  # Remove entries where the id-type is not DOI or URI. There are 10 such entries, two are URIs, but dead, several are "other" and do not give an id and there is one with an ISBN and one with an ISSN.
  # Some of these may be found with a web search, but for now let's ignore these problematic cases.
  # Entries without an is-translation-of relation are kept.
  keepEntry = vector(mode="logical", length=noVal)
  for (i in seq_len(noVal)) {
    relation = crossRefData$'relation.is-translation-of'[[i]]
    if ( is.null(relation) ) {
      keepEntry[i] = TRUE
    } else {
      # %in% never gives NA, so a missing id-type drops the entry instead of stopping the script.
      keepEntry[i] = isTRUE(relation$'id-type'[1] %in% c("doi", "uri"))
    }
  }
  crossRefData = subset(crossRefData, keepEntry)
  noVal = nrow(crossRefData)
  if ( noVal == 0 ) {
    return(crossRefData) # Nothing left to clean
  }

  # The DOI mentioned as translation does not have the DOI prefix, which is the same as the prefix of the original.
  # 15 such cases from the same publisher.
  # 3876 Keldysh Institute of Applied Mathematics KIAM Prepr. 10.20948/prepr-2018-177-e Two variants of parallel implementation of high-order accurate bicompact schemes for multi-dimensional inhomogeneous transport equation prepr-2018-177
  # Should be: 10.20948/prepr-2018-177
  # DOI 10.20948/prepr-2016-38-e Translation DOI: prepr-2016-38
  # Should be: 10.20948/prepr-2016-38 (this DOI actually works)
  # 3897 Keldysh Institute of Applied Mathematics KIAM Prepr. 10.20948/prepr-2016-133-e Study of the accuracy of active magnetic damping algorithm prepr-2016-133
  # Should be: 10.20948/prepr-2016-133
  if ( !is.null(crossRefData$DOI) ) {
    for (i in seq_len(noVal)) {
      if ( length(grep("^10.20948/prepr.*", crossRefData$DOI[[i]])) > 0 ) {
        # The prefix is only added when none of the IDs of the relation has it yet.
        if ( !is.null(crossRefData$`relation.has-translation`[[i]]$id) ) {
          if ( length(grep("^10.20948/", crossRefData$`relation.has-translation`[[i]]$id)) == 0 ) {
            crossRefData$`relation.has-translation`[[i]]$id = paste0("10.20948/", crossRefData$`relation.has-translation`[[i]]$id)
          }
        }
        if ( !is.null(crossRefData$`relation.is-translation-of`[[i]]$id) ) {
          if ( length(grep("^10.20948/", crossRefData$`relation.is-translation-of`[[i]]$id)) == 0 ) {
            crossRefData$`relation.is-translation-of`[[i]]$id = paste0("10.20948/", crossRefData$`relation.is-translation-of`[[i]]$id)
          }
        }
      }
    }
  }

  # Sometimes journals have multiple names and article titles are also sometimes double. To simplify the database, we combine them into one name.
  for (i in seq_len(noVal)) {
    crossRefData$'container-title'[[i]] = collapse_values(crossRefData$'container-title'[[i]])
    crossRefData$'short-container-title'[[i]] = collapse_values(crossRefData$'short-container-title'[[i]])
    crossRefData$'title'[[i]] = collapse_values(crossRefData$'title'[[i]])
  } # For all values
  # Every entry now holds exactly one value, so the columns can be flattened.
  crossRefData$'container-title' = unlist(crossRefData$'container-title')
  crossRefData$'short-container-title' = unlist(crossRefData$'short-container-title')
  crossRefData$'title' = unlist(crossRefData$'title')

  # Remove double/multiple entries: keep the first entry of every DOI.
  # Entries with a missing DOI are never treated as doubles of each other.
  isDuplicate = rep(FALSE, noVal)
  if ( !is.null(crossRefData$DOI) ) {
    isDuplicate = duplicated(crossRefData$DOI) & !is.na(crossRefData$DOI)
  }
  if ( verbose == TRUE ) {
    for ( doubleDOI in unique(crossRefData$DOI[isDuplicate]) ) {
      print(paste0(sum(crossRefData$DOI == doubleDOI, na.rm=TRUE), ": ", doubleDOI))
    }
  }
  crossRefData = subset(crossRefData, isDuplicate==FALSE)
  noVal = nrow(crossRefData)

  # Set all (case insensitive) DOIs to lower case to make comparisons easier
  for (i in seq_len(noVal)) {
    crossRefData$DOI[i] = tolower(crossRefData$DOI[i])
    original = crossRefData$'relation.is-translation-of'[[i]]
    if ( !is.null(original) ) {
      if ( isTRUE(original$'id-type'[1] == "doi") ) {
        crossRefData$'relation.is-translation-of'[[i]]$'id' = tolower(original$'id')
      }
    }
    translations = crossRefData$'relation.has-translation'[[i]]
    if ( !is.null(translations) ) {
      # Only IDs of type DOI are changed; URIs and other IDs keep their case.
      doiRows = which(translations$'id-type' == "doi")
      if ( length(doiRows) > 0 ) {
        crossRefData$'relation.has-translation'[[i]]$'id'[doiRows] = tolower(translations$'id'[doiRows])
      }
    }
  } # end loop over all entries
  return(crossRefData)
}
improve_formatting <- function(dataString) {
  # Tidy up a title-like string (or character vector; every element is treated
  # the same way) taken from the database.
  #
  # Steps, in this order:
  #   1. drop the HTML tags <i>, <div> and <b> (opening and closing);
  #      <sub> and <sup> are left alone, as they may be informative
  #   2. turn line breaks and tabs into spaces
  #   3. double every double quotation mark
  #   4. squeeze runs of two or more spaces into a single space
  #   5. decode the HTML entities &amp;, &gt; and &lt; (in that order)
  #
  # Returns the cleaned-up value.
  cleaned <- gsub("</?(i|div|b)>", "", dataString)

  # End of line and tab characters occur in the middle of titles and the like.
  for (whitespaceChar in c("\n", "\t")) {
    cleaned <- gsub(whitespaceChar, " ", cleaned, fixed = TRUE)
  }

  cleaned <- gsub('\"', '""', cleaned, fixed = TRUE)
  cleaned <- gsub(" {2,}", " ", cleaned)

  # Entity -> character; applied one after the other in the order listed.
  entityTable <- c("&amp;" = "&", "&gt;" = ">", "&lt;" = "<")
  for (entity in names(entityTable)) {
    cleaned <- gsub(entity, entityTable[[entity]], cleaned, fixed = TRUE)
  }

  return(cleaned)
}
# Fixed
## Fixed by giving all entries double quotation marks.
# Probably a problem for the text table:
# isTranslationData$title[[3572]]
# [1] "\"Ceci n’est pas un oiseau\" – The judge as a critic and the work of art concept in tax law"
# for (i in 1:noVal) {
#
# }