Code used to analyze the information in the CrossRef database on works that are translations or that have translations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

88 lines
3.9 KiB

analyse_single_source <- function(crossRefData, dataTypeStr, verbose = FALSE) {
  # Analyses CrossRef records that were retrieved either because their metadata
  # indicates they are a translation or because it indicates they have one.
  #
  # Printed analysis: the number of works per publication year, the average
  # number of articles per unique publisher and per unique journal (overall and
  # for works published after 2020), the number of distinct languages and the
  # most frequent languages.
  #
  # Arguments:
  #   crossRefData: data frame with one row per work. The columns read here are
  #                 `published.date-parts`, `publisher`, `container-title`
  #                 (CrossRef's name for the journal title) and `language`.
  #   dataTypeStr:  label appended to the histogram axis title and to the
  #                 headings of the verbose output.
  #   verbose:      set to TRUE to additionally draw a histogram of the
  #                 publication years and print the ten most common publishers
  #                 and journals for several periods.
  #
  # Returns 0 invisibly (the value the function has always returned); the
  # function is called for its printed output.

  # Keep at most the first n entries of an already sorted table. Indexing with
  # 1:n instead would pad the result with NA entries when the table is shorter.
  topN <- function(sortedTable, n) {
    sortedTable[seq_len(min(n, length(sortedTable)))]
  }

  noValues <- nrow(crossRefData) # Number of works

  # date-parts is an incomplete date list, which can contain either just a
  # year, a year and a month, or a year, month and day of month. Only the year
  # (first entry) is used; works without a date get NA.
  publicationYearTmp <- crossRefData$`published.date-parts`
  publicationYears <- vapply(
    seq_len(noValues),
    function(i) {
      year <- unlist(publicationYearTmp[[i]])[1]
      if (length(year) == 0) NA_integer_ else as.integer(year)
    },
    integer(1)
  )
  if (isTRUE(verbose) && any(!is.na(publicationYears))) {
    hist(publicationYears, main = "", ylab = "Count",
         xlab = paste("Publication year - ", dataTypeStr))
  }
  print(sort(table(publicationYears), decreasing = TRUE))

  # Period masks. Works with an unknown year belong to no period; without the
  # is.na() guard they would show up as NA rows when used as an index.
  yearKnown <- !is.na(publicationYears)
  indexLate <- yearKnown & publicationYears > 2020
  indexMiddle <- yearKnown & publicationYears < 2021 & publicationYears > 2008
  index2008 <- yearKnown & publicationYears == 2008

  # Publishers
  publisherTmp <- crossRefData$publisher
  noUniquePublishers <- length(unique(publisherTmp))
  articlesPerPublisher <- noValues / noUniquePublishers
  print(paste("Articles per publisher:", format(articlesPerPublisher, digits = 3)))
  if (isTRUE(verbose)) {
    print(paste("Most common publisher names - all - ", dataTypeStr))
    print(topN(sort(table(publisherTmp), decreasing = TRUE), 10))
    print(paste("Most common publisher names - 2021, 2022 - ", dataTypeStr))
    print(topN(sort(table(publisherTmp[indexLate]), decreasing = TRUE), 10))
    print(paste("Most common publisher names - 2009-2020 - ", dataTypeStr))
    print(topN(sort(table(publisherTmp[indexMiddle]), decreasing = TRUE), 10))
    print(paste("Most common publisher names - 2008 - ", dataTypeStr))
    print(topN(sort(table(publisherTmp[index2008]), decreasing = TRUE), 10))
  }

  # Journals
  journalTmp <- crossRefData$`container-title`
  noUniqueJournals <- length(unique(journalTmp))
  articlesPerJournal <- noValues / noUniqueJournals
  print(paste("Articles per journal:", format(articlesPerJournal, digits = 3)))

  # Same ratio restricted to works published after 2020: both the number of
  # works and the number of unique journals are counted within that period.
  noUniqueJournalsLate <- length(unique(journalTmp[indexLate]))
  articlesPerJournalLate <- sum(indexLate) / noUniqueJournalsLate
  print(paste("Articles per journal after 2020:",
              format(articlesPerJournalLate, digits = 3)))

  # Sometimes journals have multiple names. To simplify the analysis we only
  # look at the first. Works without a journal name get an empty string.
  firstJournalName <- vapply(
    seq_len(noValues),
    function(i) {
      name <- journalTmp[[i]][1]
      if (is.null(name)) "" else as.character(name)
    },
    character(1)
  )
  if (isTRUE(verbose)) {
    print(paste("Most common journal names - all - ", dataTypeStr))
    print(topN(sort(table(firstJournalName), decreasing = TRUE), 10))
    print(paste("Most common journal names - 2021, 2022 - ", dataTypeStr))
    print(topN(sort(table(firstJournalName[indexLate]), decreasing = TRUE), 10))
    print(paste("Most common journal names - 2009-2020 - ", dataTypeStr))
    print(topN(sort(table(firstJournalName[indexMiddle]), decreasing = TRUE), 10))
    print(paste("Most common journal names - 2008 - ", dataTypeStr))
    print(topN(sort(table(firstJournalName[index2008]), decreasing = TRUE), 10))
  }

  # Languages
  languageTmp <- crossRefData$language
  noLanguages <- length(unique(languageTmp))
  print(noLanguages)
  print(topN(sort(table(languageTmp), decreasing = TRUE), 6))

  invisible(0)
}