# The code below analyses the information in the CrossRef database on works
# that are or have translations.
# analyse_crossref_translation_data.R
#
# This is the main function for the analysis of the scholarly works in the
# CrossRef DOI database that are or have translations in other languages.
# We analyse how many translations are available in which years, from which
# publishers, the completeness of the metadata, etc. The main results have been
# written up in this Wiki post:
# https://wiki.translatescience.org/wiki/Translated_articles_in_the_CrossRef_database
#
# Not to call the CrossRef API every time the function is executed, we store
# intermediary results in a directory "data" (location set below in the Initialize Section).
# If you delete them all data is retrieved again. This directory also contains tables
# with results. Other output goes to standard out, some functions have a verbose flag for more output.
#
# Author: Victor Venema
# License: GNU AFFERO GENERAL PUBLIC LICENSE, see license file for details.
# June 2022
# 1. Initialize
library(jsonlite) # The CrossRef API returns data in JSON format

# Note: debugSource() is an RStudio-only function and fails when this script
# is run with Rscript; source() is used consistently instead. (For debugging
# in RStudio, temporarily swap individual source() calls for debugSource().)
source("load_data.R")                # Retrieve all records whose metadata indicates they are or have translations
source("load_update_data_API.R")     # Retrieves all records with DOIs mentioned in the metadata of load_data.R
source("analyse_single_source.R")    # Simple analysis of one data source; use the verbose flag in the function for more output
source("clean_data.R")               # Fix data problems, such as incomplete DOIs, double information, mix ups, or C and HTML characters, etc.
source("save_csv_data.R")            # Save cleaned records that have metadata indicating they either are or have translations in a table
source("initialize_DOI_cache.R")     # Build a cache of the metadata retrieved per DOI
source("write_table_translations.R") # Save all translations in a big table

# How much output to generate for checking what the code does.
verbose <- FALSE
# Determine output file names.
# dataDir holds the cached API responses and the result tables; delete its
# contents to force a fresh retrieval from the CrossRef API.
dataDir <- "/home/victor/Documents/projects/switchboard_database/data"
# Use !dir.exists(...) instead of comparing with F: T/F are ordinary,
# reassignable variables in R and unsafe as logical constants.
if (!dir.exists(dataDir)) dir.create(dataDir, recursive = TRUE)
translationTableDirFileName <- file.path(dataDir, "translation_table.dat")
translationTableDirFileName2 <- file.path(dataDir, "translation_table2.dat")
APItranslationDirFileName <- file.path(dataDir, "API_translation.RData")
isTranslationFileDirName <- file.path(dataDir, "crossref_is_translation")   # Is a translation of another work
hasTranslationFileDirName <- file.path(dataDir, "crossref_has_translation") # Has translation
isTranslationFileDirNameRData <- paste0(isTranslationFileDirName, ".RData")
hasTranslationFileDirNameRData <- paste0(hasTranslationFileDirName, ".RData")
isTranslationFileDirNameCSV <- paste0(isTranslationFileDirName, ".csv")
hasTranslationFileDirNameCSV <- paste0(hasTranslationFileDirName, ".csv")
# CrossRef has around four thousand translations in their database and regularly
# checking their API for new ones is worthwhile.
# Details about the CrossRef API: https://github.com/CrossRef/rest-api-doc/blob/master/api_format.md
callstrIsTranslation <- "http://api.crossref.org/works?filter=relation.type:is-translation-of"
callstrHasTranslation <- "http://api.crossref.org/works?filter=relation.type:has-translation"
# 2. Retrieve, clean and save data
## 2.1 Retrieve (load_data/load_update_data_API use cached files when present)
isTranslationData <- load_data(isTranslationFileDirNameRData, callstrIsTranslation)
hasTranslationData <- load_data(hasTranslationFileDirNameRData, callstrHasTranslation)
APItranslationData <- load_update_data_API(APItranslationDirFileName, isTranslationData, hasTranslationData)
## 2.2 Clean data
isTranslationData <- clean_data(isTranslationData)
hasTranslationData <- clean_data(hasTranslationData)
APItranslationData <- clean_data(APItranslationData)
## 2.3 Save direct output
# Build the path from dataDir instead of repeating the absolute path, so the
# output location is configured in one place only.
write.csv(APItranslationData, file = file.path(dataDir, "API_translation.csv"))
save_csv_data(isTranslationData, isTranslationFileDirNameCSV)
save_csv_data(hasTranslationData, hasTranslationFileDirNameCSV)
# 3. Analysis
## 3.1 Analyse single data sources
analyse_single_source(isTranslationData, "is translation")
analyse_single_source(hasTranslationData, "has translation")
## 3.2 Analyse pairs of original and translation
noValuesHasTranslation <- nrow(hasTranslationData)
noValuesIsTranslation <- nrow(isTranslationData)
# Write tables with pairs (original and translation).
# Two tables are written. The first attempt wrote the table directly. The
# second creates a data.frame and writes that. The second method is used for
# the analysis as only here the columns isTranslation and hasTranslation are
# consistent. The first method could be removed.
doiCache <- initialize_DOI_cache(hasTranslationData, isTranslationData, APItranslationData)
# Build the path from dataDir instead of repeating the absolute path.
write.csv(doiCache, file = file.path(dataDir, "DOI_cache.csv"))
translationTableCon <- file(translationTableDirFileName, "wt")
writeLines("\"Translation ID\"\t\"Publisher Original\"\t\"Publisher Translation\"\t\"Journal Original\"\t\"Journal Translation\"\t\"DOI Original\"\t\"DOI Translation\"\t\"Title Original\"\t\"Title Translation\"\t\"Language Original\" \t\"Language Translation\"", con = translationTableCon)
translationTable1 <- write_table_translations(hasTranslationData, hasTranslationData$'relation.has-translation', doiCache, translationTableCon, "has")
translationTable2 <- write_table_translations( isTranslationData, isTranslationData$'relation.is-translation-of', doiCache, translationTableCon, "is")
close(translationTableCon)
translationTable <- rbind(translationTable1, translationTable2)
write.csv(translationTable, file = translationTableDirFileName2, row.names = FALSE)
# Analysis of the language metadata.
# Empty language strings mean "unknown"; recode them as NA so the tables
# report them in the NA bucket.
langOrig <- translationTable$languageOriginal
langOrig[langOrig == ""] <- NA
translationTable$languageOriginal <- langOrig
langTrans <- translationTable$languageTranslation
langTrans[langTrans == ""] <- NA
translationTable$languageTranslation <- langTrans
# Frequency tables of the languages, sorted ascending, keeping the NA count.
uniqueLanguagesOriginal <- sort(table(langOrig, useNA = "ifany", exclude = ""))
uniqueLanguagesTranslation <- sort(table(langTrans, useNA = "ifany", exclude = ""))
# Cross table: which original languages are translated into which languages.
print(table(translationTable$languageOriginal, translationTable$languageTranslation, useNA = "ifany"))
# lang = c(NA, "en", "ru", "no", "pt", "es", "it", "ja")
# Analysis of the publishers (using the publisher DOI prefix).
# The DOI prefix — the text before the first "/" — identifies the registrant
# (publisher) of the DOI.
noValues <- length(translationTable$translationID)

# Extract the registrant prefix (text before the first "/") of each DOI.
# Returns a character vector the same length as dois; NA for empty DOIs.
doi_prefix <- function(dois) {
  vapply(strsplit(as.character(dois), "/", fixed = TRUE),
         function(parts) parts[1], character(1))
}

# Vectorized replacement of the original per-row loop.
prefixOriginal <- doi_prefix(translationTable$DOIoriginal)
prefixTranslation <- doi_prefix(translationTable$DOItranslation)
prefixPair <- paste0(prefixOriginal, prefixTranslation)
prefixSame <- prefixOriginal == prefixTranslation
uo <- unique(prefixOriginal)   # uo: unique original
ut <- unique(prefixTranslation) # ut: unique translation
uPairs <- unique(prefixPair)   # unique pairs of original and translation publisher
noUniquePublishersOriginal <- length(uo)
noUniquePublishersTranslation <- length(ut)
noUniquePublishersBoth <- length(uPairs)
prefixBoth <- c(uo, ut)
uniquePrefixBoth <- unique(prefixBoth)
# Bug fix: the original code assigned tmp[[1]][1] — the prefix of the FIRST
# row only — to the whole prefixOriginal column, and stored the raw strsplit
# list in prefixTranslation. Store the per-row prefix vectors instead, so
# uniquePrefixes and noArticlesPerPublisher below are computed correctly.
translationTable$prefixOriginal <- prefixOriginal
translationTable$prefixTranslation <- prefixTranslation
uniquePrefixes <- unique(translationTable$prefixOriginal)
noArticlesPerPublisher <- length(translationTable$translationID) / length(uniquePrefixes)
## 3.3 Analysis of DOI and URI metadata
#
# The three analyses below (is-translation-of links, has-translation links of
# the has-translation records, has-translation links of the is-translation
# records) used three copies of the same counting loop; the logic is factored
# into one helper.
#
# count_id_types: count DOI and URI identifiers in a relation list column.
#
# relationList: a list with one element per record; each element is either
#   NULL or a data.frame with (at least) an 'id-type' column as delivered by
#   the CrossRef API ("doi", "uri", and rarely "other"/"isbn"/"issn").
# Returns a list with:
#   recordsWithDOI     - records pointing to at least one DOI
#   totalDOIs          - total number of DOI links in those records
#   recordsWithURIonly - records with no DOI, but at least one URI
#   totalURIs          - total number of URI links in those records
# Records with neither a DOI nor a URI (uncommon id-types) are not counted.
count_id_types <- function(relationList) {
  recordsWithDOI <- 0
  totalDOIs <- 0
  recordsWithURIonly <- 0
  totalURIs <- 0
  for (entry in relationList) {
    # Skip records without relation metadata or without an id-type field.
    if (is.null(entry) || is.null(entry$`id-type`)) next
    noDOIs <- sum(entry$`id-type` == "doi")
    noURIs <- sum(entry$`id-type` == "uri")
    if (noDOIs > 0) {
      recordsWithDOI <- recordsWithDOI + 1
      totalDOIs <- totalDOIs + noDOIs
    } else if (noURIs > 0) {
      recordsWithURIonly <- recordsWithURIonly + 1
      totalURIs <- totalURIs + noURIs
    }
  }
  list(recordsWithDOI = recordsWithDOI, totalDOIs = totalDOIs,
       recordsWithURIonly = recordsWithURIonly, totalURIs = totalURIs)
}

### Analysis of whether the metadata of is-translation-of points to a DOI or URI/URL
### "relation.is-translation-of"
counts <- count_id_types(isTranslationData$'relation.is-translation-of')
print(counts$recordsWithDOI)
print(counts$recordsWithURIonly)
print(counts$totalDOIs)
print(counts$totalURIs)
# 14 translations (with a DOI) are translations of works that only have a URI.
# 10 translations apparently do not have a DOI or a URI.
# The number of articles being translated is larger than the number of
# translations (115 larger). Most common is the same DOI is mentioned twice.
# Twice a DOI and a URI is given and the URI is dead. Once two DOIs are given
# and both go to a password protected webpage (under construction). Once two
# DOIs are given and one goes to the preprint and one to the published version.
# We have one case with a DOI pointing to a preprint and one DOI pointing to
# the published version. Articles having a preprint and journal version is
# likely more common. When people search on our homepage for one of these
# DOIs, they should also get the translation of the other, i.e. we should
# query CrossRef for alternative versions.

## Analysis of the metadata of has-translation records
## "relation.has-translation"
### How many translations do the records of articles having a translation point to?
counts <- count_id_types(hasTranslationData$'relation.has-translation')
print(counts$recordsWithDOI)
print(counts$recordsWithURIonly)
print(counts$totalDOIs)
print(counts$totalURIs)

## Do the records marked as being a translation also have information on their translations?
counts <- count_id_types(isTranslationData$'relation.has-translation')
print(counts$recordsWithDOI)
print(counts$recordsWithURIonly)
print(counts$totalDOIs)
print(counts$totalURIs)
# Only few translations claim they have a translation. So the original is
# normally not seen as a translation.
# "relation.is-translation-of"
print("end")
## Remarks on results
# Suspicious that the first year (2008) with translations is so recent and has so many translations. But it seems to be okay: they are all publications in e-Anatomy by the publisher IMAIOS, which publishes an anatomical atlas. https://www.imaios.com/en/e-Anatomy
# There are 49 publishers who have articles that are translated.
# They publish articles that have translations in 95 journals.
# We only know the language of the original for some works: the most common are Russian (278), English (121), Norwegian (114), Portuguese (8) and Italian (5).
# We have 4149 articles with a translation, but only 2152 translations, the main difference is the year 2021, where we have many Russian journals indicating they have translations.
# Likely these journals (especially the Russian ones with entries for 2021) also have translations for other/earlier years. As it is a limited set, we could contact them.
# The publisher with most translations is https://www.pleiades.online/, which is a publisher that specializes on translating Russian articles into English.
# Looks like publishers add translations together when the original is published, but do not go back into their catalogs to add older translations. For example, Pleiades Publishing states it is an international group of companies founded in 1971. But we only have recent translations, while translations are their main focus.
# The previously mentioned number of translations are all wrong. Often the entry "has-translation" is null. That it is present thus does not mean that an article has a translation.
# There are 17 articles with a translation with a DOI and there are 744 articles with a translation with a URI, they often have multiple translations so the number of translations with a URI is 5952; exactly 8 per original, which fits to imaios publishing their anatomy articles in 9 languages.
## Background information
# Example:
# $ :'data.frame': 1 obs. of 3 variables:
# ..$ id-type : chr "doi"
# ..$ id : chr "10.33048/smzh.2022.63.106"
# ..$ asserted-by: chr "subject"
# All id-types seem to be doi and all asserted-by's subject. Check and then focus on DOI.
# library(jsonlite)
# winners <- fromJSON("winners.json", flatten=TRUE)
# colnames(winners)
# install.packages("rjson")
#
# library("rjson")
# json_file <- "http://api.worldbank.org/country?per_page=10&region=OED&lendingtype=LNX&format=json"
## json_data <- fromJSON(paste(readLines(json_file), collapse=""))
# json_data <- fromJSON(file=json_file)
## Translated(?)
# [1] "reference-count" "publisher"
# [3] "issue" "short-container-title"
# [5] "DOI" "type"
# [7] "page" "source"
# [9] "is-referenced-by-count" "title"
# [11] "prefix" "author"
# [13] "member" "reference"
# [15] "container-title" "score"
# [17] "references-count" "URL"
# [19] "ISSN" "issn-type"
# [21] "subject" "license"
# [23] "update-policy" "volume"
# [25] "language" "link"
# [27] "alternative-id" "assertion"
# [29] "article-number" "original-title"
# [31] "indexed.date-parts" "indexed.date-time"
# [33] "indexed.timestamp" "content-domain.domain"
# [35] "content-domain.crossmark-restriction" "published-print.date-parts"
# [37] "created.date-parts" "created.date-time"
# [39] "created.timestamp" "published-online.date-parts"
# [41] "deposited.date-parts" "deposited.date-time"
# [43] "deposited.timestamp" "issued.date-parts"
# [45] "journal-issue.issue" "journal-issue.published-online.date-parts"
# [47] "journal-issue.published-print.date-parts" "relation.is-translation-of"
# [49] "published.date-parts" "accepted.date-parts"
## Translation
# [1] "reference-count" "publisher"
# [3] "issue" "short-container-title"
# [5] "DOI" "type"
# [7] "page" "source"
# [9] "is-referenced-by-count" "title"
# [11] "prefix" "author"
# [13] "member" "container-title"
# [15] "original-title" "score"
# [17] "references-count" "URL"
# [19] "ISSN" "issn-type"
# [21] "subject" "license"
# [23] "update-policy" "volume"
# [25] "reference" "language"
# [27] "link" "alternative-id"
# [29] "assertion" "abstract"
# [31] "archive" "article-number"
# [33] "subtitle" "funder"
# [35] "indexed.date-parts" "indexed.date-time"
# [37] "indexed.timestamp" "content-domain.domain"
# [39] "content-domain.crossmark-restriction" "published-print.date-parts"
# [41] "created.date-parts" "created.date-time"
# [43] "created.timestamp" "published-online.date-parts"
# [45] "deposited.date-parts" "deposited.date-time"
# [47] "deposited.timestamp" "issued.date-parts"
# [49] "journal-issue.issue" "journal-issue.published-online.date-parts"
# [51] "journal-issue.published-print.date-parts" "relation.is-translation-of"
# [53] "relation.continues" "relation.is-continued-by"
# [55] "published.date-parts" "accepted.date-parts"
# [57] "editor" "relation.is-reply-to"
# [59] "relation.has-translation" "edition-number"
# [61] "isbn-type" "ISBN"
# [63] "relation.references" "translator"
# [65] "resource.primary.URL" "description"
# [67] "publisher-location" "relation.is-identical-to"
# [69] "relation.is-referenced-by" "content-updated.date-parts"
# [71] "content-created.date-parts" "relation.has-comment"
# [73] "relation.has-part" "relation.is-version-of"
# [75] "published-other.date-parts" "clinical-trial-number"
# [77] "update-to" "institution"
# [79] "relation.has-version" "relation.is-manuscript-of"
# [81] "relation.has-preprint" "group-title"
# [83] "subtype" "relation.is-part-of"
# [85] "posted.date-parts"
## Wastebin
# TODO look why entry i=395, 702, 1186, 1436, 1737, 2797, 2978, 3015, 3152, 3190 are missing.
# The problem are uncommon id-types of the translation (other, isbn, issn)
# if ( i==395 | i==702 | i==1186 | i==1436 | i==1737 | i==2797 | i==2978 | i==3015 | i==3152 | i==3190) {
# a=0
# if ( verbose == TRUE ) {
# print(isTranslationData$'relation.is-translation-of'[[i]])
# }
# }
# if (noDOIs > 0) {
# # Write DOI of translation
# if ( nrow(work) > 0 ) {
# writeLines(paste0("\"", work$publisher, "\""), con=isTranslationTableCon, sep="\t")
# writeLines(paste0("\"", work$'short-container-title', " ", work$'container-title', "\""), con=isTranslationTableCon, sep="\t")
# # if ( !is.null(work$'short-container-title') ) {
# # writeLines(paste0("\"", work$'short-container-title', "\""), con=isTranslationTableCon, sep="\t")
# # } else {
# # if ( !is.null(work$'container-title') ) {
# # writeLines(paste0("\"", work$'container-title', "\""), con=isTranslationTableCon, sep="\t")
# # } else {
# # writeLines("", con=isTranslationTableCon, sep="\t")
# # }
# # }
# writeLines(paste0("\"", work$language, "\""), con=isTranslationTableCon, sep="\t")
# # if ( !is.null(work$language) ) {
# # writeLines(paste0("\"", work$language, "\""), con=isTranslationTableCon, sep="\t")
# # } else {
# # writeLines("", con = isTranslationTableCon, sep="\t")
# # }
# # work = get_information_doi(isTranslationData$'relation.is-translation-of'[[i]]$id, doiCache)
# } else {
# # work = get_information_doi(isTranslationData$'relation.is-translation-of'[[i]]$id, doiCache)
# writeLines("", con=isTranslationTableCon, sep="\t")
# writeLines("", con=isTranslationTableCon, sep="\t")
# writeLines("", con=isTranslationTableCon, sep="\t")
# }
# }
# # if (noURIs > 0) {
# # # Write URI of translation
# # writeLines(paste0("\t", "\"", isTranslationData$'relation.is-translation-of'[[i]]$id, "\""), con=isTranslationTableCon, sep="\t")
# # }
## Did not get the CrossRef API working with identifying information. Also not really necessary for such a small dataset.
# UserAgentStr = "TranslateScienceCrossref/0.1 (https://TranslateScience.org; mailto:translateScience@grassroots.is)"
# options(HTTPUserAgent = UserAgentStr)
# Alternative example of adding email to be polite: https://api.crossref.org/works?filter=has-full-text:true&mailto:translateScience@grassroots.is
# Later for updating the dataset, we could make a query sorting by date added, for example: /works?query=josiah+carberry&sort=created&order=asc
# Preferred by CrossRef is the filter from-index-date
# More efficient: tell CrossRef the fields you need, e.g.: /works?select=DOI,prefix,title
# Paging is necessary to get results for larger datasets, one can use an offset or deep paging:
# /members/311/works?filter=type:journal-article&cursor=* (Use URL encoding)
# /members/311/works?filter=type:journal-article&cursor=<value of next-cursor parameter>