A Python script that extracts ClinicalTrials.gov (“NCT”) numbers from abstracts in PubMed XML search results and checks whether each one has a corresponding entry on ClinicalTrials.gov.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

121 lines
4.4 KiB

# How to use:
#
# Supply the PubMed XML file name as the first argument, followed by Y or N
# to choose whether the column headings are printed
# $ python3 pubmed-xml.py pubmed_result.xml Y > pubmed-output.tsv
import sys
import xml.etree.ElementTree as ET
import re
import requests
import tempfile
# The Python XML parser gets messed up if it finds <sup> tags or &quot;'s in
# the abstract, so they are stripped from a scratch copy of the input before
# parsing.
#
# The copy is written through the temporary file's own handle and parsed from
# that same handle.  Reopening a NamedTemporaryFile by name while it is still
# open is platform-dependent (it does not work on Windows).  Everything is
# done on raw bytes so no locale-dependent text decoding happens here; the XML
# parser applies the encoding declared in the document itself.
temp = tempfile.NamedTemporaryFile()
with open(sys.argv[1], "rb") as pubmed_result:
    for line in pubmed_result:
        temp.write(
            line.replace(b"<sup>", b"")
            .replace(b"</sup>", b"")
            .replace(b"&quot;", b"")
        )
# Rewind so the parser reads the sanitized copy from its first byte.
temp.seek(0)
tree = ET.parse(temp)
root = tree.getroot()
# This is the regular expression that is searched for in each abstract.
# It matches the literal text "NCT" followed by one or more digits, spaces or
# hyphens, with no upper limit on length.
# It is written as a raw string so that "\-" reaches the regex engine as-is
# instead of being read as an (invalid) Python string escape sequence.
nct_regex = r"NCT[0-9 \-]+"
# Column titles for the tab-separated output, in the same order as the values
# printed for each data row.
column_headings = (
    "Extracted NCT",
    "Compressed NCT",
    "Number of NCTs extracted",
    "NCT on ClinicalTrials.gov",
    "PMID",
    "Date",
    "Abstract",
    "Journal",
    "Pubmed Metadata Registry",
)

# The heading row is printed only when the second command-line argument is
# exactly "Y".  Checking the argument count first means a missing flag simply
# suppresses the headings instead of raising IndexError.
if len(sys.argv) > 2 and sys.argv[2] == "Y":
    print(*column_headings, sep="\t", end="\n")
def _element_text(element):
    """Return every piece of text inside *element*, nested tags included.

    Reading ``element.text`` alone stops at the first nested tag and is
    ``None`` for an empty element; joining ``itertext()`` returns the complete
    text, and "" when there is none, so the literal string "None" never ends
    up in the output.
    """
    return "".join(element.itertext())


# Seconds to wait for ClinicalTrials.gov before giving up on a lookup.  Without
# a timeout a stalled connection would block the script indefinitely; with one,
# requests raises an exception instead.
site_request_timeout = 30

for article in root:
    # Per-article fields, reset for every article so that nothing carries over
    # from the previous one.
    pmid = ""
    pubdate = ""
    abstract_parts = []
    abstract_text = ""
    journal = ""
    registry = ""

    # Fields are read at fixed depths: PMID two levels below the article,
    # everything else three levels below it.
    for datatype in article:
        for child in datatype:
            if child.tag == "PMID":
                pmid = _element_text(child)

            for subchild in child:
                tag = subchild.tag

                if tag == "Abstract":
                    # Flatten each abstract paragraph onto a single line so it
                    # cannot break the one-row-per-record output.
                    for abstractpara in subchild:
                        abstract_parts.append(
                            _element_text(abstractpara)
                            .replace('\r', ' ')
                            .replace('\n', ' ')
                        )
                    abstract_text = ' '.join(abstract_parts)

                elif tag == "Journal":
                    for journaldata in subchild:
                        if journaldata.tag == "Title":
                            journal = _element_text(journaldata)

                elif tag == "DataBankList":
                    # Registry column: each databank name followed by its
                    # accession numbers, space-separated.
                    for databank in subchild:
                        for databankchild in databank:
                            if databankchild.tag == "DataBankName":
                                registry += _element_text(databankchild)
                            elif databankchild.tag == "AccessionNumberList":
                                for accession_no in databankchild:
                                    registry += " " + _element_text(accession_no)

                elif tag == "ArticleDate":
                    # Built from this element's own children only, so parts
                    # from another date element can never be mixed in.
                    pubdate = '-'.join(
                        _element_text(dateelement) for dateelement in subchild
                    )

                elif (
                    tag == "PubMedPubDate"
                    and pubdate == ""
                    and subchild.get('PubStatus') == "pubmed"
                ):
                    # Fallback used only while no date has been found yet.
                    pubdate = '-'.join(
                        _element_text(dateelement)
                        for dateelement in subchild
                        if dateelement.tag in ("Year", "Month", "Day")
                    )

    # Keep only matches that contain something besides the bare "NCT" prefix
    # (the pattern also matches "NCT" followed by nothing but spaces/hyphens).
    extracted_ncts = [
        nct
        for nct in re.findall(nct_regex, abstract_text)
        if nct.replace(' ', '').replace('-', '') != "NCT"
    ]
    number_extracted = len(extracted_ncts)

    # An article without any NCT still gets one output row, marked "NA".
    for extracted_nct in (extracted_ncts or ["NA"]):
        compressed_nct = extracted_nct.replace(' ', '').replace('-', '')

        # Stays empty for the "NA" row, where there is nothing to look up.
        nct_on_site = ""
        if number_extracted > 0:
            site_request = requests.get(
                'https://clinicaltrials.gov/show/' + compressed_nct,
                timeout=site_request_timeout,
            )
            # NOTE(review): the check relies on the response body containing
            # this exact phrase for unknown numbers -- confirm that the site
            # still answers that way.
            if b'NCT Number not Found' in site_request.content:
                nct_on_site = 0
            else:
                nct_on_site = 1

        print(
            extracted_nct.strip(),
            compressed_nct,
            number_extracted,
            nct_on_site,
            pmid,
            pubdate,
            abstract_text,
            journal,
            registry,
            sep="\t",
            end="\n"
        )

# Close the temporary working copy of the input; a NamedTemporaryFile is
# deleted when it is closed.
temp.close()