You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
122 lines
4.4 KiB
122 lines
4.4 KiB
# How to use:
#
# Supply the PubMed XML file name as the first argument and Y or N as the
# second argument to turn the column headings on or off:
# $ python3 pubmed-xml.py pubmed_result.xml Y > pubmed-output.tsv
|
|
|
|
import sys
|
|
import xml.etree.ElementTree as ET
|
|
import re
|
|
import requests
|
|
import tempfile
|
|
|
|
# Pre-clean the PubMed export before parsing it. The original author found
# that the XML parser gets messed up by <sup> tags and &quot; entities in the
# abstract, so both are removed from every line.
#
# The entity &quot; is removed, not the bare double-quote character:
# stripping literal quotes would also strip the quotes around XML attribute
# values and leave the document unparseable.
#
# The file is copied as raw bytes so the clean-up does not depend on the
# platform's default text encoding, and the cleaned copy is parsed straight
# from the open temporary-file handle rather than by re-opening it by name.
# NOTE(review): the byte patterns assume an ASCII-compatible encoding such as
# UTF-8 -- confirm this holds for your exports.
with tempfile.NamedTemporaryFile() as temp:
    with open(sys.argv[1], "rb") as pubmed_result:
        for line in pubmed_result:
            temp.write(
                line.replace(b"<sup>", b"")
                .replace(b"</sup>", b"")
                .replace(b"&quot;", b"")
            )
    # Rewind so the parser reads the cleaned copy from its first byte.
    temp.seek(0)
    tree = ET.parse(temp)

# Top-level element of the export; the rest of the script iterates over its
# children, one per article.
root = tree.getroot()
|
|
|
|
# This is the regular expression that is searched for in each abstract.
# It matches text that begins with "NCT" and is followed by one or more
# digits, spaces or hyphens (no upper limit on the length).
#
# Written as a raw string so that "\-" reaches the regex engine unchanged
# instead of being treated as an (invalid) string escape sequence.
nct_regex = r"NCT[0-9 \-]+"
|
|
|
|
# Print the tab-separated column headings only when the second command-line
# argument is exactly "Y". A missing second argument is treated like "N"
# rather than aborting the script with an IndexError.
if len(sys.argv) > 2 and sys.argv[2] == "Y":
    print(
        "Extracted NCT",
        "Compressed NCT",
        "Number of NCTs extracted",
        "NCT on ClinicalTrials.gov",
        "PMID",
        "Date",
        "Abstract",
        "Journal",
        "Pubmed Metadata Registry",
        sep="\t",
        end="\n",
    )
|
|
|
|
def _compress_nct(raw_nct):
    """Return *raw_nct* with every space and hyphen removed."""
    return raw_nct.replace(' ', '').replace('-', '')


# Walk every article in the export and print one tab-separated row per NCT
# identifier found in its abstract (or a single "NA" row when none is found).
for article in root:
    # Reset the per-article fields so nothing carries over from the
    # previous article.
    pmid = ""
    pubdate = ""
    date_array = []
    abstract_array = []
    abstract_text = ""
    journal = ""
    registry = ""
    nct_on_site = ""

    for datatype in article:
        for child in datatype:
            if child.tag == "PMID":
                pmid = str(child.text)

            for subchild in child:
                if subchild.tag == "Abstract":
                    for abstractpara in subchild:
                        # itertext() gathers the text of the paragraph and
                        # of any inline child elements, so the abstract is
                        # not cut short at the first nested tag and an empty
                        # paragraph contributes "" rather than "None".
                        paragraph = "".join(abstractpara.itertext())
                        abstract_array.append(
                            paragraph.replace('\r', ' ').replace('\n', ' ')
                        )
                    abstract_text = ' '.join(abstract_array)

                if subchild.tag == "Journal":
                    for journaldata in subchild:
                        if journaldata.tag == "Title":
                            journal = str(journaldata.text)

                if subchild.tag == "DataBankList":
                    # Registry column: each databank name followed by its
                    # accession numbers, space-separated.
                    for databank in subchild:
                        for databankchild in databank:
                            if databankchild.tag == "DataBankName":
                                registry += str(databankchild.text)
                            if databankchild.tag == "AccessionNumberList":
                                for accession_no in databankchild:
                                    registry += " " + str(accession_no.text)

                if subchild.tag == "ArticleDate":
                    # The date is the text of every child element joined
                    # with hyphens.
                    for dateelement in subchild:
                        date_array.append(str(dateelement.text))
                    pubdate = '-'.join(date_array)

                # Fall back to the PubMedPubDate entry whose PubStatus is
                # "pubmed", but only while no date has been found yet.
                if (
                    pubdate == ""
                    and subchild.tag == "PubMedPubDate"
                    and subchild.get('PubStatus') == "pubmed"
                ):
                    for dateelement in subchild:
                        if dateelement.tag in ("Year", "Month", "Day"):
                            date_array.append(str(dateelement.text))
                    pubdate = '-'.join(date_array)

    # Keep only matches that carry something besides the bare "NCT" prefix
    # once spaces and hyphens are stripped.
    extracted_ncts = [
        nct
        for nct in re.findall(nct_regex, abstract_text)
        if _compress_nct(nct) != "NCT"
    ]

    # Count the real matches before the placeholder is added; the count is
    # the same for every row of this article.
    number_extracted = len(extracted_ncts)
    if not extracted_ncts:
        # Placeholder so an article without any NCT still yields one row.
        extracted_ncts.append("NA")

    for extracted_nct in extracted_ncts:
        compressed_nct = _compress_nct(extracted_nct)

        if number_extracted > 0:
            # The timeout keeps one unresponsive request from stalling the
            # whole run; on expiry requests raises instead of waiting
            # indefinitely.
            site_request = requests.get(
                'https://clinicaltrials.gov/show/' + compressed_nct,
                timeout=30,
            )
            # 1 when the response body lacks the "not found" marker text,
            # 0 when it contains it.
            if b'NCT Number not Found' in site_request.content:
                nct_on_site = 0
            else:
                nct_on_site = 1

        print(
            extracted_nct.strip(),
            compressed_nct,
            number_extracted,
            nct_on_site,
            pmid,
            pubdate,
            abstract_text,
            journal,
            registry,
            sep="\t",
            end="\n",
        )

# Release the temporary file that held the cleaned copy of the input.
temp.close()
|