-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathannotator.py
92 lines (79 loc) · 4.08 KB
/
annotator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/local/bin/python
#################################
__author__ = 'Nolan Nichols' ##
__created__ = '2011-05-09' ##
__modified__ = '2011-05-11' ##
#################################
from httplib2 import Http
from urllib import urlencode
from lxml import etree
from Bio import Entrez
from nltk.corpus import stopwords
# config for NCBO Annotator Web Service
URL = 'http://rest.bioontology.org/obs/annotator'
API_KEY = ''  # api keys are available at http://bioportal.bioontology.org/
# the stop word list is sent to the service as one comma-separated string
STOPWORDS = ','.join(stopwords.words('english'))

# sample annotation text from pubmed
Entrez.email = ''
handle = Entrez.efetch(db="pubmed", rettype='xml', id='20808702')
try:
    rawHTML = etree.fromstring(handle.read())
finally:
    # close the Entrez handle once its contents have been read,
    # even when parsing the payload fails
    handle.close()
# The record itself is taken from the text of the first grandchild under the
# root's second child, with the first and last characters dropped, and is
# then parsed as XML in its own right.
# NOTE(review): presumably efetch wraps the record XML inside an HTML page
# here -- confirm against the current E-utilities output before relying on it.
rawXML = etree.fromstring(rawHTML[1][0].text[1:-1])
ABSTRACT = rawXML.xpath('//AbstractText')[0].text
# myText = '''Melanoma is a malignant tumor of melanocytes which are found
# predominantly in skin but also in the bowel and the eye'''
# Build the Http client, the form-encoding header and the POST parameters.
# Parameter values follow the defaults described in the user guide:
# http://www.bioontology.org/wiki/index.php/Annotator_User_Guide
headers = {'Content-type': 'application/x-www-form-urlencoded'}
annotator = Http()
data = {
    'apikey': API_KEY,
    'longestOnly': 'false',
    'wholeWordOnly': 'true',
    'filterNumber': 'true',
    'stopWords': STOPWORDS,
    'withDefaultStopWords': 'false',
    'isStopWordsCaseSensitive': 'false',
    'minTermSize': 3,
    'scored': 'true',
    'withSynonyms': 'true',
    'ontologiesToExpand': '',
    # SNOMED-CT --> 44777, RadLex --> 45589
    'ontologiesToKeepInResult': '44777',
    'isVirtualOntologyID': 'false',
    'semanticTypes': '',
    'levelMax': 0,
    'mappingTypes': 0,
    # swap in myText here to annotate the short sample sentence instead
    'textToAnnotate': ABSTRACT,
    # parse will not work without an XML format
    'format': 'xml',
}
# parse response
def _first_text(node, path):
    """Return the text of the first element matching *path* under *node*.

    Returns None when nothing matches, so an annotation that lacks an
    optional element does not abort the whole parse with an IndexError.
    """
    matches = node.xpath(path)
    return matches[0].text if matches else None


def parse_annotator_response(httpResponse, annotatorXML):
    """Print every annotation in an Annotator XML response and index the matches.

    Args:
        httpResponse: mapping of response headers; only 'content-type' is
            read. A missing header is treated as a non-XML response.
        annotatorXML: response body, parsed with ``etree.fromstring``.

    Returns:
        A ``(matchedConcepts, matchedContext)`` tuple of dicts:
        concept id -> preferred name, and context localConceptId -> term
        name. Both are empty when the response is not XML. Entries whose
        id is absent from the response are skipped.
    """
    matchedConcepts = {}
    matchedContext = {}

    # Compare only the media type, case-insensitively, so that equivalent
    # headers such as 'text/xml; charset=utf-8' are accepted as well.
    content_type = httpResponse.get('content-type', '')
    media_type = content_type.split(';')[0].strip().lower()
    if media_type not in ('text/xml', 'application/xml'):
        print('Please choose XML as the Format type')
        return matchedConcepts, matchedContext

    # Each print below takes a single pre-formatted string: that emits the
    # same bytes as the comma-separated print statement form under Python 2
    # and is also valid syntax under Python 3.
    annotatorParse = etree.fromstring(annotatorXML)
    print("Annotated text =  %s" % (_first_text(annotatorParse, '//textToAnnotate'),))

    annotation_path = '/success/data/annotatorResultBean/annotations/annotationBean'
    for node in annotatorParse.xpath(annotation_path):
        # Evaluate each XPath once and reuse the value for printing and indexing.
        concept_id = _first_text(node, './concept/id')
        concept_name = _first_text(node, './concept/preferredName')
        semantic_type = _first_text(node, './concept/semanticTypes/semanticTypeBean/semanticType')
        semantic_name = _first_text(node, './concept/semanticTypes/semanticTypeBean/description')
        score = _first_text(node, './score')
        context_id = _first_text(node, './context/term/localConceptId')
        context_name = _first_text(node, './context/term/name')
        context_from = _first_text(node, './context/from')
        context_to = _first_text(node, './context/to')

        print("################################################## \n")
        print("Concept ID =  %s \n" % (concept_id,))
        print("Concept Name =  %s \n" % (concept_name,))
        print("Concept Semantic Type =  %s \n" % (semantic_type,))
        print("Concept Semantic Type Name =  %s \n" % (semantic_name,))
        print("Concept Score =  %s \n" % (score,))
        print("#### \n")
        print("Context ID =  %s \n" % (context_id,))
        print("Context Name =  %s \n" % (context_name,))
        print("Context From =  %s \n" % (context_from,))
        print("Context To =  %s \n" % (context_to,))

        # Do not index annotations under a None key.
        if concept_id is not None:
            matchedConcepts[concept_id] = concept_name
        if context_id is not None:
            matchedContext[context_id] = context_name

    return matchedConcepts, matchedContext  # returns dicts of 'id:preferredName'
# send request and get response
httpResponse, xmlContent = annotator.request(URL, 'POST', headers=headers, body=urlencode(data))
# check the outcome of the response
if httpResponse.status == 200:
print 'Call successful on ', httpResponse['date']
concept, context = parse_annotator_response(httpResponse, xmlContent)
else:
print 'NCBO Annotator Response Status', httpResponse.reason,' on ',httpResponse['date']