Skip to content

Commit

Permalink
Merge pull request #75 from seasidesparrow/dc_zenodo_tests.20231105
Browse files Browse the repository at this point in the history
Fix: adds datacite tests for Zenodo records, moves _detag function fr…
  • Loading branch information
seasidesparrow authored Nov 7, 2023
2 parents 05eedbc + 68ccd8b commit 9d806ea
Show file tree
Hide file tree
Showing 17 changed files with 1,578 additions and 125 deletions.
74 changes: 74 additions & 0 deletions adsingestp/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,37 @@ class BaseBeautifulSoupParser(IngestBase):
out of the input XML stream.
"""

# Patterns for character entities whose leading ampersand was replaced by a
# placeholder or escaped.  Each pattern captures (prefix, entity body, ";") so
# that _detag can rebuild the entity as "&" + body + ";".
fix_ampersand_1 = re.compile(r"(__amp__)(.*?)(;)")
# The prefix must be the escaped form "&amp;": a pattern starting with a bare
# "&" would rebuild exactly the text it matched and the pass would do nothing.
fix_ampersand_2 = re.compile(r"(&amp;)(.*?)(;)")
# _detag applies these in order.
re_ampersands = [fix_ampersand_1, fix_ampersand_2]

# Formula / MathML tag names that may be preserved in titles, abstracts and comments.
HTML_TAGS_MATH = [
    "inline-formula",
    "tex-math",
    "mml:math",
    "mml:semantics",
    "mml:mrow",
    "mml:munder",
    "mml:mo",
    "mml:mi",
    "mml:msub",
    "mml:mover",
    "mml:mn",
    "mml:annotation",
]

# Inline markup tag names that may be preserved alongside the math tags.
HTML_TAGS_HTML = ["sub", "sup", "a", "astrobj"]

# Per-field lists of tag names, passed to _detag as its tags_keep argument.
HTML_TAGSET = {
    "title": HTML_TAGS_MATH + HTML_TAGS_HTML,
    "abstract": HTML_TAGS_MATH + HTML_TAGS_HTML + ["pre", "br"],
    "comments": HTML_TAGS_MATH + HTML_TAGS_HTML + ["pre", "br"],
    "affiliations": ["email", "orcid"],
    "keywords": ["astrobj"],
}

# Elements with these tag names are removed by _detag together with their contents.
HTML_TAGS_DANGER = ["php", "script", "css"]

def bsstrtodict(self, input_xml, parser="lxml-xml"):
"""
Returns a BeautifulSoup tree given an XML text
Expand All @@ -480,3 +511,46 @@ def bsstrtodict(self, input_xml, parser="lxml-xml"):
"""

return bs4.BeautifulSoup(input_xml, parser)

def _detag(self, r, tags_keep):
    """
    Removes tags from the input, keeping only an allowed set of tags.

    The input is re-parsed with the "lxml" parser.  Elements whose tag name is
    in HTML_TAGS_DANGER are removed together with their contents, elements
    whose tag name is in tags_keep are left untouched, and every other element
    is unwrapped (the tag is dropped, its contents are kept).  The text inside
    <sc> elements is upper-cased before the tag is unwrapped.

    :param r: BeautifulSoup object or string; it is converted with str() before parsing
    :param tags_keep: this function will remove all tags except those passed here
    :return: newr: string with cleaned text, whitespace collapsed to single spaces
    """
    # note that parser=lxml is recommended here - if the more stringent lxml-xml is used,
    # the output is slightly different and the code will need to be modified
    newr = self.bsstrtodict(str(r), "lxml")

    # Collect the distinct tag names up front; the tree is modified in the loop below.
    tag_names = {element.name for element in newr.find_all()}
    for tag_name in tag_names:
        for element in newr.find_all(tag_name):
            if tag_name in self.HTML_TAGS_DANGER:
                element.decompose()
            elif tag_name in tags_keep:
                continue
            else:
                if tag_name.lower() == "sc":
                    # element.string is None when the tag is empty or has more than
                    # one child, so upper-case each text node individually instead
                    # of calling element.string.upper().
                    for text_node in list(element.strings):
                        text_node.replace_with(type(text_node)(text_node.upper()))
                element.unwrap()

    # Note: newr is converted from a bs4 object to a string here.
    # Everything after this point is string manipulation.
    newr = str(newr)

    # Rebuild entities whose ampersand was replaced/escaped: "<prefix>body;" -> "&body;"
    for reamp in self.re_ampersands:
        newr = reamp.sub(lambda match: "&" + match.group(2) + ";", newr)

    # The repair above can leave a literal "&nbsp;" in the text; treat it as a
    # plain space.  This is done before collapsing whitespace so that it cannot
    # leave a run of spaces behind.
    newr = newr.replace("&nbsp;", " ")
    # \s already covers \n and \r
    newr = re.sub(r"\s+", " ", newr)
    newr = newr.strip()

    return newr
28 changes: 22 additions & 6 deletions adsingestp/parsers/datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,17 @@ def _parse_title_abstract(self):
if not type_attr:
titles[title_attr.lower()] = self._clean_output(t.get_text())
if type_attr == "Subtitle":
self.base_metadata["subtitle"] = self._clean_output(t.get_text())
self.base_metadata["subtitle"] = self._detag(
self._clean_output(t.get_text()), self.HTML_TAGSET["title"]
)
if not titles:
raise MissingTitleException("No title found")
# we use the English title as the main one, then add any foreign ones
# there are several options for "English" in this schema, so check for all of them (lowercase forms). If no language specified (key is ""), assume English.
en_key = list({"en", "en-us", ""} & set(titles.keys()))[0]
self.base_metadata["title"] = self._clean_output(titles.pop(en_key))
self.base_metadata["title"] = self._detag(
self._clean_output(titles.pop(en_key)), self.HTML_TAGSET["title"]
)
title_foreign = []
lang_foreign = []
for tkey in titles:
Expand All @@ -138,7 +142,9 @@ def _parse_title_abstract(self):

# the data model only takes a single foreign-language title; will need to adjust if more are required
if title_foreign:
self.base_metadata["title_native"] = self._clean_output(title_foreign[0])
self.base_metadata["title_native"] = self._detag(
self._clean_output(title_foreign[0]), self.HTML_TAGSET["title"]
)
self.base_metadata["lang_native"] = lang_foreign[0]

# abstract, references are all in the "descriptions" section
Expand All @@ -153,7 +159,9 @@ def _parse_title_abstract(self):
abstract = s.get_text()

if abstract:
self.base_metadata["abstract"] = self._clean_output(abstract)
self.base_metadata["abstract"] = self._detag(
self._clean_output(abstract), self.HTML_TAGSET["abstract"]
)

def _parse_publisher(self):
if self.input_metadata.find("publisher"):
Expand Down Expand Up @@ -246,8 +254,16 @@ def _parse_permissions(self):
c = i.get_text()
if u == "info:eu-repo/semantics/openAccess" or c == "Open Access":
is_oa = True

self.base_metadata["openAccess"]["open"] = is_oa
elif "http" in u:
self.base_metadata.setdefault("openAccess", {}).setdefault("licenseURL", u)
if "creativecommon" in u:
is_oa = True
if c:
self.base_metadata.setdefault("openAccess", {}).setdefault("license", c)
if "Creative Common" in c or "GNU General Public License" in c:
is_oa = True

self.base_metadata.setdefault("openAccess", {}).setdefault("open", is_oa)

def _parse_doctype(self):
if self.input_metadata.find("resourceType"):
Expand Down
93 changes: 11 additions & 82 deletions adsingestp/parsers/jats.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,84 +454,13 @@ def parse(self, article_metadata):


class JATSParser(BaseBeautifulSoupParser):
fix_ampersand = re.compile(r"(&)(.*?)(;)")

JATS_TAGS_MATH = [
"inline-formula",
"tex-math",
"mml:math",
"mml:semantics",
"mml:mrow",
"mml:munder",
"mml:mo",
"mml:mi",
"mml:msub",
"mml:mover",
"mml:mn",
"mml:annotation",
]

JATS_TAGS_HTML = ["sub", "sup", "a", "astrobj"]

JATS_TAGSET = {
"title": JATS_TAGS_MATH + JATS_TAGS_HTML,
"abstract": JATS_TAGS_MATH + JATS_TAGS_HTML + ["pre", "br"],
"comments": JATS_TAGS_MATH + JATS_TAGS_HTML + ["pre", "br"],
"affiliations": ["email", "orcid"],
"keywords": ["astrobj"],
}

JATS_TAGS_DANGER = ["php", "script", "css"]

def __init__(self):
self.base_metadata = {}
self.back_meta = None
self.article_meta = None
self.journal_meta = None
self.isErratum = False

def _detag(self, r, tags_keep):
"""
Removes tags from input BeautifulSoup object
:param r: BeautifulSoup object (not string)
:param tags_keep: this function will remove all tags except those passed here
:return: newr: striing with cleaned text
"""
# note that parser=lxml is recommended here - if the more stringent lxml-xml is used,
# the output is slightly different and the code will need to be modified
newr = self.bsstrtodict(str(r), "lxml")
if newr.find_all():
tag_list = list(set([x.name for x in newr.find_all()]))
else:
tag_list = []
for t in tag_list:
elements = newr.find_all(t)
for e in elements:
if t in self.JATS_TAGS_DANGER:
e.decompose()
elif t in tags_keep:
continue
else:
if t.lower() == "sc":
e.string = e.string.upper()
e.unwrap()

# Note: newr is converted from a bs4 object to a string here.
# Everything after this point is string manipulation.
newr = str(newr)

amp_fix = self.fix_ampersand.findall(newr)
for s in amp_fix:
s_old = "".join(s)
s_new = "&" + s[1] + ";"
newr = newr.replace(s_old, s_new)

newr = re.sub("\\s+|\n+|\r+", " ", newr)
newr = newr.replace(" ", " ")
newr = newr.strip()

return newr

def _get_date(self, d):
"""
Extract and standardize date from input BeautifulSoup date object
Expand Down Expand Up @@ -593,7 +522,7 @@ def _parse_title_abstract(self):
# all title footnotes:
for df in title_group.find_all("fn"):
key = df.get("id", None)
note = self._detag(df, self.JATS_TAGSET["abstract"]).strip()
note = self._detag(df, self.HTML_TAGSET["abstract"]).strip()
if key and note:
title_fn_dict[key] = note
df.decompose()
Expand All @@ -603,7 +532,7 @@ def _parse_title_abstract(self):
if title_fn_dict.get(key, None):
title_fn_list.append(title_fn_dict.get(key, None))
dx.decompose()
art_title = self._detag(title, self.JATS_TAGSET["title"]).strip()
art_title = self._detag(title, self.HTML_TAGSET["title"]).strip()
title_notes = []
if title_fn_list:
title_notes.extend(title_fn_list)
Expand All @@ -616,7 +545,7 @@ def _parse_title_abstract(self):
if title_fn_dict.get(key, None):
subtitle_fn_list.append(title_fn_dict.get(key, None))
dx.decompose()
sub_title = self._detag(subtitle, self.JATS_TAGSET["title"]).strip()
sub_title = self._detag(subtitle, self.HTML_TAGSET["title"]).strip()
subtitle_notes = []
if subtitle_fn_list:
subtitle_notes.extend(subtitle_fn_list)
Expand All @@ -634,14 +563,14 @@ def _parse_title_abstract(self):
abstract_all = self.article_meta.find("abstract").find_all("p")
abstract_paragraph_list = list()
for paragraph in abstract_all:
para = self._detag(paragraph, self.JATS_TAGSET["abstract"])
para = self._detag(paragraph, self.HTML_TAGSET["abstract"])
abstract_paragraph_list.append(para)
self.base_metadata["abstract"] = "\n".join(abstract_paragraph_list)
if title_fn_list:
self.base_metadata["abstract"] += " " + " ".join(title_fn_list)
else:
abs_raw = self.article_meta.find("abstract")
abs_txt = self._detag(abs_raw, self.JATS_TAGSET["abstract"])
abs_txt = self._detag(abs_raw, self.HTML_TAGSET["abstract"])
self.base_metadata["abstract"] = abs_txt

def _parse_author(self):
Expand Down Expand Up @@ -710,29 +639,29 @@ def _parse_keywords(self):
for kk in keys_uat_test:
# Check for UAT first:
if kk["content-type"] == "uat-code":
keyid = self._detag(kk, self.JATS_TAGSET["keywords"])
keyid = self._detag(kk, self.HTML_TAGSET["keywords"])
if kk["content-type"] == "term":
keystring = self._detag(kk, self.JATS_TAGSET["keywords"])
keystring = self._detag(kk, self.HTML_TAGSET["keywords"])

if keyid or keystring:
keys_uat.append({"string": keystring, "system": "UAT", "id": keyid})

if not keys_uat:
keys_misc_test = kg.find_all("kwd")
for kk in keys_misc_test:
keys_misc.append(self._detag(kk, self.JATS_TAGSET["keywords"]))
keys_misc.append(self._detag(kk, self.HTML_TAGSET["keywords"]))

# Then check for AAS:
if kg.get("kwd-group-type", "") == "AAS":
keys_aas_test = kg.find_all("kwd")
for kk in keys_aas_test:
keys_aas.append(self._detag(kk, self.JATS_TAGSET["keywords"]))
keys_aas.append(self._detag(kk, self.HTML_TAGSET["keywords"]))

# If all else fails, just search for 'kwd'
if (not keys_uat) and (not keys_aas):
keys_misc_test = kg.find_all("kwd")
for kk in keys_misc_test:
keys_misc.append(self._detag(kk, self.JATS_TAGSET["keywords"]))
keys_misc.append(self._detag(kk, self.HTML_TAGSET["keywords"]))

if keys_uat:
for k in keys_uat:
Expand Down Expand Up @@ -765,7 +694,7 @@ def _parse_keywords(self):
keys_out.append(
{
"system": "subject",
"string": self._detag(k, self.JATS_TAGSET["keywords"]),
"string": self._detag(k, self.HTML_TAGSET["keywords"]),
}
)

Expand Down
44 changes: 44 additions & 0 deletions tests/stubdata/input/zenodo_test.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
<record>
<header>
<identifier>oai:zenodo.org:34650</identifier>
<datestamp>2020-01-20T17:28:18Z</datestamp>
</header>
<metadata>
<resource xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.3/metadata.xsd">
<identifier identifierType="DOI">10.5281/zenodo.34650</identifier>
<alternateIdentifiers>
<alternateIdentifier alternateIdentifierType="oai">oai:zenodo.org:34650</alternateIdentifier>
</alternateIdentifiers>
<creators>
<creator>
<creatorName nameType="Personal">Reetz, Johannes</creatorName>
<givenName>Johannes</givenName>
<familyName>Reetz</familyName>
<affiliation>Max Planck Computing and Data Facility, Garching, Germany</affiliation>
</creator>
</creators>
<titles>
<title>EUDAT - Open Data Services for Research</title>
</titles>
<publisher>Zenodo</publisher>
<publicationYear>2015</publicationYear>
<subjects>
<subject>Science data management</subject>
</subjects>
<dates>
<date dateType="Issued">2015-12-03</date>
</dates>
<resourceType resourceTypeGeneral="Text">Presentation</resourceType>
<relatedIdentifiers>
<relatedIdentifier relatedIdentifierType="DOI" relationType="IsVersionOf"></relatedIdentifier>
<relatedIdentifier relatedIdentifierType="URL" relationType="IsPartOf">https://zenodo.org/communities/sciops2015</relatedIdentifier>
</relatedIdentifiers>
<rightsList>
<rights rightsURI="https://creativecommons.org/licenses/by/4.0/legalcode" rightsIdentifierScheme="spdx" rightsIdentifier="cc-by-4.0">Creative Commons Attribution 4.0 International</rights>
</rightsList>
<descriptions>
<description descriptionType="Abstract">Presentation slides</description>
</descriptions>
</resource>
</metadata>
</record>
Loading

0 comments on commit 9d806ea

Please sign in to comment.