Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: adds datacite tests for Zenodo records, moves _detag function fr… #75

Merged
merged 4 commits into from
Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions adsingestp/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,37 @@ class BaseBeautifulSoupParser(IngestBase):
out of the input XML stream.
"""

# regexes matching escaped ampersand-entity forms ("__amp__<name>;" and
# "&<name>;"); _detag uses these to normalize entities back to "&<name>;"
fix_ampersand_1 = re.compile(r"(__amp__)(.*?)(;)")
fix_ampersand_2 = re.compile(r"(&)(.*?)(;)")
# applied in order by _detag when cleaning the detagged string
re_ampersands = [fix_ampersand_1, fix_ampersand_2]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know if you have ever encountered this in this context: I have seen cases where ampersands got encoded as __amp__amp;

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you remember which publisher(s) specifically? I'm looking for an example to make a unit test with.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file has an example: /proj/ads/references/sources/MNRAS/0423/iss4.wiley2.xml

# MathML and formula-related tags preserved when detagging titles/abstracts,
# so embedded math markup survives the cleanup
HTML_TAGS_MATH = [
    "inline-formula",
    "tex-math",
    "mml:math",
    "mml:semantics",
    "mml:mrow",
    "mml:munder",
    "mml:mo",
    "mml:mi",
    "mml:msub",
    "mml:mover",
    "mml:mn",
    "mml:annotation",
]

# lightweight formatting/markup tags kept alongside the math tags
HTML_TAGS_HTML = ["sub", "sup", "a", "astrobj"]

# per-field whitelists: the set of tags _detag keeps for each output field
HTML_TAGSET = {
    "title": HTML_TAGS_MATH + HTML_TAGS_HTML,
    "abstract": HTML_TAGS_MATH + HTML_TAGS_HTML + ["pre", "br"],
    "comments": HTML_TAGS_MATH + HTML_TAGS_HTML + ["pre", "br"],
    "affiliations": ["email", "orcid"],
    "keywords": ["astrobj"],
}

# tags whose entire contents are dropped (decomposed) by _detag
HTML_TAGS_DANGER = ["php", "script", "css"]

def bsstrtodict(self, input_xml, parser="lxml-xml"):
"""
Returns a BeautifulSoup tree given an XML text
Expand All @@ -480,3 +511,46 @@ def bsstrtodict(self, input_xml, parser="lxml-xml"):
"""

return bs4.BeautifulSoup(input_xml, parser)

def _detag(self, r, tags_keep):
    """
    Removes tags from an input BeautifulSoup object.

    Tags in HTML_TAGS_DANGER are removed along with their contents; tags
    listed in tags_keep are preserved; all other tags are unwrapped (the
    tag is removed, its contents kept). Escaped ampersand entities (e.g.
    "__amp__amp;") are normalized back to "&<name>;" form, and all runs
    of whitespace are collapsed to single spaces.

    :param r: BeautifulSoup object (not string)
    :param tags_keep: this function will remove all tags except those passed here
    :return: newr: string with cleaned text
    """
    # note that parser=lxml is recommended here - if the more stringent lxml-xml is used,
    # the output is slightly different and the code will need to be modified
    newr = self.bsstrtodict(str(r), "lxml")
    if newr.find_all():
        tag_list = list(set([x.name for x in newr.find_all()]))
    else:
        tag_list = []
    for t in tag_list:
        elements = newr.find_all(t)
        for e in elements:
            if t in self.HTML_TAGS_DANGER:
                # drop the tag and everything inside it
                e.decompose()
            elif t in tags_keep:
                continue
            else:
                if t.lower() == "sc":
                    # small-caps content is uppercased before unwrapping;
                    # .string is None when the tag has nested children, so
                    # guard against an AttributeError in that case
                    if e.string:
                        e.string = e.string.upper()
                e.unwrap()

    # Note: newr is converted from a bs4 object to a string here.
    # Everything after this point is string manipulation.
    newr = str(newr)

    # restore entities escaped as "__amp__<name>;" (and re-join "&<name>;"
    # sequences) to canonical "&<name>;" form
    for reamp in self.re_ampersands:
        amp_fix = reamp.findall(newr)
        for s in amp_fix:
            s_old = "".join(s)
            s_new = "&" + s[1] + ";"
            newr = newr.replace(s_old, s_new)

    # collapse all whitespace (spaces, newlines, carriage returns) to single spaces
    newr = re.sub("\\s+|\n+|\r+", " ", newr)
    newr = newr.replace(" ", " ")
    newr = newr.strip()

    return newr
28 changes: 22 additions & 6 deletions adsingestp/parsers/datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,17 @@ def _parse_title_abstract(self):
if not type_attr:
titles[title_attr.lower()] = self._clean_output(t.get_text())
if type_attr == "Subtitle":
self.base_metadata["subtitle"] = self._clean_output(t.get_text())
self.base_metadata["subtitle"] = self._detag(
self._clean_output(t.get_text()), self.HTML_TAGSET["title"]
)
if not titles:
raise MissingTitleException("No title found")
# we use the English title as the main one, then add any foreign ones
# there are several options for "English" in this schema, so check for all of them (lowercase forms). If no language specified (key is ""), assume English.
en_key = list({"en", "en-us", ""} & set(titles.keys()))[0]
self.base_metadata["title"] = self._clean_output(titles.pop(en_key))
self.base_metadata["title"] = self._detag(
self._clean_output(titles.pop(en_key)), self.HTML_TAGSET["title"]
)
title_foreign = []
lang_foreign = []
for tkey in titles:
Expand All @@ -138,7 +142,9 @@ def _parse_title_abstract(self):

# the data model only takes a single foreign-language title; will need to adjust if more are required
if title_foreign:
self.base_metadata["title_native"] = self._clean_output(title_foreign[0])
self.base_metadata["title_native"] = self._detag(
self._clean_output(title_foreign[0]), self.HTML_TAGSET["title"]
)
self.base_metadata["lang_native"] = lang_foreign[0]

# abstract, references are all in the "descriptions" section
Expand All @@ -153,7 +159,9 @@ def _parse_title_abstract(self):
abstract = s.get_text()

if abstract:
self.base_metadata["abstract"] = self._clean_output(abstract)
self.base_metadata["abstract"] = self._detag(
self._clean_output(abstract), self.HTML_TAGSET["abstract"]
)

def _parse_publisher(self):
if self.input_metadata.find("publisher"):
Expand Down Expand Up @@ -246,8 +254,16 @@ def _parse_permissions(self):
c = i.get_text()
if u == "info:eu-repo/semantics/openAccess" or c == "Open Access":
is_oa = True

self.base_metadata["openAccess"]["open"] = is_oa
elif "http" in u:
self.base_metadata.setdefault("openAccess", {}).setdefault("licenseURL", u)
if "creativecommon" in u:
is_oa = True
if c:
self.base_metadata.setdefault("openAccess", {}).setdefault("license", c)
if "Creative Common" in c or "GNU General Public License" in c:
is_oa = True

self.base_metadata.setdefault("openAccess", {}).setdefault("open", is_oa)

def _parse_doctype(self):
if self.input_metadata.find("resourceType"):
Expand Down
93 changes: 11 additions & 82 deletions adsingestp/parsers/jats.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,84 +454,13 @@ def parse(self, article_metadata):


class JATSParser(BaseBeautifulSoupParser):
fix_ampersand = re.compile(r"(&)(.*?)(;)")

JATS_TAGS_MATH = [
"inline-formula",
"tex-math",
"mml:math",
"mml:semantics",
"mml:mrow",
"mml:munder",
"mml:mo",
"mml:mi",
"mml:msub",
"mml:mover",
"mml:mn",
"mml:annotation",
]

JATS_TAGS_HTML = ["sub", "sup", "a", "astrobj"]

JATS_TAGSET = {
"title": JATS_TAGS_MATH + JATS_TAGS_HTML,
"abstract": JATS_TAGS_MATH + JATS_TAGS_HTML + ["pre", "br"],
"comments": JATS_TAGS_MATH + JATS_TAGS_HTML + ["pre", "br"],
"affiliations": ["email", "orcid"],
"keywords": ["astrobj"],
}

JATS_TAGS_DANGER = ["php", "script", "css"]

def __init__(self):
    """Set up empty parser state; metadata sections are unset until parsing."""
    self.base_metadata = dict()
    # BeautifulSoup sections of the article — None until populated
    # (presumably during parse; confirm against the parse entry point)
    self.article_meta = None
    self.journal_meta = None
    self.back_meta = None
    # flag for records detected to be errata
    self.isErratum = False

def _detag(self, r, tags_keep):
    """
    Removes tags from input BeautifulSoup object.

    Tags in JATS_TAGS_DANGER are removed with their contents; tags in
    tags_keep are preserved; all other tags are unwrapped (tag removed,
    contents kept). Ampersand entities are normalized and whitespace is
    collapsed to single spaces.

    :param r: BeautifulSoup object (not string)
    :param tags_keep: this function will remove all tags except those passed here
    :return: newr: string with cleaned text
    """
    # note that parser=lxml is recommended here - if the more stringent lxml-xml is used,
    # the output is slightly different and the code will need to be modified
    newr = self.bsstrtodict(str(r), "lxml")
    if newr.find_all():
        tag_list = list(set([x.name for x in newr.find_all()]))
    else:
        tag_list = []
    for t in tag_list:
        elements = newr.find_all(t)
        for e in elements:
            if t in self.JATS_TAGS_DANGER:
                # drop the tag and everything inside it
                e.decompose()
            elif t in tags_keep:
                continue
            else:
                if t.lower() == "sc":
                    # small-caps content is uppercased before unwrapping
                    e.string = e.string.upper()
                e.unwrap()

    # Note: newr is converted from a bs4 object to a string here.
    # Everything after this point is string manipulation.
    newr = str(newr)

    # re-join "&<name>;" entity sequences captured by fix_ampersand
    amp_fix = self.fix_ampersand.findall(newr)
    for s in amp_fix:
        s_old = "".join(s)
        s_new = "&" + s[1] + ";"
        newr = newr.replace(s_old, s_new)

    # collapse all whitespace (spaces, newlines, carriage returns) to single spaces
    newr = re.sub("\\s+|\n+|\r+", " ", newr)
    newr = newr.replace(" ", " ")
    newr = newr.strip()

    return newr

def _get_date(self, d):
"""
Extract and standarize date from input BeautifulSoup date object
Expand Down Expand Up @@ -593,7 +522,7 @@ def _parse_title_abstract(self):
# all title footnotes:
for df in title_group.find_all("fn"):
key = df.get("id", None)
note = self._detag(df, self.JATS_TAGSET["abstract"]).strip()
note = self._detag(df, self.HTML_TAGSET["abstract"]).strip()
if key and note:
title_fn_dict[key] = note
df.decompose()
Expand All @@ -603,7 +532,7 @@ def _parse_title_abstract(self):
if title_fn_dict.get(key, None):
title_fn_list.append(title_fn_dict.get(key, None))
dx.decompose()
art_title = self._detag(title, self.JATS_TAGSET["title"]).strip()
art_title = self._detag(title, self.HTML_TAGSET["title"]).strip()
title_notes = []
if title_fn_list:
title_notes.extend(title_fn_list)
Expand All @@ -616,7 +545,7 @@ def _parse_title_abstract(self):
if title_fn_dict.get(key, None):
subtitle_fn_list.append(title_fn_dict.get(key, None))
dx.decompose()
sub_title = self._detag(subtitle, self.JATS_TAGSET["title"]).strip()
sub_title = self._detag(subtitle, self.HTML_TAGSET["title"]).strip()
subtitle_notes = []
if subtitle_fn_list:
subtitle_notes.extend(subtitle_fn_list)
Expand All @@ -634,14 +563,14 @@ def _parse_title_abstract(self):
abstract_all = self.article_meta.find("abstract").find_all("p")
abstract_paragraph_list = list()
for paragraph in abstract_all:
para = self._detag(paragraph, self.JATS_TAGSET["abstract"])
para = self._detag(paragraph, self.HTML_TAGSET["abstract"])
abstract_paragraph_list.append(para)
self.base_metadata["abstract"] = "\n".join(abstract_paragraph_list)
if title_fn_list:
self.base_metadata["abstract"] += " " + " ".join(title_fn_list)
else:
abs_raw = self.article_meta.find("abstract")
abs_txt = self._detag(abs_raw, self.JATS_TAGSET["abstract"])
abs_txt = self._detag(abs_raw, self.HTML_TAGSET["abstract"])
self.base_metadata["abstract"] = abs_txt

def _parse_author(self):
Expand Down Expand Up @@ -710,29 +639,29 @@ def _parse_keywords(self):
for kk in keys_uat_test:
# Check for UAT first:
if kk["content-type"] == "uat-code":
keyid = self._detag(kk, self.JATS_TAGSET["keywords"])
keyid = self._detag(kk, self.HTML_TAGSET["keywords"])
if kk["content-type"] == "term":
keystring = self._detag(kk, self.JATS_TAGSET["keywords"])
keystring = self._detag(kk, self.HTML_TAGSET["keywords"])

if keyid or keystring:
keys_uat.append({"string": keystring, "system": "UAT", "id": keyid})

if not keys_uat:
keys_misc_test = kg.find_all("kwd")
for kk in keys_misc_test:
keys_misc.append(self._detag(kk, self.JATS_TAGSET["keywords"]))
keys_misc.append(self._detag(kk, self.HTML_TAGSET["keywords"]))

# Then check for AAS:
if kg.get("kwd-group-type", "") == "AAS":
keys_aas_test = kg.find_all("kwd")
for kk in keys_aas_test:
keys_aas.append(self._detag(kk, self.JATS_TAGSET["keywords"]))
keys_aas.append(self._detag(kk, self.HTML_TAGSET["keywords"]))

# If all else fails, just search for 'kwd'
if (not keys_uat) and (not keys_aas):
keys_misc_test = kg.find_all("kwd")
for kk in keys_misc_test:
keys_misc.append(self._detag(kk, self.JATS_TAGSET["keywords"]))
keys_misc.append(self._detag(kk, self.HTML_TAGSET["keywords"]))

if keys_uat:
for k in keys_uat:
Expand Down Expand Up @@ -765,7 +694,7 @@ def _parse_keywords(self):
keys_out.append(
{
"system": "subject",
"string": self._detag(k, self.JATS_TAGSET["keywords"]),
"string": self._detag(k, self.HTML_TAGSET["keywords"]),
}
)

Expand Down
44 changes: 44 additions & 0 deletions tests/stubdata/input/zenodo_test.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
<record>
<!-- Test fixture: a minimal Zenodo OAI-PMH record in DataCite kernel-4
     format, used as stub input for the datacite parser tests. The empty
     IsVersionOf relatedIdentifier below is intentional test data. -->
<header>
<identifier>oai:zenodo.org:34650</identifier>
<datestamp>2020-01-20T17:28:18Z</datestamp>
</header>
<metadata>
<resource xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.3/metadata.xsd">
<identifier identifierType="DOI">10.5281/zenodo.34650</identifier>
<alternateIdentifiers>
<alternateIdentifier alternateIdentifierType="oai">oai:zenodo.org:34650</alternateIdentifier>
</alternateIdentifiers>
<creators>
<creator>
<creatorName nameType="Personal">Reetz, Johannes</creatorName>
<givenName>Johannes</givenName>
<familyName>Reetz</familyName>
<affiliation>Max Planck Computing and Data Facility, Garching, Germany</affiliation>
</creator>
</creators>
<titles>
<title>EUDAT - Open Data Services for Research</title>
</titles>
<publisher>Zenodo</publisher>
<publicationYear>2015</publicationYear>
<subjects>
<subject>Science data management</subject>
</subjects>
<dates>
<date dateType="Issued">2015-12-03</date>
</dates>
<resourceType resourceTypeGeneral="Text">Presentation</resourceType>
<relatedIdentifiers>
<relatedIdentifier relatedIdentifierType="DOI" relationType="IsVersionOf"></relatedIdentifier>
<relatedIdentifier relatedIdentifierType="URL" relationType="IsPartOf">https://zenodo.org/communities/sciops2015</relatedIdentifier>
</relatedIdentifiers>
<rightsList>
<rights rightsURI="https://creativecommons.org/licenses/by/4.0/legalcode" rightsIdentifierScheme="spdx" rightsIdentifier="cc-by-4.0">Creative Commons Attribution 4.0 International</rights>
</rightsList>
<descriptions>
<description descriptionType="Abstract">Presentation slides</description>
</descriptions>
</resource>
</metadata>
</record>
Loading
Loading