Skip to content

Commit

Permalink
JATS: Fixed edge case handling for collaborations with nested authors (#…
Browse files Browse the repository at this point in the history
…150)

* Edge case handling for nested collab authors in JATS

* testing failure cases

* testing failure cases 2

* testing failure cases 3

* jats: collab edge case handling fixed

* reverting testing changes

* lint fix

* lint fix

---------

Co-authored-by: Mugdha Polimera <[email protected]>
Co-authored-by: Mugdha Polimera <[email protected]>
  • Loading branch information
3 people authored Nov 14, 2024
1 parent 3cbfd45 commit 8bb2152
Show file tree
Hide file tree
Showing 9 changed files with 1,507 additions and 82 deletions.
126 changes: 90 additions & 36 deletions adsingestp/parsers/jats.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import OrderedDict
from copy import copy

import bs4
import validators
from ordered_set import OrderedSet

Expand Down Expand Up @@ -291,28 +292,48 @@ def parse(self, article_metadata):
else:
collab = contrib.find("collab")

collab_affil = ""
collab_name = collab.get_text()
if collab.find("address"):
collab_affil = collab.find("address").get_text()
# This is checking if a collaboration is listed as an author
if collab:
if type(collab.contents[0].get_text()) == str:
collab_name = collab.contents[0].get_text().strip()
else:
collab_name = collab.get_text().strip()

if collab.find("address"):
collab_affil = collab.find("address").get_text()
else:
collab_affil = []

self.collab = {
"collab": collab_name,
"aff": collab_affil,
"affid": [],
"xaff": [],
"xemail": [],
"email": [],
"corresp": False,
"rid": None,
"surname": "",
"given": "",
"native_lang": "",
"orcid": "",
}

self.collab = {
"collab": collab_name,
"aff": collab_affil,
"affid": [],
"xaff": [],
"xemail": [],
"email": [],
"corresp": False,
"rid": None,
}
if self.collab:
# add collab in the correct author position
if self.collab not in authors_out:
authors_out.append(self.collab)

# find nested collab authors and unnest them
nested_contribs = contrib.find_all("contrib")
collab_contribs = collab.find_all("contrib")
nested_contribs = []
for ncontrib in collab_contribs:
if ncontrib:
nested_contribs.append(copy(ncontrib))
ncontrib.decompose()

if not nested_contribs:
nested_contribs = contrib.find_all("contrib")

nested_idx = idx + 1
for nested_contrib in nested_contribs:
Expand All @@ -334,35 +355,58 @@ def parse(self, article_metadata):
authors_out[rid_match[0]] = author_tmp
else:
# add new collab tag to each unnested author
collabtag = copy(contrib.find("collab").find("institution"))
nested_contrib.append(collabtag)
contribs_raw.insert(nested_idx, nested_contrib.extract())
nested_idx += 1

continue

if contrib.find("collab") and contrib.find("collab").find(
"institution"
):
collabtag = copy(contrib.find("collab").find("institution"))
else:
collabtag = None

if not collabtag and collab_name:
collabtag_string = "<collab>" + collab_name + "</collab>"
collabtag = bs4.BeautifulSoup(collabtag_string, "xml").collab

if not collabtag:
collabtag = "ALLAUTH"

if collabtag:
nested_contrib.insert(0, collabtag)
contribs_raw.insert(nested_idx, nested_contrib.extract())
nested_idx += 1

# check if collabtag is present in the author attributes
collab = contrib.find("collab")

# Springer collab info for nested authors is given as <institution>
if not collab:
collab = contrib.find("institution")

if collab:
collab_affil = ""
collab_name = collab.get_text()
if type(collab.contents[0].get_text()) == str:
collab_name = collab.contents[0].get_text().strip()
else:
collab_name = collab.get_text().strip()

if collab.find("address"):
collab_affil = collab.find("address").get_text()

self.collab = {
"collab": collab_name,
"aff": collab_affil,
"affid": [],
"xaff": [],
"xemail": [],
"email": [],
"corresp": False,
"rid": None,
}
else:
collab_affil = ""

if not self.collab:
self.collab = {
"collab": collab_name,
"aff": collab_affil,
"affid": [],
"xaff": [],
"xemail": [],
"email": [],
"corresp": False,
"rid": None,
"surname": "",
"given": "",
"native_lang": "",
"orcid": "",
}

l_correspondent = False
if contrib.get("corresp", None) == "yes":
Expand Down Expand Up @@ -452,7 +496,7 @@ def parse(self, article_metadata):
contrib_id = contrib.find_all("contrib-id")
orcid = []
for c in contrib_id:
if c.get("contrib-id-type", "") == "orcid":
if (c.get("contrib-id-type", "") == "orcid") or ("orcid" in c.get_text()):
orcid.append(c.get_text(separator=" ").strip())
c.decompose()

Expand Down Expand Up @@ -490,6 +534,16 @@ def parse(self, article_metadata):
if collab:
auth["collab"] = collab_name

# Check if author is a duplicate of a collaboration
if auth["surname"] == "" and auth["collab"]:
# delete email and correspondence info for collabs
auth["email"] = []
auth["xemail"] = []
auth["corresp"] = False
# if the collab is already in author list, skip
if auth in authors_out:
continue

if contrib.get("contrib-type", "author") == "author":
authors_out.append(auth)
default_key = "ALLAUTH"
Expand Down
Loading

0 comments on commit 8bb2152

Please sign in to comment.