From 0c5b584c49798ea8cf94f2e9d15ab1f47d9837a8 Mon Sep 17 00:00:00 2001 From: Mugdha Polimera Date: Fri, 15 Dec 2023 08:48:16 -0500 Subject: [PATCH 1/3] Creating generic dublincore parser --- .vscode/settings.json | 2 +- adsingestp/parsers/arxiv.py | 87 +---------- adsingestp/parsers/dubcore.py | 146 ++++++++++++++++++ .../stubdata/input/dubcore_pos_ecrs_002.html | 33 ++++ tests/test_dublincore.py | 55 +++++++ 5 files changed, 238 insertions(+), 85 deletions(-) create mode 100644 adsingestp/parsers/dubcore.py create mode 100644 tests/stubdata/input/dubcore_pos_ecrs_002.html create mode 100644 tests/test_dublincore.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 667b00c..2cd3a39 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -5,7 +5,7 @@ "python.linting.enabled": true, "[python]": { "editor.codeActionsOnSave": { - "source.organizeImports": true + "source.organizeImports": "explicit" } }, "terminal.integrated.env.linux": { diff --git a/adsingestp/parsers/arxiv.py b/adsingestp/parsers/arxiv.py index 1ce06d1..d3e5f6e 100644 --- a/adsingestp/parsers/arxiv.py +++ b/adsingestp/parsers/arxiv.py @@ -1,14 +1,12 @@ import logging -from adsingestp import utils from adsingestp.ingest_exceptions import ( - MissingAuthorsException, - MissingTitleException, NoSchemaException, WrongSchemaException, XmlLoadException, ) -from adsingestp.parsers.base import BaseBeautifulSoupParser, IngestBase +from adsingestp.parsers.base import IngestBase +from adsingestp.parsers.dubcore import DublinCoreParser logger = logging.getLogger(__name__) @@ -33,11 +31,9 @@ def parse(self, text, header=False): return output_chunks -class ArxivParser(BaseBeautifulSoupParser): +class ArxivParser(DublinCoreParser): # Dublin Core parser for arXiv - DUBCORE_SCHEMA = ["http://www.openarchives.org/OAI/2.0/oai_dc/"] - author_collaborations_params = { "keywords": ["group", "team", "collaboration"], "remove_the": False, @@ -49,83 +45,6 @@ def __init__(self): self.input_header = None self.input_metadata = None - def _parse_ids(self): - if self.input_header.find("identifier"): - ids = self.input_header.find("identifier").get_text() - id_array = ids.split(":") - arxiv_id = id_array[-1] - - # TODO what should the key on this actually be? 
- self.base_metadata["publication"] = "eprint arXiv:" + arxiv_id - - self.base_metadata["ids"] = {"preprint": {}} - - self.base_metadata["ids"]["preprint"]["source"] = "arXiv" - self.base_metadata["ids"]["preprint"]["id"] = arxiv_id - - dc_ids = self.input_metadata.find_all("dc:identifier") - for d in dc_ids: - d_text = d.get_text() - if "doi:" in d_text: - self.base_metadata["ids"]["doi"] = d_text.replace("doi:", "") - - def _parse_title(self): - title_array = self.input_metadata.find_all("dc:title") - if title_array: - title_array_text = [i.get_text() for i in title_array] - if len(title_array) == 1: - self.base_metadata["title"] = self._clean_output(title_array_text[0]) - else: - self.base_metadata["title"] = self._clean_output(": ".join(title_array_text)) - else: - raise MissingTitleException("No title found") - - def _parse_author(self): - authors_out = [] - name_parser = utils.AuthorNames() - - author_array = self.input_metadata.find_all("dc:creator") - for a in author_array: - a = a.get_text() - parsed_name_list = name_parser.parse( - a, collaborations_params=self.author_collaborations_params - ) - for name in parsed_name_list: - authors_out.append(name) - - if not authors_out: - raise MissingAuthorsException("No contributors found for") - - self.base_metadata["authors"] = authors_out - - def _parse_pubdate(self): - if self.input_metadata.find("dc:date"): - self.base_metadata["pubdate_electronic"] = self.input_metadata.find( - "dc:date" - ).get_text() - - def _parse_abstract(self): - desc_array = self.input_metadata.find_all("dc:description") - # for arXiv.org, only 'dc:description'[0] is the abstract, the rest are comments - if desc_array: - self.base_metadata["abstract"] = self._clean_output(desc_array.pop(0).get_text()) - - if desc_array: - comments_out = [] - for d in desc_array: - comments_out.append({"origin": "arxiv", "text": self._clean_output(d.get_text())}) - - self.base_metadata["comments"] = comments_out - - def _parse_keywords(self): - keywords_array = self.input_metadata.find_all("dc:subject") - - if keywords_array: - keywords_out = [] - for k in keywords_array: - keywords_out.append({"system": "arxiv", "string": k.get_text()}) - self.base_metadata["keywords"] = keywords_out - def parse(self, text): """ Parse arXiv XML into standard JSON format diff --git a/adsingestp/parsers/dubcore.py b/adsingestp/parsers/dubcore.py new file mode 100644 index 0000000..655daaf --- /dev/null +++ b/adsingestp/parsers/dubcore.py @@ -0,0 +1,146 @@ +import logging + +from adsingestp import utils +from adsingestp.ingest_exceptions import ( + MissingAuthorsException, + MissingTitleException, + NoSchemaException, + WrongSchemaException, + XmlLoadException, +) +from adsingestp.parsers.base import BaseBeautifulSoupParser + +logger = logging.getLogger(__name__) + + +class DublinCoreParser(BaseBeautifulSoupParser): + # Generic Dublin Core parser + + DUBCORE_SCHEMA = ["http://www.openarchives.org/OAI/2.0/oai_dc/"] + + author_collaborations_params = { + "keywords": ["group", "team", "collaboration"], + "remove_the": False, + } + + def __init__(self): + self.base_metadata = {} + self.input_header = None + self.input_metadata = None + + def _parse_ids(self): + if self.input_header.find("identifier"): + ids = self.input_header.find("identifier").get_text() + id_array = ids.split(":") + + dubcore_id = id_array[-1] + source = id_array[1].split(".")[0] + + preprint_list = ["arXiv"] # TODO: Put in config file inside adsingest dir? 
+ + if source in preprint_list: + self.base_metadata["ids"] = {"preprint": {}} + self.base_metadata["ids"]["preprint"]["source"] = source + self.base_metadata["ids"]["preprint"]["id"] = dubcore_id + + self.base_metadata["publication"] = "eprint " + source + ":" + dubcore_id + + dc_ids = self.input_metadata.find_all("dc:identifier") + for d in dc_ids: + d_text = d.get_text() + if "doi:" in d_text: + self.base_metadata["ids"]["doi"] = d_text.replace("doi:", "") + + def _parse_title(self): + title_array = self.input_metadata.find_all("dc:title") + if title_array: + title_array_text = [i.get_text() for i in title_array] + if len(title_array) == 1: + self.base_metadata["title"] = self._clean_output(title_array_text[0]) + else: + self.base_metadata["title"] = self._clean_output(": ".join(title_array_text)) + else: + raise MissingTitleException("No title found") + + def _parse_author(self): + authors_out = [] + name_parser = utils.AuthorNames() + + author_array = self.input_metadata.find_all("dc:creator") + for a in author_array: + a = a.get_text() + parsed_name_list = name_parser.parse( + a, collaborations_params=self.author_collaborations_params + ) + for name in parsed_name_list: + authors_out.append(name) + + if not authors_out: + raise MissingAuthorsException("No contributors found for") + + self.base_metadata["authors"] = authors_out + + def _parse_pubdate(self): + if self.input_metadata.find("dc:date"): + self.base_metadata["pubdate_electronic"] = self.input_metadata.find( + "dc:date" + ).get_text() + + def _parse_abstract(self): + desc_array = self.input_metadata.find_all("dc:description") + # for arXiv.org, only 'dc:description'[0] is the abstract, the rest are comments + if desc_array: + self.base_metadata["abstract"] = self._clean_output(desc_array.pop(0).get_text()) + + if desc_array: + comments_out = [] + for d in desc_array: + # TODO: FIX + comments_out.append({"origin": "arxiv", "text": self._clean_output(d.get_text())}) + + self.base_metadata["comments"] = comments_out + + def _parse_keywords(self): + keywords_array = self.input_metadata.find_all("dc:subject") + + if keywords_array: + keywords_out = [] + for k in keywords_array: + # TODO: FIX + keywords_out.append({"system": "arxiv", "string": k.get_text()}) + self.base_metadata["keywords"] = keywords_out + + def parse(self, text): + """ + Parse arXiv XML into standard JSON format + :param text: string, contents of XML file + :return: parsed file contents in JSON format + """ + try: + d = self.bsstrtodict(text, parser="lxml-xml") + except Exception as err: + raise XmlLoadException(err) + + if d.find("record"): + self.input_header = d.find("record").find("header") + if d.find("record") and d.find("record").find("metadata"): + self.input_metadata = d.find("record").find("metadata").find("oai_dc:dc") + + schema_spec = self.input_metadata.get("xmlns:oai_dc", "") + if not schema_spec: + raise NoSchemaException("Unknown record schema.") + elif schema_spec not in self.DUBCORE_SCHEMA: + raise WrongSchemaException("Wrong schema.") + + self._parse_ids() + self._parse_title() + self._parse_author() + self._parse_pubdate() + self._parse_abstract() + self._parse_keywords() + + self.base_metadata = self._entity_convert(self.base_metadata) + + output = self.format(self.base_metadata, format="OtherXML") + + return output diff --git a/tests/stubdata/input/dubcore_pos_ecrs_002.html b/tests/stubdata/input/dubcore_pos_ecrs_002.html new file mode 100644 index 0000000..1d3bac4 --- /dev/null +++ b/tests/stubdata/input/dubcore_pos_ecrs_002.html @@ -0,0 
+1,33 @@
+<record xmlns="http://www.openarchives.org/OAI/2.0/">
+    <header>
+        <identifier>oai:pos.sissa.it:ECRS/002</identifier>
+        <datestamp>2023-02-15</datestamp>
+        <setSpec>conference:ECRS</setSpec>
+        <setSpec>group:14</setSpec>
+    </header>
+    <metadata>
+        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
+                   xmlns:dc="http://purl.org/dc/elements/1.1/"
+                   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+                   xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
+            <dc:title>The Memories of the First European Cosmic Ray Symposium: Łódź 1968</dc:title>
+            <dc:creator>Alan Watson</dc:creator>
+            <dc:subject>Astroparticle Physics</dc:subject>
+            <dc:description>The origins of the series of European Cosmic-Ray Symposia are briefly described. The first
+                meeting in the seri
+                es, on ‘Hadronic Interactions and Extensive Air Showers’, held in Łódź, Poland in 1968, was attended by
+                the author: some memories are recounted.</dc:description>
+            <dc:publisher>Sissa Medialab</dc:publisher>
+            <dc:date>2023-02-15</dc:date>
+            <dc:type>Text</dc:type>
+            <dc:format>application/pdf</dc:format>
+            <dc:identifier>PoS(ECRS)002</dc:identifier>
+            <dc:identifier>10.22323/1.423.0002</dc:identifier>
+            <dc:identifier>https://pos.sissa.it/423/002/</dc:identifier>
+            <dc:language>en</dc:language>
+            <dc:relation>ECRS (27th European Cosmic Ray Symposium) Opening; isPartOf</dc:relation>
+            <dc:rights>Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License (CC BY-NC-ND
+                4.0)</dc:rights>
+        </oai_dc:dc>
+    </metadata>
+</record>
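
A minimal usage sketch of the new generic parser against the single-record stub above. `DublinCoreParser` and its `parse()` method come from this patch; the field accessed at the end is an assumption drawn from the shape of the expected-output JSON files in the test suite:

    from adsingestp.parsers.dubcore import DublinCoreParser

    # read the stub record as bytes, the same way the unit tests do;
    # parse() hands the raw text to BeautifulSoup with the lxml-xml parser
    with open("tests/stubdata/input/dubcore_pos_ecrs_002.html", "rb") as fp:
        data = fp.read()

    parser = DublinCoreParser()
    record = parser.parse(data)  # standard ingest dict (cf. tests/stubdata/output/)

    # e.g. "The Memories of the First European Cosmic Ray Symposium: Łódź 1968"
    print(record["title"]["textEnglish"])
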
diff --git a/tests/test_dublincore.py b/tests/test_dublincore.py new file mode 100644 index 0000000..a266c25 --- /dev/null +++ b/tests/test_dublincore.py @@ -0,0 +1,55 @@ +import datetime +import json +import os +import unittest + +from adsingestschema import ads_schema_validator + +from adsingestp.parsers import dubcore + +TIMESTAMP_FMT = "%Y-%m-%dT%H:%M:%S.%fZ" + + +class TestDublinCore(unittest.TestCase): + def setUp(self): + stubdata_dir = os.path.join(os.path.dirname(__file__), "stubdata/") + self.inputdir = os.path.join(stubdata_dir, "input") + self.outputdir = os.path.join(stubdata_dir, "output") + + def test_dubcore(self): + filenames = [ + "arxiv_1711_05739", + "arxiv_0901_2443", + "arxiv_1711_04702", + "arxiv_math_0306266", + ] + for f in filenames: + test_infile = os.path.join(self.inputdir, f + ".xml") + test_outfile = os.path.join(self.outputdir, f + ".json") + parser = dubcore.DublinCoreParser() + + with open(test_infile, "rb") as fp: + input_data = fp.read() + + with open(test_outfile, "rb") as fp: + output_text = fp.read() + output_data = json.loads(output_text) + + parsed = parser.parse(input_data) + + # make sure this is valid schema + try: + ads_schema_validator().validate(parsed) + except Exception: + self.fail("Schema validation failed") + pass + + # this field won't match the test data, so check and then discard + time_difference = ( + datetime.datetime.strptime(parsed["recordData"]["parsedTime"], TIMESTAMP_FMT) + - datetime.datetime.utcnow() + ) + self.assertTrue(abs(time_difference) < datetime.timedelta(seconds=10)) + parsed["recordData"]["parsedTime"] = "" + + self.assertEqual(parsed, output_data) From e6a6c6e059ea8f86d0196f95d54dbddfa6d1bd42 Mon Sep 17 00:00:00 2001 From: Mugdha Polimera Date: Thu, 4 Jan 2024 13:16:31 -0500 Subject: [PATCH 2/3] Replaced arxiv parser with dubcore parser --- adsingestp/parsers/arxiv.py | 81 ------------ adsingestp/parsers/dubcore.py | 64 +++++---- tests/stubdata/input/dubcore_pos_ecrs_002.xml | 33 +++++ tests/stubdata/output/arxiv_0901_2443.json | 125 +++++++++++++++--- tests/stubdata/output/arxiv_1711_04702.json | 93 +++++++++++-- tests/stubdata/output/arxiv_1711_05739.json | 74 +++++++++-- tests/stubdata/output/arxiv_math_0306266.json | 71 ++++++++-- .../stubdata/output/dubcore_pos_ecrs_002.json | 54 ++++++++ tests/test_arxiv.py | 93 ------------- tests/test_dublincore.py | 38 ++++++ 10 files changed, 474 insertions(+), 252 deletions(-) delete mode 100644 adsingestp/parsers/arxiv.py create mode 100644 tests/stubdata/input/dubcore_pos_ecrs_002.xml create mode 100644 tests/stubdata/output/dubcore_pos_ecrs_002.json delete mode 100644 tests/test_arxiv.py diff --git a/adsingestp/parsers/arxiv.py b/adsingestp/parsers/arxiv.py deleted file mode 100644 index d3e5f6e..0000000 --- a/adsingestp/parsers/arxiv.py +++ /dev/null @@ -1,81 +0,0 @@ -import logging - -from adsingestp.ingest_exceptions import ( - NoSchemaException, - WrongSchemaException, - XmlLoadException, -) -from adsingestp.parsers.base import IngestBase -from adsingestp.parsers.dubcore import DublinCoreParser - -logger = logging.getLogger(__name__) - - -class MultiArxivParser(IngestBase): - start_re = r"]*>" - end_re = r"]*>" - - def parse(self, text, header=False): - """ - Separate multi-record arXiv XML document into individual XML documents - - :param text: string, input XML text from a multi-record XML document - :param header: boolean (default: False), set to True to preserve overall - document header/footer for each separate record's document - :return: list, each item is 
the XML of a separate arXiv document - """ - output_chunks = [] - for chunk in self.get_chunks(text, self.start_re, self.end_re, head_foot=header): - output_chunks.append(chunk.strip()) - - return output_chunks - - -class ArxivParser(DublinCoreParser): - # Dublin Core parser for arXiv - - author_collaborations_params = { - "keywords": ["group", "team", "collaboration"], - "remove_the": False, - "fix_arXiv_mixed_collaboration_string": True, - } - - def __init__(self): - self.base_metadata = {} - self.input_header = None - self.input_metadata = None - - def parse(self, text): - """ - Parse arXiv XML into standard JSON format - :param text: string, contents of XML file - :return: parsed file contents in JSON format - """ - try: - d = self.bsstrtodict(text, parser="lxml-xml") - except Exception as err: - raise XmlLoadException(err) - - if d.find("record"): - self.input_header = d.find("record").find("header") - if d.find("record") and d.find("record").find("metadata"): - self.input_metadata = d.find("record").find("metadata").find("oai_dc:dc") - - schema_spec = self.input_metadata.get("xmlns:oai_dc", "") - if not schema_spec: - raise NoSchemaException("Unknown record schema.") - elif schema_spec not in self.DUBCORE_SCHEMA: - raise WrongSchemaException("Wrong schema.") - - self._parse_ids() - self._parse_title() - self._parse_author() - self._parse_pubdate() - self._parse_abstract() - self._parse_keywords() - - self.base_metadata = self._entity_convert(self.base_metadata) - - output = self.format(self.base_metadata, format="OtherXML") - - return output diff --git a/adsingestp/parsers/dubcore.py b/adsingestp/parsers/dubcore.py index 655daaf..6f9a1c1 100644 --- a/adsingestp/parsers/dubcore.py +++ b/adsingestp/parsers/dubcore.py @@ -8,11 +8,31 @@ WrongSchemaException, XmlLoadException, ) -from adsingestp.parsers.base import BaseBeautifulSoupParser +from adsingestp.parsers.base import BaseBeautifulSoupParser, IngestBase logger = logging.getLogger(__name__) +class MultiDublinCoreParser(IngestBase): + start_re = r"]*>" + end_re = r"]*>" + + def parse(self, text, header=False): + """ + Separate multi-record DublinCore XML document into individual XML documents + + :param text: string, input XML text from a multi-record XML document + :param header: boolean (default: False), set to True to preserve overall + document header/footer for each separate record's document + :return: list, each item is the XML of a separate DublinCore document + """ + output_chunks = [] + for chunk in self.get_chunks(text, self.start_re, self.end_re, head_foot=header): + output_chunks.append(chunk.strip()) + + return output_chunks + + class DublinCoreParser(BaseBeautifulSoupParser): # Generic Dublin Core parser @@ -29,27 +49,22 @@ def __init__(self): self.input_metadata = None def _parse_ids(self): - if self.input_header.find("identifier"): - ids = self.input_header.find("identifier").get_text() - id_array = ids.split(":") - - dubcore_id = id_array[-1] - source = id_array[1].split(".")[0] - - preprint_list = ["arXiv"] # TODO: Put in config file inside adsingest dir? 
- - if source in preprint_list: - self.base_metadata["ids"] = {"preprint": {}} - self.base_metadata["ids"]["preprint"]["source"] = source - self.base_metadata["ids"]["preprint"]["id"] = dubcore_id + self.base_metadata["ids"] = {} + self.base_metadata["ids"]["pub-id"] = [] - self.base_metadata["publication"] = "eprint " + source + ":" + dubcore_id + if self.input_header.find("identifier"): + self.base_metadata["ids"]["pub-id"].append( + { + "attribute": "publisher-id", + "Identifier": self.input_header.find("identifier").get_text(), + } + ) - dc_ids = self.input_metadata.find_all("dc:identifier") - for d in dc_ids: - d_text = d.get_text() - if "doi:" in d_text: - self.base_metadata["ids"]["doi"] = d_text.replace("doi:", "") + if self.input_metadata.find("dc:identifier"): + for dc_id in self.input_metadata.find_all("dc:identifier"): + self.base_metadata["ids"]["pub-id"].append( + {"attribute": "publisher-id", "Identifier": dc_id.get_text()} + ) def _parse_title(self): title_array = self.input_metadata.find_all("dc:title") @@ -88,7 +103,7 @@ def _parse_pubdate(self): def _parse_abstract(self): desc_array = self.input_metadata.find_all("dc:description") - # for arXiv.org, only 'dc:description'[0] is the abstract, the rest are comments + # in general, only 'dc:description'[0] is the abstract, the rest are comments if desc_array: self.base_metadata["abstract"] = self._clean_output(desc_array.pop(0).get_text()) @@ -96,7 +111,7 @@ def _parse_abstract(self): comments_out = [] for d in desc_array: # TODO: FIX - comments_out.append({"origin": "arxiv", "text": self._clean_output(d.get_text())}) + comments_out.append({"text": self._clean_output(d.get_text())}) self.base_metadata["comments"] = comments_out @@ -106,13 +121,12 @@ def _parse_keywords(self): if keywords_array: keywords_out = [] for k in keywords_array: - # TODO: FIX - keywords_out.append({"system": "arxiv", "string": k.get_text()}) + keywords_out.append({"string": k.get_text()}) self.base_metadata["keywords"] = keywords_out def parse(self, text): """ - Parse arXiv XML into standard JSON format + Parse DublinCore XML into standard JSON format :param text: string, contents of XML file :return: parsed file contents in JSON format """ diff --git a/tests/stubdata/input/dubcore_pos_ecrs_002.xml b/tests/stubdata/input/dubcore_pos_ecrs_002.xml new file mode 100644 index 0000000..1d3bac4 --- /dev/null +++ b/tests/stubdata/input/dubcore_pos_ecrs_002.xml @@ -0,0 +1,33 @@ + +
+<record xmlns="http://www.openarchives.org/OAI/2.0/">
+    <header>
+        <identifier>oai:pos.sissa.it:ECRS/002</identifier>
+        <datestamp>2023-02-15</datestamp>
+        <setSpec>conference:ECRS</setSpec>
+        <setSpec>group:14</setSpec>
+    </header>
+    <metadata>
+        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
+                   xmlns:dc="http://purl.org/dc/elements/1.1/"
+                   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+                   xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
+            <dc:title>The Memories of the First European Cosmic Ray Symposium: Łódź 1968</dc:title>
+            <dc:creator>Alan Watson</dc:creator>
+            <dc:subject>Astroparticle Physics</dc:subject>
+            <dc:description>The origins of the series of European Cosmic-Ray Symposia are briefly described. The first
+                meeting in the seri
+                es, on ‘Hadronic Interactions and Extensive Air Showers’, held in Łódź, Poland in 1968, was attended by
+                the author: some memories are recounted.</dc:description>
+            <dc:publisher>Sissa Medialab</dc:publisher>
+            <dc:date>2023-02-15</dc:date>
+            <dc:type>Text</dc:type>
+            <dc:format>application/pdf</dc:format>
+            <dc:identifier>PoS(ECRS)002</dc:identifier>
+            <dc:identifier>10.22323/1.423.0002</dc:identifier>
+            <dc:identifier>https://pos.sissa.it/423/002/</dc:identifier>
+            <dc:language>en</dc:language>
+            <dc:relation>ECRS (27th European Cosmic Ray Symposium) Opening; isPartOf</dc:relation>
+            <dc:rights>Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License (CC BY-NC-ND
+                4.0)</dc:rights>
+        </oai_dc:dc>
+    </metadata>
+</record>
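
This patch also moves the multi-record splitter into the module as `MultiDublinCoreParser`. A sketch of the intended two-stage pipeline, assuming the multi-record arXiv stub already used by the tests; chaining the splitter into `DublinCoreParser` this way is not itself exercised by the test suite:

    from adsingestp.parsers.dubcore import DublinCoreParser, MultiDublinCoreParser

    with open("tests/stubdata/input/arxiv_multi_20230125.xml", "r") as fp:
        raw = fp.read()

    # split the OAI-PMH document on <record>...</record> boundaries;
    # header=True keeps the overall document header/footer around each chunk
    chunks = MultiDublinCoreParser().parse(raw, header=True)

    # a fresh parser per record, since base_metadata is only reset in __init__
    records = [DublinCoreParser().parse(chunk) for chunk in chunks]
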
diff --git a/tests/stubdata/output/arxiv_0901_2443.json b/tests/stubdata/output/arxiv_0901_2443.json index c9d1df5..4fc1e5e 100644 --- a/tests/stubdata/output/arxiv_0901_2443.json +++ b/tests/stubdata/output/arxiv_0901_2443.json @@ -1,17 +1,108 @@ -{"abstract": {"textEnglish": "The $^{112,120}$Sn$(\\gamma,\\gamma')$ reactions have been studied at the S-DALINAC. Electric dipole (E1) strength distributions have been determined including contributions from unresolved strength extracted by a fluctuation analysis. Together with available data on $^{116,124}$Sn, an experimental systematics of the pygmy dipole resonance (PDR) in stable even-mass tin isotopes is established. The PDR centroid excitation energies and summed strengths are in reasonable agreement with quasiparticle-phonon model calculations based on a nonrelativistic description of the mean field but disagree with relativistic quasiparticle random-phase approximation predictions."}, - "authors": [{"name": {"given_name": "B.", "pubraw": "Özel, B.", "surname": "Özel"}}, - {"name": {"given_name": "J.", "pubraw": "Enders, J.", "surname": "Enders"}}, - {"name": {"given_name": "H.", "pubraw": "Lenske, H.", "surname": "Lenske"}}, - {"name": {"given_name": "P.", "pubraw": "von Neumann-Cosel, P.", "surname": "von Neumann-Cosel"}}, - {"name": {"given_name": "I.", "pubraw": "Poltoratska, I.", "surname": "Poltoratska"}}, - {"name": {"given_name": "V.", "middle_name": "Yu.", "pubraw": "Ponomarev, V. Yu.", "surname": "Ponomarev"}}, - {"name": {"given_name": "A.", "pubraw": "Richter, A.", "surname": "Richter"}}, - {"name": {"given_name": "D.", "pubraw": "Savran, D.", "surname": "Savran"}}, - {"name": {"given_name": "N.", "pubraw": "Tsoneva, N.", "surname": "Tsoneva"}}], - "comments": [{"commentOrigin": "arxiv", "commentText": "Comment: submitted to Phys. Lett. B"}], - "keywords": [{"keyString": "Nuclear Experiment", "keySystem": "arxiv"}], - "persistentIDs": [{"preprint": {"identifier": "0901.2443", "source": "arXiv"}}], - "pubDate": {"electrDate": "2009-01-16"}, - "publication": {"pubName": "eprint arXiv:0901.2443", "pubYear": "2009"}, - "recordData": {"createdTime": "", "loadFormat": "OtherXML", "loadLocation": "", "loadType": "fromFile", "parsedTime": "", "recordOrigin": ""}, - "title": {"textEnglish": "Excitation energy and strength of the pygmy dipole resonance in stable tin isotopes"}} +{ + "abstract": { + "textEnglish": "The $^{112,120}$Sn$(\\gamma,\\gamma')$ reactions have been studied at the S-DALINAC. Electric dipole (E1) strength distributions have been determined including contributions from unresolved strength extracted by a fluctuation analysis. Together with available data on $^{116,124}$Sn, an experimental systematics of the pygmy dipole resonance (PDR) in stable even-mass tin isotopes is established. The PDR centroid excitation energies and summed strengths are in reasonable agreement with quasiparticle-phonon model calculations based on a nonrelativistic description of the mean field but disagree with relativistic quasiparticle random-phase approximation predictions." 
+ }, + "authors": [ + { + "name": { + "given_name": "B.", + "pubraw": "\u00d6zel, B.", + "surname": "\u00d6zel" + } + }, + { + "name": { + "given_name": "J.", + "pubraw": "Enders, J.", + "surname": "Enders" + } + }, + { + "name": { + "given_name": "H.", + "pubraw": "Lenske, H.", + "surname": "Lenske" + } + }, + { + "name": { + "given_name": "P.", + "pubraw": "von Neumann-Cosel, P.", + "surname": "von Neumann-Cosel" + } + }, + { + "name": { + "given_name": "I.", + "pubraw": "Poltoratska, I.", + "surname": "Poltoratska" + } + }, + { + "name": { + "given_name": "V.", + "middle_name": "Yu.", + "pubraw": "Ponomarev, V. Yu.", + "surname": "Ponomarev" + } + }, + { + "name": { + "given_name": "A.", + "pubraw": "Richter, A.", + "surname": "Richter" + } + }, + { + "name": { + "given_name": "D.", + "pubraw": "Savran, D.", + "surname": "Savran" + } + }, + { + "name": { + "given_name": "N.", + "pubraw": "Tsoneva, N.", + "surname": "Tsoneva" + } + } + ], + "comments": [ + { + "commentText": "Comment: submitted to Phys. Lett. B" + } + ], + "keywords": [ + { + "keyString": "Nuclear Experiment" + } + ], + "pubDate": { + "electrDate": "2009-01-16" + }, + "publication": { + "pubYear": "2009" + }, + "publisherIDs": [ + { + "Identifier": "oai:arXiv.org:0901.2443", + "attribute": "publisher-id" + }, + { + "Identifier": "http://arxiv.org/abs/0901.2443", + "attribute": "publisher-id" + } + ], + "recordData": { + "createdTime": "", + "loadFormat": "OtherXML", + "loadLocation": "", + "loadType": "fromFile", + "parsedTime": "", + "recordOrigin": "" + }, + "title": { + "textEnglish": "Excitation energy and strength of the pygmy dipole resonance in stable tin isotopes" + } +} diff --git a/tests/stubdata/output/arxiv_1711_04702.json b/tests/stubdata/output/arxiv_1711_04702.json index c5e5f48..7535aad 100644 --- a/tests/stubdata/output/arxiv_1711_04702.json +++ b/tests/stubdata/output/arxiv_1711_04702.json @@ -1,13 +1,80 @@ -{"abstract": {"textEnglish": "Background: Gene co-expression network analyses have become a central approach for the systems-level analysis of biological data. Several software packages exist for generating and analyzing such networks, either from correlation scores or the absolute value of a transformed score called weighted topological overlap (wTO). However, since some genes are able to up- or down-regulate other genes, it is important to explicitly consider both positive and negative correlations when constructing a gene co-expression network. Additionally, there has been a growing interest in the systematic comparison of multiple networks to identify deferentially changed links. Typically, such analyses are focused on the comparison of networks or data from two different conditions. Results: Here, we present an R package for calculating the weighted topological overlap (wTO), that explicitly addresses the sign of wTO values. The package includes the calculation of p-values (raw and adjusted) for each pairwise gene score. Our package also allows the calculation of networks from time series, without replicates. Additionally, our R package incorporates a novel method for calculating a consensus network (CN) from two or more networks. To visualize the resulting networks, the R package contains a visualization tool which allows for the direct network manipulation and access of node and link information. When testing the package on a standard laptop computer, we can conduct all calculations for systems of 20,000 genes in under two hours. 
Conclusion: In this work, we developed an R package that allows the computation of wTO networks, CNs and a visualization tool in the R statistical environment. It is publicly available on CRAN repositories under the GPL-2 Open Source License (https://cran.r-project.org/web/packages/wTO/)."}, - "authors": [{"name": {"given_name": "Deisy", "pubraw": "Gysi, Deisy Morselli", "surname": "Morselli Gysi"}}, - {"name": {"given_name": "Andre", "pubraw": "Voigt, Andre", "surname": "Voigt"}}, - {"name": {"given_name": "Tiago", "pubraw": "Fragoso, Tiago de Miranda", "middle_name": "de Miranda", "surname": "Fragoso"}}, - {"name": {"given_name": "Eivind", "pubraw": "Almaas, Eivind", "surname": "Almaas"}}, - {"name": {"given_name": "Katja", "pubraw": "Nowick, Katja", "surname": "Nowick"}}], - "comments": [{"commentOrigin": "arxiv", "commentText": "Comment: 13 pages, 3 Figures"}], - "keywords": [{"keyString": "Quantitative Biology - Molecular Networks", "keySystem": "arxiv"}], - "persistentIDs": [{"preprint": {"identifier": "1711.04702", "source": "arXiv"}}], - "pubDate": {"electrDate": "2017-11-13"}, - "publication": {"pubName": "eprint arXiv:1711.04702", "pubYear": "2017"}, - "recordData": {"createdTime": "", "loadFormat": "OtherXML", "loadLocation": "", "loadType": "fromFile", "parsedTime": "", "recordOrigin": ""}, - "title": {"textEnglish": "wTO: an R package for computing weighted topological overlap and consensus networks with an integrated visualization tool"}} +{ + "abstract": { + "textEnglish": "Background: Gene co-expression network analyses have become a central approach for the systems-level analysis of biological data. Several software packages exist for generating and analyzing such networks, either from correlation scores or the absolute value of a transformed score called weighted topological overlap (wTO). However, since some genes are able to up- or down-regulate other genes, it is important to explicitly consider both positive and negative correlations when constructing a gene co-expression network. Additionally, there has been a growing interest in the systematic comparison of multiple networks to identify deferentially changed links. Typically, such analyses are focused on the comparison of networks or data from two different conditions. Results: Here, we present an R package for calculating the weighted topological overlap (wTO), that explicitly addresses the sign of wTO values. The package includes the calculation of p-values (raw and adjusted) for each pairwise gene score. Our package also allows the calculation of networks from time series, without replicates. Additionally, our R package incorporates a novel method for calculating a consensus network (CN) from two or more networks. To visualize the resulting networks, the R package contains a visualization tool which allows for the direct network manipulation and access of node and link information. When testing the package on a standard laptop computer, we can conduct all calculations for systems of 20,000 genes in under two hours. Conclusion: In this work, we developed an R package that allows the computation of wTO networks, CNs and a visualization tool in the R statistical environment. It is publicly available on CRAN repositories under the GPL-2 Open Source License (https://cran.r-project.org/web/packages/wTO/)." 
+ }, + "authors": [ + { + "name": { + "given_name": "Deisy", + "pubraw": "Gysi, Deisy Morselli", + "surname": "Morselli Gysi" + } + }, + { + "name": { + "given_name": "Andre", + "pubraw": "Voigt, Andre", + "surname": "Voigt" + } + }, + { + "name": { + "given_name": "Tiago", + "middle_name": "de Miranda", + "pubraw": "Fragoso, Tiago de Miranda", + "surname": "Fragoso" + } + }, + { + "name": { + "given_name": "Eivind", + "pubraw": "Almaas, Eivind", + "surname": "Almaas" + } + }, + { + "name": { + "given_name": "Katja", + "pubraw": "Nowick, Katja", + "surname": "Nowick" + } + } + ], + "comments": [ + { + "commentText": "Comment: 13 pages, 3 Figures" + } + ], + "keywords": [ + { + "keyString": "Quantitative Biology - Molecular Networks" + } + ], + "pubDate": { + "electrDate": "2017-11-13" + }, + "publication": { + "pubYear": "2017" + }, + "publisherIDs": [ + { + "Identifier": "oai:arXiv.org:1711.04702", + "attribute": "publisher-id" + }, + { + "Identifier": "http://arxiv.org/abs/1711.04702", + "attribute": "publisher-id" + } + ], + "recordData": { + "createdTime": "", + "loadFormat": "OtherXML", + "loadLocation": "", + "loadType": "fromFile", + "parsedTime": "", + "recordOrigin": "" + }, + "title": { + "textEnglish": "wTO: an R package for computing weighted topological overlap and consensus networks with an integrated visualization tool" + } +} diff --git a/tests/stubdata/output/arxiv_1711_05739.json b/tests/stubdata/output/arxiv_1711_05739.json index 2d45fa7..4476143 100644 --- a/tests/stubdata/output/arxiv_1711_05739.json +++ b/tests/stubdata/output/arxiv_1711_05739.json @@ -1,9 +1,65 @@ -{"abstract": {"textEnglish": "We explore the occurrence and detectability of planet-planet occultations (PPOs) in exoplanet systems. These are events during which a planet occults the disk of another planet in the same system, imparting a small photometric signal as its thermal or reflected light is blocked. We focus on the planets in TRAPPIST-1, whose orbital planes we show are aligned to within 0.3 degrees at 90% confidence. We present a photodynamical model for predicting and computing PPOs in TRAPPIST-1 and other systems for various assumptions of the planets' atmospheric states. When marginalizing over the uncertainties on all orbital parameters, we find that the rate of PPOs in TRAPPIST-1 is about 1.4 per day. We investigate the prospects for detection of these events with the James Webb Space Telescope, finding that ~10-20 occultations per year of b and c should be above the noise level at 12-15 microns. Joint modeling of several of these PPOs could lead to a robust detection. Alternatively, observations with the proposed Origins Space Telescope should be able to detect individual PPOs at high signal-to-noise. We show how PPOs can be used to break transit timing variation degeneracies, imposing strong constraints on the eccentricities and masses of the planets, as well as to constrain the longitudes of nodes and thus the complete three-dimensional structure of the system. We further show how modeling of these events can be used to reveal a planet's day/night temperature contrast and construct crude surface maps. 
We make our photodynamical code available on github."}, - "authors": [{"name": {"given_name": "Rodrigo", "pubraw": "Luger, Rodrigo", "surname": "Luger"}}, {"name": {"given_name": "Jacob", "pubraw": "Lustig-Yaeger, Jacob", "surname": "Lustig-Yaeger"}}, {"name": {"given_name": "Eric", "pubraw": "Agol, Eric", "surname": "Agol"}}], - "comments": [{"commentOrigin": "arxiv", "commentText": "Comment: 36 pages, 25 figures. Accepted to ApJ. Multi-purpose photodynamical code available at github.com/rodluger/planetplanet"}], - "keywords": [{"keyString": "Astrophysics - Earth and Planetary Astrophysics", "keySystem": "arxiv"}], - "persistentIDs": [{"preprint": {"identifier": "1711.05739", "source": "arXiv"}}], - "pubDate": {"electrDate": "2017-11-15"}, - "publication": {"pubName": "eprint arXiv:1711.05739", "pubYear": "2017"}, - "recordData": {"createdTime": "", "loadFormat": "OtherXML", "loadLocation": "", "loadType": "fromFile", "parsedTime": "", "recordOrigin": ""}, - "title": {"textEnglish": "Planet-Planet Occultations in TRAPPIST-1 and Other Exoplanet Systems"}} +{ + "abstract": { + "textEnglish": "We explore the occurrence and detectability of planet-planet occultations (PPOs) in exoplanet systems. These are events during which a planet occults the disk of another planet in the same system, imparting a small photometric signal as its thermal or reflected light is blocked. We focus on the planets in TRAPPIST-1, whose orbital planes we show are aligned to within 0.3 degrees at 90% confidence. We present a photodynamical model for predicting and computing PPOs in TRAPPIST-1 and other systems for various assumptions of the planets' atmospheric states. When marginalizing over the uncertainties on all orbital parameters, we find that the rate of PPOs in TRAPPIST-1 is about 1.4 per day. We investigate the prospects for detection of these events with the James Webb Space Telescope, finding that ~10-20 occultations per year of b and c should be above the noise level at 12-15 microns. Joint modeling of several of these PPOs could lead to a robust detection. Alternatively, observations with the proposed Origins Space Telescope should be able to detect individual PPOs at high signal-to-noise. We show how PPOs can be used to break transit timing variation degeneracies, imposing strong constraints on the eccentricities and masses of the planets, as well as to constrain the longitudes of nodes and thus the complete three-dimensional structure of the system. We further show how modeling of these events can be used to reveal a planet's day/night temperature contrast and construct crude surface maps. We make our photodynamical code available on github." + }, + "authors": [ + { + "name": { + "given_name": "Rodrigo", + "pubraw": "Luger, Rodrigo", + "surname": "Luger" + } + }, + { + "name": { + "given_name": "Jacob", + "pubraw": "Lustig-Yaeger, Jacob", + "surname": "Lustig-Yaeger" + } + }, + { + "name": { + "given_name": "Eric", + "pubraw": "Agol, Eric", + "surname": "Agol" + } + } + ], + "comments": [ + { + "commentText": "Comment: 36 pages, 25 figures. Accepted to ApJ. 
Multi-purpose photodynamical code available at github.com/rodluger/planetplanet" + } + ], + "keywords": [ + { + "keyString": "Astrophysics - Earth and Planetary Astrophysics" + } + ], + "pubDate": { + "electrDate": "2017-11-15" + }, + "publication": { + "pubYear": "2017" + }, + "publisherIDs": [ + { + "Identifier": "oai:arXiv.org:1711.05739", + "attribute": "publisher-id" + }, + { + "Identifier": "http://arxiv.org/abs/1711.05739", + "attribute": "publisher-id" + } + ], + "recordData": { + "createdTime": "", + "loadFormat": "OtherXML", + "loadLocation": "", + "loadType": "fromFile", + "parsedTime": "", + "recordOrigin": "" + }, + "title": { + "textEnglish": "Planet-Planet Occultations in TRAPPIST-1 and Other Exoplanet Systems" + } +} diff --git a/tests/stubdata/output/arxiv_math_0306266.json b/tests/stubdata/output/arxiv_math_0306266.json index 4091d53..f39cec9 100644 --- a/tests/stubdata/output/arxiv_math_0306266.json +++ b/tests/stubdata/output/arxiv_math_0306266.json @@ -1,14 +1,57 @@ -{"abstract": {"textEnglish": "We study the Lovasz number theta along with two further SDP relaxations theta1, theta1/2 of the independence number and the corresponding relaxations of the chromatic number on random graphs G(n,p). We prove that these relaxations are concentrated about their means Moreover, extending a result of Juhasz, we compute the asymptotic value of the relaxations for essentially the entire range of edge probabilities p. As an application, we give an improved algorithm for approximating the independence number in polynomial expected time, thereby extending a result of Krivelevich and Vu. We also improve on the analysis of an algorithm of Krivelevich for deciding whether G(n,p) is k-colorable."}, - "authors": [{"name": {"given_name": "Amin", "pubraw": "Coja-Oghlan, Amin", "surname": "Coja-Oghlan"}}], - "keywords": [{"keyString": "Mathematics - Combinatorics", "keySystem": "arxiv"}, - {"keyString": "05C80, 05C15", "keySystem": "arxiv"}], - "persistentIDs": [{"DOI": "10.1017/S0963548305006826", "preprint": {"identifier": "math/0306266", "source": "arXiv"}}], - "pubDate": {"electrDate": "2003-06-18"}, - "publication": {"pubName": "eprint arXiv:math/0306266", "pubYear": "2003"}, - "recordData": {"createdTime": "", - "loadFormat": "OtherXML", - "loadLocation": "", - "loadType": "fromFile", - "parsedTime": "", - "recordOrigin": ""}, - "title": {"textEnglish": "The Lovasz number of random graphs"}} +{ + "abstract": { + "textEnglish": "We study the Lovasz number theta along with two further SDP relaxations theta1, theta1/2 of the independence number and the corresponding relaxations of the chromatic number on random graphs G(n,p). We prove that these relaxations are concentrated about their means Moreover, extending a result of Juhasz, we compute the asymptotic value of the relaxations for essentially the entire range of edge probabilities p. As an application, we give an improved algorithm for approximating the independence number in polynomial expected time, thereby extending a result of Krivelevich and Vu. We also improve on the analysis of an algorithm of Krivelevich for deciding whether G(n,p) is k-colorable." 
+ }, + "authors": [ + { + "name": { + "given_name": "Amin", + "pubraw": "Coja-Oghlan, Amin", + "surname": "Coja-Oghlan" + } + } + ], + "keywords": [ + { + "keyString": "Mathematics - Combinatorics" + }, + { + "keyString": "05C80, 05C15" + } + ], + "pubDate": { + "electrDate": "2003-06-18" + }, + "publication": { + "pubYear": "2003" + }, + "publisherIDs": [ + { + "Identifier": "oai:arXiv.org:math/0306266", + "attribute": "publisher-id" + }, + { + "Identifier": "http://arxiv.org/abs/math/0306266", + "attribute": "publisher-id" + }, + { + "Identifier": "Combinatorics, Probability and Computing 14 (2005) 439 - 465", + "attribute": "publisher-id" + }, + { + "Identifier": "doi:10.1017/S0963548305006826", + "attribute": "publisher-id" + } + ], + "recordData": { + "createdTime": "", + "loadFormat": "OtherXML", + "loadLocation": "", + "loadType": "fromFile", + "parsedTime": "", + "recordOrigin": "" + }, + "title": { + "textEnglish": "The Lovasz number of random graphs" + } +} diff --git a/tests/stubdata/output/dubcore_pos_ecrs_002.json b/tests/stubdata/output/dubcore_pos_ecrs_002.json new file mode 100644 index 0000000..085a8c0 --- /dev/null +++ b/tests/stubdata/output/dubcore_pos_ecrs_002.json @@ -0,0 +1,54 @@ +{ + "abstract": { + "textEnglish": "The origins of the series of European Cosmic-Ray Symposia are briefly described. The first meeting in the seri es, on \u2018Hadronic Interactions and Extensive Air Showers\u2019, held in \u0141\u00f3d\u017a, Poland in 1968, was attended by the author: some memories are recounted." + }, + "authors": [ + { + "name": { + "given_name": "Alan", + "pubraw": "Alan Watson", + "surname": "Watson" + } + } + ], + "keywords": [ + { + "keyString": "Astroparticle Physics" + } + ], + "pubDate": { + "electrDate": "2023-02-15" + }, + "publication": { + "pubYear": "2023" + }, + "publisherIDs": [ + { + "Identifier": "oai:pos.sissa.it:ECRS/002", + "attribute": "publisher-id" + }, + { + "Identifier": "PoS(ECRS)002", + "attribute": "publisher-id" + }, + { + "Identifier": "10.22323/1.423.0002", + "attribute": "publisher-id" + }, + { + "Identifier": "https://pos.sissa.it/423/002/", + "attribute": "publisher-id" + } + ], + "recordData": { + "createdTime": "", + "loadFormat": "OtherXML", + "loadLocation": "", + "loadType": "fromFile", + "parsedTime": "", + "recordOrigin": "" + }, + "title": { + "textEnglish": "The Memories of the First European Cosmic Ray Symposium: \u0141\u00f3d\u017a 1968" + } +} diff --git a/tests/test_arxiv.py b/tests/test_arxiv.py deleted file mode 100644 index 313887c..0000000 --- a/tests/test_arxiv.py +++ /dev/null @@ -1,93 +0,0 @@ -import datetime -import json -import os -import unittest - -from adsingestschema import ads_schema_validator - -from adsingestp.parsers import arxiv - -TIMESTAMP_FMT = "%Y-%m-%dT%H:%M:%S.%fZ" - - -class TestArxiv(unittest.TestCase): - def setUp(self): - stubdata_dir = os.path.join(os.path.dirname(__file__), "stubdata/") - self.inputdir = os.path.join(stubdata_dir, "input") - self.outputdir = os.path.join(stubdata_dir, "output") - - def test_arxiv(self): - filenames = [ - "arxiv_1711_05739", - "arxiv_0901_2443", - "arxiv_1711_04702", - "arxiv_math_0306266", - ] - for f in filenames: - test_infile = os.path.join(self.inputdir, f + ".xml") - test_outfile = os.path.join(self.outputdir, f + ".json") - parser = arxiv.ArxivParser() - - with open(test_infile, "rb") as fp: - input_data = fp.read() - - with open(test_outfile, "rb") as fp: - output_text = fp.read() - output_data = json.loads(output_text) - - parsed = 
parser.parse(input_data) - - # make sure this is valid schema - try: - ads_schema_validator().validate(parsed) - except Exception: - self.fail("Schema validation failed") - pass - - # this field won't match the test data, so check and then discard - time_difference = ( - datetime.datetime.strptime(parsed["recordData"]["parsedTime"], TIMESTAMP_FMT) - - datetime.datetime.utcnow() - ) - self.assertTrue(abs(time_difference) < datetime.timedelta(seconds=10)) - parsed["recordData"]["parsedTime"] = "" - - self.assertEqual(parsed, output_data) - - -class TextArxivMulti(unittest.TestCase): - def setUp(self): - stubdata_dir = os.path.join(os.path.dirname(__file__), "stubdata/") - self.inputdir = os.path.join(stubdata_dir, "input") - self.outputdir = os.path.join(stubdata_dir, "output") - - def test_arxiv_multi(self): - filenames = [ - "arxiv_multi_20230125", - ] - - parser = arxiv.MultiArxivParser() - - for f in filenames: - test_infile = os.path.join(self.inputdir, f + ".xml") - test_outfile_header = os.path.join(self.outputdir, f + "_header.txt") - test_outfile_noheader = os.path.join(self.outputdir, f + "_noheader.txt") - - with open(test_infile, "r") as fp: - input_data = fp.read() - - with open(test_outfile_header, "r") as fp: - output_text = fp.read() - output_data_header = output_text.strip().split("\n\n") - - parsed = parser.parse(input_data, header=True) - - self.assertEqual(parsed, output_data_header) - - with open(test_outfile_noheader, "r") as fp: - output_text = fp.read() - output_data_noheader = output_text.strip().split("\n\n") - - parsed = parser.parse(input_data, header=False) - - self.assertEqual(parsed, output_data_noheader) diff --git a/tests/test_dublincore.py b/tests/test_dublincore.py index a266c25..59b1f81 100644 --- a/tests/test_dublincore.py +++ b/tests/test_dublincore.py @@ -18,6 +18,7 @@ def setUp(self): def test_dubcore(self): filenames = [ + "dubcore_pos_ecrs_002", "arxiv_1711_05739", "arxiv_0901_2443", "arxiv_1711_04702", @@ -53,3 +54,40 @@ def test_dubcore(self): parsed["recordData"]["parsedTime"] = "" self.assertEqual(parsed, output_data) + + +class TextDublinCoreMulti(unittest.TestCase): + def setUp(self): + stubdata_dir = os.path.join(os.path.dirname(__file__), "stubdata/") + self.inputdir = os.path.join(stubdata_dir, "input") + self.outputdir = os.path.join(stubdata_dir, "output") + + def test_dubcore_multi(self): + filenames = [ + "arxiv_multi_20230125", + ] + + parser = dubcore.MultiDublinCoreParser() + + for f in filenames: + test_infile = os.path.join(self.inputdir, f + ".xml") + test_outfile_header = os.path.join(self.outputdir, f + "_header.txt") + test_outfile_noheader = os.path.join(self.outputdir, f + "_noheader.txt") + + with open(test_infile, "r") as fp: + input_data = fp.read() + + with open(test_outfile_header, "r") as fp: + output_text = fp.read() + output_data_header = output_text.strip().split("\n\n") + + parsed = parser.parse(input_data, header=True) + self.assertEqual(parsed, output_data_header) + + with open(test_outfile_noheader, "r") as fp: + output_text = fp.read() + output_data_noheader = output_text.strip().split("\n\n") + + parsed = parser.parse(input_data, header=False) + + self.assertEqual(parsed, output_data_noheader) From fc887891e89c34555184c7eddca51c84e5f948ae Mon Sep 17 00:00:00 2001 From: Mugdha Polimera Date: Fri, 5 Jan 2024 12:54:28 -0500 Subject: [PATCH 3/3] removed identifier from header and added publisher field --- adsingestp/parsers/dubcore.py | 13 +++----- .../stubdata/input/dubcore_pos_ecrs_002.html | 33 
------------------- tests/stubdata/output/arxiv_0901_2443.json | 4 --- tests/stubdata/output/arxiv_1711_04702.json | 4 --- tests/stubdata/output/arxiv_1711_05739.json | 4 --- tests/stubdata/output/arxiv_math_0306266.json | 4 --- .../stubdata/output/dubcore_pos_ecrs_002.json | 7 ++-- 7 files changed, 7 insertions(+), 62 deletions(-) delete mode 100644 tests/stubdata/input/dubcore_pos_ecrs_002.html diff --git a/adsingestp/parsers/dubcore.py b/adsingestp/parsers/dubcore.py index 6f9a1c1..130946b 100644 --- a/adsingestp/parsers/dubcore.py +++ b/adsingestp/parsers/dubcore.py @@ -52,14 +52,6 @@ def _parse_ids(self): self.base_metadata["ids"] = {} self.base_metadata["ids"]["pub-id"] = [] - if self.input_header.find("identifier"): - self.base_metadata["ids"]["pub-id"].append( - { - "attribute": "publisher-id", - "Identifier": self.input_header.find("identifier").get_text(), - } - ) - if self.input_metadata.find("dc:identifier"): for dc_id in self.input_metadata.find_all("dc:identifier"): self.base_metadata["ids"]["pub-id"].append( @@ -101,6 +93,10 @@ def _parse_pubdate(self): "dc:date" ).get_text() + def _parse_publisher(self): + if self.input_metadata.find("dc:publisher"): + self.base_metadata["publisher"] = self.input_metadata.find("dc:publisher").get_text() + def _parse_abstract(self): desc_array = self.input_metadata.find_all("dc:description") # in general, only 'dc:description'[0] is the abstract, the rest are comments @@ -152,6 +148,7 @@ def parse(self, text): self._parse_pubdate() self._parse_abstract() self._parse_keywords() + self._parse_publisher() self.base_metadata = self._entity_convert(self.base_metadata) diff --git a/tests/stubdata/input/dubcore_pos_ecrs_002.html b/tests/stubdata/input/dubcore_pos_ecrs_002.html deleted file mode 100644 index 1d3bac4..0000000 --- a/tests/stubdata/input/dubcore_pos_ecrs_002.html +++ /dev/null @@ -1,33 +0,0 @@ - -
-<record xmlns="http://www.openarchives.org/OAI/2.0/">
-    <header>
-        <identifier>oai:pos.sissa.it:ECRS/002</identifier>
-        <datestamp>2023-02-15</datestamp>
-        <setSpec>conference:ECRS</setSpec>
-        <setSpec>group:14</setSpec>
-    </header>
-    <metadata>
-        <oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
-                   xmlns:dc="http://purl.org/dc/elements/1.1/"
-                   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-                   xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
-            <dc:title>The Memories of the First European Cosmic Ray Symposium: Łódź 1968</dc:title>
-            <dc:creator>Alan Watson</dc:creator>
-            <dc:subject>Astroparticle Physics</dc:subject>
-            <dc:description>The origins of the series of European Cosmic-Ray Symposia are briefly described. The first
-                meeting in the seri
-                es, on ‘Hadronic Interactions and Extensive Air Showers’, held in Łódź, Poland in 1968, was attended by
-                the author: some memories are recounted.</dc:description>
-            <dc:publisher>Sissa Medialab</dc:publisher>
-            <dc:date>2023-02-15</dc:date>
-            <dc:type>Text</dc:type>
-            <dc:format>application/pdf</dc:format>
-            <dc:identifier>PoS(ECRS)002</dc:identifier>
-            <dc:identifier>10.22323/1.423.0002</dc:identifier>
-            <dc:identifier>https://pos.sissa.it/423/002/</dc:identifier>
-            <dc:language>en</dc:language>
-            <dc:relation>ECRS (27th European Cosmic Ray Symposium) Opening; isPartOf</dc:relation>
-            <dc:rights>Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License (CC BY-NC-ND
-                4.0)</dc:rights>
-        </oai_dc:dc>
-    </metadata>
-</record>
diff --git a/tests/stubdata/output/arxiv_0901_2443.json b/tests/stubdata/output/arxiv_0901_2443.json index 4fc1e5e..b1a2700 100644 --- a/tests/stubdata/output/arxiv_0901_2443.json +++ b/tests/stubdata/output/arxiv_0901_2443.json @@ -85,10 +85,6 @@ "pubYear": "2009" }, "publisherIDs": [ - { - "Identifier": "oai:arXiv.org:0901.2443", - "attribute": "publisher-id" - }, { "Identifier": "http://arxiv.org/abs/0901.2443", "attribute": "publisher-id" diff --git a/tests/stubdata/output/arxiv_1711_04702.json b/tests/stubdata/output/arxiv_1711_04702.json index 7535aad..28b46d6 100644 --- a/tests/stubdata/output/arxiv_1711_04702.json +++ b/tests/stubdata/output/arxiv_1711_04702.json @@ -57,10 +57,6 @@ "pubYear": "2017" }, "publisherIDs": [ - { - "Identifier": "oai:arXiv.org:1711.04702", - "attribute": "publisher-id" - }, { "Identifier": "http://arxiv.org/abs/1711.04702", "attribute": "publisher-id" diff --git a/tests/stubdata/output/arxiv_1711_05739.json b/tests/stubdata/output/arxiv_1711_05739.json index 4476143..40ed568 100644 --- a/tests/stubdata/output/arxiv_1711_05739.json +++ b/tests/stubdata/output/arxiv_1711_05739.json @@ -42,10 +42,6 @@ "pubYear": "2017" }, "publisherIDs": [ - { - "Identifier": "oai:arXiv.org:1711.05739", - "attribute": "publisher-id" - }, { "Identifier": "http://arxiv.org/abs/1711.05739", "attribute": "publisher-id" diff --git a/tests/stubdata/output/arxiv_math_0306266.json b/tests/stubdata/output/arxiv_math_0306266.json index f39cec9..1a82f19 100644 --- a/tests/stubdata/output/arxiv_math_0306266.json +++ b/tests/stubdata/output/arxiv_math_0306266.json @@ -26,10 +26,6 @@ "pubYear": "2003" }, "publisherIDs": [ - { - "Identifier": "oai:arXiv.org:math/0306266", - "attribute": "publisher-id" - }, { "Identifier": "http://arxiv.org/abs/math/0306266", "attribute": "publisher-id" diff --git a/tests/stubdata/output/dubcore_pos_ecrs_002.json b/tests/stubdata/output/dubcore_pos_ecrs_002.json index 085a8c0..af968be 100644 --- a/tests/stubdata/output/dubcore_pos_ecrs_002.json +++ b/tests/stubdata/output/dubcore_pos_ecrs_002.json @@ -20,13 +20,10 @@ "electrDate": "2023-02-15" }, "publication": { - "pubYear": "2023" + "pubYear": "2023", + "publisher": "Sissa Medialab" }, "publisherIDs": [ - { - "Identifier": "oai:pos.sissa.it:ECRS/002", - "attribute": "publisher-id" - }, { "Identifier": "PoS(ECRS)002", "attribute": "publisher-id"