diff --git a/fuji_server/client/ex_evaluate.py b/fuji_server/client/ex_evaluate.py index 0c387651..f8f62f2d 100644 --- a/fuji_server/client/ex_evaluate.py +++ b/fuji_server/client/ex_evaluate.py @@ -19,7 +19,7 @@ debug = True muchotestpids=[ - '10.15493/DEFF.10000003','https://phaidra.cab.unipd.it/view/o:267291', + '10.15493/DEFF.10000003','https://phaidra.cab.unipd.it/view/o:267291','https://jyx.jyu.fi/handle/123456789/39205', 'doi:10.1038/nphys1170','doi:10.17882/42182','https://deims.org/sites/default/files/data/elter_va_fruska_gora_temperature_0.xls', '10.25504/FAIRsharing.2bdvmk','http://bio2rdf.org/affymetrix:1415765_at','doi:10.18129/B9.bioc.BiocGenerics', 'https://data.noaa.gov/dataset/dataset/w00411-nos-hydrographic-survey-2015-08-15','10.6075/J0513WJD','10.7280/D1P075', @@ -146,13 +146,15 @@ #testpids=['http://doi.org/10.1007/s10531-013-0468-6'] #rdf #testpids=['http://tun.fi/JX.1099769'] -testpids=['https://ortus.rtu.lv/science/en/datamodule/3'] +#testpids=['https://ortus.rtu.lv/science/en/datamodule/3'] #rdf #testpids=['https://databank.ora.ox.ac.uk/UniversityCollege/datasets/04156fde-dabb-48fd-baf6-533182f74b5b'] #testpids=['https://data.gov.lv/dati/lv/dataset/maksatnespejas-procesi'] -testpids=['http://doi.org/10.17882/42182'] -#testpids = muchotestpids -testpids =['https://datadoi.ee/handle/33/48'] +#testpids=['http://doi.org/10.17882/42182'] +testpids = muchotestpids +#testpids =['https://repo.clarino.uib.no/xmlui/handle/11509/103'] +#testpids=['https://data.aussda.at/dataset.xhtml?persistentId=doi:10.11587/QQ7HTL'] +testpids =['https://www.proteinatlas.org/ENSG00000180739-S1PR5/tissue/primary+data'] startpid='' def effectivehandlers(logger): handlers = logger.handlers @@ -195,19 +197,25 @@ def main(): start=False usedatacite = True tracemalloc.start() + n=1 for identifier in testpids: print (identifier) + print(n) + n+=1 if identifier==startpid or not startpid: start=True if start: ft = FAIRCheck(uid=identifier, test_debug=True, use_datacite=usedatacite) - # print(effectivehandlers(ft.logger)) uid_result, pid_result = ft.check_unique_persistent() - core_metadata_result = ft.check_minimal_metatadata() + ft.retrieve_metadata_embedded(ft.extruct_result) + include_embedded= True if ft.repeat_pid_check: uid_result, pid_result = ft.check_unique_persistent() + ft.retrieve_metadata_external() + + core_metadata_result = ft.check_minimal_metatadata() content_identifier_included_result = ft.check_content_identifier_included() access_level_result=ft.check_data_access_level() license_result = ft.check_license() diff --git a/fuji_server/controllers/fair_check.py b/fuji_server/controllers/fair_check.py index b6ebccfb..a0c98f5e 100644 --- a/fuji_server/controllers/fair_check.py +++ b/fuji_server/controllers/fair_check.py @@ -95,13 +95,13 @@ class FAIRCheck: FILES_LIMIT = None LOG_SUCCESS = 25 VALID_RESOURCE_TYPES = [] - FUJI_VERSION = 'v1.0.5d' + FUJI_VERSION = 'v1.0.6' def __init__(self, uid, test_debug=False, oaipmh=None, use_datacite=True): uid_bytes = uid.encode('utf-8') self.test_id = hashlib.sha1(uid_bytes).hexdigest() #str(base64.urlsafe_b64encode(uid_bytes), "utf-8") # an id we can use for caching etc - self.id = uid + self.id = self.input_id = uid self.oaipmh_endpoint = oaipmh self.pid_url = None # full pid # e.g., "https://doi.org/10.1594/pangaea.906092 or url (non-pid) self.landing_url = None # url of the landing page of self.pid_url @@ -141,7 +141,7 @@ def __init__(self, uid, test_debug=False, oaipmh=None, use_datacite=True): self.embedded_retrieved = False FAIRCheck.load_predata() self.extruct = None - self.extruct_result = None + self.extruct_result = {} self.tika_content_types_list = [] @@ -199,9 +199,10 @@ def retrieve_metadata(self, extruct_metadata): else: self.logger.warning('FsF-F2-01M : NO structured metadata embedded in HTML') ''' - if self.reference_elements: # this will be always true as we need datacite client id - self.retrieve_metadata_embedded(embedded_exists) - self.retrieve_metadata_external() + #if self.reference_elements: # this will be always true as we need datacite client id + # if include_embedded ==True: + # self.retrieve_metadata_embedded(embedded_exists) + # self.retrieve_metadata_external() # ========= clean merged metadata, delete all entries which are None or '' data_objects = self.metadata_merged.get('object_content_identifier') @@ -268,7 +269,7 @@ def retrieve_apis_standards(self): else: self.logger.warning('{} : Skipped external ressources (OAI, re3data) checks since landing page could not be resolved'.format('FsF-R1.3-01M')) - def retrieve_metadata_embedded(self, extruct_metadata): + def retrieve_metadata_embedded(self, extruct_metadata ={}): isPid = False if self.pid_scheme: isPid = True @@ -611,6 +612,8 @@ def retrieve_metadata_external(self): if typed_metadata_links is not None: typed_rdf_collector = None + #unique entries for typed links + typed_metadata_links = [dict(t) for t in {tuple(d.items()) for d in typed_metadata_links}] for metadata_link in typed_metadata_links: if metadata_link['type'] in ['application/rdf+xml','text/n3','text/ttl','application/ld+json']: self.logger.info('FsF-F2-01M : Found e.g. Typed Links in HTML Header linking to RDF Metadata -: ('+str(metadata_link['type'])+' '+str(metadata_link['url'])+')') @@ -682,7 +685,7 @@ def check_persistent_identifier(self): def check_unique_persistent(self): return self.check_unique_identifier(), self.check_persistent_identifier() - def check_minimal_metatadata(self): + def check_minimal_metatadata(self,include_embedded = True): core_metadata_check = FAIREvaluatorCoreMetadata(self) core_metadata_check.set_metric('FsF-F2-01M', metrics=FAIRCheck.METRICS) return core_metadata_check.getResult() diff --git a/fuji_server/controllers/fair_object_controller.py b/fuji_server/controllers/fair_object_controller.py index e48cf0a8..59266fd5 100644 --- a/fuji_server/controllers/fair_object_controller.py +++ b/fuji_server/controllers/fair_object_controller.py @@ -51,9 +51,13 @@ def assess_by_id(body): # noqa: E501 ft = FAIRCheck(uid=identifier, test_debug=debug, oaipmh=oai, use_datacite=usedatacite) uid_result, pid_result = ft.check_unique_persistent() - core_metadata_result = ft.check_minimal_metatadata() + ft.retrieve_metadata_embedded(ft.extruct_result) + include_embedded = True if ft.repeat_pid_check: uid_result, pid_result = ft.check_unique_persistent() + ft.retrieve_metadata_external() + + core_metadata_result = ft.check_minimal_metatadata() content_identifier_included_result = ft.check_content_identifier_included() access_level_result = ft.check_data_access_level() license_result = ft.check_license() diff --git a/fuji_server/evaluators/fair_evaluator_minimal_metadata.py b/fuji_server/evaluators/fair_evaluator_minimal_metadata.py index cf6a28b3..f0cc1eba 100644 --- a/fuji_server/evaluators/fair_evaluator_minimal_metadata.py +++ b/fuji_server/evaluators/fair_evaluator_minimal_metadata.py @@ -27,6 +27,7 @@ from fuji_server.helper.metadata_mapper import Mapper class FAIREvaluatorCoreMetadata(FAIREvaluator): + def evaluate(self): if self.fuji.landing_url is None: self.logger.warning('FsF-F2-01M : Metadata checks probably unreliable: landing page URL could not be determined') diff --git a/fuji_server/evaluators/fair_evaluator_persistent_identifier.py b/fuji_server/evaluators/fair_evaluator_persistent_identifier.py index c94baa4f..0bc76b96 100644 --- a/fuji_server/evaluators/fair_evaluator_persistent_identifier.py +++ b/fuji_server/evaluators/fair_evaluator_persistent_identifier.py @@ -32,7 +32,6 @@ class FAIREvaluatorPersistentIdentifier(FAIREvaluator): def evaluate(self): - self.result = Persistence(id=self.metric_number, metric_identifier=self.metric_identifier, metric_name=self.metric_name) self.output = PersistenceOutput() # ======= CHECK IDENTIFIER PERSISTENCE ======= @@ -50,10 +49,17 @@ def evaluate(self): requestHelper = RequestHelper(check_url, self.logger) requestHelper.setAcceptType(AcceptTypes.html) # request neg_source, self.fuji.extruct_result = requestHelper.content_negotiate('FsF-F1-02D', ignore_html = False) + if type(self.fuji.extruct_result) != dict: + self.fuji.extruct_result ={} r = requestHelper.getHTTPResponse() if r: self.fuji.landing_url = requestHelper.redirect_url + #in case the test has been repeated because a PID has been found in metadata + if self.fuji.repeat_pid_check == True: + if self.fuji.landing_url != self.fuji.input_id: + self.logger.warning('FsF-F1-02D : Landing page URL resolved from PID found in metadata does not match with input URL') + if r.status == 200: # identify signposting links in header header_link_string = requestHelper.getHTTPResponse().getheader('Link') @@ -64,6 +70,7 @@ def evaluate(self): found_link = None found_type, type_match = None, None found_rel, rel_match = None, None + found_formats, formats_match = None, None parsed_link = preparsed_link.strip().split(';') found_link = parsed_link[0].strip() for link_prop in parsed_link[1:]: @@ -71,11 +78,15 @@ def evaluate(self): rel_match = re.search('rel=\"(.*?)\"', link_prop) elif str(link_prop).startswith('type="'): type_match = re.search('type=\"(.*?)\"', link_prop) + elif str(link_prop).startswith('formats="'): + formats_match = re.search('formats=\"(.*?)\"', link_prop) if type_match: found_type = type_match[1] if rel_match: found_rel = rel_match[1] - signposting_link_dict = {'url': found_link[1:-1], 'type': found_type, 'rel': found_rel} + if formats_match: + found_formats = formats_match[1] + signposting_link_dict = {'url': found_link[1:-1], 'type': found_type, 'rel': found_rel, 'profile':found_formats} if found_link: self.fuji.signposting_header_links.append(signposting_link_dict)