From 706433c5049c63c6e16fee5f71d81a7e507abe06 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 10 Jun 2023 22:19:22 +0200 Subject: [PATCH 1/7] Revert "Merge pull request #97 from qurator-spk/420-namespace-package" This reverts commit fd56b86acf55677dc7a8bfb9e2737c3cc167327a, reversing changes made to ea792d1e4ac4a722770b82dc91e71f84d5beb212. --- qurator/__init__.py | 1 + setup.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/qurator/__init__.py b/qurator/__init__.py index e69de29..5284146 100644 --- a/qurator/__init__.py +++ b/qurator/__init__.py @@ -0,0 +1 @@ +__import__("pkg_resources").declare_namespace(__name__) diff --git a/setup.py b/setup.py index 807eae7..9abf158 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from setuptools import find_namespace_packages, find_packages, setup +from setuptools import setup, find_packages from json import load install_requires = open('requirements.txt').read().split('\n') @@ -13,6 +13,7 @@ author='Vahid Rezanezhad', url='https://github.com/qurator-spk/eynollah', license='Apache License 2.0', + namespace_packages=['qurator'], packages=find_packages(exclude=['tests']), install_requires=install_requires, package_data={ From dbabe0f180942ecbb4e5c52fb947b06b786bc7e4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 19 Jan 2024 16:17:02 +0000 Subject: [PATCH 2/7] adapt to ocrd>=2.54 url vs local_filename --- qurator/eynollah/processor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index ccec456..3375113 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -42,7 +42,11 @@ def process(self): page = pcgts.get_Page() # XXX loses DPI information # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename + if 
not('://' in page.imageFilename): + image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename + else: + # could be a URL with file:// or truly remote + image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename eynollah_kwargs = { 'dir_models': self.resolve_resource(self.parameter['models']), 'allow_enhancement': False, From 032a99ef11b19d8cb97566a2ce086878544991a0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 24 Jan 2024 19:33:49 +0100 Subject: [PATCH 3/7] adapt to OcrdFile.local_filename now :Path --- qurator/eynollah/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index 3375113..c89c1bd 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -58,7 +58,7 @@ def process(self): 'override_dpi': self.parameter['dpi'], 'logger': LOG, 'pcgts': pcgts, - 'image_filename': image_filename + 'image_filename': str(image_filename) } Eynollah(**eynollah_kwargs).run() file_id = make_file_id(input_file, self.output_file_grp) From a367620b0244a13341dbe11addc8669a14d959aa Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 23 May 2024 21:19:33 +0200 Subject: [PATCH 4/7] non-legacy namespace package --- qurator/__init__.py | 1 - qurator/eynollah/__init__.py | 1 - setup.py | 1 - 3 files changed, 3 deletions(-) delete mode 100644 qurator/__init__.py diff --git a/qurator/__init__.py b/qurator/__init__.py deleted file mode 100644 index 5284146..0000000 --- a/qurator/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__import__("pkg_resources").declare_namespace(__name__) diff --git a/qurator/eynollah/__init__.py b/qurator/eynollah/__init__.py index 8b13789..e69de29 100644 --- a/qurator/eynollah/__init__.py +++ b/qurator/eynollah/__init__.py @@ -1 +0,0 @@ - diff 
--git a/setup.py b/setup.py index 9abf158..c78ee3f 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,6 @@ author='Vahid Rezanezhad', url='https://github.com/qurator-spk/eynollah', license='Apache License 2.0', - namespace_packages=['qurator'], packages=find_packages(exclude=['tests']), install_requires=install_requires, package_data={ From 9644c8fcb79f78bba53e168b604c2fafc4b313e6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 24 May 2024 14:29:57 +0000 Subject: [PATCH 5/7] fix namespace pkg setup --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c78ee3f..af8a321 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup, find_packages +from setuptools import setup, find_namespace_packages from json import load install_requires = open('requirements.txt').read().split('\n') @@ -13,7 +13,7 @@ author='Vahid Rezanezhad', url='https://github.com/qurator-spk/eynollah', license='Apache License 2.0', - packages=find_packages(exclude=['tests']), + packages=find_namespace_packages(include=['qurator*']), install_requires=install_requires, package_data={ '': ['*.json'] From c52ef98821ca2301857d8091a01980827470037e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 11 Jun 2023 22:14:41 +0200 Subject: [PATCH 6/7] processor: reuse loaded models across pages, use derived images --- qurator/eynollah/processor.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/qurator/eynollah/processor.py b/qurator/eynollah/processor.py index c89c1bd..bf76295 100644 --- a/qurator/eynollah/processor.py +++ b/qurator/eynollah/processor.py @@ -33,6 +33,7 @@ def process(self): LOG = getLogger('eynollah') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) + models = None for n, input_file in enumerate(self.input_files): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %s (%d/%d) ", 
page_id, n + 1, len(self.input_files)) @@ -40,13 +41,18 @@ def process(self): LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight) self.add_metadata(pcgts) page = pcgts.get_Page() + # if not('://' in page.imageFilename): + # image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename + # else: + # # could be a URL with file:// or truly remote + # image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename # XXX loses DPI information - # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - if not('://' in page.imageFilename): - image_filename = next(self.workspace.mets.find_files(local_filename=page.imageFilename)).local_filename - else: - # could be a URL with file:// or truly remote - image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename + page_image, _, _ = self.workspace.image_from_page( + page, page_id, + # avoid any features that would change the coordinate system: cropped,deskewed + # (the PAGE builder merely adds regions, so afterwards we would not know which to transform) + # also avoid binarization as models usually fare better on grayscale/RGB + feature_filter='cropped,deskewed,binarized') eynollah_kwargs = { 'dir_models': self.resolve_resource(self.parameter['models']), 'allow_enhancement': False, @@ -58,9 +64,16 @@ def process(self): 'override_dpi': self.parameter['dpi'], 'logger': LOG, 'pcgts': pcgts, - 'image_filename': str(image_filename) + 'image_filename': page.imageFilename, + 'image_pil': page_image } - Eynollah(**eynollah_kwargs).run() + eynollah = Eynollah(**eynollah_kwargs) + if models is not None: + # reuse loaded models from previous page + eynollah.models = models + pcgts = eynollah.run() + # remember the loaded models so the next page can actually reuse them + models = getattr(eynollah, 'models', None) file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) 
self.workspace.add_file( From 60cf0bddfd35dd3fcd87b2e077b05bb170983977 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 28 May 2024 14:07:45 +0200 Subject: [PATCH 7/7] check_dpi: fix Pillow type detection --- qurator/eynollah/utils/pil_cv2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qurator/eynollah/utils/pil_cv2.py b/qurator/eynollah/utils/pil_cv2.py index 20dc22f..83ae47d 100644 --- a/qurator/eynollah/utils/pil_cv2.py +++ b/qurator/eynollah/utils/pil_cv2.py @@ -16,7 +16,7 @@ def pil2cv(img): def check_dpi(img): try: - if isinstance(img, Image.__class__): + if isinstance(img, Image.Image): pil_image = img elif isinstance(img, str): pil_image = Image.open(img)