Skip to content

Commit

Permalink
Feature/windows compat (#11)
Browse files Browse the repository at this point in the history
* add windows impl

* fix gnu specific flag --parents

* CI: fix shell directive

* CI: add current go version, add wget on Windows

* CI: fix windows

* CI: fix windows; use curl in downloader script if available

* add lib extension to extracted file to make it work on windows

* reenable CI for all platforms

* Close extracted lib before deleting

* fix wget param in downloader script

* update README
  • Loading branch information
johbar authored Jan 13, 2025
1 parent f675cdc commit 38346a7
Show file tree
Hide file tree
Showing 16 changed files with 119 additions and 29 deletions.
28 changes: 20 additions & 8 deletions .github/workflows/build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,30 @@ on:
branches: [ "main" ]

jobs:
# Windows job: downloads libpdfium, builds the service with the embed_pdfium tag
# and runs the binary once against a sample PDF.
build-embedded-on-windows:
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: '1.23'
- name: Download libpdfium
# The step runs a .sh script, so bash is requested explicitly.
shell: bash
run: pkg/pdflibwrappers/pdfium_purego/download-pdfium.sh
- name: Build service
run: go build -tags nomsgpack,embed_pdfium
- name: Run basic test - embedded pdfium
# `time` is used as a shell keyword here, hence bash again.
shell: bash
run: time ./text-extraction-service pkg/pdflibwrappers/testdata/2000001.pdf
build-on-ubuntu:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
go-version: '1.23'
- name: golangci-lint
uses: golangci/golangci-lint-action@v6
with:
Expand All @@ -41,7 +57,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
go-version: '1.23'
- name: Download libpdfium
run: pkg/pdflibwrappers/pdfium_purego/download-pdfium.sh
- name: Build service
Expand All @@ -55,7 +71,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
go-version: '1.23'
- name: Download libpdfium
run: pkg/pdflibwrappers/pdfium_purego/download-pdfium.sh
- name: Build service
Expand All @@ -70,13 +86,9 @@ jobs:
run: time ./text-extraction-service pkg/pdflibwrappers/testdata/2000001.pdf
build-embedded-on-alpine:
runs-on: "ubuntu-latest"
container: alpine:3.21
container: "golang:alpine"
steps:
- uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
- name: Install some tools
run: apk update && apk add --no-cache file binutils
- name: Download libpdfium
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
go-version: '1.23'
- name: Install cross-compiling tools
run: sudo apt-get install -y -q binutils-aarch64-linux-gnu
- name: Compile TES for Linux, arm64 and amd64
Expand Down Expand Up @@ -60,7 +60,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
go-version: '1.23'
- name: Build for Darwin on arm64 and amd64
run: |
mkdir dist
Expand Down
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ Apache [Tika](https://tika.apache.org/) is definitively a more versatile and mat
- Processing password protected files
- Processing files from web servers that require authentication of any kind (cookie, header, referral, user agent etc)
- A lot of common document formats, including odt, docx, html, xml
- Running on MS Windows

## License

Expand Down Expand Up @@ -92,7 +91,6 @@ But testing and running TES requires additional shared libs.
Depending on the PDF engine you choose (see below for comparison) you need it installed in your dev/build environment.

All instructions supplied here assume a Linux environment.
TES is only tested on Linux, might work on MacOS/Darwin as well but needs modifications to be compatible with Windows.

### PDFium

Expand Down
4 changes: 4 additions & 0 deletions internal/pdfproc/pdf_images.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ var pdfConf *model.Configuration

// init builds the package-wide pdfcpu configuration used by this package.
// It starts from pdfcpu's defaults and switches off optimization, resource
// dictionary optimization and link validation, and turns on offline mode.
func init() {
	conf := model.NewDefaultConfiguration()
	// NOTE(review): Offline presumably keeps pdfcpu from making network
	// requests — confirm against the pdfcpu documentation.
	conf.Offline = true
	conf.ValidateLinks = false
	conf.OptimizeResourceDicts = false
	conf.Optimize = false
	pdfConf = conf
}

// ProcessImages applies readFunc to every image found on the page with the specified zero-based page number
Expand Down
9 changes: 4 additions & 5 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"net/http"
"os"
"os/signal"
"runtime/debug"

"github.com/gin-contrib/expvar"
"github.com/gin-gonic/gin"
Expand All @@ -26,7 +25,7 @@ var (

func main() {
tesConfig = NewTesConfigFromEnv()
logger = slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: tesConfig.logLevel}))
logger = slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{Level: tesConfig.logLevel}))
// set static/global config of submodules
tesswrap.Languages = tesConfig.TesseractLangs
dehyphenator.RemoveNewlines = tesConfig.RemoveNewlines
Expand All @@ -47,17 +46,17 @@ func main() {
}
// one shot mode: don't start a server, just process a single file provided on the command line
if len(os.Args) > 1 {
debug.SetGCPercent(-1)
// debug.SetGCPercent(-1)
logger = slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelWarn}))
LogAndFixConfigIssues()
PrintMetadataAndTextToStdout(os.Args[1])
return
}
if tesConfig.Debug {
logger = slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug}))
logger = slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelDebug}))
// this might expose passwords in the log...
logger.Debug("Starting with config", "conf", tesConfig)
}
logger.Debug("Starting Text Extraction Service with config", "conf", tesConfig)
LogAndFixConfigIssues()
postprocessDocChan = make(chan *ExtractedDocument, 100)
go saveAndCloseExtracedDocs()
Expand Down
2 changes: 2 additions & 0 deletions misc.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/johbar/text-extraction-service/v2/internal/pdfproc"
"github.com/johbar/text-extraction-service/v2/pkg/dehyphenator"
"github.com/johbar/text-extraction-service/v2/pkg/docparser"
"github.com/johbar/text-extraction-service/v2/pkg/pdflibwrappers"
"github.com/johbar/text-extraction-service/v2/pkg/tesswrap"
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
)
Expand Down Expand Up @@ -164,6 +165,7 @@ func LogAndFixConfigIssues() {
}

func deleteExtractedLib() {
pdflibwrappers.CloseLib()
err := os.Remove(pdfImpl.LibPath)
if err != nil {
logger.Warn("Could not delete libpdfium in temp dir", "path", pdfImpl.LibPath)
Expand Down
6 changes: 3 additions & 3 deletions pdfimpl.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ func LoadPdfLib(libName string, libPath string) error {
libPath, err := pdfium.InitLib(libPath)
if err == nil {
pdfImpl = pdfImplementation{libShort: "pdfium", LibDescription: "PDFium", LibPath: libPath}
} else {
var err2 error
libPath, err2 = pdfium.ExtractLibpdfium()
} else {
var err2 error
libPath, err2 = pdfium.ExtractLibpdfium()
if err2 != nil {
return errors.Join(err, err2)
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/pdflibwrappers/misc.go → pkg/pdflibwrappers/loadlib.go
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//go:build linux || darwin

package pdflibwrappers

import (
Expand All @@ -6,6 +8,8 @@ import (
"github.com/ebitengine/purego"
)

// CloseLib is a hook for releasing the loaded library. In this file it is
// initialised to a no-op; deleteExtractedLib calls it before removing the
// extracted library file.
var CloseLib = func() {}

// TryLoadLib tries to load a shared object/dynamically linked library
// from various paths and returns a handle or 0 and an error.
func TryLoadLib(paths ...string) (uintptr, string, error) {
Expand Down
26 changes: 26 additions & 0 deletions pkg/pdflibwrappers/loadlib_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package pdflibwrappers

import (
"errors"
"syscall"
)

// CloseLib frees the library loaded by TryLoadLib. It starts out as a no-op
// and is replaced by TryLoadLib with a function that calls FreeLibrary on the
// handle it obtained.
var CloseLib = func() {}

// TryLoadLib tries to load a shared object/dynamically linked library
// from various paths and returns a handle or 0 and an error.
func TryLoadLib(paths ...string) (uintptr, string, error) {
var lib syscall.Handle
var liberr, err error
for _, path := range paths {
lib, liberr = syscall.LoadLibrary(path)
err = errors.Join(liberr, err)
if lib != 0 {
CloseLib = func() {
syscall.FreeLibrary(syscall.Handle(lib))
}
return uintptr(lib), path, nil
}
}
return 0, "", err
}
33 changes: 26 additions & 7 deletions pkg/pdflibwrappers/pdfium_purego/download-pdfium.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ set -o errexit
# Target platform as reported by the Go toolchain.
goos=$(go env GOOS)
arch=$(go env GOARCH)
linux_arch="${arch}"
# default for *nix: directory and base name of the library inside the archive
path_in_tar='lib'
name_in_tar='libpdfium'

# Resolve this script's own location; "$0" is quoted so that a path containing
# spaces is not split into several arguments.
my_path=$(readlink -f "$0")
my_dir=$(dirname "${my_path}")
Expand Down Expand Up @@ -46,15 +49,22 @@ case ${goos} in
os='mac'
ext='dylib'
;;
'windows')
os='win'
# overwrite for windows:
path_in_tar='bin'
name_in_tar='pdfium'
ext='dll'
;;
*)
# Report the GOOS value that failed to match. ${os} is assigned by the
# matching arms above, so it carries no useful value on this path.
printf "not a supported OS: %s\n" "${goos}" >&2
exit 1;
;;
esac

try_to_strip () {
du -h "${lib_path}"
printf "Trying to strip...\n"
du -h "${1}"
if test "${os}" = 'mac' && strip -S -x "${1}"; then
du -h "${1}"
fi
Expand All @@ -66,15 +76,24 @@ try_to_strip () {
fi
}

# download URL
# Writes the resource at URL ($1) to stdout, using curl when it is installed
# and falling back to wget otherwise.
download () {
  # `command -v` is the POSIX way to probe for a program. The redirections are
  # ordered so that both stdout and stderr are discarded.
  if command -v curl >/dev/null 2>&1; then
    # --fail: exit non-zero on HTTP errors instead of streaming the error page
    # into the consumer of this function's output.
    curl --fail -sS --location "$1"
  else
    wget -q -O - "$1"
  fi
}

url="https://github.com/bblanchon/pdfium-binaries/releases/latest/download/pdfium-${os}-${arch}.tgz"

printf "Downloading %s\n" "${url}"
lib_path="lib/libpdfium.${ext}"
local_name="${name_in_tar}.${ext}"
(
cd "${my_dir}"
wget -q -O - "${url}" | tar -xz "${lib_path}"
printf "Extracted lib to %s/lib/libpdfium.%s\n" "${my_dir}" "${ext}"
file "${lib_path}" || true
try_to_strip "${lib_path}"
mkdir -p "${my_dir}/lib"
cd "${my_dir}/lib"
download "${url}" | tar -xz --strip-components 1 "${path_in_tar}/${local_name}"
# The archive member was extracted into ${my_dir}/lib (the current directory)
# under the name ${local_name}.
printf "Extracted lib to %s\n" "${my_dir}/lib/${local_name}"
file "${local_name}" || true
try_to_strip "${local_name}"
printf "Done.\n"
)
2 changes: 1 addition & 1 deletion pkg/pdflibwrappers/pdfium_purego/embed_pdfium.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ func ExtractLibpdfium() (string, error) {
if len(pdfiumBlob) == 0 {
return "", errors.New("extraction of libpdfium has been requested, but it is not embedded in this build")
}
f, err := os.CreateTemp("", "libpdfium")
f, err := os.CreateTemp("", "libpdfium*" + libExtension)
if err != nil {
return "", err
}
Expand Down
12 changes: 12 additions & 0 deletions pkg/pdflibwrappers/pdfium_purego/embed_pdfium_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
//go:build embed_pdfium

package pdfium_purego

import (
_ "embed"
)

var (
// pdfiumBlob holds the raw bytes of lib/pdfium.dll, embedded at compile time.
// This file is only built with the embed_pdfium build tag. ExtractLibpdfium
// treats an empty blob as "library not embedded in this build".
//
//go:embed lib/pdfium.dll
pdfiumBlob []byte
)
5 changes: 5 additions & 0 deletions pkg/pdflibwrappers/pdfium_purego/lib_darwin.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package pdfium_purego

// libExtension is the shared library file extension on Darwin. It is appended
// to the temp file name pattern when the embedded library is extracted.
const libExtension string = ".dylib"

// defaultLibNames lists the PDFium library names known for Darwin.
// NOTE(review): presumably these are tried when no explicit library path is
// configured — confirm against the loader.
var defaultLibNames = []string{
	"libpdfium.dylib",
}
5 changes: 5 additions & 0 deletions pkg/pdflibwrappers/pdfium_purego/lib_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package pdfium_purego

// libExtension is the shared library file extension on Linux. It is appended
// to the temp file name pattern when the embedded library is extracted.
const libExtension string = ".so"

// defaultLibNames lists the PDFium library names and paths known for Linux.
// The second entry points into LibreOffice's program directory.
// NOTE(review): presumably these are tried when no explicit library path is
// configured — confirm against the loader.
var defaultLibNames = []string{
	"libpdfium.so",
	"/usr/lib/libreoffice/program/libpdfiumlo.so",
}
5 changes: 5 additions & 0 deletions pkg/pdflibwrappers/pdfium_purego/lib_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package pdfium_purego

// libExtension is the shared library file extension on Windows. It is appended
// to the temp file name pattern when the embedded library is extracted.
const libExtension string = ".dll"

// defaultLibNames lists the PDFium library names known for Windows.
// NOTE(review): presumably these are tried when no explicit library path is
// configured — confirm against the loader.
var defaultLibNames = []string{
	"pdfium.dll",
}
1 change: 0 additions & 1 deletion pkg/pdflibwrappers/pdfium_purego/pdfium.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ var (

// PDFium is not thread-safe. This lock guards the lib against concurrent access in places where this is known to be necessary
Lock sync.Mutex
defaultLibNames = []string{"libpdfium.so", "/usr/lib/libreoffice/program/libpdfiumlo.so", "libpdfium.dylib"}
)

type Document struct {
Expand Down

0 comments on commit 38346a7

Please sign in to comment.