-
Notifications
You must be signed in to change notification settings - Fork 98
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement Text Extraction in PyMuPdf Fitz Layout Mode #86
Comments
I just had a closer look at how a layout-preserving func Text() could be implemented in Go. A good starting point is the native C implementation of fz_new_buffer_from_stext_page(): fz_buffer *
fz_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *page)
{
fz_stext_block *block;
fz_stext_line *line;
fz_stext_char *ch;
fz_buffer *buf;
buf = fz_new_buffer(ctx, 256);
fz_try(ctx)
{
for (block = page->first_block; block; block = block->next)
{
if (block->type == FZ_STEXT_BLOCK_TEXT)
{
for (line = block->u.t.first_line; line; line = line->next)
{
for (ch = line->first_char; ch; ch = ch->next)
fz_append_rune(ctx, buf, ch->c);
fz_append_byte(ctx, buf, '\n');
}
fz_append_byte(ctx, buf, '\n');
}
}
}
fz_catch(ctx)
{
fz_drop_buffer(ctx, buf);
fz_rethrow(ctx);
}
return buf;
} Now looking at the crucial structs: /**
A text block is a list of lines of text (typically a paragraph),
or an image.
*/
struct fz_stext_block
{
int type; /* block kind; fz_new_buffer_from_stext_page compares it to FZ_STEXT_BLOCK_TEXT */
fz_rect bbox;
union {
/* line list; read by fz_new_buffer_from_stext_page when type == FZ_STEXT_BLOCK_TEXT */
struct { fz_stext_line *first_line, *last_line; } t;
/* image payload; presumably only meaningful for image blocks -- confirm against the MuPDF headers */
struct { fz_matrix transform; fz_image *image; } i;
} u;
fz_stext_block *prev, *next; /* sibling links; a page's blocks are walked via next */
};
/**
A text line is a list of characters that share a common baseline.
The characters are reached from first_char by following
fz_stext_char.next until it is NULL.
*/
struct fz_stext_line
{
int wmode; /* 0 for horizontal, 1 for vertical */
fz_point dir; /* normalized direction of baseline */
fz_rect bbox;
fz_stext_char *first_char, *last_char;
fz_stext_line *prev, *next; /* sibling links; a block's lines are walked via next */
};
/**
A text char is a unicode character, the style in which it
appears, and the point at which it is positioned.
*/
struct fz_stext_char
{
int c;
int color; /* sRGB hex color */
fz_point origin;
fz_quad quad;
float size;
fz_font *font;
fz_stext_char *next;
}; Those are not present in the go-fitz library (yet). The auto-generated go structs don't do the trick: type _Ctype_struct_fz_stext_block struct {
_type _Ctype_int
bbox _Ctype_struct___7
_ [4]byte
u [32]byte
prev *_Ctype_struct_fz_stext_block
next *_Ctype_struct_fz_stext_block
}
// _Ctype_struct_fz_stext_line is the cgo-generated counterpart of the C
// struct fz_stext_line. Its fields are unexported and its value-typed
// members use anonymous numbered struct types.
type _Ctype_struct_fz_stext_line struct {
wmode _Ctype_int
dir _Ctype_struct___28 // same generated type as fz_stext_char.origin (fz_point in C)
bbox _Ctype_struct___7 // same generated type as fz_stext_block.bbox (fz_rect in C)
first_char *_Ctype_struct_fz_stext_char
last_char *_Ctype_struct_fz_stext_char
prev *_Ctype_struct_fz_stext_line
next *_Ctype_struct_fz_stext_line
}
type _Ctype_struct_fz_stext_char struct {
c _Ctype_int
color _Ctype_int
origin _Ctype_struct___28
quad _Ctype_struct___29
size _Ctype_float
font *_Ctype_struct_fz_font
next *_Ctype_struct_fz_stext_char
} First step would be to include proper definitions for those structs within go-fitz. Any help is appreciated! |
Okay, got the start right... Structs: type fzRect struct {
X0, Y0 float32
X1, Y1 float32
}
// fzPoint is the Go-side mirror of the C fz_point: two consecutive
// float32 coordinates, X first and then Y.
type fzPoint struct {
	X float32
	Y float32
}
// fzQuad is the Go-side mirror of the C fz_quad: four corner points
// stored in the order Ul, Ur, Ll, Lr.
type fzQuad struct {
	Ul, Ur, Ll, Lr fzPoint
}
// Block kind tags compared against fzStextBlock.Type. The names and the
// values (text = 0, image = 1) follow the C constants they mirror, so the
// declaration order must not change.
const (
	FZ_STEXT_BLOCK_TEXT = iota
	FZ_STEXT_BLOCK_IMAGE
)
// fzStextBlock is a hand-written Go mirror of the C struct fz_stext_block.
// Values are obtained by casting a C pointer through unsafe.Pointer, so the
// field order and sizes must stay in step with the C definition.
type fzStextBlock struct {
Type int32 // block kind; compared against FZ_STEXT_BLOCK_TEXT
Bbox fzRect
// U stands in for the C union `u`; only the text variant is modelled.
U struct {
T struct {
FirstLine *fzStextLine
LastLine *fzStextLine
// Padding: with 8-byte pointers this brings T to 32 bytes, the size
// cgo reports for the union (`u [32]byte`).
_ [16]byte
}
// Image variant of the union, left disabled; fzMatrix and fzImage are
// not defined in this snippet.
// I struct {
// Transform fzMatrix
// Image *fzImage
// }
}
Prev *fzStextBlock
Next *fzStextBlock
}
// fzStextLine is the Go-side mirror of the C struct fz_stext_line. It is
// read through a pointer cast from C memory, so the fields must stay in
// the same order as the C definition.
type fzStextLine struct {
	Wmode               int32   // 0 for horizontal, 1 for vertical
	Dir                 fzPoint // normalized direction of the baseline
	Bbox                fzRect
	FirstChar, LastChar *fzStextChar
	Prev, Next          *fzStextLine
}
type fzStextChar struct {
C int32
Color int32
Origin fzPoint
Quad fzQuad
Size float32
Font unsafe.Pointer
Next *fzStextChar
} Now the call to fz_new_buffer_from_stext_page() from go-fitz Text() can simply be replaced by a go port of the original function: func (f *Document) Text(pageNumber int) (string, error) {
...
// buf := C.fz_new_buffer_from_stext_page(f.ctx, text)
// defer C.fz_drop_buffer(f.ctx, buf)
// str := C.GoString(C.fz_string_from_buffer(f.ctx, buf))
str := ""
block := (*fzStextBlock)(unsafe.Pointer(text.first_block))
for block != nil {
if block.Type == FZ_STEXT_BLOCK_TEXT {
line := block.U.T.FirstLine
for line != nil {
char := line.FirstChar
for char != nil {
str += string(rune(char.C))
char = char.Next
}
str += "\n"
line = line.Next
}
str += "\n"
}
block = block.Next
}
return str, nil
} We can go from here! :) |
@MarcoWel If you or someone else manages to implement this, I am willing to merge it. I have neither the time nor any plans to work on it myself. |
@gen2brain On it... |
Thank you for this excellent muPdf wrapper!
One feature that muPdf does not implement natively is layout-preserving plain text extraction.
pdftotext offers a -layout mode as standard: https://www.mankier.com/1/pdftotext
The PyMuPDF fitz command-line module uses layout mode by default: python -m fitz gettext input.pdf
https://pymupdf.readthedocs.io/en/latest/module.html#text-extraction
This is how the PyMuPdf fitz module does it:
https://github.com/pymupdf/PyMuPDF/blob/main/fitz/__main__.py#L577
When layout preservation is a must, there is currently no other way than invoking pdftotext from the Go app or, even nastier, calling the fitz Python module from Go.
How hard would it be to add this to go-fitz as well?
The text was updated successfully, but these errors were encountered: