Skip to content

Commit

Permalink
Merge pull request #35 from iscc/titusz/mobi-support
Browse files Browse the repository at this point in the history
Add mobi support
  • Loading branch information
titusz authored Mar 2, 2020
2 parents c4bf075 + a2dd3a5 commit e1ef508
Show file tree
Hide file tree
Showing 10 changed files with 200 additions and 41 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ This tool offers an easy way to generate ISCC codes from the command line. It su

#### Text

doc, docx, epub, html, odt, pdf, rtf, txt, xml, ibooks, md, xls ...
doc, docx, epub, html, odt, pdf, rtf, txt, xml, ibooks, md, xls, mobi ...


#### Image
Expand Down Expand Up @@ -199,6 +199,9 @@ You may also want to join our developer chat on Telegram at <https://t.me/iscc_dev>

## Change Log

### [0.9.4] - 2020-03-02
- Add experimental support for mobi files

### [0.9.3] - 2020-02-18
- Add support for XHTML
- Fix error on unsupported media types
Expand Down
3 changes: 2 additions & 1 deletion iscc_cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
import click
from tika import tika

__version__ = "0.9.3"
__version__ = "0.9.4"

APP_NAME = "iscc-cli"
APP_DIR = click.get_app_dir(APP_NAME, roaming=False)
os.makedirs(iscc_cli.APP_DIR, exist_ok=True)
os.environ["TIKA_PATH"] = APP_DIR
os.environ["LOGURU_AUTOINIT"] = "False"
tika.log.disabled = True
14 changes: 13 additions & 1 deletion iscc_cli/batch.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# -*- coding: utf-8 -*-
import shutil
from os.path import basename, abspath
import click
import mobi
from tika import detector, parser
import iscc

Expand Down Expand Up @@ -41,7 +43,17 @@ def batch(path, recursive, guess):
)
continue

tika_result = parser.from_file(f)
if media_type == "application/x-mobipocket-ebook":
try:
tempdir, epub_filepath = mobi.extract(f)
tika_result = parser.from_file(epub_filepath)
shutil.rmtree(tempdir)
except Exception as e:
click.echo("Error with mobi extraction %s" % f)
continue
else:
tika_result = parser.from_file(f)

title = get_title(tika_result, guess=guess)

mid, norm_title, _ = iscc.meta_id(title)
Expand Down
6 changes: 5 additions & 1 deletion iscc_cli/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,11 @@ class GMT:
"gmt": GMT.TEXT,
"ext": "xlsx",
},
"application/vnd.ms-excel": {"gmt": GMT.TEXT, "ext": "xls",},
"application/vnd.ms-excel": {"gmt": GMT.TEXT, "ext": "xls"},
"application/x-mobipocket-ebook": {
"gmt": GMT.TEXT,
"ext": ["mobi", "prc", "azw", "azw3", "azw4"],
},
# Image Formats
"image/bmp": {"gmt": GMT.IMAGE, "ext": "bmp"},
"image/gif": {"gmt": GMT.IMAGE, "ext": "gif"},
Expand Down
20 changes: 18 additions & 2 deletions iscc_cli/dump.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
# -*- coding: utf-8 -*-
import shutil

import click
import mobi
from click import UsageError
from tika import parser
from tika import parser, detector
from iscc_cli.utils import DefaultHelp
from iscc_cli.const import SUPPORTED_MIME_TYPES
import json


Expand All @@ -15,7 +19,19 @@
@click.option("-c", "--content", is_flag=True, default=False, help="Dump content only.")
def dump(file, strip, meta, content):
"""Dump Tika extraction results for FILE."""
tika_result = parser.from_file(file.name)

media_type = detector.from_file(file.name)
if media_type not in SUPPORTED_MIME_TYPES:
click.echo("Unsupported media type {}.".format(media_type))
click.echo("Please request support at https://github.com/iscc/iscc-cli/issues")

if media_type == "application/x-mobipocket-ebook":
tempdir, epub_filepath = mobi.extract(file.name)
tika_result = parser.from_file(epub_filepath)
shutil.rmtree(tempdir)
else:
tika_result = parser.from_file(file.name)

if all([meta, content]):
raise UsageError("Use either --meta or --content for selective output.")

Expand Down
10 changes: 9 additions & 1 deletion iscc_cli/gen.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# -*- coding: utf-8 -*-
import shutil
from os.path import abspath

import click
import iscc
import mobi
from tika import detector, parser

from iscc_cli import audio_id, video_id, fpcalc
Expand Down Expand Up @@ -31,7 +33,13 @@ def gen(file, guess, title, extra, verbose):
click.echo("Unsupported media type {}.".format(media_type))
click.echo("Please request support at https://github.com/iscc/iscc-cli/issues")

tika_result = parser.from_file(file.name)
if media_type == "application/x-mobipocket-ebook":
tempdir, epub_filepath = mobi.extract(file.name)
tika_result = parser.from_file(epub_filepath)
shutil.rmtree(tempdir)
else:
tika_result = parser.from_file(file.name)

if not title:
title = get_title(tika_result, guess=guess)

Expand Down
Loading

0 comments on commit e1ef508

Please sign in to comment.