From 7b107537e0379993eb4b3bfadf89308f963f06e2 Mon Sep 17 00:00:00 2001 From: Tom Arnfeld Date: Sat, 28 Dec 2013 22:18:36 +0000 Subject: [PATCH 1/2] More detailed readme file. --- README.md | 44 +++++++++++++++++++++++++++++++++++++++++++- setup.py | 1 + 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 74b01bc..ab498b1 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,46 @@ python-lzo-indexer ![](https://travis-ci.org/duedil-ltd/python-lzo-indexer.png) -Python library for indexing block offsets within LZO compressed files. +Python library for indexing block offsets within LZO compressed files. The implementation is largely based on that of the [Hadoop Library](https://github.com/twitter/hadoop-lzo). Index files are used to allow Hadoop to split a single file compressed with LZO into several chunks for parallel processing. + +Since LZO is a block based compression algorithm, we can split the file along the lines of blocks and decompress each block on its own. The index is a file containing byte offsets for each block in the original LZO file. + + +Example +------- + +The python code below demonstrates how easy it is to index an LZO file. This library also supports indexing a string, and a method to return the individual block offsets should you need to create a file of your own format. + +```python +import lzo_indexer + +with open("my-file.lzo", "rb") as f: + with open("my-file.lzo.index", "wb") as index: + lzo_indexer.index_lzo_file(f, index) +``` + + +Command-line Utility +-------------------- + +This library also includes a utility for indexing multiple lzo files, using the python indexer. This is a much faster alternative to the command line utility built into the hadoop-lzo library as it avoids the JVM. + +``` +$ bin/lzo-indexer --help + +usage: lzo-indexer [-h] [--verbose] [--force] lzo_files [lzo_files ...] 
+ +positional arguments: + lzo_files List of LZO files to index + +optional arguments: + -h, --help show this help message and exit + --verbose, -v Enable verbose logging + --force, -f Force re-creation of an index even if it exsts +``` + + +Contributions +------------- + +I welcome any contributions, though I request that any pull requests come with test coverage. diff --git a/setup.py b/setup.py index 375558c..aa6d231 100644 --- a/setup.py +++ b/setup.py @@ -18,5 +18,6 @@ def read(filename): download_url="https://github.com/duedil-ltd/python-lzo-indexer/archive/release-0.0.1.zip", license=read("LICENSE"), packages=["lzo_indexer"], + scripts=["bin/lzo-indexer"], test_suite="tests.test_indexer", ) From da7a00bbe5a88c98bba2ab60264dd4434fc810e3 Mon Sep 17 00:00:00 2001 From: Tom Arnfeld Date: Sat, 28 Dec 2013 22:19:39 +0000 Subject: [PATCH 2/2] Fix typo --- README.md | 2 +- bin/lzo-indexer | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ab498b1..2c8322e 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ positional arguments: optional arguments: -h, --help show this help message and exit --verbose, -v Enable verbose logging - --force, -f Force re-creation of an index even if it exsts + --force, -f Force re-creation of an index even if it exists ``` diff --git a/bin/lzo-indexer b/bin/lzo-indexer index 3948245..614a94c 100755 --- a/bin/lzo-indexer +++ b/bin/lzo-indexer @@ -17,7 +17,7 @@ def parse_args(argv): parser.add_argument("--verbose", "-v", default=False, action="store_true", help="Enable verbose logging") parser.add_argument("--force", "-f", default=False, action="store_true", - help="Force re-creation of an index even if it exsts") + help="Force re-creation of an index even if it exists") parser.add_argument("lzo_files", type=str, nargs="+", help="List of LZO files to index")