Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Talk: JSON parsing with SIMD #32

Open
newhoggy opened this issue Mar 14, 2019 · 8 comments
Open

Talk: JSON parsing with SIMD #32

newhoggy opened this issue Mar 14, 2019 · 8 comments

Comments

@newhoggy
Copy link
Member

newhoggy commented Mar 14, 2019

No description provided.

@newhoggy
Copy link
Member Author

newhoggy commented Mar 15, 2019

State machine based lookup tables

uint32_t hw_json_simd_phi_table_32[] =
{ 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000007, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000007, 0x00000000, 0x00000007, 0x00000007, 0x00000000
, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007
, 0x00000007, 0x00000007, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007
, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007
, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007
, 0x00000007, 0x00000007, 0x00000007, 0x06000006, 0x00000000, 0x01000001, 0x00000000, 0x00000000
, 0x00000000, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007
, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007
, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007
, 0x00000007, 0x00000007, 0x00000007, 0x06000006, 0x00000000, 0x01000001, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
};

uint32_t hw_json_simd_transition_table_32[] =
{ 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010001, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x03010103, 0x00010100, 0x03010103, 0x03010103, 0x00010100
, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103
, 0x03010103, 0x03010103, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103
, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103
, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103
, 0x03010103, 0x03010103, 0x03010103, 0x00010100, 0x00010200, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103
, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103
, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103, 0x03010103
, 0x03010103, 0x03010103, 0x03010103, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100, 0x00010100
};

@newhoggy newhoggy changed the title Annotated Source Code JSON parsing with SIMD Mar 15, 2019
@newhoggy
Copy link
Member Author

newhoggy commented Mar 15, 2019

Comparison with pure C code

hw-json-simd/cbits/main.c

Lines 81 to 163 in fe19db8

uint8_t buffer[W8_BUFFER_SIZE];
uint32_t phi_buffer[W8_BUFFER_SIZE];
uint8_t ibs_buffer[W8_BUFFER_SIZE];
uint8_t ops_buffer[W8_BUFFER_SIZE];
uint8_t cls_buffer[W8_BUFFER_SIZE];
// uint32_t result_ib[W8_BUFFER_SIZE];
// uint32_t result_a [W8_BUFFER_SIZE];
// uint32_t result_z [W8_BUFFER_SIZE];
// uint64_t accum = 0;
uint64_t remaining_bp_bits = 0;
size_t remaining_bp_bits_len = 0;
uint64_t out_bp_buffer[W32_BUFFER_SIZE * 2];
size_t total_bytes_read = 0;
uint32_t state = 0x03020100;
while (1) {
size_t bytes_read = fread(buffer, 1, W8_BUFFER_SIZE, in);
total_bytes_read += bytes_read;
if (bytes_read < W8_BUFFER_SIZE) {
if (ferror(in)) {
fprintf(stderr, "Error reading file\n");
exit(1);
}
if (bytes_read == 0) {
if (feof(in)) {
break;
}
}
size_t next_alignment = ((bytes_read + 63) / 64) * 64;
memset(buffer + bytes_read, 0, next_alignment - bytes_read);
bytes_read = next_alignment;
}
uint32_t chunk_state = state;
hw_json_simd_sm_process_chunk(buffer, bytes_read,
&state,
phi_buffer);
hw_json_simd_sm_make_ib_op_cl_chunks(chunk_state, phi_buffer, bytes_read,
ibs_buffer,
ops_buffer,
cls_buffer);
size_t idx_bytes = (bytes_read + 7) / 8;
fwrite(ibs_buffer, 1, idx_bytes, ib_out);
size_t out_bp_bytes = hw_json_simd_sm_write_bp_chunk(
ops_buffer,
cls_buffer,
idx_bytes,
&remaining_bp_bits,
&remaining_bp_bits_len,
out_bp_buffer);
fwrite(out_bp_buffer, out_bp_bytes, sizeof(uint64_t), bp_out);
fflush(ib_out);
fflush(bp_out);
}
hw_json_simd_sm_write_bp_chunk_final(remaining_bp_bits, remaining_bp_bits_len, out_bp_buffer);
fprintf(stderr, "Final state %u\n", state);
fwrite(out_bp_buffer, 2, sizeof(uint64_t), bp_out);
fclose(in);
fclose(ib_out);
return 0;

@newhoggy
Copy link
Member Author

newhoggy commented Mar 15, 2019

Dealing with incompatible architectures

Allow the code to query how the package was compiled

-- | 'True' only when all three capability flags exposed by the @C@ module
-- ('C.avx_2', 'C.sse_4_2' and 'C.bmi_2') are 'True'.
enabledMakeStandardJsonIbBps :: Bool
enabledMakeStandardJsonIbBps = and
  [ C.avx_2
  , C.sse_4_2
  , C.bmi_2
  ]

flag avx2
description: Enable avx2 instruction set
manual: False
default: False
flag bmi2
description: Enable bmi2 instruction set
manual: False
default: False
flag sse42
description: Enable SSE 4.2 optimisations.
manual: False
default: True

common config
ghc-options: -Wall
default-language: Haskell2010
if impl(ghc >= 8.0.1)
ghc-options: -Wcompat -Wincomplete-record-updates -Wincomplete-uni-patterns -Wredundant-constraints
if flag(sse42)
ghc-options: -msse4.2
cc-options: -msse4.2
if flag(bmi2)
cc-options: -mbmi2
if impl(ghc >= 8.4.1)
ghc-options: -mbmi2
if flag(avx2)
cc-options: -mavx2

-- | 'True' when the 'F.enabled_avx_2' action yields a non-zero value.
-- The action is run through 'U.unsafePerformIO', hence the NOINLINE pragma.
{-# NOINLINE avx_2 #-}
avx_2 :: Bool
avx_2 = probe /= 0
  where
    probe = U.unsafePerformIO F.enabled_avx_2
-- | 'True' when the 'F.enabled_sse_4_2' action yields a non-zero value.
-- The action is run through 'U.unsafePerformIO', hence the NOINLINE pragma.
{-# NOINLINE sse_4_2 #-}
sse_4_2 :: Bool
sse_4_2 = probe /= 0
  where
    probe = U.unsafePerformIO F.enabled_sse_4_2
-- | 'True' when the 'F.enabled_bmi_2' action yields a non-zero value.
-- The action is run through 'U.unsafePerformIO', hence the NOINLINE pragma.
{-# NOINLINE bmi_2 #-}
bmi_2 :: Bool
bmi_2 = probe /= 0
  where
    probe = U.unsafePerformIO F.enabled_bmi_2

-- | Calls the C function @hw_json_simd_avx2_enabled@ through a c2hs
-- @call unsafe@ hook and converts its result to 'Int' with 'fromIntegral'.
enabled_avx_2 :: IO Int
enabled_avx_2 = fromIntegral <$> do
{#call unsafe hw_json_simd_avx2_enabled as c_hw_json_simd_avx2_enabled#}
{-# NOINLINE enabled_avx_2 #-}
-- | Calls the C function @hw_json_simd_sse4_2_enabled@ through a c2hs
-- @call unsafe@ hook and converts its result to 'Int' with 'fromIntegral'.
enabled_sse_4_2 :: IO Int
enabled_sse_4_2 = fromIntegral <$> do
{#call unsafe hw_json_simd_sse4_2_enabled as c_hw_json_simd_sse4_2_enabled#}
{-# NOINLINE enabled_sse_4_2 #-}
-- | Calls the C function @hw_json_simd_bmi2_enabled@ through a c2hs
-- @call unsafe@ hook and converts its result to 'Int' with 'fromIntegral'.
enabled_bmi_2 :: IO Int
enabled_bmi_2 = fromIntegral <$> do
{#call unsafe hw_json_simd_bmi2_enabled as c_hw_json_simd_bmi2_enabled#}
{-# NOINLINE enabled_bmi_2 #-}

/*
 * Compile-time probe for AVX2.
 *
 * Returns 1 when the compiler defined __AVX2__ while building this
 * translation unit, and 0 otherwise.
 */
int hw_json_simd_avx2_enabled() {
#if defined(__AVX2__)
  const int enabled = 1;
#else
  const int enabled = 0;
#endif
  return enabled;
}
/*
 * Compile-time probe for BMI2.
 *
 * Returns 1 when the compiler defined __BMI2__ while building this
 * translation unit, and 0 otherwise.
 */
int hw_json_simd_bmi2_enabled() {
#if defined(__BMI2__)
  const int enabled = 1;
#else
  const int enabled = 0;
#endif
  return enabled;
}
/*
 * Compile-time probe for SSE 4.2.
 *
 * Returns 1 when the compiler defined __SSE4_2__ while building this
 * translation unit (e.g. under -msse4.2), and 0 otherwise.
 *
 * The probe must test __SSE4_2__: testing __BMI2__ here would report BMI2
 * availability under the SSE 4.2 name, giving the wrong answer whenever
 * exactly one of the two instruction sets is enabled.
 */
int hw_json_simd_sse4_2_enabled() {
#ifdef __SSE4_2__
  return 1;
#else
  return 0;
#endif
}

@newhoggy
Copy link
Member Author

C Implementation of Parallel State Machines

/*
 * Runs the table-driven state machine over one chunk of input.
 *
 * in_buffer      - input bytes.
 * in_length      - number of bytes to consume from in_buffer.
 * inout_state    - packed 32-bit state; loaded into the low lane of the
 *                  working vector on entry and overwritten with the low
 *                  32 bits of that vector on exit.
 * out_phi_buffer - receives one 32-bit value per input byte
 *                  (in_length entries are written).
 *
 * For every byte, its entries in hw_json_simd_phi_table_32 and
 * hw_json_simd_transition_table_32 are broadcast to all 32-bit lanes and
 * byte-shuffled using the current state vector as the shuffle control.
 * The shuffled phi entry is emitted; the shuffled transition entry becomes
 * the next state vector.
 */
void
hw_json_simd_sm_process_chunk(
    uint8_t *in_buffer,
    size_t in_length,
    uint32_t *inout_state,
    uint32_t *out_phi_buffer) {
  __m128i state_lanes = _mm_set_epi64x(0, *inout_state);
  size_t pos = 0;

  while (pos < in_length) {
    const uint8_t byte = in_buffer[pos];
    const __m128i phi_row  = _mm_set1_epi32(hw_json_simd_phi_table_32[byte]);
    const __m128i next_row = _mm_set1_epi32(hw_json_simd_transition_table_32[byte]);

    /* The phi output must be taken with the state *before* it advances. */
    const __m128i phi = _mm_shuffle_epi8(phi_row, state_lanes);
    out_phi_buffer[pos] = _mm_extract_epi32(phi, 0);

    state_lanes = _mm_shuffle_epi8(next_row, state_lanes);
    pos += 1;
  }

  *inout_state = (uint32_t)_mm_extract_epi32(state_lanes, 0);
}

void
hw_json_simd_sm_make_ib_op_cl_chunks(
uint8_t state,
uint32_t *in_phis,
size_t phi_length,
uint8_t *out_ibs,
uint8_t *out_ops,
uint8_t *out_cls) {
uint32_t state_offset = state * 8;
uint32_t ib_offset = 5;
uint32_t op_offset = 6;
uint32_t cl_offset = 7;
for (size_t i = 0; i < phi_length; i += 8) {
__m256i v_8 = *(__m256i *)&in_phis[i];
__m256i v_ib_8 = _mm256_slli_epi64(_mm256_srli_epi64(v_8, state_offset), ib_offset);
__m256i v_op_8 = _mm256_slli_epi64(_mm256_srli_epi64(v_8, state_offset), op_offset);
__m256i v_cl_8 = _mm256_slli_epi64(_mm256_srli_epi64(v_8, state_offset), cl_offset);
uint8_t all_ibs = (uint8_t)_pext_u32(_mm256_movemask_epi8(v_ib_8), 0x11111111);
uint8_t all_ops = (uint8_t)_pext_u32(_mm256_movemask_epi8(v_op_8), 0x11111111);
uint8_t all_cls = (uint8_t)_pext_u32(_mm256_movemask_epi8(v_cl_8), 0x11111111);
size_t j = i / 8;
out_ibs[j] = all_ibs;
out_ops[j] = all_ops;
out_cls[j] = all_cls;
}

/*
 * Interleaves the open/close bitmaps of a chunk and appends the resulting
 * bit string to the output through hw_json_simd_sm_write_bits.
 *
 * result_op         - open bitmap, read as 64-bit words.
 * result_cl         - close bitmap, read as 64-bit words.
 * ib_bytes          - size of each bitmap in bytes; only ib_bytes / 8 whole
 *                     64-bit words are processed.
 * remaining_bits    - carry word shared with hw_json_simd_sm_write_bits.
 * remaning_bits_len - number of valid bits in *remaining_bits.
 * out_buffer        - receives completed 64-bit output words.
 *
 * Returns the number of 64-bit words written to out_buffer.
 *
 * Each 64-bit input word is handled as two 32-bit halves, low half first.
 * For a half, open bits are deposited on even bit positions and close bits
 * on odd positions; extracting the open bits at every occupied position
 * then yields a 1 for each open and a 0 for each close, in input order.
 */
size_t
hw_json_simd_sm_write_bp_chunk(
    uint8_t *result_op,
    uint8_t *result_cl,
    size_t ib_bytes,
    uint64_t *remaining_bits,
    size_t *remaning_bits_len,
    uint64_t *out_buffer) {
  const uint64_t *op_words = (uint64_t *)result_op;
  const uint64_t *cl_words = (uint64_t *)result_cl;
  const uint64_t word_count = ib_bytes / 8;
  size_t w64s_ready = 0;

  for (size_t word = 0; word < word_count; ++word) {
    uint64_t op_half = op_words[word];
    uint64_t cl_half = cl_words[word];

    /* Pass 0 handles the low 32 bits, pass 1 the high 32 bits. Only the
     * low 32 source bits are consumed because each mask has 32 set bits. */
    for (int pass = 0; pass < 2; ++pass) {
      const uint64_t opens    = _pdep_u64(op_half, 0x5555555555555555);
      const uint64_t closes   = _pdep_u64(cl_half, 0xaaaaaaaaaaaaaaaa);
      const uint64_t occupied = opens | closes;
      const size_t   bit_count = __builtin_popcountll(occupied);
      const uint64_t packed   = _pext_u64(opens, occupied);

      w64s_ready += hw_json_simd_sm_write_bits(
          packed, bit_count, remaining_bits, remaning_bits_len,
          out_buffer + w64s_ready);

      op_half >>= 32;
      cl_half >>= 32;
    }
  }

  return w64s_ready;
}

/*
 * Appends bits_len bits to a 64-bit carry word, flushing the carry to
 * *out_buffer once 64 bits have accumulated.
 *
 * bits               - bits to append, packed from bit 0 upwards; bits at
 *                      or above bits_len are expected to be zero.
 * bits_len           - number of valid bits in `bits` (0..64).
 * remaining_bits     - carry word holding bits not yet written.
 * remaining_bits_len - number of valid bits in *remaining_bits; expected to
 *                      be below 64 on entry and kept below 64 on return.
 * out_buffer         - receives one full 64-bit word when a flush occurs.
 *
 * Returns 1 if a word was written to *out_buffer, 0 otherwise.
 */
size_t
hw_json_simd_sm_write_bits(
    uint64_t bits,
    size_t bits_len,
    uint64_t *remaining_bits,
    size_t *remaining_bits_len,
    uint64_t *out_buffer) {
  const size_t pending_len = *remaining_bits_len;

  *remaining_bits |= (bits << pending_len);

  if (pending_len + bits_len >= 64) {
    *out_buffer = *remaining_bits;
    /* With nothing pending, every incoming bit went into the flushed word,
     * so the carry is empty. Computing bits >> 64 instead would be undefined
     * behaviour (on x86 it typically leaves `bits` unchanged, duplicating
     * the whole word into the carry). */
    *remaining_bits = (pending_len == 0) ? 0 : (bits >> (64 - pending_len));
    *remaining_bits_len = pending_len + bits_len - 64;
    return 1;
  }

  *remaining_bits_len = pending_len + bits_len;
  return 0;
}

/*
 * Emits the bits still held in the carry word, if any.
 *
 * remaining_bits     - carry word.
 * remaining_bits_len - number of valid low bits in remaining_bits.
 * out_buffer         - receives the carry with all bits at or above
 *                      remaining_bits_len cleared; left untouched when
 *                      remaining_bits_len is 0.
 *
 * Returns 1 if a word was written to *out_buffer, 0 otherwise.
 */
size_t
hw_json_simd_sm_write_bp_chunk_final(
    uint64_t remaining_bits,
    size_t remaining_bits_len,
    uint64_t *out_buffer) {
  if (remaining_bits_len == 0) {
    return 0;
  }

  /* Shift the unused high bits out and back in as zeros. */
  const size_t unused = 64 - remaining_bits_len;
  uint64_t masked = remaining_bits << unused;
  masked >>= unused;

  *out_buffer = masked;
  return 1;
}

@newhoggy
Copy link
Member Author

Foreign imports

import Foreign
#include "../cbits/simd.h"
-- Haskell-side aliases for the C integer types used by the bindings in this
-- module. Each right-hand side is a c2hs @type@ hook, so the concrete
-- Haskell type is chosen by c2hs from the named C type.
type UInt8 = {#type uint8_t #}
type UInt32 = {#type uint32_t#}
type UInt64 = {#type uint64_t#}
type Size = {#type size_t #}

-- | Binding to the C function @hw_json_simd_process_chunk@, imported with a
-- c2hs @call unsafe@ hook. The trailing comment on each argument gives the
-- name of the corresponding C parameter. The C definition is not part of
-- this excerpt, so the meaning of the returned 'UInt64' is not documented
-- here.
processChunk :: ()
=> Ptr UInt8 -- in_buffer
-> Size -- in_length
-> Ptr UInt8 -- work_bits_of_d
-> Ptr UInt8 -- work_bits_of_a
-> Ptr UInt8 -- work_bits_of_z
-> Ptr UInt8 -- work_bits_of_q
-> Ptr UInt8 -- work_bits_of_b
-> Ptr UInt8 -- work_bits_of_e
-> Ptr Size -- last_trailing_ones
-> Ptr Size -- quote_odds_carry
-> Ptr Size -- quote_evens_carry
-> Ptr UInt64 -- quote_mask_carry
-> Ptr UInt8 -- result_ibs
-> Ptr UInt8 -- result_a
-> Ptr UInt8 -- result_z
-> IO UInt64
processChunk = do
{#call unsafe hw_json_simd_process_chunk as c_hw_json_simd_process_chunk#}
{-# INLINE processChunk #-}

-- | Binding to the C function @hw_json_simd_init_bp_state@, imported with a
-- c2hs @call unsafe@ hook. Takes an untyped pointer; presumably it points
-- at the state block later passed to 'writeBpChunk' as @bp_state@ (confirm
-- against the C source, which is not part of this excerpt).
initBpState :: ()
=> Ptr ()
-> IO ()
initBpState = {#call unsafe hw_json_simd_init_bp_state as c_hw_json_simd_init_bp_state#}
{-# INLINE initBpState #-}

-- | Binding to the C function @hw_json_simd_write_bp_chunk@, imported with
-- a c2hs @call unsafe@ hook. The trailing comment on each argument gives
-- the name of the corresponding C parameter. The unit of the returned
-- 'Size' is not visible in this excerpt.
writeBpChunk :: ()
=> Ptr UInt8 -- result_ib
-> Ptr UInt8 -- result_a
-> Ptr UInt8 -- result_z
-> Size -- ib_bytes
-> Ptr () -- bp_state
-> Ptr UInt8 -- out_buffer
-> IO Size
writeBpChunk = {#call unsafe hw_json_simd_write_bp_chunk as c_hw_json_simd_write_bp_chunk#}
{-# INLINE writeBpChunk #-}

-- | Binding to the C function @hw_json_simd_write_bp_chunk_final@, imported
-- with a c2hs @call unsafe@ hook.
--
-- NOTE(review): the C function of this name shown elsewhere in this text
-- takes @(uint64_t, size_t, uint64_t *)@, which does not match this
-- signature; this binding and that definition appear to come from
-- different revisions. Confirm before use.
writeBpChunkFinal :: ()
=> Ptr () -- bp_state
-> Ptr UInt8 -- out_buffer
-> IO Size
writeBpChunkFinal = {#call unsafe hw_json_simd_write_bp_chunk_final as c_hw_json_simd_write_bp_chunk_final#}
{-# INLINE writeBpChunkFinal #-}

-- | Binding to the C function @hw_json_simd_sm_process_chunk@, imported with
-- a c2hs @call unsafe@ hook. In the C definition the state pointer is read
-- on entry and overwritten on exit, and one 32-bit phi value is written to
-- the output buffer per input byte.
smProcessChunk :: ()
=> Ptr UInt8 -- in_buffer
-> Size -- in_length
-> Ptr UInt32 -- inout_state
-> Ptr UInt32 -- out_phi_buffer
-> IO ()
smProcessChunk = {#call unsafe hw_json_simd_sm_process_chunk as c_hw_json_simd_sm_process_chunk#}

-- | Binding to the C function @hw_json_simd_sm_make_ib_op_cl_chunks@,
-- imported with a c2hs @call unsafe@ hook. The C definition consumes eight
-- phi words per iteration and writes one byte to each of the three output
-- buffers per eight phi words.
smMakeIbOpClChunks :: ()
=> UInt8 -- state
-> Ptr UInt32 -- in_phis
-> Size -- phi_length
-> Ptr UInt8 -- out_ibs
-> Ptr UInt8 -- out_ops
-> Ptr UInt8 -- out_cls
-> IO ()
smMakeIbOpClChunks = {#call unsafe hw_json_simd_sm_make_ib_op_cl_chunks as c_hw_json_simd_sm_make_ib_op_cl_chunks#}

-- | Binding to the C function @hw_json_simd_sm_write_bp_chunk@, imported
-- with a c2hs @call unsafe@ hook. The C definition returns the number of
-- 64-bit words it wrote to @out_buffer@ and updates the two carry pointers
-- in place.
smWriteBpChunk :: ()
=> Ptr UInt8 -- result_op
-> Ptr UInt8 -- result_cl
-> Size -- ib_bytes
-> Ptr UInt64 -- remaining_bp_bits
-> Ptr Size -- remaning_bp_bits_len
-> Ptr UInt64 -- out_buffer
-> IO Size
smWriteBpChunk = {#call unsafe hw_json_simd_sm_write_bp_chunk as c_hw_json_simd_sm_write_bp_chunk#}

-- | Binding to the C function @hw_json_simd_sm_write_bp_chunk_final@,
-- imported with a c2hs @call unsafe@ hook.
--
-- NOTE(review): the C definition shown in this text under
-- "C Implementation of Parallel State Machines" is named
-- @hw_json_simd_sm_write_bp_chunk_final@ and matches this signature; it
-- returns 1 if it wrote a word to @out_buffer@ and 0 otherwise.
smWriteBpChunkFinal :: ()
=> UInt64 -- remaining_bits
-> Size -- remaining_bits_len
-> Ptr UInt64 -- out_buffer
-> IO Size
smWriteBpChunkFinal = {#call unsafe hw_json_simd_sm_write_bp_chunk_final as c_hw_json_simd_sm_write_bp_chunk_final#}

@newhoggy
Copy link
Member Author

newhoggy commented Mar 15, 2019

Parsing state

Data types

-- | Scratch buffers carved out of a single allocation.
--
-- As built by @allocWorkBuffers n@: 'workBuffersP' owns one allocation of
-- @6 * n@ bytes, and the six 'Ptr' fields point at byte offsets
-- @0, n, 2n, 3n, 4n, 5n@ inside it (D, A, Z, Q, B, E in that order). The
-- raw pointers are only valid while 'workBuffersP' is alive.
--
-- NOTE(review): the field suffixes presumably correspond to the
-- @work_bits_of_{d,a,z,q,b,e}@ parameters of @processChunk@ - confirm.
data WorkBuffers = WorkBuffers
{ workBuffersP :: !(ForeignPtr F.UInt8)
, workBuffersD :: !(Ptr F.UInt8)
, workBuffersA :: !(Ptr F.UInt8)
, workBuffersZ :: !(Ptr F.UInt8)
, workBuffersQ :: !(Ptr F.UInt8)
, workBuffersB :: !(Ptr F.UInt8)
, workBuffersE :: !(Ptr F.UInt8)
}
-- | Mutable carry cells that live in foreign memory.
--
-- As built by @allocWorkState@: 'workStateP' owns a 256-byte allocation and
-- the four 'Ptr' fields point at byte offsets 0 (Z), 8 (O), 16 (E) and
-- 24 (M) inside it, initialised to 0, 0, 1 and 0 respectively. The raw
-- pointers are only valid while 'workStateP' is alive.
--
-- NOTE(review): by type, Z/O/E/M presumably feed the @last_trailing_ones@,
-- @quote_odds_carry@, @quote_evens_carry@ and @quote_mask_carry@ parameters
-- of @processChunk@ - confirm against the caller.
data WorkState = WorkState
{ workStateZ :: !(Ptr F.Size)
, workStateO :: !(Ptr F.Size)
, workStateE :: !(Ptr F.Size)
, workStateM :: !(Ptr F.UInt64)
, workStateP :: !(ForeignPtr Word8)
}
-- | Opaque block of foreign memory wrapped in a newtype. @emptyBpState@
-- creates one backed by a 32-byte allocation; its layout is not visible in
-- this excerpt.
newtype BpState = BpState
{ bpStateP :: ForeignPtr Word8
}
-- | Pairs a rank-2 'ST' action with an 'Int'.
--
-- The action must work for every state thread @s@: given a 'BpState' and a
-- mutable vector of 'Word64', it produces an 'Int'.
--
-- NOTE(review): the meaning of the action's result and of the second 'Int'
-- field is not visible in this excerpt - confirm against the code that
-- constructs and runs 'Step'.
data Step where
Step :: ( forall s
. BpState
-> DVSM.MVector s Word64
-> ST s Int)
-> Int
-> Step

State initialisation

-- | Allocates a fresh 32-byte foreign block and wraps it as a 'BpState'.
-- The contents of the block are not initialised here.
emptyBpState :: IO BpState
emptyBpState =
  fmap (BpState . F.castForeignPtr) (F.mallocForeignPtrBytes 32)
-- | @allocWorkBuffers n@ makes one allocation of @6 * n@ bytes and splits it
-- into six consecutive @n@-byte regions (D, A, Z, Q, B, E in that order).
-- The owning 'ForeignPtr' is kept in 'workBuffersP'; the region pointers
-- are raw and must not outlive it.
allocWorkBuffers :: Int -> IO WorkBuffers
allocWorkBuffers n = do
  owner <- F.mallocForeignPtrBytes (6 * n)
  let base = F.unsafeForeignPtrToPtr owner
      region k = base `F.plusPtr` (n * k)
  return WorkBuffers
    { workBuffersP = owner
    , workBuffersD = region 0
    , workBuffersA = region 1
    , workBuffersZ = region 2
    , workBuffersQ = region 3
    , workBuffersB = region 4
    , workBuffersE = region 5
    }
-- | Allocates a 256-byte foreign block and lays out four carry cells at
-- byte offsets 0, 8, 16 and 24. The cells are initialised in the order
-- Z = 0, O = 0, E = 1, M = 0 before the state is returned.
allocWorkState :: IO WorkState
allocWorkState = do
  owner <- F.mallocForeignPtrBytes 256
  let base = F.unsafeForeignPtrToPtr owner
      state = WorkState
        { workStateZ = base `F.plusPtr` 0
        , workStateO = base `F.plusPtr` 8
        , workStateE = base `F.plusPtr` 16
        , workStateM = base `F.plusPtr` 24
        , workStateP = owner
        }
  -- The three 'F.Size' cells share a type, so they can be seeded together.
  mapM_ (uncurry F.poke)
    [ (workStateZ state, 0)
    , (workStateO state, 0)
    , (workStateE state, 1)
    ]
  F.poke (workStateM state) 0
  return state

@newhoggy
Copy link
Member Author

newhoggy commented Mar 15, 2019

Building the index (streaming)

-- | Safe entry point for building the index chunks.
--
-- Returns 'Right' with the result of 'makeStandardJsonIbBpsUnsafe' when
-- 'enabledMakeStandardJsonIbBps' is 'True', and 'Left' with an explanatory
-- message otherwise.
makeStandardJsonIbBps :: LBS.ByteString -> Either String [(BS.ByteString, BS.ByteString)]
makeStandardJsonIbBps lbs
  | enabledMakeStandardJsonIbBps = Right (makeStandardJsonIbBpsUnsafe lbs)
  | otherwise                    = Left "makeStandardJsonIbBps function is disabled"

-- | Lazily turns each strict chunk of the input into a pair of strict
-- bytestrings: the first built from the buffer passed to the C code as
-- @out_ibs@, the second from the buffer passed to 'F.smWriteBpChunk' as
-- @out_buffer@. After the last input chunk one extra pair is produced
-- whose first component is empty and whose second holds whatever
-- 'F.smWriteBpChunkFinal' wrote.
--
-- No check of 'enabledMakeStandardJsonIbBps' is made here; callers that
-- need the check should use 'makeStandardJsonIbBps'.
--
-- State carried between chunks (the packed state word and the two carry
-- cells) lives in foreign memory that is allocated once up front and is
-- threaded through @go@. Later chunks are processed on demand via
-- 'IO.unsafeInterleaveIO'.
makeStandardJsonIbBpsUnsafe :: LBS.ByteString -> [(BS.ByteString, BS.ByteString)]
makeStandardJsonIbBpsUnsafe lbs = F.unsafeLocalState $ do
-- NOTE(review): 1204 looks like a typo for 1024; as written the size
-- argument is about 17% larger than 32 MiB. Confirm the intended value.
wb <- allocWorkBuffers (32 * 1024 * 1204)
-- NOTE(review): newWorkState is not defined in the visible excerpts (the
-- allocator shown is allocWorkState, which takes no argument), and ws is
-- only threaded through go without being read. Confirm.
ws <- newWorkState 0
fptrState :: F.ForeignPtr F.UInt32 <- F.mallocForeignPtr
fptrRemBits :: F.ForeignPtr F.UInt64 <- F.mallocForeignPtr
fptrRemBitsLen :: F.ForeignPtr F.Size <- F.mallocForeignPtr
let ptrState = F.unsafeForeignPtrToPtr fptrState
let ptrRemBits = F.unsafeForeignPtrToPtr fptrRemBits
let ptrRemBitsLen = F.unsafeForeignPtrToPtr fptrRemBitsLen
-- NOTE(review): the state word starts at 0 here, whereas the C driver shown
-- in this text starts from 0x03020100. Confirm which is intended.
F.poke ptrState 0
F.poke ptrRemBits 0
F.poke ptrRemBitsLen 0
IO.unsafeInterleaveIO $ go wb ws fptrState fptrRemBits fptrRemBitsLen (LBS.toChunks lbs)
where go :: ()
=> WorkBuffers
-> WorkState
-> F.ForeignPtr F.UInt32
-> F.ForeignPtr F.UInt64
-> F.ForeignPtr F.Size
-> [BS.ByteString]
-> IO [(BS.ByteString, BS.ByteString)]
-- End of input: flush the carry cells into a fresh 8-byte buffer.
go _ _ _ fptrRemBits fptrRemBitsLen [] = do
resBpFptr <- F.mallocForeignPtrBytes 8
let resBpPtr = F.castPtr (F.unsafeForeignPtrToPtr resBpFptr )
let ptrRemBits = F.unsafeForeignPtrToPtr fptrRemBits
let ptrRemBitsLen = F.unsafeForeignPtrToPtr fptrRemBitsLen
remBits <- F.peek ptrRemBits
remBitsLen <- F.peek ptrRemBitsLen
bpByteLen <- F.smWriteBpChunkFinal
remBits -- remaining_bp_bits
remBitsLen -- remaning_bp_bits_len
resBpPtr -- out_buffer
-- The returned count is scaled by 8 to obtain a byte length.
return [ ( BS.empty
, BSI.fromForeignPtr resBpFptr 0 (fromIntegral bpByteLen * 8)
)
]
-- One input chunk: run the state machine, derive the bitmaps, then append
-- the open/close bits to the output.
go wb ws fptrState fptrRemBits fptrRemBitsLen (bs:bss) = do
let (!bsFptr, !bsOff, !bsLen) = BSI.toForeignPtr bs
-- One output bit per input byte, rounded up to whole bytes.
let !idxByteLen = (bsLen + 7) `div` 8
resIbFptr <- F.mallocForeignPtrBytes idxByteLen
resBpFptr <- F.mallocForeignPtrBytes idxByteLen
let resIbPtr = F.castPtr (F.unsafeForeignPtrToPtr resIbFptr )
let resBpPtr = F.castPtr (F.unsafeForeignPtrToPtr resBpFptr )
let bsPtr = F.castPtr (F.unsafeForeignPtrToPtr bsFptr)
let ptrState = F.unsafeForeignPtrToPtr fptrState
let ptrRemBits = F.unsafeForeignPtrToPtr fptrRemBits
let ptrRemBitsLen = F.unsafeForeignPtrToPtr fptrRemBitsLen
-- Snapshot the state before smProcessChunk overwrites it; only the low
-- byte is kept because of the Word8 annotation.
s :: Word8 <- fromIntegral <$> F.peek ptrState
-- NOTE(review): in the WorkBuffers record shown in this text workBuffersP
-- is a ForeignPtr, while the smProcessChunk / smMakeIbOpClChunks bindings
-- take a Ptr UInt32; likewise workBuffersO and workBuffersC below are not
-- fields of that record. This function appears to target a different
-- revision of WorkBuffers - confirm.
void $ F.smProcessChunk
(F.plusPtr bsPtr bsOff) -- in_buffer: Ptr UInt8
(fromIntegral bsLen) -- in_length: Size
ptrState -- work state: Ptr UInt32
(workBuffersP wb) -- result_phi: Ptr UInt8
void $ F.smMakeIbOpClChunks
(fromIntegral s) -- state
(workBuffersP wb) -- in_phis
(fromIntegral bsLen) -- phi_length
resIbPtr -- out_ibs
(workBuffersO wb) -- out_ops
(workBuffersC wb) -- out_cls
bpByteLen <- F.smWriteBpChunk
(workBuffersO wb) -- result_op
(workBuffersC wb) -- result_cl
(fromIntegral idxByteLen) -- ib_bytes
ptrRemBits -- remaining_bp_bits
ptrRemBitsLen -- remaning_bp_bits_len
resBpPtr -- out_buffer
-- The returned count is scaled by 8 to obtain a byte length.
let !r =
( BSI.fromForeignPtr resIbFptr 0 idxByteLen
, BSI.fromForeignPtr resBpFptr 0 (fromIntegral bpByteLen * 8)
)
-- The remaining chunks are processed only when the list tail is forced.
rs <- IO.unsafeInterleaveIO $ go wb ws fptrState fptrRemBits fptrRemBitsLen bss
return (r:rs)

"standard" -> do
IO.withFile filePath IO.ReadMode $ \hIn -> do
contents <- LBS.resegmentPadded 512 <$> LBS.hGetContents hIn
case makeStandardJsonIbBps contents of
Right chunks -> do
IO.withFile outputIbFile IO.WriteMode $ \hIb -> do
IO.withFile outputBpFile IO.WriteMode $ \hBp -> do
forM_ chunks $ \(ibBs, bpBs) -> do
BS.hPut hIb ibBs
BS.hPut hBp bpBs
Left msg -> do
IO.hPutStrLn IO.stderr $ "Unable to create index: " <> show msg
IO.exitFailure
_ -> do
IO.hPutStrLn IO.stderr $ "Unrecognised method: " <> show method
IO.exitFailure

@newhoggy newhoggy changed the title JSON parsing with SIMD Talk: JSON parsing with SIMD Mar 15, 2019
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant