Skip to content

Commit b666e42

Browse files
authored
Improve arrow-json deserialization performance by 30% (#7157)
* Implement BufIter methods directly instead of wrapping an iterator (22% faster)
* Use memchr2 to significantly speed up string-end finding
* Use simdutf8 to speed up UTF-8 validation of JSON
1 parent cb18801 commit b666e42

File tree

2 files changed

+36
-12
lines changed

2 files changed

+36
-12
lines changed

arrow-json/Cargo.toml

+2
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,8 @@ serde = { version = "1.0", default-features = false }
4646
serde_json = { version = "1.0", default-features = false, features = ["std"] }
4747
chrono = { workspace = true }
4848
lexical-core = { version = "1.0", default-features = false}
49+
memchr = "2.7.4"
50+
simdutf8 = "0.1.5"
4951

5052
[dev-dependencies]
5153
flate2 = { version = "1", default-features = false, features = ["rust_backend"] }

arrow-json/src/reader/tape.rs

+34 -12
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717

1818
use crate::reader::serializer::TapeSerializer;
1919
use arrow_schema::ArrowError;
20+
use memchr::memchr2;
2021
use serde::Serialize;
2122
use std::fmt::Write;
2223

@@ -394,7 +395,7 @@ impl TapeDecoder {
394395
}
395396
// Decoding a string
396397
DecoderState::String => {
397-
let s = iter.advance_until(|b| matches!(b, b'\\' | b'"'));
398+
let s = iter.skip_chrs(b'\\', b'"');
398399
self.bytes.extend_from_slice(s);
399400

400401
match next!(iter) {
@@ -582,7 +583,7 @@ impl TapeDecoder {
582583
self.bytes.len()
583584
);
584585

585-
let strings = std::str::from_utf8(&self.bytes)
586+
let strings = simdutf8::basic::from_utf8(&self.bytes)
586587
.map_err(|_| ArrowError::JsonError("Encountered non-UTF-8 data".to_string()))?;
587588

588589
for offset in self.offsets.iter().copied() {
@@ -615,29 +616,33 @@ impl TapeDecoder {
615616
}
616617

617618
/// A wrapper around a slice iterator that provides some helper functionality
618-
struct BufIter<'a>(std::slice::Iter<'a, u8>);
619+
struct BufIter<'a> {
620+
buf: &'a [u8],
621+
pos: usize,
622+
}
619623

620624
impl<'a> BufIter<'a> {
621625
fn new(buf: &'a [u8]) -> Self {
622-
Self(buf.iter())
626+
Self { buf, pos: 0 }
623627
}
624628

629+
#[inline]
625630
fn as_slice(&self) -> &'a [u8] {
626-
self.0.as_slice()
631+
&self.buf[self.pos..]
627632
}
628633

634+
#[inline]
629635
fn is_empty(&self) -> bool {
630-
self.0.len() == 0
636+
self.pos >= self.buf.len()
631637
}
632638

633639
fn peek(&self) -> Option<u8> {
634-
self.0.as_slice().first().copied()
640+
self.buf.get(self.pos).copied()
635641
}
636642

643+
#[inline]
637644
fn advance(&mut self, skip: usize) {
638-
for _ in 0..skip {
639-
self.0.next();
640-
}
645+
self.pos += skip;
641646
}
642647

643648
fn advance_until<F: FnMut(u8) -> bool>(&mut self, f: F) -> &[u8] {
@@ -654,6 +659,20 @@ impl<'a> BufIter<'a> {
654659
}
655660
}
656661

662+
fn skip_chrs(&mut self, c1: u8, c2: u8) -> &[u8] {
663+
let s = self.as_slice();
664+
match memchr2(c1, c2, s) {
665+
Some(p) => {
666+
self.advance(p);
667+
&s[..p]
668+
}
669+
None => {
670+
self.advance(s.len());
671+
s
672+
}
673+
}
674+
}
675+
657676
fn skip_whitespace(&mut self) {
658677
self.advance_until(|b| !json_whitespace(b));
659678
}
@@ -663,11 +682,14 @@ impl Iterator for BufIter<'_> {
663682
type Item = u8;
664683

665684
fn next(&mut self) -> Option<Self::Item> {
666-
self.0.next().copied()
685+
let b = self.peek();
686+
self.pos += 1;
687+
b
667688
}
668689

669690
fn size_hint(&self) -> (usize, Option<usize>) {
670-
self.0.size_hint()
691+
let s = self.buf.len().checked_sub(self.pos).unwrap_or_default();
692+
(s, Some(s))
671693
}
672694
}
673695

0 commit comments

Comments
 (0)