Skip to content

Commit 1680903

Browse files
committed
improve JSON parser performance
- unified string/stream algorithm - templated Source class - use char_traits integer type and conversion function for char extraction
1 parent bc808cf commit 1680903

4 files changed

Lines changed: 20389 additions & 383 deletions

File tree

JSON/include/Poco/JSON/Parser.h

Lines changed: 317 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -69,15 +69,13 @@ SOFTWARE.
6969
#include "Poco/JSON/ParseHandler.h"
7070
#include "Poco/Dynamic/Var.h"
7171
#include "Poco/StreamTokenizer.h"
72+
#include <string>
7273

7374

7475
namespace Poco {
7576
namespace JSON {
7677

7778

78-
class Source;
79-
80-
8179
class JSON_API Parser
8280
/// An RFC 4627 compatible class for parsing JSON strings or streams.
8381
///
@@ -97,6 +95,8 @@ class JSON_API Parser
9795
///
9896
{
9997
public:
98+
typedef std::char_traits<char> CharTraits;
99+
typedef CharTraits::int_type CharIntType;
100100

101101
enum Classes
102102
{
@@ -287,26 +287,332 @@ class JSON_API Parser
287287

288288
void parseBufferPopBackChar();
289289

290-
void addCharToParseBuffer(int nextChar, int nextClass);
290+
void addCharToParseBuffer(CharIntType nextChar, int nextClass);
291291

292-
void addEscapedCharToParseBuffer(int nextChar);
292+
void addEscapedCharToParseBuffer(CharIntType nextChar);
293293

294-
int decodeUnicodeChar();
294+
CharIntType decodeUnicodeChar();
295295

296296
void assertNotStringNullBool();
297297

298298
void assertNonContainer();
299299

300300
void parseBuffer();
301301

302-
bool parseChar(int nextChar, Source& feeder);
302+
template <typename IT>
303+
class Source
304+
{
305+
public:
306+
Source(const IT& it, const IT& end) : _it(it), _end(end)
307+
{
308+
}
309+
310+
~Source()
311+
{
312+
}
313+
314+
bool nextChar(CharIntType& c)
315+
{
316+
if (_it == _end) return false;
317+
c = *_it;
318+
++_it;
319+
return true;
320+
}
321+
322+
private:
323+
IT _it;
324+
IT _end;
325+
};
326+
327+
template <typename S>
328+
bool parseChar(CharIntType nextChar, S& source)
303329
/// Called for each character (or partial character) in JSON string.
304330
/// It accepts UTF-8, UTF-16, or UTF-32. If the character is accepted,
305331
/// it returns true, otherwise false.
332+
{
333+
CharIntType nextClass, nextState;
334+
unsigned char ch = static_cast<unsigned char>(CharTraits::to_char_type(nextChar));
335+
336+
// Determine the character's class.
337+
if (ch < 0 || (!_allowNullByte && ch == 0)) return false;
338+
if (0x80 <= ch && ch <= 0xFF)
339+
{
340+
nextClass = C_ETC;
341+
CharIntType count = utf8CheckFirst(nextChar);
342+
if (!count)
343+
{
344+
throw JSONException(format("Unable to decode byte 0x%x", (unsigned int) nextChar));
345+
}
346+
347+
char buffer[4];
348+
buffer[0] = nextChar;
349+
for(int i = 1; i < count; ++i)
350+
{
351+
int c = 0;
352+
if (!source.nextChar(c)) throw JSONException("Invalid UTF8 sequence found");
353+
buffer[i] = c;
354+
}
355+
356+
if (!UTF8Encoding::isLegal((unsigned char*) buffer, count))
357+
{
358+
throw JSONException("No legal UTF8 found");
359+
}
360+
361+
for(int i = 0; i < count; ++i)
362+
{
363+
parseBufferPushBackChar(buffer[i]);
364+
}
365+
return true;
366+
}
367+
else
368+
{
369+
nextClass = _asciiClass[nextChar];
370+
if (nextClass <= xx) return false;
371+
}
372+
373+
addCharToParseBuffer(nextChar, nextClass);
374+
375+
// Get the next _state from the _state transition table.
376+
nextState = _stateTransitionTable[_state][nextClass];
377+
if (nextState >= 0)
378+
{
379+
_state = nextState;
380+
}
381+
else
382+
{
383+
// Or perform one of the actions.
384+
switch (nextState)
385+
{
386+
// Unicode character
387+
case UC:
388+
if(!decodeUnicodeChar()) return false;
389+
// check if we need to read a second UTF-16 char
390+
if (_utf16HighSurrogate) _state = D1;
391+
else _state = ST;
392+
break;
393+
// _escaped char
394+
case EX:
395+
_escaped = 1;
396+
_state = ES;
397+
break;
398+
// integer detected by minus
399+
case MX:
400+
_type = JSON_T_INTEGER;
401+
_state = MI;
402+
break;
403+
// integer detected by zero
404+
case ZX:
405+
_type = JSON_T_INTEGER;
406+
_state = ZE;
407+
break;
408+
// integer detected by 1-9
409+
case IX:
410+
_type = JSON_T_INTEGER;
411+
_state = IT;
412+
break;
413+
// floating point number detected by exponent
414+
case DE:
415+
assertNotStringNullBool();
416+
_type = JSON_T_FLOAT;
417+
_state = E1;
418+
break;
419+
// floating point number detected by fraction
420+
case DF:
421+
assertNotStringNullBool();
422+
_type = JSON_T_FLOAT;
423+
_state = FX;
424+
break;
425+
// string begin "
426+
case SB:
427+
clearBuffer();
428+
poco_assert(_type == JSON_T_NONE);
429+
_type = JSON_T_STRING;
430+
_state = ST;
431+
break;
432+
433+
// n
434+
case NU:
435+
poco_assert(_type == JSON_T_NONE);
436+
_type = JSON_T_NULL;
437+
_state = N1;
438+
break;
439+
// f
440+
case FA:
441+
poco_assert(_type == JSON_T_NONE);
442+
_type = JSON_T_FALSE;
443+
_state = F1;
444+
break;
445+
// t
446+
case TR:
447+
poco_assert(_type == JSON_T_NONE);
448+
_type = JSON_T_TRUE;
449+
_state = T1;
450+
break;
451+
452+
// closing comment
453+
case CE:
454+
_comment = 0;
455+
poco_assert(_parseBufferCount == 0);
456+
poco_assert(_type == JSON_T_NONE);
457+
_state = _beforeCommentState;
458+
break;
459+
460+
// opening comment
461+
case CB:
462+
if (!_allowComments) return false;
463+
parseBufferPopBackChar();
464+
parseBuffer();
465+
poco_assert(_parseBufferCount == 0);
466+
poco_assert(_type != JSON_T_STRING);
467+
switch (_stack[_top])
468+
{
469+
case MODE_ARRAY:
470+
case MODE_OBJECT:
471+
switch(_state)
472+
{
473+
case VA:
474+
case AR:
475+
_beforeCommentState = _state;
476+
break;
477+
default:
478+
_beforeCommentState = OK;
479+
break;
480+
}
481+
break;
482+
default:
483+
_beforeCommentState = _state;
484+
break;
485+
}
486+
_type = JSON_T_NONE;
487+
_state = C1;
488+
_comment = 1;
489+
break;
490+
// empty }
491+
case -9:
492+
{
493+
clearBuffer();
494+
if (_pHandler) _pHandler->endObject();
495+
496+
if (!pop(MODE_KEY)) return false;
497+
_state = OK;
498+
break;
499+
}
500+
// }
501+
case -8:
502+
{
503+
parseBufferPopBackChar();
504+
parseBuffer();
505+
if (_pHandler) _pHandler->endObject();
506+
if (!pop(MODE_OBJECT)) return false;
507+
_type = JSON_T_NONE;
508+
_state = OK;
509+
break;
510+
}
511+
// ]
512+
case -7:
513+
{
514+
parseBufferPopBackChar();
515+
parseBuffer();
516+
if (_pHandler) _pHandler->endArray();
517+
if (!pop(MODE_ARRAY)) return false;
518+
_type = JSON_T_NONE;
519+
_state = OK;
520+
break;
521+
}
522+
// {
523+
case -6:
524+
{
525+
parseBufferPopBackChar();
526+
if (_pHandler) _pHandler->startObject();
527+
if (!push(MODE_KEY)) return false;
528+
poco_assert(_type == JSON_T_NONE);
529+
_state = OB;
530+
break;
531+
}
532+
// [
533+
case -5:
534+
{
535+
parseBufferPopBackChar();
536+
if (_pHandler) _pHandler->startArray();
537+
if (!push(MODE_ARRAY)) return false;
538+
poco_assert(_type == JSON_T_NONE);
539+
_state = AR;
540+
break;
541+
}
542+
// string end "
543+
case -4:
544+
parseBufferPopBackChar();
545+
switch (_stack[_top])
546+
{
547+
case MODE_KEY:
548+
{
549+
poco_assert(_type == JSON_T_STRING);
550+
_type = JSON_T_NONE;
551+
_state = CO;
552+
553+
if (_pHandler)
554+
{
555+
std::string value(_parseBuffer.begin(), _parseBufferCount);
556+
_pHandler->key(value);
557+
}
558+
clearBuffer();
559+
break;
560+
}
561+
case MODE_ARRAY:
562+
case MODE_OBJECT:
563+
poco_assert(_type == JSON_T_STRING);
564+
parseBuffer();
565+
_type = JSON_T_NONE;
566+
_state = OK;
567+
break;
568+
default:
569+
return false;
570+
}
571+
break;
572+
573+
// ,
574+
case -3:
575+
{
576+
parseBufferPopBackChar();
577+
parseBuffer();
578+
switch (_stack[_top])
579+
{
580+
case MODE_OBJECT:
581+
//A comma causes a flip from object mode to key mode.
582+
if (!pop(MODE_OBJECT) || !push(MODE_KEY)) return false;
583+
poco_assert(_type != JSON_T_STRING);
584+
_type = JSON_T_NONE;
585+
_state = KE;
586+
break;
587+
case MODE_ARRAY:
588+
poco_assert(_type != JSON_T_STRING);
589+
_type = JSON_T_NONE;
590+
_state = VA;
591+
break;
592+
default:
593+
return false;
594+
}
595+
break;
596+
}
597+
// :
598+
case -2:
599+
// A colon causes a flip from key mode to object mode.
600+
parseBufferPopBackChar();
601+
if (!pop(MODE_KEY) || !push(MODE_OBJECT)) return false;
602+
poco_assert(_type == JSON_T_NONE);
603+
_state = VA;
604+
break;
605+
//Bad action.
606+
default:
607+
return false;
608+
}
609+
}
610+
return true;
611+
}
306612

307613
bool done();
308614

309-
static int utf8_check_first(char byte);
615+
static CharIntType utf8CheckFirst(char byte);
310616

311617
static const int _asciiClass[128];
312618
/// This array maps the 128 ASCII characters into character classes.
@@ -390,7 +696,9 @@ inline Dynamic::Var Parser::result() const
390696

391697
inline Dynamic::Var Parser::asVar() const
	/// Returns whatever the handler's asVar() yields, or a
	/// default-constructed Var when no handler is set.
{
	return _pHandler ? _pHandler->asVar() : Dynamic::Var();
}
395703

396704

0 commit comments

Comments
 (0)