forked from facebook/hermes
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathExecutor.cpp
More file actions
1539 lines (1354 loc) · 53.7 KB
/
Executor.cpp
File metadata and controls
1539 lines (1354 loc) · 53.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "hermes/Regex/Executor.h"
#include "hermes/Regex/RegexTraits.h"
#include "llvh/ADT/SmallVector.h"
#include "llvh/Support/TrailingObjects.h"
// This file contains the machinery for executing a regexp compiled to bytecode.
namespace hermes {
namespace regex {
template <class Traits>
struct State;
/// The kind of error that occurred when trying to find a match.
enum class MatchRuntimeErrorType {
/// No error occurred.
None,
/// Reached maximum stack depth while searching for match.
MaxStackDepth,
};
/// An enum describing Width1 opcodes. This is the set of regex opcodes which
/// always match exactly one character (or fail). This is broken out from Opcode
/// to get exhaustiveness checking in switch statements. Note that conversions
/// can be performed via static_cast.
enum class Width1Opcode : uint8_t {
MatchChar8 = (uint8_t)Opcode::MatchChar8,
MatchChar16 = (uint8_t)Opcode::MatchChar16,
MatchCharICase8 = (uint8_t)Opcode::MatchCharICase8,
MatchCharICase16 = (uint8_t)Opcode::MatchCharICase16,
MatchAny = (uint8_t)Opcode::MatchAny,
MatchAnyButNewline = (uint8_t)Opcode::MatchAnyButNewline,
Bracket = (uint8_t)Opcode::Bracket,
};
/// LoopData tracks information about a loop during a match attempt. Each State
/// has one LoopData per loop.
struct LoopData {
/// The number of times that this loop has executed in this state.
uint32_t iterations;
/// The input position where we entered the loop.
uint32_t entryPosition;
};
/// Cursor is a lightweight value type which allows tracking a character pointer
/// 'current' within a range 'first' to 'last'.
/// A cursor may either be forwards, in which case it proceeds from 'first' to
/// 'last'. It may also (in the case of lookbehind assertions) be backwards, in
/// which case the cursor proceeds from 'last' to 'first'. The terms "begin" and
/// "end" denote tracking in the direction of the cursor, while "left" and
/// "right" are direction independent.
template <class Traits>
class Cursor {
using CodeUnit = typename Traits::CodeUnit;
using CodePoint = typename Traits::CodePoint;
public:
/// Construct with the range \p first and \p last, setting the current
/// position to \p first. Note that the \p last is one past the last valid
/// character. \p forwards decides whether the current pointer advances
/// towards last_ (true) or first_ (false).
Cursor(
const CodeUnit *first,
const CodeUnit *current,
const CodeUnit *last,
bool forwards)
: first_(first),
last_(last),
current_(current),
end_(forwards ? last : first),
forwards_(forwards) {
assert(first_ <= last_ && "first and last out of order");
assert(
first_ <= current_ && current <= last_ &&
"current pointer not in range");
}
/// \return whether this cursor advances forwards.
bool forwards() const {
return forwards_;
}
/// Set whether this cursor advances forwards to \p flag.
void setForwards(bool flag) {
forwards_ = flag;
end_ = forwards_ ? last_ : first_;
}
/// \return the number of code units remaining.
uint32_t remaining() const {
return forwards_ ? last_ - current_ : current_ - first_;
}
/// \return whether we are at the end of the range.
bool atEnd() const {
return current_ == end_;
}
/// \return the number of code units consumed from the leftmost character.
/// This is called "offsetFromLeft" and not "offsetFromStart" to indicate that
/// it does not change under backwards tracking.
uint32_t offsetFromLeft() const {
return current_ - first_;
}
/// \return the number of code units between the current position and the end
/// of the string.
/// This is called "offsetFromRight" and not "offsetFromEnd" to indicate that
/// it does not change under backwards tracking.
uint32_t offsetFromRight() const {
return last_ - current_;
}
/// \return whether we are at the leftmost position.
/// This does not change under backwards tracking.
bool atLeft() const {
return current_ == first_;
}
/// \return whether we are at the rightmost position.
/// This does not change under backwards tracking.
bool atRight() const {
return current_ == last_;
}
/// \return the current code unit.
CodeUnit current() const {
// Access the character at index 0 if forwards, -1 if backwards.
assert(!atEnd() && "Cursor is at end");
return current_[(int)forwards_ - 1];
}
/// \return the current cursor position.
const CodeUnit *currentPointer() const {
return current_;
}
/// Set the current cursor position to \p current.
void setCurrentPointer(const CodeUnit *current) {
assert(first_ <= current && current <= last_ && "Current not in range");
current_ = current;
}
/// \return the current code unit, advancing the cursor by 1.
CodeUnit consume() {
CodeUnit result = current();
current_ += forwards_ ? 1 : -1;
return result;
}
/// \return a code point decoded from the code units under the cursor,
/// possibly by decoding surrogates. Advances the cursor by the number of code
/// units consumed.
CodePoint consumeUTF16() {
assert(!atEnd() && "At end");
// In ASCII we have no surrogates.
if (sizeof(CodeUnit) >= 2 && remaining() >= 2) {
CodeUnit hi = forwards_ ? current_[0] : current_[-2];
CodeUnit lo = forwards_ ? current_[1] : current_[-1];
if (isHighSurrogate(hi) && isLowSurrogate(lo)) {
current_ += forwards_ ? 2 : -2;
return decodeSurrogatePair(hi, lo);
}
}
return consume();
}
/// \return whether a regex match performed using the given \p flags can
/// possibly match the given \p constraints.
bool satisfiesConstraints(
constants::MatchFlagType flags,
MatchConstraintSet constraints) const {
if ((constraints & MatchConstraintNonASCII) &&
(flags & constants::matchInputAllAscii))
return false;
if ((constraints & MatchConstraintAnchoredAtStart) && current_ != first_)
return false;
return true;
}
private:
// The first code unit in the string.
const CodeUnit *first_;
// One past the last code unit in the string.
const CodeUnit *last_;
// Our position between first_ and last_.
// If we are forwards, then the current character is current_[0].
// If we are backwards, then the current character is current_[-1].
const CodeUnit *current_;
// A pointer to the end. This is either last (if forwards) or first (if not
// forwards). If our current cursor reaches this value, we are done.
const CodeUnit *end_;
// Whether we are tracking forwards or backwards.
bool forwards_;
};
/// A Context records global information about a match attempt.
template <class Traits>
struct Context {
using CodeUnit = typename Traits::CodeUnit;
using CodePoint = typename Traits::CodePoint;
/// The set of backtracking opcodes. These are interpreted by the backtrack()
/// function.
enum class BacktrackOp : uint8_t {
/// Set the value of a capture group to a stored value.
SetCaptureGroup,
/// Set the value of a loop data to a stored value.
SetLoopData,
/// Set the IP and position in the input string to a stored value.
SetPosition,
/// Backtrack by entering the body of a non-greedy loop.
EnterNonGreedyLoop,
/// Backtrack a greedy loop whose body matches exactly one character, such
/// as /.*/.
GreedyWidth1Loop,
/// Backtrack a nongreedy loop whose body matches exactly one character,
/// such as /.*?/.
NongreedyWidth1Loop,
};
/// An instruction describing how to backtrack.
union BacktrackInsn {
/// The operation to perform.
BacktrackOp op;
/// List of instruction-specific fields. Note that the opcode is reproduced
/// in every struct; this avoids padding between the opcode and the
/// following field.
/// Fields used by setCaptureGroup instruction.
struct {
BacktrackOp op;
uint16_t mexp; /// Which capture group to set.
CapturedRange range; /// Value to set.
} setCaptureGroup;
/// Fields used by SetLoopData instruction.
struct {
BacktrackOp op;
uint16_t loopId; /// Which loop to set.
LoopData loopData; /// Value to set.
} setLoopData;
/// Fields used by SetPosition instruction.
struct {
BacktrackOp op;
uint32_t ip; /// Instruction pointer to set.
const CodeUnit *value; /// Input string position to set.
} setPosition;
/// Fields used by EnterNonGreedyLoop instruction.
struct {
BacktrackOp op;
uint32_t bodyIp; /// The IP of the loop body.
LoopData loopData; /// Data for the loop to set.
const BeginLoopInsn *loopInsn; /// The loop instruction.
} enterNonGreedyLoop;
/// Fields used by GreedyWidth1Loop and NongreedyWidth1Loop.
struct {
BacktrackOp op; /// The opcode.
uint32_t continuation; /// The ip for the not-taken branch of the loop.
const CodeUnit *min; /// The minimum possible match position.
const CodeUnit *max; /// The maximum possible match position.
} width1Loop;
/* implicit */ BacktrackInsn(BacktrackOp op) : op(op) {}
/// \return a SetCaptureGroup instruction.
static BacktrackInsn makeSetCaptureGroup(
uint16_t mexp,
CapturedRange range) {
BacktrackInsn result{BacktrackOp::SetCaptureGroup};
result.setCaptureGroup.mexp = mexp;
result.setCaptureGroup.range = range;
return result;
}
/// \return a SetLoopData instruction.
static BacktrackInsn makeSetLoopData(uint16_t loopId, LoopData loopData) {
BacktrackInsn result{BacktrackOp::SetLoopData};
result.setLoopData.loopId = loopId;
result.setLoopData.loopData = loopData;
return result;
}
/// \return a SetPosition instruction.
static BacktrackInsn makeSetPosition(
uint32_t ip,
const CodeUnit *inputPos) {
BacktrackInsn result = BacktrackOp::SetPosition;
result.setPosition.ip = ip;
result.setPosition.value = inputPos;
return result;
}
/// \return an EnterNonGreedyLoop instruction.
static BacktrackInsn makeEnterNonGreedyLoop(
const BeginLoopInsn *loopInsn,
uint32_t bodyIp,
LoopData loopData) {
BacktrackInsn result = BacktrackOp::EnterNonGreedyLoop;
result.enterNonGreedyLoop.bodyIp = bodyIp;
result.enterNonGreedyLoop.loopInsn = loopInsn;
result.enterNonGreedyLoop.loopData = loopData;
return result;
}
};
/// Our stack of backtrack instructions.
using BacktrackStack = llvh::SmallVector<BacktrackInsn, 64>;
/// The maximum depth of our backtracking stack. Beyond this we return a stack
/// overflow error.
static constexpr size_t kMaxBacktrackDepth = 1u << 24;
/// The stream of bytecode instructions, including the header.
llvh::ArrayRef<uint8_t> bytecodeStream_;
/// The flags associated with the match attempt.
constants::MatchFlagType flags_;
/// Syntax flags associated with the regex.
SyntaxFlags syntaxFlags_;
/// The first character in the input string.
const CodeUnit *first_;
/// The end of the input string (one-past the last).
const CodeUnit *last_;
/// Count of submatches.
uint32_t markedCount_;
/// Count of loops.
uint32_t loopCount_;
/// Traits used for canonicalization.
Traits traits_;
/// The remaining number of times we will attempt to backtrack.
/// This is effectively a timeout on the regexp execution.
uint32_t backtracksRemaining_ = kBacktrackLimit;
/// Whether an error occurred during the regex matching.
MatchRuntimeErrorType error_ = MatchRuntimeErrorType::None;
Context(
llvh::ArrayRef<uint8_t> bytecodeStream,
constants::MatchFlagType flags,
SyntaxFlags syntaxFlags,
const CodeUnit *first,
const CodeUnit *last,
uint32_t markedCount,
uint32_t loopCount)
: bytecodeStream_(bytecodeStream),
flags_(flags),
syntaxFlags_(syntaxFlags),
first_(first),
last_(last),
markedCount_(markedCount),
loopCount_(loopCount) {}
/// Run the given State \p state, by starting at its cursor and acting on its
/// ip_ until the match succeeds or fails. If \p onlyAtStart is set, only
/// test the match at \pos; otherwise test all successive input positions from
/// pos_ through last_.
/// \return a pointer to the start of the match if the match succeeds, nullptr
/// if it fails. If the match succeeds, populates \p state with the state of
/// the successful match; on failure the state's contents are undefined.
/// Note the end of the match can be recovered as
/// state->cursor_.currentPointer().
const CodeUnit *match(State<Traits> *state, bool onlyAtStart);
/// Backtrack the given state \p s with the backtrack stack \p bts.
/// \return true if we backatracked, false if we exhausted the stack.
bool backtrack(BacktrackStack &bts, State<Traits> *s);
/// Set the state's position to the body of a non-greedy loop.
bool performEnterNonGreedyLoop(
State<Traits> *s,
const BeginLoopInsn *loop,
uint32_t bodyIp,
LoopData loopData,
BacktrackStack &backtrackStack);
/// Add a backtrack instruction to the backtrack stack \p bts.
/// On overflow, set error_ to Overflow.
/// \return true on success, false if we overflow.
bool pushBacktrack(BacktrackStack &bts, BacktrackInsn insn) {
bts.push_back(insn);
if (LLVM_UNLIKELY(bts.size() > kMaxBacktrackDepth) ||
LLVM_UNLIKELY(backtracksRemaining_ == 0)) {
error_ = MatchRuntimeErrorType::MaxStackDepth;
return false;
}
backtracksRemaining_--;
return true;
}
/// Run the given Width1Loop \p insn on the given state \p s with the
/// backtrack stack \p bts.
/// \return true on success, false if we should backtrack.
bool matchWidth1Loop(
const Width1LoopInsn *insn,
State<Traits> *s,
BacktrackStack &bts);
private:
/// Do initialization of the given state before it enters the loop body
/// described by the LoopInsn \p loop, including setting up any backtracking
/// state.
/// \return true if backtracking was prepared, false if it overflowed.
bool prepareToEnterLoopBody(
State<Traits> *state,
const BeginLoopInsn *loop,
BacktrackStack &bts);
/// Given a Width1Opcode \p w1opcode, return true if the given char \p c
/// matches the instruction \p insn (with that opcode).
template <Width1Opcode w1opcode>
inline bool matchWidth1(const Insn *insn, CodeUnit c) const;
/// \return true if all chars, stored in contiguous memory after \p insn,
/// match the chars in state \p s in the same order, case insensitive. Note
/// the count of chars is given in \p insn.
inline bool matchesNCharICase8(
const MatchNCharICase8Insn *insn,
State<Traits> &s);
/// Execute the given Width1 instruction \p loopBody on cursor \p c up to \p
/// max times. \return the number of matches made, not to exceed \p max.
/// Note we deliberately accept \p c by value.
template <Width1Opcode w1opcode>
inline uint32_t
matchWidth1LoopBody(const Insn *loopBody, Cursor<Traits> c, uint32_t max);
/// ES6 21.2.5.2.3 AdvanceStringIndex.
/// Return the index of the next character to check.
/// This is typically just the index + 1, except if Unicode is enabled we need
/// to skip surrogate pairs.
inline size_t advanceStringIndex(
const CodeUnit *start,
size_t index,
size_t lastIndex) const;
};
/// We store loop and captured range data contiguously in a single allocation at
/// the end of the State. Use this union to simplify the use of
/// llvh::TrailingObjects.
union LoopOrCapturedRange {
struct LoopData loopData;
struct CapturedRange capturedRange;
};
/// State represents a set of in-flight capture groups and loop datas, along
/// with the IP and input position.
template <typename Traits>
struct State {
using CharT = typename Traits::CodeUnit;
/// The cursor in the input string.
Cursor<Traits> cursor_;
/// The instruction pointer position in the bytecode stream.
uint32_t ip_ = 0;
/// List of captured ranges. This has size equal to the number of marked
/// subexpressions for the regex.
llvh::SmallVector<CapturedRange, 16> capturedRanges_;
/// List of loop datas. This has size equal to the number of loops for the
/// regex.
llvh::SmallVector<LoopData, 16> loopDatas_;
/// \return the loop data at index \p idx.
LoopData &getLoop(uint32_t idx) {
assert(idx < loopDatas_.size() && "Invalid loop index");
return loopDatas_[idx];
}
/// \return the captured range at index \p idx.
CapturedRange &getCapturedRange(uint32_t idx) {
// Captured ranges are allocated after loops, so add the loop count.
assert(idx < capturedRanges_.size() && "Invalid captured range index");
return capturedRanges_[idx];
}
/// Construct a state which with the given \p cursor, which can hold \p
/// markedCount submatches and \p loopCount loop datas.
State(Cursor<Traits> cursor, uint32_t markedCount, uint32_t loopCount)
: cursor_(cursor),
capturedRanges_(markedCount, {kNotMatched, kNotMatched}),
loopDatas_(loopCount, {0, 0}) {}
State(const State &) = default;
State &operator=(const State &) = default;
State(State &&) = default;
State &operator=(State &&) = default;
};
/// ES5.1 7.3
template <class CharT>
bool isLineTerminator(CharT c) {
return c == u'\u000A' || c == u'\u000D' || c == u'\u2028' || c == u'\u2029';
}
template <class Traits>
bool matchesLeftAnchor(Context<Traits> &ctx, State<Traits> &s) {
bool matchesAnchor = false;
const Cursor<Traits> &c = s.cursor_;
if (c.atLeft()) {
// Beginning of text.
matchesAnchor = true;
} else if (
(ctx.syntaxFlags_.multiline) && !c.atLeft() &&
isLineTerminator(c.currentPointer()[-1])) {
// Multiline and after line terminator.
matchesAnchor = true;
}
return matchesAnchor;
}
template <class Traits>
bool matchesRightAnchor(Context<Traits> &ctx, State<Traits> &s) {
bool matchesAnchor = false;
const Cursor<Traits> &c = s.cursor_;
if (c.atRight() && !(ctx.flags_ & constants::matchNotEndOfLine)) {
matchesAnchor = true;
} else if (
(ctx.syntaxFlags_.multiline) && (!c.atRight()) &&
isLineTerminator(c.currentPointer()[0])) {
matchesAnchor = true;
}
return matchesAnchor;
}
/// \return true if all chars, stored in contiguous memory after \p insn,
/// match the chars in state \p s in the same order. Note the count of chars
/// is given in \p insn.
template <class Traits>
bool matchesNChar8(const MatchNChar8Insn *insn, State<Traits> &s) {
Cursor<Traits> &c = s.cursor_;
auto insnCharPtr = reinterpret_cast<const char *>(insn + 1);
auto charCount = insn->charCount;
for (int idx = 0; idx < charCount; idx++) {
if (c.consume() != insnCharPtr[idx]) {
return false;
}
}
return true;
}
template <class Traits>
bool Context<Traits>::matchesNCharICase8(
const MatchNCharICase8Insn *insn,
State<Traits> &s) {
Cursor<Traits> &c = s.cursor_;
auto insnCharPtr = reinterpret_cast<const char *>(insn + 1);
auto charCount = insn->charCount;
bool unicode = syntaxFlags_.unicode;
for (int idx = 0; idx < charCount; idx++) {
auto c1 = c.consume();
char instC = insnCharPtr[idx];
if (c1 != instC &&
(char32_t)traits_.canonicalize(c1, unicode) != (char32_t)instC) {
return false;
}
}
return true;
}
/// \return true if the character \p ch matches a bracket instruction \p insn,
/// containing the bracket ranges \p ranges. Note the count of ranges is given
/// in \p insn.
template <class Traits>
bool bracketMatchesChar(
const Context<Traits> &ctx,
const BracketInsn *insn,
const BracketRange32 *ranges,
typename Traits::CodePoint ch) {
const auto &traits = ctx.traits_;
// Note that if the bracket is negated /[^abc]/, we want to return true if we
// do not match, false if we do. Implement this by xor with the negate flag.
// Check character classes.
// Note we don't have to canonicalize here, because canonicalization does not
// affect which character class a character is in (i.e. a character doesn't
// become a digit after uppercasing).
if (insn->positiveCharClasses || insn->negativeCharClasses) {
for (auto charClass : {CharacterClass::Digits,
CharacterClass::Spaces,
CharacterClass::Words}) {
if ((insn->positiveCharClasses & charClass) &&
traits.characterHasType(ch, charClass))
return true ^ insn->negate;
if ((insn->negativeCharClasses & charClass) &&
!traits.characterHasType(ch, charClass))
return true ^ insn->negate;
}
}
bool contained =
traits.rangesContain(llvh::makeArrayRef(ranges, insn->rangeCount), ch);
return contained ^ insn->negate;
}
/// Do initialization of the given state before it enters the loop body
/// described by the LoopInsn \p loop, including setting up any backtracking
/// state.
/// \return true if backtracking was prepared, false if it overflowed.
template <class Traits>
bool Context<Traits>::prepareToEnterLoopBody(
State<Traits> *s,
const BeginLoopInsn *loop,
BacktrackStack &bts) {
LoopData &loopData = s->getLoop(loop->loopId);
if (!pushBacktrack(
bts, BacktrackInsn::makeSetLoopData(loop->loopId, loopData))) {
return false;
}
loopData.iterations++;
loopData.entryPosition = s->cursor_.offsetFromLeft();
// Backtrack and reset contained capture groups.
for (uint32_t mexp = loop->mexpBegin; mexp != loop->mexpEnd; mexp++) {
auto &captureRange = s->getCapturedRange(mexp);
if (!pushBacktrack(
bts, BacktrackInsn::makeSetCaptureGroup(mexp, captureRange))) {
return false;
}
captureRange = {kNotMatched, kNotMatched};
}
return true;
}
template <class Traits>
bool Context<Traits>::performEnterNonGreedyLoop(
State<Traits> *s,
const BeginLoopInsn *loop,
uint32_t bodyIp,
LoopData loopData,
BacktrackStack &backtrackStack) {
assert(loop->opcode == Opcode::BeginLoop && "Not a BeginLoopInsn");
s->getLoop(loop->loopId) = loopData;
// Set the IP and input position, and initialize the state for entering the
// loop.
s->ip_ = bodyIp;
s->cursor_.setCurrentPointer(first_ + loopData.entryPosition);
prepareToEnterLoopBody(s, loop, backtrackStack);
return true;
}
template <class Traits>
bool Context<Traits>::backtrack(BacktrackStack &bts, State<Traits> *s) {
while (!bts.empty()) {
BacktrackInsn &binsn = bts.back();
switch (binsn.op) {
case BacktrackOp::SetCaptureGroup:
s->getCapturedRange(binsn.setCaptureGroup.mexp) =
binsn.setCaptureGroup.range;
bts.pop_back();
break;
case BacktrackOp::SetLoopData:
s->getLoop(binsn.setLoopData.loopId) = binsn.setLoopData.loopData;
bts.pop_back();
break;
case BacktrackOp::SetPosition:
s->cursor_.setCurrentPointer(binsn.setPosition.value);
s->ip_ = binsn.setPosition.ip;
bts.pop_back();
return true;
case BacktrackOp::EnterNonGreedyLoop: {
auto fields = binsn.enterNonGreedyLoop;
bts.pop_back();
performEnterNonGreedyLoop(
s, fields.loopInsn, fields.bodyIp, fields.loopData, bts);
return true;
}
case BacktrackOp::GreedyWidth1Loop:
case BacktrackOp::NongreedyWidth1Loop: {
// In both of these instructions, we have a range [min, max] containing
// possible match locations, and the match failed at the max location
// (if we are greedy) or the min location (nongreedy). Backtrack by
// decrementing the max (incrementing the min) if we are greedy
// (nongreedy), setting the IP to that location, and jumping to the loop
// exit. Note that if we are tracking backwards (lookbehind assertion)
// our maximum is before our minimum, so we have to reverse the
// direction of increment/decrement.
bool forwards = s->cursor_.forwards();
assert(
(forwards ? binsn.width1Loop.min <= binsn.width1Loop.max
: binsn.width1Loop.min >= binsn.width1Loop.max) &&
"Loop min should be <= max (or >= max if backwards)");
if (binsn.width1Loop.min == binsn.width1Loop.max) {
// We have backtracked as far as possible. Give up.
bts.pop_back();
break;
}
if (binsn.op == BacktrackOp::GreedyWidth1Loop) {
binsn.width1Loop.max += forwards ? -1 : 1;
s->cursor_.setCurrentPointer(binsn.width1Loop.max);
} else {
binsn.width1Loop.min += forwards ? 1 : -1;
s->cursor_.setCurrentPointer(binsn.width1Loop.min);
}
s->ip_ = binsn.width1Loop.continuation;
return true;
}
}
}
// Exhausted the backtracking stack.
return false;
}
template <class Traits>
template <Width1Opcode w1opcode>
bool Context<Traits>::matchWidth1(const Insn *base, CodeUnit c) const {
// Note this switch should resolve at compile time.
assert(
base->opcode == static_cast<Opcode>(w1opcode) &&
"Instruction has wrong opcode");
switch (w1opcode) {
case Width1Opcode::MatchChar8: {
const auto *insn = llvh::cast<MatchChar8Insn>(base);
return c == insn->c;
}
case Width1Opcode::MatchChar16: {
const auto *insn = llvh::cast<MatchChar16Insn>(base);
return c == insn->c;
}
case Width1Opcode::MatchCharICase8: {
const auto *insn = llvh::cast<MatchCharICase8Insn>(base);
return c == (CodePoint)insn->c ||
(CodePoint)traits_.canonicalize(c, syntaxFlags_.unicode) ==
(CodePoint)insn->c;
}
case Width1Opcode::MatchCharICase16: {
const auto *insn = llvh::cast<MatchCharICase16Insn>(base);
return c == insn->c ||
(char32_t)traits_.canonicalize(c, syntaxFlags_.unicode) ==
(char32_t)insn->c;
}
case Width1Opcode::MatchAny:
return true;
case Width1Opcode::MatchAnyButNewline:
return !isLineTerminator(c);
case Width1Opcode::Bracket: {
// BracketInsn is followed by a list of BracketRange32s.
assert(
!(syntaxFlags_.unicode) &&
"Unicode should not be set for Width 1 brackets");
const BracketInsn *insn = llvh::cast<BracketInsn>(base);
const BracketRange32 *ranges =
reinterpret_cast<const BracketRange32 *>(insn + 1);
return bracketMatchesChar<Traits>(*this, insn, ranges, c);
}
}
llvm_unreachable("Invalid width 1 opcode");
}
template <class Traits>
template <Width1Opcode w1opcode>
uint32_t Context<Traits>::matchWidth1LoopBody(
const Insn *insn,
Cursor<Traits> c,
uint32_t max) {
uint32_t iters = 0;
for (; iters < max; iters++) {
if (!matchWidth1<w1opcode>(insn, c.consume()))
break;
}
return iters;
}
template <class Traits>
bool Context<Traits>::matchWidth1Loop(
const Width1LoopInsn *insn,
State<Traits> *s,
BacktrackStack &bts) {
// Note we copy the cursor here.
Cursor<Traits> c = s->cursor_;
uint32_t matched = 0, minMatch = insn->min, maxMatch = insn->max;
// Limit our max to the smaller of the maximum in the loop and number of
// number of characters remaining. This allows us to avoid having to test for
// end of input in the loop body.
maxMatch = std::min(c.remaining(), maxMatch);
// The loop body follows the loop instruction.
const Insn *body = static_cast<const Insn *>(&insn[1]);
// Match as far as we can up to maxMatch. Note we do this even if the loop is
// non-greedy: we compute how far we might conceivably have to backtrack
// (except in non-greedy loops we're "backtracking" by moving forwards).
using W1 = Width1Opcode;
switch (static_cast<Width1Opcode>(body->opcode)) {
case W1::MatchChar8:
matched = matchWidth1LoopBody<W1::MatchChar8>(body, c, maxMatch);
break;
case W1::MatchChar16:
matched = matchWidth1LoopBody<W1::MatchChar16>(body, c, maxMatch);
break;
case W1::MatchCharICase8:
matched = matchWidth1LoopBody<W1::MatchCharICase8>(body, c, maxMatch);
break;
case W1::MatchCharICase16:
matched = matchWidth1LoopBody<W1::MatchCharICase16>(body, c, maxMatch);
break;
case W1::MatchAny:
matched = matchWidth1LoopBody<W1::MatchAny>(body, c, maxMatch);
break;
case W1::MatchAnyButNewline:
matched = matchWidth1LoopBody<W1::MatchAnyButNewline>(body, c, maxMatch);
break;
case W1::Bracket:
matched = matchWidth1LoopBody<W1::Bracket>(body, c, maxMatch);
break;
}
// If we iterated less than the minimum, we failed to match.
if (matched < minMatch) {
return false;
}
assert(
minMatch <= matched && matched <= maxMatch &&
"matched should be between min and max match count");
// Now we know the valid match range.
// Compute the beginning and end pointers in this range.
bool forwards = s->cursor_.forwards();
const CodeUnit *pos = s->cursor_.currentPointer();
const CodeUnit *minPos = forwards ? pos + minMatch : pos - minMatch;
const CodeUnit *maxPos = forwards ? pos + matched : pos - matched;
// If min == max (e.g. /a{3}/) then no backtracking is possible. If min < max,
// backtracking is possible and we need to add a backtracking instruction.
if (minMatch < matched) {
BacktrackInsn backtrack{insn->greedy ? BacktrackOp::GreedyWidth1Loop
: BacktrackOp::NongreedyWidth1Loop};
backtrack.width1Loop.continuation = insn->notTakenTarget;
backtrack.width1Loop.min = minPos;
backtrack.width1Loop.max = maxPos;
if (!pushBacktrack(bts, backtrack)) {
return false;
}
}
// Set the state's current position to either the minimum or maximum location,
// and point it to the exit of the loop.
s->cursor_.setCurrentPointer(insn->greedy ? maxPos : minPos);
s->ip_ = insn->notTakenTarget;
return true;
}
/// ES6 21.2.5.2.3. Effectively this skips surrogate pairs if the regexp has the
/// Unicode flag set.
template <class Traits>
inline size_t Context<Traits>::advanceStringIndex(
const CodeUnit *start,
size_t index,
size_t length) const {
if (sizeof(CodeUnit) == 1) {
// The input string is ASCII and therefore cannot have surrogate pairs.
return index + 1;
}
// "If unicode is false, return index+1."
// "If index+1 >= length, return index+1."
if (LLVM_LIKELY(!(syntaxFlags_.unicode)) || (index + 1 >= length))
return index + 1;
// Let first be the code unit value at index index in S
// If first < 0xD800 or first > 0xDBFF, return index+1
// Let second be the code unit value at index index+1 in S.
// If second < 0xDC00 or second > 0xDFFF, return index+1.
CodeUnit first = start[index];
CodeUnit second = start[index + 1];
if (LLVM_LIKELY(!isHighSurrogate(first)) ||
LLVM_LIKELY(!isLowSurrogate(second))) {
return index + 1;
}
// Return index+2.
return index + 2;
}
template <class Traits>
auto Context<Traits>::match(State<Traits> *s, bool onlyAtStart)
-> const CodeUnit * {
using State = State<Traits>;
BacktrackStack backtrackStack;
// We'll refer to the cursor often.
Cursor<Traits> &c = s->cursor_;
// Pull out the instruction portion of the bytecode, following the header.
const uint8_t *const bytecode = &bytecodeStream_[sizeof(RegexBytecodeHeader)];
// Save the incoming IP in case we have to loop.
const auto startIp = s->ip_;
const CodeUnit *const startLoc = c.currentPointer();
// Use offsetFromRight() instead of remaining() here so that the length passed
// to advanceStringIndex is accurate even when the cursor is going backwards.
const size_t charsToRight = c.offsetFromRight();
// Decide how many locations we'll need to check.
// Note that we do want to check the empty range at the end, so add one to
// charsToRight.
const size_t locsToCheckCount = onlyAtStart ? 1 : 1 + charsToRight;
// If we are tracking backwards, we should only ever have one potential match
// location. This is because advanceStringIndex only ever tracks forwards.
assert(
(c.forwards() || locsToCheckCount == 1) &&
"Can only check one location when cursor is backwards");
// Macro used when a state fails to match.
#define BACKTRACK() \
do { \
if (backtrack(backtrackStack, s)) \
goto backtrackingSucceeded; \
goto backtrackingExhausted; \
} while (0)
for (size_t locIndex = 0; locIndex < locsToCheckCount;
locIndex = advanceStringIndex(startLoc, locIndex, charsToRight)) {
const CodeUnit *potentialMatchLocation = startLoc + locIndex;
c.setCurrentPointer(potentialMatchLocation);
s->ip_ = startIp;
backtrackingSucceeded:
for (;;) {
const Insn *base = reinterpret_cast<const Insn *>(&bytecode[s->ip_]);
switch (base->opcode) {
case Opcode::Goal:
return potentialMatchLocation;
case Opcode::LeftAnchor:
if (!matchesLeftAnchor(*this, *s))
BACKTRACK();
s->ip_ += sizeof(LeftAnchorInsn);
break;
case Opcode::RightAnchor:
if (!matchesRightAnchor(*this, *s))
BACKTRACK();
s->ip_ += sizeof(RightAnchorInsn);
break;
case Opcode::MatchAny:
if (c.atEnd() ||
!matchWidth1<Width1Opcode::MatchAny>(base, c.consume()))
BACKTRACK();
s->ip_ += sizeof(MatchAnyInsn);
break;
case Opcode::U16MatchAny:
if (c.atEnd())
BACKTRACK();
c.consumeUTF16();
s->ip_ += sizeof(U16MatchAnyInsn);
break;
case Opcode::MatchAnyButNewline:
if (c.atEnd() ||
!matchWidth1<Width1Opcode::MatchAnyButNewline>(base, c.consume()))
BACKTRACK();
s->ip_ += sizeof(MatchAnyButNewlineInsn);