delimiterProcessors, IncludeSourceSpans includeSourceSpans) {
this.blockParserFactories = blockParserFactories;
this.inlineParserFactory = inlineParserFactory;
this.delimiterProcessors = delimiterProcessors;
+ this.includeSourceSpans = includeSourceSpans;
this.documentBlockParser = new DocumentBlockParser();
activateBlockParser(new OpenBlockParser(documentBlockParser, 0));
@@ -430,13 +433,15 @@ private void addLine() {
}
private void addSourceSpans() {
- // Don't add source spans for Document itself (it would get the whole source text)
- for (int i = 1; i < openBlockParsers.size(); i++) {
- OpenBlockParser openBlockParser = openBlockParsers.get(i);
- int blockIndex = openBlockParser.sourceIndex;
- int length = line.length() - blockIndex;
- if (length != 0) {
- openBlockParser.blockParser.addSourceSpan(SourceSpan.of(lineIndex, blockIndex, length));
+ if (includeSourceSpans != IncludeSourceSpans.NONE) {
+ // Don't add source spans for Document itself (it would get the whole source text)
+ for (int i = 1; i < openBlockParsers.size(); i++) {
+ OpenBlockParser openBlockParser = openBlockParsers.get(i);
+ int blockIndex = openBlockParser.sourceIndex;
+ int length = line.length() - blockIndex;
+ if (length != 0) {
+ openBlockParser.blockParser.addSourceSpan(SourceSpan.of(lineIndex, blockIndex, length));
+ }
}
}
}
diff --git a/commonmark/src/main/java/org/commonmark/node/Block.java b/commonmark/src/main/java/org/commonmark/node/Block.java
index e6a317d7c..753447c5c 100644
--- a/commonmark/src/main/java/org/commonmark/node/Block.java
+++ b/commonmark/src/main/java/org/commonmark/node/Block.java
@@ -1,5 +1,8 @@
package org.commonmark.node;
+/**
+ * Block nodes such as paragraphs, list blocks, code blocks etc.
+ */
public abstract class Block extends Node {
public Block getParent() {
diff --git a/commonmark/src/main/java/org/commonmark/node/Node.java b/commonmark/src/main/java/org/commonmark/node/Node.java
index 6235f84ee..f8a8cace7 100644
--- a/commonmark/src/main/java/org/commonmark/node/Node.java
+++ b/commonmark/src/main/java/org/commonmark/node/Node.java
@@ -4,6 +4,11 @@
import java.util.Collections;
import java.util.List;
+/**
+ * The base class of all CommonMark AST nodes ({@link Block} and inlines).
+ *
+ * A node can have multiple children, and a parent (except for the root node).
+ */
public abstract class Node {
private Node parent = null;
@@ -11,7 +16,7 @@ public abstract class Node {
private Node lastChild = null;
private Node prev = null;
private Node next = null;
- private List sourceSpans = new ArrayList<>();
+ private List sourceSpans = null;
public abstract void accept(Visitor visitor);
@@ -35,18 +40,6 @@ public Node getParent() {
return parent;
}
- public List getSourceSpans() {
- return Collections.unmodifiableList(sourceSpans);
- }
-
- public void setSourceSpans(List sourceSpans) {
- this.sourceSpans = new ArrayList<>(sourceSpans);
- }
-
- public void addSourceSpan(SourceSpan sourceSpan) {
- this.sourceSpans.add(sourceSpan);
- }
-
protected void setParent(Node parent) {
this.parent = parent;
}
@@ -121,6 +114,35 @@ public void insertBefore(Node sibling) {
}
}
+
+ /**
+ * @return the source spans of this node if included by the parser, an empty list otherwise
+ */
+ public List getSourceSpans() {
+ return sourceSpans != null ? Collections.unmodifiableList(sourceSpans) : Collections.emptyList();
+ }
+
+ /**
+ * Replace the current source spans with the provided list.
+ *
+ * @param sourceSpans the new source spans to set
+ */
+ public void setSourceSpans(List sourceSpans) {
+ this.sourceSpans = new ArrayList<>(sourceSpans);
+ }
+
+ /**
+ * Add a source span to the end of the list.
+ *
+ * @param sourceSpan the source span to add
+ */
+ public void addSourceSpan(SourceSpan sourceSpan) {
+ if (sourceSpans == null) {
+ this.sourceSpans = new ArrayList<>();
+ }
+ this.sourceSpans.add(sourceSpan);
+ }
+
@Override
public String toString() {
return getClass().getSimpleName() + "{" + toStringAttributes() + "}";
diff --git a/commonmark/src/main/java/org/commonmark/node/Paragraph.java b/commonmark/src/main/java/org/commonmark/node/Paragraph.java
index 0c3f88f39..176eaaa76 100644
--- a/commonmark/src/main/java/org/commonmark/node/Paragraph.java
+++ b/commonmark/src/main/java/org/commonmark/node/Paragraph.java
@@ -1,5 +1,8 @@
package org.commonmark.node;
+/**
+ * A paragraph block; contains inline nodes such as {@link Text}.
+ */
public class Paragraph extends Block {
@Override
diff --git a/commonmark/src/main/java/org/commonmark/node/SourceSpan.java b/commonmark/src/main/java/org/commonmark/node/SourceSpan.java
index 3788f86f0..a643bd4dc 100644
--- a/commonmark/src/main/java/org/commonmark/node/SourceSpan.java
+++ b/commonmark/src/main/java/org/commonmark/node/SourceSpan.java
@@ -2,6 +2,25 @@
import java.util.Objects;
+/**
+ * A source span references a snippet of text from the source input.
+ *
+ * It has a starting position (line and column index) and a length of how many characters it spans.
+ *
+ * For example, this CommonMark source text:
+ *
+ * > foo
+ *
+ * The {@link BlockQuote} node would have this source span: line 0, column 0, length 5.
+ *
+ * The {@link Paragraph} node inside it would have: line 0, column 2, length 3.
+ *
+ * If a block has multiple lines, it will have a source span for each line.
+ *
+ * Note that the column index and length are measured in Java characters (UTF-16 code units). If you're outputting them
+ * to be consumed by another programming language, e.g. one that uses UTF-8 strings, you will need to translate them,
+ * otherwise characters such as emojis will result in incorrect positions.
+ */
public class SourceSpan {
private final int lineIndex;
@@ -33,7 +52,7 @@ public int getColumnIndex() {
}
/**
- * @return length of the span
+ * @return length of the span in characters
*/
public int getLength() {
return length;
diff --git a/commonmark/src/main/java/org/commonmark/parser/IncludeSourceSpans.java b/commonmark/src/main/java/org/commonmark/parser/IncludeSourceSpans.java
new file mode 100644
index 000000000..d6fc459eb
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/parser/IncludeSourceSpans.java
@@ -0,0 +1,16 @@
+package org.commonmark.parser;
+
+/**
+ * Whether to include {@link org.commonmark.node.SourceSpan} or not while parsing,
+ * see {@link Parser.Builder#includeSourceSpans(IncludeSourceSpans)}.
+ */
+public enum IncludeSourceSpans {
+ /**
+ * Do not include source spans.
+ */
+ NONE,
+ /**
+ * Include source spans on {@link org.commonmark.node.Block} nodes.
+ */
+ BLOCKS,
+}
diff --git a/commonmark/src/main/java/org/commonmark/parser/Parser.java b/commonmark/src/main/java/org/commonmark/parser/Parser.java
index 5e15158ad..a4d5c8531 100644
--- a/commonmark/src/main/java/org/commonmark/parser/Parser.java
+++ b/commonmark/src/main/java/org/commonmark/parser/Parser.java
@@ -4,7 +4,16 @@
import org.commonmark.internal.DocumentParser;
import org.commonmark.internal.InlineParserContextImpl;
import org.commonmark.internal.InlineParserImpl;
-import org.commonmark.node.*;
+import org.commonmark.node.Block;
+import org.commonmark.node.BlockQuote;
+import org.commonmark.node.FencedCodeBlock;
+import org.commonmark.node.Heading;
+import org.commonmark.node.HtmlBlock;
+import org.commonmark.node.IndentedCodeBlock;
+import org.commonmark.node.LinkReferenceDefinition;
+import org.commonmark.node.ListBlock;
+import org.commonmark.node.Node;
+import org.commonmark.node.ThematicBreak;
import org.commonmark.parser.block.BlockParserFactory;
import org.commonmark.parser.delimiter.DelimiterProcessor;
@@ -31,12 +40,14 @@ public class Parser {
private final List delimiterProcessors;
private final InlineParserFactory inlineParserFactory;
private final List postProcessors;
+ private final IncludeSourceSpans includeSourceSpans;
private Parser(Builder builder) {
this.blockParserFactories = DocumentParser.calculateBlockParserFactories(builder.blockParserFactories, builder.enabledBlockTypes);
this.inlineParserFactory = builder.getInlineParserFactory();
this.postProcessors = builder.postProcessors;
this.delimiterProcessors = builder.delimiterProcessors;
+ this.includeSourceSpans = builder.includeSourceSpans;
// Try to construct an inline parser. Invalid configuration might result in an exception, which we want to
// detect as soon as possible.
@@ -99,7 +110,7 @@ public Node parseReader(Reader input) throws IOException {
}
private DocumentParser createDocumentParser() {
- return new DocumentParser(blockParserFactories, inlineParserFactory, delimiterProcessors);
+ return new DocumentParser(blockParserFactories, inlineParserFactory, delimiterProcessors, includeSourceSpans);
}
private Node postProcess(Node document) {
@@ -118,6 +129,7 @@ public static class Builder {
private final List postProcessors = new ArrayList<>();
private Set> enabledBlockTypes = DocumentParser.getDefaultBlockParserTypes();
private InlineParserFactory inlineParserFactory;
+ private IncludeSourceSpans includeSourceSpans = IncludeSourceSpans.NONE;
/**
* @return the configured {@link Parser}
@@ -167,7 +179,7 @@ public Builder extensions(Iterable extends Extension> extensions) {
*
*
* @param enabledBlockTypes A list of block nodes the parser will parse.
- * If this list is empty, the parser will not recognize any CommonMark core features.
+ * If this list is empty, the parser will not recognize any CommonMark core features.
* @return {@code this}
*/
public Builder enabledBlockTypes(Set> enabledBlockTypes) {
@@ -178,6 +190,19 @@ public Builder enabledBlockTypes(Set> enabledBlockTypes)
return this;
}
+ /**
+ * Whether to calculate {@link org.commonmark.node.SourceSpan} for {@link Node}.
+ *
+ * By default, source spans are disabled.
+ *
+ * @param includeSourceSpans which kind of source spans should be included
+ * @return {@code this}
+ */
+ public Builder includeSourceSpans(IncludeSourceSpans includeSourceSpans) {
+ this.includeSourceSpans = includeSourceSpans;
+ return this;
+ }
+
/**
* Adds a custom block parser factory.
*
diff --git a/commonmark/src/test/java/org/commonmark/test/SourceSpansTest.java b/commonmark/src/test/java/org/commonmark/test/SourceSpansTest.java
index 3e2de2be2..f6796c925 100644
--- a/commonmark/src/test/java/org/commonmark/test/SourceSpansTest.java
+++ b/commonmark/src/test/java/org/commonmark/test/SourceSpansTest.java
@@ -13,6 +13,7 @@
import org.commonmark.node.SourceSpan;
import org.commonmark.node.ThematicBreak;
import org.commonmark.parser.Parser;
+import org.commonmark.parser.IncludeSourceSpans;
import org.junit.Test;
import java.util.Arrays;
@@ -22,7 +23,7 @@
public class SourceSpansTest {
- private static final Parser PARSER = Parser.builder().build();
+ private static final Parser PARSER = Parser.builder().includeSourceSpans(IncludeSourceSpans.BLOCKS).build();
@Test
public void paragraph() {
From db6aa0b802fb2101105a0565dcd0af0b7f29c286 Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Mon, 7 Sep 2020 21:22:52 +1000
Subject: [PATCH 063/450] Remove unnecessary dependency
---
commonmark-test-util/pom.xml | 7 ++-----
.../test/java/org/commonmark/test/SourceSpansTest.java | 10 +++++-----
2 files changed, 7 insertions(+), 10 deletions(-)
diff --git a/commonmark-test-util/pom.xml b/commonmark-test-util/pom.xml
index 92046f159..cad82f51d 100644
--- a/commonmark-test-util/pom.xml
+++ b/commonmark-test-util/pom.xml
@@ -1,5 +1,6 @@
-
+
4.0.0
com.atlassian.commonmark
@@ -16,10 +17,6 @@
junit
junit
-
- org.hamcrest
- hamcrest-library
-
diff --git a/commonmark/src/test/java/org/commonmark/test/SourceSpansTest.java b/commonmark/src/test/java/org/commonmark/test/SourceSpansTest.java
index f6796c925..9fe1fac03 100644
--- a/commonmark/src/test/java/org/commonmark/test/SourceSpansTest.java
+++ b/commonmark/src/test/java/org/commonmark/test/SourceSpansTest.java
@@ -12,14 +12,14 @@
import org.commonmark.node.Paragraph;
import org.commonmark.node.SourceSpan;
import org.commonmark.node.ThematicBreak;
-import org.commonmark.parser.Parser;
import org.commonmark.parser.IncludeSourceSpans;
+import org.commonmark.parser.Parser;
import org.junit.Test;
import java.util.Arrays;
-import static org.hamcrest.Matchers.contains;
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
public class SourceSpansTest {
@@ -99,7 +99,7 @@ public void fencedCodeBlock() {
Node document = PARSER.parse("```\nfoo\n```\nbar\n");
Paragraph paragraph = (Paragraph) document.getLastChild();
- assertThat(paragraph.getSourceSpans(), contains(SourceSpan.of(3, 0, 3)));
+ assertEquals(Arrays.asList(SourceSpan.of(3, 0, 3)), paragraph.getSourceSpans());
}
@Test
@@ -142,7 +142,7 @@ public void listBlock() {
Node document = PARSER.parse("* foo\n * bar\n");
ListBlock listBlock = (ListBlock) document.getFirstChild().getFirstChild().getLastChild();
- assertThat(listBlock.getSourceSpans(), contains(SourceSpan.of(1, 2, 5)));
+ assertEquals(Arrays.asList(SourceSpan.of(1, 2, 5)), listBlock.getSourceSpans());
}
@Test
From ccafb953f8fe8c12aa023de70178ab6a8c8e3572 Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Tue, 8 Sep 2020 10:49:17 +1000
Subject: [PATCH 064/450] Remove unused dependencyManagement version
---
pom.xml | 5 -----
1 file changed, 5 deletions(-)
diff --git a/pom.xml b/pom.xml
index 2bc6e3e40..d81dc186a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -143,11 +143,6 @@
junit
4.12
-
- org.hamcrest
- hamcrest-library
- 1.3
-
org.openjdk.jmh
jmh-core
From 5790505b3544c44295b283815074900cfc0af1a9 Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Sat, 25 Jul 2020 21:23:16 +1000
Subject: [PATCH 065/450] Start extracting inline parsers
The end goal is to extract all current inline parsers, and then allow
customizing inline parsing via the same infrastructure (similar to the
current block parsing).
A stretch goal is to move away from the parser input having to be a
single contiguous String. This could speed up parsing and allow keeping
track of source positions (line and column indexes) for inline nodes.
---
.../commonmark/internal/InlineParserImpl.java | 94 +++++++------------
.../inline/BackslashInlineParser.java | 35 +++++++
.../internal/inline/InlineContentParser.java | 10 ++
.../internal/inline/InlineParserState.java | 6 ++
.../inline/LineBreakInlineContentParser.java | 36 +++++++
.../internal/inline/ParsedInline.java | 23 +++++
.../internal/inline/ParsedInlineImpl.java | 21 +++++
.../commonmark/internal/inline/Scanner.java | 31 ++++++
8 files changed, 195 insertions(+), 61 deletions(-)
create mode 100644 commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java
create mode 100644 commonmark/src/main/java/org/commonmark/internal/inline/InlineContentParser.java
create mode 100644 commonmark/src/main/java/org/commonmark/internal/inline/InlineParserState.java
create mode 100644 commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java
create mode 100644 commonmark/src/main/java/org/commonmark/internal/inline/ParsedInline.java
create mode 100644 commonmark/src/main/java/org/commonmark/internal/inline/ParsedInlineImpl.java
create mode 100644 commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
index dfb50149a..f42723d57 100644
--- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
+++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
@@ -1,7 +1,7 @@
package org.commonmark.internal;
-import org.commonmark.internal.inline.AsteriskDelimiterProcessor;
-import org.commonmark.internal.inline.UnderscoreDelimiterProcessor;
+import org.commonmark.internal.inline.Scanner;
+import org.commonmark.internal.inline.*;
import org.commonmark.internal.util.Escaping;
import org.commonmark.internal.util.Html5Entities;
import org.commonmark.internal.util.LinkScanner;
@@ -15,7 +15,7 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-public class InlineParserImpl implements InlineParser {
+public class InlineParserImpl implements InlineParser, InlineParserState {
private static final String HTMLCOMMENT = "|";
private static final String PROCESSINGINSTRUCTION = "[<][?].*?[?][>]";
@@ -30,8 +30,6 @@ public class InlineParserImpl implements InlineParser {
private static final Pattern HTML_TAG = Pattern.compile('^' + HTMLTAG, Pattern.CASE_INSENSITIVE);
- private static final Pattern ESCAPABLE = Pattern.compile('^' + Escaping.ESCAPABLE);
-
private static final Pattern ENTITY_HERE = Pattern.compile('^' + Escaping.ENTITY, Pattern.CASE_INSENSITIVE);
private static final Pattern TICKS = Pattern.compile("`+");
@@ -50,12 +48,11 @@ public class InlineParserImpl implements InlineParser {
private static final Pattern WHITESPACE = Pattern.compile("\\s+");
- private static final Pattern FINAL_SPACE = Pattern.compile(" *$");
-
private final BitSet specialCharacters;
private final BitSet delimiterCharacters;
private final Map delimiterProcessors;
private final InlineParserContext context;
+ private final Map> inlineParsers;
private String input;
private int index;
@@ -73,10 +70,14 @@ public class InlineParserImpl implements InlineParser {
public InlineParserImpl(InlineParserContext inlineParserContext) {
this.delimiterProcessors = calculateDelimiterProcessors(inlineParserContext.getCustomDelimiterProcessors());
- this.delimiterCharacters = calculateDelimiterCharacters(this.delimiterProcessors.keySet());
- this.specialCharacters = calculateSpecialCharacters(delimiterCharacters);
this.context = inlineParserContext;
+ this.inlineParsers = new HashMap<>();
+ this.inlineParsers.put('\n', Collections.singletonList(new LineBreakInlineContentParser()));
+ this.inlineParsers.put('\\', Collections.singletonList(new BackslashInlineParser()));
+
+ this.delimiterCharacters = calculateDelimiterCharacters(this.delimiterProcessors.keySet());
+ this.specialCharacters = calculateSpecialCharacters(delimiterCharacters, inlineParsers.keySet());
}
public static BitSet calculateDelimiterCharacters(Set characters) {
@@ -87,10 +88,12 @@ public static BitSet calculateDelimiterCharacters(Set characters) {
return bitSet;
}
- public static BitSet calculateSpecialCharacters(BitSet delimiterCharacters) {
+ public static BitSet calculateSpecialCharacters(BitSet delimiterCharacters, Set characters) {
BitSet bitSet = new BitSet();
bitSet.or(delimiterCharacters);
- bitSet.set('\n');
+ for (Character c : characters) {
+ bitSet.set(c);
+ }
bitSet.set('`');
bitSet.set('[');
bitSet.set(']');
@@ -108,6 +111,12 @@ public static Map calculateDelimiterProcessors(Li
return map;
}
+ // TODO: The implementation shouldn't be public
+ @Override
+ public Scanner scanner() {
+ return new Scanner(input, index);
+ }
+
private static void addDelimiterProcessors(Iterable delimiterProcessors, Map map) {
for (DelimiterProcessor delimiterProcessor : delimiterProcessors) {
char opening = delimiterProcessor.getOpeningCharacter();
@@ -190,14 +199,21 @@ private Node parseInline(Node previous) {
return null;
}
+ List inlineParsers = this.inlineParsers.get(c);
+ if (inlineParsers != null) {
+ for (InlineContentParser inlineParser : inlineParsers) {
+ // TODO: Should we pass the whole previous node or can we make the API surface smaller?
+ ParsedInline parsedInline = inlineParser.tryParse(this, previous);
+ if (parsedInline instanceof ParsedInlineImpl) {
+ ParsedInlineImpl parsedInlineImpl = (ParsedInlineImpl) parsedInline;
+ index += parsedInlineImpl.getConsumed();
+ return parsedInlineImpl.getNode();
+ }
+ }
+ }
+
Node node;
switch (c) {
- case '\n':
- node = parseNewline(previous);
- break;
- case '\\':
- node = parseBackslash();
- break;
case '`':
node = parseBackticks();
break;
@@ -280,50 +296,6 @@ private void spnl() {
match(SPNL);
}
- /**
- * Parse a newline. If it was preceded by two spaces, return a hard line break; otherwise a soft line break.
- */
- private Node parseNewline(Node previous) {
- index++; // assume we're at a \n
-
- // Check previous text for trailing spaces.
- // The "endsWith" is an optimization to avoid an RE match in the common case.
- if (previous instanceof Text && ((Text) previous).getLiteral().endsWith(" ")) {
- Text text = (Text) previous;
- String literal = text.getLiteral();
- Matcher matcher = FINAL_SPACE.matcher(literal);
- int spaces = matcher.find() ? matcher.end() - matcher.start() : 0;
- if (spaces > 0) {
- text.setLiteral(literal.substring(0, literal.length() - spaces));
- }
- if (spaces >= 2) {
- return new HardLineBreak();
- } else {
- return new SoftLineBreak();
- }
- } else {
- return new SoftLineBreak();
- }
- }
-
- /**
- * Parse a backslash-escaped special character, adding either the escaped character, a hard line break
- * (if the backslash is followed by a newline), or a literal backslash to the block's children.
- */
- private Node parseBackslash() {
- index++;
- Node node;
- if (peek() == '\n') {
- node = new HardLineBreak();
- index++;
- } else if (index < input.length() && ESCAPABLE.matcher(input.substring(index, index + 1)).matches()) {
- node = text(input, index, index + 1);
- index++;
- } else {
- node = text("\\");
- }
- return node;
- }
/**
* Attempt to parse backticks, returning either a backtick code span or a literal sequence of backticks.
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java
new file mode 100644
index 000000000..bcdc669c8
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java
@@ -0,0 +1,35 @@
+package org.commonmark.internal.inline;
+
+import org.commonmark.internal.util.Escaping;
+import org.commonmark.node.HardLineBreak;
+import org.commonmark.node.Node;
+import org.commonmark.node.Text;
+
+import java.util.regex.Pattern;
+
+/**
+ * Parse a backslash-escaped special character, returning either the escaped character as text, a hard line break
+ * (if the backslash is followed by a newline), or a literal backslash as text.
+ */
+public class BackslashInlineParser implements InlineContentParser {
+
+ private static final Pattern ESCAPABLE = Pattern.compile('^' + Escaping.ESCAPABLE);
+
+ @Override
+ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
+ Scanner scanner = inlineParserState.scanner();
+ // Backslash
+ scanner.skip();
+
+ char next = scanner.peek();
+ if (next == '\n') {
+ scanner.skip();
+ return ParsedInline.of(new HardLineBreak(), scanner.consumed());
+ } else if (ESCAPABLE.matcher(String.valueOf(next)).matches()) {
+ scanner.skip();
+ return ParsedInline.of(new Text(String.valueOf(next)), scanner.consumed());
+ } else {
+ return ParsedInline.of(new Text("\\"), scanner.consumed());
+ }
+ }
+}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/InlineContentParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/InlineContentParser.java
new file mode 100644
index 000000000..76259c444
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/InlineContentParser.java
@@ -0,0 +1,10 @@
+package org.commonmark.internal.inline;
+
+import org.commonmark.node.Node;
+
+// TODO: I'd prefer if this was named InlineParser, but that's already public API, hmm...
+public interface InlineContentParser {
+
+ ParsedInline tryParse(InlineParserState inlineParserState, Node previous);
+
+}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/InlineParserState.java b/commonmark/src/main/java/org/commonmark/internal/inline/InlineParserState.java
new file mode 100644
index 000000000..9a6ef7d19
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/InlineParserState.java
@@ -0,0 +1,6 @@
+package org.commonmark.internal.inline;
+
+public interface InlineParserState {
+
+ Scanner scanner();
+}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java
new file mode 100644
index 000000000..1ca6ce531
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java
@@ -0,0 +1,36 @@
+package org.commonmark.internal.inline;
+
+import org.commonmark.internal.util.Parsing;
+import org.commonmark.node.HardLineBreak;
+import org.commonmark.node.Node;
+import org.commonmark.node.SoftLineBreak;
+import org.commonmark.node.Text;
+
+/**
+ * Parse a newline. If it was preceded by two or more spaces, return a hard line break; otherwise a soft line break.
+ */
+public class LineBreakInlineContentParser implements InlineContentParser {
+
+ @Override
+ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
+ // Check previous text for trailing spaces.
+ // The "endsWith" is an optimization to avoid an RE match in the common case.
+ if (previous instanceof Text && ((Text) previous).getLiteral().endsWith(" ")) {
+ Text text = (Text) previous;
+ String literal = text.getLiteral();
+ int last = literal.length() - 1;
+ int nonSpace = Parsing.skipBackwards(' ', literal, last, 0);
+ int spaces = last - nonSpace;
+ if (spaces > 0) {
+ text.setLiteral(literal.substring(0, literal.length() - spaces));
+ }
+ if (spaces >= 2) {
+ return ParsedInline.of(new HardLineBreak(), 1);
+ } else {
+ return ParsedInline.of(new SoftLineBreak(), 1);
+ }
+ } else {
+ return ParsedInline.of(new SoftLineBreak(), 1);
+ }
+ }
+}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInline.java b/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInline.java
new file mode 100644
index 000000000..d52caf096
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInline.java
@@ -0,0 +1,23 @@
+package org.commonmark.internal.inline;
+
+import org.commonmark.node.Node;
+
+public abstract class ParsedInline {
+
+ protected ParsedInline() {
+ }
+
+ public static ParsedInline none() {
+ return null;
+ }
+
+ public static ParsedInline of(Node node, int consumed) {
+ if (node == null) {
+ throw new NullPointerException("node must not be null");
+ }
+ if (consumed <= 0) {
+ throw new IllegalArgumentException("consumed must be greater than 0");
+ }
+ return new ParsedInlineImpl(node, consumed);
+ }
+}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInlineImpl.java b/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInlineImpl.java
new file mode 100644
index 000000000..a22163d15
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInlineImpl.java
@@ -0,0 +1,21 @@
+package org.commonmark.internal.inline;
+
+import org.commonmark.node.Node;
+
+public class ParsedInlineImpl extends ParsedInline {
+ private final Node node;
+ private final int consumed;
+
+ public ParsedInlineImpl(Node node, int consumed) {
+ this.node = node;
+ this.consumed = consumed;
+ }
+
+ public Node getNode() {
+ return node;
+ }
+
+ public int getConsumed() {
+ return consumed;
+ }
+}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
new file mode 100644
index 000000000..16204a530
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
@@ -0,0 +1,31 @@
+package org.commonmark.internal.inline;
+
+public class Scanner {
+
+ private final String input;
+ private int index;
+ private int consumed = 0;
+
+ // TODO: Visibility
+ public Scanner(String input, int index) {
+ this.input = input;
+ this.index = index;
+ }
+
+ public char peek() {
+ if (index >= input.length()) {
+ return '\0';
+ } else {
+ return input.charAt(index);
+ }
+ }
+
+ public void skip() {
+ index++;
+ consumed++;
+ }
+
+ public int consumed() {
+ return consumed;
+ }
+}
From 15ed46759d6889a550ca2907757acbe6928a4ddc Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Sat, 25 Jul 2020 21:49:15 +1000
Subject: [PATCH 066/450] Backticks parser, still simple so far
---
.../commonmark/internal/InlineParserImpl.java | 47 +------------------
.../inline/BackslashInlineParser.java | 6 +--
.../inline/BackticksInlineParser.java | 47 +++++++++++++++++++
.../inline/LineBreakInlineContentParser.java | 9 ++--
.../internal/inline/ParsedInline.java | 8 ++--
.../internal/inline/ParsedInlineImpl.java | 10 ++--
.../commonmark/internal/inline/Position.java | 14 ++++++
.../commonmark/internal/inline/Scanner.java | 33 +++++++++++--
8 files changed, 110 insertions(+), 64 deletions(-)
create mode 100644 commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
create mode 100644 commonmark/src/main/java/org/commonmark/internal/inline/Position.java
diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
index f42723d57..2d4e732fc 100644
--- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
+++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
@@ -32,10 +32,6 @@ public class InlineParserImpl implements InlineParser, InlineParserState {
private static final Pattern ENTITY_HERE = Pattern.compile('^' + Escaping.ENTITY, Pattern.CASE_INSENSITIVE);
- private static final Pattern TICKS = Pattern.compile("`+");
-
- private static final Pattern TICKS_HERE = Pattern.compile("^`+");
-
private static final Pattern EMAIL_AUTOLINK = Pattern
.compile("^<([a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>");
@@ -75,6 +71,7 @@ public InlineParserImpl(InlineParserContext inlineParserContext) {
this.inlineParsers = new HashMap<>();
this.inlineParsers.put('\n', Collections.singletonList(new LineBreakInlineContentParser()));
this.inlineParsers.put('\\', Collections.singletonList(new BackslashInlineParser()));
+ this.inlineParsers.put('`', Collections.singletonList(new BackticksInlineParser()));
this.delimiterCharacters = calculateDelimiterCharacters(this.delimiterProcessors.keySet());
this.specialCharacters = calculateSpecialCharacters(delimiterCharacters, inlineParsers.keySet());
@@ -94,10 +91,8 @@ public static BitSet calculateSpecialCharacters(BitSet delimiterCharacters, Set<
for (Character c : characters) {
bitSet.set(c);
}
- bitSet.set('`');
bitSet.set('[');
bitSet.set(']');
- bitSet.set('\\');
bitSet.set('!');
bitSet.set('<');
bitSet.set('&');
@@ -206,7 +201,7 @@ private Node parseInline(Node previous) {
ParsedInline parsedInline = inlineParser.tryParse(this, previous);
if (parsedInline instanceof ParsedInlineImpl) {
ParsedInlineImpl parsedInlineImpl = (ParsedInlineImpl) parsedInline;
- index += parsedInlineImpl.getConsumed();
+ index = parsedInlineImpl.getPosition().getIndex();
return parsedInlineImpl.getNode();
}
}
@@ -214,9 +209,6 @@ private Node parseInline(Node previous) {
Node node;
switch (c) {
- case '`':
- node = parseBackticks();
- break;
case '[':
node = parseOpenBracket();
break;
@@ -296,41 +288,6 @@ private void spnl() {
match(SPNL);
}
-
- /**
- * Attempt to parse backticks, returning either a backtick code span or a literal sequence of backticks.
- */
- private Node parseBackticks() {
- String ticks = match(TICKS_HERE);
- if (ticks == null) {
- return null;
- }
- int afterOpenTicks = index;
- String matched;
- while ((matched = match(TICKS)) != null) {
- if (matched.equals(ticks)) {
- Code node = new Code();
- String content = input.substring(afterOpenTicks, index - ticks.length());
- content = content.replace('\n', ' ');
-
- // spec: If the resulting string both begins and ends with a space character, but does not consist
- // entirely of space characters, a single space character is removed from the front and back.
- if (content.length() >= 3 &&
- content.charAt(0) == ' ' &&
- content.charAt(content.length() - 1) == ' ' &&
- Parsing.hasNonSpace(content)) {
- content = content.substring(1, content.length() - 1);
- }
-
- node.setLiteral(content);
- return node;
- }
- }
- // If we got here, we didn't match a closing backtick sequence.
- index = afterOpenTicks;
- return text(ticks);
- }
-
/**
* Attempt to parse delimiters like emphasis, strong emphasis or custom delimiters.
*/
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java
index bcdc669c8..2a36c329a 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java
@@ -24,12 +24,12 @@ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous)
char next = scanner.peek();
if (next == '\n') {
scanner.skip();
- return ParsedInline.of(new HardLineBreak(), scanner.consumed());
+ return ParsedInline.of(new HardLineBreak(), scanner.position());
} else if (ESCAPABLE.matcher(String.valueOf(next)).matches()) {
scanner.skip();
- return ParsedInline.of(new Text(String.valueOf(next)), scanner.consumed());
+ return ParsedInline.of(new Text(String.valueOf(next)), scanner.position());
} else {
- return ParsedInline.of(new Text("\\"), scanner.consumed());
+ return ParsedInline.of(new Text("\\"), scanner.position());
}
}
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
new file mode 100644
index 000000000..a7b17c46f
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
@@ -0,0 +1,47 @@
+package org.commonmark.internal.inline;
+
+import org.commonmark.internal.util.Parsing;
+import org.commonmark.node.Code;
+import org.commonmark.node.Node;
+import org.commonmark.node.Text;
+
+/**
+ * Attempt to parse backticks, returning either a backtick code span or a literal sequence of backticks.
+ */
+public class BackticksInlineParser implements InlineContentParser {
+
+ @Override
+ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
+ Scanner scanner = inlineParserState.scanner();
+ Position start = scanner.position();
+ int openingTicks = scanner.skip('`');
+ Position afterOpening = scanner.position();
+
+ while (scanner.find('`')) {
+ Position beforeClosing = scanner.position();
+ int count = scanner.skip('`');
+ if (count == openingTicks) {
+ Code node = new Code();
+
+ String content = scanner.textBetween(afterOpening, beforeClosing);
+ content = content.replace('\n', ' ');
+
+ // spec: If the resulting string both begins and ends with a space character, but does not consist
+ // entirely of space characters, a single space character is removed from the front and back.
+ if (content.length() >= 3 &&
+ content.charAt(0) == ' ' &&
+ content.charAt(content.length() - 1) == ' ' &&
+ Parsing.hasNonSpace(content)) {
+ content = content.substring(1, content.length() - 1);
+ }
+
+ node.setLiteral(content);
+ return ParsedInline.of(node, scanner.position());
+ }
+ }
+
+ // If we got here, we didn't find a matching closing backtick sequence.
+ String ticks = scanner.textBetween(start, afterOpening);
+ return ParsedInline.of(new Text(ticks), afterOpening);
+ }
+}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java
index 1ca6ce531..2ef8f2328 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java
@@ -13,6 +13,9 @@ public class LineBreakInlineContentParser implements InlineContentParser {
@Override
public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
+ Scanner scanner = inlineParserState.scanner();
+ scanner.skip();
+
// Check previous text for trailing spaces.
// The "endsWith" is an optimization to avoid an RE match in the common case.
if (previous instanceof Text && ((Text) previous).getLiteral().endsWith(" ")) {
@@ -25,12 +28,12 @@ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous)
text.setLiteral(literal.substring(0, literal.length() - spaces));
}
if (spaces >= 2) {
- return ParsedInline.of(new HardLineBreak(), 1);
+ return ParsedInline.of(new HardLineBreak(), scanner.position());
} else {
- return ParsedInline.of(new SoftLineBreak(), 1);
+ return ParsedInline.of(new SoftLineBreak(), scanner.position());
}
} else {
- return ParsedInline.of(new SoftLineBreak(), 1);
+ return ParsedInline.of(new SoftLineBreak(), scanner.position());
}
}
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInline.java b/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInline.java
index d52caf096..7e6ece88e 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInline.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInline.java
@@ -11,13 +11,13 @@ public static ParsedInline none() {
return null;
}
- public static ParsedInline of(Node node, int consumed) {
+ public static ParsedInline of(Node node, Position position) {
if (node == null) {
throw new NullPointerException("node must not be null");
}
- if (consumed <= 0) {
- throw new IllegalArgumentException("consumed must be greater than 0");
+ if (position == null) {
+ throw new NullPointerException("position must not be null");
}
- return new ParsedInlineImpl(node, consumed);
+ return new ParsedInlineImpl(node, position);
}
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInlineImpl.java b/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInlineImpl.java
index a22163d15..aea325f27 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInlineImpl.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInlineImpl.java
@@ -4,18 +4,18 @@
public class ParsedInlineImpl extends ParsedInline {
private final Node node;
- private final int consumed;
+ private final Position position;
- public ParsedInlineImpl(Node node, int consumed) {
+ ParsedInlineImpl(Node node, Position position) {
this.node = node;
- this.consumed = consumed;
+ this.position = position;
}
public Node getNode() {
return node;
}
- public int getConsumed() {
- return consumed;
+ public Position getPosition() {
+ return position;
}
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/Position.java b/commonmark/src/main/java/org/commonmark/internal/inline/Position.java
new file mode 100644
index 000000000..dff5e36df
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/Position.java
@@ -0,0 +1,14 @@
+package org.commonmark.internal.inline;
+
+public class Position {
+ final int index;
+
+ Position(int index) {
+ this.index = index;
+ }
+
+ // TODO: Move packages around so that this can stay package-private
+ public int getIndex() {
+ return index;
+ }
+}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
index 16204a530..599800845 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
@@ -1,10 +1,11 @@
package org.commonmark.internal.inline;
+import org.commonmark.internal.util.Parsing;
+
public class Scanner {
private final String input;
private int index;
- private int consumed = 0;
// TODO: Visibility
public Scanner(String input, int index) {
@@ -22,10 +23,34 @@ public char peek() {
public void skip() {
index++;
- consumed++;
}
- public int consumed() {
- return consumed;
+ public int skip(char c) {
+ int count = 0;
+ while (peek() == c) {
+ count++;
+ skip();
+ }
+ return count;
+ }
+
+ public boolean find(char c) {
+ int newIndex = Parsing.find(c, input, index);
+ if (newIndex == -1) {
+ return false;
+ } else {
+ index = newIndex;
+ return true;
+ }
+ }
+
+ // Don't expose the int index, because it would be good if we could switch input to a List of lines later
+ // instead of one contiguous String.
+ public Position position() {
+ return new Position(index);
+ }
+
+ public String textBetween(Position begin, Position end) {
+ return input.substring(begin.index, end.index);
}
}
From 3b90833d866981fbb0ce342a7cbca6da0841fb4d Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Sun, 26 Jul 2020 17:34:58 +1000
Subject: [PATCH 067/450] Autolink parser
---
.../commonmark/internal/InlineParserImpl.java | 32 +--------------
.../internal/inline/AutolinkInlineParser.java | 40 +++++++++++++++++++
2 files changed, 42 insertions(+), 30 deletions(-)
create mode 100644 commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
index 2d4e732fc..dec9fe236 100644
--- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
+++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
@@ -32,12 +32,6 @@ public class InlineParserImpl implements InlineParser, InlineParserState {
private static final Pattern ENTITY_HERE = Pattern.compile('^' + Escaping.ENTITY, Pattern.CASE_INSENSITIVE);
- private static final Pattern EMAIL_AUTOLINK = Pattern
- .compile("^<([a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>");
-
- private static final Pattern AUTOLINK = Pattern
- .compile("^<[a-zA-Z][a-zA-Z0-9.+-]{1,31}:[^<>\u0000-\u0020]*>");
-
private static final Pattern SPNL = Pattern.compile("^ *(?:\n *)?");
private static final Pattern UNICODE_WHITESPACE_CHAR = Pattern.compile("^[\\p{Zs}\t\r\n\f]");
@@ -72,6 +66,7 @@ public InlineParserImpl(InlineParserContext inlineParserContext) {
this.inlineParsers.put('\n', Collections.singletonList(new LineBreakInlineContentParser()));
this.inlineParsers.put('\\', Collections.singletonList(new BackslashInlineParser()));
this.inlineParsers.put('`', Collections.singletonList(new BackticksInlineParser()));
+ this.inlineParsers.put('<', Collections.singletonList(new AutolinkInlineParser()));
this.delimiterCharacters = calculateDelimiterCharacters(this.delimiterProcessors.keySet());
this.specialCharacters = calculateSpecialCharacters(delimiterCharacters, inlineParsers.keySet());
@@ -219,10 +214,7 @@ private Node parseInline(Node previous) {
node = parseCloseBracket();
break;
case '<':
- node = parseAutolink();
- if (node == null) {
- node = parseHtmlInline();
- }
+ node = parseHtmlInline();
break;
case '&':
node = parseEntity();
@@ -533,26 +525,6 @@ int parseLinkLabel() {
return contentLength + 2;
}
- /**
- * Attempt to parse an autolink (URL or email in pointy brackets).
- */
- private Node parseAutolink() {
- String m;
- if ((m = match(EMAIL_AUTOLINK)) != null) {
- String dest = m.substring(1, m.length() - 1);
- Link node = new Link("mailto:" + dest, null);
- node.appendChild(new Text(dest));
- return node;
- } else if ((m = match(AUTOLINK)) != null) {
- String dest = m.substring(1, m.length() - 1);
- Link node = new Link(dest, null);
- node.appendChild(new Text(dest));
- return node;
- } else {
- return null;
- }
- }
-
/**
* Attempt to parse inline HTML.
*/
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
new file mode 100644
index 000000000..f71206fb5
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
@@ -0,0 +1,40 @@
+package org.commonmark.internal.inline;
+
+import org.commonmark.node.Link;
+import org.commonmark.node.Node;
+import org.commonmark.node.Text;
+
+import java.util.regex.Pattern;
+
+/**
+ * Attempt to parse an autolink (URL or email in pointy brackets).
+ */
+public class AutolinkInlineParser implements InlineContentParser {
+
+ private static final Pattern URI = Pattern
+ .compile("^[a-zA-Z][a-zA-Z0-9.+-]{1,31}:[^<>\u0000-\u0020]*$");
+
+ private static final Pattern EMAIL = Pattern
+ .compile("^([a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)$");
+
+ @Override
+ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
+ Scanner scanner = inlineParserState.scanner();
+ scanner.skip();
+ Position start = scanner.position();
+ if (scanner.find('>')) {
+ String text = scanner.textBetween(start, scanner.position());
+ scanner.skip();
+ if (URI.matcher(text).matches()) {
+ Link node = new Link(text, null);
+ node.appendChild(new Text(text));
+ return ParsedInline.of(node, scanner.position());
+ } else if (EMAIL.matcher(text).matches()) {
+ Link node = new Link("mailto:" + text, null);
+ node.appendChild(new Text(text));
+ return ParsedInline.of(node, scanner.position());
+ }
+ }
+ return ParsedInline.none();
+ }
+}
From 2b956ec96c47e6074e885235035a60b1e5d72523 Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Mon, 27 Jul 2020 16:03:16 +1000
Subject: [PATCH 068/450] HTML inline parser
Most complex one yet, but I think it's nice to get rid of the regexes.
---
.../commonmark/internal/InlineParserImpl.java | 29 +--
.../internal/inline/AutolinkInlineParser.java | 2 +-
.../inline/BackticksInlineParser.java | 2 +-
.../internal/inline/HtmlInlineParser.java | 210 ++++++++++++++++++
.../commonmark/internal/inline/Scanner.java | 58 ++++-
.../internal/util/AsciiMatcher.java | 52 +++++
.../commonmark/internal/util/CharMatcher.java | 6 +
.../org/commonmark/internal/util/Parsing.java | 17 ++
.../commonmark/test/HtmlInlineParserTest.java | 27 +++
9 files changed, 366 insertions(+), 37 deletions(-)
create mode 100644 commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java
create mode 100644 commonmark/src/main/java/org/commonmark/internal/util/AsciiMatcher.java
create mode 100644 commonmark/src/main/java/org/commonmark/internal/util/CharMatcher.java
create mode 100644 commonmark/src/test/java/org/commonmark/test/HtmlInlineParserTest.java
diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
index dec9fe236..65661da32 100644
--- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
+++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
@@ -5,7 +5,6 @@
import org.commonmark.internal.util.Escaping;
import org.commonmark.internal.util.Html5Entities;
import org.commonmark.internal.util.LinkScanner;
-import org.commonmark.internal.util.Parsing;
import org.commonmark.node.*;
import org.commonmark.parser.InlineParser;
import org.commonmark.parser.InlineParserContext;
@@ -17,19 +16,10 @@
public class InlineParserImpl implements InlineParser, InlineParserState {
- private static final String HTMLCOMMENT = "|";
- private static final String PROCESSINGINSTRUCTION = "[<][?].*?[?][>]";
- private static final String DECLARATION = "]*>";
- private static final String CDATA = "";
- private static final String HTMLTAG = "(?:" + Parsing.OPENTAG + "|" + Parsing.CLOSETAG + "|" + HTMLCOMMENT
- + "|" + PROCESSINGINSTRUCTION + "|" + DECLARATION + "|" + CDATA + ")";
-
private static final String ASCII_PUNCTUATION = "!\"#\\$%&'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}~";
private static final Pattern PUNCTUATION = Pattern
.compile("^[" + ASCII_PUNCTUATION + "\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}]");
- private static final Pattern HTML_TAG = Pattern.compile('^' + HTMLTAG, Pattern.CASE_INSENSITIVE);
-
private static final Pattern ENTITY_HERE = Pattern.compile('^' + Escaping.ENTITY, Pattern.CASE_INSENSITIVE);
private static final Pattern SPNL = Pattern.compile("^ *(?:\n *)?");
@@ -66,7 +56,7 @@ public InlineParserImpl(InlineParserContext inlineParserContext) {
this.inlineParsers.put('\n', Collections.singletonList(new LineBreakInlineContentParser()));
this.inlineParsers.put('\\', Collections.singletonList(new BackslashInlineParser()));
this.inlineParsers.put('`', Collections.singletonList(new BackticksInlineParser()));
- this.inlineParsers.put('<', Collections.singletonList(new AutolinkInlineParser()));
+ this.inlineParsers.put('<', Arrays.asList(new AutolinkInlineParser(), new HtmlInlineParser()));
this.delimiterCharacters = calculateDelimiterCharacters(this.delimiterProcessors.keySet());
this.specialCharacters = calculateSpecialCharacters(delimiterCharacters, inlineParsers.keySet());
@@ -213,9 +203,6 @@ private Node parseInline(Node previous) {
case ']':
node = parseCloseBracket();
break;
- case '<':
- node = parseHtmlInline();
- break;
case '&':
node = parseEntity();
break;
@@ -525,20 +512,6 @@ int parseLinkLabel() {
return contentLength + 2;
}
- /**
- * Attempt to parse inline HTML.
- */
- private Node parseHtmlInline() {
- String m = match(HTML_TAG);
- if (m != null) {
- HtmlInline node = new HtmlInline();
- node.setLiteral(m);
- return node;
- } else {
- return null;
- }
- }
-
/**
* Attempt to parse a HTML style entity.
*/
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
index f71206fb5..8549a6d8d 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
@@ -22,7 +22,7 @@ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous)
Scanner scanner = inlineParserState.scanner();
scanner.skip();
Position start = scanner.position();
- if (scanner.find('>')) {
+ if (scanner.find('>') > 0) {
String text = scanner.textBetween(start, scanner.position());
scanner.skip();
if (URI.matcher(text).matches()) {
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
index a7b17c46f..2acca095e 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
@@ -17,7 +17,7 @@ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous)
int openingTicks = scanner.skip('`');
Position afterOpening = scanner.position();
- while (scanner.find('`')) {
+ while (scanner.find('`') > 0) {
Position beforeClosing = scanner.position();
int count = scanner.skip('`');
if (count == openingTicks) {
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java
new file mode 100644
index 000000000..cff7184ab
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java
@@ -0,0 +1,210 @@
+package org.commonmark.internal.inline;
+
+import org.commonmark.internal.util.AsciiMatcher;
+import org.commonmark.node.HtmlInline;
+import org.commonmark.node.Node;
+
+/**
+ * Attempt to parse inline HTML.
+ */
+public class HtmlInlineParser implements InlineContentParser {
+
+ private static final AsciiMatcher asciiLetter = AsciiMatcher.builder().range('A', 'Z').range('a', 'z').build();
+
+ // spec: A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or hyphens (-).
+ private static final AsciiMatcher tagNameStart = asciiLetter;
+ private static final AsciiMatcher tagNameContinue = tagNameStart.newBuilder().range('0', '9').c('-').build();
+
+ // spec: An attribute name consists of an ASCII letter, _, or :, followed by zero or more ASCII letters, digits,
+ // _, ., :, or -. (Note: This is the XML specification restricted to ASCII. HTML5 is laxer.)
+ private static final AsciiMatcher attributeStart = asciiLetter.newBuilder().c('_').c(':').build();
+ private static final AsciiMatcher attributeContinue = attributeStart.newBuilder().range('0', '9').c('.').c('-').build();
+ // spec: An unquoted attribute value is a nonempty string of characters not including whitespace, ", ', =, <, >, or `.
+ private static final AsciiMatcher attributeValueEnd = AsciiMatcher.builder()
+ .c(' ').c('\t').c('\n').c('\u000B').c('\f').c('\r')
+ .c('"').c('\'').c('=').c('<').c('>').c('`')
+ .build();
+
+ private static final AsciiMatcher declaration = AsciiMatcher.builder().range('A', 'Z').build();
+
+ @Override
+ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
+ Scanner scanner = inlineParserState.scanner();
+ Position start = scanner.position();
+ // Skip over `<`
+ scanner.skip();
+
+ char c = scanner.peek();
+ if (tagNameStart.matches(c)) {
+ if (tryOpenTag(scanner)) {
+ return htmlInline(start, scanner);
+ }
+ } else if (c == '/') {
+ if (tryClosingTag(scanner)) {
+ return htmlInline(start, scanner);
+ }
+ } else if (c == '?') {
+ if (tryProcessingInstruction(scanner)) {
+ return htmlInline(start, scanner);
+ }
+ } else if (c == '!') {
+ // comment, declaration or CDATA
+ scanner.skip();
+ c = scanner.peek();
+ if (c == '-') {
+ if (tryComment(scanner)) {
+ return htmlInline(start, scanner);
+ }
+ } else if (c == '[') {
+ if (tryCdata(scanner)) {
+ return htmlInline(start, scanner);
+ }
+ } else if (declaration.matches(c)) {
+ if (tryDeclaration(scanner)) {
+ return htmlInline(start, scanner);
+ }
+ }
+ }
+
+ return ParsedInline.none();
+ }
+
+ private static ParsedInline htmlInline(Position start, Scanner scanner) {
+ HtmlInline node = new HtmlInline();
+ node.setLiteral(scanner.textBetween(start, scanner.position()));
+ return ParsedInline.of(node, scanner.position());
+ }
+
+ private static boolean tryOpenTag(Scanner scanner) {
+ // spec: An open tag consists of a < character, a tag name, zero or more attributes, optional whitespace,
+ // an optional / character, and a > character.
+ scanner.skip();
+ scanner.skip(tagNameContinue);
+ boolean whitespace = scanner.skipWhitespace() >= 1;
+ // spec: An attribute consists of whitespace, an attribute name, and an optional attribute value specification.
+ while (whitespace && scanner.skip(attributeStart) >= 1) {
+ scanner.skip(attributeContinue);
+ // spec: An attribute value specification consists of optional whitespace, a = character,
+ // optional whitespace, and an attribute value.
+ whitespace = scanner.skipWhitespace() >= 1;
+ if (scanner.skipOne('=')) {
+ scanner.skipWhitespace();
+ char valueStart = scanner.peek();
+ if (valueStart == '\'') {
+ scanner.skip();
+ if (scanner.find('\'') < 0) {
+ return false;
+ }
+ scanner.skip();
+ } else if (valueStart == '"') {
+ scanner.skip();
+ if (scanner.find('"') < 0) {
+ return false;
+ }
+ scanner.skip();
+ } else {
+ if (scanner.find(attributeValueEnd) <= 0) {
+ return false;
+ }
+ }
+
+ // Whitespace is required between attributes
+ whitespace = scanner.skipWhitespace() >= 1;
+ }
+ }
+
+ scanner.skipOne('/');
+ return scanner.skipOne('>');
+ }
+
+ private static boolean tryClosingTag(Scanner scanner) {
+ // spec: A closing tag consists of the string </, a tag name, optional whitespace, and the character >.
+ scanner.skip();
+ if (scanner.skip(tagNameStart) >= 1) {
+ scanner.skip(tagNameContinue);
+ scanner.skipWhitespace();
+ return scanner.skipOne('>');
+ }
+ return false;
+ }
+
+ private static boolean tryProcessingInstruction(Scanner scanner) {
+ // spec: A processing instruction consists of the string <?, a string of characters not including the string ?>,
+ // and the string ?>.
+ scanner.skip();
+ while (scanner.find('?') > 0) {
+ scanner.skip();
+ if (scanner.skipOne('>')) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private static boolean tryComment(Scanner scanner) {
+ // spec: An HTML comment consists of <!-- + text + -->, where text does not start with > or ->, does not end
+ // with -, and does not contain --. (See the HTML5 spec.)
+
+ // Skip first `-`
+ scanner.skip();
+ if (!scanner.skipOne('-')) {
+ return false;
+ }
+
+ if (scanner.skipOne('>')) {
+ return false;
+ }
+
+ if (scanner.skipOne('-')) {
+ // Can't start with ->
+ if (scanner.skipOne('>')) {
+ return false;
+ }
+ // Empty comment
+ if (scanner.skipOne('-')) {
+ return scanner.skipOne('>');
+ }
+ }
+
+ while (scanner.find('-') >= 0) {
+ if (scanner.skipOne('-') && scanner.skipOne('-')) {
+ return scanner.skipOne('>');
+ }
+ }
+
+ return false;
+ }
+
+ private static boolean tryCdata(Scanner scanner) {
+ // spec: A CDATA section consists of the string <![CDATA[, a string of characters not including the string ]]>,
+ // and the string ]]>.
+
+ // Skip `[`
+ scanner.skip();
+
+ if (scanner.skipOne('C') && scanner.skipOne('D') && scanner.skipOne('A') && scanner.skipOne('T') && scanner.skipOne('A')
+ && scanner.skipOne('[')) {
+ while (scanner.find(']') >= 0) {
+ if (scanner.skipOne(']') && scanner.skipOne(']') && scanner.skipOne('>')) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }
+
+ private static boolean tryDeclaration(Scanner scanner) {
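+ // spec: A declaration consists of the string <!, a name consisting of one or more uppercase ASCII letters,
+ // whitespace, a string of characters not including the character >, and the character >.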
+ scanner.skip(declaration);
+ if (scanner.skipWhitespace() <= 0) {
+ return false;
+ }
+ if (scanner.find('>') >= 0) {
+ scanner.skip();
+ return true;
+ }
+ return false;
+ }
+}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
index 599800845..0aee95c8e 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
@@ -1,5 +1,6 @@
package org.commonmark.internal.inline;
+import org.commonmark.internal.util.CharMatcher;
import org.commonmark.internal.util.Parsing;
public class Scanner {
@@ -25,6 +26,15 @@ public void skip() {
index++;
}
+ public boolean skipOne(char c) {
+ if (peek() == c) {
+ skip();
+ return true;
+ } else {
+ return false;
+ }
+ }
+
public int skip(char c) {
int count = 0;
while (peek() == c) {
@@ -34,13 +44,47 @@ public int skip(char c) {
return count;
}
- public boolean find(char c) {
- int newIndex = Parsing.find(c, input, index);
- if (newIndex == -1) {
- return false;
- } else {
- index = newIndex;
- return true;
+ public int skip(CharMatcher matcher) {
+ int count = 0;
+ while (matcher.matches(peek())) {
+ count++;
+ skip();
+ }
+ return count;
+ }
+
+ public int skipWhitespace() {
+ int newIndex = Parsing.skipWhitespace(input, index, input.length());
+ int count = newIndex - index;
+ index = newIndex;
+ return count;
+ }
+
+ public int find(char c) {
+ int count = 0;
+ while (true) {
+ char cur = peek();
+ if (cur == '\0') {
+ return -1;
+ } else if (cur == c) {
+ return count;
+ }
+ count++;
+ skip();
+ }
+ }
+
+ public int find(CharMatcher matcher) {
+ int count = 0;
+ while (true) {
+ char c = peek();
+ if (c == '\0') {
+ return -1;
+ } else if (matcher.matches(c)) {
+ return count;
+ }
+ count++;
+ skip();
}
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/util/AsciiMatcher.java b/commonmark/src/main/java/org/commonmark/internal/util/AsciiMatcher.java
new file mode 100644
index 000000000..0e7ab345e
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/util/AsciiMatcher.java
@@ -0,0 +1,52 @@
+package org.commonmark.internal.util;
+
+import java.util.BitSet;
+
+public class AsciiMatcher implements CharMatcher {
+ // TODO: Check if boolean[] is faster, see BitClass in java.util.regex.Pattern
+ private final BitSet set;
+
+ private AsciiMatcher(Builder builder) {
+ this.set = builder.set;
+ }
+
+ @Override
+ public boolean matches(char c) {
+ return set.get(c);
+ }
+
+ public Builder newBuilder() {
+ return new Builder((BitSet) set.clone());
+ }
+
+ public static Builder builder() {
+ return new Builder(new BitSet());
+ }
+
+ public static class Builder {
+ private final BitSet set;
+
+ private Builder(BitSet set) {
+ this.set = set;
+ }
+
+ public Builder c(char c) {
+ if (c > 127) {
+ throw new IllegalArgumentException("Can only match ASCII characters");
+ }
+ set.set(c);
+ return this;
+ }
+
+ public Builder range(char from, char toInclusive) {
+ for (char c = from; c <= toInclusive; c++) {
+ c(c);
+ }
+ return this;
+ }
+
+ public AsciiMatcher build() {
+ return new AsciiMatcher(this);
+ }
+ }
+}
diff --git a/commonmark/src/main/java/org/commonmark/internal/util/CharMatcher.java b/commonmark/src/main/java/org/commonmark/internal/util/CharMatcher.java
new file mode 100644
index 000000000..de730e90d
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/util/CharMatcher.java
@@ -0,0 +1,6 @@
+package org.commonmark.internal.util;
+
+public interface CharMatcher {
+
+ boolean matches(char c);
+}
diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java
index d429d9db0..6c59b7255 100644
--- a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java
+++ b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java
@@ -188,6 +188,23 @@ public static int skipSpaceTabBackwards(CharSequence s, int startIndex, int last
return lastIndex - 1;
}
+ public static int skipWhitespace(CharSequence s, int startIndex, int endIndex) {
+ for (int i = startIndex; i < endIndex; i++) {
+ switch (s.charAt(i)) {
+ case ' ':
+ case '\t':
+ case '\n':
+ case '\u000B':
+ case '\f':
+ case '\r':
+ break;
+ default:
+ return i;
+ }
+ }
+ return endIndex;
+ }
+
private static int findNonSpace(CharSequence s, int startIndex) {
int length = s.length();
for (int i = startIndex; i < length; i++) {
diff --git a/commonmark/src/test/java/org/commonmark/test/HtmlInlineParserTest.java b/commonmark/src/test/java/org/commonmark/test/HtmlInlineParserTest.java
new file mode 100644
index 000000000..0a406778b
--- /dev/null
+++ b/commonmark/src/test/java/org/commonmark/test/HtmlInlineParserTest.java
@@ -0,0 +1,27 @@
+package org.commonmark.test;
+
+import org.junit.Test;
+
+public class HtmlInlineParserTest extends CoreRenderingTestCase {
+
+ @Test
+ public void comment() {
+ assertRendering("inline ", "inline
\n");
+ assertRendering("inline ", "inline
\n");
+ assertRendering("inline -->", "inline <!--->-->
\n");
+ }
+
+ @Test
+ public void cdata() {
+ assertRendering("inline ", "inline
\n");
+ assertRendering("inline ", "inline
\n");
+ }
+
+ @Test
+ public void declaration() {
+ // Whitespace is mandatory
+ assertRendering("inline ", "inline <!FOO>
\n");
+ assertRendering("inline ", "inline
\n");
+ assertRendering("inline ", "inline
\n");
+ }
+}
From b336bf9b2dfea1b82034b17ab3a6c05d0a7672c4 Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Mon, 27 Jul 2020 16:15:08 +1000
Subject: [PATCH 069/450] Remove unnecessary usage of regex for entities
---
.../internal/util/Html5Entities.java | 26 ++++++++++++-------
1 file changed, 16 insertions(+), 10 deletions(-)
diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Html5Entities.java b/commonmark/src/main/java/org/commonmark/internal/util/Html5Entities.java
index 5215a44df..523c596ed 100644
--- a/commonmark/src/main/java/org/commonmark/internal/util/Html5Entities.java
+++ b/commonmark/src/main/java/org/commonmark/internal/util/Html5Entities.java
@@ -5,24 +5,31 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
public class Html5Entities {
private static final Map NAMED_CHARACTER_REFERENCES = readEntities();
- private static final Pattern NUMERIC_PATTERN = Pattern.compile("^[Xx]?");
private static final String ENTITY_PATH = "/org/commonmark/internal/util/entities.properties";
public static String entityToString(String input) {
- Matcher matcher = NUMERIC_PATTERN.matcher(input);
+ if (!input.startsWith("&") || !input.endsWith(";")) {
+ return input;
+ }
+
+ String value = input.substring(1, input.length() - 1);
+ if (value.startsWith("#")) {
+ value = value.substring(1);
+ int base = 10;
+ if (value.startsWith("x") || value.startsWith("X")) {
+ value = value.substring(1);
+ base = 16;
+ }
- if (matcher.find()) {
- int base = matcher.end() == 2 ? 10 : 16;
try {
- int codePoint = Integer.parseInt(input.substring(matcher.end(), input.length() - 1), base);
+ int codePoint = Integer.parseInt(value, base);
if (codePoint == 0) {
return "\uFFFD";
}
@@ -31,8 +38,7 @@ public static String entityToString(String input) {
return "\uFFFD";
}
} else {
- String name = input.substring(1, input.length() - 1);
- String s = NAMED_CHARACTER_REFERENCES.get(name);
+ String s = NAMED_CHARACTER_REFERENCES.get(value);
if (s != null) {
return s;
} else {
@@ -44,7 +50,7 @@ public static String entityToString(String input) {
private static Map readEntities() {
Map entities = new HashMap<>();
InputStream stream = Html5Entities.class.getResourceAsStream(ENTITY_PATH);
- Charset charset = Charset.forName("UTF-8");
+ Charset charset = StandardCharsets.UTF_8;
try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, charset))) {
String line;
while ((line = bufferedReader.readLine()) != null) {
From 47a59ade072199e16ec89a4b71f8585a461981f1 Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Mon, 27 Jul 2020 16:32:15 +1000
Subject: [PATCH 070/450] Entity inline parser
---
.../commonmark/internal/InlineParserImpl.java | 21 +-------
.../internal/inline/EntityInlineParser.java | 53 +++++++++++++++++++
2 files changed, 54 insertions(+), 20 deletions(-)
create mode 100644 commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java
diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
index 65661da32..2bc1ebf69 100644
--- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
+++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
@@ -3,7 +3,6 @@
import org.commonmark.internal.inline.Scanner;
import org.commonmark.internal.inline.*;
import org.commonmark.internal.util.Escaping;
-import org.commonmark.internal.util.Html5Entities;
import org.commonmark.internal.util.LinkScanner;
import org.commonmark.node.*;
import org.commonmark.parser.InlineParser;
@@ -20,8 +19,6 @@ public class InlineParserImpl implements InlineParser, InlineParserState {
private static final Pattern PUNCTUATION = Pattern
.compile("^[" + ASCII_PUNCTUATION + "\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}]");
- private static final Pattern ENTITY_HERE = Pattern.compile('^' + Escaping.ENTITY, Pattern.CASE_INSENSITIVE);
-
private static final Pattern SPNL = Pattern.compile("^ *(?:\n *)?");
private static final Pattern UNICODE_WHITESPACE_CHAR = Pattern.compile("^[\\p{Zs}\t\r\n\f]");
@@ -56,6 +53,7 @@ public InlineParserImpl(InlineParserContext inlineParserContext) {
this.inlineParsers.put('\n', Collections.singletonList(new LineBreakInlineContentParser()));
this.inlineParsers.put('\\', Collections.singletonList(new BackslashInlineParser()));
this.inlineParsers.put('`', Collections.singletonList(new BackticksInlineParser()));
+ this.inlineParsers.put('&', Collections.singletonList(new EntityInlineParser()));
this.inlineParsers.put('<', Arrays.asList(new AutolinkInlineParser(), new HtmlInlineParser()));
this.delimiterCharacters = calculateDelimiterCharacters(this.delimiterProcessors.keySet());
@@ -79,8 +77,6 @@ public static BitSet calculateSpecialCharacters(BitSet delimiterCharacters, Set<
bitSet.set('[');
bitSet.set(']');
bitSet.set('!');
- bitSet.set('<');
- bitSet.set('&');
return bitSet;
}
@@ -203,9 +199,6 @@ private Node parseInline(Node previous) {
case ']':
node = parseCloseBracket();
break;
- case '&':
- node = parseEntity();
- break;
default:
boolean isDelimiter = delimiterCharacters.get(c);
if (isDelimiter) {
@@ -512,18 +505,6 @@ int parseLinkLabel() {
return contentLength + 2;
}
- /**
- * Attempt to parse a HTML style entity.
- */
- private Node parseEntity() {
- String m;
- if ((m = match(ENTITY_HERE)) != null) {
- return text(Html5Entities.entityToString(m));
- } else {
- return null;
- }
- }
-
/**
* Parse a run of ordinary characters, or a single character with a special meaning in markdown, as a plain string.
*/
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java
new file mode 100644
index 000000000..aef2d7850
--- /dev/null
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java
@@ -0,0 +1,53 @@
+package org.commonmark.internal.inline;
+
+import org.commonmark.internal.util.AsciiMatcher;
+import org.commonmark.internal.util.Html5Entities;
+import org.commonmark.node.Node;
+import org.commonmark.node.Text;
+
+/**
+ * Attempts to parse a HTML entity or numeric character reference.
+ */
+public class EntityInlineParser implements InlineContentParser {
+
+ private static final AsciiMatcher hex = AsciiMatcher.builder().range('0', '9').range('A', 'F').range('a', 'f').build();
+ private static final AsciiMatcher dec = AsciiMatcher.builder().range('0', '9').build();
+ private static final AsciiMatcher entityStart = AsciiMatcher.builder().range('A', 'Z').range('a', 'z').build();
+ private static final AsciiMatcher entityContinue = entityStart.newBuilder().range('0', '9').build();
+
+ @Override
+ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
+ Scanner scanner = inlineParserState.scanner();
+ Position start = scanner.position();
+ // Skip `&`
+ scanner.skip();
+
+ char c = scanner.peek();
+ if (c == '#') {
+ // Numeric
+ scanner.skip();
+ if (scanner.skipOne('x') || scanner.skipOne('X')) {
+ int digits = scanner.skip(hex);
+ if (1 <= digits && digits <= 6 && scanner.skipOne(';')) {
+ return entity(scanner, start);
+ }
+ } else {
+ int digits = scanner.skip(dec);
+ if (1 <= digits && digits <= 7 && scanner.skipOne(';')) {
+ return entity(scanner, start);
+ }
+ }
+ } else if (entityStart.matches(c)) {
+ scanner.skip(entityContinue);
+ if (scanner.skipOne(';')) {
+ return entity(scanner, start);
+ }
+ }
+
+ return ParsedInline.none();
+ }
+
+ private ParsedInline entity(Scanner scanner, Position start) {
+ return ParsedInline.of(new Text(Html5Entities.entityToString(scanner.textBetween(start, scanner.position()))), scanner.position());
+ }
+}
From 3a6bec1bd0b3413d02804ba542cbdefe35735c28 Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Mon, 27 Jul 2020 17:13:37 +1000
Subject: [PATCH 071/450] Rename methods
---
.../internal/inline/AutolinkInlineParser.java | 4 +-
.../inline/BackslashInlineParser.java | 6 +-
.../inline/BackticksInlineParser.java | 4 +-
.../internal/inline/EntityInlineParser.java | 18 ++--
.../internal/inline/HtmlInlineParser.java | 82 +++++++++----------
.../inline/LineBreakInlineContentParser.java | 2 +-
.../commonmark/internal/inline/Scanner.java | 20 ++---
7 files changed, 68 insertions(+), 68 deletions(-)
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
index 8549a6d8d..629ec6619 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
@@ -20,11 +20,11 @@ public class AutolinkInlineParser implements InlineContentParser {
@Override
public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
Scanner scanner = inlineParserState.scanner();
- scanner.skip();
+ scanner.next();
Position start = scanner.position();
if (scanner.find('>') > 0) {
String text = scanner.textBetween(start, scanner.position());
- scanner.skip();
+ scanner.next();
if (URI.matcher(text).matches()) {
Link node = new Link(text, null);
node.appendChild(new Text(text));
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java
index 2a36c329a..cd87f7399 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java
@@ -19,14 +19,14 @@ public class BackslashInlineParser implements InlineContentParser {
public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
Scanner scanner = inlineParserState.scanner();
// Backslash
- scanner.skip();
+ scanner.next();
char next = scanner.peek();
if (next == '\n') {
- scanner.skip();
+ scanner.next();
return ParsedInline.of(new HardLineBreak(), scanner.position());
} else if (ESCAPABLE.matcher(String.valueOf(next)).matches()) {
- scanner.skip();
+ scanner.next();
return ParsedInline.of(new Text(String.valueOf(next)), scanner.position());
} else {
return ParsedInline.of(new Text("\\"), scanner.position());
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
index 2acca095e..5dedad8d3 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
@@ -14,12 +14,12 @@ public class BackticksInlineParser implements InlineContentParser {
public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
Scanner scanner = inlineParserState.scanner();
Position start = scanner.position();
- int openingTicks = scanner.skip('`');
+ int openingTicks = scanner.matchMultiple('`');
Position afterOpening = scanner.position();
while (scanner.find('`') > 0) {
Position beforeClosing = scanner.position();
- int count = scanner.skip('`');
+ int count = scanner.matchMultiple('`');
if (count == openingTicks) {
Code node = new Code();
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java
index aef2d7850..79fe294d6 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java
@@ -20,26 +20,26 @@ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous)
Scanner scanner = inlineParserState.scanner();
Position start = scanner.position();
// Skip `&`
- scanner.skip();
+ scanner.next();
char c = scanner.peek();
if (c == '#') {
// Numeric
- scanner.skip();
- if (scanner.skipOne('x') || scanner.skipOne('X')) {
- int digits = scanner.skip(hex);
- if (1 <= digits && digits <= 6 && scanner.skipOne(';')) {
+ scanner.next();
+ if (scanner.next('x') || scanner.next('X')) {
+ int digits = scanner.match(hex);
+ if (1 <= digits && digits <= 6 && scanner.next(';')) {
return entity(scanner, start);
}
} else {
- int digits = scanner.skip(dec);
- if (1 <= digits && digits <= 7 && scanner.skipOne(';')) {
+ int digits = scanner.match(dec);
+ if (1 <= digits && digits <= 7 && scanner.next(';')) {
return entity(scanner, start);
}
}
} else if (entityStart.matches(c)) {
- scanner.skip(entityContinue);
- if (scanner.skipOne(';')) {
+ scanner.match(entityContinue);
+ if (scanner.next(';')) {
return entity(scanner, start);
}
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java
index cff7184ab..ea4511ab3 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java
@@ -32,7 +32,7 @@ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous)
Scanner scanner = inlineParserState.scanner();
Position start = scanner.position();
// Skip over `<`
- scanner.skip();
+ scanner.next();
char c = scanner.peek();
if (tagNameStart.matches(c)) {
@@ -49,7 +49,7 @@ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous)
}
} else if (c == '!') {
// comment, declaration or CDATA
- scanner.skip();
+ scanner.next();
c = scanner.peek();
if (c == '-') {
if (tryComment(scanner)) {
@@ -78,30 +78,30 @@ private static ParsedInline htmlInline(Position start, Scanner scanner) {
private static boolean tryOpenTag(Scanner scanner) {
// spec: An open tag consists of a < character, a tag name, zero or more attributes, optional whitespace,
// an optional / character, and a > character.
- scanner.skip();
- scanner.skip(tagNameContinue);
- boolean whitespace = scanner.skipWhitespace() >= 1;
+ scanner.next();
+ scanner.match(tagNameContinue);
+ boolean whitespace = scanner.whitespace() >= 1;
// spec: An attribute consists of whitespace, an attribute name, and an optional attribute value specification.
- while (whitespace && scanner.skip(attributeStart) >= 1) {
- scanner.skip(attributeContinue);
+ while (whitespace && scanner.match(attributeStart) >= 1) {
+ scanner.match(attributeContinue);
// spec: An attribute value specification consists of optional whitespace, a = character,
// optional whitespace, and an attribute value.
- whitespace = scanner.skipWhitespace() >= 1;
- if (scanner.skipOne('=')) {
- scanner.skipWhitespace();
+ whitespace = scanner.whitespace() >= 1;
+ if (scanner.next('=')) {
+ scanner.whitespace();
char valueStart = scanner.peek();
if (valueStart == '\'') {
- scanner.skip();
+ scanner.next();
if (scanner.find('\'') < 0) {
return false;
}
- scanner.skip();
+ scanner.next();
} else if (valueStart == '"') {
- scanner.skip();
+ scanner.next();
if (scanner.find('"') < 0) {
return false;
}
- scanner.skip();
+ scanner.next();
} else {
if (scanner.find(attributeValueEnd) <= 0) {
return false;
@@ -109,21 +109,21 @@ private static boolean tryOpenTag(Scanner scanner) {
}
// Whitespace is required between attributes
- whitespace = scanner.skipWhitespace() >= 1;
+ whitespace = scanner.whitespace() >= 1;
}
}
- scanner.skipOne('/');
- return scanner.skipOne('>');
+ scanner.next('/');
+ return scanner.next('>');
}
private static boolean tryClosingTag(Scanner scanner) {
// spec: A closing tag consists of the string </, a tag name, optional whitespace, and the character >.
- scanner.skip();
- if (scanner.skip(tagNameStart) >= 1) {
- scanner.skip(tagNameContinue);
- scanner.skipWhitespace();
- return scanner.skipOne('>');
+ scanner.next();
+ if (scanner.match(tagNameStart) >= 1) {
+ scanner.match(tagNameContinue);
+ scanner.whitespace();
+ return scanner.next('>');
}
return false;
}
@@ -131,10 +131,10 @@ private static boolean tryClosingTag(Scanner scanner) {
private static boolean tryProcessingInstruction(Scanner scanner) {
// spec: A processing instruction consists of the string <?, a string of characters not including the string ?>,
// and the string ?>.
- scanner.skip();
+ scanner.next();
while (scanner.find('?') > 0) {
- scanner.skip();
- if (scanner.skipOne('>')) {
+ scanner.next();
+ if (scanner.next('>')) {
return true;
}
}
@@ -146,29 +146,29 @@ private static boolean tryComment(Scanner scanner) {
// with -, and does not contain --. (See the HTML5 spec.)
// Skip first `-`
- scanner.skip();
- if (!scanner.skipOne('-')) {
+ scanner.next();
+ if (!scanner.next('-')) {
return false;
}
- if (scanner.skipOne('>')) {
+ if (scanner.next('>')) {
return false;
}
- if (scanner.skipOne('-')) {
+ if (scanner.next('-')) {
// Can't start with ->
- if (scanner.skipOne('>')) {
+ if (scanner.next('>')) {
return false;
}
// Empty comment
- if (scanner.skipOne('-')) {
- return scanner.skipOne('>');
+ if (scanner.next('-')) {
+ return scanner.next('>');
}
}
while (scanner.find('-') >= 0) {
- if (scanner.skipOne('-') && scanner.skipOne('-')) {
- return scanner.skipOne('>');
+ if (scanner.next('-') && scanner.next('-')) {
+ return scanner.next('>');
}
}
@@ -180,12 +180,12 @@ private static boolean tryCdata(Scanner scanner) {
// and the string ]]>.
// Skip `[`
- scanner.skip();
+ scanner.next();
- if (scanner.skipOne('C') && scanner.skipOne('D') && scanner.skipOne('A') && scanner.skipOne('T') && scanner.skipOne('A')
- && scanner.skipOne('[')) {
+ if (scanner.next('C') && scanner.next('D') && scanner.next('A') && scanner.next('T') && scanner.next('A')
+ && scanner.next('[')) {
while (scanner.find(']') >= 0) {
- if (scanner.skipOne(']') && scanner.skipOne(']') && scanner.skipOne('>')) {
+ if (scanner.next(']') && scanner.next(']') && scanner.next('>')) {
return true;
}
}
@@ -197,12 +197,12 @@ private static boolean tryCdata(Scanner scanner) {
private static boolean tryDeclaration(Scanner scanner) {
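// spec: A declaration consists of the string <!, a name consisting of one or more uppercase ASCII letters,
// whitespace, a string of characters not including the character >, and the character >.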
- scanner.skip(declaration);
- if (scanner.skipWhitespace() <= 0) {
+ scanner.match(declaration);
+ if (scanner.whitespace() <= 0) {
return false;
}
if (scanner.find('>') >= 0) {
- scanner.skip();
+ scanner.next();
return true;
}
return false;
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java
index 2ef8f2328..1b85c6eec 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java
@@ -14,7 +14,7 @@ public class LineBreakInlineContentParser implements InlineContentParser {
@Override
public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
Scanner scanner = inlineParserState.scanner();
- scanner.skip();
+ scanner.next();
// Check previous text for trailing spaces.
// The "endsWith" is an optimization to avoid an RE match in the common case.
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
index 0aee95c8e..192285b1a 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
@@ -22,38 +22,38 @@ public char peek() {
}
}
- public void skip() {
+ public void next() {
index++;
}
- public boolean skipOne(char c) {
+ public boolean next(char c) {
if (peek() == c) {
- skip();
+ next();
return true;
} else {
return false;
}
}
- public int skip(char c) {
+ public int matchMultiple(char c) {
int count = 0;
while (peek() == c) {
count++;
- skip();
+ next();
}
return count;
}
- public int skip(CharMatcher matcher) {
+ public int match(CharMatcher matcher) {
int count = 0;
while (matcher.matches(peek())) {
count++;
- skip();
+ next();
}
return count;
}
- public int skipWhitespace() {
+ public int whitespace() {
int newIndex = Parsing.skipWhitespace(input, index, input.length());
int count = newIndex - index;
index = newIndex;
@@ -70,7 +70,7 @@ public int find(char c) {
return count;
}
count++;
- skip();
+ next();
}
}
@@ -84,7 +84,7 @@ public int find(CharMatcher matcher) {
return count;
}
count++;
- skip();
+ next();
}
}
From 6ea9ae53ac02d61bf2c5aa6b7af7e038f376fa94 Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Mon, 27 Jul 2020 21:09:16 +1000
Subject: [PATCH 072/450] Use Scanner for link parsing too
Getting really close to not needing `match` in InlineParserImpl anymore!
---
.../commonmark/internal/InlineParserImpl.java | 79 +++++-----
.../LinkReferenceDefinitionParser.java | 139 +++++++++---------
.../internal/inline/AutolinkInlineParser.java | 2 +-
.../inline/BackticksInlineParser.java | 4 +-
.../internal/inline/EntityInlineParser.java | 3 +-
.../internal/inline/HtmlInlineParser.java | 3 +-
.../commonmark/internal/inline/Scanner.java | 14 +-
.../commonmark/internal/util/Escaping.java | 6 -
.../commonmark/internal/util/LinkScanner.java | 131 +++++++++--------
.../org/commonmark/internal/util/Parsing.java | 72 +++++----
.../internal/inline/ScannerTest.java | 20 +++
11 files changed, 263 insertions(+), 210 deletions(-)
create mode 100644 commonmark/src/test/java/org/commonmark/internal/inline/ScannerTest.java
diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
index 2bc1ebf69..ae6598c91 100644
--- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
+++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
@@ -93,6 +93,10 @@ public Scanner scanner() {
return new Scanner(input, index);
}
+ private void setPosition(Position position) {
+ index = position.getIndex();
+ }
+
private static void addDelimiterProcessors(Iterable delimiterProcessors, Map map) {
for (DelimiterProcessor delimiterProcessor : delimiterProcessors) {
char opening = delimiterProcessor.getOpeningCharacter();
@@ -182,7 +186,7 @@ private Node parseInline(Node previous) {
ParsedInline parsedInline = inlineParser.tryParse(this, previous);
if (parsedInline instanceof ParsedInlineImpl) {
ParsedInlineImpl parsedInlineImpl = (ParsedInlineImpl) parsedInline;
- index = parsedInlineImpl.getPosition().getIndex();
+ setPosition(parsedInlineImpl.getPosition());
return parsedInlineImpl.getNode();
}
}
@@ -372,21 +376,18 @@ private Node parseCloseBracket() {
if (!isLinkOrImage) {
// See if there's a link label like `[bar]` or `[]`
- int beforeLabel = index;
- parseLinkLabel();
- int labelLength = index - beforeLabel;
- String ref = null;
- if (labelLength > 2) {
- ref = input.substring(beforeLabel, beforeLabel + labelLength);
- } else if (!opener.bracketAfter) {
+ String ref = parseLinkLabel();
+ if ((ref == null || ref.isEmpty()) && !opener.bracketAfter) {
// If the second label is empty `[foo][]` or missing `[foo]`, then the first label is the reference.
// But it can only be a reference when there's no (unescaped) bracket in it.
// If there is, we don't even need to try to look up the reference. This is an optimization.
ref = input.substring(opener.index, startIndex);
+ // Strip '[' and ']'
+ ref = ref.substring(1, ref.length() - 1);
}
if (ref != null) {
- String label = Escaping.normalizeReference(ref);
+ String label = Escaping.normalizeLabelContent(ref);
LinkReferenceDefinition definition = context.getLinkReferenceDefinition(label);
if (definition != null) {
dest = definition.getDestination();
@@ -451,20 +452,23 @@ private void removeLastBracket() {
* Attempt to parse link destination, returning the string or null if no match.
*/
private String parseLinkDestination() {
- int afterDest = LinkScanner.scanLinkDestination(input, index);
- if (afterDest == -1) {
+ Scanner scanner = scanner();
+ char delimiter = scanner.peek();
+ Position start = scanner.position();
+ if (!LinkScanner.scanLinkDestination(scanner)) {
return null;
}
String dest;
- if (peek() == '<') {
+ if (delimiter == '<') {
// chop off surrounding <..>:
- dest = input.substring(index + 1, afterDest - 1);
+ CharSequence rawDestination = scanner.textBetween(start, scanner.position());
+ dest = rawDestination.subSequence(1, rawDestination.length() - 1).toString();
} else {
- dest = input.substring(index, afterDest);
+ dest = scanner.textBetween(start, scanner.position()).toString();
}
- index = afterDest;
+ setPosition(scanner.position());
return Escaping.unescapeString(dest);
}
@@ -472,37 +476,46 @@ private String parseLinkDestination() {
* Attempt to parse link title (sans quotes), returning the string or null if no match.
*/
private String parseLinkTitle() {
- int afterTitle = LinkScanner.scanLinkTitle(input, index);
- if (afterTitle == -1) {
+ Scanner scanner = scanner();
+ Position start = scanner.position();
+ if (!LinkScanner.scanLinkTitle(scanner)) {
return null;
}
// chop off ', " or parens
- String title = input.substring(index + 1, afterTitle - 1);
- index = afterTitle;
+ CharSequence rawTitle = scanner.textBetween(start, scanner.position());
+ String title = rawTitle.subSequence(1, rawTitle.length() - 1).toString();
+ setPosition(scanner.position());
return Escaping.unescapeString(title);
}
/**
- * Attempt to parse a link label, returning number of characters parsed.
+ * Attempt to parse a link label, returning the label between the brackets or null.
*/
- int parseLinkLabel() {
- if (index >= input.length() || input.charAt(index) != '[') {
- return 0;
+ String parseLinkLabel() {
+ Scanner scanner = scanner();
+ if (!scanner.next('[')) {
+ return null;
}
- int startContent = index + 1;
- int endContent = LinkScanner.scanLinkLabelContent(input, startContent);
- // spec: A link label can have at most 999 characters inside the square brackets.
- int contentLength = endContent - startContent;
- if (endContent == -1 || contentLength > 999) {
- return 0;
+ Position start = scanner.position();
+ if (!LinkScanner.scanLinkLabelContent(scanner)) {
+ return null;
+ }
+ Position end = scanner.position();
+
+ if (!scanner.next(']')) {
+ return null;
}
- if (endContent >= input.length() || input.charAt(endContent) != ']') {
- return 0;
+
+ String content = scanner.textBetween(start, end).toString();
+ // spec: A link label can have at most 999 characters inside the square brackets.
+ if (content.length() > 999) {
+ return null;
}
- index = endContent + 1;
- return contentLength + 2;
+
+ setPosition(scanner.position());
+ return content;
}
/**
diff --git a/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java
index e0c23160e..a6cd57228 100644
--- a/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java
@@ -1,8 +1,9 @@
package org.commonmark.internal;
+import org.commonmark.internal.inline.Position;
+import org.commonmark.internal.inline.Scanner;
import org.commonmark.internal.util.Escaping;
import org.commonmark.internal.util.LinkScanner;
-import org.commonmark.internal.util.Parsing;
import org.commonmark.node.LinkReferenceDefinition;
import org.commonmark.node.SourceSpan;
@@ -35,8 +36,9 @@ public void parse(CharSequence line) {
}
paragraph.append(line);
- int i = 0;
- while (i < line.length()) {
+ Scanner scanner = new Scanner(line, 0);
+ while (scanner.hasNext()) {
+ boolean success;
switch (state) {
case PARAGRAPH: {
// We're in a paragraph now. Link reference definitions can only appear at the beginning, so once
@@ -44,28 +46,31 @@ public void parse(CharSequence line) {
return;
}
case START_DEFINITION: {
- i = startDefinition(line, i);
+ success = startDefinition(scanner);
break;
}
case LABEL: {
- i = label(line, i);
+ success = label(scanner);
break;
}
case DESTINATION: {
- i = destination(line, i);
+ success = destination(scanner);
break;
}
case START_TITLE: {
- i = startTitle(line, i);
+ success = startTitle(scanner);
break;
}
case TITLE: {
- i = title(line, i);
+ success = title(scanner);
break;
}
+ default: {
+ throw new IllegalStateException("Unknown parsing state: " + state);
+ }
}
- // -1 is returned if parsing failed, which means we fall back to treating text as a paragraph.
- if (i == -1) {
+ // Parsing failed, which means we fall back to treating text as a paragraph.
+ if (!success) {
state = State.PARAGRAPH;
return;
}
@@ -93,96 +98,95 @@ State getState() {
return state;
}
- private int startDefinition(CharSequence line, int i) {
- i = Parsing.skipSpaceTab(line, i, line.length());
- if (i >= line.length() || line.charAt(i) != '[') {
- return -1;
+ private boolean startDefinition(Scanner scanner) {
+ scanner.whitespace();
+ if (!scanner.next('[')) {
+ return false;
}
state = State.LABEL;
label = new StringBuilder();
- int labelStart = i + 1;
- if (labelStart >= line.length()) {
+ if (!scanner.hasNext()) {
label.append('\n');
}
-
- return labelStart;
+ return true;
}
- private int label(CharSequence line, int i) {
- int afterLabel = LinkScanner.scanLinkLabelContent(line, i);
- if (afterLabel == -1) {
- return -1;
+ private boolean label(Scanner scanner) {
+ Position start = scanner.position();
+ if (!LinkScanner.scanLinkLabelContent(scanner)) {
+ return false;
}
- label.append(line, i, afterLabel);
+ label.append(scanner.textBetween(start, scanner.position()));
- if (afterLabel >= line.length()) {
+ if (!scanner.hasNext()) {
// label might continue on next line
label.append('\n');
- return afterLabel;
- } else if (line.charAt(afterLabel) == ']') {
- int colon = afterLabel + 1;
+ return true;
+ } else if (scanner.next(']')) {
// end of label
- if (colon >= line.length() || line.charAt(colon) != ':') {
- return -1;
+ if (!scanner.next(':')) {
+ return false;
}
// spec: A link label can have at most 999 characters inside the square brackets.
if (label.length() > 999) {
- return -1;
+ return false;
}
String normalizedLabel = Escaping.normalizeLabelContent(label.toString());
if (normalizedLabel.isEmpty()) {
- return -1;
+ return false;
}
this.normalizedLabel = normalizedLabel;
state = State.DESTINATION;
- return Parsing.skipSpaceTab(line, colon + 1, line.length());
+ scanner.whitespace();
+ return true;
} else {
- return -1;
+ return false;
}
}
- private int destination(CharSequence line, int i) {
- i = Parsing.skipSpaceTab(line, i, line.length());
- int afterDestination = LinkScanner.scanLinkDestination(line, i);
- if (afterDestination == -1) {
- return -1;
+ private boolean destination(Scanner scanner) {
+ scanner.whitespace();
+ Position start = scanner.position();
+ if (!LinkScanner.scanLinkDestination(scanner)) {
+ return false;
}
- destination = (line.charAt(i) == '<')
- ? line.subSequence(i + 1, afterDestination - 1).toString()
- : line.subSequence(i, afterDestination).toString();
+ String rawDestination = scanner.textBetween(start, scanner.position()).toString();
+ destination = rawDestination.startsWith("<") ?
+ rawDestination.substring(1, rawDestination.length() - 1) :
+ rawDestination;
- int afterSpace = Parsing.skipSpaceTab(line, afterDestination, line.length());
- if (afterSpace >= line.length()) {
+ int whitespace = scanner.whitespace();
+ if (!scanner.hasNext()) {
// Destination was at end of line, so this is a valid reference for sure (and maybe a title).
// If not at end of line, wait for title to be valid first.
referenceValid = true;
paragraph.setLength(0);
- } else if (afterSpace == afterDestination) {
+ } else if (whitespace == 0) {
// spec: The title must be separated from the link destination by whitespace
- return -1;
+ return false;
}
state = State.START_TITLE;
- return afterSpace;
+ return true;
}
- private int startTitle(CharSequence line, int i) {
- i = Parsing.skipSpaceTab(line, i, line.length());
- if (i >= line.length()) {
+ private boolean startTitle(Scanner scanner) {
+ scanner.whitespace();
+ if (!scanner.hasNext()) {
state = State.START_DEFINITION;
- return i;
+ return true;
}
titleDelimiter = '\0';
- char c = line.charAt(i);
+ char c = scanner.peek();
switch (c) {
case '"':
case '\'':
@@ -196,8 +200,8 @@ private int startTitle(CharSequence line, int i) {
if (titleDelimiter != '\0') {
state = State.TITLE;
title = new StringBuilder();
- i++;
- if (i == line.length()) {
+ scanner.next();
+ if (!scanner.hasNext()) {
title.append('\n');
}
} else {
@@ -205,29 +209,30 @@ private int startTitle(CharSequence line, int i) {
// There might be another reference instead, try that for the same character.
state = State.START_DEFINITION;
}
- return i;
+ return true;
}
- private int title(CharSequence line, int i) {
- int afterTitle = LinkScanner.scanLinkTitleContent(line, i, titleDelimiter);
- if (afterTitle == -1) {
+ private boolean title(Scanner scanner) {
+ Position start = scanner.position();
+ if (!LinkScanner.scanLinkTitleContent(scanner, titleDelimiter)) {
// Invalid title, stop
- return -1;
+ return false;
}
- title.append(line.subSequence(i, afterTitle));
+ title.append(scanner.textBetween(start, scanner.position()));
- if (afterTitle >= line.length()) {
- // Title still going, continue on next line
+ if (!scanner.hasNext()) {
+ // Title ran until the end of line, so continue on next line (until we find the delimiter)
title.append('\n');
- return afterTitle;
+ return true;
}
- int afterTitleDelimiter = afterTitle + 1;
- int afterSpace = Parsing.skipSpaceTab(line, afterTitleDelimiter, line.length());
- if (afterSpace != line.length()) {
+ // Skip delimiter character
+ scanner.next();
+ scanner.whitespace();
+ if (scanner.hasNext()) {
// spec: No further non-whitespace characters may occur on the line.
- return -1;
+ return false;
}
referenceValid = true;
finishReference();
@@ -235,7 +240,7 @@ private int title(CharSequence line, int i) {
// See if there's another definition.
state = State.START_DEFINITION;
- return afterSpace;
+ return true;
}
private void finishReference() {
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
index 629ec6619..acf55a796 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
@@ -23,7 +23,7 @@ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous)
scanner.next();
Position start = scanner.position();
if (scanner.find('>') > 0) {
- String text = scanner.textBetween(start, scanner.position());
+ String text = scanner.textBetween(start, scanner.position()).toString();
scanner.next();
if (URI.matcher(text).matches()) {
Link node = new Link(text, null);
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
index 5dedad8d3..9979bde86 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
@@ -23,7 +23,7 @@ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous)
if (count == openingTicks) {
Code node = new Code();
- String content = scanner.textBetween(afterOpening, beforeClosing);
+ String content = scanner.textBetween(afterOpening, beforeClosing).toString();
content = content.replace('\n', ' ');
// spec: If the resulting string both begins and ends with a space character, but does not consist
@@ -41,7 +41,7 @@ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous)
}
// If we got here, we didn't find a matching closing backtick sequence.
- String ticks = scanner.textBetween(start, afterOpening);
+ String ticks = scanner.textBetween(start, afterOpening).toString();
return ParsedInline.of(new Text(ticks), afterOpening);
}
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java
index 79fe294d6..d44ee4217 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java
@@ -48,6 +48,7 @@ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous)
}
private ParsedInline entity(Scanner scanner, Position start) {
- return ParsedInline.of(new Text(Html5Entities.entityToString(scanner.textBetween(start, scanner.position()))), scanner.position());
+ String text = scanner.textBetween(start, scanner.position()).toString();
+ return ParsedInline.of(new Text(Html5Entities.entityToString(text)), scanner.position());
}
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java
index ea4511ab3..4c25c3e58 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java
@@ -70,8 +70,9 @@ public ParsedInline tryParse(InlineParserState inlineParserState, Node previous)
}
private static ParsedInline htmlInline(Position start, Scanner scanner) {
+ String text = scanner.textBetween(start, scanner.position()).toString();
HtmlInline node = new HtmlInline();
- node.setLiteral(scanner.textBetween(start, scanner.position()));
+ node.setLiteral(text);
return ParsedInline.of(node, scanner.position());
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
index 192285b1a..cb87248d1 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
@@ -5,11 +5,11 @@
public class Scanner {
- private final String input;
+ private final CharSequence input;
private int index;
// TODO: Visibility
- public Scanner(String input, int index) {
+ public Scanner(CharSequence input, int index) {
this.input = input;
this.index = index;
}
@@ -22,6 +22,10 @@ public char peek() {
}
}
+ public boolean hasNext() {
+ return index < input.length();
+ }
+
public void next() {
index++;
}
@@ -94,7 +98,9 @@ public Position position() {
return new Position(index);
}
- public String textBetween(Position begin, Position end) {
- return input.substring(begin.index, end.index);
+ // For cases where the caller appends the result to a StringBuilder, we could offer another method to avoid some
+ // unnecessary copying.
+ public CharSequence textBetween(Position begin, Position end) {
+ return input.subSequence(begin.index, end.index);
}
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java b/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java
index 15197556c..2b34f6190 100644
--- a/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java
+++ b/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java
@@ -111,12 +111,6 @@ public static String percentEncodeUrl(String s) {
return replaceAll(ESCAPE_IN_URI, s, URI_REPLACER);
}
- public static String normalizeReference(String input) {
- // Strip '[' and ']'
- String stripped = input.substring(1, input.length() - 1);
- return normalizeLabelContent(stripped);
- }
-
public static String normalizeLabelContent(String input) {
String trimmed = input.trim();
String lowercase = trimmed.toLowerCase(Locale.ROOT);
diff --git a/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java b/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java
index f25cd59e5..3ca34c5f0 100644
--- a/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java
+++ b/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java
@@ -1,69 +1,76 @@
package org.commonmark.internal.util;
+import org.commonmark.internal.inline.Scanner;
+
public class LinkScanner {
/**
- * Attempt to scan the contents of a link label (inside the brackets), returning the position after the content or
- * -1. The returned position can either be the closing {@code ]}, or the end of the line if the label continues on
+ * Attempt to scan the contents of a link label (inside the brackets), stopping after the content or returning false.
+ * The stopped position can be either the closing {@code ]}, or the end of the line if the label continues on
* the next line.
*/
- public static int scanLinkLabelContent(CharSequence input, int start) {
- for (int i = start; i < input.length(); i++) {
- char c = input.charAt(i);
- switch (c) {
+ public static boolean scanLinkLabelContent(Scanner scanner) {
+ while (scanner.hasNext()) {
+ switch (scanner.peek()) {
case '\\':
- if (Parsing.isEscapable(input, i + 1)) {
- i += 1;
+ scanner.next();
+ if (Parsing.isEscapable(scanner.peek())) {
+ scanner.next();
}
break;
case ']':
- return i;
+ return true;
case '[':
// spec: Unescaped square bracket characters are not allowed inside the opening and closing
// square brackets of link labels.
- return -1;
+ return false;
+ default:
+ scanner.next();
}
}
- return input.length();
+ return true;
}
/**
- * Attempt to scan a link destination, returning the position after the destination or -1.
+ * Attempt to scan a link destination, stopping after the destination or returning false.
*/
- public static int scanLinkDestination(CharSequence input, int start) {
- if (start >= input.length()) {
- return -1;
+ public static boolean scanLinkDestination(Scanner scanner) {
+ if (!scanner.hasNext()) {
+ return false;
}
- if (input.charAt(start) == '<') {
- for (int i = start + 1; i < input.length(); i++) {
- char c = input.charAt(i);
- switch (c) {
+ if (scanner.next('<')) {
+ while (scanner.hasNext()) {
+ switch (scanner.peek()) {
case '\\':
- if (Parsing.isEscapable(input, i + 1)) {
- i += 1;
+ scanner.next();
+ if (Parsing.isEscapable(scanner.peek())) {
+ scanner.next();
}
break;
case '\n':
case '<':
- return -1;
+ return false;
case '>':
- return i + 1;
+ scanner.next();
+ return true;
+ default:
+ scanner.next();
}
}
- return -1;
+ return false;
} else {
- return scanLinkDestinationWithBalancedParens(input, start);
+ return scanLinkDestinationWithBalancedParens(scanner);
}
}
- public static int scanLinkTitle(CharSequence input, int start) {
- if (start >= input.length()) {
- return -1;
+ public static boolean scanLinkTitle(Scanner scanner) {
+ if (!scanner.hasNext()) {
+ return false;
}
char endDelimiter;
- switch (input.charAt(start)) {
+ switch (scanner.peek()) {
case '"':
endDelimiter = '"';
break;
@@ -74,75 +81,83 @@ public static int scanLinkTitle(CharSequence input, int start) {
endDelimiter = ')';
break;
default:
- return -1;
+ return false;
}
+ scanner.next();
- int afterContent = scanLinkTitleContent(input, start + 1, endDelimiter);
- if (afterContent == -1) {
- return -1;
+ if (!scanLinkTitleContent(scanner, endDelimiter)) {
+ return false;
}
-
- if (afterContent >= input.length() || input.charAt(afterContent) != endDelimiter) {
- // missing or wrong end delimiter
- return -1;
+ if (!scanner.hasNext()) {
+ return false;
}
-
- return afterContent + 1;
+ scanner.next();
+ return true;
}
- public static int scanLinkTitleContent(CharSequence input, int start, char endDelimiter) {
- for (int i = start; i < input.length(); i++) {
- char c = input.charAt(i);
- if (c == '\\' && Parsing.isEscapable(input, i + 1)) {
- i += 1;
+ public static boolean scanLinkTitleContent(Scanner scanner, char endDelimiter) {
+ while (scanner.hasNext()) {
+ char c = scanner.peek();
+ if (c == '\\') {
+ scanner.next();
+ if (Parsing.isEscapable(scanner.peek())) {
+ scanner.next();
+ }
} else if (c == endDelimiter) {
- return i;
+ return true;
} else if (endDelimiter == ')' && c == '(') {
// unescaped '(' in title within parens is invalid
- return -1;
+ return false;
+ } else {
+ scanner.next();
}
}
- return input.length();
+ return true;
}
// spec: a nonempty sequence of characters that does not start with <, does not include ASCII space or control
// characters, and includes parentheses only if (a) they are backslash-escaped or (b) they are part of a balanced
// pair of unescaped parentheses
- private static int scanLinkDestinationWithBalancedParens(CharSequence input, int start) {
+ private static boolean scanLinkDestinationWithBalancedParens(Scanner scanner) {
int parens = 0;
- for (int i = start; i < input.length(); i++) {
- char c = input.charAt(i);
+ boolean empty = true;
+ while (scanner.hasNext()) {
+ char c = scanner.peek();
switch (c) {
- case '\0':
case ' ':
- return i != start ? i : -1;
+ return !empty;
case '\\':
- if (Parsing.isEscapable(input, i + 1)) {
- i += 1;
+ scanner.next();
+ if (Parsing.isEscapable(scanner.peek())) {
+ scanner.next();
}
break;
case '(':
parens++;
// Limit to 32 nested parens for pathological cases
if (parens > 32) {
- return -1;
+ return false;
}
+ scanner.next();
break;
case ')':
if (parens == 0) {
- return i;
+ return true;
} else {
parens--;
}
+ scanner.next();
break;
default:
// or control character
if (Character.isISOControl(c)) {
- return i != start ? i : -1;
+ return !empty;
}
+ scanner.next();
break;
}
+ empty = false;
}
- return input.length();
+ return true;
}
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java
index 6c59b7255..10be523bb 100644
--- a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java
+++ b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java
@@ -72,43 +72,41 @@ public static boolean isSpaceOrTab(CharSequence s, int index) {
return false;
}
- public static boolean isEscapable(CharSequence s, int index) {
- if (index < s.length()) {
- switch (s.charAt(index)) {
- case '!':
- case '"':
- case '#':
- case '$':
- case '%':
- case '&':
- case '\'':
- case '(':
- case ')':
- case '*':
- case '+':
- case ',':
- case '-':
- case '.':
- case '/':
- case ':':
- case ';':
- case '<':
- case '=':
- case '>':
- case '?':
- case '@':
- case '[':
- case '\\':
- case ']':
- case '^':
- case '_':
- case '`':
- case '{':
- case '|':
- case '}':
- case '~':
- return true;
- }
+ public static boolean isEscapable(char c) {
+ switch (c) {
+ case '!':
+ case '"':
+ case '#':
+ case '$':
+ case '%':
+ case '&':
+ case '\'':
+ case '(':
+ case ')':
+ case '*':
+ case '+':
+ case ',':
+ case '-':
+ case '.':
+ case '/':
+ case ':':
+ case ';':
+ case '<':
+ case '=':
+ case '>':
+ case '?':
+ case '@':
+ case '[':
+ case '\\':
+ case ']':
+ case '^':
+ case '_':
+ case '`':
+ case '{':
+ case '|':
+ case '}':
+ case '~':
+ return true;
}
return false;
}
diff --git a/commonmark/src/test/java/org/commonmark/internal/inline/ScannerTest.java b/commonmark/src/test/java/org/commonmark/internal/inline/ScannerTest.java
new file mode 100644
index 000000000..ed8e958ad
--- /dev/null
+++ b/commonmark/src/test/java/org/commonmark/internal/inline/ScannerTest.java
@@ -0,0 +1,20 @@
+package org.commonmark.internal.inline;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class ScannerTest {
+
+ @Test
+ public void testNext() {
+ Scanner scanner = new Scanner("foo bar", 4);
+ assertEquals('b', scanner.peek());
+ scanner.next();
+ assertEquals('a', scanner.peek());
+ scanner.next();
+ assertEquals('r', scanner.peek());
+ scanner.next();
+ assertEquals('\0', scanner.peek());
+ }
+}
From 9b56ee4c723ea87879f468be8f27bade256806cd Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Tue, 28 Jul 2020 20:26:18 +1000
Subject: [PATCH 073/450] Rewrite remaining code in InlineParserImpl to use
Scanner
---
.../java/org/commonmark/internal/Bracket.java | 18 +-
.../commonmark/internal/InlineParserImpl.java | 213 +++++++-----------
.../commonmark/internal/inline/Scanner.java | 9 +
3 files changed, 100 insertions(+), 140 deletions(-)
diff --git a/commonmark/src/main/java/org/commonmark/internal/Bracket.java b/commonmark/src/main/java/org/commonmark/internal/Bracket.java
index 70a8a6e25..f66a79279 100644
--- a/commonmark/src/main/java/org/commonmark/internal/Bracket.java
+++ b/commonmark/src/main/java/org/commonmark/internal/Bracket.java
@@ -1,5 +1,6 @@
package org.commonmark.internal;
+import org.commonmark.internal.inline.Position;
import org.commonmark.node.Text;
/**
@@ -8,7 +9,10 @@
public class Bracket {
public final Text node;
- public final int index;
+ /**
+ * The position of the content (after the opening bracket)
+ */
+ public final Position contentPosition;
public final boolean image;
/**
@@ -31,17 +35,17 @@ public class Bracket {
*/
public boolean bracketAfter = false;
- static public Bracket link(Text node, int index, Bracket previous, Delimiter previousDelimiter) {
- return new Bracket(node, index, previous, previousDelimiter, false);
+ static public Bracket link(Text node, Position contentPosition, Bracket previous, Delimiter previousDelimiter) {
+ return new Bracket(node, contentPosition, previous, previousDelimiter, false);
}
- static public Bracket image(Text node, int index, Bracket previous, Delimiter previousDelimiter) {
- return new Bracket(node, index, previous, previousDelimiter, true);
+ static public Bracket image(Text node, Position contentPosition, Bracket previous, Delimiter previousDelimiter) {
+ return new Bracket(node, contentPosition, previous, previousDelimiter, true);
}
- private Bracket(Text node, int index, Bracket previous, Delimiter previousDelimiter, boolean image) {
+ private Bracket(Text node, Position contentPosition, Bracket previous, Delimiter previousDelimiter, boolean image) {
this.node = node;
- this.index = index;
+ this.contentPosition = contentPosition;
this.image = image;
this.previous = previous;
this.previousDelimiter = previousDelimiter;
diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
index ae6598c91..cd04ced92 100644
--- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
+++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
@@ -10,7 +10,6 @@
import org.commonmark.parser.delimiter.DelimiterProcessor;
import java.util.*;
-import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class InlineParserImpl implements InlineParser, InlineParserState {
@@ -19,12 +18,8 @@ public class InlineParserImpl implements InlineParser, InlineParserState {
private static final Pattern PUNCTUATION = Pattern
.compile("^[" + ASCII_PUNCTUATION + "\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}]");
- private static final Pattern SPNL = Pattern.compile("^ *(?:\n *)?");
-
private static final Pattern UNICODE_WHITESPACE_CHAR = Pattern.compile("^[\\p{Zs}\t\r\n\f]");
- private static final Pattern WHITESPACE = Pattern.compile("\\s+");
-
private final BitSet specialCharacters;
private final BitSet delimiterCharacters;
private final Map delimiterProcessors;
@@ -93,10 +88,6 @@ public Scanner scanner() {
return new Scanner(input, index);
}
- private void setPosition(Position position) {
- index = position.getIndex();
- }
-
private static void addDelimiterProcessors(Iterable delimiterProcessors, Map map) {
for (DelimiterProcessor delimiterProcessor : delimiterProcessors) {
char opening = delimiterProcessor.getOpeningCharacter();
@@ -152,6 +143,10 @@ public void parse(String content, Node block) {
mergeChildTextNodes(block);
}
+ void setPosition(Position position) {
+ index = position.getIndex();
+ }
+
void reset(String content) {
this.input = content;
this.index = 0;
@@ -159,22 +154,18 @@ void reset(String content) {
this.lastBracket = null;
}
-
- private Text text(String text, int beginIndex, int endIndex) {
- return new Text(text.substring(beginIndex, endIndex));
- }
-
private Text text(String text) {
return new Text(text);
}
/**
- * Parse the next inline element in subject, advancing input index.
+ * Parse the next inline element in subject, advancing our position.
* On success, return the new inline node.
* On failure, return null.
*/
private Node parseInline(Node previous) {
- char c = peek();
+ Scanner scanner = scanner();
+ char c = scanner.peek();
if (c == '\0') {
return null;
}
@@ -216,7 +207,8 @@ private Node parseInline(Node previous) {
if (node != null) {
return node;
} else {
- index++;
+ scanner.next();
+ setPosition(scanner.position());
// When we get here, it's only for a single special character that turned out to not have a special meaning.
// So we shouldn't have a single surrogate here, hence it should be ok to turn it into a String.
String literal = String.valueOf(c);
@@ -224,46 +216,6 @@ private Node parseInline(Node previous) {
}
}
- /**
- * If RE matches at current index in the input, advance index and return the match; otherwise return null.
- */
- private String match(Pattern re) {
- if (index >= input.length()) {
- return null;
- }
- try {
- Matcher matcher = re.matcher(input);
- matcher.region(index, input.length());
- boolean m = matcher.find();
- if (m) {
- index = matcher.end();
- return matcher.group();
- } else {
- return null;
- }
- } catch (StackOverflowError e) {
- return null;
- }
- }
-
- /**
- * Returns the char at the current input index, or {@code '\0'} in case there are no more characters.
- */
- private char peek() {
- if (index < input.length()) {
- return input.charAt(index);
- } else {
- return '\0';
- }
- }
-
- /**
- * Parse zero or more space characters, including at most one newline.
- */
- private void spnl() {
- match(SPNL);
- }
-
/**
* Attempt to parse delimiters like emphasis, strong emphasis or custom delimiters.
*/
@@ -272,16 +224,13 @@ private Node parseDelimiters(DelimiterProcessor delimiterProcessor, char delimit
if (res == null) {
return null;
}
- int length = res.count;
- int startIndex = index;
- index += length;
- Text node = text(input, startIndex, index);
+ Text node = res.text;
// Add entry to stack for this opener
lastDelimiter = new Delimiter(node, delimiterChar, res.canOpen, res.canClose, lastDelimiter);
- lastDelimiter.length = length;
- lastDelimiter.originalLength = length;
+ lastDelimiter.length = res.count;
+ lastDelimiter.originalLength = res.count;
if (lastDelimiter.previous != null) {
lastDelimiter.previous.next = lastDelimiter;
}
@@ -293,13 +242,15 @@ private Node parseDelimiters(DelimiterProcessor delimiterProcessor, char delimit
* Add open bracket to delimiter stack and add a text node to block's children.
*/
private Node parseOpenBracket() {
- int startIndex = index;
- index++;
+ Scanner scanner = scanner();
+ scanner.next();
+ Position start = scanner.position();
+ setPosition(start);
Text node = text("[");
// Add entry to stack for this opener
- addBracket(Bracket.link(node, startIndex, lastBracket, lastDelimiter));
+ addBracket(Bracket.link(node, start, lastBracket, lastDelimiter));
return node;
}
@@ -309,18 +260,18 @@ private Node parseOpenBracket() {
* Otherwise just add a text node.
*/
private Node parseBang() {
- int startIndex = index;
- index++;
- if (peek() == '[') {
- index++;
-
+ Scanner scanner = scanner();
+ scanner.next();
+ if (scanner.next('[')) {
Text node = text("`
- if (peek() == '(') {
- index++;
- spnl();
- if ((dest = parseLinkDestination()) != null) {
- spnl();
+ if (scanner.next('(')) {
+ scanner.whitespace();
+ dest = parseLinkDestination(scanner);
+ if (dest != null) {
+ int whitespace = scanner.whitespace();
// title needs a whitespace before
- if (WHITESPACE.matcher(input.substring(index - 1, index)).matches()) {
- title = parseLinkTitle();
- spnl();
+ if (whitespace >= 1) {
+ title = parseLinkTitle(scanner);
+ scanner.whitespace();
}
- if (peek() == ')') {
- index++;
- isLinkOrImage = true;
- } else {
- index = startIndex;
+ if (!scanner.next(')')) {
+ // Don't have a closing `)`, so it's not a destination and title -> reset.
+ // Note that something like `[foo](` could be valid, `(` will just be text.
+ scanner = scanner();
+ dest = null;
+ title = null;
}
}
}
- // Maybe a reference link like `[foo][bar]`, `[foo][]` or `[foo]`
- if (!isLinkOrImage) {
-
+ // Maybe a reference link like `[foo][bar]`, `[foo][]` or `[foo]`.
+ // Note that even `[foo](` could be a valid link if there's a reference, which is why this is not just an `else`
+ // here.
+ if (dest == null) {
// See if there's a link label like `[bar]` or `[]`
- String ref = parseLinkLabel();
+ String ref = parseLinkLabel(scanner);
if ((ref == null || ref.isEmpty()) && !opener.bracketAfter) {
// If the second label is empty `[foo][]` or missing `[foo]`, then the first label is the reference.
// But it can only be a reference when there's no (unescaped) bracket in it.
// If there is, we don't even need to try to look up the reference. This is an optimization.
- ref = input.substring(opener.index, startIndex);
- // Strip '[' and ']'
- ref = ref.substring(1, ref.length() - 1);
+ ref = scanner.textBetween(opener.contentPosition, beforeClose).toString();
}
if (ref != null) {
@@ -392,12 +343,11 @@ private Node parseCloseBracket() {
if (definition != null) {
dest = definition.getDestination();
title = definition.getTitle();
- isLinkOrImage = true;
}
}
}
- if (isLinkOrImage) {
+ if (dest != null) {
// If we got here, open is a potential opener
Node linkOrImage = opener.image ? new Image(dest, title) : new Link(dest, title);
@@ -427,10 +377,12 @@ private Node parseCloseBracket() {
}
}
+ setPosition(scanner.position());
+
return linkOrImage;
- } else { // no link or image
- index = startIndex;
+ } else {
+ // No link or image, parse just the bracket as text and continue
removeLastBracket();
return text("]");
@@ -451,8 +403,7 @@ private void removeLastBracket() {
/**
* Attempt to parse link destination, returning the string or null if no match.
*/
- private String parseLinkDestination() {
- Scanner scanner = scanner();
+ private String parseLinkDestination(Scanner scanner) {
char delimiter = scanner.peek();
Position start = scanner.position();
if (!LinkScanner.scanLinkDestination(scanner)) {
@@ -468,15 +419,13 @@ private String parseLinkDestination() {
dest = scanner.textBetween(start, scanner.position()).toString();
}
- setPosition(scanner.position());
return Escaping.unescapeString(dest);
}
/**
* Attempt to parse link title (sans quotes), returning the string or null if no match.
*/
- private String parseLinkTitle() {
- Scanner scanner = scanner();
+ private String parseLinkTitle(Scanner scanner) {
Position start = scanner.position();
if (!LinkScanner.scanLinkTitle(scanner)) {
return null;
@@ -485,15 +434,13 @@ private String parseLinkTitle() {
// chop off ', " or parens
CharSequence rawTitle = scanner.textBetween(start, scanner.position());
String title = rawTitle.subSequence(1, rawTitle.length() - 1).toString();
- setPosition(scanner.position());
return Escaping.unescapeString(title);
}
/**
* Attempt to parse a link label, returning the label between the brackets or null.
*/
- String parseLinkLabel() {
- Scanner scanner = scanner();
+ String parseLinkLabel(Scanner scanner) {
if (!scanner.next('[')) {
return null;
}
@@ -514,24 +461,26 @@ String parseLinkLabel() {
return null;
}
- setPosition(scanner.position());
return content;
}
/**
- * Parse a run of ordinary characters, or a single character with a special meaning in markdown, as a plain string.
+ * Parse a run of non-special characters as plain text.
*/
private Node parseString() {
- int begin = index;
- int length = input.length();
- while (index != length) {
- if (specialCharacters.get(input.charAt(index))) {
+ Scanner scanner = scanner();
+ Position start = scanner.position();
+ while (scanner.hasNext()) {
+ if (specialCharacters.get(scanner.peek())) {
break;
}
- index++;
+ scanner.next();
}
- if (begin != index) {
- return text(input, begin, index);
+
+ String text = scanner.textBetween(start, scanner.position()).toString();
+ if (!text.isEmpty()) {
+ setPosition(scanner.position());
+ return text(text);
} else {
return null;
}
@@ -544,25 +493,20 @@ private Node parseString() {
* @return information about delimiter run, or {@code null}
*/
private DelimiterData scanDelimiters(DelimiterProcessor delimiterProcessor, char delimiterChar) {
- int startIndex = index;
+ Scanner scanner = scanner();
+ char charBefore = scanner.peekPrevious();
+ Position start = scanner.position();
- int delimiterCount = 0;
- while (peek() == delimiterChar) {
- delimiterCount++;
- index++;
- }
+ int delimiterCount = scanner.matchMultiple(delimiterChar);
if (delimiterCount < delimiterProcessor.getMinLength()) {
- index = startIndex;
+ setPosition(start);
return null;
}
- String before = startIndex == 0 ? "\n" :
- input.substring(startIndex - 1, startIndex);
-
- char charAfter = peek();
- String after = charAfter == '\0' ? "\n" :
- String.valueOf(charAfter);
+ char charAfter = scanner.peek();
+ String before = charBefore == '\0' ? "\n" : String.valueOf(charBefore);
+ String after = charAfter == '\0' ? "\n" : String.valueOf(charAfter);
// We could be more lazy here, in most cases we don't need to do every match case.
boolean beforeIsPunctuation = PUNCTUATION.matcher(before).matches();
@@ -584,8 +528,9 @@ private DelimiterData scanDelimiters(DelimiterProcessor delimiterProcessor, char
canClose = rightFlanking && delimiterChar == delimiterProcessor.getClosingCharacter();
}
- index = startIndex;
- return new DelimiterData(delimiterCount, canOpen, canClose);
+ setPosition(scanner.position());
+ String text = scanner.textBetween(start, scanner.position()).toString();
+ return new DelimiterData(delimiterCount, canOpen, canClose, new Text(text));
}
private void processDelimiters(Delimiter stackBottom) {
@@ -789,11 +734,13 @@ private static class DelimiterData {
final int count;
final boolean canClose;
final boolean canOpen;
+ final Text text;
- DelimiterData(int count, boolean canOpen, boolean canClose) {
+ DelimiterData(int count, boolean canOpen, boolean canClose, Text text) {
this.count = count;
this.canOpen = canOpen;
this.canClose = canClose;
+ this.text = text;
}
}
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
index cb87248d1..3db930e0b 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
@@ -22,6 +22,15 @@ public char peek() {
}
}
+ public char peekPrevious() {
+ int prev = index - 1;
+ if (prev >= 0 && prev < input.length()) {
+ return input.charAt(prev);
+ } else {
+ return '\0';
+ }
+ }
+
public boolean hasNext() {
return index < input.length();
}
From cc85e20569c931cc72bf0bd8fd5660cdcf69b312 Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Thu, 30 Jul 2020 20:41:48 +1000
Subject: [PATCH 074/450] Use a list of lines instead of a single string for
inline parsing
This is a big one, and was only possible because of the new Scanner
infrastructure. This has a couple of advantages:
* Less string copying and garbage because we no longer need to have all
of a block's content in a single contiguous string, we can keep the lines
we use for block parsing directly
* Because we have individual lines, we could also keep source positions
and get source positions for inline nodes :wow:!
---
.../gfm/tables/internal/TableBlockParser.java | 20 +--
.../commonmark/internal/DocumentParser.java | 19 ++-
.../commonmark/internal/HeadingParser.java | 40 +++---
.../commonmark/internal/InlineParserImpl.java | 127 +++++++++++-------
.../LinkReferenceDefinitionParser.java | 29 ++--
.../commonmark/internal/ListBlockParser.java | 2 +-
.../commonmark/internal/ParagraphParser.java | 12 +-
.../internal/inline/AutolinkInlineParser.java | 2 +-
.../inline/BackslashInlineParser.java | 2 +-
.../inline/BackticksInlineParser.java | 2 +-
.../internal/inline/EntityInlineParser.java | 2 +-
.../internal/inline/HtmlInlineParser.java | 3 +-
.../internal/inline/InlineContentParser.java | 4 +-
.../inline/LineBreakInlineContentParser.java | 39 ------
.../commonmark/internal/inline/Position.java | 9 +-
.../commonmark/internal/inline/Scanner.java | 108 ++++++++++++---
.../org/commonmark/internal/util/Parsing.java | 22 ++-
.../org/commonmark/parser/InlineParser.java | 7 +-
.../parser/block/MatchedBlockParser.java | 9 +-
.../LinkReferenceDefinitionParserTest.java | 34 +++--
.../internal/inline/ScannerTest.java | 68 +++++++++-
.../java/org/commonmark/test/ParserTest.java | 2 +-
.../org/commonmark/test/SpecialInputTest.java | 7 +
23 files changed, 358 insertions(+), 211 deletions(-)
delete mode 100644 commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java
diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableBlockParser.java b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableBlockParser.java
index 2952a8785..107aa2167 100644
--- a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableBlockParser.java
+++ b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableBlockParser.java
@@ -1,20 +1,11 @@
package org.commonmark.ext.gfm.tables.internal;
-import org.commonmark.ext.gfm.tables.TableBlock;
-import org.commonmark.ext.gfm.tables.TableBody;
-import org.commonmark.ext.gfm.tables.TableCell;
-import org.commonmark.ext.gfm.tables.TableHead;
-import org.commonmark.ext.gfm.tables.TableRow;
+import org.commonmark.ext.gfm.tables.*;
import org.commonmark.node.Block;
import org.commonmark.node.Node;
import org.commonmark.node.SourceSpan;
import org.commonmark.parser.InlineParser;
-import org.commonmark.parser.block.AbstractBlockParser;
-import org.commonmark.parser.block.AbstractBlockParserFactory;
-import org.commonmark.parser.block.BlockContinue;
-import org.commonmark.parser.block.BlockStart;
-import org.commonmark.parser.block.MatchedBlockParser;
-import org.commonmark.parser.block.ParserState;
+import org.commonmark.parser.block.*;
import java.util.ArrayList;
import java.util.Collections;
@@ -116,7 +107,7 @@ private TableCell parseCell(CellSource cell, int column, InlineParser inlinePars
tableCell.setSourceSpans(Collections.singletonList(cell.sourceSpan));
}
- inlineParser.parse(cell.content, tableCell);
+ inlineParser.parse(Collections.singletonList(cell.content), tableCell);
return tableCell;
}
@@ -246,11 +237,12 @@ public static class Factory extends AbstractBlockParserFactory {
@Override
public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) {
CharSequence line = state.getLine();
- CharSequence paragraph = matchedBlockParser.getParagraphContent();
- if (paragraph != null && paragraph.toString().contains("|") && !paragraph.toString().contains("\n")) {
+ List paragraphLines = matchedBlockParser.getParagraphLines();
+ if (paragraphLines.size() == 1 && paragraphLines.get(0).toString().contains("|")) {
CharSequence separatorLine = line.subSequence(state.getIndex(), line.length());
List columns = parseSeparator(separatorLine);
if (columns != null && !columns.isEmpty()) {
+ CharSequence paragraph = paragraphLines.get(0);
List headerCells = split(paragraph, null);
if (columns.size() >= headerCells.size()) {
return BlockStart.of(new TableBlockParser(columns, paragraph))
diff --git a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java
index 51791d90b..5c3cdfd3d 100644
--- a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java
@@ -425,7 +425,10 @@ private void addLine() {
}
sb.append(rest);
content = sb.toString();
+ } else if (index == 0) {
+ content = line;
} else {
+ // TODO: Maybe we should bring back Subsequence here?
content = line.subSequence(index, line.length());
}
getActiveBlockParser().addLine(content);
@@ -458,9 +461,8 @@ private BlockStartImpl findBlockStart(BlockParser blockParser) {
}
/**
- * Finalize a block. Close it and do any necessary postprocessing, e.g. creating string_content from strings,
- * setting the 'tight' or 'loose' status of a list, and parsing the beginnings of paragraphs for reference
- * definitions.
+ * Finalize a block. Close it and do any necessary postprocessing, e.g. setting the content of blocks and
+ * collecting link reference definitions from paragraphs.
*/
private void finalize(BlockParser blockParser) {
if (blockParser instanceof ParagraphParser) {
@@ -567,17 +569,12 @@ public BlockParser getMatchedBlockParser() {
}
@Override
- public CharSequence getParagraphContent() {
+ public List getParagraphLines() {
if (matchedBlockParser instanceof ParagraphParser) {
ParagraphParser paragraphParser = (ParagraphParser) matchedBlockParser;
- CharSequence content = paragraphParser.getContentString();
- if (content.length() == 0) {
- return null;
- }
-
- return content;
+ return Collections.unmodifiableList(paragraphParser.getParagraphLines());
}
- return null;
+ return Collections.emptyList();
}
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/HeadingParser.java b/commonmark/src/main/java/org/commonmark/internal/HeadingParser.java
index 2b72ba236..88bf002f0 100644
--- a/commonmark/src/main/java/org/commonmark/internal/HeadingParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/HeadingParser.java
@@ -6,12 +6,15 @@
import org.commonmark.parser.InlineParser;
import org.commonmark.parser.block.*;
+import java.util.Collections;
+import java.util.List;
+
public class HeadingParser extends AbstractBlockParser {
private final Heading block = new Heading();
- private final String content;
+ private final List content;
- public HeadingParser(int level, String content) {
+ public HeadingParser(int level, List content) {
block.setLevel(level);
this.content = content;
}
@@ -49,10 +52,9 @@ public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockPar
int setextHeadingLevel = getSetextHeadingLevel(line, nextNonSpace);
if (setextHeadingLevel > 0) {
- CharSequence paragraph = matchedBlockParser.getParagraphContent();
- if (paragraph != null) {
- String content = paragraph.toString();
- return BlockStart.of(new HeadingParser(setextHeadingLevel, content))
+ List paragraph = matchedBlockParser.getParagraphLines();
+ if (!paragraph.isEmpty()) {
+ return BlockStart.of(new HeadingParser(setextHeadingLevel, paragraph))
.atIndex(line.length())
.replaceActiveBlockParser();
}
@@ -73,25 +75,29 @@ private static HeadingParser getAtxHeading(CharSequence line, int index) {
return null;
}
- int start = index + level;
- if (start >= line.length()) {
+ int afterMarker = index + level;
+ if (afterMarker >= line.length()) {
// End of line after markers is an empty heading
- return new HeadingParser(level, "");
+ return new HeadingParser(level, Collections.emptyList());
}
- char next = line.charAt(start);
+ char next = line.charAt(afterMarker);
if (!(next == ' ' || next == '\t')) {
return null;
}
- int beforeSpace = Parsing.skipSpaceTabBackwards(line, line.length() - 1, start);
- int beforeHash = Parsing.skipBackwards('#', line, beforeSpace, start);
- int beforeTrailer = Parsing.skipSpaceTabBackwards(line, beforeHash, start);
- if (beforeTrailer != beforeHash) {
- return new HeadingParser(level, line.subSequence(start, beforeTrailer + 1).toString());
- } else {
- return new HeadingParser(level, line.subSequence(start, beforeSpace + 1).toString());
+ int start = Parsing.skipSpaceTab(line, afterMarker, line.length());
+
+ int beforeSpace = Parsing.skipSpaceTabBackwards(line, line.length() - 1, afterMarker);
+ int beforeHash = Parsing.skipBackwards('#', line, beforeSpace, afterMarker);
+ int beforeTrailer = Parsing.skipSpaceTabBackwards(line, beforeHash, afterMarker);
+ // Trailing `#` need to be separated with at least one space/tab, otherwise they are part of the content.
+ int end = (beforeTrailer < beforeHash) ? beforeTrailer + 1 : beforeSpace + 1;
+ if (start >= end) {
+ // Empty, e.g. `### ###`
+ return new HeadingParser(level, Collections.emptyList());
}
+ return new HeadingParser(level, Collections.singletonList(line.subSequence(start, end)));
}
// spec: A setext heading underline is a sequence of = characters or a sequence of - characters, with no more than
diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
index cd04ced92..eee25af40 100644
--- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
+++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
@@ -4,6 +4,7 @@
import org.commonmark.internal.inline.*;
import org.commonmark.internal.util.Escaping;
import org.commonmark.internal.util.LinkScanner;
+import org.commonmark.internal.util.Parsing;
import org.commonmark.node.*;
import org.commonmark.parser.InlineParser;
import org.commonmark.parser.InlineParserContext;
@@ -26,8 +27,11 @@ public class InlineParserImpl implements InlineParser, InlineParserState {
private final InlineParserContext context;
private final Map> inlineParsers;
- private String input;
+ // TODO: Should we just keep a scanner here instead?
+ private List lines;
+ private int lineIndex;
private int index;
+ private int trailingSpaces;
/**
* Top delimiter (emphasis, strong emphasis or custom emphasis). (Brackets are on a separate stack, different
@@ -45,7 +49,6 @@ public InlineParserImpl(InlineParserContext inlineParserContext) {
this.context = inlineParserContext;
this.inlineParsers = new HashMap<>();
- this.inlineParsers.put('\n', Collections.singletonList(new LineBreakInlineContentParser()));
this.inlineParsers.put('\\', Collections.singletonList(new BackslashInlineParser()));
this.inlineParsers.put('`', Collections.singletonList(new BackticksInlineParser()));
this.inlineParsers.put('&', Collections.singletonList(new EntityInlineParser()));
@@ -72,6 +75,7 @@ public static BitSet calculateSpecialCharacters(BitSet delimiterCharacters, Set<
bitSet.set('[');
bitSet.set(']');
bitSet.set('!');
+ bitSet.set('\n');
return bitSet;
}
@@ -85,7 +89,7 @@ public static Map calculateDelimiterProcessors(Li
// TODO: The implementation shouldn't be public
@Override
public Scanner scanner() {
- return new Scanner(input, index);
+ return new Scanner(lines, lineIndex, index);
}
private static void addDelimiterProcessors(Iterable delimiterProcessors, Map map) {
@@ -122,16 +126,14 @@ private static void addDelimiterProcessorForChar(char delimiterChar, DelimiterPr
}
/**
- * Parse content in block into inline children, using reference map to resolve references.
+ * Parse content in block into inline children, appending them to the block node.
*/
@Override
- public void parse(String content, Node block) {
- reset(content.trim());
+ public void parse(List lines, Node block) {
+ reset(lines);
- Node previous = null;
while (true) {
- Node node = parseInline(previous);
- previous = node;
+ Node node = parseInline();
if (node != null) {
block.appendChild(node);
} else {
@@ -144,12 +146,15 @@ public void parse(String content, Node block) {
}
void setPosition(Position position) {
+ lineIndex = position.getLineIndex();
index = position.getIndex();
}
- void reset(String content) {
- this.input = content;
+ void reset(List lines) {
+ this.lines = lines;
+ this.lineIndex = 0;
this.index = 0;
+ this.trailingSpaces = 0;
this.lastDelimiter = null;
this.lastBracket = null;
}
@@ -163,7 +168,7 @@ private Text text(String text) {
* On success, return the new inline node.
* On failure, return null.
*/
- private Node parseInline(Node previous) {
+ private Node parseInline() {
Scanner scanner = scanner();
char c = scanner.peek();
if (c == '\0') {
@@ -173,8 +178,7 @@ private Node parseInline(Node previous) {
List inlineParsers = this.inlineParsers.get(c);
if (inlineParsers != null) {
for (InlineContentParser inlineParser : inlineParsers) {
- // TODO: Should we pass the whole previous node or can we make the API surface smaller?
- ParsedInline parsedInline = inlineParser.tryParse(this, previous);
+ ParsedInline parsedInline = inlineParser.tryParse(this);
if (parsedInline instanceof ParsedInlineImpl) {
ParsedInlineImpl parsedInlineImpl = (ParsedInlineImpl) parsedInline;
setPosition(parsedInlineImpl.getPosition());
@@ -183,37 +187,44 @@ private Node parseInline(Node previous) {
}
}
- Node node;
switch (c) {
case '[':
- node = parseOpenBracket();
- break;
+ return parseOpenBracket();
case '!':
- node = parseBang();
- break;
+ return parseBang();
case ']':
- node = parseCloseBracket();
- break;
- default:
- boolean isDelimiter = delimiterCharacters.get(c);
- if (isDelimiter) {
- DelimiterProcessor delimiterProcessor = delimiterProcessors.get(c);
- node = parseDelimiters(delimiterProcessor, c);
- } else {
- node = parseString();
- }
- break;
+ return parseCloseBracket();
+ case '\n':
+ return parseLineBreak();
}
- if (node != null) {
- return node;
- } else {
- scanner.next();
- setPosition(scanner.position());
- // When we get here, it's only for a single special character that turned out to not have a special meaning.
- // So we shouldn't have a single surrogate here, hence it should be ok to turn it into a String.
- String literal = String.valueOf(c);
- return text(literal);
+
+ boolean isDelimiter = delimiterCharacters.get(c);
+ if (isDelimiter) {
+ DelimiterProcessor delimiterProcessor = delimiterProcessors.get(c);
+ Node delimiterNode = parseDelimiters(delimiterProcessor, c);
+ if (delimiterNode != null) {
+ return delimiterNode;
+ }
}
+
+ // If we get here, even for a special/delimiter character, we will just treat it as text.
+ return parseText();
+// } else {
+// node = parseString();
+// }
+// break;
+//
+// Node node;
+// if (node != null) {
+// return node;
+// } else {
+// scanner.next();
+// setPosition(scanner.position());
+// // When we get here, it's only for a single special character that turned out to not have a special meaning.
+// // So we shouldn't have a single surrogate here, hence it should be ok to turn it into a String.
+// String literal = String.valueOf(c);
+// return text(literal);
+// }
}
/**
@@ -464,12 +475,25 @@ String parseLinkLabel(Scanner scanner) {
return content;
}
+ private Node parseLineBreak() {
+ Scanner scanner = scanner();
+ scanner.next();
+ setPosition(scanner.position());
+
+ if (trailingSpaces >= 2) {
+ return new HardLineBreak();
+ } else {
+ return new SoftLineBreak();
+ }
+ }
+
/**
- * Parse a run of non-special characters as plain text.
+ * Parse the next character as plain text, and possibly more if the following characters are non-special.
*/
- private Node parseString() {
+ private Node parseText() {
Scanner scanner = scanner();
Position start = scanner.position();
+ scanner.next();
while (scanner.hasNext()) {
if (specialCharacters.get(scanner.peek())) {
break;
@@ -478,12 +502,21 @@ private Node parseString() {
}
String text = scanner.textBetween(start, scanner.position()).toString();
- if (!text.isEmpty()) {
- setPosition(scanner.position());
- return text(text);
- } else {
- return null;
- }
+ setPosition(scanner.position());
+
+ char c = scanner.peek();
+ if (c == '\n') {
+ // We parsed until the end of the line. Trim any trailing spaces and remember them (for hard line breaks).
+ int end = Parsing.skipBackwards(' ', text, text.length() - 1, 0) + 1;
+ trailingSpaces = text.length() - end;
+ text = text.substring(0, end);
+ } else if (c == '\0') {
+ // For the last line, both tabs and spaces are trimmed for some reason (checked with commonmark.js).
+ int end = Parsing.skipSpaceTabBackwards(text, text.length() - 1, 0) + 1;
+ text = text.substring(0, end);
+ }
+
+ return text(text);
}
/**
diff --git a/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java
index a6cd57228..7f257343b 100644
--- a/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java
@@ -8,6 +8,7 @@
import org.commonmark.node.SourceSpan;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
/**
@@ -19,7 +20,7 @@ public class LinkReferenceDefinitionParser {
private State state = State.START_DEFINITION;
- private final StringBuilder paragraph = new StringBuilder();
+ private final List paragraphLines = new ArrayList<>();
private final List definitions = new ArrayList<>();
private final List sourceSpans = new ArrayList<>();
@@ -31,20 +32,17 @@ public class LinkReferenceDefinitionParser {
private boolean referenceValid = false;
public void parse(CharSequence line) {
- if (paragraph.length() != 0) {
- paragraph.append('\n');
+ paragraphLines.add(line);
+ if (state == State.PARAGRAPH) {
+ // We're in a paragraph now. Link reference definitions can only appear at the beginning, so once
+ // we're in a paragraph, there's no going back.
+ return;
}
- paragraph.append(line);
- Scanner scanner = new Scanner(line, 0);
+ Scanner scanner = new Scanner(Collections.singletonList(line), 0, 0);
while (scanner.hasNext()) {
boolean success;
switch (state) {
- case PARAGRAPH: {
- // We're in a paragraph now. Link reference definitions can only appear at the beginning, so once
- // we're in a paragraph, there's no going back.
- return;
- }
case START_DEFINITION: {
success = startDefinition(scanner);
break;
@@ -81,8 +79,11 @@ public void addSourceSpan(SourceSpan sourceSpan) {
sourceSpans.add(sourceSpan);
}
- CharSequence getParagraphContent() {
- return paragraph;
+ /**
+ * @return the lines that are normal paragraph content, without newlines
+ */
+ List getParagraphLines() {
+ return paragraphLines;
}
List getParagraphSourceSpans() {
@@ -168,7 +169,7 @@ private boolean destination(Scanner scanner) {
// Destination was at end of line, so this is a valid reference for sure (and maybe a title).
// If not at end of line, wait for title to be valid first.
referenceValid = true;
- paragraph.setLength(0);
+ paragraphLines.clear();
} else if (whitespace == 0) {
// spec: The title must be separated from the link destination by whitespace
return false;
@@ -236,7 +237,7 @@ private boolean title(Scanner scanner) {
}
referenceValid = true;
finishReference();
- paragraph.setLength(0);
+ paragraphLines.clear();
// See if there's another definition.
state = State.START_DEFINITION;
diff --git a/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java b/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java
index de1558f92..1538ca41b 100644
--- a/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java
@@ -210,7 +210,7 @@ public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockPar
}
int markerIndex = state.getNextNonSpaceIndex();
int markerColumn = state.getColumn() + state.getIndent();
- boolean inParagraph = matchedBlockParser.getParagraphContent() != null;
+ boolean inParagraph = !matchedBlockParser.getParagraphLines().isEmpty();
ListData listData = parseList(state.getLine(), markerIndex, markerColumn, inParagraph);
if (listData == null) {
return BlockStart.none();
diff --git a/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java b/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java
index 7e28cf0fb..8962ec7da 100644
--- a/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java
@@ -49,7 +49,7 @@ public void addSourceSpan(SourceSpan sourceSpan) {
@Override
public void closeBlock() {
- if (linkReferenceDefinitionParser.getParagraphContent().length() == 0) {
+ if (linkReferenceDefinitionParser.getParagraphLines().isEmpty()) {
block.unlink();
} else {
block.setSourceSpans(linkReferenceDefinitionParser.getParagraphSourceSpans());
@@ -58,14 +58,14 @@ public void closeBlock() {
@Override
public void parseInlines(InlineParser inlineParser) {
- CharSequence content = linkReferenceDefinitionParser.getParagraphContent();
- if (content.length() > 0) {
- inlineParser.parse(content.toString(), block);
+ List lines = linkReferenceDefinitionParser.getParagraphLines();
+ if (!lines.isEmpty()) {
+ inlineParser.parse(lines, block);
}
}
- public CharSequence getContentString() {
- return linkReferenceDefinitionParser.getParagraphContent();
+ public List getParagraphLines() {
+ return linkReferenceDefinitionParser.getParagraphLines();
}
public List getDefinitions() {
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
index acf55a796..7b62d1257 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java
@@ -18,7 +18,7 @@ public class AutolinkInlineParser implements InlineContentParser {
.compile("^([a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)$");
@Override
- public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
+ public ParsedInline tryParse(InlineParserState inlineParserState) {
Scanner scanner = inlineParserState.scanner();
scanner.next();
Position start = scanner.position();
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java
index cd87f7399..f57a67a74 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java
@@ -16,7 +16,7 @@ public class BackslashInlineParser implements InlineContentParser {
private static final Pattern ESCAPABLE = Pattern.compile('^' + Escaping.ESCAPABLE);
@Override
- public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
+ public ParsedInline tryParse(InlineParserState inlineParserState) {
Scanner scanner = inlineParserState.scanner();
// Backslash
scanner.next();
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
index 9979bde86..f00079793 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java
@@ -11,7 +11,7 @@
public class BackticksInlineParser implements InlineContentParser {
@Override
- public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
+ public ParsedInline tryParse(InlineParserState inlineParserState) {
Scanner scanner = inlineParserState.scanner();
Position start = scanner.position();
int openingTicks = scanner.matchMultiple('`');
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java
index d44ee4217..f0c330002 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java
@@ -16,7 +16,7 @@ public class EntityInlineParser implements InlineContentParser {
private static final AsciiMatcher entityContinue = entityStart.newBuilder().range('0', '9').build();
@Override
- public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
+ public ParsedInline tryParse(InlineParserState inlineParserState) {
Scanner scanner = inlineParserState.scanner();
Position start = scanner.position();
// Skip `&`
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java
index 4c25c3e58..6abe12487 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java
@@ -2,7 +2,6 @@
import org.commonmark.internal.util.AsciiMatcher;
import org.commonmark.node.HtmlInline;
-import org.commonmark.node.Node;
/**
* Attempt to parse inline HTML.
@@ -28,7 +27,7 @@ public class HtmlInlineParser implements InlineContentParser {
private static final AsciiMatcher declaration = AsciiMatcher.builder().range('A', 'Z').build();
@Override
- public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
+ public ParsedInline tryParse(InlineParserState inlineParserState) {
Scanner scanner = inlineParserState.scanner();
Position start = scanner.position();
// Skip over `<`
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/InlineContentParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/InlineContentParser.java
index 76259c444..dc8c43640 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/InlineContentParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/InlineContentParser.java
@@ -1,10 +1,8 @@
package org.commonmark.internal.inline;
-import org.commonmark.node.Node;
-
// TODO: I'd prefer if this was named InlineParser, but that's already public API, hmm...
public interface InlineContentParser {
- ParsedInline tryParse(InlineParserState inlineParserState, Node previous);
+ ParsedInline tryParse(InlineParserState inlineParserState);
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java
deleted file mode 100644
index 1b85c6eec..000000000
--- a/commonmark/src/main/java/org/commonmark/internal/inline/LineBreakInlineContentParser.java
+++ /dev/null
@@ -1,39 +0,0 @@
-package org.commonmark.internal.inline;
-
-import org.commonmark.internal.util.Parsing;
-import org.commonmark.node.HardLineBreak;
-import org.commonmark.node.Node;
-import org.commonmark.node.SoftLineBreak;
-import org.commonmark.node.Text;
-
-/**
- * Parse a newline. If it was preceded by two spaces, return a hard line break; otherwise a soft line break.
- */
-public class LineBreakInlineContentParser implements InlineContentParser {
-
- @Override
- public ParsedInline tryParse(InlineParserState inlineParserState, Node previous) {
- Scanner scanner = inlineParserState.scanner();
- scanner.next();
-
- // Check previous text for trailing spaces.
- // The "endsWith" is an optimization to avoid an RE match in the common case.
- if (previous instanceof Text && ((Text) previous).getLiteral().endsWith(" ")) {
- Text text = (Text) previous;
- String literal = text.getLiteral();
- int last = literal.length() - 1;
- int nonSpace = Parsing.skipBackwards(' ', literal, last, 0);
- int spaces = last - nonSpace;
- if (spaces > 0) {
- text.setLiteral(literal.substring(0, literal.length() - spaces));
- }
- if (spaces >= 2) {
- return ParsedInline.of(new HardLineBreak(), scanner.position());
- } else {
- return ParsedInline.of(new SoftLineBreak(), scanner.position());
- }
- } else {
- return ParsedInline.of(new SoftLineBreak(), scanner.position());
- }
- }
-}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/Position.java b/commonmark/src/main/java/org/commonmark/internal/inline/Position.java
index dff5e36df..4a54cbce6 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/Position.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/Position.java
@@ -1,12 +1,19 @@
package org.commonmark.internal.inline;
public class Position {
+ final int lineIndex;
final int index;
- Position(int index) {
+ Position(int lineIndex, int index) {
+ this.lineIndex = lineIndex;
this.index = index;
}
+ // TODO: Move packages around so that this can stay package-private
+ public int getLineIndex() {
+ return lineIndex;
+ }
+
// TODO: Move packages around so that this can stay package-private
public int getIndex() {
return index;
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
index 3db930e0b..b144ba52a 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
@@ -1,42 +1,83 @@
package org.commonmark.internal.inline;
import org.commonmark.internal.util.CharMatcher;
-import org.commonmark.internal.util.Parsing;
+
+import java.util.List;
public class Scanner {
- private final CharSequence input;
+ // Lines without newlines at the end. The scanner will yield `\n` between lines because they're significant for
+ // parsing and the final output. There is no `\n` after the last line.
+ private final List lines;
+ // Which line we're at.
+ private int lineIndex;
+ // The index within the line. If index == length(), we pretend that there's a `\n` and only advance after we yield
+ // that.
private int index;
+ // Current line or "" if at the end of the lines (using "" instead of null saves a null check)
+ private CharSequence line = "";
+ private int lineLength = 0;
+
// TODO: Visibility
- public Scanner(CharSequence input, int index) {
- this.input = input;
+ public Scanner(List lines, int lineIndex, int index) {
+ this.lines = lines;
+ this.lineIndex = lineIndex;
this.index = index;
+ if (!lines.isEmpty()) {
+ line = lines.get(lineIndex);
+ lineLength = line.length();
+ }
}
public char peek() {
- if (index >= input.length()) {
- return '\0';
+ if (index < lineLength) {
+ return line.charAt(index);
} else {
- return input.charAt(index);
+ if (lineIndex < lines.size() - 1) {
+ return '\n';
+ } else {
+ // Don't return newline for end of last line
+ return '\0';
+ }
}
}
public char peekPrevious() {
- int prev = index - 1;
- if (prev >= 0 && prev < input.length()) {
- return input.charAt(prev);
+ if (index > 0) {
+ int prev = index - 1;
+ return line.charAt(prev);
} else {
- return '\0';
+ if (lineIndex > 0) {
+ return '\n';
+ } else {
+ return '\0';
+ }
}
}
public boolean hasNext() {
- return index < input.length();
+ if (index < lineLength) {
+ return true;
+ } else {
+ // No newline at end of last line
+ return lineIndex < lines.size() - 1;
+ }
}
public void next() {
index++;
+ if (index > lineLength) {
+ lineIndex++;
+ if (lineIndex < lines.size()) {
+ line = lines.get(lineIndex);
+ lineLength = line.length();
+ } else {
+ line = "";
+ lineLength = 0;
+ }
+ index = 0;
+ }
}
public boolean next(char c) {
@@ -67,10 +108,22 @@ public int match(CharMatcher matcher) {
}
public int whitespace() {
- int newIndex = Parsing.skipWhitespace(input, index, input.length());
- int count = newIndex - index;
- index = newIndex;
- return count;
+ int count = 0;
+ while (true) {
+ switch (peek()) {
+ case ' ':
+ case '\t':
+ case '\n':
+ case '\u000B':
+ case '\f':
+ case '\r':
+ count++;
+ next();
+ break;
+ default:
+ return count;
+ }
+ }
}
public int find(char c) {
@@ -104,12 +157,31 @@ public int find(CharMatcher matcher) {
// Don't expose the int index, because it would be good if we could switch input to a List of lines later
// instead of one contiguous String.
public Position position() {
- return new Position(index);
+ return new Position(lineIndex, index);
}
// For cases where the caller appends the result to a StringBuilder, we could offer another method to avoid some
// unnecessary copying.
public CharSequence textBetween(Position begin, Position end) {
- return input.subSequence(begin.index, end.index);
+ if (begin.lineIndex == end.lineIndex) {
+ // Shortcut for common case of text from a single line
+ return lines.get(begin.lineIndex).subSequence(begin.index, end.index);
+ } else {
+ StringBuilder sb = new StringBuilder();
+
+ CharSequence firstLine = lines.get(begin.lineIndex);
+ sb.append(firstLine.subSequence(begin.index, firstLine.length()));
+ sb.append('\n');
+
+ // Lines between begin and end (we are appending the full line)
+ for (int line = begin.lineIndex + 1; line < end.lineIndex; line++) {
+ sb.append(lines.get(line));
+ sb.append('\n');
+ }
+
+ CharSequence lastLine = lines.get(end.lineIndex);
+ sb.append(lastLine.subSequence(0, end.index));
+ return sb.toString();
+ }
}
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java
index 10be523bb..eb7dddb0d 100644
--- a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java
+++ b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java
@@ -120,18 +120,16 @@ public static CharSequence prepareLine(CharSequence line) {
int length = line.length();
for (int i = 0; i < length; i++) {
char c = line.charAt(i);
- switch (c) {
- case '\0':
- if (sb == null) {
- sb = new StringBuilder(length);
- sb.append(line, 0, i);
- }
- sb.append('\uFFFD');
- break;
- default:
- if (sb != null) {
- sb.append(c);
- }
+ if (c == '\0') {
+ if (sb == null) {
+ sb = new StringBuilder(length);
+ sb.append(line, 0, i);
+ }
+ sb.append('\uFFFD');
+ } else {
+ if (sb != null) {
+ sb.append(c);
+ }
}
}
diff --git a/commonmark/src/main/java/org/commonmark/parser/InlineParser.java b/commonmark/src/main/java/org/commonmark/parser/InlineParser.java
index 492c3cc8a..291fd4900 100644
--- a/commonmark/src/main/java/org/commonmark/parser/InlineParser.java
+++ b/commonmark/src/main/java/org/commonmark/parser/InlineParser.java
@@ -2,14 +2,17 @@
import org.commonmark.node.Node;
+import java.util.List;
+
/**
* Parser for inline content (text, links, emphasized text, etc).
*/
public interface InlineParser {
/**
- * @param input the content to parse as inline
+ * @param lines the content to parse as inline
* @param node the node to append resulting nodes to (as children)
*/
- void parse(String input, Node node);
+ // TODO: Should we use a better type here, one that will be able to include source positions?
+ void parse(List<CharSequence> lines, Node node);
}
diff --git a/commonmark/src/main/java/org/commonmark/parser/block/MatchedBlockParser.java b/commonmark/src/main/java/org/commonmark/parser/block/MatchedBlockParser.java
index d4cd9d471..41ba2712b 100644
--- a/commonmark/src/main/java/org/commonmark/parser/block/MatchedBlockParser.java
+++ b/commonmark/src/main/java/org/commonmark/parser/block/MatchedBlockParser.java
@@ -1,5 +1,7 @@
package org.commonmark.parser.block;
+import java.util.List;
+
/**
* Open block parser that was last matched during the continue phase. This is different from the currently active
* block parser, as an unmatched block is only closed when a new block is started.
@@ -10,11 +12,10 @@ public interface MatchedBlockParser {
BlockParser getMatchedBlockParser();
/**
- * Returns the current content of the paragraph if the matched block is a paragraph. The content can be multiple
- * lines separated by {@code '\n'}.
+ * Returns the current paragraph lines if the matched block is a paragraph.
*
- * @return paragraph content or {@code null}
+ * @return paragraph content or an empty list
*/
- CharSequence getParagraphContent();
+ List<CharSequence> getParagraphLines();
}
diff --git a/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java b/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java
index f0bdef492..3b9ae83dc 100644
--- a/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java
+++ b/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java
@@ -12,9 +12,7 @@ public class LinkReferenceDefinitionParserTest {
@Test
public void testStartLabel() {
- parser.parse("[");
- assertEquals(State.LABEL, parser.getState());
- assertEquals("[", parser.getParagraphContent().toString());
+ assertState("[", State.LABEL, "[");
}
@Test
@@ -25,7 +23,7 @@ public void testStartNoLabel() {
parser.parse("a");
parser.parse("[");
assertEquals(State.PARAGRAPH, parser.getState());
- assertEquals("a\n[", parser.getParagraphContent().toString());
+ assertParagraphLines("a\n[", parser);
}
@Test
@@ -80,7 +78,7 @@ public void testDestination() {
LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser();
parser.parse("[foo]: /url");
assertEquals(State.START_TITLE, parser.getState());
- assertEquals("", parser.getParagraphContent().toString());
+ assertParagraphLines("", parser);
assertEquals(1, parser.getDefinitions().size());
assertDef(parser.getDefinitions().get(0), "foo", "/url", null);
@@ -99,7 +97,7 @@ public void testTitle() {
LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser();
parser.parse("[foo]: /url 'title'");
assertEquals(State.START_DEFINITION, parser.getState());
- assertEquals("", parser.getParagraphContent().toString());
+ assertParagraphLines("", parser);
assertEquals(1, parser.getDefinitions().size());
assertDef(parser.getDefinitions().get(0), "foo", "/url", "title");
@@ -110,12 +108,12 @@ public void testTitleStartWhitespace() {
LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser();
parser.parse("[foo]: /url");
assertEquals(State.START_TITLE, parser.getState());
- assertEquals("", parser.getParagraphContent().toString());
+ assertParagraphLines("", parser);
parser.parse(" ");
assertEquals(State.START_DEFINITION, parser.getState());
- assertEquals(" ", parser.getParagraphContent().toString());
+ assertParagraphLines(" ", parser);
assertEquals(1, parser.getDefinitions().size());
assertDef(parser.getDefinitions().get(0), "foo", "/url", null);
@@ -126,17 +124,17 @@ public void testTitleMultiline() {
LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser();
parser.parse("[foo]: /url 'two");
assertEquals(State.TITLE, parser.getState());
- assertEquals("[foo]: /url 'two", parser.getParagraphContent().toString());
+ assertParagraphLines("[foo]: /url 'two", parser);
assertEquals(0, parser.getDefinitions().size());
parser.parse("lines");
assertEquals(State.TITLE, parser.getState());
- assertEquals("[foo]: /url 'two\nlines", parser.getParagraphContent().toString());
+ assertParagraphLines("[foo]: /url 'two\nlines", parser);
assertEquals(0, parser.getDefinitions().size());
parser.parse("'");
assertEquals(State.START_DEFINITION, parser.getState());
- assertEquals("", parser.getParagraphContent().toString());
+ assertParagraphLines("", parser);
assertEquals(1, parser.getDefinitions().size());
assertDef(parser.getDefinitions().get(0), "foo", "/url", "two\nlines\n");
@@ -168,7 +166,7 @@ private static void assertState(String input, State state, String paragraphConte
LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser();
parser.parse(input);
assertEquals(state, parser.getState());
- assertEquals(paragraphContent, parser.getParagraphContent().toString());
+ assertParagraphLines(paragraphContent, parser);
}
private static void assertDef(LinkReferenceDefinition def, String label, String destination, String title) {
@@ -176,4 +174,16 @@ private static void assertDef(LinkReferenceDefinition def, String label, String
assertEquals(destination, def.getDestination());
assertEquals(title, def.getTitle());
}
+
+ private static void assertParagraphLines(String expectedContent, LinkReferenceDefinitionParser parser) {
+ StringBuilder sb = new StringBuilder();
+ for (CharSequence line : parser.getParagraphLines()) {
+ if (sb.length() != 0) {
+ sb.append('\n');
+ }
+ sb.append(line);
+ }
+ String actual = sb.toString();
+ assertEquals(expectedContent, actual);
+ }
}
diff --git a/commonmark/src/test/java/org/commonmark/internal/inline/ScannerTest.java b/commonmark/src/test/java/org/commonmark/internal/inline/ScannerTest.java
index ed8e958ad..798330e8d 100644
--- a/commonmark/src/test/java/org/commonmark/internal/inline/ScannerTest.java
+++ b/commonmark/src/test/java/org/commonmark/internal/inline/ScannerTest.java
@@ -2,13 +2,16 @@
import org.junit.Test;
-import static org.junit.Assert.assertEquals;
+import java.util.Arrays;
+import java.util.Collections;
+
+import static org.junit.Assert.*;
public class ScannerTest {
-
+
@Test
public void testNext() {
- Scanner scanner = new Scanner("foo bar", 4);
+ Scanner scanner = new Scanner(Collections.singletonList("foo bar"), 0, 4);
assertEquals('b', scanner.peek());
scanner.next();
assertEquals('a', scanner.peek());
@@ -17,4 +20,63 @@ public void testNext() {
scanner.next();
assertEquals('\0', scanner.peek());
}
+
+ @Test
+ public void testMultipleLines() {
+ Scanner scanner = new Scanner(Arrays.asList("ab", "cde"), 0, 0);
+ assertTrue(scanner.hasNext());
+ assertEquals('\0', scanner.peekPrevious());
+ assertEquals('a', scanner.peek());
+ scanner.next();
+
+ assertTrue(scanner.hasNext());
+ assertEquals('a', scanner.peekPrevious());
+ assertEquals('b', scanner.peek());
+ scanner.next();
+
+ assertTrue(scanner.hasNext());
+ assertEquals('b', scanner.peekPrevious());
+ assertEquals('\n', scanner.peek());
+ scanner.next();
+
+ assertTrue(scanner.hasNext());
+ assertEquals('\n', scanner.peekPrevious());
+ assertEquals('c', scanner.peek());
+ scanner.next();
+
+ assertTrue(scanner.hasNext());
+ assertEquals('c', scanner.peekPrevious());
+ assertEquals('d', scanner.peek());
+ scanner.next();
+
+ assertTrue(scanner.hasNext());
+ assertEquals('d', scanner.peekPrevious());
+ assertEquals('e', scanner.peek());
+ scanner.next();
+
+ assertFalse(scanner.hasNext());
+ assertEquals('e', scanner.peekPrevious());
+ assertEquals('\0', scanner.peek());
+ }
+
+ @Test
+ public void testTextBetween() {
+ Scanner scanner = new Scanner(Arrays.asList("ab", "cde"), 0, 0);
+ Position start = scanner.position();
+ scanner.next();
+ assertEquals("a", scanner.textBetween(start, scanner.position()));
+ Position afterA = scanner.position();
+ scanner.next();
+ assertEquals("ab", scanner.textBetween(start, scanner.position()));
+ scanner.next();
+ assertEquals("ab\n", scanner.textBetween(start, scanner.position()));
+ scanner.next();
+ assertEquals("ab\nc", scanner.textBetween(start, scanner.position()));
+ scanner.next();
+ assertEquals("ab\ncd", scanner.textBetween(start, scanner.position()));
+ scanner.next();
+ assertEquals("ab\ncde", scanner.textBetween(start, scanner.position()));
+
+ assertEquals("b\ncde", scanner.textBetween(afterA, scanner.position()));
+ }
}
diff --git a/commonmark/src/test/java/org/commonmark/test/ParserTest.java b/commonmark/src/test/java/org/commonmark/test/ParserTest.java
index e058de378..216bdeefe 100644
--- a/commonmark/src/test/java/org/commonmark/test/ParserTest.java
+++ b/commonmark/src/test/java/org/commonmark/test/ParserTest.java
@@ -103,7 +103,7 @@ public void indentation() {
public void inlineParser() {
final InlineParser fakeInlineParser = new InlineParser() {
@Override
- public void parse(String input, Node node) {
+ public void parse(List<CharSequence> lines, Node node) {
node.appendChild(new ThematicBreak());
}
};
diff --git a/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java b/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java
index a70127a72..2b19db3db 100644
--- a/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java
+++ b/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java
@@ -167,4 +167,11 @@ public void deeplyIndentedList() {
"\n" +
"\n");
}
+
+ @Test
+ public void trailingTabs() {
+ // The tab is not treated as 4 spaces here and so does not result in a hard line break, but is just preserved.
+ // This matches what commonmark.js did at the time of writing.
+ assertRendering("a\t\nb\n", "<p>a\t\nb</p>\n");
+ }
}
From aa9590c7af7d6695f2e8c3f686dccfde06126644 Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Mon, 3 Aug 2020 17:14:01 +1000
Subject: [PATCH 075/450] Keep a single Scanner instance
This avoids a lot of copying (we still need to copy the position
though). In exchange, we need to be more careful and reset the position
if we want to backtrack.
---
.../commonmark/internal/DocumentParser.java | 1 -
.../commonmark/internal/InlineParserImpl.java | 69 +++++--------------
.../LinkReferenceDefinitionParser.java | 3 +-
.../internal/inline/InlineParserState.java | 7 ++
.../commonmark/internal/inline/Scanner.java | 44 +++++++++---
5 files changed, 62 insertions(+), 62 deletions(-)
diff --git a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java
index 5c3cdfd3d..58b390b63 100644
--- a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java
@@ -428,7 +428,6 @@ private void addLine() {
} else if (index == 0) {
content = line;
} else {
- // TODO: Maybe we should bring back Subsequence here?
content = line.subSequence(index, line.length());
}
getActiveBlockParser().addLine(content);
diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
index eee25af40..5383d9827 100644
--- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
+++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
@@ -27,10 +27,7 @@ public class InlineParserImpl implements InlineParser, InlineParserState {
private final InlineParserContext context;
private final Map<Character, List<InlineContentParser>> inlineParsers;
- // TODO: Should we just keep a scanner here instead?
- private List<CharSequence> lines;
- private int lineIndex;
- private int index;
+ private Scanner scanner;
private int trailingSpaces;
/**
@@ -86,10 +83,9 @@ public static Map calculateDelimiterProcessors(Li
return map;
}
- // TODO: The implementation shouldn't be public
@Override
public Scanner scanner() {
- return new Scanner(lines, lineIndex, index);
+ return scanner;
}
private static void addDelimiterProcessors(Iterable delimiterProcessors, Map map) {
@@ -145,15 +141,8 @@ public void parse(List lines, Node block) {
mergeChildTextNodes(block);
}
- void setPosition(Position position) {
- lineIndex = position.getLineIndex();
- index = position.getIndex();
- }
-
void reset(List<CharSequence> lines) {
- this.lines = lines;
- this.lineIndex = 0;
- this.index = 0;
+ this.scanner = Scanner.of(lines);
this.trailingSpaces = 0;
this.lastDelimiter = null;
this.lastBracket = null;
@@ -169,20 +158,23 @@ private Text text(String text) {
* On failure, return null.
*/
private Node parseInline() {
- Scanner scanner = scanner();
char c = scanner.peek();
if (c == '\0') {
return null;
}
+ Position position = scanner.position();
List<InlineContentParser> inlineParsers = this.inlineParsers.get(c);
if (inlineParsers != null) {
for (InlineContentParser inlineParser : inlineParsers) {
ParsedInline parsedInline = inlineParser.tryParse(this);
if (parsedInline instanceof ParsedInlineImpl) {
ParsedInlineImpl parsedInlineImpl = (ParsedInlineImpl) parsedInline;
- setPosition(parsedInlineImpl.getPosition());
+ scanner.setPosition(parsedInlineImpl.getPosition());
return parsedInlineImpl.getNode();
+ } else {
+ // Reset position
+ scanner.setPosition(position);
}
}
}
@@ -209,22 +201,6 @@ private Node parseInline() {
// If we get here, even for a special/delimiter character, we will just treat it as text.
return parseText();
-// } else {
-// node = parseString();
-// }
-// break;
-//
-// Node node;
-// if (node != null) {
-// return node;
-// } else {
-// scanner.next();
-// setPosition(scanner.position());
-// // When we get here, it's only for a single special character that turned out to not have a special meaning.
-// // So we shouldn't have a single surrogate here, hence it should be ok to turn it into a String.
-// String literal = String.valueOf(c);
-// return text(literal);
-// }
}
/**
@@ -253,10 +229,8 @@ private Node parseDelimiters(DelimiterProcessor delimiterProcessor, char delimit
* Add open bracket to delimiter stack and add a text node to block's children.
*/
private Node parseOpenBracket() {
- Scanner scanner = scanner();
scanner.next();
Position start = scanner.position();
- setPosition(start);
Text node = text("[");
@@ -271,18 +245,14 @@ private Node parseOpenBracket() {
* Otherwise just add a text node.
*/
private Node parseBang() {
- Scanner scanner = scanner();
scanner.next();
if (scanner.next('[')) {
Text node = text("![");
+ scanner.setPosition(afterClose);
dest = null;
title = null;
}
@@ -341,6 +312,9 @@ private Node parseCloseBracket() {
if (dest == null) {
// See if there's a link label like `[bar]` or `[]`
String ref = parseLinkLabel(scanner);
+ if (ref == null) {
+ scanner.setPosition(afterClose);
+ }
if ((ref == null || ref.isEmpty()) && !opener.bracketAfter) {
// If the second label is empty `[foo][]` or missing `[foo]`, then the first label is the reference.
// But it can only be a reference when there's no (unescaped) bracket in it.
@@ -388,14 +362,13 @@ private Node parseCloseBracket() {
}
}
- setPosition(scanner.position());
-
return linkOrImage;
} else {
// No link or image, parse just the bracket as text and continue
removeLastBracket();
+ scanner.setPosition(afterClose);
return text("]");
}
}
@@ -476,9 +449,7 @@ String parseLinkLabel(Scanner scanner) {
}
private Node parseLineBreak() {
- Scanner scanner = scanner();
scanner.next();
- setPosition(scanner.position());
if (trailingSpaces >= 2) {
return new HardLineBreak();
@@ -491,7 +462,6 @@ private Node parseLineBreak() {
* Parse the next character as plain text, and possibly more if the following characters are non-special.
*/
private Node parseText() {
- Scanner scanner = scanner();
Position start = scanner.position();
scanner.next();
while (scanner.hasNext()) {
@@ -502,7 +472,6 @@ private Node parseText() {
}
String text = scanner.textBetween(start, scanner.position()).toString();
- setPosition(scanner.position());
char c = scanner.peek();
if (c == '\n') {
@@ -526,14 +495,13 @@ private Node parseText() {
* @return information about delimiter run, or {@code null}
*/
private DelimiterData scanDelimiters(DelimiterProcessor delimiterProcessor, char delimiterChar) {
- Scanner scanner = scanner();
char charBefore = scanner.peekPrevious();
Position start = scanner.position();
int delimiterCount = scanner.matchMultiple(delimiterChar);
if (delimiterCount < delimiterProcessor.getMinLength()) {
- setPosition(start);
+ scanner.setPosition(start);
return null;
}
@@ -561,7 +529,6 @@ private DelimiterData scanDelimiters(DelimiterProcessor delimiterProcessor, char
canClose = rightFlanking && delimiterChar == delimiterProcessor.getClosingCharacter();
}
- setPosition(scanner.position());
String text = scanner.textBetween(start, scanner.position()).toString();
return new DelimiterData(delimiterCount, canOpen, canClose, new Text(text));
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java
index 7f257343b..950fc31c8 100644
--- a/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java
+++ b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java
@@ -8,7 +8,6 @@
import org.commonmark.node.SourceSpan;
import java.util.ArrayList;
-import java.util.Collections;
import java.util.List;
/**
@@ -39,7 +38,7 @@ public void parse(CharSequence line) {
return;
}
- Scanner scanner = new Scanner(Collections.singletonList(line), 0, 0);
+ Scanner scanner = Scanner.of(line);
while (scanner.hasNext()) {
boolean success;
switch (state) {
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/InlineParserState.java b/commonmark/src/main/java/org/commonmark/internal/inline/InlineParserState.java
index 9a6ef7d19..f6cb6bf49 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/InlineParserState.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/InlineParserState.java
@@ -2,5 +2,12 @@
public interface InlineParserState {
+ /**
+ * Return a scanner for the input for the current position (on the character that the inline parser registered
+ * interest for).
+ *
+ * Note that this always returns the same instance, if you want to backtrack you need to use
+ * {@link Scanner#position()} and {@link Scanner#setPosition(Position)}.
+ */
Scanner scanner();
}
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
index b144ba52a..a96493d7c 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
@@ -2,6 +2,7 @@
import org.commonmark.internal.util.CharMatcher;
+import java.util.Collections;
import java.util.List;
public class Scanner {
@@ -19,17 +20,24 @@ public class Scanner {
private CharSequence line = "";
private int lineLength = 0;
- // TODO: Visibility
- public Scanner(List<CharSequence> lines, int lineIndex, int index) {
+ Scanner(List<CharSequence> lines, int lineIndex, int index) {
this.lines = lines;
this.lineIndex = lineIndex;
this.index = index;
if (!lines.isEmpty()) {
- line = lines.get(lineIndex);
- lineLength = line.length();
+ checkPosition(lineIndex, index);
+ setLine(lines.get(lineIndex));
}
}
+ public static Scanner of(List<CharSequence> lines) {
+ return new Scanner(lines, 0, 0);
+ }
+
+ public static Scanner of(CharSequence line) {
+ return new Scanner(Collections.singletonList(line), 0, 0);
+ }
+
public char peek() {
if (index < lineLength) {
return line.charAt(index);
@@ -70,11 +78,9 @@ public void next() {
if (index > lineLength) {
lineIndex++;
if (lineIndex < lines.size()) {
- line = lines.get(lineIndex);
- lineLength = line.length();
+ setLine(lines.get(lineIndex));
} else {
- line = "";
- lineLength = 0;
+ setLine("");
}
index = 0;
}
@@ -160,6 +166,13 @@ public Position position() {
return new Position(lineIndex, index);
}
+ public void setPosition(Position position) {
+ checkPosition(position.lineIndex, position.index);
+ this.lineIndex = position.lineIndex;
+ this.index = position.index;
+ setLine(lines.get(this.lineIndex));
+ }
+
// For cases where the caller appends the result to a StringBuilder, we could offer another method to avoid some
// unnecessary copying.
public CharSequence textBetween(Position begin, Position end) {
@@ -184,4 +197,19 @@ public CharSequence textBetween(Position begin, Position end) {
return sb.toString();
}
}
+
+ private void setLine(CharSequence line) {
+ this.line = line;
+ this.lineLength = line.length();
+ }
+
+ private void checkPosition(int lineIndex, int index) {
+ if (lineIndex < 0 || lineIndex >= lines.size()) {
+ throw new IllegalArgumentException("Line index " + lineIndex + " out of range, number of lines: " + lines.size());
+ }
+ CharSequence line = lines.get(lineIndex);
+ if (index < 0 || index > line.length()) {
+ throw new IllegalArgumentException("Index " + index + " out of range, line length: " + line.length());
+ }
+ }
}
From c5fc1f75e614461219df9dee9715f314a3b2df55 Mon Sep 17 00:00:00 2001
From: Robin Stocker
Date: Mon, 3 Aug 2020 17:26:32 +1000
Subject: [PATCH 076/450] Add Scanner.END constant
---
.../commonmark/internal/InlineParserImpl.java | 8 ++++----
.../org/commonmark/internal/inline/Scanner.java | 16 ++++++++++++----
2 files changed, 16 insertions(+), 8 deletions(-)
diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
index 5383d9827..dedf68599 100644
--- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
+++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java
@@ -159,7 +159,7 @@ private Text text(String text) {
*/
private Node parseInline() {
char c = scanner.peek();
- if (c == '\0') {
+ if (c == Scanner.END) {
return null;
}
@@ -479,7 +479,7 @@ private Node parseText() {
int end = Parsing.skipBackwards(' ', text, text.length() - 1, 0) + 1;
trailingSpaces = text.length() - end;
text = text.substring(0, end);
- } else if (c == '\0') {
+ } else if (c == Scanner.END) {
// For the last line, both tabs and spaces are trimmed for some reason (checked with commonmark.js).
int end = Parsing.skipSpaceTabBackwards(text, text.length() - 1, 0) + 1;
text = text.substring(0, end);
@@ -506,8 +506,8 @@ private DelimiterData scanDelimiters(DelimiterProcessor delimiterProcessor, char
}
char charAfter = scanner.peek();
- String before = charBefore == '\0' ? "\n" : String.valueOf(charBefore);
- String after = charAfter == '\0' ? "\n" : String.valueOf(charAfter);
+ String before = charBefore == Scanner.END ? "\n" : String.valueOf(charBefore);
+ String after = charAfter == Scanner.END ? "\n" : String.valueOf(charAfter);
// We could be more lazy here, in most cases we don't need to do every match case.
boolean beforeIsPunctuation = PUNCTUATION.matcher(before).matches();
diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
index a96493d7c..7c1076e2d 100644
--- a/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
+++ b/commonmark/src/main/java/org/commonmark/internal/inline/Scanner.java
@@ -7,6 +7,14 @@
public class Scanner {
+ /**
+ * Character representing the end of input (or outside of the text in case of the "previous" methods).
+ *
+ * Note that we can use NULL to represent this because CommonMark does not allow those in the input (we replace them
+ * in the beginning of parsing).
+ */
+ public static final char END = '\0';
+
// Lines without newlines at the end. The scanner will yield `\n` between lines because they're significant for
// parsing and the final output. There is no `\n` after the last line.
private final List<CharSequence> lines;
@@ -46,7 +54,7 @@ public char peek() {
return '\n';
} else {
// Don't return newline for end of last line
- return '\0';
+ return END;
}
}
}
@@ -59,7 +67,7 @@ public char peekPrevious() {
if (lineIndex > 0) {
return '\n';
} else {
- return '\0';
+ return END;
}
}
}
@@ -136,7 +144,7 @@ public int find(char c) {
int count = 0;
while (true) {
char cur = peek();
- if (cur == '\0') {
+ if (cur == Scanner.END) {
return -1;
} else if (cur == c) {
return count;
@@ -150,7 +158,7 @@ public int find(CharMatcher matcher) {
int count = 0;
while (true) {
char c = peek();
- if (c == '\0') {
+ if (c == END) {
return -1;
} else if (matcher.matches(c)) {
return count;
From 62ee2b6435c3b29a1d4b5280d2fec99f9633af70 Mon Sep 17 00:00:00 2001
From: Robin Stocker