diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 000000000..3f9d1b9aa --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,2 @@ +comment: + layout: "diff, flags, files" diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000..95ce00314 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,28 @@ +--- +name: Bug report +about: Some input not render as you expect? Include an example so we can help +title: '' +labels: bug +assignees: '' + +--- + +Steps to reproduce the problem (provide example Markdown if applicable): + +``` +my markdown +``` + +Expected behavior: + +``` +expected HTML +``` + +Actual behavior: + +``` +actual HTML +``` + +(Also see what the reference implementation does: https://spec.commonmark.org/dingus/) diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 000000000..02c83ef31 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. Include source code if possible. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context about the feature request here. 
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..b32794271 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,61 @@ +# See https://docs.github.com/en/actions/language-and-framework-guides/building-and-testing-java-with-maven + +name: ci + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + java: [11, 17, 21, 25] + steps: + - name: Checkout sources + uses: actions/checkout@v4 + + - name: Set up JDK + uses: actions/setup-java@v4 + with: + java-version: ${{ matrix.java }} + distribution: 'zulu' + + - name: Build + run: mvn -B package javadoc:javadoc + + coverage: + runs-on: ubuntu-latest + if: ${{ github.event_name == 'push' }} + steps: + - name: Checkout sources + uses: actions/checkout@v4 + + - name: Set up JDK + uses: actions/setup-java@v4 + with: + java-version: 11 + distribution: 'zulu' + + - name: Build with coverage + run: mvn -B -Pcoverage clean test jacoco:report-aggregate + + - name: Publish coverage + uses: codecov/codecov-action@v4 + with: + fail_ci_if_error: true + token: ${{ secrets.CODECOV_TOKEN }} + + android-compatibility: + runs-on: ubuntu-latest + steps: + - name: Checkout sources + uses: actions/checkout@v4 + + - name: Set up JDK + uses: actions/setup-java@v4 + with: + java-version: 11 + distribution: 'zulu' + + - name: Android Lint checks + run: cd commonmark-android-test && ./gradlew :app:lint diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 000000000..c0531ca55 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,43 @@ +# See: +# https://docs.github.com/en/free-pro-team@latest/actions/guides/publishing-java-packages-with-maven +# https://central.sonatype.org/pages/apache-maven.html +# https://github.com/actions/setup-java + +name: release + +on: + workflow_dispatch: + +jobs: + release: + environment: maven_central + runs-on: ubuntu-latest + steps: + - name: Checkout sources + uses: 
actions/checkout@v4 + + - name: Set up Maven Central repository + uses: actions/setup-java@v4 + with: + java-version: 24 + distribution: 'zulu' + # See https://central.sonatype.org/publish/publish-portal-maven/ + server-id: central + server-username: CENTRAL_USERNAME # env variable to use for username in release + server-password: CENTRAL_PASSWORD # env variable to use for password in release + gpg-private-key: ${{ secrets.MAVEN_GPG_PRIVATE_KEY }} + gpg-passphrase: MAVEN_GPG_PASSPHRASE # env variable to use for passphrase in release + + - name: Set up Git user + run: | + git config --global user.name "${{ secrets.GIT_USER_NAME }}" + git config --global user.email "${{ secrets.GIT_USER_EMAIL }}" + + - name: Release + run: | + mvn -B -Dusername=${{ secrets.GH_USERNAME }} -Dpassword=${{ secrets.GH_ACCESS_TOKEN }} release:prepare + mvn -B release:perform + env: + CENTRAL_USERNAME: ${{ secrets.CENTRAL_USERNAME }} + CENTRAL_PASSWORD: ${{ secrets.CENTRAL_PASSWORD }} + MAVEN_GPG_PASSPHRASE: ${{ secrets.MAVEN_GPG_PASSPHRASE }} diff --git a/.gitignore b/.gitignore index b1ce96c86..d998d8890 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,14 @@ +# Eclipse +.project +.classpath +.settings/ + # IntelliJ IDEA .idea *.iml # Maven target/ + +# macOS +.DS_Store diff --git a/.mvn/wrapper/maven-wrapper.properties b/.mvn/wrapper/maven-wrapper.properties new file mode 100644 index 000000000..4d245050f --- /dev/null +++ b/.mvn/wrapper/maven-wrapper.properties @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +wrapperVersion=3.3.2 +distributionType=only-script +distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.6.3/apache-maven-3.6.3-bin.zip diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index fa2c88c01..000000000 --- a/.travis.yml +++ /dev/null @@ -1,4 +0,0 @@ -language: java -jdk: - - openjdk7 - - oraclejdk8 diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..9c5c67268 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,582 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html), +with the exception that 0.x versions can break between minor versions. + +## [Unreleased] +### Added +- Allow customizing HTML attributes for alert title `<p>` tag via `AttributeProvider` + +## [0.28.0] - 2026-03-31 +### Added +- New extension for alerts (aka callouts/admonitions) + - Syntax: + ``` + > [!NOTE] + > The text of the note. + ``` + - As types you can use NOTE, TIP, IMPORTANT, WARNING, CAUTION; or configure the + extension to add additional ones. + - Use class `AlertsExtension` in artifact `commonmark-ext-gfm-alerts` (#420) +- New option `maxOpenBlockParsers` for `Parser.Builder` to set an overall limit + for the depth of block parsing. If set, any nesting beyond the limit will be + parsed as paragraph text instead. The default remains unlimited. + +## [0.27.1] - 2026-01-14 +### Fixed +- Line(s) after a hard line break would sometimes also get an unwanted hard + line break, e.g. if they ended in emphasis or other non-text inlines (#415) +- `TextContentRenderer` (for plain text): Fix nested lists on the same line (#413) +- Fix minor performance regression with pathological input (deeply nested + brackets) that was introduced in version 0.23.0. + +## [0.27.0] - 2025-10-12 +### Added +- Autolink extension: Now supports configuration of different link types that + should be recognized and converted to links. See `AutolinkExtension#builder` + + | Type | Default? | Description | + |---------|----------|--------------------------------------------------------| + | `URL` | Yes | URL with a protocol such as `https://example.com` | + | `EMAIL` | Yes | Email address such as `foo@example.com` | + | `WWW` | Yes | Address beginning with `www` such as `www.example.com` | + + Note that this changes the behavior of `AutolinkExtension.create()` to now also + include `WWW` links by default. To re-enable the previous behavior, use: + + ```java + AutolinkExtension.builder().linkTypes(AutolinkType.URL, AutolinkType.EMAIL).build(); + ``` + +## [0.26.0] - 2025-09-13 +### Changed +- A `LinkProcessor` using `replaceWith` now also stops outer links from being + parsed as links, same as with `wrapTextIn`. 
This prevents nested links, see + footnotes change below. +### Fixed +- Fix rendering of image alt text to include contents of code spans (`` `code` ``). (#398) +- footnotes: Fix footnotes nested within links. Before, both the link and the + footnote reference would be parsed and lead to nested `<a>` elements, which + is disallowed. Now, only the footnote is parsed and the outer link becomes + plain text; this matches the behavior of links. (#400) + +## [0.25.1] - 2025-08-01 +### Fixed +- footnotes: Fix parsing of footnote definitions containing multiple paragraphs + separated by blank lines. Before it only worked if paragraphs were separated + by lines of 4 spaces. (#388) + +## [0.25.0] - 2025-06-20 +### Added +- Include OSGi metadata in jars (`META-INF/MANIFEST.MF` files) (#378) +- More documentation with examples for `Node` classes (#370) +### Changed +- GitHub tables: Tables are now parsed even if there's no blank line before the + table heading, matching GitHub's behavior. (#381) +### Fixed +- `MarkdownRenderer`: Fix precedence for `nodeRendererFactory`: Factories passed + to the builder can now override rendering for core node types. (#368) +- `MarkdownRenderer`: Fix exception with ordered lists with a long first number + followed by a shorter one (#382) +- Fix warning in Eclipse about "missing 'requires transitive'" (#358) +- Fix Android incompatibility with `requireNonNullElseGet` (#369) + +## [0.24.0] - 2024-10-21 +### Added +- `SourceSpan` on nodes now have a `getInputIndex` to get the index within the + original input string (in addition to the existing line/column indexes). + This is useful when looking up the input source: It can now be done using + `substring` instead of having to split the input into lines first (#348) +- Configurable line break rendering for `TextContentRenderer` via `lineBreakRendering` + on the builder; e.g. 
`LineBreakRendering.SEPARATE_BLOCKS` will render an empty + line between blocks (#344) +### Changed +- Adopted small changes from OpenJDK vendoring to make updates easier for them (#343) +### Fixed +- Enable overriding of built-in node rendering for `TextContentRenderer` (#346) + +## [0.23.0] - 2024-09-16 +### Added +- New extension for footnotes! + - Syntax: + ``` + Main text[^1] + + [^1]: Additional text in a footnote + ``` + - Inline footnotes like `^[inline footnote]` are also supported when enabled + via an option in `FootnotesExtension.Builder` + - Use class `FootnotesExtension` in artifact `commonmark-ext-footnotes` (#332) +- New option `omitSingleParagraphP` in `HtmlRenderer.Builder` for not using `<p>` + tags for when a document only has one paragraph (#150) +- Support for custom link processing during inline parsing (e.g. `[foo]`), + see `Parser.Builder#linkProcessor` +- Support for extending inline parsing with custom inline content parsers. See + `Parser.Builder#customInlineContentParserFactory`. This allows users/extensions + to hook into inline parsing on a deeper level than before (e.g. with delimiter + processors). It can be used to add support for math/latex formulas or other inline + syntax. (#321) +### Changed +- The default `DefaultUrlSanitizer` now also allows `data` as a protocol. Use the + constructor with a list to customize this. (#329) +- `LinkReferenceDefinition` now extends `Block` (it was extending `Node` + directly before) +- `MarkdownRenderer`: Don't escape `=` text if it's the first node in a block (#335) +### Fixed +- Fix parsing of link reference definitions with incorrect title syntax (followed + by characters other than space/tab). In that case, the title was set to the + partially-parsed title and the source spans were wrong. (#315) +- Fix source spans of blocks with lazy continuation lines (#337) +- `MarkdownRenderer`: Preserve thematic break literals (#331) + +## [0.22.0] - 2024-03-15 +### Added +- New `MarkdownRenderer` for rendering nodes to Markdown (CommonMark) (#306)! 
+ Note that while care is taken to produce equivalent Markdown, some differences + in the original Markdown (if parsed) are not preserved, such as: + - The type of heading used + - The type of link used (reference links will be rendered as inline links) + - Whether special characters are escaped or not + - Leading and trailing whitespace +- Modular JAR (JPMS): All artifacts now include module descriptors (module-info) + so jlink can be used; the old `Automatic-Module-Name` manifest entries were removed +- New package `org.commonmark.parser.beta` containing classes that are not part of + the stable API but are exported from the module because they might be useful for + extension parsers +- New package `org.commonmark.text` for text related utilities that are useful for + both parsing and rendering +- `TableCell` now has `getWidth` returning the number of dash and colon characters + in the delimiter row, useful for rendering proportional width tables (#296) +- `ThematicBreak` now has `getLiteral` containing the string that was used in the + source when parsing (#309) +- `ListItem` now has `getMarkerIndent` and `getContentIndent` for retrieving the + space between the start of the line and the marker/content +- Deprecated some properties of `BulletList`, `OrderedList`, `FencedCodeBlock` + and replaced with nullable ones because they might not be set when constructing + these nodes manually instead of via parsing +### Changed +- Java 11 or later is now required (dropping support for Java 8) +- Update to CommonMark spec 0.31.2 +### Fixed +- Fix `LinkReferenceDefinition` having null `SourceSpan` when title is present + and parsing with source spans option enabled (#310) + +## [0.21.0] - 2022-11-17 +### Added +- GitHub strikethrough: With the previous version we adjusted the + extension to also accept the single tilde syntax. But if you use + another extension that uses the single tilde syntax, you will get a + conflict. 
To avoid that, `StrikethroughExtension` can now be + configured to require two tildes like before, see Javadoc. + +## [0.20.0] - 2022-10-20 +### Fixed +- GitHub tables: A single pipe (optional whitespace) now ends a table + instead of crashing or being treated as an empty row, for consistency + with GitHub (#255). +- GitHub strikethrough: A single tilde now also works, and more than two + tildes are not accepted anymore. This brings us in line with what + GitHub actually does, which is a bit underspecified (#267) +- The autolink extension now handles source spans correctly (#209) + +## [0.19.0] - 2022-06-02 +### Added +- YAML front matter extension: Limited support for single and double + quoted string values (#260) +### Changed +- Check argument of `enabledBlockTypes` when building parser instead of NPEing later + +## [0.18.2] - 2022-02-24 +### Changed +- Test against Java 17 +- Bundle LICENSE.txt with artifacts (in addition to Maven metadata) + +## [0.18.1] - 2021-11-29 +### Fixed +- Fix tables with leading/trailing header pipes and trailing spaces (#244). + This was a regression in 0.16.1 which is now fixed. + +## [0.18.0] - 2021-06-30 +### Changed +- Update to CommonMark spec 0.30: + - Add `textarea` to list of literal HTML block tags. + Like `script`, `style`, and `pre`, `textarea` blocks can contain + blank lines without the contents being interpreted as commonmark. + - Fix case folding for link reference labels in some cases + (e.g. `ẞ` and `SS` should match) + - Allow lowercase ASCII in HTML declaration + - Don't let type 7 HTML blocks interrupt lazy paragraphs either +- Preserve the original case for the label of `LinkReferenceDefinition`. + Before, we used to store the normalized version (lowercase, collapsed whitespace). + +## [0.17.2] - 2021-05-14 +### Changed +- Pass original instead of normalized label to `InlineParserContext` for lookup (#204). + This allows custom contexts to change the lookup logic and have access to the original + label content. 
+ In case you have a custom implementation of `InlineParserContext`, you might need to adjust + it to do normalization. + +## [0.17.1] - 2021-02-03 +### Fixed +- Fix emphasis surrounded by non-BMP punctuation/whitespace characters + (characters that are longer than one UTF-16 "char"). Note that this is + an edge case with rarely used Unicode characters, which a lot of other + implementations don't handle correctly. +- Fix tables where the row starts with spaces and then the first `|` - + rows that didn't have spaces before were not affected (#199). This bug + is present in 0.16.1 and 0.17.0. + +## [0.17.0] - 2021-01-15 +### Changed +- **ACTION REQUIRED**: Maven groupId has changed from `com.atlassian.commonmark` to `org.commonmark` + - To continue getting new versions of commonmark-java, change the Maven coordinates in your dependencies: + - Old: `com.atlassian.commonmark` + - New: `org.commonmark` + +## [0.16.1] - 2020-12-11 +### Added +- Support for including source spans on block and inline nodes (#1): + - Answer for "Where in the source input (line/column position and length) does this node come from?" + - Useful for things like editors that want to keep the input and rendered output scrolled to the same lines, + or start editing on the node that was selected. + - Use `includeSourceSpans` on `Parser.Builder` to enable, + either with `IncludeSourceSpans.BLOCKS` or `IncludeSourceSpans.BLOCKS_AND_INLINES` + - Read data with `Node.getSourceSpans` + - Note that enabling this has a small performance impact on parsing (about 10%) +### Changed +- In order to support source spans (see above), a few of the extension + APIs changed. It should only affect users implementing their own + extensions. See the Javadoc to see what changed. +- YAML front matter extension: Support dots in key names + +## [0.15.2] - 2020-07-20 +### Fixed +- image-attributes extension: Fix unexpected altering of text in case + parsing of attributes fails, e.g. 
`{NN} text` -> `{NN text}`, thanks @jk1 + +## [0.15.1] - 2020-05-29 +### Added +- Add text content rendering support for `InsExtension` + +## [0.15.0] - 2020-05-21 +### Added +- Extension for width/height attributes for images, thanks @dohertyfjatl + - Syntax: `![text](/url.png){width=640 height=480}` + - Use class `ImageAttributesExtension` in artifact `commonmark-ext-image-attributes` +- Extension for task lists (GitHub-style), thanks @dohertyfjatl + - Syntax: + ``` + - [x] task #1 + - [ ] task #2 + ``` + - Use class `TaskListItemsExtension` in artifact `commonmark-ext-task-list-items` + +## [0.14.0] - 2020-01-22 +### Added +- Add `sanitizeUrls` to `HtmlRenderer.Builder` to enable sanitizing URLs + of `<a>` and `<img>` tags. Sanitizing logic can be customized via + `urlSanitizer`. Thanks @VandorpeDavid + +## [0.13.1] - 2019-11-25 +### Fixed +- Fix potential `StackOverflowError` for regular expressions used in the + inline parser (e.g. when parsing long HTML), thanks @lehvolk + +## [0.13.0] - 2019-07-15 +### Added +- `LinkReferenceDefinition` nodes are now part of the document (not + rendered by default). +- `InlineParserContext.getLinkReferenceDefinition` was added to allow + custom inline parsers to look up definitions for reference links. +### Changed +- Performance improvements compared to previous version: + - Parsing 7-10% faster + - HTML rendering 105% faster - or in other words, twice as fast! 
+- Update to CommonMark spec 0.29 (#156): + - Change how newlines/spaces are handled in inline code + - Info strings for tilde code blocks can contain backticks and tildes + - Allow spaces inside link destinations in pointy brackets + - Disallow link destination beginning with `<` unless it is inside `<..>` + - Disallow unescaped '(' in link title in parens + - Disallow indenting list item marker by more than 3 spaces + - No longer treat `<meta>` as a block tag + - Link reference definitions can now be in setext headings too +- Tables extension: Changes to match GitHub implementation: + - Escaping now only considers pipe characters when parsing tables: + `\|` results in a literal `|` instead of a column, everything else + is passed through to inline parsing. + - Table body can now contain lazy continuation lines (without `|`). + An empty line or another block is needed to interrupt the table. + - For tables without a body, `<tbody>` is no longer rendered in HTML + - See https://github.github.com/gfm/#tables-extension- for details +- Check non-null arguments early and provide a nicer message +### Fixed +- Fix incorrectly preserving HTML entities when rendering attributes +- Fix pathological case with input `[\\\\...` (a lot of backslashes) +- Fix pathological case with input `[]([]([](...` + +## [0.12.1] - 2018-11-13 +### Changed +- Speed up parsing significantly: Compared to the previous version, the + benchmarks show up to 55% faster parsing for both small and large + documents! (#137, #140) +- Parse backslash followed by unescapable character the same way as + the reference implementations. +- Build and test on Java 11 as well. +- autolink: Stop URLs at " and \` as well +### Fixed +- Fix tab handling in ATX and Setext headings. 
+ +## [0.11.0] - 2018-01-17 +### Added +- The extension for tables now also renders to plain text + (when using a `TextContentRenderer`), thanks @ahjaworski +### Changed +- Add `Automatic-Module-Name` manifest entries so that library can be used + nicely in Java 9 modules. The module names correspond to the root + package name: `org.commonmark`, `org.commonmark.ext.autolink`, etc. +- Java 7 is now only supported on a best-effort basis (but it has been + EOL for quite some time, so yeah) + +## [0.10.0] - 2017-09-14 +### Added +- Support multiple `DelimiterProcessor` with the same delimiter char as long + as they have different length, thanks @szeiger +- Add tests for thread-safety and a section to the readme (#83) +### Changed +- Update to CommonMark spec 0.28 (#94): + - Adapt to changed emphasis parsing rule + - Allow nested parentheses in inline link destinations +### Fixed +- Fixes for text content rendering, thanks @JinneeJ: + - Support for mixed lists + - Fixed that whitespaces between text elements are removed in "stripped" mode. + For example `**text** and text` had rendered as `textand text` + - Improved rendering for auto links +- Fix `[\]` being parsed as link label +- Fix `[foo](<\>)` resulting in `\` in href +- Fix multiple of 3 rule for emphasis parsing (see commonmark/cmark#177) +- Fix text node merging when opening/closing delimiters are adjacent (#96) +- autolink: Fix linking of URLs without host, e.g. `http://.` (#99) + +## [0.9.0] - 2017-03-03 +### Added +- Support restricting which block types are parsed, see `enabledBlockTypes` + method on `Parser.Builder` (#43), thanks @marksliva, @pivotal-graham-bell and + @lalunamel. This allows you to disable parsing of e.g. headings, they will + just be parsed as paragraphs instead. +- Allow customizing the inline parser, see `inlineParserFactory` method on + `Parser.Builder` (#68), thanks @vreynolds and @lalunamel. Note that this is + experimental and currently requires using internal classes. 
+### Changed +- Wrap escaped HTML blocks in a `<p>` tag (#78) +- Add missing `ext-heading-anchor` to `dependencyManagement` in parent pom, + thanks @drobert + +## [0.8.0] - 2016-12-09 +### Changed +- Update to CommonMark spec 0.27 (#73): + - Treat h2..h6 as HTML blocks as well + - Allow shortcut reference link before open parenthesis (if parenthesis is not + part of a valid inline link) +- `AttributeProvider.setAttributes` now has an additional `tagName` argument and + is called for all HTML tags of a block. This allows users to add attributes + for the `pre` tag of a code block in addition to `code`. Also added attribute + provider support for additional HTML tags, namely `em`, `strong`, `code` and + `br`. (#74) +### Fixed +- ext-heading-anchor: Fix IllegalArgumentException on Android (#71) + +## [0.7.1] - 2016-10-05 +### Added +- Allow to configure prefix/suffix for ID on `HeadingAnchorExtension` (#66), + thanks @paulthom12345 + +## [0.7.0] - 2016-09-23 +### Added +- Plain text content renderer (#58), thanks to @JinneeJ! + - Renders a plain text representation of a document instead of HTML, see + `TextContentRenderer` in core. + - Extensible in the same way as HTML rendering. +- Heading anchor extension (#26), thanks to @paulthom12345! + - Adds "id" attribute to heading tags (e.g. `<h1 id="heading">Heading</h1>`), + useful for linking to sections of a document. + - ID generation logic can also be used by itself via the `IdGenerator` class. + - Use class `HeadingAnchorExtension` in artifact `commonmark-ext-heading-anchor` +- Ins (underline) extension (#54), thanks to @pabranch! + - Enables underlining of text by enclosing it in `++`. It's rendered as an + `ins` tag in HTML. + - Use class `InsExtension` in artifact `commonmark-ext-ins`. +### Changed +- `HtmlRenderer` and related classes moved from `org.commonmark.html` to + `org.commonmark.renderer.html` +- `HtmlRenderer.Builder` no longer takes an `AttributeProvider`, but uses an + `AttributeProviderFactory` to instantiate a new provider for each rendering. + Code needs to be changed to create a factory and then return the existing + provider from its `create` method, similar to node renderers. +- `NodeRendererFactory` was renamed to `HtmlNodeRendererFactory`, same for + related classes (there's a corresponding interface for text content rendering) + +## [0.6.0] - 2016-07-25 +### Added +- Add coverage data to build. Currently at 97 %. +### Changed +- Update to CommonMark spec 0.26 (#55) + - empty list items can no longer interrupt a paragraph; this resolves an + ambiguity with setext headers + - ordered lists can interrupt a paragraph only when beginning with 1 + - the two-blank-lines-breaks-out-of-lists rule has been removed + - the spec for emphasis and strong emphasis has been refined to give more + intuitive results in some cases + - tabs can be used after the # in an ATX header and between the markers in a + thematic break +- Simplify and speed up brackets processing (links/images) + - Improves the nested brackets pathological case (e.g. 
`[[[[a]]]]` with a lot + of brackets) + - Also contributed these changes upstream to + [commonmark.js](https://talk.commonmark.org/t/ann-commonmark-0-26-cmark-0-26-0-commonmark-js-0-26-0/2165) +- Simplify merging of adjacent text nodes +- Extended `DelimiterProcessor` interface so that implementations get more + information in `getDelimiterUse` and can reject delimiters by returning `0` + from it. Also rename the methods: + - `getOpeningDelimiterChar` -> `getOpeningCharacter` + - `getClosingDelimiterChar` -> `getClosingCharacter` + - `getMinDelimiterCount` -> `getMinLength` +### Fixed +- Fix max length for link labels (999, not 1000) +- autolink: Stop URLs at more invalid characters, notably '<' and '>'. + According to RFC 3987, angle brackets are not allowed in URLs, and + other linkers don't seem to allow them either. + +## [0.5.1] - 2016-05-25 +### Fixed +- Fix `StringIndexOutOfBoundsException` on line after tab (#52) + +## [0.5.0] - 2016-04-22 +### Added +- Add YAML front matter extension for document metadata blocks (#24), thanks to + @chiwanpark +- Add information about delimiter character and length to delimiter nodes (#10), + thanks to @pcj +- Make HTML rendering for nodes extensible (#35) +- Add support for asymmetric delimiters (#17): + `DelimiterProcessor#getDelimiterChar` was split into `getOpeningDelimiterChar` + and `getClosingDelimiterChar` +### Changed +- Make `AttributeProvider` work for image and table nodes (#31) +- Update to CommonMark spec 0.25: + - Changes how partially consumed tabs are handled. 
+- Add Android test project to build so that we won't break Android support + (#38), thanks to @JinneeJ +- Replace `CustomHtmlRenderer` with `NodeRenderer` which also allows overriding + rendering for built-in node types (#35) +### Fixed +- Fix blank line after empty list item to terminate list +- Fix nested bullet list indented with mix of tab and spaces (#41), thanks to + @derari +- Fix package name in Javadoc, thanks to @jiakuan +- autolink: Treat more special characters as trailing delimiters to not include + `">`, `"/>` and `");` at the end of URLs +- autolink: Fix unexpected link end with unfinished delimiter pairs in URLs +- autolink: Fix Android incompatibility by not using `java.util.Objects` + +## [0.4.1] - 2016-02-11 +### Fixed +- Fix problematic regex that doesn't work on some Java versions and Android +- Fix problems with Android (usage of `java.util.Objects`, `StandardCharsets`, + ProGuard, see #30), thanks to @JinneeJ! +### Changed +- autolink extension: Update to autolink 0.3.0. 
This stops recognizing + "abc://foo" within "1abc://foo" as a link + +## [0.4.0] - 2016-01-18 +### Changed +Update to CommonMark spec 0.24 (#28): +- No longer allow whitespace between link text and link label +- Don't allow whitespace in link destination even with <> +- Don't use whitelist for schemes in autolinks, recognize all 2-32 length + schemes (see [spec](http://spec.commonmark.org/0.24/#scheme)) +- Allow multi-line content in setext headings + +API breaking changes (caused by changes in spec): +- Rename `Header` to `Heading` +- Rename `HorizontalRule` to `ThematicBreak` +- Rename `HtmlTag` to `HtmlInline` +- Replace `MatchedBlockParser#getParagraphStartLine` with `#getParagraphContent` + that returns the current content if the matched block is a paragraph + +## [0.3.2] - 2016-01-07 +### Fixed +- Add more bounds checks to internal Substring class (might affect extensions) + +## [0.3.1] - 2015-12-01 +### Fixed +- Fix StringIndexOutOfBoundsException with unclosed inline link (#27) + +## [0.3.0] - 2015-10-15 +### Changed +- Update to spec 0.22 (#14) +- Allow block parsers from extensions to override core behavior (#18) +- Fix compilation without `install` (#19) +- Parent pom, build and README updates + +## [0.2.0] - 2015-08-20 +### Added +- Add method `Node parseReader(java.io.Reader)` to `Parser` (#2) +- Extend Javadoc and publish online (#4) +### Fixed +- Fix StringIndexOutOfBoundsException on some inputs (#13) +- ext-gfm-tables: Implement single-column tables (#7) + +## [0.1.0] - 2015-07-22 +### Added +Initial release of commonmark-java, a port of commonmark.js with extensions +for autolinking URLs, GitHub flavored strikethrough and tables. 
+ +[Unreleased]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.28.0...main +[0.28.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.27.1...commonmark-parent-0.28.0 +[0.27.1]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.27.0...commonmark-parent-0.27.1 +[0.27.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.26.0...commonmark-parent-0.27.0 +[0.26.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.25.1...commonmark-parent-0.26.0 +[0.25.1]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.25.0...commonmark-parent-0.25.1 +[0.25.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.24.0...commonmark-parent-0.25.0 +[0.24.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.23.0...commonmark-parent-0.24.0 +[0.23.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.22.0...commonmark-parent-0.23.0 +[0.22.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.21.0...commonmark-parent-0.22.0 +[0.21.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.20.0...commonmark-parent-0.21.0 +[0.20.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.19.0...commonmark-parent-0.20.0 +[0.19.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.18.2...commonmark-parent-0.19.0 +[0.18.2]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.18.1...commonmark-parent-0.18.2 +[0.18.1]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.18.0...commonmark-parent-0.18.1 +[0.18.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.17.2...commonmark-parent-0.18.0 +[0.17.2]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.17.1...commonmark-parent-0.17.2 +[0.17.1]: 
https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.17.0...commonmark-parent-0.17.1 +[0.17.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.16.1...commonmark-parent-0.17.0 +[0.16.1]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.15.2...commonmark-parent-0.16.1 +[0.15.2]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.15.1...commonmark-parent-0.15.2 +[0.15.1]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.15.0...commonmark-parent-0.15.1 +[0.15.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.14.0...commonmark-parent-0.15.0 +[0.14.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.13.1...commonmark-parent-0.14.0 +[0.13.1]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.13.0...commonmark-parent-0.13.1 +[0.13.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.12.1...commonmark-parent-0.13.0 +[0.12.1]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.11.0...commonmark-parent-0.12.1 +[0.11.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.10.0...commonmark-parent-0.11.0 +[0.10.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.9.0...commonmark-parent-0.10.0 +[0.9.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.8.0...commonmark-parent-0.9.0 +[0.8.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.7.1...commonmark-parent-0.8.0 +[0.7.1]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.7.0...commonmark-parent-0.7.1 +[0.7.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.6.0...commonmark-parent-0.7.0 +[0.6.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.5.1...commonmark-parent-0.6.0 +[0.5.1]: 
https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.5.0...commonmark-parent-0.5.1 +[0.5.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.4.1...commonmark-parent-0.5.0 +[0.4.1]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.4.0...commonmark-parent-0.4.1 +[0.4.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.3.2...commonmark-parent-0.4.0 +[0.3.2]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.3.1...commonmark-parent-0.3.2 +[0.3.1]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.3.0...commonmark-parent-0.3.1 +[0.3.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.2.0...commonmark-parent-0.3.0 +[0.2.0]: https://github.com/commonmark/commonmark-java/compare/commonmark-parent-0.1.0...commonmark-parent-0.2.0 +[0.1.0]: https://github.com/commonmark/commonmark-java/commits/commonmark-parent-0.1.0 diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..ce3a9d1d8 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,27 @@ +# Contributor Code of Conduct + +As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities. + +We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, religion, or nationality. 
+ +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery +* Personal attacks +* Trolling or insulting/derogatory comments +* Public or private harassment +* Publishing other's private information, such as physical or electronic addresses, without explicit permission +* Submitting contributions or comments that you know to violate the intellectual property or privacy rights of others +* Other unethical or unprofessional conduct + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. +By adopting this Code of Conduct, project maintainers commit themselves to fairly and consistently applying these principles to every aspect of managing this project. Project maintainers who do not follow or enforce the Code of Conduct may be permanently removed from the project team. + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting a project maintainer. Complaints will result in a response and be reviewed and investigated in a way that is deemed necessary and appropriate to the circumstances. Maintainers are obligated to maintain confidentiality with regard to the reporter of an incident. 
+ +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.3.0, available at [http://contributor-covenant.org/version/1/3/0/][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/3/0/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..6bb2f1640 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,30 @@ +Contributing +============ + +Hey, thanks for your interest in contributing to this library! We welcome any +type of pull request, issues and comments! 😀 + +For pull requests, make sure you: + +* Add tests for new features and bug fixes +* Follow the existing style (always use braces, 4 space indent) +* Separate unrelated changes into multiple pull requests + +If you are interested in working on something but don't know what, see the +existing issues with label "help wanted". + +For bigger changes, make sure you start a discussion first by creating +an issue and explaining the intended change. + +The [sourcespy dashboard](https://sourcespy.com/github/commonmarkcommonmarkjava/) +provides a high level overview of the repository including +[class diagram](https://sourcespy.com/github/commonmarkcommonmarkjava/xx-omodel-.html), +[module dependencies](https://sourcespy.com/github/commonmarkcommonmarkjava/xx-omodulesc-.html), +[module hierarchy](https://sourcespy.com/github/commonmarkcommonmarkjava/xx-omodules-.html), +[external libraries](https://sourcespy.com/github/commonmarkcommonmarkjava/xx-ojavalibs-.html), +and other components of the system. + +Releasing +--------- + +Releases are done from the "release" workflow on GitHub Actions. diff --git a/LICENSE.txt b/LICENSE.txt index b09e367ce..604b777d3 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2015, Atlassian Pty Ltd +Copyright (c) 2015, Robin Stocker All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index 101d847c2..845226729 100644 --- a/README.md +++ b/README.md @@ -1,60 +1,279 @@ commonmark-java =============== -Java implementation of [CommonMark], a specification of the [Markdown] format for turning plain text into formatted text. -Parses input to an AST (tree of nodes) and then renders to HTML. +Java library for parsing and rendering [Markdown] text according to the +[CommonMark] specification (and some extensions). -This started out as a port of [commonmark.js] and has evolved into a full -library with a nice Java API and some optional extensions. Features: +[![Maven Central status](https://img.shields.io/maven-central/v/org.commonmark/commonmark.svg)](https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.commonmark%22) +[![javadoc](https://www.javadoc.io/badge/org.commonmark/commonmark.svg?color=blue)](https://www.javadoc.io/doc/org.commonmark/commonmark) +[![ci](https://github.com/commonmark/commonmark-java/workflows/ci/badge.svg)](https://github.com/commonmark/commonmark-java/actions?query=workflow%3Aci) +[![codecov](https://codecov.io/gh/commonmark/commonmark-java/branch/main/graph/badge.svg)](https://codecov.io/gh/commonmark/commonmark-java) +[![SourceSpy Dashboard](https://sourcespy.com/shield.svg)](https://sourcespy.com/github/commonmarkcommonmarkjava/) -* Small with minimal dependencies -* Extensible (see below) -* Fast (10-20 times faster than pegdown, see benchmarks in repo) +Introduction +------------ + +Provides classes for parsing input to an abstract syntax tree (AST), +visiting and manipulating nodes, and rendering to HTML or back to Markdown. 
+It started out as a port of [commonmark.js], but has since evolved into an +extensible library with the following features: -Requirements: +* Small (core has no dependencies, extensions in separate artifacts) +* Fast (10-20 times faster than [pegdown] which used to be a popular Markdown + library, see benchmarks in repo) +* Flexible (manipulate the AST after parsing, customize HTML rendering) +* Extensible (tables, strikethrough, autolinking and more, see below) -* Java 7 or above -* The core has no dependencies; for extensions, see below +The library is supported on Java 11 and later. It works on Android too, +but that is on a best-effort basis, please report problems. For Android the +minimum API level is 19, see the +[commonmark-android-test](commonmark-android-test) +directory. Coordinates for core library (see all on [Maven Central]): ```xml - com.atlassian.commonmark + org.commonmark commonmark - 0.1.0 + 0.28.0 ``` +The module names to use in Java 9 are `org.commonmark`, +`org.commonmark.ext.autolink`, etc, corresponding to package names. + Note that for 0.x releases of this library, the API is not considered stable yet and may break between minor releases. After 1.0, [Semantic Versioning] will -be followed. +be followed. A package containing `beta` means it's not subject to stable API +guarantees yet; but for normal usage it should not be necessary to use. -[![Build status](https://travis-ci.org/atlassian/commonmark-java.svg?branch=master)](https://travis-ci.org/atlassian/commonmark-java) +See the [spec.txt](commonmark-test-util/src/main/resources/spec.txt) +file if you're wondering which version of the spec is currently +implemented. Also check out the [CommonMark dingus] for getting familiar +with the syntax or trying out edge cases. If you clone the repository, +you can also use the `DingusApp` class to try out things interactively. 
Usage ----- -Basic example: +#### Parse and render to HTML ```java -import org.commonmark.html.HtmlRenderer; -import org.commonmark.node.Node; +import org.commonmark.node.*; import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; + +Parser parser = Parser.builder().build(); +Node document = parser.parse("This is *Markdown*"); +HtmlRenderer renderer = HtmlRenderer.builder().build(); +renderer.render(document); // "

<p>This is <em>Markdown</em></p>

\n" +``` + +This uses the parser and renderer with default options. Both builders have +methods for configuring their behavior: + +* `escapeHtml(true)` on `HtmlRenderer` will escape raw HTML tags and blocks. +* `sanitizeUrls(true)` on `HtmlRenderer` will strip potentially unsafe URLs + from `
<a>` and `<img>` tags +* For all available options, see methods on the builders. + +Note that this library doesn't try to sanitize the resulting HTML with regards +to which tags are allowed, etc. That is the responsibility of the caller, and +if you expose the resulting HTML, you probably want to run a sanitizer on it +after this. + +#### Render to Markdown + +```java +import org.commonmark.node.*; +import org.commonmark.renderer.markdown.MarkdownRenderer; + +MarkdownRenderer renderer = MarkdownRenderer.builder().build(); +Node document = new Document(); +Heading heading = new Heading(); +heading.setLevel(2); +heading.appendChild(new Text("My title")); +document.appendChild(heading); + +renderer.render(document); // "## My title\n" +``` + +For rendering to plain text with minimal markup, there's also `TextContentRenderer`. + +#### Use a visitor to process parsed nodes + +After the source text has been parsed, the result is a tree of nodes. +That tree can be modified before rendering, or just inspected without +rendering: + +```java +Node node = parser.parse("Example\n=======\n\nSome more text"); +WordCountVisitor visitor = new WordCountVisitor(); +node.accept(visitor); +visitor.wordCount; // 4 + +class WordCountVisitor extends AbstractVisitor { + int wordCount = 0; + + @Override + public void visit(Text text) { + // This is called for all Text nodes. Override other visit methods for other node types. + + // Count words (this is just an example, don't actually do it this way for various reasons). + wordCount += text.getLiteral().split("\\W+").length; + + // Descend into children (could be omitted in this case because Text nodes don't have children).
+ visitChildren(text); + } +} +``` + +#### Source positions + +If you want to know where a parsed `Node` appeared in the input source text, +you can request the parser to return source positions like this: + +```java +var parser = Parser.builder().includeSourceSpans(IncludeSourceSpans.BLOCKS_AND_INLINES).build(); +``` + +Then parse nodes and inspect source positions: + +```java +var source = "foo\n\nbar *baz*"; +var doc = parser.parse(source); +var emphasis = doc.getLastChild().getLastChild(); +var s = emphasis.getSourceSpans().get(0); +s.getLineIndex(); // 2 (third line) +s.getColumnIndex(); // 4 (fifth column) +s.getInputIndex(); // 9 (string index 9) +s.getLength(); // 5 +source.substring(s.getInputIndex(), s.getInputIndex() + s.getLength()); // "*baz*" +``` + +If you're only interested in blocks and not inlines, use `IncludeSourceSpans.BLOCKS`. + +#### Add or change attributes of HTML elements + +Sometimes you might want to customize how HTML is rendered. If all you +want to do is add or change attributes on some elements, there's a +simple way to do that. + +In this example, we register a factory for an `AttributeProvider` on the +renderer to set a `class="border"` attribute on `img` elements. + +```java +Parser parser = Parser.builder().build(); +HtmlRenderer renderer = HtmlRenderer.builder() + .attributeProviderFactory(new AttributeProviderFactory() { + public AttributeProvider create(AttributeProviderContext context) { + return new ImageAttributeProvider(); + } + }) + .build(); + +Node document = parser.parse("![text](/url.png)"); +renderer.render(document); +// "

<p><img src=\"/url.png\" alt=\"text\" class=\"border\" /></p>

\n" + +class ImageAttributeProvider implements AttributeProvider { + @Override + public void setAttributes(Node node, String tagName, Map attributes) { + if (node instanceof Image) { + attributes.put("class", "border"); + } + } +} +``` +#### Customize HTML rendering + +If you want to do more than just change attributes, there's also a way +to take complete control over how HTML is rendered. + +In this example, we're changing the rendering of indented code blocks to +only wrap them in `pre` instead of `pre` and `code`: + +```java Parser parser = Parser.builder().build(); -Node document = parser.parse("This is *Sparta*"); -HtmlRenderer renderer = HtmlRenderer.builder().escapeHtml(true).build(); -renderer.render(document); // "

<p>This is <em>Sparta</em></p>

\n" +HtmlRenderer renderer = HtmlRenderer.builder() + .nodeRendererFactory(new HtmlNodeRendererFactory() { + public NodeRenderer create(HtmlNodeRendererContext context) { + return new IndentedCodeBlockNodeRenderer(context); + } + }) + .build(); + +Node document = parser.parse("Example:\n\n code"); +renderer.render(document); +// "

<p>Example:</p>\n<pre>code\n</pre>
\n" + +class IndentedCodeBlockNodeRenderer implements NodeRenderer { + + private final HtmlWriter html; + + IndentedCodeBlockNodeRenderer(HtmlNodeRendererContext context) { + this.html = context.getWriter(); + } + + @Override + public Set> getNodeTypes() { + // Return the node types we want to use this renderer for. + return Set.of(IndentedCodeBlock.class); + } + + @Override + public void render(Node node) { + // We only handle one type as per getNodeTypes, so we can just cast it here. + IndentedCodeBlock codeBlock = (IndentedCodeBlock) node; + html.line(); + html.tag("pre"); + html.text(codeBlock.getLiteral()); + html.tag("/pre"); + html.line(); + } +} ``` -This uses the parser and renderer with default options, except for escaping raw -HTML tags and blocks. For all the available options, see other methods on the -builder objects. +#### Add your own node types + +In case you want to store additional data in the document or have custom +elements in the resulting HTML, you can create your own subclass of +`CustomNode` and add instances as child nodes to existing nodes. + +To define the HTML rendering for them, you can use a `NodeRenderer` as +explained above. + +#### Customize parsing -Note that this library doesn't try to sanitize HTML; that is the responsibility -of the caller. +There are a few ways to extend parsing or even override built-in parsing, +all of them via methods on `Parser.Builder` +(see [Blocks and inlines](https://spec.commonmark.org/0.31.2/#blocks-and-inlines) in the spec for an overview of blocks/inlines): + +- Parsing of specific block types (e.g. 
headings, code blocks, etc) can be + enabled/disabled with `enabledBlockTypes` +- Parsing of blocks can be extended/overridden with `customBlockParserFactory` +- Parsing of inline content can be extended/overridden with `customInlineContentParserFactory` +- Parsing of [delimiters](https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis) in inline content can be + extended with `customDelimiterProcessor` +- Processing of links can be customized with `linkProcessor` and `linkMarker` + +#### Thread-safety + +Both the `Parser` and `HtmlRenderer` are designed so that you can +configure them once using the builders and then use them multiple +times/from multiple threads. This is done by separating the state for +parsing/rendering from the configuration. + +Having said that, there might be bugs of course. If you find one, please +report an issue. + +### API documentation + +Javadocs are available online on +[javadoc.io](https://www.javadoc.io/doc/org.commonmark/commonmark). Extensions @@ -70,9 +289,9 @@ First, add an additional dependency (see [Maven Central] for others): ```xml - com.atlassian.commonmark + org.commonmark commonmark-ext-gfm-tables - 0.1.0 + 0.28.0 ``` @@ -81,9 +300,13 @@ Then, configure the extension on the builders: ```java import org.commonmark.ext.gfm.tables.TablesExtension; -List extensions = Arrays.asList(TablesExtension.create()); -Parser parser = Parser.builder().extensions(extensions).build(); -HtmlRenderer renderer = HtmlRenderer.builder().extensions(extensions).build(); +List extensions = List.of(TablesExtension.create()); +Parser parser = Parser.builder() + .extensions(extensions) + .build(); +HtmlRenderer renderer = HtmlRenderer.builder() + .extensions(extensions) + .build(); ``` To configure another extension in the above example, just add it to the list. @@ -110,33 +333,166 @@ Enables tables using pipes as in [GitHub Flavored Markdown][gfm-tables]. Use class `TablesExtension` in artifact `commonmark-ext-gfm-tables`. 
+### Alerts -Contributing ------------- +Adds support for GitHub-style alerts (also known as callouts or admonitions) as described [here](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#alerts), e.g.: + +``` +> [!NOTE] +> The text of the note. +``` + +As types you can use NOTE, TIP, IMPORTANT, WARNING, CAUTION; or configure the extension to add additional ones. + +Use class `AlertsExtension` in artifact `commonmark-ext-gfm-alerts`. + +### Footnotes + +Enables footnotes like in [GitHub](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#footnotes) +or [Pandoc](https://pandoc.org/MANUAL.html#footnotes): + +``` +Main text[^1] + +[^1]: Additional text in a footnote +``` + +Inline footnotes like `^[inline footnote]` are also supported when enabled via `FootnotesExtension.Builder#inlineFootnotes`. + +Use class `FootnotesExtension` in artifact `commonmark-ext-footnotes`. + +### Heading anchor + +Enables adding auto generated "id" attributes to heading tags. The "id" +is based on the text of the heading. + +`# Heading` will be rendered as: + +``` +

<h1 id="heading">Heading</h1>

+``` + +Use class `HeadingAnchorExtension` in artifact `commonmark-ext-heading-anchor`. + +In case you want custom rendering of the heading instead, you can use +the `IdGenerator` class directly together with a +`HtmlNodeRendererFactory` (see example above). + +### Ins + +Enables underlining of text by enclosing it in `++`. For example, in +`hey ++you++`, `you` will be rendered as underline text. Uses the <ins> tag. + +Use class `InsExtension` in artifact `commonmark-ext-ins`. + +### YAML front matter + +Adds support for metadata through a YAML front matter block. This extension only supports a subset of YAML syntax. Here's an example of what's supported: + +``` +--- +key: value +list: + - value 1 + - value 2 +literal: | + this is literal value. + + literal values 2 +--- + +document start here +``` + +Use class `YamlFrontMatterExtension` in artifact `commonmark-ext-yaml-front-matter`. To fetch metadata, use `YamlFrontMatterVisitor`. + +### Image Attributes + +Adds support for specifying attributes (specifically height and width) for images. + +The attribute elements are given as `key=value` pairs inside curly braces `{ }` after the image node to which they apply, +for example: +``` +![text](/url.png){width=640 height=480} +``` +will be rendered as: +``` +text +``` + +Use class `ImageAttributesExtension` in artifact `commonmark-ext-image-attributes`. + +Note: since this extension uses curly braces `{` `}` as its delimiters (in `StylesDelimiterProcessor`), this means that +other delimiter processors *cannot* use curly braces for delimiting. + +### Task List Items + +Adds support for tasks as list items. -Pull requests, issues and comments welcome ☺. For pull requests: +A task can be represented as a list item where the first non-whitespace character is a left bracket `[`, then a single +whitespace character or the letter `x` in lowercase or uppercase, then a right bracket `]` followed by at least one +whitespace before any other content. 
-* Add tests for new features and bug fixes -* Follow the existing style (always use braces, 4 space indent) -* Separate unrelated changes into multiple pull requests +For example: +``` +- [ ] task #1 +- [x] task #2 +``` +will be rendered as: +``` +
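<ul> +<li><input type="checkbox" disabled=""> task #1</li> +<li><input type="checkbox" disabled="" checked=""> task #2</li> +</ul>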
+``` + +Use class `TaskListItemsExtension` in artifact `commonmark-ext-task-list-items`. + +### Third-party extensions + +You can also find other extensions in the wild: -See the existing "help wanted" issues for things to start contributing. +* [commonmark-ext-notifications](https://github.com/McFoggy/commonmark-ext-notifications): this extension allows to easily create notifications/admonitions paragraphs like `INFO`, `SUCCESS`, `WARNING` or `ERROR` -For bigger changes, make sure you start a discussion first by creating -an issue and explaining the intended change. +Used by +------- + +Some users of this library (feel free to raise a PR if you want to be added): +* [Atlassian](https://www.atlassian.com/) (where the library was initially developed) +* Java (OpenJDK) ([link](https://github.com/openjdk/jdk/blob/3895b8fc0b2c6d187080dba6fe08297adad4a480/src/jdk.internal.md/share/classes/module-info.java)) +* [Gerrit](https://www.gerritcodereview.com/) code review/Gitiles ([link](https://gerrit-review.googlesource.com/c/gitiles/+/353794)) +* [Clerk](https://clerk.vision/) moldable live programming for Clojure +* [Znai](https://github.com/testingisdocumenting/znai) +* [Open Note](https://github.com/YangDai2003/OpenNote-Compose) a markdown editor and note-taking app for Android +* [Quarkus Roq](https://github.com/quarkiverse/quarkus-roq/) The Roq Static Site Generator allows to easily create a static website or blog using Quarkus super-powers. +* [Lucee](https://github.com/lucee/lucee) +* [Previewer](https://github.com/sebthom/previewer-eclipse-plugin) an extensible Eclipse plugin that previews Markdown and other text based formats. 
+* [Xeres](https://xeres.io) a Peer-to-Peer application where all user generated content is done with markdown + +See also +-------- + +* [Markwon](https://github.com/noties/Markwon): Android library for rendering markdown as system-native Spannables +* [flexmark-java](https://github.com/vsch/flexmark-java): Fork that added support for a lot more syntax and flexibility + +Contributing +------------ +See [CONTRIBUTING.md](CONTRIBUTING.md) file. License ------- -Copyright (c) 2015 Atlassian and others. +Copyright (c) 2015, Robin Stocker BSD (2-clause) licensed, see LICENSE.txt file. -[CommonMark]: http://commonmark.org/ +[CommonMark]: https://commonmark.org/ [Markdown]: https://daringfireball.net/projects/markdown/ -[commonmark.js]: https://github.com/jgm/commonmark.js -[Maven Central]: https://search.maven.org/#search|ga|1|g%3A%22com.atlassian.commonmark%22 -[Semantic Versioning]: http://semver.org/ +[commonmark.js]: https://github.com/commonmark/commonmark.js +[pegdown]: https://github.com/sirthias/pegdown +[CommonMark Dingus]: https://spec.commonmark.org/dingus/ +[Maven Central]: https://search.maven.org/#search|ga|1|g%3A%22org.commonmark%22 +[Semantic Versioning]: https://semver.org/ [autolink-java]: https://github.com/robinst/autolink-java -[gfm-tables]: https://help.github.com/articles/github-flavored-markdown/#tables +[gfm-tables]: https://help.github.com/articles/organizing-information-with-tables/ diff --git a/commonmark-android-test/.gitignore b/commonmark-android-test/.gitignore new file mode 100644 index 000000000..b738a69b4 --- /dev/null +++ b/commonmark-android-test/.gitignore @@ -0,0 +1,5 @@ +.gradle +local.properties +test.properties +gradle.properties +build diff --git a/commonmark-android-test/README.md b/commonmark-android-test/README.md new file mode 100644 index 000000000..0fb792ae3 --- /dev/null +++ b/commonmark-android-test/README.md @@ -0,0 +1,43 @@ +commonmark-android-test +======================= + +This module ensures that commonmark-java 
is supported on Android by running `lint` checks on library sources. +Current `minSdk` is 19. + +Requirements: + +* Java 11 or above +* Android SDK 30 + +Configuration +----- + +1. Download Android SDK +2. Be sure that SDK Platform 30 is installed. It's recommended to use x86 +3. Export to PATH: `path_to_android_sdk/platform-tools` and `path_to_android_sdk/tools` +4. Create 2 properties files in commonmark-android-test + +/local.properties +```properties +sdk.dir=/path_to_android_sdk +``` + +Usage +----- + +#### Run lint checks + +on Mac/Linux: +```shell +./gradlew :app:lint +``` + +on Windows: +```bat +.\gradlew :app:lint +``` + +Links +----- +[Gradle Documentation](https://docs.gradle.org/current/userguide/userguide.html) +[Android Gradle Plugin Docs](http://tools.android.com/tech-docs/new-build-system) diff --git a/commonmark-android-test/app/build.gradle b/commonmark-android-test/app/build.gradle new file mode 100644 index 000000000..fd8ae34cb --- /dev/null +++ b/commonmark-android-test/app/build.gradle @@ -0,0 +1,52 @@ +apply plugin: 'com.android.application' + +android { + namespace "org.commonmark.android.test" + compileSdk 30 + + defaultConfig { + applicationId "org.commonmark.android.test" + minSdk 19 + targetSdk 30 + versionCode 1 + versionName "1.0" + } + + compileOptions { + sourceCompatibility JavaVersion.VERSION_11 + targetCompatibility JavaVersion.VERSION_11 + } + + packagingOptions { + exclude 'META-INF/LICENSE' + exclude 'META-INF/LICENSE.txt' + exclude 'META-INF/NOTICE' + exclude 'META-INF/NOTICE.txt' + } + + // we add other modules sources in order for lint to process them (lint operates on sources) + sourceSets { + main { + java { + [ + '../../commonmark', + '../../commonmark-ext-autolink', + '../../commonmark-ext-gfm-strikethrough', + '../../commonmark-ext-gfm-tables', + '../../commonmark-ext-heading-anchor', + '../../commonmark-ext-ins', + '../../commonmark-ext-yaml-front-matter' + ].forEach { d -> + // don't include module-info files,
otherwise we get + // "too many module declarations found" + PatternSet patternSet = new PatternSet().exclude('**/module-info.java') + srcDirs += fileTree("$d/src/main/java").matching(patternSet) + } + } + } + } +} + +dependencies { + implementation('org.nibor.autolink:autolink:0.11.0') +} diff --git a/commonmark-android-test/app/lint.xml b/commonmark-android-test/app/lint.xml new file mode 100644 index 000000000..3507f11d5 --- /dev/null +++ b/commonmark-android-test/app/lint.xml @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/commonmark-android-test/app/src/main/AndroidManifest.xml b/commonmark-android-test/app/src/main/AndroidManifest.xml new file mode 100644 index 000000000..486520569 --- /dev/null +++ b/commonmark-android-test/app/src/main/AndroidManifest.xml @@ -0,0 +1,3 @@ + + + diff --git a/commonmark-android-test/build.gradle b/commonmark-android-test/build.gradle new file mode 100644 index 000000000..f359e8154 --- /dev/null +++ b/commonmark-android-test/build.gradle @@ -0,0 +1,22 @@ +buildscript { + repositories { + mavenCentral() + google() + } + dependencies { + classpath 'com.android.tools.build:gradle:7.4.2' + } +} + +allprojects { + repositories { + mavenCentral() + google() + } +} + +task clean(type: Delete) { + delete rootProject.buildDir +} + + diff --git a/commonmark-android-test/gradle/wrapper/gradle-wrapper.jar b/commonmark-android-test/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 000000000..d64cd4917 Binary files /dev/null and b/commonmark-android-test/gradle/wrapper/gradle-wrapper.jar differ diff --git a/commonmark-android-test/gradle/wrapper/gradle-wrapper.properties b/commonmark-android-test/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 000000000..a80b22ce5 --- /dev/null +++ b/commonmark-android-test/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,7 @@ +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists 
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip +networkTimeout=10000 +validateDistributionUrl=true +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists diff --git a/commonmark-android-test/gradlew b/commonmark-android-test/gradlew new file mode 100755 index 000000000..1aa94a426 --- /dev/null +++ b/commonmark-android-test/gradlew @@ -0,0 +1,249 @@ +#!/bin/sh + +# +# Copyright © 2015-2021 the original authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +# +# Gradle start up script for POSIX generated by Gradle. +# +# Important for running: +# +# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is +# noncompliant, but you have some other compliant shell such as ksh or +# bash, then to run this script, type that shell name before the whole +# command line, like: +# +# ksh Gradle +# +# Busybox and similar reduced shells will NOT work, because this script +# requires all of these POSIX shell features: +# * functions; +# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», +# «${var#prefix}», «${var%suffix}», and «$( cmd )»; +# * compound commands having a testable exit status, especially «case»; +# * various built-in commands including «command», «set», and «ulimit». 
+# +# Important for patching: +# +# (2) This script targets any POSIX shell, so it avoids extensions provided +# by Bash, Ksh, etc; in particular arrays are avoided. +# +# The "traditional" practice of packing multiple parameters into a +# space-separated string is a well documented source of bugs and security +# problems, so this is (mostly) avoided, by progressively accumulating +# options in "$@", and eventually passing that to Java. +# +# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, +# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; +# see the in-line comments for details. +# +# There are tweaks for specific operating systems such as AIX, CygWin, +# Darwin, MinGW, and NonStop. +# +# (3) This script is generated from the Groovy template +# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# within the Gradle project. +# +# You can find Gradle at https://github.com/gradle/gradle/. +# +############################################################################## + +# Attempt to set APP_HOME + +# Resolve links: $0 may be a link +app_path=$0 + +# Need this for daisy-chained symlinks. +while + APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path + [ -h "$app_path" ] +do + ls=$( ls -ld "$app_path" ) + link=${ls#*' -> '} + case $link in #( + /*) app_path=$link ;; #( + *) app_path=$APP_HOME$link ;; + esac +done + +# This is normally unused +# shellcheck disable=SC2034 +APP_BASE_NAME=${0##*/} +# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) +APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD=maximum + +warn () { + echo "$*" +} >&2 + +die () { + echo + echo "$*" + echo + exit 1 +} >&2 + +# OS specific support (must be 'true' or 'false'). 
+cygwin=false +msys=false +darwin=false +nonstop=false +case "$( uname )" in #( + CYGWIN* ) cygwin=true ;; #( + Darwin* ) darwin=true ;; #( + MSYS* | MINGW* ) msys=true ;; #( + NONSTOP* ) nonstop=true ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD=$JAVA_HOME/jre/sh/java + else + JAVACMD=$JAVA_HOME/bin/java + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD=java + if ! command -v java >/dev/null 2>&1 + then + die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +fi + +# Increase the maximum file descriptors if we can. +if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then + case $MAX_FD in #( + max*) + # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC2039,SC3045 + MAX_FD=$( ulimit -H -n ) || + warn "Could not query maximum file descriptor limit" + esac + case $MAX_FD in #( + '' | soft) :;; #( + *) + # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC2039,SC3045 + ulimit -n "$MAX_FD" || + warn "Could not set maximum file descriptor limit to $MAX_FD" + esac +fi + +# Collect all arguments for the java command, stacking in reverse order: +# * args from the command line +# * the main class name +# * -classpath +# * -D...appname settings +# * --module-path (only if needed) +# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. 
+ +# For Cygwin or MSYS, switch paths to Windows format before running java +if "$cygwin" || "$msys" ; then + APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) + CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) + + JAVACMD=$( cygpath --unix "$JAVACMD" ) + + # Now convert the arguments - kludge to limit ourselves to /bin/sh + for arg do + if + case $arg in #( + -*) false ;; # don't mess with options #( + /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath + [ -e "$t" ] ;; #( + *) false ;; + esac + then + arg=$( cygpath --path --ignore --mixed "$arg" ) + fi + # Roll the args list around exactly as many times as the number of + # args, so each arg winds up back in the position where it started, but + # possibly modified. + # + # NB: a `for` loop captures its iteration list before it begins, so + # changing the positional parameters here affects neither the number of + # iterations, nor the values presented in `arg`. + shift # remove old arg + set -- "$@" "$arg" # push replacement arg + done +fi + + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Collect all arguments for the java command: +# * DEFAULT_JVM_OPTS, JAVA_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments, +# and any embedded shellness will be escaped. +# * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be +# treated as '${Hostname}' itself on the command line. + +set -- \ + "-Dorg.gradle.appname=$APP_BASE_NAME" \ + -classpath "$CLASSPATH" \ + org.gradle.wrapper.GradleWrapperMain \ + "$@" + +# Stop when "xargs" is not available. +if ! command -v xargs >/dev/null 2>&1 +then + die "xargs is not available" +fi + +# Use "xargs" to parse quoted args. +# +# With -n1 it outputs one arg per line, with the quotes and backslashes removed. 
+# +# In Bash we could simply go: +# +# readarray ARGS < <( xargs -n1 <<<"$var" ) && +# set -- "${ARGS[@]}" "$@" +# +# but POSIX shell has neither arrays nor command substitution, so instead we +# post-process each arg (as a line of input to sed) to backslash-escape any +# character that might be a shell metacharacter, then use eval to reverse +# that process (while maintaining the separation between arguments), and wrap +# the whole thing up as a single "set" statement. +# +# This will of course break if any of these variables contains a newline or +# an unmatched quote. +# + +eval "set -- $( + printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | + xargs -n1 | + sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | + tr '\n' ' ' + )" '"$@"' + +exec "$JAVACMD" "$@" diff --git a/commonmark-android-test/gradlew.bat b/commonmark-android-test/gradlew.bat new file mode 100644 index 000000000..7101f8e46 --- /dev/null +++ b/commonmark-android-test/gradlew.bat @@ -0,0 +1,92 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. +@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. 
+@rem + +@if "%DEBUG%"=="" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%"=="" set DIRNAME=. +@rem This is normally unused +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. +for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if %ERRORLEVEL% equ 0 goto execute + +echo. 1>&2 +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2 +echo. 1>&2 +echo Please set the JAVA_HOME variable in your environment to match the 1>&2 +echo location of your Java installation. 1>&2 + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto execute + +echo. 1>&2 +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2 +echo. 1>&2 +echo Please set the JAVA_HOME variable in your environment to match the 1>&2 +echo location of your Java installation. 
1>&2 + +goto fail + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* + +:end +@rem End local scope for the variables with windows NT shell +if %ERRORLEVEL% equ 0 goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +set EXIT_CODE=%ERRORLEVEL% +if %EXIT_CODE% equ 0 set EXIT_CODE=1 +if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% +exit /b %EXIT_CODE% + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/commonmark-android-test/settings.gradle b/commonmark-android-test/settings.gradle new file mode 100644 index 000000000..e7b4def49 --- /dev/null +++ b/commonmark-android-test/settings.gradle @@ -0,0 +1 @@ +include ':app' diff --git a/commonmark-ext-autolink/pom.xml b/commonmark-ext-autolink/pom.xml index 26248b6a9..2cc4d53ca 100644 --- a/commonmark-ext-autolink/pom.xml +++ b/commonmark-ext-autolink/pom.xml @@ -2,35 +2,33 @@ 4.0.0 - com.atlassian.commonmark + org.commonmark commonmark-parent - 0.1.1-SNAPSHOT + 0.28.1-SNAPSHOT commonmark-ext-autolink commonmark-java extension for autolinking commonmark-java extension for turning plain URLs and email addresses into links + + 0.12.0 + + - com.atlassian.commonmark + org.commonmark commonmark org.nibor.autolink autolink - 0.2.0 + ${autolink.version} - junit - junit - test - - - com.atlassian.commonmark - commonmark - test-jar + org.commonmark + commonmark-test-util test diff --git a/commonmark-ext-autolink/src/main/java/module-info.java b/commonmark-ext-autolink/src/main/java/module-info.java new file mode 100644 index 000000000..561934b85 --- /dev/null +++ b/commonmark-ext-autolink/src/main/java/module-info.java @@ -0,0 +1,6 @@ +module org.commonmark.ext.autolink { + exports 
org.commonmark.ext.autolink; + + requires transitive org.commonmark; + requires org.nibor.autolink; +} diff --git a/commonmark-ext-autolink/src/main/java/org/commonmark/ext/autolink/AutolinkExtension.java b/commonmark-ext-autolink/src/main/java/org/commonmark/ext/autolink/AutolinkExtension.java index 0dc7d0e84..7d5a74f30 100644 --- a/commonmark-ext-autolink/src/main/java/org/commonmark/ext/autolink/AutolinkExtension.java +++ b/commonmark-ext-autolink/src/main/java/org/commonmark/ext/autolink/AutolinkExtension.java @@ -1,20 +1,91 @@ package org.commonmark.ext.autolink; +import java.util.EnumSet; +import java.util.Set; + import org.commonmark.Extension; +import org.commonmark.ext.autolink.internal.AutolinkPostProcessor; import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +/** + * Extension for automatically turning plain URLs and email addresses into links. + *

+ * Create it with {@link #create()} and then configure it on the builders + * ({@link org.commonmark.parser.Parser.Builder#extensions(Iterable)}, + * {@link HtmlRenderer.Builder#extensions(Iterable)}). + *

+ *

+ * The parsed links are turned into normal {@link org.commonmark.node.Link} nodes. + *

+ */ public class AutolinkExtension implements Parser.ParserExtension { - private AutolinkExtension() { + private final Set linkTypes; + + private AutolinkExtension(Builder builder) { + this.linkTypes = builder.linkTypes; } + /** + * @return the extension with default options + */ public static Extension create() { - return new AutolinkExtension(); + return builder().build(); + } + + /** + * @return a builder to configure the behavior of the extension. + */ + public static Builder builder() { + return new Builder(); } @Override public void extend(Parser.Builder parserBuilder) { - parserBuilder.postProcessor(new AutolinkPostProcessor()); + parserBuilder.postProcessor(new AutolinkPostProcessor(linkTypes)); } + public static class Builder { + + private Set linkTypes = EnumSet.allOf(AutolinkType.class); + + /** + * @param linkTypes the link types that should be converted. By default, + * all {@link AutolinkType}s are converted. + * @return {@code this} + */ + public Builder linkTypes(AutolinkType... linkTypes) { + if (linkTypes == null) { + throw new NullPointerException("linkTypes must not be null"); + } + + return this.linkTypes(Set.of(linkTypes)); + } + + /** + * @param linkTypes the link types that should be converted. By default, + * all {@link AutolinkType}s are converted. 
+ * @return {@code this} + */ + public Builder linkTypes(Set linkTypes) { + if (linkTypes == null) { + throw new NullPointerException("linkTypes must not be null"); + } + + if (linkTypes.isEmpty()) { + throw new IllegalArgumentException("linkTypes must not be empty"); + } + + this.linkTypes = EnumSet.copyOf(linkTypes); + return this; + } + + /** + * @return a configured extension + */ + public Extension build() { + return new AutolinkExtension(this); + } + } } diff --git a/commonmark-ext-autolink/src/main/java/org/commonmark/ext/autolink/AutolinkPostProcessor.java b/commonmark-ext-autolink/src/main/java/org/commonmark/ext/autolink/AutolinkPostProcessor.java deleted file mode 100644 index bfbb7cf3a..000000000 --- a/commonmark-ext-autolink/src/main/java/org/commonmark/ext/autolink/AutolinkPostProcessor.java +++ /dev/null @@ -1,78 +0,0 @@ -package org.commonmark.ext.autolink; - -import org.commonmark.node.*; -import org.commonmark.parser.PostProcessor; -import org.nibor.autolink.LinkExtractor; -import org.nibor.autolink.LinkSpan; -import org.nibor.autolink.LinkType; - -import java.util.EnumSet; - -public class AutolinkPostProcessor implements PostProcessor { - - private LinkExtractor linkExtractor = LinkExtractor.builder() - .linkTypes(EnumSet.of(LinkType.URL, LinkType.EMAIL)) - .build(); - - @Override - public Node process(Node node) { - AutolinkVisitor autolinkVisitor = new AutolinkVisitor(); - node.accept(autolinkVisitor); - return node; - } - - private void linkify(Text text) { - String literal = text.getLiteral(); - Iterable links = linkExtractor.extractLinks(literal); - - Node lastNode = text; - int last = 0; - for (LinkSpan link : links) { - String linkText = literal.substring(link.getBeginIndex(), link.getEndIndex()); - if (link.getBeginIndex() != last) { - lastNode = insertNode(new Text(literal.substring(last, link.getBeginIndex())), lastNode); - } - Text contentNode = new Text(linkText); - String destination = getDestination(link, linkText); - Link linkNode 
= new Link(destination, null); - linkNode.appendChild(contentNode); - lastNode = insertNode(linkNode, lastNode); - last = link.getEndIndex(); - } - if (last != literal.length()) { - insertNode(new Text(literal.substring(last)), lastNode); - } - text.unlink(); - } - - private static String getDestination(LinkSpan linkSpan, String linkText) { - if (linkSpan.getType() == LinkType.EMAIL) { - return "mailto:" + linkText; - } else { - return linkText; - } - } - - private static Node insertNode(Node node, Node insertAfterNode) { - insertAfterNode.insertAfter(node); - return node; - } - - private class AutolinkVisitor extends AbstractVisitor { - int inLink = 0; - - @Override - public void visit(Link link) { - inLink++; - super.visit(link); - inLink--; - } - - @Override - public void visit(Text text) { - if (inLink == 0) { - linkify(text); - } - } - } -} diff --git a/commonmark-ext-autolink/src/main/java/org/commonmark/ext/autolink/AutolinkType.java b/commonmark-ext-autolink/src/main/java/org/commonmark/ext/autolink/AutolinkType.java new file mode 100644 index 000000000..2c8c6574f --- /dev/null +++ b/commonmark-ext-autolink/src/main/java/org/commonmark/ext/autolink/AutolinkType.java @@ -0,0 +1,19 @@ +package org.commonmark.ext.autolink; + +/** + * The types of strings that can be automatically turned into links. 
+ */ +public enum AutolinkType { + /** + * URL such as {@code http://example.com} + */ + URL, + /** + * Email address such as {@code foo@example.com} + */ + EMAIL, + /** + * URL such as {@code www.example.com} + */ + WWW +} diff --git a/commonmark-ext-autolink/src/main/java/org/commonmark/ext/autolink/internal/AutolinkPostProcessor.java b/commonmark-ext-autolink/src/main/java/org/commonmark/ext/autolink/internal/AutolinkPostProcessor.java new file mode 100644 index 000000000..a381c2f19 --- /dev/null +++ b/commonmark-ext-autolink/src/main/java/org/commonmark/ext/autolink/internal/AutolinkPostProcessor.java @@ -0,0 +1,132 @@ +package org.commonmark.ext.autolink.internal; + +import org.commonmark.ext.autolink.AutolinkType; +import org.commonmark.node.*; +import org.commonmark.parser.PostProcessor; +import org.nibor.autolink.LinkExtractor; +import org.nibor.autolink.LinkSpan; +import org.nibor.autolink.LinkType; +import org.nibor.autolink.Span; + +import java.util.*; + +public class AutolinkPostProcessor implements PostProcessor { + + private final LinkExtractor linkExtractor; + + public AutolinkPostProcessor(Set linkTypes) { + if (linkTypes == null) { + throw new NullPointerException("linkTypes must not be null"); + } + + if (linkTypes.isEmpty()) { + throw new IllegalArgumentException("linkTypes must not be empty"); + } + + var types = EnumSet.noneOf(LinkType.class); + for (AutolinkType linkType : linkTypes) { + switch (linkType) { + case URL: + types.add(LinkType.URL); + break; + case EMAIL: + types.add(LinkType.EMAIL); + break; + case WWW: + types.add(LinkType.WWW); + break; + } + } + + this.linkExtractor = LinkExtractor.builder() + .linkTypes(types) + .build(); + } + + @Override + public Node process(Node node) { + AutolinkVisitor autolinkVisitor = new AutolinkVisitor(); + node.accept(autolinkVisitor); + return node; + } + + private void linkify(Text originalTextNode) { + String literal = originalTextNode.getLiteral(); + + Node lastNode = originalTextNode; + List 
sourceSpans = originalTextNode.getSourceSpans(); + SourceSpan sourceSpan = sourceSpans.size() == 1 ? sourceSpans.get(0) : null; + + Iterator spans = linkExtractor.extractSpans(literal).iterator(); + while (spans.hasNext()) { + Span span = spans.next(); + + if (lastNode == originalTextNode && !spans.hasNext() && !(span instanceof LinkSpan)) { + // Didn't find any links, don't bother changing existing node. + return; + } + + Text textNode = createTextNode(literal, span, sourceSpan); + if (span instanceof LinkSpan) { + String destination = getDestination((LinkSpan) span, textNode.getLiteral()); + + Link linkNode = new Link(destination, null); + linkNode.appendChild(textNode); + linkNode.setSourceSpans(textNode.getSourceSpans()); + lastNode = insertNode(linkNode, lastNode); + } else { + lastNode = insertNode(textNode, lastNode); + } + } + + // Original node no longer needed + originalTextNode.unlink(); + } + + private static Text createTextNode(String literal, Span span, SourceSpan sourceSpan) { + int beginIndex = span.getBeginIndex(); + int endIndex = span.getEndIndex(); + String text = literal.substring(beginIndex, endIndex); + Text textNode = new Text(text); + if (sourceSpan != null) { + textNode.addSourceSpan(sourceSpan.subSpan(beginIndex, endIndex)); + } + return textNode; + } + + private static String getDestination(LinkSpan linkSpan, String linkText) { + var type = linkSpan.getType(); + + if (type == LinkType.EMAIL) { + return "mailto:" + linkText; + } else if (type == LinkType.WWW) { + // Use http instead of https (see https://github.github.com/gfm/#extended-www-autolink) + return "http://" + linkText; + } else { + return linkText; + } + } + + private static Node insertNode(Node node, Node insertAfterNode) { + insertAfterNode.insertAfter(node); + return node; + } + + private class AutolinkVisitor extends AbstractVisitor { + int inLink = 0; + + @Override + public void visit(Link link) { + inLink++; + super.visit(link); + inLink--; + } + + @Override + public void 
visit(Text text) { + if (inLink == 0) { + linkify(text); + } + } + } +} diff --git a/commonmark-ext-autolink/src/main/javadoc/overview.html b/commonmark-ext-autolink/src/main/javadoc/overview.html new file mode 100644 index 000000000..b268c7bc0 --- /dev/null +++ b/commonmark-ext-autolink/src/main/javadoc/overview.html @@ -0,0 +1,6 @@ + + +Extension for automatically turning plain URLs and email addresses into links +

See {@link org.commonmark.ext.autolink.AutolinkExtension}

+ + diff --git a/commonmark-ext-autolink/src/main/resources/META-INF/LICENSE.txt b/commonmark-ext-autolink/src/main/resources/META-INF/LICENSE.txt new file mode 100644 index 000000000..b09e367ce --- /dev/null +++ b/commonmark-ext-autolink/src/main/resources/META-INF/LICENSE.txt @@ -0,0 +1,23 @@ +Copyright (c) 2015, Atlassian Pty Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/commonmark-ext-autolink/src/test/java/org/commonmark/ext/autolink/AutolinkTest.java b/commonmark-ext-autolink/src/test/java/org/commonmark/ext/autolink/AutolinkTest.java index ff6e8bd90..82c3899fc 100644 --- a/commonmark-ext-autolink/src/test/java/org/commonmark/ext/autolink/AutolinkTest.java +++ b/commonmark-ext-autolink/src/test/java/org/commonmark/ext/autolink/AutolinkTest.java @@ -1,13 +1,30 @@ package org.commonmark.ext.autolink; import org.commonmark.Extension; -import org.commonmark.test.RenderingTestCase; -import org.junit.Test; +import org.commonmark.node.*; +import org.commonmark.parser.IncludeSourceSpans; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.junit.jupiter.api.Test; -import java.util.Collections; +import java.util.List; +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; public class AutolinkTest extends RenderingTestCase { + private static final Set EXTENSIONS = Set.of(AutolinkExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().extensions(EXTENSIONS).build(); + + private static final Set NO_WWW_EXTENSIONS = Set.of(AutolinkExtension.builder() + .linkTypes(AutolinkType.URL, AutolinkType.EMAIL) + .build()); + private static final Parser NO_WWW_PARSER = Parser.builder().extensions(NO_WWW_EXTENSIONS).build(); + private static final HtmlRenderer NO_WWW_RENDERER = HtmlRenderer.builder().extensions(NO_WWW_EXTENSIONS).build(); + @Test public void oneTextNode() { assertRendering("foo http://one.org/ bar http://two.org/", @@ -46,9 +63,66 @@ public void dontLinkTextWithinLinks() { "

http://example.com

\n"); } - @Override - protected Iterable getExtensions() { - return Collections.singleton(AutolinkExtension.create()); + @Test + public void wwwLinks() { + assertRendering("www.example.com", + "

www.example.com

\n"); + } + + @Test + public void noWwwLinks() { + String html = NO_WWW_RENDERER.render(NO_WWW_PARSER.parse("www.example.com")); + assertThat(html).isEqualTo("

www.example.com

\n"); } + @Test + public void sourceSpans() { + Parser parser = Parser.builder() + .extensions(EXTENSIONS) + .includeSourceSpans(IncludeSourceSpans.BLOCKS_AND_INLINES) + .build(); + Node document = parser.parse("abc\n" + + "http://example.com/one\n" + + "def http://example.com/two\n" + + "ghi http://example.com/three jkl"); + + Paragraph paragraph = (Paragraph) document.getFirstChild(); + Text abc = (Text) paragraph.getFirstChild(); + assertThat(abc.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 3))); + + assertThat(abc.getNext()).isInstanceOf(SoftLineBreak.class); + + Link one = (Link) abc.getNext().getNext(); + assertThat(one.getDestination()).isEqualTo("http://example.com/one"); + assertThat(one.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(1, 0, 4, 22))); + + assertThat(one.getNext()).isInstanceOf(SoftLineBreak.class); + + Text def = (Text) one.getNext().getNext(); + assertThat(def.getLiteral()).isEqualTo("def "); + assertThat(def.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(2, 0, 27, 4))); + + Link two = (Link) def.getNext(); + assertThat(two.getDestination()).isEqualTo("http://example.com/two"); + assertThat(two.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(2, 4, 31, 22))); + + assertThat(two.getNext()).isInstanceOf(SoftLineBreak.class); + + Text ghi = (Text) two.getNext().getNext(); + assertThat(ghi.getLiteral()).isEqualTo("ghi "); + assertThat(ghi.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(3, 0, 54, 4))); + + Link three = (Link) ghi.getNext(); + assertThat(three.getDestination()).isEqualTo("http://example.com/three"); + assertThat(three.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(3, 4, 58, 24))); + + Text jkl = (Text) three.getNext(); + assertThat(jkl.getLiteral()).isEqualTo(" jkl"); + assertThat(jkl.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(3, 28, 82, 4))); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } } diff --git 
a/commonmark-ext-footnotes/pom.xml b/commonmark-ext-footnotes/pom.xml new file mode 100644 index 000000000..0d9e2f30c --- /dev/null +++ b/commonmark-ext-footnotes/pom.xml @@ -0,0 +1,27 @@ + + + 4.0.0 + + org.commonmark + commonmark-parent + 0.28.1-SNAPSHOT + + + commonmark-ext-footnotes + commonmark-java extension for footnotes + commonmark-java extension for footnotes using [^1] syntax + + + + org.commonmark + commonmark + + + + org.commonmark + commonmark-test-util + test + + + + diff --git a/commonmark-ext-footnotes/src/main/java/module-info.java b/commonmark-ext-footnotes/src/main/java/module-info.java new file mode 100644 index 000000000..0667b2801 --- /dev/null +++ b/commonmark-ext-footnotes/src/main/java/module-info.java @@ -0,0 +1,5 @@ +module org.commonmark.ext.footnotes { + exports org.commonmark.ext.footnotes; + + requires transitive org.commonmark; +} diff --git a/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/FootnoteDefinition.java b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/FootnoteDefinition.java new file mode 100644 index 000000000..4a560dc9e --- /dev/null +++ b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/FootnoteDefinition.java @@ -0,0 +1,27 @@ +package org.commonmark.ext.footnotes; + +import org.commonmark.node.CustomBlock; + +/** + * A footnote definition, e.g.: + *

+ * [^foo]: This is the footnote text
+ * 
+ * The {@link #getLabel() label} is the text in brackets after {@code ^}, so {@code foo} in the example. The contents + * of the footnote are child nodes of the definition, a {@link org.commonmark.node.Paragraph} in the example. + *

+ * Footnote definitions are parsed even if there's no corresponding {@link FootnoteReference}. + */ +public class FootnoteDefinition extends CustomBlock { + + private String label; + + public FootnoteDefinition(String label) { + this.label = label; + } + + public String getLabel() { + return label; + } +} + diff --git a/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/FootnoteReference.java b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/FootnoteReference.java new file mode 100644 index 000000000..61dcf8626 --- /dev/null +++ b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/FootnoteReference.java @@ -0,0 +1,21 @@ +package org.commonmark.ext.footnotes; + +import org.commonmark.node.CustomNode; + +/** + * A footnote reference, e.g. [^foo] in Some text with a footnote[^foo] + *

+ * The {@link #getLabel() label} is the text within brackets after {@code ^}, so {@code foo} in the example. It needs to + * match the label of a corresponding {@link FootnoteDefinition} for the footnote to be parsed. + */ +public class FootnoteReference extends CustomNode { + private String label; + + public FootnoteReference(String label) { + this.label = label; + } + + public String getLabel() { + return label; + } +} diff --git a/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/FootnotesExtension.java b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/FootnotesExtension.java new file mode 100644 index 000000000..dd532fa34 --- /dev/null +++ b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/FootnotesExtension.java @@ -0,0 +1,105 @@ +package org.commonmark.ext.footnotes; + +import org.commonmark.Extension; +import org.commonmark.ext.footnotes.internal.*; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.renderer.markdown.MarkdownNodeRendererContext; +import org.commonmark.renderer.markdown.MarkdownNodeRendererFactory; +import org.commonmark.renderer.markdown.MarkdownRenderer; + +import java.util.Set; + +/** + * Extension for footnotes with syntax like GitHub Flavored Markdown: + *


+ * Some text with a footnote[^1].
+ *
+ * [^1]: The text of the footnote.
+ * 
+ * The [^1] is a {@link FootnoteReference}, with "1" being the label. + *

+ * The line with [^1]: ... is a {@link FootnoteDefinition}, with the contents as child nodes (can be a + * paragraph like in the example, or other blocks like lists). + *

+ * All the footnotes (definitions) will be rendered in a list at the end of a document, no matter where they appear in + * the source. The footnotes will be numbered starting from 1, then 2, etc, depending on the order in which they appear + * in the text (and not dependent on the label). The footnote reference is a link to the footnote, and from the footnote + * there is a link back to the reference (or multiple). + *

+ * There is also optional support for inline footnotes, use {@link #builder()} and then set {@link Builder#inlineFootnotes}. + * + * @see GitHub docs for footnotes + */ +public class FootnotesExtension implements Parser.ParserExtension, + HtmlRenderer.HtmlRendererExtension, + MarkdownRenderer.MarkdownRendererExtension { + + private final boolean inlineFootnotes; + + private FootnotesExtension(boolean inlineFootnotes) { + this.inlineFootnotes = inlineFootnotes; + } + + /** + * The extension with the default configuration (no support for inline footnotes). + */ + public static Extension create() { + return builder().build(); + } + + public static Builder builder() { + return new Builder(); + } + + @Override + public void extend(Parser.Builder parserBuilder) { + parserBuilder + .customBlockParserFactory(new FootnoteBlockParser.Factory()) + .linkProcessor(new FootnoteLinkProcessor()); + if (inlineFootnotes) { + parserBuilder.linkMarker('^'); + } + } + + @Override + public void extend(HtmlRenderer.Builder rendererBuilder) { + rendererBuilder.nodeRendererFactory(FootnoteHtmlNodeRenderer::new); + } + + @Override + public void extend(MarkdownRenderer.Builder rendererBuilder) { + rendererBuilder.nodeRendererFactory(new MarkdownNodeRendererFactory() { + @Override + public NodeRenderer create(MarkdownNodeRendererContext context) { + return new FootnoteMarkdownNodeRenderer(context); + } + + @Override + public Set getSpecialCharacters() { + return Set.of(); + } + }); + } + + public static class Builder { + + private boolean inlineFootnotes = false; + + /** + * Enable support for inline footnotes without definitions, e.g.: + *

+         * Some text^[this is an inline footnote]
+         * 
+ */ + public Builder inlineFootnotes(boolean inlineFootnotes) { + this.inlineFootnotes = inlineFootnotes; + return this; + } + + public FootnotesExtension build() { + return new FootnotesExtension(inlineFootnotes); + } + } +} diff --git a/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/InlineFootnote.java b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/InlineFootnote.java new file mode 100644 index 000000000..665d01936 --- /dev/null +++ b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/InlineFootnote.java @@ -0,0 +1,6 @@ +package org.commonmark.ext.footnotes; + +import org.commonmark.node.CustomNode; + +public class InlineFootnote extends CustomNode { +} diff --git a/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/internal/FootnoteBlockParser.java b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/internal/FootnoteBlockParser.java new file mode 100644 index 000000000..110bdef20 --- /dev/null +++ b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/internal/FootnoteBlockParser.java @@ -0,0 +1,105 @@ +package org.commonmark.ext.footnotes.internal; + +import org.commonmark.ext.footnotes.FootnoteDefinition; +import org.commonmark.node.Block; +import org.commonmark.node.DefinitionMap; +import org.commonmark.parser.block.*; +import org.commonmark.text.Characters; + +import java.util.List; + +/** + * Parser for a single {@link FootnoteDefinition} block. 
+ */ +public class FootnoteBlockParser extends AbstractBlockParser { + + private final FootnoteDefinition block; + + public FootnoteBlockParser(String label) { + block = new FootnoteDefinition(label); + } + + @Override + public Block getBlock() { + return block; + } + + @Override + public boolean isContainer() { + return true; + } + + @Override + public boolean canContain(Block childBlock) { + return true; + } + + @Override + public BlockContinue tryContinue(ParserState parserState) { + if (parserState.getIndent() >= 4) { + // It looks like content needs to be indented by 4 so that it's part of a footnote (instead of starting a new block). + return BlockContinue.atColumn(4); + } else if (parserState.isBlank()) { + // A blank line doesn't finish a footnote yet. If there's another line with indent >= 4 after it, + // that should result in another paragraph in this footnote definition. + return BlockContinue.atIndex(parserState.getIndex()); + } else { + // We're not continuing to give other block parsers a chance to interrupt this definition. + // But if no other block parser applied (including another FootnoteBlockParser), we will + // accept the line via lazy continuation (same as a block quote). 
+ return BlockContinue.none(); + } + } + + @Override + public List> getDefinitions() { + var map = new DefinitionMap<>(FootnoteDefinition.class); + map.putIfAbsent(block.getLabel(), block); + return List.of(map); + } + + public static class Factory implements BlockParserFactory { + + @Override + public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { + if (state.getIndent() >= 4) { + return BlockStart.none(); + } + var index = state.getNextNonSpaceIndex(); + var content = state.getLine().getContent(); + if (content.charAt(index) != '[' || index + 1 >= content.length()) { + return BlockStart.none(); + } + index++; + if (content.charAt(index) != '^' || index + 1 >= content.length()) { + return BlockStart.none(); + } + // Now at first label character (if any) + index++; + var labelStart = index; + + for (index = labelStart; index < content.length(); index++) { + var c = content.charAt(index); + switch (c) { + case ']': + if (index > labelStart && index + 1 < content.length() && content.charAt(index + 1) == ':') { + var label = content.subSequence(labelStart, index).toString(); + // After the colon, any number of spaces is skipped (not part of the content) + var afterSpaces = Characters.skipSpaceTab(content, index + 2, content.length()); + return BlockStart.of(new FootnoteBlockParser(label)).atIndex(afterSpaces); + } else { + return BlockStart.none(); + } + case ' ': + case '\r': + case '\n': + case '\0': + case '\t': + return BlockStart.none(); + } + } + + return BlockStart.none(); + } + } +} diff --git a/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/internal/FootnoteHtmlNodeRenderer.java b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/internal/FootnoteHtmlNodeRenderer.java new file mode 100644 index 000000000..70eb048a3 --- /dev/null +++ b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/internal/FootnoteHtmlNodeRenderer.java @@ -0,0 +1,391 @@ +package 
org.commonmark.ext.footnotes.internal; + +import org.commonmark.ext.footnotes.FootnoteDefinition; +import org.commonmark.ext.footnotes.FootnoteReference; +import org.commonmark.ext.footnotes.InlineFootnote; +import org.commonmark.node.*; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.html.HtmlNodeRendererContext; +import org.commonmark.renderer.html.HtmlWriter; + +import java.util.*; +import java.util.function.Consumer; + +/** + * HTML rendering for footnotes. + *

+ * Aims to match the rendering of cmark-gfm (which is slightly different from GitHub's when it comes to class + * attributes, not sure why). + *

+ * Some notes on how rendering works: + *

+ * + *

Nested footnotes

+ * Text in footnote definitions can reference other footnotes, even ones that aren't referenced in the main text. + * This makes them tricky because it's not enough to just go through the main text for references. + * And before we can render a definition, we need to know all references (because we add links back to references). + *

+ * In other words, footnotes form a directed graph. Footnotes can reference each other so cycles are possible too. + *

+ * One way to implement it, which is what cmark-gfm does, is to go through the whole document (including definitions) + * and find all references in order. That guarantees that all definitions are found, but it has strange results for + * ordering or when the reference is in an unreferenced definition, see tests. In graph terms, it renders all + * definitions that have an incoming edge, no matter whether they are connected to the main text or not. + *

+ * The way we implement it: + *

    + *
  1. Start with the references in the main text; we can render them as we go
  2. + *
  3. After the main text is rendered, we have the referenced definitions, but there might be more from definition text
  4. + *
  5. To find the remaining definitions, we visit the definitions from before to look at references
  6. + *
  7. Repeat (breadth-first search) until we've found all definitions (note that we can't render before that's done because of backrefs)
  8. + *
  9. Now render the definitions (and any references inside)
  10. + *
+ * This means we only render definitions whose references are actually rendered, and in a meaningful order (all main + * text footnotes first, then any nested ones). + */ +public class FootnoteHtmlNodeRenderer implements NodeRenderer { + + private final HtmlWriter html; + private final HtmlNodeRendererContext context; + + /** + * All definitions (even potentially unused ones), for looking up references + */ + private DefinitionMap definitionMap; + + /** + * Definitions that were referenced, in order in which they should be rendered. + */ + private final Map referencedDefinitions = new LinkedHashMap<>(); + + /** + * Information about references that should be rendered as footnotes. This doesn't contain all references, just the + * ones from inside definitions. + */ + private final Map references = new HashMap<>(); + + public FootnoteHtmlNodeRenderer(HtmlNodeRendererContext context) { + this.html = context.getWriter(); + this.context = context; + } + + @Override + public Set> getNodeTypes() { + return Set.of(FootnoteReference.class, InlineFootnote.class, FootnoteDefinition.class); + } + + @Override + public void beforeRoot(Node rootNode) { + // Collect all definitions first, so we can look them up when encountering a reference later. + var visitor = new DefinitionVisitor(); + rootNode.accept(visitor); + definitionMap = visitor.definitions; + } + + @Override + public void render(Node node) { + if (node instanceof FootnoteReference) { + // This is called for all references, even ones inside definitions that we render at the end. + // Inside definitions, we have registered the reference already. + var ref = (FootnoteReference) node; + // Use containsKey because if value is null, we don't need to try registering again. + var info = references.containsKey(ref) ? 
references.get(ref) : tryRegisterReference(ref); + if (info != null) { + renderReference(ref, info); + } else { + // A reference without a corresponding definition is rendered as plain text + html.text("[^" + ref.getLabel() + "]"); + } + } else if (node instanceof InlineFootnote) { + var info = references.get(node); + if (info == null) { + info = registerReference(node, null); + } + renderReference(node, info); + } + } + + @Override + public void afterRoot(Node rootNode) { + // Now render the referenced definitions if there are any. + if (referencedDefinitions.isEmpty()) { + return; + } + + var firstDef = referencedDefinitions.keySet().iterator().next(); + var attrs = new LinkedHashMap(); + attrs.put("class", "footnotes"); + attrs.put("data-footnotes", null); + html.tag("section", context.extendAttributes(firstDef, "section", attrs)); + html.line(); + html.tag("ol"); + html.line(); + + // Check whether there are any footnotes inside the definitions that we're about to render. For those, we might + // need to render more definitions. So do a breadth-first search to find all relevant definitions. 
+ var check = new LinkedList<>(referencedDefinitions.keySet()); + while (!check.isEmpty()) { + var def = check.removeFirst(); + def.accept(new ShallowReferenceVisitor(def, node -> { + if (node instanceof FootnoteReference) { + var ref = (FootnoteReference) node; + var d = definitionMap.get(ref.getLabel()); + if (d != null) { + if (!referencedDefinitions.containsKey(d)) { + check.addLast(d); + } + references.put(ref, registerReference(d, d.getLabel())); + } + } else if (node instanceof InlineFootnote) { + check.addLast(node); + references.put(node, registerReference(node, null)); + } + })); + } + + for (var entry : referencedDefinitions.entrySet()) { + // This will also render any footnote references inside definitions + renderDefinition(entry.getKey(), entry.getValue()); + } + + html.tag("/ol"); + html.line(); + html.tag("/section"); + html.line(); + } + + private ReferenceInfo tryRegisterReference(FootnoteReference ref) { + var def = definitionMap.get(ref.getLabel()); + if (def == null) { + return null; + } + return registerReference(def, def.getLabel()); + } + + private ReferenceInfo registerReference(Node node, String label) { + // The first referenced definition gets number 1, second one 2, etc. + var referencedDef = referencedDefinitions.computeIfAbsent(node, k -> { + var num = referencedDefinitions.size() + 1; + var key = definitionKey(label, num); + return new ReferencedDefinition(num, key); + }); + var definitionNumber = referencedDef.definitionNumber; + // The reference number for that particular definition. E.g. if there's two references for the same definition, + // the first one is 1, the second one 2, etc. This is needed to give each reference a unique ID so that each + // reference can get its own backlink from the definition. 
+ var refNumber = referencedDef.references.size() + 1; + var definitionKey = referencedDef.definitionKey; + var id = referenceId(definitionKey, refNumber); + referencedDef.references.add(id); + + return new ReferenceInfo(id, definitionId(definitionKey), definitionNumber); + } + + private void renderReference(Node node, ReferenceInfo referenceInfo) { + html.tag("sup", context.extendAttributes(node, "sup", Map.of("class", "footnote-ref"))); + + var href = "#" + referenceInfo.definitionId; + var attrs = new LinkedHashMap(); + attrs.put("href", href); + attrs.put("id", referenceInfo.id); + attrs.put("data-footnote-ref", null); + html.tag("a", context.extendAttributes(node, "a", attrs)); + html.raw(String.valueOf(referenceInfo.definitionNumber)); + html.tag("/a"); + html.tag("/sup"); + } + + private void renderDefinition(Node def, ReferencedDefinition referencedDefinition) { + var attrs = new LinkedHashMap(); + attrs.put("id", definitionId(referencedDefinition.definitionKey)); + html.tag("li", context.extendAttributes(def, "li", attrs)); + html.line(); + + if (def.getLastChild() instanceof Paragraph) { + // Add backlinks into last paragraph before

. This is what GFM does. + var lastParagraph = (Paragraph) def.getLastChild(); + var node = def.getFirstChild(); + while (node != lastParagraph) { + if (node instanceof Paragraph) { + // Because we're manually rendering the

for the last paragraph, do the same for all other + // paragraphs for consistency (Paragraph rendering might be overwritten by a custom renderer). + html.tag("p", context.extendAttributes(node, "p", Map.of())); + renderChildren(node); + html.tag("/p"); + html.line(); + } else { + context.render(node); + } + node = node.getNext(); + } + + html.tag("p", context.extendAttributes(lastParagraph, "p", Map.of())); + renderChildren(lastParagraph); + html.raw(" "); + renderBackrefs(def, referencedDefinition); + html.tag("/p"); + html.line(); + } else if (def instanceof InlineFootnote) { + html.tag("p", context.extendAttributes(def, "p", Map.of())); + renderChildren(def); + html.raw(" "); + renderBackrefs(def, referencedDefinition); + html.tag("/p"); + html.line(); + } else { + renderChildren(def); + html.line(); + renderBackrefs(def, referencedDefinition); + } + + html.tag("/li"); + html.line(); + } + + private void renderBackrefs(Node def, ReferencedDefinition referencedDefinition) { + var refs = referencedDefinition.references; + for (int i = 0; i < refs.size(); i++) { + var ref = refs.get(i); + var refNumber = i + 1; + var idx = referencedDefinition.definitionNumber + (refNumber > 1 ? ("-" + refNumber) : ""); + + var attrs = new LinkedHashMap(); + attrs.put("href", "#" + ref); + attrs.put("class", "footnote-backref"); + attrs.put("data-footnote-backref", null); + attrs.put("data-footnote-backref-idx", idx); + attrs.put("aria-label", "Back to reference " + idx); + html.tag("a", context.extendAttributes(def, "a", attrs)); + if (refNumber > 1) { + html.tag("sup", context.extendAttributes(def, "sup", Map.of("class", "footnote-ref"))); + html.raw(String.valueOf(refNumber)); + html.tag("/sup"); + } + // U+21A9 LEFTWARDS ARROW WITH HOOK + html.raw("\u21A9"); + html.tag("/a"); + if (i + 1 < refs.size()) { + html.raw(" "); + } + } + } + + private String referenceId(String definitionKey, int number) { + return "fnref" + definitionKey + (number == 1 ? 
"" : ("-" + number)); + } + + private String definitionKey(String label, int number) { + // Named definitions use the pattern "fn-{name}" and inline definitions use "fn{number}" so as not to conflict. + // "fn{number}" is also what pandoc uses (for all types), starting with number 1. + if (label != null) { + return "-" + label; + } else { + return "" + number; + } + } + + private String definitionId(String definitionKey) { + return "fn" + definitionKey; + } + + private void renderChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + context.render(node); + node = next; + } + } + + private static class DefinitionVisitor extends AbstractVisitor { + + private final DefinitionMap definitions = new DefinitionMap<>(FootnoteDefinition.class); + + @Override + public void visit(CustomBlock customBlock) { + if (customBlock instanceof FootnoteDefinition) { + var def = (FootnoteDefinition) customBlock; + definitions.putIfAbsent(def.getLabel(), def); + } else { + super.visit(customBlock); + } + } + } + + /** + * Visit footnote references/inline footnotes inside the parent (but not the parent itself). We want a shallow visit + * because the caller wants to control when to descend. + */ + private static class ShallowReferenceVisitor extends AbstractVisitor { + private final Node parent; + private final Consumer consumer; + + private ShallowReferenceVisitor(Node parent, Consumer consumer) { + this.parent = parent; + this.consumer = consumer; + } + + @Override + public void visit(CustomNode customNode) { + if (customNode instanceof FootnoteReference) { + consumer.accept(customNode); + } else if (customNode instanceof InlineFootnote) { + if (customNode == parent) { + // Descend into the parent (inline footnotes can contain inline footnotes) + super.visit(customNode); + } else { + // Don't descend here because we want to be shallow. 
+ consumer.accept(customNode); + } + } else { + super.visit(customNode); + } + } + } + + private static class ReferencedDefinition { + /** + * The definition number, starting from 1, and in order in which they're referenced. + */ + final int definitionNumber; + /** + * The unique key of the definition. Together with a static prefix it forms the ID used in the HTML. + */ + final String definitionKey; + /** + * The IDs of references for this definition, for backrefs. + */ + final List references = new ArrayList<>(); + + ReferencedDefinition(int definitionNumber, String definitionKey) { + this.definitionNumber = definitionNumber; + this.definitionKey = definitionKey; + } + } + + private static class ReferenceInfo { + /** + * The ID of the reference; in the corresponding definition, a link back to this reference will be rendered. + */ + private final String id; + /** + * The ID of the definition, for linking to the definition. + */ + private final String definitionId; + /** + * The definition number, rendered in superscript. 
+ */ + private final int definitionNumber; + + private ReferenceInfo(String id, String definitionId, int definitionNumber) { + this.id = id; + this.definitionId = definitionId; + this.definitionNumber = definitionNumber; + } + } +} diff --git a/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/internal/FootnoteLinkProcessor.java b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/internal/FootnoteLinkProcessor.java new file mode 100644 index 000000000..07b008576 --- /dev/null +++ b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/internal/FootnoteLinkProcessor.java @@ -0,0 +1,57 @@ +package org.commonmark.ext.footnotes.internal; + +import org.commonmark.ext.footnotes.FootnoteDefinition; +import org.commonmark.ext.footnotes.FootnoteReference; +import org.commonmark.ext.footnotes.InlineFootnote; +import org.commonmark.node.LinkReferenceDefinition; +import org.commonmark.parser.InlineParserContext; +import org.commonmark.parser.beta.LinkInfo; +import org.commonmark.parser.beta.LinkProcessor; +import org.commonmark.parser.beta.LinkResult; +import org.commonmark.parser.beta.Scanner; + +/** + * For turning e.g. [^foo] into a {@link FootnoteReference}, + * and ^[foo] into an {@link InlineFootnote}. + */ +public class FootnoteLinkProcessor implements LinkProcessor { + @Override + public LinkResult process(LinkInfo linkInfo, Scanner scanner, InlineParserContext context) { + + if (linkInfo.marker() != null && linkInfo.marker().getLiteral().equals("^")) { + // An inline footnote like ^[footnote text]. Note that we only get the marker here if the option is enabled + // on the extension. 
+ return LinkResult.wrapTextIn(new InlineFootnote(), linkInfo.afterTextBracket()).includeMarker(); + } + + if (linkInfo.destination() != null) { + // If it's an inline link, it can't be a footnote reference + return LinkResult.none(); + } + + var text = linkInfo.text(); + if (!text.startsWith("^")) { + // Footnote reference needs to start with [^ + return LinkResult.none(); + } + + if (linkInfo.label() != null && context.getDefinition(LinkReferenceDefinition.class, linkInfo.label()) != null) { + // If there's a label after the text and the label has a definition -> it's a link, and it should take + // preference, e.g. in `[^foo][bar]` if `[bar]` has a definition, `[^foo]` won't be a footnote reference. + return LinkResult.none(); + } + + var label = text.substring(1); + // Check if we have a definition, otherwise ignore (same behavior as for link reference definitions). + // Note that the definition parser already checked the syntax of the label, we don't need to check again. + var def = context.getDefinition(FootnoteDefinition.class, label); + if (def == null) { + return LinkResult.none(); + } + + // For footnotes, we only ever consume the text part of the link, not the label part (if any) + var position = linkInfo.afterTextBracket(); + // If the marker is `![`, we don't want to include the `!`, so start from bracket + return LinkResult.replaceWith(new FootnoteReference(label), position); + } +} diff --git a/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/internal/FootnoteMarkdownNodeRenderer.java b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/internal/FootnoteMarkdownNodeRenderer.java new file mode 100644 index 000000000..3dcf4fc83 --- /dev/null +++ b/commonmark-ext-footnotes/src/main/java/org/commonmark/ext/footnotes/internal/FootnoteMarkdownNodeRenderer.java @@ -0,0 +1,70 @@ +package org.commonmark.ext.footnotes.internal; + +import org.commonmark.ext.footnotes.FootnoteDefinition; +import 
org.commonmark.ext.footnotes.FootnoteReference; +import org.commonmark.ext.footnotes.InlineFootnote; +import org.commonmark.node.*; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.markdown.MarkdownNodeRendererContext; +import org.commonmark.renderer.markdown.MarkdownWriter; + +import java.util.Set; + +public class FootnoteMarkdownNodeRenderer implements NodeRenderer { + + private final MarkdownWriter writer; + private final MarkdownNodeRendererContext context; + + public FootnoteMarkdownNodeRenderer(MarkdownNodeRendererContext context) { + this.writer = context.getWriter(); + this.context = context; + } + + @Override + public Set> getNodeTypes() { + return Set.of(FootnoteReference.class, InlineFootnote.class, FootnoteDefinition.class); + } + + @Override + public void render(Node node) { + if (node instanceof FootnoteReference) { + renderReference((FootnoteReference) node); + } else if (node instanceof InlineFootnote) { + renderInline((InlineFootnote) node); + } else if (node instanceof FootnoteDefinition) { + renderDefinition((FootnoteDefinition) node); + } + } + + private void renderReference(FootnoteReference ref) { + writer.raw("[^"); + // The label is parsed as-is without escaping, so we can render it back as-is + writer.raw(ref.getLabel()); + writer.raw("]"); + } + + private void renderInline(InlineFootnote inlineFootnote) { + writer.raw("^["); + renderChildren(inlineFootnote); + writer.raw("]"); + } + + private void renderDefinition(FootnoteDefinition def) { + writer.raw("[^"); + writer.raw(def.getLabel()); + writer.raw("]: "); + + writer.pushPrefix(" "); + renderChildren(def); + writer.popPrefix(); + } + + private void renderChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + context.render(node); + node = next; + } + } +} diff --git a/commonmark-ext-footnotes/src/main/javadoc/overview.html b/commonmark-ext-footnotes/src/main/javadoc/overview.html new file mode 
100644 index 000000000..4f19d2115 --- /dev/null +++ b/commonmark-ext-footnotes/src/main/javadoc/overview.html @@ -0,0 +1,6 @@ + + +Extension for footnotes using [^1] syntax +

See {@link org.commonmark.ext.footnotes.FootnotesExtension}

+ + diff --git a/commonmark-ext-footnotes/src/main/resources/META-INF/LICENSE.txt b/commonmark-ext-footnotes/src/main/resources/META-INF/LICENSE.txt new file mode 100644 index 000000000..b09e367ce --- /dev/null +++ b/commonmark-ext-footnotes/src/main/resources/META-INF/LICENSE.txt @@ -0,0 +1,23 @@ +Copyright (c) 2015, Atlassian Pty Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/commonmark-ext-footnotes/src/test/java/org/commonmark/ext/footnotes/FootnoteHtmlRendererTest.java b/commonmark-ext-footnotes/src/test/java/org/commonmark/ext/footnotes/FootnoteHtmlRendererTest.java new file mode 100644 index 000000000..bc7d4f74c --- /dev/null +++ b/commonmark-ext-footnotes/src/test/java/org/commonmark/ext/footnotes/FootnoteHtmlRendererTest.java @@ -0,0 +1,339 @@ +package org.commonmark.ext.footnotes; + +import org.commonmark.Extension; +import org.commonmark.node.Document; +import org.commonmark.node.Paragraph; +import org.commonmark.node.Text; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.Asserts; +import org.commonmark.testutil.RenderingTestCase; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Set; + +public class FootnoteHtmlRendererTest extends RenderingTestCase { + private static final Set EXTENSIONS = Set.of(FootnotesExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().extensions(EXTENSIONS).build(); + + @Test + public void testOne() { + assertRendering("Test [^foo]\n\n[^foo]: note\n", + "

Test 1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    note

    \n" + + "
  2. \n" + + "
\n" + + "
\n"); + } + + @Test + public void testLabelNormalization() { + // Labels match via their normalized form. For the href and IDs to match, rendering needs to use the + // label from the definition consistently. + assertRendering("Test [^bar]\n\n[^BAR]: note\n", + "

Test 1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    note

    \n" + + "
  2. \n" + + "
\n" + + "
\n"); + } + + @Test + public void testMultipleReferences() { + // Tests a few things: + // - Numbering is based on the reference order, not the definition order + // - The same number is used when a definition is referenced multiple times + // - Multiple backrefs are rendered + assertRendering("First [^foo]\n\nThen [^bar]\n\nThen [^foo] again\n\n[^bar]: b\n[^foo]: f\n", + "

First 1

\n" + + "

Then 2

\n" + + "

Then 1 again

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    f 2

    \n" + + "
  2. \n" + + "
  3. \n" + + "

    b

    \n" + + "
  4. \n" + + "
\n" + + "
\n"); + } + + @Test + public void testDefinitionWithTwoParagraphs() { + // With two paragraphs, the backref should be added to the second one + assertRendering("Test [^foo]\n\n[^foo]: one\n \n two\n", + "

Test 1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    one

    \n" + + "

    two

    \n" + + "
  2. \n" + + "
\n" + + "
\n"); + } + + @Test + public void testDefinitionWithList() { + assertRendering("Test [^foo]\n\n[^foo]:\n - one\n - two\n", + "

Test 1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "
      \n" + + "
    • one
    • \n" + + "
    • two
    • \n" + + "
    \n" + + "
  2. \n" + + "
\n" + + "
\n"); + } + + // See docs on FootnoteHtmlNodeRenderer about nested footnotes. + + @Test + public void testNestedFootnotesSimple() { + assertRendering("[^foo1]\n" + + "\n" + + "[^foo1]: one [^foo2]\n" + + "[^foo2]: two\n", "

1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    one 2

    \n" + + "
  2. \n" + + "
  3. \n" + + "

    two

    \n" + + "
  4. \n" + + "
\n" + + "
\n"); + } + + @Test + public void testNestedFootnotesOrder() { + // GitHub has a strange result here, the definitions are in order: 1. bar, 2. foo. + // The reason is that the number is done based on all references in document order, including references in + // definitions. So [^bar] from the first line is first. + assertRendering("[^foo]: foo [^bar]\n" + + "\n" + + "[^foo]\n" + + "\n" + + "[^bar]: bar\n", "

1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    foo 2

    \n" + + "
  2. \n" + + "
  3. \n" + + "

    bar

    \n" + + "
  4. \n" + + "
\n" + + "
\n"); + } + + @Test + public void testNestedFootnotesOrder2() { + assertRendering("[^1]\n" + + "\n" + + "[^4]: four\n" + + "[^3]: three [^4]\n" + + "[^2]: two [^4]\n" + + "[^1]: one [^2][^3]\n", "

1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    one 23

    \n" + + "
  2. \n" + + "
  3. \n" + + "

    two 4

    \n" + + "
  4. \n" + + "
  5. \n" + + "

    three 4

    \n" + + "
  6. \n" + + "
  7. \n" + + "

    four 2

    \n" + + "
  8. \n" + + "
\n" + + "
\n"); + } + + @Test + public void testNestedFootnotesCycle() { + // Footnotes can contain cycles, lol. + assertRendering("[^foo1]\n" + + "\n" + + "[^foo1]: one [^foo2]\n" + + "[^foo2]: two [^foo1]\n", "

1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    one 2 2

    \n" + + "
  2. \n" + + "
  3. \n" + + "

    two 1

    \n" + + "
  4. \n" + + "
\n" + + "
\n"); + } + + @Test + public void testNestedFootnotesUnreferenced() { + // This should not result in any footnotes, as foo itself isn't referenced. + // But GitHub renders bar only, with a broken backref, because bar is referenced from foo. + assertRendering("[^foo]: foo[^bar]\n" + + "[^bar]: bar\n", ""); + + // And here only 1 is rendered. + assertRendering("[^1]\n" + + "\n" + + "[^1]: one\n" + + "[^foo]: foo[^bar]\n" + + "[^bar]: bar\n", "

1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    one

    \n" + + "
  2. \n" + + "
\n" + + "
\n"); + } + + @Test + public void testInlineFootnotes() { + assertRenderingInline("Test ^[inline *footnote*]", + "

Test 1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    inline footnote

    \n" + + "
  2. \n" + + "
\n" + + "
\n"); + } + + @Test + public void testInlineFootnotesNested() { + assertRenderingInline("Test ^[inline ^[nested]]", + "

Test 1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    inline 2

    \n" + + "
  2. \n" + + "
  3. \n" + + "

    nested

    \n" + + "
  4. \n" + + "
\n" + + "
\n"); + } + + @Test + public void testInlineFootnoteWithReference() { + // This is a bit tricky because the IDs need to be unique. + assertRenderingInline("Test ^[inline [^1]]\n" + + "\n" + + "[^1]: normal", + "

Test 1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    inline 2

    \n" + + "
  2. \n" + + "
  3. \n" + + "

    normal

    \n" + + "
  4. \n" + + "
\n" + + "
\n"); + } + + @Test + public void testInlineFootnoteInsideDefinition() { + assertRenderingInline("Test [^1]\n" + + "\n" + + "[^1]: Definition ^[inline]\n", + "

Test 1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    Definition 2

    \n" + + "
  2. \n" + + "
  3. \n" + + "

    inline

    \n" + + "
  4. \n" + + "
\n" + + "
\n"); + } + + @Test + public void testInlineFootnoteInsideDefinition2() { + // Tricky because of the nested inline footnote which we want to visit after foo (breadth-first). + assertRenderingInline("Test [^1]\n" + + "\n" + + "[^1]: Definition ^[inline ^[nested]] ^[foo]\n", + "

Test 1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    Definition 2 3

    \n" + + "
  2. \n" + + "
  3. \n" + + "

    inline 4

    \n" + + "
  4. \n" + + "
  5. \n" + + "

    foo

    \n" + + "
  6. \n" + + "
  7. \n" + + "

    nested

    \n" + + "
  8. \n" + + "
\n" + + "
\n"); + } + + + @Test + public void testRenderNodesDirectly() { + // Everything should work as expected when rendering from nodes directly (no parsing step). + var doc = new Document(); + var p = new Paragraph(); + p.appendChild(new Text("Test ")); + p.appendChild(new FootnoteReference("foo")); + var def = new FootnoteDefinition("foo"); + var note = new Paragraph(); + note.appendChild(new Text("note!")); + def.appendChild(note); + doc.appendChild(p); + doc.appendChild(def); + + var expected = "

Test 1

\n" + + "
\n" + + "
    \n" + + "
  1. \n" + + "

    note!

    \n" + + "
  2. \n" + + "
\n" + + "
\n"; + Asserts.assertRendering("", expected, RENDERER.render(doc)); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } + + private static void assertRenderingInline(String source, String expected) { + var extension = FootnotesExtension.builder().inlineFootnotes(true).build(); + var parser = Parser.builder().extensions(List.of(extension)).build(); + var renderer = HtmlRenderer.builder().extensions(List.of(extension)).build(); + Asserts.assertRendering(source, expected, renderer.render(parser.parse(source))); + } +} diff --git a/commonmark-ext-footnotes/src/test/java/org/commonmark/ext/footnotes/FootnoteMarkdownRendererTest.java b/commonmark-ext-footnotes/src/test/java/org/commonmark/ext/footnotes/FootnoteMarkdownRendererTest.java new file mode 100644 index 000000000..2f1125a02 --- /dev/null +++ b/commonmark-ext-footnotes/src/test/java/org/commonmark/ext/footnotes/FootnoteMarkdownRendererTest.java @@ -0,0 +1,65 @@ +package org.commonmark.ext.footnotes; + +import org.commonmark.Extension; +import org.commonmark.node.Node; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.markdown.MarkdownRenderer; +import org.junit.jupiter.api.Test; + +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; + +public class FootnoteMarkdownRendererTest { + private static final Set EXTENSIONS = Set.of(FootnotesExtension.builder().inlineFootnotes(true).build()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final MarkdownRenderer RENDERER = MarkdownRenderer.builder().extensions(EXTENSIONS).build(); + + @Test + public void testSimple() { + assertRoundTrip("Test [^foo]\n\n[^foo]: note\n"); + } + + @Test + public void testUnreferenced() { + // Whether a reference has a corresponding definition or vice versa shouldn't matter for Markdown rendering. 
+ assertRoundTrip("Test [^foo]\n\n[^foo]: one\n\n[^bar]: two\n"); + } + + @Test + public void testFootnoteWithBlock() { + assertRoundTrip("Test [^foo]\n\n[^foo]: - foo\n - bar\n"); + } + + @Test + public void testBackslashInLabel() { + assertRoundTrip("[^\\foo]\n\n[^\\foo]: note\n"); + } + + @Test + public void testMultipleLines() { + assertRoundTrip("Test [^1]\n\n[^1]: footnote l1\n footnote l2\n"); + } + + @Test + public void testMultipleParagraphs() { + // Note that the line between p1 and p2 could be blank too (instead of 4 spaces), but we currently don't + // preserve that information. + assertRoundTrip("Test [^1]\n\n[^1]: footnote p1\n \n footnote p2\n"); + } + + @Test + public void testInline() { + assertRoundTrip("^[test *foo*]\n"); + } + + private void assertRoundTrip(String input) { + String rendered = parseAndRender(input); + assertThat(rendered).isEqualTo(input); + } + + private String parseAndRender(String source) { + Node parsed = PARSER.parse(source); + return RENDERER.render(parsed); + } +} diff --git a/commonmark-ext-footnotes/src/test/java/org/commonmark/ext/footnotes/FootnotesTest.java b/commonmark-ext-footnotes/src/test/java/org/commonmark/ext/footnotes/FootnotesTest.java new file mode 100644 index 000000000..7763cedb4 --- /dev/null +++ b/commonmark-ext-footnotes/src/test/java/org/commonmark/ext/footnotes/FootnotesTest.java @@ -0,0 +1,366 @@ +package org.commonmark.ext.footnotes; + +import org.commonmark.Extension; +import org.commonmark.node.*; +import org.commonmark.parser.IncludeSourceSpans; +import org.commonmark.parser.Parser; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; + +public class FootnotesTest { + + private static final Set EXTENSIONS = Set.of(FootnotesExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + + @Test + public 
void testDefBlockStart() { + for (var s : List.of("1", "a", "^", "*", "\\a", "\uD83D\uDE42", "&0")) { + var doc = PARSER.parse("[^" + s + "]: footnote\n"); + var def = find(doc, FootnoteDefinition.class); + assertThat(def.getLabel()).isEqualTo(s); + } + + for (var s : List.of("", " ", "a b", "]", "\r", "\n", "\t")) { + var input = "[^" + s + "]: footnote\n"; + var doc = PARSER.parse(input); + assertThat(tryFind(doc, FootnoteDefinition.class)).as("input: " + input).isNull(); + } + } + + @Test + public void testDefBlockStartInterrupts() { + // This is different from a link reference definition, which can only be at the start of paragraphs. + var doc = PARSER.parse("test\n[^1]: footnote\n"); + var paragraph = find(doc, Paragraph.class); + var def = find(doc, FootnoteDefinition.class); + assertThat(((Text) paragraph.getLastChild()).getLiteral()).isEqualTo("test"); + assertThat(def.getLabel()).isEqualTo("1"); + } + + @Test + public void testDefBlockStartIndented() { + var doc1 = PARSER.parse(" [^1]: footnote\n"); + assertThat(find(doc1, FootnoteDefinition.class).getLabel()).isEqualTo("1"); + var doc2 = PARSER.parse(" [^1]: footnote\n"); + assertNone(doc2, FootnoteDefinition.class); + } + + @Test + public void testDefMultiple() { + var doc = PARSER.parse("[^1]: foo\n[^2]: bar\n"); + var defs = findAll(doc, FootnoteDefinition.class); + assertThat(defs.get(0).getLabel()).isEqualTo("1"); + assertThat(defs.get(1).getLabel()).isEqualTo("2"); + } + + @Test + public void testDefBlockStartAfterLinkReferenceDefinition() { + var doc = PARSER.parse("[foo]: /url\n[^1]: footnote\n"); + var linkReferenceDef = find(doc, LinkReferenceDefinition.class); + var footnotesDef = find(doc, FootnoteDefinition.class); + assertThat(linkReferenceDef.getLabel()).isEqualTo("foo"); + assertThat(footnotesDef.getLabel()).isEqualTo("1"); + } + + @Test + public void testDefContainsParagraph() { + var doc = PARSER.parse("[^1]: footnote\n"); + var def = find(doc, FootnoteDefinition.class); + var paragraph 
= (Paragraph) def.getFirstChild(); + assertText("footnote", paragraph.getFirstChild()); + } + + @Test + public void testDefBlockStartSpacesAfterColon() { + var doc = PARSER.parse("[^1]: footnote\n"); + var def = find(doc, FootnoteDefinition.class); + var paragraph = (Paragraph) def.getFirstChild(); + assertText("footnote", paragraph.getFirstChild()); + } + + @Test + public void testDefContainsIndentedCodeBlock() { + var doc = PARSER.parse("[^1]:\n code\n"); + var def = find(doc, FootnoteDefinition.class); + var codeBlock = (IndentedCodeBlock) def.getFirstChild(); + assertThat(codeBlock.getLiteral()).isEqualTo("code\n"); + } + + @Test + public void testDefContainsMultipleLines() { + var doc = PARSER.parse("[^1]: footnote\nstill\n"); + var def = find(doc, FootnoteDefinition.class); + assertThat(def.getLabel()).isEqualTo("1"); + var paragraph = (Paragraph) def.getFirstChild(); + assertText("footnote", paragraph.getFirstChild()); + assertText("still", paragraph.getLastChild()); + } + + @Test + public void testDefContainsMultipleParagraphs() { + var doc = PARSER.parse("[^1]: footnote p1\n\n footnote p2\n"); + var def = find(doc, FootnoteDefinition.class); + assertThat(def.getLabel()).isEqualTo("1"); + var p1 = (Paragraph) def.getFirstChild(); + assertText("footnote p1", p1.getFirstChild()); + var p2 = (Paragraph) p1.getNext(); + assertText("footnote p2", p2.getFirstChild()); + } + + @Test + public void testDefFollowedByParagraph() { + var doc = PARSER.parse("[^1]: footnote\n\nnormal paragraph\n"); + var def = find(doc, FootnoteDefinition.class); + assertThat(def.getLabel()).isEqualTo("1"); + assertText("footnote", def.getFirstChild().getFirstChild()); + assertText("normal paragraph", def.getNext().getFirstChild()); + } + + @Test + public void testDefContainsList() { + var doc = PARSER.parse("[^1]: - foo\n - bar\n"); + var def = find(doc, FootnoteDefinition.class); + assertThat(def.getLabel()).isEqualTo("1"); + var list = (BulletList) def.getFirstChild(); + var item1 = 
(ListItem) list.getFirstChild(); + var item2 = (ListItem) list.getLastChild(); + assertText("foo", item1.getFirstChild().getFirstChild()); + assertText("bar", item2.getFirstChild().getFirstChild()); + } + + @Test + public void testDefInterruptedByOthers() { + var doc = PARSER.parse("[^1]: footnote\n# Heading\n"); + var def = find(doc, FootnoteDefinition.class); + var heading = find(doc, Heading.class); + assertThat(def.getLabel()).isEqualTo("1"); + assertText("Heading", heading.getFirstChild()); + } + + @Test + public void testReference() { + var doc = PARSER.parse("Test [^foo]\n\n[^foo]: /url\n"); + var ref = find(doc, FootnoteReference.class); + assertThat(ref.getLabel()).isEqualTo("foo"); + } + + @Test + public void testReferenceNoDefinition() { + var doc = PARSER.parse("Test [^foo]\n"); + assertNone(doc, FootnoteReference.class); + } + + @Test + public void testRefWithEmphasisInside() { + // No emphasis inside footnote reference, should just be treated as text + var doc = PARSER.parse("Test [^*foo*]\n\n[^*foo*]: def\n"); + var ref = find(doc, FootnoteReference.class); + assertThat(ref.getLabel()).isEqualTo("*foo*"); + assertThat(ref.getFirstChild()).isNull(); + var paragraph = doc.getFirstChild(); + var text = (Text) paragraph.getFirstChild(); + assertThat(text.getLiteral()).isEqualTo("Test "); + assertThat(text.getNext()).isEqualTo(ref); + assertThat(paragraph.getLastChild()).isEqualTo(ref); + } + + @Test + public void testRefWithEmphasisAround() { + // Emphasis around footnote reference, the * inside needs to be removed from emphasis processing + var doc = PARSER.parse("Test *abc [^foo*] def*\n\n[^foo*]: def\n"); + var ref = find(doc, FootnoteReference.class); + assertThat(ref.getLabel()).isEqualTo("foo*"); + assertText("abc ", ref.getPrevious()); + assertText(" def", ref.getNext()); + var em = find(doc, Emphasis.class); + assertThat(ref.getParent()).isEqualTo(em); + } + + @Test + public void testRefAfterBang() { + var doc = 
PARSER.parse("Test![^foo]\n\n[^foo]: def\n"); + var ref = find(doc, FootnoteReference.class); + assertThat(ref.getLabel()).isEqualTo("foo"); + var paragraph = doc.getFirstChild(); + assertText("Test!", paragraph.getFirstChild()); + } + + @Test + public void testRefAsLabelOnly() { + // [^bar] is a footnote but [foo] is just text, because full reference links (text `foo`, label `^bar`) don't + // resolve as footnotes. If `[foo][^bar]` fails to parse as a bracket, `[^bar]` by itself needs to be tried. + var doc = PARSER.parse("Test [foo][^bar]\n\n[^bar]: footnote\n"); + var ref = find(doc, FootnoteReference.class); + assertThat(ref.getLabel()).isEqualTo("bar"); + var paragraph = doc.getFirstChild(); + assertText("Test [foo]", paragraph.getFirstChild()); + } + + @Test + public void testRefWithEmptyLabel() { + // [^bar] is a footnote but [] is just text, because collapsed reference links don't resolve as footnotes + var doc = PARSER.parse("Test [^bar][]\n\n[^bar]: footnote\n"); + var ref = find(doc, FootnoteReference.class); + assertThat(ref.getLabel()).isEqualTo("bar"); + var paragraph = doc.getFirstChild(); + assertText("Test ", paragraph.getFirstChild()); + assertText("[]", paragraph.getLastChild()); + } + + @Test + public void testRefWithBracket() { + // Not a footnote, [ needs to be escaped + var doc = PARSER.parse("Test [^f[oo]\n\n[^f[oo]: /url\n"); + assertNone(doc, FootnoteReference.class); + } + + @Test + public void testRefWithBackslash() { + var doc = PARSER.parse("[^\\foo]\n\n[^\\foo]: note\n"); + var ref = find(doc, FootnoteReference.class); + assertThat(ref.getLabel()).isEqualTo("\\foo"); + var def = find(doc, FootnoteDefinition.class); + assertThat(def.getLabel()).isEqualTo("\\foo"); + } + + @Test + public void testPreferInlineLink() { + var doc = PARSER.parse("Test [^bar](/url)\n\n[^bar]: footnote\n"); + assertNone(doc, FootnoteReference.class); + } + + @Test + public void testPreferReferenceLink() { + // This is tricky because `[^*foo*][foo]` is a valid 
link already. If `[foo]` was not defined, the first bracket + // would be a footnote. + var doc = PARSER.parse("Test [^*foo*][foo]\n\n[^*foo*]: /url\n\n[foo]: /url"); + assertNone(doc, FootnoteReference.class); + } + + @Test + public void testReferenceLinkWithoutDefinition() { + // Similar to previous test but there's no definition + var doc = PARSER.parse("Test [^*foo*][foo]\n\n[^*foo*]: def\n"); + var ref = find(doc, FootnoteReference.class); + assertThat(ref.getLabel()).isEqualTo("*foo*"); + var paragraph = (Paragraph) doc.getFirstChild(); + assertText("Test ", paragraph.getFirstChild()); + assertText("[foo]", paragraph.getLastChild()); + } + + @Test + public void testFootnoteInLink() { + // Expected to behave the same way as a link within a link, see https://spec.commonmark.org/0.31.2/#example-518 + // i.e. the first (inner) link is parsed, which means the outer one becomes plain text, as nesting links is not + // allowed. + var doc = PARSER.parse("[link with footnote ref [^1]](https://example.com)\n\n[^1]: footnote\n"); + var ref = find(doc, FootnoteReference.class); + assertThat(ref.getLabel()).isEqualTo("1"); + var paragraph = doc.getFirstChild(); + assertText("[link with footnote ref ", paragraph.getFirstChild()); + assertText("](https://example.com)", paragraph.getLastChild()); + } + + @Test + public void testFootnoteWithMarkerInLink() { + var doc = PARSER.parse("[link with footnote ref ![^1]](https://example.com)\n\n[^1]: footnote\n"); + var ref = find(doc, FootnoteReference.class); + assertThat(ref.getLabel()).isEqualTo("1"); + var paragraph = doc.getFirstChild(); + assertText("[link with footnote ref !", paragraph.getFirstChild()); + assertText("](https://example.com)", paragraph.getLastChild()); + } + + @Test + public void testInlineFootnote() { + var extension = FootnotesExtension.builder().inlineFootnotes(true).build(); + var parser = Parser.builder().extensions(Set.of(extension)).build(); + + { + var doc = parser.parse("Test ^[inline footnote]"); + 
assertText("Test ", doc.getFirstChild().getFirstChild()); + var fn = find(doc, InlineFootnote.class); + assertText("inline footnote", fn.getFirstChild()); + } + + { + var doc = parser.parse("Test \\^[not inline footnote]"); + assertNone(doc, InlineFootnote.class); + } + + { + var doc = parser.parse("Test ^[not inline footnote"); + assertNone(doc, InlineFootnote.class); + var t = doc.getFirstChild().getFirstChild(); + assertText("Test ^[not inline footnote", t); + } + + { + // This is a tricky one because the code span in the link text + // includes the `]` (and doesn't need to be escaped). Therefore + // inline footnote parsing has to do full link text parsing/inline parsing. + // https://spec.commonmark.org/0.31.2/#link-text + + var doc = parser.parse("^[test `bla]`]"); + var fn = find(doc, InlineFootnote.class); + assertText("test ", fn.getFirstChild()); + var code = fn.getFirstChild().getNext(); + assertThat(((Code) code).getLiteral()).isEqualTo("bla]"); + } + + { + var doc = parser.parse("^[with a [link](url)]"); + var fn = find(doc, InlineFootnote.class); + assertText("with a ", fn.getFirstChild()); + var link = fn.getFirstChild().getNext(); + assertThat(((Link) link).getDestination()).isEqualTo("url"); + } + } + + @Test + public void testSourcePositions() { + var parser = Parser.builder().extensions(EXTENSIONS).includeSourceSpans(IncludeSourceSpans.BLOCKS_AND_INLINES).build(); + + var doc = parser.parse("Test [^foo]\n\n[^foo]: /url\n"); + var ref = find(doc, FootnoteReference.class); + assertThat(ref.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 5, 5, 6))); + + var def = find(doc, FootnoteDefinition.class); + assertThat(def.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(2, 0, 13, 12))); + } + + private static void assertNone(Node parent, Class nodeClass) { + assertThat(tryFind(parent, nodeClass)).as(() -> "Node " + parent + " containing " + nodeClass).isNull(); + } + + private static T find(Node parent, Class nodeClass) { + return 
Objects.requireNonNull(tryFind(parent, nodeClass), "Could not find a " + nodeClass.getSimpleName() + " node in " + parent); + } + + private static T tryFind(Node parent, Class nodeClass) { + return findAll(parent, nodeClass).stream().findFirst().orElse(null); + } + + private static List findAll(Node parent, Class nodeClass) { + var nodes = new ArrayList(); + for (var node = parent.getFirstChild(); node != null; node = node.getNext()) { + if (nodeClass.isInstance(node)) { + //noinspection unchecked + nodes.add((T) node); + } + nodes.addAll(findAll(node, nodeClass)); + } + return nodes; + } + + private static void assertText(String expected, Node node) { + var text = (Text) node; + assertThat(text.getLiteral()).isEqualTo(expected); + } +} diff --git a/commonmark-ext-footnotes/src/test/resources/footnotes.html b/commonmark-ext-footnotes/src/test/resources/footnotes.html new file mode 100644 index 000000000..1dd83185f --- /dev/null +++ b/commonmark-ext-footnotes/src/test/resources/footnotes.html @@ -0,0 +1,18 @@ + + + + + + Footnotes testing + + + + +Paste HTML from footnote rendering in here to manually check that linking works as expected. + + + diff --git a/commonmark-ext-gfm-alerts/README.md b/commonmark-ext-gfm-alerts/README.md new file mode 100644 index 000000000..2368812e5 --- /dev/null +++ b/commonmark-ext-gfm-alerts/README.md @@ -0,0 +1,74 @@ +# commonmark-ext-gfm-alerts + +Extension for [commonmark-java](https://github.com/commonmark/commonmark-java) that adds support for [GitHub Flavored Markdown alerts](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#alerts). + +Enables highlighting important information using blockquote syntax with five standard alert types: NOTE, TIP, IMPORTANT, WARNING, and CAUTION. 
+ +## Usage + +#### Markdown Syntax + +```markdown +> [!NOTE] +> Useful information + +> [!WARNING] +> Critical information +``` + +#### Standard GFM Types + +```java +var extension = AlertsExtension.create(); +var parser = Parser.builder().extensions(List.of(extension)).build(); +var renderer = HtmlRenderer.builder().extensions(List.of(extension)).build(); +``` + +#### Custom Alert Types + +Add custom types beyond the five standard GFM types: + +```java +var extension = AlertsExtension.builder() + .addCustomType("BUG", "Known Bug") + .build(); +``` + +Custom types must be UPPERCASE. Standard type titles can also be overridden for localization. + +#### Styling + +Alerts render as `
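<div>` elements with CSS classes: + +```html +<div class="markdown-alert markdown-alert-note" data-alert-type="note"> + <p class="markdown-alert-title">Note</p> + <p>Content</p> +</div>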
+``` + +Basic CSS example: + +```css +.markdown-alert { + padding: 0.5rem 1rem; + margin-bottom: 1rem; + border-left: 4px solid; +} + +.markdown-alert-note { border-color: #0969da; background-color: #ddf4ff; } +.markdown-alert-tip { border-color: #1a7f37; background-color: #dcffe4; } +.markdown-alert-important { border-color: #8250df; background-color: #f6f0ff; } +.markdown-alert-warning { border-color: #9a6700; background-color: #fff8c5; } +.markdown-alert-caution { border-color: #cf222e; background-color: #ffebe9; } +``` + +![Alerts](screenshots/alerts.png) + +Icons can be added using GitHub's [Octicons](https://primer.style/octicons/): + +![Alerts with icons](screenshots/alerts-with-icons.png) + +## License + +See the main commonmark-java project for license information. diff --git a/commonmark-ext-gfm-alerts/pom.xml b/commonmark-ext-gfm-alerts/pom.xml new file mode 100644 index 000000000..02ecbf802 --- /dev/null +++ b/commonmark-ext-gfm-alerts/pom.xml @@ -0,0 +1,27 @@ + + + 4.0.0 + + org.commonmark + commonmark-parent + 0.28.1-SNAPSHOT + + + commonmark-ext-gfm-alerts + commonmark-java extension for alerts + commonmark-java extension for GFM alerts (admonition blocks) using [!TYPE] syntax (GitHub Flavored Markdown) + + + + org.commonmark + commonmark + + + + org.commonmark + commonmark-test-util + test + + + + diff --git a/commonmark-ext-gfm-alerts/screenshots/alerts-with-icons.png b/commonmark-ext-gfm-alerts/screenshots/alerts-with-icons.png new file mode 100644 index 000000000..47da9402b Binary files /dev/null and b/commonmark-ext-gfm-alerts/screenshots/alerts-with-icons.png differ diff --git a/commonmark-ext-gfm-alerts/screenshots/alerts.png b/commonmark-ext-gfm-alerts/screenshots/alerts.png new file mode 100644 index 000000000..83d4009f0 Binary files /dev/null and b/commonmark-ext-gfm-alerts/screenshots/alerts.png differ diff --git a/commonmark-ext-gfm-alerts/src/main/java/module-info.java b/commonmark-ext-gfm-alerts/src/main/java/module-info.java new file 
mode 100644 index 000000000..e8b5aecb7 --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/main/java/module-info.java @@ -0,0 +1,5 @@ +module org.commonmark.ext.gfm.alerts { + exports org.commonmark.ext.gfm.alerts; + + requires transitive org.commonmark; +} diff --git a/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/Alert.java b/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/Alert.java new file mode 100644 index 000000000..bb28e7344 --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/Alert.java @@ -0,0 +1,19 @@ +package org.commonmark.ext.gfm.alerts; + +import org.commonmark.node.CustomBlock; + +/** + * Alert block for highlighting important information using {@code [!TYPE]} syntax. + */ +public class Alert extends CustomBlock { + + private final String type; + + public Alert(String type) { + this.type = type; + } + + public String getType() { + return type; + } +} diff --git a/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/AlertsExtension.java b/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/AlertsExtension.java new file mode 100644 index 000000000..3990034d2 --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/AlertsExtension.java @@ -0,0 +1,118 @@ +package org.commonmark.ext.gfm.alerts; + +import org.commonmark.Extension; +import org.commonmark.ext.gfm.alerts.internal.AlertPostProcessor; +import org.commonmark.ext.gfm.alerts.internal.AlertHtmlNodeRenderer; +import org.commonmark.ext.gfm.alerts.internal.AlertMarkdownNodeRenderer; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.html.HtmlNodeRendererContext; +import org.commonmark.renderer.html.HtmlNodeRendererFactory; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.renderer.markdown.MarkdownNodeRendererContext; +import 
org.commonmark.renderer.markdown.MarkdownNodeRendererFactory; +import org.commonmark.renderer.markdown.MarkdownRenderer; + +import java.util.HashMap; +import java.util.Locale; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +/** + * Extension for GFM alerts using {@code [!TYPE]} syntax (GitHub Flavored Markdown). + *

+ * Create with {@link #create()} or {@link #builder()} and configure on builders + * ({@link org.commonmark.parser.Parser.Builder#extensions(Iterable)}, + * {@link HtmlRenderer.Builder#extensions(Iterable)}). + * Parsed alerts become {@link Alert} blocks. + */ +public class AlertsExtension implements Parser.ParserExtension, HtmlRenderer.HtmlRendererExtension, + MarkdownRenderer.MarkdownRendererExtension { + + static final Set STANDARD_TYPES = Set.of("NOTE", "TIP", "IMPORTANT", "WARNING", "CAUTION"); + + private final Map customTypes; + + private AlertsExtension(Builder builder) { + this.customTypes = new HashMap<>(builder.customTypes); + } + + public static Extension create() { + return builder().build(); + } + + public static Builder builder() { + return new Builder(); + } + + @Override + public void extend(Parser.Builder parserBuilder) { + var allowedTypes = new HashSet<>(STANDARD_TYPES); + allowedTypes.addAll(customTypes.keySet()); + parserBuilder.postProcessor(new AlertPostProcessor(allowedTypes)); + } + + @Override + public void extend(HtmlRenderer.Builder rendererBuilder) { + rendererBuilder.nodeRendererFactory(new HtmlNodeRendererFactory() { + @Override + public NodeRenderer create(HtmlNodeRendererContext context) { + return new AlertHtmlNodeRenderer(context, customTypes); + } + }); + } + + @Override + public void extend(MarkdownRenderer.Builder rendererBuilder) { + rendererBuilder.nodeRendererFactory(new MarkdownNodeRendererFactory() { + @Override + public NodeRenderer create(MarkdownNodeRendererContext context) { + return new AlertMarkdownNodeRenderer(context); + } + + @Override + public Set getSpecialCharacters() { + return Set.of(); + } + }); + } + + /** + * Builder for configuring the alerts extension. + */ + public static class Builder { + private final Map customTypes = new HashMap<>(); + + /** + * Adds a custom alert type with a display title. + *

+ * This can also be used to override the display title of standard GFM types + * (e.g., for localization). + * + * @param type the alert type (must be uppercase) + * @param title the display title for this alert type + * @return {@code this} + */ + public Builder addCustomType(String type, String title) { + if (type == null || type.isEmpty()) { + throw new IllegalArgumentException("Type must not be null or empty"); + } + if (title == null || title.isEmpty()) { + throw new IllegalArgumentException("Title must not be null or empty"); + } + if (!type.equals(type.toUpperCase(Locale.ROOT))) { + throw new IllegalArgumentException("Type must be uppercase: " + type); + } + customTypes.put(type, title); + return this; + } + + /** + * @return a configured {@link Extension} + */ + public Extension build() { + return new AlertsExtension(this); + } + } +} diff --git a/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/internal/AlertHtmlNodeRenderer.java b/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/internal/AlertHtmlNodeRenderer.java new file mode 100644 index 000000000..ca562ba33 --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/internal/AlertHtmlNodeRenderer.java @@ -0,0 +1,78 @@ +package org.commonmark.ext.gfm.alerts.internal; + +import org.commonmark.ext.gfm.alerts.Alert; +import org.commonmark.node.Node; +import org.commonmark.renderer.html.HtmlNodeRendererContext; +import org.commonmark.renderer.html.HtmlWriter; + +import java.util.LinkedHashMap; +import java.util.Map; + +public class AlertHtmlNodeRenderer extends AlertNodeRenderer { + + private final HtmlWriter htmlWriter; + private final HtmlNodeRendererContext context; + private final Map customTypeTitles; + + public AlertHtmlNodeRenderer(HtmlNodeRendererContext context, Map customTypeTitles) { + this.htmlWriter = context.getWriter(); + this.context = context; + this.customTypeTitles = customTypeTitles; + } + + @Override + protected 
void renderAlert(Alert alert) { + var type = alert.getType(); + var cssClass = type.toLowerCase(); + + htmlWriter.line(); + var attributes = new LinkedHashMap(); + attributes.put("class", "markdown-alert markdown-alert-" + cssClass); + attributes.put("data-alert-type", cssClass); + + htmlWriter.tag("div", context.extendAttributes(alert, "div", attributes)); + htmlWriter.line(); + + // Render alert title + htmlWriter.tag("p", context.extendAttributes(alert, "p", Map.of("class", "markdown-alert-title"))); + htmlWriter.text(getAlertTitle(type)); + htmlWriter.tag("/p"); + htmlWriter.line(); + + // Render children (the alert content) + renderChildren(alert); + + htmlWriter.tag("/div"); + htmlWriter.line(); + } + + private String getAlertTitle(String type) { + var customTypeTitle = customTypeTitles.get(type); + if (customTypeTitle != null) { + return customTypeTitle; + } + switch (type) { + case "NOTE": + return "Note"; + case "TIP": + return "Tip"; + case "IMPORTANT": + return "Important"; + case "WARNING": + return "Warning"; + case "CAUTION": + return "Caution"; + default: + throw new IllegalStateException("Unknown alert type: " + type); + } + } + + private void renderChildren(Node parent) { + var node = parent.getFirstChild(); + while (node != null) { + var next = node.getNext(); + context.render(node); + node = next; + } + } +} diff --git a/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/internal/AlertMarkdownNodeRenderer.java b/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/internal/AlertMarkdownNodeRenderer.java new file mode 100644 index 000000000..e3da62aea --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/internal/AlertMarkdownNodeRenderer.java @@ -0,0 +1,38 @@ +package org.commonmark.ext.gfm.alerts.internal; + +import org.commonmark.ext.gfm.alerts.Alert; +import org.commonmark.node.Node; +import org.commonmark.renderer.markdown.MarkdownNodeRendererContext; +import 
org.commonmark.renderer.markdown.MarkdownWriter; + +public class AlertMarkdownNodeRenderer extends AlertNodeRenderer { + + private final MarkdownWriter writer; + private final MarkdownNodeRendererContext context; + + public AlertMarkdownNodeRenderer(MarkdownNodeRendererContext context) { + this.writer = context.getWriter(); + this.context = context; + } + + @Override + protected void renderAlert(Alert alert) { + // First line: > [!TYPE] + writer.writePrefix("> "); + writer.pushPrefix("> "); + writer.raw("[!" + alert.getType() + "]"); + writer.line(); + renderChildren(alert); + writer.popPrefix(); + writer.block(); + } + + private void renderChildren(Node parent) { + var node = parent.getFirstChild(); + while (node != null) { + var next = node.getNext(); + context.render(node); + node = next; + } + } +} diff --git a/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/internal/AlertNodeRenderer.java b/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/internal/AlertNodeRenderer.java new file mode 100644 index 000000000..45b34bb46 --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/internal/AlertNodeRenderer.java @@ -0,0 +1,23 @@ +package org.commonmark.ext.gfm.alerts.internal; + +import org.commonmark.ext.gfm.alerts.Alert; +import org.commonmark.node.Node; +import org.commonmark.renderer.NodeRenderer; + +import java.util.Set; + +public abstract class AlertNodeRenderer implements NodeRenderer { + + @Override + public Set<Class<? extends Node>> getNodeTypes() { + return Set.of(Alert.class); + } + + @Override + public void render(Node node) { + var alert = (Alert) node; // only Alert is registered in getNodeTypes() + renderAlert(alert); + } + + protected abstract void renderAlert(Alert alert); +} diff --git a/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/internal/AlertPostProcessor.java b/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/internal/AlertPostProcessor.java new file mode 100644 index 000000000..8008fc8dd
--- /dev/null +++ b/commonmark-ext-gfm-alerts/src/main/java/org/commonmark/ext/gfm/alerts/internal/AlertPostProcessor.java @@ -0,0 +1,111 @@ +package org.commonmark.ext.gfm.alerts.internal; + +import org.commonmark.ext.gfm.alerts.Alert; +import org.commonmark.node.BlockQuote; +import org.commonmark.node.HardLineBreak; +import org.commonmark.node.Node; +import org.commonmark.node.Paragraph; +import org.commonmark.node.SoftLineBreak; +import org.commonmark.node.Text; +import org.commonmark.parser.PostProcessor; + +import java.util.Locale; +import java.util.Set; +import java.util.regex.Pattern; + +public class AlertPostProcessor implements PostProcessor { + + // Alert type marker, matching any case (GitHub supports lowercase, mixed, and uppercase) + private static final Pattern ALERT_PATTERN = Pattern.compile("^\\[!([a-zA-Z]+)]\\s*$"); + + private final Set<String> allowedTypes; // compared against the upper-cased marker type + + public AlertPostProcessor(Set<String> allowedTypes) { + this.allowedTypes = allowedTypes; + } + + @Override + public Node process(Node document) { + // Only look at direct children of Document — GitHub only detects alerts at the top level. + var child = document.getFirstChild(); + while (child != null) { + var next = child.getNext(); + if (child instanceof BlockQuote) { + tryConvertToAlert((BlockQuote) child); + } + child = next; + } + return document; + } + + private void tryConvertToAlert(BlockQuote blockQuote) { + var firstChild = blockQuote.getFirstChild(); + if (!(firstChild instanceof Paragraph)) { + return; + } + + var paragraph = (Paragraph) firstChild; + var firstInline = paragraph.getFirstChild(); + if (!(firstInline instanceof Text)) { + return; + } + + var textNode = (Text) firstInline; + + // The alert marker can be the entire text node content, or just the first line + // before a line break (trailing spaces create a HardLineBreak instead of SoftLineBreak).
+ var afterMarker = firstInline.getNext(); + if (afterMarker != null && !(afterMarker instanceof SoftLineBreak) && !(afterMarker instanceof HardLineBreak)) { + // Text followed by something other than a line break - not an alert + return; + } + + var matcher = ALERT_PATTERN.matcher(textNode.getLiteral()); + if (!matcher.matches()) { + return; + } + + var type = matcher.group(1).toUpperCase(Locale.ROOT); + if (!allowedTypes.contains(type)) { + return; + } + + // Must have content after the marker line. An alert with ONLY the marker + // and no content is a normal blockquote on GitHub. + if (afterMarker != null) { + // There's a line break after marker - check if there's content after it + if (afterMarker.getNext() == null && paragraph.getNext() == null) { + return; + } + afterMarker.unlink(); + } else { + // Marker is the only thing in the paragraph + if (paragraph.getNext() == null) { + return; + } + } + + // Valid alert. Create Alert node and transfer children. + var alert = new Alert(type); + alert.setSourceSpans(blockQuote.getSourceSpans()); + blockQuote.insertAfter(alert); + + // Remove the marker text from the first paragraph + firstInline.unlink(); + + // If paragraph is now empty, remove it + if (paragraph.getFirstChild() == null) { + paragraph.unlink(); + } + + // Move remaining children from blockquote to alert + var child = blockQuote.getFirstChild(); + while (child != null) { + var next = child.getNext(); + alert.appendChild(child); + child = next; + } + + blockQuote.unlink(); + } +} diff --git a/commonmark-ext-gfm-alerts/src/main/javadoc/overview.html b/commonmark-ext-gfm-alerts/src/main/javadoc/overview.html new file mode 100644 index 000000000..145232a87 --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/main/javadoc/overview.html @@ -0,0 +1,6 @@ + + +Extension for GitHub Flavored Markdown (GFM) alerts using blockquote syntax +

<p>See {@link org.commonmark.ext.gfm.alerts.AlertsExtension}</p>

+ + diff --git a/commonmark-ext-gfm-alerts/src/test/java/org/commonmark/ext/gfm/alerts/AlertsMarkdownRendererTest.java b/commonmark-ext-gfm-alerts/src/test/java/org/commonmark/ext/gfm/alerts/AlertsMarkdownRendererTest.java new file mode 100644 index 000000000..aca90e2df --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/test/java/org/commonmark/ext/gfm/alerts/AlertsMarkdownRendererTest.java @@ -0,0 +1,73 @@ +package org.commonmark.ext.gfm.alerts; + +import org.commonmark.Extension; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.markdown.MarkdownRenderer; +import org.junit.jupiter.api.Test; + +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; + +public class AlertsMarkdownRendererTest { + + private static final Set EXTENSIONS = Set.of(AlertsExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final MarkdownRenderer RENDERER = MarkdownRenderer.builder().extensions(EXTENSIONS).build(); + + @Test + public void alertRoundTrip() { + assertRoundTrip("> [!WARNING]\n> Be careful\n"); + } + + @Test + public void allStandardTypesRoundTrip() { + assertRoundTrip("> [!NOTE]\n> Note\n"); + assertRoundTrip("> [!TIP]\n> Tip\n"); + assertRoundTrip("> [!IMPORTANT]\n> Important\n"); + assertRoundTrip("> [!WARNING]\n> Warning\n"); + assertRoundTrip("> [!CAUTION]\n> Caution\n"); + } + + @Test + public void lowercaseTypeRendersAsUppercase() { + // Lowercase input gets normalized to uppercase type + String rendered = RENDERER.render(PARSER.parse("> [!note]\n> Content\n")); + assertThat(rendered).isEqualTo("> [!NOTE]\n> Content\n"); + } + + @Test + public void alertWithMultipleParagraphs() { + String input = "> [!NOTE]\n> First paragraph\n>\n> Second paragraph\n"; + // MarkdownWriter always writes the prefix including trailing space + String expected = "> [!NOTE]\n> First paragraph\n> \n> Second paragraph\n"; + String rendered = 
RENDERER.render(PARSER.parse(input)); + assertThat(rendered).isEqualTo(expected); + } + + @Test + public void customTypeRoundTrip() { + Extension extension = AlertsExtension.builder() + .addCustomType("INFO", "Information") + .build(); + + Parser parser = Parser.builder().extensions(Set.of(extension)).build(); + MarkdownRenderer renderer = MarkdownRenderer.builder().extensions(Set.of(extension)).build(); + + String input = "> [!INFO]\n> Custom type\n"; + String rendered = renderer.render(parser.parse(input)); + assertThat(rendered).isEqualTo(input); + } + + @Test + public void alertWithList() { + String input = "> [!NOTE]\n> Items:\n> \n> - First\n> - Second\n"; + String rendered = RENDERER.render(PARSER.parse(input)); + assertThat(rendered).isEqualTo(input); + } + + private void assertRoundTrip(String input) { + String rendered = RENDERER.render(PARSER.parse(input)); + assertThat(rendered).isEqualTo(input); + } +} diff --git a/commonmark-ext-gfm-alerts/src/test/java/org/commonmark/ext/gfm/alerts/AlertsSpecTest.java b/commonmark-ext-gfm-alerts/src/test/java/org/commonmark/ext/gfm/alerts/AlertsSpecTest.java new file mode 100644 index 000000000..8155d8009 --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/test/java/org/commonmark/ext/gfm/alerts/AlertsSpecTest.java @@ -0,0 +1,44 @@ +package org.commonmark.ext.gfm.alerts; + +import org.commonmark.Extension; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.commonmark.testutil.example.Example; +import org.commonmark.testutil.example.ExampleReader; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.Parameter; +import org.junit.jupiter.params.ParameterizedClass; +import org.junit.jupiter.params.provider.MethodSource; + +import java.net.URL; +import java.util.List; +import java.util.Set; + +@ParameterizedClass +@MethodSource("data") +public class AlertsSpecTest extends RenderingTestCase { + + private 
static final Set EXTENSIONS = Set.of(AlertsExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + // Use softbreak("
") to match GitHub's rendering for easier comparison with GitHub API output. + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().extensions(EXTENSIONS).softbreak("
\n").build(); + + @Parameter + Example example; + + static List data() { + URL spec = AlertsSpecTest.class.getResource("/alerts-spec.txt"); + return ExampleReader.readExamples(spec, "alert"); + } + + @Test + public void testHtmlRendering() { + assertRendering(example.getSource(), example.getHtml()); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } +} \ No newline at end of file diff --git a/commonmark-ext-gfm-alerts/src/test/java/org/commonmark/ext/gfm/alerts/AlertsTest.java b/commonmark-ext-gfm-alerts/src/test/java/org/commonmark/ext/gfm/alerts/AlertsTest.java new file mode 100644 index 000000000..c46c532fe --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/test/java/org/commonmark/ext/gfm/alerts/AlertsTest.java @@ -0,0 +1,140 @@ +package org.commonmark.ext.gfm.alerts; + +import org.commonmark.Extension; +import org.commonmark.node.Node; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.junit.jupiter.api.Test; + +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class AlertsTest { + + private static final Set EXTENSIONS = Set.of(AlertsExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + + // Custom types + + @Test + public void customType() { + Extension extension = AlertsExtension.builder() + .addCustomType("INFO", "Information") + .build(); + + Parser parser = Parser.builder().extensions(Set.of(extension)).build(); + HtmlRenderer renderer = HtmlRenderer.builder().extensions(Set.of(extension)).build(); + + assertThat(renderer.render(parser.parse("> [!INFO]\n> Custom alert"))).isEqualTo( + "
\n" + + "

Information

\n" + + "

Custom alert

\n" + + "
\n"); + } + + @Test + public void multipleCustomTypes() { + Extension extension = AlertsExtension.builder() + .addCustomType("INFO", "Information") + .addCustomType("SUCCESS", "Success!") + .addCustomType("DANGER", "Danger!") + .build(); + + Parser parser = Parser.builder().extensions(Set.of(extension)).build(); + HtmlRenderer renderer = HtmlRenderer.builder().extensions(Set.of(extension)).build(); + + assertThat(renderer.render(parser.parse("> [!INFO]\n> Info content\n\n> [!SUCCESS]\n> Success content\n\n> [!DANGER]\n> Danger content"))).isEqualTo( + "
\n" + + "

Information

\n" + + "

Info content

\n" + + "
\n" + + "
\n" + + "

Success!

\n" + + "

Success content

\n" + + "
\n" + + "
\n" + + "

Danger!

\n" + + "

Danger content

\n" + + "
\n"); + } + + @Test + public void standardTypesWithCustomConfigured() { + Extension extension = AlertsExtension.builder() + .addCustomType("INFO", "Information") + .build(); + + Parser parser = Parser.builder().extensions(Set.of(extension)).build(); + HtmlRenderer renderer = HtmlRenderer.builder().extensions(Set.of(extension)).build(); + + assertThat(renderer.render(parser.parse("> [!NOTE]\n> Standard type"))).isEqualTo( + "
\n" + + "

Note

\n" + + "

Standard type

\n" + + "
\n"); + } + + @Test + public void overrideStandardTypeTitle() { + Extension extension = AlertsExtension.builder() + .addCustomType("NOTE", "Nota") + .build(); + + Parser parser = Parser.builder().extensions(Set.of(extension)).build(); + HtmlRenderer renderer = HtmlRenderer.builder().extensions(Set.of(extension)).build(); + + assertThat(renderer.render(parser.parse("> [!NOTE]\n> Localized title"))).isEqualTo( + "
\n" + + "

Nota

\n" + + "

Localized title

\n" + + "
\n"); + } + + // Custom type validation + + @Test + public void customTypeMustBeUppercase() { + assertThrows(IllegalArgumentException.class, () -> + AlertsExtension.builder().addCustomType("info", "Information").build()); + } + + @Test + public void customTypeMustNotBeEmpty() { + assertThrows(IllegalArgumentException.class, () -> + AlertsExtension.builder().addCustomType("", "Title").build()); + } + + @Test + public void customTypeTitleMustNotBeEmpty() { + assertThrows(IllegalArgumentException.class, () -> + AlertsExtension.builder().addCustomType("INFO", "").build()); + } + + // AST + + @Test + public void alertParsedAsAlertNode() { + Node document = PARSER.parse("> [!NOTE]\n> This is a note"); + Node firstChild = document.getFirstChild(); + assertThat(firstChild).isInstanceOf(Alert.class); + Alert alert = (Alert) firstChild; + assertThat(alert.getType()).isEqualTo("NOTE"); + } + + @Test + public void customTypeParsedAsAlertNode() { + Extension extension = AlertsExtension.builder() + .addCustomType("INFO", "Information") + .build(); + + Parser parser = Parser.builder().extensions(Set.of(extension)).build(); + + Node document = parser.parse("> [!INFO]\n> Custom alert"); + Alert alert = (Alert) document.getFirstChild(); + + assertThat(alert.getType()).isEqualTo("INFO"); + } + +} \ No newline at end of file diff --git a/commonmark-ext-gfm-alerts/src/test/java/org/commonmark/ext/gfm/alerts/examples/AlertsExample.java b/commonmark-ext-gfm-alerts/src/test/java/org/commonmark/ext/gfm/alerts/examples/AlertsExample.java new file mode 100644 index 000000000..34b78385c --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/test/java/org/commonmark/ext/gfm/alerts/examples/AlertsExample.java @@ -0,0 +1,85 @@ +package org.commonmark.ext.gfm.alerts.examples; + +import org.commonmark.ext.gfm.alerts.AlertsExtension; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; + +import java.util.List; + +/** + * Example demonstrating the use of the GFM 
Alerts extension. + */ +public class AlertsExample { + + public static void main(String[] args) { + standardTypesExample(); + System.out.println("\n" + "=".repeat(60) + "\n"); + customTypesExample(); + } + + private static void standardTypesExample() { + System.out.println("STANDARD GFM ALERT TYPES"); + System.out.println("=".repeat(60)); + + var extension = AlertsExtension.create(); + + var parser = Parser.builder() + .extensions(List.of(extension)) + .build(); + + var renderer = HtmlRenderer.builder() + .extensions(List.of(extension)) + .build(); + + var markdown = "# GFM Alerts Demo\n\n" + + "> [!NOTE]\n" + + "> Highlights information that users should take into account.\n\n" + + "> [!TIP]\n" + + "> Helpful advice for doing things better.\n\n" + + "> [!IMPORTANT]\n" + + "> Key information users need to know.\n\n" + + "> [!WARNING]\n" + + "> Urgent info that needs immediate attention.\n\n" + + "> [!CAUTION]\n" + + "> Advises about risks or negative outcomes.\n"; + + var html = renderer.render(parser.parse(markdown)); + + System.out.println("Markdown Input:"); + System.out.println(markdown); + System.out.println("\nHTML Output:"); + System.out.println(html); + } + + private static void customTypesExample() { + System.out.println("CUSTOM ALERT TYPES"); + System.out.println("=".repeat(60)); + + var extension = AlertsExtension.builder() + .addCustomType("BUG", "Known Bug") + .build(); + + var parser = Parser.builder() + .extensions(List.of(extension)) + .build(); + + var renderer = HtmlRenderer.builder() + .extensions(List.of(extension)) + .build(); + + var markdown = "# Custom Alert Types\n\n" + + "> [!NOTE]\n" + + "> Useful information that users should know.\n\n" + + "> [!TIP]\n" + + "> Helpful advice for doing things better.\n\n" + + "> [!BUG]\n" + + "> This feature has a known issue with large files (see #42).\n"; + + var html = renderer.render(parser.parse(markdown)); + + System.out.println("Markdown Input:"); + System.out.println(markdown); + 
System.out.println("\nHTML Output:"); + System.out.println(html); + } +} diff --git a/commonmark-ext-gfm-alerts/src/test/resources/alerts-spec-template.md b/commonmark-ext-gfm-alerts/src/test/resources/alerts-spec-template.md new file mode 100644 index 000000000..9c1cf117b --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/test/resources/alerts-spec-template.md @@ -0,0 +1,280 @@ +# Alerts + +## Standard types + +```````````````````````````````` example alert +> [!NOTE] +> This is a note +```````````````````````````````` + +```````````````````````````````` example alert +> [!TIP] +> This is a tip +```````````````````````````````` + +```````````````````````````````` example alert +> [!IMPORTANT] +> This is important +```````````````````````````````` + +```````````````````````````````` example alert +> [!WARNING] +> This is a warning +```````````````````````````````` + +```````````````````````````````` example alert +> [!CAUTION] +> This is a caution +```````````````````````````````` + +## Case insensitivity + +Alert type matching is case-insensitive. 
+ +```````````````````````````````` example alert +> [!note] +> Content +```````````````````````````````` + +```````````````````````````````` example alert +> [!Note] +> Content +```````````````````````````````` + +## Alert content + +Marker alone in first paragraph, blank line, then content: + +```````````````````````````````` example alert +> [!NOTE] +> +> Content +```````````````````````````````` + +Multiple paragraphs: + +```````````````````````````````` example alert +> [!NOTE] +> First paragraph +> +> Second paragraph +```````````````````````````````` + +Inline formatting: + +```````````````````````````````` example alert +> [!TIP] +> This is **bold** and *italic* +```````````````````````````````` + +Code block inside alert: + +```````````````````````````````` example alert +> [!TIP] +> Code: +> +> function() { } +> +> End +```````````````````````````````` + +List inside alert: + +```````````````````````````````` example alert +> [!IMPORTANT] +> Items: +> - First item +> - Second item +```````````````````````````````` + +Links inside alert: + +```````````````````````````````` example alert +> [!NOTE] +> Check out [this link](https://example.com) for more info +```````````````````````````````` + +Heading inside alert: + +```````````````````````````````` example alert +> [!IMPORTANT] +> ## Heading +> Content below heading +```````````````````````````````` + +Empty lines in middle of alert: + +```````````````````````````````` example alert +> [!NOTE] +> First +> +> +> After empty lines +```````````````````````````````` + +## Not an alert + +Text after marker on the same line: + +```````````````````````````````` example alert +> [!NOTE] Some text +```````````````````````````````` + +Unknown type: + +```````````````````````````````` example alert +> [!INVALID] +> Some text +```````````````````````````````` + +Unconfigured custom type is not an alert: + +```````````````````````````````` example alert +> [!INFO] +> Should be blockquote 
+```````````````````````````````` + +Marker with no content: + +```````````````````````````````` example alert +> [!NOTE] +```````````````````````````````` + +Whitespace-only content after marker: + +```````````````````````````````` example alert +> [!TIP] +> +> +```````````````````````````````` + +Extra space inside marker: + +```````````````````````````````` example alert +> [! NOTE] +> Should be blockquote +```````````````````````````````` + +Missing brackets: + +```````````````````````````````` example alert +> !NOTE +> Should be blockquote +```````````````````````````````` + +Missing exclamation mark: + +```````````````````````````````` example alert +> [NOTE] +> Should be blockquote +```````````````````````````````` + +Regular blockquote is not affected: + +```````````````````````````````` example alert +> This is a regular blockquote +```````````````````````````````` + +## Boundaries + +Trailing spaces after marker: + +```````````````````````````````` example alert +> [!NOTE] +> This is a note +```````````````````````````````` + +Trailing tabs after marker: + +```````````````````````````````` example alert +> [!WARNING]→→ +> Be careful +```````````````````````````````` + +Leading spaces before blockquote marker: + +```````````````````````````````` example alert + > [!IMPORTANT] + > Content +```````````````````````````````` + +Blank line after marker ends the blockquote (not an alert): + +```````````````````````````````` example alert +> [!NOTE] + +Some text +```````````````````````````````` + +Alert followed by blockquote: + +```````````````````````````````` example alert +> [!NOTE] +> This is an alert + +> This is a blockquote +```````````````````````````````` + +Adjacent alerts: + +```````````````````````````````` example alert +> [!NOTE] +> First alert + +> [!WARNING] +> Second alert +```````````````````````````````` + +## Nesting and containers + +Nested alert inside alert renders as blockquote: + +```````````````````````````````` example alert +> 
[!NOTE] +> This is a note +>> [!WARNING] +>> Nested content +```````````````````````````````` + +Nested blockquote inside alert: + +```````````````````````````````` example alert +> [!NOTE] +> This is a note +>> Nested blockquote +```````````````````````````````` + +Alert inside list item stays as blockquote: + +```````````````````````````````` example alert +- > [!NOTE] + > Test +```````````````````````````````` + +Alert marker in content is treated as text: + +```````````````````````````````` example alert +> [!NOTE] +> This is a note +> [!WARNING] +> This is still part of the note +```````````````````````````````` + +## Continuation and interruption + +Lazy continuation: + +```````````````````````````````` example alert +> [!NOTE] +> First line +Lazy continuation +> Continues alert +```````````````````````````````` + +Alert type after regular blockquote content is not an alert: + +```````````````````````````````` example alert +> Regular blockquote +> [!NOTE] +> More text +```````````````````````````````` \ No newline at end of file diff --git a/commonmark-ext-gfm-alerts/src/test/resources/alerts-spec.txt b/commonmark-ext-gfm-alerts/src/test/resources/alerts-spec.txt new file mode 100644 index 000000000..6f041fee4 --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/test/resources/alerts-spec.txt @@ -0,0 +1,492 @@ +# Alerts + +Expectations verified against GitHub Markdown API (gh api markdown -f mode=gfm). +Our HTML omits GitHub's SVG icons and uses a `data-alert-type` attribute instead. + +## Standard types + +```````````````````````````````` example alert +> [!NOTE] +> This is a note +. +
+

Note

+

This is a note

+
+```````````````````````````````` + +```````````````````````````````` example alert +> [!TIP] +> This is a tip +. +
+

Tip

+

This is a tip

+
+```````````````````````````````` + +```````````````````````````````` example alert +> [!IMPORTANT] +> This is important +. +
+

Important

+

This is important

+
+```````````````````````````````` + +```````````````````````````````` example alert +> [!WARNING] +> This is a warning +. +
+

Warning

+

This is a warning

+
+```````````````````````````````` + +```````````````````````````````` example alert +> [!CAUTION] +> This is a caution +. +
+

Caution

+

This is a caution

+
+```````````````````````````````` + +## Case insensitivity + +Alert type matching is case-insensitive. + +```````````````````````````````` example alert +> [!note] +> Content +. +
+

Note

+

Content

+
+```````````````````````````````` + +```````````````````````````````` example alert +> [!Note] +> Content +. +
+

Note

+

Content

+
+```````````````````````````````` + +## Alert content + +Marker alone in first paragraph, blank line, then content: + +```````````````````````````````` example alert +> [!NOTE] +> +> Content +. +
+

Note

+

Content

+
+```````````````````````````````` + +Multiple paragraphs: + +```````````````````````````````` example alert +> [!NOTE] +> First paragraph +> +> Second paragraph +. +
+

Note

+

First paragraph

+

Second paragraph

+
+```````````````````````````````` + +Inline formatting: + +```````````````````````````````` example alert +> [!TIP] +> This is **bold** and *italic* +. +
+

Tip

+

This is bold and italic

+
+```````````````````````````````` + +Code block inside alert: + +```````````````````````````````` example alert +> [!TIP] +> Code: +> +> function() { } +> +> End +. +
+

Tip

+

Code:

+
function() { }
+
+

End

+
+```````````````````````````````` + +List inside alert: + +```````````````````````````````` example alert +> [!IMPORTANT] +> Items: +> - First item +> - Second item +. +
+

Important

+

Items:

+
    +
  • First item
  • +
  • Second item
  • +
+
+```````````````````````````````` + +Links inside alert: + +```````````````````````````````` example alert +> [!NOTE] +> Check out [this link](https://example.com) for more info +. +
+

Note

+

Check out this link for more info

+
+```````````````````````````````` + +Heading inside alert: + +```````````````````````````````` example alert +> [!IMPORTANT] +> ## Heading +> Content below heading +. +
+

Important

+

Heading

+

Content below heading

+
+```````````````````````````````` + +Empty lines in middle of alert: + +```````````````````````````````` example alert +> [!NOTE] +> First +> +> +> After empty lines +. +
+

Note

+

First

+

After empty lines

+
+```````````````````````````````` + +## Not an alert + +Text after marker on the same line: + +```````````````````````````````` example alert +> [!NOTE] Some text +. +
+

[!NOTE] Some text

+
+```````````````````````````````` + +Unknown type: + +```````````````````````````````` example alert +> [!INVALID] +> Some text +. +
+

[!INVALID]
+Some text

+
+```````````````````````````````` + +Unconfigured custom type is not an alert: + +```````````````````````````````` example alert +> [!INFO] +> Should be blockquote +. +
+

[!INFO]
+Should be blockquote

+
+```````````````````````````````` + +Marker with no content: + +```````````````````````````````` example alert +> [!NOTE] +. +
+

[!NOTE]

+
+```````````````````````````````` + +Whitespace-only content after marker: + +```````````````````````````````` example alert +> [!TIP] +> +> +. +
+

[!TIP]

+
+```````````````````````````````` + +Extra space inside marker: + +```````````````````````````````` example alert +> [! NOTE] +> Should be blockquote +. +
+

[! NOTE]
+Should be blockquote

+
+```````````````````````````````` + +Missing brackets: + +```````````````````````````````` example alert +> !NOTE +> Should be blockquote +. +
+

!NOTE
+Should be blockquote

+
+```````````````````````````````` + +Missing exclamation mark: + +```````````````````````````````` example alert +> [NOTE] +> Should be blockquote +. +
+

[NOTE]
+Should be blockquote

+
+```````````````````````````````` + +Regular blockquote is not affected: + +```````````````````````````````` example alert +> This is a regular blockquote +. +
+

This is a regular blockquote

+
+```````````````````````````````` + +## Boundaries + +Trailing spaces after marker: + +```````````````````````````````` example alert +> [!NOTE] +> This is a note +. +
+

Note

+

This is a note

+
+```````````````````````````````` + +Trailing tabs after marker: + +```````````````````````````````` example alert +> [!WARNING]→→ +> Be careful +. +
+

Warning

+

Be careful

+
+```````````````````````````````` + +Leading spaces before blockquote marker: + +```````````````````````````````` example alert + > [!IMPORTANT] + > Content +. +
+

Important

+

Content

+
+```````````````````````````````` + +Blank line after marker ends the blockquote (not an alert): + +```````````````````````````````` example alert +> [!NOTE] + +Some text +. +
+

[!NOTE]

+
+

Some text

+```````````````````````````````` + +Alert followed by blockquote: + +```````````````````````````````` example alert +> [!NOTE] +> This is an alert + +> This is a blockquote +. +
+

Note

+

This is an alert

+
+
+

This is a blockquote

+
+```````````````````````````````` + +Adjacent alerts: + +```````````````````````````````` example alert +> [!NOTE] +> First alert + +> [!WARNING] +> Second alert +. +
+

Note

+

First alert

+
+
+

Warning

+

Second alert

+
+```````````````````````````````` + +## Nesting and containers + +Nested alert inside alert renders as blockquote: + +```````````````````````````````` example alert +> [!NOTE] +> This is a note +>> [!WARNING] +>> Nested content +. +
+

Note

+

This is a note

+
+

[!WARNING]
+Nested content

+
+
+```````````````````````````````` + +Nested blockquote inside alert: + +```````````````````````````````` example alert +> [!NOTE] +> This is a note +>> Nested blockquote +. +
+

Note

+

This is a note

+
+

Nested blockquote

+
+
+```````````````````````````````` + +Alert inside list item stays as blockquote: + +```````````````````````````````` example alert +- > [!NOTE] + > Test +. +
    +
  • +
    +

    [!NOTE]
    +Test

    +
    +
  • +
+```````````````````````````````` + +Alert marker in content is treated as text: + +```````````````````````````````` example alert +> [!NOTE] +> This is a note +> [!WARNING] +> This is still part of the note +. +
+

Note

+

This is a note
+[!WARNING]
+This is still part of the note

+
+```````````````````````````````` + +## Continuation and interruption + +Lazy continuation: + +```````````````````````````````` example alert +> [!NOTE] +> First line +Lazy continuation +> Continues alert +. +
+

Note

+

First line
+Lazy continuation
+Continues alert

+
+```````````````````````````````` + +Alert type after regular blockquote content is not an alert: + +```````````````````````````````` example alert +> Regular blockquote +> [!NOTE] +> More text +. +
+

Regular blockquote
+[!NOTE]
+More text

+
+```````````````````````````````` diff --git a/commonmark-ext-gfm-alerts/src/test/resources/generate-alerts-spec.java b/commonmark-ext-gfm-alerts/src/test/resources/generate-alerts-spec.java new file mode 100644 index 000000000..06192f107 --- /dev/null +++ b/commonmark-ext-gfm-alerts/src/test/resources/generate-alerts-spec.java @@ -0,0 +1,111 @@ +///usr/bin/env jbang "$0" "$@" ; exit $? + +// Generates alerts-spec.txt from alerts-spec-template.md by rendering each example +// through the GitHub Markdown API and inserting the normalized HTML expectation. +// +// Prerequisites: gh CLI installed and authenticated (gh auth login) +// Usage: cd commonmark-ext-gfm-alerts/src/test/resources && jbang generate-alerts-spec.java + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +class GenerateAlertsSpec { + + private static final String FENCE = "````````````````````````````````"; + private static final String EXAMPLE_OPEN = FENCE + " example alert"; + + public static void main(String[] args) throws Exception { + var templatePath = Path.of("alerts-spec-template.md"); + if (!Files.exists(templatePath)) { + System.err.println("Run from the directory containing alerts-spec-template.md"); + System.exit(1); + } + + var lines = Files.readAllLines(templatePath); + var output = new ArrayList(); + var header = "Expectations verified against GitHub Markdown API (gh api markdown -f mode=gfm).\n" + + "Our HTML omits GitHub's SVG icons and uses a `data-alert-type` attribute instead."; + + int exampleCount = 0; + int i = 0; + while (i < lines.size()) { + var line = lines.get(i); + + // Insert header after the first heading + if (i == 0 && line.startsWith("# ")) { + output.add(line); + output.add(""); + output.add(header); + i++; + continue; + } + + if (line.equals(EXAMPLE_OPEN)) { + // Collect source lines until closing fence + output.add(line); + i++; + var sourceLines = new ArrayList(); + 
while (i < lines.size() && !lines.get(i).equals(FENCE)) { + sourceLines.add(lines.get(i)); + output.add(lines.get(i)); + i++; + } + + // Render via GitHub API (→ represents tabs in the spec format) + var source = String.join("\n", sourceLines).replace("\u2192", "\t"); + exampleCount++; + System.out.printf("%d: %s%n", exampleCount, + source.substring(0, Math.min(40, source.length())).replace("\n", "\\n")); + + var ghHtml = normalizeHtml(renderViaGh(source)); + + // Insert separator and HTML expectation + output.add("."); + output.add(ghHtml); + output.add(FENCE); + i++; // skip closing fence from template + } else { + output.add(line); + i++; + } + } + + var specPath = Path.of("alerts-spec.txt"); + Files.writeString(specPath, String.join("\n", output) + "\n"); + System.out.println("Done — " + exampleCount + " examples written to alerts-spec.txt"); + } + + static String renderViaGh(String markdown) throws Exception { + var process = new ProcessBuilder("gh", "api", "markdown", "-f", "mode=gfm", "-f", "text=" + markdown) + .redirectErrorStream(true) + .start(); + var output = new String(process.getInputStream().readAllBytes()); + if (process.waitFor() != 0) { + throw new RuntimeException("gh api failed: " + output); + } + return output; + } + + // Normalize GitHub API HTML to match our renderer output. + static String normalizeHtml(String html) { + // Strip GitHub-specific elements and attributes + html = Pattern.compile("]*>.*?", Pattern.DOTALL).matcher(html).replaceAll(""); + html = html.replaceAll(" (dir=\"auto\"|rel=\"nofollow\"|class=\"notranslate\")", ""); + // Add data-alert-type and insert newlines to match our renderer's formatting + html = Pattern.compile("class=\"markdown-alert markdown-alert-(\\w+)\"") + .matcher(html) + .replaceAll("class=\"markdown-alert markdown-alert-$1\" data-alert-type=\"$1\""); + html = Pattern.compile("(data-alert-type=\"\\w+\">)(

", "

\n

"); + return html.replace("\r\n", "\n").lines() + .map(String::stripTrailing) + .reduce((a, b) -> a + "\n" + b) + .orElse("") + .strip(); + } +} \ No newline at end of file diff --git a/commonmark-ext-gfm-strikethrough/pom.xml b/commonmark-ext-gfm-strikethrough/pom.xml index 9ad3a1657..9d8f55e5f 100644 --- a/commonmark-ext-gfm-strikethrough/pom.xml +++ b/commonmark-ext-gfm-strikethrough/pom.xml @@ -2,30 +2,24 @@ 4.0.0 - com.atlassian.commonmark + org.commonmark commonmark-parent - 0.1.1-SNAPSHOT + 0.28.1-SNAPSHOT commonmark-ext-gfm-strikethrough commonmark-java extension for strikethrough - commonmark-java extension for GFM strikethrough using ~~~ (GitHub Flavored Markdown) + commonmark-java extension for GFM strikethrough using ~~ (GitHub Flavored Markdown) - com.atlassian.commonmark + org.commonmark commonmark - junit - junit - test - - - com.atlassian.commonmark - commonmark - test-jar + org.commonmark + commonmark-test-util test diff --git a/commonmark-ext-gfm-strikethrough/src/main/java/module-info.java b/commonmark-ext-gfm-strikethrough/src/main/java/module-info.java new file mode 100644 index 000000000..b6204934b --- /dev/null +++ b/commonmark-ext-gfm-strikethrough/src/main/java/module-info.java @@ -0,0 +1,5 @@ +module org.commonmark.ext.gfm.strikethrough { + exports org.commonmark.ext.gfm.strikethrough; + + requires transitive org.commonmark; +} diff --git a/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/Strikethrough.java b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/Strikethrough.java index 25b618f02..0c24642bc 100644 --- a/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/Strikethrough.java +++ b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/Strikethrough.java @@ -1,6 +1,26 @@ package org.commonmark.ext.gfm.strikethrough; import org.commonmark.node.CustomNode; +import org.commonmark.node.Delimited; -public 
class Strikethrough extends CustomNode { +/** + * A strikethrough node containing text and other inline nodes as children. + */ +public class Strikethrough extends CustomNode implements Delimited { + + private String delimiter; + + public Strikethrough(String delimiter) { + this.delimiter = delimiter; + } + + @Override + public String getOpeningDelimiter() { + return delimiter; + } + + @Override + public String getClosingDelimiter() { + return delimiter; + } } diff --git a/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/StrikethroughDelimiterProcessor.java b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/StrikethroughDelimiterProcessor.java deleted file mode 100644 index 40019a55a..000000000 --- a/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/StrikethroughDelimiterProcessor.java +++ /dev/null @@ -1,53 +0,0 @@ -package org.commonmark.ext.gfm.strikethrough; - -import org.commonmark.node.Node; -import org.commonmark.node.Text; -import org.commonmark.parser.DelimiterProcessor; - -public class StrikethroughDelimiterProcessor implements DelimiterProcessor { - - @Override - public char getDelimiterChar() { - return '~'; - } - - @Override - public int getMinDelimiterCount() { - return 2; - } - - @Override - public int getDelimiterUse(int openerCount, int closerCount) { - if (openerCount >= 2 && closerCount >= 2) { - return 2; - } else { - // Can happen if a run had 3 delimiters before, and we removed 2 of them in an earlier processing step. - // So just use 1 of them, see corresponding handling in process method. - return 1; - } - } - - @Override - public void process(Text opener, Text closer, int delimiterCount) { - // Can happen if a run had 3 or more delimiters, so 1 is left over. Don't turn that into strikethrough, but - // preserve original character. 
- if (delimiterCount == 1) { - opener.insertAfter(new Text("~")); - closer.insertBefore(new Text("~")); - return; - } - - // Normal case, wrap nodes between delimiters in strikethrough. - Node strikethrough = new Strikethrough(); - - Node tmp = opener.getNext(); - while (tmp != null && tmp != closer) { - Node next = tmp.getNext(); - strikethrough.appendChild(tmp); - tmp = next; - } - - opener.insertAfter(strikethrough); - } - -} diff --git a/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/StrikethroughExtension.java b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/StrikethroughExtension.java index 4660c2d7a..364205aed 100644 --- a/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/StrikethroughExtension.java +++ b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/StrikethroughExtension.java @@ -1,26 +1,134 @@ package org.commonmark.ext.gfm.strikethrough; import org.commonmark.Extension; +import org.commonmark.ext.gfm.strikethrough.internal.StrikethroughDelimiterProcessor; +import org.commonmark.ext.gfm.strikethrough.internal.StrikethroughHtmlNodeRenderer; +import org.commonmark.ext.gfm.strikethrough.internal.StrikethroughMarkdownNodeRenderer; +import org.commonmark.ext.gfm.strikethrough.internal.StrikethroughTextContentNodeRenderer; import org.commonmark.parser.Parser; -import org.commonmark.html.HtmlRenderer; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.html.HtmlNodeRendererContext; +import org.commonmark.renderer.html.HtmlNodeRendererFactory; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.renderer.markdown.MarkdownNodeRendererContext; +import org.commonmark.renderer.markdown.MarkdownNodeRendererFactory; +import org.commonmark.renderer.markdown.MarkdownRenderer; +import org.commonmark.renderer.text.TextContentNodeRendererContext; +import 
org.commonmark.renderer.text.TextContentNodeRendererFactory; +import org.commonmark.renderer.text.TextContentRenderer; -public class StrikethroughExtension implements Parser.ParserExtension, HtmlRenderer.HtmlRendererExtension { +import java.util.Set; - private StrikethroughExtension() { +/** + * Extension for GFM strikethrough using {@code ~} or {@code ~~} (GitHub Flavored Markdown). + *

Example input:

+ *
{@code ~foo~ or ~~bar~~}
+ *

Example output (HTML):

+ *
{@code foo or bar}
+ *

+ * Create the extension with {@link #create()} and then add it to the parser and renderer builders + * ({@link org.commonmark.parser.Parser.Builder#extensions(Iterable)}, + * {@link HtmlRenderer.Builder#extensions(Iterable)}). + *

+ *

+ * The parsed strikethrough text regions are turned into {@link Strikethrough} nodes. + *

+ *

+ * If you have another extension that only uses a single tilde ({@code ~}) syntax, you will have to configure this + * {@link StrikethroughExtension} to only accept the double tilde syntax, like this: + *

+ *
+ *     {@code
+ *     StrikethroughExtension.builder().requireTwoTildes(true).build();
+ *     }
+ * 
+ *

+ * If you don't do that, there's a conflict between the two extensions and you will get an + * {@link IllegalArgumentException} when constructing the parser. + *

+ */ +public class StrikethroughExtension implements Parser.ParserExtension, HtmlRenderer.HtmlRendererExtension, + TextContentRenderer.TextContentRendererExtension, MarkdownRenderer.MarkdownRendererExtension { + + private final boolean requireTwoTildes; + + private StrikethroughExtension(Builder builder) { + this.requireTwoTildes = builder.requireTwoTildes; } + /** + * @return the extension with default options + */ public static Extension create() { - return new StrikethroughExtension(); + return builder().build(); + } + + /** + * @return a builder to configure the behavior of the extension + */ + public static Builder builder() { + return new Builder(); } @Override public void extend(Parser.Builder parserBuilder) { - parserBuilder.customDelimiterProcessor(new StrikethroughDelimiterProcessor()); + parserBuilder.customDelimiterProcessor(new StrikethroughDelimiterProcessor(requireTwoTildes)); } @Override public void extend(HtmlRenderer.Builder rendererBuilder) { - rendererBuilder.customHtmlRenderer(new StrikethroughHtmlRenderer()); + rendererBuilder.nodeRendererFactory(new HtmlNodeRendererFactory() { + @Override + public NodeRenderer create(HtmlNodeRendererContext context) { + return new StrikethroughHtmlNodeRenderer(context); + } + }); } + @Override + public void extend(TextContentRenderer.Builder rendererBuilder) { + rendererBuilder.nodeRendererFactory(new TextContentNodeRendererFactory() { + @Override + public NodeRenderer create(TextContentNodeRendererContext context) { + return new StrikethroughTextContentNodeRenderer(context); + } + }); + } + + @Override + public void extend(MarkdownRenderer.Builder rendererBuilder) { + rendererBuilder.nodeRendererFactory(new MarkdownNodeRendererFactory() { + @Override + public NodeRenderer create(MarkdownNodeRendererContext context) { + return new StrikethroughMarkdownNodeRenderer(context); + } + + @Override + public Set getSpecialCharacters() { + return Set.of('~'); + } + }); + } + + public static class Builder { + + private 
boolean requireTwoTildes = false; + + /** + * @param requireTwoTildes Whether two tilde characters ({@code ~~}) are required for strikethrough or whether + * one is also enough. Default is {@code false}; both a single tilde and two tildes can be used for strikethrough. + * @return {@code this} + */ + public Builder requireTwoTildes(boolean requireTwoTildes) { + this.requireTwoTildes = requireTwoTildes; + return this; + } + + /** + * @return a configured extension + */ + public Extension build() { + return new StrikethroughExtension(this); + } + } } diff --git a/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/StrikethroughHtmlRenderer.java b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/StrikethroughHtmlRenderer.java deleted file mode 100644 index 650a075ec..000000000 --- a/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/StrikethroughHtmlRenderer.java +++ /dev/null @@ -1,30 +0,0 @@ -package org.commonmark.ext.gfm.strikethrough; - -import org.commonmark.html.CustomHtmlRenderer; -import org.commonmark.html.HtmlWriter; -import org.commonmark.node.Node; -import org.commonmark.node.Visitor; - -public class StrikethroughHtmlRenderer implements CustomHtmlRenderer { - - @Override - public boolean render(Node node, HtmlWriter htmlWriter, Visitor visitor) { - if (node instanceof Strikethrough) { - htmlWriter.tag("del"); - visitChildren(node, visitor); - htmlWriter.tag("/del"); - return true; - } else { - return false; - } - } - - private void visitChildren(Node node, Visitor visitor) { - Node child = node.getFirstChild(); - while (child != null) { - child.accept(visitor); - child = child.getNext(); - } - } - -} diff --git a/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughDelimiterProcessor.java 
b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughDelimiterProcessor.java new file mode 100644 index 000000000..4657106ab --- /dev/null +++ b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughDelimiterProcessor.java @@ -0,0 +1,67 @@ +package org.commonmark.ext.gfm.strikethrough.internal; + +import org.commonmark.ext.gfm.strikethrough.Strikethrough; +import org.commonmark.node.Node; +import org.commonmark.node.Nodes; +import org.commonmark.node.SourceSpans; +import org.commonmark.node.Text; +import org.commonmark.parser.delimiter.DelimiterProcessor; +import org.commonmark.parser.delimiter.DelimiterRun; + +public class StrikethroughDelimiterProcessor implements DelimiterProcessor { + + private final boolean requireTwoTildes; + + public StrikethroughDelimiterProcessor() { + this(false); + } + + public StrikethroughDelimiterProcessor(boolean requireTwoTildes) { + this.requireTwoTildes = requireTwoTildes; + } + + @Override + public char getOpeningCharacter() { + return '~'; + } + + @Override + public char getClosingCharacter() { + return '~'; + } + + @Override + public int getMinLength() { + return requireTwoTildes ? 2 : 1; + } + + @Override + public int process(DelimiterRun openingRun, DelimiterRun closingRun) { + if (openingRun.length() == closingRun.length() && openingRun.length() <= 2) { + // GitHub only accepts either one or two delimiters, but not a mix or more than that. + + Text opener = openingRun.getOpener(); + + // Wrap nodes between delimiters in strikethrough. + String delimiter = openingRun.length() == 1 ? 
opener.getLiteral() : opener.getLiteral() + opener.getLiteral(); + Node strikethrough = new Strikethrough(delimiter); + + SourceSpans sourceSpans = new SourceSpans(); + sourceSpans.addAllFrom(openingRun.getOpeners(openingRun.length())); + + for (Node node : Nodes.between(opener, closingRun.getCloser())) { + strikethrough.appendChild(node); + sourceSpans.addAll(node.getSourceSpans()); + } + + sourceSpans.addAllFrom(closingRun.getClosers(closingRun.length())); + strikethrough.setSourceSpans(sourceSpans.getSourceSpans()); + + opener.insertAfter(strikethrough); + + return openingRun.length(); + } else { + return 0; + } + } +} diff --git a/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughHtmlNodeRenderer.java b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughHtmlNodeRenderer.java new file mode 100644 index 000000000..b1a82cb03 --- /dev/null +++ b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughHtmlNodeRenderer.java @@ -0,0 +1,35 @@ +package org.commonmark.ext.gfm.strikethrough.internal; + +import org.commonmark.node.Node; +import org.commonmark.renderer.html.HtmlNodeRendererContext; +import org.commonmark.renderer.html.HtmlWriter; + +import java.util.Map; + +public class StrikethroughHtmlNodeRenderer extends StrikethroughNodeRenderer { + + private final HtmlNodeRendererContext context; + private final HtmlWriter html; + + public StrikethroughHtmlNodeRenderer(HtmlNodeRendererContext context) { + this.context = context; + this.html = context.getWriter(); + } + + @Override + public void render(Node node) { + Map attributes = context.extendAttributes(node, "del", Map.of()); + html.tag("del", attributes); + renderChildren(node); + html.tag("/del"); + } + + private void renderChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + 
context.render(node); + node = next; + } + } +} diff --git a/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughMarkdownNodeRenderer.java b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughMarkdownNodeRenderer.java new file mode 100644 index 000000000..1c91dd64f --- /dev/null +++ b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughMarkdownNodeRenderer.java @@ -0,0 +1,34 @@ +package org.commonmark.ext.gfm.strikethrough.internal; + +import org.commonmark.ext.gfm.strikethrough.Strikethrough; +import org.commonmark.node.Node; +import org.commonmark.renderer.markdown.MarkdownNodeRendererContext; +import org.commonmark.renderer.markdown.MarkdownWriter; + +public class StrikethroughMarkdownNodeRenderer extends StrikethroughNodeRenderer { + + private final MarkdownNodeRendererContext context; + private final MarkdownWriter writer; + + public StrikethroughMarkdownNodeRenderer(MarkdownNodeRendererContext context) { + this.context = context; + this.writer = context.getWriter(); + } + + @Override + public void render(Node node) { + Strikethrough strikethrough = (Strikethrough) node; + writer.raw(strikethrough.getOpeningDelimiter()); + renderChildren(node); + writer.raw(strikethrough.getClosingDelimiter()); + } + + private void renderChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + context.render(node); + node = next; + } + } +} diff --git a/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughNodeRenderer.java b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughNodeRenderer.java new file mode 100644 index 000000000..18ed21887 --- /dev/null +++ 
b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughNodeRenderer.java @@ -0,0 +1,15 @@ +package org.commonmark.ext.gfm.strikethrough.internal; + +import org.commonmark.ext.gfm.strikethrough.Strikethrough; +import org.commonmark.node.Node; +import org.commonmark.renderer.NodeRenderer; + +import java.util.Set; + +abstract class StrikethroughNodeRenderer implements NodeRenderer { + + @Override + public Set> getNodeTypes() { + return Set.of(Strikethrough.class); + } +} diff --git a/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughTextContentNodeRenderer.java b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughTextContentNodeRenderer.java new file mode 100644 index 000000000..ebdcd9dbe --- /dev/null +++ b/commonmark-ext-gfm-strikethrough/src/main/java/org/commonmark/ext/gfm/strikethrough/internal/StrikethroughTextContentNodeRenderer.java @@ -0,0 +1,32 @@ +package org.commonmark.ext.gfm.strikethrough.internal; + +import org.commonmark.renderer.text.TextContentWriter; +import org.commonmark.renderer.text.TextContentNodeRendererContext; +import org.commonmark.node.Node; + +public class StrikethroughTextContentNodeRenderer extends StrikethroughNodeRenderer { + + private final TextContentNodeRendererContext context; + private final TextContentWriter textContent; + + public StrikethroughTextContentNodeRenderer(TextContentNodeRendererContext context) { + this.context = context; + this.textContent = context.getWriter(); + } + + @Override + public void render(Node node) { + textContent.write('/'); + renderChildren(node); + textContent.write('/'); + } + + private void renderChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + context.render(node); + node = next; + } + } +} diff --git a/commonmark-ext-gfm-strikethrough/src/main/javadoc/overview.html 
b/commonmark-ext-gfm-strikethrough/src/main/javadoc/overview.html new file mode 100644 index 000000000..a80cdd82b --- /dev/null +++ b/commonmark-ext-gfm-strikethrough/src/main/javadoc/overview.html @@ -0,0 +1,6 @@ + + +Extension for GFM strikethrough using ~~ (GitHub Flavored Markdown) +

See {@link org.commonmark.ext.gfm.strikethrough.StrikethroughExtension}

+ + diff --git a/commonmark-ext-gfm-strikethrough/src/main/resources/META-INF/LICENSE.txt b/commonmark-ext-gfm-strikethrough/src/main/resources/META-INF/LICENSE.txt new file mode 100644 index 000000000..b09e367ce --- /dev/null +++ b/commonmark-ext-gfm-strikethrough/src/main/resources/META-INF/LICENSE.txt @@ -0,0 +1,23 @@ +Copyright (c) 2015, Atlassian Pty Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughMarkdownRendererTest.java b/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughMarkdownRendererTest.java new file mode 100644 index 000000000..c497a4db3 --- /dev/null +++ b/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughMarkdownRendererTest.java @@ -0,0 +1,35 @@ +package org.commonmark.ext.gfm.strikethrough; + +import org.commonmark.Extension; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.markdown.MarkdownRenderer; +import org.junit.jupiter.api.Test; + +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; + +public class StrikethroughMarkdownRendererTest { + + private static final Set EXTENSIONS = Set.of(StrikethroughExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final MarkdownRenderer RENDERER = MarkdownRenderer.builder().extensions(EXTENSIONS).build(); + + @Test + public void testStrikethrough() { + assertRoundTrip("~foo~ ~bar~\n"); + assertRoundTrip("~~foo~~ ~~bar~~\n"); + assertRoundTrip("~~f\\~oo~~ ~~bar~~\n"); + + assertRoundTrip("\\~foo\\~\n"); + } + + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } + + private void assertRoundTrip(String input) { + String rendered = render(input); + assertThat(rendered).isEqualTo(input); + } +} diff --git a/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughSpecTest.java b/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughSpecTest.java new file mode 100644 index 000000000..f1199b521 --- /dev/null +++ b/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughSpecTest.java @@ -0,0 +1,42 @@ +package 
org.commonmark.ext.gfm.strikethrough; + +import org.commonmark.Extension; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.commonmark.testutil.TestResources; +import org.commonmark.testutil.example.Example; +import org.commonmark.testutil.example.ExampleReader; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.Parameter; +import org.junit.jupiter.params.ParameterizedClass; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.List; +import java.util.Set; + +@ParameterizedClass +@MethodSource("data") +public class StrikethroughSpecTest extends RenderingTestCase { + + private static final Set EXTENSIONS = Set.of(StrikethroughExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().extensions(EXTENSIONS).build(); + + @Parameter + Example example; + + static List data() { + return ExampleReader.readExamples(TestResources.getGfmSpec(), "strikethrough"); + } + + @Test + public void testHtmlRendering() { + assertRendering(example.getSource(), example.getHtml()); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } +} diff --git a/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughTest.java b/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughTest.java index 5291a05e7..c29391cdd 100644 --- a/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughTest.java +++ b/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughTest.java @@ -1,20 +1,39 @@ package org.commonmark.ext.gfm.strikethrough; import org.commonmark.Extension; -import org.commonmark.test.RenderingTestCase; -import 
org.junit.Test; +import org.commonmark.node.Node; +import org.commonmark.node.Paragraph; +import org.commonmark.node.SourceSpan; +import org.commonmark.node.Text; +import org.commonmark.parser.IncludeSourceSpans; +import org.commonmark.parser.Parser; +import org.commonmark.parser.delimiter.DelimiterProcessor; +import org.commonmark.parser.delimiter.DelimiterRun; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.renderer.text.TextContentRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.junit.jupiter.api.Test; -import java.util.Collections; +import java.util.List; +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; public class StrikethroughTest extends RenderingTestCase { + private static final Set EXTENSIONS = Set.of(StrikethroughExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final HtmlRenderer HTML_RENDERER = HtmlRenderer.builder().extensions(EXTENSIONS).build(); + private static final TextContentRenderer CONTENT_RENDERER = TextContentRenderer.builder() + .extensions(EXTENSIONS).build(); + @Test - public void oneTildeIsNotEnough() { - assertRendering("~foo~", "

~foo~

\n"); + public void oneTildeIsEnough() { + assertRendering("~foo~", "

foo

\n"); } @Test - public void twoTildesYay() { + public void twoTildesWorksToo() { assertRendering("~~foo~~", "

foo

\n"); } @@ -31,17 +50,22 @@ public void unmatched() { @Test public void threeInnerThree() { - assertRendering("~~~foo~~~", "

~foo~

\n"); + assertRendering("a ~~~foo~~~", "

a ~~~foo~~~

\n"); } @Test public void twoInnerThree() { - assertRendering("~~foo~~~", "

foo~

\n"); + assertRendering("~~foo~~~", "

~~foo~~~

\n"); } @Test - public void twoStrikethroughsWithoutSpacing() { - assertRendering("~~foo~~~~bar~~", "

foobar

\n"); + public void tildesInside() { + assertRendering("~~foo~bar~~", "

foo~bar

\n"); + assertRendering("~~foo~~bar~~", "

foobar~~

\n"); + assertRendering("~~foo~~~bar~~", "

foo~~~bar

\n"); + assertRendering("~~foo~~~~bar~~", "

foo~~~~bar

\n"); + assertRendering("~~foo~~~~~bar~~", "

foo~~~~~bar

\n"); + assertRendering("~~foo~~~~~~bar~~", "

foo~~~~~~bar

\n"); } @Test @@ -56,8 +80,73 @@ public void insideBlockQuote() { "
\n

strike that

\n
\n"); } + @Test + public void delimited() { + Node document = PARSER.parse("~~foo~~"); + Strikethrough strikethrough = (Strikethrough) document.getFirstChild().getFirstChild(); + assertThat(strikethrough.getOpeningDelimiter()).isEqualTo("~~"); + assertThat(strikethrough.getClosingDelimiter()).isEqualTo("~~"); + } + + @Test + public void textContentRenderer() { + Node document = PARSER.parse("~~foo~~"); + assertThat(CONTENT_RENDERER.render(document)).isEqualTo("/foo/"); + } + + @Test + public void requireTwoTildesOption() { + Parser parser = Parser.builder() + .extensions(Set.of(StrikethroughExtension.builder() + .requireTwoTildes(true) + .build())) + .customDelimiterProcessor(new SubscriptDelimiterProcessor()) + .build(); + + Node document = parser.parse("~foo~ ~~bar~~"); + assertThat(CONTENT_RENDERER.render(document)).isEqualTo("(sub)foo(/sub) /bar/"); + } + + @Test + public void sourceSpans() { + Parser parser = Parser.builder() + .extensions(EXTENSIONS) + .includeSourceSpans(IncludeSourceSpans.BLOCKS_AND_INLINES) + .build(); + + Node document = parser.parse("hey ~~there~~\n"); + Paragraph block = (Paragraph) document.getFirstChild(); + Node strikethrough = block.getLastChild(); + assertThat(strikethrough.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 4, 4, 9))); + } + @Override - protected Iterable getExtensions() { - return Collections.singleton(StrikethroughExtension.create()); + protected String render(String source) { + return HTML_RENDERER.render(PARSER.parse(source)); + } + + private static class SubscriptDelimiterProcessor implements DelimiterProcessor { + + @Override + public char getOpeningCharacter() { + return '~'; + } + + @Override + public char getClosingCharacter() { + return '~'; + } + + @Override + public int getMinLength() { + return 1; + } + + @Override + public int process(DelimiterRun openingRun, DelimiterRun closingRun) { + openingRun.getOpener().insertAfter(new Text("(sub)")); + closingRun.getCloser().insertBefore(new Text("(/sub)")); 
+ return 1; + } } } diff --git a/commonmark-ext-gfm-tables/pom.xml b/commonmark-ext-gfm-tables/pom.xml index 131e0dd60..5bd323168 100644 --- a/commonmark-ext-gfm-tables/pom.xml +++ b/commonmark-ext-gfm-tables/pom.xml @@ -2,30 +2,24 @@ 4.0.0 - com.atlassian.commonmark + org.commonmark commonmark-parent - 0.1.1-SNAPSHOT + 0.28.1-SNAPSHOT commonmark-ext-gfm-tables commonmark-java extension for tables - commonmark-java extension for GFM tables using | (GitHub Flavored Markdown) + commonmark-java extension for GFM tables using "|" pipes (GitHub Flavored Markdown) - com.atlassian.commonmark + org.commonmark commonmark - junit - junit - test - - - com.atlassian.commonmark - commonmark - test-jar + org.commonmark + commonmark-test-util test diff --git a/commonmark-ext-gfm-tables/src/main/java/module-info.java b/commonmark-ext-gfm-tables/src/main/java/module-info.java new file mode 100644 index 000000000..7e6d2629c --- /dev/null +++ b/commonmark-ext-gfm-tables/src/main/java/module-info.java @@ -0,0 +1,5 @@ +module org.commonmark.ext.gfm.tables { + exports org.commonmark.ext.gfm.tables; + + requires transitive org.commonmark; +} diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableBlock.java b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableBlock.java index 0e060b8b3..c46fc27ef 100644 --- a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableBlock.java +++ b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableBlock.java @@ -2,5 +2,8 @@ import org.commonmark.node.CustomBlock; +/** + * Table block containing a {@link TableHead} and optionally a {@link TableBody}. 
+ */ public class TableBlock extends CustomBlock { } diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableBlockParser.java b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableBlockParser.java deleted file mode 100644 index 5d366f346..000000000 --- a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableBlockParser.java +++ /dev/null @@ -1,176 +0,0 @@ -package org.commonmark.ext.gfm.tables; - -import org.commonmark.node.Block; -import org.commonmark.node.Node; -import org.commonmark.parser.InlineParser; -import org.commonmark.parser.block.*; - -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Pattern; - -public class TableBlockParser extends AbstractBlockParser { - - private static String COL = "\\s*:?-{3,}:?\\s*"; - private static Pattern TABLE_HEADER_SEPARATOR = Pattern.compile( - // For single column, require at least one pipe, otherwise it's ambiguous with setext headers - "\\|" + COL + "\\|?\\s*" + "|" + - COL + "\\|\\s*" + "|" + - "\\|?" 
+ "(?:" + COL + "\\|)+" + COL + "\\|?\\s*"); - - private final TableBlock block = new TableBlock(); - private final List rowLines = new ArrayList<>(); - - private boolean nextIsSeparatorLine = true; - private String separatorLine = ""; - - private TableBlockParser(CharSequence headerLine) { - rowLines.add(headerLine); - } - - @Override - public Block getBlock() { - return block; - } - - @Override - public BlockContinue tryContinue(ParserState state) { - if (state.getLine().toString().contains("|")) { - return BlockContinue.atIndex(state.getIndex()); - } else { - return BlockContinue.none(); - } - } - - @Override - public void addLine(CharSequence line) { - if (nextIsSeparatorLine) { - nextIsSeparatorLine = false; - separatorLine = line.toString(); - } else { - rowLines.add(line); - } - } - - @Override - public void parseInlines(InlineParser inlineParser) { - Node section = new TableHead(); - block.appendChild(section); - - List alignments = parseAlignment(separatorLine); - - int headerColumns = -1; - boolean header = true; - for (CharSequence rowLine : rowLines) { - List cells = split(rowLine); - TableRow tableRow = new TableRow(); - - if (headerColumns == -1) { - headerColumns = cells.size(); - } - - // Body can not have more columns than head - for (int i = 0; i < headerColumns; i++) { - String cell = i < cells.size() ? cells.get(i) : ""; - TableCell.Alignment alignment = i < alignments.size() ? 
alignments.get(i) : null; - TableCell tableCell = new TableCell(); - tableCell.setHeader(header); - tableCell.setAlignment(alignment); - inlineParser.parse(cell.trim(), tableCell); - tableRow.appendChild(tableCell); - } - - section.appendChild(tableRow); - - if (header) { - // Format allows only one row in head - header = false; - section = new TableBody(); - block.appendChild(section); - } - } - } - - private static List parseAlignment(String separatorLine) { - List parts = split(separatorLine); - List alignments = new ArrayList<>(); - for (String part : parts) { - String trimmed = part.trim(); - boolean left = trimmed.startsWith(":"); - boolean right = trimmed.endsWith(":"); - TableCell.Alignment alignment = getAlignment(left, right); - alignments.add(alignment); - } - return alignments; - } - - private static List split(CharSequence input) { - String line = input.toString().trim(); - if (line.startsWith("|")) { - line = line.substring(1); - } - List cells = new ArrayList<>(); - StringBuilder sb = new StringBuilder(); - boolean escape = false; - for (int i = 0; i < line.length(); i++) { - char c = line.charAt(i); - if (escape) { - escape = false; - sb.append(c); - } else { - switch (c) { - case '\\': - escape = true; - // Removing the escaping '\' is handled by the inline parser later, so add it to cell - sb.append(c); - break; - case '|': - cells.add(sb.toString()); - sb.setLength(0); - break; - default: - sb.append(c); - } - } - } - if (sb.length() > 0) { - cells.add(sb.toString()); - } - return cells; - } - - private static TableCell.Alignment getAlignment(boolean left, boolean right) { - if (left && right) { - return TableCell.Alignment.CENTER; - } else if (left) { - return TableCell.Alignment.LEFT; - } else if (right) { - return TableCell.Alignment.RIGHT; - } else { - return null; - } - } - - public static class Factory extends AbstractBlockParserFactory { - - @Override - public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { 
- CharSequence line = state.getLine(); - CharSequence paragraphStartLine = matchedBlockParser.getParagraphStartLine(); - if (paragraphStartLine != null && paragraphStartLine.toString().contains("|")) { - CharSequence separatorLine = line.subSequence(state.getIndex(), line.length()); - if (TABLE_HEADER_SEPARATOR.matcher(separatorLine).matches()) { - List headParts = split(paragraphStartLine); - List separatorParts = split(separatorLine); - if (separatorParts.size() >= headParts.size()) { - return BlockStart.of(new TableBlockParser(paragraphStartLine)) - .atIndex(state.getIndex()) - .replaceActiveBlockParser(); - } - } - } - return BlockStart.none(); - } - } - -} diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableBody.java b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableBody.java index f0dd9b227..ddc80deb3 100644 --- a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableBody.java +++ b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableBody.java @@ -2,5 +2,8 @@ import org.commonmark.node.CustomNode; +/** + * Body part of a {@link TableBlock} containing {@link TableRow TableRows}. + */ public class TableBody extends CustomNode { } diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableCell.java b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableCell.java index cb2ea66a3..033c2dd04 100644 --- a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableCell.java +++ b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableCell.java @@ -2,11 +2,18 @@ import org.commonmark.node.CustomNode; +/** + * Table cell of a {@link TableRow} containing inline nodes. 
+ */ public class TableCell extends CustomNode { private boolean header; private Alignment alignment; + private int width; + /** + * @return whether the cell is a header or not + */ public boolean isHeader() { return header; } @@ -15,6 +22,9 @@ public void setHeader(boolean header) { this.header = header; } + /** + * @return the cell alignment or {@code null} if no specific alignment + */ public Alignment getAlignment() { return alignment; } @@ -23,7 +33,22 @@ public void setAlignment(Alignment alignment) { this.alignment = alignment; } + /** + * @return the cell width (the number of dash and colon characters in the delimiter row of the table for this column) + */ + public int getWidth() { + return width; + } + + public void setWidth(int width) { + this.width = width; + } + + /** + * How the cell is aligned horizontally. + */ public enum Alignment { LEFT, CENTER, RIGHT } + } diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableHead.java b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableHead.java index 4de7ff9b1..96a95e620 100644 --- a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableHead.java +++ b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableHead.java @@ -2,5 +2,8 @@ import org.commonmark.node.CustomNode; +/** + * Head part of a {@link TableBlock} containing {@link TableRow TableRows}. 
+ */ public class TableHead extends CustomNode { } diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableHtmlRenderer.java b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableHtmlRenderer.java deleted file mode 100644 index aacd45689..000000000 --- a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableHtmlRenderer.java +++ /dev/null @@ -1,98 +0,0 @@ -package org.commonmark.ext.gfm.tables; - -import org.commonmark.html.CustomHtmlRenderer; -import org.commonmark.html.HtmlWriter; -import org.commonmark.node.Node; -import org.commonmark.node.Visitor; - -import java.util.Collections; -import java.util.Map; - -public class TableHtmlRenderer implements CustomHtmlRenderer { - - @Override - public boolean render(Node node, HtmlWriter htmlWriter, Visitor visitor) { - if (node instanceof TableBlock) { - renderBlock((TableBlock) node, htmlWriter, visitor); - } else if (node instanceof TableHead) { - renderHead((TableHead) node, htmlWriter, visitor); - } else if (node instanceof TableBody) { - renderBody((TableBody) node, htmlWriter, visitor); - } else if (node instanceof TableRow) { - renderRow((TableRow) node, htmlWriter, visitor); - } else if (node instanceof TableCell) { - renderCell((TableCell) node, htmlWriter, visitor); - } else { - return false; - } - return true; - } - - private void renderBlock(TableBlock tableBlock, HtmlWriter htmlWriter, Visitor visitor) { - htmlWriter.line(); - // TODO: What about attributes? If we got the renderer instead of the visitor, we could call getAttributes. 
- htmlWriter.tag("table"); - visitChildren(tableBlock, visitor); - htmlWriter.tag("/table"); - htmlWriter.line(); - } - - private void renderHead(TableHead tableHead, HtmlWriter htmlWriter, Visitor visitor) { - htmlWriter.line(); - htmlWriter.tag("thead"); - visitChildren(tableHead, visitor); - htmlWriter.tag("/thead"); - htmlWriter.line(); - } - - private void renderBody(TableBody tableBody, HtmlWriter htmlWriter, Visitor visitor) { - htmlWriter.line(); - htmlWriter.tag("tbody"); - visitChildren(tableBody, visitor); - htmlWriter.tag("/tbody"); - htmlWriter.line(); - } - - private void renderRow(TableRow tableRow, HtmlWriter htmlWriter, Visitor visitor) { - htmlWriter.line(); - htmlWriter.tag("tr"); - visitChildren(tableRow, visitor); - htmlWriter.tag("/tr"); - htmlWriter.line(); - } - - private void renderCell(TableCell tableCell, HtmlWriter htmlWriter, Visitor visitor) { - String tag = tableCell.isHeader() ? "th" : "td"; - htmlWriter.tag(tag, getAttributes(tableCell)); - visitChildren(tableCell, visitor); - htmlWriter.tag("/" + tag); - } - - private static Map getAttributes(TableCell tableCell) { - if (tableCell.getAlignment() != null) { - return Collections.singletonMap("align", getAlignValue(tableCell.getAlignment())); - } else { - return Collections.emptyMap(); - } - } - - private static String getAlignValue(TableCell.Alignment alignment) { - switch (alignment) { - case LEFT: - return "left"; - case CENTER: - return "center"; - case RIGHT: - return "right"; - } - throw new IllegalStateException("Unknown alignment: " + alignment); - } - - private void visitChildren(Node node, Visitor visitor) { - Node child = node.getFirstChild(); - while (child != null) { - child.accept(visitor); - child = child.getNext(); - } - } -} diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableRow.java b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableRow.java index c1305ff0b..1325875d0 100644 --- 
a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableRow.java +++ b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TableRow.java @@ -1,7 +1,9 @@ package org.commonmark.ext.gfm.tables; import org.commonmark.node.CustomNode; -import org.commonmark.node.Visitor; +/** + * Table row of a {@link TableHead} or {@link TableBody} containing {@link TableCell TableCells}. + */ public class TableRow extends CustomNode { } diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TablesExtension.java b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TablesExtension.java index b042b0c3b..f754b8276 100644 --- a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TablesExtension.java +++ b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/TablesExtension.java @@ -1,10 +1,39 @@ package org.commonmark.ext.gfm.tables; import org.commonmark.Extension; +import org.commonmark.ext.gfm.tables.internal.TableBlockParser; +import org.commonmark.ext.gfm.tables.internal.TableHtmlNodeRenderer; +import org.commonmark.ext.gfm.tables.internal.TableMarkdownNodeRenderer; +import org.commonmark.ext.gfm.tables.internal.TableTextContentNodeRenderer; import org.commonmark.parser.Parser; -import org.commonmark.html.HtmlRenderer; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.html.HtmlNodeRendererContext; +import org.commonmark.renderer.html.HtmlNodeRendererFactory; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.renderer.markdown.MarkdownNodeRendererContext; +import org.commonmark.renderer.markdown.MarkdownNodeRendererFactory; +import org.commonmark.renderer.markdown.MarkdownRenderer; +import org.commonmark.renderer.text.TextContentNodeRendererContext; +import org.commonmark.renderer.text.TextContentNodeRendererFactory; +import org.commonmark.renderer.text.TextContentRenderer; -public class TablesExtension implements 
Parser.ParserExtension, HtmlRenderer.HtmlRendererExtension { +import java.util.Set; + +/** + * Extension for GFM tables using "|" pipes (GitHub Flavored Markdown). + *

+ * Create it with {@link #create()} and then configure it on the builders + * ({@link org.commonmark.parser.Parser.Builder#extensions(Iterable)}, + * {@link HtmlRenderer.Builder#extensions(Iterable)}). + *

+ *

+ * The parsed tables are turned into {@link TableBlock} blocks. + *

+ * + * @see Tables (extension) in GitHub Flavored Markdown Spec + */ +public class TablesExtension implements Parser.ParserExtension, HtmlRenderer.HtmlRendererExtension, + TextContentRenderer.TextContentRendererExtension, MarkdownRenderer.MarkdownRendererExtension { private TablesExtension() { } @@ -20,7 +49,36 @@ public void extend(Parser.Builder parserBuilder) { @Override public void extend(HtmlRenderer.Builder rendererBuilder) { - rendererBuilder.customHtmlRenderer(new TableHtmlRenderer()); + rendererBuilder.nodeRendererFactory(new HtmlNodeRendererFactory() { + @Override + public NodeRenderer create(HtmlNodeRendererContext context) { + return new TableHtmlNodeRenderer(context); + } + }); + } + + @Override + public void extend(TextContentRenderer.Builder rendererBuilder) { + rendererBuilder.nodeRendererFactory(new TextContentNodeRendererFactory() { + @Override + public NodeRenderer create(TextContentNodeRendererContext context) { + return new TableTextContentNodeRenderer(context); + } + }); } + @Override + public void extend(MarkdownRenderer.Builder rendererBuilder) { + rendererBuilder.nodeRendererFactory(new MarkdownNodeRendererFactory() { + @Override + public NodeRenderer create(MarkdownNodeRendererContext context) { + return new TableMarkdownNodeRenderer(context); + } + + @Override + public Set getSpecialCharacters() { + return Set.of('|'); + } + }); + } } diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableBlockParser.java b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableBlockParser.java new file mode 100644 index 000000000..57af128d8 --- /dev/null +++ b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableBlockParser.java @@ -0,0 +1,312 @@ +package org.commonmark.ext.gfm.tables.internal; + +import org.commonmark.ext.gfm.tables.*; +import org.commonmark.node.Block; +import org.commonmark.node.Node; +import org.commonmark.node.SourceSpan; +import 
org.commonmark.parser.InlineParser; +import org.commonmark.parser.SourceLine; +import org.commonmark.parser.SourceLines; +import org.commonmark.parser.block.*; +import org.commonmark.text.Characters; + +import java.util.ArrayList; +import java.util.List; + +public class TableBlockParser extends AbstractBlockParser { + + private final TableBlock block = new TableBlock(); + private final List rowLines = new ArrayList<>(); + private final List columns; + + private boolean canHaveLazyContinuationLines = true; + + private TableBlockParser(List columns, SourceLine headerLine) { + this.columns = columns; + this.rowLines.add(headerLine); + } + + @Override + public boolean canHaveLazyContinuationLines() { + return canHaveLazyContinuationLines; + } + + @Override + public Block getBlock() { + return block; + } + + @Override + public BlockContinue tryContinue(ParserState state) { + CharSequence content = state.getLine().getContent(); + int pipe = Characters.find('|', content, state.getNextNonSpaceIndex()); + if (pipe != -1) { + if (pipe == state.getNextNonSpaceIndex()) { + // If we *only* have a pipe character (and whitespace), that is not a valid table row and ends the table. + if (Characters.skipSpaceTab(content, pipe + 1, content.length()) == content.length()) { + // We also don't want the pipe to be added via lazy continuation. + canHaveLazyContinuationLines = false; + return BlockContinue.none(); + } + } + return BlockContinue.atIndex(state.getIndex()); + } else { + return BlockContinue.none(); + } + } + + @Override + public void addLine(SourceLine line) { + rowLines.add(line); + } + + @Override + public void parseInlines(InlineParser inlineParser) { + List sourceSpans = block.getSourceSpans(); + + SourceSpan headerSourceSpan = !sourceSpans.isEmpty() ? 
sourceSpans.get(0) : null; + Node head = new TableHead(); + if (headerSourceSpan != null) { + head.addSourceSpan(headerSourceSpan); + } + block.appendChild(head); + + TableRow headerRow = new TableRow(); + headerRow.setSourceSpans(head.getSourceSpans()); + head.appendChild(headerRow); + + List headerCells = split(rowLines.get(0)); + int headerColumns = headerCells.size(); + for (int i = 0; i < headerColumns; i++) { + SourceLine cell = headerCells.get(i); + TableCell tableCell = parseCell(cell, i, inlineParser); + tableCell.setHeader(true); + headerRow.appendChild(tableCell); + } + + TableBody body = null; + // Body starts at index 2. 0 is header, 1 is separator. + for (int rowIndex = 2; rowIndex < rowLines.size(); rowIndex++) { + SourceLine rowLine = rowLines.get(rowIndex); + SourceSpan sourceSpan = rowIndex < sourceSpans.size() ? sourceSpans.get(rowIndex) : null; + List cells = split(rowLine); + TableRow row = new TableRow(); + if (sourceSpan != null) { + row.addSourceSpan(sourceSpan); + } + + // Body can not have more columns than head + for (int i = 0; i < headerColumns; i++) { + SourceLine cell = i < cells.size() ? cells.get(i) : SourceLine.of("", null); + TableCell tableCell = parseCell(cell, i, inlineParser); + row.appendChild(tableCell); + } + + if (body == null) { + // It's valid to have a table without body. In that case, don't add an empty TableBody node. 
+ body = new TableBody(); + block.appendChild(body); + } + body.appendChild(row); + if (sourceSpan != null) { body.addSourceSpan(sourceSpan); } // null when no span was recorded for this row + } + } + + private TableCell parseCell(SourceLine cell, int column, InlineParser inlineParser) { + TableCell tableCell = new TableCell(); + SourceSpan sourceSpan = cell.getSourceSpan(); + if (sourceSpan != null) { + tableCell.addSourceSpan(sourceSpan); + } + + if (column < columns.size()) { + TableCellInfo cellInfo = columns.get(column); + tableCell.setAlignment(cellInfo.getAlignment()); + tableCell.setWidth(cellInfo.getWidth()); + } + + CharSequence content = cell.getContent(); + int start = Characters.skipSpaceTab(content, 0, content.length()); + int end = Characters.skipSpaceTabBackwards(content, content.length() - 1, start); + inlineParser.parse(SourceLines.of(cell.substring(start, end + 1)), tableCell); + + return tableCell; + } + + private static List split(SourceLine line) { + CharSequence row = line.getContent(); + int nonSpace = Characters.skipSpaceTab(row, 0, row.length()); + int cellStart = nonSpace; + int cellEnd = row.length(); + if (row.charAt(nonSpace) == '|') { + // This row has leading/trailing pipes - skip the leading pipe + cellStart = nonSpace + 1; + // Strip whitespace from the end but not the pipe or we could miss an empty ("||") cell + int nonSpaceEnd = Characters.skipSpaceTabBackwards(row, row.length() - 1, cellStart); + cellEnd = nonSpaceEnd + 1; + } + List cells = new ArrayList<>(); + StringBuilder sb = new StringBuilder(); + for (int i = cellStart; i < cellEnd; i++) { + char c = row.charAt(i); + switch (c) { + case '\\': + if (i + 1 < cellEnd && row.charAt(i + 1) == '|') { + // Pipe is special for table parsing. An escaped pipe doesn't result in a new cell, but is + // passed down to inline parsing as an unescaped pipe. Note that that applies even for the `\|` + // in an input like `\\|` - in other words, table parsing doesn't support escaping backslashes. 
+ sb.append('|'); + i++; + } else { + // Preserve backslash before other characters or at end of line. + sb.append('\\'); + } + break; + case '|': + String content = sb.toString(); + + cells.add(SourceLine.of(content, line.substring(cellStart, i).getSourceSpan())); + sb.setLength(0); + // + 1 to skip the pipe itself for the next cell's span + cellStart = i + 1; + break; + default: + sb.append(c); + } + } + if (sb.length() > 0) { + String content = sb.toString(); + cells.add(SourceLine.of(content, line.substring(cellStart, line.getContent().length()).getSourceSpan())); + } + return cells; + } + + // Examples of valid separators: + // + // |- + // -| + // |-| + // -|- + // |-|-| + // --- | --- + private static List parseSeparator(CharSequence s) { + List columns = new ArrayList<>(); + int pipes = 0; + boolean valid = false; + int i = 0; + int width = 0; + while (i < s.length()) { + char c = s.charAt(i); + switch (c) { + case '|': + i++; + pipes++; + if (pipes > 1) { + // More than one adjacent pipe not allowed + return null; + } + // Need at least one pipe, even for a one column table + valid = true; + break; + case '-': + case ':': + if (pipes == 0 && !columns.isEmpty()) { + // Need a pipe after the first column (first column doesn't need to start with one) + return null; + } + boolean left = false; + boolean right = false; + if (c == ':') { + left = true; + i++; + width++; + } + boolean haveDash = false; + while (i < s.length() && s.charAt(i) == '-') { + i++; + width++; + haveDash = true; + } + if (!haveDash) { + // Need at least one dash + return null; + } + if (i < s.length() && s.charAt(i) == ':') { + right = true; + i++; + width++; + } + columns.add(new TableCellInfo(getAlignment(left, right), width)); + width = 0; + // Next, need another pipe + pipes = 0; + break; + case ' ': + case '\t': + // White space is allowed between pipes and columns + i++; + break; + default: + // Any other character is invalid + return null; + } + } + if (!valid) { + return null; + } 
+ return columns; + } + + private static TableCell.Alignment getAlignment(boolean left, boolean right) { + if (left && right) { + return TableCell.Alignment.CENTER; + } else if (left) { + return TableCell.Alignment.LEFT; + } else if (right) { + return TableCell.Alignment.RIGHT; + } else { + return null; + } + } + + public static class Factory extends AbstractBlockParserFactory { + + @Override + public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { + List paragraphLines = matchedBlockParser.getParagraphLines().getLines(); + if (paragraphLines.size() >= 1 && Characters.find('|', paragraphLines.get(paragraphLines.size() - 1).getContent(), 0) != -1) { + SourceLine line = state.getLine(); + SourceLine separatorLine = line.substring(state.getIndex(), line.getContent().length()); + List columns = parseSeparator(separatorLine.getContent()); + if (columns != null && !columns.isEmpty()) { + SourceLine paragraph = paragraphLines.get(paragraphLines.size() - 1); + List headerCells = split(paragraph); + if (columns.size() >= headerCells.size()) { + return BlockStart.of(new TableBlockParser(columns, paragraph)) + .atIndex(state.getIndex()) + .replaceParagraphLines(1); + } + } + } + return BlockStart.none(); + } + } + + private static class TableCellInfo { + private final TableCell.Alignment alignment; + private final int width; + + public TableCell.Alignment getAlignment() { + return alignment; + } + + public int getWidth() { + return width; + } + + public TableCellInfo(TableCell.Alignment alignment, int width) { + this.alignment = alignment; + this.width = width; + } + } +} diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableHtmlNodeRenderer.java b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableHtmlNodeRenderer.java new file mode 100644 index 000000000..966c4c151 --- /dev/null +++ 
b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableHtmlNodeRenderer.java @@ -0,0 +1,98 @@ +package org.commonmark.ext.gfm.tables.internal; + +import org.commonmark.ext.gfm.tables.*; +import org.commonmark.node.Node; +import org.commonmark.renderer.html.HtmlNodeRendererContext; +import org.commonmark.renderer.html.HtmlWriter; + +import java.util.Map; + +public class TableHtmlNodeRenderer extends TableNodeRenderer { + + private final HtmlWriter htmlWriter; + private final HtmlNodeRendererContext context; + + public TableHtmlNodeRenderer(HtmlNodeRendererContext context) { + this.htmlWriter = context.getWriter(); + this.context = context; + } + + @Override + protected void renderBlock(TableBlock tableBlock) { + htmlWriter.line(); + htmlWriter.tag("table", getAttributes(tableBlock, "table")); + renderChildren(tableBlock); + htmlWriter.tag("/table"); + htmlWriter.line(); + } + + @Override + protected void renderHead(TableHead tableHead) { + htmlWriter.line(); + htmlWriter.tag("thead", getAttributes(tableHead, "thead")); + renderChildren(tableHead); + htmlWriter.tag("/thead"); + htmlWriter.line(); + } + + @Override + protected void renderBody(TableBody tableBody) { + htmlWriter.line(); + htmlWriter.tag("tbody", getAttributes(tableBody, "tbody")); + renderChildren(tableBody); + htmlWriter.tag("/tbody"); + htmlWriter.line(); + } + + @Override + protected void renderRow(TableRow tableRow) { + htmlWriter.line(); + htmlWriter.tag("tr", getAttributes(tableRow, "tr")); + renderChildren(tableRow); + htmlWriter.tag("/tr"); + htmlWriter.line(); + } + + @Override + protected void renderCell(TableCell tableCell) { + String tagName = tableCell.isHeader() ? 
"th" : "td"; + htmlWriter.line(); + htmlWriter.tag(tagName, getCellAttributes(tableCell, tagName)); + renderChildren(tableCell); + htmlWriter.tag("/" + tagName); + htmlWriter.line(); + } + + private Map getAttributes(Node node, String tagName) { + return context.extendAttributes(node, tagName, Map.of()); + } + + private Map getCellAttributes(TableCell tableCell, String tagName) { + if (tableCell.getAlignment() != null) { + return context.extendAttributes(tableCell, tagName, Map.of("align", getAlignValue(tableCell.getAlignment()))); + } else { + return context.extendAttributes(tableCell, tagName, Map.of()); + } + } + + private static String getAlignValue(TableCell.Alignment alignment) { + switch (alignment) { + case LEFT: + return "left"; + case CENTER: + return "center"; + case RIGHT: + return "right"; + } + throw new IllegalStateException("Unknown alignment: " + alignment); + } + + private void renderChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + context.render(node); + node = next; + } + } +} diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableMarkdownNodeRenderer.java b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableMarkdownNodeRenderer.java new file mode 100644 index 000000000..b0705f579 --- /dev/null +++ b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableMarkdownNodeRenderer.java @@ -0,0 +1,88 @@ +package org.commonmark.ext.gfm.tables.internal; + +import org.commonmark.ext.gfm.tables.*; +import org.commonmark.node.Node; +import org.commonmark.renderer.markdown.MarkdownNodeRendererContext; +import org.commonmark.renderer.markdown.MarkdownWriter; +import org.commonmark.text.AsciiMatcher; + +import java.util.ArrayList; +import java.util.List; + +/** + * The Table node renderer that is needed for rendering GFM tables (GitHub Flavored Markdown) to Markdown. 
+ */ +public class TableMarkdownNodeRenderer extends TableNodeRenderer { + private final MarkdownWriter writer; + private final MarkdownNodeRendererContext context; + + private final AsciiMatcher pipe = AsciiMatcher.builder().c('|').build(); + + private final List columns = new ArrayList<>(); + + public TableMarkdownNodeRenderer(MarkdownNodeRendererContext context) { + this.writer = context.getWriter(); + this.context = context; + } + + @Override + protected void renderBlock(TableBlock node) { + columns.clear(); + writer.pushTight(true); + renderChildren(node); + writer.popTight(); + writer.block(); + } + + @Override + protected void renderHead(TableHead node) { + renderChildren(node); + for (TableCell.Alignment columnAlignment : columns) { + writer.raw('|'); + if (columnAlignment == TableCell.Alignment.LEFT) { + writer.raw(":---"); + } else if (columnAlignment == TableCell.Alignment.RIGHT) { + writer.raw("---:"); + } else if (columnAlignment == TableCell.Alignment.CENTER) { + writer.raw(":---:"); + } else { + writer.raw("---"); + } + } + writer.raw("|"); + writer.block(); + } + + @Override + protected void renderBody(TableBody node) { + renderChildren(node); + } + + @Override + protected void renderRow(TableRow node) { + renderChildren(node); + // Trailing | at the end of the line + writer.raw("|"); + writer.block(); + } + + @Override + protected void renderCell(TableCell node) { + if (node.getParent() != null && node.getParent().getParent() instanceof TableHead) { + columns.add(node.getAlignment()); + } + writer.raw("|"); + writer.pushRawEscape(pipe); + renderChildren(node); + writer.popRawEscape(); + } + + private void renderChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + context.render(node); + node = next; + } + } +} diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableNodeRenderer.java 
b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableNodeRenderer.java new file mode 100644 index 000000000..2982e1518 --- /dev/null +++ b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableNodeRenderer.java @@ -0,0 +1,46 @@ +package org.commonmark.ext.gfm.tables.internal; + +import org.commonmark.ext.gfm.tables.*; +import org.commonmark.node.Node; +import org.commonmark.renderer.NodeRenderer; + +import java.util.Set; +/** Shared base for the table node renderers: reports the five table node types and routes render(Node) to the matching type-specific method; any other node is ignored. */ +abstract class TableNodeRenderer implements NodeRenderer { + + @Override + public Set> getNodeTypes() { + return Set.of( + TableBlock.class, + TableHead.class, + TableBody.class, + TableRow.class, + TableCell.class + ); + } + + @Override + public void render(Node node) { + if (node instanceof TableBlock) { + renderBlock((TableBlock) node); + } else if (node instanceof TableHead) { + renderHead((TableHead) node); + } else if (node instanceof TableBody) { + renderBody((TableBody) node); + } else if (node instanceof TableRow) { + renderRow((TableRow) node); + } else if (node instanceof TableCell) { + renderCell((TableCell) node); + } + } + + protected abstract void renderBlock(TableBlock node); + + protected abstract void renderHead(TableHead node); + + protected abstract void renderBody(TableBody node); + + protected abstract void renderRow(TableRow node); + + protected abstract void renderCell(TableCell node); +} diff --git a/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableTextContentNodeRenderer.java b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableTextContentNodeRenderer.java new file mode 100644 index 000000000..0ba6894b5 --- /dev/null +++ b/commonmark-ext-gfm-tables/src/main/java/org/commonmark/ext/gfm/tables/internal/TableTextContentNodeRenderer.java @@ -0,0 +1,68 @@ +package org.commonmark.ext.gfm.tables.internal; + +import org.commonmark.ext.gfm.tables.TableBlock; +import 
org.commonmark.ext.gfm.tables.TableBody; +import org.commonmark.ext.gfm.tables.TableCell; +import org.commonmark.ext.gfm.tables.TableHead; +import org.commonmark.ext.gfm.tables.TableRow; +import org.commonmark.node.Node; +import org.commonmark.renderer.text.TextContentNodeRendererContext; +import org.commonmark.renderer.text.TextContentWriter; + +/** + * The Table node renderer that is needed for rendering GFM tables (GitHub Flavored Markdown) to text content. + */ +public class TableTextContentNodeRenderer extends TableNodeRenderer { + + private final TextContentWriter textContentWriter; + private final TextContentNodeRendererContext context; + + public TableTextContentNodeRenderer(TextContentNodeRendererContext context) { + this.textContentWriter = context.getWriter(); + this.context = context; + } + + @Override + protected void renderBlock(TableBlock tableBlock) { + // Render rows tight + textContentWriter.pushTight(true); + renderChildren(tableBlock); + textContentWriter.popTight(); + textContentWriter.block(); + } + + @Override + protected void renderHead(TableHead tableHead) { + renderChildren(tableHead); + } + + @Override + protected void renderBody(TableBody tableBody) { + renderChildren(tableBody); + } + + @Override + protected void renderRow(TableRow tableRow) { + renderChildren(tableRow); + textContentWriter.block(); + } + + @Override + protected void renderCell(TableCell tableCell) { + renderChildren(tableCell); + // For the last cell in row, don't render the delimiter + if (tableCell.getNext() != null) { + textContentWriter.write('|'); + textContentWriter.whitespace(); + } + } + + private void renderChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + context.render(node); + node = next; + } + } +} diff --git a/commonmark-ext-gfm-tables/src/main/javadoc/overview.html b/commonmark-ext-gfm-tables/src/main/javadoc/overview.html new file mode 100644 index 000000000..9b4a9ac44 --- /dev/null 
+++ b/commonmark-ext-gfm-tables/src/main/javadoc/overview.html @@ -0,0 +1,6 @@ + + +Extension for GFM tables using "|" pipes (GitHub Flavored Markdown) +

See {@link org.commonmark.ext.gfm.tables.TablesExtension}

+ + diff --git a/commonmark-ext-gfm-tables/src/main/resources/META-INF/LICENSE.txt b/commonmark-ext-gfm-tables/src/main/resources/META-INF/LICENSE.txt new file mode 100644 index 000000000..b09e367ce --- /dev/null +++ b/commonmark-ext-gfm-tables/src/main/resources/META-INF/LICENSE.txt @@ -0,0 +1,23 @@ +Copyright (c) 2015, Atlassian Pty Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/commonmark-ext-gfm-tables/src/test/java/org/commonmark/ext/gfm/tables/TableMarkdownRendererTest.java b/commonmark-ext-gfm-tables/src/test/java/org/commonmark/ext/gfm/tables/TableMarkdownRendererTest.java new file mode 100644 index 000000000..85c11206c --- /dev/null +++ b/commonmark-ext-gfm-tables/src/test/java/org/commonmark/ext/gfm/tables/TableMarkdownRendererTest.java @@ -0,0 +1,75 @@ +package org.commonmark.ext.gfm.tables; + +import org.commonmark.Extension; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.markdown.MarkdownRenderer; +import org.junit.jupiter.api.Test; + +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; +/** Round-trip tests for the tables extension: each input is parsed, rendered back to Markdown, and must equal the original text. */ +public class TableMarkdownRendererTest { + + private static final Set EXTENSIONS = Set.of(TablesExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final MarkdownRenderer RENDERER = MarkdownRenderer.builder().extensions(EXTENSIONS).build(); + + @Test + public void testHeadNoBody() { + assertRoundTrip("|Abc|\n|---|\n"); + assertRoundTrip("|Abc|Def|\n|---|---|\n"); + assertRoundTrip("|Abc||\n|---|---|\n"); + } + + @Test + public void testHeadAndBody() { + assertRoundTrip("|Abc|\n|---|\n|1|\n"); + assertRoundTrip("|Abc|Def|\n|---|---|\n|1|2|\n"); + } + + @Test + public void testBodyHasFewerColumns() { + // Could try not to write empty trailing cells but this is fine too + assertRoundTrip("|Abc|Def|\n|---|---|\n|1||\n"); + } + + @Test + public void testAlignment() { + assertRoundTrip("|Abc|Def|\n|:---|---|\n|1|2|\n"); + assertRoundTrip("|Abc|Def|\n|---|---:|\n|1|2|\n"); + assertRoundTrip("|Abc|Def|\n|:---:|:---:|\n|1|2|\n"); + } + + @Test + public void testInsideBlockQuote() { + assertRoundTrip("> |Abc|Def|\n> |---|---|\n> |1|2|\n"); + } + + @Test + public void testMultipleTables() { + assertRoundTrip("|Abc|Def|\n|---|---|\n\n|One|\n|---|\n|Only|\n"); + } + + @Test + public void testEscaping() { + 
assertRoundTrip("|Abc|Def|\n|---|---|\n|Pipe in|text \\||\n"); + assertRoundTrip("|Abc|Def|\n|---|---|\n|Pipe in|code `\\|`|\n"); + assertRoundTrip("|Abc|Def|\n|---|---|\n|Inline HTML|Foo\\|bar|\n"); + } + + @Test + public void testEscaped() { + // `|` in Text nodes needs to be escaped, otherwise the generated Markdown would get parsed back as a table + assertRoundTrip("\\|Abc\\|\n\\|---\\|\n"); + } + + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } + + private void assertRoundTrip(String input) { + String rendered = render(input); + assertThat(rendered).isEqualTo(input); + } +} diff --git a/commonmark-ext-gfm-tables/src/test/java/org/commonmark/ext/gfm/tables/TablesSpecTest.java b/commonmark-ext-gfm-tables/src/test/java/org/commonmark/ext/gfm/tables/TablesSpecTest.java new file mode 100644 index 000000000..e7f3db4d1 --- /dev/null +++ b/commonmark-ext-gfm-tables/src/test/java/org/commonmark/ext/gfm/tables/TablesSpecTest.java @@ -0,0 +1,42 @@ +package org.commonmark.ext.gfm.tables; + +import org.commonmark.Extension; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.commonmark.testutil.TestResources; +import org.commonmark.testutil.example.Example; +import org.commonmark.testutil.example.ExampleReader; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.Parameter; +import org.junit.jupiter.params.ParameterizedClass; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.List; +import java.util.Set; + +@ParameterizedClass +@MethodSource("data") +public class TablesSpecTest extends RenderingTestCase { + + private static final Set EXTENSIONS = Set.of(TablesExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().extensions(EXTENSIONS).build(); + + @Parameter + 
Example example; + + static List data() { + return ExampleReader.readExamples(TestResources.getGfmSpec(), "table"); + } + + @Test + public void testHtmlRendering() { + assertRendering(example.getSource(), example.getHtml()); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } +} diff --git a/commonmark-ext-gfm-tables/src/test/java/org/commonmark/ext/gfm/tables/TablesTest.java b/commonmark-ext-gfm-tables/src/test/java/org/commonmark/ext/gfm/tables/TablesTest.java index b19f9700d..3f4b37d54 100644 --- a/commonmark-ext-gfm-tables/src/test/java/org/commonmark/ext/gfm/tables/TablesTest.java +++ b/commonmark-ext-gfm-tables/src/test/java/org/commonmark/ext/gfm/tables/TablesTest.java @@ -1,13 +1,28 @@ package org.commonmark.ext.gfm.tables; import org.commonmark.Extension; -import org.commonmark.test.RenderingTestCase; -import org.junit.Test; - -import java.util.Collections; +import org.commonmark.node.*; +import org.commonmark.parser.IncludeSourceSpans; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.AttributeProvider; +import org.commonmark.renderer.html.AttributeProviderContext; +import org.commonmark.renderer.html.AttributeProviderFactory; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; public class TablesTest extends RenderingTestCase { + private static final Set EXTENSIONS = Set.of(TablesExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().extensions(EXTENSIONS).build(); + @Test public void mustHaveHeaderAndSeparator() { assertRendering("Abc|Def", "

Abc|Def

\n"); @@ -15,23 +30,63 @@ public void mustHaveHeaderAndSeparator() { } @Test - public void separatorMustBeThreeOrMore() { - assertRendering("Abc|Def\n-|-", "

Abc|Def\n-|-

\n"); - assertRendering("Abc|Def\n--|--", "

Abc|Def\n--|--

\n"); + public void separatorMustBeOneOrMore() { + assertRendering("Abc|Def\n-|-", "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
\n"); + assertRendering("Abc|Def\n--|--", "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
\n"); + } + + @Test + public void separatorMustNotContainInvalidChars() { + assertRendering("Abc|Def\n |-a-|---", "

Abc|Def\n|-a-|---

\n"); + assertRendering("Abc|Def\n |:--a|---", "

Abc|Def\n|:--a|---

\n"); + assertRendering("Abc|Def\n |:--a--:|---", "

Abc|Def\n|:--a--:|---

\n"); + } + + @Test + public void separatorCanHaveLeadingSpaceThenPipe() { + assertRendering("Abc|Def\n |---|---", "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
\n"); + } + + @Test + public void separatorCanNotHaveAdjacentPipes() { + assertRendering("Abc|Def\n---||---", "

Abc|Def\n---||---

\n"); } @Test - public void separatorCanNotHaveLeadingSpaceThenPipe() { - assertRendering("Abc|Def\n |---|---", "

Abc|Def\n|---|---

\n"); + public void separatorNeedsPipes() { + assertRendering("Abc|Def\n|--- ---", "

Abc|Def\n|--- ---

\n"); } @Test public void oneHeadNoBody() { assertRendering("Abc|Def\n---|---", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + - "\n" + "
AbcDef
AbcDef
\n"); } @@ -39,9 +94,10 @@ public void oneHeadNoBody() { public void oneColumnOneHeadNoBody() { String expected = "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + "\n" + - "\n" + "
Abc
Abc
\n"; assertRendering("|Abc\n|---\n", expected); assertRendering("|Abc|\n|---|\n", expected); @@ -57,10 +113,14 @@ public void oneColumnOneHeadNoBody() { public void oneColumnOneHeadOneBody() { String expected = "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
Abc
Abc
1
1
\n"; assertRendering("|Abc\n|---\n|1", expected); @@ -69,25 +129,40 @@ public void oneColumnOneHeadOneBody() { // Pipe required on separator assertRendering("|Abc\n---\n|1", "

|Abc

\n

|1

\n"); + } - // Pipe required on body - assertRendering("|Abc\n|---\n1\n", "\n" + + @Test + public void oneHeadOneBody() { + assertRendering("Abc|Def\n---|---\n1|2", "
\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + - "\n" + - "
Abc
AbcDef
\n" + - "

1

\n"); + "\n" + + "\n" + + "1\n" + + "2\n" + + "\n" + + "\n" + + "\n"); } @Test - public void oneHeadOneBody() { - assertRendering("Abc|Def\n---|---\n1|2", "\n" + + public void spaceBeforeSeparator() { + assertRendering(" |Abc|Def|\n |---|---|\n |1|2|", "
\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
AbcDef
12
12
\n"); } @@ -101,10 +176,16 @@ public void separatorMustNotHaveLessPartsThanHead() { public void padding() { assertRendering(" Abc | Def \n --- | --- \n 1 | 2 ", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
AbcDef
12
12
\n"); } @@ -113,10 +194,16 @@ public void padding() { public void paddingWithCodeBlockIndentation() { assertRendering("Abc|Def\n---|---\n 1|2", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
AbcDef
12
12
\n"); } @@ -125,22 +212,77 @@ public void paddingWithCodeBlockIndentation() { public void pipesOnOutside() { assertRendering("|Abc|Def|\n|---|---|\n|1|2|", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
AbcDef
12
\n"); + } + + @Test + public void pipesOnOutsideWhitespaceAfterHeader() { + assertRendering("|Abc|Def| \n|---|---|\n|1|2|", "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
12
12
\n"); } + @Test + public void pipesOnOutsideZeroLengthHeaders() { + // This is literally what someone has done IRL - it helped to expose + // an issue with parsing the last header cell correctly + assertRendering("||center header||\n" + + "-|-------------|-\n" + + "1| 2 |3", + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
center header
123
\n"); + } + @Test public void inlineElements() { assertRendering("*Abc*|Def\n---|---\n1|2", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
AbcDef
12
12
\n"); } @@ -149,58 +291,213 @@ public void inlineElements() { public void escapedPipe() { assertRendering("Abc|Def\n---|---\n1\\|2|20", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
AbcDef
1|220
1|220
\n"); } @Test public void escapedBackslash() { + // This is a bit weird in the GFM spec IMO. `1\\|2` looks like an escaped backslash, followed by a pipe + // (so two cells). Instead, the `\|` is parsed as an escaped pipe first, so just a single cell. The inline + // parser then gets `1\|2` which renders as `1|2`. assertRendering("Abc|Def\n---|---\n1\\\\|2", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
AbcDef
1|2
\n"); + } + + @Test + public void escapedOther() { + // This is a tricky one. For \`, we don't want to remove the backslash when we parse the table, otherwise + // inline parsing is wrong. So we have to be careful where we do/don't consume the backslash. + assertRendering("Abc|Def\n---|---\n1|\\`not code`", "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
1`not code`
\n"); + } + + @Test + public void backslashAtEnd() { + assertRendering("Abc|Def\n---|---\n1|2\\", "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
1\\2
12\\
\n"); } @Test public void alignLeft() { + assertRendering("Abc|Def\n:-|-\n1|2", "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
12
\n"); + assertRendering("Abc|Def\n:--|--\n1|2", "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
12
\n"); assertRendering("Abc|Def\n:---|---\n1|2", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
AbcDef
12
12
\n"); } @Test public void alignRight() { + assertRendering("Abc|Def\n-:|-\n1|2", "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
12
\n"); + assertRendering("Abc|Def\n--:|--\n1|2", "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
12
\n"); assertRendering("Abc|Def\n---:|---\n1|2", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
AbcDef
12
12
\n"); } @Test public void alignCenter() { + assertRendering("Abc|Def\n:-:|-\n1|2", "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
12
\n"); + assertRendering("Abc|Def\n:--:|--\n1|2", "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
12
\n"); assertRendering("Abc|Def\n:---:|---\n1|2", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
AbcDef
12
12
\n"); } @@ -209,10 +506,16 @@ public void alignCenter() { public void alignCenterSecond() { assertRendering("Abc|Def\n---|:---:\n1|2", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
AbcDef
12
12
\n"); } @@ -221,10 +524,16 @@ public void alignCenterSecond() { public void alignLeftWithSpaces() { assertRendering("Abc|Def\n :--- |---\n1|2", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
AbcDef
12
12
\n"); } @@ -241,10 +550,16 @@ public void alignmentMarkerMustBeNextToDashes() { public void bodyCanNotHaveMoreColumnsThanHead() { assertRendering("Abc|Def\n---|---\n1|2|3", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
AbcDef
12
12
\n"); } @@ -253,10 +568,18 @@ public void bodyCanNotHaveMoreColumnsThanHead() { public void bodyWithFewerColumnsThanHeadResultsInEmptyCells() { assertRendering("Abc|Def|Ghi\n---|---|---\n1|2", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDefGhi
AbcDefGhi
12
12
\n"); } @@ -266,31 +589,298 @@ public void insideBlockQuote() { assertRendering("> Abc|Def\n> ---|---\n> 1|2", "
\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
AbcDef
12
12
\n" + "
\n"); } @Test - public void tableEndWithoutEmptyLine() { - assertRendering("Abc|Def\n---|---\n1|2\ntable, you are over", "\n" + + public void tableWithLazyContinuationLine() { + assertRendering("Abc|Def\n---|---\n1|2\nlazy", "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
12
lazy
\n"); + } + + @Test + public void issue142() { + assertRendering("||Alveolar|Bilabial\n" + + "|:--|:-:|:-:\n" + + "|**Plosive**|t, d|b\n" + + "|**Tap**|ɾ|", + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AlveolarBilabial
Plosivet, db
Tapɾ
\n"); + } + + @Test + public void danglingPipe() { + assertRendering("Abc|Def\n" + + "---|---\n" + + "1|2\n" + + "|", "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
12
\n" + + "

|

\n"); + + assertRendering("Abc|Def\n" + + "---|---\n" + + "1|2\n" + + " | ", "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "\n" + - "\n" + + "\n" + + "\n" + + "\n" + + "\n" + "\n" + "
AbcDef
AbcDef
12
12
\n" + - "

table, you are over

\n"); + "

|

\n"); } - @Override - protected Iterable getExtensions() { - return Collections.singleton(TablesExtension.create()); + @Test + public void interruptsParagraph() { + assertRendering("text\n" + + "|a |\n" + + "|---|\n" + + "|b |", "

text

\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
a
b
\n"); + } + + @Test + public void attributeProviderIsApplied() { + AttributeProviderFactory factory = new AttributeProviderFactory() { + @Override + public AttributeProvider create(AttributeProviderContext context) { + return new AttributeProvider() { + @Override + public void setAttributes(Node node, String tagName, Map attributes) { + if (node instanceof TableBlock) { + attributes.put("test", "block"); + } else if (node instanceof TableHead) { + attributes.put("test", "head"); + } else if (node instanceof TableBody) { + attributes.put("test", "body"); + } else if (node instanceof TableRow) { + attributes.put("test", "row"); + } else if (node instanceof TableCell) { + attributes.put("test", "cell"); + } + } + }; + } + }; + HtmlRenderer renderer = HtmlRenderer.builder() + .attributeProviderFactory(factory) + .extensions(EXTENSIONS) + .build(); + String rendered = renderer.render(PARSER.parse("Abc|Def\n---|---\n1|2")); + assertThat(rendered).isEqualTo("\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
12
\n"); + } + + @Test + public void columnWidthIsRecorded() { + AttributeProviderFactory factory = new AttributeProviderFactory() { + @Override + public AttributeProvider create(AttributeProviderContext context) { + return new AttributeProvider() { + @Override + public void setAttributes(Node node, String tagName, Map attributes) { + if (node instanceof TableCell && "th".equals(tagName)) { + attributes.put("width", ((TableCell) node).getWidth() + "em"); + } + } + }; + } + }; + HtmlRenderer renderer = HtmlRenderer.builder() + .attributeProviderFactory(factory) + .extensions(EXTENSIONS) + .build(); + String rendered = renderer.render(PARSER.parse("Abc|Def\n-----|---\n1|2")); + assertThat(rendered).isEqualTo("\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
AbcDef
12
\n"); } + @Test + public void sourceSpans() { + Parser parser = Parser.builder() + .extensions(EXTENSIONS) + .includeSourceSpans(IncludeSourceSpans.BLOCKS_AND_INLINES) + .build(); + Node document = parser.parse("Abc|Def\n---|---\n|1|2\n 3|four|\n|||\n"); + + TableBlock block = (TableBlock) document.getFirstChild(); + assertThat(block.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 7), SourceSpan.of(1, 0, 8, 7), + SourceSpan.of(2, 0, 16, 4), SourceSpan.of(3, 0, 21, 8), SourceSpan.of(4, 0, 30, 3))); + + TableHead head = (TableHead) block.getFirstChild(); + assertThat(head.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 7))); + + TableRow headRow = (TableRow) head.getFirstChild(); + assertThat(headRow.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 7))); + TableCell headRowCell1 = (TableCell) headRow.getFirstChild(); + TableCell headRowCell2 = (TableCell) headRow.getLastChild(); + assertThat(headRowCell1.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 3))); + assertThat(headRowCell1.getFirstChild().getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 3))); + assertThat(headRowCell2.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 4, 4, 3))); + assertThat(headRowCell2.getFirstChild().getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 4, 4, 3))); + + TableBody body = (TableBody) block.getLastChild(); + assertThat(body.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(2, 0, 16, 4), SourceSpan.of(3, 0, 21, 8), SourceSpan.of(4, 0, 30, 3))); + + TableRow bodyRow1 = (TableRow) body.getFirstChild(); + assertThat(bodyRow1.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(2, 0, 16, 4))); + TableCell bodyRow1Cell1 = (TableCell) bodyRow1.getFirstChild(); + TableCell bodyRow1Cell2 = (TableCell) bodyRow1.getLastChild(); + assertThat(bodyRow1Cell1.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(2, 1, 17, 1))); + assertThat(bodyRow1Cell1.getFirstChild().getSourceSpans()).isEqualTo(List.of(SourceSpan.of(2, 1, 17, 1))); + 
assertThat(bodyRow1Cell2.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(2, 3, 19, 1))); + assertThat(bodyRow1Cell2.getFirstChild().getSourceSpans()).isEqualTo(List.of(SourceSpan.of(2, 3, 19, 1))); + + TableRow bodyRow2 = (TableRow) body.getFirstChild().getNext(); + assertThat(bodyRow2.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(3, 0, 21, 8))); + TableCell bodyRow2Cell1 = (TableCell) bodyRow2.getFirstChild(); + TableCell bodyRow2Cell2 = (TableCell) bodyRow2.getLastChild(); + assertThat(bodyRow2Cell1.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(3, 1, 22, 1))); + assertThat(bodyRow2Cell1.getFirstChild().getSourceSpans()).isEqualTo(List.of(SourceSpan.of(3, 1, 22, 1))); + assertThat(bodyRow2Cell2.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(3, 3, 24, 4))); + assertThat(bodyRow2Cell2.getFirstChild().getSourceSpans()).isEqualTo(List.of(SourceSpan.of(3, 3, 24, 4))); + + TableRow bodyRow3 = (TableRow) body.getLastChild(); + assertThat(bodyRow3.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(4, 0, 30, 3))); + TableCell bodyRow3Cell1 = (TableCell) bodyRow3.getFirstChild(); + TableCell bodyRow3Cell2 = (TableCell) bodyRow3.getLastChild(); + assertThat(bodyRow3Cell1.getSourceSpans()).isEqualTo(List.of()); + assertThat(bodyRow3Cell2.getSourceSpans()).isEqualTo(List.of()); + } + + @Test + public void sourceSpansWhenInterrupting() { + var parser = Parser.builder() + .extensions(EXTENSIONS) + .includeSourceSpans(IncludeSourceSpans.BLOCKS_AND_INLINES) + .build(); + var document = parser.parse("a\n" + + "bc\n" + + "|de|\n" + + "|---|\n" + + "|fg|"); + + var paragraph = (Paragraph) document.getFirstChild(); + var text = (Text) paragraph.getFirstChild(); + assertThat(text.getLiteral()).isEqualTo("a"); + assertThat(text.getNext()).isInstanceOf(SoftLineBreak.class); + var text2 = (Text) text.getNext().getNext(); + assertThat(text2.getLiteral()).isEqualTo("bc"); + + assertThat(paragraph.getSourceSpans()).isEqualTo(List.of( + SourceSpan.of(0, 0, 0, 1), + 
SourceSpan.of(1, 0, 2, 2))); + + var table = (TableBlock) document.getLastChild(); + assertThat(table.getSourceSpans()).isEqualTo(List.of( + SourceSpan.of(2, 0, 5, 4), + SourceSpan.of(3, 0, 10, 5), + SourceSpan.of(4, 0, 16, 4))); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } } diff --git a/commonmark-ext-gfm-tables/src/test/java/org/commonmark/ext/gfm/tables/TablesTextContentTest.java b/commonmark-ext-gfm-tables/src/test/java/org/commonmark/ext/gfm/tables/TablesTextContentTest.java new file mode 100644 index 000000000..966f097fd --- /dev/null +++ b/commonmark-ext-gfm-tables/src/test/java/org/commonmark/ext/gfm/tables/TablesTextContentTest.java @@ -0,0 +1,166 @@ +package org.commonmark.ext.gfm.tables; + +import org.commonmark.Extension; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.text.LineBreakRendering; +import org.commonmark.renderer.text.TextContentRenderer; +import org.commonmark.testutil.Asserts; +import org.junit.jupiter.api.Test; + +import java.util.Set; + +public class TablesTextContentTest { + + private static final Set EXTENSIONS = Set.of(TablesExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final TextContentRenderer RENDERER = TextContentRenderer.builder().extensions(EXTENSIONS).build(); + /* NOTE(review): RENDERER above is built identically to COMPACT_RENDERER and is not referenced by any test in this class; consider removing it. */ + private static final TextContentRenderer COMPACT_RENDERER = TextContentRenderer.builder().extensions(EXTENSIONS).build(); + private static final TextContentRenderer SEPARATE_RENDERER = TextContentRenderer.builder().extensions(EXTENSIONS) + .lineBreakRendering(LineBreakRendering.SEPARATE_BLOCKS).build(); + private static final TextContentRenderer STRIPPED_RENDERER = TextContentRenderer.builder().extensions(EXTENSIONS) + .lineBreakRendering(LineBreakRendering.STRIP).build(); + + @Test + public void oneHeadNoBody() { + assertCompact("Abc|Def\n---|---", "Abc| Def"); + } + + @Test + public void 
oneColumnOneHeadNoBody() { + String expected = "Abc"; + assertCompact("|Abc\n|---\n", expected); + assertCompact("|Abc|\n|---|\n", expected); + assertCompact("Abc|\n---|\n", expected); + + // Pipe required on separator + assertCompact("|Abc\n---\n", "|Abc"); + // Pipe required on head + assertCompact("Abc\n|---\n", "Abc\n|---"); + } + + @Test + public void oneColumnOneHeadOneBody() { + String expected = "Abc\n1"; + assertCompact("|Abc\n|---\n|1", expected); + assertCompact("|Abc|\n|---|\n|1|", expected); + assertCompact("Abc|\n---|\n1|", expected); + + // Pipe required on separator + assertCompact("|Abc\n---\n|1", "|Abc\n|1"); + } + + @Test + public void oneHeadOneBody() { + assertCompact("Abc|Def\n---|---\n1|2", "Abc| Def\n1| 2"); + } + + @Test + public void separatorMustNotHaveLessPartsThanHead() { + assertCompact("Abc|Def|Ghi\n---|---\n1|2|3", "Abc|Def|Ghi\n---|---\n1|2|3"); + } + + @Test + public void padding() { + assertCompact(" Abc | Def \n --- | --- \n 1 | 2 ", "Abc| Def\n1| 2"); + } + + @Test + public void paddingWithCodeBlockIndentation() { + assertCompact("Abc|Def\n---|---\n 1|2", "Abc| Def\n1| 2"); + } + + @Test + public void pipesOnOutside() { + assertCompact("|Abc|Def|\n|---|---|\n|1|2|", "Abc| Def\n1| 2"); + } + + @Test + public void inlineElements() { + assertCompact("*Abc*|Def\n---|---\n1|2", "Abc| Def\n1| 2"); + } + + @Test + public void escapedPipe() { + assertCompact("Abc|Def\n---|---\n1\\|2|20", "Abc| Def\n1|2| 20"); + } + + @Test + public void alignLeft() { + assertCompact("Abc|Def\n:---|---\n1|2", "Abc| Def\n1| 2"); + } + + @Test + public void alignRight() { + assertCompact("Abc|Def\n---:|---\n1|2", "Abc| Def\n1| 2"); + } + + @Test + public void alignCenter() { + assertCompact("Abc|Def\n:---:|---\n1|2", "Abc| Def\n1| 2"); + } + + @Test + public void alignCenterSecond() { + assertCompact("Abc|Def\n---|:---:\n1|2", "Abc| Def\n1| 2"); + } + + @Test + public void alignLeftWithSpaces() { + assertCompact("Abc|Def\n :--- |---\n1|2", "Abc| Def\n1| 
2"); + } + + @Test + public void alignmentMarkerMustBeNextToDashes() { + assertCompact("Abc|Def\n: ---|---", "Abc|Def\n: ---|---"); + assertCompact("Abc|Def\n--- :|---", "Abc|Def\n--- :|---"); + assertCompact("Abc|Def\n---|: ---", "Abc|Def\n---|: ---"); + assertCompact("Abc|Def\n---|--- :", "Abc|Def\n---|--- :"); + } + + @Test + public void bodyCanNotHaveMoreColumnsThanHead() { + assertCompact("Abc|Def\n---|---\n1|2|3", "Abc| Def\n1| 2"); + } + + @Test + public void bodyWithFewerColumnsThanHeadResultsInEmptyCells() { + assertCompact("Abc|Def|Ghi\n---|---|---\n1|2", "Abc| Def| Ghi\n1| 2| "); + } + + @Test + public void insideBlockQuote() { + assertCompact("> Abc|Def\n> ---|---\n> 1|2", "«Abc| Def\n1| 2»"); + } + + @Test + public void tableWithLazyContinuationLine() { + assertCompact("Abc|Def\n---|---\n1|2\nlazy", "Abc| Def\n1| 2\nlazy| "); + } + + @Test + public void tableBetweenOtherBlocks() { + var s = "Foo\n\nAbc|Def\n---|---\n1|2\n\nBar"; + assertCompact(s, "Foo\nAbc| Def\n1| 2\nBar"); + assertSeparate(s, "Foo\n\nAbc| Def\n1| 2\n\nBar"); + assertStripped(s, "Foo Abc| Def 1| 2 Bar"); + } + + private void assertCompact(String source, String expected) { + var doc = PARSER.parse(source); + var actualRendering = COMPACT_RENDERER.render(doc); + Asserts.assertRendering(source, expected, actualRendering); + } + + private void assertSeparate(String source, String expected) { + var doc = PARSER.parse(source); + var actualRendering = SEPARATE_RENDERER.render(doc); + Asserts.assertRendering(source, expected, actualRendering); + } + + private void assertStripped(String source, String expected) { + var doc = PARSER.parse(source); + var actualRendering = STRIPPED_RENDERER.render(doc); + Asserts.assertRendering(source, expected, actualRendering); + } +} diff --git a/commonmark-ext-heading-anchor/pom.xml b/commonmark-ext-heading-anchor/pom.xml new file mode 100644 index 000000000..26d2d19b1 --- /dev/null +++ b/commonmark-ext-heading-anchor/pom.xml @@ -0,0 +1,27 @@ + + + 4.0.0 + 
+ org.commonmark + commonmark-parent + 0.28.1-SNAPSHOT + + + commonmark-ext-heading-anchor + commonmark-java extension for adding id attributes to h tags + commonmark-java extension for adding unique id attributes to header tags + + + + org.commonmark + commonmark + + + + org.commonmark + commonmark-test-util + test + + + + diff --git a/commonmark-ext-heading-anchor/src/main/java/module-info.java b/commonmark-ext-heading-anchor/src/main/java/module-info.java new file mode 100644 index 000000000..2369323a6 --- /dev/null +++ b/commonmark-ext-heading-anchor/src/main/java/module-info.java @@ -0,0 +1,5 @@ +module org.commonmark.ext.heading.anchor { + exports org.commonmark.ext.heading.anchor; + + requires transitive org.commonmark; +} diff --git a/commonmark-ext-heading-anchor/src/main/java/org/commonmark/ext/heading/anchor/HeadingAnchorExtension.java b/commonmark-ext-heading-anchor/src/main/java/org/commonmark/ext/heading/anchor/HeadingAnchorExtension.java new file mode 100644 index 000000000..cee414da2 --- /dev/null +++ b/commonmark-ext-heading-anchor/src/main/java/org/commonmark/ext/heading/anchor/HeadingAnchorExtension.java @@ -0,0 +1,105 @@ +package org.commonmark.ext.heading.anchor; + +import org.commonmark.Extension; +import org.commonmark.ext.heading.anchor.internal.HeadingIdAttributeProvider; +import org.commonmark.renderer.html.AttributeProvider; +import org.commonmark.renderer.html.AttributeProviderContext; +import org.commonmark.renderer.html.AttributeProviderFactory; +import org.commonmark.renderer.html.HtmlRenderer; + +/** + * Extension for adding auto generated IDs to headings. + *

+ * Create it with {@link #create()} or {@link #builder()} and then configure it on the + * renderer builder ({@link HtmlRenderer.Builder#extensions(Iterable)}). + *

+ * The heading text will be used to create the id. Multiple headings with the + * same text will result in appending a hyphen and number. For example: + *


+ * # Heading
+ * # Heading
+ * 
+ * will result in + *

+ * <h1 id="heading">Heading</h1>
+ * <h1 id="heading-1">Heading</h1>
+ * 
+ * + * @see IdGenerator the IdGenerator class if just the ID generation part is needed + */ +public class HeadingAnchorExtension implements HtmlRenderer.HtmlRendererExtension { + + private final String defaultId; + private final String idPrefix; + private final String idSuffix; + + private HeadingAnchorExtension(Builder builder) { + this.defaultId = builder.defaultId; + this.idPrefix = builder.idPrefix; + this.idSuffix = builder.idSuffix; + } + + /** + * @return the extension built with default settings + */ + public static Extension create() { + return new HeadingAnchorExtension(builder()); + } + + /** + * @return a builder to configure the extension settings + */ + public static Builder builder() { + return new Builder(); + } + + @Override + public void extend(HtmlRenderer.Builder rendererBuilder) { + rendererBuilder.attributeProviderFactory(new AttributeProviderFactory() { + @Override + public AttributeProvider create(AttributeProviderContext context) { + return HeadingIdAttributeProvider.create(defaultId, idPrefix, idSuffix); + } + }); + } + + public static class Builder { + private String defaultId = "id"; + private String idPrefix = ""; + private String idSuffix = ""; + + /** + * @param value Default value for the id to take if no generated id can be extracted. Default "id" + * @return {@code this} + */ + public Builder defaultId(String value) { + this.defaultId = value; + return this; + } + + /** + * @param value Set the value to be prepended to every id generated. Default "" + * @return {@code this} + */ + public Builder idPrefix(String value) { + this.idPrefix = value; + return this; + } + + /** + * @param value Set the value to be appended to every id generated. 
Default "" + * @return {@code this} + */ + public Builder idSuffix(String value) { + this.idSuffix = value; + return this; + } + + /** + * @return a configured extension + */ + public Extension build() { + return new HeadingAnchorExtension(this); + } + } +} diff --git a/commonmark-ext-heading-anchor/src/main/java/org/commonmark/ext/heading/anchor/IdGenerator.java b/commonmark-ext-heading-anchor/src/main/java/org/commonmark/ext/heading/anchor/IdGenerator.java new file mode 100644 index 000000000..6eb85b6c1 --- /dev/null +++ b/commonmark-ext-heading-anchor/src/main/java/org/commonmark/ext/heading/anchor/IdGenerator.java @@ -0,0 +1,148 @@ +package org.commonmark.ext.heading.anchor; + +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Generates strings to be used as identifiers. + *

+ * Use {@link #builder()} to create an instance. + */ +public class IdGenerator { + private final Pattern allowedCharacters; + private final Map identityMap; + private final String prefix; + private final String suffix; + private String defaultIdentifier; + + private IdGenerator(Builder builder) { + this.allowedCharacters = compileAllowedCharactersPattern(); + this.defaultIdentifier = builder.defaultIdentifier; + this.prefix = builder.prefix; + this.suffix = builder.suffix; + this.identityMap = new HashMap<>(); + } + + /** + * @return a new builder with default arguments + */ + public static Builder builder() { + return new Builder(); + } + + /** + *

+ * Generate an ID based on the provided text and previously generated IDs. + *

+ * This method is not thread safe, concurrent calls can end up + * with non-unique identifiers. + *

+ * Note that collision can occur in the case that + *

    + *
  • Method called with 'X'
  • + *
  • Method called with 'X' again
  • + *
  • Method called with 'X-1'
  • + *
+ *

+ * In that case, the three generated IDs will be: + *

    + *
  • X
  • + *
  • X-1
  • + *
  • X-1
  • + *
+ *

+ * Therefore if collisions are unacceptable you should ensure that + * numbers are stripped from the end of {@code text}. + * + * @param text text that the identifier should be based on; it is normalised before use. If it is + * {@code null}, or empty after normalisation, the default identifier set on the builder is used. + * @return prefix + normalised text + suffix the first time a normalised text is seen. On later calls with the + * same normalised text, {@code "-" + X} is inserted before the suffix, where X is the number of times + * that text has been seen before. + */ + public String generateId(String text) { + String key = (text == null) ? defaultIdentifier : normalizeText(text); + if (key.isEmpty()) { + key = defaultIdentifier; + } + + // null means this identity has not been handed out yet; otherwise it is the number of earlier uses. + Integer timesSeen = identityMap.get(key); + if (timesSeen == null) { + identityMap.put(key, 1); + return prefix + key + suffix; + } + + identityMap.put(key, timesSeen + 1); + return prefix + key + "-" + timesSeen + suffix; + } + + private static Pattern compileAllowedCharactersPattern() { + final String allowed = "[\\w\\-_]+"; + try { + return Pattern.compile(allowed, Pattern.UNICODE_CHARACTER_CLASS); + } catch (IllegalArgumentException unsupportedFlag) { + // Android only supports the flag in API level 24. But it actually uses Unicode character classes by + // default, so not specifying the flag is ok. See issue #71. + return Pattern.compile(allowed); + } + } + + /** + * Assume we've been given a space separated text.
+ * + * @param text Text to normalize to an ID + */ + private String normalizeText(String text) { + String firstPassNormalising = text.toLowerCase().replace(" ", "-"); + + StringBuilder sb = new StringBuilder(); + Matcher matcher = allowedCharacters.matcher(firstPassNormalising); + + while (matcher.find()) { + sb.append(matcher.group()); + } + + return sb.toString(); + } + + public static class Builder { + private String defaultIdentifier = "id"; + private String prefix = ""; + private String suffix = ""; + + public IdGenerator build() { + return new IdGenerator(this); + } + + /** + * @param defaultId the default identifier to use in case the provided text is empty or only contains unusable characters + * @return {@code this} + */ + public Builder defaultId(String defaultId) { + this.defaultIdentifier = defaultId; + return this; + } + + /** + * @param prefix the text to place before the generated identity + * @return {@code this} + */ + public Builder prefix(String prefix) { + this.prefix = prefix; + return this; + } + + /** + * @param suffix the text to place after the generated identity + * @return {@code this} + */ + public Builder suffix(String suffix) { + this.suffix = suffix; + return this; + } + } +} diff --git a/commonmark-ext-heading-anchor/src/main/java/org/commonmark/ext/heading/anchor/internal/HeadingIdAttributeProvider.java b/commonmark-ext-heading-anchor/src/main/java/org/commonmark/ext/heading/anchor/internal/HeadingIdAttributeProvider.java new file mode 100644 index 000000000..6b8792bd5 --- /dev/null +++ b/commonmark-ext-heading-anchor/src/main/java/org/commonmark/ext/heading/anchor/internal/HeadingIdAttributeProvider.java @@ -0,0 +1,55 @@ +package org.commonmark.ext.heading.anchor.internal; + +import org.commonmark.ext.heading.anchor.IdGenerator; +import org.commonmark.renderer.html.AttributeProvider; +import org.commonmark.node.*; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +public class 
HeadingIdAttributeProvider implements AttributeProvider { + + private final IdGenerator idGenerator; + + private HeadingIdAttributeProvider(String defaultId, String prefix, String suffix) { + idGenerator = IdGenerator.builder() + .defaultId(defaultId) + .prefix(prefix) + .suffix(suffix) + .build(); + } + + public static HeadingIdAttributeProvider create(String defaultId, String prefix, String suffix) { + return new HeadingIdAttributeProvider(defaultId, prefix, suffix); + } + + /** + * Sets the {@code id} attribute on {@link Heading} nodes; all other nodes are left untouched. + * The id is generated from the concatenated literals of the {@link Text} and {@link Code} nodes + * visited inside the heading. + * + * @param node the node to set attributes for + * @param tagName the tag name (not used here) + * @param attributes the attribute map that receives the {@code id} entry + */ + @Override + public void setAttributes(Node node, String tagName, final Map attributes) { + + if (node instanceof Heading) { + + final List wordList = new ArrayList<>(); + + node.accept(new AbstractVisitor() { + @Override + public void visit(Text text) { + wordList.add(text.getLiteral()); + } + + @Override + public void visit(Code code) { + wordList.add(code.getLiteral()); + } + }); + + // Join in one pass instead of repeated String concatenation, and lower-case with Locale.ROOT so the + // generated id does not depend on the JVM default locale (e.g. "I" becoming a dotless i under tr-TR). + String finalString = String.join("", wordList).trim().toLowerCase(java.util.Locale.ROOT); + + attributes.put("id", idGenerator.generateId(finalString)); + } + } +} diff --git a/commonmark-ext-heading-anchor/src/main/javadoc/overview.html b/commonmark-ext-heading-anchor/src/main/javadoc/overview.html new file mode 100644 index 000000000..4a64323eb --- /dev/null +++ b/commonmark-ext-heading-anchor/src/main/javadoc/overview.html @@ -0,0 +1,7 @@ + + +Extension for automatically adding {@code id} attributes to all headers +

See {@link org.commonmark.ext.heading.anchor.HeadingAnchorExtension} or use + {@link org.commonmark.ext.heading.anchor.IdGenerator} directly.

+ + diff --git a/commonmark-ext-heading-anchor/src/main/resources/META-INF/LICENSE.txt b/commonmark-ext-heading-anchor/src/main/resources/META-INF/LICENSE.txt new file mode 100644 index 000000000..b09e367ce --- /dev/null +++ b/commonmark-ext-heading-anchor/src/main/resources/META-INF/LICENSE.txt @@ -0,0 +1,23 @@ +Copyright (c) 2015, Atlassian Pty Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/commonmark-ext-heading-anchor/src/test/java/org/commonmark/ext/heading/anchor/HeadingAnchorConfigurationTest.java b/commonmark-ext-heading-anchor/src/test/java/org/commonmark/ext/heading/anchor/HeadingAnchorConfigurationTest.java new file mode 100644 index 000000000..438a3a9bd --- /dev/null +++ b/commonmark-ext-heading-anchor/src/test/java/org/commonmark/ext/heading/anchor/HeadingAnchorConfigurationTest.java @@ -0,0 +1,57 @@ +package org.commonmark.ext.heading.anchor; + +import org.commonmark.Extension; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class HeadingAnchorConfigurationTest { + + private static final Parser PARSER = Parser.builder().build(); + + private HtmlRenderer buildRenderer(String defaultId, String prefix, String suffix) { + Extension ext = HeadingAnchorExtension.builder() + .defaultId(defaultId) + .idPrefix(prefix) + .idSuffix(suffix) + .build(); + return HtmlRenderer.builder() + .extensions(List.of(ext)) + .build(); + } + + @Test + public void testDefaultConfigurationHasNoAdditions() { + HtmlRenderer renderer = HtmlRenderer.builder() + .extensions(List.of(HeadingAnchorExtension.create())) + .build(); + assertThat(doRender(renderer, "# ")).isEqualTo("

\n"); + } + + @Test + public void testDefaultIdWhenNoTextOnHeader() { + HtmlRenderer renderer = buildRenderer("defid", "", ""); + assertThat(doRender(renderer, "# ")).isEqualTo("

\n"); + } + + @Test + public void testPrefixAddedToHeader() { + HtmlRenderer renderer = buildRenderer("", "pre-", ""); + assertThat(doRender(renderer, "# text")).isEqualTo("

text

\n"); + } + + @Test + public void testSuffixAddedToHeader() { + HtmlRenderer renderer = buildRenderer("", "", "-post"); + assertThat(doRender(renderer, "# text")).isEqualTo("

text

\n"); + } + + private String doRender(HtmlRenderer renderer, String text) { + return renderer.render(PARSER.parse(text)); + } + +} diff --git a/commonmark-ext-heading-anchor/src/test/java/org/commonmark/ext/heading/anchor/HeadingAnchorTest.java b/commonmark-ext-heading-anchor/src/test/java/org/commonmark/ext/heading/anchor/HeadingAnchorTest.java new file mode 100644 index 000000000..3149542e3 --- /dev/null +++ b/commonmark-ext-heading-anchor/src/test/java/org/commonmark/ext/heading/anchor/HeadingAnchorTest.java @@ -0,0 +1,96 @@ +package org.commonmark.ext.heading.anchor; + +import org.commonmark.Extension; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.junit.jupiter.api.Test; + +import java.util.Set; + +public class HeadingAnchorTest extends RenderingTestCase { + + private static final Set EXTENSIONS = Set.of(HeadingAnchorExtension.create()); + private static final Parser PARSER = Parser.builder().build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().extensions(EXTENSIONS).build(); + + @Test + public void baseCaseSingleHeader() { + assertRendering("# Heading here\n", + "

Heading here

\n"); + } + + @Test + public void singleHeaderWithCodeBlock() { + assertRendering("Hi there\n# Heading `here`\n", + "

Hi there

\n

Heading here

\n"); + } + + @Test + public void duplicateHeadersMakeUniqueIds() { + assertRendering("# Heading here\n# Heading here", + "

Heading here

\n

Heading here

\n"); + } + + @Test + public void testSupplementalDiacriticalMarks() { + assertRendering("# a\u1DC0", "

a\u1DC0

\n"); + } + + @Test + public void testUndertieUnicodeDisplayed() { + assertRendering("# undertie \u203F", "

undertie \u203F

\n"); + } + + @Test + public void testExplicitHeaderCollision() { + assertRendering("# Header\n# Header\n# Header-1", + "

Header

\n" + + "

Header

\n" + + "

Header-1

\n"); + } + + @Test + public void testCaseIsIgnoredWhenComparingIds() { + assertRendering("# HEADING here\n" + + "# heading here", + "

HEADING here

\n" + + "

heading here

\n"); + } + + @Test + public void testNestedBlocks() { + assertRendering("## `h` `e` **l** *l* o", + "

h e l l o

\n"); + } + + @Test + public void boldEmphasisCharacters() { + assertRendering("# _hello_ **there**", "

hello there

\n"); + } + + @Test + public void testStrongEmphasis() { + assertRendering("# _**Hi there**_", "

Hi there

\n"); + } + + @Test + public void testMultipleSpacesKept() { + assertRendering("# Hi There", "

Hi There

\n"); + } + + @Test + public void testNonAsciiCharacterHeading() { + assertRendering("# bär", "

bär

\n"); + } + + @Test + public void testCombiningDiaeresis() { + assertRendering("# Product\u036D\u036B", "

Product\u036D\u036B

\n"); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } +} diff --git a/commonmark-ext-image-attributes/pom.xml b/commonmark-ext-image-attributes/pom.xml new file mode 100644 index 000000000..e646bc3fd --- /dev/null +++ b/commonmark-ext-image-attributes/pom.xml @@ -0,0 +1,27 @@ + + + 4.0.0 + + org.commonmark + commonmark-parent + 0.28.1-SNAPSHOT + + + commonmark-ext-image-attributes + commonmark-java extension for image attributes + commonmark-java extension for adding attributes to images + + + + org.commonmark + commonmark + + + + org.commonmark + commonmark-test-util + test + + + + diff --git a/commonmark-ext-image-attributes/src/main/java/module-info.java b/commonmark-ext-image-attributes/src/main/java/module-info.java new file mode 100644 index 000000000..42d04a358 --- /dev/null +++ b/commonmark-ext-image-attributes/src/main/java/module-info.java @@ -0,0 +1,5 @@ +module org.commonmark.ext.image.attributes { + exports org.commonmark.ext.image.attributes; + + requires transitive org.commonmark; +} diff --git a/commonmark-ext-image-attributes/src/main/java/org/commonmark/ext/image/attributes/ImageAttributes.java b/commonmark-ext-image-attributes/src/main/java/org/commonmark/ext/image/attributes/ImageAttributes.java new file mode 100644 index 000000000..1ee43958b --- /dev/null +++ b/commonmark-ext-image-attributes/src/main/java/org/commonmark/ext/image/attributes/ImageAttributes.java @@ -0,0 +1,37 @@ +package org.commonmark.ext.image.attributes; + +import org.commonmark.node.CustomNode; +import org.commonmark.node.Delimited; + +import java.util.Map; + +/** + * A node containing text and other inline nodes as children. 
+ */ +public class ImageAttributes extends CustomNode implements Delimited { + + private final Map attributes; + + public ImageAttributes(Map attributes) { + this.attributes = attributes; + } + + @Override + public String getOpeningDelimiter() { + return "{"; + } + + @Override + public String getClosingDelimiter() { + return "}"; + } + + public Map getAttributes() { + return attributes; + } + + @Override + protected String toStringAttributes() { + return "imageAttributes=" + attributes; + } +} diff --git a/commonmark-ext-image-attributes/src/main/java/org/commonmark/ext/image/attributes/ImageAttributesExtension.java b/commonmark-ext-image-attributes/src/main/java/org/commonmark/ext/image/attributes/ImageAttributesExtension.java new file mode 100644 index 000000000..28c6abab2 --- /dev/null +++ b/commonmark-ext-image-attributes/src/main/java/org/commonmark/ext/image/attributes/ImageAttributesExtension.java @@ -0,0 +1,45 @@ +package org.commonmark.ext.image.attributes; + +import org.commonmark.Extension; +import org.commonmark.ext.image.attributes.internal.ImageAttributesAttributeProvider; +import org.commonmark.ext.image.attributes.internal.ImageAttributesDelimiterProcessor; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.AttributeProvider; +import org.commonmark.renderer.html.AttributeProviderContext; +import org.commonmark.renderer.html.AttributeProviderFactory; +import org.commonmark.renderer.html.HtmlRenderer; + +/** + * Extension for adding attributes to image nodes. + *

+ * Create it with {@link #create()} and then configure it on the builders + * ({@link org.commonmark.parser.Parser.Builder#extensions(Iterable)}, + * {@link HtmlRenderer.Builder#extensions(Iterable)}). + *

+ * + * @since 0.15.0 + */ +public class ImageAttributesExtension implements Parser.ParserExtension, HtmlRenderer.HtmlRendererExtension { + + private ImageAttributesExtension() { + } + + public static Extension create() { + return new ImageAttributesExtension(); + } + + @Override + public void extend(Parser.Builder parserBuilder) { + parserBuilder.customDelimiterProcessor(new ImageAttributesDelimiterProcessor()); + } + + @Override + public void extend(HtmlRenderer.Builder rendererBuilder) { + rendererBuilder.attributeProviderFactory(new AttributeProviderFactory() { + @Override + public AttributeProvider create(AttributeProviderContext context) { + return ImageAttributesAttributeProvider.create(); + } + }); + } +} diff --git a/commonmark-ext-image-attributes/src/main/java/org/commonmark/ext/image/attributes/internal/ImageAttributesAttributeProvider.java b/commonmark-ext-image-attributes/src/main/java/org/commonmark/ext/image/attributes/internal/ImageAttributesAttributeProvider.java new file mode 100644 index 000000000..edd9c4692 --- /dev/null +++ b/commonmark-ext-image-attributes/src/main/java/org/commonmark/ext/image/attributes/internal/ImageAttributesAttributeProvider.java @@ -0,0 +1,39 @@ +package org.commonmark.ext.image.attributes.internal; + +import org.commonmark.ext.image.attributes.ImageAttributes; +import org.commonmark.node.AbstractVisitor; +import org.commonmark.node.CustomNode; +import org.commonmark.node.Image; +import org.commonmark.node.Node; +import org.commonmark.renderer.html.AttributeProvider; + +import java.util.*; + +public class ImageAttributesAttributeProvider implements AttributeProvider { + + private ImageAttributesAttributeProvider() { + } + + public static ImageAttributesAttributeProvider create() { + return new ImageAttributesAttributeProvider(); + } + + @Override + public void setAttributes(Node node, String tagName, final Map attributes) { + if (node instanceof Image) { + node.accept(new AbstractVisitor() { + @Override + public void 
visit(CustomNode node) { + if (node instanceof ImageAttributes) { + ImageAttributes imageAttributes = (ImageAttributes) node; + for (Map.Entry entry : imageAttributes.getAttributes().entrySet()) { + attributes.put(entry.getKey(), entry.getValue()); + } + // Now that we have used the image attributes we remove the node. + imageAttributes.unlink(); + } + } + }); + } + } +} diff --git a/commonmark-ext-image-attributes/src/main/java/org/commonmark/ext/image/attributes/internal/ImageAttributesDelimiterProcessor.java b/commonmark-ext-image-attributes/src/main/java/org/commonmark/ext/image/attributes/internal/ImageAttributesDelimiterProcessor.java new file mode 100644 index 000000000..a335ccadc --- /dev/null +++ b/commonmark-ext-image-attributes/src/main/java/org/commonmark/ext/image/attributes/internal/ImageAttributesDelimiterProcessor.java @@ -0,0 +1,87 @@ +package org.commonmark.ext.image.attributes.internal; + +import org.commonmark.ext.image.attributes.ImageAttributes; +import org.commonmark.node.Image; +import org.commonmark.node.Node; +import org.commonmark.node.Nodes; +import org.commonmark.node.Text; +import org.commonmark.parser.delimiter.DelimiterProcessor; +import org.commonmark.parser.delimiter.DelimiterRun; + +import java.util.*; + +public class ImageAttributesDelimiterProcessor implements DelimiterProcessor { + + // Only allow a defined set of attributes to be used. 
+ private static final Set SUPPORTED_ATTRIBUTES = Set.of("width", "height"); + + @Override + public char getOpeningCharacter() { + return '{'; + } + + @Override + public char getClosingCharacter() { + return '}'; + } + + @Override + public int getMinLength() { + return 1; + } + + /** + * Turns the text between a single <code>{</code> and its closing <code>}</code> into an {@link ImageAttributes} + * node that is appended as a child of the directly preceding {@link Image}. + * + * The content is split on whitespace into {@code name=value} pairs. Names are compared case-insensitively + * against {@code SUPPORTED_ATTRIBUTES}, but stored with the case they were written in. + * + * @param openingRun the opening delimiter run; it must have length 1 + * @param closingRun the closing delimiter run + * @return 0, leaving all nodes untouched, if the opening run is longer than 1, the node before the opener is + * not an {@link Image}, a non-{@link Text} node lies between the delimiters, or any entry is not a supported + * {@code name=value} pair; otherwise 1 + */ + @Override + public int process(DelimiterRun openingRun, DelimiterRun closingRun) { + if (openingRun.length() != 1) { + return 0; + } + + // Check if the attributes can be applied - if the previous node is an Image, and if all the attributes are in + // the set of SUPPORTED_ATTRIBUTES + Text opener = openingRun.getOpener(); + Node nodeToStyle = opener.getPrevious(); + if (!(nodeToStyle instanceof Image)) { + return 0; + } + + List toUnlink = new ArrayList<>(); + StringBuilder content = new StringBuilder(); + + for (Node node : Nodes.between(opener, closingRun.getCloser())) { + // Only Text nodes can be used for attributes + if (node instanceof Text) { + content.append(((Text) node).getLiteral()); + toUnlink.add(node); + } else { + // This node type is not supported, so stop here (no need to check any further ones). + return 0; + } + } + + Map attributesMap = new LinkedHashMap<>(); + String attributes = content.toString(); + for (String s : attributes.split("\\s+")) { + String[] attribute = s.split("="); + // Lower-case with Locale.ROOT: with the default locale, "WIDTH" or "HEIGHT" would not match under a + // Turkish locale because "I" is lower-cased to a dotless i there. + if (attribute.length > 1 && SUPPORTED_ATTRIBUTES.contains(attribute[0].toLowerCase(Locale.ROOT))) { + attributesMap.put(attribute[0], attribute[1]); + } else { + // This attribute is not supported, so stop here (no need to check any further ones). + return 0; + } + } + + // Unlink the tmp nodes + for (Node node : toUnlink) { + node.unlink(); + } + + if (!attributesMap.isEmpty()) { + ImageAttributes imageAttributes = new ImageAttributes(attributesMap); + + // The new node is added as a child of the image node to which the attributes apply.
+ nodeToStyle.appendChild(imageAttributes); + } + + return 1; + } +} diff --git a/commonmark-ext-image-attributes/src/main/javadoc/overview.html b/commonmark-ext-image-attributes/src/main/javadoc/overview.html new file mode 100644 index 000000000..060597233 --- /dev/null +++ b/commonmark-ext-image-attributes/src/main/javadoc/overview.html @@ -0,0 +1,6 @@ + + +Extension for adding attributes to image nodes +

See {@link org.commonmark.ext.image.attributes.ImageAttributes}

+ + diff --git a/commonmark-ext-image-attributes/src/main/resources/META-INF/LICENSE.txt b/commonmark-ext-image-attributes/src/main/resources/META-INF/LICENSE.txt new file mode 100644 index 000000000..b09e367ce --- /dev/null +++ b/commonmark-ext-image-attributes/src/main/resources/META-INF/LICENSE.txt @@ -0,0 +1,23 @@ +Copyright (c) 2015, Atlassian Pty Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/commonmark-ext-image-attributes/src/test/java/org/commonmark/ext/image/attributes/ImageAttributesTest.java b/commonmark-ext-image-attributes/src/test/java/org/commonmark/ext/image/attributes/ImageAttributesTest.java new file mode 100644 index 000000000..3edf8497e --- /dev/null +++ b/commonmark-ext-image-attributes/src/test/java/org/commonmark/ext/image/attributes/ImageAttributesTest.java @@ -0,0 +1,141 @@ +package org.commonmark.ext.image.attributes; + +import org.commonmark.Extension; +import org.commonmark.node.Node; +import org.commonmark.node.Paragraph; +import org.commonmark.node.SourceSpan; +import org.commonmark.parser.IncludeSourceSpans; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; + +public class ImageAttributesTest extends RenderingTestCase { + + private static final Set EXTENSIONS = Set.of(ImageAttributesExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().extensions(EXTENSIONS).build(); + + @Test + public void baseCase() { + assertRendering("![text](/url.png){height=5}", + "

\"text\"

\n"); + + assertRendering("![text](/url.png){height=5 width=6}", + "

\"text\"

\n"); + + assertRendering("![text](/url.png){height=99px width=100px}", + "

\"text\"

\n"); + + assertRendering("![text](/url.png){width=100 height=100}", + "

\"text\"

\n"); + + assertRendering("![text](/url.png){height=4.8 width=3.14}", + "

\"text\"

\n"); + + assertRendering("![text](/url.png){Width=18 HeIgHt=1001}", + "

\"text\"

\n"); + + assertRendering("![text](/url.png){height=green width=blue}", + "

\"text\"

\n"); + } + + @Test + public void doubleDelimiters() { + assertRendering("![text](/url.png){{height=5}}", + "

\"text\"{{height=5}}

\n"); + } + + @Test + public void mismatchingDelimitersAreIgnored() { + assertRendering("![text](/url.png){", "

\"text\"{

\n"); + } + + @Test + public void unsupportedStyleNamesAreLeftUnchanged() { + assertRendering("![text](/url.png){j=502 K=101 img=2 url=5}", + "

\"text\"{j=502 K=101 img=2 url=5}

\n"); + assertRendering("![foo](/url.png){height=3 invalid}\n", + "

\"foo\"{height=3 invalid}

\n"); + assertRendering("![foo](/url.png){height=3 *test*}\n", + "

\"foo\"{height=3 test}

\n"); + } + + @Test + public void styleWithNoValueIsIgnored() { + assertRendering("![text](/url.png){height}", + "

\"text\"{height}

\n"); + } + + @Test + public void repeatedStyleNameUsesFinalOne() { + assertRendering("![text](/url.png){height=4 height=5 width=1 height=6}", + "

\"text\"

\n"); + } + + @Test + public void styleValuesAreEscaped() { + assertRendering("![text](/url.png){height=\"text\"

\n"); + assertRendering("![text](/url.png){height=\"\"img}", + "

\"text\"

\n"); + } + + @Test + public void imageAltTextWithSpaces() { + assertRendering("![Android SDK Manager](/contrib/android-sdk-manager.png){height=502 width=101}", + "

\"Android

\n"); + } + + @Test + public void imageAltTextWithSoftLineBreak() { + assertRendering("![foo\nbar](/url){height=101 width=202}\n", + "

\"foo\nbar\"

\n"); + } + + @Test + public void imageAltTextWithHardLineBreak() { + assertRendering("![foo \nbar](/url){height=506 width=1}\n", + "

\"foo\nbar\"

\n"); + } + + @Test + public void imageAltTextWithEntities() { + assertRendering("![foo ä](/url){height=99 width=100}\n", + "

\"foo

\n"); + } + + @Test + public void textNodesAreUnchanged() { + assertRendering("x{height=3 width=4}\n", "

x{height=3 width=4}

\n"); + assertRendering("x {height=3 width=4}\n", "

x {height=3 width=4}

\n"); + assertRendering("\\documentclass[12pt]{article}\n", "

\\documentclass[12pt]{article}

\n"); + assertRendering("some *text*{height=3 width=4}\n", "

some text{height=3 width=4}

\n"); + assertRendering("{NN} text", "

{NN} text

\n"); + assertRendering("{}", "

{}

\n"); + } + + @Test + public void sourceSpans() { + Parser parser = Parser.builder() + .extensions(EXTENSIONS) + .includeSourceSpans(IncludeSourceSpans.BLOCKS_AND_INLINES) + .build(); + + // This doesn't result in image attributes, so source spans should be for the single (merged) text node. + Node document = parser.parse("x{height=3 width=4}\n"); + Paragraph block = (Paragraph) document.getFirstChild(); + Node text = block.getFirstChild(); + assertThat(text.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 19))); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } +} diff --git a/commonmark-ext-ins/pom.xml b/commonmark-ext-ins/pom.xml new file mode 100644 index 000000000..48481c073 --- /dev/null +++ b/commonmark-ext-ins/pom.xml @@ -0,0 +1,27 @@ + + + 4.0.0 + + org.commonmark + commonmark-parent + 0.28.1-SNAPSHOT + + + commonmark-ext-ins + commonmark-java extension for <ins> (underline) + commonmark-java extension for <ins> using ++ + + + + org.commonmark + commonmark + + + + org.commonmark + commonmark-test-util + test + + + + diff --git a/commonmark-ext-ins/src/main/java/module-info.java b/commonmark-ext-ins/src/main/java/module-info.java new file mode 100644 index 000000000..fb96ea598 --- /dev/null +++ b/commonmark-ext-ins/src/main/java/module-info.java @@ -0,0 +1,5 @@ +module org.commonmark.ext.ins { + exports org.commonmark.ext.ins; + + requires transitive org.commonmark; +} diff --git a/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/Ins.java b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/Ins.java new file mode 100644 index 000000000..2ebd4f5ca --- /dev/null +++ b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/Ins.java @@ -0,0 +1,22 @@ +package org.commonmark.ext.ins; + +import org.commonmark.node.CustomNode; +import org.commonmark.node.Delimited; + +/** + * An ins node containing text and other inline nodes as children. 
+ */ +public class Ins extends CustomNode implements Delimited { + + private static final String DELIMITER = "++"; + + @Override + public String getOpeningDelimiter() { + return DELIMITER; + } + + @Override + public String getClosingDelimiter() { + return DELIMITER; + } +} diff --git a/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/InsExtension.java b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/InsExtension.java new file mode 100644 index 000000000..e8a53e59a --- /dev/null +++ b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/InsExtension.java @@ -0,0 +1,83 @@ +package org.commonmark.ext.ins; + +import org.commonmark.Extension; +import org.commonmark.ext.ins.internal.InsDelimiterProcessor; +import org.commonmark.ext.ins.internal.InsHtmlNodeRenderer; +import org.commonmark.ext.ins.internal.InsMarkdownNodeRenderer; +import org.commonmark.ext.ins.internal.InsTextContentNodeRenderer; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.html.HtmlNodeRendererContext; +import org.commonmark.renderer.html.HtmlNodeRendererFactory; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.renderer.markdown.MarkdownNodeRendererContext; +import org.commonmark.renderer.markdown.MarkdownNodeRendererFactory; +import org.commonmark.renderer.markdown.MarkdownRenderer; +import org.commonmark.renderer.text.TextContentNodeRendererContext; +import org.commonmark.renderer.text.TextContentNodeRendererFactory; +import org.commonmark.renderer.text.TextContentRenderer; + +import java.util.Set; + +/** + * Extension for ins using ++ + *

<p> + * Create it with {@link #create()} and then configure it on the builders + * ({@link org.commonmark.parser.Parser.Builder#extensions(Iterable)}, + * {@link HtmlRenderer.Builder#extensions(Iterable)}). + * </p> + * <p> + * The parsed ins text regions are turned into {@link Ins} nodes. + * </p>

+ */ +public class InsExtension implements Parser.ParserExtension, HtmlRenderer.HtmlRendererExtension, TextContentRenderer.TextContentRendererExtension, MarkdownRenderer.MarkdownRendererExtension { + + private InsExtension() { + } + + public static Extension create() { + return new InsExtension(); + } + + @Override + public void extend(Parser.Builder parserBuilder) { + parserBuilder.customDelimiterProcessor(new InsDelimiterProcessor()); + } + + @Override + public void extend(HtmlRenderer.Builder rendererBuilder) { + rendererBuilder.nodeRendererFactory(new HtmlNodeRendererFactory() { + @Override + public NodeRenderer create(HtmlNodeRendererContext context) { + return new InsHtmlNodeRenderer(context); + } + }); + } + + @Override + public void extend(TextContentRenderer.Builder rendererBuilder) { + rendererBuilder.nodeRendererFactory(new TextContentNodeRendererFactory() { + @Override + public NodeRenderer create(TextContentNodeRendererContext context) { + return new InsTextContentNodeRenderer(context); + } + }); + } + + @Override + public void extend(MarkdownRenderer.Builder rendererBuilder) { + rendererBuilder.nodeRendererFactory(new MarkdownNodeRendererFactory() { + @Override + public NodeRenderer create(MarkdownNodeRendererContext context) { + return new InsMarkdownNodeRenderer(context); + } + + @Override + public Set getSpecialCharacters() { + // We technically don't need to escape single occurrences of +, but that's all the extension API + // exposes currently. 
+ return Set.of('+'); + } + }); + } +} diff --git a/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsDelimiterProcessor.java b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsDelimiterProcessor.java new file mode 100644 index 000000000..b0bfb4c6e --- /dev/null +++ b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsDelimiterProcessor.java @@ -0,0 +1,56 @@ +package org.commonmark.ext.ins.internal; + +import org.commonmark.ext.ins.Ins; +import org.commonmark.node.Node; +import org.commonmark.node.Nodes; +import org.commonmark.node.SourceSpans; +import org.commonmark.node.Text; +import org.commonmark.parser.delimiter.DelimiterProcessor; +import org.commonmark.parser.delimiter.DelimiterRun; + +public class InsDelimiterProcessor implements DelimiterProcessor { + + @Override + public char getOpeningCharacter() { + return '+'; + } + + @Override + public char getClosingCharacter() { + return '+'; + } + + @Override + public int getMinLength() { + return 2; + } + + @Override + public int process(DelimiterRun openingRun, DelimiterRun closingRun) { + if (openingRun.length() >= 2 && closingRun.length() >= 2) { + // Use exactly two delimiters even if we have more, and don't care about internal openers/closers. + + Text opener = openingRun.getOpener(); + + // Wrap nodes between delimiters in ins. 
+ Node ins = new Ins(); + + SourceSpans sourceSpans = new SourceSpans(); + sourceSpans.addAllFrom(openingRun.getOpeners(2)); + + for (Node node : Nodes.between(opener, closingRun.getCloser())) { + ins.appendChild(node); + sourceSpans.addAll(node.getSourceSpans()); + } + + sourceSpans.addAllFrom(closingRun.getClosers(2)); + ins.setSourceSpans(sourceSpans.getSourceSpans()); + + opener.insertAfter(ins); + + return 2; + } else { + return 0; + } + } +} diff --git a/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsHtmlNodeRenderer.java b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsHtmlNodeRenderer.java new file mode 100644 index 000000000..dcd05fd59 --- /dev/null +++ b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsHtmlNodeRenderer.java @@ -0,0 +1,35 @@ +package org.commonmark.ext.ins.internal; + +import org.commonmark.node.Node; +import org.commonmark.renderer.html.HtmlNodeRendererContext; +import org.commonmark.renderer.html.HtmlWriter; + +import java.util.Map; + +public class InsHtmlNodeRenderer extends InsNodeRenderer { + + private final HtmlNodeRendererContext context; + private final HtmlWriter html; + + public InsHtmlNodeRenderer(HtmlNodeRendererContext context) { + this.context = context; + this.html = context.getWriter(); + } + + @Override + public void render(Node node) { + Map attributes = context.extendAttributes(node, "ins", Map.of()); + html.tag("ins", attributes); + renderChildren(node); + html.tag("/ins"); + } + + private void renderChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + context.render(node); + node = next; + } + } +} diff --git a/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsMarkdownNodeRenderer.java b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsMarkdownNodeRenderer.java new file mode 100644 index 000000000..851d47282 --- /dev/null +++ 
b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsMarkdownNodeRenderer.java @@ -0,0 +1,32 @@ +package org.commonmark.ext.ins.internal; + +import org.commonmark.node.Node; +import org.commonmark.renderer.markdown.MarkdownNodeRendererContext; +import org.commonmark.renderer.markdown.MarkdownWriter; + +public class InsMarkdownNodeRenderer extends InsNodeRenderer { + + private final MarkdownNodeRendererContext context; + private final MarkdownWriter writer; + + public InsMarkdownNodeRenderer(MarkdownNodeRendererContext context) { + this.context = context; + this.writer = context.getWriter(); + } + + @Override + public void render(Node node) { + writer.raw("++"); + renderChildren(node); + writer.raw("++"); + } + + private void renderChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + context.render(node); + node = next; + } + } +} diff --git a/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsNodeRenderer.java b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsNodeRenderer.java new file mode 100644 index 000000000..31f0a64ec --- /dev/null +++ b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsNodeRenderer.java @@ -0,0 +1,15 @@ +package org.commonmark.ext.ins.internal; + +import org.commonmark.ext.ins.Ins; +import org.commonmark.node.Node; +import org.commonmark.renderer.NodeRenderer; + +import java.util.Set; + +abstract class InsNodeRenderer implements NodeRenderer { + + @Override + public Set> getNodeTypes() { + return Set.of(Ins.class); + } +} diff --git a/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsTextContentNodeRenderer.java b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsTextContentNodeRenderer.java new file mode 100644 index 000000000..f30947c93 --- /dev/null +++ b/commonmark-ext-ins/src/main/java/org/commonmark/ext/ins/internal/InsTextContentNodeRenderer.java @@ -0,0 +1,27 @@ 
+package org.commonmark.ext.ins.internal; + +import org.commonmark.node.Node; +import org.commonmark.renderer.text.TextContentNodeRendererContext; + +public class InsTextContentNodeRenderer extends InsNodeRenderer { + + private final TextContentNodeRendererContext context; + + public InsTextContentNodeRenderer(TextContentNodeRendererContext context) { + this.context = context; + } + + @Override + public void render(Node node) { + renderChildren(node); + } + + private void renderChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + context.render(node); + node = next; + } + } +} diff --git a/commonmark-ext-ins/src/main/javadoc/overview.html b/commonmark-ext-ins/src/main/javadoc/overview.html new file mode 100644 index 000000000..6dad34d3f --- /dev/null +++ b/commonmark-ext-ins/src/main/javadoc/overview.html @@ -0,0 +1,6 @@ + + +Extension for ins (underline) using ++ +

See {@link org.commonmark.ext.ins.InsExtension}

+ + diff --git a/commonmark-ext-ins/src/main/resources/META-INF/LICENSE.txt b/commonmark-ext-ins/src/main/resources/META-INF/LICENSE.txt new file mode 100644 index 000000000..b09e367ce --- /dev/null +++ b/commonmark-ext-ins/src/main/resources/META-INF/LICENSE.txt @@ -0,0 +1,23 @@ +Copyright (c) 2015, Atlassian Pty Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/commonmark-ext-ins/src/test/java/org/commonmark/ext/ins/InsMarkdownRendererTest.java b/commonmark-ext-ins/src/test/java/org/commonmark/ext/ins/InsMarkdownRendererTest.java new file mode 100644 index 000000000..6fc9ead67 --- /dev/null +++ b/commonmark-ext-ins/src/test/java/org/commonmark/ext/ins/InsMarkdownRendererTest.java @@ -0,0 +1,33 @@ +package org.commonmark.ext.ins; + +import org.commonmark.Extension; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.markdown.MarkdownRenderer; +import org.junit.jupiter.api.Test; + +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; + +public class InsMarkdownRendererTest { + + private static final Set EXTENSIONS = Set.of(InsExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final MarkdownRenderer RENDERER = MarkdownRenderer.builder().extensions(EXTENSIONS).build(); + + @Test + public void testStrikethrough() { + assertRoundTrip("++foo++\n"); + + assertRoundTrip("\\+\\+foo\\+\\+\n"); + } + + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } + + private void assertRoundTrip(String input) { + String rendered = render(input); + assertThat(rendered).isEqualTo(input); + } +} diff --git a/commonmark-ext-ins/src/test/java/org/commonmark/ext/ins/InsTest.java b/commonmark-ext-ins/src/test/java/org/commonmark/ext/ins/InsTest.java new file mode 100644 index 000000000..a5c91a395 --- /dev/null +++ b/commonmark-ext-ins/src/test/java/org/commonmark/ext/ins/InsTest.java @@ -0,0 +1,112 @@ +package org.commonmark.ext.ins; + +import org.commonmark.Extension; +import org.commonmark.node.Node; +import org.commonmark.node.Paragraph; +import org.commonmark.node.SourceSpan; +import org.commonmark.parser.IncludeSourceSpans; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.renderer.text.TextContentRenderer; 
+import org.commonmark.testutil.RenderingTestCase; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; + +public class InsTest extends RenderingTestCase { + + private static final Set EXTENSIONS = Set.of(InsExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().extensions(EXTENSIONS).build(); + private static final TextContentRenderer CONTENT_RENDERER = TextContentRenderer.builder() + .extensions(EXTENSIONS).build(); + + @Test + public void onePlusIsNotEnough() { + assertRendering("+foo+", "

+foo+

\n"); + } + + @Test + public void twoPlusesYay() { + assertRendering("++foo++", "

foo

\n"); + } + + @Test + public void fourPlusesNope() { + assertRendering("foo ++++", "

foo ++++

\n"); + } + + @Test + public void unmatched() { + assertRendering("++foo", "

++foo

\n"); + assertRendering("foo++", "

foo++

\n"); + } + + @Test + public void threeInnerThree() { + assertRendering("+++foo+++", "

+foo+

\n"); + } + + @Test + public void twoInnerThree() { + assertRendering("++foo+++", "

foo+

\n"); + } + + @Test + public void plusesInside() { + assertRendering("++foo+bar++", "

foo+bar

\n"); + assertRendering("++foo++bar++", "

foobar++

\n"); + assertRendering("++foo+++bar++", "

foo+bar++

\n"); + assertRendering("++foo++++bar++", "

foobar

\n"); + assertRendering("++foo+++++bar++", "

foo+bar

\n"); + assertRendering("++foo++++++bar++", "

foo++bar

\n"); + assertRendering("++foo+++++++bar++", "

foo+++bar

\n"); + } + + @Test + public void insWholeParagraphWithOtherDelimiters() { + assertRendering("++Paragraph with *emphasis* and __strong emphasis__++", + "

Paragraph with emphasis and strong emphasis

\n"); + } + + @Test + public void insideBlockQuote() { + assertRendering("> underline ++that++", + "
\n

underline that

\n
\n"); + } + + @Test + public void delimited() { + Node document = PARSER.parse("++foo++"); + Ins ins = (Ins) document.getFirstChild().getFirstChild(); + assertThat(ins.getOpeningDelimiter()).isEqualTo("++"); + assertThat(ins.getClosingDelimiter()).isEqualTo("++"); + } + + @Test + public void textContentRenderer() { + Node document = PARSER.parse("++foo++"); + assertThat(CONTENT_RENDERER.render(document)).isEqualTo("foo"); + } + + @Test + public void sourceSpans() { + Parser parser = Parser.builder() + .extensions(EXTENSIONS) + .includeSourceSpans(IncludeSourceSpans.BLOCKS_AND_INLINES) + .build(); + + Node document = parser.parse("hey ++there++\n"); + Paragraph block = (Paragraph) document.getFirstChild(); + Node ins = block.getLastChild(); + assertThat(ins.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 4, 4, 9))); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } +} diff --git a/commonmark-ext-task-list-items/pom.xml b/commonmark-ext-task-list-items/pom.xml new file mode 100644 index 000000000..4359f8707 --- /dev/null +++ b/commonmark-ext-task-list-items/pom.xml @@ -0,0 +1,27 @@ + + + 4.0.0 + + org.commonmark + commonmark-parent + 0.28.1-SNAPSHOT + + + commonmark-ext-task-list-items + commonmark-java extension for task list items + commonmark-java extension for task list items + + + + org.commonmark + commonmark + + + + org.commonmark + commonmark-test-util + test + + + + diff --git a/commonmark-ext-task-list-items/src/main/java/module-info.java b/commonmark-ext-task-list-items/src/main/java/module-info.java new file mode 100644 index 000000000..9528323ea --- /dev/null +++ b/commonmark-ext-task-list-items/src/main/java/module-info.java @@ -0,0 +1,5 @@ +module org.commonmark.ext.task.list.items { + exports org.commonmark.ext.task.list.items; + + requires transitive org.commonmark; +} diff --git a/commonmark-ext-task-list-items/src/main/java/org/commonmark/ext/task/list/items/TaskListItemMarker.java 
b/commonmark-ext-task-list-items/src/main/java/org/commonmark/ext/task/list/items/TaskListItemMarker.java new file mode 100644 index 000000000..9eca59bc9 --- /dev/null +++ b/commonmark-ext-task-list-items/src/main/java/org/commonmark/ext/task/list/items/TaskListItemMarker.java @@ -0,0 +1,19 @@ +package org.commonmark.ext.task.list.items; + +import org.commonmark.node.CustomNode; + +/** + * A marker node indicating that a list item contains a task. + */ +public class TaskListItemMarker extends CustomNode { + + private final boolean checked; + + public TaskListItemMarker(boolean checked) { + this.checked = checked; + } + + public boolean isChecked() { + return checked; + } +} diff --git a/commonmark-ext-task-list-items/src/main/java/org/commonmark/ext/task/list/items/TaskListItemsExtension.java b/commonmark-ext-task-list-items/src/main/java/org/commonmark/ext/task/list/items/TaskListItemsExtension.java new file mode 100644 index 000000000..9bf0a2155 --- /dev/null +++ b/commonmark-ext-task-list-items/src/main/java/org/commonmark/ext/task/list/items/TaskListItemsExtension.java @@ -0,0 +1,45 @@ +package org.commonmark.ext.task.list.items; + +import org.commonmark.Extension; +import org.commonmark.ext.task.list.items.internal.TaskListItemHtmlNodeRenderer; +import org.commonmark.ext.task.list.items.internal.TaskListItemPostProcessor; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.html.HtmlNodeRendererContext; +import org.commonmark.renderer.html.HtmlNodeRendererFactory; +import org.commonmark.renderer.html.HtmlRenderer; + +/** + * Extension for adding task list items. + *

<p> + * Create it with {@link #create()} and then configure it on the builders + * ({@link org.commonmark.parser.Parser.Builder#extensions(Iterable)}, + * {@link HtmlRenderer.Builder#extensions(Iterable)}). + * </p>

+ * + * @since 0.15.0 + */ +public class TaskListItemsExtension implements Parser.ParserExtension, HtmlRenderer.HtmlRendererExtension { + + private TaskListItemsExtension() { + } + + public static Extension create() { + return new TaskListItemsExtension(); + } + + @Override + public void extend(Parser.Builder parserBuilder) { + parserBuilder.postProcessor(new TaskListItemPostProcessor()); + } + + @Override + public void extend(HtmlRenderer.Builder rendererBuilder) { + rendererBuilder.nodeRendererFactory(new HtmlNodeRendererFactory() { + @Override + public NodeRenderer create(HtmlNodeRendererContext context) { + return new TaskListItemHtmlNodeRenderer(context); + } + }); + } +} diff --git a/commonmark-ext-task-list-items/src/main/java/org/commonmark/ext/task/list/items/internal/TaskListItemHtmlNodeRenderer.java b/commonmark-ext-task-list-items/src/main/java/org/commonmark/ext/task/list/items/internal/TaskListItemHtmlNodeRenderer.java new file mode 100644 index 000000000..331b301e9 --- /dev/null +++ b/commonmark-ext-task-list-items/src/main/java/org/commonmark/ext/task/list/items/internal/TaskListItemHtmlNodeRenderer.java @@ -0,0 +1,52 @@ +package org.commonmark.ext.task.list.items.internal; + +import org.commonmark.ext.task.list.items.TaskListItemMarker; +import org.commonmark.node.Node; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.html.HtmlNodeRendererContext; +import org.commonmark.renderer.html.HtmlWriter; + +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; + +public class TaskListItemHtmlNodeRenderer implements NodeRenderer { + + private final HtmlNodeRendererContext context; + private final HtmlWriter html; + + public TaskListItemHtmlNodeRenderer(HtmlNodeRendererContext context) { + this.context = context; + this.html = context.getWriter(); + } + + @Override + public Set> getNodeTypes() { + return Set.of(TaskListItemMarker.class); + } + + @Override + public void render(Node node) { + if (node 
instanceof TaskListItemMarker) { + Map attributes = new LinkedHashMap<>(); + attributes.put("type", "checkbox"); + attributes.put("disabled", ""); + if (((TaskListItemMarker) node).isChecked()) { + attributes.put("checked", ""); + } + html.tag("input", context.extendAttributes(node, "input", attributes)); + // Add a space after the input tag (as the next text node has been trimmed) + html.text(" "); + renderChildren(node); + } + } + + private void renderChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + context.render(node); + node = next; + } + } +} diff --git a/commonmark-ext-task-list-items/src/main/java/org/commonmark/ext/task/list/items/internal/TaskListItemPostProcessor.java b/commonmark-ext-task-list-items/src/main/java/org/commonmark/ext/task/list/items/internal/TaskListItemPostProcessor.java new file mode 100644 index 000000000..b95c2e30d --- /dev/null +++ b/commonmark-ext-task-list-items/src/main/java/org/commonmark/ext/task/list/items/internal/TaskListItemPostProcessor.java @@ -0,0 +1,49 @@ +package org.commonmark.ext.task.list.items.internal; + +import org.commonmark.ext.task.list.items.TaskListItemMarker; +import org.commonmark.node.*; +import org.commonmark.parser.PostProcessor; + +import java.util.Objects; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class TaskListItemPostProcessor implements PostProcessor { + + private static final Pattern REGEX_TASK_LIST_ITEM = Pattern.compile("^\\[([xX\\s])]\\s+(.*)"); + + @Override + public Node process(Node node) { + TaskListItemVisitor visitor = new TaskListItemVisitor(); + node.accept(visitor); + return node; + } + + private static class TaskListItemVisitor extends AbstractVisitor { + + @Override + public void visit(ListItem listItem) { + Node child = listItem.getFirstChild(); + if (child instanceof Paragraph) { + Node node = child.getFirstChild(); + if (node instanceof Text) { + Text textNode = (Text) node; + 
Matcher matcher = REGEX_TASK_LIST_ITEM.matcher(textNode.getLiteral()); + if (matcher.matches()) { + String checked = matcher.group(1); + boolean isChecked = Objects.equals(checked, "X") || Objects.equals(checked, "x"); + + // Add the task list item marker node as the first child of the list item. + listItem.prependChild(new TaskListItemMarker(isChecked)); + + // Parse the node using the input after the task marker (in other words, group 2 from the matcher). + // (Note that the String has been trimmed, so we should add a space between the + // TaskListItemMarker and the text that follows it when we come to render it). + textNode.setLiteral(matcher.group(2)); + } + } + } + visitChildren(listItem); + } + } +} diff --git a/commonmark-ext-task-list-items/src/main/resources/META-INF/LICENSE.txt b/commonmark-ext-task-list-items/src/main/resources/META-INF/LICENSE.txt new file mode 100644 index 000000000..b09e367ce --- /dev/null +++ b/commonmark-ext-task-list-items/src/main/resources/META-INF/LICENSE.txt @@ -0,0 +1,23 @@ +Copyright (c) 2015, Atlassian Pty Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/commonmark-ext-task-list-items/src/test/java/org/commonmark/ext/task/list/items/TaskListItemsTest.java b/commonmark-ext-task-list-items/src/test/java/org/commonmark/ext/task/list/items/TaskListItemsTest.java new file mode 100644 index 000000000..0adc615a7 --- /dev/null +++ b/commonmark-ext-task-list-items/src/test/java/org/commonmark/ext/task/list/items/TaskListItemsTest.java @@ -0,0 +1,102 @@ +package org.commonmark.ext.task.list.items; + +import org.commonmark.Extension; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.junit.jupiter.api.Test; + +import java.util.Set; + +public class TaskListItemsTest extends RenderingTestCase { + + private static final Set EXTENSIONS = Set.of(TaskListItemsExtension.create()); + private static final String HTML_CHECKED = ""; + private static final String HTML_UNCHECKED = ""; + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().extensions(EXTENSIONS).build(); + + @Test + public void baseCase() { + assertRendering("- [x] this is *done*\n", "
    \n
  • " + HTML_CHECKED + " this is done
  • \n
\n"); + + assertRendering("- [ ] do this\n", "
    \n
  • " + HTML_UNCHECKED + " do this
  • \n
\n"); + + assertRendering("- [x] foo\n" + + " - [ ] bar\n" + + " - [x] baz\n" + + "- [ ] bim", + "
    \n" + + "
  • " + HTML_CHECKED + " foo\n" + + "
      \n" + + "
    • " + HTML_UNCHECKED + " bar
    • \n" + + "
    • " + HTML_CHECKED + " baz
    • \n" + + "
    \n" + + "
  • \n" + + "
  • " + HTML_UNCHECKED + " bim
  • \n" + + "
\n"); + + assertRendering("* [ ] do this\n* [ ] and this", + "
    \n
  • " + HTML_UNCHECKED + " do this
  • \n
  • " + HTML_UNCHECKED + " and this
  • \n
\n"); + + assertRendering("+ [x] one\n" + + " - [ ] two\n" + + " * [x] three\n", + "
    \n" + + "
  • " + HTML_CHECKED + " one\n" + + "
      \n" + + "
    • " + HTML_UNCHECKED + " two\n" + + "
        \n" + + "
      • " + HTML_CHECKED + " three
      • \n" + + "
      \n" + + "
    • \n" + + "
    \n" + + "
  • \n" + + "
\n"); + + assertRendering("TODO list\n" + + "---------\n" + + "- [ ] first task\n" + + "- [x] second task\n" + + "- [ ] third task\n\n" + + "Let me know when you are finished", + "

TODO list

\n" + + "
    \n" + + "
  • " + HTML_UNCHECKED + " first task
  • \n" + + "
  • " + HTML_CHECKED + " second task
  • \n" + + "
  • " + HTML_UNCHECKED + " third task
  • \n" + + "
\n" + + "

Let me know when you are finished

\n"); + } + + @Test + public void notListItem() { + assertRendering("[x] this is not a task\n", "

[x] this is not a task

\n"); + assertRendering(" [ ] this is not a task either\n", "

[ ] this is not a task either

\n"); + } + + @Test + public void notValidTaskFormat() { + assertRendering("- [x]no space\n", "
    \n
  • [x]no space
  • \n
\n"); + assertRendering("- [O] is not a _task_\n", "
    \n
  • [O] is not a task
  • \n
\n"); + assertRendering("* [] neither is this\n", "
    \n
  • [] neither is this
  • \n
\n"); + assertRendering("* [ ] nor this\n" + + "* [XX] nor this\n", + "
    \n
  • [ ] nor this
  • \n
  • [XX] nor this
  • \n
\n"); + assertRendering("+ [x]] is not a task\n", "
    \n
  • [x]] is not a task
  • \n
\n"); + assertRendering("- [x isn't\n", "
    \n
  • [x isn't
  • \n
\n"); + assertRendering("- [[x is not\n", "
    \n
  • [[x is not
  • \n
\n"); + assertRendering("- x] nope\n", "
    \n
  • x] nope
  • \n
\n"); + assertRendering("- x]] no way\n", "
    \n
  • x]] no way
  • \n
\n"); + assertRendering("+ (x) sorry no\n", "
    \n
  • (x) sorry no
  • \n
\n"); + assertRendering("+ {x} sorry not sorry\n", "
    \n
  • {x} sorry not sorry
  • \n
\n"); + assertRendering("+ [[x]] nooo\n", "
    \n
  • [[x]] nooo
  • \n
\n"); + assertRendering("+ text before [x] is not a task\n", "
    \n
  • text before [x] is not a task
  • \n
\n"); + assertRendering("* [x] \n* [ ] \n", "
    \n
  • [x]
  • \n
  • [ ]
  • \n
\n"); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } +} diff --git a/commonmark-ext-yaml-front-matter/pom.xml b/commonmark-ext-yaml-front-matter/pom.xml new file mode 100644 index 000000000..e6822f771 --- /dev/null +++ b/commonmark-ext-yaml-front-matter/pom.xml @@ -0,0 +1,27 @@ + + + 4.0.0 + + commonmark-parent + org.commonmark + 0.28.1-SNAPSHOT + + + commonmark-ext-yaml-front-matter + commonmark-java extension for YAML front matter + commonmark-java extension for YAML front matter + + + + org.commonmark + commonmark + + + + org.commonmark + commonmark-test-util + test + + + + diff --git a/commonmark-ext-yaml-front-matter/src/main/java/module-info.java b/commonmark-ext-yaml-front-matter/src/main/java/module-info.java new file mode 100644 index 000000000..5f96c14ad --- /dev/null +++ b/commonmark-ext-yaml-front-matter/src/main/java/module-info.java @@ -0,0 +1,5 @@ +module org.commonmark.ext.front.matter { + exports org.commonmark.ext.front.matter; + + requires transitive org.commonmark; +} diff --git a/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/YamlFrontMatterBlock.java b/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/YamlFrontMatterBlock.java new file mode 100644 index 000000000..0d9aba2d3 --- /dev/null +++ b/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/YamlFrontMatterBlock.java @@ -0,0 +1,6 @@ +package org.commonmark.ext.front.matter; + +import org.commonmark.node.CustomBlock; + +public class YamlFrontMatterBlock extends CustomBlock { +} diff --git a/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/YamlFrontMatterExtension.java b/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/YamlFrontMatterExtension.java new file mode 100644 index 000000000..7a2c9f9f5 --- /dev/null +++ 
b/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/YamlFrontMatterExtension.java @@ -0,0 +1,32 @@ +package org.commonmark.ext.front.matter; + +import org.commonmark.Extension; +import org.commonmark.ext.front.matter.internal.YamlFrontMatterBlockParser; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; + +/** + * Extension for YAML-like metadata. + *

+ * Create it with {@link #create()} and then configure it on the builders + * ({@link org.commonmark.parser.Parser.Builder#extensions(Iterable)}, + * {@link HtmlRenderer.Builder#extensions(Iterable)}). + *

+ *

+ * The parsed metadata is turned into {@link YamlFrontMatterNode}. You can access the metadata using {@link YamlFrontMatterVisitor}. + *

+ */ +public class YamlFrontMatterExtension implements Parser.ParserExtension { + + private YamlFrontMatterExtension() { + } + + @Override + public void extend(Parser.Builder parserBuilder) { + parserBuilder.customBlockParserFactory(new YamlFrontMatterBlockParser.Factory()); + } + + public static Extension create() { + return new YamlFrontMatterExtension(); + } +} diff --git a/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/YamlFrontMatterNode.java b/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/YamlFrontMatterNode.java new file mode 100644 index 000000000..20eb3baf7 --- /dev/null +++ b/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/YamlFrontMatterNode.java @@ -0,0 +1,31 @@ +package org.commonmark.ext.front.matter; + +import org.commonmark.node.CustomNode; + +import java.util.List; + +public class YamlFrontMatterNode extends CustomNode { + private String key; + private List values; + + public YamlFrontMatterNode(String key, List values) { + this.key = key; + this.values = values; + } + + public String getKey() { + return key; + } + + public void setKey(String key) { + this.key = key; + } + + public List getValues() { + return values; + } + + public void setValues(List values) { + this.values = values; + } +} diff --git a/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/YamlFrontMatterVisitor.java b/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/YamlFrontMatterVisitor.java new file mode 100644 index 000000000..1c23966f5 --- /dev/null +++ b/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/YamlFrontMatterVisitor.java @@ -0,0 +1,29 @@ +package org.commonmark.ext.front.matter; + +import org.commonmark.node.AbstractVisitor; +import org.commonmark.node.CustomNode; + +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +public class YamlFrontMatterVisitor 
extends AbstractVisitor { + private Map> data; + + public YamlFrontMatterVisitor() { + data = new LinkedHashMap<>(); + } + + @Override + public void visit(CustomNode customNode) { + if (customNode instanceof YamlFrontMatterNode) { + data.put(((YamlFrontMatterNode) customNode).getKey(), ((YamlFrontMatterNode) customNode).getValues()); + } else { + super.visit(customNode); + } + } + + public Map> getData() { + return data; + } +} diff --git a/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/internal/YamlFrontMatterBlockParser.java b/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/internal/YamlFrontMatterBlockParser.java new file mode 100644 index 000000000..469cf4e2f --- /dev/null +++ b/commonmark-ext-yaml-front-matter/src/main/java/org/commonmark/ext/front/matter/internal/YamlFrontMatterBlockParser.java @@ -0,0 +1,130 @@ +package org.commonmark.ext.front.matter.internal; + +import org.commonmark.ext.front.matter.YamlFrontMatterBlock; +import org.commonmark.ext.front.matter.YamlFrontMatterNode; +import org.commonmark.node.Block; +import org.commonmark.node.Document; +import org.commonmark.parser.InlineParser; +import org.commonmark.parser.SourceLine; +import org.commonmark.parser.block.*; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class YamlFrontMatterBlockParser extends AbstractBlockParser { + private static final Pattern REGEX_METADATA = Pattern.compile("^[ ]{0,3}([A-Za-z0-9._-]+):\\s*(.*)"); + private static final Pattern REGEX_METADATA_LIST = Pattern.compile("^[ ]+-\\s*(.*)"); + private static final Pattern REGEX_METADATA_LITERAL = Pattern.compile("^\\s*(.*)"); + private static final Pattern REGEX_BEGIN = Pattern.compile("^-{3}(\\s.*)?"); + private static final Pattern REGEX_END = Pattern.compile("^(-{3}|\\.{3})(\\s.*)?"); + + private boolean inLiteral; + private String currentKey; + private List currentValues; + 
private YamlFrontMatterBlock block; + + public YamlFrontMatterBlockParser() { + inLiteral = false; + currentKey = null; + currentValues = new ArrayList<>(); + block = new YamlFrontMatterBlock(); + } + + @Override + public Block getBlock() { + return block; + } + + @Override + public void addLine(SourceLine line) { + } + + @Override + public BlockContinue tryContinue(ParserState parserState) { + final CharSequence line = parserState.getLine().getContent(); + + if (REGEX_END.matcher(line).matches()) { + if (currentKey != null) { + block.appendChild(new YamlFrontMatterNode(currentKey, currentValues)); + } + return BlockContinue.finished(); + } + + Matcher matcher = REGEX_METADATA.matcher(line); + if (matcher.matches()) { + if (currentKey != null) { + block.appendChild(new YamlFrontMatterNode(currentKey, currentValues)); + } + + inLiteral = false; + currentKey = matcher.group(1); + currentValues = new ArrayList<>(); + String value = matcher.group(2); + if ("|".equals(value)) { + inLiteral = true; + } else if (!"".equals(value)) { + currentValues.add(parseString(value)); + } + + return BlockContinue.atIndex(parserState.getIndex()); + } else { + if (inLiteral) { + matcher = REGEX_METADATA_LITERAL.matcher(line); + if (matcher.matches()) { + if (currentValues.size() == 1) { + currentValues.set(0, currentValues.get(0) + "\n" + matcher.group(1).trim()); + } else { + currentValues.add(matcher.group(1).trim()); + } + } + } else { + matcher = REGEX_METADATA_LIST.matcher(line); + if (matcher.matches()) { + String value = matcher.group(1); + currentValues.add(parseString(value)); + } + } + + return BlockContinue.atIndex(parserState.getIndex()); + } + } + + @Override + public void parseInlines(InlineParser inlineParser) { + } + + private static String parseString(String s) { + // Limited parsing of https://yaml.org/spec/1.2.2/#73-flow-scalar-styles + // We assume input is well-formed and otherwise treat it as a plain string. In a real + // parser, e.g. 
`'foo` would be invalid because it's missing a trailing `'`. + if (s.startsWith("'") && s.endsWith("'")) { + String inner = s.substring(1, s.length() - 1); + return inner.replace("''", "'"); + } else if (s.startsWith("\"") && s.endsWith("\"")) { + String inner = s.substring(1, s.length() - 1); + // Only support escaped `\` and `"`, nothing else. + return inner + .replace("\\\"", "\"") + .replace("\\\\", "\\"); + } else { + return s; + } + } + + public static class Factory extends AbstractBlockParserFactory { + @Override + public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { + CharSequence line = state.getLine().getContent(); + BlockParser parentParser = matchedBlockParser.getMatchedBlockParser(); + // check whether this line is the first line of whole document or not + if (parentParser.getBlock() instanceof Document && parentParser.getBlock().getFirstChild() == null && + REGEX_BEGIN.matcher(line).matches()) { + return BlockStart.of(new YamlFrontMatterBlockParser()).atIndex(state.getNextNonSpaceIndex()); + } + + return BlockStart.none(); + } + } +} diff --git a/commonmark-ext-yaml-front-matter/src/main/javadoc/overview.html b/commonmark-ext-yaml-front-matter/src/main/javadoc/overview.html new file mode 100644 index 000000000..ad3a4287c --- /dev/null +++ b/commonmark-ext-yaml-front-matter/src/main/javadoc/overview.html @@ -0,0 +1,6 @@ + + +Extension for YAML front matter +

See {@link org.commonmark.ext.front.matter.YamlFrontMatterExtension}

+ + diff --git a/commonmark-ext-yaml-front-matter/src/main/resources/META-INF/LICENSE.txt b/commonmark-ext-yaml-front-matter/src/main/resources/META-INF/LICENSE.txt new file mode 100644 index 000000000..b09e367ce --- /dev/null +++ b/commonmark-ext-yaml-front-matter/src/main/resources/META-INF/LICENSE.txt @@ -0,0 +1,23 @@ +Copyright (c) 2015, Atlassian Pty Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/commonmark-ext-yaml-front-matter/src/test/java/org/commonmark/ext/front/matter/YamlFrontMatterTest.java b/commonmark-ext-yaml-front-matter/src/test/java/org/commonmark/ext/front/matter/YamlFrontMatterTest.java new file mode 100644 index 000000000..db17d4a4e --- /dev/null +++ b/commonmark-ext-yaml-front-matter/src/test/java/org/commonmark/ext/front/matter/YamlFrontMatterTest.java @@ -0,0 +1,332 @@ +package org.commonmark.ext.front.matter; + +import org.commonmark.Extension; +import org.commonmark.node.CustomNode; +import org.commonmark.node.Node; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.junit.jupiter.api.Test; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; + +public class YamlFrontMatterTest extends RenderingTestCase { + private static final Set EXTENSIONS = Set.of(YamlFrontMatterExtension.create()); + private static final Parser PARSER = Parser.builder().extensions(EXTENSIONS).build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().extensions(EXTENSIONS).build(); + + @Test + public void simpleValue() { + final String input = "---" + + "\nhello: world" + + "\n..." + + "\n" + + "\ngreat"; + final String rendered = "

great

\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).hasSize(1); + assertThat(data.keySet().iterator().next()).isEqualTo("hello"); + assertThat(data.get("hello")).hasSize(1); + assertThat(data.get("hello").get(0)).isEqualTo("world"); + + assertRendering(input, rendered); + } + + @Test + public void emptyValue() { + final String input = "---" + + "\nkey:" + + "\n---" + + "\n" + + "\ngreat"; + final String rendered = "

great

\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).hasSize(1); + assertThat(data.keySet().iterator().next()).isEqualTo("key"); + assertThat(data.get("key")).hasSize(0); + + assertRendering(input, rendered); + } + + @Test + public void listValues() { + final String input = "---" + + "\nlist:" + + "\n - value1" + + "\n - value2" + + "\n..." + + "\n" + + "\ngreat"; + final String rendered = "

great

\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).hasSize(1); + assertThat(data).containsKey("list"); + assertThat(data.get("list")).hasSize(2); + assertThat(data.get("list").get(0)).isEqualTo("value1"); + assertThat(data.get("list").get(1)).isEqualTo("value2"); + + assertRendering(input, rendered); + } + + @Test + public void literalValue1() { + final String input = "---" + + "\nliteral: |" + + "\n hello markdown!" + + "\n literal thing..." + + "\n---" + + "\n" + + "\ngreat"; + final String rendered = "

great

\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).hasSize(1); + assertThat(data).containsKey("literal"); + assertThat(data.get("literal")).hasSize(1); + assertThat(data.get("literal").get(0)).isEqualTo("hello markdown!\nliteral thing..."); + + assertRendering(input, rendered); + } + + @Test + public void literalValue2() { + final String input = "---" + + "\nliteral: |" + + "\n - hello markdown!" + + "\n---" + + "\n" + + "\ngreat"; + final String rendered = "

great

\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).hasSize(1); + assertThat(data).containsKey("literal"); + assertThat(data.get("literal")).hasSize(1); + assertThat(data.get("literal").get(0)).isEqualTo("- hello markdown!"); + + assertRendering(input, rendered); + } + + @Test + public void complexValues() { + final String input = "---" + + "\nsimple: value" + + "\nliteral: |" + + "\n hello markdown!" + + "\n" + + "\n literal literal" + + "\nlist:" + + "\n - value1" + + "\n - value2" + + "\n---" + + "\ngreat"; + final String rendered = "

great

\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).hasSize(3); + + assertThat(data).containsKey("simple"); + assertThat(data.get("simple")).hasSize(1); + assertThat(data.get("simple").get(0)).isEqualTo("value"); + + assertThat(data).containsKey("literal"); + assertThat(data.get("literal")).hasSize(1); + assertThat(data.get("literal").get(0)).isEqualTo("hello markdown!\n\nliteral literal"); + + assertThat(data).containsKey("list"); + assertThat(data.get("list")).hasSize(2); + assertThat(data.get("list").get(0)).isEqualTo("value1"); + assertThat(data.get("list").get(1)).isEqualTo("value2"); + + assertRendering(input, rendered); + } + + @Test + public void empty() { + final String input = "---\n" + + "---\n" + + "test"; + final String rendered = "

test

\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).isEmpty(); + + assertRendering(input, rendered); + } + + @Test + public void yamlInParagraph() { + final String input = "# hello\n" + + "\nhello markdown world!" + + "\n---" + + "\nhello: world" + + "\n---"; + final String rendered = "

hello

\n

hello markdown world!

\n

hello: world

\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).isEmpty(); + + assertRendering(input, rendered); + } + + @Test + public void yamlOnSecondLine() { + final String input = "hello\n" + + "\n---" + + "\nhello: world" + + "\n---"; + final String rendered = "

hello

\n
\n

hello: world

\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).isEmpty(); + + assertRendering(input, rendered); + } + + @Test + public void nonMatchedStartTag() { + final String input = "----\n" + + "test"; + final String rendered = "
\n

test

\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).isEmpty(); + + assertRendering(input, rendered); + } + + @Test + public void inList() { + final String input = "* ---\n" + + " ---\n" + + "test"; + final String rendered = "
    \n
  • \n
    \n
    \n
  • \n
\n

test

\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).isEmpty(); + + assertRendering(input, rendered); + } + + @Test + public void visitorIgnoresOtherCustomNodes() { + final String input = "---" + + "\nhello: world" + + "\n---" + + "\n"; + + YamlFrontMatterVisitor visitor = new YamlFrontMatterVisitor(); + Node document = PARSER.parse(input); + document.appendChild(new TestNode()); + document.accept(visitor); + + Map> data = visitor.getData(); + assertThat(data).hasSize(1); + assertThat(data).containsKey("hello"); + assertThat(data.get("hello")).isEqualTo(List.of("world")); + } + + @Test + public void nodesCanBeModified() { + final String input = "---" + + "\nhello: world" + + "\n---" + + "\n"; + + Node document = PARSER.parse(input); + YamlFrontMatterNode node = (YamlFrontMatterNode) document.getFirstChild().getFirstChild(); + node.setKey("see"); + node.setValues(List.of("you")); + + YamlFrontMatterVisitor visitor = new YamlFrontMatterVisitor(); + document.accept(visitor); + + Map> data = visitor.getData(); + assertThat(data).hasSize(1); + assertThat(data).containsKey("see"); + assertThat(data.get("see")).isEqualTo(List.of("you")); + } + + @Test + public void dotInKeys() { + final String input = "---" + + "\nms.author: author" + + "\n---" + + "\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).hasSize(1); + assertThat(data.keySet().iterator().next()).isEqualTo("ms.author"); + assertThat(data.get("ms.author")).hasSize(1); + assertThat(data.get("ms.author").get(0)).isEqualTo("author"); + } + + @Test + public void singleQuotedLiterals() { + final String input = "---" + + "\nstring: 'It''s me'" + + "\nlist:" + + "\n - 'I''m here'" + + "\n---" + + "\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).hasSize(2); + assertThat(data.get("string").get(0)).isEqualTo("It's me"); + assertThat(data.get("list").get(0)).isEqualTo("I'm here"); + } + + @Test + public void doubleQuotedLiteral() { + final String input = "---" + + "\nstring: 
\"backslash: \\\\ quote: \\\"\"" + + "\nlist:" + + "\n - \"hey\"" + + "\n---" + + "\n"; + + Map> data = getFrontMatter(input); + + assertThat(data).hasSize(2); + assertThat(data.get("string").get(0)).isEqualTo("backslash: \\ quote: \""); + assertThat(data.get("list").get(0)).isEqualTo("hey"); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } + + private Map> getFrontMatter(String input) { + YamlFrontMatterVisitor visitor = new YamlFrontMatterVisitor(); + Node document = PARSER.parse(input); + document.accept(visitor); + + Map> data = visitor.getData(); + return data; + } + + // Custom node for tests + private static class TestNode extends CustomNode { + } +} diff --git a/commonmark-integration-test/pom.xml b/commonmark-integration-test/pom.xml index 1657fa715..7e0048a73 100644 --- a/commonmark-integration-test/pom.xml +++ b/commonmark-integration-test/pom.xml @@ -2,9 +2,9 @@ 4.0.0 - com.atlassian.commonmark + org.commonmark commonmark-parent - 0.1.1-SNAPSHOT + 0.28.1-SNAPSHOT commonmark-integration-test @@ -13,35 +13,56 @@ - com.atlassian.commonmark + org.commonmark commonmark - test - com.atlassian.commonmark + org.commonmark commonmark-ext-autolink - test - com.atlassian.commonmark + org.commonmark + commonmark-ext-footnotes + + + org.commonmark + commonmark-ext-ins + + + org.commonmark + commonmark-ext-gfm-alerts + + + org.commonmark commonmark-ext-gfm-strikethrough - test - com.atlassian.commonmark + org.commonmark commonmark-ext-gfm-tables - test + + org.commonmark + commonmark-ext-image-attributes + + + org.commonmark + commonmark-ext-task-list-items + + + org.commonmark + commonmark-ext-yaml-front-matter + + org.pegdown pegdown - 1.5.0 + 1.6.0 test - junit - junit + org.commonmark + commonmark-test-util test @@ -54,12 +75,6 @@ jmh-generator-annprocess test - - com.atlassian.commonmark - commonmark - test-jar - test - diff --git 
a/commonmark-integration-test/src/main/java/org/commonmark/integration/IntegrationTests.java b/commonmark-integration-test/src/main/java/org/commonmark/integration/IntegrationTests.java new file mode 100644 index 000000000..48e1ee5ba --- /dev/null +++ b/commonmark-integration-test/src/main/java/org/commonmark/integration/IntegrationTests.java @@ -0,0 +1,16 @@ +package org.commonmark.integration; + +// Prevent maven-gpg-plugin from failing with this error: +// The project artifact has not been assembled yet. +// Please do not invoke this goal before the lifecycle phase "package". +// +// Apparently it doesn't like a module that doesn't have any classes in main, +// because that means no jar is generated. +// And the javadoc plugin doesn't like if there's no classes with documentation, +// + +/** + * Module with integration tests. + */ +public class IntegrationTests { +} diff --git a/commonmark-integration-test/src/main/resources/META-INF/LICENSE.txt b/commonmark-integration-test/src/main/resources/META-INF/LICENSE.txt new file mode 100644 index 000000000..b09e367ce --- /dev/null +++ b/commonmark-integration-test/src/main/resources/META-INF/LICENSE.txt @@ -0,0 +1,23 @@ +Copyright (c) 2015, Atlassian Pty Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/commonmark-integration-test/src/test/java/org/commonmark/integration/BoundsIntegrationTest.java b/commonmark-integration-test/src/test/java/org/commonmark/integration/BoundsIntegrationTest.java new file mode 100644 index 000000000..f1259b825 --- /dev/null +++ b/commonmark-integration-test/src/test/java/org/commonmark/integration/BoundsIntegrationTest.java @@ -0,0 +1,53 @@ +package org.commonmark.integration; + +import org.commonmark.node.Node; +import org.commonmark.parser.Parser; +import org.commonmark.testutil.TestResources; +import org.commonmark.testutil.example.ExampleReader; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.Parameter; +import org.junit.jupiter.params.ParameterizedClass; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Tests various substrings of the spec examples to check for out of bounds exceptions. 
+ */ +@ParameterizedClass +@MethodSource("data") +public class BoundsIntegrationTest { + + private static final Parser PARSER = Parser.builder().build(); + + @Parameter + String input; + + static List data() { + return ExampleReader.readExampleSources(TestResources.getSpec()); + } + + @Test + public void testSubstrings() { + // Check possibly truncated block/inline starts + for (int i = 1; i < input.length() - 1; i++) { + parse(input.substring(i)); + } + // Check possibly truncated block/inline ends + for (int i = input.length() - 1; i > 1; i--) { + parse(input.substring(0, i)); + } + } + + private void parse(String input) { + try { + Node parsed = PARSER.parse(input); + // Parsing should always return a node + assertThat(parsed).isNotNull(); + } catch (Exception e) { + throw new AssertionError("Parsing failed, input: " + input, e); + } + } +} diff --git a/commonmark-integration-test/src/test/java/org/commonmark/integration/Extensions.java b/commonmark-integration-test/src/test/java/org/commonmark/integration/Extensions.java new file mode 100644 index 000000000..9090c797f --- /dev/null +++ b/commonmark-integration-test/src/test/java/org/commonmark/integration/Extensions.java @@ -0,0 +1,28 @@ +package org.commonmark.integration; + +import org.commonmark.Extension; +import org.commonmark.ext.autolink.AutolinkExtension; +import org.commonmark.ext.footnotes.FootnotesExtension; +import org.commonmark.ext.front.matter.YamlFrontMatterExtension; +import org.commonmark.ext.gfm.alerts.AlertsExtension; +import org.commonmark.ext.gfm.strikethrough.StrikethroughExtension; +import org.commonmark.ext.gfm.tables.TablesExtension; +import org.commonmark.ext.image.attributes.ImageAttributesExtension; +import org.commonmark.ext.ins.InsExtension; +import org.commonmark.ext.task.list.items.TaskListItemsExtension; + +import java.util.List; + +public class Extensions { + + static final List ALL_EXTENSIONS = List.of( + AutolinkExtension.create(), + FootnotesExtension.create(), + 
ImageAttributesExtension.create(), + InsExtension.create(), + AlertsExtension.create(), + StrikethroughExtension.create(), + TablesExtension.create(), + TaskListItemsExtension.create(), + YamlFrontMatterExtension.create()); +} diff --git a/commonmark-integration-test/src/test/java/org/commonmark/integration/ExtensionsIntegrationTest.java b/commonmark-integration-test/src/test/java/org/commonmark/integration/ExtensionsIntegrationTest.java new file mode 100644 index 000000000..523154d2c --- /dev/null +++ b/commonmark-integration-test/src/test/java/org/commonmark/integration/ExtensionsIntegrationTest.java @@ -0,0 +1,38 @@ +package org.commonmark.integration; + +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.junit.jupiter.api.Test; + +/** + * Tests to ensure all extensions work well together. + */ +public class ExtensionsIntegrationTest extends RenderingTestCase { + + protected static final Parser PARSER = Parser.builder() + .extensions(Extensions.ALL_EXTENSIONS) + .build(); + protected static final HtmlRenderer RENDERER = HtmlRenderer.builder() + .extensions(Extensions.ALL_EXTENSIONS) + .percentEncodeUrls(true) + .build(); + + @Test + public void testImageAttributes() { + assertRendering("![text](/url.png){height=5 width=6}", "

\"text\"

\n"); + } + + @Test + public void testTaskListItems() { + assertRendering("- [ ] task to do\n- [x] task done\n", + "
    \n
  • task to do
  • \n" + + "
  • task done
  • \n
\n"); + + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } +} diff --git a/commonmark-integration-test/src/test/java/org/commonmark/integration/MarkdownRendererIntegrationTest.java b/commonmark-integration-test/src/test/java/org/commonmark/integration/MarkdownRendererIntegrationTest.java new file mode 100644 index 000000000..fe14273ab --- /dev/null +++ b/commonmark-integration-test/src/test/java/org/commonmark/integration/MarkdownRendererIntegrationTest.java @@ -0,0 +1,37 @@ +package org.commonmark.integration; + +import org.commonmark.Extension; +import org.commonmark.ext.autolink.AutolinkExtension; +import org.commonmark.ext.front.matter.YamlFrontMatterExtension; +import org.commonmark.ext.gfm.strikethrough.StrikethroughExtension; +import org.commonmark.ext.gfm.tables.TablesExtension; +import org.commonmark.ext.image.attributes.ImageAttributesExtension; +import org.commonmark.ext.ins.InsExtension; +import org.commonmark.ext.task.list.items.TaskListItemsExtension; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.markdown.MarkdownRenderer; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class MarkdownRendererIntegrationTest { + + private static final Parser PARSER = Parser.builder().extensions(Extensions.ALL_EXTENSIONS).build(); + private static final MarkdownRenderer RENDERER = MarkdownRenderer.builder().extensions(Extensions.ALL_EXTENSIONS).build(); + + @Test + public void testStrikethroughInTable() { + assertRoundTrip("|Abc|\n|---|\n|~strikethrough~|\n|\\~escaped\\~|\n"); + } + + private String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } + + private void assertRoundTrip(String input) { + String rendered = render(input); + assertThat(rendered).isEqualTo(input); + } +} diff --git 
a/commonmark-integration-test/src/test/java/org/commonmark/integration/PegDownBenchmark.java b/commonmark-integration-test/src/test/java/org/commonmark/integration/PegDownBenchmark.java index 9af06e6af..ecc9c2cfd 100644 --- a/commonmark-integration-test/src/test/java/org/commonmark/integration/PegDownBenchmark.java +++ b/commonmark-integration-test/src/test/java/org/commonmark/integration/PegDownBenchmark.java @@ -1,34 +1,37 @@ package org.commonmark.integration; -import org.commonmark.spec.SpecReader; -import org.openjdk.jmh.Main; +import org.commonmark.testutil.TestResources; +import org.commonmark.testutil.example.ExampleReader; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.State; import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.CommandLineOptions; import org.openjdk.jmh.runner.options.Options; import org.openjdk.jmh.runner.options.OptionsBuilder; import org.pegdown.Extensions; import org.pegdown.PegDownProcessor; -import java.util.Collections; import java.util.List; @State(Scope.Benchmark) public class PegDownBenchmark { - private static final String SPEC = SpecReader.readSpec(); - private static final List SPEC_EXAMPLES = SpecReader.readExamplesAsString(); + private static final String SPEC = TestResources.readAsString(TestResources.getSpec()); + private static final List SPEC_EXAMPLES = ExampleReader.readExampleSources(TestResources.getSpec()); private static final PegDownProcessor PROCESSOR = new PegDownProcessor(Extensions.FENCED_CODE_BLOCKS); public static void main(String[] args) throws Exception { - Options options = new OptionsBuilder().include(PegDownBenchmark.class.getName() + ".*").build(); + Options options = new OptionsBuilder() + .parent(new CommandLineOptions(args)) + .include(PegDownBenchmark.class.getName() + ".*") + .build(); new Runner(options).run(); } @Benchmark public long wholeSpec() { - return 
parseAndRender(Collections.singletonList(SPEC)); + return parseAndRender(List.of(SPEC)); } @Benchmark diff --git a/commonmark-integration-test/src/test/java/org/commonmark/integration/SourceSpanIntegrationTest.java b/commonmark-integration-test/src/test/java/org/commonmark/integration/SourceSpanIntegrationTest.java new file mode 100644 index 000000000..171cc51b1 --- /dev/null +++ b/commonmark-integration-test/src/test/java/org/commonmark/integration/SourceSpanIntegrationTest.java @@ -0,0 +1,21 @@ +package org.commonmark.integration; + +import org.commonmark.parser.IncludeSourceSpans; +import org.commonmark.parser.Parser; +import org.commonmark.testutil.example.Example; + +/** + * Spec and all extensions, with source spans enabled. + */ +public class SourceSpanIntegrationTest extends SpecIntegrationTest { + + protected static final Parser PARSER = Parser.builder() + .extensions(Extensions.ALL_EXTENSIONS) + .includeSourceSpans(IncludeSourceSpans.BLOCKS) + .build(); + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } +} diff --git a/commonmark-integration-test/src/test/java/org/commonmark/integration/SpecIntegrationTest.java b/commonmark-integration-test/src/test/java/org/commonmark/integration/SpecIntegrationTest.java index 3a3ce5e2a..07853d402 100644 --- a/commonmark-integration-test/src/test/java/org/commonmark/integration/SpecIntegrationTest.java +++ b/commonmark-integration-test/src/test/java/org/commonmark/integration/SpecIntegrationTest.java @@ -1,51 +1,44 @@ package org.commonmark.integration; -import org.commonmark.Extension; -import org.commonmark.ext.autolink.AutolinkExtension; -import org.commonmark.ext.gfm.strikethrough.StrikethroughExtension; -import org.commonmark.ext.gfm.tables.TablesExtension; -import org.commonmark.spec.SpecExample; -import org.commonmark.test.SpecTest; -import org.junit.Test; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; +import 
org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.parser.Parser; +import org.commonmark.testutil.example.Example; +import org.commonmark.testutil.SpecTestCase; +import org.junit.jupiter.api.Test; + +import java.util.*; + +import static org.commonmark.testutil.Asserts.assertRendering; /** * Tests that the spec examples still render the same with all extensions enabled. */ -public class SpecIntegrationTest extends SpecTest { +public class SpecIntegrationTest extends SpecTestCase { - private static final Map OVERRIDDEN_EXAMPLES = getOverriddenExamples(); - - public SpecIntegrationTest(SpecExample example) { - super(example); - } + protected static final Parser PARSER = Parser.builder().extensions(Extensions.ALL_EXTENSIONS).build(); + // The spec says URL-escaping is optional, but the examples assume that it's enabled. + protected static final HtmlRenderer RENDERER = HtmlRenderer.builder().extensions(Extensions.ALL_EXTENSIONS).percentEncodeUrls(true).build(); + protected static final Map OVERRIDDEN_EXAMPLES = getOverriddenExamples(); @Test - @Override public void testHtmlRendering() { String expectedHtml = OVERRIDDEN_EXAMPLES.get(example.getSource()); if (expectedHtml != null) { - assertRendering(example.getSource(), expectedHtml); + assertRendering(example.getSource(), expectedHtml, render(example.getSource())); } else { - super.testHtmlRendering(); + assertRendering(example.getSource(), example.getHtml(), render(example.getSource())); } } - @Override - protected Iterable getExtensions() { - return Arrays.asList(AutolinkExtension.create(), - StrikethroughExtension.create(), - TablesExtension.create()); + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); } private static Map getOverriddenExamples() { Map m = new HashMap<>(); // Not a spec autolink because of space, but the resulting text contains a valid URL - m.put("\n", "

<http://foo.bar/baz bim>

\n"); + m.put("\n", "

<https://foo.bar/baz bim>

\n"); // Not a spec autolink, but the resulting text contains a valid email m.put("\n", "

<foo+@bar.example.com>

\n"); @@ -54,14 +47,18 @@ private static Map getOverriddenExamples() { m.put("\n", "

<heck://bing.bong>

\n"); // Not a spec autolink because of spaces, but autolink extension doesn't limit schemes - m.put("< http://foo.bar >\n", "

< http://foo.bar >

\n"); + m.put("< https://foo.bar >\n", "

< https://foo.bar >

\n"); // Plain autolink - m.put("http://example.com\n", "

http://example.com

\n"); + m.put("https://example.com\n", "

https://example.com

\n"); // Plain autolink m.put("foo@bar.example.com\n", "

foo@bar.example.com

\n"); + // YAML front matter block + m.put("---\nFoo\n---\nBar\n---\nBaz\n", "

Bar

\n

Baz

\n"); + m.put("---\n---\n", ""); + return m; } diff --git a/commonmark-integration-test/src/test/java/org/commonmark/ui/DingusApp.java b/commonmark-integration-test/src/test/java/org/commonmark/ui/DingusApp.java new file mode 100644 index 000000000..0e98386bb --- /dev/null +++ b/commonmark-integration-test/src/test/java/org/commonmark/ui/DingusApp.java @@ -0,0 +1,114 @@ +package org.commonmark.ui; + +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.renderer.text.TextContentRenderer; + +import java.awt.*; +import javax.swing.*; +import javax.swing.event.ChangeEvent; +import javax.swing.event.ChangeListener; +import javax.swing.event.DocumentEvent; +import javax.swing.event.DocumentListener; + +/** + * Simple UI to quickly test out different rendering of CommonMark inputs. + * Similar to commonmark.js dingus. + **/ +public class DingusApp { + + private final Parser parser = Parser.builder().build(); + private final TextContentRenderer textRenderer = TextContentRenderer.builder().build(); + private final HtmlRenderer htmlRenderer = HtmlRenderer.builder().build(); + + private final JTabbedPane tabbedPane; + private final JEditorPane htmlVisualRendererOutput; + private final JTextArea htmlSourceRendererOutput; + private final JTextArea textRendererOutput; + + public static void main(String[] args) { + new DingusApp().run(); + } + + private DingusApp() { + tabbedPane = new JTabbedPane(); + + htmlVisualRendererOutput = new JEditorPane(); + htmlVisualRendererOutput.setEnabled(false); + htmlVisualRendererOutput.setContentType("text/html"); + + htmlSourceRendererOutput = new JTextArea(); + htmlSourceRendererOutput.setEnabled(false); + htmlSourceRendererOutput.setLineWrap(true); + htmlSourceRendererOutput.setFont(new Font(Font.MONOSPACED, Font.PLAIN, 12)); + + textRendererOutput = new JTextArea(); + textRendererOutput.setEnabled(false); + textRendererOutput.setLineWrap(true); + textRendererOutput.setFont(new 
Font(Font.MONOSPACED, Font.PLAIN, 12)); + } + + private void run() { + JFrame frame = new JFrame("commonmark-java dingus"); + frame.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE); + frame.setMinimumSize(new Dimension(400, 300)); + frame.setSize(new Dimension(1200, 675)); + + final JTextArea input = new JTextArea(); + input.setBorder(BorderFactory.createTitledBorder("Input")); + input.setLineWrap(true); + input.setFont(new Font(Font.MONOSPACED, Font.PLAIN, 12)); + + input.getDocument().addDocumentListener(new DocumentListener() { + @Override + public void insertUpdate(DocumentEvent e) { + updateOutput(input.getText()); + } + + @Override + public void removeUpdate(DocumentEvent e) { + updateOutput(input.getText()); + } + + @Override + public void changedUpdate(DocumentEvent e) { + } + }); + + tabbedPane.addTab("HTML rendered", htmlVisualRendererOutput); + tabbedPane.addTab("HTML source", htmlSourceRendererOutput); + tabbedPane.addTab("Plain text", textRendererOutput); + + tabbedPane.addChangeListener(new ChangeListener() { + @Override + public void stateChanged(ChangeEvent e) { + updateOutput(input.getText()); + } + }); + + input.setText("# Example\n" + + "Enter text *here* and see how it renders on the right.\n\n" + + "* Try\n* this\n\n" + + "```\nor this\n```"); + updateOutput(input.getText()); + + frame.setLayout(new GridLayout()); + frame.add(input); + frame.add(tabbedPane); + + frame.setVisible(true); + } + + private void updateOutput(String inputText) { + if (tabbedPane.getSelectedComponent() == htmlVisualRendererOutput) { + String rendered = htmlRenderer.render(parser.parse(inputText)); + htmlVisualRendererOutput.setText(rendered); + } else if (tabbedPane.getSelectedComponent() == htmlSourceRendererOutput) { + String rendered = htmlRenderer.render(parser.parse(inputText)); + htmlSourceRendererOutput.setText(rendered); + } else if (tabbedPane.getSelectedComponent() == textRendererOutput) { + String rendered = 
textRenderer.render(parser.parse(inputText)); + textRendererOutput.setText(rendered); + } + } +} diff --git a/commonmark-test-util/pom.xml b/commonmark-test-util/pom.xml new file mode 100644 index 000000000..6a9c342cc --- /dev/null +++ b/commonmark-test-util/pom.xml @@ -0,0 +1,25 @@ + + + 4.0.0 + + org.commonmark + commonmark-parent + 0.28.1-SNAPSHOT + + + commonmark-test-util + commonmark-java test utilities + commonmark-java classes for tests + + + + org.junit.jupiter + junit-jupiter + + + org.assertj + assertj-core + + + + diff --git a/commonmark-test-util/src/main/java/module-info.java b/commonmark-test-util/src/main/java/module-info.java new file mode 100644 index 000000000..12980d80a --- /dev/null +++ b/commonmark-test-util/src/main/java/module-info.java @@ -0,0 +1,7 @@ +module org.commonmark.testutil { + exports org.commonmark.testutil; + exports org.commonmark.testutil.example; + + requires org.assertj.core; + requires org.junit.jupiter.params; +} diff --git a/commonmark-test-util/src/main/java/org/commonmark/testutil/Asserts.java b/commonmark-test-util/src/main/java/org/commonmark/testutil/Asserts.java new file mode 100644 index 000000000..971a1b4ea --- /dev/null +++ b/commonmark-test-util/src/main/java/org/commonmark/testutil/Asserts.java @@ -0,0 +1,17 @@ +package org.commonmark.testutil; + +import static org.assertj.core.api.Assertions.assertThat; + +public class Asserts { + public static void assertRendering(String source, String expectedRendering, String actualRendering) { + // include source for better assertion errors + String expected = showTabs(expectedRendering + "\n\n" + source); + String actual = showTabs(actualRendering + "\n\n" + source); + assertThat(actual).isEqualTo(expected); + } + + private static String showTabs(String s) { + // Tabs are shown as "rightwards arrow" for easier comparison + return s.replace("\t", "\u2192"); + } +} diff --git a/commonmark-test-util/src/main/java/org/commonmark/testutil/RenderingTestCase.java 
b/commonmark-test-util/src/main/java/org/commonmark/testutil/RenderingTestCase.java new file mode 100644 index 000000000..f7da4c008 --- /dev/null +++ b/commonmark-test-util/src/main/java/org/commonmark/testutil/RenderingTestCase.java @@ -0,0 +1,12 @@ +package org.commonmark.testutil; + +import static org.assertj.core.api.Assertions.assertThat; + +public abstract class RenderingTestCase { + + protected abstract String render(String source); + + protected void assertRendering(String source, String expectedResult) { + Asserts.assertRendering(source, expectedResult, render(source)); + } +} diff --git a/commonmark-test-util/src/main/java/org/commonmark/testutil/SpecTestCase.java b/commonmark-test-util/src/main/java/org/commonmark/testutil/SpecTestCase.java new file mode 100644 index 000000000..c29a6a69a --- /dev/null +++ b/commonmark-test-util/src/main/java/org/commonmark/testutil/SpecTestCase.java @@ -0,0 +1,23 @@ +package org.commonmark.testutil; + +import org.commonmark.testutil.example.Example; +import org.commonmark.testutil.example.ExampleReader; +import org.junit.jupiter.params.Parameter; +import org.junit.jupiter.params.ParameterizedClass; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.ArrayList; +import java.util.List; + +@ParameterizedClass +@MethodSource("data") +public abstract class SpecTestCase { + + @Parameter + protected Example example; + + static List data() { + return ExampleReader.readExamples(TestResources.getSpec()); + } +} diff --git a/commonmark-test-util/src/main/java/org/commonmark/testutil/TestResources.java b/commonmark-test-util/src/main/java/org/commonmark/testutil/TestResources.java new file mode 100644 index 000000000..5af649a86 --- /dev/null +++ b/commonmark-test-util/src/main/java/org/commonmark/testutil/TestResources.java @@ -0,0 +1,40 @@ +package org.commonmark.testutil; + +import java.io.BufferedReader; +import java.io.IOException; +import 
java.io.InputStreamReader; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.List; + +public class TestResources { + + public static URL getSpec() { + return TestResources.class.getResource("/spec.txt"); + } + + public static URL getGfmSpec() { + return TestResources.class.getResource("/gfm-spec.txt"); + } + + public static List getRegressions() { + return List.of( + TestResources.class.getResource("/cmark-regression.txt"), + TestResources.class.getResource("/commonmark.js-regression.txt") + ); + } + + public static String readAsString(URL url) { + StringBuilder sb = new StringBuilder(); + try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) { + String line; + while ((line = reader.readLine()) != null) { + sb.append(line); + sb.append("\n"); + } + return sb.toString(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/commonmark-test-util/src/main/java/org/commonmark/testutil/example/Example.java b/commonmark-test-util/src/main/java/org/commonmark/testutil/example/Example.java new file mode 100644 index 000000000..11e87d0aa --- /dev/null +++ b/commonmark-test-util/src/main/java/org/commonmark/testutil/example/Example.java @@ -0,0 +1,41 @@ +package org.commonmark.testutil.example; + +public class Example { + + private final String filename; + private final String section; + private final String info; + private final int exampleNumber; + private final String source; + private final String html; + + public Example(String filename, String section, String info, int exampleNumber, String source, String html) { + this.filename = filename; + this.section = section; + this.info = info; + this.exampleNumber = exampleNumber; + this.source = source; + this.html = html; + } + + public String getInfo() { + return info; + } + + public String getSource() { + return source; + } + + public String getHtml() { + return html; + } + + public String 
getSection() { + return section; + } + + @Override + public String toString() { + return "File \"" + filename + "\" section \"" + section + "\" example " + exampleNumber; + } +} diff --git a/commonmark/src/test/java/org/commonmark/spec/SpecReader.java b/commonmark-test-util/src/main/java/org/commonmark/testutil/example/ExampleReader.java similarity index 55% rename from commonmark/src/test/java/org/commonmark/spec/SpecReader.java rename to commonmark-test-util/src/main/java/org/commonmark/testutil/example/ExampleReader.java index 1a806ed3d..d40a10f63 100644 --- a/commonmark/src/test/java/org/commonmark/spec/SpecReader.java +++ b/commonmark-test-util/src/main/java/org/commonmark/testutil/example/ExampleReader.java @@ -1,75 +1,67 @@ -package org.commonmark.spec; +package org.commonmark.testutil.example; -import org.commonmark.test.SpecTest; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; +import java.io.*; +import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Collectors; -public class SpecReader { +/** + * Reader for files containing examples of CommonMark source and the expected HTML rendering (e.g. spec.txt). + */ +public class ExampleReader { private static final Pattern SECTION_PATTERN = Pattern.compile("#{1,6} *(.*)"); + private static final String EXAMPLE_START_MARKER = "```````````````````````````````` example"; private final InputStream inputStream; + private final String filename; private State state = State.BEFORE; private String section; + // The gfm spec has additional text after the example marker for their additions, e.g. 
"table" + private String info = ""; private StringBuilder source; private StringBuilder html; private int exampleNumber = 0; - private List examples = new ArrayList<>(); + private List examples = new ArrayList<>(); - private SpecReader(InputStream stream) { + private ExampleReader(InputStream stream, String filename) { this.inputStream = stream; + this.filename = filename; } - public static List readExamples() { - try (InputStream stream = getStream()) { - return new SpecReader(stream).read(); + public static List readExamples(URL url) { + try (InputStream stream = url.openStream()) { + return new ExampleReader(stream, new File(url.getPath()).getName()).read(); } catch (IOException e) { throw new RuntimeException(e); } } - public static List readExamplesAsString() { - List examples = SpecReader.readExamples(); - List result = new ArrayList<>(); - for (SpecExample example : examples) { - result.add(example.getSource()); - } - return result; + public static List readExamples(URL url, String info) { + var examples = readExamples(url); + return examples.stream().filter(e -> e.getInfo().contains(info)).collect(Collectors.toList()); } - public static String readSpec() { - StringBuilder sb = new StringBuilder(); - try (BufferedReader reader = new BufferedReader(new InputStreamReader(getStream(), StandardCharsets.UTF_8))) { - String line; - while ((line = reader.readLine()) != null) { - sb.append(line); - sb.append("\n"); - } - return sb.toString(); - } catch (IOException e) { - throw new RuntimeException(e); - } + public static List readExampleObjects(URL url, String info) { + return readExamples(url, info).stream().map(e -> new Object[]{e}).collect(Collectors.toList()); } - private static InputStream getStream() { - InputStream stream = SpecTest.class.getResourceAsStream("/spec.txt"); - if (stream == null) { - throw new IllegalStateException("Could not load spec.txt classpath resource"); + public static List readExampleSources(URL url) { + List examples = 
ExampleReader.readExamples(url); + List result = new ArrayList<>(); + for (Example example : examples) { + result.add(example.getSource()); } - return stream; + return result; } - private List read() throws IOException { + private List read() throws IOException { resetContents(); try (BufferedReader reader = new BufferedReader( @@ -84,7 +76,6 @@ private List read() throws IOException { } private void processLine(String line) { - boolean dot = line.equals("."); switch (state) { case BEFORE: Matcher matcher = SECTION_PATTERN.matcher(line); @@ -92,13 +83,14 @@ private void processLine(String line) { section = matcher.group(1); exampleNumber = 0; } - if (dot) { + if (line.startsWith(EXAMPLE_START_MARKER)) { + info = line.substring(EXAMPLE_START_MARKER.length()).trim(); state = State.SOURCE; exampleNumber++; } break; case SOURCE: - if (dot) { + if (line.equals(".")) { state = State.HTML; } else { // examples use "rightwards arrow" to show tab @@ -107,9 +99,9 @@ private void processLine(String line) { } break; case HTML: - if (dot) { + if (line.equals("````````````````````````````````")) { state = State.BEFORE; - examples.add(new SpecExample(section, exampleNumber, + examples.add(new Example(filename, section, info, exampleNumber, source.toString(), html.toString())); resetContents(); } else { diff --git a/commonmark-test-util/src/main/resources/META-INF/LICENSE.txt b/commonmark-test-util/src/main/resources/META-INF/LICENSE.txt new file mode 100644 index 000000000..b09e367ce --- /dev/null +++ b/commonmark-test-util/src/main/resources/META-INF/LICENSE.txt @@ -0,0 +1,23 @@ +Copyright (c) 2015, Atlassian Pty Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/commonmark-test-util/src/main/resources/README.md b/commonmark-test-util/src/main/resources/README.md new file mode 100644 index 000000000..f51e88358 --- /dev/null +++ b/commonmark-test-util/src/main/resources/README.md @@ -0,0 +1,7 @@ +These files are copied from the CommonMark repositories, namely: + +https://github.com/commonmark/commonmark-spec/blob/master/spec.txt +https://github.com/commonmark/cmark/blob/master/test/regression.txt +https://github.com/commonmark/commonmark.js/blob/master/test/regression.txt + +They are licensed as stated in those repositories. 
diff --git a/commonmark-test-util/src/main/resources/cmark-regression.txt b/commonmark-test-util/src/main/resources/cmark-regression.txt new file mode 100644 index 000000000..5f1dc5e24 --- /dev/null +++ b/commonmark-test-util/src/main/resources/cmark-regression.txt @@ -0,0 +1,193 @@ +### Regression tests + +Issue #113: EOL character weirdness on Windows +(Important: first line ends with CR + CR + LF) + +```````````````````````````````` example +line1 + +line2 +. +

line1

+

line2

+```````````````````````````````` + +Issue #114: cmark skipping first character in line +(Important: the blank lines around "Repeatedly" contain a tab.) + +```````````````````````````````` example +By taking it apart + +- alternative solutions +→ +Repeatedly solving +→ +- how techniques +. +

By taking it apart

+
    +
  • alternative solutions
  • +
+

Repeatedly solving

+
    +
  • how techniques
  • +
+```````````````````````````````` + +Issue jgm/CommonMark#430: h2..h6 not recognized as block tags. + +```````````````````````````````` example +

lorem

+ +

lorem

+ +

lorem

+ +

lorem

+ +
lorem
+ +
lorem
+. +

lorem

+

lorem

+

lorem

+

lorem

+
lorem
+
lorem
+```````````````````````````````` + +Issue jgm/commonmark.js#109 - tabs after setext header line + + +```````````````````````````````` example +hi +--→ +. +

hi

+```````````````````````````````` + +Issue #177 - incorrect emphasis parsing + +```````````````````````````````` example +a***b* c* +. +

a*b c

+```````````````````````````````` + +Issue #193 - unescaped left angle brackets in link destination + +```````````````````````````````` example +[a] + +[a]: +. +

[a]

+

[a]: <te

+```````````````````````````````` + +Issue #192 - escaped spaces in link destination + + +```````````````````````````````` example +[a](te\ st) +. +

[a](te\ st)

+```````````````````````````````` + +Issue #527 - meta tags in inline contexts + +```````````````````````````````` example +City: + + + +. +

City: + + +

+```````````````````````````````` + +Issue #530 - link parsing corner cases + +```````````````````````````````` example +[a](\ b) + +[a](<[a](\ b)

+

[a](<<b)

+

[a](<b +)

+```````````````````````````````` + +Issue commonmark#526 - unescaped ( in link title + +```````````````````````````````` example +[link](url ((title)) +. +

[link](url ((title))

+```````````````````````````````` + +Issue commonmark#517 - script, pre, style close tag without +opener. + +```````````````````````````````` example + + + + + +. + + + +```````````````````````````````` + +Issue #289. + +```````````````````````````````` example +[a]( +. +

[a](<b) c>

+```````````````````````````````` + +Issue #334 - UTF-8 BOM + +```````````````````````````````` example +# Hi +. +

Hi

+```````````````````````````````` + +Issue commonmark.js#213 - type 7 blocks can't interrupt +paragraph + +```````````````````````````````` example +- +. +
    +
  • +
  • +
+```````````````````````````````` + +Issue #383 - emphasis parsing. + +```````````````````````````````` example +*****Hello*world**** +. +

**Helloworld

+```````````````````````````````` + diff --git a/commonmark-test-util/src/main/resources/commonmark.js-regression.txt b/commonmark-test-util/src/main/resources/commonmark.js-regression.txt new file mode 100644 index 000000000..16a0e8c35 --- /dev/null +++ b/commonmark-test-util/src/main/resources/commonmark.js-regression.txt @@ -0,0 +1,218 @@ +# Regression tests + +Eating a character after a partially consumed tab. + +```````````````````````````````` example +* foo +→bar +. +
    +
  • foo +bar
  • +
+```````````````````````````````` + +Type 7 HTML block followed by whitespace (#98). + +```````````````````````````````` example + +x +. + +x +```````````````````````````````` + +h2..h6 raw HTML blocks (jgm/CommonMark#430). + +```````````````````````````````` example +

lorem

+ +

lorem

+ +

lorem

+ +

lorem

+ +
lorem
+ +
lorem
+. +

lorem

+

lorem

+

lorem

+

lorem

+
lorem
+
lorem
+```````````````````````````````` + +Issue #109 - tabs after setext header line + + +```````````````````````````````` example +hi +--→ +. +

hi

+```````````````````````````````` + +Issue #108 - Chinese punctuation not recognized + +```````````````````````````````` example +**。**话 +. +

**。**话

+```````````````````````````````` + +Issue jgm/cmark#177 - incorrect emphasis parsing + +```````````````````````````````` example +a***b* c* +. +

a*b c

+```````````````````````````````` + +Issue jgm/CommonMark#468 - backslash at end of link definition + + +```````````````````````````````` example +[\]: test +. +

[]: test

+```````````````````````````````` + +Issue commonmark/commonmark.js#121 - punctuation set different + +```````````````````````````````` example +^_test_ +. +

^test

+```````````````````````````````` + +Issue #116 - tabs before and after ATX closing heading +```````````````````````````````` example +# foo→#→ +. +

foo

+```````````````````````````````` + +commonmark/CommonMark#493 - escaped space not allowed in link destination. + +```````````````````````````````` example +[link](a\ b) +. +

[link](a\ b)

+```````````````````````````````` + +Issue #527 - meta tags in inline contexts + +```````````````````````````````` example +City: + + + +. +

City: + + +

+```````````````````````````````` + +Double-encoding. + +```````````````````````````````` example +[XSS](javascript&colon;alert%28'XSS'%29) +. +

XSS

+```````````````````````````````` + +PR #179 + +```````````````````````````````` example +[link](https://www.example.com/home/%25batty) +. +

link

+```````````````````````````````` + +Issue commonmark#517 - script, pre, style close tag without +opener. + +```````````````````````````````` example + + + + + +. + + + +```````````````````````````````` + +Issue #289. + +```````````````````````````````` example +[a]( +. +

[a](<b) c>

+```````````````````````````````` + +Issue #161. + +```````````````````````````````` example +*failed to be italic!*\ +text +. +

failed to be italic!
+text

+```````````````````````````````` + +Issue #196. + +```````````````````````````````` example +a +. +

a

+```````````````````````````````` + +Issue #211 + +```````````````````````````````` example +[\ +foo]: /uri + +[\ +foo] +. +


+foo

+```````````````````````````````` + +Issue #213 - type 7 blocks can't interrupt +paragraph + +```````````````````````````````` example +- +. +
    +
  • +
  • +
+```````````````````````````````` + +Issue cmark/#383 - emphasis parsing. + +```````````````````````````````` example +*****Hello*world**** +. +

**Helloworld

+```````````````````````````````` + diff --git a/commonmark/src/test/resources/spec.txt b/commonmark-test-util/src/main/resources/gfm-spec.txt similarity index 53% rename from commonmark/src/test/resources/spec.txt rename to commonmark-test-util/src/main/resources/gfm-spec.txt index bdb9569d2..d42f3369e 100644 --- a/commonmark/src/test/resources/spec.txt +++ b/commonmark-test-util/src/main/resources/gfm-spec.txt @@ -1,24 +1,120 @@ --- -title: CommonMark Spec -author: John MacFarlane -version: 0.21 -date: +title: GitHub Flavored Markdown Spec +version: 0.29 +date: '2019-04-06' license: '[CC-BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)' ... # Introduction +## What is GitHub Flavored Markdown? + +GitHub Flavored Markdown, often shortened as GFM, is the dialect of Markdown +that is currently supported for user content on GitHub.com and GitHub +Enterprise. + +This formal specification, based on the CommonMark Spec, defines the syntax and +semantics of this dialect. + +GFM is a strict superset of CommonMark. All the features which are supported in +GitHub user content and that are not specified on the original CommonMark Spec +are hence known as **extensions**, and highlighted as such. + +While GFM supports a wide range of inputs, it's worth noting that GitHub.com +and GitHub Enterprise perform additional post-processing and sanitization after +GFM is converted to HTML to ensure security and consistency of the website. + ## What is Markdown? Markdown is a plain text format for writing structured documents, -based on conventions used for indicating formatting in email and -usenet posts. It was developed in 2004 by John Gruber, who wrote -the first Markdown-to-HTML converter in perl, and it soon became -widely used in websites. By 2014 there were dozens of -implementations in many languages. 
Some of them extended basic -Markdown syntax with conventions for footnotes, definition lists, -tables, and other constructs, and some allowed output not just in -HTML but in LaTeX and many other formats. +based on conventions for indicating formatting in email +and usenet posts. It was developed by John Gruber (with +help from Aaron Swartz) and released in 2004 in the form of a +[syntax description](http://daringfireball.net/projects/markdown/syntax) +and a Perl script (`Markdown.pl`) for converting Markdown to +HTML. In the next decade, dozens of implementations were +developed in many languages. Some extended the original +Markdown syntax with conventions for footnotes, tables, and +other document elements. Some allowed Markdown documents to be +rendered in formats other than HTML. Websites like Reddit, +StackOverflow, and GitHub had millions of people using Markdown. +And Markdown started to be used beyond the web, to author books, +articles, slide shows, letters, and lecture notes. + +What distinguishes Markdown from many other lightweight markup +syntaxes, which are often easier to write, is its readability. +As Gruber writes: + +> The overriding design goal for Markdown's formatting syntax is +> to make it as readable as possible. The idea is that a +> Markdown-formatted document should be publishable as-is, as +> plain text, without looking like it's been marked up with tags +> or formatting instructions. +> () + +The point can be illustrated by comparing a sample of +[AsciiDoc](http://www.methods.co.nz/asciidoc/) with +an equivalent sample of Markdown. Here is a sample of +AsciiDoc from the AsciiDoc manual: + +``` +1. List item one. ++ +List item one continued with a second paragraph followed by an +Indented block. ++ +................. +$ ls *.sh +$ mv *.sh ~/tmp +................. ++ +List item continued with a third paragraph. + +2. List item two continued with an open block. ++ +-- +This paragraph is part of the preceding list item. + +a. 
This list is nested and does not require explicit item +continuation. ++ +This paragraph is part of the preceding list item. + +b. List item b. + +This paragraph belongs to item two of the outer list. +-- +``` + +And here is the equivalent in Markdown: +``` +1. List item one. + + List item one continued with a second paragraph followed by an + Indented block. + + $ ls *.sh + $ mv *.sh ~/tmp + + List item continued with a third paragraph. + +2. List item two continued with an open block. + + This paragraph is part of the preceding list item. + + 1. This list is nested and does not require explicit item continuation. + + This paragraph is part of the preceding list item. + + 2. List item b. + + This paragraph belongs to item two of the outer list. +``` + +The AsciiDoc version is, arguably, easier to write. You don't need +to worry about indentation. But the Markdown version is much easier +to read. The nesting of list items is apparent to the eye in the +source, not just in the processed document. ## Why is a spec needed? @@ -34,15 +130,15 @@ questions it does not answer: not require that. This is hardly a "corner case," and divergences between implementations on this issue often lead to surprises for users in real documents. (See [this comment by John - Gruber](http://article.gmane.org/gmane.text.markdown.general/1997).) + Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).) -2. Is a blank line needed before a block quote or header? +2. Is a blank line needed before a block quote or heading? Most implementations do not require the blank line. However, this can lead to unexpected results in hard-wrapped text, and also to ambiguities in parsing (note that some implementations - put the header inside the blockquote, while others do not). + put the heading inside the blockquote, while others do not). 
(John Gruber has also spoken [in favor of requiring the blank - lines](http://article.gmane.org/gmane.text.markdown.general/2146).) + lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).) 3. Is a blank line needed before an indented code block? (`Markdown.pl` requires it, but this is not mentioned in the @@ -75,7 +171,7 @@ questions it does not answer: ``` (There are some relevant comments by John Gruber - [here](http://article.gmane.org/gmane.text.markdown.general/2554).) + [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).) 5. Can list markers be indented? Can ordered list markers be right-aligned? @@ -85,8 +181,8 @@ questions it does not answer: 10. item 2a ``` -6. Is this one list with a horizontal rule in its second item, - or two lists separated by a horizontal rule? +6. Is this one list with a thematic break in its second item, + or two lists separated by a thematic break? ``` markdown * a @@ -128,8 +224,8 @@ questions it does not answer: - and it can screw things up` ``` -11. Can list items include section headers? (`Markdown.pl` does not - allow this, but does allow blockquotes to include headers.) +11. Can list items include section headings? (`Markdown.pl` does not + allow this, but does allow blockquotes to include headings.) ``` markdown - # Heading @@ -168,7 +264,7 @@ satisfactory replacement for a spec. Because there is no unambiguous spec, implementations have diverged considerably. As a result, users are often surprised to find that -a document that renders one way on one system (say, a github wiki) +a document that renders one way on one system (say, a GitHub wiki) renders differently on another (say, converting to docbook using pandoc). To make matters worse, because nothing in Markdown counts as a "syntax error," the divergence often isn't discovered right away. 
@@ -201,85 +297,102 @@ In the examples, the `→` character is used to represent tabs. ## Characters and lines -Any sequence of [character]s is a valid CommonMark +Any sequence of [characters] is a valid CommonMark document. -A [character](@character) is a unicode code point. +A [character](@) is a Unicode code point. Although some +code points (for example, combining accents) do not correspond to +characters in an intuitive sense, all code points count as characters +for purposes of this spec. + This spec does not specify an encoding; it thinks of lines as composed -of characters rather than bytes. A conforming parser may be limited +of [characters] rather than bytes. A conforming parser may be limited to a certain encoding. -A [line](@line) is a sequence of zero or more [character]s +A [line](@) is a sequence of zero or more [characters] +other than newline (`U+000A`) or carriage return (`U+000D`), followed by a [line ending] or by the end of file. -A [line ending](@line-ending) is a newline (`U+000A`), carriage return -(`U+000D`), or carriage return + newline. +A [line ending](@) is a newline (`U+000A`), a carriage return +(`U+000D`) not followed by a newline, or a carriage return and a +following newline. A line containing no characters, or a line containing only spaces -(`U+0020`) or tabs (`U+0009`), is called a [blank line](@blank-line). +(`U+0020`) or tabs (`U+0009`), is called a [blank line](@). The following definitions of character classes will be used in this spec: -A [whitespace character](@whitespace-character) is a space +A [whitespace character](@) is a space (`U+0020`), tab (`U+0009`), newline (`U+000A`), line tabulation (`U+000B`), form feed (`U+000C`), or carriage return (`U+000D`). -[Whitespace](@whitespace) is a sequence of one or more [whitespace -character]s. +[Whitespace](@) is a sequence of one or more [whitespace +characters]. 
-A [unicode whitespace character](@unicode-whitespace-character) is -any code point in the unicode `Zs` class, or a tab (`U+0009`), +A [Unicode whitespace character](@) is +any code point in the Unicode `Zs` general category, or a tab (`U+0009`), carriage return (`U+000D`), newline (`U+000A`), or form feed (`U+000C`). -[Unicode whitespace](@unicode-whitespace) is a sequence of one -or more [unicode whitespace character]s. +[Unicode whitespace](@) is a sequence of one +or more [Unicode whitespace characters]. -A [space](@space) is `U+0020`. +A [space](@) is `U+0020`. -A [non-whitespace character](@non-space-character) is any character +A [non-whitespace character](@) is any character that is not a [whitespace character]. -An [ASCII punctuation character](@ascii-punctuation-character) +An [ASCII punctuation character](@) is `!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`, -`*`, `+`, `,`, `-`, `.`, `/`, `:`, `;`, `<`, `=`, `>`, `?`, `@`, -`[`, `\`, `]`, `^`, `_`, `` ` ``, `{`, `|`, `}`, or `~`. +`*`, `+`, `,`, `-`, `.`, `/` (U+0021–2F), +`:`, `;`, `<`, `=`, `>`, `?`, `@` (U+003A–0040), +`[`, `\`, `]`, `^`, `_`, `` ` `` (U+005B–0060), +`{`, `|`, `}`, or `~` (U+007B–007E). -A [punctuation character](@punctuation-character) is an [ASCII +A [punctuation character](@) is an [ASCII punctuation character] or anything in -the unicode classes `Pc`, `Pd`, `Pe`, `Pf`, `Pi`, `Po`, or `Ps`. +the general Unicode categories `Pc`, `Pd`, `Pe`, `Pf`, `Pi`, `Po`, or `Ps`. ## Tabs -Tabs in lines are not expanded to [spaces][space]. However, -in contexts where indentation is significant for the -document's structure, tabs behave as if they were replaced -by spaces with a tab stop of 4 characters. +Tabs in lines are not expanded to [spaces]. However, +in contexts where whitespace helps to define block structure, +tabs behave as if they were replaced by spaces with a tab stop +of 4 characters. -. +Thus, for example, a tab can be used instead of four spaces +in an indented code block. 
(Note, however, that internal +tabs are passed through as literal tabs, not expanded to +spaces.) + +```````````````````````````````` example →foo→baz→→bim .
foo→baz→→bim
 
-. +```````````````````````````````` -. +```````````````````````````````` example →foo→baz→→bim .
foo→baz→→bim
 
-. +```````````````````````````````` -. +```````````````````````````````` example a→a ὐ→a .
a→a
 ὐ→a
 
-. +```````````````````````````````` -. +In the following example, a continuation paragraph of a list +item is indented with a tab; this has exactly the same effect +as indentation with four spaces would: + +```````````````````````````````` example - foo →bar @@ -290,30 +403,105 @@ by spaces with a tab stop of 4 characters.

bar

-. +```````````````````````````````` + +```````````````````````````````` example +- foo +→→bar . ->→foo→bar +
    +
  • +

    foo

    +
      bar
    +
    +
  • +
+```````````````````````````````` + +Normally the `>` that begins a block quote may be followed +optionally by a space, which is not considered part of the +content. In the following case `>` is followed by a tab, +which is treated as if it were expanded into three spaces. +Since one of these spaces is considered part of the +delimiter, `foo` is considered to be indented six spaces +inside the block quote context, so we get an indented +code block starting with two spaces. + +```````````````````````````````` example +>→→foo .
-

foo→bar

+
  foo
+
+```````````````````````````````` + +```````````````````````````````` example +-→→foo +. +
    +
  • +
      foo
    +
    +
  • +
+```````````````````````````````` + + +```````````````````````````````` example + foo +→bar +. +
foo
+bar
+
+```````````````````````````````` + +```````````````````````````````` example + - foo + - bar +→ - baz +. +
    +
  • foo +
      +
    • bar +
        +
      • baz
      • +
      +
    • +
    +
  • +
+```````````````````````````````` + +```````````````````````````````` example +#→Foo +. +

Foo

+```````````````````````````````` + +```````````````````````````````` example +*→*→*→ . +
+```````````````````````````````` ## Insecure characters For security reasons, the Unicode character `U+0000` must be replaced -with the replacement character (`U+FFFD`). +with the REPLACEMENT CHARACTER (`U+FFFD`). # Blocks and inlines We can think of a document as a sequence of -[blocks](@block)---structural elements like paragraphs, block -quotations, lists, headers, rules, and code blocks. Some blocks (like +[blocks](@)---structural elements like paragraphs, block +quotations, lists, headings, rules, and code blocks. Some blocks (like block quotes and list items) contain other blocks; others (like -headers and paragraphs) contain [inline](@inline) content---text, -links, emphasized text, images, code, and so on. +headings and paragraphs) contain [inline](@) content---text, +links, emphasized text, images, code spans, and so on. ## Precedence @@ -321,7 +509,7 @@ Indicators of block structure always take precedence over indicators of inline structure. So, for example, the following is a list with two items, not a list with one item containing a code span: -. +```````````````````````````````` example - `one - two` . @@ -329,11 +517,12 @@ two items, not a list with one item containing a code span:
  • `one
  • two`
  • -. +```````````````````````````````` + This means that parsing can proceed in two steps: first, the block structure of the document can be discerned; second, text lines inside -paragraphs, headers, and other block constructs can be parsed for inline +paragraphs, headings, and other block constructs can be parsed for inline structure. The second step requires information about link reference definitions that will be available only at the end of the first step. Note that the first step requires processing lines in sequence, @@ -343,8 +532,8 @@ one block element does not affect the inline parsing of any other. ## Container blocks and leaf blocks We can divide blocks into two types: -[container block](@container-block)s, -which can contain other blocks, and [leaf block](@leaf-block)s, +[container blocks](@), +which can contain other blocks, and [leaf blocks](@), which cannot. # Leaf blocks @@ -352,14 +541,14 @@ which cannot. This section describes the different kinds of leaf block that make up a Markdown document. -## Horizontal rules +## Thematic breaks A line consisting of 0-3 spaces of indentation, followed by a sequence of three or more matching `-`, `_`, or `*` characters, each followed -optionally by any number of spaces, forms a -[horizontal rule](@horizontal-rule). +optionally by any number of spaces or tabs, forms a +[thematic break](@). -. +```````````````````````````````` example *** --- ___ @@ -367,25 +556,28 @@ ___


    -. +```````````````````````````````` + Wrong characters: -. +```````````````````````````````` example +++ .

    +++

    -. +```````````````````````````````` -. + +```````````````````````````````` example === .

    ===

    -. +```````````````````````````````` + Not enough characters: -. +```````````````````````````````` example -- ** __ @@ -393,11 +585,12 @@ __

    -- ** __

    -. +```````````````````````````````` + One to three spaces indent are allowed: -. +```````````````````````````````` example *** *** *** @@ -405,64 +598,72 @@ One to three spaces indent are allowed:


    -. +```````````````````````````````` + Four spaces is too many: -. +```````````````````````````````` example *** .
    ***
     
    -. +```````````````````````````````` -. + +```````````````````````````````` example Foo *** .

    Foo ***

    -. +```````````````````````````````` + More than three characters may be used: -. +```````````````````````````````` example _____________________________________ .
    -. +```````````````````````````````` + Spaces are allowed between the characters: -. +```````````````````````````````` example - - - .
    -. +```````````````````````````````` -. + +```````````````````````````````` example ** * ** * ** * ** .
    -. +```````````````````````````````` -. + +```````````````````````````````` example - - - - .
    -. +```````````````````````````````` + Spaces are allowed at the end: -. +```````````````````````````````` example - - - - .
    -. +```````````````````````````````` + However, no other characters may occur in the line: -. +```````````````````````````````` example _ _ _ _ a a------ @@ -472,20 +673,22 @@ a------

    _ _ _ _ a

    a------

    ---a---

    -. +```````````````````````````````` -It is required that all of the [non-whitespace character]s be the same. -So, this is not a horizontal rule: -. +It is required that all of the [non-whitespace characters] be the same. +So, this is not a thematic break: + +```````````````````````````````` example *-* .

    -

    -. +```````````````````````````````` -Horizontal rules do not need blank lines before or after: -. +Thematic breaks do not need blank lines before or after: + +```````````````````````````````` example - foo *** - bar @@ -497,11 +700,12 @@ Horizontal rules do not need blank lines before or after:
    • bar
    -. +```````````````````````````````` -Horizontal rules can interrupt a paragraph: -. +Thematic breaks can interrupt a paragraph: + +```````````````````````````````` example Foo *** bar @@ -509,27 +713,29 @@ bar

    Foo


    bar

    -. +```````````````````````````````` + If a line of dashes that meets the above conditions for being a -horizontal rule could also be interpreted as the underline of a [setext -header], the interpretation as a -[setext header] takes precedence. Thus, for example, -this is a setext header, not a paragraph followed by a horizontal rule: +thematic break could also be interpreted as the underline of a [setext +heading], the interpretation as a +[setext heading] takes precedence. Thus, for example, +this is a setext heading, not a paragraph followed by a thematic break: -. +```````````````````````````````` example Foo --- bar .

    Foo

    bar

    -. +```````````````````````````````` -When both a horizontal rule and a list item are possible -interpretations of a line, the horizontal rule takes precedence: -. +When both a thematic break and a list item are possible +interpretations of a line, the thematic break takes precedence: + +```````````````````````````````` example * Foo * * * * Bar @@ -541,11 +747,12 @@ interpretations of a line, the horizontal rule takes precedence:
    • Bar
    -. +```````````````````````````````` -If you want a horizontal rule in a list item, use a different bullet: -. +If you want a thematic break in a list item, use a different bullet: + +```````````````````````````````` example - Foo - * * * . @@ -555,25 +762,26 @@ If you want a horizontal rule in a list item, use a different bullet:
    -. +```````````````````````````````` + -## ATX headers +## ATX headings -An [ATX header](@atx-header) +An [ATX heading](@) consists of a string of characters, parsed as inline content, between an opening sequence of 1--6 unescaped `#` characters and an optional -closing sequence of any number of `#` characters. The opening sequence -of `#` characters cannot be followed directly by a -[non-whitespace character]. The optional closing sequence of `#`s must be +closing sequence of any number of unescaped `#` characters. +The opening sequence of `#` characters must be followed by a +[space] or by the end of line. The optional closing sequence of `#`s must be preceded by a [space] and may be followed by spaces only. The opening `#` character may be indented 0-3 spaces. The raw contents of the -header are stripped of leading and trailing spaces before being parsed -as inline content. The header level is equal to the number of `#` +heading are stripped of leading and trailing spaces before being parsed +as inline content. The heading level is equal to the number of `#` characters in the opening sequence. -Simple headers: +Simple headings: -. +```````````````````````````````` example # foo ## foo ### foo @@ -587,60 +795,66 @@ Simple headers:

    foo

    foo
    foo
    -. +```````````````````````````````` -More than six `#` characters is not a header: -. +More than six `#` characters is not a heading: + +```````````````````````````````` example ####### foo .

    ####### foo

    -. +```````````````````````````````` + At least one space is required between the `#` characters and the -header's contents, unless the header is empty. Note that many +heading's contents, unless the heading is empty. Note that many implementations currently do not require the space. However, the space was required by the [original ATX implementation](http://www.aaronsw.com/2002/atx/atx.py), and it helps prevent things like the following from being parsed as -headers: +headings: -. +```````````````````````````````` example #5 bolt -#foobar +#hashtag .

    #5 bolt

    -

    #foobar

    -. +

    #hashtag

    +```````````````````````````````` -This is not a header, because the first `#` is escaped: -. +This is not a heading, because the first `#` is escaped: + +```````````````````````````````` example \## foo .

    ## foo

    -. +```````````````````````````````` + Contents are parsed as inlines: -. +```````````````````````````````` example # foo *bar* \*baz\* .

    foo bar *baz*

    -. +```````````````````````````````` -Leading and trailing blanks are ignored in parsing inline content: -. +Leading and trailing [whitespace] is ignored in parsing inline content: + +```````````````````````````````` example # foo .

    foo

    -. +```````````````````````````````` + One to three spaces indentation are allowed: -. +```````````````````````````````` example ### foo ## foo # foo @@ -648,76 +862,83 @@ One to three spaces indentation are allowed:

    foo

    foo

    foo

    -. +```````````````````````````````` + Four spaces are too much: -. +```````````````````````````````` example # foo .
    # foo
     
    -. +```````````````````````````````` -. + +```````````````````````````````` example foo # bar .

    foo # bar

    -. +```````````````````````````````` + A closing sequence of `#` characters is optional: -. +```````````````````````````````` example ## foo ## ### bar ### .

    foo

    bar

    -. +```````````````````````````````` + It need not be the same length as the opening sequence: -. +```````````````````````````````` example # foo ################################## ##### foo ## .

    foo

    foo
    -. +```````````````````````````````` + Spaces are allowed after the closing sequence: -. +```````````````````````````````` example ### foo ### .

    foo

    -. +```````````````````````````````` + -A sequence of `#` characters with a -[non-whitespace character] following it +A sequence of `#` characters with anything but [spaces] following it is not a closing sequence, but counts as part of the contents of the -header: +heading: -. +```````````````````````````````` example ### foo ### b .

    foo ### b

    -. +```````````````````````````````` + The closing sequence must be preceded by a space: -. +```````````````````````````````` example # foo# .

    foo#

    -. +```````````````````````````````` + Backslash-escaped `#` characters do not count as part of the closing sequence: -. +```````````````````````````````` example ### foo \### ## foo #\## # foo \# @@ -725,12 +946,13 @@ of the closing sequence:

    foo ###

    foo ###

    foo #

    -. +```````````````````````````````` -ATX headers need not be separated from surrounding content by blank + +ATX headings need not be separated from surrounding content by blank lines, and they can interrupt paragraphs: -. +```````````````````````````````` example **** ## foo **** @@ -738,9 +960,10 @@ lines, and they can interrupt paragraphs:

    foo


    -. +```````````````````````````````` -. + +```````````````````````````````` example Foo bar # baz Bar foo @@ -748,11 +971,12 @@ Bar foo

    Foo bar

    baz

    Bar foo

    -. +```````````````````````````````` -ATX headers can be empty: -. +ATX headings can be empty: + +```````````````````````````````` example ## # ### ### @@ -760,40 +984,39 @@ ATX headers can be empty:

    -. +```````````````````````````````` + -## Setext headers +## Setext headings -A [setext header](@setext-header) -consists of a line of text, containing at least one [non-whitespace character], -with no more than 3 spaces indentation, followed by a [setext header -underline]. The line of text must be -one that, were it not followed by the setext header underline, -would be interpreted as part of a paragraph: it cannot be -interpretable as a [code fence], [ATX header][ATX headers], -[block quote][block quotes], [horizontal rule][horizontal rules], +A [setext heading](@) consists of one or more +lines of text, each containing at least one [non-whitespace +character], with no more than 3 spaces indentation, followed by +a [setext heading underline]. The lines of text must be such +that, were they not followed by the setext heading underline, +they would be interpreted as a paragraph: they cannot be +interpretable as a [code fence], [ATX heading][ATX headings], +[block quote][block quotes], [thematic break][thematic breaks], [list item][list items], or [HTML block][HTML blocks]. -A [setext header underline](@setext-header-underline) is a sequence of +A [setext heading underline](@) is a sequence of `=` characters or a sequence of `-` characters, with no more than 3 -spaces indentation and any number of trailing spaces. If a line -containing a single `-` can be interpreted as an -empty [list items], it should be interpreted this way -and not as a [setext header underline]. +spaces of indentation and any number of trailing spaces or tabs. -The header is a level 1 header if `=` characters are used in the -[setext header underline], and a level 2 -header if `-` characters are used. The contents of the header are the -result of parsing the first line as Markdown inline content. +The heading is a level 1 heading if `=` characters are used in +the [setext heading underline], and a level 2 heading if `-` +characters are used. 
The contents of the heading are the result +of parsing the preceding lines of text as CommonMark inline +content. -In general, a setext header need not be preceded or followed by a +In general, a setext heading need not be preceded or followed by a blank line. However, it cannot interrupt a paragraph, so when a -setext header comes after a paragraph, a blank line is needed between +setext heading comes after a paragraph, a blank line is needed between them. Simple examples: -. +```````````````````````````````` example Foo *bar* ========= @@ -802,11 +1025,38 @@ Foo *bar* .

    Foo bar

    Foo bar

    +```````````````````````````````` + + +The content of the heading may span more than one line: + +```````````````````````````````` example +Foo *bar +baz* +==== . +

    Foo bar +baz

    +```````````````````````````````` + +The contents are the result of parsing the heading's raw +content as inlines. The heading's raw content is formed by +concatenating the lines and removing initial and final +[whitespace]. +```````````````````````````````` example + Foo *bar +baz*→ +==== . +

    Foo bar +baz

    +```````````````````````````````` + + +The underlining can be any length: + +```````````````````````````````` example Foo ------------------------- @@ -815,12 +1065,13 @@ Foo .

    Foo

    Foo

    -. +```````````````````````````````` + -The header content can be indented up to three spaces, and need +The heading content can be indented up to three spaces, and need not line up with the underlining: -. +```````````````````````````````` example Foo --- @@ -833,11 +1084,12 @@ not line up with the underlining:

    Foo

    Foo

    Foo

    -. +```````````````````````````````` + Four spaces indent is too much: -. +```````````````````````````````` example Foo --- @@ -850,31 +1102,34 @@ Four spaces indent is too much: Foo
    -. +```````````````````````````````` + -The setext header underline can be indented up to three spaces, and +The setext heading underline can be indented up to three spaces, and may have trailing spaces: -. +```````````````````````````````` example Foo ---- .

    Foo

    -. +```````````````````````````````` + Four spaces is too much: -. +```````````````````````````````` example Foo --- .

    Foo ---

    -. +```````````````````````````````` -The setext header underline cannot contain internal spaces: -. +The setext heading underline cannot contain internal spaces: + +```````````````````````````````` example Foo = = @@ -885,30 +1140,33 @@ Foo = =

    Foo


    -. +```````````````````````````````` + Trailing spaces in the content line do not cause a line break: -. +```````````````````````````````` example Foo ----- .

    Foo

    -. +```````````````````````````````` + Nor does a backslash at the end: -. +```````````````````````````````` example Foo\ ---- .

    Foo\

    -. +```````````````````````````````` + Since indicators of block structure take precedence over -indicators of inline structure, the following are setext headers: +indicators of inline structure, the following are setext headings: -. +```````````````````````````````` example `Foo ---- ` @@ -921,12 +1179,13 @@ of dashes"/>

    `

    <a title="a lot

    of dashes"/>

    -. +```````````````````````````````` + -The setext header underline cannot be a [lazy continuation +The setext heading underline cannot be a [lazy continuation line] in a list item or block quote: -. +```````````````````````````````` example > Foo --- . @@ -934,9 +1193,23 @@ line] in a list item or block quote:

    Foo


    -. +```````````````````````````````` + +```````````````````````````````` example +> foo +bar +=== . +
    +

    foo +bar +===

    +
    +```````````````````````````````` + + +```````````````````````````````` example - Foo --- . @@ -944,33 +1217,30 @@ line] in a list item or block quote:
  • Foo

  • -. +```````````````````````````````` -A setext header cannot interrupt a paragraph: -. +A blank line is needed between a paragraph and a following +setext heading, since otherwise the paragraph becomes part +of the heading's content: + +```````````````````````````````` example Foo Bar --- +. +

    Foo +Bar

    +```````````````````````````````` + +But in general a blank line is not required before or after +setext headings: + +```````````````````````````````` example +--- Foo -Bar -=== -. -

    Foo -Bar

    -
    -

    Foo -Bar -===

    -. - -But in general a blank line is not required before or after: - -. ---- -Foo ---- +--- Bar --- Baz @@ -979,30 +1249,33 @@ Baz

    Foo

    Bar

    Baz

    -. +```````````````````````````````` -Setext headers cannot be empty: -. +Setext headings cannot be empty: + +```````````````````````````````` example ==== .

    ====

    -. +```````````````````````````````` -Setext header text lines must not be interpretable as block + +Setext heading text lines must not be interpretable as block constructs other than paragraphs. So, the line of dashes -in these examples gets interpreted as a horizontal rule: +in these examples gets interpreted as a thematic break: -. +```````````````````````````````` example --- --- .

    -. +```````````````````````````````` -. + +```````````````````````````````` example - foo ----- . @@ -1010,18 +1283,20 @@ in these examples gets interpreted as a horizontal rule:
  • foo

  • -. +```````````````````````````````` -. + +```````````````````````````````` example foo --- .
    foo
     

    -. +```````````````````````````````` -. + +```````````````````````````````` example > foo ----- . @@ -1029,26 +1304,113 @@ in these examples gets interpreted as a horizontal rule:

    foo


    -. +```````````````````````````````` -If you want a header with `> foo` as its literal text, you can + +If you want a heading with `> foo` as its literal text, you can use backslash escapes: -. +```````````````````````````````` example \> foo ------ .

    > foo

    +```````````````````````````````` + + +**Compatibility note:** Most existing Markdown implementations +do not allow the text of setext headings to span multiple lines. +But there is no consensus about how to interpret + +``` markdown +Foo +bar +--- +baz +``` + +One can find four different interpretations: + +1. paragraph "Foo", heading "bar", paragraph "baz" +2. paragraph "Foo bar", thematic break, paragraph "baz" +3. paragraph "Foo bar --- baz" +4. heading "Foo bar", paragraph "baz" + +We find interpretation 4 most natural, and interpretation 4 +increases the expressive power of CommonMark, by allowing +multiline headings. Authors who want interpretation 1 can +put a blank line after the first paragraph: + +```````````````````````````````` example +Foo + +bar +--- +baz +. +

    Foo

    +

    bar

    +

    baz

    +```````````````````````````````` + + +Authors who want interpretation 2 can put blank lines around +the thematic break, + +```````````````````````````````` example +Foo +bar + +--- + +baz +. +

    Foo +bar

    +
    +

    baz

    +```````````````````````````````` + + +or use a thematic break that cannot count as a [setext heading +underline], such as + +```````````````````````````````` example +Foo +bar +* * * +baz +. +

    Foo +bar

    +
    +

    baz

    +```````````````````````````````` + + +Authors who want interpretation 3 can use backslash escapes: + +```````````````````````````````` example +Foo +bar +\--- +baz . +

    Foo +bar +--- +baz

    +```````````````````````````````` + ## Indented code blocks -An [indented code block](@indented-code-block) is composed of one or more -[indented chunk]s separated by blank lines. -An [indented chunk](@indented-chunk) is a sequence of non-blank lines, +An [indented code block](@) is composed of one or more +[indented chunks] separated by blank lines. +An [indented chunk](@) is a sequence of non-blank lines, each indented four or more spaces. The contents of the code block are the literal contents of the lines, including trailing -[line ending]s, minus four spaces of indentation. +[line endings], minus four spaces of indentation. An indented code block has no [info string]. An indented code block cannot interrupt a paragraph, so there must be @@ -1056,20 +1418,21 @@ a blank line between a paragraph and a following indented code block. (A blank line is not needed, however, between a code block and a following paragraph.) -. +```````````````````````````````` example a simple indented code block .
    a simple
       indented code block
     
    -. +```````````````````````````````` + If there is any ambiguity between an interpretation of indentation as a code block and as indicating that material belongs to a [list item][list items], the list item interpretation takes precedence: -. +```````````````````````````````` example - foo bar @@ -1080,9 +1443,10 @@ item][list items], the list item interpretation takes precedence:

    bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example 1. foo - bar @@ -1095,13 +1459,14 @@ item][list items], the list item interpretation takes precedence: -. +```````````````````````````````` + The contents of a code block are literal text, and do not get parsed as Markdown: -. +```````````````````````````````` example *hi* @@ -1112,11 +1477,12 @@ as Markdown: - one -. +```````````````````````````````` + Here we have three chunks separated by blank lines: -. +```````````````````````````````` example chunk1 chunk2 @@ -1133,12 +1499,13 @@ chunk2 chunk3 -. +```````````````````````````````` + Any initial spaces beyond four will be included in the content, even in interior blank lines: -. +```````````````````````````````` example chunk1 chunk2 @@ -1147,68 +1514,73 @@ in interior blank lines: chunk2 -. +```````````````````````````````` + An indented code block cannot interrupt a paragraph. (This allows hanging indents and the like.) -. +```````````````````````````````` example Foo bar .

    Foo bar

    -. +```````````````````````````````` + However, any non-blank line with fewer than four leading spaces ends the code block immediately. So a paragraph may occur immediately after indented code: -. +```````````````````````````````` example foo bar .
    foo
     

    bar

    -. +```````````````````````````````` + And indented code can occur immediately before and after other kinds of blocks: -. -# Header +```````````````````````````````` example +# Heading foo -Header +Heading ------ foo ---- . -

    Header

    +

    Heading

    foo
     
    -

    Header

    +

    Heading

    foo
     

    -. +```````````````````````````````` + The first line can be indented more than four spaces: -. +```````````````````````````````` example foo bar .
        foo
     bar
     
    -. +```````````````````````````````` + Blank lines preceding or following an indented code block are not included in it: -. +```````````````````````````````` example foo @@ -1217,30 +1589,32 @@ are not included in it: .
    foo
     
    -. +```````````````````````````````` + Trailing spaces are included in the code block's content: -. +```````````````````````````````` example foo .
    foo  
     
    -. +```````````````````````````````` + ## Fenced code blocks -A [code fence](@code-fence) is a sequence +A [code fence](@) is a sequence of at least three consecutive backtick characters (`` ` ``) or tildes (`~`). (Tildes and backticks cannot be mixed.) -A [fenced code block](@fenced-code-block) +A [fenced code block](@) begins with a code fence, indented no more than three spaces. The line with the opening code fence may optionally contain some text following the code fence; this is trimmed of leading and trailing -spaces and called the [info string](@info-string). -The [info string] may not contain any backtick +whitespace and called the [info string](@). If the [info string] comes +after a backtick fence, it may not contain any backtick characters. (The reason for this restriction is that otherwise some inline code would be incorrectly interpreted as the beginning of a fenced code block.) @@ -1261,7 +1635,7 @@ has been found, the code block contains all of the lines after the opening code fence until the end of the containing block (or document). (An alternative spec would require backtracking in the event that a closing code fence is not found. But this makes parsing -much less efficient, and there seems to be no real down side to the +much less efficient, and there seems to be no real downside to the behavior described here.) A fenced code block may interrupt a paragraph, and does not require @@ -1275,7 +1649,7 @@ particular treatment of the [info string]. Here is a simple example with backticks: -. +```````````````````````````````` example ``` < > @@ -1284,11 +1658,12 @@ Here is a simple example with backticks:
    <
      >
     
    -. +```````````````````````````````` + With tildes: -. +```````````````````````````````` example ~~~ < > @@ -1297,12 +1672,22 @@ With tildes:
    <
      >
     
    +```````````````````````````````` + +Fewer than three backticks is not enough: + +```````````````````````````````` example +`` +foo +`` . +

    foo

    +```````````````````````````````` The closing code fence must use the same character as the opening fence: -. +```````````````````````````````` example ``` aaa ~~~ @@ -1311,9 +1696,10 @@ aaa
    aaa
     ~~~
     
    -. +```````````````````````````````` -. + +```````````````````````````````` example ~~~ aaa ``` @@ -1322,11 +1708,12 @@ aaa
    aaa
     ```
     
    -. +```````````````````````````````` + The closing code fence must be at least as long as the opening fence: -. +```````````````````````````````` example ```` aaa ``` @@ -1335,9 +1722,10 @@ aaa
    aaa
     ```
     
    -. +```````````````````````````````` -. + +```````````````````````````````` example ~~~~ aaa ~~~ @@ -1346,18 +1734,20 @@ aaa
    aaa
     ~~~
     
    -. +```````````````````````````````` + Unclosed code blocks are closed by the end of the document -(or the enclosing [block quote] or [list item]): +(or the enclosing [block quote][block quotes] or [list item][list items]): -. +```````````````````````````````` example ``` .
    -. +```````````````````````````````` -. + +```````````````````````````````` example ````` ``` @@ -1367,9 +1757,10 @@ aaa ``` aaa -. +```````````````````````````````` -. + +```````````````````````````````` example > ``` > aaa @@ -1380,11 +1771,12 @@ bbb

    bbb

    -. +```````````````````````````````` + A code block can have all empty lines as its content: -. +```````````````````````````````` example ``` @@ -1393,22 +1785,24 @@ A code block can have all empty lines as its content:
    
       
     
    -. +```````````````````````````````` + A code block can be empty: -. +```````````````````````````````` example ``` ``` .
    -. +```````````````````````````````` + Fences can be indented. If the opening fence is indented, content lines will have equivalent opening indentation removed, if present: -. +```````````````````````````````` example ``` aaa aaa @@ -1417,9 +1811,10 @@ aaa
    aaa
     aaa
     
    -. +```````````````````````````````` -. + +```````````````````````````````` example ``` aaa aaa @@ -1430,9 +1825,10 @@ aaa aaa aaa -. +```````````````````````````````` -. + +```````````````````````````````` example ``` aaa aaa @@ -1443,11 +1839,12 @@ aaa aaa aaa -. +```````````````````````````````` + Four spaces indentation produces an indented code block: -. +```````````````````````````````` example ``` aaa ``` @@ -1456,32 +1853,35 @@ Four spaces indentation produces an indented code block: aaa ``` -. +```````````````````````````````` + Closing fences may be indented by 0-3 spaces, and their indentation need not match that of the opening fence: -. +```````````````````````````````` example ``` aaa ``` .
    aaa
     
    -. +```````````````````````````````` -. + +```````````````````````````````` example ``` aaa ``` .
    aaa
     
    -. +```````````````````````````````` + This is not a closing fence, because it is indented 4 spaces: -. +```````````````````````````````` example ``` aaa ``` @@ -1489,20 +1889,22 @@ aaa
    aaa
         ```
     
    -. +```````````````````````````````` + Code fences (opening and closing) cannot contain internal spaces: -. +```````````````````````````````` example ``` ``` aaa . -

    +

    aaa

    -. +```````````````````````````````` -. + +```````````````````````````````` example ~~~~~~ aaa ~~~ ~~ @@ -1510,12 +1912,13 @@ aaa
    aaa
     ~~~ ~~
     
    -. +```````````````````````````````` + Fenced code blocks can interrupt paragraphs, and can be followed directly by paragraphs, without a blank line between: -. +```````````````````````````````` example foo ``` bar @@ -1526,12 +1929,13 @@ baz
    bar
     

    baz

    -. +```````````````````````````````` + Other blocks can also occur before and after fenced code blocks without an intervening blank line: -. +```````````````````````````````` example foo --- ~~~ @@ -1543,14 +1947,17 @@ bar
    bar
     

    baz

    -. +```````````````````````````````` + An [info string] can be provided after the opening code fence. -Opening and closing spaces will be stripped, and the first word, prefixed -with `language-`, is used as the value for the `class` attribute of the -`code` element within the enclosing `pre` element. +Although this spec doesn't mandate any particular treatment of +the info string, the first word is typically used to specify +the language of the code block. In HTML output, the language is +normally indicated by adding a class to the `code` element consisting +of `language-` followed by the language name. -. +```````````````````````````````` example ```ruby def foo(x) return 3 @@ -1561,9 +1968,10 @@ end return 3 end -. +```````````````````````````````` -. + +```````````````````````````````` example ~~~~ ruby startline=3 $%@#$ def foo(x) return 3 @@ -1574,50 +1982,67 @@ end return 3 end -. +```````````````````````````````` -. + +```````````````````````````````` example ````; ```` .
    -. +```````````````````````````````` -[Info string]s for backtick code blocks cannot contain backticks: -. +[Info strings] for backtick code blocks cannot contain backticks: + +```````````````````````````````` example ``` aa ``` foo .

    aa foo

    -. +```````````````````````````````` + -Closing code fences cannot have [info string]s: +[Info strings] for tilde code blocks can contain backticks and tildes: +```````````````````````````````` example +~~~ aa ``` ~~~ +foo +~~~ . +
    foo
    +
    +```````````````````````````````` + + +Closing code fences cannot have [info strings]: + +```````````````````````````````` example ``` ``` aaa ``` .
    ``` aaa
     
    -. +```````````````````````````````` + ## HTML blocks -An [HTML block](@html-block) is a group of lines that is treated +An [HTML block](@) is a group of lines that is treated as raw HTML (and will not be escaped in HTML output). -There are seven kinds of [HTML block], which can be defined -by their start and end conditions. The block begins with a line that -meets a [start condition](@start-condition) (after up to three spaces -optional indentation). It ends with the first subsequent line that -meets a matching [end condition](@end-condition), or the last line of -the document, if no line is encountered that meets the -[end condition]. If the first line meets both the [start condition] -and the [end condition], the block will contain just that line. +There are seven kinds of [HTML block], which can be defined by their +start and end conditions. The block begins with a line that meets a +[start condition](@) (after up to three spaces optional indentation). +It ends with the first subsequent line that meets a matching [end +condition](@), or the last line of the document, or the last line of +the [container block](#container-blocks) containing the current HTML +block, if no line is encountered that meets the [end condition]. If +the first line meets both the [start condition] and the [end +condition], the block will contain just that line. 1. **Start condition:** line begins with the string ``. -6. **Start condition:** line begins the string `<` or ``, or the string `/>`.\ **End condition:** line is followed by a [blank line]. -7. **Start condition:** line begins with an [open tag] -(with any [tag name]) followed only by [whitespace] or the end -of the line.\ +7. **Start condition:** line begins with a complete [open tag] +(with any [tag name] other than `script`, +`style`, or `pre`) or a complete [closing tag], +followed only by [whitespace] or the end of the line.\ **End condition:** line is followed by a [blank line]. 
+HTML blocks continue until they are closed by their appropriate +[end condition], or the last line of the document or other [container +block](#container-blocks). This means any HTML **within an HTML +block** that might otherwise be recognised as a start condition will +be ignored by the parser and passed through as-is, without changing +the parser's state. + +For instance, `
    <pre>` within an HTML block started by `<table>` will not affect
    +the parser state; as the HTML block was started by start condition 6, it
    +will end at any blank line. This can be surprising:
    +
    +```````````````````````````````` example
    +
    +
    +**Hello**,
    +
    +_world_.
    +
    +
    +. +
    +
    +**Hello**,
    +

    world. +

    +
    +```````````````````````````````` + +In this case, the HTML block is terminated by the blank line — the `**Hello**` +text remains verbatim — and regular parsing resumes, with a paragraph, +emphasised `world` and inline and block HTML following. + All types of [HTML blocks] except type 7 may interrupt a paragraph. Blocks of type 7 may not interrupt a paragraph. -(This restricted is intended to prevent unwanted interpretation +(This restriction is intended to prevent unwanted interpretation of long tags inside a wrapped paragraph as starting HTML blocks.) Some simple examples follow. Here are some basic HTML blocks of type 6: -. +```````````````````````````````` example
    @@ -1686,9 +2145,10 @@ okay.

    okay.

    -. +```````````````````````````````` -. + +```````````````````````````````` example
    *foo* -. +```````````````````````````````` + Here we have two HTML blocks with a Markdown paragraph between them: -. +```````````````````````````````` example
    *Markdown* @@ -1720,12 +2182,13 @@ Here we have two HTML blocks with a Markdown paragraph between them:

    Markdown

    -. +```````````````````````````````` + The tag on the first line can be partial, as long as it is split where there would be whitespace: -. +```````````````````````````````` example
    @@ -1733,9 +2196,10 @@ as it is split where there would be whitespace:
    -. +```````````````````````````````` -. + +```````````````````````````````` example
    @@ -1743,10 +2207,11 @@ as it is split where there would be whitespace:
    -. +```````````````````````````````` + An open tag need not be closed: -. +```````````````````````````````` example
    *foo* @@ -1755,49 +2220,54 @@ An open tag need not be closed:
    *foo*

    bar

    -. +```````````````````````````````` + A partial tag need not even be completed (garbage in, garbage out): -. +```````````````````````````````` example
    . -. +```````````````````````````````` -. + +```````````````````````````````` example
    foo
    @@ -1805,7 +2275,8 @@ foo
    foo
    -. +```````````````````````````````` + Everything until the next blank line or end of document gets included in the HTML block. So, in the following @@ -1813,7 +2284,7 @@ example, what looks like a Markdown code block is actually part of the HTML block, which continues until a blank line or the end of the document is reached: -. +```````````````````````````````` example
    ``` c int x = 33; @@ -1823,13 +2294,14 @@ int x = 33; ``` c int x = 33; ``` -. +```````````````````````````````` + To start an [HTML block] with a tag that is *not* in the list of block-level tags in (6), you must put the tag by itself on the first line (and it must be complete): -. +```````````````````````````````` example *bar* @@ -1837,11 +2309,12 @@ itself on the first line (and it must be complete): *bar* -. +```````````````````````````````` + In type 7 blocks, the [tag name] can be anything: -. +```````````````````````````````` example *bar* @@ -1849,9 +2322,10 @@ In type 7 blocks, the [tag name] can be anything: *bar* -. +```````````````````````````````` -. + +```````````````````````````````` example *bar* @@ -1859,7 +2333,17 @@ In type 7 blocks, the [tag name] can be anything: *bar* +```````````````````````````````` + + +```````````````````````````````` example + +*bar* . + +*bar* +```````````````````````````````` + These rules are designed to allow us to work with tags that can function as either block-level or inline-level tags. @@ -1867,7 +2351,7 @@ The `` tag is a nice example. We can surround content with `` tags in three different ways. In this case, we get a raw HTML block, because the `` tag is on a line by itself: -. +```````````````````````````````` example *foo* @@ -1875,13 +2359,14 @@ HTML block, because the `` tag is on a line by itself: *foo* -. +```````````````````````````````` + In this case, we get a raw HTML block that just includes the `` tag (because it ends with the following blank line). So the contents get interpreted as CommonMark: -. +```````````````````````````````` example *foo* @@ -1891,18 +2376,20 @@ line). So the contents get interpreted as CommonMark:

    foo

    -. +```````````````````````````````` + Finally, in this case, the `` tags are interpreted as [raw HTML] *inside* the CommonMark paragraph. (Because the tag is not on a line by itself, we get inline HTML rather than an [HTML block].) -. +```````````````````````````````` example *foo* .

    foo

    -. +```````````````````````````````` + HTML tags designed to contain literal content (`script`, `style`, `pre`), comments, processing instructions, @@ -1913,13 +2400,14 @@ As a result, these blocks can contain blank lines: A pre tag (type 1): -. +```````````````````````````````` example
    
     import Text.HTML.TagSoup
     
     main :: IO ()
     main = print $ parseTags tags
     
    +okay .
    
     import Text.HTML.TagSoup
    @@ -1927,33 +2415,39 @@ import Text.HTML.TagSoup
     main :: IO ()
     main = print $ parseTags tags
     
    -. +

    okay

    +```````````````````````````````` + A script tag (type 1): -. +```````````````````````````````` example +okay . -. +

    okay

    +```````````````````````````````` + A style tag (type 1): -. +```````````````````````````````` example +okay . -. +

    okay

    +```````````````````````````````` + If there is no matching end tag, the block will end at the -end of the document (or the enclosing [block quote] or -[list item]): +end of the document (or the enclosing [block quote][block quotes] +or [list item][list items]): -. +```````````````````````````````` example *foo* .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example *bar* *baz* . *bar*

    baz

    -. +```````````````````````````````` + Note that anything on the last line after the end tag will be included in the [HTML block]: -. +```````````````````````````````` example 1. *bar* @@ -2033,50 +2534,58 @@ foo 1. *bar* -. +```````````````````````````````` + A comment (type 2): -. +```````````````````````````````` example +okay . -. +

    okay

    +```````````````````````````````` + A processing instruction (type 3): -. +```````````````````````````````` example '; ?> +okay . '; ?> -. +

    okay

    +```````````````````````````````` + A declaration (type 4): -. +```````````````````````````````` example . -. +```````````````````````````````` + CDATA (type 5): -. +```````````````````````````````` example +okay . -. +

    okay

    +```````````````````````````````` + The opening tag can be indented 1-3 spaces, but not 4: -. +```````````````````````````````` example @@ -2114,9 +2626,10 @@ The opening tag can be indented 1-3 spaces, but not 4:
    <!-- foo -->
     
    -. +```````````````````````````````` -. + +```````````````````````````````` example
    @@ -2124,12 +2637,13 @@ The opening tag can be indented 1-3 spaces, but not 4:
    <div>
     
    -. +```````````````````````````````` + An HTML block of types 1--6 can interrupt a paragraph, and need not be preceded by a blank line. -. +```````````````````````````````` example Foo
    bar @@ -2139,12 +2653,14 @@ bar
    bar
    -. +```````````````````````````````` + However, a following blank line is needed, except at the end of -a document, and except for blocks of types 1--5, above: +a document, and except for blocks of types 1--5, [above][HTML +block]: -. +```````````````````````````````` example
    bar
    @@ -2154,11 +2670,12 @@ bar bar
    *foo* -. +```````````````````````````````` + HTML blocks of type 7 cannot interrupt a paragraph: -. +```````````````````````````````` example Foo baz @@ -2166,7 +2683,8 @@ baz

    Foo baz

    -. +```````````````````````````````` + This rule differs from John Gruber's original Markdown syntax specification, which says: @@ -2198,7 +2716,7 @@ simply separate the Markdown from the HTML using blank lines: Compare: -. +```````````````````````````````` example
    *Emphasized* text. @@ -2208,9 +2726,10 @@ Compare:

    Emphasized text.

    -. +```````````````````````````````` -. + +```````````````````````````````` example
    *Emphasized* text.
    @@ -2218,7 +2737,8 @@ Compare:
    *Emphasized* text.
    -. +```````````````````````````````` + Some Markdown implementations have adopted a convention of interpreting content inside tags as text if the open tag has @@ -2231,7 +2751,7 @@ blocks into Markdown documents with 100% reliability. However, *in most cases* this will work fine, because the blank lines in HTML are usually followed by HTML block tags. For example: -. +```````````````````````````````` example @@ -2251,13 +2771,14 @@ Hi
    -. +```````````````````````````````` + There are problems, however, if the inner tags are indented *and* separated by spaces, as then they will be interpreted as an indented code block: -. +```````````````````````````````` example @@ -2278,16 +2799,17 @@ an indented code block:
    -. +```````````````````````````````` + Fortunately, blank lines are usually not necessary and can be deleted. The exception is inside `
    <pre>` tags, but as described
    -above, raw HTML blocks starting with `<pre>` *can* contain blank
    -lines.
    +[above][HTML blocks], raw HTML blocks starting with `<pre>`
    +*can* contain blank lines.
     
     ## Link reference definitions
     
    -A [link reference definition](@link-reference-definition)
    +A [link reference definition](@)
     consists of a [link label], indented up to three spaces, followed
     by a colon (`:`), optional [whitespace] (including up to one
     [line ending]), a [link destination],
    @@ -2295,24 +2817,25 @@ optional [whitespace] (including up to one
     [line ending]), and an optional [link
     title], which if it is present must be separated
     from the [link destination] by [whitespace].
    -No further [non-whitespace character]s may occur on the line.
    +No further [non-whitespace characters] may occur on the line.
     
     A [link reference definition]
     does not correspond to a structural element of a document.  Instead, it
    -defines a label which can be used in [reference link]s
    +defines a label which can be used in [reference links]
     and reference-style [images] elsewhere in the document.  [Link
     reference definitions] can come either before or after the links that use
     them.
     
    -.
    +```````````````````````````````` example
     [foo]: /url "title"
     
     [foo]
     .
     

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example [foo]: /url 'the title' @@ -2320,17 +2843,19 @@ them. [foo] .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example [Foo*bar\]]:my_(url) 'title (with parens)' [Foo*bar\]] .

    Foo*bar]

    -. +```````````````````````````````` -. + +```````````````````````````````` example [Foo bar]: 'title' @@ -2338,11 +2863,12 @@ them. [Foo bar] .

    Foo bar

    -. +```````````````````````````````` + The title may extend over multiple lines: -. +```````````````````````````````` example [foo]: /url ' title line1 @@ -2356,11 +2882,12 @@ title line1 line2 ">foo

    -. +```````````````````````````````` + However, it may not contain a [blank line]: -. +```````````````````````````````` example [foo]: /url 'title with blank line' @@ -2370,123 +2897,158 @@ with blank line'

    [foo]: /url 'title

    with blank line'

    [foo]

    -. +```````````````````````````````` + The title may be omitted: -. +```````````````````````````````` example [foo]: /url [foo] .

    foo

    -. +```````````````````````````````` + The link destination may not be omitted: -. +```````````````````````````````` example [foo]: [foo] .

    [foo]:

    [foo]

    +```````````````````````````````` + + However, an empty link destination may be specified using + angle brackets: + +```````````````````````````````` example +[foo]: <> + +[foo] +. +

    foo

    +```````````````````````````````` + +The title must be separated from the link destination by +whitespace: + +```````````````````````````````` example +[foo]: (baz) + +[foo] . +

    [foo]: (baz)

    +

    [foo]

    +```````````````````````````````` + Both title and destination can contain backslash escapes and literal backslashes: -. +```````````````````````````````` example [foo]: /url\bar\*baz "foo\"bar\baz" [foo] .

    foo

    -. +```````````````````````````````` + A link can come before its corresponding definition: -. +```````````````````````````````` example [foo] [foo]: url .

    foo

    -. +```````````````````````````````` + If there are several matching definitions, the first one takes precedence: -. +```````````````````````````````` example [foo] [foo]: first [foo]: second .

    foo

    -. +```````````````````````````````` + As noted in the section on [Links], matching of labels is case-insensitive (see [matches]). -. +```````````````````````````````` example [FOO]: /url [Foo] .

    Foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example [ΑΓΩ]: /φου [αγω] .

    αγω

    -. +```````````````````````````````` + Here is a link reference definition with no corresponding link. It contributes nothing to the document. -. +```````````````````````````````` example [foo]: /url . -. +```````````````````````````````` + Here is another one: -. +```````````````````````````````` example [ foo ]: /url bar .

    bar

    -. +```````````````````````````````` + This is not a link reference definition, because there are -[non-whitespace character]s after the title: +[non-whitespace characters] after the title: -. +```````````````````````````````` example [foo]: /url "title" ok .

    [foo]: /url "title" ok

    -. +```````````````````````````````` + This is a link reference definition, but it has no title: -. +```````````````````````````````` example [foo]: /url "title" ok .

    "title" ok

    -. +```````````````````````````````` + This is not a link reference definition, because it is indented four spaces: -. +```````````````````````````````` example [foo]: /url "title" [foo] @@ -2494,12 +3056,13 @@ four spaces:
    [foo]: /url "title"
     

    [foo]

    -. +```````````````````````````````` + This is not a link reference definition, because it occurs inside a code block: -. +```````````````````````````````` example ``` [foo]: /url ``` @@ -2509,11 +3072,12 @@ a code block:
    [foo]: /url
     

    [foo]

    -. +```````````````````````````````` + A [link reference definition] cannot interrupt a paragraph. -. +```````````````````````````````` example Foo [bar]: /baz @@ -2522,12 +3086,13 @@ Foo

    Foo [bar]: /baz

    [bar]

    -. +```````````````````````````````` -However, it can directly follow other block elements, such as headers -and horizontal rules, and it need not be followed by a blank line. -. +However, it can directly follow other block elements, such as headings +and thematic breaks, and it need not be followed by a blank line. + +```````````````````````````````` example # [Foo] [foo]: /url > bar @@ -2536,12 +3101,32 @@ and horizontal rules, and it need not be followed by a blank line.

    bar

    +```````````````````````````````` + +```````````````````````````````` example +[foo]: /url +bar +=== +[foo] +. +

    bar

    +

    foo

    +```````````````````````````````` + +```````````````````````````````` example +[foo]: /url +=== +[foo] . +

    === +foo

    +```````````````````````````````` -Several [link reference definition]s + +Several [link reference definitions] can occur one after another, without intervening blank lines. -. +```````````````````````````````` example [foo]: /foo-url "foo" [bar]: /bar-url "bar" @@ -2554,14 +3139,15 @@ can occur one after another, without intervening blank lines.

    foo, bar, baz

    -. +```````````````````````````````` + -[Link reference definition]s can occur +[Link reference definitions] can occur inside block containers, like lists and block quotations. They affect the entire document, not just the container in which they are defined: -. +```````````````````````````````` example [foo] > [foo]: /url @@ -2569,13 +3155,25 @@ are defined:

    foo

    +```````````````````````````````` + + +Whether something is a [link reference definition] is +independent of whether the link reference it defines is +used in the document. Thus, for example, the following +document contains just a link reference definition, and +no visible content: + +```````````````````````````````` example +[foo]: /url . +```````````````````````````````` ## Paragraphs A sequence of non-blank lines that cannot be interpreted as other -kinds of blocks forms a [paragraph](@paragraph). +kinds of blocks forms a [paragraph](@). The contents of the paragraph are the result of parsing the paragraph's raw content as inlines. The paragraph's raw content is formed by concatenating the lines and removing initial and final @@ -2583,18 +3181,19 @@ is formed by concatenating the lines and removing initial and final A simple example with two paragraphs: -. +```````````````````````````````` example aaa bbb .

    aaa

    bbb

    -. +```````````````````````````````` + Paragraphs can contain multiple lines, but no blank lines: -. +```````````````````````````````` example aaa bbb @@ -2605,11 +3204,12 @@ ddd bbb

    ccc ddd

    -. +```````````````````````````````` + Multiple blank lines between paragraphs have no effect: -. +```````````````````````````````` example aaa @@ -2617,22 +3217,24 @@ bbb .

    aaa

    bbb

    -. +```````````````````````````````` + Leading spaces are skipped: -. +```````````````````````````````` example aaa bbb .

    aaa bbb

    -. +```````````````````````````````` + Lines after the first may be indented any amount, since indented code blocks cannot interrupt paragraphs. -. +```````````````````````````````` example aaa bbb ccc @@ -2640,49 +3242,53 @@ aaa

    aaa bbb ccc

    -. +```````````````````````````````` + However, the first line may be indented at most three spaces, or an indented code block will be triggered: -. +```````````````````````````````` example aaa bbb .

    aaa bbb

    -. +```````````````````````````````` -. + +```````````````````````````````` example aaa bbb .
    aaa
     

    bbb

    -. +```````````````````````````````` + Final spaces are stripped before inline parsing, so a paragraph that ends with two or more spaces will not end with a [hard line break]: -. +```````````````````````````````` example aaa bbb .

    aaa
    bbb

    -. +```````````````````````````````` + ## Blank lines -[Blank line]s between block-level elements are ignored, +[Blank lines] between block-level elements are ignored, except for the role they play in determining whether a [list] is [tight] or [loose]. Blank lines at the beginning and end of the document are also ignored. -. +```````````````````````````````` example aaa @@ -2694,50 +3300,259 @@ aaa .

    aaa

    aaa

    -. +```````````````````````````````` +
    -# Container blocks +## Tables (extension) -A [container block] is a block that has other -blocks as its contents. There are two basic kinds of container blocks: -[block quotes] and [list items]. -[Lists] are meta-containers for [list items]. +GFM enables the `table` extension, where an additional leaf block type is +available. -We define the syntax for container blocks recursively. The general -form of the definition is: +A [table](@) is an arrangement of data with rows and columns, consisting of a +single header row, a [delimiter row] separating the header from the data, and +zero or more data rows. -> If X is a sequence of blocks, then the result of -> transforming X in such-and-such a way is a container of type Y -> with these blocks as its content. +Each row consists of cells containing arbitrary text, in which [inlines] are +parsed, separated by pipes (`|`). A leading and trailing pipe is also +recommended for clarity of reading, and if there's otherwise parsing ambiguity. +Spaces between pipes and cell content are trimmed. Block-level elements cannot +be inserted in a table. -So, we explain what counts as a block quote or list item by explaining -how these can be *generated* from their contents. This should suffice -to define the syntax, although it does not give a recipe for *parsing* -these constructions. (A recipe is provided below in the section entitled -[A parsing strategy](#appendix-a-parsing-strategy).) +The [delimiter row](@) consists of cells whose only content are hyphens (`-`), +and optionally, a leading or trailing colon (`:`), or both, to indicate left, +right, or center alignment respectively. -## Block quotes +```````````````````````````````` example table +| foo | bar | +| --- | --- | +| baz | bim | +. + + + + + + + + + + + + + +
    foobar
    bazbim
    +```````````````````````````````` -A [block quote marker](@block-quote-marker) -consists of 0-3 spaces of initial indent, plus (a) the character `>` together -with a following space, or (b) a single character `>` not followed by a space. +Cells in one column don't need to match length, though it's easier to read if +they are. Likewise, use of leading and trailing pipes may be inconsistent: -The following rules define [block quotes]: +```````````````````````````````` example table +| abc | defghi | +:-: | -----------: +bar | baz +. + + + + + + + + + + + + + +
    abcdefghi
    barbaz
    +```````````````````````````````` -1. **Basic case.** If a string of lines *Ls* constitute a sequence - of blocks *Bs*, then the result of prepending a [block quote - marker] to the beginning of each line in *Ls* - is a [block quote](#block-quotes) containing *Bs*. +Include a pipe in a cell's content by escaping it, including inside other +inline spans: -2. **Laziness.** If a string of lines *Ls* constitute a [block - quote](#block-quotes) with contents *Bs*, then the result of deleting - the initial [block quote marker] from one or - more lines in which the next [non-whitespace character] after the [block - quote marker] is [paragraph continuation - text] is a block quote with *Bs* as its content. - [Paragraph continuation text](@paragraph-continuation-text) is text - that will be parsed as part of the content of a paragraph, but does +```````````````````````````````` example table +| f\|oo | +| ------ | +| b `\|` az | +| b **\|** im | +. + + + + + + + + + + + + + + +
    f|oo
    b | az
    b | im
    +```````````````````````````````` + +The table is broken at the first empty line, or beginning of another +block-level structure: + +```````````````````````````````` example table +| abc | def | +| --- | --- | +| bar | baz | +> bar +. + + + + + + + + + + + + + +
    abcdef
    barbaz
    +
    +

    bar

    +
    +```````````````````````````````` + +```````````````````````````````` example table +| abc | def | +| --- | --- | +| bar | baz | +bar + +bar +. + + + + + + + + + + + + + + + + + +
    abcdef
    barbaz
    bar
    +

    bar

    +```````````````````````````````` + +The header row must match the [delimiter row] in the number of cells. If not, +a table will not be recognized: + +```````````````````````````````` example table +| abc | def | +| --- | +| bar | +. +

    | abc | def | +| --- | +| bar |

    +```````````````````````````````` + +The remainder of the table's rows may vary in the number of cells. If a row +has fewer cells than the header row, empty cells are inserted. If a row has +more cells than the header row, the excess is ignored: + +```````````````````````````````` example table +| abc | def | +| --- | --- | +| bar | +| bar | baz | boo | +. + + + + + + + + + + + + + + + + + +
    abcdef
    bar
    barbaz
    +```````````````````````````````` + +If there are no rows in the body, no `<tbody>` is generated in HTML output: + +```````````````````````````````` example table +| abc | def | +| --- | --- | +. + + + + + + + +
    abcdef
    +```````````````````````````````` + +
    + +# Container blocks + +A [container block](#container-blocks) is a block that has other +blocks as its contents. There are two basic kinds of container blocks: +[block quotes] and [list items]. +[Lists] are meta-containers for [list items]. + +We define the syntax for container blocks recursively. The general +form of the definition is: + +> If X is a sequence of blocks, then the result of +> transforming X in such-and-such a way is a container of type Y +> with these blocks as its content. + +So, we explain what counts as a block quote or list item by explaining +how these can be *generated* from their contents. This should suffice +to define the syntax, although it does not give a recipe for *parsing* +these constructions. (A recipe is provided below in the section entitled +[A parsing strategy](#appendix-a-parsing-strategy).) + +## Block quotes + +A [block quote marker](@) +consists of 0-3 spaces of initial indent, plus (a) the character `>` together +with a following space, or (b) a single character `>` not followed by a space. + +The following rules define [block quotes]: + +1. **Basic case.** If a string of lines *Ls* constitute a sequence + of blocks *Bs*, then the result of prepending a [block quote + marker] to the beginning of each line in *Ls* + is a [block quote](#block-quotes) containing *Bs*. + +2. **Laziness.** If a string of lines *Ls* constitute a [block + quote](#block-quotes) with contents *Bs*, then the result of deleting + the initial [block quote marker] from one or + more lines in which the next [non-whitespace character] after the [block + quote marker] is [paragraph continuation + text] is a block quote with *Bs* as its content. + [Paragraph continuation text](@) is text + that will be parsed as part of the content of a paragraph, but does not occur at the beginning of the paragraph. 3. **Consecutiveness.** A document cannot contain two [block @@ -2747,7 +3562,7 @@ Nothing else counts as a [block quote](#block-quotes). 
Here is a simple example: -. +```````````````````````````````` example > # Foo > bar > baz @@ -2757,11 +3572,12 @@ Here is a simple example:

    bar baz

    -. +```````````````````````````````` + The spaces after the `>` characters can be omitted: -. +```````````````````````````````` example ># Foo >bar > baz @@ -2771,11 +3587,12 @@ The spaces after the `>` characters can be omitted:

    bar baz

    -. +```````````````````````````````` + The `>` characters can be indented 1-3 spaces: -. +```````````````````````````````` example > # Foo > bar > baz @@ -2785,11 +3602,12 @@ The `>` characters can be indented 1-3 spaces:

    bar baz

    -. +```````````````````````````````` + Four spaces gives us a code block: -. +```````````````````````````````` example > # Foo > bar > baz @@ -2798,12 +3616,13 @@ Four spaces gives us a code block: > bar > baz
    -. +```````````````````````````````` -The Laziness clause allows us to omit the `>` before a -paragraph continuation line: -. +The Laziness clause allows us to omit the `>` before +[paragraph continuation text]: + +```````````````````````````````` example > # Foo > bar baz @@ -2813,12 +3632,13 @@ baz

    bar baz

    -. +```````````````````````````````` + A block quote can contain some lazy and some non-lazy continuation lines: -. +```````````````````````````````` example > bar baz > foo @@ -2828,11 +3648,12 @@ baz baz foo

    -. +```````````````````````````````` + Laziness only applies to lines that would have been continuations of -paragraphs had they been prepended with `>`. For example, the -`>` cannot be omitted in the second line of +paragraphs had they been prepended with [block quote markers]. +For example, the `> ` cannot be omitted in the second line of ``` markdown > foo @@ -2841,7 +3662,7 @@ paragraphs had they been prepended with `>`. For example, the without changing the meaning: -. +```````````````````````````````` example > foo --- . @@ -2849,9 +3670,10 @@ without changing the meaning:

    foo


    -. +```````````````````````````````` -Similarly, if we omit the `>` in the second line of + +Similarly, if we omit the `> ` in the second line of ``` markdown > - foo @@ -2860,7 +3682,7 @@ Similarly, if we omit the `>` in the second line of then the block quote ends after the first line: -. +```````````````````````````````` example > - foo - bar . @@ -2872,12 +3694,13 @@ then the block quote ends after the first line:
    • bar
    -. +```````````````````````````````` + -For the same reason, we can't omit the `>` in front of +For the same reason, we can't omit the `> ` in front of subsequent lines of an indented or fenced code block: -. +```````````````````````````````` example > foo bar . @@ -2887,9 +3710,10 @@ subsequent lines of an indented or fenced code block:
    bar
     
    -. +```````````````````````````````` -. + +```````````````````````````````` example > ``` foo ``` @@ -2899,29 +3723,57 @@ foo

    foo

    +```````````````````````````````` + + +Note that in the following case, we have a [lazy +continuation line]: + +```````````````````````````````` example +> foo + - bar . +
    +

    foo +- bar

    +
    +```````````````````````````````` + + +To see why, note that in + +```markdown +> foo +> - bar +``` + +the `- bar` is indented too far to start a list, and can't +be an indented code block because indented code blocks cannot +interrupt paragraphs, so it is [paragraph continuation text]. A block quote can be empty: -. +```````````````````````````````` example > .
    -. +```````````````````````````````` -. + +```````````````````````````````` example > > > .
    -. +```````````````````````````````` + A block quote can have initial or final blank lines: -. +```````````````````````````````` example > > foo > @@ -2929,11 +3781,12 @@ A block quote can have initial or final blank lines:

    foo

    -. +```````````````````````````````` + A blank line always separates block quotes: -. +```````````````````````````````` example > foo > bar @@ -2944,7 +3797,8 @@ A blank line always separates block quotes:

    bar

    -. +```````````````````````````````` + (Most current Markdown implementations, including John Gruber's original `Markdown.pl`, will parse this example as a single block quote @@ -2954,7 +3808,7 @@ whether two block quotes or one are wanted.) Consecutiveness means that if we put these block quotes together, we get a single block quote: -. +```````````````````````````````` example > foo > bar . @@ -2962,11 +3816,12 @@ we get a single block quote:

    foo bar

    -. +```````````````````````````````` + To get a block quote with two paragraphs, use: -. +```````````````````````````````` example > foo > > bar @@ -2975,11 +3830,12 @@ To get a block quote with two paragraphs, use:

    foo

    bar

    -. +```````````````````````````````` + Block quotes can interrupt paragraphs: -. +```````````````````````````````` example foo > bar . @@ -2987,12 +3843,13 @@ foo

    bar

    -. +```````````````````````````````` + In general, blank lines are not needed before or after block quotes: -. +```````````````````````````````` example > aaa *** > bbb @@ -3004,12 +3861,13 @@ quotes:

    bbb

    -. +```````````````````````````````` + However, because of laziness, a blank line is needed between a block quote and a following paragraph: -. +```````````````````````````````` example > bar baz . @@ -3017,9 +3875,10 @@ baz

    bar baz

    -. +```````````````````````````````` -. + +```````````````````````````````` example > bar baz @@ -3028,9 +3887,10 @@ baz

    bar

    baz

    -. +```````````````````````````````` -. + +```````````````````````````````` example > bar > baz @@ -3039,13 +3899,14 @@ baz

    bar

    baz

    -. +```````````````````````````````` + It is a consequence of the Laziness rule that any number of initial `>`s may be omitted on a continuation line of a nested block quote: -. +```````````````````````````````` example > > > foo bar . @@ -3057,9 +3918,10 @@ bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example >>> foo > bar >>baz @@ -3073,14 +3935,15 @@ baz

    -. +```````````````````````````````` + When including an indented code block in a block quote, remember that the [block quote marker] includes both the `>` and a following space. So *five spaces* are needed after the `>`: -. +```````````````````````````````` example > code > not code @@ -3092,18 +3955,19 @@ the `>`:

    not code

    -. +```````````````````````````````` + ## List items -A [list marker](@list-marker) is a +A [list marker](@) is a [bullet list marker] or an [ordered list marker]. -A [bullet list marker](@bullet-list-marker) +A [bullet list marker](@) is a `-`, `+`, or `*` character. -An [ordered list marker](@ordered-list-marker) +An [ordered list marker](@) is a sequence of 1--9 arabic digits (`0-9`), followed by either a `.` character or a `)` character. (The reason for the length limit is that with 10 digits we start seeing integer overflows @@ -3112,9 +3976,8 @@ in some browsers.) The following rules define [list items]: 1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of - blocks *Bs* starting with a [non-whitespace character] and not separated - from each other by more than one blank line, and *M* is a list - marker of width *W* followed by 0 < *N* < 5 spaces, then the result + blocks *Bs* starting with a [non-whitespace character], and *M* is a + list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces, then the result of prepending *M* and the following spaces to the first line of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a list item with *Bs* as its contents. The type of the list item @@ -3122,9 +3985,19 @@ The following rules define [list items]: If the list item is ordered, then it is also assigned a start number, based on the ordered list marker. + Exceptions: + + 1. When the first list item in a [list] interrupts + a paragraph---that is, when it starts on a line that would + otherwise count as [paragraph continuation text]---then (a) + the lines *Ls* must not begin with a blank line, and (b) if + the list item is ordered, the start number must be 1. + 2. If any line is a [thematic break][thematic breaks] then + that line is not a list item. + For example, let *Ls* be the lines -. +```````````````````````````````` example A paragraph with two lines. @@ -3139,13 +4012,14 @@ with two lines.

    A block quote.

    -. +```````````````````````````````` + And let *M* be the marker `1.`, and *N* = 2. Then rule #1 says that the following is an ordered list item with start number 1, and the same contents as *Ls*: -. +```````````````````````````````` example 1. A paragraph with two lines. @@ -3164,7 +4038,8 @@ with two lines.

    -. +```````````````````````````````` + The most important thing to notice is that the position of the text after the list marker determines how much indentation @@ -3177,7 +4052,7 @@ item. Here are some examples showing how far content must be indented to be put under the list item: -. +```````````````````````````````` example - one two @@ -3186,9 +4061,10 @@ put under the list item:
  • one
  • two

    -. +```````````````````````````````` -. + +```````````````````````````````` example - one two @@ -3199,9 +4075,10 @@ put under the list item:

    two

    -. +```````````````````````````````` -. + +```````````````````````````````` example - one two @@ -3211,9 +4088,10 @@ put under the list item:
     two
     
    -. +```````````````````````````````` -. + +```````````````````````````````` example - one two @@ -3224,7 +4102,8 @@ put under the list item:

    two

    -. +```````````````````````````````` + It is tempting to think of this in terms of columns: the continuation blocks must be indented at least to the column of the first @@ -3234,7 +4113,7 @@ is needed. Which column this indentation reaches will depend on how the list item is embedded in other constructions, as shown by this example: -. +```````````````````````````````` example > > 1. one >> >> two @@ -3249,7 +4128,8 @@ this example: -. +```````````````````````````````` + Here `two` occurs in the same column as the list marker `1.`, but is actually contained in the list item, because there is @@ -3260,7 +4140,7 @@ occurs far to the right of the initial text of the list item, `one`, but it is not considered part of the list item, because it is not indented far enough past the blockquote marker: -. +```````````````````````````````` example >>- one >> > > two @@ -3273,86 +4153,43 @@ far enough past the blockquote marker:

    two

    -. +```````````````````````````````` + Note that at least one space is needed between the list marker and any following content, so these are not list items: -. +```````````````````````````````` example -one 2.two .

    -one

    2.two

    -. +```````````````````````````````` -A list item may not contain blocks that are separated by more than -one blank line. Thus, two blank lines will end a list, unless the -two blanks are contained in a [fenced code block]. - -. -- foo - bar +A list item may contain blocks that are separated by more than +one blank line. +```````````````````````````````` example - foo bar - -- ``` - foo - - - bar - ``` - -- baz - - + ``` - foo - - - bar - ``` .
    • foo

      bar

    • -
    • -

      foo

      -
    -

    bar

    -
      -
    • -
      foo
      -
      -
      -bar
      -
      -
    • -
    • -

      baz

      -
        -
      • -
        foo
        +````````````````````````````````
         
         
        -bar
        -
        -
      • -
      -
    • -
    -. - A list item may contain any kind of block: -. +```````````````````````````````` example 1. foo ``` @@ -3374,55 +4211,83 @@ A list item may contain any kind of block: -. +```````````````````````````````` -Note that ordered list start numbers must be nine digits or less: +A list item that contains an indented code block will preserve +empty lines within the code block verbatim. + +```````````````````````````````` example +- Foo + + bar + + + baz . +
      +
    • +

      Foo

      +
      bar
      +
      +
      +baz
      +
      +
    • +
    +```````````````````````````````` + +Note that ordered list start numbers must be nine digits or less: + +```````````````````````````````` example 123456789. ok .
    1. ok
    -. +```````````````````````````````` -. + +```````````````````````````````` example 1234567890. not ok .

    1234567890. not ok

    -. +```````````````````````````````` + A start number may begin with 0s: -. +```````````````````````````````` example 0. ok .
    1. ok
    -. +```````````````````````````````` -. + +```````````````````````````````` example 003. ok .
    1. ok
    -. +```````````````````````````````` + A start number may not be negative: -. +```````````````````````````````` example -1. not ok .

    -1. not ok

    -. +```````````````````````````````` + 2. **Item starting with indented code.** If a sequence of lines *Ls* constitute a sequence of blocks *Bs* starting with an indented code - block and not separated from each other by more than one blank line, - and *M* is a list marker of width *W* followed by + block, and *M* is a list marker of width *W* followed by one space, then the result of prepending *M* and the following space to the first line of *Ls*, and indenting subsequent lines of *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents. @@ -3435,7 +4300,7 @@ An indented code block will have to be indented four spaces beyond the edge of the region where text will be included in the list item. In the following case that is 6 spaces: -. +```````````````````````````````` example - foo bar @@ -3447,11 +4312,12 @@ In the following case that is 6 spaces:
    -. +```````````````````````````````` + And in this case it is 11 spaces: -. +```````````````````````````````` example 10. foo bar @@ -3463,13 +4329,14 @@ And in this case it is 11 spaces:
    -. +```````````````````````````````` + If the *first* block in the list item is an indented code block, then by rule #2, the contents must be indented *one* space after the list marker: -. +```````````````````````````````` example indented code paragraph @@ -3481,9 +4348,10 @@ paragraph

    paragraph

    more code
     
    -. +```````````````````````````````` -. + +```````````````````````````````` example 1. indented code paragraph @@ -3499,12 +4367,13 @@ paragraph
    -. +```````````````````````````````` + Note that an additional space indent is interpreted as space inside the code block: -. +```````````````````````````````` example 1. indented code paragraph @@ -3520,7 +4389,8 @@ inside the code block: -. +```````````````````````````````` + Note that rules #1 and #2 only apply to two cases: (a) cases in which the lines to be included in a list item begin with a @@ -3530,16 +4400,17 @@ block. In a case like the following, where the first block begins with a three-space indent, the rules do not allow us to form a list item by indenting the whole thing and prepending a list marker: -. +```````````````````````````````` example foo bar .

    foo

    bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example - foo bar @@ -3548,14 +4419,15 @@ bar
  • foo
  • bar

    -. +```````````````````````````````` + This is not a significant restriction, because when a block begins with 1-3 spaces indent, the indentation can always be removed without a change in interpretation, allowing rule #1 to be applied. So, in the above case: -. +```````````````````````````````` example - foo bar @@ -3566,7 +4438,8 @@ the above case:

    bar

    -. +```````````````````````````````` + 3. **Item starting with a blank line.** If a sequence of lines *Ls* starting with a single [blank line] constitute a (possibly empty) @@ -3582,7 +4455,7 @@ the above case: Here are some list items that start with a blank line but are not empty: -. +```````````````````````````````` example - foo - @@ -3603,11 +4476,40 @@ Here are some list items that start with a blank line but are not empty: +```````````````````````````````` + +When the list item starts with a blank line, the number of spaces +following the list marker doesn't change the required indentation: + +```````````````````````````````` example +- + foo +. +
      +
    • foo
    • +
    +```````````````````````````````` + + +A list item can begin with at most one blank line. +In the following example, `foo` is not part of the list +item: + +```````````````````````````````` example +- + + foo . +
      +
    • +
    +

    foo

    +```````````````````````````````` + Here is an empty bullet list item: -. +```````````````````````````````` example - foo - - bar @@ -3617,11 +4519,12 @@ Here is an empty bullet list item:
  • bar
  • -. +```````````````````````````````` + It does not matter whether there are spaces following the [list marker]: -. +```````````````````````````````` example - foo - - bar @@ -3631,11 +4534,12 @@ It does not matter whether there are spaces following the [list marker]:
  • bar
  • -. +```````````````````````````````` + Here is an empty ordered list item: -. +```````````````````````````````` example 1. foo 2. 3. bar @@ -3645,17 +4549,33 @@ Here is an empty ordered list item:
  • bar
  • -. +```````````````````````````````` + A list may start or end with an empty list item: -. +```````````````````````````````` example * .
    +```````````````````````````````` + +However, an empty list item cannot interrupt a paragraph: + +```````````````````````````````` example +foo +* + +foo +1. . +

    foo +*

    +

    foo +1.

    +```````````````````````````````` 4. **Indentation.** If a sequence of lines *Ls* constitutes a list item @@ -3666,7 +4586,7 @@ A list may start or end with an empty list item: Indented one space: -. +```````````````````````````````` example 1. A paragraph with two lines. @@ -3685,11 +4605,12 @@ with two lines.

    -. +```````````````````````````````` + Indented two spaces: -. +```````````````````````````````` example 1. A paragraph with two lines. @@ -3708,11 +4629,12 @@ with two lines.

    -. +```````````````````````````````` + Indented three spaces: -. +```````````````````````````````` example 1. A paragraph with two lines. @@ -3731,11 +4653,12 @@ with two lines.

    -. +```````````````````````````````` + Four spaces indent gives a code block: -. +```````````````````````````````` example 1. A paragraph with two lines. @@ -3750,7 +4673,8 @@ Four spaces indent gives a code block: > A block quote. -. +```````````````````````````````` + 5. **Laziness.** If a string of lines *Ls* constitute a [list @@ -3760,11 +4684,11 @@ Four spaces indent gives a code block: [paragraph continuation text] is a list item with the same contents and attributes. The unindented lines are called - [lazy continuation line](@lazy-continuation-line)s. + [lazy continuation line](@)s. -Here is an example with [lazy continuation line]s: +Here is an example with [lazy continuation lines]: -. +```````````````````````````````` example 1. A paragraph with two lines. @@ -3783,11 +4707,12 @@ with two lines.

    -. +```````````````````````````````` + Indentation can be partially deleted: -. +```````````````````````````````` example 1. A paragraph with two lines. . @@ -3795,11 +4720,12 @@ Indentation can be partially deleted:
  • A paragraph with two lines.
  • -. +```````````````````````````````` + These examples show how laziness can work in nested structures: -. +```````````````````````````````` example > 1. > Blockquote continued here. . @@ -3813,9 +4739,10 @@ continued here.

    -. +```````````````````````````````` -. + +```````````````````````````````` example > 1. > Blockquote > continued here. . @@ -3829,53 +4756,64 @@ continued here.

    -. +```````````````````````````````` + 6. **That's all.** Nothing that is not counted as a list item by rules #1--5 counts as a [list item](#list-items). -The rules for sublists follow from the general rules above. A sublist -must be indented the same number of spaces a paragraph would need to be -in order to be included in the list item. +The rules for sublists follow from the general rules +[above][List items]. A sublist must be indented the same number +of spaces a paragraph would need to be in order to be included +in the list item. So, in this case we need two spaces indent: -. +```````````````````````````````` example - foo - bar - baz + - boo .
    • foo
      • bar
          -
        • baz
        • +
        • baz +
            +
          • boo
      -. +
    • +
    +```````````````````````````````` + One is not enough: -. +```````````````````````````````` example - foo - bar - baz + - boo .
    • foo
    • bar
    • baz
    • +
    • boo
    -. +```````````````````````````````` + Here we need four, because the list marker is wider: -. +```````````````````````````````` example 10) foo - bar . @@ -3886,11 +4824,12 @@ Here we need four, because the list marker is wider: -. +```````````````````````````````` + Three is not enough: -. +```````````````````````````````` example 10) foo - bar . @@ -3900,11 +4839,12 @@ Three is not enough:
    • bar
    -. +```````````````````````````````` + A list may be the first block in a list item: -. +```````````````````````````````` example - - foo .
      @@ -3914,9 +4854,10 @@ A list may be the first block in a list item:
    -. +```````````````````````````````` -. + +```````````````````````````````` example 1. - 2. foo .
      @@ -3930,11 +4871,12 @@ A list may be the first block in a list item:
    -. +```````````````````````````````` -A list item can contain a header: -. +A list item can contain a heading: + +```````````````````````````````` example - # Foo - Bar --- @@ -3948,7 +4890,8 @@ A list item can contain a header:

    Bar

    baz -. +```````````````````````````````` + ### Motivation @@ -4136,41 +5079,97 @@ that in such cases, we require one space indentation from the list marker four-space rule in cases where the list marker plus its initial indentation takes four spaces (a common case), but diverge in other cases. +
    + +## Task list items (extension) + +GFM enables the `tasklist` extension, where an additional processing step is +performed on [list items]. + +A [task list item](@) is a [list item][list items] where the first block in it +is a paragraph which begins with a [task list item marker] and at least one +whitespace character before any other content. + +A [task list item marker](@) consists of an optional number of spaces, a left +bracket (`[`), either a whitespace character or the letter `x` in either +lowercase or uppercase, and then a right bracket (`]`). + +When rendered, the [task list item marker] is replaced with a semantic checkbox element; +in an HTML output, this would be an `<input type="checkbox">` element. + +If the character between the brackets is a whitespace character, the checkbox +is unchecked. Otherwise, the checkbox is checked. + +This spec does not define how the checkbox elements are interacted with: in practice, +implementors are free to render the checkboxes as disabled or immutable elements, +or they may handle interactions dynamically (i.e. checking, unchecking) in +the final rendered document. + +```````````````````````````````` example disabled +- [ ] foo +- [x] bar +. +
      +
    • foo
    • +
    • bar
    • +
    +```````````````````````````````` + +Task lists can be arbitrarily nested: + +```````````````````````````````` example disabled +- [x] foo + - [ ] bar + - [x] baz +- [ ] bim +. +
      +
    • foo +
        +
      • bar
      • +
      • baz
      • +
      +
    • +
    • bim
    • +
    +```````````````````````````````` + +
    + ## Lists -A [list](@list) is a sequence of one or more +A [list](@) is a sequence of one or more list items [of the same type]. The list items -may be separated by single [blank lines], but two -blank lines end all containing lists. +may be separated by any number of blank lines. -Two list items are [of the same type](@of-the-same-type) +Two list items are [of the same type](@) if they begin with a [list marker] of the same type. Two list markers are of the same type if (a) they are bullet list markers using the same character (`-`, `+`, or `*`) or (b) they are ordered list numbers with the same delimiter (either `.` or `)`). -A list is an [ordered list](@ordered-list) +A list is an [ordered list](@) if its constituent list items begin with -[ordered list marker]s, and a -[bullet list](@bullet-list) if its constituent list -items begin with [bullet list marker]s. +[ordered list markers], and a +[bullet list](@) if its constituent list +items begin with [bullet list markers]. -The [start number](@start-number) +The [start number](@) of an [ordered list] is determined by the list number of its initial list item. The numbers of subsequent list items are disregarded. -A list is [loose](@loose) if any of its constituent +A list is [loose](@) if any of its constituent list items are separated by blank lines, or if any of its constituent list items directly contain two block-level elements with a blank line -between them. Otherwise a list is [tight](@tight). +between them. Otherwise a list is [tight](@). (The difference in HTML output is that paragraphs in a loose list are wrapped in `

    ` tags, while paragraphs in a tight list are not.) Changing the bullet or ordered list delimiter starts a new list: -. +```````````````````````````````` example - foo - bar + baz @@ -4182,9 +5181,10 @@ Changing the bullet or ordered list delimiter starts a new list:

    • baz
    -. +```````````````````````````````` -. + +```````````````````````````````` example 1. foo 2. bar 3) baz @@ -4196,13 +5196,14 @@ Changing the bullet or ordered list delimiter starts a new list:
    1. baz
    -. +```````````````````````````````` + In CommonMark, a list can interrupt a paragraph. That is, no blank line is needed to separate a paragraph from a following list: -. +```````````````````````````````` example Foo - bar - baz @@ -4212,37 +5213,34 @@ Foo
  • bar
  • baz
  • -. +```````````````````````````````` `Markdown.pl` does not allow this, through fear of triggering a list via a numeral in a hard-wrapped line: -. +``` markdown The number of windows in my house is 14. The number of doors is 6. -. -

    The number of windows in my house is

    -
      -
    1. The number of doors is 6.
    2. -
    -. +``` -Oddly, `Markdown.pl` *does* allow a blockquote to interrupt a paragraph, -even though the same considerations might apply. We think that the two -cases should be treated the same. Here are two reasons for allowing -lists to interrupt paragraphs: +Oddly, though, `Markdown.pl` *does* allow a blockquote to +interrupt a paragraph, even though the same considerations might +apply. -First, it is natural and not uncommon for people to start lists without -blank lines: +In CommonMark, we do allow lists to interrupt paragraphs, for +two reasons. First, it is natural and not uncommon for people +to start lists without blank lines: - I need to buy - - new shoes - - a coat - - a plane ticket +``` markdown +I need to buy +- new shoes +- a coat +- a plane ticket +``` Second, we are attracted to a -> [principle of uniformity](@principle-of-uniformity): +> [principle of uniformity](@): > if a chunk of text has a certain > meaning, it will continue to have the same meaning when put into a > container block (such as a list item or blockquote). @@ -4250,39 +5248,63 @@ Second, we are attracted to a (Indeed, the spec for [list items] and [block quotes] presupposes this principle.) This principle implies that if - * I need to buy - - new shoes - - a coat - - a plane ticket +``` markdown + * I need to buy + - new shoes + - a coat + - a plane ticket +``` is a list item containing a paragraph followed by a nested sublist, as all Markdown implementations agree it is (though the paragraph may be rendered without `

    ` tags, since the list is "tight"), then - I need to buy - - new shoes - - a coat - - a plane ticket +``` markdown +I need to buy +- new shoes +- a coat +- a plane ticket +``` by itself should be a paragraph followed by a nested sublist. -Our adherence to the [principle of uniformity] -thus inclines us to think that there are two coherent packages: +Since it is well established Markdown practice to allow lists to +interrupt paragraphs inside list items, the [principle of +uniformity] requires us to allow this outside list items as +well. ([reStructuredText](http://docutils.sourceforge.net/rst.html) +takes a different approach, requiring blank lines before lists +even inside other list items.) -1. Require blank lines before *all* lists and blockquotes, - including lists that occur as sublists inside other list items. +In order to solve the problem of unwanted lists in paragraphs with +hard-wrapped numerals, we allow only lists starting with `1` to +interrupt paragraphs. Thus, -2. Require blank lines in none of these places. - -[reStructuredText](http://docutils.sourceforge.net/rst.html) takes -the first approach, for which there is much to be said. But the second -seems more consistent with established practice with Markdown. +```````````````````````````````` example +The number of windows in my house is +14. The number of doors is 6. +. +

    The number of windows in my house is +14. The number of doors is 6.

    +```````````````````````````````` -There can be blank lines between items, but two blank lines end -a list: +We may still get an unintended result in cases like +```````````````````````````````` example +The number of windows in my house is +1. The number of doors is 6. . +

    The number of windows in my house is

    +
      +
    1. The number of doors is 6.
    2. +
    +```````````````````````````````` + +but this rule should prevent most spurious list captures. + +There can be any number of blank lines between items: + +```````````````````````````````` example - foo - bar @@ -4297,35 +5319,13 @@ a list:
  • bar

  • +
  • +

    baz

    +
  • -
      -
    • baz
    • -
    -. - -As illustrated above in the section on [list items], -two blank lines between blocks *within* a list item will also end a -list: - -. -- foo - - - bar -- baz -. -
      -
    • foo
    • -
    -

    bar

    -
      -
    • baz
    • -
    -. - -Indeed, two blank lines will end *all* containing lists: +```````````````````````````````` -. +```````````````````````````````` example - foo - bar - baz @@ -4338,25 +5338,28 @@ Indeed, two blank lines will end *all* containing lists:
    • bar
        -
      • baz
      • +
      • +

        baz

        +

        bim

        +
    -
      bim
    -
    -. +```````````````````````````````` -Thus, two blank lines can be used to separate consecutive lists of -the same type, or to separate a list from an indented code block -that would otherwise be parsed as a subparagraph of the final list -item: -. +To separate consecutive lists of the same type, or to separate a +list from an indented code block that would otherwise be parsed +as a subparagraph of the final list item, you can insert a blank HTML +comment: + +```````````````````````````````` example - foo - bar + - baz - bim @@ -4365,19 +5368,22 @@ item:
  • foo
  • bar
  • +
    • baz
    • bim
    -. +```````````````````````````````` -. + +```````````````````````````````` example - foo notcode - foo + code . @@ -4390,25 +5396,25 @@ item:

    foo

    +
    code
     
    -. +```````````````````````````````` + List items need not be indented to the same level. The following list items will be treated as items at the same list level, since none is indented enough to belong to the previous list item: -. +```````````````````````````````` example - a - b - c - d - - e - - f - - g - - h -- i + - e + - f +- g .
    • a
    • @@ -4418,17 +5424,16 @@ item:
    • e
    • f
    • g
    • -
    • h
    • -
    • i
    -. +```````````````````````````````` -. + +```````````````````````````````` example 1. a 2. b - 3. c + 3. c .
    1. @@ -4441,12 +5446,56 @@ item:

      c

    +```````````````````````````````` + +Note, however, that list items may not be indented more than +three spaces. Here `- e` is treated as a paragraph continuation +line, because it is indented more than three spaces: + +```````````````````````````````` example +- a + - b + - c + - d + - e +. +
      +
    • a
    • +
    • b
    • +
    • c
    • +
    • d +- e
    • +
    +```````````````````````````````` + +And here, `3. c` is treated as an indented code block, +because it is indented four spaces and preceded by a +blank line. + +```````````````````````````````` example +1. a + + 2. b + + 3. c . +
      +
    1. +

      a

      +
    2. +
    3. +

      b

      +
    4. +
    +
    3. c
    +
    +```````````````````````````````` + This is a loose list, because there is a blank line between two of the list items: -. +```````````````````````````````` example - a - b @@ -4463,11 +5512,12 @@ two of the list items:

    c

    -. +```````````````````````````````` + So is this, with a empty second item: -. +```````````````````````````````` example * a * @@ -4482,13 +5532,14 @@ So is this, with a empty second item:

    c

    -. +```````````````````````````````` + These are loose lists, even though there is no space between the items, because one of the items directly contains two block-level elements with a blank line between them: -. +```````````````````````````````` example - a - b @@ -4507,9 +5558,10 @@ with a blank line between them:

    d

    -. +```````````````````````````````` -. + +```````````````````````````````` example - a - b @@ -4527,11 +5579,12 @@ with a blank line between them:

    d

    -. +```````````````````````````````` + This is a tight list, because the blank lines are in a code block: -. +```````````````````````````````` example - a - ``` b @@ -4550,13 +5603,14 @@ This is a tight list, because the blank lines are in a code block:
  • c
  • -. +```````````````````````````````` + This is a tight list, because the blank line is between two paragraphs of a sublist. So the sublist is loose while the outer list is tight: -. +```````````````````````````````` example - a - b @@ -4574,12 +5628,13 @@ the outer list is tight:
  • d
  • -. +```````````````````````````````` + This is a tight list, because the blank line is inside the block quote: -. +```````````````````````````````` example * a > b > @@ -4593,12 +5648,13 @@ block quote:
  • c
  • -. +```````````````````````````````` + This list is tight, because the consecutive block elements are not separated by blank lines: -. +```````````````````````````````` example - a > b ``` @@ -4616,19 +5672,21 @@ are not separated by blank lines:
  • d
  • -. +```````````````````````````````` + A single-paragraph list is tight: -. +```````````````````````````````` example - a .
    • a
    -. +```````````````````````````````` -. + +```````````````````````````````` example - a - b . @@ -4639,12 +5697,13 @@ A single-paragraph list is tight: -. +```````````````````````````````` + This list is loose, because of the blank line between the two block elements in the list item: -. +```````````````````````````````` example 1. ``` foo ``` @@ -4658,11 +5717,12 @@ two block elements in the list item:

    bar

    -. +```````````````````````````````` + Here the outer list is loose, the inner list tight: -. +```````````````````````````````` example * foo * bar @@ -4677,9 +5737,10 @@ Here the outer list is loose, the inner list tight:

    baz

    -. +```````````````````````````````` -. + +```````````````````````````````` example - a - b - c @@ -4704,7 +5765,8 @@ Here the outer list is loose, the inner list tight: -. +```````````````````````````````` + # Inlines @@ -4712,46 +5774,50 @@ Inlines are parsed sequentially from the beginning of the character stream to the end (left to right, in left-to-right languages). Thus, for example, in -. +```````````````````````````````` example `hi`lo` .

    hilo`

    -. +```````````````````````````````` `hi` is parsed as code, leaving the backtick at the end as a literal backtick. + ## Backslash escapes Any ASCII punctuation character may be backslash-escaped: -. +```````````````````````````````` example \!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~ .

    !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

    -. +```````````````````````````````` + Backslashes before other characters are treated as literal backslashes: -. +```````````````````````````````` example \→\A\a\ \3\φ\« .

    \→\A\a\ \3\φ\«

    -. +```````````````````````````````` + Escaped characters are treated as regular characters and do not have their usual Markdown meanings: -. +```````````````````````````````` example \*not emphasized* \
    not a tag \[not a link](/foo) \`not code` 1\. not a list \* not a list -\# not a header +\# not a heading \[foo]: /url "not a reference" +\ö not a character entity .

    *not emphasized* <br/> not a tag @@ -4759,109 +5825,131 @@ not have their usual Markdown meanings: `not code` 1. not a list * not a list -# not a header -[foo]: /url "not a reference"

    -. +# not a heading +[foo]: /url "not a reference" +&ouml; not a character entity

    +```````````````````````````````` + If a backslash is itself escaped, the following character is not: -. +```````````````````````````````` example \\*emphasis* .

    \emphasis

    -. +```````````````````````````````` + A backslash at the end of the line is a [hard line break]: -. +```````````````````````````````` example foo\ bar .

    foo
    bar

    -. +```````````````````````````````` + Backslash escapes do not work in code blocks, code spans, autolinks, or raw HTML: -. +```````````````````````````````` example `` \[\` `` .

    \[\`

    -. +```````````````````````````````` -. + +```````````````````````````````` example \[\] .
    \[\]
     
    -. +```````````````````````````````` -. + +```````````````````````````````` example ~~~ \[\] ~~~ .
    \[\]
     
    -. +```````````````````````````````` -. + +```````````````````````````````` example .

    http://example.com?find=\*

    -. +```````````````````````````````` -. + +```````````````````````````````` example . -. +```````````````````````````````` + But they work in all other contexts, including URLs and link titles, -link references, and [info string]s in [fenced code block]s: +link references, and [info strings] in [fenced code blocks]: -. +```````````````````````````````` example [foo](/bar\* "ti\*tle") .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example [foo] [foo]: /bar\* "ti\*tle" .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example ``` foo\+bar foo ``` .
    foo
     
    -. +```````````````````````````````` -## Entities -With the goal of making this standard as HTML-agnostic as possible, all -valid HTML entities (except in code blocks and code spans) -are recognized as such and converted into unicode characters before -they are stored in the AST. This means that renderers to formats other -than HTML need not be HTML-entity aware. HTML renderers may either escape -unicode characters as entities or leave them as they are. (However, -`"`, `&`, `<`, and `>` must always be rendered as entities.) +## Entity and numeric character references -[Named entities](@name-entities) consist of `&` -+ any of the valid HTML5 entity names + `;`. The -[following document](https://html.spec.whatwg.org/multipage/entities.json) -is used as an authoritative source of the valid entity names and their -corresponding codepoints. +Valid HTML entity references and numeric character references +can be used in place of the corresponding Unicode character, +with the following exceptions: -. +- Entity and character references are not recognized in code + blocks and code spans. + +- Entity and character references cannot stand in place of + special characters that define structural elements in + CommonMark. For example, although `*` can be used + in place of a literal `*` character, `*` cannot replace + `*` in emphasis delimiters, bullet list markers, or thematic + breaks. + +Conforming CommonMark parsers need not store information about +whether a particular character was represented in the source +using a Unicode character or an entity reference. + +[Entity references](@) consist of `&` + any of the valid +HTML5 entity names + `;`. The +document +is used as an authoritative source for the valid entity +references and their corresponding code points. + +```````````````````````````````` example   & © Æ Ď ¾ ℋ ⅆ ∲ ≧̸ @@ -4869,263 +5957,405 @@ corresponding codepoints.

      & © Æ Ď ¾ ℋ ⅆ ∲ ≧̸

    -. +```````````````````````````````` -[Decimal entities](@decimal-entities) -consist of `&#` + a string of 1--8 arabic digits + `;`. Again, these -entities need to be recognised and transformed into their corresponding -unicode codepoints. Invalid unicode codepoints will be replaced by -the "unknown codepoint" character (`U+FFFD`). For security reasons, -the codepoint `U+0000` will also be replaced by `U+FFFD`. +[Decimal numeric character +references](@) +consist of `&#` + a string of 1--7 arabic digits + `;`. A +numeric character reference is parsed as the corresponding +Unicode character. Invalid Unicode code points will be replaced by +the REPLACEMENT CHARACTER (`U+FFFD`). For security reasons, +the code point `U+0000` will also be replaced by `U+FFFD`. + +```````````````````````````````` example +# Ӓ Ϡ � . -# Ӓ Ϡ � � -. -

    # Ӓ Ϡ � �

    -. +

    # Ӓ Ϡ �

    +```````````````````````````````` -[Hexadecimal entities](@hexadecimal-entities) -consist of `&#` + either `X` or `x` + a string of 1-8 hexadecimal digits -+ `;`. They will also be parsed and turned into the corresponding -unicode codepoints in the AST. -. +[Hexadecimal numeric character +references](@) consist of `&#` + +either `X` or `x` + a string of 1-6 hexadecimal digits + `;`. +They too are parsed as the corresponding Unicode character (this +time specified with a hexadecimal numeral instead of decimal). + +```````````````````````````````` example " ആ ಫ .

    " ആ ಫ

    -. +```````````````````````````````` + Here are some nonentities: +```````````````````````````````` example +  &x; &#; &#x; +� +&#abcdef0; +&ThisIsNotDefined; &hi?; . -  &x; &#; &#x; &ThisIsWayTooLongToBeAnEntityIsntIt; &hi?; -. -

    &nbsp &x; &#; &#x; &ThisIsWayTooLongToBeAnEntityIsntIt; &hi?;

    -. +

    &nbsp &x; &#; &#x; +&#987654321; +&#abcdef0; +&ThisIsNotDefined; &hi?;

    +```````````````````````````````` -Although HTML5 does accept some entities without a trailing semicolon -(such as `©`), these are not recognized as entities here, because it -makes the grammar too ambiguous: -. +Although HTML5 does accept some entity references +without a trailing semicolon (such as `©`), these are not +recognized here, because it makes the grammar too ambiguous: + +```````````````````````````````` example © .

    &copy

    -. +```````````````````````````````` + Strings that are not on the list of HTML5 named entities are not -recognized as entities either: +recognized as entity references either: -. +```````````````````````````````` example &MadeUpEntity; .

    &MadeUpEntity;

    -. +```````````````````````````````` -Entities are recognized in any context besides code spans or -code blocks, including raw HTML, URLs, [link title]s, and -[fenced code block] [info string]s: -. +Entity and numeric character references are recognized in any +context besides code spans or code blocks, including +URLs, [link titles], and [fenced code block][] [info strings]: + +```````````````````````````````` example . -. +```````````````````````````````` -. + +```````````````````````````````` example [foo](/föö "föö") .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example [foo] [foo]: /föö "föö" .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example ``` föö foo ``` .
    foo
     
    -. +```````````````````````````````` -Entities are treated as literal text in code spans and code blocks: -. +Entity and numeric character references are treated as literal +text in code spans and code blocks: + +```````````````````````````````` example `föö` .

    f&ouml;&ouml;

    -. +```````````````````````````````` -. + +```````````````````````````````` example föfö .
    f&ouml;f&ouml;
     
    +```````````````````````````````` + + +Entity and numeric character references cannot be used +in place of symbols indicating structure in CommonMark +documents. + +```````````````````````````````` example +*foo* +*foo* +. +

    *foo* +foo

    +```````````````````````````````` + +```````````````````````````````` example +* foo + +* foo . +

    * foo

    +
      +
    • foo
    • +
    +```````````````````````````````` + +```````````````````````````````` example +foo bar +. +

    foo + +bar

    +```````````````````````````````` + +```````````````````````````````` example + foo +. +

    →foo

    +```````````````````````````````` + + +```````````````````````````````` example +[a](url "tit") +. +

    [a](url "tit")

    +```````````````````````````````` + ## Code spans -A [backtick string](@backtick-string) +A [backtick string](@) is a string of one or more backtick characters (`` ` ``) that is neither preceded nor followed by a backtick. -A [code span](@code-span) begins with a backtick string and ends with +A [code span](@) begins with a backtick string and ends with a backtick string of equal length. The contents of the code span are -the characters between the two backtick strings, with leading and -trailing spaces and [line ending]s removed, and -[whitespace] collapsed to single spaces. +the characters between the two backtick strings, normalized in the +following ways: + +- First, [line endings] are converted to [spaces]. +- If the resulting string both begins *and* ends with a [space] + character, but does not consist entirely of [space] + characters, a single [space] character is removed from the + front and back. This allows you to include code that begins + or ends with backtick characters, which must be separated by + whitespace from the opening or closing backtick strings. This is a simple code span: -. +```````````````````````````````` example `foo` .

    foo

    -. +```````````````````````````````` + Here two backticks are used, because the code contains a backtick. -This example also illustrates stripping of leading and trailing spaces: +This example also illustrates stripping of a single leading and +trailing space: -. -`` foo ` bar `` +```````````````````````````````` example +`` foo ` bar `` .

    foo ` bar

    -. +```````````````````````````````` + This example shows the motivation for stripping leading and trailing spaces: -. +```````````````````````````````` example ` `` ` .

    ``

    +```````````````````````````````` + +Note that only *one* space is stripped: + +```````````````````````````````` example +` `` ` . +

    ``

    +```````````````````````````````` -[Line ending]s are treated like spaces: +The stripping only happens if the space is on both +sides of the string: +```````````````````````````````` example +` a` . +

    a

    +```````````````````````````````` + +Only [spaces], and not [unicode whitespace] in general, are +stripped in this way: + +```````````````````````````````` example +` b ` +. +

     b 

    +```````````````````````````````` + +No stripping occurs if the code span contains only spaces: + +```````````````````````````````` example +` ` +` ` +. +

      +

    +```````````````````````````````` + + +[Line endings] are treated like spaces: + +```````````````````````````````` example `` foo +bar +baz `` . -

    foo

    +

    foo bar baz

    +```````````````````````````````` + +```````````````````````````````` example +`` +foo +`` . +

    foo

    +```````````````````````````````` -Interior spaces and [line ending]s are collapsed into -single spaces, just as they would be by a browser: +Interior spaces are not collapsed: + +```````````````````````````````` example +`foo bar +baz` . -`foo bar - baz` -. -

    foo bar baz

    -. +

    foo bar baz

    +```````````````````````````````` -Q: Why not just leave the spaces, since browsers will collapse them -anyway? A: Because we might be targeting a non-HTML format, and we -shouldn't rely on HTML-specific rendering assumptions. +Note that browsers will typically collapse consecutive spaces +when rendering `<code>` elements, so it is recommended that +the following CSS be used: -(Existing implementations differ in their treatment of internal -spaces and [line ending]s. Some, including `Markdown.pl` and -`showdown`, convert an internal [line ending] into a -`
    ` tag. But this makes things difficult for those who like to -hard-wrap their paragraphs, since a line break in the midst of a code -span will cause an unintended line break in the output. Others just -leave internal spaces as they are, which is fine if only HTML is being -targeted.) + code{white-space: pre-wrap;} -. -`foo `` bar` -. -

    foo `` bar

    -. Note that backslash escapes do not work in code spans. All backslashes are treated literally: -. +```````````````````````````````` example `foo\`bar` .

    foo\bar`

    -. +```````````````````````````````` + Backslash escapes are never needed, because one can always choose a string of *n* backtick characters as delimiters, where the code does not contain any strings of exactly *n* backtick characters. +```````````````````````````````` example +``foo`bar`` +. +

    foo`bar

    +```````````````````````````````` + +```````````````````````````````` example +` foo `` bar ` +. +

    foo `` bar

    +```````````````````````````````` + + Code span backticks have higher precedence than any other inline constructs except HTML tags and autolinks. Thus, for example, this is not parsed as emphasized text, since the second `*` is part of a code span: -. +```````````````````````````````` example *foo`*` .

    *foo*

    -. +```````````````````````````````` + And this is not parsed as a link: -. +```````````````````````````````` example [not a `link](/foo`) .

    [not a link](/foo)

    -. +```````````````````````````````` + Code spans, HTML tags, and autolinks have the same precedence. Thus, this is code: -. +```````````````````````````````` example `` .

    <a href="">`

    -. +```````````````````````````````` + But this is an HTML tag: -. +```````````````````````````````` example
    ` .

    `

    -. +```````````````````````````````` + And this is code: -. +```````````````````````````````` example `` .

    <http://foo.bar.baz>`

    -. +```````````````````````````````` + But this is an autolink: -. +```````````````````````````````` example ` .

    http://foo.bar.`baz`

    -. +```````````````````````````````` + When a backtick string is not closed by a matching backtick string, we just have literal backticks: -. +```````````````````````````````` example ```foo`` .

    ```foo``

    -. +```````````````````````````````` -. + +```````````````````````````````` example `foo .

    `foo

    +```````````````````````````````` + +The following case also illustrates the need for opening and +closing backtick strings to be equal in length: + +```````````````````````````````` example +`foo``bar`` . +

    `foobar

    +```````````````````````````````` + ## Emphasis and strong emphasis @@ -5173,24 +6403,27 @@ no emphasis: foo_bar_baz The rules given below capture all of these patterns, while allowing for efficient parsing strategies that do not backtrack. -First, some definitions. A [delimiter run](@delimiter-run) is either +First, some definitions. A [delimiter run](@) is either a sequence of one or more `*` characters that is not preceded or -followed by a `*` character, or a sequence of one or more `_` -characters that is not preceded or followed by a `_` character. - -A [left-flanking delimiter run](@left-flanking-delimiter-run) is -a [delimiter run] that is (a) not followed by [unicode whitespace], -and (b) either not followed by a [punctuation character], or -preceded by [unicode whitespace] or a [punctuation character]. +followed by a non-backslash-escaped `*` character, or a sequence +of one or more `_` characters that is not preceded or followed by +a non-backslash-escaped `_` character. + +A [left-flanking delimiter run](@) is +a [delimiter run] that is (1) not followed by [Unicode whitespace], +and either (2a) not followed by a [punctuation character], or +(2b) followed by a [punctuation character] and +preceded by [Unicode whitespace] or a [punctuation character]. For purposes of this definition, the beginning and the end of -the line count as unicode whitespace. +the line count as Unicode whitespace. -A [right-flanking delimiter run](@right-flanking-delimiter-run) is -a [delimiter run] that is (a) not preceded by [unicode whitespace], -and (b) either not preceded by a [punctuation character], or -followed by [unicode whitespace] or a [punctuation character]. +A [right-flanking delimiter run](@) is +a [delimiter run] that is (1) not preceded by [Unicode whitespace], +and either (2a) not preceded by a [punctuation character], or +(2b) preceded by a [punctuation character] and +followed by [Unicode whitespace] or a [punctuation character]. 
For purposes of this definition, the beginning and the end of -the line count as unicode whitespace. +the line count as Unicode whitespace. Here are some examples of delimiter runs. @@ -5236,56 +6469,62 @@ are a bit more complex than the ones given here.) The following rules define emphasis and strong emphasis: -1. A single `*` character [can open emphasis](@can-open-emphasis) +1. A single `*` character [can open emphasis](@) iff (if and only if) it is part of a [left-flanking delimiter run]. 2. A single `_` character [can open emphasis] iff it is part of a [left-flanking delimiter run] and either (a) not part of a [right-flanking delimiter run] - or (b) part of a [right-flanking delimeter run] + or (b) part of a [right-flanking delimiter run] preceded by punctuation. -3. A single `*` character [can close emphasis](@can-close-emphasis) +3. A single `*` character [can close emphasis](@) iff it is part of a [right-flanking delimiter run]. 4. A single `_` character [can close emphasis] iff it is part of a [right-flanking delimiter run] and either (a) not part of a [left-flanking delimiter run] - or (b) part of a [left-flanking delimeter run] + or (b) part of a [left-flanking delimiter run] followed by punctuation. -5. A double `**` [can open strong emphasis](@can-open-strong-emphasis) +5. A double `**` [can open strong emphasis](@) iff it is part of a [left-flanking delimiter run]. 6. A double `__` [can open strong emphasis] iff it is part of a [left-flanking delimiter run] and either (a) not part of a [right-flanking delimiter run] - or (b) part of a [right-flanking delimeter run] + or (b) part of a [right-flanking delimiter run] preceded by punctuation. -7. A double `**` [can close strong emphasis](@can-close-strong-emphasis) +7. A double `**` [can close strong emphasis](@) iff it is part of a [right-flanking delimiter run]. -8. A double `__` [can close strong emphasis] +8. 
A double `__` [can close strong emphasis] iff it is part of a [right-flanking delimiter run] and either (a) not part of a [left-flanking delimiter run] - or (b) part of a [left-flanking delimeter run] + or (b) part of a [left-flanking delimiter run] followed by punctuation. 9. Emphasis begins with a delimiter that [can open emphasis] and ends with a delimiter that [can close emphasis], and that uses the same - character (`_` or `*`) as the opening delimiter. There must - be a nonempty sequence of inlines between the open delimiter - and the closing delimiter; these form the contents of the emphasis - inline. + character (`_` or `*`) as the opening delimiter. The + opening and closing delimiters must belong to separate + [delimiter runs]. If one of the delimiters can both + open and close emphasis, then the sum of the lengths of the + delimiter runs containing the opening and closing delimiters + must not be a multiple of 3 unless both lengths are + multiples of 3. 10. Strong emphasis begins with a delimiter that [can open strong emphasis] and ends with a delimiter that [can close strong emphasis], and that uses the same character - (`_` or `*`) as the opening delimiter. - There must be a nonempty sequence of inlines between the open - delimiter and the closing delimiter; these form the contents of - the strong emphasis inline. + (`_` or `*`) as the opening delimiter. The + opening and closing delimiters must belong to separate + [delimiter runs]. If one of the delimiters can both open + and close strong emphasis, then the sum of the lengths of + the delimiter runs containing the opening and closing + delimiters must not be a multiple of 3 unless both lengths + are multiples of 3. 11. A literal `*` character cannot occur at the beginning or end of `*`-delimited emphasis or `**`-delimited strong emphasis, unless it @@ -5302,16 +6541,14 @@ the following principles resolve ambiguity: an interpretation `...` is always preferred to `...`. -14. 
An interpretation `...` is always - preferred to `..`. +14. An interpretation `...` is always + preferred to `...`. 15. When two potential emphasis or strong emphasis spans overlap, so that the second begins before the first ends and ends after the first ends, the first takes precedence. Thus, for example, `*foo _bar* baz_` is parsed as `foo _bar baz_` rather - than `*foo bar* baz`. For the same reason, - `**foo*bar**` is parsed as `foobar*` - rather than `foo*bar`. + than `*foo bar* baz`. 16. When there are two potential emphasis or strong emphasis spans with the same closing delimiter, the shorter one (the one that @@ -5330,176 +6567,194 @@ These rules can be illustrated through a series of examples. Rule 1: -. +```````````````````````````````` example *foo bar* .

    foo bar

    -. +```````````````````````````````` + This is not emphasis, because the opening `*` is followed by whitespace, and hence not part of a [left-flanking delimiter run]: -. +```````````````````````````````` example a * foo bar* .

    a * foo bar*

    -. +```````````````````````````````` + This is not emphasis, because the opening `*` is preceded by an alphanumeric and followed by punctuation, and hence not part of a [left-flanking delimiter run]: -. +```````````````````````````````` example a*"foo"* .

    a*"foo"*

    -. +```````````````````````````````` + Unicode nonbreaking spaces count as whitespace, too: -. +```````````````````````````````` example * a * .

    * a *

    -. +```````````````````````````````` + Intraword emphasis with `*` is permitted: -. +```````````````````````````````` example foo*bar* .

    foobar

    -. +```````````````````````````````` -. + +```````````````````````````````` example 5*6*78 .

    5678

    -. +```````````````````````````````` + Rule 2: -. +```````````````````````````````` example _foo bar_ .

    foo bar

    -. +```````````````````````````````` + This is not emphasis, because the opening `_` is followed by whitespace: -. +```````````````````````````````` example _ foo bar_ .

    _ foo bar_

    -. +```````````````````````````````` + This is not emphasis, because the opening `_` is preceded by an alphanumeric and followed by punctuation: -. +```````````````````````````````` example a_"foo"_ .

    a_"foo"_

    -. +```````````````````````````````` + Emphasis with `_` is not allowed inside words: -. +```````````````````````````````` example foo_bar_ .

    foo_bar_

    -. +```````````````````````````````` -. + +```````````````````````````````` example 5_6_78 .

    5_6_78

    -. +```````````````````````````````` -. + +```````````````````````````````` example пристаням_стремятся_ .

    пристаням_стремятся_

    -. +```````````````````````````````` + Here `_` does not generate emphasis, because the first delimiter run is right-flanking and the second left-flanking: -. +```````````````````````````````` example aa_"bb"_cc .

    aa_"bb"_cc

    -. +```````````````````````````````` + This is emphasis, even though the opening delimiter is both left- and right-flanking, because it is preceded by punctuation: -. +```````````````````````````````` example foo-_(bar)_ .

    foo-(bar)

    -. +```````````````````````````````` + Rule 3: This is not emphasis, because the closing delimiter does not match the opening delimiter: -. +```````````````````````````````` example _foo* .

    _foo*

    -. +```````````````````````````````` + This is not emphasis, because the closing `*` is preceded by whitespace: -. +```````````````````````````````` example *foo bar * .

    *foo bar *

    -. +```````````````````````````````` + A newline also counts as whitespace: -. +```````````````````````````````` example *foo bar * . -

    *foo bar

    -
      -
    • -
    -. +

    *foo bar +*

    +```````````````````````````````` + This is not emphasis, because the second `*` is preceded by punctuation and followed by an alphanumeric (hence it is not part of a [right-flanking delimiter run]): -. +```````````````````````````````` example *(*foo) .

    *(*foo)

    -. +```````````````````````````````` + The point of this restriction is more easily appreciated with this example: -. +```````````````````````````````` example *(*foo*)* .

    (foo)

    -. +```````````````````````````````` + Intraword emphasis with `*` is allowed: -. +```````````````````````````````` example *foo*bar .

    foobar

    -. +```````````````````````````````` + Rule 4: @@ -5507,164 +6762,184 @@ Rule 4: This is not emphasis, because the closing `_` is preceded by whitespace: -. +```````````````````````````````` example _foo bar _ .

    _foo bar _

    -. +```````````````````````````````` + This is not emphasis, because the second `_` is preceded by punctuation and followed by an alphanumeric: -. +```````````````````````````````` example _(_foo) .

    _(_foo)

    -. +```````````````````````````````` + This is emphasis within emphasis: -. +```````````````````````````````` example _(_foo_)_ .

    (foo)

    -. +```````````````````````````````` + Intraword emphasis is disallowed for `_`: -. +```````````````````````````````` example _foo_bar .

    _foo_bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example _пристаням_стремятся .

    _пристаням_стремятся

    -. +```````````````````````````````` -. + +```````````````````````````````` example _foo_bar_baz_ .

    foo_bar_baz

    -. +```````````````````````````````` + This is emphasis, even though the closing delimiter is both left- and right-flanking, because it is followed by punctuation: -. +```````````````````````````````` example _(bar)_. .

    (bar).

    -. +```````````````````````````````` + Rule 5: -. +```````````````````````````````` example **foo bar** .

    foo bar

    -. +```````````````````````````````` + This is not strong emphasis, because the opening delimiter is followed by whitespace: -. +```````````````````````````````` example ** foo bar** .

    ** foo bar**

    -. +```````````````````````````````` + This is not strong emphasis, because the opening `**` is preceded by an alphanumeric and followed by punctuation, and hence not part of a [left-flanking delimiter run]: -. +```````````````````````````````` example a**"foo"** .

    a**"foo"**

    -. +```````````````````````````````` + Intraword strong emphasis with `**` is permitted: -. +```````````````````````````````` example foo**bar** .

    foobar

    -. +```````````````````````````````` + Rule 6: -. +```````````````````````````````` example __foo bar__ .

    foo bar

    -. +```````````````````````````````` + This is not strong emphasis, because the opening delimiter is followed by whitespace: -. +```````````````````````````````` example __ foo bar__ .

    __ foo bar__

    -. +```````````````````````````````` + A newline counts as whitespace: -. +```````````````````````````````` example __ foo bar__ .

    __ foo bar__

    -. +```````````````````````````````` + This is not strong emphasis, because the opening `__` is preceded by an alphanumeric and followed by punctuation: -. +```````````````````````````````` example a__"foo"__ .

    a__"foo"__

    -. +```````````````````````````````` + Intraword strong emphasis is forbidden with `__`: -. +```````````````````````````````` example foo__bar__ .

    foo__bar__

    -. +```````````````````````````````` -. + +```````````````````````````````` example 5__6__78 .

    5__6__78

    -. +```````````````````````````````` -. + +```````````````````````````````` example пристаням__стремятся__ .

    пристаням__стремятся__

    -. +```````````````````````````````` -. + +```````````````````````````````` example __foo, __bar__, baz__ . -

    foo, bar, baz

    -. +

    foo, bar, baz

    +```````````````````````````````` + This is strong emphasis, even though the opening delimiter is both left- and right-flanking, because it is preceded by punctuation: -. +```````````````````````````````` example foo-__(bar)__ .

    foo-(bar)

    -. +```````````````````````````````` + Rule 7: @@ -5672,11 +6947,12 @@ Rule 7: This is not strong emphasis, because the closing delimiter is preceded by whitespace: -. +```````````````````````````````` example **foo bar ** .

    **foo bar **

    -. +```````````````````````````````` + (Nor can it be interpreted as an emphasized `*foo bar *`, because of Rule 11.) @@ -5684,215 +6960,278 @@ Rule 11.) This is not strong emphasis, because the second `**` is preceded by punctuation and followed by an alphanumeric: -. +```````````````````````````````` example **(**foo) .

    **(**foo)

    -. +```````````````````````````````` + The point of this restriction is more easily appreciated with these examples: -. +```````````````````````````````` example *(**foo**)* .

    (foo)

    -. +```````````````````````````````` -. + +```````````````````````````````` example **Gomphocarpus (*Gomphocarpus physocarpus*, syn. *Asclepias physocarpa*)** .

    Gomphocarpus (Gomphocarpus physocarpus, syn. Asclepias physocarpa)

    -. +```````````````````````````````` -. + +```````````````````````````````` example **foo "*bar*" foo** .

    foo "bar" foo

    -. +```````````````````````````````` + Intraword emphasis: -. +```````````````````````````````` example **foo**bar .

    foobar

    -. +```````````````````````````````` + Rule 8: This is not strong emphasis, because the closing delimiter is preceded by whitespace: -. +```````````````````````````````` example __foo bar __ .

    __foo bar __

    -. +```````````````````````````````` + This is not strong emphasis, because the second `__` is preceded by punctuation and followed by an alphanumeric: -. +```````````````````````````````` example __(__foo) .

    __(__foo)

    -. +```````````````````````````````` + The point of this restriction is more easily appreciated with this example: -. +```````````````````````````````` example _(__foo__)_ .

    (foo)

    -. +```````````````````````````````` + Intraword strong emphasis is forbidden with `__`: -. +```````````````````````````````` example __foo__bar .

    __foo__bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example __пристаням__стремятся .

    __пристаням__стремятся

    -. +```````````````````````````````` -. + +```````````````````````````````` example __foo__bar__baz__ .

    foo__bar__baz

    -. +```````````````````````````````` + This is strong emphasis, even though the closing delimiter is both left- and right-flanking, because it is followed by punctuation: -. +```````````````````````````````` example __(bar)__. .

    (bar).

    -. +```````````````````````````````` + Rule 9: Any nonempty sequence of inline elements can be the contents of an emphasized span. -. +```````````````````````````````` example *foo [bar](/url)* .

    foo bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example *foo bar* .

    foo bar

    -. +```````````````````````````````` + In particular, emphasis and strong emphasis can be nested inside emphasis: -. +```````````````````````````````` example _foo __bar__ baz_ .

    foo bar baz

    -. +```````````````````````````````` -. + +```````````````````````````````` example _foo _bar_ baz_ .

    foo bar baz

    -. +```````````````````````````````` -. + +```````````````````````````````` example __foo_ bar_ .

    foo bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example *foo *bar** .

    foo bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example *foo **bar** baz* .

    foo bar baz

    -. - -But note: +```````````````````````````````` -. +```````````````````````````````` example *foo**bar**baz* . -

    foobarbaz

    -. +

    foobarbaz

    +```````````````````````````````` + +Note that in the preceding case, the interpretation + +``` markdown +

    foobarbaz

    +``` + + +is precluded by the condition that a delimiter that +can both open and close (like the `*` after `foo`) +cannot form emphasis if the sum of the lengths of +the delimiter runs containing the opening and +closing delimiters is a multiple of 3 unless +both lengths are multiples of 3. -The difference is that in the preceding case, the internal delimiters -[can close emphasis], while in the cases with spaces, they cannot. +For the same reason, we don't get two consecutive +emphasis sections in this example: + +```````````````````````````````` example +*foo**bar* . +

    foo**bar

    +```````````````````````````````` + + +The same condition ensures that the following +cases are all strong emphasis nested inside +emphasis, even when the interior spaces are +omitted: + + +```````````````````````````````` example ***foo** bar* .

    foo bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example *foo **bar*** .

    foo bar

    -. +```````````````````````````````` -Note, however, that in the following case we get no strong -emphasis, because the opening delimiter is closed by the first -`*` before `bar`: -. +```````````````````````````````` example *foo**bar*** . -

    foobar**

    +

    foobar

    +```````````````````````````````` + + +When the lengths of the interior closing and opening +delimiter runs are *both* multiples of 3, though, +they can match to create emphasis: + +```````````````````````````````` example +foo***bar***baz +. +

    foobarbaz

    +```````````````````````````````` + +```````````````````````````````` example +foo******bar*********baz . +

    foobar***baz

    +```````````````````````````````` Indefinite levels of nesting are possible: -. +```````````````````````````````` example *foo **bar *baz* bim** bop* .

    foo bar baz bim bop

    -. +```````````````````````````````` -. + +```````````````````````````````` example *foo [*bar*](/url)* .

    foo bar

    -. +```````````````````````````````` + There can be no empty emphasis or strong emphasis: -. +```````````````````````````````` example ** is not an empty emphasis .

    ** is not an empty emphasis

    -. +```````````````````````````````` -. + +```````````````````````````````` example **** is not an empty strong emphasis .

    **** is not an empty strong emphasis

    -. +```````````````````````````````` + Rule 10: @@ -5900,431 +7239,508 @@ Rule 10: Any nonempty sequence of inline elements can be the contents of a strongly emphasized span. -. +```````````````````````````````` example **foo [bar](/url)** .

    foo bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example **foo bar** .

    foo bar

    -. +```````````````````````````````` + In particular, emphasis and strong emphasis can be nested inside strong emphasis: -. +```````````````````````````````` example __foo _bar_ baz__ .

    foo bar baz

    -. +```````````````````````````````` -. + +```````````````````````````````` example __foo __bar__ baz__ . -

    foo bar baz

    -. +

    foo bar baz

    +```````````````````````````````` -. + +```````````````````````````````` example ____foo__ bar__ . -

    foo bar

    -. +

    foo bar

    +```````````````````````````````` -. + +```````````````````````````````` example **foo **bar**** . -

    foo bar

    -. +

    foo bar

    +```````````````````````````````` -. + +```````````````````````````````` example **foo *bar* baz** .

    foo bar baz

    -. +```````````````````````````````` -But note: -. +```````````````````````````````` example **foo*bar*baz** . -

    foobarbaz**

    -. +

    foobarbaz

    +```````````````````````````````` -The difference is that in the preceding case, the internal delimiters -[can close emphasis], while in the cases with spaces, they cannot. -. +```````````````````````````````` example ***foo* bar** .

    foo bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example **foo *bar*** .

    foo bar

    -. +```````````````````````````````` + Indefinite levels of nesting are possible: -. +```````````````````````````````` example **foo *bar **baz** bim* bop** .

    foo bar baz bim bop

    -. +```````````````````````````````` -. + +```````````````````````````````` example **foo [*bar*](/url)** .

    foo bar

    -. +```````````````````````````````` + There can be no empty emphasis or strong emphasis: -. +```````````````````````````````` example __ is not an empty emphasis .

    __ is not an empty emphasis

    -. +```````````````````````````````` -. + +```````````````````````````````` example ____ is not an empty strong emphasis .

    ____ is not an empty strong emphasis

    -. +```````````````````````````````` + Rule 11: -. +```````````````````````````````` example foo *** .

    foo ***

    -. +```````````````````````````````` -. + +```````````````````````````````` example foo *\** .

    foo *

    -. +```````````````````````````````` -. + +```````````````````````````````` example foo *_* .

    foo _

    -. +```````````````````````````````` -. + +```````````````````````````````` example foo ***** .

    foo *****

    -. +```````````````````````````````` -. + +```````````````````````````````` example foo **\*** .

    foo *

    -. +```````````````````````````````` -. + +```````````````````````````````` example foo **_** .

    foo _

    -. +```````````````````````````````` + Note that when delimiters do not match evenly, Rule 11 determines that the excess literal `*` characters will appear outside of the emphasis, rather than inside it: -. +```````````````````````````````` example **foo* .

    *foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example *foo** .

    foo*

    -. +```````````````````````````````` -. + +```````````````````````````````` example ***foo** .

    *foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example ****foo* .

    ***foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example **foo*** .

    foo*

    -. +```````````````````````````````` -. + +```````````````````````````````` example *foo**** .

    foo***

    -. +```````````````````````````````` + Rule 12: -. +```````````````````````````````` example foo ___ .

    foo ___

    -. +```````````````````````````````` -. + +```````````````````````````````` example foo _\__ .

    foo _

    -. +```````````````````````````````` -. + +```````````````````````````````` example foo _*_ .

    foo *

    -. +```````````````````````````````` -. + +```````````````````````````````` example foo _____ .

    foo _____

    -. +```````````````````````````````` -. + +```````````````````````````````` example foo __\___ .

    foo _

    -. +```````````````````````````````` -. + +```````````````````````````````` example foo __*__ .

    foo *

    -. +```````````````````````````````` -. + +```````````````````````````````` example __foo_ .

    _foo

    -. +```````````````````````````````` + Note that when delimiters do not match evenly, Rule 12 determines that the excess literal `_` characters will appear outside of the emphasis, rather than inside it: -. +```````````````````````````````` example _foo__ .

    foo_

    -. +```````````````````````````````` -. + +```````````````````````````````` example ___foo__ .

    _foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example ____foo_ .

    ___foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example __foo___ .

    foo_

    -. +```````````````````````````````` -. + +```````````````````````````````` example _foo____ .

    foo___

    -. +```````````````````````````````` + Rule 13 implies that if you want emphasis nested directly inside emphasis, you must use different delimiters: -. +```````````````````````````````` example **foo** .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example *_foo_* .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example __foo__ .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example _*foo*_ .

    foo

    -. +```````````````````````````````` + However, strong emphasis within strong emphasis is possible without switching delimiters: -. +```````````````````````````````` example ****foo**** . -

    foo

    -. +

    foo

    +```````````````````````````````` -. + +```````````````````````````````` example ____foo____ . -

    foo

    -. +

    foo

    +```````````````````````````````` + Rule 13 can be applied to arbitrarily long sequences of delimiters: -. +```````````````````````````````` example ******foo****** . -

    foo

    -. +

    foo

    +```````````````````````````````` + Rule 14: -. +```````````````````````````````` example ***foo*** . -

    foo

    -. +

    foo

    +```````````````````````````````` -. + +```````````````````````````````` example _____foo_____ . -

    foo

    -. +

    foo

    +```````````````````````````````` + Rule 15: -. +```````````````````````````````` example *foo _bar* baz_ .

    foo _bar baz_

    -. +```````````````````````````````` -. -**foo*bar** -. -

    foobar*

    -. -. +```````````````````````````````` example *foo __bar *baz bim__ bam* .

    foo bar *baz bim bam

    -. +```````````````````````````````` + Rule 16: -. +```````````````````````````````` example **foo **bar baz** .

    **foo bar baz

    -. +```````````````````````````````` -. + +```````````````````````````````` example *foo *bar baz* .

    *foo bar baz

    -. +```````````````````````````````` + Rule 17: -. +```````````````````````````````` example *[bar*](/url) .

    *bar*

    -. +```````````````````````````````` -. + +```````````````````````````````` example _foo [bar_](/url) .

    _foo bar_

    -. +```````````````````````````````` -. + +```````````````````````````````` example * .

    *

    -. +```````````````````````````````` -. + +```````````````````````````````` example ** .

    **

    -. +```````````````````````````````` -. + +```````````````````````````````` example __ .

    __

    -. +```````````````````````````````` -. + +```````````````````````````````` example *a `*`* .

    a *

    -. +```````````````````````````````` -. + +```````````````````````````````` example _a `_`_ .

    a _

    -. +```````````````````````````````` -. + +```````````````````````````````` example **a .

    **ahttp://foo.bar/?q=**

    -. +```````````````````````````````` -. + +```````````````````````````````` example __a .

    __ahttp://foo.bar/?q=__

    +```````````````````````````````` + + +
    + +## Strikethrough (extension) + +GFM enables the `strikethrough` extension, where an additional emphasis type is +available. + +Strikethrough text is any text wrapped in two tildes (`~`). + +```````````````````````````````` example strikethrough +~~Hi~~ Hello, world! . +

    Hi Hello, world!

    +```````````````````````````````` +As with regular emphasis delimiters, a new paragraph will cause strikethrough +parsing to cease: + +```````````````````````````````` example strikethrough +This ~~has a + +new paragraph~~. +. +

    This ~~has a

    +

    new paragraph~~.

    +```````````````````````````````` + +
    ## Links A link contains [link text] (the visible text), a [link destination] (the URI that is the link destination), and optionally a [link title]. -There are two basic kinds of links in Markdown. In [inline link]s the +There are two basic kinds of links in Markdown. In [inline links] the destination and title are given immediately after the link text. In -[reference link]s the destination and title are defined elsewhere in +[reference links] the destination and title are defined elsewhere in the document. -A [link text](@link-text) consists of a sequence of zero or more +A [link text](@) consists of a sequence of zero or more inline elements enclosed by square brackets (`[` and `]`). The following rules apply: @@ -6337,7 +7753,7 @@ following rules apply: with an open bracket `[`, a sequence of zero or more inlines, and a close bracket `]`. -- Backtick [code span]s, [autolink]s, and raw [HTML tag]s bind more tightly +- Backtick [code spans], [autolinks], and raw [HTML tags] bind more tightly than the brackets in link text. Thus, for example, `` [foo`]` `` could not be a link text, since the second `]` is part of a code span. @@ -6345,19 +7761,21 @@ following rules apply: - The brackets in link text bind more tightly than markers for [emphasis and strong emphasis]. Thus, for example, `*[foo*](url)` is a link. -A [link destination](@link-destination) consists of either +A [link destination](@) consists of either - a sequence of zero or more characters between an opening `<` and a - closing `>` that contains no line breaks or unescaped `<` or `>` - characters, or + closing `>` that contains no line breaks or unescaped + `<` or `>` characters, or -- a nonempty sequence of characters that does not include - ASCII space or control characters, and includes parentheses - only if (a) they are backslash-escaped or (b) they are part of - a balanced pair of unescaped parentheses that is not itself - inside a balanced pair of unescaped parentheses. 
+- a nonempty sequence of characters that does not start with + `<`, does not include ASCII space or control characters, and + includes parentheses only if (a) they are backslash-escaped or + (b) they are part of a balanced pair of unescaped parentheses. + (Implementations may impose limits on parentheses nesting to + avoid performance issues, but at least three levels of nesting + should be supported.) -A [link title](@link-title) consists of either +A [link title](@) consists of either - a sequence of zero or more characters between straight double-quote characters (`"`), including a `"` character only if it is @@ -6368,12 +7786,13 @@ A [link title](@link-title) consists of either backslash-escaped, or - a sequence of zero or more characters between matching parentheses - (`(...)`), including a `)` character only if it is backslash-escaped. + (`(...)`), including a `(` or `)` character only if it is + backslash-escaped. -Although [link title]s may span multiple lines, they may not contain +Although [link titles] may span multiple lines, they may not contain a [blank line]. -An [inline link](@inline-link) consists of a [link text] followed immediately +An [inline link](@) consists of a [link text] followed immediately by a left parenthesis `(`, optional [whitespace], an optional [link destination], an optional [link title] separated from the link destination by [whitespace], optional [whitespace], and a right @@ -6387,152 +7806,200 @@ above. Here is a simple inline link: -. +```````````````````````````````` example [link](/uri "title") .

    link

    -. +```````````````````````````````` + The title may be omitted: -. +```````````````````````````````` example [link](/uri) .

    link

    -. +```````````````````````````````` + Both the title and the destination may be omitted: -. +```````````````````````````````` example [link]() .

    link

    -. +```````````````````````````````` -. + +```````````````````````````````` example [link](<>) .

    link

    -. +```````````````````````````````` -If the destination contains spaces, it must be enclosed in pointy -braces: +The destination can only contain spaces if it is +enclosed in pointy brackets: -. +```````````````````````````````` example [link](/my uri) .

    [link](/my uri)

    -. +```````````````````````````````` -. +```````````````````````````````` example [link]() .

    link

    -. +```````````````````````````````` -The destination cannot contain line breaks, even with pointy braces: +The destination cannot contain line breaks, +even if enclosed in pointy brackets: -. +```````````````````````````````` example [link](foo bar) .

    [link](foo bar)

    -. +```````````````````````````````` -. +```````````````````````````````` example [link]() .

    [link]()

    -. +```````````````````````````````` -One level of balanced parentheses is allowed without escaping: +The destination can contain `)` if it is enclosed +in pointy brackets: +```````````````````````````````` example +[a]() . -[link]((foo)and(bar)) -. -

    link

    -. +

    a

    +```````````````````````````````` -However, if you have parentheses within parentheses, you need to escape -or use the `<...>` form: +Pointy brackets that enclose links must be unescaped: +```````````````````````````````` example +[link]() . -[link](foo(and(bar))) -. -

    [link](foo(and(bar)))

    +

    [link](<foo>)

    +```````````````````````````````` + +These are not links, because the opening pointy bracket +is not matched properly: + +```````````````````````````````` example +[a]( +[a](c) . +

    [a](<b)c +[a](<b)c> +[a](c)

    +```````````````````````````````` + +Parentheses inside the link destination may be escaped: +```````````````````````````````` example +[link](\(foo\)) . -[link](foo(and\(bar\))) +

    link

    +```````````````````````````````` + +Any number of parentheses are allowed without escaping, as long as they are +balanced: + +```````````````````````````````` example +[link](foo(and(bar))) .

    link

    -. +```````````````````````````````` +However, if you have unbalanced parentheses, you need to escape or use the +`<...>` form: + +```````````````````````````````` example +[link](foo\(and\(bar\)) . -[link]() -. -

    link

    +

    link

    +```````````````````````````````` + + +```````````````````````````````` example +[link]() . +

    link

    +```````````````````````````````` + Parentheses and other symbols can also be escaped, as usual in Markdown: -. +```````````````````````````````` example [link](foo\)\:) .

    link

    -. +```````````````````````````````` + A link can contain fragment identifiers and queries: -. +```````````````````````````````` example [link](#fragment) [link](http://example.com#fragment) -[link](http://example.com?foo=bar&baz#fragment) +[link](http://example.com?foo=3#frag) .

    link

    link

    -

    link

    -. +

    link

    +```````````````````````````````` + Note that a backslash before a non-escapable character is just a backslash: -. +```````````````````````````````` example [link](foo\bar) .

    link

    -. +```````````````````````````````` -URL-escaping should be left alone inside the destination, as all -URL-escaped characters are also valid URL characters. HTML entities in -the destination will be parsed into the corresponding unicode -codepoints, as usual, and optionally URL-escaped when written as HTML. -. +URL-escaping should be left alone inside the destination, as all +URL-escaped characters are also valid URL characters. Entity and +numerical character references in the destination will be parsed +into the corresponding Unicode code points, as usual. These may +be optionally URL-escaped when written as HTML, but this spec +does not enforce any particular policy for rendering URLs in +HTML or other formats. Renderers may make different decisions +about how to escape or normalize URLs in the output. + +```````````````````````````````` example [link](foo%20bä) .

    link

    -. +```````````````````````````````` + Note that, because titles can often be parsed as destinations, if you try to omit the destination and keep the title, you'll get unexpected results: -. +```````````````````````````````` example [link]("title") .

    link

    -. +```````````````````````````````` + Titles may be in single quotes, double quotes, or parentheses: -. +```````````````````````````````` example [link](/url "title") [link](/url 'title') [link](/url (title)) @@ -6540,188 +8007,224 @@ Titles may be in single quotes, double quotes, or parentheses:

    link link link

    -. +```````````````````````````````` -Backslash escapes and entities may be used in titles: -. +Backslash escapes and entity and numeric character references +may be used in titles: + +```````````````````````````````` example [link](/url "title \""") .

    link

    +```````````````````````````````` + + +Titles must be separated from the link using a [whitespace]. +Other [Unicode whitespace] like non-breaking space doesn't work. + +```````````````````````````````` example +[link](/url "title") . +

    link

    +```````````````````````````````` + Nested balanced quotes are not allowed without escaping: -. +```````````````````````````````` example [link](/url "title "and" title") .

    [link](/url "title "and" title")

    -. +```````````````````````````````` + But it is easy to work around this by using a different quote type: -. +```````````````````````````````` example [link](/url 'title "and" title') .

    link

    -. +```````````````````````````````` + (Note: `Markdown.pl` did allow double quotes inside a double-quoted title, and its test suite included a test demonstrating this. But it is hard to see a good rationale for the extra complexity this brings, since there are already many ways---backslash escaping, -entities, or using a different quote type for the enclosing title---to -write titles containing double quotes. `Markdown.pl`'s handling of -titles has a number of other strange features. For example, it allows -single-quoted titles in inline links, but not reference links. And, in -reference links but not inline links, it allows a title to begin with -`"` and end with `)`. `Markdown.pl` 1.0.1 even allows titles with no closing -quotation mark, though 1.0.2b8 does not. It seems preferable to adopt -a simple, rational rule that works the same way in inline links and -link reference definitions.) +entity and numeric character references, or using a different +quote type for the enclosing title---to write titles containing +double quotes. `Markdown.pl`'s handling of titles has a number +of other strange features. For example, it allows single-quoted +titles in inline links, but not reference links. And, in +reference links but not inline links, it allows a title to begin +with `"` and end with `)`. `Markdown.pl` 1.0.1 even allows +titles with no closing quotation mark, though 1.0.2b8 does not. +It seems preferable to adopt a simple, rational rule that works +the same way in inline links and link reference definitions.) [Whitespace] is allowed around the destination and title: -. +```````````````````````````````` example [link]( /uri "title" ) .

    link

    -. +```````````````````````````````` + But it is not allowed between the link text and the following parenthesis: -. +```````````````````````````````` example [link] (/uri) .

    [link] (/uri)

    -. +```````````````````````````````` + The link text may contain balanced brackets, but not unbalanced ones, unless they are escaped: -. +```````````````````````````````` example [link [foo [bar]]](/uri) .

    link [foo [bar]]

    -. +```````````````````````````````` -. + +```````````````````````````````` example [link] bar](/uri) .

    [link] bar](/uri)

    -. +```````````````````````````````` -. + +```````````````````````````````` example [link [bar](/uri) .

    [link bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example [link \[bar](/uri) .

    link [bar

    -. +```````````````````````````````` + The link text may contain inline content: -. +```````````````````````````````` example [link *foo **bar** `#`*](/uri) .

    link foo bar #

    -. +```````````````````````````````` -. + +```````````````````````````````` example [![moon](moon.jpg)](/uri) .

    moon

    -. +```````````````````````````````` + However, links may not contain other links, at any level of nesting. -. +```````````````````````````````` example [foo [bar](/uri)](/uri) .

    [foo bar](/uri)

    -. +```````````````````````````````` -. + +```````````````````````````````` example [foo *[bar [baz](/uri)](/uri)*](/uri) .

    [foo [bar baz](/uri)](/uri)

    -. +```````````````````````````````` -. + +```````````````````````````````` example ![[[foo](uri1)](uri2)](uri3) .

    [foo](uri2)

    -. +```````````````````````````````` + These cases illustrate the precedence of link text grouping over emphasis grouping: -. +```````````````````````````````` example *[foo*](/uri) .

    *foo*

    -. +```````````````````````````````` -. + +```````````````````````````````` example [foo *bar](baz*) .

    foo *bar

    -. +```````````````````````````````` + Note that brackets that *aren't* part of links do not take precedence: -. +```````````````````````````````` example *foo [bar* baz] .

    foo [bar baz]

    -. +```````````````````````````````` + These cases illustrate the precedence of HTML tags, code spans, and autolinks over link grouping: -. +```````````````````````````````` example [foo .

    [foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example [foo`](/uri)` .

    [foo](/uri)

    -. +```````````````````````````````` -. + +```````````````````````````````` example [foo .

    [foohttp://example.com/?search=](uri)

    -. +```````````````````````````````` -There are three kinds of [reference link](@reference-link)s: + +There are three kinds of [reference link](@)s: [full](#full-reference-link), [collapsed](#collapsed-reference-link), and [shortcut](#shortcut-reference-link). -A [full reference link](@full-reference-link) -consists of a [link text], optional [whitespace], and a [link label] +A [full reference link](@) +consists of a [link text] immediately followed by a [link label] that [matches] a [link reference definition] elsewhere in the document. -A [link label](@link-label) begins with a left bracket (`[`) and ends +A [link label](@) begins with a left bracket (`[`) and ends with the first right bracket (`]`) that is not backslash-escaped. Between these brackets there must be at least one [non-whitespace character]. -Unescaped square bracket characters are not allowed in -[link label]s. A link label can have at most 999 -characters inside the square brackets. +Unescaped square bracket characters are not allowed inside the +opening and closing square brackets of [link labels]. A link +label can have at most 999 characters inside the square +brackets. -One label [matches](@matches) +One label [matches](@) another just in case their normalized forms are equal. To normalize a -label, perform the *unicode case fold* and collapse consecutive internal +label, strip off the opening and closing brackets, +perform the *Unicode case fold*, strip leading and trailing +[whitespace] and collapse consecutive internal [whitespace] to a single space. If there are multiple matching reference link definitions, the one that comes first in the document is used. (It is desirable in such cases to emit a warning.) @@ -6732,176 +8235,222 @@ matching [link reference definition]. Here is a simple example: -. +```````````````````````````````` example [foo][bar] [bar]: /url "title" .

    foo

    -. +```````````````````````````````` + The rules for the [link text] are the same as with -[inline link]s. Thus: +[inline links]. Thus: The link text may contain balanced brackets, but not unbalanced ones, unless they are escaped: -. +```````````````````````````````` example [link [foo [bar]]][ref] [ref]: /uri .

    link [foo [bar]]

    -. +```````````````````````````````` -. + +```````````````````````````````` example [link \[bar][ref] [ref]: /uri .

    link [bar

    -. +```````````````````````````````` + The link text may contain inline content: -. +```````````````````````````````` example [link *foo **bar** `#`*][ref] [ref]: /uri .

    link foo bar #

    -. +```````````````````````````````` -. + +```````````````````````````````` example [![moon](moon.jpg)][ref] [ref]: /uri .

    moon

    -. +```````````````````````````````` + However, links may not contain other links, at any level of nesting. -. +```````````````````````````````` example [foo [bar](/uri)][ref] [ref]: /uri .

    [foo bar]ref

    -. +```````````````````````````````` -. + +```````````````````````````````` example [foo *bar [baz][ref]*][ref] [ref]: /uri .

    [foo bar baz]ref

    -. +```````````````````````````````` + -(In the examples above, we have two [shortcut reference link]s +(In the examples above, we have two [shortcut reference links] instead of one [full reference link].) The following cases illustrate the precedence of link text grouping over emphasis grouping: -. +```````````````````````````````` example *[foo*][ref] [ref]: /uri .

    *foo*

    -. +```````````````````````````````` -. + +```````````````````````````````` example [foo *bar][ref] [ref]: /uri .

    foo *bar

    -. +```````````````````````````````` + These cases illustrate the precedence of HTML tags, code spans, and autolinks over link grouping: -. +```````````````````````````````` example [foo [ref]: /uri .

    [foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example [foo`][ref]` [ref]: /uri .

    [foo][ref]

    -. +```````````````````````````````` -. + +```````````````````````````````` example [foo [ref]: /uri .

    [foohttp://example.com/?search=][ref]

    -. +```````````````````````````````` + Matching is case-insensitive: -. +```````````````````````````````` example [foo][BaR] [bar]: /url "title" .

    foo

    -. +```````````````````````````````` + Unicode case fold is used: -. +```````````````````````````````` example [Толпой][Толпой] is a Russian word. [ТОЛПОЙ]: /url .

    Толпой is a Russian word.

    -. +```````````````````````````````` + Consecutive internal [whitespace] is treated as one space for purposes of determining matching: -. +```````````````````````````````` example [Foo bar]: /url [Baz][Foo bar] .

    Baz

    -. +```````````````````````````````` -There can be [whitespace] between the [link text] and the [link label]: -. +No [whitespace] is allowed between the [link text] and the +[link label]: + +```````````````````````````````` example [foo] [bar] [bar]: /url "title" . -

    foo

    -. +

    [foo] bar

    +```````````````````````````````` -. + +```````````````````````````````` example [foo] [bar] [bar]: /url "title" . -

    foo

    -. +

    [foo] +bar

    +```````````````````````````````` + + +This is a departure from John Gruber's original Markdown syntax +description, which explicitly allows whitespace between the link +text and the link label. It brings reference links in line with +[inline links], which (according to both original Markdown and +this spec) cannot have whitespace after the link text. More +importantly, it prevents inadvertent capture of consecutive +[shortcut reference links]. If whitespace is allowed between the +link text and the link label, then in the following we will have +a single reference link, not two shortcut reference links, as +intended: + +``` markdown +[foo] +[bar] + +[foo]: /url1 +[bar]: /url2 +``` + +(Note that [shortcut reference links] were introduced by Gruber +himself in a beta version of `Markdown.pl`, but never included +in the official syntax description. Without shortcut reference +links, it is harmless to allow space between the link text and +link label; but once shortcut references are introduced, it is +too dangerous to allow this, as it frequently leads to +unintended results.) -When there are multiple matching [link reference definition]s, +When there are multiple matching [link reference definitions], the first is used: -. +```````````````````````````````` example [foo]: /url1 [foo]: /url2 @@ -6909,70 +8458,88 @@ the first is used: [bar][foo] .

    bar

    -. +```````````````````````````````` + Note that matching is performed on normalized strings, not parsed inline content. So the following does not match, even though the labels define equivalent inline content: -. +```````````````````````````````` example [bar][foo\!] [foo!]: /url .

    [bar][foo!]

    -. +```````````````````````````````` + -[Link label]s cannot contain brackets, unless they are +[Link labels] cannot contain brackets, unless they are backslash-escaped: -. +```````````````````````````````` example [foo][ref[] [ref[]: /uri .

    [foo][ref[]

    [ref[]: /uri

    -. +```````````````````````````````` -. + +```````````````````````````````` example [foo][ref[bar]] [ref[bar]]: /uri .

    [foo][ref[bar]]

    [ref[bar]]: /uri

    -. +```````````````````````````````` -. + +```````````````````````````````` example [[[foo]]] [[[foo]]]: /url .

    [[[foo]]]

    [[[foo]]]: /url

    -. +```````````````````````````````` -. + +```````````````````````````````` example [foo][ref\[] [ref\[]: /uri .

    foo

    +```````````````````````````````` + + +Note that in this example `]` is not backslash-escaped: + +```````````````````````````````` example +[bar\\]: /uri + +[bar\\] . +

    bar\

    +```````````````````````````````` + A [link label] must contain at least one [non-whitespace character]: -. +```````````````````````````````` example [] []: /uri .

    []

    []: /uri

    -. +```````````````````````````````` -. + +```````````````````````````````` example [ ] @@ -6983,191 +8550,235 @@ A [link label] must contain at least one [non-whitespace character]: ]

    [ ]: /uri

    -. +```````````````````````````````` -A [collapsed reference link](@collapsed-reference-link) + +A [collapsed reference link](@) consists of a [link label] that [matches] a [link reference definition] elsewhere in the -document, optional [whitespace], and the string `[]`. +document, followed by the string `[]`. The contents of the first link label are parsed as inlines, which are used as the link's text. The link's URI and title are provided by the matching reference link definition. Thus, `[foo][]` is equivalent to `[foo][foo]`. -. +```````````````````````````````` example [foo][] [foo]: /url "title" .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example [*foo* bar][] [*foo* bar]: /url "title" .

    foo bar

    -. +```````````````````````````````` + The link labels are case-insensitive: -. +```````````````````````````````` example [Foo][] [foo]: /url "title" .

    Foo

    -. +```````````````````````````````` -As with full reference links, [whitespace] is allowed -between the two sets of brackets: -. +As with full reference links, [whitespace] is not +allowed between the two sets of brackets: + +```````````````````````````````` example [foo] [] [foo]: /url "title" . -

    foo

    -. +

    foo +[]

    +```````````````````````````````` -A [shortcut reference link](@shortcut-reference-link) + +A [shortcut reference link](@) consists of a [link label] that [matches] a [link reference definition] elsewhere in the document and is not followed by `[]` or a link label. The contents of the first link label are parsed as inlines, -which are used as the link's text. the link's URI and title +which are used as the link's text. The link's URI and title are provided by the matching link reference definition. Thus, `[foo]` is equivalent to `[foo][]`. -. +```````````````````````````````` example [foo] [foo]: /url "title" .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example [*foo* bar] [*foo* bar]: /url "title" .

    foo bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example [[*foo* bar]] [*foo* bar]: /url "title" .

    [foo bar]

    -. +```````````````````````````````` -. + +```````````````````````````````` example [[bar [foo] [foo]: /url .

    [[bar foo

    -. +```````````````````````````````` + The link labels are case-insensitive: -. +```````````````````````````````` example [Foo] [foo]: /url "title" .

    Foo

    -. +```````````````````````````````` + A space after the link text should be preserved: -. +```````````````````````````````` example [foo] bar [foo]: /url .

    foo bar

    -. +```````````````````````````````` + If you just want bracketed text, you can backslash-escape the opening bracket to avoid links: -. +```````````````````````````````` example \[foo] [foo]: /url "title" .

    [foo]

    -. +```````````````````````````````` + Note that this is a link, because a link label ends with the first following closing bracket: -. +```````````````````````````````` example [foo*]: /url *[foo*] .

    *foo*

    -. +```````````````````````````````` -Full references take precedence over shortcut references: -. +Full and collapsed references take precedence over shortcut +references: + +```````````````````````````````` example [foo][bar] [foo]: /url1 [bar]: /url2 .

    foo

    +```````````````````````````````` + +```````````````````````````````` example +[foo][] + +[foo]: /url1 +. +

    foo

    +```````````````````````````````` + +Inline links also take precedence: + +```````````````````````````````` example +[foo]() + +[foo]: /url1 +. +

    foo

    +```````````````````````````````` + +```````````````````````````````` example +[foo](not a link) + +[foo]: /url1 . +

    foo(not a link)

    +```````````````````````````````` In the following case `[bar][baz]` is parsed as a reference, `[foo]` as normal text: -. +```````````````````````````````` example [foo][bar][baz] [baz]: /url .

    [foo]bar

    -. +```````````````````````````````` + Here, though, `[foo][bar]` is parsed as a reference, since `[bar]` is defined: -. +```````````````````````````````` example [foo][bar][baz] [baz]: /url1 [bar]: /url2 .

    foobaz

    -. +```````````````````````````````` + Here `[foo]` is not parsed as a shortcut reference, because it is followed by a link label (even though `[bar]` is not defined): -. +```````````````````````````````` example [foo][bar][baz] [baz]: /url1 [foo]: /url2 .

    [foo]bar

    -. +```````````````````````````````` + ## Images Syntax for images is like the syntax for links, with one difference. Instead of [link text], we have an -[image description](@image-description). The rules for this are the +[image description](@). The rules for this are the same as for [link text], except that (a) an image description starts with `![` rather than `[`, and (b) an image description may contain links. @@ -7175,31 +8786,35 @@ An image description has inline elements as its contents. When an image is rendered to HTML, this is standardly used as the image's `alt` attribute. -. +```````````````````````````````` example ![foo](/url "title") .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example ![foo *bar*] [foo *bar*]: train.jpg "train & tracks" .

    foo bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example ![foo ![bar](/url)](/url2) .

    foo bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example ![foo [bar](/url)](/url2) .

    foo bar

    -. +```````````````````````````````` + Though this spec is concerned with parsing, not rendering, it is recommended that in rendering to HTML, only the plain string content @@ -7208,258 +8823,295 @@ the above example, the alt attribute's value is `foo bar`, not `foo [bar](/url)` or `foo bar`. Only the plain string content is rendered, without formatting. -. +```````````````````````````````` example ![foo *bar*][] [foo *bar*]: train.jpg "train & tracks" .

    foo bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example ![foo *bar*][foobar] [FOOBAR]: train.jpg "train & tracks" .

    foo bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example ![foo](train.jpg) .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example My ![foo bar](/path/to/train.jpg "title" ) .

    My foo bar

    -. +```````````````````````````````` -. + +```````````````````````````````` example ![foo]() .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example ![](/url) .

    -. +```````````````````````````````` + Reference-style: -. -![foo] [bar] +```````````````````````````````` example +![foo][bar] [bar]: /url .

    foo

    -. +```````````````````````````````` -. -![foo] [bar] + +```````````````````````````````` example +![foo][bar] [BAR]: /url .

    foo

    -. +```````````````````````````````` + Collapsed: -. +```````````````````````````````` example ![foo][] [foo]: /url "title" .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example ![*foo* bar][] [*foo* bar]: /url "title" .

    foo bar

    -. +```````````````````````````````` + The labels are case-insensitive: -. +```````````````````````````````` example ![Foo][] [foo]: /url "title" .

    Foo

    -. +```````````````````````````````` -As with full reference links, [whitespace] is allowed + +As with reference links, [whitespace] is not allowed between the two sets of brackets: -. +```````````````````````````````` example ![foo] [] [foo]: /url "title" . -

    foo

    -. +

    foo +[]

    +```````````````````````````````` + Shortcut: -. +```````````````````````````````` example ![foo] [foo]: /url "title" .

    foo

    -. +```````````````````````````````` -. + +```````````````````````````````` example ![*foo* bar] [*foo* bar]: /url "title" .

    foo bar

    -. +```````````````````````````````` + Note that link labels cannot contain unescaped brackets: -. +```````````````````````````````` example ![[foo]] [[foo]]: /url "title" .

    ![[foo]]

    [[foo]]: /url "title"

    -. +```````````````````````````````` + The link labels are case-insensitive: -. +```````````````````````````````` example ![Foo] [foo]: /url "title" .

    Foo

    -. +```````````````````````````````` -If you just want bracketed text, you can backslash-escape the -opening `!` and `[`: -. -\!\[foo] +If you just want a literal `!` followed by bracketed text, you can +backslash-escape the opening `[`: + +```````````````````````````````` example +!\[foo] [foo]: /url "title" .

    ![foo]

    -. +```````````````````````````````` + If you want a link after a literal `!`, backslash-escape the `!`: -. +```````````````````````````````` example \![foo] [foo]: /url "title" .

    !foo

    -. +```````````````````````````````` + ## Autolinks -[Autolink](@autolink)s are absolute URIs and email addresses inside +[Autolink](@)s are absolute URIs and email addresses inside `<` and `>`. They are parsed as links, with the URL or email address as the link label. -A [URI autolink](@uri-autolink) consists of `<`, followed by an -[absolute URI] not containing `<`, followed by `>`. It is parsed as +A [URI autolink](@) consists of `<`, followed by an +[absolute URI] followed by `>`. It is parsed as a link to the URI, with the URI as the link's label. -An [absolute URI](@absolute-uri), +An [absolute URI](@), for these purposes, consists of a [scheme] followed by a colon (`:`) followed by zero or more characters other than ASCII [whitespace] and control characters, `<`, and `>`. If -the URI includes these characters, you must use percent-encoding +the URI includes these characters, they must be percent-encoded (e.g. `%20` for a space). -The following [schemes](@scheme) -are recognized (case-insensitive): -`coap`, `doi`, `javascript`, `aaa`, `aaas`, `about`, `acap`, `cap`, -`cid`, `crid`, `data`, `dav`, `dict`, `dns`, `file`, `ftp`, `geo`, `go`, -`gopher`, `h323`, `http`, `https`, `iax`, `icap`, `im`, `imap`, `info`, -`ipp`, `iris`, `iris.beep`, `iris.xpc`, `iris.xpcs`, `iris.lwz`, `ldap`, -`mailto`, `mid`, `msrp`, `msrps`, `mtqp`, `mupdate`, `news`, `nfs`, -`ni`, `nih`, `nntp`, `opaquelocktoken`, `pop`, `pres`, `rtsp`, -`service`, `session`, `shttp`, `sieve`, `sip`, `sips`, `sms`, `snmp`,` -soap.beep`, `soap.beeps`, `tag`, `tel`, `telnet`, `tftp`, `thismessage`, -`tn3270`, `tip`, `tv`, `urn`, `vemmi`, `ws`, `wss`, `xcon`, -`xcon-userid`, `xmlrpc.beep`, `xmlrpc.beeps`, `xmpp`, `z39.50r`, -`z39.50s`, `adiumxtra`, `afp`, `afs`, `aim`, `apt`,` attachment`, `aw`, -`beshare`, `bitcoin`, `bolo`, `callto`, `chrome`,` chrome-extension`, -`com-eventbrite-attendee`, `content`, `cvs`,` dlna-playsingle`, -`dlna-playcontainer`, `dtn`, `dvb`, `ed2k`, `facetime`, `feed`, 
-`finger`, `fish`, `gg`, `git`, `gizmoproject`, `gtalk`, `hcp`, `icon`, -`ipn`, `irc`, `irc6`, `ircs`, `itms`, `jar`, `jms`, `keyparc`, `lastfm`, -`ldaps`, `magnet`, `maps`, `market`,` message`, `mms`, `ms-help`, -`msnim`, `mumble`, `mvn`, `notes`, `oid`, `palm`, `paparazzi`, -`platform`, `proxy`, `psyc`, `query`, `res`, `resource`, `rmi`, `rsync`, -`rtmp`, `secondlife`, `sftp`, `sgn`, `skype`, `smb`, `soldat`, -`spotify`, `ssh`, `steam`, `svn`, `teamspeak`, `things`, `udp`, -`unreal`, `ut2004`, `ventrilo`, `view-source`, `webcal`, `wtai`, -`wyciwyg`, `xfire`, `xri`, `ymsgr`. +For purposes of this spec, a [scheme](@) is any sequence +of 2--32 characters beginning with an ASCII letter and followed +by any combination of ASCII letters, digits, or the symbols plus +("+"), period ("."), or hyphen ("-"). Here are some valid autolinks: -. +```````````````````````````````` example .

    http://foo.bar.baz

    -. +```````````````````````````````` -. + +```````````````````````````````` example .

    http://foo.bar.baz/test?q=hello&id=22&boolean

    -. +```````````````````````````````` -. + +```````````````````````````````` example .

    irc://foo.bar:2233/baz

    -. +```````````````````````````````` + Uppercase is also fine: -. +```````````````````````````````` example .

    MAILTO:FOO@BAR.BAZ

    +```````````````````````````````` + + +Note that many strings that count as [absolute URIs] for +purposes of this spec are not valid URIs, because their +schemes are not registered or because of other problems +with their syntax: + +```````````````````````````````` example + . +

    a+b+c:d

    +```````````````````````````````` -Spaces are not allowed in autolinks: +```````````````````````````````` example + . +

    made-up-scheme://foo,bar

    +```````````````````````````````` + + +```````````````````````````````` example + +. +

    http://../

    +```````````````````````````````` + + +```````````````````````````````` example + +. +

    localhost:5001/foo

    +```````````````````````````````` + + +Spaces are not allowed in autolinks: + +```````````````````````````````` example .

    <http://foo.bar/baz bim>

    -. +```````````````````````````````` + Backslash-escapes do not work inside autolinks: -. +```````````````````````````````` example .

    http://example.com/\[\

    -. +```````````````````````````````` + -An [email autolink](@email-autolink) +An [email autolink](@) consists of `<`, followed by an [email address], followed by `>`. The link's label is the email address, and the URL is `mailto:` followed by the email address. -An [email address](@email-address), +An [email address](@), for these purposes, is anything that matches the [non-normative regex from the HTML5 spec](https://html.spec.whatwg.org/multipage/forms.html#e-mail-state-(type=email)): @@ -7469,69 +9121,241 @@ spec](https://html.spec.whatwg.org/multipage/forms.html#e-mail-state-(type=email Examples of email autolinks: -. +```````````````````````````````` example .

    foo@bar.example.com

    -. +```````````````````````````````` -. + +```````````````````````````````` example .

    foo+special@Bar.baz-bar0.com

    -. +```````````````````````````````` + Backslash-escapes do not work inside email autolinks: -. +```````````````````````````````` example .

    <foo+@bar.example.com>

    -. +```````````````````````````````` + These are not autolinks: -. +```````````````````````````````` example <> .

    <>

    -. +```````````````````````````````` -. - -. -

    <heck://bing.bong>

    -. -. +```````````````````````````````` example < http://foo.bar > .

    < http://foo.bar >

    -. +```````````````````````````````` + +```````````````````````````````` example + . +

    <m:abc>

    +```````````````````````````````` + + +```````````````````````````````` example .

    <foo.bar.baz>

    +```````````````````````````````` + + +```````````````````````````````` example +http://example.com +. +

    http://example.com

    +```````````````````````````````` + + +```````````````````````````````` example +foo@bar.example.com . +

    foo@bar.example.com

    +```````````````````````````````` + +
    + +## Autolinks (extension) + +GFM enables the `autolink` extension, where autolinks will be recognised in a +greater number of conditions. + +[Autolink]s can also be constructed without requiring the use of `<` and `>` +to delimit them, although they will be recognized under a smaller set of +circumstances. All such recognized autolinks can only come at the beginning of +a line, after whitespace, or any of the delimiting characters `*`, `_`, `~`, +and `(`. +An [extended www autolink](@) will be recognized +when the text `www.` is found followed by a [valid domain]. +A [valid domain](@) consists of segments +of alphanumeric characters, underscores (`_`) and hyphens (`-`) +separated by periods (`.`). +There must be at least one period, +and no underscores may be present in the last two segments of the domain. + +The scheme `http` will be inserted automatically: + +```````````````````````````````` example autolink +www.commonmark.org . - +

    www.commonmark.org

    +```````````````````````````````` + +After a [valid domain], zero or more non-space non-`<` characters may follow: + +```````````````````````````````` example autolink +Visit www.commonmark.org/help for more information. . -

    <localhost:5001/foo>

    +

    Visit www.commonmark.org/help for more information.

    +```````````````````````````````` + +We then apply [extended autolink path validation](@) as follows: + +Trailing punctuation (specifically, `?`, `!`, `.`, `,`, `:`, `*`, `_`, and `~`) +will not be considered part of the autolink, though they may be included in the +interior of the link: + +```````````````````````````````` example autolink +Visit www.commonmark.org. + +Visit www.commonmark.org/a.b. . +

    Visit www.commonmark.org.

    +

    Visit www.commonmark.org/a.b.

    +```````````````````````````````` + +When an autolink ends in `)`, we scan the entire autolink for the total number +of parentheses. If there is a greater number of closing parentheses than +opening ones, we don't consider the unmatched trailing parentheses part of the +autolink, in order to facilitate including an autolink inside a parenthesis: + +```````````````````````````````` example autolink +www.google.com/search?q=Markup+(business) + +www.google.com/search?q=Markup+(business))) +(www.google.com/search?q=Markup+(business)) + +(www.google.com/search?q=Markup+(business) . -http://example.com +

    www.google.com/search?q=Markup+(business)

    +

    www.google.com/search?q=Markup+(business)))

    +

    (www.google.com/search?q=Markup+(business))

    +

    (www.google.com/search?q=Markup+(business)

    +```````````````````````````````` + +This check is only done when the link ends in a closing parenthesis `)`, so if +the only parentheses are in the interior of the autolink, no special rules are +applied: + +```````````````````````````````` example autolink +www.google.com/search?q=(business))+ok . -

    http://example.com

    +

    www.google.com/search?q=(business))+ok

    +```````````````````````````````` + +If an autolink ends in a semicolon (`;`), we check to see if it appears to +resemble an [entity reference][entity references]; if the preceding text is `&` +followed by one or more alphanumeric characters. If so, it is excluded from +the autolink: + +```````````````````````````````` example autolink +www.google.com/search?q=commonmark&hl=en + +www.google.com/search?q=commonmark&hl; . +

    www.google.com/search?q=commonmark&hl=en

    +

    www.google.com/search?q=commonmark&hl;

    +```````````````````````````````` +`<` immediately ends an autolink. + +```````````````````````````````` example autolink +www.commonmark.org/hewww.commonmark.org/he<lp

    +```````````````````````````````` + +An [extended url autolink](@) will be recognised when one of the schemes +`http://`, `https://`, or `ftp://` is found, followed by a [valid domain], then zero or +more non-space non-`<` characters according to +[extended autolink path validation]: + +```````````````````````````````` example autolink +http://commonmark.org + +(Visit https://encrypted.google.com/search?q=Markup+(business)) + +Anonymous FTP is available at ftp://foo.bar.baz. . -

    foo@bar.example.com

    +

    http://commonmark.org

    +

    (Visit https://encrypted.google.com/search?q=Markup+(business))

    +

    Anonymous FTP is available at ftp://foo.bar.baz.

    +```````````````````````````````` + + +An [extended email autolink](@) will be recognised when an email address is +recognised within any text node. Email addresses are recognised according to +the following rules: + +* One or more characters which are alphanumeric, or `.`, `-`, `_`, or `+`. +* An `@` symbol. +* One or more characters which are alphanumeric, or `-` or `_`, + separated by periods (`.`). + There must be at least one period. + The last character must not be one of `-` or `_`. + +The scheme `mailto:` will automatically be added to the generated link: + +```````````````````````````````` example autolink +foo@bar.baz +. +

    foo@bar.baz

    +```````````````````````````````` + +`+` can occur before the `@`, but not after. + +```````````````````````````````` example autolink +hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is. +. +

    hello@mail+xyz.example isn't valid, but hello+xyz@mail.example is.

    +```````````````````````````````` + +`.`, `-`, and `_` can occur on both sides of the `@`, but only `.` may occur at +the end of the email address, in which case it will not be considered part of +the address: + +```````````````````````````````` example autolink +a.b-c_d@a.b + +a.b-c_d@a.b. + +a.b-c_d@a.b- + +a.b-c_d@a.b_ . +

    a.b-c_d@a.b

    +

    a.b-c_d@a.b.

    +

    a.b-c_d@a.b-

    +

    a.b-c_d@a.b_

    +```````````````````````````````` + +
    ## Raw HTML @@ -7542,415 +9366,477 @@ so custom tags (and even, say, DocBook tags) may be used. Here is the grammar for tags: -A [tag name](@tag-name) consists of an ASCII letter +A [tag name](@) consists of an ASCII letter followed by zero or more ASCII letters, digits, or hyphens (`-`). -An [attribute](@attribute) consists of [whitespace], +An [attribute](@) consists of [whitespace], an [attribute name], and an optional [attribute value specification]. -An [attribute name](@attribute-name) +An [attribute name](@) consists of an ASCII letter, `_`, or `:`, followed by zero or more ASCII letters, digits, `_`, `.`, `:`, or `-`. (Note: This is the XML specification restricted to ASCII. HTML5 is laxer.) -An [attribute value specification](@attribute-value-specification) +An [attribute value specification](@) consists of optional [whitespace], a `=` character, optional [whitespace], and an [attribute value]. -An [attribute value](@attribute-value) +An [attribute value](@) consists of an [unquoted attribute value], a [single-quoted attribute value], or a [double-quoted attribute value]. -An [unquoted attribute value](@unquoted-attribute-value) +An [unquoted attribute value](@) is a nonempty string of characters not -including spaces, `"`, `'`, `=`, `<`, `>`, or `` ` ``. +including [whitespace], `"`, `'`, `=`, `<`, `>`, or `` ` ``. -A [single-quoted attribute value](@single-quoted-attribute-value) +A [single-quoted attribute value](@) consists of `'`, zero or more characters not including `'`, and a final `'`. -A [double-quoted attribute value](@double-quoted-attribute-value) +A [double-quoted attribute value](@) consists of `"`, zero or more characters not including `"`, and a final `"`. 
-An [open tag](@open-tag) consists of a `<` character, a [tag name], -zero or more [attributes](@attribute], optional [whitespace], an optional `/` +An [open tag](@) consists of a `<` character, a [tag name], +zero or more [attributes], optional [whitespace], an optional `/` character, and a `>` character. -A [closing tag](@closing-tag) consists of the string ``. -An [HTML comment](@html-comment) consists of ``, -where *text* does not start with `>` or `->`, does not end with `-`, -and does not contain `--`. (See the -[HTML5 spec](http://www.w3.org/TR/html5/syntax.html#comments).) +An [HTML comment](@) consists of ``, ``, or ``, and `-->` (see the +[HTML spec](https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state)). -A [processing instruction](@processing-instruction) +A [processing instruction](@) consists of the string ``, and the string `?>`. -A [declaration](@declaration) consists of the +A [declaration](@) consists of the string ``, and the character `>`. -A [CDATA section](@cdata-section) consists of +A [CDATA section](@) consists of the string ``, and the string `]]>`. -An [HTML tag](@html-tag) consists of an [open tag], a [closing tag], +An [HTML tag](@) consists of an [open tag], a [closing tag], an [HTML comment], a [processing instruction], a [declaration], or a [CDATA section]. Here are some simple open tags: -. +```````````````````````````````` example .

    -. +```````````````````````````````` + Empty elements: -. +```````````````````````````````` example .

    -. +```````````````````````````````` + [Whitespace] is allowed: -. +```````````````````````````````` example .

    -. +```````````````````````````````` + With attributes: -. +```````````````````````````````` example .

    -. +```````````````````````````````` + Custom tag names can be used: +```````````````````````````````` example +Foo . - +

    Foo

    +```````````````````````````````` - -foo - -. - - -foo - -. Illegal tag names, not parsed as HTML: -. +```````````````````````````````` example <33> <__> .

    <33> <__>

    -. +```````````````````````````````` + Illegal attribute names: -. +```````````````````````````````` example
    .

    <a h*#ref="hi">

    -. +```````````````````````````````` + Illegal attribute values: -. +```````````````````````````````` example
    .

    </a href="foo">

    -. +```````````````````````````````` -Comments: -. -foo -. -

    foo

    -. +Comments: +```````````````````````````````` example +foo . -foo -. -

    foo <!-- not a comment -- two hyphens -->

    -. +

    foo

    +```````````````````````````````` -Not comments: - -. +```````````````````````````````` example foo foo --> -foo -. -

    foo <!--> foo -->

    -

    foo <!-- foo--->

    +foo foo --> . +

    foo foo -->

    +

    foo foo -->

    +```````````````````````````````` + Processing instructions: -. +```````````````````````````````` example foo .

    foo

    -. +```````````````````````````````` + Declarations: -. +```````````````````````````````` example foo .

    foo

    -. +```````````````````````````````` + CDATA sections: -. +```````````````````````````````` example foo &<]]> .

    foo &<]]>

    -. +```````````````````````````````` -Entities are preserved in HTML attributes: +Entity and numeric character references are preserved in HTML +attributes: + +```````````````````````````````` example +foo
    . - -. - -. +

    foo

    +```````````````````````````````` + Backslash escapes do not work in HTML attributes: +```````````````````````````````` example +foo . - -. - -. +

    foo

    +```````````````````````````````` -. + +```````````````````````````````` example .

    <a href=""">

    +```````````````````````````````` + + +
    + +## Disallowed Raw HTML (extension) + +GFM enables the `tagfilter` extension, where the following HTML tags will be +filtered when rendering HTML output: + +* `` +* `<textarea>` +* `<style>` +* `<xmp>` +* `<iframe>` +* `<noembed>` +* `<noframes>` +* `<script>` +* `<plaintext>` + +Filtering is done by replacing the leading `<` with the entity `<`. These +tags are chosen in particular as they change how HTML is interpreted in a way +unique to them (i.e. nested HTML is interpreted differently), and this is +usually undesireable in the context of other rendered Markdown content. + +All other HTML tags are left untouched. + +```````````````````````````````` example tagfilter +<strong> <title> <style> <em> + +<blockquote> + <xmp> is disallowed. <XMP> is also disallowed. +</blockquote> . +<p><strong> <title> <style> <em></p> +<blockquote> + <xmp> is disallowed. <XMP> is also disallowed. +</blockquote> +```````````````````````````````` + +</div> ## Hard line breaks A line break (not in a code span or HTML tag) that is preceded by two or more spaces and does not occur at the end of a block -is parsed as a [hard line break](@hard-line-break) (rendered +is parsed as a [hard line break](@) (rendered in HTML as a `<br />` tag): -. +```````````````````````````````` example foo baz . <p>foo<br /> baz</p> -. +```````````````````````````````` + For a more visible alternative, a backslash before the [line ending] may be used instead of two spaces: -. +```````````````````````````````` example foo\ baz . <p>foo<br /> baz</p> -. +```````````````````````````````` + More than two spaces can be used: -. +```````````````````````````````` example foo baz . <p>foo<br /> baz</p> -. +```````````````````````````````` + Leading spaces at the beginning of the next line are ignored: -. +```````````````````````````````` example foo bar . <p>foo<br /> bar</p> -. +```````````````````````````````` -. + +```````````````````````````````` example foo\ bar . <p>foo<br /> bar</p> -. 
+```````````````````````````````` + Line breaks can occur inside emphasis, links, and other constructs that allow inline content: -. +```````````````````````````````` example *foo bar* . <p><em>foo<br /> bar</em></p> -. +```````````````````````````````` -. + +```````````````````````````````` example *foo\ bar* . <p><em>foo<br /> bar</em></p> -. +```````````````````````````````` + Line breaks do not occur inside code spans -. +```````````````````````````````` example `code span` . -<p><code>code span</code></p> -. +<p><code>code span</code></p> +```````````````````````````````` -. + +```````````````````````````````` example `code\ span` . <p><code>code\ span</code></p> -. +```````````````````````````````` + or HTML tags: -. +```````````````````````````````` example <a href="foo bar"> . <p><a href="foo bar"></p> -. +```````````````````````````````` -. + +```````````````````````````````` example <a href="foo\ bar"> . <p><a href="foo\ bar"></p> -. +```````````````````````````````` + Hard line breaks are for separating inline content within a block. Neither syntax for hard line breaks works at the end of a paragraph or other block element: -. +```````````````````````````````` example foo\ . <p>foo\</p> -. +```````````````````````````````` -. + +```````````````````````````````` example foo . <p>foo</p> -. +```````````````````````````````` -. + +```````````````````````````````` example ### foo\ . <h3>foo\</h3> -. +```````````````````````````````` -. + +```````````````````````````````` example ### foo . <h3>foo</h3> -. +```````````````````````````````` + ## Soft line breaks A regular line break (not in a code span or HTML tag) that is not preceded by two or more spaces or a backslash is parsed as a -softbreak. (A softbreak may be rendered in HTML either as a +[softbreak](@). (A softbreak may be rendered in HTML either as a [line ending] or as a space. The result will be the same in browsers. In the examples here, a [line ending] will be used.) -. 
+```````````````````````````````` example foo baz . <p>foo baz</p> -. +```````````````````````````````` + Spaces at the end of the line and beginning of the next line are removed: -. +```````````````````````````````` example foo baz . <p>foo baz</p> -. +```````````````````````````````` + A conforming parser may render a soft line break in HTML either as a line break or as a space. @@ -7963,34 +9849,37 @@ as hard line breaks. Any characters not given an interpretation by the above rules will be parsed as plain textual content. -. +```````````````````````````````` example hello $.;'there . <p>hello $.;'there</p> -. +```````````````````````````````` -. + +```````````````````````````````` example Foo χρῆν . <p>Foo χρῆν</p> -. +```````````````````````````````` + Internal spaces are preserved verbatim: -. +```````````````````````````````` example Multiple spaces . <p>Multiple spaces</p> -. +```````````````````````````````` + <!-- END TESTS --> -# Appendix: A parsing strategy {-} +# Appendix: A parsing strategy In this appendix we describe some features of the parsing strategy used in the CommonMark reference implementations. -## Overview {-} +## Overview Parsing has two phases: @@ -8000,7 +9889,7 @@ list items, and so on---is constructed. Text is assigned to these blocks but not parsed. Link reference definitions are parsed and a map of links is constructed. -2. In the second phase, the raw text contents of paragraphs and headers +2. In the second phase, the raw text contents of paragraphs and headings are parsed into sequences of Markdown inline elements (strings, code spans, links, emphasis, and so on), using the map of link references constructed in phase 1. @@ -8028,7 +9917,7 @@ marked by arrows: "aliquando id" ``` -## Phase 1: block structure {-} +## Phase 1: block structure Each line that is processed has an effect on this tree. The line is analyzed and, depending on its contents, the document may be altered @@ -8055,7 +9944,7 @@ blocks. 
But we cannot close unmatched blocks yet, because we may have a [lazy continuation line]. 2. Next, after consuming the continuation markers for existing -blocks, we look for new block starts (e.g. `>` for a block quote. +blocks, we look for new block starts (e.g. `>` for a block quote). If we encounter a new block start, we close any blocks unmatched in step 1 before creating the new block as a child of the last matched block. @@ -8063,10 +9952,10 @@ matched block. 3. Finally, we look at the remainder of the line (after block markers like `>`, list markers, and indentation have been consumed). This is text that can be incorporated into the last open -block (a paragraph, code block, header, or raw HTML). +block (a paragraph, code block, heading, or raw HTML). -Setext headers are formed when we detect that the second line of -a paragraph is a setext header line. +Setext headings are formed when we see a line of a paragraph +that is a [setext heading underline]. Reference link definitions are detected when a paragraph is closed; the accumulated text lines are parsed to see if they begin with @@ -8170,12 +10059,12 @@ We thus obtain the final tree: "aliquando id" ``` -## Phase 2: inline structure {-} +## Phase 2: inline structure Once all of the input has been parsed, all open blocks are closed. We then "walk the tree," visiting every node, and parse raw -string contents of paragraphs and headers as inlines. At this +string contents of paragraphs and headings as inlines. At this point we have seen all the link reference definitions, so we can resolve reference links as we go. @@ -8201,7 +10090,7 @@ Notice how the [line ending] in the first paragraph has been parsed as a `softbreak`, and the asterisks in the first list item have become an `emph`. -### An algorithm for parsing nested emphasis and links {-} +### An algorithm for parsing nested emphasis and links By far the trickiest part of inline parsing is handling emphasis, strong emphasis, links, and images. 
This is done using the following @@ -8213,7 +10102,7 @@ When we're parsing inlines and we hit either - a `[` or `![` we insert a text node with these symbols as its literal content, and we -add a pointer to this text node to the [delimiter stack](@delimiter-stack). +add a pointer to this text node to the [delimiter stack](@). The [delimiter stack] is a doubly linked list. Each element contains a pointer to a text node, plus information about @@ -8231,7 +10120,7 @@ procedure (see below). When we hit the end of the input, we call the *process emphasis* procedure (see below), with `stack_bottom` = NULL. -#### *look for link or image* {-} +#### *look for link or image* Starting at the top of the delimiter stack, we look backwards through the stack for an opening `[` or `![` delimiter. @@ -8262,7 +10151,7 @@ through the stack for an opening `[` or `![` delimiter. `[` delimiters before the opening delimiter to *inactive*. (This will prevent us from getting links within links.) -#### *process emphasis* {-} +#### *process emphasis* Parameter `stack_bottom` sets a lower bound to how far we descend in the [delimiter stack]. If it is NULL, we can @@ -8274,7 +10163,8 @@ just above `stack_bottom` (or the first element if `stack_bottom` is NULL). We keep track of the `openers_bottom` for each delimiter -type (`*`, `_`). Initialize this to `stack_bottom`. +type (`*`, `_`) and each length of the closing delimiter run +(modulo 3). Initialize this to `stack_bottom`. Then we repeat the following until we run out of potential closers: @@ -8306,7 +10196,7 @@ closers: of the delimiter stack. If the closing node is removed, reset `current_position` to the next element in the stack. -- If none in found: +- If none is found: + Set `openers_bottom` to the element before `current_position`. (We know that there are no openers for this kind of closer up to and @@ -8320,4 +10210,3 @@ closers: After we're done, we remove all delimiters above `stack_bottom` from the delimiter stack. 
- diff --git a/commonmark-test-util/src/main/resources/spec.txt b/commonmark-test-util/src/main/resources/spec.txt new file mode 100644 index 000000000..f1fab281e --- /dev/null +++ b/commonmark-test-util/src/main/resources/spec.txt @@ -0,0 +1,9756 @@ +--- +title: CommonMark Spec +author: John MacFarlane +version: '0.31.2' +date: '2024-01-28' +license: '[CC-BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/)' +... + +# Introduction + +## What is Markdown? + +Markdown is a plain text format for writing structured documents, +based on conventions for indicating formatting in email +and usenet posts. It was developed by John Gruber (with +help from Aaron Swartz) and released in 2004 in the form of a +[syntax description](https://daringfireball.net/projects/markdown/syntax) +and a Perl script (`Markdown.pl`) for converting Markdown to +HTML. In the next decade, dozens of implementations were +developed in many languages. Some extended the original +Markdown syntax with conventions for footnotes, tables, and +other document elements. Some allowed Markdown documents to be +rendered in formats other than HTML. Websites like Reddit, +StackOverflow, and GitHub had millions of people using Markdown. +And Markdown started to be used beyond the web, to author books, +articles, slide shows, letters, and lecture notes. + +What distinguishes Markdown from many other lightweight markup +syntaxes, which are often easier to write, is its readability. +As Gruber writes: + +> The overriding design goal for Markdown's formatting syntax is +> to make it as readable as possible. The idea is that a +> Markdown-formatted document should be publishable as-is, as +> plain text, without looking like it's been marked up with tags +> or formatting instructions. +> (<https://daringfireball.net/projects/markdown/>) + +The point can be illustrated by comparing a sample of +[AsciiDoc](https://asciidoc.org/) with +an equivalent sample of Markdown. 
Here is a sample of +AsciiDoc from the AsciiDoc manual: + +``` +1. List item one. ++ +List item one continued with a second paragraph followed by an +Indented block. ++ +................. +$ ls *.sh +$ mv *.sh ~/tmp +................. ++ +List item continued with a third paragraph. + +2. List item two continued with an open block. ++ +-- +This paragraph is part of the preceding list item. + +a. This list is nested and does not require explicit item +continuation. ++ +This paragraph is part of the preceding list item. + +b. List item b. + +This paragraph belongs to item two of the outer list. +-- +``` + +And here is the equivalent in Markdown: +``` +1. List item one. + + List item one continued with a second paragraph followed by an + Indented block. + + $ ls *.sh + $ mv *.sh ~/tmp + + List item continued with a third paragraph. + +2. List item two continued with an open block. + + This paragraph is part of the preceding list item. + + 1. This list is nested and does not require explicit item continuation. + + This paragraph is part of the preceding list item. + + 2. List item b. + + This paragraph belongs to item two of the outer list. +``` + +The AsciiDoc version is, arguably, easier to write. You don't need +to worry about indentation. But the Markdown version is much easier +to read. The nesting of list items is apparent to the eye in the +source, not just in the processed document. + +## Why is a spec needed? + +John Gruber's [canonical description of Markdown's +syntax](https://daringfireball.net/projects/markdown/syntax) +does not specify the syntax unambiguously. Here are some examples of +questions it does not answer: + +1. How much indentation is needed for a sublist? The spec says that + continuation paragraphs need to be indented four spaces, but is + not fully explicit about sublists. It is natural to think that + they, too, must be indented four spaces, but `Markdown.pl` does + not require that. 
This is hardly a "corner case," and divergences + between implementations on this issue often lead to surprises for + users in real documents. (See [this comment by John + Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).) + +2. Is a blank line needed before a block quote or heading? + Most implementations do not require the blank line. However, + this can lead to unexpected results in hard-wrapped text, and + also to ambiguities in parsing (note that some implementations + put the heading inside the blockquote, while others do not). + (John Gruber has also spoken [in favor of requiring the blank + lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).) + +3. Is a blank line needed before an indented code block? + (`Markdown.pl` requires it, but this is not mentioned in the + documentation, and some implementations do not require it.) + + ``` markdown + paragraph + code? + ``` + +4. What is the exact rule for determining when list items get + wrapped in `<p>` tags? Can a list be partially "loose" and partially + "tight"? What should we do with a list like this? + + ``` markdown + 1. one + + 2. two + 3. three + ``` + + Or this? + + ``` markdown + 1. one + - a + + - b + 2. two + ``` + + (There are some relevant comments by John Gruber + [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).) + +5. Can list markers be indented? Can ordered list markers be right-aligned? + + ``` markdown + 8. item 1 + 9. item 2 + 10. item 2a + ``` + +6. Is this one list with a thematic break in its second item, + or two lists separated by a thematic break? + + ``` markdown + * a + * * * * * + * b + ``` + +7. When list markers change from numbers to bullets, do we have + two lists or one? (The Markdown syntax description suggests two, + but the perl scripts and many other implementations produce one.) 
+ + ``` markdown + 1. fee + 2. fie + - foe + - fum + ``` + +8. What are the precedence rules for the markers of inline structure? + For example, is the following a valid link, or does the code span + take precedence ? + + ``` markdown + [a backtick (`)](/url) and [another backtick (`)](/url). + ``` + +9. What are the precedence rules for markers of emphasis and strong + emphasis? For example, how should the following be parsed? + + ``` markdown + *foo *bar* baz* + ``` + +10. What are the precedence rules between block-level and inline-level + structure? For example, how should the following be parsed? + + ``` markdown + - `a long code span can contain a hyphen like this + - and it can screw things up` + ``` + +11. Can list items include section headings? (`Markdown.pl` does not + allow this, but does allow blockquotes to include headings.) + + ``` markdown + - # Heading + ``` + +12. Can list items be empty? + + ``` markdown + * a + * + * b + ``` + +13. Can link references be defined inside block quotes or list items? + + ``` markdown + > Blockquote [foo]. + > + > [foo]: /url + ``` + +14. If there are multiple definitions for the same reference, which takes + precedence? + + ``` markdown + [foo]: /url1 + [foo]: /url2 + + [foo][] + ``` + +In the absence of a spec, early implementers consulted `Markdown.pl` +to resolve these ambiguities. But `Markdown.pl` was quite buggy, and +gave manifestly bad results in many cases, so it was not a +satisfactory replacement for a spec. + +Because there is no unambiguous spec, implementations have diverged +considerably. As a result, users are often surprised to find that +a document that renders one way on one system (say, a GitHub wiki) +renders differently on another (say, converting to docbook using +pandoc). To make matters worse, because nothing in Markdown counts +as a "syntax error," the divergence often isn't discovered right away. + +## About this document + +This document attempts to specify Markdown syntax unambiguously. 
+It contains many examples with side-by-side Markdown and +HTML. These are intended to double as conformance tests. An +accompanying script `spec_tests.py` can be used to run the tests +against any Markdown program: + + python test/spec_tests.py --spec spec.txt --program PROGRAM + +Since this document describes how Markdown is to be parsed into +an abstract syntax tree, it would have made sense to use an abstract +representation of the syntax tree instead of HTML. But HTML is capable +of representing the structural distinctions we need to make, and the +choice of HTML for the tests makes it possible to run the tests against +an implementation without writing an abstract syntax tree renderer. + +Note that not every feature of the HTML samples is mandated by +the spec. For example, the spec says what counts as a link +destination, but it doesn't mandate that non-ASCII characters in +the URL be percent-encoded. To use the automatic tests, +implementers will need to provide a renderer that conforms to +the expectations of the spec examples (percent-encoding +non-ASCII characters in URLs). But a conforming implementation +can use a different renderer and may choose not to +percent-encode non-ASCII characters in URLs. + +This document is generated from a text file, `spec.txt`, written +in Markdown with a small extension for the side-by-side tests. +The script `tools/makespec.py` can be used to convert `spec.txt` into +HTML or CommonMark (which can then be converted into other formats). + +In the examples, the `→` character is used to represent tabs. + +# Preliminaries + +## Characters and lines + +Any sequence of [characters] is a valid CommonMark +document. + +A [character](@) is a Unicode code point. Although some +code points (for example, combining accents) do not correspond to +characters in an intuitive sense, all code points count as characters +for purposes of this spec. 
+ +This spec does not specify an encoding; it thinks of lines as composed +of [characters] rather than bytes. A conforming parser may be limited +to a certain encoding. + +A [line](@) is a sequence of zero or more [characters] +other than line feed (`U+000A`) or carriage return (`U+000D`), +followed by a [line ending] or by the end of file. + +A [line ending](@) is a line feed (`U+000A`), a carriage return +(`U+000D`) not followed by a line feed, or a carriage return and a +following line feed. + +A line containing no characters, or a line containing only spaces +(`U+0020`) or tabs (`U+0009`), is called a [blank line](@). + +The following definitions of character classes will be used in this spec: + +A [Unicode whitespace character](@) is a character in the Unicode `Zs` general +category, or a tab (`U+0009`), line feed (`U+000A`), form feed (`U+000C`), or +carriage return (`U+000D`). + +[Unicode whitespace](@) is a sequence of one or more +[Unicode whitespace characters]. + +A [tab](@) is `U+0009`. + +A [space](@) is `U+0020`. + +An [ASCII control character](@) is a character between `U+0000–1F` (both +including) or `U+007F`. + +An [ASCII punctuation character](@) +is `!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`, +`*`, `+`, `,`, `-`, `.`, `/` (U+0021–2F), +`:`, `;`, `<`, `=`, `>`, `?`, `@` (U+003A–0040), +`[`, `\`, `]`, `^`, `_`, `` ` `` (U+005B–0060), +`{`, `|`, `}`, or `~` (U+007B–007E). + +A [Unicode punctuation character](@) is a character in the Unicode `P` +(puncuation) or `S` (symbol) general categories. + +## Tabs + +Tabs in lines are not expanded to [spaces]. However, +in contexts where spaces help to define block structure, +tabs behave as if they were replaced by spaces with a tab stop +of 4 characters. + +Thus, for example, a tab can be used instead of four spaces +in an indented code block. (Note, however, that internal +tabs are passed through as literal tabs, not expanded to +spaces.) + +```````````````````````````````` example +→foo→baz→→bim +. 
+<pre><code>foo→baz→→bim +</code></pre> +```````````````````````````````` + +```````````````````````````````` example + →foo→baz→→bim +. +<pre><code>foo→baz→→bim +</code></pre> +```````````````````````````````` + +```````````````````````````````` example + a→a + ὐ→a +. +<pre><code>a→a +ὐ→a +</code></pre> +```````````````````````````````` + +In the following example, a continuation paragraph of a list +item is indented with a tab; this has exactly the same effect +as indentation with four spaces would: + +```````````````````````````````` example + - foo + +→bar +. +<ul> +<li> +<p>foo</p> +<p>bar</p> +</li> +</ul> +```````````````````````````````` + +```````````````````````````````` example +- foo + +→→bar +. +<ul> +<li> +<p>foo</p> +<pre><code> bar +</code></pre> +</li> +</ul> +```````````````````````````````` + +Normally the `>` that begins a block quote may be followed +optionally by a space, which is not considered part of the +content. In the following case `>` is followed by a tab, +which is treated as if it were expanded into three spaces. +Since one of these spaces is considered part of the +delimiter, `foo` is considered to be indented six spaces +inside the block quote context, so we get an indented +code block starting with two spaces. + +```````````````````````````````` example +>→→foo +. +<blockquote> +<pre><code> foo +</code></pre> +</blockquote> +```````````````````````````````` + +```````````````````````````````` example +-→→foo +. +<ul> +<li> +<pre><code> foo +</code></pre> +</li> +</ul> +```````````````````````````````` + + +```````````````````````````````` example + foo +→bar +. +<pre><code>foo +bar +</code></pre> +```````````````````````````````` + +```````````````````````````````` example + - foo + - bar +→ - baz +. +<ul> +<li>foo +<ul> +<li>bar +<ul> +<li>baz</li> +</ul> +</li> +</ul> +</li> +</ul> +```````````````````````````````` + +```````````````````````````````` example +#→Foo +. 
+<h1>Foo</h1> +```````````````````````````````` + +```````````````````````````````` example +*→*→*→ +. +<hr /> +```````````````````````````````` + + +## Insecure characters + +For security reasons, the Unicode character `U+0000` must be replaced +with the REPLACEMENT CHARACTER (`U+FFFD`). + + +## Backslash escapes + +Any ASCII punctuation character may be backslash-escaped: + +```````````````````````````````` example +\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~ +. +<p>!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~</p> +```````````````````````````````` + + +Backslashes before other characters are treated as literal +backslashes: + +```````````````````````````````` example +\→\A\a\ \3\φ\« +. +<p>\→\A\a\ \3\φ\«</p> +```````````````````````````````` + + +Escaped characters are treated as regular characters and do +not have their usual Markdown meanings: + +```````````````````````````````` example +\*not emphasized* +\<br/> not a tag +\[not a link](/foo) +\`not code` +1\. not a list +\* not a list +\# not a heading +\[foo]: /url "not a reference" +\ö not a character entity +. +<p>*not emphasized* +<br/> not a tag +[not a link](/foo) +`not code` +1. not a list +* not a list +# not a heading +[foo]: /url "not a reference" +&ouml; not a character entity</p> +```````````````````````````````` + + +If a backslash is itself escaped, the following character is not: + +```````````````````````````````` example +\\*emphasis* +. +<p>\<em>emphasis</em></p> +```````````````````````````````` + + +A backslash at the end of the line is a [hard line break]: + +```````````````````````````````` example +foo\ +bar +. +<p>foo<br /> +bar</p> +```````````````````````````````` + + +Backslash escapes do not work in code blocks, code spans, autolinks, or +raw HTML: + +```````````````````````````````` example +`` \[\` `` +. +<p><code>\[\`</code></p> +```````````````````````````````` + + +```````````````````````````````` example + \[\] +. 
+<pre><code>\[\] +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example +~~~ +\[\] +~~~ +. +<pre><code>\[\] +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example +<https://example.com?find=\*> +. +<p><a href="https://example.com?find=%5C*">https://example.com?find=\*</a></p> +```````````````````````````````` + + +```````````````````````````````` example +<a href="/bar\/)"> +. +<a href="/bar\/)"> +```````````````````````````````` + + +But they work in all other contexts, including URLs and link titles, +link references, and [info strings] in [fenced code blocks]: + +```````````````````````````````` example +[foo](/bar\* "ti\*tle") +. +<p><a href="/bar*" title="ti*tle">foo</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[foo] + +[foo]: /bar\* "ti\*tle" +. +<p><a href="/bar*" title="ti*tle">foo</a></p> +```````````````````````````````` + + +```````````````````````````````` example +``` foo\+bar +foo +``` +. +<pre><code class="language-foo+bar">foo +</code></pre> +```````````````````````````````` + + +## Entity and numeric character references + +Valid HTML entity references and numeric character references +can be used in place of the corresponding Unicode character, +with the following exceptions: + +- Entity and character references are not recognized in code + blocks and code spans. + +- Entity and character references cannot stand in place of + special characters that define structural elements in + CommonMark. For example, although `&#42;` can be used + in place of a literal `*` character, `&#42;` cannot replace + `*` in emphasis delimiters, bullet list markers, or thematic + breaks. + +Conforming CommonMark parsers need not store information about +whether a particular character was represented in the source +using a Unicode character or an entity reference. + +[Entity references](@) consist of `&` + any of the valid +HTML5 entity names + `;`.
The +document <https://html.spec.whatwg.org/entities.json> +is used as an authoritative source for the valid entity +references and their corresponding code points. + +```````````````````````````````` example +&nbsp; &amp; &copy; &AElig; &Dcaron; +&frac34; &HilbertSpace; &DifferentialD; +&ClockwiseContourIntegral; &ngE; +. +<p>  &amp; © Æ Ď +¾ ℋ ⅆ +∲ ≧̸</p> +```````````````````````````````` + + +[Decimal numeric character +references](@) +consist of `&#` + a string of 1--7 arabic digits + `;`. A +numeric character reference is parsed as the corresponding +Unicode character. Invalid Unicode code points will be replaced by +the REPLACEMENT CHARACTER (`U+FFFD`). For security reasons, +the code point `U+0000` will also be replaced by `U+FFFD`. + +```````````````````````````````` example +&#35; &#1234; &#992; &#0; +. +<p># Ӓ Ϡ �</p> +```````````````````````````````` + + +[Hexadecimal numeric character +references](@) consist of `&#` + +either `X` or `x` + a string of 1-6 hexadecimal digits + `;`. +They too are parsed as the corresponding Unicode character (this +time specified with a hexadecimal numeral instead of decimal). + +```````````````````````````````` example +&#X22; &#XD06; &#xcab; +. +<p>&quot; ആ ಫ</p> +```````````````````````````````` + + +Here are some nonentities: + +```````````````````````````````` example +&nbsp &x; &#; &#x; +&#87654321; +&#abcdef0; +&ThisIsNotDefined; &hi?; +. +<p>&amp;nbsp &amp;x; &amp;#; &amp;#x; +&amp;#87654321; +&amp;#abcdef0; +&amp;ThisIsNotDefined; &amp;hi?;</p> +```````````````````````````````` + + +Although HTML5 does accept some entity references +without a trailing semicolon (such as `&copy`), these are not +recognized here, because it makes the grammar too ambiguous: + +```````````````````````````````` example +&copy +. +<p>&amp;copy</p> +```````````````````````````````` + + +Strings that are not on the list of HTML5 named entities are not +recognized as entity references either: + +```````````````````````````````` example +&MadeUpEntity; +.
+<p>&amp;MadeUpEntity;</p> +```````````````````````````````` + + +Entity and numeric character references are recognized in any +context besides code spans or code blocks, including +URLs, [link titles], and [fenced code block][] [info strings]: + +```````````````````````````````` example +<a href="&ouml;&ouml;.html"> +. +<a href="&ouml;&ouml;.html"> +```````````````````````````````` + + +```````````````````````````````` example +[foo](/f&ouml;&ouml; "f&ouml;&ouml;") +. +<p><a href="/f%C3%B6%C3%B6" title="föö">foo</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[foo] + +[foo]: /f&ouml;&ouml; "f&ouml;&ouml;" +. +<p><a href="/f%C3%B6%C3%B6" title="föö">foo</a></p> +```````````````````````````````` + + +```````````````````````````````` example +``` f&ouml;&ouml; +foo +``` +. +<pre><code class="language-föö">foo +</code></pre> +```````````````````````````````` + + +Entity and numeric character references are treated as literal +text in code spans and code blocks: + +```````````````````````````````` example +`f&ouml;&ouml;` +. +<p><code>f&amp;ouml;&amp;ouml;</code></p> +```````````````````````````````` + + +```````````````````````````````` example + f&ouml;f&ouml; +. +<pre><code>f&amp;ouml;f&amp;ouml; +</code></pre> +```````````````````````````````` + + +Entity and numeric character references cannot be used +in place of symbols indicating structure in CommonMark +documents. + +```````````````````````````````` example +&#42;foo&#42; +*foo* +. +<p>*foo* +<em>foo</em></p> +```````````````````````````````` + +```````````````````````````````` example +&#42; foo + +* foo +. +<p>* foo</p> +<ul> +<li>foo</li> +</ul> +```````````````````````````````` + +```````````````````````````````` example +foo&#10;&#10;bar +. +<p>foo + +bar</p> +```````````````````````````````` + +```````````````````````````````` example +&#9;foo +. +<p>→foo</p> +```````````````````````````````` + + +```````````````````````````````` example +[a](url &quot;tit&quot;) +.
+<p>[a](url "tit")</p> +```````````````````````````````` + + + +# Blocks and inlines + +We can think of a document as a sequence of +[blocks](@)---structural elements like paragraphs, block +quotations, lists, headings, rules, and code blocks. Some blocks (like +block quotes and list items) contain other blocks; others (like +headings and paragraphs) contain [inline](@) content---text, +links, emphasized text, images, code spans, and so on. + +## Precedence + +Indicators of block structure always take precedence over indicators +of inline structure. So, for example, the following is a list with +two items, not a list with one item containing a code span: + +```````````````````````````````` example +- `one +- two` +. +<ul> +<li>`one</li> +<li>two`</li> +</ul> +```````````````````````````````` + + +This means that parsing can proceed in two steps: first, the block +structure of the document can be discerned; second, text lines inside +paragraphs, headings, and other block constructs can be parsed for inline +structure. The second step requires information about link reference +definitions that will be available only at the end of the first +step. Note that the first step requires processing lines in sequence, +but the second can be parallelized, since the inline parsing of +one block element does not affect the inline parsing of any other. + +## Container blocks and leaf blocks + +We can divide blocks into two types: +[container blocks](#container-blocks), +which can contain other blocks, and [leaf blocks](#leaf-blocks), +which cannot. + +# Leaf blocks + +This section describes the different kinds of leaf block that make up a +Markdown document. + +## Thematic breaks + +A line consisting of optionally up to three spaces of indentation, followed by a +sequence of three or more matching `-`, `_`, or `*` characters, each followed +optionally by any number of spaces or tabs, forms a +[thematic break](@). + +```````````````````````````````` example +*** +--- +___ +. 
+<hr /> +<hr /> +<hr /> +```````````````````````````````` + + +Wrong characters: + +```````````````````````````````` example ++++ +. +<p>+++</p> +```````````````````````````````` + + +```````````````````````````````` example +=== +. +<p>===</p> +```````````````````````````````` + + +Not enough characters: + +```````````````````````````````` example +-- +** +__ +. +<p>-- +** +__</p> +```````````````````````````````` + + +Up to three spaces of indentation are allowed: + +```````````````````````````````` example + *** + *** + *** +. +<hr /> +<hr /> +<hr /> +```````````````````````````````` + + +Four spaces of indentation is too many: + +```````````````````````````````` example + *** +. +<pre><code>*** +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example +Foo + *** +. +<p>Foo +***</p> +```````````````````````````````` + + +More than three characters may be used: + +```````````````````````````````` example +_____________________________________ +. +<hr /> +```````````````````````````````` + + +Spaces and tabs are allowed between the characters: + +```````````````````````````````` example + - - - +. +<hr /> +```````````````````````````````` + + +```````````````````````````````` example + ** * ** * ** * ** +. +<hr /> +```````````````````````````````` + + +```````````````````````````````` example +- - - - +. +<hr /> +```````````````````````````````` + + +Spaces and tabs are allowed at the end: + +```````````````````````````````` example +- - - - +. +<hr /> +```````````````````````````````` + + +However, no other characters may occur in the line: + +```````````````````````````````` example +_ _ _ _ a + +a------ + +---a--- +. +<p>_ _ _ _ a</p> +<p>a------</p> +<p>---a---</p> +```````````````````````````````` + + +It is required that all of the characters other than spaces or tabs be the same. +So, this is not a thematic break: + +```````````````````````````````` example + *-* +. 
+<p><em>-</em></p> +```````````````````````````````` + + +Thematic breaks do not need blank lines before or after: + +```````````````````````````````` example +- foo +*** +- bar +. +<ul> +<li>foo</li> +</ul> +<hr /> +<ul> +<li>bar</li> +</ul> +```````````````````````````````` + + +Thematic breaks can interrupt a paragraph: + +```````````````````````````````` example +Foo +*** +bar +. +<p>Foo</p> +<hr /> +<p>bar</p> +```````````````````````````````` + + +If a line of dashes that meets the above conditions for being a +thematic break could also be interpreted as the underline of a [setext +heading], the interpretation as a +[setext heading] takes precedence. Thus, for example, +this is a setext heading, not a paragraph followed by a thematic break: + +```````````````````````````````` example +Foo +--- +bar +. +<h2>Foo</h2> +<p>bar</p> +```````````````````````````````` + + +When both a thematic break and a list item are possible +interpretations of a line, the thematic break takes precedence: + +```````````````````````````````` example +* Foo +* * * +* Bar +. +<ul> +<li>Foo</li> +</ul> +<hr /> +<ul> +<li>Bar</li> +</ul> +```````````````````````````````` + + +If you want a thematic break in a list item, use a different bullet: + +```````````````````````````````` example +- Foo +- * * * +. +<ul> +<li>Foo</li> +<li> +<hr /> +</li> +</ul> +```````````````````````````````` + + +## ATX headings + +An [ATX heading](@) +consists of a string of characters, parsed as inline content, between an +opening sequence of 1--6 unescaped `#` characters and an optional +closing sequence of any number of unescaped `#` characters. +The opening sequence of `#` characters must be followed by spaces or tabs, or +by the end of line. The optional closing sequence of `#`s must be preceded by +spaces or tabs and may be followed by spaces or tabs only. The opening +`#` character may be preceded by up to three spaces of indentation. 
The raw +contents of the heading are stripped of leading and trailing space or tabs +before being parsed as inline content. The heading level is equal to the number +of `#` characters in the opening sequence. + +Simple headings: + +```````````````````````````````` example +# foo +## foo +### foo +#### foo +##### foo +###### foo +. +<h1>foo</h1> +<h2>foo</h2> +<h3>foo</h3> +<h4>foo</h4> +<h5>foo</h5> +<h6>foo</h6> +```````````````````````````````` + + +More than six `#` characters is not a heading: + +```````````````````````````````` example +####### foo +. +<p>####### foo</p> +```````````````````````````````` + + +At least one space or tab is required between the `#` characters and the +heading's contents, unless the heading is empty. Note that many +implementations currently do not require the space. However, the +space was required by the +[original ATX implementation](http://www.aaronsw.com/2002/atx/atx.py), +and it helps prevent things like the following from being parsed as +headings: + +```````````````````````````````` example +#5 bolt + +#hashtag +. +<p>#5 bolt</p> +<p>#hashtag</p> +```````````````````````````````` + + +This is not a heading, because the first `#` is escaped: + +```````````````````````````````` example +\## foo +. +<p>## foo</p> +```````````````````````````````` + + +Contents are parsed as inlines: + +```````````````````````````````` example +# foo *bar* \*baz\* +. +<h1>foo <em>bar</em> *baz*</h1> +```````````````````````````````` + + +Leading and trailing spaces or tabs are ignored in parsing inline content: + +```````````````````````````````` example +# foo +. +<h1>foo</h1> +```````````````````````````````` + + +Up to three spaces of indentation are allowed: + +```````````````````````````````` example + ### foo + ## foo + # foo +. +<h3>foo</h3> +<h2>foo</h2> +<h1>foo</h1> +```````````````````````````````` + + +Four spaces of indentation is too many: + +```````````````````````````````` example + # foo +. 
+<pre><code># foo +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example +foo + # bar +. +<p>foo +# bar</p> +```````````````````````````````` + + +A closing sequence of `#` characters is optional: + +```````````````````````````````` example +## foo ## + ### bar ### +. +<h2>foo</h2> +<h3>bar</h3> +```````````````````````````````` + + +It need not be the same length as the opening sequence: + +```````````````````````````````` example +# foo ################################## +##### foo ## +. +<h1>foo</h1> +<h5>foo</h5> +```````````````````````````````` + + +Spaces or tabs are allowed after the closing sequence: + +```````````````````````````````` example +### foo ### +. +<h3>foo</h3> +```````````````````````````````` + + +A sequence of `#` characters with anything but spaces or tabs following it +is not a closing sequence, but counts as part of the contents of the +heading: + +```````````````````````````````` example +### foo ### b +. +<h3>foo ### b</h3> +```````````````````````````````` + + +The closing sequence must be preceded by a space or tab: + +```````````````````````````````` example +# foo# +. +<h1>foo#</h1> +```````````````````````````````` + + +Backslash-escaped `#` characters do not count as part +of the closing sequence: + +```````````````````````````````` example +### foo \### +## foo #\## +# foo \# +. +<h3>foo ###</h3> +<h2>foo ###</h2> +<h1>foo #</h1> +```````````````````````````````` + + +ATX headings need not be separated from surrounding content by blank +lines, and they can interrupt paragraphs: + +```````````````````````````````` example +**** +## foo +**** +. +<hr /> +<h2>foo</h2> +<hr /> +```````````````````````````````` + + +```````````````````````````````` example +Foo bar +# baz +Bar foo +. +<p>Foo bar</p> +<h1>baz</h1> +<p>Bar foo</p> +```````````````````````````````` + + +ATX headings can be empty: + +```````````````````````````````` example +## +# +### ### +. 
+<h2></h2> +<h1></h1> +<h3></h3> +```````````````````````````````` + + +## Setext headings + +A [setext heading](@) consists of one or more +lines of text, not interrupted by a blank line, of which the first line does not +have more than 3 spaces of indentation, followed by +a [setext heading underline]. The lines of text must be such +that, were they not followed by the setext heading underline, +they would be interpreted as a paragraph: they cannot be +interpretable as a [code fence], [ATX heading][ATX headings], +[block quote][block quotes], [thematic break][thematic breaks], +[list item][list items], or [HTML block][HTML blocks]. + +A [setext heading underline](@) is a sequence of +`=` characters or a sequence of `-` characters, with no more than 3 +spaces of indentation and any number of trailing spaces or tabs. + +The heading is a level 1 heading if `=` characters are used in +the [setext heading underline], and a level 2 heading if `-` +characters are used. The contents of the heading are the result +of parsing the preceding lines of text as CommonMark inline +content. + +In general, a setext heading need not be preceded or followed by a +blank line. However, it cannot interrupt a paragraph, so when a +setext heading comes after a paragraph, a blank line is needed between +them. + +Simple examples: + +```````````````````````````````` example +Foo *bar* +========= + +Foo *bar* +--------- +. +<h1>Foo <em>bar</em></h1> +<h2>Foo <em>bar</em></h2> +```````````````````````````````` + + +The content of the heading may span more than one line: + +```````````````````````````````` example +Foo *bar +baz* +==== +. +<h1>Foo <em>bar +baz</em></h1> +```````````````````````````````` + +The contents are the result of parsing the heading's raw +content as inlines. The heading's raw content is formed by +concatenating the lines and removing initial and final +spaces or tabs. + +```````````````````````````````` example + Foo *bar +baz*→ +==== +.
+<h1>Foo <em>bar +baz</em></h1> +```````````````````````````````` + + +The underlining can be any length: + +```````````````````````````````` example +Foo +------------------------- + +Foo += +. +<h2>Foo</h2> +<h1>Foo</h1> +```````````````````````````````` + + +The heading content can be preceded by up to three spaces of indentation, and +need not line up with the underlining: + +```````````````````````````````` example + Foo +--- + + Foo +----- + + Foo + === +. +<h2>Foo</h2> +<h2>Foo</h2> +<h1>Foo</h1> +```````````````````````````````` + + +Four spaces of indentation is too many: + +```````````````````````````````` example + Foo + --- + + Foo +--- +. +<pre><code>Foo +--- + +Foo +</code></pre> +<hr /> +```````````````````````````````` + + +The setext heading underline can be preceded by up to three spaces of +indentation, and may have trailing spaces or tabs: + +```````````````````````````````` example +Foo + ---- +. +<h2>Foo</h2> +```````````````````````````````` + + +Four spaces of indentation is too many: + +```````````````````````````````` example +Foo + --- +. +<p>Foo +---</p> +```````````````````````````````` + + +The setext heading underline cannot contain internal spaces or tabs: + +```````````````````````````````` example +Foo += = + +Foo +--- - +. +<p>Foo += =</p> +<p>Foo</p> +<hr /> +```````````````````````````````` + + +Trailing spaces or tabs in the content line do not cause a hard line break: + +```````````````````````````````` example +Foo +----- +. +<h2>Foo</h2> +```````````````````````````````` + + +Nor does a backslash at the end: + +```````````````````````````````` example +Foo\ +---- +. +<h2>Foo\</h2> +```````````````````````````````` + + +Since indicators of block structure take precedence over +indicators of inline structure, the following are setext headings: + +```````````````````````````````` example +`Foo +---- +` + +<a title="a lot +--- +of dashes"/> +. 
+<h2>`Foo</h2> +<p>`</p> +<h2><a title="a lot</h2> +<p>of dashes"/></p> +```````````````````````````````` + + +The setext heading underline cannot be a [lazy continuation +line] in a list item or block quote: + +```````````````````````````````` example +> Foo +--- +. +<blockquote> +<p>Foo</p> +</blockquote> +<hr /> +```````````````````````````````` + + +```````````````````````````````` example +> foo +bar +=== +. +<blockquote> +<p>foo +bar +===</p> +</blockquote> +```````````````````````````````` + + +```````````````````````````````` example +- Foo +--- +. +<ul> +<li>Foo</li> +</ul> +<hr /> +```````````````````````````````` + + +A blank line is needed between a paragraph and a following +setext heading, since otherwise the paragraph becomes part +of the heading's content: + +```````````````````````````````` example +Foo +Bar +--- +. +<h2>Foo +Bar</h2> +```````````````````````````````` + + +But in general a blank line is not required before or after +setext headings: + +```````````````````````````````` example +--- +Foo +--- +Bar +--- +Baz +. +<hr /> +<h2>Foo</h2> +<h2>Bar</h2> +<p>Baz</p> +```````````````````````````````` + + +Setext headings cannot be empty: + +```````````````````````````````` example + +==== +. +<p>====</p> +```````````````````````````````` + + +Setext heading text lines must not be interpretable as block +constructs other than paragraphs. So, the line of dashes +in these examples gets interpreted as a thematic break: + +```````````````````````````````` example +--- +--- +. +<hr /> +<hr /> +```````````````````````````````` + + +```````````````````````````````` example +- foo +----- +. +<ul> +<li>foo</li> +</ul> +<hr /> +```````````````````````````````` + + +```````````````````````````````` example + foo +--- +. +<pre><code>foo +</code></pre> +<hr /> +```````````````````````````````` + + +```````````````````````````````` example +> foo +----- +. 
+<blockquote> +<p>foo</p> +</blockquote> +<hr /> +```````````````````````````````` + + +If you want a heading with `> foo` as its literal text, you can +use backslash escapes: + +```````````````````````````````` example +\> foo +------ +. +<h2>> foo</h2> +```````````````````````````````` + + +**Compatibility note:** Most existing Markdown implementations +do not allow the text of setext headings to span multiple lines. +But there is no consensus about how to interpret + +``` markdown +Foo +bar +--- +baz +``` + +One can find four different interpretations: + +1. paragraph "Foo", heading "bar", paragraph "baz" +2. paragraph "Foo bar", thematic break, paragraph "baz" +3. paragraph "Foo bar --- baz" +4. heading "Foo bar", paragraph "baz" + +We find interpretation 4 most natural, and interpretation 4 +increases the expressive power of CommonMark, by allowing +multiline headings. Authors who want interpretation 1 can +put a blank line after the first paragraph: + +```````````````````````````````` example +Foo + +bar +--- +baz +. +<p>Foo</p> +<h2>bar</h2> +<p>baz</p> +```````````````````````````````` + + +Authors who want interpretation 2 can put blank lines around +the thematic break, + +```````````````````````````````` example +Foo +bar + +--- + +baz +. +<p>Foo +bar</p> +<hr /> +<p>baz</p> +```````````````````````````````` + + +or use a thematic break that cannot count as a [setext heading +underline], such as + +```````````````````````````````` example +Foo +bar +* * * +baz +. +<p>Foo +bar</p> +<hr /> +<p>baz</p> +```````````````````````````````` + + +Authors who want interpretation 3 can use backslash escapes: + +```````````````````````````````` example +Foo +bar +\--- +baz +. +<p>Foo +bar +--- +baz</p> +```````````````````````````````` + + +## Indented code blocks + +An [indented code block](@) is composed of one or more +[indented chunks] separated by blank lines. 
+An [indented chunk](@) is a sequence of non-blank lines, +each preceded by four or more spaces of indentation. The contents of the code +block are the literal contents of the lines, including trailing +[line endings], minus four spaces of indentation. +An indented code block has no [info string]. + +An indented code block cannot interrupt a paragraph, so there must be +a blank line between a paragraph and a following indented code block. +(A blank line is not needed, however, between a code block and a following +paragraph.) + +```````````````````````````````` example + a simple + indented code block +. +<pre><code>a simple + indented code block +</code></pre> +```````````````````````````````` + + +If there is any ambiguity between an interpretation of indentation +as a code block and as indicating that material belongs to a [list +item][list items], the list item interpretation takes precedence: + +```````````````````````````````` example + - foo + + bar +. +<ul> +<li> +<p>foo</p> +<p>bar</p> +</li> +</ul> +```````````````````````````````` + + +```````````````````````````````` example +1. foo + + - bar +. +<ol> +<li> +<p>foo</p> +<ul> +<li>bar</li> +</ul> +</li> +</ol> +```````````````````````````````` + + + +The contents of a code block are literal text, and do not get parsed +as Markdown: + +```````````````````````````````` example + <a/> + *hi* + + - one +. +<pre><code><a/> +*hi* + +- one +</code></pre> +```````````````````````````````` + + +Here we have three chunks separated by blank lines: + +```````````````````````````````` example + chunk1 + + chunk2 + + + + chunk3 +. +<pre><code>chunk1 + +chunk2 + + + +chunk3 +</code></pre> +```````````````````````````````` + + +Any initial spaces or tabs beyond four spaces of indentation will be included in +the content, even in interior blank lines: + +```````````````````````````````` example + chunk1 + + chunk2 +. 
+<pre><code>chunk1 + + chunk2 +</code></pre> +```````````````````````````````` + + +An indented code block cannot interrupt a paragraph. (This +allows hanging indents and the like.) + +```````````````````````````````` example +Foo + bar + +. +<p>Foo +bar</p> +```````````````````````````````` + + +However, any non-blank line with fewer than four spaces of indentation ends +the code block immediately. So a paragraph may occur immediately +after indented code: + +```````````````````````````````` example + foo +bar +. +<pre><code>foo +</code></pre> +<p>bar</p> +```````````````````````````````` + + +And indented code can occur immediately before and after other kinds of +blocks: + +```````````````````````````````` example +# Heading + foo +Heading +------ + foo +---- +. +<h1>Heading</h1> +<pre><code>foo +</code></pre> +<h2>Heading</h2> +<pre><code>foo +</code></pre> +<hr /> +```````````````````````````````` + + +The first line can be preceded by more than four spaces of indentation: + +```````````````````````````````` example + foo + bar +. +<pre><code> foo +bar +</code></pre> +```````````````````````````````` + + +Blank lines preceding or following an indented code block +are not included in it: + +```````````````````````````````` example + + + foo + + +. +<pre><code>foo +</code></pre> +```````````````````````````````` + + +Trailing spaces or tabs are included in the code block's content: + +```````````````````````````````` example + foo +. +<pre><code>foo +</code></pre> +```````````````````````````````` + + + +## Fenced code blocks + +A [code fence](@) is a sequence +of at least three consecutive backtick characters (`` ` ``) or +tildes (`~`). (Tildes and backticks cannot be mixed.) +A [fenced code block](@) +begins with a code fence, preceded by up to three spaces of indentation. + +The line with the opening code fence may optionally contain some text +following the code fence; this is trimmed of leading and trailing +spaces or tabs and called the [info string](@). 
If the [info string] comes +after a backtick fence, it may not contain any backtick +characters. (The reason for this restriction is that otherwise +some inline code would be incorrectly interpreted as the +beginning of a fenced code block.) + +The content of the code block consists of all subsequent lines, until +a closing [code fence] of the same type as the code block +began with (backticks or tildes), and with at least as many backticks +or tildes as the opening code fence. If the leading code fence is +preceded by N spaces of indentation, then up to N spaces of indentation are +removed from each line of the content (if present). (If a content line is not +indented, it is preserved unchanged. If it is indented N spaces or less, all +of the indentation is removed.) + +The closing code fence may be preceded by up to three spaces of indentation, and +may be followed only by spaces or tabs, which are ignored. If the end of the +containing block (or document) is reached and no closing code fence +has been found, the code block contains all of the lines after the +opening code fence until the end of the containing block (or +document). (An alternative spec would require backtracking in the +event that a closing code fence is not found. But this makes parsing +much less efficient, and there seems to be no real downside to the +behavior described here.) + +A fenced code block may interrupt a paragraph, and does not require +a blank line either before or after. + +The content of a code fence is treated as literal text, not parsed +as inlines. The first word of the [info string] is typically used to +specify the language of the code sample, and rendered in the `class` +attribute of the `code` tag. However, this spec does not mandate any +particular treatment of the [info string]. + +Here is a simple example with backticks: + +```````````````````````````````` example +``` +< + > +``` +. 
+<pre><code>< + > +</code></pre> +```````````````````````````````` + + +With tildes: + +```````````````````````````````` example +~~~ +< + > +~~~ +. +<pre><code>< + > +</code></pre> +```````````````````````````````` + +Fewer than three backticks is not enough: + +```````````````````````````````` example +`` +foo +`` +. +<p><code>foo</code></p> +```````````````````````````````` + +The closing code fence must use the same character as the opening +fence: + +```````````````````````````````` example +``` +aaa +~~~ +``` +. +<pre><code>aaa +~~~ +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example +~~~ +aaa +``` +~~~ +. +<pre><code>aaa +``` +</code></pre> +```````````````````````````````` + + +The closing code fence must be at least as long as the opening fence: + +```````````````````````````````` example +```` +aaa +``` +`````` +. +<pre><code>aaa +``` +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example +~~~~ +aaa +~~~ +~~~~ +. +<pre><code>aaa +~~~ +</code></pre> +```````````````````````````````` + + +Unclosed code blocks are closed by the end of the document +(or the enclosing [block quote][block quotes] or [list item][list items]): + +```````````````````````````````` example +``` +. +<pre><code></code></pre> +```````````````````````````````` + + +```````````````````````````````` example +````` + +``` +aaa +. +<pre><code> +``` +aaa +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example +> ``` +> aaa + +bbb +. +<blockquote> +<pre><code>aaa +</code></pre> +</blockquote> +<p>bbb</p> +```````````````````````````````` + + +A code block can have all empty lines as its content: + +```````````````````````````````` example +``` + + +``` +. +<pre><code> + +</code></pre> +```````````````````````````````` + + +A code block can be empty: + +```````````````````````````````` example +``` +``` +. 
+<pre><code></code></pre> +```````````````````````````````` + + +Fences can be indented. If the opening fence is indented, +content lines will have equivalent opening indentation removed, +if present: + +```````````````````````````````` example + ``` + aaa +aaa +``` +. +<pre><code>aaa +aaa +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example + ``` +aaa + aaa +aaa + ``` +. +<pre><code>aaa +aaa +aaa +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example + ``` + aaa + aaa + aaa + ``` +. +<pre><code>aaa + aaa +aaa +</code></pre> +```````````````````````````````` + + +Four spaces of indentation is too many: + +```````````````````````````````` example + ``` + aaa + ``` +. +<pre><code>``` +aaa +``` +</code></pre> +```````````````````````````````` + + +Closing fences may be preceded by up to three spaces of indentation, and their +indentation need not match that of the opening fence: + +```````````````````````````````` example +``` +aaa + ``` +. +<pre><code>aaa +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example + ``` +aaa + ``` +. +<pre><code>aaa +</code></pre> +```````````````````````````````` + + +This is not a closing fence, because it is indented 4 spaces: + +```````````````````````````````` example +``` +aaa + ``` +. +<pre><code>aaa + ``` +</code></pre> +```````````````````````````````` + + + +Code fences (opening and closing) cannot contain internal spaces or tabs: + +```````````````````````````````` example +``` ``` +aaa +. +<p><code> </code> +aaa</p> +```````````````````````````````` + + +```````````````````````````````` example +~~~~~~ +aaa +~~~ ~~ +. +<pre><code>aaa +~~~ ~~ +</code></pre> +```````````````````````````````` + + +Fenced code blocks can interrupt paragraphs, and can be followed +directly by paragraphs, without a blank line between: + +```````````````````````````````` example +foo +``` +bar +``` +baz +. 
+<p>foo</p> +<pre><code>bar +</code></pre> +<p>baz</p> +```````````````````````````````` + + +Other blocks can also occur before and after fenced code blocks +without an intervening blank line: + +```````````````````````````````` example +foo +--- +~~~ +bar +~~~ +# baz +. +<h2>foo</h2> +<pre><code>bar +</code></pre> +<h1>baz</h1> +```````````````````````````````` + + +An [info string] can be provided after the opening code fence. +Although this spec doesn't mandate any particular treatment of +the info string, the first word is typically used to specify +the language of the code block. In HTML output, the language is +normally indicated by adding a class to the `code` element consisting +of `language-` followed by the language name. + +```````````````````````````````` example +```ruby +def foo(x) + return 3 +end +``` +. +<pre><code class="language-ruby">def foo(x) + return 3 +end +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example +~~~~ ruby startline=3 $%@#$ +def foo(x) + return 3 +end +~~~~~~~ +. +<pre><code class="language-ruby">def foo(x) + return 3 +end +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example +````; +```` +. +<pre><code class="language-;"></code></pre> +```````````````````````````````` + + +[Info strings] for backtick code blocks cannot contain backticks: + +```````````````````````````````` example +``` aa ``` +foo +. +<p><code>aa</code> +foo</p> +```````````````````````````````` + + +[Info strings] for tilde code blocks can contain backticks and tildes: + +```````````````````````````````` example +~~~ aa ``` ~~~ +foo +~~~ +. +<pre><code class="language-aa">foo +</code></pre> +```````````````````````````````` + + +Closing code fences cannot have [info strings]: + +```````````````````````````````` example +``` +``` aaa +``` +. 
+<pre><code>``` aaa +</code></pre> +```````````````````````````````` + + + +## HTML blocks + +An [HTML block](@) is a group of lines that is treated +as raw HTML (and will not be escaped in HTML output). + +There are seven kinds of [HTML block], which can be defined by their +start and end conditions. The block begins with a line that meets a +[start condition](@) (after up to three optional spaces of indentation). +It ends with the first subsequent line that meets a matching +[end condition](@), or the last line of the document, or the last line of +the [container block](#container-blocks) containing the current HTML +block, if no line is encountered that meets the [end condition]. If +the first line meets both the [start condition] and the [end +condition], the block will contain just that line. + +1. **Start condition:** line begins with the string `<pre`, +`<script`, `<style`, or `<textarea` (case-insensitive), followed by a space, +a tab, the string `>`, or the end of the line.\ +**End condition:** line contains an end tag +`</pre>`, `</script>`, `</style>`, or `</textarea>` (case-insensitive; it +need not match the start tag). + +2. **Start condition:** line begins with the string `<!--`.\ +**End condition:** line contains the string `-->`. + +3. **Start condition:** line begins with the string `<?`.\ +**End condition:** line contains the string `?>`. + +4. **Start condition:** line begins with the string `<!` +followed by an ASCII letter.\ +**End condition:** line contains the character `>`. + +5. **Start condition:** line begins with the string +`<![CDATA[`.\ +**End condition:** line contains the string `]]>`. + +6. 
**Start condition:** line begins with the string `<` or `</` +followed by one of the strings (case-insensitive) `address`, +`article`, `aside`, `base`, `basefont`, `blockquote`, `body`, +`caption`, `center`, `col`, `colgroup`, `dd`, `details`, `dialog`, +`dir`, `div`, `dl`, `dt`, `fieldset`, `figcaption`, `figure`, +`footer`, `form`, `frame`, `frameset`, +`h1`, `h2`, `h3`, `h4`, `h5`, `h6`, `head`, `header`, `hr`, +`html`, `iframe`, `legend`, `li`, `link`, `main`, `menu`, `menuitem`, +`nav`, `noframes`, `ol`, `optgroup`, `option`, `p`, `param`, +`search`, `section`, `summary`, `table`, `tbody`, `td`, +`tfoot`, `th`, `thead`, `title`, `tr`, `track`, `ul`, followed +by a space, a tab, the end of the line, the string `>`, or +the string `/>`.\ +**End condition:** line is followed by a [blank line]. + +7. **Start condition:** line begins with a complete [open tag] +(with any [tag name] other than `pre`, `script`, +`style`, or `textarea`) or a complete [closing tag], +followed by zero or more spaces and tabs, followed by the end of the line.\ +**End condition:** line is followed by a [blank line]. + +HTML blocks continue until they are closed by their appropriate +[end condition], or the last line of the document or other [container +block](#container-blocks). This means any HTML **within an HTML +block** that might otherwise be recognised as a start condition will +be ignored by the parser and passed through as-is, without changing +the parser's state. + +For instance, `<pre>` within an HTML block started by `<table>` will not affect +the parser state; as the HTML block was started by start condition 6, it +will end at any blank line. This can be surprising: + +```````````````````````````````` example +<table><tr><td> +<pre> +**Hello**, + +_world_. +</pre> +</td></tr></table> +. +<table><tr><td> +<pre> +**Hello**, +<p><em>world</em>. 
+</pre></p> +</td></tr></table> +```````````````````````````````` + +In this case, the HTML block is terminated by the blank line — the `**Hello**` +text remains verbatim — and regular parsing resumes, with a paragraph, +emphasised `world` and inline and block HTML following. + +All types of [HTML blocks] except type 7 may interrupt +a paragraph. Blocks of type 7 may not interrupt a paragraph. +(This restriction is intended to prevent unwanted interpretation +of long tags inside a wrapped paragraph as starting HTML blocks.) + +Some simple examples follow. Here are some basic HTML blocks +of type 6: + +```````````````````````````````` example +<table> + <tr> + <td> + hi + </td> + </tr> +</table> + +okay. +. +<table> + <tr> + <td> + hi + </td> + </tr> +</table> +<p>okay.</p> +```````````````````````````````` + + +```````````````````````````````` example + <div> + *hello* + <foo><a> +. + <div> + *hello* + <foo><a> +```````````````````````````````` + + +A block can also start with a closing tag: + +```````````````````````````````` example +</div> +*foo* +. +</div> +*foo* +```````````````````````````````` + + +Here we have two HTML blocks with a Markdown paragraph between them: + +```````````````````````````````` example +<DIV CLASS="foo"> + +*Markdown* + +</DIV> +. +<DIV CLASS="foo"> +<p><em>Markdown</em></p> +</DIV> +```````````````````````````````` + + +The tag on the first line can be partial, as long +as it is split where there would be whitespace: + +```````````````````````````````` example +<div id="foo" + class="bar"> +</div> +. +<div id="foo" + class="bar"> +</div> +```````````````````````````````` + + +```````````````````````````````` example +<div id="foo" class="bar + baz"> +</div> +. +<div id="foo" class="bar + baz"> +</div> +```````````````````````````````` + + +An open tag need not be closed: +```````````````````````````````` example +<div> +*foo* + +*bar* +. 
+<div> +*foo* +<p><em>bar</em></p> +```````````````````````````````` + + + +A partial tag need not even be completed (garbage +in, garbage out): + +```````````````````````````````` example +<div id="foo" +*hi* +. +<div id="foo" +*hi* +```````````````````````````````` + + +```````````````````````````````` example +<div class +foo +. +<div class +foo +```````````````````````````````` + + +The initial tag doesn't even need to be a valid +tag, as long as it starts like one: + +```````````````````````````````` example +<div *???-&&&-<--- +*foo* +. +<div *???-&&&-<--- +*foo* +```````````````````````````````` + + +In type 6 blocks, the initial tag need not be on a line by +itself: + +```````````````````````````````` example +<div><a href="bar">*foo*</a></div> +. +<div><a href="bar">*foo*</a></div> +```````````````````````````````` + + +```````````````````````````````` example +<table><tr><td> +foo +</td></tr></table> +. +<table><tr><td> +foo +</td></tr></table> +```````````````````````````````` + + +Everything until the next blank line or end of document +gets included in the HTML block. So, in the following +example, what looks like a Markdown code block +is actually part of the HTML block, which continues until a blank +line or the end of the document is reached: + +```````````````````````````````` example +<div></div> +``` c +int x = 33; +``` +. +<div></div> +``` c +int x = 33; +``` +```````````````````````````````` + + +To start an [HTML block] with a tag that is *not* in the +list of block-level tags in (6), you must put the tag by +itself on the first line (and it must be complete): + +```````````````````````````````` example +<a href="foo"> +*bar* +</a> +. +<a href="foo"> +*bar* +</a> +```````````````````````````````` + + +In type 7 blocks, the [tag name] can be anything: + +```````````````````````````````` example +<Warning> +*bar* +</Warning> +. 
+<Warning> +*bar* +</Warning> +```````````````````````````````` + + +```````````````````````````````` example +<i class="foo"> +*bar* +</i> +. +<i class="foo"> +*bar* +</i> +```````````````````````````````` + + +```````````````````````````````` example +</ins> +*bar* +. +</ins> +*bar* +```````````````````````````````` + + +These rules are designed to allow us to work with tags that +can function as either block-level or inline-level tags. +The `<del>` tag is a nice example. We can surround content with +`<del>` tags in three different ways. In this case, we get a raw +HTML block, because the `<del>` tag is on a line by itself: + +```````````````````````````````` example +<del> +*foo* +</del> +. +<del> +*foo* +</del> +```````````````````````````````` + + +In this case, we get a raw HTML block that just includes +the `<del>` tag (because it ends with the following blank +line). So the contents get interpreted as CommonMark: + +```````````````````````````````` example +<del> + +*foo* + +</del> +. +<del> +<p><em>foo</em></p> +</del> +```````````````````````````````` + + +Finally, in this case, the `<del>` tags are interpreted +as [raw HTML] *inside* the CommonMark paragraph. (Because +the tag is not on a line by itself, we get inline HTML +rather than an [HTML block].) + +```````````````````````````````` example +<del>*foo*</del> +. +<p><del><em>foo</em></del></p> +```````````````````````````````` + + +HTML tags designed to contain literal content +(`pre`, `script`, `style`, `textarea`), comments, processing instructions, +and declarations are treated somewhat differently. +Instead of ending at the first blank line, these blocks +end at the first line containing a corresponding end tag. +As a result, these blocks can contain blank lines: + +A pre tag (type 1): + +```````````````````````````````` example +<pre language="haskell"><code> +import Text.HTML.TagSoup + +main :: IO () +main = print $ parseTags tags +</code></pre> +okay +. 
+<pre language="haskell"><code> +import Text.HTML.TagSoup + +main :: IO () +main = print $ parseTags tags +</code></pre> +<p>okay</p> +```````````````````````````````` + + +A script tag (type 1): + +```````````````````````````````` example +<script type="text/javascript"> +// JavaScript example + +document.getElementById("demo").innerHTML = "Hello JavaScript!"; +</script> +okay +. +<script type="text/javascript"> +// JavaScript example + +document.getElementById("demo").innerHTML = "Hello JavaScript!"; +</script> +<p>okay</p> +```````````````````````````````` + + +A textarea tag (type 1): + +```````````````````````````````` example +<textarea> + +*foo* + +_bar_ + +</textarea> +. +<textarea> + +*foo* + +_bar_ + +</textarea> +```````````````````````````````` + +A style tag (type 1): + +```````````````````````````````` example +<style + type="text/css"> +h1 {color:red;} + +p {color:blue;} +</style> +okay +. +<style + type="text/css"> +h1 {color:red;} + +p {color:blue;} +</style> +<p>okay</p> +```````````````````````````````` + + +If there is no matching end tag, the block will end at the +end of the document (or the enclosing [block quote][block quotes] +or [list item][list items]): + +```````````````````````````````` example +<style + type="text/css"> + +foo +. +<style + type="text/css"> + +foo +```````````````````````````````` + + +```````````````````````````````` example +> <div> +> foo + +bar +. +<blockquote> +<div> +foo +</blockquote> +<p>bar</p> +```````````````````````````````` + + +```````````````````````````````` example +- <div> +- foo +. +<ul> +<li> +<div> +</li> +<li>foo</li> +</ul> +```````````````````````````````` + + +The end tag can occur on the same line as the start tag: + +```````````````````````````````` example +<style>p{color:red;}</style> +*foo* +. +<style>p{color:red;}</style> +<p><em>foo</em></p> +```````````````````````````````` + + +```````````````````````````````` example +<!-- foo -->*bar* +*baz* +. 
+<!-- foo -->*bar* +<p><em>baz</em></p> +```````````````````````````````` + + +Note that anything on the last line after the +end tag will be included in the [HTML block]: + +```````````````````````````````` example +<script> +foo +</script>1. *bar* +. +<script> +foo +</script>1. *bar* +```````````````````````````````` + + +A comment (type 2): + +```````````````````````````````` example +<!-- Foo + +bar + baz --> +okay +. +<!-- Foo + +bar + baz --> +<p>okay</p> +```````````````````````````````` + + + +A processing instruction (type 3): + +```````````````````````````````` example +<?php + + echo '>'; + +?> +okay +. +<?php + + echo '>'; + +?> +<p>okay</p> +```````````````````````````````` + + +A declaration (type 4): + +```````````````````````````````` example +<!DOCTYPE html> +. +<!DOCTYPE html> +```````````````````````````````` + + +CDATA (type 5): + +```````````````````````````````` example +<![CDATA[ +function matchwo(a,b) +{ + if (a < b && a < 0) then { + return 1; + + } else { + + return 0; + } +} +]]> +okay +. +<![CDATA[ +function matchwo(a,b) +{ + if (a < b && a < 0) then { + return 1; + + } else { + + return 0; + } +} +]]> +<p>okay</p> +```````````````````````````````` + + +The opening tag can be preceded by up to three spaces of indentation, but not +four: + +```````````````````````````````` example + <!-- foo --> + + <!-- foo --> +. + <!-- foo --> +<pre><code><!-- foo --> +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example + <div> + + <div> +. + <div> +<pre><code><div> +</code></pre> +```````````````````````````````` + + +An HTML block of types 1--6 can interrupt a paragraph, and need not be +preceded by a blank line. + +```````````````````````````````` example +Foo +<div> +bar +</div> +. 
+<p>Foo</p> +<div> +bar +</div> +```````````````````````````````` + + +However, a following blank line is needed, except at the end of +a document, and except for blocks of types 1--5, [above][HTML +block]: + +```````````````````````````````` example +<div> +bar +</div> +*foo* +. +<div> +bar +</div> +*foo* +```````````````````````````````` + + +HTML blocks of type 7 cannot interrupt a paragraph: + +```````````````````````````````` example +Foo +<a href="bar"> +baz +. +<p>Foo +<a href="bar"> +baz</p> +```````````````````````````````` + + +This rule differs from John Gruber's original Markdown syntax +specification, which says: + +> The only restrictions are that block-level HTML elements — +> e.g. `<div>`, `<table>`, `<pre>`, `<p>`, etc. — must be separated from +> surrounding content by blank lines, and the start and end tags of the +> block should not be indented with spaces or tabs. + +In some ways Gruber's rule is more restrictive than the one given +here: + +- It requires that an HTML block be preceded by a blank line. +- It does not allow the start tag to be indented. +- It requires a matching end tag, which it also does not allow to + be indented. + +Most Markdown implementations (including some of Gruber's own) do not +respect all of these restrictions. + +There is one respect, however, in which Gruber's rule is more liberal +than the one given here, since it allows blank lines to occur inside +an HTML block. There are two reasons for disallowing them here. +First, it removes the need to parse balanced tags, which is +expensive and can require backtracking from the end of the document +if no matching end tag is found. Second, it provides a very simple +and flexible way of including Markdown content inside HTML tags: +simply separate the Markdown from the HTML using blank lines: + +Compare: + +```````````````````````````````` example +<div> + +*Emphasized* text. + +</div> +. 
+<div> +<p><em>Emphasized</em> text.</p> +</div> +```````````````````````````````` + + +```````````````````````````````` example +<div> +*Emphasized* text. +</div> +. +<div> +*Emphasized* text. +</div> +```````````````````````````````` + + +Some Markdown implementations have adopted a convention of +interpreting content inside tags as text if the open tag has +the attribute `markdown=1`. The rule given above seems a simpler and +more elegant way of achieving the same expressive power, which is also +much simpler to parse. + +The main potential drawback is that one can no longer paste HTML +blocks into Markdown documents with 100% reliability. However, +*in most cases* this will work fine, because the blank lines in +HTML are usually followed by HTML block tags. For example: + +```````````````````````````````` example +<table> + +<tr> + +<td> +Hi +</td> + +</tr> + +</table> +. +<table> +<tr> +<td> +Hi +</td> +</tr> +</table> +```````````````````````````````` + + +There are problems, however, if the inner tags are indented +*and* separated by blank lines, as then they will be interpreted as +an indented code block: + +```````````````````````````````` example +<table> + + <tr> + + <td> + Hi + </td> + + </tr> + +</table> +. +<table> + <tr> +<pre><code><td> + Hi +</td> +</code></pre> + </tr> +</table> +```````````````````````````````` + + +Fortunately, blank lines are usually not necessary and can be +deleted. The exception is inside `<pre>` tags, but as described +[above][HTML blocks], raw HTML blocks starting with `<pre>` +*can* contain blank lines. 
+ +## Link reference definitions + +A [link reference definition](@) +consists of a [link label], optionally preceded by up to three spaces of +indentation, followed +by a colon (`:`), optional spaces or tabs (including up to one +[line ending]), a [link destination], +optional spaces or tabs (including up to one +[line ending]), and an optional [link +title], which if it is present must be separated +from the [link destination] by spaces or tabs. +No further character may occur. + +A [link reference definition] +does not correspond to a structural element of a document. Instead, it +defines a label which can be used in [reference links] +and reference-style [images] elsewhere in the document. [Link +reference definitions] can come either before or after the links that use +them. + +```````````````````````````````` example +[foo]: /url "title" + +[foo] +. +<p><a href="/url" title="title">foo</a></p> +```````````````````````````````` + + +```````````````````````````````` example + [foo]: + /url + 'the title' + +[foo] +. +<p><a href="/url" title="the title">foo</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[Foo*bar\]]:my_(url) 'title (with parens)' + +[Foo*bar\]] +. +<p><a href="my_(url)" title="title (with parens)">Foo*bar]</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[Foo bar]: +<my url> +'title' + +[Foo bar] +. +<p><a href="my%20url" title="title">Foo bar</a></p> +```````````````````````````````` + + +The title may extend over multiple lines: + +```````````````````````````````` example +[foo]: /url ' +title +line1 +line2 +' + +[foo] +. +<p><a href="/url" title=" +title +line1 +line2 +">foo</a></p> +```````````````````````````````` + + +However, it may not contain a [blank line]: + +```````````````````````````````` example +[foo]: /url 'title + +with blank line' + +[foo] +. 
+<p>[foo]: /url 'title</p> +<p>with blank line'</p> +<p>[foo]</p> +```````````````````````````````` + + +The title may be omitted: + +```````````````````````````````` example +[foo]: +/url + +[foo] +. +<p><a href="/url">foo</a></p> +```````````````````````````````` + + +The link destination may not be omitted: + +```````````````````````````````` example +[foo]: + +[foo] +. +<p>[foo]:</p> +<p>[foo]</p> +```````````````````````````````` + + However, an empty link destination may be specified using + angle brackets: + +```````````````````````````````` example +[foo]: <> + +[foo] +. +<p><a href="">foo</a></p> +```````````````````````````````` + +The title must be separated from the link destination by +spaces or tabs: + +```````````````````````````````` example +[foo]: <bar>(baz) + +[foo] +. +<p>[foo]: <bar>(baz)</p> +<p>[foo]</p> +```````````````````````````````` + + +Both title and destination can contain backslash escapes +and literal backslashes: + +```````````````````````````````` example +[foo]: /url\bar\*baz "foo\"bar\baz" + +[foo] +. +<p><a href="/url%5Cbar*baz" title="foo"bar\baz">foo</a></p> +```````````````````````````````` + + +A link can come before its corresponding definition: + +```````````````````````````````` example +[foo] + +[foo]: url +. +<p><a href="url">foo</a></p> +```````````````````````````````` + + +If there are several matching definitions, the first one takes +precedence: + +```````````````````````````````` example +[foo] + +[foo]: first +[foo]: second +. +<p><a href="first">foo</a></p> +```````````````````````````````` + + +As noted in the section on [Links], matching of labels is +case-insensitive (see [matches]). + +```````````````````````````````` example +[FOO]: /url + +[Foo] +. +<p><a href="/url">Foo</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[ΑΓΩ]: /φου + +[αγω] +. 
+<p><a href="/%CF%86%CE%BF%CF%85">αγω</a></p> +```````````````````````````````` + + +Whether something is a [link reference definition] is +independent of whether the link reference it defines is +used in the document. Thus, for example, the following +document contains just a link reference definition, and +no visible content: + +```````````````````````````````` example +[foo]: /url +. +```````````````````````````````` + + +Here is another one: + +```````````````````````````````` example +[ +foo +]: /url +bar +. +<p>bar</p> +```````````````````````````````` + + +This is not a link reference definition, because there are +characters other than spaces or tabs after the title: + +```````````````````````````````` example +[foo]: /url "title" ok +. +<p>[foo]: /url "title" ok</p> +```````````````````````````````` + + +This is a link reference definition, but it has no title: + +```````````````````````````````` example +[foo]: /url +"title" ok +. +<p>"title" ok</p> +```````````````````````````````` + + +This is not a link reference definition, because it is indented +four spaces: + +```````````````````````````````` example + [foo]: /url "title" + +[foo] +. +<pre><code>[foo]: /url "title" +</code></pre> +<p>[foo]</p> +```````````````````````````````` + + +This is not a link reference definition, because it occurs inside +a code block: + +```````````````````````````````` example +``` +[foo]: /url +``` + +[foo] +. +<pre><code>[foo]: /url +</code></pre> +<p>[foo]</p> +```````````````````````````````` + + +A [link reference definition] cannot interrupt a paragraph. + +```````````````````````````````` example +Foo +[bar]: /baz + +[bar] +. +<p>Foo +[bar]: /baz</p> +<p>[bar]</p> +```````````````````````````````` + + +However, it can directly follow other block elements, such as headings +and thematic breaks, and it need not be followed by a blank line. + +```````````````````````````````` example +# [Foo] +[foo]: /url +> bar +. 
+<h1><a href="/url">Foo</a></h1> +<blockquote> +<p>bar</p> +</blockquote> +```````````````````````````````` + +```````````````````````````````` example +[foo]: /url +bar +=== +[foo] +. +<h1>bar</h1> +<p><a href="/url">foo</a></p> +```````````````````````````````` + +```````````````````````````````` example +[foo]: /url +=== +[foo] +. +<p>=== +<a href="/url">foo</a></p> +```````````````````````````````` + + +Several [link reference definitions] +can occur one after another, without intervening blank lines. + +```````````````````````````````` example +[foo]: /foo-url "foo" +[bar]: /bar-url + "bar" +[baz]: /baz-url + +[foo], +[bar], +[baz] +. +<p><a href="/foo-url" title="foo">foo</a>, +<a href="/bar-url" title="bar">bar</a>, +<a href="/baz-url">baz</a></p> +```````````````````````````````` + + +[Link reference definitions] can occur +inside block containers, like lists and block quotations. They +affect the entire document, not just the container in which they +are defined: + +```````````````````````````````` example +[foo] + +> [foo]: /url +. +<p><a href="/url">foo</a></p> +<blockquote> +</blockquote> +```````````````````````````````` + + +## Paragraphs + +A sequence of non-blank lines that cannot be interpreted as other +kinds of blocks forms a [paragraph](@). +The contents of the paragraph are the result of parsing the +paragraph's raw content as inlines. The paragraph's raw content +is formed by concatenating the lines and removing initial and final +spaces or tabs. + +A simple example with two paragraphs: + +```````````````````````````````` example +aaa + +bbb +. +<p>aaa</p> +<p>bbb</p> +```````````````````````````````` + + +Paragraphs can contain multiple lines, but no blank lines: + +```````````````````````````````` example +aaa +bbb + +ccc +ddd +. +<p>aaa +bbb</p> +<p>ccc +ddd</p> +```````````````````````````````` + + +Multiple blank lines between paragraphs have no effect: + +```````````````````````````````` example +aaa + + +bbb +. 
+<p>aaa</p> +<p>bbb</p> +```````````````````````````````` + + +Leading spaces or tabs are skipped: + +```````````````````````````````` example + aaa + bbb +. +<p>aaa +bbb</p> +```````````````````````````````` + + +Lines after the first may be indented any amount, since indented +code blocks cannot interrupt paragraphs. + +```````````````````````````````` example +aaa + bbb + ccc +. +<p>aaa +bbb +ccc</p> +```````````````````````````````` + + +However, the first line may be preceded by up to three spaces of indentation. +Four spaces of indentation is too many: + +```````````````````````````````` example + aaa +bbb +. +<p>aaa +bbb</p> +```````````````````````````````` + + +```````````````````````````````` example + aaa +bbb +. +<pre><code>aaa +</code></pre> +<p>bbb</p> +```````````````````````````````` + + +Final spaces or tabs are stripped before inline parsing, so a paragraph +that ends with two or more spaces will not end with a [hard line +break]: + +```````````````````````````````` example +aaa +bbb +. +<p>aaa<br /> +bbb</p> +```````````````````````````````` + + +## Blank lines + +[Blank lines] between block-level elements are ignored, +except for the role they play in determining whether a [list] +is [tight] or [loose]. + +Blank lines at the beginning and end of the document are also ignored. + +```````````````````````````````` example + + +aaa + + +# aaa + + +. +<p>aaa</p> +<h1>aaa</h1> +```````````````````````````````` + + + +# Container blocks + +A [container block](#container-blocks) is a block that has other +blocks as its contents. There are two basic kinds of container blocks: +[block quotes] and [list items]. +[Lists] are meta-containers for [list items]. + +We define the syntax for container blocks recursively. The general +form of the definition is: + +> If X is a sequence of blocks, then the result of +> transforming X in such-and-such a way is a container of type Y +> with these blocks as its content. 
+ +So, we explain what counts as a block quote or list item by explaining +how these can be *generated* from their contents. This should suffice +to define the syntax, although it does not give a recipe for *parsing* +these constructions. (A recipe is provided below in the section entitled +[A parsing strategy](#appendix-a-parsing-strategy).) + +## Block quotes + +A [block quote marker](@), +optionally preceded by up to three spaces of indentation, +consists of (a) the character `>` together with a following space of +indentation, or (b) a single character `>` not followed by a space of +indentation. + +The following rules define [block quotes]: + +1. **Basic case.** If a string of lines *Ls* constitute a sequence + of blocks *Bs*, then the result of prepending a [block quote + marker] to the beginning of each line in *Ls* + is a [block quote](#block-quotes) containing *Bs*. + +2. **Laziness.** If a string of lines *Ls* constitute a [block + quote](#block-quotes) with contents *Bs*, then the result of deleting + the initial [block quote marker] from one or + more lines in which the next character other than a space or tab after the + [block quote marker] is [paragraph continuation + text] is a block quote with *Bs* as its content. + [Paragraph continuation text](@) is text + that will be parsed as part of the content of a paragraph, but does + not occur at the beginning of the paragraph. + +3. **Consecutiveness.** A document cannot contain two [block + quotes] in a row unless there is a [blank line] between them. + +Nothing else counts as a [block quote](#block-quotes). + +Here is a simple example: + +```````````````````````````````` example +> # Foo +> bar +> baz +. +<blockquote> +<h1>Foo</h1> +<p>bar +baz</p> +</blockquote> +```````````````````````````````` + + +The space or tab after the `>` characters can be omitted: + +```````````````````````````````` example +># Foo +>bar +> baz +. 
+<blockquote> +<h1>Foo</h1> +<p>bar +baz</p> +</blockquote> +```````````````````````````````` + + +The `>` characters can be preceded by up to three spaces of indentation: + +```````````````````````````````` example + > # Foo + > bar + > baz +. +<blockquote> +<h1>Foo</h1> +<p>bar +baz</p> +</blockquote> +```````````````````````````````` + + +Four spaces of indentation is too many: + +```````````````````````````````` example + > # Foo + > bar + > baz +. +<pre><code>> # Foo +> bar +> baz +</code></pre> +```````````````````````````````` + + +The Laziness clause allows us to omit the `>` before +[paragraph continuation text]: + +```````````````````````````````` example +> # Foo +> bar +baz +. +<blockquote> +<h1>Foo</h1> +<p>bar +baz</p> +</blockquote> +```````````````````````````````` + + +A block quote can contain some lazy and some non-lazy +continuation lines: + +```````````````````````````````` example +> bar +baz +> foo +. +<blockquote> +<p>bar +baz +foo</p> +</blockquote> +```````````````````````````````` + + +Laziness only applies to lines that would have been continuations of +paragraphs had they been prepended with [block quote markers]. +For example, the `> ` cannot be omitted in the second line of + +``` markdown +> foo +> --- +``` + +without changing the meaning: + +```````````````````````````````` example +> foo +--- +. +<blockquote> +<p>foo</p> +</blockquote> +<hr /> +```````````````````````````````` + + +Similarly, if we omit the `> ` in the second line of + +``` markdown +> - foo +> - bar +``` + +then the block quote ends after the first line: + +```````````````````````````````` example +> - foo +- bar +. +<blockquote> +<ul> +<li>foo</li> +</ul> +</blockquote> +<ul> +<li>bar</li> +</ul> +```````````````````````````````` + + +For the same reason, we can't omit the `> ` in front of +subsequent lines of an indented or fenced code block: + +```````````````````````````````` example +> foo + bar +. 
+<blockquote> +<pre><code>foo +</code></pre> +</blockquote> +<pre><code>bar +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example +> ``` +foo +``` +. +<blockquote> +<pre><code></code></pre> +</blockquote> +<p>foo</p> +<pre><code></code></pre> +```````````````````````````````` + + +Note that in the following case, we have a [lazy +continuation line]: + +```````````````````````````````` example +> foo + - bar +. +<blockquote> +<p>foo +- bar</p> +</blockquote> +```````````````````````````````` + + +To see why, note that in + +```markdown +> foo +> - bar +``` + +the `- bar` is indented too far to start a list, and can't +be an indented code block because indented code blocks cannot +interrupt paragraphs, so it is [paragraph continuation text]. + +A block quote can be empty: + +```````````````````````````````` example +> +. +<blockquote> +</blockquote> +```````````````````````````````` + + +```````````````````````````````` example +> +> +> +. +<blockquote> +</blockquote> +```````````````````````````````` + + +A block quote can have initial or final blank lines: + +```````````````````````````````` example +> +> foo +> +. +<blockquote> +<p>foo</p> +</blockquote> +```````````````````````````````` + + +A blank line always separates block quotes: + +```````````````````````````````` example +> foo + +> bar +. +<blockquote> +<p>foo</p> +</blockquote> +<blockquote> +<p>bar</p> +</blockquote> +```````````````````````````````` + + +(Most current Markdown implementations, including John Gruber's +original `Markdown.pl`, will parse this example as a single block quote +with two paragraphs. But it seems better to allow the author to decide +whether two block quotes or one are wanted.) + +Consecutiveness means that if we put these block quotes together, +we get a single block quote: + +```````````````````````````````` example +> foo +> bar +. 
+<blockquote> +<p>foo +bar</p> +</blockquote> +```````````````````````````````` + + +To get a block quote with two paragraphs, use: + +```````````````````````````````` example +> foo +> +> bar +. +<blockquote> +<p>foo</p> +<p>bar</p> +</blockquote> +```````````````````````````````` + + +Block quotes can interrupt paragraphs: + +```````````````````````````````` example +foo +> bar +. +<p>foo</p> +<blockquote> +<p>bar</p> +</blockquote> +```````````````````````````````` + + +In general, blank lines are not needed before or after block +quotes: + +```````````````````````````````` example +> aaa +*** +> bbb +. +<blockquote> +<p>aaa</p> +</blockquote> +<hr /> +<blockquote> +<p>bbb</p> +</blockquote> +```````````````````````````````` + + +However, because of laziness, a blank line is needed between +a block quote and a following paragraph: + +```````````````````````````````` example +> bar +baz +. +<blockquote> +<p>bar +baz</p> +</blockquote> +```````````````````````````````` + + +```````````````````````````````` example +> bar + +baz +. +<blockquote> +<p>bar</p> +</blockquote> +<p>baz</p> +```````````````````````````````` + + +```````````````````````````````` example +> bar +> +baz +. +<blockquote> +<p>bar</p> +</blockquote> +<p>baz</p> +```````````````````````````````` + + +It is a consequence of the Laziness rule that any number +of initial `>`s may be omitted on a continuation line of a +nested block quote: + +```````````````````````````````` example +> > > foo +bar +. +<blockquote> +<blockquote> +<blockquote> +<p>foo +bar</p> +</blockquote> +</blockquote> +</blockquote> +```````````````````````````````` + + +```````````````````````````````` example +>>> foo +> bar +>>baz +. 
+<blockquote> +<blockquote> +<blockquote> +<p>foo +bar +baz</p> +</blockquote> +</blockquote> +</blockquote> +```````````````````````````````` + + +When including an indented code block in a block quote, +remember that the [block quote marker] includes +both the `>` and a following space of indentation. So *five spaces* are needed +after the `>`: + +```````````````````````````````` example +> code + +> not code +. +<blockquote> +<pre><code>code +</code></pre> +</blockquote> +<blockquote> +<p>not code</p> +</blockquote> +```````````````````````````````` + + + +## List items + +A [list marker](@) is a +[bullet list marker] or an [ordered list marker]. + +A [bullet list marker](@) +is a `-`, `+`, or `*` character. + +An [ordered list marker](@) +is a sequence of 1--9 arabic digits (`0-9`), followed by either a +`.` character or a `)` character. (The reason for the length +limit is that with 10 digits we start seeing integer overflows +in some browsers.) + +The following rules define [list items]: + +1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of + blocks *Bs* starting with a character other than a space or tab, and *M* is + a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation, + then the result of prepending *M* and the following spaces to the first line + of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a + list item with *Bs* as its contents. The type of the list item + (bullet or ordered) is determined by the type of its list marker. + If the list item is ordered, then it is also assigned a start + number, based on the ordered list marker. + + Exceptions: + + 1. When the first list item in a [list] interrupts + a paragraph---that is, when it starts on a line that would + otherwise count as [paragraph continuation text]---then (a) + the lines *Ls* must not begin with a blank line, and (b) if + the list item is ordered, the start number must be 1. + 2. 
If any line is a [thematic break][thematic breaks] then + that line is not a list item. + +For example, let *Ls* be the lines + +```````````````````````````````` example +A paragraph +with two lines. + + indented code + +> A block quote. +. +<p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote> +```````````````````````````````` + + +And let *M* be the marker `1.`, and *N* = 2. Then rule #1 says +that the following is an ordered list item with start number 1, +and the same contents as *Ls*: + +```````````````````````````````` example +1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<ol> +<li> +<p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote> +</li> +</ol> +```````````````````````````````` + + +The most important thing to notice is that the position of +the text after the list marker determines how much indentation +is needed in subsequent blocks in the list item. If the list +marker takes up two spaces of indentation, and there are three spaces between +the list marker and the next character other than a space or tab, then blocks +must be indented five spaces in order to fall under the list +item. + +Here are some examples showing how far content must be indented to be +put under the list item: + +```````````````````````````````` example +- one + + two +. +<ul> +<li>one</li> +</ul> +<p>two</p> +```````````````````````````````` + + +```````````````````````````````` example +- one + + two +. +<ul> +<li> +<p>one</p> +<p>two</p> +</li> +</ul> +```````````````````````````````` + + +```````````````````````````````` example + - one + + two +. +<ul> +<li>one</li> +</ul> +<pre><code> two +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example + - one + + two +. 
+<ul> +<li> +<p>one</p> +<p>two</p> +</li> +</ul> +```````````````````````````````` + + +It is tempting to think of this in terms of columns: the continuation +blocks must be indented at least to the column of the first character other than +a space or tab after the list marker. However, that is not quite right. +The spaces of indentation after the list marker determine how much relative +indentation is needed. Which column this indentation reaches will depend on +how the list item is embedded in other constructions, as shown by +this example: + +```````````````````````````````` example + > > 1. one +>> +>> two +. +<blockquote> +<blockquote> +<ol> +<li> +<p>one</p> +<p>two</p> +</li> +</ol> +</blockquote> +</blockquote> +```````````````````````````````` + + +Here `two` occurs in the same column as the list marker `1.`, +but is actually contained in the list item, because there is +sufficient indentation after the last containing blockquote marker. + +The converse is also possible. In the following example, the word `two` +occurs far to the right of the initial text of the list item, `one`, but +it is not considered part of the list item, because it is not indented +far enough past the blockquote marker: + +```````````````````````````````` example +>>- one +>> + > > two +. +<blockquote> +<blockquote> +<ul> +<li>one</li> +</ul> +<p>two</p> +</blockquote> +</blockquote> +```````````````````````````````` + + +Note that at least one space or tab is needed between the list marker and +any following content, so these are not list items: + +```````````````````````````````` example +-one + +2.two +. +<p>-one</p> +<p>2.two</p> +```````````````````````````````` + + +A list item may contain blocks that are separated by more than +one blank line. + +```````````````````````````````` example +- foo + + + bar +. 
+<ul> +<li> +<p>foo</p> +<p>bar</p> +</li> +</ul> +```````````````````````````````` + + +A list item may contain any kind of block: + +```````````````````````````````` example +1. foo + + ``` + bar + ``` + + baz + + > bam +. +<ol> +<li> +<p>foo</p> +<pre><code>bar +</code></pre> +<p>baz</p> +<blockquote> +<p>bam</p> +</blockquote> +</li> +</ol> +```````````````````````````````` + + +A list item that contains an indented code block will preserve +empty lines within the code block verbatim. + +```````````````````````````````` example +- Foo + + bar + + + baz +. +<ul> +<li> +<p>Foo</p> +<pre><code>bar + + +baz +</code></pre> +</li> +</ul> +```````````````````````````````` + +Note that ordered list start numbers must be nine digits or less: + +```````````````````````````````` example +123456789. ok +. +<ol start="123456789"> +<li>ok</li> +</ol> +```````````````````````````````` + + +```````````````````````````````` example +1234567890. not ok +. +<p>1234567890. not ok</p> +```````````````````````````````` + + +A start number may begin with 0s: + +```````````````````````````````` example +0. ok +. +<ol start="0"> +<li>ok</li> +</ol> +```````````````````````````````` + + +```````````````````````````````` example +003. ok +. +<ol start="3"> +<li>ok</li> +</ol> +```````````````````````````````` + + +A start number may not be negative: + +```````````````````````````````` example +-1. not ok +. +<p>-1. not ok</p> +```````````````````````````````` + + + +2. **Item starting with indented code.** If a sequence of lines *Ls* + constitute a sequence of blocks *Bs* starting with an indented code + block, and *M* is a list marker of width *W* followed by + one space of indentation, then the result of prepending *M* and the + following space to the first line of *Ls*, and indenting subsequent lines + of *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents. + If a line is empty, then it need not be indented. 
The type of the + list item (bullet or ordered) is determined by the type of its list + marker. If the list item is ordered, then it is also assigned a + start number, based on the ordered list marker. + +An indented code block will have to be preceded by four spaces of indentation +beyond the edge of the region where text will be included in the list item. +In the following case that is 6 spaces: + +```````````````````````````````` example +- foo + + bar +. +<ul> +<li> +<p>foo</p> +<pre><code>bar +</code></pre> +</li> +</ul> +```````````````````````````````` + + +And in this case it is 11 spaces: + +```````````````````````````````` example + 10. foo + + bar +. +<ol start="10"> +<li> +<p>foo</p> +<pre><code>bar +</code></pre> +</li> +</ol> +```````````````````````````````` + + +If the *first* block in the list item is an indented code block, +then by rule #2, the contents must be preceded by *one* space of indentation +after the list marker: + +```````````````````````````````` example + indented code + +paragraph + + more code +. +<pre><code>indented code +</code></pre> +<p>paragraph</p> +<pre><code>more code +</code></pre> +```````````````````````````````` + + +```````````````````````````````` example +1. indented code + + paragraph + + more code +. +<ol> +<li> +<pre><code>indented code +</code></pre> +<p>paragraph</p> +<pre><code>more code +</code></pre> +</li> +</ol> +```````````````````````````````` + + +Note that an additional space of indentation is interpreted as space +inside the code block: + +```````````````````````````````` example +1. indented code + + paragraph + + more code +. 
+<ol> +<li> +<pre><code> indented code +</code></pre> +<p>paragraph</p> +<pre><code>more code +</code></pre> +</li> +</ol> +```````````````````````````````` + + +Note that rules #1 and #2 only apply to two cases: (a) cases +in which the lines to be included in a list item begin with a +character other than a space or tab, and (b) cases in which +they begin with an indented code +block. In a case like the following, where the first block begins with +three spaces of indentation, the rules do not allow us to form a list item by +indenting the whole thing and prepending a list marker: + +```````````````````````````````` example + foo + +bar +. +<p>foo</p> +<p>bar</p> +```````````````````````````````` + + +```````````````````````````````` example +- foo + + bar +. +<ul> +<li>foo</li> +</ul> +<p>bar</p> +```````````````````````````````` + + +This is not a significant restriction, because when a block is preceded by up to +three spaces of indentation, the indentation can always be removed without +a change in interpretation, allowing rule #1 to be applied. So, in +the above case: + +```````````````````````````````` example +- foo + + bar +. +<ul> +<li> +<p>foo</p> +<p>bar</p> +</li> +</ul> +```````````````````````````````` + + +3. **Item starting with a blank line.** If a sequence of lines *Ls* + starting with a single [blank line] constitute a (possibly empty) + sequence of blocks *Bs*, and *M* is a list marker of width *W*, + then the result of prepending *M* to the first line of *Ls*, and + preceding subsequent lines of *Ls* by *W + 1* spaces of indentation, is a + list item with *Bs* as its contents. + If a line is empty, then it need not be indented. The type of the + list item (bullet or ordered) is determined by the type of its list + marker. If the list item is ordered, then it is also assigned a + start number, based on the ordered list marker. 
+ +Here are some list items that start with a blank line but are not empty: + +```````````````````````````````` example +- + foo +- + ``` + bar + ``` +- + baz +. +<ul> +<li>foo</li> +<li> +<pre><code>bar +</code></pre> +</li> +<li> +<pre><code>baz +</code></pre> +</li> +</ul> +```````````````````````````````` + +When the list item starts with a blank line, the number of spaces +following the list marker doesn't change the required indentation: + +```````````````````````````````` example +- + foo +. +<ul> +<li>foo</li> +</ul> +```````````````````````````````` + + +A list item can begin with at most one blank line. +In the following example, `foo` is not part of the list +item: + +```````````````````````````````` example +- + + foo +. +<ul> +<li></li> +</ul> +<p>foo</p> +```````````````````````````````` + + +Here is an empty bullet list item: + +```````````````````````````````` example +- foo +- +- bar +. +<ul> +<li>foo</li> +<li></li> +<li>bar</li> +</ul> +```````````````````````````````` + + +It does not matter whether there are spaces or tabs following the [list marker]: + +```````````````````````````````` example +- foo +- +- bar +. +<ul> +<li>foo</li> +<li></li> +<li>bar</li> +</ul> +```````````````````````````````` + + +Here is an empty ordered list item: + +```````````````````````````````` example +1. foo +2. +3. bar +. +<ol> +<li>foo</li> +<li></li> +<li>bar</li> +</ol> +```````````````````````````````` + + +A list may start or end with an empty list item: + +```````````````````````````````` example +* +. +<ul> +<li></li> +</ul> +```````````````````````````````` + +However, an empty list item cannot interrupt a paragraph: + +```````````````````````````````` example +foo +* + +foo +1. +. +<p>foo +*</p> +<p>foo +1.</p> +```````````````````````````````` + + +4. 
**Indentation.** If a sequence of lines *Ls* constitutes a list item + according to rule #1, #2, or #3, then the result of preceding each line + of *Ls* by up to three spaces of indentation (the same for each line) also + constitutes a list item with the same contents and attributes. If a line is + empty, then it need not be indented. + +Indented one space: + +```````````````````````````````` example + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<ol> +<li> +<p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote> +</li> +</ol> +```````````````````````````````` + + +Indented two spaces: + +```````````````````````````````` example + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<ol> +<li> +<p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote> +</li> +</ol> +```````````````````````````````` + + +Indented three spaces: + +```````````````````````````````` example + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<ol> +<li> +<p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote> +</li> +</ol> +```````````````````````````````` + + +Four spaces indent gives a code block: + +```````````````````````````````` example + 1. A paragraph + with two lines. + + indented code + + > A block quote. +. +<pre><code>1. A paragraph + with two lines. + + indented code + + > A block quote. +</code></pre> +```````````````````````````````` + + + +5. 
**Laziness.** If a string of lines *Ls* constitute a [list + item](#list-items) with contents *Bs*, then the result of deleting + some or all of the indentation from one or more lines in which the + next character other than a space or tab after the indentation is + [paragraph continuation text] is a + list item with the same contents and attributes. The unindented + lines are called + [lazy continuation line](@)s. + +Here is an example with [lazy continuation lines]: + +```````````````````````````````` example + 1. A paragraph +with two lines. + + indented code + + > A block quote. +. +<ol> +<li> +<p>A paragraph +with two lines.</p> +<pre><code>indented code +</code></pre> +<blockquote> +<p>A block quote.</p> +</blockquote> +</li> +</ol> +```````````````````````````````` + + +Indentation can be partially deleted: + +```````````````````````````````` example + 1. A paragraph + with two lines. +. +<ol> +<li>A paragraph +with two lines.</li> +</ol> +```````````````````````````````` + + +These examples show how laziness can work in nested structures: + +```````````````````````````````` example +> 1. > Blockquote +continued here. +. +<blockquote> +<ol> +<li> +<blockquote> +<p>Blockquote +continued here.</p> +</blockquote> +</li> +</ol> +</blockquote> +```````````````````````````````` + + +```````````````````````````````` example +> 1. > Blockquote +> continued here. +. +<blockquote> +<ol> +<li> +<blockquote> +<p>Blockquote +continued here.</p> +</blockquote> +</li> +</ol> +</blockquote> +```````````````````````````````` + + + +6. **That's all.** Nothing that is not counted as a list item by rules + #1--5 counts as a [list item](#list-items). + +The rules for sublists follow from the general rules +[above][List items]. A sublist must be indented the same number +of spaces of indentation a paragraph would need to be in order to be included +in the list item. 
+ +So, in this case we need two spaces indent: + +```````````````````````````````` example +- foo + - bar + - baz + - boo +. +<ul> +<li>foo +<ul> +<li>bar +<ul> +<li>baz +<ul> +<li>boo</li> +</ul> +</li> +</ul> +</li> +</ul> +</li> +</ul> +```````````````````````````````` + + +One is not enough: + +```````````````````````````````` example +- foo + - bar + - baz + - boo +. +<ul> +<li>foo</li> +<li>bar</li> +<li>baz</li> +<li>boo</li> +</ul> +```````````````````````````````` + + +Here we need four, because the list marker is wider: + +```````````````````````````````` example +10) foo + - bar +. +<ol start="10"> +<li>foo +<ul> +<li>bar</li> +</ul> +</li> +</ol> +```````````````````````````````` + + +Three is not enough: + +```````````````````````````````` example +10) foo + - bar +. +<ol start="10"> +<li>foo</li> +</ol> +<ul> +<li>bar</li> +</ul> +```````````````````````````````` + + +A list may be the first block in a list item: + +```````````````````````````````` example +- - foo +. +<ul> +<li> +<ul> +<li>foo</li> +</ul> +</li> +</ul> +```````````````````````````````` + + +```````````````````````````````` example +1. - 2. foo +. +<ol> +<li> +<ul> +<li> +<ol start="2"> +<li>foo</li> +</ol> +</li> +</ul> +</li> +</ol> +```````````````````````````````` + + +A list item can contain a heading: + +```````````````````````````````` example +- # Foo +- Bar + --- + baz +. +<ul> +<li> +<h1>Foo</h1> +</li> +<li> +<h2>Bar</h2> +baz</li> +</ul> +```````````````````````````````` + + +### Motivation + +John Gruber's Markdown spec says the following about list items: + +1. "List markers typically start at the left margin, but may be indented + by up to three spaces. List markers must be followed by one or more + spaces or a tab." + +2. "To make lists look nice, you can wrap items with hanging indents.... + But if you don't want to, you don't have to." + +3. "List items may consist of multiple paragraphs. 
Each subsequent + paragraph in a list item must be indented by either 4 spaces or one + tab." + +4. "It looks nice if you indent every line of the subsequent paragraphs, + but here again, Markdown will allow you to be lazy." + +5. "To put a blockquote within a list item, the blockquote's `>` + delimiters need to be indented." + +6. "To put a code block within a list item, the code block needs to be + indented twice — 8 spaces or two tabs." + +These rules specify that a paragraph under a list item must be indented +four spaces (presumably, from the left margin, rather than the start of +the list marker, but this is not said), and that code under a list item +must be indented eight spaces instead of the usual four. They also say +that a block quote must be indented, but not by how much; however, the +example given has four spaces indentation. Although nothing is said +about other kinds of block-level content, it is certainly reasonable to +infer that *all* block elements under a list item, including other +lists, must be indented four spaces. This principle has been called the +*four-space rule*. + +The four-space rule is clear and principled, and if the reference +implementation `Markdown.pl` had followed it, it probably would have +become the standard. However, `Markdown.pl` allowed paragraphs and +sublists to start with only two spaces indentation, at least on the +outer level. Worse, its behavior was inconsistent: a sublist of an +outer-level list needed two spaces indentation, but a sublist of this +sublist needed three spaces. It is not surprising, then, that different +implementations of Markdown have developed very different rules for +determining what comes under a list item. (Pandoc and python-Markdown, +for example, stuck with Gruber's syntax description and the four-space +rule, while discount, redcarpet, marked, PHP Markdown, and others +followed `Markdown.pl`'s behavior more closely.) 
+ +Unfortunately, given the divergences between implementations, there +is no way to give a spec for list items that will be guaranteed not +to break any existing documents. However, the spec given here should +correctly handle lists formatted with either the four-space rule or +the more forgiving `Markdown.pl` behavior, provided they are laid out +in a way that is natural for a human to read. + +The strategy here is to let the width and indentation of the list marker +determine the indentation necessary for blocks to fall under the list +item, rather than having a fixed and arbitrary number. The writer can +think of the body of the list item as a unit which gets indented to the +right enough to fit the list marker (and any indentation on the list +marker). (The laziness rule, #5, then allows continuation lines to be +unindented if needed.) + +This rule is superior, we claim, to any rule requiring a fixed level of +indentation from the margin. The four-space rule is clear but +unnatural. It is quite unintuitive that + +``` markdown +- foo + + bar + + - baz +``` + +should be parsed as two lists with an intervening paragraph, + +``` html +<ul> +<li>foo</li> +</ul> +<p>bar</p> +<ul> +<li>baz</li> +</ul> +``` + +as the four-space rule demands, rather than a single list, + +``` html +<ul> +<li> +<p>foo</p> +<p>bar</p> +<ul> +<li>baz</li> +</ul> +</li> +</ul> +``` + +The choice of four spaces is arbitrary. It can be learned, but it is +not likely to be guessed, and it trips up beginners regularly. + +Would it help to adopt a two-space rule? The problem is that such +a rule, together with the rule allowing up to three spaces of indentation for +the initial list marker, allows text that is indented *less than* the +original list marker to be included in the list item. 
For example, +`Markdown.pl` parses + +``` markdown + - one + + two +``` + +as a single list item, with `two` a continuation paragraph: + +``` html +<ul> +<li> +<p>one</p> +<p>two</p> +</li> +</ul> +``` + +and similarly + +``` markdown +> - one +> +> two +``` + +as + +``` html +<blockquote> +<ul> +<li> +<p>one</p> +<p>two</p> +</li> +</ul> +</blockquote> +``` + +This is extremely unintuitive. + +Rather than requiring a fixed indent from the margin, we could require +a fixed indent (say, two spaces, or even one space) from the list marker (which +may itself be indented). This proposal would remove the last anomaly +discussed. Unlike the spec presented above, it would count the following +as a list item with a subparagraph, even though the paragraph `bar` +is not indented as far as the first paragraph `foo`: + +``` markdown + 10. foo + + bar +``` + +Arguably this text does read like a list item with `bar` as a subparagraph, +which may count in favor of the proposal. However, on this proposal indented +code would have to be indented six spaces after the list marker. And this +would break a lot of existing Markdown, which has the pattern: + +``` markdown +1. foo + + indented code +``` + +where the code is indented eight spaces. The spec above, by contrast, will +parse this text as expected, since the code block's indentation is measured +from the beginning of `foo`. + +The one case that needs special treatment is a list item that *starts* +with indented code. How much indentation is required in that case, since +we don't have a "first paragraph" to measure from? Rule #2 simply stipulates +that in such cases, we require one space indentation from the list marker +(and then the normal four spaces for the indented code). This will match the +four-space rule in cases where the list marker plus its initial indentation +takes four spaces (a common case), but diverge in other cases. + +## Lists + +A [list](@) is a sequence of one or more +list items [of the same type]. 
The list items +may be separated by any number of blank lines. + +Two list items are [of the same type](@) +if they begin with a [list marker] of the same type. +Two list markers are of the +same type if (a) they are bullet list markers using the same character +(`-`, `+`, or `*`) or (b) they are ordered list numbers with the same +delimiter (either `.` or `)`). + +A list is an [ordered list](@) +if its constituent list items begin with +[ordered list markers], and a +[bullet list](@) if its constituent list +items begin with [bullet list markers]. + +The [start number](@) +of an [ordered list] is determined by the list number of +its initial list item. The numbers of subsequent list items are +disregarded. + +A list is [loose](@) if any of its constituent +list items are separated by blank lines, or if any of its constituent +list items directly contain two block-level elements with a blank line +between them. Otherwise a list is [tight](@). +(The difference in HTML output is that paragraphs in a loose list are +wrapped in `<p>` tags, while paragraphs in a tight list are not.) + +Changing the bullet or ordered list delimiter starts a new list: + +```````````````````````````````` example +- foo +- bar ++ baz +. +<ul> +<li>foo</li> +<li>bar</li> +</ul> +<ul> +<li>baz</li> +</ul> +```````````````````````````````` + + +```````````````````````````````` example +1. foo +2. bar +3) baz +. +<ol> +<li>foo</li> +<li>bar</li> +</ol> +<ol start="3"> +<li>baz</li> +</ol> +```````````````````````````````` + + +In CommonMark, a list can interrupt a paragraph. That is, +no blank line is needed to separate a paragraph from a following +list: + +```````````````````````````````` example +Foo +- bar +- baz +. +<p>Foo</p> +<ul> +<li>bar</li> +<li>baz</li> +</ul> +```````````````````````````````` + +`Markdown.pl` does not allow this, through fear of triggering a list +via a numeral in a hard-wrapped line: + +``` markdown +The number of windows in my house is +14. 
The number of doors is 6. +``` + +Oddly, though, `Markdown.pl` *does* allow a blockquote to +interrupt a paragraph, even though the same considerations might +apply. + +In CommonMark, we do allow lists to interrupt paragraphs, for +two reasons. First, it is natural and not uncommon for people +to start lists without blank lines: + +``` markdown +I need to buy +- new shoes +- a coat +- a plane ticket +``` + +Second, we are attracted to a + +> [principle of uniformity](@): +> if a chunk of text has a certain +> meaning, it will continue to have the same meaning when put into a +> container block (such as a list item or blockquote). + +(Indeed, the spec for [list items] and [block quotes] presupposes +this principle.) This principle implies that if + +``` markdown + * I need to buy + - new shoes + - a coat + - a plane ticket +``` + +is a list item containing a paragraph followed by a nested sublist, +as all Markdown implementations agree it is (though the paragraph +may be rendered without `<p>` tags, since the list is "tight"), +then + +``` markdown +I need to buy +- new shoes +- a coat +- a plane ticket +``` + +by itself should be a paragraph followed by a nested sublist. + +Since it is well established Markdown practice to allow lists to +interrupt paragraphs inside list items, the [principle of +uniformity] requires us to allow this outside list items as +well. ([reStructuredText](https://docutils.sourceforge.net/rst.html) +takes a different approach, requiring blank lines before lists +even inside other list items.) + +In order to solve the problem of unwanted lists in paragraphs with +hard-wrapped numerals, we allow only lists starting with `1` to +interrupt paragraphs. Thus, + +```````````````````````````````` example +The number of windows in my house is +14. The number of doors is 6. +. +<p>The number of windows in my house is +14. 
The number of doors is 6.</p> +```````````````````````````````` + +We may still get an unintended result in cases like + +```````````````````````````````` example +The number of windows in my house is +1. The number of doors is 6. +. +<p>The number of windows in my house is</p> +<ol> +<li>The number of doors is 6.</li> +</ol> +```````````````````````````````` + +but this rule should prevent most spurious list captures. + +There can be any number of blank lines between items: + +```````````````````````````````` example +- foo + +- bar + + +- baz +. +<ul> +<li> +<p>foo</p> +</li> +<li> +<p>bar</p> +</li> +<li> +<p>baz</p> +</li> +</ul> +```````````````````````````````` + +```````````````````````````````` example +- foo + - bar + - baz + + + bim +. +<ul> +<li>foo +<ul> +<li>bar +<ul> +<li> +<p>baz</p> +<p>bim</p> +</li> +</ul> +</li> +</ul> +</li> +</ul> +```````````````````````````````` + + +To separate consecutive lists of the same type, or to separate a +list from an indented code block that would otherwise be parsed +as a subparagraph of the final list item, you can insert a blank HTML +comment: + +```````````````````````````````` example +- foo +- bar + +<!-- --> + +- baz +- bim +. +<ul> +<li>foo</li> +<li>bar</li> +</ul> +<!-- --> +<ul> +<li>baz</li> +<li>bim</li> +</ul> +```````````````````````````````` + + +```````````````````````````````` example +- foo + + notcode + +- foo + +<!-- --> + + code +. +<ul> +<li> +<p>foo</p> +<p>notcode</p> +</li> +<li> +<p>foo</p> +</li> +</ul> +<!-- --> +<pre><code>code +</code></pre> +```````````````````````````````` + + +List items need not be indented to the same level. The following +list items will be treated as items at the same list level, +since none is indented enough to belong to the previous list +item: + +```````````````````````````````` example +- a + - b + - c + - d + - e + - f +- g +. 
+<ul> +<li>a</li> +<li>b</li> +<li>c</li> +<li>d</li> +<li>e</li> +<li>f</li> +<li>g</li> +</ul> +```````````````````````````````` + + +```````````````````````````````` example +1. a + + 2. b + + 3. c +. +<ol> +<li> +<p>a</p> +</li> +<li> +<p>b</p> +</li> +<li> +<p>c</p> +</li> +</ol> +```````````````````````````````` + +Note, however, that list items may not be preceded by more than +three spaces of indentation. Here `- e` is treated as a paragraph continuation +line, because it is indented more than three spaces: + +```````````````````````````````` example +- a + - b + - c + - d + - e +. +<ul> +<li>a</li> +<li>b</li> +<li>c</li> +<li>d +- e</li> +</ul> +```````````````````````````````` + +And here, `3. c` is treated as an indented code block, +because it is indented four spaces and preceded by a +blank line. + +```````````````````````````````` example +1. a + + 2. b + + 3. c +. +<ol> +<li> +<p>a</p> +</li> +<li> +<p>b</p> +</li> +</ol> +<pre><code>3. c +</code></pre> +```````````````````````````````` + + +This is a loose list, because there is a blank line between +two of the list items: + +```````````````````````````````` example +- a +- b + +- c +. +<ul> +<li> +<p>a</p> +</li> +<li> +<p>b</p> +</li> +<li> +<p>c</p> +</li> +</ul> +```````````````````````````````` + + +So is this, with an empty second item: + +```````````````````````````````` example +* a +* + +* c +. +<ul> +<li> +<p>a</p> +</li> +<li></li> +<li> +<p>c</p> +</li> +</ul> +```````````````````````````````` + + +These are loose lists, even though there are no blank lines between the items, +because one of the items directly contains two block-level elements +with a blank line between them: + +```````````````````````````````` example +- a +- b + + c +- d +. +<ul> +<li> +<p>a</p> +</li> +<li> +<p>b</p> +<p>c</p> +</li> +<li> +<p>d</p> +</li> +</ul> +```````````````````````````````` + + +```````````````````````````````` example +- a +- b + + [ref]: /url +- d +. 
+<ul> +<li> +<p>a</p> +</li> +<li> +<p>b</p> +</li> +<li> +<p>d</p> +</li> +</ul> +```````````````````````````````` + + +This is a tight list, because the blank lines are in a code block: + +```````````````````````````````` example +- a +- ``` + b + + + ``` +- c +. +<ul> +<li>a</li> +<li> +<pre><code>b + + +</code></pre> +</li> +<li>c</li> +</ul> +```````````````````````````````` + + +This is a tight list, because the blank line is between two +paragraphs of a sublist. So the sublist is loose while +the outer list is tight: + +```````````````````````````````` example +- a + - b + + c +- d +. +<ul> +<li>a +<ul> +<li> +<p>b</p> +<p>c</p> +</li> +</ul> +</li> +<li>d</li> +</ul> +```````````````````````````````` + + +This is a tight list, because the blank line is inside the +block quote: + +```````````````````````````````` example +* a + > b + > +* c +. +<ul> +<li>a +<blockquote> +<p>b</p> +</blockquote> +</li> +<li>c</li> +</ul> +```````````````````````````````` + + +This list is tight, because the consecutive block elements +are not separated by blank lines: + +```````````````````````````````` example +- a + > b + ``` + c + ``` +- d +. +<ul> +<li>a +<blockquote> +<p>b</p> +</blockquote> +<pre><code>c +</code></pre> +</li> +<li>d</li> +</ul> +```````````````````````````````` + + +A single-paragraph list is tight: + +```````````````````````````````` example +- a +. +<ul> +<li>a</li> +</ul> +```````````````````````````````` + + +```````````````````````````````` example +- a + - b +. +<ul> +<li>a +<ul> +<li>b</li> +</ul> +</li> +</ul> +```````````````````````````````` + + +This list is loose, because of the blank line between the +two block elements in the list item: + +```````````````````````````````` example +1. ``` + foo + ``` + + bar +. +<ol> +<li> +<pre><code>foo +</code></pre> +<p>bar</p> +</li> +</ol> +```````````````````````````````` + + +Here the outer list is loose, the inner list tight: + +```````````````````````````````` example +* foo + * bar + + baz +. 
+<ul> +<li> +<p>foo</p> +<ul> +<li>bar</li> +</ul> +<p>baz</p> +</li> +</ul> +```````````````````````````````` + + +```````````````````````````````` example +- a + - b + - c + +- d + - e + - f +. +<ul> +<li> +<p>a</p> +<ul> +<li>b</li> +<li>c</li> +</ul> +</li> +<li> +<p>d</p> +<ul> +<li>e</li> +<li>f</li> +</ul> +</li> +</ul> +```````````````````````````````` + + +# Inlines + +Inlines are parsed sequentially from the beginning of the character +stream to the end (left to right, in left-to-right languages). +Thus, for example, in + +```````````````````````````````` example +`hi`lo` +. +<p><code>hi</code>lo`</p> +```````````````````````````````` + +`hi` is parsed as code, leaving the backtick at the end as a literal +backtick. + + + +## Code spans + +A [backtick string](@) +is a string of one or more backtick characters (`` ` ``) that is neither +preceded nor followed by a backtick. + +A [code span](@) begins with a backtick string and ends with +a backtick string of equal length. The contents of the code span are +the characters between these two backtick strings, normalized in the +following ways: + +- First, [line endings] are converted to [spaces]. +- If the resulting string both begins *and* ends with a [space] + character, but does not consist entirely of [space] + characters, a single [space] character is removed from the + front and back. This allows you to include code that begins + or ends with backtick characters, which must be separated by + whitespace from the opening or closing backtick strings. + +This is a simple code span: + +```````````````````````````````` example +`foo` +. +<p><code>foo</code></p> +```````````````````````````````` + + +Here two backticks are used, because the code contains a backtick. +This example also illustrates stripping of a single leading and +trailing space: + +```````````````````````````````` example +`` foo ` bar `` +. 
+<p><code>foo ` bar</code></p> +```````````````````````````````` + + +This example shows the motivation for stripping leading and trailing +spaces: + +```````````````````````````````` example +` `` ` +. +<p><code>``</code></p> +```````````````````````````````` + +Note that only *one* space is stripped: + +```````````````````````````````` example +` `` ` +. +<p><code> `` </code></p> +```````````````````````````````` + +The stripping only happens if the space is on both +sides of the string: + +```````````````````````````````` example +` a` +. +<p><code> a</code></p> +```````````````````````````````` + +Only [spaces], and not [unicode whitespace] in general, are +stripped in this way: + +```````````````````````````````` example +` b ` +. +<p><code> b </code></p> +```````````````````````````````` + +No stripping occurs if the code span contains only spaces: + +```````````````````````````````` example +` ` +` ` +. +<p><code> </code> +<code> </code></p> +```````````````````````````````` + + +[Line endings] are treated like spaces: + +```````````````````````````````` example +`` +foo +bar +baz +`` +. +<p><code>foo bar baz</code></p> +```````````````````````````````` + +```````````````````````````````` example +`` +foo +`` +. +<p><code>foo </code></p> +```````````````````````````````` + + +Interior spaces are not collapsed: + +```````````````````````````````` example +`foo bar +baz` +. +<p><code>foo bar baz</code></p> +```````````````````````````````` + +Note that browsers will typically collapse consecutive spaces +when rendering `<code>` elements, so it is recommended that +the following CSS be used: + + code{white-space: pre-wrap;} + + +Note that backslash escapes do not work in code spans. All backslashes +are treated literally: + +```````````````````````````````` example +`foo\`bar` +. 
+<p><code>foo\</code>bar`</p> +```````````````````````````````` + + +Backslash escapes are never needed, because one can always choose a +string of *n* backtick characters as delimiters, where the code does +not contain any strings of exactly *n* backtick characters. + +```````````````````````````````` example +``foo`bar`` +. +<p><code>foo`bar</code></p> +```````````````````````````````` + +```````````````````````````````` example +` foo `` bar ` +. +<p><code>foo `` bar</code></p> +```````````````````````````````` + + +Code span backticks have higher precedence than any other inline +constructs except HTML tags and autolinks. Thus, for example, this is +not parsed as emphasized text, since the second `*` is part of a code +span: + +```````````````````````````````` example +*foo`*` +. +<p>*foo<code>*</code></p> +```````````````````````````````` + + +And this is not parsed as a link: + +```````````````````````````````` example +[not a `link](/foo`) +. +<p>[not a <code>link](/foo</code>)</p> +```````````````````````````````` + + +Code spans, HTML tags, and autolinks have the same precedence. +Thus, this is code: + +```````````````````````````````` example +`<a href="`">` +. +<p><code><a href="</code>">`</p> +```````````````````````````````` + + +But this is an HTML tag: + +```````````````````````````````` example +<a href="`">` +. +<p><a href="`">`</p> +```````````````````````````````` + + +And this is code: + +```````````````````````````````` example +`<https://foo.bar.`baz>` +. +<p><code><https://foo.bar.</code>baz>`</p> +```````````````````````````````` + + +But this is an autolink: + +```````````````````````````````` example +<https://foo.bar.`baz>` +. +<p><a href="https://foo.bar.%60baz">https://foo.bar.`baz</a>`</p> +```````````````````````````````` + + +When a backtick string is not closed by a matching backtick string, +we just have literal backticks: + +```````````````````````````````` example +```foo`` +. 
+<p>```foo``</p> +```````````````````````````````` + + +```````````````````````````````` example +`foo +. +<p>`foo</p> +```````````````````````````````` + +The following case also illustrates the need for opening and +closing backtick strings to be equal in length: + +```````````````````````````````` example +`foo``bar`` +. +<p>`foo<code>bar</code></p> +```````````````````````````````` + + +## Emphasis and strong emphasis + +John Gruber's original [Markdown syntax +description](https://daringfireball.net/projects/markdown/syntax#em) says: + +> Markdown treats asterisks (`*`) and underscores (`_`) as indicators of +> emphasis. Text wrapped with one `*` or `_` will be wrapped with an HTML +> `<em>` tag; double `*`'s or `_`'s will be wrapped with an HTML `<strong>` +> tag. + +This is enough for most users, but these rules leave much undecided, +especially when it comes to nested emphasis. The original +`Markdown.pl` test suite makes it clear that triple `***` and +`___` delimiters can be used for strong emphasis, and most +implementations have also allowed the following patterns: + +``` markdown +***strong emph*** +***strong** in emph* +***emph* in strong** +**in strong *emph*** +*in emph **strong*** +``` + +The following patterns are less widely supported, but the intent +is clear and they are useful (especially in contexts like bibliography +entries): + +``` markdown +*emph *with emph* in it* +**strong **with strong** in it** +``` + +Many implementations have also restricted intraword emphasis to +the `*` forms, to avoid unwanted emphasis in words containing +internal underscores. (It is best practice to put these in code +spans, but users often do not.) + +``` markdown +internal emphasis: foo*bar*baz +no emphasis: foo_bar_baz +``` + +The rules given below capture all of these patterns, while allowing +for efficient parsing strategies that do not backtrack. + +First, some definitions. 
A [delimiter run](@) is either +a sequence of one or more `*` characters that is not preceded or +followed by a non-backslash-escaped `*` character, or a sequence +of one or more `_` characters that is not preceded or followed by +a non-backslash-escaped `_` character. + +A [left-flanking delimiter run](@) is +a [delimiter run] that is (1) not followed by [Unicode whitespace], +and either (2a) not followed by a [Unicode punctuation character], or +(2b) followed by a [Unicode punctuation character] and +preceded by [Unicode whitespace] or a [Unicode punctuation character]. +For purposes of this definition, the beginning and the end of +the line count as Unicode whitespace. + +A [right-flanking delimiter run](@) is +a [delimiter run] that is (1) not preceded by [Unicode whitespace], +and either (2a) not preceded by a [Unicode punctuation character], or +(2b) preceded by a [Unicode punctuation character] and +followed by [Unicode whitespace] or a [Unicode punctuation character]. +For purposes of this definition, the beginning and the end of +the line count as Unicode whitespace. + +Here are some examples of delimiter runs. + + - left-flanking but not right-flanking: + + ``` + ***abc + _abc + **"abc" + _"abc" + ``` + + - right-flanking but not left-flanking: + + ``` + abc*** + abc_ + "abc"** + "abc"_ + ``` + + - Both left and right-flanking: + + ``` + abc***def + "abc"_"def" + ``` + + - Neither left nor right-flanking: + + ``` + abc *** def + a _ b + ``` + +(The idea of distinguishing left-flanking and right-flanking +delimiter runs based on the character before and the character +after comes from Roopesh Chander's +[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags). +vfmd uses the terminology "emphasis indicator string" instead of "delimiter +run," and its rules for distinguishing left- and right-flanking runs +are a bit more complex than the ones given here.) 
+ +The following rules define emphasis and strong emphasis: + +1. A single `*` character [can open emphasis](@) + iff (if and only if) it is part of a [left-flanking delimiter run]. + +2. A single `_` character [can open emphasis] iff + it is part of a [left-flanking delimiter run] + and either (a) not part of a [right-flanking delimiter run] + or (b) part of a [right-flanking delimiter run] + preceded by a [Unicode punctuation character]. + +3. A single `*` character [can close emphasis](@) + iff it is part of a [right-flanking delimiter run]. + +4. A single `_` character [can close emphasis] iff + it is part of a [right-flanking delimiter run] + and either (a) not part of a [left-flanking delimiter run] + or (b) part of a [left-flanking delimiter run] + followed by a [Unicode punctuation character]. + +5. A double `**` [can open strong emphasis](@) + iff it is part of a [left-flanking delimiter run]. + +6. A double `__` [can open strong emphasis] iff + it is part of a [left-flanking delimiter run] + and either (a) not part of a [right-flanking delimiter run] + or (b) part of a [right-flanking delimiter run] + preceded by a [Unicode punctuation character]. + +7. A double `**` [can close strong emphasis](@) + iff it is part of a [right-flanking delimiter run]. + +8. A double `__` [can close strong emphasis] iff + it is part of a [right-flanking delimiter run] + and either (a) not part of a [left-flanking delimiter run] + or (b) part of a [left-flanking delimiter run] + followed by a [Unicode punctuation character]. + +9. Emphasis begins with a delimiter that [can open emphasis] and ends + with a delimiter that [can close emphasis], and that uses the same + character (`_` or `*`) as the opening delimiter. The + opening and closing delimiters must belong to separate + [delimiter runs]. 
If one of the delimiters can both + open and close emphasis, then the sum of the lengths of the + delimiter runs containing the opening and closing delimiters + must not be a multiple of 3 unless both lengths are + multiples of 3. + +10. Strong emphasis begins with a delimiter that + [can open strong emphasis] and ends with a delimiter that + [can close strong emphasis], and that uses the same character + (`_` or `*`) as the opening delimiter. The + opening and closing delimiters must belong to separate + [delimiter runs]. If one of the delimiters can both open + and close strong emphasis, then the sum of the lengths of + the delimiter runs containing the opening and closing + delimiters must not be a multiple of 3 unless both lengths + are multiples of 3. + +11. A literal `*` character cannot occur at the beginning or end of + `*`-delimited emphasis or `**`-delimited strong emphasis, unless it + is backslash-escaped. + +12. A literal `_` character cannot occur at the beginning or end of + `_`-delimited emphasis or `__`-delimited strong emphasis, unless it + is backslash-escaped. + +Where rules 1--12 above are compatible with multiple parsings, +the following principles resolve ambiguity: + +13. The number of nestings should be minimized. Thus, for example, + an interpretation `<strong>...</strong>` is always preferred to + `<em><em>...</em></em>`. + +14. An interpretation `<em><strong>...</strong></em>` is always + preferred to `<strong><em>...</em></strong>`. + +15. When two potential emphasis or strong emphasis spans overlap, + so that the second begins before the first ends and ends after + the first ends, the first takes precedence. Thus, for example, + `*foo _bar* baz_` is parsed as `<em>foo _bar</em> baz_` rather + than `*foo <em>bar* baz</em>`. + +16. When there are two potential emphasis or strong emphasis spans + with the same closing delimiter, the shorter one (the one that + opens later) takes precedence. 
Thus, for example, + `**foo **bar baz**` is parsed as `**foo <strong>bar baz</strong>` + rather than `<strong>foo **bar baz</strong>`. + +17. Inline code spans, links, images, and HTML tags group more tightly + than emphasis. So, when there is a choice between an interpretation + that contains one of these elements and one that does not, the + former always wins. Thus, for example, `*[foo*](bar)` is + parsed as `*<a href="bar">foo*</a>` rather than as + `<em>[foo</em>](bar)`. + +These rules can be illustrated through a series of examples. + +Rule 1: + +```````````````````````````````` example +*foo bar* +. +<p><em>foo bar</em></p> +```````````````````````````````` + + +This is not emphasis, because the opening `*` is followed by +whitespace, and hence not part of a [left-flanking delimiter run]: + +```````````````````````````````` example +a * foo bar* +. +<p>a * foo bar*</p> +```````````````````````````````` + + +This is not emphasis, because the opening `*` is preceded +by an alphanumeric and followed by punctuation, and hence +not part of a [left-flanking delimiter run]: + +```````````````````````````````` example +a*"foo"* +. +<p>a*"foo"*</p> +```````````````````````````````` + + +Unicode nonbreaking spaces count as whitespace, too: + +```````````````````````````````` example +* a * +. +<p>* a *</p> +```````````````````````````````` + + +Unicode symbols count as punctuation, too: + +```````````````````````````````` example +*$*alpha. + +*£*bravo. + +*€*charlie. +. +<p>*$*alpha.</p> +<p>*£*bravo.</p> +<p>*€*charlie.</p> +```````````````````````````````` + + +Intraword emphasis with `*` is permitted: + +```````````````````````````````` example +foo*bar* +. +<p>foo<em>bar</em></p> +```````````````````````````````` + + +```````````````````````````````` example +5*6*78 +. +<p>5<em>6</em>78</p> +```````````````````````````````` + + +Rule 2: + +```````````````````````````````` example +_foo bar_ +. 
+<p><em>foo bar</em></p> +```````````````````````````````` + + +This is not emphasis, because the opening `_` is followed by +whitespace: + +```````````````````````````````` example +_ foo bar_ +. +<p>_ foo bar_</p> +```````````````````````````````` + + +This is not emphasis, because the opening `_` is preceded +by an alphanumeric and followed by punctuation: + +```````````````````````````````` example +a_"foo"_ +. +<p>a_"foo"_</p> +```````````````````````````````` + + +Emphasis with `_` is not allowed inside words: + +```````````````````````````````` example +foo_bar_ +. +<p>foo_bar_</p> +```````````````````````````````` + + +```````````````````````````````` example +5_6_78 +. +<p>5_6_78</p> +```````````````````````````````` + + +```````````````````````````````` example +пристаням_стремятся_ +. +<p>пристаням_стремятся_</p> +```````````````````````````````` + + +Here `_` does not generate emphasis, because the first delimiter run +is right-flanking and the second left-flanking: + +```````````````````````````````` example +aa_"bb"_cc +. +<p>aa_"bb"_cc</p> +```````````````````````````````` + + +This is emphasis, even though the opening delimiter is +both left- and right-flanking, because it is preceded by +punctuation: + +```````````````````````````````` example +foo-_(bar)_ +. +<p>foo-<em>(bar)</em></p> +```````````````````````````````` + + +Rule 3: + +This is not emphasis, because the closing delimiter does +not match the opening delimiter: + +```````````````````````````````` example +_foo* +. +<p>_foo*</p> +```````````````````````````````` + + +This is not emphasis, because the closing `*` is preceded by +whitespace: + +```````````````````````````````` example +*foo bar * +. +<p>*foo bar *</p> +```````````````````````````````` + + +A line ending also counts as whitespace: + +```````````````````````````````` example +*foo bar +* +. 
+<p>*foo bar +*</p> +```````````````````````````````` + + +This is not emphasis, because the second `*` is +preceded by punctuation and followed by an alphanumeric +(hence it is not part of a [right-flanking delimiter run]): + +```````````````````````````````` example +*(*foo) +. +<p>*(*foo)</p> +```````````````````````````````` + + +The point of this restriction is more easily appreciated +with this example: + +```````````````````````````````` example +*(*foo*)* +. +<p><em>(<em>foo</em>)</em></p> +```````````````````````````````` + + +Intraword emphasis with `*` is allowed: + +```````````````````````````````` example +*foo*bar +. +<p><em>foo</em>bar</p> +```````````````````````````````` + + + +Rule 4: + +This is not emphasis, because the closing `_` is preceded by +whitespace: + +```````````````````````````````` example +_foo bar _ +. +<p>_foo bar _</p> +```````````````````````````````` + + +This is not emphasis, because the second `_` is +preceded by punctuation and followed by an alphanumeric: + +```````````````````````````````` example +_(_foo) +. +<p>_(_foo)</p> +```````````````````````````````` + + +This is emphasis within emphasis: + +```````````````````````````````` example +_(_foo_)_ +. +<p><em>(<em>foo</em>)</em></p> +```````````````````````````````` + + +Intraword emphasis is disallowed for `_`: + +```````````````````````````````` example +_foo_bar +. +<p>_foo_bar</p> +```````````````````````````````` + + +```````````````````````````````` example +_пристаням_стремятся +. +<p>_пристаням_стремятся</p> +```````````````````````````````` + + +```````````````````````````````` example +_foo_bar_baz_ +. +<p><em>foo_bar_baz</em></p> +```````````````````````````````` + + +This is emphasis, even though the closing delimiter is +both left- and right-flanking, because it is followed by +punctuation: + +```````````````````````````````` example +_(bar)_. +.
+<p><em>(bar)</em>.</p> +```````````````````````````````` + + +Rule 5: + +```````````````````````````````` example +**foo bar** +. +<p><strong>foo bar</strong></p> +```````````````````````````````` + + +This is not strong emphasis, because the opening delimiter is +followed by whitespace: + +```````````````````````````````` example +** foo bar** +. +<p>** foo bar**</p> +```````````````````````````````` + + +This is not strong emphasis, because the opening `**` is preceded +by an alphanumeric and followed by punctuation, and hence +not part of a [left-flanking delimiter run]: + +```````````````````````````````` example +a**"foo"** +. +<p>a**"foo"**</p> +```````````````````````````````` + + +Intraword strong emphasis with `**` is permitted: + +```````````````````````````````` example +foo**bar** +. +<p>foo<strong>bar</strong></p> +```````````````````````````````` + + +Rule 6: + +```````````````````````````````` example +__foo bar__ +. +<p><strong>foo bar</strong></p> +```````````````````````````````` + + +This is not strong emphasis, because the opening delimiter is +followed by whitespace: + +```````````````````````````````` example +__ foo bar__ +. +<p>__ foo bar__</p> +```````````````````````````````` + + +A line ending counts as whitespace: +```````````````````````````````` example +__ +foo bar__ +. +<p>__ +foo bar__</p> +```````````````````````````````` + + +This is not strong emphasis, because the opening `__` is preceded +by an alphanumeric and followed by punctuation: + +```````````````````````````````` example +a__"foo"__ +. +<p>a__"foo"__</p> +```````````````````````````````` + + +Intraword strong emphasis is forbidden with `__`: + +```````````````````````````````` example +foo__bar__ +. +<p>foo__bar__</p> +```````````````````````````````` + + +```````````````````````````````` example +5__6__78 +. +<p>5__6__78</p> +```````````````````````````````` + + +```````````````````````````````` example +пристаням__стремятся__ +. 
+<p>пристаням__стремятся__</p> +```````````````````````````````` + + +```````````````````````````````` example +__foo, __bar__, baz__ +. +<p><strong>foo, <strong>bar</strong>, baz</strong></p> +```````````````````````````````` + + +This is strong emphasis, even though the opening delimiter is +both left- and right-flanking, because it is preceded by +punctuation: + +```````````````````````````````` example +foo-__(bar)__ +. +<p>foo-<strong>(bar)</strong></p> +```````````````````````````````` + + + +Rule 7: + +This is not strong emphasis, because the closing delimiter is preceded +by whitespace: + +```````````````````````````````` example +**foo bar ** +. +<p>**foo bar **</p> +```````````````````````````````` + + +(Nor can it be interpreted as an emphasized `*foo bar *`, because of +Rule 11.) + +This is not strong emphasis, because the second `**` is +preceded by punctuation and followed by an alphanumeric: + +```````````````````````````````` example +**(**foo) +. +<p>**(**foo)</p> +```````````````````````````````` + + +The point of this restriction is more easily appreciated +with these examples: + +```````````````````````````````` example +*(**foo**)* +. +<p><em>(<strong>foo</strong>)</em></p> +```````````````````````````````` + + +```````````````````````````````` example +**Gomphocarpus (*Gomphocarpus physocarpus*, syn. +*Asclepias physocarpa*)** +. +<p><strong>Gomphocarpus (<em>Gomphocarpus physocarpus</em>, syn. +<em>Asclepias physocarpa</em>)</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +**foo "*bar*" foo** +. +<p><strong>foo "<em>bar</em>" foo</strong></p> +```````````````````````````````` + + +Intraword emphasis: + +```````````````````````````````` example +**foo**bar +. +<p><strong>foo</strong>bar</p> +```````````````````````````````` + + +Rule 8: + +This is not strong emphasis, because the closing delimiter is +preceded by whitespace: + +```````````````````````````````` example +__foo bar __ +. 
+<p>__foo bar __</p> +```````````````````````````````` + + +This is not strong emphasis, because the second `__` is +preceded by punctuation and followed by an alphanumeric: + +```````````````````````````````` example +__(__foo) +. +<p>__(__foo)</p> +```````````````````````````````` + + +The point of this restriction is more easily appreciated +with this example: + +```````````````````````````````` example +_(__foo__)_ +. +<p><em>(<strong>foo</strong>)</em></p> +```````````````````````````````` + + +Intraword strong emphasis is forbidden with `__`: + +```````````````````````````````` example +__foo__bar +. +<p>__foo__bar</p> +```````````````````````````````` + + +```````````````````````````````` example +__пристаням__стремятся +. +<p>__пристаням__стремятся</p> +```````````````````````````````` + + +```````````````````````````````` example +__foo__bar__baz__ +. +<p><strong>foo__bar__baz</strong></p> +```````````````````````````````` + + +This is strong emphasis, even though the closing delimiter is +both left- and right-flanking, because it is followed by +punctuation: + +```````````````````````````````` example +__(bar)__. +. +<p><strong>(bar)</strong>.</p> +```````````````````````````````` + + +Rule 9: + +Any nonempty sequence of inline elements can be the contents of an +emphasized span. + +```````````````````````````````` example +*foo [bar](/url)* +. +<p><em>foo <a href="/url">bar</a></em></p> +```````````````````````````````` + + +```````````````````````````````` example +*foo +bar* +. +<p><em>foo +bar</em></p> +```````````````````````````````` + + +In particular, emphasis and strong emphasis can be nested +inside emphasis: + +```````````````````````````````` example +_foo __bar__ baz_ +. +<p><em>foo <strong>bar</strong> baz</em></p> +```````````````````````````````` + + +```````````````````````````````` example +_foo _bar_ baz_ +. 
+<p><em>foo <em>bar</em> baz</em></p> +```````````````````````````````` + + +```````````````````````````````` example +__foo_ bar_ +. +<p><em><em>foo</em> bar</em></p> +```````````````````````````````` + + +```````````````````````````````` example +*foo *bar** +. +<p><em>foo <em>bar</em></em></p> +```````````````````````````````` + + +```````````````````````````````` example +*foo **bar** baz* +. +<p><em>foo <strong>bar</strong> baz</em></p> +```````````````````````````````` + +```````````````````````````````` example +*foo**bar**baz* +. +<p><em>foo<strong>bar</strong>baz</em></p> +```````````````````````````````` + +Note that in the preceding case, the interpretation + +``` markdown +<p><em>foo</em><em>bar<em></em>baz</em></p> +``` + + +is precluded by the condition that a delimiter that +can both open and close (like the `*` after `foo`) +cannot form emphasis if the sum of the lengths of +the delimiter runs containing the opening and +closing delimiters is a multiple of 3 unless +both lengths are multiples of 3. + + +For the same reason, we don't get two consecutive +emphasis sections in this example: + +```````````````````````````````` example +*foo**bar* +. +<p><em>foo**bar</em></p> +```````````````````````````````` + + +The same condition ensures that the following +cases are all strong emphasis nested inside +emphasis, even when the interior whitespace is +omitted: + + +```````````````````````````````` example +***foo** bar* +. +<p><em><strong>foo</strong> bar</em></p> +```````````````````````````````` + + +```````````````````````````````` example +*foo **bar*** +. +<p><em>foo <strong>bar</strong></em></p> +```````````````````````````````` + + +```````````````````````````````` example +*foo**bar*** +. 
+<p><em>foo<strong>bar</strong></em></p> +```````````````````````````````` + + +When the lengths of the interior closing and opening +delimiter runs are *both* multiples of 3, though, +they can match to create emphasis: + +```````````````````````````````` example +foo***bar***baz +. +<p>foo<em><strong>bar</strong></em>baz</p> +```````````````````````````````` + +```````````````````````````````` example +foo******bar*********baz +. +<p>foo<strong><strong><strong>bar</strong></strong></strong>***baz</p> +```````````````````````````````` + + +Indefinite levels of nesting are possible: + +```````````````````````````````` example +*foo **bar *baz* bim** bop* +. +<p><em>foo <strong>bar <em>baz</em> bim</strong> bop</em></p> +```````````````````````````````` + + +```````````````````````````````` example +*foo [*bar*](/url)* +. +<p><em>foo <a href="/url"><em>bar</em></a></em></p> +```````````````````````````````` + + +There can be no empty emphasis or strong emphasis: + +```````````````````````````````` example +** is not an empty emphasis +. +<p>** is not an empty emphasis</p> +```````````````````````````````` + + +```````````````````````````````` example +**** is not an empty strong emphasis +. +<p>**** is not an empty strong emphasis</p> +```````````````````````````````` + + + +Rule 10: + +Any nonempty sequence of inline elements can be the contents of a +strongly emphasized span. + +```````````````````````````````` example +**foo [bar](/url)** +. +<p><strong>foo <a href="/url">bar</a></strong></p> +```````````````````````````````` + + +```````````````````````````````` example +**foo +bar** +. +<p><strong>foo +bar</strong></p> +```````````````````````````````` + + +In particular, emphasis and strong emphasis can be nested +inside strong emphasis: + +```````````````````````````````` example +__foo _bar_ baz__ +. +<p><strong>foo <em>bar</em> baz</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +__foo __bar__ baz__ +.
+<p><strong>foo <strong>bar</strong> baz</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +____foo__ bar__ +. +<p><strong><strong>foo</strong> bar</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +**foo **bar**** +. +<p><strong>foo <strong>bar</strong></strong></p> +```````````````````````````````` + + +```````````````````````````````` example +**foo *bar* baz** +. +<p><strong>foo <em>bar</em> baz</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +**foo*bar*baz** +. +<p><strong>foo<em>bar</em>baz</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +***foo* bar** +. +<p><strong><em>foo</em> bar</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +**foo *bar*** +. +<p><strong>foo <em>bar</em></strong></p> +```````````````````````````````` + + +Indefinite levels of nesting are possible: + +```````````````````````````````` example +**foo *bar **baz** +bim* bop** +. +<p><strong>foo <em>bar <strong>baz</strong> +bim</em> bop</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +**foo [*bar*](/url)** +. +<p><strong>foo <a href="/url"><em>bar</em></a></strong></p> +```````````````````````````````` + + +There can be no empty emphasis or strong emphasis: + +```````````````````````````````` example +__ is not an empty emphasis +. +<p>__ is not an empty emphasis</p> +```````````````````````````````` + + +```````````````````````````````` example +____ is not an empty strong emphasis +. +<p>____ is not an empty strong emphasis</p> +```````````````````````````````` + + + +Rule 11: + +```````````````````````````````` example +foo *** +. +<p>foo ***</p> +```````````````````````````````` + + +```````````````````````````````` example +foo *\** +. 
+<p>foo <em>*</em></p> +```````````````````````````````` + + +```````````````````````````````` example +foo *_* +. +<p>foo <em>_</em></p> +```````````````````````````````` + + +```````````````````````````````` example +foo ***** +. +<p>foo *****</p> +```````````````````````````````` + + +```````````````````````````````` example +foo **\*** +. +<p>foo <strong>*</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +foo **_** +. +<p>foo <strong>_</strong></p> +```````````````````````````````` + + +Note that when delimiters do not match evenly, Rule 11 determines +that the excess literal `*` characters will appear outside of the +emphasis, rather than inside it: + +```````````````````````````````` example +**foo* +. +<p>*<em>foo</em></p> +```````````````````````````````` + + +```````````````````````````````` example +*foo** +. +<p><em>foo</em>*</p> +```````````````````````````````` + + +```````````````````````````````` example +***foo** +. +<p>*<strong>foo</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +****foo* +. +<p>***<em>foo</em></p> +```````````````````````````````` + + +```````````````````````````````` example +**foo*** +. +<p><strong>foo</strong>*</p> +```````````````````````````````` + + +```````````````````````````````` example +*foo**** +. +<p><em>foo</em>***</p> +```````````````````````````````` + + + +Rule 12: + +```````````````````````````````` example +foo ___ +. +<p>foo ___</p> +```````````````````````````````` + + +```````````````````````````````` example +foo _\__ +. +<p>foo <em>_</em></p> +```````````````````````````````` + + +```````````````````````````````` example +foo _*_ +. +<p>foo <em>*</em></p> +```````````````````````````````` + + +```````````````````````````````` example +foo _____ +. +<p>foo _____</p> +```````````````````````````````` + + +```````````````````````````````` example +foo __\___ +. 
+<p>foo <strong>_</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +foo __*__ +. +<p>foo <strong>*</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +__foo_ +. +<p>_<em>foo</em></p> +```````````````````````````````` + + +Note that when delimiters do not match evenly, Rule 12 determines +that the excess literal `_` characters will appear outside of the +emphasis, rather than inside it: + +```````````````````````````````` example +_foo__ +. +<p><em>foo</em>_</p> +```````````````````````````````` + + +```````````````````````````````` example +___foo__ +. +<p>_<strong>foo</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +____foo_ +. +<p>___<em>foo</em></p> +```````````````````````````````` + + +```````````````````````````````` example +__foo___ +. +<p><strong>foo</strong>_</p> +```````````````````````````````` + + +```````````````````````````````` example +_foo____ +. +<p><em>foo</em>___</p> +```````````````````````````````` + + +Rule 13 implies that if you want emphasis nested directly inside +emphasis, you must use different delimiters: + +```````````````````````````````` example +**foo** +. +<p><strong>foo</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +*_foo_* +. +<p><em><em>foo</em></em></p> +```````````````````````````````` + + +```````````````````````````````` example +__foo__ +. +<p><strong>foo</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +_*foo*_ +. +<p><em><em>foo</em></em></p> +```````````````````````````````` + + +However, strong emphasis within strong emphasis is possible without +switching delimiters: + +```````````````````````````````` example +****foo**** +. +<p><strong><strong>foo</strong></strong></p> +```````````````````````````````` + + +```````````````````````````````` example +____foo____ +. 
+<p><strong><strong>foo</strong></strong></p> +```````````````````````````````` + + + +Rule 13 can be applied to arbitrarily long sequences of +delimiters: + +```````````````````````````````` example +******foo****** +. +<p><strong><strong><strong>foo</strong></strong></strong></p> +```````````````````````````````` + + +Rule 14: + +```````````````````````````````` example +***foo*** +. +<p><em><strong>foo</strong></em></p> +```````````````````````````````` + + +```````````````````````````````` example +_____foo_____ +. +<p><em><strong><strong>foo</strong></strong></em></p> +```````````````````````````````` + + +Rule 15: + +```````````````````````````````` example +*foo _bar* baz_ +. +<p><em>foo _bar</em> baz_</p> +```````````````````````````````` + + +```````````````````````````````` example +*foo __bar *baz bim__ bam* +. +<p><em>foo <strong>bar *baz bim</strong> bam</em></p> +```````````````````````````````` + + +Rule 16: + +```````````````````````````````` example +**foo **bar baz** +. +<p>**foo <strong>bar baz</strong></p> +```````````````````````````````` + + +```````````````````````````````` example +*foo *bar baz* +. +<p>*foo <em>bar baz</em></p> +```````````````````````````````` + + +Rule 17: + +```````````````````````````````` example +*[bar*](/url) +. +<p>*<a href="/url">bar*</a></p> +```````````````````````````````` + + +```````````````````````````````` example +_foo [bar_](/url) +. +<p>_foo <a href="/url">bar_</a></p> +```````````````````````````````` + + +```````````````````````````````` example +*<img src="foo" title="*"/> +. +<p>*<img src="foo" title="*"/></p> +```````````````````````````````` + + +```````````````````````````````` example +**<a href="**"> +. +<p>**<a href="**"></p> +```````````````````````````````` + + +```````````````````````````````` example +__<a href="__"> +. +<p>__<a href="__"></p> +```````````````````````````````` + + +```````````````````````````````` example +*a `*`* +. 
+<p><em>a <code>*</code></em></p> +```````````````````````````````` + + +```````````````````````````````` example +_a `_`_ +. +<p><em>a <code>_</code></em></p> +```````````````````````````````` + + +```````````````````````````````` example +**a<https://foo.bar/?q=**> +. +<p>**a<a href="https://foo.bar/?q=**">https://foo.bar/?q=**</a></p> +```````````````````````````````` + + +```````````````````````````````` example +__a<https://foo.bar/?q=__> +. +<p>__a<a href="https://foo.bar/?q=__">https://foo.bar/?q=__</a></p> +```````````````````````````````` + + + +## Links + +A link contains [link text] (the visible text), a [link destination] +(the URI that is the link destination), and optionally a [link title]. +There are two basic kinds of links in Markdown. In [inline links] the +destination and title are given immediately after the link text. In +[reference links] the destination and title are defined elsewhere in +the document. + +A [link text](@) consists of a sequence of zero or more +inline elements enclosed by square brackets (`[` and `]`). The +following rules apply: + +- Links may not contain other links, at any level of nesting. If + multiple otherwise valid link definitions appear nested inside each + other, the inner-most definition is used. + +- Brackets are allowed in the [link text] only if (a) they + are backslash-escaped or (b) they appear as a matched pair of brackets, + with an open bracket `[`, a sequence of zero or more inlines, and + a close bracket `]`. + +- Backtick [code spans], [autolinks], and raw [HTML tags] bind more tightly + than the brackets in link text. Thus, for example, + `` [foo`]` `` could not be a link text, since the second `]` + is part of a code span. + +- The brackets in link text bind more tightly than markers for + [emphasis and strong emphasis]. Thus, for example, `*[foo*](url)` is a link. 
+ +A [link destination](@) consists of either + +- a sequence of zero or more characters between an opening `<` and a + closing `>` that contains no line endings or unescaped + `<` or `>` characters, or + +- a nonempty sequence of characters that does not start with `<`, + does not include [ASCII control characters][ASCII control character] + or [space] characters, and includes parentheses only if (a) they are + backslash-escaped or (b) they are part of a balanced pair of + unescaped parentheses. + (Implementations may impose limits on parentheses nesting to + avoid performance issues, but at least three levels of nesting + should be supported.) + +A [link title](@) consists of either + +- a sequence of zero or more characters between straight double-quote + characters (`"`), including a `"` character only if it is + backslash-escaped, or + +- a sequence of zero or more characters between straight single-quote + characters (`'`), including a `'` character only if it is + backslash-escaped, or + +- a sequence of zero or more characters between matching parentheses + (`(...)`), including a `(` or `)` character only if it is + backslash-escaped. + +Although [link titles] may span multiple lines, they may not contain +a [blank line]. + +An [inline link](@) consists of a [link text] followed immediately +by a left parenthesis `(`, an optional [link destination], an optional +[link title], and a right parenthesis `)`. +These four components may be separated by spaces, tabs, and up to one line +ending. +If both [link destination] and [link title] are present, they *must* be +separated by spaces, tabs, and up to one line ending. + +The link's text consists of the inlines contained +in the [link text] (excluding the enclosing square brackets). +The link's URI consists of the link destination, excluding enclosing +`<...>` if present, with backslash-escapes in effect as described +above.
The link's title consists of the link title, excluding its +enclosing delimiters, with backslash-escapes in effect as described +above. + +Here is a simple inline link: + +```````````````````````````````` example +[link](/uri "title") +. +<p><a href="/uri" title="title">link</a></p> +```````````````````````````````` + + +The title, the link text and even +the destination may be omitted: + +```````````````````````````````` example +[link](/uri) +. +<p><a href="/uri">link</a></p> +```````````````````````````````` + +```````````````````````````````` example +[](./target.md) +. +<p><a href="./target.md"></a></p> +```````````````````````````````` + + +```````````````````````````````` example +[link]() +. +<p><a href="">link</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[link](<>) +. +<p><a href="">link</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[]() +. +<p><a href=""></a></p> +```````````````````````````````` + +The destination can only contain spaces if it is +enclosed in pointy brackets: + +```````````````````````````````` example +[link](/my uri) +. +<p>[link](/my uri)</p> +```````````````````````````````` + +```````````````````````````````` example +[link](</my uri>) +. +<p><a href="/my%20uri">link</a></p> +```````````````````````````````` + +The destination cannot contain line endings, +even if enclosed in pointy brackets: + +```````````````````````````````` example +[link](foo +bar) +. +<p>[link](foo +bar)</p> +```````````````````````````````` + +```````````````````````````````` example +[link](<foo +bar>) +. +<p>[link](<foo +bar>)</p> +```````````````````````````````` + +The destination can contain `)` if it is enclosed +in pointy brackets: + +```````````````````````````````` example +[a](<b)c>) +. 
+<p><a href="b)c">a</a></p> +```````````````````````````````` + +Pointy brackets that enclose links must be unescaped: + +```````````````````````````````` example +[link](<foo\>) +. +<p>[link](<foo>)</p> +```````````````````````````````` + +These are not links, because the opening pointy bracket +is not matched properly: + +```````````````````````````````` example +[a](<b)c +[a](<b)c> +[a](<b>c) +. +<p>[a](<b)c +[a](<b)c> +[a](<b>c)</p> +```````````````````````````````` + +Parentheses inside the link destination may be escaped: + +```````````````````````````````` example +[link](\(foo\)) +. +<p><a href="(foo)">link</a></p> +```````````````````````````````` + +Any number of parentheses are allowed without escaping, as long as they are +balanced: + +```````````````````````````````` example +[link](foo(and(bar))) +. +<p><a href="foo(and(bar))">link</a></p> +```````````````````````````````` + +However, if you have unbalanced parentheses, you need to escape or use the +`<...>` form: + +```````````````````````````````` example +[link](foo(and(bar)) +. +<p>[link](foo(and(bar))</p> +```````````````````````````````` + + +```````````````````````````````` example +[link](foo\(and\(bar\)) +. +<p><a href="foo(and(bar)">link</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[link](<foo(and(bar)>) +. +<p><a href="foo(and(bar)">link</a></p> +```````````````````````````````` + + +Parentheses and other symbols can also be escaped, as usual +in Markdown: + +```````````````````````````````` example +[link](foo\)\:) +. +<p><a href="foo):">link</a></p> +```````````````````````````````` + + +A link can contain fragment identifiers and queries: + +```````````````````````````````` example +[link](#fragment) + +[link](https://example.com#fragment) + +[link](https://example.com?foo=3#frag) +. 
+<p><a href="#fragment">link</a></p> +<p><a href="https://example.com#fragment">link</a></p> +<p><a href="https://example.com?foo=3#frag">link</a></p> +```````````````````````````````` + + +Note that a backslash before a non-escapable character is +just a backslash: + +```````````````````````````````` example +[link](foo\bar) +. +<p><a href="foo%5Cbar">link</a></p> +```````````````````````````````` + + +URL-escaping should be left alone inside the destination, as all +URL-escaped characters are also valid URL characters. Entity and +numerical character references in the destination will be parsed +into the corresponding Unicode code points, as usual. These may +be optionally URL-escaped when written as HTML, but this spec +does not enforce any particular policy for rendering URLs in +HTML or other formats. Renderers may make different decisions +about how to escape or normalize URLs in the output. + +```````````````````````````````` example +[link](foo%20b&auml;) +. +<p><a href="foo%20b%C3%A4">link</a></p> +```````````````````````````````` + + +Note that, because titles can often be parsed as destinations, +if you try to omit the destination and keep the title, you'll +get unexpected results: + +```````````````````````````````` example +[link]("title") +. +<p><a href="%22title%22">link</a></p> +```````````````````````````````` + + +Titles may be in single quotes, double quotes, or parentheses: + +```````````````````````````````` example +[link](/url "title") +[link](/url 'title') +[link](/url (title)) +. +<p><a href="/url" title="title">link</a> +<a href="/url" title="title">link</a> +<a href="/url" title="title">link</a></p> +```````````````````````````````` + + +Backslash escapes and entity and numeric character references +may be used in titles: + +```````````````````````````````` example +[link](/url "title \"&quot;") +.
+<p><a href="/url" title="title &quot;&quot;">link</a></p> +```````````````````````````````` + + +Titles must be separated from the link using spaces, tabs, and up to one line +ending. +Other [Unicode whitespace] like non-breaking space doesn't work. + +```````````````````````````````` example +[link](/url "title") +. +<p><a href="/url%C2%A0%22title%22">link</a></p> +```````````````````````````````` + + +Nested balanced quotes are not allowed without escaping: + +```````````````````````````````` example +[link](/url "title "and" title") +. +<p>[link](/url "title "and" title")</p> +```````````````````````````````` + + +But it is easy to work around this by using a different quote type: + +```````````````````````````````` example +[link](/url 'title "and" title') +. +<p><a href="/url" title="title &quot;and&quot; title">link</a></p> +```````````````````````````````` + + +(Note: `Markdown.pl` did allow double quotes inside a double-quoted +title, and its test suite included a test demonstrating this. +But it is hard to see a good rationale for the extra complexity this +brings, since there are already many ways---backslash escaping, +entity and numeric character references, or using a different +quote type for the enclosing title---to write titles containing +double quotes. `Markdown.pl`'s handling of titles has a number +of other strange features. For example, it allows single-quoted +titles in inline links, but not reference links. And, in +reference links but not inline links, it allows a title to begin +with `"` and end with `)`. `Markdown.pl` 1.0.1 even allows +titles with no closing quotation mark, though 1.0.2b8 does not. +It seems preferable to adopt a simple, rational rule that works +the same way in inline links and link reference definitions.) + +Spaces, tabs, and up to one line ending are allowed around the destination and +title: + +```````````````````````````````` example +[link]( /uri + "title" ) +.
+<p><a href="/uri" title="title">link</a></p> +```````````````````````````````` + + +But it is not allowed between the link text and the +following parenthesis: + +```````````````````````````````` example +[link] (/uri) +. +<p>[link] (/uri)</p> +```````````````````````````````` + + +The link text may contain balanced brackets, but not unbalanced ones, +unless they are escaped: + +```````````````````````````````` example +[link [foo [bar]]](/uri) +. +<p><a href="/uri">link [foo [bar]]</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[link] bar](/uri) +. +<p>[link] bar](/uri)</p> +```````````````````````````````` + + +```````````````````````````````` example +[link [bar](/uri) +. +<p>[link <a href="/uri">bar</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[link \[bar](/uri) +. +<p><a href="/uri">link [bar</a></p> +```````````````````````````````` + + +The link text may contain inline content: + +```````````````````````````````` example +[link *foo **bar** `#`*](/uri) +. +<p><a href="/uri">link <em>foo <strong>bar</strong> <code>#</code></em></a></p> +```````````````````````````````` + + +```````````````````````````````` example +[![moon](moon.jpg)](/uri) +. +<p><a href="/uri"><img src="moon.jpg" alt="moon" /></a></p> +```````````````````````````````` + + +However, links may not contain other links, at any level of nesting. + +```````````````````````````````` example +[foo [bar](/uri)](/uri) +. +<p>[foo <a href="/uri">bar</a>](/uri)</p> +```````````````````````````````` + + +```````````````````````````````` example +[foo *[bar [baz](/uri)](/uri)*](/uri) +. +<p>[foo <em>[bar <a href="/uri">baz</a>](/uri)</em>](/uri)</p> +```````````````````````````````` + + +```````````````````````````````` example +![[[foo](uri1)](uri2)](uri3) +. 
+<p><img src="uri3" alt="[foo](uri2)" /></p> +```````````````````````````````` + + +These cases illustrate the precedence of link text grouping over +emphasis grouping: + +```````````````````````````````` example +*[foo*](/uri) +. +<p>*<a href="/uri">foo*</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[foo *bar](baz*) +. +<p><a href="baz*">foo *bar</a></p> +```````````````````````````````` + + +Note that brackets that *aren't* part of links do not take +precedence: + +```````````````````````````````` example +*foo [bar* baz] +. +<p><em>foo [bar</em> baz]</p> +```````````````````````````````` + + +These cases illustrate the precedence of HTML tags, code spans, +and autolinks over link grouping: + +```````````````````````````````` example +[foo <bar attr="](baz)"> +. +<p>[foo <bar attr="](baz)"></p> +```````````````````````````````` + + +```````````````````````````````` example +[foo`](/uri)` +. +<p>[foo<code>](/uri)</code></p> +```````````````````````````````` + + +```````````````````````````````` example +[foo<https://example.com/?search=](uri)> +. +<p>[foo<a href="https://example.com/?search=%5D(uri)">https://example.com/?search=](uri)</a></p> +```````````````````````````````` + + +There are three kinds of [reference link](@)s: +[full](#full-reference-link), [collapsed](#collapsed-reference-link), +and [shortcut](#shortcut-reference-link). + +A [full reference link](@) +consists of a [link text] immediately followed by a [link label] +that [matches] a [link reference definition] elsewhere in the document. + +A [link label](@) begins with a left bracket (`[`) and ends +with the first right bracket (`]`) that is not backslash-escaped. +Between these brackets there must be at least one character that is not a space, +tab, or line ending. +Unescaped square bracket characters are not allowed inside the +opening and closing square brackets of [link labels]. 
A link +label can have at most 999 characters inside the square +brackets. + +One label [matches](@) +another just in case their normalized forms are equal. To normalize a +label, strip off the opening and closing brackets, +perform the *Unicode case fold*, strip leading and trailing +spaces, tabs, and line endings, and collapse consecutive internal +spaces, tabs, and line endings to a single space. If there are multiple +matching reference link definitions, the one that comes first in the +document is used. (It is desirable in such cases to emit a warning.) + +The link's URI and title are provided by the matching [link +reference definition]. + +Here is a simple example: + +```````````````````````````````` example +[foo][bar] + +[bar]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +```````````````````````````````` + + +The rules for the [link text] are the same as with +[inline links]. Thus: + +The link text may contain balanced brackets, but not unbalanced ones, +unless they are escaped: + +```````````````````````````````` example +[link [foo [bar]]][ref] + +[ref]: /uri +. +<p><a href="/uri">link [foo [bar]]</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[link \[bar][ref] + +[ref]: /uri +. +<p><a href="/uri">link [bar</a></p> +```````````````````````````````` + + +The link text may contain inline content: + +```````````````````````````````` example +[link *foo **bar** `#`*][ref] + +[ref]: /uri +. +<p><a href="/uri">link <em>foo <strong>bar</strong> <code>#</code></em></a></p> +```````````````````````````````` + + +```````````````````````````````` example +[![moon](moon.jpg)][ref] + +[ref]: /uri +. +<p><a href="/uri"><img src="moon.jpg" alt="moon" /></a></p> +```````````````````````````````` + + +However, links may not contain other links, at any level of nesting. + +```````````````````````````````` example +[foo [bar](/uri)][ref] + +[ref]: /uri +. 
+<p>[foo <a href="/uri">bar</a>]<a href="/uri">ref</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[foo *bar [baz][ref]*][ref] + +[ref]: /uri +. +<p>[foo <em>bar <a href="/uri">baz</a></em>]<a href="/uri">ref</a></p> +```````````````````````````````` + + +(In the examples above, we have two [shortcut reference links] +instead of one [full reference link].) + +The following cases illustrate the precedence of link text grouping over +emphasis grouping: + +```````````````````````````````` example +*[foo*][ref] + +[ref]: /uri +. +<p>*<a href="/uri">foo*</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[foo *bar][ref]* + +[ref]: /uri +. +<p><a href="/uri">foo *bar</a>*</p> +```````````````````````````````` + + +These cases illustrate the precedence of HTML tags, code spans, +and autolinks over link grouping: + +```````````````````````````````` example +[foo <bar attr="][ref]"> + +[ref]: /uri +. +<p>[foo <bar attr="][ref]"></p> +```````````````````````````````` + + +```````````````````````````````` example +[foo`][ref]` + +[ref]: /uri +. +<p>[foo<code>][ref]</code></p> +```````````````````````````````` + + +```````````````````````````````` example +[foo<https://example.com/?search=][ref]> + +[ref]: /uri +. +<p>[foo<a href="https://example.com/?search=%5D%5Bref%5D">https://example.com/?search=][ref]</a></p> +```````````````````````````````` + + +Matching is case-insensitive: + +```````````````````````````````` example +[foo][BaR] + +[bar]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +```````````````````````````````` + + +Unicode case fold is used: + +```````````````````````````````` example +[ẞ] + +[SS]: /url +. +<p><a href="/url">ẞ</a></p> +```````````````````````````````` + + +Consecutive internal spaces, tabs, and line endings are treated as one space for +purposes of determining matching: + +```````````````````````````````` example +[Foo + bar]: /url + +[Baz][Foo bar] +. 
+<p><a href="/url">Baz</a></p> +```````````````````````````````` + + +No spaces, tabs, or line endings are allowed between the [link text] and the +[link label]: + +```````````````````````````````` example +[foo] [bar] + +[bar]: /url "title" +. +<p>[foo] <a href="/url" title="title">bar</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[foo] +[bar] + +[bar]: /url "title" +. +<p>[foo] +<a href="/url" title="title">bar</a></p> +```````````````````````````````` + + +This is a departure from John Gruber's original Markdown syntax +description, which explicitly allows whitespace between the link +text and the link label. It brings reference links in line with +[inline links], which (according to both original Markdown and +this spec) cannot have whitespace after the link text. More +importantly, it prevents inadvertent capture of consecutive +[shortcut reference links]. If whitespace is allowed between the +link text and the link label, then in the following we will have +a single reference link, not two shortcut reference links, as +intended: + +``` markdown +[foo] +[bar] + +[foo]: /url1 +[bar]: /url2 +``` + +(Note that [shortcut reference links] were introduced by Gruber +himself in a beta version of `Markdown.pl`, but never included +in the official syntax description. Without shortcut reference +links, it is harmless to allow space between the link text and +link label; but once shortcut references are introduced, it is +too dangerous to allow this, as it frequently leads to +unintended results.) + +When there are multiple matching [link reference definitions], +the first is used: + +```````````````````````````````` example +[foo]: /url1 + +[foo]: /url2 + +[bar][foo] +. +<p><a href="/url1">bar</a></p> +```````````````````````````````` + + +Note that matching is performed on normalized strings, not parsed +inline content. 
So the following does not match, even though the +labels define equivalent inline content: + +```````````````````````````````` example +[bar][foo\!] + +[foo!]: /url +. +<p>[bar][foo!]</p> +```````````````````````````````` + + +[Link labels] cannot contain brackets, unless they are +backslash-escaped: + +```````````````````````````````` example +[foo][ref[] + +[ref[]: /uri +. +<p>[foo][ref[]</p> +<p>[ref[]: /uri</p> +```````````````````````````````` + + +```````````````````````````````` example +[foo][ref[bar]] + +[ref[bar]]: /uri +. +<p>[foo][ref[bar]]</p> +<p>[ref[bar]]: /uri</p> +```````````````````````````````` + + +```````````````````````````````` example +[[[foo]]] + +[[[foo]]]: /url +. +<p>[[[foo]]]</p> +<p>[[[foo]]]: /url</p> +```````````````````````````````` + + +```````````````````````````````` example +[foo][ref\[] + +[ref\[]: /uri +. +<p><a href="/uri">foo</a></p> +```````````````````````````````` + + +Note that in this example `]` is not backslash-escaped: + +```````````````````````````````` example +[bar\\]: /uri + +[bar\\] +. +<p><a href="/uri">bar\</a></p> +```````````````````````````````` + + +A [link label] must contain at least one character that is not a space, tab, or +line ending: + +```````````````````````````````` example +[] + +[]: /uri +. +<p>[]</p> +<p>[]: /uri</p> +```````````````````````````````` + + +```````````````````````````````` example +[ + ] + +[ + ]: /uri +. +<p>[ +]</p> +<p>[ +]: /uri</p> +```````````````````````````````` + + +A [collapsed reference link](@) +consists of a [link label] that [matches] a +[link reference definition] elsewhere in the +document, followed by the string `[]`. +The contents of the link label are parsed as inlines, +which are used as the link's text. The link's URI and title are +provided by the matching reference link definition. Thus, +`[foo][]` is equivalent to `[foo][foo]`. + +```````````````````````````````` example +[foo][] + +[foo]: /url "title" +. 
+<p><a href="/url" title="title">foo</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[*foo* bar][] + +[*foo* bar]: /url "title" +. +<p><a href="/url" title="title"><em>foo</em> bar</a></p> +```````````````````````````````` + + +The link labels are case-insensitive: + +```````````````````````````````` example +[Foo][] + +[foo]: /url "title" +. +<p><a href="/url" title="title">Foo</a></p> +```````````````````````````````` + + + +As with full reference links, spaces, tabs, or line endings are not +allowed between the two sets of brackets: + +```````````````````````````````` example +[foo] +[] + +[foo]: /url "title" +. +<p><a href="/url" title="title">foo</a> +[]</p> +```````````````````````````````` + + +A [shortcut reference link](@) +consists of a [link label] that [matches] a +[link reference definition] elsewhere in the +document and is not followed by `[]` or a link label. +The contents of the link label are parsed as inlines, +which are used as the link's text. The link's URI and title +are provided by the matching link reference definition. +Thus, `[foo]` is equivalent to `[foo][]`. + +```````````````````````````````` example +[foo] + +[foo]: /url "title" +. +<p><a href="/url" title="title">foo</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[*foo* bar] + +[*foo* bar]: /url "title" +. +<p><a href="/url" title="title"><em>foo</em> bar</a></p> +```````````````````````````````` + + +```````````````````````````````` example +[[*foo* bar]] + +[*foo* bar]: /url "title" +. +<p>[<a href="/url" title="title"><em>foo</em> bar</a>]</p> +```````````````````````````````` + + +```````````````````````````````` example +[[bar [foo] + +[foo]: /url +. +<p>[[bar <a href="/url">foo</a></p> +```````````````````````````````` + + +The link labels are case-insensitive: + +```````````````````````````````` example +[Foo] + +[foo]: /url "title" +. 
+<p><a href="/url" title="title">Foo</a></p> +```````````````````````````````` + + +A space after the link text should be preserved: + +```````````````````````````````` example +[foo] bar + +[foo]: /url +. +<p><a href="/url">foo</a> bar</p> +```````````````````````````````` + + +If you just want bracketed text, you can backslash-escape the +opening bracket to avoid links: + +```````````````````````````````` example +\[foo] + +[foo]: /url "title" +. +<p>[foo]</p> +```````````````````````````````` + + +Note that this is a link, because a link label ends with the first +following closing bracket: + +```````````````````````````````` example +[foo*]: /url + +*[foo*] +. +<p>*<a href="/url">foo*</a></p> +```````````````````````````````` + + +Full and collapsed references take precedence over shortcut +references: + +```````````````````````````````` example +[foo][bar] + +[foo]: /url1 +[bar]: /url2 +. +<p><a href="/url2">foo</a></p> +```````````````````````````````` + +```````````````````````````````` example +[foo][] + +[foo]: /url1 +. +<p><a href="/url1">foo</a></p> +```````````````````````````````` + +Inline links also take precedence: + +```````````````````````````````` example +[foo]() + +[foo]: /url1 +. +<p><a href="">foo</a></p> +```````````````````````````````` + +```````````````````````````````` example +[foo](not a link) + +[foo]: /url1 +. +<p><a href="/url1">foo</a>(not a link)</p> +```````````````````````````````` + +In the following case `[bar][baz]` is parsed as a reference, +`[foo]` as normal text: + +```````````````````````````````` example +[foo][bar][baz] + +[baz]: /url +. +<p>[foo]<a href="/url">bar</a></p> +```````````````````````````````` + + +Here, though, `[foo][bar]` is parsed as a reference, since +`[bar]` is defined: + +```````````````````````````````` example +[foo][bar][baz] + +[baz]: /url1 +[bar]: /url2 +. 
+<p><a href="/url2">foo</a><a href="/url1">baz</a></p> +```````````````````````````````` + + +Here `[foo]` is not parsed as a shortcut reference, because it +is followed by a link label (even though `[bar]` is not defined): + +```````````````````````````````` example +[foo][bar][baz] + +[baz]: /url1 +[foo]: /url2 +. +<p>[foo]<a href="/url1">bar</a></p> +```````````````````````````````` + + + +## Images + +Syntax for images is like the syntax for links, with one +difference. Instead of [link text], we have an +[image description](@). The rules for this are the +same as for [link text], except that (a) an +image description starts with `![` rather than `[`, and +(b) an image description may contain links. +An image description has inline elements +as its contents. When an image is rendered to HTML, +this is standardly used as the image's `alt` attribute. + +```````````````````````````````` example +![foo](/url "title") +. +<p><img src="/url" alt="foo" title="title" /></p> +```````````````````````````````` + + +```````````````````````````````` example +![foo *bar*] + +[foo *bar*]: train.jpg "train & tracks" +. +<p><img src="train.jpg" alt="foo bar" title="train &amp; tracks" /></p> +```````````````````````````````` + + +```````````````````````````````` example +![foo ![bar](/url)](/url2) +. +<p><img src="/url2" alt="foo bar" /></p> +```````````````````````````````` + + +```````````````````````````````` example +![foo [bar](/url)](/url2) +. +<p><img src="/url2" alt="foo bar" /></p> +```````````````````````````````` + + +Though this spec is concerned with parsing, not rendering, it is +recommended that in rendering to HTML, only the plain string content +of the [image description] be used. Note that in +the above example, the alt attribute's value is `foo bar`, not `foo +[bar](/url)` or `foo <a href="/url">bar</a>`. Only the plain string +content is rendered, without formatting.
+ +```````````````````````````````` example +![foo *bar*][] + +[foo *bar*]: train.jpg "train & tracks" +. +<p><img src="train.jpg" alt="foo bar" title="train &amp; tracks" /></p> +```````````````````````````````` + + +```````````````````````````````` example +![foo *bar*][foobar] + +[FOOBAR]: train.jpg "train & tracks" +. +<p><img src="train.jpg" alt="foo bar" title="train &amp; tracks" /></p> +```````````````````````````````` + + +```````````````````````````````` example +![foo](train.jpg) +. +<p><img src="train.jpg" alt="foo" /></p> +```````````````````````````````` + + +```````````````````````````````` example +My ![foo bar](/path/to/train.jpg "title" ) +. +<p>My <img src="/path/to/train.jpg" alt="foo bar" title="title" /></p> +```````````````````````````````` + + +```````````````````````````````` example +![foo](<url>) +. +<p><img src="url" alt="foo" /></p> +```````````````````````````````` + + +```````````````````````````````` example +![](/url) +. +<p><img src="/url" alt="" /></p> +```````````````````````````````` + + +Reference-style: + +```````````````````````````````` example +![foo][bar] + +[bar]: /url +. +<p><img src="/url" alt="foo" /></p> +```````````````````````````````` + + +```````````````````````````````` example +![foo][bar] + +[BAR]: /url +. +<p><img src="/url" alt="foo" /></p> +```````````````````````````````` + + +Collapsed: + +```````````````````````````````` example +![foo][] + +[foo]: /url "title" +. +<p><img src="/url" alt="foo" title="title" /></p> +```````````````````````````````` + + +```````````````````````````````` example +![*foo* bar][] + +[*foo* bar]: /url "title" +. +<p><img src="/url" alt="foo bar" title="title" /></p> +```````````````````````````````` + + +The labels are case-insensitive: + +```````````````````````````````` example +![Foo][] + +[foo]: /url "title" +.
+<p><img src="/url" alt="Foo" title="title" /></p> +```````````````````````````````` + + +As with reference links, spaces, tabs, and line endings, are not allowed +between the two sets of brackets: + +```````````````````````````````` example +![foo] +[] + +[foo]: /url "title" +. +<p><img src="/url" alt="foo" title="title" /> +[]</p> +```````````````````````````````` + + +Shortcut: + +```````````````````````````````` example +![foo] + +[foo]: /url "title" +. +<p><img src="/url" alt="foo" title="title" /></p> +```````````````````````````````` + + +```````````````````````````````` example +![*foo* bar] + +[*foo* bar]: /url "title" +. +<p><img src="/url" alt="foo bar" title="title" /></p> +```````````````````````````````` + + +Note that link labels cannot contain unescaped brackets: + +```````````````````````````````` example +![[foo]] + +[[foo]]: /url "title" +. +<p>![[foo]]</p> +<p>[[foo]]: /url &quot;title&quot;</p> +```````````````````````````````` + + +The link labels are case-insensitive: + +```````````````````````````````` example +![Foo] + +[foo]: /url "title" +. +<p><img src="/url" alt="Foo" title="title" /></p> +```````````````````````````````` + + +If you just want a literal `!` followed by bracketed text, you can +backslash-escape the opening `[`: + +```````````````````````````````` example +!\[foo] + +[foo]: /url "title" +. +<p>![foo]</p> +```````````````````````````````` + + +If you want a link after a literal `!`, backslash-escape the +`!`: + +```````````````````````````````` example +\![foo] + +[foo]: /url "title" +. +<p>!<a href="/url" title="title">foo</a></p> +```````````````````````````````` + + +## Autolinks + +[Autolink](@)s are absolute URIs and email addresses inside +`<` and `>`. They are parsed as links, with the URL or email address +as the link label. + +A [URI autolink](@) consists of `<`, followed by an +[absolute URI] followed by `>`. It is parsed as +a link to the URI, with the URI as the link's label.
+ +An [absolute URI](@), +for these purposes, consists of a [scheme] followed by a colon (`:`) +followed by zero or more characters other than [ASCII control +characters][ASCII control character], [space], `<`, and `>`. +If the URI includes these characters, they must be percent-encoded +(e.g. `%20` for a space). + +For purposes of this spec, a [scheme](@) is any sequence +of 2--32 characters beginning with an ASCII letter and followed +by any combination of ASCII letters, digits, or the symbols plus +("+"), period ("."), or hyphen ("-"). + +Here are some valid autolinks: + +```````````````````````````````` example +<http://foo.bar.baz> +. +<p><a href="http://foo.bar.baz">http://foo.bar.baz</a></p> +```````````````````````````````` + + +```````````````````````````````` example +<https://foo.bar.baz/test?q=hello&id=22&boolean> +. +<p><a href="https://foo.bar.baz/test?q=hello&amp;id=22&amp;boolean">https://foo.bar.baz/test?q=hello&amp;id=22&amp;boolean</a></p> +```````````````````````````````` + + +```````````````````````````````` example +<irc://foo.bar:2233/baz> +. +<p><a href="irc://foo.bar:2233/baz">irc://foo.bar:2233/baz</a></p> +```````````````````````````````` + + +Uppercase is also fine: + +```````````````````````````````` example +<MAILTO:FOO@BAR.BAZ> +. +<p><a href="MAILTO:FOO@BAR.BAZ">MAILTO:FOO@BAR.BAZ</a></p> +```````````````````````````````` + + +Note that many strings that count as [absolute URIs] for +purposes of this spec are not valid URIs, because their +schemes are not registered or because of other problems +with their syntax: + +```````````````````````````````` example +<a+b+c:d> +. +<p><a href="a+b+c:d">a+b+c:d</a></p> +```````````````````````````````` + + +```````````````````````````````` example +<made-up-scheme://foo,bar> +. +<p><a href="made-up-scheme://foo,bar">made-up-scheme://foo,bar</a></p> +```````````````````````````````` + + +```````````````````````````````` example +<https://../> +.
+<p><a href="https://../">https://../</a></p> +```````````````````````````````` + + +```````````````````````````````` example +<localhost:5001/foo> +. +<p><a href="localhost:5001/foo">localhost:5001/foo</a></p> +```````````````````````````````` + + +Spaces are not allowed in autolinks: + +```````````````````````````````` example +<https://foo.bar/baz bim> +. +<p>&lt;https://foo.bar/baz bim&gt;</p> +```````````````````````````````` + + +Backslash-escapes do not work inside autolinks: + +```````````````````````````````` example +<https://example.com/\[\> +. +<p><a href="https://example.com/%5C%5B%5C">https://example.com/\[\</a></p> +```````````````````````````````` + + +An [email autolink](@) +consists of `<`, followed by an [email address], +followed by `>`. The link's label is the email address, +and the URL is `mailto:` followed by the email address. + +An [email address](@), +for these purposes, is anything that matches +the [non-normative regex from the HTML5 +spec](https://html.spec.whatwg.org/multipage/forms.html#e-mail-state-(type=email)): + + /^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])? + (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/ + +Examples of email autolinks: + +```````````````````````````````` example +<foo@bar.example.com> +. +<p><a href="mailto:foo@bar.example.com">foo@bar.example.com</a></p> +```````````````````````````````` + + +```````````````````````````````` example +<foo+special@Bar.baz-bar0.com> +. +<p><a href="mailto:foo+special@Bar.baz-bar0.com">foo+special@Bar.baz-bar0.com</a></p> +```````````````````````````````` + + +Backslash-escapes do not work inside email autolinks: + +```````````````````````````````` example +<foo\+@bar.example.com> +. +<p>&lt;foo+@bar.example.com&gt;</p> +```````````````````````````````` + + +These are not autolinks: + +```````````````````````````````` example +<> +. +<p>&lt;&gt;</p> +```````````````````````````````` + + +```````````````````````````````` example +< https://foo.bar > +.
+<p>&lt; https://foo.bar &gt;</p> +```````````````````````````````` + + +```````````````````````````````` example +<m:abc> +. +<p>&lt;m:abc&gt;</p> +```````````````````````````````` + + +```````````````````````````````` example +<foo.bar.baz> +. +<p>&lt;foo.bar.baz&gt;</p> +```````````````````````````````` + + +```````````````````````````````` example +https://example.com +. +<p>https://example.com</p> +```````````````````````````````` + + +```````````````````````````````` example +foo@bar.example.com +. +<p>foo@bar.example.com</p> +```````````````````````````````` + + +## Raw HTML + +Text between `<` and `>` that looks like an HTML tag is parsed as a +raw HTML tag and will be rendered in HTML without escaping. +Tag and attribute names are not limited to current HTML tags, +so custom tags (and even, say, DocBook tags) may be used. + +Here is the grammar for tags: + +A [tag name](@) consists of an ASCII letter +followed by zero or more ASCII letters, digits, or +hyphens (`-`). + +An [attribute](@) consists of spaces, tabs, and up to one line ending, +an [attribute name], and an optional +[attribute value specification]. + +An [attribute name](@) +consists of an ASCII letter, `_`, or `:`, followed by zero or more ASCII +letters, digits, `_`, `.`, `:`, or `-`. (Note: This is the XML +specification restricted to ASCII. HTML5 is laxer.) + +An [attribute value specification](@) +consists of optional spaces, tabs, and up to one line ending, +a `=` character, optional spaces, tabs, and up to one line ending, +and an [attribute value]. + +An [attribute value](@) +consists of an [unquoted attribute value], +a [single-quoted attribute value], or a [double-quoted attribute value]. + +An [unquoted attribute value](@) +is a nonempty string of characters not +including spaces, tabs, line endings, `"`, `'`, `=`, `<`, `>`, or `` ` ``. + +A [single-quoted attribute value](@) +consists of `'`, zero or more +characters not including `'`, and a final `'`.
+ +A [double-quoted attribute value](@) +consists of `"`, zero or more +characters not including `"`, and a final `"`. + +An [open tag](@) consists of a `<` character, a [tag name], +zero or more [attributes], optional spaces, tabs, and up to one line ending, +an optional `/` character, and a `>` character. + +A [closing tag](@) consists of the string `</`, a +[tag name], optional spaces, tabs, and up to one line ending, and the character +`>`. + +An [HTML comment](@) consists of `<!-->`, `<!--->`, or `<!--`, a string of +characters not including the string `-->`, and `-->` (see the +[HTML spec](https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state)). + +A [processing instruction](@) +consists of the string `<?`, a string +of characters not including the string `?>`, and the string +`?>`. + +A [declaration](@) consists of the string `<!`, an ASCII letter, zero or more +characters not including the character `>`, and the character `>`. + +A [CDATA section](@) consists of +the string `<![CDATA[`, a string of characters not including the string +`]]>`, and the string `]]>`. + +An [HTML tag](@) consists of an [open tag], a [closing tag], +an [HTML comment], a [processing instruction], a [declaration], +or a [CDATA section]. + +Here are some simple open tags: + +```````````````````````````````` example +<a><bab><c2c> +. +<p><a><bab><c2c></p> +```````````````````````````````` + + +Empty elements: + +```````````````````````````````` example +<a/><b2/> +. +<p><a/><b2/></p> +```````````````````````````````` + + +Whitespace is allowed: + +```````````````````````````````` example +<a /><b2 +data="foo" > +. +<p><a /><b2 +data="foo" ></p> +```````````````````````````````` + + +With attributes: + +```````````````````````````````` example +<a foo="bar" bam = 'baz <em>"</em>' +_boolean zoop:33=zoop:33 /> +. 
+<p><a foo="bar" bam = 'baz <em>"</em>' +_boolean zoop:33=zoop:33 /></p> +```````````````````````````````` + + +Custom tag names can be used: + +```````````````````````````````` example +Foo <responsive-image src="foo.jpg" /> +. +<p>Foo <responsive-image src="foo.jpg" /></p> +```````````````````````````````` + + +Illegal tag names, not parsed as HTML: + +```````````````````````````````` example +<33> <__> +. +<p>&lt;33&gt; &lt;__&gt;</p> +```````````````````````````````` + + +Illegal attribute names: + +```````````````````````````````` example +<a h*#ref="hi"> +. +<p>&lt;a h*#ref=&quot;hi&quot;&gt;</p> +```````````````````````````````` + + +Illegal attribute values: + +```````````````````````````````` example +<a href="hi'> <a href=hi'> +. +<p>&lt;a href=&quot;hi'&gt; &lt;a href=hi'&gt;</p> +```````````````````````````````` + + +Illegal whitespace: + +```````````````````````````````` example +< a>< +foo><bar/ > +<foo bar=baz +bim!bop /> +. +<p>&lt; a&gt;&lt; +foo&gt;&lt;bar/ &gt; +&lt;foo bar=baz +bim!bop /&gt;</p> +```````````````````````````````` + + +Missing whitespace: + +```````````````````````````````` example +<a href='bar'title=title> +. +<p>&lt;a href='bar'title=title&gt;</p> +```````````````````````````````` + + +Closing tags: + +```````````````````````````````` example +</a></foo > +. +<p></a></foo ></p> +```````````````````````````````` + + +Illegal attributes in closing tag: + +```````````````````````````````` example +</a href="foo"> +. +<p>&lt;/a href=&quot;foo&quot;&gt;</p> +```````````````````````````````` + + +Comments: + +```````````````````````````````` example +foo <!-- this is a -- +comment - with hyphens --> +. +<p>foo <!-- this is a -- +comment - with hyphens --></p> +```````````````````````````````` + +```````````````````````````````` example +foo <!--> foo --> + +foo <!---> foo --> +. +<p>foo <!--> foo --&gt;</p> +<p>foo <!---> foo --&gt;</p> +```````````````````````````````` + + +Processing instructions: + +```````````````````````````````` example +foo <?php echo $a; ?> +.
+<p>foo <?php echo $a; ?></p> +```````````````````````````````` + + +Declarations: + +```````````````````````````````` example +foo <!ELEMENT br EMPTY> +. +<p>foo <!ELEMENT br EMPTY></p> +```````````````````````````````` + + +CDATA sections: + +```````````````````````````````` example +foo <![CDATA[>&<]]> +. +<p>foo <![CDATA[>&<]]></p> +```````````````````````````````` + + +Entity and numeric character references are preserved in HTML +attributes: + +```````````````````````````````` example +foo <a href="&ouml;"> +. +<p>foo <a href="&ouml;"></p> +```````````````````````````````` + + +Backslash escapes do not work in HTML attributes: + +```````````````````````````````` example +foo <a href="\*"> +. +<p>foo <a href="\*"></p> +```````````````````````````````` + + +```````````````````````````````` example +<a href="\""> +. +<p>&lt;a href=&quot;&quot;&quot;&gt;</p> +```````````````````````````````` + + +## Hard line breaks + +A line ending (not in a code span or HTML tag) that is preceded +by two or more spaces and does not occur at the end of a block +is parsed as a [hard line break](@) (rendered +in HTML as a `<br />` tag): + +```````````````````````````````` example +foo +baz +. +<p>foo<br /> +baz</p> +```````````````````````````````` + + +For a more visible alternative, a backslash before the +[line ending] may be used instead of two or more spaces: + +```````````````````````````````` example +foo\ +baz +. +<p>foo<br /> +baz</p> +```````````````````````````````` + + +More than two spaces can be used: + +```````````````````````````````` example +foo +baz +. +<p>foo<br /> +baz</p> +```````````````````````````````` + + +Leading spaces at the beginning of the next line are ignored: + +```````````````````````````````` example +foo + bar +. +<p>foo<br /> +bar</p> +```````````````````````````````` + + +```````````````````````````````` example +foo\ + bar +.
+<p>foo<br /> +bar</p> +```````````````````````````````` + + +Hard line breaks can occur inside emphasis, links, and other constructs +that allow inline content: + +```````````````````````````````` example +*foo +bar* +. +<p><em>foo<br /> +bar</em></p> +```````````````````````````````` + + +```````````````````````````````` example +*foo\ +bar* +. +<p><em>foo<br /> +bar</em></p> +```````````````````````````````` + + +Hard line breaks do not occur inside code spans + +```````````````````````````````` example +`code +span` +. +<p><code>code span</code></p> +```````````````````````````````` + + +```````````````````````````````` example +`code\ +span` +. +<p><code>code\ span</code></p> +```````````````````````````````` + + +or HTML tags: + +```````````````````````````````` example +<a href="foo +bar"> +. +<p><a href="foo +bar"></p> +```````````````````````````````` + + +```````````````````````````````` example +<a href="foo\ +bar"> +. +<p><a href="foo\ +bar"></p> +```````````````````````````````` + + +Hard line breaks are for separating inline content within a block. +Neither syntax for hard line breaks works at the end of a paragraph or +other block element: + +```````````````````````````````` example +foo\ +. +<p>foo\</p> +```````````````````````````````` + + +```````````````````````````````` example +foo +. +<p>foo</p> +```````````````````````````````` + + +```````````````````````````````` example +### foo\ +. +<h3>foo\</h3> +```````````````````````````````` + + +```````````````````````````````` example +### foo +. +<h3>foo</h3> +```````````````````````````````` + + +## Soft line breaks + +A regular line ending (not in a code span or HTML tag) that is not +preceded by two or more spaces or a backslash is parsed as a +[softbreak](@). (A soft line break may be rendered in HTML either as a +[line ending] or as a space. The result will be the same in +browsers. In the examples here, a [line ending] will be used.) + +```````````````````````````````` example +foo +baz +. 
+<p>foo +baz</p> +```````````````````````````````` + + +Spaces at the end of the line and beginning of the next line are +removed: + +```````````````````````````````` example +foo + baz +. +<p>foo +baz</p> +```````````````````````````````` + + +A conforming parser may render a soft line break in HTML either as a +line ending or as a space. + +A renderer may also provide an option to render soft line breaks +as hard line breaks. + +## Textual content + +Any characters not given an interpretation by the above rules will +be parsed as plain textual content. + +```````````````````````````````` example +hello $.;'there +. +<p>hello $.;'there</p> +```````````````````````````````` + + +```````````````````````````````` example +Foo χρῆν +. +<p>Foo χρῆν</p> +```````````````````````````````` + + +Internal spaces are preserved verbatim: + +```````````````````````````````` example +Multiple spaces +. +<p>Multiple spaces</p> +```````````````````````````````` + + +<!-- END TESTS --> + +# Appendix: A parsing strategy + +In this appendix we describe some features of the parsing strategy +used in the CommonMark reference implementations. + +## Overview + +Parsing has two phases: + +1. In the first phase, lines of input are consumed and the block +structure of the document---its division into paragraphs, block quotes, +list items, and so on---is constructed. Text is assigned to these +blocks but not parsed. Link reference definitions are parsed and a +map of links is constructed. + +2. In the second phase, the raw text contents of paragraphs and headings +are parsed into sequences of Markdown inline elements (strings, +code spans, links, emphasis, and so on), using the map of link +references constructed in phase 1. + +At each point in processing, the document is represented as a tree of +**blocks**. The root of the tree is a `document` block. The `document` +may have any number of other blocks as **children**. These children +may, in turn, have other blocks as children. 
The last child of a block +is normally considered **open**, meaning that subsequent lines of input +can alter its contents. (Blocks that are not open are **closed**.) +Here, for example, is a possible document tree, with the open blocks +marked by arrows: + +``` tree +-> document + -> block_quote + paragraph + "Lorem ipsum dolor\nsit amet." + -> list (type=bullet tight=true bullet_char=-) + list_item + paragraph + "Qui *quodsi iracundia*" + -> list_item + -> paragraph + "aliquando id" +``` + +## Phase 1: block structure + +Each line that is processed has an effect on this tree. The line is +analyzed and, depending on its contents, the document may be altered +in one or more of the following ways: + +1. One or more open blocks may be closed. +2. One or more new blocks may be created as children of the + last open block. +3. Text may be added to the last (deepest) open block remaining + on the tree. + +Once a line has been incorporated into the tree in this way, +it can be discarded, so input can be read in a stream. + +For each line, we follow this procedure: + +1. First we iterate through the open blocks, starting with the +root document, and descending through last children down to the last +open block. Each block imposes a condition that the line must satisfy +if the block is to remain open. For example, a block quote requires a +`>` character. A paragraph requires a non-blank line. +In this phase we may match all or just some of the open +blocks. But we cannot close unmatched blocks yet, because we may have a +[lazy continuation line]. + +2. Next, after consuming the continuation markers for existing +blocks, we look for new block starts (e.g. `>` for a block quote). +If we encounter a new block start, we close any blocks unmatched +in step 1 before creating the new block as a child of the last +matched container block. + +3. Finally, we look at the remainder of the line (after block +markers like `>`, list markers, and indentation have been consumed). 
+This is text that can be incorporated into the last open +block (a paragraph, code block, heading, or raw HTML). + +Setext headings are formed when we see a line of a paragraph +that is a [setext heading underline]. + +Reference link definitions are detected when a paragraph is closed; +the accumulated text lines are parsed to see if they begin with +one or more reference link definitions. Any remainder becomes a +normal paragraph. + +We can see how this works by considering how the tree above is +generated by four lines of Markdown: + +``` markdown +> Lorem ipsum dolor +sit amet. +> - Qui *quodsi iracundia* +> - aliquando id +``` + +At the outset, our document model is just + +``` tree +-> document +``` + +The first line of our text, + +``` markdown +> Lorem ipsum dolor +``` + +causes a `block_quote` block to be created as a child of our +open `document` block, and a `paragraph` block as a child of +the `block_quote`. Then the text is added to the last open +block, the `paragraph`: + +``` tree +-> document + -> block_quote + -> paragraph + "Lorem ipsum dolor" +``` + +The next line, + +``` markdown +sit amet. +``` + +is a "lazy continuation" of the open `paragraph`, so it gets added +to the paragraph's text: + +``` tree +-> document + -> block_quote + -> paragraph + "Lorem ipsum dolor\nsit amet." +``` + +The third line, + +``` markdown +> - Qui *quodsi iracundia* +``` + +causes the `paragraph` block to be closed, and a new `list` block +opened as a child of the `block_quote`. A `list_item` is also +added as a child of the `list`, and a `paragraph` as a child of +the `list_item`. The text is then added to the new `paragraph`: + +``` tree +-> document + -> block_quote + paragraph + "Lorem ipsum dolor\nsit amet." 
+ -> list (type=bullet tight=true bullet_char=-) + -> list_item + -> paragraph + "Qui *quodsi iracundia*" +``` + +The fourth line, + +``` markdown +> - aliquando id +``` + +causes the `list_item` (and its child the `paragraph`) to be closed, +and a new `list_item` opened up as child of the `list`. A `paragraph` +is added as a child of the new `list_item`, to contain the text. +We thus obtain the final tree: + +``` tree +-> document + -> block_quote + paragraph + "Lorem ipsum dolor\nsit amet." + -> list (type=bullet tight=true bullet_char=-) + list_item + paragraph + "Qui *quodsi iracundia*" + -> list_item + -> paragraph + "aliquando id" +``` + +## Phase 2: inline structure + +Once all of the input has been parsed, all open blocks are closed. + +We then "walk the tree," visiting every node, and parse raw +string contents of paragraphs and headings as inlines. At this +point we have seen all the link reference definitions, so we can +resolve reference links as we go. + +``` tree +document + block_quote + paragraph + str "Lorem ipsum dolor" + softbreak + str "sit amet." + list (type=bullet tight=true bullet_char=-) + list_item + paragraph + str "Qui " + emph + str "quodsi iracundia" + list_item + paragraph + str "aliquando id" +``` + +Notice how the [line ending] in the first paragraph has +been parsed as a `softbreak`, and the asterisks in the first list item +have become an `emph`. + +### An algorithm for parsing nested emphasis and links + +By far the trickiest part of inline parsing is handling emphasis, +strong emphasis, links, and images. This is done using the following +algorithm. + +When we're parsing inlines and we hit either + +- a run of `*` or `_` characters, or +- a `[` or `![` + +we insert a text node with these symbols as its literal content, and we +add a pointer to this text node to the [delimiter stack](@). + +The [delimiter stack] is a doubly linked list. 
Each +element contains a pointer to a text node, plus information about + +- the type of delimiter (`[`, `![`, `*`, `_`) +- the number of delimiters, +- whether the delimiter is "active" (all are active to start), and +- whether the delimiter is a potential opener, a potential closer, + or both (which depends on what sort of characters precede + and follow the delimiters). + +When we hit a `]` character, we call the *look for link or image* +procedure (see below). + +When we hit the end of the input, we call the *process emphasis* +procedure (see below), with `stack_bottom` = NULL. + +#### *look for link or image* + +Starting at the top of the delimiter stack, we look backwards +through the stack for an opening `[` or `![` delimiter. + +- If we don't find one, we return a literal text node `]`. + +- If we do find one, but it's not *active*, we remove the inactive + delimiter from the stack, and return a literal text node `]`. + +- If we find one and it's active, then we parse ahead to see if + we have an inline link/image, reference link/image, collapsed reference + link/image, or shortcut reference link/image. + + + If we don't, then we remove the opening delimiter from the + delimiter stack and return a literal text node `]`. + + + If we do, then + + * We return a link or image node whose children are the inlines + after the text node pointed to by the opening delimiter. + + * We run *process emphasis* on these inlines, with the `[` opener + as `stack_bottom`. + + * We remove the opening delimiter. + + * If we have a link (and not an image), we also set all + `[` delimiters before the opening delimiter to *inactive*. (This + will prevent us from getting links within links.) + +#### *process emphasis* + +Parameter `stack_bottom` sets a lower bound to how far we +descend in the [delimiter stack]. If it is NULL, we can +go all the way to the bottom. Otherwise, we stop before +visiting `stack_bottom`. 
+ +Let `current_position` point to the element on the [delimiter stack] +just above `stack_bottom` (or the first element if `stack_bottom` +is NULL). + +We keep track of the `openers_bottom` for each delimiter +type (`*`, `_`), indexed to the length of the closing delimiter run +(modulo 3) and to whether the closing delimiter can also be an +opener. Initialize this to `stack_bottom`. + +Then we repeat the following until we run out of potential +closers: + +- Move `current_position` forward in the delimiter stack (if needed) + until we find the first potential closer with delimiter `*` or `_`. + (This will be the potential closer closest + to the beginning of the input -- the first one in parse order.) + +- Now, look back in the stack (staying above `stack_bottom` and + the `openers_bottom` for this delimiter type) for the + first matching potential opener ("matching" means same delimiter). + +- If one is found: + + + Figure out whether we have emphasis or strong emphasis: + if both closer and opener spans have length >= 2, we have + strong, otherwise regular. + + + Insert an emph or strong emph node accordingly, after + the text node corresponding to the opener. + + + Remove any delimiters between the opener and closer from + the delimiter stack. + + + Remove 1 (for regular emph) or 2 (for strong emph) delimiters + from the opening and closing text nodes. If they become empty + as a result, remove them and remove the corresponding element + of the delimiter stack. If the closing node is removed, reset + `current_position` to the next element in the stack. + +- If none is found: + + + Set `openers_bottom` to the element before `current_position`. + (We know that there are no openers for this kind of closer up to and + including this point, so this puts a lower bound on future searches.) + + + If the closer at `current_position` is not a potential opener, + remove it from the delimiter stack (since we know it can't + be a closer either). 
+ + + Advance `current_position` to the next element in the stack. + +After we're done, we remove all delimiters above `stack_bottom` from the +delimiter stack. diff --git a/commonmark/pom.xml b/commonmark/pom.xml index be18858ad..4e060edaa 100644 --- a/commonmark/pom.xml +++ b/commonmark/pom.xml @@ -2,19 +2,19 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> - <groupId>com.atlassian.commonmark</groupId> + <groupId>org.commonmark</groupId> <artifactId>commonmark-parent</artifactId> - <version>0.1.1-SNAPSHOT</version> + <version>0.28.1-SNAPSHOT</version> </parent> <artifactId>commonmark</artifactId> <name>commonmark-java core</name> - <description>Core of commonmark-java</description> + <description>Core of commonmark-java (a library for parsing Markdown to an AST, modifying the AST and rendering it to HTML or Markdown)</description> <dependencies> <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> + <groupId>org.commonmark</groupId> + <artifactId>commonmark-test-util</artifactId> <scope>test</scope> </dependency> <dependency> @@ -29,21 +29,37 @@ </dependency> </dependencies> - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-jar-plugin</artifactId> - <version>2.6</version> - <executions> - <execution> - <goals> - <goal>test-jar</goal> - </goals> - </execution> - </executions> - </plugin> - </plugins> - </build> + <profiles> + <profile> + <id>benchmark</id> + <build> + <defaultGoal>exec:exec</defaultGoal> + <plugins> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>exec-maven-plugin</artifactId> + <version>3.2.0</version> + <configuration> + <executable>java</executable> + <classpathScope>test</classpathScope> + <arguments> + <argument>-classpath</argument> + <classpath /> + 
<argument>org.commonmark.test.SpecBenchmark</argument> + </arguments> + </configuration> + </plugin> + </plugins> + </build> + </profile> + </profiles> + + <licenses> + <license> + <name>BSD-2-Clause</name> + <url>https://opensource.org/licenses/BSD-2-Clause</url> + <distribution>repo</distribution> + </license> + </licenses> </project> diff --git a/commonmark/src/main/java/module-info.java b/commonmark/src/main/java/module-info.java new file mode 100644 index 000000000..009fc7d18 --- /dev/null +++ b/commonmark/src/main/java/module-info.java @@ -0,0 +1,13 @@ +module org.commonmark { + exports org.commonmark; + exports org.commonmark.node; + exports org.commonmark.parser; + exports org.commonmark.parser.beta; + exports org.commonmark.parser.block; + exports org.commonmark.parser.delimiter; + exports org.commonmark.renderer; + exports org.commonmark.renderer.html; + exports org.commonmark.renderer.markdown; + exports org.commonmark.renderer.text; + exports org.commonmark.text; +} diff --git a/commonmark/src/main/java/org/commonmark/html/AttributeProvider.java b/commonmark/src/main/java/org/commonmark/html/AttributeProvider.java deleted file mode 100644 index e5f62365d..000000000 --- a/commonmark/src/main/java/org/commonmark/html/AttributeProvider.java +++ /dev/null @@ -1,25 +0,0 @@ -package org.commonmark.html; - -import org.commonmark.node.Node; - -import java.util.Map; - -/** - * Extension point for adding/changing attributes on the primary HTML tag for a node. - */ -public interface AttributeProvider { - - /** - * Set the attributes for the node by modifying the provided map. - * <p> - * This allows to change or even remove default attributes. With great power comes great responsibility. - * <p> - * The attribute key and values will be escaped (preserving character entities), so don't escape them here, - * otherwise they will be double-escaped. 
- * - * @param node the node to set attributes for - * @param attributes the attributes, with any default attributes already set in the map - */ - void setAttributes(Node node, Map<String, String> attributes); - -} diff --git a/commonmark/src/main/java/org/commonmark/html/CustomHtmlRenderer.java b/commonmark/src/main/java/org/commonmark/html/CustomHtmlRenderer.java deleted file mode 100644 index cf414a35e..000000000 --- a/commonmark/src/main/java/org/commonmark/html/CustomHtmlRenderer.java +++ /dev/null @@ -1,10 +0,0 @@ -package org.commonmark.html; - -import org.commonmark.node.Node; -import org.commonmark.node.Visitor; - -public interface CustomHtmlRenderer { - // TODO: maybe pass renderer instead of visitor? - boolean render(Node node, HtmlWriter htmlWriter, Visitor visitor); -} - diff --git a/commonmark/src/main/java/org/commonmark/html/HtmlRenderer.java b/commonmark/src/main/java/org/commonmark/html/HtmlRenderer.java deleted file mode 100644 index ce001a451..000000000 --- a/commonmark/src/main/java/org/commonmark/html/HtmlRenderer.java +++ /dev/null @@ -1,398 +0,0 @@ -package org.commonmark.html; - -import org.commonmark.Extension; -import org.commonmark.internal.util.Escaping; -import org.commonmark.node.*; - -import java.util.*; - -public class HtmlRenderer { - - private static final Map<String, String> NO_ATTRIBUTES = Collections.emptyMap(); - - private final String softbreak; - private final boolean escapeHtml; - private final boolean percentEncodeUrls; - private final List<CustomHtmlRenderer> customHtmlRenderers; - private final List<AttributeProvider> attributeProviders; - - private HtmlRenderer(Builder builder) { - this.softbreak = builder.softbreak; - this.escapeHtml = builder.escapeHtml; - this.percentEncodeUrls = builder.percentEncodeUrls; - this.customHtmlRenderers = builder.customHtmlRenderers; - this.attributeProviders = builder.attributeProviders; - } - - public static Builder builder() { - return new Builder(); - } - - public void render(Node 
node, Appendable output) { - RendererVisitor rendererVisitor = new RendererVisitor(new HtmlWriter(output), customHtmlRenderers); - node.accept(rendererVisitor); - } - - public String render(Node node) { - StringBuilder sb = new StringBuilder(); - render(node, sb); - return sb.toString(); - } - - private String escape(String input, boolean preserveEntities) { - return Escaping.escapeHtml(input, preserveEntities); - } - - private String optionallyPercentEncodeUrl(String url) { - if (percentEncodeUrls) { - return Escaping.percentEncodeUrl(url); - } else { - return url; - } - } - - // default options: - // softbreak: '\n', // by default, soft breaks are rendered as newlines in - // HTML - // set to "<br />" to make them hard breaks - // set to " " if you want to ignore line wrapping in source - public static class Builder { - - private String softbreak = "\n"; - private boolean escapeHtml = false; - private boolean percentEncodeUrls = false; - private List<CustomHtmlRenderer> customHtmlRenderers = new ArrayList<>(); - private List<AttributeProvider> attributeProviders = new ArrayList<>(); - - public Builder softbreak(String softbreak) { - this.softbreak = softbreak; - return this; - } - - /** - * Whether {@link HtmlTag} and {@link HtmlBlock} should be escaped. - * <p> - * Note that {@link HtmlTag} is only a tag itself, not the text between an opening tag and a closing tag. So markup - * in the text will be parsed as normal and is not affected by this option. - * - * @param escapeHtml true for escaping, false for preserving raw HTML - * @return {@code this} - */ - public Builder escapeHtml(boolean escapeHtml) { - this.escapeHtml = escapeHtml; - return this; - } - - /** - * Whether URLs of link or images should be percent-encoded. If enabled, the following is done: - * <ul> - * <li>Existing percent-encoded parts are preserved (e.g. 
"%20" is kept as "%20")</li> - * <li>Reserved characters such as "/" are preserved, except for "[" and "]" (see encodeURI in JS)</li> - * <li>Unreserved characters such as "a" are preserved</li> - * <li>Other characters such umlauts are percent-encoded</li> - * </ul> - * - * @param percentEncodeUrls true to percent-encode, false for leaving as-is; default is false - * @return {@code this} - */ - public Builder percentEncodeUrls(boolean percentEncodeUrls) { - this.percentEncodeUrls = percentEncodeUrls; - return this; - } - - public Builder attributeProvider(AttributeProvider attributeProvider) { - this.attributeProviders.add(attributeProvider); - return this; - } - - public Builder customHtmlRenderer(CustomHtmlRenderer customHtmlRenderer) { - this.customHtmlRenderers.add(customHtmlRenderer); - return this; - } - - /** - * @param extensions extensions to use on this HTML renderer - * @return this - */ - public Builder extensions(Iterable<? extends Extension> extensions) { - for (Extension extension : extensions) { - if (extension instanceof HtmlRendererExtension) { - HtmlRendererExtension htmlRendererExtension = (HtmlRendererExtension) extension; - htmlRendererExtension.extend(this); - } - } - return this; - } - - public HtmlRenderer build() { - return new HtmlRenderer(this); - } - } - - /** - * Extension for HTML renderer. 
- */ - public interface HtmlRendererExtension extends Extension { - void extend(Builder rendererBuilder); - } - - private class RendererVisitor extends AbstractVisitor { - - private final HtmlWriter html; - private final List<CustomHtmlRenderer> customHtmlRenderers; - - public RendererVisitor(HtmlWriter html, List<CustomHtmlRenderer> customHtmlRenderers) { - this.html = html; - this.customHtmlRenderers = customHtmlRenderers; - } - - @Override - public void visit(Document document) { - visitChildren(document); - } - - @Override - public void visit(Header header) { - String htag = "h" + header.getLevel(); - html.line(); - html.tag(htag, getAttrs(header)); - visitChildren(header); - html.tag('/' + htag); - html.line(); - } - - @Override - public void visit(Paragraph paragraph) { - boolean inTightList = isInTightList(paragraph); - if (!inTightList) { - html.line(); - html.tag("p", getAttrs(paragraph)); - } - visitChildren(paragraph); - if (!inTightList) { - html.tag("/p"); - html.line(); - } - } - - @Override - public void visit(BlockQuote blockQuote) { - html.line(); - html.tag("blockquote", getAttrs(blockQuote)); - html.line(); - visitChildren(blockQuote); - html.line(); - html.tag("/blockquote"); - html.line(); - } - - @Override - public void visit(BulletList bulletList) { - renderListBlock(bulletList, "ul", getAttrs(bulletList)); - } - - @Override - public void visit(FencedCodeBlock fencedCodeBlock) { - String literal = fencedCodeBlock.getLiteral(); - Map<String, String> attributes = new LinkedHashMap<>(); - String info = fencedCodeBlock.getInfo(); - if (info != null && !info.isEmpty()) { - int space = info.indexOf(" "); - String language; - if (space == -1) { - language = info; - } else { - language = info.substring(0, space); - } - attributes.put("class", "language-" + language); - } - renderCodeBlock(literal, getAttrs(fencedCodeBlock, attributes)); - } - - @Override - public void visit(HtmlBlock htmlBlock) { - html.line(); - if (escapeHtml) { - 
html.raw(escape(htmlBlock.getLiteral(), false)); - } else { - html.raw(htmlBlock.getLiteral()); - } - html.line(); - } - - @Override - public void visit(HorizontalRule horizontalRule) { - html.line(); - html.tag("hr", getAttrs(horizontalRule), true); - html.line(); - } - - @Override - public void visit(IndentedCodeBlock indentedCodeBlock) { - renderCodeBlock(indentedCodeBlock.getLiteral(), getAttrs(indentedCodeBlock)); - } - - @Override - public void visit(Link link) { - Map<String, String> attrs = new LinkedHashMap<>(); - String url = optionallyPercentEncodeUrl(link.getDestination()); - attrs.put("href", url); - if (link.getTitle() != null) { - attrs.put("title", link.getTitle()); - } - html.tag("a", getAttrs(link, attrs)); - visitChildren(link); - html.tag("/a"); - } - - @Override - public void visit(ListItem listItem) { - html.tag("li", getAttrs(listItem)); - visitChildren(listItem); - html.tag("/li"); - html.line(); - } - - @Override - public void visit(OrderedList orderedList) { - int start = orderedList.getStartNumber(); - Map<String, String> attrs = new LinkedHashMap<>(); - if (start != 1) { - attrs.put("start", String.valueOf(start)); - } - renderListBlock(orderedList, "ol", getAttrs(orderedList, attrs)); - } - - @Override - public void visit(Image image) { - if (html.isTagAllowed()) { - String url = optionallyPercentEncodeUrl(image.getDestination()); - html.raw("<img src=\"" + escape(url, true) + - "\" alt=\""); - } - html.disableTags(); - visitChildren(image); - html.enableTags(); - if (html.isTagAllowed()) { - if (image.getTitle() != null) { - html.raw("\" title=\"" + escape(image.getTitle(), true)); - } - html.raw("\" />"); - } - } - - @Override - public void visit(Emphasis emphasis) { - html.tag("em"); - visitChildren(emphasis); - html.tag("/em"); - } - - @Override - public void visit(StrongEmphasis strongEmphasis) { - html.tag("strong"); - visitChildren(strongEmphasis); - html.tag("/strong"); - } - - @Override - public void visit(Text text) { - 
html.raw(escape(text.getLiteral(), false)); - } - - @Override - public void visit(Code code) { - html.tag("code"); - html.raw(escape(code.getLiteral(), false)); - html.tag("/code"); - } - - @Override - public void visit(HtmlTag htmlTag) { - if (escapeHtml) { - html.raw(escape(htmlTag.getLiteral(), false)); - } else { - html.raw(htmlTag.getLiteral()); - } - } - - @Override - public void visit(SoftLineBreak softLineBreak) { - html.raw(softbreak); - } - - @Override - public void visit(HardLineBreak hardLineBreak) { - html.tag("br", NO_ATTRIBUTES, true); - html.line(); - } - - @Override - public void visit(CustomBlock customBlock) { - renderCustom(customBlock); - } - - @Override - public void visit(CustomNode customNode) { - renderCustom(customNode); - } - - private void renderCustom(Node node) { - for (CustomHtmlRenderer customHtmlRenderer : customHtmlRenderers) { - // TODO: Should we pass attributes here? - boolean handled = customHtmlRenderer.render(node, html, this); - if (handled) { - break; - } - } - } - - private void renderCodeBlock(String literal, Map<String, String> attributes) { - html.line(); - html.tag("pre"); - html.tag("code", attributes); - html.raw(escape(literal, false)); - html.tag("/code"); - html.tag("/pre"); - html.line(); - } - - private void renderListBlock(ListBlock listBlock, String tagName, Map<String, String> attributes) { - html.line(); - html.tag(tagName, attributes); - html.line(); - visitChildren(listBlock); - html.line(); - html.tag('/' + tagName); - html.line(); - } - - private boolean isInTightList(Paragraph paragraph) { - Node parent = paragraph.getParent(); - if (parent != null) { - Node gramps = parent.getParent(); - if (gramps != null && gramps instanceof ListBlock) { - ListBlock list = (ListBlock) gramps; - return list.isTight(); - } - } - return false; - } - - private Map<String, String> getAttrs(Node node) { - return getAttrs(node, Collections.<String, String>emptyMap()); - } - - private Map<String, String> getAttrs(Node node, 
Map<String, String> defaultAttributes) { - Map<String, String> attrs = new LinkedHashMap<>(defaultAttributes); - setCustomAttributes(node, attrs); - return attrs; - } - - private void setCustomAttributes(Node node, Map<String, String> attrs) { - for (AttributeProvider attributeProvider : attributeProviders) { - attributeProvider.setAttributes(node, attrs); - } - } - } -} diff --git a/commonmark/src/main/java/org/commonmark/internal/BlockContent.java b/commonmark/src/main/java/org/commonmark/internal/BlockContent.java index f278c20c0..9a9ce6f44 100644 --- a/commonmark/src/main/java/org/commonmark/internal/BlockContent.java +++ b/commonmark/src/main/java/org/commonmark/internal/BlockContent.java @@ -22,10 +22,6 @@ public void add(CharSequence line) { lineCount++; } - public boolean hasSingleLine() { - return lineCount == 1; - } - public String getString() { return sb.toString(); } diff --git a/commonmark/src/main/java/org/commonmark/internal/BlockQuoteParser.java b/commonmark/src/main/java/org/commonmark/internal/BlockQuoteParser.java index 247af08cc..572c491f8 100644 --- a/commonmark/src/main/java/org/commonmark/internal/BlockQuoteParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/BlockQuoteParser.java @@ -1,8 +1,10 @@ package org.commonmark.internal; +import org.commonmark.internal.util.Parsing; import org.commonmark.node.Block; import org.commonmark.node.BlockQuote; import org.commonmark.parser.block.*; +import org.commonmark.text.Characters; public class BlockQuoteParser extends AbstractBlockParser { @@ -26,29 +28,34 @@ public BlockQuote getBlock() { @Override public BlockContinue tryContinue(ParserState state) { int nextNonSpace = state.getNextNonSpaceIndex(); - CharSequence line = state.getLine(); - if (state.getIndent() <= 3 && nextNonSpace < line.length() && line.charAt(nextNonSpace) == '>') { - int newIndex = nextNonSpace + 1; - if (newIndex < line.length() && line.charAt(newIndex) == ' ') { - newIndex++; + if (isMarker(state, nextNonSpace)) 
{ + int newColumn = state.getColumn() + state.getIndent() + 1; + // optional following space or tab + if (Characters.isSpaceOrTab(state.getLine().getContent(), nextNonSpace + 1)) { + newColumn++; } - return BlockContinue.atIndex(newIndex); + return BlockContinue.atColumn(newColumn); } else { return BlockContinue.none(); } } + private static boolean isMarker(ParserState state, int index) { + CharSequence line = state.getLine().getContent(); + return state.getIndent() < Parsing.CODE_BLOCK_INDENT && index < line.length() && line.charAt(index) == '>'; + } + public static class Factory extends AbstractBlockParserFactory { + @Override public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { - CharSequence line = state.getLine(); int nextNonSpace = state.getNextNonSpaceIndex(); - if (state.getIndent() < 4 && line.charAt(nextNonSpace) == '>') { - int newOffset = nextNonSpace + 1; - // optional following space - if (newOffset < line.length() && line.charAt(newOffset) == ' ') { - newOffset++; + if (isMarker(state, nextNonSpace)) { + int newColumn = state.getColumn() + state.getIndent() + 1; + // optional following space or tab + if (Characters.isSpaceOrTab(state.getLine().getContent(), nextNonSpace + 1)) { + newColumn++; } - return BlockStart.of(new BlockQuoteParser()).atIndex(newOffset); + return BlockStart.of(new BlockQuoteParser()).atColumn(newColumn); } else { return BlockStart.none(); } diff --git a/commonmark/src/main/java/org/commonmark/internal/BlockStartImpl.java b/commonmark/src/main/java/org/commonmark/internal/BlockStartImpl.java index c7e967d46..516f944b2 100644 --- a/commonmark/src/main/java/org/commonmark/internal/BlockStartImpl.java +++ b/commonmark/src/main/java/org/commonmark/internal/BlockStartImpl.java @@ -9,6 +9,7 @@ public class BlockStartImpl extends BlockStart { private int newIndex = -1; private int newColumn = -1; private boolean replaceActiveBlockParser = false; + private int replaceParagraphLines = 0; public 
BlockStartImpl(BlockParser... blockParsers) { this.blockParsers = blockParsers; @@ -30,6 +31,10 @@ public boolean isReplaceActiveBlockParser() { return replaceActiveBlockParser; } + int getReplaceParagraphLines() { + return replaceParagraphLines; + } + @Override public BlockStart atIndex(int newIndex) { this.newIndex = newIndex; @@ -48,4 +53,12 @@ public BlockStart replaceActiveBlockParser() { return this; } + @Override + public BlockStart replaceParagraphLines(int lines) { + if (!(lines >= 1)) { + throw new IllegalArgumentException("Lines must be >= 1"); + } + this.replaceParagraphLines = lines; + return this; + } } diff --git a/commonmark/src/main/java/org/commonmark/internal/Bracket.java b/commonmark/src/main/java/org/commonmark/internal/Bracket.java new file mode 100644 index 000000000..c04b6ecda --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/Bracket.java @@ -0,0 +1,73 @@ +package org.commonmark.internal; + +import org.commonmark.node.Text; +import org.commonmark.parser.beta.Position; + +/** + * Opening bracket for links ({@code [}), images ({@code ![}), or links with other markers. + */ +public class Bracket { + + /** + * The node of a marker such as {@code !} if present, null otherwise. + */ + public final Text markerNode; + + /** + * The position of the marker if present, null otherwise. + */ + public final Position markerPosition; + + /** + * The node of {@code [}. + */ + public final Text bracketNode; + + /** + * The position of {@code [}. + */ + public final Position bracketPosition; + + /** + * The position of the content (after the opening bracket) + */ + public final Position contentPosition; + + /** + * Previous bracket. + */ + public final Bracket previous; + + /** + * Previous delimiter (emphasis, etc) before this bracket. + */ + public final Delimiter previousDelimiter; + + /** + * Whether this bracket is allowed to form a link/image (also known as "active"). 
+ */ + public boolean allowed = true; + + /** + * Whether there is an unescaped bracket (opening or closing) after this opening bracket in the text parsed so far. + */ + public boolean bracketAfter = false; + + static public Bracket link(Text bracketNode, Position bracketPosition, Position contentPosition, Bracket previous, Delimiter previousDelimiter) { + return new Bracket(null, null, bracketNode, bracketPosition, contentPosition, previous, previousDelimiter); + } + + static public Bracket withMarker(Text markerNode, Position markerPosition, Text bracketNode, Position bracketPosition, Position contentPosition, Bracket previous, Delimiter previousDelimiter) { + return new Bracket(markerNode, markerPosition, bracketNode, bracketPosition, contentPosition, previous, previousDelimiter); + } + + private Bracket(Text markerNode, Position markerPosition, Text bracketNode, Position bracketPosition, Position contentPosition, Bracket previous, Delimiter previousDelimiter) { + this.markerNode = markerNode; + this.markerPosition = markerPosition; + this.bracketNode = bracketNode; + this.bracketPosition = bracketPosition; + this.contentPosition = contentPosition; + this.previous = previous; + this.previousDelimiter = previousDelimiter; + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/Definitions.java b/commonmark/src/main/java/org/commonmark/internal/Definitions.java new file mode 100644 index 000000000..0377842c9 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/Definitions.java @@ -0,0 +1,33 @@ +package org.commonmark.internal; + +import org.commonmark.node.DefinitionMap; + +import java.util.HashMap; +import java.util.Map; + +public class Definitions { + + private final Map<Class<?>, DefinitionMap<?>> definitionsByType = new HashMap<>(); + + public <D> void addDefinitions(DefinitionMap<D> definitionMap) { + var existingMap = getMap(definitionMap.getType()); + if (existingMap == null) { + definitionsByType.put(definitionMap.getType(), 
definitionMap); + } else { + existingMap.addAll(definitionMap); + } + } + + public <V> V getDefinition(Class<V> type, String label) { + var definitionMap = getMap(type); + if (definitionMap == null) { + return null; + } + return definitionMap.get(label); + } + + private <V> DefinitionMap<V> getMap(Class<V> type) { + //noinspection unchecked + return (DefinitionMap<V>) definitionsByType.get(type); + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/Delimiter.java b/commonmark/src/main/java/org/commonmark/internal/Delimiter.java index 127a834b5..9083ce3cb 100644 --- a/commonmark/src/main/java/org/commonmark/internal/Delimiter.java +++ b/commonmark/src/main/java/org/commonmark/internal/Delimiter.java @@ -1,61 +1,82 @@ package org.commonmark.internal; -import org.commonmark.node.Node; import org.commonmark.node.Text; +import org.commonmark.parser.delimiter.DelimiterRun; -class Delimiter { +import java.util.List; - final Text node; - final int index; +/** + * Delimiter (emphasis, strong emphasis or custom emphasis). + */ +public class Delimiter implements DelimiterRun { - Delimiter previous; - Delimiter next; + public final List<Text> characters; + public final char delimiterChar; + private final int originalLength; - char delimiterChar; - int numDelims = 1; + // Can open emphasis, see spec. + private final boolean canOpen; - /** - * Can open emphasis, see spec. - */ - boolean canOpen = true; + // Can close emphasis, see spec. + private final boolean canClose; - /** - * Can close emphasis, see spec. - */ - boolean canClose = false; + public Delimiter previous; + public Delimiter next; - /** - * Whether this delimiter is allowed to form a link/image. 
- */ - boolean allowed = true; + public Delimiter(List<Text> characters, char delimiterChar, boolean canOpen, boolean canClose, Delimiter previous) { + this.characters = characters; + this.delimiterChar = delimiterChar; + this.canOpen = canOpen; + this.canClose = canClose; + this.previous = previous; + this.originalLength = characters.size(); + } - /** - * Skip this delimiter when looking for a link/image opener because it was already matched. - */ - boolean matched = false; + @Override + public boolean canOpen() { + return canOpen; + } - Delimiter(Text node, Delimiter previous, int index) { - this.node = node; - this.previous = previous; - this.index = index; + @Override + public boolean canClose() { + return canClose; } - Text getPreviousNonDelimiterTextNode() { - Node previousNode = node.getPrevious(); - if (previousNode instanceof Text && (this.previous == null || this.previous.node != previousNode)) { - return (Text) previousNode; - } else { - return null; - } + @Override + public int length() { + return characters.size(); + } + + @Override + public int originalLength() { + return originalLength; } - Text getNextNonDelimiterTextNode() { - Node nextNode = node.getNext(); - if (nextNode instanceof Text && (this.next == null || this.next.node != nextNode)) { - return (Text) nextNode; - } else { - return null; + @Override + public Text getOpener() { + return characters.get(characters.size() - 1); + } + + @Override + public Text getCloser() { + return characters.get(0); + } + + @Override + public Iterable<Text> getOpeners(int length) { + if (!(length >= 1 && length <= length())) { + throw new IllegalArgumentException("length must be between 1 and " + length() + ", was " + length); } + + return characters.subList(characters.size() - length, characters.size()); } + @Override + public Iterable<Text> getClosers(int length) { + if (!(length >= 1 && length <= length())) { + throw new IllegalArgumentException("length must be between 1 and " + length() + ", was " + 
length); + } + + return characters.subList(0, length); + } } diff --git a/commonmark/src/main/java/org/commonmark/internal/DelimiterRun.java b/commonmark/src/main/java/org/commonmark/internal/DelimiterRun.java deleted file mode 100644 index a8a363fa8..000000000 --- a/commonmark/src/main/java/org/commonmark/internal/DelimiterRun.java +++ /dev/null @@ -1,15 +0,0 @@ -package org.commonmark.internal; - -class DelimiterRun { - - final int count; - final boolean canClose; - final boolean canOpen; - - DelimiterRun(int count, boolean canOpen, boolean canClose) { - this.count = count; - this.canOpen = canOpen; - this.canClose = canClose; - } - -} diff --git a/commonmark/src/main/java/org/commonmark/internal/DocumentBlockParser.java b/commonmark/src/main/java/org/commonmark/internal/DocumentBlockParser.java index 4a30544e7..db3d3854f 100644 --- a/commonmark/src/main/java/org/commonmark/internal/DocumentBlockParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/DocumentBlockParser.java @@ -2,6 +2,7 @@ import org.commonmark.node.Block; import org.commonmark.node.Document; +import org.commonmark.parser.SourceLine; import org.commonmark.parser.block.AbstractBlockParser; import org.commonmark.parser.block.BlockContinue; import org.commonmark.parser.block.ParserState; @@ -31,7 +32,7 @@ public BlockContinue tryContinue(ParserState state) { } @Override - public void addLine(CharSequence line) { + public void addLine(SourceLine line) { } } diff --git a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java index aeab876d4..07d97296b 100644 --- a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java @@ -1,27 +1,53 @@ package org.commonmark.internal; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.Reader; +import org.commonmark.internal.util.LineReader; import 
org.commonmark.internal.util.Parsing; -import org.commonmark.internal.util.Substring; import org.commonmark.node.*; +import org.commonmark.parser.IncludeSourceSpans; +import org.commonmark.parser.InlineParserFactory; +import org.commonmark.parser.SourceLine; +import org.commonmark.parser.SourceLines; +import org.commonmark.parser.beta.LinkProcessor; +import org.commonmark.parser.beta.InlineContentParserFactory; import org.commonmark.parser.block.*; +import org.commonmark.parser.delimiter.DelimiterProcessor; +import org.commonmark.text.Characters; +import java.io.IOException; +import java.io.Reader; import java.util.*; public class DocumentParser implements ParserState { - private static List<BlockParserFactory> CORE_FACTORIES = Arrays.<BlockParserFactory>asList( - new BlockQuoteParser.Factory(), - new HeaderParser.Factory(), - new FencedCodeBlockParser.Factory(), - new HtmlBlockParser.Factory(), - new HorizontalRuleParser.Factory(), - new ListBlockParser.Factory(), - new IndentedCodeBlockParser.Factory()); + private static final Set<Class<? extends Block>> CORE_FACTORY_TYPES = new LinkedHashSet<>(List.of( + BlockQuote.class, + Heading.class, + FencedCodeBlock.class, + HtmlBlock.class, + ThematicBreak.class, + ListBlock.class, + IndentedCodeBlock.class)); - private CharSequence line; + private static final Map<Class<? extends Block>, BlockParserFactory> NODES_TO_CORE_FACTORIES; + + static { + Map<Class<? 
extends Block>, BlockParserFactory> map = new HashMap<>(); + map.put(BlockQuote.class, new BlockQuoteParser.Factory()); + map.put(Heading.class, new HeadingParser.Factory()); + map.put(FencedCodeBlock.class, new FencedCodeBlockParser.Factory()); + map.put(HtmlBlock.class, new HtmlBlockParser.Factory()); + map.put(ThematicBreak.class, new ThematicBreakParser.Factory()); + map.put(ListBlock.class, new ListBlockParser.Factory()); + map.put(IndentedCodeBlock.class, new IndentedCodeBlockParser.Factory()); + NODES_TO_CORE_FACTORIES = Collections.unmodifiableMap(map); + } + + private SourceLine line; + + /** + * Line index (0-based) + */ + private int lineIndex = -1; /** * current index (offset) in input line (0-based) @@ -33,75 +59,110 @@ public class DocumentParser implements ParserState { */ private int column = 0; + /** + * if the current column is within a tab character (partially consumed tab) + */ + private boolean columnIsInTab; + private int nextNonSpace = 0; private int nextNonSpaceColumn = 0; - private boolean blank; - private int indent = 0; + private boolean blank; private final List<BlockParserFactory> blockParserFactories; - private final InlineParserImpl inlineParser; + private final InlineParserFactory inlineParserFactory; + private final List<InlineContentParserFactory> inlineContentParserFactories; + private final List<DelimiterProcessor> delimiterProcessors; + private final List<LinkProcessor> linkProcessors; + private final Set<Character> linkMarkers; + private final IncludeSourceSpans includeSourceSpans; + private final int maxOpenBlockParsers; private final DocumentBlockParser documentBlockParser; + private final Definitions definitions = new Definitions(); - private List<BlockParser> activeBlockParsers = new ArrayList<>(); - private Set<BlockParser> allBlockParsers = new HashSet<>(); - private Map<Node, Boolean> lastLineBlank = new HashMap<>(); + private final List<OpenBlockParser> openBlockParsers = new ArrayList<>(); + private final 
List<BlockParser> allBlockParsers = new ArrayList<>(); - public DocumentParser(List<BlockParserFactory> blockParserFactories, InlineParserImpl inlineParser) { + public DocumentParser(List<BlockParserFactory> blockParserFactories, InlineParserFactory inlineParserFactory, + List<InlineContentParserFactory> inlineContentParserFactories, List<DelimiterProcessor> delimiterProcessors, + List<LinkProcessor> linkProcessors, Set<Character> linkMarkers, + IncludeSourceSpans includeSourceSpans, int maxOpenBlockParsers) { this.blockParserFactories = blockParserFactories; - this.inlineParser = inlineParser; - + this.inlineParserFactory = inlineParserFactory; + this.inlineContentParserFactories = inlineContentParserFactories; + this.delimiterProcessors = delimiterProcessors; + this.linkProcessors = linkProcessors; + this.linkMarkers = linkMarkers; + this.includeSourceSpans = includeSourceSpans; + this.maxOpenBlockParsers = maxOpenBlockParsers; + this.documentBlockParser = new DocumentBlockParser(); - activateBlockParser(this.documentBlockParser); + activateBlockParser(new OpenBlockParser(documentBlockParser, 0)); + } + + public static Set<Class<? extends Block>> getDefaultBlockParserTypes() { + return CORE_FACTORY_TYPES; } - public static List<BlockParserFactory> calculateBlockParserFactories(List<BlockParserFactory> customBlockParserFactories) { + public static List<BlockParserFactory> calculateBlockParserFactories(List<BlockParserFactory> customBlockParserFactories, Set<Class<? extends Block>> enabledBlockTypes) { List<BlockParserFactory> list = new ArrayList<>(); - list.addAll(DocumentParser.CORE_FACTORIES); + // By having the custom factories come first, extensions are able to change behavior of core syntax. list.addAll(customBlockParserFactories); + for (Class<? extends Block> blockType : enabledBlockTypes) { + list.add(NODES_TO_CORE_FACTORIES.get(blockType)); + } return list; } + public static void checkEnabledBlockTypes(Set<Class<? 
extends Block>> enabledBlockTypes) { + for (Class<? extends Block> enabledBlockType : enabledBlockTypes) { + if (!NODES_TO_CORE_FACTORIES.containsKey(enabledBlockType)) { + throw new IllegalArgumentException("Can't enable block type " + enabledBlockType + ", possible options are: " + NODES_TO_CORE_FACTORIES.keySet()); + } + } + } + /** * The main parsing function. Returns a parsed document AST. */ public Document parse(String input) { int lineStart = 0; int lineBreak; - while ((lineBreak = Parsing.findLineBreak(input, lineStart)) != -1) { - CharSequence line = Substring.of(input, lineStart, lineBreak); - incorporateLine(line); + while ((lineBreak = Characters.findLineBreak(input, lineStart)) != -1) { + String line = input.substring(lineStart, lineBreak); + parseLine(line, lineStart); if (lineBreak + 1 < input.length() && input.charAt(lineBreak) == '\r' && input.charAt(lineBreak + 1) == '\n') { lineStart = lineBreak + 2; } else { lineStart = lineBreak + 1; } } - if (input.length() > 0 && (lineStart == 0 || lineStart < input.length())) { - incorporateLine(Substring.of(input, lineStart, input.length())); + if (!input.isEmpty() && (lineStart == 0 || lineStart < input.length())) { + String line = input.substring(lineStart); + parseLine(line, lineStart); } return finalizeAndProcess(); } - + public Document parse(Reader input) throws IOException { - BufferedReader bufferedReader; - if (input instanceof BufferedReader) { - bufferedReader = (BufferedReader) input; - } else { - bufferedReader = new BufferedReader(input); - } - + var lineReader = new LineReader(input); + int inputIndex = 0; String line; - while ((line = bufferedReader.readLine()) != null) { - incorporateLine(line); + while ((line = lineReader.readLine()) != null) { + parseLine(line, inputIndex); + inputIndex += line.length(); + var eol = lineReader.getLineTerminator(); + if (eol != null) { + inputIndex += eol.length(); + } } return finalizeAndProcess(); } @Override - public CharSequence getLine() { + public 
SourceLine getLine() { return line; } @@ -132,33 +193,31 @@ public boolean isBlank() { @Override public BlockParser getActiveBlockParser() { - return activeBlockParsers.get(activeBlockParsers.size() - 1); + return openBlockParsers.get(openBlockParsers.size() - 1).blockParser; } /** * Analyze a line of text and update the document appropriately. We parse markdown text by calling this on each * line of input, then finalizing the document. */ - private void incorporateLine(CharSequence ln) { - line = Parsing.prepareLine(ln); - index = 0; - column = 0; - nextNonSpace = 0; - nextNonSpaceColumn = 0; + private void parseLine(String ln, int inputIndex) { + setLine(ln, inputIndex); // For each containing block, try to parse the associated line start. - // Bail out on failure: container will point to the last matching block. - // Set all_matched to false if not all containers match. - // The document will always match, can be skipped + // The document will always match, so we can skip the first block parser and start at 1 matches int matches = 1; - for (BlockParser blockParser : activeBlockParsers.subList(1, activeBlockParsers.size())) { + for (int i = 1; i < openBlockParsers.size(); i++) { + OpenBlockParser openBlockParser = openBlockParsers.get(i); + BlockParser blockParser = openBlockParser.blockParser; findNextNonSpace(); BlockContinue result = blockParser.tryContinue(this); if (result instanceof BlockContinueImpl) { BlockContinueImpl blockContinue = (BlockContinueImpl) result; + openBlockParser.sourceIndex = getIndex(); if (blockContinue.isFinalize()) { - finalize(blockParser); + addSourceSpans(); + closeBlockParsers(openBlockParsers.size() - i); return; } else { if (blockContinue.getNewIndex() != -1) { @@ -173,25 +232,21 @@ private void incorporateLine(CharSequence ln) { } } - List<BlockParser> unmatchedBlockParsers = new ArrayList<>(activeBlockParsers.subList(matches, activeBlockParsers.size())); - BlockParser lastMatchedBlockParser = activeBlockParsers.get(matches - 
1); - BlockParser blockParser = lastMatchedBlockParser; - boolean allClosed = unmatchedBlockParsers.isEmpty(); + int unmatchedBlocks = openBlockParsers.size() - matches; + BlockParser blockParser = openBlockParsers.get(matches - 1).blockParser; + boolean startedNewBlock = false; - // Check to see if we've hit 2nd blank line; if so break out of list: - if (isBlank() && isLastLineBlank(blockParser.getBlock())) { - List<BlockParser> matchedBlockParsers = new ArrayList<>(activeBlockParsers.subList(0, matches)); - breakOutOfLists(matchedBlockParsers); - } + int lastIndex = index; // Unless last matched container is a code block, try new container starts, // adding children to the last matched container: boolean tryBlockStarts = blockParser.getBlock() instanceof Paragraph || blockParser.isContainer(); while (tryBlockStarts) { + lastIndex = index; findNextNonSpace(); // this is a little performance optimization: - if (isBlank() || (indent < IndentedCodeBlockParser.INDENT && Parsing.isLetter(line, nextNonSpace))) { + if (isBlank() || (indent < Parsing.CODE_BLOCK_INDENT && Characters.isLetter(this.line.getContent(), nextNonSpace))) { setNewIndex(nextNonSpace); break; } @@ -202,9 +257,13 @@ private void incorporateLine(CharSequence ln) { break; } - if (!allClosed) { - finalizeBlocks(unmatchedBlockParsers); - allClosed = true; + startedNewBlock = true; + int sourceIndex = getIndex(); + + // We're starting a new block. If we have any previous blocks that need to be closed, we need to do it now. 
+ if (unmatchedBlocks > 0) { + closeBlockParsers(unmatchedBlocks); + unmatchedBlocks = 0; } if (blockStart.getNewIndex() != -1) { @@ -213,12 +272,24 @@ private void incorporateLine(CharSequence ln) { setNewColumn(blockStart.getNewColumn()); } - if (blockStart.isReplaceActiveBlockParser()) { - removeActiveBlockParser(); + List<SourceSpan> replacedSourceSpans = null; + if (blockStart.getReplaceParagraphLines() >= 1 || blockStart.isReplaceActiveBlockParser()) { + var activeBlockParser = getActiveBlockParser(); + if (activeBlockParser instanceof ParagraphParser) { + var paragraphParser = (ParagraphParser) activeBlockParser; + var lines = blockStart.isReplaceActiveBlockParser() ? Integer.MAX_VALUE : blockStart.getReplaceParagraphLines(); + replacedSourceSpans = replaceParagraphLines(lines, paragraphParser); + } else if (blockStart.isReplaceActiveBlockParser()) { + replacedSourceSpans = prepareActiveBlockParserForReplacement(activeBlockParser); + } } for (BlockParser newBlockParser : blockStart.getBlockParsers()) { - blockParser = addChild(newBlockParser); + addChild(new OpenBlockParser(newBlockParser, sourceIndex)); + if (replacedSourceSpans != null) { + newBlockParser.getBlock().setSourceSpans(replacedSourceSpans); + } + blockParser = newBlockParser; tryBlockStarts = newBlockParser.isContainer(); } } @@ -226,37 +297,62 @@ private void incorporateLine(CharSequence ln) { // What remains at the offset is a text line. Add the text to the // appropriate block. 
- // First check for a lazy paragraph continuation: - if (!allClosed && !isBlank() && - getActiveBlockParser() instanceof ParagraphParser) { + // First check for a lazy continuation line + if (!startedNewBlock && !isBlank() && + getActiveBlockParser().canHaveLazyContinuationLines()) { + openBlockParsers.get(openBlockParsers.size() - 1).sourceIndex = lastIndex; // lazy paragraph continuation addLine(); } else { // finalize any blocks not matched - if (!allClosed) { - finalizeBlocks(unmatchedBlockParsers); + if (unmatchedBlocks > 0) { + closeBlockParsers(unmatchedBlocks); } - propagateLastLineBlank(blockParser, lastMatchedBlockParser); if (!blockParser.isContainer()) { addLine(); } else if (!isBlank()) { // create paragraph container for line - addChild(new ParagraphParser()); + ParagraphParser paragraphParser = new ParagraphParser(); + addChild(new OpenBlockParser(paragraphParser, lastIndex)); addLine(); + } else { + // This can happen for a list item like this: + // ``` + // * + // list item + // ``` + // + // The first line does not start a paragraph yet, but we still want to record source positions. 
+ addSourceSpans(); } } } + private void setLine(String ln, int inputIndex) { + lineIndex++; + index = 0; + column = 0; + columnIsInTab = false; + + String lineContent = prepareLine(ln); + SourceSpan sourceSpan = null; + if (includeSourceSpans != IncludeSourceSpans.NONE) { + sourceSpan = SourceSpan.of(lineIndex, 0, inputIndex, lineContent.length()); + } + this.line = SourceLine.of(lineContent, sourceSpan); + } + private void findNextNonSpace() { int i = index; int cols = column; blank = true; - while (i < line.length()) { - char c = line.charAt(i); + int length = line.getContent().length(); + while (i < length) { + char c = line.getContent().charAt(i); switch (c) { case ' ': i++; @@ -282,9 +378,12 @@ private void setNewIndex(int newIndex) { index = nextNonSpace; column = nextNonSpaceColumn; } - while (index < newIndex && index != line.length()) { + int length = line.getContent().length(); + while (index < newIndex && index != length) { advance(); } + // If we're going to an index as opposed to a column, we're never within a tab + columnIsInTab = false; } private void setNewColumn(int newColumn) { @@ -293,23 +392,81 @@ private void setNewColumn(int newColumn) { index = nextNonSpace; column = nextNonSpaceColumn; } - while (column < newColumn && index != line.length()) { + int length = line.getContent().length(); + while (column < newColumn && index != length) { advance(); } + if (column > newColumn) { + // Last character was a tab and we overshot our target + index--; + column = newColumn; + columnIsInTab = true; + } else { + columnIsInTab = false; + } } private void advance() { - char c = line.charAt(index); + char c = line.getContent().charAt(index); + index++; if (c == '\t') { - index++; - column += (4 - (column % 4)); + column += Parsing.columnsToNextTabStop(column); } else { - index++; column++; } } + /** + * Add line content to the active block parser. We assume it can accept lines -- that check should be done before + * calling this. 
+ */ + private void addLine() { + CharSequence content; + if (columnIsInTab) { + // Our column is in a partially consumed tab. Expand the remaining columns (to the next tab stop) to spaces. + int afterTab = index + 1; + CharSequence rest = line.getContent().subSequence(afterTab, line.getContent().length()); + int spaces = Parsing.columnsToNextTabStop(column); + StringBuilder sb = new StringBuilder(spaces + rest.length()); + for (int i = 0; i < spaces; i++) { + sb.append(' '); + } + sb.append(rest); + content = sb.toString(); + } else if (index == 0) { + content = line.getContent(); + } else { + content = line.getContent().subSequence(index, line.getContent().length()); + } + SourceSpan sourceSpan = null; + if (includeSourceSpans == IncludeSourceSpans.BLOCKS_AND_INLINES && index < line.getSourceSpan().getLength()) { + // Note that if we're in a partially-consumed tab the length of the source span and the content don't match. + sourceSpan = line.getSourceSpan().subSpan(index); + } + getActiveBlockParser().addLine(SourceLine.of(content, sourceSpan)); + addSourceSpans(); + } + + private void addSourceSpans() { + if (includeSourceSpans != IncludeSourceSpans.NONE) { + // Don't add source spans for Document itself (it would get the whole source text), so start at 1, not 0 + for (int i = 1; i < openBlockParsers.size(); i++) { + var openBlockParser = openBlockParsers.get(i); + // In case of a lazy continuation line, the index is less than where the block parser would expect the + // contents to start, so let's use whichever is smaller. 
+ int blockIndex = Math.min(openBlockParser.sourceIndex, index); + int length = line.getContent().length() - blockIndex; + if (length != 0) { + openBlockParser.blockParser.addSourceSpan(line.getSourceSpan().subSpan(blockIndex)); + } + } + } + } + private BlockStartImpl findBlockStart(BlockParser blockParser) { + if (openBlockParsers.size() > maxOpenBlockParsers) { + return null; + } MatchedBlockParser matchedBlockParser = new MatchedBlockParserImpl(blockParser); for (BlockParserFactory blockParserFactory : blockParserFactories) { BlockStart result = blockParserFactory.tryStart(this, matchedBlockParser); @@ -320,182 +477,101 @@ private BlockStartImpl findBlockStart(BlockParser blockParser) { return null; } - /** - * Finalize a block. Close it and do any necessary postprocessing, e.g. creating string_content from strings, - * setting the 'tight' or 'loose' status of a list, and parsing the beginnings of paragraphs for reference - * definitions. - */ - private void finalize(BlockParser blockParser) { - if (getActiveBlockParser() == blockParser) { - deactivateBlockParser(); - } - - blockParser.closeBlock(); - - if (blockParser instanceof ParagraphParser) { - ParagraphParser paragraphParser = (ParagraphParser) blockParser; - paragraphParser.closeBlock(inlineParser); - } else if (blockParser instanceof ListBlockParser) { - ListBlockParser listBlockParser = (ListBlockParser) blockParser; - finalizeListTight(listBlockParser); - } - } - /** * Walk through a block & children recursively, parsing string content into inline content where appropriate. 
*/ private void processInlines() { - for (BlockParser blockParser : allBlockParsers) { - blockParser.parseInlines(inlineParser); - } - } + var context = new InlineParserContextImpl(inlineContentParserFactories, delimiterProcessors, linkProcessors, linkMarkers, definitions); + var inlineParser = inlineParserFactory.create(context); - private void finalizeListTight(ListBlockParser listBlockParser) { - Node item = listBlockParser.getBlock().getFirstChild(); - while (item != null) { - // check for non-final list item ending with blank line: - if (endsWithBlankLine(item) && item.getNext() != null) { - listBlockParser.setTight(false); - break; - } - // recurse into children of list item, to see if there are - // spaces between any of them: - Node subItem = item.getFirstChild(); - while (subItem != null) { - if (endsWithBlankLine(subItem) && (item.getNext() != null || subItem.getNext() != null)) { - listBlockParser.setTight(false); - break; - } - subItem = subItem.getNext(); - } - item = item.getNext(); - } - } - - private boolean endsWithBlankLine(Node block) { - while (block != null) { - if (isLastLineBlank(block)) { - return true; - } - if (block instanceof ListBlock || block instanceof ListItem) { - block = block.getLastChild(); - } else { - break; - } + for (var blockParser : allBlockParsers) { + blockParser.parseInlines(inlineParser); } - return false; } /** - * Break out of all containing lists, resetting the tip of the document to the parent of the highest list, - * and finalizing all the lists. (This is used to implement the "two blank lines break of of all lists" feature.) + * Add block of type tag as a child of the tip. If the tip can't accept children, close and finalize it and try + * its parent, and so on until we find a block that can accept children. 
*/ - private void breakOutOfLists(List<BlockParser> blockParsers) { - int lastList = -1; - for (int i = blockParsers.size() - 1; i >= 0; i--) { - BlockParser blockParser = blockParsers.get(i); - if (blockParser instanceof ListBlockParser) { - lastList = i; - } + private void addChild(OpenBlockParser openBlockParser) { + while (!getActiveBlockParser().canContain(openBlockParser.blockParser.getBlock())) { + closeBlockParsers(1); } - if (lastList != -1) { - finalizeBlocks(blockParsers.subList(lastList, blockParsers.size())); - } - } - - /** - * Add a line to the block at the tip. We assume the tip can accept lines -- that check should be done before - * calling this. - */ - private void addLine() { - getActiveBlockParser().addLine(line.subSequence(index, line.length())); + getActiveBlockParser().getBlock().appendChild(openBlockParser.blockParser.getBlock()); + activateBlockParser(openBlockParser); } - /** - * Add block of type tag as a child of the tip. If the tip can't accept children, close and finalize it and try - * its parent, and so on til we find a block that can accept children. 
- */ - private <T extends BlockParser> T addChild(T blockParser) { - while (!getActiveBlockParser().canContain(blockParser.getBlock())) { - finalize(getActiveBlockParser()); - } - - getActiveBlockParser().getBlock().appendChild(blockParser.getBlock()); - activateBlockParser(blockParser); - - return blockParser; + private void activateBlockParser(OpenBlockParser openBlockParser) { + openBlockParsers.add(openBlockParser); } - private void activateBlockParser(BlockParser blockParser) { - activeBlockParsers.add(blockParser); - allBlockParsers.add(blockParser); + private OpenBlockParser deactivateBlockParser() { + return openBlockParsers.remove(openBlockParsers.size() - 1); } - private void deactivateBlockParser() { - activeBlockParsers.remove(activeBlockParsers.size() - 1); + private List<SourceSpan> replaceParagraphLines(int lines, ParagraphParser paragraphParser) { + // Remove lines from paragraph as the new block is using them. + // If all lines are used, this also unlinks the Paragraph block. + var sourceSpans = paragraphParser.removeLines(lines); + // Close the paragraph block parser, which will finalize it. + closeBlockParsers(1); + return sourceSpans; } - private void removeActiveBlockParser() { - BlockParser old = getActiveBlockParser(); + private List<SourceSpan> prepareActiveBlockParserForReplacement(BlockParser blockParser) { + // Note that we don't want to parse inlines here, as it's getting replaced. deactivateBlockParser(); - allBlockParsers.remove(old); - old.getBlock().unlink(); + // Do this so that source positions are calculated, which we will carry over to the replacing block. 
+ blockParser.closeBlock(); + blockParser.getBlock().unlink(); + return blockParser.getBlock().getSourceSpans(); } - private void propagateLastLineBlank(BlockParser blockParser, BlockParser lastMatchedBlockParser) { - if (isBlank() && blockParser.getBlock().getLastChild() != null) { - setLastLineBlank(blockParser.getBlock().getLastChild(), true); - } + private Document finalizeAndProcess() { + closeBlockParsers(openBlockParsers.size()); + processInlines(); + return documentBlockParser.getBlock(); + } - Block block = blockParser.getBlock(); - - // Block quote lines are never blank as they start with > - // and we don't count blanks in fenced code for purposes of tight/loose - // lists or breaking out of lists. We also don't set lastLineBlank - // on an empty list item. - boolean lastLineBlank = isBlank() && - !(block instanceof BlockQuote || - block instanceof FencedCodeBlock || - (block instanceof ListItem && - block.getFirstChild() == null && - blockParser != lastMatchedBlockParser)); - - // Propagate lastLineBlank up through parents - Node node = blockParser.getBlock(); - while (node != null) { - setLastLineBlank(node, lastLineBlank); - node = node.getParent(); + private void closeBlockParsers(int count) { + for (int i = 0; i < count; i++) { + BlockParser blockParser = deactivateBlockParser().blockParser; + finalize(blockParser); + // Remember for inline parsing. Note that a lot of blocks don't need inline parsing. We could have a + // separate interface (e.g. BlockParserWithInlines) so that we only have to remember those that actually + // have inlines to parse. + allBlockParsers.add(blockParser); } } - private void setLastLineBlank(Node node, boolean value) { - lastLineBlank.put(node, value); + /** + * Finalize a block. Close it and do any necessary postprocessing, e.g. setting the content of blocks and + * collecting link reference definitions from paragraphs. 
+ */ + private void finalize(BlockParser blockParser) { + addDefinitionsFrom(blockParser); + blockParser.closeBlock(); } - private boolean isLastLineBlank(Node node) { - Boolean value = lastLineBlank.get(node); - return value != null && value; + private void addDefinitionsFrom(BlockParser blockParser) { + for (var definitionMap : blockParser.getDefinitions()) { + definitions.addDefinitions(definitionMap); + } } /** - * Finalize blocks of previous line. Returns true. + * Prepares the input line replacing {@code \0} */ - private boolean finalizeBlocks(List<BlockParser> blockParsers) { - for (int i = blockParsers.size() - 1; i >= 0; i--) { - BlockParser blockParser = blockParsers.get(i); - finalize(blockParser); + private static String prepareLine(String line) { + if (line.indexOf('\0') == -1) { + return line; + } else { + return line.replace('\0', '\uFFFD'); } - return true; } - private Document finalizeAndProcess() { - finalizeBlocks(this.activeBlockParsers); - this.processInlines(); - return this.documentBlockParser.getBlock(); - } - private static class MatchedBlockParserImpl implements MatchedBlockParser { private final BlockParser matchedBlockParser; @@ -510,14 +586,22 @@ public BlockParser getMatchedBlockParser() { } @Override - public CharSequence getParagraphStartLine() { + public SourceLines getParagraphLines() { if (matchedBlockParser instanceof ParagraphParser) { ParagraphParser paragraphParser = (ParagraphParser) matchedBlockParser; - if (paragraphParser.hasSingleLine()) { - return paragraphParser.getContentString(); - } + return paragraphParser.getParagraphLines(); } - return null; + return SourceLines.empty(); + } + } + + private static class OpenBlockParser { + private final BlockParser blockParser; + private int sourceIndex; + + OpenBlockParser(BlockParser blockParser, int sourceIndex) { + this.blockParser = blockParser; + this.sourceIndex = sourceIndex; } } } diff --git a/commonmark/src/main/java/org/commonmark/internal/FencedCodeBlockParser.java 
b/commonmark/src/main/java/org/commonmark/internal/FencedCodeBlockParser.java index 15c430278..d550f1d25 100644 --- a/commonmark/src/main/java/org/commonmark/internal/FencedCodeBlockParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/FencedCodeBlockParser.java @@ -1,25 +1,28 @@ package org.commonmark.internal; +import org.commonmark.internal.util.Parsing; import org.commonmark.node.Block; import org.commonmark.node.FencedCodeBlock; +import org.commonmark.parser.SourceLine; import org.commonmark.parser.block.*; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import org.commonmark.text.Characters; import static org.commonmark.internal.util.Escaping.unescapeString; public class FencedCodeBlockParser extends AbstractBlockParser { - private static final Pattern OPENING_FENCE = Pattern.compile("^`{3,}(?!.*`)|^~{3,}(?!.*~)"); - private static final Pattern CLOSING_FENCE = Pattern.compile("^(?:`{3,}|~{3,})(?= *$)"); - private final FencedCodeBlock block = new FencedCodeBlock(); - private BlockContent content = new BlockContent(); + private final char fenceChar; + private final int openingFenceLength; + + private String firstLine; + private StringBuilder otherLines = new StringBuilder(); public FencedCodeBlockParser(char fenceChar, int fenceLength, int fenceIndent) { - block.setFenceChar(fenceChar); - block.setFenceLength(fenceLength); + this.fenceChar = fenceChar; + this.openingFenceLength = fenceLength; + block.setFenceCharacter(String.valueOf(fenceChar)); + block.setOpeningFenceLength(fenceLength); block.setFenceIndent(fenceIndent); } @@ -32,20 +35,15 @@ public Block getBlock() { public BlockContinue tryContinue(ParserState state) { int nextNonSpace = state.getNextNonSpaceIndex(); int newIndex = state.getIndex(); - CharSequence line = state.getLine(); - Matcher matcher = null; - boolean matches = (state.getIndent() <= 3 && - nextNonSpace < line.length() && - line.charAt(nextNonSpace) == block.getFenceChar() && - (matcher = 
CLOSING_FENCE.matcher(line.subSequence(nextNonSpace, line.length()))) - .find()); - if (matches && matcher.group(0).length() >= block.getFenceLength()) { + CharSequence line = state.getLine().getContent(); + if (state.getIndent() < Parsing.CODE_BLOCK_INDENT && nextNonSpace < line.length() && tryClosing(line, nextNonSpace)) { // closing fence - we're at end of line, so we can finalize now return BlockContinue.finished(); } else { // skip optional spaces of fence indent int i = block.getFenceIndent(); - while (i > 0 && newIndex < line.length() && line.charAt(newIndex) == ' ') { + int length = line.length(); + while (i > 0 && newIndex < length && line.charAt(newIndex) == ' ') { newIndex++; i--; } @@ -54,46 +52,88 @@ public BlockContinue tryContinue(ParserState state) { } @Override - public void addLine(CharSequence line) { - content.add(line); + public void addLine(SourceLine line) { + if (firstLine == null) { + firstLine = line.getContent().toString(); + } else { + otherLines.append(line.getContent()); + otherLines.append('\n'); + } } @Override public void closeBlock() { - boolean singleLine = content.hasSingleLine(); - // add trailing newline - content.add(""); - String contentString = content.getString(); - content = null; - // first line becomes info string - int firstNewline = contentString.indexOf('\n'); - String firstLine = contentString.substring(0, firstNewline); block.setInfo(unescapeString(firstLine.trim())); - if (singleLine) { - block.setLiteral(""); - } else { - String literal = contentString.substring(firstNewline + 1); - block.setLiteral(literal); - } + block.setLiteral(otherLines.toString()); } public static class Factory extends AbstractBlockParserFactory { @Override public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { + int indent = state.getIndent(); + if (indent >= Parsing.CODE_BLOCK_INDENT) { + return BlockStart.none(); + } + int nextNonSpace = state.getNextNonSpaceIndex(); - CharSequence line = state.getLine(); 
- Matcher matcher; - if (state.getIndent() < 4 && (matcher = OPENING_FENCE.matcher(line.subSequence(nextNonSpace, line.length()))).find()) { - int fenceLength = matcher.group(0).length(); - char fenceChar = matcher.group(0).charAt(0); - FencedCodeBlockParser blockParser = new FencedCodeBlockParser(fenceChar, fenceLength, state.getIndent()); - return BlockStart.of(blockParser).atIndex(nextNonSpace + fenceLength); + FencedCodeBlockParser blockParser = checkOpener(state.getLine().getContent(), nextNonSpace, indent); + if (blockParser != null) { + return BlockStart.of(blockParser).atIndex(nextNonSpace + blockParser.block.getOpeningFenceLength()); } else { return BlockStart.none(); } } } -} + // spec: A code fence is a sequence of at least three consecutive backtick characters (`) or tildes (~). (Tildes and + // backticks cannot be mixed.) + private static FencedCodeBlockParser checkOpener(CharSequence line, int index, int indent) { + int backticks = 0; + int tildes = 0; + int length = line.length(); + loop: + for (int i = index; i < length; i++) { + switch (line.charAt(i)) { + case '`': + backticks++; + break; + case '~': + tildes++; + break; + default: + break loop; + } + } + if (backticks >= 3 && tildes == 0) { + // spec: If the info string comes after a backtick fence, it may not contain any backtick characters. + if (Characters.find('`', line, index + backticks) != -1) { + return null; + } + return new FencedCodeBlockParser('`', backticks, indent); + } else if (tildes >= 3 && backticks == 0) { + // spec: Info strings for tilde code blocks can contain backticks and tildes + return new FencedCodeBlockParser('~', tildes, indent); + } else { + return null; + } + } + + // spec: The content of the code block consists of all subsequent lines, until a closing code fence of the same type + // as the code block began with (backticks or tildes), and with at least as many backticks or tildes as the opening + // code fence. 
+ private boolean tryClosing(CharSequence line, int index) { + int fences = Characters.skip(fenceChar, line, index, line.length()) - index; + if (fences < openingFenceLength) { + return false; + } + // spec: The closing code fence [...] may be followed only by spaces, which are ignored. + int after = Characters.skipSpaceTab(line, index + fences, line.length()); + if (after == line.length()) { + block.setClosingFenceLength(fences); + return true; + } + return false; + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/HeaderParser.java b/commonmark/src/main/java/org/commonmark/internal/HeaderParser.java deleted file mode 100644 index 81ef32bb7..000000000 --- a/commonmark/src/main/java/org/commonmark/internal/HeaderParser.java +++ /dev/null @@ -1,75 +0,0 @@ -package org.commonmark.internal; - -import org.commonmark.node.Block; -import org.commonmark.node.Header; -import org.commonmark.parser.InlineParser; -import org.commonmark.parser.block.*; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class HeaderParser extends AbstractBlockParser { - - private static Pattern ATX_HEADER = Pattern.compile("^#{1,6}(?: +|$)"); - private static Pattern ATX_TRAILING = Pattern.compile("(^| ) *#+ *$"); - private static Pattern SETEXT_HEADER = Pattern.compile("^(?:=+|-+) *$"); - - private final Header block = new Header(); - private final String content; - - public HeaderParser(int level, String content) { - block.setLevel(level); - this.content = content; - } - - @Override - public Block getBlock() { - return block; - } - - @Override - public BlockContinue tryContinue(ParserState parserState) { - // a header can never container > 1 line, so fail to match - return BlockContinue.none(); - } - - @Override - public void parseInlines(InlineParser inlineParser) { - inlineParser.parse(content, block); - } - - public static class Factory extends AbstractBlockParserFactory { - - @Override - public BlockStart tryStart(ParserState state, 
MatchedBlockParser matchedBlockParser) { - if (state.getIndent() >= 4) { - return BlockStart.none(); - } - CharSequence line = state.getLine(); - int nextNonSpace = state.getNextNonSpaceIndex(); - CharSequence paragraphStartLine = matchedBlockParser.getParagraphStartLine(); - Matcher matcher; - if ((matcher = ATX_HEADER.matcher(line.subSequence(nextNonSpace, line.length()))).find()) { - // ATX header - int newOffset = nextNonSpace + matcher.group(0).length(); - int level = matcher.group(0).trim().length(); // number of #s - // remove trailing ###s: - String content = ATX_TRAILING.matcher(line.subSequence(newOffset, line.length())).replaceAll(""); - return BlockStart.of(new HeaderParser(level, content)) - .atIndex(line.length()); - - } else if (paragraphStartLine != null && - ((matcher = SETEXT_HEADER.matcher(line.subSequence(nextNonSpace, line.length()))).find())) { - // setext header line - - int level = matcher.group(0).charAt(0) == '=' ? 1 : 2; - String content = paragraphStartLine.toString(); - return BlockStart.of(new HeaderParser(level, content)) - .atIndex(line.length()) - .replaceActiveBlockParser(); - } else { - return BlockStart.none(); - } - } - } -} diff --git a/commonmark/src/main/java/org/commonmark/internal/HeadingParser.java b/commonmark/src/main/java/org/commonmark/internal/HeadingParser.java new file mode 100644 index 000000000..05f070137 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/HeadingParser.java @@ -0,0 +1,158 @@ +package org.commonmark.internal; + +import org.commonmark.internal.util.Parsing; +import org.commonmark.node.Block; +import org.commonmark.node.Heading; +import org.commonmark.parser.InlineParser; +import org.commonmark.parser.SourceLine; +import org.commonmark.parser.SourceLines; +import org.commonmark.parser.beta.Position; +import org.commonmark.parser.beta.Scanner; +import org.commonmark.parser.block.*; +import org.commonmark.text.Characters; + +public class HeadingParser extends AbstractBlockParser { + + 
private final Heading block = new Heading(); + private final SourceLines content; + + public HeadingParser(int level, SourceLines content) { + block.setLevel(level); + this.content = content; + } + + @Override + public Block getBlock() { + return block; + } + + @Override + public BlockContinue tryContinue(ParserState parserState) { + // In both ATX and Setext headings, once we have the heading markup, there's nothing more to parse. + return BlockContinue.none(); + } + + @Override + public void parseInlines(InlineParser inlineParser) { + inlineParser.parse(content, block); + } + + public static class Factory extends AbstractBlockParserFactory { + + @Override + public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { + if (state.getIndent() >= Parsing.CODE_BLOCK_INDENT) { + return BlockStart.none(); + } + + SourceLine line = state.getLine(); + int nextNonSpace = state.getNextNonSpaceIndex(); + if (line.getContent().charAt(nextNonSpace) == '#') { + HeadingParser atxHeading = getAtxHeading(line.substring(nextNonSpace, line.getContent().length())); + if (atxHeading != null) { + return BlockStart.of(atxHeading).atIndex(line.getContent().length()); + } + } + + int setextHeadingLevel = getSetextHeadingLevel(line.getContent(), nextNonSpace); + if (setextHeadingLevel > 0) { + SourceLines paragraph = matchedBlockParser.getParagraphLines(); + if (!paragraph.isEmpty()) { + return BlockStart.of(new HeadingParser(setextHeadingLevel, paragraph)) + .atIndex(line.getContent().length()) + .replaceParagraphLines(paragraph.getLines().size()); + } + } + + return BlockStart.none(); + } + } + + // spec: An ATX heading consists of a string of characters, parsed as inline content, between an opening sequence of + // 1-6 unescaped # characters and an optional closing sequence of any number of unescaped # characters. The opening + // sequence of # characters must be followed by a space or by the end of line. 
The optional closing sequence of #s + // must be preceded by a space and may be followed by spaces only. + private static HeadingParser getAtxHeading(SourceLine line) { + Scanner scanner = Scanner.of(SourceLines.of(line)); + int level = scanner.matchMultiple('#'); + + if (level == 0 || level > 6) { + return null; + } + + if (!scanner.hasNext()) { + // End of line after markers is an empty heading + return new HeadingParser(level, SourceLines.empty()); + } + + char next = scanner.peek(); + if (!(next == ' ' || next == '\t')) { + return null; + } + + scanner.whitespace(); + Position start = scanner.position(); + Position end = start; + boolean hashCanEnd = true; + + while (scanner.hasNext()) { + char c = scanner.peek(); + switch (c) { + case '#': + if (hashCanEnd) { + scanner.matchMultiple('#'); + int whitespace = scanner.whitespace(); + // If there's other characters, the hashes and spaces were part of the heading + if (scanner.hasNext()) { + end = scanner.position(); + } + hashCanEnd = whitespace > 0; + } else { + scanner.next(); + end = scanner.position(); + } + break; + case ' ': + case '\t': + hashCanEnd = true; + scanner.next(); + break; + default: + hashCanEnd = false; + scanner.next(); + end = scanner.position(); + } + } + + SourceLines source = scanner.getSource(start, end); + String content = source.getContent(); + if (content.isEmpty()) { + return new HeadingParser(level, SourceLines.empty()); + } + return new HeadingParser(level, source); + } + + // spec: A setext heading underline is a sequence of = characters or a sequence of - characters, with no more than + // 3 spaces indentation and any number of trailing spaces. 
+ private static int getSetextHeadingLevel(CharSequence line, int index) { + switch (line.charAt(index)) { + case '=': + if (isSetextHeadingRest(line, index + 1, '=')) { + return 1; + } + break; + case '-': + if (isSetextHeadingRest(line, index + 1, '-')) { + return 2; + } + break; + } + return 0; + } + + private static boolean isSetextHeadingRest(CharSequence line, int index, char marker) { + int afterMarker = Characters.skip(marker, line, index, line.length()); + int afterSpace = Characters.skipSpaceTab(line, afterMarker, line.length()); + return afterSpace >= line.length(); + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/HorizontalRuleParser.java b/commonmark/src/main/java/org/commonmark/internal/HorizontalRuleParser.java deleted file mode 100644 index 0bc8422e2..000000000 --- a/commonmark/src/main/java/org/commonmark/internal/HorizontalRuleParser.java +++ /dev/null @@ -1,42 +0,0 @@ -package org.commonmark.internal; - -import org.commonmark.node.Block; -import org.commonmark.node.HorizontalRule; -import org.commonmark.parser.block.*; - -import java.util.regex.Pattern; - -public class HorizontalRuleParser extends AbstractBlockParser { - - private static Pattern H_RULE = Pattern.compile("^(?:(?:\\* *){3,}|(?:_ *){3,}|(?:- *){3,}) *$"); - - private final HorizontalRule block = new HorizontalRule(); - - @Override - public Block getBlock() { - return block; - } - - @Override - public BlockContinue tryContinue(ParserState state) { - // a horizontal rule can never container > 1 line, so fail to match - return BlockContinue.none(); - } - - public static class Factory extends AbstractBlockParserFactory { - - @Override - public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { - if (state.getIndent() >= 4) { - return BlockStart.none(); - } - int nextNonSpace = state.getNextNonSpaceIndex(); - CharSequence line = state.getLine(); - if (H_RULE.matcher(line.subSequence(nextNonSpace, line.length())).matches()) { - return 
BlockStart.of(new HorizontalRuleParser()).atIndex(line.length()); - } else { - return BlockStart.none(); - } - } - } -} diff --git a/commonmark/src/main/java/org/commonmark/internal/HtmlBlockParser.java b/commonmark/src/main/java/org/commonmark/internal/HtmlBlockParser.java index 12b17cc9f..123d9ec1f 100644 --- a/commonmark/src/main/java/org/commonmark/internal/HtmlBlockParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/HtmlBlockParser.java @@ -1,20 +1,35 @@ package org.commonmark.internal; -import org.commonmark.internal.util.Parsing; import org.commonmark.node.Block; import org.commonmark.node.HtmlBlock; import org.commonmark.node.Paragraph; +import org.commonmark.parser.SourceLine; import org.commonmark.parser.block.*; import java.util.regex.Pattern; public class HtmlBlockParser extends AbstractBlockParser { + private static final String TAGNAME = "[A-Za-z][A-Za-z0-9-]*"; + private static final String ATTRIBUTENAME = "[a-zA-Z_:][a-zA-Z0-9:._-]*"; + private static final String UNQUOTEDVALUE = "[^\"'=<>`\\x00-\\x20]+"; + private static final String SINGLEQUOTEDVALUE = "'[^']*'"; + private static final String DOUBLEQUOTEDVALUE = "\"[^\"]*\""; + private static final String ATTRIBUTEVALUE = "(?:" + UNQUOTEDVALUE + "|" + SINGLEQUOTEDVALUE + + "|" + DOUBLEQUOTEDVALUE + ")"; + private static final String ATTRIBUTEVALUESPEC = "(?:" + "\\s*=" + "\\s*" + ATTRIBUTEVALUE + + ")"; + private static final String ATTRIBUTE = "(?:" + "\\s+" + ATTRIBUTENAME + ATTRIBUTEVALUESPEC + + "?)"; + + private static final String OPENTAG = "<" + TAGNAME + ATTRIBUTE + "*" + "\\s*/?>"; + private static final String CLOSETAG = "</" + TAGNAME + "\\s*[>]"; + private static final Pattern[][] BLOCK_PATTERNS = new Pattern[][]{ {null, null}, // not used (no type 0) { - Pattern.compile("^<(?:script|pre|style)(?:\\s|>|$)", Pattern.CASE_INSENSITIVE), - Pattern.compile("</(?:script|pre|style)>", Pattern.CASE_INSENSITIVE) + Pattern.compile("^<(?:script|pre|style|textarea)(?:\\s|>|$)", 
Pattern.CASE_INSENSITIVE), + Pattern.compile("</(?:script|pre|style|textarea)>", Pattern.CASE_INSENSITIVE) }, { Pattern.compile("^<!--"), @@ -39,20 +54,21 @@ public class HtmlBlockParser extends AbstractBlockParser { "caption|center|col|colgroup|" + "dd|details|dialog|dir|div|dl|dt|" + "fieldset|figcaption|figure|footer|form|frame|frameset|" + - "h1|head|header|hr|html|" + + "h1|h2|h3|h4|h5|h6|head|header|hr|html|" + + "iframe|" + "legend|li|link|" + - "main|menu|menuitem|meta|" + + "main|menu|menuitem|" + "nav|noframes|" + "ol|optgroup|option|" + - "p|param|pre|" + - "section|source|summary|" + + "p|param|" + + "search|section|summary|" + "table|tbody|td|tfoot|th|thead|title|tr|track|" + "ul" + ")(?:\\s|[/]?[>]|$)", Pattern.CASE_INSENSITIVE), null // terminated by blank line }, { - Pattern.compile("^(?:" + Parsing.OPENTAG + '|' + Parsing.CLOSETAG + ")\\s*$", Pattern.CASE_INSENSITIVE), + Pattern.compile("^(?:" + OPENTAG + '|' + CLOSETAG + ")\\s*$", Pattern.CASE_INSENSITIVE), null // terminated by blank line } }; @@ -87,10 +103,10 @@ public BlockContinue tryContinue(ParserState state) { } @Override - public void addLine(CharSequence line) { - content.add(line); + public void addLine(SourceLine line) { + content.add(line.getContent()); - if (closingPattern != null && closingPattern.matcher(line).find()) { + if (closingPattern != null && closingPattern.matcher(line.getContent()).find()) { finished = true; } } @@ -106,12 +122,14 @@ public static class Factory extends AbstractBlockParserFactory { @Override public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { int nextNonSpace = state.getNextNonSpaceIndex(); - CharSequence line = state.getLine(); + CharSequence line = state.getLine().getContent(); if (state.getIndent() < 4 && line.charAt(nextNonSpace) == '<') { for (int blockType = 1; blockType <= 7; blockType++) { - // Type 7 can not interrupt a paragraph - if (blockType == 7 && matchedBlockParser.getMatchedBlockParser().getBlock() 
instanceof Paragraph) { + // Type 7 can not interrupt a paragraph (not even a lazy one) + if (blockType == 7 && ( + matchedBlockParser.getMatchedBlockParser().getBlock() instanceof Paragraph || + state.getActiveBlockParser().canHaveLazyContinuationLines())) { continue; } Pattern opener = BLOCK_PATTERNS[blockType][0]; diff --git a/commonmark/src/main/java/org/commonmark/internal/IndentedCodeBlockParser.java b/commonmark/src/main/java/org/commonmark/internal/IndentedCodeBlockParser.java index 3444f95e6..3598f5615 100644 --- a/commonmark/src/main/java/org/commonmark/internal/IndentedCodeBlockParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/IndentedCodeBlockParser.java @@ -1,18 +1,20 @@ package org.commonmark.internal; -import org.commonmark.node.*; +import org.commonmark.internal.util.Parsing; +import org.commonmark.node.Block; +import org.commonmark.node.IndentedCodeBlock; +import org.commonmark.node.Paragraph; +import org.commonmark.parser.SourceLine; import org.commonmark.parser.block.*; +import org.commonmark.text.Characters; -import java.util.regex.Pattern; +import java.util.ArrayList; +import java.util.List; public class IndentedCodeBlockParser extends AbstractBlockParser { - public static int INDENT = 4; - - private static final Pattern TRAILING_BLANK_LINES = Pattern.compile("(?:\n[ \t]*)+$"); - private final IndentedCodeBlock block = new IndentedCodeBlock(); - private BlockContent content = new BlockContent(); + private final List<CharSequence> lines = new ArrayList<>(); @Override public Block getBlock() { @@ -21,8 +23,8 @@ public Block getBlock() { @Override public BlockContinue tryContinue(ParserState state) { - if (state.getIndent() >= INDENT) { - return BlockContinue.atColumn(state.getColumn() + INDENT); + if (state.getIndent() >= Parsing.CODE_BLOCK_INDENT) { + return BlockContinue.atColumn(state.getColumn() + Parsing.CODE_BLOCK_INDENT); } else if (state.isBlank()) { return BlockContinue.atIndex(state.getNextNonSpaceIndex()); } else { @@ 
-31,18 +33,27 @@ public BlockContinue tryContinue(ParserState state) { } @Override - public void addLine(CharSequence line) { - content.add(line); + public void addLine(SourceLine line) { + lines.add(line.getContent()); } @Override public void closeBlock() { - // add trailing newline - content.add(""); - String contentString = content.getString(); - content = null; + int lastNonBlank = lines.size() - 1; + while (lastNonBlank >= 0) { + if (!Characters.isBlank(lines.get(lastNonBlank))) { + break; + } + lastNonBlank--; + } + + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < lastNonBlank + 1; i++) { + sb.append(lines.get(i)); + sb.append('\n'); + } - String literal = TRAILING_BLANK_LINES.matcher(contentString).replaceFirst("\n"); + String literal = sb.toString(); block.setLiteral(literal); } @@ -51,9 +62,8 @@ public static class Factory extends AbstractBlockParserFactory { @Override public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { // An indented code block cannot interrupt a paragraph. 
- if (state.getIndent() >= INDENT && !state.isBlank() && !(state.getActiveBlockParser().getBlock() instanceof Paragraph)) { - int nextNonSpace = state.getNextNonSpaceIndex(); - return BlockStart.of(new IndentedCodeBlockParser()).atColumn(state.getColumn() + INDENT); + if (state.getIndent() >= Parsing.CODE_BLOCK_INDENT && !state.isBlank() && !(state.getActiveBlockParser().getBlock() instanceof Paragraph)) { + return BlockStart.of(new IndentedCodeBlockParser()).atColumn(state.getColumn() + Parsing.CODE_BLOCK_INDENT); } else { return BlockStart.none(); } diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserContextImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserContextImpl.java new file mode 100644 index 000000000..233041f62 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserContextImpl.java @@ -0,0 +1,61 @@ +package org.commonmark.internal; + +import org.commonmark.node.LinkReferenceDefinition; +import org.commonmark.parser.InlineParserContext; +import org.commonmark.parser.beta.LinkProcessor; +import org.commonmark.parser.beta.InlineContentParserFactory; +import org.commonmark.parser.delimiter.DelimiterProcessor; + +import java.util.List; +import java.util.Set; + +public class InlineParserContextImpl implements InlineParserContext { + + private final List<InlineContentParserFactory> inlineContentParserFactories; + private final List<DelimiterProcessor> delimiterProcessors; + private final List<LinkProcessor> linkProcessors; + private final Set<Character> linkMarkers; + private final Definitions definitions; + + public InlineParserContextImpl(List<InlineContentParserFactory> inlineContentParserFactories, + List<DelimiterProcessor> delimiterProcessors, + List<LinkProcessor> linkProcessors, + Set<Character> linkMarkers, + Definitions definitions) { + this.inlineContentParserFactories = inlineContentParserFactories; + this.delimiterProcessors = delimiterProcessors; + this.linkProcessors = 
linkProcessors; + this.linkMarkers = linkMarkers; + this.definitions = definitions; + } + + @Override + public List<InlineContentParserFactory> getCustomInlineContentParserFactories() { + return inlineContentParserFactories; + } + + @Override + public List<DelimiterProcessor> getCustomDelimiterProcessors() { + return delimiterProcessors; + } + + @Override + public List<LinkProcessor> getCustomLinkProcessors() { + return linkProcessors; + } + + @Override + public Set<Character> getCustomLinkMarkers() { + return linkMarkers; + } + + @Override + public LinkReferenceDefinition getLinkReferenceDefinition(String label) { + return definitions.getDefinition(LinkReferenceDefinition.class, label); + } + + @Override + public <D> D getDefinition(Class<D> type, String label) { + return definitions.getDefinition(type, label); + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java index ea145c595..44422f421 100644 --- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java @@ -1,771 +1,644 @@ package org.commonmark.internal; -import org.commonmark.internal.inline.AsteriskDelimiterProcessor; -import org.commonmark.internal.inline.UnderscoreDelimiterProcessor; +import org.commonmark.internal.inline.*; import org.commonmark.internal.util.Escaping; -import org.commonmark.internal.util.Html5Entities; -import org.commonmark.internal.util.Parsing; +import org.commonmark.internal.util.LinkScanner; import org.commonmark.node.*; -import org.commonmark.parser.DelimiterProcessor; import org.commonmark.parser.InlineParser; +import org.commonmark.parser.InlineParserContext; +import org.commonmark.parser.SourceLines; +import org.commonmark.parser.beta.Scanner; +import org.commonmark.parser.beta.*; +import org.commonmark.parser.delimiter.DelimiterProcessor; +import org.commonmark.text.Characters; 
import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -public class InlineParserImpl implements InlineParser { +public class InlineParserImpl implements InlineParser, InlineParserState { - private static final String ESCAPED_CHAR = "\\\\" + Escaping.ESCAPABLE; - private static final String REG_CHAR = "[^\\\\()\\x00-\\x20]"; - private static final String IN_PARENS_NOSP = "\\((" + REG_CHAR + '|' + ESCAPED_CHAR + ")*\\)"; - private static final String HTMLCOMMENT = "<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->"; - private static final String PROCESSINGINSTRUCTION = "[<][?].*?[?][>]"; - private static final String DECLARATION = "<![A-Z]+" + "\\s+[^>]*>"; - private static final String CDATA = "<!\\[CDATA\\[[\\s\\S]*?\\]\\]>"; - private static final String HTMLTAG = "(?:" + Parsing.OPENTAG + "|" + Parsing.CLOSETAG + "|" + HTMLCOMMENT - + "|" + PROCESSINGINSTRUCTION + "|" + DECLARATION + "|" + CDATA + ")"; - private static final String ENTITY = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"; - - private static final String ASCII_PUNCTUATION = "'!\"#\\$%&\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}~"; - private static final Pattern PUNCTUATION = Pattern - .compile("^[" + ASCII_PUNCTUATION + "\\p{Pc}\\p{Pd}\\p{Pe}\\p{Pf}\\p{Pi}\\p{Po}\\p{Ps}]"); - - private static final Pattern HTML_TAG = Pattern.compile('^' + HTMLTAG, Pattern.CASE_INSENSITIVE); - - private static final Pattern LINK_TITLE = Pattern.compile( - "^(?:\"(" + ESCAPED_CHAR + "|[^\"\\x00])*\"" + - '|' + - "'(" + ESCAPED_CHAR + "|[^'\\x00])*'" + - '|' + - "\\((" + ESCAPED_CHAR + "|[^)\\x00])*\\))"); - - private static final Pattern LINK_DESTINATION_BRACES = Pattern.compile( - "^(?:[<](?:[^<>\\n\\\\\\x00]" + '|' + ESCAPED_CHAR + '|' + "\\\\)*[>])"); - - private static final Pattern LINK_DESTINATION = Pattern.compile( - "^(?:" + REG_CHAR + "+|" + ESCAPED_CHAR + "|\\\\|" + IN_PARENS_NOSP + ")*"); - - private static final Pattern LINK_LABEL = Pattern - 
.compile("^\\[(?:[^\\\\\\[\\]]|" + ESCAPED_CHAR + "|\\\\){0,1000}\\]"); - - private static final Pattern ESCAPABLE = Pattern.compile('^' + Escaping.ESCAPABLE); - - private static final Pattern ENTITY_HERE = Pattern.compile('^' + ENTITY, Pattern.CASE_INSENSITIVE); - - private static final Pattern TICKS = Pattern.compile("`+"); - - private static final Pattern TICKS_HERE = Pattern.compile("^`+"); - - private static final Pattern EMAIL_AUTOLINK = Pattern - .compile("^<([a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>"); - - private static final Pattern AUTOLINK = Pattern - .compile("^<(?:coap|doi|javascript|aaa|aaas|about|acap|cap|cid|crid|data|dav|dict|dns|file|ftp|geo|go|gopher|h323|http|https|iax|icap|im|imap|info|ipp|iris|iris.beep|iris.xpc|iris.xpcs|iris.lwz|ldap|mailto|mid|msrp|msrps|mtqp|mupdate|news|nfs|ni|nih|nntp|opaquelocktoken|pop|pres|rtsp|service|session|shttp|sieve|sip|sips|sms|snmp|soap.beep|soap.beeps|tag|tel|telnet|tftp|thismessage|tn3270|tip|tv|urn|vemmi|ws|wss|xcon|xcon-userid|xmlrpc.beep|xmlrpc.beeps|xmpp|z39.50r|z39.50s|adiumxtra|afp|afs|aim|apt|attachment|aw|beshare|bitcoin|bolo|callto|chrome|chrome-extension|com-eventbrite-attendee|content|cvs|dlna-playsingle|dlna-playcontainer|dtn|dvb|ed2k|facetime|feed|finger|fish|gg|git|gizmoproject|gtalk|hcp|icon|ipn|irc|irc6|ircs|itms|jar|jms|keyparc|lastfm|ldaps|magnet|maps|market|message|mms|ms-help|msnim|mumble|mvn|notes|oid|palm|paparazzi|platform|proxy|psyc|query|res|resource|rmi|rsync|rtmp|secondlife|sftp|sgn|skype|smb|soldat|spotify|ssh|steam|svn|teamspeak|things|udp|unreal|ut2004|ventrilo|view-source|webcal|wtai|wyciwyg|xfire|xri|ymsgr):[^<>\u0000-\u0020]*>", Pattern.CASE_INSENSITIVE); - - private static final Pattern SPNL = Pattern.compile("^ *(?:\n *)?"); - - private static final Pattern WHITESPACE_CHAR = Pattern.compile("^\\p{IsWhite_Space}"); - - private static final Pattern WHITESPACE = 
Pattern.compile("\\s+"); - - private static final Pattern FINAL_SPACE = Pattern.compile(" *$"); - - private static final Pattern LINE_END = Pattern.compile("^ *(?:\n|$)"); - - private final BitSet specialCharacters; - private final BitSet delimiterCharacters; + private final InlineParserContext context; + private final List<InlineContentParserFactory> inlineContentParserFactories; private final Map<Character, DelimiterProcessor> delimiterProcessors; + private final List<LinkProcessor> linkProcessors; + private final BitSet specialCharacters; + private final BitSet linkMarkers; - /** - * Link references by ID, needs to be built up using parseReference before calling parse. - */ - private Map<String, Link> referenceMap = new HashMap<>(); + private Map<Character, List<InlineContentParser>> inlineParsers; + private Scanner scanner; + private boolean includeSourceSpans; + private int trailingSpaces; - private Node block; - private String subject; - private int pos; /** - * Stack of delimiters (emphasis, strong emphasis). + * Top delimiter (emphasis, strong emphasis or custom emphasis). (Brackets are on a separate stack, different + * from the algorithm described in the spec.) */ - private Delimiter delimiter; + private Delimiter lastDelimiter; /** - * Earliest possible bracket delimiter to go back to when searching for opener. + * Top opening bracket (<code>[</code> or <code>![)</code>). 
*/ - private Delimiter bracketDelimiterBottom = null; + private Bracket lastBracket; + + public InlineParserImpl(InlineParserContext context) { + this.context = context; + this.inlineContentParserFactories = calculateInlineContentParserFactories(context.getCustomInlineContentParserFactories()); + this.delimiterProcessors = calculateDelimiterProcessors(context.getCustomDelimiterProcessors()); + this.linkProcessors = calculateLinkProcessors(context.getCustomLinkProcessors()); + this.linkMarkers = calculateLinkMarkers(context.getCustomLinkMarkers()); + this.specialCharacters = calculateSpecialCharacters(linkMarkers, this.delimiterProcessors.keySet(), this.inlineContentParserFactories); + } + + private List<InlineContentParserFactory> calculateInlineContentParserFactories(List<InlineContentParserFactory> customFactories) { + // Custom parsers can override built-in parsers if they want, so make sure they are tried first + var list = new ArrayList<>(customFactories); + list.add(new BackslashInlineParser.Factory()); + list.add(new BackticksInlineParser.Factory()); + list.add(new EntityInlineParser.Factory()); + list.add(new AutolinkInlineParser.Factory()); + list.add(new HtmlInlineParser.Factory()); + return list; + } + + private List<LinkProcessor> calculateLinkProcessors(List<LinkProcessor> linkProcessors) { + // Custom link processors can override the built-in behavior, so make sure they are tried first + var list = new ArrayList<>(linkProcessors); + list.add(new CoreLinkProcessor()); + return list; + } + + private static Map<Character, DelimiterProcessor> calculateDelimiterProcessors(List<DelimiterProcessor> delimiterProcessors) { + var map = new HashMap<Character, DelimiterProcessor>(); + addDelimiterProcessors(List.of(new AsteriskDelimiterProcessor(), new UnderscoreDelimiterProcessor()), map); + addDelimiterProcessors(delimiterProcessors, map); + return map; + } - private StringBuilder currentText; + private static void 
addDelimiterProcessors(Iterable<DelimiterProcessor> delimiterProcessors, Map<Character, DelimiterProcessor> map) { + for (DelimiterProcessor delimiterProcessor : delimiterProcessors) { + char opening = delimiterProcessor.getOpeningCharacter(); + char closing = delimiterProcessor.getClosingCharacter(); + if (opening == closing) { + DelimiterProcessor old = map.get(opening); + if (old != null && old.getOpeningCharacter() == old.getClosingCharacter()) { + StaggeredDelimiterProcessor s; + if (old instanceof StaggeredDelimiterProcessor) { + s = (StaggeredDelimiterProcessor) old; + } else { + s = new StaggeredDelimiterProcessor(opening); + s.add(old); + } + s.add(delimiterProcessor); + map.put(opening, s); + } else { + addDelimiterProcessorForChar(opening, delimiterProcessor, map); + } + } else { + addDelimiterProcessorForChar(opening, delimiterProcessor, map); + addDelimiterProcessorForChar(closing, delimiterProcessor, map); + } + } + } - public InlineParserImpl(BitSet specialCharacters, BitSet delimiterCharacters, Map<Character, DelimiterProcessor> delimiterProcessors) { - this.delimiterProcessors = delimiterProcessors; - this.delimiterCharacters = delimiterCharacters; - this.specialCharacters = specialCharacters; + private static void addDelimiterProcessorForChar(char delimiterChar, DelimiterProcessor toAdd, Map<Character, DelimiterProcessor> delimiterProcessors) { + DelimiterProcessor existing = delimiterProcessors.put(delimiterChar, toAdd); + if (existing != null) { + throw new IllegalArgumentException("Delimiter processor conflict with delimiter char '" + delimiterChar + "'"); + } } - public static BitSet calculateDelimiterCharacters(Set<Character> characters) { - BitSet bitSet = new BitSet(); - for (Character character : characters) { - bitSet.set(character); + private static BitSet calculateLinkMarkers(Set<Character> linkMarkers) { + var bitSet = new BitSet(); + for (var c : linkMarkers) { + bitSet.set(c); } + bitSet.set('!'); return bitSet; } - public static 
BitSet calculateSpecialCharacters(BitSet delimiterCharacters) { - BitSet bitSet = new BitSet(); - bitSet.or(delimiterCharacters); - bitSet.set('\n'); - bitSet.set('`'); + private static BitSet calculateSpecialCharacters(BitSet linkMarkers, + Set<Character> delimiterCharacters, + List<InlineContentParserFactory> inlineContentParserFactories) { + BitSet bitSet = (BitSet) linkMarkers.clone(); + for (Character c : delimiterCharacters) { + bitSet.set(c); + } + for (var factory : inlineContentParserFactories) { + for (var c : factory.getTriggerCharacters()) { + bitSet.set(c); + } + } bitSet.set('['); bitSet.set(']'); - bitSet.set('\\'); bitSet.set('!'); - bitSet.set('<'); - bitSet.set('&'); + bitSet.set('\n'); return bitSet; } - public static Map<Character, DelimiterProcessor> calculateDelimiterProcessors(List<DelimiterProcessor> delimiterProcessors) { - Map<Character, DelimiterProcessor> map = new HashMap<>(); - addDelimiterProcessors(Arrays.<DelimiterProcessor>asList(new AsteriskDelimiterProcessor(), new UnderscoreDelimiterProcessor()), map); - addDelimiterProcessors(delimiterProcessors, map); - return map; - } - - private static void addDelimiterProcessors(Iterable<DelimiterProcessor> delimiterProcessors, Map<Character, DelimiterProcessor> map) { - for (DelimiterProcessor delimiterProcessor : delimiterProcessors) { - char c = delimiterProcessor.getDelimiterChar(); - DelimiterProcessor existing = map.put(c, delimiterProcessor); - if (existing != null) { - throw new IllegalArgumentException("Inline delimiter parser can not be registered more than once, delimiter character: " + c); + private Map<Character, List<InlineContentParser>> createInlineContentParsers() { + var map = new HashMap<Character, List<InlineContentParser>>(); + for (var factory : inlineContentParserFactories) { + var parser = factory.create(); + for (var c : factory.getTriggerCharacters()) { + map.computeIfAbsent(c, k -> new ArrayList<>()).add(parser); } } + return map; } - /** - * Parse content in 
block into inline children, using reference map to resolve references. - */ @Override - public void parse(String content, Node block) { - this.block = block; - this.subject = content.trim(); - this.pos = 0; - this.delimiter = null; - this.bracketDelimiterBottom = null; - - boolean moreToParse; - do { - moreToParse = parseInline(); - } while (moreToParse); - flushTextNode(); - - processDelimiters(null); + public Scanner scanner() { + return scanner; } /** - * Attempt to parse a link reference, modifying the internal reference map. - * - * @return how many characters were parsed as a reference, {@code 0} if none + * Parse content in block into inline children, appending them to the block node. */ - public int parseReference(String s) { - this.subject = s; - this.pos = 0; - String rawLabel; - String dest; - String title; - int matchChars; - int startPos = this.pos; - - // label: - matchChars = this.parseLinkLabel(); - if (matchChars == 0) { - return 0; - } else { - rawLabel = this.subject.substring(0, matchChars); - } - - // colon: - if (this.peek() == ':') { - this.pos++; - } else { - this.pos = startPos; - return 0; - } - - // link url - this.spnl(); - - dest = this.parseLinkDestination(); - if (dest == null || dest.length() == 0) { - this.pos = startPos; - return 0; - } - - int beforeTitle = this.pos; - this.spnl(); - title = this.parseLinkTitle(); - if (title == null) { - // rewind before spaces - this.pos = beforeTitle; - } + @Override + public void parse(SourceLines lines, Node block) { + reset(lines); - boolean atLineEnd = true; - if (this.pos != this.subject.length() && this.match(LINE_END) == null) { - if (title == null) { - atLineEnd = false; - } else { - // the potential title we found is not at the line end, - // but it could still be a legal link reference if we - // discard the title - title = null; - // rewind before spaces - this.pos = beforeTitle; - // and instead check if the link URL is at the line end - atLineEnd = this.match(LINE_END) != null; + 
while (true) { + var nodes = parseInline(); + if (nodes == null) { + break; + } + for (Node node : nodes) { + block.appendChild(node); } } - if (!atLineEnd) { - this.pos = startPos; - return 0; - } - - String normalizedLabel = Escaping.normalizeReference(rawLabel); - if (normalizedLabel.isEmpty()) { - this.pos = startPos; - return 0; - } - - if (!referenceMap.containsKey(normalizedLabel)) { - Link link = new Link(dest, title); - referenceMap.put(normalizedLabel, link); - } - return this.pos - startPos; - } - - private void appendText(CharSequence text) { - appendText(text, 0, text.length()); - } - - private void appendText(CharSequence text, int beginIndex, int endIndex) { - if (currentText == null) { - currentText = new StringBuilder(endIndex - beginIndex + 16); - } - currentText.append(text, beginIndex, endIndex); - } - - private void appendNode(Node node) { - flushTextNode(); - block.appendChild(node); + processDelimiters(null); + mergeChildTextNodes(block); } - // In some cases, we don't want the text to be appended to an existing node, we need it separate - private Text appendSeparateText(String text) { - Text node = new Text(text); - appendNode(node); - return node; + void reset(SourceLines lines) { + this.scanner = Scanner.of(lines); + this.includeSourceSpans = !lines.getSourceSpans().isEmpty(); + this.trailingSpaces = 0; + this.lastDelimiter = null; + this.lastBracket = null; + this.inlineParsers = createInlineContentParsers(); } - private void flushTextNode() { - if (currentText != null) { - block.appendChild(new Text(currentText.toString())); - currentText = null; - } + private Text text(SourceLines sourceLines) { + Text text = new Text(sourceLines.getContent()); + text.setSourceSpans(sourceLines.getSourceSpans()); + return text; } /** - * Parse the next inline element in subject, advancing subject position. - * On success, add the result to block's children and return true. - * On failure, return false. 
+ * Parse the next inline element in subject, advancing our position. + * On success, return the new inline node. + * On failure, return null. */ - private boolean parseInline() { - boolean res; - char c = this.peek(); - if (c == '\0') { - return false; - } + private List<? extends Node> parseInline() { + char c = scanner.peek(); + switch (c) { - case '\n': - res = this.parseNewline(); - break; - case '\\': - res = this.parseBackslash(); - break; - case '`': - res = this.parseBackticks(); - break; case '[': - res = this.parseOpenBracket(); - break; - case '!': - res = this.parseBang(); - break; + return List.of(parseOpenBracket()); case ']': - res = this.parseCloseBracket(); - break; - case '<': - res = this.parseAutolink() || this.parseHtmlTag(); - break; - case '&': - res = this.parseEntity(); - break; - default: - boolean isDelimiter = delimiterCharacters.get(c); - if (isDelimiter) { - DelimiterProcessor inlineDelimiter = delimiterProcessors.get(c); - res = parseDelimiters(inlineDelimiter); + return List.of(parseCloseBracket()); + case '\n': + return List.of(parseLineBreak()); + case Scanner.END: + return null; + } + + if (linkMarkers.get(c)) { + var markerPosition = scanner.position(); + var nodes = parseLinkMarker(); + if (nodes != null) { + return nodes; + } + // Reset and try other things (e.g. inline parsers below) + scanner.setPosition(markerPosition); + } + + // No inline parser, delimiter or other special handling. 
+ if (!specialCharacters.get(c)) { + return List.of(parseText()); + } + + List<InlineContentParser> inlineParsers = this.inlineParsers.get(c); + if (inlineParsers != null) { + Position position = scanner.position(); + for (InlineContentParser inlineParser : inlineParsers) { + ParsedInline parsedInline = inlineParser.tryParse(this); + if (parsedInline instanceof ParsedInlineImpl) { + ParsedInlineImpl parsedInlineImpl = (ParsedInlineImpl) parsedInline; + Node node = parsedInlineImpl.getNode(); + scanner.setPosition(parsedInlineImpl.getPosition()); + if (includeSourceSpans && node.getSourceSpans().isEmpty()) { + node.setSourceSpans(scanner.getSource(position, scanner.position()).getSourceSpans()); + } + return List.of(node); } else { - res = this.parseString(); + // Reset position + scanner.setPosition(position); } - break; + } } - if (!res) { - this.pos += 1; - // When we get here, it's only for a single special character that turned out to not have a special meaning. - // So we shouldn't have a single surrogate here, hence it should be ok to turn it into a String. - String literal = String.valueOf(c); - appendText(literal); + + DelimiterProcessor delimiterProcessor = delimiterProcessors.get(c); + if (delimiterProcessor != null) { + List<? extends Node> nodes = parseDelimiters(delimiterProcessor, c); + if (nodes != null) { + return nodes; + } } - return true; + // If we get here, even for a special/delimiter character, we will just treat it as text. + return List.of(parseText()); } /** - * If re matches at current position in the subject, advance position in subject and return the match; otherwise - * return null. + * Attempt to parse delimiters like emphasis, strong emphasis or custom delimiters. 
*/ - private String match(Pattern re) { - if (pos >= subject.length()) { - return null; - } - Matcher matcher = re.matcher(subject); - matcher.region(pos, subject.length()); - boolean m = matcher.find(); - if (m) { - pos = matcher.end(); - return matcher.group(); - } else { + private List<? extends Node> parseDelimiters(DelimiterProcessor delimiterProcessor, char delimiterChar) { + DelimiterData res = scanDelimiters(delimiterProcessor, delimiterChar); + if (res == null) { return null; } - } - /** - * Returns the char at the current subject position, or {@code '\0'} in case there are no more characters. - */ - private char peek() { - if (this.pos < this.subject.length()) { - return this.subject.charAt(this.pos); - } else { - return '\0'; + List<Text> characters = res.characters; + + // Add entry to stack for this opener + lastDelimiter = new Delimiter(characters, delimiterChar, res.canOpen, res.canClose, lastDelimiter); + if (lastDelimiter.previous != null) { + lastDelimiter.previous.next = lastDelimiter; } - } - /** - * Parse zero or more space characters, including at most one newline. - */ - private boolean spnl() { - this.match(SPNL); - return true; + return characters; } /** - * Parse a newline. If it was preceded by two spaces, return a hard line break; otherwise a soft line break. + * Add open bracket to delimiter stack and add a text node to block's children. */ - private boolean parseNewline() { - this.pos += 1; // assume we're at a \n - - // We're gonna add a new node in any case and we need to check the last text node, so flush outstanding text. - flushTextNode(); - - Node lastChild = block.getLastChild(); - // Check previous text for trailing spaces. - // The "endsWith" is an optimization to avoid an RE match in the common case. 
- if (lastChild != null && lastChild instanceof Text && ((Text) lastChild).getLiteral().endsWith(" ")) { - Text text = (Text) lastChild; - String literal = text.getLiteral(); - Matcher matcher = FINAL_SPACE.matcher(literal); - int spaces = matcher.find() ? matcher.end() - matcher.start() : 0; - if (spaces > 0) { - text.setLiteral(literal.substring(0, literal.length() - spaces)); - } - appendNode(spaces >= 2 ? new HardLineBreak() : new SoftLineBreak()); - } else { - appendNode(new SoftLineBreak()); - } + private Node parseOpenBracket() { + Position start = scanner.position(); + scanner.next(); + Position contentPosition = scanner.position(); - // gobble leading spaces in next line - while (pos < subject.length() && subject.charAt(pos) == ' ') { - pos++; - } - return true; + Text node = text(scanner.getSource(start, contentPosition)); + + // Add entry to stack for this opener + addBracket(Bracket.link(node, start, contentPosition, lastBracket, lastDelimiter)); + + return node; } /** - * Parse a backslash-escaped special character, adding either the escaped character, a hard line break - * (if the backslash is followed by a newline), or a literal backslash to the block's children. + * If next character is {@code [}, add a bracket to the stack. + * Otherwise, return null. */ - private boolean parseBackslash() { - String subj = this.subject; - pos++; - if (peek() == '\n') { - appendNode(new HardLineBreak()); - pos++; - } else if (pos < subj.length() && ESCAPABLE.matcher(subj.substring(pos, pos + 1)).matches()) { - appendText(subj, pos, pos + 1); - pos++; + private List<? 
extends Node> parseLinkMarker() { + var markerPosition = scanner.position(); + scanner.next(); + var bracketPosition = scanner.position(); + if (scanner.next('[')) { + var contentPosition = scanner.position(); + var bangNode = text(scanner.getSource(markerPosition, bracketPosition)); + var bracketNode = text(scanner.getSource(bracketPosition, contentPosition)); + + // Add entry to stack for this opener + addBracket(Bracket.withMarker(bangNode, markerPosition, bracketNode, bracketPosition, contentPosition, lastBracket, lastDelimiter)); + return List.of(bangNode, bracketNode); } else { - appendText("\\"); + return null; } - return true; } /** - * Attempt to parse backticks, adding either a backtick code span or a literal sequence of backticks. + * Try to match close bracket against an opening in the delimiter stack. Return either a link or image, or a + * plain [ character. If there is a matching delimiter, remove it from the delimiter stack. */ - private boolean parseBackticks() { - String ticks = this.match(TICKS_HERE); - if (ticks == null) { - return false; - } - int afterOpenTicks = this.pos; - String matched; - while ((matched = this.match(TICKS)) != null) { - if (matched.equals(ticks)) { - Code node = new Code(); - String content = this.subject.substring(afterOpenTicks, this.pos - ticks.length()); - String literal = WHITESPACE.matcher(content.trim()).replaceAll(" "); - node.setLiteral(literal); - appendNode(node); - return true; - } - } - // If we got here, we didn't match a closing backtick sequence. - this.pos = afterOpenTicks; - appendText(ticks); - return true; - } + private Node parseCloseBracket() { + Position beforeClose = scanner.position(); + scanner.next(); + Position afterClose = scanner.position(); - /** - * Attempt to parse delimiters like emphasis, strong emphasis or custom delimiters. 
- */ - private boolean parseDelimiters(DelimiterProcessor inlineDelimiter) { - DelimiterRun res = this.scanDelims(inlineDelimiter); - if (res == null) { - return false; + // Get previous `[` or `![` + Bracket opener = lastBracket; + if (opener == null) { + // No matching opener, just return a literal. + return text(scanner.getSource(beforeClose, afterClose)); } - int numDelims = res.count; - int startPos = this.pos; - this.pos += numDelims; - Text node = appendSeparateText(this.subject.substring(startPos, this.pos)); + if (!opener.allowed) { + // Matching opener, but it's not allowed, just return a literal. + removeLastBracket(); + return text(scanner.getSource(beforeClose, afterClose)); + } - // Add entry to stack for this opener - this.delimiter = new Delimiter(node, this.delimiter, startPos); - this.delimiter.delimiterChar = inlineDelimiter.getDelimiterChar(); - this.delimiter.numDelims = numDelims; - this.delimiter.canOpen = res.canOpen; - this.delimiter.canClose = res.canClose; - if (this.delimiter.previous != null) { - this.delimiter.previous.next = this.delimiter; + var linkOrImage = parseLinkOrImage(opener, beforeClose); + if (linkOrImage != null) { + return linkOrImage; } + scanner.setPosition(afterClose); - return true; + // Nothing parsed, just parse the bracket as text and continue + removeLastBracket(); + return text(scanner.getSource(beforeClose, afterClose)); } - /** - * Add open bracket to delimiter stack and add a text node to block's children. 
- */ - private boolean parseOpenBracket() { - int startPos = this.pos; - this.pos += 1; + private Node parseLinkOrImage(Bracket opener, Position beforeClose) { + var linkInfo = parseLinkInfo(opener, beforeClose); + if (linkInfo == null) { + return null; + } + var processorStartPosition = scanner.position(); - Text node = appendSeparateText("["); + for (var linkProcessor : linkProcessors) { + var linkResult = linkProcessor.process(linkInfo, scanner, context); + if (!(linkResult instanceof LinkResultImpl)) { + // Reset position in case the processor used the scanner, and it didn't work out. + scanner.setPosition(processorStartPosition); + continue; + } - // Add entry to stack for this opener - this.delimiter = new Delimiter(node, this.delimiter, startPos); - this.delimiter.delimiterChar = '['; - this.delimiter.numDelims = 1; - this.delimiter.canOpen = true; - this.delimiter.canClose = false; - this.delimiter.allowed = true; - if (this.delimiter.previous != null) { - this.delimiter.previous.next = this.delimiter; + var result = (LinkResultImpl) linkResult; + var node = result.getNode(); + var position = result.getPosition(); + var includeMarker = result.isIncludeMarker(); + + switch (result.getType()) { + case WRAP: + scanner.setPosition(position); + return wrapBracket(opener, node, includeMarker); + case REPLACE: + scanner.setPosition(position); + return replaceBracket(opener, node, includeMarker); + } } - return true; + return null; } - /** - * If next character is [, and ! delimiter to delimiter stack and add a text node to block's children. - * Otherwise just add a text node. - */ - private boolean parseBang() { - int startPos = this.pos; - this.pos += 1; - if (this.peek() == '[') { - this.pos += 1; + private LinkInfo parseLinkInfo(Bracket opener, Position beforeClose) { + // Check to see if we have a link (or image, with a ! in front). 
The different types: + // - Inline: `[foo](/uri)` or with optional title `[foo](/uri "title")` + // - Reference links + // - Full: `[foo][bar]` (foo is the text and bar is the label that needs to match a reference) + // - Collapsed: `[foo][]` (foo is both the text and label) + // - Shortcut: `[foo]` (foo is both the text and label) - Text node = appendSeparateText("!["); + // Starting position is after the closing `]` + var afterClose = scanner.position(); - // Add entry to stack for this opener - this.delimiter = new Delimiter(node, this.delimiter, startPos + 1); - this.delimiter.delimiterChar = '!'; - this.delimiter.numDelims = 1; - this.delimiter.canOpen = true; - this.delimiter.canClose = false; - this.delimiter.allowed = true; - if (this.delimiter.previous != null) { - this.delimiter.previous.next = this.delimiter; - } - } else { - appendText("!"); + // Maybe an inline link/image + var destinationTitle = parseInlineDestinationTitle(scanner); + if (destinationTitle != null) { + var text = scanner.getSource(opener.contentPosition, beforeClose).getContent(); + return new LinkInfoImpl(opener.markerNode, opener.bracketNode, text, null, destinationTitle.destination, destinationTitle.title, afterClose); + } + // Not an inline link/image, rewind back to after `]`. + scanner.setPosition(afterClose); + + // Maybe a reference link/image like `[foo][bar]`, `[foo][]` or `[foo]`. + // Note that even `[foo](` could be a valid link if foo is a reference, which is why we try this even if the `(` + // failed to be parsed as an inline link/image before. + + // See if there's a link label like `[bar]` or `[]` + var label = parseLinkLabel(scanner); + if (label == null) { + // No label, rewind back + scanner.setPosition(afterClose); } - return true; + var textIsReference = label == null || label.isEmpty(); + if (opener.bracketAfter && textIsReference && opener.markerNode == null) { + // In case of shortcut or collapsed links, the text is used as the reference. 
But the reference is not allowed to + // contain an unescaped bracket, so if that's the case we don't need to continue. This is an optimization. + return null; + } + + var text = scanner.getSource(opener.contentPosition, beforeClose).getContent(); + return new LinkInfoImpl(opener.markerNode, opener.bracketNode, text, label, null, null, afterClose); } - /** - * Try to match close bracket against an opening in the delimiter stack. Add either a link or image, or a - * plain [ character, to block's children. If there is a matching delimiter, remove it from the delimiter stack. - */ - private boolean parseCloseBracket() { - this.pos += 1; - int startPos = this.pos; - - boolean containsBracket = false; - // look through stack of delimiters for a [ or ![ - Delimiter opener = this.delimiter; - while (opener != bracketDelimiterBottom) { - if (opener.delimiterChar == '[' || opener.delimiterChar == '!') { - if (!opener.matched) { - break; - } - containsBracket = true; - } - opener = opener.previous; + private Node wrapBracket(Bracket opener, Node wrapperNode, boolean includeMarker) { + // Add all nodes between the opening bracket and now (closing bracket) as child nodes of the link + Node n = opener.bracketNode.getNext(); + while (n != null) { + Node next = n.getNext(); + wrapperNode.appendChild(n); + n = next; } - if (opener == bracketDelimiterBottom) { - // No matched opener, just return a literal. - appendText("]"); - // No need to search same delimiters for openers next time. - bracketDelimiterBottom = this.delimiter; - return true; + if (includeSourceSpans) { + var startPosition = includeMarker && opener.markerPosition != null ? opener.markerPosition : opener.bracketPosition; + wrapperNode.setSourceSpans(scanner.getSource(startPosition, scanner.position()).getSourceSpans()); } - if (!opener.allowed) { - // Matching opener but it's not allowed, just return a literal. - appendText("]"); - // We could remove the opener now, but that would complicate text node merging. 
So just skip it next time. - opener.matched = true; - return true; + // Process delimiters such as emphasis inside link/image + processDelimiters(opener.previousDelimiter); + mergeChildTextNodes(wrapperNode); + // We don't need the corresponding text node anymore, we turned it into a link/image node + if (includeMarker && opener.markerNode != null) { + opener.markerNode.unlink(); } + opener.bracketNode.unlink(); + removeLastBracket(); - // Check to see if we have a link/image + // Links within links are not allowed. We found this link, so there can be no other links around it. + if (opener.markerNode == null) { + disallowPreviousLinks(); + } - String dest = null; - String title = null; - boolean isLinkOrImage = false; - - // Inline link? - if (this.peek() == '(') { - this.pos++; - this.spnl(); - if ((dest = this.parseLinkDestination()) != null) { - this.spnl(); - // title needs a whitespace before - if (WHITESPACE_CHAR.matcher(this.subject.substring(this.pos - 1, this.pos)).matches()) { - title = this.parseLinkTitle(); - this.spnl(); - } - if (this.subject.charAt(this.pos) == ')') { - this.pos += 1; - isLinkOrImage = true; - } - } - } else { // maybe reference link - - // See if there's a link label - this.spnl(); - - int beforeLabel = this.pos; - int labelLength = this.parseLinkLabel(); - String ref = null; - if (labelLength > 2) { - ref = this.subject.substring(beforeLabel, beforeLabel + labelLength); - } else if (!containsBracket) { - // Empty or missing second label can only be a reference if there's no unescaped bracket in it. - ref = this.subject.substring(opener.index, startPos); - } - if (labelLength == 0) { - // If shortcut reference link, rewind before spaces we skipped. 
- this.pos = startPos; - } + return wrapperNode; + } - if (ref != null) { - Link link = referenceMap.get(Escaping.normalizeReference(ref)); - if (link != null) { - dest = link.getDestination(); - title = link.getTitle(); - isLinkOrImage = true; - } - } + private Node replaceBracket(Bracket opener, Node node, boolean includeMarker) { + // Remove delimiters (but keep text nodes) + while (lastDelimiter != null && lastDelimiter != opener.previousDelimiter) { + removeDelimiterKeepNode(lastDelimiter); } - if (isLinkOrImage) { - // If we got here, open is a potential opener - boolean isImage = opener.delimiterChar == '!'; - Node linkOrImage = isImage ? new Image(dest, title) : new Link(dest, title); + if (includeSourceSpans) { + var startPosition = includeMarker && opener.markerPosition != null ? opener.markerPosition : opener.bracketPosition; + node.setSourceSpans(scanner.getSource(startPosition, scanner.position()).getSourceSpans()); + } - // Flush text now. We don't need to worry about combining it with adjacent text nodes, as we'll wrap it in a - // link or image node. - flushTextNode(); + removeLastBracket(); - Node node = opener.node.getNext(); - while (node != null) { - Node next = node.getNext(); - linkOrImage.appendChild(node); - node = next; - } - appendNode(linkOrImage); - - // Process delimiters such as emphasis inside link/image - processDelimiters(opener); - removeDelimiterAndNode(opener); - - // Links within links are not allowed. We found this link, so there can be no other link around it. - if (!isImage) { - Delimiter delim = this.delimiter; - while (delim != null) { - if (delim.delimiterChar == '[') { - // Disallow link opener. It will still get matched, but will not result in a link. - delim.allowed = false; - } - delim = delim.previous; - } - } + // Remove nodes that we added since the opener, because we're replacing them + Node n = includeMarker && opener.markerNode != null ? 
opener.markerNode : opener.bracketNode; + while (n != null) { + var next = n.getNext(); + n.unlink(); + n = next; + } - return true; + // Links within links are not allowed. We found this link, so there can be no other links around it. + // Note that this makes any syntax like `[foo]` behave the same as built-in links, which is probably a good + // default (it works for footnotes). It might be useful for a `LinkProcessor` to be able to specify the + // behavior; something we could add to `LinkResult` in the future if requested. + if (opener.markerNode == null || !includeMarker) { + disallowPreviousLinks(); + } - } else { // no link or image + return node; + } - appendText("]"); - // We could remove the opener now, but that would complicate text node merging. - // E.g. `[link] (/uri)` isn't a link because of the space, so we want to keep appending text. - opener.matched = true; - this.pos = startPos; - return true; + private void addBracket(Bracket bracket) { + if (lastBracket != null) { + lastBracket.bracketAfter = true; } + lastBracket = bracket; } - /** - * Attempt to parse link destination, returning the string or null if no match. - */ - private String parseLinkDestination() { - String res = this.match(LINK_DESTINATION_BRACES); - if (res != null) { // chop off surrounding <..>: - if (res.length() == 2) { - return ""; - } else { - return Escaping.unescapeString(res.substring(1, res.length() - 1)); - } - } else { - res = this.match(LINK_DESTINATION); - if (res != null) { - return Escaping.unescapeString(res); - } else { - return null; + private void removeLastBracket() { + lastBracket = lastBracket.previous; + } + + private void disallowPreviousLinks() { + Bracket bracket = lastBracket; + while (bracket != null) { + if (bracket.markerNode == null) { + // Disallow link opener. It will still get matched, but will not result in a link. 
+ bracket.allowed = false; } + bracket = bracket.previous; } } /** - * Attempt to parse link title (sans quotes), returning the string or null if no match. + * Try to parse the destination and an optional title for an inline link/image. */ - private String parseLinkTitle() { - String title = this.match(LINK_TITLE); - if (title != null) { - // chop off quotes from title and unescape: - return Escaping.unescapeString(title.substring(1, title.length() - 1)); - } else { + private static DestinationTitle parseInlineDestinationTitle(Scanner scanner) { + if (!scanner.next('(')) { return null; } - } - /** - * Attempt to parse a link label, returning number of characters parsed. - */ - private int parseLinkLabel() { - String m = this.match(LINK_LABEL); - return m == null ? 0 : m.length(); + scanner.whitespace(); + String dest = parseLinkDestination(scanner); + if (dest == null) { + return null; + } + + String title = null; + int whitespace = scanner.whitespace(); + // title needs a whitespace before + if (whitespace >= 1) { + title = parseLinkTitle(scanner); + scanner.whitespace(); + } + if (!scanner.next(')')) { + // Don't have a closing `)`, so it's not a destination and title. + // Note that something like `[foo](` could still be valid later, `(` will just be text. + return null; + } + return new DestinationTitle(dest, title); } /** - * Attempt to parse an autolink (URL or email in pointy brackets). + * Attempt to parse link destination, returning the string or null if no match. 
*/ - private boolean parseAutolink() { - String m; - if ((m = this.match(EMAIL_AUTOLINK)) != null) { - String dest = m.substring(1, m.length() - 1); - Link node = new Link("mailto:" + dest, null); - node.appendChild(new Text(dest)); - appendNode(node); - return true; - } else if ((m = this.match(AUTOLINK)) != null) { - String dest = m.substring(1, m.length() - 1); - Link node = new Link(dest, null); - node.appendChild(new Text(dest)); - appendNode(node); - return true; + private static String parseLinkDestination(Scanner scanner) { + char delimiter = scanner.peek(); + Position start = scanner.position(); + if (!LinkScanner.scanLinkDestination(scanner)) { + return null; + } + + String dest; + if (delimiter == '<') { + // chop off surrounding <..>: + String rawDestination = scanner.getSource(start, scanner.position()).getContent(); + dest = rawDestination.substring(1, rawDestination.length() - 1); } else { - return false; + dest = scanner.getSource(start, scanner.position()).getContent(); } + + return Escaping.unescapeString(dest); } /** - * Attempt to parse a raw HTML tag. + * Attempt to parse link title (sans quotes), returning the string or null if no match. */ - private boolean parseHtmlTag() { - String m = this.match(HTML_TAG); - if (m != null) { - HtmlTag node = new HtmlTag(); - node.setLiteral(m); - appendNode(node); - return true; - } else { - return false; + private static String parseLinkTitle(Scanner scanner) { + Position start = scanner.position(); + if (!LinkScanner.scanLinkTitle(scanner)) { + return null; } + + // chop off ', " or parens + String rawTitle = scanner.getSource(start, scanner.position()).getContent(); + String title = rawTitle.substring(1, rawTitle.length() - 1); + return Escaping.unescapeString(title); } /** - * Attempt to parse an entity, return Entity object if successful. + * Attempt to parse a link label, returning the label between the brackets or null. 
*/ - private boolean parseEntity() { - String m; - if ((m = this.match(ENTITY_HERE)) != null) { - appendText(Html5Entities.entityToString(m)); - return true; + static String parseLinkLabel(Scanner scanner) { + if (!scanner.next('[')) { + return null; + } + + Position start = scanner.position(); + if (!LinkScanner.scanLinkLabelContent(scanner)) { + return null; + } + Position end = scanner.position(); + + if (!scanner.next(']')) { + return null; + } + + String content = scanner.getSource(start, end).getContent(); + // spec: A link label can have at most 999 characters inside the square brackets. + if (content.length() > 999) { + return null; + } + + return content; + } + + private Node parseLineBreak() { + scanner.next(); + + var hard = trailingSpaces >= 2; + trailingSpaces = 0; + if (hard) { + return new HardLineBreak(); } else { - return false; + return new SoftLineBreak(); } } /** - * Parse a run of ordinary characters, or a single character with a special meaning in markdown, as a plain string. + * Parse the next character as plain text, and possibly more if the following characters are non-special. */ - private boolean parseString() { - int begin = pos; - int length = subject.length(); - while (pos != length) { - if (specialCharacters.get(subject.charAt(pos))) { + private Node parseText() { + Position start = scanner.position(); + scanner.next(); + char c; + while (true) { + c = scanner.peek(); + if (c == Scanner.END || specialCharacters.get(c)) { break; } - pos++; + scanner.next(); } - if (begin != pos) { - appendText(subject, begin, pos); - return true; - } else { - return false; + + SourceLines source = scanner.getSource(start, scanner.position()); + String content = source.getContent(); + + if (c == '\n') { + // We parsed until the end of the line. Trim any trailing spaces and remember them (for hard line breaks). 
+ int end = Characters.skipBackwards(' ', content, content.length() - 1, 0) + 1; + trailingSpaces = content.length() - end; + content = content.substring(0, end); + } else if (c == Scanner.END) { + // For the last line, both tabs and spaces are trimmed for some reason (checked with commonmark.js). + int end = Characters.skipSpaceTabBackwards(content, content.length() - 1, 0) + 1; + content = content.substring(0, end); } + + Text text = new Text(content); + text.setSourceSpans(source.getSourceSpans()); + return text; } /** @@ -774,49 +647,49 @@ private boolean parseString() { * * @return information about delimiter run, or {@code null} */ - private DelimiterRun scanDelims(DelimiterProcessor inlineDelimiter) { - int startPos = this.pos; - - int delimiterCount = 0; - char delimiterChar = inlineDelimiter.getDelimiterChar(); - while (this.peek() == delimiterChar) { - delimiterCount++; - this.pos++; - } - - if (delimiterCount < inlineDelimiter.getMinDelimiterCount()) { - this.pos = startPos; + private DelimiterData scanDelimiters(DelimiterProcessor delimiterProcessor, char delimiterChar) { + int before = scanner.peekPreviousCodePoint(); + Position start = scanner.position(); + + // Quick check to see if we have enough delimiters. + int delimiterCount = scanner.matchMultiple(delimiterChar); + if (delimiterCount < delimiterProcessor.getMinLength()) { + scanner.setPosition(start); return null; } - String before = startPos == 0 ? "\n" : - this.subject.substring(startPos - 1, startPos); + // We do have enough, extract a text node for each delimiter character. + List<Text> delimiters = new ArrayList<>(); + scanner.setPosition(start); + Position positionBefore = start; + while (scanner.next(delimiterChar)) { + delimiters.add(text(scanner.getSource(positionBefore, scanner.position()))); + positionBefore = scanner.position(); + } - char charAfter = this.peek(); - String after = charAfter == '\0' ? 
"\n" : - String.valueOf(charAfter); + int after = scanner.peekCodePoint(); - boolean beforeIsPunctuation = PUNCTUATION.matcher(before).matches(); - boolean beforeIsWhitespace = WHITESPACE_CHAR.matcher(before).matches(); - boolean afterIsWhitespace = WHITESPACE_CHAR.matcher(after).matches(); - boolean afterIsPunctuation = PUNCTUATION.matcher(after).matches(); + // We could be more lazy here, in most cases we don't need to do every match case. + boolean beforeIsPunctuation = before == Scanner.END || Characters.isPunctuationCodePoint(before); + boolean beforeIsWhitespace = before == Scanner.END || Characters.isWhitespaceCodePoint(before); + boolean afterIsPunctuation = after == Scanner.END || Characters.isPunctuationCodePoint(after); + boolean afterIsWhitespace = after == Scanner.END || Characters.isWhitespaceCodePoint(after); boolean leftFlanking = !afterIsWhitespace && - !(afterIsPunctuation && !beforeIsWhitespace && !beforeIsPunctuation); + (!afterIsPunctuation || beforeIsWhitespace || beforeIsPunctuation); boolean rightFlanking = !beforeIsWhitespace && - !(beforeIsPunctuation && !afterIsWhitespace && !afterIsPunctuation); + (!beforeIsPunctuation || afterIsWhitespace || afterIsPunctuation); boolean canOpen; boolean canClose; if (delimiterChar == '_') { canOpen = leftFlanking && (!rightFlanking || beforeIsPunctuation); canClose = rightFlanking && (!leftFlanking || afterIsPunctuation); } else { - canOpen = leftFlanking; - canClose = rightFlanking; + canOpen = leftFlanking && delimiterChar == delimiterProcessor.getOpeningCharacter(); + canClose = rightFlanking && delimiterChar == delimiterProcessor.getClosingCharacter(); } - this.pos = startPos; - return new DelimiterRun(delimiterCount, canOpen, canClose); + return new DelimiterData(delimiters, canOpen, canClose); } private void processDelimiters(Delimiter stackBottom) { @@ -824,7 +697,7 @@ private void processDelimiters(Delimiter stackBottom) { Map<Character, Delimiter> openersBottom = new HashMap<>(); // find first 
closer above stackBottom: - Delimiter closer = this.delimiter; + Delimiter closer = lastDelimiter; while (closer != null && closer.previous != stackBottom) { closer = closer.previous; } @@ -832,73 +705,78 @@ private void processDelimiters(Delimiter stackBottom) { while (closer != null) { char delimiterChar = closer.delimiterChar; - if (!closer.canClose || !delimiterProcessors.containsKey(delimiterChar)) { + DelimiterProcessor delimiterProcessor = delimiterProcessors.get(delimiterChar); + if (!closer.canClose() || delimiterProcessor == null) { closer = closer.next; continue; } - // found delimiter closer. now look back for first matching opener: + char openingDelimiterChar = delimiterProcessor.getOpeningCharacter(); + + // Found delimiter closer. Now look back for first matching opener. + int usedDelims = 0; boolean openerFound = false; + boolean potentialOpenerFound = false; Delimiter opener = closer.previous; while (opener != null && opener != stackBottom && opener != openersBottom.get(delimiterChar)) { - if (opener.delimiterChar == delimiterChar && opener.canOpen) { - openerFound = true; - break; + if (opener.canOpen() && opener.delimiterChar == openingDelimiterChar) { + potentialOpenerFound = true; + usedDelims = delimiterProcessor.process(opener, closer); + if (usedDelims > 0) { + openerFound = true; + break; + } } opener = opener.previous; } if (!openerFound) { - // Set lower bound for future searches for openers: - openersBottom.put(delimiterChar, closer.previous); - if (!closer.canOpen) { - // We can remove a closer that can't be an opener, - // once we've seen there's no matching opener: - removeDelimiterKeepNode(closer); + if (!potentialOpenerFound) { + // Set lower bound for future searches for openers. + // Only do this when we didn't even have a potential + // opener (one that matches the character and can open). + // If an opener was rejected because of the number of + // delimiters (e.g. 
because of the "multiple of 3" rule), + // we want to consider it next time because the number + // of delimiters can change as we continue processing. + openersBottom.put(delimiterChar, closer.previous); + if (!closer.canOpen()) { + // We can remove a closer that can't be an opener, + // once we've seen there's no matching opener: + removeDelimiterKeepNode(closer); + } } closer = closer.next; continue; } - DelimiterProcessor delimiterProcessor = delimiterProcessors.get(closer.delimiterChar); - - int useDelims = delimiterProcessor.getDelimiterUse(opener.numDelims, closer.numDelims); - if (useDelims <= 0) { - // nope - useDelims = 1; + // Remove number of used delimiters nodes. + for (int i = 0; i < usedDelims; i++) { + Text delimiter = opener.characters.remove(opener.characters.size() - 1); + delimiter.unlink(); + } + for (int i = 0; i < usedDelims; i++) { + Text delimiter = closer.characters.remove(0); + delimiter.unlink(); } - - Text openerNode = opener.node; - Text closerNode = closer.node; - - // remove used delimiters from stack elts and inlines - opener.numDelims -= useDelims; - closer.numDelims -= useDelims; - openerNode.setLiteral( - openerNode.getLiteral().substring(0, - openerNode.getLiteral().length() - useDelims)); - closerNode.setLiteral( - closerNode.getLiteral().substring(0, - closerNode.getLiteral().length() - useDelims)); removeDelimitersBetween(opener, closer); - delimiterProcessor.process(openerNode, closerNode, useDelims); - // if opener has 0 delims, remove it and the inline - if (opener.numDelims == 0) { - removeDelimiterAndNode(opener); + // No delimiter characters left to process, so we can remove delimiter and the now empty node. 
+ if (opener.length() == 0) { + removeDelimiterAndNodes(opener); } - if (closer.numDelims == 0) { + if (closer.length() == 0) { Delimiter next = closer.next; - removeDelimiterAndNode(closer); + removeDelimiterAndNodes(closer); closer = next; } } // remove all delimiters - while (delimiter != null && delimiter != stackBottom) { - removeDelimiterKeepNode(delimiter); + while (lastDelimiter != null && lastDelimiter != stackBottom) { + removeDelimiterKeepNode(lastDelimiter); } } @@ -914,17 +792,7 @@ private void removeDelimitersBetween(Delimiter opener, Delimiter closer) { /** * Remove the delimiter and the corresponding text node. For used delimiters, e.g. `*` in `*foo*`. */ - private void removeDelimiterAndNode(Delimiter delim) { - Text node = delim.node; - Text previousText = delim.getPreviousNonDelimiterTextNode(); - Text nextText = delim.getNextNonDelimiterTextNode(); - if (previousText != null && nextText != null) { - // Merge adjacent text nodes - previousText.setLiteral(previousText.getLiteral() + nextText.getLiteral()); - nextText.unlink(); - } - - node.unlink(); + private void removeDelimiterAndNodes(Delimiter delim) { removeDelimiter(delim); } @@ -932,23 +800,6 @@ private void removeDelimiterAndNode(Delimiter delim) { * Remove the delimiter but keep the corresponding node as text. For unused delimiters such as `_` in `foo_bar`. 
*/ private void removeDelimiterKeepNode(Delimiter delim) { - Text node = delim.node; - Text previousText = delim.getPreviousNonDelimiterTextNode(); - Text nextText = delim.getNextNonDelimiterTextNode(); - if (previousText != null || nextText != null) { - // Merge adjacent text nodes into one - StringBuilder sb = new StringBuilder(node.getLiteral()); - if (previousText != null) { - sb.insert(0, previousText.getLiteral()); - previousText.unlink(); - } - if (nextText != null) { - sb.append(nextText.getLiteral()); - nextText.unlink(); - } - node.setLiteral(sb.toString()); - } - removeDelimiter(delim); } @@ -958,10 +809,161 @@ private void removeDelimiter(Delimiter delim) { } if (delim.next == null) { // top of stack - this.delimiter = delim.previous; + lastDelimiter = delim.previous; } else { delim.next.previous = delim.previous; } } + private void mergeChildTextNodes(Node node) { + // No children, no need for merging + if (node.getFirstChild() == null) { + return; + } + + mergeTextNodesInclusive(node.getFirstChild(), node.getLastChild()); + } + + private void mergeTextNodesInclusive(Node fromNode, Node toNode) { + Text first = null; + Text last = null; + int length = 0; + + Node node = fromNode; + while (node != null) { + if (node instanceof Text) { + Text text = (Text) node; + if (first == null) { + first = text; + } + length += text.getLiteral().length(); + last = text; + } else { + mergeIfNeeded(first, last, length); + first = null; + last = null; + length = 0; + + mergeChildTextNodes(node); + } + if (node == toNode) { + break; + } + node = node.getNext(); + } + + mergeIfNeeded(first, last, length); + } + + private void mergeIfNeeded(Text first, Text last, int textLength) { + if (first != null && last != null && first != last) { + StringBuilder sb = new StringBuilder(textLength); + sb.append(first.getLiteral()); + SourceSpans sourceSpans = null; + if (includeSourceSpans) { + sourceSpans = new SourceSpans(); + sourceSpans.addAll(first.getSourceSpans()); + } + Node 
node = first.getNext(); + Node stop = last.getNext(); + while (node != stop) { + sb.append(((Text) node).getLiteral()); + if (sourceSpans != null) { + sourceSpans.addAll(node.getSourceSpans()); + } + + Node unlink = node; + node = node.getNext(); + unlink.unlink(); + } + String literal = sb.toString(); + first.setLiteral(literal); + if (sourceSpans != null) { + first.setSourceSpans(sourceSpans.getSourceSpans()); + } + } + } + + private static class DelimiterData { + + final List<Text> characters; + final boolean canClose; + final boolean canOpen; + + DelimiterData(List<Text> characters, boolean canOpen, boolean canClose) { + this.characters = characters; + this.canOpen = canOpen; + this.canClose = canClose; + } + } + + /** + * A destination and optional title for a link or image. + */ + private static class DestinationTitle { + final String destination; + final String title; + + public DestinationTitle(String destination, String title) { + this.destination = destination; + this.title = title; + } + } + + private static class LinkInfoImpl implements LinkInfo { + + private final Text marker; + private final Text openingBracket; + private final String text; + private final String label; + private final String destination; + private final String title; + private final Position afterTextBracket; + + private LinkInfoImpl(Text marker, Text openingBracket, String text, String label, + String destination, String title, Position afterTextBracket) { + this.marker = marker; + this.openingBracket = openingBracket; + this.text = text; + this.label = label; + this.destination = destination; + this.title = title; + this.afterTextBracket = afterTextBracket; + } + + @Override + public Text marker() { + return marker; + } + + @Override + public Text openingBracket() { + return openingBracket; + } + + @Override + public String text() { + return text; + } + + @Override + public String label() { + return label; + } + + @Override + public String destination() { + return destination; + } 
+ + @Override + public String title() { + return title; + } + + @Override + public Position afterTextBracket() { + return afterTextBracket; + } + } } diff --git a/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java new file mode 100644 index 000000000..637d3b111 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java @@ -0,0 +1,306 @@ +package org.commonmark.internal; + +import org.commonmark.internal.util.Escaping; +import org.commonmark.internal.util.LinkScanner; +import org.commonmark.node.LinkReferenceDefinition; +import org.commonmark.node.SourceSpan; +import org.commonmark.parser.SourceLine; +import org.commonmark.parser.SourceLines; +import org.commonmark.parser.beta.Position; +import org.commonmark.parser.beta.Scanner; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Parser for link reference definitions at the beginning of a paragraph. + * + * @see <a href="https://spec.commonmark.org/0.31.2/#link-reference-definitions">Link reference definitions</a> + */ +public class LinkReferenceDefinitionParser { + + private State state = State.START_DEFINITION; + + private final List<SourceLine> paragraphLines = new ArrayList<>(); + private final List<LinkReferenceDefinition> definitions = new ArrayList<>(); + private final List<SourceSpan> sourceSpans = new ArrayList<>(); + + private StringBuilder label; + private String destination; + private char titleDelimiter; + private StringBuilder title; + private boolean referenceValid = false; + + public void parse(SourceLine line) { + paragraphLines.add(line); + if (state == State.PARAGRAPH) { + // We're in a paragraph now. Link reference definitions can only appear at the beginning, so once + // we're in a paragraph, there's no going back. 
+ return; + } + + Scanner scanner = Scanner.of(SourceLines.of(line)); + while (scanner.hasNext()) { + boolean success; + switch (state) { + case START_DEFINITION: { + success = startDefinition(scanner); + break; + } + case LABEL: { + success = label(scanner); + break; + } + case DESTINATION: { + success = destination(scanner); + break; + } + case START_TITLE: { + success = startTitle(scanner); + break; + } + case TITLE: { + success = title(scanner); + break; + } + default: { + throw new IllegalStateException("Unknown parsing state: " + state); + } + } + // Parsing failed, which means we fall back to treating text as a paragraph. + if (!success) { + state = State.PARAGRAPH; + // If parsing of the title part failed, we still have a valid reference that we can add, and we need to + // do it before the source span for this line is added. + finishReference(); + return; + } + } + } + + public void addSourceSpan(SourceSpan sourceSpan) { + sourceSpans.add(sourceSpan); + } + + /** + * @return the lines that are normal paragraph content, without newlines + */ + SourceLines getParagraphLines() { + return SourceLines.of(paragraphLines); + } + + List<SourceSpan> getParagraphSourceSpans() { + return sourceSpans; + } + + List<LinkReferenceDefinition> getDefinitions() { + finishReference(); + return definitions; + } + + State getState() { + return state; + } + + List<SourceSpan> removeLines(int lines) { + var removedSpans = Collections.unmodifiableList(new ArrayList<>( + sourceSpans.subList(Math.max(sourceSpans.size() - lines, 0), sourceSpans.size()))); + removeLast(lines, paragraphLines); + removeLast(lines, sourceSpans); + return removedSpans; + } + + private boolean startDefinition(Scanner scanner) { + // Finish any outstanding references now. We don't do this earlier because we need addSourceSpan to have been + // called before we do it. 
+ finishReference(); + + scanner.whitespace(); + if (!scanner.next('[')) { + return false; + } + + state = State.LABEL; + label = new StringBuilder(); + + if (!scanner.hasNext()) { + label.append('\n'); + } + return true; + } + + private boolean label(Scanner scanner) { + Position start = scanner.position(); + if (!LinkScanner.scanLinkLabelContent(scanner)) { + return false; + } + + label.append(scanner.getSource(start, scanner.position()).getContent()); + + if (!scanner.hasNext()) { + // label might continue on next line + label.append('\n'); + return true; + } else if (scanner.next(']')) { + // end of label + if (!scanner.next(':')) { + return false; + } + + // spec: A link label can have at most 999 characters inside the square brackets. + if (label.length() > 999) { + return false; + } + + String normalizedLabel = Escaping.normalizeLabelContent(label.toString()); + if (normalizedLabel.isEmpty()) { + return false; + } + + state = State.DESTINATION; + + scanner.whitespace(); + return true; + } else { + return false; + } + } + + private boolean destination(Scanner scanner) { + scanner.whitespace(); + Position start = scanner.position(); + if (!LinkScanner.scanLinkDestination(scanner)) { + return false; + } + + String rawDestination = scanner.getSource(start, scanner.position()).getContent(); + destination = rawDestination.startsWith("<") ? + rawDestination.substring(1, rawDestination.length() - 1) : + rawDestination; + + int whitespace = scanner.whitespace(); + if (!scanner.hasNext()) { + // Destination was at end of line, so this is a valid reference for sure (and maybe a title). + // If not at end of line, wait for title to be valid first. 
+ referenceValid = true; + paragraphLines.clear(); + } else if (whitespace == 0) { + // spec: The title must be separated from the link destination by whitespace + return false; + } + + state = State.START_TITLE; + return true; + } + + private boolean startTitle(Scanner scanner) { + scanner.whitespace(); + if (!scanner.hasNext()) { + state = State.START_DEFINITION; + return true; + } + + titleDelimiter = '\0'; + char c = scanner.peek(); + switch (c) { + case '"': + case '\'': + titleDelimiter = c; + break; + case '(': + titleDelimiter = ')'; + break; + } + + if (titleDelimiter != '\0') { + state = State.TITLE; + title = new StringBuilder(); + scanner.next(); + if (!scanner.hasNext()) { + title.append('\n'); + } + } else { + // There might be another reference instead, try that for the same character. + state = State.START_DEFINITION; + } + return true; + } + + private boolean title(Scanner scanner) { + Position start = scanner.position(); + if (!LinkScanner.scanLinkTitleContent(scanner, titleDelimiter)) { + // Invalid title, stop. Title collected so far must not be used. + title = null; + return false; + } + + title.append(scanner.getSource(start, scanner.position()).getContent()); + + if (!scanner.hasNext()) { + // Title ran until the end of line, so continue on next line (until we find the delimiter) + title.append('\n'); + return true; + } + + // Skip delimiter character + scanner.next(); + scanner.whitespace(); + if (scanner.hasNext()) { + // spec: No further non-whitespace characters may occur on the line. + // Title collected so far must not be used. + title = null; + return false; + } + referenceValid = true; + paragraphLines.clear(); + + // See if there's another definition. + state = State.START_DEFINITION; + return true; + } + + private void finishReference() { + if (!referenceValid) { + return; + } + + String d = Escaping.unescapeString(destination); + String t = title != null ? 
Escaping.unescapeString(title.toString()) : null; + LinkReferenceDefinition definition = new LinkReferenceDefinition(label.toString(), d, t); + definition.setSourceSpans(sourceSpans); + sourceSpans.clear(); + definitions.add(definition); + + label = null; + referenceValid = false; + destination = null; + title = null; + } + + private static <T> void removeLast(int n, List<T> list) { + if (n >= list.size()) { + list.clear(); + } else { + for (int i = 0; i < n; i++) { + list.remove(list.size() - 1); + } + } + } + + enum State { + // Looking for the start of a definition, i.e. `[` + START_DEFINITION, + // Parsing the label, i.e. `foo` within `[foo]` + LABEL, + // Parsing the destination, i.e. `/url` in `[foo]: /url` + DESTINATION, + // Looking for the start of a title, i.e. the first `"` in `[foo]: /url "title"` + START_TITLE, + // Parsing the content of the title, i.e. `title` in `[foo]: /url "title"` + TITLE, + + // End state, no matter what kind of lines we add, they won't be references + PARAGRAPH, + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java b/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java index ed069044e..fbf034757 100644 --- a/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java @@ -1,19 +1,18 @@ package org.commonmark.internal; +import org.commonmark.internal.util.Parsing; import org.commonmark.node.*; import org.commonmark.parser.block.*; import java.util.Objects; -import java.util.regex.Matcher; -import java.util.regex.Pattern; public class ListBlockParser extends AbstractBlockParser { - private static Pattern BULLET_LIST_MARKER = Pattern.compile("^[*+-]( +|$)"); - private static Pattern ORDERED_LIST_MARKER = Pattern.compile("^(\\d{1,9})([.)])( +|$)"); - private final ListBlock block; + private boolean hadBlankLine; + private int linesAfterBlank; + public ListBlockParser(ListBlock block) { this.block 
= block; } @@ -24,8 +23,20 @@ public boolean isContainer() { } @Override - public boolean canContain(Block block) { - return block instanceof ListItem; + public boolean canContain(Block childBlock) { + if (childBlock instanceof ListItem) { + // Another list item is added to this list block. If the previous line was blank, that means this list block + // is "loose" (not tight). + // + // spec: A list is loose if any of its constituent list items are separated by blank lines + if (hadBlankLine && linesAfterBlank == 1) { + block.setTight(false); + hadBlankLine = false; + } + return true; + } else { + return false; + } } @Override @@ -35,46 +46,141 @@ public Block getBlock() { @Override public BlockContinue tryContinue(ParserState state) { + if (state.isBlank()) { + hadBlankLine = true; + linesAfterBlank = 0; + } else if (hadBlankLine) { + linesAfterBlank++; + } // List blocks themselves don't have any markers, only list items. So try to stay in the list. // If there is a block start other than list item, canContain makes sure that this list is closed. return BlockContinue.atIndex(state.getIndex()); } - public void setTight(boolean tight) { - block.setTight(tight); - } - /** * Parse a list marker and return data on the marker or null. 
*/ - private static ListData parseListMarker(CharSequence ln, int offset) { - CharSequence rest = ln.subSequence(offset, ln.length()); - int spacesAfterMarker; - ListBlock listBlock; - - Matcher match; - if ((match = BULLET_LIST_MARKER.matcher(rest)).find()) { - BulletList bulletList = new BulletList(); - bulletList.setBulletMarker(match.group(0).charAt(0)); - listBlock = bulletList; - spacesAfterMarker = match.group(1).length(); - } else if ((match = ORDERED_LIST_MARKER.matcher(rest)).find()) { - OrderedList orderedList = new OrderedList(); - orderedList.setStartNumber(Integer.parseInt(match.group(1))); - orderedList.setDelimiter(match.group(2).charAt(0)); - listBlock = orderedList; - spacesAfterMarker = match.group(3).length(); - } else { + private static ListData parseList(CharSequence line, final int markerIndex, final int markerColumn, + final boolean inParagraph) { + ListMarkerData listMarker = parseListMarker(line, markerIndex); + if (listMarker == null) { return null; } - int padding; - boolean blankItem = match.group(0).length() == rest.length(); - if (spacesAfterMarker >= 5 || spacesAfterMarker < 1 || blankItem) { - padding = match.group(0).length() - spacesAfterMarker + 1; + ListBlock listBlock = listMarker.listBlock; + + int indexAfterMarker = listMarker.indexAfterMarker; + int markerLength = indexAfterMarker - markerIndex; + // marker doesn't include tabs, so counting them as columns directly is ok + int columnAfterMarker = markerColumn + markerLength; + // the column within the line where the content starts + int contentColumn = columnAfterMarker; + + // See at which column the content starts if there is content + boolean hasContent = false; + int length = line.length(); + for (int i = indexAfterMarker; i < length; i++) { + char c = line.charAt(i); + if (c == '\t') { + contentColumn += Parsing.columnsToNextTabStop(contentColumn); + } else if (c == ' ') { + contentColumn++; + } else { + hasContent = true; + break; + } + } + + if (inParagraph) { + // If 
the list item is ordered, the start number must be 1 to interrupt a paragraph. + if (listBlock instanceof OrderedList && ((OrderedList) listBlock).getMarkerStartNumber() != 1) { + return null; + } + // Empty list item can not interrupt a paragraph. + if (!hasContent) { + return null; + } + } + + if (!hasContent || (contentColumn - columnAfterMarker) > Parsing.CODE_BLOCK_INDENT) { + // If this line is blank or has a code block, default to 1 space after marker + contentColumn = columnAfterMarker + 1; + } + + return new ListData(listBlock, contentColumn); + } + + private static ListMarkerData parseListMarker(CharSequence line, int index) { + char c = line.charAt(index); + switch (c) { + // spec: A bullet list marker is a -, +, or * character. + case '-': + case '+': + case '*': + if (isSpaceTabOrEnd(line, index + 1)) { + BulletList bulletList = new BulletList(); + bulletList.setMarker(String.valueOf(c)); + return new ListMarkerData(bulletList, index + 1); + } else { + return null; + } + default: + return parseOrderedList(line, index); + } + } + + // spec: An ordered list marker is a sequence of 1-9 arabic digits (0-9), followed by either a `.` character or a + // `)` character. 
+ private static ListMarkerData parseOrderedList(CharSequence line, int index) { + int digits = 0; + int length = line.length(); + for (int i = index; i < length; i++) { + char c = line.charAt(i); + switch (c) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + digits++; + if (digits > 9) { + return null; + } + break; + case '.': + case ')': + if (digits >= 1 && isSpaceTabOrEnd(line, i + 1)) { + String number = line.subSequence(index, i).toString(); + OrderedList orderedList = new OrderedList(); + orderedList.setMarkerStartNumber(Integer.parseInt(number)); + orderedList.setMarkerDelimiter(String.valueOf(c)); + return new ListMarkerData(orderedList, i + 1); + } else { + return null; + } + default: + return null; + } + } + return null; + } + + private static boolean isSpaceTabOrEnd(CharSequence line, int index) { + if (index < line.length()) { + switch (line.charAt(index)) { + case ' ': + case '\t': + return true; + default: + return false; + } } else { - padding = match.group(0).length(); + return true; } - return new ListData(listBlock, padding); } /** @@ -84,9 +190,9 @@ private static ListData parseListMarker(CharSequence ln, int offset) { */ private static boolean listsMatch(ListBlock a, ListBlock b) { if (a instanceof BulletList && b instanceof BulletList) { - return Objects.equals(((BulletList) a).getBulletMarker(), ((BulletList) b).getBulletMarker()); + return Objects.equals(((BulletList) a).getMarker(), ((BulletList) b).getMarker()); } else if (a instanceof OrderedList && b instanceof OrderedList) { - return Objects.equals(((OrderedList) a).getDelimiter(), ((OrderedList) b).getDelimiter()); + return Objects.equals(((OrderedList) a).getMarkerDelimiter(), ((OrderedList) b).getMarkerDelimiter()); } return false; } @@ -97,43 +203,52 @@ public static class Factory extends AbstractBlockParserFactory { public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { 
BlockParser matched = matchedBlockParser.getMatchedBlockParser(); - if (state.getIndent() >= 4 && !(matched instanceof ListBlockParser)) { + if (state.getIndent() >= Parsing.CODE_BLOCK_INDENT) { return BlockStart.none(); } - int nextNonSpace = state.getNextNonSpaceIndex(); - ListData listData = parseListMarker(state.getLine(), nextNonSpace); + int markerIndex = state.getNextNonSpaceIndex(); + int markerColumn = state.getColumn() + state.getIndent(); + boolean inParagraph = !matchedBlockParser.getParagraphLines().isEmpty(); + ListData listData = parseList(state.getLine().getContent(), markerIndex, markerColumn, inParagraph); if (listData == null) { return BlockStart.none(); } - // list item - int newIndex = nextNonSpace + listData.padding; - - int itemIndent = state.getIndent() + listData.padding; - ListItemParser listItemParser = new ListItemParser(itemIndent); + int newColumn = listData.contentColumn; + ListItemParser listItemParser = new ListItemParser(state.getIndent(), newColumn - state.getColumn()); // prepend the list block if needed if (!(matched instanceof ListBlockParser) || !(listsMatch((ListBlock) matched.getBlock(), listData.listBlock))) { ListBlockParser listBlockParser = new ListBlockParser(listData.listBlock); - listBlockParser.setTight(true); + // We start out with assuming a list is tight. If we find a blank line, we set it to loose later. 
+ listData.listBlock.setTight(true); - return BlockStart.of(listBlockParser, listItemParser).atIndex(newIndex); + return BlockStart.of(listBlockParser, listItemParser).atColumn(newColumn); } else { - return BlockStart.of(listItemParser).atIndex(newIndex); + return BlockStart.of(listItemParser).atColumn(newColumn); } } } private static class ListData { final ListBlock listBlock; - final int padding; + final int contentColumn; - public ListData(ListBlock listBlock, int padding) { + ListData(ListBlock listBlock, int contentColumn) { this.listBlock = listBlock; - this.padding = padding; + this.contentColumn = contentColumn; } } + private static class ListMarkerData { + final ListBlock listBlock; + final int indexAfterMarker; + + ListMarkerData(ListBlock listBlock, int indexAfterMarker) { + this.listBlock = listBlock; + this.indexAfterMarker = indexAfterMarker; + } + } } diff --git a/commonmark/src/main/java/org/commonmark/internal/ListItemParser.java b/commonmark/src/main/java/org/commonmark/internal/ListItemParser.java index dfdae39b9..49722dff2 100644 --- a/commonmark/src/main/java/org/commonmark/internal/ListItemParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/ListItemParser.java @@ -1,7 +1,9 @@ package org.commonmark.internal; import org.commonmark.node.Block; +import org.commonmark.node.ListBlock; import org.commonmark.node.ListItem; +import org.commonmark.node.Paragraph; import org.commonmark.parser.block.AbstractBlockParser; import org.commonmark.parser.block.BlockContinue; import org.commonmark.parser.block.ParserState; @@ -10,10 +12,18 @@ public class ListItemParser extends AbstractBlockParser { private final ListItem block = new ListItem(); - private int itemIndent; + /** + * Minimum number of columns that the content has to be indented (relative to the containing block) to be part of + * this list item. 
+ */ + private int contentIndent; - public ListItemParser(int itemIndent) { - this.itemIndent = itemIndent; + private boolean hadBlankLine; + + public ListItemParser(int markerIndent, int contentIndent) { + this.contentIndent = contentIndent; + block.setMarkerIndent(markerIndent); + block.setContentIndent(contentIndent); } @Override @@ -22,7 +32,17 @@ public boolean isContainer() { } @Override - public boolean canContain(Block block) { + public boolean canContain(Block childBlock) { + if (hadBlankLine) { + // We saw a blank line in this list item, that means the list block is loose. + // + // spec: if any of its constituent list items directly contain two block-level elements with a blank line + // between them + Block parent = block.getParent(); + if (parent instanceof ListBlock) { + ((ListBlock) parent).setTight(false); + } + } return true; } @@ -34,14 +54,22 @@ public Block getBlock() { @Override public BlockContinue tryContinue(ParserState state) { if (state.isBlank()) { - return BlockContinue.atIndex(state.getNextNonSpaceIndex()); + if (block.getFirstChild() == null) { + // Blank line after empty list item + return BlockContinue.none(); + } else { + Block activeBlock = state.getActiveBlockParser().getBlock(); + // If the active block is a code block, blank lines in it should not affect if the list is tight. + hadBlankLine = activeBlock instanceof Paragraph || activeBlock instanceof ListItem; + return BlockContinue.atIndex(state.getNextNonSpaceIndex()); + } } - if (state.getIndent() >= itemIndent) { - return BlockContinue.atColumn(state.getColumn() + itemIndent); + if (state.getIndent() >= contentIndent) { + return BlockContinue.atColumn(state.getColumn() + contentIndent); } else { + // Note: We'll hit this case for lazy continuation lines, they will get added later. 
return BlockContinue.none(); } } - } diff --git a/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java b/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java index 4ca32c487..27eb1e647 100644 --- a/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java @@ -1,17 +1,24 @@ package org.commonmark.internal; -import org.commonmark.internal.util.Parsing; -import org.commonmark.node.Block; -import org.commonmark.node.Paragraph; +import org.commonmark.node.*; +import org.commonmark.parser.InlineParser; +import org.commonmark.parser.SourceLine; +import org.commonmark.parser.SourceLines; import org.commonmark.parser.block.AbstractBlockParser; import org.commonmark.parser.block.BlockContinue; -import org.commonmark.parser.InlineParser; import org.commonmark.parser.block.ParserState; +import java.util.List; + public class ParagraphParser extends AbstractBlockParser { private final Paragraph block = new Paragraph(); - private BlockContent content = new BlockContent(); + private final LinkReferenceDefinitionParser linkReferenceDefinitionParser = new LinkReferenceDefinitionParser(); + + @Override + public boolean canHaveLazyContinuationLines() { + return true; + } @Override public Block getBlock() { @@ -28,45 +35,52 @@ public BlockContinue tryContinue(ParserState state) { } @Override - public void addLine(CharSequence line) { - content.add(line); + public void addLine(SourceLine line) { + linkReferenceDefinitionParser.parse(line); } @Override - public void closeBlock() { + public void addSourceSpan(SourceSpan sourceSpan) { + // Some source spans might belong to link reference definitions, others to the paragraph. + // The parser will handle that. 
+ linkReferenceDefinitionParser.addSourceSpan(sourceSpan); } - public void closeBlock(InlineParserImpl inlineParser) { - String contentString = content.getString(); - boolean hasReferenceDefs = false; + @Override + public List<DefinitionMap<?>> getDefinitions() { + var map = new DefinitionMap<>(LinkReferenceDefinition.class); + for (var def : linkReferenceDefinitionParser.getDefinitions()) { + map.putIfAbsent(def.getLabel(), def); + } + return List.of(map); + } - int pos; - // try parsing the beginning as link reference definitions: - while (contentString.length() > 3 && contentString.charAt(0) == '[' && - (pos = inlineParser.parseReference(contentString)) != 0) { - contentString = contentString.substring(pos); - hasReferenceDefs = true; + @Override + public void closeBlock() { + for (var def : linkReferenceDefinitionParser.getDefinitions()) { + block.insertBefore(def); } - if (hasReferenceDefs && Parsing.isBlank(contentString)) { + + if (linkReferenceDefinitionParser.getParagraphLines().isEmpty()) { block.unlink(); - content = null; } else { - content = new BlockContent(contentString); + block.setSourceSpans(linkReferenceDefinitionParser.getParagraphSourceSpans()); } } @Override public void parseInlines(InlineParser inlineParser) { - if (content != null) { - inlineParser.parse(content.getString(), block); + SourceLines lines = linkReferenceDefinitionParser.getParagraphLines(); + if (!lines.isEmpty()) { + inlineParser.parse(lines, block); } } - public boolean hasSingleLine() { - return content.hasSingleLine(); + public SourceLines getParagraphLines() { + return linkReferenceDefinitionParser.getParagraphLines(); } - public String getContentString() { - return content.getString(); + public List<SourceSpan> removeLines(int lines) { + return linkReferenceDefinitionParser.removeLines(lines); } } diff --git a/commonmark/src/main/java/org/commonmark/internal/StaggeredDelimiterProcessor.java 
b/commonmark/src/main/java/org/commonmark/internal/StaggeredDelimiterProcessor.java new file mode 100644 index 000000000..2836e346a --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/StaggeredDelimiterProcessor.java @@ -0,0 +1,76 @@ +package org.commonmark.internal; + +import org.commonmark.parser.delimiter.DelimiterProcessor; +import org.commonmark.parser.delimiter.DelimiterRun; + +import java.util.LinkedList; +import java.util.ListIterator; + +/** + * An implementation of DelimiterProcessor that dispatches all calls to two or more other DelimiterProcessors + * depending on the length of the delimiter run. All child DelimiterProcessors must have different minimum + * lengths. A given delimiter run is dispatched to the child with the largest acceptable minimum length. If no + * child is applicable, the one with the largest minimum length is chosen. + */ +class StaggeredDelimiterProcessor implements DelimiterProcessor { + + private final char delim; + private int minLength = 0; + private LinkedList<DelimiterProcessor> processors = new LinkedList<>(); // in reverse getMinLength order + + StaggeredDelimiterProcessor(char delim) { + this.delim = delim; + } + + + @Override + public char getOpeningCharacter() { + return delim; + } + + @Override + public char getClosingCharacter() { + return delim; + } + + @Override + public int getMinLength() { + return minLength; + } + + void add(DelimiterProcessor dp) { + final int len = dp.getMinLength(); + ListIterator<DelimiterProcessor> it = processors.listIterator(); + boolean added = false; + while (it.hasNext()) { + DelimiterProcessor p = it.next(); + int pLen = p.getMinLength(); + if (len > pLen) { + it.previous(); + it.add(dp); + added = true; + break; + } else if (len == pLen) { + throw new IllegalArgumentException("Cannot add two delimiter processors for char '" + delim + "' and minimum length " + len + "; conflicting processors: " + p + ", " + dp); + } + } + if (!added) { + processors.add(dp); + 
this.minLength = len; + } + } + + private DelimiterProcessor findProcessor(int len) { + for (DelimiterProcessor p : processors) { + if (p.getMinLength() <= len) { + return p; + } + } + return processors.getFirst(); + } + + @Override + public int process(DelimiterRun openingRun, DelimiterRun closingRun) { + return findProcessor(openingRun.length()).process(openingRun, closingRun); + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/ThematicBreakParser.java b/commonmark/src/main/java/org/commonmark/internal/ThematicBreakParser.java new file mode 100644 index 000000000..0f0613221 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/ThematicBreakParser.java @@ -0,0 +1,75 @@ +package org.commonmark.internal; + +import org.commonmark.node.Block; +import org.commonmark.node.ThematicBreak; +import org.commonmark.parser.block.*; + +public class ThematicBreakParser extends AbstractBlockParser { + + private final ThematicBreak block = new ThematicBreak(); + + public ThematicBreakParser(String literal) { + block.setLiteral(literal); + } + + @Override + public Block getBlock() { + return block; + } + + @Override + public BlockContinue tryContinue(ParserState state) { + // a horizontal rule can never contain > 1 line, so fail to match + return BlockContinue.none(); + } + + public static class Factory extends AbstractBlockParserFactory { + + @Override + public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { + if (state.getIndent() >= 4) { + return BlockStart.none(); + } + int nextNonSpace = state.getNextNonSpaceIndex(); + CharSequence line = state.getLine().getContent(); + if (isThematicBreak(line, nextNonSpace)) { + var literal = String.valueOf(line.subSequence(state.getIndex(), line.length())); + return BlockStart.of(new ThematicBreakParser(literal)).atIndex(line.length()); + } else { + return BlockStart.none(); + } + } + } + + // spec: A line consisting of 0-3 spaces of indentation, followed by a sequence of 
three or more matching -, _, or * + // characters, each followed optionally by any number of spaces, forms a thematic break. + private static boolean isThematicBreak(CharSequence line, int index) { + int dashes = 0; + int underscores = 0; + int asterisks = 0; + int length = line.length(); + for (int i = index; i < length; i++) { + switch (line.charAt(i)) { + case '-': + dashes++; + break; + case '_': + underscores++; + break; + case '*': + asterisks++; + break; + case ' ': + case '\t': + // Allowed, even between markers + break; + default: + return false; + } + } + + return ((dashes >= 3 && underscores == 0 && asterisks == 0) || + (underscores >= 3 && dashes == 0 && asterisks == 0) || + (asterisks >= 3 && dashes == 0 && underscores == 0)); + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/AsteriskDelimiterProcessor.java b/commonmark/src/main/java/org/commonmark/internal/inline/AsteriskDelimiterProcessor.java index b8630dfc4..321c78ed6 100644 --- a/commonmark/src/main/java/org/commonmark/internal/inline/AsteriskDelimiterProcessor.java +++ b/commonmark/src/main/java/org/commonmark/internal/inline/AsteriskDelimiterProcessor.java @@ -1,8 +1,8 @@ package org.commonmark.internal.inline; public class AsteriskDelimiterProcessor extends EmphasisDelimiterProcessor { - @Override - public char getDelimiterChar() { - return '*'; + + public AsteriskDelimiterProcessor() { + super('*'); } } diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java new file mode 100644 index 000000000..a18966e54 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java @@ -0,0 +1,61 @@ +package org.commonmark.internal.inline; + +import org.commonmark.node.Link; +import org.commonmark.node.Text; +import org.commonmark.parser.SourceLines; +import org.commonmark.parser.beta.*; + +import java.util.Set; +import 
java.util.regex.Pattern; + +/** + * Attempt to parse an autolink (URL or email in pointy brackets). + */ +public class AutolinkInlineParser implements InlineContentParser { + + private static final Pattern URI = Pattern + .compile("^[a-zA-Z][a-zA-Z0-9.+-]{1,31}:[^<>\u0000-\u0020]*$"); + + private static final Pattern EMAIL = Pattern + .compile("^([a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)$"); + + @Override + public ParsedInline tryParse(InlineParserState inlineParserState) { + Scanner scanner = inlineParserState.scanner(); + scanner.next(); + Position textStart = scanner.position(); + if (scanner.find('>') > 0) { + SourceLines textSource = scanner.getSource(textStart, scanner.position()); + String content = textSource.getContent(); + scanner.next(); + + String destination = null; + if (URI.matcher(content).matches()) { + destination = content; + } else if (EMAIL.matcher(content).matches()) { + destination = "mailto:" + content; + } + + if (destination != null) { + Link link = new Link(destination, null); + Text text = new Text(content); + text.setSourceSpans(textSource.getSourceSpans()); + link.appendChild(text); + return ParsedInline.of(link, scanner.position()); + } + } + return ParsedInline.none(); + } + + public static class Factory implements InlineContentParserFactory { + @Override + public Set<Character> getTriggerCharacters() { + return Set.of('<'); + } + + @Override + public InlineContentParser create() { + return new AutolinkInlineParser(); + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java new file mode 100644 index 000000000..7baeed4de --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java @@ -0,0 +1,48 @@ +package org.commonmark.internal.inline; + +import 
org.commonmark.internal.util.Escaping; +import org.commonmark.node.HardLineBreak; +import org.commonmark.node.Text; +import org.commonmark.parser.beta.*; + +import java.util.Set; +import java.util.regex.Pattern; + +/** + * Parse a backslash-escaped special character, adding either the escaped character, a hard line break + * (if the backslash is followed by a newline), or a literal backslash to the block's children. + */ +public class BackslashInlineParser implements InlineContentParser { + + private static final Pattern ESCAPABLE = Pattern.compile('^' + Escaping.ESCAPABLE); + + @Override + public ParsedInline tryParse(InlineParserState inlineParserState) { + Scanner scanner = inlineParserState.scanner(); + // Backslash + scanner.next(); + + char next = scanner.peek(); + if (next == '\n') { + scanner.next(); + return ParsedInline.of(new HardLineBreak(), scanner.position()); + } else if (ESCAPABLE.matcher(String.valueOf(next)).matches()) { + scanner.next(); + return ParsedInline.of(new Text(String.valueOf(next)), scanner.position()); + } else { + return ParsedInline.of(new Text("\\"), scanner.position()); + } + } + + public static class Factory implements InlineContentParserFactory { + @Override + public Set<Character> getTriggerCharacters() { + return Set.of('\\'); + } + + @Override + public InlineContentParser create() { + return new BackslashInlineParser(); + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java new file mode 100644 index 000000000..b8e8984e8 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java @@ -0,0 +1,63 @@ +package org.commonmark.internal.inline; + +import org.commonmark.node.Code; +import org.commonmark.node.Text; +import org.commonmark.parser.SourceLines; +import org.commonmark.parser.beta.*; +import org.commonmark.text.Characters; + +import java.util.Set; 
+ +/** + * Attempt to parse backticks, returning either a backtick code span or a literal sequence of backticks. + */ +public class BackticksInlineParser implements InlineContentParser { + + @Override + public ParsedInline tryParse(InlineParserState inlineParserState) { + Scanner scanner = inlineParserState.scanner(); + Position start = scanner.position(); + int openingTicks = scanner.matchMultiple('`'); + Position afterOpening = scanner.position(); + + while (scanner.find('`') > 0) { + Position beforeClosing = scanner.position(); + int count = scanner.matchMultiple('`'); + if (count == openingTicks) { + Code node = new Code(); + + String content = scanner.getSource(afterOpening, beforeClosing).getContent(); + content = content.replace('\n', ' '); + + // spec: If the resulting string both begins and ends with a space character, but does not consist + // entirely of space characters, a single space character is removed from the front and back. + if (content.length() >= 3 && + content.charAt(0) == ' ' && + content.charAt(content.length() - 1) == ' ' && + Characters.hasNonSpace(content)) { + content = content.substring(1, content.length() - 1); + } + + node.setLiteral(content); + return ParsedInline.of(node, scanner.position()); + } + } + + // If we got here, we didn't find a matching closing backtick sequence. 
+ SourceLines source = scanner.getSource(start, afterOpening); + Text text = new Text(source.getContent()); + return ParsedInline.of(text, afterOpening); + } + + public static class Factory implements InlineContentParserFactory { + @Override + public Set<Character> getTriggerCharacters() { + return Set.of('`'); + } + + @Override + public InlineContentParser create() { + return new BackticksInlineParser(); + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/CoreLinkProcessor.java b/commonmark/src/main/java/org/commonmark/internal/inline/CoreLinkProcessor.java new file mode 100644 index 000000000..528750aba --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/inline/CoreLinkProcessor.java @@ -0,0 +1,37 @@ +package org.commonmark.internal.inline; + +import org.commonmark.node.Image; +import org.commonmark.node.Link; +import org.commonmark.node.LinkReferenceDefinition; +import org.commonmark.parser.InlineParserContext; +import org.commonmark.parser.beta.LinkInfo; +import org.commonmark.parser.beta.LinkProcessor; +import org.commonmark.parser.beta.LinkResult; +import org.commonmark.parser.beta.Scanner; + +public class CoreLinkProcessor implements LinkProcessor { + + @Override + public LinkResult process(LinkInfo linkInfo, Scanner scanner, InlineParserContext context) { + if (linkInfo.destination() != null) { + // Inline link + return process(linkInfo, scanner, linkInfo.destination(), linkInfo.title()); + } + + var label = linkInfo.label(); + var ref = label != null && !label.isEmpty() ? 
label : linkInfo.text(); + var def = context.getDefinition(LinkReferenceDefinition.class, ref); + if (def != null) { + // Reference link + return process(linkInfo, scanner, def.getDestination(), def.getTitle()); + } + return LinkResult.none(); + } + + private static LinkResult process(LinkInfo linkInfo, Scanner scanner, String destination, String title) { + if (linkInfo.marker() != null && linkInfo.marker().getLiteral().equals("!")) { + return LinkResult.wrapTextIn(new Image(destination, title), scanner.position()).includeMarker(); + } + return LinkResult.wrapTextIn(new Link(destination, title), scanner.position()); + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/EmphasisDelimiterProcessor.java b/commonmark/src/main/java/org/commonmark/internal/inline/EmphasisDelimiterProcessor.java index 3e83dd7a9..493e4299c 100644 --- a/commonmark/src/main/java/org/commonmark/internal/inline/EmphasisDelimiterProcessor.java +++ b/commonmark/src/main/java/org/commonmark/internal/inline/EmphasisDelimiterProcessor.java @@ -1,40 +1,66 @@ package org.commonmark.internal.inline; -import org.commonmark.parser.DelimiterProcessor; -import org.commonmark.node.Emphasis; -import org.commonmark.node.Node; -import org.commonmark.node.StrongEmphasis; -import org.commonmark.node.Text; +import org.commonmark.node.*; +import org.commonmark.parser.delimiter.DelimiterProcessor; +import org.commonmark.parser.delimiter.DelimiterRun; public abstract class EmphasisDelimiterProcessor implements DelimiterProcessor { + private final char delimiterChar; + + protected EmphasisDelimiterProcessor(char delimiterChar) { + this.delimiterChar = delimiterChar; + } + + @Override + public char getOpeningCharacter() { + return delimiterChar; + } + + @Override + public char getClosingCharacter() { + return delimiterChar; + } + @Override - public int getMinDelimiterCount() { + public int getMinLength() { return 1; } @Override - public int getDelimiterUse(int openerCount, int closerCount) { + 
public int process(DelimiterRun openingRun, DelimiterRun closingRun) { + // "multiple of 3" rule for internal delimiter runs + if ((openingRun.canClose() || closingRun.canOpen()) && + closingRun.originalLength() % 3 != 0 && + (openingRun.originalLength() + closingRun.originalLength()) % 3 == 0) { + return 0; + } + + int usedDelimiters; + Node emphasis; // calculate actual number of delimiters used from this closer - if (closerCount < 3 || openerCount < 3) { - return closerCount <= openerCount ? - closerCount : openerCount; + if (openingRun.length() >= 2 && closingRun.length() >= 2) { + usedDelimiters = 2; + emphasis = new StrongEmphasis(String.valueOf(delimiterChar) + delimiterChar); } else { - return closerCount % 2 == 0 ? 2 : 1; + usedDelimiters = 1; + emphasis = new Emphasis(String.valueOf(delimiterChar)); } - } - @Override - public void process(Text opener, Text closer, int delimiterUse) { - Node emphasis = delimiterUse == 1 ? new Emphasis() : new StrongEmphasis(); - - Node tmp = opener.getNext(); - while (tmp != null && tmp != closer) { - Node next = tmp.getNext(); - emphasis.appendChild(tmp); - tmp = next; + SourceSpans sourceSpans = SourceSpans.empty(); + sourceSpans.addAllFrom(openingRun.getOpeners(usedDelimiters)); + + Text opener = openingRun.getOpener(); + for (Node node : Nodes.between(opener, closingRun.getCloser())) { + emphasis.appendChild(node); + sourceSpans.addAll(node.getSourceSpans()); } + sourceSpans.addAllFrom(closingRun.getClosers(usedDelimiters)); + + emphasis.setSourceSpans(sourceSpans.getSourceSpans()); opener.insertAfter(emphasis); + + return usedDelimiters; } } diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java new file mode 100644 index 000000000..c24e60747 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/inline/EntityInlineParser.java @@ -0,0 +1,69 @@ +package org.commonmark.internal.inline; + 
+import org.commonmark.internal.util.Html5Entities; +import org.commonmark.node.Text; +import org.commonmark.parser.beta.*; +import org.commonmark.text.AsciiMatcher; + +import java.util.Set; + +/** + * Attempts to parse an HTML entity or numeric character reference. + */ +public class EntityInlineParser implements InlineContentParser { + + private static final AsciiMatcher hex = AsciiMatcher.builder().range('0', '9').range('A', 'F').range('a', 'f').build(); + private static final AsciiMatcher dec = AsciiMatcher.builder().range('0', '9').build(); + private static final AsciiMatcher entityStart = AsciiMatcher.builder().range('A', 'Z').range('a', 'z').build(); + private static final AsciiMatcher entityContinue = entityStart.newBuilder().range('0', '9').build(); + + @Override + public ParsedInline tryParse(InlineParserState inlineParserState) { + Scanner scanner = inlineParserState.scanner(); + Position start = scanner.position(); + // Skip `&` + scanner.next(); + + char c = scanner.peek(); + if (c == '#') { + // Numeric + scanner.next(); + if (scanner.next('x') || scanner.next('X')) { + int digits = scanner.match(hex); + if (1 <= digits && digits <= 6 && scanner.next(';')) { + return entity(scanner, start); + } + } else { + int digits = scanner.match(dec); + if (1 <= digits && digits <= 7 && scanner.next(';')) { + return entity(scanner, start); + } + } + } else if (entityStart.matches(c)) { + scanner.match(entityContinue); + if (scanner.next(';')) { + return entity(scanner, start); + } + } + + return ParsedInline.none(); + } + + private ParsedInline entity(Scanner scanner, Position start) { + String text = scanner.getSource(start, scanner.position()).getContent(); + return ParsedInline.of(new Text(Html5Entities.entityToString(text)), scanner.position()); + } + + public static class Factory implements InlineContentParserFactory { + + @Override + public Set<Character> getTriggerCharacters() { + return Set.of('&'); + } + + @Override + public InlineContentParser create() 
{ + return new EntityInlineParser(); + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java b/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java new file mode 100644 index 000000000..a48ea5022 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/inline/HtmlInlineParser.java @@ -0,0 +1,217 @@ +package org.commonmark.internal.inline; + +import org.commonmark.node.HtmlInline; +import org.commonmark.parser.beta.*; +import org.commonmark.text.AsciiMatcher; + +import java.util.Set; + +/** + * Attempt to parse inline HTML. + */ +public class HtmlInlineParser implements InlineContentParser { + + private static final AsciiMatcher asciiLetter = AsciiMatcher.builder().range('A', 'Z').range('a', 'z').build(); + + // spec: A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or hyphens (-). + private static final AsciiMatcher tagNameStart = asciiLetter; + private static final AsciiMatcher tagNameContinue = tagNameStart.newBuilder().range('0', '9').c('-').build(); + + // spec: An attribute name consists of an ASCII letter, _, or :, followed by zero or more ASCII letters, digits, + // _, ., :, or -. (Note: This is the XML specification restricted to ASCII. HTML5 is laxer.) + private static final AsciiMatcher attributeStart = asciiLetter.newBuilder().c('_').c(':').build(); + private static final AsciiMatcher attributeContinue = attributeStart.newBuilder().range('0', '9').c('.').c('-').build(); + // spec: An unquoted attribute value is a nonempty string of characters not including whitespace, ", ', =, <, >, or `. 
+ private static final AsciiMatcher attributeValueEnd = AsciiMatcher.builder() + .c(' ').c('\t').c('\n').c('\u000B').c('\f').c('\r') + .c('"').c('\'').c('=').c('<').c('>').c('`') + .build(); + + @Override + public ParsedInline tryParse(InlineParserState inlineParserState) { + Scanner scanner = inlineParserState.scanner(); + Position start = scanner.position(); + // Skip over `<` + scanner.next(); + + char c = scanner.peek(); + if (tagNameStart.matches(c)) { + if (tryOpenTag(scanner)) { + return htmlInline(start, scanner); + } + } else if (c == '/') { + if (tryClosingTag(scanner)) { + return htmlInline(start, scanner); + } + } else if (c == '?') { + if (tryProcessingInstruction(scanner)) { + return htmlInline(start, scanner); + } + } else if (c == '!') { + // comment, declaration or CDATA + scanner.next(); + c = scanner.peek(); + if (c == '-') { + if (tryComment(scanner)) { + return htmlInline(start, scanner); + } + } else if (c == '[') { + if (tryCdata(scanner)) { + return htmlInline(start, scanner); + } + } else if (asciiLetter.matches(c)) { + if (tryDeclaration(scanner)) { + return htmlInline(start, scanner); + } + } + } + + return ParsedInline.none(); + } + + private static ParsedInline htmlInline(Position start, Scanner scanner) { + String text = scanner.getSource(start, scanner.position()).getContent(); + HtmlInline node = new HtmlInline(); + node.setLiteral(text); + return ParsedInline.of(node, scanner.position()); + } + + private static boolean tryOpenTag(Scanner scanner) { + // spec: An open tag consists of a < character, a tag name, zero or more attributes, optional whitespace, + // an optional / character, and a > character. + scanner.next(); + scanner.match(tagNameContinue); + boolean whitespace = scanner.whitespace() >= 1; + // spec: An attribute consists of whitespace, an attribute name, and an optional attribute value specification. 
+ while (whitespace && scanner.match(attributeStart) >= 1) { + scanner.match(attributeContinue); + // spec: An attribute value specification consists of optional whitespace, a = character, + // optional whitespace, and an attribute value. + whitespace = scanner.whitespace() >= 1; + if (scanner.next('=')) { + scanner.whitespace(); + char valueStart = scanner.peek(); + if (valueStart == '\'') { + scanner.next(); + if (scanner.find('\'') < 0) { + return false; + } + scanner.next(); + } else if (valueStart == '"') { + scanner.next(); + if (scanner.find('"') < 0) { + return false; + } + scanner.next(); + } else { + if (scanner.find(attributeValueEnd) <= 0) { + return false; + } + } + + // Whitespace is required between attributes + whitespace = scanner.whitespace() >= 1; + } + } + + scanner.next('/'); + return scanner.next('>'); + } + + private static boolean tryClosingTag(Scanner scanner) { + // spec: A closing tag consists of the string </, a tag name, optional whitespace, and the character >. + scanner.next(); + if (scanner.match(tagNameStart) >= 1) { + scanner.match(tagNameContinue); + scanner.whitespace(); + return scanner.next('>'); + } + return false; + } + + private static boolean tryProcessingInstruction(Scanner scanner) { + // spec: A processing instruction consists of the string <?, a string of characters not including the string ?>, + // and the string ?>. + scanner.next(); + while (scanner.find('?') > 0) { + scanner.next(); + if (scanner.next('>')) { + return true; + } + } + return false; + } + + private static boolean tryComment(Scanner scanner) { + // spec: An [HTML comment](@) consists of `<!-->`, `<!--->`, or `<!--`, a string of + // characters not including the string `-->`, and `-->` (see the + // [HTML spec](https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state)). 
+ + // Skip first `-` + scanner.next(); + if (!scanner.next('-')) { + return false; + } + + if (scanner.next('>') || scanner.next("->")) { + return true; + } + + while (scanner.find('-') >= 0) { + if (scanner.next("-->")) { + return true; + } else { + scanner.next(); + } + } + + return false; + } + + private static boolean tryCdata(Scanner scanner) { + // spec: A CDATA section consists of the string <![CDATA[, a string of characters not including the string ]]>, + // and the string ]]>. + + // Skip `[` + scanner.next(); + + if (scanner.next("CDATA[")) { + while (scanner.find(']') >= 0) { + if (scanner.next("]]>")) { + return true; + } else { + scanner.next(); + } + } + } + + return false; + } + + private static boolean tryDeclaration(Scanner scanner) { + // spec: A declaration consists of the string <!, an ASCII letter, zero or more characters not including + // the character >, and the character >. + scanner.match(asciiLetter); + if (scanner.whitespace() <= 0) { + return false; + } + if (scanner.find('>') >= 0) { + scanner.next(); + return true; + } + return false; + } + + public static class Factory implements InlineContentParserFactory { + + @Override + public Set<Character> getTriggerCharacters() { + return Set.of('<'); + } + + @Override + public InlineContentParser create() { + return new HtmlInlineParser(); + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/LinkResultImpl.java b/commonmark/src/main/java/org/commonmark/internal/inline/LinkResultImpl.java new file mode 100644 index 000000000..c05b24451 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/inline/LinkResultImpl.java @@ -0,0 +1,46 @@ +package org.commonmark.internal.inline; + +import org.commonmark.node.Node; +import org.commonmark.parser.beta.LinkResult; +import org.commonmark.parser.beta.Position; + +public class LinkResultImpl implements LinkResult { + @Override + public LinkResult includeMarker() { + includeMarker = true; + return this; + } + + 
public enum Type { + WRAP, + REPLACE + } + + private final Type type; + private final Node node; + private final Position position; + + private boolean includeMarker = false; + + public LinkResultImpl(Type type, Node node, Position position) { + this.type = type; + this.node = node; + this.position = position; + } + + public Type getType() { + return type; + } + + public Node getNode() { + return node; + } + + public Position getPosition() { + return position; + } + + public boolean isIncludeMarker() { + return includeMarker; + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInlineImpl.java b/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInlineImpl.java new file mode 100644 index 000000000..a77630610 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/inline/ParsedInlineImpl.java @@ -0,0 +1,23 @@ +package org.commonmark.internal.inline; + +import org.commonmark.node.Node; +import org.commonmark.parser.beta.ParsedInline; +import org.commonmark.parser.beta.Position; + +public class ParsedInlineImpl implements ParsedInline { + private final Node node; + private final Position position; + + public ParsedInlineImpl(Node node, Position position) { + this.node = node; + this.position = position; + } + + public Node getNode() { + return node; + } + + public Position getPosition() { + return position; + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/UnderscoreDelimiterProcessor.java b/commonmark/src/main/java/org/commonmark/internal/inline/UnderscoreDelimiterProcessor.java index 2804739c3..886eb89fe 100644 --- a/commonmark/src/main/java/org/commonmark/internal/inline/UnderscoreDelimiterProcessor.java +++ b/commonmark/src/main/java/org/commonmark/internal/inline/UnderscoreDelimiterProcessor.java @@ -1,8 +1,8 @@ package org.commonmark.internal.inline; public class UnderscoreDelimiterProcessor extends EmphasisDelimiterProcessor { - @Override - public char getDelimiterChar() { - return 
'_'; + + public UnderscoreDelimiterProcessor() { + super('_'); } } diff --git a/commonmark/src/main/java/org/commonmark/internal/renderer/NodeRendererMap.java b/commonmark/src/main/java/org/commonmark/internal/renderer/NodeRendererMap.java new file mode 100644 index 000000000..c74f90758 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/renderer/NodeRendererMap.java @@ -0,0 +1,41 @@ +package org.commonmark.internal.renderer; + +import org.commonmark.node.Node; +import org.commonmark.renderer.NodeRenderer; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class NodeRendererMap { + + private final List<NodeRenderer> nodeRenderers = new ArrayList<>(); + private final Map<Class<? extends Node>, NodeRenderer> renderers = new HashMap<>(32); + + /** + * Set the renderer for each {@link NodeRenderer#getNodeTypes()}, unless there was already a renderer set (first wins). + */ + public void add(NodeRenderer nodeRenderer) { + nodeRenderers.add(nodeRenderer); + for (var nodeType : nodeRenderer.getNodeTypes()) { + // The first node renderer for a node type "wins". 
+ renderers.putIfAbsent(nodeType, nodeRenderer); + } + } + + public void render(Node node) { + var nodeRenderer = renderers.get(node.getClass()); + if (nodeRenderer != null) { + nodeRenderer.render(node); + } + } + + public void beforeRoot(Node node) { + nodeRenderers.forEach(r -> r.beforeRoot(node)); + } + + public void afterRoot(Node node) { + nodeRenderers.forEach(r -> r.afterRoot(node)); + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java b/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java index cc10ec906..3350003c0 100644 --- a/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java +++ b/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java @@ -9,20 +9,13 @@ public class Escaping { public static final String ESCAPABLE = "[!\"#$%&\'()*+,./:;<=>?@\\[\\\\\\]^_`{|}~-]"; - private static final String ENTITY = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"; + public static final String ENTITY = "&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});"; private static final Pattern BACKSLASH_OR_AMP = Pattern.compile("[\\\\&]"); private static final Pattern ENTITY_OR_ESCAPED_CHAR = Pattern.compile("\\\\" + ESCAPABLE + '|' + ENTITY, Pattern.CASE_INSENSITIVE); - private static final String XML_SPECIAL = "[&<>\"]"; - - private static final Pattern XML_SPECIAL_RE = Pattern.compile(XML_SPECIAL); - - private static final Pattern XML_SPECIAL_OR_ENTITY = - Pattern.compile(ENTITY + '|' + XML_SPECIAL, Pattern.CASE_INSENSITIVE); - // From RFC 3986 (see "reserved", "unreserved") except don't escape '[' or ']' to be compatible with JS encodeURI private static final Pattern ESCAPE_IN_URI = Pattern.compile("(%[a-fA-F0-9]{0,2}|[^:/?#@!$&'()*+,;=a-zA-Z0-9\\-._~])"); @@ -32,28 +25,6 @@ public class Escaping { private static final Pattern WHITESPACE = Pattern.compile("[ \t\r\n]+"); - private static final Replacer UNSAFE_CHAR_REPLACER = new Replacer() { - @Override - public void replace(String input, 
StringBuilder sb) { - switch (input) { - case "&": - sb.append("&amp;"); - break; - case "<": - sb.append("&lt;"); - break; - case ">": - sb.append("&gt;"); - break; - case "\"": - sb.append("&quot;"); - break; - default: - sb.append(input); - } - } - }; - private static final Replacer UNESCAPE_REPLACER = new Replacer() { @Override public void replace(String input, StringBuilder sb) { @@ -88,9 +59,41 @@ public void replace(String input, StringBuilder sb) { } }; - public static String escapeHtml(String input, boolean preserveEntities) { - Pattern p = preserveEntities ? XML_SPECIAL_OR_ENTITY : XML_SPECIAL_RE; - return replaceAll(p, input, UNSAFE_CHAR_REPLACER); + public static String escapeHtml(String input) { + // Avoid building a new string in the majority of cases (nothing to escape) + StringBuilder sb = null; + + loop: + for (int i = 0; i < input.length(); i++) { + char c = input.charAt(i); + String replacement; + switch (c) { + case '&': + replacement = "&amp;"; + break; + case '<': + replacement = "&lt;"; + break; + case '>': + replacement = "&gt;"; + break; + case '\"': + replacement = "&quot;"; + break; + default: + if (sb != null) { + sb.append(c); + } + continue loop; + } + if (sb == null) { + sb = new StringBuilder(); + sb.append(input, 0, i); + } + sb.append(replacement); + } + + return sb != null ? 
sb.toString() : input; } /** @@ -108,11 +111,17 @@ public static String percentEncodeUrl(String s) { return replaceAll(ESCAPE_IN_URI, s, URI_REPLACER); } - public static String normalizeReference(String input) { - // Strip '[' and ']', then trim - String stripped = input.substring(1, input.length() - 1).trim(); - String lowercase = stripped.toLowerCase(Locale.ROOT); - return WHITESPACE.matcher(lowercase).replaceAll(" "); + public static String normalizeLabelContent(String input) { + String trimmed = input.trim(); + + // This is necessary to correctly case fold "\u1E9E" (LATIN CAPITAL LETTER SHARP S) to "SS": + // "\u1E9E".toLowerCase(Locale.ROOT) -> "\u00DF" (LATIN SMALL LETTER SHARP S) + // "\u00DF".toUpperCase(Locale.ROOT) -> "SS" + // Note that doing upper first (or only upper without lower) wouldn't work because: + // "\u1E9E".toUpperCase(Locale.ROOT) -> "\u1E9E" + String caseFolded = trimmed.toLowerCase(Locale.ROOT).toUpperCase(Locale.ROOT); + + return WHITESPACE.matcher(caseFolded).replaceAll(" "); } private static String replaceAll(Pattern p, String s, Replacer replacer) { diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Html5Entities.java b/commonmark/src/main/java/org/commonmark/internal/util/Html5Entities.java index 3a6eac9cc..8da53c053 100644 --- a/commonmark/src/main/java/org/commonmark/internal/util/Html5Entities.java +++ b/commonmark/src/main/java/org/commonmark/internal/util/Html5Entities.java @@ -4,24 +4,32 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; public class Html5Entities { private static final Map<String, String> NAMED_CHARACTER_REFERENCES = readEntities(); - private static final Pattern NUMERIC_PATTERN = Pattern.compile("^&#[Xx]?"); + private static final String ENTITY_PATH = 
"/org/commonmark/internal/util/entities.txt"; public static String entityToString(String input) { - Matcher matcher = NUMERIC_PATTERN.matcher(input); + if (!input.startsWith("&") || !input.endsWith(";")) { + return input; + } + + String value = input.substring(1, input.length() - 1); + if (value.startsWith("#")) { + value = value.substring(1); + int base = 10; + if (value.startsWith("x") || value.startsWith("X")) { + value = value.substring(1); + base = 16; + } - if (matcher.find()) { - int base = matcher.end() == 2 ? 10 : 16; try { - int codePoint = Integer.parseInt(input.substring(matcher.end(), input.length() - 1), base); + int codePoint = Integer.parseInt(value, base); if (codePoint == 0) { return "\uFFFD"; } @@ -30,8 +38,7 @@ public static String entityToString(String input) { return "\uFFFD"; } } else { - String name = input.substring(1, input.length() - 1); - String s = NAMED_CHARACTER_REFERENCES.get(name); + String s = NAMED_CHARACTER_REFERENCES.get(value); if (s != null) { return s; } else { @@ -42,8 +49,9 @@ public static String entityToString(String input) { private static Map<String, String> readEntities() { Map<String, String> entities = new HashMap<>(); - InputStream stream = Html5Entities.class.getResourceAsStream("entities.properties"); - try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) { + InputStream stream = Html5Entities.class.getResourceAsStream(ENTITY_PATH); + Charset charset = StandardCharsets.UTF_8; + try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(stream, charset))) { String line; while ((line = bufferedReader.readLine()) != null) { if (line.length() == 0) { diff --git a/commonmark/src/main/java/org/commonmark/internal/util/LineReader.java b/commonmark/src/main/java/org/commonmark/internal/util/LineReader.java new file mode 100644 index 000000000..b44098257 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/util/LineReader.java @@ 
-0,0 +1,149 @@ +package org.commonmark.internal.util; + +import java.io.Closeable; +import java.io.IOException; +import java.io.Reader; + +/** + * Reads lines from a reader like {@link java.io.BufferedReader} but also returns the line terminators. + * <p> + * Line terminators can be either a line feed {@code "\n"}, carriage return {@code "\r"}, or a carriage return followed + * by a line feed {@code "\r\n"}. Call {@link #getLineTerminator()} after {@link #readLine()} to obtain the + * corresponding line terminator. If a stream has a line at the end without a terminator, {@link #getLineTerminator()} + * returns {@code null}. + */ +public class LineReader implements Closeable { + + // Same as java.io.BufferedReader + static final int CHAR_BUFFER_SIZE = 8192; + static final int EXPECTED_LINE_LENGTH = 80; + + private Reader reader; + private char[] cbuf; + + private int position = 0; + private int limit = 0; + + private String lineTerminator = null; + + public LineReader(Reader reader) { + this.reader = reader; + this.cbuf = new char[CHAR_BUFFER_SIZE]; + } + + /** + * Read a line of text. + * + * @return the line, or {@code null} when the end of the stream has been reached and no more lines can be read + */ + public String readLine() throws IOException { + StringBuilder sb = null; + boolean cr = false; + + while (true) { + if (position >= limit) { + fill(); + } + + if (cr) { + // We saw a CR before, check if we have CR LF or just CR. + if (position < limit && cbuf[position] == '\n') { + position++; + return line(sb.toString(), "\r\n"); + } else { + return line(sb.toString(), "\r"); + } + } + + if (position >= limit) { + // End of stream, return either the last line without terminator or null for end. + return line(sb != null ? 
sb.toString() : null, null); + } + + int start = position; + int i = position; + for (; i < limit; i++) { + char c = cbuf[i]; + if (c == '\n') { + position = i + 1; + return line(finish(sb, start, i), "\n"); + } else if (c == '\r') { + if (i + 1 < limit) { + // We know what the next character is, so we can check now whether we have + // a CR LF or just a CR and return. + if (cbuf[i + 1] == '\n') { + position = i + 2; + return line(finish(sb, start, i), "\r\n"); + } else { + position = i + 1; + return line(finish(sb, start, i), "\r"); + } + } else { + // We don't know what the next character is yet, check on next iteration. + cr = true; + position = i + 1; + break; + } + } + } + + if (position < i) { + position = i; + } + + // Haven't found a finished line yet, copy the data from the buffer so that we can fill + // the buffer again. + if (sb == null) { + sb = new StringBuilder(EXPECTED_LINE_LENGTH); + } + sb.append(cbuf, start, i - start); + } + } + + /** + * Return the line terminator of the last read line from {@link #readLine()}. 
+ * + * @return {@code "\n"}, {@code "\r"}, {@code "\r\n"}, or {@code null} + */ + public String getLineTerminator() { + return lineTerminator; + } + + @Override + public void close() throws IOException { + if (reader == null) { + return; + } + try { + reader.close(); + } finally { + reader = null; + cbuf = null; + } + } + + private void fill() throws IOException { + int read; + do { + read = reader.read(cbuf, 0, cbuf.length); + } while (read == 0); + if (read > 0) { + limit = read; + position = 0; + } + } + + private String line(String line, String lineTerminator) { + this.lineTerminator = lineTerminator; + return line; + } + + private String finish(StringBuilder sb, int start, int end) { + int len = end - start; + if (sb == null) { + return new String(cbuf, start, len); + } else { + return sb.append(cbuf, start, len).toString(); + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java b/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java new file mode 100644 index 000000000..ffed047e5 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java @@ -0,0 +1,202 @@ +package org.commonmark.internal.util; + +import org.commonmark.parser.beta.Scanner; + +public class LinkScanner { + + /** + * Attempt to scan the contents of a link label (inside the brackets), stopping after the content or returning false. + * The stopped position can be either the closing {@code ]}, or the end of the line if the label continues on + * the next line. + */ + public static boolean scanLinkLabelContent(Scanner scanner) { + while (scanner.hasNext()) { + switch (scanner.peek()) { + case '\\': + scanner.next(); + if (isEscapable(scanner.peek())) { + scanner.next(); + } + break; + case ']': + return true; + case '[': + // spec: Unescaped square bracket characters are not allowed inside the opening and closing + // square brackets of link labels. 
+ return false; + default: + scanner.next(); + } + } + return true; + } + + /** + * Attempt to scan a link destination, stopping after the destination or returning false. + */ + public static boolean scanLinkDestination(Scanner scanner) { + if (!scanner.hasNext()) { + return false; + } + + if (scanner.next('<')) { + while (scanner.hasNext()) { + switch (scanner.peek()) { + case '\\': + scanner.next(); + if (isEscapable(scanner.peek())) { + scanner.next(); + } + break; + case '\n': + case '<': + return false; + case '>': + scanner.next(); + return true; + default: + scanner.next(); + } + } + return false; + } else { + return scanLinkDestinationWithBalancedParens(scanner); + } + } + + public static boolean scanLinkTitle(Scanner scanner) { + if (!scanner.hasNext()) { + return false; + } + + char endDelimiter; + switch (scanner.peek()) { + case '"': + endDelimiter = '"'; + break; + case '\'': + endDelimiter = '\''; + break; + case '(': + endDelimiter = ')'; + break; + default: + return false; + } + scanner.next(); + + if (!scanLinkTitleContent(scanner, endDelimiter)) { + return false; + } + if (!scanner.hasNext()) { + return false; + } + scanner.next(); + return true; + } + + public static boolean scanLinkTitleContent(Scanner scanner, char endDelimiter) { + while (scanner.hasNext()) { + char c = scanner.peek(); + if (c == '\\') { + scanner.next(); + if (isEscapable(scanner.peek())) { + scanner.next(); + } + } else if (c == endDelimiter) { + return true; + } else if (endDelimiter == ')' && c == '(') { + // unescaped '(' in title within parens is invalid + return false; + } else { + scanner.next(); + } + } + return true; + } + + // spec: a nonempty sequence of characters that does not start with <, does not include ASCII space or control + // characters, and includes parentheses only if (a) they are backslash-escaped or (b) they are part of a balanced + // pair of unescaped parentheses + private static boolean scanLinkDestinationWithBalancedParens(Scanner scanner) { + 
int parens = 0; + boolean empty = true; + while (scanner.hasNext()) { + char c = scanner.peek(); + switch (c) { + case ' ': + return !empty; + case '\\': + scanner.next(); + if (isEscapable(scanner.peek())) { + scanner.next(); + } + break; + case '(': + parens++; + // Limit to 32 nested parens for pathological cases + if (parens > 32) { + return false; + } + scanner.next(); + break; + case ')': + if (parens == 0) { + return true; + } else { + parens--; + } + scanner.next(); + break; + default: + // or control character + if (Character.isISOControl(c)) { + return !empty; + } + scanner.next(); + break; + } + empty = false; + } + return true; + } + + private static boolean isEscapable(char c) { + switch (c) { + case '!': + case '"': + case '#': + case '$': + case '%': + case '&': + case '\'': + case '(': + case ')': + case '*': + case '+': + case ',': + case '-': + case '.': + case '/': + case ':': + case ';': + case '<': + case '=': + case '>': + case '?': + case '@': + case '[': + case '\\': + case ']': + case '^': + case '_': + case '`': + case '{': + case '|': + case '}': + case '~': + return true; + } + return false; + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java index 43c888dfa..972fdef62 100644 --- a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java +++ b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java @@ -1,88 +1,10 @@ package org.commonmark.internal.util; public class Parsing { + public static int CODE_BLOCK_INDENT = 4; - private static final String[] TAB_SPACES = new String[]{" ", " ", " ", " "}; - - private static final String TAGNAME = "[A-Za-z][A-Za-z0-9-]*"; - private static final String ATTRIBUTENAME = "[a-zA-Z_:][a-zA-Z0-9:._-]*"; - private static final String UNQUOTEDVALUE = "[^\"'=<>`\\x00-\\x20]+"; - private static final String SINGLEQUOTEDVALUE = "'[^']*'"; - private static final String DOUBLEQUOTEDVALUE = 
"\"[^\"]*\""; - private static final String ATTRIBUTEVALUE = "(?:" + UNQUOTEDVALUE + "|" + SINGLEQUOTEDVALUE - + "|" + DOUBLEQUOTEDVALUE + ")"; - private static final String ATTRIBUTEVALUESPEC = "(?:" + "\\s*=" + "\\s*" + ATTRIBUTEVALUE - + ")"; - private static final String ATTRIBUTE = "(?:" + "\\s+" + ATTRIBUTENAME + ATTRIBUTEVALUESPEC - + "?)"; - - public static final String OPENTAG = "<" + TAGNAME + ATTRIBUTE + "*" + "\\s*/?>"; - public static final String CLOSETAG = "</" + TAGNAME + "\\s*[>]"; - - public static boolean isBlank(CharSequence s) { - return findNonSpace(s, 0) == -1; - } - - private static int findNonSpace(CharSequence s, int startIndex) { - for (int i = startIndex; i < s.length(); i++) { - switch (s.charAt(i)) { - case ' ': - case '\t': - case '\n': - case '\u000B': - case '\f': - case '\r': - break; - default: - return i; - } - } - return -1; - } - - public static int findLineBreak(CharSequence s, int startIndex) { - for (int i = startIndex; i < s.length(); i++) { - switch (s.charAt(i)) { - case '\n': - case '\r': - return i; - } - } - return -1; - } - - public static boolean isLetter(CharSequence s, int index) { - int codePoint = Character.codePointAt(s, index); - return Character.isLetter(codePoint); - } - - /** - * Prepares the input line replacing {@code \0} - */ - public static CharSequence prepareLine(CharSequence line) { - // Avoid building a new string in the majority of cases (no \0) - StringBuilder sb = null; - for (int i = 0; i < line.length(); i++) { - char c = line.charAt(i); - switch (line.charAt(i)) { - case '\0': - if (sb == null) { - sb = new StringBuilder(line.length()); - sb.append(line, 0, i); - } - sb.append('\uFFFD'); - break; - default: - if (sb != null) { - sb.append(c); - } - } - } - - if (sb != null) { - return sb.toString(); - } else { - return line; - } + public static int columnsToNextTabStop(int column) { + // Tab stop is 4 + return 4 - (column % 4); } } diff --git 
a/commonmark/src/main/java/org/commonmark/internal/util/Substring.java b/commonmark/src/main/java/org/commonmark/internal/util/Substring.java deleted file mode 100644 index 426c24e06..000000000 --- a/commonmark/src/main/java/org/commonmark/internal/util/Substring.java +++ /dev/null @@ -1,44 +0,0 @@ -package org.commonmark.internal.util; - -/** - * A CharSequence that avoids copying string data when getting a substring. - */ -public class Substring implements CharSequence { - - private final String base; - private final int beginIndex; - private final int endIndex; - - public static CharSequence of(String base, int beginIndex, int endIndex) { - return new Substring(base, beginIndex, endIndex); - } - - private Substring(String base, int beginIndex, int endIndex) { - this.base = base; - this.beginIndex = beginIndex; - this.endIndex = endIndex; - if (endIndex > base.length()) { - throw new IndexOutOfBoundsException("endIndex must not be greater than length"); - } - } - - @Override - public int length() { - return endIndex - beginIndex; - } - - @Override - public char charAt(int index) { - return base.charAt(index + beginIndex); - } - - @Override - public CharSequence subSequence(int start, int end) { - return new Substring(base, beginIndex + start, beginIndex + end); - } - - @Override - public String toString() { - return base.substring(beginIndex, endIndex); - } -} diff --git a/commonmark/src/main/java/org/commonmark/node/AbstractVisitor.java b/commonmark/src/main/java/org/commonmark/node/AbstractVisitor.java index c7ecbe150..7edd635d7 100644 --- a/commonmark/src/main/java/org/commonmark/node/AbstractVisitor.java +++ b/commonmark/src/main/java/org/commonmark/node/AbstractVisitor.java @@ -44,18 +44,18 @@ public void visit(HardLineBreak hardLineBreak) { } @Override - public void visit(Header header) { - visitChildren(header); + public void visit(Heading heading) { + visitChildren(heading); } @Override - public void visit(HorizontalRule horizontalRule) { - 
visitChildren(horizontalRule); + public void visit(ThematicBreak thematicBreak) { + visitChildren(thematicBreak); } @Override - public void visit(HtmlTag htmlTag) { - visitChildren(htmlTag); + public void visit(HtmlInline htmlInline) { + visitChildren(htmlInline); } @Override @@ -108,6 +108,11 @@ public void visit(Text text) { visitChildren(text); } + @Override + public void visit(LinkReferenceDefinition linkReferenceDefinition) { + visitChildren(linkReferenceDefinition); + } + @Override public void visit(CustomBlock customBlock) { visitChildren(customBlock); diff --git a/commonmark/src/main/java/org/commonmark/node/Block.java b/commonmark/src/main/java/org/commonmark/node/Block.java index e6a317d7c..332346b0e 100644 --- a/commonmark/src/main/java/org/commonmark/node/Block.java +++ b/commonmark/src/main/java/org/commonmark/node/Block.java @@ -1,7 +1,11 @@ package org.commonmark.node; +/** + * Block nodes such as paragraphs, list blocks, code blocks etc. + */ public abstract class Block extends Node { + @Override public Block getParent() { return (Block) super.getParent(); } diff --git a/commonmark/src/main/java/org/commonmark/node/BlockQuote.java b/commonmark/src/main/java/org/commonmark/node/BlockQuote.java index 160f25ae2..f68252398 100644 --- a/commonmark/src/main/java/org/commonmark/node/BlockQuote.java +++ b/commonmark/src/main/java/org/commonmark/node/BlockQuote.java @@ -1,5 +1,15 @@ package org.commonmark.node; +/** + * A block quote, e.g.: + * <pre> + * > Some quoted text + * </pre> + * <p> + * Note that child nodes are themselves blocks, e.g. {@link Paragraph}, {@link ListBlock} etc. 
+ * + * @see <a href="https://spec.commonmark.org/0.31.2/#block-quotes">CommonMark Spec</a> + */ public class BlockQuote extends Block { @Override diff --git a/commonmark/src/main/java/org/commonmark/node/BulletList.java b/commonmark/src/main/java/org/commonmark/node/BulletList.java index 127862312..014f4d3b2 100644 --- a/commonmark/src/main/java/org/commonmark/node/BulletList.java +++ b/commonmark/src/main/java/org/commonmark/node/BulletList.java @@ -1,20 +1,50 @@ package org.commonmark.node; +/** + * A bullet list, e.g.: + * <pre> + * - One + * - Two + * - Three + * </pre> + * <p> + * The children are {@link ListItem} blocks, which contain other blocks (or nested lists). + * + * @see <a href="https://spec.commonmark.org/0.31.2/#list-items">CommonMark Spec: List items</a> + */ public class BulletList extends ListBlock { - private char bulletMarker; + private String marker; @Override public void accept(Visitor visitor) { visitor.visit(this); } + /** + * @return the bullet list marker that was used, e.g. {@code -}, {@code *} or {@code +}, if available, or null otherwise + */ + public String getMarker() { + return marker; + } + + public void setMarker(String marker) { + this.marker = marker; + } + + /** + * @deprecated use {@link #getMarker()} instead + */ + @Deprecated public char getBulletMarker() { - return bulletMarker; + return marker != null && !marker.isEmpty() ? marker.charAt(0) : '\0'; } + /** + * @deprecated use {@link #getMarker()} instead + */ + @Deprecated public void setBulletMarker(char bulletMarker) { - this.bulletMarker = bulletMarker; + this.marker = bulletMarker != '\0' ? 
String.valueOf(bulletMarker) : null; } - } diff --git a/commonmark/src/main/java/org/commonmark/node/Code.java b/commonmark/src/main/java/org/commonmark/node/Code.java index 0b47ecb71..3b79e0c9c 100644 --- a/commonmark/src/main/java/org/commonmark/node/Code.java +++ b/commonmark/src/main/java/org/commonmark/node/Code.java @@ -1,5 +1,13 @@ package org.commonmark.node; +/** + * Inline code span, e.g.: + * <pre> + * Some `inline code` + * </pre> + * + * @see <a href="https://spec.commonmark.org/0.31.2/#code-spans">CommonMark Spec</a> + */ public class Code extends Node { private String literal; @@ -16,6 +24,10 @@ public void accept(Visitor visitor) { visitor.visit(this); } + /** + * @return the literal text in the code span (note that it's not necessarily the raw text between backticks, + * e.g. when spaces are stripped) + */ public String getLiteral() { return literal; } diff --git a/commonmark/src/main/java/org/commonmark/node/CustomBlock.java b/commonmark/src/main/java/org/commonmark/node/CustomBlock.java index 6596ec1a0..cad88933a 100644 --- a/commonmark/src/main/java/org/commonmark/node/CustomBlock.java +++ b/commonmark/src/main/java/org/commonmark/node/CustomBlock.java @@ -1,5 +1,8 @@ package org.commonmark.node; +/** + * A block that extensions can subclass to define custom blocks (not part of the core specification). + */ public abstract class CustomBlock extends Block { @Override diff --git a/commonmark/src/main/java/org/commonmark/node/CustomNode.java b/commonmark/src/main/java/org/commonmark/node/CustomNode.java index a68e5cc11..88f0254da 100644 --- a/commonmark/src/main/java/org/commonmark/node/CustomNode.java +++ b/commonmark/src/main/java/org/commonmark/node/CustomNode.java @@ -1,5 +1,8 @@ package org.commonmark.node; +/** + * A node that extensions can subclass to define custom nodes (not part of the core specification).
+ */ public abstract class CustomNode extends Node { @Override public void accept(Visitor visitor) { diff --git a/commonmark/src/main/java/org/commonmark/node/DefinitionMap.java b/commonmark/src/main/java/org/commonmark/node/DefinitionMap.java new file mode 100644 index 000000000..59cb88274 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/node/DefinitionMap.java @@ -0,0 +1,67 @@ +package org.commonmark.node; + +import org.commonmark.internal.util.Escaping; + +import java.util.Collection; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; + +/** + * A map that can be used to store and look up reference definitions by a label. The labels are case-insensitive and + * normalized, the same way as for {@link LinkReferenceDefinition} nodes. + * + * @param <D> the type of value + */ +public class DefinitionMap<D> { + + private final Class<D> type; + // LinkedHashMap for determinism and to preserve document order + private final Map<String, D> definitions = new LinkedHashMap<>(); + + public DefinitionMap(Class<D> type) { + this.type = type; + } + + public Class<D> getType() { + return type; + } + + public void addAll(DefinitionMap<D> that) { + for (var entry : that.definitions.entrySet()) { + // Note that keys are already normalized, so we can add them directly + definitions.putIfAbsent(entry.getKey(), entry.getValue()); + } + } + + /** + * Store a new definition unless one is already in the map. If there is no definition for that label yet, return null. + * Otherwise, return the existing definition. + * <p> + * The label is normalized by the definition map before storing. + */ + public D putIfAbsent(String label, D definition) { + String normalizedLabel = Escaping.normalizeLabelContent(label); + + // spec: When there are multiple matching link reference definitions, the first is used + return definitions.putIfAbsent(normalizedLabel, definition); + } + + /** + * Look up a definition by label. 
The label is normalized by the definition map before lookup. + * + * @return the value or null + */ + public D get(String label) { + String normalizedLabel = Escaping.normalizeLabelContent(label); + return definitions.get(normalizedLabel); + } + + public Set<String> keySet() { + return definitions.keySet(); + } + + public Collection<D> values() { + return definitions.values(); + } +} diff --git a/commonmark/src/main/java/org/commonmark/node/Delimited.java b/commonmark/src/main/java/org/commonmark/node/Delimited.java new file mode 100644 index 000000000..ef02c84ad --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/node/Delimited.java @@ -0,0 +1,17 @@ +package org.commonmark.node; + +/** + * A node that uses delimiters in the source form (e.g. <code>*bold*</code>). + */ +public interface Delimited { + + /** + * @return the opening (beginning) delimiter, e.g. <code>*</code> + */ + String getOpeningDelimiter(); + + /** + * @return the closing (ending) delimiter, e.g. <code>*</code> + */ + String getClosingDelimiter(); +} diff --git a/commonmark/src/main/java/org/commonmark/node/Document.java b/commonmark/src/main/java/org/commonmark/node/Document.java index 5b7e74189..b4968c206 100644 --- a/commonmark/src/main/java/org/commonmark/node/Document.java +++ b/commonmark/src/main/java/org/commonmark/node/Document.java @@ -1,5 +1,8 @@ package org.commonmark.node; +/** + * The root block of a document, containing the top-level blocks. 
+ */ public class Document extends Block { @Override diff --git a/commonmark/src/main/java/org/commonmark/node/Emphasis.java b/commonmark/src/main/java/org/commonmark/node/Emphasis.java index 3f7c86051..5efc8c327 100644 --- a/commonmark/src/main/java/org/commonmark/node/Emphasis.java +++ b/commonmark/src/main/java/org/commonmark/node/Emphasis.java @@ -1,6 +1,37 @@ package org.commonmark.node; -public class Emphasis extends Node { +/** + * Emphasis, e.g.: + * <pre> + * Some *emphasis* or _emphasis_ + * </pre> + * + * @see <a href="https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis">CommonMark Spec: Emphasis and strong emphasis</a> + */ +public class Emphasis extends Node implements Delimited { + + private String delimiter; + + public Emphasis() { + } + + public Emphasis(String delimiter) { + this.delimiter = delimiter; + } + + public void setDelimiter(String delimiter) { + this.delimiter = delimiter; + } + + @Override + public String getOpeningDelimiter() { + return delimiter; + } + + @Override + public String getClosingDelimiter() { + return delimiter; + } @Override public void accept(Visitor visitor) { diff --git a/commonmark/src/main/java/org/commonmark/node/FencedCodeBlock.java b/commonmark/src/main/java/org/commonmark/node/FencedCodeBlock.java index 7e2612331..0e279a470 100644 --- a/commonmark/src/main/java/org/commonmark/node/FencedCodeBlock.java +++ b/commonmark/src/main/java/org/commonmark/node/FencedCodeBlock.java @@ -1,9 +1,22 @@ package org.commonmark.node; +/** + * A fenced code block, e.g.: + * <pre> + * ``` + * foo + * bar + * ``` + * </pre> + * <p> + * + * @see <a href="https://spec.commonmark.org/0.31.2/#fenced-code-blocks">CommonMark Spec</a> + */ public class FencedCodeBlock extends Block { - private char fenceChar; - private int fenceLength; + private String fenceCharacter; + private Integer openingFenceLength; + private Integer closingFenceLength; private int fenceIndent; private String info; @@ -14,20 +27,47 @@ public void 
accept(Visitor visitor) { visitor.visit(this); } - public char getFenceChar() { - return fenceChar; + /** + * @return the fence character that was used, e.g. {@code `} or {@code ~}, if available, or null otherwise + */ + public String getFenceCharacter() { + return fenceCharacter; } - public void setFenceChar(char fenceChar) { - this.fenceChar = fenceChar; + public void setFenceCharacter(String fenceCharacter) { + this.fenceCharacter = fenceCharacter; } - public int getFenceLength() { - return fenceLength; + /** + * @return the length of the opening fence (how many of {@link #getFenceCharacter()} were used to start the code + * block) if available, or null otherwise + */ + public Integer getOpeningFenceLength() { + return openingFenceLength; } - public void setFenceLength(int fenceLength) { - this.fenceLength = fenceLength; + public void setOpeningFenceLength(Integer openingFenceLength) { + if (openingFenceLength != null && openingFenceLength < 3) { + throw new IllegalArgumentException("openingFenceLength needs to be >= 3"); + } + checkFenceLengths(openingFenceLength, closingFenceLength); + this.openingFenceLength = openingFenceLength; + } + + /** + * @return the length of the closing fence (how many of {@link #getFenceCharacter()} were used to end the code + * block) if available, or null otherwise + */ + public Integer getClosingFenceLength() { + return closingFenceLength; + } + + public void setClosingFenceLength(Integer closingFenceLength) { + if (closingFenceLength != null && closingFenceLength < 3) { + throw new IllegalArgumentException("closingFenceLength needs to be >= 3"); + } + checkFenceLengths(openingFenceLength, closingFenceLength); + this.closingFenceLength = closingFenceLength; } public int getFenceIndent() { @@ -39,7 +79,7 @@ public void setFenceIndent(int fenceIndent) { } /** - * @see <a href="http://spec.commonmark.org/0.18/#info-string">CommonMark spec</a> + * @see <a href="http://spec.commonmark.org/0.31.2/#info-string">CommonMark spec</a> */
public String getInfo() { return info; @@ -56,4 +96,44 @@ public String getLiteral() { public void setLiteral(String literal) { this.literal = literal; } + + /** + * @deprecated use {@link #getFenceCharacter()} instead + */ + @Deprecated + public char getFenceChar() { + return fenceCharacter != null && !fenceCharacter.isEmpty() ? fenceCharacter.charAt(0) : '\0'; + } + + /** + * @deprecated use {@link #setFenceCharacter} instead + */ + @Deprecated + public void setFenceChar(char fenceChar) { + this.fenceCharacter = fenceChar != '\0' ? String.valueOf(fenceChar) : null; + } + + /** + * @deprecated use {@link #getOpeningFenceLength} instead + */ + @Deprecated + public int getFenceLength() { + return openingFenceLength != null ? openingFenceLength : 0; + } + + /** + * @deprecated use {@link #setOpeningFenceLength} instead + */ + @Deprecated + public void setFenceLength(int fenceLength) { + this.openingFenceLength = fenceLength != 0 ? fenceLength : null; + } + + private static void checkFenceLengths(Integer openingFenceLength, Integer closingFenceLength) { + if (openingFenceLength != null && closingFenceLength != null) { + if (closingFenceLength < openingFenceLength) { + throw new IllegalArgumentException("fence lengths required to be: closingFenceLength >= openingFenceLength"); + } + } + } } diff --git a/commonmark/src/main/java/org/commonmark/node/HardLineBreak.java b/commonmark/src/main/java/org/commonmark/node/HardLineBreak.java index 0640fc3c4..28874ec01 100644 --- a/commonmark/src/main/java/org/commonmark/node/HardLineBreak.java +++ b/commonmark/src/main/java/org/commonmark/node/HardLineBreak.java @@ -1,5 +1,15 @@ package org.commonmark.node; +/** + * A hard line break, e.g.: + * <pre> + * line\ + * break + * </pre> + * <p> + * + * @see <a href="https://spec.commonmark.org/0.31.2/#hard-line-breaks">CommonMark Spec</a> + */ public class HardLineBreak extends Node { @Override diff --git a/commonmark/src/main/java/org/commonmark/node/Header.java 
b/commonmark/src/main/java/org/commonmark/node/Header.java deleted file mode 100644 index 0402e15cb..000000000 --- a/commonmark/src/main/java/org/commonmark/node/Header.java +++ /dev/null @@ -1,19 +0,0 @@ -package org.commonmark.node; - -public class Header extends Block { - - private int level; - - @Override - public void accept(Visitor visitor) { - visitor.visit(this); - } - - public int getLevel() { - return level; - } - - public void setLevel(int level) { - this.level = level; - } -} diff --git a/commonmark/src/main/java/org/commonmark/node/Heading.java b/commonmark/src/main/java/org/commonmark/node/Heading.java new file mode 100644 index 000000000..5369d8739 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/node/Heading.java @@ -0,0 +1,31 @@ +package org.commonmark.node; + +/** + * A heading, e.g.: + * <pre> + * First heading + * ============= + * + * ## Another heading + * </pre> + * + * @see <a href="https://spec.commonmark.org/0.31.2/#atx-headings">CommonMark Spec: ATX headings</a> + * @see <a href="https://spec.commonmark.org/0.31.2/#setext-headings">CommonMark Spec: Setext headings</a> + */ +public class Heading extends Block { + + private int level; + + @Override + public void accept(Visitor visitor) { + visitor.visit(this); + } + + public int getLevel() { + return level; + } + + public void setLevel(int level) { + this.level = level; + } +} diff --git a/commonmark/src/main/java/org/commonmark/node/HorizontalRule.java b/commonmark/src/main/java/org/commonmark/node/HorizontalRule.java deleted file mode 100644 index 6d2f2ec21..000000000 --- a/commonmark/src/main/java/org/commonmark/node/HorizontalRule.java +++ /dev/null @@ -1,9 +0,0 @@ -package org.commonmark.node; - -public class HorizontalRule extends Block { - - @Override - public void accept(Visitor visitor) { - visitor.visit(this); - } -} diff --git a/commonmark/src/main/java/org/commonmark/node/HtmlBlock.java b/commonmark/src/main/java/org/commonmark/node/HtmlBlock.java index 
ad46c56ce..fbe00927d 100644 --- a/commonmark/src/main/java/org/commonmark/node/HtmlBlock.java +++ b/commonmark/src/main/java/org/commonmark/node/HtmlBlock.java @@ -3,7 +3,7 @@ /** * HTML block * - * @see <a href="http://spec.commonmark.org/0.18/#html-blocks">CommonMark Spec</a> + * @see <a href="http://spec.commonmark.org/0.31.2/#html-blocks">CommonMark Spec</a> */ public class HtmlBlock extends Block { diff --git a/commonmark/src/main/java/org/commonmark/node/HtmlTag.java b/commonmark/src/main/java/org/commonmark/node/HtmlInline.java similarity index 73% rename from commonmark/src/main/java/org/commonmark/node/HtmlTag.java rename to commonmark/src/main/java/org/commonmark/node/HtmlInline.java index 4271e743c..35360c639 100644 --- a/commonmark/src/main/java/org/commonmark/node/HtmlTag.java +++ b/commonmark/src/main/java/org/commonmark/node/HtmlInline.java @@ -3,9 +3,9 @@ /** * Inline HTML element. * - * @see <a href="http://spec.commonmark.org/0.18/#raw-html">CommonMark Spec</a> + * @see <a href="http://spec.commonmark.org/0.31.2/#raw-html">CommonMark Spec</a> */ -public class HtmlTag extends Node { +public class HtmlInline extends Node { private String literal; diff --git a/commonmark/src/main/java/org/commonmark/node/Image.java b/commonmark/src/main/java/org/commonmark/node/Image.java index 63481773a..1b31f6020 100644 --- a/commonmark/src/main/java/org/commonmark/node/Image.java +++ b/commonmark/src/main/java/org/commonmark/node/Image.java @@ -1,5 +1,13 @@ package org.commonmark.node; +/** + * An image, e.g.: + * <pre> + * ![foo](/url "title") + * </pre> + * + * @see <a href="https://spec.commonmark.org/0.31.2/#images">CommonMark Spec</a> + */ public class Image extends Node { private String destination; diff --git a/commonmark/src/main/java/org/commonmark/node/IndentedCodeBlock.java b/commonmark/src/main/java/org/commonmark/node/IndentedCodeBlock.java index ccafca943..97642b7f3 100644 --- a/commonmark/src/main/java/org/commonmark/node/IndentedCodeBlock.java +++ 
b/commonmark/src/main/java/org/commonmark/node/IndentedCodeBlock.java @@ -1,5 +1,17 @@ package org.commonmark.node; +/** + * An indented code block, e.g.: + * <pre><code> + * Code follows: + * + * foo + * bar + * </code></pre> + * <p> + * + * @see <a href="https://spec.commonmark.org/0.31.2/#indented-code-blocks">CommonMark Spec</a> + */ public class IndentedCodeBlock extends Block { private String literal; diff --git a/commonmark/src/main/java/org/commonmark/node/Link.java b/commonmark/src/main/java/org/commonmark/node/Link.java index b4794253e..4edc7f676 100644 --- a/commonmark/src/main/java/org/commonmark/node/Link.java +++ b/commonmark/src/main/java/org/commonmark/node/Link.java @@ -1,5 +1,25 @@ package org.commonmark.node; +/** + * A link with a destination and an optional title; the link text is in child nodes. + * <p> + * Example for an inline link in a CommonMark document: + * <pre><code> + * [link](/uri "title") + * </code></pre> + * <p> + * The corresponding Link node would look like this: + * <ul> + * <li>{@link #getDestination()} returns {@code "/uri"} + * <li>{@link #getTitle()} returns {@code "title"} + * <li>A {@link Text} child node with {@link Text#getLiteral() getLiteral} that returns {@code "link"}</li> + * </ul> + * <p> + * Note that the text in the link can contain inline formatting, so it could also contain an {@link Image} or + * {@link Emphasis}, etc. 
+ * + * @see <a href="http://spec.commonmark.org/0.31.2/#links">CommonMark Spec</a> + */ public class Link extends Node { private String destination; @@ -26,6 +46,9 @@ public void setDestination(String destination) { this.destination = destination; } + /** + * @return the title or null + */ public String getTitle() { return title; } diff --git a/commonmark/src/main/java/org/commonmark/node/LinkReferenceDefinition.java b/commonmark/src/main/java/org/commonmark/node/LinkReferenceDefinition.java new file mode 100644 index 000000000..b866781f0 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/node/LinkReferenceDefinition.java @@ -0,0 +1,57 @@ +package org.commonmark.node; + +/** + * A link reference definition, e.g.: + * <pre><code> + * [foo]: /url "title" + * </code></pre> + * <p> + * They can be referenced anywhere else in the document to produce a link using <code>[foo]</code>. The definitions + * themselves are usually not rendered in the final output. + * + * @see <a href="https://spec.commonmark.org/0.31.2/#link-reference-definition">CommonMark Spec</a> + */ +public class LinkReferenceDefinition extends Block { + + private String label; + private String destination; + private String title; + + public LinkReferenceDefinition() { + } + + public LinkReferenceDefinition(String label, String destination, String title) { + this.label = label; + this.destination = destination; + this.title = title; + } + + public String getLabel() { + return label; + } + + public void setLabel(String label) { + this.label = label; + } + + public String getDestination() { + return destination; + } + + public void setDestination(String destination) { + this.destination = destination; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + @Override + public void accept(Visitor visitor) { + visitor.visit(this); + } +} diff --git a/commonmark/src/main/java/org/commonmark/node/ListBlock.java 
/**
 * A child of a {@link ListBlock}, containing other blocks (e.g. {@link Paragraph}, other lists, etc).
 * <p>
 * Note that a list item can't directly contain {@link Text}, it needs to be:
 * {@link ListItem} : {@link Paragraph} : {@link Text}.
 * If you want a list that is rendered tightly, create a list with {@link ListBlock#setTight(boolean)}.
 *
 * @see <a href="https://spec.commonmark.org/0.31.2/#list-items">CommonMark Spec: List items</a>
 */
public class ListItem extends Block {

    // Both indents are boxed so that "not available" can be represented as null.
    private Integer markerIndent;
    private Integer contentIndent;

    @Override
    public void accept(Visitor visitor) {
        visitor.visit(this);
    }

    /**
     * Returns the indent of the marker such as "-" or "1." in columns (spaces or tab stop of 4) if available, or null
     * otherwise.
     * <p>
     * Some examples and their marker indent:
     * <pre>- Foo</pre>
     * Marker indent: 0
     * <pre> - Foo</pre>
     * Marker indent: 1
     * <pre>  1. Foo</pre>
     * Marker indent: 2
     */
    public Integer getMarkerIndent() {
        return markerIndent;
    }

    /**
     * @param markerIndent the marker indent in columns, or null if not available; stored as-is without validation
     */
    public void setMarkerIndent(Integer markerIndent) {
        this.markerIndent = markerIndent;
    }

    /**
     * Returns the indent of the content in columns (spaces or tab stop of 4) if available, or null otherwise.
     * The content indent is counted from the beginning of the line and includes the marker on the first line.
     * <p>
     * Some examples and their content indent:
     * <pre>- Foo</pre>
     * Content indent: 2
     * <pre> - Foo</pre>
     * Content indent: 3
     * <pre>  1. Foo</pre>
     * Content indent: 5
     * <p>
     * Note that subsequent lines in the same list item need to be indented by at least the content indent to be counted
     * as part of the list item.
     */
    public Integer getContentIndent() {
        return contentIndent;
    }

    /**
     * @param contentIndent the content indent in columns, or null if not available; stored as-is without validation
     */
    public void setContentIndent(Integer contentIndent) {
        this.contentIndent = contentIndent;
    }

    /**
     * @deprecated list items should only contain block nodes; if you're trying to create a list that is rendered
     * without paragraphs, use {@link ListBlock#setTight(boolean)} instead.
     */
    @Override
    @Deprecated
    public void appendChild(Node child) {
        super.appendChild(child);
    }

    /**
     * Appends a block as a child of this list item.
     * <p>
     * This more specific overload is selected when the argument is statically typed as a {@link Block}, so such
     * callers do not hit the deprecated {@link #appendChild(Node)} variant.
     */
    public void appendChild(Block child) {
        super.appendChild(child);
    }
}
+ */ public abstract class Node { private Node parent = null; @@ -7,6 +16,7 @@ public abstract class Node { private Node lastChild = null; private Node prev = null; private Node next = null; + private List<SourceSpan> sourceSpans = null; public abstract void accept(Visitor visitor); @@ -76,6 +86,9 @@ public void unlink() { this.prev = null; } + /** + * Inserts the {@code sibling} node after {@code this} node. + */ public void insertAfter(Node sibling) { sibling.unlink(); sibling.next = this.next; @@ -90,6 +103,9 @@ public void insertAfter(Node sibling) { } } + /** + * Inserts the {@code sibling} node before {@code this} node. + */ public void insertBefore(Node sibling) { sibling.unlink(); sibling.prev = this.prev; @@ -104,6 +120,41 @@ public void insertBefore(Node sibling) { } } + /** + * @return the source spans of this node if included by the parser, an empty list otherwise + * @since 0.16.0 + */ + public List<SourceSpan> getSourceSpans() { + return sourceSpans != null ? Collections.unmodifiableList(sourceSpans) : List.of(); + } + + /** + * Replace the current source spans with the provided list. + * + * @param sourceSpans the new source spans to set + * @since 0.16.0 + */ + public void setSourceSpans(List<SourceSpan> sourceSpans) { + if (sourceSpans.isEmpty()) { + this.sourceSpans = null; + } else { + this.sourceSpans = new ArrayList<>(sourceSpans); + } + } + + /** + * Add a source span to the end of the list. 
+ * + * @param sourceSpan the source span to add + * @since 0.16.0 + */ + public void addSourceSpan(SourceSpan sourceSpan) { + if (sourceSpans == null) { + this.sourceSpans = new ArrayList<>(); + } + this.sourceSpans.add(sourceSpan); + } + @Override public String toString() { return getClass().getSimpleName() + "{" + toStringAttributes() + "}"; diff --git a/commonmark/src/main/java/org/commonmark/node/Nodes.java b/commonmark/src/main/java/org/commonmark/node/Nodes.java new file mode 100644 index 000000000..22d5932af --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/node/Nodes.java @@ -0,0 +1,66 @@ +package org.commonmark.node; + +import java.util.Iterator; + +/** + * Utility class for working with multiple {@link Node}s. + * + * @since 0.16.0 + */ +public class Nodes { + + private Nodes() { + } + + /** + * The nodes between (not including) start and end. + */ + public static Iterable<Node> between(Node start, Node end) { + return new NodeIterable(start.getNext(), end); + } + + private static class NodeIterable implements Iterable<Node> { + + private final Node first; + private final Node end; + + private NodeIterable(Node first, Node end) { + this.first = first; + this.end = end; + } + + @Override + public Iterator<Node> iterator() { + return new NodeIterator(first, end); + } + } + + private static class NodeIterator implements Iterator<Node> { + + private Node node; + private final Node end; + + private NodeIterator(Node first, Node end) { + node = first; + this.end = end; + } + + @Override + public boolean hasNext() { + return node != null && node != end; + } + + @Override + public Node next() { + Node result = node; + node = node.getNext(); + return result; + } + + @Override + public void remove() { + throw new UnsupportedOperationException("remove"); + } + } +} + diff --git a/commonmark/src/main/java/org/commonmark/node/OrderedList.java b/commonmark/src/main/java/org/commonmark/node/OrderedList.java index 1f988234c..61f8902c0 100644 --- 
a/commonmark/src/main/java/org/commonmark/node/OrderedList.java +++ b/commonmark/src/main/java/org/commonmark/node/OrderedList.java @@ -1,29 +1,78 @@ package org.commonmark.node; +/** + * An ordered list, e.g.: + * <pre><code> + * 1. One + * 2. Two + * 3. Three + * </code></pre> + * <p> + * The children are {@link ListItem} blocks, which contain other blocks (or nested lists). + * + * @see <a href="https://spec.commonmark.org/0.31.2/#list-items">CommonMark Spec: List items</a> + */ public class OrderedList extends ListBlock { - private int startNumber; - private char delimiter; + private String markerDelimiter; + private Integer markerStartNumber; @Override public void accept(Visitor visitor) { visitor.visit(this); } + /** + * @return the start number used in the marker, e.g. {@code 1}, if available, or null otherwise + */ + public Integer getMarkerStartNumber() { + return markerStartNumber; + } + + public void setMarkerStartNumber(Integer markerStartNumber) { + this.markerStartNumber = markerStartNumber; + } + + /** + * @return the delimiter used in the marker, e.g. {@code .} or {@code )}, if available, or null otherwise + */ + public String getMarkerDelimiter() { + return markerDelimiter; + } + + public void setMarkerDelimiter(String markerDelimiter) { + this.markerDelimiter = markerDelimiter; + } + + /** + * @deprecated use {@link #getMarkerStartNumber()} instead + */ + @Deprecated public int getStartNumber() { - return startNumber; + return markerStartNumber != null ? markerStartNumber : 0; } + /** + * @deprecated use {@link #setMarkerStartNumber} instead + */ + @Deprecated public void setStartNumber(int startNumber) { - this.startNumber = startNumber; + this.markerStartNumber = startNumber != 0 ? startNumber : null; } + /** + * @deprecated use {@link #getMarkerDelimiter()} instead + */ + @Deprecated public char getDelimiter() { - return delimiter; + return markerDelimiter != null && !markerDelimiter.isEmpty() ? 
markerDelimiter.charAt(0) : '\0'; } + /** + * @deprecated use {@link #setMarkerDelimiter} instead + */ + @Deprecated public void setDelimiter(char delimiter) { - this.delimiter = delimiter; + this.markerDelimiter = delimiter != '\0' ? String.valueOf(delimiter) : null; } - } diff --git a/commonmark/src/main/java/org/commonmark/node/Paragraph.java b/commonmark/src/main/java/org/commonmark/node/Paragraph.java index 0c3f88f39..b298f1ce4 100644 --- a/commonmark/src/main/java/org/commonmark/node/Paragraph.java +++ b/commonmark/src/main/java/org/commonmark/node/Paragraph.java @@ -1,5 +1,10 @@ package org.commonmark.node; +/** + * A paragraph block, contains inline nodes such as {@link Text} + * + * @see <a href="https://spec.commonmark.org/0.31.2/#paragraphs">CommonMark Spec</a> + */ public class Paragraph extends Block { @Override diff --git a/commonmark/src/main/java/org/commonmark/node/SoftLineBreak.java b/commonmark/src/main/java/org/commonmark/node/SoftLineBreak.java index e66458912..87445db56 100644 --- a/commonmark/src/main/java/org/commonmark/node/SoftLineBreak.java +++ b/commonmark/src/main/java/org/commonmark/node/SoftLineBreak.java @@ -1,5 +1,14 @@ package org.commonmark.node; +/** + * A soft line break (as opposed to a {@link HardLineBreak}), e.g. between: + * <pre> + * foo + * bar + * </pre> + * + * @see <a href="https://spec.commonmark.org/0.31.2/#soft-line-breaks">CommonMark Spec</a> + */ public class SoftLineBreak extends Node { @Override diff --git a/commonmark/src/main/java/org/commonmark/node/SourceSpan.java b/commonmark/src/main/java/org/commonmark/node/SourceSpan.java new file mode 100644 index 000000000..6558cc84a --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/node/SourceSpan.java @@ -0,0 +1,150 @@ +package org.commonmark.node; + +import java.util.Objects; + +/** + * A source span references a snippet of text from the source input. 
/**
 * A source span references a snippet of text from the source input.
 * <p>
 * It has a starting position (line and column index) and a length of how many characters it spans.
 * <p>
 * For example, this CommonMark source text:
 * <pre><code>
 * &gt; foo
 * </code></pre>
 * The {@link BlockQuote} node would have this source span: line 0, column 0, length 5.
 * <p>
 * The {@link Paragraph} node inside it would have: line 0, column 2, length 3.
 * <p>
 * If a block has multiple lines, it will have a source span for each line.
 * <p>
 * Note that the column index and length are measured in Java characters (UTF-16 code units). If you're outputting them
 * to be consumed by another programming language, e.g. one that uses UTF-8 strings, you will need to translate them,
 * otherwise characters such as emojis will result in incorrect positions.
 *
 * @since 0.16.0
 */
public class SourceSpan {

    private final int lineIndex;
    private final int columnIndex;
    private final int inputIndex;
    private final int length;

    /**
     * Creates a source span.
     *
     * @param line   0-based line index
     * @param col    0-based column index within the line
     * @param input  0-based index in the whole input
     * @param length length of the span in characters
     * @throws IllegalArgumentException if any argument is negative
     */
    public static SourceSpan of(int line, int col, int input, int length) {
        return new SourceSpan(line, col, input, length);
    }

    /**
     * @deprecated Use {@link #of(int, int, int, int)} instead to also specify input index. Using the deprecated one
     * will set {@link #inputIndex} to 0.
     */
    @Deprecated
    public static SourceSpan of(int lineIndex, int columnIndex, int length) {
        return of(lineIndex, columnIndex, 0, length);
    }

    private SourceSpan(int lineIndex, int columnIndex, int inputIndex, int length) {
        if (lineIndex < 0) {
            throw new IllegalArgumentException("lineIndex " + lineIndex + " must be >= 0");
        }
        if (columnIndex < 0) {
            throw new IllegalArgumentException("columnIndex " + columnIndex + " must be >= 0");
        }
        if (inputIndex < 0) {
            throw new IllegalArgumentException("inputIndex " + inputIndex + " must be >= 0");
        }
        if (length < 0) {
            throw new IllegalArgumentException("length " + length + " must be >= 0");
        }
        this.lineIndex = lineIndex;
        this.columnIndex = columnIndex;
        this.inputIndex = inputIndex;
        this.length = length;
    }

    /**
     * @return 0-based line index, e.g. 0 for first line, 1 for the second line, etc
     */
    public int getLineIndex() {
        return lineIndex;
    }

    /**
     * @return 0-based index of column (character on line) in source, e.g. 0 for the first character of a line, 1 for
     * the second character, etc
     */
    public int getColumnIndex() {
        return columnIndex;
    }

    /**
     * @return 0-based index in whole input
     * @since 0.24.0
     */
    public int getInputIndex() {
        return inputIndex;
    }

    /**
     * @return length of the span in characters
     */
    public int getLength() {
        return length;
    }

    /**
     * Returns the part of this span from {@code beginIndex} (inclusive) to its end.
     *
     * @param beginIndex offset relative to the start of this span, between 0 and {@link #getLength()}
     * @return the sub span, or {@code this} if {@code beginIndex} is 0
     * @throws IndexOutOfBoundsException if {@code beginIndex} is out of range
     */
    public SourceSpan subSpan(int beginIndex) {
        return subSpan(beginIndex, length);
    }

    /**
     * Returns the part of this span between {@code beginIndex} (inclusive) and {@code endIndex} (exclusive). Both
     * indexes are relative to the start of this span; the line index is kept, column and input index are shifted by
     * {@code beginIndex}.
     *
     * @param beginIndex start offset, between 0 and {@link #getLength()}
     * @param endIndex   end offset, between {@code beginIndex} and {@link #getLength()}
     * @return the sub span, or {@code this} if the range covers the whole span
     * @throws IndexOutOfBoundsException if an index is out of range or {@code beginIndex > endIndex}
     */
    public SourceSpan subSpan(int beginIndex, int endIndex) {
        if (beginIndex < 0) {
            // Message previously contained a stray " + " inside the literal ("beginIndex -1 + must be >= 0").
            throw new IndexOutOfBoundsException("beginIndex " + beginIndex + " must be >= 0");
        }
        if (beginIndex > length) {
            throw new IndexOutOfBoundsException("beginIndex " + beginIndex + " must be <= length " + length);
        }
        if (endIndex < 0) {
            // Same stray " + " fix as for beginIndex above.
            throw new IndexOutOfBoundsException("endIndex " + endIndex + " must be >= 0");
        }
        if (endIndex > length) {
            throw new IndexOutOfBoundsException("endIndex " + endIndex + " must be <= length " + length);
        }
        if (beginIndex > endIndex) {
            throw new IndexOutOfBoundsException("beginIndex " + beginIndex + " must be <= endIndex " + endIndex);
        }
        if (beginIndex == 0 && endIndex == length) {
            // Full range requested: spans are immutable, so the same instance can be returned.
            return this;
        }
        return new SourceSpan(lineIndex, columnIndex + beginIndex, inputIndex + beginIndex, endIndex - beginIndex);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        SourceSpan that = (SourceSpan) o;
        return lineIndex == that.lineIndex &&
                columnIndex == that.columnIndex &&
                inputIndex == that.inputIndex &&
                length == that.length;
    }

    @Override
    public int hashCode() {
        return Objects.hash(lineIndex, columnIndex, inputIndex, length);
    }

    @Override
    public String toString() {
        return "SourceSpan{" +
                "line=" + lineIndex +
                ", column=" + columnIndex +
                ", input=" + inputIndex +
                ", length=" + length +
                "}";
    }
}
a/commonmark/src/main/java/org/commonmark/node/SourceSpans.java b/commonmark/src/main/java/org/commonmark/node/SourceSpans.java new file mode 100644 index 000000000..975d7fbdb --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/node/SourceSpans.java @@ -0,0 +1,52 @@ +package org.commonmark.node; + +import java.util.ArrayList; +import java.util.List; + +/** + * A list of source spans that can be added to. Takes care of merging adjacent source spans. + * + * @since 0.16.0 + */ +public class SourceSpans { + + private List<SourceSpan> sourceSpans; + + public static SourceSpans empty() { + return new SourceSpans(); + } + + public List<SourceSpan> getSourceSpans() { + return sourceSpans != null ? sourceSpans : List.of(); + } + + public void addAllFrom(Iterable<? extends Node> nodes) { + for (Node node : nodes) { + addAll(node.getSourceSpans()); + } + } + + public void addAll(List<SourceSpan> other) { + if (other.isEmpty()) { + return; + } + + if (sourceSpans == null) { + sourceSpans = new ArrayList<>(); + } + + if (sourceSpans.isEmpty()) { + sourceSpans.addAll(other); + } else { + int lastIndex = sourceSpans.size() - 1; + SourceSpan a = sourceSpans.get(lastIndex); + SourceSpan b = other.get(0); + if (a.getInputIndex() + a.getLength() == b.getInputIndex()) { + sourceSpans.set(lastIndex, SourceSpan.of(a.getLineIndex(), a.getColumnIndex(), a.getInputIndex(), a.getLength() + b.getLength())); + sourceSpans.addAll(other.subList(1, other.size())); + } else { + sourceSpans.addAll(other); + } + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/node/StrongEmphasis.java b/commonmark/src/main/java/org/commonmark/node/StrongEmphasis.java index 26ab4fbf5..0dbeed3df 100644 --- a/commonmark/src/main/java/org/commonmark/node/StrongEmphasis.java +++ b/commonmark/src/main/java/org/commonmark/node/StrongEmphasis.java @@ -1,6 +1,37 @@ package org.commonmark.node; -public class StrongEmphasis extends Node { +/** + * Strong emphasis, e.g.: + * <pre><code> + * Some **strong 
emphasis** or __strong emphasis__ + * </code></pre> + * + * @see <a href="https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis">CommonMark Spec: Emphasis and strong emphasis</a> + */ +public class StrongEmphasis extends Node implements Delimited { + + private String delimiter; + + public StrongEmphasis() { + } + + public StrongEmphasis(String delimiter) { + this.delimiter = delimiter; + } + + public void setDelimiter(String delimiter) { + this.delimiter = delimiter; + } + + @Override + public String getOpeningDelimiter() { + return delimiter; + } + + @Override + public String getClosingDelimiter() { + return delimiter; + } @Override public void accept(Visitor visitor) { diff --git a/commonmark/src/main/java/org/commonmark/node/Text.java b/commonmark/src/main/java/org/commonmark/node/Text.java index f16fc907b..9a04c41c1 100644 --- a/commonmark/src/main/java/org/commonmark/node/Text.java +++ b/commonmark/src/main/java/org/commonmark/node/Text.java @@ -1,5 +1,15 @@ package org.commonmark.node; +/** + * A text node, e.g. in: + * <pre> + * foo *bar* + * </pre> + * <p> + * The <code>foo </code> is a text node, and the <code>bar</code> inside the emphasis is also a text node. + * + * @see <a href="https://spec.commonmark.org/0.31.2/#textual-content">CommonMark Spec</a> + */ public class Text extends Node { private String literal; diff --git a/commonmark/src/main/java/org/commonmark/node/ThematicBreak.java b/commonmark/src/main/java/org/commonmark/node/ThematicBreak.java new file mode 100644 index 000000000..a31131e07 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/node/ThematicBreak.java @@ -0,0 +1,34 @@ +package org.commonmark.node; + +/** + * A thematic break, e.g. between text: + * <pre> + * Some text + * + * ___ + * + * Some other text. 
+ * </pre> + * + * @see <a href="https://spec.commonmark.org/0.31.2/#thematic-breaks">CommonMark Spec</a> + */ +public class ThematicBreak extends Block { + + private String literal; + + @Override + public void accept(Visitor visitor) { + visitor.visit(this); + } + + /** + * @return the source literal that represents this node, if available + */ + public String getLiteral() { + return literal; + } + + public void setLiteral(String literal) { + this.literal = literal; + } +} diff --git a/commonmark/src/main/java/org/commonmark/node/Visitor.java b/commonmark/src/main/java/org/commonmark/node/Visitor.java index 3f0b4fb6c..a155296f0 100644 --- a/commonmark/src/main/java/org/commonmark/node/Visitor.java +++ b/commonmark/src/main/java/org/commonmark/node/Visitor.java @@ -3,7 +3,7 @@ /** * Node visitor. * <p> - * See {@link AbstractVisitor} for a base class that can be extended. + * Implementations should subclass {@link AbstractVisitor} instead of implementing this directly. */ public interface Visitor { @@ -21,11 +21,11 @@ public interface Visitor { void visit(HardLineBreak hardLineBreak); - void visit(Header header); + void visit(Heading heading); - void visit(HorizontalRule horizontalRule); + void visit(ThematicBreak thematicBreak); - void visit(HtmlTag htmlTag); + void visit(HtmlInline htmlInline); void visit(HtmlBlock htmlBlock); @@ -47,6 +47,8 @@ public interface Visitor { void visit(Text text); + void visit(LinkReferenceDefinition linkReferenceDefinition); + void visit(CustomBlock customBlock); void visit(CustomNode customNode); diff --git a/commonmark/src/main/java/org/commonmark/node/package-info.java b/commonmark/src/main/java/org/commonmark/node/package-info.java new file mode 100644 index 000000000..e9fee1aba --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/node/package-info.java @@ -0,0 +1,4 @@ +/** + * AST node types (see {@link org.commonmark.node.Node}) and visitors (see {@link org.commonmark.node.AbstractVisitor}) + */ +package 
org.commonmark.node; diff --git a/commonmark/src/main/java/org/commonmark/package-info.java b/commonmark/src/main/java/org/commonmark/package-info.java new file mode 100644 index 000000000..b683017f6 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/package-info.java @@ -0,0 +1,10 @@ +/** + * Root package of commonmark-java + * <ul> + * <li>{@link org.commonmark.parser} for parsing input text to AST nodes</li> + * <li>{@link org.commonmark.node} for AST node types and visitors</li> + * <li>{@link org.commonmark.renderer.html} for HTML rendering</li> + * <li>{@link org.commonmark.renderer.markdown} for Markdown rendering</li> + * </ul> + */ +package org.commonmark; diff --git a/commonmark/src/main/java/org/commonmark/parser/DelimiterProcessor.java b/commonmark/src/main/java/org/commonmark/parser/DelimiterProcessor.java deleted file mode 100644 index 7e9e84df9..000000000 --- a/commonmark/src/main/java/org/commonmark/parser/DelimiterProcessor.java +++ /dev/null @@ -1,44 +0,0 @@ -package org.commonmark.parser; - -import org.commonmark.node.Text; - -/** - * Custom delimiter processor for additional delimiters besides {@code _} and {@code *}. - * <p> - * Note that implementations of this need to be thread-safe, the same instance may be used by multiple parsers. - */ -public interface DelimiterProcessor { - - /** - * @return the character that activates this, must not clash with any built-in special characters - */ - char getDelimiterChar(); - - /** - * Minimum number of delimiter characters that are needed to activate this. Must be at least 1. - */ - int getMinDelimiterCount(); - - /** - * Determine how many of the delimiters should be used. Useful in case the same character with a different count - * should have a different meaning (e.g. with "*" for emphasis and "**" for strong emphasis). 
- * - * @param openerCount the delimiter count of the opening delimiter, at least 1 - * @param closerCount the delimiter count of the closing delimiter, at least 1 - * @return how many delimiters should be used; cannot be 0; must not be greater than either openerCount or closerCount - */ - int getDelimiterUse(int openerCount, int closerCount); - - /** - * Process the matched delimiters, e.g. by wrapping the nodes between opener and closer in a new node, or appending - * a new node after the opener. - * <p> - * Note that removal of the delimiter from the delimiter nodes and unlinking them is done by the caller. - * - * @param opener the text node that contained the opening delimiter - * @param closer the text node that contained the closing delimiter - * @param delimiterUse the number of delimiters that were used - */ - void process(Text opener, Text closer, int delimiterUse); - -} diff --git a/commonmark/src/main/java/org/commonmark/parser/IncludeSourceSpans.java b/commonmark/src/main/java/org/commonmark/parser/IncludeSourceSpans.java new file mode 100644 index 000000000..91d2b4e00 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/IncludeSourceSpans.java @@ -0,0 +1,22 @@ +package org.commonmark.parser; + +/** + * Whether to include {@link org.commonmark.node.SourceSpan} or not while parsing, + * see {@link Parser.Builder#includeSourceSpans(IncludeSourceSpans)}. + * + * @since 0.16.0 + */ +public enum IncludeSourceSpans { + /** + * Do not include source spans. + */ + NONE, + /** + * Include source spans on {@link org.commonmark.node.Block} nodes. + */ + BLOCKS, + /** + * Include source spans on block nodes and inline nodes. 
+ */ + BLOCKS_AND_INLINES, +} diff --git a/commonmark/src/main/java/org/commonmark/parser/InlineParser.java b/commonmark/src/main/java/org/commonmark/parser/InlineParser.java index 828ea7946..49043a64f 100644 --- a/commonmark/src/main/java/org/commonmark/parser/InlineParser.java +++ b/commonmark/src/main/java/org/commonmark/parser/InlineParser.java @@ -4,14 +4,12 @@ /** * Parser for inline content (text, links, emphasized text, etc). - * <p><em>This interface is not intended to be implemented by clients.</em></p> */ public interface InlineParser { /** - * @param input the content to parse as inline + * @param lines the source content to parse as inline * @param node the node to append resulting nodes to (as children) */ - void parse(String input, Node node); - + void parse(SourceLines lines, Node node); } diff --git a/commonmark/src/main/java/org/commonmark/parser/InlineParserContext.java b/commonmark/src/main/java/org/commonmark/parser/InlineParserContext.java new file mode 100644 index 000000000..12007610b --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/InlineParserContext.java @@ -0,0 +1,60 @@ +package org.commonmark.parser; + +import org.commonmark.node.LinkReferenceDefinition; +import org.commonmark.parser.beta.LinkProcessor; +import org.commonmark.parser.beta.InlineContentParserFactory; +import org.commonmark.parser.delimiter.DelimiterProcessor; + +import java.util.List; +import java.util.Set; + +/** + * Context for inline parsing. 
+ */ +public interface InlineParserContext { + + /** + * @return custom inline content parsers that have been configured with + * {@link Parser.Builder#customInlineContentParserFactory(InlineContentParserFactory)} + */ + List<InlineContentParserFactory> getCustomInlineContentParserFactories(); + + /** + * @return custom delimiter processors that have been configured with + * {@link Parser.Builder#customDelimiterProcessor(DelimiterProcessor)} + */ + List<DelimiterProcessor> getCustomDelimiterProcessors(); + + /** + * @return custom link processors that have been configured with {@link Parser.Builder#linkProcessor}. + */ + List<LinkProcessor> getCustomLinkProcessors(); + + /** + * @return custom link markers that have been configured with {@link Parser.Builder#linkMarker}. + */ + Set<Character> getCustomLinkMarkers(); + + /** + * Look up a {@link LinkReferenceDefinition} for a given label. + * <p> + * Note that the passed in label does not need to be normalized; implementations are responsible for doing the + * normalization before lookup. + * + * @param label the link label to look up + * @return the definition if one exists, {@code null} otherwise + * @deprecated use {@link #getDefinition} with {@link LinkReferenceDefinition} instead + */ + @Deprecated + LinkReferenceDefinition getLinkReferenceDefinition(String label); + + /** + * Look up a definition of a type for a given label. + * <p> + * Note that the passed in label does not need to be normalized; implementations are responsible for doing the + * normalization before lookup. 
+ * + * @return the definition if one exists, null otherwise + */ + <D> D getDefinition(Class<D> type, String label); +} diff --git a/commonmark/src/main/java/org/commonmark/parser/InlineParserFactory.java b/commonmark/src/main/java/org/commonmark/parser/InlineParserFactory.java new file mode 100644 index 000000000..c1640e9d8 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/InlineParserFactory.java @@ -0,0 +1,12 @@ +package org.commonmark.parser; + +/** + * Factory for custom inline parser. + */ +public interface InlineParserFactory { + + /** + * Create an {@link InlineParser} to use for parsing inlines. This is called once per parsed document. + */ + InlineParser create(InlineParserContext inlineParserContext); +} diff --git a/commonmark/src/main/java/org/commonmark/parser/Parser.java b/commonmark/src/main/java/org/commonmark/parser/Parser.java index 7d46dfb0d..8faac789b 100644 --- a/commonmark/src/main/java/org/commonmark/parser/Parser.java +++ b/commonmark/src/main/java/org/commonmark/parser/Parser.java @@ -1,81 +1,151 @@ package org.commonmark.parser; -import java.io.IOException; -import java.io.Reader; import org.commonmark.Extension; +import org.commonmark.internal.Definitions; import org.commonmark.internal.DocumentParser; +import org.commonmark.internal.InlineParserContextImpl; import org.commonmark.internal.InlineParserImpl; -import org.commonmark.node.Node; +import org.commonmark.node.*; +import org.commonmark.parser.beta.LinkInfo; +import org.commonmark.parser.beta.LinkProcessor; +import org.commonmark.parser.beta.InlineContentParserFactory; +import org.commonmark.parser.beta.LinkResult; import org.commonmark.parser.block.BlockParserFactory; +import org.commonmark.parser.delimiter.DelimiterProcessor; + +import java.io.IOException; +import java.io.Reader; +import java.util.*; -import java.util.ArrayList; -import java.util.BitSet; -import java.util.List; -import java.util.Map; +/** + * Parses input text to a tree of nodes. 
+ * <p> + * Start with the {@link #builder} method, configure the parser and build it. Example: + * <pre><code> + * Parser parser = Parser.builder().build(); + * Node document = parser.parse("input text"); + * </code></pre> + */ public class Parser { private final List<BlockParserFactory> blockParserFactories; - private final Map<Character, DelimiterProcessor> delimiterProcessors; - private final BitSet delimiterCharacters; - private final BitSet specialCharacters; + private final List<InlineContentParserFactory> inlineContentParserFactories; + private final List<DelimiterProcessor> delimiterProcessors; + private final List<LinkProcessor> linkProcessors; + private final Set<Character> linkMarkers; + private final InlineParserFactory inlineParserFactory; private final List<PostProcessor> postProcessors; + private final IncludeSourceSpans includeSourceSpans; + private final int maxOpenBlockParsers; private Parser(Builder builder) { - this.blockParserFactories = DocumentParser.calculateBlockParserFactories(builder.blockParserFactories); - this.delimiterProcessors = InlineParserImpl.calculateDelimiterProcessors(builder.delimiterProcessors); - this.delimiterCharacters = InlineParserImpl.calculateDelimiterCharacters(delimiterProcessors.keySet()); - this.specialCharacters = InlineParserImpl.calculateSpecialCharacters(delimiterCharacters); + this.blockParserFactories = DocumentParser.calculateBlockParserFactories(builder.blockParserFactories, builder.enabledBlockTypes); + this.inlineParserFactory = builder.getInlineParserFactory(); this.postProcessors = builder.postProcessors; + this.inlineContentParserFactories = builder.inlineContentParserFactories; + this.delimiterProcessors = builder.delimiterProcessors; + this.linkProcessors = builder.linkProcessors; + this.linkMarkers = builder.linkMarkers; + this.includeSourceSpans = builder.includeSourceSpans; + this.maxOpenBlockParsers = builder.maxOpenBlockParsers; + + // Try to construct an inline parser. 
Invalid configuration might result in an exception, which we want to + // detect as soon as possible. + var context = new InlineParserContextImpl( + inlineContentParserFactories, delimiterProcessors, linkProcessors, linkMarkers, new Definitions()); + this.inlineParserFactory.create(context); } + /** + * Create a new builder for configuring a {@link Parser}. + * + * @return a builder + */ public static Builder builder() { return new Builder(); } /** - * Parse the specified input text into a AST (tree of nodes). + * Parse the specified input text into a tree of nodes. * <p> - * Note that this method is thread-safe (a new parser state is used for each invocation). + * This method is thread-safe (a new parser state is used for each invocation). * - * @param input the text to parse + * @param input the text to parse - must not be null * @return the root node */ public Node parse(String input) { - InlineParserImpl inlineParser = new InlineParserImpl(specialCharacters, delimiterCharacters, delimiterProcessors); - DocumentParser documentParser = new DocumentParser(blockParserFactories, inlineParser); + Objects.requireNonNull(input, "input must not be null"); + DocumentParser documentParser = createDocumentParser(); Node document = documentParser.parse(input); return postProcess(document); } - + + /** + * Parse the specified reader into a tree of nodes. The caller is responsible for closing the reader. + * <pre><code> + * Parser parser = Parser.builder().build(); + * try (InputStreamReader reader = new InputStreamReader(new FileInputStream("file.md"), StandardCharsets.UTF_8)) { + * Node document = parser.parseReader(reader); + * // ... + * } + * </code></pre> + * Note that if you have a file with a byte order mark (BOM), you need to skip it before handing the reader to this + * library. There are existing classes that do that, e.g. see {@code BOMInputStream} in Commons IO. + * <p> + * This method is thread-safe (a new parser state is used for each invocation).
+ * + * @param input the reader to parse - must not be null + * @return the root node + * @throws IOException when reading throws an exception + */ public Node parseReader(Reader input) throws IOException { - InlineParserImpl inlineParser = new InlineParserImpl(specialCharacters, delimiterCharacters, delimiterProcessors); - DocumentParser documentParser = new DocumentParser(blockParserFactories, inlineParser); + Objects.requireNonNull(input, "input must not be null"); + DocumentParser documentParser = createDocumentParser(); Node document = documentParser.parse(input); return postProcess(document); } + private DocumentParser createDocumentParser() { + return new DocumentParser(blockParserFactories, inlineParserFactory, inlineContentParserFactories, + delimiterProcessors, linkProcessors, linkMarkers, includeSourceSpans, maxOpenBlockParsers); + } + private Node postProcess(Node document) { for (PostProcessor postProcessor : postProcessors) { document = postProcessor.process(document); } return document; } - + + /** + * Builder for configuring a {@link Parser}. + */ public static class Builder { private final List<BlockParserFactory> blockParserFactories = new ArrayList<>(); + private final List<InlineContentParserFactory> inlineContentParserFactories = new ArrayList<>(); private final List<DelimiterProcessor> delimiterProcessors = new ArrayList<>(); + private final List<LinkProcessor> linkProcessors = new ArrayList<>(); private final List<PostProcessor> postProcessors = new ArrayList<>(); + private final Set<Character> linkMarkers = new HashSet<>(); + private Set<Class<? 
extends Block>> enabledBlockTypes = DocumentParser.getDefaultBlockParserTypes(); + private InlineParserFactory inlineParserFactory; + private IncludeSourceSpans includeSourceSpans = IncludeSourceSpans.NONE; + private int maxOpenBlockParsers = Integer.MAX_VALUE; + /** + * @return the configured {@link Parser} + */ public Parser build() { return new Parser(this); } /** * @param extensions extensions to use on this parser - * @return this + * @return {@code this} */ public Builder extensions(Iterable<? extends Extension> extensions) { + Objects.requireNonNull(extensions, "extensions must not be null"); for (Extension extension : extensions) { if (extension instanceof ParserExtension) { ParserExtension parserExtension = (ParserExtension) extension; @@ -85,24 +155,196 @@ public Builder extensions(Iterable<? extends Extension> extensions) { return this; } + /** + * Describe the list of markdown features the parser will recognize and parse. + * <p> + * By default, CommonMark will recognize and parse the following set of "block" elements: + * <ul> + * <li>{@link Heading} ({@code #}) + * <li>{@link HtmlBlock} ({@code <html></html>}) + * <li>{@link ThematicBreak} (Horizontal Rule) ({@code ---}) + * <li>{@link FencedCodeBlock} ({@code ```}) + * <li>{@link IndentedCodeBlock} + * <li>{@link BlockQuote} ({@code >}) + * <li>{@link ListBlock} (Ordered / Unordered List) ({@code 1. / *}) + * </ul> + * <p> + * To parse only a subset of the features listed above, pass a list of each feature's associated {@link Block} class. + * <p> + * E.g., to only parse headings and lists: + * <pre> + * {@code + * Parser.builder().enabledBlockTypes(Set.of(Heading.class, ListBlock.class)); + * } + * </pre> + * + * @param enabledBlockTypes A list of block nodes the parser will parse. + * If this list is empty, the parser will not recognize any CommonMark core features. + * @return {@code this} + */ + public Builder enabledBlockTypes(Set<Class<? 
extends Block>> enabledBlockTypes) { + Objects.requireNonNull(enabledBlockTypes, "enabledBlockTypes must not be null"); + DocumentParser.checkEnabledBlockTypes(enabledBlockTypes); + this.enabledBlockTypes = enabledBlockTypes; + return this; + } + + /** + * Whether to calculate source positions for parsed {@link Node Nodes}, see {@link Node#getSourceSpans()}. + * <p> + * By default, source spans are disabled. + * + * @param includeSourceSpans which kind of source spans should be included + * @return {@code this} + * @since 0.16.0 + */ + public Builder includeSourceSpans(IncludeSourceSpans includeSourceSpans) { + this.includeSourceSpans = includeSourceSpans; + return this; + } + + /** + * Limit how many block parsers may be open at once while parsing. + * <p> + * Once the limit is reached, additional block starts are treated as plain text instead of + * creating deeper nested block structure. + * <p> + * The document root parser is not counted. The default is unlimited, so callers that keep + * using {@code Parser.builder().build()} preserve behavior. + * + * @param maxOpenBlockParsers maximum number of open non-document block parsers, must be + * zero or greater + * @return {@code this} + */ + public Builder maxOpenBlockParsers(int maxOpenBlockParsers) { + if (maxOpenBlockParsers < 0) { + throw new IllegalArgumentException("maxOpenBlockParsers must be >= 0"); + } + this.maxOpenBlockParsers = maxOpenBlockParsers; + return this; + } + + /** + * Add a custom block parser factory. + * <p> + * Note that custom factories are applied <em>before</em> the built-in factories. This is so that + * extensions can change how some syntax is parsed that would otherwise be handled by built-in factories. + * "With great power comes great responsibility." 
+ * + * @param blockParserFactory a block parser factory implementation + * @return {@code this} + */ public Builder customBlockParserFactory(BlockParserFactory blockParserFactory) { + Objects.requireNonNull(blockParserFactory, "blockParserFactory must not be null"); blockParserFactories.add(blockParserFactory); return this; } + /** + * Add a factory for a custom inline content parser, for extending inline parsing or overriding built-in parsing. + * <p> + * Note that parsers are triggered based on a special character as specified by + * {@link InlineContentParserFactory#getTriggerCharacters()}. It is possible to register multiple parsers for the same + * character, or even for some built-in special character such as {@code `}. The custom parsers are tried first + * in order in which they are registered, and then the built-in ones. + */ + public Builder customInlineContentParserFactory(InlineContentParserFactory inlineContentParserFactory) { + Objects.requireNonNull(inlineContentParserFactory, "inlineContentParser must not be null"); + inlineContentParserFactories.add(inlineContentParserFactory); + return this; + } + + /** + * Add a custom delimiter processor for inline parsing. + * <p> + * Note that multiple delimiter processors with the same characters can be added, as long as they have a + * different minimum length. In that case, the processor with the shortest matching length is used. Adding more + * than one delimiter processor with the same character and minimum length is invalid. + * <p> + * If you want more control over how parsing is done, you might want to use + * {@link #customInlineContentParserFactory} instead. 
+ * + * @param delimiterProcessor a delimiter processor implementation + * @return {@code this} + */ public Builder customDelimiterProcessor(DelimiterProcessor delimiterProcessor) { + Objects.requireNonNull(delimiterProcessor, "delimiterProcessor must not be null"); delimiterProcessors.add(delimiterProcessor); return this; } + /** + * Add a custom link/image processor for inline parsing. + * <p> + * Multiple link processors can be added, and will be tried in order in which they were added. If no link + * processor applies, the normal behavior applies. That means these can override built-in link parsing. + * + * @param linkProcessor a link processor implementation + * @return {@code this} + */ + public Builder linkProcessor(LinkProcessor linkProcessor) { + Objects.requireNonNull(linkProcessor, "linkProcessor must not be null"); + linkProcessors.add(linkProcessor); + return this; + } + + /** + * Add a custom link marker for link processing. A link marker is a character like {@code !} which, if it + * appears before the {@code [} of a link, changes the meaning of the link. + * <p> + * If a link marker followed by a valid link is parsed, the {@link org.commonmark.parser.beta.LinkInfo} + * that is passed to {@link LinkProcessor} will have its {@link LinkInfo#marker()} set. A link processor should + * check the {@link Text#getLiteral()} and then do any processing, and will probably want to use {@link LinkResult#includeMarker()}. + * + * @param linkMarker a link marker character + * @return {@code this} + */ + public Builder linkMarker(Character linkMarker) { + Objects.requireNonNull(linkMarker, "linkMarker must not be null"); + linkMarkers.add(linkMarker); + return this; + } + public Builder postProcessor(PostProcessor postProcessor) { + Objects.requireNonNull(postProcessor, "postProcessor must not be null"); postProcessors.add(postProcessor); return this; } + + /** + * Overrides the parser used for inline markdown processing. 
+ * <p> + * Provide an implementation of InlineParserFactory which provides a custom inline parser + * to modify how the following are parsed: + * bold (**) + * italic (*) + * strikethrough (~~) + * backtick quote (`) + * link ([title](http://)) + * image (![alt](http://)) + * <p> + * Note that if this method is not called or the inline parser factory is set to null, then the default + * implementation will be used. + * + * @param inlineParserFactory an inline parser factory implementation + * @return {@code this} + */ + public Builder inlineParserFactory(InlineParserFactory inlineParserFactory) { + this.inlineParserFactory = inlineParserFactory; + return this; + } + + private InlineParserFactory getInlineParserFactory() { + if (inlineParserFactory != null) { + return inlineParserFactory; + } else { + return InlineParserImpl::new; + } + } } /** - * Extension for parser. + * Extension for {@link Parser}. */ public interface ParserExtension extends Extension { void extend(Builder parserBuilder); diff --git a/commonmark/src/main/java/org/commonmark/parser/SourceLine.java b/commonmark/src/main/java/org/commonmark/parser/SourceLine.java new file mode 100644 index 000000000..92a8cdfaf --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/SourceLine.java @@ -0,0 +1,47 @@ +package org.commonmark.parser; + +import org.commonmark.node.SourceSpan; + +import java.util.Objects; + +/** + * A line or part of a line from the input source. 
+ * + * @since 0.16.0 + */ +public class SourceLine { + + private final CharSequence content; + private final SourceSpan sourceSpan; + + public static SourceLine of(CharSequence content, SourceSpan sourceSpan) { + return new SourceLine(content, sourceSpan); + } + + private SourceLine(CharSequence content, SourceSpan sourceSpan) { + this.content = Objects.requireNonNull(content, "content must not be null"); + this.sourceSpan = sourceSpan; + } + + public CharSequence getContent() { + return content; + } + + public SourceSpan getSourceSpan() { + return sourceSpan; + } + + public SourceLine substring(int beginIndex, int endIndex) { + CharSequence newContent = content.subSequence(beginIndex, endIndex); + SourceSpan newSourceSpan = null; + if (sourceSpan != null) { + int length = endIndex - beginIndex; + if (length != 0) { + int columnIndex = sourceSpan.getColumnIndex() + beginIndex; + int inputIndex = sourceSpan.getInputIndex() + beginIndex; + newSourceSpan = SourceSpan.of(sourceSpan.getLineIndex(), columnIndex, inputIndex, length); + } + } + return SourceLine.of(newContent, newSourceSpan); + } +} diff --git a/commonmark/src/main/java/org/commonmark/parser/SourceLines.java b/commonmark/src/main/java/org/commonmark/parser/SourceLines.java new file mode 100644 index 000000000..0b4290341 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/SourceLines.java @@ -0,0 +1,66 @@ +package org.commonmark.parser; + +import org.commonmark.node.SourceSpan; + +import java.util.ArrayList; +import java.util.List; + +/** + * A set of lines ({@link SourceLine}) from the input source. 
+ * + * @since 0.16.0 + */ +public class SourceLines { + + private final List<SourceLine> lines = new ArrayList<>(); + + public static SourceLines empty() { + return new SourceLines(); + } + + public static SourceLines of(SourceLine sourceLine) { + SourceLines sourceLines = new SourceLines(); + sourceLines.addLine(sourceLine); + return sourceLines; + } + + public static SourceLines of(List<SourceLine> sourceLines) { + SourceLines result = new SourceLines(); + result.lines.addAll(sourceLines); + return result; + } + + public void addLine(SourceLine sourceLine) { + lines.add(sourceLine); + } + + public List<SourceLine> getLines() { + return lines; + } + + public boolean isEmpty() { + return lines.isEmpty(); + } + + public String getContent() { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < lines.size(); i++) { + if (i != 0) { + sb.append('\n'); + } + sb.append(lines.get(i).getContent()); + } + return sb.toString(); + } + + public List<SourceSpan> getSourceSpans() { + List<SourceSpan> sourceSpans = new ArrayList<>(); + for (SourceLine line : lines) { + SourceSpan sourceSpan = line.getSourceSpan(); + if (sourceSpan != null) { + sourceSpans.add(sourceSpan); + } + } + return sourceSpans; + } +} diff --git a/commonmark/src/main/java/org/commonmark/parser/beta/InlineContentParser.java b/commonmark/src/main/java/org/commonmark/parser/beta/InlineContentParser.java new file mode 100644 index 000000000..bc5c9a54f --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/beta/InlineContentParser.java @@ -0,0 +1,21 @@ +package org.commonmark.parser.beta; + +/** + * Parser for a type of inline content. Registered via a {@link InlineContentParserFactory} and created by its + * {@link InlineContentParserFactory#create() create} method. The lifetime of this is tied to each inline content + * snippet that is parsed, as a new instance is created for each. 
+ */ +public interface InlineContentParser { + + /** + * Try to parse inline content starting from the current position. Note that the character at the current position + * is one of {@link InlineContentParserFactory#getTriggerCharacters()} of the factory that created this parser. + * <p> + * For a given inline content snippet that is being parsed, this method can be called multiple times: each time a + * trigger character is encountered. + * + * @param inlineParserState the current state of the inline parser + * @return the result of parsing; can indicate that this parser is not interested, or that parsing was successful + */ + ParsedInline tryParse(InlineParserState inlineParserState); +} diff --git a/commonmark/src/main/java/org/commonmark/parser/beta/InlineContentParserFactory.java b/commonmark/src/main/java/org/commonmark/parser/beta/InlineContentParserFactory.java new file mode 100644 index 000000000..c86f93a41 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/beta/InlineContentParserFactory.java @@ -0,0 +1,24 @@ +package org.commonmark.parser.beta; + +import java.util.Set; + +/** + * A factory for extending inline content parsing. + * <p> + * See {@link org.commonmark.parser.Parser.Builder#customInlineContentParserFactory} for how to register it. + */ +public interface InlineContentParserFactory { + + /** + * An inline content parser needs to have a special "trigger" character which activates it. When this character is + * encountered during inline parsing, {@link InlineContentParser#tryParse} is called with the current parser state. + * It can also register for more than one trigger character. + */ + Set<Character> getTriggerCharacters(); + + /** + * Create an {@link InlineContentParser} that will do the parsing. Create is called once per text snippet of inline + * content inside block structures, and the parser is then called each time a trigger character is encountered.
+ */ + InlineContentParser create(); +} diff --git a/commonmark/src/main/java/org/commonmark/parser/beta/InlineParserState.java b/commonmark/src/main/java/org/commonmark/parser/beta/InlineParserState.java new file mode 100644 index 000000000..e434d45d6 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/beta/InlineParserState.java @@ -0,0 +1,13 @@ +package org.commonmark.parser.beta; + +public interface InlineParserState { + + /** + * Return a scanner for the input for the current position (on the trigger character that the inline parser was + * added for). + * <p> + * Note that this always returns the same instance, if you want to backtrack you need to use + * {@link Scanner#position()} and {@link Scanner#setPosition(Position)}. + */ + Scanner scanner(); +} diff --git a/commonmark/src/main/java/org/commonmark/parser/beta/LinkInfo.java b/commonmark/src/main/java/org/commonmark/parser/beta/LinkInfo.java new file mode 100644 index 000000000..b2fda57e4 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/beta/LinkInfo.java @@ -0,0 +1,69 @@ +package org.commonmark.parser.beta; + +import org.commonmark.node.Text; + +/** + * A parsed link/image. There are different types of links. + * <p> + * Inline links: + * <pre> + * [text](destination) + * [text](destination "title") + * </pre> + * <p> + * Reference links, which have different subtypes. Full: + * <pre> + * [text][label] + * </pre> + * Collapsed (label is ""): + * <pre> + * [text][] + * </pre> + * Shortcut (label is null): + * <pre> + * [text] + * </pre> + * Images use the same syntax as links but with a {@code !} {@link #marker()} in front, e.g. {@code ![text](destination)}. + */ +public interface LinkInfo { + + /** + * The marker if present, or null. A marker is e.g. {@code !} for an image, or a custom marker as specified in + * {@link org.commonmark.parser.Parser.Builder#linkMarker}. + */ + Text marker(); + + /** + * The text node of the opening bracket {@code [}.
+ */ + Text openingBracket(); + + /** + * The text between the first brackets, e.g. {@code foo} in {@code [foo][bar]}. + */ + String text(); + + /** + * The label, or null for inline links or for shortcut links (in which case {@link #text()} should be used as the label). + */ + String label(); + + /** + * The destination if available, e.g. in {@code [foo](destination)}, or null + */ + String destination(); + + /** + * The title if available, e.g. in {@code [foo](destination "title")}, or null + */ + String title(); + + /** + * The position after the closing text bracket, e.g.: + * <pre> + * [foo][bar] + * ^ + * </pre> + */ + Position afterTextBracket(); +} diff --git a/commonmark/src/main/java/org/commonmark/parser/beta/LinkProcessor.java b/commonmark/src/main/java/org/commonmark/parser/beta/LinkProcessor.java new file mode 100644 index 000000000..3e448fd91 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/beta/LinkProcessor.java @@ -0,0 +1,40 @@ +package org.commonmark.parser.beta; + +import org.commonmark.parser.InlineParserContext; + +/** + * An interface to decide how links/images are handled. + * <p> + * Implementations need to be registered with a parser via {@link org.commonmark.parser.Parser.Builder#linkProcessor}. + * Then, when inline parsing is run, each parsed link/image is passed to the processor. This includes links like these: + * <p> + * <pre><code> + * [text](destination) + * [text] + * [text][] + * [text][label] + * </code></pre> + * And images: + * <pre><code> + * ![text](destination) + * ![text] + * ![text][] + * ![text][label] + * </code></pre> + * See {@link LinkInfo} for accessing various parts of the parsed link/image. + * <p> + * The processor can then inspect the link/image and decide what to do with it by returning the appropriate + * {@link LinkResult}. If it returns {@link LinkResult#none()}, the next registered processor is tried. If none of them + * apply, the link is handled as it normally would be.
+ */ +public interface LinkProcessor { + + /** + * @param linkInfo information about the parsed link/image + * @param scanner the scanner at the current position after the parsed link/image + * @param context context for inline parsing + * @return what to do with the link/image, e.g. do nothing (try the next processor), wrap the text in a node, or + * replace the link/image with a node + */ + LinkResult process(LinkInfo linkInfo, Scanner scanner, InlineParserContext context); +} diff --git a/commonmark/src/main/java/org/commonmark/parser/beta/LinkResult.java b/commonmark/src/main/java/org/commonmark/parser/beta/LinkResult.java new file mode 100644 index 000000000..43bc82af8 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/beta/LinkResult.java @@ -0,0 +1,50 @@ +package org.commonmark.parser.beta; + +import org.commonmark.internal.inline.LinkResultImpl; +import org.commonmark.node.Node; + +/** + * What to do with a link/image processed by {@link LinkProcessor}. + */ +public interface LinkResult { + /** + * Link not handled by processor. + */ + static LinkResult none() { + return null; + } + + /** + * Wrap the link text in a node. This is the normal behavior for links, e.g. for this: + * <pre><code> + * [my *text*](destination) + * </code></pre> + * The text is {@code my *text*}, a text node and emphasis. The text is wrapped in a + * {@link org.commonmark.node.Link} node, which means the text is added as child nodes to it. + * + * @param node the node to which the link text nodes will be added as child nodes + * @param position the position to continue parsing from + */ + static LinkResult wrapTextIn(Node node, Position position) { + return new LinkResultImpl(LinkResultImpl.Type.WRAP, node, position); + } + + /** + * Replace the link with a node. E.g. for this: + * <pre><code> + * [^foo] + * </code></pre> + * The processor could decide to create a {@code FootnoteReference} node instead which replaces the link. 
+ * + * @param node the node to replace the link with + * @param position the position to continue parsing from + */ + static LinkResult replaceWith(Node node, Position position) { + return new LinkResultImpl(LinkResultImpl.Type.REPLACE, node, position); + } + + /** + * If a {@link LinkInfo#marker()} is present, include it in processing (i.e. treat it the same way as the brackets). + */ + LinkResult includeMarker(); +} diff --git a/commonmark/src/main/java/org/commonmark/parser/beta/ParsedInline.java b/commonmark/src/main/java/org/commonmark/parser/beta/ParsedInline.java new file mode 100644 index 000000000..5d1402cae --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/beta/ParsedInline.java @@ -0,0 +1,24 @@ +package org.commonmark.parser.beta; + +import org.commonmark.internal.inline.ParsedInlineImpl; +import org.commonmark.node.Node; + +import java.util.Objects; + +/** + * The result of a single inline parser. Use the static methods to create instances. + * <p> + * <em>This interface is not intended to be implemented by clients.</em> + */ +public interface ParsedInline { + + static ParsedInline none() { + return null; + } + + static ParsedInline of(Node node, Position position) { + Objects.requireNonNull(node, "node must not be null"); + Objects.requireNonNull(position, "position must not be null"); + return new ParsedInlineImpl(node, position); + } +} diff --git a/commonmark/src/main/java/org/commonmark/parser/beta/Position.java b/commonmark/src/main/java/org/commonmark/parser/beta/Position.java new file mode 100644 index 000000000..3dbb4870f --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/beta/Position.java @@ -0,0 +1,16 @@ +package org.commonmark.parser.beta; + +/** + * Position within a {@link Scanner}. This is intentionally kept opaque so as not to expose the internal structure of + * the Scanner. 
+ */ +public class Position { + + final int lineIndex; + final int index; + + Position(int lineIndex, int index) { + this.lineIndex = lineIndex; + this.index = index; + } +} diff --git a/commonmark/src/main/java/org/commonmark/parser/beta/Scanner.java b/commonmark/src/main/java/org/commonmark/parser/beta/Scanner.java new file mode 100644 index 000000000..324639493 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/beta/Scanner.java @@ -0,0 +1,281 @@ +package org.commonmark.parser.beta; + +import org.commonmark.node.SourceSpan; +import org.commonmark.parser.SourceLine; +import org.commonmark.parser.SourceLines; +import org.commonmark.text.CharMatcher; + +import java.util.List; + +public class Scanner { + + /** + * Character representing the end of input source (or outside of the text in case of the "previous" methods). + * <p> + * Note that we can use NULL to represent this because CommonMark does not allow those in the input (we replace them + * in the beginning of parsing). + */ + public static final char END = '\0'; + + // Lines without newlines at the end. The scanner will yield `\n` between lines because they're significant for + // parsing and the final output. There is no `\n` after the last line. + private final List<SourceLine> lines; + // Which line we're at. + private int lineIndex; + // The index within the line. If index == length(), we pretend that there's a `\n` and only advance after we yield + // that. 
+ private int index; + + // Current line or "" if at the end of the lines (using "" instead of null saves a null check) + private SourceLine line = SourceLine.of("", null); + private int lineLength = 0; + + Scanner(List<SourceLine> lines, int lineIndex, int index) { + this.lines = lines; + this.lineIndex = lineIndex; + this.index = index; + if (!lines.isEmpty()) { + checkPosition(lineIndex, index); + setLine(lines.get(lineIndex)); + } + } + + public static Scanner of(SourceLines lines) { + return new Scanner(lines.getLines(), 0, 0); + } + + public char peek() { + if (index < lineLength) { + return line.getContent().charAt(index); + } else { + if (lineIndex < lines.size() - 1) { + return '\n'; + } else { + // Don't return newline for end of last line + return END; + } + } + } + + public int peekCodePoint() { + if (index < lineLength) { + char c = line.getContent().charAt(index); + if (Character.isHighSurrogate(c) && index + 1 < lineLength) { + char low = line.getContent().charAt(index + 1); + if (Character.isLowSurrogate(low)) { + return Character.toCodePoint(c, low); + } + } + return c; + } else { + if (lineIndex < lines.size() - 1) { + return '\n'; + } else { + // Don't return newline for end of last line + return END; + } + } + } + + public int peekPreviousCodePoint() { + if (index > 0) { + int prev = index - 1; + char c = line.getContent().charAt(prev); + if (Character.isLowSurrogate(c) && prev > 0) { + char high = line.getContent().charAt(prev - 1); + if (Character.isHighSurrogate(high)) { + return Character.toCodePoint(high, c); + } + } + return c; + } else { + if (lineIndex > 0) { + return '\n'; + } else { + return END; + } + } + } + + public boolean hasNext() { + if (index < lineLength) { + return true; + } else { + // No newline at end of last line + return lineIndex < lines.size() - 1; + } + } + + public void next() { + index++; + if (index > lineLength) { + lineIndex++; + if (lineIndex < lines.size()) { + setLine(lines.get(lineIndex)); + } else { + 
setLine(SourceLine.of("", null)); + } + index = 0; + } + } + + /** + * Check if the specified char is next and advance the position. + * + * @param c the char to check (including newline characters) + * @return true if matched and position was advanced, false otherwise + */ + public boolean next(char c) { + if (peek() == c) { + next(); + return true; + } else { + return false; + } + } + + /** + * Check if we have the specified content on the line and advance the position. Note that if you want to match + * newline characters, use {@link #next(char)}. + * + * @param content the text content to match on a single line (excluding newline characters) + * @return true if matched and position was advanced, false otherwise + */ + public boolean next(String content) { + if (index < lineLength && index + content.length() <= lineLength) { + // Can't use startsWith because it's not available on CharSequence + for (int i = 0; i < content.length(); i++) { + if (line.getContent().charAt(index + i) != content.charAt(i)) { + return false; + } + } + index += content.length(); + return true; + } else { + return false; + } + } + + public int matchMultiple(char c) { + int count = 0; + while (peek() == c) { + count++; + next(); + } + return count; + } + + public int match(CharMatcher matcher) { + int count = 0; + while (matcher.matches(peek())) { + count++; + next(); + } + return count; + } + + public int whitespace() { + int count = 0; + while (true) { + switch (peek()) { + case ' ': + case '\t': + case '\n': + case '\u000B': + case '\f': + case '\r': + count++; + next(); + break; + default: + return count; + } + } + } + + public int find(char c) { + int count = 0; + while (true) { + char cur = peek(); + if (cur == Scanner.END) { + return -1; + } else if (cur == c) { + return count; + } + count++; + next(); + } + } + + public int find(CharMatcher matcher) { + int count = 0; + while (true) { + char c = peek(); + if (c == END) { + return -1; + } else if (matcher.matches(c)) { + return
count; + } + count++; + next(); + } + } + + // Don't expose the int index, because it would be good if we could switch input to a List<String> of lines later + // instead of one contiguous String. + public Position position() { + return new Position(lineIndex, index); + } + + public void setPosition(Position position) { + checkPosition(position.lineIndex, position.index); + this.lineIndex = position.lineIndex; + this.index = position.index; + setLine(lines.get(this.lineIndex)); + } + + // For cases where the caller appends the result to a StringBuilder, we could offer another method to avoid some + // unnecessary copying. + public SourceLines getSource(Position begin, Position end) { + if (begin.lineIndex == end.lineIndex) { + // Shortcut for common case of text from a single line + SourceLine line = lines.get(begin.lineIndex); + CharSequence newContent = line.getContent().subSequence(begin.index, end.index); + SourceSpan newSourceSpan = null; + SourceSpan sourceSpan = line.getSourceSpan(); + if (sourceSpan != null) { + newSourceSpan = sourceSpan.subSpan(begin.index, end.index); + } + return SourceLines.of(SourceLine.of(newContent, newSourceSpan)); + } else { + SourceLines sourceLines = SourceLines.empty(); + + SourceLine firstLine = lines.get(begin.lineIndex); + sourceLines.addLine(firstLine.substring(begin.index, firstLine.getContent().length())); + + // Lines between begin and end (we are appending the full line) + for (int line = begin.lineIndex + 1; line < end.lineIndex; line++) { + sourceLines.addLine(lines.get(line)); + } + + SourceLine lastLine = lines.get(end.lineIndex); + sourceLines.addLine(lastLine.substring(0, end.index)); + return sourceLines; + } + } + + private void setLine(SourceLine line) { + this.line = line; + this.lineLength = line.getContent().length(); + } + + private void checkPosition(int lineIndex, int index) { + if (lineIndex < 0 || lineIndex >= lines.size()) { + throw new IllegalArgumentException("Line index " + lineIndex + " out of 
range, number of lines: " + lines.size()); + } + SourceLine line = lines.get(lineIndex); + if (index < 0 || index > line.getContent().length()) { + throw new IllegalArgumentException("Index " + index + " out of range, line length: " + line.getContent().length()); + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/parser/beta/package-info.java b/commonmark/src/main/java/org/commonmark/parser/beta/package-info.java new file mode 100644 index 000000000..029d80507 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/beta/package-info.java @@ -0,0 +1,4 @@ +/** + * Experimental APIs to use for extensions. APIs are subject to change if necessary. + */ +package org.commonmark.parser.beta; diff --git a/commonmark/src/main/java/org/commonmark/parser/block/AbstractBlockParser.java b/commonmark/src/main/java/org/commonmark/parser/block/AbstractBlockParser.java index 709bd7b93..4fb1a05ac 100644 --- a/commonmark/src/main/java/org/commonmark/parser/block/AbstractBlockParser.java +++ b/commonmark/src/main/java/org/commonmark/parser/block/AbstractBlockParser.java @@ -1,7 +1,12 @@ package org.commonmark.parser.block; import org.commonmark.node.Block; +import org.commonmark.node.DefinitionMap; +import org.commonmark.node.SourceSpan; import org.commonmark.parser.InlineParser; +import org.commonmark.parser.SourceLine; + +import java.util.List; public abstract class AbstractBlockParser implements BlockParser { @@ -11,12 +16,27 @@ public boolean isContainer() { } @Override - public boolean canContain(Block block) { + public boolean canHaveLazyContinuationLines() { + return false; + } + + @Override + public boolean canContain(Block childBlock) { return false; } @Override - public void addLine(CharSequence line) { + public void addLine(SourceLine line) { + } + + @Override + public void addSourceSpan(SourceSpan sourceSpan) { + getBlock().addSourceSpan(sourceSpan); + } + + @Override + public List<DefinitionMap<?>> getDefinitions() { + return List.of(); } @Override 
diff --git a/commonmark/src/main/java/org/commonmark/parser/block/BlockParser.java b/commonmark/src/main/java/org/commonmark/parser/block/BlockParser.java index 48e9098e3..32ff2a474 100644 --- a/commonmark/src/main/java/org/commonmark/parser/block/BlockParser.java +++ b/commonmark/src/main/java/org/commonmark/parser/block/BlockParser.java @@ -1,7 +1,12 @@ package org.commonmark.parser.block; import org.commonmark.node.Block; +import org.commonmark.node.DefinitionMap; +import org.commonmark.node.SourceSpan; import org.commonmark.parser.InlineParser; +import org.commonmark.parser.SourceLine; + +import java.util.List; /** * Parser for a specific block node. @@ -15,13 +20,43 @@ public interface BlockParser { */ boolean isContainer(); - boolean canContain(Block block); + /** + * Return true if the block can have lazy continuation lines. + * <p> + * Lazy continuation lines are lines that were rejected by this {@link #tryContinue(ParserState)} but didn't match + * any other block parsers either. + * <p> + * If true is returned here, those lines will get added via {@link #addLine(SourceLine)}. For false, the block is + * closed instead. + */ + boolean canHaveLazyContinuationLines(); + + boolean canContain(Block childBlock); Block getBlock(); BlockContinue tryContinue(ParserState parserState); - void addLine(CharSequence line); + /** + * Add the part of a line that belongs to this block parser to parse (i.e. without any container block markers). + * Note that the line will only include a {@link SourceLine#getSourceSpan()} if source spans are enabled for inlines. + */ + void addLine(SourceLine line); + + /** + * Add a source span of the currently parsed block. The default implementation in {@link AbstractBlockParser} adds + * it to the block. Unless you have some complicated parsing where you need to check source positions, you don't + * need to override this. 
+ * + * @since 0.16.0 + */ + void addSourceSpan(SourceSpan sourceSpan); + + /** + * Return definitions parsed by this parser. The definitions returned here can later be accessed during inline + * parsing via {@link org.commonmark.parser.InlineParserContext#getDefinition}. + */ + List<DefinitionMap<?>> getDefinitions(); void closeBlock(); diff --git a/commonmark/src/main/java/org/commonmark/parser/block/BlockStart.java b/commonmark/src/main/java/org/commonmark/parser/block/BlockStart.java index da8f3b751..c41f1caa3 100644 --- a/commonmark/src/main/java/org/commonmark/parser/block/BlockStart.java +++ b/commonmark/src/main/java/org/commonmark/parser/block/BlockStart.java @@ -2,8 +2,6 @@ import org.commonmark.internal.BlockStartImpl; -import java.util.Collections; - /** * Result object for starting parsing of a block, see static methods for constructors. */ @@ -12,18 +10,59 @@ public abstract class BlockStart { protected BlockStart() { } + /** + * Result for when there is no block start. + */ public static BlockStart none() { return null; } + /** + * Start block(s) with the specified parser(s). + */ public static BlockStart of(BlockParser... blockParsers) { return new BlockStartImpl(blockParsers); } + /** + * Continue parsing at the specified index. + * + * @param newIndex the new index, see {@link ParserState#getIndex()} + */ public abstract BlockStart atIndex(int newIndex); + /** + * Continue parsing at the specified column (for tab handling). + * + * @param newColumn the new column, see {@link ParserState#getColumn()} + */ public abstract BlockStart atColumn(int newColumn); + /** + * @deprecated use {@link #replaceParagraphLines(int)} instead; please raise an issue if that doesn't work for you + * for some reason. + */ + @Deprecated public abstract BlockStart replaceActiveBlockParser(); + /** + * Replace a number of lines from the current paragraph (as returned by + * {@link MatchedBlockParser#getParagraphLines()}) with the new block. 
+ * <p> + * This is useful for parsing blocks that start with normal paragraphs and only have special marker syntax in later + * lines, e.g. in this: + * <pre> + * Foo + * === + * </pre> + * The <code>Foo</code> line is initially parsed as a normal paragraph, then <code>===</code> is parsed as a heading + * marker, replacing the 1 paragraph line before. The end result is a single Heading block. + * <p> + * Note that source spans from the replaced lines are automatically added to the new block. + * + * @param lines the number of lines to replace (at least 1); use {@link Integer#MAX_VALUE} to replace the whole + * paragraph + */ + public abstract BlockStart replaceParagraphLines(int lines); + } diff --git a/commonmark/src/main/java/org/commonmark/parser/block/MatchedBlockParser.java b/commonmark/src/main/java/org/commonmark/parser/block/MatchedBlockParser.java index 4f8c7e8fd..c4619d8c2 100644 --- a/commonmark/src/main/java/org/commonmark/parser/block/MatchedBlockParser.java +++ b/commonmark/src/main/java/org/commonmark/parser/block/MatchedBlockParser.java @@ -1,5 +1,7 @@ package org.commonmark.parser.block; +import org.commonmark.parser.SourceLines; + /** * Open block parser that was last matched during the continue phase. This is different from the currently active * block parser, as an unmatched block is only closed when a new block is started. @@ -10,9 +12,11 @@ public interface MatchedBlockParser { BlockParser getMatchedBlockParser(); /** - * @return the first line of the paragraph if the matched block is a paragraph and we're on the second line, - * null otherwise + * Returns the current paragraph lines if the matched block is a paragraph. If you want to use some or all of the + * lines for starting a new block instead, use {@link BlockStart#replaceParagraphLines(int)}. 
+ * + * @return paragraph content or an empty list */ - CharSequence getParagraphStartLine(); + SourceLines getParagraphLines(); } diff --git a/commonmark/src/main/java/org/commonmark/parser/block/ParserState.java b/commonmark/src/main/java/org/commonmark/parser/block/ParserState.java index 8c63e964e..b32bbaee5 100644 --- a/commonmark/src/main/java/org/commonmark/parser/block/ParserState.java +++ b/commonmark/src/main/java/org/commonmark/parser/block/ParserState.java @@ -1,5 +1,7 @@ package org.commonmark.parser.block; +import org.commonmark.parser.SourceLine; + /** * State of the parser that is used in block parsers. * <p><em>This interface is not intended to be implemented by clients.</em></p> @@ -7,9 +9,9 @@ public interface ParserState { /** - * @return the current line + * @return the current source line being parsed (full line) */ - CharSequence getLine(); + SourceLine getLine(); /** * @return the current index within the line (0-based) diff --git a/commonmark/src/main/java/org/commonmark/parser/block/package-info.java b/commonmark/src/main/java/org/commonmark/parser/block/package-info.java new file mode 100644 index 000000000..095d4d565 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/block/package-info.java @@ -0,0 +1,4 @@ +/** + * Types for extending block parsing + */ +package org.commonmark.parser.block; diff --git a/commonmark/src/main/java/org/commonmark/parser/delimiter/DelimiterProcessor.java b/commonmark/src/main/java/org/commonmark/parser/delimiter/DelimiterProcessor.java new file mode 100644 index 000000000..3b6abf214 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/delimiter/DelimiterProcessor.java @@ -0,0 +1,46 @@ +package org.commonmark.parser.delimiter; + +import org.commonmark.node.Text; + +/** + * Custom delimiter processor for additional delimiters besides {@code _} and {@code *}. + * <p> + * Note that implementations of this need to be thread-safe, the same instance may be used by multiple parsers. 
+ * + * @see org.commonmark.parser.beta.InlineContentParserFactory + */ +public interface DelimiterProcessor { + + /** + * @return the character that marks the beginning of a delimited node, must not clash with any built-in special + * characters + */ + char getOpeningCharacter(); + + /** + * @return the character that marks the ending of a delimited node, must not clash with any built-in special + * characters. Note that for a symmetric delimiter such as "*", this is the same as the opening. + */ + char getClosingCharacter(); + + /** + * Minimum number of delimiter characters that are needed to activate this. Must be at least 1. + */ + int getMinLength(); + + /** + * Process the delimiter runs. + * <p> + * The processor can examine the runs and the nodes and decide if it wants to process or not. If not, it should not + * change any nodes and return 0. If yes, it should do the processing (wrapping nodes, etc) and then return how many + * delimiters were used. + * <p> + * Note that removal (unlinking) of the used delimiter {@link Text} nodes is done by the caller. + * + * @param openingRun the opening delimiter run + * @param closingRun the closing delimiter run + * @return how many delimiters were used; must not be greater than length of either opener or closer + */ + int process(DelimiterRun openingRun, DelimiterRun closingRun); + +} diff --git a/commonmark/src/main/java/org/commonmark/parser/delimiter/DelimiterRun.java b/commonmark/src/main/java/org/commonmark/parser/delimiter/DelimiterRun.java new file mode 100644 index 000000000..578eac96b --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/delimiter/DelimiterRun.java @@ -0,0 +1,58 @@ +package org.commonmark.parser.delimiter; + +import org.commonmark.node.Text; + +/** + * A delimiter run is one or more of the same delimiter character, e.g. {@code ***}.
+ */ +public interface DelimiterRun { + + /** + * @return whether this can open a delimiter + */ + boolean canOpen(); + + /** + * @return whether this can close a delimiter + */ + boolean canClose(); + + /** + * @return the number of characters in this delimiter run (that are left for processing) + */ + int length(); + + /** + * @return the number of characters originally in this delimiter run; at the start of processing, this is the same + * as {@link #length()} + */ + int originalLength(); + + /** + * @return the innermost opening delimiter, e.g. for {@code ***} this is the last {@code *} + */ + Text getOpener(); + + /** + * @return the innermost closing delimiter, e.g. for {@code ***} this is the first {@code *} + */ + Text getCloser(); + + /** + * Get the opening delimiter nodes for the specified length of delimiters. Length must be between 1 and + * {@link #length()}. + * <p> + * For example, for a delimiter run {@code ***}, calling this with 1 would return the last {@code *}. + * Calling it with 2 would return the second last {@code *} and the last {@code *}. + */ + Iterable<Text> getOpeners(int length); + + /** + * Get the closing delimiter nodes for the specified length of delimiters. Length must be between 1 and + * {@link #length()}. + * <p> + * For example, for a delimiter run {@code ***}, calling this with 1 would return the first {@code *}. + * Calling it with 2 would return the first {@code *} and the second {@code *}.
+ */ + Iterable<Text> getClosers(int length); +} diff --git a/commonmark/src/main/java/org/commonmark/parser/package-info.java b/commonmark/src/main/java/org/commonmark/parser/package-info.java new file mode 100644 index 000000000..2afb3b96d --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/parser/package-info.java @@ -0,0 +1,4 @@ +/** + * Parsing input text to AST nodes (see {@link org.commonmark.parser.Parser}) + */ +package org.commonmark.parser; diff --git a/commonmark/src/main/java/org/commonmark/renderer/NodeRenderer.java b/commonmark/src/main/java/org/commonmark/renderer/NodeRenderer.java new file mode 100644 index 000000000..4ae4b5dcd --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/NodeRenderer.java @@ -0,0 +1,39 @@ +package org.commonmark.renderer; + +import org.commonmark.node.Node; + +import java.util.Set; + +/** + * A renderer for a set of node types. + */ +public interface NodeRenderer { + + /** + * @return the types of nodes that this renderer handles + */ + Set<Class<? extends Node>> getNodeTypes(); + + /** + * Render the specified node. + * + * @param node the node to render, will be an instance of one of {@link #getNodeTypes()} + */ + void render(Node node); + + /** + * Called before the root node is rendered, to do any initial processing at the start. + * + * @param rootNode the root (top-level) node + */ + default void beforeRoot(Node rootNode) { + } + + /** + * Called after the root node is rendered, to do any final processing at the end. 
+ * + * @param rootNode the root (top-level) node + */ + default void afterRoot(Node rootNode) { + } +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/Renderer.java b/commonmark/src/main/java/org/commonmark/renderer/Renderer.java new file mode 100644 index 000000000..42740d91a --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/Renderer.java @@ -0,0 +1,22 @@ +package org.commonmark.renderer; + +import org.commonmark.node.Node; + +public interface Renderer { + + /** + * Render the tree of nodes to output. + * + * @param node the root node + * @param output output for rendering + */ + void render(Node node, Appendable output); + + /** + * Render the tree of nodes to string. + * + * @param node the root node + * @return the rendered string + */ + String render(Node node); +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/html/AttributeProvider.java b/commonmark/src/main/java/org/commonmark/renderer/html/AttributeProvider.java new file mode 100644 index 000000000..24a471d46 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/html/AttributeProvider.java @@ -0,0 +1,29 @@ +package org.commonmark.renderer.html; + +import org.commonmark.node.Node; + +import java.util.Map; + +/** + * Extension point for adding/changing attributes on HTML tags for a node. + */ +public interface AttributeProvider { + + /** + * Set the attributes for a HTML tag of the specified node by modifying the provided map. + * <p> + * This allows to change or even remove default attributes. With great power comes great responsibility. + * <p> + * The attribute key and values will be escaped (preserving character entities), so don't escape them here, + * otherwise they will be double-escaped. + * <p> + * This method may be called multiple times for the same node, if the node is rendered using multiple nested + * tags (e.g. code blocks). 
+ * + * @param node the node to set attributes for + * @param tagName the HTML tag name that these attributes are for (e.g. {@code h1}, {@code pre}, {@code code}). + * @param attributes the attributes, with any default attributes already set in the map + */ + void setAttributes(Node node, String tagName, Map<String, String> attributes); + +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/html/AttributeProviderContext.java b/commonmark/src/main/java/org/commonmark/renderer/html/AttributeProviderContext.java new file mode 100644 index 000000000..0959932bc --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/html/AttributeProviderContext.java @@ -0,0 +1,9 @@ +package org.commonmark.renderer.html; + +/** + * The context for attribute providers. + * <p>Note: There are currently no methods here, this is for future extensibility.</p> + * <p><em>This interface is not intended to be implemented by clients.</em></p> + */ +public interface AttributeProviderContext { +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/html/AttributeProviderFactory.java b/commonmark/src/main/java/org/commonmark/renderer/html/AttributeProviderFactory.java new file mode 100644 index 000000000..d4c12ca06 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/html/AttributeProviderFactory.java @@ -0,0 +1,15 @@ +package org.commonmark.renderer.html; + +/** + * Factory for instantiating new attribute providers when rendering is done. + */ +public interface AttributeProviderFactory { + + /** + * Create a new attribute provider. 
+ * + * @param context for this attribute provider + * @return an AttributeProvider + */ + AttributeProvider create(AttributeProviderContext context); +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/html/CoreHtmlNodeRenderer.java b/commonmark/src/main/java/org/commonmark/renderer/html/CoreHtmlNodeRenderer.java new file mode 100644 index 000000000..5c536558e --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/html/CoreHtmlNodeRenderer.java @@ -0,0 +1,329 @@ +package org.commonmark.renderer.html; + +import org.commonmark.node.*; +import org.commonmark.renderer.NodeRenderer; + +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; + +/** + * The node renderer that renders all the core nodes (comes last in the order of node renderers). + */ +public class CoreHtmlNodeRenderer extends AbstractVisitor implements NodeRenderer { + + protected final HtmlNodeRendererContext context; + private final HtmlWriter html; + + public CoreHtmlNodeRenderer(HtmlNodeRendererContext context) { + this.context = context; + this.html = context.getWriter(); + } + + @Override + public Set<Class<? 
extends Node>> getNodeTypes() { + return Set.of( + Document.class, + Heading.class, + Paragraph.class, + BlockQuote.class, + BulletList.class, + FencedCodeBlock.class, + HtmlBlock.class, + ThematicBreak.class, + IndentedCodeBlock.class, + Link.class, + ListItem.class, + OrderedList.class, + Image.class, + Emphasis.class, + StrongEmphasis.class, + Text.class, + Code.class, + HtmlInline.class, + SoftLineBreak.class, + HardLineBreak.class + ); + } + + @Override + public void render(Node node) { + node.accept(this); + } + + @Override + public void visit(Document document) { + // No rendering itself + visitChildren(document); + } + + @Override + public void visit(Heading heading) { + String htag = "h" + heading.getLevel(); + html.line(); + html.tag(htag, getAttrs(heading, htag)); + visitChildren(heading); + html.tag('/' + htag); + html.line(); + } + + @Override + public void visit(Paragraph paragraph) { + boolean omitP = isInTightList(paragraph) || // + (context.shouldOmitSingleParagraphP() && paragraph.getParent() instanceof Document && // + paragraph.getPrevious() == null && paragraph.getNext() == null); + if (!omitP) { + html.line(); + html.tag("p", getAttrs(paragraph, "p")); + } + visitChildren(paragraph); + if (!omitP) { + html.tag("/p"); + html.line(); + } + } + + @Override + public void visit(BlockQuote blockQuote) { + html.line(); + html.tag("blockquote", getAttrs(blockQuote, "blockquote")); + html.line(); + visitChildren(blockQuote); + html.line(); + html.tag("/blockquote"); + html.line(); + } + + @Override + public void visit(BulletList bulletList) { + renderListBlock(bulletList, "ul", getAttrs(bulletList, "ul")); + } + + @Override + public void visit(FencedCodeBlock fencedCodeBlock) { + String literal = fencedCodeBlock.getLiteral(); + Map<String, String> attributes = new LinkedHashMap<>(); + String info = fencedCodeBlock.getInfo(); + if (info != null && !info.isEmpty()) { + int space = info.indexOf(" "); + String language; + if (space == -1) { + language = 
info; + } else { + language = info.substring(0, space); + } + attributes.put("class", "language-" + language); + } + renderCodeBlock(literal, fencedCodeBlock, attributes); + } + + @Override + public void visit(HtmlBlock htmlBlock) { + html.line(); + if (context.shouldEscapeHtml()) { + html.tag("p", getAttrs(htmlBlock, "p")); + html.text(htmlBlock.getLiteral()); + html.tag("/p"); + } else { + html.raw(htmlBlock.getLiteral()); + } + html.line(); + } + + @Override + public void visit(ThematicBreak thematicBreak) { + html.line(); + html.tag("hr", getAttrs(thematicBreak, "hr"), true); + html.line(); + } + + @Override + public void visit(IndentedCodeBlock indentedCodeBlock) { + renderCodeBlock(indentedCodeBlock.getLiteral(), indentedCodeBlock, Map.of()); + } + + @Override + public void visit(Link link) { + Map<String, String> attrs = new LinkedHashMap<>(); + String url = link.getDestination(); + + if (context.shouldSanitizeUrls()) { + url = context.urlSanitizer().sanitizeLinkUrl(url); + attrs.put("rel", "nofollow"); + } + + url = context.encodeUrl(url); + attrs.put("href", url); + if (link.getTitle() != null) { + attrs.put("title", link.getTitle()); + } + html.tag("a", getAttrs(link, "a", attrs)); + visitChildren(link); + html.tag("/a"); + } + + @Override + public void visit(ListItem listItem) { + html.tag("li", getAttrs(listItem, "li")); + visitChildren(listItem); + html.tag("/li"); + html.line(); + } + + @Override + public void visit(OrderedList orderedList) { + int start = orderedList.getMarkerStartNumber() != null ? 
orderedList.getMarkerStartNumber() : 1; + Map<String, String> attrs = new LinkedHashMap<>(); + if (start != 1) { + attrs.put("start", String.valueOf(start)); + } + renderListBlock(orderedList, "ol", getAttrs(orderedList, "ol", attrs)); + } + + @Override + public void visit(Image image) { + String url = image.getDestination(); + + AltTextVisitor altTextVisitor = new AltTextVisitor(); + image.accept(altTextVisitor); + String altText = altTextVisitor.getAltText(); + + Map<String, String> attrs = new LinkedHashMap<>(); + if (context.shouldSanitizeUrls()) { + url = context.urlSanitizer().sanitizeImageUrl(url); + } + + attrs.put("src", context.encodeUrl(url)); + attrs.put("alt", altText); + if (image.getTitle() != null) { + attrs.put("title", image.getTitle()); + } + + html.tag("img", getAttrs(image, "img", attrs), true); + } + + @Override + public void visit(Emphasis emphasis) { + html.tag("em", getAttrs(emphasis, "em")); + visitChildren(emphasis); + html.tag("/em"); + } + + @Override + public void visit(StrongEmphasis strongEmphasis) { + html.tag("strong", getAttrs(strongEmphasis, "strong")); + visitChildren(strongEmphasis); + html.tag("/strong"); + } + + @Override + public void visit(Text text) { + html.text(text.getLiteral()); + } + + @Override + public void visit(Code code) { + html.tag("code", getAttrs(code, "code")); + html.text(code.getLiteral()); + html.tag("/code"); + } + + @Override + public void visit(HtmlInline htmlInline) { + if (context.shouldEscapeHtml()) { + html.text(htmlInline.getLiteral()); + } else { + html.raw(htmlInline.getLiteral()); + } + } + + @Override + public void visit(SoftLineBreak softLineBreak) { + html.raw(context.getSoftbreak()); + } + + @Override + public void visit(HardLineBreak hardLineBreak) { + html.tag("br", getAttrs(hardLineBreak, "br"), true); + html.line(); + } + + @Override + protected void visitChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + 
context.render(node); + node = next; + } + } + + private void renderCodeBlock(String literal, Node node, Map<String, String> attributes) { + html.line(); + html.tag("pre", getAttrs(node, "pre")); + html.tag("code", getAttrs(node, "code", attributes)); + html.text(literal); + html.tag("/code"); + html.tag("/pre"); + html.line(); + } + + private void renderListBlock(ListBlock listBlock, String tagName, Map<String, String> attributes) { + html.line(); + html.tag(tagName, attributes); + html.line(); + visitChildren(listBlock); + html.line(); + html.tag('/' + tagName); + html.line(); + } + + private boolean isInTightList(Paragraph paragraph) { + Node parent = paragraph.getParent(); + if (parent != null) { + Node gramps = parent.getParent(); + if (gramps instanceof ListBlock) { + ListBlock list = (ListBlock) gramps; + return list.isTight(); + } + } + return false; + } + + private Map<String, String> getAttrs(Node node, String tagName) { + return getAttrs(node, tagName, Map.of()); + } + + private Map<String, String> getAttrs(Node node, String tagName, Map<String, String> defaultAttributes) { + return context.extendAttributes(node, tagName, defaultAttributes); + } + + private static class AltTextVisitor extends AbstractVisitor { + + private final StringBuilder sb = new StringBuilder(); + + String getAltText() { + return sb.toString(); + } + + @Override + public void visit(Text text) { + sb.append(text.getLiteral()); + } + + @Override + public void visit(Code code) { + sb.append(code.getLiteral()); + } + + @Override + public void visit(SoftLineBreak softLineBreak) { + sb.append('\n'); + } + + @Override + public void visit(HardLineBreak hardLineBreak) { + sb.append('\n'); + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/html/DefaultUrlSanitizer.java b/commonmark/src/main/java/org/commonmark/renderer/html/DefaultUrlSanitizer.java new file mode 100644 index 000000000..4c5bed12c --- /dev/null +++ 
b/commonmark/src/main/java/org/commonmark/renderer/html/DefaultUrlSanitizer.java @@ -0,0 +1,80 @@ +package org.commonmark.renderer.html; + +import java.util.*; + +/** + * Default {@link UrlSanitizer} that keeps a URL only if it has no protocol or its protocol is one of the allowed ones; any other URL is replaced by the empty string. + * Allows http, https, mailto, and data protocols by default. Protocols are matched in lower case (independent of the default locale), so custom protocols passed to the constructor must be lower case. + * Also allows protocol relative urls, and relative urls. + * Implementation based on https://github.com/OWASP/java-html-sanitizer/blob/f07e44b034a45d94d6fd010279073c38b6933072/src/main/java/org/owasp/html/FilterUrlByProtocolAttributePolicy.java + */ +public class DefaultUrlSanitizer implements UrlSanitizer { + private final Set<String> protocols; + + public DefaultUrlSanitizer() { + this(List.of("http", "https", "mailto", "data")); + } + + public DefaultUrlSanitizer(Collection<String> protocols) { + this.protocols = new HashSet<>(protocols); + } + + @Override + public String sanitizeLinkUrl(String url) { + url = stripHtmlSpaces(url); + protocol_loop: + for (int i = 0, n = url.length(); i < n; ++i) { + switch (url.charAt(i)) { + case '/': + case '#': + case '?': // No protocol. 
+ break protocol_loop; + case ':': + String protocol = url.substring(0, i).toLowerCase(Locale.ROOT); + if (!protocols.contains(protocol)) { + return ""; + } + break protocol_loop; + } + } + return url; + } + + + @Override + public String sanitizeImageUrl(String url) { + return sanitizeLinkUrl(url); + } + + private String stripHtmlSpaces(String s) { + int i = 0, n = s.length(); + for (; n > i; --n) { + if (!isHtmlSpace(s.charAt(n - 1))) { + break; + } + } + for (; i < n; ++i) { + if (!isHtmlSpace(s.charAt(i))) { + break; + } + } + if (i == 0 && n == s.length()) { + return s; + } + return s.substring(i, n); + } + + private boolean isHtmlSpace(int ch) { + switch (ch) { + case ' ': + case '\t': + case '\n': + case '\u000c': + case '\r': + return true; + default: + return false; + + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/html/HtmlNodeRendererContext.java b/commonmark/src/main/java/org/commonmark/renderer/html/HtmlNodeRendererContext.java new file mode 100644 index 000000000..eecff0f44 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/html/HtmlNodeRendererContext.java @@ -0,0 +1,66 @@ +package org.commonmark.renderer.html; + +import org.commonmark.node.Image; +import org.commonmark.node.Link; +import org.commonmark.node.Node; + +import java.util.Map; + +public interface HtmlNodeRendererContext { + + /** + * @param url to be encoded + * @return an encoded URL (depending on the configuration) + */ + String encodeUrl(String url); + + /** + * Let extensions modify the HTML tag attributes. + * + * @param node the node for which the attributes are applied + * @param tagName the HTML tag name that these attributes are for (e.g. {@code h1}, {@code pre}, {@code code}). 
+ * @param attributes the attributes that were calculated by the renderer + * @return the extended attributes with added/updated/removed entries + */ + Map<String, String> extendAttributes(Node node, String tagName, Map<String, String> attributes); + + /** + * @return the HTML writer to use + */ + HtmlWriter getWriter(); + + /** + * @return HTML that should be rendered for a soft line break + */ + String getSoftbreak(); + + /** + * Render the specified node and its children using the configured renderers. This should be used to render child + * nodes; be careful not to pass the node that is being rendered, that would result in an endless loop. + * + * @param node the node to render + */ + void render(Node node); + + /** + * @return whether HTML blocks and tags should be escaped or not + */ + boolean shouldEscapeHtml(); + + /** + * @return whether documents that only contain a single paragraph should be rendered without the {@code <p>} tag + */ + boolean shouldOmitSingleParagraphP(); + + /** + * @return true if the {@link UrlSanitizer} should be used. + * @since 0.14.0 + */ + boolean shouldSanitizeUrls(); + + /** + * @return Sanitizer to use for securing {@link Link} href and {@link Image} src if {@link #shouldSanitizeUrls()} is true. + * @since 0.14.0 + */ + UrlSanitizer urlSanitizer(); +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/html/HtmlNodeRendererFactory.java b/commonmark/src/main/java/org/commonmark/renderer/html/HtmlNodeRendererFactory.java new file mode 100644 index 000000000..8a343bf0f --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/html/HtmlNodeRendererFactory.java @@ -0,0 +1,17 @@ +package org.commonmark.renderer.html; + +import org.commonmark.renderer.NodeRenderer; + +/** + * Factory for instantiating new node renderers when rendering is done. + */ +public interface HtmlNodeRendererFactory { + + /** + * Create a new node renderer for the specified rendering context. 
+ * + * @param context the context for rendering (normally passed on to the node renderer) + * @return a node renderer + */ + NodeRenderer create(HtmlNodeRendererContext context); +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/html/HtmlRenderer.java b/commonmark/src/main/java/org/commonmark/renderer/html/HtmlRenderer.java new file mode 100644 index 000000000..386abebf0 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/html/HtmlRenderer.java @@ -0,0 +1,319 @@ +package org.commonmark.renderer.html; + +import org.commonmark.Extension; +import org.commonmark.internal.renderer.NodeRendererMap; +import org.commonmark.internal.util.Escaping; +import org.commonmark.node.*; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.Renderer; + +import java.util.*; + +/** + * Renders a tree of nodes to HTML. + * <p> + * Start with the {@link #builder} method to configure the renderer. Example: + * <pre><code> + * HtmlRenderer renderer = HtmlRenderer.builder().escapeHtml(true).build(); + * renderer.render(node); + * </code></pre> + */ +public class HtmlRenderer implements Renderer { + + private final String softbreak; + private final boolean escapeHtml; + private final boolean percentEncodeUrls; + private final boolean omitSingleParagraphP; + private final boolean sanitizeUrls; + private final UrlSanitizer urlSanitizer; + private final List<AttributeProviderFactory> attributeProviderFactories; + private final List<HtmlNodeRendererFactory> nodeRendererFactories; + + private HtmlRenderer(Builder builder) { + this.softbreak = builder.softbreak; + this.escapeHtml = builder.escapeHtml; + this.percentEncodeUrls = builder.percentEncodeUrls; + this.omitSingleParagraphP = builder.omitSingleParagraphP; + this.sanitizeUrls = builder.sanitizeUrls; + this.urlSanitizer = builder.urlSanitizer; + this.attributeProviderFactories = new ArrayList<>(builder.attributeProviderFactories); + + this.nodeRendererFactories = new 
ArrayList<>(builder.nodeRendererFactories.size() + 1); + this.nodeRendererFactories.addAll(builder.nodeRendererFactories); + // Add as last. This means clients can override the rendering of core nodes if they want. + this.nodeRendererFactories.add(new HtmlNodeRendererFactory() { + @Override + public NodeRenderer create(HtmlNodeRendererContext context) { + return new CoreHtmlNodeRenderer(context); + } + }); + } + + /** + * Create a new builder for configuring an {@link HtmlRenderer}. + * + * @return a builder + */ + public static Builder builder() { + return new Builder(); + } + + @Override + public void render(Node node, Appendable output) { + Objects.requireNonNull(node, "node must not be null"); + RendererContext context = new RendererContext(new HtmlWriter(output)); + context.beforeRoot(node); + context.render(node); + context.afterRoot(node); + } + + @Override + public String render(Node node) { + Objects.requireNonNull(node, "node must not be null"); + StringBuilder sb = new StringBuilder(); + render(node, sb); + return sb.toString(); + } + + /** + * Builder for configuring an {@link HtmlRenderer}. See methods for default configuration. + */ + public static class Builder { + + private String softbreak = "\n"; + private boolean escapeHtml = false; + private boolean sanitizeUrls = false; + private UrlSanitizer urlSanitizer = new DefaultUrlSanitizer(); + private boolean percentEncodeUrls = false; + private boolean omitSingleParagraphP = false; + private List<AttributeProviderFactory> attributeProviderFactories = new ArrayList<>(); + private List<HtmlNodeRendererFactory> nodeRendererFactories = new ArrayList<>(); + + /** + * @return the configured {@link HtmlRenderer} + */ + public HtmlRenderer build() { + return new HtmlRenderer(this); + } + + /** + * The HTML to use for rendering a softbreak, defaults to {@code "\n"} (meaning the rendered result doesn't have + * a line break). + * <p> + * Set it to {@code "<br>"} (or {@code "<br />"}) to make them hard breaks. 
+ * <p> + * Set it to {@code " "} to ignore line wrapping in the source. + * + * @param softbreak HTML for softbreak + * @return {@code this} + */ + public Builder softbreak(String softbreak) { + this.softbreak = softbreak; + return this; + } + + /** + * Whether {@link HtmlInline} and {@link HtmlBlock} should be escaped, defaults to {@code false}. + * <p> + * Note that {@link HtmlInline} is only a tag itself, not the text between an opening tag and a closing tag. So + * markup in the text will be parsed as normal and is not affected by this option. + * + * @param escapeHtml true for escaping, false for preserving raw HTML + * @return {@code this} + */ + public Builder escapeHtml(boolean escapeHtml) { + this.escapeHtml = escapeHtml; + return this; + } + + /** + * Whether {@link Image} src and {@link Link} href should be sanitized, defaults to {@code false}. + * + * @param sanitizeUrls true for sanitization, false for preserving raw attribute + * @return {@code this} + * @since 0.14.0 + */ + public Builder sanitizeUrls(boolean sanitizeUrls) { + this.sanitizeUrls = sanitizeUrls; + return this; + } + + /** + * {@link UrlSanitizer} used to filter URLs if {@link #sanitizeUrls} is true. + * + * @param urlSanitizer sanitizer used to filter {@link Image} src and {@link Link} href + * @return {@code this} + * @since 0.14.0 + */ + public Builder urlSanitizer(UrlSanitizer urlSanitizer) { + this.urlSanitizer = urlSanitizer; + return this; + } + + /** + * Whether URLs of links or images should be percent-encoded, defaults to {@code false}. + * <p> + * If enabled, the following is done: + * <ul> + * <li>Existing percent-encoded parts are preserved (e.g. 
"%20" is kept as "%20")</li> + * <li>Reserved characters such as "/" are preserved, except for "[" and "]" (see encodeURI in JS)</li> + * <li>Unreserved characters such as "a" are preserved</li> + * <li>Other characters such as umlauts are percent-encoded</li> + * </ul> + * + * @param percentEncodeUrls true to percent-encode, false for leaving as-is + * @return {@code this} + */ + public Builder percentEncodeUrls(boolean percentEncodeUrls) { + this.percentEncodeUrls = percentEncodeUrls; + return this; + } + + /** + * Whether documents that only contain a single paragraph should be rendered without the {@code <p>} tag. Set to + * {@code true} to render without the tag; the default of {@code false} always renders the tag. + * + * @return {@code this} + */ + public Builder omitSingleParagraphP(boolean omitSingleParagraphP) { + this.omitSingleParagraphP = omitSingleParagraphP; + return this; + } + + /** + * Add a factory for an attribute provider for adding/changing HTML attributes to the rendered tags. + * + * @param attributeProviderFactory the attribute provider factory to add + * @return {@code this} + */ + public Builder attributeProviderFactory(AttributeProviderFactory attributeProviderFactory) { + Objects.requireNonNull(attributeProviderFactory, "attributeProviderFactory must not be null"); + this.attributeProviderFactories.add(attributeProviderFactory); + return this; + } + + /** + * Add a factory for instantiating a node renderer (done when rendering). This allows overriding the rendering + * of node types or defining rendering for custom node types. + * <p> + * If multiple node renderers for the same node type are created, the one from the factory that was added first + * "wins". (This is how the rendering for core node types can be overridden; the default rendering comes last.) 
+ * + * @param nodeRendererFactory the factory for creating a node renderer + * @return {@code this} + */ + public Builder nodeRendererFactory(HtmlNodeRendererFactory nodeRendererFactory) { + Objects.requireNonNull(nodeRendererFactory, "nodeRendererFactory must not be null"); + this.nodeRendererFactories.add(nodeRendererFactory); + return this; + } + + /** + * @param extensions extensions to use on this HTML renderer + * @return {@code this} + */ + public Builder extensions(Iterable<? extends Extension> extensions) { + Objects.requireNonNull(extensions, "extensions must not be null"); + for (Extension extension : extensions) { + if (extension instanceof HtmlRendererExtension) { + HtmlRendererExtension htmlRendererExtension = (HtmlRendererExtension) extension; + htmlRendererExtension.extend(this); + } + } + return this; + } + } + + /** + * Extension for {@link HtmlRenderer}. + */ + public interface HtmlRendererExtension extends Extension { + void extend(Builder rendererBuilder); + } + + private class RendererContext implements HtmlNodeRendererContext, AttributeProviderContext { + + private final HtmlWriter htmlWriter; + private final List<AttributeProvider> attributeProviders; + private final NodeRendererMap nodeRendererMap = new NodeRendererMap(); + + private RendererContext(HtmlWriter htmlWriter) { + this.htmlWriter = htmlWriter; + + attributeProviders = new ArrayList<>(attributeProviderFactories.size()); + for (var attributeProviderFactory : attributeProviderFactories) { + attributeProviders.add(attributeProviderFactory.create(this)); + } + + for (var factory : nodeRendererFactories) { + var renderer = factory.create(this); + nodeRendererMap.add(renderer); + } + } + + @Override + public boolean shouldEscapeHtml() { + return escapeHtml; + } + + @Override + public boolean shouldOmitSingleParagraphP() { + return omitSingleParagraphP; + } + + @Override + public boolean shouldSanitizeUrls() { + return sanitizeUrls; + } + + @Override + public UrlSanitizer 
urlSanitizer() { + return urlSanitizer; + } + + @Override + public String encodeUrl(String url) { + if (percentEncodeUrls) { + return Escaping.percentEncodeUrl(url); + } else { + return url; + } + } + + @Override + public Map<String, String> extendAttributes(Node node, String tagName, Map<String, String> attributes) { + Map<String, String> attrs = new LinkedHashMap<>(attributes); + setCustomAttributes(node, tagName, attrs); + return attrs; + } + + @Override + public HtmlWriter getWriter() { + return htmlWriter; + } + + @Override + public String getSoftbreak() { + return softbreak; + } + + @Override + public void render(Node node) { + nodeRendererMap.render(node); + } + + public void beforeRoot(Node node) { + nodeRendererMap.beforeRoot(node); + } + + public void afterRoot(Node node) { + nodeRendererMap.afterRoot(node); + } + + private void setCustomAttributes(Node node, String tagName, Map<String, String> attrs) { + for (AttributeProvider attributeProvider : attributeProviders) { + attributeProvider.setAttributes(node, tagName, attrs); + } + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/html/HtmlWriter.java b/commonmark/src/main/java/org/commonmark/renderer/html/HtmlWriter.java similarity index 55% rename from commonmark/src/main/java/org/commonmark/html/HtmlWriter.java rename to commonmark/src/main/java/org/commonmark/renderer/html/HtmlWriter.java index b849569ee..a4ac05d45 100644 --- a/commonmark/src/main/java/org/commonmark/html/HtmlWriter.java +++ b/commonmark/src/main/java/org/commonmark/renderer/html/HtmlWriter.java @@ -1,43 +1,29 @@ -package org.commonmark.html; +package org.commonmark.renderer.html; import org.commonmark.internal.util.Escaping; import java.io.IOException; -import java.util.Collections; import java.util.Map; -import java.util.regex.Pattern; +import java.util.Objects; public class HtmlWriter { - private static final Map<String, String> NO_ATTRIBUTES = Collections.emptyMap(); - private static final Pattern HTML_TAG_PATTERN = 
Pattern.compile("<[^>]*>"); + private static final Map<String, String> NO_ATTRIBUTES = Map.of(); private final Appendable buffer; - private int nesting = 0; private char lastChar = 0; public HtmlWriter(Appendable out) { + Objects.requireNonNull(out, "out must not be null"); this.buffer = out; } public void raw(String s) { - if (isTagAllowed()) { - append(s); - } else { - append(HTML_TAG_PATTERN.matcher(s).replaceAll("")); - } - } - - public boolean isTagAllowed() { - return nesting == 0; + append(s); } - public void disableTags() { - nesting++; - } - - public void enableTags() { - nesting--; + public void text(String text) { + append(Escaping.escapeHtml(text)); } public void tag(String name) { @@ -48,21 +34,18 @@ public void tag(String name, Map<String, String> attrs) { tag(name, attrs, false); } - // Helper function to produce an HTML tag. public void tag(String name, Map<String, String> attrs, boolean voidElement) { - if (!isTagAllowed()) { - return; - } - append("<"); append(name); if (attrs != null && !attrs.isEmpty()) { - for (Map.Entry<String, String> attrib : attrs.entrySet()) { + for (var attr : attrs.entrySet()) { append(" "); - append(Escaping.escapeHtml(attrib.getKey(), true)); - append("=\""); - append(Escaping.escapeHtml(attrib.getValue(), true)); - append("\""); + append(Escaping.escapeHtml(attr.getKey())); + if (attr.getValue() != null) { + append("=\""); + append(Escaping.escapeHtml(attr.getValue())); + append("\""); + } } } if (voidElement) { diff --git a/commonmark/src/main/java/org/commonmark/renderer/html/UrlSanitizer.java b/commonmark/src/main/java/org/commonmark/renderer/html/UrlSanitizer.java new file mode 100644 index 000000000..fb48ca361 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/html/UrlSanitizer.java @@ -0,0 +1,30 @@ +package org.commonmark.renderer.html; + +import org.commonmark.node.Image; +import org.commonmark.node.Link; + +/** + * Sanitizes urls for img and a elements by whitelisting protocols. 
+ * This is intended to prevent XSS payloads like [Click this totally safe url](javascript:document.xss=true;) + * <p> + * Implementation based on https://github.com/OWASP/java-html-sanitizer/blob/f07e44b034a45d94d6fd010279073c38b6933072/src/main/java/org/owasp/html/FilterUrlByProtocolAttributePolicy.java + * + * @since 0.14.0 + */ +public interface UrlSanitizer { + /** + * Sanitize a url for use in the href attribute of a {@link Link}. + * + * @param url Link to sanitize + * @return Sanitized link + */ + String sanitizeLinkUrl(String url); + + /** + * Sanitize a url for use in the src attribute of a {@link Image}. + * + * @param url Image source url to sanitize + * @return Sanitized image source url + */ + String sanitizeImageUrl(String url); +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/html/package-info.java b/commonmark/src/main/java/org/commonmark/renderer/html/package-info.java new file mode 100644 index 000000000..014a4c69c --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/html/package-info.java @@ -0,0 +1,4 @@ +/** + * HTML rendering (see {@link org.commonmark.renderer.html.HtmlRenderer}) + */ +package org.commonmark.renderer.html; diff --git a/commonmark/src/main/java/org/commonmark/renderer/markdown/CoreMarkdownNodeRenderer.java b/commonmark/src/main/java/org/commonmark/renderer/markdown/CoreMarkdownNodeRenderer.java new file mode 100644 index 000000000..5a81676f4 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/markdown/CoreMarkdownNodeRenderer.java @@ -0,0 +1,554 @@ +package org.commonmark.renderer.markdown; + +import org.commonmark.node.*; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.text.AsciiMatcher; +import org.commonmark.text.CharMatcher; +import org.commonmark.text.Characters; + +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * The node renderer that renders all the core nodes (comes last in the order of 
node renderers). + * <p> + * Note that while sometimes it would be easier to record what kind of syntax was used on parsing (e.g. ATX vs Setext + * heading), this renderer is intended to also work for documents that were created by directly creating + * {@link Node Nodes} instead. So in order to support that, it sometimes needs to do a bit more work. + */ +public class CoreMarkdownNodeRenderer extends AbstractVisitor implements NodeRenderer { + + private final AsciiMatcher textEscape; + private final CharMatcher textEscapeInHeading; + private final CharMatcher linkDestinationNeedsAngleBrackets = + AsciiMatcher.builder().c(' ').c('(').c(')').c('<').c('>').c('\n').c('\\').build(); + private final CharMatcher linkDestinationEscapeInAngleBrackets = + AsciiMatcher.builder().c('<').c('>').c('\n').c('\\').build(); + private final CharMatcher linkTitleEscapeInQuotes = + AsciiMatcher.builder().c('"').c('\n').c('\\').build(); + + private final Pattern orderedListMarkerPattern = Pattern.compile("^([0-9]{1,9})([.)])"); + + protected final MarkdownNodeRendererContext context; + private final MarkdownWriter writer; + /** + * If we're currently within a {@link BulletList} or {@link OrderedList}, this keeps the context of that list. + * It has a parent field so that it can represent a stack (for nested lists). + */ + private ListHolder listHolder; + + public CoreMarkdownNodeRenderer(MarkdownNodeRendererContext context) { + this.context = context; + this.writer = context.getWriter(); + + textEscape = AsciiMatcher.builder().anyOf("[]<>`*_&\n\\").anyOf(context.getSpecialCharacters()).build(); + textEscapeInHeading = AsciiMatcher.builder(textEscape).anyOf("#").build(); + } + + @Override + public Set<Class<? 
extends Node>> getNodeTypes() { + return Set.of( + BlockQuote.class, + BulletList.class, + Code.class, + Document.class, + Emphasis.class, + FencedCodeBlock.class, + HardLineBreak.class, + Heading.class, + HtmlBlock.class, + HtmlInline.class, + Image.class, + IndentedCodeBlock.class, + Link.class, + ListItem.class, + OrderedList.class, + Paragraph.class, + SoftLineBreak.class, + StrongEmphasis.class, + Text.class, + ThematicBreak.class + ); + } + + @Override + public void render(Node node) { + node.accept(this); + } + + @Override + public void visit(Document document) { + // No rendering itself + visitChildren(document); + writer.line(); + } + + @Override + public void visit(ThematicBreak thematicBreak) { + String literal = thematicBreak.getLiteral(); + if (literal == null) { + // Let's use ___ as it doesn't introduce ambiguity with * or - list item markers + literal = "___"; + } + writer.raw(literal); + writer.block(); + } + + @Override + public void visit(Heading heading) { + if (heading.getLevel() <= 2) { + LineBreakVisitor lineBreakVisitor = new LineBreakVisitor(); + heading.accept(lineBreakVisitor); + boolean isMultipleLines = lineBreakVisitor.hasLineBreak(); + + if (isMultipleLines) { + // Setext headings: Can have multiple lines, but only level 1 or 2 + visitChildren(heading); + writer.line(); + if (heading.getLevel() == 1) { + // Note that it would be nice to match the length of the contents instead of just using 3, but that's + // not easy. + writer.raw("==="); + } else { + writer.raw("---"); + } + writer.block(); + return; + } + } + + // ATX headings: Can't have multiple lines, but up to level 6. + for (int i = 0; i < heading.getLevel(); i++) { + writer.raw('#'); + } + writer.raw(' '); + visitChildren(heading); + + writer.block(); + } + + @Override + public void visit(IndentedCodeBlock indentedCodeBlock) { + String literal = indentedCodeBlock.getLiteral(); + // We need to respect line prefixes which is why we need to write it line by line (e.g. 
an indented code block + // within a block quote) + writer.writePrefix(" "); + writer.pushPrefix(" "); + List<String> lines = getLines(literal); + for (int i = 0; i < lines.size(); i++) { + String line = lines.get(i); + writer.raw(line); + if (i != lines.size() - 1) { + writer.line(); + } + } + writer.popPrefix(); + writer.block(); + } + + @Override + public void visit(FencedCodeBlock codeBlock) { + String literal = codeBlock.getLiteral(); + String fenceChar = codeBlock.getFenceCharacter() != null ? codeBlock.getFenceCharacter() : "`"; + int openingFenceLength; + if (codeBlock.getOpeningFenceLength() != null) { + // If we have a known fence length, use it + openingFenceLength = codeBlock.getOpeningFenceLength(); + } else { + // Otherwise, calculate the closing fence length pessimistically, e.g. if the code block itself contains a + // line with ```, we need to use a fence of length 4. If ``` occurs with non-whitespace characters on a + // line, we technically don't need a longer fence, but it's not incorrect to do so. + int fenceCharsInLiteral = findMaxRunLength(fenceChar, literal); + openingFenceLength = Math.max(fenceCharsInLiteral + 1, 3); + } + int closingFenceLength = codeBlock.getClosingFenceLength() != null ? 
codeBlock.getClosingFenceLength() : openingFenceLength; + + String openingFence = repeat(fenceChar, openingFenceLength); + String closingFence = repeat(fenceChar, closingFenceLength); + int indent = codeBlock.getFenceIndent(); + + if (indent > 0) { + String indentPrefix = repeat(" ", indent); + writer.writePrefix(indentPrefix); + writer.pushPrefix(indentPrefix); + } + + writer.raw(openingFence); + if (codeBlock.getInfo() != null) { + writer.raw(codeBlock.getInfo()); + } + writer.line(); + if (!literal.isEmpty()) { + List<String> lines = getLines(literal); + for (String line : lines) { + writer.raw(line); + writer.line(); + } + } + writer.raw(closingFence); + if (indent > 0) { + writer.popPrefix(); + } + writer.block(); + } + + @Override + public void visit(HtmlBlock htmlBlock) { + List<String> lines = getLines(htmlBlock.getLiteral()); + for (int i = 0; i < lines.size(); i++) { + String line = lines.get(i); + writer.raw(line); + if (i != lines.size() - 1) { + writer.line(); + } + } + writer.block(); + } + + @Override + public void visit(Paragraph paragraph) { + visitChildren(paragraph); + writer.block(); + } + + @Override + public void visit(BlockQuote blockQuote) { + writer.writePrefix("> "); + writer.pushPrefix("> "); + visitChildren(blockQuote); + writer.popPrefix(); + writer.block(); + } + + @Override + public void visit(BulletList bulletList) { + writer.pushTight(bulletList.isTight()); + listHolder = new BulletListHolder(listHolder, bulletList); + visitChildren(bulletList); + listHolder = listHolder.parent; + writer.popTight(); + writer.block(); + } + + @Override + public void visit(OrderedList orderedList) { + writer.pushTight(orderedList.isTight()); + listHolder = new OrderedListHolder(listHolder, orderedList); + visitChildren(orderedList); + listHolder = listHolder.parent; + writer.popTight(); + writer.block(); + } + + @Override + public void visit(ListItem listItem) { + int markerIndent = listItem.getMarkerIndent() != null ? 
listItem.getMarkerIndent() : 0; + String marker; + if (listHolder instanceof BulletListHolder) { + BulletListHolder bulletListHolder = (BulletListHolder) listHolder; + marker = repeat(" ", markerIndent) + bulletListHolder.marker; + } else if (listHolder instanceof OrderedListHolder) { + OrderedListHolder orderedListHolder = (OrderedListHolder) listHolder; + marker = repeat(" ", markerIndent) + orderedListHolder.number + orderedListHolder.delimiter; + orderedListHolder.number++; + } else { + throw new IllegalStateException("Unknown list holder type: " + listHolder); + } + Integer contentIndent = listItem.getContentIndent(); + String spaces = contentIndent != null ? repeat(" ", Math.max(contentIndent - marker.length(), 1)) : " "; + writer.writePrefix(marker); + writer.writePrefix(spaces); + writer.pushPrefix(repeat(" ", marker.length() + spaces.length())); + + if (listItem.getFirstChild() == null) { + // Empty list item + writer.block(); + } else { + visitChildren(listItem); + } + + writer.popPrefix(); + } + + @Override + public void visit(Code code) { + String literal = code.getLiteral(); + // If the literal includes backticks, we can surround them by using one more backtick. + int backticks = findMaxRunLength("`", literal); + for (int i = 0; i < backticks + 1; i++) { + writer.raw('`'); + } + // If the literal starts or ends with a backtick, surround it with a single space. + // If it starts and ends with a space (but is not only spaces), add an additional space (otherwise they would + // get removed on parsing). 
+ boolean addSpace = literal.startsWith("`") || literal.endsWith("`") || + (literal.startsWith(" ") && literal.endsWith(" ") && Characters.hasNonSpace(literal)); + if (addSpace) { + writer.raw(' '); + } + writer.raw(literal); + if (addSpace) { + writer.raw(' '); + } + for (int i = 0; i < backticks + 1; i++) { + writer.raw('`'); + } + } + + @Override + public void visit(Emphasis emphasis) { + String delimiter = emphasis.getOpeningDelimiter(); + // Use delimiter that was parsed if available + if (delimiter == null) { + // When emphasis is nested, a different delimiter needs to be used + delimiter = writer.getLastChar() == '*' ? "_" : "*"; + } + writer.raw(delimiter); + super.visit(emphasis); + writer.raw(delimiter); + } + + @Override + public void visit(StrongEmphasis strongEmphasis) { + writer.raw("**"); + super.visit(strongEmphasis); + writer.raw("**"); + } + + @Override + public void visit(Link link) { + writeLinkLike(link.getTitle(), link.getDestination(), link, "["); + } + + @Override + public void visit(Image image) { + writeLinkLike(image.getTitle(), image.getDestination(), image, "!["); + } + + @Override + public void visit(HtmlInline htmlInline) { + writer.raw(htmlInline.getLiteral()); + } + + @Override + public void visit(HardLineBreak hardLineBreak) { + writer.raw(" "); + writer.line(); + } + + @Override + public void visit(SoftLineBreak softLineBreak) { + writer.line(); + } + + @Override + public void visit(Text text) { + // Text is tricky. In Markdown special characters (`-`, `#` etc.) can be escaped (`\-`, `\#` etc.) so that + // they're parsed as plain text. Currently, whether a character was escaped or not is not recorded in the Node, + // so here we don't know. If we just wrote out those characters unescaped, the resulting Markdown would change + // meaning (turn into a list item, heading, etc.). 
+ // You might say "Why not store that in the Node when parsing", but that wouldn't work for the use case where + // nodes are constructed directly instead of via parsing. This renderer needs to work for that too. + // So currently, when in doubt, we escape. For special characters only occurring at the beginning of a line, + // we only escape them then (we wouldn't want to escape every `.` for example). + String literal = text.getLiteral(); + if (writer.isAtLineStart() && !literal.isEmpty()) { + char c = literal.charAt(0); + switch (c) { + case '-': { + // Would be ambiguous with a bullet list marker, escape + writer.raw("\\-"); + literal = literal.substring(1); + break; + } + case '#': { + // Would be ambiguous with an ATX heading, escape + writer.raw("\\#"); + literal = literal.substring(1); + break; + } + case '=': { + // Would be ambiguous with a Setext heading, escape unless it's the first line in the block + if (text.getPrevious() != null) { + writer.raw("\\="); + literal = literal.substring(1); + } + break; + } + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + // Check for ordered list marker + Matcher m = orderedListMarkerPattern.matcher(literal); + if (m.find()) { + writer.raw(m.group(1)); + writer.raw("\\" + m.group(2)); + literal = literal.substring(m.end()); + } + break; + } + case '\t': { + writer.raw("&#9;"); + literal = literal.substring(1); + break; + } + case ' ': { + writer.raw("&#32;"); + literal = literal.substring(1); + break; + } + } + } + + CharMatcher escape = text.getParent() instanceof Heading ? textEscapeInHeading : textEscape; + + if (literal.endsWith("!") && text.getNext() instanceof Link) { + // If we wrote the `!` unescaped, it would turn the link into an image instead. 
+ writer.text(literal.substring(0, literal.length() - 1), escape); + writer.raw("\\!"); + } else { + writer.text(literal, escape); + } + } + + @Override + protected void visitChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + context.render(node); + node = next; + } + } + + private static int findMaxRunLength(String needle, String s) { + int maxRunLength = 0; + int pos = 0; + while (pos < s.length()) { + pos = s.indexOf(needle, pos); + if (pos == -1) { + return maxRunLength; + } + int runLength = 0; + do { + pos += needle.length(); + runLength++; + } while (s.startsWith(needle, pos)); + maxRunLength = Math.max(runLength, maxRunLength); + } + return maxRunLength; + } + + private static boolean contains(String s, CharMatcher charMatcher) { + for (int i = 0; i < s.length(); i++) { + if (charMatcher.matches(s.charAt(i))) { + return true; + } + } + return false; + } + + // Keep for Android compat (String.repeat only available on Android 12 and later) + private static String repeat(String s, int count) { + StringBuilder sb = new StringBuilder(s.length() * count); + for (int i = 0; i < count; i++) { + sb.append(s); + } + return sb.toString(); + } + + private static List<String> getLines(String literal) { + // Without -1, split would discard all trailing empty strings, which is not what we want, e.g. it would + // return the same result for "abc", "abc\n" and "abc\n\n". + // With -1, it returns ["abc"], ["abc", ""] and ["abc", "", ""]. + String[] parts = literal.split("\n", -1); + if (parts[parts.length - 1].isEmpty()) { + // But we don't want the last empty string, as "\n" is used as a line terminator (not a separator), + // so return without the last element. 
+ return List.of(parts).subList(0, parts.length - 1); + } else { + return List.of(parts); + } + } + + private void writeLinkLike(String title, String destination, Node node, String opener) { + writer.raw(opener); + visitChildren(node); + writer.raw(']'); + writer.raw('('); + if (contains(destination, linkDestinationNeedsAngleBrackets)) { + writer.raw('<'); + writer.text(destination, linkDestinationEscapeInAngleBrackets); + writer.raw('>'); + } else { + writer.raw(destination); + } + if (title != null) { + writer.raw(' '); + writer.raw('"'); + writer.text(title, linkTitleEscapeInQuotes); + writer.raw('"'); + } + writer.raw(')'); + } + + private static class ListHolder { + final ListHolder parent; + + protected ListHolder(ListHolder parent) { + this.parent = parent; + } + } + + private static class BulletListHolder extends ListHolder { + final String marker; + + public BulletListHolder(ListHolder parent, BulletList bulletList) { + super(parent); + this.marker = bulletList.getMarker() != null ? bulletList.getMarker() : "-"; + } + } + + private static class OrderedListHolder extends ListHolder { + final String delimiter; + private int number; + + protected OrderedListHolder(ListHolder parent, OrderedList orderedList) { + super(parent); + delimiter = orderedList.getMarkerDelimiter() != null ? orderedList.getMarkerDelimiter() : "."; + number = orderedList.getMarkerStartNumber() != null ? orderedList.getMarkerStartNumber() : 1; + } + } + + /** + * Visits nodes to check if there are any soft or hard line breaks. 
+ */ + private static class LineBreakVisitor extends AbstractVisitor { + private boolean lineBreak = false; + + public boolean hasLineBreak() { + return lineBreak; + } + + @Override + public void visit(SoftLineBreak softLineBreak) { + super.visit(softLineBreak); + lineBreak = true; + } + + @Override + public void visit(HardLineBreak hardLineBreak) { + super.visit(hardLineBreak); + lineBreak = true; + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/markdown/MarkdownNodeRendererContext.java b/commonmark/src/main/java/org/commonmark/renderer/markdown/MarkdownNodeRendererContext.java new file mode 100644 index 000000000..40640d1b4 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/markdown/MarkdownNodeRendererContext.java @@ -0,0 +1,30 @@ +package org.commonmark.renderer.markdown; + +import org.commonmark.node.Node; + +import java.util.Set; + +/** + * Context that is passed to custom node renderers, see {@link MarkdownNodeRendererFactory#create}. + */ +public interface MarkdownNodeRendererContext { + + /** + * @return the writer to use + */ + MarkdownWriter getWriter(); + + /** + * Render the specified node and its children using the configured renderers. This should be used to render child + * nodes; be careful not to pass the node that is being rendered, that would result in an endless loop. 
+ * + * @param node the node to render + */ + void render(Node node); + + /** + * @return additional special characters that need to be escaped if they occur in normal text; currently only ASCII + * characters are allowed + */ + Set<Character> getSpecialCharacters(); +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/markdown/MarkdownNodeRendererFactory.java b/commonmark/src/main/java/org/commonmark/renderer/markdown/MarkdownNodeRendererFactory.java new file mode 100644 index 000000000..14221ea56 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/markdown/MarkdownNodeRendererFactory.java @@ -0,0 +1,25 @@ +package org.commonmark.renderer.markdown; + +import org.commonmark.renderer.NodeRenderer; + +import java.util.Set; + +/** + * Factory for instantiating new node renderers for rendering custom nodes. + */ +public interface MarkdownNodeRendererFactory { + + /** + * Create a new node renderer for the specified rendering context. + * + * @param context the context for rendering (normally passed on to the node renderer) + * @return a node renderer + */ + NodeRenderer create(MarkdownNodeRendererContext context); + + /** + * @return the additional special characters that this factory would like to have escaped in normal text; currently + * only ASCII characters are allowed + */ + Set<Character> getSpecialCharacters(); +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/markdown/MarkdownRenderer.java b/commonmark/src/main/java/org/commonmark/renderer/markdown/MarkdownRenderer.java new file mode 100644 index 000000000..e4996fb08 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/markdown/MarkdownRenderer.java @@ -0,0 +1,161 @@ +package org.commonmark.renderer.markdown; + +import org.commonmark.Extension; +import org.commonmark.internal.renderer.NodeRendererMap; +import org.commonmark.node.Node; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.Renderer; + +import java.util.*; + +/** 
+ * Renders nodes to Markdown (CommonMark syntax); use {@link #builder()} to create a renderer. + * <p> + * Note that it doesn't currently preserve the exact syntax of the original input Markdown (if any): + * <ul> + * <li>Headings are output as ATX headings if possible (multi-line headings need Setext headings)</li> + * <li>Links are always rendered as inline links (no support for reference links yet)</li> + * <li>Escaping might be over-eager, e.g. a plain {@code *} might be escaped + * even though it doesn't need to be in that particular context</li> + * <li>Leading whitespace in paragraphs is not preserved</li> + * </ul> + * However, it should produce Markdown that is semantically equivalent to the input, i.e. if the Markdown was parsed + * again and compared against the original AST, it should be the same (minus bugs). + */ +public class MarkdownRenderer implements Renderer { + + private final List<MarkdownNodeRendererFactory> nodeRendererFactories; + + private MarkdownRenderer(Builder builder) { + this.nodeRendererFactories = new ArrayList<>(builder.nodeRendererFactories.size() + 1); + this.nodeRendererFactories.addAll(builder.nodeRendererFactories); + // Add as last. This means clients can override the rendering of core nodes if they want. + this.nodeRendererFactories.add(new MarkdownNodeRendererFactory() { + @Override + public NodeRenderer create(MarkdownNodeRendererContext context) { + return new CoreMarkdownNodeRenderer(context); + } + + @Override + public Set<Character> getSpecialCharacters() { + return Set.of(); + } + }); + } + + /** + * Create a new builder for configuring a {@link MarkdownRenderer}. 
+ * + * @return a builder + */ + public static Builder builder() { + return new Builder(); + } + + @Override + public void render(Node node, Appendable output) { + RendererContext context = new RendererContext(new MarkdownWriter(output)); + context.render(node); + } + + @Override + public String render(Node node) { + StringBuilder sb = new StringBuilder(); + render(node, sb); + return sb.toString(); + } + + /** + * Builder for configuring a {@link MarkdownRenderer}. See methods for default configuration. + */ + public static class Builder { + + private final List<MarkdownNodeRendererFactory> nodeRendererFactories = new ArrayList<>(); + + /** + * @return the configured {@link MarkdownRenderer} + */ + public MarkdownRenderer build() { + return new MarkdownRenderer(this); + } + + /** + * Add a factory for instantiating a node renderer (done when rendering). This allows to override the rendering + * of node types or define rendering for custom node types. + * <p> + * If multiple node renderers for the same node type are created, the one from the factory that was added first + * "wins". (This is how the rendering for core node types can be overridden; the default rendering comes last.) + * + * @param nodeRendererFactory the factory for creating a node renderer + * @return {@code this} + */ + public Builder nodeRendererFactory(MarkdownNodeRendererFactory nodeRendererFactory) { + this.nodeRendererFactories.add(nodeRendererFactory); + return this; + } + + /** + * @param extensions extensions to use on this renderer + * @return {@code this} + */ + public Builder extensions(Iterable<? extends Extension> extensions) { + for (Extension extension : extensions) { + if (extension instanceof MarkdownRendererExtension) { + MarkdownRendererExtension markdownRendererExtension = (MarkdownRendererExtension) extension; + markdownRendererExtension.extend(this); + } + } + return this; + } + } + + /** + * Extension for {@link MarkdownRenderer} for rendering custom nodes. 
+ */ + public interface MarkdownRendererExtension extends Extension { + + /** + * Extend Markdown rendering, usually by registering custom node renderers using {@link Builder#nodeRendererFactory}. + * + * @param rendererBuilder the renderer builder to extend + */ + void extend(Builder rendererBuilder); + } + + private class RendererContext implements MarkdownNodeRendererContext { + private final MarkdownWriter writer; + private final NodeRendererMap nodeRendererMap = new NodeRendererMap(); + private final Set<Character> additionalTextEscapes; + + private RendererContext(MarkdownWriter writer) { + // Set fields that are used by interface + this.writer = writer; + Set<Character> escapes = new HashSet<>(); + for (MarkdownNodeRendererFactory factory : nodeRendererFactories) { + escapes.addAll(factory.getSpecialCharacters()); + } + additionalTextEscapes = Collections.unmodifiableSet(escapes); + + for (var factory : nodeRendererFactories) { + // Pass in this as context here, which uses the fields set above + var renderer = factory.create(this); + nodeRendererMap.add(renderer); + } + } + + @Override + public MarkdownWriter getWriter() { + return writer; + } + + @Override + public void render(Node node) { + nodeRendererMap.render(node); + } + + @Override + public Set<Character> getSpecialCharacters() { + return additionalTextEscapes; + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/markdown/MarkdownWriter.java b/commonmark/src/main/java/org/commonmark/renderer/markdown/MarkdownWriter.java new file mode 100644 index 000000000..c9f427021 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/markdown/MarkdownWriter.java @@ -0,0 +1,246 @@ +package org.commonmark.renderer.markdown; + +import org.commonmark.text.CharMatcher; + +import java.io.IOException; +import java.util.LinkedList; + +/** + * Writer for Markdown (CommonMark) text. 
+ */ +public class MarkdownWriter { + + private final Appendable buffer; + + private int blockSeparator = 0; + private char lastChar; + private boolean atLineStart = true; + + // Stacks of settings that affect various rendering behaviors. The common pattern here is that callers use "push" to + // change a setting, render some nodes, and then "pop" the setting off the stack again to restore previous state. + private final LinkedList<String> prefixes = new LinkedList<>(); + private final LinkedList<Boolean> tight = new LinkedList<>(); + private final LinkedList<CharMatcher> rawEscapes = new LinkedList<>(); + + public MarkdownWriter(Appendable out) { + buffer = out; + } + + /** + * Write the supplied string (raw/unescaped except if {@link #pushRawEscape} was used). + */ + public void raw(String s) { + flushBlockSeparator(); + write(s, null); + } + + /** + * Write the supplied character (raw/unescaped except if {@link #pushRawEscape} was used). + */ + public void raw(char c) { + flushBlockSeparator(); + write(c); + } + + /** + * Write the supplied string with escaping. + * + * @param s the string to write + * @param escape which characters to escape + */ + public void text(String s, CharMatcher escape) { + if (s.isEmpty()) { + return; + } + flushBlockSeparator(); + write(s, escape); + + lastChar = s.charAt(s.length() - 1); + atLineStart = false; + } + + /** + * Write a newline (line terminator). + */ + public void line() { + write('\n'); + writePrefixes(); + atLineStart = true; + } + + /** + * Enqueue a block separator to be written before the next text is written. Block separators are not written + * straight away because if there are no more blocks to write we don't want a separator (at the end of the document). + */ + public void block() { + // Remember whether this should be a tight or loose separator now because tight could get changed in between + // this and the next flush. + blockSeparator = isTight() ? 
1 : 2; + atLineStart = true; + } + + /** + * Push a prefix onto the top of the stack. All prefixes are written at the beginning of each line, until the + * prefix is popped again. + * + * @param prefix the raw prefix string + */ + public void pushPrefix(String prefix) { + prefixes.addLast(prefix); + } + + /** + * Write a prefix. + * + * @param prefix the raw prefix string to write + */ + public void writePrefix(String prefix) { + boolean tmp = atLineStart; + raw(prefix); + atLineStart = tmp; + } + + /** + * Remove the last prefix from the top of the stack. + */ + public void popPrefix() { + prefixes.removeLast(); + } + + /** + * Change whether blocks are tight or loose. Loose is the default where blocks are separated by a blank line. Tight + * is where blocks are not separated by a blank line. Tight blocks are used in lists, if there are no blank lines + * within the list. + * <p> + * Note that changing this does not affect block separators that have already been enqueued with {@link #block()}, + * only future ones. + */ + public void pushTight(boolean tight) { + this.tight.addLast(tight); + } + + /** + * Remove the last "tight" setting from the top of the stack. + */ + public void popTight() { + this.tight.removeLast(); + } + + /** + * Escape the characters matching the supplied matcher, in all text (text and raw). This might be useful to + * extensions that add another layer of syntax, e.g. the tables extension that uses `|` to separate cells and needs + * all `|` characters to be escaped (even in code spans). + * + * @param rawEscape the characters to escape in raw text + */ + public void pushRawEscape(CharMatcher rawEscape) { + rawEscapes.add(rawEscape); + } + + /** + * Remove the last raw escape from the top of the stack. 
+ */ + public void popRawEscape() { + rawEscapes.removeLast(); + } + + /** + * @return the last character that was written + */ + public char getLastChar() { + return lastChar; + } + + /** + * @return whether we're at the line start (not counting any prefixes), i.e. after a {@link #line} or {@link #block}. + */ + public boolean isAtLineStart() { + return atLineStart; + } + + private void write(String s, CharMatcher escape) { + try { + if (rawEscapes.isEmpty() && escape == null) { + // Normal fast path + buffer.append(s); + } else { + for (int i = 0; i < s.length(); i++) { + append(s.charAt(i), escape); + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } + + int length = s.length(); + if (length != 0) { + lastChar = s.charAt(length - 1); + } + atLineStart = false; + } + + private void write(char c) { + try { + append(c, null); + } catch (IOException e) { + throw new RuntimeException(e); + } + + lastChar = c; + atLineStart = false; + } + + private void writePrefixes() { + if (!prefixes.isEmpty()) { + for (String prefix : prefixes) { + write(prefix, null); + } + } + } + + /** + * If a block separator has been enqueued with {@link #block()} but not yet written, write it now. 
+ */ + private void flushBlockSeparator() { + if (blockSeparator != 0) { + write('\n'); + writePrefixes(); + if (blockSeparator > 1) { + write('\n'); + writePrefixes(); + } + blockSeparator = 0; + } + } + + private void append(char c, CharMatcher escape) throws IOException { + if (needsEscaping(c, escape)) { + if (c == '\n') { + // Can't escape this with \, use numeric character reference + buffer.append("&#10;"); + } else { + buffer.append('\\'); + buffer.append(c); + } + } else { + buffer.append(c); + } + } + + private boolean isTight() { + return !tight.isEmpty() && tight.getLast(); + } + + private boolean needsEscaping(char c, CharMatcher escape) { + return (escape != null && escape.matches(c)) || rawNeedsEscaping(c); + } + + private boolean rawNeedsEscaping(char c) { + for (CharMatcher rawEscape : rawEscapes) { + if (rawEscape.matches(c)) { + return true; + } + } + return false; + } +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/markdown/package-info.java b/commonmark/src/main/java/org/commonmark/renderer/markdown/package-info.java new file mode 100644 index 000000000..f707671d5 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/markdown/package-info.java @@ -0,0 +1,4 @@ +/** + * Markdown rendering (see {@link org.commonmark.renderer.markdown.MarkdownRenderer}) + */ +package org.commonmark.renderer.markdown; diff --git a/commonmark/src/main/java/org/commonmark/renderer/text/CoreTextContentNodeRenderer.java b/commonmark/src/main/java/org/commonmark/renderer/text/CoreTextContentNodeRenderer.java new file mode 100644 index 000000000..ee564cbdb --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/text/CoreTextContentNodeRenderer.java @@ -0,0 +1,336 @@ +package org.commonmark.renderer.text; + +import org.commonmark.node.*; +import org.commonmark.renderer.NodeRenderer; + +import java.util.Set; + +/** + * The node renderer that renders all the core nodes (comes last in the order of node renderers). 
+ */ +public class CoreTextContentNodeRenderer extends AbstractVisitor implements NodeRenderer { + + protected final TextContentNodeRendererContext context; + private final TextContentWriter textContent; + + private ListHolder listHolder; + + public CoreTextContentNodeRenderer(TextContentNodeRendererContext context) { + this.context = context; + this.textContent = context.getWriter(); + } + + @Override + public Set<Class<? extends Node>> getNodeTypes() { + return Set.of( + Document.class, + Heading.class, + Paragraph.class, + BlockQuote.class, + BulletList.class, + FencedCodeBlock.class, + HtmlBlock.class, + ThematicBreak.class, + IndentedCodeBlock.class, + Link.class, + ListItem.class, + OrderedList.class, + Image.class, + Emphasis.class, + StrongEmphasis.class, + Text.class, + Code.class, + HtmlInline.class, + SoftLineBreak.class, + HardLineBreak.class + ); + } + + @Override + public void render(Node node) { + node.accept(this); + } + + @Override + public void visit(Document document) { + // No rendering itself + visitChildren(document); + } + + @Override + public void visit(BlockQuote blockQuote) { + // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + textContent.write('\u00AB'); + visitChildren(blockQuote); + textContent.resetBlock(); + // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + textContent.write('\u00BB'); + + textContent.block(); + } + + @Override + public void visit(BulletList bulletList) { + textContent.pushTight(bulletList.isTight()); + listHolder = new BulletListHolder(listHolder, bulletList); + visitChildren(bulletList); + textContent.popTight(); + textContent.block(); + listHolder = listHolder.getParent(); + } + + @Override + public void visit(Code code) { + textContent.write('\"'); + textContent.write(code.getLiteral()); + textContent.write('\"'); + } + + @Override + public void visit(FencedCodeBlock fencedCodeBlock) { + var literal = stripTrailingNewline(fencedCodeBlock.getLiteral()); + if (stripNewlines()) { + textContent.writeStripped(literal); + } 
else { + textContent.write(literal); + } + textContent.block(); + } + + @Override + public void visit(HardLineBreak hardLineBreak) { + if (stripNewlines()) { + textContent.whitespace(); + } else { + textContent.line(); + } + } + + @Override + public void visit(Heading heading) { + visitChildren(heading); + if (stripNewlines()) { + textContent.write(": "); + } else { + textContent.block(); + } + } + + @Override + public void visit(ThematicBreak thematicBreak) { + if (!stripNewlines()) { + textContent.write("***"); + } + textContent.block(); + } + + @Override + public void visit(HtmlInline htmlInline) { + writeText(htmlInline.getLiteral()); + } + + @Override + public void visit(HtmlBlock htmlBlock) { + writeText(htmlBlock.getLiteral()); + } + + @Override + public void visit(Image image) { + writeLink(image, image.getTitle(), image.getDestination()); + } + + @Override + public void visit(IndentedCodeBlock indentedCodeBlock) { + var literal = stripTrailingNewline(indentedCodeBlock.getLiteral()); + if (stripNewlines()) { + textContent.writeStripped(literal); + } else { + textContent.write(literal); + } + textContent.block(); + } + + @Override + public void visit(Link link) { + writeLink(link, link.getTitle(), link.getDestination()); + } + + @Override + public void visit(ListItem listItem) { + if (listHolder != null && listHolder instanceof OrderedListHolder) { + var orderedListHolder = (OrderedListHolder) listHolder; + var marker = orderedListHolder.getCounter() + orderedListHolder.getDelimiter(); + var spaces = " "; + textContent.write(marker); + textContent.write(spaces); + textContent.pushPrefix(repeat(" ", marker.length() + spaces.length())); + visitChildren(listItem); + textContent.block(); + textContent.popPrefix(); + orderedListHolder.increaseCounter(); + } else if (listHolder != null && listHolder instanceof BulletListHolder) { + BulletListHolder bulletListHolder = (BulletListHolder) listHolder; + if (!stripNewlines()) { + var marker = 
bulletListHolder.getMarker(); + var spaces = " "; + textContent.write(marker); + textContent.write(spaces); + textContent.pushPrefix(repeat(" ", marker.length() + spaces.length())); + } + visitChildren(listItem); + textContent.block(); + if (!stripNewlines()) { + textContent.popPrefix(); + } + } + } + + @Override + public void visit(OrderedList orderedList) { + textContent.pushTight(orderedList.isTight()); + listHolder = new OrderedListHolder(listHolder, orderedList); + visitChildren(orderedList); + textContent.popTight(); + textContent.block(); + listHolder = listHolder.getParent(); + } + + @Override + public void visit(Paragraph paragraph) { + visitChildren(paragraph); + textContent.block(); + } + + @Override + public void visit(SoftLineBreak softLineBreak) { + if (stripNewlines()) { + textContent.whitespace(); + } else { + textContent.line(); + } + } + + @Override + public void visit(Text text) { + writeText(text.getLiteral()); + } + + @Override + protected void visitChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + context.render(node); + node = next; + } + } + + private void writeText(String text) { + if (stripNewlines()) { + textContent.writeStripped(text); + } else { + textContent.write(text); + } + } + + private void writeLink(Node node, String title, String destination) { + boolean hasChild = node.getFirstChild() != null; + boolean hasTitle = title != null && !title.equals(destination); + boolean hasDestination = destination != null && !destination.equals(""); + + if (hasChild) { + textContent.write('"'); + visitChildren(node); + textContent.write('"'); + if (hasTitle || hasDestination) { + textContent.whitespace(); + textContent.write('('); + } + } + + if (hasTitle) { + textContent.write(title); + if (hasDestination) { + textContent.colon(); + textContent.whitespace(); + } + } + + if (hasDestination) { + textContent.write(destination); + } + + if (hasChild && (hasTitle || 
hasDestination)) { + textContent.write(')'); + } + } + + private boolean stripNewlines() { + return context.lineBreakRendering() == LineBreakRendering.STRIP; + } + + private static String stripTrailingNewline(String s) { + if (s.endsWith("\n")) { + return s.substring(0, s.length() - 1); + } else { + return s; + } + } + + // Keep for Android compat (String.repeat only available on Android 12 and later) + private static String repeat(String s, int count) { + var sb = new StringBuilder(s.length() * count); + for (int i = 0; i < count; i++) { + sb.append(s); + } + return sb.toString(); + } + + private static class BulletListHolder extends ListHolder { + private final String marker; + + public BulletListHolder(ListHolder parent, BulletList list) { + super(parent); + marker = list.getMarker(); + } + + public String getMarker() { + return marker; + } + } + + private abstract static class ListHolder { + private final ListHolder parent; + + ListHolder(ListHolder parent) { + this.parent = parent; + } + + public ListHolder getParent() { + return parent; + } + } + + private static class OrderedListHolder extends ListHolder { + private final String delimiter; + private int counter; + + public OrderedListHolder(ListHolder parent, OrderedList list) { + super(parent); + delimiter = list.getMarkerDelimiter() != null ? list.getMarkerDelimiter() : "."; + counter = list.getMarkerStartNumber() != null ? 
list.getMarkerStartNumber() : 1; + } + + public String getDelimiter() { + return delimiter; + } + + public int getCounter() { + return counter; + } + + public void increaseCounter() { + counter++; + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/text/LineBreakRendering.java b/commonmark/src/main/java/org/commonmark/renderer/text/LineBreakRendering.java new file mode 100644 index 000000000..27eeaf0da --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/text/LineBreakRendering.java @@ -0,0 +1,19 @@ +package org.commonmark.renderer.text; + +/** + * Control how line breaks are rendered. + */ +public enum LineBreakRendering { + /** + * Strip all line breaks within blocks and between blocks, resulting in all the text in a single line. + */ + STRIP, + /** + * Use single line breaks between blocks, not a blank line (also render all lists as tight). + */ + COMPACT, + /** + * Separate blocks by a blank line (and respect tight vs loose lists). + */ + SEPARATE_BLOCKS, +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/text/TextContentNodeRendererContext.java b/commonmark/src/main/java/org/commonmark/renderer/text/TextContentNodeRendererContext.java new file mode 100644 index 000000000..d6fcb8d77 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/text/TextContentNodeRendererContext.java @@ -0,0 +1,32 @@ +package org.commonmark.renderer.text; + +import org.commonmark.node.Node; + +public interface TextContentNodeRendererContext { + + /** + * Controls how line breaks should be rendered, see {@link LineBreakRendering}. + */ + LineBreakRendering lineBreakRendering(); + + /** + * @return true for stripping new lines and render text as "single line", + * false for keeping all line breaks. 
+ * @deprecated Use {@link #lineBreakRendering()} instead + */ + @Deprecated + boolean stripNewlines(); + + /** + * @return the writer to use + */ + TextContentWriter getWriter(); + + /** + * Render the specified node and its children using the configured renderers. This should be used to render child + * nodes; be careful not to pass the node that is being rendered, that would result in an endless loop. + * + * @param node the node to render + */ + void render(Node node); +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/text/TextContentNodeRendererFactory.java b/commonmark/src/main/java/org/commonmark/renderer/text/TextContentNodeRendererFactory.java new file mode 100644 index 000000000..bf193dff4 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/text/TextContentNodeRendererFactory.java @@ -0,0 +1,17 @@ +package org.commonmark.renderer.text; + +import org.commonmark.renderer.NodeRenderer; + +/** + * Factory for instantiating new node renderers when rendering is done. + */ +public interface TextContentNodeRendererFactory { + + /** + * Create a new node renderer for the specified rendering context. 
+ * + * @param context the context for rendering (normally passed on to the node renderer) + * @return a node renderer + */ + NodeRenderer create(TextContentNodeRendererContext context); +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/text/TextContentRenderer.java b/commonmark/src/main/java/org/commonmark/renderer/text/TextContentRenderer.java new file mode 100644 index 000000000..d64d0c7ef --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/text/TextContentRenderer.java @@ -0,0 +1,169 @@ +package org.commonmark.renderer.text; + +import org.commonmark.Extension; +import org.commonmark.internal.renderer.NodeRendererMap; +import org.commonmark.node.Node; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.Renderer; + +import java.util.ArrayList; +import java.util.List; + +/** + * Renders nodes to plain text content with minimal markup-like additions. + */ +public class TextContentRenderer implements Renderer { + + private final LineBreakRendering lineBreakRendering; + + private final List<TextContentNodeRendererFactory> nodeRendererFactories; + + private TextContentRenderer(Builder builder) { + this.lineBreakRendering = builder.lineBreakRendering; + + this.nodeRendererFactories = new ArrayList<>(builder.nodeRendererFactories.size() + 1); + this.nodeRendererFactories.addAll(builder.nodeRendererFactories); + // Add as last. This means clients can override the rendering of core nodes if they want. + this.nodeRendererFactories.add(new TextContentNodeRendererFactory() { + @Override + public NodeRenderer create(TextContentNodeRendererContext context) { + return new CoreTextContentNodeRenderer(context); + } + }); + } + + /** + * Create a new builder for configuring a {@link TextContentRenderer}. 
+ * + * @return a builder + */ + public static Builder builder() { + return new Builder(); + } + + @Override + public void render(Node node, Appendable output) { + RendererContext context = new RendererContext(new TextContentWriter(output, lineBreakRendering)); + context.render(node); + } + + @Override + public String render(Node node) { + StringBuilder sb = new StringBuilder(); + render(node, sb); + return sb.toString(); + } + + /** + * Builder for configuring a {@link TextContentRenderer}. See methods for default configuration. + */ + public static class Builder { + + private List<TextContentNodeRendererFactory> nodeRendererFactories = new ArrayList<>(); + private LineBreakRendering lineBreakRendering = LineBreakRendering.COMPACT; + + /** + * @return the configured {@link TextContentRenderer} + */ + public TextContentRenderer build() { + return new TextContentRenderer(this); + } + + /** + * Configure how line breaks (newlines) are rendered, see {@link LineBreakRendering}. + * The default is {@link LineBreakRendering#COMPACT}. + * + * @param lineBreakRendering the mode to use + * @return {@code this} + */ + public Builder lineBreakRendering(LineBreakRendering lineBreakRendering) { + this.lineBreakRendering = lineBreakRendering; + return this; + } + + /** + * Set the value of flag for stripping new lines. + * + * @param stripNewlines true for stripping new lines and render text as "single line", + * false for keeping all line breaks + * @return {@code this} + * @deprecated Use {@link #lineBreakRendering(LineBreakRendering)} with {@link LineBreakRendering#STRIP} instead + */ + @Deprecated + public Builder stripNewlines(boolean stripNewlines) { + this.lineBreakRendering = stripNewlines ? LineBreakRendering.STRIP : LineBreakRendering.COMPACT; + return this; + } + + /** + * Add a factory for instantiating a node renderer (done when rendering). This allows to override the rendering + * of node types or define rendering for custom node types. 
+ * <p> + * If multiple node renderers for the same node type are created, the one from the factory that was added first + * "wins". (This is how the rendering for core node types can be overridden; the default rendering comes last.) + * + * @param nodeRendererFactory the factory for creating a node renderer + * @return {@code this} + */ + public Builder nodeRendererFactory(TextContentNodeRendererFactory nodeRendererFactory) { + this.nodeRendererFactories.add(nodeRendererFactory); + return this; + } + + /** + * @param extensions extensions to use on this text content renderer + * @return {@code this} + */ + public Builder extensions(Iterable<? extends Extension> extensions) { + for (Extension extension : extensions) { + if (extension instanceof TextContentRenderer.TextContentRendererExtension) { + TextContentRenderer.TextContentRendererExtension textContentRendererExtension = + (TextContentRenderer.TextContentRendererExtension) extension; + textContentRendererExtension.extend(this); + } + } + return this; + } + } + + /** + * Extension for {@link TextContentRenderer}. 
+ */ + public interface TextContentRendererExtension extends Extension { + void extend(TextContentRenderer.Builder rendererBuilder); + } + + private class RendererContext implements TextContentNodeRendererContext { + private final TextContentWriter textContentWriter; + private final NodeRendererMap nodeRendererMap = new NodeRendererMap(); + + private RendererContext(TextContentWriter textContentWriter) { + this.textContentWriter = textContentWriter; + + for (var factory : nodeRendererFactories) { + var renderer = factory.create(this); + nodeRendererMap.add(renderer); + } + } + + @Override + public LineBreakRendering lineBreakRendering() { + return lineBreakRendering; + } + + @Override + public boolean stripNewlines() { + return lineBreakRendering == LineBreakRendering.STRIP; + } + + @Override + public TextContentWriter getWriter() { + return textContentWriter; + } + + @Override + public void render(Node node) { + nodeRendererMap.render(node); + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/text/TextContentWriter.java b/commonmark/src/main/java/org/commonmark/renderer/text/TextContentWriter.java new file mode 100644 index 000000000..1fb482785 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/text/TextContentWriter.java @@ -0,0 +1,161 @@ +package org.commonmark.renderer.text; + +import java.io.IOException; +import java.util.LinkedList; + +public class TextContentWriter { + + private final Appendable buffer; + private final LineBreakRendering lineBreakRendering; + + private final LinkedList<String> prefixes = new LinkedList<>(); + private final LinkedList<Boolean> tight = new LinkedList<>(); + + private String blockSeparator = null; + private char lastChar; + + public TextContentWriter(Appendable out) { + this(out, LineBreakRendering.COMPACT); + } + + public TextContentWriter(Appendable out, LineBreakRendering lineBreakRendering) { + this.buffer = out; + this.lineBreakRendering = lineBreakRendering; + } + + public void 
whitespace() { + if (lastChar != 0 && lastChar != ' ') { + write(' '); + } + } + + public void colon() { + if (lastChar != 0 && lastChar != ':') { + write(':'); + } + } + + public void line() { + append('\n'); + writePrefixes(); + } + + public void block() { + blockSeparator = lineBreakRendering == LineBreakRendering.STRIP ? " " : // + lineBreakRendering == LineBreakRendering.COMPACT || isTight() ? "\n" : "\n\n"; + } + + public void resetBlock() { + blockSeparator = null; + } + + public void writeStripped(String s) { + write(s.replaceAll("[\\r\\n\\s]+", " ")); + } + + public void write(String s) { + flushBlockSeparator(); + append(s); + } + + public void write(char c) { + flushBlockSeparator(); + append(c); + } + + /** + * Push a prefix onto the top of the stack. All prefixes are written at the beginning of each line, until the + * prefix is popped again. + * + * @param prefix the raw prefix string + */ + public void pushPrefix(String prefix) { + prefixes.addLast(prefix); + } + + /** + * Write a prefix. + * + * @param prefix the raw prefix string to write + */ + public void writePrefix(String prefix) { + write(prefix); + } + + /** + * Remove the last prefix from the top of the stack. + */ + public void popPrefix() { + prefixes.removeLast(); + } + + /** + * Change whether blocks are tight or loose. Loose is the default where blocks are separated by a blank line. Tight + * is where blocks are not separated by a blank line. Tight blocks are used in lists, if there are no blank lines + * within the list. + * <p> + * Note that changing this does not affect block separators that have already been enqueued with {@link #block()}, + * only future ones. + */ + public void pushTight(boolean tight) { + this.tight.addLast(tight); + } + + /** + * Remove the last "tight" setting from the top of the stack. 
+ */ + public void popTight() { + this.tight.removeLast(); + } + + private boolean isTight() { + return !tight.isEmpty() && tight.getLast(); + } + + private void writePrefixes() { + for (String prefix : prefixes) { + append(prefix); + } + } + + /** + * If a block separator has been enqueued with {@link #block()} but not yet written, write it now. + */ + private void flushBlockSeparator() { + if (blockSeparator != null) { + if (blockSeparator.equals("\n") || blockSeparator.equals("\n\n")) { + for (int i = 0; i < blockSeparator.length(); i++) { + var sep = blockSeparator.charAt(i); + append(sep); + writePrefixes(); + } + } else { + append(blockSeparator); + } + blockSeparator = null; + } + } + + private void append(String s) { + try { + buffer.append(s); + } catch (IOException e) { + throw new RuntimeException(e); + } + + int length = s.length(); + if (length != 0) { + lastChar = s.charAt(length - 1); + } + } + + private void append(char c) { + try { + buffer.append(c); + } catch (IOException e) { + throw new RuntimeException(e); + } + + lastChar = c; + } +} diff --git a/commonmark/src/main/java/org/commonmark/renderer/text/package-info.java b/commonmark/src/main/java/org/commonmark/renderer/text/package-info.java new file mode 100644 index 000000000..8309f4bd6 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/renderer/text/package-info.java @@ -0,0 +1,4 @@ +/** + * Plain text rendering with minimal markup (see {@link org.commonmark.renderer.text.TextContentRenderer}) + */ +package org.commonmark.renderer.text; diff --git a/commonmark/src/main/java/org/commonmark/text/AsciiMatcher.java b/commonmark/src/main/java/org/commonmark/text/AsciiMatcher.java new file mode 100644 index 000000000..0d9cea458 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/text/AsciiMatcher.java @@ -0,0 +1,73 @@ +package org.commonmark.text; + +import java.util.BitSet; +import java.util.Set; + +/** + * Char matcher that can match ASCII characters efficiently. 
+ */ +public class AsciiMatcher implements CharMatcher { + private final BitSet set; + + private AsciiMatcher(Builder builder) { + this.set = builder.set; + } + + @Override + public boolean matches(char c) { + return set.get(c); + } + + public Builder newBuilder() { + return new Builder((BitSet) set.clone()); + } + + public static Builder builder() { + return new Builder(new BitSet()); + } + + public static Builder builder(AsciiMatcher matcher) { + return new Builder((BitSet) matcher.set.clone()); + } + + public static class Builder { + private final BitSet set; + + private Builder(BitSet set) { + this.set = set; + } + + public Builder c(char c) { + if (c > 127) { + throw new IllegalArgumentException("Can only match ASCII characters"); + } + set.set(c); + return this; + } + + public Builder anyOf(String s) { + for (int i = 0; i < s.length(); i++) { + c(s.charAt(i)); + } + return this; + } + + public Builder anyOf(Set<Character> characters) { + for (Character c : characters) { + c(c); + } + return this; + } + + public Builder range(char from, char toInclusive) { + for (char c = from; c <= toInclusive; c++) { + c(c); + } + return this; + } + + public AsciiMatcher build() { + return new AsciiMatcher(this); + } + } +} diff --git a/commonmark/src/main/java/org/commonmark/text/CharMatcher.java b/commonmark/src/main/java/org/commonmark/text/CharMatcher.java new file mode 100644 index 000000000..2833e65c3 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/text/CharMatcher.java @@ -0,0 +1,13 @@ +package org.commonmark.text; + +/** + * Matcher interface for {@code char} values. + * <p> + * Note that because this matches on {@code char} values only (as opposed to {@code int} code points), + * this only operates on the level of code units and doesn't support supplementary characters + * (see {@link Character#isSupplementaryCodePoint(int)}). 
+ */ +public interface CharMatcher { + + boolean matches(char c); +} diff --git a/commonmark/src/main/java/org/commonmark/text/Characters.java b/commonmark/src/main/java/org/commonmark/text/Characters.java new file mode 100644 index 000000000..ee56ca67e --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/text/Characters.java @@ -0,0 +1,157 @@ +package org.commonmark.text; + +/** + * Functions for finding characters in strings or checking characters. + */ +public class Characters { + + public static int find(char c, CharSequence s, int startIndex) { + int length = s.length(); + for (int i = startIndex; i < length; i++) { + if (s.charAt(i) == c) { + return i; + } + } + return -1; + } + + public static int findLineBreak(CharSequence s, int startIndex) { + int length = s.length(); + for (int i = startIndex; i < length; i++) { + switch (s.charAt(i)) { + case '\n': + case '\r': + return i; + } + } + return -1; + } + + /** + * @see <a href="https://spec.commonmark.org/0.31.2/#blank-line">blank line</a> + */ + public static boolean isBlank(CharSequence s) { + return skipSpaceTab(s, 0, s.length()) == s.length(); + } + + public static boolean hasNonSpace(CharSequence s) { + int length = s.length(); + int skipped = skip(' ', s, 0, length); + return skipped != length; + } + + public static boolean isLetter(CharSequence s, int index) { + int codePoint = Character.codePointAt(s, index); + return Character.isLetter(codePoint); + } + + public static boolean isSpaceOrTab(CharSequence s, int index) { + if (index < s.length()) { + switch (s.charAt(index)) { + case ' ': + case '\t': + return true; + } + } + return false; + } + + /** + * @see <a href="https://spec.commonmark.org/0.31.2/#unicode-punctuation-character">Unicode punctuation character</a> + */ + public static boolean isPunctuationCodePoint(int codePoint) { + switch (Character.getType(codePoint)) { + // General category "P" (punctuation) + case Character.DASH_PUNCTUATION: + case Character.START_PUNCTUATION: + case 
Character.END_PUNCTUATION: + case Character.CONNECTOR_PUNCTUATION: + case Character.OTHER_PUNCTUATION: + case Character.INITIAL_QUOTE_PUNCTUATION: + case Character.FINAL_QUOTE_PUNCTUATION: + // General category "S" (symbol) + case Character.MATH_SYMBOL: + case Character.CURRENCY_SYMBOL: + case Character.MODIFIER_SYMBOL: + case Character.OTHER_SYMBOL: + return true; + default: + switch (codePoint) { + case '$': + case '+': + case '<': + case '=': + case '>': + case '^': + case '`': + case '|': + case '~': + return true; + default: + return false; + } + } + } + + /** + * Check whether the provided code point is a Unicode whitespace character as defined in the spec. + * + * @see <a href="https://spec.commonmark.org/0.31.2/#unicode-whitespace-character">Unicode whitespace character</a> + */ + public static boolean isWhitespaceCodePoint(int codePoint) { + switch (codePoint) { + case ' ': + case '\t': + case '\n': + case '\f': + case '\r': + return true; + default: + return Character.getType(codePoint) == Character.SPACE_SEPARATOR; + } + } + + public static int skip(char skip, CharSequence s, int startIndex, int endIndex) { + for (int i = startIndex; i < endIndex; i++) { + if (s.charAt(i) != skip) { + return i; + } + } + return endIndex; + } + + public static int skipBackwards(char skip, CharSequence s, int startIndex, int lastIndex) { + for (int i = startIndex; i >= lastIndex; i--) { + if (s.charAt(i) != skip) { + return i; + } + } + return lastIndex - 1; + } + + public static int skipSpaceTab(CharSequence s, int startIndex, int endIndex) { + for (int i = startIndex; i < endIndex; i++) { + switch (s.charAt(i)) { + case ' ': + case '\t': + break; + default: + return i; + } + } + return endIndex; + } + + public static int skipSpaceTabBackwards(CharSequence s, int startIndex, int lastIndex) { + for (int i = startIndex; i >= lastIndex; i--) { + switch (s.charAt(i)) { + case ' ': + case '\t': + break; + default: + return i; + } + } + return lastIndex - 1; + } +} diff --git 
a/commonmark/src/main/java/org/commonmark/text/package-info.java b/commonmark/src/main/java/org/commonmark/text/package-info.java new file mode 100644 index 000000000..ab9eec6f1 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/text/package-info.java @@ -0,0 +1,4 @@ +/** + * Text processing utilities for parsing and rendering, exported for use by extensions + */ +package org.commonmark.text; diff --git a/commonmark/src/main/javadoc/overview.html b/commonmark/src/main/javadoc/overview.html new file mode 100644 index 000000000..f562778a3 --- /dev/null +++ b/commonmark/src/main/javadoc/overview.html @@ -0,0 +1,21 @@ +<html> +<body> +<b>Java implementation of CommonMark for parsing markdown and rendering to HTML (core library)</b> +<p>Example:</p> +<pre><code> import org.commonmark.node.*; + import org.commonmark.parser.Parser; + import org.commonmark.renderer.html.HtmlRenderer; + + Parser parser = Parser.builder().build(); + Node document = parser.parse("This is *Sparta*"); + HtmlRenderer renderer = HtmlRenderer.builder().escapeHtml(true).build(); + renderer.render(document); // "&lt;p&gt;This is &lt;em&gt;Sparta&lt;/em&gt;&lt;/p&gt;\n" +</code></pre> +<p>See the following packages for details:</p> +<ul> +<li>{@link org.commonmark.parser} for parsing input text to AST nodes</li> +<li>{@link org.commonmark.node} for AST node types and visitors</li> +<li>{@link org.commonmark.renderer.html} for HTML rendering</li> +</ul> +</body> +</html> diff --git a/commonmark/src/main/resources/META-INF/LICENSE.txt b/commonmark/src/main/resources/META-INF/LICENSE.txt new file mode 100644 index 000000000..b09e367ce --- /dev/null +++ b/commonmark/src/main/resources/META-INF/LICENSE.txt @@ -0,0 +1,23 @@ +Copyright (c) 2015, Atlassian Pty Ltd +All rights reserved.
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/commonmark/src/main/resources/org/commonmark/internal/util/entities.properties b/commonmark/src/main/resources/org/commonmark/internal/util/entities.txt similarity index 100% rename from commonmark/src/main/resources/org/commonmark/internal/util/entities.properties rename to commonmark/src/main/resources/org/commonmark/internal/util/entities.txt diff --git a/commonmark/src/test/java/org/commonmark/ProfilingMain.java b/commonmark/src/test/java/org/commonmark/ProfilingMain.java new file mode 100644 index 000000000..83b1bdaff --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/ProfilingMain.java @@ -0,0 +1,46 @@ +package org.commonmark; + +import org.commonmark.node.Node; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.TestResources; + +import java.util.ArrayList; +import java.util.List; + +public class ProfilingMain { + + private static final String SPEC = TestResources.readAsString(TestResources.getSpec()); + // private static final List<String> SPEC_EXAMPLES = ExampleReader.readExampleSources(TestResources.getSpec()); + private static final Parser PARSER = Parser.builder().build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().build(); + + public static void main(String[] args) throws Exception { + System.out.println("Attach profiler, then press enter to start parsing."); + System.in.read(); + System.out.println("Parsing"); + List<Node> nodes = parse(List.of(SPEC)); + System.out.println("Finished parsing, press enter to start rendering"); + System.in.read(); + System.out.println(render(nodes)); + System.out.println("Finished rendering"); + } + + private static List<Node> parse(List<String> examples) { + List<Node> nodes = new ArrayList<>(); + for (String example : examples) { + Node doc = PARSER.parse(example); + nodes.add(doc); + } + return nodes; + } + + private static long render(List<Node> examples) { + long length = 0; + for (Node example : 
examples) { + String result = RENDERER.render(example); + length += result.length(); + } + return length; + } +} diff --git a/commonmark/src/test/java/org/commonmark/internal/DocumentParserTest.java b/commonmark/src/test/java/org/commonmark/internal/DocumentParserTest.java new file mode 100644 index 000000000..a834665ff --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/internal/DocumentParserTest.java @@ -0,0 +1,57 @@ +package org.commonmark.internal; + +import org.commonmark.node.*; +import org.commonmark.parser.block.BlockParserFactory; +import org.junit.jupiter.api.Test; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; + +class DocumentParserTest { + private static final List<BlockParserFactory> CORE_FACTORIES = List.of( + new BlockQuoteParser.Factory(), + new HeadingParser.Factory(), + new FencedCodeBlockParser.Factory(), + new HtmlBlockParser.Factory(), + new ThematicBreakParser.Factory(), + new ListBlockParser.Factory(), + new IndentedCodeBlockParser.Factory()); + + @Test + void calculateBlockParserFactories_givenAFullListOfAllowedNodes_includesAllCoreFactories() { + List<BlockParserFactory> customParserFactories = List.of(); + var enabledBlockTypes = Set.of(BlockQuote.class, Heading.class, FencedCodeBlock.class, HtmlBlock.class, ThematicBreak.class, ListBlock.class, IndentedCodeBlock.class); + + List<BlockParserFactory> blockParserFactories = DocumentParser.calculateBlockParserFactories(customParserFactories, enabledBlockTypes); + assertThat(blockParserFactories).hasSameSizeAs(CORE_FACTORIES); + + for (BlockParserFactory factory : CORE_FACTORIES) { + assertThat(hasInstance(blockParserFactories, factory.getClass())).isTrue(); + } + } + + @Test + void calculateBlockParserFactories_givenAListOfAllowedNodes_includesAssociatedFactories() { + List<BlockParserFactory> customParserFactories = List.of(); + Set<Class<? 
extends Block>> nodes = new HashSet<>(); + nodes.add(IndentedCodeBlock.class); + + List<BlockParserFactory> blockParserFactories = DocumentParser.calculateBlockParserFactories(customParserFactories, nodes); + + assertThat(blockParserFactories).hasSize(1); + assertThat(hasInstance(blockParserFactories, IndentedCodeBlockParser.Factory.class)).isTrue(); + } + + private boolean hasInstance(List<BlockParserFactory> blockParserFactories, Class<? extends BlockParserFactory> factoryClass) { + for (BlockParserFactory factory : blockParserFactories) { + if (factory.getClass().equals(factoryClass)) { + return true; + } + } + return false; + } + +} diff --git a/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java b/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java new file mode 100644 index 000000000..b69ada0e9 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java @@ -0,0 +1,205 @@ +package org.commonmark.internal; + +import org.commonmark.internal.LinkReferenceDefinitionParser.State; +import org.commonmark.node.LinkReferenceDefinition; +import org.commonmark.parser.SourceLine; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +class LinkReferenceDefinitionParserTest { + + private final LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + + @Test + void testStartLabel() { + assertState("[", State.LABEL, "["); + } + + @Test + void testStartNoLabel() { + // Not a label + assertParagraph("a"); + // Can not go back to parsing link reference definitions + parse("a"); + parse("["); + assertThat(parser.getState()).isEqualTo(State.PARAGRAPH); + assertParagraphLines("a\n[", parser); + } + + @Test + void testEmptyLabel() { + assertParagraph("[]: /"); + assertParagraph("[ ]: /"); + assertParagraph("[ \t\n\u000B\f\r ]: /"); + } + + @Test + void testLabelColon() { + assertParagraph("[foo] : 
/"); + } + + @Test + void testLabel() { + assertState("[foo]:", State.DESTINATION, "[foo]:"); + assertState("[ foo ]:", State.DESTINATION, "[ foo ]:"); + } + + @Test + void testLabelInvalid() { + assertParagraph("[foo[]:"); + } + + @Test + void testLabelMultiline() { + parse("[two"); + assertThat(parser.getState()).isEqualTo(State.LABEL); + parse("lines]:"); + assertThat(parser.getState()).isEqualTo(State.DESTINATION); + parse("/url"); + assertThat(parser.getState()).isEqualTo(State.START_TITLE); + assertDef(parser.getDefinitions().get(0), "two\nlines", "/url", null); + } + + @Test + void testLabelStartsWithNewline() { + parse("["); + assertThat(parser.getState()).isEqualTo(State.LABEL); + parse("weird]:"); + assertThat(parser.getState()).isEqualTo(State.DESTINATION); + parse("/url"); + assertThat(parser.getState()).isEqualTo(State.START_TITLE); + assertDef(parser.getDefinitions().get(0), "\nweird", "/url", null); + } + + @Test + void testDestination() { + parse("[foo]: /url"); + assertThat(parser.getState()).isEqualTo(State.START_TITLE); + assertParagraphLines("", parser); + + assertThat(parser.getDefinitions()).hasSize(1); + assertDef(parser.getDefinitions().get(0), "foo", "/url", null); + + parse("[bar]: </url2>"); + assertDef(parser.getDefinitions().get(1), "bar", "/url2", null); + } + + @Test + void testDestinationInvalid() { + assertParagraph("[foo]: <bar<>"); + } + + @Test + void testTitle() { + parse("[foo]: /url 'title'"); + assertThat(parser.getState()).isEqualTo(State.START_DEFINITION); + assertParagraphLines("", parser); + + assertThat(parser.getDefinitions()).hasSize(1); + assertDef(parser.getDefinitions().get(0), "foo", "/url", "title"); + } + + @Test + void testTitleStartWhitespace() { + parse("[foo]: /url"); + assertThat(parser.getState()).isEqualTo(State.START_TITLE); + assertParagraphLines("", parser); + + parse(" "); + + assertThat(parser.getState()).isEqualTo(State.START_DEFINITION); + assertParagraphLines(" ", parser); + + 
assertThat(parser.getDefinitions()).hasSize(1); + assertDef(parser.getDefinitions().get(0), "foo", "/url", null); + } + + @Test + void testTitleMultiline() { + parse("[foo]: /url 'two"); + assertThat(parser.getState()).isEqualTo(State.TITLE); + assertParagraphLines("[foo]: /url 'two", parser); + assertThat(parser.getDefinitions()).isEmpty(); + + parse("lines"); + assertThat(parser.getState()).isEqualTo(State.TITLE); + assertParagraphLines("[foo]: /url 'two\nlines", parser); + assertThat(parser.getDefinitions()).isEmpty(); + + parse("'"); + assertThat(parser.getState()).isEqualTo(State.START_DEFINITION); + assertParagraphLines("", parser); + + assertThat(parser.getDefinitions()).hasSize(1); + assertDef(parser.getDefinitions().get(0), "foo", "/url", "two\nlines\n"); + } + + @Test + void testTitleMultiline2() { + parse("[foo]: /url '"); + assertThat(parser.getState()).isEqualTo(State.TITLE); + parse("title'"); + assertThat(parser.getState()).isEqualTo(State.START_DEFINITION); + + assertDef(parser.getDefinitions().get(0), "foo", "/url", "\ntitle"); + } + + @Test + void testTitleMultiline3() { + parse("[foo]: /url"); + assertThat(parser.getState()).isEqualTo(State.START_TITLE); + // Note that this looks like a valid title until we parse "bad", at which point we need to treat the whole line + // as a paragraph line and discard any already parsed title. 
+ parse("\"title\" bad"); + assertThat(parser.getState()).isEqualTo(State.PARAGRAPH); + + assertDef(parser.getDefinitions().get(0), "foo", "/url", null); + } + + @Test + void testTitleMultiline4() { + parse("[foo]: /url"); + assertThat(parser.getState()).isEqualTo(State.START_TITLE); + parse("(title"); + assertThat(parser.getState()).isEqualTo(State.TITLE); + parse("foo("); + assertThat(parser.getState()).isEqualTo(State.PARAGRAPH); + + assertDef(parser.getDefinitions().get(0), "foo", "/url", null); + } + + @Test + void testTitleInvalid() { + assertParagraph("[foo]: /url (invalid("); + assertParagraph("[foo]: </url>'title'"); + assertParagraph("[foo]: /url 'title' INVALID"); + } + + private void parse(String content) { + parser.parse(SourceLine.of(content, null)); + } + + private static void assertParagraph(String input) { + assertState(input, State.PARAGRAPH, input); + } + + private static void assertState(String input, State state, String paragraphContent) { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + // TODO: Should we check things with source spans here? 
+ parser.parse(SourceLine.of(input, null)); + assertThat(parser.getState()).isEqualTo(state); + assertParagraphLines(paragraphContent, parser); + } + + private static void assertDef(LinkReferenceDefinition def, String label, String destination, String title) { + assertThat(def.getLabel()).isEqualTo(label); + assertThat(def.getDestination()).isEqualTo(destination); + assertThat(def.getTitle()).isEqualTo(title); + } + + private static void assertParagraphLines(String expectedContent, LinkReferenceDefinitionParser parser) { + String actual = parser.getParagraphLines().getContent(); + assertThat(actual).isEqualTo(expectedContent); + } +} diff --git a/commonmark/src/test/java/org/commonmark/internal/util/EscapingTest.java b/commonmark/src/test/java/org/commonmark/internal/util/EscapingTest.java new file mode 100644 index 000000000..eb2f1a801 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/internal/util/EscapingTest.java @@ -0,0 +1,22 @@ +package org.commonmark.internal.util; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +class EscapingTest { + + @Test + void testEscapeHtml() { + // Expected values are the entity-escaped forms of &, <, > and "; text without those characters is unchanged. + assertThat(Escaping.escapeHtml("nothing to escape")).isEqualTo("nothing to escape"); + assertThat(Escaping.escapeHtml("&")).isEqualTo("&amp;"); + assertThat(Escaping.escapeHtml("<")).isEqualTo("&lt;"); + assertThat(Escaping.escapeHtml(">")).isEqualTo("&gt;"); + assertThat(Escaping.escapeHtml("\"")).isEqualTo("&quot;"); + assertThat(Escaping.escapeHtml("< start")).isEqualTo("&lt; start"); + assertThat(Escaping.escapeHtml("end >")).isEqualTo("end &gt;"); + assertThat(Escaping.escapeHtml("< both >")).isEqualTo("&lt; both &gt;"); + assertThat(Escaping.escapeHtml("< middle & too >")).isEqualTo("&lt; middle &amp; too &gt;"); + } +} diff --git a/commonmark/src/test/java/org/commonmark/internal/util/LineReaderTest.java b/commonmark/src/test/java/org/commonmark/internal/util/LineReaderTest.java new file mode 100644 index 000000000..b52713846 --- /dev/null +++ 
b/commonmark/src/test/java/org/commonmark/internal/util/LineReaderTest.java @@ -0,0 +1,124 @@ +package org.commonmark.internal.util; + +import org.junit.jupiter.api.Test; + +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Objects; + +import static java.util.stream.Collectors.joining; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.commonmark.internal.util.LineReader.CHAR_BUFFER_SIZE; + +class LineReaderTest { + + @Test + void testReadLine() throws IOException { + assertLines(); + + assertLines("", "\n"); + assertLines("foo", "\n", "bar", "\n"); + assertLines("foo", "\n", "bar", null); + assertLines("", "\n", "", "\n"); + assertLines(repeat("a", CHAR_BUFFER_SIZE - 1), "\n"); + assertLines(repeat("a", CHAR_BUFFER_SIZE), "\n"); + assertLines(repeat("a", CHAR_BUFFER_SIZE) + "b", "\n"); + + assertLines("", "\r\n"); + assertLines("foo", "\r\n", "bar", "\r\n"); + assertLines("foo", "\r\n", "bar", null); + assertLines("", "\r\n", "", "\r\n"); + assertLines(repeat("a", CHAR_BUFFER_SIZE - 2), "\r\n"); + assertLines(repeat("a", CHAR_BUFFER_SIZE - 1), "\r\n"); + assertLines(repeat("a", CHAR_BUFFER_SIZE), "\r\n"); + assertLines(repeat("a", CHAR_BUFFER_SIZE) + "b", "\r\n"); + + assertLines("", "\r"); + assertLines("foo", "\r", "bar", "\r"); + assertLines("foo", "\r", "bar", null); + assertLines("", "\r", "", "\r"); + assertLines(repeat("a", CHAR_BUFFER_SIZE - 1), "\r"); + assertLines(repeat("a", CHAR_BUFFER_SIZE), "\r"); + assertLines(repeat("a", CHAR_BUFFER_SIZE) + "b", "\r"); + + assertLines("", "\n", "", "\r", "", "\r\n", "", "\n"); + assertLines("what", "\r", "are", "\r", "", "\r", "you", "\r\n", "", "\r\n", "even", "\n", "doing", null); + } + + @Test + void testClose() throws IOException { + var reader = new InputStreamReader(new ByteArrayInputStream("test".getBytes(StandardCharsets.UTF_8))); + 
var lineReader = new LineReader(reader); + lineReader.close(); + lineReader.close(); + assertThatThrownBy(reader::read).isInstanceOf(IOException.class); + } + + private void assertLines(String... s) throws IOException { + assertThat(s.length).as("Expected parts needs to be even (pairs of content and terminator)").isEven(); + var input = Arrays.stream(s).filter(Objects::nonNull).collect(joining("")); + + assertLines(new StringReader(input), s); + assertLines(new SlowStringReader(input), s); + } + + private static void assertLines(Reader reader, String... expectedParts) throws IOException { + try (var lineReader = new LineReader(reader)) { + var lines = new ArrayList<>(); + String line; + while ((line = lineReader.readLine()) != null) { + lines.add(line); + lines.add(lineReader.getLineTerminator()); + } + assertThat(lineReader.getLineTerminator()).isNull(); + assertThat(lines).containsExactly(expectedParts); + } + } + + private static String repeat(String s, int count) { + StringBuilder sb = new StringBuilder(s.length() * count); + for (int i = 0; i < count; i++) { + sb.append(s); + } + return sb.toString(); + } + + /** + * Reader that only reads 0 or 1 chars at a time to test the corner cases. + */ + private static class SlowStringReader extends Reader { + + private final String s; + private int position = 0; + private boolean empty = false; + + private SlowStringReader(String s) { + this.s = s; + } + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + Objects.checkFromIndexSize(off, len, cbuf.length); + if (len == 0) { + return 0; + } + empty = !empty; + if (empty) { + // Return 0 every other time to test handling of 0. 
+ return 0; + } + if (position >= s.length()) { + return -1; + } + cbuf[off] = s.charAt(position++); + return 1; + } + + @Override + public void close() throws IOException { + } + } +} diff --git a/commonmark/src/test/java/org/commonmark/parser/InlineContentParserTest.java b/commonmark/src/test/java/org/commonmark/parser/InlineContentParserTest.java new file mode 100644 index 000000000..d0f45a6bc --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/parser/InlineContentParserTest.java @@ -0,0 +1,125 @@ +package org.commonmark.parser; + +import org.commonmark.node.*; +import org.commonmark.parser.beta.InlineContentParser; +import org.commonmark.parser.beta.InlineContentParserFactory; +import org.commonmark.parser.beta.InlineParserState; +import org.commonmark.parser.beta.ParsedInline; +import org.commonmark.test.Nodes; +import org.junit.jupiter.api.Test; + +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; + +class InlineContentParserTest { + + @Test + void customInlineContentParser() { + var parser = Parser.builder().customInlineContentParserFactory(new DollarInlineParser.Factory()).build(); + var doc = parser.parse("Test: $hey *there*$ $you$\n\n# Heading $heading$\n"); + var inline1 = Nodes.find(doc, DollarInline.class); + assertThat(inline1.getLiteral()).isEqualTo("hey *there*"); + + var inline2 = (DollarInline) doc.getFirstChild().getLastChild(); + assertThat(inline2.getLiteral()).isEqualTo("you"); + + var heading = Nodes.find(doc, Heading.class); + var inline3 = (DollarInline) heading.getLastChild(); + assertThat(inline3.getLiteral()).isEqualTo("heading"); + + // Parser is created for each inline snippet, which is why the index resets for the second snippet. + assertThat(inline1.getIndex()).isEqualTo(0); + assertThat(inline2.getIndex()).isEqualTo(1); + assertThat(inline3.getIndex()).isEqualTo(0); + } + + @Test + void bangInlineContentParser() { + // See if using ! for a custom inline content parser works. 
+ // ![] is used for images, but if it's not followed by a [, it should be possible to parse it differently. + var parser = Parser.builder().customInlineContentParserFactory(new BangInlineParser.Factory()).build(); + var doc = parser.parse("![image](url) !notimage"); + var image = Nodes.find(doc, Image.class); + assertThat(image.getDestination()).isEqualTo("url"); + assertThat(((Text) image.getNext()).getLiteral()).isEqualTo(" "); + // Class + assertThat(image.getNext().getNext()).isInstanceOf(BangInline.class); + assertThat(((Text) image.getNext().getNext().getNext()).getLiteral()).isEqualTo("notimage"); + } + + private static class DollarInline extends CustomNode { + private final String literal; + private final int index; + + public DollarInline(String literal, int index) { + this.literal = literal; + this.index = index; + } + + public String getLiteral() { + return literal; + } + + public int getIndex() { + return index; + } + } + + private static class DollarInlineParser implements InlineContentParser { + + private int index = 0; + + @Override + public ParsedInline tryParse(InlineParserState inlineParserState) { + var scanner = inlineParserState.scanner(); + scanner.next(); + var pos = scanner.position(); + + var end = scanner.find('$'); + if (end == -1) { + return ParsedInline.none(); + } + var content = scanner.getSource(pos, scanner.position()).getContent(); + scanner.next(); + return ParsedInline.of(new DollarInline(content, index++), scanner.position()); + } + + static class Factory implements InlineContentParserFactory { + @Override + public Set<Character> getTriggerCharacters() { + return Set.of('$'); + } + + @Override + public InlineContentParser create() { + return new DollarInlineParser(); + } + } + } + + private static class BangInline extends CustomNode { + } + + private static class BangInlineParser implements InlineContentParser { + + @Override + public ParsedInline tryParse(InlineParserState inlineParserState) { + var scanner = 
inlineParserState.scanner(); + scanner.next(); + return ParsedInline.of(new BangInline(), scanner.position()); + } + + static class Factory implements InlineContentParserFactory { + @Override + public Set<Character> getTriggerCharacters() { + return Set.of('!'); + } + + @Override + public InlineContentParser create() { + return new BangInlineParser(); + } + } + } +} diff --git a/commonmark/src/test/java/org/commonmark/parser/beta/LinkProcessorTest.java b/commonmark/src/test/java/org/commonmark/parser/beta/LinkProcessorTest.java new file mode 100644 index 000000000..ef8739128 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/parser/beta/LinkProcessorTest.java @@ -0,0 +1,26 @@ +package org.commonmark.parser.beta; + +import org.commonmark.node.Link; +import org.commonmark.node.Text; +import org.commonmark.parser.Parser; +import org.commonmark.test.Nodes; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +class LinkProcessorTest { + + @Test + void testLinkMarkerShouldNotBeIncludedByDefault() { + // If a link marker is registered but is not processed, the built-in link processor shouldn't consume it. + // And I think by default, other processors shouldn't consume it either (by accident). + // So requiring processors to opt into including the marker is better than requiring them to opt out, + // because processors that look for a marker already need to write some code to deal with the marker anyway, + // and will have tests ensuring that the marker is part of the parsed node, not the text. 
+ var parser = Parser.builder().linkMarker('^').build(); + var doc = parser.parse("^[test](url)"); + var link = Nodes.find(doc, Link.class); + assertThat(link.getDestination()).isEqualTo("url"); + assertThat(((Text) link.getPrevious()).getLiteral()).isEqualTo("^"); + } +} diff --git a/commonmark/src/test/java/org/commonmark/parser/beta/ScannerTest.java b/commonmark/src/test/java/org/commonmark/parser/beta/ScannerTest.java new file mode 100644 index 000000000..bd74cab0e --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/parser/beta/ScannerTest.java @@ -0,0 +1,158 @@ +package org.commonmark.parser.beta; + +import org.commonmark.node.SourceSpan; +import org.commonmark.parser.SourceLine; +import org.commonmark.parser.SourceLines; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +class ScannerTest { + + @Test + void testNext() { + Scanner scanner = new Scanner(List.of( + SourceLine.of("foo bar", null)), + 0, 4); + assertThat(scanner.peek()).isEqualTo('b'); + scanner.next(); + assertThat(scanner.peek()).isEqualTo('a'); + scanner.next(); + assertThat(scanner.peek()).isEqualTo('r'); + scanner.next(); + assertThat(scanner.peek()).isEqualTo('\0'); + } + + @Test + void testMultipleLines() { + Scanner scanner = new Scanner(List.of( + SourceLine.of("ab", null), + SourceLine.of("cde", null)), + 0, 0); + assertThat(scanner.hasNext()).isTrue(); + assertThat(scanner.peekPreviousCodePoint()).isEqualTo('\0'); + assertThat(scanner.peek()).isEqualTo('a'); + scanner.next(); + + assertThat(scanner.hasNext()).isTrue(); + assertThat(scanner.peekPreviousCodePoint()).isEqualTo('a'); + assertThat(scanner.peek()).isEqualTo('b'); + scanner.next(); + + assertThat(scanner.hasNext()).isTrue(); + assertThat(scanner.peekPreviousCodePoint()).isEqualTo('b'); + assertThat(scanner.peek()).isEqualTo('\n'); + scanner.next(); + + assertThat(scanner.hasNext()).isTrue(); + 
assertThat(scanner.peekPreviousCodePoint()).isEqualTo('\n'); + assertThat(scanner.peek()).isEqualTo('c'); + scanner.next(); + + assertThat(scanner.hasNext()).isTrue(); + assertThat(scanner.peekPreviousCodePoint()).isEqualTo('c'); + assertThat(scanner.peek()).isEqualTo('d'); + scanner.next(); + + assertThat(scanner.hasNext()).isTrue(); + assertThat(scanner.peekPreviousCodePoint()).isEqualTo('d'); + assertThat(scanner.peek()).isEqualTo('e'); + scanner.next(); + + assertThat(scanner.hasNext()).isFalse(); + assertThat(scanner.peekPreviousCodePoint()).isEqualTo('e'); + assertThat(scanner.peek()).isEqualTo('\0'); + } + + @Test + void testCodePoints() { + Scanner scanner = new Scanner(List.of(SourceLine.of("\uD83D\uDE0A", null)), 0, 0); + + assertThat(scanner.hasNext()).isTrue(); + assertThat(scanner.peekPreviousCodePoint()).isEqualTo('\0'); + assertThat(scanner.peekCodePoint()).isEqualTo(128522); + scanner.next(); + // This jumps chars, not code points. So jump two here + scanner.next(); + + assertThat(scanner.hasNext()).isFalse(); + assertThat(scanner.peekPreviousCodePoint()).isEqualTo(128522); + assertThat(scanner.peekCodePoint()).isEqualTo('\0'); + } + + @Test + void testTextBetween() { + Scanner scanner = new Scanner(List.of( + SourceLine.of("ab", SourceSpan.of(10, 3, 13, 2)), + SourceLine.of("cde", SourceSpan.of(11, 4, 20, 3))), + 0, 0); + + Position start = scanner.position(); + + scanner.next(); + assertSourceLines(scanner.getSource(start, scanner.position()), + "a", + SourceSpan.of(10, 3, 13, 1)); + + Position afterA = scanner.position(); + + scanner.next(); + assertSourceLines(scanner.getSource(start, scanner.position()), + "ab", + SourceSpan.of(10, 3, 13, 2)); + + Position afterB = scanner.position(); + + scanner.next(); + assertSourceLines(scanner.getSource(start, scanner.position()), + "ab\n", + SourceSpan.of(10, 3, 13, 2)); + + scanner.next(); + assertSourceLines(scanner.getSource(start, scanner.position()), + "ab\nc", + SourceSpan.of(10, 3, 13, 2), + 
SourceSpan.of(11, 4, 20, 1)); + + scanner.next(); + assertSourceLines(scanner.getSource(start, scanner.position()), + "ab\ncd", + SourceSpan.of(10, 3, 13, 2), + SourceSpan.of(11, 4, 20, 2)); + + scanner.next(); + assertSourceLines(scanner.getSource(start, scanner.position()), + "ab\ncde", + SourceSpan.of(10, 3, 13, 2), + SourceSpan.of(11, 4, 20, 3)); + + assertSourceLines(scanner.getSource(afterA, scanner.position()), + "b\ncde", + SourceSpan.of(10, 4, 14, 1), + SourceSpan.of(11, 4, 20, 3)); + + assertSourceLines(scanner.getSource(afterB, scanner.position()), + "\ncde", + SourceSpan.of(11, 4, 20, 3)); + } + + private void assertSourceLines(SourceLines sourceLines, String expectedContent, SourceSpan... expectedSourceSpans) { + assertThat(sourceLines.getContent()).isEqualTo(expectedContent); + assertThat(sourceLines.getSourceSpans()).isEqualTo(List.of(expectedSourceSpans)); + } + + @Test + void nextString() { + Scanner scanner = Scanner.of(SourceLines.of(List.of( + SourceLine.of("hey ya", null), + SourceLine.of("hi", null)))); + assertThat(scanner.next("hoy")).isFalse(); + assertThat(scanner.next("hey")).isTrue(); + assertThat(scanner.next(' ')).isTrue(); + assertThat(scanner.next("yo")).isFalse(); + assertThat(scanner.next("ya")).isTrue(); + assertThat(scanner.next(" ")).isFalse(); + } +} diff --git a/commonmark/src/test/java/org/commonmark/renderer/markdown/MarkdownRendererTest.java b/commonmark/src/test/java/org/commonmark/renderer/markdown/MarkdownRendererTest.java new file mode 100644 index 000000000..6a468a08e --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/renderer/markdown/MarkdownRendererTest.java @@ -0,0 +1,359 @@ +package org.commonmark.renderer.markdown; + +import org.commonmark.node.*; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.NodeRenderer; +import org.junit.jupiter.api.Test; + +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; +import static 
org.commonmark.testutil.Asserts.assertRendering; + +public class MarkdownRendererTest { + + // Leaf blocks + + @Test + public void testThematicBreaks() { + assertRoundTrip("___\n"); + assertRoundTrip("___\n\nfoo\n"); + // List item with hr -> hr needs to not use the same as the marker + assertRoundTrip("* ___\n"); + assertRoundTrip("- ___\n"); + + // Preserve the literal + assertRoundTrip("----\n"); + assertRoundTrip("*****\n"); + + // Apply fallback for null literal + ThematicBreak node = new ThematicBreak(); + assertThat(render(node)).isEqualTo("___"); + } + + @Test + public void testHeadings() { + // Type of heading is currently not preserved + assertRoundTrip("# foo\n"); + assertRoundTrip("## foo\n"); + assertRoundTrip("### foo\n"); + assertRoundTrip("#### foo\n"); + assertRoundTrip("##### foo\n"); + assertRoundTrip("###### foo\n"); + + assertRoundTrip("Foo\nbar\n===\n"); + assertRoundTrip("Foo \nbar\n===\n"); + assertRoundTrip("[foo\nbar](/url)\n===\n"); + + assertRoundTrip("# foo\n\nbar\n"); + } + + @Test + public void testIndentedCodeBlocks() { + assertRoundTrip(" hi\n"); + assertRoundTrip(" hi\n code\n"); + assertRoundTrip("> hi\n> code\n"); + } + + @Test + public void testFencedCodeBlocks() { + assertRoundTrip("```\ntest\n```\n"); + assertRoundTrip("~~~~\ntest\n~~~~\n"); + assertRoundTrip("```info\ntest\n```\n"); + assertRoundTrip(" ```\n test\n ```\n"); + assertRoundTrip("```\n```\n"); + + // Preserve the length + assertRoundTrip("````\ntest\n````\n"); + assertRoundTrip("~~~\ntest\n~~~~~~\n"); + } + + @Test + public void testFencedCodeBlocksFromAst() { + var doc = new Document(); + var codeBlock = new FencedCodeBlock(); + codeBlock.setLiteral("hi code"); + doc.appendChild(codeBlock); + + assertRendering("", "```\nhi code\n```\n", render(doc)); + + codeBlock.setLiteral("hi`\n```\n``test"); + assertRendering("", "````\nhi`\n```\n``test\n````\n", render(doc)); + } + + @Test + public void testHtmlBlocks() { + assertRoundTrip("<div>test</div>\n"); + 
assertRoundTrip("> <div>\n> test\n> </div>\n"); + } + + @Test + public void testParagraphs() { + assertRoundTrip("foo\n"); + assertRoundTrip("foo\n\nbar\n"); + } + + // Container blocks + + @Test + public void testBlockQuotes() { + assertRoundTrip("> test\n"); + assertRoundTrip("> foo\n> bar\n"); + assertRoundTrip("> > foo\n> > bar\n"); + assertRoundTrip("> # Foo\n> \n> bar\n> baz\n"); + } + + @Test + public void testBulletListItems() { + assertRoundTrip("* foo\n"); + assertRoundTrip("- foo\n"); + assertRoundTrip("+ foo\n"); + assertRoundTrip("* foo\n bar\n"); + assertRoundTrip("* ```\n code\n ```\n"); + assertRoundTrip("* foo\n\n* bar\n"); + // Note that the " " in the second line is not necessary, but it's not wrong either. + // We could try to avoid it in a future change, but not sure if necessary. + assertRoundTrip("* foo\n \n bar\n"); + + // Tight list + assertRoundTrip("* foo\n* bar\n"); + // Tight list where the second item contains a loose list + assertRoundTrip("- Foo\n - Bar\n \n - Baz\n"); + + // List item indent. This is a tricky one, but here the amount of space between the list marker and "one" + // determines whether "two" is part of the list item or an indented code block. + // In this case, it's an indented code block because it's not indented enough to be part of the list item. + // If the renderer would just use "- one", then "two" would change from being an indented code block to being + // a paragraph in the list item! So it is important for the renderer to preserve the content indent of the list + // item. 
+ assertRoundTrip(" - one\n\n two\n"); + + // Empty list + assertRoundTrip("- \n\nFoo\n"); + } + + @Test + public void testBulletListItemsFromAst() { + var doc = new Document(); + var list = new BulletList(); + var item = new ListItem(); + item.appendChild(new Text("Test")); + list.appendChild(item); + doc.appendChild(list); + + assertRendering("", "- Test\n", render(doc)); + + list.setMarker("*"); + assertRendering("", "* Test\n", render(doc)); + } + + @Test + public void testOrderedListItems() { + assertRoundTrip("1. foo\n"); + assertRoundTrip("2. foo\n\n3. bar\n"); + + // Tight list + assertRoundTrip("1. foo\n2. bar\n"); + // Tight list where the second item contains a loose list + assertRoundTrip("1. Foo\n 1. Bar\n \n 2. Baz\n"); + + assertRoundTrip(" 1. one\n\n two\n"); + } + + @Test + public void testOrderedListItemsFromAst() { + var doc = new Document(); + var list = new OrderedList(); + var item = new ListItem(); + item.appendChild(new Text("Test")); + list.appendChild(item); + doc.appendChild(list); + + assertRendering("", "1. Test\n", render(doc)); + + list.setMarkerStartNumber(2); + list.setMarkerDelimiter(")"); + assertRendering("", "2) Test\n", render(doc)); + } + + @Test + public void testOrderedListItemsWithStartNumberLongerThanLaterNumber() { + var source = "10001.\n20.\n"; + var doc = parse(source); + assertRendering(source, "10001. \n10002. \n", render(doc)); + } + + // Inlines + + @Test + public void testTabs() { + assertRoundTrip("a\tb\n"); + } + + @Test + public void testEscaping() { + // These are a bit tricky. We always escape some characters, even though they only need escaping if they would + // otherwise result in a different parse result (e.g. 
a link): + assertRoundTrip("\\[a\\](/uri)\n"); + assertRoundTrip("\\`abc\\`\n"); + + // Some characters only need to be escaped at the beginning of the line + assertRoundTrip("\\- Test\n"); + assertRoundTrip("\\-\n"); + assertRoundTrip("Test -\n"); + assertRoundTrip("Abc\n\n\\- Test\n"); + assertRoundTrip("\\# Test\n"); + assertRoundTrip("\\## Test\n"); + assertRoundTrip("\\#\n"); + assertRoundTrip("Foo\n\\===\n"); + // Only needs to be escaped after some text, not at beginning of paragraph + assertRoundTrip("===\n"); + assertRoundTrip("a\n\n===\n"); + // The beginning of the line within the block, so disregarding prefixes + assertRoundTrip("> \\- Test\n"); + assertRoundTrip("- \\- Test\n"); + // That's not the beginning of the line + assertRoundTrip("`a`- foo\n"); + + // This is a bit more tricky as we need to check for a list start + assertRoundTrip("1\\. Foo\n"); + assertRoundTrip("999\\. Foo\n"); + assertRoundTrip("1\\.\n"); + assertRoundTrip("1\\) Foo\n"); + + // Escaped whitespace, wow + assertRoundTrip(" foo\n"); + assertRoundTrip(" foo\n"); + assertRoundTrip("foo bar\n"); + } + + @Test + public void testCodeSpans() { + assertRoundTrip("`foo`\n"); + assertRoundTrip("``foo ` bar``\n"); + assertRoundTrip("```foo `` ` bar```\n"); + + assertRoundTrip("`` `foo ``\n"); + assertRoundTrip("`` ` ``\n"); + assertRoundTrip("` `\n"); + } + + @Test + public void testEmphasis() { + assertRoundTrip("*foo*\n"); + assertRoundTrip("foo*bar*\n"); + // When nesting, a different delimiter needs to be used + assertRoundTrip("*_foo_*\n"); + assertRoundTrip("*_*foo*_*\n"); + assertRoundTrip("_*foo*_\n"); + + // Not emphasis (needs * inside words) + assertRoundTrip("foo\\_bar\\_\n"); + + // Even when rendering a manually constructed tree, the emphasis delimiter needs to be chosen correctly. 
+ Document doc = new Document(); + Paragraph p = new Paragraph(); + doc.appendChild(p); + Emphasis e1 = new Emphasis(); + p.appendChild(e1); + Emphasis e2 = new Emphasis(); + e1.appendChild(e2); + e2.appendChild(new Text("hi")); + assertThat(render(doc)).isEqualTo("*_hi_*\n"); + } + + @Test + public void testStrongEmphasis() { + assertRoundTrip("**foo**\n"); + assertRoundTrip("foo**bar**\n"); + } + + @Test + public void testLinks() { + assertRoundTrip("[link](/uri)\n"); + assertRoundTrip("[link](/uri \"title\")\n"); + assertRoundTrip("[link](</my uri>)\n"); + assertRoundTrip("[a](<b)c>)\n"); + assertRoundTrip("[a](<b(c>)\n"); + assertRoundTrip("[a](<b\\>c>)\n"); + assertRoundTrip("[a](<b\\\\\\>c>)\n"); + assertRoundTrip("[a](/uri \"foo \\\" bar\")\n"); + assertRoundTrip("[link](/uri \"tes\\\\\")\n"); + assertRoundTrip("[link](/url \"test \")\n"); + assertRoundTrip("[link](</url >)\n"); + } + + @Test + public void testImages() { + assertRoundTrip("![link](/uri)\n"); + assertRoundTrip("![link](/uri \"title\")\n"); + assertRoundTrip("![link](</my uri>)\n"); + assertRoundTrip("![a](<b)c>)\n"); + assertRoundTrip("![a](<b(c>)\n"); + assertRoundTrip("![a](<b\\>c>)\n"); + assertRoundTrip("![a](<b\\\\\\>c>)\n"); + assertRoundTrip("![a](/uri \"foo \\\" bar\")\n"); + } + + @Test + public void testHtmlInline() { + assertRoundTrip("<del>*foo*</del>\n"); + } + + @Test + public void testHardLineBreaks() { + assertRoundTrip("foo \nbar\n"); + } + + @Test + public void testSoftLineBreaks() { + assertRoundTrip("foo\nbar\n"); + } + + @Test + public void overrideNodeRender() { + var nodeRendererFactory = new MarkdownNodeRendererFactory() { + @Override + public NodeRenderer create(MarkdownNodeRendererContext context) { + return new NodeRenderer() { + @Override + public Set<Class<? 
extends Node>> getNodeTypes() { + return Set.of(Heading.class); + } + + @Override + public void render(Node node) { + context.getWriter().raw("# Custom heading"); + } + }; + } + + @Override + public Set<Character> getSpecialCharacters() { + return Set.of(); + } + }; + + MarkdownRenderer renderer = MarkdownRenderer.builder().nodeRendererFactory(nodeRendererFactory).build(); + String rendered = renderer.render(parse("# Hello")); + assertThat(rendered).isEqualTo("# Custom heading\n"); + } + + private void assertRoundTrip(String input) { + String rendered = parseAndRender(input); + assertThat(rendered).isEqualTo(input); + } + + private String parseAndRender(String source) { + Node parsed = parse(source); + return render(parsed); + } + + private Node parse(String source) { + return Parser.builder().build().parse(source); + } + + private String render(Node node) { + return MarkdownRenderer.builder().build().render(node); + } +} diff --git a/commonmark/src/test/java/org/commonmark/renderer/markdown/SpecMarkdownRendererTest.java b/commonmark/src/test/java/org/commonmark/renderer/markdown/SpecMarkdownRendererTest.java new file mode 100644 index 000000000..3b88df55d --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/renderer/markdown/SpecMarkdownRendererTest.java @@ -0,0 +1,95 @@ +package org.commonmark.renderer.markdown; + +import org.commonmark.node.Node; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.TestResources; +import org.commonmark.testutil.example.Example; +import org.commonmark.testutil.example.ExampleReader; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Tests Markdown rendering using the examples in the spec like this: + * <ol> + * <li>Parses the source to an AST and then renders it back to Markdown</li> + * 
<li>Parses that to an AST and then renders it to HTML</li> + * <li>Compares that HTML to the expected HTML of the example: + * If it's the same, then the expected elements were preserved in the Markdown rendering</li> + * </ol> + */ +public class SpecMarkdownRendererTest { + + public static final MarkdownRenderer MARKDOWN_RENDERER = MarkdownRenderer.builder().build(); + // The spec says URL-escaping is optional, but the examples assume that it's enabled. + public static final HtmlRenderer HTML_RENDERER = HtmlRenderer.builder().percentEncodeUrls(true).build(); + + @Test + public void testCoverage() { + List<Example> examples = ExampleReader.readExamples(TestResources.getSpec()); + List<Example> passes = new ArrayList<>(); + List<Example> fails = new ArrayList<>(); + for (Example example : examples) { + String markdown = renderMarkdown(example.getSource()); + String rendered = renderHtml(markdown); + if (rendered.equals(example.getHtml())) { + passes.add(example); + } else { + fails.add(example); + } + } + + System.out.println("Passed examples by section (total " + passes.size() + "):"); + printCountsBySection(passes); + System.out.println(); + + System.out.println("Failed examples by section (total " + fails.size() + "):"); + printCountsBySection(fails); + System.out.println(); + + System.out.println("Failed examples:"); + for (Example fail : fails) { + System.out.println("Failed: " + fail); + System.out.println("````````````````````````````````"); + System.out.print(fail.getSource()); + System.out.println("````````````````````````````````"); + System.out.println(); + } + + assertThat(passes).hasSizeGreaterThanOrEqualTo(652); + assertThat(fails).isEmpty(); + } + + private static void printCountsBySection(List<Example> examples) { + Map<String, Integer> bySection = new LinkedHashMap<>(); + for (Example example : examples) { + Integer count = bySection.get(example.getSection()); + if (count == null) { + count = 0; + } + bySection.put(example.getSection(), count + 1); 
+ } + for (Map.Entry<String, Integer> entry : bySection.entrySet()) { + System.out.println(entry.getValue() + ": " + entry.getKey()); + } + } + + private Node parse(String source) { + return Parser.builder().build().parse(source); + } + + private String renderMarkdown(String source) { + return MARKDOWN_RENDERER.render(parse(source)); + } + + private String renderHtml(String source) { + // The spec uses "rightwards arrow" to show tabs + return HTML_RENDERER.render(parse(source)).replace("\t", "\u2192"); + } +} diff --git a/commonmark/src/test/java/org/commonmark/spec/SpecExample.java b/commonmark/src/test/java/org/commonmark/spec/SpecExample.java deleted file mode 100644 index 8e1f56219..000000000 --- a/commonmark/src/test/java/org/commonmark/spec/SpecExample.java +++ /dev/null @@ -1,28 +0,0 @@ -package org.commonmark.spec; - -public class SpecExample { - private final String section; - private final int exampleNumber; - private final String source; - private final String html; - - public SpecExample(String section, int exampleNumber, String source, String html) { - this.section = section; - this.exampleNumber = exampleNumber; - this.source = source; - this.html = html; - } - - public String getSource() { - return source; - } - - public String getHtml() { - return html; - } - - @Override - public String toString() { - return "Section \"" + section + "\" example " + exampleNumber; - } -} diff --git a/commonmark/src/test/java/org/commonmark/test/AbstractVisitorTest.java b/commonmark/src/test/java/org/commonmark/test/AbstractVisitorTest.java index b3b60fa3b..edb6936f4 100644 --- a/commonmark/src/test/java/org/commonmark/test/AbstractVisitorTest.java +++ b/commonmark/src/test/java/org/commonmark/test/AbstractVisitorTest.java @@ -1,10 +1,9 @@ package org.commonmark.test; import org.commonmark.node.*; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; +import static 
org.assertj.core.api.Assertions.assertThat; public class AbstractVisitorTest { @@ -26,13 +25,13 @@ public void visit(Text text) { assertCode("foo", paragraph.getFirstChild()); assertCode("bar", paragraph.getFirstChild().getNext()); - assertNull(paragraph.getFirstChild().getNext().getNext()); + assertThat(paragraph.getFirstChild().getNext().getNext()).isNull(); assertCode("bar", paragraph.getLastChild()); } private static void assertCode(String expectedLiteral, Node node) { - assertEquals("Expected node to be a Code node: " + node, Code.class, node.getClass()); + assertThat(node).isInstanceOf(Code.class); Code code = (Code) node; - assertEquals(expectedLiteral, code.getLiteral()); + assertThat(code.getLiteral()).isEqualTo(expectedLiteral); } } diff --git a/commonmark/src/test/java/org/commonmark/test/BlockParserFactoryTest.java b/commonmark/src/test/java/org/commonmark/test/BlockParserFactoryTest.java new file mode 100644 index 000000000..b733d7970 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/BlockParserFactoryTest.java @@ -0,0 +1,127 @@ +package org.commonmark.test; + +import org.commonmark.node.*; +import org.commonmark.parser.IncludeSourceSpans; +import org.commonmark.parser.InlineParser; +import org.commonmark.parser.Parser; +import org.commonmark.parser.SourceLines; +import org.commonmark.parser.block.*; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class BlockParserFactoryTest { + + @Test + public void customBlockParserFactory() { + var parser = Parser.builder().customBlockParserFactory(new DashBlockParser.Factory()).build(); + + // The dashes would normally be a ThematicBreak + var doc = parser.parse("hey\n\n---\n"); + + assertThat(doc.getFirstChild()).isInstanceOf(Paragraph.class); + assertThat(((Text) doc.getFirstChild().getFirstChild()).getLiteral()).isEqualTo("hey"); + assertThat(doc.getLastChild()).isInstanceOf(DashBlock.class); + } + + @Test + 
public void replaceActiveBlockParser() { + var parser = Parser.builder() + .customBlockParserFactory(new StarHeadingBlockParser.Factory()) + .includeSourceSpans(IncludeSourceSpans.BLOCKS_AND_INLINES) + .build(); + + var doc = parser.parse("a\nbc\n***\n"); + + var heading = doc.getFirstChild(); + assertThat(heading).isInstanceOf(StarHeading.class); + assertThat(heading.getNext()).isNull(); + var a = heading.getFirstChild(); + assertThat(a).isInstanceOf(Text.class); + assertThat(((Text) a).getLiteral()).isEqualTo("a"); + var bc = a.getNext().getNext(); + assertThat(bc).isInstanceOf(Text.class); + assertThat(((Text) bc).getLiteral()).isEqualTo("bc"); + assertThat(bc.getNext()).isNull(); + + assertThat(heading.getSourceSpans()).isEqualTo(List.of( + SourceSpan.of(0, 0, 0, 1), + SourceSpan.of(1, 0, 2, 2), + SourceSpan.of(2, 0, 5, 3))); + assertThat(a.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 1))); + assertThat(bc.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(1, 0, 2, 2))); + } + + private static class DashBlock extends CustomBlock { + } + + private static class DashBlockParser extends AbstractBlockParser { + + private DashBlock dash = new DashBlock(); + + @Override + public Block getBlock() { + return dash; + } + + @Override + public BlockContinue tryContinue(ParserState parserState) { + return BlockContinue.none(); + } + + static class Factory extends AbstractBlockParserFactory { + + @Override + public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { + if (state.getLine().getContent().equals("---")) { + return BlockStart.of(new DashBlockParser()); + } + return BlockStart.none(); + } + } + } + + private static class StarHeading extends CustomBlock { + } + + private static class StarHeadingBlockParser extends AbstractBlockParser { + + private final SourceLines content; + private final StarHeading heading = new StarHeading(); + + StarHeadingBlockParser(SourceLines content) { + this.content = content; + } + + @Override + 
public Block getBlock() { + return heading; + } + + @Override + public BlockContinue tryContinue(ParserState parserState) { + return BlockContinue.none(); + } + + @Override + public void parseInlines(InlineParser inlineParser) { + inlineParser.parse(content, heading); + } + + static class Factory extends AbstractBlockParserFactory { + + @Override + public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { + var lines = matchedBlockParser.getParagraphLines(); + if (state.getLine().getContent().toString().startsWith("***")) { + return BlockStart.of(new StarHeadingBlockParser(lines)) + .replaceActiveBlockParser(); + } else { + return BlockStart.none(); + } + } + } + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/CoreRenderingTestCase.java b/commonmark/src/test/java/org/commonmark/test/CoreRenderingTestCase.java new file mode 100644 index 000000000..2303d2617 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/CoreRenderingTestCase.java @@ -0,0 +1,17 @@ +package org.commonmark.test; + +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; + +public class CoreRenderingTestCase extends RenderingTestCase { + + private static final Parser PARSER = Parser.builder().build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().build(); + + @Override + protected String render(String source) { + var node = PARSER.parse(source); + return RENDERER.render(node); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/DelimitedTest.java b/commonmark/src/test/java/org/commonmark/test/DelimitedTest.java new file mode 100644 index 000000000..3f2f0d611 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/DelimitedTest.java @@ -0,0 +1,54 @@ +package org.commonmark.test; + +import org.commonmark.node.*; +import org.commonmark.parser.Parser; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; 
+import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class DelimitedTest { + + @Test + public void emphasisDelimiters() { + String input = "* *emphasis* \n" + + "* **strong** \n" + + "* _important_ \n" + + "* __CRITICAL__ \n"; + + Parser parser = Parser.builder().build(); + Node document = parser.parse(input); + + final List<Delimited> list = new ArrayList<>(); + Visitor visitor = new AbstractVisitor() { + @Override + public void visit(Emphasis node) { + list.add(node); + } + + @Override + public void visit(StrongEmphasis node) { + list.add(node); + } + }; + document.accept(visitor); + + assertThat(list).hasSize(4); + + Delimited emphasis = list.get(0); + Delimited strong = list.get(1); + Delimited important = list.get(2); + Delimited critical = list.get(3); + + assertThat(emphasis.getOpeningDelimiter()).isEqualTo("*"); + assertThat(emphasis.getClosingDelimiter()).isEqualTo("*"); + assertThat(strong.getOpeningDelimiter()).isEqualTo("**"); + assertThat(strong.getClosingDelimiter()).isEqualTo("**"); + assertThat(important.getOpeningDelimiter()).isEqualTo("_"); + assertThat(important.getClosingDelimiter()).isEqualTo("_"); + assertThat(critical.getOpeningDelimiter()).isEqualTo("__"); + assertThat(critical.getClosingDelimiter()).isEqualTo("__"); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/DelimiterProcessorTest.java b/commonmark/src/test/java/org/commonmark/test/DelimiterProcessorTest.java new file mode 100644 index 000000000..e4920120d --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/DelimiterProcessorTest.java @@ -0,0 +1,228 @@ +package org.commonmark.test; + +import org.commonmark.node.CustomNode; +import org.commonmark.node.Node; +import org.commonmark.node.Text; +import org.commonmark.parser.Parser; +import org.commonmark.parser.delimiter.DelimiterProcessor; +import org.commonmark.parser.delimiter.DelimiterRun; +import org.commonmark.renderer.NodeRenderer; +import 
org.commonmark.renderer.html.HtmlNodeRendererContext; +import org.commonmark.renderer.html.HtmlNodeRendererFactory; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.junit.jupiter.api.Test; + +import java.util.Locale; +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +public class DelimiterProcessorTest extends RenderingTestCase { + + private static final Parser PARSER = Parser.builder().customDelimiterProcessor(new AsymmetricDelimiterProcessor()).build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().nodeRendererFactory(new UpperCaseNodeRendererFactory()).build(); + + @Test + public void delimiterProcessorWithInvalidDelimiterUse() { + Parser parser = Parser.builder() + .customDelimiterProcessor(new CustomDelimiterProcessor(':', 0)) + .customDelimiterProcessor(new CustomDelimiterProcessor(';', -1)) + .build(); + assertThat(RENDERER.render(parser.parse(":test:"))).isEqualTo("<p>:test:</p>\n"); + assertThat(RENDERER.render(parser.parse(";test;"))).isEqualTo("<p>;test;</p>\n"); + } + + @Test + public void asymmetricDelimiter() { + assertRendering("{foo} bar", "<p>FOO bar</p>\n"); + assertRendering("f{oo ba}r", "<p>fOO BAr</p>\n"); + assertRendering("{{foo} bar", "<p>{FOO bar</p>\n"); + assertRendering("{foo}} bar", "<p>FOO} bar</p>\n"); + assertRendering("{{foo} bar}", "<p>FOO BAR</p>\n"); + assertRendering("{foo bar", "<p>{foo bar</p>\n"); + assertRendering("foo} bar", "<p>foo} bar</p>\n"); + assertRendering("}foo} bar", "<p>}foo} bar</p>\n"); + assertRendering("{foo{ bar", "<p>{foo{ bar</p>\n"); + assertRendering("}foo{ bar", "<p>}foo{ bar</p>\n"); + assertRendering("{} {foo}", "<p> FOO</p>\n"); + } + + @Test + public void multipleDelimitersWithDifferentLengths() { + Parser parser = Parser.builder() + .customDelimiterProcessor(new OneDelimiterProcessor()) + 
.customDelimiterProcessor(new TwoDelimiterProcessor()) + .build(); + assertThat(RENDERER.render(parser.parse("+one+ ++two++"))).isEqualTo("<p>(1)one(/1) (2)two(/2)</p>\n"); + assertThat(RENDERER.render(parser.parse("+++both+++"))).isEqualTo("<p>(1)(2)both(/2)(/1)</p>\n"); + } + + @Test + public void multipleDelimitersWithSameLengthConflict() { + assertThatThrownBy(() -> + Parser.builder() + .customDelimiterProcessor(new OneDelimiterProcessor()) + .customDelimiterProcessor(new OneDelimiterProcessor()) + .build()).isInstanceOf(IllegalArgumentException.class); + } + + @Override + protected String render(String source) { + Node node = PARSER.parse(source); + return RENDERER.render(node); + } + + private static class CustomDelimiterProcessor implements DelimiterProcessor { + + private final char delimiterChar; + private final int delimiterUse; + + private CustomDelimiterProcessor(char delimiterChar, int delimiterUse) { + this.delimiterChar = delimiterChar; + this.delimiterUse = delimiterUse; + } + + @Override + public char getOpeningCharacter() { + return delimiterChar; + } + + @Override + public char getClosingCharacter() { + return delimiterChar; + } + + @Override + public int getMinLength() { + return 1; + } + + @Override + public int process(DelimiterRun openingRun, DelimiterRun closingRun) { + return delimiterUse; + } + } + + private static class AsymmetricDelimiterProcessor implements DelimiterProcessor { + + @Override + public char getOpeningCharacter() { + return '{'; + } + + @Override + public char getClosingCharacter() { + return '}'; + } + + @Override + public int getMinLength() { + return 1; + } + + @Override + public int process(DelimiterRun openingRun, DelimiterRun closingRun) { + UpperCaseNode content = new UpperCaseNode(); + Text start = openingRun.getOpener(); + Text end = closingRun.getCloser(); + Node tmp = start.getNext(); + while (tmp != null && tmp != end) { + Node next = tmp.getNext(); + content.appendChild(tmp); + tmp = next; + } + 
start.insertAfter(content); + + return 1; + } + } + + private static class UpperCaseNode extends CustomNode { + } + + private static class UpperCaseNodeRendererFactory implements HtmlNodeRendererFactory { + + @Override + public NodeRenderer create(HtmlNodeRendererContext context) { + return new UpperCaseNodeRenderer(context); + } + } + + private static class UpperCaseNodeRenderer implements NodeRenderer { + + private final HtmlNodeRendererContext context; + + private UpperCaseNodeRenderer(HtmlNodeRendererContext context) { + this.context = context; + } + + @Override + public Set<Class<? extends Node>> getNodeTypes() { + return Set.of(UpperCaseNode.class); + } + + @Override + public void render(Node node) { + UpperCaseNode upperCaseNode = (UpperCaseNode) node; + for (Node child = upperCaseNode.getFirstChild(); child != null; child = child.getNext()) { + if (child instanceof Text) { + Text text = (Text) child; + text.setLiteral(text.getLiteral().toUpperCase(Locale.ENGLISH)); + } + context.render(child); + } + } + } + + private static class OneDelimiterProcessor implements DelimiterProcessor { + + @Override + public char getOpeningCharacter() { + return '+'; + } + + @Override + public char getClosingCharacter() { + return '+'; + } + + @Override + public int getMinLength() { + return 1; + } + + @Override + public int process(DelimiterRun openingRun, DelimiterRun closingRun) { + openingRun.getOpener().insertAfter(new Text("(1)")); + closingRun.getCloser().insertBefore(new Text("(/1)")); + return 1; + } + } + + private static class TwoDelimiterProcessor implements DelimiterProcessor { + + @Override + public char getOpeningCharacter() { + return '+'; + } + + @Override + public char getClosingCharacter() { + return '+'; + } + + @Override + public int getMinLength() { + return 2; + } + + @Override + public int process(DelimiterRun openingRun, DelimiterRun closingRun) { + openingRun.getOpener().insertAfter(new Text("(2)")); + closingRun.getCloser().insertBefore(new 
Text("(/2)")); + return 2; + } + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/FencedCodeBlockParserTest.java b/commonmark/src/test/java/org/commonmark/test/FencedCodeBlockParserTest.java new file mode 100644 index 000000000..443b0fa51 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/FencedCodeBlockParserTest.java @@ -0,0 +1,63 @@ +package org.commonmark.test; + +import org.commonmark.node.FencedCodeBlock; +import org.commonmark.node.Node; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class FencedCodeBlockParserTest extends RenderingTestCase { + + private static final Parser PARSER = Parser.builder().build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().build(); + + @Test + public void backtickInfo() { + Node document = PARSER.parse("```info ~ test\ncode\n```"); + FencedCodeBlock codeBlock = (FencedCodeBlock) document.getFirstChild(); + assertThat(codeBlock.getInfo()).isEqualTo("info ~ test"); + assertThat(codeBlock.getLiteral()).isEqualTo("code\n"); + } + + @Test + public void backtickInfoDoesntAllowBacktick() { + assertRendering("```info ` test\ncode\n```", + "<p>```info ` test\ncode</p>\n<pre><code></code></pre>\n"); + } + + @Test + public void backtickAndTildeCantBeMixed() { + assertRendering("``~`\ncode\n``~`", + "<p><code>~` code </code>~`</p>\n"); + } + + @Test + public void closingCanHaveSpacesAfter() { + assertRendering("```\ncode\n``` ", + "<pre><code>code\n</code></pre>\n"); + } + + @Test + public void closingCanNotHaveNonSpaces() { + assertRendering("```\ncode\n``` a", + "<pre><code>code\n``` a\n</code></pre>\n"); + } + + @Test + public void issue151() { + assertRendering("```\nthis code\n\nshould not have BRs or paragraphs in it\nok\n```", + "<pre><code>this code\n" + + "\n" + + "should not have 
BRs or paragraphs in it\n" + + "ok\n" + + "</code></pre>\n"); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/HeadingParserTest.java b/commonmark/src/test/java/org/commonmark/test/HeadingParserTest.java new file mode 100644 index 000000000..f7bf35a4c --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/HeadingParserTest.java @@ -0,0 +1,53 @@ +package org.commonmark.test; + +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.junit.jupiter.api.Test; + +public class HeadingParserTest extends RenderingTestCase { + + private static final Parser PARSER = Parser.builder().build(); + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().build(); + + @Test + public void atxHeadingStart() { + assertRendering("# test", "<h1>test</h1>\n"); + assertRendering("###### test", "<h6>test</h6>\n"); + assertRendering("####### test", "<p>####### test</p>\n"); + assertRendering("#test", "<p>#test</p>\n"); + assertRendering("#", "<h1></h1>\n"); + } + + @Test + public void atxHeadingTrailing() { + assertRendering("# test #", "<h1>test</h1>\n"); + assertRendering("# test ###", "<h1>test</h1>\n"); + assertRendering("# test # ", "<h1>test</h1>\n"); + assertRendering("# test ### ", "<h1>test</h1>\n"); + assertRendering("# test # #", "<h1>test #</h1>\n"); + assertRendering("# test#", "<h1>test#</h1>\n"); + } + + @Test + public void atxHeadingSurrogates() { + assertRendering("# \uD83D\uDE0A #", "<h1>\uD83D\uDE0A</h1>\n"); + } + + @Test + public void setextHeadingMarkers() { + assertRendering("test\n=", "<h1>test</h1>\n"); + assertRendering("test\n-", "<h2>test</h2>\n"); + assertRendering("test\n====", "<h1>test</h1>\n"); + assertRendering("test\n----", "<h2>test</h2>\n"); + assertRendering("test\n==== ", "<h1>test</h1>\n"); + 
assertRendering("test\n==== =", "<p>test\n==== =</p>\n"); + assertRendering("test\n=-=", "<p>test\n=-=</p>\n"); + assertRendering("test\n=a", "<p>test\n=a</p>\n"); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/HtmlInlineParserTest.java b/commonmark/src/test/java/org/commonmark/test/HtmlInlineParserTest.java new file mode 100644 index 000000000..8e1fd9790 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/HtmlInlineParserTest.java @@ -0,0 +1,34 @@ +package org.commonmark.test; + +import org.junit.jupiter.api.Test; + +public class HtmlInlineParserTest extends CoreRenderingTestCase { + + @Test + public void comment() { + assertRendering("inline <!---->", "<p>inline <!----></p>\n"); + assertRendering("inline <!-- -> -->", "<p>inline <!-- -> --></p>\n"); + assertRendering("inline <!-- -- -->", "<p>inline <!-- -- --></p>\n"); + assertRendering("inline <!-- --->", "<p>inline <!-- ---></p>\n"); + assertRendering("inline <!-- ---->", "<p>inline <!-- ----></p>\n"); + assertRendering("inline <!-->-->", "<p>inline <!-->--></p>\n"); + assertRendering("inline <!--->-->", "<p>inline <!--->--></p>\n"); + } + + @Test + public void cdata() { + assertRendering("inline <![CDATA[]]>", "<p>inline <![CDATA[]]></p>\n"); + assertRendering("inline <![CDATA[ ] ]] ]]>", "<p>inline <![CDATA[ ] ]] ]]></p>\n"); + } + + @Test + public void declaration() { + // Whitespace is mandatory + assertRendering("inline <!FOO>", "<p>inline <!FOO></p>\n"); + assertRendering("inline <!FOO >", "<p>inline <!FOO ></p>\n"); + assertRendering("inline <!FOO 'bar'>", "<p>inline <!FOO 'bar'></p>\n"); + + // Lowercase + assertRendering("inline <!foo bar>", "<p>inline <!foo bar></p>\n"); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/HtmlRendererTest.java b/commonmark/src/test/java/org/commonmark/test/HtmlRendererTest.java index e9d7aee3c..02d970949 100644 --- 
a/commonmark/src/test/java/org/commonmark/test/HtmlRendererTest.java +++ b/commonmark/src/test/java/org/commonmark/test/HtmlRendererTest.java @@ -1,107 +1,338 @@ package org.commonmark.test; -import org.commonmark.html.AttributeProvider; -import org.commonmark.html.HtmlRenderer; -import org.commonmark.node.FencedCodeBlock; -import org.commonmark.node.Node; +import org.commonmark.node.*; import org.commonmark.parser.Parser; -import org.junit.Test; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.html.*; +import org.commonmark.testutil.TestResources; +import org.junit.jupiter.api.Test; +import java.util.ArrayList; import java.util.Map; +import java.util.Set; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; -import static org.junit.Assert.assertEquals; +import static org.assertj.core.api.Assertions.assertThat; public class HtmlRendererTest { @Test public void htmlAllowingShouldNotEscapeInlineHtml() { String rendered = htmlAllowingRenderer().render(parse("paragraph with <span id='foo' class=\"bar\">inline & html</span>")); - assertEquals("<p>paragraph with <span id='foo' class=\"bar\">inline & html</span></p>\n", rendered); + assertThat(rendered).isEqualTo("<p>paragraph with <span id='foo' class=\"bar\">inline & html</span></p>\n"); } @Test public void htmlAllowingShouldNotEscapeBlockHtml() { String rendered = htmlAllowingRenderer().render(parse("<div id='foo' class=\"bar\">block &</div>")); - assertEquals("<div id='foo' class=\"bar\">block &</div>\n", rendered); + assertThat(rendered).isEqualTo("<div id='foo' class=\"bar\">block &</div>\n"); } @Test public void htmlEscapingShouldEscapeInlineHtml() { String rendered = htmlEscapingRenderer().render(parse("paragraph with <span id='foo' class=\"bar\">inline & html</span>")); // Note that & is not escaped, as it's a normal text node, not part of the inline HTML. 
- assertEquals("<p>paragraph with <span id='foo' class="bar">inline & html</span></p>\n", rendered); + assertThat(rendered).isEqualTo("<p>paragraph with <span id='foo' class="bar">inline & html</span></p>\n"); } @Test public void htmlEscapingShouldEscapeHtmlBlocks() { String rendered = htmlEscapingRenderer().render(parse("<div id='foo' class=\"bar\">block &</div>")); - assertEquals("<div id='foo' class="bar">block &amp;</div>\n", rendered); + assertThat(rendered).isEqualTo("<p><div id='foo' class="bar">block &amp;</div></p>\n"); } @Test public void textEscaping() { String rendered = defaultRenderer().render(parse("escaping: & < > \" '")); - assertEquals("<p>escaping: & < > " '</p>\n", rendered); + assertThat(rendered).isEqualTo("<p>escaping: & < > " '</p>\n"); } @Test - public void percendEncodeUrlDisabled() { - assertEquals("<p><a href=\"foo&bar\">a</a></p>\n", defaultRenderer().render(parse("[a](foo&bar)"))); - assertEquals("<p><a href=\"ä\">a</a></p>\n", defaultRenderer().render(parse("[a](ä)"))); - assertEquals("<p><a href=\"foo%20bar\">a</a></p>\n", defaultRenderer().render(parse("[a](foo%20bar)"))); + public void characterReferencesWithoutSemicolonsShouldNotBeParsedShouldBeEscaped() { + String input = "[example](javascript:alert('XSS'))"; + String rendered = defaultRenderer().render(parse(input)); + assertThat(rendered).isEqualTo("<p><a href=\"&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29\">example</a></p>\n"); + } + + @Test + public void attributeEscaping() { + Paragraph paragraph = new Paragraph(); + Link link = new Link(); + link.setDestination(":"); + paragraph.appendChild(link); + assertThat(defaultRenderer().render(paragraph)).isEqualTo("<p><a href=\"&colon;\"></a></p>\n"); + } + + @Test + public void rawUrlsShouldNotFilterDangerousProtocols() { + Paragraph paragraph = new Paragraph(); + Link link = new Link(); + link.setDestination("javascript:alert(5);"); + 
paragraph.appendChild(link); + assertThat(rawUrlsRenderer().render(paragraph)).isEqualTo("<p><a href=\"javascript:alert(5);\"></a></p>\n"); + } + + @Test + public void sanitizedUrlsShouldSetRelNoFollow() { + Paragraph paragraph = new Paragraph(); + Link link = new Link(); + link.setDestination("/exampleUrl"); + paragraph.appendChild(link); + assertThat(sanitizeUrlsRenderer().render(paragraph)).isEqualTo("<p><a rel=\"nofollow\" href=\"/exampleUrl\"></a></p>\n"); + + paragraph = new Paragraph(); + link = new Link(); + link.setDestination("https://google.com"); + paragraph.appendChild(link); + assertThat(sanitizeUrlsRenderer().render(paragraph)).isEqualTo("<p><a rel=\"nofollow\" href=\"https://google.com\"></a></p>\n"); + } + + @Test + public void sanitizedUrlsShouldAllowSafeProtocols() { + Paragraph paragraph = new Paragraph(); + Link link = new Link(); + link.setDestination("http://google.com"); + paragraph.appendChild(link); + assertThat(sanitizeUrlsRenderer().render(paragraph)).isEqualTo("<p><a rel=\"nofollow\" href=\"http://google.com\"></a></p>\n"); + + paragraph = new Paragraph(); + link = new Link(); + link.setDestination("https://google.com"); + paragraph.appendChild(link); + assertThat(sanitizeUrlsRenderer().render(paragraph)).isEqualTo("<p><a rel=\"nofollow\" href=\"https://google.com\"></a></p>\n"); + + paragraph = new Paragraph(); + link = new Link(); + link.setDestination("mailto:foo@bar.example.com"); + paragraph.appendChild(link); + assertThat(sanitizeUrlsRenderer().render(paragraph)).isEqualTo("<p><a rel=\"nofollow\" href=\"mailto:foo@bar.example.com\"></a></p>\n"); + + String image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAAFiUAABYlAUlSJPAAAAAQSURBVBhXY/iPBVBf8P9/AG8TY51nJdgkAAAAAElFTkSuQmCC"; + paragraph = new Paragraph(); + link = new Link(); + link.setDestination(image); + paragraph.appendChild(link); + 
assertThat(sanitizeUrlsRenderer().render(paragraph)).isEqualTo("<p><a rel=\"nofollow\" href=\"" + image + "\"></a></p>\n"); + } + + @Test + public void sanitizedUrlsShouldFilterDangerousProtocols() { + Paragraph paragraph = new Paragraph(); + Link link = new Link(); + link.setDestination("javascript:alert(5);"); + paragraph.appendChild(link); + assertThat(sanitizeUrlsRenderer().render(paragraph)).isEqualTo("<p><a rel=\"nofollow\" href=\"\"></a></p>\n"); + + paragraph = new Paragraph(); + link = new Link(); + link.setDestination("ftp://google.com"); + paragraph.appendChild(link); + assertThat(sanitizeUrlsRenderer().render(paragraph)).isEqualTo("<p><a rel=\"nofollow\" href=\"\"></a></p>\n"); + } + + @Test + public void percentEncodeUrlDisabled() { + assertThat(defaultRenderer().render(parse("[a](foo&bar)"))).isEqualTo("<p><a href=\"foo&bar\">a</a></p>\n"); + assertThat(defaultRenderer().render(parse("[a](ä)"))).isEqualTo("<p><a href=\"ä\">a</a></p>\n"); + assertThat(defaultRenderer().render(parse("[a](foo%20bar)"))).isEqualTo("<p><a href=\"foo%20bar\">a</a></p>\n"); } @Test public void percentEncodeUrl() { // Entities are escaped anyway - assertEquals("<p><a href=\"foo&bar\">a</a></p>\n", percentEncodingRenderer().render(parse("[a](foo&bar)"))); + assertThat(percentEncodingRenderer().render(parse("[a](foo&bar)"))).isEqualTo("<p><a href=\"foo&bar\">a</a></p>\n"); // Existing encoding is preserved - assertEquals("<p><a href=\"foo%20bar\">a</a></p>\n", percentEncodingRenderer().render(parse("[a](foo%20bar)"))); - assertEquals("<p><a href=\"foo%61\">a</a></p>\n", percentEncodingRenderer().render(parse("[a](foo%61)"))); + assertThat(percentEncodingRenderer().render(parse("[a](foo%20bar)"))).isEqualTo("<p><a href=\"foo%20bar\">a</a></p>\n"); + assertThat(percentEncodingRenderer().render(parse("[a](foo%61)"))).isEqualTo("<p><a href=\"foo%61\">a</a></p>\n"); // Invalid encoding is escaped - assertEquals("<p><a href=\"foo%25\">a</a></p>\n", 
percentEncodingRenderer().render(parse("[a](foo%)"))); - assertEquals("<p><a href=\"foo%25a\">a</a></p>\n", percentEncodingRenderer().render(parse("[a](foo%a)"))); - assertEquals("<p><a href=\"foo%25a_\">a</a></p>\n", percentEncodingRenderer().render(parse("[a](foo%a_)"))); - assertEquals("<p><a href=\"foo%25xx\">a</a></p>\n", percentEncodingRenderer().render(parse("[a](foo%xx)"))); + assertThat(percentEncodingRenderer().render(parse("[a](foo%)"))).isEqualTo("<p><a href=\"foo%25\">a</a></p>\n"); + assertThat(percentEncodingRenderer().render(parse("[a](foo%a)"))).isEqualTo("<p><a href=\"foo%25a\">a</a></p>\n"); + assertThat(percentEncodingRenderer().render(parse("[a](foo%a_)"))).isEqualTo("<p><a href=\"foo%25a_\">a</a></p>\n"); + assertThat(percentEncodingRenderer().render(parse("[a](foo%xx)"))).isEqualTo("<p><a href=\"foo%25xx\">a</a></p>\n"); // Reserved characters are preserved, except for '[' and ']' - assertEquals("<p><a href=\"!*'();:@&=+$,/?#%5B%5D\">a</a></p>\n", percentEncodingRenderer().render(parse("[a](!*'();:@&=+$,/?#[])"))); + assertThat(percentEncodingRenderer().render(parse("[a](!*'();:@&=+$,/?#[])"))).isEqualTo("<p><a href=\"!*'();:@&=+$,/?#%5B%5D\">a</a></p>\n"); // Unreserved characters are preserved - assertEquals("<p><a href=\"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~\">a</a></p>\n", - percentEncodingRenderer().render(parse("[a](ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~)"))); + assertThat(percentEncodingRenderer().render(parse("[a](ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~)"))).isEqualTo("<p><a href=\"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~\">a</a></p>\n"); // Other characters are percent-encoded (LATIN SMALL LETTER A WITH DIAERESIS) - assertEquals("<p><a href=\"%C3%A4\">a</a></p>\n", - percentEncodingRenderer().render(parse("[a](ä)"))); + assertThat(percentEncodingRenderer().render(parse("[a](ä)"))).isEqualTo("<p><a 
href=\"%C3%A4\">a</a></p>\n"); // Other characters are percent-encoded (MUSICAL SYMBOL G CLEF, surrogate pair in UTF-16) - assertEquals("<p><a href=\"%F0%9D%84%9E\">a</a></p>\n", - percentEncodingRenderer().render(parse("[a](\uD834\uDD1E)"))); + assertThat(percentEncodingRenderer().render(parse("[a](\uD834\uDD1E)"))).isEqualTo("<p><a href=\"%F0%9D%84%9E\">a</a></p>\n"); } @Test - public void attributeProvider() { - AttributeProvider custom = new AttributeProvider() { + public void attributeProviderForCodeBlock() { + AttributeProviderFactory custom = new AttributeProviderFactory() { @Override - public void setAttributes(Node node, Map<String, String> attributes) { - if (node instanceof FencedCodeBlock) { - FencedCodeBlock fencedCodeBlock = (FencedCodeBlock) node; - // Remove the default attribute for info - attributes.remove("class"); - // Put info in custom attribute instead - attributes.put("data-custom", fencedCodeBlock.getInfo()); - } + public AttributeProvider create(AttributeProviderContext context) { + return new AttributeProvider() { + @Override + public void setAttributes(Node node, String tagName, Map<String, String> attributes) { + if (node instanceof FencedCodeBlock && tagName.equals("code")) { + FencedCodeBlock fencedCodeBlock = (FencedCodeBlock) node; + // Remove the default attribute for info + attributes.remove("class"); + // Put info in custom attribute instead + attributes.put("data-custom", fencedCodeBlock.getInfo()); + } else if (node instanceof FencedCodeBlock && tagName.equals("pre")) { + attributes.put("data-code-block", "fenced"); + } + } + }; } }; - HtmlRenderer renderer = HtmlRenderer.builder().attributeProvider(custom).build(); + HtmlRenderer renderer = HtmlRenderer.builder().attributeProviderFactory(custom).build(); String rendered = renderer.render(parse("```info\ncontent\n```")); - assertEquals("<pre><code data-custom=\"info\">content\n</code></pre>\n", rendered); + assertThat(rendered).isEqualTo("<pre data-code-block=\"fenced\"><code 
data-custom=\"info\">content\n</code></pre>\n"); String rendered2 = renderer.render(parse("```evil\"\ncontent\n```")); - assertEquals("<pre><code data-custom=\"evil"\">content\n</code></pre>\n", rendered2); + assertThat(rendered2).isEqualTo("<pre data-code-block=\"fenced\"><code data-custom=\"evil"\">content\n</code></pre>\n"); + } + + @Test + public void attributeProviderForImage() { + AttributeProviderFactory custom = new AttributeProviderFactory() { + @Override + public AttributeProvider create(AttributeProviderContext context) { + return new AttributeProvider() { + @Override + public void setAttributes(Node node, String tagName, Map<String, String> attributes) { + if (node instanceof Image) { + attributes.remove("alt"); + attributes.put("test", "hey"); + } + } + }; + } + }; + + HtmlRenderer renderer = HtmlRenderer.builder().attributeProviderFactory(custom).build(); + String rendered = renderer.render(parse("![foo](/url)\n")); + assertThat(rendered).isEqualTo("<p><img src=\"/url\" test=\"hey\" /></p>\n"); + } + + @Test + public void attributeProviderFactoryNewInstanceForEachRender() { + AttributeProviderFactory factory = new AttributeProviderFactory() { + @Override + public AttributeProvider create(AttributeProviderContext context) { + return new AttributeProvider() { + int i = 0; + + @Override + public void setAttributes(Node node, String tagName, Map<String, String> attributes) { + attributes.put("key", "" + i); + i++; + } + }; + } + }; + + HtmlRenderer renderer = HtmlRenderer.builder().attributeProviderFactory(factory).build(); + String rendered = renderer.render(parse("text node")); + String secondPass = renderer.render(parse("text node")); + assertThat(secondPass).isEqualTo(rendered); + } + + @Test + public void overrideNodeRender() { + HtmlNodeRendererFactory nodeRendererFactory = new HtmlNodeRendererFactory() { + @Override + public NodeRenderer create(final HtmlNodeRendererContext context) { + return new NodeRenderer() { + @Override + public Set<Class<? 
extends Node>> getNodeTypes() { + return Set.of(Link.class); + } + + @Override + public void render(Node node) { + context.getWriter().text("test"); + } + }; + } + }; + + HtmlRenderer renderer = HtmlRenderer.builder().nodeRendererFactory(nodeRendererFactory).build(); + String rendered = renderer.render(parse("foo [bar](/url)")); + assertThat(rendered).isEqualTo("<p>foo test</p>\n"); } @Test public void orderedListStartZero() { - assertEquals("<ol start=\"0\">\n<li>Test</li>\n</ol>\n", defaultRenderer().render(parse("0. Test\n"))); + assertThat(defaultRenderer().render(parse("0. Test\n"))).isEqualTo("<ol start=\"0\">\n<li>Test</li>\n</ol>\n"); + } + + @Test + public void imageAltTextWithSoftLineBreak() { + assertThat(defaultRenderer().render(parse("![foo\nbar](/url)\n"))).isEqualTo("<p><img src=\"/url\" alt=\"foo\nbar\" /></p>\n"); + } + + @Test + public void imageAltTextWithHardLineBreak() { + assertThat(defaultRenderer().render(parse("![foo \nbar](/url)\n"))).isEqualTo("<p><img src=\"/url\" alt=\"foo\nbar\" /></p>\n"); + } + + @Test + public void imageAltTextWithEntities() { + assertThat(defaultRenderer().render(parse("![foo ä](/url)\n"))).isEqualTo("<p><img src=\"/url\" alt=\"foo \u00E4\" /></p>\n"); + } + + @Test + public void imageAltTextWithInlines() { + assertThat(defaultRenderer().render(parse("![_foo_ **bar** [link](/url)](/url)\n"))).isEqualTo("<p><img src=\"/url\" alt=\"foo bar link\" /></p>\n"); + } + + @Test + public void imageAltTextWithCode() { + assertThat(defaultRenderer().render(parse("![`foo` bar](/url)\n"))).isEqualTo("<p><img src=\"/url\" alt=\"foo bar\" /></p>\n"); + } + + @Test + public void canRenderContentsOfSingleParagraph() { + Node paragraphs = parse("Here I have a test [link](http://www.google.com)"); + Node paragraph = paragraphs.getFirstChild(); + + Document document = new Document(); + Node child = paragraph.getFirstChild(); + while (child != null) { + Node current = child; + child = current.getNext(); + + 
document.appendChild(current); + } + + assertThat(defaultRenderer().render(document)).isEqualTo("Here I have a test <a href=\"http://www.google.com\">link</a>"); + } + + @Test + public void omitSingleParagraphP() { + var renderer = HtmlRenderer.builder().omitSingleParagraphP(true).build(); + assertThat(renderer.render(parse("hi *there*"))).isEqualTo("hi <em>there</em>"); + } + + @Test + public void threading() throws Exception { + var parser = Parser.builder().build(); + var spec = TestResources.readAsString(TestResources.getSpec()); + var document = parser.parse(spec); + + var htmlRenderer = HtmlRenderer.builder().build(); + var expectedRendering = htmlRenderer.render(document); + + // Render in parallel using the same HtmlRenderer instance. + var futures = new ArrayList<Future<String>>(); + var executorService = Executors.newFixedThreadPool(4); + for (int i = 0; i < 40; i++) { + var future = executorService.submit(() -> htmlRenderer.render(document)); + futures.add(future); + } + + for (var future : futures) { + var rendering = future.get(); + assertThat(rendering).isEqualTo(expectedRendering); + } } private static HtmlRenderer defaultRenderer() { @@ -112,6 +343,14 @@ private static HtmlRenderer htmlAllowingRenderer() { return HtmlRenderer.builder().escapeHtml(false).build(); } + private static HtmlRenderer sanitizeUrlsRenderer() { + return HtmlRenderer.builder().sanitizeUrls(true).urlSanitizer(new DefaultUrlSanitizer()).build(); + } + + private static HtmlRenderer rawUrlsRenderer() { + return HtmlRenderer.builder().sanitizeUrls(false).build(); + } + private static HtmlRenderer htmlEscapingRenderer() { return HtmlRenderer.builder().escapeHtml(true).build(); } diff --git a/commonmark/src/test/java/org/commonmark/test/InlineParserContextTest.java b/commonmark/src/test/java/org/commonmark/test/InlineParserContextTest.java new file mode 100644 index 000000000..c05cac2d2 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/InlineParserContextTest.java @@ 
-0,0 +1,81 @@ +package org.commonmark.test; + +import org.commonmark.internal.InlineParserImpl; +import org.commonmark.parser.beta.LinkProcessor; +import org.commonmark.parser.beta.InlineContentParserFactory; +import org.commonmark.node.LinkReferenceDefinition; +import org.commonmark.parser.InlineParser; +import org.commonmark.parser.InlineParserContext; +import org.commonmark.parser.InlineParserFactory; +import org.commonmark.parser.Parser; +import org.commonmark.parser.delimiter.DelimiterProcessor; +import org.commonmark.renderer.html.HtmlRenderer; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; + +public class InlineParserContextTest { + + @Test + public void labelShouldBeOriginalNotNormalized() { + CapturingInlineParserFactory inlineParserFactory = new CapturingInlineParserFactory(); + + Parser parser = Parser.builder().inlineParserFactory(inlineParserFactory).build(); + String input = "[link with special label][FooBarBaz]\n\n[foobarbaz]: /url"; + + String rendered = HtmlRenderer.builder().build().render(parser.parse(input)); + + // Lookup should pass original label to context + assertThat(inlineParserFactory.lookups).isEqualTo(List.of("FooBarBaz")); + + // Context should normalize label for finding reference + assertThat(rendered).isEqualTo("<p><a href=\"/url\">link with special label</a></p>\n"); + } + + static class CapturingInlineParserFactory implements InlineParserFactory { + + private List<String> lookups = new ArrayList<>(); + + @Override + public InlineParser create(final InlineParserContext inlineParserContext) { + InlineParserContext wrappedContext = new InlineParserContext() { + @Override + public List<InlineContentParserFactory> getCustomInlineContentParserFactories() { + return inlineParserContext.getCustomInlineContentParserFactories(); + } + + @Override + public List<DelimiterProcessor> getCustomDelimiterProcessors() { + 
return inlineParserContext.getCustomDelimiterProcessors(); + } + + @Override + public List<LinkProcessor> getCustomLinkProcessors() { + return inlineParserContext.getCustomLinkProcessors(); + } + + @Override + public Set<Character> getCustomLinkMarkers() { + return inlineParserContext.getCustomLinkMarkers(); + } + + @Override + public LinkReferenceDefinition getLinkReferenceDefinition(String label) { + return getDefinition(LinkReferenceDefinition.class, label); + } + + @Override + public <D> D getDefinition(Class<D> type, String label) { + lookups.add(label); + return inlineParserContext.getDefinition(type, label); + } + }; + + return new InlineParserImpl(wrappedContext); + } + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/LinkReferenceDefinitionNodeTest.java b/commonmark/src/test/java/org/commonmark/test/LinkReferenceDefinitionNodeTest.java new file mode 100644 index 000000000..8410ff028 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/LinkReferenceDefinitionNodeTest.java @@ -0,0 +1,127 @@ +package org.commonmark.test; + +import org.commonmark.node.*; +import org.commonmark.parser.Parser; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class LinkReferenceDefinitionNodeTest { + + @Test + public void testDefinitionWithoutParagraph() { + Node document = parse("This is a paragraph with a [foo] link.\n\n[foo]: /url 'title'"); + List<Node> nodes = Nodes.getChildren(document); + + assertThat(nodes).hasSize(2); + assertThat(nodes.get(0)).isInstanceOf(Paragraph.class); + LinkReferenceDefinition definition = assertDef(nodes.get(1), "foo"); + + assertThat(definition.getDestination()).isEqualTo("/url"); + assertThat(definition.getTitle()).isEqualTo("title"); + } + + @Test + public void testDefinitionWithParagraph() { + Node document = parse("[foo]: /url\nThis is a paragraph with a [foo] link."); + List<Node> nodes = Nodes.getChildren(document); + + 
assertThat(nodes).hasSize(2); + // Note that definition is not part of the paragraph, it's a sibling + assertThat(nodes.get(0)).isInstanceOf(LinkReferenceDefinition.class); + assertThat(nodes.get(1)).isInstanceOf(Paragraph.class); + } + + @Test + public void testMultipleDefinitions() { + Node document = parse("This is a paragraph with a [foo] link.\n\n[foo]: /url\n[bar]: /url"); + List<Node> nodes = Nodes.getChildren(document); + + assertThat(nodes).hasSize(3); + assertThat(nodes.get(0)).isInstanceOf(Paragraph.class); + assertDef(nodes.get(1), "foo"); + assertDef(nodes.get(2), "bar"); + } + + @Test + public void testMultipleDefinitionsWithSameLabel() { + Node document = parse("This is a paragraph with a [foo] link.\n\n[foo]: /url1\n[foo]: /url2"); + List<Node> nodes = Nodes.getChildren(document); + + assertThat(nodes).hasSize(3); + assertThat(nodes.get(0)).isInstanceOf(Paragraph.class); + LinkReferenceDefinition def1 = assertDef(nodes.get(1), "foo"); + assertThat(def1.getDestination()).isEqualTo("/url1"); + // When there's multiple definitions with the same label, the first one "wins", as in reference links will use + // that. But we still want to preserve the original definitions in the document. 
+ LinkReferenceDefinition def2 = assertDef(nodes.get(2), "foo"); + assertThat(def2.getDestination()).isEqualTo("/url2"); + } + + @Test + public void testDefinitionOfReplacedBlock() { + Node document = parse("[foo]: /url\nHeading\n======="); + List<Node> nodes = Nodes.getChildren(document); + + assertThat(nodes).hasSize(2); + assertDef(nodes.get(0), "foo"); + assertThat(nodes.get(1)).isInstanceOf(Heading.class); + } + + @Test + public void testDefinitionInListItem() { + Node document = parse("* [foo]: /url\n [foo]\n"); + assertThat(document.getFirstChild()).isInstanceOf(BulletList.class); + Node item = document.getFirstChild().getFirstChild(); + assertThat(item).isInstanceOf(ListItem.class); + + List<Node> nodes = Nodes.getChildren(item); + assertThat(nodes).hasSize(2); + assertDef(nodes.get(0), "foo"); + assertThat(nodes.get(1)).isInstanceOf(Paragraph.class); + } + + @Test + public void testDefinitionInListItem2() { + Node document = parse("* [foo]: /url\n* [foo]\n"); + assertThat(document.getFirstChild()).isInstanceOf(BulletList.class); + + List<Node> items = Nodes.getChildren(document.getFirstChild()); + assertThat(items).hasSize(2); + Node item1 = items.get(0); + Node item2 = items.get(1); + + assertThat(item1).isInstanceOf(ListItem.class); + assertThat(item2).isInstanceOf(ListItem.class); + + assertThat(Nodes.getChildren(item1)).hasSize(1); + assertDef(item1.getFirstChild(), "foo"); + + assertThat(Nodes.getChildren(item2)).hasSize(1); + assertThat(item2.getFirstChild()).isInstanceOf(Paragraph.class); + } + + @Test + public void testDefinitionLabelCaseIsPreserved() { + Node document = parse("This is a paragraph with a [foo] link.\n\n[fOo]: /url 'title'"); + List<Node> nodes = Nodes.getChildren(document); + + assertThat(nodes).hasSize(2); + assertThat(nodes.get(0)).isInstanceOf(Paragraph.class); + assertDef(nodes.get(1), "fOo"); + } + + private static Node parse(String input) { + Parser parser = Parser.builder().build(); + return parser.parse(input); + } + + 
private static LinkReferenceDefinition assertDef(Node node, String label) { + assertThat(node).isInstanceOf(LinkReferenceDefinition.class); + LinkReferenceDefinition def = (LinkReferenceDefinition) node; + assertThat(def.getLabel()).isEqualTo(label); + return def; + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/ListBlockParserTest.java b/commonmark/src/test/java/org/commonmark/test/ListBlockParserTest.java new file mode 100644 index 000000000..02ac3abff --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/ListBlockParserTest.java @@ -0,0 +1,65 @@ +package org.commonmark.test; + +import org.commonmark.node.ListItem; +import org.commonmark.node.Node; +import org.commonmark.parser.Parser; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class ListBlockParserTest { + + private static final Parser PARSER = Parser.builder().build(); + + @Test + public void testBulletListIndents() { + assertListItemIndents("* foo", 0, 2); + assertListItemIndents(" * foo", 1, 3); + assertListItemIndents(" * foo", 2, 4); + assertListItemIndents(" * foo", 3, 5); + + assertListItemIndents("* foo", 0, 3); + assertListItemIndents("* foo", 0, 4); + assertListItemIndents("* foo", 0, 5); + assertListItemIndents(" * foo", 1, 4); + assertListItemIndents(" * foo", 3, 8); + + // The indent is relative to any containing blocks + assertListItemIndents("> * foo", 0, 2); + assertListItemIndents("> * foo", 1, 3); + assertListItemIndents("> * foo", 1, 4); + + // Tab counts as 3 spaces here (to the next tab stop column of 4) -> content indent is 1+3 + assertListItemIndents("*\tfoo", 0, 4); + + // Empty list, content indent is expected to be 2 + assertListItemIndents("-\n", 0, 2); + } + + @Test + public void testOrderedListIndents() { + assertListItemIndents("1. foo", 0, 3); + assertListItemIndents(" 1. foo", 1, 4); + assertListItemIndents(" 1. foo", 2, 5); + assertListItemIndents(" 1. 
foo", 3, 6); + + assertListItemIndents("1. foo", 0, 4); + assertListItemIndents("1. foo", 0, 5); + assertListItemIndents("1. foo", 0, 6); + assertListItemIndents(" 1. foo", 1, 5); + assertListItemIndents(" 1. foo", 2, 8); + + assertListItemIndents("> 1. foo", 0, 3); + assertListItemIndents("> 1. foo", 1, 4); + assertListItemIndents("> 1. foo", 1, 5); + + assertListItemIndents("1.\tfoo", 0, 4); + } + + private void assertListItemIndents(String input, int expectedMarkerIndent, int expectedContentIndent) { + Node doc = PARSER.parse(input); + ListItem listItem = Nodes.find(doc, ListItem.class); + assertThat((int) listItem.getMarkerIndent()).isEqualTo(expectedMarkerIndent); + assertThat((int) listItem.getContentIndent()).isEqualTo(expectedContentIndent); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/ListTightLooseTest.java b/commonmark/src/test/java/org/commonmark/test/ListTightLooseTest.java new file mode 100644 index 000000000..c6bda31ed --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/ListTightLooseTest.java @@ -0,0 +1,193 @@ +package org.commonmark.test; + +import org.junit.jupiter.api.Test; + +public class ListTightLooseTest extends CoreRenderingTestCase { + + @Test + public void tight() { + assertRendering("- foo\n" + + "- bar\n" + + "+ baz\n", + "<ul>\n" + + "<li>foo</li>\n" + + "<li>bar</li>\n" + + "</ul>\n" + + "<ul>\n" + + "<li>baz</li>\n" + + "</ul>\n"); + } + + @Test + public void loose() { + assertRendering("- foo\n" + + "\n" + + "- bar\n" + + "\n" + + "\n" + + "- baz\n", + "<ul>\n" + + "<li>\n" + + "<p>foo</p>\n" + + "</li>\n" + + "<li>\n" + + "<p>bar</p>\n" + + "</li>\n" + + "<li>\n" + + "<p>baz</p>\n" + + "</li>\n" + + "</ul>\n"); + } + + @Test + public void looseNested() { + assertRendering("- foo\n" + + " - bar\n" + + "\n" + + "\n" + + " baz", + "<ul>\n" + + "<li>foo\n" + + "<ul>\n" + + "<li>\n" + + "<p>bar</p>\n" + + "<p>baz</p>\n" + + "</li>\n" + + "</ul>\n" + + "</li>\n" + + "</ul>\n"); + } + + @Test + public void 
looseNested2() { + assertRendering("- a\n" + + " - b\n" + + "\n" + + " c\n" + + "- d\n", + "<ul>\n" + + "<li>a\n" + + "<ul>\n" + + "<li>\n" + + "<p>b</p>\n" + + "<p>c</p>\n" + + "</li>\n" + + "</ul>\n" + + "</li>\n" + + "<li>d</li>\n" + + "</ul>\n"); + } + + @Test + public void looseOuter() { + assertRendering("- foo\n" + + " - bar\n" + + "\n" + + "\n" + + " baz", + "<ul>\n" + + "<li>\n" + + "<p>foo</p>\n" + + "<ul>\n" + + "<li>bar</li>\n" + + "</ul>\n" + + "<p>baz</p>\n" + + "</li>\n" + + "</ul>\n"); + } + + @Test + public void looseListItem() { + assertRendering("- one\n" + + "\n" + + " two\n", + "<ul>\n" + + "<li>\n" + + "<p>one</p>\n" + + "<p>two</p>\n" + + "</li>\n" + + "</ul>\n"); + } + + @Test + public void tightWithBlankLineAfter() { + assertRendering("- foo\n" + + "- bar\n" + + "\n", + "<ul>\n" + + "<li>foo</li>\n" + + "<li>bar</li>\n" + + "</ul>\n"); + } + + @Test + public void tightListWithCodeBlock() { + assertRendering("- a\n" + + "- ```\n" + + " b\n" + + "\n" + + "\n" + + " ```\n" + + "- c\n", + "<ul>\n" + + "<li>a</li>\n" + + "<li>\n" + + "<pre><code>b\n" + + "\n" + + "\n" + + "</code></pre>\n" + + "</li>\n" + + "<li>c</li>\n" + + "</ul>\n"); + } + + @Test + public void tightListWithCodeBlock2() { + assertRendering("* foo\n" + + " ```\n" + + " bar\n" + + "\n" + + " ```\n" + + " baz\n", + "<ul>\n" + + "<li>foo\n" + + "<pre><code>bar\n" + + "\n" + + "</code></pre>\n" + + "baz</li>\n" + + "</ul>\n"); + } + + @Test + public void looseEmptyListItem() { + assertRendering("* a\n" + + "*\n" + + "\n" + + "* c", + "<ul>\n" + + "<li>\n" + + "<p>a</p>\n" + + "</li>\n" + + "<li></li>\n" + + "<li>\n" + + "<p>c</p>\n" + + "</li>\n" + + "</ul>\n"); + } + + @Test + public void looseBlankLineAfterCodeBlock() { + assertRendering("1. 
```\n" + + " foo\n" + + " ```\n" + + "\n" + + " bar", + "<ol>\n" + + "<li>\n" + + "<pre><code>foo\n" + + "</code></pre>\n" + + "<p>bar</p>\n" + + "</li>\n" + + "</ol>\n"); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/Nodes.java b/commonmark/src/test/java/org/commonmark/test/Nodes.java new file mode 100644 index 000000000..06d04fde6 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/Nodes.java @@ -0,0 +1,50 @@ +package org.commonmark.test; + +import org.commonmark.node.Node; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +public class Nodes { + + public static List<Node> getChildren(Node parent) { + List<Node> children = new ArrayList<>(); + for (Node child = parent.getFirstChild(); child != null; child = child.getNext()) { + children.add(child); + } + return children; + } + + /** + * Recursively try to find a node with the given type within the children of the specified node. + * + * @param parent The node to get children from (node itself will not be checked) + * @param nodeClass The type of node to find + */ + public static <T> T tryFind(Node parent, Class<T> nodeClass) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); + if (nodeClass.isInstance(node)) { + //noinspection unchecked + return (T) node; + } + T result = tryFind(node, nodeClass); + if (result != null) { + return result; + } + node = next; + } + return null; + } + + /** + * Recursively try to find a node with the given type within the children of the specified node. Throw if node + * could not be found. 
+ */ + public static <T> T find(Node parent, Class<T> nodeClass) { + return Objects.requireNonNull(tryFind(parent, nodeClass), + "Could not find a " + nodeClass.getSimpleName() + " node in " + parent); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/ParserTest.java b/commonmark/src/test/java/org/commonmark/test/ParserTest.java index cc2dda1fc..337196c56 100644 --- a/commonmark/src/test/java/org/commonmark/test/ParserTest.java +++ b/commonmark/src/test/java/org/commonmark/test/ParserTest.java @@ -1,37 +1,253 @@ package org.commonmark.test; +import org.commonmark.node.*; +import org.commonmark.parser.*; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.renderer.markdown.MarkdownRenderer; +import org.commonmark.testutil.TestResources; +import org.junit.jupiter.api.Test; + import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import org.commonmark.html.HtmlRenderer; -import org.commonmark.node.Node; -import org.commonmark.parser.Parser; -import static org.junit.Assert.assertEquals; -import org.junit.Test; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; public class ParserTest { + @Test public void ioReaderTest() throws IOException { Parser parser = Parser.builder().build(); - - InputStream input1 = ParserTest.class.getResourceAsStream("/spec.txt"); + + InputStream input1 = TestResources.getSpec().openStream(); Node document1; - try (InputStreamReader reader = new InputStreamReader(input1)) { + try (InputStreamReader reader = new InputStreamReader(input1, StandardCharsets.UTF_8)) { document1 = parser.parseReader(reader); } - - InputStream input2 = 
ParserTest.class.getResourceAsStream("/spec.txt"); - StringBuilder sb = new StringBuilder(); - try (InputStreamReader reader = new InputStreamReader(input2)) { - int ch; - while ((ch = reader.read()) != -1){ - sb.append((char)ch); + + String spec = TestResources.readAsString(TestResources.getSpec()); + Node document2 = parser.parse(spec); + + HtmlRenderer renderer = HtmlRenderer.builder().escapeHtml(true).build(); + assertThat(renderer.render(document1)).isEqualTo(renderer.render(document2)); + } + + @Test + public void enabledBlockTypes() { + String given = "# heading 1\n\nnot a heading"; + + Parser parser = Parser.builder().build(); // all core parsers by default + Node document = parser.parse(given); + assertThat(document.getFirstChild()).isInstanceOf(Heading.class); + + Set<Class<? extends Block>> headersOnly = new HashSet<>(); + headersOnly.add(Heading.class); + parser = Parser.builder().enabledBlockTypes(headersOnly).build(); + document = parser.parse(given); + assertThat(document.getFirstChild()).isInstanceOf(Heading.class); + + Set<Class<? 
extends Block>> noCoreTypes = new HashSet<>(); + parser = Parser.builder().enabledBlockTypes(noCoreTypes).build(); + document = parser.parse(given); + assertThat(document.getFirstChild()).isNotInstanceOf(Heading.class); + } + + @Test + public void enabledBlockTypesThrowsWhenGivenUnknownClass() { + // BulletList can't be enabled separately at the moment, only all ListBlock types + assertThatThrownBy(() -> + Parser.builder().enabledBlockTypes(Set.of(Heading.class, BulletList.class)).build()).isInstanceOf(IllegalArgumentException.class); + } + + @Test + public void indentation() { + String given = " - 1 space\n - 3 spaces\n - 5 spaces\n\t - tab + space"; + Parser parser = Parser.builder().build(); + Node document = parser.parse(given); + + assertThat(document.getFirstChild()).isInstanceOf(BulletList.class); + + Node list = document.getFirstChild(); // first level list + assertThat(list.getLastChild()).as("expect one child").isEqualTo(list.getFirstChild()); + assertThat(firstText(list.getFirstChild())).isEqualTo("1 space"); + + list = list.getFirstChild().getLastChild(); // second level list + assertThat(list.getLastChild()).as("expect one child").isEqualTo(list.getFirstChild()); + assertThat(firstText(list.getFirstChild())).isEqualTo("3 spaces"); + + list = list.getFirstChild().getLastChild(); // third level list + assertThat(firstText(list.getFirstChild())).isEqualTo("5 spaces"); + assertThat(firstText(list.getFirstChild().getNext())).isEqualTo("tab + space"); + } + + @Test + public void inlineParser() { + final InlineParser fakeInlineParser = new InlineParser() { + @Override + public void parse(SourceLines lines, Node node) { + node.appendChild(new ThematicBreak()); } + }; + + InlineParserFactory fakeInlineParserFactory = new InlineParserFactory() { + + @Override + public InlineParser create(InlineParserContext inlineParserContext) { + return fakeInlineParser; + } + }; + + Parser parser = Parser.builder().inlineParserFactory(fakeInlineParserFactory).build(); + 
String input = "**bold** **bold** ~~strikethrough~~"; + + assertThat(parser.parse(input).getFirstChild().getFirstChild()).isInstanceOf(ThematicBreak.class); + } + + @Test + public void threading() throws Exception { + var parser = Parser.builder().build(); + var spec = TestResources.readAsString(TestResources.getSpec()); + + var renderer = HtmlRenderer.builder().build(); + var expectedRendering = renderer.render(parser.parse(spec)); + + // Parse in parallel using the same Parser instance. + var futures = new ArrayList<Future<Node>>(); + var executorService = Executors.newFixedThreadPool(4); + for (int i = 0; i < 40; i++) { + var future = executorService.submit(() -> parser.parse(spec)); + futures.add(future); } - - Node document2 = parser.parse(sb.toString()); - - HtmlRenderer renderer = HtmlRenderer.builder().escapeHtml(true).build(); - assertEquals(renderer.render(document2), renderer.render(document1)); + + for (var future : futures) { + var node = future.get(); + assertThat(renderer.render(node)).isEqualTo(expectedRendering); + } + } + + @Test + public void maxOpenBlockParsersMustBeZeroOrGreater() { + assertThatThrownBy(() -> + Parser.builder().maxOpenBlockParsers(-1)).isInstanceOf(IllegalArgumentException.class); + } + + @Test + public void maxOpenBlockParsersIsOptIn() { + var parser = Parser.builder().build(); + + var document = parser.parse(alternatingNestedList(9)); + + assertThat(renderText(deepestStructuredParagraph(document, 9))).isEqualTo("level9"); + } + + @Test + public void maxOpenBlockParsersPreservesSevenLogicalListLevelsAtSeventeenBlocks() { + var parser = Parser.builder().maxOpenBlockParsers(17).build(); + + var document = parser.parse(alternatingNestedList(7)); + + assertThat(renderText(deepestStructuredParagraph(document, 7))).isEqualTo("level7"); + } + + @Test + public void maxOpenBlockParsersPreservesEightLogicalListLevelsAtSeventeenBlocks() { + var parser = Parser.builder().maxOpenBlockParsers(17).build(); + + var document = 
parser.parse(alternatingNestedList(8)); + + assertThat(renderText(deepestStructuredParagraph(document, 8))).isEqualTo("level8"); + } + + @Test + public void maxOpenBlockParsersDegradesTheNinthLogicalListLevelToPlainText() { + var parser = Parser.builder().maxOpenBlockParsers(17).build(); + + var document = parser.parse(alternatingNestedList(9)); + var deepestParagraph = deepestStructuredParagraph(document, 8); + + assertThat(renderText(deepestParagraph)).isEqualTo("level8\n\\- level9"); + assertThat(deepestParagraph.getNext()).isNull(); + } + + @Test + public void maxOpenBlockParsersAlsoLimitsMixedListAndBlockQuoteNesting() { + var parser = Parser.builder().maxOpenBlockParsers(5).build(); + + var document = parser.parse(String.join("\n", + "- level1", + " > level2", + " > > level3", + " > > > level4")); + + var listBlock = document.getFirstChild(); + assertThat(listBlock).isInstanceOf(BulletList.class); + + var listItem = listBlock.getFirstChild(); + var blockQuote1 = listItem.getLastChild(); + assertThat(blockQuote1).isInstanceOf(BlockQuote.class); + + var blockQuote2 = blockQuote1.getLastChild(); + assertThat(blockQuote2).isInstanceOf(BlockQuote.class); + + var deepestParagraph = blockQuote2.getLastChild(); + assertThat(deepestParagraph).isInstanceOf(Paragraph.class); + assertThat(renderText(deepestParagraph)).isEqualTo("level3\n\\> level4"); + assertThat(deepestParagraph.getNext()).isNull(); + } + + private String firstText(Node n) { + while (!(n instanceof Text)) { + assertThat(n).isNotNull(); + n = n.getFirstChild(); + } + return ((Text) n).getLiteral(); + } + + private Paragraph deepestStructuredParagraph(Node document, int levels) { + Node node = document.getFirstChild(); + for (int level = 1; level <= levels; level++) { + assertThat(node).isInstanceOf(ListBlock.class); + var listItem = node.getFirstChild(); + assertThat(listItem).isNotNull(); + if (level == levels) { + assertThat(listItem.getFirstChild()).isInstanceOf(Paragraph.class); + return (Paragraph) 
listItem.getFirstChild(); + } + node = listItem.getLastChild(); + } + throw new AssertionError("unreachable"); + } + + private String renderText(Node node) { + return MarkdownRenderer.builder().build().render(node).trim(); + } + + private String alternatingNestedList(int levels) { + int indent = 0; + var lines = new ArrayList<String>(); + for (int level = 1; level <= levels; level++) { + var ordered = level % 2 == 0; + var marker = ordered ? "1. " : "- "; + lines.add(" ".repeat(indent) + marker + "level" + level); + indent += marker.length(); + } + return String.join("\n", lines); + } + + private int depth(Node node) { + int depth = 0; + while (node.getParent() != null) { + node = node.getParent(); + depth++; + } + return depth; } } diff --git a/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java b/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java index a42582f92..66d39de23 100644 --- a/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java +++ b/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java @@ -1,109 +1,108 @@ package org.commonmark.test; -import org.junit.FixMethodOrder; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.Stopwatch; -import org.junit.rules.Timeout; -import org.junit.runner.Description; -import org.junit.runners.MethodSorters; +import org.junit.jupiter.api.MethodOrderer; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestMethodOrder; +import org.junit.jupiter.api.Timeout; import java.util.concurrent.TimeUnit; /** * Pathological input cases (from commonmark.js). 
*/ -@FixMethodOrder(MethodSorters.NAME_ASCENDING) -public class PathologicalTest extends RenderingTestCase { +@Timeout(value = 3, unit = TimeUnit.SECONDS) +@TestMethodOrder(MethodOrderer.MethodName.class) +public class PathologicalTest extends CoreRenderingTestCase { - private static final int X = 10_000; - - @Rule - public Timeout timeout = new Timeout(2, TimeUnit.SECONDS); - - @Rule - public Stopwatch stopwatch = new Stopwatch() { - @Override - protected void finished(long nanos, Description description) { - System.err.println(description.getDisplayName() + " took " + (nanos / 1000000) + " ms"); - } - }; + private int x = 100_000; @Test public void nestedStrongEmphasis() { // this is limited by the stack size because visitor is recursive - int x = 1000; + x = 500; assertRendering( - repeat("*a **a ", x) + "b" + repeat(" a** a*", x), - "<p>" + repeat("<em>a <strong>a ", x) + "b" + - repeat(" a</strong> a</em>", x) + "</p>\n"); + "*a **a ".repeat(x) + "b" + " a** a*".repeat(x), + "<p>" + "<em>a <strong>a ".repeat(x) + "b" + + " a</strong> a</em>".repeat(x) + "</p>\n"); } @Test public void emphasisClosersWithNoOpeners() { assertRendering( - repeat("a_ ", X), - "<p>" + repeat("a_ ", X - 1) + "a_</p>\n"); + "a_ ".repeat(x), + "<p>" + "a_ ".repeat(x - 1) + "a_</p>\n"); } @Test public void emphasisOpenersWithNoClosers() { assertRendering( - repeat("_a ", X), - "<p>" + repeat("_a ", X - 1) + "_a</p>\n"); + "_a ".repeat(x), + "<p>" + "_a ".repeat(x - 1) + "_a</p>\n"); } @Test public void linkClosersWithNoOpeners() { assertRendering( - repeat("a] ", X), - "<p>" + repeat("a] ", X - 1) + "a]</p>\n"); + "a] ".repeat(x), + "<p>" + "a] ".repeat(x - 1) + "a]</p>\n"); } @Test public void linkOpenersWithNoClosers() { assertRendering( - repeat("[a ", X), - "<p>" + repeat("[a ", X - 1) + "[a</p>\n"); + "[a ".repeat(x), + "<p>" + "[a ".repeat(x - 1) + "[a</p>\n"); } @Test public void linkOpenersAndEmphasisClosers() { assertRendering( - repeat("[ a_ ", X), - "<p>" + repeat("[ a_ ", X 
- 1) + "[ a_</p>\n"); + "[ a_ ".repeat(x), + "<p>" + "[ a_ ".repeat(x - 1) + "[ a_</p>\n"); } @Test public void mismatchedOpenersAndClosers() { assertRendering( - repeat("*a_ ", X), - "<p>" + repeat("*a_ ", X - 1) + "*a_</p>\n"); + "*a_ ".repeat(x), + "<p>" + "*a_ ".repeat(x - 1) + "*a_</p>\n"); } @Test public void nestedBrackets() { assertRendering( - repeat("[", X) + "a" + repeat("]", X), - "<p>" + repeat("[", X) + "a" + repeat("]", X) + "</p>\n"); + "[".repeat(x) + "a" + "]".repeat(x), + "<p>" + "[".repeat(x) + "a" + "]".repeat(x) + "</p>\n"); } @Test public void nestedBlockQuotes() { // this is limited by the stack size because visitor is recursive - int x = 1000; + x = 1000; assertRendering( - repeat("> ", x) + "a\n", - repeat("<blockquote>\n", x) + "<p>a</p>\n" + - repeat("</blockquote>\n", x)); + "> ".repeat(x) + "a\n", + "<blockquote>\n".repeat(x) + "<p>a</p>\n" + + "</blockquote>\n".repeat(x)); } - private static String repeat(String s, int count) { - StringBuilder sb = new StringBuilder(s.length() * count); - for (int i = 0; i < count; i++) { - sb.append(s); - } - return sb.toString(); + @Test + public void hugeHorizontalRule() { + assertRendering( + "*".repeat(10000) + "\n", + "<hr />\n"); } + @Test + public void backslashInLink() { + // See https://github.com/commonmark/commonmark.js/issues/157 + assertRendering("[" + "\\".repeat(x) + "\n", + "<p>" + "[" + "\\".repeat(x / 2) + "</p>\n"); + } + + @Test + public void unclosedInlineLinks() { + // See https://github.com/commonmark/commonmark.js/issues/129 + assertRendering("[](".repeat(x) + "\n", + "<p>" + "[](".repeat(x) + "</p>\n"); + } } diff --git a/commonmark/src/test/java/org/commonmark/test/RegressionTest.java b/commonmark/src/test/java/org/commonmark/test/RegressionTest.java new file mode 100644 index 000000000..900a6518c --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/RegressionTest.java @@ -0,0 +1,65 @@ +package org.commonmark.test; + +import org.commonmark.parser.Parser; +import 
org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.RenderingTestCase; +import org.commonmark.testutil.TestResources; +import org.commonmark.testutil.example.Example; +import org.commonmark.testutil.example.ExampleReader; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.Parameter; +import org.junit.jupiter.params.ParameterizedClass; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +@ParameterizedClass +@MethodSource("data") +public class RegressionTest extends RenderingTestCase { + + private static final Parser PARSER = Parser.builder().build(); + // The spec says URL-escaping is optional, but the examples assume that it's enabled. + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().percentEncodeUrls(true).build(); + + private static final Map<String, String> OVERRIDDEN_EXAMPLES = getOverriddenExamples(); + + @Parameter + Example example; + + static List<Example> data() { + var data = new ArrayList<Example>(); + for (var regressionResource : TestResources.getRegressions()) { + data.addAll(ExampleReader.readExamples(regressionResource)); + } + return data; + } + + @Test + public void testHtmlRendering() { + String expectedHtml = OVERRIDDEN_EXAMPLES.get(example.getSource()); + if (expectedHtml == null) { + expectedHtml = example.getHtml(); + } + assertRendering(example.getSource(), expectedHtml); + } + + @Override + protected String render(String source) { + return RENDERER.render(PARSER.parse(source)); + } + + private static Map<String, String> getOverriddenExamples() { + Map<String, String> m = new HashMap<>(); + + // The only difference is that we don't change `%28` and `%29` to `(` and `)` (percent encoding is preserved) + m.put("[XSS](javascript&colon;alert%28'XSS'%29)\n", + "<p><a href=\"javascript&colon;alert%28'XSS'%29\">XSS</a></p>\n"); + // Callers should handle BOMs + m.put("\uFEFF# 
Hi\n", "<p>\uFEFF# Hi</p>\n"); + + return m; + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/RenderingTestCase.java b/commonmark/src/test/java/org/commonmark/test/RenderingTestCase.java deleted file mode 100644 index 613548ff3..000000000 --- a/commonmark/src/test/java/org/commonmark/test/RenderingTestCase.java +++ /dev/null @@ -1,55 +0,0 @@ -package org.commonmark.test; - -import org.commonmark.Extension; -import org.commonmark.html.HtmlRenderer; -import org.commonmark.node.Node; -import org.commonmark.parser.Parser; -import org.junit.Before; - -import java.util.Collections; - -import static org.junit.Assert.assertEquals; - -public abstract class RenderingTestCase { - - protected Parser parser; - protected HtmlRenderer renderer; - - @Before - public void setup() { - Iterable<? extends Extension> extensions = getExtensions(); - - Parser.Builder parserBuilder = Parser.builder().extensions(extensions); - configureParser(parserBuilder); - parser = parserBuilder.build(); - - HtmlRenderer.Builder rendererBuilder = HtmlRenderer.builder().extensions(extensions); - configureRenderer(rendererBuilder); - renderer = rendererBuilder.build(); - } - - protected Iterable<? 
extends Extension> getExtensions() { - return Collections.emptyList(); - } - - protected void configureParser(Parser.Builder parserBuilder) { - } - - protected void configureRenderer(HtmlRenderer.Builder rendererBuilder) { - } - - protected void assertRendering(String source, String expectedHtml) { - Node node = parser.parse(source); - String html = renderer.render(node); - - // include source for better assertion errors - String expected = showTabs(expectedHtml + "\n\n" + source); - String actual = showTabs(html + "\n\n" + source); - assertEquals(expected, actual); - } - - private static String showTabs(String s) { - // Tabs are shown as "rightwards arrow" for easier comparison - return s.replace("\t", "\u2192"); - } -} diff --git a/commonmark/src/test/java/org/commonmark/test/SourceLineTest.java b/commonmark/src/test/java/org/commonmark/test/SourceLineTest.java new file mode 100644 index 000000000..5d34bf410 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/SourceLineTest.java @@ -0,0 +1,45 @@ +package org.commonmark.test; + +import org.commonmark.node.SourceSpan; +import org.commonmark.parser.SourceLine; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +public class SourceLineTest { + + @Test + public void testSubstring() { + SourceLine line = SourceLine.of("abcd", SourceSpan.of(3, 10, 13, 4)); + + assertSourceLine(line.substring(0, 4), "abcd", SourceSpan.of(3, 10, 13, 4)); + assertSourceLine(line.substring(0, 3), "abc", SourceSpan.of(3, 10, 13, 3)); + assertSourceLine(line.substring(0, 2), "ab", SourceSpan.of(3, 10, 13, 2)); + assertSourceLine(line.substring(0, 1), "a", SourceSpan.of(3, 10, 13, 1)); + assertSourceLine(line.substring(0, 0), "", null); + + assertSourceLine(line.substring(1, 4), "bcd", SourceSpan.of(3, 11, 14, 3)); + assertSourceLine(line.substring(1, 3), "bc", SourceSpan.of(3, 11, 14, 2)); + + 
assertSourceLine(line.substring(3, 4), "d", SourceSpan.of(3, 13, 16, 1)); + assertSourceLine(line.substring(4, 4), "", null); + } + + @Test + public void testSubstringBeginOutOfBounds() { + var sourceLine = SourceLine.of("abcd", SourceSpan.of(3, 10, 13, 4)); + assertThatThrownBy(() -> sourceLine.substring(3, 2)).isInstanceOf(StringIndexOutOfBoundsException.class); + } + + @Test + public void testSubstringEndOutOfBounds() { + var sourceLine = SourceLine.of("abcd", SourceSpan.of(3, 10, 13, 4)); + assertThatThrownBy(() -> sourceLine.substring(0, 5)).isInstanceOf(StringIndexOutOfBoundsException.class); + } + + private static void assertSourceLine(SourceLine sourceLine, String expectedContent, SourceSpan expectedSourceSpan) { + assertThat(sourceLine.getContent()).isEqualTo(expectedContent); + assertThat(sourceLine.getSourceSpan()).isEqualTo(expectedSourceSpan); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/SourceSpanRenderer.java b/commonmark/src/test/java/org/commonmark/test/SourceSpanRenderer.java new file mode 100644 index 000000000..c29aac61e --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/SourceSpanRenderer.java @@ -0,0 +1,108 @@ +package org.commonmark.test; + +import org.commonmark.node.AbstractVisitor; +import org.commonmark.node.Node; + +import java.util.*; + +public class SourceSpanRenderer { + + /** + * Render source spans in the document using source position's line and column index. 
+ */ + public static String renderWithLineColumn(Node document, String source) { + SourceSpanMarkersVisitor visitor = new SourceSpanMarkersVisitor(); + document.accept(visitor); + var lineColumnMarkers = visitor.getLineColumnMarkers(); + + StringBuilder sb = new StringBuilder(); + + String[] lines = source.split("\n"); + + for (int lineIndex = 0; lineIndex < lines.length; lineIndex++) { + String line = lines[lineIndex]; + Map<Integer, List<String>> lineMarkers = lineColumnMarkers.get(lineIndex); + for (int i = 0; i < line.length(); i++) { + appendMarkers(lineMarkers, i, sb); + sb.append(line.charAt(i)); + } + appendMarkers(lineMarkers, line.length(), sb); + sb.append("\n"); + } + + return sb.toString(); + } + + /** + * Render source spans in the document using source position's input index. + */ + public static String renderWithInputIndex(Node document, String source) { + SourceSpanMarkersVisitor visitor = new SourceSpanMarkersVisitor(); + document.accept(visitor); + var markers = visitor.getInputIndexMarkers(); + + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < source.length(); i++) { + markers.getOrDefault(i, List.of()).forEach(marker -> sb.append(marker)); + sb.append(source.charAt(i)); + } + return sb.toString(); + } + + private static void appendMarkers(Map<Integer, List<String>> lineMarkers, int columnIndex, StringBuilder sb) { + if (lineMarkers != null) { + List<String> columnMarkers = lineMarkers.get(columnIndex); + if (columnMarkers != null) { + for (String marker : columnMarkers) { + sb.append(marker); + } + } + } + } + + private static class SourceSpanMarkersVisitor extends AbstractVisitor { + + private static final String OPENING = "({[<⸢⸤"; + private static final String CLOSING = ")}]>⸣⸥"; + + private final Map<Integer, Map<Integer, List<String>>> lineColumnMarkers = new HashMap<>(); + private final Map<Integer, List<String>> inputIndexMarkers = new HashMap<>(); + + private int markerIndex; + + public Map<Integer, Map<Integer, 
List<String>>> getLineColumnMarkers() { + return lineColumnMarkers; + } + + public Map<Integer, List<String>> getInputIndexMarkers() { + return inputIndexMarkers; + } + + @Override + protected void visitChildren(Node parent) { + if (!parent.getSourceSpans().isEmpty()) { + for (var span : parent.getSourceSpans()) { + String opener = String.valueOf(OPENING.charAt(markerIndex % OPENING.length())); + String closer = String.valueOf(CLOSING.charAt(markerIndex % CLOSING.length())); + + int line = span.getLineIndex(); + int col = span.getColumnIndex(); + var input = span.getInputIndex(); + int length = span.getLength(); + getMarkers(line, col).add(opener); + getMarkers(line, col + length).add(0, closer); + + inputIndexMarkers.computeIfAbsent(input, k -> new LinkedList<>()).add(opener); + inputIndexMarkers.computeIfAbsent(input + length, k -> new LinkedList<>()).add(0, closer); + } + markerIndex++; + } + super.visitChildren(parent); + } + + private List<String> getMarkers(int lineIndex, int columnIndex) { + var columnMap = lineColumnMarkers.computeIfAbsent(lineIndex, k -> new HashMap<>()); + return columnMap.computeIfAbsent(columnIndex, k -> new LinkedList<>()); + } + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/SourceSpanTest.java b/commonmark/src/test/java/org/commonmark/test/SourceSpanTest.java new file mode 100644 index 000000000..f1bb231f4 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/SourceSpanTest.java @@ -0,0 +1,68 @@ +package org.commonmark.test; + +import org.commonmark.node.SourceSpan; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +public class SourceSpanTest { + + @Test + public void testSubSpan() { + var span = SourceSpan.of(1, 2, 3, 5); + + assertThat(span.subSpan(0)).isSameAs(span); + assertThat(span.subSpan(0, 5)).isSameAs(span); + + assertThat(span.subSpan(1)).isEqualTo(SourceSpan.of(1, 3, 4, 4)); + 
assertThat(span.subSpan(2)).isEqualTo(SourceSpan.of(1, 4, 5, 3)); + assertThat(span.subSpan(3)).isEqualTo(SourceSpan.of(1, 5, 6, 2)); + assertThat(span.subSpan(4)).isEqualTo(SourceSpan.of(1, 6, 7, 1)); + // Not sure if empty spans are useful, but it probably makes sense to mirror how substrings work + assertThat(span.subSpan(5)).isEqualTo(SourceSpan.of(1, 7, 8, 0)); + assertThat("abcde".substring(5)).isEqualTo(""); + + assertThat(span.subSpan(0, 5)).isEqualTo(SourceSpan.of(1, 2, 3, 5)); + assertThat(span.subSpan(0, 4)).isEqualTo(SourceSpan.of(1, 2, 3, 4)); + assertThat(span.subSpan(0, 3)).isEqualTo(SourceSpan.of(1, 2, 3, 3)); + assertThat(span.subSpan(0, 2)).isEqualTo(SourceSpan.of(1, 2, 3, 2)); + assertThat(span.subSpan(0, 1)).isEqualTo(SourceSpan.of(1, 2, 3, 1)); + assertThat(span.subSpan(0, 0)).isEqualTo(SourceSpan.of(1, 2, 3, 0)); + assertThat("abcde".substring(0, 1)).isEqualTo("a"); + assertThat("abcde".substring(0, 0)).isEqualTo(""); + + assertThat(span.subSpan(1, 4)).isEqualTo(SourceSpan.of(1, 3, 4, 3)); + assertThat(span.subSpan(2, 3)).isEqualTo(SourceSpan.of(1, 4, 5, 1)); + } + + @Test + public void testSubSpanBeginIndexNegative() { + var sourceSpan = SourceSpan.of(1, 2, 3, 5); + assertThatThrownBy(() -> sourceSpan.subSpan(-1)).isInstanceOf(IndexOutOfBoundsException.class); + } + + @Test + public void testSubSpanBeginIndexOutOfBounds() { + var sourceSpan = SourceSpan.of(1, 2, 3, 5); + assertThatThrownBy(() -> sourceSpan.subSpan(6)).isInstanceOf(IndexOutOfBoundsException.class); + } + + @Test + public void testSubSpanEndIndexNegative() { + var sourceSpan = SourceSpan.of(1, 2, 3, 5); + assertThatThrownBy(() -> sourceSpan.subSpan(0, -1)).isInstanceOf(IndexOutOfBoundsException.class); + } + + @Test + public void testSubSpanEndIndexOutOfBounds() { + var sourceSpan = SourceSpan.of(1, 2, 3, 5); + assertThatThrownBy(() -> sourceSpan.subSpan(0, 6)).isInstanceOf(IndexOutOfBoundsException.class); + } + + @Test + public void testSubSpanBeginIndexGreaterThanEndIndex() 
{ + var sourceSpan = SourceSpan.of(1, 2, 3, 5); + assertThatThrownBy(() -> sourceSpan.subSpan(2, 1)).isInstanceOf(IndexOutOfBoundsException.class); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/SourceSpansTest.java b/commonmark/src/test/java/org/commonmark/test/SourceSpansTest.java new file mode 100644 index 000000000..f4e9d0a17 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/SourceSpansTest.java @@ -0,0 +1,428 @@ +package org.commonmark.test; + +import org.commonmark.node.*; +import org.commonmark.parser.IncludeSourceSpans; +import org.commonmark.parser.Parser; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class SourceSpansTest { + + private static final Parser PARSER = Parser.builder().includeSourceSpans(IncludeSourceSpans.BLOCKS).build(); + private static final Parser INLINES_PARSER = Parser.builder().includeSourceSpans(IncludeSourceSpans.BLOCKS_AND_INLINES).build(); + + @Test + public void paragraph() { + assertSpans("foo\n", Paragraph.class, SourceSpan.of(0, 0, 0, 3)); + assertSpans("foo\nbar\n", Paragraph.class, SourceSpan.of(0, 0, 0, 3), SourceSpan.of(1, 0, 4, 3)); + assertSpans(" foo\n bar\n", Paragraph.class, SourceSpan.of(0, 0, 0, 5), SourceSpan.of(1, 0, 6, 5)); + assertSpans("> foo\n> bar\n", Paragraph.class, SourceSpan.of(0, 2, 2, 3), SourceSpan.of(1, 2, 8, 3)); + assertSpans("* foo\n bar\n", Paragraph.class, SourceSpan.of(0, 2, 2, 3), SourceSpan.of(1, 2, 8, 3)); + assertSpans("* foo\nbar\n", Paragraph.class, SourceSpan.of(0, 2, 2, 3), SourceSpan.of(1, 0, 6, 3)); + } + + @Test + public void thematicBreak() { + assertSpans("---\n", ThematicBreak.class, SourceSpan.of(0, 0, 0, 3)); + assertSpans(" ---\n", ThematicBreak.class, SourceSpan.of(0, 0, 0, 5)); + assertSpans("> ---\n", ThematicBreak.class, SourceSpan.of(0, 2, 2, 
3)); + } + + @Test + public void atxHeading() { + assertSpans("# foo", Heading.class, SourceSpan.of(0, 0, 0, 5)); + assertSpans(" # foo", Heading.class, SourceSpan.of(0, 0, 0, 6)); + assertSpans("## foo ##", Heading.class, SourceSpan.of(0, 0, 0, 9)); + assertSpans("> # foo", Heading.class, SourceSpan.of(0, 2, 2, 5)); + } + + @Test + public void setextHeading() { + assertSpans("foo\n===\n", Heading.class, SourceSpan.of(0, 0, 0, 3), SourceSpan.of(1, 0, 4, 3)); + assertSpans("foo\nbar\n====\n", Heading.class, SourceSpan.of(0, 0, 0, 3), SourceSpan.of(1, 0, 4, 3), SourceSpan.of(2, 0, 8, 4)); + assertSpans(" foo\n ===\n", Heading.class, SourceSpan.of(0, 0, 0, 5), SourceSpan.of(1, 0, 6, 5)); + assertSpans("> foo\n> ===\n", Heading.class, SourceSpan.of(0, 2, 2, 3), SourceSpan.of(1, 2, 8, 3)); + } + + @Test + public void indentedCodeBlock() { + assertSpans(" foo\n", IndentedCodeBlock.class, SourceSpan.of(0, 0, 0, 7)); + assertSpans(" foo\n", IndentedCodeBlock.class, SourceSpan.of(0, 0, 0, 8)); + assertSpans("\tfoo\n", IndentedCodeBlock.class, SourceSpan.of(0, 0, 0, 4)); + assertSpans(" \tfoo\n", IndentedCodeBlock.class, SourceSpan.of(0, 0, 0, 5)); + assertSpans(" \tfoo\n", IndentedCodeBlock.class, SourceSpan.of(0, 0, 0, 6)); + assertSpans(" \tfoo\n", IndentedCodeBlock.class, SourceSpan.of(0, 0, 0, 7)); + assertSpans(" \tfoo\n", IndentedCodeBlock.class, SourceSpan.of(0, 0, 0, 8)); + assertSpans(" \t foo\n", IndentedCodeBlock.class, SourceSpan.of(0, 0, 0, 9)); + assertSpans("\t foo\n", IndentedCodeBlock.class, SourceSpan.of(0, 0, 0, 5)); + assertSpans("\t foo\n", IndentedCodeBlock.class, SourceSpan.of(0, 0, 0, 6)); + assertSpans(" foo\n bar\n", IndentedCodeBlock.class, SourceSpan.of(0, 0, 0, 7), SourceSpan.of(1, 0, 8, 8)); + assertSpans(" foo\n\tbar\n", IndentedCodeBlock.class, SourceSpan.of(0, 0, 0, 7), SourceSpan.of(1, 0, 8, 4)); + assertSpans(" foo\n \n \n", IndentedCodeBlock.class, SourceSpan.of(0, 0, 0, 7), SourceSpan.of(1, 0, 8, 4), SourceSpan.of(2, 0, 13, 5)); + 
assertSpans("> foo\n", IndentedCodeBlock.class, SourceSpan.of(0, 2, 2, 7)); + } + + @Test + public void fencedCodeBlock() { + assertSpans("```\nfoo\n```\n", FencedCodeBlock.class, + SourceSpan.of(0, 0, 0, 3), SourceSpan.of(1, 0, 4, 3), SourceSpan.of(2, 0, 8, 3)); + assertSpans("```\n foo\n```\n", FencedCodeBlock.class, + SourceSpan.of(0, 0, 0, 3), SourceSpan.of(1, 0, 4, 4), SourceSpan.of(2, 0, 9, 3)); + assertSpans("```\nfoo\nbar\n```\n", FencedCodeBlock.class, + SourceSpan.of(0, 0, 0, 3), SourceSpan.of(1, 0, 4, 3), SourceSpan.of(2, 0, 8, 3), SourceSpan.of(3, 0, 12, 3)); + assertSpans(" ```\n foo\n ```\n", FencedCodeBlock.class, + SourceSpan.of(0, 0, 0, 6), SourceSpan.of(1, 0, 7, 6), SourceSpan.of(2, 0, 14, 6)); + assertSpans(" ```\n foo\nfoo\n```\n", FencedCodeBlock.class, + SourceSpan.of(0, 0, 0, 4), SourceSpan.of(1, 0, 5, 4), SourceSpan.of(2, 0, 10, 3), SourceSpan.of(3, 0, 14, 3)); + assertSpans("```info\nfoo\n```\n", FencedCodeBlock.class, + SourceSpan.of(0, 0, 0, 7), SourceSpan.of(1, 0, 8, 3), SourceSpan.of(2, 0, 12, 3)); + assertSpans("* ```\n foo\n ```\n", FencedCodeBlock.class, + SourceSpan.of(0, 2, 2, 3), SourceSpan.of(1, 2, 8, 3), SourceSpan.of(2, 2, 14, 3)); + assertSpans("> ```\n> foo\n> ```\n", FencedCodeBlock.class, + SourceSpan.of(0, 2, 2, 3), SourceSpan.of(1, 2, 8, 3), SourceSpan.of(2, 2, 14, 3)); + + Node document = PARSER.parse("```\nfoo\n```\nbar\n"); + Paragraph paragraph = (Paragraph) document.getLastChild(); + assertThat(paragraph.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(3, 0, 12, 3))); + } + + @Test + public void htmlBlock() { + assertSpans("<div>\n", HtmlBlock.class, SourceSpan.of(0, 0, 0, 5)); + assertSpans(" <div>\n foo\n </div>\n", HtmlBlock.class, + SourceSpan.of(0, 0, 0, 6), + SourceSpan.of(1, 0, 7, 4), + SourceSpan.of(2, 0, 12, 7)); + assertSpans("* <div>\n", HtmlBlock.class, SourceSpan.of(0, 2, 2, 5)); + } + + @Test + public void blockQuote() { + assertSpans(">foo\n", BlockQuote.class, SourceSpan.of(0, 0, 0, 4)); + 
assertSpans("> foo\n", BlockQuote.class, SourceSpan.of(0, 0, 0, 5)); + assertSpans("> foo\n", BlockQuote.class, SourceSpan.of(0, 0, 0, 6)); + assertSpans(" > foo\n", BlockQuote.class, SourceSpan.of(0, 0, 0, 6)); + assertSpans(" > foo\n > bar\n", BlockQuote.class, SourceSpan.of(0, 0, 0, 8), SourceSpan.of(1, 0, 9, 7)); + // Lazy continuations + assertSpans("> foo\nbar\n", BlockQuote.class, SourceSpan.of(0, 0, 0, 5), SourceSpan.of(1, 0, 6, 3)); + assertSpans("> foo\nbar\n> baz\n", BlockQuote.class, SourceSpan.of(0, 0, 0, 5), SourceSpan.of(1, 0, 6, 3), SourceSpan.of(2, 0, 10, 5)); + assertSpans("> > foo\nbar\n", BlockQuote.class, SourceSpan.of(0, 0, 0, 7), SourceSpan.of(1, 0, 8, 3)); + } + + @Test + public void listBlock() { + assertSpans("* foo\n", ListBlock.class, SourceSpan.of(0, 0, 0, 5)); + assertSpans("* foo\n bar\n", ListBlock.class, SourceSpan.of(0, 0, 0, 5), SourceSpan.of(1, 0, 6, 5)); + assertSpans("* foo\n* bar\n", ListBlock.class, SourceSpan.of(0, 0, 0, 5), SourceSpan.of(1, 0, 6, 5)); + assertSpans("* foo\n # bar\n", ListBlock.class, SourceSpan.of(0, 0, 0, 5), SourceSpan.of(1, 0, 6, 7)); + assertSpans("* foo\n * bar\n", ListBlock.class, SourceSpan.of(0, 0, 0, 5), SourceSpan.of(1, 0, 6, 7)); + assertSpans("* foo\n> bar\n", ListBlock.class, SourceSpan.of(0, 0, 0, 5)); + assertSpans("> * foo\n", ListBlock.class, SourceSpan.of(0, 2, 2, 5)); + + // Lazy continuations + assertSpans("* foo\nbar\nbaz", ListBlock.class, SourceSpan.of(0, 0, 0, 5), SourceSpan.of(1, 0, 6, 3), SourceSpan.of(2, 0, 10, 3)); + assertSpans("* foo\nbar\n* baz", ListBlock.class, SourceSpan.of(0, 0, 0, 5), SourceSpan.of(1, 0, 6, 3), SourceSpan.of(2, 0, 10, 5)); + assertSpans("* foo\n * bar\nbaz", ListBlock.class, SourceSpan.of(0, 0, 0, 5), SourceSpan.of(1, 0, 6, 7), SourceSpan.of(2, 0, 14, 3)); + + Node document = PARSER.parse("* foo\n * bar\n"); + ListBlock listBlock = (ListBlock) document.getFirstChild().getFirstChild().getLastChild(); + 
assertThat(listBlock.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(1, 2, 8, 5))); + } + + @Test + public void listItem() { + assertSpans("* foo\n", ListItem.class, SourceSpan.of(0, 0, 0, 5)); + assertSpans(" * foo\n", ListItem.class, SourceSpan.of(0, 0, 0, 6)); + assertSpans(" * foo\n", ListItem.class, SourceSpan.of(0, 0, 0, 7)); + assertSpans(" * foo\n", ListItem.class, SourceSpan.of(0, 0, 0, 8)); + assertSpans("*\n foo\n", ListItem.class, SourceSpan.of(0, 0, 0, 1), SourceSpan.of(1, 0, 2, 5)); + assertSpans("*\n foo\n bar\n", ListItem.class, SourceSpan.of(0, 0, 0, 1), SourceSpan.of(1, 0, 2, 5), SourceSpan.of(2, 0, 8, 5)); + assertSpans("> * foo\n", ListItem.class, SourceSpan.of(0, 2, 2, 5)); + + // Lazy continuations + assertSpans("* foo\nbar\n", ListItem.class, SourceSpan.of(0, 0, 0, 5), SourceSpan.of(1, 0, 6, 3)); + assertSpans("* foo\nbar\nbaz\n", ListItem.class, SourceSpan.of(0, 0, 0, 5), SourceSpan.of(1, 0, 6, 3), SourceSpan.of(2, 0, 10, 3)); + } + + @Test + public void linkReferenceDefinition() { + // This is tricky due to how link reference definition parsing works. It is stripped from the paragraph if it's + // successfully parsed, otherwise it stays part of the paragraph. 
+ Node document = PARSER.parse("[foo]: /url\ntext\n"); + + LinkReferenceDefinition linkReferenceDefinition = (LinkReferenceDefinition) document.getFirstChild(); + assertThat(linkReferenceDefinition.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 11))); + + Paragraph paragraph = (Paragraph) document.getLastChild(); + assertThat(paragraph.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(1, 0, 12, 4))); + } + + @Test + public void linkReferenceDefinitionMultiple() { + var doc = PARSER.parse("[foo]: /foo\n[bar]: /bar\n"); + var def1 = (LinkReferenceDefinition) doc.getFirstChild(); + var def2 = (LinkReferenceDefinition) doc.getLastChild(); + assertThat(def1.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 11))); + assertThat(def2.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(1, 0, 12, 11))); + } + + @Test + public void linkReferenceDefinitionWithTitle() { + var doc = PARSER.parse("[1]: #not-code \"Text\"\n[foo]: /foo\n"); + var def1 = (LinkReferenceDefinition) doc.getFirstChild(); + var def2 = (LinkReferenceDefinition) doc.getLastChild(); + assertThat(def1.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 21))); + assertThat(def2.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(1, 0, 22, 11))); + } + + @Test + public void linkReferenceDefinitionWithTitleInvalid() { + var doc = PARSER.parse("[foo]: /url\n\"title\" ok\n"); + var def = Nodes.find(doc, LinkReferenceDefinition.class); + var paragraph = Nodes.find(doc, Paragraph.class); + assertThat(def.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 11))); + assertThat(paragraph.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(1, 0, 12, 10))); + } + + @Test + public void linkReferenceDefinitionHeading() { + // This is probably the trickiest because we have a link reference definition at the start of a paragraph + // that gets replaced because of a heading. Phew. 
+ Node document = PARSER.parse("[foo]: /url\nHeading\n===\n"); + + LinkReferenceDefinition linkReferenceDefinition = (LinkReferenceDefinition) document.getFirstChild(); + assertThat(linkReferenceDefinition.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 11))); + + Heading heading = (Heading) document.getLastChild(); + assertThat(heading.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(1, 0, 12, 7), SourceSpan.of(2, 0, 20, 3))); + } + + @Test + public void lazyContinuationLines() { + { + // From https://spec.commonmark.org/0.31.2/#example-250 + // Wrong source span for the inner block quote for the second line. + var doc = PARSER.parse("> > > foo\nbar\n"); + + var bq1 = (BlockQuote) doc.getLastChild(); + assertThat(bq1.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 9), SourceSpan.of(1, 0, 10, 3))); + var bq2 = (BlockQuote) bq1.getLastChild(); + assertThat(bq2.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 2, 2, 7), SourceSpan.of(1, 0, 10, 3))); + var bq3 = (BlockQuote) bq2.getLastChild(); + assertThat(bq3.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 4, 4, 5), SourceSpan.of(1, 0, 10, 3))); + var paragraph = (Paragraph) bq3.getLastChild(); + assertThat(paragraph.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 6, 6, 3), SourceSpan.of(1, 0, 10, 3))); + } + + { + // Adding one character to the last line remove blockQuote3 source for the second line + var doc = PARSER.parse("> > > foo\nbars\n"); + + var bq1 = (BlockQuote) doc.getLastChild(); + assertThat(bq1.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 9), SourceSpan.of(1, 0, 10, 4))); + var bq2 = (BlockQuote) bq1.getLastChild(); + assertThat(bq2.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 2, 2, 7), SourceSpan.of(1, 0, 10, 4))); + var bq3 = (BlockQuote) bq2.getLastChild(); + assertThat(bq3.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 4, 4, 5), SourceSpan.of(1, 0, 10, 4))); + var paragraph = (Paragraph) bq3.getLastChild(); + 
assertThat(paragraph.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 6, 6, 3), SourceSpan.of(1, 0, 10, 4))); + } + + { + // From https://spec.commonmark.org/0.31.2/#example-292 + var doc = PARSER.parse("> 1. > Blockquote\ncontinued here."); + + var bq1 = (BlockQuote) doc.getLastChild(); + assertThat(bq1.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 17), SourceSpan.of(1, 0, 18, 15))); + var orderedList = (OrderedList) bq1.getLastChild(); + assertThat(orderedList.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 2, 2, 15), SourceSpan.of(1, 0, 18, 15))); + var listItem = (ListItem) orderedList.getLastChild(); + assertThat(listItem.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 2, 2, 15), SourceSpan.of(1, 0, 18, 15))); + var bq2 = (BlockQuote) listItem.getLastChild(); + assertThat(bq2.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 5, 5, 12), SourceSpan.of(1, 0, 18, 15))); + var paragraph = (Paragraph) bq2.getLastChild(); + assertThat(paragraph.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 7, 7, 10), SourceSpan.of(1, 0, 18, 15))); + } + + { + // Lazy continuation line for nested blockquote + var doc = PARSER.parse("> > foo\n> bar\n"); + + var bq1 = (BlockQuote) doc.getLastChild(); + assertThat(bq1.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 0, 0, 7), SourceSpan.of(1, 0, 8, 5))); + var bq2 = (BlockQuote) bq1.getLastChild(); + assertThat(bq2.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 2, 2, 5), SourceSpan.of(1, 2, 10, 3))); + var paragraph = (Paragraph) bq2.getLastChild(); + assertThat(paragraph.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 4, 4, 3), SourceSpan.of(1, 2, 10, 3))); + } + } + + @Test + public void visualCheck() { + assertVisualize("> * foo\n> bar\n> * baz\n", "(> {[* <foo>]})\n(> {[ <bar>]})\n(> {⸢* ⸤baz⸥⸣})\n"); + assertVisualize("> * ```\n> foo\n> ```\n", "(> {[* <```>]})\n(> {[ <foo>]})\n(> {[ <```>]})\n"); + } + + @Test + public void inlineText() { + assertInlineSpans("foo", Text.class, 
SourceSpan.of(0, 0, 0, 3)); + assertInlineSpans("> foo", Text.class, SourceSpan.of(0, 2, 2, 3)); + assertInlineSpans("* foo", Text.class, SourceSpan.of(0, 2, 2, 3)); + + // SourceSpans should be merged: ` is a separate Text node while inline parsing and gets merged at the end + assertInlineSpans("foo`bar", Text.class, SourceSpan.of(0, 0, 0, 7)); + assertInlineSpans("foo[bar", Text.class, SourceSpan.of(0, 0, 0, 7)); + assertInlineSpans("> foo`bar", Text.class, SourceSpan.of(0, 2, 2, 7)); + + assertInlineSpans("[foo](/url)", Text.class, SourceSpan.of(0, 1, 1, 3)); + assertInlineSpans("*foo*", Text.class, SourceSpan.of(0, 1, 1, 3)); + } + + @Test + public void inlineHeading() { + assertInlineSpans("# foo", Text.class, SourceSpan.of(0, 2, 2, 3)); + assertInlineSpans(" # foo", Text.class, SourceSpan.of(0, 3, 3, 3)); + assertInlineSpans("> # foo", Text.class, SourceSpan.of(0, 4, 4, 3)); + } + + @Test + public void inlineAutolink() { + assertInlineSpans("see <https://example.org>", Link.class, SourceSpan.of(0, 4, 4, 21)); + } + + @Test + public void inlineBackslash() { + assertInlineSpans("\\!", Text.class, SourceSpan.of(0, 0, 0, 2)); + } + + @Test + public void inlineBackticks() { + assertInlineSpans("see `code`", Code.class, SourceSpan.of(0, 4, 4, 6)); + assertInlineSpans("`multi\nline`", Code.class, + SourceSpan.of(0, 0, 0, 6), + SourceSpan.of(1, 0, 7, 5)); + assertInlineSpans("text ```", Text.class, SourceSpan.of(0, 0, 0, 8)); + } + + @Test + public void inlineEntity() { + assertInlineSpans("&", Text.class, SourceSpan.of(0, 0, 0, 5)); + } + + @Test + public void inlineHtml() { + assertInlineSpans("hi <strong>there</strong>", HtmlInline.class, SourceSpan.of(0, 3, 3, 8)); + } + + @Test + public void links() { + assertInlineSpans("\n[text](/url)", Link.class, SourceSpan.of(1, 0, 1, 12)); + assertInlineSpans("\n[text](/url)", Text.class, SourceSpan.of(1, 1, 2, 4)); + + assertInlineSpans("\n[text]\n\n[text]: /url", Link.class, SourceSpan.of(1, 0, 1, 6)); + 
assertInlineSpans("\n[text]\n\n[text]: /url", Text.class, SourceSpan.of(1, 1, 2, 4)); + assertInlineSpans("\n[text][]\n\n[text]: /url", Link.class, SourceSpan.of(1, 0, 1, 8)); + assertInlineSpans("\n[text][]\n\n[text]: /url", Text.class, SourceSpan.of(1, 1, 2, 4)); + assertInlineSpans("\n[text][ref]\n\n[ref]: /url", Link.class, SourceSpan.of(1, 0, 1, 11)); + assertInlineSpans("\n[text][ref]\n\n[ref]: /url", Text.class, SourceSpan.of(1, 1, 2, 4)); + assertInlineSpans("\n[notalink]", Text.class, SourceSpan.of(1, 0, 1, 10)); + } + + @Test + public void inlineEmphasis() { + assertInlineSpans("\n*hey*", Emphasis.class, SourceSpan.of(1, 0, 1, 5)); + assertInlineSpans("\n*hey*", Text.class, SourceSpan.of(1, 1, 2, 3)); + assertInlineSpans("\n**hey**", StrongEmphasis.class, SourceSpan.of(1, 0, 1, 7)); + assertInlineSpans("\n**hey**", Text.class, SourceSpan.of(1, 2, 3, 3)); + + // This is an interesting one. It renders like this: + // <p>*<em>hey</em></p> + // The delimiter processor only uses one of the asterisks. + // So the first Text node should be the `*` at the beginning with the correct span. 
+ assertInlineSpans("\n**hey*", Text.class, SourceSpan.of(1, 0, 1, 1)); + assertInlineSpans("\n**hey*", Emphasis.class, SourceSpan.of(1, 1, 2, 5)); + + assertInlineSpans("\n***hey**", Text.class, SourceSpan.of(1, 0, 1, 1)); + assertInlineSpans("\n***hey**", StrongEmphasis.class, SourceSpan.of(1, 1, 2, 7)); + + Node document = INLINES_PARSER.parse("*hey**"); + Node lastText = document.getFirstChild().getLastChild(); + assertThat(lastText.getSourceSpans()).isEqualTo(List.of(SourceSpan.of(0, 5, 5, 1))); + } + + @Test + public void tabExpansion() { + assertInlineSpans(">\tfoo", BlockQuote.class, SourceSpan.of(0, 0, 0, 5)); + assertInlineSpans(">\tfoo", Text.class, SourceSpan.of(0, 2, 2, 3)); + + assertInlineSpans("a\tb", Text.class, SourceSpan.of(0, 0, 0, 3)); + } + + @Test + public void differentLineTerminators() { + var input = "foo\nbar\rbaz\r\nqux\r\n\r\n> *hi*"; + assertSpans(input, Paragraph.class, + SourceSpan.of(0, 0, 0, 3), + SourceSpan.of(1, 0, 4, 3), + SourceSpan.of(2, 0, 8, 3), + SourceSpan.of(3, 0, 13, 3)); + assertSpans(input, BlockQuote.class, + SourceSpan.of(5, 0, 20, 6)); + + assertInlineSpans(input, Emphasis.class, SourceSpan.of(5, 2, 22, 4)); + } + + private void assertVisualize(String source, String expected) { + var doc = PARSER.parse(source); + assertThat(SourceSpanRenderer.renderWithLineColumn(doc, source)).isEqualTo(expected); + assertThat(SourceSpanRenderer.renderWithInputIndex(doc, source)).isEqualTo(expected); + } + + private static void assertSpans(String input, Class<? extends Node> nodeClass, SourceSpan... expectedSourceSpans) { + assertSpans(PARSER.parse(input), nodeClass, expectedSourceSpans); + try { + assertSpans(PARSER.parseReader(new StringReader(input)), nodeClass, expectedSourceSpans); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private static void assertInlineSpans(String input, Class<? extends Node> nodeClass, SourceSpan... 
expectedSourceSpans) { + assertSpans(INLINES_PARSER.parse(input), nodeClass, expectedSourceSpans); + try { + assertSpans(INLINES_PARSER.parseReader(new StringReader(input)), nodeClass, expectedSourceSpans); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private static void assertSpans(Node rootNode, Class<? extends Node> nodeClass, SourceSpan... expectedSourceSpans) { + Node node = findNode(rootNode, nodeClass); + assertThat(node.getSourceSpans()).isEqualTo(List.of(expectedSourceSpans)); + } + + private static Node findNode(Node rootNode, Class<? extends Node> nodeClass) { + Deque<Node> nodes = new ArrayDeque<>(); + nodes.add(rootNode); + while (!nodes.isEmpty()) { + Node node = nodes.removeFirst(); + if (nodeClass.isInstance(node)) { + return node; + } + if (node.getFirstChild() != null) { + nodes.addFirst(node.getFirstChild()); + } + if (node.getNext() != null) { + nodes.addLast(node.getNext()); + } + } + throw new AssertionError("Expected to find " + nodeClass + " node"); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/SpecBenchmark.java b/commonmark/src/test/java/org/commonmark/test/SpecBenchmark.java index 32b0efdd2..e7bb080a8 100644 --- a/commonmark/src/test/java/org/commonmark/test/SpecBenchmark.java +++ b/commonmark/src/test/java/org/commonmark/test/SpecBenchmark.java @@ -1,42 +1,64 @@ package org.commonmark.test; -import org.commonmark.html.HtmlRenderer; +import org.commonmark.node.Node; import org.commonmark.parser.Parser; -import org.commonmark.spec.SpecReader; -import org.openjdk.jmh.Main; -import org.openjdk.jmh.annotations.Benchmark; -import org.openjdk.jmh.annotations.Scope; -import org.openjdk.jmh.annotations.State; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.TestResources; +import org.commonmark.testutil.example.ExampleReader; +import org.openjdk.jmh.annotations.*; import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.CommandLineOptions; import 
org.openjdk.jmh.runner.options.Options; import org.openjdk.jmh.runner.options.OptionsBuilder; -import java.util.Collections; import java.util.List; @State(Scope.Benchmark) +@Fork(5) +@Warmup(iterations = 10) +@Measurement(iterations = 20) public class SpecBenchmark { - private static final String SPEC = SpecReader.readSpec(); - private static final List<String> SPEC_EXAMPLES = SpecReader.readExamplesAsString(); + private static final String SPEC = TestResources.readAsString(TestResources.getSpec()); + private static final List<String> SPEC_EXAMPLES = ExampleReader.readExampleSources(TestResources.getSpec()); private static final Parser PARSER = Parser.builder().build(); private static final HtmlRenderer RENDERER = HtmlRenderer.builder().build(); + private static final Node SPEC_NODE = PARSER.parse(SPEC); + public static void main(String[] args) throws Exception { - Options options = new OptionsBuilder().include(SpecBenchmark.class.getName() + ".*").build(); + Options options = new OptionsBuilder() + .parent(new CommandLineOptions(args)) + .include(SpecBenchmark.class.getName() + ".*") + .build(); new Runner(options).run(); } @Benchmark - public long wholeSpec() { - return parseAndRender(Collections.singletonList(SPEC)); + public long parseWholeSpec() { + return parse(List.of(SPEC)); + } + + @Benchmark + public long parseExamples() { + return parse(SPEC_EXAMPLES); } @Benchmark - public long examples() { + public long parseAndRenderWholeSpec() { + return parseAndRender(List.of(SPEC)); + } + + @Benchmark + public long parseAndRenderExamples() { return parseAndRender(SPEC_EXAMPLES); } + @Benchmark + public long renderWholeSpec() { + return RENDERER.render(SPEC_NODE).length(); + } + private static long parseAndRender(List<String> examples) { long length = 0; for (String example : examples) { @@ -46,4 +68,12 @@ private static long parseAndRender(List<String> examples) { return length; } + private static long parse(List<String> examples) { + long length = 0; + for (String 
example : examples) { + Node document = PARSER.parse(example); + length += document.getFirstChild() == document.getLastChild() ? 0 : 1; + } + return length; + } } diff --git a/commonmark/src/test/java/org/commonmark/test/SpecTest.java b/commonmark/src/test/java/org/commonmark/test/SpecCoreTest.java similarity index 52% rename from commonmark/src/test/java/org/commonmark/test/SpecTest.java rename to commonmark/src/test/java/org/commonmark/test/SpecCoreTest.java index 7d62c22e2..fefd8fb30 100644 --- a/commonmark/src/test/java/org/commonmark/test/SpecTest.java +++ b/commonmark/src/test/java/org/commonmark/test/SpecCoreTest.java @@ -1,49 +1,27 @@ package org.commonmark.test; -import org.commonmark.html.HtmlRenderer; import org.commonmark.node.AbstractVisitor; import org.commonmark.node.Node; import org.commonmark.node.Text; -import org.commonmark.spec.SpecExample; -import org.commonmark.spec.SpecReader; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.junit.runners.Parameterized.Parameters; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.SpecTestCase; +import org.commonmark.testutil.example.Example; +import org.junit.jupiter.api.Test; -import java.util.ArrayList; -import java.util.List; +import static org.assertj.core.api.Assertions.fail; +import static org.commonmark.testutil.Asserts.assertRendering; -import static org.junit.Assert.fail; +public class SpecCoreTest extends SpecTestCase { -@RunWith(Parameterized.class) -public class SpecTest extends RenderingTestCase { - - protected final SpecExample example; - - public SpecTest(SpecExample example) { - this.example = example; - } - - @Parameters(name = "{0}") - public static List<Object[]> data() { - List<SpecExample> examples = SpecReader.readExamples(); - List<Object[]> data = new ArrayList<>(); - for (SpecExample example : examples) { - data.add(new Object[]{example}); - } - return 
data; - } - - @Test - public void testHtmlRendering() { - assertRendering(example.getSource(), example.getHtml()); - } + private static final Parser PARSER = Parser.builder().build(); + // The spec says URL-escaping is optional, but the examples assume that it's enabled. + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().percentEncodeUrls(true).build(); @Test public void testTextNodesContiguous() { final String source = example.getSource(); - Node node = parser.parse(source); + Node node = PARSER.parse(source); node.accept(new AbstractVisitor() { @Override protected void visitChildren(Node parent) { @@ -68,9 +46,12 @@ protected void visitChildren(Node parent) { }); } - @Override - protected void configureRenderer(HtmlRenderer.Builder rendererBuilder) { - // The spec says URL-escaping is optional, but the examples assume that it's enabled. - rendererBuilder.percentEncodeUrls(true); + @Test + public void testHtmlRendering() { + assertRendering(example.getSource(), example.getHtml(), render(example.getSource())); + } + + private String render(String source) { + return RENDERER.render(PARSER.parse(source)); } } diff --git a/commonmark/src/test/java/org/commonmark/test/SpecCrLfCoreTest.java b/commonmark/src/test/java/org/commonmark/test/SpecCrLfCoreTest.java new file mode 100644 index 000000000..47ca3da4e --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/SpecCrLfCoreTest.java @@ -0,0 +1,29 @@ +package org.commonmark.test; + +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.SpecTestCase; +import org.commonmark.testutil.example.Example; +import org.junit.jupiter.api.Test; + +import static org.commonmark.testutil.Asserts.assertRendering; + +/** + * Same as {@link SpecCoreTest} but converts line endings to Windows-style CR+LF endings before parsing. 
+ */ +public class SpecCrLfCoreTest extends SpecTestCase { + + private static final Parser PARSER = Parser.builder().build(); + // The spec says URL-escaping is optional, but the examples assume that it's enabled. + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().percentEncodeUrls(true).build(); + + @Test + public void testHtmlRendering() { + assertRendering(example.getSource(), example.getHtml(), render(example.getSource())); + } + + private String render(String source) { + String windowsStyle = source.replace("\n", "\r\n"); + return RENDERER.render(PARSER.parse(windowsStyle)); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java b/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java index 7dc5fd1f7..45cd3aea2 100644 --- a/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java +++ b/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java @@ -1,8 +1,8 @@ package org.commonmark.test; -import org.junit.Test; +import org.junit.jupiter.api.Test; -public class SpecialInputTest extends RenderingTestCase { +public class SpecialInputTest extends CoreRenderingTestCase { @Test public void empty() { @@ -32,6 +32,7 @@ public void crLfAtEndShouldBeParsed() { @Test public void mixedLineSeparators() { assertRendering("- a\n- b\r- c\r\n- d", "<ul>\n<li>a</li>\n<li>b</li>\n<li>c</li>\n<li>d</li>\n</ul>\n"); + assertRendering("a\n\nb\r\rc\r\n\r\nd\n\re", "<p>a</p>\n<p>b</p>\n<p>c</p>\n<p>d</p>\n<p>e</p>\n"); } @Test @@ -60,4 +61,168 @@ public void looseListInBlockQuote() { assertRendering("> *\n>\n> * a", "<blockquote>\n<ul>\n<li></li>\n<li>\n<p>a</p>\n</li>\n</ul>\n</blockquote>\n"); } + @Test + public void lineWithOnlySpacesAfterListBullet() { + assertRendering("- \n \n foo\n", "<ul>\n<li></li>\n</ul>\n<p>foo</p>\n"); + } + + @Test + public void listWithTwoSpacesForFirstBullet() { + // We have two spaces after the bullet, but no content. 
With content, the next line would be required + assertRendering("* \n foo\n", "<ul>\n<li>foo</li>\n</ul>\n"); + } + + @Test + public void orderedListMarkerOnly() { + assertRendering("2.", "<ol start=\"2\">\n<li></li>\n</ol>\n"); + } + + @Test + public void columnIsInTabOnPreviousLine() { + assertRendering("- foo\n\n\tbar\n\n# baz\n", + "<ul>\n<li>\n<p>foo</p>\n<p>bar</p>\n</li>\n</ul>\n<h1>baz</h1>\n"); + assertRendering("- foo\n\n\tbar\n# baz\n", + "<ul>\n<li>\n<p>foo</p>\n<p>bar</p>\n</li>\n</ul>\n<h1>baz</h1>\n"); + } + + @Test + public void linkLabelWithBracket() { + assertRendering("[a[b]\n\n[a[b]: /", "<p>[a[b]</p>\n<p>[a[b]: /</p>\n"); + assertRendering("[a]b]\n\n[a]b]: /", "<p>[a]b]</p>\n<p>[a]b]: /</p>\n"); + assertRendering("[a[b]]\n\n[a[b]]: /", "<p>[a[b]]</p>\n<p>[a[b]]: /</p>\n"); + } + + @Test + public void linkLabelLength() { + String label1 = "a".repeat(999); + assertRendering("[foo][" + label1 + "]\n\n[" + label1 + "]: /", "<p><a href=\"/\">foo</a></p>\n"); + assertRendering("[foo][x" + label1 + "]\n\n[x" + label1 + "]: /", + "<p>[foo][x" + label1 + "]</p>\n<p>[x" + label1 + "]: /</p>\n"); + assertRendering("[foo][\n" + label1 + "]\n\n[\n" + label1 + "]: /", + "<p>[foo][\n" + label1 + "]</p>\n<p>[\n" + label1 + "]: /</p>\n"); + + String label2 = "a\n".repeat(499); + assertRendering("[foo][" + label2 + "]\n\n[" + label2 + "]: /", "<p><a href=\"/\">foo</a></p>\n"); + assertRendering("[foo][12" + label2 + "]\n\n[12" + label2 + "]: /", + "<p>[foo][12" + label2 + "]</p>\n<p>[12" + label2 + "]: /</p>\n"); + } + + @Test + public void linkDestinationEscaping() { + // Backslash escapes `)` + assertRendering("[foo](\\))", "<p><a href=\")\">foo</a></p>\n"); + // ` ` is not escapable, so the backslash is a literal backslash and there's an optional space at the end + assertRendering("[foo](\\ )", "<p><a href=\"\\\">foo</a></p>\n"); + // Backslash is a literal, so valid + assertRendering("[foo](<a\\b>)", "<p><a href=\"a\\b\">foo</a></p>\n"); + // Backslash 
escapes `>` but there's another `>`, valid + assertRendering("[foo](<a\\>>)", "<p><a href=\"a>\">foo</a></p>\n"); + + // This is a tricky one. There's `<` so we try to parse it as a `<` link but fail. + assertRendering("[foo](<\\>)", "<p>[foo](<>)</p>\n"); + } + + // commonmark/CommonMark#468 + @Test + public void linkReferenceBackslash() { + // Backslash escapes ']', so not a valid link label + assertRendering("[\\]: test", "<p>[]: test</p>\n"); + // Backslash is a literal, so valid + assertRendering("[a\\b]\n\n[a\\b]: test", "<p><a href=\"test\">a\\b</a></p>\n"); + // Backslash escapes `]` but there's another `]`, valid + assertRendering("[a\\]]\n\n[a\\]]: test", "<p><a href=\"test\">a]</a></p>\n"); + } + + // commonmark/cmark#177 + @Test + public void emphasisMultipleOf3Rule() { + assertRendering("a***b* c*", "<p>a*<em><em>b</em> c</em></p>\n"); + } + + @Test + public void renderEvenRegexpProducesStackoverflow() { + render("Contents: <!--[if gte mso 9]> <w:LatentStyles DefLockedState=\"false\" DefUnhideWhenUsed=\"false\" DefSemiHidden=\"false\" DefQFormat=\"false\" DefPriority=\"99\" LatentStyleCount=\"371\"> <w:xxx Locked=\"false\" Priority=\"52\" Name=\"Grid Table 7 Colorful 6\"/> <w:xxx Locked=\"false\" Priority=\"46\" Name=\"List Table 1 Light\"/> <w:xxx Locked=\"false\" Priority=\"47\" Name=\"List Table 2\"/> <w:xxx Locked=\"false\" Priority=\"48\" Name=\"List Table 3\"/> <w:xxx Locked=\"false\" Priority=\"49\" Name=\"List Table 4\"/> <w:xxx Locked=\"false\" Priority=\"50\" Name=\"List Table 5 Dark\"/> <w:xxx Locked=\"false\" Priority=\"51\" Name=\"List Table 6 Colorful\"/> <w:xxx Locked=\"false\" Priority=\"52\" Name=\"List Table 7 Colorful\"/> <w:xxx Locked=\"false\" Priority=\"46\" Name=\"List Table 1 Light Accent 1\"/> <w:xxx Locked=\"false\" Priority=\"47\" Name=\"List Table 2 Accent 1\"/> <w:xxx Locked=\"false\" Priority=\"48\" Name=\"List Table 3 Accent 1\"/> <w:xxx Locked=\"false\" Priority=\"49\" Name=\"List Table 4 Accent 1\"/> <w:xxx 
Locked=\"false\" Priority=\"50\" Name=\"List Table 5 Dark Accent 1\"/> <w:xxx Locked=\"false\" Priority=\"52\" Name=\"List Table 7 Colorful Accent 1\"/> <w:xxx Locked=\"false\" Priority=\"46\" Name=\"List Table 1 Light Accent 2\"/> <w:xxx Locked=\"false\" Priority=\"47\" Name=\"List Table 2 Accent 2\"/> <w:xxx Locked=\"false\" Priority=\"48\" Name=\"List Table 3 Accent 2\"/> <w:xxx Locked=\"false\" Priority=\"49\" Name=\"List Table 4 Accent 2\"/> <w:xxx Locked=\"false\" Priority=\"50\" Name=\"List Table 5 Dark Accent 2\"/> <w:xxx Locked=\"false\" Priority=\"51\" Name=\"List Table 6 Colorful Accent 2\"/> <w:xxx Locked=\"false\" Priority=\"52\" Name=\"List Table 7 Colorful Accent 2\"/> <w:xxx Locked=\"false\" Priority=\"46\" Name=\"List Table 1 Light Accent 3\"/> <w:xxx Locked=\"false\" Priority=\"47\" Name=\"List Table 2 Accent 3\"/> <w:xxx Locked=\"false\" Priority=\"48\" Name=\"List Table 3 Accent 3\"/> <w:xxx Locked=\"false\" Priority=\"49\" Name=\"List Table 4 Accent 3\" /> <w:xxx Locked=\"false\" Priority=\"50\" Name=\"List Table 5 Dark Accent 3\"/><w:xxx Locked=\"false\" Priority=\"51\" Name=\"List Table 6 Colorful Accent 3\"/></xml>"); + } + + @Test + public void deeplyIndentedList() { + assertRendering("* one\n" + + " * two\n" + + " * three\n" + + " * four", + "<ul>\n" + + "<li>one\n" + + "<ul>\n" + + "<li>two\n" + + "<ul>\n" + + "<li>three\n" + + "<ul>\n" + + "<li>four</li>\n" + + "</ul>\n" + + "</li>\n" + + "</ul>\n" + + "</li>\n" + + "</ul>\n" + + "</li>\n" + + "</ul>\n"); + } + + @Test + public void trailingTabs() { + // The tab is not treated as 4 spaces here and so does not result in a hard line break, but is just preserved. + // This matches what commonmark.js did at the time of writing. 
+ assertRendering("a\t\nb\n", "<p>a\t\nb</p>\n"); + } + + @Test + public void unicodePunctuationEmphasis() { + // The character here is: U+12470 CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER + // Which is in Unicode category "Po" and needs 2 code units in UTF-16. That means to implement + // it correctly, we need to check code points, not Java chars. + // Note that currently the reference implementation doesn't implement this correctly (resulting in no <em>). + assertRendering("foo\uD809\uDC70_(bar)_", "<p>foo\uD809\uDC70<em>(bar)</em></p>\n"); + } + + @Test + public void htmlBlockInterruptingList() { + assertRendering("- <script>\n" + + "- some text\n" + + "some other text\n" + + "</script>\n", "<ul>\n" + + "<li>\n" + + "<script>\n" + + "</li>\n" + + "<li>some text\n" + + "some other text\n" + + "</script></li>\n" + + "</ul>\n"); + + assertRendering("- <script>\n" + + "- some text\n" + + "some other text\n" + + "\n" + + "</script>\n", "<ul>\n" + + "<li>\n" + + "<script>\n" + + "</li>\n" + + "<li>some text\n" + + "some other text</li>\n" + + "</ul>\n" + + "</script>\n"); + } + + @Test + public void emphasisAfterHardLineBreak() { + assertRendering("Hello \n" + + "**Bar**\n" + + "Foo\n", "<p>Hello<br />\n" + + "<strong>Bar</strong>\n" + + "Foo</p>\n"); + + assertRendering("Hello \n" + + "**Bar** \n" + + "Foo\n", "<p>Hello<br />\n" + + "<strong>Bar</strong><br />\n" + + "Foo</p>\n"); + } } diff --git a/commonmark/src/test/java/org/commonmark/test/TextContentRendererTest.java b/commonmark/src/test/java/org/commonmark/test/TextContentRendererTest.java new file mode 100644 index 000000000..46757e0c3 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/TextContentRendererTest.java @@ -0,0 +1,268 @@ +package org.commonmark.test; + +import org.commonmark.node.Link; +import org.commonmark.node.Node; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.text.LineBreakRendering; +import
org.commonmark.renderer.text.TextContentNodeRendererContext; +import org.commonmark.renderer.text.TextContentNodeRendererFactory; +import org.commonmark.renderer.text.TextContentRenderer; +import org.commonmark.parser.Parser; +import org.commonmark.testutil.Asserts; +import org.junit.jupiter.api.Test; + +import java.util.Set; + +public class TextContentRendererTest { /* Renders the same Markdown with three TextContentRenderer configurations (builder default, SEPARATE_BLOCKS, STRIP) and compares the plain text output. */ + + private static final Parser PARSER = Parser.builder().build(); + private static final TextContentRenderer COMPACT_RENDERER = TextContentRenderer.builder().build(); /* builder defaults; no lineBreakRendering is set */ + private static final TextContentRenderer SEPARATE_RENDERER = TextContentRenderer.builder() + .lineBreakRendering(LineBreakRendering.SEPARATE_BLOCKS).build(); + private static final TextContentRenderer STRIPPED_RENDERER = TextContentRenderer.builder() + .lineBreakRendering(LineBreakRendering.STRIP).build(); + + @Test + public void textContentText() { + String s; + + s = "foo bar"; + assertCompact(s, "foo bar"); + assertStripped(s, "foo bar"); + + s = "foo foo\n\nbar\nbar"; + assertCompact(s, "foo foo\nbar\nbar"); + assertSeparate(s, "foo foo\n\nbar\nbar"); + assertStripped(s, "foo foo bar bar"); + } + + @Test + public void textContentHeading() { + assertCompact("# Heading\n\nFoo", "Heading\nFoo"); + assertSeparate("# Heading\n\nFoo", "Heading\n\nFoo"); + assertStripped("# Heading\n\nFoo", "Heading: Foo"); + } + + @Test + public void textContentEmphasis() { + String s; + + s = "***foo***"; + assertCompact(s, "foo"); + assertStripped(s, "foo"); + + s = "foo ***foo*** bar ***bar***"; + assertCompact(s, "foo foo bar bar"); + assertStripped(s, "foo foo bar bar"); + + s = "foo\n***foo***\nbar\n\n***bar***"; + assertCompact(s, "foo\nfoo\nbar\nbar"); + assertSeparate(s, "foo\nfoo\nbar\n\nbar"); + assertStripped(s, "foo foo bar bar"); + } + + @Test + public void textContentQuotes() { + String s; + + s = "foo\n>foo\nbar\n\nbar"; + assertCompact(s, "foo\n«foo\nbar»\nbar"); + assertSeparate(s, "foo\n\n«foo\nbar»\n\nbar"); + assertStripped(s, "foo
«foo bar» bar"); + } + + @Test + public void textContentLinks() { + assertAll("foo [text](http://link \"title\") bar", "foo \"text\" (title: http://link) bar"); + assertAll("foo [text](http://link \"http://link\") bar", "foo \"text\" (http://link) bar"); + assertAll("foo [text](http://link) bar", "foo \"text\" (http://link) bar"); + assertAll("foo [text]() bar", "foo \"text\" bar"); + assertAll("foo http://link bar", "foo http://link bar"); + } + + @Test + public void textContentImages() { + assertAll("foo ![text](http://link \"title\") bar", "foo \"text\" (title: http://link) bar"); + assertAll("foo ![text](http://link) bar", "foo \"text\" (http://link) bar"); + assertAll("foo ![text]() bar", "foo \"text\" bar"); + } + + @Test + public void textContentLists() { + String s; + + s = "foo\n* foo\n* bar\n\nbar"; + assertCompact(s, "foo\n* foo\n* bar\nbar"); + assertSeparate(s, "foo\n\n* foo\n* bar\n\nbar"); + assertStripped(s, "foo foo bar bar"); + + s = "foo\n- foo\n- bar\n\nbar"; + assertCompact(s, "foo\n- foo\n- bar\nbar"); + assertSeparate(s, "foo\n\n- foo\n- bar\n\nbar"); + assertStripped(s, "foo foo bar bar"); + + s = "foo\n1. foo\n2. bar\n\nbar"; + assertCompact(s, "foo\n1. foo\n2. bar\nbar"); + assertSeparate(s, "foo\n\n1. foo\n2. bar\n\nbar"); + assertStripped(s, "foo 1. foo 2. bar bar"); + + s = "foo\n0) foo\n1) bar\n\nbar"; + assertCompact(s, "foo\n0) foo\n1) bar\nbar"); + assertSeparate(s, "foo\n0) foo\n\n1) bar\n\nbar"); + assertStripped(s, "foo 0) foo 1) bar bar"); + + s = "bar\n1. foo\n 1. bar\n2. foo"; + assertCompact(s, "bar\n1. foo\n 1. bar\n2. foo"); + assertSeparate(s, "bar\n\n1. foo\n 1. bar\n2. foo"); + assertStripped(s, "bar 1. foo 1. bar 2. foo"); + + s = "bar\n* foo\n - bar\n* foo"; + assertCompact(s, "bar\n* foo\n - bar\n* foo"); + assertSeparate(s, "bar\n\n* foo\n - bar\n* foo"); + assertStripped(s, "bar foo bar foo"); + + s = "bar\n* foo\n 1. bar\n 2. bar\n* foo"; + assertCompact(s, "bar\n* foo\n 1. bar\n 2.
bar\n* foo"); + assertSeparate(s, "bar\n\n* foo\n 1. bar\n 2. bar\n* foo"); + assertStripped(s, "bar foo 1. bar 2. bar foo"); + + s = "bar\n1. foo\n * bar\n * bar\n2. foo"; + assertCompact(s, "bar\n1. foo\n * bar\n * bar\n2. foo"); + assertSeparate(s, "bar\n\n1. foo\n * bar\n * bar\n2. foo"); + assertStripped(s, "bar 1. foo bar bar 2. foo"); + + // For a loose list (not tight) + s = "foo\n\n* bar\n\n* baz"; + // Compact ignores loose + assertCompact(s, "foo\n* bar\n* baz"); + // Separate preserves it + assertSeparate(s, "foo\n\n* bar\n\n* baz"); + assertStripped(s, "foo bar baz"); + + } + + @Test + public void textContentCode() { + assertAll("foo `code` bar", "foo \"code\" bar"); + } + + @Test + public void textContentCodeBlock() { + String s; + s = "foo\n```\nfoo\nbar\n```\nbar"; + assertCompact(s, "foo\nfoo\nbar\nbar"); + assertSeparate(s, "foo\n\nfoo\nbar\n\nbar"); + assertStripped(s, "foo foo bar bar"); + + s = "foo\n\n foo\n bar\nbar"; + assertCompact(s, "foo\nfoo\n bar\nbar"); + assertSeparate(s, "foo\n\nfoo\n bar\n\nbar"); + assertStripped(s, "foo foo bar bar"); + } + + @Test + public void textContentBreaks() { + String s; + + s = "foo\nbar"; + assertCompact(s, "foo\nbar"); + assertSeparate(s, "foo\nbar"); + assertStripped(s, "foo bar"); + + s = "foo \nbar"; + assertCompact(s, "foo\nbar"); + assertSeparate(s, "foo\nbar"); + assertStripped(s, "foo bar"); + + s = "foo\n___\nbar"; + assertCompact(s, "foo\n***\nbar"); + assertSeparate(s, "foo\n\n***\n\nbar"); + assertStripped(s, "foo bar"); + } + + @Test + public void textContentHtml() { + String html = "<table>\n" + + " <tr>\n" + + " <td>\n" + + " foobar\n" + + " </td>\n" + + " </tr>\n" + + "</table>"; + assertCompact(html, html); + assertSeparate(html, html); + + html = "foo <foo>foobar</foo> bar"; + assertAll(html, html); + } + + @Test + public void testContentNestedLists() { + var s = "List:\n" + + "1. 2) 3. \n" + + "end"; + assertCompact(s, s); + + var s2 = "1. A\n 1) B\n 1.
Test"; + assertCompact(s2, s2); + } + + @Test + public void testOverrideNodeRendering() { /* registers a custom renderer for Link nodes that wraps the link children in double quotes; the expected text below contains no URL */ + var nodeRendererFactory = new TextContentNodeRendererFactory() { + @Override + public NodeRenderer create(TextContentNodeRendererContext context) { + return new NodeRenderer() { + + @Override + public Set<Class<? extends Node>> getNodeTypes() { + return Set.of(Link.class); /* this renderer declares Link as its only node type */ + } + + @Override + public void render(Node node) { + context.getWriter().write('"'); + renderChildren(node); + context.getWriter().write('"'); + } + + private void renderChildren(Node parent) { + Node node = parent.getFirstChild(); + while (node != null) { + Node next = node.getNext(); /* sibling is read before the current node is rendered */ + context.render(node); + node = next; + } + } + }; + } + }; + var renderer = TextContentRenderer.builder().nodeRendererFactory(nodeRendererFactory).build(); + var source = "Hi [Example](https://example.com)"; + Asserts.assertRendering(source, "Hi \"Example\"", renderer.render(PARSER.parse(source))); + } + + private void assertCompact(String source, String expected) { /* parse, render with COMPACT_RENDERER, compare */ + var doc = PARSER.parse(source); + var actualRendering = COMPACT_RENDERER.render(doc); + Asserts.assertRendering(source, expected, actualRendering); + } + + private void assertSeparate(String source, String expected) { /* parse, render with SEPARATE_RENDERER, compare */ + var doc = PARSER.parse(source); + var actualRendering = SEPARATE_RENDERER.render(doc); + Asserts.assertRendering(source, expected, actualRendering); + } + + private void assertStripped(String source, String expected) { /* parse, render with STRIPPED_RENDERER, compare */ + var doc = PARSER.parse(source); + var actualRendering = STRIPPED_RENDERER.render(doc); + Asserts.assertRendering(source, expected, actualRendering); + } + + private void assertAll(String source, String expected) { /* the same expected text must come out of all three renderers */ + assertCompact(source, expected); + assertSeparate(source, expected); + assertStripped(source, expected); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/TextContentWriterTest.java b/commonmark/src/test/java/org/commonmark/test/TextContentWriterTest.java new file mode 100644 index
000000000..a9f37792e --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/TextContentWriterTest.java @@ -0,0 +1,55 @@ +package org.commonmark.test; + +import org.commonmark.renderer.text.TextContentWriter; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class TextContentWriterTest { /* Each test drives a fresh TextContentWriter backed by a StringBuilder and checks the accumulated text. */ + + @Test + public void whitespace() throws Exception { + StringBuilder stringBuilder = new StringBuilder(); + TextContentWriter writer = new TextContentWriter(stringBuilder); + writer.write("foo"); + writer.whitespace(); + writer.write("bar"); + assertThat(stringBuilder.toString()).isEqualTo("foo bar"); + } + + @Test + public void colon() throws Exception { + StringBuilder stringBuilder = new StringBuilder(); + TextContentWriter writer = new TextContentWriter(stringBuilder); + writer.write("foo"); + writer.colon(); + writer.write("bar"); + assertThat(stringBuilder.toString()).isEqualTo("foo:bar"); + } + + @Test + public void line() throws Exception { + StringBuilder stringBuilder = new StringBuilder(); + TextContentWriter writer = new TextContentWriter(stringBuilder); + writer.write("foo"); + writer.line(); + writer.write("bar"); + assertThat(stringBuilder.toString()).isEqualTo("foo\nbar"); + } + + @Test + public void writeStripped() throws Exception { /* a line break in the input is expected to come out as a single space */ + StringBuilder stringBuilder = new StringBuilder(); + TextContentWriter writer = new TextContentWriter(stringBuilder); + writer.writeStripped("foo\n bar"); + assertThat(stringBuilder.toString()).isEqualTo("foo bar"); + } + + @Test + public void write() throws Exception { /* exercises plain write(); writeStripped() is covered by the test above */ + StringBuilder stringBuilder = new StringBuilder(); + TextContentWriter writer = new TextContentWriter(stringBuilder); + writer.write("foo bar"); + assertThat(stringBuilder.toString()).isEqualTo("foo bar"); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/ThematicBreakParserTest.java b/commonmark/src/test/java/org/commonmark/test/ThematicBreakParserTest.java new file mode
100644 index 000000000..1d564cca2 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/ThematicBreakParserTest.java @@ -0,0 +1,25 @@ +package org.commonmark.test; + +import org.commonmark.node.ThematicBreak; +import org.commonmark.parser.Parser; +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class ThematicBreakParserTest { /* Checks the literal text recorded on parsed ThematicBreak nodes. */ + + private static final Parser PARSER = Parser.builder().build(); + + @Test + public void testLiteral() { + assertLiteral("***", "***"); + assertLiteral("-- -", "-- -"); + assertLiteral(" __ __ __ ", " __ __ __ "); + assertLiteral("***", "> ***"); /* expected literal omits the block quote prefix of the input */ + } + + private static void assertLiteral(String expected, String input) { /* argument order: expected literal first, Markdown input second */ + var tb = Nodes.find(PARSER.parse(input), ThematicBreak.class); /* Nodes is a helper declared outside this diff; presumably returns the first ThematicBreak in the parsed tree - TODO confirm */ + assertThat(tb.getLiteral()).isEqualTo(expected); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/UsageExampleTest.java b/commonmark/src/test/java/org/commonmark/test/UsageExampleTest.java index 30b923257..20cd9f5ab 100644 --- a/commonmark/src/test/java/org/commonmark/test/UsageExampleTest.java +++ b/commonmark/src/test/java/org/commonmark/test/UsageExampleTest.java @@ -1,20 +1,159 @@ package org.commonmark.test; -import org.commonmark.html.HtmlRenderer; -import org.commonmark.node.Node; +import org.commonmark.node.*; +import org.commonmark.parser.IncludeSourceSpans; import org.commonmark.parser.Parser; -import org.junit.Test; +import org.commonmark.renderer.NodeRenderer; +import org.commonmark.renderer.html.*; +import org.commonmark.renderer.markdown.MarkdownRenderer; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertEquals; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.Map; +import java.util.Set; + +import static org.assertj.core.api.Assertions.assertThat; public class UsageExampleTest { @Test - public
void one() { + public void parseAndRender() { Parser parser = Parser.builder().build(); - Node document = parser.parse("This is *Sparta*"); + Node document = parser.parse("This is *Markdown*"); HtmlRenderer renderer = HtmlRenderer.builder().escapeHtml(true).build(); - assertEquals("<p>This is <em>Sparta</em></p>\n", renderer.render(document)); + assertThat(renderer.render(document)).isEqualTo("<p>This is <em>Markdown</em></p>\n"); + } + + @Test + public void renderToMarkdown() { + MarkdownRenderer renderer = MarkdownRenderer.builder().build(); + Node document = new Document(); + Heading heading = new Heading(); + heading.setLevel(2); + heading.appendChild(new Text("My title")); + document.appendChild(heading); + + assertThat(renderer.render(document)).isEqualTo("## My title\n"); + } + + @Test + @Disabled /* presumably disabled because "file.md" is only a placeholder path - confirm */ + public void parseReaderRender() throws IOException { + Parser parser = Parser.builder().build(); + try (InputStreamReader reader = new InputStreamReader(new FileInputStream("file.md"), StandardCharsets.UTF_8)) { + Node document = parser.parseReader(reader); + // ...
+ } + } + + @Test + public void visitor() { + Parser parser = Parser.builder().build(); + Node node = parser.parse("Example\n=======\n\nSome more text"); + WordCountVisitor visitor = new WordCountVisitor(); + node.accept(visitor); + assertThat(visitor.wordCount).isEqualTo(4); /* "Example" (1 word) + "Some more text" (3 words) */ + } + + @Test + public void sourcePositions() { + var parser = Parser.builder().includeSourceSpans(IncludeSourceSpans.BLOCKS_AND_INLINES).build(); + + var source = "foo\n\nbar *baz*"; + var doc = parser.parse(source); + var emphasis = doc.getLastChild().getLastChild(); + var s = emphasis.getSourceSpans().get(0); + assertThat(s.getLineIndex()).isEqualTo(2); /* 0-based: "bar *baz*" is the third line of source */ + assertThat(s.getColumnIndex()).isEqualTo(4); /* 0-based position of the opening '*' within that line */ + assertThat(s.getInputIndex()).isEqualTo(9); /* 0-based offset of the opening '*' within the whole source string */ + assertThat(s.getLength()).isEqualTo(5); /* "*baz*" including both delimiters */ + assertThat(source.substring(s.getInputIndex(), s.getInputIndex() + s.getLength())).isEqualTo("*baz*"); + } + + @Test + public void addAttributes() { + Parser parser = Parser.builder().build(); + HtmlRenderer renderer = HtmlRenderer.builder() + .attributeProviderFactory(new AttributeProviderFactory() { + @Override + public AttributeProvider create(AttributeProviderContext context) { + return new ImageAttributeProvider(); + } + }) + .build(); + + Node document = parser.parse("![text](/url.png)"); + assertThat(renderer.render(document)).isEqualTo("<p><img src=\"/url.png\" alt=\"text\" class=\"border\" /></p>\n"); + } + + @Test + public void customizeRendering() { + Parser parser = Parser.builder().build(); + HtmlRenderer renderer = HtmlRenderer.builder() + .nodeRendererFactory(new HtmlNodeRendererFactory() { + @Override + public NodeRenderer create(HtmlNodeRendererContext context) { + return new IndentedCodeBlockNodeRenderer(context); + } + }) + .build(); + + Node document = parser.parse("Example:\n\n code"); + assertThat(renderer.render(document)).isEqualTo("<p>Example:</p>\n<pre>code\n</pre>\n"); + } + + class WordCountVisitor extends AbstractVisitor { + + int wordCount = 0; + + @Override + public void
visit(Text text) { + // This is called for all Text nodes. Override other visit methods for other node types. + + // Count words (this is just an example, don't actually do it this way for various reasons). + wordCount += text.getLiteral().split("\\W+").length; + + // Descend into children (could be omitted in this case because Text nodes don't have children). + visitChildren(text); + } + } + + class ImageAttributeProvider implements AttributeProvider { + @Override + public void setAttributes(Node node, String tagName, Map<String, String> attributes) { + if (node instanceof Image) { + attributes.put("class", "border"); /* shows up as class="border" on the <img> expected by addAttributes */ + } + } } + class IndentedCodeBlockNodeRenderer implements NodeRenderer { + + private final HtmlWriter html; + + IndentedCodeBlockNodeRenderer(HtmlNodeRendererContext context) { + this.html = context.getWriter(); + } + + @Override + public Set<Class<? extends Node>> getNodeTypes() { + // Return the node types we want to use this renderer for. + return Set.of(IndentedCodeBlock.class); + } + + @Override + public void render(Node node) { + // We only handle one type as per getNodeTypes, so we can just cast it here.
+ IndentedCodeBlock codeBlock = (IndentedCodeBlock) node; + html.line(); + html.tag("pre"); + html.text(codeBlock.getLiteral()); + html.tag("/pre"); + html.line(); + } + } } diff --git a/commonmark/src/test/java/org/commonmark/text/CharactersTest.java b/commonmark/src/test/java/org/commonmark/text/CharactersTest.java new file mode 100644 index 000000000..99f510cb7 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/text/CharactersTest.java @@ -0,0 +1,33 @@ +package org.commonmark.text; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class CharactersTest { + + @Test + public void isPunctuation() { + // From https://spec.commonmark.org/0.29/#ascii-punctuation-character + char[] chars = { + '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', // (U+0021–2F) + ':', ';', '<', '=', '>', '?', '@', // (U+003A–0040) + '[', '\\', ']', '^', '_', '`', // (U+005B–0060) + '{', '|', '}', '~' // (U+007B–007E) + }; + + for (char c : chars) { + assertThat(Characters.isPunctuationCodePoint(c)).as("Expected to be punctuation: " + c).isTrue(); + } + } + + @Test + public void isBlank() { + assertThat(Characters.isBlank("")).isTrue(); + assertThat(Characters.isBlank(" ")).isTrue(); + assertThat(Characters.isBlank("\t")).isTrue(); + assertThat(Characters.isBlank(" \t")).isTrue(); + assertThat(Characters.isBlank("a")).isFalse(); + assertThat(Characters.isBlank("\f")).isFalse(); + } +} diff --git a/etc/benchmark.sh b/etc/benchmark.sh new file mode 100755 index 000000000..89d3cfb4c --- /dev/null +++ b/etc/benchmark.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cd "$(dirname "$0")/.." || exit 1 # quoted so paths with spaces work; stop instead of running mvn in the wrong directory
+mvn -pl commonmark -Pbenchmark -DskipTests clean package exec:exec diff --git a/etc/update-spec.sh b/etc/update-spec.sh index 2879fc4a0..0f9def8b3 100755 --- a/etc/update-spec.sh +++ b/etc/update-spec.sh @@ -6,4 +6,9 @@ if [ "$#" -ne 1 ]; then fi version=$1 -curl -L "https://raw.githubusercontent.com/jgm/CommonMark/$version/spec.txt" -o commonmark/src/test/resources/spec.txt +curl -fL "https://raw.githubusercontent.com/commonmark/commonmark-spec/$version/spec.txt" -o commonmark-test-util/src/main/resources/spec.txt || exit 1 # -f: fail on HTTP errors (e.g. an unknown version) instead of saving the error page as the spec +curl -fL "https://raw.githubusercontent.com/github/cmark-gfm/master/test/spec.txt" -o commonmark-test-util/src/main/resources/gfm-spec.txt || exit 1 + +echo "Check cmark and commonmark.js regression.txt:" +echo "https://github.com/commonmark/cmark/blob/master/test/regression.txt" +echo "https://github.com/commonmark/commonmark.js/blob/master/test/regression.txt" diff --git a/mvnw b/mvnw new file mode 100755 index 000000000..19529ddf8 --- /dev/null +++ b/mvnw @@ -0,0 +1,259 @@ +#!/bin/sh +# ---------------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Apache Maven Wrapper startup batch script, version 3.3.2 +# +# Optional ENV vars +# ----------------- +# JAVA_HOME - location of a JDK home dir, required when download maven via java source +# MVNW_REPOURL - repo url base for downloading maven distribution +# MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven +# MVNW_VERBOSE - true: enable verbose log; debug: trace the mvnw script; others: silence the output +# ---------------------------------------------------------------------------- + +set -euf # -e: exit on error, -u: unset variables are errors, -f: no pathname expansion +[ "${MVNW_VERBOSE-}" != debug ] || set -x + +# OS specific support. +native_path() { printf %s\\n "$1"; } # default: return the path unchanged; redefined below for Cygwin/MinGW +case "$(uname)" in +CYGWIN* | MINGW*) + [ -z "${JAVA_HOME-}" ] || JAVA_HOME="$(cygpath --unix "$JAVA_HOME")" + native_path() { cygpath --path --windows "$1"; } + ;; +esac + +# set JAVACMD and JAVACCMD +set_java_home() { + # For Cygwin and MinGW, ensure paths are in Unix format before anything is touched + if [ -n "${JAVA_HOME-}" ]; then + if [ -x "$JAVA_HOME/jre/sh/java" ]; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + JAVACCMD="$JAVA_HOME/jre/sh/javac" + else + JAVACMD="$JAVA_HOME/bin/java" + JAVACCMD="$JAVA_HOME/bin/javac" + + if [ ! -x "$JAVACMD" ] || [ ! -x "$JAVACCMD" ]; then + echo "The JAVA_HOME environment variable is not defined correctly, so mvnw cannot run." >&2 + echo "JAVA_HOME is set to \"$JAVA_HOME\", but \"\$JAVA_HOME/bin/java\" or \"\$JAVA_HOME/bin/javac\" does not exist." >&2 + return 1 + fi + fi + else + JAVACMD="$( + 'set' +e + 'unset' -f command 2>/dev/null + 'command' -v java + )" || : + JAVACCMD="$( + 'set' +e + 'unset' -f command 2>/dev/null + 'command' -v javac + )" || : + + if [ ! -x "${JAVACMD-}" ] || [ !
-x "${JAVACCMD-}" ]; then + echo "The java/javac command does not exist in PATH nor is JAVA_HOME set, so mvnw cannot run." >&2 + return 1 + fi + fi +} + +# hash string like Java String::hashCode +hash_string() { + str="${1:-}" h=0 + while [ -n "$str" ]; do + char="${str%"${str#?}"}" + h=$(((h * 31 + $(LC_CTYPE=C printf %d "'$char")) % 4294967296)) # modulo 2^32 keeps the hash within 32 bits + str="${str#?}" + done + printf %x\\n $h +} + +verbose() { :; } +[ "${MVNW_VERBOSE-}" != true ] || verbose() { printf %s\\n "${1-}"; } + +die() { + printf %s\\n "$1" >&2 + exit 1 +} + +trim() { + # MWRAPPER-139: + # Trims trailing and leading whitespace, carriage returns, tabs, and linefeeds. + # Needed for removing poorly interpreted newline sequences when running in more + # exotic environments such as mingw bash on Windows. + printf "%s" "${1}" | tr -d '[:space:]' # note: tr -d deletes every whitespace character, not only leading/trailing ones +} + +# parse distributionUrl and optional distributionSha256Sum, requires .mvn/wrapper/maven-wrapper.properties +while IFS="=" read -r key value; do + case "${key-}" in + distributionUrl) distributionUrl=$(trim "${value-}") ;; + distributionSha256Sum) distributionSha256Sum=$(trim "${value-}") ;; + esac +done <"${0%/*}/.mvn/wrapper/maven-wrapper.properties" +[ -n "${distributionUrl-}" ] || die "cannot read distributionUrl property in ${0%/*}/.mvn/wrapper/maven-wrapper.properties" + +case "${distributionUrl##*/}" in +maven-mvnd-*bin.*) + MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ + case "${PROCESSOR_ARCHITECTURE-}${PROCESSOR_ARCHITEW6432-}:$(uname -a)" in + *AMD64:CYGWIN* | *AMD64:MINGW*) distributionPlatform=windows-amd64 ;; + :Darwin*x86_64) distributionPlatform=darwin-amd64 ;; + :Darwin*arm64) distributionPlatform=darwin-aarch64 ;; + :Linux*x86_64*) distributionPlatform=linux-amd64 ;; + *) + echo "Cannot detect native platform for mvnd on $(uname)-$(uname -m), use pure java version" >&2 + distributionPlatform=linux-amd64 + ;; + esac + distributionUrl="${distributionUrl%-bin.*}-$distributionPlatform.zip" + ;; +maven-mvnd-*) MVN_CMD=mvnd.sh
_MVNW_REPO_PATTERN=/maven/mvnd/ ;; +*) MVN_CMD="mvn${0##*/mvnw}" _MVNW_REPO_PATTERN=/org/apache/maven/ ;; +esac + +# apply MVNW_REPOURL and calculate MAVEN_HOME +# maven home pattern: ~/.m2/wrapper/dists/{apache-maven-<version>,maven-mvnd-<version>-<platform>}/<hash> +[ -z "${MVNW_REPOURL-}" ] || distributionUrl="$MVNW_REPOURL$_MVNW_REPO_PATTERN${distributionUrl#*"$_MVNW_REPO_PATTERN"}" +distributionUrlName="${distributionUrl##*/}" +distributionUrlNameMain="${distributionUrlName%.*}" +distributionUrlNameMain="${distributionUrlNameMain%-bin}" +MAVEN_USER_HOME="${MAVEN_USER_HOME:-${HOME}/.m2}" +MAVEN_HOME="${MAVEN_USER_HOME}/wrapper/dists/${distributionUrlNameMain-}/$(hash_string "$distributionUrl")" + +exec_maven() { + unset MVNW_VERBOSE MVNW_USERNAME MVNW_PASSWORD MVNW_REPOURL || : + exec "$MAVEN_HOME/bin/$MVN_CMD" "$@" || die "cannot exec $MAVEN_HOME/bin/$MVN_CMD" +} + +if [ -d "$MAVEN_HOME" ]; then + verbose "found existing MAVEN_HOME at $MAVEN_HOME" + exec_maven "$@" # exec_maven uses exec, so the download steps below are only reached when MAVEN_HOME is missing +fi + +case "${distributionUrl-}" in +*?-bin.zip | *?maven-mvnd-?*-?*.zip) ;; +*) die "distributionUrl is not valid, must match *-bin.zip or maven-mvnd-*.zip, but found '${distributionUrl-}'" ;; +esac + +# prepare tmp dir +if TMP_DOWNLOAD_DIR="$(mktemp -d)" && [ -d "$TMP_DOWNLOAD_DIR" ]; then + clean() { rm -rf -- "$TMP_DOWNLOAD_DIR"; } + trap clean HUP INT TERM EXIT # remove the temp download dir on exit and on these signals +else + die "cannot create temp dir" +fi + +mkdir -p -- "${MAVEN_HOME%/*}" + +# Download and Install Apache Maven +verbose "Couldn't find MAVEN_HOME, downloading and installing it ..." +verbose "Downloading from: $distributionUrl" +verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName" + +# select .zip or .tar.gz +if !
command -v unzip >/dev/null; then + distributionUrl="${distributionUrl%.zip}.tar.gz" + distributionUrlName="${distributionUrl##*/}" +fi + +# verbose opt +__MVNW_QUIET_WGET=--quiet __MVNW_QUIET_CURL=--silent __MVNW_QUIET_UNZIP=-q __MVNW_QUIET_TAR='' +[ "${MVNW_VERBOSE-}" != true ] || __MVNW_QUIET_WGET='' __MVNW_QUIET_CURL='' __MVNW_QUIET_UNZIP='' __MVNW_QUIET_TAR=v + +# normalize http auth +case "${MVNW_PASSWORD:+has-password}" in +'') MVNW_USERNAME='' MVNW_PASSWORD='' ;; +has-password) [ -n "${MVNW_USERNAME-}" ] || MVNW_USERNAME='' MVNW_PASSWORD='' ;; +esac + +if [ -z "${MVNW_USERNAME-}" ] && command -v wget >/dev/null; then # wget/curl are only tried when no username is set; otherwise the Java downloader below is used + verbose "Found wget ... using wget" + wget ${__MVNW_QUIET_WGET:+"$__MVNW_QUIET_WGET"} "$distributionUrl" -O "$TMP_DOWNLOAD_DIR/$distributionUrlName" || die "wget: Failed to fetch $distributionUrl" +elif [ -z "${MVNW_USERNAME-}" ] && command -v curl >/dev/null; then + verbose "Found curl ... using curl" + curl ${__MVNW_QUIET_CURL:+"$__MVNW_QUIET_CURL"} -f -L -o "$TMP_DOWNLOAD_DIR/$distributionUrlName" "$distributionUrl" || die "curl: Failed to fetch $distributionUrl" +elif set_java_home; then + verbose "Falling back to use Java to download" + javaSource="$TMP_DOWNLOAD_DIR/Downloader.java" + targetZip="$TMP_DOWNLOAD_DIR/$distributionUrlName" + cat >"$javaSource" <<-END + public class Downloader extends java.net.Authenticator + { + protected java.net.PasswordAuthentication getPasswordAuthentication() + { + return new java.net.PasswordAuthentication( System.getenv( "MVNW_USERNAME" ), System.getenv( "MVNW_PASSWORD" ).toCharArray() ); + } + public static void main( String[] args ) throws Exception + { + setDefault( new Downloader() ); + java.nio.file.Files.copy( java.net.URI.create( args[0] ).toURL().openStream(), java.nio.file.Paths.get( args[1] ).toAbsolutePath().normalize() ); + } + } + END + # For Cygwin/MinGW, switch paths to Windows format before running javac and java + verbose " - Compiling Downloader.java ..."
+ "$(native_path "$JAVACCMD")" "$(native_path "$javaSource")" || die "Failed to compile Downloader.java" + verbose " - Running Downloader.java ..." + "$(native_path "$JAVACMD")" -cp "$(native_path "$TMP_DOWNLOAD_DIR")" Downloader "$distributionUrl" "$(native_path "$targetZip")" +fi + +# If specified, validate the SHA-256 sum of the Maven distribution zip file +if [ -n "${distributionSha256Sum-}" ]; then + distributionSha256Result=false # flipped to true only when sha256sum/shasum -c accepts the downloaded archive + if [ "$MVN_CMD" = mvnd.sh ]; then + echo "Checksum validation is not supported for maven-mvnd." >&2 + echo "Please disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2 + exit 1 + elif command -v sha256sum >/dev/null; then + if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | sha256sum -c >/dev/null 2>&1; then + distributionSha256Result=true + fi + elif command -v shasum >/dev/null; then + if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | shasum -a 256 -c >/dev/null 2>&1; then + distributionSha256Result=true + fi + else + echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2 + echo "Please install either command, or disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2 + exit 1 + fi + if [ $distributionSha256Result = false ]; then + echo "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised." >&2 + echo "If you updated your Maven version, you need to update the specified distributionSha256Sum property."
>&2 + exit 1 + fi +fi + +# unzip and move +if command -v unzip >/dev/null; then + unzip ${__MVNW_QUIET_UNZIP:+"$__MVNW_QUIET_UNZIP"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -d "$TMP_DOWNLOAD_DIR" || die "failed to unzip" +else + tar xzf${__MVNW_QUIET_TAR:+"$__MVNW_QUIET_TAR"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -C "$TMP_DOWNLOAD_DIR" || die "failed to untar" +fi +printf %s\\n "$distributionUrl" >"$TMP_DOWNLOAD_DIR/$distributionUrlNameMain/mvnw.url" +mv -- "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" "$MAVEN_HOME" || [ -d "$MAVEN_HOME" ] || die "fail to move MAVEN_HOME" + +clean || : +exec_maven "$@" diff --git a/pom.xml b/pom.xml index cd490c1cb..f12805316 100644 --- a/pom.xml +++ b/pom.xml @@ -1,49 +1,129 @@ <?xml version="1.0" encoding="UTF-8"?> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> - <parent> - <groupId>com.atlassian.pom</groupId> - <artifactId>central-pom</artifactId> - <version>3.0.91</version> - </parent> - <packaging>pom</packaging> - <groupId>com.atlassian.commonmark</groupId> + <groupId>org.commonmark</groupId> <artifactId>commonmark-parent</artifactId> - <version>0.1.1-SNAPSHOT</version> + <version>0.28.1-SNAPSHOT</version> <name>commonmark-java parent</name> <description> Java implementation of CommonMark, a specification of the Markdown format for turning plain text into formatted text.
</description> - <url>https://github.com/atlassian/commonmark-java</url> + <url>https://github.com/commonmark/commonmark-java</url> <modules> <module>commonmark</module> <module>commonmark-ext-autolink</module> + <module>commonmark-ext-footnotes</module> + <module>commonmark-ext-gfm-alerts</module> <module>commonmark-ext-gfm-strikethrough</module> <module>commonmark-ext-gfm-tables</module> + <module>commonmark-ext-heading-anchor</module> + <module>commonmark-ext-image-attributes</module> + <module>commonmark-ext-ins</module> + <module>commonmark-ext-task-list-items</module> + <module>commonmark-ext-yaml-front-matter</module> <module>commonmark-integration-test</module> + <module>commonmark-test-util</module> </modules> <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <commonmark.javadoc.location>${project.basedir}/../commonmark/target/apidocs/</commonmark.javadoc.location> <!-- Used as the offlineLink location of the javadoc plugin configured in pluginManagement --> </properties> <build> + <pluginManagement> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>3.14.0</version> + <configuration> + <release>11</release> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <version>3.4.2</version> + <configuration> + <archive> + <manifestFile>${project.build.outputDirectory}/META-INF/MANIFEST.MF</manifestFile> <!-- NOTE(review): presumably the manifest written by the maven-bundle-plugin "manifest" goal configured in build/plugins; confirm --> + </archive> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-install-plugin</artifactId> + <version>3.1.4</version> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-javadoc-plugin</artifactId> + <version>3.11.2</version> + <configuration> + <excludePackageNames>*.internal,*.internal.*</excludePackageNames> + <!-- The offline links make links from extensions to core work.
--> + <detectOfflineLinks>false</detectOfflineLinks> + <offlineLinks> + <offlineLink> + <url>https://static.javadoc.io/org.commonmark/commonmark/${project.version}/ + </url> + <location>${commonmark.javadoc.location}</location> + </offlineLink> + </offlineLinks> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>3.5.3</version> + </plugin> + </plugins> + </pluginManagement> + <plugins> + <!-- https://central.sonatype.org/publish/publish-portal-maven/ --> + <plugin> + <groupId>org.sonatype.central</groupId> + <artifactId>central-publishing-maven-plugin</artifactId> + <version>0.8.0</version> + <extensions>true</extensions> + <configuration> + <publishingServerId>central</publishingServerId> + <autoPublish>true</autoPublish> + <waitUntil>published</waitUntil> + </configuration> + </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-compiler-plugin</artifactId> - <version>3.3</version> + <artifactId>maven-release-plugin</artifactId> + <version>3.1.1</version> <configuration> - <source>7</source> - <target>7</target> - <testCompilerArgument>-proc:none</testCompilerArgument> + <autoVersionSubmodules>true</autoVersionSubmodules> + <useReleaseProfile>false</useReleaseProfile> + <releaseProfiles>release</releaseProfiles> + <goals>deploy</goals> </configuration> </plugin> + <plugin> + <groupId>org.apache.felix</groupId> + <artifactId>maven-bundle-plugin</artifactId> + <!-- 6.0.0 requires Java 17+ (currently on Java 11) --> + <version>5.1.9</version> + <executions> + <execution> + <id>bundle-manifest</id> + <phase>process-classes</phase> + <goals> + <goal>manifest</goal> + </goals> + </execution> + </executions> + </plugin> </plugins> </build> @@ -51,47 +131,86 @@ <dependencies> <!-- For dependencies between modules --> <dependency> - <groupId>com.atlassian.commonmark</groupId> + <groupId>org.commonmark</groupId> <artifactId>commonmark</artifactId> -
<version>0.1.1-SNAPSHOT</version> + <version>0.28.1-SNAPSHOT</version> </dependency> <dependency> - <groupId>com.atlassian.commonmark</groupId> + <groupId>org.commonmark</groupId> <artifactId>commonmark-ext-autolink</artifactId> - <version>0.1.1-SNAPSHOT</version> + <version>0.28.1-SNAPSHOT</version> + </dependency> + <dependency> + <groupId>org.commonmark</groupId> + <artifactId>commonmark-ext-footnotes</artifactId> + <version>0.28.1-SNAPSHOT</version> </dependency> <dependency> - <groupId>com.atlassian.commonmark</groupId> + <groupId>org.commonmark</groupId> + <artifactId>commonmark-ext-image-attributes</artifactId> + <version>0.28.1-SNAPSHOT</version> + </dependency> + <dependency> + <groupId>org.commonmark</groupId> + <artifactId>commonmark-ext-ins</artifactId> + <version>0.28.1-SNAPSHOT</version> + </dependency> + <dependency> + <groupId>org.commonmark</groupId> + <artifactId>commonmark-ext-gfm-alerts</artifactId> + <version>0.28.1-SNAPSHOT</version> + </dependency> + <dependency> + <groupId>org.commonmark</groupId> <artifactId>commonmark-ext-gfm-strikethrough</artifactId> - <version>0.1.1-SNAPSHOT</version> + <version>0.28.1-SNAPSHOT</version> </dependency> <dependency> - <groupId>com.atlassian.commonmark</groupId> + <groupId>org.commonmark</groupId> <artifactId>commonmark-ext-gfm-tables</artifactId> - <version>0.1.1-SNAPSHOT</version> + <version>0.28.1-SNAPSHOT</version> </dependency> <dependency> - <groupId>com.atlassian.commonmark</groupId> - <artifactId>commonmark</artifactId> - <version>0.1.1-SNAPSHOT</version> - <type>test-jar</type> + <groupId>org.commonmark</groupId> + <artifactId>commonmark-ext-heading-anchor</artifactId> + <version>0.28.1-SNAPSHOT</version> + </dependency> + <dependency> + <groupId>org.commonmark</groupId> + <artifactId>commonmark-ext-task-list-items</artifactId> + <version>0.28.1-SNAPSHOT</version> + </dependency> + <dependency> + <groupId>org.commonmark</groupId> + <artifactId>commonmark-ext-yaml-front-matter</artifactId> +
<version>0.28.1-SNAPSHOT</version> + </dependency> + <dependency> + <groupId>org.commonmark</groupId> + <artifactId>commonmark-test-util</artifactId> + <version>0.28.1-SNAPSHOT</version> </dependency> <!-- Common test dependencies --> <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <version>4.12</version> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter</artifactId> + <version>5.13.1</version> + </dependency> + <dependency> + <groupId>org.assertj</groupId> + <artifactId>assertj-core</artifactId> + <version>3.27.7</version> </dependency> <dependency> <groupId>org.openjdk.jmh</groupId> <artifactId>jmh-core</artifactId> - <version>1.10.3</version> + <version>1.37</version> </dependency> <dependency> <groupId>org.openjdk.jmh</groupId> <artifactId>jmh-generator-annprocess</artifactId> - <version>1.10.3</version> + <version>1.37</version> </dependency> </dependencies> </dependencyManagement> @@ -104,6 +223,7 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-source-plugin</artifactId> + <version>3.3.1</version> <executions> <execution> <id>attach-sources</id> @@ -124,13 +244,11 @@ </goals> </execution> </executions> - <configuration> - <excludePackageNames>*.internal,*.internal.*</excludePackageNames> - </configuration> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-gpg-plugin</artifactId> + <version>3.2.7</version> <executions> <execution> <id>sign-artifacts</id> @@ -138,15 +256,41 @@ <goals> <goal>sign</goal> </goals> + <configuration> + <gpgArguments> + <arg>--pinentry-mode</arg> + <arg>loopback</arg> + </gpgArguments> + </configuration> </execution> </executions> </plugin> + </plugins> + </build> + </profile> + <profile> + <id>coverage</id> + <build> + <plugins> <plugin> - <groupId>org.sonatype.plugins</groupId> - <artifactId>nexus-staging-maven-plugin</artifactId> + <groupId>org.jacoco</groupId> + <artifactId>jacoco-maven-plugin</artifactId> +
<version>0.8.13</version> <configuration> - <autoReleaseAfterClose>false</autoReleaseAfterClose> + <excludes> + <!-- Classes from test-util --> + <exclude>org/commonmark/spec/*</exclude> + <exclude>org/commonmark/test/*</exclude> + </excludes> </configuration> + <executions> + <execution> + <id>prepare-agent</id> + <goals> + <goal>prepare-agent</goal> + </goals> + </execution> + </executions> </plugin> </plugins> </build> @@ -155,8 +299,8 @@ <licenses> <license> - <name>BSD 2-Clause License</name> - <url>http://opensource.org/licenses/BSD-2-Clause</url> + <name>BSD-2-Clause</name> + <url>https://opensource.org/licenses/BSD-2-Clause</url> <distribution>repo</distribution> </license> </licenses> @@ -164,26 +308,14 @@ <developers> <developer> <name>Robin Stocker</name> - <email>rstocker@atlassian.com</email> - <organization>Atlassian</organization> - <organizationUrl>https://www.atlassian.com/</organizationUrl> </developer> </developers> <scm> - <connection>scm:git:git@github.com:atlassian/commonmark-java.git</connection> - <developerConnection>scm:git:git@github.com:atlassian/commonmark-java.git</developerConnection> - <url>https://github.com/atlassian/commonmark-java</url> + <connection>scm:git:https://github.com/commonmark/commonmark-java</connection> + <developerConnection>scm:git:https://github.com/commonmark/commonmark-java</developerConnection> + <url>https://github.com/commonmark/commonmark-java</url> <tag>HEAD</tag> </scm> - <!-- Work around central-pom depending on things that are not in Maven Central --> - <pluginRepositories> - <pluginRepository> - <id>atlassian-public</id> - <name>Atlassian Public Repository</name> - <url>https://maven.atlassian.com/repository/public</url> - </pluginRepository> - </pluginRepositories> - </project> diff --git a/renovate.json b/renovate.json new file mode 100644 index 000000000..f45d8f110 --- /dev/null +++ b/renovate.json @@ -0,0 +1,5 @@ +{ + "extends": [ + "config:base" + ] +}