/*
Package parser implements a parser for markdown text that generates AST (abstract syntax tree).
*/
package parser

import (
    "bytes"
    "fmt"
    "strconv"
    "strings"
    "unicode"
    "unicode/utf8"

    "github.com/gomarkdown/markdown/ast"
)

// Extensions is a bitmask of enabled parser extensions.
type Extensions int

// Bit flags representing markdown parsing extensions.
// Use | (or) to specify multiple extensions.
const (
    NoExtensions           Extensions = 0
    NoIntraEmphasis        Extensions = 1 << iota // Ignore emphasis markers inside words
    Tables                                        // Parse tables
    FencedCode                                    // Parse fenced code blocks
    Autolink                                      // Detect embedded URLs that are not explicitly marked
    Strikethrough                                 // Strikethrough text using ~~text~~
    LaxHTMLBlocks                                 // Loosen up HTML block parsing rules
    SpaceHeadings                                 // Be strict about prefix heading rules
    HardLineBreak                                 // Translate newlines into line breaks
    NonBlockingSpace                              // Translate backspace spaces into non-blocking spaces
    TabSizeEight                                  // Expand tabs to eight spaces instead of four
    Footnotes                                     // Pandoc-style footnotes
    NoEmptyLineBeforeBlock                        // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
    HeadingIDs                                    // Specify heading IDs with {#id}
    Titleblock                                    // Titleblock ala pandoc
    AutoHeadingIDs                                // Create the heading ID from the text
    BackslashLineBreak                            // Translate trailing backslashes into line breaks
    DefinitionLists                               // Parse definition lists
    MathJax                                       // Parse MathJax
    OrderedListStart                              // Keep track of the first number used when starting an ordered list
    Attributes                                    // Block Attributes
    SuperSubscript                                // Super- and subscript support: 2^10^, H~2~O
    EmptyLinesBreakList                           // 2 empty lines break out of list
    Includes                                      // Support including other files
    Mmark                                         // Support Mmark syntax, see https://mmark.miek.nl/post/syntax/

    CommonExtensions Extensions = NoIntraEmphasis | Tables | FencedCode |
        Autolink | Strikethrough | SpaceHeadings | HeadingIDs |
        BackslashLineBreak | DefinitionLists | MathJax
)

// The size of a tab stop.
const (
    tabSizeDefault = 4
    tabSizeDouble  = 8
)

// InlineParser is a handler called for each character that triggers a
// response when parsing inline data.
type InlineParser func(p *Parser, data []byte, offset int) (int, ast.Node)

// ReferenceOverrideFunc is expected to be called with a reference string and
// return either a valid Reference type that the reference string maps to or
// nil. If overridden is false, the default reference logic will be executed.
// See the documentation in Options for more details on use-case.
type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)

// Parser is a type that holds extensions and the runtime state used by
// Parse, and the renderer. You cannot use it directly; construct it with New.
type Parser struct {

    // ReferenceOverride is an optional function callback that is called every
    // time a reference is resolved. It can be set before starting parsing.
    //
    // In Markdown, the link reference syntax can be made to resolve a link to
    // a reference instead of an inline URL, in one of the following ways:
    //
    //  * [link text][refid]
    //  * [refid][]
    //
    // Usually, the refid is defined at the bottom of the Markdown document. If
    // this override function is provided, the refid is passed to the override
    // function first, before consulting the defined refids at the bottom. If
    // the override function indicates an override did not occur, the refids at
    // the bottom will be used to fill in the link details.
    ReferenceOverride ReferenceOverrideFunc

    // IsSafeURLOverride allows overriding the default URL matcher. URL is
    // safe if the overriding function returns true. Can be used to extend
    // the default list of safe URLs.
    IsSafeURLOverride func(url []byte) bool

    Opts Options

    // after parsing, this is AST root of parsed markdown text
    Doc ast.Node

    extensions Extensions

    refs           map[string]*reference
    refsRecord     map[string]struct{}
    inlineCallback [256]InlineParser
    nesting        int
    maxNesting     int
    InsideLink     bool
    indexCnt       int // incremented after every index

    // Footnotes need to be ordered as well as available to quickly check for
    // presence. If a ref is also a footnote, it's stored both in refs and here
    // in notes. Slice is nil if footnotes not enabled.
    notes []*reference

    tip                  ast.Node // = doc
    oldTip               ast.Node
    lastMatchedContainer ast.Node // = doc
    allClosed            bool

    // Attributes are attached to block level elements.
    attr *ast.Attribute

    includeStack *incStack

    // collect headings where we auto-generated id so that we can
    // ensure they are unique at the end
    allHeadingsWithAutoID []*ast.Heading

    didParse bool
}
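// What follows is an illustrative sketch, not part of the package: one way a
// caller might use ReferenceOverride to resolve refids against an internal
// page map instead of definitions at the bottom of the document. The
// internalPages map and its entries are hypothetical.
/*
    p := parser.New()
    internalPages := map[string]*parser.Reference{
        "home": {Link: "/", Title: "Home page"},
    }
    p.ReferenceOverride = func(reference string) (*parser.Reference, bool) {
        if ref, ok := internalPages[reference]; ok {
            return ref, true // [some text][home] resolves via the map
        }
        return nil, false // fall back to refids defined in the document
    }
*/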
// New creates a markdown parser with CommonExtensions.
//
// You can then call `doc := p.Parse(markdown)` to parse markdown document
// and `markdown.Render(doc, renderer)` to convert it to another format with
// a renderer.
func New() *Parser {
    return NewWithExtensions(CommonExtensions)
}

// NewWithExtensions creates a markdown parser with given extensions.
func NewWithExtensions(extension Extensions) *Parser {
    p := Parser{
        refs:         make(map[string]*reference),
        refsRecord:   make(map[string]struct{}),
        maxNesting:   64,
        InsideLink:   false,
        Doc:          &ast.Document{},
        extensions:   extension,
        allClosed:    true,
        includeStack: newIncStack(),
    }
    p.tip = p.Doc
    p.oldTip = p.Doc
    p.lastMatchedContainer = p.Doc

    p.inlineCallback[' '] = maybeLineBreak
    p.inlineCallback['*'] = emphasis
    p.inlineCallback['_'] = emphasis
    if p.extensions&Strikethrough != 0 {
        p.inlineCallback['~'] = emphasis
    }
    p.inlineCallback['`'] = codeSpan
    p.inlineCallback['\n'] = lineBreak
    p.inlineCallback['['] = link
    p.inlineCallback['<'] = leftAngle
    p.inlineCallback['\\'] = escape
    p.inlineCallback['&'] = entity
    p.inlineCallback['!'] = maybeImage
    if p.extensions&Mmark != 0 {
        p.inlineCallback['('] = maybeShortRefOrIndex
    }
    p.inlineCallback['^'] = maybeInlineFootnoteOrSuper
    if p.extensions&Autolink != 0 {
        p.inlineCallback['h'] = maybeAutoLink
        p.inlineCallback['m'] = maybeAutoLink
        p.inlineCallback['f'] = maybeAutoLink
        p.inlineCallback['H'] = maybeAutoLink
        p.inlineCallback['M'] = maybeAutoLink
        p.inlineCallback['F'] = maybeAutoLink
    }
    if p.extensions&MathJax != 0 {
        p.inlineCallback['$'] = math
    }

    return &p
}

// RegisterInline registers fn as the inline parser for the character n and
// returns the previously registered parser (if any).
func (p *Parser) RegisterInline(n byte, fn InlineParser) InlineParser {
    prev := p.inlineCallback[n]
    p.inlineCallback[n] = fn
    return prev
}

// getRef resolves a reference id, consulting ReferenceOverride first (if set)
// and then the refids collected from the document.
func (p *Parser) getRef(refid string) (ref *reference, found bool) {
    if p.ReferenceOverride != nil {
        r, overridden := p.ReferenceOverride(refid)
        if overridden {
            if r == nil {
                return nil, false
            }
            return &reference{
                link:     []byte(r.Link),
                title:    []byte(r.Title),
                noteID:   0,
                hasBlock: false,
                text:     []byte(r.Text),
            }, true
        }
    }
    // refs are case insensitive
    ref, found = p.refs[strings.ToLower(refid)]
    return ref, found
}

func (p *Parser) isFootnote(ref *reference) bool {
    _, ok := p.refsRecord[string(ref.link)]
    return ok
}

// Finalize closes the given block by moving the tip back to its parent.
func (p *Parser) Finalize(block ast.Node) {
    p.tip = block.GetParent()
}

// addChild closes blocks until the tip can contain node, then appends node to
// the tip and makes it the new tip.
func (p *Parser) addChild(node ast.Node) ast.Node {
    for !canNodeContain(p.tip, node) {
        p.Finalize(p.tip)
    }
    ast.AppendChild(p.tip, node)
    p.tip = node
    return node
}

func canNodeContain(n ast.Node, v ast.Node) bool {
    switch n.(type) {
    case *ast.List:
        return isListItem(v)
    case *ast.Document, *ast.BlockQuote, *ast.Aside, *ast.ListItem, *ast.CaptionFigure:
        return !isListItem(v)
    case *ast.Table:
        switch v.(type) {
        case *ast.TableHeader, *ast.TableBody, *ast.TableFooter:
            return true
        default:
            return false
        }
    case *ast.TableHeader, *ast.TableBody, *ast.TableFooter:
        _, ok := v.(*ast.TableRow)
        return ok
    case *ast.TableRow:
        _, ok := v.(*ast.TableCell)
        return ok
    }
    // for nodes implemented outside of ast package, allow them
    // to implement this logic via CanContain interface
    if cc, ok := n.(ast.CanContain); ok {
        return cc.CanContain(v)
    }
    // for container nodes outside of ast package default to true
    // because false is a bad default
    typeName := fmt.Sprintf("%T", n)
    customNode := !strings.HasPrefix(typeName, "*ast.")
    if customNode {
        return n.AsLeaf() == nil
    }
    return false
}
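// Illustrative sketch, not part of the package: a block node defined outside
// the ast package can control which children the parser attaches to it by
// satisfying the ast.CanContain interface consulted above. The Callout type
// and its containment rule are hypothetical.
/*
    type Callout struct {
        ast.Container
    }

    // Only paragraphs may be nested inside a Callout; anything else closes it.
    func (c *Callout) CanContain(n ast.Node) bool {
        _, ok := n.(*ast.Paragraph)
        return ok
    }
*/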
func (p *Parser) closeUnmatchedBlocks() {
    if p.allClosed {
        return
    }
    for p.oldTip != p.lastMatchedContainer {
        parent := p.oldTip.GetParent()
        p.Finalize(p.oldTip)
        p.oldTip = parent
    }
    p.allClosed = true
}

// Reference represents the details of a link.
// See the documentation in Options for more details on use-case.
type Reference struct {
    // Link is usually the URL the reference points to.
    Link string
    // Title is the alternate text describing the link in more detail.
    Title string
    // Text is the optional text to override the ref with if the syntax used was
    // [refid][]
    Text string
}

// Parse generates AST (abstract syntax tree) representing markdown document.
//
// The result is a root of the tree whose underlying type is *ast.Document.
//
// You can then convert AST to html using html.Renderer, to some other format
// using a custom renderer or transform the tree.
//
// Parser is not reusable. Create a new Parser for each Parse() call.
func (p *Parser) Parse(data []byte) ast.Node {
    if p.didParse {
        panic("Parser is not reusable. Must create new Parser for each Parse() call.")
    }
    p.didParse = true

    // the code only works with Unix (LF) newlines so to make life easy for
    // callers normalize newlines
    data = NormalizeNewlines(data)

    p.Block(data)
    // Walk the tree and finish up the unfinished blocks
    for p.tip != nil {
        p.Finalize(p.tip)
    }
    // Walk the tree again and process inline markdown in each block
    ast.WalkFunc(p.Doc, func(node ast.Node, entering bool) ast.WalkStatus {
        switch node.(type) {
        case *ast.Paragraph, *ast.Heading, *ast.TableCell:
            p.Inline(node, node.AsContainer().Content)
            node.AsContainer().Content = nil
        }
        return ast.GoToNext
    })

    if p.Opts.Flags&SkipFootnoteList == 0 {
        p.parseRefsToAST()
    }

    // ensure HeadingIDs generated with AutoHeadingIDs are unique
    // this is delayed here (as opposed to done when we create the id)
    // so that we can preserve more original ids when there are conflicts
    seenHeadingIDs := map[string]bool{}
    for _, heading := range p.allHeadingsWithAutoID {
        id := heading.HeadingID
        if id == "" {
            continue
        }
        n := 0
        for seenHeadingIDs[id] {
            n++
            id = heading.HeadingID + "-" + strconv.Itoa(n)
        }
        heading.HeadingID = id
        seenHeadingIDs[id] = true
    }

    return p.Doc
}

func (p *Parser) parseRefsToAST() {
    if p.extensions&Footnotes == 0 || len(p.notes) == 0 {
        return
    }
    p.tip = p.Doc
    list := &ast.List{
        IsFootnotesList: true,
        ListFlags:       ast.ListTypeOrdered,
    }
    p.AddBlock(&ast.Footnotes{})
    p.AddBlock(list)
    flags := ast.ListItemBeginningOfList
    // Note: this loop is intentionally explicit, not range-form. This is
    // because the body of the loop will append nested footnotes to p.notes and
    // we need to process those late additions. Range form would only walk over
    // the fixed initial set.
    for i := 0; i < len(p.notes); i++ {
        ref := p.notes[i]
        p.addChild(ref.footnote)
        footnoteBlock := ref.footnote
        listItem := footnoteBlock.(*ast.ListItem)
        listItem.ListFlags = flags | ast.ListTypeOrdered
        listItem.RefLink = ref.link
        if ref.hasBlock {
            flags |= ast.ListItemContainsBlock
            p.Block(ref.title)
        } else {
            p.Inline(footnoteBlock, ref.title)
        }
        flags &^= ast.ListItemBeginningOfList | ast.ListItemContainsBlock
    }
    above := list.Parent
    finalizeList(list)
    p.tip = above
    ast.WalkFunc(list, func(node ast.Node, entering bool) ast.WalkStatus {
        switch node.(type) {
        case *ast.Paragraph, *ast.Heading:
            p.Inline(node, node.AsContainer().Content)
            node.AsContainer().Content = nil
        }
        return ast.GoToNext
    })
}
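// Illustrative sketch, not part of the package: typical end-to-end use of the
// parser together with the html renderer from the same module, as hinted at in
// the New documentation above. The renderer flags shown are placeholders.
/*
    import (
        "github.com/gomarkdown/markdown"
        "github.com/gomarkdown/markdown/html"
        "github.com/gomarkdown/markdown/parser"
    )

    func renderPage(md []byte) []byte {
        // a Parser is single-use: create a fresh one per document
        p := parser.NewWithExtensions(parser.CommonExtensions | parser.AutoHeadingIDs)
        doc := p.Parse(md)
        renderer := html.NewRenderer(html.RendererOptions{Flags: html.CommonFlags})
        return markdown.Render(doc, renderer)
    }
*/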
//
// Link references
//
// This section implements support for references that (usually) appear
// as footnotes in a document, and can be referenced anywhere in the document.
// The basic format is:
//
//    [1]: http://www.google.com/ "Google"
//    [2]: http://www.github.com/ "Github"
//
// Anywhere in the document, the reference can be linked by referring to its
// label, i.e., 1 and 2 in this example, as in:
//
//    This library is hosted on [Github][2], a git hosting site.
//
// Actual footnotes as specified in Pandoc and supported by some other Markdown
// libraries such as php-markdown are also taken care of. They look like this:
//
//    This sentence needs a bit of further explanation.[^note]
//
//    [^note]: This is the explanation.
//
// Footnotes should be placed at the end of the document in an ordered list.
// Inline footnotes such as:
//
//    Inline footnotes^[Not supported.] also exist.
//
// are not yet supported.

// reference holds all information necessary for reference-style links or
// footnotes.
//
// Consider this markdown with reference-style links:
//
//    [link][ref]
//
//    [ref]: /url/ "tooltip title"
//
// It will be ultimately converted to this HTML:
//
//    <p><a href="/url/" title="title">link</a></p>
//
// And a reference structure will be populated as follows:
//
//    p.refs["ref"] = &reference{
//        link:  "/url/",
//        title: "tooltip title",
//    }
//
// Alternatively, reference can contain information about a footnote. Consider
// this markdown:
//
//    Text needing a footnote.[^a]
//
//    [^a]: This is the note
//
// A reference structure will be populated as follows:
//
//    p.refs["a"] = &reference{
//        link:   "a",
//        title:  "This is the note",
//        noteID: <some positive int>,
//    }
//
// TODO: As you can see, it begs for splitting into two dedicated structures
// for refs and for footnotes.
type reference struct {
    link     []byte
    title    []byte
    noteID   int // 0 if not a footnote ref
    hasBlock bool
    footnote ast.Node // a link to the Item node within a list of footnotes

    text []byte // only gets populated by refOverride feature with Reference.Text
}

func (r *reference) String() string {
    return fmt.Sprintf("{link: %q, title: %q, text: %q, noteID: %d, hasBlock: %v}",
        r.link, r.title, r.text, r.noteID, r.hasBlock)
}

// Check whether or not data starts with a reference link.
// If so, it is parsed and stored in the list of references
// (in the render struct).
// Returns the number of bytes to skip to move past it,
// or zero if the first line is not a reference.
func isReference(p *Parser, data []byte, tabSize int) int {
    // up to 3 optional leading spaces
    if len(data) < 4 {
        return 0
    }
    i := 0
    for i < 3 && data[i] == ' ' {
        i++
    }

    noteID := 0

    // id part: anything but a newline between brackets
    if data[i] != '[' {
        return 0
    }
    i++
    if p.extensions&Footnotes != 0 {
        if i < len(data) && data[i] == '^' {
            // we can set it to anything here because the proper noteIds will
            // be assigned later during the second pass. It just has to be != 0
            noteID = 1
            i++
        }
    }
    idOffset := i
    for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
        i++
    }
    if i >= len(data) || data[i] != ']' {
        return 0
    }
    idEnd := i

    // footnotes can have empty ID, like this: [^], but a reference can not be
    // empty like this: []. Break early if it's not a footnote and there's no ID
    if noteID == 0 && idOffset == idEnd {
        return 0
    }

    // spacer: colon (space | tab)* newline? (space | tab)*
    i++
    if i >= len(data) || data[i] != ':' {
        return 0
    }
    i++
    for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
        i++
    }
    if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
        i++
        if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
            i++
        }
    }
    for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
        i++
    }
    if i >= len(data) {
        return 0
    }

    var (
        linkOffset, linkEnd   int
        titleOffset, titleEnd int
        lineEnd               int
        raw                   []byte
        hasBlock              bool
    )

    if p.extensions&Footnotes != 0 && noteID != 0 {
        linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
        lineEnd = linkEnd
    } else {
        linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
    }
    if lineEnd == 0 {
        return 0
    }

    // a valid ref has been found
    ref := &reference{
        noteID:   noteID,
        hasBlock: hasBlock,
    }

    if noteID > 0 {
        // reusing the link field for the id since footnotes don't have links
        ref.link = data[idOffset:idEnd]
        // if footnote, it's not really a title, it's the contained text
        ref.title = raw
    } else {
        ref.link = data[linkOffset:linkEnd]
        ref.title = data[titleOffset:titleEnd]
    }

    // id matches are case-insensitive
    id := string(bytes.ToLower(data[idOffset:idEnd]))

    p.refs[id] = ref

    return lineEnd
}
func scanLinkRef(p *Parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
    // link: whitespace-free sequence, optionally between angle brackets
    if data[i] == '<' {
        i++
    }
    linkOffset = i
    for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
        i++
    }
    linkEnd = i
    if linkEnd < len(data) && data[linkOffset] == '<' && data[linkEnd-1] == '>' {
        linkOffset++
        linkEnd--
    }

    // optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
    for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
        i++
    }
    if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
        return
    }

    // compute end-of-line
    if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
        lineEnd = i
    }
    if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
        lineEnd++
    }

    // optional (space|tab)* spacer after a newline
    if lineEnd > 0 {
        i = lineEnd + 1
        for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
            i++
        }
    }

    // optional title: any non-newline sequence enclosed in '"() alone on its line
    if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
        i++
        titleOffset = i

        // look for EOL
        for i < len(data) && data[i] != '\n' && data[i] != '\r' {
            i++
        }
        if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
            titleEnd = i + 1
        } else {
            titleEnd = i
        }

        // step back
        i--
        for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
            i--
        }
        if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
            lineEnd = titleEnd
            titleEnd = i
        }
    }

    return
}

// The first bit of this logic is the same as Parser.listItem, but the rest
// is much simpler. This function simply finds the entire block and shifts it
// over by one tab if it is indeed a block (just returns the line if it's not).
// blockEnd is the end of the section in the input buffer, and contents is the
// extracted text that was shifted over one tab. It will need to be rendered at
// the end of the document.
func scanFootnote(p *Parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
    if i == 0 || len(data) == 0 {
        return
    }

    // skip leading whitespace on first line
    for i < len(data) && data[i] == ' ' {
        i++
    }

    blockStart = i

    // find the end of the line
    blockEnd = i
    for i < len(data) && data[i-1] != '\n' {
        i++
    }

    // get working buffer
    var raw bytes.Buffer

    // put the first line into the working buffer
    raw.Write(data[blockEnd:i])
    blockEnd = i

    // process the following lines
    containsBlankLine := false

gatherLines:
    for blockEnd < len(data) {
        i++

        // find the end of this line
        for i < len(data) && data[i-1] != '\n' {
            i++
        }

        // if it is an empty line, guess that it is part of this item
        // and move on to the next line
        if IsEmpty(data[blockEnd:i]) > 0 {
            containsBlankLine = true
            blockEnd = i
            continue
        }

        n := 0
        if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
            // this is the end of the block.
            // we don't want to include this last line in the index.
            break gatherLines
        }

        // if there were blank lines before this one, insert a new one now
        if containsBlankLine {
            raw.WriteByte('\n')
            containsBlankLine = false
        }

        // get rid of that first tab, write to buffer
        raw.Write(data[blockEnd+n : i])
        hasBlock = true

        blockEnd = i
    }

    if data[blockEnd-1] != '\n' {
        raw.WriteByte('\n')
    }

    contents = raw.Bytes()

    return
}
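// Illustrative sketch, not part of the package: the shapes of reference and
// footnote definitions that isReference, scanLinkRef and scanFootnote accept.
// The labels and URLs below are arbitrary examples.
/*
    [ref]: https://example.com/ "optional title"

    [^note]: First line of the footnote.
        Indented continuation lines are gathered by scanFootnote and shifted
        left by one indent, so together they form the footnote's block body.
*/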
// IsPunctuation returns true if c is a punctuation symbol.
func IsPunctuation(c byte) bool {
    for _, r := range []byte("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") {
        if c == r {
            return true
        }
    }
    return false
}

// isPunctuation returns true if the first rune in d is a punctuation symbol
// (ASCII or Unicode).
func isPunctuation(d []byte) bool {
    if len(d) == 0 {
        return false
    }
    if IsPunctuation(d[0]) {
        return true
    }
    r, _ := utf8.DecodeRune(d)
    if r == utf8.RuneError {
        return false
    }
    return unicode.IsPunct(r)
}

// IsSpace returns true if c is a white-space character
func IsSpace(c byte) bool {
    return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'
}

// IsLetter returns true if c is an ASCII letter
func IsLetter(c byte) bool {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
}

// IsAlnum returns true if c is a digit or letter
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func IsAlnum(c byte) bool {
    return (c >= '0' && c <= '9') || IsLetter(c)
}

// URIs is the list of URI prefixes considered safe by IsSafeURL.
var URIs = [][]byte{
    []byte("http://"),
    []byte("https://"),
    []byte("ftp://"),
    []byte("mailto:"),
}

// Paths is the list of relative-path prefixes considered safe by IsSafeURL.
var Paths = [][]byte{
    []byte("/"),
    []byte("./"),
    []byte("../"),
}

// IsSafeURL returns true if url starts with one of the valid schemes or is a relative path.
func IsSafeURL(url []byte) bool {
    nLink := len(url)
    for _, path := range Paths {
        nPath := len(path)
        if nLink >= nPath && bytes.Equal(url[:nPath], path) {
            if nLink == nPath {
                return true
            } else if IsAlnum(url[nPath]) {
                return true
            }
        }
    }

    for _, prefix := range URIs {
        // TODO: handle unicode here
        // case-insensitive prefix test
        nPrefix := len(prefix)
        if nLink > nPrefix {
            linkPrefix := bytes.ToLower(url[:nPrefix])
            if bytes.Equal(linkPrefix, prefix) && IsAlnum(url[nPrefix]) {
                return true
            }
        }
    }

    return false
}

// TODO: this is not used
// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
// always ends output with a newline
/*
func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
    // first, check for common cases: no tabs, or only tabs at beginning of line
    i, prefix := 0, 0
    slowcase := false
    for i = 0; i < len(line); i++ {
        if line[i] == '\t' {
            if prefix == i {
                prefix++
            } else {
                slowcase = true
                break
            }
        }
    }

    // no need to decode runes if all tabs are at the beginning of the line
    if !slowcase {
        for i = 0; i < prefix*tabSize; i++ {
            out.WriteByte(' ')
        }
        out.Write(line[prefix:])
        return
    }

    // the slow case: we need to count runes to figure out how
    // many spaces to insert for each tab
    column := 0
    i = 0
    for i < len(line) {
        start := i
        for i < len(line) && line[i] != '\t' {
            _, size := utf8.DecodeRune(line[i:])
            i += size
            column++
        }

        if i > start {
            out.Write(line[start:i])
        }

        if i >= len(line) {
            break
        }

        for {
            out.WriteByte(' ')
            column++
            if column%tabSize == 0 {
                break
            }
        }

        i++
    }
}
*/

// Find if a line counts as indented or not.
// Returns number of characters the indent is (0 = not indented).
func isIndented(data []byte, indentSize int) int {
    if len(data) == 0 {
        return 0
    }
    if data[0] == '\t' {
        return 1
    }
    if len(data) < indentSize {
        return 0
    }
    for i := 0; i < indentSize; i++ {
        if data[i] != ' ' {
            return 0
        }
    }
    return indentSize
}

// Create a url-safe slug for fragments
func slugify(in []byte) []byte {
    if len(in) == 0 {
        return in
    }
    out := make([]byte, 0, len(in))
    sym := false

    for _, ch := range in {
        if IsAlnum(ch) {
            sym = false
            out = append(out, ch)
        } else if sym {
            continue
        } else {
            out = append(out, '-')
            sym = true
        }
    }
    var a, b int
    var ch byte
    for a, ch = range out {
        if ch != '-' {
            break
        }
    }
    for b = len(out) - 1; b > 0; b-- {
        if out[b] != '-' {
            break
        }
    }
    return out[a : b+1]
}

func isListItem(d ast.Node) bool {
    _, ok := d.(*ast.ListItem)
    return ok
}
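// Illustrative sketch, not part of the package: expected behaviour of the URL
// and slug helpers above, written as it could appear in an in-package test
// (it would need the standard "testing" import). The inputs are arbitrary.
/*
    func TestHelpersSketch(t *testing.T) {
        if !IsSafeURL([]byte("https://example.com/x")) { // known scheme prefix
            t.Error("expected https URL to be safe")
        }
        if IsSafeURL([]byte("javascript:alert(1)")) { // unknown scheme
            t.Error("expected javascript: URL to be unsafe")
        }
        // non-alphanumeric runs collapse to '-', leading/trailing '-' trimmed
        if got := string(slugify([]byte("Heading IDs!"))); got != "Heading-IDs" {
            t.Errorf("slugify: got %q", got)
        }
    }
*/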
// NormalizeNewlines converts Mac (CR) and Windows (CRLF) line endings to
// Unix (LF) line endings. It works on a copy, so the input slice is not
// modified. Parse calls it on its input.
func NormalizeNewlines(d []byte) []byte {
    res := make([]byte, len(d))
    copy(res, d)
    d = res
    wi := 0
    n := len(d)
    for i := 0; i < n; i++ {
        c := d[i]
        // 13 is CR
        if c != 13 {
            d[wi] = c
            wi++
            continue
        }
        // replace CR (mac / win) with LF (unix)
        d[wi] = 10
        wi++
        if i < n-1 && d[i+1] == 10 {
            // this was CRLF, so skip the LF
            i++
        }
    }
    return d[:wi]
}
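// Illustrative sketch, not part of the package: NormalizeNewlines copies its
// input, so the original slice keeps its CR bytes.
/*
    src := []byte("a\r\nb\rc\n")
    out := parser.NormalizeNewlines(src)
    // out is "a\nb\nc\n"; src is unchanged
*/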