From 6130522aa86316c7d87e130cc8c440fd06920928 Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Wed, 21 Feb 2024 18:08:08 +0800 Subject: [PATCH] Refactor markup rendering to accept general "protocol:" prefix (#29276) Follow #29024 Major changes: * refactor validLinksPattern to fullURLPattern and add comments, now it accepts "protocol:" prefix * rename `IsLink*` to `IsFullURL*`, and remove unnecessary "mailto:" check * fix some comments (by the way) * rename EmojiShortCodeRegex -> emojiShortCodeRegex (by the way) --- modules/markup/html.go | 34 ++++++++++++++--------------- modules/markup/html_test.go | 15 +++++++++++++ modules/markup/markdown/goldmark.go | 18 +++++---------- modules/markup/orgmode/orgmode.go | 3 +-- 4 files changed, 38 insertions(+), 32 deletions(-) diff --git a/modules/markup/html.go b/modules/markup/html.go index b7291823b..56e1a1c54 100644 --- a/modules/markup/html.go +++ b/modules/markup/html.go @@ -53,38 +53,38 @@ var ( // shortLinkPattern matches short but difficult to parse [[name|link|arg=test]] syntax shortLinkPattern = regexp.MustCompile(`\[\[(.*?)\]\](\w*)`) - // anySHA1Pattern splits url containing SHA into parts + // anyHashPattern splits url containing SHA into parts anyHashPattern = regexp.MustCompile(`https?://(?:\S+/){4,5}([0-9a-f]{40,64})(/[-+~_%.a-zA-Z0-9/]+)?(#[-+~_%.a-zA-Z0-9]+)?`) // comparePattern matches "http://domain/org/repo/compare/COMMIT1...COMMIT2#hash" comparePattern = regexp.MustCompile(`https?://(?:\S+/){4,5}([0-9a-f]{7,64})(\.\.\.?)([0-9a-f]{7,64})?(#[-+~_%.a-zA-Z0-9]+)?`) - validLinksPattern = regexp.MustCompile(`^[a-z][\w-]+://`) + // fullURLPattern matches full URL like "mailto:...", "https://..." and "ssh+git://..." 
+ fullURLPattern = regexp.MustCompile(`^[a-z][-+\w]+:`) - // While this email regex is definitely not perfect and I'm sure you can come up - // with edge cases, it is still accepted by the CommonMark specification, as - // well as the HTML5 spec: + // emailRegex is definitely not perfect with edge cases, + // it is still accepted by the CommonMark specification, as well as the HTML5 spec: // http://spec.commonmark.org/0.28/#email-address // https://html.spec.whatwg.org/multipage/input.html#e-mail-state-(type%3Demail) emailRegex = regexp.MustCompile("(?:\\s|^|\\(|\\[)([a-zA-Z0-9.!#$%&'*+\\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\\.[a-zA-Z0-9]{2,}(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+)(?:\\s|$|\\)|\\]|;|,|\\?|!|\\.(\\s|$))") - // blackfriday extensions create IDs like fn:user-content-footnote + // blackfridayExtRegex is for blackfriday extensions create IDs like fn:user-content-footnote blackfridayExtRegex = regexp.MustCompile(`[^:]*:user-content-`) - // EmojiShortCodeRegex find emoji by alias like :smile: - EmojiShortCodeRegex = regexp.MustCompile(`:[-+\w]+:`) + // emojiShortCodeRegex find emoji by alias like :smile: + emojiShortCodeRegex = regexp.MustCompile(`:[-+\w]+:`) ) // CSS class for action keywords (e.g. "closes: #1") const keywordClass = "issue-keyword" -// IsLink reports whether link fits valid format. -func IsLink(link []byte) bool { - return validLinksPattern.Match(link) +// IsFullURLBytes reports whether link fits valid format. 
+func IsFullURLBytes(link []byte) bool { + return fullURLPattern.Match(link) } -func IsLinkStr(link string) bool { - return validLinksPattern.MatchString(link) +func IsFullURLString(link string) bool { + return fullURLPattern.MatchString(link) } // regexp for full links to issues/pulls @@ -399,7 +399,7 @@ func visitNode(ctx *RenderContext, procs []processor, node *html.Node) { if attr.Key != "src" { continue } - if len(attr.Val) > 0 && !IsLinkStr(attr.Val) && !strings.HasPrefix(attr.Val, "data:image/") { + if len(attr.Val) > 0 && !IsFullURLString(attr.Val) && !strings.HasPrefix(attr.Val, "data:image/") { attr.Val = util.URLJoin(ctx.Links.ResolveMediaLink(ctx.IsWiki), attr.Val) } attr.Val = camoHandleLink(attr.Val) @@ -650,7 +650,7 @@ func shortLinkProcessor(ctx *RenderContext, node *html.Node) { if equalPos := strings.IndexByte(v, '='); equalPos == -1 { // There is no equal in this argument; this is a mandatory arg if props["name"] == "" { - if IsLinkStr(v) { + if IsFullURLString(v) { // If we clearly see it is a link, we save it so // But first we need to ensure, that if both mandatory args provided @@ -725,7 +725,7 @@ func shortLinkProcessor(ctx *RenderContext, node *html.Node) { DataAtom: atom.A, } childNode.Parent = linkNode - absoluteLink := IsLinkStr(link) + absoluteLink := IsFullURLString(link) if !absoluteLink { if image { link = strings.ReplaceAll(link, " ", "+") @@ -1059,7 +1059,7 @@ func emojiShortCodeProcessor(ctx *RenderContext, node *html.Node) { start := 0 next := node.NextSibling for node != nil && node != next && start < len(node.Data) { - m := EmojiShortCodeRegex.FindStringSubmatchIndex(node.Data[start:]) + m := emojiShortCodeRegex.FindStringSubmatchIndex(node.Data[start:]) if m == nil { return } diff --git a/modules/markup/html_test.go b/modules/markup/html_test.go index 89ecfc036..cb29431d4 100644 --- a/modules/markup/html_test.go +++ b/modules/markup/html_test.go @@ -204,6 +204,15 @@ func TestRender_links(t *testing.T) { test( 
"magnet:?xt=urn:btih:5dee65101db281ac9c46344cd6b175cdcadabcde&dn=download", `

magnet:?xt=urn:btih:5dee65101db281ac9c46344cd6b175cdcadabcde&dn=download

`) + test( + `[link](https://example.com)`, + `

link

`) + test( + `[link](mailto:test@example.com)`, + `

link

`) + test( + `[link](javascript:xss)`, + `

link

`) // Test that should *not* be turned into URL test( @@ -673,3 +682,9 @@ func TestIssue18471(t *testing.T) { assert.NoError(t, err) assert.Equal(t, "783b039...da951ce", res.String()) } + +func TestIsFullURL(t *testing.T) { + assert.True(t, markup.IsFullURLString("https://example.com")) + assert.True(t, markup.IsFullURLString("mailto:test@example.com")) + assert.False(t, markup.IsFullURLString("/foo:bar")) +} diff --git a/modules/markup/markdown/goldmark.go b/modules/markup/markdown/goldmark.go index 36ce6397f..c4b23e66f 100644 --- a/modules/markup/markdown/goldmark.go +++ b/modules/markup/markdown/goldmark.go @@ -26,8 +26,6 @@ import ( "github.com/yuin/goldmark/util" ) -var byteMailto = []byte("mailto:") - // ASTTransformer is a default transformer of the goldmark tree. type ASTTransformer struct{} @@ -84,7 +82,7 @@ func (g *ASTTransformer) Transform(node *ast.Document, reader text.Reader, pc pa // 2. If they're not wrapped with a link they need a link wrapper // Check if the destination is a real link - if len(v.Destination) > 0 && !markup.IsLink(v.Destination) { + if len(v.Destination) > 0 && !markup.IsFullURLBytes(v.Destination) { v.Destination = []byte(giteautil.URLJoin( ctx.Links.ResolveMediaLink(ctx.IsWiki), strings.TrimLeft(string(v.Destination), "/"), @@ -130,23 +128,17 @@ func (g *ASTTransformer) Transform(node *ast.Document, reader text.Reader, pc pa case *ast.Link: // Links need their href to munged to be a real value link := v.Destination - if len(link) > 0 && !markup.IsLink(link) && - link[0] != '#' && !bytes.HasPrefix(link, byteMailto) { - // special case: this is not a link, a hash link or a mailto:, so it's a - // relative URL - - var base string + isAnchorFragment := len(link) > 0 && link[0] == '#' + if !isAnchorFragment && !markup.IsFullURLBytes(link) { + base := ctx.Links.Base if ctx.IsWiki { base = ctx.Links.WikiLink() } else if ctx.Links.HasBranchInfo() { base = ctx.Links.SrcLink() - } else { - base = ctx.Links.Base } - link = 
[]byte(giteautil.URLJoin(base, string(link))) } - if len(link) > 0 && link[0] == '#' { + if isAnchorFragment { link = []byte("#user-content-" + string(link)[1:]) } v.Destination = link diff --git a/modules/markup/orgmode/orgmode.go b/modules/markup/orgmode/orgmode.go index ac1cedff6..7f253ae5f 100644 --- a/modules/markup/orgmode/orgmode.go +++ b/modules/markup/orgmode/orgmode.go @@ -136,8 +136,7 @@ type Writer struct { func (r *Writer) resolveLink(kind, link string) string { link = strings.TrimPrefix(link, "file:") if !strings.HasPrefix(link, "#") && // not a URL fragment - !markup.IsLinkStr(link) && // not an absolute URL - !strings.HasPrefix(link, "mailto:") { + !markup.IsFullURLString(link) { if kind == "regular" { // orgmode reports the link kind as "regular" for "[[ImageLink.svg][The Image Desc]]" // so we need to try to guess the link kind again here