itdoxy-lab/modules/charset/escape_test.go

178 lines
16 KiB
Go
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2021 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package charset
import (
"strings"
"testing"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/test"
"code.gitea.io/gitea/modules/translation"
"github.com/stretchr/testify/assert"
)
// escapeControlTest is a single table-driven case for the escaping tests: an
// input text plus the output and status expected after escaping it.
type escapeControlTest struct {
	name   string       // subtest label passed to t.Run
	text   string       // raw input handed to the function under test
	result string       // exact output expected from escaping text
	status EscapeStatus // status expected back from escaping text
}
// escapeControlTests is the table of escaping cases; each entry pairs an input
// text with the output and EscapeStatus expected for it.
// NOTE(review): some non-ASCII literals below look like mis-decoded UTF-8 (for
// example, the "[U+00A0]" expectation wraps two visible characters rather than
// a single non-breaking space) — TODO confirm against the upstream file.
var escapeControlTests = []escapeControlTest{
{
name: "<empty>",
},
{
name: "single line western",
text: "single line western",
result: "single line western",
status: EscapeStatus{},
},
{
name: "multi line western",
text: "single line western\nmulti line western\n",
result: "single line western\nmulti line western\n",
status: EscapeStatus{},
},
{
name: "multi line western non-breaking space",
text: "single lineย western\nmulti lineย western\n",
result: `single line<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char">ย </span></span>western` + "\n" + `multi line<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char">ย </span></span>western` + "\n",
status: EscapeStatus{Escaped: true, HasInvisible: true},
},
{
name: "mixed scripts: western + japanese",
text: "ๆ—ฅๅฑž็ง˜ใžใ—ใกใ‚…ใ€‚Then some western.",
result: "ๆ—ฅๅฑž็ง˜ใžใ—ใกใ‚…ใ€‚Then some western.",
status: EscapeStatus{},
},
{
name: "japanese",
text: "ๆ—ฅๅฑž็ง˜ใžใ—ใกใ‚…ใ€‚",
result: "ๆ—ฅๅฑž็ง˜ใžใ—ใกใ‚…ใ€‚",
status: EscapeStatus{},
},
{
name: "hebrew",
text: "ืขื“ ืชืงื•ืคืช ื™ื•ื•ืŸ ื”ืขืชื™ืงื” ื”ื™ื” ื”ืขื™ืกื•ืง ื‘ืžืชืžื˜ื™ืงื” ืชื›ืœื™ืชื™ ื‘ืœื‘ื“: ื”ื™ื ืฉื™ืžืฉื” ื›ืื•ืกืฃ ืฉืœ ื ื•ืกื—ืื•ืช ืœื—ื™ืฉื•ื‘ ืงืจืงืข, ืื•ื›ืœื•ืกื™ืŸ ื•ื›ื•'. ืคืจื™ืฆืช ื”ื“ืจืš ืฉืœ ื”ื™ื•ื•ื ื™ื, ืคืจื˜ ืœืชืจื•ืžื•ืชื™ื”ื ื”ื’ื“ื•ืœื•ืช ืœื™ื“ืข ื”ืžืชืžื˜ื™, ื”ื™ื™ืชื” ื‘ืœื™ืžื•ื“ ื”ืžืชืžื˜ื™ืงื” ื›ืฉืœืขืฆืžื”, ืžืชื•ืงืฃ ืขืจื›ื” ื”ืจื•ื—ื ื™. ื™ื—ืกื ืฉืœ ื—ืœืง ืžื”ื™ื•ื•ื ื™ื ื”ืงื“ืžื•ื ื™ื ืœืžืชืžื˜ื™ืงื” ื”ื™ื” ื“ืชื™ - ืœืžืฉืœ, ื”ื›ืช ืฉืืกืฃ ืกื‘ื™ื‘ื• ืคื™ืชื’ื•ืจืก ื”ืืžื™ื ื” ื›ื™ ื”ืžืชืžื˜ื™ืงื” ื”ื™ื ื”ื‘ืกื™ืก ืœื›ืœ ื”ื“ื‘ืจื™ื. ื”ื™ื•ื•ื ื™ื ื ื—ืฉื‘ื™ื ืœื™ื•ืฆืจื™ ืžื•ืฉื’ ื”ื”ื•ื›ื—ื” ื”ืžืชืžื˜ื™ืช, ื•ื›ืŸ ืœืจืืฉื•ื ื™ื ืฉืขืกืงื• ื‘ืžืชืžื˜ื™ืงื” ืœืฉื ืขืฆืžื”, ื›ืœื•ืžืจ ื›ืชื—ื•ื ืžื—ืงืจื™ ืขื™ื•ื ื™ ื•ืžื•ืคืฉื˜ ื•ืœื ืจืง ื›ืขื–ืจ ืฉื™ืžื•ืฉื™. ืขื ื–ืืช, ืœืฆื“ื”",
result: `ืขื“ ืชืงื•ืคืช <span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ื™</span></span><span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ื•</span></span><span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ื•</span></span><span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ืŸ</span></span> ื”ืขืชื™ืงื” ื”ื™ื” ื”ืขื™ืกื•ืง ื‘ืžืชืžื˜ื™ืงื” ืชื›ืœื™ืชื™ ื‘ืœื‘ื“: ื”ื™ื ืฉื™ืžืฉื” ื›ืื•ืกืฃ ืฉืœ ื ื•ืกื—ืื•ืช ืœื—ื™ืฉื•ื‘ ืงืจืงืข, ืื•ื›ืœื•ืกื™ืŸ ื•ื›ื•&#39;. ืคืจื™ืฆืช ื”ื“ืจืš ืฉืœ ื”ื™ื•ื•ื ื™ื, ืคืจื˜ ืœืชืจื•ืžื•ืชื™ื”ื ื”ื’ื“ื•ืœื•ืช ืœื™ื“ืข ื”ืžืชืžื˜ื™, ื”ื™ื™ืชื” ื‘ืœื™ืžื•ื“ ื”ืžืชืžื˜ื™ืงื” ื›ืฉืœืขืฆืžื”, ืžืชื•ืงืฃ ืขืจื›ื” ื”ืจื•ื—ื ื™. ื™ื—ืกื ืฉืœ ื—ืœืง ืžื”ื™ื•ื•ื ื™ื ื”ืงื“ืžื•ื ื™ื ืœืžืชืžื˜ื™ืงื” ื”ื™ื” ื“ืชื™ - ืœืžืฉืœ, ื”ื›ืช ืฉืืกืฃ ืกื‘ื™ื‘ื• ืคื™ืชื’ื•ืจืก ื”ืืžื™ื ื” ื›ื™ ื”ืžืชืžื˜ื™ืงื” ื”ื™ื ื”ื‘ืกื™ืก ืœื›ืœ ื”ื“ื‘ืจื™ื. ื”ื™ื•ื•ื ื™ื ื ื—ืฉื‘ื™ื ืœื™ื•ืฆืจื™ ืžื•ืฉื’ ื”ื”ื•ื›ื—ื” ื”ืžืชืžื˜ื™ืช, ื•ื›ืŸ ืœืจืืฉื•ื ื™ื ืฉืขืกืงื• ื‘ืžืชืžื˜ื™ืงื” ืœืฉื ืขืฆืžื”, ื›ืœื•ืžืจ ื›ืชื—ื•ื ืžื—ืงืจื™ ืขื™ื•ื ื™ ื•ืžื•ืคืฉื˜ ื•ืœื ืจืง ื›ืขื–ืจ ืฉื™ืžื•ืฉื™. ืขื ื–ืืช, ืœืฆื“ื”`,
status: EscapeStatus{Escaped: true, HasAmbiguous: true},
},
{
name: "more hebrew",
text: `ื‘ืชืงื•ืคื” ืžืื•ื—ืจืช ื™ื•ืชืจ, ื”ืฉืชืžืฉื• ื”ื™ื•ื•ื ื™ื ื‘ืฉื™ื˜ืช ืกื™ืžื•ืŸ ืžืชืงื“ืžืช ื™ื•ืชืจ, ืฉื‘ื” ื”ื•ืฆื’ื• ื”ืžืกืคืจื™ื ืœืคื™ 22 ืื•ืชื™ื•ืช ื”ืืœืคื‘ื™ืช ื”ื™ื•ื•ื ื™. ืœืกื™ืžื•ืŸ ื”ืžืกืคืจื™ื ื‘ื™ืŸ 1 ืœ-9 ื ืงื‘ืขื• ืชืฉืข ื”ืื•ืชื™ื•ืช ื”ืจืืฉื•ื ื•ืช, ื‘ืชื•ืกืคืช ื’ืจืฉ ( ' ) ื‘ืฆื“ ื™ืžื™ืŸ ืฉืœ ื”ืื•ืช, ืœืžืขืœื”; ืชืฉืข ื”ืื•ืชื™ื•ืช ื”ื‘ืื•ืช ื™ื™ืฆื’ื• ืืช ื”ืขืฉืจื•ืช ืž-10 ืขื“ 90, ื•ื”ื‘ืื•ืช ืืช ื”ืžืื•ืช. ืœืกื™ืžื•ืŸ ื”ืกืคืจื•ืช ื‘ื™ืŸ 1000 ืœ-900,000, ื”ืฉืชืžืฉื• ื”ื™ื•ื•ื ื™ื ื‘ืื•ืชืŸ ืื•ืชื™ื•ืช, ืืš ื”ื•ืกื™ืคื• ืœืื•ืชื™ื•ืช ืืช ื”ื’ืจืฉ ื“ื•ื•ืงื ืžืฆื“ ืฉืžืืœ ืฉืœ ื”ืื•ืชื™ื•ืช, ืœืžื˜ื”. ืžืžื™ืœื™ื•ืŸ ื•ืžืขืœื”, ื›ื ืจืื” ื”ืฉืชืžืฉื• ื”ื™ื•ื•ื ื™ื ื‘ืฉื ื™ ืชื’ื™ื ื‘ืžืงื•ื ืื—ื“.
ื”ืžืชืžื˜ื™ืงืื™ ื”ื‘ื•ืœื˜ ื”ืจืืฉื•ืŸ ื‘ื™ื•ื•ืŸ ื”ืขืชื™ืงื”, ื•ื™ืฉ ื”ืื•ืžืจื™ื ื‘ืชื•ืœื“ื•ืช ื”ืื ื•ืฉื•ืช, ื”ื•ื ืชืืœืก (624 ืœืคื ื”"ืก - 546 ืœืคื ื”"ืก ื‘ืงื™ืจื•ื‘).[1] ืœื ื™ื”ื™ื” ื–ื” ืžืฉื•ืœืœ ื™ืกื•ื“ ืœื”ื ื™ื— ืฉื”ื•ื ื”ืื“ื ื”ืจืืฉื•ืŸ ืฉื”ื•ื›ื™ื— ืžืฉืคื˜ ืžืชืžื˜ื™, ื•ืœื ืจืง ื’ื™ืœื” ืื•ืชื•. ืชืืœืก ื”ื•ื›ื™ื— ืฉื™ืฉืจื™ื ืžืงื‘ื™ืœื™ื ื—ื•ืชื›ื™ื ืžืฆื“ ืื—ื“ ืฉืœ ืฉื•ืงื™ ื–ื•ื•ื™ืช ืงื˜ืขื™ื ื‘ืขืœื™ ื™ื—ืกื™ื ืฉื•ื•ื™ื (ืžืฉืคื˜ ืชืืœืก ื”ืจืืฉื•ืŸ), ืฉื”ื–ื•ื•ื™ืช ื”ืžื•ื ื—ืช ืขืœ ืงื•ื˜ืจ ื‘ืžืขื’ืœ ื”ื™ื ื–ื•ื•ื™ืช ื™ืฉืจื” (ืžืฉืคื˜ ืชืืœืก ื”ืฉื ื™), ืฉื”ืงื•ื˜ืจ ืžื—ืœืง ืืช ื”ืžืขื’ืœ ืœืฉื ื™ ื—ืœืงื™ื ืฉื•ื•ื™ื, ื•ืฉื–ื•ื•ื™ื•ืช ื”ื‘ืกื™ืก ื‘ืžืฉื•ืœืฉ ืฉื•ื•ื”-ืฉื•ืงื™ื™ื ืฉื•ื•ืช ื–ื• ืœื–ื•. ืžื™ื•ื—ืกื•ืช ืœื• ื’ื ืฉื™ื˜ื•ืช ืœืžื“ื™ื“ืช ื’ื•ื‘ื”ืŸ ืฉืœ ื”ืคื™ืจืžื™ื“ื•ืช ื‘ืขื–ืจืช ืžื“ื™ื“ืช ืฆื™ืœืŸ ื•ืœืงื‘ื™ืขืช ืžื™ืงื•ืžื” ืฉืœ ืกืคื™ื ื” ื”ื ืจืื™ืช ืžืŸ ื”ื—ื•ืฃ.
ื‘ืฉื ื™ื 582 ืœืคื ื”"ืก ืขื“ 496 ืœืคื ื”"ืก, ื‘ืงื™ืจื•ื‘, ื—ื™ ืžืชืžื˜ื™ืงืื™ ื—ืฉื•ื‘ ื‘ืžื™ื•ื—ื“ - ืคื™ืชื’ื•ืจืก. ื”ืžืงื•ืจื•ืช ื”ืจืืฉื•ื ื™ื™ื ืขืœื™ื• ืžื•ืขื˜ื™ื, ื•ื”ื”ื™ืกื˜ื•ืจื™ื•ื ื™ื ืžืชืงืฉื™ื ืœื”ืคืจื™ื“ ืืช ื”ืขื•ื‘ื“ื•ืช ืžืฉื›ื‘ืช ื”ืžืกืชื•ืจื™ืŸ ื•ื”ืื’ื“ื•ืช ืฉื ืงืฉืจื• ื‘ื•. ื™ื“ื•ืข ืฉืกื‘ื™ื‘ื• ื”ืชืงื‘ืฆื” ื”ืืกื›ื•ืœื” ื”ืคื™ืชื’ื•ืจืื™ืช ืžืขื™ืŸ ื›ืช ืคืกื‘ื“ื•-ืžืชืžื˜ื™ืช ืฉื”ืืžื™ื ื” ืฉ"ื”ื›ืœ ืžืกืคืจ", ืื• ืœื™ืชืจ ื“ื™ื•ืง ื”ื›ืœ ื ื™ืชืŸ ืœื›ื™ืžื•ืช, ื•ื™ื™ื—ืกื” ืœืžืกืคืจื™ื ืžืฉืžืขื•ื™ื•ืช ืžื™ืกื˜ื™ื•ืช. ื›ื›ืœ ื”ื ืจืื” ื”ืคื™ืชื’ื•ืจืื™ื ื™ื“ืขื• ืœื‘ื ื•ืช ืืช ื”ื’ื•ืคื™ื ื”ืืคืœื˜ื•ื ื™ื™ื, ื”ื›ื™ืจื• ืืช ื”ืžืžื•ืฆืข ื”ืืจื™ืชืžื˜ื™, ื”ืžืžื•ืฆืข ื”ื’ืื•ืžื˜ืจื™ ื•ื”ืžืžื•ืฆืข ื”ื”ืจืžื•ื ื™ ื•ื”ื’ื™ืขื• ืœื”ื™ืฉื’ื™ื ื—ืฉื•ื‘ื™ื ื ื•ืกืคื™ื. ื ื™ืชืŸ ืœื•ืžืจ ืฉื”ืคื™ืชื’ื•ืจืื™ื ื’ื™ืœื• ืืช ื”ื™ื•ืชื• ืฉืœ ื”ืฉื•ืจืฉ ื”ืจื™ื‘ื•ืขื™ ืฉืœ 2, ืฉื”ื•ื ื’ื ื”ืืœื›ืกื•ืŸ ื‘ืจื™ื‘ื•ืข ืฉืื•ืจืš ืฆืœืขื•ืชื™ื• 1, ืื™ ืจืฆื™ื•ื ืœื™, ืืš ืชื’ืœื™ืชื ื”ื™ื™ืชื” ืœืžืขืฉื” ืจืง ืฉื”ืงื˜ืขื™ื "ื—ืกืจื™ ืžื™ื“ื” ืžืฉื•ืชืคืช", ื•ืžื•ืฉื’ ื”ืžืกืคืจ ื”ืื™ ืจืฆื™ื•ื ืœื™ ืžืื•ื—ืจ ื™ื•ืชืจ.[2] ืื–ื›ื•ืจ ืจืืฉื•ืŸ ืœืงื™ื•ืžื ืฉืœ ืงื˜ืขื™ื ื—ืกืจื™ ืžื™ื“ื” ืžืฉื•ืชืคืช ืžื•ืคื™ืข ื‘ื“ื™ืืœื•ื’ "ืชืื™ื˜ื™ื˜ื•ืก" ืฉืœ ืืคืœื˜ื•ืŸ, ืืš ืจืขื™ื•ืŸ ื–ื” ื”ื™ื” ืžื•ื›ืจ ืขื•ื“ ืงื•ื“ื ืœื›ืŸ, ื‘ืžืื” ื”ื—ืžื™ืฉื™ืช ืœืคื ื”"ืก ืœื”ื™ืคืืกื•ืก, ื‘ืŸ ื”ืืกื›ื•ืœื” ื”ืคื™ืชื’ื•ืจืื™ืช, ื•ืื•ืœื™ ืœืคื™ืชื’ื•ืจืก ืขืฆืžื•.[3]`,
result: `ื‘ืชืงื•ืคื” ืžืื•ื—ืจืช ื™ื•ืชืจ, ื”ืฉืชืžืฉื• ื”ื™ื•ื•ื ื™ื ื‘ืฉื™ื˜ืช ืกื™ืžื•ืŸ ืžืชืงื“ืžืช ื™ื•ืชืจ, ืฉื‘ื” ื”ื•ืฆื’ื• ื”ืžืกืคืจื™ื ืœืคื™ 22 ืื•ืชื™ื•ืช ื”ืืœืคื‘ื™ืช ื”ื™ื•ื•ื ื™. ืœืกื™ืžื•ืŸ ื”ืžืกืคืจื™ื ื‘ื™ืŸ 1 ืœ-9 ื ืงื‘ืขื• ืชืฉืข ื”ืื•ืชื™ื•ืช ื”ืจืืฉื•ื ื•ืช, ื‘ืชื•ืกืคืช ื’ืจืฉ ( &#39; ) ื‘ืฆื“ ื™ืžื™ืŸ ืฉืœ ื”ืื•ืช, ืœืžืขืœื”; ืชืฉืข ื”ืื•ืชื™ื•ืช ื”ื‘ืื•ืช ื™ื™ืฆื’ื• ืืช ื”ืขืฉืจื•ืช ืž-10 ืขื“ 90, ื•ื”ื‘ืื•ืช ืืช ื”ืžืื•ืช. ืœืกื™ืžื•ืŸ ื”ืกืคืจื•ืช ื‘ื™ืŸ 1000 ืœ-900,000, ื”ืฉืชืžืฉื• ื”ื™ื•ื•ื ื™ื ื‘ืื•ืชืŸ ืื•ืชื™ื•ืช, ืืš ื”ื•ืกื™ืคื• ืœืื•ืชื™ื•ืช ืืช ื”ื’ืจืฉ ื“ื•ื•ืงื ืžืฆื“ ืฉืžืืœ ืฉืœ ื”ืื•ืชื™ื•ืช, ืœืžื˜ื”. ืžืžื™ืœื™ื•ืŸ ื•ืžืขืœื”, ื›ื ืจืื” ื”ืฉืชืžืฉื• ื”ื™ื•ื•ื ื™ื ื‘ืฉื ื™ ืชื’ื™ื ื‘ืžืงื•ื ืื—ื“.
ื”ืžืชืžื˜ื™ืงืื™ ื”ื‘ื•ืœื˜ ื”ืจืืฉื•ืŸ ื‘ื™ื•ื•ืŸ ื”ืขืชื™ืงื”, ื•ื™ืฉ ื”ืื•ืžืจื™ื ื‘ืชื•ืœื“ื•ืช ื”ืื ื•ืฉื•ืช, ื”ื•ื ืชืืœืก (624 ืœืคื ื”&#34;<span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ืก</span></span> - 546 ืœืคื ื”&#34;<span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ืก</span></span> ื‘ืงื™ืจื•ื‘).[1] ืœื ื™ื”ื™ื” ื–ื” ืžืฉื•ืœืœ ื™ืกื•ื“ ืœื”ื ื™ื— ืฉื”ื•ื ื”ืื“ื ื”ืจืืฉื•ืŸ ืฉื”ื•ื›ื™ื— ืžืฉืคื˜ ืžืชืžื˜ื™, ื•ืœื ืจืง ื’ื™ืœื” ืื•ืชื•. ืชืืœืก ื”ื•ื›ื™ื— ืฉื™ืฉืจื™ื ืžืงื‘ื™ืœื™ื ื—ื•ืชื›ื™ื ืžืฆื“ ืื—ื“ ืฉืœ ืฉื•ืงื™ ื–ื•ื•ื™ืช ืงื˜ืขื™ื ื‘ืขืœื™ ื™ื—ืกื™ื ืฉื•ื•ื™ื (ืžืฉืคื˜ ืชืืœืก ื”ืจืืฉื•ืŸ), ืฉื”ื–ื•ื•ื™ืช ื”ืžื•ื ื—ืช ืขืœ ืงื•ื˜ืจ ื‘ืžืขื’ืœ ื”ื™ื ื–ื•ื•ื™ืช ื™ืฉืจื” (ืžืฉืคื˜ ืชืืœืก ื”ืฉื ื™), ืฉื”ืงื•ื˜ืจ ืžื—ืœืง ืืช ื”ืžืขื’ืœ ืœืฉื ื™ ื—ืœืงื™ื ืฉื•ื•ื™ื, ื•ืฉื–ื•ื•ื™ื•ืช ื”ื‘ืกื™ืก ื‘ืžืฉื•ืœืฉ ืฉื•ื•ื”-ืฉื•ืงื™ื™ื ืฉื•ื•ืช ื–ื• ืœื–ื•. ืžื™ื•ื—ืกื•ืช ืœื• ื’ื ืฉื™ื˜ื•ืช ืœืžื“ื™ื“ืช ื’ื•ื‘ื”ืŸ ืฉืœ ื”ืคื™ืจืžื™ื“ื•ืช ื‘ืขื–ืจืช ืžื“ื™ื“ืช ืฆื™ืœืŸ ื•ืœืงื‘ื™ืขืช ืžื™ืงื•ืžื” ืฉืœ ืกืคื™ื ื” ื”ื ืจืื™ืช ืžืŸ ื”ื—ื•ืฃ.
ื‘ืฉื ื™ื 582 ืœืคื ื”&#34;<span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ืก</span></span> ืขื“ 496 ืœืคื ื”&#34;<span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ืก</span></span>, ื‘ืงื™ืจื•ื‘, ื—ื™ ืžืชืžื˜ื™ืงืื™ ื—ืฉื•ื‘ ื‘ืžื™ื•ื—ื“ - ืคื™ืชื’ื•ืจืก. ื”ืžืงื•ืจื•ืช ื”ืจืืฉื•ื ื™ื™ื ืขืœื™ื• ืžื•ืขื˜ื™ื, ื•ื”ื”ื™ืกื˜ื•ืจื™ื•ื ื™ื ืžืชืงืฉื™ื ืœื”ืคืจื™ื“ ืืช ื”ืขื•ื‘ื“ื•ืช ืžืฉื›ื‘ืช ื”ืžืกืชื•ืจื™ืŸ ื•ื”ืื’ื“ื•ืช ืฉื ืงืฉืจื• ื‘ื•. ื™ื“ื•ืข ืฉืกื‘ื™ื‘ื• ื”ืชืงื‘ืฆื” ื”ืืกื›ื•ืœื” ื”ืคื™ืชื’ื•ืจืื™ืช ืžืขื™ืŸ ื›ืช ืคืกื‘ื“ื•-ืžืชืžื˜ื™ืช ืฉื”ืืžื™ื ื” ืฉ&#34;ื”ื›ืœ ืžืกืคืจ&#34;, ืื• ืœื™ืชืจ ื“ื™ื•ืง ื”ื›ืœ ื ื™ืชืŸ ืœื›ื™ืžื•ืช, ื•ื™ื™ื—ืกื” ืœืžืกืคืจื™ื ืžืฉืžืขื•ื™ื•ืช ืžื™ืกื˜ื™ื•ืช. ื›ื›ืœ ื”ื ืจืื” ื”ืคื™ืชื’ื•ืจืื™ื ื™ื“ืขื• ืœื‘ื ื•ืช ืืช ื”ื’ื•ืคื™ื ื”ืืคืœื˜ื•ื ื™ื™ื, ื”ื›ื™ืจื• ืืช ื”ืžืžื•ืฆืข ื”ืืจื™ืชืžื˜ื™, ื”ืžืžื•ืฆืข ื”ื’ืื•ืžื˜ืจื™ ื•ื”ืžืžื•ืฆืข ื”ื”ืจืžื•ื ื™ ื•ื”ื’ื™ืขื• ืœื”ื™ืฉื’ื™ื ื—ืฉื•ื‘ื™ื ื ื•ืกืคื™ื. ื ื™ืชืŸ ืœื•ืžืจ ืฉื”ืคื™ืชื’ื•ืจืื™ื ื’ื™ืœื• ืืช ื”ื™ื•ืชื• ืฉืœ ื”ืฉื•ืจืฉ ื”ืจื™ื‘ื•ืขื™ ืฉืœ 2, ืฉื”ื•ื ื’ื ื”ืืœื›ืกื•ืŸ ื‘ืจื™ื‘ื•ืข ืฉืื•ืจืš ืฆืœืขื•ืชื™ื• 1, ืื™ ืจืฆื™ื•ื ืœื™, ืืš ืชื’ืœื™ืชื ื”ื™ื™ืชื” ืœืžืขืฉื” ืจืง ืฉื”ืงื˜ืขื™ื &#34;ื—ืกืจื™ ืžื™ื“ื” ืžืฉื•ืชืคืช&#34;, ื•ืžื•ืฉื’ ื”ืžืกืคืจ ื”ืื™ ืจืฆื™ื•ื ืœื™ ืžืื•ื—ืจ ื™ื•ืชืจ.[2] ืื–ื›ื•ืจ ืจืืฉื•ืŸ ืœืงื™ื•ืžื ืฉืœ ืงื˜ืขื™ื ื—ืกืจื™ ืžื™ื“ื” ืžืฉื•ืชืคืช ืžื•ืคื™ืข ื‘ื“ื™ืืœื•ื’ &#34;ืชืื™ื˜ื™ื˜ื•ืก&#34; ืฉืœ ืืคืœื˜ื•ืŸ, ืืš ืจืขื™ื•ืŸ ื–ื” ื”ื™ื” ืžื•ื›ืจ ืขื•ื“ ืงื•ื“ื ืœื›ืŸ, ื‘ืžืื” ื”ื—ืžื™ืฉื™ืช ืœืคื ื”&#34;<span class="ambiguous-code-point" data-tooltip-content="repo.ambiguous_character"><span class="char">ืก</span></span> ืœื”ื™ืคืืกื•ืก, ื‘ืŸ ื”ืืกื›ื•ืœื” ื”ืคื™ืชื’ื•ืจืื™ืช, ื•ืื•ืœื™ ืœืคื™ืชื’ื•ืจืก ืขืฆืžื•.[3]`,
status: EscapeStatus{Escaped: true, HasAmbiguous: true},
},
{
name: "Mixed RTL+LTR",
text: `Many computer programs fail to display bidirectional text correctly.
For example, the Hebrew name Sarah (ืฉืจื”) is spelled: sin (ืฉ) (which appears rightmost),
then resh (ืจ), and finally heh (ื”) (which should appear leftmost).`,
result: `Many computer programs fail to display bidirectional text correctly.
For example, the Hebrew name Sarah (ืฉืจื”) is spelled: sin (ืฉ) (which appears rightmost),
then resh (ืจ), and finally heh (ื”) (which should appear leftmost).`,
status: EscapeStatus{},
},
{
name: "Mixed RTL+LTR+BIDI",
text: `Many computer programs fail to display bidirectional text correctly.
For example, the Hebrew name Sarah ` + "\u2067" + `ืฉืจื”` + "\u2066\n" +
`sin (ืฉ) (which appears rightmost), then resh (ืจ), and finally heh (ื”) (which should appear leftmost).`,
result: `Many computer programs fail to display bidirectional text correctly.
For example, the Hebrew name Sarah ` + "\u2067" + `ืฉืจื”` + "\u2066\n" +
`sin (ืฉ) (which appears rightmost), then resh (ืจ), and finally heh (ื”) (which should appear leftmost).`,
status: EscapeStatus{},
},
{
name: "Accented characters",
text: string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}),
result: string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}),
status: EscapeStatus{},
},
{
name: "Program",
text: "string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})",
result: "string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})",
status: EscapeStatus{},
},
{
name: "CVE testcase",
text: "if access_level != \"user\u202E \u2066// Check if admin\u2069 \u2066\" {",
result: `if access_level != &#34;user<span class="escaped-code-point" data-escaped="[U+202E]"><span class="char">` + "\u202e" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>// Check if admin<span class="escaped-code-point" data-escaped="[U+2069]"><span class="char">` + "\u2069" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>&#34; {`,
status: EscapeStatus{Escaped: true, HasInvisible: true},
},
{
name: "Mixed testcase with fail",
text: `Many computer programs fail to display bidirectional text correctly.
For example, the Hebrew name Sarah ` + "\u2067" + `ืฉืจื”` + "\u2066\n" +
`sin (ืฉ) (which appears rightmost), then resh (ืจ), and finally heh (ื”) (which should appear leftmost).` +
"\nif access_level != \"user\u202E \u2066// Check if admin\u2069 \u2066\" {\n",
result: `Many computer programs fail to display bidirectional text correctly.
For example, the Hebrew name Sarah ` + "\u2067" + `ืฉืจื”` + "\u2066\n" +
`sin (ืฉ) (which appears rightmost), then resh (ืจ), and finally heh (ื”) (which should appear leftmost).` +
"\n" + `if access_level != &#34;user<span class="escaped-code-point" data-escaped="[U+202E]"><span class="char">` + "\u202e" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>// Check if admin<span class="escaped-code-point" data-escaped="[U+2069]"><span class="char">` + "\u2069" + `</span></span> <span class="escaped-code-point" data-escaped="[U+2066]"><span class="char">` + "\u2066" + `</span></span>&#34; {` + "\n",
status: EscapeStatus{Escaped: true, HasInvisible: true},
},
{
// UTF-8, UTF-16 and UTF-32 all use the same code point (U+FEFF) for the BOM.
// Gitea can read UTF-16/32 content and convert it to UTF-8 internally before rendering it, so only UTF-8 is processed here.
name: "UTF BOM",
text: "\xef\xbb\xbftest",
result: "\xef\xbb\xbftest",
status: EscapeStatus{},
},
}
// TestEscapeControlReader runs every case in escapeControlTests through
// EscapeControlReader twice: once exactly as written, and once with a U+001E
// control character injected at the start of the input (after a leading UTF-8
// BOM, if the input has one). For the injected variant the expected output
// gains the escaped control character and the expected status has Escaped and
// HasInvisible set.
func TestEscapeControlReader(t *testing.T) {
	// Start with the unmodified cases. They must be appended: copy() into a
	// zero-length slice transfers nothing, which would silently drop them and
	// leave only the "(+Control)" variants to run.
	tests := make([]escapeControlTest, 0, len(escapeControlTests)*2)
	tests = append(tests, escapeControlTests...)

	// addPrefix inserts prefix at the start of s, but after a leading UTF-8
	// BOM so that the BOM remains the very first bytes.
	addPrefix := func(prefix, s string) string {
		if strings.HasPrefix(s, "\xef\xbb\xbf") {
			return s[:3] + prefix + s[3:]
		}
		return prefix + s
	}

	// Derive a second variant of every case with a control character added.
	// tc is a copy of the table entry, so the shared table is not modified.
	for _, tc := range escapeControlTests {
		tc.name += " (+Control)"
		tc.text = addPrefix("\u001E", tc.text)
		tc.result = addPrefix(`<span class="escaped-code-point" data-escaped="[U+001E]"><span class="char">`+"\u001e"+`</span></span>`, tc.result)
		tc.status.Escaped = true
		tc.status.HasInvisible = true
		tests = append(tests, tc)
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			output := &strings.Builder{}
			status, err := EscapeControlReader(strings.NewReader(tt.text), output, &translation.MockLocale{})
			// status is dereferenced below; stop here on error rather than
			// risk dereferencing a nil pointer.
			if !assert.NoError(t, err) {
				return
			}
			assert.Equal(t, tt.status, *status)
			assert.Equal(t, tt.result, output.String())
		})
	}
}
// TestSettingAmbiguousUnicodeDetection checks that EscapeControlHTML escapes a
// non-breaking space (U+00A0) while setting.UI.AmbiguousUnicodeDetection is
// true, and returns the input unchanged once the setting is false.
func TestSettingAmbiguousUnicodeDetection(t *testing.T) {
	// The non-breaking space is written as an escape sequence so the invisible
	// character is unmistakable in source and cannot be damaged when the file
	// is re-encoded.
	const input = "a\u00a0test"

	// Force the setting on for this test; the returned function is deferred,
	// presumably to restore the previous value — confirm against the test helper.
	defer test.MockVariableValue(&setting.UI.AmbiguousUnicodeDetection, true)()

	_, out := EscapeControlHTML(input, &translation.MockLocale{})
	assert.EqualValues(t, `a<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char">`+"\u00a0"+`</span></span>test`, out)

	// With detection switched off the text must come back unmodified.
	setting.UI.AmbiguousUnicodeDetection = false
	_, out = EscapeControlHTML(input, &translation.MockLocale{})
	assert.EqualValues(t, input, out)
}