/* * Copyright 2020 National Library of Norway. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package url import ( goerrors "errors" u2 "net/url" "strconv" "strings" "unicode" "unicode/utf8" "github.com/bits-and-blooms/bitset" "github.com/nlnwa/whatwg-url/errors" ) func NewParser(opts ...ParserOption) Parser { p := &parser{opts: defaultParserOptions()} for _, opt := range opts { opt.apply(&p.opts) } return p } type Parser interface { Parse(rawUrl string) (*Url, error) ParseRef(rawUrl, ref string) (*Url, error) BasicParser(urlOrRef string, base *Url, url *Url, stateOverride State) (*Url, error) PercentEncodeString(s string, tr *PercentEncodeSet) string NewUrl() *Url } type parser struct { opts parserOptions } func (p *parser) Parse(rawUrl string) (*Url, error) { return p.BasicParser(rawUrl, nil, nil, NoState) } func (p *parser) ParseRef(rawUrl, ref string) (*Url, error) { if rawUrl == "" { return p.Parse(ref) } b, err := p.Parse(rawUrl) if err != nil { return nil, err } return p.BasicParser(ref, b, nil, NoState) } func (u *Url) Parse(ref string) (*Url, error) { return u.parser.BasicParser(ref, u, nil, NoState) } var defaultParser = NewParser() func Parse(rawUrl string) (*Url, error) { return defaultParser.Parse(rawUrl) } func ParseRef(rawUrl, ref string) (*Url, error) { return defaultParser.ParseRef(rawUrl, ref) } type State int const ( NoState State = iota StateSchemeStart StateScheme StateNoScheme StateOpaquePath StateSpecialRelativeOrAuthority StateSpecialAuthoritySlashes StateSpecialAuthorityIgnoreSlashes StatePathOrAuthority StateAuthority StateHost StateHostname StateFile StateFileHost StateFileSlash StatePort StatePath StatePathStart StateQuery StateFragment StateRelative StateRelativeSlash ) // BasicParser implements WHATWG basic URL parser (https://url.spec.whatwg.org/#concept-basic-url-parser) // In most cases, when possible, prefer using the higher level Parse method. func (p *parser) BasicParser(urlOrRef string, baseUrl *Url, url *Url, stateOverride State) (*Url, error) { stateOverridden := stateOverride > NoState if url == nil { url = &Url{inputUrl: urlOrRef, path: &path{}} if i, changed := trim(url.inputUrl, C0OrSpacePercentEncodeSet); changed { if err := p.handleError(url, errors.InvalidURLUnit, false); err != nil { return nil, err } url.inputUrl = i } } else { url.inputUrl = urlOrRef } url.parser = p if i, changed := remove(url.inputUrl, ASCIITabOrNewline); changed { if err := p.handleError(url, errors.InvalidURLUnit, false); err != nil { return nil, err } url.inputUrl = i } input := newInputString(url.inputUrl) var state State if stateOverridden { state = stateOverride } else { state = StateSchemeStart } var buffer strings.Builder atFlag := false bracketFlag := false passwordTokenSeenFlag := false var base *Url if baseUrl != nil { base = baseUrl.Clone() } for { r := input.nextCodePoint() switch state { case StateSchemeStart: if ASCIIAlpha.Test(uint(r)) { buffer.WriteRune(unicode.ToLower(r)) state = StateScheme } else if !stateOverridden { state = StateNoScheme input.rewindLast() } else { if err := p.handleError(url, errors.InvalidURLUnit, true); err != nil { return nil, err } } case StateScheme: tr := ASCIIAlphanumeric.Clone().Set(0x2b).Set(0x2d).Set(0x2e) if tr.Test(uint(r)) { buffer.WriteRune(unicode.ToLower(r)) } else if r == ':' { if stateOverridden { // If url’s scheme is a special scheme and buffer is not a special scheme, then return. if url.isSpecialScheme(url.scheme) && !url.isSpecialScheme(buffer.String()) { return url, nil } // If url’s scheme is not a special scheme and buffer is a special scheme, then return. if !url.isSpecialScheme(url.scheme) && url.isSpecialScheme(buffer.String()) { return url, nil } // If url includes credentials or has a non-null port, and buffer is "file", then return. if (url.username != "" || url.password != "" || url.port != nil) && buffer.String() == "file" { return url, nil } // If url’s scheme is "file" and its host is an empty host or null, then return. if url.scheme == "file" && *url.host == "" { return url, nil } } url.scheme = buffer.String() if stateOverridden { url.cleanDefaultPort() return url, nil } buffer.Reset() if url.scheme == "file" { if !input.remainingStartsWith("//") { if err := p.handleError(url, errors.SpecialSchemeMissingFollowingSolidus, false); err != nil { return nil, err } } state = StateFile } else if url.IsSpecialScheme() && base != nil && base.scheme == url.scheme { state = StateSpecialRelativeOrAuthority } else if url.IsSpecialScheme() { state = StateSpecialAuthoritySlashes } else if input.remainingStartsWith("/") { state = StatePathOrAuthority input.nextCodePoint() } else { url.path.setOpaque("") state = StateOpaquePath } } else if !stateOverridden { buffer.Reset() state = StateNoScheme input.reset() } else { if err := p.handleError(url, errors.InvalidURLUnit, true); err != nil { return nil, err } } case StateNoScheme: if base == nil || (base.path.isOpaque() && r != '#') { if err := p.handleError(url, errors.MissingSchemeNonRelativeURL, true); err != nil { return nil, err } } else if base.path.isOpaque() && r == '#' { url.scheme = base.scheme url.path = base.path url.query = base.query url.fragment = new(string) state = StateFragment } else if base.scheme != "file" { state = StateRelative input.rewindLast() } else { state = StateFile input.rewindLast() } case StateSpecialRelativeOrAuthority: if r == '/' && input.remainingStartsWith("/") { state = StateSpecialAuthorityIgnoreSlashes input.nextCodePoint() } else { if err := p.handleError(url, errors.SpecialSchemeMissingFollowingSolidus, false); err != nil { return nil, err } state = StateRelative input.rewindLast() } case StatePathOrAuthority: if r == '/' { state = StateAuthority } else { state = StatePath input.rewindLast() } case StateRelative: url.scheme = base.scheme if r == '/' { state = StateRelativeSlash } else if url.isSpecialSchemeAndBackslash(r) { if err := p.handleError(url, errors.InvalidReverseSolidus, false); err != nil { return nil, err } state = StateRelativeSlash } else { url.username = base.username url.password = base.password url.host = base.host url.port = base.port url.decodedPort = base.decodedPort url.path = base.path url.query = base.query if r == '?' { url.query = new(string) state = StateQuery } else if r == '#' { url.fragment = new(string) state = StateFragment } else if !input.eof { url.query = nil url.path.shortenPath(url.scheme) state = StatePath input.rewindLast() } } case StateRelativeSlash: if url.IsSpecialScheme() && (r == '/' || r == '\\') { if r == '\\' { if err := p.handleError(url, errors.InvalidReverseSolidus, false); err != nil { return nil, err } } state = StateSpecialAuthorityIgnoreSlashes } else if r == '/' { state = StateAuthority } else { url.username = base.username url.password = base.password url.host = base.host url.port = base.port url.decodedPort = base.decodedPort state = StatePath input.rewindLast() } case StateSpecialAuthoritySlashes: if r == '/' && input.remainingStartsWith("/") { state = StateSpecialAuthorityIgnoreSlashes input.nextCodePoint() } else { if err := p.handleError(url, errors.SpecialSchemeMissingFollowingSolidus, false); err != nil { return nil, err } state = StateSpecialAuthorityIgnoreSlashes input.rewindLast() } case StateSpecialAuthorityIgnoreSlashes: if r != '/' && r != '\\' { state = StateAuthority input.rewindLast() } else { if err := p.handleError(url, errors.SpecialSchemeMissingFollowingSolidus, false); err != nil { return nil, err } } case StateAuthority: if r == '@' { if err := p.handleError(url, errors.InvalidCredentials, false); err != nil { return nil, err } if atFlag { // Prepend %40 to buffer tmp := buffer.String() buffer.Reset() buffer.WriteString("%40") buffer.WriteString(tmp) } atFlag = true bb := newInputString(buffer.String()) c := bb.nextCodePoint() for !bb.eof { if c == ':' && !passwordTokenSeenFlag { passwordTokenSeenFlag = true c = bb.nextCodePoint() continue } encodedCodePoints := p.percentEncodeRune(c, UserInfoPercentEncodeSet) if passwordTokenSeenFlag { url.password += encodedCodePoints } else { url.username += encodedCodePoints } c = bb.nextCodePoint() } buffer.Reset() } else if (input.eof || r == '/' || r == '?' || r == '#') || url.isSpecialSchemeAndBackslash(r) { if atFlag && buffer.Len() == 0 { if err := p.handleError(url, errors.InvalidCredentials, true); err != nil { return nil, err } } input.rewind(len([]rune(buffer.String())) + 1) buffer.Reset() state = StateHost } else { buffer.WriteRune(r) } case StateHost: fallthrough case StateHostname: if stateOverridden && url.scheme == "file" { input.rewindLast() state = StateFileHost } else if r == ':' && !bracketFlag { if buffer.Len() == 0 { if err := p.handleError(url, errors.HostMissing, true); err != nil { return nil, err } } if stateOverride == StateHostname { return url, nil } host, err := p.parseHost(url, p, buffer.String(), !url.IsSpecialScheme()) if err != nil { return url, err } url.host = &host buffer.Reset() state = StatePort } else if input.eof || (r == '/' || r == '?' || r == '#' || url.isSpecialSchemeAndBackslash(r)) { input.rewindLast() if url.IsSpecialScheme() && buffer.Len() == 0 { if err := p.handleError(url, errors.HostMissing, true); err != nil { return nil, err } } else if stateOverridden && buffer.Len() == 0 && (url.username != "" || url.password != "" || url.port != nil) { return url, nil } else { host, err := p.parseHost(url, p, buffer.String(), !url.IsSpecialScheme()) if err != nil { return url, err } url.host = &host buffer.Reset() state = StatePathStart if stateOverridden { return url, nil } } } else { if r == '[' { bracketFlag = true } else if r == ']' { bracketFlag = false } if input.currentIsInvalid() && p.opts.acceptInvalidCodepoints { buffer.WriteString(string([]byte{input.getCurrentAsByte()})) } else { buffer.WriteRune(r) } } case StatePort: if ASCIIDigit.Test(uint(r)) { buffer.WriteRune(r) } else if (input.eof || r == '/' || r == '?' || r == '#') || url.isSpecialSchemeAndBackslash(r) || stateOverridden { if buffer.Len() > 0 { port, err := strconv.Atoi(buffer.String()) if port > 65535 || goerrors.Is(err, strconv.ErrRange) { if err := p.handleWrappedError(url, errors.PortOutOfRange, true, err); err != nil { return nil, err } } portString := strconv.Itoa(port) url.decodedPort = port url.port = &portString url.cleanDefaultPort() buffer.Reset() } if stateOverridden { return url, nil } state = StatePathStart input.rewindLast() } else { if err := p.handleError(url, errors.PortInvalid, true); err != nil { return nil, err } } case StateFile: url.scheme = "file" url.host = new(string) if r == '/' || r == '\\' { if r == '\\' { if err := p.handleError(url, errors.InvalidReverseSolidus, false); err != nil { return nil, err } } state = StateFileSlash } else if base != nil && base.scheme == "file" { url.host = base.host url.path = base.path url.query = base.query if r == '?' { url.query = new(string) state = StateQuery } else if r == '#' { url.fragment = new(string) state = StateFragment } else if !input.eof { url.query = nil if !startsWithAWindowsDriveLetter(input.remainingFromPointer()) { url.path.shortenPath(url.scheme) } else { if err := p.handleError(url, errors.FileInvalidWindowsDriveLetter, false); err != nil { return nil, err } url.path.init() } state = StatePath input.rewindLast() } } else { state = StatePath input.rewindLast() } case StateFileSlash: if r == '/' || r == '\\' { if r == '\\' { if err := p.handleError(url, errors.InvalidReverseSolidus, false); err != nil { return nil, err } } state = StateFileHost } else { if base != nil && base.scheme == "file" { url.host = base.host if !startsWithAWindowsDriveLetter(input.remainingFromPointer()) && base.path != nil && isNormalizedWindowsDriveLetter(base.path.p[0]) { // This is a (platform-independent) Windows drive letter quirk. Both url’s and base’s host are null under these conditions and therefore not copied url.path.addSegment(base.path.p[0]) } } state = StatePath input.rewindLast() } case StateFileHost: if input.eof || r == '/' || r == '\\' || r == '?' || r == '#' { input.rewindLast() if !stateOverridden && isWindowsDriveLetter(buffer.String()) { if err := p.handleError(url, errors.FileInvalidWindowsDriveLetterHost, false); err != nil { return nil, err } state = StatePath } else if buffer.Len() == 0 { url.host = new(string) if stateOverridden { return nil, nil } state = StatePathStart } else { host, err := p.parseHost(url, p, buffer.String(), !url.IsSpecialScheme()) if err != nil { return url, err } if host == "localhost" { host = "" } url.host = &host if stateOverridden { return url, nil } buffer.Reset() state = StatePathStart } } else { buffer.WriteRune(r) } case StatePathStart: if url.IsSpecialScheme() && !p.opts.skipTrailingSlashNormalization { if r == '\\' { if err := p.handleError(url, errors.InvalidReverseSolidus, false); err != nil { return nil, err } } state = StatePath if r != '/' && r != '\\' { input.rewindLast() } } else if !stateOverridden && r == '?' { url.query = new(string) state = StateQuery } else if !stateOverridden && r == '#' { url.fragment = new(string) state = StateFragment } else if !input.eof { state = StatePath if r != '/' { input.rewindLast() } } else if stateOverridden && url.host == nil { url.path.addSegment("") } case StatePath: if (input.eof || r == '/') || url.isSpecialSchemeAndBackslash(r) || (!stateOverridden && (r == '?' || r == '#')) { if url.isSpecialSchemeAndBackslash(r) { if err := p.handleError(url, errors.InvalidReverseSolidus, false); err != nil { return nil, err } } if isDoubleDotPathSegment(buffer.String()) { url.path.shortenPath(url.scheme) if r != '/' && !url.isSpecialSchemeAndBackslash(r) { url.path.addSegment("") } } else if isSingleDotPathSegment(buffer.String()) && r != '/' && !url.isSpecialSchemeAndBackslash(r) { url.path.addSegment("") } else if !isSingleDotPathSegment(buffer.String()) { if url.scheme == "file" && url.path.isEmpty() && isWindowsDriveLetter(buffer.String()) { // replace second code point in buffer with U+003A (:). // This is a (platform-independent) Windows drive letter quirk. if !p.opts.skipWindowsDriveLetterNormalization { b := buffer.String() buffer.Reset() buffer.WriteString(b[0:1] + ":" + b[2:]) } } if !p.opts.collapseConsecutiveSlashes || !url.IsSpecialScheme() || url.path.isEmpty() || len(url.path.p[len(url.path.p)-1]) > 0 { url.path.addSegment(buffer.String()) } else { url.path.p[len(url.path.p)-1] = buffer.String() } } buffer.Reset() if r == '?' { url.query = new(string) state = StateQuery } else if r == '#' { url.fragment = new(string) state = StateFragment } } else { if !isURLCodePoint(r) && r != '%' { if err := p.handleError(url, errors.InvalidURLUnit, false); err != nil { return nil, err } } invalidPercentEncoding, d := input.remainingIsInvalidPercentEncoded() if invalidPercentEncoding { if err := p.handleErrorWithDescription(url, errors.InvalidURLUnit, false, d); err != nil { return nil, err } } if invalidPercentEncoding { buffer.WriteString(p.percentEncodeInvalidRune(r, p.opts.pathPercentEncodeSet)) } else { buffer.WriteString(p.percentEncodeRune(r, p.opts.pathPercentEncodeSet)) } } case StateOpaquePath: if r == '?' { url.query = new(string) state = StateQuery buffer.Reset() } else if r == '#' { url.fragment = new(string) state = StateFragment buffer.Reset() } else if !input.eof { if !isURLCodePoint(r) && r != '%' { if err := p.handleError(url, errors.InvalidURLUnit, false); err != nil { return nil, err } } invalidPercentEncoding, d := input.remainingIsInvalidPercentEncoded() if invalidPercentEncoding { if err := p.handleErrorWithDescription(url, errors.InvalidURLUnit, false, d); err != nil { return nil, err } buffer.WriteString(p.percentEncodeInvalidRune(r, C0PercentEncodeSet)) } else { buffer.WriteString(p.percentEncodeRune(r, C0PercentEncodeSet)) } url.path.setOpaque(buffer.String()) } case StateQuery: if !stateOverridden && r == '#' { url.fragment = new(string) state = StateFragment *url.query = buffer.String() buffer.Reset() } else if !input.eof { if !isURLCodePoint(r) && r != '%' { if err := p.handleError(url, errors.InvalidURLUnit, false); err != nil { return nil, err } } invalidPercentEncoding, d := input.remainingIsInvalidPercentEncoded() if invalidPercentEncoding { if err := p.handleErrorWithDescription(url, errors.InvalidURLUnit, false, d); err != nil { return nil, err } } encodeSet := p.opts.queryPercentEncodeSet if url.isSpecialScheme(url.scheme) { encodeSet = p.opts.specialQueryPercentEncodeSet } buffer.WriteString(p.percentEncodeRune(r, encodeSet)) } else { q := buffer.String() url.query = &q } case StateFragment: if !input.eof { if !isURLCodePoint(r) && r != '%' { if err := p.handleError(url, errors.InvalidURLUnit, false); err != nil { return nil, err } } invalidPercentEncoding, d := input.remainingIsInvalidPercentEncoded() if invalidPercentEncoding { if err := p.handleErrorWithDescription(url, errors.InvalidURLUnit, false, d); err != nil { return nil, err } } encodeSet := p.opts.fragmentPercentEncodeSet if url.isSpecialScheme(url.scheme) { encodeSet = p.opts.specialFragmentPercentEncodeSet } buffer.WriteString(p.percentEncodeRune(r, encodeSet)) } else { f := buffer.String() url.fragment = &f } } if input.eof { break } } return url, nil } func (p *parser) percentEncodeInvalidRune(r rune, tr *PercentEncodeSet) string { if p.opts.percentEncodeSinglePercentSign { return p.percentEncodeRune(r, tr.Set(0x25)) } return p.percentEncodeRune(r, tr) } func (p *parser) percentEncodeRune(r rune, tr *PercentEncodeSet) string { if tr != nil && !tr.RuneShouldBeEncoded(r) { return string(r) } var bytes = make([]byte, 4) var n int if p.opts.encodingOverride != nil { b, _ := p.opts.encodingOverride.EncodeRune(r) bytes[0] = b n = 1 } else { n = utf8.EncodeRune(bytes, r) } percentEncoded := make([]byte, 4*3) j := 0 for i := 0; i < n; i++ { c := bytes[i] percentEncoded[j] = '%' percentEncoded[j+1] = "0123456789ABCDEF"[c>>4] percentEncoded[j+2] = "0123456789ABCDEF"[c&15] j += 3 } return string(percentEncoded[:j]) } func (p *parser) PercentEncodeString(s string, tr *PercentEncodeSet) string { buffer := &strings.Builder{} runes := []rune(s) for i, r := range runes { if r == '%' { if len(runes) < (i+3) || (!ASCIIHexDigit.Test(uint(runes[i+1])) || !ASCIIHexDigit.Test(uint(runes[i+2]))) { if p.opts.percentEncodeSinglePercentSign { buffer.WriteString(p.percentEncodeRune(r, tr.Set(0x25))) continue } } } buffer.WriteString(p.percentEncodeRune(r, tr)) } return buffer.String() } func (p *parser) DecodePercentEncoded(s string) string { sb := strings.Builder{} bytes := []byte(s) for i := 0; i < len(bytes); i++ { if bytes[i] != '%' { sb.WriteByte(bytes[i]) } else if len(bytes) < (i+3) || (!ASCIIHexDigit.Test(uint(bytes[i+1])) || !ASCIIHexDigit.Test(uint(bytes[i+2]))) { sb.WriteByte(bytes[i]) } else { b, e := u2.PathUnescape(string(bytes[i : i+3])) if e != nil { return sb.String() } if p.opts.encodingOverride != nil { r := p.opts.encodingOverride.DecodeByte(b[0]) sb.WriteRune(r) } else { sb.WriteString(b) } i += 2 } } return sb.String() } func (p *parser) NewUrl() *Url { u := Url{} u.parser = p u.path = &path{} u.path.init() return &u } func isSingleDotPathSegment(s string) bool { if s == "." { return true } s = strings.ToLower(s) return s == "%2e" } func isDoubleDotPathSegment(s string) bool { if s == ".." { return true } s = strings.ToLower(s) if s == ".%2e" || s == "%2e." || s == "%2e%2e" { return true } return false } func startsWithAWindowsDriveLetter(s string) bool { if len(s) >= 2 && isWindowsDriveLetter(s[0:2]) && (len(s) == 2 || s[2] == '/' || s[2] == '\\' || s[2] == '?' || s[2] == '#') { return true } return false } func isWindowsDriveLetter(s string) bool { if len(s) == 2 && ASCIIAlpha.Test(uint(s[0])) && (s[1] == ':' || s[1] == '|') { return true } return false } func isNormalizedWindowsDriveLetter(s string) bool { if len(s) == 2 && ASCIIAlpha.Test(uint(s[0])) && (s[1] == ':') { return true } return false } func trimPrefix(s string, tr *PercentEncodeSet) (string, bool) { if s == "" { return s, false } for i, c := range s { if tr.RuneNotInSet(c) { return s[i:], i > 0 } } return "", true } func trimPostfix(s string, tr *PercentEncodeSet) (string, bool) { if s == "" { return s, false } for i := len(s) - 1; i >= 0; i-- { c := s[i] if tr.RuneNotInSet(int32(c)) { return s[:i+1], i < (len(s) - 1) } } return "", true } func trim(s string, tr *PercentEncodeSet) (string, bool) { var c1, c2 bool s, c1 = trimPrefix(s, tr) s, c2 = trimPostfix(s, tr) return s, c1 || c2 } func remove(s string, tr *bitset.BitSet) (string, bool) { if s == "" { return s, false } changed := false var r []byte for _, c := range []byte(s) { if tr.Test(uint(c)) { changed = true } else { r = append(r, c) } } return string(r), changed } func containsOnly(s string, tr *bitset.BitSet) bool { for _, c := range []byte(s) { if !tr.Test(uint(c)) { return false } } return true } func (u *Url) IsSpecialScheme() bool { return u.isSpecialScheme(u.scheme) } func (u *Url) isSpecialScheme(s string) bool { _, ok := u.getSpecialScheme(s) return ok } func (u *Url) getSpecialScheme(s string) (string, bool) { dp, ok := u.parser.opts.specialSchemes[s] return dp, ok } func (u *Url) isSpecialSchemeAndBackslash(r rune) bool { ok := u.IsSpecialScheme() return ok && r == '\\' } func (u *Url) cleanDefaultPort() { if dp, ok := u.getSpecialScheme(u.scheme); ok && (u.port == nil || dp == *u.port) { u.port = nil u.decodedPort = 0 } } func (u *Url) getDefaultPort() int { if dp, ok := u.getSpecialScheme(u.scheme); ok { if p, err := strconv.Atoi(dp); err == nil { return p } } return 0 }