-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathurlkey.go
More file actions
141 lines (124 loc) · 4.36 KB
/
urlkey.go
File metadata and controls
141 lines (124 loc) · 4.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
// Copyright (c) 2026 Bart Venter <72999113+bartventer@users.noreply.github.com>
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package urlkey derives stable cache-key strings from URLs by applying
// pragmatic HTTP URL normalization.
//
// The normalization follows RFC 3986 equivalence guidance and HTTP URI rules
// where applicable ([RFC 3986 §6.2], [RFC 7230 §2.7.3]). It is intended for cache
// key generation, not as a full URI canonicalization framework.
//
// [RFC 3986 §6.2]: https://datatracker.ietf.org/doc/html/rfc3986#section-6.2
// [RFC 7230 §2.7.3]: https://datatracker.ietf.org/doc/html/rfc7230#section-2.7.3
package urlkey
import (
"net/url"
"strings"
"unicode"
"github.com/bartventer/httpcache/internal/urlutil"
)
// Normalize returns a normalized URL string suitable for use as a cache key.
//
// It normalizes scheme/host case, default ports, dot-segments, and
// percent-encoding (for path and query), and excludes fragments.
//
// For opaque URLs (u.Opaque != ""), the opaque value is returned unchanged.
// A nil u yields the empty string. References without a scheme are normalized
// the same way and rendered without the "scheme:" prefix ("//host/path" when
// a host is present, otherwise just the path and query).
func Normalize(u *url.URL) string {
	if u == nil {
		return ""
	}
	if u.Opaque != "" {
		return u.Opaque
	}

	// RFC 3986 §6.2.2.3: Path normalization (dot-segment removal) is handled by
	// [url.URL.ResolveReference], which uses the RFC 3986 §5.2.4 algorithm.
	//
	// The base is built as a literal rather than re-parsed from a string:
	// parsing "://host" fails for scheme-less references, and resolving
	// against the resulting nil base would dereference a nil pointer.
	base := &url.URL{Scheme: u.Scheme, Host: u.Host}
	normalized := base.ResolveReference(u)

	// RFC 3986 §6.2.2.1: Scheme is lowercased. [url.Parse] already does this,
	// but a hand-constructed URL may still carry an uppercase scheme.
	scheme := strings.ToLower(normalized.Scheme)

	host, port := urlutil.SplitHostPort(normalized.Host)
	defaultP := urlutil.DefaultPort(scheme)
	if port == "" {
		port = defaultP
	}

	// RFC 3986 §6.2.2.1: Host is lowercased.
	hostPort := strings.ToLower(host)
	// RFC 3986 §6.2.3: Only include port if it is non-default for the scheme.
	if port != "" && port != defaultP {
		hostPort = hostPort + ":" + port
	}

	// RFC 3986 §6.2.3: An empty path for http/https is normalized to "/".
	// Also see https://datatracker.ietf.org/doc/html/rfc7230#section-2.7.3
	path := normalized.EscapedPath()
	if path == "" && (scheme == "http" || scheme == "https") {
		path = "/"
	}
	// RFC 3986 §6.2.2.2: Normalize percent-encoding in path.
	path = normalizePercentEncoding(path)

	var b strings.Builder
	switch {
	case scheme != "":
		b.WriteString(scheme)
		b.WriteString("://")
		b.WriteString(hostPort)
	case hostPort != "":
		// Network-path reference: keep the authority, omit the empty scheme.
		b.WriteString("//")
		b.WriteString(hostPort)
	}
	b.WriteString(path)

	// RFC 3986 §6.2.2.2: Normalize percent-encoding in query, if present.
	if normalized.RawQuery != "" {
		b.WriteByte('?')
		b.WriteString(normalizePercentEncoding(normalized.RawQuery))
	}

	// RFC 3986 §6.1 Equivalence: "fragment components (if any) should be excluded from
	// the comparison"
	return b.String()
}
// normalizePercentEncoding canonicalizes the percent-encoded triplets found in
// a URL path or query string, per RFC 3986 §6.2.2.2.
//
// Each "%XY" triplet with two valid hex digits is decoded to its octet. If
// isUnreserved accepts that octet it is written as a literal character;
// otherwise the triplet is re-emitted with uppercase hex digits. Every other
// byte, including a '%' not followed by two hex digits, is copied verbatim.
func normalizePercentEncoding(s string) string {
	var out strings.Builder
	for pos := 0; pos < len(s); {
		// A triplet needs the '%' plus two more bytes, both hex digits.
		isTriplet := s[pos] == '%' && pos+2 < len(s) &&
			isHexDigit(s[pos+1]) && isHexDigit(s[pos+2])
		if !isTriplet {
			out.WriteByte(s[pos])
			pos++
			continue
		}

		octet := fromHex(s[pos+1])<<4 | fromHex(s[pos+2])
		if ch := rune(octet); isUnreserved(ch) {
			out.WriteRune(ch)
		} else {
			out.WriteString(percentEncodeUpper(octet))
		}
		pos += 3
	}
	return out.String()
}
// isHexDigit reports whether c is an ASCII hexadecimal digit
// ('0'-'9', 'A'-'F', or 'a'-'f').
func isHexDigit(c byte) bool {
	switch {
	case c >= '0' && c <= '9':
		return true
	case c >= 'A' && c <= 'F':
		return true
	case c >= 'a' && c <= 'f':
		return true
	default:
		return false
	}
}
// fromHex returns the numeric value (0-15) of the ASCII hexadecimal digit c.
// Any byte that is not a hex digit yields 0.
func fromHex(c byte) byte {
	if c >= '0' && c <= '9' {
		return c - '0'
	}
	if c >= 'a' && c <= 'f' {
		return 10 + (c - 'a')
	}
	if c >= 'A' && c <= 'F' {
		return 10 + (c - 'A')
	}
	return 0
}
// isUnreserved reports whether r is an unreserved character per RFC 3986 §2.3:
// ASCII letters, ASCII digits, '-', '.', '_' and '~'.
//
// The check is limited to ASCII. Non-ASCII letters and digits are not part of
// the unreserved set and must stay percent-encoded.
func isUnreserved(r rune) bool {
	switch r {
	case '-', '.', '_', '~':
		return true
	}
	// unicode.IsLetter/IsDigit also accept non-ASCII runes (for example
	// U+00E9), so bound the check to the ASCII range first.
	return r <= unicode.MaxASCII && (unicode.IsLetter(r) || unicode.IsDigit(r))
}
// hex maps a 4-bit value to its uppercase hexadecimal digit.
const hex = "0123456789ABCDEF"

// percentEncodeUpper returns the percent-encoded form of b ("%XY") using
// uppercase hex digits as specified in RFC 3986 §2.1.
func percentEncodeUpper(b byte) string {
	hi, lo := b>>4, b&0x0F
	return string([]byte{'%', hex[hi], hex[lo]})
}