Skip to content

Commit abd1789

Browse files
authored
Merge pull request #867 from rodionsteshenko/fix-issue-745
Fix #745: Don't attempt gzip decompression of non-2xx responses for .xml.gz URLs
2 parents 93e4e02 + eab972b commit abd1789

2 files changed

Lines changed: 43 additions & 1 deletion

File tree

colly_test.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,15 @@ y">link</a>
281281
}
282282
})
283283

284+
mux.HandleFunc("/sitemap.xml.gz", func(w http.ResponseWriter, r *http.Request) {
285+
// Return a 404 HTML page for a non-existent .xml.gz URL.
286+
// This simulates the scenario in issue #745 where a server
287+
// returns an HTML error page for a missing gzipped sitemap.
288+
w.Header().Set("Content-Type", "text/html")
289+
w.WriteHeader(404)
290+
w.Write([]byte(`<!DOCTYPE html><html><body><h1>404 Not Found</h1></body></html>`))
291+
})
292+
284293
return httptest.NewUnstartedServer(mux)
285294
}
286295

@@ -1926,3 +1935,36 @@ func TestCheckRequestHeadersFunc(t *testing.T) {
19261935
t.Error("TestCheckRequestHeadersFunc failed")
19271936
}
19281937
}
1938+
1939+
func TestIssue745GzipURLWith404Response(t *testing.T) {
1940+
ts := newTestServer()
1941+
defer ts.Close()
1942+
1943+
c := NewCollector()
1944+
1945+
var responseStatusCode int
1946+
c.OnError(func(resp *Response, err error) {
1947+
responseStatusCode = resp.StatusCode
1948+
// The error should NOT be "gzip: invalid header".
1949+
// A 404 response for a .xml.gz URL should be treated as a
1950+
// normal HTTP error, not a decompression failure.
1951+
if strings.Contains(err.Error(), "gzip") {
1952+
t.Errorf("Expected HTTP error, got gzip decompression error: %v", err)
1953+
}
1954+
})
1955+
1956+
c.OnResponse(func(resp *Response) {
1957+
// A 404 is expected to be reported through OnError; if it is delivered here instead, still record the status so the final check can see it.
1958+
if resp.StatusCode == 404 {
1959+
responseStatusCode = resp.StatusCode
1960+
}
1961+
})
1962+
1963+
c.Visit(ts.URL + "/sitemap.xml.gz")
1964+
1965+
// The response should have been received (either via OnError or OnResponse)
1966+
// with status 404, not a gzip decompression error
1967+
if responseStatusCode != 404 {
1968+
t.Errorf("Expected status code 404, got %d", responseStatusCode)
1969+
}
1970+
}

http_backend.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ func (h *httpBackend) Do(request *http.Request, bodySize int, checkRequestHeader
207207
bodyReader = io.LimitReader(bodyReader, int64(bodySize))
208208
}
209209
contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding"))
210-
if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz")) {
210+
if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || (strings.HasSuffix(strings.ToLower(finalRequest.URL.Path), ".xml.gz") && res.StatusCode >= 200 && res.StatusCode < 300)) {
211211
bodyReader, err = gzip.NewReader(bodyReader)
212212
if err != nil {
213213
return nil, err

0 commit comments

Comments
 (0)