Skip to content

Commit ec776d7

Browse files
committed
fix: disable HTTP/2, set proper User-Agent, and improve retry logic
Signed-off-by: vprashar2929 <vibhu.sharma2929@gmail.com> Made-with: Cursor
1 parent 65d9272 commit ec776d7

3 files changed

Lines changed: 361 additions & 52 deletions

File tree

README.md

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -150,21 +150,36 @@ validators:
150150
151151
As seen above, the mdox validate configuration supports a few parameters and passing an array of link validators with types and regexes. The supported configuration parameters are:
152152
153-
* `timeout`: The HTTP client's timeout. Defaults to "10s".
154-
* `parallelism`: The maximum amount of concurrent HTTP requests. Defaults to 100.
153+
* `timeout`: The HTTP client's timeout. Defaults to "30s".
154+
* `parallelism`: The maximum amount of concurrent HTTP requests. Defaults to 25.
155155
* `host_max_conns`: The maximum amount of HTTP connections open per host. Defaults to 2.
156-
* `random_delay`: A random delay between 0 and this value is added between requests. It takes values like "500ms", "1s", "1m", or "1m30s". Defaults to no delay.
156+
* `random_delay`: A random delay between 0 and this value is added between requests. It takes values like "500ms", "1s", "1m", or "1m30s". Defaults to "500ms".
157157
158158
There are three types of validators:
159159
160160
* `ignore`: This type of validator makes sure that `mdox` does not check links with provided regex. This is the most common use case.
161161
* `githubPullsIssues`: This is a smart validator which only accepts a specific type of regex of the form `(^http[s]?:\/\/)(www\.)?(github\.com\/){ORG}\/{REPO}(\/pull\/|\/issues\/)`. It performs smart validation on GitHub PR and issue links, by fetching the GitHub API to get the latest pull/issue number and matching the regex. This makes sure that mdox doesn't get rate limited by GitHub, even when checking a large number of GitHub links (which is pretty common in documentation)!
162-
* `roundtrip`: All links are checked with the roundtrip validator by default(no need for including into config explicitly) which means that each link is visited and fails if http status code is not 200(even after retries).
162+
* `roundtrip`: All links are checked with the roundtrip validator by default (no need to include it in the config explicitly), which means that each link is visited and validation fails if the HTTP status code is not 200 (even after retries). Transient failures (HTTP 429, 503, 504, and network timeouts) are retried up to 3 times with exponential backoff. Permanent failures (404, DNS NXDOMAIN, TLS/certificate errors) fail immediately.
163163
164164
Relative link checking *is not* affected by this configuration, as it is expected that such links will work.
165165
166166
YAML can be passed in directly as well using `links.validate.config` flag! For more details [go.dev reference](https://pkg.go.dev/github.com/bwplotka/mdox) or [Go struct](https://github.com/bwplotka/mdox/blob/main/pkg/mdformatter/linktransformer/config.go).
167167
168+
#### Recommended CI configuration
169+
170+
For CI environments, enabling caching avoids re-checking links that were recently verified. This is the most effective way to reduce flaky CI failures from transient network issues or rate limiting:
171+
172+
```yaml
173+
version: 1
174+
timeout: '1m'
175+
parallelism: 25
176+
host_max_conns: 2
177+
random_delay: '500ms'
178+
cache:
179+
type: 'sqlite'
180+
jitter: '24h'
181+
```
182+
168183
### Link localization
169184
170185
It is expected for documentation to contain remote links to the project website. However, in such cases, it creates problems for multi-version docs or multi-domain websites (links would need to be updated for each version which is cumbersome). Also, it would not be navigatable locally or through GitHub (would always redirect to the website) and requires additional link checking.

pkg/mdformatter/linktransformer/link.go

Lines changed: 143 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@ import (
77
"bufio"
88
"bytes"
99
"context"
10+
"crypto/x509"
1011
"errors"
1112
"fmt"
1213
"io"
14+
"math/rand"
1315
"net"
1416
"net/http"
1517
"net/url"
@@ -99,6 +101,13 @@ func newLinktransformerMetrics(reg *prometheus.Registry) *linktransformerMetrics
99101
const (
100102
originalURLKey = "originalURLKey"
101103
numberOfRetriesKey = "retryKey"
104+
105+
maxRetries = 3
106+
defaultRequestTimeout = 30 * time.Second
107+
defaultParallelism = 25
108+
defaultRandomDelay = 500 * time.Millisecond
109+
defaultMaxConnsPerHost = 2
110+
maxSingleBackoff = 10 * time.Second
102111
)
103112

104113
type chain struct {
@@ -222,23 +231,25 @@ func NewValidator(ctx context.Context, logger log.Logger, linksValidateConfig []
222231
MaxIdleConns: 100,
223232
IdleConnTimeout: 90 * time.Second,
224233
TLSHandshakeTimeout: 10 * time.Second,
225-
ResponseHeaderTimeout: 30 * time.Second,
234+
ResponseHeaderTimeout: defaultRequestTimeout,
226235
ExpectContinueTimeout: 5 * time.Second,
227236
DialContext: (&net.Dialer{
228-
Timeout: 100 * time.Second,
237+
Timeout: 10 * time.Second,
229238
KeepAlive: 30 * time.Second,
230239
}).DialContext,
231240
}
232241
if config.HostMaxConns != nil {
233242
transport.MaxConnsPerHost = *config.HostMaxConns
243+
} else {
244+
transport.MaxConnsPerHost = defaultMaxConnsPerHost
234245
}
235246
v := &validator{
236247
logger: logger,
237248
anchorDir: anchorDir,
238249
validateConfig: config,
239250
localLinks: map[string]*[]string{},
240251
remoteLinks: map[string]error{},
241-
c: colly.NewCollector(colly.Async(), colly.StdlibContext(ctx)),
252+
c: colly.NewCollector(colly.Async(), colly.StdlibContext(ctx), colly.UserAgent("Mozilla/5.0 (compatible; mdox/link-checker; +https://github.com/bwplotka/mdox)")),
242253
storage: nil,
243254
destFutures: map[futureKey]*futureResult{},
244255
l: linktransformerMetrics,
@@ -257,11 +268,11 @@ func NewValidator(ctx context.Context, logger log.Logger, linksValidateConfig []
257268
},
258269
}
259270

260-
// Set very soft limits.
261-
// E.g GitHub has 50-5000 https://docs.github.com/en/free-pro-team@latest/rest/reference/rate-limit limit depending
262-
// on API (only search is below 100).
263271
if config.Timeout != "" {
264272
v.c.SetRequestTimeout(config.timeout)
273+
transport.ResponseHeaderTimeout = config.timeout
274+
} else {
275+
v.c.SetRequestTimeout(defaultRequestTimeout)
265276
}
266277

267278
if v.validateConfig.Cache.IsSet() && storage != nil {
@@ -273,7 +284,8 @@ func NewValidator(ctx context.Context, logger log.Logger, linksValidateConfig []
273284

274285
limitRule := &colly.LimitRule{
275286
DomainGlob: "*",
276-
Parallelism: 100,
287+
Parallelism: defaultParallelism,
288+
RandomDelay: defaultRandomDelay,
277289
}
278290
if config.Parallelism > 0 {
279291
limitRule.Parallelism = config.Parallelism
@@ -300,53 +312,137 @@ func NewValidator(ctx context.Context, logger log.Logger, linksValidateConfig []
300312
v.remoteLinks[response.Ctx.Get(originalURLKey)] = nil
301313
})
302314
v.c.OnError(func(response *colly.Response, err error) {
303-
v.rMu.Lock()
304-
defer v.rMu.Unlock()
305-
retriesStr := response.Ctx.Get(numberOfRetriesKey)
306-
retries, _ := strconv.Atoi(retriesStr)
307-
switch response.StatusCode {
308-
case http.StatusTooManyRequests:
309-
if retries > 0 {
310-
break
311-
}
312-
var retryAfterSeconds int
313-
// Retry calls same methods as Visit and makes request with same options.
314-
// So retryKey is incremented here if onError is called again after Retry. By default retries once.
315-
response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1))
316-
retryAfterSeconds, convErr := strconv.Atoi(response.Headers.Get("Retry-After"))
317-
if convErr != nil {
318-
retryAfterSeconds = 1
319-
}
320-
select {
321-
case <-time.After(time.Duration(retryAfterSeconds) * time.Second):
322-
case <-v.c.Context.Done():
323-
return
324-
}
315+
shouldRetry, backoff := v.recordError(response, err)
316+
if !shouldRetry {
317+
return
318+
}
325319

326-
if retryErr := response.Request.Retry(); retryErr != nil {
327-
v.remoteLinks[response.Ctx.Get(originalURLKey)] = fmt.Errorf("remote link retry %v: %w", response.Ctx.Get(originalURLKey), err)
328-
break
329-
}
330-
v.remoteLinks[response.Ctx.Get(originalURLKey)] = fmt.Errorf("%q rate limited even after retry; status code %v: %w", response.Request.URL.String(), response.StatusCode, err)
331-
// 0 StatusCode means error on call side.
332-
case http.StatusMovedPermanently, http.StatusTemporaryRedirect, http.StatusServiceUnavailable, 0:
333-
if retries > 0 {
334-
break
335-
}
336-
response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1))
320+
timer := time.NewTimer(backoff)
321+
defer timer.Stop()
322+
select {
323+
case <-timer.C:
324+
case <-v.c.Context.Done():
325+
v.rMu.Lock()
326+
v.remoteLinks[response.Ctx.Get(originalURLKey)] = fmt.Errorf(
327+
"%q not accessible; status code %v; cancelled during backoff: %w",
328+
response.Request.URL.String(), response.StatusCode, err)
329+
v.rMu.Unlock()
330+
return
331+
}
337332

338-
if retryErr := response.Request.Retry(); retryErr != nil {
339-
v.remoteLinks[response.Ctx.Get(originalURLKey)] = fmt.Errorf("remote link retry %v: %w", response.Ctx.Get(originalURLKey), err)
340-
break
341-
}
342-
v.remoteLinks[response.Ctx.Get(originalURLKey)] = fmt.Errorf("%q not accessible even after retry; status code %v: %w", response.Request.URL.String(), response.StatusCode, err)
343-
default:
344-
v.remoteLinks[response.Ctx.Get(originalURLKey)] = fmt.Errorf("%q not accessible; status code %v: %w", response.Request.URL.String(), response.StatusCode, err)
333+
if retryErr := response.Request.Retry(); retryErr != nil {
334+
v.rMu.Lock()
335+
v.remoteLinks[response.Ctx.Get(originalURLKey)] = fmt.Errorf(
336+
"retry failed for %v: %w", response.Ctx.Get(originalURLKey), err)
337+
v.rMu.Unlock()
345338
}
346339
})
347340
return v, nil
348341
}
349342

343+
// recordError classifies a failed request and decides whether to retry.
344+
// Returns (true, backoff) when a retry is warranted.
345+
// All map writes happen under rMu; the caller sleeps without holding the lock.
346+
func (v *validator) recordError(response *colly.Response, err error) (shouldRetry bool, backoff time.Duration) {
347+
v.rMu.Lock()
348+
defer v.rMu.Unlock()
349+
350+
originalURL := response.Ctx.Get(originalURLKey)
351+
retries, _ := strconv.Atoi(response.Ctx.Get(numberOfRetriesKey))
352+
353+
switch response.StatusCode {
354+
case http.StatusTooManyRequests:
355+
if retries >= maxRetries {
356+
v.remoteLinks[originalURL] = fmt.Errorf(
357+
"%q rate limited; status code %v after %d retries: %w",
358+
response.Request.URL.String(), response.StatusCode, retries, err)
359+
return false, 0
360+
}
361+
response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1))
362+
return true, retryAfterOrBackoff(response, retries)
363+
364+
case http.StatusServiceUnavailable, http.StatusGatewayTimeout:
365+
if retries >= maxRetries {
366+
v.remoteLinks[originalURL] = fmt.Errorf(
367+
"%q not accessible; status code %v after %d retries: %w",
368+
response.Request.URL.String(), response.StatusCode, retries, err)
369+
return false, 0
370+
}
371+
response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1))
372+
return true, exponentialBackoff(retries)
373+
374+
case 0:
375+
if isTransientError(err) && retries < maxRetries {
376+
response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1))
377+
return true, exponentialBackoff(retries)
378+
}
379+
v.remoteLinks[originalURL] = fmt.Errorf(
380+
"%q not accessible; status code %v: %w",
381+
response.Request.URL.String(), response.StatusCode, err)
382+
383+
default:
384+
v.remoteLinks[originalURL] = fmt.Errorf(
385+
"%q not accessible; status code %v: %w",
386+
response.Request.URL.String(), response.StatusCode, err)
387+
}
388+
389+
return false, 0
390+
}
391+
392+
// isTransientError classifies the underlying error for status-0 responses.
393+
// Only errors known to be transient return true; permanent failures
394+
// (NXDOMAIN, TLS/certificate) and unknown errors default to false.
395+
func isTransientError(err error) bool {
396+
if err == nil {
397+
return false
398+
}
399+
400+
var dnsErr *net.DNSError
401+
if errors.As(err, &dnsErr) {
402+
return !dnsErr.IsNotFound && dnsErr.IsTemporary
403+
}
404+
405+
var x509CertInvalid x509.CertificateInvalidError
406+
var x509HostErr x509.HostnameError
407+
var x509AuthErr x509.UnknownAuthorityError
408+
if errors.As(err, &x509CertInvalid) || errors.As(err, &x509HostErr) || errors.As(err, &x509AuthErr) {
409+
return false
410+
}
411+
412+
var netErr net.Error
413+
if errors.As(err, &netErr) && netErr.Timeout() {
414+
return true
415+
}
416+
417+
return false
418+
}
419+
420+
// retryAfterOrBackoff honours the Retry-After header when present and within
421+
// the per-retry budget, otherwise falls back to exponential backoff with jitter.
422+
func retryAfterOrBackoff(response *colly.Response, retries int) time.Duration {
423+
if ra := response.Headers.Get("Retry-After"); ra != "" {
424+
if seconds, err := strconv.Atoi(ra); err == nil && seconds > 0 {
425+
d := time.Duration(seconds) * time.Second
426+
if d <= maxSingleBackoff {
427+
return d
428+
}
429+
}
430+
}
431+
return exponentialBackoff(retries)
432+
}
433+
434+
// exponentialBackoff returns 2^(retries+1) seconds with additive jitter,
435+
// capped at maxSingleBackoff.
436+
func exponentialBackoff(retries int) time.Duration {
437+
base := time.Duration(1<<uint(retries+1)) * time.Second
438+
jitter := time.Duration(rand.Int63n(int64(base) / 2))
439+
d := base + jitter
440+
if d > maxSingleBackoff {
441+
d = maxSingleBackoff
442+
}
443+
return d
444+
}
445+
350446
// MustNewValidator returns mdformatter.LinkTransformer that crawls all links.
351447
func MustNewValidator(logger log.Logger, linksValidateConfig []byte, anchorDir string, storage *cache.SQLite3Storage) mdformatter.LinkTransformer {
352448
v, err := NewValidator(context.TODO(), logger, linksValidateConfig, anchorDir, storage, nil)

0 commit comments

Comments
 (0)