Skip to content

Commit c93dfe9

Browse files
committed
fix: disable HTTP/2, set proper User-Agent, and improve retry logic
Signed-off-by: vprashar2929 <vibhu.sharma2929@gmail.com>
1 parent 65d9272 commit c93dfe9

3 files changed

Lines changed: 182 additions & 54 deletions

File tree

README.md

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -150,21 +150,36 @@ validators:
150150
151151
As seen above, mdox supports validate configuration supports a few parameters and passing an array of link validators with types and regexes. The supported configuration parameters are:
152152
153-
* `timeout`: The HTTP client's timeout. Defaults to "10s".
154-
* `parallelism`: The maximum amount of concurrent HTTP requests. Defaults to 100.
153+
* `timeout`: The HTTP client's timeout. Defaults to "30s".
154+
* `parallelism`: The maximum amount of concurrent HTTP requests. Defaults to 25.
155155
* `host_max_conns`: The maximum amount of HTTP connections open per host. Defaults to 2.
156-
* `random_delay`: A random delay between 0 and this value is added between requests. It takes values like "500ms", "1s", "1m", or "1m30s". Defaults to no delay.
156+
* `random_delay`: A random delay between 0 and this value is added between requests. It takes values like "500ms", "1s", "1m", or "1m30s". Defaults to "500ms".
157157
158158
There are three types of validators:
159159
160160
* `ignore`: This type of validator makes sure that `mdox` does not check links with provided regex. This is the most common use case.
161161
* `githubPullsIssues`: This is a smart validator which only accepts a specific type of regex of the form `(^http[s]?:\/\/)(www\.)?(github\.com\/){ORG}\/{REPO}(\/pull\/|\/issues\/)`. It performs smart validation on GitHub PR and issues links, by fetching GitHub API to get the latest pull/issue number and matching regex. This makes sure that mdox doesn't get rate limited by GitHub, even when checking a large number of GitHub links(which is pretty common in documentation)!
162-
* `roundtrip`: All links are checked with the roundtrip validator by default(no need for including into config explicitly) which means that each link is visited and fails if http status code is not 200(even after retries).
162+
* `roundtrip`: All links are checked with the roundtrip validator by default(no need for including into config explicitly) which means that each link is visited and fails if http status code is not 200(even after retries). HTTP 429 (Too Many Requests) is treated as a valid response since it proves the server is alive.
163163
164164
Relative link checking *is not* affected by this configuration, as it is expected that such links will work.
165165
166166
YAML can be passed in directly as well using `links.validate.config` flag! For more details [go.dev reference](https://pkg.go.dev/github.com/bwplotka/mdox) or [Go struct](https://github.com/bwplotka/mdox/blob/main/pkg/mdformatter/linktransformer/config.go).
167167
168+
#### Recommended CI configuration
169+
170+
For CI environments, enabling caching avoids re-checking links that were recently verified. This is the most effective way to reduce flaky CI failures:
171+
172+
```yaml
173+
version: 1
174+
timeout: '1m'
175+
parallelism: 25
176+
host_max_conns: 10
177+
random_delay: '1s'
178+
cache:
179+
type: 'SQLite'
180+
jitter: '24h'
181+
```
182+
168183
### Link localization
169184
170185
It is expected for documentation to contain remote links to the project website. However, in such cases, it creates problems for multi-version docs or multi-domain websites (links would need to be updated for each version which is cumbersome). Also, it would not be navigatable locally or through GitHub (would always redirect to the website) and requires additional link checking.

pkg/mdformatter/linktransformer/link.go

Lines changed: 66 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,11 @@ func newLinktransformerMetrics(reg *prometheus.Registry) *linktransformerMetrics
9999
const (
100100
originalURLKey = "originalURLKey"
101101
numberOfRetriesKey = "retryKey"
102+
103+
maxRetries = 3
104+
defaultRequestTimeout = 30 * time.Second
105+
defaultParallelism = 25
106+
defaultRandomDelay = 500 * time.Millisecond
102107
)
103108

104109
type chain struct {
@@ -218,14 +223,14 @@ func NewValidator(ctx context.Context, logger log.Logger, linksValidateConfig []
218223
linktransformerMetrics := newLinktransformerMetrics(reg)
219224
transport := &http.Transport{
220225
Proxy: http.ProxyFromEnvironment,
221-
ForceAttemptHTTP2: true,
226+
ForceAttemptHTTP2: false,
222227
MaxIdleConns: 100,
223228
IdleConnTimeout: 90 * time.Second,
224229
TLSHandshakeTimeout: 10 * time.Second,
225-
ResponseHeaderTimeout: 30 * time.Second,
230+
ResponseHeaderTimeout: defaultRequestTimeout,
226231
ExpectContinueTimeout: 5 * time.Second,
227232
DialContext: (&net.Dialer{
228-
Timeout: 100 * time.Second,
233+
Timeout: 10 * time.Second,
229234
KeepAlive: 30 * time.Second,
230235
}).DialContext,
231236
}
@@ -238,7 +243,7 @@ func NewValidator(ctx context.Context, logger log.Logger, linksValidateConfig []
238243
validateConfig: config,
239244
localLinks: map[string]*[]string{},
240245
remoteLinks: map[string]error{},
241-
c: colly.NewCollector(colly.Async(), colly.StdlibContext(ctx)),
246+
c: colly.NewCollector(colly.Async(), colly.StdlibContext(ctx), colly.UserAgent("Mozilla/5.0 (compatible; mdox/link-checker; +https://github.com/bwplotka/mdox)")),
242247
storage: nil,
243248
destFutures: map[futureKey]*futureResult{},
244249
l: linktransformerMetrics,
@@ -257,11 +262,11 @@ func NewValidator(ctx context.Context, logger log.Logger, linksValidateConfig []
257262
},
258263
}
259264

260-
// Set very soft limits.
261-
// E.g GitHub has 50-5000 https://docs.github.com/en/free-pro-team@latest/rest/reference/rate-limit limit depending
262-
// on API (only search is below 100).
263265
if config.Timeout != "" {
264266
v.c.SetRequestTimeout(config.timeout)
267+
transport.ResponseHeaderTimeout = config.timeout
268+
} else {
269+
v.c.SetRequestTimeout(defaultRequestTimeout)
265270
}
266271

267272
if v.validateConfig.Cache.IsSet() && storage != nil {
@@ -273,7 +278,8 @@ func NewValidator(ctx context.Context, logger log.Logger, linksValidateConfig []
273278

274279
limitRule := &colly.LimitRule{
275280
DomainGlob: "*",
276-
Parallelism: 100,
281+
Parallelism: defaultParallelism,
282+
RandomDelay: defaultRandomDelay,
277283
}
278284
if config.Parallelism > 0 {
279285
limitRule.Parallelism = config.Parallelism
@@ -300,53 +306,64 @@ func NewValidator(ctx context.Context, logger log.Logger, linksValidateConfig []
300306
v.remoteLinks[response.Ctx.Get(originalURLKey)] = nil
301307
})
302308
v.c.OnError(func(response *colly.Response, err error) {
303-
v.rMu.Lock()
304-
defer v.rMu.Unlock()
305-
retriesStr := response.Ctx.Get(numberOfRetriesKey)
306-
retries, _ := strconv.Atoi(retriesStr)
307-
switch response.StatusCode {
308-
case http.StatusTooManyRequests:
309-
if retries > 0 {
310-
break
311-
}
312-
var retryAfterSeconds int
313-
// Retry calls same methods as Visit and makes request with same options.
314-
// So retryKey is incremented here if onError is called again after Retry. By default retries once.
315-
response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1))
316-
retryAfterSeconds, convErr := strconv.Atoi(response.Headers.Get("Retry-After"))
317-
if convErr != nil {
318-
retryAfterSeconds = 1
319-
}
320-
select {
321-
case <-time.After(time.Duration(retryAfterSeconds) * time.Second):
322-
case <-v.c.Context.Done():
323-
return
324-
}
325-
326-
if retryErr := response.Request.Retry(); retryErr != nil {
327-
v.remoteLinks[response.Ctx.Get(originalURLKey)] = fmt.Errorf("remote link retry %v: %w", response.Ctx.Get(originalURLKey), err)
328-
break
329-
}
330-
v.remoteLinks[response.Ctx.Get(originalURLKey)] = fmt.Errorf("%q rate limited even after retry; status code %v: %w", response.Request.URL.String(), response.StatusCode, err)
331-
// 0 StatusCode means error on call side.
332-
case http.StatusMovedPermanently, http.StatusTemporaryRedirect, http.StatusServiceUnavailable, 0:
333-
if retries > 0 {
334-
break
335-
}
336-
response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1))
309+
shouldRetry, backoff := v.recordError(response, err)
310+
if !shouldRetry {
311+
return
312+
}
337313

338-
if retryErr := response.Request.Retry(); retryErr != nil {
339-
v.remoteLinks[response.Ctx.Get(originalURLKey)] = fmt.Errorf("remote link retry %v: %w", response.Ctx.Get(originalURLKey), err)
340-
break
341-
}
342-
v.remoteLinks[response.Ctx.Get(originalURLKey)] = fmt.Errorf("%q not accessible even after retry; status code %v: %w", response.Request.URL.String(), response.StatusCode, err)
343-
default:
344-
v.remoteLinks[response.Ctx.Get(originalURLKey)] = fmt.Errorf("%q not accessible; status code %v: %w", response.Request.URL.String(), response.StatusCode, err)
314+
timer := time.NewTimer(backoff)
315+
defer timer.Stop()
316+
select {
317+
case <-timer.C:
318+
case <-v.c.Context.Done():
319+
v.rMu.Lock()
320+
v.remoteLinks[response.Ctx.Get(originalURLKey)] = fmt.Errorf(
321+
"%q not accessible; status code %v; cancelled during backoff: %w",
322+
response.Request.URL.String(), response.StatusCode, err)
323+
v.rMu.Unlock()
324+
return
345325
}
326+
327+
// In async mode Retry always returns nil (fires a new goroutine);
328+
// the retry result arrives via a later OnScraped or OnError callback.
329+
response.Request.Retry()
346330
})
347331
return v, nil
348332
}
349333

334+
// recordError evaluates a failed request and decides whether to retry.
335+
// It returns (true, backoff) when a retry is warranted.
336+
// Map writes happen under rMu so the caller can sleep without holding the lock.
337+
func (v *validator) recordError(response *colly.Response, err error) (shouldRetry bool, backoff time.Duration) {
338+
v.rMu.Lock()
339+
defer v.rMu.Unlock()
340+
341+
originalURL := response.Ctx.Get(originalURLKey)
342+
retries, _ := strconv.Atoi(response.Ctx.Get(numberOfRetriesKey))
343+
344+
switch response.StatusCode {
345+
case http.StatusTooManyRequests:
346+
// A 429 proves the server is alive and the URL is routable.
347+
// Retrying would only add load; treat as valid instead.
348+
level.Debug(v.logger).Log("msg", "rate limited, treating as valid", "url", originalURL, "status", response.StatusCode)
349+
v.remoteLinks[originalURL] = nil
350+
return false, 0
351+
352+
case http.StatusServiceUnavailable, 0:
353+
if retries < maxRetries {
354+
response.Ctx.Put(numberOfRetriesKey, strconv.Itoa(retries+1))
355+
v.remoteLinks[originalURL] = fmt.Errorf("%q not accessible; status code %v; retrying: %w", response.Request.URL.String(), response.StatusCode, err)
356+
return true, time.Duration(1<<uint(retries+1)) * time.Second
357+
}
358+
v.remoteLinks[originalURL] = fmt.Errorf("%q not accessible; status code %v after %d retries: %w", response.Request.URL.String(), response.StatusCode, retries, err)
359+
360+
default:
361+
v.remoteLinks[originalURL] = fmt.Errorf("%q not accessible; status code %v: %w", response.Request.URL.String(), response.StatusCode, err)
362+
}
363+
364+
return false, 0
365+
}
366+
350367
// MustNewValidator returns mdformatter.LinkTransformer that crawls all links.
351368
func MustNewValidator(logger log.Logger, linksValidateConfig []byte, anchorDir string, storage *cache.SQLite3Storage) mdformatter.LinkTransformer {
352369
v, err := NewValidator(context.TODO(), logger, linksValidateConfig, anchorDir, storage, nil)

pkg/mdformatter/linktransformer/link_test.go

Lines changed: 97 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,15 @@ import (
77
"context"
88
"database/sql"
99
"fmt"
10+
"net/http"
11+
"net/http/httptest"
1012
"os"
1113
"path/filepath"
1214
"regexp"
1315
"strings"
16+
"sync/atomic"
1417
"testing"
18+
"time"
1519

1620
"github.com/bwplotka/mdox/pkg/cache"
1721
"github.com/bwplotka/mdox/pkg/mdformatter"
@@ -309,7 +313,7 @@ func TestValidator_TransformDestination(t *testing.T) {
309313
MustNewValidator(logger, []byte(""), anchorDir, nil),
310314
))
311315
testutil.NotOk(t, err)
312-
testutil.Assert(t, strings.Contains(err.Error(), fmt.Sprintf("%v:1: \"https://docs.gfoogle.com/drawings/d/e/2PACX-1vTBFK_cGMbxFpYcv/pub?w=960&h=720\" not accessible even after retry; status code 0", relDirPath+filePath)))
316+
testutil.Assert(t, strings.Contains(err.Error(), fmt.Sprintf("%v:1: \"https://docs.gfoogle.com/drawings/d/e/2PACX-1vTBFK_cGMbxFpYcv/pub?w=960&h=720\" not accessible; status code 0 after %d retries", relDirPath+filePath, maxRetries)))
313317
testutil.Assert(t, strings.Contains(err.Error(), fmt.Sprintf("%v:1: \"https://bwplotka.dev/does-not-exists\" not accessible; status code 404: Not Found", relDirPath+filePath)))
314318
})
315319

@@ -483,3 +487,95 @@ func TestValidator_TransformDestination(t *testing.T) {
483487
testutil.Equals(t, "sql: no rows in result set", err.Error())
484488
})
485489
}
490+
491+
func TestValidator_RetryBehavior(t *testing.T) {
492+
tmpDir, err := os.MkdirTemp("", "test-retry")
493+
testutil.Ok(t, err)
494+
t.Cleanup(func() { testutil.Ok(t, os.RemoveAll(tmpDir)) })
495+
496+
logger := log.NewLogfmtLogger(os.Stderr)
497+
anchorDir := tmpDir
498+
499+
t.Run("429 treated as valid", func(t *testing.T) {
500+
var requestCount int32
501+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
502+
atomic.AddInt32(&requestCount, 1)
503+
w.Header().Set("Retry-After", "0")
504+
w.WriteHeader(http.StatusTooManyRequests)
505+
}))
506+
defer srv.Close()
507+
508+
testFile := filepath.Join(tmpDir, "valid-429.md")
509+
testutil.Ok(t, os.WriteFile(testFile, []byte("[link]("+srv.URL+")\n"), os.ModePerm))
510+
511+
_, err := mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}, mdformatter.WithLinkTransformer(
512+
MustNewValidator(logger, []byte(""), anchorDir, nil),
513+
))
514+
testutil.Ok(t, err)
515+
testutil.Equals(t, int32(1), atomic.LoadInt32(&requestCount))
516+
})
517+
518+
t.Run("503 recovers on retry", func(t *testing.T) {
519+
var requestCount int32
520+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
521+
count := atomic.AddInt32(&requestCount, 1)
522+
if count <= 1 {
523+
w.WriteHeader(http.StatusServiceUnavailable)
524+
return
525+
}
526+
w.WriteHeader(http.StatusOK)
527+
}))
528+
defer srv.Close()
529+
530+
testFile := filepath.Join(tmpDir, "retry-503.md")
531+
testutil.Ok(t, os.WriteFile(testFile, []byte("[link]("+srv.URL+")\n"), os.ModePerm))
532+
533+
_, err := mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}, mdformatter.WithLinkTransformer(
534+
MustNewValidator(logger, []byte(""), anchorDir, nil),
535+
))
536+
testutil.Ok(t, err)
537+
testutil.Equals(t, int32(2), atomic.LoadInt32(&requestCount))
538+
})
539+
540+
t.Run("301 is not retried", func(t *testing.T) {
541+
var requestCount int32
542+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
543+
atomic.AddInt32(&requestCount, 1)
544+
w.WriteHeader(http.StatusMovedPermanently)
545+
}))
546+
defer srv.Close()
547+
548+
testFile := filepath.Join(tmpDir, "no-retry-301.md")
549+
testutil.Ok(t, os.WriteFile(testFile, []byte("[link]("+srv.URL+")\n"), os.ModePerm))
550+
551+
_, err := mdformatter.IsFormatted(context.TODO(), logger, []string{testFile}, mdformatter.WithLinkTransformer(
552+
MustNewValidator(logger, []byte(""), anchorDir, nil),
553+
))
554+
testutil.NotOk(t, err)
555+
testutil.Assert(t, strings.Contains(err.Error(), "not accessible"))
556+
testutil.Assert(t, !strings.Contains(err.Error(), "retries"))
557+
})
558+
559+
t.Run("context cancellation stops retries", func(t *testing.T) {
560+
var requestCount int32
561+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
562+
atomic.AddInt32(&requestCount, 1)
563+
w.WriteHeader(http.StatusServiceUnavailable)
564+
}))
565+
defer srv.Close()
566+
567+
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
568+
defer cancel()
569+
570+
testFile := filepath.Join(tmpDir, "cancel-retry.md")
571+
testutil.Ok(t, os.WriteFile(testFile, []byte("[link]("+srv.URL+")\n"), os.ModePerm))
572+
573+
v, vErr := NewValidator(ctx, logger, []byte(""), anchorDir, nil, nil)
574+
testutil.Ok(t, vErr)
575+
576+
_, err := mdformatter.IsFormatted(ctx, logger, []string{testFile}, mdformatter.WithLinkTransformer(v))
577+
testutil.NotOk(t, err)
578+
testutil.Assert(t, strings.Contains(err.Error(), "cancelled during backoff") || strings.Contains(err.Error(), "not accessible"),
579+
fmt.Sprintf("unexpected error: %v", err))
580+
})
581+
}

0 commit comments

Comments
 (0)