Skip to content

Commit c6c0ff0

Browse files
committed
Add google bot IP "firewall"
1 parent 5db058d commit c6c0ff0

4 files changed

Lines changed: 147 additions & 1 deletion

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ To avoid having this middleware impact your SEO score, it's recommended to provi
157157
A good default value for `goodBots` would be:
158158

159159
```
160-
goodBots: apple.com,archive.org,duckduckgo.com,facebook.com,google.com,googlebot.com,googleusercontent.com,instagram.com,kagibot.org,linkedin.com,msn.com,openalex.org,twitter.com,x.com
160+
goodBots: apple.com,archive.org,duckduckgo.com,facebook.com,google.com,instagram.com,kagibot.org,linkedin.com,msn.com,openalex.org,twitter.com,x.com
161161
```
162162

163163
**However** if you set the config parameter `protectParameters="true"`, even good bots won't be allowed to crawl protected routes if a URL parameter is on the request (e.g. `/foo?bar=baz`). This `protectParameters` feature is meant to help protect faceted search pages.

ci/docker-compose.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ services:
1919
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.ipForwardedHeader: "X-Forwarded-For"
2020
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.logLevel: "DEBUG"
2121
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.goodBots: ""
22+
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableGooglebotIPCheck: "true"
2223
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.protectRoutes: "/"
2324
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.persistentStateFile: "/tmp/state.json"
2425
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableStateReconciliation: "true"
@@ -48,6 +49,7 @@ services:
4849
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.ipForwardedHeader: "X-Forwarded-For"
4950
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.logLevel: "DEBUG"
5051
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.goodBots: ""
52+
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableGooglebotIPCheck: "true"
5153
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.protectRoutes: "/"
5254
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.persistentStateFile: "/tmp/state.json"
5355
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableStateReconciliation: "true"

internal/state/google.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package state
2+
3+
import (
4+
"log"
5+
"net"
6+
"sync"
7+
)
8+
9+
// GooglebotIPs is a set of IP networks that is safe for concurrent use.
// It is refreshed as a whole through Update and queried through Contains.
// The zero value is an empty set and is ready to use.
type GooglebotIPs struct {
	cidrs []*net.IPNet
	mu    sync.RWMutex // guards cidrs
}

// NewGooglebotIPs returns an empty GooglebotIPs. Contains reports false for
// every address until Update installs at least one valid network.
func NewGooglebotIPs() *GooglebotIPs {
	return &GooglebotIPs{
		cidrs: make([]*net.IPNet, 0),
	}
}

// Update replaces the current list of networks with the ones parsed from
// cidrs. Entries that are not valid CIDR notation are logged and skipped;
// passing an empty or nil slice clears the set.
func (g *GooglebotIPs) Update(cidrs []string) {
	// Parse (and log) before taking the write lock so concurrent Contains
	// callers are only blocked for the duration of the slice swap.
	parsed := make([]*net.IPNet, 0, len(cidrs))

	for _, s := range cidrs {
		_, network, err := net.ParseCIDR(s)
		if err != nil {
			log.Printf("error parsing CIDR %s: %v", s, err)

			continue
		}

		parsed = append(parsed, network)
	}

	g.mu.Lock()
	g.cidrs = parsed
	g.mu.Unlock()
}

// Contains reports whether ip falls inside any of the stored networks.
// A nil or empty ip (for example the result of net.ParseIP on malformed
// input) is never contained.
func (g *GooglebotIPs) Contains(ip net.IP) bool {
	if len(ip) == 0 {
		return false
	}

	g.mu.RLock()
	defer g.mu.RUnlock()

	for _, network := range g.cidrs {
		if network.Contains(ip) {
			return true
		}
	}

	return false
}

main.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ type Config struct {
7575
// Only enable this if running multiple plugin instances sharing the same state file.
7676
// Performance warning: Not recommended for sites with >1M unique visitors (see internal/state/state_stress_test.go).
7777
EnableStateReconciliation string `json:"enableStateReconciliation"`
78+
EnableGooglebotIPCheck string `json:"enableGooglebotIPCheck"`
7879
Mode string `json:"mode"`
7980
PeriodSeconds int `json:"periodSeconds"`
8081
FailureThreshold int `json:"failureThreshold"`
@@ -89,6 +90,7 @@ type CaptchaProtect struct {
8990
rateCache *lru.Cache
9091
verifiedCache *lru.Cache
9192
botCache *lru.Cache
93+
googlebotIPs *state.GooglebotIPs
9294
captchaConfig CaptchaConfig
9395
exemptIps []*net.IPNet
9496
tmpl *template.Template
@@ -138,6 +140,7 @@ func CreateConfig() *Config {
138140
CaptchaProvider: "turnstile",
139141
Mode: "prefix",
140142
EnableStateReconciliation: "false",
143+
EnableGooglebotIPCheck: "false",
141144
PeriodSeconds: DefaultHealthCheckPeriodSeconds,
142145
FailureThreshold: DefaultHealthCheckFailureThreshold,
143146
}
@@ -329,9 +332,91 @@ func NewCaptchaProtect(ctx context.Context, next http.Handler, config *Config, n
329332
}()
330333
}
331334

335+
if config.EnableGooglebotIPCheck == "true" {
336+
log.Info("Googlebot IP check enabled")
337+
bc.googlebotIPs = state.NewGooglebotIPs()
338+
childCtx, cancel := context.WithCancel(ctx)
339+
go bc.googlebotIPCheckLoop(childCtx)
340+
go func() {
341+
<-ctx.Done()
342+
log.Debug("Context canceled, stopping Googlebot IP check loop")
343+
cancel()
344+
}()
345+
}
346+
332347
return &bc, nil
333348
}
334349

350+
// googlebotPrefix is a single entry of the "prefixes" array. Each entry
// carries its range in the ipv4Prefix or ipv6Prefix key; a key that is absent
// from the JSON leaves the corresponding field as the empty string.
type googlebotPrefix struct {
	IPv4Prefix string `json:"ipv4Prefix"`
	IPv6Prefix string `json:"ipv6Prefix"`
}

// googlebotIPs is the shape of the JSON document decoded by
// fetchGooglebotIPs. Only the "prefixes" array is read.
type googlebotIPs struct {
	Prefixes []googlebotPrefix `json:"prefixes"`
}
356+
357+
func (bc *CaptchaProtect) googlebotIPCheckLoop(ctx context.Context) {
358+
ticker := time.NewTicker(24 * time.Hour)
359+
defer ticker.Stop()
360+
361+
// Initial fetch
362+
bc.fetchGooglebotIPs()
363+
364+
for {
365+
select {
366+
case <-ticker.C:
367+
bc.fetchGooglebotIPs()
368+
case <-ctx.Done():
369+
return
370+
}
371+
}
372+
}
373+
374+
func (bc *CaptchaProtect) fetchGooglebotIPs() {
375+
bc.log.Debug("Fetching Googlebot IPs")
376+
377+
req, err := http.NewRequest(http.MethodGet, "https://developers.google.com/static/search/apis/ipranges/googlebot.json", nil)
378+
if err != nil {
379+
bc.log.Error("Failed to create Googlebot IP request", "err", err)
380+
return
381+
}
382+
383+
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
384+
defer cancel()
385+
req = req.WithContext(ctx)
386+
387+
resp, err := bc.httpClient.Do(req)
388+
if err != nil {
389+
bc.log.Error("Failed to fetch Googlebot IPs", "err", err)
390+
return
391+
}
392+
defer resp.Body.Close()
393+
394+
if resp.StatusCode != http.StatusOK {
395+
bc.log.Error("Failed to fetch Googlebot IPs", "statusCode", resp.StatusCode)
396+
return
397+
}
398+
399+
var ips googlebotIPs
400+
err = json.NewDecoder(resp.Body).Decode(&ips)
401+
if err != nil {
402+
bc.log.Error("Failed to decode Googlebot IPs", "err", err)
403+
return
404+
}
405+
406+
var cidrs []string
407+
for _, prefix := range ips.Prefixes {
408+
if prefix.IPv4Prefix != "" {
409+
cidrs = append(cidrs, prefix.IPv4Prefix)
410+
}
411+
if prefix.IPv6Prefix != "" {
412+
cidrs = append(cidrs, prefix.IPv6Prefix)
413+
}
414+
}
415+
416+
bc.googlebotIPs.Update(cidrs)
417+
bc.log.Info("Updated Googlebot IPs", "count", len(cidrs))
418+
}
419+
335420
// getCaptchaConfig returns the captcha configuration for a given provider.
336421
// Returns an empty CaptchaConfig if the provider is invalid.
337422
func getCaptchaConfig(provider string) CaptchaConfig {
@@ -899,6 +984,12 @@ func (bc *CaptchaProtect) isGoodBot(req *http.Request, clientIP string) bool {
899984
}
900985
}
901986

987+
if bc.config.EnableGooglebotIPCheck == "true" {
988+
if bc.googlebotIPs.Contains(net.ParseIP(clientIP)) {
989+
return true
990+
}
991+
}
992+
902993
bot, ok := bc.botCache.Get(clientIP)
903994
if ok {
904995
return bot.(bool)

0 commit comments

Comments
 (0)