From f4e5c5c7b48b6803998adbaab79582cc36142ffb Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Sun, 14 Jun 2026 19:33:23 +0200 Subject: [PATCH 1/3] 1107 Add okhttp IP address filter Optionally limit or block connections to IP address ranges once the host name has been resolved. Filtering URLs is not sufficient since a DNS entry may resolve to a private or loopback address, leaking information to a public index or archive, so the filtering happens at the protocol level. Adds CIDR and IPFilterRules helpers and a network interceptor for the okhttp HttpProtocol, configured via http.filter.ipaddress.include and http.filter.ipaddress.exclude (comma-separated string or YAML list, supporting single IPs, CIDR blocks, localhost/loopback and sitelocal). Includes unit tests and documentation updates. Closes #1107 --- .../stormcrawler/protocol/okhttp/CIDR.java | 83 +++++++++++ .../protocol/okhttp/HttpProtocol.java | 39 +++++ .../protocol/okhttp/IPFilterRules.java | 137 ++++++++++++++++++ core/src/main/resources/crawler-default.yaml | 14 ++ .../protocol/okhttp/CIDRTest.java | 73 ++++++++++ .../protocol/okhttp/IPFilterRulesTest.java | 101 +++++++++++++ docs/src/main/asciidoc/configuration.adoc | 35 +++++ 7 files changed, 482 insertions(+) create mode 100644 core/src/main/java/org/apache/stormcrawler/protocol/okhttp/CIDR.java create mode 100644 core/src/main/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRules.java create mode 100644 core/src/test/java/org/apache/stormcrawler/protocol/okhttp/CIDRTest.java create mode 100644 core/src/test/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRulesTest.java diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/CIDR.java b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/CIDR.java new file mode 100644 index 000000000..0948e0f9c --- /dev/null +++ b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/CIDR.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one 
or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.protocol.okhttp; + +import com.google.common.net.InetAddresses; +import java.net.InetAddress; + +/** + * Parse a CIDR block notation and test + * whether an IP address is contained in the subnet range defined by the CIDR. 
+ */ +public class CIDR { + + private final InetAddress addr; + private final int mask; + + public CIDR(InetAddress address, int mask) { + this.addr = address; + this.mask = mask; + } + + public CIDR(String cidr) throws IllegalArgumentException { + String ipStr = cidr; + int sep = cidr.indexOf('/'); + if (sep > -1) { + ipStr = cidr.substring(0, sep); + } + addr = InetAddresses.forString(ipStr); + int parsedMask; + if (sep > -1) { + parsedMask = Integer.parseInt(cidr.substring(sep + 1)); + } else { + parsedMask = addr.getAddress().length * 8; + } + if (cidr.indexOf(':') > -1 && addr.getAddress().length == 4) { + // IPv4-mapped IPv6 addresses are automatically converted to IPv4, + // need to shift the mask + parsedMask = Math.max(0, parsedMask - 96); + } + this.mask = parsedMask; + } + + public boolean contains(InetAddress address) { + byte[] addr0 = addr.getAddress(); + byte[] addr1 = address.getAddress(); + if (addr0.length != addr1.length) { + // not comparing IPv4 and IPv6 addresses + return false; + } + for (int i = 0; i < addr0.length; i++) { + int remainingMaskBits = mask - (i * 8); + if (remainingMaskBits <= 0) { + return true; + } + int m = ~(0xff >> remainingMaskBits); // mask for byte under cursor + if ((addr0[i] & m) != (addr1[i] & m)) { + return false; + } + } + return true; + } + + @Override + public String toString() { + return addr + "/" + mask; + } +} diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java index cbcd4bf72..89a46f727 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.io.InterruptedIOException; +import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.MalformedURLException; import java.net.Proxy; @@ -226,6 +227,14 @@ public 
void configure(Config conf) { customHeaders.forEach(customRequestHeaders::add); + // optionally block connections to forbidden IP address ranges + // (e.g. localhost/loopback, private/site-local addresses), see + // https://github.com/apache/stormcrawler/issues/1107 + final IPFilterRules ipFilterRules = new IPFilterRules(conf); + if (!ipFilterRules.isEmpty()) { + builder.addNetworkInterceptor(new HTTPFilterIPAddressInterceptor(ipFilterRules)); + } + if (storeHttpHeaders) { builder.addNetworkInterceptor(new HTTPHeadersInterceptor()); } @@ -538,6 +547,36 @@ private byte[] toByteArray( return arr; } + /** + * Network interceptor blocking connections to IP addresses rejected by the configured {@link + * IPFilterRules}. The IP address is only known once the connection has been established, hence + * the filtering happens at the protocol level rather than by filtering URLs. + */ + static class HTTPFilterIPAddressInterceptor implements Interceptor { + + private final IPFilterRules rules; + + HTTPFilterIPAddressInterceptor(IPFilterRules rules) { + this.rules = rules; + } + + @NotNull + @Override + public Response intercept(Interceptor.Chain chain) throws IOException { + final Connection connection = Objects.requireNonNull(chain.connection()); + final InetAddress address = connection.socket().getInetAddress(); + final Request request = chain.request(); + + if (rules.accept(address)) { + return chain.proceed(request); + } + + final String hostAddress = address == null ? 
"unknown" : address.getHostAddress(); + LOG.warn("Blocked connection to IP address {}: {}", hostAddress, request.url()); + throw new IOException("Forbidden connection to IP address " + hostAddress); + } + } + static class HTTPHeadersInterceptor implements Interceptor { private String getNormalizedProtocolName(Protocol protocol) { diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRules.java b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRules.java new file mode 100644 index 000000000..d3bc73f9b --- /dev/null +++ b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRules.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.stormcrawler.protocol.okhttp; + +import java.net.InetAddress; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.function.Predicate; +import org.apache.commons.lang3.StringUtils; +import org.apache.stormcrawler.util.ConfUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Optionally limit or block connections to IP address ranges (localhost/loopback or site-local + * addresses, subnet ranges given in CIDR notation, or single IP addresses). + * + *

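<p>IP filter rules are built from two configuration properties: + * + * <ul> + * <li>{@code http.filter.ipaddress.include} defines the allowed IP ranges; if empty, all + * addresses are allowed unless explicitly excluded + * <li>{@code http.filter.ipaddress.exclude} defines the blocked IP ranges; an excluded + * address is always blocked, even if it also matches an include rule + * </ul>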

+ * + *

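<p>IP ranges can be defined as + * + * <ul> + * <li>a single IP address, e.g. {@code 127.0.0.1} or {@code ::1} + * <li>a subnet range in CIDR notation, e.g. {@code 192.168.0.0/16} or {@code fd00::/8} + * <li>{@code localhost} or {@code loopback}, matching {@link InetAddress#isLoopbackAddress()} + * <li>{@code sitelocal}, matching {@link InetAddress#isSiteLocalAddress()} + * </ul>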

+ * + *

Multiple IP ranges can be given either as a comma-separated string, e.g. + * loopback,sitelocal,fd00::/8, or as a list in the configuration. + */ +public class IPFilterRules { + + protected static final Logger LOG = LoggerFactory.getLogger(IPFilterRules.class); + + public static final String INCLUDE_RULES_KEY = "http.filter.ipaddress.include"; + public static final String EXCLUDE_RULES_KEY = "http.filter.ipaddress.exclude"; + + private final List> includeRules; + private final List> excludeRules; + + public IPFilterRules(Map conf) { + includeRules = parseIPRules(conf, INCLUDE_RULES_KEY); + excludeRules = parseIPRules(conf, EXCLUDE_RULES_KEY); + } + + public boolean isEmpty() { + return includeRules.isEmpty() && excludeRules.isEmpty(); + } + + public boolean accept(InetAddress address) { + if (address == null) { + return false; + } + boolean accept = true; + if (!includeRules.isEmpty()) { + accept = false; + for (Predicate rule : includeRules) { + if (rule.test(address)) { + accept = true; + break; + } + } + } + if (accept && !excludeRules.isEmpty()) { + for (Predicate rule : excludeRules) { + if (rule.test(address)) { + accept = false; + break; + } + } + } + return accept; + } + + private static List> parseIPRules( + Map conf, String ipRuleProperty) { + List> rules = new ArrayList<>(); + for (String entry : ConfUtils.loadListFromConf(ipRuleProperty, conf)) { + // a single config value may itself hold a comma-separated list of rules + for (String ipRule : entry.split(",")) { + ipRule = ipRule.trim(); + if (StringUtils.isBlank(ipRule)) { + continue; + } + switch (ipRule.toLowerCase(Locale.ROOT)) { + case "localhost": + case "loopback": + rules.add(InetAddress::isLoopbackAddress); + break; + case "sitelocal": + rules.add(InetAddress::isSiteLocalAddress); + break; + default: + try { + CIDR cidr = new CIDR(ipRule); + rules.add(cidr::contains); + } catch (IllegalArgumentException e) { + LOG.error( + "Failed to parse {} as CIDR, ignoring it while configuring IP rules 
({})", + ipRule, + ipRuleProperty); + } + } + } + } + if (!rules.isEmpty()) { + LOG.info("Found {} IP filter rule(s) for {}", rules.size(), ipRuleProperty); + } + return rules; + } +} diff --git a/core/src/main/resources/crawler-default.yaml b/core/src/main/resources/crawler-default.yaml index d024ed941..d45cbc6b6 100644 --- a/core/src/main/resources/crawler-default.yaml +++ b/core/src/main/resources/crawler-default.yaml @@ -144,6 +144,20 @@ config: # Follow redirect HTTP responses: http.allow.redirects: false + # IP address filtering (okhttp protocol only). Optionally limit or block + # connections to IP address ranges once the host name has been resolved. This + # prevents information leakage to a public index when a DNS entry points to a + # private or loopback address. Rules can be given as a comma-separated string + # or as a list and may be: + # - a single IP address, e.g. "127.0.0.1" or "::1" + # - a CIDR block, e.g. "192.168.0.0/16" or "fd00::/8" + # - "localhost" / "loopback" (matches InetAddress.isLoopbackAddress()) + # - "sitelocal" (matches InetAddress.isSiteLocalAddress()) + # Only addresses matching an include rule are fetched (empty means all are + # allowed), addresses matching an exclude rule are always blocked. + # http.filter.ipaddress.include: + # http.filter.ipaddress.exclude: "localhost,sitelocal" + # Allow all if robots.txt cannot be parsed due to code 403 (Forbidden): http.robots.403.allow: true diff --git a/core/src/test/java/org/apache/stormcrawler/protocol/okhttp/CIDRTest.java b/core/src/test/java/org/apache/stormcrawler/protocol/okhttp/CIDRTest.java new file mode 100644 index 000000000..aaa5f4f1e --- /dev/null +++ b/core/src/test/java/org/apache/stormcrawler/protocol/okhttp/CIDRTest.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.stormcrawler.protocol.okhttp; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import com.google.common.net.InetAddresses; +import java.net.InetAddress; +import org.junit.jupiter.api.Test; + +class CIDRTest { + + private static InetAddress ip(String address) { + return InetAddresses.forString(address); + } + + @Test + void singleIPv4AddressMatchesOnlyItself() { + CIDR cidr = new CIDR("127.0.0.1"); + assertTrue(cidr.contains(ip("127.0.0.1"))); + assertFalse(cidr.contains(ip("127.0.0.2"))); + } + + @Test + void ipv4CidrBlockMatchesAddressesInRange() { + CIDR cidr = new CIDR("192.168.0.0/16"); + assertTrue(cidr.contains(ip("192.168.0.1"))); + assertTrue(cidr.contains(ip("192.168.255.255"))); + assertFalse(cidr.contains(ip("192.169.0.1"))); + assertFalse(cidr.contains(ip("10.0.0.1"))); + } + + @Test + void ipv6CidrBlockMatchesAddressesInRange() { + CIDR cidr = new CIDR("fd00::/8"); + assertTrue(cidr.contains(ip("fd00::1"))); + assertTrue(cidr.contains(ip("fdff::ffff"))); + assertFalse(cidr.contains(ip("fc00::1"))); + } + + @Test + void ipv4AndIpv6AreNotComparable() { + CIDR cidr = new CIDR("192.168.0.0/16"); + assertFalse(cidr.contains(ip("::1"))); + } + + @Test + void invalidCidrThrows() { + 
assertThrows(IllegalArgumentException.class, () -> new CIDR("not-an-ip")); + } + + @Test + void defaultMaskCoversFullAddress() { + assertTrue(new CIDR("10.0.0.1").toString().endsWith("/32")); + } +} diff --git a/core/src/test/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRulesTest.java b/core/src/test/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRulesTest.java new file mode 100644 index 000000000..6adad523d --- /dev/null +++ b/core/src/test/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRulesTest.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.stormcrawler.protocol.okhttp; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import com.google.common.net.InetAddresses; +import java.net.InetAddress; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import org.junit.jupiter.api.Test; + +class IPFilterRulesTest { + + private static InetAddress ip(String address) { + return InetAddresses.forString(address); + } + + private static IPFilterRules rules(Object include, Object exclude) { + Map conf = new HashMap<>(); + if (include != null) { + conf.put(IPFilterRules.INCLUDE_RULES_KEY, include); + } + if (exclude != null) { + conf.put(IPFilterRules.EXCLUDE_RULES_KEY, exclude); + } + return new IPFilterRules(conf); + } + + @Test + void emptyConfigAcceptsEverything() { + IPFilterRules r = rules(null, null); + assertTrue(r.isEmpty()); + assertTrue(r.accept(ip("127.0.0.1"))); + assertTrue(r.accept(ip("8.8.8.8"))); + } + + @Test + void excludeLoopbackAndSitelocalAsCommaSeparatedString() { + IPFilterRules r = rules(null, "localhost,sitelocal"); + assertFalse(r.isEmpty()); + assertFalse(r.accept(ip("127.0.0.1"))); + assertFalse(r.accept(ip("::1"))); + assertFalse(r.accept(ip("192.168.1.10"))); + assertFalse(r.accept(ip("10.0.0.1"))); + assertTrue(r.accept(ip("8.8.8.8"))); + } + + @Test + void excludeAsYamlList() { + IPFilterRules r = rules(null, Arrays.asList("loopback", "192.168.0.0/16")); + assertFalse(r.accept(ip("127.0.0.1"))); + assertFalse(r.accept(ip("192.168.5.5"))); + assertTrue(r.accept(ip("172.32.0.1"))); + } + + @Test + void includeRestrictsToAllowedRangesOnly() { + IPFilterRules r = rules("10.0.0.0/8", null); + assertTrue(r.accept(ip("10.1.2.3"))); + assertFalse(r.accept(ip("8.8.8.8"))); + assertFalse(r.accept(ip("127.0.0.1"))); + } + + @Test + void excludeTakesPrecedenceOverInclude() { + IPFilterRules r = rules("sitelocal", "192.168.0.0/16"); + assertTrue(r.accept(ip("10.0.0.1"))); + 
assertFalse(r.accept(ip("192.168.0.1"))); + } + + @Test + void invalidRuleIsIgnored() { + IPFilterRules r = rules(null, "not-an-ip,localhost"); + assertFalse(r.accept(ip("127.0.0.1"))); + assertTrue(r.accept(ip("8.8.8.8"))); + } + + @Test + void nullAddressIsRejectedWhenRulesConfigured() { + IPFilterRules r = rules(null, "localhost"); + assertFalse(r.accept(null)); + } +} diff --git a/docs/src/main/asciidoc/configuration.adoc b/docs/src/main/asciidoc/configuration.adoc index 479cfd506..1e409416f 100644 --- a/docs/src/main/asciidoc/configuration.adoc +++ b/docs/src/main/asciidoc/configuration.adoc @@ -255,6 +255,8 @@ header. | http.accept.language | en-us,en-gb,en;q=0.7,*;q=0.3 | HTTP Accept-Language header. | http.content.partial.as.trimmed | false | Accepts partially fetched content in OKHTTP. +| http.filter.ipaddress.include | - | (OkHttp only) Comma-separated list (or YAML list) of allowed IP ranges. If empty, all addresses are allowed unless excluded. See <>. +| http.filter.ipaddress.exclude | - | (OkHttp only) Comma-separated list (or YAML list) of blocked IP ranges. See <>. | http.trust.everything | true | If true, trust all SSL/TLS connections. | navigationfilters.config.file | - | JSON config for NavigationFilter (used by the Selenium protocol module). | selenium.addresses | - | WebDriver server addresses. @@ -268,6 +270,39 @@ header. | topology.message.timeout.secs | -1 | OKHTTP message timeout. |=== +[[IP Address Filtering]] +===== IP Address Filtering + +The OkHttp protocol implementation (`org.apache.stormcrawler.protocol.okhttp.HttpProtocol`) can +optionally limit or block connections to IP address ranges. Because a DNS entry of an arbitrary host +name may resolve to a private or loopback address, filtering URLs is not sufficient to prevent +information leakage to a public search index or web archive. The filtering therefore happens at the +protocol level, once the IP address is known. 
+ +Two properties control the behaviour: + +* `http.filter.ipaddress.include` — defines the allowed IP ranges. If empty, all addresses are +allowed unless explicitly excluded. +* `http.filter.ipaddress.exclude` — defines the blocked IP ranges. An excluded address is always +blocked, even if it also matches an include rule. + +Each property accepts a comma-separated string or a YAML list of rules. A rule can be: + +* a single IP address, e.g. `127.0.0.1` or `::1` +* a CIDR block, e.g. `192.168.0.0/16` or `fd00::/8` +* `localhost` or `loopback` — matches any loopback address (`InetAddress.isLoopbackAddress()`) +* `sitelocal` — matches any site-local address (`InetAddress.isSiteLocalAddress()`) + +For example, to block crawling of localhost and private address spaces: + +[source,yaml] +---- +http.filter.ipaddress.exclude: "localhost,sitelocal" +---- + +When a connection to a blocked address is attempted, the fetch fails with an `IOException` and a +warning is logged. If neither property is set, no IP filtering is performed. + ==== Indexing The values below are used by sub-classes of `AbstractIndexerBolt`. 
From 196e30691878c50fa0c8ac5a6528718990b5e12a Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Sun, 14 Jun 2026 19:46:23 +0200 Subject: [PATCH 2/3] 1107 Fix google-java-format formatting of CIDR and IPFilterRules javadoc --- .../java/org/apache/stormcrawler/protocol/okhttp/CIDR.java | 5 ++--- .../apache/stormcrawler/protocol/okhttp/IPFilterRules.java | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/CIDR.java b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/CIDR.java index 0948e0f9c..22abe438e 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/CIDR.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/CIDR.java @@ -21,9 +21,8 @@ import java.net.InetAddress; /** - * Parse a CIDR block notation and test - * whether an IP address is contained in the subnet range defined by the CIDR. + * Parse a CIDR block + * notation and test whether an IP address is contained in the subnet range defined by the CIDR. */ public class CIDR { diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRules.java b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRules.java index d3bc73f9b..6c83a1cb8 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRules.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRules.java @@ -45,8 +45,8 @@ *

From 06b20d0419e15e7ed7c1227ec866b32e4eb6cb46 Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Mon, 15 Jun 2026 19:31:48 +0200 Subject: [PATCH 3/3] 1107 Move CIDR/IPFilterRules to protocol package; fix CIDR mask handling Move CIDR and IPFilterRules from the okhttp package to org.apache.stormcrawler.protocol so they can be reused by other protocols (review feedback). Fix CIDR byte mask wrapping for /32, /128 and IPv6 prefixes >= 32 (Java shifts mod 32) and reject out-of-range masks. Add a NOTE that IP filtering is bypassed for proxied fetches. --- .../protocol/{okhttp => }/CIDR.java | 10 ++++++++-- .../protocol/{okhttp => }/IPFilterRules.java | 2 +- .../protocol/okhttp/HttpProtocol.java | 5 +++++ .../protocol/{okhttp => }/CIDRTest.java | 19 ++++++++++++++++++- .../{okhttp => }/IPFilterRulesTest.java | 2 +- docs/src/main/asciidoc/configuration.adoc | 4 ++++ 6 files changed, 37 insertions(+), 5 deletions(-) rename core/src/main/java/org/apache/stormcrawler/protocol/{okhttp => }/CIDR.java (85%) rename core/src/main/java/org/apache/stormcrawler/protocol/{okhttp => }/IPFilterRules.java (99%) rename core/src/test/java/org/apache/stormcrawler/protocol/{okhttp => }/CIDRTest.java (77%) rename core/src/test/java/org/apache/stormcrawler/protocol/{okhttp => }/IPFilterRulesTest.java (98%) diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/CIDR.java b/core/src/main/java/org/apache/stormcrawler/protocol/CIDR.java similarity index 85% rename from core/src/main/java/org/apache/stormcrawler/protocol/okhttp/CIDR.java rename to core/src/main/java/org/apache/stormcrawler/protocol/CIDR.java index 22abe438e..60174bec0 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/CIDR.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/CIDR.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.stormcrawler.protocol.okhttp; +package org.apache.stormcrawler.protocol; import com.google.common.net.InetAddresses; import java.net.InetAddress; @@ -52,6 +52,11 @@ public CIDR(String cidr) throws IllegalArgumentException { // need to shift the mask parsedMask = Math.max(0, parsedMask - 96); } + int maxMask = addr.getAddress().length * 8; + if (parsedMask < 0 || parsedMask > maxMask) { + throw new IllegalArgumentException( + "Invalid CIDR mask /" + parsedMask + " for " + ipStr); + } this.mask = parsedMask; } @@ -67,7 +72,8 @@ public boolean contains(InetAddress address) { if (remainingMaskBits <= 0) { return true; } - int m = ~(0xff >> remainingMaskBits); // mask for byte under cursor + // keep the mask within one byte so the shift can't wrap (Java shifts mod 32) + int m = remainingMaskBits >= 8 ? 0xff : (0xff << (8 - remainingMaskBits)) & 0xff; if ((addr0[i] & m) != (addr1[i] & m)) { return false; } diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRules.java b/core/src/main/java/org/apache/stormcrawler/protocol/IPFilterRules.java similarity index 99% rename from core/src/main/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRules.java rename to core/src/main/java/org/apache/stormcrawler/protocol/IPFilterRules.java index 6c83a1cb8..d9888d25d 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRules.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/IPFilterRules.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.stormcrawler.protocol.okhttp; +package org.apache.stormcrawler.protocol; import java.net.InetAddress; import java.util.ArrayList; diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java index 89a46f727..ba60d7f99 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java @@ -73,6 +73,7 @@ import org.apache.stormcrawler.Constants; import org.apache.stormcrawler.Metadata; import org.apache.stormcrawler.protocol.AbstractHttpProtocol; +import org.apache.stormcrawler.protocol.IPFilterRules; import org.apache.stormcrawler.protocol.ProtocolResponse; import org.apache.stormcrawler.protocol.ProtocolResponse.TrimmedContentReason; import org.apache.stormcrawler.proxy.SCProxy; @@ -551,6 +552,10 @@ private byte[] toByteArray( * Network interceptor blocking connections to IP addresses rejected by the configured {@link * IPFilterRules}. The IP address is only known once the connection has been established, hence * the filtering happens at the protocol level rather than by filtering URLs. + * + *

Note that when a proxy is configured the connection is established to the proxy, so the + * filter sees the proxy's IP address rather than the target host's resolved address; IP + * filtering is therefore effectively disabled for proxied fetches. */ static class HTTPFilterIPAddressInterceptor implements Interceptor { diff --git a/core/src/test/java/org/apache/stormcrawler/protocol/okhttp/CIDRTest.java b/core/src/test/java/org/apache/stormcrawler/protocol/CIDRTest.java similarity index 77% rename from core/src/test/java/org/apache/stormcrawler/protocol/okhttp/CIDRTest.java rename to core/src/test/java/org/apache/stormcrawler/protocol/CIDRTest.java index aaa5f4f1e..effa28435 100644 --- a/core/src/test/java/org/apache/stormcrawler/protocol/okhttp/CIDRTest.java +++ b/core/src/test/java/org/apache/stormcrawler/protocol/CIDRTest.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.stormcrawler.protocol.okhttp; +package org.apache.stormcrawler.protocol; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertThrows; @@ -36,6 +36,23 @@ void singleIPv4AddressMatchesOnlyItself() { CIDR cidr = new CIDR("127.0.0.1"); assertTrue(cidr.contains(ip("127.0.0.1"))); assertFalse(cidr.contains(ip("127.0.0.2"))); + // a /32 must compare every byte, including the first one + assertFalse(cidr.contains(ip("1.0.0.1"))); + } + + @Test + void singleIPv6AddressMatchesOnlyItself() { + CIDR cidr = new CIDR("::1"); + assertTrue(cidr.contains(ip("::1"))); + // a /128 must compare every byte, including the first one + assertFalse(cidr.contains(ip("fd00::1"))); + } + + @Test + void outOfRangeMaskThrows() { + assertThrows(IllegalArgumentException.class, () -> new CIDR("10.0.0.0/-1")); + assertThrows(IllegalArgumentException.class, () -> new CIDR("10.0.0.0/33")); + assertThrows(IllegalArgumentException.class, () -> new CIDR("::1/129")); } @Test diff --git 
a/core/src/test/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRulesTest.java b/core/src/test/java/org/apache/stormcrawler/protocol/IPFilterRulesTest.java similarity index 98% rename from core/src/test/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRulesTest.java rename to core/src/test/java/org/apache/stormcrawler/protocol/IPFilterRulesTest.java index 6adad523d..109529ce0 100644 --- a/core/src/test/java/org/apache/stormcrawler/protocol/okhttp/IPFilterRulesTest.java +++ b/core/src/test/java/org/apache/stormcrawler/protocol/IPFilterRulesTest.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.stormcrawler.protocol.okhttp; +package org.apache.stormcrawler.protocol; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; diff --git a/docs/src/main/asciidoc/configuration.adoc b/docs/src/main/asciidoc/configuration.adoc index 1e409416f..ab3c0a176 100644 --- a/docs/src/main/asciidoc/configuration.adoc +++ b/docs/src/main/asciidoc/configuration.adoc @@ -303,6 +303,10 @@ http.filter.ipaddress.exclude: "localhost,sitelocal" When a connection to a blocked address is attempted, the fetch fails with an `IOException` and a warning is logged. If neither property is set, no IP filtering is performed. +NOTE: When a proxy is configured, the connection is established to the proxy and the filter sees the +proxy's IP address rather than the target host's resolved address. IP filtering is therefore +effectively disabled for proxied fetches. + ==== Indexing The values below are used by sub-classes of `AbstractIndexerBolt`.