diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/CIDR.java b/core/src/main/java/org/apache/stormcrawler/protocol/CIDR.java
new file mode 100644
index 000000000..60174bec0
--- /dev/null
+++ b/core/src/main/java/org/apache/stormcrawler/protocol/CIDR.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stormcrawler.protocol;
+
+import com.google.common.net.InetAddresses;
+import java.net.InetAddress;
+
+/**
+ * Parses a CIDR block notation and tests whether an IP address is contained in the subnet range
+ * defined by the CIDR.
+ *
+ * <p>A CIDR given without a {@code /mask} suffix denotes a single address. Addresses of a
+ * different family (IPv4 vs. IPv6) are never contained.
+ */
+public class CIDR {
+
+    private final InetAddress addr;
+    private final int mask;
+
+    /**
+     * Creates a CIDR from a network address and a prefix length. The values are stored as
+     * given, without validation.
+     *
+     * @param address network address of the block
+     * @param mask prefix length in bits
+     */
+    public CIDR(InetAddress address, int mask) {
+        this.addr = address;
+        this.mask = mask;
+    }
+
+    /**
+     * Parses a CIDR given as {@code address} or {@code address/mask}.
+     *
+     * @param cidr IP address literal, optionally followed by {@code /} and the prefix length
+     * @throws IllegalArgumentException if the address or the prefix length is invalid
+     */
+    public CIDR(String cidr) throws IllegalArgumentException {
+        final int sep = cidr.indexOf('/');
+        final String ipStr = sep > -1 ? cidr.substring(0, sep) : cidr;
+        addr = InetAddresses.forString(ipStr);
+        final int addrBits = addr.getAddress().length * 8;
+        if (sep > -1) {
+            this.mask = parseMask(cidr.substring(sep + 1), ipStr, addrBits);
+        } else {
+            // no explicit mask: the CIDR denotes exactly one address. This also holds for
+            // IPv4-mapped IPv6 literals, which must not be shifted down to a match-all /0.
+            this.mask = addrBits;
+        }
+    }
+
+    /** Parses and validates the prefix length written after the {@code /} separator. */
+    private static int parseMask(String maskStr, String ipStr, int addrBits) {
+        final int parsedMask = Integer.parseInt(maskStr);
+        // IPv4-mapped IPv6 addresses are automatically converted to IPv4, but their mask is
+        // written for the 128-bit notation
+        final boolean ipv4Mapped = ipStr.indexOf(':') > -1 && addrBits == 32;
+        final int maxMask = ipv4Mapped ? 128 : addrBits;
+        // validate the mask as written, before shifting, so that negative values are rejected
+        if (parsedMask < 0 || parsedMask > maxMask) {
+            throw new IllegalArgumentException(
+                    "Invalid CIDR mask /" + parsedMask + " for " + ipStr);
+        }
+        // drop the 96 leading bits of the mapped notation; shorter masks cover all of IPv4
+        return ipv4Mapped ? Math.max(0, parsedMask - 96) : parsedMask;
+    }
+
+    /**
+     * Tests whether an address lies within this CIDR block.
+     *
+     * @param address address to test, may be {@code null}
+     * @return {@code true} if the address is of the same family and shares the masked prefix,
+     *     {@code false} otherwise, including for {@code null}
+     */
+    public boolean contains(InetAddress address) {
+        if (address == null) {
+            return false;
+        }
+        byte[] addr0 = addr.getAddress();
+        byte[] addr1 = address.getAddress();
+        if (addr0.length != addr1.length) {
+            // not comparing IPv4 and IPv6 addresses
+            return false;
+        }
+        for (int i = 0; i < addr0.length; i++) {
+            int remainingMaskBits = mask - (i * 8);
+            if (remainingMaskBits <= 0) {
+                return true;
+            }
+            // keep the mask within one byte so the shift can't wrap (Java shifts mod 32)
+            int m = remainingMaskBits >= 8 ? 0xff : (0xff << (8 - remainingMaskBits)) & 0xff;
+            if ((addr0[i] & m) != (addr1[i] & m)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /** Returns the block as {@code address/mask}, using the textual IP address. */
+    @Override
+    public String toString() {
+        // InetAddress#toString() yields "hostname/literal", i.e. a stray leading slash here
+        return (addr == null ? "null" : addr.getHostAddress()) + "/" + mask;
+    }
+}
diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/IPFilterRules.java b/core/src/main/java/org/apache/stormcrawler/protocol/IPFilterRules.java
new file mode 100644
index 000000000..d9888d25d
--- /dev/null
+++ b/core/src/main/java/org/apache/stormcrawler/protocol/IPFilterRules.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stormcrawler.protocol;
+
+import java.net.InetAddress;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.function.Predicate;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.stormcrawler.util.ConfUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Optionally limit or block connections to IP address ranges (localhost/loopback or site-local
+ * addresses, subnet ranges given in CIDR notation, or single IP addresses).
+ *
+ *
+ * <p>IP filter rules are built from two configuration properties:
+ *
+ * <ul>
+ *   <li><code>http.filter.ipaddress.include</code> defines all allowed IP ranges. If not defined
+ *       or empty, all IP addresses (that are not explicitly excluded) are allowed.
+ *   <li><code>http.filter.ipaddress.exclude</code> defines excluded IP address ranges.
+ * </ul>
+ *
+ * <p>IP ranges can be defined as
+ *
+ * <ul>
+ *   <li><code>127.0.0.1</code> or <code>::1</code> (IPv6)
+ *   <li><code>192.168.0.0/16</code> or <code>fd00::/8</code>
+ *   <li><code>localhost</code> or <code>loopback</code> applies to all IP addresses for which
+ *       {@link InetAddress#isLoopbackAddress()} is true
+ *   <li><code>sitelocal</code> applies to all IP addresses for which {@link
+ *       InetAddress#isSiteLocalAddress()} is true
+ * </ul>
+ *
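+ * <p>Multiple IP ranges can be given either as a comma-separated string, e.g.
+ * <code>loopback,sitelocal,fd00::/8</code>, or as a list in the configuration.
+ */
+
+    /**
+     * Interceptor that rejects connections to remote IP addresses which are not accepted by the
+     * configured {@link IPFilterRules}.
+     *
+     * <p>Note that when a proxy is configured, the connection is established to the proxy, so the
+     * filter sees the proxy's IP address rather than the target host's resolved address; IP
+     * filtering is therefore effectively disabled for proxied fetches.
+     */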
+    static class HTTPFilterIPAddressInterceptor implements Interceptor {
+
+        private final IPFilterRules rules;
+
+        /**
+         * @param rules filter rules deciding which remote IP addresses may be contacted
+         */
+        HTTPFilterIPAddressInterceptor(IPFilterRules rules) {
+            this.rules = rules;
+        }
+
+        /**
+         * Lets the request proceed only if the remote address of the connection is accepted by
+         * the filter rules.
+         *
+         * @param chain interceptor chain providing the connection and the request
+         * @return the response of the remaining chain if the connection is allowed
+         * @throws IOException if the remote IP address is unknown or rejected by the rules
+         * @throws NullPointerException if the chain provides no connection
+         */
+        @NotNull
+        @Override
+        public Response intercept(Interceptor.Chain chain) throws IOException {
+            // OkHttp only exposes the connection in the chain of network interceptors
+            final Connection connection =
+                    Objects.requireNonNull(
+                            chain.connection(), "no connection, not a network interceptor?");
+            final InetAddress address = connection.socket().getInetAddress();
+            final Request request = chain.request();
+
+            // fail closed: an unknown remote address is never handed to the rules
+            if (address != null && rules.accept(address)) {
+                return chain.proceed(request);
+            }
+
+            final String hostAddress = address == null ? "unknown" : address.getHostAddress();
+            LOG.warn("Blocked connection to IP address {}: {}", hostAddress, request.url());
+            throw new IOException("Forbidden connection to IP address " + hostAddress);
+        }
+    }
+
static class HTTPHeadersInterceptor implements Interceptor {
private String getNormalizedProtocolName(Protocol protocol) {
diff --git a/core/src/main/resources/crawler-default.yaml b/core/src/main/resources/crawler-default.yaml
index d024ed941..d45cbc6b6 100644
--- a/core/src/main/resources/crawler-default.yaml
+++ b/core/src/main/resources/crawler-default.yaml
@@ -144,6 +144,20 @@ config:
# Follow redirect HTTP responses:
http.allow.redirects: false
+ # IP address filtering (okhttp protocol only). Optionally limit or block
+ # connections to IP address ranges once the host name has been resolved. This
+ # prevents information leakage to a public index when a DNS entry points to a
+ # private or loopback address. Rules can be given as a comma-separated string
+ # or as a list and may be:
+ # - a single IP address, e.g. "127.0.0.1" or "::1"
+ # - a CIDR block, e.g. "192.168.0.0/16" or "fd00::/8"
+ # - "localhost" / "loopback" (matches InetAddress.isLoopbackAddress())
+ # - "sitelocal" (matches InetAddress.isSiteLocalAddress())
+ # Only addresses matching an include rule are fetched (an empty include list
+ # allows all); addresses matching an exclude rule are always blocked.
+ # http.filter.ipaddress.include:
+ # http.filter.ipaddress.exclude: "localhost,sitelocal"
+
# Allow all if robots.txt cannot be parsed due to code 403 (Forbidden):
http.robots.403.allow: true
diff --git a/core/src/test/java/org/apache/stormcrawler/protocol/CIDRTest.java b/core/src/test/java/org/apache/stormcrawler/protocol/CIDRTest.java
new file mode 100644
index 000000000..effa28435
--- /dev/null
+++ b/core/src/test/java/org/apache/stormcrawler/protocol/CIDRTest.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stormcrawler.protocol;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import com.google.common.net.InetAddresses;
+import java.net.InetAddress;
+import org.junit.jupiter.api.Test;
+
+/** Tests parsing of CIDR notations and the subnet containment check of {@link CIDR}. */
+class CIDRTest {
+
+    /** Parses an IP address literal into an {@link InetAddress}. */
+    private static InetAddress ip(String address) {
+        return InetAddresses.forString(address);
+    }
+
+    @Test
+    void singleIPv4AddressMatchesOnlyItself() {
+        CIDR cidr = new CIDR("127.0.0.1");
+        assertTrue(cidr.contains(ip("127.0.0.1")));
+        assertFalse(cidr.contains(ip("127.0.0.2")));
+        // a /32 must compare every byte, including the first one
+        assertFalse(cidr.contains(ip("1.0.0.1")));
+    }
+
+    @Test
+    void singleIPv6AddressMatchesOnlyItself() {
+        CIDR cidr = new CIDR("::1");
+        assertTrue(cidr.contains(ip("::1")));
+        // a /128 must compare every byte, including the first one
+        assertFalse(cidr.contains(ip("fd00::1")));
+    }
+
+    @Test
+    void outOfRangeMaskThrows() {
+        assertThrows(IllegalArgumentException.class, () -> new CIDR("10.0.0.0/-1"));
+        assertThrows(IllegalArgumentException.class, () -> new CIDR("10.0.0.0/33"));
+        assertThrows(IllegalArgumentException.class, () -> new CIDR("::1/129"));
+    }
+
+    @Test
+    void nonNumericMaskThrows() {
+        // Integer.parseInt throws NumberFormatException, a subclass of IllegalArgumentException
+        assertThrows(IllegalArgumentException.class, () -> new CIDR("10.0.0.0/abc"));
+        assertThrows(IllegalArgumentException.class, () -> new CIDR("10.0.0.0/"));
+    }
+
+    @Test
+    void ipv4CidrBlockMatchesAddressesInRange() {
+        CIDR cidr = new CIDR("192.168.0.0/16");
+        assertTrue(cidr.contains(ip("192.168.0.1")));
+        assertTrue(cidr.contains(ip("192.168.255.255")));
+        assertFalse(cidr.contains(ip("192.169.0.1")));
+        assertFalse(cidr.contains(ip("10.0.0.1")));
+    }
+
+    @Test
+    void ipv4MaskNotAlignedToByteBoundary() {
+        // a /12 ends inside the second byte and exercises the partial-byte mask
+        CIDR cidr = new CIDR("10.16.0.0/12");
+        assertTrue(cidr.contains(ip("10.16.0.1")));
+        assertTrue(cidr.contains(ip("10.31.255.255")));
+        assertFalse(cidr.contains(ip("10.15.255.255")));
+        assertFalse(cidr.contains(ip("10.32.0.0")));
+    }
+
+    @Test
+    void ipv6CidrBlockMatchesAddressesInRange() {
+        CIDR cidr = new CIDR("fd00::/8");
+        assertTrue(cidr.contains(ip("fd00::1")));
+        assertTrue(cidr.contains(ip("fdff::ffff")));
+        assertFalse(cidr.contains(ip("fc00::1")));
+    }
+
+    @Test
+    void ipv6MaskNotAlignedToByteBoundary() {
+        CIDR cidr = new CIDR("fe80::/10");
+        assertTrue(cidr.contains(ip("fe80::1")));
+        assertTrue(cidr.contains(ip("febf::1")));
+        assertFalse(cidr.contains(ip("fec0::1")));
+    }
+
+    @Test
+    void zeroMaskMatchesAllAddressesOfSameFamily() {
+        CIDR cidr = new CIDR("0.0.0.0/0");
+        assertTrue(cidr.contains(ip("8.8.8.8")));
+        assertTrue(cidr.contains(ip("192.168.1.1")));
+        assertFalse(cidr.contains(ip("::1")));
+    }
+
+    @Test
+    void ipv4AndIpv6AreNotComparable() {
+        CIDR cidr = new CIDR("192.168.0.0/16");
+        assertFalse(cidr.contains(ip("::1")));
+    }
+
+    @Test
+    void invalidCidrThrows() {
+        assertThrows(IllegalArgumentException.class, () -> new CIDR("not-an-ip"));
+    }
+
+    @Test
+    void defaultMaskCoversFullAddress() {
+        assertTrue(new CIDR("10.0.0.1").toString().endsWith("/32"));
+    }
+}
diff --git a/core/src/test/java/org/apache/stormcrawler/protocol/IPFilterRulesTest.java b/core/src/test/java/org/apache/stormcrawler/protocol/IPFilterRulesTest.java
new file mode 100644
index 000000000..109529ce0
--- /dev/null
+++ b/core/src/test/java/org/apache/stormcrawler/protocol/IPFilterRulesTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stormcrawler.protocol;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import com.google.common.net.InetAddresses;
+import java.net.InetAddress;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.junit.jupiter.api.Test;
+
+class IPFilterRulesTest {
+
+ private static InetAddress ip(String address) {
+ return InetAddresses.forString(address);
+ }
+
+ private static IPFilterRules rules(Object include, Object exclude) {
+ Map
+ * loopback,sitelocal,fd00::/8, or as a list in the configuration.
+ */
+public class IPFilterRules {
+
+ protected static final Logger LOG = LoggerFactory.getLogger(IPFilterRules.class);
+
+ public static final String INCLUDE_RULES_KEY = "http.filter.ipaddress.include";
+ public static final String EXCLUDE_RULES_KEY = "http.filter.ipaddress.exclude";
+
+ private final List