Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions core/src/main/java/org/apache/stormcrawler/protocol/CIDR.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.stormcrawler.protocol;

import com.google.common.net.InetAddresses;
import java.net.InetAddress;

/**
* Parse a <a href= "https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing">CIDR</a> block
* notation and test whether an IP address is contained in the subnet range defined by the CIDR.
*/
public class CIDR {

private final InetAddress addr;
private final int mask;

public CIDR(InetAddress address, int mask) {
this.addr = address;
this.mask = mask;
}

public CIDR(String cidr) throws IllegalArgumentException {
String ipStr = cidr;
int sep = cidr.indexOf('/');
if (sep > -1) {
ipStr = cidr.substring(0, sep);
}
addr = InetAddresses.forString(ipStr);
int parsedMask;
if (sep > -1) {
parsedMask = Integer.parseInt(cidr.substring(sep + 1));
} else {
parsedMask = addr.getAddress().length * 8;
}
if (cidr.indexOf(':') > -1 && addr.getAddress().length == 4) {
// IPv4-mapped IPv6 addresses are automatically converted to IPv4,
// need to shift the mask
parsedMask = Math.max(0, parsedMask - 96);
}
int maxMask = addr.getAddress().length * 8;
if (parsedMask < 0 || parsedMask > maxMask) {
throw new IllegalArgumentException(
"Invalid CIDR mask /" + parsedMask + " for " + ipStr);
}
this.mask = parsedMask;
Comment thread
rzo1 marked this conversation as resolved.
}

public boolean contains(InetAddress address) {
byte[] addr0 = addr.getAddress();
byte[] addr1 = address.getAddress();
if (addr0.length != addr1.length) {
// not comparing IPv4 and IPv6 addresses
return false;
}
for (int i = 0; i < addr0.length; i++) {
int remainingMaskBits = mask - (i * 8);
if (remainingMaskBits <= 0) {
return true;
}
// keep the mask within one byte so the shift can't wrap (Java shifts mod 32)
int m = remainingMaskBits >= 8 ? 0xff : (0xff << (8 - remainingMaskBits)) & 0xff;
if ((addr0[i] & m) != (addr1[i] & m)) {
return false;
}
}
return true;
}

@Override
public String toString() {
return addr + "/" + mask;
}
}
137 changes: 137 additions & 0 deletions core/src/main/java/org/apache/stormcrawler/protocol/IPFilterRules.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.stormcrawler.protocol;

import java.net.InetAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.function.Predicate;
import org.apache.commons.lang3.StringUtils;
import org.apache.stormcrawler.util.ConfUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Optional filter to limit or block connections to IP address ranges: localhost/loopback or
 * site-local addresses, subnet ranges given in CIDR notation, or single IP addresses.
 *
 * <p>The rules are read from two configuration properties:
 *
 * <ul>
 *   <li><code>http.filter.ipaddress.include</code> lists the allowed IP ranges. If it is not
 *       defined or empty, every IP address is allowed unless it is explicitly excluded.
 *   <li><code>http.filter.ipaddress.exclude</code> lists the IP ranges which are always rejected.
 * </ul>
 *
 * <p>An IP range is one of
 *
 * <ul>
 *   <li>an IP address, e.g. <code>127.0.0.1</code> or <code>::1</code> (IPv6)
 *   <li>a block in CIDR notation, e.g. <code>192.168.0.0/16</code> or <code>fd00::/8</code>
 *   <li><code>localhost</code> or <code>loopback</code>: all IP addresses for which {@link
 *       InetAddress#isLoopbackAddress()} is true
 *   <li><code>sitelocal</code>: all IP addresses for which {@link
 *       InetAddress#isSiteLocalAddress()} is true
 * </ul>
 *
 * <p>Multiple IP ranges are given either as a comma-separated string, e.g. <code>
 * loopback,sitelocal,fd00::/8</code>, or as a list in the configuration.
 *
 * <p>A range which cannot be parsed is logged and ignored. Note that an include property holding
 * only unparsable ranges therefore behaves like an empty one.
 */
public class IPFilterRules {

    protected static final Logger LOG = LoggerFactory.getLogger(IPFilterRules.class);

    public static final String INCLUDE_RULES_KEY = "http.filter.ipaddress.include";
    public static final String EXCLUDE_RULES_KEY = "http.filter.ipaddress.exclude";

    private final List<Predicate<InetAddress>> includeRules;
    private final List<Predicate<InetAddress>> excludeRules;

    /**
     * Build the include and exclude rules from the configuration.
     *
     * @param conf configuration, read for {@link #INCLUDE_RULES_KEY} and {@link
     *     #EXCLUDE_RULES_KEY}
     */
    public IPFilterRules(Map<String, Object> conf) {
        this.includeRules = parseIPRules(conf, INCLUDE_RULES_KEY);
        this.excludeRules = parseIPRules(conf, EXCLUDE_RULES_KEY);
    }

    /**
     * @return true if neither include nor exclude rules are configured
     */
    public boolean isEmpty() {
        return excludeRules.isEmpty() && includeRules.isEmpty();
    }

    /**
     * Decide whether a connection to the given IP address is allowed.
     *
     * @param address the address to check
     * @return false if the address is null, if include rules exist and none of them matches, or if
     *     any exclude rule matches; true otherwise
     */
    public boolean accept(InetAddress address) {
        if (address == null) {
            return false;
        }
        if (!includeRules.isEmpty() && !matchesAny(includeRules, address)) {
            // include rules are defined but the address is not covered by any of them
            return false;
        }
        return !matchesAny(excludeRules, address);
    }

    /** Test whether at least one of the rules applies to the address. */
    private static boolean matchesAny(List<Predicate<InetAddress>> rules, InetAddress address) {
        return rules.stream().anyMatch(rule -> rule.test(address));
    }

    /**
     * Read the values of a configuration property and convert them into rules.
     *
     * @param conf configuration
     * @param ipRuleProperty name of the property holding the IP ranges
     * @return the parsed rules in the order they are defined, empty if there are none
     */
    private static List<Predicate<InetAddress>> parseIPRules(
            Map<String, Object> conf, String ipRuleProperty) {
        final List<Predicate<InetAddress>> parsed = new ArrayList<>();
        for (String confValue : ConfUtils.loadListFromConf(ipRuleProperty, conf)) {
            // every value of the property can be a comma-separated list of rules
            for (String token : confValue.split(",")) {
                final String ipRule = token.trim();
                if (StringUtils.isBlank(ipRule)) {
                    continue;
                }
                final Predicate<InetAddress> rule = toRule(ipRule, ipRuleProperty);
                if (rule != null) {
                    parsed.add(rule);
                }
            }
        }
        if (!parsed.isEmpty()) {
            LOG.info("Found {} IP filter rule(s) for {}", parsed.size(), ipRuleProperty);
        }
        return parsed;
    }

    /**
     * Convert a single, trimmed and non-blank rule into a predicate.
     *
     * @param ipRule keyword (case-insensitive), IP address or CIDR block
     * @param ipRuleProperty name of the property the rule stems from, used for logging only
     * @return the predicate, or null if the rule could not be parsed (an error is logged)
     */
    private static Predicate<InetAddress> toRule(String ipRule, String ipRuleProperty) {
        final String keyword = ipRule.toLowerCase(Locale.ROOT);
        if ("localhost".equals(keyword) || "loopback".equals(keyword)) {
            return InetAddress::isLoopbackAddress;
        }
        if ("sitelocal".equals(keyword)) {
            return InetAddress::isSiteLocalAddress;
        }
        try {
            final CIDR block = new CIDR(ipRule);
            return block::contains;
        } catch (IllegalArgumentException e) {
            LOG.error(
                    "Failed to parse {} as CIDR, ignoring it while configuring IP rules ({})",
                    ipRule,
                    ipRuleProperty);
            return null;
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Proxy;
Expand Down Expand Up @@ -72,6 +73,7 @@
import org.apache.stormcrawler.Constants;
import org.apache.stormcrawler.Metadata;
import org.apache.stormcrawler.protocol.AbstractHttpProtocol;
import org.apache.stormcrawler.protocol.IPFilterRules;
import org.apache.stormcrawler.protocol.ProtocolResponse;
import org.apache.stormcrawler.protocol.ProtocolResponse.TrimmedContentReason;
import org.apache.stormcrawler.proxy.SCProxy;
Expand Down Expand Up @@ -226,6 +228,14 @@ public void configure(Config conf) {

customHeaders.forEach(customRequestHeaders::add);

// optionally block connections to forbidden IP address ranges
// (e.g. localhost/loopback, private/site-local addresses), see
// https://github.com/apache/stormcrawler/issues/1107
final IPFilterRules ipFilterRules = new IPFilterRules(conf);
if (!ipFilterRules.isEmpty()) {
builder.addNetworkInterceptor(new HTTPFilterIPAddressInterceptor(ipFilterRules));
}

if (storeHttpHeaders) {
builder.addNetworkInterceptor(new HTTPHeadersInterceptor());
}
Expand Down Expand Up @@ -538,6 +548,40 @@ private byte[] toByteArray(
return arr;
}

/**
* Network interceptor blocking connections to IP addresses rejected by the configured {@link
* IPFilterRules}. The IP address is only known once the connection has been established, hence
* the filtering happens at the protocol level rather than by filtering URLs.
*
* <p>Note that when a proxy is configured the connection is established to the proxy, so the
* filter sees the proxy's IP address rather than the target host's resolved address; IP
* filtering is therefore effectively disabled for proxied fetches.
*/
static class HTTPFilterIPAddressInterceptor implements Interceptor {

private final IPFilterRules rules;

HTTPFilterIPAddressInterceptor(IPFilterRules rules) {
this.rules = rules;
}

@NotNull
@Override
public Response intercept(Interceptor.Chain chain) throws IOException {
final Connection connection = Objects.requireNonNull(chain.connection());
final InetAddress address = connection.socket().getInetAddress();
Comment thread
rzo1 marked this conversation as resolved.
final Request request = chain.request();

if (rules.accept(address)) {
return chain.proceed(request);
}

final String hostAddress = address == null ? "unknown" : address.getHostAddress();
LOG.warn("Blocked connection to IP address {}: {}", hostAddress, request.url());
throw new IOException("Forbidden connection to IP address " + hostAddress);
}
}

static class HTTPHeadersInterceptor implements Interceptor {

private String getNormalizedProtocolName(Protocol protocol) {
Expand Down
14 changes: 14 additions & 0 deletions core/src/main/resources/crawler-default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,20 @@ config:
# Follow redirect HTTP responses:
http.allow.redirects: false

# IP address filtering (okhttp protocol only). Optionally limit or block
# connections to IP address ranges once the host name has been resolved. This
# prevents information leakage to a public index when a DNS entry points to a
# private or loopback address. Rules can be given as a comma-separated string
# or as a list and may be:
# - a single IP address, e.g. "127.0.0.1" or "::1"
# - a CIDR block, e.g. "192.168.0.0/16" or "fd00::/8"
# - "localhost" / "loopback" (matches InetAddress.isLoopbackAddress())
# - "sitelocal" (matches InetAddress.isSiteLocalAddress())
# Only addresses matching an include rule are fetched (if no include rule is
# defined, all addresses are allowed). Addresses matching an exclude rule are
# always blocked, even if they also match an include rule.
# http.filter.ipaddress.include:
# http.filter.ipaddress.exclude: "localhost,sitelocal"

# Allow all if robots.txt cannot be parsed due to code 403 (Forbidden):
http.robots.403.allow: true

Expand Down
Loading