-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlinux-2.6-525-sknid-elevator.patch
More file actions
235 lines (218 loc) · 7.68 KB
/
linux-2.6-525-sknid-elevator.patch
File metadata and controls
235 lines (218 loc) · 7.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6bf49e9..1142baf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1726,6 +1726,7 @@ struct napi_gro_cb {
struct packet_type {
__be16 type; /* This is really htons(ether_type). */
struct net_device *dev; /* NULL is wildcarded here */
+ unsigned char sknid_elevator;
int (*func) (struct sk_buff *,
struct net_device *,
struct packet_type *,
diff --git a/net/core/dev.c b/net/core/dev.c
index b62703a..ff729ae 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -99,6 +99,8 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
#include <linux/if_bridge.h>
#include <linux/if_macvlan.h>
#include <net/dst.h>
@@ -3258,6 +3260,12 @@ void netdev_rx_handler_unregister(struct net_device *dev)
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
+
+/* The code already makes the assumption that packet handlers run
+ * sequentially on the same CPU. -Sapan */
+DEFINE_PER_CPU(int, sknid_elevator) = 0;
+
+
int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
@@ -3266,8 +3274,11 @@ int __netif_receive_skb(struct sk_buff *skb)
struct net_device *null_or_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
+ int *cur_elevator = &__get_cpu_var(sknid_elevator);
__be16 type;
+ *cur_elevator = 0;
+
if (!skb->tstamp.tv64)
net_timestamp(skb);
@@ -3378,7 +3389,27 @@ ncls:
}
if (pt_prev) {
+ /* At this point, cur_elevator may be -2 or a positive value, in
+ * case a previous protocol handler marked it */
+ if (*cur_elevator) {
+ atomic_inc(&skb->users);
+ }
+
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+
+ if ((*cur_elevator)>0) {
+ skb->skb_tag = *cur_elevator;
+ list_for_each_entry_rcu(ptype, &ptype_all, list) {
+ if ((!ptype->dev || ptype->dev == skb->dev) && (ptype->sknid_elevator)) {
+ ret = deliver_skb(skb, ptype, orig_dev);
+ }
+ }
+ }
+
+ if (*cur_elevator) {
+ /* We have a packet */
+ kfree_skb(skb);
+ }
} else {
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
@@ -5435,6 +5466,7 @@ unsigned dev_get_flags(const struct net_device *dev)
return flags;
}
EXPORT_SYMBOL(dev_get_flags);
+EXPORT_PER_CPU_SYMBOL(sknid_elevator);
/**
* dev_change_flags - change device settings
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 4875ae4..3f5c537 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -78,6 +78,7 @@
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
+#include <linux/vs_network.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/errqueue.h>
@@ -344,12 +345,54 @@ static const struct proto_ops packet_ops;
static const struct proto_ops packet_ops_spkt;
+DECLARE_PER_CPU(int, sknid_elevator);
+
+static inline unsigned int slice_check_and_elevate(struct sk_buff *skb, struct sock *sk) {
+ /* This mechanism is quite involved, and caused us a lot of pain
+ * including crashes and packet loss during the 4.2 rollout. This
+ * function decides if a slice is allowed to see a given packet.
+ * Unfortunately, the first time it is invoked for a packet it does not
+ * have enough information to make this call, since xt_MARK has not had
+ * a chance to tag it with the slice id. There is also no way of
+ * passing state between xt_MARK and this function through a packet --
+ * because the skb gets cloned quite a few times between these two
+ * points. I'd rather not use skb_shared_info because it's treated as
+ * a blob of memory, and so it would be quite hard to maintain.
+ *
+ * What we do is to keep a global variable (per CPU) that transfers the
+ * required state between xt_MARK and af_packet.c. As an optimization,
+ * this state transfer and the step that follows is only executed for
+ * packets that first get dropped here. When we drop a packet, we mark
+ * it for 'elevation' (that's what this trick is called). When xt_MARK
+ * tags the packet with the right slice, it intercepts this mark and
+ * sets the value of sknid_elevator. Next, the packet is sent back here
+ * for a second round, this time with the xid tag set.
+ */
+
+ int *elevator=&__get_cpu_var(sknid_elevator);
+ int tag = skb->skb_tag;
+
+ if (sk->sk_nx_info && !(tag == 1 || sk->sk_nid == tag)) {
+ if (skb->pkt_type==PACKET_HOST) {
+ *elevator=-2; /* Rejecting this packet. Mark it for elevation in xt_MARK */
+ }
+ return 0;
+ }
+ else if (!sk->sk_nx_info && (*elevator>0)) {
+ /* Root has already seen this packet once, since it has been elevated */
+ return 0;
+ }
+
+ return 1;
+}
+
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
struct sock *sk;
struct sockaddr_pkt *spkt;
-
+
+
/*
* When we registered the protocol we saved the socket in the data
* field for just this event.
@@ -368,6 +411,16 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
* so that this procedure is noop.
*/
+ /*
+ * (18:05:41) daniel_hozac: where?
+ * (18:05:58) daniel_hozac: we already have filters on PF_PACKET, don't we?
+ * (18:05:58) er: in packet_rcv_skpt
+ * (18:07:33) daniel_hozac: oh, that's evil.
+ */
+
+ if (!slice_check_and_elevate(skb, sk))
+ return 0;
+
if (skb->pkt_type == PACKET_LOOPBACK)
goto out;
@@ -427,6 +480,9 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
int err;
struct flow_keys keys;
+ if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
+ return -EPERM;
+
/*
* Get and verify the address.
*/
@@ -525,11 +581,16 @@ out_unlock:
return err;
}
+
+
static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
unsigned int res)
{
struct sk_filter *filter;
+ if (!slice_check_and_elevate(skb, sk))
+ return 0;
+
rcu_read_lock_bh();
filter = rcu_dereference(sk->sk_filter);
if (filter != NULL)
@@ -1132,6 +1193,9 @@ static int packet_snd(struct socket *sock,
int offset = 0;
struct flow_keys keys;
+ if (!nx_capable(CAP_NET_RAW, NXC_RAW_SEND))
+ return -EPERM;
+
/*
* Get and verify the address.
*/
@@ -1334,6 +1398,7 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
po->num = proto;
po->prot_hook.type = proto;
+ po->prot_hook.sknid_elevator = 1;
po->prot_hook.dev = dev;
po->ifindex = dev ? dev->ifindex : 0;
@@ -1435,8 +1500,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
__be16 proto = (__force __be16)protocol; /* weird, but documented */
int err;
- if (!capable(CAP_NET_RAW))
+ if (!nx_capable(CAP_NET_RAW, NXC_RAW_SOCKET))
return -EPERM;
+
if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
sock->type != SOCK_PACKET)
return -ESOCKTNOSUPPORT;
@@ -1468,6 +1534,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
spin_lock_init(&po->bind_lock);
mutex_init(&po->pg_vec_lock);
po->prot_hook.func = packet_rcv;
+ po->prot_hook.sknid_elevator = 1;
if (sock->type == SOCK_PACKET)
po->prot_hook.func = packet_rcv_spkt;