Add the rt linux 4.1.3-rt3 as base
[kvmfornfv.git] / kernel / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58 #include <net/udp.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
72 static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
73                              struct dst_entry *dst, __be32 saddr)
74 {
75         struct dst_entry *old_dst;
76
77         dst_clone(dst);
78         old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
79         dst_release(old_dst);
80         idst->saddr = saddr;
81 }
82
83 static noinline void tunnel_dst_set(struct ip_tunnel *t,
84                            struct dst_entry *dst, __be32 saddr)
85 {
86         __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
87 }
88
89 static void tunnel_dst_reset(struct ip_tunnel *t)
90 {
91         tunnel_dst_set(t, NULL, 0);
92 }
93
94 void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
95 {
96         int i;
97
98         for_each_possible_cpu(i)
99                 __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
100 }
101 EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
102
103 static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
104                                         u32 cookie, __be32 *saddr)
105 {
106         struct ip_tunnel_dst *idst;
107         struct dst_entry *dst;
108
109         rcu_read_lock();
110         idst = raw_cpu_ptr(t->dst_cache);
111         dst = rcu_dereference(idst->dst);
112         if (dst && !atomic_inc_not_zero(&dst->__refcnt))
113                 dst = NULL;
114         if (dst) {
115                 if (!dst->obsolete || dst->ops->check(dst, cookie)) {
116                         *saddr = idst->saddr;
117                 } else {
118                         tunnel_dst_reset(t);
119                         dst_release(dst);
120                         dst = NULL;
121                 }
122         }
123         rcu_read_unlock();
124         return (struct rtable *)dst;
125 }
126
127 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
128                                 __be16 flags, __be32 key)
129 {
130         if (p->i_flags & TUNNEL_KEY) {
131                 if (flags & TUNNEL_KEY)
132                         return key == p->i_key;
133                 else
134                         /* key expected, none present */
135                         return false;
136         } else
137                 return !(flags & TUNNEL_KEY);
138 }
139
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
151 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
152                                    int link, __be16 flags,
153                                    __be32 remote, __be32 local,
154                                    __be32 key)
155 {
156         unsigned int hash;
157         struct ip_tunnel *t, *cand = NULL;
158         struct hlist_head *head;
159
160         hash = ip_tunnel_hash(key, remote);
161         head = &itn->tunnels[hash];
162
163         hlist_for_each_entry_rcu(t, head, hash_node) {
164                 if (local != t->parms.iph.saddr ||
165                     remote != t->parms.iph.daddr ||
166                     !(t->dev->flags & IFF_UP))
167                         continue;
168
169                 if (!ip_tunnel_key_match(&t->parms, flags, key))
170                         continue;
171
172                 if (t->parms.link == link)
173                         return t;
174                 else
175                         cand = t;
176         }
177
178         hlist_for_each_entry_rcu(t, head, hash_node) {
179                 if (remote != t->parms.iph.daddr ||
180                     t->parms.iph.saddr != 0 ||
181                     !(t->dev->flags & IFF_UP))
182                         continue;
183
184                 if (!ip_tunnel_key_match(&t->parms, flags, key))
185                         continue;
186
187                 if (t->parms.link == link)
188                         return t;
189                 else if (!cand)
190                         cand = t;
191         }
192
193         hash = ip_tunnel_hash(key, 0);
194         head = &itn->tunnels[hash];
195
196         hlist_for_each_entry_rcu(t, head, hash_node) {
197                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
198                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
199                         continue;
200
201                 if (!(t->dev->flags & IFF_UP))
202                         continue;
203
204                 if (!ip_tunnel_key_match(&t->parms, flags, key))
205                         continue;
206
207                 if (t->parms.link == link)
208                         return t;
209                 else if (!cand)
210                         cand = t;
211         }
212
213         if (flags & TUNNEL_NO_KEY)
214                 goto skip_key_lookup;
215
216         hlist_for_each_entry_rcu(t, head, hash_node) {
217                 if (t->parms.i_key != key ||
218                     t->parms.iph.saddr != 0 ||
219                     t->parms.iph.daddr != 0 ||
220                     !(t->dev->flags & IFF_UP))
221                         continue;
222
223                 if (t->parms.link == link)
224                         return t;
225                 else if (!cand)
226                         cand = t;
227         }
228
229 skip_key_lookup:
230         if (cand)
231                 return cand;
232
233         if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
234                 return netdev_priv(itn->fb_tunnel_dev);
235
236
237         return NULL;
238 }
239 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
240
241 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
242                                     struct ip_tunnel_parm *parms)
243 {
244         unsigned int h;
245         __be32 remote;
246         __be32 i_key = parms->i_key;
247
248         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
249                 remote = parms->iph.daddr;
250         else
251                 remote = 0;
252
253         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
254                 i_key = 0;
255
256         h = ip_tunnel_hash(i_key, remote);
257         return &itn->tunnels[h];
258 }
259
260 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
261 {
262         struct hlist_head *head = ip_bucket(itn, &t->parms);
263
264         hlist_add_head_rcu(&t->hash_node, head);
265 }
266
267 static void ip_tunnel_del(struct ip_tunnel *t)
268 {
269         hlist_del_init_rcu(&t->hash_node);
270 }
271
272 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
273                                         struct ip_tunnel_parm *parms,
274                                         int type)
275 {
276         __be32 remote = parms->iph.daddr;
277         __be32 local = parms->iph.saddr;
278         __be32 key = parms->i_key;
279         __be16 flags = parms->i_flags;
280         int link = parms->link;
281         struct ip_tunnel *t = NULL;
282         struct hlist_head *head = ip_bucket(itn, parms);
283
284         hlist_for_each_entry_rcu(t, head, hash_node) {
285                 if (local == t->parms.iph.saddr &&
286                     remote == t->parms.iph.daddr &&
287                     link == t->parms.link &&
288                     type == t->dev->type &&
289                     ip_tunnel_key_match(&t->parms, flags, key))
290                         break;
291         }
292         return t;
293 }
294
295 static struct net_device *__ip_tunnel_create(struct net *net,
296                                              const struct rtnl_link_ops *ops,
297                                              struct ip_tunnel_parm *parms)
298 {
299         int err;
300         struct ip_tunnel *tunnel;
301         struct net_device *dev;
302         char name[IFNAMSIZ];
303
304         if (parms->name[0])
305                 strlcpy(name, parms->name, IFNAMSIZ);
306         else {
307                 if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
308                         err = -E2BIG;
309                         goto failed;
310                 }
311                 strlcpy(name, ops->kind, IFNAMSIZ);
312                 strncat(name, "%d", 2);
313         }
314
315         ASSERT_RTNL();
316         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
317         if (!dev) {
318                 err = -ENOMEM;
319                 goto failed;
320         }
321         dev_net_set(dev, net);
322
323         dev->rtnl_link_ops = ops;
324
325         tunnel = netdev_priv(dev);
326         tunnel->parms = *parms;
327         tunnel->net = net;
328
329         err = register_netdevice(dev);
330         if (err)
331                 goto failed_free;
332
333         return dev;
334
335 failed_free:
336         free_netdev(dev);
337 failed:
338         return ERR_PTR(err);
339 }
340
341 static inline void init_tunnel_flow(struct flowi4 *fl4,
342                                     int proto,
343                                     __be32 daddr, __be32 saddr,
344                                     __be32 key, __u8 tos, int oif)
345 {
346         memset(fl4, 0, sizeof(*fl4));
347         fl4->flowi4_oif = oif;
348         fl4->daddr = daddr;
349         fl4->saddr = saddr;
350         fl4->flowi4_tos = tos;
351         fl4->flowi4_proto = proto;
352         fl4->fl4_gre_key = key;
353 }
354
355 static int ip_tunnel_bind_dev(struct net_device *dev)
356 {
357         struct net_device *tdev = NULL;
358         struct ip_tunnel *tunnel = netdev_priv(dev);
359         const struct iphdr *iph;
360         int hlen = LL_MAX_HEADER;
361         int mtu = ETH_DATA_LEN;
362         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
363
364         iph = &tunnel->parms.iph;
365
366         /* Guess output device to choose reasonable mtu and needed_headroom */
367         if (iph->daddr) {
368                 struct flowi4 fl4;
369                 struct rtable *rt;
370
371                 init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
372                                  iph->saddr, tunnel->parms.o_key,
373                                  RT_TOS(iph->tos), tunnel->parms.link);
374                 rt = ip_route_output_key(tunnel->net, &fl4);
375
376                 if (!IS_ERR(rt)) {
377                         tdev = rt->dst.dev;
378                         tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
379                         ip_rt_put(rt);
380                 }
381                 if (dev->type != ARPHRD_ETHER)
382                         dev->flags |= IFF_POINTOPOINT;
383         }
384
385         if (!tdev && tunnel->parms.link)
386                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
387
388         if (tdev) {
389                 hlen = tdev->hard_header_len + tdev->needed_headroom;
390                 mtu = tdev->mtu;
391         }
392
393         dev->needed_headroom = t_hlen + hlen;
394         mtu -= (dev->hard_header_len + t_hlen);
395
396         if (mtu < 68)
397                 mtu = 68;
398
399         return mtu;
400 }
401
402 static struct ip_tunnel *ip_tunnel_create(struct net *net,
403                                           struct ip_tunnel_net *itn,
404                                           struct ip_tunnel_parm *parms)
405 {
406         struct ip_tunnel *nt;
407         struct net_device *dev;
408
409         BUG_ON(!itn->fb_tunnel_dev);
410         dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
411         if (IS_ERR(dev))
412                 return ERR_CAST(dev);
413
414         dev->mtu = ip_tunnel_bind_dev(dev);
415
416         nt = netdev_priv(dev);
417         ip_tunnel_add(itn, nt);
418         return nt;
419 }
420
421 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
422                   const struct tnl_ptk_info *tpi, bool log_ecn_error)
423 {
424         struct pcpu_sw_netstats *tstats;
425         const struct iphdr *iph = ip_hdr(skb);
426         int err;
427
428 #ifdef CONFIG_NET_IPGRE_BROADCAST
429         if (ipv4_is_multicast(iph->daddr)) {
430                 tunnel->dev->stats.multicast++;
431                 skb->pkt_type = PACKET_BROADCAST;
432         }
433 #endif
434
435         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
436              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
437                 tunnel->dev->stats.rx_crc_errors++;
438                 tunnel->dev->stats.rx_errors++;
439                 goto drop;
440         }
441
442         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
443                 if (!(tpi->flags&TUNNEL_SEQ) ||
444                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
445                         tunnel->dev->stats.rx_fifo_errors++;
446                         tunnel->dev->stats.rx_errors++;
447                         goto drop;
448                 }
449                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
450         }
451
452         skb_reset_network_header(skb);
453
454         err = IP_ECN_decapsulate(iph, skb);
455         if (unlikely(err)) {
456                 if (log_ecn_error)
457                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
458                                         &iph->saddr, iph->tos);
459                 if (err > 1) {
460                         ++tunnel->dev->stats.rx_frame_errors;
461                         ++tunnel->dev->stats.rx_errors;
462                         goto drop;
463                 }
464         }
465
466         tstats = this_cpu_ptr(tunnel->dev->tstats);
467         u64_stats_update_begin(&tstats->syncp);
468         tstats->rx_packets++;
469         tstats->rx_bytes += skb->len;
470         u64_stats_update_end(&tstats->syncp);
471
472         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
473
474         if (tunnel->dev->type == ARPHRD_ETHER) {
475                 skb->protocol = eth_type_trans(skb, tunnel->dev);
476                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
477         } else {
478                 skb->dev = tunnel->dev;
479         }
480
481         gro_cells_receive(&tunnel->gro_cells, skb);
482         return 0;
483
484 drop:
485         kfree_skb(skb);
486         return 0;
487 }
488 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
489
490 static int ip_encap_hlen(struct ip_tunnel_encap *e)
491 {
492         const struct ip_tunnel_encap_ops *ops;
493         int hlen = -EINVAL;
494
495         if (e->type == TUNNEL_ENCAP_NONE)
496                 return 0;
497
498         if (e->type >= MAX_IPTUN_ENCAP_OPS)
499                 return -EINVAL;
500
501         rcu_read_lock();
502         ops = rcu_dereference(iptun_encaps[e->type]);
503         if (likely(ops && ops->encap_hlen))
504                 hlen = ops->encap_hlen(e);
505         rcu_read_unlock();
506
507         return hlen;
508 }
509
510 const struct ip_tunnel_encap_ops __rcu *
511                 iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
512
513 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
514                             unsigned int num)
515 {
516         if (num >= MAX_IPTUN_ENCAP_OPS)
517                 return -ERANGE;
518
519         return !cmpxchg((const struct ip_tunnel_encap_ops **)
520                         &iptun_encaps[num],
521                         NULL, ops) ? 0 : -1;
522 }
523 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
524
525 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
526                             unsigned int num)
527 {
528         int ret;
529
530         if (num >= MAX_IPTUN_ENCAP_OPS)
531                 return -ERANGE;
532
533         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
534                        &iptun_encaps[num],
535                        ops, NULL) == ops) ? 0 : -1;
536
537         synchronize_net();
538
539         return ret;
540 }
541 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
542
543 int ip_tunnel_encap_setup(struct ip_tunnel *t,
544                           struct ip_tunnel_encap *ipencap)
545 {
546         int hlen;
547
548         memset(&t->encap, 0, sizeof(t->encap));
549
550         hlen = ip_encap_hlen(ipencap);
551         if (hlen < 0)
552                 return hlen;
553
554         t->encap.type = ipencap->type;
555         t->encap.sport = ipencap->sport;
556         t->encap.dport = ipencap->dport;
557         t->encap.flags = ipencap->flags;
558
559         t->encap_hlen = hlen;
560         t->hlen = t->encap_hlen + t->tun_hlen;
561
562         return 0;
563 }
564 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
565
566 int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
567                     u8 *protocol, struct flowi4 *fl4)
568 {
569         const struct ip_tunnel_encap_ops *ops;
570         int ret = -EINVAL;
571
572         if (t->encap.type == TUNNEL_ENCAP_NONE)
573                 return 0;
574
575         if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
576                 return -EINVAL;
577
578         rcu_read_lock();
579         ops = rcu_dereference(iptun_encaps[t->encap.type]);
580         if (likely(ops && ops->build_header))
581                 ret = ops->build_header(skb, &t->encap, protocol, fl4);
582         rcu_read_unlock();
583
584         return ret;
585 }
586 EXPORT_SYMBOL(ip_tunnel_encap);
587
588 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
589                             struct rtable *rt, __be16 df)
590 {
591         struct ip_tunnel *tunnel = netdev_priv(dev);
592         int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
593         int mtu;
594
595         if (df)
596                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
597                                         - sizeof(struct iphdr) - tunnel->hlen;
598         else
599                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
600
601         if (skb_dst(skb))
602                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
603
604         if (skb->protocol == htons(ETH_P_IP)) {
605                 if (!skb_is_gso(skb) &&
606                     (df & htons(IP_DF)) && mtu < pkt_size) {
607                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
608                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
609                         return -E2BIG;
610                 }
611         }
612 #if IS_ENABLED(CONFIG_IPV6)
613         else if (skb->protocol == htons(ETH_P_IPV6)) {
614                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
615
616                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
617                            mtu >= IPV6_MIN_MTU) {
618                         if ((tunnel->parms.iph.daddr &&
619                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
620                             rt6->rt6i_dst.plen == 128) {
621                                 rt6->rt6i_flags |= RTF_MODIFIED;
622                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
623                         }
624                 }
625
626                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
627                                         mtu < pkt_size) {
628                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
629                         return -E2BIG;
630                 }
631         }
632 #endif
633         return 0;
634 }
635
636 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
637                     const struct iphdr *tnl_params, u8 protocol)
638 {
639         struct ip_tunnel *tunnel = netdev_priv(dev);
640         const struct iphdr *inner_iph;
641         struct flowi4 fl4;
642         u8     tos, ttl;
643         __be16 df;
644         struct rtable *rt;              /* Route to the other host */
645         unsigned int max_headroom;      /* The extra header space needed */
646         __be32 dst;
647         int err;
648         bool connected;
649
650         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
651         connected = (tunnel->parms.iph.daddr != 0);
652
653         dst = tnl_params->daddr;
654         if (dst == 0) {
655                 /* NBMA tunnel */
656
657                 if (!skb_dst(skb)) {
658                         dev->stats.tx_fifo_errors++;
659                         goto tx_error;
660                 }
661
662                 if (skb->protocol == htons(ETH_P_IP)) {
663                         rt = skb_rtable(skb);
664                         dst = rt_nexthop(rt, inner_iph->daddr);
665                 }
666 #if IS_ENABLED(CONFIG_IPV6)
667                 else if (skb->protocol == htons(ETH_P_IPV6)) {
668                         const struct in6_addr *addr6;
669                         struct neighbour *neigh;
670                         bool do_tx_error_icmp;
671                         int addr_type;
672
673                         neigh = dst_neigh_lookup(skb_dst(skb),
674                                                  &ipv6_hdr(skb)->daddr);
675                         if (!neigh)
676                                 goto tx_error;
677
678                         addr6 = (const struct in6_addr *)&neigh->primary_key;
679                         addr_type = ipv6_addr_type(addr6);
680
681                         if (addr_type == IPV6_ADDR_ANY) {
682                                 addr6 = &ipv6_hdr(skb)->daddr;
683                                 addr_type = ipv6_addr_type(addr6);
684                         }
685
686                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
687                                 do_tx_error_icmp = true;
688                         else {
689                                 do_tx_error_icmp = false;
690                                 dst = addr6->s6_addr32[3];
691                         }
692                         neigh_release(neigh);
693                         if (do_tx_error_icmp)
694                                 goto tx_error_icmp;
695                 }
696 #endif
697                 else
698                         goto tx_error;
699
700                 connected = false;
701         }
702
703         tos = tnl_params->tos;
704         if (tos & 0x1) {
705                 tos &= ~0x1;
706                 if (skb->protocol == htons(ETH_P_IP)) {
707                         tos = inner_iph->tos;
708                         connected = false;
709                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
710                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
711                         connected = false;
712                 }
713         }
714
715         init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
716                          tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
717
718         if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
719                 goto tx_error;
720
721         rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
722
723         if (!rt) {
724                 rt = ip_route_output_key(tunnel->net, &fl4);
725
726                 if (IS_ERR(rt)) {
727                         dev->stats.tx_carrier_errors++;
728                         goto tx_error;
729                 }
730                 if (connected)
731                         tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
732         }
733
734         if (rt->dst.dev == dev) {
735                 ip_rt_put(rt);
736                 dev->stats.collisions++;
737                 goto tx_error;
738         }
739
740         if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
741                 ip_rt_put(rt);
742                 goto tx_error;
743         }
744
745         if (tunnel->err_count > 0) {
746                 if (time_before(jiffies,
747                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
748                         tunnel->err_count--;
749
750                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
751                         dst_link_failure(skb);
752                 } else
753                         tunnel->err_count = 0;
754         }
755
756         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
757         ttl = tnl_params->ttl;
758         if (ttl == 0) {
759                 if (skb->protocol == htons(ETH_P_IP))
760                         ttl = inner_iph->ttl;
761 #if IS_ENABLED(CONFIG_IPV6)
762                 else if (skb->protocol == htons(ETH_P_IPV6))
763                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
764 #endif
765                 else
766                         ttl = ip4_dst_hoplimit(&rt->dst);
767         }
768
769         df = tnl_params->frag_off;
770         if (skb->protocol == htons(ETH_P_IP))
771                 df |= (inner_iph->frag_off&htons(IP_DF));
772
773         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
774                         + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
775         if (max_headroom > dev->needed_headroom)
776                 dev->needed_headroom = max_headroom;
777
778         if (skb_cow_head(skb, dev->needed_headroom)) {
779                 ip_rt_put(rt);
780                 dev->stats.tx_dropped++;
781                 kfree_skb(skb);
782                 return;
783         }
784
785         err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
786                             tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
787         iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
788
789         return;
790
791 #if IS_ENABLED(CONFIG_IPV6)
792 tx_error_icmp:
793         dst_link_failure(skb);
794 #endif
795 tx_error:
796         dev->stats.tx_errors++;
797         kfree_skb(skb);
798 }
799 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
800
801 static void ip_tunnel_update(struct ip_tunnel_net *itn,
802                              struct ip_tunnel *t,
803                              struct net_device *dev,
804                              struct ip_tunnel_parm *p,
805                              bool set_mtu)
806 {
807         ip_tunnel_del(t);
808         t->parms.iph.saddr = p->iph.saddr;
809         t->parms.iph.daddr = p->iph.daddr;
810         t->parms.i_key = p->i_key;
811         t->parms.o_key = p->o_key;
812         if (dev->type != ARPHRD_ETHER) {
813                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
814                 memcpy(dev->broadcast, &p->iph.daddr, 4);
815         }
816         ip_tunnel_add(itn, t);
817
818         t->parms.iph.ttl = p->iph.ttl;
819         t->parms.iph.tos = p->iph.tos;
820         t->parms.iph.frag_off = p->iph.frag_off;
821
822         if (t->parms.link != p->link) {
823                 int mtu;
824
825                 t->parms.link = p->link;
826                 mtu = ip_tunnel_bind_dev(dev);
827                 if (set_mtu)
828                         dev->mtu = mtu;
829         }
830         ip_tunnel_dst_reset_all(t);
831         netdev_state_change(dev);
832 }
833
834 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
835 {
836         int err = 0;
837         struct ip_tunnel *t = netdev_priv(dev);
838         struct net *net = t->net;
839         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
840
841         BUG_ON(!itn->fb_tunnel_dev);
842         switch (cmd) {
843         case SIOCGETTUNNEL:
844                 if (dev == itn->fb_tunnel_dev) {
845                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
846                         if (!t)
847                                 t = netdev_priv(dev);
848                 }
849                 memcpy(p, &t->parms, sizeof(*p));
850                 break;
851
852         case SIOCADDTUNNEL:
853         case SIOCCHGTUNNEL:
854                 err = -EPERM;
855                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
856                         goto done;
857                 if (p->iph.ttl)
858                         p->iph.frag_off |= htons(IP_DF);
859                 if (!(p->i_flags & VTI_ISVTI)) {
860                         if (!(p->i_flags & TUNNEL_KEY))
861                                 p->i_key = 0;
862                         if (!(p->o_flags & TUNNEL_KEY))
863                                 p->o_key = 0;
864                 }
865
866                 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
867
868                 if (cmd == SIOCADDTUNNEL) {
869                         if (!t) {
870                                 t = ip_tunnel_create(net, itn, p);
871                                 err = PTR_ERR_OR_ZERO(t);
872                                 break;
873                         }
874
875                         err = -EEXIST;
876                         break;
877                 }
878                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
879                         if (t) {
880                                 if (t->dev != dev) {
881                                         err = -EEXIST;
882                                         break;
883                                 }
884                         } else {
885                                 unsigned int nflags = 0;
886
887                                 if (ipv4_is_multicast(p->iph.daddr))
888                                         nflags = IFF_BROADCAST;
889                                 else if (p->iph.daddr)
890                                         nflags = IFF_POINTOPOINT;
891
892                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
893                                         err = -EINVAL;
894                                         break;
895                                 }
896
897                                 t = netdev_priv(dev);
898                         }
899                 }
900
901                 if (t) {
902                         err = 0;
903                         ip_tunnel_update(itn, t, dev, p, true);
904                 } else {
905                         err = -ENOENT;
906                 }
907                 break;
908
909         case SIOCDELTUNNEL:
910                 err = -EPERM;
911                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
912                         goto done;
913
914                 if (dev == itn->fb_tunnel_dev) {
915                         err = -ENOENT;
916                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
917                         if (!t)
918                                 goto done;
919                         err = -EPERM;
920                         if (t == netdev_priv(itn->fb_tunnel_dev))
921                                 goto done;
922                         dev = t->dev;
923                 }
924                 unregister_netdevice(dev);
925                 err = 0;
926                 break;
927
928         default:
929                 err = -EINVAL;
930         }
931
932 done:
933         return err;
934 }
935 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
936
937 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
938 {
939         struct ip_tunnel *tunnel = netdev_priv(dev);
940         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
941
942         if (new_mtu < 68 ||
943             new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
944                 return -EINVAL;
945         dev->mtu = new_mtu;
946         return 0;
947 }
948 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
949
950 static void ip_tunnel_dev_free(struct net_device *dev)
951 {
952         struct ip_tunnel *tunnel = netdev_priv(dev);
953
954         gro_cells_destroy(&tunnel->gro_cells);
955         free_percpu(tunnel->dst_cache);
956         free_percpu(dev->tstats);
957         free_netdev(dev);
958 }
959
960 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
961 {
962         struct ip_tunnel *tunnel = netdev_priv(dev);
963         struct ip_tunnel_net *itn;
964
965         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
966
967         if (itn->fb_tunnel_dev != dev) {
968                 ip_tunnel_del(netdev_priv(dev));
969                 unregister_netdevice_queue(dev, head);
970         }
971 }
972 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
973
974 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
975 {
976         struct ip_tunnel *tunnel = netdev_priv(dev);
977
978         return tunnel->net;
979 }
980 EXPORT_SYMBOL(ip_tunnel_get_link_net);
981
982 int ip_tunnel_get_iflink(const struct net_device *dev)
983 {
984         struct ip_tunnel *tunnel = netdev_priv(dev);
985
986         return tunnel->parms.link;
987 }
988 EXPORT_SYMBOL(ip_tunnel_get_iflink);
989
990 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
991                                   struct rtnl_link_ops *ops, char *devname)
992 {
993         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
994         struct ip_tunnel_parm parms;
995         unsigned int i;
996
997         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
998                 INIT_HLIST_HEAD(&itn->tunnels[i]);
999
1000         if (!ops) {
1001                 itn->fb_tunnel_dev = NULL;
1002                 return 0;
1003         }
1004
1005         memset(&parms, 0, sizeof(parms));
1006         if (devname)
1007                 strlcpy(parms.name, devname, IFNAMSIZ);
1008
1009         rtnl_lock();
1010         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1011         /* FB netdevice is special: we have one, and only one per netns.
1012          * Allowing to move it to another netns is clearly unsafe.
1013          */
1014         if (!IS_ERR(itn->fb_tunnel_dev)) {
1015                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1016                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1017                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1018         }
1019         rtnl_unlock();
1020
1021         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1022 }
1023 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1024
1025 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1026                               struct rtnl_link_ops *ops)
1027 {
1028         struct net *net = dev_net(itn->fb_tunnel_dev);
1029         struct net_device *dev, *aux;
1030         int h;
1031
1032         for_each_netdev_safe(net, dev, aux)
1033                 if (dev->rtnl_link_ops == ops)
1034                         unregister_netdevice_queue(dev, head);
1035
1036         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1037                 struct ip_tunnel *t;
1038                 struct hlist_node *n;
1039                 struct hlist_head *thead = &itn->tunnels[h];
1040
1041                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1042                         /* If dev is in the same netns, it has already
1043                          * been added to the list by the previous loop.
1044                          */
1045                         if (!net_eq(dev_net(t->dev), net))
1046                                 unregister_netdevice_queue(t->dev, head);
1047         }
1048 }
1049
1050 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1051 {
1052         LIST_HEAD(list);
1053
1054         rtnl_lock();
1055         ip_tunnel_destroy(itn, &list, ops);
1056         unregister_netdevice_many(&list);
1057         rtnl_unlock();
1058 }
1059 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1060
1061 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1062                       struct ip_tunnel_parm *p)
1063 {
1064         struct ip_tunnel *nt;
1065         struct net *net = dev_net(dev);
1066         struct ip_tunnel_net *itn;
1067         int mtu;
1068         int err;
1069
1070         nt = netdev_priv(dev);
1071         itn = net_generic(net, nt->ip_tnl_net_id);
1072
1073         if (ip_tunnel_find(itn, p, dev->type))
1074                 return -EEXIST;
1075
1076         nt->net = net;
1077         nt->parms = *p;
1078         err = register_netdevice(dev);
1079         if (err)
1080                 goto out;
1081
1082         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1083                 eth_hw_addr_random(dev);
1084
1085         mtu = ip_tunnel_bind_dev(dev);
1086         if (!tb[IFLA_MTU])
1087                 dev->mtu = mtu;
1088
1089         ip_tunnel_add(itn, nt);
1090
1091 out:
1092         return err;
1093 }
1094 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1095
1096 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1097                          struct ip_tunnel_parm *p)
1098 {
1099         struct ip_tunnel *t;
1100         struct ip_tunnel *tunnel = netdev_priv(dev);
1101         struct net *net = tunnel->net;
1102         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1103
1104         if (dev == itn->fb_tunnel_dev)
1105                 return -EINVAL;
1106
1107         t = ip_tunnel_find(itn, p, dev->type);
1108
1109         if (t) {
1110                 if (t->dev != dev)
1111                         return -EEXIST;
1112         } else {
1113                 t = tunnel;
1114
1115                 if (dev->type != ARPHRD_ETHER) {
1116                         unsigned int nflags = 0;
1117
1118                         if (ipv4_is_multicast(p->iph.daddr))
1119                                 nflags = IFF_BROADCAST;
1120                         else if (p->iph.daddr)
1121                                 nflags = IFF_POINTOPOINT;
1122
1123                         if ((dev->flags ^ nflags) &
1124                             (IFF_POINTOPOINT | IFF_BROADCAST))
1125                                 return -EINVAL;
1126                 }
1127         }
1128
1129         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1130         return 0;
1131 }
1132 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1133
1134 int ip_tunnel_init(struct net_device *dev)
1135 {
1136         struct ip_tunnel *tunnel = netdev_priv(dev);
1137         struct iphdr *iph = &tunnel->parms.iph;
1138         int err;
1139
1140         dev->destructor = ip_tunnel_dev_free;
1141         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1142         if (!dev->tstats)
1143                 return -ENOMEM;
1144
1145         tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1146         if (!tunnel->dst_cache) {
1147                 free_percpu(dev->tstats);
1148                 return -ENOMEM;
1149         }
1150
1151         err = gro_cells_init(&tunnel->gro_cells, dev);
1152         if (err) {
1153                 free_percpu(tunnel->dst_cache);
1154                 free_percpu(dev->tstats);
1155                 return err;
1156         }
1157
1158         tunnel->dev = dev;
1159         tunnel->net = dev_net(dev);
1160         strcpy(tunnel->parms.name, dev->name);
1161         iph->version            = 4;
1162         iph->ihl                = 5;
1163
1164         return 0;
1165 }
1166 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1167
1168 void ip_tunnel_uninit(struct net_device *dev)
1169 {
1170         struct ip_tunnel *tunnel = netdev_priv(dev);
1171         struct net *net = tunnel->net;
1172         struct ip_tunnel_net *itn;
1173
1174         itn = net_generic(net, tunnel->ip_tnl_net_id);
1175         /* fb_tunnel_dev will be unregisted in net-exit call. */
1176         if (itn->fb_tunnel_dev != dev)
1177                 ip_tunnel_del(netdev_priv(dev));
1178
1179         ip_tunnel_dst_reset_all(tunnel);
1180 }
1181 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1182
1183 /* Do least required initialization, rest of init is done in tunnel_init call */
1184 void ip_tunnel_setup(struct net_device *dev, int net_id)
1185 {
1186         struct ip_tunnel *tunnel = netdev_priv(dev);
1187         tunnel->ip_tnl_net_id = net_id;
1188 }
1189 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1190
1191 MODULE_LICENSE("GPL");