Upgrade to 4.4.50-rt62
[kvmfornfv.git] / kernel / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58 #include <net/udp.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
/* Install @dst as the cached route in @idst, releasing the previous entry.
 *
 * A reference on @dst is taken for the cache via dst_clone(); the pointer
 * swap is done with xchg() so it is atomic against concurrent updaters,
 * and the old entry's reference is dropped afterwards.
 */
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst, __be32 saddr)
{
	struct dst_entry *old_dst;

	dst_clone(dst);
	/* Atomically publish the new dst and fetch the old one. */
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
	idst->saddr = saddr;
}
82
/* Update the current CPU's dst-cache slot for tunnel @t. */
static noinline void tunnel_dst_set(struct ip_tunnel *t,
			   struct dst_entry *dst, __be32 saddr)
{
	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
}
88
/* Invalidate the current CPU's cached route for tunnel @t. */
static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL, 0);
}
93
94 void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
95 {
96         int i;
97
98         for_each_possible_cpu(i)
99                 __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
100 }
101 EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
102
/* Fetch this CPU's cached route for tunnel @t, validating it first.
 *
 * A reference is taken with atomic_inc_not_zero() so a concurrent release
 * cannot free the dst under us.  An obsolete dst that fails its ->check()
 * callback causes the cache slot to be reset and NULL to be returned, so
 * the caller falls back to a fresh route lookup.  On success *saddr is
 * set to the cached local address and the returned rtable carries a
 * reference the caller must release.
 */
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
					u32 cookie, __be32 *saddr)
{
	struct ip_tunnel_dst *idst;
	struct dst_entry *dst;

	rcu_read_lock();
	idst = raw_cpu_ptr(t->dst_cache);
	dst = rcu_dereference(idst->dst);
	/* Refuse a dst whose refcount has already dropped to zero. */
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
		dst = NULL;
	if (dst) {
		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
			*saddr = idst->saddr;
		} else {
			/* Stale route: drop the cache entry and our ref. */
			tunnel_dst_reset(t);
			dst_release(dst);
			dst = NULL;
		}
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}
126
127 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
128                                 __be16 flags, __be32 key)
129 {
130         if (p->i_flags & TUNNEL_KEY) {
131                 if (flags & TUNNEL_KEY)
132                         return key == p->i_key;
133                 else
134                         /* key expected, none present */
135                         return false;
136         } else
137                 return !(flags & TUNNEL_KEY);
138 }
139
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match any configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
/* Given link, flags, remote, local and key, find the best matching input
 * tunnel in @itn.
 *
 * Buckets are probed from most to least specific:
 *   1. exact (local, remote) pair
 *   2. remote only (wildcard local)
 *   3. local only, or multicast local as remote (bucket hashed with remote 0)
 *   4. fully wildcarded tunnels matched by key alone
 * A matching tunnel on a different link is remembered as a fallback
 * candidate in case no same-link tunnel matches.  Finally the collect_md
 * tunnel and the fallback device are tried.
 * NOTE(review): appears to rely on RCU read-side protection from the
 * caller — confirm against callers outside this view.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: fully specified (local, remote) tunnels. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: remote-only tunnels (any local address). */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Passes 3 and 4 live in the bucket hashed with a zero remote. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: local-only tunnels, or tunnels whose configured remote
	 * equals our (multicast) local address.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	/* Pass 4: fully wildcarded tunnels matched by key alone. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	/* A metadata-collecting tunnel catches anything left over. */
	t = rcu_dereference(itn->collect_md_tun);
	if (t)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
243
244 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
245                                     struct ip_tunnel_parm *parms)
246 {
247         unsigned int h;
248         __be32 remote;
249         __be32 i_key = parms->i_key;
250
251         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
252                 remote = parms->iph.daddr;
253         else
254                 remote = 0;
255
256         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
257                 i_key = 0;
258
259         h = ip_tunnel_hash(i_key, remote);
260         return &itn->tunnels[h];
261 }
262
/* Insert tunnel @t into its hash bucket in @itn.  A metadata-collecting
 * tunnel is additionally published as the net's single collect_md_tun.
 */
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}
271
/* Remove tunnel @t from its hash bucket, clearing the collect_md_tun
 * pointer first when @t is the metadata-collecting tunnel.
 */
static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
278
279 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
280                                         struct ip_tunnel_parm *parms,
281                                         int type)
282 {
283         __be32 remote = parms->iph.daddr;
284         __be32 local = parms->iph.saddr;
285         __be32 key = parms->i_key;
286         __be16 flags = parms->i_flags;
287         int link = parms->link;
288         struct ip_tunnel *t = NULL;
289         struct hlist_head *head = ip_bucket(itn, parms);
290
291         hlist_for_each_entry_rcu(t, head, hash_node) {
292                 if (local == t->parms.iph.saddr &&
293                     remote == t->parms.iph.daddr &&
294                     link == t->parms.link &&
295                     type == t->dev->type &&
296                     ip_tunnel_key_match(&t->parms, flags, key))
297                         break;
298         }
299         return t;
300 }
301
302 static struct net_device *__ip_tunnel_create(struct net *net,
303                                              const struct rtnl_link_ops *ops,
304                                              struct ip_tunnel_parm *parms)
305 {
306         int err;
307         struct ip_tunnel *tunnel;
308         struct net_device *dev;
309         char name[IFNAMSIZ];
310
311         if (parms->name[0])
312                 strlcpy(name, parms->name, IFNAMSIZ);
313         else {
314                 if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
315                         err = -E2BIG;
316                         goto failed;
317                 }
318                 strlcpy(name, ops->kind, IFNAMSIZ);
319                 strncat(name, "%d", 2);
320         }
321
322         ASSERT_RTNL();
323         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
324         if (!dev) {
325                 err = -ENOMEM;
326                 goto failed;
327         }
328         dev_net_set(dev, net);
329
330         dev->rtnl_link_ops = ops;
331
332         tunnel = netdev_priv(dev);
333         tunnel->parms = *parms;
334         tunnel->net = net;
335
336         err = register_netdevice(dev);
337         if (err)
338                 goto failed_free;
339
340         return dev;
341
342 failed_free:
343         free_netdev(dev);
344 failed:
345         return ERR_PTR(err);
346 }
347
348 static inline void init_tunnel_flow(struct flowi4 *fl4,
349                                     int proto,
350                                     __be32 daddr, __be32 saddr,
351                                     __be32 key, __u8 tos, int oif)
352 {
353         memset(fl4, 0, sizeof(*fl4));
354         fl4->flowi4_oif = oif;
355         fl4->daddr = daddr;
356         fl4->saddr = saddr;
357         fl4->flowi4_tos = tos;
358         fl4->flowi4_proto = proto;
359         fl4->fl4_gre_key = key;
360 }
361
/* Guess the underlying output device for tunnel @dev to derive a
 * reasonable MTU and needed_headroom.
 *
 * When the tunnel has a fixed destination, a route lookup finds the
 * lower device (and primes the per-cpu dst cache as a side effect);
 * otherwise the device configured via parms.link is used, if any.
 * Returns the MTU to use, never less than 68 (the minimum IPv4 MTU).
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			/* Prime the dst cache for the transmit path. */
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	/* Clamp to the minimum IPv4 MTU. */
	if (mtu < 68)
		mtu = 68;

	return mtu;
}
408
409 static struct ip_tunnel *ip_tunnel_create(struct net *net,
410                                           struct ip_tunnel_net *itn,
411                                           struct ip_tunnel_parm *parms)
412 {
413         struct ip_tunnel *nt;
414         struct net_device *dev;
415
416         BUG_ON(!itn->fb_tunnel_dev);
417         dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
418         if (IS_ERR(dev))
419                 return ERR_CAST(dev);
420
421         dev->mtu = ip_tunnel_bind_dev(dev);
422
423         nt = netdev_priv(dev);
424         ip_tunnel_add(itn, nt);
425         return nt;
426 }
427
/* Common receive path for IPv4 tunnels.
 *
 * Validates the packet's checksum/sequence flags against the tunnel's
 * configured i_flags, performs ECN decapsulation, updates per-cpu RX
 * stats, and hands the inner packet to the GRO cell of @tunnel's device.
 * @tun_dst, when non-NULL, is attached as the skb's metadata dst.
 * Consumes @skb on every path; always returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* The packet's TUNNEL_CSUM flag must agree with the tunnel's
	 * configuration: both set or both clear.
	 */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* With TUNNEL_SEQ configured, enforce monotonically increasing
	 * sequence numbers (the signed diff handles wraparound).
	 */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	/* err > 1 means the packet must be dropped (non-ECT inner with
	 * CE set on the outer header).
	 */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Scrub skb state when crossing a netns boundary. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
500
501 static int ip_encap_hlen(struct ip_tunnel_encap *e)
502 {
503         const struct ip_tunnel_encap_ops *ops;
504         int hlen = -EINVAL;
505
506         if (e->type == TUNNEL_ENCAP_NONE)
507                 return 0;
508
509         if (e->type >= MAX_IPTUN_ENCAP_OPS)
510                 return -EINVAL;
511
512         rcu_read_lock();
513         ops = rcu_dereference(iptun_encaps[e->type]);
514         if (likely(ops && ops->encap_hlen))
515                 hlen = ops->encap_hlen(e);
516         rcu_read_unlock();
517
518         return hlen;
519 }
520
/* Registry of tunnel encapsulation handlers, indexed by encap type and
 * protected by RCU; slots are claimed/released with cmpxchg() below.
 */
const struct ip_tunnel_encap_ops __rcu *
		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
523
524 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
525                             unsigned int num)
526 {
527         if (num >= MAX_IPTUN_ENCAP_OPS)
528                 return -ERANGE;
529
530         return !cmpxchg((const struct ip_tunnel_encap_ops **)
531                         &iptun_encaps[num],
532                         NULL, ops) ? 0 : -1;
533 }
534 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
535
536 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
537                             unsigned int num)
538 {
539         int ret;
540
541         if (num >= MAX_IPTUN_ENCAP_OPS)
542                 return -ERANGE;
543
544         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
545                        &iptun_encaps[num],
546                        ops, NULL) == ops) ? 0 : -1;
547
548         synchronize_net();
549
550         return ret;
551 }
552 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
553
554 int ip_tunnel_encap_setup(struct ip_tunnel *t,
555                           struct ip_tunnel_encap *ipencap)
556 {
557         int hlen;
558
559         memset(&t->encap, 0, sizeof(t->encap));
560
561         hlen = ip_encap_hlen(ipencap);
562         if (hlen < 0)
563                 return hlen;
564
565         t->encap.type = ipencap->type;
566         t->encap.sport = ipencap->sport;
567         t->encap.dport = ipencap->dport;
568         t->encap.flags = ipencap->flags;
569
570         t->encap_hlen = hlen;
571         t->hlen = t->encap_hlen + t->tun_hlen;
572
573         return 0;
574 }
575 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
576
577 int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
578                     u8 *protocol, struct flowi4 *fl4)
579 {
580         const struct ip_tunnel_encap_ops *ops;
581         int ret = -EINVAL;
582
583         if (t->encap.type == TUNNEL_ENCAP_NONE)
584                 return 0;
585
586         if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
587                 return -EINVAL;
588
589         rcu_read_lock();
590         ops = rcu_dereference(iptun_encaps[t->encap.type]);
591         if (likely(ops && ops->build_header))
592                 ret = ops->build_header(skb, &t->encap, protocol, fl4);
593         rcu_read_unlock();
594
595         return ret;
596 }
597 EXPORT_SYMBOL(ip_tunnel_encap);
598
/* Check and propagate path MTU for a packet about to be tunnelled.
 *
 * Computes the effective tunnel MTU from the outer route (when DF is
 * set) or from the inner dst, updates the inner dst's PMTU, and emits
 * the appropriate ICMP "fragmentation needed" / ICMPv6 "packet too big"
 * error when a non-GSO packet exceeds it.  Returns 0 to proceed or
 * -E2BIG when the packet must be rejected.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		/* DF set: outer route MTU minus all tunnel overhead. */
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the reduced MTU on host routes, or when the
		 * tunnel has a fixed unicast destination.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
648
/* Common transmit path for IPv4 tunnels.
 *
 * Resolves the outer destination (for NBMA tunnels the outer address is
 * derived from the inner packet's route or neighbour entry), inherits
 * TOS/TTL/DF from the inner packet where so configured, applies any
 * extra encapsulation, performs the PMTU check, and finally emits the
 * encapsulated packet via iptunnel_xmit().  Consumes @skb on all paths.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	/* "connected" tunnels (fixed daddr) may use the per-cpu dst cache. */
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			/* Outer destination is the inner next hop. */
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible IPv6 addresses embed a
			 * usable IPv4 destination in the last word.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		/* Low TOS bit set: inherit TOS/DSCP from the inner packet,
		 * which also makes the route per-packet (not cacheable).
		 */
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	/* Try the per-cpu route cache first for connected tunnels. */
	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
	}

	/* A route back to the tunnel device itself would loop. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* While recent ICMP errors are pending, signal link failure
	 * for a bounded number of packets.
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		/* TTL 0 means inherit from the inner packet. */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
814
/* Apply new parameters @p to tunnel @t.
 *
 * The tunnel is removed from and re-added to the hash table because
 * saddr/daddr/i_key determine its bucket.  When the bound link changes,
 * the MTU is recomputed (applied only if @set_mtu).  The per-cpu route
 * cache is always flushed and a netdev change event is emitted.
 * NOTE(review): presumably runs under RTNL like the other config paths
 * here — confirm against callers.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		/* Non-Ethernet tunnels expose the endpoints as the
		 * device's hardware and broadcast addresses.
		 */
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}
847
/* ip_tunnel_ioctl - handle the legacy SIOC{GET,ADD,CHG,DEL}TUNNEL ioctls
 * on behalf of an IP tunnel driver.
 * @dev: device the ioctl was issued on (may be the per-netns fallback device)
 * @p:   tunnel parameters already copied in from user space
 * @cmd: SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL or SIOCDELTUNNEL
 *
 * Returns 0 on success or a negative errno.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device the request selects a tunnel by its
		 * parms; if none matches, report the fallback device itself.
		 * On any other device, report that device's own parameters.
		 */
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A fixed TTL forces the Don't Fragment bit on the outer
		 * header.
		 */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* For non-VTI tunnels a key value is only meaningful when
		 * the corresponding TUNNEL_KEY flag is set; clear strays so
		 * lookups match.
		 */
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			/* ADD with parms matching an existing tunnel */
			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				/* New parms must not collide with a
				 * different existing tunnel.
				 */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				/* The new addresses may imply a different
				 * link type (broadcast vs point-to-point),
				 * which cannot change on a live device.
				 */
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			/* Deleting through the fallback device: resolve the
			 * victim from parms.  The fallback device itself may
			 * never be deleted this way.
			 */
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
950
951 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
952 {
953         struct ip_tunnel *tunnel = netdev_priv(dev);
954         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
955         int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
956
957         if (new_mtu < 68)
958                 return -EINVAL;
959
960         if (new_mtu > max_mtu) {
961                 if (strict)
962                         return -EINVAL;
963
964                 new_mtu = max_mtu;
965         }
966
967         dev->mtu = new_mtu;
968         return 0;
969 }
970 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
971
972 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
973 {
974         return __ip_tunnel_change_mtu(dev, new_mtu, true);
975 }
976 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
977
978 static void ip_tunnel_dev_free(struct net_device *dev)
979 {
980         struct ip_tunnel *tunnel = netdev_priv(dev);
981
982         gro_cells_destroy(&tunnel->gro_cells);
983         free_percpu(tunnel->dst_cache);
984         free_percpu(dev->tstats);
985         free_netdev(dev);
986 }
987
988 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
989 {
990         struct ip_tunnel *tunnel = netdev_priv(dev);
991         struct ip_tunnel_net *itn;
992
993         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
994
995         if (itn->fb_tunnel_dev != dev) {
996                 ip_tunnel_del(itn, netdev_priv(dev));
997                 unregister_netdevice_queue(dev, head);
998         }
999 }
1000 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1001
1002 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1003 {
1004         struct ip_tunnel *tunnel = netdev_priv(dev);
1005
1006         return tunnel->net;
1007 }
1008 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1009
1010 int ip_tunnel_get_iflink(const struct net_device *dev)
1011 {
1012         struct ip_tunnel *tunnel = netdev_priv(dev);
1013
1014         return tunnel->parms.link;
1015 }
1016 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1017
/* ip_tunnel_init_net - per-netns setup for one tunnel type.
 * @net:           the network namespace being initialized
 * @ip_tnl_net_id: pernet id under which this type's ip_tunnel_net lives
 * @ops:           rtnl_link_ops of the type; NULL means no fallback device
 * @devname:       name for the fallback device (e.g. "gre0"), may be NULL
 *
 * Initializes the tunnel hash table and, when @ops is supplied, creates
 * and hashes the per-namespace fallback device.  Returns 0 or a negative
 * errno from fallback device creation.
 */
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		/* Tunnel type with no fallback device */
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1052
/* ip_tunnel_destroy - queue every device of one tunnel type for removal.
 * @itn:  this type's tunnel state in the namespace being dismantled
 * @head: list the doomed devices are collected on
 * @ops:  rtnl_link_ops identifying the tunnel type
 *
 * Collects on @head all netdevs of this type living in the fallback
 * device's namespace, plus any tunnels hashed in @itn whose device lives
 * in a different namespace.  Runs under rtnl_lock (taken by the caller,
 * ip_tunnel_delete_net()).
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
1077
1078 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1079 {
1080         LIST_HEAD(list);
1081
1082         rtnl_lock();
1083         ip_tunnel_destroy(itn, &list, ops);
1084         unregister_netdevice_many(&list);
1085         rtnl_unlock();
1086 }
1087 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1088
/* ip_tunnel_newlink - rtnl newlink handler: register a new tunnel device.
 * @dev: the freshly allocated (not yet registered) netdevice
 * @tb:  netlink attributes (IFLA_ADDRESS, IFLA_MTU consulted here)
 * @p:   tunnel parameters parsed by the caller
 *
 * Rejects duplicates (only one collect_md tunnel per type and netns; only
 * one tunnel per distinct parms), registers the device, then hashes it.
 * Returns 0 or a negative errno.
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		/* At most one metadata-collecting tunnel per type/netns */
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	/* Ethernet-type tunnels get a random MAC unless one was supplied */
	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	/* Derive the MTU from the underlying device; an explicit IFLA_MTU
	 * (already applied by the rtnl core) takes precedence.
	 */
	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);
out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1127
/* ip_tunnel_changelink - rtnl changelink handler: apply new parms to an
 * existing tunnel device.
 * @dev: the tunnel being reconfigured (must not be the fallback device)
 * @tb:  netlink attributes (IFLA_MTU consulted here)
 * @p:   the new tunnel parameters
 *
 * Returns 0 on success, -EINVAL for the fallback device or an illegal
 * link-type change, -EEXIST if @p collides with a different tunnel.
 */
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		/* New parms must not collide with another tunnel */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			/* The new addresses may imply a different link type
			 * (broadcast vs point-to-point), which cannot change
			 * on a live device.
			 */
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	/* Only adopt the recomputed MTU when none was given explicitly */
	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1165
1166 int ip_tunnel_init(struct net_device *dev)
1167 {
1168         struct ip_tunnel *tunnel = netdev_priv(dev);
1169         struct iphdr *iph = &tunnel->parms.iph;
1170         int err;
1171
1172         dev->destructor = ip_tunnel_dev_free;
1173         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1174         if (!dev->tstats)
1175                 return -ENOMEM;
1176
1177         tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1178         if (!tunnel->dst_cache) {
1179                 free_percpu(dev->tstats);
1180                 return -ENOMEM;
1181         }
1182
1183         err = gro_cells_init(&tunnel->gro_cells, dev);
1184         if (err) {
1185                 free_percpu(tunnel->dst_cache);
1186                 free_percpu(dev->tstats);
1187                 return err;
1188         }
1189
1190         tunnel->dev = dev;
1191         tunnel->net = dev_net(dev);
1192         strcpy(tunnel->parms.name, dev->name);
1193         iph->version            = 4;
1194         iph->ihl                = 5;
1195
1196         if (tunnel->collect_md) {
1197                 dev->features |= NETIF_F_NETNS_LOCAL;
1198                 netif_keep_dst(dev);
1199         }
1200         return 0;
1201 }
1202 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1203
1204 void ip_tunnel_uninit(struct net_device *dev)
1205 {
1206         struct ip_tunnel *tunnel = netdev_priv(dev);
1207         struct net *net = tunnel->net;
1208         struct ip_tunnel_net *itn;
1209
1210         itn = net_generic(net, tunnel->ip_tnl_net_id);
1211         /* fb_tunnel_dev will be unregisted in net-exit call. */
1212         if (itn->fb_tunnel_dev != dev)
1213                 ip_tunnel_del(itn, netdev_priv(dev));
1214
1215         ip_tunnel_dst_reset_all(tunnel);
1216 }
1217 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1218
1219 /* Do least required initialization, rest of init is done in tunnel_init call */
1220 void ip_tunnel_setup(struct net_device *dev, int net_id)
1221 {
1222         struct ip_tunnel *tunnel = netdev_priv(dev);
1223         tunnel->ip_tnl_net_id = net_id;
1224 }
1225 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1226
1227 MODULE_LICENSE("GPL");