[Babel-users] [PATCH] Use per-table dumps on kernels where this is available

Toke Høiland-Jørgensen toke at toke.dk
Sat Apr 16 19:00:59 BST 2022


Starting with version 4.20, the Linux kernel gained the ability to filter
route dumps on (among other things) table ID on the kernel side of the
netlink transaction. This can significantly increase the performance of
route dumps in cases where babeld is configured to only monitor a subset of
the kernel's route tables. In particular, it avoids the per-table lock
contention inside the kernel if another routing daemon is updating a table
that babeld is not using while babeld issues a dump request.

The filtering works by setting the rtm_table field in the netlink dump
request, but it only takes effect if the NETLINK_GET_STRICT_CHK socket
option is set on the netlink socket. Kernels without this support do not
honour the filter, so we always try to set the option when the socket is
created, record whether that succeeded, and simply break out of the
per-table loop after the first dump if table filtering is unavailable.

Signed-off-by: Toke Høiland-Jørgensen <toke at toke.dk>
---
 kernel_netlink.c | 49 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 15 deletions(-)

diff --git a/kernel_netlink.c b/kernel_netlink.c
index 3d17d7149090..3829d933afa9 100644
--- a/kernel_netlink.c
+++ b/kernel_netlink.c
@@ -46,6 +46,10 @@ THE SOFTWARE.
 #define BRCTL_GET_BRIDGES 1
 #endif
 
+#ifndef NETLINK_GET_STRICT_CHK
+#define NETLINK_GET_STRICT_CHK 12
+#endif
+
 #if(__GLIBC__ < 2) || (__GLIBC__ == 2 && __GLIBC_MINOR__ <= 5)
 #define RTA_TABLE 15
 #endif
@@ -74,6 +78,7 @@ THE SOFTWARE.
     } while(0)
 
 int export_table = -1, import_tables[MAX_IMPORT_TABLES], import_table_count = 0;
+int per_table_dumps = 0;
 
 struct sysctl_setting {
     char *name;
@@ -283,7 +288,7 @@ static int nl_setup = 0;
 static int
 netlink_socket(struct netlink *nl, uint32_t groups)
 {
-    int rc;
+    int rc, strict = 1;
     int rcvsize = 512 * 1024;
 
     nl->sock = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
@@ -319,6 +324,10 @@ netlink_socket(struct netlink *nl, uint32_t groups)
         }
     }
 
+    rc = setsockopt(nl->sock, SOL_NETLINK, NETLINK_GET_STRICT_CHK,
+                    &strict, sizeof(strict));
+    per_table_dumps = (rc == 0);
+
     rc = bind(nl->sock, (struct sockaddr *)&nl->sockaddr, nl->socklen);
     if(rc < 0)
         goto fail;
@@ -1302,9 +1311,9 @@ filter_kernel_routes(struct nlmsghdr *nh, struct kernel_route *route)
 int
 kernel_dump(int operation, struct kernel_filter *filter)
 {
-    int i, rc;
+    int i, j, rc;
     int families[2] = { AF_INET6, AF_INET };
-    struct rtgenmsg g;
+    struct rtmsg rtm;
 
     if(!nl_setup) {
         fprintf(stderr,"kernel_dump: netlink not initialized.\n");
@@ -1323,24 +1332,34 @@ kernel_dump(int operation, struct kernel_filter *filter)
     }
 
     for(i = 0; i < 2; i++) {
-        memset(&g, 0, sizeof(g));
-        g.rtgen_family = families[i];
+        memset(&rtm, 0, sizeof(rtm));
+        rtm.rtm_family = families[i];
         if(operation & CHANGE_ROUTE) {
-            rc = netlink_send_dump(RTM_GETROUTE, &g, sizeof(g));
-            if(rc < 0)
-                return -1;
+            for (j = 0; j < import_table_count; j++) {
+                rtm.rtm_table = import_tables[j];
 
-            rc = netlink_read(&nl_command, NULL, 1, filter);
-            if(rc < 0)
-                return -1;
-        }
+                rc = netlink_send_dump(RTM_GETROUTE, &rtm, sizeof(rtm));
+                if(rc < 0)
+                    return -1;
 
+                rc = netlink_read(&nl_command, NULL, 1, filter);
+                if(rc < 0)
+                    return -1;
+
+                /* the filtering on rtm_table above won't work on old kernels,
+                   in which case we'll just get routes from all tables in one
+                   dump; we detect this on socket setup, so we can just break
+                   the loop if we know it won't work */
+                if (!per_table_dumps)
+                    break;
+            }
+        }
     }
 
     if(operation & CHANGE_ADDR) {
-        memset(&g, 0, sizeof(g));
-        g.rtgen_family = AF_UNSPEC;
-        rc = netlink_send_dump(RTM_GETADDR, &g, sizeof(g));
+        memset(&rtm, 0, sizeof(rtm));
+        rtm.rtm_family = AF_UNSPEC;
+        rc = netlink_send_dump(RTM_GETADDR, &rtm, sizeof(rtm));
         if(rc < 0)
             return -1;
 
-- 
2.35.3




More information about the Babel-users mailing list