Skip to the content.

[iptables]Series 2: How Rules Are Passed to the Kernel

发表于 2025-04-02

规则如何下发到内核的

1. 关于iptables下发规则的疑问

iptables的基础操作大家已经了解了。但是有求知欲的同学就会思考: iptables规则如何下发给内核的? 通过ioctl还是socket? iptables规则下发是一次下发一条还是一次下发多条?一次只下发表中的一个链还是整个表? 等等等等。

带着这样的疑问,我们开始了探索的脚步。

2. iptables相关系统调用(基于linux 6.1.62)

我们用strace查看iptables的系统调用。

a. trace iptables 查看操作

# strace /bin/iptables -L    //表的获取
execve("/bin/iptables", ["/bin/iptables", "-L"], 0x7ffeb054e228 /* 4 vars */) = 0
//省略无关,只保留主要系统交互
socket(AF_INET, SOCK_RAW, IPPROTO_RAW)  = 7
getsockopt(7, SOL_IP, IPT_SO_GET_INFO, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [84]) = 0
getsockopt(7, SOL_IP, IPT_SO_GET_ENTRIES, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [12744]) = 0

socket(AF_INET, SOCK_RAW, IPPROTO_RAW)  = 8
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, "iprange\0\264\220\"_7\177\0\0\230\323O_7\177\0\0\2\0\0\0\0\1", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, "connlimit\0\"_7\177\0\0\230\323O_7\177\0\0\2\0\0\0\0\1", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_TARGET, "MARK\0\177\0\0B\344!_7\177\0\0@\217&\230\377\177\0\0\2\0\0\0\1\2", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_TARGET, "MARK\0\177\0\0B\344!_7\177\0\0\300\217&\230\377\177\0\0\0\0\0\0\0\2", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, "conntrack\0!_7\177\0\0h\270!_7\177\0\0\2\0\0\0\2\3", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, "conntrack\0!_7\177\0\0h\270!_7\177\0\0\2\0\0\0\1\3", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, "conntrack\0!_7\177\0\0h\270!_7\177\0\0\2\0\0\0\0\3", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, "mark\0\0\0\0U0 _7\177\0\0h\270!_7\177\0\0\0\0\0\0\0\1", [30]) = 0
系统调用 作用
socket(AF_INET, SOCK_RAW, IPPROTO_RAW) 创建一个原始 socket 作为 netfilter 控制通道
getsockopt(7, SOL_IP, IPT_SO_GET_INFO, “filter\0\0”…, [84]) 获取表的总体信息(有效hook、各hook入口偏移、entry数量、总大小等)
getsockopt(7, SOL_IP, IPT_SO_GET_ENTRIES, “filter\0\0”…, [12744]) 获取该表中所有链的规则
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, “iprange\0”…, [30]) 检查 match 模块是否版本匹配
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_TARGET, “MARK\0”…, [30]) 检查 target 模块是否版本匹配

第一个问题的答案已经呼之欲出。 从strace中,可以看到iptables(legacy后端)先创建一个原始socket,再通过getsockopt向内核请求filter表的信息(基于nftables的iptables-nft后端则改用socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER)与内核交互。相比之下getsockopt的逻辑和结构更清晰易懂)。

b. trace iptables 配置操作

# strace /bin/iptables -A INPUT -d 1.2.3.4 -p tcp --dport 22 -j DROP
execve("/bin/iptables", ["/bin/iptables", "-A", "INPUT", "-d", "1.2.3.4", "-p", "tcp", "--dport", "22", "-j", "DROP"], 0x7ffe0cf2dda0 /* 4 vars */) = 0
//省略无关,只保留主要系统交互
socket(AF_INET, SOCK_RAW, IPPROTO_RAW)  = 7
getsockopt(7, SOL_IP, IPT_SO_GET_INFO, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [84]) = 0
getsockopt(7, SOL_IP, IPT_SO_GET_ENTRIES, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [12744]) = 0
setsockopt(7, SOL_IP, IPT_SO_SET_REPLACE, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 13064) = 0
setsockopt(7, SOL_IP, IPT_SO_SET_ADD_COUNTERS, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 920) = 0

从strace中可以看到,前两个getsockopt调用我们已经了解过,分别用于获取filter表的信息和规则。获取到的整体结构大小为12744字节,而随后setsockopt替换操作传入的整体结构大小为13064字节。多出来的320个字节,主要就是新添加规则对应的entry大小(两次调用使用的结构头部ipt_get_entries与ipt_replace大小不同,也会带来一部分差值)。 第二个问题的答案也显而易见:iptables将整个filter表通过setsockopt传递给内核,用来整体替换旧的filter表。

系统调用 作用
setsockopt(7, SOL_IP, IPT_SO_SET_REPLACE, “filter\0\0”…, 13064) 整体替换filter表
setsockopt(7, SOL_IP, IPT_SO_SET_ADD_COUNTERS, “filter\0\0”…, 920) 设置规则计数器

3. 相关宏和结构

下面是getsockopt和setsockopt用到的optname的宏定义。

/*
 * Option names (the optname argument) of the IPv4 tables sockopt interface.
 * The SET and GET families are numbered independently from the same base,
 * so IPT_SO_SET_REPLACE and IPT_SO_GET_INFO expand to the same value; which
 * one is meant depends on whether the call is setsockopt() or getsockopt().
 */
#define IPT_BASE_CTL		64

/* Option names used with setsockopt(). */
#define IPT_SO_SET_REPLACE	(IPT_BASE_CTL)
#define IPT_SO_SET_ADD_COUNTERS	(IPT_BASE_CTL + 1)
/* Alias for the highest-numbered SET option. */
#define IPT_SO_SET_MAX		IPT_SO_SET_ADD_COUNTERS

/* Option names used with getsockopt(). */
#define IPT_SO_GET_INFO			(IPT_BASE_CTL)
#define IPT_SO_GET_ENTRIES		(IPT_BASE_CTL + 1)
#define IPT_SO_GET_REVISION_MATCH	(IPT_BASE_CTL + 2)
#define IPT_SO_GET_REVISION_TARGET	(IPT_BASE_CTL + 3)
/* Alias for the highest-numbered GET option. */
#define IPT_SO_GET_MAX			IPT_SO_GET_REVISION_TARGET

当执行 getsockopt(7, SOL_IP, IPT_SO_GET_INFO, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [84]) = 0 时,系统会返回以下结构数据给调用者。

/*
 * The argument to IPT_SO_GET_INFO.
 *
 * The caller names a table; the kernel fills in every other field with a
 * summary of that table.
 */
struct ipt_getinfo {
	/* Which table: caller fills this in. */
	char name[XT_TABLE_MAXNAMELEN];

	/* Kernel fills these in. */
	/* Which hook entry points are valid: bitmask */
	unsigned int valid_hooks;

	/* Hook entry points: one per netfilter hook. */
	unsigned int hook_entry[NF_INET_NUMHOOKS];

	/* Underflow points: one per netfilter hook, like hook_entry. */
	unsigned int underflow[NF_INET_NUMHOOKS];

	/* Number of entries */
	unsigned int num_entries;

	/*
	 * Size of entries.
	 * NOTE(review): presumably the byte count a caller then passes as
	 * ipt_get_entries.size -- confirm against get_entries().
	 */
	unsigned int size;
};

当执行 getsockopt(7, SOL_IP, IPT_SO_GET_ENTRIES, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [12744]) = 0 时,系统会返回以下结构数据给调用者。 传入ipt_get_entries结构,携带name和size。内核将所有entry拷贝到entrytable中。 为什么要这么做呢?因为 iptables 内核模块本身就是基于“表结构”设计的,规则之间具有偏移/跳转/顺序等逻辑关系,必须整体替换,才能保持一致性和正确性。

/*
 * The argument to IPT_SO_GET_ENTRIES.
 *
 * A fixed header (table name and size) followed by a variable-length area
 * that receives the table's entries.
 */
struct ipt_get_entries {
	/* Which table: user fills this in. */
	char name[XT_TABLE_MAXNAMELEN];

	/* User fills this in: total entry size. */
	unsigned int size;

	/*
	 * The entries. Flexible array member: it adds no storage of its own,
	 * so the caller's buffer has to extend past the header to hold them.
	 */
	struct ipt_entry entrytable[];
};

/*
 * This structure defines each of the firewall rules.  Consists of 3
 * parts which are 1) general IP header stuff 2) match specific
 * stuff 3) the target to perform if the rule matches.
 *
 * Parts 2 and 3 are variable-length and live in elems[]; the two offset
 * fields below record where they end.
 */
struct ipt_entry {
	/* Part 1: the general IP header criteria. */
	struct ipt_ip ip;

	/* Mark with fields that we care about. */
	unsigned int nfcache;

	/*
	 * Size of ipt_entry + matches. Since elems[] holds the matches
	 * followed by the target, this is also the byte offset from the
	 * start of the entry to its target.
	 */
	__u16 target_offset;
	/*
	 * Size of ipt_entry + matches + target, i.e. the total size of this
	 * entry.
	 */
	__u16 next_offset;

	/* Back pointer */
	unsigned int comefrom;

	/* Packet and byte counters. */
	struct xt_counters counters;

	/* The matches (if any), then the target. */
	unsigned char elems[];
};

对于iptables规则设置命令,setsockopt(IPT_SO_SET_REPLACE)传入的是以下结构。

/*
 * The argument to IPT_SO_SET_REPLACE.
 *
 * A fixed header describing the new table, followed by the new entries
 * themselves in the trailing entries[] area.
 */
struct ipt_replace {
	/* Which table. */
	char name[XT_TABLE_MAXNAMELEN];

	/*
	 * Which hook entry points are valid: bitmask.  You can't
	 * change this.
	 */
	unsigned int valid_hooks;

	/* Number of entries */
	unsigned int num_entries;

	/* Total size of new entries */
	unsigned int size;

	/* Hook entry points. */
	unsigned int hook_entry[NF_INET_NUMHOOKS];

	/* Underflow points. */
	unsigned int underflow[NF_INET_NUMHOOKS];

	/* Information about old entries: */
	/* Number of counters (must be equal to current number of entries). */
	unsigned int num_counters;
	/*
	 * The old entries' counters. Annotated __user: the pointer refers to
	 * caller-owned memory rather than to data embedded in this structure.
	 * NOTE(review): presumably the kernel writes the replaced table's
	 * counters here -- confirm against do_replace().
	 */
	struct xt_counters __user *counters;

	/* The entries (hang off end: not really an array). */
	struct ipt_entry entries[];
};

4. 内核对应的处理函数

a. getsockopt的对应处理

当iptables调用getsockopt时,最终会传递给 do_ipt_get_ctl 函数处理。

/*
 * Dispatch an IPT_SO_GET_* request to its handler.
 *
 * @sk:   socket the request arrived on; its network namespace selects both
 *        the permission check and the tables that are queried.
 * @cmd:  one of the IPT_SO_GET_* option names.
 * @user: user-space buffer carrying the request (and receiving the reply
 *        in the handlers that write one).
 * @len:  pointer to the length of @user.
 *
 * Returns -EPERM when the caller lacks CAP_NET_ADMIN, -EINVAL for an unknown
 * @cmd or a wrongly sized revision request, -EFAULT when the revision
 * request cannot be copied in, and otherwise whatever the selected handler
 * produces.
 */
static int
do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
	int ret;

	/*
	 * CAP_NET_ADMIN is checked against the user namespace of the socket's
	 * network namespace.
	 */
	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	switch (cmd) {
	case IPT_SO_GET_INFO:
		ret = get_info(sock_net(sk), user, len);
		break;

	case IPT_SO_GET_ENTRIES:
		/*
		 * With compat support built in, a compat syscall takes the
		 * compat handler. The trailing "else" is inside the #ifdef, so
		 * without compat support only the native call below remains.
		 */
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
		if (in_compat_syscall())
			ret = compat_get_entries(sock_net(sk), user, len);
		else
#endif
			ret = get_entries(sock_net(sk), user, len);
		break;

	/* Both revision queries share one code path; only "target" differs. */
	case IPT_SO_GET_REVISION_MATCH:
	case IPT_SO_GET_REVISION_TARGET: {
		struct xt_get_revision rev;
		int target;

		/* The request must be exactly one struct xt_get_revision. */
		if (*len != sizeof(rev)) {
			ret = -EINVAL;
			break;
		}
		if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
			ret = -EFAULT;
			break;
		}
		/* The name comes from user space: force NUL termination. */
		rev.name[sizeof(rev.name)-1] = 0;

		if (cmd == IPT_SO_GET_REVISION_TARGET)
			target = 1;
		else
			target = 0;

		/*
		 * ret is not assigned directly here: xt_find_revision()
		 * receives &ret and is the only place it can be set.
		 * NOTE(review): presumably try_then_request_module() loads
		 * "ipt_<name>" and retries the lookup when the first attempt
		 * fails -- confirm against the macro definition.
		 */
		try_then_request_module(xt_find_revision(AF_INET, rev.name,
							 rev.revision,
							 target, &ret),
					"ipt_%s", rev.name);
		break;
	}

	default:
		ret = -EINVAL;
	}

	return ret;
}

b. setsockopt的对应处理

当iptables调用setsockopt时,最终会传递给 do_ipt_set_ctl 函数处理。

/*
 * Dispatch an IPT_SO_SET_* request to its handler.
 *
 * @sk:  socket the request arrived on; its network namespace selects both
 *       the permission check and the tables that are modified.
 * @cmd: one of the IPT_SO_SET_* option names.
 * @arg: the option value supplied by the caller.
 * @len: length of @arg.
 *
 * Returns -EPERM when the caller lacks CAP_NET_ADMIN, -EINVAL for an unknown
 * @cmd, and otherwise whatever the selected handler returns.
 */
static int
do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len)
{
	int ret;

	/*
	 * CAP_NET_ADMIN is checked against the user namespace of the socket's
	 * network namespace.
	 */
	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	switch (cmd) {
	case IPT_SO_SET_REPLACE:
		/*
		 * With compat support built in, a compat syscall takes the
		 * compat handler. The trailing "else" is inside the #ifdef, so
		 * without compat support only the native call below remains.
		 */
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
		if (in_compat_syscall())
			ret = compat_do_replace(sock_net(sk), arg, len);
		else
#endif
			ret = do_replace(sock_net(sk), arg, len);
		break;

	case IPT_SO_SET_ADD_COUNTERS:
		ret = do_add_counters(sock_net(sk), arg, len);
		break;

	default:
		ret = -EINVAL;
	}

	return ret;
}

下一章我们将深入分析 do_ipt_set_ctl。

本文访问次数:...