[iptables]Series 2: How Rules Are Passed to the Kernel
发表于 2025-04-02
规则如何下发到内核的
1. 关于iptables下发规则的疑问
iptables的基础操作大家已经了解了。但是有求知欲的同学就会思考: iptables规则如何下发给内核的? 通过ioctl还是socket? iptables规则下发是一次下发一条还是一次下发多条?一次只下发表中的一个链还是整个表? 等等等等。
带着这样的疑问,我们开始了探索的脚步。
2. iptables相关系统调用(基于linux 6.1.62)
我们用strace查看iptables的系统调用。
a. trace iptables 查看操作
# strace /bin/iptables -L //表的获取
execve("/bin/iptables", ["/bin/iptables", "-L"], 0x7ffeb054e228 /* 4 vars */) = 0
//省略无关,只保留主要系统交互
socket(AF_INET, SOCK_RAW, IPPROTO_RAW) = 7
getsockopt(7, SOL_IP, IPT_SO_GET_INFO, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [84]) = 0
getsockopt(7, SOL_IP, IPT_SO_GET_ENTRIES, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [12744]) = 0
socket(AF_INET, SOCK_RAW, IPPROTO_RAW) = 8
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, "iprange\0\264\220\"_7\177\0\0\230\323O_7\177\0\0\2\0\0\0\0\1", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, "connlimit\0\"_7\177\0\0\230\323O_7\177\0\0\2\0\0\0\0\1", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_TARGET, "MARK\0\177\0\0B\344!_7\177\0\0@\217&\230\377\177\0\0\2\0\0\0\1\2", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_TARGET, "MARK\0\177\0\0B\344!_7\177\0\0\300\217&\230\377\177\0\0\0\0\0\0\0\2", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, "conntrack\0!_7\177\0\0h\270!_7\177\0\0\2\0\0\0\2\3", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, "conntrack\0!_7\177\0\0h\270!_7\177\0\0\2\0\0\0\1\3", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, "conntrack\0!_7\177\0\0h\270!_7\177\0\0\2\0\0\0\0\3", [30]) = 0
getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, "mark\0\0\0\0U0 _7\177\0\0h\270!_7\177\0\0\0\0\0\0\0\1", [30]) = 0
| 系统调用 | 作用 |
|---|---|
| socket(AF_INET, SOCK_RAW, IPPROTO_RAW) | 创建一个原始 socket 作为 netfilter 控制通道 |
| getsockopt(7, SOL_IP, IPT_SO_GET_INFO, “filter\0\0”…, [84]) | 获取表的总体信息(有效hook、各hook的入口/underflow偏移、entry数量、entry总大小等) |
| getsockopt(7, SOL_IP, IPT_SO_GET_ENTRIES, “filter\0\0”…, [12744]) | 获取该表中所有链的规则 |
| getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_MATCH, “iprange\0”…, [30]) | 检查 match 模块是否版本匹配 |
| getsockopt(8, SOL_IP, IPT_SO_GET_REVISION_TARGET, “MARK\0”…, [30]) | 检查 target 模块是否版本匹配 |
第一个问题的答案已经呼之欲出。
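从strace中,可以看到iptables先创建一个原始socket,再使用getsockopt来向内核请求filter表的信息,也就是说iptables是通过socket选项(而不是ioctl)与内核交互的(较新的iptables-nft后端改为使用socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER),通过netlink与nf_tables交互;本文分析的是传统的iptables-legacy方式,相比之下getsockopt逻辑和结构更清晰易懂)。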
b. trace iptables 配置操作
# strace /bin/iptables -A INPUT -d 1.2.3.4 -p tcp --dport 22 -j DROP
execve("/bin/iptables", ["/bin/iptables", "-A", "INPUT", "-d", "1.2.3.4", "-p", "tcp", "--dport", "22", "-j", "DROP"], 0x7ffe0cf2dda0 /* 4 vars */) = 0
//省略无关,只保留主要系统交互
socket(AF_INET, SOCK_RAW, IPPROTO_RAW) = 7
getsockopt(7, SOL_IP, IPT_SO_GET_INFO, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [84]) = 0
getsockopt(7, SOL_IP, IPT_SO_GET_ENTRIES, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [12744]) = 0
setsockopt(7, SOL_IP, IPT_SO_SET_REPLACE, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 13064) = 0
setsockopt(7, SOL_IP, IPT_SO_SET_ADD_COUNTERS, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 920) = 0
从strace中,我们可以看到前两个getsockopt调用我们已经了解了,分别用于获取filter表的信息和规则。原有整体结构大小为12744。下一步setsockopt替换操作的整体结构内容大小为13064,多出来的320个字节,主要来自新添加规则对应的entry(注意两次调用的头部结构不同,分别是ipt_get_entries和ipt_replace,因此这个差值并不严格等于新entry的大小)。
第二个问题的答案也显而易见,iptables将整个filter表通过setsockopt传递给内核用来整体替换旧的filter表。
| 系统调用 | 作用 |
|---|---|
| setsockopt(7, SOL_IP, IPT_SO_SET_REPLACE, “filter\0\0”…, 13064) | 整体替换filter表 |
| setsockopt(7, SOL_IP, IPT_SO_SET_ADD_COUNTERS, “filter\0\0”…, 920) | 设置规则计数器 |
3. 相关宏和结构
下面是getsockopt和setsockopt用到的optname的宏定义。
/*
 * Option numbers (optname) used with setsockopt()/getsockopt() for ip_tables.
 * The SET and GET ranges both start at IPT_BASE_CTL, so IPT_SO_SET_REPLACE
 * and IPT_SO_GET_INFO share the value 64, and IPT_SO_SET_ADD_COUNTERS and
 * IPT_SO_GET_ENTRIES share the value 65. Which one is meant depends on
 * whether the caller used setsockopt() or getsockopt().
 */
#define IPT_BASE_CTL 64
/* Options accepted by setsockopt(). */
#define IPT_SO_SET_REPLACE (IPT_BASE_CTL)
#define IPT_SO_SET_ADD_COUNTERS (IPT_BASE_CTL + 1)
/* Highest valid SET option number. */
#define IPT_SO_SET_MAX IPT_SO_SET_ADD_COUNTERS
/* Options accepted by getsockopt(). */
#define IPT_SO_GET_INFO (IPT_BASE_CTL)
#define IPT_SO_GET_ENTRIES (IPT_BASE_CTL + 1)
#define IPT_SO_GET_REVISION_MATCH (IPT_BASE_CTL + 2)
#define IPT_SO_GET_REVISION_TARGET (IPT_BASE_CTL + 3)
/* Highest valid GET option number. */
#define IPT_SO_GET_MAX IPT_SO_GET_REVISION_TARGET
当执行 getsockopt(7, SOL_IP, IPT_SO_GET_INFO, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [84]) = 0 时,系统会返回以下结构数据给调用者。
/* The argument to IPT_SO_GET_INFO.
 * The caller supplies only the table name; every other field is an output
 * filled in by the kernel. */
struct ipt_getinfo {
/* Which table: caller fills this in. */
char name[XT_TABLE_MAXNAMELEN];
/* Kernel fills these in. */
/* Which hook entry points are valid: bitmask */
unsigned int valid_hooks;
/* Hook entry points: one per netfilter hook. */
unsigned int hook_entry[NF_INET_NUMHOOKS];
/* Underflow points: one per netfilter hook. */
unsigned int underflow[NF_INET_NUMHOOKS];
/* Number of entries */
unsigned int num_entries;
/* Size of entries.
 * NOTE(review): presumably the total size in bytes of the table's entries,
 * i.e. the value to pass as ipt_get_entries.size -- confirm. */
unsigned int size;
};
当执行 getsockopt(7, SOL_IP, IPT_SO_GET_ENTRIES, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [12744]) = 0 时,系统会返回以下结构数据给调用者。
传入ipt_get_entries结构,携带name和size。内核将所有entry拷贝到entrytable中。
为什么要这么做呢?因为 iptables 内核模块本身就是基于“表结构”设计的,规则之间具有偏移/跳转/顺序等逻辑关系,必须整体替换,才能保持一致性和正确性。
/* The argument to IPT_SO_GET_ENTRIES.
 * The user supplies name and size; the entries themselves are returned in
 * the flexible array that trails the header. */
struct ipt_get_entries {
/* Which table: user fills this in. */
char name[XT_TABLE_MAXNAMELEN];
/* User fills this in: total entry size. */
unsigned int size;
/* The entries. Each struct ipt_entry itself ends in a flexible elems[]
 * member, so entries are not fixed-size; next_offset in each entry gives
 * that entry's full size. */
struct ipt_entry entrytable[];
};
/* This structure defines each of the firewall rules. Consists of 3
parts which are 1) general IP header stuff 2) match specific
stuff 3) the target to perform if the rule matches.
Parts 2 and 3 are variable-length and live in the trailing elems[]
array; target_offset and next_offset locate them. */
struct ipt_entry {
/* 1) General IP header match criteria. */
struct ipt_ip ip;
/* Mark with fields that we care about. */
unsigned int nfcache;
/* Size of ipt_entry + matches, i.e. where the target starts relative to
 * the beginning of this entry. */
__u16 target_offset;
/* Size of ipt_entry + matches + target, i.e. the full size of this
 * entry. */
__u16 next_offset;
/* Back pointer */
unsigned int comefrom;
/* Packet and byte counters. */
struct xt_counters counters;
/* The matches (if any), then the target. */
unsigned char elems[];
};
对于iptables规则设置命令,setsockopt(IPT_SO_SET_REPLACE)传入的是下面的ipt_replace结构,其末尾携带整张表的全部entry。
/* The argument to IPT_SO_SET_REPLACE.
 * Describes a complete replacement table: a fixed header followed by all of
 * the new entries in the trailing flexible array. */
struct ipt_replace {
/* Which table. */
char name[XT_TABLE_MAXNAMELEN];
/* Which hook entry points are valid: bitmask. You can't
change this. */
unsigned int valid_hooks;
/* Number of entries */
unsigned int num_entries;
/* Total size of new entries */
unsigned int size;
/* Hook entry points: one per netfilter hook. */
unsigned int hook_entry[NF_INET_NUMHOOKS];
/* Underflow points: one per netfilter hook. */
unsigned int underflow[NF_INET_NUMHOOKS];
/* Information about old entries: */
/* Number of counters (must be equal to current number of entries). */
unsigned int num_counters;
/* The old entries' counters. Annotated __user: this is a user-space
 * pointer.
 * NOTE(review): presumably the kernel writes the replaced table's counters
 * into this buffer -- confirm against do_replace(). */
struct xt_counters __user *counters;
/* The entries (hang off end: not really an array). */
struct ipt_entry entries[];
};
4. 内核对应的处理函数
a. getsockopt的对应处理
当iptables调用getsockopt时,最终会传递给 do_ipt_get_ctl 函数处理。
/*
 * Dispatch an ip_tables getsockopt() request to its handler.
 *
 * sk:   socket the option was requested on; used here only to reach its
 *       network namespace via sock_net().
 * cmd:  one of the IPT_SO_GET_* option numbers.
 * user: user-space buffer holding the request.
 * len:  pointer to the length of that buffer.
 *
 * Returns -EPERM if the caller lacks CAP_NET_ADMIN in the user namespace
 * owning the socket's network namespace, -EINVAL for an unknown cmd or a
 * revision request of the wrong size, -EFAULT if the revision request
 * cannot be copied from user space; otherwise whatever the selected helper
 * produced.
 */
static int
do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
int ret;
/* Reading tables is a privileged operation as well. */
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
case IPT_SO_GET_INFO:
ret = get_info(sock_net(sk), user, len);
break;
case IPT_SO_GET_ENTRIES:
/* With compat support built in, the "else" inside the #ifdef binds to the
 * get_entries() call after the #endif; without it only the native call
 * remains. */
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
if (in_compat_syscall())
ret = compat_get_entries(sock_net(sk), user, len);
else
#endif
ret = get_entries(sock_net(sk), user, len);
break;
case IPT_SO_GET_REVISION_MATCH:
case IPT_SO_GET_REVISION_TARGET: {
struct xt_get_revision rev;
int target;
/* The request must be exactly one struct xt_get_revision. */
if (*len != sizeof(rev)) {
ret = -EINVAL;
break;
}
if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
ret = -EFAULT;
break;
}
/* The name came from user space and is used as a string below, so force
 * NUL termination. */
rev.name[sizeof(rev.name)-1] = 0;
/* target selects a target lookup (1) versus a match lookup (0). */
if (cmd == IPT_SO_GET_REVISION_TARGET)
target = 1;
else
target = 0;
/* On this path ret is assigned only through the &ret out-parameter of
 * xt_find_revision().
 * NOTE(review): try_then_request_module() presumably retries the lookup
 * after trying to load module "ipt_<name>" -- confirm against its
 * definition. */
try_then_request_module(xt_find_revision(AF_INET, rev.name,
rev.revision,
target, &ret),
"ipt_%s", rev.name);
break;
}
default:
ret = -EINVAL;
}
return ret;
}
b. setsockopt的对应处理
当iptables调用setsockopt时,最终会传递给 do_ipt_set_ctl 函数处理。
/*
 * Dispatch an ip_tables setsockopt() request to its handler.
 *
 * sk:  socket the option was set on; used here only to reach its network
 *      namespace via sock_net().
 * cmd: one of the IPT_SO_SET_* option numbers.
 * arg: the option value supplied by the caller, as a sockptr_t.
 * len: length of the option value.
 *
 * Returns -EPERM if the caller lacks CAP_NET_ADMIN in the user namespace
 * owning the socket's network namespace, -EINVAL for an unknown cmd;
 * otherwise whatever the selected helper returned.
 */
static int
do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len)
{
int ret;
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
case IPT_SO_SET_REPLACE:
/* With compat support built in, the "else" inside the #ifdef binds to the
 * do_replace() call after the #endif; without it only the native call
 * remains. */
#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
if (in_compat_syscall())
ret = compat_do_replace(sock_net(sk), arg, len);
else
#endif
ret = do_replace(sock_net(sk), arg, len);
break;
case IPT_SO_SET_ADD_COUNTERS:
ret = do_add_counters(sock_net(sk), arg, len);
break;
default:
ret = -EINVAL;
}
return ret;
}
下一章我们将深入分析 do_ipt_get_ctl 和 do_ipt_set_ctl 。
本文访问次数:... 次