I'm using the netlink
and genetlink
Go packages to write a utility to interact with the MAC80211_HWSIM
netlink family functions. It's working well ... almost.
I'm finding that if I do a GET_RADIO
call, the next call into the family fails to validate because the sequence number of the response doesn't match that of the request. Specifically, the pattern I see is
GET_RADIO request: seq=655
GET_RADIO response: seq=655
DEL_RADIO request: seq=656
DEL_RADIO response: seq=655
It doesn't matter which command I issue after GET_RADIO
, this same pattern shows up. If GET_RADIO
isn't in the mix, then the sequence numbers are exactly as expected.
There's an existing command written in C, using libnl
and libgenl
, which I modified to run more than one command. As best I can tell, that implementation does a very similar thing, but doesn't have the issue with the sequence numbers, so I'm disinclined to think the problem lies with the kernel module. But strace clearly shows that the kernel is responding with an out-of-sequence number (please forgive the excessive side-scrolling):
socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC) = 3
bind(3, {sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, 12) = 0
getsockname(3, {sa_family=AF_NETLINK, nl_pid=18357, nl_groups=00000000}, [112->12]) = 0
sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=40, type=0x10 /* NLMSG_??? */, flags=NLM_F_REQUEST, seq=2596996163, pid=18357}, "\3\1\0\0\23\0\2\0MAC80211_HWSIM\0\0"}, iov_len=40}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 40
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=112->12, msg_iov=[{iov_base=[{{len=224, type=0x10 /* NLMSG_??? */, flags=0, seq=2596996163, pid=18357}, "\1\2\0\0\23\0\2\0MAC80211_HWSIM\0\0\6\0\1\0\31\0\0\0"...}, {{len=0, type=0 /* NLMSG_??? */, flags=0, seq=0, pid=0}}], iov_len=4096}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, MSG_PEEK) = 224
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=112->12, msg_iov=[{iov_base=[{{len=224, type=0x10 /* NLMSG_??? */, flags=0, seq=2596996163, pid=18357}, "\1\2\0\0\23\0\2\0MAC80211_HWSIM\0\0\6\0\1\0\31\0\0\0"...}, {{len=0, type=0 /* NLMSG_??? */, flags=0, seq=0, pid=0}}], iov_len=4096}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 224
sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=28, type=0x19 /* NLMSG_??? */, flags=NLM_F_REQUEST|NLM_F_ACK, seq=2596996164, pid=18357}, "\6\1\0\0\10\0
\0T\0\0\0"}, iov_len=28}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 28
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=112->12, msg_iov=[{iov_base=[{{len=48, type=0x19 /* NLMSG_??? */, flags=0, seq=2596996164, pid=18357}, "\6\1\0\0\10\0
\0T\0\0\0\10\0\t\0\1\0\0\0\t\0\21\0phy84\0\0\0"}, {{len=0, type=0 /* NLMSG_??? */, flags=0, seq=0, pid=0}}], iov_len=4096}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, MSG_PEEK) = 48
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=112->12, msg_iov=[{iov_base=[{{len=48, type=0x19 /* NLMSG_??? */, flags=0, seq=2596996164, pid=18357}, "\6\1\0\0\10\0
\0T\0\0\0\10\0\t\0\1\0\0\0\t\0\21\0phy84\0\0\0"}, {{len=0, type=0 /* NLMSG_??? */, flags=0, seq=0, pid=0}}], iov_len=4096}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 48
sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=28, type=0x19 /* NLMSG_??? */, flags=NLM_F_REQUEST|NLM_F_ACK, seq=2596996165, pid=18357}, "\5\1\0\0\10\0
\0T\0\0\0"}, iov_len=28}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 28
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=112->12, msg_iov=[{iov_base=[{{len=36, type=NLMSG_ERROR, flags=0, seq=2596996164, pid=18357}, "\0\0\0\0\34\0\0\0\31\0\5\0D\4\313\232\265G\0\0"}, {{len=0, type=0 /* NLMSG_??? */, flags=0, seq=0, pid=0}}], iov_len=4096}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, MSG_PEEK) = 36
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=112->12, msg_iov=[{iov_base=[{{len=36, type=NLMSG_ERROR, flags=0, seq=2596996164, pid=18357}, "\0\0\0\0\34\0\0\0\31\0\5\0D\4\313\232\265G\0\0"}, {{len=0, type=0 /* NLMSG_??? */, flags=0, seq=0, pid=0}}], iov_len=4096}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 36
close(3) = 0
+++ exited with 0 +++
Note that there are two recvmsg()
calls for each sendmsg()
because the first one simply does a MSG_PEEK
to see if an answer is available. Of the three sets of requests and responses, the first (sequence ...163) is simply getting the family information. The second (sequence 164) is doing the GET_RADIO
(\6
) call, and the third is the one with the out-of-sequence response: this DEL_RADIO
(\5
) call sends a message with a ...165 sequence number, but the response that comes back, otherwise well-formed, has a sequence number matching the previous command: ...164.
Because this is Go, there are multiple threads active at once, but all the calls in the strace output above are from the same thread. The only calls missing from the trace are those to futex()
.
The C version, because it uses the callback mechanisms provided by libnl
and libgenl
, does process the response in a different thread than the request came from. The request trace:
socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC, NETLINK_GENERIC) = 3
setsockopt(3, SOL_SOCKET, SO_SNDBUF, [32768], 4) = 0
setsockopt(3, SOL_SOCKET, SO_RCVBUF, [32768], 4) = 0
getpid() = 21338
bind(3, {sa_family=AF_NETLINK, nl_pid=-1300212902, nl_groups=00000000}, 12) = 0
getsockname(3, {sa_family=AF_NETLINK, nl_pid=-1300212902, nl_groups=00000000}, [12]) = 0
sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=20, type=0x10 /* NLMSG_??? */, flags=NLM_F_REQUEST|NLM_F_ACK|0x300, seq=1530817156, pid=2994754394}, "\3\1\0\0"}, iov_len=20}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{{len=116, type=0x10 /* NLMSG_??? */, flags=NLM_F_MULTI, seq=1530817156, pid=2994754394}, "\1\2\0\0\v\0\2\0nlctrl\0\0\6\0\1\0\20\0\0\0\10\0\3\0\2\0\0\0"...}, {{len=96, type=0x10 /* NLMSG_??? */, flags=NLM_F_MULTI, seq=1530817156, pid=2994754394}, "\1\2\0\0\16\0\2\0VFS_DQUOT\0\0\0\6\0\1\0\21\0\0\0\10\0\3\0"...}, {{len=104, type=0x10 /* NLMSG_??? */, flags=NLM_F_MULTI, seq=1530817156, pid=2994754394}, "\1\2\0\0\17\0\2\0acpi_event\0\0\6\0\1\0\23\0\0\0\10\0\3\0"...}, {{len=108, type=0x10 /* NLMSG_??? */, flags=NLM_F_MULTI, seq=1530817156, pid=2994754394}, "\1\2\0\0\22\0\2\0thermal_event\0\0\0\6\0\1\0\24\0\0\0"...}, {{len=112, type=0x10 /* NLMSG_??? */, flags=NLM_F_MULTI, seq=1530817156, pid=2994754394}, "\1\2\0\0\20\0\2\0tcp_metrics\0\6\0\1\0\25\0\0\0\10\0\3\0"...}, {{len=112, type=0x10 /* NLMSG_??? */, flags=NLM_F_MULTI, seq=1530817156, pid=2994754394}, "\1\2\0\0\16\0\2\0TASKSTATS\0\0\0\6\0\1\0\26\0\0\0\10\0\3\0"...}, {{len=2076, type=0x10 /* NLMSG_??? */, flags=NLM_F_MULTI, seq=1530817156, pid=2994754394}, "\1\2\0\0\f\0\2\0nl80211\0\6\0\1\0\30\0\0\0\10\0\3\0\1\0\0\0"...}, {{len=224, type=0x10 /* NLMSG_??? */, flags=NLM_F_MULTI, seq=1530817156, pid=2994754394}, "\1\2\0\0\23\0\2\0MAC80211_HWSIM\0\0\6\0\1\0\31\0\0\0"...}, {{len=0, type=0 /* NLMSG_??? */, flags=0, seq=0, pid=0}}], iov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 2948
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{{len=20, type=NLMSG_DONE, flags=NLM_F_MULTI, seq=1530817156, pid=2994754394}, "\0\0\0\0"}, {{len=0, type=0 /* NLMSG_??? */, flags=0, seq=0, pid=65542}}], iov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 20
mmap(NULL, 8392704, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7fdb752ce000
mprotect(0x7fdb752ce000, 4096, PROT_NONE) = 0
clone(child_stack=0x7fdb75acdff0, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, parent_tidptr=0x7fdb75ace9d0, tls=0x7fdb75ace700, child_tidptr=0x7fdb75ace9d0) = 21339
sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=28, type=0x19 /* NLMSG_??? */, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1530817157, pid=2994754394}, "\6\1\0\0\10\0
\0_\0\0\0"}, iov_len=28}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 28
nanosleep({tv_sec=2, tv_nsec=0}, 0x7ffff64ac200) = 0
write(1, "Deleting radio with id '95'...
", 31) = 31
sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=28, type=0x19 /* NLMSG_??? */, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1530817158, pid=2994754394}, "\5\1\0\0\10\0
\0_\0\0\0"}, iov_len=28}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 28
nanosleep({tv_sec=2, tv_nsec=0}, <unfinished ...>) = ?
+++ exited with 0 +++
and the response trace:
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{{len=48, type=0x19 /* NLMSG_??? */, flags=0, seq=1530817157, pid=2994754394}, "\6\1\0\0\10\0
\0_\0\0\0\10\0\t\0\1\0\0\0\t\0\21\0phy95\0\0\0"}, {{len=0, type=0 /* NLMSG_??? */, flags=0, seq=0, pid=0}}], iov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 48
epoll_wait(4, [{EPOLLIN, {u32=3, u64=3}}], 32, -1) = 1
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{{len=36, type=NLMSG_ERROR, flags=0, seq=1530817157, pid=2994754394}, "\0\0\0\0\34\0\0\0\31\0\5\0\205j>[ZS\200\262"}, {{len=1114121, type=0x6870 /* NLMSG_??? */, flags=NLM_F_REQUEST|NLM_F_ECHO|NLM_F_DUMP_INTR|NLM_F_DUMP_FILTERED|0x3940, seq=53, pid=0}, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...}], iov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 36
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 0), ...}) = 0
write(1, "new SSID defined to interface 95"..., 33) = 33
epoll_wait(4, [{EPOLLIN, {u32=3, u64=3}}], 32, -1) = 1
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[{{len=36, type=NLMSG_ERROR, flags=0, seq=1530817158, pid=2994754394}, "\0\0\0\0\34\0\0\0\31\0\5\0\206j>[ZS\200\262"}, {{len=1114121, type=0x6870 /* NLMSG_??? */, flags=NLM_F_REQUEST|NLM_F_ECHO|NLM_F_DUMP_INTR|NLM_F_DUMP_FILTERED|0x3940, seq=53, pid=0}, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...}], iov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 36
write(1, "Successfully deleted device with"..., 39) = 39
exit_group(0) = ?
+++ exited with 0 +++
The only thing that seems relevant to me (other than the threading structure) is that it uses different flags (NLM_F_DUMP_INTR|NLM_F_DUMP_FILTERED
in the C code, 0
in the Go code).
I'm not sure what other tools I can bring to bear on this, since I don't know how those sequence numbers are constructed. There didn't seem to me to be anything unusually wrong with the code in the kernel module, in the Go packages, or in my Go code, but there could always be a mismatch between the way an interface was supposed to be used and the way it actually was. Does this problem seem familiar to anyone, or does anyone have any thoughts on how to make progress on this?