main
1//go:build linux
2
3// Copyright (C) 2024 SUSE LLC. All rights reserved.
4// Use of this source code is governed by a BSD-style
5// license that can be found in the LICENSE file.
6
7package securejoin
8
9import (
10 "errors"
11 "fmt"
12 "os"
13 "runtime"
14 "strconv"
15
16 "golang.org/x/sys/unix"
17)
18
19func fstat(f *os.File) (unix.Stat_t, error) {
20 var stat unix.Stat_t
21 if err := unix.Fstat(int(f.Fd()), &stat); err != nil {
22 return stat, &os.PathError{Op: "fstat", Path: f.Name(), Err: err}
23 }
24 return stat, nil
25}
26
27func fstatfs(f *os.File) (unix.Statfs_t, error) {
28 var statfs unix.Statfs_t
29 if err := unix.Fstatfs(int(f.Fd()), &statfs); err != nil {
30 return statfs, &os.PathError{Op: "fstatfs", Path: f.Name(), Err: err}
31 }
32 return statfs, nil
33}
34
// Identity of a procfs root. The kernel guarantees that the root inode of a
// procfs mount reports an f_type of PROC_SUPER_MAGIC and an st_ino of
// PROC_ROOT_INO; verifyProcRoot compares against both values.
const (
	procRootIno    = 1      // PROC_ROOT_INO
	procSuperMagic = 0x9fa0 // PROC_SUPER_MAGIC
)
41
42func verifyProcRoot(procRoot *os.File) error {
43 if statfs, err := fstatfs(procRoot); err != nil {
44 return err
45 } else if statfs.Type != procSuperMagic {
46 return fmt.Errorf("%w: incorrect procfs root filesystem type 0x%x", errUnsafeProcfs, statfs.Type)
47 }
48 if stat, err := fstat(procRoot); err != nil {
49 return err
50 } else if stat.Ino != procRootIno {
51 return fmt.Errorf("%w: incorrect procfs root inode number %d", errUnsafeProcfs, stat.Ino)
52 }
53 return nil
54}
55
56var hasNewMountApi = sync_OnceValue(func() bool {
57 // All of the pieces of the new mount API we use (fsopen, fsconfig,
58 // fsmount, open_tree) were added together in Linux 5.1[1,2], so we can
59 // just check for one of the syscalls and the others should also be
60 // available.
61 //
62 // Just try to use open_tree(2) to open a file without OPEN_TREE_CLONE.
63 // This is equivalent to openat(2), but tells us if open_tree is
64 // available (and thus all of the other basic new mount API syscalls).
65 // open_tree(2) is most light-weight syscall to test here.
66 //
67 // [1]: merge commit 400913252d09
68 // [2]: <https://lore.kernel.org/lkml/153754740781.17872.7869536526927736855.stgit@warthog.procyon.org.uk/>
69 fd, err := unix.OpenTree(-int(unix.EBADF), "/", unix.OPEN_TREE_CLOEXEC)
70 if err != nil {
71 return false
72 }
73 _ = unix.Close(fd)
74 return true
75})
76
77func fsopen(fsName string, flags int) (*os.File, error) {
78 // Make sure we always set O_CLOEXEC.
79 flags |= unix.FSOPEN_CLOEXEC
80 fd, err := unix.Fsopen(fsName, flags)
81 if err != nil {
82 return nil, os.NewSyscallError("fsopen "+fsName, err)
83 }
84 return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil
85}
86
87func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) {
88 // Make sure we always set O_CLOEXEC.
89 flags |= unix.FSMOUNT_CLOEXEC
90 fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs)
91 if err != nil {
92 return nil, os.NewSyscallError("fsmount "+ctx.Name(), err)
93 }
94 return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil
95}
96
97func newPrivateProcMount() (*os.File, error) {
98 procfsCtx, err := fsopen("proc", unix.FSOPEN_CLOEXEC)
99 if err != nil {
100 return nil, err
101 }
102 defer procfsCtx.Close()
103
104 // Try to configure hidepid=ptraceable,subset=pid if possible, but ignore errors.
105 _ = unix.FsconfigSetString(int(procfsCtx.Fd()), "hidepid", "ptraceable")
106 _ = unix.FsconfigSetString(int(procfsCtx.Fd()), "subset", "pid")
107
108 // Get an actual handle.
109 if err := unix.FsconfigCreate(int(procfsCtx.Fd())); err != nil {
110 return nil, os.NewSyscallError("fsconfig create procfs", err)
111 }
112 return fsmount(procfsCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_NOSUID)
113}
114
115func openTree(dir *os.File, path string, flags uint) (*os.File, error) {
116 dirFd := -int(unix.EBADF)
117 dirName := "."
118 if dir != nil {
119 dirFd = int(dir.Fd())
120 dirName = dir.Name()
121 }
122 // Make sure we always set O_CLOEXEC.
123 flags |= unix.OPEN_TREE_CLOEXEC
124 fd, err := unix.OpenTree(dirFd, path, flags)
125 if err != nil {
126 return nil, &os.PathError{Op: "open_tree", Path: path, Err: err}
127 }
128 return os.NewFile(uintptr(fd), dirName+"/"+path), nil
129}
130
131func clonePrivateProcMount() (_ *os.File, Err error) {
132 // Try to make a clone without using AT_RECURSIVE if we can. If this works,
133 // we can be sure there are no over-mounts and so if the root is valid then
134 // we're golden. Otherwise, we have to deal with over-mounts.
135 procfsHandle, err := openTree(nil, "/proc", unix.OPEN_TREE_CLONE)
136 if err != nil || hookForcePrivateProcRootOpenTreeAtRecursive(procfsHandle) {
137 procfsHandle, err = openTree(nil, "/proc", unix.OPEN_TREE_CLONE|unix.AT_RECURSIVE)
138 }
139 if err != nil {
140 return nil, fmt.Errorf("creating a detached procfs clone: %w", err)
141 }
142 defer func() {
143 if Err != nil {
144 _ = procfsHandle.Close()
145 }
146 }()
147 if err := verifyProcRoot(procfsHandle); err != nil {
148 return nil, err
149 }
150 return procfsHandle, nil
151}
152
153func privateProcRoot() (*os.File, error) {
154 if !hasNewMountApi() || hookForceGetProcRootUnsafe() {
155 return nil, fmt.Errorf("new mount api: %w", unix.ENOTSUP)
156 }
157 // Try to create a new procfs mount from scratch if we can. This ensures we
158 // can get a procfs mount even if /proc is fake (for whatever reason).
159 procRoot, err := newPrivateProcMount()
160 if err != nil || hookForcePrivateProcRootOpenTree(procRoot) {
161 // Try to clone /proc then...
162 procRoot, err = clonePrivateProcMount()
163 }
164 return procRoot, err
165}
166
167func unsafeHostProcRoot() (_ *os.File, Err error) {
168 procRoot, err := os.OpenFile("/proc", unix.O_PATH|unix.O_NOFOLLOW|unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
169 if err != nil {
170 return nil, err
171 }
172 defer func() {
173 if Err != nil {
174 _ = procRoot.Close()
175 }
176 }()
177 if err := verifyProcRoot(procRoot); err != nil {
178 return nil, err
179 }
180 return procRoot, nil
181}
182
183func doGetProcRoot() (*os.File, error) {
184 procRoot, err := privateProcRoot()
185 if err != nil {
186 // Fall back to using a /proc handle if making a private mount failed.
187 // If we have openat2, at least we can avoid some kinds of over-mount
188 // attacks, but without openat2 there's not much we can do.
189 procRoot, err = unsafeHostProcRoot()
190 }
191 return procRoot, err
192}
193
194var getProcRoot = sync_OnceValues(func() (*os.File, error) {
195 return doGetProcRoot()
196})
197
198var hasProcThreadSelf = sync_OnceValue(func() bool {
199 return unix.Access("/proc/thread-self/", unix.F_OK) == nil
200})
201
// errUnsafeProcfs is the sentinel wrapped into errors returned when a procfs
// handle fails one of the safety checks in this file (wrong filesystem type,
// wrong root inode, unobtainable or mismatching mount ID, or a failed open
// below the procfs root). Match it with errors.Is.
var errUnsafeProcfs = errors.New("unsafe procfs detected")
203
// procThreadSelfCloser is the cleanup callback returned by procThreadSelf.
// It must be called once the caller is done with the returned handle; the
// value procThreadSelf hands out is runtime.UnlockOSThread.
type procThreadSelfCloser func()
205
206// procThreadSelf returns a handle to /proc/thread-self/<subpath> (or an
207// equivalent handle on older kernels where /proc/thread-self doesn't exist).
208// Once finished with the handle, you must call the returned closer function
209// (runtime.UnlockOSThread). You must not pass the returned *os.File to other
210// Go threads or use the handle after calling the closer.
211//
212// This is similar to ProcThreadSelf from runc, but with extra hardening
213// applied and using *os.File.
214func procThreadSelf(procRoot *os.File, subpath string) (_ *os.File, _ procThreadSelfCloser, Err error) {
215 // We need to lock our thread until the caller is done with the handle
216 // because between getting the handle and using it we could get interrupted
217 // by the Go runtime and hit the case where the underlying thread is
218 // swapped out and the original thread is killed, resulting in
219 // pull-your-hair-out-hard-to-debug issues in the caller.
220 runtime.LockOSThread()
221 defer func() {
222 if Err != nil {
223 runtime.UnlockOSThread()
224 }
225 }()
226
227 // Figure out what prefix we want to use.
228 threadSelf := "thread-self/"
229 if !hasProcThreadSelf() || hookForceProcSelfTask() {
230 /// Pre-3.17 kernels don't have /proc/thread-self, so do it manually.
231 threadSelf = "self/task/" + strconv.Itoa(unix.Gettid()) + "/"
232 if _, err := fstatatFile(procRoot, threadSelf, unix.AT_SYMLINK_NOFOLLOW); err != nil || hookForceProcSelf() {
233 // In this case, we running in a pid namespace that doesn't match
234 // the /proc mount we have. This can happen inside runc.
235 //
236 // Unfortunately, there is no nice way to get the correct TID to
237 // use here because of the age of the kernel, so we have to just
238 // use /proc/self and hope that it works.
239 threadSelf = "self/"
240 }
241 }
242
243 // Grab the handle.
244 var (
245 handle *os.File
246 err error
247 )
248 if hasOpenat2() {
249 // We prefer being able to use RESOLVE_NO_XDEV if we can, to be
250 // absolutely sure we are operating on a clean /proc handle that
251 // doesn't have any cheeky overmounts that could trick us (including
252 // symlink mounts on top of /proc/thread-self). RESOLVE_BENEATH isn't
253 // strictly needed, but just use it since we have it.
254 //
255 // NOTE: /proc/self is technically a magic-link (the contents of the
256 // symlink are generated dynamically), but it doesn't use
257 // nd_jump_link() so RESOLVE_NO_MAGICLINKS allows it.
258 //
259 // NOTE: We MUST NOT use RESOLVE_IN_ROOT here, as openat2File uses
260 // procSelfFdReadlink to clean up the returned f.Name() if we use
261 // RESOLVE_IN_ROOT (which would lead to an infinite recursion).
262 handle, err = openat2File(procRoot, threadSelf+subpath, &unix.OpenHow{
263 Flags: unix.O_PATH | unix.O_NOFOLLOW | unix.O_CLOEXEC,
264 Resolve: unix.RESOLVE_BENEATH | unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_MAGICLINKS,
265 })
266 if err != nil {
267 // TODO: Once we bump the minimum Go version to 1.20, we can use
268 // multiple %w verbs for this wrapping. For now we need to use a
269 // compatibility shim for older Go versions.
270 //err = fmt.Errorf("%w: %w", errUnsafeProcfs, err)
271 return nil, nil, wrapBaseError(err, errUnsafeProcfs)
272 }
273 } else {
274 handle, err = openatFile(procRoot, threadSelf+subpath, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
275 if err != nil {
276 // TODO: Once we bump the minimum Go version to 1.20, we can use
277 // multiple %w verbs for this wrapping. For now we need to use a
278 // compatibility shim for older Go versions.
279 //err = fmt.Errorf("%w: %w", errUnsafeProcfs, err)
280 return nil, nil, wrapBaseError(err, errUnsafeProcfs)
281 }
282 defer func() {
283 if Err != nil {
284 _ = handle.Close()
285 }
286 }()
287 // We can't detect bind-mounts of different parts of procfs on top of
288 // /proc (a-la RESOLVE_NO_XDEV), but we can at least be sure that we
289 // aren't on the wrong filesystem here.
290 if statfs, err := fstatfs(handle); err != nil {
291 return nil, nil, err
292 } else if statfs.Type != procSuperMagic {
293 return nil, nil, fmt.Errorf("%w: incorrect /proc/self/fd filesystem type 0x%x", errUnsafeProcfs, statfs.Type)
294 }
295 }
296 return handle, runtime.UnlockOSThread, nil
297}
298
// STATX_MNT_ID_UNIQUE is the statx(2) mask bit requesting the unique mount
// ID. It is provided in golang.org/x/sys@v0.20.0, but in order to avoid
// bumping the requirement for a single constant we can just define it
// ourselves. The name deliberately mirrors the kernel constant rather than
// Go's MixedCaps convention.
const STATX_MNT_ID_UNIQUE = 0x4000
303
304var hasStatxMountId = sync_OnceValue(func() bool {
305 var (
306 stx unix.Statx_t
307 // We don't care which mount ID we get. The kernel will give us the
308 // unique one if it is supported.
309 wantStxMask uint32 = STATX_MNT_ID_UNIQUE | unix.STATX_MNT_ID
310 )
311 err := unix.Statx(-int(unix.EBADF), "/", 0, int(wantStxMask), &stx)
312 return err == nil && stx.Mask&wantStxMask != 0
313})
314
315func getMountId(dir *os.File, path string) (uint64, error) {
316 // If we don't have statx(STATX_MNT_ID*) support, we can't do anything.
317 if !hasStatxMountId() {
318 return 0, nil
319 }
320
321 var (
322 stx unix.Statx_t
323 // We don't care which mount ID we get. The kernel will give us the
324 // unique one if it is supported.
325 wantStxMask uint32 = STATX_MNT_ID_UNIQUE | unix.STATX_MNT_ID
326 )
327
328 err := unix.Statx(int(dir.Fd()), path, unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW, int(wantStxMask), &stx)
329 if stx.Mask&wantStxMask == 0 {
330 // It's not a kernel limitation, for some reason we couldn't get a
331 // mount ID. Assume it's some kind of attack.
332 err = fmt.Errorf("%w: could not get mount id", errUnsafeProcfs)
333 }
334 if err != nil {
335 return 0, &os.PathError{Op: "statx(STATX_MNT_ID_...)", Path: dir.Name() + "/" + path, Err: err}
336 }
337 return stx.Mnt_id, nil
338}
339
340func checkSymlinkOvermount(procRoot *os.File, dir *os.File, path string) error {
341 // Get the mntId of our procfs handle.
342 expectedMountId, err := getMountId(procRoot, "")
343 if err != nil {
344 return err
345 }
346 // Get the mntId of the target magic-link.
347 gotMountId, err := getMountId(dir, path)
348 if err != nil {
349 return err
350 }
351 // As long as the directory mount is alive, even with wrapping mount IDs,
352 // we would expect to see a different mount ID here. (Of course, if we're
353 // using unsafeHostProcRoot() then an attaker could change this after we
354 // did this check.)
355 if expectedMountId != gotMountId {
356 return fmt.Errorf("%w: symlink %s/%s has an overmount obscuring the real link (mount ids do not match %d != %d)", errUnsafeProcfs, dir.Name(), path, expectedMountId, gotMountId)
357 }
358 return nil
359}
360
361func doRawProcSelfFdReadlink(procRoot *os.File, fd int) (string, error) {
362 fdPath := fmt.Sprintf("fd/%d", fd)
363 procFdLink, closer, err := procThreadSelf(procRoot, fdPath)
364 if err != nil {
365 return "", fmt.Errorf("get safe /proc/thread-self/%s handle: %w", fdPath, err)
366 }
367 defer procFdLink.Close()
368 defer closer()
369
370 // Try to detect if there is a mount on top of the magic-link. Since we use the handle directly
371 // provide to the closure. If the closure uses the handle directly, this
372 // should be safe in general (a mount on top of the path afterwards would
373 // not affect the handle itself) and will definitely be safe if we are
374 // using privateProcRoot() (at least since Linux 5.12[1], when anonymous
375 // mount namespaces were completely isolated from external mounts including
376 // mount propagation events).
377 //
378 // [1]: Linux commit ee2e3f50629f ("mount: fix mounting of detached mounts
379 // onto targets that reside on shared mounts").
380 if err := checkSymlinkOvermount(procRoot, procFdLink, ""); err != nil {
381 return "", fmt.Errorf("check safety of /proc/thread-self/fd/%d magiclink: %w", fd, err)
382 }
383
384 // readlinkat implies AT_EMPTY_PATH since Linux 2.6.39. See Linux commit
385 // 65cfc6722361 ("readlinkat(), fchownat() and fstatat() with empty
386 // relative pathnames").
387 return readlinkatFile(procFdLink, "")
388}
389
390func rawProcSelfFdReadlink(fd int) (string, error) {
391 procRoot, err := getProcRoot()
392 if err != nil {
393 return "", err
394 }
395 return doRawProcSelfFdReadlink(procRoot, fd)
396}
397
398func procSelfFdReadlink(f *os.File) (string, error) {
399 return rawProcSelfFdReadlink(int(f.Fd()))
400}
401
// Sentinel errors for the handle checks in this file. Match them with
// errors.Is.
var (
	// errDeletedInode is used by isDeadInode for a non-directory whose link
	// count is zero.
	errDeletedInode = errors.New("cannot verify path of deleted inode")
	// errInvalidDirectory is used by isDeadInode for a directory whose link
	// count is zero.
	errInvalidDirectory = errors.New("wandered into deleted directory")
	// errPossibleBreakout is used by checkProcSelfFdPath when a handle's
	// actual path differs from the expected one.
	errPossibleBreakout = errors.New("possible breakout detected")
)
407
408func isDeadInode(file *os.File) error {
409 // If the nlink of a file drops to 0, there is an attacker deleting
410 // directories during our walk, which could result in weird /proc values.
411 // It's better to error out in this case.
412 stat, err := fstat(file)
413 if err != nil {
414 return fmt.Errorf("check for dead inode: %w", err)
415 }
416 if stat.Nlink == 0 {
417 err := errDeletedInode
418 if stat.Mode&unix.S_IFMT == unix.S_IFDIR {
419 err = errInvalidDirectory
420 }
421 return fmt.Errorf("%w %q", err, file.Name())
422 }
423 return nil
424}
425
426func checkProcSelfFdPath(path string, file *os.File) error {
427 if err := isDeadInode(file); err != nil {
428 return err
429 }
430 actualPath, err := procSelfFdReadlink(file)
431 if err != nil {
432 return fmt.Errorf("get path of handle: %w", err)
433 }
434 if actualPath != path {
435 return fmt.Errorf("%w: handle path %q doesn't match expected path %q", errPossibleBreakout, actualPath, path)
436 }
437 return nil
438}
439
440// Test hooks used in the procfs tests to verify that the fallback logic works.
441// See testing_mocks_linux_test.go and procfs_linux_test.go for more details.
442var (
443 hookForcePrivateProcRootOpenTree = hookDummyFile
444 hookForcePrivateProcRootOpenTreeAtRecursive = hookDummyFile
445 hookForceGetProcRootUnsafe = hookDummy
446
447 hookForceProcSelfTask = hookDummy
448 hookForceProcSelf = hookDummy
449)
450
// hookDummy is the default for the argument-less test hooks; it never forces
// a fallback.
func hookDummy() bool {
	return false
}
// hookDummyFile is the default for the test hooks that take a file; it
// ignores its argument and never forces a fallback.
func hookDummyFile(_ *os.File) bool {
	return false
}