From 16b0324c024f36c6283576029859ba8ec2c34858 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 12:00:05 -0400 Subject: [PATCH 1/4] fix(fslock): delete lock file on Release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Release previously left the lock file on disk to avoid a TOCTOU where one process unlinks the file while another has just opened it and is about to flock — the two would end up holding locks on different inodes at the same path. Delete safely by: (unix) unlinking before close so new openers can't see our inode, plus an inode recheck after flock that reopens if the path no longer points at our fd; (windows) Close then os.Remove, which Go opens without FILE_SHARE_DELETE so Remove fails harmlessly while any handle is open. --- internal/fslock/fslock.go | 9 +--- internal/fslock/fslock_test.go | 56 +++++++++++++++++++++ internal/fslock/fslock_unix.go | 81 ++++++++++++++++++++++++++++--- internal/fslock/fslock_windows.go | 22 ++++++++- 4 files changed, 152 insertions(+), 16 deletions(-) diff --git a/internal/fslock/fslock.go b/internal/fslock/fslock.go index 99a25c6b..cd6f330a 100644 --- a/internal/fslock/fslock.go +++ b/internal/fslock/fslock.go @@ -20,11 +20,6 @@ import "os" // Lock represents an acquired advisory file lock. type Lock struct { - f *os.File -} - -// Release releases the lock and closes the file. The lock file is left on -// disk so that concurrent waiters do not race on a deleted path. -func (l Lock) Release() error { - return l.f.Close() + f *os.File + path string } diff --git a/internal/fslock/fslock_test.go b/internal/fslock/fslock_test.go index 423f3d8a..629d229b 100644 --- a/internal/fslock/fslock_test.go +++ b/internal/fslock/fslock_test.go @@ -15,7 +15,9 @@ package fslock_test import ( + "os" "path/filepath" + "sync" "testing" "time" @@ -48,6 +50,60 @@ func TestAcquireTwiceSequential(t *testing.T) { lock2.Release() } +func TestReleaseRemovesFile(t *testing.T) { + path := filepath.Join(t.TempDir(), "test.lock") + lock, err := fslock.Acquire(path, 5*time.Second) + if err != nil { + t.Fatalf("Acquire: %v", err) + } + if _, err := os.Stat(path); err != nil { + t.Fatalf("lock file missing while held: %v", err) + } + if err := lock.Release(); err != nil { + t.Fatalf("Release: %v", err) + } + if _, err := os.Stat(path); !os.IsNotExist(err) { + t.Fatalf("lock file still on disk after Release: stat err=%v", err) + } +} + +func TestConcurrentAcquireIsExclusive(t *testing.T) { + path := filepath.Join(t.TempDir(), "test.lock") + const workers = 8 + var ( + wg sync.WaitGroup + mu sync.Mutex + holding int + maxSeen int + ) + for range workers { + wg.Go(func() { + lock, err := fslock.Acquire(path, 10*time.Second) + if err != nil { + t.Errorf("Acquire: %v", err) + return + } + mu.Lock() + holding++ + if holding > maxSeen { + maxSeen = holding + } + mu.Unlock() + time.Sleep(10 * time.Millisecond) + mu.Lock() + holding-- + mu.Unlock() + if err := lock.Release(); err != nil { + t.Errorf("Release: %v", err) + } + }) + } + wg.Wait() + if maxSeen != 1 { + t.Fatalf("mutual exclusion violated: saw %d concurrent holders", maxSeen) + } +} + func TestAcquireTimeout(t *testing.T) { path := filepath.Join(t.TempDir(), "test.lock") lock1, err := fslock.Acquire(path, 5*time.Second) diff --git a/internal/fslock/fslock_unix.go b/internal/fslock/fslock_unix.go index 52c3f4c2..944d5ba6 100644 --- a/internal/fslock/fslock_unix.go +++ b/internal/fslock/fslock_unix.go @@ -17,6 +17,7 @@ package fslock import ( + "errors" "fmt" "os" "syscall" @@ -26,21 +27,85 @@ import ( // Acquire acquires an exclusive advisory lock on the file at path, retrying // until timeout elapses. Returns an error if the lock cannot be acquired. func Acquire(path string, timeout time.Duration) (Lock, error) { - f, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR, 0600) - if err != nil { - return Lock{}, fmt.Errorf("fslock: open %s: %w", path, err) + deadline := time.Now().Add(timeout) + for { + f, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR, 0600) + if err != nil { + return Lock{}, fmt.Errorf("fslock: open %s: %w", path, err) + } + + lock, err := lockFile(f, path, deadline) + if err == nil { + return lock, nil + } + f.Close() + if errors.Is(err, errStaleInode) { + // Previous holder unlinked the file between our open and our + // flock; the path now refers to a different inode. Reopen. + continue + } + return Lock{}, err } +} - deadline := time.Now().Add(timeout) +// errStaleInode signals that the opened fd refers to an inode that has been +// unlinked (or replaced) since we opened it — we need to reopen and retry. +var errStaleInode = errors.New("fslock: stale inode") + +func lockFile(f *os.File, path string, deadline time.Time) (Lock, error) { for { - err = syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB) + err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB) if err == nil { - return Lock{f: f}, nil + // Confirm the path still points at our inode. If the previous + // holder unlinked it (or it was replaced), the lock we just took + // is on a dead inode and a new caller could lock the new file. + if stale, serr := inodeIsStale(f, path); serr != nil { + syscall.Flock(int(f.Fd()), syscall.LOCK_UN) + return Lock{}, fmt.Errorf("fslock: stat %s: %w", path, serr) + } else if stale { + syscall.Flock(int(f.Fd()), syscall.LOCK_UN) + return Lock{}, errStaleInode + } + return Lock{f: f, path: path}, nil } if time.Now().After(deadline) { - f.Close() - return Lock{}, fmt.Errorf("fslock: could not acquire lock on %s within %s: %w", path, timeout, err) + return Lock{}, fmt.Errorf("fslock: could not acquire lock on %s: %w", path, err) } time.Sleep(50 * time.Millisecond) } } + +func inodeIsStale(f *os.File, path string) (bool, error) { + var fdStat syscall.Stat_t + if err := syscall.Fstat(int(f.Fd()), &fdStat); err != nil { + return false, err + } + var pathStat syscall.Stat_t + if err := syscall.Stat(path, &pathStat); err != nil { + if errors.Is(err, syscall.ENOENT) { + return true, nil + } + return false, err + } + return fdStat.Ino != pathStat.Ino || fdStat.Dev != pathStat.Dev, nil +} + +// Release unlinks the lock file and releases the lock. Unlinking before +// closing ensures no new caller can open the same inode and take the lock +// while we still hold it; combined with the inode recheck in Acquire, this +// guarantees that a fresh file at the same path is never raced against a +// soon-to-be-deleted one. +func (l Lock) Release() error { + if l.f == nil { + return nil + } + rmErr := os.Remove(l.path) + closeErr := l.f.Close() + if closeErr != nil { + return closeErr + } + if rmErr != nil && !errors.Is(rmErr, os.ErrNotExist) { + return rmErr + } + return nil +} diff --git a/internal/fslock/fslock_windows.go b/internal/fslock/fslock_windows.go index f4050261..96627041 100644 --- a/internal/fslock/fslock_windows.go +++ b/internal/fslock/fslock_windows.go @@ -17,6 +17,7 @@ package fslock import ( + "errors" "fmt" "os" "time" @@ -39,7 +40,7 @@ func Acquire(path string, timeout time.Duration) (Lock, error) { windows.LOCKFILE_EXCLUSIVE_LOCK|windows.LOCKFILE_FAIL_IMMEDIATELY, 0, 1, 0, ol) if err == nil { - return Lock{f: f}, nil + return Lock{f: f, path: path}, nil } if time.Now().After(deadline) { f.Close() @@ -48,3 +49,22 @@ func Acquire(path string, timeout time.Duration) (Lock, error) { time.Sleep(50 * time.Millisecond) } } + +// Release releases the lock and removes the lock file. On Windows, Go opens +// files without FILE_SHARE_DELETE, so os.Remove will only succeed once no +// other process holds the file open — making the delete inherently safe. +// We close first so our own handle doesn't block the remove. +func (l Lock) Release() error { + if l.f == nil { + return nil + } + if err := l.f.Close(); err != nil { + return err + } + if err := os.Remove(l.path); err != nil && !errors.Is(err, os.ErrNotExist) { + // Another process is still holding (or racing to open) the file; + // that's safe — it just means we can't clean up right now. + return nil + } + return nil +} From 51fd3eb9ec60250433873076afd3cb8f197b462a Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 12:04:19 -0400 Subject: [PATCH 2/4] fix(fslock): honor Acquire deadline on stale-inode retry, narrow Windows Release error swallow - Unix: stale-inode retry loop now checks the deadline before continuing and returns a proper timeout error when the budget is exhausted, so Acquire cannot exceed its requested timeout. - Windows: Release only swallows os.Remove failures that indicate another handle is still open (ERROR_SHARING_VIOLATION / ERROR_ACCESS_DENIED) or the file is already gone. Other filesystem errors now propagate. Addresses roborev review 1845 (M+L). --- internal/fslock/fslock_unix.go | 10 +++++++--- internal/fslock/fslock_windows.go | 24 +++++++++++++++++------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/internal/fslock/fslock_unix.go b/internal/fslock/fslock_unix.go index 944d5ba6..51246c68 100644 --- a/internal/fslock/fslock_unix.go +++ b/internal/fslock/fslock_unix.go @@ -40,9 +40,13 @@ func Acquire(path string, timeout time.Duration) (Lock, error) { } f.Close() if errors.Is(err, errStaleInode) { - // Previous holder unlinked the file between our open and our - // flock; the path now refers to a different inode. Reopen. - continue + if time.Now().Before(deadline) { + // Previous holder unlinked the file between our open and + // our flock; the path now refers to a different inode. + // Reopen and try again within the remaining budget. + continue + } + return Lock{}, fmt.Errorf("fslock: could not acquire lock on %s within %s: %w", path, timeout, err) } return Lock{}, err } diff --git a/internal/fslock/fslock_windows.go b/internal/fslock/fslock_windows.go index 96627041..2116f1f1 100644 --- a/internal/fslock/fslock_windows.go +++ b/internal/fslock/fslock_windows.go @@ -51,9 +51,10 @@ func Acquire(path string, timeout time.Duration) (Lock, error) { } // Release releases the lock and removes the lock file. On Windows, Go opens -// files without FILE_SHARE_DELETE, so os.Remove will only succeed once no -// other process holds the file open — making the delete inherently safe. -// We close first so our own handle doesn't block the remove. +// files without FILE_SHARE_DELETE, so os.Remove will fail with a sharing +// violation (or access-denied) if another process still has the file open +// — making the delete inherently safe. We close first so our own handle +// doesn't block the remove, then swallow only sharing-related failures. func (l Lock) Release() error { if l.f == nil { return nil @@ -61,10 +62,19 @@ func (l Lock) Release() error { if err := l.f.Close(); err != nil { return err } - if err := os.Remove(l.path); err != nil && !errors.Is(err, os.ErrNotExist) { - // Another process is still holding (or racing to open) the file; - // that's safe — it just means we can't clean up right now. - return nil + if err := os.Remove(l.path); err != nil && !isBenignRemoveErr(err) { + return err } return nil } + +func isBenignRemoveErr(err error) bool { + if errors.Is(err, os.ErrNotExist) { + return true + } + // Another handle is open without FILE_SHARE_DELETE — cleanup is simply + // deferred to whoever closes last. Any other filesystem error (ACLs, + // I/O failure, etc.) should propagate. + return errors.Is(err, windows.ERROR_SHARING_VIOLATION) || + errors.Is(err, windows.ERROR_ACCESS_DENIED) +} From bad406b3b9a7184be206d7d89f005f07abf05147 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 12:07:29 -0400 Subject: [PATCH 3/4] fix(fslock): don't swallow ERROR_ACCESS_DENIED on Windows Release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ERROR_ACCESS_DENIED is not a reliable signal that another handle is keeping the file open — Windows also returns it for read-only attributes, ACL changes, or the path being replaced with a directory. Only ERROR_SHARING_VIOLATION unambiguously indicates the deferred-cleanup case, so narrow isBenignRemoveErr to just that and ErrNotExist. Other filesystem errors now propagate from Release. Addresses roborev review 1846. --- internal/fslock/fslock_windows.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/internal/fslock/fslock_windows.go b/internal/fslock/fslock_windows.go index 2116f1f1..70fba612 100644 --- a/internal/fslock/fslock_windows.go +++ b/internal/fslock/fslock_windows.go @@ -69,12 +69,12 @@ func (l Lock) Release() error { } func isBenignRemoveErr(err error) bool { - if errors.Is(err, os.ErrNotExist) { - return true - } - // Another handle is open without FILE_SHARE_DELETE — cleanup is simply - // deferred to whoever closes last. Any other filesystem error (ACLs, - // I/O failure, etc.) should propagate. - return errors.Is(err, windows.ERROR_SHARING_VIOLATION) || - errors.Is(err, windows.ERROR_ACCESS_DENIED) + // ERROR_SHARING_VIOLATION is the unambiguous "another handle is open + // without FILE_SHARE_DELETE" case — cleanup is simply deferred to + // whoever closes last. ERROR_ACCESS_DENIED is *not* safe to swallow: + // Windows returns it for read-only attributes, ACL changes, or when + // the path has been replaced with a directory, none of which should + // be reported as a successful Release. + return errors.Is(err, os.ErrNotExist) || + errors.Is(err, windows.ERROR_SHARING_VIOLATION) } From 572d173f45904ba7cddb7f4eb0c5b4db0621a43b Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 5 May 2026 12:09:43 -0400 Subject: [PATCH 4/4] test(fslock): add Windows isBenignRemoveErr test, fix stale Release comment - Update Release doc comment: only ERROR_SHARING_VIOLATION (and ErrNotExist) are swallowed; ERROR_ACCESS_DENIED now propagates. - Add fslock_windows_test.go covering sharing-violation (benign), not-exist (benign), access-denied (propagated), and generic (propagated) cases to lock in the distinction going forward. Addresses roborev review 1847. --- internal/fslock/fslock_windows.go | 9 ++--- internal/fslock/fslock_windows_test.go | 47 ++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 4 deletions(-) create mode 100644 internal/fslock/fslock_windows_test.go diff --git a/internal/fslock/fslock_windows.go b/internal/fslock/fslock_windows.go index 70fba612..123fd7e2 100644 --- a/internal/fslock/fslock_windows.go +++ b/internal/fslock/fslock_windows.go @@ -51,10 +51,11 @@ func Acquire(path string, timeout time.Duration) (Lock, error) { } // Release releases the lock and removes the lock file. On Windows, Go opens -// files without FILE_SHARE_DELETE, so os.Remove will fail with a sharing -// violation (or access-denied) if another process still has the file open -// — making the delete inherently safe. We close first so our own handle -// doesn't block the remove, then swallow only sharing-related failures. +// files without FILE_SHARE_DELETE, so os.Remove returns ERROR_SHARING_VIOLATION +// if another process still has the file open — making the delete inherently +// safe. We close first so our own handle doesn't block the remove, then +// swallow only ERROR_SHARING_VIOLATION (deferred cleanup) and ErrNotExist. +// Other errors, including ERROR_ACCESS_DENIED, propagate. func (l Lock) Release() error { if l.f == nil { return nil diff --git a/internal/fslock/fslock_windows_test.go b/internal/fslock/fslock_windows_test.go new file mode 100644 index 00000000..845078c1 --- /dev/null +++ b/internal/fslock/fslock_windows_test.go @@ -0,0 +1,47 @@ +// Copyright 2026 Columnar Technologies Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build windows + +package fslock + +import ( + "errors" + "io/fs" + "os" + "testing" + + "golang.org/x/sys/windows" +) + +func TestIsBenignRemoveErr(t *testing.T) { + cases := []struct { + name string + err error + benign bool + }{ + {"sharing violation", &fs.PathError{Op: "remove", Err: windows.ERROR_SHARING_VIOLATION}, true}, + {"not exist", os.ErrNotExist, true}, + {"wrapped not exist", &fs.PathError{Op: "remove", Err: os.ErrNotExist}, true}, + {"access denied", &fs.PathError{Op: "remove", Err: windows.ERROR_ACCESS_DENIED}, false}, + {"generic io error", errors.New("boom"), false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := isBenignRemoveErr(tc.err); got != tc.benign { + t.Fatalf("isBenignRemoveErr(%v) = %v, want %v", tc.err, got, tc.benign) + } + }) + } +}