Skip to content

Commit

Permalink
feat: only emit changed files with git walker
Browse files Browse the repository at this point in the history
If the modified time has not changed when compared with the git index, we do not emit the file for processing.

This allows users to introduce treefmt to a repository without suffering an initial large formatting commit.

Instead, files can be formatted incrementally as they are changed.

Closes #311

Signed-off-by: Brian McGee <[email protected]>
  • Loading branch information
brianmcgee committed Jul 23, 2024
1 parent 0953dd5 commit 9253e50
Show file tree
Hide file tree
Showing 9 changed files with 87 additions and 54 deletions.
8 changes: 4 additions & 4 deletions cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (
"git.numtide.com/numtide/treefmt/stats"

"git.numtide.com/numtide/treefmt/format"
"git.numtide.com/numtide/treefmt/walk"
"git.numtide.com/numtide/treefmt/walker"

"github.com/charmbracelet/log"

Expand Down Expand Up @@ -187,7 +187,7 @@ func putEntry(bucket *bolt.Bucket, path string, entry *Entry) error {

// ChangeSet is used to walk a filesystem, starting at root, and outputting any new or changed files using filesCh.
// It determines if a path is new or has changed by comparing against cache entries.
func ChangeSet(ctx context.Context, walker walk.Walker, filesCh chan<- *walk.File) error {
func ChangeSet(ctx context.Context, wk walker.Walker, filesCh chan<- *walker.File) error {
start := time.Now()

defer func() {
Expand All @@ -205,7 +205,7 @@ func ChangeSet(ctx context.Context, walker walk.Walker, filesCh chan<- *walk.Fil
}
}()

return walker.Walk(ctx, func(file *walk.File, err error) error {
return wk.Walk(ctx, func(file *walker.File, err error) error {
select {
case <-ctx.Done():
return ctx.Err()
Expand Down Expand Up @@ -264,7 +264,7 @@ func ChangeSet(ctx context.Context, walker walk.Walker, filesCh chan<- *walk.Fil
}

// Update is used to record updated cache information for the specified list of paths.
func Update(files []*walk.File) error {
func Update(files []*walker.File) error {
start := time.Now()
defer func() {
logger.Debugf("finished processing %v paths in %v", len(files), time.Since(start))
Expand Down
17 changes: 9 additions & 8 deletions cli/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"github.com/gobwas/glob"

"git.numtide.com/numtide/treefmt/format"
"git.numtide.com/numtide/treefmt/walk"
"git.numtide.com/numtide/treefmt/walker"
"github.com/alecthomas/kong"
"github.com/charmbracelet/log"
)
Expand All @@ -25,10 +25,11 @@ type Format struct {
Formatters []string `short:"f" help:"Specify formatters to apply. Defaults to all formatters."`
TreeRoot string `type:"existingdir" xor:"tree-root" help:"The root directory from which treefmt will start walking the filesystem (defaults to the directory containing the config file)."`
TreeRootFile string `type:"string" xor:"tree-root" help:"File to search for to find the project root (if --tree-root is not passed)."`
Walk walk.Type `enum:"auto,git,filesystem" default:"auto" help:"The method used to traverse the files within --tree-root. Currently supports 'auto', 'git' or 'filesystem'."`
Verbosity int `name:"verbose" short:"v" type:"counter" default:"0" env:"LOG_LEVEL" help:"Set the verbosity of logs e.g. -vv."`
Version bool `name:"version" short:"V" help:"Print version."`
Init bool `name:"init" short:"i" help:"Create a new treefmt.toml."`
Walk walker.Type `enum:"auto,git,filesystem" default:"auto" help:"The method used to traverse the files within --tree-root. Currently supports 'auto', 'git' or 'filesystem'."`

Verbosity int `name:"verbose" short:"v" type:"counter" default:"0" env:"LOG_LEVEL" help:"Set the verbosity of logs e.g. -vv."`
Version bool `name:"version" short:"V" help:"Print version."`
Init bool `name:"init" short:"i" help:"Create a new treefmt.toml."`

OnUnmatched log.Level `name:"on-unmatched" short:"u" default:"warn" help:"Log paths that did not match any formatters at the specified log level, with fatal exiting the process with an error. Possible values are <debug|info|warn|error|fatal>."`

Expand All @@ -40,9 +41,9 @@ type Format struct {
formatters map[string]*format.Formatter
globalExcludes []glob.Glob

filesCh chan *walk.File
formattedCh chan *walk.File
processedCh chan *walk.File
fileCh chan *walker.File
formattedCh chan *walker.File
processedCh chan *walker.File
}

func (f *Format) configureLogging() {
Expand Down
36 changes: 18 additions & 18 deletions cli/format.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import (

"git.numtide.com/numtide/treefmt/cache"
"git.numtide.com/numtide/treefmt/config"
"git.numtide.com/numtide/treefmt/walk"
"git.numtide.com/numtide/treefmt/walker"

"github.com/charmbracelet/log"
"golang.org/x/sync/errgroup"
Expand Down Expand Up @@ -147,13 +147,13 @@ func (f *Format) Run() (err error) {

// create a channel for files needing to be processed
// we use a multiple of batch size here as a rudimentary concurrency optimization based on the host machine
f.filesCh = make(chan *walk.File, BatchSize*runtime.NumCPU())
f.fileCh = make(chan *walker.File, BatchSize*runtime.NumCPU())

// create a channel for files that have been formatted
f.formattedCh = make(chan *walk.File, cap(f.filesCh))
f.formattedCh = make(chan *walker.File, cap(f.fileCh))

// create a channel for files that have been processed
f.processedCh = make(chan *walk.File, cap(f.filesCh))
f.processedCh = make(chan *walker.File, cap(f.fileCh))

// start concurrent processing tasks in reverse order
eg.Go(f.updateCache(ctx))
Expand All @@ -168,14 +168,14 @@ func (f *Format) Run() (err error) {
func (f *Format) walkFilesystem(ctx context.Context) func() error {
return func() error {
eg, ctx := errgroup.WithContext(ctx)
pathsCh := make(chan string, BatchSize)
pathCh := make(chan string, BatchSize)

// By default, we use the cli arg, but if the stdin flag has been set we force a filesystem walk
// since we will only be processing one file from a temp directory
walkerType := f.Walk

if f.Stdin {
walkerType = walk.Filesystem
walkerType = walker.Filesystem

// check we have only received one path arg which we use for the file extension / matching to formatters
if len(f.Paths) != 1 {
Expand All @@ -197,15 +197,15 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error {
}

walkPaths := func() error {
defer close(pathsCh)
defer close(pathCh)

var idx int
for idx < len(f.Paths) {
select {
case <-ctx.Done():
return ctx.Err()
default:
pathsCh <- f.Paths[idx]
pathCh <- f.Paths[idx]
idx += 1
}
}
Expand All @@ -217,37 +217,37 @@ func (f *Format) walkFilesystem(ctx context.Context) func() error {
eg.Go(walkPaths)
} else {
// no explicit paths to process, so we only need to process root
pathsCh <- f.TreeRoot
close(pathsCh)
pathCh <- f.TreeRoot
close(pathCh)
}

// create a walker of the requested type (git or filesystem)
walker, err := walk.New(walkerType, f.TreeRoot, pathsCh)
wk, err := walker.New(walkerType, f.TreeRoot, f.NoCache, pathCh)
if err != nil {
return fmt.Errorf("failed to create walker: %w", err)
}

// close the files channel when we're done walking the file system
defer close(f.filesCh)
// close the file channel when we're done walking the file system
defer close(f.fileCh)

// if no cache has been configured, or we are processing from stdin, we invoke the walker directly
if f.NoCache || f.Stdin {
return walker.Walk(ctx, func(file *walk.File, err error) error {
return wk.Walk(ctx, func(file *walker.File, err error) error {
select {
case <-ctx.Done():
return ctx.Err()
default:
stats.Add(stats.Traversed, 1)
stats.Add(stats.Emitted, 1)
f.filesCh <- file
f.fileCh <- file
return nil
}
})
}

// otherwise we pass the walker to the cache and have it generate files for processing based on whether or not
// they have been added/changed since the last invocation
if err = cache.ChangeSet(ctx, walker, f.filesCh); err != nil {
if err = cache.ChangeSet(ctx, wk, f.fileCh); err != nil {
return fmt.Errorf("failed to generate change set: %w", err)
}
return nil
Expand Down Expand Up @@ -319,7 +319,7 @@ func (f *Format) applyFormatters(ctx context.Context) func() error {
}()

// iterate the files channel
for file := range f.filesCh {
for file := range f.fileCh {

// first check if this file has been globally excluded
if format.PathMatches(file.RelPath, f.globalExcludes) {
Expand Down Expand Up @@ -419,7 +419,7 @@ func (f *Format) detectFormatted(ctx context.Context) func() error {
func (f *Format) updateCache(ctx context.Context) func() error {
return func() error {
// used to batch updates for more efficient txs
batch := make([]*walk.File, 0, BatchSize)
batch := make([]*walker.File, 0, BatchSize)

// apply a batch
processBatch := func() error {
Expand Down
4 changes: 2 additions & 2 deletions format/formatter.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
"os/exec"
"time"

"git.numtide.com/numtide/treefmt/walk"
"git.numtide.com/numtide/treefmt/walker"

"git.numtide.com/numtide/treefmt/config"

Expand Down Expand Up @@ -89,7 +89,7 @@ func (f *Formatter) Apply(ctx context.Context, tasks []*Task) error {

// Wants is used to test if a Formatter wants a path based on its configured Includes and Excludes patterns.
// Returns true if the Formatter should be applied to path, false otherwise.
func (f *Formatter) Wants(file *walk.File) bool {
func (f *Formatter) Wants(file *walker.File) bool {
match := !PathMatches(file.RelPath, f.excludes) && PathMatches(file.RelPath, f.includes)
if match {
f.log.Debugf("match: %v", file)
Expand Down
6 changes: 3 additions & 3 deletions format/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@ import (
"cmp"
"slices"

"git.numtide.com/numtide/treefmt/walk"
"git.numtide.com/numtide/treefmt/walker"
)

type Task struct {
File *walk.File
File *walker.File
Formatters []*Formatter
BatchKey string
}

func NewTask(file *walk.File, formatters []*Formatter) Task {
func NewTask(file *walker.File, formatters []*Formatter) Task {
// sort by priority in ascending order
slices.SortFunc(formatters, func(a, b *Formatter) int {
priorityA := a.Priority()
Expand Down
2 changes: 1 addition & 1 deletion walk/filesystem.go → walker/filesystem.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package walk
package walker

import (
"context"
Expand Down
2 changes: 1 addition & 1 deletion walk/filesystem_test.go → walker/filesystem_test.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package walk
package walker

import (
"context"
Expand Down
54 changes: 43 additions & 11 deletions walk/git.go → walker/git.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,26 @@
package walk
package walker

import (
"context"
"fmt"
"io/fs"
"os"
"path/filepath"
"time"

"github.com/go-git/go-git/v5/plumbing/format/index"

"github.com/charmbracelet/log"

"github.com/go-git/go-git/v5"
)

type gitWalker struct {
root string
paths chan string
repo *git.Repository
root string
paths chan string
repo *git.Repository

noCache bool
relPathOffset int
}

Expand All @@ -39,7 +44,20 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error {
}

// cache in-memory whether a path is present in the git index
var cache map[string]bool
var cache map[string]*index.Entry

// by default, we only emit files if they have changes when compared with the git index
emitFile := func(entry *index.Entry, info os.FileInfo) bool {
// mod time comparison is done with EPOCH (second) precision as per the POSIX spec
return entry.ModifiedAt.Truncate(time.Second) != info.ModTime().Truncate(time.Second)
}

if g.noCache {
// emit all files in the index
emitFile = func(entry *index.Entry, info os.FileInfo) bool {
return true
}
}

for path := range g.paths {

Expand All @@ -63,6 +81,11 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error {
return fmt.Errorf("failed to stat %s: %w", path, err)
}

// skip processing if the file hasn't changed
if !emitFile(entry, info) {
continue
}

// determine a relative path
relPath, err := g.relPath(path)
if err != nil {
Expand All @@ -83,11 +106,11 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error {
continue
}

// otherwise we ensure the git index entries are cached and then check if they are in the git index
// otherwise we ensure the git index entries are cached and then check if the path is in the git index
if cache == nil {
cache = make(map[string]bool)
cache = make(map[string]*index.Entry)
for _, entry := range idx.Entries {
cache[entry.Name] = true
cache[entry.Name] = entry
}
}

Expand All @@ -103,7 +126,8 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error {
}

return filepath.Walk(path, func(path string, info fs.FileInfo, _ error) error {
if info.IsDir() {
// ignore directories and symlinks
if info.IsDir() || info.Mode()&os.ModeSymlink == os.ModeSymlink {
return nil
}

Expand All @@ -112,9 +136,12 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error {
return fmt.Errorf("failed to determine a relative path for %s: %w", path, err)
}

if _, ok := cache[relPath]; !ok {
if entry, ok := cache[relPath]; !ok {
log.Debugf("path %v not found in git index, skipping", path)
return nil
} else if !emitFile(entry, info) {
log.Debugf("path %v has not changed, skipping", path)
return nil
}

file := File{
Expand All @@ -130,7 +157,11 @@ func (g gitWalker) Walk(ctx context.Context, fn WalkFunc) error {
return nil
}

func NewGit(root string, paths chan string) (Walker, error) {
func NewGit(
root string,
noCache bool,
paths chan string,
) (Walker, error) {
repo, err := git.PlainOpen(root)
if err != nil {
return nil, fmt.Errorf("failed to open git repo: %w", err)
Expand All @@ -139,6 +170,7 @@ func NewGit(root string, paths chan string) (Walker, error) {
root: root,
paths: paths,
repo: repo,
noCache: noCache,
relPathOffset: len(root) + 1,
}, nil
}
Loading

0 comments on commit 9253e50

Please sign in to comment.