Find files in git repo over x megabytes, that don't exist in HEAD
This is an adaptation of the git-find-blob
script I posted previously:
#!/usr/bin/perl
use 5.008;
use strict;
use Memoize;
sub usage { die "usage: git-large-blob <size[b|k|m]> [<git-log arguments ...>]\n" }
@ARGV or usage();
my ( $max_size, $unit ) = ( shift =~ /^(\d+)([bkm]?)\z/ ) ? ( $1, $2 ) : usage();
my $exp = 10 * ( $unit eq 'b' ? 0 : $unit eq 'k' ? 1 : 2 );
my $cutoff = $max_size * 2**$exp;
sub walk_tree {
my ( $tree, @path ) = @_;
my @subtree;
my @r;
{
open my $ls_tree, '-|', git => 'ls-tree' => -l => $tree
or die "Couldn't open pipe to git-ls-tree: $!\n";
while ( <$ls_tree> ) {
my ( $type, $sha1, $size, $name ) = /\A[0-7]{6} (\S+) (\S+) +(\S+)\t(.*)/;
if ( $type eq 'tree' ) {
push @subtree, [ $sha1, $name ];
}
elsif ( $type eq 'blob' and $size >= $cutoff ) {
push @r, [ $size, @path, $name ];
}
}
}
push @r, walk_tree( $_->[0], @path, $_->[1] )
for @subtree;
return @r;
}
memoize 'walk_tree';
open my $log, '-|', git => log => @ARGV, '--pretty=format:%T %h %cr'
or die "Couldn't open pipe to git-log: $!\n";
my %seen;
while ( <$log> ) {
chomp;
my ( $tree, $commit, $age ) = split " ", $_, 3;
my $is_header_printed;
for ( walk_tree( $tree ) ) {
my ( $size, @path ) = @$_;
my $path = join '/', @path;
next if $seen{ $path }++;
print "$commit $age\n" if not $is_header_printed++;
print "\t$size\t$path\n";
}
}
More compact ruby script:
#!/usr/bin/env ruby -w
head, treshold = ARGV
head ||= 'HEAD'
Megabyte = 1000 ** 2
treshold = (treshold || 0.1).to_f * Megabyte
big_files = {}
IO.popen("git rev-list #{head}", 'r') do |rev_list|
rev_list.each_line do |commit|
commit.chomp!
for object in `git ls-tree -zrl #{commit}`.split("\0")
bits, type, sha, size, path = object.split(/\s+/, 5)
size = size.to_i
big_files[sha] = [path, size, commit] if size >= treshold
end
end
end
big_files.each do |sha, (path, size, commit)|
where = `git show -s #{commit} --format='%h: %cr'`.chomp
puts "%4.1fM\t%s\t(%s)" % [size.to_f / Megabyte, path, where]
end
Usage:
ruby big_file.rb [rev] [size in MB]
$ ruby big_file.rb master 0.3
3.8M example/blah.psd (aad2981: 4 months ago)
1.1M another/big.file (6e73ca2: 2 weeks ago)
Python script to do the same thing (based on this post):
#!/usr/bin/env python
import os, sys
def getOutput(cmd):
return os.popen(cmd).read()
if (len(sys.argv) <> 2):
print "usage: %s size_in_bytes" % sys.argv[0]
else:
maxSize = int(sys.argv[1])
revisions = getOutput("git rev-list HEAD").split()
bigfiles = set()
for revision in revisions:
files = getOutput("git ls-tree -zrl %s" % revision).split('\0')
for file in files:
if file == "":
continue
splitdata = file.split()
commit = splitdata[2]
if splitdata[3] == "-":
continue
size = int(splitdata[3])
path = splitdata[4]
if (size > maxSize):
bigfiles.add("%10d %s %s" % (size, commit, path))
bigfiles = sorted(bigfiles, reverse=True)
for f in bigfiles:
print f