# HG changeset patch
# User Heiko Schlittermann (JUMPER) <hs@schlittermann.de>
# Date 1311347169 -7200
# Node ID fb2455a007a7de208d43b790204f5eb0cf6d5313
# Parent  910cff13054185379769dc80e5ba0ef4afa54068
friday evening

diff -r 910cff130541 -r fb2455a007a7 cleaner
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cleaner	Fri Jul 22 17:06:09 2011 +0200
@@ -0,0 +1,159 @@
+#! /usr/bin/perl
+
+use 5.010;
+use strict;
+use warnings;
+use Pod::Usage;
+use Hash::Util qw(lock_keys);
+use File::Find;
+use File::Temp;
+use DB_File;
+use File::Basename;
+use autodie qw(:all);
+use Cwd qw(abs_path);
+
+use Getopt::Long;
+
+my %o = (
+    dry => undef,
+    verbose => undef,
+    check => undef,
+); lock_keys(%o);
+
+MAIN: {
+    GetOptions(
+	"n|dry!" => \$o{dry},
+	"v|verbose!" => \$o{verbose},
+	"c|check" => \$o{check},
+	"h|help" => sub { pod2usage(-verbose => 1, -exit 0) },
+	"m|man"  => sub { pod2usage(-verbose => 2, -exit 0, 
+			  -noperldoc => system("perldoc -V 1>/dev/null
+			  2>&1"))},
+    ) and @ARGV or pod2usage;
+    my $dir = shift;
+    my $tmp = File::Temp->new;
+
+    # load the index files, remember the latest
+    # timestamp we see
+    my (%inuse, @idx);
+    #tie %idx, "DB_File" => $tmp->filename;
+
+    find(sub {
+	(-f) and (-M > 0) or return;
+	verbose("idx: $File::Find::name");
+	push @idx, abs_path $_;
+	foreach my $f (get_file_list($_)) {
+	    push @{$inuse{$f}} => $#idx;
+	}
+    }, "$dir/idx");
+
+    verbose("indexed: ".scalar(keys %inuse)." files");
+
+    # simple "forward" check: existence of mentioned files
+    if ($o{check}) {
+	my $total = scalar keys %inuse;
+	my $done = 0;
+	local $SIG{ALRM} = sub {
+	    say sprintf "done %5.1f%% (%*d of $total)" 
+		=> 100 * $done/$total, length($total), $done;
+	    alarm 5;
+	};
+	$SIG{ALRM}->();
+	while (my ($f, $i) = each %inuse) {
+	    ++$done;
+	    next if -f "$dir/data/$f"
+		or -f "$dir/data/$f.gz";
+	    say "missing $f from\n",
+		join "-\t" => "", map { "$_\n" } @idx[@$i];
+	}
+	$SIG{ALRM}->();
+	alarm 0;
+	exit 0;
+    }
+
+    # full check and probably cleaning: all files, not mentioned
+    # in some index will be purged
+#   my (%file);
+#-    find(sub {
+#-	(-f) and (-M > 0) or return;
+#-	$File::Find::name =~ s/^$dir\/data\///;
+#-	$file{$_} = $_;
+#-    }, "$dir/data");
+#-
+#-    verbose("file system: ".scalar(keys %file)." files");
+#-    exit 0;
+
+    # ok, now go through all the data files and remove
+    # files not mentioned in some index, but never remove
+    # files created after the cleaner started
+    find(sub {
+	(-f) and (-M > 0) or return;
+
+	# cut away the first part of the filename and
+	# some optional extension
+	$File::Find::name = abs_path $File::Find::name;
+	(my $rn = $File::Find::name) =~ s/^$dir\/data\/(.*?)(?:\..+)?$/$1/;
+	exists $inuse{$rn} and return;
+
+	if ($o{dry}) {
+	    verbose("(unlinking) $File::Find::name");
+	    return;
+	}
+
+	verbose("unlinking $File::Find::name");
+	unlink $File::Find::name;
+
+    }, "$dir/data");
+
+}
+
+sub verbose { say @_ if $o{verbose} }
+
+sub get_file_list {
+    my ($list) = @_;
+    my @files = ();
+
+    open(my $fh => $list);
+    while (<$fh>) {
+	push @files, (split)[2];
+    }
+    return grep /^[a-z\d.\/]+$/ => @files;
+}
+
+
+__END__
+
+=head1 NAME
+
+    cleaner - cleans the imager data directory
+
+=head1 SYNOPSIS
+
+    cleaner [options] {directory}
+
+=head1 DESCRIPTION
+
+This tool loads all the index files from I<directory>F</idx/>
+and purges all not mentioned files below I<directory>F</data/>.
+
+=head1 OPTIONS
+
+=over
+
+=item B<-c>|B<--check>
+
+Check (and exit) if nothing is missing.
+
+=item B<-n>|B<--dry>
+
+Do nothing, just print what should be removed. (default: off)
+
+=item B<-h>|B<--help>
+
+=item B<-m>|B<--man>
+
+The short and longer help.
+
+=back
+
+=cut
diff -r 910cff130541 -r fb2455a007a7 imager
--- a/imager	Thu Jul 21 00:20:10 2011 +0200
+++ b/imager	Fri Jul 22 17:06:09 2011 +0200
@@ -10,27 +10,32 @@
 use File::Basename;
 use File::Temp;
 use Sys::Hostname;
-use IO::Compress::Gzip qw(gzip $GzipError);
+use IO::Compress::Gzip qw(gzip $GzipError :level :strategy);
+use Hash::Util qw(lock_keys);
 use Getopt::Long;
 use Pod::Usage;
 
 use constant KiB => 1024;
 use constant MiB => 1024 * KiB;
 use constant GiB => 1024 * MiB;
-use constant BLOCKSIZE => 8 * MiB;
+use constant BLOCKSIZE => 1 * MiB;
+use constant NOW => time();
+use constant DATETIME => strftime("%Y-%m-%dT%H:%M:%SZ" => gmtime(NOW));
 
 sub get_devsize;
 sub get_devname;
 
 $SIG{INT} = sub { die "Got INT\n" };
 
-my $compress = 0;
+my %o = (
+    compress => undef,
+    verbose => undef,
+); lock_keys(%o);
 
 my $NOW = time();
-my $DATETIME = strftime("%Y-%m-%dT%H:%M:%SZ" => gmtime($NOW));
 
 MAIN: {
-    my ($dev, $dir);
+    my ($src, $dst);
 
     my $idx = "{DIR}/idx/{HOSTNAME}/{DEVICE}/";
     my $data = "{DIR}/data";
@@ -38,35 +43,36 @@
 
     GetOptions(
 	"h|help" => sub { pod2usage(-verbose => 1, exit => 0) },
-	"m|man" => sub { pod2usage(-verbose => 2, exit => 0,
+	"m|man" => sub  { pod2usage(-verbose => 2, exit => 0,
 			 -noperldoc => system("perldoc -V >/dev/null 2>&1"))
-		   }
+		   },
+	"z|compress:i" => sub { $o{compress} = $_[1] ? $_[1] : Z_BEST_SPEED },
     ) and @ARGV == 2 or pod2usage;
-    ($dev, $dir) = @ARGV;
+    ($src, $dst) = @ARGV;
 
     foreach ($idx, $data) {
-	s/{DIR}/$dir/g;
+	s/{DIR}/$dst/g;
 	s/{HOSTNAME}/hostname/eg;
-	s/{DEVICE}/get_devname($dev)/eg;
+	s/{DEVICE}/get_devname($src)/eg;
     }
-    $size = get_devsize($dev);
+    $size = get_devsize($src);
 
-    -d $dir or die "$0: $dir: $!\n";
-    mkpath($data, $idx);
+    -d $dst or die "$0: $dst: $!\n";
+    mkpath([$data, $idx]);
 
     my $index = File::Temp->new(DIR => $idx);
     print {$index} <<__EOT;
 # imager
 format: 1
-filesystem: $dev
+filesystem: $src
 blocksize: @{[BLOCKSIZE]}
 devsize: $size
-timestamp: $NOW
-datetime: $DATETIME
+timestamp: @{[NOW]}
+datetime: @{[DATETIME]}
 
 __EOT
 
-    open(my $in => $dev);
+    open(my $in => $src);
     binmode($in);
     local $/ = \(my $bs = BLOCKSIZE);
     local $| = 1;
@@ -77,19 +83,29 @@
     );
 
     while (my $buffer = <$in>) {
-	my $cs = md5_hex($buffer);
-	(my $file = $cs) =~ s/(..)(..)(.*)/$1\/$2\/$3/g;
-	$file .= ".gz" if $compress;
+	my ($file, $ext, $cs);
+	$file = $cs = md5_hex($buffer);
+#	$file =~ s/((..)(..).*)/$2\/$3\/$1/g;
+	$file =~ s/(?<fn>(?<prefix>...).*)/$+{prefix}\/$+{fn}/g;
+	$ext = $o{compress} ? ".gz" : "";
 
+	# the extension we do not put into the index
 	my $log = sprintf "%6d %s %s" => ($.-1), $cs, $file;
 
-	if (!-e "$data/$file") {
-	    mkpath dirname("$data/$file");
-	    open(my $out, ">$data/$file");
+	if (not (-e "$data/$file" or -e "$data/$file$ext")) {
+	    mkpath dirname("$data/$file$ext");
+	    my $out = File::Temp->new(TEMPLATE => ".XXXXXXX", DIR => dirname("$data/$file$ext"));
 	    binmode($out);
-	    if ($compress) { gzip \$buffer => $out or die $GzipError } 
+	    if ($o{compress}) { 
+		gzip(\$buffer => $out, 
+			-Minimal => 1, 
+			-Level => Z_BEST_SPEED, 
+			-Strategy => Z_FILTERED) 
+		or die $GzipError 
+	    } 
 	    else { print {$out} $buffer }
 	    close($out);
+	    rename($out => "$data/$file$ext");
 	    $log .= " *";
 	    $stats{written}++;
 	}
@@ -109,7 +125,7 @@
     say "# SAVINGS " 
 	. sprintf "%3d%%" => 100 * ($stats{skipped}/($stats{written}+$stats{skipped}));
 
-    rename $index->filename => "$idx/$DATETIME";
+    rename $index->filename => "$idx/".DATETIME;
     close $index;
 
 }
@@ -142,9 +158,29 @@
 =head1 DESCRIPTION
 
 This tool creates a snapshot of a blockdevice.
+Just call it like
+
+    imager /dev/sda1 /media/backup
+
+This will create F</media/backup/{data,idx}>, if not already existing.
+The index (blocklist) goes to
+I<destination>F</idx/>I<hostname>F</>I<devicename>.  The data goes to
+I<destination>/F<data/>.
 
 =head1 OPTIONS
 
-Currently there are no useful options.
+=over
+
+=item B<-z>[I<level>]|B<--compress>[=I<level>]
+
+Use compression when writing the blocks to disk. (default: off)
+
+=item B<-h>|B<--help>
+
+=item B<-m>|B<--man>
+
+The short and longer help. 
+
+=back
 
 =cut