#! /usr/bin/perl
# (c) 2013 Heiko Schlittermann <hs@schlittermann.de>
# source: hg clone https://ssl.schlittermann.de/amanda-plugin-dumpext
#
# This script should be a plugin to use ext2/3/4 dump/restore via the
# APPLICATION interface of Amanda. The rationale behind is, that I'd like to use
# dump(8) with different dumpdates files for the different backup types
# (daily, weekly, …)
#
# The commands we need to support are required by the
# API:  http://wiki.zmanda.com/index.php/Application_API/Operations
#
# This script tries do be as standalone as possible.
# Though we need the tools dump/restore as they exists for the ext2/3/4
# filesystems.

use 5.010;
use strict;
use warnings;
use Carp;
use Pod::Usage;
use Getopt::Long;
use File::Basename;
use POSIX;

our $VERSION = '0.01';
my $ME = basename $0;

# to avoid stupid "not found"
$ENV{PATH} .= ':/usr/local/sbin:/usr/sbin:/sbin';

use constant YES       => 'YES';
use constant NO        => 'NO';
use constant DUMPDATES => '/var/lib/dumpdates';
use constant FD3       => 3;
use constant FD4       => 4;

my $BS = 64 * 2**20;    # 64 MB read size

$SIG{__DIE__} = sub { die $^S ? '' : "$ME: ", @_ };

my %SUPPORT = (
    CONFIG          => YES,    # --config … (default)
    DISK            => NO,     # --disk …
    HOST            => NO,     # --host …
    MAX_LEVEL       => 9,      # --level …
    INDEX_LINE      => YES,    # --index line
    MESSAGE_LINE    => YES,    # --message line
                               #
    CLIENT_ESTIMATE => YES,    # estimate
    MULTI_ESTIMATE  => YES,    # estimate for multiple levels
    CALCSIZE        => NO,     # estimate --calcsize
                               #
    RECORD          => YES,    # --record
    COLLECTION      => NO,
);

sub exec_support;
sub exec_selfcheck;
sub exec_estimate;
sub exec_backup;
sub exec_validate;
sub exec_restore;

# some helper functions

sub device;
sub OK;
sub ERROR;

# bad but common style - the global options

my $opt_config;     # $config
my $opt_device;     # $device DLE[2]
my $opt_message;    # line / <>
my $opt_index;      # line / <>
my $opt_record;     # true / <>
my @opt_level;      # 0…99

my $opt_dumpdates;

MAIN: {
    my $command = shift // pod2usage;

    #warn "<<< $command | @ARGV >>>\n";
    GetOptions(
        'config=s'    => \$opt_config,
        'device=s'    => \$opt_device,       # --device $device
        'message=s'   => \$opt_message,      # --message line|xml
        'index=s'     => \$opt_index,        # --index line
        'record!'     => \$opt_record,       # --record
        'level=i@'    => \@opt_level,        # --level n
        'dumpdates=s' => \$opt_dumpdates,    # --dumpdates <file>
        'host=s'      => sub { },            # ignore
        'disk=s'      => sub { },            # ignore
    ) or pod2usage;

    given ($command) {
        when ('support')  { exec_support }
        when ('validate') { exec_validate }
        when ('restore')  { exec_restore @ARGV }
        when ('selfcheck') {
            pod2usage if undef ~~ $opt_device;
            exec_selfcheck;
        }
        when ('estimate') {
            pod2usage if undef ~~ [$opt_device, $opt_level[0]];
            exec_estimate;
        }
        when ('backup') {
            pod2usage if undef ~~ [$opt_device, $opt_level[0]];
            exec_backup;
        }

        default { pod2usage }
    }
}

# output a list of supported options
sub exec_support {
    print map { "$_ $SUPPORT{$_}\n" =~ s/_/-/gr } sort keys %SUPPORT;
    exit 0;
}

sub exec_selfcheck {

    # must: $opt_device
    # may: $opt_level

    OK "$ME version $VERSION";
    OK "euid=$> (" . getpwuid($>) . ')';
    OK "egid=" . do {
        my ($gid, undef) = split ' ', $);
        my $group = getgrgid $gid;
        "$gid ($group)";
    };
    OK "groups=" . do {
        my (undef, @gids) = split ' ', $);
        my @groups = map { '' . getgrgid $_ } @gids;
        "@gids (@groups)";
    };

    foreach my $tool (qw(dump restore cat)) {
        if (my $path =
            (grep { -x ($_ .= "/$tool") } split /:/ => $ENV{PATH})[0])
        {

            # not all tools understand --version, but fortunately they
            # all output a line starting with the name of the tool and
            # the version information
            my ($version, undef) =
              map  { /^.*?\s+(.*)/ }
              grep { /\A(?:$path|$tool)\s/ } `$path --version 2>&1`;
            chomp $version;
            OK "$tool is $path $version";
        }
        else {
            ERROR "$tool not found in $ENV{PATH}";
        }
    }

    # check the device
    # the opt_disk is just a label, the device is in opt_device!
    my $device = device($opt_device);

    if    (-b $device) { OK "$opt_device ($device is block special)" }
    elsif (-d $device) { OK "$opt_device ($device is directory)" }
    else               { ERROR "$opt_device not recognized" }

    # check the dumpdates file
    if ($opt_record) {
        my $dumpdates = $opt_dumpdates ? expand($opt_dumpdates) : DUMPDATES;

        eval { open(my $x, "+>>", $dumpdates) or die "$!\n" };
        if   (chomp $@) { ERROR "dumpdates file $dumpdates: $@" }
        else            { OK "dumpdates file is $dumpdates" }
    }

    exit 0;
}

sub exec_estimate {

    # must: $opt_level, $opt_device
    # may:  $opt_record, $opt_dumpdates
    my (@errors, @results);

    foreach my $level (@opt_level) {
        my @cmd = (
            dump => "-$level",
            '-S',    # estimate
            $opt_record && $opt_dumpdates ? (-D => expand($opt_dumpdates)) : (),
            device($opt_device),
        );

        my @output = `@cmd 2>&1`;

        given ($?) {
            when (-1) { say "command not found: $cmd[0]" }
            when ($_ > 0) {
                my $rc  = ($? & 0xffff) >> 8;
                my $sig = ($? & 0xff);
                say
"unexpected return code (exit: $rc, signal: $sig) from `@cmd':\n",
                  join "\n" => @output;
                exit 1;
            }
        }
        chomp @output;

        # the last line should be the number of 1K blocks
        my $blocks = do {
            my $_ = pop @output;
            /^(\d+)/ or do {
                say "can't get estimate";
                exit 1;
            };
            $1 / 1024;
        };

        # level blocks blocksize
        say join "\n" => @output if @output;
        say "$level $blocks 1";
    }

    exit 0;
}

sub exec_backup {

    # fd1: data channel
    # fd3: message channel
    # fd4: index channel

    my @dump = (
        dump => "-$opt_level[0]",

        #'-v', # verbose
        -f => '-',
        $opt_record ? '-u' : (),
        $opt_record && $opt_dumpdates ? (-D => expand($opt_dumpdates)) : (),
        device($opt_device)
    );

    #   ,---------> fd3 ---->          (messages)
    # dump --o----> fd1 ---->          (data)
    #         `---> restore -t --> fd4 (index)

    open(my $msg, '>&=', FD3) or die "Can't open fd3: $!\n";
    open(my $idx, '>&=', FD4) or die "Can't open fd4: $!\n" if $opt_index;

    if ($opt_index) {
        my $pid = fork // die "Can't fork: $!\n";
        if (not $pid) {
            $0 = "$ME [about to exec dump]";

            # dump will be execed soon, first we've to establish
            # the channels - one for STDOUT, and one for STDIN
            open(STDOUT, '|-') or do {

                # this is the child that will read
                # the STDOUT from dump
                $0 = "$ME [stdout < dump]";

                my $pid = open(my $restore, '|-') or do {
                    $0 = "$ME [toc]";
                    open(STDOUT, '|-') or do {
                        postprocess_toc($idx);
                        exit 0;
                    };
                    exec 'restore', -tvf => '-';
                    die "Can't exec `restore -tvf -`: $!";
                };

                # The restore may close it's input, we'll get
                # a SIG{PIPE} because of this.
                # But how can we be sure that it was the restore
                # that just sent us the signal? Currently we write
                # to two processes only, writing to STDOUT (the data
                # stream up down to the server). If this fails we die.
                # The other stream we write to is the 'restore'.
                # If we got this SIGPIPE, we 'switch' to cat, since
                # it's faster then our perl script.
                local $SIG{PIPE} = sub {
                    close $restore;
                    $restore = undef;
                    $SIG{PIPE} = 'default';
                    exec 'cat';
                    die "Can't exec `cat': $!\n";
                };

                local $/ = \$BS;
                while (<STDIN>) {
                    print $_ or die "Can't send data to `dump': $!\n";
                    print $restore $_;
                }
                close($restore) if $restore;
                exit 0;
            };

            open(STDERR, '|-') or do {
                $0 = "$ME [stderr < dump]";
                postprocess_dump_messages($msg);
                exit 0;
            };

            # we need to fork again, otherwise dump sees
            # the end of the above children and complains
            my $pid = fork // die "Can't fork: $!\n";
            if (not $pid) {
                exec @dump;
                die "Can't exec `@dump': $!\n";
            }

            waitpid($pid, 0);
        }

        waitpid($pid, 0);
        exit $?;
    }

    # no need to send an index
    # dump [2] --- (postprocess_dump_messages) --> [fd3]
    #      [1] ----------------------------------> [fd1]

    my $pid = fork // die "Can't fork: $!\n";

    # child does all the work
    if (not $pid) {

        # create the subprocess that will read the
        # stderr output  from dump, convert it and send it
        # to the message channel
        open(STDERR, '|-') or do {
            postprocess_dump_messages($msg);
            exit 0;
        };
        exec @dump;
        die "Can't exec `@dump': $!\n";
    }
    waitpid($pid, 0);
    exit $?;

}

sub postprocess_dump_messages {

    select +shift;    # send output to the message channel

    while (<STDIN>) {
        print "| $_";
        if (/^\s+DUMP: (\d+) blocks?/) {

            # we assume a block size of 1K
            say "sendbackup: size $1";
        }
        elsif (/^\s+DUMP: DUMP IS DONE/) {
            say 'sendbackup: end';
        }
    }
}

sub postprocess_toc {

    # the output of restore -tv looks
    # about like this:
    #
    # dir  4711  ./aaa
    # leaf 4712  ./bbb/xxx
    # leaf 4713  ./bbb/a
    # b
    # leaf 8819  ./bbb/x
    #
    # it may break if there is a lf/cr
    # embedded in the filename
    #
    # the more generic solution would be to force
    # restore to use a \0 separated output format

    select +shift;
    die 'IFS is not as expected!' if $/ ne "\n";

    my $buffer = undef;
    my $type   = undef;

    while (1) {

        $_ = <STDIN>;

        # skip the header lines
        if (1 .. defined && /\Adir\s+\d+\s+(.*)\Z/) {
            $buffer = '';
            $type   = 'dir';
            die "Unexpected end of input while reading from restore -tvf\n"
              if not defined;
            next;
        }

        # if we match really good the buffer may be output
        if (not defined
            or chomp and /\A(?'type' dir|leaf)\s+\d+\s+\.(?'name' \/.*)\Z/x)
        {

            # output
            say $buffer . ($type eq 'dir' ? '/' : '');

            # we're done if this was the last line of output
            last if not defined;

            # order matters, do not exchange the next two lines! The %+
            # will break
            $type = $+{type};
            $buffer = $+{name} =~ s/\\/\\\\/gr;

            next;
        }

        $buffer .= "\\n$_";

    }

}

sub exec_validate {

    my $pid = fork // die "Can't fork: $!\n";
    open(STDOUT, '>', '/dev/null')
      or die "Can't redirect STDOUT to /dev/null: $!\n";

    # the first part goes into restore
    # but restore stops reading after the directory
    if (not $pid) {
        exec 'restore', '-tf', '-';
        die "Can't exec restore: $!\n";
    }
    waitpid($pid, 0);
    die $? if $?;

    # read the remaining dump via cat, I think,
    # nobody is fast than cat
    exec 'cat';
    die "Can't exec `cat': $!\n";
}

sub exec_restore {
    if (not @_ or $_[0] eq '.') {

        # complete file system restore
        exec 'restore', '-yrf', '-';
    }

    # just some files - in this mode
    # no true incremental restore can be done
    # (it will just add new files or do updates, but it
    # won't remove any files!)
    exec 'restore', '-yxf', '-', @_;
}

sub device {
    my $_ = shift;
    return $_ if /^\//;
    return "/dev/$_";
}

sub expand {
    my $_ = shift;
    s/\${c}/$opt_config/g;
    return $_;
}

sub OK    { say "OK @_" }
sub ERROR { say "ERROR [@_]" }

=head1 NAME

  amdumpext - the amanda dump/restore ext2/3/4 application plugin

=head1 SYNOPSIS

  amdumpext support   [options]
  amdumpext selfcheck [options] [--level <level>] --device <device> 
  amdumpext backup    [options] [--level <level>] --device <device> 
  amdumpext estimate  [options] [--level <level>]... --device <device> 
  amdumpext validate  [options]
  amdumpext restore   [options] [file system objects]

=head1 DESCRIPTION

The B<amdumpext> is an application plugin for amanda.  It's not intended
to be called directly by the user.

It drives the native ext2/3/4 dump/restore programs found on most Linux
systems. See L<dump(8)> and L<restore(8)> for more information on these
tools.

B<NOTE>: During development it turned out that dump/restore are badly
maintained and that there might arise conditions where you can't easily
restore from an incremtal dump. (See bug #157 at the sourceforge dump
project site.) The problems seem start with restore 0.4b43 and do still
exist in 0.4b44.

=head1 OPTIONS

The commands may need some options.  Unsupported options are silently
ignored. Unknown options are not allowed.

=over

=item B<--device> I<device>

The disk device from the disklist.

=item B<--disk> I<disk>

The diskname from the disklist. Currently ignored.

=item B<--host> I<host>

The hostname from the disklist. Currently ignored.

=item B<--config> I<config>

The name of the configuration.

=item B<--level> I<level>

The level of the backup. 0 means full backup.

=item B<--index> I<xml|line>

Create an index of the files backed up. XML is not supported yet.
(Default: not set, means no index at all)

=item B<--message> I<xml|line>

The format used for messages. XML is not supported yet.
(Default: line)

=back

=head1 PROPERTIES

The properties may be set on the server side to tune the behaviour
of the application. Currently property setting on the client side is
not supported.

=over 4

=item B<--dumpdates> I<dumpdates>

The location of the dumpdates file. Placeholder "${c}" is allowed and
replaced with the name of the current config.

=back

=head1 COMMANDS

The B<amdumpext> supports the commands from the API description found on 
L<http://wiki.zmanda.com/index.php/Application_API/Operations>.

=head2 support

This command returns the list of supported features.

=head2 selfcheck

This command instructs B<amdumpext> to so some selfchecks. In an ideal
world it will detect any problem that might prevent a successful backup.

mandatory options: device, level


=head2 estimate

Create an estimate about the amount of data we may expect for the
backup. Multi-Level estimates are supported.

mandatory options: device, level, ...

=head2 backup

This commands creates a backup. It's send to fd 1, messages from the
backup go to fd3, optional index data goes to fd4.

mandatory options: device, level

=head2 validate

This checks if the data stream is readable and looks like a dump.
Actually it does not try to completly validate the stream, as B<restore>
does not have such an option.

=head2 restore

Restore from a single dump. In addition to the common options the server
passes a list of file system objects to be restored. The list should be
simplified, overlapping objects should be merged into one object.

Currently (Amanda 3.3.5) it seems as if the server sends F<.> as object
name for a complete file system recovery. This triggers C<restore -rf>,
any other object name triggers the invocation of C<restore -xf>. This
means, real incremental recoveries are only possible on the base of the
complete file system.

=head1 TESTING

The B<amdumpext> and it's subcommands may be tested on the command line. The following output
file descriptors are used:

=over

=item 1 (STDOUT)

The normal command output, the data stream.

=item 2 (STDERR)

Not used by the API, but probably redirected to some log files.

=item 3 (messages)

Operational messages, often it's postprocessed output of some of 
the invoked tools

=item 4 (index)

If index generation is ordered (via the B<--index> option), the index is 
sent to this file descriptor.

=back

The following shell command line is suitable to test the B<amdumpext>:

    $ amdumpext backup --device /dev/sda1 --level 0 1>dump 3>messages 4>index

If you're interested in reading the messages as they appear:

    $ amdumpext backup --device /dev/sda1 --level 0 3>&1 1>dump 4>index

To validate the backup:

    $ amdumpext validate <dump
    $ echo $?

And finally to check the restore process:

    $ amdumpext restore . <dump

Or
    $ amdumpext restore dirA dirB file/C


=head1 EXAMPLE

The C<amanda.conf> should contain something similiar to the following
stanza:

    define application "mydump" {
        plugin "amdumpext"
	# optional - define some additional parameters
	# ${c} - the name of the current config
        property "dumpdates" "/var/lib/dumpdates.${c}"
   }

   define dumptype "dump" {
	program "APPLICATION"
	application "mydump"
    }


=cut

# vim:sts=4 sw=4 aw ai sm:
