Am 30.07.2015 um 20:42 schrieb Brandon McCaig:
> I'll give you my 2 cents for whatever that's worth. >:)

Thanks for your many comments.

The updated full project code is below.  Feedback is appreciated.  (you
also find this at github.com/simon0x5b/bookmark-djvu)

> I can't help thinking that the entire recursive subroutine could
> be simplified and tidied up a bit by changing the direction of
> it
could you elaborate on this?

Cheers, Simon


file bookmark-djvu-extract:
#!/usr/bin/env perl

# LICENSE: GPLv3+

use 5.010;
use warnings;
use strict;
use utf8;
use Getopt::Long qw/:config no_ignore_case bundling/;
use File::Basename qw/basename/;
use Encode qw/decode/;
if (!eval 'use YAML::XS qw/Dump/; 1;') {
    die "cannot find the YAML::XS Perl module.\n" .
        "Try '\$ apt-get install libyaml-libyaml-perl'.\n";
}

binmode STDOUT, ':encoding(UTF-8)';
binmode STDIN, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';

my $prog_name = basename $0;
my $bug_address = "www.github.com/simon0x5b/bookmark-djvu/issues";

sub print_usage {
    say "Usage: $prog_name [OPTIONS] DJVU [-o BOOKMARKS_FILE]";
}

sub print_help {
    print_usage ();
    say "
extract the outline of DJVU.
The format of the generated BOOKMARKS-FILE is described in the README.

 Options:
  -o, --output=FILE           write bookmarks to FILE
  -s, --simple-format         dump outline in simple format (default: YAML)
  -h, --help                  print this help screen
  -V, --version               print program version

The default for BOOKMARKS-FILE is DJVU-FILENAME with the suffix changed to
- '.outline' for YAML mode (default).
- '.bm' for simple format.

Report bugs to $bug_address";
}

sub print_version {
    say "bookmark-djvu 0.1
License GPLv3+: GNU GPL version 3 or later
<http://gnu.org/licenses/gpl.html>.
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law."
}

$_ = decode ('UTF-8', $_) for @ARGV;

my $output_filename;
my $use_simple_format = 0;

my %opt_hash = (
    "output|o=s" => \$output_filename,
    "simple-format|s" => \$use_simple_format,
    "help|h" => sub {print_help (); exit 0},
    "version|V" => sub {print_version (); exit 0},
    );

print_usage (), exit (1) unless GetOptions (%opt_hash) and @ARGV;

my $djvu_filename = $ARGV[0];
if (! -f $djvu_filename) {
    print_usage ();
    die "error: djvu file '$djvu_filename' does not exist\n";
}

if (!$output_filename) {
    my $suffix = $use_simple_format ? ".bm" : ".outline";
    $output_filename =
        get_default_filename ($djvu_filename, ".djvu", $suffix);
    if (-e $output_filename) {
        die  "won't overwrite existing file '$output_filename'. " .
            "Use the '-o FILE' option!";
    }
}

open my $output_handle, ">", $output_filename
    or die "cannot open file '$output_filename': $!";

# start extracting bookmarks with djvused in a pipe
open my $djvused_handle, "-|", ("djvused", $djvu_filename, "-u", "-e",
    "print-outline") or die "cannot find djvused.\n" .
    "Try '\$ apt-get install djvulibre-bin'.\n";

binmode $djvused_handle, ':encoding(UTF-8)';


my $outline_items = parse_djvused_output ($djvused_handle);

unless (@{$outline_items}) {
    warn "no outline found\n";
    exit 1;
}


close $djvused_handle
    or die "djvused failed with exit status ", $? >> 8, "\n";

if ($use_simple_format) {
    dump_to_simple_format ($output_handle, $outline_items);
}
else {
    dump_to_yaml ($output_handle, $outline_items);
}


# arg: open file descriptor to djvused
# returns: arrayref of hashrefs, each describing level, title and page of an
# outline item.
sub parse_djvused_output {
    my $djvused_handle = shift;
    my $level = 0;
    my $line = <$djvused_handle>;

    if (!$line or $line !~ /^\(bookmarks$/) {
        warn "no bookmarks found\n";
        return [];
    }
    my $outline_items = [];
    while ($line = <$djvused_handle>) {
        chomp $line;
        $line =~ /^\s*\("(?<title>.+)"$/
            or die "line '$line' in djvused output does not match.
Please report to $bug_address\n";
        
        my $title = djvused_unescape ($+{title});
        
        $line = <$djvused_handle>;

        $line =~ /\s*"#(?<page>[0-9]+)"(?<close>(\s*\))*)\s*$/
            or die "line '$line' in djvused output does not match.
Please report to $bug_address\n";
        push @$outline_items, {level => $level, title => $title,
                               page => $+{page}};
        $level -= ($+{close} =~ tr/\)//) - 1;
    }

    return $outline_items;
}

sub dump_to_simple_format {
    my $out_handle = shift;
    my $outline_items = shift;
    binmode $out_handle, ':encoding(UTF-8)';
    for my $outline (@$outline_items) {
        print {$out_handle} "    "x($outline->{level}),
        prepare_title ($outline->{title}),
        , " $outline->{page}\n";
    }
}


sub prepare_title {
    my $title = shift;

    if ($title =~ /\n/) {
            warn "WARNING: removing newline in title '$title'.\n";
            $title =~ s/\n/ /g;
    }

    if ($title =~ /(\.|\s)+$/) {
        warn "WARNING: removing trailing dots and/or whitespace " .
            "in title '$title'.\n";
        $title =~ s/(\.|\s)+$//;
    }

    return $title;
}


sub dump_to_yaml {
    my $out_handle = shift;
    my $input_array = shift;
    (my $array, undef) = get_recursive_array ($input_array, 0);
    print {$out_handle}  Dump ($array);
}

# recursively convert flat array into nested data structure
# args: input array ref, index to start
# returns: output array ref, following index
sub get_recursive_array {
    my ($input_array, $index) = @_;

    my $last = $#{$input_array};
    my $base_level = $input_array->[$index]{level};
    my $result_array = [];

    while ($index <= $last) {
        my $node = $input_array->[$index];
        my $current_level = $node->{level};
        
        last if $current_level < $base_level;

        my $outline_item = {title => $node->{title}, page => $node->{page}};

        my $next_index = $index + 1;
        my $next_level = $input_array->[$next_index]{level};
        
        if ($next_index <= $last && $next_level > $base_level) {
            ($outline_item->{kids}, $index) =
                get_recursive_array ($input_array, $next_index);
        }
        else {
            ++$index;
        }
        
        push @{$result_array}, $outline_item;
    }

    return ($result_array, $index);
}


sub djvused_unescape {
    # see in '$ man djvused': "DJVUSED FILE FORMATS - Strings"
    # and perlrebackslash
    my $string = shift;

    $string =~ s/\\([0-7]{3}|.)/expand_escape_sequence ($1)/ge;

    return $string;
}

sub expand_escape_sequence {
    my ($string) = @_;

    if ($string eq "\\" || $string eq '"') {
        return $string;
    }
    elsif (length ($string) == 3) {
        # octal escape sequence
        return pack 'U', oct ($string);
    }
    elsif ($string eq "a") {
        return "\a";
    }
    elsif ($string eq "b") {
        return "\b";
    }
    elsif ($string eq "t") {
        return "\t";
    }
    elsif ($string eq "n") {
        return "\n";
    }
    elsif ($string eq "v") {
        return "\x{0b}";
    }
    elsif ($string eq "f") {
        return "\f";
    }
    elsif ($string eq "r") {
        return "\r";
    }
    else {
        die "unknown escape sequence '\\$string' in '$string'
 please report to $bug_address";
    }
}

sub get_default_filename {
    my $filename = shift;
    my $from = shift;
    my $too = shift;
    die "get_default_filename needs arg" unless defined ($too);
    my $result_filename;

    if ($filename =~ /\Q$from\E$/) {
        ($result_filename = $filename) =~ s/\Q$from\E/$too/;
    }
    else {
        $result_filename = $filename . $too;
    }

    return $result_filename;
}




file bookmark-djvu:
#!/usr/bin/env perl

# License: GPLv3+

use 5.010;
use warnings;
use strict;
# do not use floating point arithmetic in divisions
use integer;
use utf8;
use Encode qw/decode/;
use Getopt::Long qw/:config no_ignore_case bundling/;
use File::Temp qw/tempfile/;
use File::Basename qw/basename/;
if (!eval 'use YAML::XS qw/Load/; 1;') {
    die "cannot find the YAML::XS Perl module.\n" .
        "Try '\$ apt-get install libyaml-libyaml-perl'.\n";
}

binmode STDOUT, ':encoding(UTF-8)';
binmode STDIN, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';

my $prog_name = basename $0;
my $bug_address = "www.github.com/simon0x5b/bookmark-djvu/issues";

sub print_usage {
    say "Usage: $prog_name [OPTIONS] DJVU [BOOKMARKS-FILE]";
}

sub print_help {
    print_usage ();
        say "
update the outline of DJVU.
The format of BOOKMARKS-FILE is described in the README.

 Options:
  -o, --output=FILE           do not modify DJVU and write the
                              bookmarked djvu file to FILE.
  -s, --simple-format         use simple format (default: YAML)

 options controlling simple format:
  -c, --allow-comments        ignore all lines starting with a hash
                              character (#) in BOOKMARKS-FILE.

 general options:
  -h, --help                  print this help screen.
  -V, --version               print program version.

The default for BOOKMARKS-FILE is DJVU with the suffix changed to
- '.outline' for YAML mode (default).
- '.bm' for simple format.

Report bugs to $bug_address";
}

sub print_version {
    say "bookmark-djvu 0.1
License GPLv3+: GNU GPL version 3 or later
<http://gnu.org/licenses/gpl.html>.
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law."
}

$_ = decode ('UTF-8', $_) for @ARGV;


my $output_filename;
my $use_simple_format = 0;
my $allow_comments = 0;


my %opt_hash = (
    "output|o=s" => \$output_filename,
    "simple-format|s" => \$use_simple_format,
    "allow-comments|c" => \$allow_comments,
    "help|h" => sub {print_help (); exit},
    "version|V" => sub {print_version (); exit},
    );

print_usage () , exit (1) unless GetOptions (%opt_hash) && @ARGV;

my $djvu_filename = $ARGV[0];
if (! -f $djvu_filename) {
    print_usage ();
    die "error: djvu file '$djvu_filename' does not exist\n";
}

$output_filename = $djvu_filename unless $output_filename;

my $bookmarks_filename;

if ($ARGV[1]) {
    $bookmarks_filename = $ARGV[1];
}
else {
    my $suffix = $use_simple_format ? ".bm" : ".outline";
    $bookmarks_filename =
        get_default_filename ($djvu_filename, ".djvu", $suffix);
}


open my $input_handle, '<', $bookmarks_filename
    or die "cannot open $bookmarks_filename: $!\n";

my $page_count = page_count ($djvu_filename);

my $djvused_outline;
if ($use_simple_format) {
    $djvused_outline = simple_format_to_djvused_outline ($input_handle);
}
else {
    $djvused_outline = yaml_to_djvused_outline ($input_handle);
}

# run external commands
if ($djvu_filename ne $output_filename) {
    system_wrapper ("cp", $djvu_filename, $output_filename);
}

system_wrapper ("djvused", $output_filename, "-e",
                "set-outline $djvused_outline", "-s");

# argument: djvu filename
# returns: page count
sub page_count {
    my $filename = shift;

    open my $djvused_handle, "-|", "djvused", $filename, "-e", "n"
        or die "cannot find djvused.\n" .
        "Try '\$ apt-get install djvulibre-bin'.\n";

    my $count = <$djvused_handle>;

    if (!close $djvused_handle) {
        die "djvused failed with status ", $? >> 8, ".\n";
    }

    chomp ($count);
    return $count;
}

# returns: filename
sub simple_format_to_djvused_outline {
    my $input_handle = shift;
    binmode $input_handle, ':encoding(UTF-8)';
    (my $out_fh, my $out_filename) = tempfile (UNLINK => 1);
    binmode $out_fh, ':encoding(UTF-8)';

    my $regexp = qr/
       ^
       (?<indent>(\ {4})*)
       (?<title>.+?)
       (\s|\.)+
       (?<page>-?[0-9]+)
       \s*$
    /x;
    my $line_number = 0;
    my $offset = 0;
    my $indent_depth;
    my $prev_indent_depth = -1;

    print {$out_fh} "(bookmarks";

    while (my $line = <$input_handle>) {
        chomp ($line);
        ++$line_number;
        
        # skip blank lines or comments
        next if $line =~ /^\s*$/ || ($allow_comments && $line =~ /^#/);

        # check for offset marker
        if ($line =~ /^\s*d=(?<difference>-?[0-9]+)\s*$/) {
            $offset = $+{difference};
            next;
        }
        
        # match line against $regexp
        if ($line !~ $regexp) {
            die "error: line $line_number is broken:\n$line\n";
        }
        $indent_depth = length ($+{indent}) / 4;

        my $title = $+{title};
        my $page = $+{page} + $offset;
        if ($page < 1 || $page > $page_count) {
            die "page number $page in line $line_number out of range ".
                "(page count: $page_count)\n";
        }
        
        if ($indent_depth > $prev_indent_depth + 1) {
            die "too mutch indentation in line $line_number:\n$line\n";
        }
        
        $title = djvused_escape ($title);
        
        # print closing parenthesis for the previous entry
        print {$out_fh} ")"x($prev_indent_depth - $indent_depth + 1), "\n";
        $prev_indent_depth = $indent_depth;
        
        # print out this outline item
        print {$out_fh} qq{("$title" "#$page"};
    }

    if ($prev_indent_depth == -1) {
        warn "removing outline\n";
    }

    # print closing parenthesis
    print {$out_fh} ")"x($prev_indent_depth + 1), "\n", ")", "\n";

    # flush buffer
    close $out_fh;

    return $out_filename;
}

# arg: input filedescriptor
# returns output filename
sub yaml_to_djvused_outline {
    my $input_handle = shift;
    (my $out_fh, my $out_filename) = tempfile (UNLINK => 1);
    binmode $out_fh, ':encoding(UTF-8)';

    my $outline = Load (do {local $/; local $_ = <$input_handle>});

    print {$out_fh} "(bookmarks";

    if ($outline) {
        array_ref_to_djvused_outline ($out_fh, $outline);
    }
    else {
        warn "no input, removing outline\n";
    }

    print {$out_fh} "\n)\n";
    return $out_filename;
}

# used by yaml_to_djvused_outline
# prints djvused outline recursively from array ref
# args: output file descriptor, arrayref
sub array_ref_to_djvused_outline {
    my $output_handle = shift;
    my $outline = shift;

    unless (ref ($outline) eq "ARRAY") {
        die "array_ref_to_djvused_outline: arg not an array ref";
    }

    for my $hashref (@{$outline}) {
        my $title = $hashref->{title} or die "missing title in outline hash";
        $title = djvused_escape ($title);
        my $page = $hashref->{page};
        if ($page < 1 || $page > $page_count) {
            die "page number $page out of range ".
                "(page count: $page_count)\n";
        }
        
        # print out this outline item
        print {$output_handle} qq{\n("$title" "#$page"};
        
        if ($hashref->{kids}) {
            array_ref_to_djvused_outline ($output_handle, $hashref->{kids});
        }

        print {$output_handle} ")";
    }
}

sub djvused_escape {
    # see djvused(1): "DJVUSED FILE FORMATS - Strings"
    # and perlrebackslash

    my $title = shift;

    if ($title =~ /\n/) {
        die "djvused_escape: \\n in '$title'\n" .
            "Please report this at $bug_address\n"
    }

    # need to escape \, ", \a, \x{08}, \t, \x{0b}, \f and \r

    # \ is first, since it is contained in the replacement texts
    $title =~ s/\\/\\\\/g;

    $title =~ s/"/\\"/g;

    $title =~ s/\a/\\a/g;

    $title =~ s/\x{08}/\\b/g;

    $title =~ s/\t/\\t/g;

    $title =~ s/\x{0b}/\\v/g;

    $title =~ s/\f/\\f/g;

    $title =~ s/\r/\\r/g;

    return $title;
}

sub system_wrapper {
    warn "command: @_\n";
    system (@_) == 0
        or die "error: system failed: $?\n"
}

sub get_default_filename {
    my $filename = shift;
    my $from = shift;
    my $too = shift;
    die "get_default_filename needs arg" unless defined ($too);
    my $result_filename;

    if ($filename =~ /\Q$from\E$/) {
        ($result_filename = $filename) =~ s/\Q$from\E/$too/;
    }
    else {
        $result_filename = $filename . $too;
    }

    return $result_filename;
}



file t/test:
#!/usr/bin/env perl

# LICENSE: GPLv3+

use 5.014;
use warnings;
use strict;
use File::Temp qw/tempfile/;
use utf8;
use Test::More;
if (!eval 'use YAML::XS qw/LoadFile/; 1;') {
    die "cannot find the YAML::XS Perl module.\n" .
        "Try '\$ apt-get install libyaml-libyaml-perl'.\n";
}

use constant GET_OUTLINE => "../bookmark-djvu-extract";
use constant SET_OUTLINE => "../bookmark-djvu";

# create empty djvu
my $djvu = create_djvu (8);

# ################################# #
# compare files after set and get:  #
# ################################# #

system_ok (SET_OUTLINE . " -s $djvu input1");

(undef, my $outline) = tempfile (UNLINK => 1);
system_ok (GET_OUTLINE . " -s $djvu -o $outline");

# compare
files_ok ($outline, "expected1");
unlink ($outline);

# default filenames

$djvu = create_djvu (8);

$outline = ($djvu =~ s/\.djvu$/.bm/r);
say "outline: $outline";

system_ok ("cp input1 $outline");
# compare files after set and get:

system_ok (SET_OUTLINE . " -s $djvu");

unlink ($outline);

system_ok (GET_OUTLINE . " -s $djvu");

# compare
files_ok ($outline, "expected1");
unlink ($outline);

# #### #
# YAML #
# #### #

$djvu = create_djvu (5);

system_ok (SET_OUTLINE . " $djvu input1.yaml");

(undef, $outline) = tempfile (UNLINK => 1);
system_ok (GET_OUTLINE . " $djvu -o $outline");
yaml_ok ("input1.yaml", $outline);

# default filenames

$djvu = create_djvu (5);

$outline = ($djvu =~ s/\.djvu$/.outline/r);
say "outline: $outline";

system_ok ("cp input1.yaml $outline");

# compare files after set and get:

system_ok (SET_OUTLINE . " $djvu");

unlink ($outline);

system_ok (GET_OUTLINE . " $djvu");

# compare
yaml_ok ("input1.yaml", $outline);
unlink ($outline);

# ############### #
# remove outlines #
# ############### #

$djvu = create_djvu (5);
system_ok (SET_OUTLINE . " $djvu input1.yaml");

# empty input file
system_ok (SET_OUTLINE . " $djvu /dev/null");

system_is (GET_OUTLINE . " $djvu -o /dev/null", 1);

#simple format

$djvu = create_djvu (5);
system_ok (SET_OUTLINE . " $djvu input1.yaml");

# empty input file
system_ok (SET_OUTLINE . " $djvu -s /dev/null");

system_is (GET_OUTLINE . " $djvu -o /dev/null", 1);

done_testing ();

sub yaml_ok {
    my ($file1, $file2) = @_;

    is_deeply (LoadFile ($file1), LoadFile ($file2),
               "yaml_ok: file1=$file1, file2=$file2");
}

sub system_ok {
    my ($command) = @_ ;

    ok (system ($command) == 0, "command: $command");
}

sub system_is {
    my ($command, $exit_status) = @_;

    system ($command);

    # see 'perldoc -f system'
    ok ($? != -1
        && !($? & 127)
        && $? >> 8 == $exit_status,
        "exit status of '$command' is '$exit_status'");
}

sub files_ok {
    system_ok ("diff -C 3 $_[0] $_[1]");
}

# arguments: page count, suffix
# returns filename
sub create_djvu {
    my ($page_count, $suffix) = @_;

    $suffix or $suffix = ".djvu";

    $page_count > 0 or $page_count = 1;

    # create one-page djvu file
    (undef, my $filename) = tempfile (UNLINK => 1, SUFFIX => $suffix);
    system_wrapper ("djvumake $filename INFO=100,100,100");

    my $page;
    if ($page_count > 1) {
        (undef, $page) = tempfile (UNLINK => 1, SUFFIX => ".djvu");
        system_wrapper ("djvumake $page INFO=100,100,100");
    }

    for (2..$page_count) {
        system_wrapper ("djvm -i $filename $page");
    }

    return $filename;
}

sub system_wrapper {
    my ($command) = @_;

    system ($command) == 0
        or die "command '$command' failed with status ", $? >> 8, "\n";
}







-- 
To unsubscribe, e-mail: beginners-unsubscr...@perl.org
For additional commands, e-mail: beginners-h...@perl.org
http://learn.perl.org/


Reply via email to