Am 30.07.2015 um 20:42 schrieb Brandon McCaig: > I'll give you my 2 cents for whatever that's worth. >:)
Thanks for your many comments. The updated full project code is below. Feedback is appreciated. (you also find this at github.com/simon0x5b/bookmark-djvu) > I can't help thinking that the entire recursive subroutine could > be simplified and tidied up a bit by changing the direction of > it could you elaborate on this? Cheers, Simon file bookmark-djvu-extract: #!/usr/bin/env perl # LICENSE: GPLv3+ use 5.010; use warnings; use strict; use utf8; use Getopt::Long qw/:config no_ignore_case bundling/; use File::Basename qw/basename/; use Encode qw/decode/; if (!eval 'use YAML::XS qw/Dump/; 1;') { die "cannot find the YAML::XS Perl module.\n" . "Try '\$ apt-get install libyaml-libyaml-perl'.\n"; } binmode STDOUT, ':encoding(UTF-8)'; binmode STDIN, ':encoding(UTF-8)'; binmode STDERR, ':encoding(UTF-8)'; my $prog_name = basename $0; my $bug_address = "www.github.com/simon0x5b/bookmark-djvu/issues"; sub print_usage { say "Usage: $prog_name [OPTIONS] DJVU [-o BOOKMARKS_FILE]"; } sub print_help { print_usage (); say " extract the outline of DJVU. The format of the generated BOOKMARKS-FILE is described in the README. Options: -o, --output=FILE write bookmarks to FILE -s, --simple-format dump outline in simple format (default: YAML) -h, --help print this help screen -V, --version print program version The default for BOOKMARKS-FILE is DJVU-FILENAME with the suffix changed to - '.outline' for YAML mode (default). - '.bm' for simple format. Report bugs to $bug_address"; } sub print_version { say "bookmark-djvu 0.1 License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>. This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law." } $_ = decode ('UTF-8', $_) for @ARGV; my $output_filename; my $use_simple_format = 0; my %opt_hash = ( "output|o=s" => \$output_filename, "simple-format|s" => \$use_simple_format, "help|h" => sub {print_help (); exit 0}, "version|V" => sub {print_version (); exit 0}, ); print_usage (), exit (1) unless GetOptions (%opt_hash) and @ARGV; my $djvu_filename = $ARGV[0]; if (! -f $djvu_filename) { print_usage (); die "error: djvu file '$djvu_filename' does not exist\n"; } if (!$output_filename) { my $suffix = $use_simple_format ? ".bm" : ".outline"; $output_filename = get_default_filename ($djvu_filename, ".djvu", $suffix); if (-e $output_filename) { die "won't overwrite existing file '$output_filename'. " . "Use the '-o FILE' option!"; } } open my $output_handle, ">", $output_filename or die "cannot open file '$output_filename': $!"; # start extracting bookmarks with djvused in a pipe open my $djvused_handle, "-|", ("djvused", $djvu_filename, "-u", "-e", "print-outline") or die "cannot find djvused.\n" . "Try '\$ apt-get install djvulibre-bin'.\n"; binmode $djvused_handle, ':encoding(UTF-8)'; my $outline_items = parse_djvused_output ($djvused_handle); unless (@{$outline_items}) { warn "no outline found\n"; exit 1; } close $djvused_handle or die "djvused failed with exit status ", $? >> 8, "\n"; if ($use_simple_format) { dump_to_simple_format ($output_handle, $outline_items); } else { dump_to_yaml ($output_handle, $outline_items); } # arg: open file descriptor to djvused # returns: arrayref of hashrefs, each describing level, title and page of an # outline item. sub parse_djvused_output { my $djvused_handle = shift; my $level = 0; my $line = <$djvused_handle>; if (!$line or $line !~ /^\(bookmarks$/) { warn "no bookmarks found\n"; return []; } my $outline_items = []; while ($line = <$djvused_handle>) { chomp $line; $line =~ /^\s*\("(?<title>.+)"$/ or die "line '$line' in djvused output does not match. Please report to $bug_address\n"; my $title = djvused_unescape ($+{title}); $line = <$djvused_handle>; $line =~ /\s*"#(?<page>[0-9]+)"(?<close>(\s*\))*)\s*$/ or die "line '$line' in djvused output does not match. Please report to $bug_address\n"; push @$outline_items, {level => $level, title => $title, page => $+{page}}; $level -= ($+{close} =~ tr/\)//) - 1; } return $outline_items; } sub dump_to_simple_format { my $out_handle = shift; my $outline_items = shift; binmode $out_handle, ':encoding(UTF-8)'; for my $outline (@$outline_items) { print {$out_handle} " "x($outline->{level}), prepare_title ($outline->{title}), , " $outline->{page}\n"; } } sub prepare_title { my $title = shift; if ($title =~ /\n/) { warn "WARNING: removing newline in title '$title'.\n"; $title =~ s/\n/ /g; } if ($title =~ /(\.|\s)+$/) { warn "WARNING: removing trailing dots and/or whitespace " . "in title '$title'.\n"; $title =~ s/(\.|\s)+$//; } return $title; } sub dump_to_yaml { my $out_handle = shift; my $input_array = shift; (my $array, undef) = get_recursive_array ($input_array, 0); print {$out_handle} Dump ($array); } # recursively convert flat array into nested data structure # args: input array ref, index to start # returns: output array ref, following index sub get_recursive_array { my ($input_array, $index) = @_; my $last = $#{$input_array}; my $base_level = $input_array->[$index]{level}; my $result_array = []; while ($index <= $last) { my $node = $input_array->[$index]; my $current_level = $node->{level}; last if $current_level < $base_level; my $outline_item = {title => $node->{title}, page => $node->{page}}; my $next_index = $index + 1; my $next_level = $input_array->[$next_index]{level}; if ($next_index <= $last && $next_level > $base_level) { ($outline_item->{kids}, $index) = get_recursive_array ($input_array, $next_index); } else { ++$index; } push @{$result_array}, $outline_item; } return ($result_array, $index); } sub djvused_unescape { # see in '$ man djvused': "DJVUSED FILE FORMATS - Strings" # and perlrebackslash my $string = shift; $string =~ s/\\([0-7]{3}|.)/expand_escape_sequence ($1)/ge; return $string; } sub expand_escape_sequence { my ($string) = @_; if ($string eq "\\" || $string eq '"') { return $string; } elsif (length ($string) == 3) { # octal escape sequence return pack 'U', oct ($string); } elsif ($string eq "a") { return "\a"; } elsif ($string eq "b") { return "\b"; } elsif ($string eq "t") { return "\t"; } elsif ($string eq "n") { return "\n"; } elsif ($string eq "v") { return "\x{0b}"; } elsif ($string eq "f") { return "\f"; } elsif ($string eq "r") { return "\r"; } else { die "unknown escape sequence '\\$string' in '$string' please report to $bug_address"; } } sub get_default_filename { my $filename = shift; my $from = shift; my $too = shift; die "get_default_filename needs arg" unless defined ($too); my $result_filename; if ($filename =~ /\Q$from\E$/) { ($result_filename = $filename) =~ s/\Q$from\E/$too/; } else { $result_filename = $filename . $too; } return $result_filename; } file bookmark-djvu: #!/usr/bin/env perl # License: GPLv3+ use 5.010; use warnings; use strict; # do not use floating point arithmetic in divisions use integer; use utf8; use Encode qw/decode/; use Getopt::Long qw/:config no_ignore_case bundling/; use File::Temp qw/tempfile/; use File::Basename qw/basename/; if (!eval 'use YAML::XS qw/Load/; 1;') { die "cannot find the YAML::XS Perl module.\n" . "Try '\$ apt-get install libyaml-libyaml-perl'.\n"; } binmode STDOUT, ':encoding(UTF-8)'; binmode STDIN, ':encoding(UTF-8)'; binmode STDERR, ':encoding(UTF-8)'; my $prog_name = basename $0; my $bug_address = "www.github.com/simon0x5b/bookmark-djvu/issues"; sub print_usage { say "Usage: $prog_name [OPTIONS] DJVU [BOOKMARKS-FILE]"; } sub print_help { print_usage (); say " update the outline of DJVU. The format of BOOKMARKS-FILE is described in the README. Options: -o, --output=FILE do not modify DJVU and write the bookmarked djvu file to FILE. -s, --simple-format use simple format (default: YAML) options controlling simple format: -c, --allow-comments ignore all lines starting with a hash character (#) in BOOKMARKS-FILE. general options: -h, --help print this help screen. -V, --version print program version. The default for BOOKMARKS-FILE is DJVU with the suffix changed to - '.outline' for YAML mode (default). - '.bm' for simple format. Report bugs to $bug_address"; } sub print_version { say "bookmark-djvu 0.1 License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>. This is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law." } $_ = decode ('UTF-8', $_) for @ARGV; my $output_filename; my $use_simple_format = 0; my $allow_comments = 0; my %opt_hash = ( "output|o=s" => \$output_filename, "simple-format|s" => \$use_simple_format, "allow-comments|c" => \$allow_comments, "help|h" => sub {print_help (); exit}, "version|V" => sub {print_version (); exit}, ); print_usage () , exit (1) unless GetOptions (%opt_hash) && @ARGV; my $djvu_filename = $ARGV[0]; if (! -f $djvu_filename) { print_usage (); die "error: djvu file '$djvu_filename' does not exist\n"; } $output_filename = $djvu_filename unless $output_filename; my $bookmarks_filename; if ($ARGV[1]) { $bookmarks_filename = $ARGV[1]; } else { my $suffix = $use_simple_format ? ".bm" : ".outline"; $bookmarks_filename = get_default_filename ($djvu_filename, ".djvu", $suffix); } open my $input_handle, '<', $bookmarks_filename or die "cannot open $bookmarks_filename: $!\n"; my $page_count = page_count ($djvu_filename); my $djvused_outline; if ($use_simple_format) { $djvused_outline = simple_format_to_djvused_outline ($input_handle); } else { $djvused_outline = yaml_to_djvused_outline ($input_handle); } # run external commands if ($djvu_filename ne $output_filename) { system_wrapper ("cp", $djvu_filename, $output_filename); } system_wrapper ("djvused", $output_filename, "-e", "set-outline $djvused_outline", "-s"); # argument: djvu filename # returns: page count sub page_count { my $filename = shift; open my $djvused_handle, "-|", "djvused", $filename, "-e", "n" or die "cannot find djvused.\n" . "Try '\$ apt-get install djvulibre-bin'.\n"; my $count = <$djvused_handle>; if (!close $djvused_handle) { die "djvused failed with status ", $? >> 8, ".\n"; } chomp ($count); return $count; } # returns: filename sub simple_format_to_djvused_outline { my $input_handle = shift; binmode $input_handle, ':encoding(UTF-8)'; (my $out_fh, my $out_filename) = tempfile (UNLINK => 1); binmode $out_fh, ':encoding(UTF-8)'; my $regexp = qr/ ^ (?<indent>(\ {4})*) (?<title>.+?) (\s|\.)+ (?<page>-?[0-9]+) \s*$ /x; my $line_number = 0; my $offset = 0; my $indent_depth; my $prev_indent_depth = -1; print {$out_fh} "(bookmarks"; while (my $line = <$input_handle>) { chomp ($line); ++$line_number; # skip blank lines or comments next if $line =~ /^\s*$/ || ($allow_comments && $line =~ /^#/); # check for offset marker if ($line =~ /^\s*d=(?<difference>-?[0-9]+)\s*$/) { $offset = $+{difference}; next; } # match line against $regexp if ($line !~ $regexp) { die "error: line $line_number is broken:\n$line\n"; } $indent_depth = length ($+{indent}) / 4; my $title = $+{title}; my $page = $+{page} + $offset; if ($page < 1 || $page > $page_count) { die "page number $page in line $line_number out of range ". "(page count: $page_count)\n"; } if ($indent_depth > $prev_indent_depth + 1) { die "too mutch indentation in line $line_number:\n$line\n"; } $title = djvused_escape ($title); # print closing parenthesis for the previous entry print {$out_fh} ")"x($prev_indent_depth - $indent_depth + 1), "\n"; $prev_indent_depth = $indent_depth; # print out this outline item print {$out_fh} qq{("$title" "#$page"}; } if ($prev_indent_depth == -1) { warn "removing outline\n"; } # print closing parenthesis print {$out_fh} ")"x($prev_indent_depth + 1), "\n", ")", "\n"; # flush buffer close $out_fh; return $out_filename; } # arg: input filedescriptor # returns output filename sub yaml_to_djvused_outline { my $input_handle = shift; (my $out_fh, my $out_filename) = tempfile (UNLINK => 1); binmode $out_fh, ':encoding(UTF-8)'; my $outline = Load (do {local $/; local $_ = <$input_handle>}); print {$out_fh} "(bookmarks"; if ($outline) { array_ref_to_djvused_outline ($out_fh, $outline); } else { warn "no input, removing outline\n"; } print {$out_fh} "\n)\n"; return $out_filename; } # used by yaml_to_djvused_outline # prints djvused outline recursively from array ref # args: output file descriptor, arrayref sub array_ref_to_djvused_outline { my $output_handle = shift; my $outline = shift; unless (ref ($outline) eq "ARRAY") { die "array_ref_to_djvused_outline: arg not an array ref"; } for my $hashref (@{$outline}) { my $title = $hashref->{title} or die "missing title in outline hash"; $title = djvused_escape ($title); my $page = $hashref->{page}; if ($page < 1 || $page > $page_count) { die "page number $page out of range ". "(page count: $page_count)\n"; } # print out this outline item print {$output_handle} qq{\n("$title" "#$page"}; if ($hashref->{kids}) { array_ref_to_djvused_outline ($output_handle, $hashref->{kids}); } print {$output_handle} ")"; } } sub djvused_escape { # see djvused(1): "DJVUSED FILE FORMATS - Strings" # and perlrebackslash my $title = shift; if ($title =~ /\n/) { die "djvused_escape: \\n in '$title'\n" . "Please report this at $bug_address\n" } # need to escape \, ", \a, \x{08}, \t, \x{0b}, \f and \r # \ is first, since it is contained in the replacement texts $title =~ s/\\/\\\\/g; $title =~ s/"/\\"/g; $title =~ s/\a/\\a/g; $title =~ s/\x{08}/\\b/g; $title =~ s/\t/\\t/g; $title =~ s/\x{0b}/\\v/g; $title =~ s/\f/\\f/g; $title =~ s/\r/\\r/g; return $title; } sub system_wrapper { warn "command: @_\n"; system (@_) == 0 or die "error: system failed: $?\n" } sub get_default_filename { my $filename = shift; my $from = shift; my $too = shift; die "get_default_filename needs arg" unless defined ($too); my $result_filename; if ($filename =~ /\Q$from\E$/) { ($result_filename = $filename) =~ s/\Q$from\E/$too/; } else { $result_filename = $filename . $too; } return $result_filename; } file t/test: #!/usr/bin/env perl # LICENSE: GPLv3+ use 5.014; use warnings; use strict; use File::Temp qw/tempfile/; use utf8; use Test::More; if (!eval 'use YAML::XS qw/LoadFile/; 1;') { die "cannot find the YAML::XS Perl module.\n" . "Try '\$ apt-get install libyaml-libyaml-perl'.\n"; } use constant GET_OUTLINE => "../bookmark-djvu-extract"; use constant SET_OUTLINE => "../bookmark-djvu"; # create empty djvu my $djvu = create_djvu (8); # ################################# # # compare files after set and get: # # ################################# # system_ok (SET_OUTLINE . " -s $djvu input1"); (undef, my $outline) = tempfile (UNLINK => 1); system_ok (GET_OUTLINE . " -s $djvu -o $outline"); # compare files_ok ($outline, "expected1"); unlink ($outline); # default filenames $djvu = create_djvu (8); $outline = ($djvu =~ s/\.djvu$/.bm/r); say "outline: $outline"; system_ok ("cp input1 $outline"); # compare files after set and get: system_ok (SET_OUTLINE . " -s $djvu"); unlink ($outline); system_ok (GET_OUTLINE . " -s $djvu"); # compare files_ok ($outline, "expected1"); unlink ($outline); # #### # # YAML # # #### # $djvu = create_djvu (5); system_ok (SET_OUTLINE . " $djvu input1.yaml"); (undef, $outline) = tempfile (UNLINK => 1); system_ok (GET_OUTLINE . " $djvu -o $outline"); yaml_ok ("input1.yaml", $outline); # default filenames $djvu = create_djvu (5); $outline = ($djvu =~ s/\.djvu$/.outline/r); say "outline: $outline"; system_ok ("cp input1.yaml $outline"); # compare files after set and get: system_ok (SET_OUTLINE . " $djvu"); unlink ($outline); system_ok (GET_OUTLINE . " $djvu"); # compare yaml_ok ("input1.yaml", $outline); unlink ($outline); # ############### # # remove outlines # # ############### # $djvu = create_djvu (5); system_ok (SET_OUTLINE . " $djvu input1.yaml"); # empty input file system_ok (SET_OUTLINE . " $djvu /dev/null"); system_is (GET_OUTLINE . " $djvu -o /dev/null", 1); #simple format $djvu = create_djvu (5); system_ok (SET_OUTLINE . " $djvu input1.yaml"); # empty input file system_ok (SET_OUTLINE . " $djvu -s /dev/null"); system_is (GET_OUTLINE . " $djvu -o /dev/null", 1); done_testing (); sub yaml_ok { my ($file1, $file2) = @_; is_deeply (LoadFile ($file1), LoadFile ($file2), "yaml_ok: file1=$file1, file2=$file2"); } sub system_ok { my ($command) = @_ ; ok (system ($command) == 0, "command: $command"); } sub system_is { my ($command, $exit_status) = @_; system ($command); # see 'perldoc -f system' ok ($? != -1 && !($? & 127) && $? >> 8 == $exit_status, "exit status of '$command' is '$exit_status'"); } sub files_ok { system_ok ("diff -C 3 $_[0] $_[1]"); } # arguments: page count, suffix # returns filename sub create_djvu { my ($page_count, $suffix) = @_; $suffix or $suffix = ".djvu"; $page_count > 0 or $page_count = 1; # create one-page djvu file (undef, my $filename) = tempfile (UNLINK => 1, SUFFIX => $suffix); system_wrapper ("djvumake $filename INFO=100,100,100"); my $page; if ($page_count > 1) { (undef, $page) = tempfile (UNLINK => 1, SUFFIX => ".djvu"); system_wrapper ("djvumake $page INFO=100,100,100"); } for (2..$page_count) { system_wrapper ("djvm -i $filename $page"); } return $filename; } sub system_wrapper { my ($command) = @_; system ($command) == 0 or die "command '$command' failed with status ", $? >> 8, "\n"; } -- To unsubscribe, e-mail: beginners-unsubscr...@perl.org For additional commands, e-mail: beginners-h...@perl.org http://learn.perl.org/