And for those who want to run test by their self, here attached is my
tests comparing pure Perl parsing and various SAX parser (which need to
be installed):

    XML::SAX::PurePerl
    XML::LibXML::SAX::Parser
    XML::SAX::Expat
    XML::SAX::ExpatXS

SAX parsing is done directly, without using MARC::File::XML in order to
have raw figures. Parsing in MARC::File::XML should slow down a little
bit but I can't say of what magnitude.

#!/usr/bin/perl

use warnings;
use strict;

use XML::Simple;
use MARC::File::XML;
use Time::HiRes qw(gettimeofday);


# Number of time the following record is parsed
my $max = 1000;

# Tested SAX parsers
my @xml_parsers = qw(
    XML::SAX::PurePerl
    XML::LibXML::SAX::Parser
    XML::SAX::Expat
    XML::SAX::ExpatXS
);

my $raw_xml = <<EOS;
<record>
  <leader>00675cam a22002051  4500</leader>
  <controlfield tag="001">   10026159 </controlfield>
  <controlfield tag="003">DLC</controlfield>
  <controlfield tag="005">20050815184409.0</controlfield>
  <controlfield tag="008">830916s1910    gw            000 0 ger  
</controlfield>
  <datafield tag="010" ind1=" " ind2=" ">
    <subfield code="a">   10026159 </subfield>
  </datafield>
  <datafield tag="035" ind1=" " ind2=" ">
    <subfield code="a">(OCoLC)9914473</subfield>
  </datafield>
  <datafield tag="040" ind1=" " ind2=" ">
    <subfield code="a">DLC</subfield>
    <subfield code="c">OCU</subfield>
    <subfield code="d">OCU</subfield>
    <subfield code="d">DLC</subfield>
  </datafield>
  <datafield tag="042" ind1=" " ind2=" ">
    <subfield code="a">premarc</subfield>
  </datafield>
  <datafield tag="050" ind1="0" ind2="0">
    <subfield code="a">PA6792.Z9</subfield>
    <subfield code="b">G4</subfield>
  </datafield>
  <datafield tag="100" ind1="1" ind2=" ">
    <subfield code="a">Germann, Peter.</subfield>
  </datafield>
  <datafield tag="245" ind1="1" ind2="4">
    <subfield code="a">Die sogenannten Sententiae Varronis.</subfield>
    <subfield code="c">Von Peter Germann.</subfield>
  </datafield>
  <datafield tag="260" ind1=" " ind2=" ">
    <subfield code="a">Paderborn,</subfield>
    <subfield code="b">F. Schöningh,</subfield>
    <subfield code="c">1910.</subfield>
  </datafield>
  <datafield tag="300" ind1=" " ind2=" ">
    <subfield code="a">2 p. l., 98 p., 1 l.</subfield>
    <subfield code="c">24 cm.</subfield>
  </datafield>
  <datafield tag="440" ind1=" " ind2="0">
    <subfield code="a">Studien zur Geschichte und Kultur des Altertums 
...</subfield>
    <subfield code="v">3. Bd., 6. Hft</subfield>
  </datafield>
  <datafield tag="600" ind1="1" ind2="0">
    <subfield code="a">Varro, Marcus Terentius.</subfield>
    <subfield code="k">Spurious and doubtful works.</subfield>
    <subfield code="t">Sententiae Varronis.</subfield>
  </datafield>
</record>
EOS



# Pure Perl parser
sub get_record_from_xml {
    my $raw = shift;

    return unless $raw;
    return undef unless $raw =~ /<record/;

    my @parts = split />/, $raw;
    my $record = MARC::Record->new();
    my @fields;
    while ( @parts ) {
        $_ = shift @parts;
        $_ = shift @parts if /<record/;
        if ( /<leader/ ) {
            $_ = shift @parts;
            /(.*)<\/leader/;
            $record->leader($_);
            next;
        }
        if ( /<controlfield\s*tag="(.*)"/ ) {
            my $tag = $1;
            $_ = shift @parts;
            s/<\/controlfield//;
            push @fields, MARC::Field->new( $tag, $_ );
            next;
        }
        if ( /<datafield\s*tag="(.*?)"\s*ind1="(.*?)"\s*ind2="(.*)"/ ) {
            my ($tag, $ind1, $ind2) = ($1, $2, $3);
            my @subf;
            while ( @parts && $parts[0] =~ /<subfield.*code="(.*)"/ ) {
                my $letter = $1;
                shift @parts;
                $_ = shift @parts;
                s/<\/subfield//;
                push @subf, $letter => $_;
            }
            push @fields, MARC::Field->new($tag, $ind1, $ind2, @subf);
            shift @parts;
            next;
        }
        last;
    }
    $record->{_fields} = \...@fields;

    return $record;
};


my $xs = XML::Simple->new();

# SAX parser
sub get_record_from_xml_sax {
    my ($raw) = @_;

    return unless $raw;
    my $ref = eval { $xs->XMLin($raw, forcearray => [ 'subfield' ] ) };
    return undef if $@;

    my $record = MARC::Record->new();
    $record->leader( $ref->{leader} );
    my @fields_control = map {
        MARC::Field->new( $_->{tag}, $_->{content} );
    } @{$ref->{controlfield}};
    my @fields_std = map {                                                      
       
        my @sf = map { ($_->{code}, $_->{content}) }  @{$_->{subfield}};        
     
        MARC::Field->new(
            $_->{tag},
            $_->{ind1},
            $_->{ind2},
            @sf,
        );
    } @{$ref->{datafield}};
    $record->{_fields} = [ @fields_control, @fields_std ];
}


sub parse_with_perl {
    my $start = gettimeofday;
    for ( my $count = 0; $count < $max; $count++ ) {
        my $record = get_record_from_xml($raw_xml);
    }
    print "Parsed $max MARC::Record objects from XML in pure Perl: ",
          gettimeofday - $start, "\n";
}


sub parse_with_sax {
    for my $sax_parser ( @xml_parsers ) {
        $XML::SAX::ParserPackage = $sax_parser;
        my $count = 0;
        my $start = gettimeofday;
        for ( my $count = 0; $count < $max; $count++ ) {
            my $record = get_record_from_xml_sax( $raw_xml );
            $count++;
            last if $count == $max;
        }
        print "Parsed $max MARC::Record objects from XML using ",
              "$sax_parser : ", gettimeofday - $start, "\n";
    }
}


parse_with_perl();
parse_with_sax();

_______________________________________________
Koha-devel mailing list
[email protected]
http://lists.koha-community.org/cgi-bin/mailman/listinfo/koha-devel
website : http://www.koha-community.org/
git : http://git.koha-community.org/
bugs : http://bugs.koha-community.org/

Reply via email to