I did some (very limited) testing on storing and retrieving MARC in YAML. The results were not encouraging. IIRC, I just did a direct conversion of the MARC::Record object into YAML and back. Perhaps there's a way to optimize the formatting that would improve performance, but my testing showed sometimes even worse performance than XML.

Did you use YAML or YAML::XS? My tests with YAML::XS show a very significant improvement with YAML: see the attached file. Of course, we should define a serialization format independent of the MARC::Record object if we don't want to break the process whenever the MARC::Record internal data structure changes.

MARCXML is a performance killer at this point, but there's no other apparent way to handle large bib records. The parsing is the issue, not the data transfer load. Perhaps cached BSON-formatted MARC::Record objects are a way out of this.

  Benchmarks should be run with all available serialization formats.

We could also implement the serialization/deserialization logic directly in the MARC::Record library, as is done for the ISO2709 and XML formats, in order to gain control.
--
Frédéric
#!/usr/bin/perl 

#
# Compare MARC::Record object creation from an XML serialization vs
# a YAML one.
#


use strict;
use warnings;
use Storable qw(nstore store_fd nstore_fd freeze thaw dclone);
use MARC::Record;
use MARC::File::XML;
use YAML::XS;
use Time::HiRes qw(gettimeofday);


# Number of deserializations timed for each serialization format.
my $iterations = 1000;

# Each benchmark prints its own timing line. The Storable run is left
# disabled, exactly as it was.
#test_storable( $iterations );
test_xml($iterations);
test_yaml($iterations);


# test_xml( $max )
#
# Times the creation of $max MARC::Record objects from one fixed MARCXML
# document and prints the elapsed wall-clock seconds on STDOUT.
#
#   $max - number of times the XML fixture is parsed.
#
# Returns whatever print returns; the function is called for its output.
sub test_xml {
    my $max = shift;

    # Non-interpolating heredoc: the fixture is literal data. It must be
    # well-formed XML, so attribute values are not followed by ';' and no
    # value is hard-wrapped.
    my $xml = <<'EOS';
<?xml version="1.0" encoding="UTF-8"?>
<record
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"
    xmlns="http://www.loc.gov/MARC21/slim">

  <leader>00571nam0a2200133   4500</leader>
  <controlfield tag="005">20080924184404.000</controlfield>
  <datafield tag="090" ind1=" " ind2=" ">
    <subfield code="a">1000</subfield>
  </datafield>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">20081018              frey50       </subfield>
  </datafield>
  <datafield tag="200" ind1="1" ind2=" ">
    <subfield code="a">Propriété barrière de la peau</subfield>
    <subfield code="b">LIVR</subfield>
    <subfield code="e">caractérisation de l'organisation des lipides par spectroscopie vibrationnelle</subfield>
    <subfield code="f">Emmanuelle Corbé Guillard</subfield>
    <subfield code="g">[sous la direction de] Arlette Baillet-Guffroy</subfield>
  </datafield>
  <datafield tag="210" ind1=" " ind2=" ">
    <subfield code="a">[S.l.]</subfield>
    <subfield code="c">[s.n.]</subfield>
    <subfield code="d">2008</subfield>
  </datafield>
  <datafield tag="215" ind1=" " ind2=" ">
    <subfield code="a">1 vol. (66 f.)</subfield>
    <subfield code="c">ill.</subfield>
    <subfield code="d">30 cm</subfield>
  </datafield>
  <datafield tag="700" ind1=" " ind2="1">
    <subfield code="9">41714</subfield>
    <subfield code="a">Corbé Guillard</subfield>
    <subfield code="b">Emmanuelle</subfield>
    <subfield code="f">1979-....</subfield>
    <subfield code="4">070</subfield>
  </datafield>
  <datafield tag="995" ind1=" " ind2=" ">
    <subfield code="o">0</subfield>
    <subfield code="y">LIVR</subfield>
    <subfield code="f">100999</subfield>
    <subfield code="9">1000</subfield>
    <subfield code="c">TAMIL</subfield>
    <subfield code="k">TPHB 10352</subfield>
    <subfield code="b">TAMIL</subfield>
  </datafield>
  <controlfield tag="001">1000</controlfield>
</record>
EOS

    # Start the clock before the one-off format setup, as the original
    # measurement did.
    my $start = gettimeofday();
    my $count = 0;
    MARC::File::XML->default_record_format('UNIMARC');
    while ( $count < $max ) {
        # The record is built only to be timed; it is discarded at once.
        my $record = MARC::Record->new_from_xml( $xml, "utf8", 'UNIMARC' );
        $count++;
    }

    # Explicit parentheses keep "- $start" from being read as an argument
    # to gettimeofday.
    print "Deserialize $count records from XML: ", gettimeofday() - $start, "\n";
}


# test_yaml( $max )
#
# Times the creation of $max MARC::Record objects from one fixed YAML dump
# of a record (loaded with YAML::XS) and prints the elapsed wall-clock
# seconds on STDOUT.
#
#   $max - number of times the YAML fixture is loaded.
#
# Returns whatever print returns; the function is called for its output.
sub test_yaml {
    my $max = shift;

    # Non-interpolating heredoc: the fixture is literal data. Every scalar
    # stays on a single line; a continuation line at column 0 would end the
    # enclosing sequence and make the document unparsable.
    my $yaml = <<'EOS';
--- !!perl/hash:MARC::Record
_fields:
- !!perl/hash:MARC::Field
  _data: '20080924184404.000'
  _is_control_field: 1
  _tag: 005
  _warnings: []
- !!perl/hash:MARC::Field
  _ind1: ' '
  _ind2: ' '
  _is_control_field: ''
  _subfields:
  - a
  - '1000'
  _tag: 090
  _warnings: []
- !!perl/hash:MARC::Field
  _ind1: ' '
  _ind2: ' '
  _is_control_field: ''
  _subfields:
  - a
  - '20081018              frey50       '
  _tag: 100
  _warnings: []
- !!perl/hash:MARC::Field
  _ind1: '1'
  _ind2: ' '
  _is_control_field: ''
  _subfields:
  - a
  - Propriété barrière de la peau
  - b
  - LIVR
  - e
  - caractérisation de l'organisation des lipides par spectroscopie vibrationnelle
  - f
  - Emmanuelle Corbé Guillard
  - g
  - '[sous la direction de] Arlette Baillet-Guffroy'
  _tag: 200
  _warnings: []
- !!perl/hash:MARC::Field
  _ind1: ' '
  _ind2: ' '
  _is_control_field: ''
  _subfields:
  - a
  - '[S.l.]'
  - c
  - '[s.n.]'
  - d
  - '2008'
  _tag: 210
  _warnings: []
- !!perl/hash:MARC::Field
  _ind1: ' '
  _ind2: ' '
  _is_control_field: ''
  _subfields:
  - a
  - 1 vol. (66 f.)
  - c
  - ill.
  - d
  - 30 cm
  _tag: 215
  _warnings: []
- !!perl/hash:MARC::Field
  _ind1: ' '
  _ind2: '1'
  _is_control_field: ''
  _subfields:
  - '9'
  - '41714'
  - a
  - Corbé Guillard
  - b
  - Emmanuelle
  - f
  - 1979-....
  - '4'
  - '070'
  _tag: 700
  _warnings: []
- !!perl/hash:MARC::Field
  _ind1: ' '
  _ind2: ' '
  _is_control_field: ''
  _subfields:
  - o
  - '0'
  - y
  - LIVR
  - f
  - '100999'
  - '9'
  - '1000'
  - c
  - TAMIL
  - k
  - TPHB 10352
  - b
  - TAMIL
  _tag: 995
  _warnings: []
- !!perl/hash:MARC::Field
  _data: '1000'
  _is_control_field: 1
  _tag: 001
  _warnings: []
_leader: 00571nam0a2200133   4500
_warnings: []
EOS

    # Recent YAML::XS releases do not bless !!perl/hash: nodes unless asked
    # to, which would turn this into a plain-hash benchmark. Blessing is
    # enabled only for the duration of this sub and only because the
    # document is a trusted literal; never do this for external input.
    local $YAML::XS::LoadBlessed = 1;

    my $count = 0;
    my $start = gettimeofday();
    while ( $count < $max ) {
        # The object is built only to be timed; it is discarded at once.
        my $marc = Load($yaml);
        $count++;
    }

    # Explicit parentheses keep "- $start" from being read as an argument
    # to gettimeofday.
    print "Deserialize $count records from YAML: ", gettimeofday() - $start, "\n";
}
      
# test_storable( $max )
#
# Times $max calls to Storable's thaw on one fixed serialized record and
# prints the elapsed wall-clock seconds on STDOUT.
#
#   $max - number of times the payload is thawed.
#
# NOTE(review): the payload below is binary Storable output pasted as text,
# so its length and control bytes have very likely been lost in transit.
# Its leading "pst" also looks like the on-disk header written by
# store/nstore rather than freeze output, which is what thaw expects
# (TODO confirm). Regenerate it with freeze() before enabling this
# benchmark.
sub test_storable {
    my $max = shift;
    my $serial = <<EOS;
pst12345MARC::Record00571nam0a2200133   4500_leader     _warnings       
MARC::Field
1_is_control_field20080924184404.000_data       _warnings005_tag
_is_control_field _ind2a1000
_subfields      _warnings090_tag _ind1
_is_control_field _ind2a#20081018              frey50       
_subfields      _warnings100_tag _ind1
_is_control_field _ind2
a Propriété barrière de la peaubLIVReOcaractérisation de l'organisation des 
lipides par spectroscopie vibrationnellefEmmanuelle Corbé Guillardg.[sous la 
direction de] Arlette Baillet-Guffroy
_subfields      _warnings200_tag1_ind1
_is_control_field _ind2a[S.l.]c[s.n.]d2008
_subfields      _warnings210_tag _ind1
_is_control_field _ind2a1 vol. (66 f.)cill.d30 cm
_subfields      _warnings215_tag _ind1
_is_control_field1_ind2
941714aCorbé Guillardb
Emmanuellef     1979-....4070
_subfields      _warnings700_tag _ind1
_is_control_field _ind2o0yLIVRf10099991000cTAMILk
TPHB 10352bTAMIL
_subfields      _warnings995_tag _ind1
1_is_control_field1000_data     _warnings001_tag_fields
EOS
    my $count = 0;
    my $start = gettimeofday();
    while ( $count < $max ) {
        # The structure is rebuilt only to be timed; it is discarded at once.
        my $marc = thaw( $serial );
        $count++;
    }

    # This line reports the Storable run, so it is labelled as such. Explicit
    # parentheses keep "- $start" from being read as an argument to
    # gettimeofday.
    print "Deserialize $count records from Storable: ", gettimeofday() - $start, "\n";
}
  

_______________________________________________
Koha-devel mailing list
[email protected]
http://lists.koha-community.org/cgi-bin/mailman/listinfo/koha-devel

Reply via email to