From 81f726e2603aefc41fbba9a1398f3f5d87329d70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1s=20Cohen=20Arazi?= <[email protected]>
Date: Tue, 24 Aug 2010 13:09:50 -0300
Subject: [PATCH 2/2] [Bug #5166] Refactor rebuild_zebra.pl into a library

---
 C4/Catalog/Rebuild.pm |  426 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 426 insertions(+), 0 deletions(-)
 create mode 100755 C4/Catalog/Rebuild.pm

diff --git a/C4/Catalog/Rebuild.pm b/C4/Catalog/Rebuild.pm
new file mode 100755
index 0000000..c022989
--- /dev/null
+++ b/C4/Catalog/Rebuild.pm
@@ -0,0 +1,426 @@
+package C4::Catalog::Rebuild;
+
+use strict;
+use C4::Context;
+use Getopt::Long;
+use File::Temp qw/ tempdir /;
+use File::Path;
+use C4::Biblio;
+use C4::AuthoritiesMarc;
+
+# Library for checking for updates in zebraqueue.
+# Can also check the zebradir structure and create directories and
+# mandatory files if needed.
+
+    my $nosanitize = '';                 # forwarded to index_records by update_auth
+    my $skip_export = '';                # forwarded to index_records by update_auth
+    my $keep_export = '';                # not referenced anywhere else in this file
+    my $noxml = '';                      # used by update_auth when calling index_records
+    my $as_xml = '';                     # forwarded to index_records by update_auth
+    my $process_zebraqueue = 1;          # not referenced: index_records has its own parameter of this name
+    my $do_not_clear_zebraqueue = '';    # forwarded to index_records by update_auth
+    my $verbose_logging = '';            # read directly by the export subs to gate progress output
+    my $reset = '';                      # handed to do_indexing, where a true value runs "zebraidx ... init"
+    my $noshadow = '';                   # handed to do_indexing; a false value makes it run "zebraidx ... commit"
+    my $biblioserverdir     = C4::Context->zebraconfig('biblioserver')->{directory};       # not referenced elsewhere in this file
+    my $authorityserverdir  = C4::Context->zebraconfig('authorityserver')->{directory};    # not referenced elsewhere in this file
+    my $kohadir             = C4::Context->config('intranetdir');                          # not referenced elsewhere in this file
+    my $dbh                 = C4::Context->dbh;    # database handle shared by every sub below; fetched once, when the module is loaded
+    my ($biblionumbertagfield,$biblionumbertagsubfield) = &GetMarcFromKohaField("biblio.biblionumber","");    # not referenced elsewhere in this file
+    my ($biblioitemnumbertagfield,$biblioitemnumbertagsubfield) = &GetMarcFromKohaField("biblioitems.biblioitemnumber","");    # not referenced elsewhere in this file
+
+
+sub update_auth {
+    # Index the pending zebraqueue entries for authority records; returns 0.
+
+    # Scratch directory for the export files; File::Temp cleans it up once the object goes out of scope.
+    my $directory = File::Temp->newdir();
+    my $zebraidx_log_opt = " -v none,fatal,warn ";
+
+    # A plain comma continues the argument list: a trailing "\" here would
+    # turn the next argument into \$noxml, a reference that is always true.
+    index_records('authority', $directory, $skip_export, "1", $as_xml,
+                  $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt);
+
+    return 0;
+}
+
+sub update_auth_and_biblio {
+    # Process queued zebraqueue changes for authorities, then for biblios,
+    # exporting as MARCXML into one shared temporary directory.  Returns 0.
+    my $log_flags = " -v none,fatal,warn ";
+    my $workdir   = File::Temp->newdir();
+
+    # Positional flags: skip_export=0, process_zebraqueue=1, as_xml=1,
+    # noxml=0, nosanitize=0, do_not_clear_zebraqueue=0.
+    foreach my $kind ('authority', 'biblio') {
+        index_records($kind, $workdir, 0, 1, 1, 0, 0, 0, $verbose_logging, $log_flags);
+    }
+
+    return 0;
+}
+
+sub index_records {
+    my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt) = @_;
+    # Export $record_type records beneath $directory (unless $skip_export
+    # is set) and then run zebraidx over whatever was exported.
+    my $num_records_exported = 0;
+    my $num_records_deleted = 0;
+
+    if ($skip_export) {
+        # Honour $skip_export even when not verbose; only the banner is optional.
+        if ( $verbose_logging ) {
+            print "====================\n";
+            print "SKIPPING $record_type export\n";
+            print "====================\n";
+        }
+    } else {
+        if ( $verbose_logging ) {
+            print "====================\n";
+            print "exporting $record_type\n";
+            print "====================\n";
+        }
+        mkdir "$directory" unless (-d $directory);
+        mkdir "$directory/$record_type" unless (-d "$directory/$record_type");
+        if ($process_zebraqueue) {
+            my $entries = select_zebraqueue_records($record_type, 'deleted');
+            mkdir "$directory/del_$record_type" unless (-d "$directory/del_$record_type");
+            $num_records_deleted = generate_deleted_marc_records($record_type, $entries, "$directory/del_$record_type", $as_xml);
+            mark_zebraqueue_batch_done($entries);
+            $entries = select_zebraqueue_records($record_type, 'updated');
+            mkdir "$directory/upd_$record_type" unless (-d "$directory/upd_$record_type");
+            $num_records_exported = export_marc_records_from_list($record_type, $entries, "$directory/upd_$record_type", $as_xml, $noxml);
+            mark_zebraqueue_batch_done($entries);
+        } else {
+            my $sth = select_all_records($record_type);
+            $num_records_exported = export_marc_records_from_sth($record_type, $sth, "$directory/$record_type", $as_xml, $noxml, $nosanitize);
+            unless ($do_not_clear_zebraqueue) {
+                mark_all_zebraqueue_done($record_type);
+            }
+        }
+    }
+
+    # Hand whatever was exported (or left in place by an earlier run) to zebraidx.
+    if ( $verbose_logging ) {
+        print "====================\n";
+        print "REINDEXING zebra\n";
+        print "====================\n";
+    }
+    my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
+    if ($process_zebraqueue) {
+        do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+            if $num_records_deleted;
+        do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+            if $num_records_exported;
+    } else {
+        do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+            if ($num_records_exported or $skip_export);
+    }
+}
+
+sub select_zebraqueue_records {
+    # Fetch the pending (done = 0) zebraqueue rows for one record type and
+    # one kind of change, highest id first, as an arrayref of hashrefs.
+    my ($record_type, $update_type) = @_;
+    my $operation = ($update_type eq 'deleted') ? 'recordDelete' : 'specialUpdate';
+    my $zebra_server = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
+    my $pending = $dbh->prepare("SELECT id, biblio_auth_number 
+                             FROM zebraqueue
+                             WHERE server = ?
+                             AND   operation = ?
+                             AND   done = 0
+                             ORDER BY id DESC");
+    $pending->execute($zebra_server, $operation);
+    return $pending->fetchall_arrayref({});
+}
+
+sub mark_all_zebraqueue_done {
+    # Flag every pending zebraqueue row for this record type's server as done.
+    my ($record_type) = @_;
+    my $server = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
+
+    my $sth = $dbh->prepare("UPDATE zebraqueue SET done = 1
+                             WHERE server = ?
+                             AND done = 0");
+    $sth->execute($server);
+    print "MARK\n" if ( $verbose_logging );    # progress marker; a library must stay quiet unless verbose
+}
+
+sub mark_zebraqueue_batch_done {
+    # Mark each zebraqueue row in $entries (arrayref of hashrefs with an 'id' key) as done, in one transaction.
+    my ($entries) = @_;
+    $dbh->{AutoCommit} = 0;
+    my $sth = $dbh->prepare("UPDATE zebraqueue SET done = 1 WHERE id = ?");
+    foreach my $entry (@$entries) {
+        $sth->execute($entry->{id});
+    }
+    $dbh->commit();    # commit once the updates have run, not before them
+    $dbh->{AutoCommit} = 1;
+}
+
+# Full-export queries: each returns an already-executed statement handle of record numbers.
+sub select_all_records {
+    return $_[0] eq 'biblio' ? select_all_biblios() : select_all_authorities();
+}
+
+sub select_all_authorities {
+    my $authid_sth = $dbh->prepare("SELECT authid FROM auth_header");
+    $authid_sth->execute();
+    return $authid_sth;
+}
+
+sub select_all_biblios {
+    my $biblionumber_sth = $dbh->prepare("SELECT biblionumber FROM biblioitems ORDER BY biblionumber");
+    $biblionumber_sth->execute();
+    return $biblionumber_sth;
+}
+
+sub export_marc_records_from_sth {
+    my ($record_type, $sth, $directory, $as_xml, $noxml, $nosanitize) = @_;
+    # Write one record per row of $sth to $directory/exported_records and
+    # return how many were written.  Dies if the file cannot be opened or closed.
+    my $num_exported = 0;
+    my $outfile = "$directory/exported_records";
+    open (my $out, ">:utf8 ", $outfile) or die "cannot open $outfile: $!";
+    my $i = 0;
+    while (my ($record_number) = $sth->fetchrow_array) {
+        print "." if ( $verbose_logging );
+        print "\r$i" unless ($i++ %100 or !$verbose_logging);
+        if ( $nosanitize ) {    # write the stored MARCXML untouched, whatever $as_xml says
+            my $marcxml = $record_type eq 'biblio'
+                          ? GetXmlBiblio( $record_number )
+                          : GetAuthorityXML( $record_number );
+            if ( $marcxml ) {
+                print {$out} $marcxml;
+                $num_exported++;
+            }
+            next;
+        }
+        my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
+        if (defined $marc) {
+            # FIXME - with $as_xml and several records the file is not valid XML
+            # (<record> elements with no single root element).  zebraidx does not
+            # seem to care with the GRS-1 filter; the DOM filter needs valid XML.
+            print {$out} $as_xml ? $marc->as_xml_record() : $marc->as_usmarc();
+            $num_exported++;
+        }
+    }
+    print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+    close $out or die "cannot close $outfile: $!";    # buffered write errors surface here
+    return $num_exported;
+}
+
+sub export_marc_records_from_list {
+    my ($record_type, $entries, $directory, $as_xml, $noxml) = @_;
+    # Write each distinct biblio_auth_number in $entries, once, to
+    # $directory/exported_records; returns how many records were written.
+    my $num_exported = 0;
+    my $outfile = "$directory/exported_records";
+    open (my $out, ">:utf8 ", $outfile) or die "cannot open $outfile: $!";
+    my $i = 0;
+    my %found = ();
+    foreach my $record_number ( map { $_->{biblio_auth_number} }
+                                grep { !$found{ $_->{biblio_auth_number} }++ }
+                                @$entries ) {
+        print "." if ( $verbose_logging );
+        print "\r$i" unless ($i++ %100 or !$verbose_logging);
+        my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
+        if (defined $marc) {
+            # FIXME - with $as_xml and several records the file is not valid XML
+            # (<record> elements with no single root element).  zebraidx does not
+            # seem to care with the GRS-1 filter; the DOM filter needs valid XML.
+            print {$out} $as_xml ? $marc->as_xml_record() : $marc->as_usmarc();
+            $num_exported++;
+        }
+    }
+    print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+    close $out or die "cannot close $outfile: $!";    # buffered write errors surface here
+    return $num_exported;
+}
+
+sub generate_deleted_marc_records {
+    my ($record_type, $entries, $directory, $as_xml) = @_;
+    # Write an otherwise empty record carrying only its identifier(s) for every
+    # entry to $directory/exported_records; returns how many were written.
+    my $num_exported = 0;
+    my $outfile = "$directory/exported_records";
+    open (my $out, ">:utf8 ", $outfile) or die "cannot open $outfile: $!";
+    my $i = 0;
+    foreach my $record_number (map { $_->{biblio_auth_number} } @$entries ) {
+        print "\r$i" unless ($i++ %100 or !$verbose_logging);
+        print "." if ( $verbose_logging );
+
+        # Passing the number twice gives fix_biblio_ids a biblioitemnumber, so it skips its lookup.
+        my $marc = MARC::Record->new();
+        if ($record_type eq 'biblio') {
+            fix_biblio_ids($marc, $record_number, $record_number);
+        } else {
+            fix_authority_id($marc, $record_number);
+        }
+        if (C4::Context->preference("marcflavour") eq "UNIMARC") {
+            fix_unimarc_100($marc);
+        }
+        print {$out} $as_xml ? $marc->as_xml_record() : $marc->as_usmarc();
+        $num_exported++;
+    }
+    print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+    close $out or die "cannot close $outfile: $!";    # buffered write errors surface here
+    return $num_exported;
+}
+
+sub get_corrected_marc_record {
+    # Load a record and patch its leader, identifiers and (for UNIMARC)
+    # field 100 before export.  Yields the MARC::Record, undef if none
+    # could be loaded, or nothing if a biblio's ids cannot be resolved.
+    my ($record_type, $record_number, $noxml) = @_;
+
+    my $record = get_raw_marc_record($record_type, $record_number, $noxml);
+    return $record unless defined $record;
+
+    fix_leader($record);
+    if ($record_type ne 'biblio') {
+        fix_authority_id($record, $record_number);
+    } elsif (!fix_biblio_ids($record, $record_number)) {
+        return;
+    }
+
+    fix_unimarc_100($record) if C4::Context->preference("marcflavour") eq "UNIMARC";
+
+    return $record;
+}
+
+sub get_raw_marc_record {
+    # Load one record as a MARC::Record.  Biblios come from the stored blob
+    # when $noxml is set, otherwise from GetMarcBiblio; any other type goes
+    # through GetAuthority.  Returns nothing if the row is missing or the loader throws.
+    my ($record_type, $record_number, $noxml) = @_;
+
+    if ($record_type ne 'biblio') {
+        my $authority;
+        eval { $authority = GetAuthority($record_number); };
+        return $authority unless $@;
+        warn "error retrieving authority $record_number";
+        return;
+    }
+
+    unless ($noxml) {
+        my $biblio;
+        eval { $biblio = GetMarcBiblio($record_number); };
+        return $biblio unless $@;
+        # An exception means the bib was found but could not be parsed,
+        # so this case does deserve a warning.
+        warn "error retrieving biblio $record_number";
+        return;
+    }
+
+    my $blob_sth = $dbh->prepare_cached("SELECT marc FROM biblioitems WHERE biblionumber = ?");
+    $blob_sth->execute($record_number);
+    my @row = $blob_sth->fetchrow_array;
+    # Failing to find a bib is not a problem: a delete could have been
+    # done before the queued record update was processed.
+    return unless @row;
+
+    my $record = MARC::Record->new_from_usmarc($row[0]);
+    $blob_sth->finish();
+    return $record;
+}
+
+sub fix_leader {
+    # Blank Leader/00-04 and overwrite Leader/10-16 with '22' plus five
+    # spaces; the intent is to force those positions to be recalculated
+    # when $marc->as_usmarc() or $marc->as_xml() is called.
+    # FIXME - this routine is suspect.  It should only be necessary if
+    # MARC::Record (definitely) or MARC::File::XML (arguably) emit incorrect
+    # leader values, which would be a serious bug in those modules.
+    my ($record) = @_;
+
+    my $leader = $record->leader;
+    substr($leader,  0, 5, '     ');
+    substr($leader, 10, 7, '22     ');
+    my $trimmed = substr($leader, 0, 24);
+    $record->leader($trimmed);
+}
+
+sub fix_biblio_ids {
+    # Stamp the biblionumber and biblioitemnumber into $marc.  When no
+    # biblioitemnumber argument is given it is looked up in biblioitems.
+    # Returns 1 on success, 0 (after a warning) if the lookup finds nothing.
+    # FIXME - the biblionumber must be present or Zebra will choke on the
+    #         record, but this logic belongs in the relevant C4::Biblio APIs.
+    my ($marc, $biblionumber, @explicit_itemnumber) = @_;
+
+    my $biblioitemnumber;
+    if (@explicit_itemnumber) {
+        $biblioitemnumber = $explicit_itemnumber[0];
+    } else {
+        my $lookup = $dbh->prepare(
+            "SELECT biblioitemnumber FROM biblioitems WHERE biblionumber=?");
+        $lookup->execute($biblionumber);
+        ($biblioitemnumber) = $lookup->fetchrow_array;
+        $lookup->finish;
+        if (!$biblioitemnumber) {
+            warn "failed to get biblioitemnumber for biblio $biblionumber";
+            return 0;
+        }
+    }
+
+    # FIXME - this is cheating on two levels:
+    # 1. C4::Biblio::_koha_marc_update_bib_ids is meant to be internal;
+    # 2. GetMarcBiblio ought to guarantee that both ids are correct and
+    #    present in the MARC::Record object.
+    # Still, this is better than duplicating the id-insertion code, as
+    # rebuild_zebra.pl used to do.
+    C4::Biblio::_koha_marc_update_bib_ids($marc, '', $biblionumber, $biblioitemnumber);
+
+    return 1;
+}
+
+sub fix_authority_id {
+    # Make sure field 001 of $marc holds $authid (needed for Zebra's sake).
+    # FIXME - as with fix_biblio_ids, this really belongs in C4::AuthoritiesMarc.
+    my ($marc, $authid) = @_;
+    my $current_001 = $marc->field('001');
+    unless ($current_001 and $current_001->data() eq $authid) {
+        $marc->delete_field($marc->field('001'));
+        $marc->insert_fields_ordered(MARC::Field->new('001',$authid));
+    }
+}
+
+sub fix_unimarc_100 {
+    # Force positions 22-27 of field 100$a to "frey50", building a
+    # date-stamped, 35-character 100$a first when the record has none of that length.
+    # FIXME - again, if this is necessary, it belongs in C4::AuthoritiesMarc.
+    my $marc = shift;
+    require POSIX;    # strftime is called fully qualified below; do not rely on another module having loaded POSIX
+    my $string;
+    if ( length($marc->subfield( 100, "a" )) == 35 ) {
+        $string = $marc->subfield( 100, "a" );
+        my $f100 = $marc->field(100);
+        $marc->delete_field($f100);
+    }
+    else {
+        $string = sprintf( "%-*s", 35, POSIX::strftime( "%Y%m%d", localtime ) );    # digits only, so no dash stripping is needed
+    }
+    substr( $string, 22, 6, "frey50" );
+    unless ( length($marc->subfield( 100, "a" )) == 35 ) {    # re-checked after the delete above
+        $marc->delete_field($marc->field(100));
+        $marc->insert_grouped_field(MARC::Field->new( 100, "", "", "a" => $string ));
+    }
+}
+
+sub do_indexing {
+    my ($record_type, $op, $record_dir, $reset_index, $noshadow, $record_format, $zebraidx_log_opt) = @_;
+    # Run zebraidx: optional "init", then $op on $record_dir, then "commit" unless $noshadow; failures are warned about.
+    my ($zebra_server, $zebra_db_name) = ($record_type eq 'biblio') ? ('biblioserver', 'biblios') : ('authorityserver', 'authorities');
+    my $zebraidx = "zebraidx -c " . C4::Context->zebraconfig($zebra_server)->{'config'} . " $zebraidx_log_opt";
+    my $target   = "-g $record_format -d $zebra_db_name";
+    my @commands = ("$zebraidx $noshadow $target $op $record_dir");
+    unshift @commands, "$zebraidx $target init" if $reset_index;
+    push @commands, "$zebraidx $target commit" unless $noshadow;
+    foreach my $command (@commands) {
+        system($command) == 0 or warn "zebraidx command failed (status $?): $command\n";
+    }
+}
+
+
+1;
-- 
1.7.0.4

_______________________________________________
Koha-patches mailing list
[email protected]
http://lists.koha-community.org/cgi-bin/mailman/listinfo/koha-patches

Reply via email to