I've written the code below to parse a number of text page files generated by 
Tesseract OCR software to look for a guess the most likely values for VIN, 
Reg number and stock number for a vehicle.

On my development site it works fine and gives the values it should.
However, on my live server it only returns '1' as the value.

The only thing I can think of is that my live server is much older than my 
development server.  My perl versions are:

Live            v5.10.0
Devel   v5.18.4

Am I right, and what can I do to get the code to work on my live server (other 
than upgrage perl)


#!/usr/bin/perl -w

# searches a series of OCR generated text files - one per page
# looks for sets of regex's for field contents and stores in arrays

use warnings;
use strict;
use Data::Dumper;

my %searches=('stock'=>[qr/\b([NU][LD] *\d{5})\b/],
              'regno'=>[qr/\b([A-Za-z]{2}\d{2}[A-Za-z]{3})\b/],
              'vin'=>[qr/\b(WF[0O]XX[A-Z]{6}\d{5}\b)/i,qr/\b([A-Z]
{6}\d{5}\b)/i]);
my %found;
my %values;
foreach my $file (<*.txt>) {
  print "file.....$file\n";
  if (!open FH,$file ) {
    print "file open failed: $!\n";
    next;
  }
  my $content = do { local $/; <FH> };
  close(FH);
# print "*****$content*********\n";

  foreach my $field (keys %searches) { # foreach field
#####THe following line is the one with the problem
    if (my @matches = $content =~ @{$searches{$field}}) {
      foreach (@matches) {
        $_=~s/ //g;
        print STDERR "match found - '$field': '$_'\n";
        if ($found{$field}{$_}) {
          $found{$field}{$_}++;
        } else {
          $found{$field}{$_}=1;
        }
      }
    }
  }
  
} # foreach page


foreach my $field (keys %found) { # foreach field
  my $value='';
  my $count=0;
  foreach my $key (keys %{$found{$field}}) { # foreach field -> value
    $value=$key if ($found{$field}{$key} > $count);
  }
  print STDERR "field='$field' value='$value'\n";
  $values{$field}=$value;
}

print STDERR Dumper(%found);
print STDERR Dumper(%values);



Development server output

[gary@gary tmp]$ parse_deal_pack 
file.....DOC160715-16072015164033.pdf-01.txt
match found - 'stock': 'NL31047'
file.....DOC160715-16072015164033.pdf-02.txt
file.....DOC160715-16072015164033.pdf-03.txt
match found - 'regno': 'yy15yyy'
file.....DOC160715-16072015164033.pdf-04.txt
file.....DOC160715-16072015164033.pdf-05.txt
file.....DOC160715-16072015164033.pdf-06.txt
match found - 'stock': 'NL31047'
file.....DOC160715-16072015164033.pdf-07.txt
file.....DOC160715-16072015164033.pdf-08.txt
file.....DOC160715-16072015164033.pdf-09.txt
file.....DOC160715-16072015164033.pdf-10.txt
match found - 'stock': 'NL31047'
file.....DOC160715-16072015164033.pdf-11.txt
match found - 'stock': 'NL01047'
file.....DOC160715-16072015164033.pdf-12.txt
match found - 'stock': 'NL31047'
file.....DOC160715-16072015164033.pdf-13.txt
match found - 'regno': 'yy15yyy'
file.....DOC160715-16072015164033.pdf-14.txt
file.....DOC160715-16072015164033.pdf-15.txt
match found - 'regno': 'yy15yyy'
file.....DOC160715-16072015164033.pdf-16.txt
file.....DOC160715-16072015164033.pdf-17.txt
file.....DOC160715-16072015164033.pdf-18.txt
file.....DOC160715-16072015164033.pdf-19.txt
match found - 'regno': 'yy15yyy'
field='stock' value='NL31047'
field='regno' value='yy15yyy'
$VAR1 = 'stock';
$VAR2 = {
          'NL01047' => 1,
          'NL31047' => 4
        };
$VAR3 = 'regno';
$VAR4 = {
          'yy15yyy' => 4
        };
$VAR1 = 'regno';
$VAR2 = 'yy15yyy';
$VAR3 = 'stock';
$VAR4 = 'NL31047';
[gary@gary tmp]$ 





Live server
[root@ollie faxgateway_10734]# parse_deal_pack    
file.....DOC160715-16072015164033.pdf-01.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-02.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-03.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-04.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-05.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-06.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-07.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-08.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-09.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-10.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-11.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-12.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-13.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-14.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-15.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-16.txt
match found - 'vin': '1'
file.....DOC160715-16072015164033.pdf-17.txt
match found - 'vin': '1'
file.....DOC160715-16072015164033.pdf-18.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-19.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
field='regno' value='1'
field='stock' value='1'
field='vin' value='1'
$VAR1 = 'regno';
$VAR2 = {
          '1' => 17
        };
$VAR3 = 'stock';
$VAR4 = {
          '1' => 17
        };
$VAR5 = 'vin';
$VAR6 = {
          '1' => 19
        };
$VAR1 = 'regno';
$VAR2 = '1';
$VAR3 = 'vin';
$VAR4 = '1';
$VAR5 = 'stock';
$VAR6 = '1';


-- 
To unsubscribe, e-mail: beginners-unsubscr...@perl.org
For additional commands, e-mail: beginners-h...@perl.org
http://learn.perl.org/


Reply via email to