Is it just me, or is Python s l o o o w when compared to Perl? I have two scripts, attached. They do EXACTLY the same thing in almost EXACTLY the same manner, but the Python script is almost 25 times slower than the Perl script:
$ time bin/json2catalog.py sample/ > sample.db 2>/dev/null real 0m10.344s user 0m10.281s sys 0m0.059s $ time bin/json2catalog.pl sample/ > sample.db 2>/dev/null real 0m0.364s user 0m0.314s sys 0m0.048s When I started learning Python, and specifically learning Python’s Natural Language Toolkit (NLTK), I thought this slowness was due to the large NLTK library, but now I’m not so sure. Is it just me, or is Python really s l o o o w ? Is there anything I can do to improve/optimize my Python code? — Eric Lease Morgan
#!/usr/bin/env python2

# json2catalog.py - create a "catalog" from a set of HathiTrust json files
# Eric Lease Morgan <emor...@nd.edu>
# May 18, 2015 - first cut; see https://sharc.hathitrust.org/features
#
# Reads every *.json file in the given directory and prints one
# tab-delimited catalog record per file to standard output.

# configure
HEADER = "id\ttitle\tpublication date\tpage count\tHathiTrust URL\tlanguage\tMARC (JSON) URL\tWorldCat URL"
WORLDCAT = 'http://worldcat.org/oclc/'

# require
import glob
import json
import sys
import os

# sanity check; exit nonzero so callers can detect the usage error
if len(sys.argv) != 2:
    # single pre-formatted string keeps this print valid on Python 2 and 3
    print("Usage: %s <directory>" % sys.argv[0])
    sys.exit(1)

# get input
directory = sys.argv[1]

# initialize
print(HEADER)

# process each json file in the given directory; os.path.join works
# whether or not <directory> carries a trailing slash (the original
# `directory + '*.json'` silently matched nothing without one)
for filename in glob.glob(os.path.join(directory, '*.json')):

    # open and read the file
    with open(filename) as data:
        metadata = json.load(data)

    # parse; `record_id` avoids shadowing the built-in id()
    # NOTE(review): the companion Perl script reads 'pubDate' here, not
    # 'dateCreated' -- confirm which key the HathiTrust files actually carry
    record_id = metadata['id']
    title = metadata['metadata']['title']
    date_created = metadata['metadata']['dateCreated']
    page_count = metadata['features']['pageCount']
    handle = metadata['metadata']['handleUrl']
    language = metadata['metadata']['language']
    marc = metadata['metadata']['htBibUrl']
    worldcat = WORLDCAT + metadata['metadata']['oclc']

    # create a record (a distinct name, so the parsed dict is not clobbered)
    # and print it as one tab-delimited line
    record = [record_id, title, date_created, page_count,
              handle, language, marc, worldcat]
    print('\t'.join(map(str, record)))

# done
#!/usr/bin/perl

# json2catalog.pl - create a "catalog" from a set of HathiTrust json files
# Eric Lease Morgan <emor...@nd.edu>
# May 15, 2015 - first cut; see https://sharc.hathitrust.org/features
#
# Reads every *.json file in the given directory and prints one
# tab-delimited catalog record per file to standard output.

# configure
use constant DEBUG    => 0;
use constant WORLDCAT => 'http://worldcat.org/oclc/';
use constant HEADER   => "id\ttitle\tpublication date\tpage count\tHathiTrust URL\tlanguage\tMARC (JSON) URL\tWorldCat URL\n";

# require
use strict;
use warnings;
use Data::Dumper;
use JSON;

# get input; sanity check
my $directory = $ARGV[ 0 ];
if ( ! $directory ) { print "Usage: $0 <directory>\n"; exit; }

# initialize; unbuffer STDOUT and emit UTF-8 (titles may be non-ASCII)
$| = 1;
binmode( STDOUT, ':utf8' );
print HEADER;

# process each file in the given directory
opendir my $dh, $directory or die "Error in opening $directory: $!\n";
while ( my $filename = readdir $dh ) {

    # only *.json files; anchored on the dot so names like "notjson" are skipped
    next if $filename !~ /\.json$/;

    # convert the json file to a hash; the explicit "/" works whether or
    # not $directory was given with a trailing slash
    my $json = decode_json slurp( "$directory/$filename" );
    print Dumper( $json ) if DEBUG;

    # parse
    # NOTE(review): the companion Python script reads 'dateCreated' here,
    # not 'pubDate' -- confirm which key the HathiTrust files actually carry
    my $id        = $json->{ 'id' };
    my $title     = $json->{ 'metadata' }{ 'title' };
    my $date      = $json->{ 'metadata' }{ 'pubDate' };
    my $pagecount = $json->{ 'features' }{ 'pageCount' };
    my $handle    = $json->{ 'metadata' }{ 'handleUrl' };
    my $language  = $json->{ 'metadata' }{ 'language' };
    my $marc      = $json->{ 'metadata' }{ 'htBibUrl' };
    my $worldcat  = WORLDCAT . $json->{ 'metadata' }{ 'oclc' };

    # dump one tab-delimited record
    print "$id\t$title\t$date\t$pagecount\t$handle\t$language\t$marc\t$worldcat\n";
}

# clean up and done
closedir $dh;
exit;

# read and return the entire contents of a file
sub slurp {

    my ( $file ) = @_;

    # 3-arg open with a lexical handle; 2-arg open risks mode injection
    open my $fh, '<', $file or die "Can't open $file: $!\n";
    my $content = do { local $/; <$fh> };    # undef $/ = slurp whole file
    close $fh;

    return $content;
}