Is it just me, or is Python s l o o o w when compared to Perl? I have two scripts, attached. They do EXACTLY the same thing in almost EXACTLY the same manner, but the Python script is almost 25 times slower than the Perl script:
$ time bin/json2catalog.py sample/ > sample.db 2>/dev/null real 0m10.344s user 0m10.281s sys 0m0.059s $ time bin/json2catalog.pl sample/ > sample.db 2>/dev/null real 0m0.364s user 0m0.314s sys 0m0.048s When I started learning Python, and specifically learning Python’s Natural Language Toolkit (NLTK), I thought this slowness was due to the large NLTK library, but now I’m not so sure. Is it just me, or is Python really s l o o o w ? Is there anything I can do to improve/optimize my Python code? — Eric Lease Morgan
#!/usr/bin/env python2

# json2catalog.py - create a "catalog" from a set of HathiTrust json files
# Eric Lease Morgan <emor...@nd.edu>
# May 18, 2015 - first cut; see https://sharc.hathitrust.org/features
#
# Reads every *.json file in the given directory and prints one
# tab-delimited catalog record per file to standard output.

# configure
HEADER = "id\ttitle\tpublication date\tpage count\tHathiTrust URL\tlanguage\tMARC (JSON) URL\tWorldCat URL"
WORLDCAT = 'http://worldcat.org/oclc/'

# require
import glob
import json
import sys
import os

# sanity check; exit nonzero so callers can detect the usage error
if len(sys.argv) != 2:
    # single pre-formatted string keeps this print valid on Python 2 and 3
    print("Usage: %s <directory>" % sys.argv[0])
    sys.exit(1)

# get input
directory = sys.argv[1]

# initialize
print(HEADER)

# process each json file in the given directory; os.path.join works
# whether or not <directory> carries a trailing slash (the original
# `directory + '*.json'` silently matched nothing without one)
for filename in glob.glob(os.path.join(directory, '*.json')):

    # open and read the file
    with open(filename) as data:
        metadata = json.load(data)

    # parse; `record_id` avoids shadowing the built-in id()
    # NOTE(review): the companion Perl script reads 'pubDate' here, not
    # 'dateCreated' -- confirm which key the HathiTrust files actually carry
    record_id = metadata['id']
    title = metadata['metadata']['title']
    date_created = metadata['metadata']['dateCreated']
    page_count = metadata['features']['pageCount']
    handle = metadata['metadata']['handleUrl']
    language = metadata['metadata']['language']
    marc = metadata['metadata']['htBibUrl']
    worldcat = WORLDCAT + metadata['metadata']['oclc']

    # create a record (a distinct name, so the parsed dict is not clobbered)
    # and print it as one tab-delimited line
    record = [record_id, title, date_created, page_count,
              handle, language, marc, worldcat]
    print('\t'.join(map(str, record)))

# done
#!/usr/bin/perl

# json2catalog.pl - create a "catalog" from a set of HathiTrust json files
# Eric Lease Morgan <emor...@nd.edu>
# May 15, 2015 - first cut; see https://sharc.hathitrust.org/features
#
# Reads every *.json file in the given directory and prints one
# tab-delimited catalog record per file to standard output.

# configure
use constant DEBUG    => 0;
use constant WORLDCAT => 'http://worldcat.org/oclc/';
use constant HEADER   => "id\ttitle\tpublication date\tpage count\tHathiTrust URL\tlanguage\tMARC (JSON) URL\tWorldCat URL\n";

# require
use strict;
use warnings;
use Data::Dumper;
use JSON;

# get input; sanity check
my $directory = $ARGV[ 0 ];
if ( ! $directory ) { print "Usage: $0 <directory>\n"; exit; }

# initialize; unbuffer STDOUT and emit UTF-8 (titles may be non-ASCII)
$| = 1;
binmode( STDOUT, ':utf8' );
print HEADER;

# process each file in the given directory
opendir my $dh, $directory or die "Error in opening $directory: $!\n";
while ( my $filename = readdir $dh ) {

    # only *.json files; anchored on the dot so names like "notjson" are skipped
    next if $filename !~ /\.json$/;

    # convert the json file to a hash; the explicit "/" works whether or
    # not $directory was given with a trailing slash
    my $json = decode_json slurp( "$directory/$filename" );
    print Dumper( $json ) if DEBUG;

    # parse
    # NOTE(review): the companion Python script reads 'dateCreated' here,
    # not 'pubDate' -- confirm which key the HathiTrust files actually carry
    my $id        = $json->{ 'id' };
    my $title     = $json->{ 'metadata' }{ 'title' };
    my $date      = $json->{ 'metadata' }{ 'pubDate' };
    my $pagecount = $json->{ 'features' }{ 'pageCount' };
    my $handle    = $json->{ 'metadata' }{ 'handleUrl' };
    my $language  = $json->{ 'metadata' }{ 'language' };
    my $marc      = $json->{ 'metadata' }{ 'htBibUrl' };
    my $worldcat  = WORLDCAT . $json->{ 'metadata' }{ 'oclc' };

    # dump one tab-delimited record
    print "$id\t$title\t$date\t$pagecount\t$handle\t$language\t$marc\t$worldcat\n";
}

# clean up and done
closedir $dh;
exit;

# read and return the entire contents of a file
sub slurp {

    my ( $file ) = @_;

    # 3-arg open with a lexical handle; 2-arg open risks mode injection
    open my $fh, '<', $file or die "Can't open $file: $!\n";
    my $content = do { local $/; <$fh> };    # undef $/ = slurp whole file
    close $fh;

    return $content;
}