I have written an interface for m-w.com. I found some scripts on the web but nothing really robust. Please have a look, make comments, request functionality, make suggestions, make changes, or anything else you feel useful. If I have enough time, I will make this fully object-oriented.
I welcome all comments. Thanks, Michael --------------begin mw.pl----------------------------- #!/usr/bin/perl use warnings; use strict; use LWP::UserAgent; use HTTP::Request::Common qw(POST); use Getopt::Std; # OPTIONS: # 1. retain tabs a # 2. double spaced d # 3. help h # 4. divider btween words i # 5. wrap text r # 6. thesaurus t # 7. ugly formatting u # 8. verbose or recursive v # 9. set width w local $SIG{__WARN__} = sub { my $msg = shift; die $msg if $msg =~ /Unknown option/; }; getopts('adhi:rtuvw:', \my %opts); if (@ARGV==0 or $opts{'h'} or ($opts{'w'} and $opts{'w'} !~ /^\d+$/)) { die <<"HELP"; usage: $0 [-dhrtuv -iC -wN] word1 [word2 [word3 [...]]] -a Do not convert tabs to single spaces, does not wrap properly -d Double-spaced, default is single-spaced -h Prints this usage message -iC Prints a divider of character C between entries -r Wrap line at width by inserting \\n, use with -wN -t Use the thesaurus, default is dictionary -u Ugly formattting, translate <br> to space instead of \\n But why would you want to do this? -v Verbose, print all entries, default is only the first -wN Set widths of divider and text-wrap to N, default is 80\n Examples: # all definitions of harvest and posture separated by 80 _'s $0 -vi_ harvest posture\n # all thesaurus entries for hot separated by 60 +'s $0 -tvi+ -w60 hot\n # first definitions of cape and toad, double-spaced, separated by 65 \%'s # retain tabs, wrap text at column 65, ugly formatting $0 -adrui% -w65 cape toad HELP } my $spacing = $opts{'d'} ? "\n\n" : "\n" ; my $div_str = $opts{'i'} ? $opts{'i'} : '-' ; my $book = $opts{'t'} ? 'Thesaurus' : 'Dictionary'; my $width = $opts{'w'} ? $opts{'w'} : 80 ; my $ugly = $opts{'u'} ? ' ' : "\n" ; my $divider = join '', substr($div_str x $width,0,$width), $spacing; my $ua = new LWP::UserAgent or die "Cannot create UserAgent\n"; my $url = 'http://www.m-w.com/cgi-bin/dictionary'; my ($html, %entity); while ( my $word = shift ) { $html = get_content([ book => $book, va => $word ]); if ($html =~ /One entry found for .+?$word/i) { $html = join '', $&, $'; print_entry(); } elsif ($html =~ /(\d+ entries found for .+?$word)/i) { $html = $1; html2text(\$html); print "$html$spacing"; $html = join '', $&, $'; my($list) = $html =~ /<input type=hidden name=list value="(.+?)">/; my @entries = split /=|;/, $list; splice @entries, 0, 2; # this entry already displayed print_entry(); while ($opts{'v'} and @entries) { $html = get_content([ book => $book, hdwd => $word, listword => $word, jump => $entries[0], list => $list ]); print_entry(); splice @entries, 0, 2; } } else { print "$word not found in the ", lc $book, " at http://www.m-w.com\n"; } } sub get_content { die "trying to POST an empty request" unless @_; my $request = POST $url, shift; my $response = $ua->request($request); $response->content; } sub print_entry { ($html) = $html =~ m!((?:Main Entry|Entry Word):.*?)</form>!is; { no warnings 'uninitialized'; my $x; # neat trick that turns # text 1 a : definition one b : definition two # into # text # 1a: definition one # 1b: definition two $html =~ s!<b>(\d+)? (?:\s+)? ([a-z])? </b> !join '', "\n", ($1 and $x=$1 or $x), $2 !xeg; } $html =~ s/<br>/$ugly/g unless $opts{'u'}; $html =~ s/\t/ /g unless $opts{'a'}; html2text(\$html); # expecting \n, but server sent chr(13), that's what you get for expectations $html = join $spacing, split /[\n\r\x{0A}\x{0D}]+/, $html; $html =~ s/^\s+|\s+$//g; # substitute only if the present line is greater than width my $wrap = $width + 1; $html =~ s/(?=.{$wrap})(.{1,$width}) +(?=[^\n])/$1$spacing/g if $opts{'r'}; print "$html$spacing"; print $divider if $opts{'i'}; } sub html2text { # typeglob aliases are supposedly faster than refs to scalars our $htm; local *htm = shift; return unless length $htm; my($begin, $end) = ('<!--', '-->'); # The three following substitutions and the %entity initialization below # were brazenly copied from Tom Christiansens's striphtml (striff tummel) # perl script written back in 1996. It can still be found at # http://www.perl.com/CPAN-local/authors/Tom_Christiansen/scripts/. I # slightly changed the first substitution: it now handles embedded comments. # Hopefully, tags within comments are still handled properly. Otherwise, # all else is the same. # 1. remove embedded comments 1 while $htm =~ s/$begin (?:(?!$begin).)*? $end//gxs; # 2. remove tags $htm =~ s/<(?:[^>'"]+ | ".*?" | '.*?')*?>//gxs; load_entity() unless %entity; { no warnings 'uninitialized'; # 3. replace entities with actual characters $htm =~ s/(&(\w+ | \x23\d+);?)/$entity{$2} || $1/gxe; } } sub load_entity { # < %entity = ( lt => '<', # less-than gt => '>', # greater-than amp => '&', # ampersand quot => '"', # verticle double-quote nbsp => chr 160, # no-break space iexcl => chr 161, # inverted exclamation mark cent => chr 162, # cent sign pound => chr 163, # pound sterling sign CURRENCY NOT WEIGHT curren => chr 164, # general currency sign yen => chr 165, # yen sign brvbar => chr 166, # broken vertical bar sect => chr 167, # section sign uml => chr 168, # umlaut (dieresis) copy => chr 169, # copyright sign ordf => chr 170, # ordinal indicator, feminine laquo => chr 171, # angle quotation mark, left not => chr 172, # not sign shy => chr 173, # soft hyphen reg => chr 174, # registered sign macr => chr 175, # macron deg => chr 176, # degree sign plusmn => chr 177, # plus-or-minus sign sup2 => chr 178, # superscript two sup3 => chr 179, # superscript three acute => chr 180, # acute accent micro => chr 181, # micro sign para => chr 182, # pilcrow (paragraph sign) middot => chr 183, # middle dot cedil => chr 184, # cedilla sup1 => chr 185, # superscript one ordm => chr 186, # ordinal indicator, masculine raquo => chr 187, # angle quotation mark, right frac14 => chr 188, # fraction one-quarter frac12 => chr 189, # fraction one-half frac34 => chr 190, # fraction three-quarters iquest => chr 191, # inverted question mark Agrave => chr 192, # capital A, grave accent Aacute => chr 193, # capital A, acute accent Acirc => chr 194, # capital A, circumflex accent Atilde => chr 195, # capital A, tilde Auml => chr 196, # capital A, dieresis or umlaut mark Aring => chr 197, # capital A, ring AElig => chr 198, # capital AE diphthong (ligature) Ccedil => chr 199, # capital C, cedilla Egrave => chr 200, # capital E, grave accent Eacute => chr 201, # capital E, acute accent Ecirc => chr 202, # capital E, circumflex accent Euml => chr 203, # capital E, dieresis or umlaut mark Igrave => chr 204, # capital I, grave accent Iacute => chr 205, # capital I, acute accent Icirc => chr 206, # capital I, circumflex accent Iuml => chr 207, # capital I, dieresis or umlaut mark ETH => chr 208, # capital Eth, Icelandic Ntilde => chr 209, # capital N, tilde Ograve => chr 210, # capital O, grave accent Oacute => chr 211, # capital O, acute accent Ocirc => chr 212, # capital O, circumflex accent Otilde => chr 213, # capital O, tilde Ouml => chr 214, # capital O, dieresis or umlaut mark times => chr 215, # multiply sign Oslash => chr 216, # capital O, slash Ugrave => chr 217, # capital U, grave accent Uacute => chr 218, # capital U, acute accent Ucirc => chr 219, # capital U, circumflex accent Uuml => chr 220, # capital U, dieresis or umlaut mark Yacute => chr 221, # capital Y, acute accent THORN => chr 222, # capital THORN, Icelandic szlig => chr 223, # small sharp s, German (sz ligature) agrave => chr 224, # small a, grave accent aacute => chr 225, # small a, acute accent acirc => chr 226, # small a, circumflex accent atilde => chr 227, # small a, tilde auml => chr 228, # small a, dieresis or umlaut mark aring => chr 229, # small a, ring aelig => chr 230, # small ae diphthong (ligature) ccedil => chr 231, # small c, cedilla egrave => chr 232, # small e, grave accent eacute => chr 233, # small e, acute accent ecirc => chr 234, # small e, circumflex accent euml => chr 235, # small e, dieresis or umlaut mark igrave => chr 236, # small i, grave accent iacute => chr 237, # small i, acute accent icirc => chr 238, # small i, circumflex accent iuml => chr 239, # small i, dieresis or umlaut mark eth => chr 240, # small eth, Icelandic ntilde => chr 241, # small n, tilde ograve => chr 242, # small o, grave accent oacute => chr 243, # small o, acute accent ocirc => chr 244, # small o, circumflex accent otilde => chr 245, # small o, tilde ouml => chr 246, # small o, dieresis or umlaut mark divide => chr 247, # divide sign oslash => chr 248, # small o, slash ugrave => chr 249, # small u, grave accent uacute => chr 250, # small u, acute accent ucirc => chr 251, # small u, circumflex accent uuml => chr 252, # small u, dieresis or umlaut mark yacute => chr 253, # small y, acute accent thorn => chr 254, # small thorn, Icelandic yuml => chr 255, # small y, dieresis or umlaut mark ); # ¡ for(0..255){ $entity{'#' . $_} = chr $_; } }
mw.pl
Description: Perl program
-- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]