Here is what I ended up with - this is a chunk from a much bigger script. Suggestions gladly accepted. Haven't fixed the special characters yet.
# Scrape the page given by $html: collect the header date into $date and
# every article into @articles as { title => ..., body => ... } hashes.
#
# The page marks its sections with HTML comments:
#   <!-- begin header date -->     ... <!-- end header date -->
#   <!-- begin article<N> title    ... <!-- end article
#   <!-- begin article<N> body     ... <!-- end article
#
# $html and localError() come from the surrounding script.
my ( $date, $p, @articles ) = ();
my ( $title, $body ) = ();

# Append to ${$target} the original source text of every token up to (but not
# including) the first comment matching $end_re, or up to end of input if the
# end marker is missing.  $section only labels error reports.
#
# Token layouts used below (HTML::TokeParser get_token):
#   T => [ 'T', $text, ... ]       S => [ 'S', $tag, $attr, $attrseq, $text ]
#   E => [ 'E', $tag, $text ]      C => [ 'C', $text ]
my $collect_section = sub {
    my ( $target, $end_re, $section ) = @_;

    while ( my $token = $p->get_token() ) {
        my $type = $token->[0];

        if ( $type eq 'T' ) {
            ${$target} .= $token->[1];
        }
        elsif ( $type eq 'S' ) {
            ${$target} .= $token->[4];
        }
        elsif ( $type eq 'E' ) {
            ${$target} .= $token->[2];
        }
        elsif ( $type eq 'C' ) {
            # Only the end marker is significant; any other comment inside a
            # section is skipped (same rule for date, title and body).
            last if $token->[1] =~ $end_re;
        }
        else {
            localError( "$type : unrecognized HTML Token Type in $section : <PRE>"
                  . Dumper($token)
                  . "</PRE>" );
        }
    }

    return;
};

if ( !defined( $p = HTML::TokeParser->new($html) ) ) {
    localError("Unable to parse $html : $!");
}
else {
    # Only walk the document when a parser was actually created; otherwise a
    # non-fatal localError() would be followed by a method call on undef.
    while ( my $token = $p->get_token() ) {
        if ( $token->[0] eq 'C' ) {
            my $comment = $token->[1];

            # The end patterns must use "<!--" exactly like the begin
            # patterns; a "<!- -" spelling never matches a real comment.
            if ( $comment =~ m{<!-- begin header date -->} ) {
                $collect_section->( \$date, qr{<!-- end header date -->}, 'Date' );
            }
            elsif ( $comment =~ m{<!-- begin article\d* title} ) {
                $collect_section->( \$title, qr{<!-- end article}, 'Title' );
            }
            elsif ( $comment =~ m{<!-- begin article\d* body} ) {
                $collect_section->( \$body, qr{<!-- end article}, 'Body' );
            }
        }

        # An article is complete once both a title and a body were collected.
        if (   defined($title)
            && defined($body)
            && $title ne ""
            && $body ne "" )
        {
            $title =~ s{\n}{}g;    # titles are stored on a single line
            push( @articles, { 'title' => $title, 'body' => $body } );
            ( $body, $title ) = ();
        }
    }
}
# -- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]