Here is what I ended up with. This is a chunk from a much bigger
script, so suggestions are gladly accepted. I haven't fixed the
handling of special characters yet.

# State filled in by the token loop further down: the accumulated header
# date text and the list of article records.
my $date;
my @articles;

# Build the token parser for $html; report through localError() when the
# constructor does not return an object.
my $p = HTML::TokeParser->new( $html );
localError( "Unable to parse $html : $!" ) unless defined $p;

my ( $title, $body ) = ();

# Position of the raw source text inside each HTML::TokeParser token type
# whose text is copied into the captured output:
#   [ "T", $text, ... ]
#   [ "S", $tag, $attr, $attrseq, $text ]
#   [ "E", $tag, $text ]
my %raw_text_index = ( 'T' => 1, 'S' => 4, 'E' => 2 );

# Reads tokens from $parser and appends the raw text of every text,
# start-tag and end-tag token to the scalar referenced by $target.
# Stops at the first comment token matching $end_marker, or when the
# token stream is exhausted.
#
#   $parser          - object providing get_token()
#   $target          - reference to the scalar that receives the text
#   $end_marker      - compiled regex identifying the closing comment
#   $section         - label used in the error message ("Date", "Title", ...)
#   $strict_comments - true:  comments other than the end marker are
#                             reported through localError();
#                      false: such comments are silently skipped
#
# Any other token type is reported through localError().
my $collect_section = sub
{
    my ( $parser, $target, $end_marker, $section, $strict_comments ) = @_;

    while ( my $inner = $parser->get_token())
    {
        my $type = $inner->[0];

        if ( exists $raw_text_index{$type} )
        {
            ${$target} .= $inner->[ $raw_text_index{$type} ];
        }
        elsif ( $type eq 'C' && $inner->[1] =~ $end_marker )
        {
            last;
        }
        elsif ( $type ne 'C' || $strict_comments )
        {
            localError( "$type : unrecognized HTML Token Type in $section : <PRE>" . Dumper( $inner ) . "</PRE>" );
        }
    }

    return;
};

# Walk the document looking for the marker comments that bracket the
# header date and each article's title and body.
while ( my $token = $p->get_token())
{
    if ( $token->[0] eq 'C' )
    {
        my $comment = $token->[1];

        if ( $comment =~ m#<!-- begin header date --># )
        {
            # The date section has always reported stray comments as
            # errors, unlike title/body; that behaviour is kept here.
            $collect_section->( $p, \$date, qr{<!-- end header date -->}, 'Date', 1 );
        }
        elsif ( $comment =~ m#<!-- begin article\d* title# )
        {
            $collect_section->( $p, \$title, qr{<!-- end article}, 'Title', 0 );
        }
        elsif ( $comment =~ m#<!-- begin article\d* body# )
        {
            $collect_section->( $p, \$body, qr{<!-- end article}, 'Body', 0 );
        }
    }

    # Once both halves of an article have been captured, store it and
    # reset the accumulators for the next one.
    if ( defined( $title ) && defined( $body ) && $title ne "" && $body ne "" )
    {
        $title =~ s#\n##g;
        push( @articles, { 'title' => $title, 'body' => $body } );
        ( $body, $title ) = ();
    }
}








-- 
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to