Dan Muey wrote:

> 
> print "body text: @body\n"; # this needs to keep the tags were they are**
>

that's fairly easy to do:

#!/usr/bin/perl -w
use strict;
use warnings;

# Extracts the title text, the body content (keeping the inline tags where
# they are) and the attributes of the <body> tag from an HTML document.
use HTML::Parser;

# Sample document. The heredoc is single-quoted so nothing in the markup is
# ever interpolated.
my $text = <<'HTML';
<html><head>
<title> HI Title </title>
heaD STUFF
 </head>
 <body bodytag=attributes>
<i> keep the I tag </i>
 hI HERE'S CONTENT i WANT
<img src=""> IMaGE
 <!-- i WANT TO STRIP COMMENTS OUT -->
 <SCRIPT>

 i DON'T WANT THIS SCRIPT EITHER
 </SCRIPT>
 <font>Hello world</font>

 </BODY>
 </HTMl>
HTML

# Parser state shared with the handlers (text, open_tag, close_tag):
#   $body / $title  - true while inside <body> / <title>
#   @body / @title  - collected fragments, in document order
#   %body_attr      - attributes found on the <body> tag, values quoted
my $body = 0;
my $title = 0;
my @body;
my @title;
my %body_attr;

my $html = HTML::Parser->new(api_version => 3,
                                text_h => [\&text,'dtext'],
                                start_h => [\&open_tag, 'tagname,attr'],
                                end_h   => [\&close_tag, 'tagname']);

# Drops <script> elements together with their content.
# NOTE(review): 'comment' here would only match a literal <comment> element;
# <!-- ... --> comments appear to be dropped simply because no comment
# handler is registered -- confirm against the HTML::Parser docs.
$html->ignore_elements(qw(script comment));
$html->parse($text);
$html->eof;

print "title is:\n@title\n\n";
print "body text:\n@body\n\n";
print "body attr:\n";
# Sorted so the listing is stable from run to run.
for my $k (sort keys %body_attr){
        print "$k=$body_attr{$k}\n";
}

# Text handler: receives one chunk of text from the parser and files it
# under the title or the body, depending on which section is currently open.
# Chunks without a single word character are discarded, as is any text seen
# outside both sections.
sub text{

        my ($chunk) = @_;

        return if($chunk !~ /\w/);

        # The title takes precedence over the body when both flags are set.
        my $dest = $title ? \@title
                 : $body  ? \@body
                 :          undef;

        push(@{$dest},$chunk) if($dest);
}

# Start-tag handler.
#   $tagname - name of the tag being opened
#   $attr    - hash reference of the tag's attributes (name => value)
#
# <title> switches title collection on. <body> switches body collection on
# and records its attributes in %body_attr, each value wrapped in single
# quotes. Any other tag opened while inside the body is rebuilt as text and
# appended to @body so it stays in place relative to the surrounding content.
sub open_tag{

        my($tagname,$attr) = @_;

        $title = 1 if($tagname eq 'title');

        if($tagname eq 'body'){
                $body = 1;
                for my $key (keys %{$attr}){
                        $body_attr{$key} = "'$attr->{$key}'";
                }
        }elsif($body){
                # Sort the attribute names: hash order is not stable, so
                # without this the rebuilt tag could differ between runs.
                # NOTE: values are interpolated verbatim; a value that itself
                # contains a single quote will yield malformed markup.
                my $t = join(' ', map { "$_='$attr->{$_}'" } sort keys %{$attr});
                push(@body, length($t) ? "<$tagname $t>" : "<$tagname>");
        }
}

# End-tag handler: takes the name of the tag being closed.
# Leaving <title> or <body> switches the matching collection flag off; any
# closing tag seen while the body is still open is appended to @body as text.
sub close_tag{

        my ($tagname) = @_;

        if($tagname eq 'title'){
                $title = 0;
        }
        if($tagname eq 'body'){
                $body = 0;
        }

        # The flags are cleared first, so </body> itself is never recorded.
        if($body){
                push(@body,"</$tagname>");
        }
}

__END__

prints:

title is:
 HI Title

body text:
<i>  keep the I tag  </i>
 hI HERE'S CONTENT i WANT
 <img src=''>  IMaGE
  <font> Hello world </font>

body attr:
bodytag='attributes'

david

-- 
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to