On Wed, 2002-02-27 at 14:07, Simon Cozens wrote:
> I know some people have been talking about rewriting the assembler; I've
> had some more thoughts on this over the past couple of days.
>
> First, I think that our assembler is going to be a reference implementation
> for those producing bytecode-emitting compilers. It does not need to be
> fast, but it does need to be clear and easy to understand.
>
No arguments there.
> Some people have been talking about making the assembler more OO; in my
> opinion, this is a mistake. OO programming has the great advantage that you
> can abstract away a lot of the tricky bits, hiding the complexity. It however
> has the disadvantage that you hide away the complexity. Let's make this a
> nice, open, transparent program.
>
> The way I've thought about doing it so far is not unlike the Unix toolset
> model; small components doing a dedicated task. Assembly is essentially a
> filtering process, and Unix filter techniques can be brought to bear on it.
> For instance, I've just written a little component which takes the assembly
> and expands the ops, turning "set" into "set_p_ic" or whatever it may be.
Interesting.
>
> Granted, these components will share some library code, such as that to
> parse out a line of assembly source, but I think that specialized elements
> working on text is the way to go here.
What is wrong with a simple array (of arrays) that gets passed around?
>
> The real advantage of this method, other than making the overall design
> and process of the assembler easy to understand, is that we can slot in
> optimizations as additional filters at any stage of the assembler's operation.
There certainly are advantages...but there are disadvantages too. A
scalar holding a honking big chunk of text is going to be difficult to
identify what source line was originally used.
>
> I'll produce a more specific PDD about how I'd like the assembler to look
> if this idea makes any sense to anyone other than me.
>
Here's my hat, which I will now throw into the ring :)
* Yes, its OO, just so we don't have a freaking ton of variables laying
around which could potientially conflict with someone who wants to embed
this into a compiler or something
* sort of does what Simon wants, in that there are separate phases,
which could have extra ones inserted.
* doesn't actually dump bytecode, but will happily parse queens.pasm and
most of the other things I've thrown at it, providing it doesn't have
macros or label arithmetic.
Two files are here:
* a completely hideous "test-newassembler.pl" which calls the module.
* the 350 line NewAssembler.pm thingy itself.
This is just something I'm tinkering with. Vomit on it if you must :)
Brian
package Parrot::NewAssembler;
use Carp;
use Parrot::Config;
use Parrot::Op;
use Parrot::OpLib::core;
use Parrot::PMC qw(%pmc_types);
use Parrot::PackFile;
use Parrot::Types;
use Symbol;
sub new {
my($class,%args)=@_;
my $self={
# output file information
'packfile'=>new Parrot::PackFile,
'bytecode'=>'',
'listing'=>'',
# assembly
'PC'=>0,
'opcodes'=>{},
'errors'=>[],
# constants
'constants'=>{},
'constant_data'=>[],
# Label/fixup information
'label'=>{},
'last_label'=>'__start__',
'fixup'=>{},
# preprocessor data
'macro'=>{},
'equate'=>{},
'include_path'=>[],
};
# Initialize opcode table.
foreach my $op (@$Parrot::OpLib::core::ops) {
$self->{'opcodes'}{$op->full_name}=$op;
}
return bless $self,$class;
}
sub error {
my($self,$msg,$file,$line)=@_;
push(@{$self->{errors}},"ERROR ($file,$line): $msg");
}
#
# preprocess: preprocess assembler source, and convert into internal
# format.
#
sub preprocess {
my($self,$scalar_code,$file,$line)=@_;
my $code=[];
my $counter=$line || 1;
my @lines=split(/\n/,$scalar_code);
while(scalar(@lines)) {
my $l=shift(@lines); # line to pre-process
my $rl=$l; # raw line
$l=~s/\#.*//;
$l=~s/^\s+//;
$l=~s/\s+$//;
# do equate substitution (if any)
foreach (keys(%{$self->{equate}})) {
$l=~s/\b$_\b/$self->{equate}{$_}/g;
}
# handle include directive
if($l=~m/INCLUDE\s+['"](.+)["']/) {
# an include directive has been found.
my($filename)=$1;
if(! -e $filename) {
my $found=0;
foreach my $path (@{$self->{include_path}}) {
if(-e "$path/$filename") {
$filename="$path/$filename";
$found=1;
last;
}
}
if(!$found) {
$self->error("'$filename' not found.",$file,$line);
return undef;
}
}
open(H,$filename);
my $c=join("",<H>);
close(H);
my $pp=$self->preprocess($c,$filename,1);
push(@$code,[$file,$counter,'',$rl]);
push(@$code,@$pp);
$counter++;
next;
}
# handle equate assignment.
if($l=~m/^([_A-Za-z]\w*)\s+EQU\s+(.+)$/) {
$self->{equate}{$1}=$2;
$l=""; # clear line of code.
}
# handle macro definition
if($l=~m/^([_A-Za-z]\w*)\s+MACRO(?:\s+(.+))?$/) {
my($name,$args)=($1,$2);
if(exists($self->{macro}{$name})) {
# macro already exists!
} else {
}
}
# default to normal code: add it to the array.
push(@$code,[$file,$counter,$l,$rl]);
$counter++;
}
foreach (@$code) {
my($file,$line,$c,$raw)=@$_;
print "[$file:$line] $c == $raw\n";
}
return $code;
}
#
# assemble: assemble 'pure' source code (no comments, macros, etc)
#
sub assemble {
my($self,$code,$file,$line)=@_;
if(ref($code) ne "ARRAY") {
# inline parrot, convert to internal format.
$code=$self->preprocess($code,$file,$line);
}
foreach $data (@$code) {
my($file,$line,$code,$rawcode)=@$data;
if($code=~m/^(\S+):/) {
# the line has a label.
my($label)=lc($1);
if($label=~m/^\$([_A-Z0-9]+)/i) {
# local label, canonize it.
$label=$1;
$label=$self->{'last_label'}.".".$label;
} else {
# this is a global label, reset the 'last_label'.
$self->{'last_label'}=$label;
}
if(exists($self->{label}{$label})) {
# we've seen this label already.
## FIXME
}
$self->{label}{$label}=$self->{PC};
# we can now do any needed fixups for this label.
## FIXME
$code=~s/^(\S+:)\s*//; # remove the label.
}
# label arithmetic
## FIXME
# constantize string args
$code =~ s{([NU])?"(((\\")|[^"])*)"}{constantize_string($self,$2, $1)}egx;
$code=~s/,/ /g;
my($op,@args)=split(/\s+/,$code);
next if($op eq ""); # this line only contained a label.
# check if op is really a macro.
my $arg_pc=$self->{PC}+1;
foreach my $arg (@args) {
$arg_pc+=1;
next if($arg=~m/^\[/); # already been fixed up.
# check for registers
if($arg=~m/^([INPS])(\d+)$/i) {
my($type,$num)=(lc $1,$2);
if($num < 32) {
$arg="[$type:$num]";
next;
}
}
# check for labels
if($arg=~m/^([_A-Z][_A-Z0-9]*(\.[_A-Z0-9]*)?)/i) {
# a global label, with optional local
my($glabel,$llabel)=($1,$2);
$glabel=lc($glabel);
if(exists($self->{'label'}{$glabel})) {
$arg="[ic:$self->{'label'}{$glabel}]";
} else {
push(@{$self->{'fixup'}{$glabel}},$arg_pc);
$arg="[ic:-123456789]";
}
next;
} elsif($arg=~m/^\$([_A-Z0-9]+)/i) {
# a local label...canonize it.
my $label=$1;
$label=lc($self->{'last_label'}.".".$label);
if(exists($self->{'label'}{$label})) {
$arg="[ic:$self->{'label'}{$label}]";
} else {
push(@{$self->{'fixup'}{$label}},$arg_pc);
$arg="[ic:-123456789]";
}
next;
}
# check for integers
my $intval=constantize_integer($self,$arg);
if(defined($intval)) {
$arg="[ic:$intval]";
next;
}
# default to numeric
my $numval=constantize_number($self,$arg);
$arg="[nc:$numval]";
}
$self->{PC}=$arg_pc;
# at this point, all arguments should be fixed up. Grab signature
my(@sig);
foreach (@args) {
m/\[([^:]+):/;
push(@sig,$1);
}
my $signature=$op.(scalar(@sig)?"_".join("_",@sig):"");
if(exists $self->{'opcodes'}{$signature}) {
$signature="[ok: $signature]";
} else {
$signature="[not found $signature]";
}
print "($file:$line:$self->{PC}): ",join("|",$signature,@args)," $rawcode\n";
}
}
sub constantize_string {
my $self = shift;
my $s = shift;
my $p = shift || "";
my %encodings=('' => 0, 'N' => 0, 'U' => 3);
my %escape = (
'a' => "\a",
'n' => "\n",
'r' => "\r",
't' => "\t",
'\\' => '\\'
);
my $e = $encodings{$p};
confess if !defined $s || !defined $e;
$s=~s/\\(0\d*)/chr(oct($1))/eg;
$s=~s/\\x([0-9a-fA-F]{1,2})/chr(hex($1))/ge;
$s=~s/\\([anrt\\])/$escape{$1}/ge;
if(!exists($self->{constants}{s}{$s}{$e})) {
push(@{$self->{constant_data}},['s',$s,$e]);
$self->{constants}{s}{$s}{$e}=$#{$self->{constant_data}};
}
return "[sc:$self->{constants}{s}{$s}{$e}]";
}
sub constantize_integer {
my $self = shift;
my $i = shift;
if ($i =~ /^[+-]?0b[01]+$/i) {
$i = from_binary( $i );
}
elsif ($i =~ /^[+-]?0x?[0-9a-f]*$/i) {
$i = oct($i);
}
elsif ($i =~ m/^[+-]?\d+$/) {
# Good ones
} else {
$i=undef;
}
# XXX parrot cannot currently handle integers over 2 ** 31
if( $i > (2 ** 31) || $i < -(2**31) ) {
error( "Cannot have integer $i because it is greater than 2 ** 31.\n", $file,
$line );
}
return $i;
}
sub constantize_number {
my $self = shift;
my $n = shift;
if(!exists($self->{'constants'}{n}{$n})) {
push(@{$self->{constant_data}},['n',$n]);
$self->{constants}{n}{$n}=$#{$self->{constant_data}};
}
return $self->{'constants'}{n}{$n};
}
1;
__END__
#!/usr/bin/perl
use Parrot::NewAssembler;
$asm=new Parrot::NewAssembler;
if($ARGV[0] ne "") {
open(H,$ARGV[0]);
$filedata=join("",<H>);
close(H);
$code=$asm->preprocess($filedata,$ARGV[0]);
$asm->assemble($code);
} else {
my $p=$asm->preprocess('
queen_at EQU queen_fat
$000: set S0, "hello world\n"
set S1, "bye world\n"
test: set S2, "hello world\n"
set S3, S0
set N0, 1.3323
$000: set N1, N0
set N2, -3.23
set I0, 1
$001: set I1, I0
set I2, -1
branch $000
branch __START__.000
branch test
branch foo
foo: end
end
INCLUDE "../examples/assembly/queens.pasm"
');
$asm->assemble($p);
};