|
# -*- mode: perl; perl-indent-level: 2 -*-
|
|
#
|
|
# Compile and evaluate regular expressions in Perl without
|
|
# using built-in regular expressions.
|
|
#
|
|
# Author: Mark-Jason Dominus (mjd-tpj-regex@plover.com)
|
|
# This program is in the PUBLIC DOMAIN.
|
|
#
|
|
|
|
package Regex;
|
|
|
|
# Regexps are handled in three phases. First, they're parsed from a s\
|
|
tring
|
|
# form into an internal parse tree form, thus:
|
|
#
|
|
# ABC => [ CONCAT => [ A, B, C ] ]
|
|
# A* => [ STAR => A ]
|
|
# A+ => [ PLUS => A ]
|
|
# A|B|C => [ ALTERN => [ A, B, C ] ]
|
|
# literal character x => [ LITERAL => x ]
|
|
#
|
|
# `parse' does this.
|
|
sub parse {
|
|
@t = split(//, $_[1]);
|
|
parse_altern(@t);
|
|
}
|
|
|
|
sub parse_altern {
|
|
my @alterns;
|
|
my @terms;
|
|
my $c;
|
|
while (defined($c = shift @_)) {
|
|
next if $c eq '';
|
|
push @seen, $c;
|
|
if ($c eq '(') {
|
|
my $next_term = &parse_altern;
|
|
push @terms, $next_term;
|
|
} elsif ($c eq ')') {
|
|
push @alterns, &joinup(CONCAT, @terms) if @terms;
|
|
return &joinup(ALTERN, @alterns);
|
|
} elsif ($c eq '|') {
|
|
push @alterns, &joinup(CONCAT, @terms) if @terms;
|
|
@terms = ();
|
|
} elsif ($c eq '*' || $c eq '+') {
|
|
if (@terms) {
|
|
$terms[-1] = [ ($c eq '*' ? STAR : PLUS) => $terms[-1] ];
|
|
} else {
|
|
$PARSE_ERROR = "Did not expect $c!\n\t@seen\n\t*\n\t@_\n";
|
|
return undef;
|
|
}
|
|
} elsif ($c eq '\\') {
|
|
push @terms, [ LITERAL => (shift) ];
|
|
} else {
|
|
push @terms, [ LITERAL => $c ];
|
|
}
|
|
} # While there are tokens...
|
|
push @alterns, &joinup(CONCAT, @terms) if @terms;
|
|
return joinup(ALTERN, @alterns) if @alterns;
|
|
return undef;
|
|
}
|
|
|
|
sub joinup {
|
|
my $tag = shift;
|
|
if (@_ == 1) {
|
|
$_[0];
|
|
} else {
|
|
[ $tag => [ @_ ] ];
|
|
}
|
|
}
|
|
package NFA;
|
|
################################################################
|
|
#
|
|
# Compile parsed regexp into representation of NFA
|
|
#
|
|
################################################################
|
|
$S = 'aa00';
|
|
$STARTSYMBOL = 0;
|
|
$ENDSYMBOL = 1;
|
|
sub new {
|
|
compile(@_);
|
|
}
|
|
sub compile {
|
|
my $pack = shift;
|
|
my $rx = shift;
|
|
my ($operator, $operands) = @$rx;
|
|
|
|
# A literal has no suboperands to compile.
|
|
# So invoke the special atom-compiler and return that result instead\
|
|
.
|
|
if ($operator eq LITERAL) {
|
|
return $pack->literal($operands);
|
|
}
|
|
|
|
my $startsym = "S" . &gensym();
|
|
my $endsym = "E" . &gensym();
|
|
my $result = { Symbols => [ $startsym, $endsym ] };
|
|
|
|
# Compile the sub-operands first.
|
|
my @submachines;
|
|
if ($operator eq STAR || $operator eq PLUS) {
|
|
@submachines = ($pack->compile($operands));
|
|
} else {
|
|
foreach $operand (@$operands) {
|
|
push @submachines, $pack->compile($operand);
|
|
}
|
|
}
|
|
|
|
if ($operator eq CONCAT) {
|
|
return $submachines[0] if @submachines == 1;
|
|
&putin($result, @submachines);
|
|
my $i;
|
|
for ($i = 0; $i < @submachines - 1; $i++) {
|
|
my $tail = $submachines[$i] {Symbols}[$ENDSYMBOL];
|
|
my $head = $submachines[$i+1]{Symbols}[$STARTSYMBOL];
|
|
$result->{$tail} = { '' => $head };
|
|
}
|
|
$result->{$startsym} = { '' => $submachines[0] {Symbols}[$ST\
|
|
ARTSYMBOL] };
|
|
$result->{$submachines[-1]{Symbols}[$ENDSYMBOL]} = { '' => $\
|
|
endsym };
|
|
} elsif ($operator eq STAR) {
|
|
my $sm = $submachines[0];
|
|
&putin($result, $sm);
|
|
my ($s, $e) = @{$sm->{Symbols}};
|
|
$result->{$e} = { '' => [$s, $endsym] };
|
|
$result->{$startsym} = { '' => [$s, $endsym] };
|
|
} elsif ($operator eq PLUS) {
|
|
my $sm = $submachines[0];
|
|
&putin($result, $sm);
|
|
my ($s, $e) = @{$sm->{Symbols}};
|
|
$result->{$e} = { '' => [$s, $endsym] };
|
|
$result->{$startsym} = { '' => $s };
|
|
} elsif ($operator eq ALTERN) {
|
|
return $submachines[0] if @submachines == 1;
|
|
&putin($result, @submachines);
|
|
my @startsyms = map { $_->{Symbols}[$STARTSYMBOL] } @submachine\
|
|
s;
|
|
my @endsyms = map { $_->{Symbols}[$ENDSYMBOL] } @submachines;
|
|
$result->{$startsym} = { '' => \@startsyms };
|
|
foreach $es (@endsyms) {
|
|
$result->{$es} = { '' => $endsym };
|
|
}
|
|
} else {
|
|
warn "Bizarre oprerator `$operator' encountered.\n";
|
|
}
|
|
bless $result => $pack;
|
|
}
|
|
|
|
sub start_state {
|
|
$_[0]{Symbols}[$STARTSYMBOL];
|
|
}
|
|
|
|
sub is_end_state {
|
|
my $self = shift;
|
|
my $state = shift;
|
|
$state eq $self->{Symbols}[$ENDSYMBOL];
|
|
}
|
|
|
|
sub transition_table {
|
|
my $self = shift;
|
|
my $state = shift;
|
|
|
|
$self->{$state} || {};
|
|
}
|
|
sub literal {
|
|
my $pack = shift;
|
|
my $what = shift;
|
|
my $startsym = "S" . &gensym();
|
|
my $endsym = "E" . &gensym();
|
|
bless
|
|
{ Symbols => [ $startsym, $endsym ],
|
|
$startsym => { $what => $endsym } },
|
|
=> $pack;
|
|
}
|
|
# Given a list of machines, M1 ... Mn, put M2... Mn into M1.
|
|
sub putin {
|
|
my $master = shift;
|
|
foreach $m (@_) {
|
|
foreach $state (keys %$m) {
|
|
next if $state eq 'Symbols';
|
|
if (exists $master->{$state}) {
|
|
print STDERR "Warning: State name conflict for `$state'.\n";
|
|
}
|
|
$master->{$state} = $m->{$state};
|
|
}
|
|
}
|
|
$master;
|
|
}
|
|
|
|
sub gensym {
|
|
$S++;
|
|
}
|
|
|
|
################################################################
|
|
#
|
|
# Execute NFA on a given string
|
|
#
|
|
################################################################
|
|
|
|
package NFA_Exec;
|
|
|
|
sub match {
|
|
my $pack = shift;
|
|
my $nfa = shift;
|
|
my $string = shift;
|
|
my $machine = $pack->init($nfa, $string);
|
|
$machine->run();
|
|
$machine->final_state();
|
|
}
|
|
|
|
sub new {
|
|
&init(@_);
|
|
}
|
|
|
|
#
|
|
# Create a new execution of the specified NFS, and feed it
|
|
# the specified string as its input.
|
|
#
|
|
sub init {
|
|
my $pack = shift;
|
|
my $nfa = shift;
|
|
my $string = shift;
|
|
my $self = {};
|
|
|
|
$self->{nfa} = $nfa;
|
|
$self->{input} = $string;
|
|
$self->{pos} = 0;
|
|
$self->{states} = [ $self->{nfa}->start_state ];
|
|
bless $self => $pack;
|
|
$self->epsilon_transit();
|
|
$self;
|
|
}
|
|
|
|
#
|
|
# Run an execution to the end of the input
|
|
#
|
|
sub run {
|
|
my $self = shift;
|
|
until ($self->end_of_input() || $self->states() == 0) {
|
|
$self->step;
|
|
}
|
|
}
|
|
|
|
#
|
|
# Is this execution object at the end of its input?
|
|
#
|
|
sub end_of_input {
|
|
my $self = shift;
|
|
$self->{pos} >= length($self->{input});
|
|
}
|
|
|
|
#
|
|
# Advance an execution by one step.
|
|
#
|
|
sub step {
|
|
my $self = shift;
|
|
my $next_symbol = substr($self->{input}, $self->{pos}, 1);
|
|
if ($next_symbol eq '') {
|
|
# error
|
|
} else {
|
|
$self->transit($next_symbol);
|
|
$self->epsilon_transit();
|
|
}
|
|
$self->{pos}++;
|
|
}
|
|
#
|
|
# Perform e-transitions in an execution
|
|
#
|
|
sub epsilon_transit {
|
|
my $self = shift;
|
|
my @newstates = $self->states;
|
|
my @result = @newstates;
|
|
my %seen = map {($_ => 1)} @newstates;
|
|
|
|
for (;;) {
|
|
my $s;
|
|
my @nextstates;
|
|
foreach $s (@newstates) {
|
|
my $nextstate = $self->{nfa}->transition_table($s)->{''\
|
|
};
|
|
next unless defined $nextstate;
|
|
push @nextstates, ref $nextstate ? @$nextstate : $nextstate;
|
|
}
|
|
@newstates = grep {! $seen{$_}++} @nextstates;
|
|
last unless @newstates;
|
|
push @result, @newstates;
|
|
}
|
|
$self->{states} = \@result;
|
|
}
|
|
|
|
#
|
|
# Perform a transition
|
|
#
|
|
sub transit {
|
|
my $self = shift;
|
|
my $symbol = shift;
|
|
$self->{states} = $self->transition_table->{$symbol};
|
|
}
|
|
|
|
#
|
|
# Current states
|
|
#
|
|
sub states {
|
|
my $self = shift;
|
|
@{$self->{states}};
|
|
}
|
|
|
|
#
|
|
# Should we accept?
|
|
#
|
|
sub final_state {
|
|
my $self = shift;
|
|
my $s;
|
|
foreach $s ($self->states) {
|
|
return 1 if $self->{nfa}->is_end_state($s);
|
|
}
|
|
0;
|
|
}
|
|
#
|
|
# Get current transition table
|
|
# This is interesting because we have to merge the transition
|
|
# tables for several states.
|
|
sub transition_table {
|
|
my $self = shift;
|
|
my $s;
|
|
my %ttab;
|
|
foreach $s ($self->states) {
|
|
my $sub_ttab = $self->{nfa}->transition_table($s);
|
|
my ($symbol, $next_state);
|
|
while (($symbol, $next_state) = each %$sub_ttab) {
|
|
push @{$ttab{$symbol}}, ref $next_state ? @$next_state : $next_s\
|
|
tate;
|
|
}
|
|
}
|
|
\%ttab;
|
|
}
|
|
1;
|