#
Foo
#
This is a paragraph...
#
This is another paragraph...
#
int main () {return 0;}
#
int main () {return 0} // Won't compile
#
#
Bar
# ...
#
#
# It is generally good about escaping things that would interfere with HTML,
# but within text paragraphs it lets you write literal HTML. The heuristic is
# that known tags that are reasonably well-formed are allowed, but unknown ones
# are escaped.
my ($attribute) = @_;
my @paragraphs = split /\n(?:\s*\n)+/, retrieve($attribute);
my $known_tags = join '|', qw[html head body meta script style link title div a span input button textarea option select form label iframe blockquote code caption
table tbody tr td th thead tfoot img h1 h2 h3 h4 h5 h6 li ol ul noscript p pre samp sub sup var canvas audio video];
my $section_level = 0;
my @markup;
my $indent = sub {' ' x ($_[0] || $section_level)};
my $unindent = sub {my $spaces = ' ' x ($section_level - 1); s/^$spaces//gm};
my $escape_all = sub {s/&/&/g; s/</g; s/>/>/g};
my $escape_some = sub {s/&/&/g; s/<(?!\/|($known_tags)[^>]*>.*<\/\1>)/</g};
my $code = sub {&$escape_all(); &$unindent(); push @markup, &$indent() . "", &$indent($_[0]) . "$2"};
my $close_section = sub {push @markup, &$indent($_[0]) . "
"};
my $title = sub {
my $indentation = (length($1) >> 1) + 1;
&$close_section($section_level) while $section_level-- >= $indentation;
&$section($indentation);
$section_level = $indentation;
};
for (@paragraphs) {
&$code(), next unless /^\h*[A-Z|]/;
&$quoted(), next if /^\h*\|/;
&$title(), s/^.*\n// if /^(\s*)(\S.*)\.\n([^\n]+)/ and length("$1$2") - 10 < length($3);
&$paragraph();
}
&$close_section($section_level) while $section_level--;
join "\n", @markup;
__
meta::function('sdocp', <<'__');
# Renders an attribute as SDocP. This logic was taken directly from the sdoc script.
my $attribute = retrieve($_[0]);
sub escape {my @results = map {s/\\/\\\\/go; s/\n/\\n/go; s/'/\\'/go; $_} @_; wantarray ? @results : $results[0]}
"sdocp('" . escape($_[0]) . "', '" . escape($attribute) . "');";
__
meta::function('serialize', <<'__');
my ($options, @criteria) = separate_options(@_);
my $partial = $$options{'-p'};
my $criteria = join '|', @criteria;
my @attributes = map serialize_single($_), select_keys(%$options, '-m' => 1, '--criteria' => $criteria), select_keys(%$options, '-M' => 1, '--criteria' => $criteria);
my @final_array = @{$partial ? \@attributes : [retrieve('bootstrap::initialization'), @attributes, 'internal::main();', '', '__END__']};
join "\n", @final_array;
__
meta::function('serialize_single', <<'__');
# Serializes a single attribute and optimizes for content.
my $name = $_[0] || $_;
my $contents = $data{$name};
my $meta_function = 'meta::' . namespace($name);
my $invocation = attribute($name);
my $escaped = $contents;
$escaped =~ s/\\/\\\\/go;
$escaped =~ s/'/\\'/go;
return "$meta_function('$invocation', '$escaped');" unless $escaped =~ /\v/;
my $delimiter = '__' . fast_hash($contents);
my $chars = 2;
++$chars until $chars >= length($delimiter) || index("\n$contents", "\n" . substr($delimiter, 0, $chars)) == -1;
$delimiter = substr($delimiter, 0, $chars);
"$meta_function('$invocation', <<'$delimiter');\n$contents\n$delimiter";
__
meta::function('sh', 'system(@_);');
meta::function('shell', <<'__');
terminal::cc(retrieve('data::current-continuation')) if length $data{'data::current-continuation'};
shell::repl();
__
meta::function('size', <<'__');
my $size = 0;
$size += length $data{$_} for keys %data;
sprintf "% 7d % 7d", length(serialize()), $size;
__
meta::function('snapshot', <<'__');
my ($name) = @_;
file::write(my $finalname = temporary_name($name), serialize(), noclobber => 1);
chmod 0700, $finalname;
hook('snapshot', $finalname);
__
meta::function('state', <<'__');
my @keys = sort keys %data;
my $hash = fast_hash(fast_hash(scalar @keys) . join '|', @keys);
$hash = fast_hash("$data{$_}|$hash") for @keys;
$hash;
__
meta::function('t', <<'__');
my @tests = select_keys('--criteria' => "sdoc::js::test/(.*\/)?$_[0]");
edit($tests[0]);
__
meta::function('test', <<'__');
# Unit tests for caterwaul.
# In the past this was done differently. It would run node once for each test, which involved a lot of startup overhead and took a while. Now the tests are all run in a single file, so node
# gets run just once. This should be significantly faster.
my ($options, @args) = separate_options(@_);
my @tests = grep s/^sdoc::js::test\//pp::js::test\//, sort keys %data;
render();
unless ($$options{'-p'}) {
terminal::info('running ' . join(' ', @tests));
node([qw(caterwaul.js extensions/std.js extensions/dev.js), @tests]) unless $$options{'-N'};
gjs([qw(caterwaul.js extensions/std.js extensions/dev.js), @tests]) if $$options{'-a'};
}
terminal::info('with precompiled caterwaul');
node([qw(caterwaul.js build/extensions/std.pre.js build/extensions/dev.pre.js), @tests]) unless $$options{'-N'};
gjs([qw(caterwaul.js build/extensions/std.pre.js build/extensions/dev.pre.js), @tests]) if $$options{'-a'};
__
meta::function('testq-pre', <<'__');
my ($options, @stuff) = separate_options(@_);
my @tests = grep $_ !~ /^js::test\/lib/, grep s/^sdoc:://, select_keys('--criteria' => "^sdoc::js::.*\\.?test");
terminal::warning('running tests without rendering');
node([qw(caterwaul.all.pre.js test/lib/unit.js), @tests]) unless $$options{'-N'};
gjs([qw(caterwaul.all.pre.js test/lib/unit.js), @tests]) if $$options{'-a'};
__
meta::function('touch', 'associate($_, \'\') for @_;');
meta::function('unlock', 'hook(\'unlock\', chmod_self(sub {$_[0] | 0200}));');
meta::function('update', '&{\'update-from\'}(@_, grep s/^parent:://o, sort keys %data);');
meta::function('update-from', <<'__');
# Upgrade all attributes that aren't customized. Customization is defined when the data type is created,
# and we determine it here by checking for $transient{inherit}{$type}.
# Note that this assumes you trust the remote script. If you don't, then you shouldn't update from it.
around_hook('update-from-invocation', separate_options(@_), sub {
my ($options, @targets) = @_;
my %parent_id_cache = cache('parent-identification');
my %already_seen;
@targets or return;
my @known_targets = grep s/^parent:://, parent_ordering(map "parent::$_", grep exists $data{"parent::$_"}, @targets);
my @unknown_targets = grep ! exists $data{"parent::$_"}, @targets;
@targets = (@known_targets, @unknown_targets);
my $save_state = ! ($$options{'-n'} || $$options{'--no-save'});
my $no_parents = $$options{'-P'} || $$options{'--no-parent'} || $$options{'--no-parents'};
my $force = $$options{'-f'} || $$options{'--force'};
my $clobber_divergent = $$options{'-D'} || $$options{'--clobber-divergent'};
&{'save-state'}('before-update') if $save_state;
for my $target (@targets) {
dangerous("updating from $target", sub {
around_hook('update-from', $target, sub {
my $identity = $parent_id_cache{$target} ||= join '', qx($target identity);
next if $already_seen{$identity};
$already_seen{$identity} = 1;
my $attributes = join '', qx($target ls -ahiu);
my %divergent;
die "skipping unreachable $target" unless $attributes;
for my $to_rm (split /\n/, retrieve("parent::$target")) {
my ($name, $hash) = split(/\s+/, $to_rm);
next unless exists $data{$name};
my $local_hash = fast_hash(retrieve($name));
if ($clobber_divergent or $hash eq $local_hash or ! defined $hash) {rm($name)}
else {terminal::info("preserving local version of divergent attribute $name (use update -D to clobber it)");
$divergent{$name} = retrieve($name)}}
associate("parent::$target", $attributes) unless $no_parents;
dangerous('', sub {eval qx($target serialize -ipmu)});
dangerous('', sub {eval qx($target serialize -ipMu)});
map associate($_, $divergent{$_}), keys %divergent unless $clobber_divergent;
reload()})})}
cache('parent-identification', %parent_id_cache);
if (verify()) {hook('update-from-succeeded', $options, @targets);
terminal::info("Successfully updated. Run 'load-state before-update' to undo this change.") if $save_state}
elsif ($force) {hook('update-from-failed', $options, @targets);
terminal::warning('Failed to verify: at this point your object will not save properly, though backup copies will be created.',
'Run "load-state before-update" to undo the update and return to a working state.') if $save_state}
else {hook('update-from-failed', $options, @targets);
terminal::error('Verification failed after the upgrade was complete.');
terminal::info("$0 has been reverted to its pre-upgrade state.", "If you want to upgrade and keep the failure state, then run 'update-from $target --force'.") if $save_state;
return &{'load-state'}('before-update') if $save_state}});
__
meta::function('usage', '"Usage: $0 action [arguments]\\nUnique actions (run \'$0 ls\' to see all actions):" . ls(\'-u\');');
meta::function('verify', <<'__');
file::write(my $other = $transient{temporary_filename} = temporary_name(), my $serialized_data = serialize());
chomp(my $observed = join '', qx|perl '$other' state|);
unlink $other if my $result = $observed eq (my $state = state());
terminal::error("Verification failed; expected $state but got $observed from $other") unless $result;
hook('after-verify', $result, observed => $observed, expected => $state);
$result;
__
meta::hook('after-render::base-files', <<'__');
file::write(attribute($_) . '.js', retrieve("pp::$_"), mkpath => 1) for grep s/^sdoc::js::/js::/ && /^js::(extensions|core)\//, sort keys %data;
file::write('build/caterwaul.js', retrieve('pp::js::caterwaul'), mkpath => 1);
file::write('build/caterwaul.node.js', retrieve('pp::js::caterwaul.node'), mkpath => 1);
my @extensions = qw/std dev ui/;
precompile("extensions/$_.js") for @extensions;
mkdir 'build/extensions';
rename "extensions/$_.pre.js", "build/extensions/$_.pre.js" for @extensions;
__
meta::hook('after-render::notes', <<'__');
# Render each note into an SDoc file.
file::write('doc/notes/' . attribute($_) . '.sdoc', retrieve($_)) for grep /^note::/, keys %data;
__
meta::hook('after-render::tests', 'file::write(\'build/\' . attribute($_) . \'.js\', retrieve("pp::$_"), mkpath => 1) for grep s/^sdoc::js::/js::/ && /^js::test\\//, sort keys %data;');
meta::hook('after-render::tools', 'file::write(\'build/tools/precompile.js\', retrieve(\'pp::js::tools/precompile\'), mkpath => 1);');
meta::internal_function('around_hook', <<'__');
# around_hook('hookname', @args, sub {
# stuff;
# });
# Invokes 'before-hookname' on @args before the sub runs, invokes the
# sub on @args, then invokes 'after-hookname' on @args afterwards.
# The after-hook is not invoked if the sub calls 'die' or otherwise
# unwinds the stack.
my $hook = shift @_;
my $f = pop @_;
hook("before-$hook", @_);
my $result = &$f(@_);
hook("after-$hook", @_);
$result;
__
meta::internal_function('associate', <<'__');
my ($name, $value, %options) = @_;
die "Namespace does not exist" unless exists $datatypes{namespace($name)};
$data{$name} = $value;
execute($name) if $options{'execute'};
$value;
__
meta::internal_function('attribute', <<'__');
my ($name) = @_;
$name =~ s/^[^:]*:://;
$name;
__
meta::internal_function('attribute_is', <<'__');
my ($a, %options) = @_;
my %inherited = parent_attributes(grep /^parent::/o, sort keys %data) if grep exists $options{$_}, qw/-u -U -d -D/;
my $criteria = $options{'--criteria'} || $options{'--namespace'} && "^$options{'--namespace'}::" || '.';
my %tests = ('-u' => sub {! $inherited{$a}},
'-d' => sub {$inherited{$a} && fast_hash(retrieve($a)) ne $inherited{$a}},
'-i' => sub {$transient{inherit}{namespace($a)}},
'-s' => sub {$a =~ /^state::/o},
'-m' => sub {$a =~ /^meta::/o});
return 0 unless scalar keys %tests == scalar grep ! exists $options{$_} || &{$tests{$_}}(), keys %tests;
return 0 unless scalar keys %tests == scalar grep ! exists $options{uc $_} || ! &{$tests{$_}}(), keys %tests;
$a =~ /$criteria/
__
meta::internal_function('bench', <<'__');
use Time::HiRes qw/gettimeofday tv_interval/;
my ($f) = @_;
my $start_time = [gettimeofday];
&$f();
my $duration = tv_interval($start_time);
terminal::info("$duration seconds elapsed");
__
meta::internal_function('cache', <<'__');
my ($name, %pairs) = @_;
if (%pairs) {associate("cache::$name", join "\n", map {$pairs{$_} =~ s/\n//g; "$_ $pairs{$_}"} sort keys %pairs)}
else {map split(/\s/, $_, 2), split /\n/, retrieve("cache::$name")}
__
meta::internal_function('chmod_self', <<'__');
my ($mode_function) = @_;
my (undef, undef, $mode) = stat $0;
chmod &$mode_function($mode), $0;
__
meta::internal_function('dangerous', <<'__');
# Wraps a computation that may produce an error.
my ($message, $computation) = @_;
terminal::info($message) if $message;
my @result = eval {&$computation()};
terminal::warning(translate_backtrace($@)), return undef if $@;
wantarray ? @result : $result[0];
__
meta::internal_function('debug_trace', <<'__');
terminal::debug(join ', ', @_);
wantarray ? @_ : $_[0];
__
meta::internal_function('dep', <<'__');
# A variadic function to prepend cached_dependency:: onto things.
# Used like this: dep(qw/caterwaul.all.js montenegro.server.js/)
map "cached_dependency::$_", @_;
__
meta::internal_function('execute', <<'__');
my ($name, %options) = @_;
my $namespace = namespace($name);
eval {&{"meta::$namespace"}(attribute($name), retrieve($name))};
warn $@ if $@ && $options{'carp'};
__
meta::internal_function('exported', <<'__');
# Allocates a temporary file containing the concatenation of attributes you specify,
# and returns the filename. The filename will be safe for deletion anytime.
my $filename = temporary_name();
file::write($filename, cat(@_));
$filename;
__
meta::internal_function('extension_for', <<'__');
my $extension = $transient{extension}{namespace($_[0])};
$extension = &$extension($_[0]) if ref $extension eq 'CODE';
$extension || '';
__
meta::internal_function('fast_hash', <<'__');
my ($data) = @_;
my $piece_size = length($data) >> 3;
my @pieces = (substr($data, $piece_size * 8) . length($data), map(substr($data, $piece_size * $_, $piece_size), 0 .. 7));
my @hashes = (fnv_hash($pieces[0]));
push @hashes, fnv_hash($pieces[$_ + 1] . $hashes[$_]) for 0 .. 7;
$hashes[$_] ^= $hashes[$_ + 4] >> 16 | ($hashes[$_ + 4] & 0xffff) << 16 for 0 .. 3;
$hashes[0] ^= $hashes[8];
sprintf '%08x' x 4, @hashes[0 .. 3];
__
meta::internal_function('file::read', <<'__');
my $name = shift;
open my($handle), "<", $name;
my $result = join "", <$handle>;
close $handle;
$result;
__
meta::internal_function('file::write', <<'__');
use File::Path 'mkpath';
use File::Basename 'dirname';
my ($name, $contents, %options) = @_;
die "Choosing not to overwrite file $name" if $options{noclobber} and -f $name;
mkpath(dirname($name)) if $options{mkpath};
open my($handle), $options{append} ? '>>' : '>', $name or die "Can't open $name for writing";
print $handle $contents;
close $handle;
__
meta::internal_function('fnv_hash', <<'__');
# A rough approximation to the Fowler-No Voll hash. It's been 32-bit vectorized
# for efficiency, which may compromise its effectiveness for short strings.
my ($data) = @_;
my ($fnv_prime, $fnv_offset) = (16777619, 2166136261);
my $hash = $fnv_offset;
my $modulus = 2 ** 32;
$hash = ($hash ^ ($_ & 0xffff) ^ ($_ >> 16)) * $fnv_prime % $modulus for unpack 'L*', $data . substr($data, -4) x 8;
$hash;
__
meta::internal_function('hypothetically', <<'__');
# Applies a temporary state and returns a serialized representation.
# The original state is restored after this, regardless of whether the
# temporary state was successful.
my %data_backup = %data;
my ($side_effect) = @_;
my $return_value = eval {&$side_effect()};
%data = %data_backup;
die $@ if $@;
$return_value;
__
meta::internal_function('internal::main', <<'__');
disable();
$SIG{'INT'} = sub {snapshot(); exit 1};
$transient{initial} = state();
chomp(my $default_action = retrieve('data::default-action'));
my $function_name = shift(@ARGV) || $default_action || 'usage';
terminal::warning("unknown action: '$function_name'") and $function_name = 'usage' unless $externalized_functions{$function_name};
around_hook('main-function', $function_name, @ARGV, sub {
dangerous('', sub {
chomp(my $result = &$function_name(@ARGV));
print "$result\n" if $result})});
save() unless state() eq $transient{initial};
END {
enable();
}
__
meta::internal_function('invoke_editor_on', <<'__');
my ($data, %options) = @_;
my $editor = $options{editor} || $ENV{VISUAL} || $ENV{EDITOR} || die 'Either the $VISUAL or $EDITOR environment variable should be set to a valid editor';
my $options = $options{options} || $ENV{VISUAL_OPTS} || $ENV{EDITOR_OPTS} || '';
my $attribute = $options{attribute};
$attribute =~ s/\//-/g;
my $filename = temporary_name() . "-$attribute$options{extension}";
file::write($filename, $data);
system("$editor $options '$filename'");
my $result = file::read($filename);
unlink $filename;
$result;
__
meta::internal_function('is_locked', '!((stat($0))[2] & 0222);');
meta::internal_function('namespace', <<'__');
my ($name) = @_;
$name =~ s/::.*$//;
$name;
__
meta::internal_function('parent_attributes', <<'__');
my $attributes = sub {my ($name, $value) = split /\s+/o, $_; $name => ($value || 1)};
map &$attributes(), split /\n/o, join("\n", retrieve(@_));
__
meta::internal_function('parent_ordering', <<'__');
# Topsorts the parents by dependency chain. The simplest way to do this is to
# transitively compute the number of parents referred to by each parent.
my @parents = @_;
my %all_parents = map {$_ => 1} @parents;
my %parents_of = map {
my $t = $_;
my %attributes = parent_attributes($_);
$t => [grep /^parent::/, keys %attributes]} @parents;
my %parent_count;
my $parent_count;
$parent_count = sub {
my ($key) = @_;
return $parent_count{$key} if exists $parent_count{$key};
my $count = 0;
$count += $parent_count->($_) + exists $data{$_} for @{$parents_of{$key}};
$parent_count{$key} = $count};
my %inverses;
push @{$inverses{$parent_count->($_)} ||= []}, $_ for @parents;
grep exists $all_parents{$_}, map @{$inverses{$_}}, sort keys %inverses;
__
meta::internal_function('retrieve', <<'__');
my @results = map defined $data{$_} ? $data{$_} : retrieve_with_hooks($_), @_;
wantarray ? @results : $results[0];
__
meta::internal_function('retrieve_with_hooks', <<'__');
# Uses the hooks defined in $transient{retrievers}, and returns undef if none work.
my ($attribute) = @_;
my $result = undef;
defined($result = &$_($attribute)) and return $result for map $transient{retrievers}{$_}, sort keys %{$transient{retrievers}};
return undef;
__
meta::internal_function('select_keys', <<'__');
my %options = @_;
grep attribute_is($_, %options), sort keys %data;
__
meta::internal_function('separate_options', <<'__');
# Things with one dash are short-form options, two dashes are long-form.
# Characters after short-form are combined; so -auv4 becomes -a -u -v -4.
# Also finds equivalences; so --foo=bar separates into $$options{'--foo'} eq 'bar'.
# Stops processing at the -- option, and removes it. Everything after that
# is considered to be an 'other' argument.
# The only form not supported by this function is the short-form with argument.
# To pass keyed arguments, you need to use long-form options.
my @parseable;
push @parseable, shift @_ until ! @_ or $_[0] eq '--';
my @singles = grep /^-[^-]/, @parseable;
my @longs = grep /^--/, @parseable;
my @others = grep ! /^-/, @parseable;
my @singles = map /-(.{2,})/ ? map("-$_", split(//, $1)) : $_, @singles;
my %options;
$options{$1} = $2 for grep /^([^=]+)=(.*)$/, @longs;
++$options{$_} for grep ! /=/, @singles, @longs;
({%options}, @others, @_);
__
meta::internal_function('strip', 'wantarray ? map {s/^\\s*|\\s*$//g; $_} @_ : $_[0] =~ /^\\s*(.*?)\\s*$/ && $1;');
meta::internal_function('table_display', <<'__');
# Displays an array of arrays as a table; that is, with alignment. Arrays are
# expected to be in column-major order.
sub maximum_length_in {
my $maximum = 0;
length > $maximum and $maximum = length for @_;
$maximum;
}
my @arrays = @_;
my @lengths = map maximum_length_in(@$_), @arrays;
my @row_major = map {my $i = $_; [map $$_[$i], @arrays]} 0 .. $#{$arrays[0]};
my $format = join ' ', map "%-${_}s", @lengths;
join "\n", map strip(sprintf($format, @$_)), @row_major;
__
meta::internal_function('temporary_name', <<'__');
use File::Temp 'tempfile';
my (undef, $temporary_filename) = tempfile("$0." . 'X' x 4, OPEN => 0);
$temporary_filename;
__
meta::internal_function('translate_backtrace', <<'__');
my ($trace) = @_;
$trace =~ s/\(eval (\d+)\)/$locations{$1 - 1}/g;
$trace;
__
meta::internal_function('with_exported', <<'__');
# Like exported(), but removes the file after running some function.
# Usage is with_exported(@files, sub {...});
my $f = pop @_;
my $name = exported(@_);
my $result = eval {&$f($name)};
terminal::warning("$@ when running with_exported()") if $@;
unlink $name;
$result;
__
meta::library('shell', <<'__');
# Functions for shell parsing and execution.
package shell;
use Term::ReadLine;
sub tokenize {grep length, split /\s+|("[^"\\]*(?:\\.)?")/o, join ' ', @_};
sub parse {
my ($fn, @args) = @_;
s/^"(.*)"$/\1/o, s/\\\\"/"/go for @args;
{function => $fn, args => [@args]}}
sub execute {
my %command = %{$_[0]};
die "undefined command: $command{function}" unless exists $externalized_functions{$command{function}};
&{"::$command{function}"}(@{$command{args}})}
sub run {execute(parse(tokenize(@_)))}
sub prompt {
my %options = @_;
my $name = $options{name} // ::name();
my $state = $options{state} // ::state();
my $other = $state ne $transient{initial} ? 33 : 30;
my $locked = ::is_locked() ? "\033[1;31mlocked\033[0;0m" : '';
my $cc = length ::retrieve('data::current-continuation') ? "\033[1;36mcc\033[0;0m" : '';
"\033[1;32m$name\033[1;${other}m" . substr($state, 0, 4) . "\033[0;0m$cc$locked\033[1;34m$options{stuff}\033[0;0m "}
sub repl {
my %options = @_;
my $term = new Term::ReadLine "$0 shell";
$term->ornaments(0);
my $attribs = $term->Attribs;
$attribs->{completion_entry_function} = $attribs->{list_completion_function};
my $autocomplete = $options{autocomplete} || sub {[sort keys %data, sort keys %externalized_functions]};
my $prompt = $options{prompt} || \&prompt;
my $parse = $options{parse} || sub {parse(tokenize(@_))};
my $command = $options{command} || sub {my ($command) = @_; ::around_hook('shell-command', $command, sub {print ::dangerous('', sub {execute($command)}), "\n"})};
&$command(&$parse($_)) while ($attribs->{completion_word} = &$autocomplete(), defined($_ = $term->readline(&$prompt())))}
__
meta::library('terminal', <<'__');
# Functions for nice-looking terminal output.
package terminal;
my $process = ::name();
sub message {print STDERR "[$_[0]] $_[1]\n"}
sub color {
my ($name, $color) = @_;
*{"terminal::$name"} = sub {chomp($_), print STDERR "\033[1;30m$process(\033[1;${color}m$name\033[1;30m)\033[0;0m $_\n" for map join('', $_), @_}}
my %preloaded = (info => 32, progress => 32, state => 34, debug => 34, warning => 33, error => 31);
color $_, $preloaded{$_} for keys %preloaded;
__
meta::message_color('cc', '36');
meta::message_color('state', 'purple');
meta::message_color('states', 'yellow');
meta::message_color('test', 'purple');
meta::message_color('watch', 'blue');
meta::note('core-caterwaul-simplicity', <<'__');
Absence of a core object system.
At first I tried to write caterwaul using an object system to represent its replicative nature, but this turned out to be a major pain. I guess I'm not very good at designing object systems.
In any case, now my goal is to make the caterwaul global as simple as possible.
Here's the new configuration interface:
| var my_caterwaul = caterwaul(function (code) {
return this.macroexpand(code, this.standard_macros, this.html_macros);
});
Because this is such a common use case you can abbreviate it:
| var my_caterwaul = caterwaul.macroexpander('standard_macros', 'html_macros');
But that's it. No more named configurations, no tconfigure(), nothing like that. This has the advantage that no heavy-lifting object system is necessary.
__
meta::note('macroexpander-optimization', <<'__');
Optimization constraints for macroexpansion.
This is a challenging problem. It looks like the total complexity of loading all modules is close to 7000, which isn't too bad by itself. However, there are more than 100 macros during some of
these configurations, and that increases the amount of work by a factor of 100.
It isn't possible to achieve sublinear macroexpansion time in the number of syntax nodes, but I think it is possible to achieve it in the number of macros. I'm not quite sure how this should
work, but probably some kind of indexing strategy is in order.
Masked indexing.
This is probably the most obvious way to solve the problem. The idea is that macroexpand() first builds an index of nodes, then consults the index to determine which macros should be
invoked. This should be roughly O(n) in the size of the tree. The implementation is made challenging by the fact that syntax node patterns aren't necessarily easy to index. The only way I
can think of to go about this sensibly is to compile some kind of mixed mask from all of the macro patterns, and then hash each syntax node based on that mask.
Another possibility is to arrange macro patterns by common factors. Maybe Caterwaul's macroexpander knows how to do things like identify structurally identical trees and dispatch on some
hash of the data, or some such. This involves some fairly advanced decision-making on Caterwaul's part, but I'm fairly confident that a closed-form optimization strategy exists.
Problem statement.
Formally speaking, the problem is to minimize the number of pattern-node match operations. Scanning through all of the nodes up-front is a definite possibility, though each node visit is
somewhat expensive. Match operations are worse, however, since they may have to dig down several layers before rejecting a match, and in the meantime they will have consed up and
concatenated many arrays to build the partial match data.
Potential algorithms for indexing.
There are a lot of commonalities among patterns used for macros, and deriving the structure may be prohibitive. However, I'd like to try anyway. So to state it more clearly, there are a few
possibilities:
| 1. We use an algorithm that is invariant on its input. This is optimal if no structure can be derived from syntax patterns, which obviously isn't the case.
2. We use an algorithm that has heuristics about macros. This may work, but it breaks down for non-Javascript use cases (and it would be cool if Caterwaul were general-purpose).
3. We use an adaptive optimizer to find the best indexing strategy. We can either predict it or test it empirically.
I'm axing (1) right off the bat just because it's no fun. This leaves (2) and (3). Let's see where (3) takes us.
Adaptive optimization.
In this case we know two things ahead of time. First, we know the list of macros that we'll need to apply to the syntax tree, and second we have a fixed number of scans over the tree
(ideally just one) to compute whatever we want to about them. For the sake of comparison, the number to beat is 750000. This means that we can do up to 100 operations per syntax node and
still beat the naive solution. Ideally we do better, but that's a high limit. (Hopefully whatever adaptation needs to happen will happen on the macro end, not the syntax node end -- there
will be at most around 1000 macros, and that would be a pathological case.)
Pattern hashing.
It's fairly straightforward to identify high-information hash regions within the pattern space. All we have to do then is to build a table and do a very fast lookup on each node. However,
what happens when we have two different schemas? For example:
| ([] (let _ _))
([] (foo _ _))
([] (bar _ _))
([] (bif _ _))
This is the first schema, where the path [1].data provides a perfect hash (reduces each tree to at most one match, and from there recursive descent is the only option). However, suppose we
also have these macros:
| ([] (let bif _))
([] (let baz _))
([] (let bok _))
Now the first scheme is perfect only for a subset of inputs. Because of the overlap (which there is in the real world too), we need a multi-stage algorithm to handle failover scenarios like
this one.
One important idea here is that we want to generalize to a /family/ of macros, not construct a perfect hash. It's acceptable to have to try a couple of different things; we just need to
bound the number. Ideally we do the math to find the optimum; i.e. generate more hashes until the overhead of hashing each node outweighs the overhead of each test in a subset of cases.
(There may not be such a moment, for example if two macros have the same pattern.)
Along these lines, it's worth coming up with some simplifications. Pattern matching is transitive; if A matches pattern B and B matches pattern C, then A matches pattern C. This means that
we can remove earlier macros that are matched by later macro-patterns. (Here the earlier macro is B, the later one is A, and C is the syntax tree we'll be matching.) This should simplify the
hashing by removing all duplicate cases, and is O(n^2) in the number of macros. (In fact we can treat macro patterns as some kind of fully-ordered space to get O(n log n).)
Syntactic heterogeneity.
How heterogeneous is the syntax? Half of the nodes must be operators or other punctuation, and this is chosen out of a set of 20. The other half must be identifiers of some sort. There's
probably a curve that dictates how likely each identifier is to be seen elsewhere, but I'm also guessing that the pattern is fractal; that is, each subtree has about the same amount of
information density. If this is the case then it's reasonable to use this as a limit for how much information can be derived.
Interrupting thought: Actually, it doesn't matter how much information is present in the syntax tree. The reason is that the macro patterns will end up dictating that; all we need to do is
distinguish among them, and we're given at least that much information.
Constraints.
Because of terrible string-concat performance from some Javascript runtimes, it's important not to allocate new strings during this hashing process. So any hashing that's done will either
(1) have to be on a single field, or (2) have to use numbers rather than strings. (2) is probably a more robust solution.
Interning strings -> integers.
This has some cost initially, and depending on the JS runtime it may or may not make sense. The idea is to unify symbols into hashable integers rather than leaving them as strings. It has
the advantage of using little extra space and allowing strings to be used in low-overhead arithmetic ways (by their bijection onto numbers). Because we don't want to keep the global index
table around forever, it's important to use only local indexes. This has important consequences, perhaps most significantly that we'll have to regenerate the index for each macroexpansion
cycle. I don't consider this too much of a problem. It's nice to have optimizations not pervade the overall system design too much.
Node ordering (irrelevant -- see below).
If we want to skip the post-indexing traversal step altogether (which sounds attractive to me), then we need to know the right order to expand stuff. It goes outside-in and left-to-right, so
a pre-order tree is required. The rule will be that as we're constructing the indexes we also maintain a counter that gets incremented for each node. (Conveniently, the depth-first traversal
built in to nodes does what we need here.) Then each list of nodes to be transformed will be sorted by this counter. While the asymptotic complexity is worse than a regular depth-first
search, it should be much faster in practice.
Interrupting thought: Why not just transform the nodes as soon as we hit them? No need to build these lists at all. Therefore node ordering is irrelevant to the optimization problem.
__
meta::note('math', <<'__');
Mathematical constructs in caterwaul 1.0.
Caterwaul is no longer just about transforming syntax in convenient ways. It's also about doing totally off the hook stuff with macros and adding structure to its extensions.
| 1. Macros should have standard forms. One common one is the "adverb", which includes things like when[], unless[], where[], se[], re[], etc.
2. Expressions should have well-defined destructuring semantics (see below).
Destructuring.
The lvalue() macro needs to become significantly more capable. Here's an example:
| Foo = fc[x, y][this.x = x, this.y = y];
Bar = fc[x, y][this.x = x, this.y = y];
f(new Foo(x, y)) = x + y;
f(new Bar(x, y)) = x * y;
f(new Foo(3, 4)) // returns 7
f(new Bar(3, 4)) // returns 12
| f(x + 1) = Number(x) * 2
f(4) // returns 6
f('41') // returns 8
I'm not 100% sure how to do the failover for pattern matching because it requires knowledge of previous cases. The generated code would have to end up looking like this:
| Foo = ...
Bar = ...
f = function (gs_value) {
var gs_bindings = null;
if (gs_value.constructor === Foo && Foo.gs_caterwaul_invert && (gs_bindings = Foo.gs_caterwaul_invert(gs_value)))
return (function (x, y) {return x + y}).call(this, gs_bindings.x, gs_bindings.y);
else if (gs_value.constructor === Bar && Bar.gs_caterwaul_invert && (gs_bindings = Bar.gs_caterwaul_invert(gs_value)))
return (function (x, y) {return x * y}).call(this, gs_bindings.x, gs_bindings.y);
else
return undefined;
};
Implementing function inversion.
It's possible to invert a function. All we have to do is assign it combinatory semantics and then invert each combinator in turn. For instance:
| f(x, y) = {foo: x, bar: y} // If this definition is made...
f.gs_caterwaul_invert(o) = o && {x: o.foo, y: o.bar} // this one will be too.
Inversion can happen even over closures:
| f(g)(x, y) = {foo: x, bar: y, bif: g};
f(g).gs_caterwaul_invert(o) = o && o.bif === g && {x: o.foo, y: o.bar};
f.gs_caterwaul_invert(g)(o) = o && o.bif === g && {x: o.foo, y: o.bar};
Failover.
This is really tricky. We have to know in advance whether f has been defined. For example:
| f(x, y) = x + y;
f(new Foo(bar)) = bar.x + bar.y;
What does the generated code look like? (Especially considering that each of these is an imperative statement.)
Interesting case:
| f(x, y) = x + y;
f(3, 4); // should return 7
f(x + 1, y) = x * y;
f(3, 4); // should return 8
Actually, we don't care whether f is already defined; we can assume it:
| f(x, y) = x + y
f(new Foo(bar)) = bar.x * bar.y;
Here's the code:
| f = function (x, y) {return x + y}; // No modifiers; gets inlined into an erasing assignment
f = (function (old_f) {
var this_case = function (x, y) {return x * y};
return function (gensym) {
var gensym_result = Foo && Foo.gensym_caterwaul_invert && Foo.gensym_caterwaul_invert(gensym);
if (gensym_result) return this_case.call(this, gensym_result.x, gensym_result.y);
else return old_f.apply(this, arguments);
};
}).call(this, typeof f === 'undefined' ? function () {throw new Error('no matching case for f')} : f);
It's true that this imposes a linear-time overhead in the number of cases, but it's the best we can do for dynamic assignment.
Consequences of ubiquitous inversion.
One nice side-effect of assigning an inverse to every function is that you can have pattern-matching functions invert each other:
| f(x, y) = x + y
g(f(x, 1)) = x * 2
g(10) // -> 18
Perhaps more interestingly:
| modifier(m, x) = m + ' ' + x;
variable(n, v) = n + ' = ' + v;
g(modifier('var', variable('x', value))) = alert('x is assigned ' + value);
We'd have to detect the presence of a constant in the destructuring pattern and propagate it through the combinatory logic somehow. (This is computationally expensive!) However, it's also
really important to be able to do.
Anonymous destructuring functions.
There isn't a good reason fn[] couldn't take on destructuring semantics. The only difficulty is having multiple alternatives.
| fn[new Foo(x, y)][x + y]
fn[x + 1][x * (x + 1)]
Ultimately the problem is that Javascript doesn't have a way to represent parameter-based multiple dispatch. So pattern matching is something that gets superimposed at the last minute.
__
meta::note('q', 'Object system is now implemented, so at this point the priority is to get stuff working with it.');
meta::note('wrappers', <<'__');
Dealing with Javascript suckage.
Javascript's builtin types are a mess. There isn't a reasonable hierarchy for things that are ordered (though < is magic enough to work properly most of the time), arrays aren't really
extensible, objects are a disaster, etc. Part of this is due to the crossover between the method table and the key-value pairing present on all references. But in any case, it would be really
nice to have a better interface for these things.
Tempting as it is to rebuild all of these types using real data structures, caterwaul isn't trying to solve that problem. So caterwaul internally and externally uses regular old Javascript
types whenever practical. (A notable exception is the global caterwaul function, which is an instance of itself under the caterwaul object system. But it produces and accepts standard
Javascript arrays and objects in general.)
It should be very easy to wrap standard data types however. So caterwaul provides a series of monads
__
meta::parent('/home/spencertipping/bin/configuration', <<'__');
meta::type::configuration d67e10a128e6b1d958c5b9d3bbe25aa4
parent::/home/spencertipping/bin/object 4e114ee5933f8a10ce856c55da4aea34
__
meta::parent('/home/spencertipping/bin/node-base', <<'__');
function::loc 36e0cabf1fe1c1bcaa3c8c708bd82ca0
function::node 3522fde8f76947a5f70bca33f1ee0016
function::node-custom c2f4063798c997ec7f78a3543b1240b3
function::render 4962b4a2e6bc800fd5c11b97550b623e
function::run-forever 76175932a2d2692fc802856c28d0848d
internal_function::dep bad9b934374b176318ed2295b63130bc
message_color::test 14e993fdf2c62df353613c243dc9053b
meta::type::js f7a0080116fecebf7abb030b0156f44d
parent::/home/spencertipping/bin/repository 3e7c06920f556c7ca97aa67b18470f03
parent::/home/spencertipping/conjectures/perl-objects/sdoc d13d5c65b3faaef717d55f9a59e6e842
__
meta::parent('/home/spencertipping/bin/object', <<'__');
bootstrap::html 9b839f4c0c8d6bb18bf6d9d0e670e497
bootstrap::initialization b39f25c213ceb0418551084d1bddeb20
bootstrap::perldoc c63395cbc6f7160b603befbb2d9b6700
function::alias 28744564997657da45ab16cd5b441104
function::cat a19fdfda461f9a0aa01978dff2e2c2f7
function::cc c4d52b1d8f52a480f07b81b93c3aac7b
function::ccc 2351344fc688518c75aa4ab3acec1c4a
function::child f69646398c3123d3d939a7f2b3156606
function::clone 54e00ff2103e54423d4c9febb97ce063
function::cp e5fee448a74ecbf4ae215e6b43dfc048
function::create 090c342a2dc304b39c643d53350474a0
function::current-state d83ae43551c0f58d1d0ce576402a315a
function::disable 49db26fcd680ca34c1f52629a7375d2f
function::edit c31d73791820a10ec910af043f67d4eb
function::enable 37af2d2e603e2455be227ae3c5a42c3a
function::export 388e0cc60507443cb1c0cc3e2658cfef
function::extern 6a9fa8c2a8a4eaae9fe8d38e402145e4
function::grep 3ad630ca30f1abae3cf28ef6293f0e00
function::hash 7a903f90f2c8ed27af7e030f407d9f7b
function::hook d74d8e2b611342af6a0897e0bd62e6e6
function::hooks 230122bdc8929884e45b2f78a7743e2e
function::identity 37106ce13a0200af001d361ce7e81e57
function::import ac86cbe9c9fb12fc8cef2cc88e80c01e
function::initial-state e21ba3519838b221a7b2d4e8a7544e7f
function::is 1d3401965e4720ed972470b54b447db0
function::load-state ea18db867bd62a294e067f60e6975dcf
function::lock 9bf21fee2f0f809131d43553bde82fa5
function::ls 61a247f0a582f63d85b5d5a963a46893
function::mv 52e95180e3c7019116bd798e0da0fdda
function::name 6848cbc257e4b6d7441b25acb04e23c9
function::parents f94c3a5addbc92fe7f90d198fa701484
function::perl 986a274c013b77fe08d29726ce3799fe
function::reload c57ff432c3ffd91a5506cd3eb8bf50c9
function::rm 7cecfb1691a7bf86741f00058bcc54ca
function::save 181da4858ac39c157dbf38e6bac7a0d2
function::save-state 863e4d9fa75ca198ef7a755248d1002a
function::serialize 5148e8ca46eeb3e297f76d098e496bcf
function::serialize_single e7a22f806b580d5c63982ffc7f218a1a
function::sh 9647fa9227bef6c139a79d0dd4acc8b4
function::shell dc8238ad70b1e02eaf230c4f1bd21690
function::size 140728e163286a0f3f514f468c475cfc
function::snapshot d3d84a364524eeb8ee90623f545187e8
function::state 119111f84c3e32a5536838ac84bc6f10
function::touch 819878bc64df094d3323c7050f2c3e97
function::unlock 8fc9bd69f3466f0b54ee2c6965f68cea
function::update 4de1a6a4085836590a3b1ef997f9d5ea
function::update-from beead29d50b0b204bc9a8dc6f07d0003
function::usage b36ead828ad566c8e3919f3b40fb99e6
function::verify d31b85fffd464ddf516d2afeb63dcbde
internal_function::around_hook e1cd17b80d4e8165df9c94facd9f239b
internal_function::associate fc4f785bcf3ffe3225a73a1fdd314703
internal_function::attribute 62efb9f22157835940af1d5feae98d98
internal_function::attribute_is c88e0f87cf807ab0af73ca10032b54b2
internal_function::cache c119f9d7ea9a147c6d526a6283fb119a
internal_function::chmod_self b13487447c65f2dc790bd6b21dde89dd
internal_function::dangerous 4b8343178d6d4d1b760d61b1cfda008c
internal_function::debug_trace 77644ab45a770a6e172680f659911507
internal_function::execute 4b4efc33bc6767a7aade7f427eedf83f
internal_function::exported 27414e8f2ceeaef3555b9726e690eb0f
internal_function::extension_for 65e48f50f20bc04aa561720b03bf494c
internal_function::fast_hash ac70f469e697725cfb87629833434ab1
internal_function::file::read 186bbcef8f6f0dd8b72ba0fdeb1de040
internal_function::file::write eb7b1efebe0db73378b0cce46681788d
internal_function::fnv_hash 8d001a3a7988631bab21a41cee559758
internal_function::hypothetically 33ee2e1595d3877bd1d9accaa72305c8
internal_function::internal::main 435a9e83ac803960745d9aa5aac6c75f
internal_function::invoke_editor_on 1448132d5294a4b8390b4a684d8a78f9
internal_function::is_locked 42c3c89625863a31105d1df49a2a762f
internal_function::namespace 93213d60cafb9627e0736b48cd1f0760
internal_function::parent_attributes 480776f176ab728c6b7450287be5782a
internal_function::parent_ordering 87bee77390de5b9e9c6e07f9af9eb70a
internal_function::retrieve 0b6f4342009684fdfa259f45ac75ae37
internal_function::retrieve_with_hooks 5186a0343624789d08d1cc2084550f3d
internal_function::select_keys 1cd075b241905d9ab3199e680e86dced
internal_function::separate_options d47e8ee23fe55e27bb523c9fcb2f5ca1
internal_function::strip 4af6a470effeed94c0dd9800d01f7d66
internal_function::table_display 8a6897e093f36bf05477a3889b84a61d
internal_function::temporary_name 0fb1402061581b69822f913631b4a9d9
internal_function::translate_backtrace 06fad3d85833a6484e426401b95e0206
internal_function::with_exported fc4f32c46d95c6deed0414364d1c7410
library::shell 528f486cc4d9eb390e4c350b8727c751
library::terminal c52308d05ebb4ff61c5fc36e6d9c7a8a
message_color::cc 6249446f73b3c5af2404c322c150e57b
message_color::state 14e993fdf2c62df353613c243dc9053b
message_color::states 152a940086f7cee6110528a09af7dd78
meta::configure 25976e07665878d3fae18f050160343f
meta::externalize 9141b4e8752515391385516ae94b23b5
meta::functor::editable e3d2ede6edf65ffe2123584b2bd5dab7
meta::type::alias 28fe15dd61f4902ed5180d8604d15d97
meta::type::bootstrap 297d03fb32df03b46ea418469fc4e49e
meta::type::cache 52eac0d7b550a358cc86803fe1a9d921
meta::type::data 58d8027f20099b28a159eaac67314051
meta::type::function d93b3cc15693707dac518e3d6b1f5648
meta::type::hook f55a3f728ddfb90204dff3fe5d86845c
meta::type::inc c95915391b969734305f2f492d5ca8e3
meta::type::internal_function 34abb44c67c7e282569e28ef6f4d62ab
meta::type::library b6dd78120e6d787acdb5c1629f7f1896
meta::type::message_color 794bf137c425293738f07636bcfb5c55
meta::type::meta 640f25635ce2365b0648962918cf9932
meta::type::parent 607e9931309b1b595424bedcee5dfa45
meta::type::retriever 6e847a9d205e4a5589765a3366cdd115
meta::type::state c1f29670be26f1df6100ffe4334e1202
retriever::file b8e7aefc98b8341260d91f21dc61d749
retriever::id a791a5735e9b4f2cb8b99fd39dc17bc3
__
meta::parent('/home/spencertipping/bin/repository', <<'__');
function::dupdate 1c0273217c5b9f2756bb14a4a00aa7e2
meta::type::cached_dependency e9455b403cbff27bbcc41d917fef482f
parent::/home/spencertipping/bin/configuration 4217fa0ccd97c9eecfbb342e8f740271
__
meta::parent('/home/spencertipping/conjectures/perl-objects/sdoc', <<'__');
function::sdoc 060cfa349e629eb90a82b87a8ba00c1d
function::sdoc-html 5fffe6139a81ff67ee063ebcbfadad2a
function::sdocp 8b7ed5bbd537234ae53c0691b6d02c97
meta::type::sdoc 392c65eddae300e2aa67014b85884979
parent::/home/spencertipping/bin/object 4e114ee5933f8a10ce856c55da4aea34
retriever::html-sdoc 54d22ab83d9f3f2e27ef51033b4817a7
retriever::sdoc 6de8ec1f1436fb8e7477b533c321081b
retriever::sdocp fef74cd94fa8761618662802f0bfc171
__
meta::parent('notes', <<'__');
function::note bcbfeac6dd2112f47296265444570a6e
function::notes 15b3aca0cba3b327a984928154eda2b5
meta::type::note 7ca6b375a66ecbfd14bef5bcefeb6643
parent::object 4e114ee5933f8a10ce856c55da4aea34
__
meta::parent('object', <<'__');
bootstrap::html 9b839f4c0c8d6bb18bf6d9d0e670e497
bootstrap::initialization b39f25c213ceb0418551084d1bddeb20
bootstrap::perldoc c63395cbc6f7160b603befbb2d9b6700
function::alias 28744564997657da45ab16cd5b441104
function::cat a19fdfda461f9a0aa01978dff2e2c2f7
function::cc c4d52b1d8f52a480f07b81b93c3aac7b
function::ccc 2351344fc688518c75aa4ab3acec1c4a
function::child f69646398c3123d3d939a7f2b3156606
function::clone 54e00ff2103e54423d4c9febb97ce063
function::cp e5fee448a74ecbf4ae215e6b43dfc048
function::create 090c342a2dc304b39c643d53350474a0
function::current-state d83ae43551c0f58d1d0ce576402a315a
function::disable 49db26fcd680ca34c1f52629a7375d2f
function::edit c31d73791820a10ec910af043f67d4eb
function::enable 37af2d2e603e2455be227ae3c5a42c3a
function::export 388e0cc60507443cb1c0cc3e2658cfef
function::extern 6a9fa8c2a8a4eaae9fe8d38e402145e4
function::grep 3ad630ca30f1abae3cf28ef6293f0e00
function::hash 7a903f90f2c8ed27af7e030f407d9f7b
function::hook d74d8e2b611342af6a0897e0bd62e6e6
function::hooks 230122bdc8929884e45b2f78a7743e2e
function::identity 37106ce13a0200af001d361ce7e81e57
function::import ac86cbe9c9fb12fc8cef2cc88e80c01e
function::initial-state e21ba3519838b221a7b2d4e8a7544e7f
function::is 1d3401965e4720ed972470b54b447db0
function::load-state ea18db867bd62a294e067f60e6975dcf
function::lock 9bf21fee2f0f809131d43553bde82fa5
function::ls 61a247f0a582f63d85b5d5a963a46893
function::mv 52e95180e3c7019116bd798e0da0fdda
function::name 6848cbc257e4b6d7441b25acb04e23c9
function::parents f94c3a5addbc92fe7f90d198fa701484
function::perl 986a274c013b77fe08d29726ce3799fe
function::reload c57ff432c3ffd91a5506cd3eb8bf50c9
function::rm 7cecfb1691a7bf86741f00058bcc54ca
function::save 181da4858ac39c157dbf38e6bac7a0d2
function::save-state 863e4d9fa75ca198ef7a755248d1002a
function::serialize 5148e8ca46eeb3e297f76d098e496bcf
function::serialize_single e7a22f806b580d5c63982ffc7f218a1a
function::sh 9647fa9227bef6c139a79d0dd4acc8b4
function::shell dc8238ad70b1e02eaf230c4f1bd21690
function::size 140728e163286a0f3f514f468c475cfc
function::snapshot d3d84a364524eeb8ee90623f545187e8
function::state 119111f84c3e32a5536838ac84bc6f10
function::touch 819878bc64df094d3323c7050f2c3e97
function::unlock 8fc9bd69f3466f0b54ee2c6965f68cea
function::update 4de1a6a4085836590a3b1ef997f9d5ea
function::update-from beead29d50b0b204bc9a8dc6f07d0003
function::usage b36ead828ad566c8e3919f3b40fb99e6
function::verify d31b85fffd464ddf516d2afeb63dcbde
internal_function::around_hook e1cd17b80d4e8165df9c94facd9f239b
internal_function::associate fc4f785bcf3ffe3225a73a1fdd314703
internal_function::attribute 62efb9f22157835940af1d5feae98d98
internal_function::attribute_is c88e0f87cf807ab0af73ca10032b54b2
internal_function::cache c119f9d7ea9a147c6d526a6283fb119a
internal_function::chmod_self b13487447c65f2dc790bd6b21dde89dd
internal_function::dangerous 4b8343178d6d4d1b760d61b1cfda008c
internal_function::debug_trace 77644ab45a770a6e172680f659911507
internal_function::execute 4b4efc33bc6767a7aade7f427eedf83f
internal_function::exported 27414e8f2ceeaef3555b9726e690eb0f
internal_function::extension_for 65e48f50f20bc04aa561720b03bf494c
internal_function::fast_hash ac70f469e697725cfb87629833434ab1
internal_function::file::read 186bbcef8f6f0dd8b72ba0fdeb1de040
internal_function::file::write eb7b1efebe0db73378b0cce46681788d
internal_function::fnv_hash 8d001a3a7988631bab21a41cee559758
internal_function::hypothetically 33ee2e1595d3877bd1d9accaa72305c8
internal_function::internal::main 435a9e83ac803960745d9aa5aac6c75f
internal_function::invoke_editor_on 1448132d5294a4b8390b4a684d8a78f9
internal_function::is_locked 42c3c89625863a31105d1df49a2a762f
internal_function::namespace 93213d60cafb9627e0736b48cd1f0760
internal_function::parent_attributes 480776f176ab728c6b7450287be5782a
internal_function::parent_ordering 87bee77390de5b9e9c6e07f9af9eb70a
internal_function::retrieve 0b6f4342009684fdfa259f45ac75ae37
internal_function::retrieve_with_hooks 5186a0343624789d08d1cc2084550f3d
internal_function::select_keys 1cd075b241905d9ab3199e680e86dced
internal_function::separate_options d47e8ee23fe55e27bb523c9fcb2f5ca1
internal_function::strip 4af6a470effeed94c0dd9800d01f7d66
internal_function::table_display 8a6897e093f36bf05477a3889b84a61d
internal_function::temporary_name 0fb1402061581b69822f913631b4a9d9
internal_function::translate_backtrace 06fad3d85833a6484e426401b95e0206
internal_function::with_exported fc4f32c46d95c6deed0414364d1c7410
library::shell 528f486cc4d9eb390e4c350b8727c751
library::terminal c52308d05ebb4ff61c5fc36e6d9c7a8a
message_color::cc 6249446f73b3c5af2404c322c150e57b
message_color::state 14e993fdf2c62df353613c243dc9053b
message_color::states 152a940086f7cee6110528a09af7dd78
meta::configure 25976e07665878d3fae18f050160343f
meta::externalize 9141b4e8752515391385516ae94b23b5
meta::functor::editable e3d2ede6edf65ffe2123584b2bd5dab7
meta::type::alias 28fe15dd61f4902ed5180d8604d15d97
meta::type::bootstrap 297d03fb32df03b46ea418469fc4e49e
meta::type::cache 52eac0d7b550a358cc86803fe1a9d921
meta::type::data 58d8027f20099b28a159eaac67314051
meta::type::function d93b3cc15693707dac518e3d6b1f5648
meta::type::hook f55a3f728ddfb90204dff3fe5d86845c
meta::type::inc c95915391b969734305f2f492d5ca8e3
meta::type::internal_function 34abb44c67c7e282569e28ef6f4d62ab
meta::type::library b6dd78120e6d787acdb5c1629f7f1896
meta::type::message_color 794bf137c425293738f07636bcfb5c55
meta::type::meta 640f25635ce2365b0648962918cf9932
meta::type::parent 607e9931309b1b595424bedcee5dfa45
meta::type::retriever 6e847a9d205e4a5589765a3366cdd115
meta::type::state c1f29670be26f1df6100ffe4334e1202
retriever::file b8e7aefc98b8341260d91f21dc61d749
retriever::id a791a5735e9b4f2cb8b99fd39dc17bc3
__
meta::parent('preprocessor', <<'__');
function::preprocess 66e539d29e9afa903569efad0eb7c886
meta::type::template 25f4d6eafb1d3eea6d5d3d9a71a5623e
parent::object 4e114ee5933f8a10ce856c55da4aea34
retriever::pp f4a8c288d69963d6ebc5ce0bf7794777
template::comment 7f8e9be5cd7c865fa64efe3123f62b38
template::eval eb0b1058649eb2d833f348540516b358
template::failing_conditional 5c593329b434a7044f68cec4b77e8ed9
template::include e0624844a65ae41e0217dd871fc0dbfb
template::pinclude 5ba61c3034a4b183881936aec30d2be9
__
meta::retriever('file', '-f $_[0] ? file::read($_[0]) : undef;');
meta::retriever('html-sdoc', <<'__');
my ($attribute) = @_;
return undef unless $attribute =~ s/^html::/sdoc::/ and exists $data{$attribute};
&{'sdoc-html'}($attribute);
__
meta::retriever('id', '$_[0] =~ /^id::/ ? substr($_[0], 4) : undef;');
meta::retriever('pp', <<'__');
return undef unless namespace($_[0]) eq 'pp';
my $attr = retrieve(attribute($_[0]));
defined $attr ? preprocess($attr) : undef;
__
meta::retriever('sdoc', 'exists $data{"sdoc::$_[0]"} ? sdoc("sdoc::$_[0]") : undef;');
meta::retriever('sdocp', <<'__');
my $attribute = attribute($_[0]);
exists $data{"sdoc::$attribute"} ? sdocp("sdoc::$attribute") : undef;
__
meta::sdoc('js::caterwaul', <<'__');
Core caterwaul build with version ID | Spencer Tipping
Licensed under the terms of the MIT source code license
- pinclude pp::js::core/caterwaul
- eval "caterwaul.version('" . fast_hash(retrieve('pp::js::core/caterwaul')) . "');"
__
meta::sdoc('js::caterwaul.all', <<'__');
This file isn't rendered -- it's just used internally for node testing.
- pinclude pp::js::caterwaul
- pinclude pp::js::extensions/std
- pinclude pp::js::extensions/dev
- pinclude pp::js::extensions/ui
__
meta::sdoc('js::caterwaul.node', <<'__');
CommonJS-compatible Caterwaul build | Spencer Tipping
Licensed under the terms of the MIT source code license
- pinclude pp::js::caterwaul
caterwaul.merge(exports, caterwaul);
__
meta::sdoc('js::core/caterwaul', <<'__');
Introduction.
Caterwaul is a Javascript-to-Javascript compiler. Visit http://spencertipping.com/caterwaul/caterwaul.html for information about how and why you might use it.
(function (f) {return f(f, (function (x) {return function () {return ++x}})(0))})(function (initializer, unique, undefined) {
- pinclude pp::js::core/caterwaul.utilities
- pinclude pp::js::core/caterwaul.global
- pinclude pp::js::core/caterwaul.parser-data
- pinclude pp::js::core/caterwaul.tree
- pinclude pp::js::core/caterwaul.parser
- pinclude pp::js::core/caterwaul.compiler
- pinclude pp::js::core/caterwaul.macroexpander
- pinclude pp::js::core/caterwaul.precompile-support
- pinclude pp::js::core/caterwaul.init
return caterwaul = caterwaul_global = caterwaul_global.clone()});
__
meta::sdoc('js::core/caterwaul.compiler', <<'__');
Environment-dependent compilation.
It's possible to bind variables from 'here' (i.e. this runtime environment) inside a compiled function. The way we do it is to create a closure using a gensym. (Another reason that gensyms
must really be unique.) Here's the idea. We use the Function constructor to create an outer function, bind a bunch of variables directly within that scope, and return the function we're
compiling. The variables correspond to gensyms placed in the code, so the code will have closure over those variables.
An optional second parameter 'environment' can contain a hash of variable->value bindings. These will be defined as locals within the compiled function.
New in caterwaul 0.6.5 is the ability to specify a 'this' binding to set the context of the expression being evaluated.
caterwaul_global.compile = function (tree, environment) {
var vars = [], values = [], bindings = merge({}, this._environment || {}, environment || {}, tree.bindings()), s = gensym();
for (var k in bindings) if (own.call(bindings, k)) vars.push(k), values.push(bindings[k]);
var variable_definitions = map(function (v) {return v === 'this' ? '' : 'var ' + v + '=' + s + '.' + v}, vars).join(';'),
return_statement = 'return(' + tree.toString() + ')',
function_body = variable_definitions + ';' + return_statement;
try {return (new Function(s, function_body)).call(bindings['this'], bindings)}
catch (e) {throw new Error(e + ' while compiling ' + function_body)}};
__
meta::sdoc('js::core/caterwaul.global', <<'__');
Global caterwaul variable.
Caterwaul creates a global symbol, caterwaul. Like jQuery, there's a mechanism to get the original one back if you don't want to replace it. You can call caterwaul.deglobalize() to return
caterwaul and restore the global that was there when Caterwaul was loaded (might be useful in the unlikely event that someone else named their library Caterwaul). Note that deglobalize() is
available only on the global caterwaul() function.
var calls_init = function () {var f = function () {return f.init.apply(f, arguments)}; return f},
original_global = typeof caterwaul === 'undefined' ? undefined : caterwaul,
caterwaul_global = calls_init();
caterwaul_global.deglobalize = function () {caterwaul = original_global; return caterwaul_global};
Version management and reinitialization.
There's an interesting case that comes up when loading a global caterwaul. If we detect that the caterwaul we just loaded has the same version as the one that's already there, we revert back
to the original. This is very important for precompilation and the reason for it is subtle. Precompilation relies on tracing to determine the compiled form of each function handed to
caterwaul, so if that caterwaul is replaced for any reason then the traces won't happen. A very common setup is something like this:
|
Often you'll want to precompile the whole bundle, since caterwaul.js includes behaviors that aren't necessarily precompiled and you might get better minification. To do this, it's tempting
to precompile the whole bundle of caterwaul, the extensions, and your code. Without version checking, however, the traces would be lost and nothing would happen.
There is, of course, a pathological failure case in all of this. If you load three caterwauls [why?] and the second of the three has a different version than the other two, then you'll still
get precompiled erasure. I personally don't care about this case. You'd have to be insane to do crazy stuff like this and expect precompilation to work.
caterwaul_global.version = function (v) {return v ? (this._version = v, original_global && original_global.version() === v ? this.deglobalize() : this) : this._version};
caterwaul_global.reinitialize = function (transform) {var c = (transform || function (x) {return x})(this.initializer);
return c(c, this.unique).version(this.version())};
Utility methods.
These are available for use by compiler functions or the end user.
merge(caterwaul_global, {
merge: merge,
map: map,
rmap: rmap,
flatten: flatten,
gensym: gensym,
unique: unique,
initializer: initializer,
variadic: function (f) {return function () {for (var r = [], i = 0, l = arguments.length; i < l; ++i) r.push(f.call(this, arguments[i])); return r}},
right_variadic: function (f) {return function () {for (var r = [], i = 0, l = arguments.length - 1, x = arguments[l]; i < l; ++i) r.push(f.call(this, arguments[i], x)); return r}}});
__
meta::sdoc('js::core/caterwaul.init', <<'__');
Init method.
This runs a compilation stage. If the input is a function or string, then the function or string is parsed, run through the macroexpander, and compiled.
caterwaul_global.clone = function (f) {return se(this.merge(calls_init(), this, this.instance_methods(), {constructor: this}), function () {
delete this._id, delete this._macros, delete this._environment})};
Compiler instance methods/attributes.
These are installed on each generated compiler function. You can change some of them if you know what you're doing (for instance, you can create a compiler for a different programming
language by changing the 'parse' function to handle different input). Unlike caterwaul < 1.0 there is no support for cloning a compiler function. However, you can compose things nicely by
doing stuff like this:
| var my_caterwaul = caterwaul(function (code) {...});
var other_caterwaul = caterwaul(my_caterwaul);
other_caterwaul.parse = function (x) {...};
In this example, other_caterwaul delegates its macroexpansion to my_caterwaul, but it uses a custom parse function.
caterwaul_global.instance_methods = function () {
return {compile: this.compile,
parse: this.parse,
macroexpand: this.macroexpand,
syntax: this.syntax,
ref: this.ref,
id: this.syntax_common.id,
init_function: this.init_function || this.macroexpand,
instance_methods: this.instance_methods,
ensure_syntax: this.ensure_syntax,
ensure_pattern: this.ensure_pattern,
ensure_expander: this.ensure_expander,
environment: function (e) {return arguments.length ? (this._environment = e, this) : this._environment},
macros: function () {return arguments.length ? (this._macros = this.flatten.apply(this, arguments), this) : this._macros},
toString: function () {return '[caterwaul instance ' + this.id() + ']'},
init: function (f, environment) {return this.is_precompiled(f) || this.init_not_precompiled(f, environment)},
init_not_precompiled: function (f, environment) {return f.constructor === this.syntax ? this.init_function(f) : this.compile(this(this.parse(f)), environment)}}};
__
meta::sdoc('js::core/caterwaul.macroexpander', <<'__');
Macroexpansion.
Caterwaul's main purpose is to transform your code, and the easiest way to transform things is through macroexpansion. The idea is to locate syntax nodes with a given pattern and rewrite them
somehow. For example, suppose we wanted to define a macro to enable the postfix /log modifier. Here's how it might look:
| x /log -> (function (it) {console.log(it); return it})(x)
The macro needs to first identify things of the form '_something /log' and transform them accordingly. Here's a macro to do that:
| var m = caterwaul.macro('_something /log', '(function (it) {console.log(it); return it})(_something)');
Building macros.
Caterwaul gives you several ways to build macros. The simplest is to use caterwaul.macro() as shown above. It will parse each string, using the first as a pattern and the second as a
template. It then fills in the values on the right from the ones on the left and re-expands the result. In place of each string, caterwaul.macro() can take either a syntax tree or a
function. The function on the left should take a syntax tree, try to match the pattern against it, and return either a match object or false. The function on the right should take a match
object and return a new syntax tree. (It won't be invoked if the left function returned false.)
Macro function internals.
Caterwaul's macros are just functions from syntax to syntax. They return false if they don't match a particular node. So, for example, the macro '_x * 2' -> '_x << 1' would return 'a << 1'
on 'a * 2', but would return false on 'a * 3'. The macroexpander knows to descend into child nodes when a macroexpander returns false. If a macroexpander returns a value then that value is
taken and no further expansion is performed. (This is necessary if we want to implement literal macros -- that is, literal(x) -> x and x isn't macroexpanded further.)
If a macro wants to re-expand stuff it should use 'this.expand', which invokes the macroexpander on a tree. Most of the time macros will do this, and it's done automatically by
caterwaul.macro() when you use a string or a syntax tree as the expansion. You'll have to call this.expand() if you're using a function as an expander.
caterwaul_global.ensure_syntax = function (thing) {return thing && thing.constructor === String ? this.parse(thing) : thing};
caterwaul_global.ensure_pattern = function (pattern) {return pattern.constructor === String ? this.ensure_pattern(this.parse(pattern)) :
pattern.constructor === this.syntax ? function (tree) {return pattern.match(tree)} : pattern};
caterwaul_global.ensure_expander = function (expander) {return expander.constructor === String ? this.ensure_expander(this.parse(expander)) :
expander.constructor === this.syntax ? function (match) {return this.expand(expander.replace(match))} : expander};
caterwaul_global.macro = caterwaul_global.right_variadic(function (pattern, expander) {var new_pattern = this.ensure_pattern(pattern), new_expander = this.ensure_expander(expander);
return se(function (tree) {var match = new_pattern.call(this, tree); return match && new_expander.call(this, match)},
function () {this.pattern = pattern, this.expander = expander})});
Macroexpander logic.
This behaves just like the pre-1.0 macroexpander, except that the patterns and expanders are now fused. The macro functions are also evaluated under a different context; rather than being
bound to the caterwaul function they came from, they are bound to a context object that gives them a way to re-expand stuff under the same set of macros. It also provides the caterwaul
function that is performing the expansion. (Though you shouldn't modify the macro list from inside a macro -- this pre-1.0 feature is now removed.)
Just like previous versions of caterwaul the macros are matched last-to-first. This means that the /last/ matching macro is used, allowing you to easily override stuff. Also, the
macroexpand() function takes optional extra parameters; these are either macros or arrays of macros to be added to the macro list stored on the caterwaul function.
caterwaul_global.macroexpand = function (tree) {for (var macros = arguments.length ? [].concat(this._macros || []) : this._macros || [], i = 1, l = arguments.length, x; i < l; ++i)
(x = arguments[i]) instanceof Array ? macros.push.apply(macros, x) : macros.push(x);
var context = {caterwaul: this, macros: macros, expand: function (tree) {
return tree.rmap(function (node) {
for (var new_node = null, i = macros.length - 1; i >= 0; --i) if (new_node = macros[i].call(context, node)) return new_node})}};
return context.expand(this.ensure_syntax(tree))};
__
meta::sdoc('js::core/caterwaul.parser', <<'__');
Parsing.
There are two distinct parts to parsing Javascript. One is parsing the irregular statement-mode expressions such as 'if (condition) {...}' and 'function f(x) {...}'; the other is parsing
expression-mode stuff like arithmetic operators. In Rebase I tried to model everything as an expression, but that failed sometimes because it required that each operator have fixed arity. In
particular this was infeasible for keywords such as 'break', 'continue', 'return', and some others (any of these can be nullary or unary). It also involved creating a bizarre hack for 'case
x:' inside a switch block. This hack made the expression passed in to 'case' unavailable, as it would be buried in a ':' node.
Caterwaul fixes these problems by using a proper context-free grammar. However, it's much looser than most grammars because it doesn't need to validate anything. Correspondingly, it can be
much faster as well. Instead of guessing and backtracking as a recursive-descent parser would, it classifies many different branches into the same basic structure and fills in the blanks. One
example of this is the () {} pair, which occurs in a bunch of different constructs, including function () {}, if () {}, for () {}, etc. In fact, any time a () group is followed by a {} group
we can grab the token that precedes () (along with perhaps one more in the case of function f () {}), and group that under whichever keyword is responsible.
Syntax folding.
The first thing to happen is that parenthetical, square bracket, and braced groups are folded up. This happens in a single pass that is linear in the number of tokens, and other foldable
tokens (including unary and binary operators) are indexed by associativity. The following pass runs through these indexes from high to low precedence and folds tokens into trees. By this
point all of the parentheticals have been replaced by proper nodes (here I include ?: groups in parentheticals, since they behave the same way). Finally, high-level rules are applied to the
remaining keywords, which are bound last. This forms a complete parse tree.
Doing all of this efficiently requires a linked list rather than an array. This gets built during the initial paren grouping stage. Arrays are used for the indexes, which are left-to-right
and are later processed in the order indicated by the operator associativity. That is, left-associative operators are processed 0 .. n and right associative are processed n .. 0. Keywords
are categorized by behavior and folded after all of the other operators. Semicolons are folded last, from left to right.
There are some corner cases due to Javascript's questionable heritage from C-style syntax. For example, most constructs take either syntax blocks or semicolon-delimited statements. Ideally,
else, while, and catch are associated with their containing if, do, and try blocks, respectively. This can be done easily, as the syntax is folded right-to-left. Another corner case would
come up if there were any binary operators with equal precedence and different associativity. Javascript doesn't have them however, and it wouldn't make much sense to; it would render
expressions such as 'a op1 b op2 c' ambiguous if op1 and op2 shared precedence but each wanted to bind first. (I mention this because at first I was worried about it, but now I realize it
isn't an issue.)
Notationally (for easier processing later on), a distinction is made between invocation and grouping, and between dereferencing and array literals. Dereferencing and function invocation are
placed into their own operators, where the left-hand side is the thing being invoked or dereferenced and the right-hand side is the paren-group or bracket-group that is responsible for the
operation. Also, commas inside these groups are flattened into a single variadic (possibly nullary) comma node so that you don't have to worry about the tree structure. This is the case for
all left-associative operators; right-associative operators preserve their hierarchical folding.
Parse/lex shared logic.
Lexing Javascript is not entirely straightforward, primarily because of regular expression literals. The first implementation of the lexer got things right 99% of the time by inferring the
role of a / by its preceding token. The problem comes in when you have a case like this:
| if (condition) /foo/.test(x)
In this case, (condition) will be incorrectly inferred to be a regular expression (since the close-paren terminates an expression, usually), and /foo/ will be interpreted as division by foo.
We mark the position before a token and then just increment the position. The token, then, can be retrieved by taking a substring from the mark to the position. This eliminates the need for
intermediate concatenations. In a couple of cases I've gone ahead and done them anyway -- these are for operators, where we grab the longest contiguous substring that is defined. I'm not too
worried about the O(n^2) complexity due to concatenation; they're bounded by four characters.
OK, so why use charAt() instead of regular expressions? It's a matter of asymptotic performance. V8 implements great regular expressions (O(1) in the match length for the (.*)$ pattern), but
the substring() method is O(n) in the number of characters returned. Firefox implements O(1) substring() but O(n) regular expression matching. Since there are O(n) tokens per document of n
characters, any O(n) step makes lexing quadratic. So I have to use the only reliably constant-time method provided by strings, charAt() (or in this case, charCodeAt()).
Of course, building strings via concatenation is also O(n^2), so I also avoid that for any strings that could be long. This is achieved by using a mark to indicate where the substring
begins, and advancing i independently. The span between mark and i is the substring that will be selected, and since each substring both requires O(n) time and consumes n characters, the
lexer as a whole is O(n). (Though perhaps with a large constant.)
Parse function.
As mentioned earlier, the parser and lexer aren't distinct. The lexer does most of the heavy lifting; it matches parens and brackets, arranges tokens into a hierarchical linked list, and
provides an index of those tokens by their fold order. It does all of this by streaming tokens into a micro-parser whose language is grouping and that knows about the oddities required to
handle regular expression cases. In the same function, though as a distinct case, the operators are folded and the syntax is compiled into a coherent tree form.
The input to the parse function can be anything whose toString() produces valid Javascript code.
caterwaul_global.parse = function (input) {
Lex variables.
s, obviously, is the string being lexed. mark indicates the position of the stream, while i is used for lookahead. The difference is later read into a token and pushed onto the result. c
is a temporary value used to store the current character code. re is true iff a slash would begin a regular expression. esc is a flag indicating whether the next character in a string or
regular expression literal is escaped. exp indicates whether we've seen the exponent marker in a number. close is used for parsing single and double quoted strings; it contains the
character code of the closing quotation mark. t is the token to be processed.
Parse variables.
grouping_stack and gs_top are used for paren/brace/etc. matching. head and parent mark two locations in the linked syntax tree; when a new group is created, parent points to the opener
(i.e. (, [, ?, or {), while head points to the most recently added child. (Hence the somewhat complex logic in push().) indexes[] determines reduction order, and contains references to the
nodes in the order in which they should be folded. invocation_nodes is an index of the nodes that will later need to be flattened.
The push() function manages the mechanics of adding a node to the initial linked structure. There are a few cases here; one is when we've just created a paren group and have no 'head'
node; in this case we append the node as 'head'. Another case is when 'head' exists; in that case we update head to be the new node, which gets added as a sibling of the old head.
var s = input.toString(), mark = 0, c = 0, re = true, esc = false, dot = false, exp = false, close = 0, t = '', i = 0, l = s.length, cs = function (i) {return s.charCodeAt(i)},
grouping_stack = [], gs_top = null, head = null, parent = null, indexes = map(function () {return []}, parse_reduce_order), invocation_nodes = [], all_nodes = [empty],
new_node = function (n) {return all_nodes.push(n), n}, push = function (n) {return head ? head._sibling(head = n) : (head = n._append_to(parent)), new_node(n)},
syntax_node = this.syntax;
Main lex loop.
This loop takes care of reading all of the tokens in the input stream. At the end, we'll have a linked node structure with paren groups. At the beginning, we set the mark to the current
position (we'll be incrementing i as we read characters), munch whitespace, and reset flags.
while ((mark = i) < l) {
while (lex_space[c = cs(i)] && i < l) mark = ++i;
esc = exp = dot = t = false;
Miscellaneous lexing.
This includes bracket resetting (the top case, where an open-bracket of any sort triggers regexp mode) and comment removal. Both line and block comments are removed by comparing against
lex_slash, which represents /, and lex_star, which represents *.
if (lex_bracket[c]) {t = !! ++i; re = lex_opener[c]}
else if (c === lex_slash && cs(i + 1) === lex_star && (i += 2)) {while (++i < l && cs(i) !== lex_slash || cs(i - 1) !== lex_star); t = ! ++i}
else if (c === lex_slash && cs(i + 1) === lex_slash) {while (++i < l && ! lex_eol[cs(i)]); t = false}
Regexp and string literal lexing.
These both take more or less the same form. The idea is that we have an opening delimiter, which can be ", ', or /; and we look for a closing delimiter that follows. It is syntactically
illegal for a string to occur anywhere that a slash would indicate division (and it is also illegal to follow a string literal with extra characters), so reusing the regular expression
logic for strings is not a problem. (This follows because we know ahead of time that the Javascript is valid.)
else if (lex_quote[c] && (close = c) && re && ! (re = ! (t = s.charAt(i)))) {while (++i < l && (c = cs(i)) !== close || esc) esc = ! esc && c === lex_back;
while (++i < l && lex_regexp_suffix[cs(i)]) ; t = true}
Numeric literal lexing.
This is far more complex than the above cases. Numbers have several different formats, each of which requires some custom logic. The reason we need to parse numbers so exactly is that it
influences how the rest of the stream is lexed. One example is '0.5.toString()', which is perfectly valid Javascript. What must be output here, though, is '0.5', '.', 'toString', '(',
')'; so we have to keep track of the fact that we've seen one dot and stop lexing the number on the second.
Another case is exponent-notation: 3.0e10. The hard part here is that it's legal to put a + or - on the exponent, which normally terminates a number. Luckily we can safely skip over any
character that comes directly after an E or e (so long as we're really in exponent mode, which I'll get to momentarily), since there must be at least one digit after an exponent.
The final case, which restricts the logic somewhat, is hexadecimal numbers. These also contain the characters 'e' and 'E', but we cannot safely skip over the following character, and any
decimal point terminates the number (since '0x5.toString()' is also valid Javascript). The same follows for octal numbers; the leading zero indicates that there will be no decimal point,
which changes the lex mode (for example, '0644.toString()' is valid).
So, all this said, there are different logic branches here. One handles guaranteed integer cases such as hex/octal, and the other handles regular numbers. The first branch is triggered
whenever a number starts with zero and is followed by 'x' or a digit (for conciseness I call 'x' a digit), and the second case is triggered when '.' is followed by a digit, or when a
digit starts.
A trivial change, using regular expressions, would reduce this logic significantly. I chose to write it out longhand because (1) it's more fun that way, and (2) the regular expression
approach has theoretically quadratic time in the length of the numbers, whereas this approach keeps things linear. Whether or not that actually makes a difference I have no idea.
Finally, in response to a recently discovered failure case, a period must be followed by a digit if it starts a number. The failure is the string '.end', which will be lexed as '.en',
'd' if it is assumed to be a floating-point number. (In fact, any method or property beginning with 'e' will cause this problem.)
else if (c === lex_zero && lex_integer[cs(i + 1)]) {while (++i < l && lex_integer[cs(i)]); re = ! (t = true)}
else if (lex_float[c] && (c !== lex_dot || lex_decimal[cs(i + 1)])) {while (++i < l && (lex_decimal[c = cs(i)] || (dot ^ (dot |= c === lex_dot)) || (exp ^ (exp |= lex_exp[c] && ++i))));
while (i < l && lex_decimal[cs(i)]) ++i; re = ! (t = true)}
Operator lexing.
The 're' flag is reused here. Some operators have both unary and binary modes, and as a heuristic (which happens to be accurate) we can assume that anytime we expect a regular
expression, a unary operator is intended. The only exception are ++ and --, which are always unary but sometimes are prefix and other times are postfix. If re is true, then the prefix
form is intended; otherwise, it is postfix. For this reason I've listed both '++' and 'u++' (same for --) in the operator tables; the lexer is actually doing more than its job here by
identifying the variants of these operators.
The only exception to the regular logic happens if the operator is postfix-unary. (e.g. ++, --.) If so, then the re flag must remain false, since expressions like 'x++ / 4' can be valid.
else if (lex_punct[c] && (t = re ? 'u' : '', re = true)) {while (i < l && lex_punct[cs(i)] && has(lex_op, t + s.charAt(i))) t += s.charAt(i++); re = ! has(lex_postfix_unary, t)}
Identifier lexing.
If nothing else matches, then the token is lexed as a regular identifier or Javascript keyword. The 're' flag is set depending on whether the keyword expects a value. The nuance here is
that you could write 'x / 5', and it is obvious that the / means division. But if you wrote 'return / 5', the / would be a regexp delimiter because return is an operator, not a value. So
at the very end, in addition to assigning t, we also set the re flag if the word turns out to be an operator.
else {while (++i < l && lex_ident[cs(i)]); re = has(lex_op, t = s.substring(mark, i))}
Token unification.
t will contain true, false, or a string. If false, no token was lexed; this happens when we read a comment, for example. If true, the substring method should be used. (It's a shorthand to
avoid duplicated logic.) For reasons that are not entirely intuitive, the lexer sometimes produces the artifact 'u;'. This is never useful, so I have a case dedicated to removing it.
if (i === mark) throw new Error('Caterwaul lex error at "' + s.substr(mark, 40) + '" with leading context "' + s.substr(mark - 40, 40) + '" (probably a Caterwaul bug)');
if (t === false) continue;
t = t === true ? s.substring(mark, i) : t === 'u;' ? ';' : t;
Grouping and operator indexing.
Now that we have a token, we need to see whether it affects grouping status. There are a couple of possibilities. If it's an opener, then we create a new group; if it's a matching closer
then we close the current group and pop out one layer. (We don't check for matching here. Any code provided to Caterwaul will already have been parsed by the host Javascript interpreter,
so we know that it is valid.)
All operator indexing is done uniformly, left-to-right. Note that the indexing isn't strictly by operator. It's by reduction order, which is arguably more important. That's what the
parse_inverse_order table does: it maps operator names to parse_reduce_order subscripts. (e.g. 'new' -> 2.)
t === gs_top ? (grouping_stack.pop(), gs_top = grouping_stack[grouping_stack.length - 1], head = head ? head.p : parent, parent = null) :
(has(parse_group, t) ? (grouping_stack.push(gs_top = parse_group[t]), parent = push(new_node(new syntax_node(t))), head = null) : push(new_node(new syntax_node(t))),
has(parse_inverse_order, t) && indexes[parse_inverse_order[t]].push(head || parent)); // <- This is where the indexing happens
Regexp flag special cases.
Normally a () group wraps an expression, so a following / would indicate division. The only exception to this is when we have a block construct; in this case, the next token appears in
statement-mode, which means that it begins, not modifies, a value. We'll know that we have such a case if (1) the immediately-preceding token is a close-paren, and (2) a block-accepting
syntactic form occurs to its left.
With all this trouble over regular expressions, I had to wonder whether it was possible to do it more cleanly. I don't think it is, unfortunately. Even lexing the stream backwards fails
to resolve the ambiguity:
| for (var k in foo) /foo/g.test(k) && bar();
In this case we won't know it's a regexp until we hit the 'for' keyword (or perhaps 'var', if we're being clever -- but a 'with' or 'if' would require complete lookahead). A perfectly
valid alternative parse, minus the 'for' and 'var', is this:
| ((k in foo) / (foo) / (g.test(k))) && bar();
The only case where reverse-lexing is useful is when the regexp has no modifiers.
re |= t === ')' && head.l && has(parse_r_until_block, head.l.data)}
Operator fold loop.
This is the second major part of the parser. Now that we've completed the lex process, we can fold operators and syntax, and take care of some exception cases.
First step: functions, calls, dots, and dereferences.
I'm treating this differently from the generalized operator folding because of the syntactic inference required for call and dereference detection. Nothing has been folded at this point
(with the exception of paren groups, which is appropriate), so if the node to the left of any ( or [ group is an operator, then the ( or [ is really a paren group or array literal. If, on
the other hand, it is another value, then the group is a function call or a dereference. This folding goes left-to-right. The reason we also process dot operators is that they share the same
precedence as calls and dereferences. Here's what a () or [] transform looks like:
| quux <--> foo <--> ( <--> bar quux <--> () <--> bar
\ / \ <-- This can be done by saying _.l.wrap(new node('()')).p.fold_r().
bif <--> , <--> baz --> foo ( _.l.wrap() returns l again, .p gets the wrapping node, and fold_r adds a child to it.
\
bif <--> , <--> baz
This is actually merged into the for loop below, even though it happens before other steps do (see 'Ambiguous parse groups').
Second step: fold operators.
Now we can go through the list of operators, folding each according to precedence and associativity. Highest to lowest precedence here, which is just going forwards through the indexes[]
array. The parse_index_forward[] array indicates which indexes should be run left-to-right and which should go right-to-left.
for (var i = 0, l = indexes.length, forward, _; _ = indexes[i], forward = parse_index_forward[i], i < l; ++i)
for (var j = forward ? 0 : _.length - 1, lj = _.length, inc = forward ? 1 : -1, node, data; node = _[j], data = node && node.data, forward ? j < lj : j >= 0; j += inc)
Binary node behavior.
The most common behavior is binary binding. This is the usual case for operators such as '+' or ',' -- they grab one or both of their immediate siblings regardless of what they are.
Operators in this class are considered to be 'fold_lr'; that is, they fold first their left sibling, then their right.
if (has(parse_lr, data)) node._fold_lr();
Ambiguous parse groups.
As mentioned above, we need to determine whether grouping constructs are invocations or real groups. This happens to take place before other operators are parsed (which is good -- that way
it reflects the precedence of dereferencing and invocation). The only change we need to make is to discard the explicit parenthetical or square-bracket grouping for invocations or
dereferences, respectively. It doesn't make much sense to have a doubly-nested structure, where we have a node for invocation and another for the group on the right-hand side of that
invocation. Better is to modify the group in-place to represent an invocation.
We can't solve this problem here, but we can solve it after the parse has finished. I'm pushing these invocation nodes onto an index for the end.
else if (has(parse_ambiguous_group, data) && node.l && (node.l.data === '.' ||
! (has(lex_op, node.l.data) || has(parse_not_a_value, node.l.data)))) invocation_nodes.push(node.l._wrap(new_node(new syntax_node(data + parse_group[data]))).p._fold_r());
Unary left and right-fold behavior.
Unary nodes have different fold directions. In this case, it just determines which side we grab the node from. I'm glad that Javascript doesn't allow stuff like '++x++', which would make
the logic here actually matter. Because there isn't that pathological case, exact rigidity isn't required.
else if (has(parse_l, data)) node._fold_l();
else if (has(parse_r, data)) node._fold_r();
Ternary operator behavior.
This is kind of interesting. If we have a ternary operator, then it will be treated first as a group; just like parentheses, for example. This is the case because the ternary syntax is
unambiguous for things in the middle. So, for example, '3 ? 4 : 5' initially parses out as a '?' node whose child is '4'. Its siblings are '3' and '5', so folding left and right is an
obvious requirement. The only problem is that the children will be in the wrong order. Instead of (3) (4) (5), we'll have (4) (3) (5). So after folding, we do a quick swap of the first two
to set the ordering straight.
else if (has(parse_ternary, data)) {node._fold_lr(); var temp = node[1]; node[1] = node[0]; node[0] = temp}
Grab-until-block behavior.
Not quite as simple as it sounds. This is used for constructs such as 'if', 'function', etc. Each of these constructs takes the form '