#!/usr/bin/perl
# Run perldoc on this file for documentation.
# For the benefit of HTML viewers:
#   This is a paragraph...
#   This is another paragraph...
#   int main () {return 0;}
#   int main () {return 0}    // Won't compile
# (SDoc self-rendering helpers: $quoted, $paragraph, and $section emit the HTML markup for quoted blocks, paragraphs, and section headings.)
Text wrapping inside <pre> elements.
A great article about this (and its cross-browser ramifications) is at http://www.longren.org/2006/09/27/wrapping-text-inside-pre-tags/.
#tutorial-page pre {white-space: pre-wrap; word-wrap: break-word}
SDoc page styling.
This is optimized for long lines and lots of text.
#sdoc-page {padding-bottom: 100px; color: white; position: absolute; display: none}
#sdoc-page a.back {font-size: 16pt; color: #999; display: block; text-transform: lowercase; text-decoration: none}
#sdoc-page a.back:before {content: '<< '; color: #444}
#sdoc-page a.back:hover {color: #ccc}
#sdoc-page a.back:hover:before {content: '<< '; color: #fa4}
#sdoc-page .file > h1 {color: #999; cursor: pointer; font-weight: normal; font-size: 16pt; white-space: nowrap; word-wrap: normal}
#sdoc-page .file > h1 .path {color: #444}
#sdoc-page .file > h1 .extension {display: none; color: #444}
#sdoc-page .file > h1:hover .path {color: #ccc}
#sdoc-page .file > h1:hover .extension {display: none; color: #ccc}
#sdoc-page .file > h1:hover {color: #ccc}
#sdoc-page .file > h1:after {content: ' >>'; color: #444}
#sdoc-page .file > h1:hover:after {content: ' >>'; color: #fa4}
#sdoc-page .section {margin-top: 50px}
#sdoc-page .section h1:before, #sdoc-page .section h2:before, #sdoc-page .section h3:before {content: '< '; color: #fa4}
#sdoc-page .section h1:after, #sdoc-page .section h2:after, #sdoc-page .section h3:after {content: ' >'; color: #fa4}
#sdoc-page .section h4:before {content: '> '; color: #fa4}
#sdoc-page .section h1 {font-size: 16pt}
#sdoc-page .section h2 {font-size: 13pt}
#sdoc-page .section h3 {font-size: 11pt}
#sdoc-page .section h4 {font-size: 10pt}
#sdoc-page .section h1 {text-transform: lowercase; color: #999; font-weight: normal; border-bottom: solid 4px #222}
#sdoc-page .section h2 {text-transform: lowercase; color: #999; font-weight: normal; border-bottom: solid 4px #222}
#sdoc-page .section h3 {text-transform: lowercase; color: #999; font-weight: normal}
#sdoc-page .section h4 {text-transform: lowercase; color: #999; font-weight: normal}
#sdoc-page p {color: #eee; font-family: 'Rosario', sans-serif; font-size: 10pt; line-height: 2.1em; max-width: 500px; text-align: justify}
#sdoc-page pre.code {border: solid 1px #333; color: white; font-size: 10pt; font-family: 'Droid Sans Mono', monospace; padding: 4px; background: black; white-space: pre; word-wrap: normal}
#sdoc-page pre.code {line-height: 1.8em}
Tutorial page styling.
These styles, while they should be more or less consistent across pages, are designed specifically for the tutorial.
#tutorial-page {width: 500px; padding-bottom: 100px; padding-right: 200px; padding-left: 50px; position: relative}
#tutorial-page p {font-family: 'Rosario', sans-serif; font-size: 11pt}
#tutorial-page p a {color: #888; text-decoration: none}
#tutorial-page p a:before {content: '['; color: #444}
#tutorial-page p a:after {content: ']'; color: #444}
#tutorial-page p a:hover {color: #eee}
#tutorial-page p a:hover:before {content: '['; color: #fa4}
#tutorial-page p a:hover:after {content: ']'; color: #fa4}
#tutorial-page .toc h1, #tutorial-page .toc h2, #tutorial-page .toc h3, #tutorial-page .toc h4
{color: #999; cursor: pointer; font-weight: normal; white-space: nowrap; word-wrap: normal; text-transform: lowercase}
#tutorial-page .popdown {position: fixed; top: 0; left: 50px; border-bottom: solid 1px #444; padding-bottom: 2px}
#tutorial-page .popdown:hover {border-bottom: solid 1px #fa4; padding-bottom: 10px}
#tutorial-page .popdown.open {background: rgba(0, 0, 0, 0.9); left: 40px; border-bottom: solid 4px #fa4; padding-bottom: 10px; padding-left: 10px; padding-right: 10px}
#tutorial-page .popdown .label {font-size: 14pt; color: #888; text-transform: lowercase; cursor: pointer}
#tutorial-page .popdown .label:hover {color: #eee}
#tutorial-page .popdown .label:before {content: '< '; color: #444}
#tutorial-page .popdown .label:after {content: ' >'; color: #444}
#tutorial-page .popdown:hover .label:before {content: '< '; color: #fa4}
#tutorial-page .popdown:hover .label:after {content: ' >'; color: #fa4}
#tutorial-page .popdown .contents {display: none}
#tutorial-page .popdown.open .contents {display: block; overflow-y: auto; overflow-x: hidden; max-height: 400px}
#tutorial-page .toc h1:hover, #tutorial-page .toc h2:hover, #tutorial-page .toc h3:hover, #tutorial-page .toc h4:hover {color: #eee}
#tutorial-page .toc h1:after, #tutorial-page .toc h2:after, #tutorial-page .toc h3:after, #tutorial-page .toc h4:after {content: ' >>'; color: #888}
#tutorial-page .toc h1:hover:after, #tutorial-page .toc h2:hover:after, #tutorial-page .toc h3:hover:after, #tutorial-page .toc h4:hover:after {content: ' >>'; color: #fa4}
#tutorial-page .toc h1 {font-size: 16pt}
#tutorial-page .toc h2 {font-size: 13pt; padding-left: 20px}
#tutorial-page .toc h3 {font-size: 11pt; padding-left: 40px}
#tutorial-page .toc h4 {font-size: 10pt; padding-left: 60px}
#tutorial-page .section h1:before, #tutorial-page .section h2:before, #tutorial-page .section h3:before {content: '< '; color: #fa4}
#tutorial-page .section h1:after, #tutorial-page .section h2:after, #tutorial-page .section h3:after {content: ' >'; color: #fa4}
#tutorial-page .shell {position: fixed; border-radius: 0px; right: 50px; top: 0; bottom: 0; left: 600px; border: solid 2px #222; border-width: 0 1px; overflow-y: auto; overflow-x: hidden}
#tutorial-page .shell {font-family: 'Droid Sans Mono', monospace; font-size: 10pt; color: white; background: rgba(0, 0, 0, 0.9)}
#tutorial-page .shell .prompt > span {margin: 4px}
#tutorial-page .shell .shadow {color: #888; display: block; margin: 4px; padding: 4px 0}
#tutorial-page .shell .shadow:before {color: #752; content: '>'; padding-right: 4px}
#tutorial-page .shell .shadow:hover:before {color: #888; content: 'compiled: '; padding-right: 4px}
#tutorial-page .shell .input {font-family: 'Droid Sans Mono', monospace; padding: 0; margin: 0; border: none !important; outline: none !important}
#tutorial-page .shell .input {font-size: 10pt; background: transparent; color: white}
#tutorial-page .shell .input:focus {border: none !important; outline: none !important}
#tutorial-page .shell .history {position: relative}
#tutorial-page .shell .history pre {font-family: 'Droid Sans Mono', monospace; font-size: 10pt}
#tutorial-page .shell .history .entry, .shell .history .result, .shell .history .error, .shell .history .log {margin: 4px}
#tutorial-page .shell .history .entry {color: white}
#tutorial-page .shell .history .entry .command {margin-left: 4px}
#tutorial-page .shell .history .result {color: #7bf}
#tutorial-page .shell .history .log {color: #7fb}
#tutorial-page .shell .history .error {color: #f87}
#tutorial-page .shell .history .log:hover:before {color: #888; content: 'log: '}
#tutorial-page .shell .sandbox {font-family: 'Neuton', 'Garamond', serif; background: #222; color: #eee; padding: 10px; margin-top: 4px}
#tutorial-page #seq-decipher {padding: 4px 0; margin: 20px 0}
#tutorial-page #seq-decipher input {border: solid 1px #444; background: black; padding: 4px; font-family: 'Droid Sans Mono', monospace; color: white; outline: none !important}
#tutorial-page #seq-decipher table.result {font-size: 9pt}
#tutorial-page #seq-decipher td.name {text-transform: lowercase; color: #aaa}
#tutorial-page #seq-decipher td.fragment {font-family: 'Droid Sans Mono', monospace; color: #fa4}
#tutorial-page #seq-decipher tr.variables td.desc {font-family: 'Droid Sans Mono', monospace}
#tutorial-page .section h1, #tutorial-page .section h2, #tutorial-page .section h3, #tutorial-page .section h4 {padding-top: 50px}
#tutorial-page .section h1 {font-size: 16pt}
#tutorial-page .section h2 {font-size: 13pt}
#tutorial-page .section h3 {font-size: 11pt}
#tutorial-page .section h1 {text-transform: lowercase; color: #999; font-weight: normal; border-bottom: solid 4px #222}
#tutorial-page .section h2 {text-transform: lowercase; color: #999; font-weight: normal; border-bottom: solid 4px #222}
#tutorial-page .section h3 {text-transform: lowercase; color: #999; font-weight: normal}
#tutorial-page .section h4 {text-transform: lowercase; color: #999; font-weight: normal}
#tutorial-page p {color: #eee; font-size: 10pt; line-height: 2.1em; text-align: justify}
#tutorial-page code {color: #fff; font-size: 10pt; font-family: 'Droid Sans Mono', monospace; background: black; padding: 4px; border: solid 1px #333}
#tutorial-page pre {color: #fff; font-size: 10pt; font-family: 'Droid Sans Mono', monospace; background: black; padding: 4px; border: solid 1px #333}
#tutorial-page pre.code {cursor: pointer; padding: 10px 4px}
#tutorial-page pre.code:before {content: '> '; color: #fa4}
#tutorial-page pre.code:hover {background: #222; color: #fa4}
__
meta::sdoc('js::caterwaul', <<'__');
Caterwaul JS | Spencer Tipping
Licensed under the terms of the MIT source code license
Introduction.
Caterwaul is a Javascript-to-Javascript compiler. Visit http://caterwauljs.org for information about how and why you might use it.
(function (f) {return f(f)})(function (initializer, key, undefined) {
Utility methods.
Utility functions here are:
| 1. qw       Splits a string into space-separated words and returns an array of the results. This is a Perl idiom that's really useful when writing lists of things.
  2. se       Side-effects on a value and returns the value.
  3. fail     Throws an error. This isn't particularly special except for the fact that the keyword 'throw' can't be used in expression context.
  4. gensym   Generates a string that will never have been seen before.
  5. bind     Fixes 'this' inside the function being bound. This is a common Javascript idiom, but is reimplemented here because we don't know which other libraries are available.
  6. map      Maps a function over an array-like object and returns an array of the results.
  7. rmap     Recursively maps a function over arrays.
  8. hash     Takes a string, splits it into words, and returns a hash mapping each of those words to true. This is used to construct sets.
  9. merge    Takes an object and one or more extensions, and copies all properties from each extension onto the object. Returns the object.
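As a quick illustrative sketch (hypothetical values; the actual definitions appear just below), a few of these behave roughly like this:
| qw('foo bar bif')               // -> ['foo', 'bar', 'bif']
  hash('foo bar bif')             // -> {foo: true, bar: true, bif: true}, plus a key-length annotation (see 'Optimizations' below)
  merge({a: 1}, {b: 2}, {c: 3})   // -> {a: 1, b: 2, c: 3}; the first argument is modified and returned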
Side-effecting is used to initialize things statefully; for example:
| return se(function () {return 5}, function (f) {
f.sourceCode = 'return 5';
});
Gensyms are unique identifiers that end with high-entropy noise that won't appear in the source being compiled. The general format of a gensym is name_count_suffix, where 'name' is provided by
whoever requested the gensym (this allows gensyms to be more readable), 'count' is a base-36 number that is incremented with each gensym, and 'suffix' is a constant base-64 string containing
at least 128 bits of entropy. (Each base-64 character encodes 6 bits, so the suffix is 22 characters long.)
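As an illustrative sketch (the suffix below is made up; the real one is generated randomly when the library is loaded):
| gensym('foo')                                // -> 'foo_1_cLbGxxEEeqFME9Qc$_a7Fo'
  gensym('foo')                                // -> 'foo_2_cLbGxxEEeqFME9Qc$_a7Fo'
  is_gensym('foo_2_cLbGxxEEeqFME9Qc$_a7Fo')    // -> true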
var qw = function (x) {return x.split(/\s+/)}, se = function (x, f) {return f && f.call(x, x) || x}, fail = function (m) {throw new Error(m)},
unique = key || (function () {for (var xs = [], d = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789$_', i = 21, n; i >= 0; --i) xs.push(d.charAt(Math.random() * 64 >>> 0));
return xs.join('')})(),
gensym = (function (c) {return function (name) {return [name || '', (++c).toString(36), unique].join('_')}})(0), is_gensym = function (s) {return s.substr(s.length - 22) === unique},
bind = function (f, t) {return function () {return f.apply(t, arguments)}},
map = function (f, xs) {for (var i = 0, ys = [], l = xs.length; i < l; ++i) ys.push(f(xs[i], i)); return ys},
rmap = function (f, xs) {return map(function (x) {return x instanceof Array ? rmap(f, x) : f(x)}, xs)},
hash = function (s) {for (var i = 0, xs = qw(s), o = {}, l = xs.length; i < l; ++i) o[xs[i]] = true; return annotate_keys(o)},
The merge() function is compromised for the sake of Internet Explorer, which contains a bug-ridden and otherwise horrible implementation of Javascript. The problem is that, due to a bug in
hasOwnProperty and DontEnum within JScript, these two expressions are evaluated incorrectly:
| for (var k in {toString: 5}) alert(k); // no alert on IE
({toString: 5}).hasOwnProperty('toString') // false on IE
To compensate, merge() manually copies toString if it is present on the extension object.
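A quick sketch of the behavior this buys us (illustrative only):
| var o = merge({}, {toString: function () {return 'hi'}});
  o.toString()    // -> 'hi', even on IE, where a plain for..in copy would silently skip toString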
merge = (function (o) {for (var k in o) if (o.hasOwnProperty(k)) return true})({toString: true}) ?
// hasOwnProperty, and presumably iteration, both work, so we use the sensible implementation of merge():
function (o) {for (var i = 1, l = arguments.length, _; i < l; ++i) if (_ = arguments[i]) for (var k in _) if (has(_, k)) o[k] = _[k]; return o} :
// hasOwnProperty, and possibly iteration, both fail, so we hack around the problem with this gem:
function (o) {for (var i = 1, l = arguments.length, _; i < l; ++i)
if (_ = arguments[i]) {for (var k in _) if (has(_, k)) o[k] = _[k];
if (_.toString && ! /\[native code\]/.test(_.toString.toString())) o.toString = _.toString} return o},
Optimizations.
The parser and lexer each assume valid input and do no validation. This is possible because any function passed in to caterwaul will already have been parsed by the Javascript interpreter;
syntax errors would have caused an error there. This enables a bunch of optimization opportunities in the parser, ultimately making it entirely non-recursive and requiring only three
linear-time passes over the token stream. (An approximate figure; it actually does about 19 fractional passes, but not all nodes are reached.)
Also, I'm not confident that all Javascript interpreters are smart about hash indexing. Particularly, suppose a hashtable has 10 entries, the longest of whose keys is 5 characters. If we
throw a 2K string at it, it might very well hash that whole thing just to find that, surprise, the entry doesn't exist. That's a big performance hit if it happens very often. To prevent this
kind of thing, I'm keeping track of the longest string in the hashtable by using the 'annotate_keys' function. 'has()' knows how to look up the maximum length of a hashtable to verify that
the candidate is in it, resulting in the key lookup being only O(n) in the longest key (generally this ends up being nearly O(1), since I don't like to type long keys), and average-case O(1)
regardless of the length of the candidate.
As of Caterwaul 0.7.0 the _max_length property has been replaced by a gensym. This basically guarantees uniqueness, so the various hacks associated with working around the existence of the
special _max_length key are no longer necessary.
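For example (an illustrative sketch; annotate_keys and has are defined just below):
| var table = annotate_keys({foo: true, barbif: true});    // the longest key is 6 characters
  has(table, 'foo')                                        // -> true
  has(table, 'a candidate far longer than any key')        // -> false, rejected by the length check before any hashing happens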
max_length_key = gensym('hash'),
annotate_keys = function (o) {var max = 0; for (var k in o) own.call(o, k) && (max = k.length > max ? k.length : max); o[max_length_key] = max; return o},
has = function (o, p) {return p != null && ! (p.length > o[max_length_key]) && own.call(o, p)}, own = Object.prototype.hasOwnProperty,
Global caterwaul variable.
Caterwaul creates a global symbol, caterwaul. Like jQuery, it provides a mechanism to get the original value of that symbol back if you don't want to replace it. You can call caterwaul.deglobalize() to return
caterwaul and restore the global that was there when Caterwaul was loaded (this might be useful in the unlikely event that someone else named their library Caterwaul). Note that deglobalize() is
available only on the global caterwaul() function.
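A sketch of how you might use this, assuming another library had already claimed the name:
| var c = caterwaul.deglobalize();    // the global 'caterwaul' now refers to whatever it was before this library loaded
  c.parse('x + y');                   // the library remains fully usable through the returned reference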
calls_init = function () {var f = function () {return f.init.apply(f, arguments)}; return f},
original_global = typeof caterwaul === 'undefined' ? undefined : caterwaul,
caterwaul_global = se(calls_init(), function () {this.deglobalize = function () {caterwaul = original_global; return caterwaul_global};
merge(this, {merge: merge, map: map, rmap: rmap, gensym: gensym, is_gensym: is_gensym})}),
Shared parser data.
This data is used both for parsing and for serialization, so it's made available to all pieces of caterwaul.
Precomputed table values.
The lexer uses several character lookups, which I've optimized by using integer->boolean arrays. The idea is that instead of using string membership checking or a hash lookup, we use the
character codes and index into a numerical array. This is guaranteed to be O(1) for any sensible implementation, and is probably the fastest JS way we can do this. For space efficiency,
only the low 256 characters are indexed. High characters will trigger sparse arrays, which may degrade performance. Also, this parser doesn't handle Unicode characters properly; it assumes
lower ASCII only.
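As an illustrative sketch of the table idea (lex_table is defined below):
| var is_digit = lex_table('0123456789');
  is_digit['7'.charCodeAt(0)]    // -> true
  is_digit['a'.charCodeAt(0)]    // -> false; every code point below 256 is prepopulated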
The lex_op table indicates which elements trigger regular expression mode. Elements that trigger this mode cause a following / to delimit a regular expression, whereas other elements would
cause a following / to indicate division. By the way, the operator ! must be in the table even though it is never used. The reason is that it is a substring of !==; without it, !== would
fail to parse.
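For example (illustrative):
| return /foo/.test(x)    // 'return' is in lex_op, so the following / begins a regular expression
  x / foo / y             // 'x' is not, so each / here is division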
lex_op = hash('. new ++ -- u++ u-- u+ u- typeof u~ u! ! * / % + - << >> >>> < > <= >= instanceof in == != === !== & ^ | && || ? = += -= *= /= %= &= |= ^= <<= >>= >>>= : , ' +
'return throw case var const break continue void else u; ;'),
lex_table = function (s) {for (var i = 0, xs = [false]; i < 8; ++i) xs.push.apply(xs, xs); for (var i = 0, l = s.length; i < l; ++i) xs[s.charCodeAt(i)] = true; return xs},
lex_float = lex_table('.0123456789'), lex_decimal = lex_table('0123456789'), lex_integer = lex_table('0123456789abcdefABCDEFx'), lex_exp = lex_table('eE'),
lex_space = lex_table(' \n\r\t'), lex_bracket = lex_table('()[]{}'), lex_opener = lex_table('([{'), lex_punct = lex_table('+-*/%&|^!~=<>?:;.,'),
lex_eol = lex_table('\n\r'), lex_regexp_suffix = lex_table('gims'), lex_quote = lex_table('\'"/'), lex_slash = '/'.charCodeAt(0),
lex_star = '*'.charCodeAt(0), lex_back = '\\'.charCodeAt(0), lex_x = 'x'.charCodeAt(0), lex_dot = '.'.charCodeAt(0),
lex_zero = '0'.charCodeAt(0), lex_postfix_unary = hash('++ --'), lex_ident = lex_table('$_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'),
Parse data.
The lexer and parser aren't entirely separate, nor can they be considering the complexity of Javascript's grammar. The lexer ends up grouping parens and identifying block constructs such
as 'if', 'for', 'while', and 'with'. The parser then folds operators and ends by folding these block-level constructs.
parse_reduce_order = map(hash, ['function', '( [ . [] ()', 'new delete', 'u++ u-- ++ -- typeof u~ u! u+ u-', '* / %', '+ -', '<< >> >>>', '< > <= >= instanceof in', '== != === !==', '&',
'^', '|', '&&', '||', 'case', '?', '= += -= *= /= %= &= |= ^= <<= >>= >>>=', ':', ',', 'return throw break continue void', 'var const',
'if else try catch finally for switch with while do', ';']),
parse_associates_right = hash('= += -= *= /= %= &= ^= |= <<= >>= >>>= ~ ! new typeof u+ u- -- ++ u-- u++ ? if else function try catch finally for switch case with while do'),
parse_inverse_order = (function (xs) {for (var o = {}, i = 0, l = xs.length; i < l; ++i) for (var k in xs[i]) has(xs[i], k) && (o[k] = i); return annotate_keys(o)})(parse_reduce_order),
parse_index_forward = (function (rs) {for (var xs = [], i = 0, l = rs.length, _ = null; _ = rs[i], xs[i] = true, i < l; ++i)
for (var k in _) if (has(_, k) && (xs[i] = xs[i] && ! has(parse_associates_right, k))) break; return xs})(parse_reduce_order),
parse_lr = hash('[] . () * / % + - << >> >>> < > <= >= instanceof in == != === !== & ^ | && || = += -= *= /= %= &= |= ^= <<= >>= >>>= , : ;'),
parse_r_until_block = annotate_keys({'function':2, 'if':1, 'do':1, 'catch':1, 'try':1, 'for':1, 'while':1, 'with':1, 'switch':1}),
parse_accepts = annotate_keys({'if':'else', 'do':'while', 'catch':'finally', 'try':'catch'}), parse_invocation = hash('[] ()'),
parse_r_optional = hash('return throw break continue else'), parse_r = hash('u+ u- u! u~ u++ u-- new typeof finally case var const void delete'),
parse_block = hash('; {'), parse_invisible = hash('i;'), parse_l = hash('++ --'), parse_group = annotate_keys({'(':')', '[':']', '{':'}', '?':':'}),
parse_ambiguous_group = hash('[ ('), parse_ternary = hash('?'), parse_not_a_value = hash('function if for while catch void delete new typeof in instanceof'),
parse_also_expression = hash('function'),
Syntax data structures.
There are two data structures used for syntax trees. At first, paren-groups are linked into doubly-linked lists, described below. These are then folded into immutable array-based specific
nodes. At the end of folding there is only one child per paren-group.
Doubly-linked paren-group lists.
When the token stream is grouped into paren groups it has a hierarchical linked structure that conceptually has these pointers:
|                        +--------+
             +---------- |  node  | ----------+
             |     +---> |        | <---+     |
       first |     |     +--------+     |     | last
             |     | parent      parent |     |
             V     |                    |     V
         +--------+                      +--------+
         |  node  |  ------- r ------->  |  node  |  --- r ---/
/--- l --|        |  <------ l --------  |        |
         +--------+                      +--------+
The primary operation performed on this tree, at least initially, is repeated folding. So we have a chain of linear nodes, and one by one certain nodes fold their siblings underneath them,
breaking the children's links and linking instead to the siblings' neighbors. For example, if we fold node (3) as a binary operator:
|  (1) <-> (2) <-> (3) <-> (4) <-> (5)            (1) <--> (3) <--> (5)
   / \     / \     / \     / \     / \    -->     / \      / \      / \
                                                          /   \
                                                        (2)   (4)     <- No link between children
                                                        / \   / \        (see 'Fold nodes', below)
Fold nodes.
Once a node has been folded (e.g. (3) in the diagram above), none of its children will change and it will gain no more children. The fact that none of its children will change can be shown
inductively: suppose you've decided to fold the '+' in 'x + y' (here x and y are arbitrary expressions). This means that x and y are composed of higher-precedence operators. Since there is
no second pass back to high-precedence operators, x and y will not change, nor will they interact with one another. The fact that a folded node never gains more children follows from the fact
that it is folded only once; this is by virtue of folding by index instead of by tree structure. (Though a good tree traversal algorithm also wouldn't hit the same node twice -- it's just
less obvious when the tree is changing.)
Anyway, the important thing about fold nodes is that their children don't change. This means that an array is a completely reasonable data structure to use for the children; it certainly
makes the structure simpler. It also means that the only new links that must be added to nodes as they are folded are links to new children (via the array), and links to the new siblings.
Once we have the array-form of fold nodes, we can build a query interface similar to jQuery, but designed for syntactic traversal. This will make routine operations such as macro
transformation and quasiquoting far simpler later on.
Both grouping and fold nodes are represented by the same data structure. In the case of grouping, the 'first' pointer is encoded as [0] -- that is, the first array element. It doesn't
contain pointers to siblings of [0]; these are still accessed by their 'l' and 'r' pointers. As the structure is folded, the number of children of each paren group should be reduced to just
one. At this point the remaining element's 'l' and 'r' pointers will both be null, which means that it is in hierarchical form instead of linked form.
After the tree has been fully generated and we have the root node, we have no further use for the parent pointers. This means that we can use subtree sharing to save memory. Once we're past
the fold stage, push() should be used instead of append(). append() works in a bidirectionally-linked tree context (much like the HTML DOM), whereas push() works like it does for arrays
(i.e. no parent pointer).
Syntax node functions.
These functions are common to various pieces of syntax nodes. Not all of them will always make sense, but the prototypes of the constructors can be modified independently later on if it
turns out to be an issue.
syntax_common = caterwaul_global.syntax_common = {
Mutability.
These functions let you modify nodes in-place. They're used during syntax folding and shouldn't really be used after that (hence the underscores).
_replace: function (n) {return (n.l = this.l) && (this.l.r = n), (n.r = this.r) && (this.r.l = n), this}, _append_to: function (n) {return n && n._append(this), this},
_reparent: function (n) {return this.p && this.p[0] === this && (this.p[0] = n), this}, _fold_l: function (n) {return this._append(this.l && this.l._unlink(this) || empty)},
_append: function (n) {return (this[this.length++] = n) && (n.p = this), this}, _fold_r: function (n) {return this._append(this.r && this.r._unlink(this) || empty)},
_sibling: function (n) {return n.p = this.p, (this.r = n).l = this}, _fold_lr: function () {return this._fold_l()._fold_r()},
_fold_rr: function () {return this._fold_r()._fold_r()},
_wrap: function (n) {return n.p = this._replace(n).p, this._reparent(n), delete this.l, delete this.r, this._append_to(n)},
_unlink: function (n) {return this.l && (this.l.r = this.r), this.r && (this.r.l = this.l), delete this.l, delete this.r, this._reparent(n)},
These methods are OK for use after the syntax folding stage is over (though because syntax nodes are shared it's generally dangerous to go modifying them):
pop: function () {return --this.length, this}, push: function (x) {return this[this.length++] = x || empty, this},
Identification.
You can request that a syntax node identify itself, in which case it will give you an identifier if it hasn't already. The identity is not determined until the first time it is requested,
and after that it is stable. As of Caterwaul 0.7.0 the mechanism works differently (i.e. isn't borked) in that it replaces the prototype definition with an instance-specific closure the
first time it gets called. This may reduce the number of decisions in the case that the node's ID has already been computed.
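For example (illustrative):
| var n = caterwaul.parse('x');
  n.id() === n.id()    // -> true; the gensym is generated on the first call and cached on the node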
id: function () {var id = gensym('id'); return (this.id = function () {return id})()},
is_caterwaul_syntax: true,
Traversal functions.
each() is the usual side-effecting shallow traversal that returns 'this'. map() distributes a function over a node's children and returns the array of results, also as usual. Two variants,
reach and rmap, perform the process recursively. reach is non-consing; it returns the original as a reference. rmap, on the other hand, follows some rules to cons a new tree. If the
function passed to rmap() returns the node verbatim then its children are traversed. If it returns a distinct node, however, then traversal doesn't descend into the children of the newly
returned tree but rather continues as if the original node had been a leaf. For example:
|     parent          Let's suppose that a function f() has these mappings:
      /    \
  node1    node2      f(parent) = parent      f(node1) = q
  /  \       |                                f(node2) = node2
 c1   c2     c3
In this example, f() would be called on parent, node1, node2, and c3 in that order. c1 and c2 are omitted because node1 was replaced by q -- and there is hardly any point in going through
the replaced node's previous children. (Nor is there much point in forcibly iterating over the new node's children, since presumably they are already processed.) If a mapping function
returns something falsy, it will have exactly the same effect as returning the node without modification.
Using the old s() to do gensym-safe replacement requires that you invoke it only once, and this means that for complex macroexpansion you'll have a long array of values. This isn't ideal,
so syntax trees provide a replace() function that handles replacement more gracefully:
| qs[(foo(_foo), _before_bar + bar(_bar))].replace({_foo: qs[x], _before_bar: qs[3 + 5], _bar: qs[foo.bar]})
each: function (f) {for (var i = 0, l = this.length; i < l; ++i) f(this[i], i); return this},
map: function (f) {for (var n = new this.constructor(this), i = 0, l = this.length; i < l; ++i) n.push(f(this[i], i) || this[i]); return n},
reach: function (f) {f(this); this.each(function (n) {n.reach(f)}); return this},
rmap: function (f) {var r = f(this); return ! r || r === this ? this.map(function (n) {return n.rmap(f)}) : r.rmap === undefined ? new this.constructor(r) : r},
clone: function () {return this.rmap(function () {return false})},
collect: function (p) {var ns = []; this.reach(function (n) {p(n) && ns.push(n)}); return ns},
replace: function (rs) {var r; return own.call(rs, this.data) && (r = rs[this.data]) ?
r.constructor === String ? se(this.map(function (n) {return n.replace(rs)}), function () {this.data = r}) : r :
this.map(function (n) {return n.replace(rs)})},
Alteration.
These functions let you make "changes" to a node by returning a modified copy.
repopulated_with: function (xs) {return new this.constructor(this.data, xs)},
with_data: function (d) {return new this.constructor(d, Array.prototype.slice.call(this))},
change: function (i, x) {return se(new this.constructor(this.data, Array.prototype.slice.call(this)), function (n) {n[i] = x})},
compose_single: function (i, f) {return this.change(i, f(this[i]))},
slice: function (x1, x2) {return new this.constructor(this.data, Array.prototype.slice.call(this, x1, x2))},
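For example (an illustrative sketch, assuming that the root of this simple parse is the '+' node itself):
| var t = caterwaul.parse('x + y');
  t.with_data('-').data    // -> '-'; a new node with the same children
  t.data                   // -> '+'; the original node is untouched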
General-purpose traversal.
This is a SAX-style traversal model, useful for analytical or scope-oriented tree traversal. You specify a callback function that is invoked in pre-post-order on the tree (you get events
for entering and exiting each node, including leaves). Each time a node is entered, the callback is invoked with an object of the form {entering: node}, where 'node' is the syntax node
being entered. Each time a node is left, the callback is invoked with an object of the form {exiting: node}. The return value of the function is not used. Any null nodes are not traversed,
since they would fail any standard truthiness tests for 'entering' or 'exiting'.
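A small illustrative sketch:
| var seen = [];
  caterwaul.parse('x + y').traverse(function (e) {if (e.entering) seen.push(e.entering.data)});
  // seen is now something like ['+', 'x', 'y']; parents are entered before their children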
I used to have a method to perform scope-annotated traversal, but I removed it for two reasons. First, I had no use for it (and no tests, so I had no reason to believe that it worked).
Second, Caterwaul is too low-level to need such a method. That would be more appropriate for an analysis extension.
traverse: function (f) {f({entering: this}); f({exiting: this.each(function (n) {n.traverse(f)})}); return this},
Structural transformation.
Having nested syntax trees can be troublesome. For example, suppose you're writing a macro that needs a comma-separated list of terms. It's a lot of work to dig through the comma nodes,
each of which is binary. Javascript is better suited to using a single comma node with an arbitrary number of children. (This also helps with the syntax tree API -- we can use .map() and
.each() much more effectively.) Any binary operator can be transformed this way, and that is exactly what the flatten() method does. (flatten() returns a new tree; it doesn't modify the
original.)
The tree flattening operation looks like this for a left-associative binary operator:
|       (+)
        / \                    (+)
      (+)   z        ->       / | \
      / \                    x  y  z
     x   y
This flatten() method returns the nodes along the chain of associativity, always from left to right. It is shallow, since generally you only need a localized flat tree. That is, it doesn't
descend into the nodes beyond the one specified by the flatten() call. It takes an optional parameter indicating the operator to flatten over; if the operator in the tree differs, then the
original node is wrapped in a unary node of the specified operator. The transformation looks like this:
|                                    (,)
      (+)                             |
      / \      .flatten(',')  ->     (+)
     x   y                           / \
                                    x   y
Because ',' is a binary operator, a ',' tree with just one operand will be serialized exactly as its lone operand would be. This means that plurality over a binary operator such as comma
or semicolon degrades gracefully for the unary case (this sentence makes more sense in the context of macro definitions; see in particular 'let' and 'where' in std.bind).
The unflatten() method performs the inverse transformation. It doesn't delete a converted unary operator in the tree case, but if called on a node with more than two children it will nest
according to associativity.
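An illustrative sketch (assuming the root of this parse is the outer '+' node):
| var sum = caterwaul.parse('x + y + z').flatten();
  sum.length                // -> 3; x, y, and z become siblings under a single '+' node
  sum.unflatten().length    // -> 2; the nesting is rebuilt according to associativity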
flatten: function (d) {d = d || this.data; return d !== this.data ? this.as(d) : ! (has(parse_lr, d) && this.length) ? this : has(parse_associates_right, d) ?
se(new this.constructor(d), bind(function (n) {for (var i = this; i && i.data === d; i = i[1]) n.push(i[0]); n.push(i)}, this)) :
se(new this.constructor(d), bind(function (n) {for (var i = this, ns = []; i.data === d; i = i[0]) i[1] && ns.push(i[1]); ns.push(i);
for (i = ns.length - 1; i >= 0; --i) n.push(ns[i])}, this))},
unflatten: function () {var t = this, right = has(parse_associates_right, this.data); return this.length <= 2 ? this : se(new this.constructor(this.data), function (n) {
if (right) for (var i = 0, l = t.length - 1; i < l; ++i) n = n.push(t[i]).push(i < l - 1 ? new t.constructor(t.data) : t[i + 1])[1];
else for (var i = t.length - 1; i >= 1; --i) n = n.push(i > 1 ? new t.constructor(t.data) : t[0]).push(t[i])[0]})},
Wrapping.
Sometimes you want your syntax tree to have a particular operator, and if it doesn't have that operator you want to wrap it in a node that does. Perhaps the most common case of this is
when you have a possibly-plural node representing a variable or expression -- often the case when you're dealing with argument lists -- and you want to be able to assume that it's wrapped
in a comma node. Calling node.as(',') will return the node if it's a comma, and will return a new comma node containing the original one if it isn't.
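For example (illustrative):
| var t = caterwaul.parse('x');
  t.as(',').data        // -> ','
  t.as(',')[0] === t    // -> true; the original node becomes the sole child rather than being copied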
as: function (d) {return this.data === d ? this : new this.constructor(d).push(this)},
Value construction.
Syntax nodes sometimes represent hard references to values instead of just syntax. (See 'References' for more information.) In order to compile a syntax tree in the right environment you
need a mapping of symbols to these references, which is what the bindings() method returns. (It also collects references for all descendant nodes.) It takes an optional argument to
populate, in case you already had a hash set aside for bindings -- though it always returns the hash.
Caterwaul 0.5 and earlier had a bug that prevented falsy values from being bound. This is no longer the case; nodes which bind values should indicate that they do so by setting a binds_a_value attribute
(ref nodes do this on the prototype), indicating that their value should be read from the 'value' property. (This allows other uses of a 'value' property while making it unambiguous
whether a particular node intends to bind something.)
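A short sketch of the common pattern (illustrative; ref() is defined under 'References' below):
| var r = caterwaul.ref(42);
  var t = caterwaul.parse('x + _v').replace({_v: r});
  t.bindings()    // -> an object with a single key, r's gensym'd name, mapped to 42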
bindings: function (hash) {var result = hash || {}; this.reach(function (n) {if (n.binds_a_value) result[n.data] = n.value}); return result},
Matching.
Any syntax tree can act as a matching pattern to destructure another one. It's often much more fun to do things this way than it is to try to pick it apart by hand. For example, suppose
you wanted to determine whether a node represents a function that immediately returns, and to know what it returns. The simplest way to do it is like this:
| var tree = ...
  var match = caterwaul.parse('function (_) {return _value}').match(tree);
  if (match) {
    var value = match._value;
    ...
  }
The second parameter 'variables' stores a running total of match data. You don't provide this; match() creates it for you on the toplevel invocation. The entire original tree is available
as a match variable called '_'; for example: t.match(u)._ === u if u matches t.
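Another small sketch (illustrative):
| var m = caterwaul.parse('_f(_x)').match(caterwaul.parse('foo(bar)'));
  m._f.data    // -> 'foo'
  m._x.data    // -> 'bar'
  m._          // -> the entire 'foo(bar)' tree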
match: function (target, variables) {target = target.constructor === String ? caterwaul_global.parse(target) : target;
variables || (variables = {_: target});
if (this.is_wildcard()) return variables[this.data] = target, variables;
else if (this.length === target.length && this.data === target.data) {for (var i = 0, l = this.length; i < l; ++i)
if (! this[i].match(target[i], variables)) return null;
return variables}},
Inspection and syntactic serialization.
Syntax nodes can be both inspected (producing a Lisp-like structural representation) and serialized (producing valid Javascript code). In the past, stray 'r' links were serialized as block
comments. Now they are folded into implied semicolons by the parser, so they should never appear by the time serialization happens.
toString: function () {var xs = ['']; this.serialize(xs); return xs.join('')},
structure: function () {if (this.length) return '(' + ['"' + this.data + '"'].concat(map(function (x) {return x.structure()}, this)).join(' ') + ')';
else return this.data}};
Syntax node subclassing.
Caterwaul 1.1.1 generalizes the variadic syntax node model to support arbitrary subclasses. This is useful when defining syntax trees for languages other than Javascript. Note that this
method is destructive to your constructor; it adds a bunch of methods to the prototype automatically.
caterwaul_global.syntax_subclass = function (ctor) {var extensions = Array.prototype.slice.call(arguments, 1);
merge.apply(this, [ctor.prototype, syntax_common].concat(extensions));
ctor.prototype.constructor = ctor;
return ctor};
Type detection and retrieval.
These methods are used to detect the literal type of a node and to extract that value if it exists. You should use the as_x methods only once you know that the node does represent an x;
otherwise you will get misleading results. (For example, calling as_boolean on a non-boolean will always return false.)
Other methods are provided to tell you higher-level things about what this node does. For example, is_contextualized_invocation() tells you whether the node represents a call that can't be
eta-reduced (if it were, then the 'this' binding would be lost).
Wildcards are used for pattern matching and are identified by beginning with an underscore. This is a very frequently-called method, so I'm using a very inexpensive numeric check rather
than a string comparison. The ASCII value for underscore is 95.
var parse_hex = caterwaul_global.parse_hex = function (digits) {for (var result = 0, i = 0, l = digits.length, d; i < l; ++i)
result *= 16, result += (d = digits.charCodeAt(i)) <= 58 ? d - 48 : (d & 0x5f) - 55;
return result},
parse_octal = caterwaul_global.parse_octal = function (digits) {for (var result = 0, i = 0, l = digits.length; i < l; ++i) result *= 8, result += digits.charCodeAt(i) - 48;
return result},
unescape_string = caterwaul_global.unescape_string = function (s) {for (var i = 0, c, l = s.length, result = [], is_escaped = false; i < l; ++i)
if (is_escaped) is_escaped = false,
result.push((c = s.charAt(i)) === '\\' ? '\\' :
c === 'n' ? '\n' : c === 'r' ? '\r' : c === 'b' ? '\b' : c === 'f' ? '\f' :
c === '0' ? '\u0000' : c === 't' ? '\t' : c === 'v' ? '\v' :
c === '"' || c === '\'' ? c :
c === 'x' ? String.fromCharCode(parse_hex(s.substring(i, ++i + 1))) :
c === 'u' ? String.fromCharCode(parse_hex(s.substring(i, (i += 3) + 1))) :
String.fromCharCode(parse_octal(s.substring(i, (i += 2) + 1))));
else if ((c = s.charAt(i)) === '\\') is_escaped = true;
else result.push(c);
return result.join('')};
caterwaul_global.javascript_tree_type_methods = {
is_string: function () {return /['"]/.test(this.data.charAt(0))}, as_escaped_string: function () {return this.data.substr(1, this.data.length - 2)},
is_number: function () {return /^-?(0x|\d|\.\d+)/.test(this.data)}, as_number: function () {return Number(this.data)},
is_boolean: function () {return this.data === 'true' || this.data === 'false'}, as_boolean: function () {return this.data === 'true'},
is_regexp: function () {return /^\/./.test(this.data)}, as_escaped_regexp: function () {return this.data.substring(1, this.data.lastIndexOf('/'))},
is_array: function () {return this.data === '['}, as_unescaped_string: function () {return unescape_string(this.as_escaped_string())},
is_wildcard: function () {return this.data.charCodeAt(0) === 95},
is_identifier: function () {return this.length === 0 && /^[A-Za-z_$]\w*$/.test(this.data) && ! this.is_boolean() && ! this.is_null_or_undefined() && ! has(lex_op, this.data)},
has_grouped_block: function () {return has(parse_r_until_block, this.data)}, is_block: function () {return has(parse_block, this.data)},
is_blockless_keyword: function () {return has(parse_r_optional, this.data)}, is_null_or_undefined: function () {return this.data === 'null' || this.data === 'undefined'},
is_constant: function () {return this.is_number() || this.is_string() || this.is_boolean() || this.is_regexp() || this.is_null_or_undefined()},
left_is_lvalue: function () {return /=$/.test(this.data) || /\+\+$/.test(this.data) || /--$/.test(this.data)},
is_empty: function () {return !this.length}, has_parameter_list: function () {return this.data === 'function' || this.data === 'catch'},
has_lvalue_list: function () {return this.data === 'var' || this.data === 'const'}, is_dereference: function () {return this.data === '.' || this.data === '[]'},
is_invocation: function () {return this.data === '()'}, is_contextualized_invocation: function () {return this.is_invocation() && this[0].is_dereference()},
is_invisible: function () {return has(parse_invisible, this.data)}, is_binary_operator: function () {return has(parse_lr, this.data)},
is_prefix_unary_operator: function () {return has(parse_r, this.data)}, is_postfix_unary_operator: function () {return has(parse_l, this.data)},
is_unary_operator: function () {return this.is_prefix_unary_operator() || this.is_postfix_unary_operator()},
accepts: function (e) {return has(parse_accepts, this.data) && parse_accepts[this.data] === (e.data || e)}};
Javascript-specific serialization.
These methods are specific to the Javascript language. Other languages will have different serialization logic.
caterwaul_global.javascript_tree_serialization_methods = {
Block detection.
Block detection is required for multi-level if/else statements. Consider this code:
| if (foo) for (...) {}
else bif;
A naive approach (the one I was using before version 0.6) would miss the fact that the 'for' was trailed by a block, and insert a spurious semicolon, which would break compilation:
| if (foo) for (...) {}; // <- note!
else bif;
What we do instead is dig through the tree and find out whether the last thing in the 'if' case ends with a block. If so, then no semicolon is inserted; otherwise we insert one. This
algorithm makes serialization technically O(n^2), but nobody nests if/else blocks to such an extent that it would matter.
ends_with_block: function () {var block = this[parse_r_until_block[this.data]];
return this.data === '{' || has(parse_r_until_block, this.data) && (this.data !== 'function' || this.length === 3) && block && block.ends_with_block()},
There's a hack here for single-statement if-else statements. (See 'Grab-until-block behavior' in the parsing code below.) Basically, for various reasons the syntax tree won't munch the
semicolon and connect it to the expression, so we insert one automatically whenever the second node in an if, else, while, etc. isn't a block.
Update for Caterwaul 0.6.6: I had removed mandatory spacing for unary prefix operators, but now it's back. The reason is to help out the host Javascript lexer, which can misinterpret
postfix increment/decrement: x + +y will be serialized as x++y, which is invalid Javascript. The fix is to introduce a space in front of the second plus: x+ +y, which is unambiguous.
Update for caterwaul 1.0: The serialize() method is now aggressively optimized for common cases. It also uses a flattened array-based concatenation strategy rather than the deeply nested
approach from before.
Optimized serialization cases.
We can tell a lot about how to serialize a node based on just a few properties. For example, if the node has zero length then its serialization is simply its data. This is the leaf case,
which is likely to be half of the total number of nodes in the whole syntax tree. If a node has length 1, then we assume a prefix operator unless we identify it as postfix. Otherwise we
break it down by the kind of operator that it is.
Nodes might be flattened, so we can't assume any upper bound on the arity regardless of what kind of operator it is. Realistically you shouldn't hand flattened nodes over to the compile()
function, but it isn't the end of the world if you do.
serialize: function (xs) {var l = this.length, d = this.data, semi = ';\n',
push = function (x) {if (lex_ident[xs[xs.length - 1].charCodeAt(0)] === lex_ident[x.charCodeAt(0)]) xs.push(' ', x);
else xs.push(x)};
switch (l) {case 0: if (has(parse_r_optional, d)) return push(d.replace(/^u/, ''));
else if (has(parse_group, d)) return push(d), push(parse_group[d]);
else return push(d);
case 1: if (has(parse_r, d) || has(parse_r_optional, d)) return push(d.replace(/^u/, '')), this[0].serialize(xs);
else if (has(parse_group, d)) return push(d), this[0].serialize(xs), push(parse_group[d]);
else if (has(parse_lr, d)) return push('/* unary ' + d + ' node */'), this[0].serialize(xs);
else return this[0].serialize(xs), push(d);
case 2: if (has(parse_invocation, d)) return this[0].serialize(xs), push(d.charAt(0)), this[1].serialize(xs), push(d.charAt(1));
else if (has(parse_r_until_block, d)) return push(d), this[0].serialize(xs), this[1].serialize(xs);
else if (has(parse_invisible, d)) return this[0].serialize(xs), this[1].serialize(xs);
else if (d === ';') return this[0].serialize(xs), push(semi), this[1].serialize(xs);
else return this[0].serialize(xs), push(d), this[1].serialize(xs);
default: if (has(parse_ternary, d)) return this[0].serialize(xs), push(d), this[1].serialize(xs), push(':'), this[2].serialize(xs);
else if (has(parse_r_until_block, d)) return this.accepts(this[2]) && ! this[1].ends_with_block() ?
(push(d), this[0].serialize(xs), this[1].serialize(xs), push(semi), this[2].serialize(xs)) :
(push(d), this[0].serialize(xs), this[1].serialize(xs), this[2].serialize(xs));
else return this.unflatten().serialize(xs)}}};
References.
You can drop references into code that you're compiling. This is basically variable closure, but a bit more fun. For example:
| caterwaul.compile(qs[function () {return _ + 1}].replace({_: caterwaul.ref(3)}))() // -> 4
What actually happens is that caterwaul.compile runs through the code replacing refs with gensyms, and the function is evaluated in a scope where those gensyms are bound to the values they
represent. This gives you the ability to use a ref even as an lvalue, since it's really just a variable. References are always leaves on the syntax tree, so the prototype has a length of 0.
Caterwaul 1.0 adds named gensyms, and one of the things you can do is name your refs accordingly. If you don't name one it will just be called 'ref', but you can make it more descriptive by
passing in a second constructor argument. This name will automatically be wrapped in a gensym, but that gensym will be removed at compile-time unless you specify not to rename gensyms.
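For example (an illustrative sketch; the exact name varies because of the gensym):
| var r = caterwaul.ref(3, 'three');
  r.data    // -> something like 'three_8_<random suffix>'; compile() binds this name to the value 3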
caterwaul_global.ref = caterwaul_global.syntax_subclass(
function (value, name) {if (value instanceof this.constructor) this.value = value.value, this.data = value.data;
else this.value = value, this.data = gensym(name && name.constructor === String ? name : 'ref')},
caterwaul_global.javascript_tree_type_methods,
caterwaul_global.javascript_tree_serialization_methods,
{binds_a_value: true, length: 0},
Reference replace() support.
Refs aren't normal nodes; in particular, invoking the constructor as we do in replace() will lose the ref's value and cause all kinds of problems. In order to avoid this we override the
replace() method for syntax refs to behave more sensibly. Note that you can't replace a ref with a syntax
{replace: function (replacements) {var r; return own.call(replacements, this.data) && (r = replacements[this.data]) ?
r.constructor === String ? se(new this.constructor(this.value), function () {this.data = r}) : r :
this}});
Syntax node constructor.
Here's where we combine all of the pieces above into a single function with a large prototype. Note that the 'data' property is converted from a variety of types; so far we support strings,
numbers, and booleans. Any of these can be added as children. Also, I'm using an instanceof check rather than (.constructor ===) to allow array subclasses such as Caterwaul finite sequences
to be used.
caterwaul_global.syntax = caterwaul_global.syntax_subclass(
function (data) {if (data instanceof this.constructor) this.data = data.data, this.length = 0;
else {this.data = data && data.toString(); this.length = 0;
for (var i = 1, l = arguments.length, _; _ = arguments[i], i < l; ++i)
for (var j = 0, lj = _.length, it, c; _ instanceof Array ? (it = _[j], j < lj) : (it = _, ! j); ++j)
this._append((c = it.constructor) === String || c === Number || c === Boolean ? new this.constructor(it) : it)}},
caterwaul_global.javascript_tree_type_methods,
caterwaul_global.javascript_tree_serialization_methods);
var empty = caterwaul_global.empty = new caterwaul_global.syntax('');
Parsing.
There are two distinct parts to parsing Javascript. One is parsing the irregular statement-mode expressions such as 'if (condition) {...}' and 'function f(x) {...}'; the other is parsing
expression-mode stuff like arithmetic operators. In Rebase I tried to model everything as an expression, but that failed sometimes because it required that each operator have fixed arity. In
particular this was infeasible for keywords such as 'break', 'continue', 'return', and some others (any of these can be nullary or unary). It also involved creating a bizarre hack for 'case
x:' inside a switch block. This hack made the expression passed in to 'case' unavailable, as it would be buried in a ':' node.
Caterwaul fixes these problems by using a proper context-free grammar. However, it's much looser than most grammars because it doesn't need to validate anything. Correspondingly, it can be
much faster as well. Instead of guessing and backtracking as a recursive-descent parser would, it classifies many different branches into the same basic structure and fills in the blanks. One
example of this is the () {} pair, which occurs in a bunch of different constructs, including function () {}, if () {}, for () {}, etc. In fact, any time a () group is followed by a {} group
we can grab the token that precedes () (along with perhaps one more in the case of function f () {}), and group that under whichever keyword is responsible.
Syntax folding.
The first thing to happen is that parenthetical, square bracket, and braced groups are folded up. This happens in a single pass that is linear in the number of tokens, and other foldable
tokens (including unary and binary operators) are indexed by associativity. The following pass runs through these indexes from high to low precedence and folds tokens into trees. By this
point all of the parentheticals have been replaced by proper nodes (here I include ?: groups in parentheticals, since they behave the same way). Finally, high-level rules are applied to the
remaining keywords, which are bound last. This forms a complete parse tree.
Doing all of this efficiently requires a linked list rather than an array. This gets built during the initial paren grouping stage. Arrays are used for the indexes, which are left-to-right
and are later processed in the order indicated by the operator associativity. That is, left-associative operators are processed 0 .. n and right associative are processed n .. 0. Keywords
are categorized by behavior and folded after all of the other operators. Semicolons are folded last, from left to right.
There are some corner cases due to Javascript's questionable heritage from C-style syntax. For example, most constructs take either syntax blocks or semicolon-delimited statements. Ideally,
else, while, and catch are associated with their containing if, do, and try blocks, respectively. This can be done easily, as the syntax is folded right-to-left. Another corner case would
come up if there were any binary operators with equal precedence and different associativity. Javascript doesn't have them however, and it wouldn't make much sense to; it would render
expressions such as 'a op1 b op2 c' ambiguous if op1 and op2 shared precedence but each wanted to bind first. (I mention this because at first I was worried about it, but now I realize it
isn't an issue.)
Notationally (for easier processing later on), a distinction is made between invocation and grouping, and between dereferencing and array literals. Dereferencing and function invocation are
placed into their own operators, where the left-hand side is the thing being invoked or dereferenced and the right-hand side is the paren-group or bracket-group that is responsible for the
operation. Also, commas inside these groups are flattened into a single variadic (possibly nullary) comma node so that you don't have to worry about the tree structure. This is the case for
all left-associative operators; right-associative operators preserve their hierarchical folding.
Parse/lex shared logic.
Lexing Javascript is not entirely straightforward, primarily because of regular expression literals. The first implementation of the lexer got things right 99% of the time by inferring the
role of a / by its preceding token. The problem comes in when you have a case like this:
| if (condition) /foo/.test(x)
In this case the close-paren after (condition) will be taken to terminate an expression (as a close-paren usually does), so /foo/ will be incorrectly interpreted as division by foo rather than as a regular expression literal.
We mark the position before a token and then just increment the position. The token, then, can be retrieved by taking a substring from the mark to the position. This eliminates the need for
intermediate concatenations. In a couple of cases I've gone ahead and done them anyway -- these are for operators, where we grab the longest contiguous substring that is defined. I'm not too
worried about the O(n^2) complexity due to concatenation; they're bounded by four characters.
OK, so why use charAt() instead of regular expressions? It's a matter of asymptotic performance. V8 implements great regular expressions (O(1) in the match length for the (.*)$ pattern), but
the substring() method is O(n) in the number of characters returned. Firefox implements O(1) substring() but O(n) regular expression matching. Since there are O(n) tokens per document of n
characters, any O(n) step makes lexing quadratic. So I have to use the only reliably constant-time method provided by strings, charAt() (or in this case, charCodeAt()).
Of course, building strings via concatenation is also O(n^2), so I also avoid that for any strings that could be long. This is achieved by using a mark to indicate where the substring
begins, and advancing i independently. The span between mark and i is the substring that will be selected, and since each substring both requires O(n) time and consumes n characters, the
lexer as a whole is O(n). (Though perhaps with a large constant.)
Parse function.
As mentioned earlier, the parser and lexer aren't distinct. The lexer does most of the heavy lifting; it matches parens and brackets, arranges tokens into a hierarchical linked list, and
provides an index of those tokens by their fold order. It does all of this by streaming tokens into a micro-parser whose language is grouping and that knows about the oddities required to
handle regular expression cases. In the same function, though as a distinct case, the operators are folded and the syntax is compiled into a coherent tree form.
The input to the parse function can be anything whose toString() produces valid Javascript code.
caterwaul_global.parse = function (input) {
// Caterwaul 1.1 revision: Allow the parse() function to be used as a 'make sure this thing is a syntax node' function.
if (input.constructor === caterwaul_global.syntax) return input;
Lex variables.
s, obviously, is the string being lexed. mark indicates the position of the stream, while i is used for lookahead. The difference is later read into a token and pushed onto the result. c
is a temporary value used to store the current character code. re is true iff a slash would begin a regular expression. esc is a flag indicating whether the next character in a string or
regular expression literal is escaped. exp indicates whether we've seen the exponent marker in a number. close is used for parsing single and double quoted strings; it contains the
character code of the closing quotation mark. t is the token to be processed.
Parse variables.
grouping_stack and gs_top are used for paren/brace/etc. matching. head and parent mark two locations in the linked syntax tree; when a new group is created, parent points to the opener
(i.e. (, [, ?, or {), while head points to the most recently added child. (Hence the somewhat complex logic in push().) indexes[] determines reduction order, and contains references to the
nodes in the order in which they should be folded. invocation_nodes is an index of the nodes that will later need to be flattened.
The push() function manages the mechanics of adding a node to the initial linked structure. There are a few cases here; one is when we've just created a paren group and have no 'head'
node; in this case we append the node as 'head'. Another case is when 'head' exists; in that case we update head to be the new node, which gets added as a sibling of the old head.
var s = input.toString(), mark = 0, c = 0, re = true, esc = false, dot = false, exp = false, close = 0, t = '', i = 0, l = s.length, cs = function (i) {return s.charCodeAt(i)},
grouping_stack = [], gs_top = null, head = null, parent = null, indexes = map(function () {return []}, parse_reduce_order), invocation_nodes = [], all_nodes = [empty],
new_node = function (n) {return all_nodes.push(n), n}, push = function (n) {return head ? head._sibling(head = n) : (head = n._append_to(parent)), new_node(n)},
syntax_node = this.syntax;
Trivial case.
The empty string will break the lexer because we won't generate a token (since we're already at the end). To prevent this we return an empty syntax node immediately, since this is an
accurate representation of no input.
if (l === 0) return empty;
Main lex loop.
This loop takes care of reading all of the tokens in the input stream. At the end, we'll have a linked node structure with paren groups. At the beginning, we set the mark to the current
position (we'll be incrementing i as we read characters), munch whitespace, and reset flags.
while ((mark = i) < l) {
while (lex_space[c = cs(i)] && i < l) mark = ++i;
esc = exp = dot = t = false;
Miscellaneous lexing.
This includes bracket resetting (the top case, where an open-bracket of any sort triggers regexp mode) and comment removal. Both line and block comments are removed by comparing against
lex_slash, which represents /, and lex_star, which represents *.
if (lex_bracket[c]) {t = !! ++i; re = lex_opener[c]}
else if (c === lex_slash && cs(i + 1) === lex_star && (i += 2)) {while (++i < l && cs(i) !== lex_slash || cs(i - 1) !== lex_star); t = ! ++i}
else if (c === lex_slash && cs(i + 1) === lex_slash) {while (++i < l && ! lex_eol[cs(i)]); t = false}
Regexp and string literal lexing.
These both take more or less the same form. The idea is that we have an opening delimiter, which can be ", ', or /; and we look for a closing delimiter that follows. It is syntactically
illegal for a string to occur anywhere that a slash would indicate division (and it is also illegal to follow a string literal with extra characters), so reusing the regular expression
logic for strings is not a problem. (This follows because we know ahead of time that the Javascript is valid.)
else if (lex_quote[c] && (close = c) && re && ! (re = ! (t = s.charAt(i)))) {while (++i < l && (c = cs(i)) !== close || esc) esc = ! esc && c === lex_back;
while (++i < l && lex_regexp_suffix[cs(i)]) ; t = true}
Numeric literal lexing.
This is far more complex than the above cases. Numbers have several different formats, each of which requires some custom logic. The reason we need to parse numbers so exactly is that it
influences how the rest of the stream is lexed. One example is '0.5.toString()', which is perfectly valid Javascript. What must be output here, though, is '0.5', '.', 'toString', '(',
')'; so we have to keep track of the fact that we've seen one dot and stop lexing the number on the second.
Another case is exponent-notation: 3.0e10. The hard part here is that it's legal to put a + or - on the exponent, which normally terminates a number. Luckily we can safely skip over any
character that comes directly after an E or e (so long as we're really in exponent mode, which I'll get to momentarily), since there must be at least one digit after an exponent.
The final case, which restricts the logic somewhat, is hexadecimal numbers. These also contain the characters 'e' and 'E', but we cannot safely skip over the following character, and any
decimal point terminates the number (since '0x5.toString()' is also valid Javascript). The same follows for octal numbers; the leading zero indicates that there will be no decimal point,
which changes the lex mode (for example, '0644.toString()' is valid).
So, all this said, there are different logic branches here. One handles guaranteed integer cases such as hex/octal, and the other handles regular numbers. The first branch is triggered
whenever a number starts with zero and is followed by 'x' or a digit (for conciseness I call 'x' a digit), and the second case is triggered when '.' is followed by a digit, or when a
digit starts.
A trivial change, using regular expressions, would reduce this logic significantly. I chose to write it out longhand because (1) it's more fun that way, and (2) the regular expression
approach has theoretically quadratic time in the length of the numbers, whereas this approach keeps things linear. Whether or not that actually makes a difference I have no idea.
Finally, in response to a recently discovered failure case, a period must be followed by a digit if it starts a number. The failure is the string '.end', which will be lexed as '.en',
'd' if it is assumed to be a floating-point number. (In fact, any method or property beginning with 'e' will cause this problem.)
else if (c === lex_zero && lex_integer[cs(i + 1)]) {while (++i < l && lex_integer[cs(i)]); re = ! (t = true)}
else if (lex_float[c] && (c !== lex_dot || lex_decimal[cs(i + 1)])) {while (++i < l && (lex_decimal[c = cs(i)] || (dot ^ (dot |= c === lex_dot)) || (exp ^ (exp |= lex_exp[c] && ++i))));
while (i < l && lex_decimal[cs(i)]) ++i; re = ! (t = true)}
Operator lexing.
The 're' flag is reused here. Some operators have both unary and binary modes, and as a heuristic (which happens to be accurate) we can assume that anytime we expect a regular
expression, a unary operator is intended. The only exceptions are ++ and --, which are always unary but are sometimes prefix and sometimes postfix. If re is true, then the prefix
form is intended; otherwise, it is postfix. For this reason I've listed both '++' and 'u++' (same for --) in the operator tables; the lexer is actually doing more than its job here by
identifying the variants of these operators.
The only exception to the regular logic happens if the operator is postfix-unary. (e.g. ++, --.) If so, then the re flag must remain false, since expressions like 'x++ / 4' can be valid.
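For instance, here is an illustrative trace (not actual lexer output -- just the token stream you'd expect):
| ++x          // re is true before the operator, so it lexes as 'u++', 'x'
  x++ / 4      // 'x' sets re to false, so '++' lexes as the postfix form and re stays false; the / is then division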
else if (lex_punct[c] && (t = re ? 'u' : '', re = true)) {while (i < l && lex_punct[cs(i)] && has(lex_op, t + s.charAt(i))) t += s.charAt(i++); re = ! has(lex_postfix_unary, t)}
Identifier lexing.
If nothing else matches, then the token is lexed as a regular identifier or Javascript keyword. The 're' flag is set depending on whether the keyword expects a value. The nuance here is
that you could write 'x / 5', and it is obvious that the / means division. But if you wrote 'return / 5', the / would be a regexp delimiter because return is an operator, not a value. So
at the very end, in addition to assigning t, we also set the re flag if the word turns out to be an operator.
else {while (++i < l && lex_ident[cs(i)]); re = has(lex_op, t = s.substring(mark, i))}
Token unification.
t will contain true, false, or a string. If false, no token was lexed; this happens when we read a comment, for example. If true, the substring method should be used. (It's a shorthand to
avoid duplicated logic.) For reasons that are not entirely intuitive, the lexer sometimes produces the artifact 'u;'. This is never useful, so I have a case dedicated to removing it.
if (i === mark) throw new Error('Caterwaul lex error at "' + s.substr(mark, 40) + '" with leading context "' + s.substr(mark - 40, 40) + '" (probably a Caterwaul bug)');
if (t === false) continue;
t = t === true ? s.substring(mark, i) : t === 'u;' ? ';' : t;
Grouping and operator indexing.
Now that we have a token, we need to see whether it affects grouping status. There are a couple of possibilities. If it's an opener, then we create a new group; if it's a matching closer
then we close the current group and pop out one layer. (We don't check for matching here. Any code provided to Caterwaul will already have been parsed by the host Javascript interpreter,
so we know that it is valid.)
All operator indexing is done uniformly, left-to-right. Note that the indexing isn't strictly by operator. It's by reduction order, which is arguably more important. That's what the
parse_inverse_order table does: it maps operator names to parse_reduce_order subscripts. (e.g. 'new' -> 2.)
t === gs_top ? (grouping_stack.pop(), gs_top = grouping_stack[grouping_stack.length - 1], head = head ? head.p : parent, parent = null) :
(has(parse_group, t) ? (grouping_stack.push(gs_top = parse_group[t]), parent = push(new_node(new syntax_node(t))), head = null) : push(new_node(new syntax_node(t))),
has(parse_inverse_order, t) && indexes[parse_inverse_order[t]].push(head || parent)); // <- This is where the indexing happens
Regexp flag special cases.
Normally a () group wraps an expression, so a following / would indicate division. The only exception to this is when we have a block construct; in this case, the next token appears in
statement-mode, which means that it begins, not modifies, a value. We'll know that we have such a case if (1) the immediately-preceding token is a close-paren, and (2) a block-accepting
syntactic form occurs to its left.
With all this trouble over regular expressions, I had to wonder whether it was possible to do it more cleanly. I don't think it is, unfortunately. Even lexing the stream backwards fails
to resolve the ambiguity:
| for (var k in foo) /foo/g.test(k) && bar();
In this case we won't know it's a regexp until we hit the 'for' keyword (or perhaps 'var', if we're being clever -- but a 'with' or 'if' would require complete lookahead). A perfectly
valid alternative parse, minus the 'for' and 'var', is this:
| ((k in foo) / (foo) / (g.test(k))) && bar();
The only case where reverse-lexing is useful is when the regexp has no modifiers.
re |= t === ')' && head.l && has(parse_r_until_block, head.l.data)}
Operator fold loop.
This is the second major part of the parser. Now that we've completed the lex process, we can fold operators and syntax, and take care of some exception cases.
First step: functions, calls, dots, and dereferences.
I'm treating this differently from the generalized operator folding because of the syntactic inference required for call and dereference detection. Nothing has been folded at this point
(with the exception of paren groups, which is appropriate), so if the node to the left of any ( or [ group is an operator, then the ( or [ is really a paren group or array literal. If, on
the other hand, it is another value, then the group is a function call or a dereference. This folding goes left-to-right. The reason we also process dot operators is that they share the same
precedence as calls and dereferences. Here's what a () or [] transform looks like:
| quux <--> foo <--> ( <--> bar                         quux <--> () <--> bar
                      \                                           /  \            <-- This can be done by saying _.l.wrap(new node('()')).p.fold_r().
                       bif <--> , <--> baz       -->             foo  (                _.l.wrap() returns l again, .p gets the wrapping node, and fold_r adds a child to it.
                                                                       \
                                                                        bif <--> , <--> baz
This is actually merged into the for loop below, even though it happens before other steps do (see 'Ambiguous parse groups').
Second step: fold operators.
Now we can go through the list of operators, folding each according to precedence and associativity. Highest to lowest precedence here, which is just going forwards through the indexes[]
array. The parse_index_forward[] array indicates which indexes should be run left-to-right and which should go right-to-left.
for (var i = 0, l = indexes.length, forward, _; _ = indexes[i], forward = parse_index_forward[i], i < l; ++i)
for (var j = forward ? 0 : _.length - 1, lj = _.length, inc = forward ? 1 : -1, node, data, ll; forward ? j < lj : j >= 0; j += inc)
Binary node behavior.
The most common behavior is binary binding. This is the usual case for operators such as '+' or ',' -- they grab one or both of their immediate siblings regardless of what they are.
Operators in this class are considered to be 'fold_lr'; that is, they fold first their left sibling, then their right.
if (has(parse_lr, data = (node = _[j]).data)) node._fold_lr();
Ambiguous parse groups.
As mentioned above, we need to determine whether grouping constructs are invocations or real groups. This happens to take place before other operators are parsed (which is good -- that way
it reflects the precedence of dereferencing and invocation). The only change we need to make is to discard the explicit parenthetical or square-bracket grouping for invocations or
dereferences, respectively. It doesn't make much sense to have a doubly-nested structure, where we have a node for invocation and another for the group on the right-hand side of that
invocation. Better is to modify the group in-place to represent an invocation.
We can't solve this problem here, but we can solve it after the parse has finished. I'm pushing these invocation nodes onto an index for the end.
Sometimes we have a paren group that doesn't represent a value. This is the case for most control flow constructs:
| for (var k in o) (...)
We need to detect this and not fold the (var k in o)(...) as an invocation, since doing so would seriously break the resulting syntax.
There is an even more pathological case to consider. Firefox and other SpiderMonkey-based runtimes rewrite anonymous functions without parentheses, so you end up with stuff like this:
| function () {} ()
In this case we need to encode an invocation. Fortunately by this point the function node is already folded.
else if (has(parse_ambiguous_group, data) && node.l && ! ((ll = node.l.l) && has(parse_r_until_block, ll.data)) &&
(node.l.data === '.' || (node.l.data === 'function' && node.l.length === 2) ||
! (has(lex_op, node.l.data) ||
has(parse_not_a_value, node.l.data)))) invocation_nodes.push(node.l._wrap(new_node(new syntax_node(data + parse_group[data]))).p._fold_r());
Unary left and right-fold behavior.
Unary nodes have different fold directions. In this case, it just determines which side we grab the node from. I'm glad that Javascript doesn't allow stuff like '++x++', which would make
the logic here actually matter. Because there isn't that pathological case, exact rigidity isn't required.
else if (has(parse_l, data)) node._fold_l();
else if (has(parse_r, data)) node._fold_r();
Ternary operator behavior.
This is kind of interesting. If we have a ternary operator, then it will be treated first as a group; just like parentheses, for example. This is the case because the ternary syntax is
unambiguous for things in the middle. So, for example, '3 ? 4 : 5' initially parses out as a '?' node whose child is '4'. Its siblings are '3' and '5', so folding left and right is an
obvious requirement. The only problem is that the children will be in the wrong order. Instead of (3) (4) (5), we'll have (4) (3) (5). So after folding, we do a quick swap of the first two
to set the ordering straight.
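For example, a rough sketch of the node shapes involved (not literal tree output):
| 3 ? 4 : 5        // lexed as a '?' group whose child is 4, with 3 and 5 as siblings
                   // after _fold_lr() the children are (4) (3) (5); after the swap they are (3) (4) (5)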
else if (has(parse_ternary, data)) {node._fold_lr(); var temp = node[1]; node[1] = node[0]; node[0] = temp}
Grab-until-block behavior.
Not quite as simple as it sounds. This is used for constructs such as 'if', 'function', etc. Each of these constructs takes the form 'keyword [identifier] () {}', but they can also
have variants that include 'keyword () {}', 'keyword () statement;', and most problematically 'keyword () ;'. Some of these constructs also have optional child components; for
example, 'if () {} else {}' should be represented by an 'if' whose children are '()', '{}', and 'else' (whose child is '{}'). The tricky part is that 'if' doesn't accept another 'if' as a
child (e.g. 'if () {} if () {}'), nor does it accept 'for' or any number of other things. This discrimination is encoded in the parse_accepts table.
There are some weird edge cases, as always. The most notable is what happens when we have nesting without blocks:
| if (foo) bar; else bif;
In this case we want to preserve the semicolon on the 'then' block -- that is, 'bar;' should be its child; so the semicolon is required. But the 'bif' in the 'else' case shouldn't have a
semicolon, since that separates top-level statements. Because desperate situations call for desperate measures, there's a hack specifically for this in the syntax tree serialization.
One more thing. Firefox rewrites syntax trees, and one of the optimizations it performs on object literals is removing quotation marks from regular words. This means that it will take the
object {'if': 4, 'for': 1, etc.} and render it as {if: 4, for: 1, etc.}. As you can imagine, this becomes a big problem as soon as the word 'function' is present in an object literal. To
prevent this from causing problems, I only collapse a node if it is not followed by a colon. (And the only case where any of these would legally be followed by a colon is as an object
key.)
else if (has(parse_r_until_block, data) && node.r && node.r.data !== ':')
{for (var count = 0, limit = parse_r_until_block[data]; count < limit && node.r && ! has(parse_block, node.r.data); ++count) node._fold_r();
node.r && (node.r.data === ';' ? node.push(empty) : node._fold_r());
if (has(parse_accepts, data) && parse_accepts[data] === (node.r && node.r.r && node.r.r.data)) node._fold_r().pop()._fold_r();
else if (has(parse_accepts, data) && parse_accepts[data] === (node.r && node.r.data)) node._fold_r()}
Optional right-fold behavior.
The return, throw, break, and continue keywords can each optionally take an expression. If the token to the right is an expression, then we take it, but if the token to the right is a
semicolon then the keyword should be nullary.
else if (has(parse_r_optional, data)) node.r && node.r.data !== ';' && node._fold_r();
Third step.
Find all elements with right-pointers and wrap them with semicolon nodes. This is necessary because certain statement-level constructs don't use semicolons; they use brace syntax
instead. (e.g. 'if (foo) {bar} baz()' is valid, even though no semicolon precedes 'baz()'.) By this point everything else will already be folded. Note that this does some weird things to
associativity; in general, you can't make assumptions about the exact layout of semicolon nodes. Fortunately semicolon is associative, so it doesn't matter in practice. And just in case,
these nodes are 'i;' rather than ';', meaning 'inferred semicolon' -- that way it's clear that they aren't original. (They also won't appear when you call toString() on the syntax tree.)
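Roughly speaking (a sketch of the resulting shape, not a literal dump of the tree):
| if (foo) {bar} baz()        // the folded 'if' node still has a right-pointer to the 'baz()' node,
                              // so both end up as children of an inferred 'i;' node: (i; <if (foo) {bar}> <baz()>)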
for (var i = all_nodes.length - 1, _; i >= 0; --i) (_ = all_nodes[i]).r && _._wrap(new_node(new syntax_node('i;'))).p._fold_r();
Fourth step.
Flatten out all of the invocation nodes. As explained earlier, they are nested such that the useful data on the right is two levels down. We need to grab the grouping construct on the
right-hand side and remove it so that only the invocation or dereference node exists. During the parse phase we built an index of all of these invocation nodes, so we can iterate through
just those now. I'm preserving the 'p' pointers, though they're probably not useful beyond here.
for (var i = 0, l = invocation_nodes.length, _, child; i < l; ++i) (child = (_ = invocation_nodes[i])[1] = _[1][0] || empty) && (child.p = _);
while (head.p) head = head.p;
Fifth step.
Prevent a space leak by clearing out all of the 'p', 'l', and 'r' pointers.
for (var i = all_nodes.length - 1, _; i >= 0; --i) delete (_ = all_nodes[i]).p, delete _.l, delete _.r;
return head};
Environment-dependent compilation.
It's possible to bind variables from 'here' (i.e. this runtime environment) inside a compiled function. The way we do it is to create a closure using a gensym. (Another reason that gensyms
must really be unique.) Here's the idea. We use the Function constructor to create an outer function, bind a bunch of variables directly within that scope, and return the function we're
compiling. The variables correspond to gensyms placed in the code, so the code will have closure over those variables.
An optional second parameter 'environment' can contain a hash of variable->value bindings. These will be defined as locals within the compiled function.
New in caterwaul 0.6.5 is the ability to specify a 'this' binding to set the context of the expression being evaluated.
Caterwaul 1.0 and later automatically bind a variable called 'undefined' that is set to Javascript's 'undefined' value. This is done to defend against pathological cases of 'undefined' being
set to something else. If you really want some other value of undefined, you can always bind it as an environment variable.
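For example, something along these lines should work (a hypothetical usage sketch -- the tree and values are made up):
| var tree = caterwaul.parse('function (x) {return x + y}');
  var f    = caterwaul.compile(tree, {y: 10});
  f(1)      // -> 11; 'y' is bound from the environment via the gensym-named base object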
(function () {var bound_expression_template = caterwaul_global.parse('var _bindings; return(_expression)'),
binding_template = caterwaul_global.parse('_variable = _base._variable'),
undefined_binding = caterwaul_global.parse('undefined = void(0)');
Compilation options.
Gensym renaming will break some things that expect the compiled code to be source-identical to the original tree. As a result, I'm introducing an options hash that lets you tell the compiler
things like "don't rename the gensyms this time around". Right now gensym_renaming is the only option, and it defaults to true.
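For example (hypothetical usage):
| caterwaul.compile(tree, {}, {gensym_renaming: false})       // keep the original gensym names in the compiled source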
caterwaul_global.compile = function (tree, environment, options) {
options = merge({gensym_renaming: true}, options);
var bindings = merge({}, this._environment || {}, environment || {}, tree.bindings()), variables = [undefined_binding], s = gensym('base');
for (var k in bindings) if (own.call(bindings, k) && k !== 'this') variables.push(binding_template.replace({_variable: k, _base: s}));
var variable_definitions = new this.syntax(',', variables).unflatten(),
function_body = bound_expression_template.replace({_bindings: variable_definitions, _expression: tree});
if (options.gensym_renaming) {var renaming_table = this.gensym_rename_table(function_body);
for (var k in bindings) own.call(bindings, k) && (bindings[renaming_table[k] || k] = bindings[k]);
function_body = function_body.replace(renaming_table);
s = renaming_table[s]}
var code = function_body.toString();
try {return (new Function(s, code)).call(bindings['this'], bindings)}
catch (e) {throw new Error((e.message || e) + ' while compiling ' + code)}};
Gensym erasure.
Gensyms are horrible. They look like foo_1_j15190ba29n1_$1AC151953, which both takes up a lot of space and is hard to read. Fortunately, we can convert them at compile-time. This is possible
because Javascript (mostly) supports alpha-conversion for functions.
I said "mostly" because some symbols are converted into runtime strings; these are property keys. In the unlikely event that you've got a gensym being used to dereference something, e.g.
foo.gensym, then renaming is no longer safe. This, as far as I know, is the only situation where renaming won't work as intended. Because I can't imagine a situation where this would
actually arise, I'm not handling this case yet. (Though let me know if I need to fix this.)
New gensym names are chosen by finding the smallest nonnegative integer N such that the gensym's name plus N.toString(36) doesn't occur as an identifier anywhere in the code. (The most
elegant option is to use scope analysis to keep N low, but I'm too lazy to implement it.)
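Illustratively (the hashes are made up, and this assumes 'foo' does not otherwise occur as an identifier in the code):
| foo_1_j15190ba29n1_$1AC151953       -> foo      // the base name is free, so no N is appended
  (a second gensym with base 'foo')   -> foo1     // 'foo' is now taken; N = 1 and (1).toString(36) === '1'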
caterwaul_global.gensym_rename_table = function (tree) {
var names = {}, gensyms = [];
tree.reach(function (node) {var d = node.data; if (is_gensym(d)) names[d] || gensyms.push(d); names[d] = d.replace(/^(.*)_[a-z0-9]+_.{22}$/, '$1') || 'anon'});
var unseen_count = {}, next_unseen = function (name) {if (! (name in names)) return name;
var n = unseen_count[name] || 0; while (names[name + (++n).toString(36)]); return name + (unseen_count[name] = n).toString(36)};
for (var renamed = {}, i = 0, l = gensyms.length, g; i < l; ++i) renamed[g = gensyms[i]] || (names[renamed[g] = next_unseen(names[g])] = true);
return renamed}})();
Initialization method.
Caterwaul 1.1 is a huge deviation from before. Now you don't use the global caterwaul as a compiler, because it isn't one. Rather, it's a compiler-generator. You pass in arguments to construct
the new function. So, for example:
| var compiler = caterwaul(my_macroexpander);
compiler(function () {return 5})() // -> 5, unless your macroexpander does something really bizarre
The function returned here will have a permanent link to the global caterwaul that generated it, so deglobalizing is a safe thing to do. These generated functions can be composed by doing the
parse step ahead of time:
| var my_caterwaul = caterwaul(my_macroexpander);
var my_other_caterwaul = caterwaul(my_other_macroexpander);
var compiler = function (tree) {
return caterwaul.compile(my_other_caterwaul(my_caterwaul(caterwaul.parse(tree))));
};
This informs my_caterwaul and my_other_caterwaul that your intent is just to macroexpand trees to trees, not transform functions into other functions.
caterwaul_global.init = function (macroexpander) {
var result = function (f, environment, options) {
return f.constructor === Function || f.constructor === String ? caterwaul_global.compile(result.call(result, caterwaul_global.parse(f)), environment, options) :
macroexpander ? f.rmap(function (node) {return macroexpander.call(result, node, environment, options)}) : f};
result.global = caterwaul_global;
result.macroexpander = macroexpander;
return result};
caterwaul_global.initializer = initializer;
caterwaul_global.clone = function () {return se(initializer(initializer, unique).deglobalize(),
function () {for (var k in caterwaul_global) this[k] || (this[k] = caterwaul_global[k])})};
return caterwaul = caterwaul_global});
__
meta::sdoc('js::caterwaul.all', <<'__');
This file isn't rendered -- it's just used internally for node testing.
- pinclude pp::js::caterwaul
- pinclude pp::js::extensions/std
- pinclude pp::js::extensions/ui
__
meta::sdoc('js::caterwaul.node', <<'__');
CommonJS-compatible Caterwaul build | Spencer Tipping
Licensed under the terms of the MIT source code license
- pinclude pp::js::caterwaul
exports.caterwaul = caterwaul;
__
meta::sdoc('js::extensions/std', <<'__');
Caterwaul standard library | Spencer Tipping
Licensed under the terms of the MIT source code license
(caterwaul.std_initializer = function () {
Internal libraries.
These operate on caterwaul in some way, but don't necessarily have an effect on generated code.
- pinclude pp::js::extensions/std/macro
- pinclude pp::js::extensions/std/anonymize
Language specializations.
These provide configurations that specialize caterwaul to operate well with a given programming language. This is relevant because not all languages compile to Javascript the same way, and
caterwaul should be able to adapt to the syntactic limitations of generated code (and thus be usable with non-Javascript languages like Coffeescript).
Also included is a standard set of words that can be combined with the Javascript forms to produce useful macros. Together these form a base language that is used by other parts of the
standard library.
- pinclude pp::js::extensions/std/js
- pinclude pp::js::extensions/std/js-literals
- pinclude pp::js::extensions/std/words
Libraries.
These apply more advanced syntactic transforms to the code and can depend on everything above.
- pinclude pp::js::extensions/std/seq
caterwaul.js_all = function () {return this.seq(this.words(this.js_literals(this.js())))}})();
__
meta::sdoc('js::extensions/std/anonymize', <<'__');
Symbol anonymization | Spencer Tipping
Licensed under the terms of the MIT source code license
Introduction.
A recurring pattern in previous versions of caterwaul was to clone the global caterwaul function and set it up as a DSL processor by defining a macro that manually dictated tree traversal
semantics. This was often difficult to implement because any context had to be encoded bottom-up and in terms of searching rather than top-down inference. This library tries to solve the
problem by implementing a grammar-like structure for tree traversal.
Use cases.
One fairly obvious use case is code tracing. When we trace some code, we need to keep track of whether it should be interpreted in sequence or expression context. Although there are only two
states here, it still is too complex for a single-layer macroexpander to handle gracefully; so we create two separate caterwaul functions that delegate control to one another. We then create
a set of annotations to indicate which state or states should be chosen next. For example, here are some expansions from the tracing behavior:
| E[_x = _y] -> H[_x = E[_y]]
S[_x = _y] -> _x = E[_y]
It's straightforward enough to define macros this way; all that needs to be done is to mark the initial state and put state information into the macro patterns. The hard part is making sure
that the markers don't interfere with the existing syntax. This requires that all of the markers be replaced by gensyms before the macroexpansion happens.
Gensym anonymizing.
Replacing symbols in macro patterns is trivial with the replace() method. The only hard part is performing this same substitution on the macroexpansions. (In fact, this is impossible to do
transparently given Turing-complete macros.) In order to work around this, strings are automatically expanded (because it's easy to do), but functions must call translate_state_markers() on
any patterns they intend to use. This call must happen before substituting syntax into the patterns (!) because otherwise translate_state_markers() may rewrite code that happens to contain
markers, thus reintroducing the collision problem that all of this renaming is intended to avoid.
Usage.
To anonymize a set of macros you first need to create an anonymizer. This is easy; you just give it a list of symbols to anonymize and then use that anonymizer to transform a series of macros
(this process is non-destructive):
| var anonymize = caterwaul.anonymizer('X', 'Y', 'Z');
var m = caterwaul.replacer(anonymize('X[foo]'), ...); // Matches against gensym_1_aj49Az0_885nr1q[foo]
Each anonymizer uses a separate symbol table. This means that two anonymizers that match against 'A' (or any other macro pattern) will always map them to different gensyms.
(function ($) {$.anonymizer = function () {for (var translation_table = {}, i = 0, l = arguments.length; i < l; ++i) translation_table[arguments[i]] = $.gensym(arguments[i]);
return function (node) {return $.parse(node).replace(translation_table)}}})(caterwaul);
__
meta::sdoc('js::extensions/std/js', <<'__');
Javascript-specific macros | Spencer Tipping
Licensed under the terms of the MIT source code license
(function ($) {
Structured forms in Javascript.
These aren't macros, but forms. Each language has its own ways of expressing certain idioms; in Javascript we can set up some sensible defaults to make macros more consistent. For example,
caterwaul pre-1.0 had the problem of wildly divergent macros. The fn[] macro was always prefix and required parameters, whereas /se[] was always postfix and had a single optional parameter.
/cps[] was similarly postfix, which was especially inappropriate considering that it could theoretically handle multiple parameters.
In caterwaul 1.0, the macro author's job is reduced to specifying which words have which behavior; the language driver takes care of the rest. For instance, rather than specifying the full
pattern syntax, you just specify a word and its definition with respect to an opaque expression and perhaps set of modifiers. Here are the standard Javascript macro forms:
$.js = function (macroexpander) {
Javascript-specific shorthands.
Javascript has some syntactic weaknesses that it's worth correcting. These don't relate to any structured macros, but are hacks designed to make JS easier to use.
String interpolation.
Javascript normally doesn't have this, but it's straightforward enough to add. This macro implements Ruby-style interpolation; that is, "foo#{bar}" becomes "foo" + bar. A caveat (though not a
bad one in my experience) is that single and double-quoted strings are treated identically. This is because SpiderMonkey rewrites all strings to double-quoted form.
This version of string interpolation is considerably more sophisticated than the one implemented in prior versions of caterwaul. It still isn't possible to reuse the same quotation marks
used on the string itself, but you can now include balanced braces in the interpolated text. For example, this is now valid:
| 'foo #{{bar: "bif"}.bar}'
There are some caveats; if you have unbalanced braces (even in substrings), it will get confused and misread the boundary of your text. So stuff like this won't work properly:
| 'foo #{"{" + bar}' // won't find the ending properly and will try to compile the closing brace
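For example, roughly what the macro produces (exact parenthesization may differ):
| 'x = #{x}, y = #{y}'         // -> ('x = ' + x + ', y = ' + y)
  'foo #{{bar: "bif"}.bar}'    // -> ('foo ' + ({bar: "bif"}.bar))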
var string_interpolator = function (node) {
var s = node.data, q = s.charAt(0), syntax = $.syntax;
if (q !== '\'' && q !== '"' || ! /#\{[^\}]+\}/.test(s)) return false; // DeMorgan's applied to ! ((q === '\'' || q === '"') && /.../.test(s))
for (var pieces = [], is_code = [], i = 1, l = s.length - 1, brace_depth = 0, got_hash = false, start = 1, c; i < l; ++i)
if (brace_depth) if ((c = s.charAt(i)) === '}') --brace_depth || (pieces.push(s.substring(start, i)), is_code.push(true)) && (start = i + 1), got_hash = false;
else brace_depth += c === '{';
else if ((c = s.charAt(i)) === '#') got_hash = true;
else if (c === '{' && got_hash) pieces.push(s.substring(start, i - 1)), is_code.push(false), start = i + 1, ++brace_depth;
else got_hash = false;
pieces.push(s.substring(start, l)), is_code.push(false);
for (var quoted = new RegExp('\\\\' + q, 'g'), i = 0, l = pieces.length; i < l; ++i) pieces[i] = is_code[i] ? this($.parse(pieces[i].replace(quoted, q)).as('(')) :
new syntax(q + pieces[i] + q);
return new syntax('+', pieces).unflatten().as('(')};
Destructuring function creation.
This is a beautiful hack made possible by Internet Explorer. We can intercept cases of assigning into a function and rewrite them to create a function body. For example, f(x) = y becomes the
regular assignment f = function (x) {return y}. Because this macro is repeatedly applied we get currying for free.
There's a special case. You can grab the whole arguments array by setting something equal to it. For example, f(xs = arguments) = xs[0] + xs[1]. This makes it easy to use binding constructs
inside the body of the function without worrying whether you'll lose the function context.
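For instance, a sketch of the expansions (whitespace is illustrative):
| f(x) = x + 1                // -> f = (function (x) {return x + 1})
  f(x)(y) = x + y             // -> f = (function (x) {return (function (y) {return x + y})}), since the rule reapplies
  f(xs = arguments) = xs[0]   // -> f = (function () {var xs = arguments; return xs[0]})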
var function_rule = $.rereplacer('_left(_args) = _right', '_left = (function (_args) {return _right})'),
function_args_rule = $.rereplacer('_left(_var = arguments) = _right', '_left = (function () {var _var = arguments; return _right})'),
function_destructure = function (node) {return function_args_rule.call(this, node) || function_rule.call(this, node)};
Infix function application.
Caterwaul 1.1.2 introduces infix function notation, which lets the user avoid grouping constructs. It locates anything of the form x /-f/ y or x |-f| y and converts it into f(x, y). The
notation is vaguely borrowed from Haskell, though due to Javascript's limitations it doesn't look as good.
You can change the minus sign to a tilde to get n-ary flattening; that is, x /y /... /~f/z becomes f(x, y, ..., z). The same goes for vertical bar syntax. This macroexpansion follows
associativity, so you can do this:
| x /!f /-g/ y // -> g(f(x), y)
var infix_function_slash = $.rereplacer('_x /-_f/ _y', '_f(_x, _y)'), infix_function_bar = $.rereplacer('_x |-_f| _y', '_f(_x, _y)'),
infix_function_flat = function (node) {var d = node.data, left, fn;
if ((d === '/' || d === '|') && (left = node[0]).data === d && left[1] && left[1].data === 'u~' && (fn = left[1][0])) {
// Pre-expand macros in the left-hand side.
for (var comma = new $.syntax(','), n = this(left[0]); n.data === d; n = n[0]) comma.push(n[1]);
comma.push(n);
// The comma arguments are backwards, so reverse them in-place:
for (var i = 0, l = comma.length, temp; i < l >> 1; ++i) temp = comma[i], comma[i] = comma[l - i - 1], comma[l - i - 1] = temp;
return new $.syntax('()', fn, comma.push(this(node[1])).unflatten())}},
infix_function = function (node) {return infix_function_flat.call(this, node) || infix_function_slash.call(this, node) || infix_function_bar.call(this, node)};
Postfix function application.
This is a bit simpler than infix function application and is used when you have a unary function. Sometimes it's simpler to think of a function as a filter than as a wrapper, and this macro
makes it easier to do that. This is particularly useful when you have many nested function calls, for instance if you're defining multi-level function composition:
| compose(f, g, h)(x) = x /!h /!g /!f // -> f(g(h(x)))
var postfix_function = $.rereplacer('_x /!_f', '_f(_x)');
Literal modification.
Caterwaul 1.1.2 introduces literal modification, which provides ways to reinterpret various types of literals at compile-time. These are always written as postfix property accesses, e.g.
/foo bar/.x -- here, 'x' is the modifier. Cool as it would be to be able to stack modifiers up, right now Caterwaul doesn't support this. Part of the reason is that I'm too lazy/uninsightful
to know how to do it performantly considering the present architecture, but another part of it is that the bugs would become strange and subtle. My goal is to keep the compilation process
reasonably transparent, and you can imagine the bizarre chain of events that would occur if someone wrote a modifier that, for instance, returned a different type of literal. It would be
utter chaos (though a really cool form of it).
Sadly, you can't modify object literals. The reason has to do with syntactic ambiguity. Suppose you've got a function like this:
| function () {
{foo: 'bar'}.modifier
return true;
}
This function fails to parse under SpiderMonkey, since it assumes that {foo: 'bar'} is a statement-level block with a label 'foo' and a discarded string literal 'bar'. Rather than open this
can of worms, I'm just nixing the whole idea of modifying object literals (besides, it doesn't seem particularly useful anyway, though perhaps I'm being myopic about it).
var modified_literal_form = $.pattern('_literal._modifier'),
lookup_literal_modifier = function (caterwaul, type, modifier) {var hash = caterwaul.literal_modifiers[type];
return hash.hasOwnProperty(modifier) && hash[modifier]},
literal_modifier = function (node) {var modified_literal = modified_literal_form.call(this, node), literal, expander;
if (modified_literal && (literal = modified_literal._literal) &&
(expander = literal.is_identifier() ? lookup_literal_modifier(this, 'identifier', modified_literal._modifier.data) :
literal.is_array() ? lookup_literal_modifier(this, 'array', modified_literal._modifier.data) :
literal.is_regexp() ? lookup_literal_modifier(this, 'regexp', modified_literal._modifier.data) :
literal.is_number() ? lookup_literal_modifier(this, 'number', modified_literal._modifier.data) :
literal.is_string() ? lookup_literal_modifier(this, 'string', modified_literal._modifier.data) :
null))
return expander.call(this, literal)};
Modifier syntax.
These are the 'structured forms' I was talking about above. Prior to caterwaul 1.1 these were stored as individual pre-expanded macros. This had a number of problems, perhaps most notably
that it was extremely inefficient. I loaded up caterwaul in the REPL and found that caterwaul.js_ui(caterwaul.js_all()) had 329 macros installed. This meant 329 tree-match tests for every
function.
Now modifiers are stored on the compiler function directly. Some modifiers take parameters, so there is always some degree of overhead involved in determining whether a modifier case does in
fact match. However, there are only a few checks that need to happen before determining whether a modifier match is possible, unlike before.
var bracket_modifier_form = $.pattern('_modifier[_expression]'), slash_modifier_form = $.pattern('_expression /_modifier'),
minus_modifier_form = $.pattern('_expression -_modifier'), in_modifier_form = $.pattern('_modifier in _expression'),
pipe_modifier_form = $.pattern('_expression |_modifier'), comma_modifier_form = $.pattern('_expression, _modifier'),
dot_parameters = $.pattern('_modifier._parameters'), bracket_parameters = $.pattern('_modifier[_parameters]'),
parameterized_wickets = $.pattern('_expression <_modifier> _parameters'), parameterized_minus = $.pattern('_expression -_modifier- _parameters'),
modifier = function (node) {var parameterized_match = parameterized_wickets.call(this, node) || parameterized_minus.call(this, node);
if (parameterized_match)
for (var es = this.parameterized_modifiers, i = es.length - 1, r; i >= 0; --i)
if (r = es[i].call(this, parameterized_match)) return r;
var regular_match = bracket_modifier_form.call(this, node) || slash_modifier_form.call(this, node) ||
minus_modifier_form .call(this, node) || in_modifier_form .call(this, node) ||
pipe_modifier_form .call(this, node) || comma_modifier_form.call(this, node);
if (regular_match) {
// Could still be a parameterized function; try to match one of the parameter forms against the modifier.
var parameter_match = dot_parameters .call(this, regular_match._modifier) ||
bracket_parameters.call(this, regular_match._modifier);
if (parameter_match) {
regular_match._modifier = parameter_match._modifier;
regular_match._parameters = parameter_match._parameters;
for (var es = this.parameterized_modifiers, i = es.length - 1, r; i >= 0; --i)
if (r = es[i].call(this, regular_match)) return r}
else
for (var es = this.modifiers, i = es.length - 1, r; i >= 0; --i)
if (r = es[i].call(this, regular_match)) return r}};
Tying it all together.
This is where we write a big macroexpander to perform all of the tasks mentioned above. It just falls through cases, which is now a fairly standard pattern for macros. There is a high-level
optimization that we can perform: leaf nodes can only be expanded by the string interpolator, so we try this one first and reject any further matching attempts if the node has no children.
Because roughly half of the nodes will have no children, this saves on average 5 matching attempts per node.
I've got two closures here to avoid putting a conditional in either one of them. In particular, we know already whether we got a macroexpander, so there's no need to test it inside the
function (which will be called lots of times).
var each_node = function (node) {return string_interpolator.call(this, node) || literal_modifier.call(this, node) ||
node.length && (modifier.call(this, node) || function_destructure.call(this, node) ||
infix_function.call(this, node) || postfix_function.call(this, node))},
result = macroexpander ? $(function (node) {return macroexpander.call(this, node) || each_node.call(this, node)}) :
$(each_node);
result.modifiers = [];
result.parameterized_modifiers = [];
result.literal_modifiers = {regexp: {}, array: {}, string: {}, number: {}, identifier: {}};
return result}})(caterwaul);
__
meta::sdoc('js::extensions/std/js-literals', <<'__');
Javascript literal notation | Spencer Tipping
Licensed under the terms of the MIT source code license
Introduction.
These macros provide some convenient literal notation for various Javascript literals. For obvious reasons they have names that are unlikely to collide with methods.
(function ($) {
$.js_literals = function (caterwaul_function) {
var function_template = $.parse('function (_) {return _body}');
Regular expression literals.
Right now we just support the 'x' flag, which causes all whitespace within the regular expression to be ignored. This is a straightforward preprocessing transformation, since we have access
to the regexp in string form anyway.
To make Javascript's regular expressions more useful I've also included the 'qf' modifier. This turns a regular expression into a matching function; for example, /foo/.qf becomes (function
(s) {return /foo/.exec(s)}).
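For instance, these are the intended expansions (illustrative):
| /foo bar/.x      // -> /foobar/ -- whitespace is stripped at compile-time
  /foo/.qf         // -> (function (_) {return /foo/.exec(_)})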
(function (r) {r.x = $.reexpander(function (node) {return node.with_data(node.data.replace(/\s+/g, ''))});
var call_exec_template = $.parse('_regexp.exec(_)');
r.qf = function (node) {return function_template.replace({_body: call_exec_template.replace({_regexp: node})})}})(caterwaul_function.literal_modifiers.regexp);
String literals.
There are a couple of things we can do with strings. First, there's the 'qw' modifier, which causes a string to be split into an array of words at compile-time. So, for instance, the
expression 'foo bar bif'.qw would be compiled into ['foo', 'bar', 'bif']. Another modifier is 'qh', which is like 'qw' but creates a hash instead. So 'foo bar bif baz'.qh would result in
{foo: 'bar', bif: 'baz'}. There's also qr, which converts from a string to a regular expression and does all of the appropriate escape conversions. Some care should be taken with this,
however, because not all regexp escapes are valid in strings. In particular, you can't do things like 'foo\[bar\]'.qr because \[ isn't recognized in strings.
Another modifier is 'qs', which is rarely used outside of the context of writing macros. The idea here is to have Caterwaul parse the string and return a reference to the parse tree. So, for
example, 'foo.bar'.qs is compiled into a reference to the parse tree for foo.bar. A caveat here is that the parse happens only once, so any mutations that happen to the syntax tree are
persisted across invocations. (Unlike the way that array and object literals are interpreted, which is to create a new array or object each time that node is evaluated.)
Functions can be written concisely using qf. This causes the string to be interpreted as the body of a function whose sole argument is called _. This may change at some point in the future.
(function (s) {s.qw = $.reexpander(function (node) {for (var array_node = new $.syntax('['), comma = new $.syntax(','), delimiter = node.data.charAt(0),
pieces = node.as_escaped_string().split(/\s+/), i = 0, l = pieces.length; i < l; ++i)
comma.push(new $.syntax(delimiter + pieces[i] + delimiter));
return array_node.push(comma.unflatten())});
s.qh = $.reexpander(function (node) {for (var hash_node = new $.syntax('{'), comma = new $.syntax(','), delimiter = node.data.charAt(0),
pieces = node.as_escaped_string().split(/\s+/), i = 0, l = pieces.length; i < l; i += 2)
comma.push(new $.syntax(':', new $.syntax(delimiter + pieces[i] + delimiter), new $.syntax(delimiter + pieces[i + 1] + delimiter)));
return hash_node.push(comma.unflatten())});
s.qr = $.reexpander(function (node) {return node.with_data('/' + node.as_escaped_string().replace(/\//g, '\\/') + '/')});
s.qs = function (node) {return new $.ref($.parse(node.as_unescaped_string()))};
s.qf = $.reexpander(function (node) {return function_template.replace({_body: $.parse(node.as_unescaped_string())})})})(caterwaul_function.literal_modifiers.string);
return caterwaul_function}})(caterwaul);
__
meta::sdoc('js::extensions/std/macro', <<'__');
Macro authoring utilities | Spencer Tipping
Licensed under the terms of the MIT source code license
Macro definitions.
A macro is simply a partial function from source trees to source trees. It returns a falsy value if it cannot be applied to a given tree; otherwise it returns the replacement (as shown
above). Because most macros end up replacing one pattern with another, caterwaul lets you use strings instead of requiring you to construct recognizer functions.
The expander() method distributes across arrays. That is, you can give it an array of things that can be converted into expanders (strings, functions, syntax trees, or arrays), and it will
build a function that tries each entry from last to first, returning the result of the first one that succeeds.
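For example, here's a sketch of the simplest usage (output formatting is approximate):
| var swap = caterwaul.replacer('_x + _y', '_y + _x');
  swap(caterwaul.parse('a + b'))     // -> the syntax tree for 'b + a'
  swap(caterwaul.parse('a * b'))     // -> falsy, since the pattern doesn't match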
(function ($) {
var syntax_manipulator = function (base_case) {
var result = function (x) {if (x.constructor === Array) {for (var i = 0, l = x.length, ys = []; i < l; ++i) ys.push(result(x[i]));
return function (tree) {for (var i = ys.length - 1, r; i >= 0; --i) if (r = ys[i].call(this, tree)) return r}}
else return x.constructor === String ? result($.parse(x)) :
x.constructor === $.syntax ? base_case.call(this, x) : x};
return result};
$.pattern = syntax_manipulator(function (pattern) {return function (tree) {return pattern.match(tree)}});
$.expander = syntax_manipulator(function (expander) {return function (match) {return expander.replace(match)}});
$.alternatives = syntax_manipulator(function (alternative) {throw new Error('must use replacer functions with caterwaul.alternatives()')});
$.reexpander = function (expander) {var e = $.expander(expander);
return function (match) {var r = e.call(this, match); return r && this(r)}};
var composer = function (expander_base_case) {
return function (pattern, expander) {var new_pattern = $.pattern(pattern), new_expander = expander_base_case(expander);
return function (tree) {var match = new_pattern.call(this, tree); return match && new_expander.call(this, match)}}};
$.replacer = composer($.expander);
$.rereplacer = composer($.reexpander);
Global macroexpansion.
This is a shorthand to enable one-off macroexpansion. The idea is that we build a temporary caterwaul function to do some temporary work.
$.macroexpand = function (tree) {return $($.alternatives(Array.prototype.slice.call(arguments, 1)))(tree)}})(caterwaul);
__
meta::sdoc('js::extensions/std/seq', <<'__');
Sequence comprehensions | Spencer Tipping
Licensed under the terms of the MIT source code license
Introduction.
Caterwaul pre-1.0 had a module called 'seq' that provided a finite and an infinite sequence class and localized operator overloading to make them easier to use. Using wrapper classes was both
unnecessary (since most sequence operations were done inside the seq[] macro anyway) and problematic, as it required the user to remember to cast sequences back into arrays and such. It also
reduced runtime performance and created a lot of unnecessary copying.
Caterwaul 1.0 streamlines the seq[] macro by removing the sequence classes and operating directly on arrays or array-like things. Not everything in Javascript is an array, but I'm going to
pretend that everything is (or at least looks like one) and rely on the [i] and .length properties. This allows the sequence library to (1) have a very thin design, and (2) compile down to
tight loops without function calls.
Distributive property.
The seq[] modifier distributes across several operators. They are:
| 1. Ternary ?:
2. Short-circuit && and ||
3. Parentheses
It won't cross a square-bracket or invocation boundary, however. This includes distributing over array elements and [] dereferencing. You can cause it to cross an array boundary by prefixing
the array with ~ (which should be familiar, as it is the same syntax that's used to cause function bodies to be interpreted in sequence context). For instance:
| [1, 2, 3, X] -seq // <- X is interpreted in regular Javascript context
~[1, 2, 3, X] -seq // <- X is interpreted in sequence context
Notation.
The notation is mostly a superset of the pre-1.0 sequence notation. Operators that have the same functionality as before (others are reserved for future meanings, but probably won't do what
they used to):
| *  = map               e.g. [1, 2, 3] *[x + 1] |seq            -> [2, 3, 4]
  *! = each              e.g. [1, 2, 3] *![console.log(x)] |seq  -> [1, 2, 3] (and logs 1, 2, 3)
  /  = foldl             e.g. [1, 2, 3] /[x - next] |seq         -> -4
  /! = foldr              e.g. [1, 2, 3] /![x - next] |seq        -> 2
  %  = filter            e.g. [1, 2, 3] %[x & 1] |seq            -> [1, 3]
  %! = filter-not        e.g. [1, 2, 3] %![x & 1] |seq           -> [2]
  +  = concatenate       e.g. [1, 2, 3] + [4, 5] |seq            -> [1, 2, 3, 4, 5]
  -  = cartesian product e.g. [1, 2] - [3, 4] |seq               -> [[1, 3], [1, 4], [2, 3], [2, 4]]
  ^  = zip               e.g. [1, 2, 3] ^ [4, 5, 6] |seq         -> [[1, 4], [2, 5], [3, 6]]
  |  = exists            e.g. [1, 2, 3] |[x === 2] |seq          -> true
Note that ^ has higher precedence than |, so we can use it in a sequence comprehension without interfering with the |seq macro (so long as the |seq macro is placed on the right).
Modifiers.
Modifiers are unary operators that come after the primary operator. These have the same (or similar) functionality as before:
| ~ = interpret something in sequence context e.g. [[1], [2], [3]] *~[x *[x + 1]] |seq -> [[2], [3], [4]]
x = rename the variable from 'x' e.g. [1, 2, 3] *y[y + 1] |seq -> [2, 3, 4]
Here, 'x' means any identifier. Caterwaul 1.0 introduces some new stuff. The map function now has a new variant, *~!. Filter also supports this variant. Like other operators, they support
variable renaming and sequence context. You can do this by putting those modifiers after the *~!; for instance, xs *~!~[exp] interprets 'exp' in sequence context. Similarly, *~!y[exp] uses
'y' rather than 'x'.
| *~! = flatmap e.g. [1, 2, 3] *~![[x, x + 1]] |seq -> [1, 2, 2, 3, 3, 4]
%~! = map/filter e.g. [1, 2, 3] %~![x & 1 && x + 1] |seq -> [2, 4]
/~! = unfold e.g. 1 /~![x < 5 ? x + 1 : null] |seq -> [1, 2, 3, 4, 5]
Variables.
All of the variables from before are still available and the naming is still mostly the same. Each block has access to 'x', which is the immediate element. 'xi' is the index, and 'x0' is the
alternative element for folds. Because all sequences are finite, a new variable 'xl' is available -- this is the total number of elements in the source sequence. The sequence object is no
longer accessible because there may not be a concrete sequence. (I'm leaving room for cross-operation optimizations in the future.) The renaming is done exactly as before:
| [1, 2, 3] *[x + 1] |seq -> [2, 3, 4]
[1, 2, 3] *y[y + 1] |seq -> [2, 3, 4]
[1, 2, 3] *[xi] |seq -> [0, 1, 2]
[1, 2, 3] *foo[fooi] |seq -> [0, 1, 2]
Word operators.
Some operators are designed to work with objects, just like in prior versions. However, the precedence has been changed to improve ergonomics. For example, it's uncommon to use objects as an
intermediate form because all of the sequence operators are built around arrays. Similarly, it's very common to unpack objects immediately before using them. Therefore the unpack operators
should be very high precedence and the pack operator should have very low precedence:
| {foo: 'bar'} /keys |seq -> ['foo']
{foo: 'bar'} /values |seq -> ['bar']
{foo: 'bar'} /pairs |seq -> [['foo', 'bar']]
{foo: 'bar'} /pairs |object |seq -> {foo: 'bar'}
Note that unlike regular modifiers you can't use a variety of operators with each word. Each one is defined for just one form. I may change this in the future, but I'm reluctant to start
with it because it would remove a lot of syntactic flexibility.
Update: After using this in the field, I've found that the low-precedence |object form is kind of a pill. Now the sequence library supports several variants, /object, -object, and |object.
Prefixes.
New in Caterwaul 1.0.3 is the ability to specify the scope of operation for sequence macros. For instance, you might want to operate on one of several types of data. Normally the sequence
macro assumes arrays, but you may want to modify a unary operator such as *[] to transform an object's keys or values. Prefixes let you do this.
| o %k*[x.substr(1)] -seq (equivalent to o /pairs *[[x[0].substr(1), x[1]]] -object -seq)
o %v*[x.split(/a/)] -seq (equivalent to o /pairs *[[x[0], x[1].split(/a/)]] -object -seq)
Prefixes are generally faster than manual unpacking and repacking. However, some operations (e.g. fold and its variants) don't work with prefixes. The reason is that it's unclear what to do
with the values that correspond to a folded key, for instance. (Imagine what this would mean: o %k/[x + x0] -seq) The following operators can be used with prefixes:
| * = map
*! = each <- returns the original object
% = filter <- removes key/value pairs
%! = filter-not
%~! = map-filter <- changes some key-value pairs, removes others
These operators support the standard set of modifiers, including ~ prefixing and variable renaming. However, indexing variables such as xi and xl are unavailable because no temporary arrays
are constructed.
The following operators cannot be used with prefixes because it's difficult to imagine what purpose they would serve:
| *~! = flatmap
/ = foldl
/! = foldr
/~! = unfold
None of the binary operators (e.g. +, -, ^, etc) can be used with prefixes because of precedence. Any prefix would bind more strongly to the left operand than it would to the binary
operator, which would disrupt the syntax tree.
Folding prefixes.
New in Caterwaul 1.1 is the ability to specify fold prefixes. This allows you to specify the initial element of a fold:
| xs /[0][x0 + x*x] -seq (sum the squares of each element)
xs /~[[]][x0 + [x, x + 1]] -seq (equivalent to xs *~![[x, x + 1]] -seq)
Function promotion.
Caterwaul 1.1 also adds support for implicit function promotion of sequence block expressions:
| f(x) = x + 1
seq in [1, 2, 3] *f
seq in [-1, 0, 1] %f
You can use this to make method calls, which will remain bound to the original object:
| xs *foo.bar -seq (equivalent to xs *[foo.bar(x)] -seq)
xs *(bar + bif).baz -seq (equivalent to xs *[(bar + bif).baz(x)] -seq)
The only restriction is that you can't use a bracketed expression as the last operator; otherwise it will be interpreted as a block. You also can't invoke a promoted function in sequence
context, since it is unclear what the intent would be.
Calling convention.
All functions you promote will always be called with these arguments, in this order:
| f(x, x0, xi, xl)
This may seem strange, since x0 may or may not be defined. I chose this setup to simplify code generation, even if it is a bit redundant. If x0 isn't provided by the current operator, then
its value will be undefined.
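For example (a formatting function defined just to show the argument order; for map, x0 comes through as undefined):
| f(x, x0, xi, xl) = xi + ': ' + x
  ['a', 'b'] *f -seq -> ['0: a', '1: b']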
Scope wrapping.
Normally sequences use thin compilation; that is, the body of each sequence element is inserted directly into a for-loop. This increases performance by eliminating a function call, but it
has the usual caveats about variable sharing. For instance:
| fs = [1, 2, 3] *[delay in x] -seq
fs[0]() -> 3 (counterintuitive)
fs[1]() -> 3 (counterintuitive)
fs[2]() -> 3 (expected)
The problem is that all three closures get the same value of 'x', which is a single lexically-scoped variable. To fix this, caterwaul 1.1 introduces the unary + modifier on blocks. This
wraps them in a closure to give each iteration its own lexical scope:
| fs = [1, 2, 3] *+[delay in x] -seq
fs[0]() -> 1
fs[1]() -> 2
fs[2]() -> 3
Numbers.
Caterwaul 1.0 removes support for the infinite stream of naturals (fun though it was), since all sequences are now assumed to be finite and are strictly evaluated. So the only macros
available are n[] and ni[], which generate finite sequences of evenly-spaced numbers. The only difference between n[] and ni[] is that ni[] uses an inclusive upper bound, whereas n[] is
exclusive.
| n[1, 10] -seq -> [1, 2, 3, 4, 5, 6, 7, 8, 9]
ni[1, 10] -seq -> [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n[10] -seq -> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
ni[10] -seq -> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n[0, 10, 2] -seq -> [0, 2, 4, 6, 8]
ni[0, 10, 2] -seq -> [0, 2, 4, 6, 8, 10]
Generated code.
Previously the code was factored into separate methods that took callback functions. (Basically the traditional map/filter/each arrangement in functional languages.) Now, however, the library
optimizes the methods out of the picture, which means it has to manage all of the dataflow between the different sequence operators itself. I thought about allocating gensym variables -- one for
each temporary result -- but that would mean the temporary results couldn't be garbage-collected until the entire sequence comprehension was complete. So instead it generates really gnarly code,
with each dependent sequence listed in the for-loop variable initialization.
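For the curious, here's roughly what [1, 2, 3] *[x + 1] -seq compiles into, with the gensyms for 'xs' and 'ys' renamed to something readable:
| (function (xs) {var x, x0, xi, xl; for (var ys = [], xi = 0, xl = xs.length; xi < xl; ++xi) x = xs[xi], ys.push((x + 1)); return ys}).call(this, [1, 2, 3])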
Luckily this won't matter because, like, there aren't any bugs or anything ;)
Portability.
The seq library is theoretically portable to syntaxes besides JS, but you'll probably want to do some aggressive preprocessing if you do this. It assumes a lot about operator precedence and
such (from a design perspective).
caterwaul.words(caterwaul.js())(function ($) {
$.seq(caterwaul_function) = caterwaul_function -se-
it.modifiers.push(given.match in seq_expand.call(seq_expand, anon_pattern.replace({_x: match._expression})) -re- this(it) /when.it
-when [match._modifier.data === 'seq'])
-where [anon_pattern = anon('S[_x]'),
seq_expand = $($.alternatives(operator_macros.concat(word_macros)))],
where [anon = $.anonymizer('S'),
rule(p, e) = $.rereplacer(p.constructor === String ? anon(p) : p, e.constructor === String ? anon(e) : e),
operator_macros = [rule('S[_x]', '_x'), rule('S[_xs + _ys]', concat), rule('S[_xs ^ _ys]', zip), rule('S[_xs - _ys]', cross),
// Distributive property
rule('S[(_x)]', '(S[_x])'), rule('S[_x[_y]]', 'S[_x][_y]'), rule('S[_xs(_ys)]', 'S[_xs](_ys)'),
rule('S[[_x]]', '[_x]'), rule('S[_x, _y]', 'S[_x], S[_y]'), rule('S[_xs._p]', 'S[_xs]._p'),
rule('S[~[_x]]', '[S[_x]]'), // <- ~ modifier on arrays
rule('S[_x ? _y : _z]', '(S[_x]) ? (S[_y]) : (S[_z])'), rule('S[_x && _y]', '(S[_x]) && (S[_y])'), rule('S[_x || _y]', '(S[_x]) || (S[_y])'),
// Unary seq operators
rule('S[_xs %_thing]', handle_filter_forms), rule('S[_xs *_thing]', handle_map_forms),
rule('S[_xs /_thing]', handle_fold_forms), rule('S[_xs |_thing]', handle_exists_forms),
rule('S[_xs %k*_thing]', handle_kmap_forms), rule('S[_xs %v*_thing]', handle_vmap_forms),
rule('S[_xs %k%_thing]', handle_kfilter_forms), rule('S[_xs %v%_thing]', handle_vfilter_forms)]
-where [// High-level form specializations
unrecognized(reason) = raise [new Error(reason)],
use_form(form, xs, body, init, vars) = form ? form.replace({_f: body, _init: init}).replace($.merge({_xs: xs}, vars)) :
unrecognized('unsupported sequence operator or modifiers used on #{body}'),
operator_case(forms)(match) = parse_modifiers(match._thing,
use(forms.normal, forms.inormal), use(forms.bang, forms.ibang), use(forms.tbang, forms.itbang))
-where [xs = match._xs,
expander = this,
form_function(form)(body, vars) = use_form(form, xs, body, null, vars),
iform_function(form)(body, init, vars) = use_form(form, xs, body, init, vars),
use(form, iform)(body) = parse_body(body, expander, form_function(form), iform_function(iform))],
handle_map_forms = operator_case({normal: map, bang: each, tbang: flatmap}),
handle_filter_forms = operator_case({normal: filter, bang: filter_not, tbang: map_filter}),
handle_fold_forms = operator_case({normal: foldl, bang: foldr, tbang: unfold, inormal: ifoldl, ibang: ifoldr}),
handle_kmap_forms = operator_case({normal: kmap, bang: keach}),
handle_kfilter_forms = operator_case({normal: kfilter, bang: kfilter_not, tbang: kmap_filter}),
handle_vmap_forms = operator_case({normal: vmap, bang: veach}),
handle_vfilter_forms = operator_case({normal: vfilter, bang: vfilter_not, tbang: vmap_filter}),
handle_exists_forms = operator_case({normal: exists}),
// Body parsing
block = anon('[_x]'),
block_with_variable = anon('_var[_x]'),
block_with_init = anon('[_init][_x]'),
block_with_variable_and_init = anon('_var[_init][_x]'),
block_with_closure = anon('+_x'),
block_with_seq = anon('~_x'),
standard_names = {_x: 'x', _x0: 'x0', _xi: 'xi', _xl: 'xl'},
prefixed_names(p) = {_x: p , _x0: '#{p}0', _xi: '#{p}i', _xl: '#{p}l'},
function_promotion = anon('_f(_x, _x0, _xi, _xl)'),
promote_function(f) = function_promotion.replace({_f: f}),
closure_wrapper = anon('(function (_x, _x0, _xi, _xl) {return _f}).call(this, _x, _x0, _xi, _xl)'),
close_body(vars, f) = closure_wrapper.replace(vars).replace({_f: f}),
seq_pattern = anon('S[_x]'),
promote_seq(f) = seq_pattern.replace({_x: f}),
parse_body(tree, expand, normal, init) = ((r = block_with_seq.match(tree)) ? parse_body(r._x, expand, sequence_context_normal, sequence_context_init) :
(r = block_with_closure.match(tree)) ? parse_body(r._x, expand, wrapping_normal, wrapping_init) :
(r = block_with_variable_and_init.match(tree)) ? init(r._x, r._init, prefixed_names(r._var)) :
(r = block_with_init.match(tree)) ? init(r._x, r._init, standard_names) :
(r = block_with_variable.match(tree)) ? normal(r._x, prefixed_names(r._var)) :
(r = block.match(tree)) ? normal(r._x, standard_names) :
normal(promote_function(tree), standard_names))
-where [in_sequence_context(f) = expand.call(expand, promote_seq(f)),
sequence_context_normal(f, names) = normal(in_sequence_context(f), names),
sequence_context_init(f, init_expression, names) = init (in_sequence_context(f), init_expression, names),
wrapping_normal(f, names) = normal(close_body(names, f), names),
wrapping_init(f, init_expression, names) = init (close_body(names, f), init_expression, names),
r = null],
// Modifier parsing
tbang_modifier = anon('~!_x'),
bang_modifier = anon('!_x'),
parse_modifiers(tree, normal, bang, tbang) = ((result = tbang_modifier.match(tree)) ? tbang(result._x) :
(result = bang_modifier.match(tree)) ? bang(result._x) : normal(tree)) -where [result = null]]
-where [// Setup for form definitions (see below)
loop_anon = $.anonymizer('xs', 'ys', 'x', 'y', 'i', 'j', 'l', 'lj', 'r', 'o', 'k'),
scope = anon('(function (xs) {var _x, _x0, _xi, _xl; _body}).call(this, S[_xs])'),
scoped(t) = scope.replace({_body: t}),
form(x) = loop_anon(scoped(anon(x))),
// Form definitions
map = form('for (var ys = [], _xi = 0, _xl = xs.length; _xi < _xl; ++_xi) _x = xs[_xi], ys.push((_f)); return ys'),
each = form('for (var _xi = 0, _xl = xs.length; _xi < _xl; ++_xi) _x = xs[_xi], (_f); return xs'),
flatmap = form('for (var ys = [], _xi = 0, _xl = xs.length; _xi < _xl; ++_xi) _x = xs[_xi], ys.push.apply(ys, ys.slice.call((_f))); return ys'),
filter = form('for (var ys = [], _xi = 0, _xl = xs.length; _xi < _xl; ++_xi) _x = xs[_xi], (_f) && ys.push(_x); return ys'),
filter_not = form('for (var ys = [], _xi = 0, _xl = xs.length; _xi < _xl; ++_xi) _x = xs[_xi], (_f) || ys.push(_x); return ys'),
map_filter = form('for (var ys = [], _xi = 0, _xl = xs.length, _y; _xi < _xl; ++_xi) _x = xs[_xi], (_y = (_f)) && ys.push(_y); return ys'),
foldl = form('for (var _x0 = xs[0], _xi = 1, _xl = xs.length; _xi < _xl; ++_xi) _x = xs[_xi], _x0 = (_f); return _x0'),
foldr = form('for (var _xl = xs.length, _xi = _xl - 2, _x0 = xs[_xl - 1]; _xi >= 0; --_xi) _x = xs[_xi], _x0 = (_f); return _x0'),
unfold = form('for (var ys = [], _x = xs, _xi = 0; _x !== null; ++_xi) ys.push(_x), _x = (_f); return ys'),
ifoldl = form('for (var _x0 = (_init), _xi = 0, _xl = xs.length; _xi < _xl; ++_xi) _x = xs[_xi], _x0 = (_f); return _x0'),
ifoldr = form('for (var _xl = xs.length - 1, _xi = _xl, _x0 = (_init); _xi >= 0; --_xi) _x = xs[_xi], _x0 = (_f); return _x0'),
exists = form('for (var _x = xs[0], _xi = 0, _xl = xs.length, x; _xi < _xl; ++_xi) {_x = xs[_xi]; if (x = (_f)) return x} return false'),
concat = anon('(S[_xs]).concat((S[_ys]))'),
zip = form('for (var ys = (S[_ys]), pairs = [], i = 0, l = xs.length; i < l; ++i) pairs.push([xs[i], ys[i]]); return pairs'),
cross = form('for (var ys = (S[_ys]), pairs = [], i = 0, l = xs.length, lj = ys.length; i < l; ++i) ' +
'for (var j = 0; j < lj; ++j) pairs.push([xs[i], ys[j]]);' + 'return pairs'),
kmap = form('var r = {}; for (var _x in xs) if (Object.prototype.hasOwnProperty.call(xs, _x)) r[_f] = xs[_x]; return r'),
keach = form(' for (var _x in xs) if (Object.prototype.hasOwnProperty.call(xs, _x)) _f; return xs'),
kfilter = form('var r = {}; for (var _x in xs) if (Object.prototype.hasOwnProperty.call(xs, _x) && (_f)) r[_x] = xs[_x]; return r'),
kfilter_not = form('var r = {}; for (var _x in xs) if (Object.prototype.hasOwnProperty.call(xs, _x) && ! (_f)) r[_x] = xs[_x]; return r'),
kmap_filter = form('var r = {}, x; for (var _x in xs) if (Object.prototype.hasOwnProperty.call(xs, _x) && (x = (_f))) r[x] = xs[_x]; return r'),
vmap = form('var r = {}; for (var k in xs) if (Object.prototype.hasOwnProperty.call(xs, k)) _x = xs[k], r[k] = (_f); return r'),
veach = form(' for (var k in xs) if (Object.prototype.hasOwnProperty.call(xs, k)) _x = xs[k], _f; return xs'),
vfilter = form('var r = {}; for (var k in xs) if (Object.prototype.hasOwnProperty.call(xs, k)) _x = xs[k], (_f) && (r[k] = _x); return r'),
vfilter_not = form('var r = {}; for (var k in xs) if (Object.prototype.hasOwnProperty.call(xs, k)) _x = xs[k], (_f) || (r[k] = _x); return r'),
vmap_filter = form('var r = {}, x; for (var k in xs) if (Object.prototype.hasOwnProperty.call(xs, k)) _x = xs[k], x = (_f), x && (r[k] = x); return r')],
word_macros = [rule('S[n[_upper]]', n), rule('S[ni[_upper]]', ni), rule('S[_o /keys]', keys), rule('S[_o |object]', object),
rule('S[n[_lower, _upper]]', n), rule('S[ni[_lower, _upper]]', ni), rule('S[_o /values]', values), rule('S[_o -object]', object),
rule('S[n[_lower, _upper, _step]]', n), rule('S[ni[_lower, _upper, _step]]', ni), rule('S[_o /pairs]', pairs), rule('S[_o /object]', object)]
-where [n(match) = n_pattern .replace($.merge({_lower: '0', _step: '1'}, match)),
ni(match) = ni_pattern.replace($.merge({_lower: '0', _step: '1'}, match)),
n_pattern = anon('(function (i, u, s) {if ((u - i) * s <= 0) return [];' + // Check for degenerate iteration
'for (var r = [], d = u - i; d > 0 ? i < u : i > u; i += s) r.push(i); return r})((_lower), (_upper), (_step))'),
ni_pattern = anon('(function (i, u, s) {if ((u - i) * s <= 0) return [];' + // Check for degenerate iteration
'for (var r = [], d = u - i; d > 0 ? i <= u : i >= u; i += s) r.push(i); return r})((_lower), (_upper), (_step))'),
scope = anon('(function (o) {_body}).call(this, (S[_o]))'),
scoped(t) = scope.replace({_body: t}),
form(p) = tree.replace(match) -given.match -where [tree = scoped(anon(p))],
keys = form('var ks = []; for (var k in o) Object.prototype.hasOwnProperty.call(o, k) && ks.push(k); return ks'),
values = form('var vs = []; for (var k in o) Object.prototype.hasOwnProperty.call(o, k) && vs.push(o[k]); return vs'),
pairs = form('var ps = []; for (var k in o) Object.prototype.hasOwnProperty.call(o, k) && ps.push([k, o[k]]); return ps'),
object = form('for (var r = {}, i = 0, l = o.length, x; i < l; ++i) x = o[i], r[x[0]] = x[1]; return r')]]})(caterwaul);
__
meta::sdoc('js::extensions/std/words', <<'__');
Common adjectives and adverbs | Spencer Tipping
Licensed under the terms of the MIT source code license
Introduction.
This behavior installs a bunch of common words and sensible behaviors for them. The goal is to handle most Javascript syntactic cases by using words rather than Javascript primitive syntax.
For example, constructing lambdas can be done with 'given' rather than the normal function() construct:
| [1, 2, 3].map(x + 1, given[x]) // -> [1, 2, 3].map(function (x) {return x + 1})
In this case, given[] is registered as a postfix binary adverb. Any postfix binary adverb forms added later will extend the possible uses of given[].
(function ($) {
$.words = function (caterwaul_function) {
var filtered_expander = function (word, expander) {return function (match) {return match._modifier.data === word && expander.call(this, match)}},
modifier = function (word, expander) {caterwaul_function.modifiers .push(filtered_expander(word, expander))},
parameterized_modifier = function (word, expander) {caterwaul_function.parameterized_modifiers.push(filtered_expander(word, expander))};
Quotation.
qs[] comes from pre-1.0 caterwaul; this lets you quote a piece of syntax, just like quote in Lisp. The idea is that qs[something] returns 'something' as a syntax tree. qse[] is a variant
that macroexpands the syntax tree before returning it; this used to be there for performance reasons (now irrelevant with the introduction of precompilation) but is also useful for macro
reuse.
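For example, this gives you a syntax tree rather than a number (the root node's data is the '+' operator):
| var tree = qs[x + 1];
  tree.data -> '+'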
modifier('qs', function (match) {return new $.ref(match._expression, 'qs')});
modifier('qse', function (match) {return new $.ref(this(match._expression), 'qse')});
Macroexpansion control.
Sometimes it's useful to request an additional macroexpansion or suppress macroexpansion for a piece of code. The 'reexpand' and 'noexpand' modifiers do these two things, respectively.
modifier('reexpand', function (match) {return this(this(match._expression))});
modifier('noexpand', function (match) {return match._expression});
Error handling.
Javascript in particular has clunky error handling constructs. These words provide error handling in expression context.
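For example (parse_json and nonempty are hypothetical helpers; inside rescue, 'e' refers to the caught error):
| parse_json(s) = JSON.parse(s) -rescue- null
  nonempty(xs) = xs.length ? xs : raise [new Error('empty list')]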
modifier ('raise', $.reexpander('(function () {throw _expression}).call(this)'));
parameterized_modifier('rescue', $.reexpander('(function () {try {return (_expression)} catch (e) {return (_parameters)}}).call(this)'));
Evaluation.
Caterwaul 1.1.2 introduces the 'eval' modifier, which lets you force certain expressions to be evaluated at compile-time. A reference containing the resulting value is dropped into the code,
and any errors are reported as compile-time errors. The expression being evaluated is macroexpanded under the compiling caterwaul function.
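For example, this bakes the build time into the compiled code as a string constant (just an illustration):
| var build_time = new Date().toString() -eval;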
modifier('eval', function (match) {return new $.ref($.compile(this(match._expression)), 'eval')});
Scoping and referencing.
These all impact scope or references somehow -- in other words, they create variable references but don't otherwise impact the nature of evaluation.
Function words.
These define functions in some form. given[] and bgiven[] are modifiers to turn an expression into a function; given[] creates a regular closure while bgiven[] preserves the closure binding.
For example:
| var f = x + 1 -given [x];
var f = x + 1 -given.x;
parameterized_modifier('given', $.reexpander('(function (_parameters) {return _expression})'));
parameterized_modifier('bgiven', $.reexpander('(function (t, f) {return (function () {return f.apply(t, arguments)})})(this, (function (_parameters) {return _expression}))'));
Nullary function words.
These are used to provide quick function wrappers for values. There are actually a couple of possibilities here. One is to wrap a value in a nullary function that recomputes its expression
each time, and another is to compute the value lazily and return the cached value for each future invocation. The modifiers are called 'delay' and 'lazy', and they always bind to the
surrounding context (analogous to bgiven, above).
Here are their operational semantics by example:
| var x = 10;
var f = ++x -delay;
f() -> 11
f() -> 12
var g = ++x -lazy;
g() -> 13
g() -> 13
modifier('delay', $.reexpander('(function (t, f) {return (function () {return f.call(t)})})(this, (function () {return _expression}))'));
modifier('lazy', $.reexpander('(function (t, f, v, vc) {return (function () {return vc ? v : (vc = true, v = f.call(t))})})(this, (function () {return _expression}))'));
Side-effecting.
The goal here is to take an existing value, modify it somehow, and then return it without allocating an actual variable. This can be done using the /se[] adverb. Older versions of caterwaul
bound the variable as _; version 1.0 changes this convention to bind the variable to 'it'. For example:
| hash(k, v) = {} /se[it[k] = v];
compose(f, g)(x) = g(x) -re- f(it);
parameterized_modifier('se', $.reexpander('(function (it) {return (_parameters), it}).call(this, (_expression))'));
parameterized_modifier('re', $.reexpander('(function (it) {return (_parameters)}).call(this, (_expression))'));
Scoping.
You can create local variables by using the where[] modifier. If you do this, the locals can all see each other since they're placed into a 'var' statement. For example:
| where[x = 10][alert(x)]
alert(x), where[x = 10]
parameterized_modifier('where', $.reexpander('(function () {var _parameters; return (_expression)}).call(this)'));
Object construction.
This is similar to where[], but constructs a hash object instead of binding local variables. The idea is to be able to use the f(x) = x + 1 function notation but end up with an object. You
can also use regular assignments, each of which will be converted into a key/value pair:
| var o = capture [f(x) = 10, g(x)(y) = x + y];
o.g(10)(20) // -> 30
modifier('capture', function (match) {for (var r = new $.syntax('{'), comma = new $.syntax(','), bindings = match._expression.flatten(','), i = 0, l = bindings.length; i < l; ++i)
comma.push(this(bindings[i]).with_data(':'));
return r.push(comma.unflatten())});
Importation.
This is a fun one. Caterwaul 1.1.2 introduces the 'using' modifier, which lets you statically import an object. For example:
| log(x) -using- console // -> (function () {var log = console.log; return log(x)}).call(this)
Variables are computed at compile-time, not at runtime. This is much better than using the 'with' keyword, which degrades performance ('using' has no significant performance impact).
However, the calling context is incomplete, as shown above. In particular, methods of the object that you're using will be called with a global 'this' rather than being bound to the object.
var scope_template = $.parse('(function () {var _variables; return _expression}).call(this)');
parameterized_modifier('using', $.reexpander(function (match) {var o = $.compile(this(match._parameters)), comma = new $.syntax(',');
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) comma.push(new $.syntax('=', k, new $.ref(o[k])));
return scope_template.replace({_variables: comma.unflatten(), _expression: match._expression})}));
Control flow modifiers.
These impact how something gets evaluated.
Conditionals.
These impact whether an expression gets evaluated. x /when.y evaluates to x when y is truthy, and to y (which is falsy) otherwise. Similarly, x /unless[y] evaluates to x when y is falsy, and to
false when y is truthy.
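For example (send, retry, and attempts are hypothetical; this just shows the two forms):
| send(m) /when [m.length > 0]
  retry() /unless [attempts > 3]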
parameterized_modifier('when', $.reexpander('((_parameters) && (_expression))'));
parameterized_modifier('unless', $.reexpander('(! (_parameters) && (_expression))'));
return caterwaul_function}})(caterwaul);
__
meta::sdoc('js::extensions/ui', <<'__');
Caterwaul UI macros | Spencer Tipping
Licensed under the terms of the MIT source code license
(caterwaul.ui_initializer = function () {
DOM libraries.
Right now I've only got a set of combinators for jQuery.
- pinclude pp::js::extensions/ui/dom.jquery
})();
__
meta::sdoc('js::extensions/ui/dom.jquery', <<'__');
JQuery DOM combinators | Spencer Tipping
Licensed under the terms of the MIT source code license
Introduction.
DOM drivers are macro systems that transform HAML-like markup into underlying DOM calls. For instance:
| div.foo /jquery -> $('<div>').addClass('foo')
  table(tr(td('hi')), tbody) /jquery -> $('<table>').append($('<tr>').append($('<td>').append('hi')).add($('<tbody>')))
None of the macroexpansions here rely on opaque syntax refs, so they can all be precompiled. Also, the generated code refers to jQuery rather than $ -- this gives you more flexibility about
setting noConflict(). If you need to set noConflict(true) (which removes the global jQuery variable), you can bind it locally to make the DOM stuff work:
| div.foo /jquery -where [jQuery = stashed_copy_of_jquery]
Notation.
Caterwaul didn't previously have a DOM plugin in its core distribution. The html[] macro in previous versions of caterwaul came from montenegro, a web framework I was developing in tandem with
caterwaul. However, it's useful to have DOM functionality so I'm including it in the main caterwaul distribution.
Most of the syntax is copied from the html[] macro in montenegro:
| element.class -> $('<element>').addClass('class')
  element *foo('bar') -> $('<element>').attr('foo', 'bar')
  element *!foo('bar') -> $('<element>').data('foo', 'bar') <- new!
  element /foo('bar') -> $('<element>').foo('bar')
  element /!foo(bar) -> $('<element>').bind('foo', bar) <- new!
  +element -> element <- new!
  element %foo -> foo($('<element>'))
  element(child) -> $('<element>').append(child /jquery) <- here the /jquery marker indicates that 'child' will be re-expanded
  element(child1, child2) -> $('<element>').append((child1 /jquery).add((child2 /jquery)))
  element[child] -> $('<element>').append(child) <- no re-expansion here
  element[child1, child2] -> $('<element>').append(child1.add(child2))
  element > child -> $('<element>').append(child /jquery)
  element >= child -> $('<element>').append(child)
element1, element2 -> (element1 /jquery).add((element2 /jquery))
There's also some new syntax to make certain things easier. In particular, I didn't like the way nesting worked in previous versions, so this driver supports some new operators to make it more
intuitive:
| element1 + element2 -> (element1 /jquery).add((element2 /jquery))
The result of this operator is that you have options as far as nesting is concerned:
| div.foo > span.first + span.second, ->
div.bar > span.third + span.fourth
Also, you can now dig through the DOM using HTML selectors. Here's what that looks like:
| element >> div.foo -> element.filter('div.foo')
element >> _.foo -> element.filter('*.foo')
element >>> div.foo -> element.find('div.foo')
element << div.foo -> element.parents('div.foo')
element >> div.foo /first -> element.filter('div.foo:first')
element >> div.foo /contains(x) -> element.filter('div.foo:contains("#{x}")')
element >> div.foo + div.bar -> element.filter('div.foo, div.bar')
element >> (span >> p) -> element.filter('span p')
element >> (span >>> p) -> element.filter('span p')
element >> (span > p) -> element.filter('span > p')
element >> span[foo] -> element.filter('span[foo]')
element >> span[data_bar] -> element.filter('span[data-bar]') <- note conversion of _ to -
element >> span[foo=x] -> element.filter('span[foo="#{string_escape(x)}"]')
Note that this isn't really intended to be a replacement for jQuery's builtin methods; it's just an easy way to do some simple selection. I highly recommend using native jQuery selectors if
you need something more powerful.
You shouldn't try to get too elaborate with these; I'm not sure how much stuff jQuery's CSS parser can handle. Also note that CSS3's operator precedence differs from Javascript's. In
particular, doing things like div > span + div > code is incorrect because it will be parsed as 'div > (span, div) > code' (though it may render properly as a CSS pattern). It's a good idea to
parenthesize in this case, just to communicate your intent to whoever's reading your code. Caterwaul removes the parentheses to make it a valid CSS selector.
Unlike the montenegro html[] macro, this one doesn't do any autodetection. The good part about this is that you can create custom HTML elements this way. For example:
| my_element /jquery -> $('<my-element>') <- note the conversion of _ to -; this happens in class and attribute names too
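Putting a few of these pieces together, you might write something like this (hide_dialog is a hypothetical click handler):
| div.dialog(h1('Settings'), button.ok('OK') /!click(hide_dialog)) -jquery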
caterwaul.words(caterwaul.js())(function ($) {
$.jquery(caterwaul_function) = caterwaul_function -se-
it.modifiers.push(given.match in jquery_expand.call(jquery_expand, anon_pattern.replace({_x: match._expression})) -re- this(it) /when.it
-when [match._modifier.data === 'jquery'])
-where [anon_pattern = anon('J[_x]'),
jquery_expand = $($.alternatives(jquery_macros.concat(string_macros).concat(search_macros)))],
Transforms.
There are a lot of stages here, but most of them are fairly trivial. The first, J[], is used to indicate that something needs to be expanded under the jquery grammar. This is responsible for
turning elements into jQuery calls, dot operators into classes, etc, and it does most of the heavy lifting. The other large stage is P[], which converts the pattern language into a jQuery
CSS selector.
The small stages are S[], which just turns something into a string with underscore-to-dash conversion; TS[], which turns something into a tag-string (e.g. TS[foo] = "<foo>"); and PS[], which
quotes a compiled pattern.
where [jq = 'jQuery',
anon = $.anonymizer('J', 'TS', 'S', 'P', 'PS'),
hyphenate(s) = s.replace(/_/g, '-'),
rule(p, e) = $.rereplacer(anon(p), e.constructor === Function ? e.call(this, match) -given.match : anon(e)),
p = where [p_pattern = anon('P[_thing]')] in p_pattern.replace({_thing: node}) -given.node,
jquery_macros = [rule('J[_element]', given.match [match._element.is_constant() || match._element.length ? wrap_in_jquery(match) : become_dom_node(match)]),
rule('J[_element._class]', 'J[_element].addClass(S[_class])'),
rule('J[_element *_attr(_val)]', 'J[_element].attr(S[_attr], _val)'),
rule('J[_element *!_name(_val)]', 'J[_element].data(S[_name], _val)'),
rule('J[_element /_method(_args)]', 'J[_element]._method(_args)'),
rule('J[_element /!_event(_args)]', 'J[_element].bind(S[_event], _args)'),
rule('J[_element %_function]', '_function(J[_element])'),
rule('J[_element(_children)]', 'J[_element].append(J[_children])'),
rule('J[_element[_children]]', 'J[_element].append(_children)'),
rule('J[_element > _child]', 'J[_element].append(J[_child])'),
rule('J[_element >= _child]', 'J[_element].append(_child)'),
rule('J[_element1, _element2]', 'J[_element1].add(J[_element2])'),
rule('J[_element1 + _element2]', 'J[_element1].add(J[_element2])'),
rule('J[_element >> _pattern]', 'J[_element].filter(PS[_pattern])'),
rule('J[_element >>> _pattern]', 'J[_element].find(PS[_pattern])'),
rule('J[_element << _pattern]', 'J[_element].parents(PS[_pattern])'),
rule('J[(_element)]', '(J[_element])'),
rule('J[[_element]]', '[J[_element]]'),
rule('J[+_expression]', '_expression')]
-where [dom_node_template = anon('#{jq}(TS[_element])'), jquery_template = anon('#{jq}("<span>" + (_element) + "</span>")'),
become_dom_node(match) = dom_node_template.replace(match), wrap_in_jquery(match) = jquery_template.replace(match)],
string_macros = [rule('TS[_identifier]', string('<#{hyphenate(match._identifier.data)}>') -given.match),
rule('S[_identifier]', string( hyphenate(match._identifier.data)) -given.match),
rule('PS[_identifier]', string( expand(p(match._identifier)).data) -given.match)]
-where [string(s) = new $.syntax('"' + s.replace(/\\/g, '\\\\').replace(/"/g, '\\"') + '"')],
search_macros = [rule('P[_element]', new $.syntax(hyphenate(match._element.data -re [it === '_' ? '*' : it])) -given.match),
rule('P[_element._class]', new $.syntax('#{this(p(match._element)).data}.#{hyphenate(match._class.data)}') -given.match),
rule('P[_element[_attributes]]', new $.syntax('#{this(p(match._element)).data}[#{this(p(match._attributes))}]') -given.match),
rule('P[_attribute = _value]', new $.syntax('#{this(p(match._attribute)).data}="#{' + interpolated(match._value) + '}"') -given.match),
rule('P[(_element)]', 'P[_element]'), // No paren support
rule('P[_element1 + _element2]', binary(', ')),
rule('P[_element1, _element2]', binary(', ')),
rule('P[_element1 >> _element2]', binary(' ')),
rule('P[_element1 >>> _element2]', binary(' ')),
rule('P[_element1 > _element2]', binary(' > ')),
rule('P[_element1(_element2)]', binary(' > ')),
rule('P[_element /_selector]', new $.syntax('#{expand(p(match._element)).data}:#{hyphenate(match._selector.data)}') -given.match),
rule('P[_element /_selector(_value)]', new $.syntax('#{expand(p(match._element)).data}:#{hyphenate(match._selector.data)}("#' +
'{' + interpolated(match._value) + '}")') -given.match)]
-where [interpolated(node) = '(#{node.toString()}).replace(/(\\)/g, "$1$1").replace(/(")/g, "\\$1")',
binary(op)(match) = new $.syntax('#{expand(p(match._element1)).data}#{op}#{expand(p(match._element2)).data}')]]})(caterwaul);
__
meta::sdoc('js::web/benchmark', <<'__');
$('#benchmark').append(jquery [table(tr(td(a.caterwaul_core('+ caterwaul core initialization') %clickable), td.core_result),
tr(td(a.caterwaul_std('+ compile std extension') %clickable), td.std_result),
tr(td(a.caterwaul_ui('+ compile ui extension') %clickable), td.ui_result),
tr(td(a.caterwaul_init('+ create compiler') %clickable), td.init_result),
tr(td(a.caterwaul_parse_core('parse caterwaul core') %clickable), td.parse_core_result),
tr(td(a.caterwaul_parse_std('parse std extension') %clickable), td.parse_std_result),
tr(td(a.caterwaul_parse_ui('parse ui extension') %clickable), td.parse_ui_result))]
-se- functions %k*![setup_link(it, x)] /seq
-where [clickable(e) = e.attr('href', 'javascript:void(0)'),
setup_link(table, name) = setTimeout(delay in
table.find('.caterwaul-#{name}').click(delay in true
-where [cell = table.find('.#{name}-result'),
each(x) = cell.text(n[x] *['|'] -seq -re- it.join('')),
end(ms) = cell.text('#{ms}ms'),
result = benchmark(functions[name], each, end)]), 10),
functions = {core: delay in caterwaul.clone(),
init: delay in caterwaul.jquery(caterwaul.js_all()),
std: delay in caterwaul.std_initializer(),
ui: delay in caterwaul.ui_initializer(),
'parse-core': delay in caterwaul.parse(caterwaul.initializer),
'parse-std': delay in caterwaul.parse(caterwaul.std_initializer),
'parse-ui': delay in caterwaul.parse(caterwaul.ui_initializer)},
benchmark(f, each, end) = f -where [trials = 4,
start = +new Date(),
shortly(f) = setTimeout(f, 1),
result = n[trials] *!+[shortly(delay in each(xl - --trials) -se-
f() -se- end((+new Date() - start) / xl) /unless.trials)] -seq]]);
__
meta::sdoc('js::web/code-snippets', <<'__');
Code snippet initialization.
This runs after the page is fully loaded. The idea is to setup clickability for each code snippet.
setTimeout(linkify_code_snippets, 0),
where [linkify_snippet(s) = s.click(send_code_to_prompt),
send_code_to_prompt() = $('.shell .prompt .input').val($(this).text())
-se- $('.shell').click(),
linkify_code_snippets() = $('#tutorial-page pre.code') *[linkify_snippet($(x))] /seq];
__
meta::sdoc('js::web/header', <<'__');
Page header.
This is basically just a navigation container.
var page_header = div.header(div.title(span.caterwaul('caterwaul'), span.js('the ', span.accent('edge'), ' of javascript'))) -jquery;
__
meta::sdoc('js::web/main', <<'__');
Caterwaul JS web interface | Spencer Tipping
Licensed under the terms of the MIT source code license
$('#cover .status').text('loading tutorial content');
$(caterwaul.jquery(caterwaul.js_all())(function () {
var original_html = $('body').html(),
original_pages = $('#tutorial-page, #sdoc-page'),
original_styles = $('style, link[rel="stylesheet"]');
$('#cover .status').text('parsing state');
var construct_page = function (original_html) {
- pinclude pp::js::web/header
- pinclude pp::js::web/shell
- pinclude pp::js::web/code-snippets
- pinclude pp::js::web/state
- pinclude pp::js::web/sdoc
- pinclude pp::js::web/source
- pinclude pp::js::web/render-tutorial
- pinclude pp::js::web/benchmark
- pinclude pp::js::web/seq-decipher
$('head').append(jquery in title('caterwaul js'));
$('body').empty().append(page_header, original_pages);
original_styles.appendTo('head')};
$.get(document.location.href, construct_page).error(delay in construct_page(original_html))}));
__
meta::sdoc('js::web/render-tutorial', <<'__');
Building the tutorial.
All we have to do here is create a div to contain the tutorial and populate it with the SDoc obtained by parsing the object state. (This HTML file is actually a self-modifying Perl
object -- long story.) We also build a table of contents.
$('#tutorial-page').append(top_wrapper_for(toc, 'Contents'), tutorial, shell())
-where [tutorial_attribute = attributes |[x.namespace === 'sdoc' && x.attribute === 'web/tutorial' && x] |seq,
tutorial = jquery [div.tutorial[sdoc_to_dom(tutorial_attribute.value)]]
-se- it.find('pre.code') *![$(x).text($(x).text().replace(/^\s*/, ''))] /seq,
top_wrapper_for(e, name) = jquery [div.popdown(div.label /text(name), div.contents[e])]
-se- setTimeout(delay in it.click(given.e in it.toggleClass('open')), 10),
toc = jquery [div.toc] -se- toc_links *!it.append /seq
-where [section_id_count = 0,
assign_section_id() = $(this).attr('id', 'section-#{++section_id_count}'),
title_of(section) = where [level = Number(/level(\d+)/.exec($(section).attr('class'))[1])] in
$('<h1>').text($(section).children('h#{level}').text()),
sections = tutorial.find('.section').each(assign_section_id),
toc_links = [jquery in a.toc_link(h1('source code')) *href('#annotated')] +
sections *[jquery in a.toc_link[title_of(x)] *href('##{$(x).attr("id")}')] -seq]];
__
meta::sdoc('js::web/sdoc', <<'__');
SDoc-to-HTML converter.
SDoc is a fairly simple format to parse. We just emit stuff based on indentation deltas (basically like the algorithm in the Perl). I'm doing this here rather than up-front to reduce the page
size. If we converted every SDoc attribute into HTML up front it would easily double the download size.
By the way, I'm using the same heuristic HTML-escaping algorithm that the Perl script uses. This basically tries to do the right thing with <> symbols in SDoc paragraphs by escaping anything
that doesn't look like a well-formed tag. Of course, this precludes your saying things like &lt; and expecting that to render verbatim; instead it will be converted to an actual less-than sign
in the markup.
var sdoc_to_dom = given.text in $([]) -se- paragraphs *~![convert(x)] *![it.push(x)] /seq
-where [known_tags = ('html head body meta script style link title div a span input button textarea option select form label iframe ' +
'blockquote code caption table tbody tr td th thead tfoot img h1 h2 h3 h4 h5 h6 li ol ul noscript p pre samp sub sup ' +
'var canvas audio video').replace(/\s+/g, '|'),
paragraphs = text.split(/\n\n+/),
indentation_of(p) = (/^(\s*(\|\s)?)/.exec(p)[1].length >> 1) + 1,
convert(p) = /^\s*[A-Z]/.test(p) ? documentation(p) :
/^\s*\|/.test(p) ? quoted(unindent(p)) :
code(p),
not_a_valid_tag = new RegExp('<(?!\/|(#{known_tags})[^>]*>(?:.|\n)*</\\1>)', 'g'),
escape_html_in(s) = s.replace(/&(?!gt;|lt;|amp;)/g, '&amp;').replace(not_a_valid_tag, '&lt;'),
escape_all_in(s) = s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;'),
quoted(p) = jquery in pre.quoted[escape_all_in(p)],
code(p) = jquery in pre.code[p.replace(/^\s*c\n/, '')],
starts_section(p) = /^\s*(.*\.)\n\s*(.*)/.exec(p) -re [it && it[1].length + 10 <= it[2].length],
unindent(p) = p.replace(indentation, '') -where [spaces = n[indentation_of(p) - 1] *['( |\\|){2}'] -seq -re- it.join(''),
indentation = new RegExp('^#{spaces}', 'gm')],
documentation(p) = starts_section(p) ? documentation_section(p) : jquery in p[escape_html_in(p)],
documentation_section(p) = jquery [div.section[header, paragraph]] -se- it.addClass('level#{indentation_of(p)}')
-where [parts = /^\s*(.*)\.\n((?:.|\n)*)/.exec(p),
header = $('<h#{indentation_of(p)}>').text(parts[1]),
paragraph = jquery in p[escape_html_in(parts[2])]]];
__
meta::sdoc('js::web/seq-decipher', <<'__');
Sequence deciphering applet.
This uses regular expressions to parse a sequence operator. It then populates a table describing the operation and invocation context.
$('#seq-decipher').append(jquery in input /val('%k*~!+y') %fills_result + table.result[row_for('Prefix'), row_for('Operator'), row_for('Block'), row_for('Variables')])
-where [row_for(name) = jquery [tr(td.name[name], td.fragment, td.desc)] -se- it.addClass(name.toLowerCase()),
fills_result(e) = e -se- setTimeout(delay in e.keyup(change).change(change) -se- change.call(e[0]) -where [change(e) = fill_table_from(parse($(this).val()))], 10),
fill_table_from(p) = $('#seq-decipher table.result') -se- it.find('td.fragment, td.desc').text('')
-se- p /pairs *![it.find(x[0].replace(/^/, 'tr.').replace(/_/, ' td.')).text(x[1])] /seq,
op_names = {'*': 'map', '*!': 'each', '*~!': 'flatmap', '%': 'filter', '%!': 'filter-not', '%~!': 'map/filter',
'/': 'fold-left', '/!': 'fold-right', '/~!': 'unfold', '|': 'exists'},
simple_op(op, desc) = {operator_fragment: op, operator_desc: desc},
parse(op) = op === '/pairs' && simple_op(op, 'convert object to array of key/value pairs') ||
op === '/keys' && simple_op(op, 'return array of keys from object') ||
op === '/values' && simple_op(op, 'return array of values from object') ||
/[-\/\|]object/.test(op) && simple_op(op, 'construct object from array of key/value pairs') ||
op === '+' && simple_op(op, 'concatenate arrays') ||
op === '-' && simple_op(op, 'cartesian product of arrays') ||
op === '^' && simple_op(op, 'zip arrays') ||
/^(%[kv])?([\*%\/\|](?:~!|!)?)([~\+]{0,2})(\w+)?/.exec(op.replace(/\s/g, '')) -re-
{operator_fragment: it[2], operator_desc: op_names[it[2]],
prefix_fragment: it[1], prefix_desc: it[1] === '%k' ? 'object keys' : it[1] === '%v' ? 'object values' : '',
block_fragment: it[3], block_desc: [/~/.test(it[3]) ? 'sequence context' : '', /\+/.test(it[3]) ? 'closure wrapping' : ''] %[x] -seq
-re- it.join(' and '),
variables_fragment: it[4], variables_desc: it[4] ? '#{it[4]}, #{it[4]}0, #{it[4]}i, #{it[4]}l' : 'x, x0, xi, xl'} /when.it];
__
meta::sdoc('js::web/shell', <<'__');
var shell = given.nothing in
shell.append(history_container, shell_prompt)
-se- setTimeout(given.nothing in shell.click(setTimeout(given.nothing in shell.find('.prompt .input').focus(), 10) -given.e), 0)
-where [shell = jquery in div.shell,
history_container = jquery in div.history,
history_entry_for(s) = jquery in pre.entry(span.accent('>'), span.command /text(s)),
history_result_for(o) = jquery in pre.result[ui_for(o)],
history_log_for(o) = jquery in pre.log /text('' + o),
history_error_for(e) = jquery in pre.error /text('' + e),
ui_for(x) = ! x ? jquery in span /text('' + x) :
x.is_caterwaul_syntax ? jquery in span /text(x.toString()) :
x.constructor === jQuery ? x.parent().length ? jquery in div.sandbox('(A DOM node that is already in document)')
/hover(given.e in x.addClass('identify'), given.e in x.removeClass('identify')) :
jquery in span /text(jquery in span[x] /html()) + div.sandbox[x.clone(true)] :
jquery in span /text('' + x),
realign() = setTimeout(input.css({width: input.parent().width() - (input.prev().width() + 10)})
-where [input = shell.find('.prompt .input')]
-given.nothing, 10),
log(xs = arguments) = xs *![shell.children('.history').append(x) -se- realign()] -seq -re- xs[0],
history_n = 0,
context = capture [expand() = shell.animate({left: 0, right: 0}, realign),
collapse() = shell.animate({left: 600, right: 50}, realign),
clear() = shell.children('.history').empty() -re- realign() -re- '',
caterwaul = caterwaul.clone(),
history = [],
help() = 'available variables:\n' +
(pairs %[x[1] && x[1].description] *['#{x[0]}: #{x[1].description}'] /seq
-where [keys = context /keys -seq -re- it.sort(),
pairs = keys *[[x, context[x]]] -seq]).join("\n"),
log(xs = arguments) = xs *![log(history_log_for(x))] -seq -re- xs[0],
it = null]
-se [it.context = it]
-se [it.compiler = it.caterwaul.jquery(it.caterwaul.js_all())]
-se [it.context.description = 'variables available to the shell',
it.expand.description = 'expands the shell',
it.collapse.description = 'collapses the shell',
it.clear.description = 'clears old output',
it.caterwaul.description = 'a copy of the caterwaul global',
it.compiler.description = 'the compiler for this shell',
it.history.description = 'shell input history',
it.log.description = 'logs a value to the shell'],
run_command(c) = log(history_entry_for(c))
-re- log(history_result_for(context.it = context.compiler(c, context))) /rescue [log(history_error_for(context.it = e))],
shell_prompt = jquery [div.prompt[prompt, input, shadow]]
-se- setTimeout(realign, 10)
-se- setInterval(update_shadow, 10)
-se- it.find('span.prompt').click($(this).siblings('.input').focus() -given.e)
-se- setTimeout(given.nothing in
it.find('.input').keydown(realign() -re [history_prev() /se [e.preventDefault()] /when [e.which === 38] ||
history_next() /se [e.preventDefault()] /when [e.which === 40] ||
run_it() /se [e.preventDefault()] /when [e.which === 13] || true] -given.e), 0)
-where [input = jquery in input.input,
prompt = jquery in span.accent('>'),
shadow = jquery in div.shadow,
last_shadow_text = null,
rename_gensyms_in(t) = t.replace(context.caterwaul.gensym_rename_table(t)),
update_shadow() = shadow.text(context.compiler(context.caterwaul.parse(last_shadow_text = input.val()))
-re- rename_gensyms_in(it).toString()
-rescue- e.toString())
-unless [input.val() === last_shadow_text],
h_index = 0,
history_prev() = (h[h_index] = input.val()) -when [h_index < history_n] -re- input.val(h[--h_index]) -when [h_index > 0]
-where [h = context.history],
history_next() = (h[h_index] = input.val()) -when [h_index < history_n] -re- input.val(h[++h_index]) -when [h_index < history_n]
-where [h = context.history],
history_add(s) = history_n = h_index = context.history.push(s),
scroll_to_end() = setTimeout(shell.scrollTop(shell.children(':last') -re [shell.scrollTop() + it.position().top + it.height()])
-given.nothing, 0),
run_it() = history_add(t) -re- run_command(t) -re- input.val('') -re- scroll_to_end() -when.t -where [t = input.val()]]];
__
meta::sdoc('js::web/source', <<'__');
Building the documentation pages.
This is just a matter of finding the right SDoc sources and tying them together.
var attributes = perl_attributes(original_html.replace(/&gt;/g, '>').replace(/&lt;/g, '<').replace(/&amp;/g, '&'));
$('#sdoc-page').append(sections) -where[sdocs = attributes %[x.namespace === 'sdoc' && /^js::/.test(x.attribute)] -seq,
core = sdocs %[/^js::caterwaul/.test(x.attribute)] -seq,
extension = sdocs %[/extensions\//.test(x.attribute)] -seq,
web = sdocs %[/web\//.test(x.attribute)] -seq,
back_link = jquery in a.back('Back to tutorial') *href('#tutorial'),
title(a) = /\//.test(a) ? jquery [span.path[a.replace(/^js::(.*\/).*/, '$1')],
span.name[a.replace(/.*\//, '')],
span.extension('.js')] :
jquery [span.name[a.replace(/^js::/, '')],
span.extension('.js')],
section(x) = jquery [div.file(h1[title(x.attribute)], div.contents)]
-se- setTimeout(given.nothing in it.find('h1').click(given.e in $(this).next('.contents').toggle()), 0)
-se- it.find('.contents').hide().append(sdoc_to_dom(x.value)),
sections = jquery [div *id('annotated') >= back_link >= core_sections >= extension_sections >= web_sections]
-where [core_sections = $([]) -se- core *~!section *![it.push(x)] /seq,
extension_sections = $([]) -se- extension *~!section *![it.push(x)] /seq,
web_sections = $([]) -se- web *~!section *![it.push(x)] /seq]];
setInterval(check_for_destination, 50)
-where [viewing_annotated_source = false,
moving = false,
check_for_destination() = show_annotated_source() -when [! moving && ! viewing_annotated_source && /^#annotated/.test(document.location.hash)]
hide_annotated_source() -when [! moving && viewing_annotated_source && ! /^#annotated/.test(document.location.hash)],
show_annotated_source() = moving = $('#sdoc-page').css({display: 'block', top: $(window).scrollTop(), left: $(window).width()}).
animate({left: 50}, 'slow')
-se- $('#tutorial-page').animate({left: '-=#{distance}'}, 'slow', given.nothing [viewing_annotated_source = true, moving = false])
-se- $('.shell').animate({left: '-=#{distance}', right: '+=#{distance}', opacity: 0}, 'slow')
-se- $('.header, .popdown').hide('slow')
-where [distance = $(window).width()],
hide_annotated_source() = moving = $('#sdoc-page').animate({left: $(window).width()}, 'slow',
given.nothing [viewing_annotated_source = moving = false, $(this).css({display: 'none'})])
-se- $('.shell').animate({left: '+=#{distance}', right: '-=#{distance}', opacity: 1}, 'slow')
-se- $('#tutorial-page').animate({left: '+=#{distance}'}, 'slow')
-se- $('.header, .popdown').show('slow')
-where [distance = -$('#tutorial-page').position().left]];
__
meta::sdoc('js::web/state', <<'__');
Self-modifying Perl state parser.
This is actually really easy. All of the attributes in self-modifying Perl come in one of two forms. One is the short form, written as meta::\w+('stuff', 'stuff');\n. The other is the long
form, written meta::\w+('stuff', <<'eof');\n...\neof\n. We just need to find all occurrences of either one of these things.
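For reference, the two shapes look something like this (the attribute names here are just illustrative):
| meta::configure('indent', '2');
  meta::sdoc('js::example', <<'eof');
  ...
  eof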
var perl_attributes = given.text in text.match(long_form) *parse_long + text.match(short_form) *parse_short -seq
-where [long_form = /^meta::(\w+)\('([^']+)', (?:<|&lt;){2}'([^']+)'\);[\r\n]{1,2}([\s\S]*?)[\r\n]{1,2}\3$/mg,
short_form = /^meta::(\w+)\('([^']+)', '([^']+)'\);$/mg,
long_form_parser = new RegExp(long_form .toString().replace(/^\/(.*)\/[mg][mg]$/, '$1')),
short_form_parser = new RegExp(short_form.toString().replace(/^\/(.*)\/[mg][mg]$/, '$1')),
parse_long(match) = long_form_parser.exec(match) -re- {namespace: it[1], attribute: it[2], value: it[4]},
parse_short(match) = short_form_parser.exec(match) -re- {namespace: it[1], attribute: it[2], value: it[3]}];
__
meta::sdoc('vim_highlighter::caterwaul', <<'__');
Caterwaul VIM highlighter | Spencer Tipping
Licensed under the terms of the MIT source code license
Language: Javascript with Caterwaul extensions
Maintainer: Spencer Tipping
URL: http://caterwauljs.org/build/caterwaul.vim
if !exists("main_syntax")
if version < 600
syntax clear
elseif exists("b:current_syntax")
finish
endif
let main_syntax = 'caterwaul'
endif
syn case match
setlocal iskeyword=48-57,95,36,A-Z,a-z
syn region jsParenGroup matchgroup=jsParen start=/(/ end=/)/ contains=TOP
syn region jsBracketGroup matchgroup=jsBracket start=/\[/ end=/\]/ contains=TOP
syn region jsBraceGroup matchgroup=jsBrace start=/{/ end=/}/ contains=TOP
syn region jsTernary matchgroup=jsTernaryOperator start=/?/ end=/:/ contains=TOP,jsColonLHS
syn match jsOperator /[-+*^%&\|!~;=><,]\{1,4\}/
syn match jsDot /\./
syn keyword jsReservedToplevel if else switch while for do break continue return with case default try catch finally throw delete void
syn keyword jsOperator in instanceof typeof new
syn keyword jsBuiltinType Array Boolean Date Function Number Object String RegExp
syn keyword jsBuiltinLiteral true false null undefined
syn keyword jsBuiltinValue this arguments
syn keyword jsPrototype prototype constructor
syn match jsAssignment /\k\+\s*[-+*/^&|%<>]*=[^=]\@=/ contains=jsOperator
syn match jsWordPrefix /[-\/|,<]\k\@=/
syn match jsIdentifier /[A-Za-z$_][A-Za-z0-9$_]*/
syn match jsNumber /-\?0x[0-9A-Fa-f]\+\|-\?\(\d*\.\d\+\|\d\+\.\d*\|\d\+\)\([eE][+-]\?\d\{1,3\}\)\?\|-\?0[0-7]\+/
syn region jsStringD matchgroup=jsQuote start=/"/ skip=/\\\\\|\\"/ end=/"\(\.qf\>\)\@!/ oneline keepend contains=jsStringEscape,jsCaterwaulEscape
syn region jsStringS matchgroup=jsQuote start=/'/ skip=/\\\\\|\\'/ end=/'\(\.qf\>\)\@!/ oneline keepend contains=jsStringEscape,jsCaterwaulEscape
syn region jsRegexp matchgroup=jsQuote start=+/[^/ ]+rs=e-1 skip=+\\\\\|\\/+ end=+/[gims]*[^-~\+!\/A-Za-z0-9 #(\[{]\@=+ oneline contains=jsRegexpEscape
syn region jsCodeString matchgroup=jsCodeQuote start=/\z(['"]\)/ end=/\z1\.qf\>/ skip=/\\./ oneline contains=TOP
syn match jsCodeStringVariable /\<_\>/ containedin=jsCodeString contained
syn match jsRegexpEscape /\\[sSbBwWdDnr\\\[\]]\|[+*|]\|\[\([^]\\\/]\|\\.\)\+\]/ contained
syn match jsStringEscape /\\\d\{3\}\|\\u[0-9A-Za-z]\{4\}\|\\[a-z"'\\]/ contained
syn region jsCaterwaulEscape start=/#{/ end=/}/ contained contains=TOP keepend
syn match jsCaterwaulNumericHex /xl\?\(_\?[0-9a-f]\{2\}_\?\)\+/
syn match jsCaterwaulNumericBinary /bl\?\(_\?[01]\{2\}_\?\)\{4,\}/
syn match jsColonLHS /\k\+\s*:/
syn region jsVarBinding matchgroup=jsVarBindingConstruct start=/\<var\>\|\<const\>/ end=/;/ contains=TOP
syn match jsVarInBinding /var\s\+\k\+\s\+in/ contains=jsVarBindingKeyword,jsOperator
syn region jsParamBinding matchgroup=jsBindingConstruct start=/\(function\|catch\)\s*(/ end=/)/ contains=jsOperator
syn keyword jsVarBindingKeyword const var contained
syn keyword jsBindingKeyword function catch contained
syn match jsBindingAssignment /\k\+\s*=\([^=]\|$\)\@=/ contains=jsOperator contained containedin=jsVarBinding
syn match jsExtraBindingAssignment /[A-Za-z0-9$_ ]\+\(([A-Za-z0-9$_,= ]*)\)*\s*=\([^=]\|$\)\@=/ contains=jsOperator,jsParens contained containedin=jsBindingGroup
syn match jsCpsBindingAssignment /[A-Za-z0-9$_ ]\+\s*<-/ contains=jsOperator,jsParens contained containedin=jsCaterwaulLetCps
syn keyword jsBindingMacro where capture nextgroup=jsBindingGroup
syn keyword jsFunctionMacro given bgiven nextgroup=jsFunctionGroup
syn keyword jsQuotationMacro qs qse nextgroup=jsQuotationGroup
syn keyword jsFunctionMacro delay lazy
syn keyword jsOtherMacro raise seq noexpand reexpand
syn keyword jsParameterizedMacro se re when unless using rescue nextgroup=jsModifierSuffix
syn match jsModifierSuffix /[->]?/ contained
syn cluster jsMacro add=jsBindingMacro,jsFunctionMacro,jsQuotationMacro,jsOtherMacro
syn match jsSeqFilter /\/\(pairs\|keys\|values\)\>/
syn match jsSeqFilter /%[kv][\*%\/~!]/
syn match jsSeqFilter /[-\/|]object\>/
syn region jsBindingGroup matchgroup=jsCaterwaulMacro start='\s*\[' end=']' contained contains=TOP
syn region jsFunctionGroup matchgroup=jsCaterwaulMacro start='\s*\[' end=']' contained
syn region jsQuotationGroup matchgroup=jsCaterwaulMacro start='\s*\[' end=']' contained contains=TOP
syn match jsBindingGroup /\.\k\+/ contained
syn match jsFunctionGroup /\.\k\+/ contained
syn match jsParens /[()]/ contained
syn match jsClosers /[\]})]/
syn match jsCaterwaulInfixFunction /\([|\/]\)[-~][^ \t\/|]\+\1/
syn match jsCaterwaulUnaryFunction +/![^ ,\]\)\}]\++
syn cluster jsCaterwaulHtmlOps contains=jsCaterwaulHtmlClass,jsCaterwaulHtmlSlash,jsCaterwaulHtmlMap,jsCaterwaulHtmlAttr,jsCaterwaulHtmlElement,jsCaterwaulHtmlParens
syn cluster jsCaterwaulHtmlOps add=jsCaterwaulHtmlArray,jsCaterwaulHtmlSlashB,jsCaterwaulHtmlAttrB,jsCaterwaulHtmlPlus,jsCaterwaulHtmlContains
syn region jsCaterwaulHtmlPrefix1 matchgroup=jsCaterwaulMacro start=/\\s*/ contained nextgroup=@jsCaterwaulHtmlOps
syn region jsCaterwaulHtmlParens matchgroup=jsParens start=/(/ end=/)/ contained nextgroup=@jsCaterwaulHtmlOps containedin=@jsCaterwaulHtmlGroups contains=jsCaterwaulHtmlElement,jsStringS,jsStringD
syn region jsCaterwaulHtmlArray matchgroup=jsParens start=/\[/ end=/]/ contained nextgroup=@jsCaterwaulHtmlOps containedin=@jsCaterwaulHtmlGroups contains=TOP
syn keyword jsCaterwaulHtmlElement html head body meta script style link title div a span input button textarea option contained containedin=@jsCaterwaulHtmlGroups nextgroup=@jsCaterwaulHtmlOps
syn keyword jsCaterwaulHtmlElement table tbody tr td th thead tfoot img h1 h2 h3 h4 h5 h6 li ol ul noscript p pre samp contained containedin=@jsCaterwaulHtmlGroups nextgroup=@jsCaterwaulHtmlOps
syn keyword jsCaterwaulHtmlElement blockquote select form label iframe sub sup var code caption canvas audio video contained containedin=@jsCaterwaulHtmlGroups nextgroup=@jsCaterwaulHtmlOps
syn region jsBlockComment start=+/\*+ end=+\*/+ contains=@Spell,jsCommentTags
syn region jsLineComment start=+//+ end=+$+ contains=@Spell,jsCommentTags
syn keyword jsCommentTags TODO FIXME XXX TBD contained
syn sync fromstart
if main_syntax == "caterwaul"
syn sync ccomment javaScriptComment
endif
hi def link jsClosers Error
hi def link jsCaterwaulNumericHex Number
hi def link jsCaterwaulNumericBinary Number
hi def link jsCaterwaulHtmlElement Keyword
hi def link jsCaterwaulHtmlClass Special
hi def link jsCaterwaulHtmlClassName Type
hi def link jsCaterwaulHtmlSlash Special
hi def link jsCaterwaulHtmlSlashB Special
hi def link jsCaterwaulHtmlMap Special
hi def link jsCaterwaulHtmlAttr Special
hi def link jsCaterwaulHtmlAttrB Special
hi def link jsCaterwaulHtmlPlus Special
hi def link jsCaterwaulHtmlContains Special
hi def link jsCaterwaulHtmlPrefix2 Special
hi def link jsCaterwaulSeqVariable Identifier
hi def link jsCaterwaulUnaryLeftOp Special
hi def link jsCaterwaulComplexOp Special
hi def link jsCaterwaulOperatorFn Special
hi def link jsCaterwaulMacro Special
hi def link jsCaterwaulFn Identifier
hi def link jsCaterwaulInfixFunction Type
hi def link jsCaterwaulUnaryFunction Type
hi def link jsSeqFilter Special
hi def link jsWordPrefix Special
hi def link jsParameterizedMacro Special
hi def link jsModifierSuffix Special
hi def link jsBindingMacro Special
hi def link jsFunctionMacro Special
hi def link jsOtherMacro Special
hi def link jsQuotationMacro Keyword
hi def link jsFunctionGroup Identifier
hi def link jsQuotationGroup String
hi def link jsLineComment Comment
hi def link jsBlockComment Comment
hi def link jsCommentTags Todo
hi def link jsCodeQuote Special
hi def link jsCodeStringVariable Identifier
hi def link jsQuote Special
hi def link jsNumber Number
hi def link jsStringS String
hi def link jsStringD String
hi def link jsRegexp String
hi def link jsRegexpEscape Special
hi def link jsStringEscape Special
hi def link jsCaterwaulEscape Special
hi def link jsColonLHS Type
hi def link jsAssignment Type
hi def link jsParen Special
hi def link jsParens Special
hi def link jsBracket Special
hi def link jsBrace Special
hi def link jsParenCloseError Error
hi def link jsBracketCloseError Error
hi def link jsBraceCloseError Error
hi def link jsTernaryOperator Special
hi def link jsVarInBinding Type
hi def link jsVarBindingKeyword Keyword
hi def link jsVarBindingConstruct Keyword
hi def link jsBindingConstruct Special
hi def link jsBindingKeyword Keyword
hi def link jsBindingAssignment Type
hi def link jsExtraBindingAssignment Identifier
hi def link jsParamBinding Identifier
hi def link jsReservedToplevel Keyword
hi def link jsOperator Keyword
hi def link jsDot Special
hi def link jsBuiltinType Type
hi def link jsBuiltinLiteral Special
hi def link jsBuiltinValue Special
hi def link jsPrototype Special
let b:current_syntax = "caterwaul"
if main_syntax == 'caterwaul'
unlet main_syntax
endif
__
meta::sdoc('web/tutorial', <<'__');
Introduction.
Caterwaul is a Javascript recompiler that lets you change the semantics of functions. To do this it implements a modular decompiler, macroexpander, and compiler that allow you to manipulate
code in a first-class way. It also comes with several macro (Lisp-style, not C-style) libraries to make Javascript more fun (though you can easily disable them and/or write your own).
This page is written in Caterwaul using the libraries covered below. If you're feeling adventurous, you might be interested to see the annotated source (the code that
drives this page is in the web/ section near the bottom).
A shell is available to interactively use Caterwaul while reading the tutorial below.
I've also started writing documentation in a more traditional form, available from the documentation directory. This is a great place to start if you're
interested in using Caterwaul for production projects.
Using Caterwaul.
Caterwaul is pure Javascript, so you can integrate it into any web page without much effort. For example:
| <script src='http://caterwauljs.org/build/caterwaul.min.js'></script>
<script src='http://caterwauljs.org/build/caterwaul.std.min.js'></script>
<script src='http://caterwauljs.org/build/caterwaul.ui.min.js'></script>
The next step is to configure a compiler instance. Caterwaul provides a framework that you can use to build custom compilers, so this step tells Caterwaul what kind of macros you want to
expand. (This could include your own custom macros or third-party extensions.)
For example, here's how to get a compiler that compiles all of the macros discussed on this page:
| var compiler = caterwaul.jquery(caterwaul.js_all());
What we're saying here is "build a compiler with all Javascript core macros, then add jQuery support." A compiler is a function that recompiles any function you give it:
| var compiled = compiler(function () {alert(x) -where [x = 10]});
Generally you create the compiler inline and use it to transform your app's initialization function:
| // For jQuery apps:
$(caterwaul.jquery(caterwaul.js_all())(function () {
$('body').append(jquery in div('hi there!'));
}));
| // For non-jQuery apps, or libraries:
caterwaul.js_all()(function () {
// app code
})();
Check out Caterwaul by example for a more detailed discussion about writing apps with Caterwaul.
Libraries.
I maintain a motley set of libraries that are in various ways related to Caterwaul. Right now they are:
- Regular expression parser (uses Caterwaul syntax trees)
- Nonlinear parser combinators
- Futures implementation
- Value generation combinators (for testing, etc)
- Numerical integration with error function
I should mention that they're in various states of disrepair and sometimes break. However, if you'd like to use one in production, feel free to e-mail me and I'll set up a stable versioning scheme for it. They're all MIT-licensed, as is Caterwaul.
Downloading Caterwaul.
Here are some relevant links if you want to hack on or use Caterwaul:
- Caterwaul on Github
- Stable versions of Caterwaul
- Vim highlighter for Javascript/Caterwaul
- Javascript in Ten Minutes
Caterwaul in node.js.
It's really easy to use Caterwaul with node.js. You just need to download caterwaul.node.js and whatever extensions you want to
use (I'll assume caterwaul.std.js for the purposes of this example), and concatenate them into one big file. On Mac or Linux the process looks like this:
| $ curl http://caterwauljs.org/build/caterwaul.{node,std}.js > caterwaul-custom.js
$ node
> var caterwaul = require('./caterwaul-custom.js').caterwaul;
> caterwaul.js_all()('[1, 2, 3] *[x + 1] -seq');
[ 2, 3, 4 ]
>
Because Caterwaul recompiles functions, you'll have to explicitly bind require if you want your app to have access to it. This can be done by specifying a hash of variable
bindings after the function you're compiling. For example:
| var main_function = function () {...};
caterwaul.js_all()(main_function, {require: require});
Doing this puts require into the local scope of the compiled function. (Unfortunately, Caterwaul has no way of doing this automatically, since all functions that it constructs
are created in the global scope and lexical scopes are not first-class in Javascript.)
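If you need several host values available inside compiled code, you can put them all in the binding hash. Here's a minimal sketch under that assumption (the fs and log names are purely illustrative):
| // Hypothetical example: expose require and a logger to the compiled main function.
var main = function () {
  var fs = require('fs');
  log(fs.readFileSync('caterwaul-custom.js', 'utf8').length);
};
caterwaul.js_all()(main, {require: require, log: console.log})();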
Performance.
Caterwaul 1.1 parses, transforms, and compiles Javascript very quickly. It's also quite compact; the core with std and ui libraries is about 11K minified and
gzipped. The code that it generates is about as fast as handwritten Javascript.
Here's a benchmark to test how well Caterwaul performs in your browser (entries marked with + are part of the standard Caterwaul load cycle; the others are more granular benchmarks):
Javascript extensions.
Caterwaul's core macro set starts by extending Javascript syntax in some helpful ways. In particular, it enables quick function assignment, infix function application,
and Ruby-style string interpolation (which works with both single and double quotes):
add(x, y) = x + y
String.prototype.say_hi() = 'hi from #{this}!'
Caterwaul translates these expressions into this:
| add = function (x, y) {
return x + y;
};
String.prototype.say_hi = function () {
return 'hi from ' + (this) + '!';
};
Now we can use the new functions in the shell:
add(3, 4)
'javascript'.say_hi()
String interpolation, function assignment, and infix function application are the only irregular syntactic forms provided by Caterwaul. Everything else is implemented as a regular form called
a modifier.
General modifiers.
A modifier is a word that is used with an operator to modify a bit of syntax. For example, Caterwaul provides a modifier called when to execute things conditionally:
log('hi') -when['foo'.length === 3]
There are two parts to a modifier. The first is the operator you use with it (in this case minus), and the second is the modifier and any arguments it takes. The operator is very important; it
determines how much stuff you're modifying. For example:
log('hi'), log('again') -when[1 === 2]
Here the when[1 === 2] only modifies log('again') because minus has much higher precedence than the comma operator. However, Caterwaul lets you use several other
operators to change this:
log('hi'), log('again'), when[1 === 2]
In this case the when[1 === 2] modifies both log statements. The reason for this is kind of subtle: comma left-associates, so the first comma was collapsed into a
single syntax node that then became the left-hand side of the second comma. Because Caterwaul operates on the structure of your code, it groups both log statements into the
conditional.
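To make the grouping concrete, here's roughly what these two forms compile to in plain Javascript. This is a sketch based on the x -when- y -> y && x rule described below, not the exact code Caterwaul generates:
| // log('hi'), log('again') -when[1 === 2]: only the second log is conditional.
log('hi'), ((1 === 2) && log('again'));
// log('hi'), log('again'), when[1 === 2]: both logs are inside the conditional.
(1 === 2) && (log('hi'), log('again'));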
You can inspect Caterwaul's parse tree by using the qs modifier (for "quote syntax"). For example:
qs[log('hi'), log('again'), when[1 === 2]]
qs[log('hi'), log('again'), when[1 === 2]].data
qs[log('hi'), log('again'), when[1 === 2]].length
qs[log('hi'), log('again'), when[1 === 2]][0]
qs[log('hi'), log('again'), when[1 === 2]][1]
qs[log('hi'), log('again'), when[1 === 2]].structure()
The structure method gives you a string containing the syntax tree in S-expression form. I talk more about this in the section about quotation.
Modifier operators.
There are seven operators you can use in conjunction with a modifier. From highest to lowest precedence they are:
- The slash. For example, log('hi') /when [true]. I use this when I need something tighter than a minus.
- The minus. For example, log('hi') -when [true]. It also comes in another form: log('hi') -when- true. I use this most of the time because it seems easier to read.
- The in operator. For example, given [x] in x + 1. in has the same precedence as < and >, which is lower than the arithmetic operators. As a result, it's useful when you're binding variables or creating functions around simple expressions.
- The <> operators. These are used around a modifier: log('hi') <no_logging>. This has the same precedence as in and other relational operators.
- The | operator. This is the lowest-precedence regular operator; the only things lower are &&, ||, ?:, assignment, and the comma.
- The , operator. This is the lowest-precedence operator in Javascript. It can be dangerous to use because it left-associates; for example, f(x, y, z, where [z = 10]) will invoke f on just one parameter, since the where gobbles everything to its left. (Using a | here would fix the problem.)
- The [] operator. This starts the precedence hierarchy over by using explicit grouping. For example, where[x = 10][log(x)].
Conditional modifiers.
when and unless are used to make conditionals more flexible. The semantics and return values are:
| x -when- y -> y && x
x -unless- y -> !y && x
Binding modifiers.
These let you define locally-scoped variables. There are two of these modifiers, where and using. where is used to bind local variables at runtime,
much like it is in Haskell:
x -where [x = 10]
f(10) -where [f(x) = x + 1]
Sometimes, though, you want to have access to all of the properties of an object without qualifying them. Javascript provides the with keyword for this purpose, but because it
is evaluated at runtime it has significant performance implications. A much faster alternative is to use Caterwaul's using modifier, which evaluates its object at compile-time
and produces a list of local variable definitions that refer to the object's properties. (Naturally, this means that the object you're using needs to be computable at compile-time.) Also,
any variables defined with using will shadow surrounding variables with the same name. For example, this refers to caterwaul.compile:
compile -using.caterwaul
The opposite of using is capture, which makes an object out of a series of assignments. The assignment structure is just like it is for where:
result = capture [f(x) = x + 1, y = 10]
result.f(10)
result.y
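Under the hood this is just an object literal; a rough plain-Javascript equivalent of the capture above (a sketch, not the literal expansion) is:
| var result = {f: function (x) {return x + 1}, y: 10};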
Function modifiers.
There are two words that create functions. One is given, which creates a regular function. The other is bgiven, which binds the function to the this
where it was defined. For example:
given[x] in x + 1
x + 1 -given[x]
f.call(10) -where [f = this -given- x]
f.call(10) -where [f = this -bgiven- x]
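Roughly speaking, these modifiers correspond to the following plain Javascript. This is a sketch of the semantics rather than the exact code Caterwaul emits:
| // x + 1 -given[x]: an ordinary unary function.
var f1 = function (x) {return x + 1};
// this -bgiven- x: the same shape of function, but closed over the 'this' in scope at definition time.
var f2 = (function (self) {return function (x) {return self}})(this);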
There's a shorthand you can use if you just have a single operand for a modifier:
x + 1 -given.x
given.x in x + 1
given.x [x + 1]
Delayed evaluation.
Functional languages like Haskell provide lazy semantics in the language; these allow you to represent infinite data structures and other cool things. Javascript uses strict semantics by
default, but you can wrap something in a function to delay its evaluation. Caterwaul provides a couple of macros that are linguistically suggestive of true lazy evaluation:
x = 10
f = ++x -delay
f()
g = ++x -lazy
g()
The delay modifier just wraps an expression in a context-preserving nullary function (equivalent to bgiven.nothing in _expression), and the lazy
modifier does the same thing but invokes the function at most once. (Future invocations of the wrapper function return the cached return value.)
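As a sketch of these semantics (not the literal expansion), delay and lazy behave roughly like this:
| // f = ++x -delay: a nullary wrapper that re-evaluates its body every time.
var f = function () {return ++x};
// g = ++x -lazy: the same wrapper, but the body runs at most once and the result is cached.
var g = (function () {
  var done = false, value;
  return function () {return done ? value : (done = true, value = ++x)};
})();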
Side-effecting modifiers.
These make it easy to manipulate values and return things without using an explicit variable. We do this in English using pronouns, and Caterwaul binds the variable it to refer
to "the thing that we're working with."
There are two ways to create a side-effect. One is to return the side-effecting expression and the other is to return the original value. For example, suppose you want to write a function
hash(k, v) that returns a hash h such that h[k] === v. In plain Javascript you'd write this:
| var hash = function (k, v) {
var result = {};
result[k] = v;
return result;
};
However, the amount of typing required is much larger than the complexity of the problem. We want to return an object after applying a side-effect to it; to do this with Caterwaul we would
use the se modifier, which stands for "side-effect":
hash(k, v) = {} -se [it[k] = v]
This style of side-effects returns the original expression. Sometimes, though, you want to return the result of the side-effect rather than the original. For example, here's a zero-division
check in plain Javascript:
| var x_over_yp1 = function (x, y) {
var y_plus_1 = y + 1;
return y_plus_1 === 0 ? 0 : x / y_plus_1;
};
Here's the same function using a returning side-effect ("re" stands for "returning effect"):
x_over_yp1(x, y) = y + 1 -re [it === 0 ? 0 : x / it]
Side-effecting won't impact the evaluation order of your code. That is, x -se- y and x -re- y will always evaluate x before y.
Quotation.
Most people won't use this, but it's handy if you're doing heavy-duty syntax analysis or writing complex macros. The standard library includes an obscure
modifier called qs that you can use to quote a piece of code. Quotation is basically grabbing the literal syntax rather than evaluating it normally. For example:
qs[foo + bar]
qs[foo + bar].data
qs[foo + bar].length
qs[foo + bar][0]
qs[foo + bar].structure()
Quotation is an idea that comes from Lisp and is handled similarly by Caterwaul. (The only difference is that Caterwaul returns its own n-ary syntax tree format instead of cons trees.)
A variant, qse, macroexpands the quoted code before returning it as a syntax tree. For example:
qse[log(foo) -unless[true]]
log(foo) -unless[true], qse
You can use this in the shell to see how Caterwaul will macroexpand something. Note that the shell's caterwaul function is configured with all extensions enabled.
Evaluation modifiers.
You can inform Caterwaul that you want to evaluate an expression at compile-time rather than at runtime by using the eval modifier. For example:
given.x in x + Math.sqrt(2)
given.x in x + Math.sqrt(2) /eval
In the second example, Math.sqrt(2) is evaluated when the code is macroexpanded and a variable called eval is inserted into the code in its place. eval
is bound to the result of the compile-time evaluation. Generally you wouldn't use this modifier, but I've included it for completeness.
Macroexpansion modifiers.
The reexpand and noexpand modifiers give Caterwaul instructions about how to handle an expression. For instance, suppose you have a variable called
given, and you want to use it without worrying that Caterwaul will interpret it as a modifier. You can wrap expressions that contain this variable with the noexpand
modifier to prevent any macroexpansion from happening:
qse in noexpand [x -given.x]
Similar to noexpand is reexpand, which tells Caterwaul to re-expand the output of the first macroexpansion. Normally you don't need to use this because all of the
standard macros re-expand their output automatically and therefore require only the initial expansion.
Other modifiers.
There are a few more modifiers that I threw in to the standard library to make some edge cases easier:
new Error('uh-oh') -raise
null.foo -rescue- log('caught #{e}')
The exception is always called e when using the rescue modifier.
Sequence library.
This is probably the gnarliest part of Caterwaul, but in my opinion it's also the most useful. The sequence library provides a modifier called seq that
reinterprets some syntax within an APL-like domain-specific language. It generates very efficient code and lets you express maps, folds, cartesian products, zips, etc, with very little effort.
For instance, suppose we want an array of the first 10 squares. Using the sequence library looks like this:
ni[1, 10] *[x * x] /seq
Deciphering sequence code.
Enter a sequence operator here to see how it is decoded by the seq library (this will be more relevant when reading the sections below):
Mapping and iterating.
The * operator is responsible for mapping, iterating, and flat-mapping. It's fairly easy to use; you just "multiply" a sequence by a bracketed expression. * will
create a variable called x and evaluate your expression for each element in the sequence. It then collects these results and returns a new array. For example:
seq in [1, 2, 3] *['x = #{x}']
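For the curious, the generated code is an ordinary loop. Here's a rough sketch of what the map above compiles to, using the x/xi/xl naming described in the folding section (the real output may differ in detail):
| var xs = [1, 2, 3], ys = [];
for (var xi = 0, xl = xs.length, x; xi < xl; ++xi) {
  x = xs[xi];
  ys.push('x = ' + x);
}
// ys -> ['x = 1', 'x = 2', 'x = 3']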
You don't have to use just arrays. You can use anything that has a .length property and [0] ... [n - 1] indices. One of the most common non-array collections I
use is a jQuery selector (just be sure to wrap x again so that you're not dealing with a plain DOM node):
seq in $('div') *[$(x).attr('class')]
Alternative forms.
Most operators have an alternative form that does something similar to the original. You specify this form by using a ! after the operator. The alternative form of
* is used to iterate without collecting the results; doing this returns the original array. For example:
seq in [1, 2, 3] *![log(x)]
The third use of * is flat-mapping, which is denoted by writing *~!. For example:
seq in [1, 2, 3] *~![[x, x + 1]]
Like the original form, these alternative forms can be combined with any of the operator features below.
Prefixes.
Caterwaul 1.0.3 supports prefixes for mapping and iterating over non-array structures. Right now there are two prefixes, %k and %v. These can be used with
* and *!, but not *~!.
These prefixes stand for 'keys' and 'values', respectively, and they're used to limit the scope of a map or iteration to the keys or values of an object. For example:
(seq in {foo: 'bar'} %k*[x + 'hi']).foohi
(seq in {foo: 'bar'} %v*[x + 'hi']).foo
seq in {foo: 'bar'} %v*![log(x)]
The reason you can't use %k and %v with *~! is that *~! isn't a componentwise operator. Because it could return zero, one, or many
values for each one that gets visited, it isn't clear what to do with the result. (One example of this is seq in value %v*~![[x, x + 'foo']] -- the intent is unclear.)
Operator features.
The sequence library uses operators to describe operations on arrays. Most of them are regular binary infix operators like + and *, though a few of them have names
(such as ni[] above).
Despite the wide array of operators supported, there is a high degree of regularity among them. Each operator that takes a block (like * does) has several options that can be
set to change the way it interprets the block.
Sequence interpretation.
Normally the expression inside [] is interpreted as a regular Javascript expression. But sometimes you want to remain in sequence context so that you don't have to explicitly
modify the expression. To do that, you prefix the [] with a ~:
seq in [[1], [2], [3]] *~[x *[x + 1]]
Variable renaming.
In the example above we lost access to the outer x due to shadowing. To avoid this problem, the sequence language lets you rename any variable by prefixing the []
with a new variable name:
seq in [1, 2, 3] *y[y + 1]
You can use both of these options at the same time, yielding this:
seq in [[1], [2], [3]] *~y[y *[x + 1]]
Note that you can't say *y~[...], as this is invalid Javascript syntax (~ is always a unary operator).
Function promotion.
Caterwaul 1.1 adds the ability to implicitly promote functions by using them instead of a [] block. For example:
seq in [1, 2, 3] *![log(x)]
seq in [1, 2, 3] *!log
Note that the function name is evaluated within the context of the loop, so it is re-evaluated each iteration. This could, in pathological cases, be beneficial; but mostly it's something
to watch out for. Generally you should only use local variables, or simple permutations of them, as functions. Function promotion applies to all operators that take blocks.
Filtering.
The filtering family of operators is denoted by %. For instance, here's a way to get multiples of three:
seq in [1, 2, 3] %[x % 3 === 0]
Alternative forms.
Negation is so high precedence that it's often difficult to work it into a form without adding parentheses. The alternative form of % negates the predicate:
seq in [1, 2, 3] %![x % 3]
The other alternative form of % is a simultaneous map/filter. The idea is to return the expression value when it's truthy and drop the element otherwise. For example, we can
get the squares of all negative elements this way:
seq in [1, -2, -3, 4] %~![x < 0 && x * x]
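A rough sketch of the loop this produces (the generated code may differ, but the idea is to keep only truthy results):
| var xs = [1, -2, -3, 4], ys = [];
for (var xi = 0, xl = xs.length, x, v; xi < xl; ++xi) {
  x = xs[xi];
  (v = (x < 0 && x * x)) && ys.push(v);
}
// ys -> [4, 9]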
Prefixes.
Filter operations can all take the %k and %v prefixes. These remove and/or transform key-value mapping in objects. For example:
seq in {foo: 'bar', bif: 'baz'} %k%[/foo/.test(x)] /pairs
seq in {foo: 'bar', bif: 'baz'} %v%[/z/.test(x)] /pairs
seq in {foo: 'bar', bif: 'baz'} %k%![/o/.test(x)] /pairs
seq in {foo: 'bar', bif: 'baz'} %k%~![/o/.test(x) && x.replace(/o/g, 'a')] /pairs
Using /pairs with prefixes isn't necessary in most cases. I've included it in these examples to better illustrate what's happening.
Folding.
You can fold stuff through a binary expression by using the / family of operators. / has two forms: left fold (the default), and right fold (written as
/!). For example, here is how you might sum a bunch of numbers:
seq in [1, 2, 3] /[x + x0]
Since + is associative it doesn't matter which direction the fold goes. It becomes obvious, however, if we interpolate the values into a string:
seq in [1, 2, 3] /['[#{x0}, #{x}]']
seq in [1, 2, 3] /!['[#{x}, #{x0}]']
Notice that for folding we have a new variable x0, which is available only when folding. x0 is always used as the accumulator; that is, the inductive step is
x0 = f(x0, x). There are actually a few variables you have access to depending on what you're doing. Inside any block you'll have x, xi (the current
index), and xl (the length of the original sequence). Each of these changes uniformly if you rename the variable; so for instance:
seq in [1, 2, 3] /bar[bar + bar0 + bari + barl]
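A left fold compiles to a simple accumulator loop seeded with the first element; roughly (a sketch, not the exact generated code):
| // [1, 2, 3] /[x + x0] -seq, approximately:
var xs = [1, 2, 3], x0 = xs[0];
for (var xi = 1, xl = xs.length, x; xi < xl; ++xi) {
  x = xs[xi];
  x0 = x + x0;
}
// x0 -> 6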
Prefixes.
Sometimes you want to fold into an existing element. For example, suppose you want the sum of the squares of numbers from 1 to 10. This code won't work:
seq in n[1, 11] /[x*x + x0*x0]
The reason is that you can't square the accumulator. If you can't specify the starting element of the fold, the best you can do is to pre-map the elements and then sum them normally:
seq in n[1, 11] *[x * x] /[x + x0]
However, the other option is to specify the initial value for x0 by using the fold prefix:
seq in n[1, 11] /[0][x0 + x*x]
seq in n[1, 11] /![0][x0 + x*x]
The fold prefix value is never interpreted in sequence context, even if you modify the body of the fold to do so.
Unfolding.
Sometimes it's useful to have anamorphic value generators. These are the opposite of folds: unfolds produce multiple values from one. For instance, summing the array [1, 2, 3, 4,
5] can be done using a fold over the + operator:
[1, 2, 3, 4, 5] /[x + x0] -seq
Similarly, generating the array [1, 2, 3, 4, 5] can be done using an unfold over the increment operator:
1 /~![x < 5 ? x + 1 : null] -seq
If we treat the body of the unfold as a function f(x) = x < 5 ? x + 1 : null, then an unfold could be seen as [1, f(1), f(f(1)), f(f(f(1))), f(f(f(f(1))))]. The
last element returns null, which tells the sequence library to stop unfolding.
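In loop form, the unfold above looks roughly like this (a sketch; the generated code may differ):
| var x = 1, xs = [x], next;
while ((next = (x < 5 ? x + 1 : null)) != null) {
  xs.push(next);
  x = next;
}
// xs -> [1, 2, 3, 4, 5]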
Quantification.
The sequence library provides existential quantification on arrays. This uses a block that acts as a predicate. So, for instance, to determine whether any element in an array is positive:
[-4, -5, 10, 2] |[x > 0] |seq
The | operator returns the first truthy value generated by the expression (not just true or false), so you can use it to detect things as well as test for them. For example, these blocks
return not just whether a positive element exists, but the first positive element itself:
[-4, -5, 10, 2] |[x > 0 && x] |seq
[-4, -5, 10, 2] |[x -when[x > 0]] |seq
We can also use this construct to return the index of the first matching element. Because an index of 0 is falsy, we'll have to add one (so 0 is the not-found value rather than -1):
[-4, -5, 10, 2] |[xi + 1 -when[x > 0]] |seq
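Roughly, the |[] form compiles to an early-exit loop that returns the first truthy value of the block; a sketch (the falsy not-found value is assumed here to be false):
| var xs = [-4, -5, 10, 2], found = false;
for (var xi = 0, xl = xs.length, x, v; xi < xl; ++xi) {
  x = xs[xi];
  if (v = (x > 0 && x)) {found = v; break}
}
// found -> 10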
Combination.
There are three ways you can combine things. The most obvious is concatenation, written +:
seq in [1, 2, 3] + [4, 5, 6]
Less obvious are zipping, written ^, and the cartesian product, written -. Because ^ has lower precedence than in, we have to switch to a
lower-precedence modifier form for seq. For example:
[1, 2, 3] ^ [4, 5, 6] |seq
The cartesian product takes every possible pairing of elements from the two sequences:
seq in [1, 2, 3] - [4, 5, 6]
Each of these operators has lower precedence than *, /, and % (all of which have equal precedence), so they can be used without parentheses. Zipping
has lower precedence than cartesian product and concatenation; this choice was made because a zip is a common operation prior to folding a bunch of pairs into an object and thus ending the
sequence comprehension.
Objects.
A really useful and important feature of the sequence library is that it works with objects very easily. It has four operators, /keys, /values, /pairs,
and |object, that can convert between objects and arrays.
You can pull an array of the keys or values of an object (not in any particular order of course) by using /keys and /values. For example:
window /keys -seq
jQuery /values -seq
More interesting is the /pairs operator. This pulls out key-value pairs as two-element arrays:
{foo: 'bar', bif: 'baz'} /pairs -seq
Its inverse is the |object operator (also can be written as -object or /object, depending on what kind of precedence you want), which turns an array of
those pairs back into an object:
[['foo', 'bar'], ['bif', 'baz']] |object |seq
[['foo', 'bar'], ['bif', 'baz']] -object -seq
[['foo', 'bar'], ['bif', 'baz']] /object /seq
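The object-building step itself is just a loop that assigns each pair into a fresh object; roughly (a sketch):
| var pairs = [['foo', 'bar'], ['bif', 'baz']], o = {};
for (var i = 0, l = pairs.length; i < l; ++i) o[pairs[i][0]] = pairs[i][1];
// o -> {foo: 'bar', bif: 'baz'}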
Note the differing precedences of /keys etc. and |object. This is intentional. The rationale is that you rarely manipulate objects as objects in sequence
comprehensions, since the sequence library has no useful operators for objects other than unpacking. Therefore, objects come from various other values and enter a sequence comprehension,
which may at the very end zip an intermediate result into a final object return value. The alternative higher-precedence forms of object didn't exist before, but after some
real-world use I've found it useful to be able to maintain a certain precedence level.
Also note that when possible you should use %k and %v instead of packing and unpacking objects. These prefixes are faster and, in my experience, make the code
easier to read.
Numerical iteration.
Within a sequence comprehension you have access to the n[] operator, which generates arrays of evenly-spaced numbers. It has three uses. When invoked on one argument it returns
integers between 0, inclusive, and the number, exclusive. When invoked with two arguments the first becomes the inclusive lower bound and the second is the exclusive upper bound. Adding a
third argument changes the increment from its default value of 1. For example:
n[10] -seq
n[5, 8] -seq
n[0, 1, 0.25] -seq
n[0, -1, 0.25] -seq
n[0, -1, -0.25] -seq
Another similar operator is ni[], which behaves exactly like n[] except that it includes its upper bound. For instance:
n[10] -seq
ni[10] -seq
n[1, 4] -seq
ni[1, 4] -seq
n[0, 1, 0.25] -seq
ni[0, 1, 0.25] -seq
Note that the usual floating-point caveats apply; for example:
n[0, 1, 0.1] -seq
ni[0, 1, 0.1] -seq
These results are the same because of the inductive loops used in n and ni. If you need endpoint accuracy in floating-point situations, your best bet is to generate
an integer sequence and map across a scaling factor:
n[10] *[x * 0.1] -seq
ni[10] *[x * 0.1] -seq
DOM/jQuery driver.
One of the benefits of promoting syntax into a first-class construct is that you can specialize certain syntactic constructs for library interoperation. Caterwaul provides a module that
integrates jQuery-based DOM node construction right into the syntax of your program. (You can also write modules to do similar things for other client-side libraries.) For example:
jquery in div.foo('hi there')
In this example, jquery is a modifier that interprets its code as HAML-like DOM construction. The code above is translated into this:
| jQuery('<div>').addClass('foo').append('<span>' + ('hi there') + '</span>')
Nodes and classes.
The example above illustrates the node and class syntax. The way Caterwaul sees this is that div is a node, and each dot-expression after it denotes a class. For example,
div.foo.bar.bif creates a div with three classes. You can also create just plain elements; div creates an empty div element with no CSS
classes.
This DOM driver uses context to determine when a word should be interpreted as an element name. Importantly, it doesn't have a list of known elements that it knows to promote. So, for
example, this is also perfectly valid code:
jquery in foo.bar(bif)
If you run this you'll get a <foo> node that contains an empty <bif> node.
Appending children.
If you invoke one node on another, you're telling the driver to add the "parameters" of the invocation as children. This is translated into an append call to jQuery. So, for
example, div.foo(span.bar('hi there')) creates an anonymous span containing hi there, adds that to a span with the "bar" class, and adds that to a div with the
"foo" class. The div is returned.
For reasons that will shortly become apparent there is a lower-precedence way to represent appending. You can use the > operator to do the same thing as invocation. For example:
jquery [div > p]
Perhaps counterintuitively, chaining the > operator does not result in further nesting. This is because > left-associates, so div > p > pre would be
interpreted as (div > p) > pre. This actually ends up being really convenient -- more so than if it did what it does in CSS, in my opinion.
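In jQuery terms, a chain like div > p > pre appends both children to the div; a rough sketch of the equivalent calls (not the exact generated code):
| // jquery [div > p > pre], approximately:
jQuery('<div>').append(jQuery('<p>')).append(jQuery('<pre>'))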
Appending other stuff.
Because you can easily use functional abstraction over DOM nodes, you'll probably end up factoring element creation into a number of small functions. You'll then want to call those
functions and wrap their results in new nodes. The way to do this is to append stuff in a non-DOM context using [] instead of ():
foo = jquery in div.foo
jquery in div.container[foo]
The low-precedence counterpart is >=, and like > it left-associates. You can also mix the two, since >= and > have identical precedence.
In the "hi there" example at the top of this section I appended the string "hi there" (which was interpreted as a Javascript value, not as a node constructor) using parentheses rather than
square brackets. The DOM driver has an exception for string values, since often you'll want to insert plain text between other nodes:
jquery in div('foo', button, 'bar')
There's also a much more sinister aspect to it, though. Firefox (and SpiderMonkey-based Javascript engines in general) rewrites your code at compile-time, before Caterwaul can see it. One of
the optimizations it performs is constant-folding, which involves rewriting things of the form x['y'] to x.y whenever y is a valid identifier. As a
result, if you write something like this:
jquery in button['hi']
You will get the undesirable outcome <button class='hi'> in the generated code on Firefox. As a result you are always better off using () when there is
text involved (as long as the text is a literal string, that is).
Attributes and jQuery data.
These can be setup by using the * operator. For example:
jquery in a('google') *href('http://google.com')
This invokes jQuery's attr() method on 'href', 'http://google.com'. A similar shorthand is provided for jQuery's data():
jquery in a('got data') *!foo('bar')
This results in data('foo', 'bar'). The expression inside parentheses is evaluated in normal Javascript context.
Arbitrary methods and event bindings.
These are available by using the / operator. For instance:
jquery in button /text('hi')
The slash simply turns into a regular method call: $('<button>').text('hi'). Similar is the /! operator, which turns into a bind() call:
jquery in button /!click(given.e in alert('hi'))
Calling functions.
One of the downsides of having a DSL for DOM node construction is that it's hard to call a function on a small piece of the structure. The DOM library addresses this by using the
% operator to represent function invocation. For instance:
says_hi(e) = e.text('hi there')
jquery in button %says_hi
This expands into says_hi($('<button>')). Sometimes you want to pass parameters into the function you're using. This is achieved by currying:
says(thing)(e) = e.text(thing)
jquery in button %says('click me')
Experimental extensions.
These are things that I'm experimenting with before committing to the design. As such, they may change or be removed in the next few versions. (So use at your own
peril, both due to volatility and because they might be useless.)
Infix function application.
Haskell gives you a nice way to use a function as an infix operator: you use backticks (so x `f` y becomes f x y). This reduces an API designer's pressure to use
operator overloading, since now there's another way to get the normal subject-verb-object ordering in expressions.
Caterwaul has a couple of similar constructs, though they don't look very nice compared to Haskell. The first is a simple binary application, which looks like this:
1 /-log/ 2
1 |-log| 2
You can stack these up, as they associate leftward:
1 /-log/ 2 /-log/ 3
More interesting, however, is the n-ary application form, for which you change the - to a ~:
1 / 2 / 3 /~log/ 4
1 | 2 | 3 |~log| 4
This creates a single function call containing all of the arguments.
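Concretely, the calls produced look roughly like this. The input forms are the documented ones; the expansions are inferred from the description above and should be treated as a sketch:
| // 1 /-log/ 2          ->  log(1, 2)
// 1 /-log/ 2 /-log/ 3  ->  log(log(1, 2), 3)
// 1 / 2 / 3 /~log/ 4   ->  log(1, 2, 3, 4)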
Naturally, this syntax is somewhat dangerous since it might collide with certain arithmetic expressions. Hopefully nobody will be dividing twice in a row, but it could happen. That's one of
the reasons I consider this extension to be experimental.
Postfix function application.
This is a special case of infix function application for unary functions. Expressions of the form x /!f are converted to f(x):
100 /!log
Literal modifiers.
There are two kinds of modifiers. Literal modifiers are used to modify literal values (such as strings and regular expressions). For example, Caterwaul provides a literal modifier called
.x that removes spaces from regular expressions:
/foo bar bif/.x
All literal modifiers look like this; that is, they appear to be properties and aren't invoked. Caterwaul will only process literal modifiers that it knows about and that match literals of
the right type. All of these expansions happen at compile-time, so there is no significant runtime impact of using them.
Regular expression modifiers.
The simplest regular expression modifier is .x. This lets you insert whitespace into your regular expressions without actually matching against whitespace; it's very similar
to the 'x' flag in Perl or Ruby.
/^\w+ @ \w+ \. com$/.x
Another is .qf, short for "quote function". This causes the regular expression to be turned into a function that matches against strings (very similar to Perl's behavior):
/foo/.qf
String modifiers.
Strings can be modified in several different ways:
'foo bar'.qw
Here, .qw causes the string to be split into an array of words. You can put arbitrarily much whitespace between words, though the usual Javascript syntax rules apply.
'foo bar bif baz'.qh
Similar to .qw, but a hash (object) is constructed instead of an array. Every other word is used as a key, and each following word is a value.
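As a sketch, the compile-time expansions of these two string modifiers are roughly:
| // 'foo bar'.qw          ->  ['foo', 'bar']
// 'foo bar bif baz'.qh  ->  {foo: 'bar', bif: 'baz'}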
'^http://'.qr
Converts a string into a regular expression, but properly escapes the forward slashes. This is primarily for notational convenience, and has the caveat that some things that you'd do
within regular expressions aren't allowed in strings. (For example, 'foo\[bar\]'.qr will fail in Javascript's initial parse because \[ and \] are
invalid escape sequences.)
'x + 1'.qs
This creates a reference to the syntax tree generated by parsing x + 1. It's rare that you'd use this unless you're writing macros.
'_ + 1'.qf
Constructs a function whose sole argument is _ and whose body is _ + 1. Code within the function is macroexpanded automatically, so you can do things like this:
'_ + 1 -when._'.qf
__
meta::template('comment', '\'\'; # A mechanism for line or block comments.');
meta::template('eval', <<'__');
my $result = eval $_[0];
terminal::warning("Error during template evaluation: $@") if $@;
$result;
__
meta::template('failing_conditional', <<'__');
my ($commands) = @_;
my $should_return = $commands =~ / if (.*)$/ && ! eval $1;
terminal::warning("eval of template condition failed: $@") if $@;
$should_return;
__
meta::template('include', <<'__');
my ($commands) = @_;
return '' if template::failing_conditional($commands);
join "\n", map retrieve($_), split /\s+/, $commands;
__
meta::template('pinclude', <<'__');
# Just like the regular include, but makes sure to insert paragraph boundaries
# (this is required for SDoc to function properly).
my ($commands) = @_;
return '' if template::failing_conditional($commands);
my $text = join "\n\n", map retrieve($_), split /\s+/, $commands;
"\n\n$text\n\n";
__
meta::template('script-include', <<'__');
my ($name) = @_;
my $s = 'script';
my $script = retrieve($name);
"<$s>\n$script\n$s>";
__
meta::template('style-include', <<'__');
my ($name) = @_;
my $s = 'style';
my $style = retrieve($name);
"<$s>\n$style\n$s>";
__
internal::main();
__END__