// A framework for simple tokenizers. Takes care of newlines and // white-space, and of getting the text from the source stream into // the token object. A state is a function of two arguments -- a // string stream and a setState function. The second can be used to // change the tokenizer's state, and can be ignored for stateless // tokenizers. This function should advance the stream over a token // and return a string or object containing information about the next // token, or null to pass and have the (new) state be called to finish // the token. When a string is given, it is wrapped in a {style, type} // object. In the resulting object, the characters consumed are stored // under the content property. Any whitespace following them is also // automatically consumed, and added to the value property. (Thus, // content is the actual meaningful part of the token, while value // contains all the text it spans.) function tokenizer(source, state) { // Newlines are always a separate token. function isWhiteSpace(ch) { // The messy regexp is because IE's regexp matcher is of the // opinion that non-breaking spaces are no whitespace. return ch != "\n" && /^[\s\u00a0]*$/.test(ch); } var tokenizer = { state: state, take: function(type) { if (typeof(type) == "string") type = {style: type, type: type}; type.content = (type.content || "") + source.get(); if (!/\n$/.test(type.content)) source.nextWhile(isWhiteSpace); type.value = type.content + source.get(); return type; }, next: function () { if (!source.more()) throw StopIteration; var type; if (source.equals("\n")) { source.next(); return this.take("whitespace"); } if (source.applies(isWhiteSpace)) type = "whitespace"; else while (!type) type = this.state(source, function(s) {tokenizer.state = s;}); return this.take(type); } }; return tokenizer; }