// TokenStreamRewriteEngine.hpp


#ifndef INC_TokenStreamRewriteEngine_hpp__
#define INC_TokenStreamRewriteEngine_hpp__

/* ANTLR Translator Generator
 * Project led by Terence Parr at http://www.jGuru.com
 * Software rights: http://www.antlr.org/license.html
 */

#include <string>
#include <list>
#include <vector>
#include <map>
#include <utility>
#include <ostream>
#include <iterator>
#include <cassert>
#include <algorithm>

#include <antlr/config.hpp>

#include <antlr/TokenStream.hpp>
#include <antlr/TokenWithIndex.hpp>
#include <antlr/BitSet.hpp>

#ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
namespace antlr {
#endif

/** This token stream tracks the *entire* token stream coming from
 *	 a lexer, but does not pass on the whitespace (or whatever else
 *	 you want to discard) to the parser.
 *
 *	 This class can then be asked for the ith token in the input stream.
 *	 Useful for dumping out the input stream exactly after doing some
 *	 augmentation or other manipulations.	Tokens are indexed from 0..n-1.
 *
 *	 You can insert stuff, replace, and delete chunks.	 Note that the
 *	 operations are done lazily--only when you render the buffer to an
 *	 output stream.	 This is very efficient because you are not moving data
 *	 around all the time.	 As the buffer of tokens is rendered, the
 *	 toStream() method(s) check to see if there is an operation at the
 *	 current index.  If so, the operation is done and then normal
 *	 rendering continues on the buffer.	 This is like having multiple Turing
 *	 machine instruction streams (programs) operating on a single input tape. :)
 *
 *	 Since the operations are done lazily at toStream-time, operations do not
 *	 screw up the token index values.  That is, an insert operation at token
 *	 index i does not change the index values for tokens i+1..n-1.
 *
 *	 Because operations never actually alter the buffer, you may always get
 *	 the original token stream back without undoing anything.  Since
 *	 the instructions are queued up, you can easily simulate transactions and
 *	 roll back any changes if there is an error just by removing instructions.
 *	 For example,
 *
 *		  TokenStreamRewriteEngine rewriteEngine(lexer);
 *		  JavaRecognizer parser(rewriteEngine);
 *		  ...
 *		  rewriteEngine.insertAfter("pass1", t, "foobar");
 *		  rewriteEngine.insertAfter("pass2", u, "start");
 *		  rewriteEngine.toStream(std::cout, "pass1");
 *		  rewriteEngine.toStream(std::cout, "pass2");
 *
 *	 You can also have multiple "instruction streams" and get multiple
 *	 rewrites from a single pass over the input.	 Just name the instruction
 *	 streams and use that name again when printing the buffer.	This could be
 *	 useful for generating a C file and also its header file--all from the
 *	 same buffer.
 *
 *	 If you don't use named rewrite streams, a "default" stream is used.
 *
 *	 Terence Parr, parrt@cs.usfca.edu
 *	 University of San Francisco
 *	 February 2004
 */
/** Token stream that buffers the tokens it is asked to deliver and lets
 *  callers queue named "programs" of rewrite operations (insert, replace,
 *  remove) keyed by token index.  The buffered tokens themselves are not
 *  modified by the methods defined in this header; the queued operations
 *  are consumed by the toStream() family.
 *
 *  NOTE(review): rewrite operations are allocated with `new` in
 *  insertBefore()/replace() and stored as raw pointers in `programs`.
 *  This class declares no destructor, so nothing in this header frees
 *  them -- confirm whether the out-of-line code does.
 */
class TokenStreamRewriteEngine : public TokenStream
{
public:
	/** Buffer type for the tracked tokens.  RefTokenWithIndex is left
	 *  unqualified, as everywhere else in this class, so the typedef also
	 *  compiles when ANTLR_CXX_SUPPORTS_NAMESPACE is undefined and this
	 *  header does not open an `antlr` namespace.
	 */
	typedef ANTLR_USE_NAMESPACE(std)vector<RefTokenWithIndex> token_list;
	/** Program name used by every overload that does not take one. */
	static const char* DEFAULT_PROGRAM_NAME;
#ifndef NO_STATIC_CONSTS
	/** Start index passed by the argument-less toStream()/toDebugStream()
	 *  and the instruction index passed by deleteProgram(); value defined
	 *  out of line. */
	static const size_t MIN_TOKEN_INDEX;
	/** Not referenced in this header; value defined out of line. */
	static const int PROGRAM_INIT_SIZE;
#else
	// Fallback for compilers without usable static const members.
	enum {
		MIN_TOKEN_INDEX = 0,
		PROGRAM_INIT_SIZE = 100
	};
#endif

	/** Functor for for_each(): writes t->getText() of each visited element
	 *  to the wrapped output stream. */
	struct tokenToStream {
		tokenToStream( ANTLR_USE_NAMESPACE(std)ostream& o ) : out(o) {}
		template <typename T> void operator() ( const T& t ) {
			out << t->getText();
		}
		ANTLR_USE_NAMESPACE(std)ostream& out;
	};

	/** Base class of all rewrite instructions: a token index plus the text
	 *  associated with the instruction.  Only derived classes can be
	 *  constructed (the constructor is protected). */
	class RewriteOperation {
	protected:
		RewriteOperation( size_t idx, const ANTLR_USE_NAMESPACE(std)string& txt )
		: index(idx), text(txt)
		{
		}
	public:
		virtual ~RewriteOperation()
		{
		}
		/** Execute the rewrite operation by possibly adding to the buffer.
		 *	 Return the index of the next token to operate on.
		 *	 The base implementation writes nothing and returns this
		 *	 operation's own index.
		 */
		virtual size_t execute( ANTLR_USE_NAMESPACE(std)ostream& /* out */ ) {
			return index;
		}
		/** Token index this operation is attached to. */
		virtual size_t getIndex() const {
			return index;
		}
		/** Name of the concrete operation class. */
		virtual const char* type() const {
			return "RewriteOperation";
		}
	protected:
		size_t index;
		ANTLR_USE_NAMESPACE(std)string text;
	};

	/** Functor for for_each(): executes each visited operation against the
	 *  wrapped output stream and discards the returned index. */
	struct executeOperation {
		ANTLR_USE_NAMESPACE(std)ostream& out;
		executeOperation( ANTLR_USE_NAMESPACE(std)ostream& s ) : out(s) {}
		void operator () ( RewriteOperation* t ) {
			t->execute(out);
		}
	};

	/// list of rewrite operations
	typedef ANTLR_USE_NAMESPACE(std)list<RewriteOperation*> operation_list;
	/// map program name to that program's list of rewrite operations
	typedef ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,operation_list> program_map;

	/** Emits its text and then returns its own index, i.e. the token at
	 *  that index is still the next one to operate on. */
	class InsertBeforeOp : public RewriteOperation
	{
	public:
		InsertBeforeOp( size_t index, const ANTLR_USE_NAMESPACE(std)string& text )
		: RewriteOperation(index, text)
		{
		}
		virtual ~InsertBeforeOp() {}
		virtual size_t execute( ANTLR_USE_NAMESPACE(std)ostream& out )
		{
			out << text;
			return index;
		}
		virtual const char* type() const {
			return "InsertBeforeOp";
		}
	};

	/** Emits its text and then returns lastIndex+1, so the tokens
	 *  from..to are skipped by whoever honours the returned index. */
	class ReplaceOp : public RewriteOperation
	{
	public:
		// text is taken by const reference, matching InsertBeforeOp and the
		// base class, to avoid an extra string copy per operation.
		ReplaceOp(size_t from, size_t to, const ANTLR_USE_NAMESPACE(std)string& text)
		: RewriteOperation(from,text)
		, lastIndex(to)
		{
		}
		virtual ~ReplaceOp() {}
		virtual size_t execute( ANTLR_USE_NAMESPACE(std)ostream& out ) {
			out << text;
			return lastIndex+1;
		}
		virtual const char* type() const {
			return "ReplaceOp";
		}
	protected:
		/// index of the last token covered by the replacement
		size_t lastIndex;
	};

	/** A ReplaceOp with empty replacement text.  Note that the remove()
	 *  overloads below go through replace() and therefore queue plain
	 *  ReplaceOp objects rather than DeleteOp. */
	class DeleteOp : public ReplaceOp {
	public:
		DeleteOp(size_t from, size_t to)
		: ReplaceOp(from,to,"")
		{
		}
		virtual const char* type() const {
			return "DeleteOp";
		}
	};

	/** Defined out of line.  `upstream` is presumably the stream tokens are
	 *  pulled from (see the `stream` member) -- confirm in the .cpp. */
	TokenStreamRewriteEngine(TokenStream& upstream);

	/** Defined out of line.  `initialSize` presumably pre-sizes the token
	 *  buffer -- confirm in the .cpp. */
	TokenStreamRewriteEngine(TokenStream& upstream, size_t initialSize);

	/** Defined out of line. */
	RefToken nextToken( void );

	/** rollback() on the default program. */
	void rollback(size_t instructionIndex) {
		rollback(DEFAULT_PROGRAM_NAME, instructionIndex);
	}

	/** Rollback the instruction stream for a program so that
	 *	 the indicated instruction (via instructionIndex) is no
	 *	 longer in the stream.	UNTESTED!
	 */
	void rollback(const ANTLR_USE_NAMESPACE(std)string& programName,
					  size_t instructionIndex );

	/** deleteProgram() on the default program. */
	void deleteProgram() {
		deleteProgram(DEFAULT_PROGRAM_NAME);
	}

	/** Reset the program so that no instructions exist */
	void deleteProgram(const ANTLR_USE_NAMESPACE(std)string& programName) {
		rollback(programName, MIN_TOKEN_INDEX);
	}

	/** Queue `text` after token `t` in the default program. */
	void insertAfter( RefTokenWithIndex t,
							const ANTLR_USE_NAMESPACE(std)string& text )
	{
		insertAfter(DEFAULT_PROGRAM_NAME, t, text);
	}

	/** Queue `text` after token index `index` in the default program. */
	void insertAfter(size_t index, const ANTLR_USE_NAMESPACE(std)string& text) {
		insertAfter(DEFAULT_PROGRAM_NAME, index, text);
	}

	/** Queue `text` after token `t` (located via t->getIndex()) in the
	 *  named program. */
	void insertAfter( const ANTLR_USE_NAMESPACE(std)string& programName,
							RefTokenWithIndex t,
							const ANTLR_USE_NAMESPACE(std)string& text )
	{
		insertAfter(programName, t->getIndex(), text);
	}

	/** Queue `text` after token index `index` in the named program. */
	void insertAfter( const ANTLR_USE_NAMESPACE(std)string& programName,
							size_t index,
							const ANTLR_USE_NAMESPACE(std)string& text )
	{
		// to insert after, just insert before next index (even if past end)
		insertBefore(programName,index+1, text);
	}

	/** Queue `text` before token `t` in the default program. */
	void insertBefore( RefTokenWithIndex t,
							 const ANTLR_USE_NAMESPACE(std)string& text )
	{
		// std::cout << "insertBefore index " << t->getIndex() << " " << text << std::endl;
		insertBefore(DEFAULT_PROGRAM_NAME, t, text);
	}

	/** Queue `text` before token index `index` in the default program. */
	void insertBefore(size_t index, const ANTLR_USE_NAMESPACE(std)string& text) {
		insertBefore(DEFAULT_PROGRAM_NAME, index, text);
	}

	/** Queue `text` before token `t` (located via t->getIndex()) in the
	 *  named program. */
	void insertBefore( const ANTLR_USE_NAMESPACE(std)string& programName,
							 RefTokenWithIndex t,
							 const ANTLR_USE_NAMESPACE(std)string& text )
	{
		insertBefore(programName, t->getIndex(), text);
	}

	/** Allocate an InsertBeforeOp for (index, text) and hand it to
	 *  addToSortedRewriteList() for the named program. */
	void insertBefore( const ANTLR_USE_NAMESPACE(std)string& programName,
							 size_t index,
							 const ANTLR_USE_NAMESPACE(std)string& text )
	{
		addToSortedRewriteList(programName, new InsertBeforeOp(index,text));
	}

	/** Replace the single token at `index` in the default program. */
	void replace(size_t index, const ANTLR_USE_NAMESPACE(std)string& text)
	{
		replace(DEFAULT_PROGRAM_NAME, index, index, text);
	}

	/** Replace tokens from..to in the default program. */
	void replace( size_t from, size_t to,
					  const ANTLR_USE_NAMESPACE(std)string& text)
	{
		replace(DEFAULT_PROGRAM_NAME, from, to, text);
	}

	/** Replace the single token `indexT` in the default program. */
	void replace( RefTokenWithIndex indexT,
					  const ANTLR_USE_NAMESPACE(std)string& text )
	{
		replace(DEFAULT_PROGRAM_NAME, indexT->getIndex(), indexT->getIndex(), text);
	}

	/** Replace the tokens between `from` and `to` in the default program. */
	void replace( RefTokenWithIndex from,
					  RefTokenWithIndex to,
					  const ANTLR_USE_NAMESPACE(std)string& text )
	{
		replace(DEFAULT_PROGRAM_NAME, from, to, text);
	}

	/** Allocate a ReplaceOp for (from, to, text) and hand it to
	 *  addToSortedRewriteList() for the named program. */
	void replace(const ANTLR_USE_NAMESPACE(std)string& programName,
					 size_t from, size_t to,
					 const ANTLR_USE_NAMESPACE(std)string& text )
	{
		addToSortedRewriteList(programName,new ReplaceOp(from, to, text));
	}

	/** Replace by token handles in the named program; the indexes are
	 *  taken from from->getIndex() and to->getIndex(). */
	void replace( const ANTLR_USE_NAMESPACE(std)string& programName,
					  RefTokenWithIndex from,
					  RefTokenWithIndex to,
					  const ANTLR_USE_NAMESPACE(std)string& text )
	{
		replace(programName,
				  from->getIndex(),
				  to->getIndex(),
				  text);
	}

	/** Remove the single token at `index` in the default program. */
	void remove(size_t index) {
		remove(DEFAULT_PROGRAM_NAME, index, index);
	}

	/** Remove tokens from..to in the default program. */
	void remove(size_t from, size_t to) {
		remove(DEFAULT_PROGRAM_NAME, from, to);
	}

	/** Remove the single token `indexT` in the default program. */
	void remove(RefTokenWithIndex indexT) {
		remove(DEFAULT_PROGRAM_NAME, indexT, indexT);
	}

	/** Remove the tokens between `from` and `to` in the default program. */
	void remove(RefTokenWithIndex from, RefTokenWithIndex to) {
		remove(DEFAULT_PROGRAM_NAME, from, to);
	}

	/** Removal is expressed as a replace with empty text. */
	void remove( const ANTLR_USE_NAMESPACE(std)string& programName,
					 size_t from, size_t to)
	{
		replace(programName,from,to,"");
	}

	/** Removal is expressed as a replace with empty text. */
	void remove( const ANTLR_USE_NAMESPACE(std)string& programName,
					 RefTokenWithIndex from, RefTokenWithIndex to )
	{
		replace(programName,from,to,"");
	}

	/** Add token type `ttype` to the discard mask. */
	void discard(int ttype) {
		discardMask.add(ttype);
	}

	/** Return the buffered token at position `i`.  Uses vector::at(), so
	 *  an out-of-range `i` throws std::out_of_range. */
	RefToken getToken( size_t i )
	{
		return RefToken(tokens.at(i));
	}

	/** Number of tokens currently buffered. */
	size_t getTokenStreamSize() const {
		return tokens.size();
	}

	/** Write the text of every buffered token to `out`, in buffer order,
	 *  without consulting any rewrite program. */
	void originalToStream( ANTLR_USE_NAMESPACE(std)ostream& out ) const {
		ANTLR_USE_NAMESPACE(std)for_each( tokens.begin(), tokens.end(), tokenToStream(out) );
	}

	/** Ranged variant of originalToStream(); defined out of line. */
	void originalToStream( ANTLR_USE_NAMESPACE(std)ostream& out,
								  size_t start, size_t end ) const;

	/** Render with the default program over MIN_TOKEN_INDEX..size. */
	void toStream( ANTLR_USE_NAMESPACE(std)ostream& out ) const {
		toStream( out, MIN_TOKEN_INDEX, getTokenStreamSize());
	}

	/** Render with the named program over MIN_TOKEN_INDEX..size. */
	void toStream( ANTLR_USE_NAMESPACE(std)ostream& out,
						const ANTLR_USE_NAMESPACE(std)string& programName ) const
	{
		toStream( out, programName, MIN_TOKEN_INDEX, getTokenStreamSize());
	}

	/** Render with the default program over start..end. */
	void toStream( ANTLR_USE_NAMESPACE(std)ostream& out,
						size_t start, size_t end ) const
	{
		toStream(out, DEFAULT_PROGRAM_NAME, start, end);
	}

	/** The overload every other toStream() forwards to; defined out of
	 *  line.  NOTE(review): the forwarding overloads pass
	 *  getTokenStreamSize() (one past the last valid index) as
	 *  `lastToken` -- confirm the implementation tolerates that. */
	void toStream( ANTLR_USE_NAMESPACE(std)ostream& out,
						const ANTLR_USE_NAMESPACE(std)string& programName,
						size_t firstToken, size_t lastToken ) const;

	/** Debug rendering over MIN_TOKEN_INDEX..size. */
	void toDebugStream( ANTLR_USE_NAMESPACE(std)ostream& out ) const {
		toDebugStream( out, MIN_TOKEN_INDEX, getTokenStreamSize());
	}

	/** Ranged debug rendering; defined out of line. */
	void toDebugStream( ANTLR_USE_NAMESPACE(std)ostream& out,
							  size_t start, size_t end ) const;

	/** getLastRewriteTokenIndex() for the default program. */
	size_t getLastRewriteTokenIndex() const {
		return getLastRewriteTokenIndex(DEFAULT_PROGRAM_NAME);
	}

	/** Return the last index for the program named programName
	 * return 0 if the program does not exist or the program is empty.
	 * (Note this is different from the java implementation that returns -1)
	 */
	size_t getLastRewriteTokenIndex(const ANTLR_USE_NAMESPACE(std)string& programName) const {
		program_map::const_iterator rewrites = programs.find(programName);

		if( rewrites == programs.end() )
			return 0;

		const operation_list& prog = rewrites->second;
		if( prog.empty() )
			return 0;

		// the index of the operation stored last in the program's list
		return prog.back()->getIndex();
	}

protected:
	/** If op.index > lastRewriteTokenIndexes, just add to the end.
	 *	 Otherwise, do linear search (presumably for the insertion point;
	 *	 the implementation is not in this header).
	 *	 This overload targets the default program. */
	void addToSortedRewriteList(RewriteOperation* op) {
		addToSortedRewriteList(DEFAULT_PROGRAM_NAME, op);
	}

	/** Defined out of line; receives the operations allocated by
	 *  insertBefore() and replace(). */
	void addToSortedRewriteList( const ANTLR_USE_NAMESPACE(std)string& programName,
										  RewriteOperation* op );

protected:
	/** Who do we suck tokens from? */
	TokenStream& stream;
	/** track index of tokens */
	size_t index;

	/** Track the incoming list of tokens */
	token_list tokens;

	/** You may have multiple, named streams of rewrite operations.
	 *  I'm calling these things "programs."
	 *  Maps String (name) -> rewrite (List)
	 */
	program_map programs;

	/** Which (whitespace) token(s) to throw out */
	BitSet discardMask;
};

#ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
}
#endif

#endif