Basic Use

Let's have a look at a simple example of parsing an XML document, where we are interested in examining the start and end tags in the document. To keep things simple, we will use the following piece of XML, stored in a file called simple.xml:

<?xml version="1.0"?>
<document>
  <parent>
    <child>
      Some text
    </child>
  </parent>
</document>
      

To parse this document, we need to construct a tokenizer for it, then iterate over all its tokens, e.g.

import org.millscript.commons.xml.api.token.*;
import org.millscript.commons.xml.api.tokenizer.XmlTokenizer;
import org.millscript.commons.xml.tokenizer.XmlTokenizerFactory;

import java.io.*;

/**
 * Tokenizes the file simple.xml and prints the name of every
 * empty-element, start tag and end tag token found in it.
 */
public class SimpleParser {

    /**
     * Runs the example.
     *
     * @param args command line arguments; not used
     * @throws FileNotFoundException if simple.xml cannot be opened for reading
     */
    public static void main( String[] args ) throws FileNotFoundException {
        XmlTokenizerFactory f = new XmlTokenizerFactory();
        FileInputStream in = new FileInputStream( "simple.xml" );
        try {
            XmlTokenizer xt = f.getTokenizer( in );
            while ( xt.hasNextToken() ) {
                Token t = xt.nextToken();
                // Only these three token types are reported; any other
                // token falls through the chain and is skipped.
                if ( t instanceof EmptyElementToken ) {
                    System.out.println( "EMPTY: " + ((EmptyElementToken) t).getName() );
                } else if ( t instanceof StartTagToken ) {
                    System.out.println( "START: " + ((StartTagToken) t).getName() );
                } else if ( t instanceof EndTagToken ) {
                    System.out.println( "END: " + ((EndTagToken) t).getName() );
                }
            }
        } finally {
            // Always release the file handle, even if tokenizing fails.
            try {
                in.close();
            } catch ( IOException ignored ) {
                // The stream was only read from, so a failure to close it
                // loses no data and there is nothing useful left to do.
            }
        }
    }
}
      

All of that instanceof checking, if/else chaining and casting is untidy. Handily, the Token class implements the visitor pattern, so if we write the following token visitor we can improve the code a little:

import org.millscript.commons.xml.api.token.*;

/**
 * Token visitor that prints the name of each empty-element, start tag
 * and end tag token it visits. Every other token type is ignored.
 */
public class SimpleTokenVisitor implements TokenVisitor {

    // ---- Tokens we report ------------------------------------------------

    public void visit( EmptyElementToken token ) {
        emit( "EMPTY: " + token.getName() );
    }

    public void visit( StartTagToken token ) {
        emit( "START: " + token.getName() );
    }

    public void visit( EndTagToken token ) {
        emit( "END: " + token.getName() );
    }

    // ---- Tokens we are not interested in: intentionally empty -------------

    public void visit( AttListDeclToken token ) {
    }

    public void visit( CharDataToken token ) {
    }

    public void visit( CommentToken token ) {
    }

    public void visit( DTDToken token ) {
    }

    public void visit( ElementDeclToken token ) {
    }

    public void visit( EntityDeclToken token ) {
    }

    public void visit( NotationDeclToken token ) {
    }

    public void visit( PIToken token ) {
    }

    public void visit( XmlDeclToken token ) {
    }

    /** Writes a single line of output to standard out. */
    private static void emit( String line ) {
        System.out.println( line );
    }

}
      

and now our example parser becomes much neater:

import org.millscript.commons.xml.api.token.TokenVisitor;
import org.millscript.commons.xml.api.tokenizer.XmlTokenizer;
import org.millscript.commons.xml.tokenizer.XmlTokenizerFactory;

import java.io.FileInputStream;
import java.io.FileNotFoundException;

/**
 * Tokenizes the file simple.xml and passes every token to a
 * SimpleTokenVisitor.
 */
public class SimpleParser {

    /**
     * Runs the example.
     *
     * @param args command line arguments; not used
     * @throws FileNotFoundException if simple.xml cannot be opened for reading
     */
    public static void main( String[] args ) throws FileNotFoundException {
        XmlTokenizerFactory f = new XmlTokenizerFactory();
        FileInputStream in = new FileInputStream( "simple.xml" );
        try {
            XmlTokenizer xt = f.getTokenizer( in );
            TokenVisitor tv = new SimpleTokenVisitor();
            while ( xt.hasNextToken() ) {
                xt.nextToken().visit( tv );
            }
        } finally {
            // Always release the file handle, even if tokenizing fails.
            try {
                in.close();
            } catch ( java.io.IOException ignored ) {
                // The stream was only read from, so a failure to close it
                // loses no data and there is nothing useful left to do.
            }
        }
    }
}
      

When we run either version, we see the following output:

START: document
START: parent
START: child
END: child
END: parent
END: document