// Example 4

import be.arci.html.*;
import java.io.File;
/**
 * Extracts formatted text content from HTML documents.
 * <p>
 * Every command-line argument is treated as the path of an HTML file. The file
 * is scanned for the tags listed in {@link #asTagNames}; text content is copied
 * to the output, and the structural tags are translated into line breaks, blank
 * lines (paragraph breaks) and tabs. The result is printed to standard output.
 */
public class HTMLScannerExample4
{
  // Tag IDs (TGID_*) paired with the tag names (TGNM_*) handed to the scanner.
  // The int fields serve as mnemonics in the switch below: the numbering follows
  // the position of the matching name in asTagNames, so both lists must be kept
  // in sync. In the switch, a positive ID is treated as the opening tag and the
  // negated ID as the closing tag.
  static final int TGID_NOTAG      =  0; static final String TGNM_00 = "";
  // NOTE(review): the HTML element is ADDRESS, not ADDR - confirm that "ADDR"
  // is really the name HTMLScanner expects here.
  static final int TGID_ADDR       =  1; static final String TGNM_01 = "ADDR";
  static final int TGID_BLOCKQUOTE =  2; static final String TGNM_02 = "BLOCKQUOTE";
  static final int TGID_BR         =  3; static final String TGNM_03 = "BR";
  static final int TGID_CENTER     =  4; static final String TGNM_04 = "CENTER";
  static final int TGID_DD         =  5; static final String TGNM_05 = "DD";
  static final int TGID_DIR        =  6; static final String TGNM_06 = "DIR";
  static final int TGID_DIV        =  7; static final String TGNM_07 = "DIV";
  static final int TGID_DL         =  8; static final String TGNM_08 = "DL";
  static final int TGID_H1         =  9; static final String TGNM_09 = "H1";
  static final int TGID_H2         = 10; static final String TGNM_10 = "H2";
  static final int TGID_H3         = 11; static final String TGNM_11 = "H3";
  static final int TGID_H4         = 12; static final String TGNM_12 = "H4";
  static final int TGID_H5         = 13; static final String TGNM_13 = "H5";
  static final int TGID_H6         = 14; static final String TGNM_14 = "H6";
  static final int TGID_HR         = 15; static final String TGNM_15 = "HR";
  static final int TGID_LI         = 16; static final String TGNM_16 = "LI";
  static final int TGID_LISTING    = 17; static final String TGNM_17 = "LISTING";
  static final int TGID_MARQUEE    = 18; static final String TGNM_18 = "MARQUEE";
  static final int TGID_MENU       = 19; static final String TGNM_19 = "MENU";
  static final int TGID_MULTICOL   = 20; static final String TGNM_20 = "MULTICOL";
  static final int TGID_OL         = 21; static final String TGNM_21 = "OL";
  static final int TGID_P          = 22; static final String TGNM_22 = "P";
  static final int TGID_PLAINTEXT  = 23; static final String TGNM_23 = "PLAINTEXT";
  static final int TGID_PRE        = 24; static final String TGNM_24 = "PRE";
  static final int TGID_UL         = 25; static final String TGNM_25 = "UL";
  static final int TGID_XMP        = 26; static final String TGNM_26 = "XMP";
  static final int TGID_TABLE      = 27; static final String TGNM_27 = "TABLE";
  static final int TGID_TR         = 28; static final String TGNM_28 = "TR";
  static final int TGID_TD         = 29; static final String TGNM_29 = "TD";
  static final int TGID_TH         = 30; static final String TGNM_30 = "TH";
  static final int TGID_TITLE      = 31; static final String TGNM_31 = "TITLE";
  static final int TGID_CAPTION    = 32; static final String TGNM_32 = "CAPTION";
  static final int TGID_SELECT     = 33; static final String TGNM_33 = "SELECT";
  static final int TGID_OPTION     = 34; static final String TGNM_34 = "OPTION";
  static final int TGID_DT         = 35; static final String TGNM_35 = "DT";

  /** Tag names requested from the scanner, ordered so that index n holds TGNM_n. */
  static String[] asTagNames = new String[]
  {
    TGNM_00, TGNM_01, TGNM_02, TGNM_03, TGNM_04, TGNM_05,
    TGNM_06, TGNM_07, TGNM_08, TGNM_09, TGNM_10, TGNM_11,
    TGNM_12, TGNM_13, TGNM_14, TGNM_15, TGNM_16, TGNM_17,
    TGNM_18, TGNM_19, TGNM_20, TGNM_21, TGNM_22, TGNM_23,
    TGNM_24, TGNM_25, TGNM_26, TGNM_27, TGNM_28, TGNM_29,
    TGNM_30, TGNM_31, TGNM_32, TGNM_33, TGNM_34, TGNM_35
  };

  /**
   * Prints the formatted text content of each HTML file named on the command line.
   * A failure on one file is reported on standard error and does not prevent the
   * remaining files from being processed.
   *
   * @param args paths of the HTML files to process
   */
  public static void main(String[] args)
  {
    for (int i = 0; i < args.length; i++)
    {
      try
      {
        //replace with "new HTMLScanner(new URL(args[i]));" for networked documents
        HTMLScanner hs = new HTMLScanner(new File(args[i]));
        //true: discard tags we are not interested in
        HTMLTag[] tags = hs.getTags(asTagNames, true);
        System.out.println(formatContent(tags));
      }
      catch (Exception e) //URL exceptions or IO exceptions
      {
        // Say which input failed before dumping the trace.
        System.err.println("Could not process " + args[i]);
        e.printStackTrace();
      }
    }
  }

  /**
   * Builds the plain-text rendering of a scanned tag sequence.
   *
   * @param tags the tags returned by the scanner, in document order
   * @return the accumulated text with line breaks, paragraph breaks and tabs inserted
   */
  private static StringBuffer formatContent(HTMLTag[] tags)
  {
    StringBuffer content = new StringBuffer();
    for (int j = 0; j < tags.length; j++)
    {
      switch (tags[j].iID)
      {
        //not a tag: copy its content to the output
        case TGID_NOTAG:
          tags[j].accumulateContent(content);
          break;

        //line break on opening and/or closing tag
        case  TGID_ADDR:     case -TGID_ADDR:
        case  TGID_BR:       case -TGID_BR:
        case  TGID_CAPTION:
        case  TGID_CENTER:   case -TGID_CENTER:
        case  TGID_DIV:      case -TGID_DIV:
        case  TGID_DT:
        case  TGID_LI:
        case  TGID_MARQUEE:  case -TGID_MARQUEE:
        case  TGID_OPTION:
                             case -TGID_SELECT:
        case  TGID_TABLE:    case -TGID_TABLE:
        case  TGID_TR:
        case  TGID_TITLE:    case -TGID_TITLE:
          ensureLineBreak(content);
          break;

        //tab on opening tag
        case  TGID_TD:
        case  TGID_TH:
          content.append('\t');
          break;

        //line break and indent on opening tag
        case  TGID_DD:
          ensureLineBreak(content);
          content.append('\t');
          break;

        //new paragraph and indent on opening tag
        case  TGID_BLOCKQUOTE:
          ensureParagraphBreak(content);
          content.append('\t');
          break;

        //new paragraph on opening and/or closing tag
        case  TGID_H1:         case -TGID_H1:
        case  TGID_H2:         case -TGID_H2:
        case  TGID_H3:         case -TGID_H3:
        case  TGID_H4:         case -TGID_H4:
        case  TGID_H5:         case -TGID_H5:
        case  TGID_H6:         case -TGID_H6:
        case  TGID_HR:
                               case -TGID_BLOCKQUOTE:
        case  TGID_DIR:        case -TGID_DIR:
        case  TGID_DL:         case -TGID_DL:
        case  TGID_LISTING:    case -TGID_LISTING:
        case  TGID_MENU:       case -TGID_MENU:
        case  TGID_MULTICOL:
        case  TGID_OL:         case -TGID_OL:
        case  TGID_P:          case -TGID_P:
        case  TGID_PLAINTEXT:
        case  TGID_PRE:        case -TGID_PRE:
        case  TGID_UL:         case -TGID_UL:
        case  TGID_XMP:        case -TGID_XMP:
          ensureParagraphBreak(content);
          break;

        //every other ID has no effect on the layout
        default:
          break;
      }
    }
    return content;
  }

  /** Ends the current line unless the buffer is empty or already ends with a newline. */
  private static void ensureLineBreak(StringBuffer content)
  {
    int length = content.length();
    if (length > 0 && content.charAt(length - 1) != '\n')
      content.append('\n');
  }

  /**
   * Makes non-empty content end with a blank line, i.e. two consecutive newlines.
   * Only the missing newlines are added, so repeated paragraph tags do not pile
   * up empty lines, and nothing is emitted at the very start of the output.
   */
  private static void ensureParagraphBreak(StringBuffer content)
  {
    int length = content.length();
    if (length == 0)
      return;
    if (content.charAt(length - 1) != '\n')
    {
      // Still inside a line: close it and add the blank line, whatever the
      // character before the last one happens to be.
      content.append('\n').append('\n');
    }
    else if (length > 1 && content.charAt(length - 2) != '\n')
    {
      // The line is already closed; only the blank line is missing.
      content.append('\n');
    }
  }
}