// Extracting formatted text content from HTML documents
import be.arci.html.*;
import java.io.File;
/**
 * Extracts formatted text content from HTML documents.
 *
 * <p>Every command-line argument is taken as the path of an HTML file. The file is
 * scanned for the tags listed in {@link #asTagNames}; text content is accumulated
 * into a buffer and the structural tags are translated into line breaks, blank
 * lines (paragraph breaks) and tabs. The resulting plain text is printed to
 * standard output, one document after the other.
 */
public class HTMLScannerExample4
{
    //this bunch of constant definitions really is easy to create with a decent editor
    //the int fields will serve as mnemonic
    //Each TGID_xx is the index of the matching TGNM_xx in asTagNames. The switch in
    //main() compares these values with HTMLTag.iID: the positive value is used for
    //the opening tag and the negated value for the closing tag (presumably that is
    //how the scanner numbers them -- confirm against the be.arci.html documentation).
    static final int TGID_NOTAG = 0; static final String TGNM_00 = "";
    //NOTE(review): the HTML element is named ADDRESS; confirm that the scanner really matches "ADDR".
    static final int TGID_ADDR = 1; static final String TGNM_01 = "ADDR";
    static final int TGID_BLOCKQUOTE = 2; static final String TGNM_02 = "BLOCKQUOTE";
    static final int TGID_BR = 3; static final String TGNM_03 = "BR";
    static final int TGID_CENTER = 4; static final String TGNM_04 = "CENTER";
    static final int TGID_DD = 5; static final String TGNM_05 = "DD";
    static final int TGID_DIR = 6; static final String TGNM_06 = "DIR";
    static final int TGID_DIV = 7; static final String TGNM_07 = "DIV";
    static final int TGID_DL = 8; static final String TGNM_08 = "DL";
    static final int TGID_H1 = 9; static final String TGNM_09 = "H1";
    static final int TGID_H2 = 10; static final String TGNM_10 = "H2";
    static final int TGID_H3 = 11; static final String TGNM_11 = "H3";
    static final int TGID_H4 = 12; static final String TGNM_12 = "H4";
    static final int TGID_H5 = 13; static final String TGNM_13 = "H5";
    static final int TGID_H6 = 14; static final String TGNM_14 = "H6";
    static final int TGID_HR = 15; static final String TGNM_15 = "HR";
    static final int TGID_LI = 16; static final String TGNM_16 = "LI";
    static final int TGID_LISTING = 17; static final String TGNM_17 = "LISTING";
    static final int TGID_MARQUEE = 18; static final String TGNM_18 = "MARQUEE";
    static final int TGID_MENU = 19; static final String TGNM_19 = "MENU";
    static final int TGID_MULTICOL = 20; static final String TGNM_20 = "MULTICOL";
    static final int TGID_OL = 21; static final String TGNM_21 = "OL";
    static final int TGID_P = 22; static final String TGNM_22 = "P";
    static final int TGID_PLAINTEXT = 23; static final String TGNM_23 = "PLAINTEXT";
    static final int TGID_PRE = 24; static final String TGNM_24 = "PRE";
    static final int TGID_UL = 25; static final String TGNM_25 = "UL";
    static final int TGID_XMP = 26; static final String TGNM_26 = "XMP";
    static final int TGID_TABLE = 27; static final String TGNM_27 = "TABLE";
    static final int TGID_TR = 28; static final String TGNM_28 = "TR";
    static final int TGID_TD = 29; static final String TGNM_29 = "TD";
    static final int TGID_TH = 30; static final String TGNM_30 = "TH";
    static final int TGID_TITLE = 31; static final String TGNM_31 = "TITLE";
    static final int TGID_CAPTION = 32; static final String TGNM_32 = "CAPTION";
    static final int TGID_SELECT = 33; static final String TGNM_33 = "SELECT";
    static final int TGID_OPTION = 34; static final String TGNM_34 = "OPTION";
    static final int TGID_DT = 35; static final String TGNM_35 = "DT";

    /**
     * Names of the tags handed to the scanner. The position of a name in this array
     * must equal the value of its TGID_xx constant, because the switch in
     * {@link #main(String[])} is written against those constants.
     */
    static String[] asTagNames = new String[]
    {
        TGNM_00,
        TGNM_01,
        TGNM_02,
        TGNM_03,
        TGNM_04,
        TGNM_05,
        TGNM_06,
        TGNM_07,
        TGNM_08,
        TGNM_09,
        TGNM_10,
        TGNM_11,
        TGNM_12,
        TGNM_13,
        TGNM_14,
        TGNM_15,
        TGNM_16,
        TGNM_17,
        TGNM_18,
        TGNM_19,
        TGNM_20,
        TGNM_21,
        TGNM_22,
        TGNM_23,
        TGNM_24,
        TGNM_25,
        TGNM_26,
        TGNM_27,
        TGNM_28,
        TGNM_29,
        TGNM_30,
        TGNM_31,
        TGNM_32,
        TGNM_33,
        TGNM_34,
        TGNM_35,
    };

    /**
     * Converts each HTML file named on the command line to plain text and prints it
     * to standard output.
     *
     * <p>A failure on one file is reported on standard error (file name followed by
     * the stack trace) and does not stop the processing of the remaining files.
     *
     * @param args paths of the HTML files to convert
     */
    public static void main(String[] args)
    {
        if (args.length == 0)
        {
            System.err.println("Usage: java HTMLScannerExample4 <html-file> [<html-file> ...]");
            return;
        }
        for (int i = 0; i < args.length; i++)
        {
            try {
                //replace with "new HTMLScanner(new URL(args[i]));" for networked documents
                HTMLScanner hs = new HTMLScanner(new File(args[i]));
                HTMLTag[] tags = hs.getTags(asTagNames, true);//true: discard tags we are not interested in
                StringBuffer content = new StringBuffer();
                for (int j = 0; j < tags.length; j++)
                {
                    switch (tags[j].iID)
                    {
                    case TGID_NOTAG:
                        tags[j].accumulateContent(content);
                        break;
                    //line break on opening and/or closing tag
                    case TGID_ADDR: case -TGID_ADDR:
                    case TGID_BR: case -TGID_BR:
                    case TGID_CAPTION:
                    case TGID_CENTER: case -TGID_CENTER:
                    case TGID_DIV: case -TGID_DIV:
                    case TGID_DT:
                    case TGID_LI:
                    case TGID_MARQUEE: case -TGID_MARQUEE:
                    case TGID_OPTION:
                    case -TGID_SELECT:
                    case TGID_TABLE: case -TGID_TABLE:
                    case TGID_TR:
                    case TGID_TITLE: case -TGID_TITLE:
                        ensureLineBreak(content);
                        break;
                    //tab on opening tag
                    case TGID_TD:
                    case TGID_TH:
                        content.append('\t');
                        break;
                    //line break and indent on opening tag
                    case TGID_DD:
                        ensureLineBreak(content);
                        content.append('\t');
                        break;
                    //new paragraph and indent on opening tag
                    case TGID_BLOCKQUOTE:
                        ensureParagraphBreak(content);
                        content.append('\t');
                        break;
                    //new paragraph on opening and/or closing tag
                    case TGID_H1: case -TGID_H1:
                    case TGID_H2: case -TGID_H2:
                    case TGID_H3: case -TGID_H3:
                    case TGID_H4: case -TGID_H4:
                    case TGID_H5: case -TGID_H5:
                    case TGID_H6: case -TGID_H6:
                    case TGID_HR:
                    case -TGID_BLOCKQUOTE:
                    case TGID_DIR: case -TGID_DIR:
                    case TGID_DL: case -TGID_DL:
                    case TGID_LISTING: case -TGID_LISTING:
                    case TGID_MENU: case -TGID_MENU:
                    case TGID_MULTICOL:
                    case TGID_OL: case -TGID_OL:
                    case TGID_P: case -TGID_P:
                    case TGID_PLAINTEXT:
                    case TGID_PRE: case -TGID_PRE:
                    case TGID_UL: case -TGID_UL:
                    case TGID_XMP: case -TGID_XMP:
                        ensureParagraphBreak(content);
                        break;
                    default:
                        //closing tags that need no formatting of their own
                        break;
                    }
                }
                System.out.println(content);
            } catch (Exception e) { //URL exceptions or IO exceptions
                //name the offending document so the stack trace can be attributed to it
                System.err.println("Could not extract text from " + args[i]);
                e.printStackTrace();
            }
        }
    }

    /**
     * Makes sure non-empty content ends with a line break.
     * Nothing is appended to an empty buffer, so the output never starts with a
     * blank line.
     */
    private static void ensureLineBreak(StringBuffer content)
    {
        int length = content.length();
        if (length > 0 && content.charAt(length - 1) != '\n')
        {
            content.append('\n');
        }
    }

    /**
     * Makes sure non-empty content ends with a blank line, i.e. two consecutive
     * line breaks. Only the missing line breaks are appended, so repeated paragraph
     * tags do not pile up empty lines.
     */
    private static void ensureParagraphBreak(StringBuffer content)
    {
        int length = content.length();
        if (length == 0)
        {
            return; //nothing to separate the new paragraph from
        }
        if (content.charAt(length - 1) != '\n')
        {
            //text runs up to the end: terminate the line and add the blank line
            content.append("\n\n");
        }
        else if (length > 1 && content.charAt(length - 2) != '\n')
        {
            //the line is already terminated: only the blank line is missing
            content.append('\n');
        }
    }
}