|
Extracting content from an HTML document |
|
import be.arci.html.*;
import java.io.File;
/**
 * Example program that extracts content from HTML documents.
 *
 * Every command-line argument is treated as the path of a local file. Each
 * file is scanned in turn and its accumulated content is written to standard
 * output. A failure on one document is reported and does not prevent the
 * remaining documents from being processed.
 */
public class HTMLScannerExample2
{
    /**
     * Scans each file named on the command line and prints its content.
     *
     * @param args paths of the HTML files to process, handled in order
     */
    public static void main(String[] args)
    {
        // A single empty tag name asks for text content only.
        final String[] wantedTagNames = { "" };

        for (String path : args)
        {
            try
            {
                System.out.println(extractContent(path, wantedTagNames));
            }
            catch (Exception e)
            {
                // URL exceptions or IO exceptions: report and move on to the next argument.
                e.printStackTrace();
            }
        }
    }

    /**
     * Scans one document and gathers the content of the requested tags.
     *
     * @param path           path of the file to scan
     * @param wantedTagNames names of the tags whose content is collected
     * @return buffer holding the content accumulated from every returned tag
     * @throws Exception whatever the scanner raises (URL or IO exceptions)
     */
    private static StringBuffer extractContent(String path, String[] wantedTagNames) throws Exception
    {
        // For networked documents, build the scanner with "new HTMLScanner(new URL(path))" instead.
        HTMLScanner scanner = new HTMLScanner(new File(path));

        // true: discard tags we are not interested in
        HTMLTag[] matches = scanner.getTags(wantedTagNames, true);

        StringBuffer collected = new StringBuffer();
        for (HTMLTag tag : matches)
        {
            tag.accumulateContent(collected);
        }
        return collected;
    }
}