// Getting all files referred to from HTML documents
import be.arci.html.*;
import java.io.File;
import java.util.Vector;
/**
 * Example program that lists the resources HTML documents refer to.
 *
 * <p>Every command-line argument is opened as a local file and scanned for
 * IMG, A, BODY and FRAME tags. The link-carrying attribute of each such tag
 * (src, href or background) is collected and, once the whole document has
 * been processed, printed to standard output, one value per line.
 */
public class HTMLScannerExample1
{
    /**
     * Picks the attribute that holds the link for a given tag ID.
     *
     * <p>The IDs line up with the positions of the tag names in the array
     * built by {@link #main(String[])}: 1 = IMG, 2 = A, 3 = BODY, 4 = FRAME.
     * NOTE(review): presumably the scanner sets {@code iID} to that array
     * index; confirm against the be.arci.html documentation.
     *
     * @param tagId the {@code iID} value of a scanned tag
     * @return the attribute name to read, or {@code null} when the tag ID is
     *         not one of the four handled here
     */
    private static String linkAttributeFor(int tagId)
    {
        if (tagId == 1 || tagId == 4)
        {
            return "src"; // IMG and FRAME
        }
        if (tagId == 2)
        {
            return "href"; // A
        }
        if (tagId == 3)
        {
            return "background"; // BODY
        }
        return null;
    }

    /**
     * Scans each file named in {@code args} and prints the links found in it.
     *
     * <p>An exception raised while handling one file has its stack trace
     * printed and does not stop the remaining files from being processed.
     *
     * @param args paths of the HTML files to scan
     */
    public static void main(String[] args)
    {
        // Slot 0 is deliberately null: text content is of no interest here.
        String[] wantedTags = { null, "IMG", "A", "BODY", "FRAME" };

        for (int fileIdx = 0; fileIdx < args.length; fileIdx++)
        {
            try
            {
                // For networked documents use
                // "new HTMLScanner(new URL(args[fileIdx]));" instead.
                HTMLScanner scanner = new HTMLScanner(new File(args[fileIdx]));
                // true: discard tags we are not interested in
                HTMLTag[] found = scanner.getTags(wantedTags, true);

                // Links are buffered first and printed only after the whole
                // tag array has been walked.
                Vector links = new Vector();
                for (int t = 0; t < found.length; t++)
                {
                    HTMLTag tag = found[t];
                    String attribute = linkAttributeFor(tag.iID);
                    if (attribute == null)
                    {
                        continue; // tag without a link attribute we track
                    }
                    String target = tag.getAttribute(attribute);
                    if (target == null || target.length() == 0)
                    {
                        continue; // attribute absent or empty
                    }
                    links.addElement(target);
                }

                int total = links.size();
                int k = 0;
                while (k < total)
                {
                    System.out.println(links.elementAt(k));
                    k++;
                }
            }
            catch (Exception e)
            {
                // e.g. URL exceptions or IO exceptions
                e.printStackTrace();
            }
        }
    }
}