/* * JavaWalker -- A simple web walker. * * Given a root URL to a document, JavaWalker will recursively visit all * URLs it can find in the document and all documents it can reach. * * JavaWalker can be run standalone in the interpreter or as an applet. */ import java.applet.Applet; import java.awt.*; import java.util.*; import java.net.*; import java.io.*; public class JavaWalker extends Applet { public void init () { initStandalone (false); } public void initStandalone (boolean standalone) { setLayout (new GridLayout (1, 1)); add (new JavaWalkerPanel (this, standalone)); } public static void main (String args[]) { Frame f = new Frame ("JavaWalker"); JavaWalker walker = new JavaWalker (); walker.initStandalone (true); f.add ("Center", walker); f.pack (); f.resize (350, f.bounds ().height); f.show (); } } class JavaWalkerPanel extends Panel { JavaWalker walker; // The applet. List walkerList; // The listbox showing the URL hierarchy. Thread walkerThread; // The thread doing the searching. Label status; // The status box for status messages. int urlIndex; // The index into the listbox of the current URL. Button pause, resume; String rootURLStr; Label rootLabel; TextField rootTextField; static final String spaces = " "; JavaWalkerPanel (JavaWalker myWalker, boolean standalone) { walker = myWalker; GridBagConstraints endConst = new GridBagConstraints (); GridBagLayout gridbag = new GridBagLayout (); setLayout (gridbag); /* Constraints for the last component in a row. */ endConst.weightx = 1.0; endConst.fill = GridBagConstraints.HORIZONTAL; endConst.gridwidth = GridBagConstraints.REMAINDER; /* The applet label. */ Label label = new Label ("Java Walker", Label.LEFT); label.setFont (new Font ("Helvetica", Font.ITALIC, 24)); endConst.insets = new Insets (4, 0, 4, 0); gridbag.setConstraints (label, endConst); add (label); /* The text field for entering the root URL. 
*/
        /* The "Root:" label is added without explicit constraints; only the
           text field that follows it is given the end-of-row constraints. */
        add (new Label ("Root:", Label.CENTER));
        rootTextField = new TextField ("");
        gridbag.setConstraints (rootTextField, endConst);
        add (rootTextField);

        /* The button control panel. */
        Panel p = new Panel ();
        p.add (new Button ("Start"));
        p.add (pause = new Button ("Pause"));
        p.add (resume = new Button ("Resume"));
        if (standalone) {
            /* Provide a way for the user to quit the program. */
            p.add (new Button ("Close"));
        }
        endConst.insets = new Insets (0, 0, 0, 0);
        gridbag.setConstraints (p, endConst);
        add (p);

        /* The listbox displaying the URL hierarchy: 8 visible rows,
           single selection. */
        walkerList = new List (8, false);
        walkerList.setFont (new Font ("Helvetica", Font.PLAIN, 10));
        endConst.insets = new Insets (2, 10, 2, 10);
        gridbag.setConstraints (walkerList, endConst);
        add (walkerList);

        /* The label for displaying status messages. */
        status = new Label ("Enter a URL and click on Start.", Label.LEFT);
        status.setFont (new Font ("Times", Font.BOLD, 10));
        endConst.insets = new Insets (0, 10, 2, 10);
        gridbag.setConstraints (status, endConst);
        add (status);

        /* Initialize the components: Pause and Resume start out disabled,
           and the text field is pre-filled with a default root URL. */
        pause.disable ();
        resume.disable ();
        setRootURL ("http://www.cs.washington.edu");
        rootTextField.setText (rootURLStr);
    }

    /**
     * Event callback for the panel.  Events whose target is a Button are
     * dispatched on the button label passed in {@code arg}.
     *
     * @param e   the event being delivered
     * @param arg for button events, the label of the button
     * @return true if the event came from a Button, false otherwise
     */
    public boolean action (Event e, Object arg) {
        /* Handle the various button events.
*/ if (e.target instanceof Button) { String label = (String) arg; if (label.equals ("Close")) { reportStatus ("Closing."); System.exit (0); } else if (label.equals ("Start")) { if (walkerThread != null) { walkerThread.stop (); } walkerList.clear (); setRootURL(rootTextField.getText()); walkerThread = new Thread (new WalkerThread (this, rootURL ())); walkerThread.start (); pause.enable (); resume.disable (); } else if (label.equals ("Pause")) { reportStatus ("Paused."); walkerThread.suspend (); pause.disable (); resume.enable (); } else if (label.equals ("Resume")) { reportStatus ("Resuming..."); walkerThread.resume (); resume.disable (); pause.enable (); } return true; } return false; } public synchronized void reportStatus (String message) { status.setText (message); getToolkit ().sync (); } public String rootURL () { return rootURLStr; } public void setRootURL (String url) { rootURLStr = url; } public void addURLs (Queue q, int depth) { /* Add these URLs just after the current URL, assuming that that the current URL referenced to them. */ String indent = spaces.substring (0, depth); for (int i = 0; i < q.size (); i++) { walkerList.addItem (indent + ((URL) q.element (i)).toString (), urlIndex + i + 1); } } public void setCurrentURL (URL url, int depth) { /* Make sure that the current URL is visible and that it is roughly centered in the listbox. 
*/ int rows, vis; reportStatus ("Fetching " + url.toString () + "."); String spaced = spaces.substring (0, depth) + url; urlIndex = -1; for (int i = 0; i < walkerList.countItems (); i++) { if (spaced.equals (walkerList.getItem (i))) { urlIndex = i; } } if (urlIndex < 0) { System.out.println ("Failed to find URL in list."); urlIndex = walkerList.countItems () - 1; } rows = walkerList.getRows (); vis = Math.max (0, (urlIndex - (rows / 2))); walkerList.makeVisible (urlIndex); walkerList.makeVisible (urlIndex + (rows / 2) - 1); walkerList.select (urlIndex); } } class WalkerThread implements Runnable { Hashtable urlsVisited; // To keep track of visited URLs. JavaWalkerPanel walkerPanel; // A handle to the UI. Queue workQ; // URLs to visit in FIFO order. URL rootURL; // The URL we start with. public WalkerThread (JavaWalkerPanel panel, String urlStr) { try { walkerPanel = panel; rootURL = new URL (urlStr); } catch (MalformedURLException e) { System.out.println (e.getMessage ()); } workQ = new Queue (); urlsVisited = new Hashtable (); } public void run () { walkTheWeb (); } public void walkTheWeb () { int depth = 0; int nextDepth; walkerPanel.addURLs (new Queue ().enqueue (rootURL), 0); workQ.removeAllElements (); // Reset the work queue. workQ.enqueue (new Integer (0)); // Start at depth 0. workQ.enqueue (rootURL); // Start with the root URL. while (!workQ.isEmpty ()) { Object o = workQ.dequeue (); if (o instanceof Integer) { /* Go down another level in the hierarchy. */ depth = ((Integer) o).intValue (); continue; } URL url = (URL) o; /* Don't revisit documents. */ if (urlsVisited.containsKey (url.toString ())) continue; StringBuffer buffer = new StringBuffer (); walkerPanel.setCurrentURL (url, depth); try { DataInputStream stream = new DataInputStream (url.openStream ()); String str; while ((str = stream.readLine ()) != null) { buffer.append (str); } } catch (IOException e) { System.out.println (e.getMessage ()); } /* Remember that we visited this URL. 
*/ urlsVisited.put (url.toString (), new Integer (depth)); /* Parse the URLs in this document. */ URLParser parser = new URLParser (buffer.toString ()); walkerPanel.reportStatus ("Parsing " + url.toString () + "..."); Queue q = parser.parse (); walkerPanel.reportStatus ("Parsed " + q.size () + " URLs."); /* Add the parsed URLs to our work queue. We prefix the URLs in the queue with the depth in the hierarchy that they were found. */ walkerPanel.addURLs (q, depth + 4); workQ.enqueue (new Integer (depth + 4)); workQ.enqueue (q); /* Pause for a second so we don't hammer web servers. */ try { Thread.currentThread ().sleep (1000); } catch (InterruptedException e) { System.out.println ("Pause interrupted."); } } } } /* * A simple parser that extracts URLs from HTML documents. */ class URLParser { String contents; public URLParser (String data) { contents = data; } public Queue parse () { int tag, quote, http, html, last; Queue q = new Queue (); String href; /* * Loop through a lowercase copy of the document searching for * the keywords that correspond to URLs. We determine the indicies * of the URL text in the copy, and then index into the original * to extract the correctly cased URL and place it in the queue. * * Note that the parser only recognizes URLs of the form * "http:....html". */ String text = new String (contents); text = text.toLowerCase (); try { last = 0; while (last < contents.length ()) { if ((tag = text.indexOf ("