Whilst I was hacking away at the docs a couple of weeks ago, I noticed that one of the most time-consuming tasks was checking all the URLs embedded in them.

Since I had a couple of hours free today, I decided to knock up a little task to do it for me.
Use Case:
- check links in text source to ensure that they respond with an HTTP 200
- source can be anything (including source code)
- links must be valid xhtml or docbook links, i.e.
<a href="link"></a>
<ulink url="link"/>
or
<ulink url="link"></ulink>
- all links must be in "" quotes
- all links must be on one line (not great I know)


I know about Anteater, but that looked way over the top for what seemed a pretty small task (and Not Invented Here I have to admit).

I've used it to track down some broken links in antnews.html (and I'm sure I'll find a few others). I'm not using it as a test tool, rather as part of a doc generation/validation task.

Anyway, patch attached for your amusement.

Kev

Index: CheckLink.java
===================================================================
RCS file: CheckLink.java
diff -N CheckLink.java
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ CheckLink.java      1 Jan 1970 00:00:00 -0000
@@ -0,0 +1,284 @@
+/*
+ * Copyright  2001-2005 The Apache Software Foundation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ */
+package org.apache.tools.ant.taskdefs.optional;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.Enumeration;
+import java.util.Hashtable;
+import java.util.Vector;
+
+import org.apache.tools.ant.BuildException;
+import org.apache.tools.ant.DirectoryScanner;
+import org.apache.tools.ant.Project;
+import org.apache.tools.ant.Task;
+import org.apache.tools.ant.types.FileSet;
+import org.apache.tools.ant.util.FileUtils;
+
+/**
+ * A simple link checking task.
+ * Checks any text file type (including source code)
+ * to ensure that embedded links are reachable.
+ * Most useful for checking documentation as part of the build.
+ *
+ * <p>A link is the double-quoted value that follows the configured link
+ * form (by default <code>&lt;a href="</code>). Only absolute
+ * <code>http://</code> and <code>https://</code> links are checked; local
+ * and relative links are ignored. The opening form and the closing quote of
+ * a link must be on the same line.</p>
+ */
+public class CheckLink extends Task {
+
+    /* messages */
+    private static final String CANT_READ_FILE = "Cannot open file ";
+    private static final String FILE_AND_FILESETS = "Use either file or a fileset, not both";
+    private static final String NO_FILES = "No files to check";
+    private static final String EMPTY_LINK_FORM = "linkForm must not be empty";
+    private static final String NEGATIVE_RETRIES = "retries must not be negative";
+
+    private static final FileUtils FILE_UTILS = FileUtils.getFileUtils();
+
+    private int verbosity = Project.MSG_VERBOSE;
+    private boolean readAndCheck = true;
+    private File file = null;
+    private int retries = 0;
+    private Vector filesets = new Vector();
+    /* links that have already answered with HTTP 200 during this run */
+    private Vector checkedLinks = new Vector();
+    /*
+     * links collected for deferred checking; each element is a
+     * String[] {link, location}. A list is used rather than a map keyed by
+     * line number so that links from different files (or several links on
+     * one line) do not overwrite each other.
+     */
+    private Vector pendingLinks = new Vector();
+    private int filesProcessed = 0;
+
+    /* default to standard html links */
+    private String linkForm = "<a href=\"";
+
+    /**
+     * Set the format of the link.
+     * Tested with (x)html and docbook links:
+     * <pre>
+     *  a href
+     *  ulink url
+     * </pre>
+     * The value is wrapped as <code>&lt;</code> + linkForm +
+     * <code>="</code> to build the text searched for in each line.
+     * @param linkForm the element and attribute name, separated by a space
+     * @throws BuildException if linkForm is null or empty
+     */
+    public void setLinkForm(final String linkForm) {
+        if (linkForm == null || linkForm.length() == 0) {
+            throw new BuildException(EMPTY_LINK_FORM);
+        }
+        this.linkForm = "<" + linkForm + "=\"";
+    }
+
+    /**
+     * Set the single source file to check.
+     * @param file the file to scan for links
+     */
+    public void setFile(final File file) {
+        this.file = file;
+    }
+
+    /**
+     * Add a fileset to be checked.
+     * @param fs the fileset whose included files are scanned for links
+     */
+    public void addFileset(final FileSet fs) {
+        this.filesets.add(fs);
+    }
+
+    /**
+     * Sets the number of additional connection attempts made for a link
+     * that does not answer with HTTP 200 before giving up.
+     * <strong>A high number will slow down the build significantly</strong>
+     * @param retries the number of retries; 0 (the default) means a single attempt
+     */
+    public void setRetries(final int retries) {
+        this.retries = retries;
+    }
+
+    /**
+     * Sets whether links are checked as the file[s] are read.
+     * In some circumstances it may be better to read all links
+     * and check them after the files are closed. Defaults to
+     * checking links as files are read.
+     * @param readAndCheck true to check while reading, false to check
+     *        once every file has been read
+     */
+    public void setReadAndCheck(final boolean readAndCheck) {
+        this.readAndCheck = readAndCheck;
+    }
+
+    /**
+     * Set the verbosity of the logging.
+     * When true, progress messages are logged at
+     * <code>Project.MSG_INFO</code>; otherwise (the default) at
+     * <code>Project.MSG_VERBOSE</code>.
+     * @param verbose true to log progress at info level
+     */
+    public void setVerbose(final boolean verbose) {
+        this.verbosity = verbose ? Project.MSG_INFO : Project.MSG_VERBOSE;
+    }
+
+    /**
+     * Check the links.
+     * Scans the single file or every file of every fileset, then (when
+     * checking has been deferred) checks the collected links.
+     */
+    public void execute() {
+        //check options are sensible
+        validate();
+        //reset per-run state so that a re-executed instance starts clean
+        filesProcessed = 0;
+        checkedLinks.clear();
+        pendingLinks.clear();
+
+        if (file != null) {
+            //check single file
+            log("Checking links in [" + file.toString() + "]", verbosity);
+            scan(file);
+            filesProcessed++;
+        } else {
+            //go through the filesets
+            for (int i = 0, size = filesets.size(); i < size; i++) {
+                final FileSet fs = (FileSet) filesets.get(i);
+                final DirectoryScanner ds = fs.getDirectoryScanner(getProject());
+                final String[] files = ds.getIncludedFiles();
+                final File dir = fs.getDir(getProject());
+                for (int j = 0; j < files.length; j++) {
+                    final File fileToCheck = FILE_UTILS.resolveFile(dir, files[j]);
+                    log(fileToCheck.toString(), verbosity);
+                    scan(fileToCheck);
+                    filesProcessed++;
+                }
+            }
+        }
+        if (!readAndCheck) {
+            //deferred mode: all files are closed, now check what was collected
+            for (int i = 0, size = pendingLinks.size(); i < size; i++) {
+                final String[] pending = (String[]) pendingLinks.get(i);
+                log("Checking... " + pending[0], verbosity);
+                checkLink(pending[0], pending[1]);
+            }
+        }
+        log("Links checked in "
+                + filesProcessed
+                + " file" + (filesProcessed == 1 ? "." : "s."),
+                verbosity);
+    }
+
+    /*
+     * Checks task options for sensible values
+     */
+    private void validate() throws BuildException {
+        if (file != null && !filesets.isEmpty()) {
+            throw new BuildException(FILE_AND_FILESETS);
+        }
+        if (file == null && filesets.isEmpty()) {
+            throw new BuildException(NO_FILES);
+        }
+        if (retries < 0) {
+            //a negative value would skip every connection attempt silently
+            throw new BuildException(NEGATIVE_RETRIES);
+        }
+    }
+
+    /*
+     * Extracts every link from the file, line by line, and either checks
+     * it straight away or queues it for later (see readAndCheck).
+     * A link runs from the end of linkForm to the next double quote, so
+     * extra attributes or a self-closing tag after the link do not matter.
+     */
+    private void scan(final File f) throws BuildException {
+        if ((null == f) || (!f.canRead())) {
+            throw new BuildException(CANT_READ_FILE + f);
+        }
+        BufferedReader reader = null;
+        try {
+            reader = new BufferedReader(
+                    new InputStreamReader(new FileInputStream(f)));
+            String line;
+            int lineNo = 0;
+            while (null != (line = reader.readLine())) {
+                lineNo++;
+                int from = 0;
+                int start;
+                //a line may hold several links, so keep searching after each one
+                while ((start = line.indexOf(linkForm, from)) != -1) {
+                    start += linkForm.length();
+                    final int end = line.indexOf('"', start);
+                    if (end == -1) {
+                        throw new BuildException(
+                                "Link is (probably) broken across lines ["
+                                + f + ":" + lineNo + "]");
+                    }
+                    handleLink(line.substring(start, end), f, lineNo);
+                    from = end + 1;
+                }
+            }
+        } catch (IOException e) {
+            //keep the cause so the build log shows the underlying failure
+            throw new BuildException("Error reading " + f + ": " + e.getMessage(), e);
+        } finally {
+            FileUtils.close(reader);
+        }
+    }
+
+    /*
+     * Checks the link now or queues it, depending on readAndCheck.
+     * Anything that is not an absolute http(s) URL is ignored.
+     */
+    private void handleLink(final String link, final File source, final int lineNo) {
+        if (!isHttpLink(link)) {
+            //only deal with urls, forget local links
+            return;
+        }
+        final String location = source + ":" + lineNo;
+        if (readAndCheck) {
+            log("Checking... " + link, verbosity);
+            checkLink(link, location);
+        } else {
+            log("Adding... " + link, verbosity);
+            pendingLinks.add(new String[] {link, location});
+        }
+    }
+
+    /*
+     * true if the link starts with http:// or https:// (case-insensitive)
+     */
+    private static boolean isHttpLink(final String link) {
+        return link.regionMatches(true, 0, "http://", 0, 7)
+                || link.regionMatches(true, 0, "https://", 0, 8);
+    }
+
+    /*
+     * Check link to see if it's reachable.
+     * Makes up to (retries + 1) attempts and stops at the first HTTP 200.
+     * A failure is logged once, after the last attempt, together with the
+     * place the link was found.
+     * Returns the last HTTP response code as a string, a description of
+     * the exception that prevented the check, or a "previously checked"
+     * note for links that have already answered with HTTP 200.
+     */
+    private String checkLink(final String linkToCheck, final String location) {
+        if (checkedLinks.contains(linkToCheck)) {
+            final String seen = "Previously checked link [" + linkToCheck + "]";
+            log(seen, verbosity);
+            return seen;
+        }
+        String result = null;
+        for (int attempt = 0; attempt <= retries; attempt++) {
+            HttpURLConnection hcon = null;
+            try {
+                final URLConnection con = new URL(linkToCheck).openConnection();
+                if (!(con instanceof HttpURLConnection)) {
+                    //cannot ask for a response code; retrying will not help
+                    result = "not an HTTP connection";
+                    break;
+                }
+                hcon = (HttpURLConnection) con;
+                final int response = hcon.getResponseCode();
+                result = String.valueOf(response);
+                if (response == HttpURLConnection.HTTP_OK) {
+                    log(response + " [OK]", verbosity);
+                    //remember it so the same link is not fetched again
+                    checkedLinks.add(linkToCheck);
+                    return result;
+                }
+            } catch (MalformedURLException e) {
+                //the URL itself is bust, another attempt cannot succeed
+                result = e.toString();
+                break;
+            } catch (IOException e) {
+                //connection problem: may be transient, so let the loop retry
+                result = e.toString();
+            } finally {
+                if (hcon != null) {
+                    hcon.disconnect();
+                }
+            }
+        }
+        log("Link [" + linkToCheck + "] found at [" + location + "] produced "
+                + result + " [ERROR]");
+        return result;
+    }
+}

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to