Since I had a couple of hours free today, I decided to knock up a little task to do it for me.
Use Case:
- check links in a text source to ensure that they respond with an HTTP 200
- source can be anything (including source code)
- links must be valid xhtml or docbook links, i.e.
<a href="link"></a>
<ulink url="link"/>
or
<ulink url="link"></ulink>
- all links must be in "" quotes
- all links must be on one line (not great I know)
I know about Anteater, but that looked way over the top for what seemed a pretty small task (and Not Invented Here I have to admit).
I've used it to track down some broken links in antnews.html (and I'm sure I'll find a few others). I'm not using it as a test tool, rather as part of a doc generation/validation task.
Anyway, patch attached for your amusement.
Kev
Index: CheckLink.java =================================================================== RCS file: CheckLink.java diff -N CheckLink.java --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ CheckLink.java 1 Jan 1970 00:00:00 -0000 @@ -0,0 +1,284 @@ +/* + * Copyright 2001-2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package org.apache.tools.ant.taskdefs.optional; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLConnection; +import java.util.Enumeration; +import java.util.Hashtable; +import java.util.Vector; + +import org.apache.tools.ant.BuildException; +import org.apache.tools.ant.DirectoryScanner; +import org.apache.tools.ant.Project; +import org.apache.tools.ant.Task; +import org.apache.tools.ant.types.FileSet; +import org.apache.tools.ant.util.FileUtils; + +/** + * A simple link checking task. + * Checks any text file type (including source code), + * to ensure that embedded links are reachable. + * Most useful for checking documentation as part of the build. 
+ */ +public class CheckLink extends Task { + + /* messages */ + private static final String CANT_READ_FILE = "Cannot open file "; + private static final String FILE_AND_FILESETS = "Use either file or a fileset, not both"; + private static final String NO_FILES = "No files to check"; + + private static final FileUtils FILE_UTILS = FileUtils.getFileUtils(); + + private int verbosity = Project.MSG_VERBOSE; + private boolean readAndCheck = true; + private File file = null; + private int retries = 0; + private Vector filesets = new Vector(); + private Vector checkedLinks = new Vector(); + private Hashtable links = new Hashtable(); + private int filesProcessed = 0; + + /* default to standard html links */ + private boolean docbook = false; + private String linkForm = "<a href=\""; + + /** + * Set the format of the link. + * Tested with (x)html and docbook links: + * <pre> + * a href + * ulink url + * </pre> + * @param linkForm + */ + public void setLinkForm(final String linkForm) { + this.linkForm = "<"+linkForm+"=\""; + if (linkForm.charAt(0) == 'u') { + docbook = true; + } + } + + /** + * Set the single source file to check + * @param file + */ + public void setFile(final File file) { + this.file = file; + } + + /** + * Add a fileset to be checked + * @param fs + */ + public void addFileset(final FileSet fs) { + this.filesets.add(fs); + } + + /** + * Sets the number of times a connection attempt is + * made before giving up + * <strong>A high number will slow down the build significantly</strong> + * @param retries + */ + public void setRetries(final int retries) { + this.retries = retries; + } + + /** + * Sets to check links as the file[s] are read + * In come circumstances it maybe better to read all links + * and check them after the files are closed. Defaults to + * check links as files are read. 
+ * @param readAndCheck + */ + public void setReadAndCheck(final boolean readAndCheck) { + this.readAndCheck = readAndCheck; + } + + /** + * Set the verbosity of the logging. + * Default is <code>Project.MSG_INFO</code> + * @param verbose + */ + public void setVerbose(final boolean verbose) { + this.verbosity = verbose ? Project.MSG_INFO : Project.MSG_VERBOSE; + } + + /** + * Check the links. + */ + public void execute() { + //check options are sensible + validate(); + if (file != null) { + //check single file + log("Checking links in ["+file.toString()+"]", verbosity); + scan(file); + filesProcessed =1; + } else { + //go through the fileset + for (int i = 0, size = filesets.size(); i<size; i++) { + final FileSet fs = (FileSet)filesets.get(i); + final DirectoryScanner ds = fs.getDirectoryScanner(getProject()); + final String [] files = ds.getIncludedFiles(); + final File dir = fs.getDir(getProject()); + for (int j = 0;j < files.length; j++) { + /* for each file, scan for links and try to resolve URL + write out any failures (404 etc) to log + */ + File fileToCheck = FILE_UTILS.resolveFile(dir, files[j]); + log(fileToCheck.toString(), verbosity); + scan(fileToCheck); + filesProcessed++; + } + } + } + if (!readAndCheck) { + for(Enumeration e = links.keys();e.hasMoreElements();) { + final String lineNo = (String)e.nextElement(); + final String link = (String)links.get(lineNo); + checkLink(link,Integer.parseInt(lineNo)); + } + } + log("Links checked in " + + filesProcessed + + " file" + (filesProcessed > 1 ? "s." 
: ".") + , verbosity); + } + + /* + * Checks task options for sensible values + */ + private void validate() throws BuildException { + if (file != null && !filesets.isEmpty()) { + throw new BuildException(FILE_AND_FILESETS); + } + if (file == null && filesets.isEmpty()) { + throw new BuildException(NO_FILES); + } + } + + /* + * Extracts links from files and checks each link + */ + private void scan(final File f) throws BuildException { + if ((null == f) || (!f.canRead())) { + throw new BuildException(CANT_READ_FILE + f); + } + BufferedReader b = null; + try { + b = new BufferedReader( + new InputStreamReader(new FileInputStream(f)) + ); + String read; + int lineNo = 0; + while(null != (read = b.readLine())) { + lineNo++; + int start = read.indexOf(linkForm); + if (start != -1) { + start += linkForm.length(); + int end = read.length(); + //found link + //find end of link : only deals with valid html/xml links + for(int i=start; ;i++) { + try { + if (docbook) { + if(read.charAt(i) == '>' && read.charAt(i-1) == '"') { + end = i-1; + break; + } else if (read.charAt(i) == '>' && read.charAt(i-1) == '/') { + end = i-2; + break; + } + } else { + if(read.charAt(i) == '>' && read.charAt(i-1) == '"') { + end = i-1; + break; + } + } + } catch (Exception e) { + throw new BuildException("Link is (probably) broken across lines ["+lineNo+"]\n"+ + e.getMessage()); + } + } + //check link or add to list + final String link = read.substring(start, end); + //only deal with urls, forget local links + if (link.indexOf("http")!= -1) { + if (readAndCheck) { + log("Checking... "+link, verbosity); + checkLink(link, lineNo); + } else { + log("Adding... 
"+link, verbosity); + links.put(""+lineNo, link); + } + } + } + } + } catch (IOException e) { + throw new BuildException(e.getMessage()); + } finally { + FileUtils.close(b); + } + } + + /* + * check link to see if it's reachable + * returns the HTTP response code or an exception message + */ + private String checkLink(final String linkToCheck, final int lineNo) { + String result = null; + if (checkedLinks.contains(linkToCheck)) { + log("Previously checked link ["+linkToCheck+"]", verbosity); + return "Previously checked link ["+linkToCheck+"]"; + } + for (int i = 0; i <=retries; i++) { + try { + final URL url = new URL(linkToCheck); + final URLConnection con = url.openConnection(); + final HttpURLConnection hcon = (HttpURLConnection) con; + final int response = hcon.getResponseCode(); + if (response != HttpURLConnection.HTTP_OK) { + log("Link found at line ["+lineNo+"] produced "+ + response + " [ERROR]"); + } else { + log(HttpURLConnection.HTTP_OK + " [OK]", verbosity); + //no need to retry + checkedLinks.add(linkToCheck); + break; + } + } catch (MalformedURLException e) { + /* do nothing, if URL is bust, then it's unreachable */ + result = e.getMessage(); + break; + } catch (IOException e) { + /* do nothing, if we have a connection problem then it's unreachable */ + result = e.getMessage(); + break; + } + } + return result; + } +}
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]