Hm, not sure why you're emailing [EMAIL PROTECTED]; [EMAIL PROTECTED] may be better. Here are 2 ancient classes from 2003 that I once used to normalize URLs, to help me identify URL duplicates. This may get stripped on its way to the list.
Otis --- Chris Fraschetti <[EMAIL PROTECTED]> wrote: > Writing simple code to trim down a URL is trivial, but to actually > trim it down to its most meaningful state is very hard. In some cases > the URL parameters actually define the page; in others they are > useless > babble. I'd like to use the hash of a page's URL as well as a hash of > the content data to help me eliminate duplicates... are there any good > methods that are commonly used for URL stemming? > > -- > ___________________________________________________ > Chris Fraschetti > e [EMAIL PROTECTED] > > --------------------------------------------------------------------- > To unsubscribe, e-mail: [EMAIL PROTECTED] > For additional commands, e-mail: [EMAIL PROTECTED] > >
package com.krumpir.util; import java.net.MalformedURLException; import java.net.URL; /** * Class <code>FullURLFormatter</code> FIXME. * * @author Otis Gospodnetic * @version $Id: FullURLFormatter.java,v 1.1.1.1 2003/11/03 13:31:58 otis Exp $ */ public class FullURLFormatter extends URLFormatter { /**********************************************************************************************/ // Constructors /**********************************************************************************************/ public FullURLFormatter(int cacheSize) { super(cacheSize); } /** * Sets an URL for formatting. */ public void setURL(URL url) { originalURL = url; protocol = url.getProtocol(); host = url.getHost(); file = url.getFile(); port = url.getPort(); if (port < 1) { port = HTTPUtils.getDefaultPort(); } if (file.indexOf("?") != -1) { query = file.substring(file.indexOf("?") + 1); file = file.substring(0, file.indexOf("?")); } reference = url.getRef(); } /**********************************************************************************************/ // Methods /**********************************************************************************************/ /** * Given a dirty URL, returns a consistently formatted URL. */ public URL format(URL url) { // getFromCache() will return null if the cache is not enabled or // if the url has not been seen before. 
URL cachedURL = getFromCache(url); if (cachedURL != null) { return cachedURL; } else { setURL(url); lowerCaseProtocol(); lowerCaseHost(); //stripPortNumber(); stripReference(); stripQuery(); removeDuplicateSlashes(); stringifyTilde(); URL formattedURL = toURL(); cacheURL(originalURL, formattedURL); return formattedURL; } } /** * */ public String toString() { // FIXME: OG: toURL() can return null return toURL().toString(); } /** * */ public URL toURL() { StringBuffer urlBuffer = new StringBuffer(); try { // protocol :// host : port file urlBuffer.append(protocol + "://" + host + ":" + port + file); // add query if it existed in the original URL if (query != null) { urlBuffer.append("?" + query); } // add referece if it existed in the original URL if (reference != null) { urlBuffer.append("#" + reference); } return new URL(urlBuffer.toString()); } catch (MalformedURLException e) { // FIXME: OG System.err.println( "Problem constructing URL <" + urlBuffer.toString() + ">: " + e.getMessage()); return null; } } }
package com.krumpir.util; import java.net.MalformedURLException; import java.net.URL; import com.krumpir.util.adt.LRUHashtable; /** * Utility for making URLs consistent. * * <p> * * This class provides a bunch of little methods for altering an URL, * to make URLs print more readably and match more consistently. The * JDK URL class does some things to clean up URLs passed in as weird * strings, but it takes a conservative approach and does not do as * much as some applications might need -- particularly those that * deal with document addresses equivalent from a network or Web * server perspective but unequal as a string comparison. * * @author Otis Gospodnetic * @version $Id: URLFormatter.java,v 1.1.1.1 2003/11/03 13:32:00 otis Exp $ */ public abstract class URLFormatter { protected LRUHashtable cache = null; protected boolean cacheEnabled = false; protected int cacheSize = 0; protected URL originalURL; protected String protocol, host, file, reference, query; protected int port; /**********************************************************************************************/ // Methods /**********************************************************************************************/ /** * Constructs an empty URLFormatter. */ public URLFormatter(int cacheSize) { if (cacheSize > 0) { cache = new LRUHashtable(cacheSize); cacheEnabled = true; } this.cacheSize = cacheSize; } /** * Given a dirty URL, returns a consistently formatted URL. */ public abstract URL format(URL url); /** * */ public abstract void setURL(URL url); /** * */ public abstract URL toURL(); /** * */ public abstract String toString(); /** * Enables a cache of (un)formatted URLs to avoid reprocessing the same * URL. The cache is disabled by default. 
*/ public void cacheSwitch(short cacheSize) { if (cacheSize > 0) { cache = new LRUHashtable(cacheSize); cacheEnabled = true; } else { cacheEnabled = false; cache = null; } this.cacheSize = cacheSize; } /** * Indicates whether the cache is currently enabled or disabled. */ public boolean isCacheEnabled() { return cacheEnabled; } /** * Empties the cache of its current contents. */ public void clearCache() { if (cacheEnabled) cache.clear(); } /** * Retrieves a formatted URL from the cache of previously-seen * URLs, using the original URL as a cache key. If the cache is * disabled, this will always return null. If the cache does not * contain the argument URL as a key, it will return null. An URL * is stored in the cache when it is first seen by the format(URL). */ protected URL getFromCache(URL url) { String strURL = url.toString(); if (cacheEnabled && cache.containsKey(strURL)) { return (URL)cache.get(strURL); } else { return null; } } /** * Converts the protocol (<code>http</code> in * <code>HTTP://www.company.com/</code>) to lower-case. An upper- or * lower-case protocol is valid in most browsers, and this method * masks the apparent difference between them. */ protected void lowerCaseProtocol() { protocol = protocol.toLowerCase(); } /** * Converts the host (<code>WWW.COMPANY.COM</code> in * <code>HTTP://WWW.COMPANY.COM/</code>) to lower-case. Most domain * name resolvers are case-insensitive, so this prevents host case from * making two URLs unequal. */ protected void lowerCaseHost() { host = host.toLowerCase(); } /** * */ protected String stripReference() { // get a reference to it, null the original reference to it, and return a copy String foundRef = reference; reference = null; return foundRef; } /** * */ protected String stripQuery() { // get a reference to it, null the original reference to it, and return a copy String foundQuery = query; query = null; return foundQuery; } // This may be buggy. 
I'm seeing weird behavior when dupes appears // within a string rather than at the head of it. Maybe use // StringCharacterIterator instead of doing it this way. protected int removeDuplicateSlashes() { char[] fileChars = new char[file.length()]; char lastChar = '\u0000'; StringBuffer fileBuffer = new StringBuffer(); int duplicateCount = 0; file.getChars(0, file.length(), fileChars, 0); for (int i = 0; i < fileChars.length; i++) { if (fileChars[i] == '/' && fileChars[i] == lastChar) { duplicateCount++; } else { lastChar = fileChars[i]; fileBuffer.append(fileChars[i]); } } file = fileBuffer.toString(); return duplicateCount; } /** * */ protected boolean stringifyTilde() { if ((file.startsWith("/%7E") || file.startsWith("/%7e")) && file.length() > 4) { String afterTilde = file.substring(4); file = "/~" + afterTilde; return true; } else { return false; } } /** * */ protected void cacheURL(URL originalURL, URL formattedURL) { // TODO: OG: should we include the else case? Throw an exception? Why? if (cacheEnabled && !cache.containsKey(originalURL.toString())) { cache.put(originalURL.toString(), formattedURL); } } }
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]