Re: URL Stemmer

Otis Gospodnetic Wed, 27 Jul 2005 16:31:10 -0700

Hm, not sure why you're emailing [EMAIL PROTECTED]  [EMAIL PROTECTED]
may be better.  Here are 2 ancient classes from 2003 that I once used
to normalize URLs, to help me identify URL duplicates.  This may get
stripped on its way to the list.


Otis


--- Chris Fraschetti <[EMAIL PROTECTED]> wrote:

> Writing simple code to trim down a URL is trivial, but to actually
> trim it down to its most meaningful state is very hard. In some cases
> the URL parameters actually define the page; in others they are
> useless
> babble. I'd like to use the hash of a page's URL as well as a hash of
> the content data to help me eliminate duplicates... are there any good
> methods that are commonly used for URL stemming?
> 
> -- 
> ___________________________________________________
> Chris Fraschetti
> e [EMAIL PROTECTED]
> 
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: [EMAIL PROTECTED]
> For additional commands, e-mail: [EMAIL PROTECTED]
> 
>

package com.krumpir.util;

import java.net.MalformedURLException;
import java.net.URL;


/**
 * A {@link URLFormatter} that reduces a URL to the canonical form
 * <code>protocol://host:port/path</code>.
 *
 * <p>
 * {@link #format(URL)} lower-cases the protocol and host, always writes the
 * port explicitly, drops the query string and the fragment reference,
 * collapses repeated slashes in the path and turns a leading
 * <code>/%7E</code> into <code>/~</code>.
 *
 * <p>
 * The URL being processed is held in instance fields, so one instance must
 * not be used by several threads at the same time.
 *
 * @author Otis Gospodnetic
 * @version $Id: FullURLFormatter.java,v 1.1.1.1 2003/11/03 13:31:58 otis Exp $
 */
public class FullURLFormatter extends URLFormatter
{
    /**********************************************************************************************/

    // Constructors

    /**********************************************************************************************/

    /**
     * Creates a formatter.
     *
     * @param cacheSize maximum number of formatted URLs to remember; a value
     *                  of zero or less leaves the cache disabled
     */
    public FullURLFormatter(int cacheSize)
    {
        super(cacheSize);
    }

    /**
     * Sets an URL for formatting by splitting it into protocol, host, port,
     * path, query and reference.  Every component is overwritten, so nothing
     * from a previously set URL survives.
     *
     * @param url the URL to take apart; must not be <code>null</code>
     * @throws NullPointerException if <code>url</code> is <code>null</code>
     */
    public void setURL(URL url)
    {
        originalURL = url;
        protocol = url.getProtocol();
        host = url.getHost();
        reference = url.getRef();

        // URL.getFile() is path + query.  Clear the query first so that an
        // URL without '?' does not inherit the query of the previous URL.
        query = null;
        file = url.getFile();
        int queryStart = file.indexOf('?');
        if (queryStart != -1)
        {
            query = file.substring(queryStart + 1);
            file = file.substring(0, queryStart);
        }

        port = url.getPort();
        if (port < 1)
        {
            // Prefer the protocol's own default (e.g. 443 for https) so that
            // making the port explicit does not change what the URL points to.
            port = url.getDefaultPort();
            if (port < 1)
            {
                // The protocol defines no default: keep the historical fallback.
                port = HTTPUtils.getDefaultPort();
            }
        }
    }

    /**********************************************************************************************/

    // Methods

    /**********************************************************************************************/
    /**
     * Given a dirty URL, returns a consistently formatted URL.
     *
     * @param url the URL to normalize; must not be <code>null</code>
     * @return the normalized URL (possibly served from the cache), or
     *         <code>null</code> if the normalized text could not be parsed
     *         back into an URL
     */
    public URL format(URL url)
    {
        // getFromCache() will return null if the cache is not enabled or
        // if the url has not been seen before.
        URL cachedURL = getFromCache(url);
        if (cachedURL != null)
        {
            return cachedURL;
        }

        setURL(url);
        lowerCaseProtocol();
        lowerCaseHost();

        // The port is kept; only the reference and the query are dropped.
        stripReference();
        stripQuery();
        removeDuplicateSlashes();
        stringifyTilde();

        URL formattedURL = toURL();
        if (formattedURL != null)
        {
            // Never hand a null value to the cache; a failed format is simply
            // retried the next time the same URL is seen.
            cacheURL(originalURL, formattedURL);
        }

        return formattedURL;
    }

    /**
     * Returns the current components as an URL string.
     *
     * @return the string form of {@link #toURL()}, or the raw assembled text
     *         when that text is not a parseable URL
     */
    public String toString()
    {
        URL url = toURL();

        // toURL() returns null on a malformed result; fall back to the raw
        // text instead of failing with a NullPointerException.
        return (url != null) ? url.toString() : buildURLString();
    }

    /**
     * Assembles the current components into an URL.
     *
     * @return the assembled URL, or <code>null</code> if the assembled text is
     *         malformed (the problem is reported on <code>System.err</code>)
     */
    public URL toURL()
    {
        String urlString = buildURLString();

        try
        {
            return new URL(urlString);
        }
        catch (MalformedURLException e)
        {
            // FIXME: OG
            System.err.println(
                "Problem constructing URL <" + urlString + ">: " + e.getMessage());

            return null;
        }
    }

    /**
     * Concatenates the components as
     * <code>protocol://host:port</code> + path [+ <code>?query</code>]
     * [+ <code>#reference</code>].
     */
    private String buildURLString()
    {
        StringBuffer urlBuffer = new StringBuffer();

        // protocol :// host : port file
        urlBuffer.append(protocol).append("://").append(host);
        urlBuffer.append(':').append(port).append(file);

        // add query if it is (still) set
        if (query != null)
        {
            urlBuffer.append('?').append(query);
        }

        // add reference if it is (still) set
        if (reference != null)
        {
            urlBuffer.append('#').append(reference);
        }

        return urlBuffer.toString();
    }
}

package com.krumpir.util;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.Locale;

import com.krumpir.util.adt.LRUHashtable;

/**
 * Utility for making URLs consistent.
 *
 * <p>
 *
 * This class provides a bunch of little methods for altering an URL,
 * to make URLs print more readably and match more consistently.  The
 * JDK URL class does some things to clean up URLs passed in as weird
 * strings, but it takes a conservative approach and does not do as
 * much as some applications might need -- particularly those that
 * deal with document addresses equivalent from a network or Web
 * server perspective but unequal as a string comparison.
 *
 * <p>
 *
 * The URL being worked on is kept in the protected component fields,
 * which the helper methods modify in place without any synchronization;
 * an instance must therefore not be shared between threads.
 *
 * @author Otis Gospodnetic
 * @version $Id: URLFormatter.java,v 1.1.1.1 2003/11/03 13:32:00 otis Exp $
 */
public abstract class URLFormatter
{
    /** Formatted URLs keyed by the original URL's string form; <code>null</code> while the cache is disabled. */
    protected LRUHashtable cache = null;
    /** Whether {@link #cache} is in use. */
    protected boolean cacheEnabled = false;
    /** The size most recently requested for the cache. */
    protected int cacheSize = 0;

    /** The URL most recently handed to the formatter. */
    protected URL originalURL;
    /** Components of the URL being formatted; the helper methods below edit them in place. */
    protected String protocol, host, file, reference, query;
    protected int port;

    /**********************************************************************************************/
    // Methods
    /**********************************************************************************************/

    /**
     * Constructs an empty URLFormatter.
     *
     * @param cacheSize capacity of the cache of formatted URLs; a value of
     *                  zero or less leaves the cache disabled
     */
    public URLFormatter(int cacheSize)
    {
        configureCache(cacheSize);
    }

    /**
     * Given a dirty URL, returns a consistently formatted URL.
     */
    public abstract URL format(URL url);

    /**
     * Sets the URL whose components the helper methods will operate on.
     */
    public abstract void setURL(URL url);

    /**
     * Assembles the current components into an URL.
     */
    public abstract URL toURL();

    /**
     * Returns the current components as a string.
     */
    public abstract String toString();

    /**
     * Enables a cache of (un)formatted URLs to avoid reprocessing the same
     * URL.  The cache is disabled by default.  Any previously cached
     * entries are discarded.
     *
     * @param cacheSize capacity of the new cache; a value of zero or less
     *                  disables caching
     */
    public void cacheSwitch(short cacheSize)
    {
        configureCache(cacheSize);
    }

    /**
     * Indicates whether the cache is currently enabled or disabled.
     */
    public boolean isCacheEnabled()
    {
        return cacheEnabled;
    }

    /**
     * Empties the cache of its current contents.  Does nothing while the
     * cache is disabled.
     */
    public void clearCache()
    {
        if (cacheEnabled)
        {
            cache.clear();
        }
    }

    /**
     * Retrieves a formatted URL from the cache of previously-seen
     * URLs, using the original URL as a cache key.  If the cache is
     * disabled, this will always return null.  If the cache does not
     * contain the argument URL as a key, it will return null.  An URL
     * is stored in the cache when it is first seen by the format(URL).
     *
     * @param url the original, unformatted URL
     * @return the cached formatted URL, or <code>null</code>
     */
    protected URL getFromCache(URL url)
    {
        if (!cacheEnabled)
        {
            return null;
        }

        String key = url.toString();
        if (cache.containsKey(key))
        {
            return (URL) cache.get(key);
        }
        return null;
    }

    /**
     * Converts the protocol (<code>http</code> in
     * <code>HTTP://www.company.com/</code>) to lower-case.  An upper- or
     * lower-case protocol is valid in most browsers, and this method
     * masks the apparent difference between them.
     */
    protected void lowerCaseProtocol()
    {
        // Fixed locale: with the default locale a Turkish JVM would turn
        // 'I' into a dotless 'i' and corrupt the protocol name.
        protocol = protocol.toLowerCase(Locale.ENGLISH);
    }

    /**
     * Converts the host (<code>WWW.COMPANY.COM</code> in
     * <code>HTTP://WWW.COMPANY.COM/</code>) to lower-case.  Most domain
     * name resolvers are case-insensitive, so this prevents host case from
     * making two URLs unequal.
     */
    protected void lowerCaseHost()
    {
        // Fixed locale for the same reason as in lowerCaseProtocol().
        host = host.toLowerCase(Locale.ENGLISH);
    }

    /**
     * Removes the reference (the part after <code>#</code>) from the
     * current components.
     *
     * @return the reference that was removed, or <code>null</code> if none
     *         was set
     */
    protected String stripReference()
    {
        String foundRef = reference;
        reference = null;
        return foundRef;
    }

    /**
     * Removes the query (the part after <code>?</code>) from the current
     * components.
     *
     * @return the query that was removed, or <code>null</code> if none was
     *         set
     */
    protected String stripQuery()
    {
        String foundQuery = query;
        query = null;
        return foundQuery;
    }

    /**
     * Collapses every run of consecutive slashes in the path into a single
     * slash, wherever in the path the run occurs.
     *
     * @return the number of slashes that were removed
     */
    protected int removeDuplicateSlashes()
    {
        StringBuffer collapsed = new StringBuffer(file.length());
        int duplicateCount = 0;
        // Last character copied to the output; starts as NUL so a leading
        // slash is never mistaken for a duplicate.
        char lastKept = '\u0000';

        for (int i = 0; i < file.length(); i++)
        {
            char current = file.charAt(i);
            if (current == '/' && lastKept == '/')
            {
                duplicateCount++;
            }
            else
            {
                collapsed.append(current);
                lastKept = current;
            }
        }

        file = collapsed.toString();
        return duplicateCount;
    }

    /**
     * Replaces an escaped tilde at the very start of the path
     * (<code>/%7E</code> or <code>/%7e</code>) with a literal
     * <code>/~</code>.  The path is only changed when something follows
     * the escape; escaped tildes elsewhere in the path are left alone.
     *
     * @return <code>true</code> if the path was changed
     */
    protected boolean stringifyTilde()
    {
        boolean startsWithEscapedTilde = file.startsWith("/%7E") || file.startsWith("/%7e");
        if (!startsWithEscapedTilde || file.length() <= 4)
        {
            return false;
        }

        file = "/~" + file.substring(4);
        return true;
    }

    /**
     * Remembers the formatted form of an URL.  Nothing is stored while the
     * cache is disabled, when there is no formatted URL to store, or when
     * the original URL is already cached (the existing entry is kept).
     *
     * @param originalURL  the URL as it was handed in; used as the key
     * @param formattedURL the result of formatting it
     */
    protected void cacheURL(URL originalURL, URL formattedURL)
    {
        // A null result is never cached: a lookup would report it as a miss
        // anyway, and the cache implementation may not accept null values.
        if (!cacheEnabled || formattedURL == null)
        {
            return;
        }

        // TODO: OG: should we include the else case? Throw an exception? Why?
        String key = originalURL.toString();
        if (!cache.containsKey(key))
        {
            cache.put(key, formattedURL);
        }
    }

    /**
     * Creates a fresh cache of the given size, or disables caching when the
     * size is not positive.  Shared by the constructor and
     * {@link #cacheSwitch(short)}.
     */
    private void configureCache(int size)
    {
        if (size > 0)
        {
            cache = new LRUHashtable(size);
            cacheEnabled = true;
        }
        else
        {
            cacheEnabled = false;
            cache = null;
        }
        this.cacheSize = size;
    }
}

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Re: URL Stemmer

Reply via email to