The following script is a high-performance link (<a href="...">...</a>) extractor. I'm posting it to this list in the hope that anyone interested will offer constructive criticism, suggestions, comments, etc. Mainly I'm curious what comments folks have on my regular expressions. Hopefully someone finds this kind of thing as interesting as I do! :)
"""High-performance hyperlink extractor.

Design goals:
  * extract links from text (most likely valid HTML)
  * work faster than BeautifulSoup, sgmllib, or other markup parsing libraries
  * return accurate results

The basic idea is to:
  1. find anchor ('a') tags within some HTML text that contain 'href'
     attributes (these are assumed to be hyperlinks)
  2. extract all attributes from each 'a' tag found as name, value pairs
"""

import re
import urllib.request

# Collapses any run of whitespace (incl. newlines) to a single space so the
# non-greedy '.*?' in href_re can span what were originally multiple lines.
whiteout = re.compile(r'\s+')

# Grabs hyperlinks from text.
# NOTE(review): regex-based HTML parsing is inherently best-effort — it
# assumes quoted attribute values and no '>' inside attributes.
href_re = re.compile(r'''
    <a(?P<attrs>[^>]*               # start of tag
    href=(?P<delim>["'])            # delimiter
    (?P<link>[^"']*)                # link
    (?P=delim)                      # delimiter
    [^>]*)>                         # rest of start tag
    (?P<content>.*?)                # link content
    </a>                            # end tag
''', re.VERBOSE | re.IGNORECASE)

# Grabs attribute name, value pairs (quoted values only).
attrs_re = re.compile(r'''
    (?P<name>\w+)=                  # attribute name
    (?P<delim>["'])                 # delimiter
    (?P<value>[^"']*)               # attribute value
    (?P=delim)                      # delimiter
''', re.VERBOSE)


def getLinks(html_data):
    """Return a list of dicts, one per <a href=...> anchor found in html_data.

    Each dict maps attribute names to values for every quoted attribute in
    the anchor's start tag, plus the keys 'href' (the link target) and
    'content' (the text between <a ...> and </a>, whitespace-collapsed).
    Returns an empty list when no anchors are found.
    """
    newdata = whiteout.sub(' ', html_data)
    ancs = []
    for match in href_re.finditer(newdata):
        d = match.groupdict()
        a = {
            'href': d['link'],
            'content': d['content'],
        }
        # Pull every quoted name="value" pair out of the start tag; a literal
        # href attribute re-sets a['href'] with the identical value.
        for attr_match in attrs_re.finditer(d['attrs']):
            da = attr_match.groupdict()
            a[da['name']] = da['value']
        ancs.append(a)
    return ancs


if __name__ == '__main__':
    url = 'http://adammonsen.com/tut/libgladeTest.html'
    # urlopen returns bytes in Python 3; decode before regex processing.
    with urllib.request.urlopen(url) as response:
        html_data = response.read().decode('utf-8', errors='replace')
    for a in getLinks(html_data):
        print(a)

# -- Adam Monsen  http://adammonsen.com/