"""Check for broken links in an XHTML file.

Demonstrates several Pyana features:
1. how to accept input from both a string and a URL
2. how to pass a top-level parameter
3. how to have the output of a transformation sent to a writer object
4. how to extend XPath with Python functions

This script targets Python 2 (it uses ``urllib2`` and ``urlparse``).

Sample output:

> check_links.py http://pyana.sourceforge.net/
Checking for broken links in "http://pyana.sourceforge.net/"
Checking image "http://sourceforge.net/sflogo.php?group_id=28142&type=1".ok
Checking link "http://sourceforge.net".................................ok
Checking link "http://sourceforge.net/projects/pyana/".................ok
Checking link "http://sourceforge.net/project/showfiles.php?group_id=28142".ok
Checking link "build.html".............................................failed
Checking link "examples"...............................................ok
Checking link "http://pirxx.sourceforge.net"...........................ok
Checking link "mailto:brian@sweetapp.com"..............................maybe
"""

import sys
import urllib2
import urlparse

import Pyana


def checkBroken(base, frag):
    """checkBroken('http://www.python.org/', 'img.gif') => 'failed'

    Check whether a URL can be accessed.

    base -- the URL that `frag` is resolved against
    frag -- an absolute or relative URL reference

    Returns 'ok' if the resolved URL could be opened, 'failed' if opening
    it raised IOError, and 'maybe' if the URL uses a protocol that cannot
    be checked (anything other than ftp, http, https or gopher).
    """
    # Encode the Unicode URIs as UTF-8. If they are ASCII then their
    # content will be unchanged.
    url = urlparse.urljoin(base.encode('utf-8'), frag.encode('utf-8'))

    if urlparse.urlparse(url)[0] not in ('ftp', 'http', 'https', 'gopher'):
        return 'maybe'  # urllib2 is going to have problems

    # `url` is already an absolute, UTF-8 encoded byte string. Joining and
    # encoding it a second time is redundant, and on Python 2 calling
    # .encode() on a non-ASCII byte string raises UnicodeDecodeError (it is
    # implicitly decoded as ASCII first), so open it as-is.
    try:
        response = urllib2.urlopen(url)
    except IOError:
        return 'failed'
    else:
        # Release the connection instead of waiting for garbage collection.
        response.close()
        return 'ok'


def format(str, width=55, char='.'):
    """format("image.gif", width=12) => '"image.gif"...'

    Quote `str` and pad it on the right with `char`.

    str   -- the text to quote
    width -- `width - len(str)` fill characters are appended
    char  -- the fill character

    At least one fill character is always appended, even when `str` is
    longer than `width`.
    """
    # Honour the `char` argument; the padding used to be hard-coded to '.'.
    return '"' + str + '"' + char * max(width - len(str), 1)


# Install functions as XPath extensions, stick all of the
# functions in the 'pyNS' namespace
Pyana.installGlobalExtension('pyNS', checkBroken, 'checkBroken')
Pyana.installGlobalExtension('pyNS', format, 'format')

# NOTE(review): this string is passed to Pyana as the stylesheet (`style=`)
# but contains no XSL elements -- the markup appears to have been lost.
# Restore the original stylesheet from a pristine copy before relying on it.
checkURLsXSL = r''' Checking for broken links in "" Checking image Checking link '''


def printBrokenLinksForURL(url):
    """Transform the document at `url` with checkURLsXSL, writing to stdout.

    Prints a short message instead of raising if Pyana reports a SAXError
    for the source document.
    """
    # Send the output to a writer so we don't have to wait until the entire
    # transformation is complete before seeing output. Any object with a
    # "write" method that takes a string is fine, so just use sys.stdout
    # instead of creating our own object.
    try:
        Pyana.transform2Writer(
            source=Pyana.URI(url),
            style=checkURLsXSL,
            # NOTE(review): repr() presumably turns the URL into a quoted
            # string literal for the stylesheet parameter -- confirm this is
            # what Pyana expects, especially for URLs containing quotes.
            params={'url': repr(url)},
            writer=sys.stdout)
    except Pyana.SAXError:
        print('The source does not seem to be valid XHTML')


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print('usage: %s url1 [url2] .. [urln]\n'
              'e.g. %s http://pyana.sourceforge.net/'
              % (sys.argv[0], sys.argv[0]))
    else:
        for url in sys.argv[1:]:
            printBrokenLinksForURL(url)