Sophie

Sophie

distrib > Mandriva > 2010.0 > i586 > media > contrib-release > by-pkgid > 7d3d6e5bed16fe556d7983604bb8c690 > files > 32

python-parsing-1.4.2-8mdv2010.0.noarch.rpm

import urllib

from pyparsing import *

anchorStart,anchorEnd = makeHTMLTags("a")

# read HTML from a web page
serverListPage = urllib.urlopen( "http://www.yahoo.com" )
htmlText = serverListPage.read()
serverListPage.close()

anchor = anchorStart + SkipTo(anchorEnd).setResultsName("body") + anchorEnd


for tokens,start,end in anchor.scanString(htmlText):
    print tokens.body,'->',tokens.href