<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<!-- $Id: head.tmpl,v 1.5 2002/12/15 01:30:47 carstenklapp Exp $ -->
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
<meta name="robots" content="index,follow" />
<meta name="keywords" content="Web Crawler, PhpWiki" />
<meta name="description" content="A crawler is a program that picks up a page and follows all the links on the page. Crawlers are used in search engines to index all the pages on a website, starting only from the first page (as long as it is linked)." />
<meta name="language" content="" />
<meta name="document-type" content="Public" />
<meta name="document-rating" content="General" />
<meta name="generator" content="phpWiki" />
<meta name="PHPWIKI_VERSION" content="1.3.4" />

<link rel="shortcut icon" href="/wiki/themes/default/images/favicon.ico" />
<link rel="home" title="HomePage" href="HomePage" />
<link rel="help" title="HowToUseWiki" href="HowToUseWiki" />
<link rel="copyright" title="GNU General Public License" href="http://www.gnu.org/copyleft/gpl.html#SEC1" />
<link rel="author" title="The PhpWiki Programming Team" href="http://phpwiki.sourceforge.net/phpwiki/ThePhpWikiProgrammingTeam" />
<link rel="search" title="FindPage" href="FindPage" />
<link rel="alternate" title="View Source: WebCrawler" href="WebCrawler?action=viewsource&amp;version=8" />
<link rel="alternate" type="application/rss+xml" title="RSS" href="RecentChanges?format=rss" />

<link rel="bookmark" title="SandBox" href="SandBox" />
<link rel="bookmark" title="WikiWikiWeb" href="WikiWikiWeb" />



<link rel="stylesheet" title="MacOSX" type="text/css" charset="iso-8859-1" href="/wiki/themes/MacOSX/MacOSX.css" /><link rel="alternate stylesheet" title="Printer" type="text/css" charset="iso-8859-1" href="/wiki/themes/default/phpwiki-printer.css" media="print, screen" /><link rel="alternate stylesheet" title="Modern" type="text/css" charset="iso-8859-1" href="/wiki/themes/default/phpwiki-modern.css" /><style type="text/css">
<!--
body {background-image: url(/wiki/themes/MacOSX/images/bgpaper8.png);}
-->
</style>
<title>PhpWiki - Web Crawler</title>
</head>
<!-- End head -->
<!-- Begin body -->
<!-- $Id: body.tmpl,v 1.30 2002/09/02 14:36:58 rurban Exp $ -->
<body>
<!-- Begin top -->
<!-- $Id: top.tmpl,v 1.20 2002/12/15 01:30:47 carstenklapp Exp $ -->

<!-- End top -->
<!-- Begin browse -->
<!-- $Id: browse.tmpl,v 1.22 2002/02/19 23:00:26 carstenklapp Exp $ -->


<div class="wikitext"><p><b>Web Crawler (aka Spider)</b></p>
<p>A crawler is a program that fetches a page and follows all the links on it. Crawlers are used in search engines to index all the pages on a website, starting from just the first page (as long as the other pages are reachable through links).</p>
<p>There are several crawlers out there, but few good-quality open-source ones. The problem is that most crawlers fail when the parser they use cannot cope with real-world markup. Using HTMLParser, it is possible to crawl through dirty HTML, and to do it quickly.</p>
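<p>As a minimal illustration of that claim, the sketch below parses a single page and prints every link it finds, using the same calls as the Robot program further down (the class name <i>ListLinks</i>, the example URL, and the import paths, which assume the usual org.htmlparser package layout, are illustrative only):</p>
<pre>
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.ParserException;

public class ListLinks {
  public static void main(String[] args) throws ParserException {
    // Parse one page and print every link on it, even if the HTML is dirty
    Parser parser = new Parser("http://htmlparser.sourceforge.net",new DefaultParserFeedback());
    parser.registerScanners();
    for (NodeIterator e = parser.elements();e.hasMoreNodes();)
    {
      Node node = e.nextNode();
      if (node instanceof LinkTag)
        System.out.println(((LinkTag)node).getLink());
    }
  }
}</pre>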
<p>There are two types of crawlers:</p>
<ul>
<li>Breadth First</li>
<li>Depth First</li>
</ul>
<p><b>Breadth First crawlers</b> use the BFS (Breadth-First Search) algorithm. Here's a brief description (a short sketch follows the list):</p>
<ol>
<li>Get all links from the starting page and add them to the queue.</li>
<li>Take the first link from the queue, get all links on that page, and add them to the queue.</li>
<li>Repeat step 2 until the queue is empty.</li>
</ol>
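<p>A minimal BFS sketch, built on the same HTMLParser calls as the Robot program below (the class name <i>BreadthFirstRobot</i>, the <i>maxPages</i> limit and the import paths are assumptions, not part of the library):</p>
<pre>
import java.util.HashSet;
import java.util.LinkedList;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.ParserException;

public class BreadthFirstRobot {
  /**
   * Breadth-first crawl: pages are visited level by level using a FIFO queue.
   * maxPages bounds the crawl so it terminates on large sites.
   */
  public static void crawl(String startUrl,int maxPages) throws ParserException {
    LinkedList queue = new LinkedList();   // URLs waiting to be visited (FIFO)
    HashSet visited = new HashSet();       // URLs already processed
    queue.addLast(startUrl);
    while (!queue.isEmpty())
    {
      if (visited.size() &gt;= maxPages) break;     // page budget used up
      String url = (String)queue.removeFirst();
      if (!visited.add(url)) continue;             // already seen, skip
      System.out.println("Visiting "+url);
      Parser parser = new Parser(url,new DefaultParserFeedback());
      parser.registerScanners();
      for (NodeIterator e = parser.elements();e.hasMoreNodes();)
      {
        Node node = e.nextNode();
        if (node instanceof LinkTag)
        {
          LinkTag linkTag = (LinkTag)node;
          if (!linkTag.isMailLink())
            queue.addLast(linkTag.getLink());      // enqueue for the next level
        }
      }
    }
  }
}</pre>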
<p><b>Depth First crawlers</b> use the DFS (Depth-First Search) algorithm. Here's a brief description:</p>
<ol>
<li>Get the first unvisited link from the starting page.</li>
<li>Visit that link and get the first unvisited link on the new page.</li>
<li>Repeat step 2 until there are no more unvisited links.</li>
<li>Go back to the previous level of recursion, take its next unvisited link, and repeat step 2 until no unvisited links remain.</li>
</ol>
<p>BFS crawlers are simple to write. DFS crawlers can be slightly more involved, so a simple DFS crawler program is presented below. This is a basic program, included in the <i>org.htmlparser.parserapplications</i> package as <i>Robot.java</i>. Feel free to modify it or add functionality to it.</p>
<pre>
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.DefaultParserFeedback;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.ParserException;

public class Robot {
  private Parser parser;
  /**
   * Robot crawler - Provide the starting url
   */
  public Robot(String resourceLocation) {
    try {
      parser = new Parser(resourceLocation,new DefaultParserFeedback());
      parser.registerScanners();
    }
    catch (ParserException e) {
      System.err.println("Error, could not create parser object");
      e.printStackTrace();
    }
  }
  /**
   * Crawl using a given crawl depth.
   * @param crawlDepth Depth of crawling
   */
  public void crawl(int crawlDepth) throws ParserException
  {
    try {
      crawl(parser,crawlDepth);
    }
    catch (ParserException e) {
      throw new ParserException("ParserException at crawl("+crawlDepth+")",e);
    }
  }
  /**
   * Crawl using a given parser object, and a given crawl depth.
   * @param parser Parser object
   * @param crawlDepth Depth of crawling
   */
  public void crawl(Parser parser,int crawlDepth) throws ParserException {
    System.out.println(" crawlDepth = "+crawlDepth);
    for (NodeIterator e = parser.elements();e.hasMoreNodes();)
    {
      Node node = e.nextNode();
      if (node instanceof LinkTag)
      {
        LinkTag linkTag = (LinkTag)node;
        if (!linkTag.isMailLink())
        {
          // Follow only links whose URLs contain "htm", "com" or "org"
          if (linkTag.getLink().toUpperCase().indexOf("HTM")!=-1 ||
            linkTag.getLink().toUpperCase().indexOf("COM")!=-1 ||
            linkTag.getLink().toUpperCase().indexOf("ORG")!=-1)
          {
            if (crawlDepth&gt;0)
            {
              // Recurse into the linked page with one less level of depth
              Parser newParser = new Parser(linkTag.getLink(),new DefaultParserFeedback());
              newParser.registerScanners();
              System.out.print("Crawling to "+linkTag.getLink());
              crawl(newParser,crawlDepth-1);
            }
            else System.out.println(linkTag.getLink());
          }
        }
      }
    }
  }

  public static void main(String[] args)
  {
    System.out.println("Robot Crawler v"+Parser.VERSION_STRING);
    if (args.length&lt;2 || args[0].equals("-help"))
    {
      System.out.println();
      System.out.println("Syntax : java -classpath htmlparser.jar org.htmlparser.parserapplications.Robot &lt;resourceLocn/website&gt; &lt;depth&gt;");
      System.out.println();
      System.out.println("   &lt;resourceLocn&gt; the name of the file to be parsed (with complete path ");
      System.out.println("                  if not in current directory)");
      System.out.println("   &lt;depth&gt; No of links to be followed from each link");
      System.out.println("   -help This screen");
      System.out.println();
      System.out.println("HTML Parser home page : http://htmlparser.sourceforge.net");
      System.out.println();
      System.out.println("Example : java -classpath htmlparser.jar com.kizna.parserapplications.Robot http://www.google.com 3");
      System.out.println();
      System.out.println("If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. ");
      System.exit(-1);
    }
    String resourceLocation="";
    int crawlDepth = 1;
    if (args.length!=0) resourceLocation = args[0];
    if (args.length==2) crawlDepth=Integer.valueOf(args[1]).intValue();


    Robot robot = new Robot(resourceLocation);
    System.out.println("Crawling Site "+resourceLocation);
    try {
      robot.crawl(crawlDepth);
    }
    catch (ParserException e) {
      e.printStackTrace();
    }
  }
}</pre>
<p>The crawling is done by the recursive method crawl(parser, depth). The crawler creates a new parser for each link it follows and moves through the site depth-first.</p>
<p>Be careful with the depth you give the crawler; studying the time taken to map all the links is an interesting research project in itself. A word of caution: some sites don't like crawlers going through them. Many publish a file called robots.txt in their root directory; a crawler should fetch it, read the rules, and honor them. Read up on the robots exclusion standard before deploying a crawler. The above program is only a demonstration. Please note that it only follows links whose URLs contain ".com", ".htm" or ".org"; in real-life situations, you would also want to support dynamic links.</p>
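<p>As a rough illustration of honoring robots.txt (this helper is not part of HTMLParser or the Robot program; the class name and the very simplified rule handling are assumptions), a crawler can fetch the file once per site and skip any path covered by a "User-agent: *" Disallow rule:</p>
<pre>
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;

public class RobotsCheck {
  /**
   * Very simplified robots.txt check: collects the Disallow prefixes listed
   * under "User-agent: *" and reports whether a path starts with any of them.
   * Real robots.txt handling has more rules; treat this only as a sketch.
   */
  public static boolean isAllowed(String site,String path) {
    ArrayList disallowed = new ArrayList();
    try {
      URL robots = new URL(site+"/robots.txt");
      BufferedReader in = new BufferedReader(new InputStreamReader(robots.openStream()));
      boolean anyAgent = false;
      String line;
      while ((line = in.readLine()) != null)
      {
        line = line.trim();
        if (line.toLowerCase().startsWith("user-agent:"))
          anyAgent = line.substring(11).trim().equals("*");
        else if (anyAgent)
        {
          if (line.toLowerCase().startsWith("disallow:"))
          {
            String prefix = line.substring(9).trim();
            if (prefix.length() &gt; 0) disallowed.add(prefix);  // an empty Disallow allows everything
          }
        }
      }
      in.close();
    }
    catch (IOException e) {
      return true;     // no readable robots.txt: assume crawling is permitted
    }
    for (int i = 0; i &lt; disallowed.size(); i++)
      if (path.startsWith((String)disallowed.get(i))) return false;
    return true;
  }
}</pre>
<p>For example, isAllowed("http://www.example.com","/private/page.html") would return false if that site's robots.txt contained "Disallow: /private".</p>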
<p>Before you set out to design an open-source or commercial crawler, please study what others have already researched in this area.</p>
<p><b>Some Useful Links on Crawlers</b></p>
<ul>
<li><a href="http://dollar.biz.uiowa.edu/%7Efil/IS/" class="namedurl"><span style="white-space: nowrap"><img src="../themes/MacOSX/images/http.png" alt="http" class="linkicon" border="0" />InfoSpiders</span></a></li>
<li><a href="http://www.searchtools.com/robots/robots-articles.html" class="namedurl"><span style="white-space: nowrap"><img src="../themes/MacOSX/images/http.png" alt="http" class="linkicon" border="0" />Collection</span> of Crawler Links</a></li>
</ul>
<p>--<a href="../index.php/SomikRaha" class="wiki">SomikRaha</a>, Sunday, February 16, 2003 2:13:46 pm.</p>
</div>


<!-- End browse -->
<!-- Begin bottom -->
<!-- $Id: bottom.tmpl,v 1.3 2002/09/15 20:21:16 rurban Exp $ -->
<!-- Add your Disclaimer here -->
<!-- End bottom -->
</body>
<!-- End body -->
</html>