<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="htmlparser-conf.xsl"?>

<!-- Do not modify this file directly.  Instead, copy entries that you -->
<!-- wish to modify from this file into htmlparser-site.xml and change them -->
<!-- there.  If htmlparser-site.xml does not already exist, create it.      -->

<htmlparser-conf>

<!-- HTTP properties -->

<property>
  <name>http.agent.name</name>
  <value>WinistaBot</value>
  <description>Our HTTP 'User-Agent' request header.</description>
</property>

<property>
  <name>http.robots.agents</name>
  <value>WinistaBot,Winista,*</value>
  <description>The agent strings we'll look for in robots.txt files,
  comma-separated, in decreasing order of precedence.</description>
</property>

<property>
  <name>http.robots.403.allow</name>
  <value>true</value>
  <description>Some servers return HTTP status 403 (Forbidden) if
  /robots.txt doesn't exist. This should probably mean that we are
  allowed to crawl the site nonetheless. If this is set to false,
  then such sites will be treated as forbidden.</description>
</property>

<property>
	<name>http.robots.honor</name>
	<value>true</value>
	<description>This configuration setting can be used to
	bypass checks for robots.txt file checks. By default this property
	is set to true meaning that we always honor settings in robots.txt
	file for that site.</description>
</property>

<property>
  <name>http.agent.description</name>
  <value>Winista - Crwaler for HTTP content indexing</value>
  <description>Further description of our bot- this text is used in
  the User-Agent header.  It appears in parenthesis after the agent name.
  </description>
</property>

<property>
  <name>http.agent.url</name>
  <value>http://www.netomatix.com/Winista/bot.html</value>
  <description>A URL to advertise in the User-Agent header.  This will 
   appear in parenthesis after the agent name.
  </description>
</property>

<property>
  <name>http.agent.email</name>
  <value>support@netomatix.com</value>
  <description>An email address to advertise in the HTTP 'From' request
   header and User-Agent header.</description>
</property>

<property>
  <name>http.agent.version</name>
  <value>1.6.7-dev</value>
  <description>A version string to advertise in the User-Agent 
   header.</description>
</property>

<property>
  <name>http.timeout</name>
  <value>10000</value>
  <description>The default network timeout, in milliseconds.</description>
</property>

<property>
  <name>http.max.delays</name>
  <value>3</value>
  <description>The number of times a thread will delay when trying to
  fetch a page.  Each time it finds that a host is busy, it will wait
  fetcher.server.delay.  After http.max.delays attepts, it will give
  up on the page for now.</description>
</property>

<property>
	<name>http.allow.https</name>
	<value>false</value>
	<decription>
		This value can be set to allow or disallow crawling of secure sites.
	</decription>
</property>

<property>
  <name>http.content.limit</name>
  <value>128536</value>
  <description>The length limit for downloaded content, in bytes.
  If this value is nonnegative (>=0), content longer than it will be truncated;
  otherwise, no truncation at all.
  </description>
</property>

<property>
  <name>http.proxy.host</name>
  <value></value>
  <description>The proxy hostname.  If empty, no proxy is used.</description>
</property>

<property>
  <name>http.proxy.port</name>
  <value></value>
  <description>The proxy port.</description>
</property>

<property>
  <name>http.verbose</name>
  <value>false</value>
  <description>If true, HTTP will log more verbosely.</description>
</property>

<property>
  <name>http.redirect.max</name>
  <value>3</value>
  <description>The maximum number of redirects the fetcher will follow when
    trying to fetch a page.</description>
</property>

<property>
  <name>http.contenttype.check.strict</name>
  <value>false</value>
  <description>
    
  </description>
</property>

<!-- Character Set Settings -->
<property>
	<name>charset.user.defined.mapping</name>
	<value>ISO-8859-1</value>
	<description>Code page to use when page has content-type tag set to x-user-defined</description>
</property>
<!-- web db properties -->

<property>
  <name>db.default.fetch.interval</name>
  <value>30</value>
  <description>The default number of days between re-fetches of a page.
  </description>
</property>

<property>
  <name>db.ignore.internal.links</name>
  <value>true</value>
  <description>If true, when adding new links to a page, links from
  the same host are ignored.  This is an effective way to limit the
  size of the link database, keeping the only the highest quality
  links.
  </description>
</property>

<property>
  <name>db.score.injected</name>
  <value>1.0</value>
  <description>The score of new pages added by the injector.
  </description>
</property>

<property>
  <name>db.score.link.external</name>
  <value>1.0</value>
  <description>The score factor for new pages added due to a link from
  another host relative to the referencing page's score.
  </description>
</property>

<property>
  <name>db.score.link.internal</name>
  <value>1.0</value>
  <description>The score factor for pages added due to a link from the
  same host, relative to the referencing page's score.
  </description>
</property>

<property>
  <name>db.max.outlinks.per.page</name>
  <value>100</value>
  <description>The maximum number of outlinks that we'll process for a page.
  </description>
</property>

<property>
  <name>db.max.anchor.length</name>
  <value>100</value>
  <description>The maximum number of characters permitted in an anchor.
  </description>
</property>

<property>
  <name>db.fetch.retry.max</name>
  <value>3</value>
  <description>The maximum number of times a url that has encountered
  recoverable errors is generated for fetch.</description>
</property>

<!-- fetchlist tool properties -->

<property>
  <name>fetchlist.score.by.link.count</name>
  <value>true</value>
  <description>If true, set page scores on fetchlist entries based on
  log(number of anchors), instead of using original page scores. This
  results in prioritization of pages with many incoming links.
  </description>
</property>

<!-- fetcher properties -->

<property>
  <name>fetcher.server.delay</name>
  <value>5.0</value>
  <description>The number of seconds the fetcher will delay between 
   successive requests to the same server.</description>
</property>

<property>
  <name>fetcher.threads.fetch</name>
  <value>10</value>
  <description>The number of FetcherThreads the fetcher should use.
    This is also determines the maximum number of requests that are 
    made at once (each FetcherThread handles one connection).</description>
</property>

<property>
  <name>fetcher.threads.per.host</name>
  <value>1</value>
  <description>This number is the maximum number of threads that
    should be allowed to access a host at one time.</description>
</property>

<property>
  <name>fetcher.verbose</name>
  <value>false</value>
  <description>If true, fetcher will log more verbosely.</description>
</property>

<!-- parser properties -->
<property>
  <name>parser.threads.parse</name>
  <value>10</value>
  <description>Number of ParserThreads ParseSegment should use.</description>
</property>

<!-- parser properties -->

<property>
  <name>parser.character.encoding.default</name>
  <value>windows-1252</value>
  <description>The character encoding to fall back to when no other information
  is available</description>
</property>

<property>
  <name>parser.html.impl</name>
  <value>Winista.HTMLParser</value>
  <description>HTML Parser implementation.
  </description>
</property>

<!-- query-basic plugin properties -->

<property>
  <name>query.url.boost</name>
  <value>4.0</value>
  <description> Used as a boost for url field in Lucene query.
  </description>
</property>

<property>
  <name>query.anchor.boost</name>
  <value>2.0</value>
  <description> Used as a boost for anchor field in Lucene query.
  </description>
</property>


<property>
  <name>query.title.boost</name>
  <value>1.5</value>
  <description> Used as a boost for title field in Lucene query.
  </description>
</property>

<property>
  <name>query.host.boost</name>
  <value>2.0</value>
  <description> Used as a boost for host field in Lucene query.
  </description>
</property>

<property>
  <name>query.phrase.boost</name>
  <value>1.0</value>
  <description> Used as a boost for phrase in Lucene query.
  Multiplied by boost for field phrase is matched in.
  </description>
</property>

</htmlparser-conf>

