Created
October 12, 2012 06:46
-
-
Save masayu-a/3877668 to your computer and use it in GitHub Desktop.
Heritrix 3 crawler setting in NINJAL, Japan
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<!-- | |
HERITRIX 3 CRAWL JOB CONFIGURATION FILE | |
This is a relatively minimal configuration suitable for many crawls. | |
Commented-out beans and properties are provided as an example; values | |
shown in comments reflect the actual defaults which are in effect | |
if not otherwise specified specification. (To change from the default | |
behavior, uncomment AND alter the shown values.) | |
--> | |
<beans xmlns="http://www.springframework.org/schema/beans" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:context="http://www.springframework.org/schema/context" xmlns:aop="http://www.springframework.org/schema/aop" xmlns:tx="http://www.springframework.org/schema/tx" xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-3.0.xsd http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.0.xsd http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-3.0.xsd"> | |
<context:annotation-config/> | |
<!-- | |
OVERRIDES | |
Values elsewhere in the configuration may be replaced ('overridden') | |
by a Properties map declared in a PropertiesOverrideConfigurer, | |
using a dotted-bean-path to address individual bean properties. | |
This allows us to collect a few of the most-often changed values | |
in an easy-to-edit format here at the beginning of the model | |
configuration. | |
--> | |
<!-- overrides from a text property list --> | |
<bean id="simpleOverrides" class="org.springframework.beans.factory.config.PropertyOverrideConfigurer"> | |
<property name="properties"> | |
<value> | |
# This Properties map is specified in the Java 'property list' text format | |
# http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29 | |
<!-- modified 2012/09/27 masayu-a クロール広報用 URL --> | |
metadata.operatorContactUrl=http://www.ninjal.ac.jp/corpus_center/ulc/crawl-en | |
metadata.jobName=basic | |
<!-- BEGIN modified 2012/09/27 masayu-a デスクリプション --> | |
metadata.description=NINJAL crawler | |
</value> | |
</property> | |
</bean> | |
<!-- overrides from declared <prop> elements, more easily allowing | |
multiline values or even declared beans --> | |
<!-- CRAWL METADATA: including identification of crawler/operator --> | |
<bean id="metadata" class="org.archive.modules.CrawlMetadata" autowire="byName"> | |
<property name="operatorContactUrl" value="[see override above]"/> | |
<property name="jobName" value="[see override above]"/> | |
<property name="description" value="[see override above]"/> | |
<!-- <property name="robotsPolicyName" value="obey"/> --> | |
<!-- <property name="operator" value=""/> --> | |
<!-- <property name="operatorFrom" value=""/> --> | |
<!-- <property name="organization" value=""/> --> | |
<!-- <property name="audience" value=""/> --> | |
<!-- <property name="userAgentTemplate" | |
value="Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)"/> --> | |
</bean> | |
<!-- SEEDS: crawl starting points | |
ConfigString allows simple, inline specification of a moderate | |
number of seeds; see below comment for example of using an | |
arbitrarily-large external file. --> | |
<!-- SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in | |
the job directory, similar to the H1 approach. | |
Use either the above, or this, but not both. --> | |
<!-- ファイルによるシードの指定 --> | |
<bean id="seeds" class="org.archive.modules.seeds.TextSeedModule"> | |
<property name="textSource"> | |
<bean class="org.archive.spring.ConfigFile"> | |
<property name="path" value="/home/heritrix/URLlist/urllist6.txt"/> | |
</bean> | |
</property> | |
<property name="sourceTagSeeds" value="false"/> | |
<property name="blockAwaitingSeedLines" value="-1"/> | |
</bean> | |
<bean id="acceptSurts" class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> | |
<!-- <property name="decision" value="ACCEPT"/> --> | |
<!-- <property name="seedsAsSurtPrefixes" value="true" /> --> | |
<!-- <property name="alsoCheckVia" value="false" /> --> | |
<!-- <property name="surtsSourceFile" value="" /> --> | |
<!-- <property name="surtsDumpFile" value="${launchId}/surts.dump" /> --> | |
<!-- <property name="surtsSource"> | |
<bean class="org.archive.spring.ConfigString"> | |
<property name="value"> | |
<value> | |
# example.com | |
# http://www.example.edu/path1/ | |
# +http://(org,example, | |
</value> | |
</property> | |
</bean> | |
</property> --> | |
</bean> | |
<!-- SCOPE: rules for which discovered URIs to crawl; order is very | |
important because last decision returned other than 'NONE' wins. --> | |
<!-- クロールするファイルの絞込み用 --> | |
<bean id="scope" class="org.archive.modules.deciderules.DecideRuleSequence"> | |
<!-- BEGIN modified 2012/09/27 masayu-a --> | |
<property name="logToFile" value="false"/> <!-- scope.log 対策 --> | |
<property name="rules"> | |
<list> | |
<!-- Begin by REJECTing all... --> | |
<bean class="org.archive.modules.deciderules.RejectDecideRule"/> | |
<!-- ...then ACCEPT those within configured/seed-implied SURT prefixes... --> | |
<ref bean="acceptSurts"/> | |
<!-- ...but REJECT those from a configurable (initially empty) set of REJECT SURTs... --> | |
<bean class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule"> | |
<property name="decision" value="REJECT"/> | |
<property name="seedsAsSurtPrefixes" value="false"/> | |
<property name="surtsDumpFile" value="${launchId}/negative-surts.dump"/> | |
<!-- クロールしてほしくないホストのリストはここに残す --> | |
<!-- <property name="surtsSource"> | |
<bean class="org.archive.spring.ConfigFile"> | |
<property name="path" value="negative-surts.txt" /> | |
</bean> | |
</property> --> | |
</bean> | |
<!-- ...but REJECT those more than a configured link-hop-count from start... --> | |
<bean class="org.archive.modules.deciderules.TooManyHopsDecideRule"> | |
<!-- BEGIN modified 2012/09/27 masayu-a --> | |
<property name="maxHops" value="1"/> | |
<!-- END modified 2012/09/27 masayu-a --> | |
</bean> | |
<!-- ...but ACCEPT those more than a configured link-hop-count from start... --> | |
<!-- <bean class="org.archive.modules.deciderules.TransclusionDecideRule"> | |
<property name="maxTransHops" value="1"/> | |
<property name="maxSpeculativeHops" value="1"/> | |
</bean> | |
--> | |
<!-- ...and REJECT those from a configurable (initially empty) set of URI regexes... --> | |
<!-- ...and REJECT those with suspicious repeating path-segments... --> | |
<bean class="org.archive.modules.deciderules.PathologicalPathDecideRule"> | |
<property name="maxRepetitions" value="2"/> | |
</bean> | |
<!-- ...and REJECT those with more than threshold number of path-segments... --> | |
<bean class="org.archive.modules.deciderules.TooManyPathSegmentsDecideRule"> | |
<property name="maxPathDepth" value="20"/> | |
</bean> | |
<!-- BEGIN modified 2012/09/27 masayu-a --> | |
<!-- | |
<bean class="org.archive.modules.deciderules.NotMatchesFilePatternDecideRule"> | |
<property name="decision" value="REJECT"/> | |
<property name="regex" value=".*(/|\.HTML|\.htm|\.HTM|\.html|\.txt|\.js|\.rdf|\.php)$" /> | |
</bean> | |
<bean class="org.archive.modules.deciderules.MatchesFilePatternDecideRule"> | |
<property name="decision" value="REJECT"/> | |
<property name="regex" value=".*(\.gif|\.jpg|\.jpeg|\.GIF|\.JPG|\.JPEG|\.swf|\.SWF|\.css|\.CSS|\.js|\.JS)$" /> | |
</bean> | |
--> | |
<bean class="org.archive.modules.deciderules.ContentTypeNotMatchesRegexDecideRule"> | |
<property name="decision" value="REJECT"/> | |
<property name="regex" value="^.*text/(plain|html|dns).*$"/> | |
</bean> | |
<!-- END modified 2012/09/27 masayu-a --> | |
<!-- ...but always ACCEPT those marked as prerequisitee for another URI... --> | |
<bean class="org.archive.modules.deciderules.PrerequisiteAcceptDecideRule"> | |
</bean> | |
<!-- ...but always REJECT those with unsupported URI schemes --> | |
<!-- Heritrix が通常処理できない URI の排除 --> | |
<bean class="org.archive.modules.deciderules.SchemeNotInSetDecideRule"> | |
</bean> | |
</list> | |
</property> | |
</bean> | |
<!-- | |
PROCESSING CHAINS | |
Much of the crawler's work is specified by the sequential | |
application of swappable Processor modules. These Processors | |
are collected into three 'chains'. The CandidateChain is applied | |
to URIs being considered for inclusion, before a URI is enqueued | |
for collection. The FetchChain is applied to URIs when their | |
turn for collection comes up. The DispositionChain is applied | |
after a URI is fetched and analyzed/link-extracted. | |
--> | |
<!-- CANDIDATE CHAIN --> | |
<!-- first, processors are declared as top-level named beans --> | |
<bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper"> | |
</bean> | |
<bean id="preparer" class="org.archive.crawler.prefetch.FrontierPreparer"> | |
<!-- <property name="preferenceDepthHops" value="-1" /> --> | |
<!-- <property name="preferenceEmbedHops" value="1" /> --> | |
<!-- <property name="canonicalizationPolicy"> | |
<ref bean="canonicalizationPolicy" /> | |
</property> --> | |
<!-- <property name="queueAssignmentPolicy"> | |
<ref bean="queueAssignmentPolicy" /> | |
</property> --> | |
<!-- <property name="uriPrecedencePolicy"> | |
<ref bean="uriPrecedencePolicy" /> | |
</property> --> | |
<!-- <property name="costAssignmentPolicy"> | |
<ref bean="costAssignmentPolicy" /> | |
</property> --> | |
</bean> | |
<!-- now, processors are assembled into ordered CandidateChain bean --> | |
<bean id="candidateProcessors" class="org.archive.modules.CandidateChain"> | |
<property name="processors"> | |
<list> | |
<!-- apply scoping rules to each individual candidate URI... --> | |
<ref bean="candidateScoper"/> | |
<!-- ...then prepare those ACCEPTed to be enqueued to frontier. --> | |
<ref bean="preparer"/> | |
</list> | |
</property> | |
</bean> | |
<!-- FETCH CHAIN --> | |
<!-- first, processors are declared as top-level named beans --> | |
<bean id="preselector" class="org.archive.crawler.prefetch.Preselector"> | |
<!-- <property name="recheckScope" value="false" /> --> | |
<!-- <property name="blockAll" value="false" /> --> | |
<!-- <property name="blockByRegex" value="" /> --> | |
<!-- <property name="allowByRegex" value="" /> --> | |
</bean> | |
<bean id="preconditions" class="org.archive.crawler.prefetch.PreconditionEnforcer"> | |
<!-- <property name="ipValidityDurationSeconds" value="21600" /> --> | |
<!-- <property name="robotsValidityDurationSeconds" value="86400" /> --> | |
<!-- <property name="calculateRobotsOnly" value="false" /> --> | |
</bean> | |
<bean id="fetchDns" class="org.archive.modules.fetcher.FetchDNS"> | |
<!-- <property name="acceptNonDnsResolves" value="false" /> --> | |
<!-- <property name="digestContent" value="true" /> --> | |
<!-- <property name="digestAlgorithm" value="sha1" /> --> | |
</bean> | |
<!-- <bean id="fetchWhois" class="org.archive.modules.fetcher.FetchWhois"> | |
<property name="specialQueryTemplates"> | |
<map> | |
<entry key="whois.verisign-grs.com" value="domain %s" /> | |
<entry key="whois.arin.net" value="z + %s" /> | |
<entry key="whois.denic.de" value="-T dn %s" /> | |
</map> | |
</property> | |
</bean> --> | |
<bean id="fetchHttp" class="org.archive.modules.fetcher.FetchHTTP"> | |
<!-- <property name="useHTTP11" value="false" /> --> | |
<!-- <property name="maxLengthBytes" value="0" /> --> | |
<!-- <property name="timeoutSeconds" value="1200" /> --> | |
<!-- <property name="maxFetchKBSec" value="0" /> --> | |
<!-- <property name="defaultEncoding" value="ISO-8859-1" /> --> | |
<!-- <property name="shouldFetchBodyRule"> | |
<bean class="org.archive.modules.deciderules.AcceptDecideRule"/> | |
</property> --> | |
<!-- <property name="soTimeoutMs" value="20000" /> --> | |
<!-- <property name="sendIfModifiedSince" value="true" /> --> | |
<!-- <property name="sendIfNoneMatch" value="true" /> --> | |
<!-- <property name="sendConnectionClose" value="true" /> --> | |
<!-- <property name="sendReferer" value="true" /> --> | |
<!-- <property name="sendRange" value="false" /> --> | |
<!-- <property name="ignoreCookies" value="false" /> --> | |
<!-- <property name="sslTrustLevel" value="OPEN" /> --> | |
<!-- <property name="acceptHeaders"> | |
<list> | |
<value>Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value> | |
</list> | |
</property> | |
--> | |
<!-- <property name="httpBindAddress" value="" /> --> | |
<!-- <property name="httpProxyHost" value="" /> --> | |
<!-- <property name="httpProxyPort" value="0" /> --> | |
<!-- <property name="httpProxyUser" value="" /> --> | |
<!-- <property name="httpProxyPassword" value="" /> --> | |
<!-- <property name="digestContent" value="true" /> --> | |
<!-- <property name="digestAlgorithm" value="sha1" /> --> | |
</bean> | |
<bean id="extractorHttp" class="org.archive.modules.extractor.ExtractorHTTP"> | |
</bean> | |
<bean id="extractorHtml" class="org.archive.modules.extractor.ExtractorHTML"> | |
<property name="extractJavascript" value="false"/> | |
<!-- <property name="extractValueAttributes" value="true" /> --> | |
<!-- <property name="ignoreFormActionUrls" value="false" /> --> | |
<!-- <property name="extractOnlyFormGets" value="true" /> --> | |
<!-- <property name="treatFramesAsEmbedLinks" value="true" /> --> | |
<!-- <property name="ignoreUnexpectedHtml" value="true" /> --> | |
<!-- <property name="maxElementLength" value="1024" /> --> | |
<!-- <property name="maxAttributeNameLength" value="1024" /> --> | |
<!-- <property name="maxAttributeValueLength" value="16384" /> --> | |
</bean> | |
<bean id="extractorCss" class="org.archive.modules.extractor.ExtractorCSS"> | |
</bean> | |
<bean id="extractorJs" class="org.archive.modules.extractor.ExtractorJS"> | |
</bean> | |
<bean id="extractorSwf" class="org.archive.modules.extractor.ExtractorSWF"> | |
</bean> | |
<!-- now, processors are assembled into ordered FetchChain bean --> | |
<bean id="fetchProcessors" class="org.archive.modules.FetchChain"> | |
<property name="processors"> | |
<list> | |
<!-- re-check scope, if so enabled... --> | |
<ref bean="preselector"/> | |
<!-- ...then verify or trigger prerequisite URIs fetched, allow crawling... --> | |
<ref bean="preconditions"/> | |
<!-- ...fetch if DNS URI... --> | |
<ref bean="fetchDns"/> | |
<!-- <ref bean="fetchWhois"/> --> | |
<!-- ...fetch if HTTP URI... --> | |
<ref bean="fetchHttp"/> | |
<!-- ...extract outlinks from HTTP headers... --> | |
<ref bean="extractorHttp"/> | |
<!-- ...extract outlinks from HTML content... --> | |
<ref bean="extractorHtml"/> | |
<!-- ...extract outlinks from CSS content... --> | |
<!-- CSS は取らない <ref bean="extractorCss"/> --> | |
<!-- ...extract outlinks from Javascript content... --> | |
<!-- Javascript は取らない <ref bean="extractorJs"/> --> | |
<!-- ...extract outlinks from Flash content... --> | |
<!-- <ref bean="extractorSwf"/> --> | |
</list> | |
</property> | |
</bean> | |
<!-- DISPOSITION CHAIN --> | |
<!-- first, processors are declared as top-level named beans --> | |
<bean id="warcWriter" class="org.archive.modules.writer.WARCWriterProcessor"> | |
<!-- <property name="compress" value="true" /> --> | |
<!-- BEGIN modified 2012/09/27 masayu-a --> | |
<property name="prefix" value="NINJAL-6"/> | |
<!-- <property name="suffix" value="${HOSTNAME}" /> --> | |
<!-- <property name="maxFileSizeBytes" value="1000000000" /> --> | |
<!-- <property name="poolMaxActive" value="1" /> --> | |
<!-- <property name="MaxWaitForIdleMs" value="500" /> --> | |
<!-- <property name="skipIdenticalDigests" value="false" /> --> | |
<!-- <property name="maxTotalBytesToWrite" value="0" /> --> | |
<!-- <property name="directory" value="${launchId}" /> --> | |
<!-- BEGIN modified 2012/09/27 masayu-a ファイルを格納するパス --> | |
<property name="storePaths"> | |
<list> | |
<value>/work/warc</value> | |
</list> | |
</property> | |
<!-- <property name="writeRequests" value="true" /> --> | |
<!-- <property name="writeMetadata" value="true" /> --> | |
<!-- <property name="writeRevisitForIdenticalDigests" value="true" /> --> | |
<!-- <property name="writeRevisitForNotModified" value="true" /> --> | |
</bean> | |
<bean id="candidates" class="org.archive.crawler.postprocessor.CandidatesProcessor"> | |
<!-- <property name="seedsRedirectNewSeeds" value="true" /> --> | |
<!-- <property name="processErrorOutlinks" value="false" /> --> | |
</bean> | |
<bean id="disposition" class="org.archive.crawler.postprocessor.DispositionProcessor"> | |
<!-- <property name="delayFactor" value="5.0" /> --> | |
<!-- <property name="minDelayMs" value="3000" /> --> | |
<!-- <property name="respectCrawlDelayUpToSeconds" value="300" /> --> | |
<!-- <property name="maxDelayMs" value="30000" /> --> | |
<!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> --> | |
</bean> | |
<!-- <bean id="rescheduler" class="org.archive.crawler.postprocessor.ReschedulingProcessor"> | |
<property name="rescheduleDelaySeconds" value="-1" /> | |
</bean> --> | |
<!-- now, processors are assembled into ordered DispositionChain bean --> | |
<bean id="dispositionProcessors" class="org.archive.modules.DispositionChain"> | |
<property name="processors"> | |
<list> | |
<!-- write to aggregate archival files... --> | |
<ref bean="warcWriter"/> | |
<!-- ...send each outlink candidate URI to CandidateChain, | |
and enqueue those ACCEPTed to the frontier... --> | |
<ref bean="candidates"/> | |
<!-- ...then update stats, shared-structures, frontier decisions --> | |
<ref bean="disposition"/> | |
<!-- <ref bean="rescheduler" /> --> | |
</list> | |
</property> | |
</bean> | |
<!-- CRAWLCONTROLLER: Control interface, unifying context --> | |
<bean id="crawlController" class="org.archive.crawler.framework.CrawlController"> | |
<!-- <property name="maxToeThreads" value="25" /> --> | |
<!-- <property name="pauseAtStart" value="true" /> --> | |
<!-- <property name="runWhileEmpty" value="false" /> --> | |
<!-- <property name="recorderInBufferBytes" value="524288" /> --> | |
<!-- <property name="recorderOutBufferBytes" value="16384" /> --> | |
<!-- <property name="scratchDir" value="scratch" /> --> | |
</bean> | |
<!-- FRONTIER: Record of all URIs discovered and queued-for-collection --> | |
<bean id="frontier" class="org.archive.crawler.frontier.BdbFrontier"> | |
<!-- <property name="queueTotalBudget" value="-1" /> --> | |
<!-- <property name="balanceReplenishAmount" value="3000" /> --> | |
<!-- <property name="errorPenaltyAmount" value="100" /> --> | |
<!-- <property name="precedenceFloor" value="255" /> --> | |
<!-- <property name="queuePrecedencePolicy"> | |
<bean class="org.archive.crawler.frontier.precedence.BaseQueuePrecedencePolicy" /> | |
</property> --> | |
<!-- <property name="snoozeLongMs" value="300000" /> --> | |
<!-- <property name="retryDelaySeconds" value="900" /> --> | |
<!-- <property name="maxRetries" value="30" /> --> | |
<!-- <property name="recoveryLogEnabled" value="true" /> --> | |
<!-- <property name="maxOutlinks" value="6000" /> --> | |
<!-- <property name="extractIndependently" value="false" /> --> | |
<!-- <property name="outbound"> | |
<bean class="java.util.concurrent.ArrayBlockingQueue"> | |
<constructor-arg value="200"/> | |
<constructor-arg value="true"/> | |
</bean> | |
</property> --> | |
<!-- <property name="inbound"> | |
<bean class="java.util.concurrent.ArrayBlockingQueue"> | |
<constructor-arg value="40000"/> | |
<constructor-arg value="true"/> | |
</bean> | |
</property> --> | |
<!-- <property name="dumpPendingAtClose" value="false" /> --> | |
</bean> | |
<!-- URI UNIQ FILTER: Used by frontier to remember already-included URIs --> | |
<bean id="uriUniqFilter" class="org.archive.crawler.util.BdbUriUniqFilter"> | |
</bean> | |
<!-- | |
EXAMPLE SETTINGS OVERLAY SHEETS | |
Sheets allow some settings to vary by context - usually by URI context, | |
so that different sites or sections of sites can be treated differently. | |
Here are some example Sheets for common purposes. The SheetOverlaysManager | |
(below) automatically collects all Sheet instances declared among the | |
original beans, but others can be added during the crawl via the scripting | |
interface. | |
--> | |
<!-- forceRetire: any URI to which this sheet's settings are applied | |
will force its containing queue to 'retired' status. --> | |
<bean id="forceRetire" class="org.archive.spring.Sheet"> | |
<property name="map"> | |
<map> | |
<entry key="disposition.forceRetire" value="true"/> | |
</map> | |
</property> | |
</bean> | |
<!-- smallBudget: any URI to which this sheet's settings are applied | |
will give its containing queue small values for balanceReplenishAmount | |
(causing it to have shorter 'active' periods while other queues are | |
waiting) and queueTotalBudget (causing the queue to enter 'retired' | |
status once that expenditure is reached by URI attempts and errors) --> | |
<bean id="smallBudget" class="org.archive.spring.Sheet"> | |
<property name="map"> | |
<map> | |
<entry key="frontier.balanceReplenishAmount" value="20"/> | |
<entry key="frontier.queueTotalBudget" value="100"/> | |
</map> | |
</property> | |
</bean> | |
<!-- veryPolite: any URI to which this sheet's settings are applied | |
will cause its queue to take extra-long politeness snoozes --> | |
<bean id="veryPolite" class="org.archive.spring.Sheet"> | |
<property name="map"> | |
<map> | |
<entry key="disposition.delayFactor" value="10"/> | |
<entry key="disposition.minDelayMs" value="10000"/> | |
<entry key="disposition.maxDelayMs" value="1000000"/> | |
<entry key="disposition.respectCrawlDelayUpToSeconds" value="3600"/> | |
</map> | |
</property> | |
</bean> | |
<!-- highPrecedence: any URI to which this sheet's settings are applied | |
will give its containing queue a slightly-higher than default | |
queue precedence value. That queue will then be preferred over | |
other queues for active crawling, never waiting behind lower- | |
precedence queues. --> | |
<bean id="highPrecedence" class="org.archive.spring.Sheet"> | |
<property name="map"> | |
<map> | |
<entry key="frontier.balanceReplenishAmount" value="20"/> | |
<entry key="frontier.queueTotalBudget" value="100"/> | |
</map> | |
</property> | |
</bean> | |
<!-- | |
EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION | |
A SheetAssociation says certain URIs should have certain overlay Sheets | |
applied. This example applies two sheets to URIs matching two SURT-prefixes. | |
New associations may also be added mid-crawl using the scripting facility. | |
--> | |
<!-- | |
<bean class='org.archive.crawler.spring.SurtPrefixesSheetAssociation'> | |
<property name='surtPrefixes'> | |
<list> | |
<value>http://(org,example,</value> | |
<value>http://(com,example,www,)/</value> | |
</list> | |
</property> | |
<property name='targetSheetNames'> | |
<list> | |
<value>veryPolite</value> | |
<value>smallBudget</value> | |
</list> | |
</property> | |
</bean> | |
--> | |
<!-- | |
OPTIONAL BUT RECOMMENDED BEANS | |
--> | |
<!-- ACTIONDIRECTORY: disk directory for mid-crawl operations | |
Running job will watch directory for new files with URIs, | |
scripts, and other data to be processed during a crawl. --> | |
<bean id="actionDirectory" class="org.archive.crawler.framework.ActionDirectory"> | |
<!-- <property name="actionDir" value="action" /> --> | |
<!-- <property name="doneDir" value="${launchId}/actions-done" /> --> | |
<!-- <property name="initialDelaySeconds" value="10" /> --> | |
<!-- <property name="delaySeconds" value="30" /> --> | |
</bean> | |
<!-- CRAWLLIMITENFORCER: stops crawl when it reaches configured limits --> | |
<bean id="crawlLimiter" class="org.archive.crawler.framework.CrawlLimitEnforcer"> | |
<!-- <property name="maxBytesDownload" value="0" /> --> | |
<!-- <property name="maxDocumentsDownload" value="0" /> --> | |
<!-- <property name="maxTimeSeconds" value="0" /> --> | |
</bean> | |
<!-- CHECKPOINTSERVICE: checkpointing assistance --> | |
<bean id="checkpointService" class="org.archive.crawler.framework.CheckpointService"> | |
<!-- <property name="checkpointIntervalMinutes" value="-1"/> --> | |
<!-- <property name="checkpointsDir" value="checkpoints"/> --> | |
</bean> | |
<!-- | |
OPTIONAL BEANS | |
Uncomment and expand as needed, or if non-default alternate | |
implementations are preferred. | |
--> | |
<!-- CANONICALIZATION POLICY --> | |
<!-- | |
<bean id="canonicalizationPolicy" | |
class="org.archive.modules.canonicalize.RulesCanonicalizationPolicy"> | |
<property name="rules"> | |
<list> | |
<bean class="org.archive.modules.canonicalize.LowercaseRule" /> | |
<bean class="org.archive.modules.canonicalize.StripUserinfoRule" /> | |
<bean class="org.archive.modules.canonicalize.StripWWWNRule" /> | |
<bean class="org.archive.modules.canonicalize.StripSessionIDs" /> | |
<bean class="org.archive.modules.canonicalize.StripSessionCFIDs" /> | |
<bean class="org.archive.modules.canonicalize.FixupQueryString" /> | |
</list> | |
</property> | |
</bean> | |
--> | |
<!-- QUEUE ASSIGNMENT POLICY --> | |
<!-- | |
<bean id="queueAssignmentPolicy" | |
class="org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy"> | |
<property name="forceQueueAssignment" value="" /> | |
<property name="deferToPrevious" value="true" /> | |
<property name="parallelQueues" value="1" /> | |
</bean> | |
--> | |
<!-- URI PRECEDENCE POLICY --> | |
<!-- | |
<bean id="uriPrecedencePolicy" | |
class="org.archive.crawler.frontier.precedence.CostUriPrecedencePolicy"> | |
</bean> | |
--> | |
<!-- COST ASSIGNMENT POLICY --> | |
<!-- | |
<bean id="costAssignmentPolicy" | |
class="org.archive.crawler.frontier.UnitCostAssignmentPolicy"> | |
</bean> | |
--> | |
<!-- CREDENTIAL STORE: HTTP authentication or FORM POST credentials --> | |
<!-- | |
<bean id="credentialStore" | |
class="org.archive.modules.credential.CredentialStore"> | |
</bean> | |
--> | |
<!-- DISK SPACE MONITOR: | |
Pauses the crawl if disk space at monitored paths falls below minimum threshold --> | |
<!-- | |
<bean id="diskSpaceMonitor" class="org.archive.crawler.monitor.DiskSpaceMonitor"> | |
<property name="pauseThresholdMiB" value="500" /> | |
<property name="monitorConfigPaths" value="true" /> | |
<property name="monitorPaths"> | |
<list> | |
<value>PATH</value> | |
</list> | |
</property> | |
</bean> | |
--> | |
<!-- | |
REQUIRED STANDARD BEANS | |
It will be very rare to replace or reconfigure the following beans. | |
--> | |
<!-- STATISTICSTRACKER: standard stats/reporting collector --> | |
<bean id="statisticsTracker" class="org.archive.crawler.reporting.StatisticsTracker" autowire="byName"> | |
<!-- <property name="reports"> | |
<list> | |
<bean id="crawlSummaryReport" class="org.archive.crawler.reporting.CrawlSummaryReport" /> | |
<bean id="seedsReport" class="org.archive.crawler.reporting.SeedsReport" /> | |
<bean id="hostsReport" class="org.archive.crawler.reporting.HostsReport" /> | |
<bean id="sourceTagsReport" class="org.archive.crawler.reporting.SourceTagsReport" /> | |
<bean id="mimetypesReport" class="org.archive.crawler.reporting.MimetypesReport" /> | |
<bean id="responseCodeReport" class="org.archive.crawler.reporting.ResponseCodeReport" /> | |
<bean id="processorsReport" class="org.archive.crawler.reporting.ProcessorsReport" /> | |
<bean id="frontierSummaryReport" class="org.archive.crawler.reporting.FrontierSummaryReport" /> | |
<bean id="frontierNonemptyReport" class="org.archive.crawler.reporting.FrontierNonemptyReport" /> | |
<bean id="toeThreadsReport" class="org.archive.crawler.reporting.ToeThreadsReport" /> | |
</list> | |
</property> --> | |
<!-- <property name="reportsDir" value="${launchId}/reports" /> --> | |
<!-- <property name="liveHostReportSize" value="20" /> --> | |
<!-- <property name="intervalSeconds" value="20" /> --> | |
<!-- <property name="keepSnapshotsCount" value="5" /> --> | |
<!-- <property name="liveHostReportSize" value="20" /> --> | |
</bean> | |
<!-- CRAWLERLOGGERMODULE: shared logging facility --> | |
<bean id="loggerModule" class="org.archive.crawler.reporting.CrawlerLoggerModule"> | |
<!-- <property name="path" value="${launchId}/logs" /> --> | |
<!-- <property name="crawlLogPath" value="crawl.log" /> --> | |
<!-- <property name="alertsLogPath" value="alerts.log" /> --> | |
<!-- <property name="progressLogPath" value="progress-statistics.log" /> --> | |
<!-- <property name="uriErrorsLogPath" value="uri-errors.log" /> --> | |
<!-- <property name="runtimeErrorsLogPath" value="runtime-errors.log" /> --> | |
<!-- <property name="nonfatalErrorsLogPath" value="nonfatal-errors.log" /> --> | |
<!-- <property name="logExtraInfo" value="false" /> --> | |
</bean> | |
<!-- SHEETOVERLAYMANAGER: manager of sheets of contextual overlays | |
Autowired to include any SheetForSurtPrefix or | |
SheetForDecideRuled beans --> | |
<bean id="sheetOverlaysManager" autowire="byType" class="org.archive.crawler.spring.SheetOverlaysManager"> | |
</bean> | |
<!-- BDBMODULE: shared BDB-JE disk persistence manager --> | |
<bean id="bdb" class="org.archive.bdb.BdbModule"> | |
<!-- <property name="dir" value="state" /> --> | |
<!-- if neither cachePercent or cacheSize are specified (the default), bdb | |
uses its own default of 60% --> | |
<!-- <property name="cachePercent" value="0" /> --> | |
<!-- <property name="cacheSize" value="0" /> --> | |
<!-- <property name="useSharedCache" value="true" /> --> | |
<!-- <property name="expectedConcurrency" value="25" /> --> | |
</bean> | |
<!-- BDBCOOKIESTORAGE: disk-based cookie storage for FetchHTTP --> | |
<bean id="cookieStorage" class="org.archive.modules.fetcher.BdbCookieStorage"> | |
<!-- <property name="cookiesLoadFile"><null/></property> --> | |
<!-- <property name="cookiesSaveFile"><null/></property> --> | |
<!-- <property name="bdb"> | |
<ref bean="bdb"/> | |
</property> --> | |
</bean> | |
<!-- SERVERCACHE: shared cache of server/host info --> | |
<bean id="serverCache" class="org.archive.modules.net.BdbServerCache"> | |
<!-- <property name="bdb"> | |
<ref bean="bdb"/> | |
</property> --> | |
</bean> | |
<!-- CONFIG PATH CONFIGURER: required helper making crawl paths relative | |
to crawler-beans.cxml file, and tracking crawl files for web UI --> | |
<bean id="configPathConfigurer" class="org.archive.spring.ConfigPathConfigurer"> | |
</bean> | |
</beans> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment