Skip to content

Instantly share code, notes, and snippets.

@ato
Created November 30, 2024 04:14
Show Gist options
  • Save ato/4c919a7e01cfa8ac3666c264538914d4 to your computer and use it in GitHub Desktop.
Save ato/4c919a7e01cfa8ac3666c264538914d4 to your computer and use it in GitHub Desktop.
Heritrix default profile as Groovy Bean Definition DSL
/**
* HERITRIX 3 CRAWL JOB CONFIGURATION FILE
*
* This is a relatively minimal configuration suitable for many crawls.
*
* Commented-out beans and properties are provided as an example; values
* shown in comments reflect the actual defaults which are in effect
* if not otherwise specified specification. (To change from the default
* behavior, uncomment AND alter the shown values.)
*/
import org.archive.bdb.BdbModule
import org.archive.crawler.framework.*
import org.archive.crawler.frontier.*
import org.archive.crawler.monitor.DiskSpaceMonitor
import org.archive.crawler.postprocessor.*
import org.archive.crawler.prefetch.*
import org.archive.crawler.reporting.*
import org.archive.crawler.spring.*
import org.archive.crawler.util.BdbUriUniqFilter
import org.archive.modules.*
import org.archive.modules.canonicalize.*
import org.archive.modules.deciderules.surt.SurtPrefixedDecideRule
import org.archive.modules.extractor.*
import org.archive.modules.fetcher.*
import org.archive.modules.net.BdbServerCache
import org.archive.modules.seeds.TextSeedModule
import org.archive.modules.writer.WARCWriterChainProcessor
import org.springframework.beans.factory.config.PropertyOverrideConfigurer
import org.archive.modules.deciderules.*
import org.archive.spring.*
beans {
/*
* OVERRIDES
* Values elsewhere in the configuration may be replaced ('overridden')
* by a Properties map declared in a PropertiesOverrideConfigurer,
* using a dotted-bean-path to address individual bean properties.
* This allows us to collect a few of the most-often changed values
* in an easy-to-edit format here at the beginning of the model
* configuration.
*/
/**
* overrides from a text property list
*/
simpleOverrides(PropertyOverrideConfigurer) {
properties = '''
# This Properties map is specified in the Java 'property list' text format
# http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load%28java.io.Reader%29
metadata.operatorContactUrl=ENTER_AN_URL_WITH_YOUR_CONTACT_INFO_HERE_FOR_WEBMASTERS_AFFECTED_BY_YOUR_CRAWL
metadata.jobName=basic
metadata.description=Basic crawl starting with useful defaults
##..more?..##
'''
}
/**
* overrides from declared <prop> elements, more easily allowing
* multiline values or even declared beans
*/
longerOverrides(PropertyOverrideConfigurer) {
properties = ['seeds.textSource.value': ''''
# URLS HERE
http://example.example/example
''']
}
/**
* CRAWL METADATA: including identification of crawler/operator
*/
metadata(CrawlMetadata) { bean ->
bean.autowire = 'byName'
operatorContactUrl = '[see override above]'
jobName = '[see override above]'
description = '[see override above]'
// robotsPolicyName = 'obey'
// operator = ''
// operatorFrom = ''
// organization = ''
// audience = ''
// userAgentTemplate = 'Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)'
}
/**
* SEEDS: crawl starting points
*
* ConfigString allows simple, inline specification of a moderate
* number of seeds; see below comment for example of using an
* arbitrarily-large external file.
*/
seeds(TextSeedModule) {
textSource = new ConfigString('''
# [see override above]
''')
// sourceTagSeeds = false
// blockAwaitingSeedLines = -1
}
/**
* SEEDS ALTERNATE APPROACH: specifying external seeds.txt file in
* the job directory, similar to the H1 approach.
* Use either the above, or this, but not both.
*/
/*
seeds(TextSeedModule) {
textSource = new ConfigFile(path: 'seeds.txt')
// sourceTagSeeds = false
// blockAwaitingSeedLines = -1
}
*/
acceptSurts(SurtPrefixedDecideRule) {
// decision = 'ACCEPT'
// seedsAsSurtPrefixes = true
// alsoCheckVia = false
// surtsSourceFile = ''
// surtsDumpFile = '${launchId}/surts.dump'
/*
surtsSource = new ConfigString('''
# example.com
# http://www.example.edu/path1/
# +http://(org,example,
''')
}
*/
}
/**
* SCOPE: rules for which discovered URIs to crawl; order is very
* important because last decision returned other than 'NONE' wins.
*/
scope(DecideRuleSequence) {
logToFile = false
rules = [
// Begin by REJECTing all...
new RejectDecideRule(),
// ...then ACCEPT those within configured/seed-implied SURT prefixes...
new TooManyHopsDecideRule(
// maxHops: 20,
),
// ...but ACCEPT those more than a configured link-hop-count from start...
new TransclusionDecideRule(
// maxTransHops: 2,
// maxSpeculativeHops: 1,
),
// ...but REJECT those from a configurable (initially empty) set of REJECT SURTs...
new SurtPrefixedDecideRule(
decision: 'REJECT',
seedsAsSurtPrefixes: false,
surtsDumpFile: new ConfigFile(path: '${launchId}/negative-surts.dump'),
// surtsSource: new ConfigFile(path: 'negative-surts.txt'),
),
// ...and REJECT those from a configurable (initially empty) set of URI regexes...
new MatchesListRegexDecideRule(
decision: 'REJECT',
// listLogicalOr: false,
// regexList: [],
),
// ...and REJECT those with suspicious repeating path-segments...
new PathologicalPathDecideRule(
// maxRepetitions: 2,
),
// ...and REJECT those with more than threshold number of path-segments...
new TooManyPathSegmentsDecideRule(
// maxPathDepth: 20,
),
// ...but always ACCEPT those marked as prerequisitee for another URI...
new PrerequisiteAcceptDecideRule(),
// ...but always REJECT those with unsupported URI schemes
new SchemeNotInSetDecideRule(),
]
}
/*
* PROCESSING CHAINS
* Much of the crawler's work is specified by the sequential
* application of swappable Processor modules. These Processors
* are collected into three 'chains'. The CandidateChain is applied
* to URIs being considered for inclusion, before a URI is enqueued
* for collection. The FetchChain is applied to URIs when their
* turn for collection comes up. The DispositionChain is applied
* after a URI is fetched and analyzed/link-extracted.
*/
/*
* CANDIDATE CHAIN
*/
// first, processors are declared as top-level named beans
candidateScoper(CandidateScoper)
preparer(FrontierPreparer) {
// preferenceDepthHops = -1
// preferenceEmbedHops = 1
// canonicalizationPolicy = ref('canonicalizationPolicy')
// queueAssignmentPolicy = ref('queueAssignmentPolicy')
// uriPrecedencePolicy = ref('uriPrecedencePolicy')
// costAssignmentPolicy = ref('costAssignmentPolicy')
}
// now, processors are assembled into ordered CandidateChain bean
candidateProcessors(CandidateChain) {
processors = [
// apply scoping rules to each individual candidate URI...
ref('candidateScoper'),
// ...then prepare those ACCEPTed to be enqueued to frontier.
ref('preparer'),
]
}
/*
* FETCH CHAIN
*/
// first, processors are declared as top-level named beans
preselector(Preselector) {
// recheckScope = false
// blockAll = false
// blockByRegex = ''
// allowByRegex = ''
}
preconditions(PreconditionEnforcer) {
// ipValidityDurationSeconds = 21600
// robotsValidityDurationSeconds = 86400
// calculateRobotsOnly = false
}
fetchDns(FetchDNS) {
// acceptNonDnsResolves = false
// digestContent = true
// digestAlgorithm = 'sha1'
// dnsOverHttpServer = 'https://dns.google/dns-query'
}
/*
fetchWhois(FetchWhois) {
specialQueryTemplates = [
'whois.verisign-grs.com': 'domain %s',
'whois.arin.net': 'z + %s',
'whois.denic.de': '-T dn %s'
]
}
*/
fetchHttp(FetchHTTP) {
// useHTTP11 = false
// maxLengthBytes = 0
// timeoutSeconds = 1200
// maxFetchKBSec = 0
// defaultEncoding = 'ISO-8859-1'
// shouldFetchBodyRule = new AcceptDecideRule()
// soTimeoutMs = 20000
// sendIfModifiedSince = true
// sendIfNoneMatch = true
// sendConnectionClose = true
// sendReferer = true
// sendRange = false
// ignoreCookies = false
// sslTrustLevel = 'OPEN'
// acceptHeaders = [
// 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
// ]
// httpBindAddress = ''
// httpProxyHost = ''
// httpProxyPort = 0
// httpProxyUser = ''
// httpProxyPassword = ''
// socksProxyHost = ''
// socksProxyPort = ''
// digestContent = true
// digestAlgorithm = 'sha1'
}
extractorHttp(ExtractorHTTP)
extractorRobotsTxt(ExtractorRobotsTxt)
extractorSitemap(ExtractorSitemap)
extractorHtml(ExtractorHTML) {
// extractJavascript = true
// extractValueAttributes = true
// ignoreFormActionUrls = false
// extractOnlyFormGets = true
// treatFramesAsEmbedLinks = true
// ignoreUnexpectedHtml = true
// maxElementLength = 1024
// maxAttributeNameLength = 1024
// maxAttributeValueLength = 16384
}
extractorCss(ExtractorCSS)
extractorJs(ExtractorJS)
extractorSwf(ExtractorSWF)
// now, processors are assembled into ordered FetchChain bean
fetchProcessors(FetchChain) {
processors = [
// re-check scope, if so enabled...
ref('preselector'),
// ...then verify or trigger prerequisite URIs fetched, allow crawling...
ref('preconditions'),
// ...fetch if DNS URI...
ref('fetchDns'),
// ref('fetchWhois'),
// ...fetch if HTTP URI...
ref('fetchHttp'),
// ...extract outlinks from HTTP headers...
ref('extractorHttp'),
// ...extract sitemap urls from robots.txt...
ref('extractorRobotsTxt'),
// ...extract links from sitemaps...
ref('extractorSitemap'),
// ...extract outlinks from HTML content...
ref('extractorHtml'),
// ...extract outlinks from CSS content...
ref('extractorCss'),
// ...extract outlinks from Javascript content...
ref('extractorJs'),
// ...extract outlinks from Flash content...
ref('extractorSwf')
]
}
/*
* DISPOSITION CHAIN
*/
// first, processors are declared as top-level named beans
warcWriter(WARCWriterChainProcessor) {
// compress = true
// prefix = 'IAH'
// maxFileSizeBytes = 1000000000
// poolMaxActive = 1
// MaxWaitForIdleMs = 500
// skipIdenticalDigests = false
// maxTotalBytesToWrite = 0
// directory = '${launchId}'
// storePaths = ['warcs']
// template = '${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}'
// startNewFilesOnCheckpoint = true
/*
chain = [
new org.archive.modules.warc.DnsResponseRecordBuilder(),
new org.archive.modules.warc.HttpResponseRecordBuilder(),
new org.archive.modules.warc.WhoisResponseRecordBuilder(),
new org.archive.modules.warc.FtpControlConversationRecordBuilder(),
new org.archive.modules.warc.FtpResponseRecordBuilder(),
new org.archive.modules.warc.RevisitRecordBuilder(),
new org.archive.modules.warc.HttpRequestRecordBuilder(),
new org.archive.modules.warc.MetadataRecordBuilder()
]
*/
}
candidates(CandidatesProcessor) {
// seedsRedirectNewSeeds = true
// processErrorOutlinks = false
}
disposition(DispositionProcessor) {
// delayFactor = 5.0
// minDelayMs = 3000
// respectCrawlDelayUpToSeconds = 300
// maxDelayMs = 30000
// maxPerHostBandwidthUsageKbSec = 0
}
/*
rescheduler(ReschedulingProcessor) {
rescheduleDelaySeconds = -1
}
*/
// now, processors are assembled into ordered DispositionChain bean
dispositionProcessors(DispositionChain) {
processors = [
// write to aggregate archival files...
ref('warcWriter'),
// ...send each outlink candidate URI to CandidateChain,
// and enqueue those ACCEPTed to the frontier...
ref('candidates'),
// ...then update stats, shared-structures, frontier decisions
ref('disposition')
// ref('rescheduler')
]
}
/**
* CRAWLCONTROLLER: Control interface, unifying context
*/
crawlController(CrawlController) {
// maxToeThreads = 25
// pauseAtStart = true
// runWhileEmpty = false
// recorderInBufferBytes = 524288
// recorderOutBufferBytes = 16384
// scratchDir = 'scratch'
}
/**
* FRONTIER: Record of all URIs discovered and queued-for-collection
*/
frontier(BdbFrontier) {
// queueTotalBudget = -1
// balanceReplenishAmount = 3000
// errorPenaltyAmount = 100
// precedenceFloor = 255
// queuePrecedencePolicy = new org.archive.crawler.frontier.precedence.BaseQueuePrecedencePolicy()
// snoozeLongMs = 300000
// retryDelaySeconds = 900
// maxRetries = 30
// recoveryLogEnabled = true
// maxOutlinks = 6000
// extractIndependently = false
// outbound = new ArrayBlockingQueue(200, true)
// inbound = new ArrayBlockingQueue(40000, true)
// dumpPendingAtClose = false
}
/**
* URI UNIQ FILTER: Used by frontier to remember already-included URIs
*/
uriUniqFilter(BdbUriUniqFilter)
/*
* EXAMPLE SETTINGS OVERLAY SHEETS
* Sheets allow some settings to vary by context - usually by URI context,
* so that different sites or sections of sites can be treated differently.
* Here are some example Sheets for common purposes. The SheetOverlaysManager
* (below) automatically collects all Sheet instances declared among the
* original beans, but others can be added during the crawl via the scripting
* interface.
*/
/**
* forceRetire: any URI to which this sheet's settings are applied
* will force its containing queue to 'retired' status.
*/
forceRetire(Sheet) {
map = [
'disposition.forceRetire': 'true'
]
}
/**
* smallBudget: any URI to which this sheet's settings are applied
* will give its containing queue small values for balanceReplenishAmount
* (causing it to have shorter 'active' periods while other queues are
* waiting) and queueTotalBudget (causing the queue to enter 'retired'
* status once that expenditure is reached by URI attempts and errors)
*/
smallBudget(Sheet) {
map = [
'frontier.balanceReplenishAmount': '20',
'frontier.queueTotalBudget': '100'
]
}
/**
* veryPolite: any URI to which this sheet's settings are applied
* will cause its queue to take extra-long politeness snoozes
*/
veryPolite(Sheet) {
map = [
'disposition.delayFactor': '10',
'disposition.minDelayMs': '10000',
'disposition.maxDelayMs': '1000000',
'disposition.respectCrawlDelayUpToSeconds': '3600'
]
}
/**
* highPrecedence: any URI to which this sheet's settings are applied
* will give its containing queue a slightly-higher than default
* queue precedence value. That queue will then be preferred over
* other queues for active crawling, never waiting behind lower-
* precedence queues.
*/
highPrecedence(Sheet) {
map = [
'frontier.balanceReplenishAmount': '20',
'frontier.queueTotalBudget': '100'
]
}
/*
* EXAMPLE SETTINGS OVERLAY SHEET-ASSOCIATION
* A SheetAssociation says certain URIs should have certain overlay Sheets
* applied. This example applies two sheets to URIs matching two SURT-prefixes.
* New associations may also be added mid-crawl using the scripting facility.
*/
/*
surtPrefixesSheetAssociation(SurtPrefixesSheetAssociation) {
surtPrefixes = [
'http://(org,example,',
'http://(com,example,www,)/'
]
targetSheetNames = [
'veryPolite',
'smallBudget'
]
}
*/
/*
* OPTIONAL BUT RECOMMENDED BEANS
*/
/**
* ACTIONDIRECTORY: disk directory for mid-crawl operations
* Running job will watch directory for new files with URIs,
* scripts, and other data to be processed during a crawl.
*/
actionDirectory(ActionDirectory) {
// actionDir = 'action'
// doneDir = '${launchId}/actions-done'
// initialDelaySeconds = 10
// delaySeconds = 30
}
/**
* CRAWLLIMITENFORCER: stops crawl when it reaches configured limits
*/
crawlLimiter(CrawlLimitEnforcer) {
// maxBytesDownload = 0
// maxDocumentsDownload = 0
// maxTimeSeconds = 0
}
/**
* CHECKPOINTSERVICE: checkpointing assistance
*/
checkpointService(CheckpointService) {
// checkpointIntervalMinutes = -1
// checkpointOnShutdown = true
// checkpointsDir = 'checkpoints'
// forgetAllButLatest = true
}
/*
* OPTIONAL BEANS
*
* Uncomment and expand as needed, or if non-default alternate
* implementations are preferred.
*/
/**
* CANONICALIZATION POLICY
*/
/*
canonicalizationPolicy(RulesCanonicalizationPolicy) {
rules = [
new LowercaseRule(),
new StripUserinfoRule(),
new StripWWWNRule(),
new StripSessionIDs(),
new StripSessionCFIDs(),
new FixupQueryString()
]
}
*/
/**
* QUEUE ASSIGNMENT POLICY
*/
/*
queueAssignmentPolicy(SurtAuthorityQueueAssignmentPolicy) {
forceQueueAssignment = ''
deferToPrevious = true
parallelQueues = 1
}
*/
/**
* URI PRECEDENCE POLICY
*/
// uriPrecedencePolicy(CostUriPrecedencePolicy)
/**
* COST ASSIGNMENT POLICY
*/
costAssignmentPolicy(UnitCostAssignmentPolicy)
/**
* CREDENTIAL STORE: HTTP authentication or FORM POST credentials
*/
// credentialStore(org.archive.modules.credential.CredentialStore)
/**
* DISK SPACE MONITOR:
* Pauses the crawl if disk space at monitored paths falls below minimum threshold
* Note: If there's less than 5 GiB free for state directory BDB will throw
* an error which the crawl job will likely not be able to fully recover from.
*/
/*
diskSpaceMonitor(DiskSpaceMonitor) {
pauseThresholdMiB = 8192
monitorConfigPaths = true
monitorPaths = [
'PATH'
]
}
*/
/*
* REQUIRED STANDARD BEANS
* It will be very rare to replace or reconfigure the following beans.
*/
/**
* STATISTICSTRACKER: standard stats/reporting collector
*/
statisticsTracker(StatisticsTracker) { bean ->
bean.autowire = 'byName'
/*
reports = [
new CrawlSummaryReport(),
new SeedsReport(),
new HostsReport(
maxSortSize: -1,
suppressEmptyHosts: false,
),
new SourceTagsReport(),
new MimetypesReport(),
new ResponseCodeReport(),
new ProcessorsReport(),
new FrontierSummaryReport(),
new FrontierNonemptyReport(),
new ToeThreadsReport(),
]
*/
// reportsDir = '${launchId}/reports'
// liveHostReportSize = 20
// intervalSeconds = 20
// keepSnapshotsCount = 5
// liveHostReportSize = 20
}
/**
* CRAWLERLOGGERMODULE: shared logging facility
*/
loggerModule(CrawlerLoggerModule) {
// path = '${launchId}/logs'
// crawlLogPath = 'crawl.log'
// alertsLogPath = 'alerts.log'
// progressLogPath = 'progress-statistics.log'
// uriErrorsLogPath = 'uri-errors.log'
// runtimeErrorsLogPath = 'runtime-errors.log'
// nonfatalErrorsLogPath = 'nonfatal-errors.log'
// logExtraInfo = false
}
/**
* SHEETOVERLAYMANAGER: manager of sheets of contextual overlays
* Autowired to include any SheetForSurtPrefix or
* SheetForDecideRuled beans
*/
sheetOverlaysManager(SheetOverlaysManager) { bean ->
bean.autowire = 'byType'
}
/**
* BDBMODULE: shared BDB-JE disk persistence manager
*/
bdb(BdbModule) {
// dir = 'state'
/*
* if neither cachePercent or cacheSize are specified (the default), bdb
* uses its own default of 60%
*/
// cachePercent = 0
// cacheSize = 0
// useSharedCache = true
// expectedConcurrency = 25
}
/**
* BDBCOOKIESTORE: disk-based cookie storage for FetchHTTP
*/
cookieStore(BdbCookieStore) {
// cookiesLoadFile = null
// cookiesSaveFile = null
// bdbModule = ref('bdb')
}
/**
* SERVERCACHE: shared cache of server/host info
*/
serverCache(BdbServerCache) {
// bdb = ref('bdb')
}
/**
* CONFIG PATH CONFIGURER: required helper making crawl paths relative
* to crawler-beans.cxml file, and tracking crawl files for web U
*/
configPathConfigurer(ConfigPathConfigurer)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment