order.xml是整个Heritrix的核心,里面的每一个配置都关系到Heritrix的运行情况。没读源码之前我只能从有限的渠道去获知这些配置的运用,读完之后才知道Heritrix竟然有如此灵活的运用:如可以控制抓取速度,可以优化电脑性能,可以在某一次抓取的基础上继续抓取。当然整个order.xml我也没有全部掌握,只知道大部分配置的作用,希望大家指点改正以及补充,谢谢!
1.<meta></meta> 代表着该抓取JOB的元信息,相当于Html的meta
- <meta>
- <name>myheritrix</name>
- <description>myheritrix</description>
- <operator>Admin</operator>
- <organization></organization>
- <audience></audience>
- <date>20090520051654</date>
- </meta>
2.<controller></controller> 跟抓取有关的所有参数,由于内容较多,并且Heritrix也已将它们分成不同模块,所以这里我也将它们拆分来说明.
- <controller>
- <string name="settings-directory">settings</string>
- <string name="disk-path"></string>
- <string name="logs-path">logs</string>
- <string name="checkpoints-path">checkpoints</string>
- <string name="state-path">state</string>
- <string name="scratch-path">scratch</string>
- <long name="max-bytes-download">0</long>
- <long name="max-document-download">0</long>
- <long name="max-time-sec">0</long>
- <integer name="max-toe-threads">30</integer>
- <integer name="recorder-out-buffer-bytes">4096</integer>
- <integer name="recorder-in-buffer-bytes">65536</integer>
- <integer name="bdb-cache-percent">0</integer>
- <newObject name="scope" class="org.archive.crawler.deciderules.DecidingScope">
- </newObject>
- <map name="http-headers">
- </map>
- <newObject name="robots-honoring-policy" class="org.archive.crawler.datamodel.RobotsHonoringPolicy">
- </newObject>
- <newObject name="frontier" class="org.archive.crawler.frontier.BdbFrontier"><!--Frontier调度器,等下拆分来说明-->
- </newObject>
- <map name="uri-canonicalization-rules">
- </map>
- <map name="pre-fetch-processors">
- </map>
- <map name="fetch-processors">
- </map>
- <map name="extract-processors">
- </map>
- <map name="write-processors">
- </map>
- <map name="post-processors">
- </map>
- <map name="loggers">
- </map>
- <newObject name="credential-store" class="org.archive.crawler.datamodel.CredentialStore">
- </newObject>
- </controller>
3.接下来拆分每个组件的配置文件一一进行说明,最后对Heritrix主要的配置也就是我们可以影响抓取的配置进行说明。
3.1:抓取范围<newObject name="scope" class="org.archive.crawler.deciderules.DecidingScope">
- <newObject name="scope" class="org.archive.crawler.deciderules.DecidingScope">
- <boolean name="enabled">false</boolean>
- <string name="seedsfile">seeds.txt</string>
- <boolean name="reread-seeds-on-config">true</boolean>
- <newObject name="decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- <newObject name="rejectByDefault" class="org.archive.crawler.deciderules.RejectDecideRule">
- </newObject>
- <newObject name="acceptIfSurtPrefixed" class="org.archive.crawler.deciderules.SurtPrefixedDecideRule">
- <string name="decision">ACCEPT</string>
- <string name="surts-source-file"></string>
- <boolean name="seeds-as-surt-prefixes">true</boolean>
- <string name="surts-dump-file"></string>
- <boolean name="also-check-via">false</boolean>
- <boolean name="rebuild-on-reconfig">true</boolean>
- </newObject>
- <newObject name="rejectIfTooManyHops" class="org.archive.crawler.deciderules.TooManyHopsDecideRule">
- <integer name="max-hops">20</integer>
- </newObject>
- <newObject name="acceptIfTranscluded" class="org.archive.crawler.deciderules.TransclusionDecideRule">
- <integer name="max-trans-hops">3</integer>
- <integer name="max-speculative-hops">1</integer>
- </newObject>
- <newObject name="rejectIfPathological" class="org.archive.crawler.deciderules.PathologicalPathDecideRule">
- <integer name="max-repetitions">2</integer>
- </newObject>
- <newObject name="rejectIfTooManyPathSegs" class="org.archive.crawler.deciderules.TooManyPathSegmentsDecideRule">
- <integer name="max-path-depth">20</integer>
- </newObject>
- <newObject name="acceptIfPrerequisite" class="org.archive.crawler.deciderules.PrerequisiteAcceptDecideRule">
- </newObject>
- </map>
- </newObject>
- </newObject>
3.2: HTTP协议<map name="http-headers">
- <map name="http-headers">
- <string name="user-agent">Mozilla/5.0 (compatible; heritrix/1.14.3 +http://127.0.0.1)</string>
- <string name="from">guoyunsky@hotmail.com</string>
- </map>
3.3:爬虫协议 <newObject name="robots-honoring-policy" class="org.archive.crawler.datamodel.RobotsHonoringPolicy">
- <newObject name="robots-honoring-policy" class="org.archive.crawler.datamodel.RobotsHonoringPolicy">
- <string name="type">classic</string>
- <boolean name="masquerade">false</boolean>
- <text name="custom-robots"></text>
- <stringList name="user-agents">
- </stringList>
- </newObject>
3.4:Frontier 调度器<newObject name="frontier" class="org.archive.crawler.frontier.BdbFrontier"><!-- Frontier 调度器-->
- <newObject name="frontier" class="org.archive.crawler.frontier.BdbFrontier">
- <float name="delay-factor">4.0</float>
- <integer name="max-delay-ms">20000</integer>
- <integer name="min-delay-ms">2000</integer>
- <integer name="respect-crawl-delay-up-to-secs">300</integer>
- <integer name="max-retries">30</integer>
- <long name="retry-delay-seconds">900</long>
- <integer name="preference-embed-hops">1</integer>
- <integer name="total-bandwidth-usage-KB-sec">0</integer>
- <integer name="max-per-host-bandwidth-usage-KB-sec">0</integer>
- <string name="queue-assignment-policy">org.archive.crawler.frontier.HostnameQueueAssignmentPolicy</string>
- <string name="force-queue-assignment"></string>
- <boolean name="pause-at-start">false</boolean>
- <boolean name="pause-at-finish">false</boolean>
- <boolean name="source-tag-seeds">false</boolean>
- <boolean name="recovery-log-enabled">true</boolean>
- <boolean name="hold-queues">true</boolean>
- <integer name="balance-replenish-amount">3000</integer>
- <integer name="error-penalty-amount">100</integer>
- <long name="queue-total-budget">-1</long>
- <string name="cost-policy">org.archive.crawler.frontier.ZeroCostAssignmentPolicy</string>
- <long name="snooze-deactivate-ms">300000</long>
- <integer name="target-ready-backlog">50</integer>
- <string name="uri-included-structure">org.archive.crawler.util.BdbUriUniqFilter</string>
- <boolean name="dump-pending-at-close">false</boolean>
- </newObject>
3.5:URL规范化规则,主要用来规范化每个URL,用Heritrix默认的就好了,这里不做说明了,其实也就是通过各种规则对URL进行处理。
3.6:预先处理链组件: <map name="pre-fetch-processors">
- <map name="pre-fetch-processors">
- <newObject name="Preselector" class="org.archive.crawler.prefetch.Preselector">
- <boolean name="enabled">true</boolean>
- <newObject name="Preselector#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <boolean name="override-logger">false</boolean>
- <boolean name="recheck-scope">true</boolean>
- <boolean name="block-all">false</boolean>
- <string name="block-by-regexp"></string>
- <string name="allow-by-regexp"></string>
- </newObject>
- <newObject name="Preprocessor" class="org.archive.crawler.prefetch.PreconditionEnforcer">
- <boolean name="enabled">true</boolean>
- <newObject name="Preprocessor#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <integer name="ip-validity-duration-seconds">86400</integer>
- <integer name="robot-validity-duration-seconds">86400</integer>
- <boolean name="calculate-robots-only">false</boolean>
- </newObject>
- </map>
3.7:获取组件:<map name="fetch-processors">
- <map name="fetch-processors">
- <newObject name="DNS" class="org.archive.crawler.fetcher.FetchDNS">
- <boolean name="enabled">true</boolean>
- <newObject name="DNS#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <boolean name="accept-non-dns-resolves">false</boolean>
- <boolean name="digest-content">true</boolean>
- <string name="digest-algorithm">sha1</string>
- </newObject>
- <newObject name="HTTP" class="org.archive.crawler.fetcher.FetchHTTP">
- <boolean name="enabled">true</boolean>
- <newObject name="HTTP#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <newObject name="midfetch-decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <integer name="timeout-seconds">1200</integer>
- <integer name="sotimeout-ms">20000</integer>
- <integer name="fetch-bandwidth">0</integer>
- <long name="max-length-bytes">0</long>
- <boolean name="ignore-cookies">false</boolean>
- <boolean name="use-bdb-for-cookies">true</boolean>
- <string name="load-cookies-from-file"></string>
- <string name="save-cookies-to-file"></string>
- <string name="trust-level">open</string>
- <stringList name="accept-headers">
- </stringList>
- <string name="http-proxy-host"></string>
- <string name="http-proxy-port"></string>
- <string name="default-encoding">GB2312</string>
- <boolean name="digest-content">true</boolean>
- <string name="digest-algorithm">sha1</string>
- <boolean name="send-if-modified-since">true</boolean>
- <boolean name="send-if-none-match">true</boolean>
- <boolean name="send-connection-close">true</boolean>
- <boolean name="send-referer">true</boolean>
- <boolean name="send-range">false</boolean>
- <string name="http-bind-address"></string>
- </newObject>
- </map>
3.8:抽取组件<map name="extract-processors"> <!-- 抽取链 -->
- <map name="extract-processors">
- <newObject name="ExtractorHTTP" class="org.archive.crawler.extractor.ExtractorHTTP">
- <boolean name="enabled">true</boolean>
- <newObject name="ExtractorHTTP#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- </newObject>
- <newObject name="ExtractorHTML" class="org.archive.crawler.extractor.ExtractorHTML">
- <boolean name="enabled">true</boolean>
- <newObject name="ExtractorHTML#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <boolean name="extract-javascript">true</boolean>
- <boolean name="treat-frames-as-embed-links">true</boolean>
- <boolean name="ignore-form-action-urls">true</boolean>
- <boolean name="extract-only-form-gets">true</boolean>
- <boolean name="extract-value-attributes">true</boolean>
- <boolean name="ignore-unexpected-html">true</boolean>
- </newObject>
- </map>
3.9:写组件<map name="write-processors">
- <map name="write-processors">
- <newObject name="Archiver" class="com.steel.heritrix.extend.MyWriterMirror">
- <boolean name="enabled">true</boolean>
- <newObject name="Archiver#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <boolean name="case-sensitive">true</boolean>
- <stringList name="character-map"/>
- <stringList name="content-type-map"/>
- <string name="directory-file">index.html</string>
- <string name="dot-begin">%2E</string>
- <string name="dot-end">.</string>
- <stringList name="host-map"/>
- <boolean name="host-directory">true</boolean>
- <string name="path">mirror</string>
- <integer name="max-path-length">1023</integer>
- <integer name="max-segment-length">255</integer>
- <boolean name="port-directory">false</boolean>
- <boolean name="suffix-at-end">true</boolean>
- <string name="too-long-directory">LONG</string>
- <stringList name="underscore-set"/>
- </newObject>
- </map>
3.10:后处理链组件<map name="post-processors">,里面可以配置自己的调度器
- <map name="post-processors">
- <newObject name="Updater" class="org.archive.crawler.postprocessor.CrawlStateUpdater">
- <boolean name="enabled">true</boolean>
- <newObject name="Updater#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- </newObject>
- <newObject name="LinksScoper" class="org.archive.crawler.postprocessor.LinksScoper">
- <boolean name="enabled">true</boolean>
- <newObject name="LinksScoper#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <boolean name="override-logger">false</boolean><!--如果启用则覆盖这个类的默认日志器,默认日志器将日志打印在控制台.覆盖的日志器将把所有日志发送到
- 日志目录下以本类命名的日志文件中。在heritrix.properties中设置好日志等级和日志格式,这个属性在重启后只获取一次.-->
- <boolean name="seed-redirects-new-seed">true</boolean>
- <integer name="preference-depth-hops">-1</integer>
- <newObject name="scope-rejected-url-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- </newObject>
- <newObject name="Scheduler" class="com.steel.heritrix.extend.MyFrontierScheduler">
- <boolean name="enabled">true</boolean>
- <newObject name="Scheduler#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- </newObject>
- </map>
3.11:统计跟踪链组件<map name="loggers">
- <map name="loggers">
- <newObject name="crawl-statistics" class="org.archive.crawler.admin.StatisticsTracker">
- <integer name="interval-seconds">20</integer>
- </newObject>
- </map>
(编辑:李大同)
【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!
|