GitHub user JaspreetSinghChahal created a discussion: Storm crawler not honouring crawl delay
Hi,
I have spent days on this but couldn't get it to honour the crawl delay. I have tried multiple combinations of settings. I'm posting here as a last resort.
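For reference, these are the settings I am relying on to enforce the delay (the full files are below). As far as I understand the documentation, they should force a gap of 900 seconds between successive fetches to the same host:

fetcher.server.delay: 900.0
fetcher.server.min.delay: 900.0
fetcher.max.crawl.delay: 900
fetcher.server.delay.force: true
fetcher.max.crawl.delay.force: true
fetcher.threads.per.queue: 1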
My crawler.flux
name: "crawler"
includes:
- resource: true
file: "/crawler-default.yaml"
override: false
- resource: false
file: "crawler-conf.yaml"
override: true
- resource: false
file: "opensearch-conf.yaml"
override: true
components:
- id: "WARCFileNameFormat"
className: "org.apache.stormcrawler.warc.WARCFileNameFormat"
configMethods:
- name: "withPath"
args:
- "/data/warc_storage" # Ensure this matches your Docker volume
- name: "withPrefix"
args:
- "crawler-data"
- id: "WARCFileRotationPolicy"
className: "org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy"
constructorArgs:
- 1
- GB
spouts:
- id: "spout"
className: "org.apache.stormcrawler.opensearch.persistence.AggregationSpout"
parallelism: 2
bolts:
- id: "partitioner"
className: "org.apache.stormcrawler.bolt.URLPartitionerBolt"
parallelism: 4
- id: "fetcher"
className: "org.apache.stormcrawler.bolt.FetcherBolt"
parallelism: 4
- id: "sitemap"
className: "org.apache.stormcrawler.bolt.SiteMapParserBolt"
parallelism: 2
- id: "feed"
className: "org.apache.stormcrawler.bolt.FeedParserBolt"
parallelism: 2
- id: "parse"
className: "org.apache.stormcrawler.bolt.JSoupParserBolt"
parallelism: 4
- id: "shunt"
className: "org.apache.stormcrawler.tika.RedirectionBolt"
parallelism: 2
- id: "tika"
className: "org.apache.stormcrawler.tika.ParserBolt"
parallelism: 4
- id: "index"
className: "org.apache.stormcrawler.opensearch.bolt.IndexerBolt"
parallelism: 2
- id: "warc"
className: "org.apache.stormcrawler.warc.WARCHdfsBolt"
parallelism: 2
configMethods:
- name: "withFileNameFormat"
args:
- ref: "WARCFileNameFormat"
- name: "withRotationPolicy"
args:
- ref: "WARCFileRotationPolicy"
- name: "withRequestRecords"
- name: "withConfigKey"
args:
- "warc"
- id: "status"
className: "org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt"
parallelism: 4
- id: "deleter"
className: "org.apache.stormcrawler.opensearch.bolt.DeletionBolt"
parallelism: 1
- id: "status_metrics"
className: "org.apache.stormcrawler.opensearch.metrics.StatusMetricsBolt"
parallelism: 1
streams:
- from: "spout"
to: "partitioner"
grouping:
type: FIELDS
args: ["url"]
# System Tick -> Status Metrics
- from: "__system"
to: "status_metrics"
grouping:
type: SHUFFLE
streamId: "__tick"
- from: "partitioner"
to: "fetcher"
grouping:
type: FIELDS
args: ["key"]
# Tapping the fetcher output to send raw HTML to WARC storage
- from: "fetcher"
to: "warc"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "fetcher"
to: "sitemap"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "fetcher"
to: "feed"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "sitemap"
to: "feed"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "feed"
to: "parse"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "parse"
to: "shunt"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "shunt"
to: "tika"
grouping:
type: LOCAL_OR_SHUFFLE
streamId: "tika"
- from: "parse"
to: "index"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "tika"
to: "index"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "fetcher"
to: "status"
grouping:
type: FIELDS
args: ["url"]
streamId: "status"
- from: "sitemap"
to: "status"
grouping:
type: FIELDS
args: ["url"]
streamId: "status"
- from: "feed"
to: "status"
grouping:
type: FIELDS
args: ["url"]
streamId: "status"
- from: "parse"
to: "status"
grouping:
type: FIELDS
args: ["url"]
streamId: "status"
- from: "tika"
to: "status"
grouping:
type: FIELDS
args: ["url"]
streamId: "status"
- from: "index"
to: "status"
grouping:
type: FIELDS
args: ["url"]
streamId: "status"
# Status -> Deleter
- from: "status"
to: "deleter"
grouping:
type: LOCAL_OR_SHUFFLE
streamId: "deletion"
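The partitioner groups URLs on the "key" field before they reach the fetcher, and my understanding is that the delay is enforced per fetch queue. I have not set fetcher.queue.mode anywhere, so I am assuming it falls back to its default and lines up with partition.url.mode; a minimal sketch of what I believe the relevant keys are (fetcher.queue.mode is not in my files, I am assuming the default):

# assumption: queueing mode left at its default, grouping by host
fetcher.queue.mode: "byHost"
partition.url.mode: "byHost"
fetcher.threads.per.queue: 1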
My crawler-conf.yaml
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Custom configuration for StormCrawler
# This is used to override the default values from crawler-default.xml and
# provide additional ones for your custom components.
# Use this file with the parameter -conf when launching your extension of
# ConfigurableTopology.
# This file does not contain all the key values but only the most frequently
# used ones. See crawler-default.xml for an extensive list.
config:
topology.workers: 1
topology.message.timeout.secs: 1200
topology.max.spout.pending: 300
topology.debug: false
topology.receiver.buffer.size: 16
topology.transfer.buffer.size: 1024
# --- Extreme Politeness Settings ---
# 15 minutes = 900 seconds
fetcher.server.delay: 900.0
fetcher.server.min.delay: 900.0
# generous max. crawl delay
# (fetch the content even if the robots.txt specifies a large host-specific
# crawl delay of up to 900 seconds)
fetcher.max.crawl.delay: 900
# use the larger default delay (fetcher.server.delay)
# in case a shorter crawl-delay is defined in the robots.txt
fetcher.server.delay.force: true
fetcher.max.crawl.delay.force: true
# Ensure only ONE thread touches any domain at a time
fetcher.threads.per.queue: 1
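# (for clarity: with a 900 s delay and a single thread per queue I would expect
# at most 86400 / 900 = 96 fetches per host per day)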
# limit the number of queued URLs
# - avoid duplicate fetches (queues are not sets)
fetcher.max.queue.size: 10
fetcher.max.urls.in.queues: 10000
# --- Reduced thread count to limit Playwright protocol lock contention ---
# Keeping the total fetcher threads (10) close to the number of Playwright
# browser instances to prevent thread starvation on the protocol lock
fetcher.threads.number: 10
# override the JVM parameters for the workers
# (a single worker with a 10 GB heap on a 32 GB machine)
topology.worker.childopts: "-Xmx10g -Djava.net.preferIPv4Stack=true"
# mandatory when using Flux
topology.kryo.register:
- org.apache.stormcrawler.Metadata: org.apache.stormcrawler.util.MetadataSerializer
- org.apache.stormcrawler.Metadata
- org.apache.stormcrawler.persistence.Status
- java.util.Collections$EmptyMap
- java.util.Collections$EmptyList
- java.util.LinkedList
- java.util.ArrayList
- java.util.HashMap
- java.util.HashSet
topology.kryo.registration.required: false
partition.url.mode: "byHost"
# fetch Scheduler implementation
scheduler.class: "org.apache.stormcrawler.persistence.AdaptiveScheduler"
# AdaptiveScheduler properties
scheduler.adaptive.setLastModified: true
# frequently changing feeds or news sitemaps are refetched after 90 min.
scheduler.adaptive.fetchInterval.min: 90
# if there are no changes the interval may grow to 7 days
scheduler.adaptive.fetchInterval.max: 10080
scheduler.adaptive.fetchInterval.rate.incr: .5
scheduler.adaptive.fetchInterval.rate.decr: .2
fetchInterval.isFeed: 60
fetchInterval.isHub: 720
# Lists the metadata to transfer to outlinks
# Used by Fetcher and SiteMapParser for redirections,
# discovered links, passing cookies to child pages, etc.
# These are also persisted for the parent document (see below).
# Allows wildcards, eg. "follow.*" transfers all metadata starting with "follow.".
#metadata.transfer:
# - "depth"
# - "source"
# - "referrer"
# Lists the metadata to persist to storage
# These are not transferred to the outlinks. Also allows wildcards, eg. "follow.*".
metadata.transfer:
- "sub_index"
metadata.persist:
- _redirTo
- error.cause
- fetch.statusCode
- fetch.exception
- discoveryDate
- lastProcessedDate
- last-modified
- signature
- signatureChangeDate
- canonical
- domain
- host
- parse.robots
- index.timestamp
- depth
- numLinks
- isHub
- isFeed
- isSitemap
- sub_index
metadata.track.path: true
metadata.track.depth: true
# Agent name info - given here as an example. Do not be an anonymous coward,
# use your real information!
# The full user agent value sent as part of the HTTP requests
# is built from the elements below. Only the agent.name is mandatory,
# it is also used to parse the robots.txt directives.
# The agent name must be compliant with RFC 9309 (section 2.2.1)
# i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"),
# underscores ("_"), and hyphens ("-")
http.agent.name: "AbcBot"
# version of your crawler
http.agent.version: "1"
# description of what it does
http.agent.description: "Abc."
# URL webmasters can go to to learn about it
http.agent.url: "https://abc.com"
# Finally, an email so that they can get in touch with you
http.agent.email: "[email protected]"
http.protocol.implementation: "org.apache.stormcrawler.protocol.DelegatorProtocol"
https.protocol.implementation: "org.apache.stormcrawler.protocol.DelegatorProtocol"
protocol.delegator.config:
# Route 1: Feeds (by URL pattern)
- className: "org.apache.stormcrawler.protocol.httpclient.HttpProtocol"
regex:
- ".*\\.rss$"
- ".*\\.atom$"
- ".*\\.xml$"
- ".*/rss/.*"
- ".*/feed/.*"
- ".*/feeds/.*"
- ".*/atom/.*"
# Route 2: Sitemaps (by URL pattern)
- className: "org.apache.stormcrawler.protocol.httpclient.HttpProtocol"
regex:
- ".*sitemap.*\\.xml$"
# Route 3: Default - everything else (dynamic content) uses Playwright
- className: "org.apache.stormcrawler.protocol.playwright.HttpProtocol"
# The maximum number of bytes for returned HTTP response bodies.
# The fetched page will be trimmed to 5 MB (5242880 bytes) in this case.
# Set -1 to disable the limit.
http.content.limit: 5242880
http.timeout: 60000
# store partial fetches as trimmed content (some content has been fetched,
# but reading more data from socket failed, eg. because of a network timeout)
http.content.partial.as.trimmed: true
sitemap.discovery: true
sitemap.sniffContent: true
feed.sniffContent: true
# FetcherBolt queue dump => uncomment the key below to activate
# if a file exists on the worker machine with the corresponding port number
# the FetcherBolt will log the content of its internal queues to the logs
# fetcherbolt.queue.debug.filepath: "/tmp/fetcher-dump-{port}"
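# (if useful for debugging, I can uncomment the key above and create the
# /tmp/fetcher-dump-<port> file on a worker to capture the state of the
# internal fetch queues)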
parser.filters:
- class: "org.apache.stormcrawler.parse.filter.BoilerpipeFilter"
filters:
- text/html
parsefilters.config.file: "parsefilters.json"
urlfilters.config.file: "urlfilters.json"
jsoup.filters.config.file: "jsoupfilters.json"
indexingfilters.config.file: "indexingfilters.json"
# revisit a page daily (value in minutes)
# set it to -1 to never refetch a page
fetchInterval.default: 1440
# revisit a page with a fetch error after a day (value in minutes)
# set it to -1 to never refetch a page
fetchInterval.fetch.error: 1440
# never revisit a page with an error (or set a value in minutes)
fetchInterval.error: -1
# set to true if you don't need any text to be extracted by JSoup
textextractor.no.text: false
# text extraction for JSoupParserBolt
textextractor.include.pattern:
- MAIN
- ARTICLE
- BODY
textextractor.exclude.tags:
- STYLE
- SCRIPT
- NAV # Navigation menus (header, sidebar)
- FOOTER # Footers (links, copyright, metadata)
- HEADER # Main site header (usually logos, nav, search)
- ASIDE # Sidebars, supplementary, or tangential content (ads, related stories)
- FORM # Form fields (search boxes, comments)
# Common, non-content junk containers (optional but helpful)
- IFRAME # Embedded ads or external content (like your Taboola ad content)
- NOSCRIPT
# needed for parsing with Tika
jsoup.treat.non.html.as.error: false
# restricts the documents types to be parsed with Tika
parser.mimetype.whitelist:
- application/.+word.*
- application/.+excel.*
- application/.+powerpoint.*
- application/.*pdf.*
# Tika parser configuration file
parse.tika.config.file: "tika-config.xml"
# custom fetch interval to be used when a document has the key/value in its metadata
# and has been fetched successfully (value in minutes)
# fetchInterval.FETCH_ERROR.isFeed=true: 30
# fetchInterval.isFeed=true: 10
# configuration for the classes extending AbstractIndexerBolt
indexer.md.filter:
- isSitemap=true
- isFeed=true
indexer.url.fieldname: "url"
indexer.text.fieldname: "content"
indexer.canonical.name: "canonical"
# How to convert metadata key values into fields for indexing
#
# if no alias is specified with =alias, the key value is used
# for instance below, _domain_ and _format_ will be used
# as field names, whereas _title_ will be used for _parse.title_.
# You can specify the index of the value to store from the values array
# by using the _key[index]_ format, e.g. _parse.title[0]_ would try to
# get the first value for the metadata _parse.title_ (which is the default anyway).
# Finally, you can use a glob (*) to match all the keys, e.g. _parse.*_ would
# index all the keys with _parse_ as a prefix. Note that in that case, you
# can't specify an alias with =, nor can you specify an index.
indexer.md.mapping:
# --- Essentials ---
- canonical=canonical
- parse.robots=robots_meta
- parse.title=title
- parse.description=description
- parse.keywords=keywords
- parse.language=language
# --- Author Split ---
- parse.author.meta=author_meta
- parse.author.og=author_og
- parse.author.span=author_span
- parse.author.link=author_link
# --- Date Split ---
- parse.date.article_pub=date_article_pub
- parse.date.modified_time=date_modified_time
- parse.date.publish_date=date_publish_date
- parse.date.pubdate=date_pubdate
- parse.date.og_pub=date_og_pub
- parse.date.meta_date=date_meta_date
- parse.date.last_modified=date_last_modified
- parse.date.time_pub_class=date_time_raw
- parse.date.time_any=date_time_raw
# --- Images & Branding ---
- parse.image.og=image_og
- parse.image.twitter=image_twitter
- parse.image.link_src=image_link_src
- parse.img_width=thumbnail_width
- parse.img_height=thumbnail_height
- parse.site.og=site_og
- parse.site.app_name=site_app_name
- parse.site.apple_title=site_apple_title
- parse.og_type=og_type
- parse.og_locale=og_locale
# --- Tech & UI ---
- parse.favicon_hrefs=favicon_hrefs
- parse.favicon_sizes=favicon_sizes
- parse.apple_icon=apple_icon
# --- Content Cluster ---
- parse.content.cleaned=content
- parse.content.article=content_article
- parse.content.main=content_article_main
- parse.content.main_role=content_main_role
- parse.article:ld_json=ld_json
# --- Graph/Crawler Info ---
- domain=domain
- host=host
- outlinks=outlinks
- referrer=referenced_by
- depth=crawl_depth
- sub_index=sub_index
# Metrics consumers:
topology.metrics.consumer.register:
- class: "org.apache.storm.metric.LoggingMetricsConsumer"
parallelism.hint: 1
# worker heap sizing (6 GB per worker)
worker.heap.memory.mb: 6144
topology.worker.max.heap.size.mb: 6144.0
# 2. Selenium Server Endpoint (where the WebDriver is running)
#selenium.addresses: ["http://172.29.130.2:4444/wd/hub"]
#selenium.timeouts:
# pageLoad: 60000
#selenium.capabilities:
# browserName: "chrome"
# # Use 'goog:chromeOptions' which is the standard W3C way for Chrome-specific settings
# goog:chromeOptions:
# args:
# - "--headless"
# - "--no-sandbox"
# - "--disable-dev-shm-usage"
# protocol.playwright.launch.options
playwright.browser.type: "chromium"
playwright.headless: true
playwright.launch.args:
- "--no-sandbox"
- "--disable-setuid-sandbox"
- "--disable-dev-shm-usage"
- "--disable-gpu"
- "--disable-software-rasterizer"
- "--disable-extensions"
- "--disable-background-networking"
- "--disable-blink-features=AutomationControlled"
- "--lang=en-US"
playwright.navigation.timeout: 45000
playwright.browser.path: "/home/storm/.cache/ms-playwright"
playwright.browser.launch.options:
userAgent: "AbcBot/1.0 (+https://abc.com; [email protected])"
# Essential for WARC
fetcher.store.http.content: true
fetcher.store.http.headers: true
# WARC File Settings
warc: { "fs.file.impl": "org.apache.hadoop.fs.RawLocalFileSystem" }
http.store.headers: true
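In case the problem is with how the files override each other, this is how I understand the precedence from the flux includes (defaults first, then crawler-conf.yaml, then opensearch-conf.yaml applied last):

includes:
  - resource: true
    file: "/crawler-default.yaml"
    override: false
  - resource: false
    file: "crawler-conf.yaml"
    override: true        # overrides the defaults
  - resource: false
    file: "opensearch-conf.yaml"
    override: true        # applied last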
My opensearch-conf.yaml
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# configuration for OpenSearch resources
config:
# address to use unless a more specific one has been
# defined for a component
# also accepts a list or multiple values in a single line
# separated by a semi-colon e.g. "opensearch1:9200; opensearch2:9200"
opensearch.addresses: "https://search.sooth.fyi:9200"
opensearch.user: "admin"
opensearch.password: "yRF0yn2uiNNa1a0V"
opensearch.concurrentRequests: 1
# Disable TLS validation for connection to OpenSearch
opensearch.disable.tls.validation: true
# Indexer bolt
# addresses can be specified as a full URL
# if not we assume that the protocol is http and the port 9200
opensearch.indexer.addresses: "https://search.sooth.fyi:9200"
opensearch.indexer.index.name: "content"
# opensearch.indexer.pipeline: "_PIPELINE_"
opensearch.indexer.create: false
opensearch.indexer.bulkActions: 50
opensearch.indexer.flushInterval: "5s"
opensearch.indexer.concurrentRequests: 1
opensearch.indexer.sniff: false
# MetricsConsumer
opensearch.metrics.addresses: "https://search.sooth.fyi:9200"
opensearch.metrics.index.name: "metrics"
opensearch.metrics.sniff: false
# Spout and persistence bolt
opensearch.status.addresses: "https://search.sooth.fyi:9200"
opensearch.status.index.name: "status"
opensearch.status.user: "admin"
opensearch.status.password: "yRF0yn2uiNNa1a0V"
# the routing is done on the value of 'partition.url.mode'
opensearch.status.routing: true
# stores the value used for grouping the URLs as a separate field
# needed by the spout implementations
# also used for routing if the value above is set to true
opensearch.status.routing.fieldname: "key"
opensearch.status.bulkActions: 500
opensearch.status.flushInterval: "5s"
opensearch.status.concurrentRequests: 1
opensearch.status.sniff: false
# spout config #
# positive or negative filters parsable by the Lucene Query Parser
# opensearch.status.filterQuery:
# - "-(key:stormcrawler.net)"
# - "-(key:stormcrawler.apache.org)"
# time in secs for which the URLs will be considered for fetching after an ack or a fail
spout.ttl.purgatory: 900
# Min time (in msecs) to allow between 2 successive queries to OpenSearch
spout.min.delay.queries: 5000
# Max time (in msecs) to allow between 2 successive queries to OpenSearch
spout.max.delay.queries: 60000
# Delay since previous query date (in secs) after which the nextFetchDate
# value will be reset to the current time
# Setting this to -1 or a large value means that OpenSearch will cache the
# results but also that fewer and fewer results might be returned.
spout.reset.fetchdate.after: -1
opensearch.status.max.results: 500
opensearch.status.max.buckets: 3000
opensearch.status.max.urls.per.bucket: 1
# field to group the URLs into buckets
opensearch.status.bucket.field: "key"
# fields to sort the URLs within a bucket
opensearch.status.bucket.sort.field:
- "nextFetchDate"
- "url"
# field to sort the buckets
opensearch.status.global.sort.field: "nextFetchDate"
# AggregationSpout : sampling improves the performance on large crawls
opensearch.status.sample: true
# max allowed duration of a query in sec
opensearch.status.query.timeout: -1
# AggregationSpout (expert): adds this value in mins to the latest date
returned in the results and
# use it as nextFetchDate
opensearch.status.recentDate.increase: -1
opensearch.status.recentDate.min.gap: -1
topology.metrics.consumer.register:
- class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer"
parallelism.hint: 1
#whitelist:
# - "fetcher_counter"
# - "fetcher_average.bytes_fetched"
#blacklist:
# - "__receive.*"
log4j.logger.org.apache.stormcrawler.opensearch: DEBUG
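On the spout side I have also kept the status queries grouped by host, with a single URL per bucket, which I assumed would also help space out fetches per host:

partition.url.mode: "byHost"
opensearch.status.routing: true
opensearch.status.routing.fieldname: "key"
opensearch.status.bucket.field: "key"
opensearch.status.max.urls.per.bucket: 1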
GitHub link: https://github.com/apache/stormcrawler/discussions/1808