GitHub user JaspreetSinghChahal created a discussion: Storm crawler not honouring crawl delay

Hi,

I have spent days on this but couldn't get it to honour the crawl delay. I have tried multiple combinations. I have come here as a last resort.
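
For quick reference, these are the politeness settings from my crawler-conf.yaml below that I expect to control the delay:

  fetcher.server.delay: 900.0
  fetcher.server.min.delay: 900.0
  fetcher.server.delay.force: true
  fetcher.max.crawl.delay: 900
  fetcher.max.crawl.delay.force: true
  fetcher.threads.per.queue: 1
  partition.url.mode: "byHost"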

My crawler.flux
name: "crawler"

includes:
    - resource: true
      file: "/crawler-default.yaml"
      override: false

    - resource: false
      file: "crawler-conf.yaml"
      override: true

    - resource: false
      file: "opensearch-conf.yaml"
      override: true

components:
  - id: "WARCFileNameFormat"
    className: "org.apache.stormcrawler.warc.WARCFileNameFormat"
    configMethods:
      - name: "withPath"
        args:
          - "/data/warc_storage" # Ensure this matches your Docker volume
      - name: "withPrefix"
        args:
          - "crawler-data"

  - id: "WARCFileRotationPolicy"
    className: "org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy"
    constructorArgs:
      - 1
      - GB

spouts:
  - id: "spout"
    className: "org.apache.stormcrawler.opensearch.persistence.AggregationSpout"
    parallelism: 2

bolts:
  - id: "partitioner"
    className: "org.apache.stormcrawler.bolt.URLPartitionerBolt"
    parallelism: 4
  - id: "fetcher"
    className: "org.apache.stormcrawler.bolt.FetcherBolt"
    parallelism: 4
  - id: "sitemap"
    className: "org.apache.stormcrawler.bolt.SiteMapParserBolt"
    parallelism: 2
  - id: "feed"
    className: "org.apache.stormcrawler.bolt.FeedParserBolt"
    parallelism: 2
  - id: "parse"
    className: "org.apache.stormcrawler.bolt.JSoupParserBolt"
    parallelism: 4
  - id: "shunt"
    className: "org.apache.stormcrawler.tika.RedirectionBolt"
    parallelism: 2 
  - id: "tika"
    className: "org.apache.stormcrawler.tika.ParserBolt"
    parallelism: 4
  - id: "index"
    className: "org.apache.stormcrawler.opensearch.bolt.IndexerBolt"
    parallelism: 2
  - id: "warc"
    className: "org.apache.stormcrawler.warc.WARCHdfsBolt"
    parallelism: 2
    configMethods:
      - name: "withFileNameFormat"
        args:
          - ref: "WARCFileNameFormat"
      - name: "withRotationPolicy"
        args:
          - ref: "WARCFileRotationPolicy"
      - name: "withRequestRecords"
      - name: "withConfigKey"
        args:
          - "warc"
  - id: "status"
    className: "org.apache.stormcrawler.opensearch.persistence.StatusUpdaterBolt"
    parallelism: 4
  - id: "deleter"
    className: "org.apache.stormcrawler.opensearch.bolt.DeletionBolt"
    parallelism: 1
  - id: "status_metrics"
    className: "org.apache.stormcrawler.opensearch.metrics.StatusMetricsBolt"
    parallelism: 1


streams:
  - from: "spout"
    to: "partitioner"
    grouping:
      type: FIELDS
      args: ["url"]

# System Tick -> Status Metrics
  - from: "__system"
    to: "status_metrics"
    grouping:
      type: SHUFFLE
      streamId: "__tick"

  - from: "partitioner"
    to: "fetcher"
    grouping:
      type: FIELDS
      args: ["key"]
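    # (grouping on "key" so that all URLs of a host go to the same fetcher
    #  instance; I believe the per-host delay relies on this even with parallelism 4)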

  # Tapping the fetcher output to send raw HTML to WARC storage
  - from: "fetcher"
    to: "warc"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "fetcher"
    to: "sitemap"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "fetcher"
    to: "feed"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "sitemap"
    to: "feed"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "feed"
    to: "parse"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "parse"
    to: "shunt"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "shunt"
    to: "tika"
    grouping:
      type: LOCAL_OR_SHUFFLE
      streamId: "tika"

  - from: "parse"
    to: "index"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "tika"
    to: "index"
    grouping:
      type: LOCAL_OR_SHUFFLE

  - from: "fetcher"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "sitemap"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"
      
  - from: "feed"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "parse"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "tika"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

  - from: "index"
    to: "status"
    grouping:
      type: FIELDS
      args: ["url"]
      streamId: "status"

# Status -> Deleter
  - from: "status"
    to: "deleter"
    grouping:
      type: LOCAL_OR_SHUFFLE
      streamId: "deletion"

My crawler-conf.yaml
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Custom configuration for StormCrawler
# This is used to override the default values from crawler-default.xml and provide additional ones
# for your custom components.
# Use this file with the parameter -conf when launching your extension of ConfigurableTopology.
# This file does not contain all the key values but only the most frequently used ones.
# See crawler-default.xml for an extensive list.

config:
  topology.workers: 1
  topology.message.timeout.secs: 1200
  topology.max.spout.pending: 300
  topology.debug: false

  topology.receiver.buffer.size: 16
  topology.transfer.buffer.size: 1024

  # --- Extreme Politeness Settings ---
  # 15 minutes = 900 seconds
  fetcher.server.delay: 900.0
  fetcher.server.min.delay: 900.0

  # generous max. crawl delay
  # (fetch content even if the robots.txt specifies a large host-specific crawl delay:
  #  waiting 900 sec. between successive fetches still allows fetching
  #  roughly 96 pages per host per day)
  fetcher.max.crawl.delay: 900

  # use the larger default delay (fetcher.server.delay)
  # in case a shorter crawl-delay is defined in the robots.txt
  fetcher.server.delay.force: true
  fetcher.max.crawl.delay.force: true

  # Ensure only ONE thread touches any domain at a time
  fetcher.threads.per.queue: 1
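  # (as I understand it, the delay is enforced per fetch queue, and with
  #  partition.url.mode byHost each queue corresponds to a single host)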

  # limit the number of queued URLs
  # - avoid duplicate fetches (queues are not sets)
  fetcher.max.queue.size: 10
  fetcher.max.urls.in.queues: 10000

  # --- REDUCED thread count to prevent Playwright protocol lock contention ---
  # Reduced to 10 to match the Playwright browser instances
  # and prevent thread starvation on the protocol lock
  fetcher.threads.number: 10

  # override the JVM parameters for the workers
  # single worker (topology.workers: 1) with a 10GB heap
  topology.worker.childopts: "-Xmx10g -Djava.net.preferIPv4Stack=true"

  # mandatory when using Flux
  topology.kryo.register:
    - org.apache.stormcrawler.Metadata: org.apache.stormcrawler.util.MetadataSerializer
    - org.apache.stormcrawler.Metadata
    - org.apache.stormcrawler.persistence.Status
    - java.util.Collections$EmptyMap
    - java.util.Collections$EmptyList
    - java.util.LinkedList
    - java.util.ArrayList
    - java.util.HashMap
    - java.util.HashSet

  topology.kryo.registration.required: false

  partition.url.mode: "byHost"

  # fetch Scheduler implementation
  scheduler.class: "org.apache.stormcrawler.persistence.AdaptiveScheduler"
  # AdaptiveScheduler properties
  scheduler.adaptive.setLastModified: true
  # frequently changing feeds or news sitemaps are refetched after 90 min.
  scheduler.adaptive.fetchInterval.min: 90
  # if there are no changes the interval may grow to 7 days
  scheduler.adaptive.fetchInterval.max: 10080
  scheduler.adaptive.fetchInterval.rate.incr: .5
  scheduler.adaptive.fetchInterval.rate.decr: .2

  fetchInterval.isFeed: 60
  fetchInterval.isHub: 720
  # Lists the metadata to transfer to outlinks
  # Used by Fetcher and SiteMapParser for redirections,
  # discovered links, passing cookies to child pages, etc.
  # These are also persisted for the parent document (see below).
  # Allows wildcards, eg. "follow.*" transfers all metadata starting with "follow.".
  #metadata.transfer:
  #  - "depth"
  #  - "source"
  #  - "referrer"

  metadata.transfer:
    - "sub_index"

  # Lists the metadata to persist to storage
  # These are not transferred to the outlinks. Also allows wildcards, eg. "follow.*".
  metadata.persist:
    - _redirTo
    - error.cause
    - fetch.statusCode
    - fetch.exception
    - discoveryDate
    - lastProcessedDate
    - last-modified
    - signature
    - signatureChangeDate
    - canonical
    - domain
    - host
    - parse.robots
    - index.timestamp
    - depth
    - numLinks
    - isHub
    - isFeed
    - isSitemap
    - sub_index

  metadata.track.path: true
  metadata.track.depth: true

  # Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
  # The full user agent value sent as part of the HTTP requests
  # is built from the elements below. Only the agent.name is mandatory,
  # it is also used to parse the robots.txt directives.

  # The agent name must be compliant with RFC 9309 (section 2.2.1)
  # i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
  http.agent.name: "AbcBot"
  # version of your crawler
  http.agent.version: "1"
  # description of what it does
  http.agent.description: "Abc."
  # URL webmasters can go to to learn about it
  http.agent.url: "https://abc.com"
  # Finally, an email so that they can get in touch with you
  http.agent.email: "[email protected]"

  http.protocol.implementation: "org.apache.stormcrawler.protocol.DelegatorProtocol"
  https.protocol.implementation: "org.apache.stormcrawler.protocol.DelegatorProtocol"

  protocol.delegator.config:
    # Route 1: Feeds (by URL pattern)
    - className: "org.apache.stormcrawler.protocol.httpclient.HttpProtocol"
      regex:
        - ".*\\.rss$"
        - ".*\\.atom$"
        - ".*\\.xml$"
        - ".*/rss/.*"
        - ".*/feed/.*"
        - ".*/feeds/.*"
        - ".*/atom/.*"

    # Route 2: Sitemaps (by URL pattern)
    - className: "org.apache.stormcrawler.protocol.httpclient.HttpProtocol"
      regex:
        - ".*sitemap.*\\.xml$"

    # Route 3: Default - everything else (dynamic content) uses Playwright
    - className: "org.apache.stormcrawler.protocol.playwright.HttpProtocol"

  # The maximum number of bytes for returned HTTP response bodies.
  # The fetched page will be trimmed to 5 MB (5242880 bytes) in this case
  # Set -1 to disable the limit.
  http.content.limit: 5242880
  http.timeout: 60000

  # store partial fetches as trimmed content (some content has been fetched,
  # but reading more data from socket failed, eg. because of a network timeout)
  http.content.partial.as.trimmed: true

  sitemap.discovery: true
  sitemap.sniffContent: true

  feed.sniffContent: true

  # FetcherBolt queue dump => comment out to activate
  # if a file exists on the worker machine with the corresponding port number
  # the FetcherBolt will log the content of its internal queues to the logs
  # fetcherbolt.queue.debug.filepath: "/tmp/fetcher-dump-{port}"

  parser.filters:
    - class: "org.apache.stormcrawler.parse.filter.BoilerpipeFilter"
      filters:
        - text/html

  parsefilters.config.file: "parsefilters.json"
  urlfilters.config.file: "urlfilters.json"
  jsoup.filters.config.file: "jsoupfilters.json"
  indexingfilters.config.file: "indexingfilters.json"

  # revisit a page daily (value in minutes)
  # set it to -1 to never refetch a page
  fetchInterval.default: 1440

  # revisit a page with a fetch error after 2 hours (value in minutes)
  # set it to -1 to never refetch a page
  fetchInterval.fetch.error: 1440

  # never revisit a page with an error (or set a value in minutes)
  fetchInterval.error: -1

  # set to true if you don't need any text to be extracted by JSoup
  textextractor.no.text: false

  # text extraction for JSoupParserBolt
  textextractor.include.pattern:
    - MAIN
    - ARTICLE
    - BODY

  textextractor.exclude.tags:
    - STYLE
    - SCRIPT
    - NAV # Navigation menus (header, sidebar)
    - FOOTER # Footers (links, copyright, metadata)
    - HEADER # Main site header (usually logos, nav, search)
    - ASIDE # Sidebars, supplementary, or tangential content (ads, related stories)
    - FORM # Form fields (search boxes, comments)

    # Common, non-content junk containers (optional but helpful)
    - IFRAME # Embedded ads or external content (like your Taboola ad content)
    - NOSCRIPT

  # needed for parsing with Tika
  jsoup.treat.non.html.as.error: false

  # restricts the documents types to be parsed with Tika
  parser.mimetype.whitelist:
    - application/.+word.*
    - application/.+excel.*
    - application/.+powerpoint.*
    - application/.*pdf.*

  # Tika parser configuration file
  parse.tika.config.file: "tika-config.xml"

  # custom fetch interval to be used when a document has the key/value in its metadata
  # and has been fetched successfully (value in minutes)
  # fetchInterval.FETCH_ERROR.isFeed=true: 30
  # fetchInterval.isFeed=true: 10

  # configuration for the classes extending AbstractIndexerBolt
  indexer.md.filter:
    - isSitemap=true
    - isFeed=true

  indexer.url.fieldname: "url"
  indexer.text.fieldname: "content"
  indexer.canonical.name: "canonical"
  # How to convert metadata key values into fields for indexing
  #
  # if no alias is specified with =alias, the key value is used
  # for instance below, _domain_ and _format_ will be used
  # as field names, whereas _title_ will be used for _parse.title_.
  # You can specify the index of the value to store from the values array
  # by using the _key[index]_ format, e.g. _parse.title[0]_ would try to
  # get the first value for the metadata _parse.title_ (which is the default anyway).
  # Finally, you can use a glob (*) to match all the keys, e.g. _parse.*_ would
  # index all the keys with _parse_ as a prefix. Note that in that case, you can't
  # specify an alias with =, nor can you specify an index.
  indexer.md.mapping:
    # --- Essentials ---
    - canonical=canonical
    - parse.robots=robots_meta
    - parse.title=title
    - parse.description=description
    - parse.keywords=keywords
    - parse.language=language

    # --- Author Split ---
    - parse.author.meta=author_meta
    - parse.author.og=author_og
    - parse.author.span=author_span
    - parse.author.link=author_link

    # --- Date Split ---
    - parse.date.article_pub=date_article_pub
    - parse.date.modified_time=date_modified_time
    - parse.date.publish_date=date_publish_date
    - parse.date.pubdate=date_pubdate
    - parse.date.og_pub=date_og_pub
    - parse.date.meta_date=date_meta_date
    - parse.date.last_modified=date_last_modified
    - parse.date.time_pub_class=date_time_raw
    - parse.date.time_any=date_time_raw

    # --- Images & Branding ---
    - parse.image.og=image_og
    - parse.image.twitter=image_twitter
    - parse.image.link_src=image_link_src
    - parse.img_width=thumbnail_width
    - parse.img_height=thumbnail_height
    - parse.site.og=site_og
    - parse.site.app_name=site_app_name
    - parse.site.apple_title=site_apple_title

    - parse.og_type=og_type
    - parse.og_locale=og_locale
    # --- Tech & UI ---
    - parse.favicon_hrefs=favicon_hrefs
    - parse.favicon_sizes=favicon_sizes
    - parse.apple_icon=apple_icon

    # --- Content Cluster ---
    - parse.content.cleaned=content
    - parse.content.article=content_article
    - parse.content.main=content_article_main
    - parse.content.main_role=content_main_role
    - parse.article:ld_json=ld_json

    # --- Graph/Crawler Info ---
    - domain=domain
    - host=host
    - outlinks=outlinks
    - referrer=referenced_by
    - depth=crawl_depth
    - sub_index=sub_index

  # Metrics consumers:
  topology.metrics.consumer.register:
    - class: "org.apache.storm.metric.LoggingMetricsConsumer"
      parallelism.hint: 1

  # 6GB heap per worker (6144 MB)
  worker.heap.memory.mb: 6144
  topology.worker.max.heap.size.mb: 6144.0

  # 2. Selenium Server Endpoint (where the WebDriver is running)
  #selenium.addresses: ["http://172.29.130.2:4444/wd/hub"]

  #selenium.timeouts:
  #  pageLoad: 60000

  #selenium.capabilities:
  #  browserName: "chrome"
  #  # Use 'goog:chromeOptions' which is the standard W3C way for Chrome-specific settings
  #  goog:chromeOptions:
  #    args:
  #      - "--headless"
  #      - "--no-sandbox"
  #      - "--disable-dev-shm-usage"

  # protocol.playwright.launch.options
  playwright.browser.type: "chromium"
  playwright.headless: true
  playwright.launch.args:
    - "--no-sandbox"
    - "--disable-setuid-sandbox"
    - "--disable-dev-shm-usage"
    - "--disable-gpu"
    - "--disable-software-rasterizer"
    - "--disable-extensions"
    - "--disable-background-networking"
    - "--disable-blink-features=AutomationControlled"
    - "--lang=en-US"
  playwright.navigation.timeout: 45000
  playwright.browser.path: "/home/storm/.cache/ms-playwright"
  playwright.browser.launch.options:
    userAgent: "AbcBot/1.0 (+https://abc.com; [email protected])"

  # Essential for WARC
  fetcher.store.http.content: true
  fetcher.store.http.headers: true

  # WARC File Settings
  warc: { "fs.file.impl": "org.apache.hadoop.fs.RawLocalFileSystem" }
  http.store.headers: true

My opensearch-conf.yaml

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# configuration for OpenSearch resources

config:
  # address to use unless a more specific one has been
  # defined for a component
  # also accepts a list or multiple values in a single line
  # separated by a semi-colon e.g. "opensearch1:9200; opensearch2:9200"
  opensearch.addresses: "https://search.sooth.fyi:9200"
  opensearch.user: "admin"
  opensearch.password: "yRF0yn2uiNNa1a0V"
  opensearch.concurrentRequests: 1

  # Disable TLS validation for connection to OpenSearch
  opensearch.disable.tls.validation: true

  # Indexer bolt
  # addresses can be specified as a full URL
  # if not we assume that the protocol is http and the port 9200
  opensearch.indexer.addresses: "https://search.sooth.fyi:9200"
  opensearch.indexer.index.name: "content"
  # opensearch.indexer.pipeline: "_PIPELINE_"
  opensearch.indexer.create: false
  opensearch.indexer.bulkActions: 50
  opensearch.indexer.flushInterval: "5s"
  opensearch.indexer.concurrentRequests: 1
  opensearch.indexer.sniff: false

  # MetricsConsumer
  opensearch.metrics.addresses: "https://search.sooth.fyi:9200"
  opensearch.metrics.index.name: "metrics"
  opensearch.metrics.sniff: false

  # Spout and persistence bolt
  opensearch.status.addresses: "https://search.sooth.fyi:9200"
  opensearch.status.index.name: "status"
  opensearch.status.user: "admin"
  opensearch.status.password: "yRF0yn2uiNNa1a0V"
  # the routing is done on the value of 'partition.url.mode'
  opensearch.status.routing: true
  # stores the value used for grouping the URLs as a separate field
  # needed by the spout implementations
  # also used for routing if the value above is set to true
  opensearch.status.routing.fieldname: "key"
  opensearch.status.bulkActions: 500
  opensearch.status.flushInterval: "5s"
  opensearch.status.concurrentRequests: 1
  opensearch.status.sniff: false

  # spout config #

  # positive or negative filters parsable by the Lucene Query Parser
  # opensearch.status.filterQuery:
  #  - "-(key:stormcrawler.net)"
  #  - "-(key:stormcrawler.apache.org)"

  # time in secs for which the URLs will be considered for fetching after an ack or a fail
  spout.ttl.purgatory: 900

  # Min time (in msecs) to allow between 2 successive queries to OpenSearch
  spout.min.delay.queries: 5000

  # Max time (in msecs) to allow between 2 successive queries to OpenSearch
  spout.max.delay.queries: 60000

  # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time
  # Setting this to -1 or a large value means that OpenSearch will cache the results
  # but also that fewer and fewer results might be returned.
  spout.reset.fetchdate.after: -1

  opensearch.status.max.results: 500
  opensearch.status.max.buckets: 3000
  opensearch.status.max.urls.per.bucket: 1
  # field to group the URLs into buckets
  opensearch.status.bucket.field: "key"
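  # with "key" holding the host (partition.url.mode: byHost) and
  # max.urls.per.bucket set to 1, each spout query should emit
  # at most one URL per host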
  # fields to sort the URLs within a bucket
  opensearch.status.bucket.sort.field:
    - "nextFetchDate"
    - "url"
  # field to sort the buckets
  opensearch.status.global.sort.field: "nextFetchDate"

  # AggregationSpout : sampling improves the performance on large crawls
  opensearch.status.sample: true

  # max allowed duration of a query in sec
  opensearch.status.query.timeout: -1

  # AggregationSpout (expert): adds this value in mins to the latest date returned in the results
  # and uses it as nextFetchDate
  opensearch.status.recentDate.increase: -1
  opensearch.status.recentDate.min.gap: -1

  topology.metrics.consumer.register:
    - class: "org.apache.stormcrawler.opensearch.metrics.MetricsConsumer"
      parallelism.hint: 1
      #whitelist:
      #  - "fetcher_counter"
      #  - "fetcher_average.bytes_fetched"
      #blacklist:
      #  - "__receive.*"

  log4j.logger.org.apache.stormcrawler.opensearch: DEBUG


GitHub link: https://github.com/apache/stormcrawler/discussions/1808
