Hello,
this module is used by `mailman3-web`. It is called from a cron job
every minute, and the warnings make cron send an email, because the
output of the cron job is no longer empty.
See https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1082541
As Ben Sturmfels has already noticed, the module contains a massive list
of strings, which are regular expressions. To fix this (a short
before/after sketch follows the list):
- replace \- and \/ with - and / respectively, as neither is a special
  regular-expression character.
- change any remaining string containing \. into a raw string, e.g.
  r'…\.…'.
- there is one \', which exists so that the quote does NOT terminate the
  Python string.
- there are two \s, which represent the class of whitespace characters.
- there are two \\, which represent a literal backslash.
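For illustration, a hypothetical before/after for two such entries
(simplified; the module's actual entries are similar):

    # before: \/ and \. trigger "SyntaxWarning: invalid escape sequence"
    'download\/express',
    'acme\.spider',
    # after: a plain / needs no escape; a raw string keeps \. for the regex
    'download/express',
    r'acme\.spider',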
It's unfortunate that upstream is archived, but understandable, as the
list of robots is not static but ever-changing. Even its upstream source
for the DB seems dead: http://www.robotstxt.org/db.html
PS: The implementation is also very inefficient, as it uses
any(p.search(…) for p in …) over hundreds of individually compiled
patterns.
It is generally much more efficient to combine all the regular
expressions with | and compile them into a single one. Even better is to
use a trie, as Python builds a very inefficient regular-expression
automaton when given a very large set of alternatives.
Rust has https://docs.rs/regex/latest/regex/struct.RegexSet.html for
this.
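As a sketch in Python (a minimal version, assuming the list of pattern
strings is still called robot_useragents):

    import re

    # Wrap each alternative in a non-capturing group so '|' binds
    # correctly, then compile the whole set into a single automaton.
    combined = re.compile('|'.join('(?:%s)' % p for p in robot_useragents))

    def is_robot_fast(user_agent):
        # One search over one compiled pattern instead of one per entry.
        return combined.search(user_agent.lower()) is not None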
Philipp
--
/ / (_)__ __ ____ __ Philipp Hahn 🐓
/ /__/ / _ \/ // /\ \/ /
/____/_/_//_/\_,_/ /_/\_\ [email protected]
from __future__ import print_function

import codecs
import os.path
import re
import sys
robot_useragents = [
'appie',
'architext',
'jeeves',
'bjaaland',
'contentmatch',
'ferret',
'googlebot',
'google-sitemaps',
'gulliver',
'virus[_+ ]detector', # Must be before harvest
'harvest',
'htdig',
'linkwalker',
'lilina',
'lycos[_+ ]',
'moget',
'muscatferret',
'myweb',
'nomad',
'scooter',
'slurp',
'^voyager/',
'weblayers',
# Common robots (Not in robot file)
'antibot',
'bruinbot',
'digout4u',
'echo!',
'fast-webcrawler',
r'ia_archiver-web\.archive\.org', # Must be before ia_archiver to avoid confusion with alexa
'ia_archiver',
'jennybot',
'mercator',
'netcraft',
'msnbot-media',
'msnbot',
'petersnews',
r'relevantnoise\.com',
'unlost_web_crawler',
'voila',
'webbase',
'webcollage',
'cfetch',
'zyborg', # Must be before wisenut
'wisenutbot',
# Less common robots (In robot file)
'[^a]fish',
'abcdatos',
r'acme\.spider',
'ahoythehomepagefinder',
'alkaline',
'anthill',
'arachnophilia',
'arale',
'araneo',
'aretha',
'ariadne',
'powermarks',
'arks',
'aspider',
r'atn\.txt',
'atomz',
'auresys',
'backrub',
'bbot',
'bigbrother',
'blackwidow',
'blindekuh',
'bloodhound',
'borg-bot',
'brightnet',
'bspider',
'cactvschemistryspider',
'calif[^r]',
'cassandra',
'cgireader',
'checkbot',
'christcrawler',
'churl',
'cienciaficcion',
'collective',
'combine',
'conceptbot',
'coolbot',
'core',
'cosmos',
'cruiser',
'cusco',
'cyberspyder',
'desertrealm',
'deweb',
'dienstspider',
'digger',
'diibot',
'direct_hit',
'dnabot',
'download_express',
'dragonbot',
'dwcp',
'e-collector',
'ebiness',
'elfinbot',
'emacs',
'emcspider',
'esther',
'evliyacelebi',
'fastcrawler',
'feedcrawl',
'fdse',
'felix',
'fetchrover',
'fido',
'finnish',
'fireball',
'fouineur',
'francoroute',
'freecrawl',
'funnelweb',
'gama',
'gazz',
'gcreep',
'getbot',
'geturl',
'golem',
'gougou',
'grapnel',
'griffon',
'gromit',
'gulperbot',
'hambot',
'havindex',
'hometown',
'htmlgobble',
'hyperdecontextualizer',
'iajabot',
'iaskspider',
'hl_ftien_spider',
'sogou',
'iconoclast',
'ilse',
'imagelock',
'incywincy',
'informant',
'infoseek',
'infoseeksidewinder',
'infospider',
'inspectorwww',
'intelliagent',
'irobot',
'iron33',
'israelisearch',
'javabee',
'jbot',
'jcrawler',
'jobo',
'jobot',
'joebot',
'jubii',
'jumpstation',
'kapsi',
'katipo',
'kilroy',
'ko[_+ ]yappo[_+ ]robot',
'kummhttp',
r'labelgrabber\.txt',
'larbin',
'legs',
'linkidator',
'linkscan',
'lockon',
'logo_gif',
'macworm',
'magpie',
'marvin',
'mattie',
'mediafox',
'merzscope',
'meshexplorer',
'mindcrawler',
'mnogosearch',
'momspider',
'monster',
'motor',
'muncher',
'mwdsearch',
'ndspider',
r'nederland\.zoek',
'netcarta',
'netmechanic',
'netscoop',
'newscan-online',
'nhse',
'northstar',
'nzexplorer',
'objectssearch',
'occam',
'octopus',
'openfind',
'orb_search',
'packrat',
'pageboy',
'parasite',
'patric',
'pegasus',
'perignator',
'perlcrawler',
'phantom',
'phpdig',
'piltdownman',
'pimptrain',
'pioneer',
'pitkow',
'pjspider',
'plumtreewebaccessor',
'poppi',
'portalb',
'psbot',
'python',
'raven',
'rbse',
'resumerobot',
'rhcs',
'road_runner',
'robbie',
'robi',
'robocrawl',
'robofox',
'robozilla',
'roverbot',
'rules',
'safetynetrobot',
'search-info',
'search_au',
'searchprocess',
'senrigan',
'sgscout',
'shaggy',
'shaihulud',
'sift',
'simbot',
'site-valet',
'sitetech',
'skymob',
'slcrawler',
'smartspider',
'snooper',
'solbot',
'speedy',
'spider[_+ ]monkey',
'spiderbot',
'spiderline',
'spiderman',
'spiderview',
'spry',
'sqworm',
'ssearcher',
'suke',
'sunrise',
'suntek',
'sven',
'tach_bw',
'tagyu_agent',
'tailrank',
'tarantula',
'tarspider',
'techbot',
'templeton',
'titan',
'titin',
'tkwww',
'tlspider',
'ucsd',
'udmsearch',
'universalfeedparser',
'urlck',
'valkyrie',
'verticrawl',
'victoria',
'visionsearch',
'voidbot',
'vwbot',
'w3index',
'w3m2',
'wallpaper',
'wanderer',
'wapspider',
'webbandit',
'webcatcher',
'webcopy',
'webfetcher',
'webfoot',
'webinator',
'weblinker',
'webmirror',
'webmoose',
'webquest',
'webreader',
'webreaper',
'websnarf',
'webspider',
'webvac',
'webwalk',
'webwalker',
'webwatch',
'whatuseek',
'whowhere',
'wired-digital',
'wmir',
'wolp',
'wombat',
'wordpress',
'worm',
'woozweb',
'wwwc',
'wz101',
'xget',
# Other robots reported by users
'1-more_scanner',
'accoona-ai-agent',
'activebookmark',
'adamm_bot',
'almaden',
'aipbot',
'aleadsoftbot',
'alpha_search_agent',
'allrati',
'aport',
r'archive\.org_bot',
'argus', # Must be before nutch
r'arianna\.libero\.it',
'aspseek',
'asterias',
'awbot',
'baiduspider',
'becomebot',
'bender',
'betabot',
'biglotron',
'bittorrent_bot',
'biz360[_+ ]spider',
'blogbridge[_+ ]service',
'bloglines',
'blogpulse',
'blogsearch',
'blogshares',
'blogslive',
'blogssay',
r'bncf\.firenze\.sbn\.it/raccolta\.txt',
'bobby',
r'boitho\.com-dc',
'bookmark-manager',
'boris',
'bumblebee',
'candlelight[_+ ]favorites[_+ ]inspector',
'cbn00glebot',
'cerberian_drtrs',
'cfnetwork',
'cipinetbot',
'checkweb_link_validator',
'commons-httpclient',
'computer_and_automation_research_institute_crawler',
'converamultimediacrawler',
'converacrawler',
'cscrawler',
'cse_html_validator_lite_online',
'cuasarbot',
'cursor',
'custo',
'datafountains/dmoz_downloader',
'daviesbot',
'daypopbot',
'deepindex',
r'dipsie\.bot',
'dnsgroup',
'domainchecker',
r'domainsdb\.net',
'dulance',
'dumbot',
r'dumm\.de-bot',
r'earthcom\.info',
'easydl',
'edgeio-retriever',
'ets_v',
'exactseek',
'extreme[_+ ]picture[_+ ]finder',
'eventax',
'everbeecrawler',
'everest-vulcan',
'ezresult',
'enteprise',
'facebook',
r'fast_enterprise_crawler.*crawleradmin\.t-info@telekom\.de',
r'fast_enterprise_crawler.*t-info_bi_cluster_crawleradmin\.t-info@telekom\.de',
r'matrix_s\.p\.a\._-_fast_enterprise_crawler', # must come before fast enterprise crawler
'fast_enterprise_crawler',
'fast-search-engine',
'favicon',
'favorg',
'favorites_sweeper',
'feedburner',
'feedfetcher-google',
'feedflow',
'feedster',
'feedsky',
'feedvalidator',
'filmkamerabot',
'findlinks',
'findexa_crawler',
r'fooky\.com/ScorpionBot',
'g2crawler',
'gaisbot',
'geniebot',
'gigabot',
'girafabot',
'global_fetch',
'gnodspider',
r'goforit\.com',
'goforitbot',
'gonzo',
'grub',
'gpu_p2p_crawler',
'henrythemiragorobot',
'heritrix',
'holmes',
'hoowwwer',
'hpprint',
'htmlparser',
'html[_+ ]link[_+ ]validator',
'httrack',
r'hundesuche\.com-bot',
'ichiro',
'iltrovatore-setaccio',
'infobot',
'infociousbot',
'infomine',
'insurancobot',
'internet[_+ ]ninja',
'internetarchive',
'internetseer',
'internetsupervision',
'irlbot',
'isearch2006',
'iupui_research_bot',
'jrtwine[_+ ]software[_+ ]check[_+ ]favorites[_+ ]utility',
'justview',
'kalambot',
r'kamano\.de_newsfeedverzeichnis',
'kazoombot',
'kevin',
'keyoshid', # Must come before Y!J
'kinjabot',
'kinja-imagebot',
'knowitall',
r'knowledge\.com',
'kouaa_krawler',
'krugle',
'ksibot',
'kurzor',
'lanshanbot',
r'letscrawl\.com',
'libcrawl',
'linkbot',
'link_valet_online',
'metager-linkchecker', # Must be before linkchecker
'linkchecker',
r'livejournal\.com',
'lmspider',
'lwp-request',
'lwp-trivial',
'magpierss',
r'mail\.ru',
r'mapoftheinternet\.com',
'mediapartners-google',
'megite',
'metaspinner',
'microsoft[_+ ]url[_+ ]control',
'mini-reptile',
'minirank',
'missigua_locator',
'misterbot',
'miva',
'mizzu_labs',
'mj12bot',
'mojeekbot',
'msiecrawler',
r'ms_search_4\.0_robot',
'msrabot',
'msrbot',
'mt::telegraph::agent',
'nagios',
'nasa_search',
'mydoyouhike',
'netluchs',
'netsprint',
'newsgatoronline',
'nicebot',
'nimblecrawler',
'noxtrumbot',
'npbot',
'nutchcvs',
'nutchosu-vlib',
'nutch', # Must come after other nutch versions
'ocelli',
'octora_beta_bot',
'omniexplorer[_+ ]bot',
r'onet\.pl[_+ ]sa',
'onfolio',
'opentaggerbot',
'openwebspider',
'oracle_ultra_search',
'orbiter',
'yodaobot',
'qihoobot',
r'passwordmaker\.org',
'pear_http_request_class',
'peerbot',
'perman',
'php[_+ ]version[_+ ]tracker',
'pictureofinternet',
r'ping\.blo\.gs',
'plinki',
'pluckfeedcrawler',
'pogodak',
'pompos',
'popdexter',
'port_huron_labs',
'postfavorites',
'projectwf-java-test-crawler',
'proodlebot',
'pyquery',
'rambler',
'redalert',
'rojo',
'rssimagesbot',
'ruffle',
'rufusbot',
'sandcrawler',
'sbider',
'schizozilla',
'scumbot',
'searchguild[_+ ]dmoz[_+ ]experiment',
'seekbot',
'sensis_web_crawler',
'seznambot',
'shim-crawler',
'shoutcast',
'slysearch',
r'snap\.com_beta_crawler',
'sohu-search',
'sohu', # "sohu agent"
'snappy',
'sphere_scout',
'spip',
'sproose_crawler',
'steeler',
'steroid__download',
'suchfin-bot',
'superbot',
'surveybot',
'susie',
'syndic8',
'syndicapi',
'synoobot',
'tcl_http_client_package',
'technoratibot',
'teragramcrawlersurf',
'test_crawler',
'testbot',
't-h-u-n-d-e-r-s-t-o-n-e',
'topicblogs',
'turnitinbot',
'turtlescanner', # Must be before turtle
'turtle',
'tutorgigbot',
'twiceler',
'ubicrawler',
'ultraseek',
'unchaos_bot_hybrid_web_search_engine',
'unido-bot',
'updated',
'ustc-semantic-group',
'vagabondo-wap',
'vagabondo',
'vermut',
r'versus_crawler_from_eda\.baykan@epfl\.ch',
'vespa_crawler',
'vortex',
'vse/',
'w3c-checklink',
'w3c[_+ ]css[_+ ]validator[_+ ]jfouffa',
'w3c_validator',
'watchmouse',
'wavefire',
r'webclipping\.com',
'webcompass',
r'webcrawl\.net',
'web_downloader',
'webdup',
'webfilter',
'webindexer',
'webminer',
'website[_+ ]monitoring[_+ ]bot',
'webvulncrawl',
'wells_search',
'wonderer',
'wume_crawler',
'wwweasel',
'xenu\'s_link_sleuth',
'xenu_link_sleuth',
'xirq',
'y!j', # Must come after keyoshid Y!J
'yacy',
'yahoo-blogs',
'yahoo-verticalcrawler',
'yahoofeedseeker',
'yahooseeker-testing',
'yahooseeker',
'yahoo-mmcrawler',
'yahoo!_mindset',
'yandex',
'flexum',
'yanga',
'yooglifetchagent',
'z-add_link_checker',
'zealbot',
'zhuaxia',
'zspider',
'zeus',
r'ng/1\.', # put at end to avoid false positive
r'ng/2\.', # put at end to avoid false positive
'exabot', # put at end to avoid false positive
# Other id that are 99% of robots
'wget',
'libwww',
'java/[0-9]',  # put at end to avoid false positive
# Generic robot
'robot',
'checker',
'crawl',
'discovery',
'hunter',
'scanner',
'spider',
'sucker',
r'bot[\s_+:,.;/\\-]',
r'[\s_+:,\.;/\\-]bot',
'no_user_agent',
# manually added
'yeti',
]

# Pre-compile all patterns once at import time. Keep the plain strings
# in robot_useragents so _parse_db_export() below can still compare them.
robot_regexps = [re.compile(x) for x in robot_useragents]

def is_robot(user_agent):
    if not isinstance(user_agent, str):
        raise TypeError("user_agent must be a string")
    if len(user_agent) == 0:
        raise ValueError("user_agent must not be empty")
    try:
        # See if any one of the patterns matches.
        return any(regexp.search(user_agent.lower()) for regexp in robot_regexps)
    except UnicodeDecodeError:
        # Python 2 leftover: user_agent might contain malformed bytes.
        # Strip everything that is not plain ASCII and try again; the
        # original re-encoded to bytes, which str patterns cannot search.
        cleaned = user_agent.lower().encode('ascii', 'ignore').decode('ascii')
        return any(regexp.search(cleaned) for regexp in robot_regexps)
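
# Example of the expected behaviour (illustrative user-agent strings):
#   >>> is_robot('Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)')
#   True
#   >>> is_robot('Mozilla/5.0 (X11; Linux x86_64; rv:115.0) Gecko/20100101 Firefox/115.0')
#   False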

def _parse_db_export(filename):
    assert os.path.isfile(filename)
    exclude_ua = set()
    with codecs.open(filename, encoding="latin1") as db_export:
        for line in db_export:
            if not line.startswith("robot-exclusion-useragent:"):
                continue
            dont_care, ua = line.strip().split(":", 1)
            ua = ua.strip()
            if ' or ' in ua:
                uas = ua.split(" or ")
                # remove surrounding quotes
                uas = [x[1:-1] if (x[0] in ('"', "'") and x[-1] in ('"', "'")) else x
                       for x in uas]
            else:
                uas = [ua]
            for ua in uas:
                # don't include nonsense entries
                if ua.lower() not in ('', '*', 'n/a', 'none', 'yes', 'no',
                                      "due to a deficiency in java it's not currently possible to set the user-agent."):
                    exclude_ua.add(ua)
    # Compare as sets: robot_useragents is a list, so the original
    # `robot_useragents != exclude_ua` was always True.
    if set(robot_useragents) != exclude_ua:
        print("robot_detection is out of date. Here's the new robot_useragents variable:")
        print(exclude_ua)
    else:
        print("No changes, robot_detection is up to date")

if __name__ == '__main__' and len(sys.argv) == 2:
_parse_db_export(sys.argv[1])
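
# Usage sketch (the filename is hypothetical; any plain-text export of
# the robots DB containing "robot-exclusion-useragent:" lines will do):
#
#   python robot_detection.py all.txt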