 bin/get-bugzilla-attachments-by-mimetype | 278 +++++++++++++++++--------------
 1 file changed, 161 insertions(+), 117 deletions(-)
New commits:
commit 17f79d80484c5d5b492efd46e2e52481a17e0095
Author:     Thorsten Behrens <thorsten.behr...@allotropia.de>
AuthorDate: Tue Dec 19 19:42:00 2023 +0100
Commit:     Thorsten Behrens <thorsten.behr...@allotropia.de>
CommitDate: Sun Dec 24 00:19:27 2023 +0100

    get-bz-attachments: some flake8 cleanup

    - fix the most obvious stylistic problems
    - add some brief doc strings
    - remove one instance of dead code, left over from this change:

      commit bd2eee0bd4ae83ff453522b7cf09b69f1b8b5e1b
      Date:   Wed Jun 3 23:41:32 2015 +0200

          get-bugzilla-attachments: avoid FDO-TDF duplicates...

    Change-Id: I88672ae99bc42e9af09ea6033f87240463b2c038
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/161003
    Tested-by: Thorsten Behrens <thorsten.behr...@allotropia.de>
    Reviewed-by: Thorsten Behrens <thorsten.behr...@allotropia.de>

diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
index c9a0d41cf5cc..9df55914360e 100755
--- a/bin/get-bugzilla-attachments-by-mimetype
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -23,53 +23,61 @@
 # it is already downloaded by a previous run, and up-to-date.
 
 from __future__ import print_function
-import feedparser
+
 import base64
 import datetime
 import glob
+import os
+import os.path
 import re
-import os, os.path
 import stat
 import sys
 import threading
 try:
     import queue
-except:
+except Exception:
     import Queue as queue
 try:
     from urllib.request import urlopen
-except:
+except Exception:
     from urllib import urlopen
 try:
     import xmlrpc.client as xmlrpclib
-except:
+except Exception:
     import xmlrpclib
 from xml.dom import minidom
 from xml.sax.saxutils import escape
+
 from attachment_mimetypes import mimetypes
 
+import feedparser
+
+
 def urlopen_retry(url):
+    """Open url, retry 3 times."""
     maxretries = 3
     for i in range(maxretries + 1):
         try:
             return urlopen(url)
         except IOError as e:
-            print("caught IOError: " + str(e))
+            print('caught IOError: ' + str(e))
             if maxretries == i:
                 raise
-            print("retrying...")
+            print('retrying...')
+
 
 def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
-    id = url.rsplit('=', 2)[1]
-    print("id is " + prefix + id + " " + suffix)
-    print("parsing " + id)
-    sock = urlopen_retry(url+"&ctype=xml")
+    """Parse bug xml, download attachments with matching suffix."""
+    bugid = url.rsplit('=', 2)[1]
+    print('id is ' + prefix + bugid + ' ' + suffix)
+    print('parsing ' + bugid)
+    sock = urlopen_retry(url+'&ctype=xml')
     dom = minidom.parse(sock)
     sock.close()
-    attachmentid=0
+    attachmentid = 0
     for attachment in dom.getElementsByTagName('attachment'):
         attachmentid += 1
-        print(" mimetype is", end=' ')
+        print(' mimetype is', end=' ')
         for node in attachment.childNodes:
             if node.nodeName == 'type':
                 # check if attachment is deleted
@@ -87,53 +95,58 @@ def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
                 print('deleted attachment, skipping')
                 continue
 
-        download = suffix + '/' +prefix + id + '-' + str(attachmentid) + '.' + suffix
+        download = (suffix + '/' + prefix + bugid + '-'
+                    + str(attachmentid) + '.' + suffix)
         if os.path.isfile(download):
-            print("assuming " + download + " is up to date")
+            print('assuming ' + download + ' is up to date')
             continue
 
         # prevent re-downloading FDO attachments from TDF
-        if prefix == "tdf" and int(id) < 88776:
-            fdodownload = download.replace("tdf", "fdo")
+        if prefix == 'tdf' and int(bugid) < 88776:
+            fdodownload = download.replace('tdf', 'fdo')
             if os.path.isfile(fdodownload):
-                print("assuming FDO " + fdodownload + " is up to date")
+                print('assuming FDO ' + fdodownload + ' is up to date')
                 continue
 
         print('downloading as ' + download)
-        tmpfile = download + ".tmp"
+        tmpfile = download + '.tmp'
         f = open(tmpfile, 'wb')
         f.write(base64.b64decode(node.firstChild.nodeValue))
         f.close()
         os.rename(tmpfile, download)
         break
 
+
 def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
-    id = url.rsplit('=', 2)[1]
-    print("id is " + prefix + id + " " + suffix)
-    print("parsing " + id)
-    sock = urlopen_retry(url+"&ctype=xml")
+    """Parse bug xml, download attachments with matching suffix."""
+    bugid = url.rsplit('=', 2)[1]
+    print('id is ' + prefix + bugid + ' ' + suffix)
+    print('parsing ' + bugid)
+    sock = urlopen_retry(url+'&ctype=xml')
     dom = minidom.parse(sock)
     sock.close()
-    attachmentid=0
+    attachmentid = 0
     for comment in dom.getElementsByTagName('thetext'):
         commentText = comment.firstChild.nodeValue
-        match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText)
+        match = re.search(r'.*Created an attachment \(id=([0-9]+)\)',
+                          commentText)
         if not match:
             continue
 
         attachmentid += 1
 
-        download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
+        download = (suffix + '/' + prefix + bugid + '-'
+                    + str(attachmentid) + '.' + suffix)
         if os.path.isfile(download):
-            print("assuming " + download + " is up to date")
+            print('assuming ' + download + ' is up to date')
             continue
 
         realAttachmentId = match.group(1)
         handle = urlopen_retry(novellattach + realAttachmentId)
         if not handle:
-            print("attachment %s is not accessible" % realAttachmentId)
+            print('attachment ' + realAttachmentId + ' is not accessible')
             continue
-        print(" mimetype is", end=' ')
+        print(' mimetype is', end=' ')
 
         info = handle.info()
         if info.get_content_type:
@@ -142,40 +155,50 @@ def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
             remoteMime = info.gettype()
         print(remoteMime, end=' ')
         if remoteMime != mimetype:
-            print("skipping")
+            print('skipping')
             continue
 
         print('downloading as ' + download)
-        tmpfile = download + ".tmp"
+        tmpfile = download + '.tmp'
         f = open(tmpfile, 'wb')
         f.write(handle.read())
         f.close()
         os.rename(tmpfile, download)
 
+
 def create_query(mimetype):
-    query = dict()
-    query['query_format']='advanced'
-    query['field0-0-0']='attachments.mimetype'
-    query['type0-0-0']='equals'
-    query['value0-0-0']=mimetype
+    """Query all bugs with suitable mimetype attachments."""
+    query = {}
+    query['query_format'] = 'advanced'
+    query['field0-0-0'] = 'attachments.mimetype'
+    query['type0-0-0'] = 'equals'
+    query['value0-0-0'] = mimetype
     return query
 
+
 def get_downloaded_files(prefix, suffix):
+    """Generate list of existing downloads (matching pre/suffix)."""
     return glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix)))
 
+
 def get_file_bz_ids(files, prefix):
+    """Generate list of existing downloads (matching pre/suffix)."""
     return set([os.path.basename(f).split('-')[0].replace(prefix, '', 1) for f in files])
 
+
 def get_changed_date(files):
+    """Compute date of last downloaded attachment."""
     newest = max([os.stat(f)[stat.ST_MTIME] for f in files])
     # Subtract a day to avoid timezone differences. The worst thing that
     # can happen is that we are going to process more bugs than necessary.
     return datetime.date.fromtimestamp(newest - 24 * 60 * 60)
 
+
 def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
+    """Poke Bugzilla via RPC query."""
     try:
         os.mkdir(suffix)
-    except:
+    except Exception:
         pass
 
     def process(query, full, have=[]):
@@ -189,19 +212,19 @@ def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
             available = set([str(bug['id']) for bug in bugs])
             # we already have files from all available bugs
             if available.difference(set(have)) == set():
-                print("assuming all downloaded files are up to date")
+                print('assuming all downloaded files are up to date')
                 return
 
             for bug in bugs:
                 url = showurl + str(bug['id'])
                 get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
         except xmlrpclib.Fault as err:
-            print("A fault occurred")
-            print("Fault code: %s" % err.faultCode)
+            print('A fault occurred')
+            print('Fault code: ' + err.faultCode)
             print(err.faultString)
 
     query = create_query(mimetype)
-    query['column_list']='bug_id'
+    query['column_list'] = 'bug_id'
 
     files = get_downloaded_files(prefix, suffix)
 
@@ -216,15 +239,19 @@ def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
     print('looking for all bugs having %s attachment(s)' % mimetype)
     process(query, True, get_file_bz_ids(files, prefix))
 
+
 def get_through_rss_query(queryurl, mimetype, prefix, suffix):
+    """Poke Bugzilla via RSS query."""
     try:
         os.mkdir(suffix)
-    except:
+    except Exception:
         pass
 
-    #Getting detailed bug information and downloading an attachment body is not possible without logging in to Novell bugzilla
-    #get_novell_bug_via_xml function is a workaround for that situation
-    get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml
+    # Getting detailed bug information and downloading an attachment
+    # body is not possible without logging in to Novell bugzilla
+    # get_novell_bug_via_xml function is a workaround for that
+    # situation
+    get_bug_function = get_novell_bug_via_xml if prefix == 'novell' else get_from_bug_url_via_xml
 
     def process(query, full, have=[]):
         url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.items()])
@@ -232,16 +259,12 @@ def get_through_rss_query(queryurl, mimetype, prefix, suffix):
         d = feedparser.parse(url)
         print(str(len(d['entries'])) + ' bugs to process')
 
-        entries = []
-        for entry in d['entries']:
-            bugid = entry['id'].split('=')[-1]
-            entries.append(entry)
-
+        entries = d['entries']
         if full:
             available = set([str(entry['id'].split('=')[-1]) for entry in entries])
             # we already have files from all available bugs
             if available.difference(set(have)) == set():
-                print("assuming all downloaded files are up to date")
+                print('assuming all downloaded files are up to date')
                 return
 
         for entry in entries:
@@ -249,11 +272,11 @@ def get_through_rss_query(queryurl, mimetype, prefix, suffix):
                 get_bug_function(entry['id'], mimetype, prefix, suffix)
             except KeyboardInterrupt:
                 raise # Ctrl+C should work
-            except:
-                print(entry['id'] + " failed: " + str(sys.exc_info()[0]))
+            except Exception:
+                print(entry['id'] + ' failed: ' + str(sys.exc_info()[0]))
                 pass
 
-    query = create_query(escape(mimetype.replace("+","%2B")))
+    query = create_query(escape(mimetype.replace('+', '%2B')))
     query['ctype'] = 'rss'
 
     files = get_downloaded_files(prefix, suffix)
 
@@ -269,56 +292,68 @@ def get_through_rss_query(queryurl, mimetype, prefix, suffix):
     print('looking for all bugs having %s attachment(s)' % mimetype)
     process(query, True, get_file_bz_ids(files, prefix))
 
-#since searching bugs having attachments with specific mimetypes is not available in launchpad API
-#we're iterating over all bugs of the most interesting source packages
+
+# since searching bugs having attachments with specific mimetypes is not
+# available in launchpad API:
+# we're iterating over all bugs of the most interesting source packages
 launchpad_pkgs = (
-    "abiword",
-    "calibre",
-    "calligra",
-    "gnumeric",
-    "inkscape",
-    "koffice",
-    "libabw",
-    "libcdr",
-    "libe-book",
-    "libetonyek",
-    "libfreehand",
-    "libmspub",
-    "libmwaw",
-    "liborcus",
-    "libpagemaker",
-    "libreoffice",
-    "libvisio",
-    "libwpd",
-    "libwpg",
-    "libwps",
-    "openoffice.org",
-    "python-uniconvertor",
-    "scribus",
-    "sk1",
-    "unoconv",
+    'abiword',
+    'calibre',
+    'calligra',
+    'gnumeric',
+    'inkscape',
+    'koffice',
+    'libabw',
+    'libcdr',
+    'libe-book',
+    'libetonyek',
+    'libfreehand',
+    'libmspub',
+    'libmwaw',
+    'liborcus',
+    'libpagemaker',
+    'libreoffice',
+    'libvisio',
+    'libwpd',
+    'libwpg',
+    'libwps',
+    'openoffice.org',
+    'python-uniconvertor',
+    'scribus',
+    'sk1',
+    'unoconv',
 )
 
+
 def get_launchpad_bugs(prefix):
-    #launchpadlib python module is required to download launchpad attachments
+    """Query launchpad bugtracker (via launchpadlib)."""
+    # launchpadlib python module is required to download launchpad attachments
    from launchpadlib.launchpad import Launchpad
 
-    launchpad = Launchpad.login_anonymously("attachmentdownload", "production")
-    ubuntu = launchpad.distributions["ubuntu"]
+    launchpad = Launchpad.login_anonymously('attachmentdownload', 'production')
+    ubuntu = launchpad.distributions['ubuntu']
 
     for pkg in launchpad_pkgs:
         srcpkg = ubuntu.getSourcePackage(name=pkg)
-        pkgbugs = srcpkg.searchTasks(status=["New", "Fix Committed", "Invalid", "Won't Fix", "Confirmed", "Triaged", "In Progress", "Incomplete", "Incomplete (with response)", "Incomplete (without response)", "Fix Released", "Opinion", "Expired"])
+        pkgbugs = srcpkg.searchTasks(status=['New', 'Fix Committed', 'Invalid',
+                                             "Won't Fix", 'Confirmed',
+                                             'Triaged', 'In Progress',
+                                             'Incomplete',
+                                             'Incomplete (with response)',
+                                             'Incomplete (without response)',
+                                             'Fix Released', 'Opinion',
+                                             'Expired'])
 
         for bugtask in pkgbugs:
             bug = bugtask.bug
-            id = str(bug.id)
-            print("parsing " + id + " status: " + bugtask.status + " title: " + bug.title[:50])
+            bugid = str(bug.id)
+            print('parsing ' + bugid + ' status: ' + bugtask.status
+                  + ' title: ' + bug.title[:50])
             attachmentid = 0
             for attachment in bug.attachments:
                 attachmentid += 1
                 handle = attachment.data.open()
 
-                if not handle.content_type in mimetypes:
+                if handle.content_type not in mimetypes:
                     #print "skipping"
                     continue
 
@@ -326,32 +361,35 @@ def get_launchpad_bugs(prefix):
                 if not os.path.isdir(suffix):
                     try:
                         os.mkdir(suffix)
-                    except:
+                    except Exception:
                         pass
 
-                download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
+                download = (suffix + '/' + prefix + bugid + '-'
+                            + str(attachmentid) + '.' + suffix)
 
                 if os.path.isfile(download):
-                    print("assuming " + id + " is up to date")
+                    print('assuming ' + bugid + ' is up to date')
                     break
 
-                print('mimetype is ' + handle.content_type + ' downloading as ' + download)
+                print('mimetype is ' + handle.content_type
+                      + ' downloading as ' + download)
 
-                tmpfile = download + ".tmp"
-                f = open(tmpfile, "wb")
+                tmpfile = download + '.tmp'
+                f = open(tmpfile, 'wb')
                 f.write(handle.read())
                 f.close()
                 os.rename(tmpfile, download)
 
+
 rss_bugzillas = (
 # note: currently abisource has an expired TLS cert
-# ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), #added for abiword
-    ( 'fdo', 'http://bugs.freedesktop.org/buglist.cgi' ),
-    ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ),
-# ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
-    ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra
-    ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi' ),
-    ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi' ),
+# ('abi', 'http://bugzilla.abisource.com/buglist.cgi'), #added for abiword
+    ('fdo', 'http://bugs.freedesktop.org/buglist.cgi'),
+    ('gentoo', 'http://bugs.gentoo.org/buglist.cgi'),
+# ('gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
+    ('kde', 'http://bugs.kde.org/buglist.cgi'), # added for koffice/calligra
+    ('mandriva', 'https://qa.mandriva.com/buglist.cgi'),
+    ('moz', 'https://bugzilla.mozilla.org/buglist.cgi'),
 # It seems something has changed and it is no longer possible to
 # download any files from there.
 # NOTE: I am leaving it in the list, commented out, just so someone
@@ -359,19 +397,22 @@ rss_bugzillas = (
 # 'novell': 'https://bugzilla.novell.com/buglist.cgi',
 # note: running this script against bz.apache.org apparently causes one's IP
 # to be banned or something; you won't get new files in any case...
-# ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi' ),
-    ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi' ),
+# ('ooo', 'https://bz.apache.org/ooo/buglist.cgi'),
+    ('tdf', 'http://bugs.documentfoundation.org/buglist.cgi'),
 )
 
 redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
 redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='
 
-#Novell Bugzilla requires users to log in, in order to get details of the bugs such as attachment bodies etc.
-#As a dirty workaround, we parse comments containing "Created an attachment (id=xxxxxx)" and download attachments manually
-#python-bugzilla claims that it supports Novell bugzilla login but it's not working right now and novell bugzilla login
-#system is a nightmare
+# Novell Bugzilla requires users to log in, in order to get details of
+# the bugs such as attachment bodies etc. As a dirty workaround, we
+# parse comments containing "Created an attachment (id=xxxxxx)" and
+# download attachments manually python-bugzilla claims that it
+# supports Novell bugzilla login but it's not working right now and
+# novell bugzilla login system is a nightmare
 novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='
 
+
 class manage_threads(threading.Thread):
     def run(self):
         while 1:
@@ -380,7 +421,7 @@ class manage_threads(threading.Thread):
                 # Get job from queue
                 # Use job parameters to call our query
                 # Then let the queue know we are done with this job
-                (uri, mimetype, prefix, extension) = jobs.get(True,6)
+                (uri, mimetype, prefix, extension) = jobs.get(True, 6)
                 try:
                     get_through_rss_query(uri, mimetype, prefix, extension)
                 finally:
@@ -390,42 +431,45 @@ class manage_threads(threading.Thread):
             except queue.Empty:
                 break
 
+
 def generate_multi_threading():
     # Initialize threads
-    for i in range(max_threads):
+    for _i in range(max_threads):
         manage_threads().start()
 
     for (prefix, uri) in rss_bugzillas:
         # Create a job for every mimetype for a bugzilla
-        for (mimetype,extension) in mimetypes.items():
+        for (mimetype, extension) in mimetypes.items():
             # It seems that bugzilla has problems returning that many results
             # (10000 results is probably a limit set somewhere) so we always
             # end processing the complete list.
             if mimetype == 'text/html' and prefix == 'moz':
-                 continue
+                continue
             jobs.put([uri, mimetype, prefix, extension], block=True)
-            print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix)
+            print('successfully placed a job in the queue searching for '
+                  + mimetype + ' in bugtracker ' + prefix)
 
         # Continue when all mimetypes are done for a bugzilla
-        print("STARTED all bugtracker " + prefix)
+        print('STARTED all bugtracker ' + prefix)
 
     jobs.join()
 
+
 # Number of threads to create, (1 = without multi-threading, default = 20)
 max_threads = int(os.environ.get('PARALLELISM', 20))
 jobs = queue.Queue()
 
 generate_multi_threading()
 
-for (mimetype,extension) in mimetypes.items():
-    get_through_rpc_query(redhatrpc, redhatbug, mimetype, "rhbz", extension)
+for (mimetype, extension) in mimetypes.items():
+    get_through_rpc_query(redhatrpc, redhatbug, mimetype, 'rhbz', extension)
 
 try:
-    get_launchpad_bugs("lp")
+    get_launchpad_bugs('lp')
 except ImportError:
-    print("launchpadlib unavailable, skipping Ubuntu tracker")
+    print('launchpadlib unavailable, skipping Ubuntu tracker')
 
 # vim:set shiftwidth=4 softtabstop=4 expandtab:
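
A note on the recurring "except:" -> "except Exception:" changes above: a
bare except also traps KeyboardInterrupt and SystemExit, which is why the
script needs its explicit "except KeyboardInterrupt: raise" escape hatch.
A minimal sketch, not part of the commit (flaky() is a hypothetical
stand-in for a failing network call), showing the narrowed form:

    # Minimal sketch, not from the commit: 'except Exception:' retries on
    # ordinary errors but still lets KeyboardInterrupt (Ctrl+C) and
    # SystemExit propagate, unlike a bare 'except:'.
    def flaky():
        raise IOError('transient failure')

    for attempt in range(3):
        try:
            flaky()
            break
        except Exception as e:  # Ctrl+C would not be swallowed here
            print('attempt ' + str(attempt) + ' failed: ' + str(e))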