 bin/get-bugzilla-attachments-by-mimetype | 278 +++++++++++++++++--------------
 1 file changed, 161 insertions(+), 117 deletions(-)
New commits:
commit 17f79d80484c5d5b492efd46e2e52481a17e0095
Author:     Thorsten Behrens <thorsten.behr...@allotropia.de>
AuthorDate: Tue Dec 19 19:42:00 2023 +0100
Commit:     Thorsten Behrens <thorsten.behr...@allotropia.de>
CommitDate: Sun Dec 24 00:19:27 2023 +0100

    get-bz-attachments: some flake8 cleanup

    - fix the most obvious stylistic problems
    - add some brief doc strings
    - remove one instance of dead code, left over from this change:

      commit bd2eee0bd4ae83ff453522b7cf09b69f1b8b5e1b
      Date:   Wed Jun 3 23:41:32 2015 +0200

          get-bugzilla-attachments: avoid FDO-TDF duplicates...

    Change-Id: I88672ae99bc42e9af09ea6033f87240463b2c038
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/161003
    Tested-by: Thorsten Behrens <thorsten.behr...@allotropia.de>
    Reviewed-by: Thorsten Behrens <thorsten.behr...@allotropia.de>

diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
index c9a0d41cf5cc..9df55914360e 100755
--- a/bin/get-bugzilla-attachments-by-mimetype
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -23,53 +23,61 @@
 # it is already downloaded by a previous run, and up-to-date.
 
 from __future__ import print_function
-import feedparser
+
 import base64
 import datetime
 import glob
+import os
+import os.path
 import re
-import os, os.path
 import stat
 import sys
 import threading
 try:
     import queue
-except:
+except Exception:
     import Queue as queue
 try:
     from urllib.request import urlopen
-except:
+except Exception:
     from urllib import urlopen
 try:
     import xmlrpc.client as xmlrpclib
-except:
+except Exception:
     import xmlrpclib
 from xml.dom import minidom
 from xml.sax.saxutils import escape
+
 from attachment_mimetypes import mimetypes
 
+import feedparser
+
+
 def urlopen_retry(url):
+    """Open url, retry 3 times."""
     maxretries = 3
     for i in range(maxretries + 1):
         try:
             return urlopen(url)
         except IOError as e:
-            print("caught IOError: " + str(e))
+            print('caught IOError: ' + str(e))
             if maxretries == i:
                 raise
-            print("retrying...")
+            print('retrying...')
+
 
 def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
-    id = url.rsplit('=', 2)[1]
-    print("id is " + prefix + id + " " + suffix)
-    print("parsing " + id)
-    sock = urlopen_retry(url+"&ctype=xml")
+    """Parse bug xml, download attachments with matching suffix."""
+    bugid = url.rsplit('=', 2)[1]
+    print('id is ' + prefix + bugid + ' ' + suffix)
+    print('parsing ' + bugid)
+    sock = urlopen_retry(url+'&ctype=xml')
     dom = minidom.parse(sock)
     sock.close()
-    attachmentid=0
+    attachmentid = 0
     for attachment in dom.getElementsByTagName('attachment'):
         attachmentid += 1
-        print(" mimetype is", end=' ')
+        print(' mimetype is', end=' ')
         for node in attachment.childNodes:
             if node.nodeName == 'type':
                 # check if attachment is deleted
@@ -87,53 +95,58 @@ def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
                 print('deleted attachment, skipping')
                 continue
 
-        download = suffix + '/' +prefix + id + '-' + str(attachmentid) + '.' + suffix
+        download = (suffix + '/' + prefix + bugid + '-'
+                    + str(attachmentid) + '.' + suffix)
         if os.path.isfile(download):
-            print("assuming " + download + " is up to date")
+            print('assuming ' + download + ' is up to date')
             continue
 
         # prevent re-downloading FDO attachments from TDF
-        if prefix == "tdf" and int(id) < 88776:
-            fdodownload = download.replace("tdf", "fdo")
+        if prefix == 'tdf' and int(bugid) < 88776:
+            fdodownload = download.replace('tdf', 'fdo')
             if os.path.isfile(fdodownload):
-                print("assuming FDO " + fdodownload + " is up to date")
+                print('assuming FDO ' + fdodownload + ' is up to date')
                 continue
 
         print('downloading as ' + download)
-        tmpfile = download + ".tmp"
+        tmpfile = download + '.tmp'
         f = open(tmpfile, 'wb')
         f.write(base64.b64decode(node.firstChild.nodeValue))
         f.close()
         os.rename(tmpfile, download)
         break
 
+
 def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
-    id = url.rsplit('=', 2)[1]
-    print("id is " + prefix + id + " " + suffix)
-    print("parsing " + id)
-    sock = urlopen_retry(url+"&ctype=xml")
+    """Parse bug xml, download attachments with matching suffix."""
+    bugid = url.rsplit('=', 2)[1]
+    print('id is ' + prefix + bugid + ' ' + suffix)
+    print('parsing ' + bugid)
+    sock = urlopen_retry(url+'&ctype=xml')
     dom = minidom.parse(sock)
     sock.close()
-    attachmentid=0
+    attachmentid = 0
     for comment in dom.getElementsByTagName('thetext'):
         commentText = comment.firstChild.nodeValue
-        match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText)
+        match = re.search(r'.*Created an attachment \(id=([0-9]+)\)',
+                          commentText)
         if not match:
             continue
 
         attachmentid += 1
 
-        download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
+        download = (suffix + '/' + prefix + bugid + '-'
+                    + str(attachmentid) + '.' + suffix)
         if os.path.isfile(download):
-            print("assuming " + download + " is up to date")
+            print('assuming ' + download + ' is up to date')
             continue
 
         realAttachmentId = match.group(1)
         handle = urlopen_retry(novellattach + realAttachmentId)
         if not handle:
-            print("attachment %s is not accessible" % realAttachmentId)
+            print('attachment ' + realAttachmentId + ' is not accessible')
             continue
-        print(" mimetype is", end=' ')
+        print(' mimetype is', end=' ')
 
         info = handle.info()
         if info.get_content_type:
@@ -142,40 +155,50 @@ def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
             remoteMime = info.gettype()
         print(remoteMime, end=' ')
         if remoteMime != mimetype:
-            print("skipping")
+            print('skipping')
             continue
 
         print('downloading as ' + download)
-        tmpfile = download + ".tmp"
+        tmpfile = download + '.tmp'
         f = open(tmpfile, 'wb')
         f.write(handle.read())
         f.close()
         os.rename(tmpfile, download)
 
+
 def create_query(mimetype):
-    query = dict()
-    query['query_format']='advanced'
-    query['field0-0-0']='attachments.mimetype'
-    query['type0-0-0']='equals'
-    query['value0-0-0']=mimetype
+    """Query all bugs with suitable mimetype attachments."""
+    query = {}
+    query['query_format'] = 'advanced'
+    query['field0-0-0'] = 'attachments.mimetype'
+    query['type0-0-0'] = 'equals'
+    query['value0-0-0'] = mimetype
     return query
 
+
 def get_downloaded_files(prefix, suffix):
+    """Generate list of existing downloads (matching pre/suffix)."""
     return glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix)))
 
+
 def get_file_bz_ids(files, prefix):
+    """Generate list of existing downloads (matching pre/suffix)."""
     return set([os.path.basename(f).split('-')[0].replace(prefix, '', 1) for f in files])
 
+
 def get_changed_date(files):
+    """Compute date of last downloaded attachment."""
     newest = max([os.stat(f)[stat.ST_MTIME] for f in files])
     # Subtract a day to avoid timezone differences. The worst thing that
     # can happen is that we are going to process more bugs than necessary.
     return datetime.date.fromtimestamp(newest - 24 * 60 * 60)
 
+
 def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
+    """Poke Bugzilla via RPC query."""
     try:
         os.mkdir(suffix)
-    except:
+    except Exception:
         pass
 
     def process(query, full, have=[]):
@@ -189,19 +212,19 @@ def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
             available = set([str(bug['id']) for bug in bugs])
             # we already have files from all available bugs
             if available.difference(set(have)) == set():
-                print("assuming all downloaded files are up to date")
+                print('assuming all downloaded files are up to date')
                 return
 
             for bug in bugs:
                 url = showurl + str(bug['id'])
                 get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
         except xmlrpclib.Fault as err:
-            print("A fault occurred")
-            print("Fault code: %s" % err.faultCode)
+            print('A fault occurred')
+            print('Fault code: ' + err.faultCode)
             print(err.faultString)
 
     query = create_query(mimetype)
-    query['column_list']='bug_id'
+    query['column_list'] = 'bug_id'
 
     files = get_downloaded_files(prefix, suffix)
 
@@ -216,15 +239,19 @@ def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
     print('looking for all bugs having %s attachment(s)' % mimetype)
     process(query, True, get_file_bz_ids(files, prefix))
 
+
 def get_through_rss_query(queryurl, mimetype, prefix, suffix):
+    """Poke Bugzilla via RSS query."""
     try:
         os.mkdir(suffix)
-    except:
+    except Exception:
         pass
 
-    #Getting detailed bug information and downloading an attachment body is not possible without logging in to Novell bugzilla
-    #get_novell_bug_via_xml function is a workaround for that situation
-    get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml
+    # Getting detailed bug information and downloading an attachment
+    # body is not possible without logging in to Novell bugzilla
+    # get_novell_bug_via_xml function is a workaround for that
+    # situation
+    get_bug_function = get_novell_bug_via_xml if prefix == 'novell' else get_from_bug_url_via_xml
 
     def process(query, full, have=[]):
         url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.items()])
@@ -232,16 +259,12 @@ def get_through_rss_query(queryurl, mimetype, prefix, suffix):
         d = feedparser.parse(url)
         print(str(len(d['entries'])) + ' bugs to process')
 
-        entries = []
-        for entry in d['entries']:
-            bugid = entry['id'].split('=')[-1]
-            entries.append(entry)
-
+        entries = d['entries']
         if full:
             available = set([str(entry['id'].split('=')[-1]) for entry in entries])
             # we already have files from all available bugs
             if available.difference(set(have)) == set():
-                print("assuming all downloaded files are up to date")
+                print('assuming all downloaded files are up to date')
                 return
 
         for entry in entries:
@@ -249,11 +272,11 @@ def get_through_rss_query(queryurl, mimetype, prefix, suffix):
                 get_bug_function(entry['id'], mimetype, prefix, suffix)
             except KeyboardInterrupt:
                 raise # Ctrl+C should work
-            except:
-                print(entry['id'] + " failed: " + str(sys.exc_info()[0]))
+            except Exception:
+                print(entry['id'] + ' failed: ' + str(sys.exc_info()[0]))
                 pass
 
-    query = create_query(escape(mimetype.replace("+","%2B")))
+    query = create_query(escape(mimetype.replace('+', '%2B')))
     query['ctype'] = 'rss'
 
     files = get_downloaded_files(prefix, suffix)
 
@@ -269,56 +292,68 @@ def get_through_rss_query(queryurl, mimetype, prefix, suffix):
     print('looking for all bugs having %s attachment(s)' % mimetype)
     process(query, True, get_file_bz_ids(files, prefix))
 
-#since searching bugs having attachments with specific mimetypes is not available in launchpad API
-#we're iterating over all bugs of the most interesting source packages
+
+# since searching bugs having attachments with specific mimetypes is not
+# available in launchpad API:
+# we're iterating over all bugs of the most interesting source packages
 launchpad_pkgs = (
-    "abiword",
-    "calibre",
-    "calligra",
-    "gnumeric",
-    "inkscape",
-    "koffice",
-    "libabw",
-    "libcdr",
-    "libe-book",
-    "libetonyek",
-    "libfreehand",
-    "libmspub",
-    "libmwaw",
-    "liborcus",
-    "libpagemaker",
-    "libreoffice",
-    "libvisio",
-    "libwpd",
-    "libwpg",
-    "libwps",
-    "openoffice.org",
-    "python-uniconvertor",
-    "scribus",
-    "sk1",
-    "unoconv",
+    'abiword',
+    'calibre',
+    'calligra',
+    'gnumeric',
+    'inkscape',
+    'koffice',
+    'libabw',
+    'libcdr',
+    'libe-book',
+    'libetonyek',
+    'libfreehand',
+    'libmspub',
+    'libmwaw',
+    'liborcus',
+    'libpagemaker',
+    'libreoffice',
+    'libvisio',
+    'libwpd',
+    'libwpg',
+    'libwps',
+    'openoffice.org',
+    'python-uniconvertor',
+    'scribus',
+    'sk1',
+    'unoconv',
 )
 
+
 def get_launchpad_bugs(prefix):
-    #launchpadlib python module is required to download launchpad attachments
+    """Query launchpad bugtracker (via launchpadlib)."""
+    # launchpadlib python module is required to download launchpad attachments
    from launchpadlib.launchpad import Launchpad
 
-    launchpad = Launchpad.login_anonymously("attachmentdownload", "production")
-    ubuntu = launchpad.distributions["ubuntu"]
+    launchpad = Launchpad.login_anonymously('attachmentdownload', 'production')
+    ubuntu = launchpad.distributions['ubuntu']
 
     for pkg in launchpad_pkgs:
         srcpkg = ubuntu.getSourcePackage(name=pkg)
-        pkgbugs = srcpkg.searchTasks(status=["New", "Fix Committed", "Invalid", "Won't Fix", "Confirmed", "Triaged", "In Progress", "Incomplete", "Incomplete (with response)", "Incomplete (without response)", "Fix Released", "Opinion", "Expired"])
+        pkgbugs = srcpkg.searchTasks(status=['New', 'Fix Committed', 'Invalid',
+                                             "Won't Fix", 'Confirmed',
+                                             'Triaged', 'In Progress',
+                                             'Incomplete',
+                                             'Incomplete (with response)',
+                                             'Incomplete (without response)',
+                                             'Fix Released', 'Opinion',
+                                             'Expired'])
 
         for bugtask in pkgbugs:
             bug = bugtask.bug
-            id = str(bug.id)
-            print("parsing " + id + " status: " + bugtask.status + " title: " + bug.title[:50])
+            bugid = str(bug.id)
+            print('parsing ' + bugid + ' status: ' + bugtask.status
+                  + ' title: ' + bug.title[:50])
             attachmentid = 0
             for attachment in bug.attachments:
                 attachmentid += 1
                 handle = attachment.data.open()
 
-                if not handle.content_type in mimetypes:
+                if handle.content_type not in mimetypes:
                     #print "skipping"
                     continue
 
@@ -326,32 +361,35 @@ def get_launchpad_bugs(prefix):
                 if not os.path.isdir(suffix):
                     try:
                         os.mkdir(suffix)
-                    except:
+                    except Exception:
                         pass
 
-                download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
+                download = (suffix + '/' + prefix + bugid + '-'
+                            + str(attachmentid) + '.' + suffix)
 
                 if os.path.isfile(download):
-                    print("assuming " + id + " is up to date")
+                    print('assuming ' + bugid + ' is up to date')
                     break
 
-                print('mimetype is ' + handle.content_type + ' downloading as ' + download)
+                print('mimetype is ' + handle.content_type
+                      + ' downloading as ' + download)
 
-                tmpfile = download + ".tmp"
-                f = open(tmpfile, "wb")
+                tmpfile = download + '.tmp'
+                f = open(tmpfile, 'wb')
                 f.write(handle.read())
                 f.close()
                 os.rename(tmpfile, download)
 
+
 rss_bugzillas = (
 # note: currently abisource has an expired TLS cert
-# ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), #added for abiword
-    ( 'fdo', 'http://bugs.freedesktop.org/buglist.cgi' ),
-    ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ),
-# ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
-    ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra
-    ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi' ),
-    ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi' ),
+# ('abi', 'http://bugzilla.abisource.com/buglist.cgi'), #added for abiword
+    ('fdo', 'http://bugs.freedesktop.org/buglist.cgi'),
+    ('gentoo', 'http://bugs.gentoo.org/buglist.cgi'),
+# ('gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
+    ('kde', 'http://bugs.kde.org/buglist.cgi'), # added for koffice/calligra
+    ('mandriva', 'https://qa.mandriva.com/buglist.cgi'),
+    ('moz', 'https://bugzilla.mozilla.org/buglist.cgi'),
 # It seems something has changed and it is no longer possible to
 # download any files from there.
 # NOTE: I am leaving it in the list, commented out, just so someone
@@ -359,19 +397,22 @@ rss_bugzillas = (
 # 'novell': 'https://bugzilla.novell.com/buglist.cgi',
 # note: running this script against bz.apache.org apparently causes one's IP
 # to be banned or something; you won't get new files in any case...
-# ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi' ),
-    ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi' ),
+# ('ooo', 'https://bz.apache.org/ooo/buglist.cgi'),
+    ('tdf', 'http://bugs.documentfoundation.org/buglist.cgi'),
 )
 
 redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
 redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='
 
-#Novell Bugzilla requires users to log in, in order to get details of the bugs such as attachment bodies etc.
-#As a dirty workaround, we parse comments containing "Created an attachment (id=xxxxxx)" and download attachments manually
-#python-bugzilla claims that it supports Novell bugzilla login but it's not working right now and novell bugzilla login
-#system is a nightmare
+# Novell Bugzilla requires users to log in, in order to get details of
+# the bugs such as attachment bodies etc. As a dirty workaround, we
+# parse comments containing "Created an attachment (id=xxxxxx)" and
+# download attachments manually python-bugzilla claims that it
+# supports Novell bugzilla login but it's not working right now and
+# novell bugzilla login system is a nightmare
 novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='
 
+
 class manage_threads(threading.Thread):
     def run(self):
         while 1:
@@ -380,7 +421,7 @@ class manage_threads(threading.Thread):
                 # Get job from queue
                 # Use job parameters to call our query
                 # Then let the queue know we are done with this job
-                (uri, mimetype, prefix, extension) = jobs.get(True,6)
+                (uri, mimetype, prefix, extension) = jobs.get(True, 6)
                 try:
                     get_through_rss_query(uri, mimetype, prefix, extension)
                 finally:
@@ -390,42 +431,45 @@ class manage_threads(threading.Thread):
             except queue.Empty:
                 break
 
+
 def generate_multi_threading():
     # Initialize threads
-    for i in range(max_threads):
+    for _i in range(max_threads):
         manage_threads().start()
 
     for (prefix, uri) in rss_bugzillas:
         # Create a job for every mimetype for a bugzilla
-        for (mimetype,extension) in mimetypes.items():
+        for (mimetype, extension) in mimetypes.items():
             # It seems that bugzilla has problems returning that many results
             # (10000 results is probably a limit set somewhere) so we always
             # end processing the complete list.
             if mimetype == 'text/html' and prefix == 'moz':
-                 continue
+                continue
             jobs.put([uri, mimetype, prefix, extension], block=True)
-            print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix)
+            print('successfully placed a job in the queue searching for '
+                  + mimetype + ' in bugtracker ' + prefix)
 
         # Continue when all mimetypes are done for a bugzilla
-        print("STARTED all bugtracker " + prefix)
+        print('STARTED all bugtracker ' + prefix)
 
     jobs.join()
 
+
 # Number of threads to create, (1 = without multi-threading, default = 20)
 max_threads = int(os.environ.get('PARALLELISM', 20))
 jobs = queue.Queue()
 
 generate_multi_threading()
 
-for (mimetype,extension) in mimetypes.items():
-    get_through_rpc_query(redhatrpc, redhatbug, mimetype, "rhbz", extension)
+for (mimetype, extension) in mimetypes.items():
+    get_through_rpc_query(redhatrpc, redhatbug, mimetype, 'rhbz', extension)
 
 try:
-    get_launchpad_bugs("lp")
+    get_launchpad_bugs('lp')
 except ImportError:
-    print("launchpadlib unavailable, skipping Ubuntu tracker")
+    print('launchpadlib unavailable, skipping Ubuntu tracker')
 
 # vim:set shiftwidth=4 softtabstop=4 expandtab:
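
A note on the recurring "except:" -> "except Exception:" changes above: a
bare except also traps KeyboardInterrupt and SystemExit, which is why the
script needs its explicit "except KeyboardInterrupt: raise" escape hatch.
A minimal sketch, not part of the commit (flaky() is a hypothetical
stand-in for a failing network call), showing the narrowed form:

    # Minimal sketch, not from the commit: 'except Exception:' retries on
    # ordinary errors but still lets KeyboardInterrupt (Ctrl+C) and
    # SystemExit propagate, unlike a bare 'except:'.
    def flaky():
        raise IOError('transient failure')

    for attempt in range(3):
        try:
            flaky()
            break
        except Exception as e:  # Ctrl+C would not be swallowed here
            print('attempt ' + str(attempt) + ' failed: ' + str(e))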