bin/get-bugzilla-attachments-by-mimetype | 80 +++++++++++++++++-------------- 1 file changed, 45 insertions(+), 35 deletions(-)
New commits: commit 56763e94bf6f59dde3e33e522553eb39b77e81a2 Author: Michael Stahl <mst...@redhat.com> Date: Thu Jun 4 16:57:59 2015 +0200 get-bugzilla-attachments: actually use the fdo bugzilla bugs.libreoffice.org redirects to bugs.documentfoundation.org, which isn't very helpful as it just causes duplicate downloads. Arguably freedesktop.org could be removed; there are just ~5 interesting attachments since the TDF bugzilla migration. Change-Id: I26d2667848582209e382226108c47549e99cee97 diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype index a5f1570..7f0dfa2 100755 --- a/bin/get-bugzilla-attachments-by-mimetype +++ b/bin/get-bugzilla-attachments-by-mimetype @@ -339,7 +339,7 @@ def get_launchpad_bugs(prefix): rss_bugzillas = ( ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), #added for abiword - ( 'fdo', 'http://bugs.libreoffice.org/buglist.cgi' ), + ( 'fdo', 'http://bugs.freedesktop.org/buglist.cgi' ), ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ), ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra commit e5f9ee18476697a64d7ef646a072f8c76cf95b50 Author: Michael Stahl <mst...@redhat.com> Date: Thu Jun 4 12:56:35 2015 +0200 get-bugzilla-attachments: avoid writing incomplete files Change-Id: I7d1139ddf8c88626dd716aa537a305c31b5be5d9 diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype index fbc4031..a5f1570 100755 --- a/bin/get-bugzilla-attachments-by-mimetype +++ b/bin/get-bugzilla-attachments-by-mimetype @@ -94,9 +94,11 @@ def get_from_bug_url_via_xml(url, mimetype, prefix, suffix): continue print('downloading as ' + download) - f = open(download, 'wb') + tmpfile = download + ".tmp" + f = open(tmpfile, 'wb') f.write(base64.b64decode(node.firstChild.nodeValue)) f.close() + os.rename(tmpfile, download) break def get_novell_bug_via_xml(url, mimetype, prefix, suffix): @@ 
-138,9 +140,11 @@ def get_novell_bug_via_xml(url, mimetype, prefix, suffix): continue print('downloading as ' + download) - f = open(download, 'wb') + tmpfile = download + ".tmp" + f = open(tmpfile, 'wb') f.write(handle.read()) f.close() + os.rename(tmpfile, download) def create_query(mimetype): query = dict() @@ -327,9 +331,11 @@ def get_launchpad_bugs(prefix): print('mimetype is ' + handle.content_type + ' downloading as ' + download) - f = open(download, "w") + tmpfile = download + ".tmp" + f = open(tmpfile, "wb") f.write(handle.read()) f.close() + os.rename(tmpfile, download) rss_bugzillas = ( ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), #added for abiword commit bd2eee0bd4ae83ff453522b7cf09b69f1b8b5e1b Author: Michael Stahl <mst...@redhat.com> Date: Wed Jun 3 23:41:32 2015 +0200 get-bugzilla-attachments: avoid FDO-TDF duplicates... ... by checking that a file with "fdo" already exists for bugs older than the migration, instead of just ignoring the old bugs on TDF. There are > 300 additional attachments not on freedesktop.org. 
Change-Id: Ib7ee63041109071cc1241a875ef2cccbddfc699d diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype index 7e6dc83e..fbc4031 100755 --- a/bin/get-bugzilla-attachments-by-mimetype +++ b/bin/get-bugzilla-attachments-by-mimetype @@ -86,6 +86,13 @@ def get_from_bug_url_via_xml(url, mimetype, prefix, suffix): print("assuming " + download + " is up to date") continue + # prevent re-downloading FDO attachments from TDF + if prefix == "tdf" and int(id) < 88776: + fdodownload = download.replace("tdf", "fdo") + if os.path.isfile(fdodownload): + print("assuming FDO " + fdodownload + " is up to date") + continue + print('downloading as ' + download) f = open(download, 'wb') f.write(base64.b64decode(node.firstChild.nodeValue)) @@ -199,7 +206,7 @@ def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix): print('looking for all bugs having %s attachment(s)' % mimetype) process(query, True, get_file_bz_ids(files, prefix)) -def get_through_rss_query(queryurl, mimetype, prefix, suffix, startid): +def get_through_rss_query(queryurl, mimetype, prefix, suffix): try: os.mkdir(suffix) except: @@ -218,10 +225,7 @@ def get_through_rss_query(queryurl, mimetype, prefix, suffix, startid): entries = [] for entry in d['entries']: bugid = entry['id'].split('=')[-1] - if (int(bugid) >= startid): - entries.append(entry) - else: - print("Dropping " + bugid + " because < startid of " + str(startid)) + entries.append(entry) if full: available = set([str(entry['id'].split('=')[-1]) for entry in entries]) @@ -328,20 +332,20 @@ def get_launchpad_bugs(prefix): f.close() rss_bugzillas = ( - ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi', 0 ), #added for abiword - ( 'fdo', 'http://bugs.libreoffice.org/buglist.cgi', 0 ), - ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi', 0 ), - ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi', 0 ), # added for gnumeric - ( 'kde', 'http://bugs.kde.org/buglist.cgi', 0 ), # added for koffice/calligra - ( 
'mandriva', 'https://qa.mandriva.com/buglist.cgi', 0 ), - ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi', 0 ), + ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), #added for abiword + ( 'fdo', 'http://bugs.libreoffice.org/buglist.cgi' ), + ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ), + ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric + ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra + ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi' ), + ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi' ), # It seems something has changed and it is no longer possible to # download any files from there. # NOTE: I am leaving it in the list, commented out, just so someone # does not add it back immediately .-) # 'novell': 'https://bugzilla.novell.com/buglist.cgi', - ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi', 0 ), - ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi', 88776 ), + ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi' ), + ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi' ), ) redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi' @@ -497,9 +501,9 @@ class manage_threads(threading.Thread): # Get job from queue # Use job parameters to call our query # Then let the queue know we are done with this job - (uri, mimetype, prefix, extension, startid) = jobs.get(True,6) + (uri, mimetype, prefix, extension) = jobs.get(True,6) try: - get_through_rss_query(uri, mimetype, prefix, extension, startid) + get_through_rss_query(uri, mimetype, prefix, extension) finally: jobs.task_done() except KeyboardInterrupt: @@ -508,7 +512,7 @@ class manage_threads(threading.Thread): break def generate_multi_threading(): - for (prefix, uri, startid) in rss_bugzillas: + for (prefix, uri) in rss_bugzillas: # Initialize threads for i in range(max_threads): @@ -522,7 +526,7 @@ def generate_multi_threading(): if mimetype == 'text/html' and prefix == 'moz': continue - jobs.put([uri, mimetype, prefix, extension, 
startid], block=True) + jobs.put([uri, mimetype, prefix, extension], block=True) print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix) # Continue when all mimetypes are done for a bugzilla commit 0cfe2c8c893bfe6d1c2dce5941065eb4e841e7cc Author: Michael Stahl <mst...@redhat.com> Date: Wed Jun 3 12:14:31 2015 +0200 get-bugzilla-attachments: fix the multi-threading The queue was limited to an arbitrary maximum size, causing half of the jobs to be dropped on the floor. Also it didn't run on Python 3. Change-Id: I90bfba448291d901c5a7c83389d17c6acdd919c8 diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype index 9ae182c..7e6dc83e 100755 --- a/bin/get-bugzilla-attachments-by-mimetype +++ b/bin/get-bugzilla-attachments-by-mimetype @@ -31,7 +31,11 @@ import re import os, os.path import stat import sys -import threading, Queue +import threading +try: + import queue +except: + import Queue as queue try: from urllib.request import urlopen except: @@ -206,7 +210,7 @@ def get_through_rss_query(queryurl, mimetype, prefix, suffix, startid): get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml def process(query, full, have=[]): - url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.iteritems()]) + url = queryurl + '?' 
+ '&'.join(['='.join(kv) for kv in query.items()]) print('url is ' + url) d = feedparser.parse(url) print(str(len(d['entries'])) + ' bugs to process') @@ -493,44 +497,40 @@ class manage_threads(threading.Thread): # Get job from queue # Use job parameters to call our query # Then let the queue know we are done with this job - job = jobs.get(True,6) - get_through_rss_query(job[0], job[1], job[2], job[3], job[4]) # [0] = uri; [1] = mimetype; [2] = prefix; [3] = extension; [4] = startid - jobs.task_done() + (uri, mimetype, prefix, extension, startid) = jobs.get(True,6) + try: + get_through_rss_query(uri, mimetype, prefix, extension, startid) + finally: + jobs.task_done() except KeyboardInterrupt: raise # Ctrl+C should work - except: + except queue.Empty: break def generate_multi_threading(): for (prefix, uri, startid) in rss_bugzillas: # Initialize threads - for i in xrange(max_threads): + for i in range(max_threads): manage_threads().start() # Create a job for every mimetype for a bugzilla for (mimetype,extension) in mimetypes.items(): - - # It seems that bugzilla has problems returing that many results # (10000 results is probably a limit set somewhere) so we always # end processing the complete list. 
if mimetype == 'text/html' and prefix == 'moz': continue - try: - jobs.put([uri, mimetype, prefix, extension, startid], block=True, timeout=3) - print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix) - except KeyboardInterrupt: - raise # Ctrl+C should work - except: - print("Queue full") + jobs.put([uri, mimetype, prefix, extension, startid], block=True) + print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix) # Continue when all mimetypes are done for a bugzilla jobs.join() + print("DONE with bugtracker " + prefix) max_threads = 20 # Number of threads to create, (1 = without multi-threading) -jobs = Queue.Queue(40) +jobs = queue.Queue() generate_multi_threading() _______________________________________________ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits