On 21/09/14 02:12, David Prévot wrote: > As can be seen in /usr/share/doc/xul-ext-password-editor/changelog.gz, > the output is terrible: lists are not honored, and limited to the first > page available on the website. >
Hello, please find attached 2 patches. 0001-amo-changelog-parse-into-human-readable-form-and-sav.patch This one changes amo-changelog to generate two files, debian/upstream/changelog{.html,} with the HTML directly from AMO and the plain version generated by lynx or pandoc, depending on what the maintainer thinks looks best. I have tested this with status-4-evar, tree-style-tab, flashblock, noscript, saved-password-editor, adblock-plus, https-everywhere, firebug, public-fox, and believe it produces pretty reasonable output for all of them. If you disagree, please make some constructive suggestions to move forward, instead of merely "it looks rubbish". 0002-amo-changelog-fetch-all-pages-of-Version-History.patch This one enables amo-changelog to automatically follow and download <link rel="next"> RSS references to the next page. This is dependent on AMO enabling support for this on their website, which is currently pending a pull request from me. https://github.com/mozilla/olympia/pull/301 Please review the above patches, bearing in mind the second one is not directly testable yet. X -- GPG: 4096R/1318EFAC5FBBDBCE git://github.com/infinity0/pubkeys.git
From 07bd31befd66311727a48d6518b4e51f92507dff Mon Sep 17 00:00:00 2001 From: Ximin Luo <infini...@pwned.gg> Date: Tue, 30 Sep 2014 01:59:25 +0100 Subject: [PATCH 1/2] amo-changelog: parse into human-readable form and save to debian/upstream/changelog{,.html} --- amo-changelog | 142 ++++++++++++++++++++++++++++++++++++++++++++-------- man/amo-changelog.1 | 16 ++++-- 2 files changed, 132 insertions(+), 26 deletions(-) diff --git a/amo-changelog b/amo-changelog index 4ce9e73..273d6ad 100755 --- a/amo-changelog +++ b/amo-changelog @@ -19,43 +19,141 @@ from __future__ import print_function import argparse import os +import re +import subprocess import sys import urllib2 import xml.etree.cElementTree as etree URL_TEMPLATE = "https://addons.mozilla.org/en-US/addon/{ext}/versions/format:rss" +OUTGOING_HREF = re.compile(r'href="https?://outgoing\.mozilla\.org/v\d+/\w+/(.+?)"') +HTML_HEAD = "<html>\n<head><title>{title}</title></head>\n<body>\n" +HTML_FOOT = "</body>\n</html>" +def fix_outgoing_href(match): + return 'href="%s"' % urllib2.unquote(match.group(1)) + +def convert_rss_to_html(source, target): + elements = etree.iterparse(source) + # title + element = next(elements)[1] + while element.tag != "title": + element = next(elements)[1] + print(HTML_HEAD.format(title=element.text), file=target) + # items + for _, element in elements: + if element.tag != "item": + continue + title = element.find("title").text.encode("utf-8") + print("<h2>%s</h2>" % title, file=target) + descel = element.find("description") + if descel is not None and descel.text: + desc = descel.text.rstrip("\n").encode("utf-8") + # process manual line breaks, e.g. 
adblock-plus + desc = desc.replace("\n", "\n<br/>").replace("<br/>\n", "<br/> \n") + # strip outgoing redirect + desc = OUTGOING_HREF.sub(fix_outgoing_href, desc) + print(desc, file=target) + else: + print("[no description]", file=target) + print("", file=target) + print(HTML_FOOT, file=target) + +def which(cmd): + path = os.environ.get("PATH", os.defpath).split(os.pathsep) + for dir in path: + name = os.path.join(dir, cmd) + if (os.path.exists(name) and os.access(name, os.F_OK | os.X_OK) + and not os.path.isdir(name)): + return name + return None + +def try_external_write(out, args, **kwargs): + prog = args[0] + if not which(prog): + print("failed to write %s: program not found: %s" % (out, prog), file=sys.stderr) + return False + try: + subprocess.check_call(args, **kwargs) + print("wrote %s" % out, file=sys.stderr) + return True + except Exception as e: + print("failed to write %s: %s" % (out, e), file=sys.stderr) + return False def main(): parser = argparse.ArgumentParser( - description="fetch Version History of an addon from the Mozilla Extensions website.") + description="Fetch Version History of an addon from the Mozilla " + "Extensions website and convert it into a human-readable format.") parser.add_argument("extension", - help="Extension short-name, as used on addons.mozilla.org.") + help="Extension short-name, as used on addons.mozilla.org.") + parser.add_argument("-f", "--html-file", + metavar="FILE", default="debian/upstream/changelog.html", + help="File to write to. Default: %(default)s.") + parser.add_argument("-p", "--plain-format", metavar="FORMAT", + choices=["text", "markdown", "rst"], default="none", + help="Generate a human-readable form of the changelog in the file " + "without the .html extension, using an external program. Possible " + "options are text (uses lynx(1)), markdown (pandoc(1)), or rst " + "(pandoc(1)). 
Default: %(default)s.") options = parser.parse_args() - url = URL_TEMPLATE.format(ext=options.extension) - try: - fp = urllib2.urlopen(url) - except urllib2.HTTPError as error: - print("%s: For extension '%s', error fetching '%s': %s" % - (os.path.basename(sys.argv[0]), options.extension, url, error), - file=sys.stderr) + progname = os.path.basename(sys.argv[0]) + + html_file = options.html_file + if not html_file.endswith(".html"): + print("%s: Output filename must end with .html: %s" % + (progname, html_file), file=sys.stderr) return 1 + plain_file = html_file[:-5] + try: - for _, element in etree.iterparse(fp): - if element.tag != "item": - continue - title = element.find("title").text.encode("utf-8") - print(title) - print("=" * len(title)) - descel = element.find("description") - if descel is not None and descel.text: - print(descel.text.rstrip("\n").encode("utf-8")) + with open(html_file, "w") as target: + url = URL_TEMPLATE.format(ext=options.extension) + try: + source = urllib2.urlopen(url) + except urllib2.HTTPError as error: + print("%s: For extension '%s', error fetching '%s': %s" % + (progname, options.extension, url, error), file=sys.stderr) + raise + try: + convert_rss_to_html(source, target) + finally: + source.close() + print("wrote %s" % html_file, file=sys.stderr) + except Exception as e: + print("failed to write %s: %s" % (html_file, e), file=sys.stderr) + #os.remove(html_file) + return 1 + + if options.plain_format == "text": + with open(plain_file, "w") as target: + if not try_external_write(plain_file, + ["lynx", "-dump", "-list_inline", "-width=84", html_file], stdout=target): + #os.remove(plain_file) + return 1 else: - print("[no description]") - print("") - finally: - fp.close() + # 2 space indent is a bit more reasonable than lynx's 3 default + # width=84 above (3*2-2) effectively cancels the right margin + subprocess.call(["sed", "-i", "-e", "s/^ / /g", plain_file]) + + elif options.plain_format == "markdown": + if not 
try_external_write(plain_file, + ["pandoc", "-i", html_file, "--columns=79", "-wmarkdown", "-o", plain_file]): + return 1 + + elif options.plain_format == "rst": + if not try_external_write(plain_file, + ["pandoc", "-i", html_file, "--columns=79", "-wrst", "-o", plain_file]): + return 1 + else: + # work around https://github.com/jgm/pandoc/issues/1656 + # by adding two spaces to all line-block continuation lines + subprocess.call(["sed", "-i", "-r", + "-e", r"/^\|/,/^ |^$/{s/^([^ |])/ \1/g}", plain_file]) + + return 0 + if __name__ == "__main__": sys.exit(main()) diff --git a/man/amo-changelog.1 b/man/amo-changelog.1 index 197ff17..0eb61f5 100644 --- a/man/amo-changelog.1 +++ b/man/amo-changelog.1 @@ -12,7 +12,7 @@ .\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF .\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. .\" -.TH XPI-REPACK "1" "April 2014" "amo-changelog" "mozilla-devscripts suite" +.TH AMO-CHANGELOG "1" "April 2014" "amo-changelog" "mozilla-devscripts suite" .SH NAME amo-changelog \- fetch Version History of an addon .SH SYNOPSIS @@ -37,20 +37,28 @@ Here is an example for debian/rules: .br override_dh_installchangelogs: .br - dh_installchangelogs debian/changelog.upstream + dh_installchangelogs debian/changelog/upstream.html debian/changelog/upstream \[char46]PHONY: get-orig-changelog .br get-orig-changelog: .br - amo-changelog adblock-plus > debian/changelog.upstream + amo-changelog -p rst adblock-plus -Using this approach, one would save the output file (debian/changelog.upstream) +Using this approach, one would save the output files debian/changelog/{upstream.html,upstream} as part of the Debian packaging. When updating the package with a new upstream release, one would run `debian/rules get-orig-changelog` .SH OPTIONS .TP \fB\-h\fR, \fB\-\-help\fR Display a brief help message. +.TP +\fB\-f\fR, \fB\-\-html\-file\fR +File to write to. 
Default: debian/upstream/changelog.html +.TP +\fB\-p\fR, \fB\-\-plain\-format\fR +Generate a human-readable form of the changelog in the file without the .html +extension, using an external program. Possible options are text (uses lynx(1)), +markdown (pandoc(1)), or rst (pandoc(1)). Default: none. .SH AUTHOR Jakub Wilk <jw...@debian.org> and Ximin Luo <infini...@pwned.gg> -- 2.1.0
From 4c5d1d525e1068540c6ee80f0e1be73f9c458653 Mon Sep 17 00:00:00 2001 From: Ximin Luo <infini...@pwned.gg> Date: Tue, 30 Sep 2014 02:04:36 +0100 Subject: [PATCH 2/2] amo-changelog: fetch all pages of Version History --- amo-changelog | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/amo-changelog b/amo-changelog index 273d6ad..f95359b 100755 --- a/amo-changelog +++ b/amo-changelog @@ -33,15 +33,21 @@ HTML_FOOT = "</body>\n</html>" def fix_outgoing_href(match): return 'href="%s"' % urllib2.unquote(match.group(1)) -def convert_rss_to_html(source, target): +def convert_rss_to_html(first, source, target): elements = etree.iterparse(source) - # title - element = next(elements)[1] - while element.tag != "title": + next_url = None + # header if first page + if first: element = next(elements)[1] - print(HTML_HEAD.format(title=element.text), file=target) - # items + while element.tag != "title": + element = next(elements)[1] + print(HTML_HEAD.format(title=element.text), file=target) + # items, rel for _, element in elements: + if element.tag == "{http://www.w3.org/2005/Atom}link": + if element.attrib["rel"] == "next": + next_url = element.attrib["href"] + continue if element.tag != "item": continue title = element.find("title").text.encode("utf-8") @@ -57,7 +63,10 @@ def convert_rss_to_html(source, target): else: print("[no description]", file=target) print("", file=target) - print(HTML_FOOT, file=target) + # footer if last page + if not next_url: + print(HTML_FOOT, file=target) + return next_url def which(cmd): path = os.environ.get("PATH", os.defpath).split(os.pathsep) @@ -110,16 +119,19 @@ def main(): try: with open(html_file, "w") as target: url = URL_TEMPLATE.format(ext=options.extension) - try: - source = urllib2.urlopen(url) - except urllib2.HTTPError as error: - print("%s: For extension '%s', error fetching '%s': %s" % - (progname, options.extension, url, error), file=sys.stderr) - raise - try: - 
convert_rss_to_html(source, target) - finally: - source.close() + first = True + while url: + try: + source = urllib2.urlopen(url) + except urllib2.HTTPError as error: + print("%s: For extension '%s', error fetching '%s': %s" % + (progname, options.extension, url, error), file=sys.stderr) + raise + try: + url = convert_rss_to_html(first, source, target) + first = False + finally: + source.close() print("wrote %s" % html_file, file=sys.stderr) except Exception as e: print("failed to write %s: %s" % (html_file, e), file=sys.stderr) -- 2.1.0
signature.asc
Description: OpenPGP digital signature