On 21/09/14 02:12, David Prévot wrote:
> As can be seen in /usr/share/doc/xul-ext-password-editor/changelog.gz,
> the output is terrible: lists are not honored, and limited to the first
> page available on the website.
> 

Hello, please find attached 2 patches.

0001-amo-changelog-parse-into-human-readable-form-and-sav.patch

This one changes amo-changelog to generate two files, 
debian/upstream/changelog{.html,} with the HTML directly from AMO and the plain 
version generated by lynx or pandoc, depending on what the maintainer thinks 
looks best.

I have tested this with status-4-evar, tree-style-tab, flashblock, noscript, 
saved-password-editor, adblock-plus, https-everywhere, firebug, public-fox and 
believe it produces pretty reasonable output for all of them. If you disagree, 
please make some constructive suggestions to move forward, instead of merely 
saying "it looks rubbish".

0002-amo-changelog-fetch-all-pages-of-Version-History.patch

This one enables amo-changelog to automatically follow and download <link 
rel="next"> RSS references to the next page. This is dependent on AMO enabling 
support for this on their website, which is currently pending a pull request 
from me.

https://github.com/mozilla/olympia/pull/301

Please review the above patches, bearing in mind the second one is not directly 
testable yet.

X

-- 
GPG: 4096R/1318EFAC5FBBDBCE
git://github.com/infinity0/pubkeys.git
From 07bd31befd66311727a48d6518b4e51f92507dff Mon Sep 17 00:00:00 2001
From: Ximin Luo <infini...@pwned.gg>
Date: Tue, 30 Sep 2014 01:59:25 +0100
Subject: [PATCH 1/2] amo-changelog: parse into human-readable form and save to
 debian/upstream/changelog{,.html}

---
 amo-changelog       | 142 ++++++++++++++++++++++++++++++++++++++++++++--------
 man/amo-changelog.1 |  16 ++++--
 2 files changed, 132 insertions(+), 26 deletions(-)

diff --git a/amo-changelog b/amo-changelog
index 4ce9e73..273d6ad 100755
--- a/amo-changelog
+++ b/amo-changelog
@@ -19,43 +19,141 @@ from __future__ import print_function
 
 import argparse
 import os
+import re
+import subprocess
 import sys
 import urllib2
 import xml.etree.cElementTree as etree
 
 URL_TEMPLATE = "https://addons.mozilla.org/en-US/addon/{ext}/versions/format:rss";
+OUTGOING_HREF = re.compile(r'href="https?://outgoing\.mozilla\.org/v\d+/\w+/(.+?)"')
+HTML_HEAD = "<html>\n<head><title>{title}</title></head>\n<body>\n"
+HTML_FOOT = "</body>\n</html>"
 
+def fix_outgoing_href(match):
+    return 'href="%s"' % urllib2.unquote(match.group(1))
+
+def convert_rss_to_html(source, target):
+    elements = etree.iterparse(source)
+    # title
+    element = next(elements)[1]
+    while element.tag != "title":
+        element = next(elements)[1]
+    print(HTML_HEAD.format(title=element.text), file=target)
+    # items
+    for _, element in elements:
+        if element.tag != "item":
+            continue
+        title = element.find("title").text.encode("utf-8")
+        print("<h2>%s</h2>" % title, file=target)
+        descel = element.find("description")
+        if descel is not None and descel.text:
+            desc = descel.text.rstrip("\n").encode("utf-8")
+            # process manual line breaks, e.g. adblock-plus
+            desc = desc.replace("\n", "\n<br/>").replace("<br/>\n", "<br/>&nbsp;\n")
+            # strip outgoing redirect
+            desc = OUTGOING_HREF.sub(fix_outgoing_href, desc)
+            print(desc, file=target)
+        else:
+            print("[no description]", file=target)
+        print("", file=target)
+    print(HTML_FOOT, file=target)
+
+def which(cmd):
+    path = os.environ.get("PATH", os.defpath).split(os.pathsep)
+    for dir in path:
+        name = os.path.join(dir, cmd)
+        if (os.path.exists(name) and os.access(name, os.F_OK | os.X_OK)
+            and not os.path.isdir(name)):
+            return name
+    return None
+
+def try_external_write(out, args, **kwargs):
+    prog = args[0]
+    if not which(prog):
+        print("failed to write %s: program not found: %s" % (out, prog), file=sys.stderr)
+        return False
+    try:
+        subprocess.check_call(args, **kwargs)
+        print("wrote %s" % out, file=sys.stderr)
+        return True
+    except Exception as e:
+        print("failed to write %s: %s" % (out, e), file=sys.stderr)
+        return False
 
 def main():
     parser = argparse.ArgumentParser(
-        description="fetch Version History of an addon from the Mozilla Extensions website.")
+        description="Fetch Version History of an addon from the Mozilla "
+        "Extensions website and convert it into a human-readable format.")
     parser.add_argument("extension",
-                        help="Extension short-name, as used on addons.mozilla.org.")
+        help="Extension short-name, as used on addons.mozilla.org.")
+    parser.add_argument("-f", "--html-file",
+        metavar="FILE", default="debian/upstream/changelog.html",
+        help="File to write to. Default: %(default)s.")
+    parser.add_argument("-p", "--plain-format", metavar="FORMAT",
+        choices=["text", "markdown", "rst"], default="none",
+        help="Generate a human-readable form of the changelog in the file "
+        "without the .html extension, using an external program. Possible "
+        "options are text (uses lynx(1)), markdown (pandoc(1)), or rst "
+        "(pandoc(1)). Default: %(default)s.")
     options = parser.parse_args()
 
-    url = URL_TEMPLATE.format(ext=options.extension)
-    try:
-        fp = urllib2.urlopen(url)
-    except urllib2.HTTPError as error:
-        print("%s: For extension '%s', error fetching '%s': %s" %
-              (os.path.basename(sys.argv[0]), options.extension, url, error),
-              file=sys.stderr)
+    progname = os.path.basename(sys.argv[0])
+
+    html_file = options.html_file
+    if not html_file.endswith(".html"):
+        print("%s: Output filename must end with .html: %s" %
+            (progname, html_file), file=sys.stderr)
         return 1
+    plain_file = html_file[:-5]
+
     try:
-        for _, element in etree.iterparse(fp):
-            if element.tag != "item":
-                continue
-            title = element.find("title").text.encode("utf-8")
-            print(title)
-            print("=" * len(title))
-            descel = element.find("description")
-            if descel is not None and descel.text:
-                print(descel.text.rstrip("\n").encode("utf-8"))
+        with open(html_file, "w") as target:
+            url = URL_TEMPLATE.format(ext=options.extension)
+            try:
+                source = urllib2.urlopen(url)
+            except urllib2.HTTPError as error:
+                print("%s: For extension '%s', error fetching '%s': %s" %
+                      (progname, options.extension, url, error), file=sys.stderr)
+                raise
+            try:
+                convert_rss_to_html(source, target)
+            finally:
+                source.close()
+        print("wrote %s" % html_file, file=sys.stderr)
+    except Exception as e:
+        print("failed to write %s: %s" % (html_file, e), file=sys.stderr)
+        #os.remove(html_file)
+        return 1
+
+    if options.plain_format == "text":
+        with open(plain_file, "w") as target:
+            if not try_external_write(plain_file,
+              ["lynx", "-dump", "-list_inline", "-width=84", html_file], stdout=target):
+                #os.remove(plain_file)
+                return 1
             else:
-                print("[no description]")
-            print("")
-    finally:
-        fp.close()
+                # 2 space indent is a bit more reasonable than lynx's 3 default
+                # width=84 above (3*2-2) effectively cancels the right margin
+                subprocess.call(["sed", "-i", "-e", "s/^   /  /g", plain_file])
+
+    elif options.plain_format == "markdown":
+        if not try_external_write(plain_file,
+          ["pandoc", "-i", html_file, "--columns=79", "-wmarkdown", "-o", plain_file]):
+            return 1
+
+    elif options.plain_format == "rst":
+        if not try_external_write(plain_file,
+          ["pandoc", "-i", html_file, "--columns=79", "-wrst", "-o", plain_file]):
+            return 1
+        else:
+            # work around https://github.com/jgm/pandoc/issues/1656
+            # by adding two spaces to all line-block continuation lines
+            subprocess.call(["sed", "-i", "-r",
+                "-e", r"/^\|/,/^ |^$/{s/^([^ |])/  \1/g}", plain_file])
+
+    return 0
+
 
 if __name__ == "__main__":
     sys.exit(main())
diff --git a/man/amo-changelog.1 b/man/amo-changelog.1
index 197ff17..0eb61f5 100644
--- a/man/amo-changelog.1
+++ b/man/amo-changelog.1
@@ -12,7 +12,7 @@
 .\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 .\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 .\"
-.TH XPI-REPACK "1" "April 2014" "amo-changelog" "mozilla-devscripts suite"
+.TH AMO-CHANGELOG "1" "April 2014" "amo-changelog" "mozilla-devscripts suite"
 .SH NAME
 amo-changelog \- fetch Version History of an addon
 .SH SYNOPSIS
@@ -37,20 +37,28 @@ Here is an example for debian/rules:
 .br
 override_dh_installchangelogs:
 .br
-	dh_installchangelogs debian/changelog.upstream
+	dh_installchangelogs debian/upstream/changelog.html debian/upstream/changelog
 
 \[char46]PHONY: get-orig-changelog
 .br
 get-orig-changelog:
 .br
-	amo-changelog adblock-plus > debian/changelog.upstream
+	amo-changelog -p rst adblock-plus
 
-Using this approach, one would save the output file (debian/changelog.upstream)
+Using this approach, one would save the output files debian/upstream/changelog{.html,}
 as part of the Debian packaging. When updating the package with a new upstream
 release, one would run `debian/rules get-orig-changelog`
 .SH OPTIONS
 .TP
 \fB\-h\fR, \fB\-\-help\fR
 Display a brief help message.
+.TP
+\fB\-f\fR, \fB\-\-html\-file\fR
+File to write to. Default: debian/upstream/changelog.html
+.TP
+\fB\-p\fR, \fB\-\-plain\-format\fR
+Generate a human-readable form of the changelog in the file without the .html
+extension, using an external program. Possible options are text (uses lynx(1)),
+markdown (pandoc(1)), or rst (pandoc(1)). Default: none.
 .SH AUTHOR
 Jakub Wilk <jw...@debian.org> and Ximin Luo <infini...@pwned.gg>
-- 
2.1.0

From 4c5d1d525e1068540c6ee80f0e1be73f9c458653 Mon Sep 17 00:00:00 2001
From: Ximin Luo <infini...@pwned.gg>
Date: Tue, 30 Sep 2014 02:04:36 +0100
Subject: [PATCH 2/2] amo-changelog: fetch all pages of Version History

---
 amo-changelog | 46 +++++++++++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/amo-changelog b/amo-changelog
index 273d6ad..f95359b 100755
--- a/amo-changelog
+++ b/amo-changelog
@@ -33,15 +33,21 @@ HTML_FOOT = "</body>\n</html>"
 def fix_outgoing_href(match):
     return 'href="%s"' % urllib2.unquote(match.group(1))
 
-def convert_rss_to_html(source, target):
+def convert_rss_to_html(first, source, target):
     elements = etree.iterparse(source)
-    # title
-    element = next(elements)[1]
-    while element.tag != "title":
+    next_url = None
+    # header if first page
+    if first:
         element = next(elements)[1]
-    print(HTML_HEAD.format(title=element.text), file=target)
-    # items
+        while element.tag != "title":
+            element = next(elements)[1]
+        print(HTML_HEAD.format(title=element.text), file=target)
+    # items, rel
     for _, element in elements:
+        if element.tag == "{http://www.w3.org/2005/Atom}link":
+            if element.attrib["rel"] == "next":
+                next_url = element.attrib["href"]
+            continue
         if element.tag != "item":
             continue
         title = element.find("title").text.encode("utf-8")
@@ -57,7 +63,10 @@ def convert_rss_to_html(source, target):
         else:
             print("[no description]", file=target)
         print("", file=target)
-    print(HTML_FOOT, file=target)
+    # footer if last page
+    if not next_url:
+        print(HTML_FOOT, file=target)
+    return next_url
 
 def which(cmd):
     path = os.environ.get("PATH", os.defpath).split(os.pathsep)
@@ -110,16 +119,19 @@ def main():
     try:
         with open(html_file, "w") as target:
             url = URL_TEMPLATE.format(ext=options.extension)
-            try:
-                source = urllib2.urlopen(url)
-            except urllib2.HTTPError as error:
-                print("%s: For extension '%s', error fetching '%s': %s" %
-                      (progname, options.extension, url, error), file=sys.stderr)
-                raise
-            try:
-                convert_rss_to_html(source, target)
-            finally:
-                source.close()
+            first = True
+            while url:
+                try:
+                    source = urllib2.urlopen(url)
+                except urllib2.HTTPError as error:
+                    print("%s: For extension '%s', error fetching '%s': %s" %
+                          (progname, options.extension, url, error), file=sys.stderr)
+                    raise
+                try:
+                    url = convert_rss_to_html(first, source, target)
+                    first = False
+                finally:
+                    source.close()
         print("wrote %s" % html_file, file=sys.stderr)
     except Exception as e:
         print("failed to write %s: %s" % (html_file, e), file=sys.stderr)
-- 
2.1.0

Attachment: signature.asc
Description: OpenPGP digital signature

Reply via email to