Bug#488033: chm2pdf: fails converting filenames with spaces

Neil Schemenauer Mon, 12 Mar 2012 00:48:22 -0700

Package: chm2pdf
Version: 0.9.1-1.1
Followup-For: Bug #488033

This script has quite a lot of problems.  The logic to rewrite URLs,
for example, is purely heuristic and could use the wrong image if
the file is similarly named.


Attached are two patches.  The first fixes filename quoting bugs.
The script passes filenames to the shell without proper escaping.
Instead, I use 'shutil' or 'subprocess' to do the work.  The
majority of the changed lines are due to building the htmldoc
options as a list instead of one big string to be parsed by the
shell.

The second patch is more invasive.  The script uses a fixed
directory name in /tmp.  That's obviously a security hole and I'm
surprised this package passed review.  My second patch changes
the script to use the tempfile module to securely create a
temporary working directory.  While fixing the correct_file()
function, I also improved it by avoiding parsing the page twice
with SGMLParser.  Also, I made the heuristic for URL matching more
accurate (use endswith() rather than 'in').


-- System Information:
Debian Release: wheezy/sid
  APT prefers testing
  APT policy: (650, 'testing'), (600, 'unstable'), (1, 'experimental')
Architecture: amd64 (x86_64)

Kernel: Linux 3.2.0-1-amd64 (SMP w/2 CPU cores)
Locale: LANG=en_US.UTF-8, LC_CTYPE=en_CA.utf8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/bash

Versions of packages chm2pdf depends on:
ii  htmldoc         1.8.27-7
ii  libchm-bin      2:0.40a-2
ii  python          2.7.2-10
ii  python-chm      0.8.4-1+b2
ii  python-support  1.0.14

chm2pdf recommends no packages.

Versions of packages chm2pdf suggests:
pn  python-beautifulsoup  <none>

-- no debconf information

>From d2245733ad7097273889058d31edbf84ff0027fb Mon Sep 17 00:00:00 2001
From: Neil Schemenauer <n...@arctrix.com>
Date: Mon, 12 Mar 2012 01:29:13 -0600
Subject: [PATCH 2/2] Use tempfile module instead of fixed temporary
 directories.

Use os.chdir() to switch to temporary directory.  Overhaul
correct_file() function to substitute relative paths for CSS and
image URLs.

Make --dontextract take a filename argument, since the temporary
directory is no longer fixed.  Make --extract-only print the name
of the temporary directory to stdout.
---
 chm2pdf |  180 ++++++++++++++++++++++-----------------------------------------
 1 files changed, 62 insertions(+), 118 deletions(-)

diff --git a/chm2pdf b/chm2pdf
index 216807a..a3f0f08 100644
--- a/chm2pdf
+++ b/chm2pdf
@@ -29,22 +29,10 @@ import re, glob
 import getopt
 import shutil
 import subprocess
+import tempfile
 # from BeautifulSoup import BeautifulSoup
 
-global version
-
-global CHM2PDF_TEMP_WORK_DIR #where conversions etc. take place
-global CHM2PDF_TEMP_ORIG_DIR #where the chm file is exploded
-global CHM2PDF_WORK_DIR
-global CHM2PDF_ORIG_DIR
-
-global filename #the input filename
-
 version = '0.9.1'
-CHM2PDF_TEMP_WORK_DIR='/tmp/chm2pdf/work' 
-CHM2PDF_TEMP_ORIG_DIR='/tmp/chm2pdf/orig'
-
-
 
 # YOU DON'T NEED TO CHANGE ANYTHING BELOW THIS LINE!
 
@@ -66,29 +54,22 @@ class PageLister(sgmllib.SGMLParser):
            if urlparam_flag and key=='value':
                self.pages.append('/'+value)  
      
-class ImageCatcher(sgmllib.SGMLParser):
+class UrlCatcher(sgmllib.SGMLParser):
     '''
-    finds image urls in the current html page, so to take them out from the chm file.
+    finds image and CSS urls in the current html page, so to take them out from the chm file.
     '''
     def reset(self):
         sgmllib.SGMLParser.reset(self)
         self.imgurls=[]
-        
+        self.cssurls=[]
+
     def start_img(self,attrs):
         for key,value in attrs:
             if key=='src' or key=='SRC':
                 # Avoid duplicates in the list of image URLs.
                 if not self.imgurls.count(value):
                     self.imgurls.append(value)
-     
-class CssCatcher(sgmllib.SGMLParser):
-    '''
-    finds CSS urls in the current html page, so to take them out from the chm file.
-    '''
-    def reset(self):
-        sgmllib.SGMLParser.reset(self)
-        self.cssurls=[]
-        
+
     def start_link(self,attrs):
         for key,value in attrs:
             if key=='href' or key=='HREF':
@@ -121,7 +102,7 @@ def get_objective_urls_list(filename):
     return urls_list
 
 
-def correct_file(input_file, output_file, html_list, objective_urls, options):
+def correct_file(input_file, output_file, objective_urls, options, orig_dir):
 
     # Correct image links in file
     pf=open(input_file,'rU')
@@ -134,41 +115,21 @@ def correct_file(input_file, output_file, html_list, objective_urls, options):
         soup = BeautifulSoup(page)
         page = str(soup)
 
-    image_catcher=ImageCatcher()
-    image_catcher.feed(page)
-        
-    css_catcher=CssCatcher()
-    css_catcher.feed(page)
-        
-    # We substitute the image URLs of input_file with the *actual* URLs on the CHM2PDF_ORIG_DIR directory
-    for iurl in image_catcher.imgurls:
-        # print 'iurl = '  + iurl
+    url_catcher = UrlCatcher()
+    url_catcher.feed(page)
 
-        img_filename = ''
+    # We substitute the image and CSS URLs of input_file with the *actual* URLs
+    for url in url_catcher.imgurls + url_catcher.cssurls:
+        # print 'url = '  + url
         for item in objective_urls:
-            if iurl in item:
-                img_filename=CHM2PDF_ORIG_DIR+item
-                if ';' in img_filename: #hack to get rid of mysterious ; in filenames and urls...
-                    img_filename=img_filename.split(';')[0]
-        # substitute the new image filenames - but only if an img_filename was found!
-        if img_filename:
-            page=re.sub(iurl,img_filename,page)
-            
+            if item.endswith(url):
+                new_url = os.path.join('..', orig_dir, '.' + os.path.sep + item)
+                if ';' in new_url: #hack to get rid of mysterious ; in filenames and urls...
+                    new_url = new_url.split(';')[0]
+                #print url, '->', new_url
+                page = page.replace(url, new_url)
+                break
 
-    # We substitute the CSS URLs of input_file with the *actual* URLs on the CHM2PDF_ORIG_DIR directory
-    for curl in css_catcher.cssurls:
-        # print 'curl = '  + curl
-
-        css_filename = ''
-        for item in objective_urls:
-            if curl in item:
-                css_filename=CHM2PDF_ORIG_DIR+item
-                if ';' in css_filename: #hack to get rid of mysterious ; in filenames and urls...
-                    css_filename=img_filename.split(';')[0]
-        # substitute the new image filenames - but only if a css_filename was found!
-        if css_filename:
-            page=re.sub(curl,css_filename,page)
-            
     # Fontsize hack:
     # Since htmldoc ignores the --fontsize option, we have to do something about it...
     # If --fontsize xxx was given on the command line, 
@@ -288,7 +249,7 @@ def correct_file(input_file, output_file, html_list, objective_urls, options):
     f.close()
 
 
-def convert_to_pdf(cfile, filename, outputfilename, options):
+def convert_to_pdf(cfile, filename, outputfilename, options, orig_dir, work_dir):
     '''
     Performs actual converting.
     '''
@@ -297,27 +258,11 @@ def convert_to_pdf(cfile, filename, outputfilename, options):
     # ########################### File extraction and correction: START ############################
     #
     if options['dontextract'] == '':
-    
-        try:
-            os.mkdir(CHM2PDF_TEMP_WORK_DIR)
-        except OSError: # The directory already exists.
-            pass
-        
-        try:
-            os.mkdir(CHM2PDF_TEMP_ORIG_DIR)
-        except OSError: # The directory already exists.
-            pass
-        
-        try:
-            os.mkdir(CHM2PDF_ORIG_DIR)
-        except OSError: # The directory already exists.
-            pass
-        
-        try:
-            os.mkdir(CHM2PDF_WORK_DIR)
-        except OSError: # The directory already exists.
-            pass
-   
+        if not os.path.exists(orig_dir):
+            os.mkdir(orig_dir)    
+        if not os.path.exists(work_dir):
+            os.mkdir(work_dir)
+
     # Compute filenames and lists. This is needed no matter if '--dontextract' was given or not!
      
     html_list=get_html_list(cfile)
@@ -338,15 +283,15 @@ def convert_to_pdf(cfile, filename, outputfilename, options):
     for html_file in html_list:
         for item in objective_urls:
             if html_file in item:
-                true_html_list.append(CHM2PDF_ORIG_DIR+item)
+                true_html_list.append(orig_dir+item)
             if not options['titlefile']=='' and options['titlefile'] in item:
-                input_titlefile = CHM2PDF_ORIG_DIR+item
-                output_titlefile = CHM2PDF_WORK_DIR + os.sep + options['titlefile']
+                input_titlefile = orig_dir+item
+                output_titlefile = work_dir + os.sep + options['titlefile']
 
     if not options['titlefile']=='' and not output_titlefile:
         print '### WARNING: ' + options['titlefile'] + ' not found inside ' + filename + ' - possible spelling error.'
         print '### You can check it if you do  \'' + sys.argv[0] + ' --extract-only\','
-        print '### then have a look at the files in  ' + CHM2PDF_ORIG_DIR + '.'
+        print '### then have a look at the files in  ' + orig_dir + '.'
         print '### Option \'--titlefile ' + options['titlefile'] + '\' ignored'
         options['titlefile'] = ''
 
@@ -356,7 +301,7 @@ def convert_to_pdf(cfile, filename, outputfilename, options):
     if options['dontextract'] == '':
         # Correct image links in toc file.
         if not options['titlefile']=='' and os.path.exists(input_titlefile):
-            correct_file(input_titlefile, output_titlefile, html_list, objective_urls, options)
+            correct_file(input_titlefile, output_titlefile, objective_urls, options, orig_dir)
 
     
     # Now process the rest of HTML files.
@@ -372,7 +317,7 @@ def convert_to_pdf(cfile, filename, outputfilename, options):
     replace_garbled_strings = []
     for url in html_list:
         c+=1
-        page_filename=CHM2PDF_ORIG_DIR + url
+        page_filename=orig_dir + url
         # Some names contain a '%20' (an HTML code for a space). We substitute with a "real space"
         # otherwise a 'File not found' error will occur.
         page_filename = re.sub('%20',' ',page_filename)
@@ -382,12 +327,12 @@ def convert_to_pdf(cfile, filename, outputfilename, options):
 
  
         if os.path.exists(page_filename) and (options['titlefile'] == '' or not options['titlefile'] in url):
-            htmlout_filename=CHM2PDF_WORK_DIR+'/temp'+'%(#)04d' %{"#":c}+'.html'
+            htmlout_filename=work_dir+'/temp'+'%(#)04d' %{"#":c}+'.html'
             htmlout_filenames.append(htmlout_filename)
     
             if options['dontextract'] == '':
                 # Correct image links in file page_filename.
-                correct_file(page_filename, htmlout_filename, html_list, objective_urls, options)
+                correct_file(page_filename, htmlout_filename, objective_urls, options, orig_dir)
 
             # Escape slashes in url.
             url_filename_escaped = re.sub('/', '\/', os.path.basename(url))
@@ -593,6 +538,8 @@ def convert_to_pdf(cfile, filename, outputfilename, options):
             ['-f', outputfilename])
     if options['verbose']=='--verbose' and options['verbositylevel']=='high':
         print cmd
+    with open('/tmp/c', 'w') as fp:
+        fp.write('\n'.join(cmd))
     exit_value = subprocess.call(cmd)
 
     if exit_value != 0:
@@ -622,12 +569,12 @@ def usage (name):
     print '\t--continuous\n\t\tSpecifies  that  the  HTML  sources are unstructured (plain web pages).\n\t\tNo page breaks are inserted between each file or URL in the output.'
     print '\t--cookies \'name="value with space"; name=value\'\n\t\t'
     print '\t--datadir directory\n\t\tSpecifies the  location  of  the  HTMLDOC  data  files,  usually  /usr/share/htmldoc  or  C:\Program Files\HTMLDOC '
-    print "\t--dontextract \n\t\tIf given, %s will not extract the HTML files from the given CHM file, but will use previously extracted copies from the temporary directory " %name + '(i.e. ' + CHM2PDF_TEMP_ORIG_DIR + ' and ' + CHM2PDF_TEMP_WORK_DIR + '). Usually you will use this option after you have used the \'--extract-only\' option to extract the files in order to correct them manually (in ' + CHM2PDF_TEMP_WORK_DIR + '). After the correction, a call with \'--dontextract\' will not overwrite your changes, but will use the corrected files instead.'
+    print "\t--dontextract \n\t\tIf given, %s will not extract the HTML files from the given CHM file, but will use previously extracted copies from the provided directory name). Usually you will use this option after you have used the \'--extract-only\' option to extract the files in order to correct them manually. After the correction, a call with \'--dontextract\' will not overwrite your changes, but will use the corrected files instead."
     print '\t--duplex\n\t\tSpecifies that the output should be formatted for double-sided printing.'
     print '\t--effectduration {0.1..10.0}\n\t\tSpecifies the duration in seconds of PDF page transition effects.'
     print '\t--embedfonts\n\t\tSpecifies that fonts should be embedded in PDF output.'
     print '\t--encryption\n\t\tEnables encryption of PDF files.'
-    print '\t--extract-only\n\t\tExtract the HTML files from the CHM file and stop.\n\t\tThe extracted files will be found in CHM2PDF_WORK_DIR/input_filename_without_extension.'
+    print '\t--extract-only\n\t\tExtract the HTML files from the CHM file and stop.\n\t\tThe extracted files will be found in the directory name printed to stdout.'
     print '\t--firstpage {p1,toc,c1}\n\t\t'
     print '\t--fontsize {4.0..24.0}\n\t\tSpecifies the default font size for body text.'
     print '\t--fontspacing {1.0..3.0}\n\t\tSpecifies  the  default  line  spacing  for body text.\n\t\tThe line spacing is a multiplier for the font size, so a value of 1.2 \n\t\twill provide an additional 20% of space between the lines.'
@@ -734,10 +681,6 @@ def split(path):
 
 
 def main(argv):
-
-    global CHM2PDF_WORK_DIR
-    global CHM2PDF_ORIG_DIR
-
     # Defaults
     options={}
     options['beautifulsoup'] = ''
@@ -849,7 +792,7 @@ def main(argv):
                       "continuous",
                       "cookies=",
                       "datadir=",
-                      "dontextract",
+                      "dontextract=",
                       "duplex",
                       "effectduration=",
                       "embedfonts",
@@ -947,7 +890,7 @@ def main(argv):
         elif o == '--continuous': options['continuous'] = '--continuous'
         elif o == '--cookies': options['cookies'] = a
         elif o == '--datadir': options['datadir'] = a
-        elif o == '--dontextract': options['dontextract'] = '--dontextract'
+        elif o == '--dontextract': options['dontextract'] = a
         elif o == '--duplex': options['duplex'] = '--duplex'
         elif o == '--effectduration': options['effectduration'] = a
         elif o == '--embedfonts': options['embedfonts'] = '--embedfonts'
@@ -1034,7 +977,7 @@ def main(argv):
     # Option validation checks
     #
     # Only one of '--extract-only' and '--dontextract' may be given!
-    if options['dontextract'] == '--dontextract' and options['extract-only'] == '--extract-only':
+    if options['dontextract'] and options['extract-only']:
         usage(sys.argv[0])
         print
         print '### Either \'--dontextract\' or \'extract-only\' may be given!'
@@ -1058,7 +1001,7 @@ def main(argv):
         usage(sys.argv[0])
         return
     elif len(args)==1:
-        filename = args[0]
+        filename = os.path.abspath(args[0])
         dirname, basename, suffix = split(filename)
         if dirname:
             outputfilename = dirname + os.sep + basename +'.pdf'
@@ -1067,43 +1010,39 @@ def main(argv):
         # print 'outputfilename = ' + outputfilename
 
     elif len(args)==2:
-        filename = args[0]
+        filename = os.path.abspath(args[0])
         dirname, basename, suffix = split(filename)
-        outputfilename = args[1]
+        outputfilename = os.path.abspath(args[1])
         # print 'outputfilename = ' + outputfilename
     else:
         usage(sys.argv[0])
         return
- 
-    CHM2PDF_WORK_DIR = CHM2PDF_TEMP_WORK_DIR + os.sep + basename
-    CHM2PDF_ORIG_DIR = CHM2PDF_TEMP_ORIG_DIR + os.sep + basename
+
+    if options['dontextract']:
+        base_dir = options['dontextract']
+    else:
+        base_dir = tempfile.mkdtemp(prefix='chm2pdf.')
+    os.chdir(base_dir)
+    orig_dir = 'orig'
+    work_dir = 'work'
 
     if options['verbose']=='--verbose' and options['verbositylevel']=='low':
-        print 'CHM2PDF_WORK_DIR = ' + CHM2PDF_WORK_DIR
-        print 'CHM2PDF_ORIG_DIR = ' + CHM2PDF_ORIG_DIR
+        print 'work dir', work_dir
+        print 'orig dir', orig_dir
 
     if not os.path.exists(filename):
         print 'CHM file "' + filename + '" not found!'
         return
     
-    #remove temporary files
-    if options['dontextract'] == '':
-        if options['verbose']=='--verbose' and options['verbositylevel']=='high':
-            print 'Removing any previous temporary files...'
-	if os.path.exists(CHM2PDF_ORIG_DIR):
-            shutil.rmtree(CHM2PDF_ORIG_DIR)
-        if os.path.exists(CHM2PDF_WORK_DIR):
-            shutil.rmtree(CHM2PDF_WORK_DIR)
-    
     cfile = chm.CHMFile()
     cfile.LoadCHM(filename)
 
-    if options['dontextract'] == '--dontextract':
+    if options['dontextract']:
         if options['verbose'] == '--verbose':
             print '\'--dontextract\' option was given. No files will be extracted from CHM.'
-            print 'Will use the files in ' + CHM2PDF_ORIG_DIR + ' and ' + CHM2PDF_WORK_DIR + '.'
+            print 'Will use the files in %r and %r' % (orig_dir, work_dir)
     else:
-        cmd = ['extract_chmLib', filename, CHM2PDF_ORIG_DIR]
+        cmd = ['extract_chmLib', filename, orig_dir]
         if options['verbose'] == '--verbose' and options['verbositylevel'] == 'high':
             subprocess.call(cmd)
         else:
@@ -1112,7 +1051,12 @@ def main(argv):
                     stderr=subprocess.STDOUT)
             out, err = p.communicate()
     
-    convert_to_pdf(cfile, filename, outputfilename, options)
+    convert_to_pdf(cfile, filename, outputfilename, options, orig_dir, work_dir)
+
+    if options['extract-only']:
+        print 'Extracted files to %r' % base_dir
+    else:
+        shutil.rmtree(base_dir)
 
 
 if __name__ == '__main__':
-- 
1.7.9.1

>From eec69042196730e541eb9df9e1d158f2b736b99e Mon Sep 17 00:00:00 2001
From: Neil Schemenauer <n...@arctrix.com>
Date: Mon, 12 Mar 2012 00:01:32 -0600
Subject: [PATCH 1/2] Use subprocess and shutil instead of os.system().

This avoids problems with spaces in filenames since shell
globbing is not used.
---
 chm2pdf |  236 ++++++++++++++++++++++++++++++++------------------------------
 1 files changed, 122 insertions(+), 114 deletions(-)

diff --git a/chm2pdf b/chm2pdf
index 2448712..216807a 100644
--- a/chm2pdf
+++ b/chm2pdf
@@ -27,6 +27,8 @@ import sgmllib
 import os, os.path
 import re, glob
 import getopt
+import shutil
+import subprocess
 # from BeautifulSoup import BeautifulSoup
 
 global version
@@ -109,17 +111,13 @@ def get_objective_urls_list(filename):
     '''
     takes the list of files inside the chm archive, with the correct urls of each one.
     '''
-
-    os.system('enum_chmLib '+filename+' > '+CHM2PDF_WORK_DIR+'/urlslist.txt')
-    flist=open(CHM2PDF_WORK_DIR+'/urlslist.txt','rU')
+    p = subprocess.Popen(['enum_chmLib', filename],
+            stdout=subprocess.PIPE,
+            universal_newlines=True)
     urls_list=[]
-    for line in flist.readlines()[3:]:
-        #print 'line',line
-        spline=line.split()
-        urls_list.append(spline[5])
-    flist.close()
-    # os.remove(CHM2PDF_WORK_DIR+'/urlslist.txt')
-    
+    for line in p.stdout.readlines()[3:]:
+        urls_list.append(line.split()[5])
+    out, err = p.communicate()
     return urls_list
 
 
@@ -366,7 +364,6 @@ def convert_to_pdf(cfile, filename, outputfilename, options):
     # Compute some lists. Again, this is independent of the '--dontextract' option.
 
     c=0
-    htmlout_filename_list=''
     htmlout_filenames = []
     if output_titlefile:
         htmlout_filenames.append(output_titlefile)
@@ -386,7 +383,6 @@ def convert_to_pdf(cfile, filename, outputfilename, options):
  
         if os.path.exists(page_filename) and (options['titlefile'] == '' or not options['titlefile'] in url):
             htmlout_filename=CHM2PDF_WORK_DIR+'/temp'+'%(#)04d' %{"#":c}+'.html'
-            htmlout_filename_list+=' '+ htmlout_filename
             htmlout_filenames.append(htmlout_filename)
     
             if options['dontextract'] == '':
@@ -483,7 +479,7 @@ def convert_to_pdf(cfile, filename, outputfilename, options):
 
     # Here ends the extraction and correction of the HTML files which, as said above,
     # will take place ONLY IF '--dontextract' was NOT given.
-    # If '--dontextract' was given, only the file lists like htmlout_filename_list 
+    # If '--dontextract' was given, only the file lists like htmlout_filenames 
     # were computed above, but no file extraction or correction took place.
     #
     # ########################### File extraction and correction: END   ############################
@@ -497,101 +493,107 @@ def convert_to_pdf(cfile, filename, outputfilename, options):
     if options['verbose']=='--verbose' and options['verbositylevel']=='low':
         print 'Producing the PDF from the '+str(c)+' individual HTML files...'
 
-    htmldoc_opts = ''
+    htmldoc_opts = []
+    def add(*args):
+        htmldoc_opts.extend(args)
     # print options
     for key in options.keys():
         value = options[key]
         if not value == '':
-            if   key=='bodycolor': htmldoc_opts += ' --bodycolor ' + value
-            elif key=='bodyfont': htmldoc_opts += ' --bodyfont ' + value
-            elif key=='bodyimage': htmldoc_opts += ' --bodyimage ' + value
-            elif key=='book': htmldoc_opts += ' ' + value
-            elif key=='bottom': htmldoc_opts += ' --bottom ' + value
-            elif key=='browserwidth': htmldoc_opts += ' --browserwidth ' + value
-            elif key=='charset': htmldoc_opts += ' --charset ' + value
-            elif key=='color': htmldoc_opts += ' ' + value
-            elif key=='compression': htmldoc_opts += ' --compression=' + value
-            elif key=='continuous': htmldoc_opts += ' ' + value
-            elif key=='cookies': htmldoc_opts += ' --cookies ' + value
-            elif key=='datadir': htmldoc_opts += ' --datadir ' + value
-            elif key=='duplex': htmldoc_opts += ' ' + value
-            elif key=='effectduration': htmldoc_opts += ' --effectduration ' + value
-            elif key=='embedfonts': htmldoc_opts += ' ' + value
-            elif key=='encryption': htmldoc_opts += ' ' + value
-            elif key=='firstpage': htmldoc_opts += ' --firstpage ' + value
-            elif key=='fontsize': htmldoc_opts += ' --fontsize ' + value
-            elif key=='fontspacing': htmldoc_opts += ' --fontspacing ' + value
-            elif key=='footer': htmldoc_opts += ' --footer ' + value
-            elif key=='format': htmldoc_opts += ' --format ' + value
-            elif key=='gray': htmldoc_opts += ' ' + value
-            elif key=='header': htmldoc_opts += ' --header ' + value
-            elif key=='header1': htmldoc_opts += ' --header1 ' + value
-            elif key=='headfootfont': htmldoc_opts += ' --headfootfont ' + value
-            elif key=='headfootsize': htmldoc_opts += ' --headfootsize ' + value
-            elif key=='headingfont': htmldoc_opts += ' --headingfont ' + value
-            elif key=='help': htmldoc_opts += ' ' + value
-            elif key=='hfimage0': htmldoc_opts += ' --hfimage0 ' + value
-            elif key=='hfimage1': htmldoc_opts += ' --hfimage1 ' + value
-            elif key=='hfimage2': htmldoc_opts += ' --hfimage2 ' + value
-            elif key=='hfimage3': htmldoc_opts += ' --hfimage3 ' + value
-            elif key=='hfimage4': htmldoc_opts += ' --hfimage4 ' + value
-            elif key=='hfimage5': htmldoc_opts += ' --hfimage5 ' + value
-            elif key=='hfimage6': htmldoc_opts += ' --hfimage6 ' + value
-            elif key=='hfimage7': htmldoc_opts += ' --hfimage7 ' + value
-            elif key=='hfimage8': htmldoc_opts += ' --hfimage8 ' + value
-            elif key=='hfimage9': htmldoc_opts += ' --hfimage9 ' + value
-            elif key=='jpeg': htmldoc_opts += ' --jpeg=' + value
-            elif key=='landscape': htmldoc_opts += ' ' + value
-            elif key=='left': htmldoc_opts += ' --left ' + value
-            elif key=='linkcolor': htmldoc_opts += ' --linkcolor ' + value
-            elif key=='links': htmldoc_opts += ' ' + value
-            elif key=='linkstyle': htmldoc_opts += ' --linkstyle ' + value
-            elif key=='logoimage': htmldoc_opts += ' --logoimage ' + value
-            elif key=='logoimage': htmldoc_opts += ' --logoimage ' + value
-            elif key=='no-compression': htmldoc_opts += ' ' + value
-            elif key=='no-duplex': htmldoc_opts += ' ' + value
-            elif key=='no-embedfonts': htmldoc_opts += ' ' + value
-            elif key=='no-encryption': htmldoc_opts += ' ' + value
-            elif key=='no-links': htmldoc_opts += ' ' + value
-            elif key=='no-localfiles': htmldoc_opts += ' ' + value
-            elif key=='no-numbered': htmldoc_opts += ' ' + value
-            elif key=='no-overflow': htmldoc_opts += ' ' + value
-            elif key=='no-strict': htmldoc_opts += ' ' + value
-            elif key=='no-title': htmldoc_opts += ' ' + value
-            elif key=='no-toc': htmldoc_opts += ' ' + value
-            elif key=='numbered': htmldoc_opts += ' ' + value
-            elif key=='nup': htmldoc_opts += ' --nup ' + value
-            elif key=='outfile': htmldoc_opts += ' --outfile ' + value
-            elif key=='overflow': htmldoc_opts += ' ' + value
-            elif key=='owner-password': htmldoc_opts += ' --owner-password ' + value
-            elif key=='pageduration': htmldoc_opts += ' --pageduration ' + value
-            elif key=='pageeffect': htmldoc_opts += ' --pageeffect ' + value
-            elif key=='pagelayout': htmldoc_opts += ' --pagelayout ' + value
-            elif key=='pagemode': htmldoc_opts += ' --pagemode ' + value
-            elif key=='path': htmldoc_opts += ' --path ' + value
-            elif key=='permissions': htmldoc_opts += ' --permissions ' + value
-            elif key=='portrait': htmldoc_opts += ' ' + value
-            elif key=='quiet': htmldoc_opts += ' ' + value
-            elif key=='right': htmldoc_opts += ' --right ' + value
-            elif key=='size': htmldoc_opts += ' --size ' + value
-            elif key=='strict': htmldoc_opts += ' ' + value
-            elif key=='textcolor': htmldoc_opts += ' --textcolor ' + value
-            elif key=='textfont': htmldoc_opts += ' --textfont ' + value
-            elif key=='title': htmldoc_opts += ' ' + value
-            elif key=='titlefile': htmldoc_opts += ' --titlefile ' + output_titlefile
-            elif key=='titleimage': htmldoc_opts += ' --titleimage ' + value
-            elif key=='tocfooter': htmldoc_opts += ' --tocfooter ' + value
-            elif key=='tocheader': htmldoc_opts += ' --tocheader ' + value
-            elif key=='toclevels': htmldoc_opts += ' --toclevels ' + value
-            elif key=='toctitle': htmldoc_opts += ' --toctitle ' + value
-            elif key=='top': htmldoc_opts += ' --top ' + value
-            elif key=='user-password': htmldoc_opts += ' --user-password ' + value
-            elif key=='version': htmldoc_opts += ' ' + value
-            elif key=='webpage': htmldoc_opts += ' ' + value
-           
+            if   key=='bodycolor': add('--bodycolor', value)
+            elif key=='bodyfont': add('--bodyfont', value)
+            elif key=='bodyimage': add('--bodyimage', value)
+            elif key=='book': add(value)
+            elif key=='bottom': add('--bottom', value)
+            elif key=='browserwidth': add('--browserwidth', value)
+            elif key=='charset': add('--charset', value)
+            elif key=='color': add(value)
+            elif key=='compression': add('--compression=', value)
+            elif key=='continuous': add(value)
+            elif key=='cookies': add('--cookies', value)
+            elif key=='datadir': add('--datadir', value)
+            elif key=='duplex': add(value)
+            elif key=='effectduration': add('--effectduration', value)
+            elif key=='embedfonts': add(value)
+            elif key=='encryption': add(value)
+            elif key=='firstpage': add('--firstpage', value)
+            elif key=='fontsize': add('--fontsize', value)
+            elif key=='fontspacing': add('--fontspacing', value)
+            elif key=='footer': add('--footer', value)
+            elif key=='format': add('--format', value)
+            elif key=='gray': add(value)
+            elif key=='header': add('--header', value)
+            elif key=='header1': add('--header1', value)
+            elif key=='headfootfont': add('--headfootfont', value)
+            elif key=='headfootsize': add('--headfootsize', value)
+            elif key=='headingfont': add('--headingfont', value)
+            elif key=='help': add(value)
+            elif key=='hfimage0': add('--hfimage0', value)
+            elif key=='hfimage1': add('--hfimage1', value)
+            elif key=='hfimage2': add('--hfimage2', value)
+            elif key=='hfimage3': add('--hfimage3', value)
+            elif key=='hfimage4': add('--hfimage4', value)
+            elif key=='hfimage5': add('--hfimage5', value)
+            elif key=='hfimage6': add('--hfimage6', value)
+            elif key=='hfimage7': add('--hfimage7', value)
+            elif key=='hfimage8': add('--hfimage8', value)
+            elif key=='hfimage9': add('--hfimage9', value)
+            elif key=='jpeg': add('--jpeg=%s' % value)
+            elif key=='landscape': add(value)
+            elif key=='left': add('--left', value)
+            elif key=='linkcolor': add('--linkcolor', value)
+            elif key=='links': add(value)
+            elif key=='linkstyle': add('--linkstyle', value)
+            elif key=='logoimage': add('--logoimage', value)
+            elif key=='logoimage': add('--logoimage', value)
+            elif key=='no-compression': add(value)
+            elif key=='no-duplex': add(value)
+            elif key=='no-embedfonts': add(value)
+            elif key=='no-encryption': add(value)
+            elif key=='no-links': add(value)
+            elif key=='no-localfiles': add(value)
+            elif key=='no-numbered': add(value)
+            elif key=='no-overflow': add(value)
+            elif key=='no-strict': add(value)
+            elif key=='no-title': add(value)
+            elif key=='no-toc': add(value)
+            elif key=='numbered': add(value)
+            elif key=='nup': add('--nup', value)
+            elif key=='outfile': add('--outfile', value)
+            elif key=='overflow': add(value)
+            elif key=='owner-password': add('--owner-password', value)
+            elif key=='pageduration': add('--pageduration', value)
+            elif key=='pageeffect': add('--pageeffect', value)
+            elif key=='pagelayout': add('--pagelayout', value)
+            elif key=='pagemode': add('--pagemode', value)
+            elif key=='path': add('--path', value)
+            elif key=='permissions': add('--permissions', value)
+            elif key=='portrait': add(value)
+            elif key=='quiet': add(value)
+            elif key=='right': add('--right', value)
+            elif key=='size': add('--size', value)
+            elif key=='strict': add(value)
+            elif key=='textcolor': add('--textcolor', value)
+            elif key=='textfont': add('--textfont', value)
+            elif key=='title': add(value)
+            elif key=='titlefile': add('--titlefile', output_titlefile)
+            elif key=='titleimage': add('--titleimage', value)
+            elif key=='tocfooter': add('--tocfooter', value)
+            elif key=='tocheader': add('--tocheader', value)
+            elif key=='toclevels': add('--toclevels', value)
+            elif key=='toctitle': add('--toctitle', value)
+            elif key=='top': add('--top', value)
+            elif key=='user-password': add('--user-password', value)
+            elif key=='version': add(value)
+            elif key=='webpage': add(value)
+
+    cmd = (['htmldoc'] +
+            htmldoc_opts +
+            htmlout_filenames +
+            ['-f', outputfilename])
     if options['verbose']=='--verbose' and options['verbositylevel']=='high':
-        print 'htmldoc' + htmldoc_opts + ' ' + htmlout_filename_list + " -f "+ outputfilename + " > /dev/null"
-    exit_value=os.system ('htmldoc' + htmldoc_opts + ' ' + htmlout_filename_list + " -f "+ outputfilename + " > /dev/null")
+        print cmd
+    exit_value = subprocess.call(cmd)
 
     if exit_value != 0:
         print 'Something wrong happened when launching htmldoc.'
@@ -760,10 +762,10 @@ def main(argv):
     options['firstpage'] = ''
     options['fontsize'] = ''
     options['fontspacing'] = ''
-    options['footer'] = '\'c C\''
-    options['format'] = '\'pdf14\''
+    options['footer'] = 'c C'
+    options['format'] = 'pdf14'
     options['gray'] = ''
-    options['header'] = '\'c C\''
+    options['header'] = 'c C'
     options['header1'] = ''
     options['headfootfont'] = ''
     options['headfootsize'] = ''
@@ -779,12 +781,12 @@ def main(argv):
     options['hfimage7'] = ''
     options['hfimage8'] = ''
     options['hfimage9'] = ''
-    options['jpeg'] = '\'100\''
+    options['jpeg'] = '100'
     options['landscape'] = ''
     options['left'] = ''
-    options['linkcolor'] = '\'blue\''
+    options['linkcolor'] = 'blue'
     options['links'] = ''
-    options['linkstyle'] = '\'plain\''
+    options['linkstyle'] = 'plain'
     options['logoimage'] = ''
     options['logoimage'] = ''
     options['no-compression'] = ''
@@ -813,7 +815,7 @@ def main(argv):
     options['prefontsize'] = ''
     options['quiet'] = ''
     options['right'] = ''
-    options['size'] = '\'a4\''
+    options['size'] = 'a4'
     options['strict'] = ''
     options['textcolor'] = ''
     options['textfont'] = ''
@@ -1088,8 +1090,10 @@ def main(argv):
     if options['dontextract'] == '':
         if options['verbose']=='--verbose' and options['verbositylevel']=='high':
             print 'Removing any previous temporary files...'
-        os.system('rm -r '+CHM2PDF_ORIG_DIR+'/*')
-        os.system('rm -r '+CHM2PDF_WORK_DIR+'/*')
+        if os.path.exists(CHM2PDF_ORIG_DIR):
+            shutil.rmtree(CHM2PDF_ORIG_DIR)
+        if os.path.exists(CHM2PDF_WORK_DIR):
+            shutil.rmtree(CHM2PDF_WORK_DIR)
     
     cfile = chm.CHMFile()
     cfile.LoadCHM(filename)
@@ -1099,10 +1103,14 @@ def main(argv):
             print '\'--dontextract\' option was given. No files will be extracted from CHM.'
             print 'Will use the files in ' + CHM2PDF_ORIG_DIR + ' and ' + CHM2PDF_WORK_DIR + '.'
     else:
+        cmd = ['extract_chmLib', filename, CHM2PDF_ORIG_DIR]
         if options['verbose'] == '--verbose' and options['verbositylevel'] == 'high':
-            os.system('extract_chmLib ' + filename + ' ' + CHM2PDF_ORIG_DIR)
+            subprocess.call(cmd)
         else:
-            os.system('extract_chmLib ' + filename + ' ' + CHM2PDF_ORIG_DIR + '&> /dev/null')
+            p = subprocess.Popen(cmd,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.STDOUT)
+            out, err = p.communicate()
     
     convert_to_pdf(cfile, filename, outputfilename, options)
 
-- 
1.7.9.1

Bug#488033: chm2pdf: fails converting filenames with spaces

Reply via email to