[patch] new Python csv2lyx importer

Uwe Stöhr Wed, 14 May 2008 15:55:19 -0700

Attached is a new CSV2lyx importer written mainly by Hartmut Haase.

It uses Python's built-in CSV reader, which has been available since Python 2.3. I added a method to automatically detect the correct column separator.

There are only minor tweaks left to do, I think. I'm sending it so that the Python masters can have a look at the basic design - I'm sure you will find some optimizations ;-)

Attached are some small test files made with OpenOffice, which allows you to specify the column separator when creating CSV files.


regards Uwe

#! /usr/bin/env python
# -*- coding: utf-8 -*-

# file csv2lyx.py
# This file is part of LyX, the document processor.
# Licence details can be found in the file COPYING.

# author Hartmut Haase
# author Uwe Stöhr

# Full author contact details are available in file CREDITS

# This script reads a csv-table (file name.csv) and converts it into
# a LyX-table for versions 1.5.0 and higher (LyX table format 276).
# It uses Python's csv module for parsing.
# The original csv2lyx was written by Antonio Gulino <[EMAIL PROTECTED]>
# in Perl for LyX 1.x and modified for LyX table format 276 by the author.
#
import csv, os, re, string, sys, unicodedata

def error(message):
    """Report *message* on stderr and abort the script with exit status 1.

    A newline is appended to *message* before it is written.
    """
    sys.stderr.write(message + '\n')
    raise SystemExit(1)

# processing command line options
# Show the usage text when the script is called without any argument or
# with --help as its first argument, then stop with exit status 0.
if len(sys.argv) == 1 or sys.argv[1] == '--help':
    # sys.stdout.write() is used instead of the print statement so that the
    # file can be parsed by Python 2 and Python 3 alike.  The newline that
    # print would have appended is part of the literal.
    sys.stdout.write('''Usage:
   csv2lyx [options] mycsvfile mytmptable.lyx

This script creates a LyX document containing a table
from a comma-separated-value file. The LyX file has format 276
and can be opened with LyX 1.5.0 and newer.

Options:
   -e 'character'  Excel type, default is 'n'
                        = 'e': Excel-generated CSV file
                        = 't': Excel-generated TAB-delimited CSV file
   -s 'character'  column separator, default is ','
   --help          usage instructions

Remarks:
   If your .csv file contains special characters (e. g. umlauts,
   accented letters, etc.) make sure it is coded in UTF-8 (unicode).
   Else LyX will loose some cell contents.
   If your .csv file was not written according to the
   "Common Format and MIME Type for Comma-Separated Values (CSV) Files"
   (http://tools.ietf.org/html/rfc4180)
   there may be unexpected results.
''')
    sys.exit(0)

# Command line handling and column separator detection.
#
# After this section the following module-level names are set:
#   infile, outfile -- the CSV file to read and the LyX file to create
#   excel           -- value of the -e option ('n' when not given)
#   dia_excel       -- csv dialect derived from it:
#                      'none', 'excel' or 'excel-tab'
#   column_sep      -- the single character used as column separator

def _parse_command_line(argv):
    """Split *argv* into (options, infile, outfile).

    Every option is a "-x value" pair placed in front of the two file
    names.  *options* maps '-e' / '-s' to the value given for it; when an
    option is repeated the last occurrence wins.  A malformed command line
    ends the script through error().
    """
    args = argv[1:]
    options = {}
    while len(args) > 2:
        if args[0] not in ('-e', '-s'):
            error('Unknown option "%s", use --help for usage instructions.'
                  % args[0])
        options[args[0]] = args[1]
        args = args[2:]
    if len(args) != 2:
        error('Usage: csv2lyx [options] mycsvfile mytmptable.lyx')
    return options, args[0], args[1]


def _detect_column_sep(path, default):
    """Guess the column separator of the text file *path*.

    The file is read as a plain text file and, for every candidate
    character, the lines containing it are counted.  A candidate that shows
    up in every non-blank line is taken as the separator; if several
    qualify, the one listed last wins (tab < colon < semicolon < space).
    *default* is returned when no candidate qualifies or the file holds no
    non-blank line.  This simple rule is a heuristic, not a guarantee.
    """
    candidates = ("\t", ":", ";", " ")
    hits = dict.fromkeys(candidates, 0)
    line_count = 0
    textfile = open(path)
    try:
        for line in textfile:
            # Blank lines carry no separator at all; counting them would
            # make every candidate fail.
            if not line.strip("\r\n"):
                continue
            line_count += 1
            for char in candidates:
                if char in line:
                    hits[char] += 1
    finally:
        # close the file even if reading it fails
        textfile.close()

    separator = default
    if line_count:
        for char in candidates:
            if hits[char] >= line_count:
                separator = char
    return separator


options, infile, outfile = _parse_command_line(sys.argv)
excel = options.get('-e', 'n')
# the default column separator for CSV is of course the comma
column_sep = options.get('-s', ',')
if len(column_sep) != 1:
    # csv.reader only accepts a one-character delimiter
    error('The column separator must be a single character.')
if not os.path.exists(infile):
    error('File "%s" not found.' % infile)

# look for dialects
dia_excel = 'none'
if excel == 'e':
    dia_excel = 'excel'
elif excel == 't':
    dia_excel = 'excel-tab'

# Only guess the separator when the user did not choose one with -s; an
# explicitly given separator (even the comma) is always respected.
if '-s' not in options:
    if dia_excel == 'excel-tab':
        # -e t announces a TAB-delimited file
        column_sep = "\t"
    else:
        column_sep = _detect_column_sep(infile, column_sep)


# read input
# Parse the whole CSV file into `rows` (a list of lists of strings) and
# derive the table size from it:
#   num_rows -- number of parsed records
#   num_cols -- width of the widest record, at least 1
# The file is opened in binary mode, as Python 2's csv module asks for.
csvfile = open(infile, "rb")
try:
    if dia_excel == 'none':
        reader = csv.reader(csvfile, delimiter=column_sep)
    else:
        reader = csv.reader(csvfile, dialect=dia_excel, delimiter=column_sep)
    rows = list(reader)
finally:
    # close the input even if the csv module raises while parsing
    csvfile.close()

# Count the records themselves.  reader.line_num counts physical lines,
# which is larger than the number of records as soon as a quoted field
# spans several lines, and it does not exist before Python 2.5.
num_rows = len(rows)
num_cols = 1 # max columns
for row in rows:
    num_cols = max(num_cols, len(row))

# create a LyX file
# `fout` stays open after this section; the table and the closing part of
# the document are written to it afterwards.
fout = open(outfile, 'w')
#####################
# write first part
####################
# Document header up to the start of the tabular inset.  Every backslash is
# written as '\\' so that the literal does not depend on Python leaving
# unknown escape sequences such as '\l' untouched.
fout.write("""#csv2lyx created this file
\\lyxformat 276
\\begin_document
\\begin_header
\\textclass article
\\inputencoding auto
\\font_roman default
\\font_sans default
\\font_typewriter default
\\font_default_family default
\\font_sc false
\\font_osf false
\\font_sf_scale 100
\\font_tt_scale 100
\\graphics default
\\paperfontsize default
\\papersize default
\\use_geometry false
\\use_amsmath 1
\\use_esint 0
\\cite_engine basic
\\use_bibtopic false
\\paperorientation portrait
\\secnumdepth 3
\\tocdepth 3
\\paragraph_separation indent
\\defskip medskip
\\papercolumns 1
\\papersides 1
\\paperpagestyle default
\\tracking_changes false
\\output_changes false
\\end_header

\\begin_body

\\begin_layout Standard
\\align left
\\begin_inset Tabular
""")
# opening tag of the table with its dimensions
fout.write('<lyxtabular version="3" rows="%d" columns="%d">\n'
           % (num_rows, num_cols))
fout.write('<features>\n')
#####################
# write table
####################
def _write_cell(out, text):
    """Write one table cell containing *text* to the file object *out*.

    The <cell ...> tag is written on a single line.
    """
    out.write('<cell alignment="left" valignment="top" usebox="none">\n'
              '\\begin_inset Text\n'
              '\n'
              '\\begin_layout Standard\n')
    out.write(text)
    out.write('\n\\end_layout\n\n\\end_inset\n</cell>\n')


# one column description per table column
for i in range(num_cols):
    fout.write('<column alignment="left" valignment="top" width="0pt">\n')

# Iterate over the parsed records themselves instead of indexing up to a
# separately computed row count, so a count that is larger than the number
# of records cannot cause an IndexError.
for row in rows:
    fout.write('<row>\n')
    ############################
    # write contents of one line
    ############################
    for cell in row:
        # A backslash starts a command in a LyX file; a literal one is
        # written as the \backslash token followed by a line break.
        _write_cell(fout, cell.replace('\\', '\\backslash\n'))
    # If row has less columns than num_cols, pad it with blank cells
    for i in range(len(row), num_cols):
        _write_cell(fout, ' ')
    fout.write('</row>\n')
#####################
# write last part
####################
# Closing part of the document: ends the tabular inset, the surrounding
# layout, the body and the document itself.
lyx_footer = """</lyxtabular>

\\end_inset


\\end_layout

\\end_body
\\end_document
"""
fout.write(lyx_footer)
# the LyX file is complete, release it
fout.close()

"asda"  "asdasd"
"asda"  "asdasd"
3456    36
4.56    478
58      568
56,78   568
568     568

"asda","asdasd"
"asda","asdasd"
3456,36
4.56,478
58,568
56,78,568
568,568

"asda";"asdasd"
"asda";"asdasd"
3456;36
4.56;478
58;568
56,78;568
568;568

"asda":"asdasd"
"asda":"asdasd"
3456:36
4.56:478
58:568
56,78:568
568:568

"asda" "asdasd"
"asda" "asdasd"
3456 36
4.56 478
58 568
56,78 568
568 568

[patch] new Python csv2lyx importer

Reply via email to