I figured it out (posting the solution for the archives, and possibly
for comments). Reading the Julia issues about exceptions, I came across
a blog post about the Midori error model [1], and also some discussions
on how exceptions are not the way to handle errors which are not
bugs. So I realized I need a version of parse that returns a
Nullable, and then found that it already exists (tryparse).

So here is my solution (for the self-contained stylized example; the
actual code is much more complex):

# Attempt to read `string` as a value of the real type `T`. The outcome of
# `tryparse` is handed back untouched, so a malformed field produces a null
# result for the caller to inspect instead of raising an exception here.
function parsefield{T <: Real}(::Type{T}, string)
    return tryparse(T, string)
end

# Unwrap one parsed field. `value` is the (possibly null) result of parsing
# `string` as type `T`; when it is null, fail with a message that names the
# offending text, the target type, and its position in the input.
function checkedfield(value, string, T, line, column)
    if isnull(value)
        error("could not parse \"$(string)\" as " *
              "$(T) in line $(line), column $(column)")
    end
    return get(value)
end

# Read `io` line by line, split each line on ';', and parse field i as
# `schema[i]` via `parsefield`.
#
# Raises an ErrorException that includes the line number when a line does not
# have exactly `length(schema)` fields, and one that includes both line and
# column when a field cannot be parsed as its schema type.
function parsefile(io, schema)
    line = 1
    while !eof(io)
        strings = split(chomp(readline(io)), ';')
        # The dotted call below broadcasts: a line with a single field would be
        # silently recycled across every schema column, and any other length
        # mismatch would surface as a DimensionMismatch without a line number.
        # Check the field count explicitly so the position is always reported.
        if length(strings) != length(schema)
            error("expected $(length(schema)) fields but found " *
                  "$(length(strings)) in line $(line)")
        end
        values = parsefield.(schema, strings)
        # do something with this
        [checkedfield(value, strings[column], schema[column],
                      line, column) for (column, value) in enumerate(values)]
        line += 1
    end
end

# Sample input: three ';'-separated rows; the second field of the third row
# is deliberately not an integer.
test_file = """
1;2;3
4;5;6
7;error;9
"""

# Parse all three columns as Int. This call is expected to stop with an error
# that points at line 3, column 2.
parsefile(IOBuffer(test_file), [Int, Int, Int])

I still need to figure out type stability etc, but I think I am on the
right track.

[1] http://joeduffyblog.com/2016/02/07/the-error-model/

On Thu, Nov 03 2016, Tamas Papp wrote:

> Unfortunately, the data is too large to fit in memory -- I must process
> it in a stream.
>
> I will look at some libraries, hoping to find an idiomatic solution. I
> am sure that I am not the first one encountering this pattern.
>
> On Thu, Nov 03 2016, Jeffrey Sarnoff wrote:
>
>> or split the string into rows of strings and rows into individual
>> value-keeper strings and put that into a matrix of strings and process the
>> matrix, tracking row and col and checking for "error"
>>
>> On Thursday, November 3, 2016 at 5:15:06 AM UTC-4, Jeffrey Sarnoff wrote:
>>>
>>> Or, redefine the question :>
>>>
>>> If you are not tied to string processing, reading the test_file  as a
>>> string (if it is) and then splitting the string
>>> ```julia
>>>    rowstrings = map(String, split(test_file, '\n')) # need the map to
>>> avoid SubString results, if it matters
>>>    # then split the rows on ';' and convert to ?Float64 with NaN for error
>>> or ?Nullable Ints
>>>    # and put the values in a matrix, processing the matrix you have the
>>> rows and cols
>>> ```
>>>
>>>
>>> On Thursday, November 3, 2016 at 4:34:53 AM UTC-4, Tamas Papp wrote:
>>>>
>>>> Jeffrey,
>>>>
>>>> Thanks, but my question was about how to have line and column in the
>>>> error message. So I would like to have an error message like this:
>>>>
>>>> ERROR: Failed to parse "error" as type Int64 in column 2, line 3.
>>>>
>>>> My best idea so far: catch the error at each level, and add i and line
>>>> number. But this requires two try-catch-end blocks with rethrow.
>>>>
>>>> Extremely convoluted mess with rethrow here:
>>>> https://gist.github.com/tpapp/6f67ff36a228f47a1792e011d9b0fc13
>>>>
>>>> It does what I want, but it is ugly. A simpler solution would be
>>>> appreciated. I am sure I am missing something.
>>>>
>>>> Best,
>>>>
>>>> Tamas
>>>>
>>>> On Thu, Nov 03 2016, Jeffrey Sarnoff wrote:
>>>>
>>>> > Tamas,
>>>> >
>>>> > running this
>>>> >
>>>> >
>>>> >
>>>> > typealias AkoString Union{String, SubString{String}}
>>>> >
>>>> > function parsefield{T <: Real, S <: AkoString}(::Type{T}, str::S)
>>>> >     result = T(0)
>>>> >     try
>>>> >         result = parse(T, str)
>>>> >     catch ArgumentError
>>>> >         errormsg = string("Failed to parse \"",str,"\" as type ", T)
>>>> >         throw(ErrorException(errormsg))
>>>> >     end
>>>> >     return result
>>>> > end
>>>> >
>>>> > function parserow(schema, strings)
>>>> >     # keep i for reporting column, currently not used
>>>> >     [parsefield(T, string) for (i, (T, string)) in
>>>> enumerate(zip(schema,
>>>> > strings))]
>>>> > end
>>>> >
>>>> > function parsefile(io, schema)
>>>> >     line = 1
>>>> >     while !eof(io)
>>>> >         strings = split(chomp(readline(io)), ';')
>>>> >         parserow(schema, strings)
>>>> >         line += 1 # currently not used, use for error reporting
>>>> >     end
>>>> > end
>>>> >
>>>> > test_file = """
>>>> > 1;2;3
>>>> > 4;5;6
>>>> > 7;8;error
>>>> > """
>>>> >
>>>> > parsefile(IOBuffer(test_file), fill(Int, 3))
>>>> >
>>>> >
>>>> >
>>>> >
>>>> > by evaluating parsefile(...), results in
>>>> >
>>>> >
>>>> >
>>>> > julia> parsefile(IOBuffer(test_file), fill(Int, 3))
>>>> > ERROR: Failed to parse "error" as type Int64
>>>> >  in parsefield(::Type{Int64}, ::SubString{String}) at ./REPL[2]:7
>>>> >  in (::##1#2)(::Tuple{Int64,Tuple{DataType,SubString{String}}}) at
>>>> > ./<missing>:0
>>>> >  in collect_to!(::Array{Int64,1},
>>>> >
>>>> ::Base.Generator{Enumerate{Base.Zip2{Array{DataType,1},Array{SubString{String},1}}},##1#2},
>>>>
>>>> > ::Int64, ::Tuple{Int64,Tuple{Int64,Int64}}) at ./array.jl:340
>>>> >  in
>>>> >
>>>> collect(::Base.Generator{Enumerate{Base.Zip2{Array{DataType,1},Array{SubString{String},1}}},##1#2})
>>>>
>>>> > at ./array.jl:308
>>>> >  in parsefile(::Base.AbstractIOBuffer{Array{UInt8,1}},
>>>> ::Array{DataType,1})
>>>> > at ./REPL[4]:5
>>>> >
>>>> >
>>>> >
>>>> >
>>>> >
>>>> > On Wednesday, November 2, 2016 at 1:01:30 PM UTC-4, Tamas Papp wrote:
>>>> >>
>>>> >> This is a conceptual question. Consider the following (extremely
>>>> >> stylized, but self-contained) code
>>>> >>
>>>> >> parsefield{T <: Real}(::Type{T}, string) = parse(T, string)
>>>> >>
>>>> >> function parserow(schema, strings)
>>>> >>     # keep i for reporting column, currently not used
>>>> >>     [parsefield(T, string) for (i, (T, string)) in
>>>> enumerate(zip(schema,
>>>> >> strings))]
>>>> >> end
>>>> >>
>>>> >> function parsefile(io, schema)
>>>> >>     line = 1
>>>> >>     while !eof(io)
>>>> >>         strings = split(chomp(readline(io)), ';')
>>>> >>         parserow(schema, strings)
>>>> >>         line += 1 # currently not used, use for error reporting
>>>> >>     end
>>>> >> end
>>>> >>
>>>> >> test_file = """
>>>> >> 1;2;3
>>>> >> 4;5;6
>>>> >> 7;8;error
>>>> >> """
>>>> >>
>>>> >> parsefile(IOBuffer(test_file), fill(Int, 3))
>>>> >>
>>>> >> This will fail with an error message
>>>> >>
>>>> >> ERROR: ArgumentError: invalid base 10 digit 'e' in "error"
>>>> >>  in tryparse_internal(::Type{Int64}, ::SubString{String}, ::Int64,
>>>> >> ::Int64, ::Int64
>>>> >> , ::Bool) at ./parse.jl:88
>>>> >>  in parse(::Type{Int64}, ::SubString{String}) at ./parse.jl:152
>>>> >>  in parsefield(::Type{Int64}, ::SubString{String}) at ./REPL[152]:1
>>>> >>  in (::##5#6)(::Tuple{Int64,Tuple{DataType,SubString{String}}}) at
>>>> >> ./<missing>:0
>>>> >>  in collect_to!(::Array{Int64,1},
>>>> >> ::Base.Generator{Enumerate{Base.Zip2{Array{DataTy
>>>> >> pe,1},Array{SubString{String},1}}},##5#6}, ::Int64,
>>>> >> ::Tuple{Int64,Tuple{Int64,Int64
>>>> >> }}) at ./array.jl:340
>>>> >>  in
>>>> >>
>>>> collect(::Base.Generator{Enumerate{Base.Zip2{Array{DataType,1},Array{SubString{
>>>>
>>>> >>
>>>> >> String},1}}},##5#6}) at ./array.jl:308
>>>> >>  in parsefile(::Base.AbstractIOBuffer{Array{UInt8,1}},
>>>> >> ::Array{DataType,1}) at ./RE
>>>> >> PL[154]:5
>>>> >>
>>>> >> Instead, I would like to report something like this:
>>>> >>
>>>> >> ERROR: Failed to parse "error" as Int on line 3, column 3.
>>>> >>
>>>> >> What's the idiomatic way of doing this in Julia? My problem is that
>>>> >> parsefield fails without knowing line or column (i in parserow). I
>>>> could
>>>> >> catch and rethrow, constructing an error object gradually. Or I could
>>>> >> pass line and column numbers to parserow and parsefield for error
>>>> >> reporting, but that seems somehow inelegant (I have seen it in code
>>>> >> though).
>>>> >>
>>>> >> Best,
>>>> >>
>>>> >> Tamas
>>>> >>
>>>>
>>>

Reply via email to