Hi Gary, what do you think about the Decorator approach, suggested by Adrian Crum in JIRA?
Benedikt 2014-10-29 6:44 GMT+01:00 <ggreg...@apache.org>: > Author: ggregory > Date: Wed Oct 29 05:44:40 2014 > New Revision: 1635052 > > URL: http://svn.apache.org/r1635052 > Log: > [CSV-131] Save positions of records to enable random access. The floor is > open for code review and further discussion based on the comments in the > Jira. > > Modified: > commons/proper/csv/trunk/src/changes/changes.xml > > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java > > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVRecord.java > > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVParserTest.java > > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVRecordTest.java > > Modified: commons/proper/csv/trunk/src/changes/changes.xml > URL: > http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/changes/changes.xml?rev=1635052&r1=1635051&r2=1635052&view=diff > > ============================================================================== > --- commons/proper/csv/trunk/src/changes/changes.xml (original) > +++ commons/proper/csv/trunk/src/changes/changes.xml Wed Oct 29 05:44:40 > 2014 > @@ -39,6 +39,7 @@ > </properties> > <body> > <release version="1.1" date="2014-mm-dd" description="Feature and bug > fix release"> > + <action issue="CSV-131" type="add" dev="ggregory" due-to="Holger > Stratmann">Save positions of records to enable random access</action> > <action issue="CSV-130" type="fix" dev="ggregory" due-to="Sergei > Lebedev">CSVFormat#withHeader doesn't work well with #printComment, add > withHeaderComments(String...)</action> > <action issue="CSV-128" type="fix" dev="ggregory">CSVFormat.EXCEL > should ignore empty header names</action> > <action issue="CSV-129" type="add" dev="ggregory">Add > CSVFormat#with 0-arg methods matching boolean arg methods</action> > > Modified: > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java > URL: > http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java?rev=1635052&r1=1635051&r2=1635052&view=diff > > ============================================================================== > --- > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java > (original) > +++ > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVParser.java > Wed Oct 29 05:44:40 2014 > @@ -219,6 +219,12 @@ public final class CSVParser implements > private final List<String> record = new ArrayList<String>(); > > private long recordNumber; > + > + /** > + * Lexer offset if the parser does not start parsing at the beginning > of the source. Usually used in combination > + * with {@link #setNextRecordNumber(long)} > + */ > + private long characterOffset; > > private final Token reusableToken = new Token(); > > @@ -296,6 +302,43 @@ public final class CSVParser implements > } > > /** > + * Sets the record number to be assigned to the next record read. > + * <p> > + * Use this if the reader is not positioned at the first record when > you create the parser. For example, the first > + * record read might be the 51st record in the source file. > + * </p> > + * <p> > + * If you want the records to also have the correct character > position referring to the underlying source, call > + * {@link #setNextCharacterPosition(long)}. > + * </p> > + * > + * @param nextRecordNumber > + * the next record number > + * @since 1.1 > + */ > + public void setNextRecordNumber(long nextRecordNumber) { > + this.recordNumber = nextRecordNumber - 1; > + } > + > + /** > + * Sets the current position in the source stream regardless of where > the parser and lexer start reading. > + * <p> > + * For example: We open a file and seek to position 5434 in order to > start reading at record 42. In order to have > + * the parser assign the correct characterPosition to records, we > call this method. > + * </p> > + * <p> > + * If you want the records to also have the correct record numbers, > call {@link #setNextRecordNumber(long)} > + * </p> > + * > + * @param position > + * the new character position > + * @since 1.1 > + */ > + public void setNextCharacterPosition(long position) { > + this.characterOffset = position - lexer.getCharacterPosition(); > + } > + > + /** > * Returns the current record number in the input stream. > * > * <p> > @@ -445,6 +488,7 @@ public final class CSVParser implements > CSVRecord result = null; > this.record.clear(); > StringBuilder sb = null; > + final long startCharPosition = lexer.getCharacterPosition() + > this.characterOffset; > do { > this.reusableToken.reset(); > this.lexer.nextToken(this.reusableToken); > @@ -480,7 +524,7 @@ public final class CSVParser implements > this.recordNumber++; > final String comment = sb == null ? null : sb.toString(); > result = new CSVRecord(this.record.toArray(new > String[this.record.size()]), this.headerMap, comment, > - this.recordNumber); > + this.recordNumber, startCharPosition); > } > return result; > } > > Modified: > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVRecord.java > URL: > http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVRecord.java?rev=1635052&r1=1635051&r2=1635052&view=diff > > ============================================================================== > --- > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVRecord.java > (original) > +++ > commons/proper/csv/trunk/src/main/java/org/apache/commons/csv/CSVRecord.java > Wed Oct 29 05:44:40 2014 > @@ -36,6 +36,8 @@ public final class CSVRecord implements > > private static final long serialVersionUID = 1L; > > + private final long characterPosition; > + > /** The accumulated comments (if any) */ > private final String comment; > > @@ -44,15 +46,16 @@ public final class CSVRecord implements > > /** The record number. */ > private final long recordNumber; > - > + > /** The values of the record */ > private final String[] values; > > - CSVRecord(final String[] values, final Map<String, Integer> mapping, > final String comment, final long recordNumber) { > + CSVRecord(final String[] values, final Map<String, Integer> mapping, > final String comment, final long recordNumber, long characterPosition) { > this.recordNumber = recordNumber; > this.values = values != null ? values : EMPTY_STRING_ARRAY; > this.mapping = mapping; > this.comment = comment; > + this.characterPosition = characterPosition; > } > > /** > @@ -110,6 +113,16 @@ public final class CSVRecord implements > } > > /** > + * Returns the start position of this record as a character position > in the source stream. This may or may not > + * correspond to the byte position depending on the character set. > + * > + * @return the position of this record in the source stream. > + */ > + public long getCharacterPosition() { > + return characterPosition; > + } > + > + /** > * Returns the comment for this record, if any. > * > * @return the comment for this record, or null if no comment for > this record is available. > > Modified: > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVParserTest.java > URL: > http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVParserTest.java?rev=1635052&r1=1635051&r2=1635052&view=diff > > ============================================================================== > --- > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVParserTest.java > (original) > +++ > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVParserTest.java > Wed Oct 29 05:44:40 2014 > @@ -299,22 +299,23 @@ public class CSVParserTest { > } > } > > -// @Test > -// public void testStartWithEmptyLinesThenHeaders() throws Exception { > -// final String[] codes = { "\r\n\r\n\r\nhello,\r\n\r\n\r\n", > "hello,\n\n\n", "hello,\"\"\r\n\r\n\r\n", "hello,\"\"\n\n\n" }; > -// final String[][] res = { { "hello", "" }, { "" }, // Excel > format does not ignore empty lines > -// { "" } }; > -// for (final String code : codes) { > -// final CSVParser parser = CSVParser.parse(code, > CSVFormat.EXCEL); > -// final List<CSVRecord> records = parser.getRecords(); > -// assertEquals(res.length, records.size()); > -// assertTrue(records.size() > 0); > -// for (int i = 0; i < res.length; i++) { > -// assertArrayEquals(res[i], records.get(i).values()); > -// } > -// parser.close(); > -// } > -// } > + // @Test > + // public void testStartWithEmptyLinesThenHeaders() throws Exception { > + // final String[] codes = { "\r\n\r\n\r\nhello,\r\n\r\n\r\n", > "hello,\n\n\n", "hello,\"\"\r\n\r\n\r\n", > + // "hello,\"\"\n\n\n" }; > + // final String[][] res = { { "hello", "" }, { "" }, // Excel format > does not ignore empty lines > + // { "" } }; > + // for (final String code : codes) { > + // final CSVParser parser = CSVParser.parse(code, CSVFormat.EXCEL); > + // final List<CSVRecord> records = parser.getRecords(); > + // assertEquals(res.length, records.size()); > + // assertTrue(records.size() > 0); > + // for (int i = 0; i < res.length; i++) { > + // assertArrayEquals(res[i], records.get(i).values()); > + // } > + // parser.close(); > + // } > + // } > > @Test > public void testEndOfFileBehaviorCSV() throws Exception { > @@ -475,6 +476,16 @@ public class CSVParserTest { > } > > @Test > + public void testGetRecordPositionWithCRLF() throws Exception { > + this.validateRecordPosition(CRLF); > + } > + > + @Test > + public void testGetRecordPositionWithLF() throws Exception { > + this.validateRecordPosition(String.valueOf(LF)); > + } > + > + @Test > public void testGetOneLine() throws IOException { > final CSVParser parser = CSVParser.parse(CSV_INPUT_1, > CSVFormat.DEFAULT); > final CSVRecord record = parser.getRecords().get(0); > @@ -902,4 +913,65 @@ public class CSVParserTest { > parser.close(); > } > > + private void validateRecordPosition(final String lineSeparator) > throws IOException { > + final String nl = lineSeparator; // used as linebreak in values > for better distinction > + > + String code = "a,b,c" + lineSeparator + "1,2,3" + lineSeparator + > + // to see if recordPosition correctly points to the enclosing > quote > + "'A" + nl + "A','B" + nl + "B',CC" + lineSeparator + > + // unicode test... not very relevant while operating on > strings instead of bytes, but for > + // completeness... > + "\u00c4,\u00d6,\u00dc" + lineSeparator + "EOF,EOF,EOF"; > + > + final CSVFormat format = > CSVFormat.newFormat(',').withQuote('\'').withRecordSeparator(lineSeparator); > + CSVParser parser = CSVParser.parse(code, format); > + > + CSVRecord record; > + assertEquals(0, parser.getRecordNumber()); > + > + assertNotNull(record = parser.nextRecord()); > + assertEquals(1, record.getRecordNumber()); > + assertEquals(code.indexOf('a'), record.getCharacterPosition()); > + > + assertNotNull(record = parser.nextRecord()); > + assertEquals(2, record.getRecordNumber()); > + assertEquals(code.indexOf('1'), record.getCharacterPosition()); > + > + assertNotNull(record = parser.nextRecord()); > + final long positionRecord3 = record.getCharacterPosition(); > + assertEquals(3, record.getRecordNumber()); > + assertEquals(code.indexOf("'A"), record.getCharacterPosition()); > + assertEquals("A" + lineSeparator + "A", record.get(0)); > + assertEquals("B" + lineSeparator + "B", record.get(1)); > + assertEquals("CC", record.get(2)); > + > + assertNotNull(record = parser.nextRecord()); > + assertEquals(4, record.getRecordNumber()); > + assertEquals(code.indexOf('\u00c4'), > record.getCharacterPosition()); > + > + assertNotNull(record = parser.nextRecord()); > + assertEquals(5, record.getRecordNumber()); > + assertEquals(code.indexOf("EOF"), record.getCharacterPosition()); > + > + parser.close(); > + > + // now try to read starting at record 3 > + parser = CSVParser.parse(code.substring((int) positionRecord3), > format); > + parser.setNextRecordNumber(3); > + parser.setNextCharacterPosition(positionRecord3); > + > + assertNotNull(record = parser.nextRecord()); > + assertEquals(3, record.getRecordNumber()); > + assertEquals(code.indexOf("'A"), record.getCharacterPosition()); > + assertEquals("A" + lineSeparator + "A", record.get(0)); > + assertEquals("B" + lineSeparator + "B", record.get(1)); > + assertEquals("CC", record.get(2)); > + > + assertNotNull(record = parser.nextRecord()); > + assertEquals(4, record.getRecordNumber()); > + assertEquals(code.indexOf('\u00c4'), > record.getCharacterPosition()); > + assertEquals("\u00c4", record.get(0)); > + > + parser.close(); > + } > } > > Modified: > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVRecordTest.java > URL: > http://svn.apache.org/viewvc/commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVRecordTest.java?rev=1635052&r1=1635051&r2=1635052&view=diff > > ============================================================================== > --- > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVRecordTest.java > (original) > +++ > commons/proper/csv/trunk/src/test/java/org/apache/commons/csv/CSVRecordTest.java > Wed Oct 29 05:44:40 2014 > @@ -45,12 +45,12 @@ public class CSVRecordTest { > @Before > public void setUp() throws Exception { > values = new String[] { "A", "B", "C" }; > - record = new CSVRecord(values, null, null, 0); > + record = new CSVRecord(values, null, null, 0, -1); > header = new HashMap<String, Integer>(); > header.put("first", Integer.valueOf(0)); > header.put("second", Integer.valueOf(1)); > header.put("third", Integer.valueOf(2)); > - recordWithHeader = new CSVRecord(values, header, null, 0); > + recordWithHeader = new CSVRecord(values, header, null, 0, -1); > } > > @Test > > > -- http://people.apache.org/~britter/ http://www.systemoutprintln.de/ http://twitter.com/BenediktRitter http://github.com/britter