This is an automated email from the ASF dual-hosted git repository.
kturner pushed a commit to branch elasticity
in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/elasticity by this push:
new 1123f12273 adds tests to verify estimated split file sizes (#4625)
1123f12273 is described below
commit 1123f12273c16d66bfbbd46c7bdfaa22afc1dad2
Author: Keith Turner <[email protected]>
AuthorDate: Mon Jun 3 09:34:59 2024 -0400
adds tests to verify estimated split file sizes (#4625)
---
.../apache/accumulo/test/functional/SplitIT.java | 102 +++++++++++++++++++--
1 file changed, 96 insertions(+), 6 deletions(-)
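The new test pins down an invariant of tablet splits: for every file the child
tablets share, the per-child size and entry estimates should add back up to
roughly the pre-split values. A minimal sketch of that check in isolation (the
helper name and plain-long parameters are hypothetical, not Accumulo API):

    // Sketch of the per-file invariant testEstimatedSizes enforces.
    static void checkSplitEstimates(long origSize, long origEntries,
        java.util.List<Long> childSizes, java.util.List<Long> childEntries) {
      long sizeSum = childSizes.stream().mapToLong(Long::longValue).sum();
      long entrySum = childEntries.stream().mapToLong(Long::longValue).sum();
      // per-child numbers are estimates, so the sums can drift slightly,
      // but each sum should stay within 2% of the pre-split value
      assert 1 - sizeSum / (double) origSize < 0.02;
      assert 1 - entrySum / (double) origEntries < 0.02;
    }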
diff --git a/test/src/main/java/org/apache/accumulo/test/functional/SplitIT.java b/test/src/main/java/org/apache/accumulo/test/functional/SplitIT.java
index a25b6c749e..1985b96b32 100644
--- a/test/src/main/java/org/apache/accumulo/test/functional/SplitIT.java
+++ b/test/src/main/java/org/apache/accumulo/test/functional/SplitIT.java
@@ -22,6 +22,7 @@ import static java.util.Collections.singletonMap;
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.ServerColumnFamily.LOCK_COLUMN;
import static org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.ServerColumnFamily.OPID_COLUMN;
+import static org.apache.accumulo.core.metadata.schema.TabletMetadata.ColumnType.FILES;
import static org.apache.accumulo.core.util.LazySingletons.RANDOM;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
@@ -36,7 +37,6 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
-import java.util.Random;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
@@ -58,10 +58,12 @@ import org.apache.accumulo.core.client.rfile.RFile;
import org.apache.accumulo.core.client.rfile.RFileWriter;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.TableId;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.dataImpl.KeyExtent;
import org.apache.accumulo.core.metadata.AccumuloTable;
+import org.apache.accumulo.core.metadata.schema.DataFileValue;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.TabletColumnFamily;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.util.Pair;
@@ -85,8 +87,6 @@ import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.google.common.collect.MoreCollectors;
-import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
-
public class SplitIT extends AccumuloClusterHarness {
private static final Logger log = LoggerFactory.getLogger(SplitIT.class);
@@ -135,6 +135,98 @@ public class SplitIT extends AccumuloClusterHarness {
}
}
+ // Test that checks the estimated file sizes created by a split are reasonable
+ @Test
+ public void testEstimatedSizes() throws Exception {
+ try (AccumuloClient c = Accumulo.newClient().from(getClientProps()).build()) {
+ String table = getUniqueNames(1)[0];
+
+ Map<String,String> props = new HashMap<>();
+ props.put(Property.TABLE_MAJC_RATIO.getKey(), "10");
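+ // a high compaction ratio makes it unlikely the files written below are major compacted away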
+
+ c.tableOperations().create(table, new NewTableConfiguration().setProperties(props));
+
+ var random = RANDOM.get();
+ byte[] data = new byte[1000];
+
+ try (var writer = c.createBatchWriter(table)) {
+ // create 5 files with each file covering a larger range of data
+ for (int batch = 1; batch <= 5; batch++) {
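+ // batch b writes rows 0, b, 2b, ..., 99*b, so each file covers a wider row range than the last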
+ for (int i = 0; i < 100; i++) {
+ Mutation m = new Mutation(String.format("%09d", i * batch));
+ random.nextBytes(data);
+ m.at().family("data").qualifier("random").put(data);
+ writer.addMutation(m);
+ }
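+ // flushing here turns each batch into its own rfile, so the tablet ends up with 5 files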
+ writer.flush();
+ c.tableOperations().flush(table, null, null, true);
+ }
+ }
+
+ var tableId = getServerContext().getTableId(table);
+ var files = getServerContext().getAmple().readTablet(new KeyExtent(tableId, null, null)).getFilesMap();
+
+ // map of file name and the estimates for that file from the original tablet
+ Map<String,DataFileValue> filesSizes1 = new HashMap<>();
+ files.forEach((file, dfv) -> filesSizes1.put(file.getFileName(), dfv));
+
+ TreeSet<Text> splits = new TreeSet<>();
+ for (int batch = 1; batch < 5; batch++) {
+ splits.add(new Text(String.format("%09d", batch * 100)));
+ }
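+ // split points at rows 100, 200, 300, and 400 divide the tablet into five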
+
+ c.tableOperations().addSplits(table, splits);
+
+ // map of file name and the estimates for that file from all splits
+ Map<String,List<DataFileValue>> filesSizes2 = new HashMap<>();
+ try (var tablets = getServerContext().getAmple().readTablets().forTable(tableId).fetch(FILES).build()) {
+ for (var tablet : tablets) {
+ tablet.getFilesMap().forEach((file, dfv) -> filesSizes2.computeIfAbsent(file.getFileName(), k -> new ArrayList<>()).add(dfv));
+ }
+ }
+
+ assertEquals(5, filesSizes1.size());
+ assertEquals(filesSizes1.keySet(), filesSizes2.keySet());
+
+ // given how the data lies relative to the splits, there should be a tablet w/ 1 file, a tablet w/ 2 files, etc.
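+ // e.g. file 1 (rows 0-99) overlaps one tablet, file 2 (rows 0-198) two, ..., file 5 (rows 0-495) all five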
+ assertEquals(Set.of(1, 2, 3, 4, 5),
+ filesSizes2.values().stream().map(List::size).collect(Collectors.toSet()));
+
+ // compare the estimates from the split tablets to the estimates from the original tablet
+ for (var fileName : filesSizes1.keySet()) {
+ var origSize = filesSizes1.get(fileName);
+
+ // wrote 100 entries to each file
+ assertEquals(100, origSize.getNumEntries());
+ // wrote 100 values of 1000 random bytes each, which should not compress
+ assertTrue(origSize.getSize() > 100_000);
+ assertTrue(origSize.getSize() < 110_000);
+
+ var splitSize = filesSizes2.get(fileName).stream().mapToLong(DataFileValue::getSize).sum();
+ var diff = 1 - splitSize / (double) origSize.getSize();
+ // the sum of the sizes in the split should be very close to the original
+ assertTrue(diff < .02,
+ "diff:" + diff + " file:" + fileName + " orig:" + origSize + " split:" + splitSize);
+
+ var splitEntries = filesSizes2.get(fileName).stream().mapToLong(DataFileValue::getNumEntries).sum();
+ diff = 1 - splitEntries / (double) origSize.getNumEntries();
+ // the sum of the entries in the split should be very close to the original
+ assertTrue(diff < .02,
+ "diff:" + diff + " file:" + fileName + " orig:" + origSize + " split:" + splitEntries);
+
+ // all the splits should have the same file size and estimate because the data was evenly split
+ assertEquals(1, filesSizes2.get(fileName).stream().distinct().count(),
+ "" + filesSizes2.get(fileName));
+ }
+ }
+ }
+
@Test
public void tabletShouldSplit() throws Exception {
try (AccumuloClient c = Accumulo.newClient().from(getClientProps()).build()) {
@@ -338,8 +430,6 @@ public class SplitIT extends AccumuloClusterHarness {
return dir;
}
- @SuppressFBWarnings(value = {"PREDICTABLE_RANDOM", "DMI_RANDOM_USED_ONLY_ONCE"},
- justification = "predictable random with specific seed is intended for this test")
@Test
public void bulkImportThatCantSplitHangsCompaction() throws Exception {
@@ -361,7 +451,7 @@ public class SplitIT extends AccumuloClusterHarness {
c.tableOperations().create(tableName, new NewTableConfiguration()
.setProperties(singletonMap(Property.TABLE_SPLIT_THRESHOLD.getKey(), "10K")));
- Random random = new Random();
+ var random = RANDOM.get();
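+ // the shared RANDOM singleton replaces the locally created Random, so the findbugs suppression removed above is no longer needed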
byte[] val = new byte[100];
String dir = getDir();