This is an automated email from the ASF dual-hosted git repository.
kturner pushed a commit to branch elasticity
in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/elasticity by this push:
new 1123f12273 adds tests to verify estimated split file sizes (#4625)
1123f12273 is described below
commit 1123f12273c16d66bfbbd46c7bdfaa22afc1dad2
Author: Keith Turner <[email protected]>
AuthorDate: Mon Jun 3 09:34:59 2024 -0400
adds tests to verify estimated split file sizes (#4625)
---
.../apache/accumulo/test/functional/SplitIT.java | 102 +++++++++++++++++++--
1 file changed, 96 insertions(+), 6 deletions(-)
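The new test pins down an invariant of tablet splits: for every file the child
tablets share, the per-child size and entry estimates should add back up to
roughly the pre-split values. A minimal sketch of that check in isolation (the
helper name and plain-long parameters are hypothetical, not Accumulo API):

    // Sketch of the per-file invariant testEstimatedSizes enforces.
    static void checkSplitEstimates(long origSize, long origEntries,
        java.util.List<Long> childSizes, java.util.List<Long> childEntries) {
      long sizeSum = childSizes.stream().mapToLong(Long::longValue).sum();
      long entrySum = childEntries.stream().mapToLong(Long::longValue).sum();
      // per-child numbers are estimates, so the sums can drift slightly,
      // but each sum should stay within 2% of the pre-split value
      assert 1 - sizeSum / (double) origSize < 0.02;
      assert 1 - entrySum / (double) origEntries < 0.02;
    }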
diff --git a/test/src/main/java/org/apache/accumulo/test/functional/SplitIT.java b/test/src/main/java/org/apache/accumulo/test/functional/SplitIT.java
index a25b6c749e..1985b96b32 100644
--- a/test/src/main/java/org/apache/accumulo/test/functional/SplitIT.java
+++ b/test/src/main/java/org/apache/accumulo/test/functional/SplitIT.java
@@ -22,6 +22,7 @@ import static java.util.Collections.singletonMap;
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.ServerColumnFamily.LOCK_COLUMN;
import static org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.ServerColumnFamily.OPID_COLUMN;
+import static org.apache.accumulo.core.metadata.schema.TabletMetadata.ColumnType.FILES;
import static org.apache.accumulo.core.util.LazySingletons.RANDOM;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
@@ -36,7 +37,6 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
-import java.util.Random;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
@@ -58,10 +58,12 @@ import org.apache.accumulo.core.client.rfile.RFile;
import org.apache.accumulo.core.client.rfile.RFileWriter;
import org.apache.accumulo.core.conf.Property;
import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.TableId;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.dataImpl.KeyExtent;
import org.apache.accumulo.core.metadata.AccumuloTable;
+import org.apache.accumulo.core.metadata.schema.DataFileValue;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection.TabletColumnFamily;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.util.Pair;
@@ -85,8 +87,6 @@ import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.google.common.collect.MoreCollectors;
-import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
-
public class SplitIT extends AccumuloClusterHarness {
private static final Logger log = LoggerFactory.getLogger(SplitIT.class);
@@ -135,6 +135,98 @@ public class SplitIT extends AccumuloClusterHarness {
}
}
+ // Test that checks the estimated file sizes created by a split are reasonable
+ @Test
+ public void testEstimatedSizes() throws Exception {
+ try (AccumuloClient c = Accumulo.newClient().from(getClientProps()).build()) {
+ String table = getUniqueNames(1)[0];
+
+ Map<String,String> props = new HashMap<>();
+ props.put(Property.TABLE_MAJC_RATIO.getKey(), "10");
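+ // a high compaction ratio makes it unlikely the files written below are major compacted away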
+
+ c.tableOperations().create(table, new NewTableConfiguration().setProperties(props));
+
+ var random = RANDOM.get();
+ byte[] data = new byte[1000];
+
+ try (var writer = c.createBatchWriter(table)) {
+ // create 5 files with each file covering a larger range of data
+ for (int batch = 1; batch <= 5; batch++) {
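+ // batch b writes rows 0, b, 2b, ..., 99*b, so each file covers a wider row range than the last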
+ for (int i = 0; i < 100; i++) {
+ Mutation m = new Mutation(String.format("%09d", i * batch));
+ random.nextBytes(data);
+ m.at().family("data").qualifier("random").put(data);
+ writer.addMutation(m);
+ }
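+ // flushing here turns each batch into its own rfile, so the tablet ends up with 5 files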
+ writer.flush();
+ c.tableOperations().flush(table, null, null, true);
+ }
+ }
+
+ var tableId = getServerContext().getTableId(table);
+ var files = getServerContext().getAmple().readTablet(new KeyExtent(tableId, null, null)).getFilesMap();
+
+ // map of file name and the estimates for that file from the original tablet
+ Map<String,DataFileValue> filesSizes1 = new HashMap<>();
+ files.forEach((file, dfv) -> filesSizes1.put(file.getFileName(), dfv));
+
+ TreeSet<Text> splits = new TreeSet<>();
+ for (int batch = 1; batch < 5; batch++) {
+ splits.add(new Text(String.format("%09d", batch * 100)));
+ }
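+ // split points at rows 100, 200, 300, and 400 divide the tablet into five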
+
+ c.tableOperations().addSplits(table, splits);
+
+ // map of file name and the estimates for that file from all splits
+ Map<String,List<DataFileValue>> filesSizes2 = new HashMap<>();
+ try (var tablets = getServerContext().getAmple().readTablets().forTable(tableId).fetch(FILES).build()) {
+ for (var tablet : tablets) {
+ tablet.getFilesMap().forEach((file, dfv) -> filesSizes2.computeIfAbsent(file.getFileName(), k -> new ArrayList<>()).add(dfv));
+ }
+ }
+
+ assertEquals(5, filesSizes1.size());
+ assertEquals(filesSizes1.keySet(), filesSizes2.keySet());
+
+ // given how the data lies relative to the splits, there should be a tablet w/ 1 file, a tablet w/ 2 files, etc.
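+ // e.g. file 1 (rows 0-99) overlaps one tablet, file 2 (rows 0-198) two, ..., file 5 (rows 0-495) all five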
+ assertEquals(Set.of(1, 2, 3, 4, 5),
+ filesSizes2.values().stream().map(List::size).collect(Collectors.toSet()));
+
+ // compare the estimates from the split tablets to the estimates from the original tablet
+ for (var fileName : filesSizes1.keySet()) {
+ var origSize = filesSizes1.get(fileName);
+
+ // wrote 100 entries to each file
+ assertEquals(100, origSize.getNumEntries());
+ // wrote 100 values of 1000 random bytes each, which should not compress
+ assertTrue(origSize.getSize() > 100_000);
+ assertTrue(origSize.getSize() < 110_000);
+
+ var splitSize = filesSizes2.get(fileName).stream().mapToLong(DataFileValue::getSize).sum();
+ var diff = 1 - splitSize / (double) origSize.getSize();
+ // the sum of the sizes in the split should be very close to the original
+ assertTrue(diff < .02,
+ "diff:" + diff + " file:" + fileName + " orig:" + origSize + " split:" + splitSize);
+
+ var splitEntries = filesSizes2.get(fileName).stream().mapToLong(DataFileValue::getNumEntries).sum();
+ diff = 1 - splitEntries / (double) origSize.getNumEntries();
+ // the sum of the entries in the split should be very close to the original
+ assertTrue(diff < .02,
+ "diff:" + diff + " file:" + fileName + " orig:" + origSize + " split:" + splitEntries);
+
+ // all the splits should have the same file size and estimate because the data was evenly split
+ assertEquals(1, filesSizes2.get(fileName).stream().distinct().count(),
+ "" + filesSizes2.get(fileName));
+ }
+ }
+ }
+
@Test
public void tabletShouldSplit() throws Exception {
try (AccumuloClient c = Accumulo.newClient().from(getClientProps()).build()) {
@@ -338,8 +430,6 @@ public class SplitIT extends AccumuloClusterHarness {
return dir;
}
- @SuppressFBWarnings(value = {"PREDICTABLE_RANDOM", "DMI_RANDOM_USED_ONLY_ONCE"},
- justification = "predictable random with specific seed is intended for this test")
@Test
public void bulkImportThatCantSplitHangsCompaction() throws Exception {
@@ -361,7 +451,7 @@ public class SplitIT extends AccumuloClusterHarness {
c.tableOperations().create(tableName, new NewTableConfiguration()
.setProperties(singletonMap(Property.TABLE_SPLIT_THRESHOLD.getKey(), "10K")));
- Random random = new Random();
+ var random = RANDOM.get();
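+ // the shared RANDOM singleton replaces the locally created Random, so the findbugs suppression removed above is no longer needed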
byte[] val = new byte[100];
String dir = getDir();