codope commented on code in PR #6016:
URL: https://github.com/apache/hudi/pull/6016#discussion_r931683314
##########
hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java:
##########
@@ -244,12 +255,7 @@ private Map<PartitionPath, FileStatus[]>
loadPartitionPathFiles() {
);
fetchedPartitionToFiles =
- FSUtils.getFilesInPartitions(
- engineContext,
- metadataConfig,
- basePath,
- fullPartitionPathsMapToFetch.keySet().toArray(new String[0]),
- fileSystemStorageConfig.getSpillableDir())
Review Comment:
So if this is not needed then let's remove and avoid instantiating
`fileSystemStorageConfig` in the constructor.
##########
hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestHoodieSparkMergeOnReadTableCompaction.java:
##########
@@ -98,8 +99,13 @@ public void testWriteDuringCompaction() throws IOException {
.withLayoutConfig(HoodieLayoutConfig.newBuilder()
.withLayoutType(HoodieStorageLayout.LayoutType.BUCKET.name())
.withLayoutPartitioner(SparkBucketIndexPartitioner.class.getName()).build())
-
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BUCKET).withBucketNum("1").build()).build();
- metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ,
config.getProps());
+
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BUCKET).withBucketNum("1").build())
+ .build();
+
+ Properties props = getPropertiesForKeyGen(true);
Review Comment:
pass `HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()` instead of
hard-coding true?
##########
hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java:
##########
@@ -353,12 +395,14 @@ public String getPath() {
return path;
}
- Path fullPartitionPath(String basePath) {
+ Path fullPartitionPath(Path basePath) {
if (!path.isEmpty()) {
- return new CachingPath(basePath, path);
+ // NOTE: Since we now that the path is a proper relative path that
doesn't require
Review Comment:
```suggestion
// NOTE: Since we know that the path is a proper relative path that
doesn't require
```
##########
hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java:
##########
@@ -398,7 +398,7 @@ public HoodieArchivedTimeline getArchivedTimeline(String
startTs) {
public void validateTableProperties(Properties properties) {
// Once meta fields are disabled, it cant be re-enabled for a given table.
if (!getTableConfig().populateMetaFields()
- && Boolean.parseBoolean((String)
properties.getOrDefault(HoodieTableConfig.POPULATE_META_FIELDS.key(),
HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()))) {
+ && Boolean.parseBoolean((String)
properties.getOrDefault(HoodieTableConfig.POPULATE_META_FIELDS.key(),
HoodieTableConfig.POPULATE_META_FIELDS.defaultValue().toString()))) {
Review Comment:
is it necessary? it's already being type cast to String
##########
hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java:
##########
@@ -167,9 +164,8 @@ protected ClosableIterator<IndexedRecord>
deserializeRecords(byte[] content) thr
// Get schema from the header
Schema writerSchema = new
Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
- FileSystem fs = FSUtils.getFs(pathForReader.toString(), new
Configuration());
// Read the content
- HoodieHFileReader<IndexedRecord> reader = new HoodieHFileReader<>(fs,
pathForReader, content, Option.of(writerSchema));
+ HoodieHFileReader<IndexedRecord> reader = new HoodieHFileReader<>(null,
pathForReader, content, Option.of(writerSchema));
Review Comment:
This could affect HFile reading. I believe there is some validation in HFile
system or HFile's reader context for fs to be non-null. I think we should still
pass `fs` and still keep this line in `HoodieHFileUtils#createHFileReader`:
```
Configuration conf = new Configuration(false);
```
##########
hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java:
##########
@@ -124,28 +125,28 @@ public static HoodieTableMetaClient init(Configuration
hadoopConf, String basePa
}
public static HoodieTableMetaClient init(Configuration hadoopConf, String
basePath, HoodieTableType tableType,
- Properties properties)
- throws IOException {
- properties = HoodieTableMetaClient.withPropertyBuilder()
- .setTableName(RAW_TRIPS_TEST_NAME)
- .setTableType(tableType)
- .setPayloadClass(HoodieAvroPayload.class)
- .fromProperties(properties)
- .build();
- return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf,
basePath, properties);
+ Properties properties) throws
IOException {
+ return init(hadoopConf, basePath, tableType, properties, null);
}
public static HoodieTableMetaClient init(Configuration hadoopConf, String
basePath, HoodieTableType tableType,
Properties properties, String
databaseName)
throws IOException {
- properties = HoodieTableMetaClient.withPropertyBuilder()
- .setDatabaseName(databaseName)
- .setTableName(RAW_TRIPS_TEST_NAME)
- .setTableType(tableType)
- .setPayloadClass(HoodieAvroPayload.class)
- .fromProperties(properties)
- .build();
- return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf,
basePath, properties);
+ HoodieTableMetaClient.PropertyBuilder builder =
+ HoodieTableMetaClient.withPropertyBuilder()
+ .setDatabaseName(databaseName)
+ .setTableName(RAW_TRIPS_TEST_NAME)
+ .setTableType(tableType)
+ .setPayloadClass(HoodieAvroPayload.class);
+
+ String keyGen =
properties.getProperty("hoodie.datasource.write.keygenerator.class");
+ if (!Objects.equals(keyGen,
"org.apache.hudi.keygen.NonpartitionedKeyGenerator")) {
+ builder.setPartitionFields("some_nonexistent_field");
Review Comment:
extract to constant to standardize across tests?
##########
hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java:
##########
@@ -138,13 +144,17 @@ public FileStatus[] getAllFilesInPartition(Path
partitionPath)
}
}
- return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf,
dataBasePath, metadataConfig.shouldAssumeDatePartitioning())
+ return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf,
dataBasePath.toString(), metadataConfig.shouldAssumeDatePartitioning())
.getAllFilesInPartition(partitionPath);
}
@Override
public Map<String, FileStatus[]> getAllFilesInPartitions(List<String>
partitions)
throws IOException {
+ if (partitions.isEmpty()) {
+ return Collections.emptyMap();
Review Comment:
If `BaseHoodieTableFileIndex#getAllPartitionPathsUnchecked` returns
`Collections.singletonList("")` then should we add an entry for `""` in the
map, or rather make `getAllPartitionPathsUnchecked` return
`Collections.emptyList()`
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala:
##########
@@ -341,4 +340,9 @@ object SparkHoodieTableFileIndex {
override def invalidate(): Unit = cache.invalidateAll()
}
}
+
+ private def shouldValidatePartitionColumns(spark: SparkSession): Boolean = {
+ // NOTE: We can't use helper, method nor the config-entry to stay
compatible w/ Spark 2.4
+
spark.sessionState.conf.getConfString("spark.sql.sources.validatePartitionColumns",
"true").toBoolean
Review Comment:
Should this go into Spark2Adapter or Spark2ParsePartitionUtil?
##########
hudi-common/src/main/java/org/apache/hudi/hadoop/CachingPath.java:
##########
@@ -83,4 +96,41 @@ public String toString() {
}
return fullPathStr;
}
+
+ public CachingPath subPath(String relativePath) {
+ return new CachingPath(this, createPathUnsafe(relativePath));
+ }
+
+ public static CachingPath wrap(Path path) {
+ if (path instanceof CachingPath) {
+ return (CachingPath) path;
+ }
+
+ return new CachingPath(path.toUri());
+ }
+
+ /**
+ * TODO elaborate
+ */
+ public static CachingPath createPathUnsafe(String relativePath) {
+ try {
+ // NOTE: {@code normalize} is going to be invoked by {@code Path} ctor,
so there's no
+ // point in invoking it here
+ URI uri = new URI(null, null, relativePath, null, null);
+ return new CachingPath(uri);
+ } catch (URISyntaxException e) {
+ throw new HoodieException("Failed to instantiate relative path", e);
+ }
+ }
+
+ /**
+ * TODO elaborate
Review Comment:
todo? javadoc only right?
##########
hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java:
##########
@@ -324,6 +335,26 @@ private void doRefresh() {
LOG.info(String.format("Refresh table %s, spent: %d ms",
metaClient.getTableConfig().getTableName(), duration));
}
+ private Map<String, FileStatus[]>
getAllFilesInPartitionsUnchecked(Collection<String>
fullPartitionPathsMapToFetch) {
+ try {
+ return tableMetadata.getAllFilesInPartitions(new
ArrayList<>(fullPartitionPathsMapToFetch));
+ } catch (IOException e) {
+ throw new HoodieIOException("Failed to list partition paths for a
table", e);
+ }
+ }
+
+ private List<String> getAllPartitionPathsUnchecked() {
+ try {
+ if (partitionColumns.length == 0) {
+ return Collections.singletonList("");
Review Comment:
Should it be `Collections.emptyList()`?
##########
hudi-common/src/main/java/org/apache/hudi/hadoop/CachingPath.java:
##########
@@ -32,11 +33,12 @@
* NOTE: This class is thread-safe
*/
@ThreadSafe
-public class CachingPath extends Path implements Serializable {
Review Comment:
`Path` is not serializable right? Have we vetted all paths that this does
not get sent to executors (to avoid TaskNotSerializable execption)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]