zxealous commented on code in PR #12897: URL: https://github.com/apache/doris/pull/12897#discussion_r982998726
########## be/src/io/cache/file_cache_manager.cpp: ########## @@ -18,15 +18,39 @@ #include "io/cache/file_cache_manager.h" #include "gutil/strings/util.h" +#include "io/cache/dummy_file_cache.h" #include "io/cache/sub_file_cache.h" #include "io/cache/whole_file_cache.h" #include "io/fs/local_file_system.h" +#include "olap/storage_engine.h" #include "util/file_utils.h" #include "util/string_util.h" namespace doris { namespace io { +void GCContextPerDisk::init(const std::string& path, int64_t max_size) { + _disk_path = path; + _conf_max_size = max_size; + _used_size = 0; +} +bool GCContextPerDisk::try_add_file_cache(FileCachePtr cache, int64_t file_size) { + if (cache->cache_dir().string().substr(0, _disk_path.size()) == _disk_path) { Review Comment: May be we should determine whether the cache path exists in the _lru_queue? For example, When there is a cache file in the disk but not in the memory, the cache will be add to the _lru_queue. But when the cache needs to be used to create the cache in the memory, the cache pointer will also be added to the _lru_queue in the cleanup task. The cache last match time in the lru queue will be earlier, which may cause the cache to be cleared in advance? ########## be/src/io/cache/file_cache_manager.cpp: ########## @@ -56,88 +80,82 @@ void FileCacheManager::remove_file_cache(const std::string& cache_path) { } } -void FileCacheManager::clean_timeout_caches() { - std::shared_lock<std::shared_mutex> rdlock(_cache_map_lock); - for (std::map<std::string, FileCachePtr>::const_iterator iter = _file_cache_map.cbegin(); - iter != _file_cache_map.cend(); ++iter) { - if (iter->second == nullptr) { - continue; +void FileCacheManager::_add_file_cache_for_gc_by_disk(std::vector<GCContextPerDisk>& contexts, + FileCachePtr file_cache) { + // sort file cache by last match time + if (config::file_cache_max_size_per_disk_gb > 0) { + auto file_size = file_cache->cache_file_size(); + if (file_size <= 0) { + return; + } + for (size_t i = 0; i < contexts.size(); ++i) { + if (contexts[i].try_add_file_cache(file_cache, file_size)) { + break; + } } - iter->second->clean_timeout_cache(); } } +void FileCacheManager::gc_file_caches() { + int64_t gc_conf_size = config::file_cache_max_size_per_disk_gb * 1024 * 1024 * 1024; + std::vector<GCContextPerDisk> contexts; + // init for GC by disk size + if (gc_conf_size > 0) { + std::vector<DataDir*> data_dirs = doris::StorageEngine::instance()->get_stores(); + contexts.resize(data_dirs.size()); + for (size_t i = 0; i < contexts.size(); ++i) { + contexts[i].init(data_dirs[i]->path(), gc_conf_size); + } + } -void FileCacheManager::clean_timeout_file_not_in_mem(const std::string& cache_path) { - time_t now = time(nullptr); + // process unused file caches std::shared_lock<std::shared_mutex> rdlock(_cache_map_lock); - // Deal with caches not in _file_cache_map - if (_file_cache_map.find(cache_path) == _file_cache_map.end()) { - std::vector<Path> cache_file_names; - if (io::global_local_filesystem()->list(cache_path, &cache_file_names).ok()) { - std::map<std::string, bool> cache_names; - std::list<std::string> done_names; - for (Path cache_file_name : cache_file_names) { - std::string filename = cache_file_name.native(); - if (!ends_with(filename, CACHE_DONE_FILE_SUFFIX)) { - cache_names[filename] = true; - continue; - } - done_names.push_back(filename); - std::stringstream done_file_ss; - done_file_ss << cache_path << "/" << filename; - std::string done_file_path = done_file_ss.str(); - time_t m_time; - if (!FileUtils::mtime(done_file_path, &m_time).ok()) { - continue; - } - if (now - m_time < config::file_cache_alive_time_sec) { - continue; - } - std::string cache_file_path = - StringReplace(done_file_path, CACHE_DONE_FILE_SUFFIX, "", true); - LOG(INFO) << "Delete timeout done_cache_path: " << done_file_path - << ", cache_file_path: " << cache_file_path << ", m_time: " << m_time; - if (!io::global_local_filesystem()->delete_file(done_file_path).ok()) { - LOG(ERROR) << "delete_file failed: " << done_file_path; + std::vector<TabletSharedPtr> tablets = + StorageEngine::instance()->tablet_manager()->get_all_tablet(); + for (const auto& tablet : tablets) { + std::vector<Path> seg_file_paths; + if (io::global_local_filesystem()->list(tablet->tablet_path(), &seg_file_paths).ok()) { + for (Path seg_file : seg_file_paths) { + std::string seg_filename = seg_file.native(); + // check if it is a dir name + if (ends_with(seg_filename, ".dat")) { continue; } - if (!io::global_local_filesystem()->delete_file(cache_file_path).ok()) { - LOG(ERROR) << "delete_file failed: " << cache_file_path; + // skip file cache already in memory + std::stringstream ss; + ss << tablet->tablet_path() << "/" << seg_filename; + std::string cache_path = ss.str(); + if (_file_cache_map.find(cache_path) != _file_cache_map.end()) { Review Comment: Should we determine whether DummyFileCache exists at the same time? Otherwise, each time the cleaning task is executed, a DummyFileCache object will be created for the cache that is not cleaned only in the disk -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org