zxealous commented on code in PR #12897:
URL: https://github.com/apache/doris/pull/12897#discussion_r982998726


##########
be/src/io/cache/file_cache_manager.cpp:
##########
@@ -18,15 +18,39 @@
 #include "io/cache/file_cache_manager.h"
 
 #include "gutil/strings/util.h"
+#include "io/cache/dummy_file_cache.h"
 #include "io/cache/sub_file_cache.h"
 #include "io/cache/whole_file_cache.h"
 #include "io/fs/local_file_system.h"
+#include "olap/storage_engine.h"
 #include "util/file_utils.h"
 #include "util/string_util.h"
 
 namespace doris {
 namespace io {
 
+void GCContextPerDisk::init(const std::string& path, int64_t max_size) {
+    _disk_path = path;
+    _conf_max_size = max_size;
+    _used_size = 0;
+}
+bool GCContextPerDisk::try_add_file_cache(FileCachePtr cache, int64_t file_size) {
+    if (cache->cache_dir().string().substr(0, _disk_path.size()) == _disk_path) {

Review Comment:
   Maybe we should check whether the cache path already exists in the _lru_queue?
   For example, when a cache file exists on disk but not in memory, it gets added to
   the _lru_queue. Later, when that cache is actually used and the in-memory cache
   object is created, its pointer is also added to the _lru_queue by the cleanup
   task. The last match time recorded for that cache in the LRU queue will then be
   earlier than it should be, which may cause the cache to be cleared prematurely.



##########
be/src/io/cache/file_cache_manager.cpp:
##########
@@ -56,88 +80,82 @@ void FileCacheManager::remove_file_cache(const std::string& cache_path) {
     }
 }
 
-void FileCacheManager::clean_timeout_caches() {
-    std::shared_lock<std::shared_mutex> rdlock(_cache_map_lock);
-    for (std::map<std::string, FileCachePtr>::const_iterator iter = _file_cache_map.cbegin();
-         iter != _file_cache_map.cend(); ++iter) {
-        if (iter->second == nullptr) {
-            continue;
+void FileCacheManager::_add_file_cache_for_gc_by_disk(std::vector<GCContextPerDisk>& contexts,
+                                                      FileCachePtr file_cache) {
+    // sort file cache by last match time
+    if (config::file_cache_max_size_per_disk_gb > 0) {
+        auto file_size = file_cache->cache_file_size();
+        if (file_size <= 0) {
+            return;
+        }
+        for (size_t i = 0; i < contexts.size(); ++i) {
+            if (contexts[i].try_add_file_cache(file_cache, file_size)) {
+                break;
+            }
         }
-        iter->second->clean_timeout_cache();
     }
 }
+void FileCacheManager::gc_file_caches() {
+    int64_t gc_conf_size = config::file_cache_max_size_per_disk_gb * 1024 * 1024 * 1024;
+    std::vector<GCContextPerDisk> contexts;
+    // init for GC by disk size
+    if (gc_conf_size > 0) {
+        std::vector<DataDir*> data_dirs = doris::StorageEngine::instance()->get_stores();
+        contexts.resize(data_dirs.size());
+        for (size_t i = 0; i < contexts.size(); ++i) {
+            contexts[i].init(data_dirs[i]->path(), gc_conf_size);
+        }
+    }
 
-void FileCacheManager::clean_timeout_file_not_in_mem(const std::string& cache_path) {
-    time_t now = time(nullptr);
+    // process unused file caches
     std::shared_lock<std::shared_mutex> rdlock(_cache_map_lock);
-    // Deal with caches not in _file_cache_map
-    if (_file_cache_map.find(cache_path) == _file_cache_map.end()) {
-        std::vector<Path> cache_file_names;
-        if (io::global_local_filesystem()->list(cache_path, &cache_file_names).ok()) {
-            std::map<std::string, bool> cache_names;
-            std::list<std::string> done_names;
-            for (Path cache_file_name : cache_file_names) {
-                std::string filename = cache_file_name.native();
-                if (!ends_with(filename, CACHE_DONE_FILE_SUFFIX)) {
-                    cache_names[filename] = true;
-                    continue;
-                }
-                done_names.push_back(filename);
-                std::stringstream done_file_ss;
-                done_file_ss << cache_path << "/" << filename;
-                std::string done_file_path = done_file_ss.str();
-                time_t m_time;
-                if (!FileUtils::mtime(done_file_path, &m_time).ok()) {
-                    continue;
-                }
-                if (now - m_time < config::file_cache_alive_time_sec) {
-                    continue;
-                }
-                std::string cache_file_path =
-                        StringReplace(done_file_path, CACHE_DONE_FILE_SUFFIX, "", true);
-                LOG(INFO) << "Delete timeout done_cache_path: " << done_file_path
-                          << ", cache_file_path: " << cache_file_path << ", m_time: " << m_time;
-                if (!io::global_local_filesystem()->delete_file(done_file_path).ok()) {
-                    LOG(ERROR) << "delete_file failed: " << done_file_path;
+    std::vector<TabletSharedPtr> tablets =
+            StorageEngine::instance()->tablet_manager()->get_all_tablet();
+    for (const auto& tablet : tablets) {
+        std::vector<Path> seg_file_paths;
+        if (io::global_local_filesystem()->list(tablet->tablet_path(), &seg_file_paths).ok()) {
+            for (Path seg_file : seg_file_paths) {
+                std::string seg_filename = seg_file.native();
+                // check if it is a dir name
+                if (ends_with(seg_filename, ".dat")) {
                     continue;
                 }
-                if (!io::global_local_filesystem()->delete_file(cache_file_path).ok()) {
-                    LOG(ERROR) << "delete_file failed: " << cache_file_path;
+                // skip file cache already in memory
+                std::stringstream ss;
+                ss << tablet->tablet_path() << "/" << seg_filename;
+                std::string cache_path = ss.str();
+                if (_file_cache_map.find(cache_path) != _file_cache_map.end()) {

Review Comment:
   Should we also check whether a DummyFileCache already exists for this path at the
   same time? Otherwise, every time the cleaning task runs, a new DummyFileCache
   object will be created for a cache that exists only on disk and has not been
   cleaned yet.
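   As a rough illustration, the check could look something like the hypothetical sketch
   below. The map name (`_gc_dummy_cache_map`), the placeholder `DummyFileCache` type,
   and the helper name are assumptions, not the actual classes in be/src/io/cache:

   ```cpp
   // Hypothetical sketch: reuse an existing dummy cache instead of
   // constructing a new DummyFileCache on every GC pass.
   #include <map>
   #include <memory>
   #include <string>
   #include <utility>

   // Stand-in for io::DummyFileCache; its real interface is not shown in this diff.
   struct DummyFileCache {
       explicit DummyFileCache(std::string path) : cache_path(std::move(path)) {}
       std::string cache_path;
   };
   using DummyFileCachePtr = std::shared_ptr<DummyFileCache>;

   class FileCacheManager {
   public:
       // Reuse an existing dummy cache for a path that exists only on disk,
       // so repeated GC runs do not allocate a new DummyFileCache each time.
       DummyFileCachePtr get_or_create_dummy_cache(const std::string& cache_path) {
           auto it = _gc_dummy_cache_map.find(cache_path);
           if (it != _gc_dummy_cache_map.end()) {
               return it->second;  // already created by a previous GC pass
           }
           auto dummy = std::make_shared<DummyFileCache>(cache_path);
           _gc_dummy_cache_map.emplace(cache_path, dummy);
           return dummy;
       }

   private:
       std::map<std::string, DummyFileCachePtr> _gc_dummy_cache_map;
   };
   ```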




