commit:     c5a2a0edc4f4b01b16a274268431fa21f7f678b2
Author:     Daniel Robbins <drobbins <AT> funtoo <DOT> org>
AuthorDate: Sat Oct 14 23:38:05 2017 +0000
Commit:     Zac Medico <zmedico <AT> gentoo <DOT> org>
CommitDate: Sun Oct 15 00:53:40 2017 +0000
URL:        https://gitweb.gentoo.org/proj/portage.git/commit/?id=c5a2a0ed

portdbapi: factor out _better_cache class

Better_cache -- now even better :) This version only scans individual
categories on-demand. I have addressed concerns about PMS-compliance by
enhancing the documentation so that developers are aware of what
assumptions to make (and not make) when using better_cache.

Closes: https://github.com/gentoo/portage/pull/219

 pym/portage/dbapi/porttree.py | 124 ++++++++++++++++++++++++------------------
 1 file changed, 71 insertions(+), 53 deletions(-)

diff --git a/pym/portage/dbapi/porttree.py b/pym/portage/dbapi/porttree.py
index 53edcd18f..f5979d2d0 100644
--- a/pym/portage/dbapi/porttree.py
+++ b/pym/portage/dbapi/porttree.py
@@ -16,7 +16,7 @@ portage.proxy.lazyimport.lazyimport(globals(),
        'portage.package.ebuild.doebuild:doebuild',
        'portage.util:ensure_dirs,shlex_split,writemsg,writemsg_level',
        'portage.util.listdir:listdir',
-       'portage.versions:best,catpkgsplit,_pkgsplit@pkgsplit,ver_regexp,_pkg_str',
+       'portage.versions:best,catsplit,catpkgsplit,_pkgsplit@pkgsplit,ver_regexp,_pkg_str',
 )
 
 from portage.cache import volatile
@@ -103,6 +103,68 @@ class _dummy_list(list):
                except ValueError:
                        pass
 
+
+class _better_cache(object):
+
+       """
+       The purpose of better_cache is to locate catpkgs in repositories using ``os.listdir()`` as much as possible, which
+       is less expensive IO-wise than exhaustively doing a stat on each repo for a particular catpkg. better_cache stores a
+       list of repos in which particular catpkgs appear. Various dbapi methods use better_cache to locate repositories of
+       interest related to a particular catpkg rather than performing an exhaustive scan of all repos/overlays.
+
+       Better_cache.items data may look like this::
+
+         { "sys-apps/portage" : [ repo1, repo2 ] }
+
+       Without better_cache, Portage will get slower and slower (due to excessive IO) as more overlays are added.
+
+       Also note that it is OK if this cache has some 'false positive' catpkgs in it. We use it to search for specific
+       catpkgs listed in ebuilds. The likelihood of a false positive catpkg in our cache causing a problem is extremely
+       low, because the user of our cache is passing us a catpkg that came from somewhere and has already undergone some
+       validation, and even then will further interrogate the short-list of repos we return to gather more information
+       on the catpkg.
+
+       Thus, the code below is optimized for speed rather than painstaking correctness. I have added a note to
+       ``dbapi.getRepositories()`` to ensure that developers are aware of this just in case.
+
+       The better_cache has been redesigned to perform on-demand scans -- it will only scan a category at a time, as
+       needed. This should further optimize IO performance by not scanning category directories that are not needed by
+       Portage.
+       """
+
+       def __init__(self, repositories):
+               self._items = collections.defaultdict(list)
+               self._scanned_cats = set()
+
+               # ordered list of all porttree locations we'll scan:
+               self._repo_list = [repo for repo in reversed(list(repositories))
+                       if repo.location is not None]
+
+       def __getitem__(self, catpkg):
+               result = self._items.get(catpkg)
+               if result is not None:
+                       return result
+
+               cat, pkg = catsplit(catpkg)
+               if cat not in self._scanned_cats:
+                       self._scan_cat(cat)
+               return self._items[catpkg]
+
+       def _scan_cat(self, cat):
+               for repo in self._repo_list:
+                       cat_dir = repo.location + "/" + cat
+                       try:
+                               pkg_list = os.listdir(cat_dir)
+                       except OSError as e:
+                               if e.errno not in (errno.ENOTDIR, errno.ENOENT, errno.ESTALE):
+                                       raise
+                               continue
+                       for p in pkg_list:
+                               if os.path.isdir(cat_dir + "/" + p):
+                                       self._items[cat + "/" + p].append(repo)
+               self._scanned_cats.add(cat)
+
+
 class portdbapi(dbapi):
        """this tree will scan a portage directory located at root (passed to 
init)"""
        portdbapi_instances = _dummy_list()
@@ -346,11 +408,14 @@ class portdbapi(dbapi):
                        return None
 
        def getRepositories(self, catpkg=None):
+
                """
                 With catpkg=None, this will return a complete list of repositories in this dbapi. With catpkg set to a value,
                 this method will return a short-list of repositories that contain this catpkg. Use this second approach if
                 possible, to avoid exhaustively searching all repos for a particular catpkg. It's faster for this method to
-               find the catpkg than for you do it yourself.
+               find the catpkg than for you to do it yourself. When specifying catpkg, you should have reasonable assurance that
+               the category is valid and PMS-compliant as the caching mechanism we use does not perform validation checks for
+               categories.
 
                This function is required for GLEP 42 compliance.
 
@@ -358,7 +423,8 @@ class portdbapi(dbapi):
                  catpkg; if None, return a list of all Repositories that 
contain a particular catpkg.
                @return: a list of repositories.
                """
-               if catpkg is not None and self._better_cache is not None and 
catpkg in self._better_cache:
+
+               if catpkg is not None and self._better_cache is not None:
                        return [repo.name for repo in 
self._better_cache[catpkg]]
                return self._ordered_repo_name_list
 
@@ -796,12 +862,7 @@ class portdbapi(dbapi):
                elif self._better_cache is None:
                        mytrees = self.porttrees
                else:
-                       try:
-                               repos = self._better_cache[mycp]
-                       except KeyError:
-                               mytrees = []
-                       else:
-                               mytrees = [repo.location for repo in repos]
+                       mytrees = [repo.location for repo in 
self._better_cache[mycp]]
                for oroot in mytrees:
                        try:
                                file_list = os.listdir(os.path.join(oroot, 
mycp))
@@ -850,50 +911,7 @@ class portdbapi(dbapi):
                        "minimum-all-ignore-profile", "minimum-visible"):
                        self.xcache[x]={}
                self.frozen=1
-               self._better_cache = better_cache = 
collections.defaultdict(list)
-
-               # The purpose of self._better_cache is to perform an initial 
quick scan of all repositories
-               # using os.listdir(), which is less expensive IO-wise than 
exhaustively doing a stat on each
-               # repo. self._better_cache stores a list of repos in which 
particular catpkgs appear.
-               #
-               # For example, better_cache data may look like this:
-               #
-               # { "sys-apps/portage" : [ repo1, repo2 ] }
-               #
-               # Without this tweak, Portage will get slower and slower as 
more overlays are added.
-               #
-               # Also note that it is OK if this cache has some 'false 
positive' catpkgs in it. We use it
-               # to search for specific catpkgs listed in ebuilds. The 
likelihood of a false positive catpkg
-               # in our cache causing a problem is extremely low. Thus, the 
code below is optimized for
-               # speed rather than painstaking correctness.
-
-               valid_categories = self.settings.categories
-               for repo_loc in reversed(self.porttrees):
-                       repo = self.repositories.get_repo_for_location(repo_loc)
-                       try:
-                               categories = os.listdir(repo_loc)
-                       except OSError as e:
-                               if e.errno not in (errno.ENOTDIR, errno.ENOENT, 
errno.ESTALE):
-                                       raise
-                               continue
-
-                       for cat in categories:
-                               if cat not in valid_categories:
-                                       continue
-                               cat_dir = repo_loc + "/" + cat
-                               try:
-                                       pkg_list = os.listdir(cat_dir)
-                               except OSError as e:
-                                       if e.errno != errno.ENOTDIR:
-                                               raise
-                                       continue
-
-                               for p in pkg_list:
-                                       catpkg_dir = cat_dir + "/" + p
-                                       if not os.path.isdir(catpkg_dir):
-                                               continue
-                                       catpkg = cat + "/" + p
-                                       better_cache[catpkg].append(repo)
+               self._better_cache = _better_cache(self.repositories)
 
        def melt(self):
                self.xcache = {}

Reply via email to