Tom Lord <[EMAIL PROTECTED]> writes:

> Thank you for your experiment.

you are welcome.

> I think that to a large extent you are seeing artifacts
> of the questionable trade-offs that (reports tell me) the
> ext* filesystems make.   With a different filesystem, the 
> results would be very different.

No, this is not the only thing that we observe.  For example, here are the
reports for the following two experiments:

Indexing method = [2]

Max keys at level  0:     256
Max keys at level  1:     108
Total number of dirs:     257
Total number of keys:   21662
Disk footprint      :    1.8M

Indexing method = [4 4]

Max keys at level  0:   18474
Max keys at level  1:       5
Max keys at level  2:       1
Total number of dirs:   40137
Total number of keys:   21662
Disk footprint      :    157M

Notice the huge number of directories in the second experiment and they don't
help at all in performing discrimination.

> I'm imagining a blob database containing may revisions of the linux
> kernel.  It will contain millions of blobs.

It is very easy to write code that uses an adaptive discrimination method
(i.e. when a directory becomes too full, introduce an additional level of
discrimination and rehash).  In fact I have code that does that (rehashing if
the size of a leaf directory exceed 256), but the [2] method above doesn't even
need it even though it has 21662 keys.

Just in case there is some interest, I attach below the python scripts which I
used for my experiments:

To create an indexed archive:

        python build.py SRC DST N1 ... Nk

where SRC is the root directory of the tree to be indexed, and DST names the
root directory of the indexed archive to be created.  N1 through Nk are integers
that each indicate how many chars to chop off the key to create the next level
indexing key.

        python info.py DST

collects and then prints out statistics about an indexed archive.

For example, the invocation that relates to your original proposal would be:

        python build.py /usr/src/linux store 4 4
        python info.py store

import os,os.path,stat,sha

tree      = None
archive   = None
slices    = []
lastslice = (0,-1)

def recurse(path):
    s = os.stat(path)
    if stat.S_ISDIR(s.st_mode):
        print path
        contents = []
        for n in os.listdir(path):
            uid = recurse(os.path.join(path,n))
            contents.append('\t'.join((n,uid)))
        contents = '\n'.join(contents)
        buf = sha.new(contents)
        uid = buf.hexdigest()
        uid = ','.join((uid,str(len(contents))))
        store(uid)
        return uid
    else:
        fd = file(path,"rb")
        contents = fd.read()
        fd.close()
        buf = sha.new(contents)
        uid = ','.join((buf.hexdigest(),str(s.st_size)))
        store(uid)
        return uid

def store(uid):
    p = archive
    if not os.path.exists(p):
        os.mkdir(p)
    for s in slices:
        p = os.path.join(p,uid[s[0]:s[1]])
        if not os.path.exists(p):
            os.mkdir(p)
    p = os.path.join(p,uid[lastslice[0]:lastslice[1]])
    fd = file(p,"wb")
    fd.close()

if __name__ == '__main__':
    import sys
    from optparse import OptionParser
    from types import IntType
    parser = OptionParser(usage="usage: %prog TREE ARCHIVE N1 ... Nk")
    (options, args) = parser.parse_args()
    if len(args) < 3:
        print sys.stderr, "expected at least 3 positional arguments"
        sys.exit(1)
    tree    = args[0]
    archive = args[1]
    prev    = 0
    for a in args[2:]:
        try:
            next = prev+int(a)
            slices.append((prev,next))
            prev = next
        except:
            print >>sys.stderr, "positional argument is not an integer:",a
            sys.exit(1)
    lastslice = (next,-1)
    recurse(tree)
    sys.exit(0)
import os,os.path,stat

info = []
archive = None
total_keys = 0
total_dirs = 0

def collect_info(path,i):
    global total_dirs,total_keys
    s = os.stat(path)
    if stat.S_ISDIR(s.st_mode):
        total_dirs += 1
        l = os.listdir(path)
        n = len(l)
        if i==len(info):
            info.append(n)
        elif n>info[i]:
            info[i] = n
        i += 1
        for f in l:
            collect_info(os.path.join(path,f),i)
    else:
        total_keys += 1

def print_info():
    i = 0
    for n in info:
        print "Max keys at level %2s: %7s" % (i,n)
        i += 1
    print "Total number of dirs: %7s" % total_dirs
    print "Total number of keys: %7s" % total_keys
    fd = os.popen("du -csh %s" % archive,"r")
    s = fd.read()
    fd.close()
    s = s.split()[0]
    print "Disk footprint      : %7s"  % s

if __name__ == '__main__':
    import sys
    from optparse import OptionParser
    parser = OptionParser(usage="usage: %prog ARCHIVE")
    (options, args) = parser.parse_args()
    if len(args) != 1:
        print sys.stderr, "expected exactly 1 positional argument"
        sys.exit(1)
    archive = args[0]
    collect_info(archive,0)
    print_info()
    sys.exit(0)
Cheers,

PS: I should mention again, that my indexed archives only contain empty files
because I am only interested in measuring overhead.

-- 
Dr. Denys Duchier - IRI & LIFL - CNRS, Lille, France
AIM: duchierdenys

Reply via email to