Hello, I have worked on an improved version of fsfs-reshard.py: * support for the Subversion 1.6 format * ability to unpack revisions (a 1.6 feature) in order to change the shard size * generation of statistics about shards with the current shard size or a target shard size, to help fine-tune the shard size for each repository
Usage details are available at http://ymartin59.free.fr/wordpress/index.php/projets/tune-subversion-fsfs-repository/ and may be copied into svnbook. Best regards [[[ Support for 1.6 format (packed shards are unpacked) and statistics generation to estimate effective shard sizes * tools/server-side/fsfs-reshard.py ]]] -- Yves Martin
Index: fsfs-reshard.py =================================================================== --- fsfs-reshard.py (revision 40515) +++ fsfs-reshard.py (working copy) @@ -1,10 +1,15 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # +# fsfs-reshard.py REPOS_PATH +# fsfs-reshard.py REPOS_PATH -target=MAX_FILES_PER_SHARD +# +# Display repository information about fsfs db. +# # fsfs-reshard.py REPOS_PATH MAX_FILES_PER_SHARD # # Perform an offline conversion of an FSFS repository between linear (format -# 2, usable by Subversion 1.4+) and sharded (format 3, usable by Subversion +# 2, usable by Subversion 1.4+) and sharded (format 3/4, usable by Subversion # 1.5+) layouts. # # The MAX_FILES_PER_SHARD argument specifies the maximum number of files @@ -46,23 +51,39 @@ # under the License. # ==================================================================== # +# Subversion 1.6 format 4 support, unpack operation and shard file +# computation contributed by Yves Martin (ymartin59 0x40 free 0x2E fr) +# http://ymartin59.free.fr/wordpress/index.php/projets/tune-subversion-fsfs-repository/ +# # $HeadURL$ # $LastChangedDate$ # $LastChangedBy$ # $LastChangedRevision$ -import os, stat, sys +import os, stat, sys, shutil from errno import EEXIST def usage(): """Print a usage message and exit.""" - print("""usage: %s REPOS_PATH MAX_FILES_PER_SHARD [START END] + print("""usage: %s REPOS_PATH [target=MAX_FILES_PER_SHARD] +Computes shard sizes for current repository or for a target +MAX_FILES_PER_SHARD to tune this parameter according to +performance criteria. + +usage: %s REPOS_PATH MAX_FILES_PER_SHARD [START END] + Perform an offline conversion of an FSFS repository between linear -(readable by Subversion 1.4 or later) and sharded (readable by +(usable by Subversion 1.4 or later) and sharded (usable by Subversion 1.5 or later) layouts. +It is recommended to first upgrade your repository to your current +Subversion release with 'svnadmin upgrade REPOS_PATH'. 
+ +Packed shards are unpacked before converting. According to your +needs, you may want to invoke 'svnadmin pack REPOS_PATH' after. + The MAX_FILES_PER_SHARD argument specifies the maximum number of files that will be stored in each shard (directory), or zero to specify a linear layout. Subversion 1.5 uses a default value of @@ -70,7 +91,7 @@ Convert revisions START through END inclusive if specified, or all revisions if unspecified. -""" % sys.argv[0]) +""" % (sys.argv[0], sys.argv[0])) sys.exit(1) def incompatible_repos_format(repos_path, format): @@ -163,10 +184,18 @@ def check_fs_format(repos_path): """Check that REPOS_PATH contains a filesystem with a suitable format, or that it contains no format file; print a message and exit if neither - is true. Return bool whether the filesystem is sharded.""" - sharded = False + is true. Return an array [format number, shard size] whether the filesystem is sharded.""" + result = [0, 0] db_path = os.path.join(repos_path, 'db') format_path = os.path.join(db_path, 'format') + if not(os.path.exists(format_path)): + # Recover from format.bak if interrupted + format_path = os.path.join(db_path, 'format.bak') + if not(os.path.exists(format_path)): + sys.stderr.write("error: db/format and db/format.bak missing.\n") + sys.stderr.flush() + sys.exit(1) + try: format_file = open(format_path) format = format_file.readline() @@ -182,9 +211,13 @@ pass elif format == '3': pass + elif format == '4': + pass else: incompatible_fs_format(repos_path, format) + result[0] = format; + for line in format_file: if format == '2': unexpected_fs_format_options(repos_path) @@ -193,7 +226,7 @@ if line == 'layout linear': pass elif line.startswith('layout sharded '): - sharded = True + result[1] = int(line[15:]) else: incompatible_fs_format_option(repos_path, line) @@ -205,16 +238,16 @@ # compatible. 
pass - return sharded + return result def current_file(repos_path): """Return triple of (revision, next_node_id, next_copy_id) from REPOS_PATH/db/current .""" return open(os.path.join(repos_path, 'db', 'current')).readline().split() -def remove_fs_format(repos_path): - """Remove the filesystem format file for repository REPOS_PATH. - Do not raise an error if the file is already missing.""" +def backup_fs_format(repos_path): + """Rename the filesystem format file for repository REPOS_PATH. + Do not raise an error if the file is already renamed.""" format_path = os.path.join(repos_path, 'db', 'format') try: statinfo = os.stat(format_path) @@ -222,36 +255,50 @@ # The file probably doesn't exist. return + format_bak_path = os.path.join(repos_path, 'db', 'format.bak') # On Windows, we need to ensure the file is writable before we can - # remove it. + # rename/remove it. os.chmod(format_path, statinfo.st_mode | stat.S_IWUSR) - os.remove(format_path) + try: + os.rename(format_path, format_bak_path) + except OSError: + # Unexpected but try to go on + os.remove(format_bak_path) + os.rename(format_path, format_bak_path) def write_fs_format(repos_path, contents): """Write a new filesystem format file for repository REPOS_PATH containing CONTENTS.""" format_path = os.path.join(repos_path, 'db', 'format') + format_bak_path = os.path.join(repos_path, 'db', 'format.bak') + # Permissions and owner/group are preserved with rename + try: + os.rename(format_bak_path, format_path) + except OSError: + # Unexpected but try to go on + os.remove(format_path) f = open(format_path, 'wb') f.write(contents) f.close() - os.chmod(format_path, stat.S_IRUSR | stat.S_IRGRP) -def linearise(path): - """Move all the files in subdirectories of PATH into PATH, and remove the - subdirectories. Handle conflicts between subdirectory names and files - contained in subdirectories by ensuring subdirectories have a '.shard' - suffix prior to moving (the files are assumed not to have this suffix. 
- Abort if a subdirectory is found to contain another subdirectory.""" - # First enumerate all subdirectories of DIR and rename where necessary - # to include a .shard suffix. +def suffix_unpacked_shard(path): + """Add '.shard' suffix to unpacked shard number directory.""" for name in os.listdir(path): - if name.endswith('.shard'): + if name.endswith('.shard') or name.endswith('.pack'): continue subdir_path = os.path.join(path, name) if not os.path.isdir(subdir_path): continue os.rename(subdir_path, subdir_path + '.shard') +def linearise(path): + """Move all the files in subdirectories of PATH into PATH, and remove the + subdirectories. Handle conflicts between subdirectory names and files + contained in subdirectories by ensuring subdirectories have a '.shard' + suffix prior to moving (the files are assumed not to have this suffix. + Abort if a subdirectory is found to contain another subdirectory.""" + suffix_unpacked_shard(path) + # Now move all the subdirectory contents into the parent and remove # the subdirectories. for root_path, dirnames, filenames in os.walk(path): @@ -311,11 +358,171 @@ os.rename(from_path, to_path) skipped == 0 and os.rmdir(tmp) +def unpack_shard(packed_path, unpacklinear, first_rev, revs_size): + """Compute revision sizes in a packed shard at packed_path + and unpack revision except if unpacklinear is false. + The first revision of the shard has first_rev number. 
+ Revision sizes are stored in rev_sizes dictionary.""" + + copy_buffer_size = 4096 + manifest = open(os.path.join(packed_path, 'manifest'), 'r') + pack_path = os.path.join(packed_path, 'pack') + end_pack = os.path.getsize(pack_path) + if unpacklinear: + pack = open(pack_path, 'rb') + last_position = int(manifest.readline()) + rev_index = first_rev + while last_position < end_pack: + # Read next revision start byte in pack file + try: + byte_position = int(manifest.readline()) + except ValueError: + # last revision: end of pack file + byte_position = end_pack + revs_size[rev_index] = byte_position - last_position + if unpacklinear: + # Extract revision from pack file + arev = open(os.path.join(packed_path, os.path.pardir, str(rev_index)), 'wb') + pack.seek(last_position) + while last_position < byte_position: + bytes_tocopy = copy_buffer_size + if (byte_position - last_position) < copy_buffer_size: + bytes_tocopy = byte_position - last_position + rev_buffer = pack.read(bytes_tocopy) + arev.write(rev_buffer) + last_position += len(rev_buffer) + if bytes_tocopy < copy_buffer_size: + break + arev.close() + else: + last_position = byte_position + rev_index += 1 + # Close file descriptors + manifest.close() + if unpacklinear: + pack.close() + return revs_size + +def compute_rev_sizes(revs_path, current_shard, unpacklinear): + """Compute revision sizes based on current shard capacity + Support either linear, sharded or packed revisions. 
+ If unpacklinear is True, packed shards are linearised too.""" + revs_size = {} + for root_path, dirnames, filenames in os.walk(revs_path): + if len(filenames) > 0: + for name in filenames: + try: + revnum = int(name) + revs_size[revnum] = os.path.getsize(os.path.join(root_path, name)) + except ValueError: + pass + if len(dirnames) > 0: + for name in dirnames: + if (not(name.endswith('.pack'))): + continue + shard_number = int(name[:-5]) + shard_path = os.path.join(root_path, name) + # get revision sizes from packed shard [and unpack] + revs_size = unpack_shard(shard_path, unpacklinear, current_shard * shard_number, revs_size) + if unpacklinear: + # remove x.pack structure + shutil.rmtree(shard_path) + return revs_size + +def compute_shard_sizes(revs_size, max_files_per_shard): + """Compute shard sizes based on target max_files_per_shard + and map of revision size.""" + current_shard = 0 + current_shard_size = 0 + min_shard_size = 2**63 + max_shard_size = 0 + shard_size_sum = 0 + for i, size in revs_size.iteritems(): + current_shard_size += size + if ((i + 1) % max_files_per_shard) == 0: + print 'Shard %d size: %d' % (current_shard, current_shard_size) + shard_size_sum += current_shard_size + if current_shard_size < min_shard_size: + min_shard_size = current_shard_size + if current_shard_size > max_shard_size: + max_shard_size = current_shard_size + current_shard_size = 0 + current_shard += 1 + if current_shard_size != 0: + print 'Shard %d size: %d' % (current_shard, current_shard_size) + if current_shard > 0: + print 'Average full-shard size %d. Minimum: %d, Maximum: %d.' \ + % ((shard_size_sum / current_shard), min_shard_size, max_shard_size) + +def linearise_packed_shards(revs_path, current_shard, min_unpacked_rev_path): + """Linearise packed shards in revs_path directory based on + current_shard number of revisions per shard. 
+ min-unpacked-rev at min_unpacked_rev_path is reset to 0.""" + # Suffix unpacked shard to prevent conflicts + suffix_unpacked_shard(revs_path) + # Linearise packed shards + compute_rev_sizes(revs_path, current_shard, True) + # Reset min-unpacked-rev + min_unpacked_rev_file = open(min_unpacked_rev_path, 'wb') + min_unpacked_rev_file.write('0\n') + min_unpacked_rev_file.close() + def main(): - if len(sys.argv) < 3: + if len(sys.argv) < 2: usage() repos_path = sys.argv[1] + + # Get [number format, sharded] + fs_format = check_fs_format(repos_path) + + # Get minimum unpacked revision, Subversion >= 1.6 + min_unpacked_rev = 0 + min_unpacked_rev_path = os.path.join(repos_path, 'db', 'min-unpacked-rev') + if os.path.exists(min_unpacked_rev_path): + min_unpacked_rev_file = open(min_unpacked_rev_path) + try: + min_unpacked_rev = int(min_unpacked_rev_file.readline()) + except (ValueError, OverflowError): + sys.stderr.write("error: repository db/min-unpacked-rev does not contain a valid number.\n") + sys.stderr.flush() + sys.exit(1) + min_unpacked_rev_file.close() + + if len(sys.argv) == 2 or (len(sys.argv) == 3 and sys.argv[2].startswith('target=')): + # Print repository information [and compute shard sizes [for the specified target]] + fs_format = check_fs_format(repos_path) + target_shard = fs_format[1] + if len(sys.argv) == 3: + try: + target_shard = int(sys.argv[2][7:]) + except (ValueError, OverflowError): + sys.stderr.write("error: target maximum files per shard ('%s') is not a valid number.\n" \ + % sys.argv[2][7:]) + sys.stderr.flush() + sys.exit(1) + revs_path = os.path.join(repos_path, 'db', 'revs') + sys.stdout.write("Current FSFS db format version ") + sys.stdout.write(fs_format[0]) + if fs_format[1] > 0: + sys.stdout.write(" with sharded layout, max files per shard: ") + sys.stdout.write(str(fs_format[1])) + if min_unpacked_rev > 0: + sys.stdout.write(", packed shards: ") + sys.stdout.write(str(min_unpacked_rev / fs_format[1])) + else: + sys.stdout.write(" 
with linear layout") + if target_shard > 0: + sys.stdout.write(".\nList of shard sizes for max files per shard = ") + sys.stdout.write(str(target_shard)) + sys.stdout.write("\n") + revs_size = compute_rev_sizes(revs_path, fs_format[1], False) + compute_shard_sizes(revs_size, target_shard) + else: + sys.stdout.write(".\n") + sys.stdout.flush() + exit(0) + max_files_per_shard = sys.argv[2] try: start = int(sys.argv[3]) @@ -349,13 +556,12 @@ # Check the format of the repository. check_repos_format(repos_path) - sharded = check_fs_format(repos_path) # Let the user know what's going on. if max_files_per_shard > 0: print("Converting '%s' to a sharded structure with %d files per directory" \ % (repos_path, max_files_per_shard)) - if sharded: + if fs_format[1]: print('(will convert to a linear structure first)') else: print("Converting '%s' to a linear structure" % repos_path) @@ -364,20 +570,30 @@ # There's no clean way to do this, but since the format of the repository # is indeterminate, let's remove the format file while we're converting. print('- marking the repository as invalid') - remove_fs_format(repos_path) + backup_fs_format(repos_path) # First, convert to a linear scheme (this makes recovery easier because # it's easier to reason about the behaviour on restart). - if sharded: - print('- linearising db/revs') - linearise(os.path.join(repos_path, 'db', 'revs')) + if fs_format[1] > 0: + revs_path = os.path.join(repos_path, 'db', 'revs') + if min_unpacked_rev > 0: + print('- linearising db/revs (unpacking first)') + linearise_packed_shards(revs_path, fs_format[1], min_unpacked_rev_path) + min_unpacked_rev = 0 + else: + print('- linearising db/revs') + # Process unpacked shard + linearise(revs_path) print('- linearising db/revprops') linearise(os.path.join(repos_path, 'db', 'revprops')) if max_files_per_shard == 0: - # We're done. Stamp the filesystem with a format 2 db/format file. + # We're done. Stamp the filesystem with a format 2/3/4 db/format file. 
print('- marking the repository as a valid linear repository') - write_fs_format(repos_path, '2\n') + format_layout = '\n' + if int(fs_format[0]) > 2: + format_layout = '\nlayout linear\n' + write_fs_format(repos_path, fs_format[0] + format_layout) else: print('- sharding db/revs') shard(os.path.join(repos_path, 'db', 'revs'), max_files_per_shard, @@ -386,14 +602,18 @@ shard(os.path.join(repos_path, 'db', 'revprops'), max_files_per_shard, start, end) - # We're done. Stamp the filesystem with a format 3 db/format file. + # Sharded. Keep original 2/3/4 format. + target_format = fs_format[0] + + # We're done. Stamp the filesystem with a format db/format file. print('- marking the repository as a valid sharded repository') - write_fs_format(repos_path, '3\nlayout sharded %d\n' % max_files_per_shard) + write_fs_format(repos_path, target_format + '\nlayout sharded %d\n' % max_files_per_shard) print('- done.') sys.exit(0) +main() + if __name__ == '__main__': raise Exception("""This script is unfinished and not ready to be used on live data. Trust us.""") - main()